Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 8 additions & 5 deletions tree/dataframe/inc/ROOT/RDF/RInterface.hxx
Original file line number Diff line number Diff line change
Expand Up @@ -1361,11 +1361,14 @@ public:
return true;
RLogScopedVerbosity showInfo{ROOT::Detail::RDF::RDFLogChannel(), ROOT::ELogLevel::kInfo};
R__LOG_INFO(ROOT::Detail::RDF::RDFLogChannel())
<< "\n\tIn ROOT 6.38, the default compression settings of Snapshot have been changed from 101 (ZLIB with "
"compression level 1, the TTree default) to 505 (ZSTD with compression level 5). This change may result "
"in smaller Snapshot output dataset size by default. In order to suppress this message, set "
"'ROOT_RDF_SNAPSHOT_INFO=0' in your environment or set 'ROOT.RDF.Snapshot.Info: 0' in your .rootrc "
"file.";
<< "\n\tIn ROOT 6.38.00, the default compression settings of Snapshot were changed from 101 (ZLIB with "
"compression level 1, the TTree default) to 505 (ZSTD with compression level 5). The decision was based "
"on empirical evidence available up to that point. New studies summarised at "
"https://github.com/root-project/root/pull/21753 show that in certain cases "
"compression setting 101 is still the best option for TTree. Thus, this choice is reverted in ROOT "
"6.38.06 and later releases. "
"In order to suppress this message, set 'ROOT_RDF_SNAPSHOT_INFO=0' in your environment or set "
"'ROOT.RDF.Snapshot.Info: 0' in your .rootrc file.";
return true;
}();
// like columnList but with `#var` columns removed
Expand Down
9 changes: 4 additions & 5 deletions tree/dataframe/inc/ROOT/RSnapshotOptions.hxx
Original file line number Diff line number Diff line change
Expand Up @@ -56,13 +56,13 @@ Note that for RNTuple, the defaults correspond to those set in RNTupleWriteOptio
<td><code>fCompressionAlgorithm</code></td>
<td><code>ROOT::RCompressionSetting::EAlgorithm</code></td>
<td>kUndefined</td>
<td>Compression algorithm for the output dataset</td>
<td>Compression algorithm for the output dataset, defaults to ROOT::RCompressionSetting::EAlgorithm::EValues::kUndefined. This is converted to ZLIB by default for TTree and ZSTD by default for RNTuple</td>
</tr>
<tr>
<td><code>fCompressionLevel</code></td>
<td><code>int</code></td>
<td>0</td>
<td>Compression level for the output dataset</td>
<td>Compression level for the output dataset, defaults to 0 (uncompressed). If the default value of `fCompressionAlgorithm` is not modified, the compression level is changed to 1 by default for TTree and 5 by default for RNTuple</td>
</tr>
<tr>
<td><code>fOutputFormat</code></td>
Expand Down Expand Up @@ -184,9 +184,8 @@ struct RSnapshotOptions {
}
std::string fMode = "RECREATE"; ///< Mode of creation of output file
ESnapshotOutputFormat fOutputFormat = ESnapshotOutputFormat::kDefault; ///< Which data format to write to
ECAlgo fCompressionAlgorithm =
ROOT::RCompressionSetting::EAlgorithm::kZSTD; ///< Compression algorithm of output file
int fCompressionLevel = 5; ///< Compression level of output file
ECAlgo fCompressionAlgorithm = ECAlgo::kUndefined; ///< Compression algorithm of output file
int fCompressionLevel = 0; ///< Compression level of output file
bool fLazy = false; ///< Do not start the event loop when Snapshot is called
bool fOverwriteIfExists = false; ///< If fMode is "UPDATE", overwrite object in output file if it already exists
bool fVector2RVec = true; ///< If set to true will convert std::vector columns to RVec when saving to disk
Expand Down
34 changes: 27 additions & 7 deletions tree/dataframe/src/RDFSnapshotHelpers.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,28 @@ void SetBranchesHelper(TTree *inputTree, TTree &outputTree,
throw std::logic_error(
"RDataFrame::Snapshot: something went wrong when creating a TTree branch, please report this as a bug.");
}

/// Pick the compression settings a Snapshot should use for its output.
///
/// \param options Snapshot options; only fOutputFormat, fCompressionAlgorithm and fCompressionLevel are read.
/// \return The value produced by ROOT::CompressionSettings for the chosen algorithm and level.
/// \throws std::invalid_argument if fOutputFormat is neither kTTree, kDefault nor kRNTuple.
///
/// When the algorithm was left at kUndefined, a per-format fallback is used (ZLIB level 1 for
/// kTTree/kDefault, ZSTD level 5 for kRNTuple) and fCompressionLevel is not consulted. Otherwise the
/// algorithm and level from the options are passed through unchanged.
auto GetSnapshotCompressionSettings(const ROOT::RDF::RSnapshotOptions &options)
{
   using CompAlgo = ROOT::RCompressionSetting::EAlgorithm::EValues;
   using OutputFormat = ROOT::RDF::ESnapshotOutputFormat;

   // kDefault is handled exactly like kTTree here.
   const bool writesTTree =
      options.fOutputFormat == OutputFormat::kTTree || options.fOutputFormat == OutputFormat::kDefault;
   const bool writesRNTuple = options.fOutputFormat == OutputFormat::kRNTuple;

   // Reject unknown formats up front, regardless of the requested compression.
   if (!writesTTree && !writesRNTuple)
      throw std::invalid_argument("RDataFrame::Snapshot: unrecognized output format");

   // An explicitly chosen algorithm is honoured for both formats.
   if (options.fCompressionAlgorithm != CompAlgo::kUndefined)
      return ROOT::CompressionSettings(options.fCompressionAlgorithm, options.fCompressionLevel);

   // Nothing chosen: fall back to 101 for TTree output and to 505 for RNTuple output.
   if (writesTTree)
      return ROOT::CompressionSettings(CompAlgo::kZLIB, 1);
   return ROOT::CompressionSettings(CompAlgo::kZSTD, 5);
}
} // namespace

ROOT::Internal::RDF::RBranchData::RBranchData(std::string inputBranchName, std::string outputBranchName, bool isDefine,
Expand Down Expand Up @@ -535,8 +557,7 @@ void ROOT::Internal::RDF::UntypedSnapshotTTreeHelper::SetEmptyBranches(TTree *in
void ROOT::Internal::RDF::UntypedSnapshotTTreeHelper::Initialize()
{
fOutputFile.reset(
TFile::Open(fFileName.c_str(), fOptions.fMode.c_str(), /*ftitle=*/"",
ROOT::CompressionSettings(fOptions.fCompressionAlgorithm, fOptions.fCompressionLevel)));
TFile::Open(fFileName.c_str(), fOptions.fMode.c_str(), /*ftitle=*/"", GetSnapshotCompressionSettings(fOptions)));
if (!fOutputFile)
throw std::runtime_error("Snapshot: could not create output file " + fFileName);

Expand Down Expand Up @@ -774,9 +795,9 @@ void ROOT::Internal::RDF::UntypedSnapshotTTreeHelperMT::SetEmptyBranches(TTree *

void ROOT::Internal::RDF::UntypedSnapshotTTreeHelperMT::Initialize()
{
const auto cs = ROOT::CompressionSettings(fOptions.fCompressionAlgorithm, fOptions.fCompressionLevel);
auto outFile =
std::unique_ptr<TFile>{TFile::Open(fFileName.c_str(), fOptions.fMode.c_str(), /*ftitle=*/fFileName.c_str(), cs)};
std::unique_ptr<TFile>{TFile::Open(fFileName.c_str(), fOptions.fMode.c_str(), /*ftitle=*/fFileName.c_str(),
GetSnapshotCompressionSettings(fOptions))};
if (!outFile)
throw std::runtime_error("Snapshot: could not create output file " + fFileName);
fOutputFile = outFile.get();
Expand Down Expand Up @@ -918,7 +939,7 @@ void ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::Initialize()
model->Freeze();

ROOT::RNTupleWriteOptions writeOptions;
writeOptions.SetCompression(fOptions.fCompressionAlgorithm, fOptions.fCompressionLevel);
writeOptions.SetCompression(GetSnapshotCompressionSettings(fOptions));
writeOptions.SetInitialUnzippedPageSize(fOptions.fInitialUnzippedPageSize);
writeOptions.SetMaxUnzippedPageSize(fOptions.fMaxUnzippedPageSize);
writeOptions.SetApproxZippedClusterSize(fOptions.fApproxZippedClusterSize);
Expand Down Expand Up @@ -1140,8 +1161,7 @@ ROOT::Internal::RDF::SnapshotHelperWithVariations::SnapshotHelperWithVariations(

TDirectory::TContext fileCtxt;
fOutputHandle = std::make_shared<SnapshotOutputWriter>(
TFile::Open(filename.data(), fOptions.fMode.c_str(), /*ftitle=*/"",
ROOT::CompressionSettings(fOptions.fCompressionAlgorithm, fOptions.fCompressionLevel)));
TFile::Open(filename.data(), fOptions.fMode.c_str(), /*ftitle=*/"", GetSnapshotCompressionSettings(fOptions)));
if (!fOutputHandle->fFile)
throw std::runtime_error(std::string{"Snapshot: could not create output file "} + std::string{filename});

Expand Down
16 changes: 16 additions & 0 deletions tree/dataframe/test/dataframe_snapshot.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,22 @@ TEST(RDFSnapshotMore, BasketSizePreservation)
TestBasketSizePreservation();
}

// A Snapshot called without explicit options must produce a file that reports
// ZLIB as compression algorithm and 1 as compression level.
TEST(RDFSnapshotMore, DefaultCompressionSettings)
{
   // Holds the output names and deletes the output file once the test scope is left.
   struct OutputCleanup {
      std::string fFilename{"RDFSnapshotMore_default_compression_settings.root"};
      std::string fTreeName{"tree"};
      ~OutputCleanup() { std::remove(fFilename.c_str()); }
   } output;

   // One entry with a single defined column is enough; no snapshot options are passed.
   ROOT::RDataFrame singleEntry{1};
   singleEntry.Define("x", [] { return 42; }).Snapshot(output.fTreeName, output.fFilename, {"x"});

   // Re-open the written file and inspect what it reports.
   const auto written = std::make_unique<TFile>(output.fFilename.c_str());
   constexpr auto expectedAlgorithm = ROOT::RCompressionSetting::EAlgorithm::EValues::kZLIB;
   constexpr int expectedLevel = 1;
   EXPECT_EQ(written->GetCompressionAlgorithm(), expectedAlgorithm);
   EXPECT_EQ(written->GetCompressionLevel(), expectedLevel);
}

// fixture that provides fixed and variable sized arrays as RDF columns
class RDFSnapshotArrays : public ::testing::Test {
protected:
Expand Down
20 changes: 20 additions & 0 deletions tree/dataframe/test/dataframe_snapshot_ntuple.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,26 @@ TEST(RDFSnapshotRNTuple, WriteOpts)
}
}

// A Snapshot to RNTuple that leaves the compression options untouched must
// write data whose compression settings are 505.
TEST(RDFSnapshotRNTuple, DefaultCompressionSettings)
{
   FileRAII outputFile{"RDFSnapshotRNTuple_default_compression_settings.root"};
   const std::vector<std::string> expectedColumns = {"x"};

   auto source = ROOT::RDataFrame(25ull).Define("x", [] { return 10; });

   // Only the output format is changed; the compression fields keep their defaults.
   RSnapshotOptions snapshotOpts;
   snapshotOpts.fOutputFormat = ROOT::RDF::ESnapshotOutputFormat::kRNTuple;

   auto snapshotted = source.Snapshot("ntuple", outputFile.GetPath(), {"x"}, snapshotOpts);

   EXPECT_EQ(expectedColumns, snapshotted->GetColumnNames());

   // Walk from the reader to the first column range of the first cluster, one step at a time.
   auto ntupleReader = RNTupleReader::Open("ntuple", outputFile.GetPath());
   const auto &descriptor = ntupleReader->GetDescriptor();
   const auto &firstCluster = descriptor.GetClusterDescriptor(0);
   const auto &firstColumnRange = firstCluster.GetColumnRange(0);
   auto writtenSettings = *firstColumnRange.GetCompressionSettings();

   // The RNTuple default should be 505
   EXPECT_EQ(505, writtenSettings);
}

TEST(RDFSnapshotRNTuple, Compression)
{
FileRAII fileGuard{"RDFSnapshotRNTuple_compression.root"};
Expand Down
Loading