diff --git a/tree/dataframe/inc/ROOT/RDF/RInterface.hxx b/tree/dataframe/inc/ROOT/RDF/RInterface.hxx
index b03a088571aa8..e8871489b05ef 100644
--- a/tree/dataframe/inc/ROOT/RDF/RInterface.hxx
+++ b/tree/dataframe/inc/ROOT/RDF/RInterface.hxx
@@ -1361,11 +1361,14 @@ public:
return true;
RLogScopedVerbosity showInfo{ROOT::Detail::RDF::RDFLogChannel(), ROOT::ELogLevel::kInfo};
R__LOG_INFO(ROOT::Detail::RDF::RDFLogChannel())
- << "\n\tIn ROOT 6.38, the default compression settings of Snapshot have been changed from 101 (ZLIB with "
- "compression level 1, the TTree default) to 505 (ZSTD with compression level 5). This change may result "
- "in smaller Snapshot output dataset size by default. In order to suppress this message, set "
- "'ROOT_RDF_SNAPSHOT_INFO=0' in your environment or set 'ROOT.RDF.Snapshot.Info: 0' in your .rootrc "
- "file.";
+ << "\n\tIn ROOT 6.38.00, the default compression settings of Snapshot were changed from 101 (ZLIB with "
+ "compression level 1, the TTree default) to 505 (ZSTD with compression level 5). The decision was based "
+ "on empirical evidence available up to that point. New studies summarised at "
+ "https://github.com/root-project/root/pull/21753 show that in certain cases "
+ "compression setting 101 is still the best option for TTree. Thus, this choice is reverted in ROOT "
+ "6.38.06 and later releases. "
+ "In order to suppress this message, set 'ROOT_RDF_SNAPSHOT_INFO=0' in your environment or set "
+ "'ROOT.RDF.Snapshot.Info: 0' in your .rootrc file.";
return true;
}();
// like columnList but with `#var` columns removed
diff --git a/tree/dataframe/inc/ROOT/RSnapshotOptions.hxx b/tree/dataframe/inc/ROOT/RSnapshotOptions.hxx
index ca0ec6d137cb8..4a4daa325e626 100644
--- a/tree/dataframe/inc/ROOT/RSnapshotOptions.hxx
+++ b/tree/dataframe/inc/ROOT/RSnapshotOptions.hxx
@@ -56,13 +56,13 @@ Note that for RNTuple, the defaults correspond to those set in RNTupleWriteOptio
fCompressionAlgorithm |
ROOT::RCompressionSetting::EAlgorithm |
Zstd |
-Compression algorithm for the output dataset |
+Compression algorithm for the output dataset. Defaults to ROOT::RCompressionSetting::EAlgorithm::EValues::kUndefined; when left undefined, Snapshot uses ZLIB for TTree output and ZSTD for RNTuple output |
fCompressionLevel |
int |
5 |
-Compression level for the output dataset |
+Compression level for the output dataset. Defaults to 0 (uncompressed). If `fCompressionAlgorithm` is left at its default value, this level is ignored and Snapshot uses level 1 for TTree output and level 5 for RNTuple output; if `fCompressionAlgorithm` is set explicitly, this level is used as given |
fOutputFormat |
@@ -184,9 +184,8 @@ struct RSnapshotOptions {
}
std::string fMode = "RECREATE"; ///< Mode of creation of output file
ESnapshotOutputFormat fOutputFormat = ESnapshotOutputFormat::kDefault; ///< Which data format to write to
- ECAlgo fCompressionAlgorithm =
- ROOT::RCompressionSetting::EAlgorithm::kZSTD; ///< Compression algorithm of output file
- int fCompressionLevel = 5; ///< Compression level of output file
+ ECAlgo fCompressionAlgorithm = ECAlgo::kUndefined; ///< Compression algorithm of output file
+ int fCompressionLevel = 0; ///< Compression level of output file
bool fLazy = false; ///< Do not start the event loop when Snapshot is called
bool fOverwriteIfExists = false; ///< If fMode is "UPDATE", overwrite object in output file if it already exists
bool fVector2RVec = true; ///< If set to true will convert std::vector columns to RVec when saving to disk
diff --git a/tree/dataframe/src/RDFSnapshotHelpers.cxx b/tree/dataframe/src/RDFSnapshotHelpers.cxx
index ec06e521cb935..4a6065e74467b 100644
--- a/tree/dataframe/src/RDFSnapshotHelpers.cxx
+++ b/tree/dataframe/src/RDFSnapshotHelpers.cxx
@@ -364,6 +364,28 @@ void SetBranchesHelper(TTree *inputTree, TTree &outputTree,
throw std::logic_error(
"RDataFrame::Snapshot: something went wrong when creating a TTree branch, please report this as a bug.");
}
+
+/// Resolve the compression settings to write a Snapshot with, given the user-provided options.
+/// An algorithm left at kUndefined selects the per-format default: 101 for TTree (and kDefault), 505 for RNTuple.
+auto GetSnapshotCompressionSettings(const ROOT::RDF::RSnapshotOptions &options)
+{
+   using Algo = ROOT::RCompressionSetting::EAlgorithm::EValues;
+   using Format = ROOT::RDF::ESnapshotOutputFormat;
+
+   const bool writesTTree = options.fOutputFormat == Format::kTTree || options.fOutputFormat == Format::kDefault;
+   // Any format that is neither TTree-like nor RNTuple is rejected
+   if (!writesTTree && options.fOutputFormat != Format::kRNTuple)
+      throw std::invalid_argument("RDataFrame::Snapshot: unrecognized output format");
+
+   // An explicitly chosen algorithm is passed through together with the requested level
+   if (options.fCompressionAlgorithm != Algo::kUndefined)
+      return ROOT::CompressionSettings(options.fCompressionAlgorithm, options.fCompressionLevel);
+
+   // Algorithm left undefined: fall back to the default of the output format
+   if (writesTTree)
+      return ROOT::CompressionSettings(Algo::kZLIB, 1);
+   return ROOT::CompressionSettings(Algo::kZSTD, 5);
+}
} // namespace
ROOT::Internal::RDF::RBranchData::RBranchData(std::string inputBranchName, std::string outputBranchName, bool isDefine,
@@ -535,8 +557,7 @@ void ROOT::Internal::RDF::UntypedSnapshotTTreeHelper::SetEmptyBranches(TTree *in
void ROOT::Internal::RDF::UntypedSnapshotTTreeHelper::Initialize()
{
fOutputFile.reset(
- TFile::Open(fFileName.c_str(), fOptions.fMode.c_str(), /*ftitle=*/"",
- ROOT::CompressionSettings(fOptions.fCompressionAlgorithm, fOptions.fCompressionLevel)));
+ TFile::Open(fFileName.c_str(), fOptions.fMode.c_str(), /*ftitle=*/"", GetSnapshotCompressionSettings(fOptions)));
if (!fOutputFile)
throw std::runtime_error("Snapshot: could not create output file " + fFileName);
@@ -774,9 +795,9 @@ void ROOT::Internal::RDF::UntypedSnapshotTTreeHelperMT::SetEmptyBranches(TTree *
void ROOT::Internal::RDF::UntypedSnapshotTTreeHelperMT::Initialize()
{
- const auto cs = ROOT::CompressionSettings(fOptions.fCompressionAlgorithm, fOptions.fCompressionLevel);
auto outFile =
- std::unique_ptr{TFile::Open(fFileName.c_str(), fOptions.fMode.c_str(), /*ftitle=*/fFileName.c_str(), cs)};
+ std::unique_ptr{TFile::Open(fFileName.c_str(), fOptions.fMode.c_str(), /*ftitle=*/fFileName.c_str(),
+ GetSnapshotCompressionSettings(fOptions))};
if (!outFile)
throw std::runtime_error("Snapshot: could not create output file " + fFileName);
fOutputFile = outFile.get();
@@ -918,7 +939,7 @@ void ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::Initialize()
model->Freeze();
ROOT::RNTupleWriteOptions writeOptions;
- writeOptions.SetCompression(fOptions.fCompressionAlgorithm, fOptions.fCompressionLevel);
+ writeOptions.SetCompression(GetSnapshotCompressionSettings(fOptions));
writeOptions.SetInitialUnzippedPageSize(fOptions.fInitialUnzippedPageSize);
writeOptions.SetMaxUnzippedPageSize(fOptions.fMaxUnzippedPageSize);
writeOptions.SetApproxZippedClusterSize(fOptions.fApproxZippedClusterSize);
@@ -1140,8 +1161,7 @@ ROOT::Internal::RDF::SnapshotHelperWithVariations::SnapshotHelperWithVariations(
TDirectory::TContext fileCtxt;
fOutputHandle = std::make_shared(
- TFile::Open(filename.data(), fOptions.fMode.c_str(), /*ftitle=*/"",
- ROOT::CompressionSettings(fOptions.fCompressionAlgorithm, fOptions.fCompressionLevel)));
+ TFile::Open(filename.data(), fOptions.fMode.c_str(), /*ftitle=*/"", GetSnapshotCompressionSettings(fOptions)));
if (!fOutputHandle->fFile)
throw std::runtime_error(std::string{"Snapshot: could not create output file "} + std::string{filename});
diff --git a/tree/dataframe/test/dataframe_snapshot.cxx b/tree/dataframe/test/dataframe_snapshot.cxx
index 3a07e265961de..324e85494bd68 100644
--- a/tree/dataframe/test/dataframe_snapshot.cxx
+++ b/tree/dataframe/test/dataframe_snapshot.cxx
@@ -247,6 +247,22 @@ TEST(RDFSnapshotMore, BasketSizePreservation)
TestBasketSizePreservation();
}
+// A Snapshot to TTree with default options must write the file with the TTree default compression (ZLIB, level 1)
+TEST(RDFSnapshotMore, DefaultCompressionSettings)
+{
+   struct FileGuardRAII {
+      std::string fFilename{"RDFSnapshotMore_default_compression_settings.root"};
+      std::string fTreeName{"tree"};
+      ~FileGuardRAII() { std::remove(fFilename.c_str()); }
+   } fileGuard;
+   ROOT::RDataFrame df{1};
+   df.Define("x", [] { return 42; }).Snapshot(fileGuard.fTreeName, fileGuard.fFilename, {"x"});
+   // Reopen the output file and inspect the compression settings it was created with
+   auto f = std::make_unique<TFile>(fileGuard.fFilename.c_str());
+   EXPECT_EQ(f->GetCompressionAlgorithm(), ROOT::RCompressionSetting::EAlgorithm::EValues::kZLIB);
+   EXPECT_EQ(f->GetCompressionLevel(), 1);
+}
+
// fixture that provides fixed and variable sized arrays as RDF columns
class RDFSnapshotArrays : public ::testing::Test {
protected:
diff --git a/tree/dataframe/test/dataframe_snapshot_ntuple.cxx b/tree/dataframe/test/dataframe_snapshot_ntuple.cxx
index 69fad452f0165..03039ce963395 100644
--- a/tree/dataframe/test/dataframe_snapshot_ntuple.cxx
+++ b/tree/dataframe/test/dataframe_snapshot_ntuple.cxx
@@ -170,6 +170,26 @@ TEST(RDFSnapshotRNTuple, WriteOpts)
}
}
+// A Snapshot to RNTuple that only selects the output format must use the RNTuple default compression (505)
+TEST(RDFSnapshotRNTuple, DefaultCompressionSettings)
+{
+   FileRAII fileGuard{"RDFSnapshotRNTuple_default_compression_settings.root"};
+   const std::vector<std::string> columns = {"x"};
+   auto df = ROOT::RDataFrame(25ull).Define("x", [] { return 10; });
+
+   RSnapshotOptions opts;
+   opts.fOutputFormat = ROOT::RDF::ESnapshotOutputFormat::kRNTuple;
+
+   auto sdf = df.Snapshot("ntuple", fileGuard.GetPath(), {"x"}, opts);
+
+   EXPECT_EQ(columns, sdf->GetColumnNames());
+
+   auto reader = RNTupleReader::Open("ntuple", fileGuard.GetPath());
+   auto compSettings = *reader->GetDescriptor().GetClusterDescriptor(0).GetColumnRange(0).GetCompressionSettings();
+   // The RNTuple default should be 505
+   EXPECT_EQ(505, compSettings);
+}
+
TEST(RDFSnapshotRNTuple, Compression)
{
FileRAII fileGuard{"RDFSnapshotRNTuple_compression.root"};