From f85cec8673a3ee156dcb9b36c6a31ec13b954505 Mon Sep 17 00:00:00 2001 From: Vincenzo Eduardo Padulano Date: Tue, 31 Mar 2026 13:53:24 +0200 Subject: [PATCH] [df] Revert choice to change default Snapshot TTree compression settings https://github.com/root-project/root/commit/61088a368bab8ca7b922c5a43a9a4e04d8dedafc made the deliberate choice to change the default compression settings when calling Snapshot with TTree output format from 101 to 505. This choice was the result of internal discussion within the team, based on the empirical evidence available up to that point that showed ZSTD outperforming ZLIB on all metrics for the TTree datasets (as well as for RNTuple datasets). This commit proposes to revert that choice based on new evidence, summarised at https://github.com/vepadulano/ttree-lossless-compression-studies. The main takeaway message from that study is that TTree datasets with branches of type ROOT::RVec where many (if not all) of the collections are empty are compressed better with ZLIB than with ZSTD. This case is actually quite relevant: most datasets are made of branches with collection types, and as a result of analysis steps these collections may be skimmed quite drastically. Thus, there is enough motivation to move the default compression settings for TTree back to 101. This commit changes the default RSnapshotOptions compression settings to 'kUndefined' for the compression algorithm and '0' for the compression level. When the 'kUndefined' compression algorithm is used, Snapshot will behave differently depending on the output format: the settings will be 101 for TTree and 505 for RNTuple. One test per output format is added to check that the default values are respected. The message shown to users of ROOT 6.38 has been updated to take the new findings into account. 
--- tree/dataframe/inc/ROOT/RDF/RInterface.hxx | 13 ++++--- tree/dataframe/inc/ROOT/RSnapshotOptions.hxx | 9 +++-- tree/dataframe/src/RDFSnapshotHelpers.cxx | 34 +++++++++++++++---- tree/dataframe/test/dataframe_snapshot.cxx | 16 +++++++++ .../test/dataframe_snapshot_ntuple.cxx | 20 +++++++++++ 5 files changed, 75 insertions(+), 17 deletions(-) diff --git a/tree/dataframe/inc/ROOT/RDF/RInterface.hxx b/tree/dataframe/inc/ROOT/RDF/RInterface.hxx index b03a088571aa8..e8871489b05ef 100644 --- a/tree/dataframe/inc/ROOT/RDF/RInterface.hxx +++ b/tree/dataframe/inc/ROOT/RDF/RInterface.hxx @@ -1361,11 +1361,14 @@ public: return true; RLogScopedVerbosity showInfo{ROOT::Detail::RDF::RDFLogChannel(), ROOT::ELogLevel::kInfo}; R__LOG_INFO(ROOT::Detail::RDF::RDFLogChannel()) - << "\n\tIn ROOT 6.38, the default compression settings of Snapshot have been changed from 101 (ZLIB with " - "compression level 1, the TTree default) to 505 (ZSTD with compression level 5). This change may result " - "in smaller Snapshot output dataset size by default. In order to suppress this message, set " - "'ROOT_RDF_SNAPSHOT_INFO=0' in your environment or set 'ROOT.RDF.Snapshot.Info: 0' in your .rootrc " - "file."; + << "\n\tIn ROOT 6.38.00, the default compression settings of Snapshot were changed from 101 (ZLIB with " + "compression level 1, the TTree default) to 505 (ZSTD with compression level 5). The decision was based " + "on empirical evidence available up to that point. New studies summarised at " + "https://github.com/root-project/root/pull/21753 show that in certain cases " + "compression setting 101 is still the best option for TTree. Thus, this choice is reverted in ROOT " + "6.38.06 and later releases. 
" + "In order to suppress this message, set 'ROOT_RDF_SNAPSHOT_INFO=0' in your environment or set " + "'ROOT.RDF.Snapshot.Info: 0' in your .rootrc file."; return true; }(); // like columnList but with `#var` columns removed diff --git a/tree/dataframe/inc/ROOT/RSnapshotOptions.hxx b/tree/dataframe/inc/ROOT/RSnapshotOptions.hxx index ca0ec6d137cb8..4a4daa325e626 100644 --- a/tree/dataframe/inc/ROOT/RSnapshotOptions.hxx +++ b/tree/dataframe/inc/ROOT/RSnapshotOptions.hxx @@ -56,13 +56,13 @@ Note that for RNTuple, the defaults correspond to those set in RNTupleWriteOptio fCompressionAlgorithm ROOT::RCompressionSetting::EAlgorithm Zstd -Compression algorithm for the output dataset +Compression algorithm for the output dataset, defaults to ROOT::RCompressionSetting::EAlgorithm::EValues::kUndefined. This is converted to ZLIB by default for TTree and ZSTD by default for RNTuple fCompressionLevel int 5 -Compression level for the output dataset +Compression level for the output dataset, defaults to 0 (uncompressed). 
If the default value of `fCompressionAlgorithm` is not modified, the compression level is changed to 1 by default for TTree and 5 by default for RNTuple fOutputFormat @@ -184,9 +184,8 @@ struct RSnapshotOptions { } std::string fMode = "RECREATE"; ///< Mode of creation of output file ESnapshotOutputFormat fOutputFormat = ESnapshotOutputFormat::kDefault; ///< Which data format to write to - ECAlgo fCompressionAlgorithm = - ROOT::RCompressionSetting::EAlgorithm::kZSTD; ///< Compression algorithm of output file - int fCompressionLevel = 5; ///< Compression level of output file + ECAlgo fCompressionAlgorithm = ECAlgo::kUndefined; ///< Compression algorithm of output file + int fCompressionLevel = 0; ///< Compression level of output file bool fLazy = false; ///< Do not start the event loop when Snapshot is called bool fOverwriteIfExists = false; ///< If fMode is "UPDATE", overwrite object in output file if it already exists bool fVector2RVec = true; ///< If set to true will convert std::vector columns to RVec when saving to disk diff --git a/tree/dataframe/src/RDFSnapshotHelpers.cxx b/tree/dataframe/src/RDFSnapshotHelpers.cxx index ec06e521cb935..4a6065e74467b 100644 --- a/tree/dataframe/src/RDFSnapshotHelpers.cxx +++ b/tree/dataframe/src/RDFSnapshotHelpers.cxx @@ -364,6 +364,28 @@ void SetBranchesHelper(TTree *inputTree, TTree &outputTree, throw std::logic_error( "RDataFrame::Snapshot: something went wrong when creating a TTree branch, please report this as a bug."); } + +auto GetSnapshotCompressionSettings(const ROOT::RDF::RSnapshotOptions &options) +{ + using CompAlgo = ROOT::RCompressionSetting::EAlgorithm::EValues; + using OutputFormat = ROOT::RDF::ESnapshotOutputFormat; + + if (options.fOutputFormat == OutputFormat::kTTree || options.fOutputFormat == OutputFormat::kDefault) { + // The default compression settings for TTree is 101 + if (options.fCompressionAlgorithm == CompAlgo::kUndefined) { + return ROOT::CompressionSettings(CompAlgo::kZLIB, 1); + } + return 
ROOT::CompressionSettings(options.fCompressionAlgorithm, options.fCompressionLevel); + } else if (options.fOutputFormat == OutputFormat::kRNTuple) { + // The default compression settings for RNTuple is 505 + if (options.fCompressionAlgorithm == CompAlgo::kUndefined) { + return ROOT::CompressionSettings(CompAlgo::kZSTD, 5); + } + return ROOT::CompressionSettings(options.fCompressionAlgorithm, options.fCompressionLevel); + } else { + throw std::invalid_argument("RDataFrame::Snapshot: unrecognized output format"); + } +} } // namespace ROOT::Internal::RDF::RBranchData::RBranchData(std::string inputBranchName, std::string outputBranchName, bool isDefine, @@ -535,8 +557,7 @@ void ROOT::Internal::RDF::UntypedSnapshotTTreeHelper::SetEmptyBranches(TTree *in void ROOT::Internal::RDF::UntypedSnapshotTTreeHelper::Initialize() { fOutputFile.reset( - TFile::Open(fFileName.c_str(), fOptions.fMode.c_str(), /*ftitle=*/"", - ROOT::CompressionSettings(fOptions.fCompressionAlgorithm, fOptions.fCompressionLevel))); + TFile::Open(fFileName.c_str(), fOptions.fMode.c_str(), /*ftitle=*/"", GetSnapshotCompressionSettings(fOptions))); if (!fOutputFile) throw std::runtime_error("Snapshot: could not create output file " + fFileName); @@ -774,9 +795,9 @@ void ROOT::Internal::RDF::UntypedSnapshotTTreeHelperMT::SetEmptyBranches(TTree * void ROOT::Internal::RDF::UntypedSnapshotTTreeHelperMT::Initialize() { - const auto cs = ROOT::CompressionSettings(fOptions.fCompressionAlgorithm, fOptions.fCompressionLevel); auto outFile = - std::unique_ptr{TFile::Open(fFileName.c_str(), fOptions.fMode.c_str(), /*ftitle=*/fFileName.c_str(), cs)}; + std::unique_ptr{TFile::Open(fFileName.c_str(), fOptions.fMode.c_str(), /*ftitle=*/fFileName.c_str(), + GetSnapshotCompressionSettings(fOptions))}; if (!outFile) throw std::runtime_error("Snapshot: could not create output file " + fFileName); fOutputFile = outFile.get(); @@ -918,7 +939,7 @@ void ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::Initialize() 
model->Freeze(); ROOT::RNTupleWriteOptions writeOptions; - writeOptions.SetCompression(fOptions.fCompressionAlgorithm, fOptions.fCompressionLevel); + writeOptions.SetCompression(GetSnapshotCompressionSettings(fOptions)); writeOptions.SetInitialUnzippedPageSize(fOptions.fInitialUnzippedPageSize); writeOptions.SetMaxUnzippedPageSize(fOptions.fMaxUnzippedPageSize); writeOptions.SetApproxZippedClusterSize(fOptions.fApproxZippedClusterSize); @@ -1140,8 +1161,7 @@ ROOT::Internal::RDF::SnapshotHelperWithVariations::SnapshotHelperWithVariations( TDirectory::TContext fileCtxt; fOutputHandle = std::make_shared( - TFile::Open(filename.data(), fOptions.fMode.c_str(), /*ftitle=*/"", - ROOT::CompressionSettings(fOptions.fCompressionAlgorithm, fOptions.fCompressionLevel))); + TFile::Open(filename.data(), fOptions.fMode.c_str(), /*ftitle=*/"", GetSnapshotCompressionSettings(fOptions))); if (!fOutputHandle->fFile) throw std::runtime_error(std::string{"Snapshot: could not create output file "} + std::string{filename}); diff --git a/tree/dataframe/test/dataframe_snapshot.cxx b/tree/dataframe/test/dataframe_snapshot.cxx index 3a07e265961de..324e85494bd68 100644 --- a/tree/dataframe/test/dataframe_snapshot.cxx +++ b/tree/dataframe/test/dataframe_snapshot.cxx @@ -247,6 +247,22 @@ TEST(RDFSnapshotMore, BasketSizePreservation) TestBasketSizePreservation(); } +// Test for default compression settings +TEST(RDFSnapshotMore, DefaultCompressionSettings) +{ + struct FileGuardRAII { + std::string fFilename{"RDFSnapshotMore_default_compression_settings.root"}; + std::string fTreeName{"tree"}; + ~FileGuardRAII() { std::remove(fFilename.c_str()); } + } fileGuard; + ROOT::RDataFrame df{1}; + df.Define("x", [] { return 42; }).Snapshot(fileGuard.fTreeName, fileGuard.fFilename, {"x"}); + + auto f = std::make_unique(fileGuard.fFilename.c_str()); + EXPECT_EQ(f->GetCompressionAlgorithm(), ROOT::RCompressionSetting::EAlgorithm::EValues::kZLIB); + EXPECT_EQ(f->GetCompressionLevel(), 1); +} + // fixture 
that provides fixed and variable sized arrays as RDF columns class RDFSnapshotArrays : public ::testing::Test { protected: diff --git a/tree/dataframe/test/dataframe_snapshot_ntuple.cxx b/tree/dataframe/test/dataframe_snapshot_ntuple.cxx index 69fad452f0165..03039ce963395 100644 --- a/tree/dataframe/test/dataframe_snapshot_ntuple.cxx +++ b/tree/dataframe/test/dataframe_snapshot_ntuple.cxx @@ -170,6 +170,26 @@ TEST(RDFSnapshotRNTuple, WriteOpts) } } +TEST(RDFSnapshotRNTuple, DefaultCompressionSettings) +{ + FileRAII fileGuard{"RDFSnapshotRNTuple_default_compression_settings.root"}; + const std::vector columns = {"x"}; + + auto df = ROOT::RDataFrame(25ull).Define("x", [] { return 10; }); + + RSnapshotOptions opts; + opts.fOutputFormat = ROOT::RDF::ESnapshotOutputFormat::kRNTuple; + + auto sdf = df.Snapshot("ntuple", fileGuard.GetPath(), {"x"}, opts); + + EXPECT_EQ(columns, sdf->GetColumnNames()); + + auto reader = RNTupleReader::Open("ntuple", fileGuard.GetPath()); + auto compSettings = *reader->GetDescriptor().GetClusterDescriptor(0).GetColumnRange(0).GetCompressionSettings(); + // The RNTuple default should be 505 + EXPECT_EQ(505, compSettings); +} + TEST(RDFSnapshotRNTuple, Compression) { FileRAII fileGuard{"RDFSnapshotRNTuple_compression.root"};