diff --git a/tree/dataframe/inc/ROOT/RDF/RInterface.hxx b/tree/dataframe/inc/ROOT/RDF/RInterface.hxx index b03a088571aa8..e8871489b05ef 100644 --- a/tree/dataframe/inc/ROOT/RDF/RInterface.hxx +++ b/tree/dataframe/inc/ROOT/RDF/RInterface.hxx @@ -1361,11 +1361,14 @@ public: return true; RLogScopedVerbosity showInfo{ROOT::Detail::RDF::RDFLogChannel(), ROOT::ELogLevel::kInfo}; R__LOG_INFO(ROOT::Detail::RDF::RDFLogChannel()) - << "\n\tIn ROOT 6.38, the default compression settings of Snapshot have been changed from 101 (ZLIB with " - "compression level 1, the TTree default) to 505 (ZSTD with compression level 5). This change may result " - "in smaller Snapshot output dataset size by default. In order to suppress this message, set " - "'ROOT_RDF_SNAPSHOT_INFO=0' in your environment or set 'ROOT.RDF.Snapshot.Info: 0' in your .rootrc " - "file."; + << "\n\tIn ROOT 6.38.00, the default compression settings of Snapshot were changed from 101 (ZLIB with " + "compression level 1, the TTree default) to 505 (ZSTD with compression level 5). The decision was based " + "on empirical evidence available up to that point. New studies summarised at " + "https://github.com/root-project/root/pull/21753 show that in certain cases " + "compression setting 101 is still the best option for TTree. Thus, this choice is reverted in ROOT " + "6.38.06 and later releases. " + "In order to suppress this message, set 'ROOT_RDF_SNAPSHOT_INFO=0' in your environment or set " + "'ROOT.RDF.Snapshot.Info: 0' in your .rootrc file."; return true; }(); // like columnList but with `#var` columns removed diff --git a/tree/dataframe/inc/ROOT/RSnapshotOptions.hxx b/tree/dataframe/inc/ROOT/RSnapshotOptions.hxx index ca0ec6d137cb8..4a4daa325e626 100644 --- a/tree/dataframe/inc/ROOT/RSnapshotOptions.hxx +++ b/tree/dataframe/inc/ROOT/RSnapshotOptions.hxx @@ -56,13 +56,13 @@ Note that for RNTuple, the defaults correspond to those set in RNTupleWriteOptio fCompressionAlgorithm ROOT::RCompressionSetting::EAlgorithm Zstd -Compression algorithm for the output dataset +Compression algorithm for the output dataset, defaults to ROOT::RCompressionSetting::EAlgorithm::EValues::kUndefined. This is converted to ZLIB by default for TTree and ZSTD by default for RNTuple fCompressionLevel int 5 -Compression level for the output dataset +Compression level for the output dataset, defaults to 0 (uncompressed). If the default value of `fCompressionAlgorithm` is not modified, the compression level is changed to 1 by default for TTree and 5 by default for RNTuple fOutputFormat @@ -184,9 +184,8 @@ struct RSnapshotOptions { } std::string fMode = "RECREATE"; ///< Mode of creation of output file ESnapshotOutputFormat fOutputFormat = ESnapshotOutputFormat::kDefault; ///< Which data format to write to - ECAlgo fCompressionAlgorithm = - ROOT::RCompressionSetting::EAlgorithm::kZSTD; ///< Compression algorithm of output file - int fCompressionLevel = 5; ///< Compression level of output file + ECAlgo fCompressionAlgorithm = ECAlgo::kUndefined; ///< Compression algorithm of output file + int fCompressionLevel = 0; ///< Compression level of output file bool fLazy = false; ///< Do not start the event loop when Snapshot is called bool fOverwriteIfExists = false; ///< If fMode is "UPDATE", overwrite object in output file if it already exists bool fVector2RVec = true; ///< If set to true will convert std::vector columns to RVec when saving to disk diff --git a/tree/dataframe/src/RDFSnapshotHelpers.cxx b/tree/dataframe/src/RDFSnapshotHelpers.cxx index ec06e521cb935..4a6065e74467b 100644 --- a/tree/dataframe/src/RDFSnapshotHelpers.cxx +++ b/tree/dataframe/src/RDFSnapshotHelpers.cxx @@ -364,6 +364,28 @@ void SetBranchesHelper(TTree *inputTree, TTree &outputTree, throw std::logic_error( "RDataFrame::Snapshot: something went wrong when creating a TTree branch, please report this as a bug."); } + +auto GetSnapshotCompressionSettings(const ROOT::RDF::RSnapshotOptions &options) +{ + using CompAlgo = ROOT::RCompressionSetting::EAlgorithm::EValues; + using OutputFormat = ROOT::RDF::ESnapshotOutputFormat; + + if (options.fOutputFormat == OutputFormat::kTTree || options.fOutputFormat == OutputFormat::kDefault) { + // The default compression settings for TTree is 101 + if (options.fCompressionAlgorithm == CompAlgo::kUndefined) { + return ROOT::CompressionSettings(CompAlgo::kZLIB, 1); + } + return ROOT::CompressionSettings(options.fCompressionAlgorithm, options.fCompressionLevel); + } else if (options.fOutputFormat == OutputFormat::kRNTuple) { + // The default compression settings for RNTuple is 505 + if (options.fCompressionAlgorithm == CompAlgo::kUndefined) { + return ROOT::CompressionSettings(CompAlgo::kZSTD, 5); + } + return ROOT::CompressionSettings(options.fCompressionAlgorithm, options.fCompressionLevel); + } else { + throw std::invalid_argument("RDataFrame::Snapshot: unrecognized output format"); + } +} } // namespace ROOT::Internal::RDF::RBranchData::RBranchData(std::string inputBranchName, std::string outputBranchName, bool isDefine, @@ -535,8 +557,7 @@ void ROOT::Internal::RDF::UntypedSnapshotTTreeHelper::SetEmptyBranches(TTree *in void ROOT::Internal::RDF::UntypedSnapshotTTreeHelper::Initialize() { fOutputFile.reset( - TFile::Open(fFileName.c_str(), fOptions.fMode.c_str(), /*ftitle=*/"", - ROOT::CompressionSettings(fOptions.fCompressionAlgorithm, fOptions.fCompressionLevel))); + TFile::Open(fFileName.c_str(), fOptions.fMode.c_str(), /*ftitle=*/"", GetSnapshotCompressionSettings(fOptions))); if (!fOutputFile) throw std::runtime_error("Snapshot: could not create output file " + fFileName); @@ -774,9 +795,9 @@ void ROOT::Internal::RDF::UntypedSnapshotTTreeHelperMT::SetEmptyBranches(TTree * void ROOT::Internal::RDF::UntypedSnapshotTTreeHelperMT::Initialize() { - const auto cs = ROOT::CompressionSettings(fOptions.fCompressionAlgorithm, fOptions.fCompressionLevel); auto outFile = - std::unique_ptr{TFile::Open(fFileName.c_str(), fOptions.fMode.c_str(), /*ftitle=*/fFileName.c_str(), cs)}; + std::unique_ptr{TFile::Open(fFileName.c_str(), fOptions.fMode.c_str(), /*ftitle=*/fFileName.c_str(), + GetSnapshotCompressionSettings(fOptions))}; if (!outFile) throw std::runtime_error("Snapshot: could not create output file " + fFileName); fOutputFile = outFile.get(); @@ -918,7 +939,7 @@ void ROOT::Internal::RDF::UntypedSnapshotRNTupleHelper::Initialize() model->Freeze(); ROOT::RNTupleWriteOptions writeOptions; - writeOptions.SetCompression(fOptions.fCompressionAlgorithm, fOptions.fCompressionLevel); + writeOptions.SetCompression(GetSnapshotCompressionSettings(fOptions)); writeOptions.SetInitialUnzippedPageSize(fOptions.fInitialUnzippedPageSize); writeOptions.SetMaxUnzippedPageSize(fOptions.fMaxUnzippedPageSize); writeOptions.SetApproxZippedClusterSize(fOptions.fApproxZippedClusterSize); @@ -1140,8 +1161,7 @@ ROOT::Internal::RDF::SnapshotHelperWithVariations::SnapshotHelperWithVariations( TDirectory::TContext fileCtxt; fOutputHandle = std::make_shared( - TFile::Open(filename.data(), fOptions.fMode.c_str(), /*ftitle=*/"", - ROOT::CompressionSettings(fOptions.fCompressionAlgorithm, fOptions.fCompressionLevel))); + TFile::Open(filename.data(), fOptions.fMode.c_str(), /*ftitle=*/"", GetSnapshotCompressionSettings(fOptions))); if (!fOutputHandle->fFile) throw std::runtime_error(std::string{"Snapshot: could not create output file "} + std::string{filename}); diff --git a/tree/dataframe/test/dataframe_snapshot.cxx b/tree/dataframe/test/dataframe_snapshot.cxx index 3a07e265961de..324e85494bd68 100644 --- a/tree/dataframe/test/dataframe_snapshot.cxx +++ b/tree/dataframe/test/dataframe_snapshot.cxx @@ -247,6 +247,22 @@ TEST(RDFSnapshotMore, BasketSizePreservation) TestBasketSizePreservation(); } +// Test for default compression settings +TEST(RDFSnapshotMore, DefaultCompressionSettings) +{ + struct FileGuardRAII { + std::string fFilename{"RDFSnapshotMore_default_compression_settings.root"}; + std::string fTreeName{"tree"}; + ~FileGuardRAII() { std::remove(fFilename.c_str()); } + } fileGuard; + ROOT::RDataFrame df{1}; + df.Define("x", [] { return 42; }).Snapshot(fileGuard.fTreeName, fileGuard.fFilename, {"x"}); + + auto f = std::make_unique(fileGuard.fFilename.c_str()); + EXPECT_EQ(f->GetCompressionAlgorithm(), ROOT::RCompressionSetting::EAlgorithm::EValues::kZLIB); + EXPECT_EQ(f->GetCompressionLevel(), 1); +} + // fixture that provides fixed and variable sized arrays as RDF columns class RDFSnapshotArrays : public ::testing::Test { protected: diff --git a/tree/dataframe/test/dataframe_snapshot_ntuple.cxx b/tree/dataframe/test/dataframe_snapshot_ntuple.cxx index 69fad452f0165..03039ce963395 100644 --- a/tree/dataframe/test/dataframe_snapshot_ntuple.cxx +++ b/tree/dataframe/test/dataframe_snapshot_ntuple.cxx @@ -170,6 +170,26 @@ TEST(RDFSnapshotRNTuple, WriteOpts) } } +TEST(RDFSnapshotRNTuple, DefaultCompressionSettings) +{ + FileRAII fileGuard{"RDFSnapshotRNTuple_default_compression_settings.root"}; + const std::vector columns = {"x"}; + + auto df = ROOT::RDataFrame(25ull).Define("x", [] { return 10; }); + + RSnapshotOptions opts; + opts.fOutputFormat = ROOT::RDF::ESnapshotOutputFormat::kRNTuple; + + auto sdf = df.Snapshot("ntuple", fileGuard.GetPath(), {"x"}, opts); + + EXPECT_EQ(columns, sdf->GetColumnNames()); + + auto reader = RNTupleReader::Open("ntuple", fileGuard.GetPath()); + auto compSettings = *reader->GetDescriptor().GetClusterDescriptor(0).GetColumnRange(0).GetCompressionSettings(); + // The RNTuple default should be 505 + EXPECT_EQ(505, compSettings); +} + TEST(RDFSnapshotRNTuple, Compression) { FileRAII fileGuard{"RDFSnapshotRNTuple_compression.root"};