From 49cdd8ba667840a6ad4a6acf048cb0dca9501cd6 Mon Sep 17 00:00:00 2001 From: Student2 Date: Tue, 9 Jun 2026 09:33:25 +0200 Subject: [PATCH 1/3] testing new print statement --- .../pythonizations/python/ROOT/_pythonization/_ml_dataloader.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_ml_dataloader.py b/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_ml_dataloader.py index 3ab53859622e4..05f4a55a3ddca 100644 --- a/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_ml_dataloader.py +++ b/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_ml_dataloader.py @@ -4,6 +4,7 @@ # Author: Vincenzo Eduardo Padulano, CERN 10/2024 # Author: Martin Føll, University of Oslo (UiO) & CERN 01/2026 # Author: Silia Taider, CERN 02/2026 +# Author: Jonah Ascoli, CERN 06/2026 ################################################################################ # Copyright (C) 1995-2026, Rene Brun and Fons Rademakers. # @@ -161,6 +162,7 @@ def __init__( """ from ROOT import RDF + print("This is jonah's testing branch") if rdataframes is None: rdataframes = [] From d875a11293457add97decf7ff141029d033a438f Mon Sep 17 00:00:00 2001 From: Student2 Date: Thu, 11 Jun 2026 15:53:21 +0200 Subject: [PATCH 2/3] [tutorials][ML] Add resampling tutorial This Pull request adds a tutorial to show the syntax of resampling --- .../ml_dataloader_resampling.py | 84 +++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 tutorials/machine_learning/ml_dataloader_resampling.py diff --git a/tutorials/machine_learning/ml_dataloader_resampling.py b/tutorials/machine_learning/ml_dataloader_resampling.py new file mode 100644 index 0000000000000..bceee538d1549 --- /dev/null +++ b/tutorials/machine_learning/ml_dataloader_resampling.py @@ -0,0 +1,84 @@ +### \file +### \ingroup tutorial_ml +### \notebook -nodraw +### Example of resampling when one class is underrepresented in the dataset. +### +### \macro_code +### \macro_output +### \author Jonah Ascoli + +import os + +import ROOT +import torch + +torch.manual_seed(42) + +# Create an imbalanced dataset with two classes, one of which is underrepresented. +# Here, we'll create two files, one with even numbers and one with odd numbers, +# and then merge them to form a dataset with underrepresented odd numbers. +ROOT.RDataFrame(100000).Define("b1", "(int) 2 * rdfentry_").Define("b2", "(int) b1%2").Snapshot("tree", "major.root") +ROOT.RDataFrame(100).Define("b1", "(int) 2 * rdfentry_ + 1").Define("b2", "(int) b1%2").Snapshot("tree", "minor.root") +df_major = ROOT.RDataFrame("tree", "major.root") +df_minor = ROOT.RDataFrame("tree", "minor.root") + +batch_size = 16 +batches_in_memory = 10 +num_epochs = 10 + +loss_fn = torch.nn.BCEWithLogitsLoss() + + +def train_model(model, optimizer, dataloader): + train, val = dataloader.train_test_split(test_size=0.2) + for _ in range(num_epochs): + model.train() + for X, y in train.as_torch(): + optimizer.zero_grad() + loss = loss_fn(model(X), y) + loss.backward() + optimizer.step() + losses = [] + for X, y in val.as_torch(): + with torch.no_grad(): + loss = loss_fn(model(X), y) + losses.append(loss.item()) + print(f"Validation Loss: {sum(losses) / len(losses)}") + + +# First, let's try to create a dataloader without resampling and see how it handles the underrepresented class. +dl = ROOT.Experimental.ML.RDataLoader( + [df_major, df_minor], + batch_size=batch_size, + batches_in_memory=batches_in_memory, + target="b2", + set_seed=42, +) + +basic_model = torch.nn.Linear(1, 1) # Simple linear model for binary classification +basic_optimizer = torch.optim.Adam(basic_model.parameters()) + +print("Training without resampling:") +train_model(basic_model, basic_optimizer, dl) + +# Now, let's try the same thing with oversampling +# Strategy: more batches of the underrepresented class +# Takes more time per epoch, but each epoch is more effective +dl_oversampled = ROOT.Experimental.ML.RDataLoader( + [df_major, df_minor], + batch_size=batch_size, + batches_in_memory=batches_in_memory, + target="b2", + set_seed=42, + load_eager=True, # Must be enabled for resampling + sampling_type="oversampling", # Can also be "undersampling" +) + +oversampling_model = torch.nn.Linear(1, 1) +oversampling_optimizer = torch.optim.Adam(oversampling_model.parameters()) + +print("Training with oversampling:") +train_model(oversampling_model, oversampling_optimizer, dl_oversampled) + +os.remove("major.root") +os.remove("minor.root") From 15141b5f97b0297b2e10d5014b97a0440f0e6ae4 Mon Sep 17 00:00:00 2001 From: Student2 Date: Thu, 11 Jun 2026 16:05:02 +0200 Subject: [PATCH 3/3] Remove debugging print statement --- .../pythonizations/python/ROOT/_pythonization/_ml_dataloader.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_ml_dataloader.py b/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_ml_dataloader.py index 05f4a55a3ddca..3ab53859622e4 100644 --- a/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_ml_dataloader.py +++ b/bindings/pyroot/pythonizations/python/ROOT/_pythonization/_ml_dataloader.py @@ -4,7 +4,6 @@ # Author: Vincenzo Eduardo Padulano, CERN 10/2024 # Author: Martin Føll, University of Oslo (UiO) & CERN 01/2026 # Author: Silia Taider, CERN 02/2026 -# Author: Jonah Ascoli, CERN 06/2026 ################################################################################ # Copyright (C) 1995-2026, Rene Brun and Fons Rademakers. # @@ -162,7 +161,6 @@ def __init__( """ from ROOT import RDF - print("This is jonah's testing branch") if rdataframes is None: rdataframes = []