Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions tutorials/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ if(MSVC AND NOT win_broken_tests)
list(APPEND dataframe_veto machine_learning/ml_dataloader_TensorFlow.py)
list(APPEND dataframe_veto machine_learning/ml_dataloader_PyTorch.py)
list(APPEND dataframe_veto machine_learning/ml_dataloader_filters_vectors.py)
list(APPEND dataframe_veto machine_learning/ml_dataloader_Higgs_Classification.py)
# df036* and df037* seem to trigger OS errors when trying to delete the
# test files created in the tutorials. It is unclear why.
list(APPEND dataframe_veto analysis/dataframe/df036_missingBranches.C)
Expand Down Expand Up @@ -128,6 +129,7 @@ if (NOT dataframe)
list(APPEND dataframe_veto machine_learning/ml_dataloader_TensorFlow.py)
list(APPEND dataframe_veto machine_learning/ml_dataloader_PyTorch.py)
list(APPEND dataframe_veto machine_learning/ml_dataloader_filters_vectors.py)
list(APPEND dataframe_veto machine_learning/ml_dataloader_Higgs_Classification.py)
# RooFit tutorials depending on RDataFrame
list(APPEND dataframe_veto
roofit/roofit/rf408_RDataFrameToRooFit.C
Expand Down Expand Up @@ -937,6 +939,7 @@ if(pyroot)
file(GLOB requires_torch RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
machine_learning/pytorch/*.py
machine_learning/ml_dataloader_PyTorch.py
machine_learning/ml_dataloader_Higgs_Classification.py
)
file(GLOB requires_xgboost RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
machine_learning/tmva101_Training.py
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
1 change: 1 addition & 0 deletions tutorials/machine_learning/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -137,4 +137,5 @@
| ml_dataloader_NumPy.py | Loading batches of events from a ROOT dataset as Python generators of numpy arrays. |
| ml_dataloader_PyTorch.py | Loading batches of events from a ROOT dataset into a basic PyTorch workflow. |
| ml_dataloader_TensorFlow.py | Loading batches of events from a ROOT dataset into a basic TensorFlow workflow. |
| ml_dataloader_Higgs_Classification | Loading batches of events from different files for a data-normalization workflow. |

238 changes: 238 additions & 0 deletions tutorials/machine_learning/ml_dataloader_Higgs_Classification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,238 @@
## \file
## \ingroup tutorial_dataframe
## \notebook -draw
## The Higgs to four lepton analysis from the ATLAS Open Data release of 2020, with RDataFrame.
##
## This tutorial is a continuation of the HiggsToFourLeptons tutorial.
## We will build a model to classify the data as Higgs or not Higgs.
##
##
## \macro_image
## \macro_code
## \macro_output
##
## \date June 2026
## \authors Jonah Ascoli (CERN), Martin Foll (CERN), Silia Taider (CERN)

import matplotlib.pyplot as plt
import ROOT
import sklearn.metrics as skl
import torch
from torch import nn
from tqdm import tqdm

data_dir = ROOT.gROOT.GetTutorialDir().Data() + "/machine_learning/data/"
df_train = ROOT.RDataFrame("tree", data_dir + "ml_dataloader_Higgs_Classification_train.root")
df_val = ROOT.RDataFrame("tree", data_dir + "ml_dataloader_Higgs_Classification_val.root")
df_test = ROOT.RDataFrame("tree", data_dir + "ml_dataloader_Higgs_Classification_test.root")


# Classifier model with adjustable hidden layers
class Classifier(nn.Module):
def __init__(
self,
num_features: int,
hidden_layers: list[int],
p: float = 0.2,
use_dropout: bool = False,
use_batchnorm: bool = True,
):
super().__init__()

layers = []
in_dim = num_features

for out_dim in hidden_layers:
block = [nn.Linear(in_dim, out_dim)]

if use_batchnorm:
block.append(nn.BatchNorm1d(out_dim))

block.append(nn.ReLU())

if use_dropout:
block.append(nn.Dropout(p))

layers.append(nn.Sequential(*block))
in_dim = out_dim

self.hidden = nn.Sequential(*layers)
self.output_layer = nn.Linear(in_dim, 1)

def forward(self, x):
x = self.hidden(x)
x = self.output_layer(x)
return torch.sigmoid(x)


batch_size = 1000
batches_in_memory = 1000
drop_remainder = True
columns = ["m4l", "good_lep", "goodlep_E", "goodlep_eta", "goodlep_phi", "goodlep_pt", "goodlep_type", "isHiggsRef"]
target = "isHiggsRef"
max_vec_sizes = {"good_lep": 4, "goodlep_E": 4, "goodlep_eta": 4, "goodlep_phi": 4, "goodlep_pt": 4, "goodlep_type": 4}
shuffle = True
set_seed = 42

# Normalize the data!
for var in tqdm(columns[:-1], desc="Normalizing the training data..."):
if var == "m4l": # The only non-vector column
mean = df_train.Mean(var).GetValue()
stddev = df_train.StdDev(var).GetValue()
df_train = df_train.Redefine(var, f"({var} - {mean}) / {stddev}")
# The validation and testing data should be normalized based on the
# mean and standard deviation calculated from the training data.
df_val = df_val.Redefine(var, f"({var} - {mean}) / {stddev}")
df_test = df_test.Redefine(var, f"({var} - {mean}) / {stddev}")
else:
# Each vector event has 4 columns, and we need to take a column-wise mean and stddev
means = []
stddevs = []
for i in range(max_vec_sizes[var]):
scalar_column = f"{var}_{i}"
df_train = df_train.Define(scalar_column, f"{var}[{i}]")
means.append(df_train.Mean(scalar_column).GetValue())
stddevs.append(df_train.StdDev(scalar_column).GetValue())
mean_vec = ROOT.RVec("double")(means)
stddev_vec = ROOT.RVec("double")(stddevs)
for i in range(len(stddevs)):
if stddevs[i] == 0:
stddevs[i] = 0.01 # Avoids division by 0
expr = ", ".join(f"(({var}[{i}] - {means[i]}) / {stddevs[i]})" for i in range(max_vec_sizes[var]))
df_train = df_train.Redefine(var, f"ROOT::RVec<double>{{{expr}}}")
# The validation and testing data should be normalized based on the
# mean and standard deviation calculated from the training data.
df_val = df_val.Redefine(var, f"ROOT::RVec<double>{{{expr}}}")
df_test = df_test.Redefine(var, f"ROOT::RVec<double>{{{expr}}}")

train = ROOT.Experimental.ML.RDataLoader(
df_train,
batch_size=batch_size,
batches_in_memory=batches_in_memory,
drop_remainder=drop_remainder,
columns=columns,
target=target,
max_vec_sizes=max_vec_sizes,
shuffle=shuffle,
set_seed=set_seed,
)
val = ROOT.Experimental.ML.RDataLoader(
df_val,
batch_size=batch_size,
batches_in_memory=batches_in_memory,
drop_remainder=drop_remainder,
columns=columns,
target=target,
max_vec_sizes=max_vec_sizes,
shuffle=shuffle,
set_seed=set_seed,
)
test = ROOT.Experimental.ML.RDataLoader(
df_test,
batch_size=batch_size,
batches_in_memory=batches_in_memory,
drop_remainder=drop_remainder,
columns=columns,
target=target,
max_vec_sizes=max_vec_sizes,
shuffle=shuffle,
set_seed=set_seed,
)


num_features = sum(max_vec_sizes.values()) + len([0 for i in train.train_columns if i not in max_vec_sizes])
# Must be calculated manually since columns is not yet expanded

torch.manual_seed(set_seed)
hidden_layers = [60, 60]
model = Classifier(num_features=num_features, hidden_layers=hidden_layers, p=0.2, use_dropout=False)
loss_fn = nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

epochs = 1000
last_val_losses = [float("inf")] * 6
# Early stopping criterion: most recent 3 avg. losses are worse than the 3 before that
avg_val_losses = []
pbar = tqdm(range(epochs), desc="Initializing training...")
for epoch in pbar:
# training
model.train()

for i, (x_train, y_train) in enumerate(train.as_torch()):
outputs = model(x_train)
loss = loss_fn(outputs, y_train)

optimizer.zero_grad()
loss.backward()
optimizer.step()

# validation
model.eval()
val_loss = 0
val_correct = 0
val_total = 0

with torch.no_grad():
for j, (x_val, y_val) in enumerate(val.as_torch()):
outputs = model(x_val)
loss = loss_fn(outputs, y_val)
val_loss += loss.item()

preds = (outputs > 0.5).float()
val_correct += (preds == y_val).sum().item()
val_total += y_val.size(0)

avg_val_loss = val_loss / (j + 1)
pbar.set_description(f"Avg. val loss: {avg_val_loss}")
avg_val_losses.append(avg_val_loss)
val_accuracy = val_correct / val_total
del last_val_losses[0]
last_val_losses.append(avg_val_loss)
# Early stopping check
if min(last_val_losses[-3:]) > max(last_val_losses[:3]):
print(f"Validation loss has not improved for 6 epochs, stopping training after {epoch + 1} epochs.")
epochs = epoch + 1
break

# Testing
model.eval()
test_loss = 0
test_correct = 0
test_total = 0

test_preds = []
test_true = []
with torch.no_grad():
for j, (x_test, y_test) in enumerate(test.as_torch()):
outputs = model(x_test)
loss = loss_fn(outputs, y_test)
test_loss += loss.item()
test_preds += outputs
test_true += y_test

preds = (outputs > 0.5).float()
test_correct += (preds == y_test).sum().item()
test_total += y_test.size(0)

avg_test_loss = test_loss / (j + 1)
test_accuracy = test_correct / test_total

print(f"Testing Loss: {avg_test_loss:.4f} Accuracy: {test_accuracy:.4f}\n")

# Analysis
fig = plt.figure()
ax = plt.axes()
ax.plot([i for i in range(epochs)], avg_val_losses)
plt.title("Loss curve")
plt.xlabel("Epoch")
plt.ylabel("Validation loss")
plt.savefig("loss_curve")

fpr, tpr, thresholds = skl.roc_curve(test_true, test_preds)
fig = plt.figure()
ax = plt.axes()
ax.plot(fpr[:-1], tpr[:-1])
plt.title("ROC curve")
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.savefig("ROC_curve")
Loading