diff --git a/tutorials/CMakeLists.txt b/tutorials/CMakeLists.txt index 93f2ab646eb8d..e694819355af1 100644 --- a/tutorials/CMakeLists.txt +++ b/tutorials/CMakeLists.txt @@ -76,6 +76,7 @@ if(MSVC AND NOT win_broken_tests) list(APPEND dataframe_veto machine_learning/ml_dataloader_TensorFlow.py) list(APPEND dataframe_veto machine_learning/ml_dataloader_PyTorch.py) list(APPEND dataframe_veto machine_learning/ml_dataloader_filters_vectors.py) + list(APPEND dataframe_veto machine_learning/ml_dataloader_Higgs_Classification.py) # df036* and df037* seem to trigger OS errors when trying to delete the # test files created in the tutorials. It is unclear why. list(APPEND dataframe_veto analysis/dataframe/df036_missingBranches.C) @@ -128,6 +129,7 @@ if (NOT dataframe) list(APPEND dataframe_veto machine_learning/ml_dataloader_TensorFlow.py) list(APPEND dataframe_veto machine_learning/ml_dataloader_PyTorch.py) list(APPEND dataframe_veto machine_learning/ml_dataloader_filters_vectors.py) + list(APPEND dataframe_veto machine_learning/ml_dataloader_Higgs_Classification.py) # RooFit tutorials depending on RDataFrame list(APPEND dataframe_veto roofit/roofit/rf408_RDataFrameToRooFit.C @@ -937,6 +939,7 @@ if(pyroot) file(GLOB requires_torch RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} machine_learning/pytorch/*.py machine_learning/ml_dataloader_PyTorch.py + machine_learning/ml_dataloader_Higgs_Classification.py ) file(GLOB requires_xgboost RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} machine_learning/tmva101_Training.py diff --git a/tutorials/machine_learning/data/ml_dataloader_Higgs_Classification_test.root b/tutorials/machine_learning/data/ml_dataloader_Higgs_Classification_test.root new file mode 100644 index 0000000000000..746bb05294a95 Binary files /dev/null and b/tutorials/machine_learning/data/ml_dataloader_Higgs_Classification_test.root differ diff --git a/tutorials/machine_learning/data/ml_dataloader_Higgs_Classification_train.root b/tutorials/machine_learning/data/ml_dataloader_Higgs_Classification_train.root new file mode 100644 index 0000000000000..b2ca61fa655d6 Binary files /dev/null and b/tutorials/machine_learning/data/ml_dataloader_Higgs_Classification_train.root differ diff --git a/tutorials/machine_learning/data/ml_dataloader_Higgs_Classification_val.root b/tutorials/machine_learning/data/ml_dataloader_Higgs_Classification_val.root new file mode 100644 index 0000000000000..b41f3e37aa553 Binary files /dev/null and b/tutorials/machine_learning/data/ml_dataloader_Higgs_Classification_val.root differ diff --git a/tutorials/machine_learning/index.md b/tutorials/machine_learning/index.md index 0a39cc3346db2..ae390de1ec8e9 100644 --- a/tutorials/machine_learning/index.md +++ b/tutorials/machine_learning/index.md @@ -137,4 +137,5 @@ | ml_dataloader_NumPy.py | Loading batches of events from a ROOT dataset as Python generators of numpy arrays. | | ml_dataloader_PyTorch.py | Loading batches of events from a ROOT dataset into a basic PyTorch workflow. | | ml_dataloader_TensorFlow.py | Loading batches of events from a ROOT dataset into a basic TensorFlow workflow. | +| ml_dataloader_Higgs_Classification | Loading batches of events from different files for a data-normalization workflow. | diff --git a/tutorials/machine_learning/ml_dataloader_Higgs_Classification.py b/tutorials/machine_learning/ml_dataloader_Higgs_Classification.py new file mode 100644 index 0000000000000..8ec678351b110 --- /dev/null +++ b/tutorials/machine_learning/ml_dataloader_Higgs_Classification.py @@ -0,0 +1,238 @@ +## \file +## \ingroup tutorial_dataframe +## \notebook -draw +## The Higgs to four lepton analysis from the ATLAS Open Data release of 2020, with RDataFrame. +## +## This tutorial is a continuation of the HiggsToFourLeptons tutorial. +## We will build a model to classify the data as Higgs or not Higgs. +## +## +## \macro_image +## \macro_code +## \macro_output +## +## \date June 2026 +## \authors Jonah Ascoli (CERN), Martin Foll (CERN), Silia Taider (CERN) + +import matplotlib.pyplot as plt +import ROOT +import sklearn.metrics as skl +import torch +from torch import nn +from tqdm import tqdm + +data_dir = ROOT.gROOT.GetTutorialDir().Data() + "/machine_learning/data/" +df_train = ROOT.RDataFrame("tree", data_dir + "ml_dataloader_Higgs_Classification_train.root") +df_val = ROOT.RDataFrame("tree", data_dir + "ml_dataloader_Higgs_Classification_val.root") +df_test = ROOT.RDataFrame("tree", data_dir + "ml_dataloader_Higgs_Classification_test.root") + + +# Classifier model with adjustable hidden layers +class Classifier(nn.Module): + def __init__( + self, + num_features: int, + hidden_layers: list[int], + p: float = 0.2, + use_dropout: bool = False, + use_batchnorm: bool = True, + ): + super().__init__() + + layers = [] + in_dim = num_features + + for out_dim in hidden_layers: + block = [nn.Linear(in_dim, out_dim)] + + if use_batchnorm: + block.append(nn.BatchNorm1d(out_dim)) + + block.append(nn.ReLU()) + + if use_dropout: + block.append(nn.Dropout(p)) + + layers.append(nn.Sequential(*block)) + in_dim = out_dim + + self.hidden = nn.Sequential(*layers) + self.output_layer = nn.Linear(in_dim, 1) + + def forward(self, x): + x = self.hidden(x) + x = self.output_layer(x) + return torch.sigmoid(x) + + +batch_size = 1000 +batches_in_memory = 1000 +drop_remainder = True +columns = ["m4l", "good_lep", "goodlep_E", "goodlep_eta", "goodlep_phi", "goodlep_pt", "goodlep_type", "isHiggsRef"] +target = "isHiggsRef" +max_vec_sizes = {"good_lep": 4, "goodlep_E": 4, "goodlep_eta": 4, "goodlep_phi": 4, "goodlep_pt": 4, "goodlep_type": 4} +shuffle = True +set_seed = 42 + +# Normalize the data! +for var in tqdm(columns[:-1], desc="Normalizing the training data..."): + if var == "m4l": # The only non-vector column + mean = df_train.Mean(var).GetValue() + stddev = df_train.StdDev(var).GetValue() + df_train = df_train.Redefine(var, f"({var} - {mean}) / {stddev}") + # The validation and testing data should be normalized based on the + # mean and standard deviation calculated from the training data. + df_val = df_val.Redefine(var, f"({var} - {mean}) / {stddev}") + df_test = df_test.Redefine(var, f"({var} - {mean}) / {stddev}") + else: + # Each vector event has 4 columns, and we need to take a column-wise mean and stddev + means = [] + stddevs = [] + for i in range(max_vec_sizes[var]): + scalar_column = f"{var}_{i}" + df_train = df_train.Define(scalar_column, f"{var}[{i}]") + means.append(df_train.Mean(scalar_column).GetValue()) + stddevs.append(df_train.StdDev(scalar_column).GetValue()) + mean_vec = ROOT.RVec("double")(means) + stddev_vec = ROOT.RVec("double")(stddevs) + for i in range(len(stddevs)): + if stddevs[i] == 0: + stddevs[i] = 0.01 # Avoids division by 0 + expr = ", ".join(f"(({var}[{i}] - {means[i]}) / {stddevs[i]})" for i in range(max_vec_sizes[var])) + df_train = df_train.Redefine(var, f"ROOT::RVec{{{expr}}}") + # The validation and testing data should be normalized based on the + # mean and standard deviation calculated from the training data. + df_val = df_val.Redefine(var, f"ROOT::RVec{{{expr}}}") + df_test = df_test.Redefine(var, f"ROOT::RVec{{{expr}}}") + +train = ROOT.Experimental.ML.RDataLoader( + df_train, + batch_size=batch_size, + batches_in_memory=batches_in_memory, + drop_remainder=drop_remainder, + columns=columns, + target=target, + max_vec_sizes=max_vec_sizes, + shuffle=shuffle, + set_seed=set_seed, +) +val = ROOT.Experimental.ML.RDataLoader( + df_val, + batch_size=batch_size, + batches_in_memory=batches_in_memory, + drop_remainder=drop_remainder, + columns=columns, + target=target, + max_vec_sizes=max_vec_sizes, + shuffle=shuffle, + set_seed=set_seed, +) +test = ROOT.Experimental.ML.RDataLoader( + df_test, + batch_size=batch_size, + batches_in_memory=batches_in_memory, + drop_remainder=drop_remainder, + columns=columns, + target=target, + max_vec_sizes=max_vec_sizes, + shuffle=shuffle, + set_seed=set_seed, +) + + +num_features = sum(max_vec_sizes.values()) + len([0 for i in train.train_columns if i not in max_vec_sizes]) +# Must be calculated manually since columns is not yet expanded + +torch.manual_seed(set_seed) +hidden_layers = [60, 60] +model = Classifier(num_features=num_features, hidden_layers=hidden_layers, p=0.2, use_dropout=False) +loss_fn = nn.BCELoss() +optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9) + +epochs = 1000 +last_val_losses = [float("inf")] * 6 +# Early stopping criterion: most recent 3 avg. losses are worse than the 3 before that +avg_val_losses = [] +pbar = tqdm(range(epochs), desc="Initializing training...") +for epoch in pbar: + # training + model.train() + + for i, (x_train, y_train) in enumerate(train.as_torch()): + outputs = model(x_train) + loss = loss_fn(outputs, y_train) + + optimizer.zero_grad() + loss.backward() + optimizer.step() + + # validation + model.eval() + val_loss = 0 + val_correct = 0 + val_total = 0 + + with torch.no_grad(): + for j, (x_val, y_val) in enumerate(val.as_torch()): + outputs = model(x_val) + loss = loss_fn(outputs, y_val) + val_loss += loss.item() + + preds = (outputs > 0.5).float() + val_correct += (preds == y_val).sum().item() + val_total += y_val.size(0) + + avg_val_loss = val_loss / (j + 1) + pbar.set_description(f"Avg. val loss: {avg_val_loss}") + avg_val_losses.append(avg_val_loss) + val_accuracy = val_correct / val_total + del last_val_losses[0] + last_val_losses.append(avg_val_loss) + # Early stopping check + if min(last_val_losses[-3:]) > max(last_val_losses[:3]): + print(f"Validation loss has not improved for 6 epochs, stopping training after {epoch + 1} epochs.") + epochs = epoch + 1 + break + +# Testing +model.eval() +test_loss = 0 +test_correct = 0 +test_total = 0 + +test_preds = [] +test_true = [] +with torch.no_grad(): + for j, (x_test, y_test) in enumerate(test.as_torch()): + outputs = model(x_test) + loss = loss_fn(outputs, y_test) + test_loss += loss.item() + test_preds += outputs + test_true += y_test + + preds = (outputs > 0.5).float() + test_correct += (preds == y_test).sum().item() + test_total += y_test.size(0) + +avg_test_loss = test_loss / (j + 1) +test_accuracy = test_correct / test_total + +print(f"Testing Loss: {avg_test_loss:.4f} Accuracy: {test_accuracy:.4f}\n") + +# Analysis +fig = plt.figure() +ax = plt.axes() +ax.plot([i for i in range(epochs)], avg_val_losses) +plt.title("Loss curve") +plt.xlabel("Epoch") +plt.ylabel("Validation loss") +plt.savefig("loss_curve") + +fpr, tpr, thresholds = skl.roc_curve(test_true, test_preds) +fig = plt.figure() +ax = plt.axes() +ax.plot(fpr[:-1], tpr[:-1]) +plt.title("ROC curve") +plt.xlabel("False positive rate") +plt.ylabel("True positive rate") +plt.savefig("ROC_curve")