From d74f4eb4cdff53a4381ba4fc2ec977441a6cbbc7 Mon Sep 17 00:00:00 2001 From: Gernot Maier Date: Thu, 29 Jan 2026 15:36:44 +0100 Subject: [PATCH 1/6] improve pd concat --- src/eventdisplay_ml/data_processing.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/src/eventdisplay_ml/data_processing.py b/src/eventdisplay_ml/data_processing.py index 3a49b43..5ea813e 100644 --- a/src/eventdisplay_ml/data_processing.py +++ b/src/eventdisplay_ml/data_processing.py @@ -730,14 +730,19 @@ def load_training_data(model_configs, file_list, analysis_type): observatory=model_configs.get("observatory", "veritas"), ) if analysis_type == "stereo_analysis": - df_flat["MCxoff"] = _to_numpy_1d(df["MCxoff"], np.float32) - df_flat["MCyoff"] = _to_numpy_1d(df["MCyoff"], np.float32) - df_flat["MCe0"] = np.log10(_to_numpy_1d(df["MCe0"], np.float32)) + new_cols = { + "MCxoff": _to_numpy_1d(df["MCxoff"], np.float32), + "MCyoff": _to_numpy_1d(df["MCyoff"], np.float32), + "MCe0": np.log10(_to_numpy_1d(df["MCe0"], np.float32)), + } elif analysis_type == "classification": - df_flat["ze_bin"] = zenith_in_bins( - 90.0 - _to_numpy_1d(df["ArrayPointing_Elevation"], np.float32), - model_configs.get("zenith_bins_deg", []), - ) + new_cols = { + "ze_bin": zenith_in_bins( + 90.0 - _to_numpy_1d(df["ArrayPointing_Elevation"], np.float32), + model_configs.get("zenith_bins_deg", []), + ) + } + df_flat = pd.concat([df_flat, pd.DataFrame(new_cols, index=df_flat.index)], axis=1) dfs.append(df_flat) From d6da3e9bcff919663912667f9b80518f2d114c3f Mon Sep 17 00:00:00 2001 From: Gernot Maier Date: Thu, 29 Jan 2026 15:58:36 +0100 Subject: [PATCH 2/6] shap per energy --- src/eventdisplay_ml/evaluate.py | 144 ++++++++++++++++++++++++++++++++ 1 file changed, 144 insertions(+) diff --git a/src/eventdisplay_ml/evaluate.py b/src/eventdisplay_ml/evaluate.py index 30acd83..b242038 100644 --- a/src/eventdisplay_ml/evaluate.py +++ b/src/eventdisplay_ml/evaluate.py @@ -82,6 +82,10 @@ def evaluate_regression_model(model, x_test, y_test, df, x_cols, y_data, name): feature_importance(model, x_cols, y_data.columns, name) if name == "xgboost": shap_feature_importance(model, x_test, y_data.columns) + # Optional for now + # shap_feature_importance_by_energy( + # model, x_test, df, y_test, y_data.columns + # ) df_pred = pd.DataFrame(y_pred, columns=target_features("stereo_analysis")) calculate_resolution( @@ -253,3 +257,143 @@ def shap_feature_importance(model, x_data, target_names, max_points=1000, n_top= for j in idx[:n_top]: if j < n_features: _logger.info(f"{x_data.columns[j]:25s} {imp[j]:.6e}") + + +def shap_feature_importance_by_energy( + model, + x_test, + df, + y_test, + target_names, + log_e_min=-2.0, + log_e_max=2.5, + n_bins=9, + max_points=1000, + n_top=5, +): + """Calculate SHAP feature importance for each energy bin. + + Computes SHAP values separately for events in different energy ranges, + allowing analysis of feature importance as a function of energy. + Uses the same energy binning as calculate_resolution for consistency. + Outputs results in tabular format for easy comparison across energy bins. + """ + # Extract energy values and create bins + mce0_values = df.loc[y_test.index, "MCe0"].values + bins = np.linspace(log_e_min, log_e_max, n_bins + 1) + bin_indices = np.digitize(mce0_values, bins) + + n_features = len(x_test.columns) + n_targets = len(target_names) + + # Store importance values for each target across all bins + target_importance_data = {target: {} for target in target_names} + bin_info = [] + + # Collect stratified samples for all bins, then compute SHAP once + sampled_frames = [] + sampled_bin_labels = [] + + for bin_idx in range(1, n_bins + 1): + mask = bin_indices == bin_idx + n_events = mask.sum() + + if n_events == 0: + continue + + bin_lower = bins[bin_idx - 1] + bin_upper = bins[bin_idx] + mean_log_e = mce0_values[mask].mean() + + bin_label = f"LogE={mean_log_e:.2f}" + bin_info.append( + { + "label": bin_label, + "mean_log_e": mean_log_e, + "n_events": n_events, + "range": f"[{bin_lower:.2f}, {bin_upper:.2f}]", + } + ) + + x_bin = x_test.iloc[mask] + n_sample = min(len(x_bin), max_points) + x_sample = x_bin.sample(n=n_sample, random_state=None) + + sampled_frames.append(x_sample) + sampled_bin_labels.extend([bin_label] * len(x_sample)) + + if not sampled_frames: + _logger.info("No events found in any energy bin for SHAP calculation.") + return + + x_sampled_all = pd.concat(sampled_frames, axis=0) + dmatrix = xgb.DMatrix(x_sampled_all) + shap_vals = model.get_booster().predict(dmatrix, pred_contribs=True) + shap_vals = shap_vals.reshape(len(x_sampled_all), n_targets, n_features + 1) + + # Aggregate SHAP importance per bin from the single SHAP run + sampled_bin_labels = np.array(sampled_bin_labels) + for i, target in enumerate(target_names): + target_shap = shap_vals[:, i, :-1] + for info in bin_info: + bin_label = info["label"] + bin_mask = sampled_bin_labels == bin_label + if not np.any(bin_mask): + continue + + imp = np.abs(target_shap[bin_mask]).mean(axis=0) + for j, feature_name in enumerate(x_test.columns): + if feature_name not in target_importance_data[target]: + target_importance_data[target][feature_name] = {} + target_importance_data[target][feature_name][bin_label] = imp[j] + + # Create and display tables for each target + _logger.info(f"\n{'=' * 100}") + _logger.info("SHAP Feature Importance by Energy Bin (Tabular Format)") + _logger.info(f"Calculated over {n_bins} bins [{log_e_min}, {log_e_max}]") + _logger.info(f"{'=' * 100}") + + # Display bin information + _logger.info("\nEnergy Bin Information:") + for info in bin_info: + _logger.info(f" {info['label']:12s}: Range {info['range']:15s}, N = {info['n_events']:6d}") + + for target in target_names: + _logger.info(f"\n\n=== SHAP Importance for {target} ===") + + # Find top N features in each bin, then take union of all top features + all_top_features = set() + for info in bin_info: + bin_label = info["label"] + # Get importance values for this bin + bin_importance = { + feature: values.get(bin_label, 0) + for feature, values in target_importance_data[target].items() + } + # Get top N features for this bin + top_in_bin = sorted(bin_importance.items(), key=lambda x: x[1], reverse=True)[:n_top] + all_top_features.update([f[0] for f in top_in_bin]) + + # Sort features by their average importance across all bins + feature_avg_importance = {} + for feature_name in all_top_features: + values = [ + target_importance_data[target][feature_name].get(info["label"], 0) + for info in bin_info + ] + feature_avg_importance[feature_name] = np.mean(values) + + sorted_features = sorted(feature_avg_importance.items(), key=lambda x: x[1], reverse=True) + + # Build DataFrame with all features that were top N in at least one bin + data_rows = [] + for feature_name, _ in sorted_features: + row = {"Feature": feature_name} + for info in bin_info: + bin_label = info["label"] + value = target_importance_data[target][feature_name].get(bin_label, np.nan) + row[bin_label] = value + data_rows.append(row) + + df_table = pd.DataFrame(data_rows) + _logger.info(f"\n{df_table.to_markdown(index=False, floatfmt='.4e')}") From 9f01384d1438c77ad17f5227b2b1f74e0531e730 Mon Sep 17 00:00:00 2001 From: Gernot Maier Date: Fri, 30 Jan 2026 15:17:15 +0100 Subject: [PATCH 3/6] Update src/eventdisplay_ml/data_processing.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/eventdisplay_ml/data_processing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/eventdisplay_ml/data_processing.py b/src/eventdisplay_ml/data_processing.py index 5ea813e..f2246d8 100644 --- a/src/eventdisplay_ml/data_processing.py +++ b/src/eventdisplay_ml/data_processing.py @@ -742,7 +742,8 @@ def load_training_data(model_configs, file_list, analysis_type): model_configs.get("zenith_bins_deg", []), ) } - df_flat = pd.concat([df_flat, pd.DataFrame(new_cols, index=df_flat.index)], axis=1) + for col_name, values in new_cols.items(): + df_flat[col_name] = values dfs.append(df_flat) From 03c9f2eb564feb8ea6c90be96108f22d647702a2 Mon Sep 17 00:00:00 2001 From: Gernot Maier Date: Fri, 30 Jan 2026 15:17:41 +0100 Subject: [PATCH 4/6] Update src/eventdisplay_ml/evaluate.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/eventdisplay_ml/evaluate.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/eventdisplay_ml/evaluate.py b/src/eventdisplay_ml/evaluate.py index b242038..09fa046 100644 --- a/src/eventdisplay_ml/evaluate.py +++ b/src/eventdisplay_ml/evaluate.py @@ -281,7 +281,10 @@ def shap_feature_importance_by_energy( # Extract energy values and create bins mce0_values = df.loc[y_test.index, "MCe0"].values bins = np.linspace(log_e_min, log_e_max, n_bins + 1) - bin_indices = np.digitize(mce0_values, bins) + # Use pd.cut with include_lowest=True to match calculate_resolution binning + bin_categories = pd.cut(mce0_values, bins=bins, include_lowest=True, right=True) + # Convert categorical bins to 1-based integer indices (NaN -> code -1, becomes 0) + bin_indices = bin_categories.cat.codes + 1 n_features = len(x_test.columns) n_targets = len(target_names) From cb6bf5bafc51d4726a371a1e2438ac40f848f3e0 Mon Sep 17 00:00:00 2001 From: Gernot Maier Date: Fri, 30 Jan 2026 15:18:07 +0100 Subject: [PATCH 5/6] Update src/eventdisplay_ml/evaluate.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/eventdisplay_ml/evaluate.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/eventdisplay_ml/evaluate.py b/src/eventdisplay_ml/evaluate.py index 09fa046..2101240 100644 --- a/src/eventdisplay_ml/evaluate.py +++ b/src/eventdisplay_ml/evaluate.py @@ -308,13 +308,14 @@ def shap_feature_importance_by_energy( bin_upper = bins[bin_idx] mean_log_e = mce0_values[mask].mean() - bin_label = f"LogE={mean_log_e:.2f}" + # Use a stable, unique bin label based on the explicit energy range + bin_label = f"[{bin_lower:.2f}, {bin_upper:.2f}]" bin_info.append( { "label": bin_label, "mean_log_e": mean_log_e, "n_events": n_events, - "range": f"[{bin_lower:.2f}, {bin_upper:.2f}]", + "range": bin_label, } ) From 28406e9f5b5b1be1acc6f50e28451b65bbc99100 Mon Sep 17 00:00:00 2001 From: Gernot Maier Date: Fri, 30 Jan 2026 15:20:47 +0100 Subject: [PATCH 6/6] comments --- src/eventdisplay_ml/evaluate.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/eventdisplay_ml/evaluate.py b/src/eventdisplay_ml/evaluate.py index 2101240..cd49336 100644 --- a/src/eventdisplay_ml/evaluate.py +++ b/src/eventdisplay_ml/evaluate.py @@ -68,7 +68,9 @@ def evaluate_classification_model(model, x_test, y_test, df, x_cols, name): shap_feature_importance(model, x_test, ["label"]) -def evaluate_regression_model(model, x_test, y_test, df, x_cols, y_data, name): +def evaluate_regression_model( + model, x_test, y_test, df, x_cols, y_data, name, shap_per_energy=False +): """Evaluate the trained model on the test set and log performance metrics.""" score = model.score(x_test, y_test) _logger.info(f"XGBoost Multi-Target R^2 Score (Testing Set): {score:.4f}") @@ -82,10 +84,8 @@ def evaluate_regression_model(model, x_test, y_test, df, x_cols, y_data, name): feature_importance(model, x_cols, y_data.columns, name) if name == "xgboost": shap_feature_importance(model, x_test, y_data.columns) - # Optional for now - # shap_feature_importance_by_energy( - # model, x_test, df, y_test, y_data.columns - # ) + if shap_per_energy: + shap_feature_importance_by_energy(model, x_test, df, y_test, y_data.columns) df_pred = pd.DataFrame(y_pred, columns=target_features("stereo_analysis")) calculate_resolution(