From 0ff13a8d138b5f906403ba693f3b3b430e0bf063 Mon Sep 17 00:00:00 2001 From: Guilherme Seidyo Imai Aldeia Date: Tue, 24 Mar 2026 11:14:33 -0400 Subject: [PATCH 1/5] Enabling log for debugging --- tests/python/test_final_model_selection.py | 23 ++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/tests/python/test_final_model_selection.py b/tests/python/test_final_model_selection.py index 79401a73..7e94c138 100644 --- a/tests/python/test_final_model_selection.py +++ b/tests/python/test_final_model_selection.py @@ -84,8 +84,11 @@ def test_classification_selection(): idx = np.argmin([p.fitness.linear_complexity for p in model.archive_]) -@pytest.mark.parametrize("scorer", ['log', 'accuracy', 'balanced_accuracy', 'average_precision_score']) -@pytest.mark.parametrize("class_weights", ['unbalanced', 'support', [1.0, 1.0], [1.0, 1.3]]) +# @pytest.mark.parametrize("scorer", ['log', 'accuracy', 'balanced_accuracy', 'average_precision_score']) +# @pytest.mark.parametrize("class_weights", ['unbalanced', 'support', [1.0, 1.0], [1.0, 1.3]]) + +@pytest.mark.parametrize("scorer", ['log']) +@pytest.mark.parametrize("class_weights", ['unbalanced']) def test_final_model_selection_best_validation_ci_replicated(scorer, class_weights): # Small dataset for testing X, y = make_classification(n_samples=100, n_features=6, n_informative=4, random_state=42) @@ -146,9 +149,9 @@ def eval(individual, sample=None, log=False): if log: # silencing eval() during bootstrap, but enabling detailed info when re-calculating losses and comparing with brush's metrics print('evaluating', individual.program.get_model()) - print(np.round(y, 2)) - print(np.round(y_pred, 2)) - print('(sorted)', np.sort(y_pred)) + print('rounded y', np.round(y, 2)) + print('rounded preds', np.round(y_pred, 2)) + print('sorted preds', np.sort(y_pred)) if est.class_weights not in ['unbalanced'] and est.parameters_.scorer not in ['balanced_accuracy']: sample_weight = None @@ -181,20 +184,24 @@ def 
eval(individual, sample=None, log=False): print('(eval) sample weights', sample_weight) print('(eval) loss', loss_f(y[sample], y_pred[sample], sample_weight=sample_weight[sample])) - return loss_f(y[sample], y_pred[sample], sample_weight=sample_weight[sample]) + calculated_loss = loss_f(y[sample], y_pred[sample], sample_weight=sample_weight[sample]) + print('calculated loss:', calculated_loss) + return calculated_loss else: # Cases where we ignore weights if log: print('(eval) using no class weights') print('(eval) sample weights not defined. using unbalanced version') print('(eval) loss', loss_f(y[sample], y_pred[sample])) - return loss_f(y[sample], y_pred[sample]) + calculated_loss = loss_f(y[sample], y_pred[sample]) + print('calculated loss:', calculated_loss) + return calculated_loss # Bootstrap validation samples print("scorer and class weights;", scorer, class_weights) print("original loss", est.best_estimator_.fitness.loss) print("original loss_v", est.best_estimator_.fitness.loss_v) - print("recalculated loss", eval(est.best_estimator_)) + print("recalculated loss", eval(est.best_estimator_, log=True)) np.random.seed(0) val_samples = [eval(est.best_estimator_, np.random.randint(len(y), size=len(y))) From 3cbb0f6881a65e7cb6bf84946d74955ffa9ef49d Mon Sep 17 00:00:00 2001 From: Guilherme Seidyo Imai Aldeia Date: Tue, 24 Mar 2026 13:04:44 -0400 Subject: [PATCH 2/5] Improved log messages --- tests/python/test_final_model_selection.py | 25 +++++++++++----------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/tests/python/test_final_model_selection.py b/tests/python/test_final_model_selection.py index 7e94c138..e679ebaa 100644 --- a/tests/python/test_final_model_selection.py +++ b/tests/python/test_final_model_selection.py @@ -84,11 +84,8 @@ def test_classification_selection(): idx = np.argmin([p.fitness.linear_complexity for p in model.archive_]) -# @pytest.mark.parametrize("scorer", ['log', 'accuracy', 'balanced_accuracy', 
'average_precision_score']) -# @pytest.mark.parametrize("class_weights", ['unbalanced', 'support', [1.0, 1.0], [1.0, 1.3]]) - -@pytest.mark.parametrize("scorer", ['log']) -@pytest.mark.parametrize("class_weights", ['unbalanced']) +@pytest.mark.parametrize("scorer", ['log', 'accuracy', 'balanced_accuracy', 'average_precision_score']) +@pytest.mark.parametrize("class_weights", ['unbalanced', 'support', [1.0, 1.0], [1.0, 1.3]]) def test_final_model_selection_best_validation_ci_replicated(scorer, class_weights): # Small dataset for testing X, y = make_classification(n_samples=100, n_features=6, n_informative=4, random_state=42) @@ -133,7 +130,8 @@ def test_final_model_selection_best_validation_ci_replicated(scorer, class_weigh } loss_f = loss_f_dict[est.parameters_.scorer] - def eval(individual, sample=None, log=False): + def eval_with_sklearn(individual, sample=None, log=False): + if sample is None: sample = np.arange(len(data.y)) @@ -147,7 +145,10 @@ def eval(individual, sample=None, log=False): else: y_pred = np.array(individual.predict(data)).astype(float) - if log: # silencing eval() during bootstrap, but enabling detailed info when re-calculating losses and comparing with brush's metrics + eps = 0 + y_pred = np.clip(y_pred, eps, 1-eps) + + if log: # silencing eval_with_sklearn() during bootstrap, but enabling detailed info when re-calculating losses and comparing with brush's metrics print('evaluating', individual.program.get_model()) print('rounded y', np.round(y, 2)) print('rounded preds', np.round(y_pred, 2)) @@ -201,22 +202,22 @@ def eval(individual, sample=None, log=False): print("scorer and class weights;", scorer, class_weights) print("original loss", est.best_estimator_.fitness.loss) print("original loss_v", est.best_estimator_.fitness.loss_v) - print("recalculated loss", eval(est.best_estimator_, log=True)) + print("recalculated loss", eval_with_sklearn(est.best_estimator_, log=True)) np.random.seed(0) - val_samples = [eval(est.best_estimator_, 
np.random.randint(len(y), size=len(y))) + val_samples = [eval_with_sklearn(est.best_estimator_, np.random.randint(len(y), size=len(y))) for _ in range(100)] lower_ci, upper_ci = np.quantile(val_samples, 0.05), np.quantile(val_samples, 0.95) print(f"CI bounds: {lower_ci:.4f}, {upper_ci:.4f}") # Evaluate all archive members - new_losses = [eval(ind, log=True) for ind in est.archive_] + new_losses = [eval_with_sklearn(ind, log=True) for ind in est.archive_] candidates = [(l, p) for l, p in zip(new_losses, est.archive_) if lower_ci <= l <= upper_ci] print('first arch ind', est.archive_[0].get_model()) - print("Original losses from archive (brush's auprc) ", [ind.fitness.loss for ind in est.archive_]) - print("Original losses_v from archive (brush's auprc) ", [ind.fitness.loss_v for ind in est.archive_]) + print("Original losses from archive (brush's metric) ", [ind.fitness.loss for ind in est.archive_]) + print("Original losses_v from archive (brush's metric) ", [ind.fitness.loss_v for ind in est.archive_]) print("Recalculated losses with sklearn (should match)", new_losses) print(f"Num candidates in CI: {len(candidates)}") From 181796d58a6d976079d30feced89943685835719 Mon Sep 17 00:00:00 2001 From: Guilherme Seidyo Imai Aldeia Date: Tue, 24 Mar 2026 13:59:21 -0400 Subject: [PATCH 3/5] Clipping pred_proba to something greater than brush and sklearn eps This avoids different behaviors due to numeric handling of log(0) and log(1) --- tests/python/test_final_model_selection.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/python/test_final_model_selection.py b/tests/python/test_final_model_selection.py index e679ebaa..1740c3e5 100644 --- a/tests/python/test_final_model_selection.py +++ b/tests/python/test_final_model_selection.py @@ -142,11 +142,12 @@ def eval_with_sklearn(individual, sample=None, log=False): if est.parameters_.scorer in ["log", "average_precision_score"]: y_pred = np.array(individual.predict_proba(data)).astype(float) + + 
eps = 0.01 + y_pred = np.clip(y_pred, eps, 1-eps) else: y_pred = np.array(individual.predict(data)).astype(float) - eps = 0 - y_pred = np.clip(y_pred, eps, 1-eps) if log: # silencing eval_with_sklearn() during bootstrap, but enabling detailed info when re-calculating losses and comparing with brush's metrics print('evaluating', individual.program.get_model()) From cadcbe663f81391f81a3eaf80d00618501f36090 Mon Sep 17 00:00:00 2001 From: Guilherme Seidyo Imai Aldeia Date: Tue, 24 Mar 2026 14:19:49 -0400 Subject: [PATCH 4/5] Decreasing the eps a little bit --- tests/python/test_final_model_selection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python/test_final_model_selection.py b/tests/python/test_final_model_selection.py index 1740c3e5..846d6b5a 100644 --- a/tests/python/test_final_model_selection.py +++ b/tests/python/test_final_model_selection.py @@ -143,7 +143,7 @@ def eval_with_sklearn(individual, sample=None, log=False): if est.parameters_.scorer in ["log", "average_precision_score"]: y_pred = np.array(individual.predict_proba(data)).astype(float) - eps = 0.01 + eps = 0.001 y_pred = np.clip(y_pred, eps, 1-eps) else: y_pred = np.array(individual.predict(data)).astype(float) From fa35282785e9e98838cbbdb01ecf21d9a0a9f0b5 Mon Sep 17 00:00:00 2001 From: Guilherme Seidyo Imai Aldeia Date: Tue, 24 Mar 2026 14:28:41 -0400 Subject: [PATCH 5/5] Applying clipping only for log loss This is where brush and sklearn can have differences in calculation --- tests/python/test_final_model_selection.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/python/test_final_model_selection.py b/tests/python/test_final_model_selection.py index 846d6b5a..546d1a2a 100644 --- a/tests/python/test_final_model_selection.py +++ b/tests/python/test_final_model_selection.py @@ -143,8 +143,9 @@ def eval_with_sklearn(individual, sample=None, log=False): if est.parameters_.scorer in ["log", "average_precision_score"]: y_pred = 
np.array(individual.predict_proba(data)).astype(float) - eps = 0.001 - y_pred = np.clip(y_pred, eps, 1-eps) + if est.parameters_.scorer == "log": + eps = 0.001 + y_pred = np.clip(y_pred, eps, 1-eps) else: y_pred = np.array(individual.predict(data)).astype(float)