From e539a5b5b22ba0d1dd18201ed56c5a1e8bbf3d4e Mon Sep 17 00:00:00 2001 From: Guilherme Seidyo Imai Aldeia Date: Wed, 22 Apr 2026 17:08:22 -0400 Subject: [PATCH 01/16] Removing old TODO comment --- src/bandit/bandit.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/bandit/bandit.cpp b/src/bandit/bandit.cpp index c76a46f6..deebf3ae 100644 --- a/src/bandit/bandit.cpp +++ b/src/bandit/bandit.cpp @@ -1,5 +1,4 @@ #include "bandit.h" -#include // FOR DEBUGGING PURPOSES. TODO: remove it later namespace Brush { namespace MAB { From dd7bbc482a59c1d8b75afb6a2ce80bb6f45b0c40 Mon Sep 17 00:00:00 2001 From: Guilherme Seidyo Imai Aldeia Date: Wed, 22 Apr 2026 17:08:59 -0400 Subject: [PATCH 02/16] Removed old try catch to pick errors with scalar(w) --- src/program/operator.h | 62 ++++++++++++++---------------------------- 1 file changed, 21 insertions(+), 41 deletions(-) diff --git a/src/program/operator.h b/src/program/operator.h index bda086e1..edf221dc 100644 --- a/src/program/operator.h +++ b/src/program/operator.h @@ -29,53 +29,33 @@ namespace util{ } } - try //TODO: remove this try catch after debugging it - { - w = Scalar(tn.data.W); - } - catch (const std::exception& e) { - std::string err_msg = "Null pointer dereference: *weights is nullptr. " - "TreeNode ret_type: " + std::to_string(static_cast(tn.data.ret_type)) + - ", name: " + tn.data.name; - std::cerr << "[EXCEPTION] get_weight: caught std::exception: " << e.what() << err_msg << std::endl; - throw; // Re-throw to allow crash - } + w = Scalar(tn.data.W); } else { - try //TODO: remove this try catch after debugging it - { - if (*weights == nullptr) { - std::string err_msg = "Null pointer dereference: *weights is nullptr. " - "TreeNode ret_type: " + std::to_string(static_cast(tn.data.ret_type)) + - ", name: " + tn.data.name; - HANDLE_ERROR_THROW("Null pointer dereference: *weights is nullptr. " + err_msg); - } - - // NLS case 1: floating point weight is stored in weights - if constexpr (is_same_v) - w = **weights; + if (*weights == nullptr) { + std::string err_msg = "Null pointer dereference: *weights is nullptr. " + "TreeNode ret_type: " + std::to_string(static_cast(tn.data.ret_type)) + + ", name: " + tn.data.name; + HANDLE_ERROR_THROW("Null pointer dereference: *weights is nullptr. " + err_msg); + } - // NLS case 2: a Jet/Dual weight is stored in weights, but this constant is a - // integer type. We need to do some casting - else if constexpr (is_same_v && is_same_v) { - using WScalar = typename Scalar::Scalar; - WScalar tmp = WScalar((**weights).a); - w = Scalar(tmp); - } - // NLS case 3: a Jet/Dual weight is stored in weights, matching Scalar type - else - w = Scalar(**weights); + // NLS case 1: floating point weight is stored in weights + if constexpr (is_same_v) + w = **weights; - *weights = *weights+1; + // NLS case 2: a Jet/Dual weight is stored in weights, but this constant is a + // integer type. We need to do some casting + else if constexpr (is_same_v && is_same_v) { + using WScalar = typename Scalar::Scalar; + WScalar tmp = WScalar((**weights).a); + w = Scalar(tmp); } - catch (const std::exception& e) { - std::string err_msg = "Null pointer dereference: *weights is nullptr. " - "TreeNode ret_type: " + std::to_string(static_cast(tn.data.ret_type)) + - ", name: " + tn.data.name; - std::cerr << "[EXCEPTION] get_weight: caught std::exception: " << e.what() << err_msg << std::endl; - throw; // Re-throw to allow crash - } + // NLS case 3: a Jet/Dual weight is stored in weights, matching Scalar type + else + w = Scalar(**weights); + + *weights = *weights+1; } return w; }; From 91c0c85857bb9d99d8f65633acb9b19bf2380594 Mon Sep 17 00:00:00 2001 From: Guilherme Seidyo Imai Aldeia Date: Fri, 24 Apr 2026 14:07:42 -0400 Subject: [PATCH 03/16] Deleted solved TODOs --- src/params.h | 2 +- src/program/program.h | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/params.h b/src/params.h index 3924fa2a..dfa41c19 100644 --- a/src/params.h +++ b/src/params.h @@ -38,7 +38,7 @@ struct Parameters unsigned int max_size = 50; vector objectives{"scorer","linear_complexity"}; // scorer should be generic and deducted based on mode - string bandit = "thompson"; // TODO: should I rename dummy? + string bandit = "thompson"; string sel = "lexicase"; //selection method string surv = "nsga2"; //survival method std::unordered_map functions; diff --git a/src/program/program.h b/src/program/program.h index 1167e846..c47c8a16 100644 --- a/src/program/program.h +++ b/src/program/program.h @@ -7,7 +7,6 @@ license: GNU/GPL v3 #define PROGRAM_H //external includes -// #include #include "assert.h" @@ -410,7 +409,7 @@ template struct Program */ string get_dot_model(string extras="") const { - // TODO: make the node names their hash or index, and the node label the nodetype name. + // TODO: make node IDs stable (hash or index) and labels reflect nodetype names. // ref: https://stackoverflow.com/questions/10579041/graphviz-create-new-node-with-this-same-label#10579155 string out = "digraph G {\n"; if (! extras.empty()) From b38e515672659af22a58840985ed1e33d02f8683 Mon Sep 17 00:00:00 2001 From: Guilherme Seidyo Imai Aldeia Date: Fri, 24 Apr 2026 14:08:03 -0400 Subject: [PATCH 04/16] Added bandit initialization checks --- src/bandit/bandit.cpp | 13 +++++++++++-- src/bandit/bandit.h | 4 ++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/src/bandit/bandit.cpp b/src/bandit/bandit.cpp index deebf3ae..e24dd76d 100644 --- a/src/bandit/bandit.cpp +++ b/src/bandit/bandit.cpp @@ -34,8 +34,6 @@ Bandit::Bandit(string type, map arms_probs) : type(type) { } void Bandit::set_bandit() { - // TODO: a flag that is set to true when this function is called. make all - // other methods to raise an error if bandit was not set if (type == "thompson") { pbandit = make_unique(probabilities); } else if (type == "dynamic_thompson") { @@ -45,6 +43,14 @@ void Bandit::set_bandit() { } else { HANDLE_ERROR_THROW("Undefined Selection Operator " + this->type + "\n"); } + + bandit_set = true; +} + +void Bandit::ensure_bandit_set() const { + if (!bandit_set || !pbandit) { + HANDLE_ERROR_THROW("Bandit operator is not set. Call set_bandit() before use.\n"); + } } string Bandit::get_type() { @@ -72,6 +78,7 @@ void Bandit::set_probs(map arms_probs) { } map Bandit::sample_probs(bool update) { + ensure_bandit_set(); map new_probs = this->pbandit->sample_probs(update); // making all probabilities strictly positive @@ -87,10 +94,12 @@ map Bandit::sample_probs(bool update) { } string Bandit::choose() { + ensure_bandit_set(); return this->pbandit->choose(); } void Bandit::update(string arm, float reward) { + ensure_bandit_set(); this->pbandit->update(arm, reward); } diff --git a/src/bandit/bandit.h b/src/bandit/bandit.h index 3ccefdcb..51ded92a 100644 --- a/src/bandit/bandit.h +++ b/src/bandit/bandit.h @@ -117,6 +117,10 @@ struct Bandit * @param reward The received reward. */ void update(string arm, float reward); + +private: + bool bandit_set = false; + void ensure_bandit_set() const; }; //TODO: serialization should save the type of bandit and its parameters From f99444913a0149bdf30ff1e400a903ded69ac37a Mon Sep 17 00:00:00 2001 From: Guilherme Seidyo Imai Aldeia Date: Fri, 24 Apr 2026 14:12:24 -0400 Subject: [PATCH 05/16] Forward declarations to avoid importing .cpp --- src/bindings/bind_engines.h | 12 ------------ src/bindings/bind_evaluator.h | 1 - src/bindings/bind_selection.h | 6 ------ src/bindings/bind_variation.h | 5 ----- src/engine.h | 6 +++++- src/eval/evaluation.cpp | 7 ++++++- src/eval/evaluation.h | 5 +++++ src/pop/archive.cpp | 22 +++++++--------------- src/pop/archive.h | 16 +++++----------- src/pop/population.cpp | 7 ++++++- src/pop/population.h | 5 +++++ src/selection/lexicase.cpp | 5 +++++ src/selection/lexicase.h | 10 ++++++++++ src/selection/nsga2.cpp | 7 ++++++- src/selection/nsga2.h | 5 +++++ src/selection/selection.cpp | 5 +++++ src/selection/selection.h | 5 +++++ src/selection/selection_operator.cpp | 7 ++++++- src/selection/selection_operator.h | 5 +++++ src/vary/variation.h | 12 +++++------- 20 files changed, 91 insertions(+), 62 deletions(-) diff --git a/src/bindings/bind_engines.h b/src/bindings/bind_engines.h index fe31169c..b92a5071 100644 --- a/src/bindings/bind_engines.h +++ b/src/bindings/bind_engines.h @@ -1,34 +1,23 @@ #include "module.h" #include "../engine.h" -#include "../engine.cpp" -// TODO: figure out why do I need to include the whole thing (otherwise it gives me symbol errors) #include "../bandit/bandit.h" #include "../bandit/bandit_operator.h" #include "../bandit/dummy.h" #include "../bandit/thompson.h" #include "../ind/individual.h" -#include "../ind/individual.cpp" #include "../vary/variation.h" -#include "../vary/variation.cpp" #include "../eval/evaluation.h" -#include "../eval/evaluation.cpp" -#include "../pop/population.cpp" #include "../pop/population.h" #include "../selection/selection.h" -#include "../selection/selection.cpp" #include "../selection/selection_operator.h" -#include "../selection/selection_operator.cpp" #include "../selection/nsga2.h" -#include "../selection/nsga2.cpp" #include "../selection/lexicase.h" -#include "../selection/lexicase.cpp" -#include "../pop/archive.cpp" #include "../pop/archive.h" using Reg = Brush::RegressorEngine; @@ -93,7 +82,6 @@ void bind_engine(py::module& m, string name) }, [](nl::json j) { // __setstate__ T p = j; - // TODO: do I need to get the data and ss reference, then call init for this new instance? return p; }) ) diff --git a/src/bindings/bind_evaluator.h b/src/bindings/bind_evaluator.h index 90ea3ab5..75417220 100644 --- a/src/bindings/bind_evaluator.h +++ b/src/bindings/bind_evaluator.h @@ -1,6 +1,5 @@ #include "module.h" #include "../eval/evaluation.h" -#include "../eval/evaluation.cpp" namespace py = pybind11; namespace br = Brush; diff --git a/src/bindings/bind_selection.h b/src/bindings/bind_selection.h index af146f0c..412face1 100644 --- a/src/bindings/bind_selection.h +++ b/src/bindings/bind_selection.h @@ -1,16 +1,10 @@ #include "module.h" -// TODO: figure out why im having symbol errors (if i dont include the cpp here as well) #include "../selection/selection.h" -#include "../selection/selection.cpp" #include "../selection/selection_operator.h" -#include "../selection/selection_operator.cpp" #include "../selection/nsga2.h" -#include "../selection/nsga2.cpp" #include "../selection/lexicase.h" -#include "../selection/lexicase.cpp" -#include "../pop/population.cpp" #include "../pop/population.h" namespace py = pybind11; diff --git a/src/bindings/bind_variation.h b/src/bindings/bind_variation.h index 3e55f884..2b6305ec 100644 --- a/src/bindings/bind_variation.h +++ b/src/bindings/bind_variation.h @@ -1,22 +1,17 @@ #include "module.h" #include "../pop/population.h" -#include "../pop/population.cpp" #include "../bandit/bandit.h" #include "../bandit/bandit_operator.h" #include "../bandit/dummy.h" #include "../bandit/thompson.h" #include "../ind/individual.h" -#include "../ind/individual.cpp" -#include "../simplification/constants.cpp" #include "../simplification/constants.h" -#include "../simplification/inexact.cpp" #include "../simplification/inexact.h" #include "../vary/variation.h" -#include "../vary/variation.cpp" namespace py = pybind11; namespace nl = nlohmann; diff --git a/src/engine.h b/src/engine.h index 4c65802b..9f0527c6 100644 --- a/src/engine.h +++ b/src/engine.h @@ -130,7 +130,6 @@ class Engine{ /// train the model void run(Dataset &d); - // TODO: should params and ss be private? (that would require better json handling) Parameters params; ///< hyperparameters of brush, which the user can interact SearchSpace ss; @@ -161,5 +160,10 @@ NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Engine, params, best_in NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Engine, params, best_ind, archive, pop, ss, is_fitted); NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Engine, params, best_ind, archive, pop, ss, is_fitted); +extern template class Engine; +extern template class Engine; +extern template class Engine; +extern template class Engine; + } // Brush #endif diff --git a/src/eval/evaluation.cpp b/src/eval/evaluation.cpp index b3220da0..b36d8bca 100644 --- a/src/eval/evaluation.cpp +++ b/src/eval/evaluation.cpp @@ -111,4 +111,9 @@ void Evaluation::assign_fit(Individual& ind, const Dataset& data, } } // Pop -} // Brush \ No newline at end of file +} // Brush + +template class Brush::Eval::Evaluation; +template class Brush::Eval::Evaluation; +template class Brush::Eval::Evaluation; +template class Brush::Eval::Evaluation; \ No newline at end of file diff --git a/src/eval/evaluation.h b/src/eval/evaluation.h index b4b1240c..2c988918 100644 --- a/src/eval/evaluation.h +++ b/src/eval/evaluation.h @@ -90,6 +90,11 @@ class Evaluation { // representation program (TODO: implement) }; +extern template class Evaluation; +extern template class Evaluation; +extern template class Evaluation; +extern template class Evaluation; + } //selection } //brush #endif diff --git a/src/pop/archive.cpp b/src/pop/archive.cpp index fd16b833..425e5f46 100644 --- a/src/pop/archive.cpp +++ b/src/pop/archive.cpp @@ -51,26 +51,14 @@ bool Archive::sortObj1(const Individual& lhs, } template -bool Archive::sameFitComplexity(const Individual& lhs, +bool Archive::sameObjectives(const Individual& lhs, const Individual& rhs) { - // TODO: delete this one - return (lhs.fitness == rhs.fitness); // fitness' operator== is overloaded to compare wvalues. // we also check complexity equality to avoid the case where the user // did not specified complexity as one of the objectives - // return (lhs.fitness == rhs.fitness - // && lhs.fitness.complexity == rhs.fitness.complexity); -} - -// TODO: i could get rid of one of these -template -bool Archive::sameObjectives(const Individual& lhs, - const Individual& rhs) -{ - return (lhs.fitness == rhs.fitness); } template @@ -149,7 +137,6 @@ void Archive::update(Population& pop, const Parameters& params) std::sort(individuals.begin(), individuals.end(), &sortObj1); } - /* auto it = std::unique(individuals.begin(),individuals.end(), &sameFitComplexity); */ auto it = std::unique(individuals.begin(),individuals.end(), &sameObjectives); @@ -157,4 +144,9 @@ void Archive::update(Population& pop, const Parameters& params) } } // Pop -} // Brush \ No newline at end of file +} // Brush + +template struct Brush::Pop::Archive; +template struct Brush::Pop::Archive; +template struct Brush::Pop::Archive; +template struct Brush::Pop::Archive; \ No newline at end of file diff --git a/src/pop/archive.h b/src/pop/archive.h index 44c4d5fa..18a45081 100644 --- a/src/pop/archive.h +++ b/src/pop/archive.h @@ -81,17 +81,6 @@ struct Archive */ static bool sortObj1(const Individual& lhs, const Individual& rhs); - /** - * @brief Checks if two individuals have the same fitness complexity. - * - * This static function is used to check if two individuals have the same fitness complexity. - * It is used as a comparison function for finding duplicates in the population. - * - * @param lhs The left-hand side individual to compare. - * @param rhs The right-hand side individual to compare. - */ - static bool sameFitComplexity(const Individual& lhs, const Individual& rhs); - /** * @brief Checks if two individuals have the same objectives. * @@ -104,6 +93,11 @@ struct Archive static bool sameObjectives(const Individual& lhs, const Individual& rhs); }; +extern template struct Archive; +extern template struct Archive; +extern template struct Archive; +extern template struct Archive; + //serialization NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Archive, individuals, sort_complexity, linear_complexity); NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Archive, individuals, sort_complexity, linear_complexity); diff --git a/src/pop/population.cpp b/src/pop/population.cpp index 13318f0c..9082916d 100644 --- a/src/pop/population.cpp +++ b/src/pop/population.cpp @@ -211,7 +211,7 @@ void Population::update(vector> survivors) template string Population::print_models(string sep) { - // TODO: rename it. This function does not print anything, just returns a string + // TODO: rename it. This function returns a string; it does not print. string output = ""; for (int j=0; j::migrate() } // Pop } // Brush + +template class Brush::Pop::Population; +template class Brush::Pop::Population; +template class Brush::Pop::Population; +template class Brush::Pop::Population; diff --git a/src/pop/population.h b/src/pop/population.h index afa4560a..9dfef905 100644 --- a/src/pop/population.h +++ b/src/pop/population.h @@ -97,6 +97,11 @@ class Population{ }; }; +extern template class Population; +extern template class Population; +extern template class Population; +extern template class Population; + NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Population, individuals, island_indexes, pop_size, num_islands, mig_prob, linear_complexity); NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Population, diff --git a/src/selection/lexicase.cpp b/src/selection/lexicase.cpp index c04527ff..1cbccfdc 100644 --- a/src/selection/lexicase.cpp +++ b/src/selection/lexicase.cpp @@ -180,3 +180,8 @@ vector Lexicase::survive(Population& pop, int island, } } + +template class Brush::Sel::Lexicase; +template class Brush::Sel::Lexicase; +template class Brush::Sel::Lexicase; +template class Brush::Sel::Lexicase; diff --git a/src/selection/lexicase.h b/src/selection/lexicase.h index 9613bfcb..1b986508 100644 --- a/src/selection/lexicase.h +++ b/src/selection/lexicase.h @@ -31,8 +31,18 @@ class Lexicase : public SelectionOperator /// lexicase survival vector survive(Population& pop, int island, const Parameters& p); + + void set_lexicase_pool(vector s) { this->lexicase_pool = s; } + +private: + vector lexicase_pool; }; +extern template class Lexicase; +extern template class Lexicase; +extern template class Lexicase; +extern template class Lexicase; + } // Sel } // Brush #endif \ No newline at end of file diff --git a/src/selection/nsga2.cpp b/src/selection/nsga2.cpp index 57eac371..4fa8bb7a 100644 --- a/src/selection/nsga2.cpp +++ b/src/selection/nsga2.cpp @@ -253,4 +253,9 @@ void NSGA2::crowding_distance(Population& pop, vector>& front, } } // selection -} // Brush \ No newline at end of file +} // Brush + +template class Brush::Sel::NSGA2; +template class Brush::Sel::NSGA2; +template class Brush::Sel::NSGA2; +template class Brush::Sel::NSGA2; \ No newline at end of file diff --git a/src/selection/nsga2.h b/src/selection/nsga2.h index f883d832..73496031 100644 --- a/src/selection/nsga2.h +++ b/src/selection/nsga2.h @@ -77,6 +77,11 @@ class NSGA2 : public SelectionOperator size_t tournament(Population& pop, size_t i, size_t j) const; }; + extern template class NSGA2; + extern template class NSGA2; + extern template class NSGA2; + extern template class NSGA2; + } // selection } // Brush #endif \ No newline at end of file diff --git a/src/selection/selection.cpp b/src/selection/selection.cpp index f1097d02..ad8f1363 100644 --- a/src/selection/selection.cpp +++ b/src/selection/selection.cpp @@ -64,3 +64,8 @@ vector Selection::survive(Population& pop, int island, } // Sel } // Brush + +template struct Brush::Sel::Selection; +template struct Brush::Sel::Selection; +template struct Brush::Sel::Selection; +template struct Brush::Sel::Selection; diff --git a/src/selection/selection.h b/src/selection/selection.h index 2ab6c344..85f91290 100644 --- a/src/selection/selection.h +++ b/src/selection/selection.h @@ -47,6 +47,11 @@ struct Selection const Parameters& params); }; +extern template struct Selection; +extern template struct Selection; +extern template struct Selection; +extern template struct Selection; + } // Sel } // Brush #endif \ No newline at end of file diff --git a/src/selection/selection_operator.cpp b/src/selection/selection_operator.cpp index f4268b55..93a99e5a 100644 --- a/src/selection/selection_operator.cpp +++ b/src/selection/selection_operator.cpp @@ -26,4 +26,9 @@ vector SelectionOperator::survive(Population& pop, int island, }; } // selection -} // Brush \ No newline at end of file +} // Brush + +template class Brush::Sel::SelectionOperator; +template class Brush::Sel::SelectionOperator; +template class Brush::Sel::SelectionOperator; +template class Brush::Sel::SelectionOperator; \ No newline at end of file diff --git a/src/selection/selection_operator.h b/src/selection/selection_operator.h index da56eddb..7446a076 100644 --- a/src/selection/selection_operator.h +++ b/src/selection/selection_operator.h @@ -59,6 +59,11 @@ class SelectionOperator virtual vector survive(Population& pop, int island, const Parameters& p); }; +extern template class SelectionOperator; +extern template class SelectionOperator; +extern template class SelectionOperator; +extern template class SelectionOperator; + } // selection } // Brush #endif diff --git a/src/vary/variation.h b/src/vary/variation.h index a7f61547..3efb76f2 100644 --- a/src/vary/variation.h +++ b/src/vary/variation.h @@ -317,7 +317,6 @@ class Variation { ind.program.fit(data.get_training_data()); // simplify before calculating fitness (order matters, as they are not refitted and constants simplifier does not replace with the right value.) - // TODO: constants_simplifier should set the correct value for the constant (so we dont have to refit). // simplify constants first to avoid letting the lsh simplifier to visit redundant branches if (parameters.constants_simplification && do_simplification) @@ -604,12 +603,6 @@ class Variation { Inexact_simplifier inexact_simplifier; }; -// // Explicitly instantiate the template for brush program types -// template class Variation; -// template class Variation; -// template class Variation; -// template class Variation; - class MutationBase { public: using Iter = tree::pre_order_iterator; @@ -633,6 +626,11 @@ class MutationBase { const Parameters& params); }; +extern template class Variation; +extern template class Variation; +extern template class Variation; +extern template class Variation; + } //namespace Var } //namespace Brush #endif \ No newline at end of file From 55e9659f5339693cf3c7a906d408844c6880873c Mon Sep 17 00:00:00 2001 From: Guilherme Seidyo Imai Aldeia Date: Fri, 24 Apr 2026 14:12:53 -0400 Subject: [PATCH 06/16] Clearing old if statement used for debugging --- src/vary/search_space.cpp | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/vary/search_space.cpp b/src/vary/search_space.cpp index aed8d86c..639155a6 100644 --- a/src/vary/search_space.cpp +++ b/src/vary/search_space.cpp @@ -1,5 +1,5 @@ #include "search_space.h" -#include "../program/program.h" // TODO: dont import this header here +#include "../program/program.h" namespace Brush{ @@ -285,11 +285,6 @@ std::optional> SearchSpace::sample_subtree(Node root, int max_d, int return std::nullopt; // it will always have a terminal (because we create constants). - // TODO: I guess I can remove this line below and it will still work - // if ( (terminal_map.find(root.ret_type) == terminal_map.end()) - // || (!has_solution_space(terminal_weights.at(root.ret_type).begin(), - // terminal_weights.at(root.ret_type).end())) ) - // return std::nullopt; auto Tree = tree(); auto spot = Tree.insert(Tree.begin(), root); From 30344e44b0fd74cd20f9efaefc6d906c54239029 Mon Sep 17 00:00:00 2001 From: Guilherme Seidyo Imai Aldeia Date: Fri, 24 Apr 2026 14:13:39 -0400 Subject: [PATCH 07/16] New test for data. improved train-validation split logic (fixed leaking) --- src/data/data.cpp | 18 +++++++----------- tests/cpp/test_data.cpp | 22 ++++++++++++++++++++-- 2 files changed, 27 insertions(+), 13 deletions(-) diff --git a/src/data/data.cpp b/src/data/data.cpp index 3ba6a260..ea1ca2cf 100644 --- a/src/data/data.cpp +++ b/src/data/data.cpp @@ -247,7 +247,7 @@ void Dataset::init() back_inserter(validation_data_idx), [&](int element) { return element; }); } - else if (classification && true) // figuring out training and validation data indexes + else if (classification) // figuring out training and validation data indexes { // Stratified split for classification problems. TODO: parameters to change stratify behavior? (and set false by default) std::map> class_indices; // TODO: I think I can remove many std:: from the code.. for (size_t i = 0; i < n_samples; ++i) { @@ -270,15 +270,11 @@ void Dataset::init() std::transform(idx.begin(), idx.begin() + n_train_samples, back_inserter(training_data_idx), [&](int element) { return indices[element]; }); - - if (n_class_samples - n_train_samples == 0) - { - // same indices from the training data to the validation data - std::transform(idx.begin(), idx.begin() + n_train_samples, - back_inserter(validation_data_idx), - [&](int element) { return indices[element]; }); - } - else + + // stratified split so train/validation never overlap when a class is + // too small. Now, if a class has no remaining samples for validation, + // it contributes only to training (validation gets none for that class). + if (n_class_samples - n_train_samples > 0) { std::transform(idx.begin() + n_train_samples, idx.end(), back_inserter(validation_data_idx), @@ -290,7 +286,7 @@ void Dataset::init() // logic for non-classification problems vector idx(n_samples); - if (shuffle_split) // TODO: make sure this works with multiple threads and fixed random state + if (shuffle_split) idx = r.shuffled_index(n_samples); else std::iota(idx.begin(), idx.end(), 0); diff --git a/tests/cpp/test_data.cpp b/tests/cpp/test_data.cpp index 2ae1a449..f5f154eb 100644 --- a/tests/cpp/test_data.cpp +++ b/tests/cpp/test_data.cpp @@ -154,8 +154,6 @@ TEST(Data, ShuffleTrueFalse) 2 , 1 , 3 , 2.1, 3.7, -5.2; - X.transposeInPlace(); - ArrayXf y(20); y << 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, @@ -183,4 +181,24 @@ TEST(Data, ShuffleTrueFalse) Dataset dt6(X, y, {}, {}, {}, true, 1.0, 1.0, false); // TODO: write some assertions here + const int total = dt1.get_n_samples(); + + ASSERT_TRUE(dt1.use_validation); + ASSERT_TRUE(dt2.use_validation); + ASSERT_EQ(dt1.get_training_data().get_n_samples() + dt1.get_validation_data().get_n_samples(), total); + ASSERT_EQ(dt2.get_training_data().get_n_samples() + dt2.get_validation_data().get_n_samples(), total); + + ASSERT_FALSE(dt3.use_validation); + ASSERT_FALSE(dt4.use_validation); + ASSERT_EQ(dt3.get_training_data().get_n_samples(), total); + ASSERT_EQ(dt3.get_validation_data().get_n_samples(), total); + ASSERT_EQ(dt4.get_training_data().get_n_samples(), total); + ASSERT_EQ(dt4.get_validation_data().get_n_samples(), total); + + ASSERT_FALSE(dt5.use_validation); + ASSERT_FALSE(dt6.use_validation); + ASSERT_EQ(dt5.get_training_data().get_n_samples(), total); + ASSERT_EQ(dt5.get_validation_data().get_n_samples(), total); + ASSERT_EQ(dt6.get_training_data().get_n_samples(), total); + ASSERT_EQ(dt6.get_validation_data().get_n_samples(), total); } From 622d034ddd2052a08d83f42c64792a826fe274d0 Mon Sep 17 00:00:00 2001 From: Guilherme Seidyo Imai Aldeia Date: Fri, 24 Apr 2026 14:14:59 -0400 Subject: [PATCH 08/16] Removed useless declaration --- src/program/node.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/program/node.cpp b/src/program/node.cpp index 4e1adf0f..00bc6f04 100644 --- a/src/program/node.cpp +++ b/src/program/node.cpp @@ -321,9 +321,6 @@ void from_json(const json &j, Node& p) { j.at("W").get_to(p.W); } - - json new_json = p; } - } From 75a4643bc011ef18a9d9d813b62dce6bcbd753c7 Mon Sep 17 00:00:00 2001 From: Guilherme Seidyo Imai Aldeia Date: Fri, 24 Apr 2026 14:15:30 -0400 Subject: [PATCH 09/16] Removed TODO note. using fixed threshold for const simplification --- src/simplification/constants.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/simplification/constants.h b/src/simplification/constants.h index 8db1c676..845e3b66 100644 --- a/src/simplification/constants.h +++ b/src/simplification/constants.h @@ -59,7 +59,7 @@ namespace Brush { namespace Simpl{ HANDLE_ERROR_THROW("No predict available for the class."); } - if (variance(branch_pred) < 1e-5) // TODO: calculate threshold based on data + if (variance(branch_pred) < 1e-6) { // get constant equivalent to its argtype (all data types should have // a constant defined in the search space for its given type). It will be From 567795f631d4e54643e33c316b3680314f8421af Mon Sep 17 00:00:00 2001 From: Guilherme Seidyo Imai Aldeia Date: Fri, 24 Apr 2026 14:16:16 -0400 Subject: [PATCH 10/16] Cleaning more TODO statements --- src/ind/individual.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/ind/individual.h b/src/ind/individual.h index 41188316..852d55c7 100644 --- a/src/ind/individual.h +++ b/src/ind/individual.h @@ -13,7 +13,7 @@ namespace Pop{ template class Individual{ -public: // TODO: make these private (and work with nlohman json) +public: Program program; ///< executable data structure // store just info that we dont have a getter. size, depth, complexity: they can all be obtained with program. @@ -64,7 +64,6 @@ class Individual{ variation = "born"; }; - // TODO: replace occurences of program.fit with these (also predict and predict_proba) Individual &fit(const Dataset& data) { program.fit(data); this->is_fitted_ = true; @@ -127,7 +126,6 @@ class Individual{ }; /// set parent ids using parents void set_parents(const vector& parents){ parent_id = parents; }; /// set parent ids using id values - // TODO: USE setters and getters intead of accessing it directly // template // void Individual::set_objectives(const vector& objectives) @@ -156,7 +154,6 @@ class Individual{ vector weights; weights.resize(0); for (const auto& obj : objectives) { - // TODO: do i need to use find or this can be done directly? auto it = weightsMap.find(obj); if (it != weightsMap.end()) { weights.push_back(it->second); @@ -189,7 +186,7 @@ void to_json(json &j, const Individual &p) template void from_json(const json &j, Individual& p) -{// TODO: figure out if this works with private attributes and try to actually make them private (and use getters and setters) +{ j.at("program").get_to( p.program ); j.at("fitness").get_to( p.fitness ); j.at("id").get_to( p.id ); From 90bd0c1dd012d0624848369fea20da37d2a18b26 Mon Sep 17 00:00:00 2001 From: Guilherme Seidyo Imai Aldeia Date: Fri, 24 Apr 2026 14:16:37 -0400 Subject: [PATCH 11/16] Forward declaration of engines --- src/engine.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/engine.cpp b/src/engine.cpp index dfee3490..a6891b10 100644 --- a/src/engine.cpp +++ b/src/engine.cpp @@ -610,4 +610,9 @@ void Engine::run(Dataset &data) //When you have tasks that are created at runtime (e.g., subflow, // cudaFlow), you need to execute the graph first to spawn these tasks and dump the entire graph. } -} \ No newline at end of file +} + +template class Brush::Engine; +template class Brush::Engine; +template class Brush::Engine; +template class Brush::Engine; \ No newline at end of file From 41e18d5651d58675ecded98f6cca5a71b6290605 Mon Sep 17 00:00:00 2001 From: Guilherme Seidyo Imai Aldeia Date: Fri, 24 Apr 2026 14:17:01 -0400 Subject: [PATCH 12/16] Using enums, improving comparisons --- src/vary/variation.cpp | 180 ++++++++++++++++++++++++++++++----------- 1 file changed, 134 insertions(+), 46 deletions(-) diff --git a/src/vary/variation.cpp b/src/vary/variation.cpp index a63112c4..c2df980e 100644 --- a/src/vary/variation.cpp +++ b/src/vary/variation.cpp @@ -8,6 +8,60 @@ using namespace Brush; using namespace Pop; using namespace MAB; +namespace { +enum class MutationType { + Point, + Insert, + Delete, + Subtree, + ToggleWeightOn, + ToggleWeightOff, + Crossover, + Unknown +}; + +MutationType mutation_type_from_string(const std::string& choice) { + if (choice == "point") + return MutationType::Point; + if (choice == "insert") + return MutationType::Insert; + if (choice == "delete") + return MutationType::Delete; + if (choice == "subtree") + return MutationType::Subtree; + if (choice == "toggle_weight_on") + return MutationType::ToggleWeightOn; + if (choice == "toggle_weight_off") + return MutationType::ToggleWeightOff; + if (choice == "cx") + return MutationType::Crossover; + return MutationType::Unknown; +} + +const char* mutation_type_to_string(MutationType choice) { + switch (choice) { + case MutationType::Point: + return "point"; + case MutationType::Insert: + return "insert"; + case MutationType::Delete: + return "delete"; + case MutationType::Subtree: + return "subtree"; + case MutationType::ToggleWeightOn: + return "toggle_weight_on"; + case MutationType::ToggleWeightOff: + return "toggle_weight_off"; + case MutationType::Crossover: + return "cx"; + case MutationType::Unknown: + return "unknown"; + } + + return "unknown"; +} +} // namespace + /// @brief replace node with same typed node /// @param prog the program /// @param Tree the program tree @@ -249,7 +303,7 @@ class ToggleWeightOffMutation : public MutationBase static auto mutate(Program& program, Iter spot, Variation& variator, const Parameters& params) { - if (spot.node->data.get_is_weighted()==false) // TODO: This condition should never happen. Make sure it dont, then remove it. (this is also true for toggleweighton, also fix that) + if (spot.node->data.get_is_weighted()==false) // TODO: This condition should never happen. Verified by find_spots; keep guard for safety. (this is also true for toggleweighton, also fix that) return false; spot.node->data.set_is_weighted(false); @@ -390,7 +444,7 @@ class SplitMutation : public MutationBase }; /** - * @brief Stochastically swaps subtrees between root and other, returning a new program. + * @brief Stochastically swaps subtrees between parents, returning a new individual. * * The spot where the cross will take place in the `root` parent is sampled * based on attribute `get_prob_change` of each node in the tree. After selecting @@ -401,16 +455,16 @@ class SplitMutation : public MutationBase * candidate to replace the spot node. In this case, the method returns * `std::nullopt` (and has overloads so it can be used in a boolean context). * - * If the cross succeeds, the child program can be accessed through the + * If the cross succeeds, the child individual can be accessed through the * `.value()` attribute of the `std::optional`. * TODO: update this documentation (it doesnt take the program but the individual. also update mutation documentation) - * This means that, if you use the cross as `auto opt = mutate(parent, SS)`, + * This means that, if you use the cross as `auto opt = cross(mom, dad)`, * either `opt==false` or `opt.value()` contains the child. * * @tparam T the program type - * @param root the root parent - * @param other the donating parent - * @return `std::optional` that may contain the child program of type `T` + * @param mom the first parent + * @param dad the donating parent + * @return `std::optional` that may contain the child individual of type `T` */ template std::optional> Variation::cross( @@ -450,7 +504,7 @@ std::optional> Variation::cross( { // There is no spot that has a probability to be selected return std::nullopt; } - + // pick a subtree to insert. Selection is based on other_weights Program other(dad.program); @@ -507,7 +561,7 @@ std::optional> Variation::cross( child.Tree.move_ontop(child_spot, other_spot); Individual ind(child); - ind.set_variation("cx"); // TODO: use enum here to make it faster + ind.set_variation(mutation_type_to_string(MutationType::Crossover)); return ind; } @@ -579,25 +633,39 @@ std::optional> Variation::mutate( // picking a valid mutation option choice = r.random_choice(parameters.mutation_probs); } + + const auto mutation_choice = mutation_type_from_string(choice); + if (mutation_choice == MutationType::Unknown) { + std::string msg = fmt::format("{} not a valid mutation choice", choice); + HANDLE_ERROR_THROW(msg); + } Program copy(parent.program); vector weights; // choose location by weighted sampling of program - if (choice.compare("point") == 0) // TODO: use enum here to optimize - weights = PointMutation::find_spots(copy, (*this), parameters); - else if (choice.compare("insert") == 0) - weights = InsertMutation::find_spots(copy, (*this), parameters); - else if (choice.compare("delete") == 0) - weights = DeleteMutation::find_spots(copy, (*this), parameters); - else if (choice.compare("subtree") == 0) - weights = SubtreeMutation::find_spots(copy, (*this), parameters); - else if (choice.compare("toggle_weight_on") == 0) - weights = ToggleWeightOnMutation::find_spots(copy, (*this), parameters); - else if (choice.compare("toggle_weight_off") == 0) - weights = ToggleWeightOffMutation::find_spots(copy, (*this), parameters); - else { - std::string msg = fmt::format("{} not a valid mutation choice", choice); - HANDLE_ERROR_THROW(msg); + switch (mutation_choice) { + case MutationType::Point: + weights = PointMutation::find_spots(copy, (*this), parameters); + break; + case MutationType::Insert: + weights = InsertMutation::find_spots(copy, (*this), parameters); + break; + case MutationType::Delete: + weights = DeleteMutation::find_spots(copy, (*this), parameters); + break; + case MutationType::Subtree: + weights = SubtreeMutation::find_spots(copy, (*this), parameters); + break; + case MutationType::ToggleWeightOn: + weights = ToggleWeightOnMutation::find_spots(copy, (*this), parameters); + break; + case MutationType::ToggleWeightOff: + weights = ToggleWeightOffMutation::find_spots(copy, (*this), parameters); + break; + case MutationType::Crossover: + case MutationType::Unknown: + HANDLE_ERROR_THROW("Crossover is not a valid mutation choice\n"); + break; } if (std::all_of(weights.begin(), weights.end(), [](const auto& w) { @@ -621,18 +689,30 @@ std::optional> Variation::mutate( // program tree. Here we call the mutation function and return the result bool success; - if (choice.compare("point") == 0) - success = PointMutation::mutate(child, spot, (*this), parameters); - else if (choice.compare("insert") == 0) - success = InsertMutation::mutate(child, spot, (*this), parameters); - else if (choice.compare("delete") == 0) - success = DeleteMutation::mutate(child, spot, (*this), parameters); - else if (choice.compare("subtree") == 0) - success = SubtreeMutation::mutate(child, spot, (*this), parameters); - else if (choice.compare("toggle_weight_on") == 0) - success = ToggleWeightOnMutation::mutate(child, spot, (*this), parameters); - else // it must be"toggle_weight_off" - success = ToggleWeightOffMutation::mutate(child, spot, (*this), parameters); + switch (mutation_choice) { + case MutationType::Point: + success = PointMutation::mutate(child, spot, (*this), parameters); + break; + case MutationType::Insert: + success = InsertMutation::mutate(child, spot, (*this), parameters); + break; + case MutationType::Delete: + success = DeleteMutation::mutate(child, spot, (*this), parameters); + break; + case MutationType::Subtree: + success = SubtreeMutation::mutate(child, spot, (*this), parameters); + break; + case MutationType::ToggleWeightOn: + success = ToggleWeightOnMutation::mutate(child, spot, (*this), parameters); + break; + case MutationType::ToggleWeightOff: + success = ToggleWeightOffMutation::mutate(child, spot, (*this), parameters); + break; + case MutationType::Crossover: + case MutationType::Unknown: + success = false; + break; + } if (// strict mutation --- returns only valid solutions. ( success @@ -656,7 +736,6 @@ std::optional> Variation::mutate( if (choice.compare("point") == 0 || choice.compare("insert") == 0 || choice.compare("delete") == 0 - // || choice.compare("subtree") == 0 // TODO: disable this one ) { ind.set_sampled_nodes({spot.node->data}); } @@ -760,9 +839,6 @@ template void Variation::update_ss() { // propagate bandits learnt information to the search space. - // TODO: not all arms are initialized, if the user set something to zero then we must - // disable it. So, during update, we need to properly handle these skipped arms. --> remove this for nodes, allow it just for variations. If the user doesnt want to use a feature or op, he should not set it at the first place. We need to do this with variations because the user - // can choose it directly instead of letting brush to figure out. // variation: getting new probabilities for variation operators auto variation_probs = variation_bandit.sample_probs(true); @@ -787,12 +863,14 @@ void Variation::update_ss() search_space.terminal_map.at(datatype).end(), [&](auto& node) { return node.get_feature() == terminal_name; }); - // if (it != search_space.terminal_map.at(datatype).end()) { - auto index = std::distance(search_space.terminal_map.at(datatype).begin(), it); + if (it == search_space.terminal_map.at(datatype).end()) { + continue; + } + + auto index = std::distance(search_space.terminal_map.at(datatype).begin(), it); - // Update the terminal weights with the second value - search_space.terminal_weights.at(datatype)[index] = terminal_prob; - // } + // Update the terminal weights with the second value + search_space.terminal_weights.at(datatype)[index] = terminal_prob; } } @@ -802,14 +880,19 @@ void Variation::update_ss() auto op_probs = bandit.sample_probs(true); for (auto& [op_name, op_prob] : op_probs) { - + bool updated = false; for (const auto& [node_type, node_value]: search_space.node_map.at(ret_type).at(args_type)) { if (node_value.name == op_name) { search_space.node_map_weights.at(ret_type).at(args_type).at(node_type) = op_prob; + updated = true; + break; } } + if (!updated) { + continue; + } } } } @@ -817,3 +900,8 @@ void Variation::update_ss() } //namespace Var } //namespace Brush + +template class Brush::Var::Variation; +template class Brush::Var::Variation; +template class Brush::Var::Variation; +template class Brush::Var::Variation; From bc05619303116bbc2a9d264560f1960a9867d0ba Mon Sep 17 00:00:00 2001 From: Guilherme Seidyo Imai Aldeia Date: Fri, 24 Apr 2026 14:18:19 -0400 Subject: [PATCH 13/16] Update test include statements. simple metrics test --- tests/cpp/test_brush.cpp | 19 +------- tests/cpp/test_evaluation.cpp | 30 +++++++++++++ tests/cpp/test_individuals.cpp | 80 +++++++++++++++++++++++++++++++++- tests/cpp/test_population.cpp | 7 --- tests/cpp/testsHeader.h | 7 --- 5 files changed, 110 insertions(+), 33 deletions(-) diff --git a/tests/cpp/test_brush.cpp b/tests/cpp/test_brush.cpp index 93fa6378..227f5ebf 100644 --- a/tests/cpp/test_brush.cpp +++ b/tests/cpp/test_brush.cpp @@ -6,7 +6,6 @@ // #include "../../src/program/dispatch_table.h" #include "../../src/data/io.h" #include "../../src/engine.h" -#include "../../src/engine.cpp" #include "../../src/selection/selection.h" #include "../../src/selection/selection_operator.h" #include "../../src/selection/nsga2.h" @@ -17,27 +16,11 @@ #include "../../src/simplification/constants.h" #include "../../src/simplification/inexact.h" -// TODO: omg i need to figure out why my code only works if i import basically the whole stuff. It seems to be related to templating -#include "../../src/selection/selection.cpp" -#include "../../src/selection/selection_operator.cpp" -#include "../../src/selection/nsga2.cpp" -#include "../../src/selection/lexicase.cpp" -#include "../../src/eval/evaluation.cpp" -#include "../../src/pop/archive.cpp" -#include "../../src/pop/population.cpp" -// #include "../../src/bandit/bandit.cpp" -// #include "../../src/bandit/bandit_operator.cpp" -// #include "../../src/bandit/dummy.cpp" -// #include "../../src/bandit/thompson.cpp" -#include "../../src/simplification/constants.cpp" -#include "../../src/simplification/inexact.cpp" - // TODO: test predict from archive // TODO: rename it to test_engine - // TODO: test serialization of archive (get archive and save to json) - // TODO: test logger, verbose, print stats, etc. + TEST(Engine, EngineWorks) { MatrixXf X(10,2); diff --git a/tests/cpp/test_evaluation.cpp b/tests/cpp/test_evaluation.cpp index 5b3d4fd2..9b40a462 100644 --- a/tests/cpp/test_evaluation.cpp +++ b/tests/cpp/test_evaluation.cpp @@ -54,6 +54,36 @@ TEST(Evaluation, accuracy) ASSERT_EQ(((int)(score*10000)), 3999); } +TEST(Evaluation, ScorerRegressionMSE) +{ + VectorXf y(3), yhat(3), loss_expected(3), loss(3); + y << 1.0, 2.0, 3.0; + yhat << 1.0, 4.0, 2.0; + + float expected = mse(y, yhat, loss_expected); + + Scorer scorer("mse"); + float actual = scorer.score(y, yhat, loss, {}); + + ASSERT_NEAR(actual, expected, 1e-6); + ASSERT_TRUE(loss.isApprox(loss_expected, 1e-6)); +} + +TEST(Evaluation, ScorerBinaryAccuracy) +{ + VectorXf y(4), yhat(4), loss_expected(4), loss(4); + y << 0.0, 1.0, 1.0, 0.0; + yhat << 0.1, 0.9, 0.2, 0.8; + + float expected = zero_one_loss(y, yhat, loss_expected); + + Scorer scorer("accuracy"); + float actual = scorer.score(y, yhat, loss, {}); + + ASSERT_NEAR(actual, expected, 1e-6); + ASSERT_TRUE(loss.isApprox(loss_expected, 1e-6)); +} + // TEST(EvaluationTest, UpdateFitnessTest) { // // TODO: Add test case for update_fitness function diff --git a/tests/cpp/test_individuals.cpp b/tests/cpp/test_individuals.cpp index 5b3e5df6..bbc78489 100644 --- a/tests/cpp/test_individuals.cpp +++ b/tests/cpp/test_individuals.cpp @@ -1,3 +1,81 @@ // TODO: test predict, predict proba, fit. -// TODO: test parent_id and id \ No newline at end of file +// TODO: test parent_id and id + +#include "testsHeader.h" + +using namespace Brush; +using namespace Brush::Pop; + +TEST(Individual, FitAndPredictRegression) +{ + MatrixXf X(4, 2); + ArrayXf y(4); + + X << 1.0, 2.0, + 2.0, 1.0, + 3.0, 0.5, + 4.0, 1.5; + y << 3.0, 3.0, 3.5, 5.5; + + Dataset data(X, y); + SearchSpace ss(data); + + // We must have a SearchSpace reference, so the operator ret-type checks dont + // fail even when feature names look right --- node metadata is consistent. + Parameters params; + RegressorProgram prg = ss.make_regressor(0, 0, params); + Individual ind(prg); + + ASSERT_FALSE(ind.get_is_fitted()); + ind.fit(data); + ASSERT_TRUE(ind.get_is_fitted()); + + auto y_pred = ind.predict(data); + ASSERT_EQ(y_pred.size(), y.size()); +} + +TEST(Individual, PredictProbaBinaryClassifier) +{ + MatrixXf X(6, 2); + ArrayXf y(6); + + X << 0.0, 1.0, + 1.0, 0.0, + 0.5, 0.5, + 0.2, 0.8, + 0.8, 0.2, + 1.0, 1.0; + y << 0.0, 1.0, 0.0, 1.0, 1.0, 0.0; + + Dataset data(X, y, {}, {}, {}, true); + SearchSpace ss(data); + + Parameters params; + params.set_n_classes(data.y); + params.set_sample_weights(data.y); + + ClassifierProgram prg = ss.make_classifier(0, 0, params); + Individual ind(prg); + + auto prob = ind.predict_proba(data); + ASSERT_EQ(prob.size(), y.size()); +} + +TEST(Individual, ParentIdAndId) +{ + Individual p1; + Individual p2; + Individual child; + + p1.set_id(3); + p2.set_id(7); + child.set_id(11); + + child.set_parents(std::vector>{p1, p2}); + + ASSERT_EQ(child.id, 11u); + ASSERT_EQ(child.parent_id.size(), 2u); + ASSERT_EQ(child.parent_id.at(0), 3u); + ASSERT_EQ(child.parent_id.at(1), 7u); +} \ No newline at end of file diff --git a/tests/cpp/test_population.cpp b/tests/cpp/test_population.cpp index 0f5aebf3..8994e927 100644 --- a/tests/cpp/test_population.cpp +++ b/tests/cpp/test_population.cpp @@ -1,12 +1,5 @@ #include "testsHeader.h" -#include "../../src/ind/individual.cpp" -#include "../../src/pop/population.cpp" // TODO: figure out if thats ok to include cpps instead of headers -#include "../../src/eval/evaluation.cpp" -#include "../../src/selection/nsga2.cpp" -#include "../../src/selection/lexicase.cpp" -#include "../../src/selection/selection_operator.cpp" -#include "../../src/selection/selection.cpp" // #include "../../src/bandit/bandit.cpp" // #include "../../src/bandit/bandit_operator.cpp" diff --git a/tests/cpp/testsHeader.h b/tests/cpp/testsHeader.h index 2e9f56c9..e8be0227 100644 --- a/tests/cpp/testsHeader.h +++ b/tests/cpp/testsHeader.h @@ -22,10 +22,6 @@ using std::stoi; using std::to_string; using std::stof; -// this is a compiler-specific hack and a bad practice. TODO: delete it -// #define private public - -// TODO: remove these lots of imports and keep only essential stuff #include #include "../../src/init.h" #include "../../src/params.h" @@ -35,7 +31,6 @@ using std::stof; #include "../../src/program/program.h" #include "../../src/ind/individual.h" #include "../../src/vary/search_space.h" -#include "../../src/params.h" #include "../../src/vary/variation.h" #include "../../src/selection/selection.h" #include "../../src/selection/selection_operator.h" @@ -52,8 +47,6 @@ using std::stof; #include "../../src/simplification/constants.h" #include "../../src/simplification/inexact.h" -// TODO: is this ok? (otherwise I would have to create a test separated file, or move the implementation to the header) -#include "../../src/vary/variation.cpp" using namespace Brush; using namespace Brush::Data; From e41316f633f4c6480b5ba551d482e77357975c55 Mon Sep 17 00:00:00 2001 From: Guilherme Seidyo Imai Aldeia Date: Sun, 26 Apr 2026 20:11:31 -0400 Subject: [PATCH 14/16] Small improvements --- src/eval/metrics.cpp | 50 ++++++++++++++++++++++++---------- src/selection/selection.cpp | 4 +-- src/selection/selection.h | 2 +- tests/cpp/test_individuals.cpp | 1 + tests/python/test_params.py | 2 +- 5 files changed, 41 insertions(+), 18 deletions(-) diff --git a/src/eval/metrics.cpp b/src/eval/metrics.cpp index 0b522c7b..891c0b67 100644 --- a/src/eval/metrics.cpp +++ b/src/eval/metrics.cpp @@ -141,23 +141,34 @@ float average_precision_score(const VectorXf& y, const VectorXf& predict_proba, int num_instances = y.size(); float eps = 1e-4f; // first we set the loss vector values + float tie_tol = 1e-7f; + + // Guard against NaN/Inf and out-of-range probabilities to keep sort + // comparators and downstream math well-defined. + vector p_clean(num_instances, 0.5f); loss.resize(num_instances); for (int i = 0; i < num_instances; ++i) { float p = predict_proba(i); + if (!std::isfinite(p)) { + p = 0.5f; + } + if (p < eps) { + p = eps; + } else if (p > 1.0f - eps) { + p = 1.0f - eps; + } + p_clean[i] = p; // The loss vector is used in lexicase selection. we need to set something useful here // that does make sense on individual level. Using log loss here. - if (p < eps || 1 - p < eps) - loss(i) = -(y(i)*log(eps) + (1-y(i))*log(1-eps)); - else - loss(i) = -(y(i)*log(p) + (1-y(i))*log(1-p)); + loss(i) = -(y(i)*log(p) + (1-y(i))*log(1-p)); } // get argsort of predict proba (descending) vector order(num_instances); iota(order.begin(), order.end(), 0); stable_sort(order.begin(), order.end(), [&](int i, int j) { - return predict_proba(i) > predict_proba(j); // descending + return p_clean[i] > p_clean[j]; // descending }); float ysum = 0.0f; @@ -168,8 +179,18 @@ float average_precision_score(const VectorXf& y, const VectorXf& predict_proba, int idx = order[i]; y_sorted[i] = y(idx); - p_sorted[i] = predict_proba(idx); - w_sorted[i] = class_weights.empty() ? 1.0f : class_weights.at(y(idx)); + p_sorted[i] = p_clean[idx]; + + if (class_weights.empty()) { + w_sorted[i] = 1.0f; + } else { + int cls = static_cast(std::round(y_sorted[i])); + if (cls < 0 || cls >= static_cast(class_weights.size())) { + w_sorted[i] = 1.0f; + } else { + w_sorted[i] = class_weights[cls]; + } + } ysum += y_sorted[i] * w_sorted[i]; } @@ -182,7 +203,7 @@ float average_precision_score(const VectorXf& y, const VectorXf& predict_proba, // detect constant prediction case (all p_sorted equal within tolerance). // because p_sorted is sorted, the first element is the maximum, and the last is the minimum, - if (abs(p_sorted.back() - p_sorted.front()) <= eps) { + if (fabs(p_sorted.back() - p_sorted.front()) <= tie_tol) { // All predictions are (effectively) constant. float total_weight = std::accumulate(w_sorted.begin(), w_sorted.end(), 0.0f); @@ -192,12 +213,13 @@ float average_precision_score(const VectorXf& y, const VectorXf& predict_proba, } // Find the indexes where prediction changes, so we can treat it as one block - vector unique_indices = {}; // this one will be used to calculate the AUC - set unique_probas = {}; // keep track of unique elements (this wont be used other than that) - - for (int i=0; i unique_indices = {}; + unique_indices.push_back(0); + for (int i = 1; i < num_instances; ++i) { + if (fabs(p_sorted[i] - p_sorted[i - 1]) > tie_tol) { unique_indices.push_back(i); + } + } unique_indices.push_back(num_instances); // last index is the number of elements @@ -223,7 +245,7 @@ float average_precision_score(const VectorXf& y, const VectorXf& predict_proba, // integrate PR curve float average_precision = 0.0f; - for (size_t i = 0; i < num_instances; ++i) { + for (size_t i = 0; i < precision.size() - 1; ++i) { average_precision += (recall[i+1] - recall[i]) * precision[i+1]; } diff --git a/src/selection/selection.cpp b/src/selection/selection.cpp index ad8f1363..f16f37d4 100644 --- a/src/selection/selection.cpp +++ b/src/selection/selection.cpp @@ -30,9 +30,9 @@ template void Selection::set_operator() { if (this->type == "nsga2") - pselector = new NSGA2(survival); + pselector = std::make_shared>(survival); else if (this->type == "lexicase") - pselector = new Lexicase(survival); + pselector = std::make_shared>(survival); else HANDLE_ERROR_THROW("Undefined Selection Operator " + this->type + "\n"); diff --git a/src/selection/selection.h b/src/selection/selection.h index 85f91290..865507c1 100644 --- a/src/selection/selection.h +++ b/src/selection/selection.h @@ -24,7 +24,7 @@ template struct Selection { public: - SelectionOperator* pselector; // TODO: THIS SHOULD BE A SHARED POINTER + std::shared_ptr> pselector; string type; bool survival; diff --git a/tests/cpp/test_individuals.cpp b/tests/cpp/test_individuals.cpp index bbc78489..c00dcb44 100644 --- a/tests/cpp/test_individuals.cpp +++ b/tests/cpp/test_individuals.cpp @@ -58,6 +58,7 @@ TEST(Individual, PredictProbaBinaryClassifier) ClassifierProgram prg = ss.make_classifier(0, 0, params); Individual ind(prg); + ind.fit(data); auto prob = ind.predict_proba(data); ASSERT_EQ(prob.size(), y.size()); } diff --git a/tests/python/test_params.py b/tests/python/test_params.py index ee59fec4..07ec6693 100644 --- a/tests/python/test_params.py +++ b/tests/python/test_params.py @@ -273,7 +273,7 @@ def test_fitness_weights_match_scorer_sign(scorer, expected_weights): # Choose estimator type based on scorer # (by default objectives are ["scorer", "linear_complexity"]) - if scorer in ("mse"): + if scorer in ("mse", ): # add more metrics for regression when I implement them est = BrushRegressor(scorer=scorer, pop_size=20, max_gens=10, verbosity=0) est.fit(X, y_reg) else: From 151aaa0b1c8f54f518ae6ec4253a556aa1a78260 Mon Sep 17 00:00:00 2001 From: Guilherme Seidyo Imai Aldeia Date: Mon, 27 Apr 2026 09:41:08 -0400 Subject: [PATCH 15/16] Fixed segfault in test_params. AUPRC do not handle validation splits with 1 sample --- pybrush/BrushEstimator.py | 1 - src/eval/metrics.cpp | 48 ++++++++++--------------------------- tests/python/test_params.py | 21 +++++++++++----- 3 files changed, 28 insertions(+), 42 deletions(-) diff --git a/pybrush/BrushEstimator.py b/pybrush/BrushEstimator.py index 0a03e1b8..5a74ac7b 100644 --- a/pybrush/BrushEstimator.py +++ b/pybrush/BrushEstimator.py @@ -395,7 +395,6 @@ def predict_proba(self, X): feature_names=self.feature_names_, validation_size=0.0) - prob = self.best_estimator_.program.predict_proba(data) if self.parameters_.n_classes == 2: diff --git a/src/eval/metrics.cpp b/src/eval/metrics.cpp index 891c0b67..c1f261e9 100644 --- a/src/eval/metrics.cpp +++ b/src/eval/metrics.cpp @@ -141,34 +141,23 @@ float average_precision_score(const VectorXf& y, const VectorXf& predict_proba, int num_instances = y.size(); float eps = 1e-4f; // first we set the loss vector values - float tie_tol = 1e-7f; - - // Guard against NaN/Inf and out-of-range probabilities to keep sort - // comparators and downstream math well-defined. - vector p_clean(num_instances, 0.5f); loss.resize(num_instances); for (int i = 0; i < num_instances; ++i) { float p = predict_proba(i); - if (!std::isfinite(p)) { - p = 0.5f; - } - if (p < eps) { - p = eps; - } else if (p > 1.0f - eps) { - p = 1.0f - eps; - } - p_clean[i] = p; // The loss vector is used in lexicase selection. we need to set something useful here // that does make sense on individual level. Using log loss here. - loss(i) = -(y(i)*log(p) + (1-y(i))*log(1-p)); + if (p < eps || 1 - p < eps) + loss(i) = -(y(i)*log(eps) + (1-y(i))*log(1-eps)); + else + loss(i) = -(y(i)*log(p) + (1-y(i))*log(1-p)); } // get argsort of predict proba (descending) vector order(num_instances); iota(order.begin(), order.end(), 0); stable_sort(order.begin(), order.end(), [&](int i, int j) { - return p_clean[i] > p_clean[j]; // descending + return predict_proba(i) > predict_proba(j); // descending }); float ysum = 0.0f; @@ -179,18 +168,8 @@ float average_precision_score(const VectorXf& y, const VectorXf& predict_proba, int idx = order[i]; y_sorted[i] = y(idx); - p_sorted[i] = p_clean[idx]; - - if (class_weights.empty()) { - w_sorted[i] = 1.0f; - } else { - int cls = static_cast(std::round(y_sorted[i])); - if (cls < 0 || cls >= static_cast(class_weights.size())) { - w_sorted[i] = 1.0f; - } else { - w_sorted[i] = class_weights[cls]; - } - } + p_sorted[i] = predict_proba(idx); + w_sorted[i] = class_weights.empty() ? 1.0f : class_weights.at(y(idx)); ysum += y_sorted[i] * w_sorted[i]; } @@ -203,7 +182,7 @@ float average_precision_score(const VectorXf& y, const VectorXf& predict_proba, // detect constant prediction case (all p_sorted equal within tolerance). // because p_sorted is sorted, the first element is the maximum, and the last is the minimum, - if (fabs(p_sorted.back() - p_sorted.front()) <= tie_tol) { + if (fabs(p_sorted.back() - p_sorted.front()) <= eps) { // All predictions are (effectively) constant. float total_weight = std::accumulate(w_sorted.begin(), w_sorted.end(), 0.0f); @@ -213,13 +192,12 @@ float average_precision_score(const VectorXf& y, const VectorXf& predict_proba, } // Find the indexes where prediction changes, so we can treat it as one block - vector unique_indices = {}; - unique_indices.push_back(0); - for (int i = 1; i < num_instances; ++i) { - if (fabs(p_sorted[i] - p_sorted[i - 1]) > tie_tol) { + vector unique_indices = {}; // this one will be used to calculate the AUC + set unique_probas = {}; // keep track of unique elements (this wont be used other than that) + + for (int i=0; i np.median(x0)).astype(float) + # Choose estimator type based on scorer # (by default objectives are ["scorer", "linear_complexity"]) if scorer in ("mse", ): # add more metrics for regression when I implement them - est = BrushRegressor(scorer=scorer, pop_size=20, max_gens=10, verbosity=0) + est = BrushRegressor(scorer=scorer, pop_size=20, max_gens=10, verbosity=1) est.fit(X, y_reg) else: - est = BrushClassifier(scorer=scorer, pop_size=20, max_gens=10, verbosity=0) + est = BrushClassifier(scorer=scorer, pop_size=20, max_gens=10, verbosity=1) est.fit(X, y_clf) # Check estimator-level weights From 006473c286a0285b9abf312f7ed7e2e541747d88 Mon Sep 17 00:00:00 2001 From: Guilherme Aldeia Date: Mon, 27 Apr 2026 10:36:43 -0400 Subject: [PATCH 16/16] Change epsilon value in loss calculation --- src/eval/metrics.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/eval/metrics.cpp b/src/eval/metrics.cpp index c1f261e9..16a3f854 100644 --- a/src/eval/metrics.cpp +++ b/src/eval/metrics.cpp @@ -140,7 +140,7 @@ float average_precision_score(const VectorXf& y, const VectorXf& predict_proba, // Assuming y contains binary labels (0 or 1) int num_instances = y.size(); - float eps = 1e-4f; // first we set the loss vector values + float eps = 1e-6f; // first we set the loss vector values loss.resize(num_instances); for (int i = 0; i < num_instances; ++i) { float p = predict_proba(i); @@ -342,4 +342,4 @@ float multi_zero_one_loss(const VectorXf& y, } } // metrics -} // Brush \ No newline at end of file +} // Brush