diff --git a/pybrush/BrushEstimator.py b/pybrush/BrushEstimator.py index 0a03e1b8..5a74ac7b 100644 --- a/pybrush/BrushEstimator.py +++ b/pybrush/BrushEstimator.py @@ -395,7 +395,6 @@ def predict_proba(self, X): feature_names=self.feature_names_, validation_size=0.0) - prob = self.best_estimator_.program.predict_proba(data) if self.parameters_.n_classes == 2: diff --git a/src/bandit/bandit.cpp b/src/bandit/bandit.cpp index c76a46f6..e24dd76d 100644 --- a/src/bandit/bandit.cpp +++ b/src/bandit/bandit.cpp @@ -1,5 +1,4 @@ #include "bandit.h" -#include // FOR DEBUGGING PURPOSES. TODO: remove it later namespace Brush { namespace MAB { @@ -35,8 +34,6 @@ Bandit::Bandit(string type, map arms_probs) : type(type) { } void Bandit::set_bandit() { - // TODO: a flag that is set to true when this function is called. make all - // other methods to raise an error if bandit was not set if (type == "thompson") { pbandit = make_unique(probabilities); } else if (type == "dynamic_thompson") { @@ -46,6 +43,14 @@ void Bandit::set_bandit() { } else { HANDLE_ERROR_THROW("Undefined Selection Operator " + this->type + "\n"); } + + bandit_set = true; +} + +void Bandit::ensure_bandit_set() const { + if (!bandit_set || !pbandit) { + HANDLE_ERROR_THROW("Bandit operator is not set. Call set_bandit() before use.\n"); + } } string Bandit::get_type() { @@ -73,6 +78,7 @@ void Bandit::set_probs(map arms_probs) { } map Bandit::sample_probs(bool update) { + ensure_bandit_set(); map new_probs = this->pbandit->sample_probs(update); // making all probabilities strictly positive @@ -88,10 +94,12 @@ map Bandit::sample_probs(bool update) { } string Bandit::choose() { + ensure_bandit_set(); return this->pbandit->choose(); } void Bandit::update(string arm, float reward) { + ensure_bandit_set(); this->pbandit->update(arm, reward); } diff --git a/src/bandit/bandit.h b/src/bandit/bandit.h index 3ccefdcb..51ded92a 100644 --- a/src/bandit/bandit.h +++ b/src/bandit/bandit.h @@ -117,6 +117,10 @@ struct Bandit * @param reward The received reward. */ void update(string arm, float reward); + +private: + bool bandit_set = false; + void ensure_bandit_set() const; }; //TODO: serialization should save the type of bandit and its parameters diff --git a/src/bindings/bind_engines.h b/src/bindings/bind_engines.h index e86a4331..ff788d22 100644 --- a/src/bindings/bind_engines.h +++ b/src/bindings/bind_engines.h @@ -1,34 +1,23 @@ #include "module.h" #include "../engine.h" -#include "../engine.cpp" -// TODO: figure out why do I need to include the whole thing (otherwise it gives me symbol errors) #include "../bandit/bandit.h" #include "../bandit/bandit_operator.h" #include "../bandit/dummy.h" #include "../bandit/thompson.h" #include "../ind/individual.h" -#include "../ind/individual.cpp" #include "../vary/variation.h" -#include "../vary/variation.cpp" #include "../eval/evaluation.h" -#include "../eval/evaluation.cpp" -#include "../pop/population.cpp" #include "../pop/population.h" #include "../selection/selection.h" -#include "../selection/selection.cpp" #include "../selection/selection_operator.h" -#include "../selection/selection_operator.cpp" #include "../selection/nsga2.h" -#include "../selection/nsga2.cpp" #include "../selection/lexicase.h" -#include "../selection/lexicase.cpp" -#include "../pop/archive.cpp" #include "../pop/archive.h" using Reg = Brush::RegressorEngine; @@ -94,7 +83,6 @@ void bind_engine(py::module& m, string name) }, [](nl::json j) { // __setstate__ T p = j; - // TODO: do I need to get the data and ss reference, then call init for this new instance? return p; }) ) diff --git a/src/bindings/bind_evaluator.h b/src/bindings/bind_evaluator.h index 90ea3ab5..75417220 100644 --- a/src/bindings/bind_evaluator.h +++ b/src/bindings/bind_evaluator.h @@ -1,6 +1,5 @@ #include "module.h" #include "../eval/evaluation.h" -#include "../eval/evaluation.cpp" namespace py = pybind11; namespace br = Brush; diff --git a/src/bindings/bind_selection.h b/src/bindings/bind_selection.h index af146f0c..412face1 100644 --- a/src/bindings/bind_selection.h +++ b/src/bindings/bind_selection.h @@ -1,16 +1,10 @@ #include "module.h" -// TODO: figure out why im having symbol errors (if i dont include the cpp here as well) #include "../selection/selection.h" -#include "../selection/selection.cpp" #include "../selection/selection_operator.h" -#include "../selection/selection_operator.cpp" #include "../selection/nsga2.h" -#include "../selection/nsga2.cpp" #include "../selection/lexicase.h" -#include "../selection/lexicase.cpp" -#include "../pop/population.cpp" #include "../pop/population.h" namespace py = pybind11; diff --git a/src/bindings/bind_variation.h b/src/bindings/bind_variation.h index 3e55f884..2b6305ec 100644 --- a/src/bindings/bind_variation.h +++ b/src/bindings/bind_variation.h @@ -1,22 +1,17 @@ #include "module.h" #include "../pop/population.h" -#include "../pop/population.cpp" #include "../bandit/bandit.h" #include "../bandit/bandit_operator.h" #include "../bandit/dummy.h" #include "../bandit/thompson.h" #include "../ind/individual.h" -#include "../ind/individual.cpp" -#include "../simplification/constants.cpp" #include "../simplification/constants.h" -#include "../simplification/inexact.cpp" #include "../simplification/inexact.h" #include "../vary/variation.h" -#include "../vary/variation.cpp" namespace py = pybind11; namespace nl = nlohmann; diff --git a/src/data/data.cpp b/src/data/data.cpp index 832a8f38..d7a23ff0 100644 --- a/src/data/data.cpp +++ b/src/data/data.cpp @@ -312,7 +312,7 @@ void Dataset::init() back_inserter(validation_data_idx), [&](int element) { return element; }); } - else if (classification && true) // figuring out training and validation data indexes + else if (classification) // figuring out training and validation data indexes { // Stratified split for classification problems. TODO: parameters to change stratify behavior? (and set false by default) std::map> class_indices; // TODO: I think I can remove many std:: from the code.. for (size_t i = 0; i < n_samples; ++i) { @@ -335,15 +335,11 @@ void Dataset::init() std::transform(idx.begin(), idx.begin() + n_train_samples, back_inserter(training_data_idx), [&](int element) { return indices[element]; }); - - if (n_class_samples - n_train_samples == 0) - { - // same indices from the training data to the validation data - std::transform(idx.begin(), idx.begin() + n_train_samples, - back_inserter(validation_data_idx), - [&](int element) { return indices[element]; }); - } - else + + // stratified split so train/validation never overlap when a class is + // too small. Now, if a class has no remaining samples for validation, + // it contributes only to training (validation gets none for that class). + if (n_class_samples - n_train_samples > 0) { std::transform(idx.begin() + n_train_samples, idx.end(), back_inserter(validation_data_idx), @@ -355,7 +351,7 @@ void Dataset::init() // logic for non-classification problems vector idx(n_samples); - if (shuffle_split) // TODO: make sure this works with multiple threads and fixed random state + if (shuffle_split) idx = r.shuffled_index(n_samples); else std::iota(idx.begin(), idx.end(), 0); diff --git a/src/engine.cpp b/src/engine.cpp index 4e979c1e..4424b207 100644 --- a/src/engine.cpp +++ b/src/engine.cpp @@ -687,4 +687,9 @@ void Engine::run(Dataset &data) //When you have tasks that are created at runtime (e.g., subflow, // cudaFlow), you need to execute the graph first to spawn these tasks and dump the entire graph. } -} \ No newline at end of file +} + +template class Brush::Engine; +template class Brush::Engine; +template class Brush::Engine; +template class Brush::Engine; \ No newline at end of file diff --git a/src/engine.h b/src/engine.h index ad082e9a..b73a60fb 100644 --- a/src/engine.h +++ b/src/engine.h @@ -130,7 +130,6 @@ class Engine{ /// train the model void run(Dataset &d); - // TODO: should params and ss be private? (that would require better json handling) Parameters params; ///< hyperparameters of brush, which the user can interact SearchSpace ss; @@ -161,5 +160,10 @@ NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Engine, params, best_in NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Engine, params, best_ind, archive, pop, ss, is_fitted); NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Engine, params, best_ind, archive, pop, ss, is_fitted); +extern template class Engine; +extern template class Engine; +extern template class Engine; +extern template class Engine; + } // Brush #endif diff --git a/src/eval/evaluation.cpp b/src/eval/evaluation.cpp index d5a366a0..0f244220 100644 --- a/src/eval/evaluation.cpp +++ b/src/eval/evaluation.cpp @@ -121,4 +121,9 @@ void Evaluation::assign_fit(Individual& ind, const Dataset& data, } } // Pop -} // Brush \ No newline at end of file +} // Brush + +template class Brush::Eval::Evaluation; +template class Brush::Eval::Evaluation; +template class Brush::Eval::Evaluation; +template class Brush::Eval::Evaluation; \ No newline at end of file diff --git a/src/eval/evaluation.h b/src/eval/evaluation.h index b4b1240c..2c988918 100644 --- a/src/eval/evaluation.h +++ b/src/eval/evaluation.h @@ -90,6 +90,11 @@ class Evaluation { // representation program (TODO: implement) }; +extern template class Evaluation; +extern template class Evaluation; +extern template class Evaluation; +extern template class Evaluation; + } //selection } //brush #endif diff --git a/src/eval/metrics.cpp b/src/eval/metrics.cpp index 0b522c7b..16a3f854 100644 --- a/src/eval/metrics.cpp +++ b/src/eval/metrics.cpp @@ -140,7 +140,7 @@ float average_precision_score(const VectorXf& y, const VectorXf& predict_proba, // Assuming y contains binary labels (0 or 1) int num_instances = y.size(); - float eps = 1e-4f; // first we set the loss vector values + float eps = 1e-6f; // first we set the loss vector values loss.resize(num_instances); for (int i = 0; i < num_instances; ++i) { float p = predict_proba(i); @@ -182,7 +182,7 @@ float average_precision_score(const VectorXf& y, const VectorXf& predict_proba, // detect constant prediction case (all p_sorted equal within tolerance). // because p_sorted is sorted, the first element is the maximum, and the last is the minimum, - if (abs(p_sorted.back() - p_sorted.front()) <= eps) { + if (fabs(p_sorted.back() - p_sorted.front()) <= eps) { // All predictions are (effectively) constant. float total_weight = std::accumulate(w_sorted.begin(), w_sorted.end(), 0.0f); @@ -193,7 +193,7 @@ float average_precision_score(const VectorXf& y, const VectorXf& predict_proba, // Find the indexes where prediction changes, so we can treat it as one block vector unique_indices = {}; // this one will be used to calculate the AUC - set unique_probas = {}; // keep track of unique elements (this wont be used other than that) + set unique_probas = {}; // keep track of unique elements (this wont be used other than that) for (int i=0; i class Individual{ -public: // TODO: make these private (and work with nlohman json) +public: Program program; ///< executable data structure // store just info that we dont have a getter. size, depth, complexity: they can all be obtained with program. @@ -64,7 +64,6 @@ class Individual{ variation = "born"; }; - // TODO: replace occurences of program.fit with these (also predict and predict_proba) Individual &fit(const Dataset& data) { program.fit(data); this->is_fitted_ = true; @@ -153,7 +152,6 @@ class Individual{ }; /// set parent ids using parents void set_parents(const vector& parents){ parent_id = parents; }; /// set parent ids using id values - // TODO: USE setters and getters intead of accessing it directly // template // void Individual::set_objectives(const vector& objectives) @@ -182,7 +180,6 @@ class Individual{ vector weights; weights.resize(0); for (const auto& obj : objectives) { - // TODO: do i need to use find or this can be done directly? auto it = weightsMap.find(obj); if (it != weightsMap.end()) { weights.push_back(it->second); @@ -215,7 +212,7 @@ void to_json(json &j, const Individual &p) template void from_json(const json &j, Individual& p) -{// TODO: figure out if this works with private attributes and try to actually make them private (and use getters and setters) +{ j.at("program").get_to( p.program ); j.at("fitness").get_to( p.fitness ); j.at("id").get_to( p.id ); diff --git a/src/params.h b/src/params.h index 83ddbe8b..5b7f71c7 100644 --- a/src/params.h +++ b/src/params.h @@ -38,7 +38,7 @@ struct Parameters unsigned int max_size = 50; vector objectives{"scorer","linear_complexity"}; // scorer should be generic and deducted based on mode - string bandit = "thompson"; // TODO: should I rename dummy? + string bandit = "thompson"; string sel = "lexicase"; //selection method string surv = "nsga2"; //survival method std::unordered_map functions; diff --git a/src/pop/archive.cpp b/src/pop/archive.cpp index 3afafa54..4de21745 100644 --- a/src/pop/archive.cpp +++ b/src/pop/archive.cpp @@ -51,26 +51,14 @@ bool Archive::sortObj1(const Individual& lhs, } template -bool Archive::sameFitComplexity(const Individual& lhs, +bool Archive::sameObjectives(const Individual& lhs, const Individual& rhs) { - // TODO: delete this one - return (lhs.fitness == rhs.fitness); // fitness' operator== is overloaded to compare wvalues. // we also check complexity equality to avoid the case where the user // did not specified complexity as one of the objectives - // return (lhs.fitness == rhs.fitness - // && lhs.fitness.complexity == rhs.fitness.complexity); -} - -// TODO: i could get rid of one of these -template -bool Archive::sameObjectives(const Individual& lhs, - const Individual& rhs) -{ - return (lhs.fitness == rhs.fitness); } template @@ -157,7 +145,6 @@ void Archive::update(Population& pop, const Parameters& params) std::stable_sort(individuals.begin(), individuals.end(), &sortObj1); } - /* auto it = std::unique(individuals.begin(),individuals.end(), &sameFitComplexity); */ auto it = std::unique(individuals.begin(),individuals.end(), &sameObjectives); @@ -165,4 +152,9 @@ void Archive::update(Population& pop, const Parameters& params) } } // Pop -} // Brush \ No newline at end of file +} // Brush + +template struct Brush::Pop::Archive; +template struct Brush::Pop::Archive; +template struct Brush::Pop::Archive; +template struct Brush::Pop::Archive; \ No newline at end of file diff --git a/src/pop/archive.h b/src/pop/archive.h index 44c4d5fa..18a45081 100644 --- a/src/pop/archive.h +++ b/src/pop/archive.h @@ -81,17 +81,6 @@ struct Archive */ static bool sortObj1(const Individual& lhs, const Individual& rhs); - /** - * @brief Checks if two individuals have the same fitness complexity. - * - * This static function is used to check if two individuals have the same fitness complexity. - * It is used as a comparison function for finding duplicates in the population. - * - * @param lhs The left-hand side individual to compare. - * @param rhs The right-hand side individual to compare. - */ - static bool sameFitComplexity(const Individual& lhs, const Individual& rhs); - /** * @brief Checks if two individuals have the same objectives. * @@ -104,6 +93,11 @@ struct Archive static bool sameObjectives(const Individual& lhs, const Individual& rhs); }; +extern template struct Archive; +extern template struct Archive; +extern template struct Archive; +extern template struct Archive; + //serialization NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Archive, individuals, sort_complexity, linear_complexity); NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Archive, individuals, sort_complexity, linear_complexity); diff --git a/src/pop/population.cpp b/src/pop/population.cpp index 65137163..9051daf7 100644 --- a/src/pop/population.cpp +++ b/src/pop/population.cpp @@ -212,7 +212,7 @@ void Population::update(vector> survivors) template string Population::print_models(string sep) { - // TODO: rename it. This function does not print anything, just returns a string + // TODO: rename it. This function returns a string; it does not print. string output = ""; for (int j=0; j::migrate() } // Pop } // Brush + +template class Brush::Pop::Population; +template class Brush::Pop::Population; +template class Brush::Pop::Population; +template class Brush::Pop::Population; diff --git a/src/pop/population.h b/src/pop/population.h index 6cf8b39c..adbbbd80 100644 --- a/src/pop/population.h +++ b/src/pop/population.h @@ -109,6 +109,11 @@ class Population{ }; }; +extern template class Population; +extern template class Population; +extern template class Population; +extern template class Population; + NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Population, individuals, island_indexes, pop_size, num_islands, mig_prob, linear_complexity); NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(Population, diff --git a/src/program/node.cpp b/src/program/node.cpp index b690b476..99c94966 100644 --- a/src/program/node.cpp +++ b/src/program/node.cpp @@ -387,9 +387,6 @@ void from_json(const json &j, Node& p) { j.at("W").get_to(p.W); } - - json new_json = p; } - } diff --git a/src/program/operator.h b/src/program/operator.h index 3aef94bc..e84d5843 100644 --- a/src/program/operator.h +++ b/src/program/operator.h @@ -29,53 +29,33 @@ namespace util{ } } - try //TODO: remove this try catch after debugging it - { - w = Scalar(tn.data.W); - } - catch (const std::exception& e) { - std::string err_msg = "Null pointer dereference: *weights is nullptr. " - "TreeNode ret_type: " + std::to_string(static_cast(tn.data.ret_type)) + - ", name: " + tn.data.name; - std::cerr << "[EXCEPTION] get_weight: caught std::exception: " << e.what() << err_msg << std::endl; - throw; // Re-throw to allow crash - } + w = Scalar(tn.data.W); } else { - try //TODO: remove this try catch after debugging it - { - if (*weights == nullptr) { - std::string err_msg = "Null pointer dereference: *weights is nullptr. " - "TreeNode ret_type: " + std::to_string(static_cast(tn.data.ret_type)) + - ", name: " + tn.data.name; - HANDLE_ERROR_THROW("Null pointer dereference: *weights is nullptr. " + err_msg); - } - - // NLS case 1: floating point weight is stored in weights - if constexpr (is_same_v) - w = **weights; + if (*weights == nullptr) { + std::string err_msg = "Null pointer dereference: *weights is nullptr. " + "TreeNode ret_type: " + std::to_string(static_cast(tn.data.ret_type)) + + ", name: " + tn.data.name; + HANDLE_ERROR_THROW("Null pointer dereference: *weights is nullptr. " + err_msg); + } - // NLS case 2: a Jet/Dual weight is stored in weights, but this constant is a - // integer type. We need to do some casting - else if constexpr (is_same_v && is_same_v) { - using WScalar = typename Scalar::Scalar; - WScalar tmp = WScalar((**weights).a); - w = Scalar(tmp); - } - // NLS case 3: a Jet/Dual weight is stored in weights, matching Scalar type - else - w = Scalar(**weights); + // NLS case 1: floating point weight is stored in weights + if constexpr (is_same_v) + w = **weights; - *weights = *weights+1; + // NLS case 2: a Jet/Dual weight is stored in weights, but this constant is a + // integer type. We need to do some casting + else if constexpr (is_same_v && is_same_v) { + using WScalar = typename Scalar::Scalar; + WScalar tmp = WScalar((**weights).a); + w = Scalar(tmp); } - catch (const std::exception& e) { - std::string err_msg = "Null pointer dereference: *weights is nullptr. " - "TreeNode ret_type: " + std::to_string(static_cast(tn.data.ret_type)) + - ", name: " + tn.data.name; - std::cerr << "[EXCEPTION] get_weight: caught std::exception: " << e.what() << err_msg << std::endl; - throw; // Re-throw to allow crash - } + // NLS case 3: a Jet/Dual weight is stored in weights, matching Scalar type + else + w = Scalar(**weights); + + *weights = *weights+1; } return w; }; diff --git a/src/program/program.h b/src/program/program.h index 1a2c731c..8986211b 100644 --- a/src/program/program.h +++ b/src/program/program.h @@ -7,7 +7,6 @@ license: GNU/GPL v3 #define PROGRAM_H //external includes -// #include #include "assert.h" @@ -490,7 +489,7 @@ template struct Program */ string get_dot_model(string extras="") const { - // TODO: make the node names their hash or index, and the node label the nodetype name. + // TODO: make node IDs stable (hash or index) and labels reflect nodetype names. // ref: https://stackoverflow.com/questions/10579041/graphviz-create-new-node-with-this-same-label#10579155 string out = "digraph G {\n"; if (! extras.empty()) diff --git a/src/selection/lexicase.cpp b/src/selection/lexicase.cpp index e9698270..37aa7e06 100644 --- a/src/selection/lexicase.cpp +++ b/src/selection/lexicase.cpp @@ -191,3 +191,8 @@ vector Lexicase::survive(Population& pop, int island, } } + +template class Brush::Sel::Lexicase; +template class Brush::Sel::Lexicase; +template class Brush::Sel::Lexicase; +template class Brush::Sel::Lexicase; diff --git a/src/selection/lexicase.h b/src/selection/lexicase.h index 9613bfcb..1b986508 100644 --- a/src/selection/lexicase.h +++ b/src/selection/lexicase.h @@ -31,8 +31,18 @@ class Lexicase : public SelectionOperator /// lexicase survival vector survive(Population& pop, int island, const Parameters& p); + + void set_lexicase_pool(vector s) { this->lexicase_pool = s; } + +private: + vector lexicase_pool; }; +extern template class Lexicase; +extern template class Lexicase; +extern template class Lexicase; +extern template class Lexicase; + } // Sel } // Brush #endif \ No newline at end of file diff --git a/src/selection/nsga2.cpp b/src/selection/nsga2.cpp index 5c1b7fde..810540d1 100644 --- a/src/selection/nsga2.cpp +++ b/src/selection/nsga2.cpp @@ -291,4 +291,9 @@ void NSGA2::crowding_distance(Population& pop, vector>& front, } } // selection -} // Brush \ No newline at end of file +} // Brush + +template class Brush::Sel::NSGA2; +template class Brush::Sel::NSGA2; +template class Brush::Sel::NSGA2; +template class Brush::Sel::NSGA2; \ No newline at end of file diff --git a/src/selection/nsga2.h b/src/selection/nsga2.h index f883d832..73496031 100644 --- a/src/selection/nsga2.h +++ b/src/selection/nsga2.h @@ -77,6 +77,11 @@ class NSGA2 : public SelectionOperator size_t tournament(Population& pop, size_t i, size_t j) const; }; + extern template class NSGA2; + extern template class NSGA2; + extern template class NSGA2; + extern template class NSGA2; + } // selection } // Brush #endif \ No newline at end of file diff --git a/src/selection/selection.cpp b/src/selection/selection.cpp index f1097d02..f16f37d4 100644 --- a/src/selection/selection.cpp +++ b/src/selection/selection.cpp @@ -30,9 +30,9 @@ template void Selection::set_operator() { if (this->type == "nsga2") - pselector = new NSGA2(survival); + pselector = std::make_shared>(survival); else if (this->type == "lexicase") - pselector = new Lexicase(survival); + pselector = std::make_shared>(survival); else HANDLE_ERROR_THROW("Undefined Selection Operator " + this->type + "\n"); @@ -64,3 +64,8 @@ vector Selection::survive(Population& pop, int island, } // Sel } // Brush + +template struct Brush::Sel::Selection; +template struct Brush::Sel::Selection; +template struct Brush::Sel::Selection; +template struct Brush::Sel::Selection; diff --git a/src/selection/selection.h b/src/selection/selection.h index 2ab6c344..865507c1 100644 --- a/src/selection/selection.h +++ b/src/selection/selection.h @@ -24,7 +24,7 @@ template struct Selection { public: - SelectionOperator* pselector; // TODO: THIS SHOULD BE A SHARED POINTER + std::shared_ptr> pselector; string type; bool survival; @@ -47,6 +47,11 @@ struct Selection const Parameters& params); }; +extern template struct Selection; +extern template struct Selection; +extern template struct Selection; +extern template struct Selection; + } // Sel } // Brush #endif \ No newline at end of file diff --git a/src/selection/selection_operator.cpp b/src/selection/selection_operator.cpp index f4268b55..93a99e5a 100644 --- a/src/selection/selection_operator.cpp +++ b/src/selection/selection_operator.cpp @@ -26,4 +26,9 @@ vector SelectionOperator::survive(Population& pop, int island, }; } // selection -} // Brush \ No newline at end of file +} // Brush + +template class Brush::Sel::SelectionOperator; +template class Brush::Sel::SelectionOperator; +template class Brush::Sel::SelectionOperator; +template class Brush::Sel::SelectionOperator; \ No newline at end of file diff --git a/src/selection/selection_operator.h b/src/selection/selection_operator.h index da56eddb..7446a076 100644 --- a/src/selection/selection_operator.h +++ b/src/selection/selection_operator.h @@ -59,6 +59,11 @@ class SelectionOperator virtual vector survive(Population& pop, int island, const Parameters& p); }; +extern template class SelectionOperator; +extern template class SelectionOperator; +extern template class SelectionOperator; +extern template class SelectionOperator; + } // selection } // Brush #endif diff --git a/src/simplification/constants.h b/src/simplification/constants.h index 8db1c676..845e3b66 100644 --- a/src/simplification/constants.h +++ b/src/simplification/constants.h @@ -59,7 +59,7 @@ namespace Brush { namespace Simpl{ HANDLE_ERROR_THROW("No predict available for the class."); } - if (variance(branch_pred) < 1e-5) // TODO: calculate threshold based on data + if (variance(branch_pred) < 1e-6) { // get constant equivalent to its argtype (all data types should have // a constant defined in the search space for its given type). It will be diff --git a/src/vary/search_space.cpp b/src/vary/search_space.cpp index 7bc19273..85c8ce42 100644 --- a/src/vary/search_space.cpp +++ b/src/vary/search_space.cpp @@ -1,5 +1,5 @@ #include "search_space.h" -#include "../program/program.h" // TODO: dont import this header here +#include "../program/program.h" namespace Brush{ @@ -297,11 +297,6 @@ std::optional> SearchSpace::sample_subtree(Node root, int max_d, int return std::nullopt; // it will always have a terminal (because we create constants). - // TODO: I guess I can remove this line below and it will still work - // if ( (terminal_map.find(root.ret_type) == terminal_map.end()) - // || (!has_solution_space(terminal_weights.at(root.ret_type).begin(), - // terminal_weights.at(root.ret_type).end())) ) - // return std::nullopt; auto Tree = tree(); auto spot = Tree.insert(Tree.begin(), root); diff --git a/src/vary/variation.cpp b/src/vary/variation.cpp index 5cd2a8d5..f36b21aa 100644 --- a/src/vary/variation.cpp +++ b/src/vary/variation.cpp @@ -8,6 +8,60 @@ using namespace Brush; using namespace Pop; using namespace MAB; +namespace { +enum class MutationType { + Point, + Insert, + Delete, + Subtree, + ToggleWeightOn, + ToggleWeightOff, + Crossover, + Unknown +}; + +MutationType mutation_type_from_string(const std::string& choice) { + if (choice == "point") + return MutationType::Point; + if (choice == "insert") + return MutationType::Insert; + if (choice == "delete") + return MutationType::Delete; + if (choice == "subtree") + return MutationType::Subtree; + if (choice == "toggle_weight_on") + return MutationType::ToggleWeightOn; + if (choice == "toggle_weight_off") + return MutationType::ToggleWeightOff; + if (choice == "cx") + return MutationType::Crossover; + return MutationType::Unknown; +} + +const char* mutation_type_to_string(MutationType choice) { + switch (choice) { + case MutationType::Point: + return "point"; + case MutationType::Insert: + return "insert"; + case MutationType::Delete: + return "delete"; + case MutationType::Subtree: + return "subtree"; + case MutationType::ToggleWeightOn: + return "toggle_weight_on"; + case MutationType::ToggleWeightOff: + return "toggle_weight_off"; + case MutationType::Crossover: + return "cx"; + case MutationType::Unknown: + return "unknown"; + } + + return "unknown"; +} +} // namespace + /// @brief replace node with same typed node /// @param prog the program /// @param Tree the program tree @@ -301,7 +355,7 @@ class ToggleWeightOffMutation : public MutationBase static auto mutate(Program& program, Iter spot, Variation& variator, const Parameters& params) { - if (spot.node->data.get_is_weighted()==false) // TODO: This condition should never happen. Make sure it dont, then remove it. (this is also true for toggleweighton, also fix that) + if (spot.node->data.get_is_weighted()==false) // TODO: This condition should never happen. Verified by find_spots; keep guard for safety. (this is also true for toggleweighton, also fix that) return false; spot.node->data.set_is_weighted(false); @@ -402,7 +456,7 @@ class SubtreeMutation : public MutationBase }; /** - * @brief Stochastically swaps subtrees between root and other, returning a new program. + * @brief Stochastically swaps subtrees between parents, returning a new individual. * * The spot where the cross will take place in the `root` parent is sampled * based on attribute `get_prob_change` of each node in the tree. After selecting @@ -413,16 +467,16 @@ class SubtreeMutation : public MutationBase * candidate to replace the spot node. In this case, the method returns * `std::nullopt` (and has overloads so it can be used in a boolean context). * - * If the cross succeeds, the child program can be accessed through the + * If the cross succeeds, the child individual can be accessed through the * `.value()` attribute of the `std::optional`. * TODO: update this documentation (it doesnt take the program but the individual. also update mutation documentation) - * This means that, if you use the cross as `auto opt = mutate(parent, SS)`, + * This means that, if you use the cross as `auto opt = cross(mom, dad)`, * either `opt==false` or `opt.value()` contains the child. * * @tparam T the program type - * @param root the root parent - * @param other the donating parent - * @return `std::optional` that may contain the child program of type `T` + * @param mom the first parent + * @param dad the donating parent + * @return `std::optional` that may contain the child individual of type `T` */ template std::optional> Variation::cross( @@ -462,7 +516,7 @@ std::optional> Variation::cross( { // There is no spot that has a probability to be selected return std::nullopt; } - + // pick a subtree to insert. Selection is based on other_weights Program other(dad.program); @@ -529,7 +583,7 @@ std::optional> Variation::cross( child.Tree.move_ontop(child_spot, other_spot); Individual ind(child); - ind.set_variation("cx"); // TODO: use enum here to make it faster + ind.set_variation(mutation_type_to_string(MutationType::Crossover)); return ind; } @@ -601,25 +655,39 @@ std::optional> Variation::mutate( // picking a valid mutation option choice = r.random_choice(parameters.mutation_probs); } + + const auto mutation_choice = mutation_type_from_string(choice); + if (mutation_choice == MutationType::Unknown) { + std::string msg = fmt::format("{} not a valid mutation choice", choice); + HANDLE_ERROR_THROW(msg); + } Program copy(parent.program); vector weights; // choose location by weighted sampling of program - if (choice.compare("point") == 0) // TODO: use enum here to optimize - weights = PointMutation::find_spots(copy, (*this), parameters); - else if (choice.compare("insert") == 0) - weights = InsertMutation::find_spots(copy, (*this), parameters); - else if (choice.compare("delete") == 0) - weights = DeleteMutation::find_spots(copy, (*this), parameters); - else if (choice.compare("subtree") == 0) - weights = SubtreeMutation::find_spots(copy, (*this), parameters); - else if (choice.compare("toggle_weight_on") == 0) - weights = ToggleWeightOnMutation::find_spots(copy, (*this), parameters); - else if (choice.compare("toggle_weight_off") == 0) - weights = ToggleWeightOffMutation::find_spots(copy, (*this), parameters); - else { - std::string msg = fmt::format("{} not a valid mutation choice", choice); - HANDLE_ERROR_THROW(msg); + switch (mutation_choice) { + case MutationType::Point: + weights = PointMutation::find_spots(copy, (*this), parameters); + break; + case MutationType::Insert: + weights = InsertMutation::find_spots(copy, (*this), parameters); + break; + case MutationType::Delete: + weights = DeleteMutation::find_spots(copy, (*this), parameters); + break; + case MutationType::Subtree: + weights = SubtreeMutation::find_spots(copy, (*this), parameters); + break; + case MutationType::ToggleWeightOn: + weights = ToggleWeightOnMutation::find_spots(copy, (*this), parameters); + break; + case MutationType::ToggleWeightOff: + weights = ToggleWeightOffMutation::find_spots(copy, (*this), parameters); + break; + case MutationType::Crossover: + case MutationType::Unknown: + HANDLE_ERROR_THROW("Crossover is not a valid mutation choice\n"); + break; } if (std::all_of(weights.begin(), weights.end(), [](const auto& w) { @@ -643,18 +711,30 @@ std::optional> Variation::mutate( // program tree. Here we call the mutation function and return the result bool success; - if (choice.compare("point") == 0) - success = PointMutation::mutate(child, spot, (*this), parameters); - else if (choice.compare("insert") == 0) - success = InsertMutation::mutate(child, spot, (*this), parameters); - else if (choice.compare("delete") == 0) - success = DeleteMutation::mutate(child, spot, (*this), parameters); - else if (choice.compare("subtree") == 0) - success = SubtreeMutation::mutate(child, spot, (*this), parameters); - else if (choice.compare("toggle_weight_on") == 0) - success = ToggleWeightOnMutation::mutate(child, spot, (*this), parameters); - else // it must be"toggle_weight_off" - success = ToggleWeightOffMutation::mutate(child, spot, (*this), parameters); + switch (mutation_choice) { + case MutationType::Point: + success = PointMutation::mutate(child, spot, (*this), parameters); + break; + case MutationType::Insert: + success = InsertMutation::mutate(child, spot, (*this), parameters); + break; + case MutationType::Delete: + success = DeleteMutation::mutate(child, spot, (*this), parameters); + break; + case MutationType::Subtree: + success = SubtreeMutation::mutate(child, spot, (*this), parameters); + break; + case MutationType::ToggleWeightOn: + success = ToggleWeightOnMutation::mutate(child, spot, (*this), parameters); + break; + case MutationType::ToggleWeightOff: + success = ToggleWeightOffMutation::mutate(child, spot, (*this), parameters); + break; + case MutationType::Crossover: + case MutationType::Unknown: + success = false; + break; + } if (// strict mutation --- returns only valid solutions. ( success @@ -679,7 +759,6 @@ std::optional> Variation::mutate( if (choice.compare("point") == 0 || choice.compare("insert") == 0 || choice.compare("delete") == 0 - // || choice.compare("subtree") == 0 // TODO: disable this one ) { ind.set_sampled_nodes({spot.node->data}); } @@ -783,9 +862,6 @@ template void Variation::update_ss() { // propagate bandits learnt information to the search space. - // TODO: not all arms are initialized, if the user set something to zero then we must - // disable it. So, during update, we need to properly handle these skipped arms. --> remove this for nodes, allow it just for variations. If the user doesnt want to use a feature or op, he should not set it at the first place. We need to do this with variations because the user - // can choose it directly instead of letting brush to figure out. // variation: getting new probabilities for variation operators auto variation_probs = variation_bandit.sample_probs(true); @@ -810,12 +886,14 @@ void Variation::update_ss() search_space.terminal_map.at(datatype).end(), [&](auto& node) { return node.get_feature() == terminal_name; }); - // if (it != search_space.terminal_map.at(datatype).end()) { - auto index = std::distance(search_space.terminal_map.at(datatype).begin(), it); + if (it == search_space.terminal_map.at(datatype).end()) { + continue; + } + + auto index = std::distance(search_space.terminal_map.at(datatype).begin(), it); - // Update the terminal weights with the second value - search_space.terminal_weights.at(datatype)[index] = terminal_prob; - // } + // Update the terminal weights with the second value + search_space.terminal_weights.at(datatype)[index] = terminal_prob; } } @@ -825,14 +903,19 @@ void Variation::update_ss() auto op_probs = bandit.sample_probs(true); for (auto& [op_name, op_prob] : op_probs) { - + bool updated = false; for (const auto& [node_type, node_value]: search_space.node_map.at(ret_type).at(args_type)) { if (node_value.name == op_name) { search_space.node_map_weights.at(ret_type).at(args_type).at(node_type) = op_prob; + updated = true; + break; } } + if (!updated) { + continue; + } } } } @@ -840,3 +923,8 @@ void Variation::update_ss() } //namespace Var } //namespace Brush + +template class Brush::Var::Variation; +template class Brush::Var::Variation; +template class Brush::Var::Variation; +template class Brush::Var::Variation; diff --git a/src/vary/variation.h b/src/vary/variation.h index d7111a39..02a2b06f 100644 --- a/src/vary/variation.h +++ b/src/vary/variation.h @@ -346,7 +346,6 @@ class Variation { ind.program.fit(data.get_training_data()); // simplify before calculating fitness (order matters, as they are not refitted and constants simplifier does not replace with the right value.) - // TODO: constants_simplifier should set the correct value for the constant (so we dont have to refit). // simplify constants first to avoid letting the lsh simplifier to visit redundant branches if (parameters.constants_simplification && do_simplification) @@ -633,12 +632,6 @@ class Variation { Inexact_simplifier inexact_simplifier; }; -// // Explicitly instantiate the template for brush program types -// template class Variation; -// template class Variation; -// template class Variation; -// template class Variation; - class MutationBase { public: using Iter = tree::pre_order_iterator; @@ -662,6 +655,11 @@ class MutationBase { const Parameters& params); }; +extern template class Variation; +extern template class Variation; +extern template class Variation; +extern template class Variation; + } //namespace Var } //namespace Brush #endif \ No newline at end of file diff --git a/tests/cpp/test_brush.cpp b/tests/cpp/test_brush.cpp index e100f84a..765f1b65 100644 --- a/tests/cpp/test_brush.cpp +++ b/tests/cpp/test_brush.cpp @@ -6,7 +6,6 @@ // #include "../../src/program/dispatch_table.h" #include "../../src/data/io.h" #include "../../src/engine.h" -#include "../../src/engine.cpp" #include "../../src/selection/selection.h" #include "../../src/selection/selection_operator.h" #include "../../src/selection/nsga2.h" @@ -17,27 +16,11 @@ #include "../../src/simplification/constants.h" #include "../../src/simplification/inexact.h" -// TODO: omg i need to figure out why my code only works if i import basically the whole stuff. It seems to be related to templating -#include "../../src/selection/selection.cpp" -#include "../../src/selection/selection_operator.cpp" -#include "../../src/selection/nsga2.cpp" -#include "../../src/selection/lexicase.cpp" -#include "../../src/eval/evaluation.cpp" -#include "../../src/pop/archive.cpp" -#include "../../src/pop/population.cpp" -// #include "../../src/bandit/bandit.cpp" -// #include "../../src/bandit/bandit_operator.cpp" -// #include "../../src/bandit/dummy.cpp" -// #include "../../src/bandit/thompson.cpp" -#include "../../src/simplification/constants.cpp" -#include "../../src/simplification/inexact.cpp" - // TODO: test predict from archive // TODO: rename it to test_engine - // TODO: test serialization of archive (get archive and save to json) - // TODO: test logger, verbose, print stats, etc. + TEST(Engine, EngineWorks) { MatrixXf X(10,2); diff --git a/tests/cpp/test_data.cpp b/tests/cpp/test_data.cpp index 2ae1a449..f5f154eb 100644 --- a/tests/cpp/test_data.cpp +++ b/tests/cpp/test_data.cpp @@ -154,8 +154,6 @@ TEST(Data, ShuffleTrueFalse) 2 , 1 , 3 , 2.1, 3.7, -5.2; - X.transposeInPlace(); - ArrayXf y(20); y << 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, @@ -183,4 +181,24 @@ TEST(Data, ShuffleTrueFalse) Dataset dt6(X, y, {}, {}, {}, true, 1.0, 1.0, false); // TODO: write some assertions here + const int total = dt1.get_n_samples(); + + ASSERT_TRUE(dt1.use_validation); + ASSERT_TRUE(dt2.use_validation); + ASSERT_EQ(dt1.get_training_data().get_n_samples() + dt1.get_validation_data().get_n_samples(), total); + ASSERT_EQ(dt2.get_training_data().get_n_samples() + dt2.get_validation_data().get_n_samples(), total); + + ASSERT_FALSE(dt3.use_validation); + ASSERT_FALSE(dt4.use_validation); + ASSERT_EQ(dt3.get_training_data().get_n_samples(), total); + ASSERT_EQ(dt3.get_validation_data().get_n_samples(), total); + ASSERT_EQ(dt4.get_training_data().get_n_samples(), total); + ASSERT_EQ(dt4.get_validation_data().get_n_samples(), total); + + ASSERT_FALSE(dt5.use_validation); + ASSERT_FALSE(dt6.use_validation); + ASSERT_EQ(dt5.get_training_data().get_n_samples(), total); + ASSERT_EQ(dt5.get_validation_data().get_n_samples(), total); + ASSERT_EQ(dt6.get_training_data().get_n_samples(), total); + ASSERT_EQ(dt6.get_validation_data().get_n_samples(), total); } diff --git a/tests/cpp/test_evaluation.cpp b/tests/cpp/test_evaluation.cpp index 5b3d4fd2..9b40a462 100644 --- a/tests/cpp/test_evaluation.cpp +++ b/tests/cpp/test_evaluation.cpp @@ -54,6 +54,36 @@ TEST(Evaluation, accuracy) ASSERT_EQ(((int)(score*10000)), 3999); } +TEST(Evaluation, ScorerRegressionMSE) +{ + VectorXf y(3), yhat(3), loss_expected(3), loss(3); + y << 1.0, 2.0, 3.0; + yhat << 1.0, 4.0, 2.0; + + float expected = mse(y, yhat, loss_expected); + + Scorer scorer("mse"); + float actual = scorer.score(y, yhat, loss, {}); + + ASSERT_NEAR(actual, expected, 1e-6); + ASSERT_TRUE(loss.isApprox(loss_expected, 1e-6)); +} + +TEST(Evaluation, ScorerBinaryAccuracy) +{ + VectorXf y(4), yhat(4), loss_expected(4), loss(4); + y << 0.0, 1.0, 1.0, 0.0; + yhat << 0.1, 0.9, 0.2, 0.8; + + float expected = zero_one_loss(y, yhat, loss_expected); + + Scorer scorer("accuracy"); + float actual = scorer.score(y, yhat, loss, {}); + + ASSERT_NEAR(actual, expected, 1e-6); + ASSERT_TRUE(loss.isApprox(loss_expected, 1e-6)); +} + // TEST(EvaluationTest, UpdateFitnessTest) { // // TODO: Add test case for update_fitness function diff --git a/tests/cpp/test_individuals.cpp b/tests/cpp/test_individuals.cpp index 5b3e5df6..c00dcb44 100644 --- a/tests/cpp/test_individuals.cpp +++ b/tests/cpp/test_individuals.cpp @@ -1,3 +1,82 @@ // TODO: test predict, predict proba, fit. -// TODO: test parent_id and id \ No newline at end of file +// TODO: test parent_id and id + +#include "testsHeader.h" + +using namespace Brush; +using namespace Brush::Pop; + +TEST(Individual, FitAndPredictRegression) +{ + MatrixXf X(4, 2); + ArrayXf y(4); + + X << 1.0, 2.0, + 2.0, 1.0, + 3.0, 0.5, + 4.0, 1.5; + y << 3.0, 3.0, 3.5, 5.5; + + Dataset data(X, y); + SearchSpace ss(data); + + // We must have a SearchSpace reference, so the operator ret-type checks dont + // fail even when feature names look right --- node metadata is consistent. + Parameters params; + RegressorProgram prg = ss.make_regressor(0, 0, params); + Individual ind(prg); + + ASSERT_FALSE(ind.get_is_fitted()); + ind.fit(data); + ASSERT_TRUE(ind.get_is_fitted()); + + auto y_pred = ind.predict(data); + ASSERT_EQ(y_pred.size(), y.size()); +} + +TEST(Individual, PredictProbaBinaryClassifier) +{ + MatrixXf X(6, 2); + ArrayXf y(6); + + X << 0.0, 1.0, + 1.0, 0.0, + 0.5, 0.5, + 0.2, 0.8, + 0.8, 0.2, + 1.0, 1.0; + y << 0.0, 1.0, 0.0, 1.0, 1.0, 0.0; + + Dataset data(X, y, {}, {}, {}, true); + SearchSpace ss(data); + + Parameters params; + params.set_n_classes(data.y); + params.set_sample_weights(data.y); + + ClassifierProgram prg = ss.make_classifier(0, 0, params); + Individual ind(prg); + + ind.fit(data); + auto prob = ind.predict_proba(data); + ASSERT_EQ(prob.size(), y.size()); +} + +TEST(Individual, ParentIdAndId) +{ + Individual p1; + Individual p2; + Individual child; + + p1.set_id(3); + p2.set_id(7); + child.set_id(11); + + child.set_parents(std::vector>{p1, p2}); + + ASSERT_EQ(child.id, 11u); + ASSERT_EQ(child.parent_id.size(), 2u); + ASSERT_EQ(child.parent_id.at(0), 3u); + ASSERT_EQ(child.parent_id.at(1), 7u); +} \ No newline at end of file diff --git a/tests/cpp/test_population.cpp b/tests/cpp/test_population.cpp index 0f5aebf3..8994e927 100644 --- a/tests/cpp/test_population.cpp +++ b/tests/cpp/test_population.cpp @@ -1,12 +1,5 @@ #include "testsHeader.h" -#include "../../src/ind/individual.cpp" -#include "../../src/pop/population.cpp" // TODO: figure out if thats ok to include cpps instead of headers -#include "../../src/eval/evaluation.cpp" -#include "../../src/selection/nsga2.cpp" -#include "../../src/selection/lexicase.cpp" -#include "../../src/selection/selection_operator.cpp" -#include "../../src/selection/selection.cpp" // #include "../../src/bandit/bandit.cpp" // #include "../../src/bandit/bandit_operator.cpp" diff --git a/tests/cpp/testsHeader.h b/tests/cpp/testsHeader.h index bd89ab30..daeda239 100644 --- a/tests/cpp/testsHeader.h +++ b/tests/cpp/testsHeader.h @@ -23,10 +23,6 @@ using std::stoi; using std::to_string; using std::stof; -// this is a compiler-specific hack and a bad practice. TODO: delete it -// #define private public - -// TODO: remove these lots of imports and keep only essential stuff #include #include "../../src/init.h" #include "../../src/params.h" @@ -36,7 +32,6 @@ using std::stof; #include "../../src/program/program.h" #include "../../src/ind/individual.h" #include "../../src/vary/search_space.h" -#include "../../src/params.h" #include "../../src/vary/variation.h" #include "../../src/selection/selection.h" #include "../../src/selection/selection_operator.h" @@ -53,8 +48,6 @@ using std::stof; #include "../../src/simplification/constants.h" #include "../../src/simplification/inexact.h" -// TODO: is this ok? (otherwise I would have to create a test separated file, or move the implementation to the header) -#include "../../src/vary/variation.cpp" using namespace Brush; using namespace Brush::Data; diff --git a/tests/python/test_params.py b/tests/python/test_params.py index ee59fec4..8eee29e4 100644 --- a/tests/python/test_params.py +++ b/tests/python/test_params.py @@ -266,18 +266,27 @@ def test_fitness_weights_match_scorer_sign(scorer, expected_weights): both at estimator and individual (population) level. """ - # simple toy dataset - X = np.array([[1.2, 2.0], [2.0, 3.5], [3.0, 4.0], [4.0, 5.0]]) - y_reg = np.array([1.0, 2.0, 3.0, 4.0]) - y_clf = np.array([0, 1, 0, 1]) + # Use a larger dataset to avoid tiny-sample metric edge cases. + # Caveat: if using too few samples and validation split, AUPRC will fail + # if there is only one sample in the validation partition, so for small datasets + # you need to make sure that either validation_size is zero, or the ratio that you + # set is enough for having at least two samples there. Also notice that we have + # a stratified logic for validation split, so it should be taken into account. + n_samples = 80 + x0 = np.linspace(0.0, 8.0, n_samples) + x1 = np.linspace(1.0, 9.0, n_samples) + X = np.column_stack((x0, x1)) + y_reg = 1.5 * x0 + 0.5 * x1 + y_clf = (x0 > np.median(x0)).astype(float) + # Choose estimator type based on scorer # (by default objectives are ["scorer", "linear_complexity"]) - if scorer in ("mse"): - est = BrushRegressor(scorer=scorer, pop_size=20, max_gens=10, verbosity=0) + if scorer in ("mse", ): # add more metrics for regression when I implement them + est = BrushRegressor(scorer=scorer, pop_size=20, max_gens=10, verbosity=1) est.fit(X, y_reg) else: - est = BrushClassifier(scorer=scorer, pop_size=20, max_gens=10, verbosity=0) + est = BrushClassifier(scorer=scorer, pop_size=20, max_gens=10, verbosity=1) est.fit(X, y_clf) # Check estimator-level weights