diff --git a/.gitignore b/.gitignore index 9167fd029..7677a4898 100644 --- a/.gitignore +++ b/.gitignore @@ -13,6 +13,12 @@ data/ceeaus data/breast-cancer data/housing data/cranfield +data/Gov +data/MQ2007 +data/MQ2008 +data/OHSUMED biicode.conf bii/ bin/ +cmake-build-debug/* + diff --git a/include/meta/classify/classifier/svm_wrapper.h b/include/meta/classify/classifier/svm_wrapper.h index fe9ab8985..f5e0675ac 100644 --- a/include/meta/classify/classifier/svm_wrapper.h +++ b/include/meta/classify/classifier/svm_wrapper.h @@ -73,6 +73,17 @@ class svm_wrapper : public classifier svm_wrapper(dataset_view_type docs, const std::string& svm_path, kernel kernel_opt = kernel::None); + /** + * Constructor. Should only be used as RankSVM. + * @param svm_path The path to the liblinear/libsvm library + * @param kernel_opt Which kind of kernel you want to use (default: + * None) + * This constructor assumes that caller has written training documents + * into svm-train file + */ + svm_wrapper(const std::string& svm_path, + kernel kernel_opt = kernel::None); + /** * Loads a svm_wrapper from a stream. * @param in The stream to read from @@ -81,6 +92,12 @@ class svm_wrapper : public classifier void save(std::ostream& out) const override; + /** + * Save weights as RankSVM to a stream. Should only be used as RankSVM. + * @param out + */ + void save_weights(std::ostream& out) const; + /** * Classifies a document into a specific group, as determined by * training data. @@ -89,6 +106,14 @@ class svm_wrapper : public classifier */ class_label classify(const feature_vector& doc) const override; + /** + * Compute score of given document by dot product with weights + * learned by this SVM (should only be used as RankSVM). + * @param doc The document to compute score + * @return score of this document + */ + double computeScore(feature_vector& doc); + /** * Classifies a collection document into specific groups, as determined * by training data. @@ -121,6 +146,18 @@ class svm_wrapper : public classifier /** the list of class_labels (mainly for serializing the model) */ std::vector labels_; + + /** weights learned by this SVM */ + std::vector weights_; + + /** + * Load weights from train model file written by this SVM. Should + * only be used as RankSVM. + * + * @param + * @return + */ + void load_weights(); }; class svm_wrapper_exception : public std::runtime_error diff --git a/include/meta/learn/learntorank/pairwise_letor.h b/include/meta/learn/learntorank/pairwise_letor.h new file mode 100644 index 000000000..b2bc1ad55 --- /dev/null +++ b/include/meta/learn/learntorank/pairwise_letor.h @@ -0,0 +1,204 @@ +/** + * @file pairwise_letor.h + * @author Mihika Dave, Anthony Huang, Rachneet Kaur + * @date 12/18/17 + */ + +#ifndef META_PAIRWISE_LETOR_H +#define META_PAIRWISE_LETOR_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "meta/learn/loss/all.h" +#include "meta/learn/loss/hinge.h" +#include "meta/learn/loss/loss_function.h" +#include "meta/learn/loss/loss_function_factory.h" +#include "meta/learn/sgd.h" +#include "meta/learn/instance.h" +#include "meta/learn/dataset.h" +#include "meta/classify/classifier/svm_wrapper.h" +#include "meta/classify/classifier/classifier.h" + + +using namespace std; +using namespace meta::util; +using namespace meta::classify; + +namespace meta +{ +namespace learn +{ +namespace learntorank +{ +/** + * This class implements pairwise learning to rank with binary classifiers. + * The ranker here mainly follows the Stochastic Pairwise Descent algorithm + * based on D. Sculley's paper on 'Large Scale Learning to Rank'. + * + * @see https://static.googleusercontent.com/media/research.google.com/en// + * pubs/archive/35662.pdf + */ +class pairwise_letor { + public: + using tupl = std::tuple; + + enum DATA_TYPE { + TRAINING, + VALIDATION, + TESTING + }; + + enum CLASSIFY_TYPE { + LIBSVM, + SPD, + }; + + typedef struct forwardnode { + operator int() const { + return label; + } + operator feature_vector() const { + return fv; + } + int label; + feature_vector fv; + } forward_node; + + public: + + /** + * Constructor. + * @param num_features The number of features for the pairwise model + * @param classify_type The type of classifier to use + * @param hasModel If the sgd/svm model is loaded from file + * @param model_file The path to model file + */ + pairwise_letor(size_t num_features, CLASSIFY_TYPE classify_type, + bool hasModel, string model_file); + + ~pairwise_letor(); + + /** + * Train the pairwise ranker model + * @param data_dir Path to directory containing train.txt + */ + void train(string data_dir); + + /** + * Train the svm with a pair of data samples + * @param data_dir Path to directory containing train.txt + * @param svm_path The path to the liblinear/libsvm library + */ + void train_svm(string data_dir, string svm_path); + + /** + * Validate the learnt model + * @param data_dir The path to the directory containing vali.txt + */ + void validate(string data_dir); + + /** + * Test the model on testing dataset + * @param data_dir The path to the directory containing test.txt + */ + void test(string data_dir); + + private: + + /// number of features for this letor model + size_t num_features_; + + /// type of classifier to use + CLASSIFY_TYPE classify_type_; + + /// sgd_model for training and testing + unique_ptr model_; + + /// binary svm wrapper for training and testing + unique_ptr wrapper_; + + /** + * Read data from the dataset and store it as nested hash-tables + * @param data_type The type of data (train, vali, or test) + * @param data_dir Path to directory containing train/vali/test.txt + * @param qids Vector to store ids of queries + * @param dataset Map to store nested data mapping: query => label => doc + * @param docids Map to store docids in each query and label + * @param relevance_map Map to store relevance of docs in each query + */ + void read_data(DATA_TYPE data_type, + string data_dir, + vector& qids, + unordered_map>>& dataset, + unordered_map>>& docids, + unordered_map>& relevance_map); + + /** + * Return a random pair of tuple for training the svm classifier + * Tuple is of type (feature_vec, label, qid) + * @param training_qids Vector holding ids of all queries + * @param train_dataset Map holding nested data mapping: query => label => doc + * @param random_seed The random seed used to randomly choose data + * @return the random pair + */ + pair getRandomPair( + vector& training_qids, + unordered_map>>& train_dataset, + int random_seed); + + /** + * Build nodes from dataset for training svm_wrapper + * @param train_dataset Map holding nested data mapping: query => label => doc + * @param dataset_nodes Vector holding data nodes for SVM training + */ + void build_dataset_nodes( + unordered_map>>& train_dataset, + vector& dataset_nodes); + + /** + * Compare the relative rank between the 2 data samples + * @param p1 The first pair to compare + * @param p2 The second pair to compare + * @return whether p1 is ranked before p2 + */ + static bool compare_docscore( + const pair &p1, const pair &p2) { + return p1.second > p2.second; + } + + /** + * Compute the DCG + * @param limit The number of positions to compute DCG + * @param rankings Vector holding ranking at each position + * @return computed DCG + */ + double compute_dcg(int limit, vector &rankings); + + /** + * Evaluate the dataset for precision, mean average precision, NDCG + * @param qids Vector holding id for queries + * @param dataset Map holding nested data mapping: query => label => doc + * @param docids Map holding doc ids for each query and label + * @param relevance_map Map holding relevance of each doc for each query + */ + void evaluate(vector& qids, + unordered_map>>& dataset, + unordered_map>>& docids, + unordered_map>& relevance_map); + +}; + +} +} +} +#endif diff --git a/include/meta/learn/sgd.h b/include/meta/learn/sgd.h index e341eb1b3..261420363 100644 --- a/include/meta/learn/sgd.h +++ b/include/meta/learn/sgd.h @@ -81,6 +81,11 @@ class sgd_model */ void save(std::ostream& out) const; + /** + * Saves the weights of current model in non-compact format. + */ + void save_weights(std::ostream& out) const; + /** * Calibrates the learning rate for the model based on sample data. * Search strategy inspired by Leon Bottou's SGD package. diff --git a/include/meta/meta.h b/include/meta/meta.h index d4860fcd3..5f539deb1 100644 --- a/include/meta/meta.h +++ b/include/meta/meta.h @@ -105,6 +105,12 @@ namespace loss } } +/** + * Learning to rank algorithms + */ +namespace learn2rank +{ +} /** * Algorithms for regression. */ diff --git a/src/classify/classifier/svm_wrapper.cpp b/src/classify/classifier/svm_wrapper.cpp index f4f50ea0e..161aac174 100644 --- a/src/classify/classifier/svm_wrapper.cpp +++ b/src/classify/classifier/svm_wrapper.cpp @@ -4,6 +4,8 @@ */ #include +#include +#include #include "meta/classify/classifier/svm_wrapper.h" #include "meta/io/filesystem.h" @@ -88,17 +90,79 @@ svm_wrapper::svm_wrapper(dataset_view_type docs, const std::string& svm_path, #ifndef _WIN32 std::string command = svm_path_ + executable_ + "train " - + options_.at(kernel_) + " svm-train"; + + options_.at(kernel_) + " svm-train" + " svm-train.model"; command += " > /dev/null 2>&1"; #else // see comment in classify() auto command = "\"\"" + svm_path_ + executable_ + "train.exe\" " - + options_.at(kernel_) + " svm-train"; + + options_.at(kernel_) + " svm-train" + " svm-train.model"; command += " > NUL 2>&1\""; #endif system(command.c_str()); } +svm_wrapper::svm_wrapper(const std::string& svm_path, + kernel kernel_opt /* = None */) + : svm_path_{svm_path}, kernel_{kernel_opt} +{ + + std::string base_path; + std::string exename; + if (kernel_opt == kernel::None) + { + base_path = "liblinear/build/"; + exename = "train"; + } + else + { + base_path = "libsvm/build/"; + exename = "svm-train"; + } + +#ifdef _WIN32 + exename += ".exe"; +#endif + + if (filesystem::file_exists(svm_path_ + base_path + exename)) + { + executable_ = base_path; + } + else if (filesystem::file_exists(svm_path_ + base_path + "Release/" + + exename)) + { + executable_ = base_path + "Release/"; + } + else if (filesystem::file_exists(svm_path_ + base_path + "Debug/" + + exename)) + { + executable_ = base_path + "Debug/"; + } + else + { + throw svm_wrapper_exception{ + "Could not find liblinear/libsvm binaries from root '" + svm_path_ + + "'"}; + } + + if (kernel_opt != kernel::None) + executable_ += "svm-"; + + +#ifndef _WIN32 + std::string command = svm_path_ + executable_ + "train " + + options_.at(kernel_) + " svm-train" + " svm-train.model"; + command += " > /dev/null 2>&1"; +#else + // see comment in classify() + auto command = "\"\"" + svm_path_ + executable_ + "train.exe\" " + + options_.at(kernel_) + " svm-train" + " svm-train.model"; + command += " > NUL 2>&1\""; +#endif + system(command.c_str()); + + load_weights(); +} + svm_wrapper::svm_wrapper(std::istream& in) : svm_path_{io::packed::read(in)} { @@ -110,13 +174,38 @@ svm_wrapper::svm_wrapper(std::istream& in) for (std::size_t i = 0; i < size; ++i) io::packed::read(in, labels_[i]); - std::ofstream out{"svm-train.model"}; - auto model_lines = io::packed::read(in); + { + std::ofstream out{"svm-train.model"}; + auto model_lines = io::packed::read(in); + std::string line; + for (std::size_t i = 0; i < model_lines; ++i) + { + std::getline(in, line); + out << line << "\n"; + } + } + + load_weights(); +} + +void svm_wrapper::load_weights() { + auto num_lines = filesystem::num_lines("svm-train.model"); + std::ifstream in{"svm-train.model"}; std::string line; - for (std::size_t i = 0; i < model_lines; ++i) + std::size_t i = 0; + for (; i < num_lines; ++i) { std::getline(in, line); - out << line << "\n"; + if (line.find("bias") == 0) { + std::getline(in, line); + i += 2; + break; + } + } + double temp_weight; + for (; i < num_lines; ++i) { + in >> temp_weight; + weights_.push_back(temp_weight); } } @@ -143,6 +232,14 @@ void svm_wrapper::save(std::ostream& out) const } } +void svm_wrapper::save_weights(std::ostream& out) const +{ + for (const auto& weight : weights_) + { + out << weight << std::endl; + } +} + class_label svm_wrapper::classify(const feature_vector& doc) const { // create input for liblinear @@ -182,6 +279,19 @@ class_label svm_wrapper::classify(const feature_vector& doc) const return labels_.at(lbl - 1); } +double svm_wrapper::computeScore(feature_vector& doc) { + if (kernel_ != kernel::None) { + return 0.0; + } + + auto score = 0.0; + for (std::size_t i = 0; i < weights_.size(); i++) { + score += weights_[i] * doc[term_id{i}]; + } + + return score; +} + confusion_matrix svm_wrapper::test(multiclass_dataset_view docs) const { // create input for liblinear/libsvm diff --git a/src/learn/CMakeLists.txt b/src/learn/CMakeLists.txt index 7e8b6c9e3..47ce134d6 100644 --- a/src/learn/CMakeLists.txt +++ b/src/learn/CMakeLists.txt @@ -1,8 +1,9 @@ project(meta-learn) add_subdirectory(loss) +add_subdirectory(tools) -add_library(meta-learn sgd.cpp) +add_library(meta-learn sgd.cpp learntorank/pairwise_letor.cpp ../../include/meta/learn/learntorank/pairwise_letor.h) target_link_libraries(meta-learn meta-io meta-loss cpptoml) install(TARGETS meta-learn diff --git a/src/learn/learntorank/pairwise_letor.cpp b/src/learn/learntorank/pairwise_letor.cpp new file mode 100644 index 000000000..646d72c8f --- /dev/null +++ b/src/learn/learntorank/pairwise_letor.cpp @@ -0,0 +1,435 @@ +/** + * @file pairwise_letor.cpp + * @author Mihika Dave, Anthony Huang, Rachneet Kaur + * @date 12/18/17 + */ + +#include + +#include "meta/learn/learntorank/pairwise_letor.h" + +namespace meta +{ +namespace learn +{ +namespace learntorank +{ + +pairwise_letor::pairwise_letor(size_t num_features, CLASSIFY_TYPE classify_type, + bool hasModel, string model_file) + : num_features_{num_features}, classify_type_{classify_type} +{ + if (hasModel) { + ifstream in{model_file}; + if (classify_type == pairwise_letor::SPD) { + model_ = make_unique(in); + } else { + string wrapper_id = io::packed::read(in); + assert(svm_wrapper::id.compare(wrapper_id) == 0); + wrapper_ = make_unique(in); + } + } else { + model_ = make_unique(num_features); + } +} + +pairwise_letor::~pairwise_letor() { + ofstream out_weights{"letor.weights"}; + if (classify_type_ == pairwise_letor::SPD) { + ofstream out{"letor_sgd_train.model"}; + model_->save(out); + model_->save_weights(out_weights); + } else { + ofstream out{"letor_svm_train.model"}; + wrapper_->save(out); + wrapper_->save_weights(out_weights); + } +} + +void pairwise_letor::train(string data_dir) { + auto training_qids = make_unique>(); + auto training_dataset = + make_unique>>>(); + auto docids = + make_unique>>>(); + auto relevance_map = + make_unique>>(); + read_data(TRAINING,data_dir,*training_qids,*training_dataset, + *docids,*relevance_map); + auto n_iter = 100000; + + auto loss = loss::make_loss_function(loss::hinge::id.to_string()); + + for (size_t i = 0; i < n_iter; ++i) { + auto random_seed = i; + auto data_pair + = getRandomPair(*training_qids, *training_dataset, random_seed); + feature_vector a, b; + int y_a, y_b; + string qid; + tie(a, y_a, qid) = data_pair.first; + tie(b, y_b, qid) = data_pair.second; + auto x = a; + x -= b; + auto expected_label = y_a - y_b; + auto los = model_->train_one(x, expected_label, *loss); + } + loss.reset(); +} + +void pairwise_letor::read_data(DATA_TYPE data_type, + string data_dir, + vector& qids, + unordered_map>>& dataset, + unordered_map>>& docids, + unordered_map>& relevance_map) { + auto start = chrono::high_resolution_clock::now(); + + auto data_file = data_dir; + switch (data_type) { + case TRAINING: + data_file += "/train.txt"; + break; + case VALIDATION: + data_file += "/vali.txt"; + break; + case TESTING: + data_file += "/test.txt"; + break; + } + std::ifstream infile(data_file); + string line; + auto qid_docids = unordered_map(); + while (std::getline(infile, line)) { + std::istringstream iss(line); + size_t label, feature_id; + string qid; + double feature_val; + string tmp_str, docid; + iss >> label >> tmp_str; + stringstream ss(tmp_str.substr(tmp_str.find(':') + 1, tmp_str.find(' '))); + ss >> qid; + + if (dataset.find(qid) == dataset.end()) { + qids.push_back(qid); + dataset[qid] = unordered_map>(); + } + auto& query_dataset = dataset[qid]; + + if (query_dataset.find(label) == query_dataset.end()) { + query_dataset[label] = vector(); + } + auto& label_dataset = query_dataset[label]; + + label_dataset.push_back(feature_vector(0)); + auto& features = label_dataset.back(); + for (auto feature_idx = 0; feature_idx < num_features_; feature_idx++) { + iss >> tmp_str; + stringstream ssid(tmp_str.substr(0, tmp_str.find(':'))); + ssid >> feature_id; + --feature_id; + stringstream ssval(tmp_str.substr(tmp_str.find(':') + 1)); + ssval >> feature_val; + features[term_id{feature_id}] = feature_val; + } + + if (data_type != TRAINING) { + if (docids.find(qid) == docids.end()) { + docids[qid] = unordered_map >(); + qid_docids[qid] = 0; + } + auto& query_docids = docids[qid]; + + if (query_docids.find(label) == query_docids.end()) { + query_docids[label] = vector(); + } + auto& label_docids = query_docids[label]; + + docid = qid + to_string(qid_docids[qid]++); + label_docids.push_back(docid); + if (relevance_map.find(qid) == relevance_map.end()) { + relevance_map[qid] = unordered_map(); + } + auto& doc_relevance = relevance_map[qid]; + doc_relevance[docid] = label; + } + } + + auto end = chrono::high_resolution_clock::now(); + chrono::duration elapsed_time = end - start; + cout << "Time spent in read_data in seconds: " + << elapsed_time.count() << endl; +} + +void pairwise_letor::build_dataset_nodes( + unordered_map>>& train_dataset, + vector& dataset_nodes) { + for (auto& query_iter : train_dataset) { + auto& query_dataset = query_iter.second; + vector label_keys; + for (auto& iter : query_dataset) { + label_keys.push_back(iter.first); + } + for (int i = 0; i < label_keys.size(); i++) { + for (int j = i + 1; j < label_keys.size(); j++) { + auto temp_label = label_keys[i] > label_keys[j] ? 1 : -1; + auto& vec1 = query_dataset[label_keys[i]]; + auto& vec2 = query_dataset[label_keys[j]]; + for (auto& vec1_iter : vec1) { + for (auto& vec2_iter : vec2) { + dataset_nodes.push_back(forward_node()); + auto& temp_node = dataset_nodes.back(); + temp_node.label = temp_label; + temp_node.fv = vec1_iter; + temp_node.fv -= vec2_iter; + } + } + } + } + } +} + +pair pairwise_letor::getRandomPair( + vector& training_qids, + unordered_map>>& train_dataset, + int random_seed) { + + default_random_engine generator(random_seed); + auto rel_levels = 0; + string qid; + do { + //select q uniformly at random from Q + auto max_q = training_qids.size(); + uniform_int_distribution qid_distribution(0, max_q - 1); + auto q_index = qid_distribution(generator); + qid = training_qids[q_index]; + auto& qid_vec = train_dataset[qid]; + rel_levels = qid_vec.size(); + + } while (rel_levels <= 1); + auto& qid_vec = train_dataset[qid]; + + //select ya uniformly at random from Y [q] + auto max_ya = qid_vec.size(); + uniform_int_distribution ya_distribution(0, max_ya - 1); + auto ya_index = ya_distribution(generator); + + auto count = 0; + auto ya = ya_index; + for (auto& iter : qid_vec) { + if (count > ya) { + break; + } + ya_index = iter.first; + count++; + } + + //select (a, ya, q) uniformly at random from P[q][ya] + auto max_a = qid_vec[ya_index].size(); + uniform_int_distribution a_distribution(0, max_a - 1); + auto a_index = a_distribution(generator); + auto& a = qid_vec[ya_index][a_index]; + auto d1 = make_tuple(a, ya_index, qid); + + //select yb uniformly at random from Y [q] − ya. + auto max_yb = max_ya - 1; + uniform_int_distribution yb_distribution(0, max_yb - 1); + auto yb_index = yb_distribution(generator); + count = 0; + auto yb = yb_index; + for (auto& iter : qid_vec) { + if (count > yb) { + break; + } + yb_index = iter.first; + if (yb_index == ya_index) { + continue; + } + count++; + } + + //select (b, yb, q) uniformly at random from P[q][yb] + auto max_b = qid_vec[yb_index].size(); + uniform_int_distribution b_distribution(0, max_b - 1); + auto b_index = b_distribution(generator); + auto& b = qid_vec[yb_index][b_index]; + auto d2 = make_tuple(b, yb_index, qid); + + return make_pair(d1, d2); +} + +void pairwise_letor::train_svm(string data_dir, string svm_path) { + auto training_qids = make_unique>(); + + auto training_dataset = + make_unique>>>(); + + auto docids = + make_unique>>>(); + auto relevance_map = + make_unique>>(); + + read_data(TRAINING,data_dir,*training_qids,*training_dataset, + *docids,*relevance_map); + + auto dataset_nodes = make_unique>(); + + build_dataset_nodes(*training_dataset, *dataset_nodes); + + //multiclass_dataset *mcdata = new multiclass_dataset(dataset_nodes->begin(), dataset_nodes->end(), feature_nums); + //classifier::dataset_view_type *mcdv = new classifier::dataset_view_type(*mcdata); + //mcdv->shuffle(); + { + random_shuffle(dataset_nodes->begin(), dataset_nodes->end()); + ofstream out{"svm-train"}; + for (const auto &node : *dataset_nodes) { + out << node.label; + for (const auto &count : node.fv) + out << ' ' << (count.first + 1) << ':' << count.second; + out << endl; + } + } + + wrapper_ = make_unique(svm_path); + +} + +void pairwise_letor::validate(string data_dir) { + auto validation_qids = make_unique>(); + auto validation_dataset = + make_unique>>>(); + auto validation_docids = + make_unique>>>(); + auto relevance_map + = make_unique>>(); + read_data(VALIDATION,data_dir,*validation_qids,*validation_dataset, + *validation_docids,*relevance_map); + cout << "Evaluation on Validation set" << endl; + evaluate(*validation_qids, *validation_dataset, *validation_docids, + *relevance_map); + +} + +void pairwise_letor::test(string data_dir) { + auto testing_qids = make_unique>(); + auto testing_dataset = + make_unique>>>(); + auto testing_docids = + make_unique>>>(); + auto relevance_map + = make_unique>>(); + read_data(TESTING, data_dir, *testing_qids, *testing_dataset, + *testing_docids, *relevance_map); + cout << "Evaluating on test data" << endl; + evaluate(*testing_qids, *testing_dataset, *testing_docids, + *relevance_map); + +} + +void pairwise_letor::evaluate(vector& qids, + unordered_map>>& dataset, + unordered_map>>& docids, + unordered_map>& relevance_map) { + auto query_num = 0; + double top_precisions[10]; + double mean_ap = 0; + double top_ndcgs[10]; + vector temp_precisions; + vector temp_relevances; + vector dcg_rankings; + double temp_ap, total_relevances, temp_ndcg, temp_idcg; + for (auto index = 0; index < 10; index++) { + top_precisions[index] = 0; + top_ndcgs[index] = 0; + } + vector> doc_scores; + for (auto& query_iter : dataset) { + auto& query_dataset = query_iter.second; + auto& query_docids = docids[query_iter.first]; + + for (auto& label_iter : query_dataset) { + auto& label_dataset = label_iter.second; + auto& label_docids = query_docids[label_iter.first]; + for (size_t doc_idx = 0; doc_idx < label_docids.size(); doc_idx++) { + auto& fv = label_dataset[doc_idx]; + auto docid = label_docids[doc_idx]; + auto score = classify_type_ == LIBSVM ? + wrapper_->computeScore(fv) : model_->predict(fv); + doc_scores.push_back(make_pair(docid, score)); + } + } + sort(doc_scores.begin(), doc_scores.end(), compare_docscore); + if (doc_scores.size() >= 10) { + auto& query_relevances = relevance_map[query_iter.first]; + auto temp_relevance = query_relevances[doc_scores[0].first]; + auto last_precision = temp_relevance > 0 ? 1 : 0; + temp_ap = last_precision * last_precision; + temp_precisions.push_back(last_precision); + temp_relevances.push_back(temp_relevance); + for (auto score_idx = 1; score_idx < doc_scores.size(); score_idx++) { + temp_relevance = query_relevances[doc_scores[score_idx].first]; + last_precision += (temp_relevance > 0 ? 1 : 0); + temp_ap += + ((double) last_precision / (score_idx + 1) * (temp_relevance > 0 ? 1 : 0)); + temp_precisions.push_back(last_precision); + temp_relevances.push_back(temp_relevance); + } + total_relevances = last_precision; + if (total_relevances > 0) { //must check + for (auto index = 0; index < 10; index++) { + top_precisions[index] += (temp_precisions[index] / (index + 1)); + } + mean_ap += (temp_ap / total_relevances); + for (auto index = 0; index < 10; index++) { + dcg_rankings.push_back(query_relevances[doc_scores[index].first]); + } + sort(temp_relevances.begin(), temp_relevances.end(), std::greater()); + for (auto index = 0; index < 10; index++) { + temp_ndcg = compute_dcg(index + 1, dcg_rankings); + temp_idcg = compute_dcg(index + 1, temp_relevances); + top_ndcgs[index] += (temp_ndcg / temp_idcg); + } + query_num++; + } + temp_precisions.clear(); + temp_relevances.clear(); + dcg_rankings.clear(); + } + doc_scores.clear(); + } + + cout << endl << "Precision on Test Data: " << endl; + for (auto index = 0; index < 10; index++) { + top_precisions[index] /= query_num; + cout << "Precision at position " << (index + 1) + << ": " << top_precisions[index] << endl; + } + mean_ap /= query_num; + + cout << endl << "Mean Average Precision on Test Data: " << endl; + cout << "MAP: " << mean_ap << endl; + + cout << endl << "NDCG on Test Data: " << endl; + for (auto index = 0; index < 10; index++) { + top_ndcgs[index] /= query_num; + cout << "NDCG at position " << (index + 1) << ": " + << top_ndcgs[index] << endl; + } + +} + +double pairwise_letor::compute_dcg(int limit, vector &rankings) { + double dcg = 0, dg; + + dg = pow(2, rankings[0]) - 1; + dcg += dg; + for (auto index = 1; index < limit; index++) { + dg = pow(2, rankings[index]) - 1; + dg /= log2(index + 1); + dcg += dg; + } + return dcg; +} +} +} +} \ No newline at end of file diff --git a/src/learn/sgd.cpp b/src/learn/sgd.cpp index 86646c468..60d4ce143 100644 --- a/src/learn/sgd.cpp +++ b/src/learn/sgd.cpp @@ -71,6 +71,14 @@ void sgd_model::save(std::ostream& out) const io::packed::write(out, t_); } +void sgd_model::save_weights(std::ostream& out) const +{ + for (const auto& weight_val : weights_) + { + out << weight_val.weight << std::endl; + } +} + double sgd_model::predict(const feature_vector& x) const { auto val = scale_ * bias_.weight; diff --git a/src/learn/tools/CMakeLists.txt b/src/learn/tools/CMakeLists.txt new file mode 100644 index 000000000..9901be384 --- /dev/null +++ b/src/learn/tools/CMakeLists.txt @@ -0,0 +1,3 @@ +add_executable(letor_main pairwise_letor_main.cpp) +target_link_libraries(letor_main meta-learn meta-loss meta-classify) + diff --git a/src/learn/tools/pairwise_letor_main.cpp b/src/learn/tools/pairwise_letor_main.cpp new file mode 100644 index 000000000..04e3d003b --- /dev/null +++ b/src/learn/tools/pairwise_letor_main.cpp @@ -0,0 +1,131 @@ +/** + * @file pairwise_letor_main.cpp + * @author Mihika Dave, Anthony Huang, Rachneet Kaur + * @date 12/18/17 + */ + +#include "meta/learn/learntorank/pairwise_letor.h" + +using namespace meta; +using namespace learn; +using namespace learntorank; + +/** + * Train the pairwise ranker using spd + * @param data_dir The path to directory containing train.txt + * @param num_features The number of features + * @param hasModel If the pairwise model is built from model file + * @param model_file The path to model file + */ +void train_spd(const string &data_dir, int num_features, int hasModel, + const string &model_file) { + //start timer + auto start = chrono::steady_clock::now(); + int continue_training; + if (hasModel) { + cout << + "Do you want to continue training the loaded sgd model? 1(yes)/0(no)" << endl; + cin >> continue_training; + } + pairwise_letor letor_model(num_features, pairwise_letor::SPD, + hasModel, model_file); + if (!hasModel || continue_training) { + cout << "start training sgd!" << endl; + + letor_model.train(data_dir); + } + auto end = chrono::steady_clock::now(); + chrono::duration training_time = end - start; + cout << "Training time in seconds: " << training_time.count() << endl; + + letor_model.validate(data_dir); + + letor_model.test(data_dir); + + cout << "trained sgd model has been saved to letor_sgd_train.model" << endl; + +} + +/** + * Train the pairwise ranker using libsvm + * @param data_dir The path to directory containing train.txt + * @param num_features The number of features + * @param hasModel If the pairwise model is built from model file + * @param model_file The path to model file + */ +void train_libsvm(const string &data_dir, int num_features, + int hasModel, const string &model_file) { + auto start = chrono::steady_clock::now(); + pairwise_letor letor_model(num_features, pairwise_letor::LIBSVM, + hasModel, model_file); + if (!hasModel) { + cout << "Please specify path to libsvm modules" << endl; + string svm_path; + cin >> svm_path; + svm_path += "/"; + cout << "Starting to train svm!" << endl; + letor_model.train_svm(data_dir, svm_path); + } + auto end = chrono::steady_clock::now(); + chrono::duration training_time = end - start; + cout << "Training time in seconds: " << training_time.count() << endl; + + letor_model.validate(data_dir); + + letor_model.test(data_dir); + + cout << "trained svm model has been saved to letor_svm_train.model" << endl; + +} + +int main(int argc, char *argv[]) { + cout << "Hello! This is Learning To Rank LETOR!" << std::endl; + if (argc != 3) { + std::cerr << + "Please specify path for training directory and the number of features" + << std::endl; + std::cerr << "Usage: ./letor_main [-data_dir] [-num_features]" << std::endl; + return 1; + } + + string data_dir; + data_dir = argv[1]; + int num_features; + stringstream ss(argv[2]); + ss >> num_features; + + int hasModel; + string model_file; + cout << "Do you want to load trained model from file? 1(yes)/0(no)" << endl; + cin >> hasModel; + if (hasModel) { + cout << "Please specify path to your model file" << endl; + cin >> model_file; + cout << "Path to your model file is: " << model_file << endl; + } + + int selected_method; + cout << "Please select classification method to use: 0(libsvm), 1(spd)" << endl; + cin >> selected_method; + switch (selected_method) { + case 0: + cout << "libsvm will be used for training and testing" << endl; + break; + case 1: + cout << "spd will be used for training and testing" << endl; + break; + default: + break; + } + + if (selected_method == 0) { + train_libsvm(data_dir, num_features, hasModel, model_file); + } else { + train_spd(data_dir, num_features, hasModel, model_file); + } + cout << "letor weights value are saved to letor.weights" << endl; + + cout << "Exiting Learning To Rank!" << std::endl; + return 0; +} +