diff --git a/build.rs b/build.rs index 1d94bfe..f211ff3 100644 --- a/build.rs +++ b/build.rs @@ -2,7 +2,6 @@ extern crate bindgen; use std::env; use std::fs; -use std::io; use std::path::{Path, PathBuf}; fn get_catboost_version() -> String { @@ -38,213 +37,63 @@ fn get_platform_info() -> (String, String) { } fn download_model_interface_headers(out_dir: &Path) -> Result<(), Box> { - let version = get_catboost_version(); - // Create the model_interface directory let model_interface_dir = out_dir.join("libs/model_interface"); fs::create_dir_all(&model_interface_dir)?; - // Download the c_api.h file - let c_api_url = format!( - "https://raw.githubusercontent.com/catboost/catboost/v{}/catboost/libs/model_interface/c_api.h", - version - ); + // Use bundled c_api.h file (hardcoded for testing); located via CARGO_MANIFEST_DIR, which Cargo sets for build scripts (file!() may be a relative path) + let manifest_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR")?); + let bundled_c_api = manifest_dir.join("c_api.h"); + let c_api_path = model_interface_dir.join("c_api.h"); - println!("cargo:warning=Downloading c_api.h from: {}", c_api_url); + println!("cargo:warning=Using bundled c_api.h from: {}", bundled_c_api.display()); - let response = ureq::get(&c_api_url).call()?; - let status = response.status(); - if !(200..300).contains(&status) { - return Err(format!("Failed to download c_api.h: HTTP {}", status).into()); - } - - let c_api_path = model_interface_dir.join("c_api.h"); - let mut file = fs::File::create(&c_api_path)?; - io::copy(&mut response.into_reader(), &mut file)?; + fs::copy(&bundled_c_api, &c_api_path)?; Ok(()) } fn download_compiled_library(out_dir: &Path) -> Result<(), Box> { let (os, arch) = get_platform_info(); - let version = get_catboost_version(); - // Create the library directory early + // Create the library directory let lib_dir = out_dir.join("libs"); fs::create_dir_all(&lib_dir)?; - // Parse version to determine URL format - // v1.0.x - v1.1.x use simple filenames - // v1.2+ use platform-specific versioned filenames - let version_parts: Vec<&str> = 
version.split('.').collect(); - let major: u32 = version_parts - .first() - .and_then(|s| s.parse().ok()) - .unwrap_or(1); - let minor: u32 = version_parts - .get(1) - .and_then(|s| s.parse().ok()) - .unwrap_or(0); - - let use_new_format = major > 1 || (major == 1 && minor >= 2); - - // Determine download URL based on version and platform - let (lib_filename, download_url) = if use_new_format { - // v1.2+ format with platform and version in filename - match (os.as_str(), arch.as_str()) { - ("linux", "x86_64") => ( - "libcatboostmodel.so".to_string(), - format!( - "https://github.com/catboost/catboost/releases/download/v{}/libcatboostmodel-linux-x86_64-{}.so", - version, version - ), - ), - ("linux", "aarch64") => ( - "libcatboostmodel.so".to_string(), - format!( - "https://github.com/catboost/catboost/releases/download/v{}/libcatboostmodel-linux-aarch64-{}.so", - version, version - ), - ), - ("darwin", "x86_64") | ("darwin", "aarch64") => ( - "libcatboostmodel.dylib".to_string(), - format!( - "https://github.com/catboost/catboost/releases/download/v{}/libcatboostmodel-darwin-universal2-{}.dylib", - version, version - ), - ), - ("windows", "x86_64") => { - // On Windows, we need to download both the DLL and LIB files - // First download the DLL - let dll_url = format!( - "https://github.com/catboost/catboost/releases/download/v{}/catboostmodel-windows-x86_64-{}.dll", - version, version - ); - println!("cargo:warning=Downloading Windows DLL from: {}", dll_url); - let dll_response = ureq::get(&dll_url).call()?; - if !(200..300).contains(&dll_response.status()) { - return Err( - format!("Failed to download DLL: HTTP {}", dll_response.status()).into(), - ); - } - let dll_path = lib_dir.join("catboostmodel.dll"); - let mut dll_file = fs::File::create(&dll_path)?; - io::copy(&mut dll_response.into_reader(), &mut dll_file)?; - - // Then download the LIB file - let lib_url = format!( - 
"https://github.com/catboost/catboost/releases/download/v{}/catboostmodel-windows-x86_64-{}.lib", - version, version - ); - println!("cargo:warning=Downloading Windows LIB from: {}", lib_url); - let lib_response = ureq::get(&lib_url).call()?; - if !(200..300).contains(&lib_response.status()) { - return Err( - format!("Failed to download LIB: HTTP {}", lib_response.status()).into(), - ); - } - let lib_path = lib_dir.join("catboostmodel.lib"); - let mut lib_file = fs::File::create(&lib_path)?; - io::copy(&mut lib_response.into_reader(), &mut lib_file)?; - - // Return early for Windows since we've already downloaded both files - println!( - "cargo:warning=Downloaded CatBoost library to: {}", - dll_path.display() - ); - return Ok(()); - } - ("windows", "aarch64") => ( - "catboostmodel.dll".to_string(), - format!( - "https://github.com/catboost/catboost/releases/download/v{}/catboostmodel-windows-aarch64-{}.dll", - version, version - ), - ), - _ => return Err(format!("Unsupported platform: {}-{}", os, arch).into()), - } - } else { - // v1.0.x - v1.1.x format with simple filenames - match os.as_str() { - "linux" => ( - "libcatboostmodel.so".to_string(), - format!( - "https://github.com/catboost/catboost/releases/download/v{}/libcatboostmodel.so", - version - ), - ), - "darwin" => ( - "libcatboostmodel.dylib".to_string(), - format!( - "https://github.com/catboost/catboost/releases/download/v{}/libcatboostmodel.dylib", - version - ), - ), - "windows" => { - // On Windows, we need to download both the DLL and LIB files - // First download the DLL - let dll_url = format!( - "https://github.com/catboost/catboost/releases/download/v{}/catboostmodel.dll", - version - ); - println!("cargo:warning=Downloading Windows DLL from: {}", dll_url); - let dll_response = ureq::get(&dll_url).call()?; - if !(200..300).contains(&dll_response.status()) { - return Err( - format!("Failed to download DLL: HTTP {}", dll_response.status()).into(), - ); - } - let dll_path = 
lib_dir.join("catboostmodel.dll"); - let mut dll_file = fs::File::create(&dll_path)?; - io::copy(&mut dll_response.into_reader(), &mut dll_file)?; - - // Then download the LIB file - let lib_url = format!( - "https://github.com/catboost/catboost/releases/download/v{}/catboostmodel.lib", - version - ); - println!("cargo:warning=Downloading Windows LIB from: {}", lib_url); - let lib_response = ureq::get(&lib_url).call()?; - if !(200..300).contains(&lib_response.status()) { - return Err( - format!("Failed to download LIB: HTTP {}", lib_response.status()).into(), - ); - } - let lib_path = lib_dir.join("catboostmodel.lib"); - let mut lib_file = fs::File::create(&lib_path)?; - io::copy(&mut lib_response.into_reader(), &mut lib_file)?; - - // Return early for Windows since we've already downloaded both files - println!( - "cargo:warning=Downloaded CatBoost library to: {}", - dll_path.display() - ); - return Ok(()); - } - _ => return Err(format!("Unsupported platform: {}", os).into()), + // Use bundled library file based on target platform (hardcoded for testing); located via CARGO_MANIFEST_DIR, which Cargo sets for build scripts (file!() may be a relative path) + let manifest_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR")?); + + // Determine source and target filenames based on OS and architecture. NOTE(review): the previous download path also placed catboostmodel.lib in libs/ on Windows; only the DLL is copied here - confirm the import library is not needed for linking + let (bundled_lib, lib_filename) = match (os.as_str(), arch.as_str()) { + ("windows", _) => ( + manifest_dir.join("catboostmodel.dll"), + "catboostmodel.dll" + ), + ("darwin", _) => ( + manifest_dir.join("libcatboostmodel.dylib"), + "libcatboostmodel.dylib" + ), + ("linux", "x86_64") => ( + manifest_dir.join("libcatboostmodel-x86_64.so"), + "libcatboostmodel.so" + ), + ("linux", "aarch64") => ( + manifest_dir.join("libcatboostmodel.so"), + "libcatboostmodel.so" + ), + _ => { + return Err(format!("Unsupported platform: {}-{}", os, arch).into()); } }; - println!( - "cargo:warning=Downloading CatBoost v{} library from: {}", - version, download_url - ); + let lib_path = lib_dir.join(lib_filename); - // Download the library directly into the `libs` directory with its correct 
name - let lib_path = lib_dir.join(&lib_filename); - let mut dest = fs::File::create(&lib_path)?; - - let response = ureq::get(&download_url).call()?; - let status = response.status(); - if !(200..300).contains(&status) { - return Err(format!("Failed to download library: HTTP {}", status).into()); - } + println!("cargo:warning=Using bundled {} library from: {}", arch, bundled_lib.display()); - // SIMPLIFIED: No need for extraction, just copy the downloaded content - io::copy(&mut response.into_reader(), &mut dest)?; + fs::copy(&bundled_lib, &lib_path)?; println!( - "cargo:warning=Downloaded CatBoost library to: {}", + "cargo:warning=Copied CatBoost library to: {}", lib_path.display() ); diff --git a/c_api.h b/c_api.h new file mode 100644 index 0000000..a1b02f1 --- /dev/null +++ b/c_api.h @@ -0,0 +1,692 @@ +#pragma once + +#include +#include + + +#define CATBOOST_APPLIER_MAJOR 1 +#define CATBOOST_APPLIER_MINOR 2 +#define CATBOOST_APPLIER_FIX 8 + +#if defined(__cplusplus) +extern "C" { +#endif + + +#if defined(_WIN32) && !defined(CATBOOST_API_STATIC_LIB) +#ifdef _WINDLL +#define CATBOOST_API __declspec(dllexport) +#else +#define CATBOOST_API __declspec(dllimport) +#endif +#else +#define CATBOOST_API +#endif + +typedef void DataWrapperHandle; + +typedef void DataProviderHandle; + +/** + * Create empty data wrapper + * @return + */ +CATBOOST_API DataWrapperHandle* DataWrapperCreate(size_t docsCount); + +CATBOOST_API void DataWrapperDelete(DataWrapperHandle* dataWrapperHandle); + +CATBOOST_API void AddFloatFeatures(DataWrapperHandle* dataWrapperHandle, const float** floatFeatures, size_t floatFeaturesSize); + +CATBOOST_API void AddCatFeatures(DataWrapperHandle* dataWrapperHandle, const char*** catFeatures, size_t catFeaturesSize); + +CATBOOST_API void AddTextFeatures(DataWrapperHandle* dataWrapperHandle, const char*** textFeatures, size_t textFeaturesSize); + +CATBOOST_API void AddEmbeddingFeatures(DataWrapperHandle* dataWrapperHandle, const float*** embeddingFeatures, 
size_t* embeddingDimensions, size_t embeddingFeaturesSize); + +CATBOOST_API DataProviderHandle* BuildDataProvider(DataWrapperHandle* dataWrapperHandle); + +typedef void ModelCalcerHandle; + +enum EApiPredictionType { + APT_RAW_FORMULA_VAL = 0, + APT_EXPONENT = 1, + APT_RMSE_WITH_UNCERTAINTY = 2, + APT_PROBABILITY = 3, + APT_CLASS = 4, + APT_MULTI_PROBABILITY = 5, +}; + +enum ECatBoostApiFormulaEvaluatorType { + CBA_FET_CPU = 0, + CBA_FET_GPU = 1, +}; + +/** + * Create empty model handle + * @return + */ +CATBOOST_API ModelCalcerHandle* ModelCalcerCreate(); + +/** + * Delete model handle + * @param calcer + */ +CATBOOST_API void ModelCalcerDelete(ModelCalcerHandle* modelHandle); + +/** + * If error occured will return stored exception message. + * If no error occured, will return invalid pointer + * The underlying variable is thread-local so: + * - it is thread-safe to get it + * - indicates only errors that happened in the current thread + * @return Error message string. Uses UTF-8 encoding + */ +CATBOOST_API const char* GetErrorString(); + +/** + * Load model from file into given model handle + * @param calcer + * @param filename path to the file. 
Uses UTF-8 encoding + * @return false if error occured + */ +CATBOOST_API bool LoadFullModelFromFile( + ModelCalcerHandle* modelHandle, + const char* filename); + +/** + * Load model from memory buffer into given model handle + * @param calcer + * @param binaryBuffer pointer to a memory buffer where model file is mapped + * @param binaryBufferSize size of the buffer in bytes + * @return false if error occured + */ +CATBOOST_API bool LoadFullModelFromBuffer( + ModelCalcerHandle* modelHandle, + const void* binaryBuffer, + size_t binaryBufferSize); + + +/** + * Use model directly from given memory region with zero-copy method + * @param calcer + * @param binaryBuffer pointer to a memory buffer where model file is mapped + * @param binaryBufferSize size of the buffer in bytes + * @return false if error occured + */ +CATBOOST_API bool LoadFullModelZeroCopy( + ModelCalcerHandle* modelHandle, + const void* binaryBuffer, + size_t binaryBufferSize); + +/** + * Use CUDA GPU device for model evaluation +*/ +CATBOOST_API bool EnableGPUEvaluation(ModelCalcerHandle* modelHandle, int deviceId); + +/** + * Get supported formula evaluator types + * formulaEvaluatorTypes array must be deallocated using free() after use. 
+ * + * @param modelHandle model handle + * @param formulaEvaluatorTypes address of the pointer to an array that will be initialized with formula evaluator types + * @param formulaEvaluatorTypesCount address of the variable where the size of formulaEvaluatorTypes array will be stored + * @return true on success, false on error + */ +CATBOOST_API bool GetSupportedEvaluatorTypes( + ModelCalcerHandle* modelHandle, + enum ECatBoostApiFormulaEvaluatorType** formulaEvaluatorTypes, + size_t* formulaEvaluatorTypesCount); + + +/** + * Set prediction type for model evaluation +*/ +CATBOOST_API bool SetPredictionType(ModelCalcerHandle* modelHandle, enum EApiPredictionType predictionType); + +/** + * Set prediction type for model evaluation with string constant +*/ +CATBOOST_API bool SetPredictionTypeString(ModelCalcerHandle* modelHandle, const char* predictionTypeStr); + + +/** + * **Use this method only if you really understand what you want.** + * Calculate raw model predictions on flat feature vectors + * Flat here means that float features and categorical feature are in the same float array. + * @param calcer model handle + * @param docCount number of objects + * @param floatFeatures array of array of float (first dimension is object index, second is feature index) + * @param floatFeaturesSize float values array size + * @param result pointer to user allocated results vector + * @param resultSize Result size should be equal to modelApproxDimension * docCount + * (e.g. 
for non multiclass models should be equal to docCount) + * @return false if error occured + */ +CATBOOST_API bool CalcModelPredictionFlat( + ModelCalcerHandle* modelHandle, + size_t docCount, + const float** floatFeatures, size_t floatFeaturesSize, + double* result, size_t resultSize); + + +/** + * **Use this method only if you really understand what you want.** + * Calculate raw model predictions on flat feature vectors + * taking into consideration only the trees in the range [treeStart; treeEnd) + * Flat here means that float features and categorical feature are in the same float array. + * @param calcer model handle + * @param docCount number of objects + * @param treeStart the index of the first tree to be used when applying the model (zero-based) + * @param treeEnd the index of the last tree to be used when applying the model (non-inclusive, zero-based) + * @param floatFeatures array of array of float (first dimension is object index, second is feature index) + * @param floatFeaturesSize float values array size + * @param result pointer to user allocated results vector + * @param resultSize Result size should be equal to modelApproxDimension * docCount + * (e.g. 
for non multiclass models should be equal to docCount) + * @return false if error occured + */ +CATBOOST_API bool CalcModelPredictionFlatStaged( + ModelCalcerHandle* modelHandle, + size_t docCount, + size_t treeStart, size_t treeEnd, + const float** floatFeatures, size_t floatFeaturesSize, + double* result, size_t resultSize); + + +/** + * **Use this method only if you really understand what you want.** + * Calculate raw model predictions on transposed dataset layout + * @param calcer model handle + * @param docCount number of objects + * @param floatFeatures array of array of float (first dimension is feature index, second is object index) + * @param floatFeaturesSize float values array size + * @param result pointer to user allocated results vector + * @param resultSize Result size should be equal to modelApproxDimension * docCount + * (e.g. for non multiclass models should be equal to docCount) + * @return false if error occured + */ +CATBOOST_API bool CalcModelPredictionFlatTransposed( + ModelCalcerHandle* modelHandle, + size_t docCount, + const float** floatFeatures, size_t floatFeaturesSize, + double* result, size_t resultSize); + + +/** + * **Use this method only if you really understand what you want.** + * Calculate raw model predictions on transposed dataset layout + * taking into consideration only the trees in the range [treeStart; treeEnd) + * @param calcer model handle + * @param docCount number of objects + * @param treeStart the index of the first tree to be used when applying the model (zero-based) + * @param treeEnd the index of the last tree to be used when applying the model (non-inclusive, zero-based) + * @param floatFeatures array of array of float (first dimension is feature index, second is object index) + * @param floatFeaturesSize float values array size + * @param result pointer to user allocated results vector + * @param resultSize Result size should be equal to modelApproxDimension * docCount + * (e.g. 
for non multiclass models should be equal to docCount) + * @return false if error occured + */ +CATBOOST_API bool CalcModelPredictionFlatTransposedStaged( + ModelCalcerHandle* modelHandle, + size_t docCount, + size_t treeStart, size_t treeEnd, + const float** floatFeatures, size_t floatFeaturesSize, + double* result, size_t resultSize); + + +/** + * Calculate raw model predictions on float features and string categorical feature values + * @param calcer model handle + * @param docCount object count + * @param floatFeatures array of array of float (first dimension is object index, second is feature index) + * @param floatFeaturesSize float feature count + * @param catFeatures array of array of char* categorical value pointers. + * String pointer should point to zero terminated string. + * @param catFeaturesSize categorical feature count + * @param result pointer to user allocated results vector + * @param resultSize result size should be equal to modelApproxDimension * docCount + * (e.g. 
for non multiclass models should be equal to docCount) + * @return false if error occured + */ +CATBOOST_API bool CalcModelPrediction( + ModelCalcerHandle* modelHandle, + size_t docCount, + const float** floatFeatures, size_t floatFeaturesSize, + const char*** catFeatures, size_t catFeaturesSize, + double* result, size_t resultSize); + +/** + * Calculate raw model predictions on float features and string categorical feature values + * taking into consideration only the trees in the range [treeStart; treeEnd) + * @param calcer model handle + * @param docCount object count + * @param treeStart the index of the first tree to be used when applying the model (zero-based) + * @param treeEnd the index of the last tree to be used when applying the model (non-inclusive, zero-based) + * @param floatFeatures array of array of float (first dimension is object index, second is feature index) + * @param floatFeaturesSize float feature count + * @param catFeatures array of array of char* categorical value pointers. + * String pointer should point to zero terminated string. + * @param catFeaturesSize categorical feature count + * @param result pointer to user allocated results vector + * @param resultSize result size should be equal to modelApproxDimension * docCount + * (e.g. 
for non multiclass models should be equal to docCount) + * @return false if error occured + */ +CATBOOST_API bool CalcModelPredictionStaged( + ModelCalcerHandle* modelHandle, + size_t docCount, + size_t treeStart, size_t treeEnd, + const float** floatFeatures, size_t floatFeaturesSize, + const char*** catFeatures, size_t catFeaturesSize, + double* result, size_t resultSize); + + +/** + * Calculate raw model predictions on float features and string categorical feature values + * @param calcer model handle + * @param docCount object count + * @param floatFeatures array of array of float (first dimension is object index, second is feature index) + * @param floatFeaturesSize float feature count + * @param catFeatures array of array of char* categorical value pointers. + * String pointer should point to zero terminated string. + * @param catFeaturesSize categorical feature count + * @param textFeatures array of array of char* text value pointers. + * String pointer should point to zero terminated string. + * @param textFeaturesSize text feature count + * @param result pointer to user allocated results vector + * @param resultSize result size should be equal to modelApproxDimension * docCount + * (e.g. 
for non multiclass models should be equal to docCount) + * @return false if error occured + */ +CATBOOST_API bool CalcModelPredictionText( + ModelCalcerHandle* modelHandle, + size_t docCount, + const float** floatFeatures, size_t floatFeaturesSize, + const char*** catFeatures, size_t catFeaturesSize, + const char*** textFeatures, size_t textFeaturesSize, + double* result, size_t resultSize); + + +/** + * Calculate raw model predictions on float features and string categorical feature values + * taking into consideration only the trees in the range [treeStart; treeEnd) + * @param calcer model handle + * @param docCount object count + * @param treeStart the index of the first tree to be used when applying the model (zero-based) + * @param treeEnd the index of the last tree to be used when applying the model (non-inclusive, zero-based) + * @param floatFeatures array of array of float (first dimension is object index, second is feature index) + * @param floatFeaturesSize float feature count + * @param catFeatures array of array of char* categorical value pointers. + * String pointer should point to zero terminated string. + * @param catFeaturesSize categorical feature count + * @param textFeatures array of array of char* text value pointers. + * String pointer should point to zero terminated string. + * @param textFeaturesSize text feature count + * @param result pointer to user allocated results vector + * @param resultSize result size should be equal to modelApproxDimension * docCount + * (e.g. 
for non multiclass models should be equal to docCount) + * @return false if error occured + */ +CATBOOST_API bool CalcModelPredictionTextStaged( + ModelCalcerHandle* modelHandle, + size_t docCount, + size_t treeStart, size_t treeEnd, + const float** floatFeatures, size_t floatFeaturesSize, + const char*** catFeatures, size_t catFeaturesSize, + const char*** textFeatures, size_t textFeaturesSize, + double* result, size_t resultSize); + + +/** + * Calculate raw model predictions on float features and string categorical feature values + * @param calcer model handle + * @param docCount object count + * @param floatFeatures array of array of float (first dimension is object index, second is feature index) + * @param floatFeaturesSize float feature count + * @param catFeatures array of array of char* categorical value pointers. + * String pointer should point to zero terminated string. + * @param catFeaturesSize categorical feature count + * @param textFeatures array of array of char* text value pointers. + * String pointer should point to zero terminated string. + * @param textFeaturesSize text feature count + * @param embeddingFeatures array of array of array of float (first dimension is object index, second is feature index, third is index in embedding array). + * String pointer should point to zero terminated string. + * @param embeddingFeaturesSize embedding feature count + * @param result pointer to user allocated results vector + * @param resultSize result size should be equal to modelApproxDimension * docCount + * (e.g. 
for non multiclass models should be equal to docCount) + * @return false if error occured + */ +CATBOOST_API bool CalcModelPredictionTextAndEmbeddings( + ModelCalcerHandle* modelHandle, + size_t docCount, + const float** floatFeatures, size_t floatFeaturesSize, + const char*** catFeatures, size_t catFeaturesSize, + const char*** textFeatures, size_t textFeaturesSize, + const float*** embeddingFeatures, size_t* embeddingDimensions, size_t embeddingFeaturesSize, + double* result, size_t resultSize); + + +/** + * Calculate raw model predictions on float features and string categorical feature values + * taking into consideration only the trees in the range [treeStart; treeEnd) + * @param calcer model handle + * @param docCount object count + * @param treeStart the index of the first tree to be used when applying the model (zero-based) + * @param treeEnd the index of the last tree to be used when applying the model (non-inclusive, zero-based) + * @param floatFeatures array of array of float (first dimension is object index, second is feature index) + * @param floatFeaturesSize float feature count + * @param catFeatures array of array of char* categorical value pointers. + * String pointer should point to zero terminated string. + * @param catFeaturesSize categorical feature count + * @param textFeatures array of array of char* text value pointers. + * String pointer should point to zero terminated string. + * @param textFeaturesSize text feature count + * @param embeddingFeatures array of array of array of float (first dimension is object index, second is feature index, third is index in embedding array). + * String pointer should point to zero terminated string. + * @param embeddingFeaturesSize embedding feature count + * @param result pointer to user allocated results vector + * @param resultSize result size should be equal to modelApproxDimension * docCount + * (e.g. 
for non multiclass models should be equal to docCount) + * @return false if error occured + */ +CATBOOST_API bool CalcModelPredictionTextAndEmbeddingsStaged( + ModelCalcerHandle* modelHandle, + size_t docCount, + size_t treeStart, size_t treeEnd, + const float** floatFeatures, size_t floatFeaturesSize, + const char*** catFeatures, size_t catFeaturesSize, + const char*** textFeatures, size_t textFeaturesSize, + const float*** embeddingFeatures, size_t* embeddingDimensions, size_t embeddingFeaturesSize, + double* result, size_t resultSize); + + +/** + * Calculate raw model prediction on float features and string categorical feature values for single object + * @param calcer model handle + * @param floatFeatures array of float features + * @param floatFeaturesSize float feature count + * @param catFeatures array of char* categorical feature value pointers. + * Each string pointer should point to zero terminated string. + * @param catFeaturesSize categorical feature count + * @param result pointer to user allocated results vector (or single double) + * @param resultSize result size should be equal to modelApproxDimension + * (e.g. 
for non multiclass models should be equal to 1) + * @return false if error occured + */ +CATBOOST_API bool CalcModelPredictionSingle( + ModelCalcerHandle* modelHandle, + const float* floatFeatures, size_t floatFeaturesSize, + const char** catFeatures, size_t catFeaturesSize, + double* result, size_t resultSize); + + +/** + * Calculate raw model prediction on float features and string categorical feature values for single object + * taking into consideration only the trees in the range [treeStart; treeEnd) + * @param calcer model handle + * @param treeStart the index of the first tree to be used when applying the model (zero-based) + * @param treeEnd the index of the last tree to be used when applying the model (non-inclusive, zero-based) + * @param floatFeatures array of float features + * @param floatFeaturesSize float feature count + * @param catFeatures array of char* categorical feature value pointers. + * Each string pointer should point to zero terminated string. + * @param catFeaturesSize categorical feature count + * @param result pointer to user allocated results vector (or single double) + * @param resultSize result size should be equal to modelApproxDimension + * (e.g. for non multiclass models should be equal to 1) + * @return false if error occured + */ +CATBOOST_API bool CalcModelPredictionSingleStaged( + ModelCalcerHandle* modelHandle, + size_t treeStart, size_t treeEnd, + const float* floatFeatures, size_t floatFeaturesSize, + const char** catFeatures, size_t catFeaturesSize, + double* result, size_t resultSize); + + +/** + * Calculate raw model predictions on float features and hashed categorical feature values + * @param calcer model handle + * @param docCount object count + * @param floatFeatures array of array of float (first dimension is object index, second if feature index) + * @param floatFeaturesSize float feature count + * @param catFeatures array of array of integers - hashed categorical feature values. 
+ * @param catFeaturesSize categorical feature count + * @param result pointer to user allocated results vector + * @param resultSize result size should be equal to modelApproxDimension * docCount + * (e.g. for non multiclass models should be equal to docCount) + * @return false if error occured + */ +CATBOOST_API bool CalcModelPredictionWithHashedCatFeatures( + ModelCalcerHandle* modelHandle, + size_t docCount, + const float** floatFeatures, size_t floatFeaturesSize, + const int** catFeatures, size_t catFeaturesSize, + double* result, size_t resultSize); + +CATBOOST_API bool CalcModelPredictionWithHashedCatFeaturesAndTextFeatures( + ModelCalcerHandle* modelHandle, + size_t docCount, + const float** floatFeatures, size_t floatFeaturesSize, + const int** catFeatures, size_t catFeaturesSize, + const char*** textFeatures, size_t textFeaturesSize, + double* result, size_t resultSize); + +CATBOOST_API bool CalcModelPredictionWithHashedCatFeaturesAndTextAndEmbeddingFeatures( + ModelCalcerHandle* modelHandle, + size_t docCount, + const float** floatFeatures, size_t floatFeaturesSize, + const int** catFeatures, size_t catFeaturesSize, + const char*** textFeatures, size_t textFeaturesSize, + const float*** embeddingFeatures, size_t* embeddingDimensions, size_t embeddingFeaturesSize, + double* result, size_t resultSize); + +/** + * Methods equivalent to the methods above + * only returning a prediction for the specific class + * @param classId number of the class should be in [0, modelApproxDimension - 1] + * @param resultSize result size should be equal to docCount +*/ +CATBOOST_API bool PredictSpecificClassFlat( + ModelCalcerHandle* modelHandle, + size_t docCount, + const float** floatFeatures, size_t floatFeaturesSize, + int classId, + double* result, size_t resultSize); + +CATBOOST_API bool PredictSpecificClass( + ModelCalcerHandle* modelHandle, + size_t docCount, + const float** floatFeatures, size_t floatFeaturesSize, + const char*** catFeatures, size_t 
catFeaturesSize, + int classId, + double* result, size_t resultSize); + +CATBOOST_API bool PredictSpecificClassText( + ModelCalcerHandle* modelHandle, + size_t docCount, + const float** floatFeatures, size_t floatFeaturesSize, + const char*** catFeatures, size_t catFeaturesSize, + const char*** textFeatures, size_t textFeaturesSize, + int classId, + double* result, size_t resultSize); + +CATBOOST_API bool PredictSpecificClassTextAndEmbeddings( + ModelCalcerHandle* modelHandle, + size_t docCount, + const float** floatFeatures, size_t floatFeaturesSize, + const char*** catFeatures, size_t catFeaturesSize, + const char*** textFeatures, size_t textFeaturesSize, + const float*** embeddingFeatures, size_t* embeddingDimensions, size_t embeddingFeaturesSize, + int classId, + double* result, size_t resultSize); + +CATBOOST_API bool PredictSpecificClassSingle( + ModelCalcerHandle* modelHandle, + const float* floatFeatures, size_t floatFeaturesSize, + const char** catFeatures, size_t catFeaturesSize, + int classId, + double* result, size_t resultSize); + +CATBOOST_API bool PredictSpecificClassWithHashedCatFeatures( + ModelCalcerHandle* modelHandle, + size_t docCount, + const float** floatFeatures, size_t floatFeaturesSize, + const int** catFeatures, size_t catFeaturesSize, + int classId, + double* result, size_t resultSize); + +CATBOOST_API bool PredictSpecificClassWithHashedCatFeaturesAndTextFeatures( + ModelCalcerHandle* modelHandle, + size_t docCount, + const float** floatFeatures, size_t floatFeaturesSize, + const int** catFeatures, size_t catFeaturesSize, + const char*** textFeatures, size_t textFeaturesSize, + int classId, + double* result, size_t resultSize); + +CATBOOST_API bool PredictSpecificClassWithHashedCatFeaturesAndTextAndEmbeddingFeatures( + ModelCalcerHandle* modelHandle, + size_t docCount, + const float** floatFeatures, size_t floatFeaturesSize, + const int** catFeatures, size_t catFeaturesSize, + const char*** textFeatures, size_t textFeaturesSize, + const 
float*** embeddingFeatures, size_t* embeddingDimensions, size_t embeddingFeaturesSize, + int classId, + double* result, size_t resultSize); + +/** + * Get hash for given string value + * @param data we don't expect data to be zero terminated, so pass correct size + * @param size string length + * @return hash value + */ +CATBOOST_API int GetStringCatFeatureHash(const char* data, size_t size); + +/** + * Special case for hash calculation - integer hash. + * Internally we cast value to string and then calculate string hash function. + * Used in ClickHouse for catboost model evaluation on integer cat features. + * @param val integer cat feature value + * @return hash value + */ +CATBOOST_API int GetIntegerCatFeatureHash(long long val); + +/** + * Get expected float feature count for model + * @param modelHandle model handle + */ +CATBOOST_API size_t GetFloatFeaturesCount(ModelCalcerHandle* modelHandle); + +/** + * Get expected indices of float features used in the model. + * indices array must be deallocated using free() after use. + * @param modelHandle model handle + * @param indices indices of the features + * @param count indices size + * @return true on success, false on error + */ +CATBOOST_API bool GetFloatFeatureIndices(ModelCalcerHandle* modelHandle, size_t** indices, size_t* count); + +/** + * Get expected categorical feature count for model + * @param modelHandle model handle + */ +CATBOOST_API size_t GetCatFeaturesCount(ModelCalcerHandle* modelHandle); + +/** + * Get expected indices of category features used in the model. + * indices array must be deallocated using free() after use.
+ * @param modelHandle model handle + * @param indices indices of the features + * @param count indices size + * @return true on success, false on error + */ +CATBOOST_API bool GetCatFeatureIndices(ModelCalcerHandle* modelHandle, size_t** indices, size_t* count); + +/** + * Get expected text feature count for model + * @param modelHandle model handle + */ +CATBOOST_API size_t GetTextFeaturesCount(ModelCalcerHandle* modelHandle); + +/** + * Get expected indices of text features used in the model. + * indices array must be deallocated using free() after use. + * @param modelHandle model handle + * @param indices indices of the features + * @param count indices size + * @return true on success, false on error + */ +CATBOOST_API bool GetTextFeatureIndices(ModelCalcerHandle* modelHandle, size_t** indices, size_t* count); + +/** + * Get expected embedding feature count for model + * @param modelHandle model handle + */ +CATBOOST_API size_t GetEmbeddingFeaturesCount(ModelCalcerHandle* modelHandle); + +/** + * Get expected indices of embedding features used in the model. + * indices array must be deallocated using free() after use.
+ * @param modelHandle model handle + * @param indices indices of the features + * @param count indices size + * @return true on success, false on error + */ +CATBOOST_API bool GetEmbeddingFeatureIndices(ModelCalcerHandle* modelHandle, size_t** indices, size_t* count); + +/** + * Get number of trees in model + * @param modelHandle model handle + */ +CATBOOST_API size_t GetTreeCount(ModelCalcerHandle* modelHandle); + +/** + * Get number of dimensions in model + * @param modelHandle model handle + */ +CATBOOST_API size_t GetDimensionsCount(ModelCalcerHandle* modelHandle); + +/** + * Get number of dimensions for current prediction + * For default `APT_RAW_FORMULA_VAL`, `APT_EXPONENT`, `APT_PROBABILITY`, `APT_CLASS` prediction type GetPredictionDimensionsCount == GetDimensionsCount + * For `APT_RMSE_WITH_UNCERTAINTY` - returns 2 (value prediction and predicted uncertainty) + * @param modelHandle model handle + */ +CATBOOST_API size_t GetPredictionDimensionsCount(ModelCalcerHandle* modelHandle); + + +/** + * Check if model metadata holds some value for provided key + * @param modelHandle model handle + */ +CATBOOST_API bool CheckModelMetadataHasKey(ModelCalcerHandle* modelHandle, const char* keyPtr, size_t keySize); + +/** + * Get model metainfo value size for some key. Returns 0 both if key is missing in model metadata and if it is really missing + * @param modelHandle model handle + */ +CATBOOST_API size_t GetModelInfoValueSize(ModelCalcerHandle* modelHandle, const char* keyPtr, size_t keySize); + +/** + * Get model metainfo for some key. Returns const char* pointer to inner string. If key is missing in model metainfo storage this method will return nullptr + * @param modelHandle model handle + */ +CATBOOST_API const char* GetModelInfoValue(ModelCalcerHandle* modelHandle, const char* keyPtr, size_t keySize); + + +/** + * Get names of features used in the model. + * individual strings in featureNames array and featureNames array itself must be deallocated using free() after use.
+ * + * @return true on success, false on error + */ +CATBOOST_API bool GetModelUsedFeaturesNames(ModelCalcerHandle* modelHandle, char*** featureNames, size_t* featureCount); + + +#if defined(__cplusplus) +} +#endif diff --git a/libcatboostmodel-x86_64.so b/libcatboostmodel-x86_64.so new file mode 100755 index 0000000..342a0a5 Binary files /dev/null and b/libcatboostmodel-x86_64.so differ diff --git a/libcatboostmodel.dylib b/libcatboostmodel.dylib new file mode 100755 index 0000000..96dc82a Binary files /dev/null and b/libcatboostmodel.dylib differ diff --git a/libcatboostmodel.so b/libcatboostmodel.so new file mode 100755 index 0000000..2fcbb0a Binary files /dev/null and b/libcatboostmodel.so differ diff --git a/src/model.rs b/src/model.rs index eb4a76b..79030da 100644 --- a/src/model.rs +++ b/src/model.rs @@ -4,9 +4,14 @@ use crate::sys; use std::ffi::{CStr, CString}; use std::os::raw::c_char; use std::path::Path; +use std::sync::Arc; pub struct Model { handle: *mut sys::ModelCalcerHandle, + /// Buffer owner for zero-copy loading - keeps the buffer alive for model's lifetime + /// When using LoadFullModelZeroCopy, the model doesn't copy data and instead + /// points directly to this buffer. This field MUST stay alive as long as the model exists. + _buffer_owner: Option<Arc<Vec<u8>>>, } unsafe impl Send for Model {} @@ -17,6 +22,7 @@ impl Model { let model_handle = unsafe { sys::ModelCalcerCreate() }; Model { handle: model_handle, + _buffer_owner: None, } } @@ -30,7 +36,14 @@ impl Model { Ok(model) } - /// Load a model from a buffer + /// Load a model from a buffer (copies data internally) + /// + /// WARNING: This method uses LoadFullModelFromBuffer which copies data through + /// CatBoost's internal memory pools. On ARM64 (aarch64), these memory pools have + /// a known memory leak issue where memory is not returned to the OS. + /// + /// For production use on ARM64, prefer `load_buffer_zero_copy` which avoids + /// the memory leak by not copying data.
pub fn load_buffer<P: AsRef<Vec<u8>>>(buffer: P) -> CatBoostResult<Self> { let model = Model::new(); CatBoostError::check_return_value(unsafe { @@ -43,6 +56,45 @@ impl Model { Ok(model) } + /// Load a model from a buffer using zero-copy approach + /// + /// This method uses LoadFullModelZeroCopy which does NOT copy the model data. + /// Instead, the model keeps a reference to the buffer and reads from it directly. + /// + /// **Advantages:** + /// - Lower memory usage (no duplicate copy of model data) + /// - Fixes ARM64 (aarch64) memory leak issue caused by internal memory pools + /// - Faster loading (no copying overhead) + /// + /// **Important:** The buffer is kept alive via Arc<Vec<u8>> for the model's lifetime. + /// When the Model is dropped, the buffer is automatically freed. + /// + /// # Example + /// ```no_run + /// use catboost_rust::Model; + /// use std::fs; + /// + /// let buffer = fs::read("model.cbm").unwrap(); + /// let model = Model::load_buffer_zero_copy(buffer).unwrap(); + /// // Buffer stays alive, model can be used safely + /// ``` + pub fn load_buffer_zero_copy(buffer: Vec<u8>) -> CatBoostResult<Self> { + let buffer_arc = Arc::new(buffer); + let mut model = Model::new(); + + CatBoostError::check_return_value(unsafe { + sys::LoadFullModelZeroCopy( + model.handle, + buffer_arc.as_ptr() as *const std::os::raw::c_void, + buffer_arc.len(), + ) + })?; + + // CRITICAL: Keep buffer alive by storing Arc in model + model._buffer_owner = Some(buffer_arc); + Ok(model) + } + fn set_or_check_object_count< TFeature, TObjectFeatures: AsRef<[TFeature]>,