From b6c1f836b07be55ae1f76286157e6cca2d2ed3f3 Mon Sep 17 00:00:00 2001 From: msquared Date: Wed, 8 Oct 2025 19:34:34 +0200 Subject: [PATCH 01/13] working config-based training. dirty not clean --- README.md | 41 +++++----- configs/model_config.yaml | 36 +++++++++ results/train_multinb_v2.json | 46 +++++++++++ src/__init__.py | 0 src/model_training.py | 147 ++++++++++++++-------------------- src/model_utils.py | 44 ++++++++++ 6 files changed, 209 insertions(+), 105 deletions(-) create mode 100644 configs/model_config.yaml create mode 100644 results/train_multinb_v2.json create mode 100644 src/__init__.py create mode 100644 src/model_utils.py diff --git a/README.md b/README.md index 62ecda8..47d060b 100644 --- a/README.md +++ b/README.md @@ -1,20 +1,21 @@ # 🌍 ML Model Deployment App -## 📝 Descrizione +## 📝 Descrizione -Questa app espone una semplice API REST basata su **FastAPI** per il riconoscimento automatico della lingua di un testo. Nato come toy project per l'implementazione di best practice in ambito deployment di modelli ML. +Questa app espone una semplice API REST basata su **FastAPI** per il riconoscimento automatico della lingua di un testo. Nato come education project per l'implementazione di best practice in ambito deployment di modelli ML. Il modello di classificazione è allenato su 4 lingue: **inglese, francese, italiano e spagnolo** utilizzando **Multinomial Naive Bayes** e pipeline di preprocessing con **TF-IDF**. Il progetto include: -- Script per training pipeline del modello di riconoscimento della lingua (con GridSearchCV per ottimizzare MultinomialNB) con pipeline di scikit-learn. -- Semplice API REST per per esposizione endpoint di predizione via FastAPI, pronta per essere deployata con Docker -- Endpoint di health check dell'API e per ottenere la versione del modello +- Script per training del modello di riconoscimento della lingua (con GridSearchCV per ottimizzare MultinomialNB) con pipeline di scikit-learn. +- Semplice API REST per per esposizione endpoint di predizione +- Endpoint di health check dell'API e endpoint per ottenere la versione del modello +- Deployment con Docker -⚠️ __WARNING__ : This is a WIP +⚠️ **WARNING** : This is a WIP --- -## 🚀 Funzionalità +## 🚀 Funzionalità - Allenamento e salvataggio modello di classificazione - Endpoint `/language_detection` per la predizione della lingua di uno o più testi @@ -23,21 +24,20 @@ Il progetto include: - Deploy containerizzato con **Docker** per un avvio rapido e portabile - Logging centralizzato per monitorare richieste e performance - --- ## ⚙️ Installazione e avvio locale #### 1️⃣ Clona il repository -``` +```bash git clone https://github.com/mrcmilano/ml_api_deployment.git cd ml_api_deployment ``` #### 2️⃣ Crea ed attiva un ambiente virtuale -``` +```bash python -m venv venv source venv/bin/activate # su macOS/Linux venv\Scripts\activate # su Windows @@ -45,19 +45,19 @@ venv\Scripts\activate # su Windows #### 3️⃣ Installa le dipendenze -``` +```bash pip install -r requirements.txt ``` #### 4️⃣ Allena modello basato su MultinomialNB -``` +```bash python src/model_training.py ``` #### 5️⃣ Avvia API in locale -``` +```bash uvicorn app.main:app --host 0.0.0.0 --port 8000 --reload ``` @@ -65,13 +65,13 @@ uvicorn app.main:app --host 0.0.0.0 --port 8000 --reload #### Build dell'immagine -``` +```bash docker build -t language-api . ``` #### Avvio del container -``` +```bash docker run -d -p 8000:8000 language-api ``` @@ -83,11 +83,13 @@ docker run -d -p 8000:8000 language-api | `/language_detection` | POST | Rileva la lingua di uno o più testi | | `/model_version` | GET | Restituisce la versione del modello | -### Esempio richiesta API +### Esempio richiesta API + #### POST /language_detection + ##### Richiesta -``` +```bash curl -X POST "http://127.0.0.1:8000/language_detection" \ -H "Content-Type: application/json" \ -d '{"texts": ["Bonjour à tous!", "Ciao come va?", "Hello world!"]}' @@ -96,9 +98,10 @@ curl -X POST "http://127.0.0.1:8000/language_detection" \ ## ✅ Test Per eseguire unit tests: -``` + +```bash pytest -v ``` --- -☑️ __DISCLAIMER__ : Vibe-coded with ChatGPT - v5 and v4.1 +☑️ **DISCLAIMER** : Vibe-coded with ChatGPT - v5 and v4.1 diff --git a/configs/model_config.yaml b/configs/model_config.yaml new file mode 100644 index 0000000..e4795d8 --- /dev/null +++ b/configs/model_config.yaml @@ -0,0 +1,36 @@ +experiment_name: "multinomial_nb_v2" +dataset: + path: "data/lang_detection.csv" + text_column: "Text" + label_column: "Language" + +model: + type: "MultinomialNB" + params: + alpha: 0.3 + fit_prior: true + random_seed: 42 + +vectorizer: + type: "TfidfVectorizer" + params: + # ngram_range: (1, 2) temporaneamente commentato per evitare errori di parsing YAML + max_df: 0.9 + min_df: 2 + +evaluation: + metric: "accuracy" + # param_grid: + # "vectorizer__ngram_range": [(1,1), (1,2)], + # "vectorizer__max_df": [0.9, 1.0], + # "vectorizer__min_df": [1, 2], + # "clf__alpha": [0.1, 0.5, 1.0] + cv_folds: 5 + +output: + model_dir: "models/text/language_classification/v2" + results_file: "results/train_multinb_v2.json" + model_filename: "best_classifier.pkl" + label_enc_filename: "label_encoder.pkl" + + diff --git a/results/train_multinb_v2.json b/results/train_multinb_v2.json new file mode 100644 index 0000000..0fbaa4a --- /dev/null +++ b/results/train_multinb_v2.json @@ -0,0 +1,46 @@ +{ + "timestamp": "2025-10-08T19:31:25.453038", + "config": { + "experiment_name": "multinomial_nb_v2", + "dataset": { + "path": "data/lang_detection.csv", + "text_column": "Text", + "label_column": "Language" + }, + "model": { + "type": "MultinomialNB", + "params": { + "alpha": 0.3, + "fit_prior": true + }, + "random_seed": 42 + }, + "vectorizer": { + "type": "TfidfVectorizer", + "params": { + "max_df": 0.9, + "min_df": 2 + } + }, + "evaluation": { + "metric": "accuracy", + "cv_folds": 5 + }, + "output": { + "model_dir": "models/text/language_classification/v2", + "results_file": "results/train_multinb_v2.json", + "model_filename": "best_classifier.pkl", + "label_enc_filename": "label_encoder.pkl" + } + }, + "results": { + "cv_scores": [ + 0.9825918762088974, + 0.988394584139265, + 0.9758103531688437, + 0.9332365747460087, + 0.9404934687953556 + ], + "mean_score": 0.9641053714116741 + } +} \ No newline at end of file diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/model_training.py b/src/model_training.py index 82b7def..dfc81df 100644 --- a/src/model_training.py +++ b/src/model_training.py @@ -1,100 +1,75 @@ -# =============================== -# Multi-class Language Classification -# =============================== -import pprint import pandas as pd -from sklearn.pipeline import Pipeline +from sklearn.model_selection import cross_val_score from sklearn.preprocessing import LabelEncoder -from sklearn.preprocessing import FunctionTransformer +from sklearn.pipeline import Pipeline from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.naive_bayes import MultinomialNB -from sklearn.model_selection import GridSearchCV -from sklearn.metrics import classification_report, confusion_matrix -import seaborn as sns -import matplotlib.pyplot as plt +from sklearn.metrics import get_scorer +# from app.utils import clean_texts # funzione di pulizia testo sta qui per essere caricata in produzione +from src.model_utils import load_config, clean_texts, save_pipeline_obj, log_experiment +from sklearn.preprocessing import FunctionTransformer +import numpy as np +import os + +def main(config_path: str): + # Carica configurazione + cfg = load_config(config_path) + print(f"Loaded configuration: {config_path}") + + # Carica e prepara dataset di allenamento + print(f"Loading dataset from {cfg['dataset']['path']}") + df = pd.read_csv(cfg["dataset"]["path"]) + X = df[cfg["dataset"]["text_column"]] + y = df[cfg["dataset"]["label_column"]] -# ------------------------------- -# Caricamento dati -# ------------------------------- -# Devo esistere le colonne 'text' e 'language' -# TODO: aggiungere sanity check sul dataset per verificare che le colonne esistano -df = pd.read_csv('../data/lang_detection.csv') -df.columns = [x.lower() for x in df.columns] + # Pulizia e encoding label + le = LabelEncoder() + y_enc = le.fit_transform(y) -# ------------------------------- -# Label Encoding della variabile target -# ------------------------------- -le = LabelEncoder() -y = le.fit_transform(df["language"]) # trasforma le lingue in numeri interi + # Costruzione pipeline + vectorizer_params = cfg["vectorizer"]["params"] + if "ngram_range" in vectorizer_params: # Converte ngram_range in tupla per evitare errori + vectorizer_params["ngram_range"] = tuple(vectorizer_params["ngram_range"]) + vectorizer = TfidfVectorizer(**vectorizer_params) + print(f"Vectorizer params: {vectorizer_params}") + + # modello + model_params = cfg["model"]["params"] + model = MultinomialNB(**model_params) + print(f"Model params: {model_params}") -# ------------------------------- -# Funzione di pulizia testo (vectorized) + standardizzazione UTF-8 -# ------------------------------- -def clean_texts(texts): - return ( - pd.Series(texts) - .astype(str) - .map(lambda x: x.encode("utf-8", errors="ignore").decode("utf-8", errors="ignore")) - .str.lower() - .str.replace(r"[^\w\s]", "", regex=True) - .str.replace(r"\s+", " ", regex=True) - .str.strip() - .tolist() - ) + pipeline = Pipeline([ + ("cleaner", FunctionTransformer(clean_texts)), + ("vectorizer", vectorizer), + ("clf", model) + ]) -# ------------------------------- -# Pipeline compatta -# ------------------------------- -pipeline = Pipeline([ - ("cleaner", FunctionTransformer(clean_texts)), - ("vectorizer", TfidfVectorizer()), - ("clf", MultinomialNB()) -]) + # Cross-validation + scorer = get_scorer(cfg["evaluation"]["metric"]) + scores = cross_val_score(pipeline, X, y_enc, cv=cfg["evaluation"]["cv_folds"], scoring=scorer) -# ------------------------------- -# Parametri da ottimizzare con GridSearchCV -# ------------------------------- -param_grid = { - "vectorizer__ngram_range": [(1,1), (1,2)], - "vectorizer__max_df": [0.9, 1.0], - "vectorizer__min_df": [1, 2], - "clf__alpha": [0.1, 0.5, 1.0] -} + mean_score = np.mean(scores) + print(f"{cfg['evaluation']['metric']} (CV mean): {mean_score:.4f}") -grid = GridSearchCV( - pipeline, - param_grid, - cv=5, - scoring="accuracy", - n_jobs=-1, - verbose=1 -) + # Fit finale su tutto il dataset + pipeline.fit(X, y_enc) -# ------------------------------- -# Fit sul dataset -# ------------------------------- -grid.fit(df["text"], y) -pprint("✅ Migliori parametri:", grid.best_params_) -pprint("✅ Miglior score (CV):", grid.best_score_) + # Salvataggio modello e encoder + output_dir = cfg["output"]["model_dir"] + save_pipeline_obj(pipeline, os.path.join(output_dir, cfg["output"]["model_filename"])) + save_pipeline_obj(le, os.path.join(output_dir, cfg["output"]["label_enc_filename"])) -best_model = grid.best_estimator_ -y_pred = best_model.predict(df["text"]) + # Log dei risultati + results = { + "cv_scores": scores.tolist(), + "mean_score": mean_score, + } + log_experiment(results, cfg, cfg["output"]["results_file"]) -# ------------------------------- -# Classification Report -# ------------------------------- -pprint("\nClassification Report:") -pprint(classification_report(y, y_pred, target_names=le.classes_, digits=4)) +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser(description="Train a text classification model") + parser.add_argument("--config", type=str, required=True, help="Path to YAML config file") + args = parser.parse_args() -# ------------------------------- -# Confusion Matrix visuale -# ------------------------------- -cm = confusion_matrix(y, y_pred) -plt.figure(figsize=(6,5)) -sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", - xticklabels=le.classes_, yticklabels=le.classes_) -plt.xlabel("Predicted") -plt.ylabel("True") -plt.title("Confusion Matrix - Multiclass Language Classification") -plt.tight_layout() -plt.show() \ No newline at end of file + main(args.config) diff --git a/src/model_utils.py b/src/model_utils.py new file mode 100644 index 0000000..1a9e8ce --- /dev/null +++ b/src/model_utils.py @@ -0,0 +1,44 @@ +import yaml +import json +import os +import re +import string +from datetime import datetime +from joblib import dump + +def load_config(path: str) -> dict: + with open(path, "r") as f: + return yaml.safe_load(f) + +def save_json(data: dict, path: str): + os.makedirs(os.path.dirname(path), exist_ok=True) + with open(path, "w") as f: + json.dump(data, f, indent=4) + +def save_pipeline_obj(model, path: str): + os.makedirs(os.path.dirname(path), exist_ok=True) + dump(model, path) + +def log_experiment(results: dict, config: dict, output_file: str): + """ + Unisce risultati e metadati di configurazione in un singolo file JSON + """ + log = { + "timestamp": datetime.now().isoformat(), + "config": config, + "results": results, + } + save_json(log, output_file) + print(f"Experiment log saved to {output_file}") + +def clean_texts(texts): + """ + Pulisce una lista di testi: + - lowercase + - rimozione punteggiatura + - rimozione spazi extra + """ + return [ + re.sub(r"\s+", " ", t.lower().translate(str.maketrans("", "", string.punctuation))).strip() + for t in texts + ] \ No newline at end of file From 011e835bb51ef47a2c44c16c46bf1c496c2ad942 Mon Sep 17 00:00:00 2001 From: msquared Date: Sat, 11 Oct 2025 11:19:46 +0200 Subject: [PATCH 02/13] training now uses correct config file. fixed yaml tuple reading --- .gitignore | 5 +- configs/model_config.yaml | 33 +++++---- results/.gitkeep | 0 results/train_multinb_v2.json | 77 +++++++++++++++----- src/model_training.py | 132 +++++++++++++++++++++++----------- src/model_utils.py | 4 +- 6 files changed, 173 insertions(+), 78 deletions(-) create mode 100644 results/.gitkeep diff --git a/.gitignore b/.gitignore index ad469f0..b3d3a5a 100644 --- a/.gitignore +++ b/.gitignore @@ -19,7 +19,7 @@ venv/ *.nbconvert.ipynb *.jupyter/ -# Data, Models, Notebooks +# Data, Models, Notebooks, Results data/* !data/.gitkeep @@ -29,6 +29,9 @@ models/* notebooks/* !notebooks/.gitkeep +results/* +!results/.gitkeep + # Directories for archiving archived/ diff --git a/configs/model_config.yaml b/configs/model_config.yaml index e4795d8..949d386 100644 --- a/configs/model_config.yaml +++ b/configs/model_config.yaml @@ -1,36 +1,35 @@ experiment_name: "multinomial_nb_v2" + dataset: path: "data/lang_detection.csv" + keep_languages: ["English", "Spanish", "French", "Italian"] text_column: "Text" label_column: "Language" model: - type: "MultinomialNB" params: - alpha: 0.3 - fit_prior: true - random_seed: 42 + alpha: 1.0 vectorizer: - type: "TfidfVectorizer" params: - # ngram_range: (1, 2) temporaneamente commentato per evitare errori di parsing YAML - max_df: 0.9 - min_df: 2 + ngram_range: !!python/tuple [1, 2] # YAML nativo: tuple vere, non stringhe + max_df: 0.95 + min_df: 1 -evaluation: - metric: "accuracy" - # param_grid: - # "vectorizer__ngram_range": [(1,1), (1,2)], - # "vectorizer__max_df": [0.9, 1.0], - # "vectorizer__min_df": [1, 2], - # "clf__alpha": [0.1, 0.5, 1.0] +grid_search: + enabled: true cv_folds: 5 + metric: "accuracy" + n_jobs: 1 # evita warning multiprocess su macOS + param_grid: + vectorizer__ngram_range: + - !!python/tuple [1, 1] + - !!python/tuple [1, 2] + vectorizer__max_df: [0.9, 0.95, 1.0] + clf__alpha: [0.5, 1.0, 1.5] output: model_dir: "models/text/language_classification/v2" results_file: "results/train_multinb_v2.json" model_filename: "best_classifier.pkl" label_enc_filename: "label_encoder.pkl" - - diff --git a/results/.gitkeep b/results/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/results/train_multinb_v2.json b/results/train_multinb_v2.json index 0fbaa4a..a1c41ed 100644 --- a/results/train_multinb_v2.json +++ b/results/train_multinb_v2.json @@ -1,30 +1,60 @@ { - "timestamp": "2025-10-08T19:31:25.453038", + "timestamp": "2025-10-11T11:15:07.863659", "config": { "experiment_name": "multinomial_nb_v2", "dataset": { "path": "data/lang_detection.csv", + "keep_languages": [ + "English", + "Spanish", + "French", + "Italian" + ], "text_column": "Text", "label_column": "Language" }, "model": { - "type": "MultinomialNB", "params": { - "alpha": 0.3, - "fit_prior": true - }, - "random_seed": 42 + "alpha": 1.0 + } }, "vectorizer": { - "type": "TfidfVectorizer", "params": { - "max_df": 0.9, - "min_df": 2 + "ngram_range": [ + 1, + 2 + ], + "max_df": 0.95, + "min_df": 1 } }, - "evaluation": { + "grid_search": { + "enabled": true, + "cv_folds": 5, "metric": "accuracy", - "cv_folds": 5 + "n_jobs": 1, + "param_grid": { + "vectorizer__ngram_range": [ + [ + 1, + 1 + ], + [ + 1, + 2 + ] + ], + "vectorizer__max_df": [ + 0.9, + 0.95, + 1.0 + ], + "clf__alpha": [ + 0.5, + 1.0, + 1.5 + ] + } }, "output": { "model_dir": "models/text/language_classification/v2", @@ -35,12 +65,25 @@ }, "results": { "cv_scores": [ - 0.9825918762088974, - 0.988394584139265, - 0.9758103531688437, - 0.9332365747460087, - 0.9404934687953556 + 0.9864629759949957, + 0.982888289415383, + 0.9864629759949957, + 0.982888289415383, + 0.9864629759949957, + 0.982888289415383, + 0.9818662522480256, + 0.9770147522610577, + 0.9818662522480256, + 0.9770147522610577, + 0.9818662522480256, + 0.9770147522610577, + 0.9775249563426904, + 0.9726728047540856, + 0.9775249563426904, + 0.9726728047540856, + 0.9775249563426904, + 0.9726728047540856 ], - "mean_score": 0.9641053714116741 + "mean_score": 0.9797383385027065 } } \ No newline at end of file diff --git a/src/model_training.py b/src/model_training.py index dfc81df..629bce9 100644 --- a/src/model_training.py +++ b/src/model_training.py @@ -1,75 +1,125 @@ +import os +import ast +import numpy as np import pandas as pd -from sklearn.model_selection import cross_val_score -from sklearn.preprocessing import LabelEncoder +from sklearn.model_selection import GridSearchCV +from sklearn.preprocessing import LabelEncoder, FunctionTransformer from sklearn.pipeline import Pipeline from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.naive_bayes import MultinomialNB -from sklearn.metrics import get_scorer -# from app.utils import clean_texts # funzione di pulizia testo sta qui per essere caricata in produzione -from src.model_utils import load_config, clean_texts, save_pipeline_obj, log_experiment -from sklearn.preprocessing import FunctionTransformer -import numpy as np -import os +from sklearn.metrics import classification_report +from src.model_utils import ( + load_config, clean_texts, save_pipeline_obj, log_experiment +) -def main(config_path: str): - # Carica configurazione - cfg = load_config(config_path) - print(f"Loaded configuration: {config_path}") - # Carica e prepara dataset di allenamento - print(f"Loading dataset from {cfg['dataset']['path']}") +def parse_config(config: dict) -> dict: + """Convert YAML-loaded strings like '(1, 2)' into tuples.""" + def convert_value(v): + if isinstance(v, str) and v.startswith("(") and v.endswith(")"): + try: + return ast.literal_eval(v) + except Exception: + return v + if isinstance(v, list): + return [convert_value(x) for x in v] + if isinstance(v, dict): + return {k: convert_value(x) for k, x in v.items()} + return v + + return convert_value(config) + + +def prepare_data(cfg): df = pd.read_csv(cfg["dataset"]["path"]) + if "keep_languages" in cfg["dataset"]: + df = df[df[cfg["dataset"]["label_column"]].isin(cfg["dataset"]["keep_languages"])] X = df[cfg["dataset"]["text_column"]] y = df[cfg["dataset"]["label_column"]] - - # Pulizia e encoding label le = LabelEncoder() y_enc = le.fit_transform(y) + return X, y, y_enc, le + - # Costruzione pipeline - vectorizer_params = cfg["vectorizer"]["params"] - if "ngram_range" in vectorizer_params: # Converte ngram_range in tupla per evitare errori - vectorizer_params["ngram_range"] = tuple(vectorizer_params["ngram_range"]) - vectorizer = TfidfVectorizer(**vectorizer_params) - print(f"Vectorizer params: {vectorizer_params}") - - # modello - model_params = cfg["model"]["params"] - model = MultinomialNB(**model_params) - print(f"Model params: {model_params}") - - pipeline = Pipeline([ +def build_pipeline(cfg): + vectorizer = TfidfVectorizer(**cfg["vectorizer"]["params"]) + clf = MultinomialNB(**cfg["model"]["params"]) + return Pipeline([ ("cleaner", FunctionTransformer(clean_texts)), ("vectorizer", vectorizer), - ("clf", model) + ("clf", clf), ]) - # Cross-validation - scorer = get_scorer(cfg["evaluation"]["metric"]) - scores = cross_val_score(pipeline, X, y_enc, cv=cfg["evaluation"]["cv_folds"], scoring=scorer) - mean_score = np.mean(scores) - print(f"{cfg['evaluation']['metric']} (CV mean): {mean_score:.4f}") +def train_model(cfg): + cfg = parse_config(cfg) + X, y, y_enc, le = prepare_data(cfg) + pipeline = build_pipeline(cfg) + + print("Pipeline configuration:") + for name, step in pipeline.steps: + print(f" - {name}: {step}") + + scores, mean_score = None, None + + if cfg["grid_search"]["enabled"]: + param_grid = cfg["grid_search"]["param_grid"] + if not param_grid: + raise ValueError("Grid search enabled but no param_grid provided") - # Fit finale su tutto il dataset - pipeline.fit(X, y_enc) + param_grid = parse_config(param_grid) + print(f"Starting GridSearchCV with params:\n{param_grid}") + + grid = GridSearchCV( + pipeline, + param_grid=param_grid, + cv=cfg["grid_search"]["cv_folds"], + scoring=cfg["grid_search"]["metric"], + verbose=2, + n_jobs=cfg["grid_search"].get("n_jobs", -1), + ) + grid.fit(X, y_enc) + pipeline = grid.best_estimator_ + scores = grid.cv_results_["mean_test_score"] + mean_score = float(np.mean(scores)) + + print(f"Best params: {grid.best_params_}") + print(f"Mean CV score: {mean_score:.4f}") + + y_pred_enc = pipeline.predict(X) + y_pred = le.inverse_transform(y_pred_enc) + print("\nClassification Report:") + print(classification_report(y, y_pred, target_names=le.classes_, digits=4)) + + + else: + print("GridSearchCV disabled. Training default pipeline...") + pipeline.fit(X, y_enc) + + return pipeline, le, scores, mean_score + + +def main(config_path: str): + cfg = load_config(config_path) + pipeline, le, scores, mean_score = train_model(cfg) - # Salvataggio modello e encoder output_dir = cfg["output"]["model_dir"] + os.makedirs(output_dir, exist_ok=True) + save_pipeline_obj(pipeline, os.path.join(output_dir, cfg["output"]["model_filename"])) save_pipeline_obj(le, os.path.join(output_dir, cfg["output"]["label_enc_filename"])) - # Log dei risultati results = { - "cv_scores": scores.tolist(), + "cv_scores": scores.tolist() if scores is not None else None, "mean_score": mean_score, } log_experiment(results, cfg, cfg["output"]["results_file"]) + print("Training complete and model saved.") + if __name__ == "__main__": import argparse parser = argparse.ArgumentParser(description="Train a text classification model") parser.add_argument("--config", type=str, required=True, help="Path to YAML config file") args = parser.parse_args() - main(args.config) diff --git a/src/model_utils.py b/src/model_utils.py index 1a9e8ce..d7bd5c4 100644 --- a/src/model_utils.py +++ b/src/model_utils.py @@ -7,8 +7,8 @@ from joblib import dump def load_config(path: str) -> dict: - with open(path, "r") as f: - return yaml.safe_load(f) + with open(path, "r", encoding="utf-8") as f: + return yaml.load(f, Loader=yaml.FullLoader) def save_json(data: dict, path: str): os.makedirs(os.path.dirname(path), exist_ok=True) From fab9b8e28ad6ceddc90879c91bddcde28bfded84 Mon Sep 17 00:00:00 2001 From: msquared Date: Sat, 11 Oct 2025 12:07:14 +0200 Subject: [PATCH 03/13] fixed best params logging in results --- results/train_multinb_v2.json | 12 ++++++++++-- src/model_training.py | 12 ++++++++---- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/results/train_multinb_v2.json b/results/train_multinb_v2.json index a1c41ed..a5f1200 100644 --- a/results/train_multinb_v2.json +++ b/results/train_multinb_v2.json @@ -1,5 +1,5 @@ { - "timestamp": "2025-10-11T11:15:07.863659", + "timestamp": "2025-10-11T12:05:03.550187", "config": { "experiment_name": "multinomial_nb_v2", "dataset": { @@ -84,6 +84,14 @@ 0.9775249563426904, 0.9726728047540856 ], - "mean_score": 0.9797383385027065 + "mean_score": 0.9797383385027065, + "best_estimator_params": { + "clf__alpha": 0.5, + "vectorizer__max_df": 0.9, + "vectorizer__ngram_range": [ + 1, + 1 + ] + } } } \ No newline at end of file diff --git a/src/model_training.py b/src/model_training.py index 629bce9..8db69a1 100644 --- a/src/model_training.py +++ b/src/model_training.py @@ -1,5 +1,6 @@ import os import ast +import json import numpy as np import pandas as pd from sklearn.model_selection import GridSearchCV @@ -82,8 +83,9 @@ def train_model(cfg): pipeline = grid.best_estimator_ scores = grid.cv_results_["mean_test_score"] mean_score = float(np.mean(scores)) + best_params = grid.best_params_ - print(f"Best params: {grid.best_params_}") + print(f"Best params: {best_params}") print(f"Mean CV score: {mean_score:.4f}") y_pred_enc = pipeline.predict(X) @@ -96,23 +98,25 @@ def train_model(cfg): print("GridSearchCV disabled. Training default pipeline...") pipeline.fit(X, y_enc) - return pipeline, le, scores, mean_score + return pipeline, le, scores, mean_score, best_params def main(config_path: str): cfg = load_config(config_path) - pipeline, le, scores, mean_score = train_model(cfg) + pipeline, le, scores, mean_score, best_params = train_model(cfg) output_dir = cfg["output"]["model_dir"] os.makedirs(output_dir, exist_ok=True) save_pipeline_obj(pipeline, os.path.join(output_dir, cfg["output"]["model_filename"])) save_pipeline_obj(le, os.path.join(output_dir, cfg["output"]["label_enc_filename"])) - + results = { "cv_scores": scores.tolist() if scores is not None else None, "mean_score": mean_score, + "best_estimator_params": best_params } + log_experiment(results, cfg, cfg["output"]["results_file"]) print("Training complete and model saved.") From 369e71db0190394f2c208aa0c0c58c0fff4a3089 Mon Sep 17 00:00:00 2001 From: msquared Date: Sat, 11 Oct 2025 14:45:10 +0200 Subject: [PATCH 04/13] working training pipeline and results logging --- configs/model_config.yaml | 1 + results/train_multinb_v2.json | 29 +++++++++++++++++++++++++--- src/model_training.py | 36 +++++++++++++---------------------- src/model_utils.py | 25 +++++++++++++++++++++++- 4 files changed, 64 insertions(+), 27 deletions(-) diff --git a/configs/model_config.yaml b/configs/model_config.yaml index 949d386..c683b9e 100644 --- a/configs/model_config.yaml +++ b/configs/model_config.yaml @@ -26,6 +26,7 @@ grid_search: - !!python/tuple [1, 1] - !!python/tuple [1, 2] vectorizer__max_df: [0.9, 0.95, 1.0] + vectorizer__min_df: [1, 2] clf__alpha: [0.5, 1.0, 1.5] output: diff --git a/results/train_multinb_v2.json b/results/train_multinb_v2.json index a5f1200..4c16b5c 100644 --- a/results/train_multinb_v2.json +++ b/results/train_multinb_v2.json @@ -1,5 +1,5 @@ { - "timestamp": "2025-10-11T12:05:03.550187", + "timestamp": "2025-10-11T14:32:26.822052", "config": { "experiment_name": "multinomial_nb_v2", "dataset": { @@ -49,6 +49,10 @@ 0.95, 1.0 ], + "vectorizer__min_df": [ + 1, + 2 + ], "clf__alpha": [ 0.5, 1.0, @@ -67,27 +71,46 @@ "cv_scores": [ 0.9864629759949957, 0.982888289415383, + 0.9828869862121093, + 0.9826322099721116, 0.9864629759949957, 0.982888289415383, + 0.9828869862121093, + 0.9826322099721116, 0.9864629759949957, 0.982888289415383, + 0.9828869862121093, + 0.9826322099721116, 0.9818662522480256, 0.9770147522610577, + 0.981610172804754, + 0.9803336851982172, 0.9818662522480256, 0.9770147522610577, + 0.981610172804754, + 0.9803336851982172, 0.9818662522480256, 0.9770147522610577, + 0.981610172804754, + 0.9803336851982172, 0.9775249563426904, 0.9726728047540856, + 0.9798228295149478, + 0.9767589986186046, 0.9775249563426904, 0.9726728047540856, + 0.9798228295149478, + 0.9767589986186046, 0.9775249563426904, - 0.9726728047540856 + 0.9726728047540856, + 0.9798228295149478, + 0.9767589986186046 ], - "mean_score": 0.9797383385027065, + "mean_score": 0.9802062427780819, "best_estimator_params": { "clf__alpha": 0.5, "vectorizer__max_df": 0.9, + "vectorizer__min_df": 1, "vectorizer__ngram_range": [ 1, 1 diff --git a/src/model_training.py b/src/model_training.py index 8db69a1..dce1e3d 100644 --- a/src/model_training.py +++ b/src/model_training.py @@ -1,8 +1,7 @@ import os -import ast -import json import numpy as np import pandas as pd +from datetime import datetime from sklearn.model_selection import GridSearchCV from sklearn.preprocessing import LabelEncoder, FunctionTransformer from sklearn.pipeline import Pipeline @@ -10,25 +9,9 @@ from sklearn.naive_bayes import MultinomialNB from sklearn.metrics import classification_report from src.model_utils import ( - load_config, clean_texts, save_pipeline_obj, log_experiment -) - - -def parse_config(config: dict) -> dict: - """Convert YAML-loaded strings like '(1, 2)' into tuples.""" - def convert_value(v): - if isinstance(v, str) and v.startswith("(") and v.endswith(")"): - try: - return ast.literal_eval(v) - except Exception: - return v - if isinstance(v, list): - return [convert_value(x) for x in v] - if isinstance(v, dict): - return {k: convert_value(x) for k, x in v.items()} - return v - - return convert_value(config) + load_config, parse_config, clean_texts,\ + save_pipeline_obj, log_experiment, +) def prepare_data(cfg): @@ -110,14 +93,21 @@ def main(config_path: str): save_pipeline_obj(pipeline, os.path.join(output_dir, cfg["output"]["model_filename"])) save_pipeline_obj(le, os.path.join(output_dir, cfg["output"]["label_enc_filename"])) - + results = { "cv_scores": scores.tolist() if scores is not None else None, "mean_score": mean_score, "best_estimator_params": best_params } - log_experiment(results, cfg, cfg["output"]["results_file"]) + # Costruisce il path con timestamp per il file dei risultati + results_file_with_ts = os.path.join( + os.path.dirname(cfg["output"]["results_file"]), + f"{os.path.splitext(os.path.basename(cfg['output']['results_file']))[0]}_" \ + f"{datetime.now():%Y%m%d_%H%M%S}.json" + ) + + log_experiment(results, cfg, results_file_with_ts) print("Training complete and model saved.") diff --git a/src/model_utils.py b/src/model_utils.py index d7bd5c4..0f84613 100644 --- a/src/model_utils.py +++ b/src/model_utils.py @@ -2,23 +2,45 @@ import json import os import re +import ast import string from datetime import datetime from joblib import dump + def load_config(path: str) -> dict: with open(path, "r", encoding="utf-8") as f: return yaml.load(f, Loader=yaml.FullLoader) + +def parse_config(config: dict) -> dict: + """Convert YAML-loaded strings like '(1, 2)' into tuples.""" + def convert_value(v): + if isinstance(v, str) and v.startswith("(") and v.endswith(")"): + try: + return ast.literal_eval(v) + except Exception: + return v + if isinstance(v, list): + return [convert_value(x) for x in v] + if isinstance(v, dict): + return {k: convert_value(x) for k, x in v.items()} + return v + + return convert_value(config) + + def save_json(data: dict, path: str): os.makedirs(os.path.dirname(path), exist_ok=True) with open(path, "w") as f: json.dump(data, f, indent=4) + def save_pipeline_obj(model, path: str): os.makedirs(os.path.dirname(path), exist_ok=True) dump(model, path) + def log_experiment(results: dict, config: dict, output_file: str): """ Unisce risultati e metadati di configurazione in un singolo file JSON @@ -31,6 +53,7 @@ def log_experiment(results: dict, config: dict, output_file: str): save_json(log, output_file) print(f"Experiment log saved to {output_file}") + def clean_texts(texts): """ Pulisce una lista di testi: @@ -41,4 +64,4 @@ def clean_texts(texts): return [ re.sub(r"\s+", " ", t.lower().translate(str.maketrans("", "", string.punctuation))).strip() for t in texts - ] \ No newline at end of file + ] From 7fa3828cb6d46fb0a0d390b187fc6f9c6dcc4b5f Mon Sep 17 00:00:00 2001 From: msquared Date: Sat, 11 Oct 2025 20:29:04 +0200 Subject: [PATCH 05/13] add unit tests github action --- .github/workflows/unit-tests.yml | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 .github/workflows/unit-tests.yml diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml new file mode 100644 index 0000000..8d68290 --- /dev/null +++ b/.github/workflows/unit-tests.yml @@ -0,0 +1,29 @@ +name: Unit Tests + +on: + push: + branches: [main, develop] + pull_request: + branches: [main] + +jobs: + test: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v5 + + - name: Set up Python + uses: actions/setup-python@v6 + with: + python-version: "3.13" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + pip install pytest + + - name: Run pytest + run: pytest -v --maxfail=1 --disable-warnings From 8a591d39e3dc05cb85ce4d862a5c0d2a9db02990 Mon Sep 17 00:00:00 2001 From: msquared Date: Sat, 11 Oct 2025 20:30:55 +0200 Subject: [PATCH 06/13] extend actions for push and PR to develop --- .github/workflows/unit-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 8d68290..6b54e50 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -4,7 +4,7 @@ on: push: branches: [main, develop] pull_request: - branches: [main] + branches: [main, develop] jobs: test: From 04923077e64bcd65898badc2526b12fdcfb2931e Mon Sep 17 00:00:00 2001 From: msquared Date: Sat, 11 Oct 2025 20:46:22 +0200 Subject: [PATCH 07/13] Add httpx dependency to fix action tests --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index d3a03b4..4847cfb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,5 +5,5 @@ pandas==2.2.3 joblib==1.4.2 fastapi==0.118.0 pydantic==2.11.9 - - +httpx==0.28.1 +pytest==8.4.2 \ No newline at end of file From 415647271e866fd7655fac7b47d31025d4d05d6c Mon Sep 17 00:00:00 2001 From: msquared Date: Sun, 12 Oct 2025 14:04:19 +0200 Subject: [PATCH 08/13] Separate dev v prod tests. remove model obj dependency for actions --- pytest.ini | 2 ++ tests/dev/__init__.py | 0 tests/{test_api.py => dev/test_api_dev.py} | 0 tests/{ => dev}/test_browser.html | 0 tests/prod/__init__.py | 0 tests/prod/test_api_prod.py | 42 ++++++++++++++++++++++ 6 files changed, 44 insertions(+) create mode 100644 pytest.ini create mode 100644 tests/dev/__init__.py rename tests/{test_api.py => dev/test_api_dev.py} (100%) rename tests/{ => dev}/test_browser.html (100%) create mode 100644 tests/prod/__init__.py create mode 100644 tests/prod/test_api_prod.py diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..ddb6c6a --- /dev/null +++ b/pytest.ini @@ -0,0 +1,2 @@ +[pytest] +testpaths = tests/prod diff --git a/tests/dev/__init__.py b/tests/dev/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_api.py b/tests/dev/test_api_dev.py similarity index 100% rename from tests/test_api.py rename to tests/dev/test_api_dev.py diff --git a/tests/test_browser.html b/tests/dev/test_browser.html similarity index 100% rename from tests/test_browser.html rename to tests/dev/test_browser.html diff --git a/tests/prod/__init__.py b/tests/prod/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/prod/test_api_prod.py b/tests/prod/test_api_prod.py new file mode 100644 index 0000000..15935af --- /dev/null +++ b/tests/prod/test_api_prod.py @@ -0,0 +1,42 @@ +from fastapi.testclient import TestClient +from unittest.mock import patch +from app.main import app + +client = TestClient(app) + + +# --- MOCK FUNCTIONS --- +def mock_predict_language_safe(texts): + return ["English" for _ in texts] + + +# --- TESTS --- + +def test_status_endpoint(): + """Health check endpoint""" + response = client.get("/status") + assert response.status_code == 200 + assert response.json() == {"status": "ok"} + + +def test_model_version_endpoint(): + """Model version endpoint""" + response = client.get("/model_version") + assert response.status_code == 200 + data = response.json() + assert "model_version" in data + assert isinstance(data["model_version"], str) + + +@patch("app.utils.predict_language_safe", side_effect=mock_predict_language_safe) +def test_language_detection(mock_func): + """Language detection endpoint with mocked model""" + payload = {"texts": ["Hello world!", "Ciao come state?"]} + response = client.post("/language_detection", json=payload) + assert response.status_code == 200 + data = response.json() + assert "predictions" in data + assert isinstance(data["predictions"], list) + assert "English" in data["predictions"] + + # mock_func.assert_called_once() # controlla che la funzione mock sia stata chiamata From 4d535a9632424863ade5a443ab78c5982c7ac79b Mon Sep 17 00:00:00 2001 From: msquared Date: Sun, 12 Oct 2025 16:43:58 +0200 Subject: [PATCH 09/13] Fix import error for prod tests with mock API --- tests/prod/test_api_prod.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/tests/prod/test_api_prod.py b/tests/prod/test_api_prod.py index 15935af..7d75ba7 100644 --- a/tests/prod/test_api_prod.py +++ b/tests/prod/test_api_prod.py @@ -5,13 +5,11 @@ client = TestClient(app) -# --- MOCK FUNCTIONS --- -def mock_predict_language_safe(texts): - return ["English" for _ in texts] +# mock function to replace the actual model prediction +def mock_predict_language_safe(pipeline, le, texts, **kwargs): + return ["English", "Italian"] -# --- TESTS --- - def test_status_endpoint(): """Health check endpoint""" response = client.get("/status") @@ -28,7 +26,7 @@ def test_model_version_endpoint(): assert isinstance(data["model_version"], str) -@patch("app.utils.predict_language_safe", side_effect=mock_predict_language_safe) +@patch("app.main.predict_language_safe", side_effect=mock_predict_language_safe) def test_language_detection(mock_func): """Language detection endpoint with mocked model""" payload = {"texts": ["Hello world!", "Ciao come state?"]} @@ -38,5 +36,6 @@ def test_language_detection(mock_func): assert "predictions" in data assert isinstance(data["predictions"], list) assert "English" in data["predictions"] - - # mock_func.assert_called_once() # controlla che la funzione mock sia stata chiamata + assert "Italian" in data["predictions"] + + mock_func.assert_called_once() # controlla che la funzione mock sia stata chiamata From 0dbf9a99855c874b6c845f00c282cd1942f15e57 Mon Sep 17 00:00:00 2001 From: msquared Date: Sun, 12 Oct 2025 17:03:00 +0200 Subject: [PATCH 10/13] add artifacts loader in app.main to manage non exist ojbs --- app/main.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/app/main.py b/app/main.py index c869b41..c686d0c 100644 --- a/app/main.py +++ b/app/main.py @@ -30,11 +30,25 @@ model_lock = threading.Lock() + +def load_artifacts(): + """Carica modelli e altri artefatti necessari""" + if not os.path.exists(MODEL_DIR): + raise FileNotFoundError(f"Model directory {MODEL_DIR} does not exist.") + if not os.path.isfile(os.path.join(MODEL_DIR, MODEL_FILENAME)): + raise FileNotFoundError(f"Model file {MODEL_FILENAME} not found in {MODEL_DIR}.") + if not os.path.isfile(os.path.join(MODEL_DIR, LABEL_ENCOD_FILENAME)): + raise FileNotFoundError(f"Label encoder file {LABEL_ENCOD_FILENAME} not found in {MODEL_DIR}.") # ------------------------------- # Load model and encoder # ------------------------------- -model = joblib.load(os.path.join(MODEL_DIR, MODEL_FILENAME)) -le = joblib.load(os.path.join(MODEL_DIR, LABEL_ENCOD_FILENAME)) + model = joblib.load(os.path.join(MODEL_DIR, MODEL_FILENAME)) + le = joblib.load(os.path.join(MODEL_DIR, LABEL_ENCOD_FILENAME)) + return model, le +model, le = load_artifacts() +# --- IGNORE --- +# model = joblib.load(os.path.join(MODEL_DIR, MODEL_FILENAME)) +# le = joblib.load(os.path.join(MODEL_DIR, LABEL_ENCOD_FILENAME)) logger.info(f"Loaded model version {MODEL_VERSION} from {MODEL_DIR}") # ------------------------------- From fd17893df5220f5f544be1425ab77241151a65f5 Mon Sep 17 00:00:00 2001 From: msquared Date: Fri, 17 Oct 2025 19:36:55 +0200 Subject: [PATCH 11/13] cleaned and reorg tests --- {src => tests}/test_endpoint.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename {src => tests}/test_endpoint.sh (100%) diff --git a/src/test_endpoint.sh b/tests/test_endpoint.sh similarity index 100% rename from src/test_endpoint.sh rename to tests/test_endpoint.sh From 02c0a301d8436c93d45e5c0a886639e750b4e458 Mon Sep 17 00:00:00 2001 From: msquared Date: Fri, 17 Oct 2025 20:57:04 +0200 Subject: [PATCH 12/13] codex fix for unit tests failure --- .github/workflows/unit-tests.yml | 2 +- app/main.py | 33 ++++++++++++++++---------------- tests/dev/test_api_dev.py | 5 +++++ tests/prod/test_api_prod.py | 5 +++++ 4 files changed, 28 insertions(+), 17 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 6b54e50..ca2312e 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -17,7 +17,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v6 with: - python-version: "3.13" + python-version: "3.11" - name: Install dependencies run: | diff --git a/app/main.py b/app/main.py index c686d0c..a2111d1 100644 --- a/app/main.py +++ b/app/main.py @@ -1,13 +1,15 @@ import logging +import os +import threading import time -from fastapi import FastAPI, Request -from pydantic import BaseModel from typing import List + import joblib -import os +from fastapi import FastAPI, Request +from pydantic import BaseModel + # clean_texts usata da predict_language_safe nella pipeline -from app.utils import predict_language_safe, clean_texts -import threading +from app.utils import predict_language_safe, clean_texts # ------------------------------- # Logging setup @@ -27,6 +29,7 @@ MODEL_DIR = f"models/text/language_classification/{MODEL_VERSION}" MODEL_FILENAME = "best_classifier.pkl" LABEL_ENCOD_FILENAME = "label_encoder.pkl" +SKIP_MODEL_LOADING = os.getenv("SKIP_MODEL_LOADING", "false").lower() in {"1", "true", "yes"} model_lock = threading.Lock() @@ -39,17 +42,15 @@ def load_artifacts(): raise FileNotFoundError(f"Model file {MODEL_FILENAME} not found in {MODEL_DIR}.") if not os.path.isfile(os.path.join(MODEL_DIR, LABEL_ENCOD_FILENAME)): raise FileNotFoundError(f"Label encoder file {LABEL_ENCOD_FILENAME} not found in {MODEL_DIR}.") -# ------------------------------- -# Load model and encoder -# ------------------------------- - model = joblib.load(os.path.join(MODEL_DIR, MODEL_FILENAME)) - le = joblib.load(os.path.join(MODEL_DIR, LABEL_ENCOD_FILENAME)) - return model, le -model, le = load_artifacts() -# --- IGNORE --- -# model = joblib.load(os.path.join(MODEL_DIR, MODEL_FILENAME)) -# le = joblib.load(os.path.join(MODEL_DIR, LABEL_ENCOD_FILENAME)) -logger.info(f"Loaded model version {MODEL_VERSION} from {MODEL_DIR}") +# Load model and encoder on module import unless explicitly skipped +model = None +le = None + +if SKIP_MODEL_LOADING: + logger.info("Skipping model load because SKIP_MODEL_LOADING is set.") +else: + model, le = load_artifacts() + logger.info(f"Loaded model version {MODEL_VERSION} from {MODEL_DIR}") # ------------------------------- # FastAPI setup diff --git a/tests/dev/test_api_dev.py b/tests/dev/test_api_dev.py index add231d..464ecc3 100644 --- a/tests/dev/test_api_dev.py +++ b/tests/dev/test_api_dev.py @@ -1,5 +1,10 @@ +import os import pytest from fastapi.testclient import TestClient + +# Skip heavy artifact loading during unit tests +os.environ.setdefault("SKIP_MODEL_LOADING", "false") + from app.main import app client = TestClient(app) diff --git a/tests/prod/test_api_prod.py b/tests/prod/test_api_prod.py index 7d75ba7..12da850 100644 --- a/tests/prod/test_api_prod.py +++ b/tests/prod/test_api_prod.py @@ -1,5 +1,10 @@ +import os from fastapi.testclient import TestClient from unittest.mock import patch + +# Skip heavy artifact loading during unit tests +os.environ.setdefault("SKIP_MODEL_LOADING", "true") + from app.main import app client = TestClient(app) From b6d1a4e270cedf795a9598148220f852230bfb28 Mon Sep 17 00:00:00 2001 From: msquared Date: Fri, 17 Oct 2025 21:15:22 +0200 Subject: [PATCH 13/13] codex fix for codex error in returning model artefacts from loading function --- app/main.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/app/main.py b/app/main.py index a2111d1..34f5577 100644 --- a/app/main.py +++ b/app/main.py @@ -42,6 +42,9 @@ def load_artifacts(): raise FileNotFoundError(f"Model file {MODEL_FILENAME} not found in {MODEL_DIR}.") if not os.path.isfile(os.path.join(MODEL_DIR, LABEL_ENCOD_FILENAME)): raise FileNotFoundError(f"Label encoder file {LABEL_ENCOD_FILENAME} not found in {MODEL_DIR}.") + model = joblib.load(os.path.join(MODEL_DIR, MODEL_FILENAME)) + le = joblib.load(os.path.join(MODEL_DIR, LABEL_ENCOD_FILENAME)) + return model, le # Load model and encoder on module import unless explicitly skipped model = None le = None