diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml new file mode 100644 index 0000000..ca2312e --- /dev/null +++ b/.github/workflows/unit-tests.yml @@ -0,0 +1,29 @@ +name: Unit Tests + +on: + push: + branches: [main, develop] + pull_request: + branches: [main, develop] + +jobs: + test: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v5 + + - name: Set up Python + uses: actions/setup-python@v6 + with: + python-version: "3.11" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + pip install pytest + + - name: Run pytest + run: pytest -v --maxfail=1 --disable-warnings diff --git a/.gitignore b/.gitignore index ad469f0..b3d3a5a 100644 --- a/.gitignore +++ b/.gitignore @@ -19,7 +19,7 @@ venv/ *.nbconvert.ipynb *.jupyter/ -# Data, Models, Notebooks +# Data, Models, Notebooks, Results data/* !data/.gitkeep @@ -29,6 +29,9 @@ models/* notebooks/* !notebooks/.gitkeep +results/* +!results/.gitkeep + # Directories for archiving archived/ diff --git a/README.md b/README.md index 62ecda8..47d060b 100644 --- a/README.md +++ b/README.md @@ -1,20 +1,21 @@ # 🌍 ML Model Deployment App -## 📝 Descrizione +## 📝 Descrizione -Questa app espone una semplice API REST basata su **FastAPI** per il riconoscimento automatico della lingua di un testo. Nato come toy project per l'implementazione di best practice in ambito deployment di modelli ML. +Questa app espone una semplice API REST basata su **FastAPI** per il riconoscimento automatico della lingua di un testo. Nato come education project per l'implementazione di best practice in ambito deployment di modelli ML. Il modello di classificazione è allenato su 4 lingue: **inglese, francese, italiano e spagnolo** utilizzando **Multinomial Naive Bayes** e pipeline di preprocessing con **TF-IDF**. 
Il progetto include: -- Script per training pipeline del modello di riconoscimento della lingua (con GridSearchCV per ottimizzare MultinomialNB) con pipeline di scikit-learn. -- Semplice API REST per per esposizione endpoint di predizione via FastAPI, pronta per essere deployata con Docker -- Endpoint di health check dell'API e per ottenere la versione del modello +- Script per training del modello di riconoscimento della lingua (con GridSearchCV per ottimizzare MultinomialNB) con pipeline di scikit-learn. +- Semplice API REST per esposizione endpoint di predizione +- Endpoint di health check dell'API e endpoint per ottenere la versione del modello +- Deployment con Docker -⚠️ __WARNING__ : This is a WIP +⚠️ **WARNING** : This is a WIP --- -## 🚀 Funzionalità +## 🚀 Funzionalità - Allenamento e salvataggio modello di classificazione - Endpoint `/language_detection` per la predizione della lingua di uno o più testi @@ -23,21 +24,20 @@ Il progetto include: - Deploy containerizzato con **Docker** per un avvio rapido e portabile - Logging centralizzato per monitorare richieste e performance - --- ## ⚙️ Installazione e avvio locale #### 1️⃣ Clona il repository -``` +```bash git clone https://github.com/mrcmilano/ml_api_deployment.git cd ml_api_deployment ``` #### 2️⃣ Crea ed attiva un ambiente virtuale -``` +```bash python -m venv venv source venv/bin/activate # su macOS/Linux venv\Scripts\activate # su Windows @@ -45,19 +45,19 @@ venv\Scripts\activate # su Windows #### 3️⃣ Installa le dipendenze -``` +```bash pip install -r requirements.txt ``` #### 4️⃣ Allena modello basato su MultinomialNB -``` +```bash python src/model_training.py ``` #### 5️⃣ Avvia API in locale -``` +```bash uvicorn app.main:app --host 0.0.0.0 --port 8000 --reload ``` @@ -65,13 +65,13 @@ uvicorn app.main:app --host 0.0.0.0 --port 8000 --reload #### Build dell'immagine -``` +```bash docker build -t language-api . 
``` #### Avvio del container -``` +```bash docker run -d -p 8000:8000 language-api ``` @@ -83,11 +83,13 @@ docker run -d -p 8000:8000 language-api | `/language_detection` | POST | Rileva la lingua di uno o più testi | | `/model_version` | GET | Restituisce la versione del modello | -### Esempio richiesta API +### Esempio richiesta API + #### POST /language_detection + ##### Richiesta -``` +```bash curl -X POST "http://127.0.0.1:8000/language_detection" \ -H "Content-Type: application/json" \ -d '{"texts": ["Bonjour à tous!", "Ciao come va?", "Hello world!"]}' @@ -96,9 +98,10 @@ curl -X POST "http://127.0.0.1:8000/language_detection" \ ## ✅ Test Per eseguire unit tests: -``` + +```bash pytest -v ``` --- -☑️ __DISCLAIMER__ : Vibe-coded with ChatGPT - v5 and v4.1 +☑️ **DISCLAIMER** : Vibe-coded with ChatGPT - v5 and v4.1 diff --git a/app/main.py b/app/main.py index c869b41..34f5577 100644 --- a/app/main.py +++ b/app/main.py @@ -1,13 +1,15 @@ import logging +import os +import threading import time -from fastapi import FastAPI, Request -from pydantic import BaseModel from typing import List + import joblib -import os +from fastapi import FastAPI, Request +from pydantic import BaseModel + # clean_texts usata da predict_language_safe nella pipeline -from app.utils import predict_language_safe, clean_texts -import threading +from app.utils import predict_language_safe, clean_texts # ------------------------------- # Logging setup @@ -27,15 +29,31 @@ MODEL_DIR = f"models/text/language_classification/{MODEL_VERSION}" MODEL_FILENAME = "best_classifier.pkl" LABEL_ENCOD_FILENAME = "label_encoder.pkl" +SKIP_MODEL_LOADING = os.getenv("SKIP_MODEL_LOADING", "false").lower() in {"1", "true", "yes"} model_lock = threading.Lock() -# ------------------------------- -# Load model and encoder -# ------------------------------- -model = joblib.load(os.path.join(MODEL_DIR, MODEL_FILENAME)) -le = joblib.load(os.path.join(MODEL_DIR, LABEL_ENCOD_FILENAME)) -logger.info(f"Loaded model version 
{MODEL_VERSION} from {MODEL_DIR}") + +def load_artifacts(): + """Carica modelli e altri artefatti necessari""" + if not os.path.exists(MODEL_DIR): + raise FileNotFoundError(f"Model directory {MODEL_DIR} does not exist.") + if not os.path.isfile(os.path.join(MODEL_DIR, MODEL_FILENAME)): + raise FileNotFoundError(f"Model file {MODEL_FILENAME} not found in {MODEL_DIR}.") + if not os.path.isfile(os.path.join(MODEL_DIR, LABEL_ENCOD_FILENAME)): + raise FileNotFoundError(f"Label encoder file {LABEL_ENCOD_FILENAME} not found in {MODEL_DIR}.") + model = joblib.load(os.path.join(MODEL_DIR, MODEL_FILENAME)) + le = joblib.load(os.path.join(MODEL_DIR, LABEL_ENCOD_FILENAME)) + return model, le +# Load model and encoder on module import unless explicitly skipped +model = None +le = None + +if SKIP_MODEL_LOADING: + logger.info("Skipping model load because SKIP_MODEL_LOADING is set.") +else: + model, le = load_artifacts() + logger.info(f"Loaded model version {MODEL_VERSION} from {MODEL_DIR}") # ------------------------------- # FastAPI setup diff --git a/configs/model_config.yaml b/configs/model_config.yaml new file mode 100644 index 0000000..c683b9e --- /dev/null +++ b/configs/model_config.yaml @@ -0,0 +1,36 @@ +experiment_name: "multinomial_nb_v2" + +dataset: + path: "data/lang_detection.csv" + keep_languages: ["English", "Spanish", "French", "Italian"] + text_column: "Text" + label_column: "Language" + +model: + params: + alpha: 1.0 + +vectorizer: + params: + ngram_range: !!python/tuple [1, 2] # YAML nativo: tuple vere, non stringhe + max_df: 0.95 + min_df: 1 + +grid_search: + enabled: true + cv_folds: 5 + metric: "accuracy" + n_jobs: 1 # evita warning multiprocess su macOS + param_grid: + vectorizer__ngram_range: + - !!python/tuple [1, 1] + - !!python/tuple [1, 2] + vectorizer__max_df: [0.9, 0.95, 1.0] + vectorizer__min_df: [1, 2] + clf__alpha: [0.5, 1.0, 1.5] + +output: + model_dir: "models/text/language_classification/v2" + results_file: "results/train_multinb_v2.json" + 
model_filename: "best_classifier.pkl" + label_enc_filename: "label_encoder.pkl" diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..ddb6c6a --- /dev/null +++ b/pytest.ini @@ -0,0 +1,2 @@ +[pytest] +testpaths = tests/prod diff --git a/requirements.txt b/requirements.txt index d3a03b4..4847cfb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,5 +5,5 @@ pandas==2.2.3 joblib==1.4.2 fastapi==0.118.0 pydantic==2.11.9 - - +httpx==0.28.1 +pytest==8.4.2 \ No newline at end of file diff --git a/results/.gitkeep b/results/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/results/train_multinb_v2.json b/results/train_multinb_v2.json new file mode 100644 index 0000000..4c16b5c --- /dev/null +++ b/results/train_multinb_v2.json @@ -0,0 +1,120 @@ +{ + "timestamp": "2025-10-11T14:32:26.822052", + "config": { + "experiment_name": "multinomial_nb_v2", + "dataset": { + "path": "data/lang_detection.csv", + "keep_languages": [ + "English", + "Spanish", + "French", + "Italian" + ], + "text_column": "Text", + "label_column": "Language" + }, + "model": { + "params": { + "alpha": 1.0 + } + }, + "vectorizer": { + "params": { + "ngram_range": [ + 1, + 2 + ], + "max_df": 0.95, + "min_df": 1 + } + }, + "grid_search": { + "enabled": true, + "cv_folds": 5, + "metric": "accuracy", + "n_jobs": 1, + "param_grid": { + "vectorizer__ngram_range": [ + [ + 1, + 1 + ], + [ + 1, + 2 + ] + ], + "vectorizer__max_df": [ + 0.9, + 0.95, + 1.0 + ], + "vectorizer__min_df": [ + 1, + 2 + ], + "clf__alpha": [ + 0.5, + 1.0, + 1.5 + ] + } + }, + "output": { + "model_dir": "models/text/language_classification/v2", + "results_file": "results/train_multinb_v2.json", + "model_filename": "best_classifier.pkl", + "label_enc_filename": "label_encoder.pkl" + } + }, + "results": { + "cv_scores": [ + 0.9864629759949957, + 0.982888289415383, + 0.9828869862121093, + 0.9826322099721116, + 0.9864629759949957, + 0.982888289415383, + 0.9828869862121093, + 0.9826322099721116, + 
0.9864629759949957, + 0.982888289415383, + 0.9828869862121093, + 0.9826322099721116, + 0.9818662522480256, + 0.9770147522610577, + 0.981610172804754, + 0.9803336851982172, + 0.9818662522480256, + 0.9770147522610577, + 0.981610172804754, + 0.9803336851982172, + 0.9818662522480256, + 0.9770147522610577, + 0.981610172804754, + 0.9803336851982172, + 0.9775249563426904, + 0.9726728047540856, + 0.9798228295149478, + 0.9767589986186046, + 0.9775249563426904, + 0.9726728047540856, + 0.9798228295149478, + 0.9767589986186046, + 0.9775249563426904, + 0.9726728047540856, + 0.9798228295149478, + 0.9767589986186046 + ], + "mean_score": 0.9802062427780819, + "best_estimator_params": { + "clf__alpha": 0.5, + "vectorizer__max_df": 0.9, + "vectorizer__min_df": 1, + "vectorizer__ngram_range": [ + 1, + 1 + ] + } + } +} \ No newline at end of file diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/model_training.py b/src/model_training.py index 82b7def..dce1e3d 100644 --- a/src/model_training.py +++ b/src/model_training.py @@ -1,100 +1,119 @@ -# =============================== -# Multi-class Language Classification -# =============================== -import pprint +import os +import numpy as np import pandas as pd +from datetime import datetime +from sklearn.model_selection import GridSearchCV +from sklearn.preprocessing import LabelEncoder, FunctionTransformer from sklearn.pipeline import Pipeline -from sklearn.preprocessing import LabelEncoder -from sklearn.preprocessing import FunctionTransformer from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.naive_bayes import MultinomialNB -from sklearn.model_selection import GridSearchCV -from sklearn.metrics import classification_report, confusion_matrix -import seaborn as sns -import matplotlib.pyplot as plt - -# ------------------------------- -# Caricamento dati -# ------------------------------- -# Devo esistere le colonne 'text' e 'language' -# TODO: 
aggiungere sanity check sul dataset per verificare che le colonne esistano -df = pd.read_csv('../data/lang_detection.csv') -df.columns = [x.lower() for x in df.columns] - -# ------------------------------- -# Label Encoding della variabile target -# ------------------------------- -le = LabelEncoder() -y = le.fit_transform(df["language"]) # trasforma le lingue in numeri interi - -# ------------------------------- -# Funzione di pulizia testo (vectorized) + standardizzazione UTF-8 -# ------------------------------- -def clean_texts(texts): - return ( - pd.Series(texts) - .astype(str) - .map(lambda x: x.encode("utf-8", errors="ignore").decode("utf-8", errors="ignore")) - .str.lower() - .str.replace(r"[^\w\s]", "", regex=True) - .str.replace(r"\s+", " ", regex=True) - .str.strip() - .tolist() +from sklearn.metrics import classification_report +from src.model_utils import ( + load_config, parse_config, clean_texts, + save_pipeline_obj, log_experiment, +) + + +def prepare_data(cfg): + df = pd.read_csv(cfg["dataset"]["path"]) + if "keep_languages" in cfg["dataset"]: + df = df[df[cfg["dataset"]["label_column"]].isin(cfg["dataset"]["keep_languages"])] + X = df[cfg["dataset"]["text_column"]] + y = df[cfg["dataset"]["label_column"]] + le = LabelEncoder() + y_enc = le.fit_transform(y) + return X, y, y_enc, le + + +def build_pipeline(cfg): + vectorizer = TfidfVectorizer(**cfg["vectorizer"]["params"]) + clf = MultinomialNB(**cfg["model"]["params"]) + return Pipeline([ + ("cleaner", FunctionTransformer(clean_texts)), + ("vectorizer", vectorizer), + ("clf", clf), + ]) + + +def train_model(cfg): + cfg = parse_config(cfg) + X, y, y_enc, le = prepare_data(cfg) + pipeline = build_pipeline(cfg) + + print("Pipeline configuration:") + for name, step in pipeline.steps: + print(f" - {name}: {step}") + + # Initialize all results up front so the final return never hits an + # unbound name when grid search is disabled in the config. + scores, mean_score, best_params = None, None, None + + if cfg["grid_search"]["enabled"]: + param_grid = cfg["grid_search"]["param_grid"] + if not param_grid: + raise ValueError("Grid search enabled but no 
param_grid provided") + + param_grid = parse_config(param_grid) + print(f"Starting GridSearchCV with params:\n{param_grid}") + + grid = GridSearchCV( + pipeline, + param_grid=param_grid, + cv=cfg["grid_search"]["cv_folds"], + scoring=cfg["grid_search"]["metric"], + verbose=2, + n_jobs=cfg["grid_search"].get("n_jobs", -1), + ) + grid.fit(X, y_enc) + pipeline = grid.best_estimator_ + scores = grid.cv_results_["mean_test_score"] + mean_score = float(np.mean(scores)) + best_params = grid.best_params_ + + print(f"Best params: {best_params}") + print(f"Mean CV score: {mean_score:.4f}") + + y_pred_enc = pipeline.predict(X) + y_pred = le.inverse_transform(y_pred_enc) + print("\nClassification Report:") + print(classification_report(y, y_pred, target_names=le.classes_, digits=4)) + + + else: + print("GridSearchCV disabled. Training default pipeline...") + pipeline.fit(X, y_enc) + + return pipeline, le, scores, mean_score, best_params + + +def main(config_path: str): + cfg = load_config(config_path) + pipeline, le, scores, mean_score, best_params = train_model(cfg) + + output_dir = cfg["output"]["model_dir"] + os.makedirs(output_dir, exist_ok=True) + + save_pipeline_obj(pipeline, os.path.join(output_dir, cfg["output"]["model_filename"])) + save_pipeline_obj(le, os.path.join(output_dir, cfg["output"]["label_enc_filename"])) + + results = { + "cv_scores": scores.tolist() if scores is not None else None, + "mean_score": mean_score, + "best_estimator_params": best_params + } + + # Costruisce il path con timestamp per il file dei risultati + results_file_with_ts = os.path.join( + os.path.dirname(cfg["output"]["results_file"]), + f"{os.path.splitext(os.path.basename(cfg['output']['results_file']))[0]}_" \ + f"{datetime.now():%Y%m%d_%H%M%S}.json" ) -# ------------------------------- -# Pipeline compatta -# ------------------------------- -pipeline = Pipeline([ - ("cleaner", FunctionTransformer(clean_texts)), - ("vectorizer", TfidfVectorizer()), - ("clf", MultinomialNB()) -]) - -# 
------------------------------- -# Parametri da ottimizzare con GridSearchCV -# ------------------------------- -param_grid = { - "vectorizer__ngram_range": [(1,1), (1,2)], - "vectorizer__max_df": [0.9, 1.0], - "vectorizer__min_df": [1, 2], - "clf__alpha": [0.1, 0.5, 1.0] -} - -grid = GridSearchCV( - pipeline, - param_grid, - cv=5, - scoring="accuracy", - n_jobs=-1, - verbose=1 -) - -# ------------------------------- -# Fit sul dataset -# ------------------------------- -grid.fit(df["text"], y) -pprint("✅ Migliori parametri:", grid.best_params_) -pprint("✅ Miglior score (CV):", grid.best_score_) - -best_model = grid.best_estimator_ -y_pred = best_model.predict(df["text"]) - -# ------------------------------- -# Classification Report -# ------------------------------- -pprint("\nClassification Report:") -pprint(classification_report(y, y_pred, target_names=le.classes_, digits=4)) - -# ------------------------------- -# Confusion Matrix visuale -# ------------------------------- -cm = confusion_matrix(y, y_pred) -plt.figure(figsize=(6,5)) -sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", - xticklabels=le.classes_, yticklabels=le.classes_) -plt.xlabel("Predicted") -plt.ylabel("True") -plt.title("Confusion Matrix - Multiclass Language Classification") -plt.tight_layout() -plt.show() \ No newline at end of file + log_experiment(results, cfg, results_file_with_ts) + print("Training complete and model saved.") + + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser(description="Train a text classification model") + parser.add_argument("--config", type=str, required=True, help="Path to YAML config file") + args = parser.parse_args() + main(args.config) diff --git a/src/model_utils.py b/src/model_utils.py new file mode 100644 index 0000000..0f84613 --- /dev/null +++ b/src/model_utils.py @@ -0,0 +1,67 @@ +import yaml +import json +import os +import re +import ast +import string +from datetime import datetime +from joblib import dump + + +def 
load_config(path: str) -> dict: + with open(path, "r", encoding="utf-8") as f: + return yaml.load(f, Loader=yaml.FullLoader) + + +def parse_config(config: dict) -> dict: + """Convert YAML-loaded strings like '(1, 2)' into tuples.""" + def convert_value(v): + if isinstance(v, str) and v.startswith("(") and v.endswith(")"): + try: + return ast.literal_eval(v) + except Exception: + return v + if isinstance(v, list): + return [convert_value(x) for x in v] + if isinstance(v, dict): + return {k: convert_value(x) for k, x in v.items()} + return v + + return convert_value(config) + + +def save_json(data: dict, path: str): + os.makedirs(os.path.dirname(path), exist_ok=True) + with open(path, "w") as f: + json.dump(data, f, indent=4) + + +def save_pipeline_obj(model, path: str): + os.makedirs(os.path.dirname(path), exist_ok=True) + dump(model, path) + + +def log_experiment(results: dict, config: dict, output_file: str): + """ + Unisce risultati e metadati di configurazione in un singolo file JSON + """ + log = { + "timestamp": datetime.now().isoformat(), + "config": config, + "results": results, + } + save_json(log, output_file) + print(f"Experiment log saved to {output_file}") + + +def clean_texts(texts): + """ + Pulisce una lista di testi: + - lowercase + - rimozione punteggiatura + - rimozione spazi extra + """ + return [ + re.sub(r"\s+", " ", t.lower().translate(str.maketrans("", "", string.punctuation))).strip() + for t in texts + ] diff --git a/tests/dev/__init__.py b/tests/dev/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_api.py b/tests/dev/test_api_dev.py similarity index 95% rename from tests/test_api.py rename to tests/dev/test_api_dev.py index add231d..464ecc3 100644 --- a/tests/test_api.py +++ b/tests/dev/test_api_dev.py @@ -1,5 +1,10 @@ +import os import pytest from fastapi.testclient import TestClient + +# Dev tests exercise the real model artifacts, so ensure loading is NOT skipped +os.environ.setdefault("SKIP_MODEL_LOADING", "false") + +from app.main import 
app client = TestClient(app) diff --git a/tests/test_browser.html b/tests/dev/test_browser.html similarity index 100% rename from tests/test_browser.html rename to tests/dev/test_browser.html diff --git a/tests/prod/__init__.py b/tests/prod/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/prod/test_api_prod.py b/tests/prod/test_api_prod.py new file mode 100644 index 0000000..12da850 --- /dev/null +++ b/tests/prod/test_api_prod.py @@ -0,0 +1,46 @@ +import os +from fastapi.testclient import TestClient +from unittest.mock import patch + +# Skip heavy artifact loading during unit tests +os.environ.setdefault("SKIP_MODEL_LOADING", "true") + +from app.main import app + +client = TestClient(app) + + +# mock function to replace the actual model prediction +def mock_predict_language_safe(pipeline, le, texts, **kwargs): + return ["English", "Italian"] + + +def test_status_endpoint(): + """Health check endpoint""" + response = client.get("/status") + assert response.status_code == 200 + assert response.json() == {"status": "ok"} + + +def test_model_version_endpoint(): + """Model version endpoint""" + response = client.get("/model_version") + assert response.status_code == 200 + data = response.json() + assert "model_version" in data + assert isinstance(data["model_version"], str) + + +@patch("app.main.predict_language_safe", side_effect=mock_predict_language_safe) +def test_language_detection(mock_func): + """Language detection endpoint with mocked model""" + payload = {"texts": ["Hello world!", "Ciao come state?"]} + response = client.post("/language_detection", json=payload) + assert response.status_code == 200 + data = response.json() + assert "predictions" in data + assert isinstance(data["predictions"], list) + assert "English" in data["predictions"] + assert "Italian" in data["predictions"] + + mock_func.assert_called_once() # controlla che la funzione mock sia stata chiamata diff --git a/src/test_endpoint.sh b/tests/test_endpoint.sh similarity 
index 100% rename from src/test_endpoint.sh rename to tests/test_endpoint.sh