Merged
29 changes: 29 additions & 0 deletions .github/workflows/unit-tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
name: Unit Tests

on:
push:
branches: [main, develop]
pull_request:
branches: [main, develop]

jobs:
test:
runs-on: ubuntu-latest

steps:
- name: Checkout repository
uses: actions/checkout@v5

- name: Set up Python
uses: actions/setup-python@v6
with:
python-version: "3.11"

- name: Install dependencies
run: |
python -m pip install --upgrade pip
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
pip install pytest

- name: Run pytest
run: pytest -v --maxfail=1 --disable-warnings
5 changes: 4 additions & 1 deletion .gitignore
@@ -19,7 +19,7 @@ venv/
*.nbconvert.ipynb
*.jupyter/

# Data, Models, Notebooks
# Data, Models, Notebooks, Results
data/*
!data/.gitkeep

@@ -29,6 +29,9 @@ models/*
notebooks/*
!notebooks/.gitkeep

results/*
!results/.gitkeep

# Directories for archiving
archived/

41 changes: 22 additions & 19 deletions README.md
@@ -1,20 +1,21 @@
# 🌍 ML Model Deployment App

## 📝 Description
## 📝 Description

This app exposes a simple REST API based on **FastAPI** for automatic language detection of a text. It started as a toy project for implementing best practices in ML model deployment.
This app exposes a simple REST API based on **FastAPI** for automatic language detection of a text. It started as an educational project for implementing best practices in ML model deployment.
The classification model is trained on 4 languages: **English, French, Italian, and Spanish**, using **Multinomial Naive Bayes** and a **TF-IDF** preprocessing pipeline.

The project includes:
- Training-pipeline script for the language detection model (with GridSearchCV to tune MultinomialNB), built on scikit-learn pipelines.
- Simple REST API exposing the prediction endpoint via FastAPI, ready to be deployed with Docker
- API health-check endpoint and model-version endpoint
- Script to train the language detection model (with GridSearchCV to tune MultinomialNB) using scikit-learn pipelines.
- Simple REST API exposing the prediction endpoint
- API health-check endpoint and an endpoint returning the model version
- Deployment with Docker

⚠️ __WARNING__: This is a WIP
⚠️ **WARNING**: This is a WIP

---

## 🚀 Features
## 🚀 Features

- Training and saving of the classification model
- `/language_detection` endpoint for predicting the language of one or more texts
@@ -23,55 +24,54 @@ The project includes:
- Containerized deployment with **Docker** for a fast, portable startup
- Centralized logging to monitor requests and performance


---

## ⚙️ Installation and local startup

#### 1️⃣ Clone the repository

```
```bash
git clone https://github.com/mrcmilano/ml_api_deployment.git
cd ml_api_deployment
```

#### 2️⃣ Create and activate a virtual environment

```
```bash
python -m venv venv
source venv/bin/activate # on macOS/Linux
venv\Scripts\activate # on Windows
```

#### 3️⃣ Install the dependencies

```
```bash
pip install -r requirements.txt
```

#### 4️⃣ Train the MultinomialNB-based model

```
```bash
python src/model_training.py
```

#### 5️⃣ Start the API locally

```
```bash
uvicorn app.main:app --host 0.0.0.0 --port 8000 --reload
```

## 🐳 Running with Docker

#### Build the image

```
```bash
docker build -t language-api .
```

#### Start the container

```
```bash
docker run -d -p 8000:8000 language-api
```

@@ -83,11 +83,13 @@ docker run -d -p 8000:8000 language-api
| `/language_detection` | POST | Detects the language of one or more texts |
| `/model_version` | GET | Returns the model version |

### API request example
### API request example

#### POST /language_detection

##### Request

```
```bash
curl -X POST "http://127.0.0.1:8000/language_detection" \
-H "Content-Type: application/json" \
-d '{"texts": ["Bonjour à tous!", "Ciao come va?", "Hello world!"]}'
@@ -96,9 +98,10 @@ curl -X POST "http://127.0.0.1:8000/language_detection" \
## ✅ Test

To run the unit tests:
```

```bash
pytest -v
```

---
☑️ __DISCLAIMER__: Vibe-coded with ChatGPT - v5 and v4.1
☑️ **DISCLAIMER**: Vibe-coded with ChatGPT - v5 and v4.1
40 changes: 29 additions & 11 deletions app/main.py
@@ -1,13 +1,15 @@
import logging
import os
import threading
import time
from fastapi import FastAPI, Request
from pydantic import BaseModel
from typing import List

import joblib
import os
from fastapi import FastAPI, Request
from pydantic import BaseModel

# clean_texts is used by predict_language_safe in the pipeline
from app.utils import predict_language_safe, clean_texts
import threading
from app.utils import predict_language_safe, clean_texts

# -------------------------------
# Logging setup
@@ -27,15 +29,31 @@
MODEL_DIR = f"models/text/language_classification/{MODEL_VERSION}"
MODEL_FILENAME = "best_classifier.pkl"
LABEL_ENCOD_FILENAME = "label_encoder.pkl"
SKIP_MODEL_LOADING = os.getenv("SKIP_MODEL_LOADING", "false").lower() in {"1", "true", "yes"}

model_lock = threading.Lock()

# -------------------------------
# Load model and encoder
# -------------------------------
model = joblib.load(os.path.join(MODEL_DIR, MODEL_FILENAME))
le = joblib.load(os.path.join(MODEL_DIR, LABEL_ENCOD_FILENAME))
logger.info(f"Loaded model version {MODEL_VERSION} from {MODEL_DIR}")

def load_artifacts():
"""Carica modelli e altri artefatti necessari"""
if not os.path.exists(MODEL_DIR):
raise FileNotFoundError(f"Model directory {MODEL_DIR} does not exist.")
if not os.path.isfile(os.path.join(MODEL_DIR, MODEL_FILENAME)):
raise FileNotFoundError(f"Model file {MODEL_FILENAME} not found in {MODEL_DIR}.")
if not os.path.isfile(os.path.join(MODEL_DIR, LABEL_ENCOD_FILENAME)):
raise FileNotFoundError(f"Label encoder file {LABEL_ENCOD_FILENAME} not found in {MODEL_DIR}.")
model = joblib.load(os.path.join(MODEL_DIR, MODEL_FILENAME))
le = joblib.load(os.path.join(MODEL_DIR, LABEL_ENCOD_FILENAME))
return model, le
# Load model and encoder on module import unless explicitly skipped
model = None
le = None

if SKIP_MODEL_LOADING:
logger.info("Skipping model load because SKIP_MODEL_LOADING is set.")
else:
model, le = load_artifacts()
logger.info(f"Loaded model version {MODEL_VERSION} from {MODEL_DIR}")

# -------------------------------
# FastAPI setup
36 changes: 36 additions & 0 deletions configs/model_config.yaml
@@ -0,0 +1,36 @@
experiment_name: "multinomial_nb_v2"

dataset:
path: "data/lang_detection.csv"
keep_languages: ["English", "Spanish", "French", "Italian"]
text_column: "Text"
label_column: "Language"

model:
params:
alpha: 1.0

vectorizer:
params:
ngram_range: !!python/tuple [1, 2] # PyYAML tag: real tuples, not strings
max_df: 0.95
min_df: 1

grid_search:
enabled: true
cv_folds: 5
metric: "accuracy"
n_jobs: 1 # avoids multiprocessing warnings on macOS
param_grid:
vectorizer__ngram_range:
- !!python/tuple [1, 1]
- !!python/tuple [1, 2]
vectorizer__max_df: [0.9, 0.95, 1.0]
vectorizer__min_df: [1, 2]
clf__alpha: [0.5, 1.0, 1.5]

output:
model_dir: "models/text/language_classification/v2"
results_file: "results/train_multinb_v2.json"
model_filename: "best_classifier.pkl"
label_enc_filename: "label_encoder.pkl"
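Note that `!!python/tuple` is a PyYAML-specific tag: `yaml.safe_load` rejects it, so the training script presumably reads this file with an unsafe or full loader. A minimal sketch of the distinction (assumes PyYAML is installed; the inline snippet is a fragment of the config above):

```python
import yaml

snippet = """
vectorizer:
  params:
    ngram_range: !!python/tuple [1, 2]
"""

# safe_load raises a ConstructorError on the python-specific tag;
# UnsafeLoader reconstructs a real Python tuple instead.
cfg = yaml.load(snippet, Loader=yaml.UnsafeLoader)
print(cfg["vectorizer"]["params"]["ngram_range"])  # → (1, 2)
```

Getting a real tuple matters because scikit-learn's `TfidfVectorizer` expects `ngram_range` as a tuple, not a list or string.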
2 changes: 2 additions & 0 deletions pytest.ini
@@ -0,0 +1,2 @@
[pytest]
testpaths = tests/prod
4 changes: 2 additions & 2 deletions requirements.txt
@@ -5,5 +5,5 @@ pandas==2.2.3
joblib==1.4.2
fastapi==0.118.0
pydantic==2.11.9


httpx==0.28.1
pytest==8.4.2
Empty file added results/.gitkeep
Empty file.
120 changes: 120 additions & 0 deletions results/train_multinb_v2.json
@@ -0,0 +1,120 @@
{
"timestamp": "2025-10-11T14:32:26.822052",
"config": {
"experiment_name": "multinomial_nb_v2",
"dataset": {
"path": "data/lang_detection.csv",
"keep_languages": [
"English",
"Spanish",
"French",
"Italian"
],
"text_column": "Text",
"label_column": "Language"
},
"model": {
"params": {
"alpha": 1.0
}
},
"vectorizer": {
"params": {
"ngram_range": [
1,
2
],
"max_df": 0.95,
"min_df": 1
}
},
"grid_search": {
"enabled": true,
"cv_folds": 5,
"metric": "accuracy",
"n_jobs": 1,
"param_grid": {
"vectorizer__ngram_range": [
[
1,
1
],
[
1,
2
]
],
"vectorizer__max_df": [
0.9,
0.95,
1.0
],
"vectorizer__min_df": [
1,
2
],
"clf__alpha": [
0.5,
1.0,
1.5
]
}
},
"output": {
"model_dir": "models/text/language_classification/v2",
"results_file": "results/train_multinb_v2.json",
"model_filename": "best_classifier.pkl",
"label_enc_filename": "label_encoder.pkl"
}
},
"results": {
"cv_scores": [
0.9864629759949957,
0.982888289415383,
0.9828869862121093,
0.9826322099721116,
0.9864629759949957,
0.982888289415383,
0.9828869862121093,
0.9826322099721116,
0.9864629759949957,
0.982888289415383,
0.9828869862121093,
0.9826322099721116,
0.9818662522480256,
0.9770147522610577,
0.981610172804754,
0.9803336851982172,
0.9818662522480256,
0.9770147522610577,
0.981610172804754,
0.9803336851982172,
0.9818662522480256,
0.9770147522610577,
0.981610172804754,
0.9803336851982172,
0.9775249563426904,
0.9726728047540856,
0.9798228295149478,
0.9767589986186046,
0.9775249563426904,
0.9726728047540856,
0.9798228295149478,
0.9767589986186046,
0.9775249563426904,
0.9726728047540856,
0.9798228295149478,
0.9767589986186046
],
"mean_score": 0.9802062427780819,
"best_estimator_params": {
"clf__alpha": 0.5,
"vectorizer__max_df": 0.9,
"vectorizer__min_df": 1,
"vectorizer__ngram_range": [
1,
1
]
}
}
}
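The 36 entries in `cv_scores` appear to correspond to the 36 candidate combinations in the parameter grid (GridSearchCV reports one aggregate score per candidate). A quick stdlib check, with the grid values copied from `configs/model_config.yaml`:

```python
# Parameter grid as configured in configs/model_config.yaml.
param_grid = {
    "vectorizer__ngram_range": [(1, 1), (1, 2)],
    "vectorizer__max_df": [0.9, 0.95, 1.0],
    "vectorizer__min_df": [1, 2],
    "clf__alpha": [0.5, 1.0, 1.5],
}

# Number of candidate combinations the grid search evaluates: 2 * 3 * 2 * 3.
n_candidates = 1
for values in param_grid.values():
    n_candidates *= len(values)

print(n_candidates)  # → 36
```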
Empty file added src/__init__.py
Empty file.