Merged
3 changes: 1 addition & 2 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -6,8 +6,7 @@ venv/
data/
notebooks/
scripts/
src/
*.ipynb
*.env
.DS_Store
.env
.env
2 changes: 1 addition & 1 deletion .github/workflows/unit-tests.yml
@@ -17,7 +17,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v6
with:
python-version: "3.11"
python-version: "3.13"

- name: Install dependencies
run: |
1 change: 0 additions & 1 deletion .gitignore
@@ -46,4 +46,3 @@ Thumbs.db
*.tar
*.bak
*.swp
Dockerfile.dev
3 changes: 3 additions & 0 deletions Dockerfile
@@ -6,6 +6,8 @@ FROM python:3.11-slim

# Disable log buffering
ENV PYTHONUNBUFFERED=1
ENV APP_ENV=prod
ENV MODEL_VERSION=v2

WORKDIR /app

@@ -14,6 +16,7 @@ RUN pip install --no-cache-dir -r requirements.txt

# Copy the app and models into the working directory
COPY app/ ./app
COPY src/ ./src
COPY models/ ./models

EXPOSE 8000
22 changes: 22 additions & 0 deletions Dockerfile.dev
@@ -0,0 +1,22 @@
# =========================
# Dockerfile - Development
# =========================
FROM python:3.11-slim

# Disable log buffering
ENV PYTHONUNBUFFERED=1
ENV APP_ENV=dev
ENV MODEL_VERSION=v2

WORKDIR /app

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy src and models only (the app code will be bind-mounted)
COPY src/ ./src
COPY models/ ./models

EXPOSE 8000

CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"]
40 changes: 39 additions & 1 deletion README.md
@@ -75,6 +75,44 @@ docker build -t language-api .
docker run -d -p 8000:8000 language-api
```

### 📦 Model versioning

The API loads its artifacts from `models/text/language_classification/<MODEL_VERSION>`.
Set `MODEL_VERSION` (or `MODEL_DIR`) as an environment variable to choose which model to serve:

```bash
docker run -e MODEL_VERSION=v2 -p 8000:8000 language-api
```

Make sure the corresponding directory contains `best_classifier.pkl` and `label_encoder.pkl`.
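The resolution order can be summarized in a short sketch. `resolve_model_dir` below is an illustrative helper, not a function in the repo; it mirrors the logic in `app/main.py`, where an explicit `MODEL_DIR` overrides the default path derived from `MODEL_VERSION`:

```python
import os

# Artifact names the API expects inside the model directory.
MODEL_FILENAME = "best_classifier.pkl"
LABEL_ENCOD_FILENAME = "label_encoder.pkl"

def resolve_model_dir(env=os.environ):
    """Illustrative helper: MODEL_DIR, if set, wins over the default
    path derived from MODEL_VERSION (which defaults to "v1")."""
    version = env.get("MODEL_VERSION", "v1")
    default_dir = f"models/text/language_classification/{version}"
    return env.get("MODEL_DIR", default_dir)
```

Passing a plain dict as `env` makes the resolution easy to test without touching the real environment.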

### 🔁 Development with Docker

To iterate quickly without constant rebuilds, use the development image and mount the local code:

```bash
docker build -f Dockerfile.dev -t language-api-dev .
docker run --rm -it \
-v $(pwd)/app:/app/app \
-v $(pwd)/configs:/app/configs \
-v $(pwd)/src:/app/src \
-v $(pwd)/tests:/app/tests \
-p 8000:8000 \
language-api-dev
```

`uvicorn --reload` automatically picks up changes in the mounted directories. Set `-e MODEL_VERSION=vX` if you want to test a different model.
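Once the container is up, you can also exercise the API from Python. This is a sketch, not part of the repo: `detect_languages` is a hypothetical helper that assumes the server listens on localhost:8000 and uses the request/response shapes seen in the tests (`{"texts": [...]}` in, `{"predictions": [...]}` out):

```python
import json
import urllib.request

def detect_languages(texts, base_url="http://localhost:8000"):
    """Hypothetical client helper: POST a batch of texts to the
    /language_detection endpoint and return the predictions list."""
    payload = json.dumps({"texts": texts}).encode("utf-8")
    req = urllib.request.Request(
        f"{base_url}/language_detection",
        data=payload,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(req) as resp:
        return json.loads(resp.read())["predictions"]

# e.g. detect_languages(["Hello world!", "Ciao come state?"])
```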

## 🧠 Model training

Use `src/model_training.py` to train a new classifier from the YAML configuration:

```bash
python src/model_training.py --config configs/model_config.yaml
```

Training produces `best_classifier.pkl` and `label_encoder.pkl` in the `output.model_dir` directory defined in the config file, and also saves a timestamped results file. Update `MODEL_VERSION` (or `MODEL_DIR`) to point at the newly created model.
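After a training run, the artifacts still need to land where the API looks for them. A minimal sketch of that promotion step (`promote_model` and its arguments are illustrative, not part of the repo; the paths and filenames match the layout described above):

```python
import shutil
from pathlib import Path

# Artifact names the API expects in the versioned model directory.
ARTIFACTS = ("best_classifier.pkl", "label_encoder.pkl")

def promote_model(output_model_dir: str, version: str, repo_root: str = ".") -> Path:
    """Hypothetical helper: copy freshly trained artifacts from
    output.model_dir into models/text/language_classification/<version>."""
    dest = Path(repo_root) / "models" / "text" / "language_classification" / version
    dest.mkdir(parents=True, exist_ok=True)
    for name in ARTIFACTS:
        src = Path(output_model_dir) / name
        if not src.is_file():
            raise FileNotFoundError(f"Missing artifact: {src}")
        shutil.copy2(src, dest / name)
    return dest
```

After promoting, start the API with `MODEL_VERSION` set to the new version.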

## 📡 Available endpoints

| Endpoint | Method | Description |
@@ -104,4 +142,4 @@ pytest -v
```

---
☑️ **DISCLAIMER** : Vibe-coded with ChatGPT - v5 and v4.1
☑️ **DISCLAIMER** : Vibe-coded with OpenAI ChatGPT - v5 and v4.1
40 changes: 28 additions & 12 deletions app/main.py
@@ -25,35 +25,51 @@
# Config
# -------------------------------
CONFIDENCE_THRESHOLD = 0.5
MODEL_VERSION = "v1"
MODEL_DIR = f"models/text/language_classification/{MODEL_VERSION}"
MODEL_VERSION = os.getenv("MODEL_VERSION", "v1")
DEFAULT_MODEL_DIR = f"models/text/language_classification/{MODEL_VERSION}"
MODEL_DIR = os.getenv("MODEL_DIR", DEFAULT_MODEL_DIR)
MODEL_FILENAME = "best_classifier.pkl"
LABEL_ENCOD_FILENAME = "label_encoder.pkl"
SKIP_MODEL_LOADING = os.getenv("SKIP_MODEL_LOADING", "false").lower() in {"1", "true", "yes"}

model_lock = threading.Lock()
APP_ENV = os.getenv("APP_ENV", "dev").lower()
SKIP_MODEL_LOADING = os.getenv("SKIP_MODEL_LOADING", "false").lower() in {"1", "true", "yes"}

model_lock = threading.Lock()

def load_artifacts():
"""Carica modelli e altri artefatti necessari"""
if not os.path.exists(MODEL_DIR):
raise FileNotFoundError(f"Model directory {MODEL_DIR} does not exist.")
if not os.path.isfile(os.path.join(MODEL_DIR, MODEL_FILENAME)):
raise FileNotFoundError(f"Model file {MODEL_FILENAME} not found in {MODEL_DIR}.")
if not os.path.isfile(os.path.join(MODEL_DIR, LABEL_ENCOD_FILENAME)):
raise FileNotFoundError(f"Label encoder file {LABEL_ENCOD_FILENAME} not found in {MODEL_DIR}.")
"""Carica modelli e altri artefatti necessari."""
try:
if not os.path.exists(MODEL_DIR):
raise FileNotFoundError(f"Model directory {MODEL_DIR} does not exist.")
if not os.path.isfile(os.path.join(MODEL_DIR, MODEL_FILENAME)):
raise FileNotFoundError(f"Model file {MODEL_FILENAME} not found in {MODEL_DIR}.")
if not os.path.isfile(os.path.join(MODEL_DIR, LABEL_ENCOD_FILENAME)):
raise FileNotFoundError(f"Label encoder file {LABEL_ENCOD_FILENAME} not found in {MODEL_DIR}.")
except FileNotFoundError as exc:
if APP_ENV == "prod":
logger.warning("Skipping model load in prod environment: %s", exc)
return None, None
raise

model = joblib.load(os.path.join(MODEL_DIR, MODEL_FILENAME))
le = joblib.load(os.path.join(MODEL_DIR, LABEL_ENCOD_FILENAME))
return model, le
# Load model and encoder on module import unless explicitly skipped

# -------------------------------
# Load model and encoder
# -------------------------------
model = None
le = None

if SKIP_MODEL_LOADING:
logger.info("Skipping model load because SKIP_MODEL_LOADING is set.")
else:
model, le = load_artifacts()
logger.info(f"Loaded model version {MODEL_VERSION} from {MODEL_DIR}")
if model is None or le is None:
logger.info("Model artifacts not loaded; predictions disabled for this run.")
else:
logger.info(f"Loaded model version {MODEL_VERSION} from {MODEL_DIR}")

# -------------------------------
# FastAPI setup
4 changes: 3 additions & 1 deletion pytest.ini
@@ -1,2 +1,4 @@
[pytest]
testpaths = tests/prod
testpaths =
tests/prod
tests/docker
63 changes: 24 additions & 39 deletions tests/dev/test_api_dev.py
@@ -1,66 +1,51 @@
import os
import sys
import pytest
from fastapi.testclient import TestClient

# Skip heavy artifact loading during unit tests
os.environ.setdefault("SKIP_MODEL_LOADING", "false")

from app.main import app
@pytest.fixture
def dev_app(monkeypatch):
monkeypatch.setenv("APP_ENV", "dev")
monkeypatch.setenv("MODEL_VERSION", "v2")
monkeypatch.delenv("SKIP_MODEL_LOADING", raising=False)
monkeypatch.delenv("MODEL_DIR", raising=False)
sys.modules.pop("app.main", None)
import app.main as main

client = TestClient(app)
if main.model is None or main.le is None:
pytest.fail("Model artifacts must be available in dev environment for these tests.")
return main


def test_status():
@pytest.fixture
def client(dev_app):
return TestClient(dev_app.app)


def test_status(client):
"""Testa che l'endpoint /status risponda correttamente"""
response = client.get("/status")
assert response.status_code == 200
assert response.json() == {"status": "ok"}


def test_model_version():
def test_model_version(client, dev_app):
"""Testa che l'endpoint /model_version ritorni la versione modello"""
response = client.get("/model_version")
assert response.status_code == 200
data = response.json()
assert "model_version" in data
assert isinstance(data["model_version"], str)
assert data["model_version"] == dev_app.MODEL_VERSION
assert dev_app.MODEL_VERSION == "v2"
assert os.path.normpath(dev_app.MODEL_DIR).split(os.sep)[-1] == "v2"


def test_language_detection():
def test_language_detection(client):
"""Testa che l'endpoint /language_detection ritorni predizioni"""
payload = {"texts": ["Hello world!", "Ciao come state?"]}
response = client.post("/language_detection", json=payload)
assert response.status_code == 200
data = response.json()
assert "predictions" in data
assert isinstance(data["predictions"], list)


def test_language_detection_empty_list(client):
"""Test that an empty list is handled without crashing"""
payload = {"texts": []}
response = client.post("/language_detection", json=payload)
assert response.status_code == 200
data = response.json()
assert "predictions" in data
assert data["predictions"] == []

def test_language_detection_invalid_input(client):
"""Test that invalid input is handled correctly"""
payload = {"text": "!!@±±#€@-=>./,"} # invalid input
response = client.post("/language_detection", json=payload)
assert response.status_code == 422
data = response.json()
assert "detail" in data
assert isinstance(data["detail"], list)

# TODO: FIXME: this test fails; figure out why
# def test_language_detection_mixed_input():
# """Test that a mixed list of texts is handled correctly"""
# payload = {"texts": ["Hello world!", "", "!!@±±#€@-=>./,", "Ciao come state?"]}
# response = client.post("/language_detection", json=payload)
# assert response.status_code == 200
# data = response.json()
# assert "predictions" in data
# assert isinstance(data["predictions"], list)
# assert len(data["predictions"]) == 4
assert len(data["predictions"]) == len(payload["texts"])
89 changes: 89 additions & 0 deletions tests/docker/test_dockerfiles.py
@@ -0,0 +1,89 @@
import re
from pathlib import Path

REPO_ROOT = Path(__file__).resolve().parents[2]


def read_dockerfile(path: str) -> str:
return (REPO_ROOT / path).read_text(encoding="utf-8")


def extract_copy_sources(contents: str):
for line in contents.splitlines():
stripped = line.strip()
if not stripped or stripped.startswith("#"):
continue
match = re.match(r"^COPY\s+([^\s]+)\s+(.*)$", stripped, re.IGNORECASE)
if match:
sources = match.group(1).split()
for src in sources:
yield src.rstrip("/")


def test_production_dockerfile_sets_app_env():
contents = read_dockerfile("Dockerfile")
assert "ENV APP_ENV=prod" in contents


def test_development_dockerfile_sets_app_env():
contents = read_dockerfile("Dockerfile.dev")
assert "ENV APP_ENV=dev" in contents


def test_dockerfiles_define_model_version():
prod = read_dockerfile("Dockerfile")
dev = read_dockerfile("Dockerfile.dev")
assert "ENV MODEL_VERSION=" in prod
assert "ENV MODEL_VERSION=" in dev


def test_production_copy_sources_exist():
contents = read_dockerfile("Dockerfile")
missing = [
src for src in extract_copy_sources(contents)
if not (REPO_ROOT / src).exists()
]
assert not missing, f"Dockerfile COPY sources missing from repo: {missing}"


def test_development_copy_sources_exist():
contents = read_dockerfile("Dockerfile.dev")
missing = [
src for src in extract_copy_sources(contents)
if not (REPO_ROOT / src).exists()
]
assert not missing, f"Dockerfile.dev COPY sources missing from repo: {missing}"


def test_requirements_cover_web_stack():
requirements = (REPO_ROOT / "requirements.txt").read_text(encoding="utf-8")
assert "fastapi" in requirements.lower()
assert "uvicorn" in requirements.lower()


def test_uvicorn_entrypoint_module_exists():
contents = read_dockerfile("Dockerfile")
cmd_line = next(
(line for line in contents.splitlines() if line.strip().startswith("CMD")), ""
)
assert "app.main:app" in cmd_line
assert (REPO_ROOT / "app" / "main.py").exists()


def test_models_directory_tracked_even_without_artifacts():
models_dir = REPO_ROOT / "models"
assert models_dir.exists() and models_dir.is_dir()


def test_dev_dockerfile_does_not_copy_app_code():
contents = read_dockerfile("Dockerfile.dev")
copy_lines = [
line for line in contents.splitlines() if line.strip().upper().startswith("COPY")
]
assert all("app/" not in line.split() for line in copy_lines), "Dev Dockerfile should rely on bind mount for app/ code"


def test_dockerignore_allows_app_and_src():
dockerignore = (REPO_ROOT / ".dockerignore").read_text(encoding="utf-8").splitlines()
assert "app/" not in dockerignore, "app/ must be included in build context"
assert "src/" not in dockerignore, "src/ must be included in build context"