From 9d7c4977c78047cf3c4b27901e0fd52b14f11a1d Mon Sep 17 00:00:00 2001
From: msquared <mrc.milano@gmail.com>
Date: Mon, 20 Oct 2025 21:46:31 +0200
Subject: [PATCH 1/2] add docker build testing and hot-reload for dev image
 plus improved README

---
 .dockerignore                    |  3 +-
 Dockerfile                       |  3 ++
 README.md                        | 40 +++++++++++++-
 app/main.py                      | 53 +++++++++++++++----
 pytest.ini                       |  4 ++
 tests/dev/test_api_dev.py        | 52 +++++++++++++++++++
 tests/docker/test_dockerfiles.py | 89 ++++++++++++++++++++++++++++++++
 tests/prod/test_api_prod.py      | 62 ++++++++++++++++++++++
 8 files changed, 293 insertions(+), 13 deletions(-)
 create mode 100644 pytest.ini
 create mode 100644 tests/dev/test_api_dev.py
 create mode 100644 tests/docker/test_dockerfiles.py
 create mode 100644 tests/prod/test_api_prod.py

diff --git a/.dockerignore b/.dockerignore
index 48a30ce..356ad3a 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -6,8 +6,7 @@ venv/
 data/
 notebooks/
 scripts/
-src/
 *.ipynb
 *.env
 .DS_Store
-.env
\ No newline at end of file
+.env
diff --git a/Dockerfile b/Dockerfile
index 09b3396..00efa2c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -6,6 +6,8 @@ FROM python:3.11-slim
 
 # Per disabilitare buffering dei logs
 ENV PYTHONUNBUFFERED=1
+ENV APP_ENV=prod
+ENV MODEL_VERSION=v2
 
 WORKDIR /app
 
@@ -14,6 +16,7 @@ RUN pip install --no-cache-dir -r requirements.txt
 
 # Copiamo app e modelli nella directory di lavoro
 COPY app/ ./app
+COPY src/ ./src
 COPY models/ ./models
 
 EXPOSE 8000
diff --git a/README.md b/README.md
index 47d060b..00ab2dc 100644
--- a/README.md
+++ b/README.md
@@ -75,6 +75,44 @@ docker build -t language-api .
 docker run -d -p 8000:8000 language-api
 ```
 
+### 📦 Versionamento dei modelli
+
+L'API carica gli artefatti dal percorso `models/text/language_classification/<MODEL_VERSION>`.  
+Imposta `MODEL_VERSION` (o `MODEL_DIR`) come variabile d'ambiente per scegliere quale modello servire:
+
+```bash
+docker run -e MODEL_VERSION=v2 -p 8000:8000 language-api
+```
+
+Assicurati che la cartella corrispondente contenga `best_classifier.pkl` e `label_encoder.pkl`.
+
+### 🔁 Development con Docker
+
+Per iterare rapidamente senza rebuild continui, usa l'immagine di sviluppo e monta il codice locale:
+
+```bash
+docker build -f Dockerfile.dev -t language-api-dev .
+docker run --rm -it \
+  -v $(pwd)/app:/app/app \
+  -v $(pwd)/configs:/app/configs \
+  -v $(pwd)/src:/app/src \
+  -v $(pwd)/tests:/app/tests \
+  -p 8000:8000 \
+  language-api-dev
+```
+
+`uvicorn --reload` rileva automaticamente le modifiche nella directory montata. Imposta `-e MODEL_VERSION=vX` se vuoi testare un modello diverso.
+
+## 🧠 Training del modello
+
+Usa `src/model_training.py` per addestrare un nuovo classificatore partendo dalla configurazione YAML:
+
+```bash
+python src/model_training.py --config configs/model_config.yaml
+```
+
+Il training produce `best_classifier.pkl` e `label_encoder.pkl` nella cartella `output.model_dir` definita nel file di config, oltre a salvare un file di risultati con timestamp. Aggiorna `MODEL_VERSION` (o `MODEL_DIR`) per puntare al modello appena creato.
+
 ## 📡 Endpoint disponibili
 
 | Endpoint              | Metodo | Descrizione                         |
@@ -104,4 +142,4 @@ pytest -v
 ```
 
 ---
-☑️ **DISCLAIMER** : Vibe-coded with ChatGPT - v5 and v4.1
+☑️ **DISCLAIMER** : Vibe-coded with OpenAI ChatGPT - v5 and v4.1
diff --git a/app/main.py b/app/main.py
index c869b41..ef39420 100644
--- a/app/main.py
+++ b/app/main.py
@@ -1,13 +1,15 @@
 import logging
+import os
+import threading
 import time
-from fastapi import FastAPI, Request
-from pydantic import BaseModel
 from typing import List
+
 import joblib
-import os
+from fastapi import FastAPI, Request
+from pydantic import BaseModel
+
 # clean_texts usata da predict_language_safe nella pipeline
-from app.utils import predict_language_safe, clean_texts 
-import threading
+from app.utils import predict_language_safe, clean_texts
 
 # -------------------------------
 # Logging setup
@@ -23,19 +25,50 @@
 # Config
 # -------------------------------
 CONFIDENCE_THRESHOLD = 0.5
-MODEL_VERSION = "v1"
-MODEL_DIR = f"models/text/language_classification/{MODEL_VERSION}"
+MODEL_VERSION = os.getenv("MODEL_VERSION", "v1")
+DEFAULT_MODEL_DIR = f"models/text/language_classification/{MODEL_VERSION}"
+MODEL_DIR = os.getenv("MODEL_DIR", DEFAULT_MODEL_DIR)
 MODEL_FILENAME = "best_classifier.pkl"
 LABEL_ENCOD_FILENAME = "label_encoder.pkl"
 
+APP_ENV = os.getenv("APP_ENV", "dev").lower()
+SKIP_MODEL_LOADING = os.getenv("SKIP_MODEL_LOADING", "false").lower() in {"1", "true", "yes"}
+
 model_lock = threading.Lock()
 
+def load_artifacts():
+    """Load the classifier and label encoder from MODEL_DIR; in prod, missing files give (None, None)."""
+    model_path = os.path.join(MODEL_DIR, MODEL_FILENAME)
+    encoder_path = os.path.join(MODEL_DIR, LABEL_ENCOD_FILENAME)
+    problem = None  # first missing artifact found, checked in order: directory, model, encoder
+    if not os.path.exists(MODEL_DIR):
+        problem = f"Model directory {MODEL_DIR} does not exist."
+    elif not os.path.isfile(model_path):
+        problem = f"Model file {MODEL_FILENAME} not found in {MODEL_DIR}."
+    elif not os.path.isfile(encoder_path):
+        problem = f"Label encoder file {LABEL_ENCOD_FILENAME} not found in {MODEL_DIR}."
+    if problem is not None:
+        if APP_ENV != "prod":
+            raise FileNotFoundError(problem)
+        logger.warning("Skipping model load in prod environment: %s", problem)
+        return None, None
+
+    return joblib.load(model_path), joblib.load(encoder_path)
+
 # -------------------------------
 # Load model and encoder
 # -------------------------------
-model = joblib.load(os.path.join(MODEL_DIR, MODEL_FILENAME))
-le = joblib.load(os.path.join(MODEL_DIR, LABEL_ENCOD_FILENAME))
-logger.info(f"Loaded model version {MODEL_VERSION} from {MODEL_DIR}")
+model = None
+le = None
+
+if SKIP_MODEL_LOADING:
+    logger.info("Skipping model load because SKIP_MODEL_LOADING is set.")
+else:
+    model, le = load_artifacts()
+    if model is None or le is None:
+        logger.info("Model artifacts not loaded; predictions disabled for this run.")
+    else:
+        logger.info(f"Loaded model version {MODEL_VERSION} from {MODEL_DIR}")
 
 # -------------------------------
 # FastAPI setup
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 0000000..173a8ad
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,4 @@
+[pytest]
+testpaths =
+    tests/prod
+    tests/docker
diff --git a/tests/dev/test_api_dev.py b/tests/dev/test_api_dev.py
new file mode 100644
index 0000000..fca61ae
--- /dev/null
+++ b/tests/dev/test_api_dev.py
@@ -0,0 +1,52 @@
+import os
+import sys
+
+import pytest
+from fastapi.testclient import TestClient
+
+
+@pytest.fixture
+def dev_app(monkeypatch):
+    monkeypatch.setenv("APP_ENV", "dev")
+    monkeypatch.setenv("MODEL_VERSION", "v2")
+    monkeypatch.delenv("SKIP_MODEL_LOADING", raising=False)
+    monkeypatch.delenv("MODEL_DIR", raising=False)
+    sys.modules.pop("app.main", None)
+    import app.main as main
+
+    if main.model is None or main.le is None:
+        pytest.fail("Model artifacts must be available in dev environment for these tests.")
+    return main
+
+
+@pytest.fixture
+def client(dev_app):
+    return TestClient(dev_app.app)
+
+
+def test_status(client):
+    """Testa che l'endpoint /status risponda correttamente"""
+    response = client.get("/status")
+    assert response.status_code == 200
+    assert response.json() == {"status": "ok"}
+
+
+def test_model_version(client, dev_app):
+    """Testa che l'endpoint /model_version ritorni la versione modello"""
+    response = client.get("/model_version")
+    assert response.status_code == 200
+    data = response.json()
+    assert data["model_version"] == dev_app.MODEL_VERSION
+    assert dev_app.MODEL_VERSION == "v2"
+    assert os.path.normpath(dev_app.MODEL_DIR).split(os.sep)[-1] == "v2"
+
+
+def test_language_detection(client):
+    """Testa che l'endpoint /language_detection ritorni predizioni"""
+    payload = {"texts": ["Hello world!", "Ciao come state?"]}
+    response = client.post("/language_detection", json=payload)
+    assert response.status_code == 200
+    data = response.json()
+    assert "predictions" in data
+    assert isinstance(data["predictions"], list)
+    assert len(data["predictions"]) == len(payload["texts"])
diff --git a/tests/docker/test_dockerfiles.py b/tests/docker/test_dockerfiles.py
new file mode 100644
index 0000000..543274e
--- /dev/null
+++ b/tests/docker/test_dockerfiles.py
@@ -0,0 +1,89 @@
+import re
+from pathlib import Path
+
+REPO_ROOT = Path(__file__).resolve().parents[2]
+
+
+def read_dockerfile(path: str) -> str:
+    return (REPO_ROOT / path).read_text(encoding="utf-8")
+
+
+def extract_copy_sources(contents: str):
+    """Yield every build-context source path named by the COPY instructions in *contents*."""
+    for line in contents.splitlines():
+        tokens = line.split()
+        # Skip non-COPY lines and multi-stage copies, whose sources are not in the repo.
+        if len(tokens) < 3 or tokens[0].upper() != "COPY" or "--from=" in line:
+            continue
+        # Option flags (e.g. --chown=) are not paths; the final path is the destination.
+        for src in [tok for tok in tokens[1:] if not tok.startswith("--")][:-1]:
+            yield src.rstrip("/")
+
+
+def test_production_dockerfile_sets_app_env():
+    contents = read_dockerfile("Dockerfile")
+    assert "ENV APP_ENV=prod" in contents
+
+
+def test_development_dockerfile_sets_app_env():
+    contents = read_dockerfile("Dockerfile.dev")
+    assert "ENV APP_ENV=dev" in contents
+
+
+def test_dockerfiles_define_model_version():
+    prod = read_dockerfile("Dockerfile")
+    dev = read_dockerfile("Dockerfile.dev")
+    assert "ENV MODEL_VERSION=" in prod
+    assert "ENV MODEL_VERSION=" in dev
+
+
+def test_production_copy_sources_exist():
+    contents = read_dockerfile("Dockerfile")
+    missing = [
+        src for src in extract_copy_sources(contents)
+        if not (REPO_ROOT / src).exists()
+    ]
+    assert not missing, f"Dockerfile COPY sources missing from repo: {missing}"
+
+
+def test_development_copy_sources_exist():
+    contents = read_dockerfile("Dockerfile.dev")
+    missing = [
+        src for src in extract_copy_sources(contents)
+        if not (REPO_ROOT / src).exists()
+    ]
+    assert not missing, f"Dockerfile.dev COPY sources missing from repo: {missing}"
+
+
+def test_requirements_cover_web_stack():
+    requirements = (REPO_ROOT / "requirements.txt").read_text(encoding="utf-8")
+    assert "fastapi" in requirements.lower()
+    assert "uvicorn" in requirements.lower()
+
+
+def test_uvicorn_entrypoint_module_exists():
+    contents = read_dockerfile("Dockerfile")
+    cmd_line = next(
+        (line for line in contents.splitlines() if line.strip().startswith("CMD")), ""
+    )
+    assert "app.main:app" in cmd_line
+    assert (REPO_ROOT / "app" / "main.py").exists()
+
+
+def test_models_directory_tracked_even_without_artifacts():
+    models_dir = REPO_ROOT / "models"
+    assert models_dir.exists() and models_dir.is_dir()
+
+
+def test_dev_dockerfile_does_not_copy_app_code():
+    """The dev image must not bake in app/: it is bind-mounted at runtime for hot reload."""
+    sources = extract_copy_sources(read_dockerfile("Dockerfile.dev"))
+    # Compare the first path component so "app", "app/", "./app" and "app/main.py" are all caught.
+    copied = {Path(src).parts[0] for src in sources if Path(src).parts}
+    assert "app" not in copied, "Dev Dockerfile should rely on bind mount for app/ code"
+
+
+def test_dockerignore_allows_app_and_src():
+    """Fail if .dockerignore excludes app/ or src/ under any spelling (app, app/, /app/)."""
+    ignored = {ln.strip().strip("/") for ln in (REPO_ROOT / ".dockerignore").read_text(encoding="utf-8").splitlines()}
+    assert not ignored & {"app", "src"}, "app/ and src/ must be included in build context"
diff --git a/tests/prod/test_api_prod.py b/tests/prod/test_api_prod.py
new file mode 100644
index 0000000..0f2def7
--- /dev/null
+++ b/tests/prod/test_api_prod.py
@@ -0,0 +1,62 @@
+import sys
+from unittest.mock import patch
+
+import pytest
+from fastapi.testclient import TestClient
+
+
+def mock_predict_language_safe(_, __, texts, **kwargs):
+    return ["English", "Italian"][: len(texts)]
+
+
+@pytest.fixture
+def prod_app(monkeypatch, tmp_path):
+    monkeypatch.setenv("APP_ENV", "prod")
+    monkeypatch.setenv("MODEL_DIR", str(tmp_path / "missing"))
+    monkeypatch.delenv("SKIP_MODEL_LOADING", raising=False)
+    sys.modules.pop("app.main", None)
+    import app.main as main
+
+    return main
+
+
+@pytest.fixture
+def client(prod_app):
+    return TestClient(prod_app.app)
+
+
+def test_status_endpoint(client):
+    """Health check endpoint"""
+    response = client.get("/status")
+    assert response.status_code == 200
+    assert response.json() == {"status": "ok"}
+
+
+def test_model_version_endpoint(client):
+    """Model version endpoint"""
+    response = client.get("/model_version")
+    assert response.status_code == 200
+    data = response.json()
+    assert "model_version" in data
+    assert isinstance(data["model_version"], str)
+
+
+def test_language_detection_with_mock(client, prod_app):
+    """Language detection endpoint with mocked model"""
+    payload = {"texts": ["Hello world!", "Ciao come state?"]}
+    with patch("app.main.predict_language_safe", side_effect=mock_predict_language_safe) as mock_func:
+        response = client.post("/language_detection", json=payload)
+    assert response.status_code == 200
+    data = response.json()
+    assert "predictions" in data
+    assert isinstance(data["predictions"], list)
+    assert set(data["predictions"]) == {"English", "Italian"}
+
+    mock_func.assert_called_once()
+
+
+def test_prod_environment_skips_model_loading(prod_app):
+    """Ensure prod environment does not load model artifacts during tests."""
+    assert prod_app.APP_ENV == "prod"
+    assert prod_app.model is None
+    assert prod_app.le is None

From a78ff8aba9650c98712bd6a6ac1c0e3d3adc4b44 Mon Sep 17 00:00:00 2001
From: msquared <mrc.milano@gmail.com>
Date: Mon, 20 Oct 2025 22:07:52 +0200
Subject: [PATCH 2/2] modify .gitignore to include Dockerfile.dev in versioning

---
 .gitignore     |  1 -
 Dockerfile.dev | 22 ++++++++++++++++++++++
 2 files changed, 22 insertions(+), 1 deletion(-)
 create mode 100644 Dockerfile.dev

diff --git a/.gitignore b/.gitignore
index b3d3a5a..acc0cc0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -46,4 +46,3 @@ Thumbs.db
 *.tar
 *.bak
 *.swp
-Dockerfile.dev
diff --git a/Dockerfile.dev b/Dockerfile.dev
new file mode 100644
index 0000000..4859d4c
--- /dev/null
+++ b/Dockerfile.dev
@@ -0,0 +1,22 @@
+# =========================
+# Dockerfile - Sviluppo
+# =========================
+FROM python:3.11-slim
+
+# per disabilitare buffering dei logs
+ENV PYTHONUNBUFFERED=1
+ENV APP_ENV=dev
+ENV MODEL_VERSION=v2
+
+WORKDIR /app
+
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copiamo src/ e i modelli (il codice in app/ sarà montato come bind mount)
+COPY src/ ./src
+COPY models/ ./models
+
+EXPOSE 8000
+
+CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"]