diff --git a/.dockerignore b/.dockerignore index 48a30ce..356ad3a 100644 --- a/.dockerignore +++ b/.dockerignore @@ -6,8 +6,7 @@ venv/ data/ notebooks/ scripts/ -src/ *.ipynb *.env .DS_Store -.env \ No newline at end of file +.env diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index ca2312e..6b54e50 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -17,7 +17,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v6 with: - python-version: "3.11" + python-version: "3.13" - name: Install dependencies run: | diff --git a/.gitignore b/.gitignore index b3d3a5a..acc0cc0 100644 --- a/.gitignore +++ b/.gitignore @@ -46,4 +46,3 @@ Thumbs.db *.tar *.bak *.swp -Dockerfile.dev diff --git a/Dockerfile b/Dockerfile index 09b3396..00efa2c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,6 +6,8 @@ FROM python:3.11-slim # Per disabilitare buffering dei logs ENV PYTHONUNBUFFERED=1 +ENV APP_ENV=prod +ENV MODEL_VERSION=v2 WORKDIR /app @@ -14,6 +16,7 @@ RUN pip install --no-cache-dir -r requirements.txt # Copiamo app e modelli nella directory di lavoro COPY app/ ./app +COPY src/ ./src COPY models/ ./models EXPOSE 8000 diff --git a/Dockerfile.dev b/Dockerfile.dev new file mode 100644 index 0000000..4859d4c --- /dev/null +++ b/Dockerfile.dev @@ -0,0 +1,22 @@ +# ========================= +# Dockerfile - Sviluppo +# ========================= +FROM python:3.11-slim + +# per disabilitare buffering dei logs +ENV PYTHONUNBUFFERED=1 +ENV APP_ENV=dev +ENV MODEL_VERSION=v2 + +WORKDIR /app + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Copiamo solo i modelli (il codice sarà montato come bind mount) +COPY src/ ./src +COPY models/ ./models + +EXPOSE 8000 + +CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"] diff --git a/README.md b/README.md index 47d060b..00ab2dc 100644 --- a/README.md +++ b/README.md @@ -75,6 +75,44 @@ docker build -t language-api . docker run -d -p 8000:8000 language-api ``` +### 📦 Versionamento dei modelli + +L'API carica gli artefatti dal percorso `models/text/language_classification/`. +Imposta `MODEL_VERSION` (o `MODEL_DIR`) come variabile d'ambiente per scegliere quale modello servire: + +```bash +docker run -e MODEL_VERSION=v2 -p 8000:8000 language-api +``` + +Assicurati che la cartella corrispondente contenga `best_classifier.pkl` e `label_encoder.pkl`. + +### 🔁 Development con Docker + +Per iterare rapidamente senza rebuild continui, usa l'immagine di sviluppo e monta il codice locale: + +```bash +docker build -f Dockerfile.dev -t language-api-dev . +docker run --rm -it \ + -v $(pwd)/app:/app/app \ + -v $(pwd)/configs:/app/configs \ + -v $(pwd)/src:/app/src \ + -v $(pwd)/tests:/app/tests \ + -p 8000:8000 \ + language-api-dev +``` + +`uvicorn --reload` rileva automaticamente le modifiche nella directory montata. Imposta `-e MODEL_VERSION=vX` se vuoi testare un modello diverso. + +## 🧠 Training del modello + +Usa `src/model_training.py` per addestrare un nuovo classificatore partendo dalla configurazione YAML: + +```bash +python src/model_training.py --config configs/model_config.yaml +``` + +Il training produce `best_classifier.pkl` e `label_encoder.pkl` nella cartella `output.model_dir` definita nel file di config, oltre a salvare un file di risultati con timestamp. Aggiorna `MODEL_VERSION` (o `MODEL_DIR`) per puntare al modello appena creato. + ## 📡 Endpoint disponibili | Endpoint | Metodo | Descrizione | @@ -104,4 +142,4 @@ pytest -v ``` --- -☑️ **DISCLAIMER** : Vibe-coded with ChatGPT - v5 and v4.1 +☑️ **DISCLAIMER** : Vibe-coded with OpenAI ChatGPT - v5 and v4.1 diff --git a/app/main.py b/app/main.py index 34f5577..033c8b2 100644 --- a/app/main.py +++ b/app/main.py @@ -25,27 +25,40 @@ # Config # ------------------------------- CONFIDENCE_THRESHOLD = 0.5 -MODEL_VERSION = "v1" -MODEL_DIR = f"models/text/language_classification/{MODEL_VERSION}" +MODEL_VERSION = os.getenv("MODEL_VERSION", "v1") +DEFAULT_MODEL_DIR = f"models/text/language_classification/{MODEL_VERSION}" +MODEL_DIR = os.getenv("MODEL_DIR", DEFAULT_MODEL_DIR) MODEL_FILENAME = "best_classifier.pkl" LABEL_ENCOD_FILENAME = "label_encoder.pkl" SKIP_MODEL_LOADING = os.getenv("SKIP_MODEL_LOADING", "false").lower() in {"1", "true", "yes"} -model_lock = threading.Lock() +APP_ENV = os.getenv("APP_ENV", "dev").lower() +SKIP_MODEL_LOADING = os.getenv("SKIP_MODEL_LOADING", "false").lower() in {"1", "true", "yes"} +model_lock = threading.Lock() def load_artifacts(): - """Carica modelli e altri artefatti necessari""" - if not os.path.exists(MODEL_DIR): - raise FileNotFoundError(f"Model directory {MODEL_DIR} does not exist.") - if not os.path.isfile(os.path.join(MODEL_DIR, MODEL_FILENAME)): - raise FileNotFoundError(f"Model file {MODEL_FILENAME} not found in {MODEL_DIR}.") - if not os.path.isfile(os.path.join(MODEL_DIR, LABEL_ENCOD_FILENAME)): - raise FileNotFoundError(f"Label encoder file {LABEL_ENCOD_FILENAME} not found in {MODEL_DIR}.") + """Carica modelli e altri artefatti necessari.""" + try: + if not os.path.exists(MODEL_DIR): + raise FileNotFoundError(f"Model directory {MODEL_DIR} does not exist.") + if not os.path.isfile(os.path.join(MODEL_DIR, MODEL_FILENAME)): + raise FileNotFoundError(f"Model file {MODEL_FILENAME} not found in {MODEL_DIR}.") + if not os.path.isfile(os.path.join(MODEL_DIR, LABEL_ENCOD_FILENAME)): + raise FileNotFoundError(f"Label encoder file {LABEL_ENCOD_FILENAME} not found in {MODEL_DIR}.") + except FileNotFoundError as exc: + if APP_ENV == "prod": + logger.warning("Skipping model load in prod environment: %s", exc) + return None, None + raise + model = joblib.load(os.path.join(MODEL_DIR, MODEL_FILENAME)) le = joblib.load(os.path.join(MODEL_DIR, LABEL_ENCOD_FILENAME)) return model, le -# Load model and encoder on module import unless explicitly skipped + +# ------------------------------- +# Load model and encoder +# ------------------------------- model = None le = None @@ -53,7 +66,10 @@ def load_artifacts(): logger.info("Skipping model load because SKIP_MODEL_LOADING is set.") else: model, le = load_artifacts() - logger.info(f"Loaded model version {MODEL_VERSION} from {MODEL_DIR}") + if model is None or le is None: + logger.info("Model artifacts not loaded; predictions disabled for this run.") + else: + logger.info(f"Loaded model version {MODEL_VERSION} from {MODEL_DIR}") # ------------------------------- # FastAPI setup diff --git a/pytest.ini b/pytest.ini index ddb6c6a..173a8ad 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,2 +1,4 @@ [pytest] -testpaths = tests/prod +testpaths = + tests/prod + tests/docker diff --git a/tests/dev/test_api_dev.py b/tests/dev/test_api_dev.py index 464ecc3..ec47706 100644 --- a/tests/dev/test_api_dev.py +++ b/tests/dev/test_api_dev.py @@ -1,32 +1,46 @@ import os +import sys import pytest from fastapi.testclient import TestClient -# Skip heavy artifact loading during unit tests -os.environ.setdefault("SKIP_MODEL_LOADING", "false") -from app.main import app +@pytest.fixture +def dev_app(monkeypatch): + monkeypatch.setenv("APP_ENV", "dev") + monkeypatch.setenv("MODEL_VERSION", "v2") + monkeypatch.delenv("SKIP_MODEL_LOADING", raising=False) + monkeypatch.delenv("MODEL_DIR", raising=False) + sys.modules.pop("app.main", None) + import app.main as main -client = TestClient(app) + if main.model is None or main.le is None: + pytest.fail("Model artifacts must be available in dev environment for these tests.") + return main -def test_status(): +@pytest.fixture +def client(dev_app): + return TestClient(dev_app.app) + + +def test_status(client): """Testa che l'endpoint /status risponda correttamente""" response = client.get("/status") assert response.status_code == 200 assert response.json() == {"status": "ok"} -def test_model_version(): +def test_model_version(client, dev_app): """Testa che l'endpoint /model_version ritorni la versione modello""" response = client.get("/model_version") assert response.status_code == 200 data = response.json() - assert "model_version" in data - assert isinstance(data["model_version"], str) + assert data["model_version"] == dev_app.MODEL_VERSION + assert dev_app.MODEL_VERSION == "v2" + assert os.path.normpath(dev_app.MODEL_DIR).split(os.sep)[-1] == "v2" -def test_language_detection(): +def test_language_detection(client): """Testa che l'endpoint /language_detection ritorni predizioni""" payload = {"texts": ["Hello world!", "Ciao come state?"]} response = client.post("/language_detection", json=payload) @@ -34,33 +48,4 @@ def test_language_detection(): data = response.json() assert "predictions" in data assert isinstance(data["predictions"], list) - - -def test_language_detection_empty_list(): - """Testa che una lista vuota venga gestita senza crash""" - payload = {"texts": []} - response = client.post("/language_detection", json=payload) - assert response.status_code == 200 - data = response.json() - assert "predictions" in data - assert data["predictions"] == [] - -def test_language_detection_invalid_input(): - """Testa che un input non valido venga gestito correttamente""" - payload = {"text": "!!@±±#€@-=>./,"} # input non valido - response = client.post("/language_detection", json=payload) - assert response.status_code == 422 - data = response.json() - assert "detail" in data - assert isinstance(data["detail"], list) - -# TODO: FIXME: questo test fallisce, capire perché -# def test_language_detection_mixed_input(): -# """Testa che una lista mista di testi venga gestita correttamente""" -# payload = {"texts": ["Hello world!", "", "!!@±±\#€@-=>./,", "Ciao come state?"]} -# response = client.post("/language_detection", json=payload) -# assert response.status_code == 200 -# data = response.json() -# assert "predictions" in data -# assert isinstance(data["predictions"], list) -# assert len(data["predictions"]) == 4 + assert len(data["predictions"]) == len(payload["texts"]) diff --git a/tests/docker/test_dockerfiles.py b/tests/docker/test_dockerfiles.py new file mode 100644 index 0000000..543274e --- /dev/null +++ b/tests/docker/test_dockerfiles.py @@ -0,0 +1,89 @@ +import re +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parents[2] + + +def read_dockerfile(path: str) -> str: + return (REPO_ROOT / path).read_text(encoding="utf-8") + + +def extract_copy_sources(contents: str): + for line in contents.splitlines(): + stripped = line.strip() + if not stripped or stripped.startswith("#"): + continue + match = re.match(r"^COPY\s+([^\s]+)\s+(.*)$", stripped, re.IGNORECASE) + if match: + sources = match.group(1).split() + for src in sources: + yield src.rstrip("/") + + +def test_production_dockerfile_sets_app_env(): + contents = read_dockerfile("Dockerfile") + assert "ENV APP_ENV=prod" in contents + + +def test_development_dockerfile_sets_app_env(): + contents = read_dockerfile("Dockerfile.dev") + assert "ENV APP_ENV=dev" in contents + + +def test_dockerfiles_define_model_version(): + prod = read_dockerfile("Dockerfile") + dev = read_dockerfile("Dockerfile.dev") + assert "ENV MODEL_VERSION=" in prod + assert "ENV MODEL_VERSION=" in dev + + +def test_production_copy_sources_exist(): + contents = read_dockerfile("Dockerfile") + missing = [ + src for src in extract_copy_sources(contents) + if not (REPO_ROOT / src).exists() + ] + assert not missing, f"Dockerfile COPY sources missing from repo: {missing}" + + +def test_development_copy_sources_exist(): + contents = read_dockerfile("Dockerfile.dev") + missing = [ + src for src in extract_copy_sources(contents) + if not (REPO_ROOT / src).exists() + ] + assert not missing, f"Dockerfile.dev COPY sources missing from repo: {missing}" + + +def test_requirements_cover_web_stack(): + requirements = (REPO_ROOT / "requirements.txt").read_text(encoding="utf-8") + assert "fastapi" in requirements.lower() + assert "uvicorn" in requirements.lower() + + +def test_uvicorn_entrypoint_module_exists(): + contents = read_dockerfile("Dockerfile") + cmd_line = next( + (line for line in contents.splitlines() if line.strip().startswith("CMD")), "" + ) + assert "app.main:app" in cmd_line + assert (REPO_ROOT / "app" / "main.py").exists() + + +def test_models_directory_tracked_even_without_artifacts(): + models_dir = REPO_ROOT / "models" + assert models_dir.exists() and models_dir.is_dir() + + +def test_dev_dockerfile_does_not_copy_app_code(): + contents = read_dockerfile("Dockerfile.dev") + copy_lines = [ + line for line in contents.splitlines() if line.strip().upper().startswith("COPY") + ] + assert all("app/" not in line.split() for line in copy_lines), "Dev Dockerfile should rely on bind mount for app/ code" + + +def test_dockerignore_allows_app_and_src(): + dockerignore = (REPO_ROOT / ".dockerignore").read_text(encoding="utf-8").splitlines() + assert "app/" not in dockerignore, "app/ must be included in build context" + assert "src/" not in dockerignore, "src/ must be included in build context" diff --git a/tests/prod/test_api_prod.py b/tests/prod/test_api_prod.py index 12da850..6336dc6 100644 --- a/tests/prod/test_api_prod.py +++ b/tests/prod/test_api_prod.py @@ -1,28 +1,37 @@ -import os -from fastapi.testclient import TestClient +import sys from unittest.mock import patch +import pytest +from fastapi.testclient import TestClient + + +def mock_predict_language_safe(_, __, texts, **kwargs): + return ["English", "Italian"][: len(texts)] -# Skip heavy artifact loading during unit tests -os.environ.setdefault("SKIP_MODEL_LOADING", "true") -from app.main import app +@pytest.fixture +def prod_app(monkeypatch, tmp_path): + monkeypatch.setenv("APP_ENV", "prod") + monkeypatch.setenv("MODEL_DIR", str(tmp_path / "missing")) + monkeypatch.delenv("SKIP_MODEL_LOADING", raising=False) + sys.modules.pop("app.main", None) + import app.main as main -client = TestClient(app) + return main -# mock function to replace the actual model prediction -def mock_predict_language_safe(pipeline, le, texts, **kwargs): - return ["English", "Italian"] +@pytest.fixture +def client(prod_app): + return TestClient(prod_app.app) -def test_status_endpoint(): +def test_status_endpoint(client): """Health check endpoint""" response = client.get("/status") assert response.status_code == 200 assert response.json() == {"status": "ok"} -def test_model_version_endpoint(): +def test_model_version_endpoint(client): """Model version endpoint""" response = client.get("/model_version") assert response.status_code == 200 @@ -31,16 +40,22 @@ def test_model_version_endpoint(): assert isinstance(data["model_version"], str) -@patch("app.main.predict_language_safe", side_effect=mock_predict_language_safe) -def test_language_detection(mock_func): +def test_language_detection_with_mock(client, prod_app): """Language detection endpoint with mocked model""" payload = {"texts": ["Hello world!", "Ciao come state?"]} - response = client.post("/language_detection", json=payload) + with patch("app.main.predict_language_safe", side_effect=mock_predict_language_safe) as mock_func: + response = client.post("/language_detection", json=payload) assert response.status_code == 200 data = response.json() assert "predictions" in data assert isinstance(data["predictions"], list) - assert "English" in data["predictions"] - assert "Italian" in data["predictions"] - - mock_func.assert_called_once() # controlla che la funzione mock sia stata chiamata + assert set(data["predictions"]) == {"English", "Italian"} + + mock_func.assert_called_once() + + +def test_prod_environment_skips_model_loading(prod_app): + """Ensure prod environment does not load model artifacts during tests.""" + assert prod_app.APP_ENV == "prod" + assert prod_app.model is None + assert prod_app.le is None