Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions .dockerignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,7 @@ venv/
data/
notebooks/
scripts/
src/
*.ipynb
*.env
.DS_Store
.env
.env
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -46,4 +46,3 @@ Thumbs.db
*.tar
*.bak
*.swp
Dockerfile.dev
3 changes: 3 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ FROM python:3.11-slim

# Per disabilitare buffering dei logs
ENV PYTHONUNBUFFERED=1
ENV APP_ENV=prod
ENV MODEL_VERSION=v2

WORKDIR /app

Expand All @@ -14,6 +16,7 @@ RUN pip install --no-cache-dir -r requirements.txt

# Copiamo app e modelli nella directory di lavoro
COPY app/ ./app
COPY src/ ./src
COPY models/ ./models

EXPOSE 8000
Expand Down
22 changes: 22 additions & 0 deletions Dockerfile.dev
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# =========================
# Dockerfile - Development
# =========================
FROM python:3.11-slim

# Disable stdout/stderr buffering so container logs appear immediately
ENV PYTHONUNBUFFERED=1
ENV APP_ENV=dev
ENV MODEL_VERSION=v2

WORKDIR /app

# Install dependencies first so this layer stays cached across code edits
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy only src/ and the models (app/ code is supplied as a bind mount at run time)
COPY src/ ./src
COPY models/ ./models

EXPOSE 8000

# --reload watches the bind-mounted code and restarts uvicorn on change
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"]
40 changes: 39 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,44 @@ docker build -t language-api .
docker run -d -p 8000:8000 language-api
```

### 📦 Versionamento dei modelli

L'API carica gli artefatti dal percorso `models/text/language_classification/<MODEL_VERSION>`.
Imposta `MODEL_VERSION` (o `MODEL_DIR`) come variabile d'ambiente per scegliere quale modello servire:

```bash
docker run -e MODEL_VERSION=v2 -p 8000:8000 language-api
```

Assicurati che la cartella corrispondente contenga `best_classifier.pkl` e `label_encoder.pkl`.

### 🔁 Development con Docker

Per iterare rapidamente senza rebuild continui, usa l'immagine di sviluppo e monta il codice locale:

```bash
docker build -f Dockerfile.dev -t language-api-dev .
docker run --rm -it \
-v $(pwd)/app:/app/app \
-v $(pwd)/configs:/app/configs \
-v $(pwd)/src:/app/src \
-v $(pwd)/tests:/app/tests \
-p 8000:8000 \
language-api-dev
```

`uvicorn --reload` rileva automaticamente le modifiche nella directory montata. Imposta `-e MODEL_VERSION=vX` se vuoi testare un modello diverso.

## 🧠 Training del modello

Usa `src/model_training.py` per addestrare un nuovo classificatore partendo dalla configurazione YAML:

```bash
python src/model_training.py --config configs/model_config.yaml
```

Il training produce `best_classifier.pkl` e `label_encoder.pkl` nella cartella `output.model_dir` definita nel file di config, oltre a salvare un file di risultati con timestamp. Aggiorna `MODEL_VERSION` (o `MODEL_DIR`) per puntare al modello appena creato.

## 📡 Endpoint disponibili

| Endpoint | Metodo | Descrizione |
Expand Down Expand Up @@ -104,4 +142,4 @@ pytest -v
```

---
☑️ **DISCLAIMER** : Vibe-coded with ChatGPT - v5 and v4.1
☑️ **DISCLAIMER** : Vibe-coded with OpenAI ChatGPT - v5 and v4.1
53 changes: 43 additions & 10 deletions app/main.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
import logging
import os
import threading
import time
from fastapi import FastAPI, Request
from pydantic import BaseModel
from typing import List

import joblib
import os
from fastapi import FastAPI, Request
from pydantic import BaseModel

# clean_texts usata da predict_language_safe nella pipeline
from app.utils import predict_language_safe, clean_texts
import threading
from app.utils import predict_language_safe, clean_texts

# -------------------------------
# Logging setup
Expand All @@ -23,19 +25,50 @@
# Config
# -------------------------------
CONFIDENCE_THRESHOLD = 0.5
MODEL_VERSION = "v1"
MODEL_DIR = f"models/text/language_classification/{MODEL_VERSION}"
MODEL_VERSION = os.getenv("MODEL_VERSION", "v1")
DEFAULT_MODEL_DIR = f"models/text/language_classification/{MODEL_VERSION}"
MODEL_DIR = os.getenv("MODEL_DIR", DEFAULT_MODEL_DIR)
MODEL_FILENAME = "best_classifier.pkl"
LABEL_ENCOD_FILENAME = "label_encoder.pkl"

APP_ENV = os.getenv("APP_ENV", "dev").lower()
SKIP_MODEL_LOADING = os.getenv("SKIP_MODEL_LOADING", "false").lower() in {"1", "true", "yes"}

model_lock = threading.Lock()

def load_artifacts():
    """Load the classifier and label encoder from MODEL_DIR.

    Returns:
        (model, label_encoder) on success. In a prod environment a missing
        artifact is tolerated: a warning is logged and (None, None) is
        returned. In any other environment the FileNotFoundError is re-raised.
    """
    model_path = os.path.join(MODEL_DIR, MODEL_FILENAME)
    encoder_path = os.path.join(MODEL_DIR, LABEL_ENCOD_FILENAME)
    try:
        if not os.path.exists(MODEL_DIR):
            raise FileNotFoundError(f"Model directory {MODEL_DIR} does not exist.")
        if not os.path.isfile(model_path):
            raise FileNotFoundError(f"Model file {MODEL_FILENAME} not found in {MODEL_DIR}.")
        if not os.path.isfile(encoder_path):
            raise FileNotFoundError(f"Label encoder file {LABEL_ENCOD_FILENAME} not found in {MODEL_DIR}.")
    except FileNotFoundError as exc:
        if APP_ENV != "prod":
            raise
        # In prod the API must still boot (health checks, tests) without artifacts.
        logger.warning("Skipping model load in prod environment: %s", exc)
        return None, None

    return joblib.load(model_path), joblib.load(encoder_path)

# -------------------------------
# Load model and encoder
# -------------------------------
model = joblib.load(os.path.join(MODEL_DIR, MODEL_FILENAME))
le = joblib.load(os.path.join(MODEL_DIR, LABEL_ENCOD_FILENAME))
logger.info(f"Loaded model version {MODEL_VERSION} from {MODEL_DIR}")
model = None
le = None

if SKIP_MODEL_LOADING:
logger.info("Skipping model load because SKIP_MODEL_LOADING is set.")
else:
model, le = load_artifacts()
if model is None or le is None:
logger.info("Model artifacts not loaded; predictions disabled for this run.")
else:
logger.info(f"Loaded model version {MODEL_VERSION} from {MODEL_DIR}")

# -------------------------------
# FastAPI setup
Expand Down
4 changes: 4 additions & 0 deletions pytest.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
[pytest]
# NOTE(review): tests/dev is deliberately absent from testpaths — those tests
# require trained model artifacts on disk (they pytest.fail otherwise).
# Confirm this omission is intentional before relying on default `pytest` runs.
testpaths =
tests/prod
tests/docker
52 changes: 52 additions & 0 deletions tests/dev/test_api_dev.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import os
import sys

import pytest
from fastapi.testclient import TestClient


@pytest.fixture
def dev_app(monkeypatch):
    """Freshly import app.main under a dev environment serving the v2 model."""
    monkeypatch.setenv("APP_ENV", "dev")
    monkeypatch.setenv("MODEL_VERSION", "v2")
    for var in ("SKIP_MODEL_LOADING", "MODEL_DIR"):
        monkeypatch.delenv(var, raising=False)
    # Drop any cached module so the import below re-reads the patched env vars.
    sys.modules.pop("app.main", None)
    import app.main as main

    if main.model is None or main.le is None:
        pytest.fail("Model artifacts must be available in dev environment for these tests.")
    return main


@pytest.fixture
def client(dev_app):
    """HTTP test client bound to the freshly imported dev app."""
    test_client = TestClient(dev_app.app)
    return test_client


def test_status(client):
    """The /status health endpoint must answer 200 with an ok payload."""
    resp = client.get("/status")
    body = resp.json()
    assert resp.status_code == 200
    assert body == {"status": "ok"}


def test_model_version(client, dev_app):
    """/model_version must report the v2 model selected via MODEL_VERSION."""
    resp = client.get("/model_version")
    assert resp.status_code == 200
    payload = resp.json()
    assert payload["model_version"] == dev_app.MODEL_VERSION
    assert dev_app.MODEL_VERSION == "v2"
    # The configured model directory must end in the v2 version folder.
    last_component = os.path.normpath(dev_app.MODEL_DIR).split(os.sep)[-1]
    assert last_component == "v2"


def test_language_detection(client):
    """/language_detection must return one prediction per submitted text."""
    texts = ["Hello world!", "Ciao come state?"]
    resp = client.post("/language_detection", json={"texts": texts})
    assert resp.status_code == 200
    body = resp.json()
    assert "predictions" in body
    predictions = body["predictions"]
    assert isinstance(predictions, list)
    assert len(predictions) == len(texts)
89 changes: 89 additions & 0 deletions tests/docker/test_dockerfiles.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import re
from pathlib import Path

REPO_ROOT = Path(__file__).resolve().parents[2]


def read_dockerfile(path: str) -> str:
    """Return the UTF-8 text of *path*, resolved relative to the repo root."""
    full_path = REPO_ROOT / path
    return full_path.read_text(encoding="utf-8")


def extract_copy_sources(contents: str):
    """Yield every COPY source path in a Dockerfile, trailing slash stripped.

    Fixes the previous regex-based parser, whose first capture group was a
    single token (``[^\\s]+``): multi-source lines such as ``COPY a b ./dest``
    only ever yielded ``a``, and option flags like ``--from=builder`` or
    ``--chown=...`` were misreported as the source. Here all tokens except
    the last (the destination) are sources, after dropping ``--`` flags.

    Args:
        contents: full text of a Dockerfile.

    Yields:
        Each source path as written in the file, without a trailing ``/``.
    """
    for line in contents.splitlines():
        stripped = line.strip()
        # Skip blanks and comment lines.
        if not stripped or stripped.startswith("#"):
            continue
        tokens = stripped.split()
        if not tokens or tokens[0].upper() != "COPY":
            continue
        # Drop --from/--chown/--link style options; they are not filesystem sources.
        args = [tok for tok in tokens[1:] if not tok.startswith("--")]
        if len(args) < 2:
            # Malformed COPY: needs at least one source and a destination.
            continue
        for src in args[:-1]:
            yield src.rstrip("/")


def test_production_dockerfile_sets_app_env():
    """The production image must bake in APP_ENV=prod."""
    dockerfile_text = read_dockerfile("Dockerfile")
    assert "ENV APP_ENV=prod" in dockerfile_text


def test_development_dockerfile_sets_app_env():
    """The development image must bake in APP_ENV=dev."""
    dockerfile_text = read_dockerfile("Dockerfile.dev")
    assert "ENV APP_ENV=dev" in dockerfile_text


def test_dockerfiles_define_model_version():
    """Both images must pin a default model via ENV MODEL_VERSION."""
    for dockerfile in ("Dockerfile", "Dockerfile.dev"):
        assert "ENV MODEL_VERSION=" in read_dockerfile(dockerfile)


def test_production_copy_sources_exist():
    """Every COPY source in the production Dockerfile must exist in the repo."""
    missing = []
    for src in extract_copy_sources(read_dockerfile("Dockerfile")):
        if not (REPO_ROOT / src).exists():
            missing.append(src)
    assert not missing, f"Dockerfile COPY sources missing from repo: {missing}"


def test_development_copy_sources_exist():
    """Every COPY source in the dev Dockerfile must exist in the repo."""
    missing = []
    for src in extract_copy_sources(read_dockerfile("Dockerfile.dev")):
        if not (REPO_ROOT / src).exists():
            missing.append(src)
    assert not missing, f"Dockerfile.dev COPY sources missing from repo: {missing}"


def test_requirements_cover_web_stack():
    """requirements.txt must list the FastAPI/uvicorn web stack."""
    requirements = (REPO_ROOT / "requirements.txt").read_text(encoding="utf-8").lower()
    assert "fastapi" in requirements
    assert "uvicorn" in requirements


def test_uvicorn_entrypoint_module_exists():
    """The production CMD must target app.main:app, and that module must exist."""
    cmd_line = ""
    for line in read_dockerfile("Dockerfile").splitlines():
        if line.strip().startswith("CMD"):
            cmd_line = line
            break
    assert "app.main:app" in cmd_line
    assert (REPO_ROOT / "app" / "main.py").exists()


def test_models_directory_tracked_even_without_artifacts():
    """A models/ directory must be present even when no artifacts are committed."""
    models_dir = REPO_ROOT / "models"
    # is_dir() is False for a missing path, so it covers the exists() check too.
    assert models_dir.is_dir()


def test_dev_dockerfile_does_not_copy_app_code():
    """The dev image must not bake in app/ code (it is bind-mounted at run time)."""
    for line in read_dockerfile("Dockerfile.dev").splitlines():
        if not line.strip().upper().startswith("COPY"):
            continue
        assert "app/" not in line.split(), "Dev Dockerfile should rely on bind mount for app/ code"


def test_dockerignore_allows_app_and_src():
    """.dockerignore must not exclude app/ or src/ from the build context."""
    ignored = (REPO_ROOT / ".dockerignore").read_text(encoding="utf-8").splitlines()
    assert "app/" not in ignored, "app/ must be included in build context"
    assert "src/" not in ignored, "src/ must be included in build context"
62 changes: 62 additions & 0 deletions tests/prod/test_api_prod.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import sys
from unittest.mock import patch

import pytest
from fastapi.testclient import TestClient


def mock_predict_language_safe(_, __, texts, **kwargs):
    """Stand-in predictor: returns one canned label per input text, two at most."""
    canned_labels = ["English", "Italian"]
    return canned_labels[: len(texts)]


@pytest.fixture
def prod_app(monkeypatch, tmp_path):
    """Freshly import app.main in prod mode, pointed at a nonexistent model dir."""
    monkeypatch.setenv("APP_ENV", "prod")
    missing_dir = tmp_path / "missing"
    monkeypatch.setenv("MODEL_DIR", str(missing_dir))
    monkeypatch.delenv("SKIP_MODEL_LOADING", raising=False)
    # Drop any cached module so the import below re-reads the patched env vars.
    sys.modules.pop("app.main", None)
    import app.main as main

    return main


@pytest.fixture
def client(prod_app):
    """HTTP test client bound to the prod-mode app instance."""
    test_client = TestClient(prod_app.app)
    return test_client


def test_status_endpoint(client):
    """Health check endpoint must work even without model artifacts."""
    resp = client.get("/status")
    body = resp.json()
    assert resp.status_code == 200
    assert body == {"status": "ok"}


def test_model_version_endpoint(client):
    """Model version endpoint must expose a string version string."""
    resp = client.get("/model_version")
    assert resp.status_code == 200
    payload = resp.json()
    assert "model_version" in payload
    assert isinstance(payload["model_version"], str)


def test_language_detection_with_mock(client, prod_app):
    """Language detection endpoint with the real predictor mocked out."""
    payload = {"texts": ["Hello world!", "Ciao come state?"]}
    with patch("app.main.predict_language_safe", side_effect=mock_predict_language_safe) as mock_func:
        resp = client.post("/language_detection", json=payload)
        assert resp.status_code == 200
        body = resp.json()
        assert "predictions" in body
        predictions = body["predictions"]
        assert isinstance(predictions, list)
        assert set(predictions) == {"English", "Italian"}

        # The endpoint must route through the (patched) predictor exactly once.
        mock_func.assert_called_once()


def test_prod_environment_skips_model_loading(prod_app):
    """Ensure prod environment does not load model artifacts during tests."""
    assert prod_app.APP_ENV == "prod"
    assert prod_app.model is None and prod_app.le is None