diff --git a/.codex/config.toml b/.codex/config.toml
index 32f2af1..7f0488d 100644
--- a/.codex/config.toml
+++ b/.codex/config.toml
@@ -1,19 +1,46 @@
 model = "gpt-5.4"
 model_reasoning_effort = "medium"
+model_reasoning_summary = "concise"
+model_verbosity = "low"
+
 approval_policy = "on-request"
 sandbox_mode = "workspace-write"
 
+[sandbox_workspace_write]
+network_access = false
+
 [tools]
 web_search = false
 
+[profiles.quick]
+model = "gpt-5.4"
+model_reasoning_effort = "low"
+model_reasoning_summary = "concise"
+model_verbosity = "low"
+approval_policy = "on-request"
+sandbox_mode = "workspace-write"
+
 [profiles.review]
 model = "gpt-5.4"
 model_reasoning_effort = "high"
+plan_mode_reasoning_effort = "high"
+model_reasoning_summary = "concise"
+model_verbosity = "medium"
 approval_policy = "on-request"
 sandbox_mode = "workspace-write"
 
-[profiles.quick]
+[profiles.readonly]
 model = "gpt-5.4"
-model_reasoning_effort = "low"
+model_reasoning_effort = "medium"
+model_reasoning_summary = "concise"
+model_verbosity = "low"
+approval_policy = "on-request"
+sandbox_mode = "read-only"
+
+[profiles.docs]
+model = "gpt-5.4"
+model_reasoning_effort = "medium"
+model_reasoning_summary = "concise"
+model_verbosity = "medium"
 approval_policy = "on-request"
 sandbox_mode = "workspace-write"
\ No newline at end of file
diff --git a/README.md b/README.md
index 5e7f5b4..b9a963f 100644
--- a/README.md
+++ b/README.md
@@ -121,6 +121,17 @@
 Home server   Backend + DB
 ```
 
+## Observability
+
+Human Engine includes a minimal observability layer for backend logs:
+
+- structured JSON logging in the FastAPI backend
+- Promtail pipeline with docker log parsing and JSON extraction
+- Loki for log storage
+- Grafana dashboard for API requests, HealthKit sync, readiness recompute, errors, and pipeline trace
+
+See: [docs/architecture/OBSERVABILITY.md](docs/architecture/OBSERVABILITY.md)
+
 ## Architecture Principles
 
 - Simplicity over complexity
@@ -146,6 +157,7 @@ docs/          system documentation
 
 - [backend/README.md](backend/README.md)
 - [docs/architecture/ARCHITECTURE.md](docs/architecture/ARCHITECTURE.md)
+- [docs/architecture/OBSERVABILITY.md](docs/architecture/OBSERVABILITY.md)
 - [backend/ROADMAP.md](backend/ROADMAP.md)
 - [docs/models/model_v2_architecture.md](docs/models/model_v2_architecture.md)
 - [docs/product/CURRENT_STATE.md](docs/product/CURRENT_STATE.md)
diff --git a/backend/backend/app.py b/backend/backend/app.py
index 28b1a3e..4084046 100644
--- a/backend/backend/app.py
+++ b/backend/backend/app.py
@@ -1,16 +1,20 @@
 from __future__ import annotations
 
 import json
+import logging
+import time
+import uuid
 from datetime import datetime
 from typing import Any
 
 import psycopg
 import requests
 from fastapi import FastAPI, HTTPException, Query, Request
-from fastapi.responses import RedirectResponse
+from fastapi.responses import JSONResponse, RedirectResponse
 from pydantic import BaseModel, Field
 
 from backend.config import settings
+from backend.core.logging import configure_logging, log_event
 from backend.db import get_conn
 from backend.services.fitness_service import recompute_fitness_state
 from backend.services.ingest_service import process_one_strava_ingest_job
@@ -39,6 +43,9 @@
 from backend.services.healthkit_pipeline import ingest_and_process_healthkit_payload
 from backend.services.readiness_query import get_readiness_daily_for_date
 
+configure_logging()
+logger = logging.getLogger(__name__)
+
 app = FastAPI(title="Human Engine API", version="0.1.0")
 
 
@@ -83,6 +90,106 @@ class AIRideBriefingResponse(BaseModel):
     data: AIRideBriefingData
 
 
+def _get_request_id(request: Request) -> str:
+    request_id = getattr(request.state, "request_id", None)
+    if request_id:
+        return request_id
+
+    request_id = request.headers.get("x-request-id") or str(uuid.uuid4())
+    request.state.request_id = request_id
+    return request_id
+
+
+def _extract_user_id(request: Request) -> str | None:
+    path_user_id = request.path_params.get("user_id")
+    if path_user_id:
+        return str(path_user_id)
+
+    header_user_id = request.headers.get("x-user-id")
+    if header_user_id:
+        return header_user_id
+
+    return None
+
+
+@app.middleware("http")
+async def request_logging_middleware(request: Request, call_next):
+    started_at = time.perf_counter()
+    request_id = _get_request_id(request)
+    user_id = _extract_user_id(request)
+    request.state.user_id = user_id
+
+    log_event(
+        logger,
+        "api_request_started",
+        method=request.method,
+        path=request.url.path,
+        request_id=request_id,
+        user_id=user_id,
+    )
+
+    try:
+        response = await call_next(request)
+    except Exception as exc:
+        duration_ms = round((time.perf_counter() - started_at) * 1000, 2)
+        log_event(
+            logger,
+            "error",
+            level=logging.ERROR,
+            error_type=type(exc).__name__,
+            error=str(exc),
+            context="request_exception",
+            method=request.method,
+            path=request.url.path,
+            request_id=request_id,
+            user_id=_extract_user_id(request),
+            duration_ms=duration_ms,
+        )
+        raise
+
+    duration_ms = round((time.perf_counter() - started_at) * 1000, 2)
+    user_id = _extract_user_id(request)
+    request.state.user_id = user_id
+    response.headers["X-Request-ID"] = request_id
+
+    log_event(
+        logger,
+        "api_request_finished",
+        method=request.method,
+        path=request.url.path,
+        status_code=response.status_code,
+        duration_ms=duration_ms,
+        request_id=request_id,
+        user_id=user_id,
+    )
+
+    if response.status_code >= 400:
+        log_event(
+            logger,
+            "error",
+            level=logging.ERROR,
+            error_type="HTTPError",
+            error=f"request failed with status {response.status_code}",
+            context="http_response",
+            method=request.method,
+            path=request.url.path,
+            status_code=response.status_code,
+            request_id=request_id,
+            user_id=user_id,
+        )
+
+    return response
+
+
+@app.exception_handler(Exception)
+async def unhandled_exception_handler(request: Request, exc: Exception):
+    return JSONResponse(
+        status_code=500,
+        content={"detail": f"internal server error: {type(exc).__name__}"},
+        headers={"X-Request-ID": _get_request_id(request)},
+    )
+
+
 @app.get("/healthz")
 def healthz():
     return {"ok": True}
@@ -267,6 +374,17 @@ async def strava_webhook_receive(request: Request):
     if not object_type or object_id is None or owner_id is None or event_time_unix is None:
         raise HTTPException(status_code=400, detail="invalid strava webhook payload")
+
+    log_event(
+        logger,
+        "strava_webhook_received",
+        request_id=_get_request_id(request),
+        owner_id=owner_id,
+        object_id=object_id,
+        object_type=object_type,
+        aspect_type=aspect_type,
+        subscription_id=subscription_id,
+    )
 
     dedupe_key = (
         f"strava:{subscription_id}:{owner_id}:"
         f"{object_type}:{object_id}:{aspect_type}:{event_time_unix}"
     )
@@ -356,6 +474,21 @@
                     f"webhook_{aspect_type}",
                 ),
             )
+            job_row = cur.fetchone()
+            job_id = job_row[0] if job_row else None
+
+            if job_id is not None:
+                log_event(
+                    logger,
+                    "strava_ingest_job_created",
+                    request_id=_get_request_id(request),
+                    user_id=user_id,
+                    job_id=job_id,
+                    owner_id=owner_id,
+                    activity_id=object_id,
+                    aspect_type=aspect_type,
+                    webhook_event_id=webhook_event_id,
+                )
 
         conn.commit()
 
@@ -1217,12 +1350,32 @@ def ingest_and_process_healthkit_payload_endpoint(user_id: str, payload: HealthS
 
 @app.post("/api/v1/healthkit/full-sync/{user_id}")
 def full_sync_healthkit_payload_endpoint(user_id: str, payload: HealthSyncPayload):
+    started_at = time.perf_counter()
+    counts = {
+        "sleep": len(payload.sleepNights),
+        "hrv": len(payload.hrvSamples),
+        "rhr": len(payload.restingHeartRateDaily),
+    }
+
+    log_event(
+        logger,
+        "healthkit_full_sync_started",
+        user_id=user_id,
+        counts=counts,
+    )
+
     try:
         result = ingest_and_process_healthkit_payload(
             user_id=user_id,
             payload=payload,
         )
-        print("FULL_SYNC_RESULT:", result)
+        log_event(
+            logger,
+            "healthkit_full_sync_finished",
+            user_id=user_id,
+            counts=counts,
+            duration_ms=round((time.perf_counter() - started_at) * 1000, 2),
+        )
         return result
     except Exception as e:
         raise HTTPException(
@@ -1236,4 +1389,4 @@ def get_readiness_daily_endpoint(user_id: str, target_date: str):
     return get_readiness_daily_for_date(
         user_id=user_id,
         target_date=target_date,
-    )
\ No newline at end of file
+    )
diff --git a/backend/backend/core/__init__.py b/backend/backend/core/__init__.py
new file mode 100644
index 0000000..614d002
--- /dev/null
+++ b/backend/backend/core/__init__.py
@@ -0,0 +1 @@
+"""Core backend utilities."""
diff --git a/backend/backend/core/logging.py b/backend/backend/core/logging.py
new file mode 100644
index 0000000..b9ef1b7
--- /dev/null
+++ b/backend/backend/core/logging.py
@@ -0,0 +1,76 @@
+from __future__ import annotations
+
+import json
+import logging
+import sys
+from datetime import datetime, timezone
+from typing import Any
+
+
+SERVICE_NAME = "human-engine-backend"
+_RESERVED_RECORD_FIELDS = set(logging.makeLogRecord({}).__dict__.keys())
+
+
+def _json_default(value: Any) -> Any:
+    if isinstance(value, datetime):
+        return value.astimezone(timezone.utc).isoformat().replace("+00:00", "Z")
+    return str(value)
+
+
+class JsonFormatter(logging.Formatter):
+    def format(self, record: logging.LogRecord) -> str:
+        payload: dict[str, Any] = {
+            "timestamp": datetime.fromtimestamp(
+                record.created,
+                tz=timezone.utc,
+            ).isoformat(timespec="milliseconds").replace("+00:00", "Z"),
+            "level": record.levelname,
+            "service": getattr(record, "service", SERVICE_NAME),
+        }
+
+        event = getattr(record, "event", None)
+        if event:
+            payload["event"] = event
+
+        message = record.getMessage()
+        if message:
+            payload["message"] = message
+
+        for key, value in record.__dict__.items():
+            if key in _RESERVED_RECORD_FIELDS or key.startswith("_"):
+                continue
+            if key in payload or value is None:
+                continue
+            payload[key] = value
+
+        if record.exc_info:
+            payload["exception"] = self.formatException(record.exc_info)
+
+        return json.dumps(payload, ensure_ascii=False, default=_json_default)
+
+
+def configure_logging(level: int = logging.INFO) -> None:
+    handler = logging.StreamHandler(sys.stdout)
+    handler.setFormatter(JsonFormatter())
+
+    root_logger = logging.getLogger()
+    root_logger.handlers.clear()
+    root_logger.setLevel(level)
+    root_logger.addHandler(handler)
+
+    for logger_name in ("uvicorn", "uvicorn.access", "uvicorn.error"):
+        logger = logging.getLogger(logger_name)
+        logger.handlers.clear()
+        logger.propagate = True
+        logger.setLevel(level)
+
+
+def log_event(logger: logging.Logger, event: str, **kwargs: Any) -> None:
+    level = kwargs.pop("level", logging.INFO)
+    message = kwargs.pop("message", "")
+    extra = {
+        "service": SERVICE_NAME,
+        "event": event,
+        **{key: value for key, value in kwargs.items() if value is not None},
+    }
+    logger.log(level, message, extra=extra)
diff --git a/backend/backend/services/healthkit_pipeline.py b/backend/backend/services/healthkit_pipeline.py
index 72aa661..e7604d2 100644
--- a/backend/backend/services/healthkit_pipeline.py
+++ b/backend/backend/services/healthkit_pipeline.py
@@ -1,7 +1,9 @@
 from __future__ import annotations
 
+import logging
 from typing import Any
 
+from backend.core.logging import log_event
 from backend.schemas.healthkit import HealthSyncPayload
 from backend.services.health_recovery_daily import recompute_health_recovery_daily_for_date
 from backend.services.healthkit_ingest import save_healthkit_ingest_raw
@@ -9,6 +11,8 @@
 from backend.services.load_state_v2 import recompute_load_state_daily_v2
 from backend.services.readiness_daily import recompute_readiness_daily_for_date
 
+logger = logging.getLogger(__name__)
+
 
 def _collect_affected_dates(payload: HealthSyncPayload) -> list[str]:
     dates: set[str] = set()
@@ -61,6 +65,17 @@ def ingest_and_process_healthkit_payload(user_id: str, payload: HealthSyncPayloa
         )
         readiness_results.append(readiness_result)
 
+    log_event(
+        logger,
+        "healthkit_payload_processed",
+        user_id=user_id,
+        affected_dates_count=len(affected_dates),
+        sleep_count=len(payload.sleepNights),
+        hrv_count=len(payload.hrvSamples),
+        rhr_count=len(payload.restingHeartRateDaily),
+        readiness_days_recomputed=len(readiness_results),
+    )
+
     return {
         "ok": True,
         "user_id": user_id,
diff --git a/backend/backend/services/readiness_daily.py b/backend/backend/services/readiness_daily.py
index de0dca8..5f64180 100644
--- a/backend/backend/services/readiness_daily.py
+++ b/backend/backend/services/readiness_daily.py
@@ -1,12 +1,17 @@
 from __future__ import annotations
 
 import json
+import logging
+import time
 from typing import Any
 
 from fastapi import HTTPException
 
+from backend.core.logging import log_event
 from backend.db import get_conn
 
+logger = logging.getLogger(__name__)
+
 
 def _clamp(value: float, low: float, high: float) -> float:
     return max(low, min(high, value))
@@ -36,132 +41,163 @@ def _describe_readiness_status(score: float | None) -> str:
 
 
 def recompute_readiness_daily_for_date(user_id: str, target_date: str) -> dict[str, Any]:
-    with get_conn() as conn:
-        with conn.cursor() as cur:
-            cur.execute(
-                """
-                select freshness
-                from load_state_daily_v2
-                where user_id = %s
-                  and date = %s
-                  and version = 'v2';
-                """,
-                (user_id, target_date),
-            )
-            load_row = cur.fetchone()
-
-            cur.execute(
-                """
-                select
-                    recovery_score_simple,
-                    recovery_explanation_json
-                from health_recovery_daily
-                where user_id = %s
-                  and date = %s;
-                """,
-                (user_id, target_date),
-            )
-            recovery_row = cur.fetchone()
-
-            freshness = load_row[0] if load_row else None
-            recovery_score_simple = recovery_row[0] if recovery_row else None
-            recovery_explanation = recovery_row[1] if recovery_row else None
-
-            if isinstance(recovery_explanation, str):
-                recovery_explanation = json.loads(recovery_explanation)
-
-            if freshness is None and recovery_score_simple is None:
-                raise HTTPException(
-                    status_code=404,
-                    detail=f"no load or recovery data found for user_id={user_id} date={target_date}",
+    started_at = time.perf_counter()
+    log_event(
+        logger,
+        "readiness_recompute_started",
+        user_id=user_id,
+        target_date=target_date,
+    )
+
+    try:
+        with get_conn() as conn:
+            with conn.cursor() as cur:
+                cur.execute(
+                    """
+                    select freshness
+                    from load_state_daily_v2
+                    where user_id = %s
+                      and date = %s
+                      and version = 'v2';
+                    """,
+                    (user_id, target_date),
+                )
+                load_row = cur.fetchone()
+
+                cur.execute(
+                    """
+                    select
+                        recovery_score_simple,
+                        recovery_explanation_json
+                    from health_recovery_daily
+                    where user_id = %s
+                      and date = %s;
+                    """,
+                    (user_id, target_date),
+                )
+                recovery_row = cur.fetchone()
+
+                freshness = load_row[0] if load_row else None
+                recovery_score_simple = recovery_row[0] if recovery_row else None
+                recovery_explanation = recovery_row[1] if recovery_row else None
+
+                if isinstance(recovery_explanation, str):
+                    recovery_explanation = json.loads(recovery_explanation)
+
+                if freshness is None and recovery_score_simple is None:
+                    raise HTTPException(
+                        status_code=404,
+                        detail=f"no load or recovery data found for user_id={user_id} date={target_date}",
+                    )
+
+                freshness_norm = _normalize_freshness(freshness)
+
+                # V2 baseline formula:
+                # readiness = 60% load-state + 40% recovery-state
+                if freshness_norm is None:
+                    readiness_score_raw = recovery_score_simple
+                elif recovery_score_simple is None:
+                    readiness_score_raw = freshness_norm
+                else:
+                    readiness_score_raw = 0.6 * freshness_norm + 0.4 * recovery_score_simple
+
+                readiness_score = (
+                    _clamp(round(readiness_score_raw, 1), 0.0, 100.0)
+                    if readiness_score_raw is not None
+                    else None
                 )
-
-            freshness_norm = _normalize_freshness(freshness)
-
-            # V2 baseline formula:
-            # readiness = 60% load-state + 40% recovery-state
-            if freshness_norm is None:
-                readiness_score_raw = recovery_score_simple
-            elif recovery_score_simple is None:
-                readiness_score_raw = freshness_norm
-            else:
-                readiness_score_raw = 0.6 * freshness_norm + 0.4 * recovery_score_simple
-
-            readiness_score = (
-                _clamp(round(readiness_score_raw, 1), 0.0, 100.0)
-                if readiness_score_raw is not None
-                else None
-            )
-
-            good_day_probability = (
-                round(readiness_score / 100.0, 3)
-                if readiness_score is not None
-                else None
-            )
-
-            status_text = _describe_readiness_status(readiness_score)
-
-            explanation_json = {
-                "freshness": freshness,
-                "freshness_norm": freshness_norm,
-                "recovery_score_simple": recovery_score_simple,
-                "weights": {
-                    "freshness_norm": 0.6,
-                    "recovery_score_simple": 0.4,
-                },
-                "formula": "0.6 * freshness_norm + 0.4 * recovery_score_simple",
-                "recovery_explanation": recovery_explanation,
-            }
-
-            cur.execute(
-                """
-                insert into readiness_daily (
-                    user_id,
-                    date,
-                    freshness,
-                    recovery_score_simple,
-                    readiness_score_raw,
-                    readiness_score,
-                    good_day_probability,
-                    status_text,
-                    explanation_json,
-                    version,
-                    updated_at
+                good_day_probability = (
+                    round(readiness_score / 100.0, 3)
+                    if readiness_score is not None
+                    else None
                 )
-                values (
-                    %s, %s, %s, %s, %s, %s, %s, %s, %s::jsonb, 'v2', now()
+
+                status_text = _describe_readiness_status(readiness_score)
+
+                explanation_json = {
+                    "freshness": freshness,
+                    "freshness_norm": freshness_norm,
+                    "recovery_score_simple": recovery_score_simple,
+                    "weights": {
+                        "freshness_norm": 0.6,
+                        "recovery_score_simple": 0.4,
+                    },
+                    "formula": "0.6 * freshness_norm + 0.4 * recovery_score_simple",
+                    "recovery_explanation": recovery_explanation,
+                }
+
+                cur.execute(
+                    """
+                    insert into readiness_daily (
+                        user_id,
+                        date,
+                        freshness,
+                        recovery_score_simple,
+                        readiness_score_raw,
+                        readiness_score,
+                        good_day_probability,
+                        status_text,
+                        explanation_json,
+                        version,
+                        updated_at
+                    )
+                    values (
+                        %s, %s, %s, %s, %s, %s, %s, %s, %s::jsonb, 'v2', now()
+                    )
+                    on conflict (user_id, date, version) do update set
+                        freshness = excluded.freshness,
+                        recovery_score_simple = excluded.recovery_score_simple,
+                        readiness_score_raw = excluded.readiness_score_raw,
+                        readiness_score = excluded.readiness_score,
+                        good_day_probability = excluded.good_day_probability,
+                        status_text = excluded.status_text,
+                        explanation_json = excluded.explanation_json,
+                        updated_at = now();
+                    """,
+                    (
+                        user_id,
+                        target_date,
+                        freshness,
+                        recovery_score_simple,
+                        readiness_score_raw,
+                        readiness_score,
+                        good_day_probability,
+                        status_text,
+                        json.dumps(explanation_json),
+                    ),
                 )
-                on conflict (user_id, date, version) do update set
-                    freshness = excluded.freshness,
-                    recovery_score_simple = excluded.recovery_score_simple,
-                    readiness_score_raw = excluded.readiness_score_raw,
-                    readiness_score = excluded.readiness_score,
-                    good_day_probability = excluded.good_day_probability,
-                    status_text = excluded.status_text,
-                    explanation_json = excluded.explanation_json,
-                    updated_at = now();
-                """,
-                (
-                    user_id,
-                    target_date,
-                    freshness,
-                    recovery_score_simple,
-                    readiness_score_raw,
-                    readiness_score,
-                    good_day_probability,
-                    status_text,
-                    json.dumps(explanation_json),
-                ),
-            )
-            conn.commit()
-
-            return {
-                "ok": True,
-                "user_id": user_id,
-                "date": target_date,
-                "freshness": freshness,
-                "recovery_score_simple": recovery_score_simple,
-                "readiness_score": readiness_score,
-                "good_day_probability": good_day_probability,
-                "status_text": status_text,
-            }
+            conn.commit()
+
+        result = {
+            "ok": True,
+            "user_id": user_id,
+            "date": target_date,
+            "freshness": freshness,
+            "recovery_score_simple": recovery_score_simple,
+            "readiness_score": readiness_score,
+            "good_day_probability": good_day_probability,
+            "status_text": status_text,
+        }
+        log_event(
+            logger,
+            "readiness_recompute_finished",
+            user_id=user_id,
+            target_date=target_date,
+            readiness_score=readiness_score,
+            good_day_probability=good_day_probability,
+            duration_ms=round((time.perf_counter() - started_at) * 1000, 2),
+        )
+        return result
+    except Exception as exc:
+        log_event(
+            logger,
+            "error",
+            level=logging.ERROR,
+            error_type=type(exc).__name__,
+            error=str(exc),
+            context="readiness_recompute",
+            user_id=user_id,
+            target_date=target_date,
+        )
+        raise
diff --git a/backend/backend/worker.py b/backend/backend/worker.py
index c3bd15b..6a53792 100644
--- a/backend/backend/worker.py
+++ b/backend/backend/worker.py
@@ -2,14 +2,13 @@
 import time
 from datetime import datetime, timezone
 
+from backend.core.logging import configure_logging, log_event
 from backend.services.ingest_service import process_one_strava_ingest_job
 from backend.services.notification_service import send_daily_readiness
 
-logging.basicConfig(
-    level=logging.INFO,
-    format="%(asctime)s %(levelname)s %(message)s",
-)
+configure_logging()
+logger = logging.getLogger(__name__)
 
 DAILY_READINESS_USER_ID = "sergey"
 DAILY_READINESS_HOUR_UTC = 7
 
@@ -24,11 +23,15 @@ def maybe_send_daily_readiness() -> None:
 
     sent = send_daily_readiness(DAILY_READINESS_USER_ID, for_date=now.date())
     if sent:
-        logging.info("Daily readiness summary sent for %s", DAILY_READINESS_USER_ID)
+        log_event(
+            logger,
+            "daily_readiness_sent",
+            user_id=DAILY_READINESS_USER_ID,
+        )
 
 
 def main() -> None:
-    logging.info("Human Engine worker started")
+    log_event(logger, "worker_started")
 
     while True:
         try:
@@ -37,16 +40,30 @@
             result = process_one_strava_ingest_job()
 
             if result.get("message") == "no pending jobs":
-                logging.info("No pending jobs, sleeping 10s")
+                log_event(logger, "worker_idle", sleep_seconds=10)
                 time.sleep(10)
             else:
-                logging.info("Processed job: %s", result)
+                log_event(
+                    logger,
+                    "strava_ingest_job_processed",
+                    job_id=result.get("job_id"),
+                    user_id=result.get("user_id"),
+                    activity_id=result.get("activity_id"),
+                    result=result,
+                )
                 time.sleep(1)
 
         except Exception as e:
-            logging.exception("Worker error: %s", e)
+            log_event(
+                logger,
+                "error",
+                level=logging.ERROR,
+                error_type=type(e).__name__,
+                error=str(e),
+                context="worker_loop",
+            )
             time.sleep(5)
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/backend/tests/test_structured_logging.py b/backend/tests/test_structured_logging.py
new file mode 100644
index 0000000..f14533f
--- /dev/null
+++ b/backend/tests/test_structured_logging.py
@@ -0,0 +1,54 @@
+import json
+import logging
+
+from backend.core.logging import JsonFormatter, log_event
+
+
+def test_json_formatter_renders_structured_event():
+    formatter = JsonFormatter()
+    record = logging.LogRecord(
+        name="test",
+        level=logging.INFO,
+        pathname=__file__,
+        lineno=10,
+        msg="",
+        args=(),
+        exc_info=None,
+    )
+    record.event = "strava_webhook_received"
+    record.owner_id = 123
+
+    payload = json.loads(formatter.format(record))
+
+    assert payload["level"] == "INFO"
+    assert payload["service"] == "human-engine-backend"
+    assert payload["event"] == "strava_webhook_received"
+    assert payload["owner_id"] == 123
+    assert "timestamp" in payload
+    assert "message" not in payload
+
+
+def test_log_event_emits_json_payload():
+    messages: list[str] = []
+    logger = logging.getLogger("test_structured_logging")
+    logger.handlers.clear()
+    logger.setLevel(logging.INFO)
+    logger.propagate = False
+
+    handler = logging.StreamHandler()
+    handler.setFormatter(JsonFormatter())
+    handler.emit = lambda record: messages.append(handler.format(record))
+    logger.addHandler(handler)
+
+    log_event(
+        logger,
+        "healthkit_full_sync_started",
+        user_id="user-1",
+        counts={"sleep": 2, "hrv": 3, "rhr": 1},
+    )
+
+    payload = json.loads(messages[0])
+
+    assert payload["event"] == "healthkit_full_sync_started"
+    assert payload["user_id"] == "user-1"
+    assert payload["counts"] == {"sleep": 2, "hrv": 3, "rhr": 1}
diff --git a/docs/architecture/OBSERVABILITY.md b/docs/architecture/OBSERVABILITY.md
new file mode 100644
index 0000000..f6e299a
--- /dev/null
+++ b/docs/architecture/OBSERVABILITY.md
@@ -0,0 +1,186 @@
+# Observability
+
+## 1. Overview
+
+Observability in Human Engine is not a separate product layer; it is the way to see the deterministic core at work.
+
+Goals:
+
+- understand which pipeline steps were executed
+- verify that backend behavior is reproducible
+- explain why the system arrived at a specific state
+- quickly find errors in ingestion, recompute, and API paths
+
+Principles:
+
+- deterministic: events reflect the actual steps the system performed
+- reproducible: logs make it possible to reconstruct the processing sequence
+- explainable: key backend decisions are visible through structured events
+
+---
+
+## 2. Logging Architecture
+
+The backend writes structured JSON logs to stdout. Docker captures stdout as docker logs. Promtail reads the docker logs, parses the JSON payload, and ships the records to Loki. Grafana uses Loki as a datasource for analysis.
+
+```text
+Backend (JSON logs)
+  |
+  v
+Docker logs
+  |
+  v
+Promtail (docker + json parsing)
+  |
+  v
+Loki
+  |
+  v
+Grafana
+```
+
+The observability stack lives in `infra/monitoring/observability` and does not change the deterministic backend logic.
+
+---
+
+## 3. Structured Logging
+
+Backend logs are JSON objects.
+
+Required fields:
+
+- `timestamp`
+- `level`
+- `service`
+- `event`
+
+Common additional fields:
+
+- `user_id`
+- `request_id`
+- `path`
+- `duration_ms`
+- `job_id`
+- `activity_id`
+
+Example:
+
+```json
+{
+  "timestamp": "...",
+  "level": "INFO",
+  "service": "human-engine-backend",
+  "event": "readiness_recompute_finished",
+  "user_id": "sergey",
+  "readiness_score": 56.1
+}
+```
+
+The `event` field is the main unit of observability. Logs should describe system events, not free-form text lines.
+
+---
+
+## 4. Events
+
+Key events currently used by the backend:
+
+API:
+
+- `api_request_started`
+- `api_request_finished`
+
+HealthKit:
+
+- `healthkit_full_sync_started`
+- `healthkit_full_sync_finished`
+- `healthkit_payload_processed`
+
+Readiness:
+
+- `readiness_recompute_started`
+- `readiness_recompute_finished`
+
+Errors:
+
+- `error`
+
+Error events include context fields such as `error_type`, `error`, `context`, `path`, `request_id`, and `user_id` when available.
+
+---
+
+## 5. Promtail Pipeline
+
+Promtail pipeline:
+
+- `docker` stage unwraps Docker log records
+- `json` stage parses backend JSON payload
+- `labels` stage promotes only stable fields:
+  - `service`
+  - `event`
+  - `level`
+
+`user_id` is intentionally not a label.
+
+Reason: label values create Loki streams. User-specific labels increase cardinality and make storage and queries more expensive. Use `user_id` as a parsed JSON field in LogQL queries instead.
+
+---
+
+## 6. Grafana Dashboard
+
+The Grafana dashboard is the main operational view for backend behavior.
+
+Panels:
+
+- event volume: shows how many structured events are produced over time; answers "is the system active?"
+- durations: shows request or job durations; answers "what became slower?"
+- errors: shows `event="error"` records; answers "what failed and where?"
+- HealthKit sync: shows full-sync start/finish and payload processing; answers "did sync run and complete?"
+- readiness recompute: shows readiness recompute events; answers "was readiness recalculated?"
+- pipeline trace: shows ordered events for a user, request, or job; answers "what happened in this processing path?"
+
+The dashboard is for analysis and operations. It should not become a source of product logic.
+
+---
+
+## 7. How to Debug
+
+Find all events for a user:
+
+```logql
+{service="human-engine-backend"} | json | user_id="sergey"
+```
+
+Check readiness recompute:
+
+```logql
+{service="human-engine-backend", event="readiness_recompute_finished"}
+```
+
+Check errors:
+
+```logql
+{service="human-engine-backend", event="error"}
+```
+
+Trace one API request:
+
+```logql
+{service="human-engine-backend"} | json | request_id="..."
+```
+
+Check slow API requests:
+
+```logql
+{service="human-engine-backend", event="api_request_finished"} | json | duration_ms > 1000
+```
+
+---
+
+## 8. Design Principles
+
+- Do not log everything.
+- Log events, not arbitrary strings.
+- Keep labels minimal and stable.
+- Keep domain calculations outside observability.
+- Prefer readable logs over exhaustive logs.
+- Add fields only when they help debug a real system path.
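+
+---
+
+## 9. Example: Emitting an Event
+
+A minimal sketch of how a service emits structured events with the `log_event` helper from `backend/backend/core/logging.py`. The `refresh_strava_token` function and the `strava_token_refreshed` event name are hypothetical, for illustration only; the helper signature and the shared `error` event convention match the backend code above.
+
+```python
+import logging
+
+from backend.core.logging import log_event
+
+logger = logging.getLogger(__name__)
+
+
+def refresh_strava_token(user_id: str) -> None:
+    try:
+        # ... call the Strava API here (omitted in this sketch) ...
+        # Success path: one INFO event with structured fields,
+        # no free-form message text.
+        log_event(
+            logger,
+            "strava_token_refreshed",  # hypothetical event name
+            user_id=user_id,
+        )
+    except Exception as exc:
+        # Failure path: the shared "error" event with context fields,
+        # following the same convention as the middleware and worker.
+        log_event(
+            logger,
+            "error",
+            level=logging.ERROR,
+            error_type=type(exc).__name__,
+            error=str(exc),
+            context="strava_token_refresh",
+            user_id=user_id,
+        )
+        raise
+```
+
+Because `log_event` drops `None`-valued fields, optional context can be passed unconditionally.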
diff --git a/infra/monitoring/observability/README.md b/infra/monitoring/observability/README.md
new file mode 100644
index 0000000..762f7c5
--- /dev/null
+++ b/infra/monitoring/observability/README.md
@@ -0,0 +1,101 @@
+# Human Engine Observability
+
+A minimal observability stack for Human Engine logs:
+
+- Grafana
+- Loki
+- Promtail
+
+The stack lives in its own `docker compose` file and does not change the application's main compose setup.
+
+## What is collected
+
+The first version collects only the docker json logs of the application containers:
+
+- `human-engine-backend`
+- `human-engine-postgres`
+
+Promtail discovers the log files via a static path glob and reads them from the standard Docker json log files in `/var/lib/docker/containers/*/*-json.log`.
+
+## Storage on the HDD
+
+Data is stored on a separate disk in:
+
+- `/data/observability/loki`
+- `/data/observability/grafana`
+- `/data/observability/promtail`
+
+## Preparing the directories
+
+```bash
+sudo mkdir -p /data/observability/loki
+sudo mkdir -p /data/observability/grafana
+sudo mkdir -p /data/observability/promtail
+sudo chown -R 472:472 /data/observability/grafana
+sudo chown -R 10001:10001 /data/observability/loki
+```
+
+If Loki or Grafana complains about permissions, adjust the directory owner after the first start to match the actual UID of the process inside the container.
+
+## Running
+
+From the observability directory:
+
+```bash
+cd /srv/human-engine/infra/monitoring/observability
+docker compose up -d
+```
+
+Or with a single command from anywhere:
+
+```bash
+docker compose -f /srv/human-engine/infra/monitoring/observability/docker-compose.yml up -d
+```
+
+## Opening Grafana
+
+- URL: `http://localhost:3001`
+- login: `admin`
+- password: `admin`
+
+The `Loki` datasource is created automatically through provisioning.
+
+## Verification
+
+Check that the stack is up:
+
+```bash
+docker compose -f /srv/human-engine/infra/monitoring/observability/docker-compose.yml ps
+```
+
+Check that Loki is reachable:
+
+```bash
+curl http://localhost:3100/ready
+```
+
+Check that the backend is writing logs:
+
+```bash
+docker logs human-engine-backend --tail 20
+```
+
+In Grafana:
+
+1. Open `Explore`
+2. Select the `Loki` datasource
+3. Run a query; all collected docker logs carry the `job` label:
+
+```logql
+{job="docker"}
+```
+
+Backend logs can be narrowed down further with the promoted `service` label:
+
+```logql
+{service="human-engine-backend"}
+```
+
+## Retention
+
+Loki runs as a single local instance with filesystem storage and a retention period of `7 days` (`168h`).
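+
+## Querying Loki without Grafana
+
+A quick sanity check from the shell, assuming the default ports above. This is a sketch against Loki's standard HTTP range-query API; the LogQL string is only an example:
+
+```bash
+# Pull the latest backend log lines straight from Loki's query API.
+curl -G http://localhost:3100/loki/api/v1/query_range \
+  --data-urlencode 'query={service="human-engine-backend"}' \
+  --data-urlencode 'limit=10'
+```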
diff --git a/infra/monitoring/observability/docker-compose.yml b/infra/monitoring/observability/docker-compose.yml
new file mode 100644
index 0000000..cef33dd
--- /dev/null
+++ b/infra/monitoring/observability/docker-compose.yml
@@ -0,0 +1,43 @@
+services:
+  loki:
+    image: grafana/loki:3.0.0
+    container_name: human-engine-loki
+    restart: unless-stopped
+    command:
+      - -config.file=/etc/loki/loki-config.yaml
+    ports:
+      - "3100:3100"
+    volumes:
+      - ./loki-config.yaml:/etc/loki/loki-config.yaml:ro
+      - /data/observability/loki:/loki
+
+  promtail:
+    image: grafana/promtail:3.0.0
+    container_name: human-engine-promtail
+    restart: unless-stopped
+    command:
+      - -config.file=/etc/promtail/promtail-config.yaml
+    depends_on:
+      - loki
+    volumes:
+      - ./promtail-config.yaml:/etc/promtail/promtail-config.yaml:ro
+      - /data/observability/promtail:/var/lib/promtail
+      - /var/lib/docker/containers:/var/lib/docker/containers:ro
+      - /var/run/docker.sock:/var/run/docker.sock:ro
+
+  grafana:
+    image: grafana/grafana:11.1.0
+    container_name: human-engine-grafana
+    restart: unless-stopped
+    depends_on:
+      - loki
+    ports:
+      - "3001:3000"
+    environment:
+      GF_SECURITY_ADMIN_USER: admin
+      GF_SECURITY_ADMIN_PASSWORD: admin
+      GF_USERS_ALLOW_SIGN_UP: "false"
+    volumes:
+      - /data/observability/grafana:/var/lib/grafana
+      - ./grafana/provisioning:/etc/grafana/provisioning:ro
+
diff --git a/infra/monitoring/observability/grafana/provisioning/datasources/loki.yaml b/infra/monitoring/observability/grafana/provisioning/datasources/loki.yaml
new file mode 100644
index 0000000..fd2a2d8
--- /dev/null
+++ b/infra/monitoring/observability/grafana/provisioning/datasources/loki.yaml
@@ -0,0 +1,10 @@
+apiVersion: 1
+
+datasources:
+  - name: Loki
+    type: loki
+    access: proxy
+    url: http://loki:3100
+    isDefault: true
+    editable: false
+
diff --git a/infra/monitoring/observability/loki-config.yaml b/infra/monitoring/observability/loki-config.yaml
new file mode 100644
index 0000000..bd11108
--- /dev/null
+++ b/infra/monitoring/observability/loki-config.yaml
@@ -0,0 +1,38 @@
+auth_enabled: false
+
+server:
+  http_listen_port: 3100
+
+common:
+  path_prefix: /loki
+  replication_factor: 1
+  ring:
+    kvstore:
+      store: inmemory
+
+schema_config:
+  configs:
+    - from: 2024-01-01
+      store: tsdb
+      object_store: filesystem
+      schema: v13
+      index:
+        prefix: index_
+        period: 24h
+
+storage_config:
+  tsdb_shipper:
+    active_index_directory: /loki/index
+    cache_location: /loki/index_cache
+  filesystem:
+    directory: /loki/chunks
+
+compactor:
+  working_directory: /loki/compactor
+  compaction_interval: 10m
+  retention_enabled: true
+  delete_request_store: filesystem
+
+limits_config:
+  retention_period: 168h
+  allow_structured_metadata: false
diff --git a/infra/monitoring/observability/promtail-config.yaml b/infra/monitoring/observability/promtail-config.yaml
new file mode 100644
index 0000000..973eda1
--- /dev/null
+++ b/infra/monitoring/observability/promtail-config.yaml
@@ -0,0 +1,46 @@
+server:
+  http_listen_port: 9080
+  grpc_listen_port: 0
+
+positions:
+  filename: /var/lib/promtail/positions.yaml
+
+clients:
+  - url: http://loki:3100/loki/api/v1/push
+
+scrape_configs:
+  - job_name: docker
+
+    static_configs:
+      - targets:
+          - localhost
+        labels:
+          job: docker
+          __path__: /var/lib/docker/containers/*/*-json.log
+
+    pipeline_stages:
+      - docker: {}
+
+      - match:
+          selector: '{job="docker"}'
+          stages:
+            - json:
+                expressions:
+                  timestamp:
+                  level:
+                  service:
+                  event:
+                  message:
+                  user_id:
+                  job_id:
+                  activity_id:
+                  request_id:
+                  path:
+                  method:
+                  status_code:
+                  duration_ms:
+
+            - labels:
+                service:
+                event:
+                level:
\ No newline at end of file