Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
57379d9
docs(api): Phase 7b foundation implementation plan
strausmann May 17, 2026
5ea32cb
feat(api): add serialize_datetime_utc helper for RFC3339 with Z
strausmann May 17, 2026
163fd76
fix(api): TemplateRead emits RFC3339 datetimes with Z suffix
strausmann May 17, 2026
1acc15a
refactor(api): hoist api_client_with_seed fixture into conftest
strausmann May 17, 2026
71e388a
fix(api): PrinterRead + JobRead emit RFC3339 datetimes with Z suffix
strausmann May 17, 2026
828aae8
refactor(api): SQLAlchemy datetime columns are timezone-aware UTC
strausmann May 17, 2026
ed3c55b
fix(api): alembic data migration normalises naive datetimes to UTC
strausmann May 17, 2026
3d01502
fix(integration): suppress alembic fileConfig in migration test to re…
strausmann May 17, 2026
efbbc31
feat(api): derive_printer_id helper for deterministic UUIDv5
strausmann May 17, 2026
c8f46d5
feat(api): upsert_runtime_printer lifespan helper
strausmann May 17, 2026
9845ad4
refactor(api): driver.make_queue_printer accepts optional printer_id
strausmann May 17, 2026
b420e1c
fix(api): seed_templates aborts on empty loader cache instead of sile…
strausmann May 17, 2026
77ceb3a
fix(api): re-order lifespan — load_dir before seed_templates + upsert…
strausmann May 17, 2026
6943a1e
feat(api): verify_alembic_at_head fails fast on revision drift
strausmann May 17, 2026
4fb107d
feat(api): readiness response schema (CheckStatus + ReadinessResponse)
strausmann May 17, 2026
8020e40
feat(api): readiness aggregator — database/alembic/templates/printer_…
strausmann May 17, 2026
499003c
feat(api): readiness aggregator — remaining 4 checks
strausmann May 17, 2026
424948b
feat(api): expose /readiness deep-check endpoint
strausmann May 17, 2026
9790e52
test(api): regression guard — /healthz must answer 200 even when DB b…
strausmann May 17, 2026
47175f1
feat(status): StatusProbeProducer persists printer_status_cache rows
strausmann May 17, 2026
07aaba4
feat(status): PrinterStatus carries cache freshness + offline reason
strausmann May 17, 2026
411c79a
fix(status): /api/printers/{id}/status reads from cache, no sync SNMP
strausmann May 17, 2026
a096a90
feat(ui): proxy /docs, /openapi.json, /redoc to the backend
strausmann May 17, 2026
1db3bee
docs(api): document /healthz vs /readiness contract in the README
strausmann May 17, 2026
6ace6de
fix(api): readiness sse_bus check supports real EventBus + Settings cap
strausmann May 17, 2026
6ced8bb
fix(api): populate PrinterStatus.tape_loaded + error_state from cache
strausmann May 17, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,22 @@ curl http://localhost:8080/healthz # frontend → backend_reachable: true
| `POST` | `/jobs/{job_id}/resume` | Resume a job paused by tape mismatch (after the user changed the tape physically) | — |
| `POST` | `/printer/resume` | Resume the printer queue after a recoverable error halted it (tape empty / cover open / offline) | — |
| `GET` | `/healthz` | Liveness probe for orchestrators | — |
| `GET` | `/readiness` | Readiness probe — deep check for reverse-proxy routing | — |

### Health Probes

The backend exposes two HTTP probes with different semantics:

| Endpoint | Purpose | What it answers |
|----------|---------|-----------------|
| `GET /healthz` | Liveness — Docker / Kubernetes container restart signal | "the process and the event loop are alive" |
| `GET /readiness` | Readiness — reverse-proxy routing signal | "the process can serve traffic right now": database connectable, alembic at head, templates seeded, runtime printer matches DB, SNMP probe fresh, queue worker alive, SSE bus capacity ok |

`/readiness` returns HTTP 200 with a `status` of `ready` (all checks ok) or `degraded` (non-critical checks failing — still routable), and HTTP 503 with a `status` of `not-ready` when a critical check (database, alembic, template_seed) fails.

Pangolin's `targets[0].healthcheck.path` can use `/readiness` for deep checks instead of `/healthz`; Docker container healthchecks should stay on `/healthz` to avoid restart loops on transient DB failures.

See `docs/superpowers/specs/2026-05-17-phase-7b-foundation-design.md` for the full check list and rationale.

### `POST /print` request body

Expand Down
47 changes: 47 additions & 0 deletions backend/alembic/versions/20260517_phase7b_datetime_tz.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
"""Phase 7b — normalise existing datetime rows to timezone-aware ISO strings.

Existing rows from Phase 5 inserts contain naive datetimes (no TZ suffix)
that break the Go frontend's RFC3339 parser. This migration appends
`+00:00` to any value that does NOT already contain `+` or end with `Z`.
SQLite is dynamically typed so no ALTER TABLE is required — the new column
type from B4 only affects new inserts via the SQLAlchemy layer.

Revision ID: 20260517_phase7b_datetime_tz
Revises: b2668b6e8845
Create Date: 2026-05-17
"""

from alembic import op

# revision identifiers, used by Alembic.
revision = "20260517_phase7b_datetime_tz"
down_revision = "b2668b6e8845"
branch_labels = None
depends_on = None


# (table name, datetime column names) pairs. upgrade() issues one UPDATE per
# listed column, so a column missing from this list is left untouched.
_TABLES_DT = [
    ("templates", ["created_at", "updated_at"]),
    ("printers", ["created_at", "updated_at"]),
    ("jobs", ["created_at", "updated_at", "started_at", "finished_at"]),
    ("presets", ["created_at", "updated_at"]),
    ("printer_state", ["updated_at"]),
    ("printer_status_cache", ["captured_at", "updated_at"]),
]


def upgrade() -> None:
    """Append ``+00:00`` to every stored datetime that carries no UTC offset.

    Runs one UPDATE per (table, column) pair in ``_TABLES_DT``. A value is
    rewritten only when it is non-NULL and shows no timezone marker, i.e. it

    * contains no ``+`` (positive offset),
    * does not end with ``Z``, and
    * does not end with ``-HH:MM`` (negative offset).

    The statement is safe to re-run: once ``+00:00`` has been appended the
    value contains ``+`` and is skipped.
    """
    # Table and column names come from the module-level constant, never from
    # user input, so interpolating them into the SQL text is safe here.
    for table, cols in _TABLES_DT:
        for col in cols:
            op.execute(
                f"UPDATE {table} SET {col} = {col} || '+00:00' "
                f"WHERE {col} IS NOT NULL "
                f"AND {col} NOT LIKE '%+%' "
                f"AND {col} NOT LIKE '%Z' "
                # A trailing "-HH:MM" is an offset too. Without this guard a
                # value such as "...T10:00:00-05:00" would be turned into the
                # unparseable "...-05:00+00:00". The date part's own dashes
                # never match because the pattern is anchored at the end and
                # requires the "NN:NN" shape after the dash.
                f"AND {col} NOT GLOB '*-[0-9][0-9]:[0-9][0-9]'"
            )


def downgrade() -> None:
    """Do nothing: this revision has no reverse step.

    Reverting would restore the naive-datetime state, which is exactly the
    bug this migration fixes, so the downgrade is intentionally a no-op.
    """
    return None
3 changes: 2 additions & 1 deletion backend/app/api/routes/print.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import logging
from typing import Any
from uuid import UUID

from fastapi import APIRouter, HTTPException, Request, status
from fastapi.responses import JSONResponse
Expand Down Expand Up @@ -32,7 +33,7 @@
class _PrinterResumeResponse(BaseModel):
"""200 response body for POST /printer/resume."""

printer_id: str
printer_id: UUID | str
state: str


Expand Down
75 changes: 30 additions & 45 deletions backend/app/api/routes/printers.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@

from __future__ import annotations

import asyncio
import dataclasses
import logging
from datetime import UTC, datetime
Expand Down Expand Up @@ -166,65 +165,51 @@ def _error_label(block: Any) -> str | None:
@router.get(
"/{printer_id}/status",
response_model=PrinterStatus,
summary="Force a fresh printer status probe",
summary="Return the latest cached printer status",
description=(
"Sends an ESC i S command to the printer over TCP/9100. "
"The result is written back to ``printer_status_cache`` and returned. "
"Returns 503 when the printer is unreachable."
"Returns the most recent status written by the background SNMP probe worker. "
"The response is served from ``printer_status_cache`` — no synchronous SNMP "
"probe is performed, so the response always returns in <10 ms. "
"When no probe has completed yet ``online`` is ``null`` and ``note`` explains why. "
"Returns 404 when the printer is not registered."
),
)
async def get_printer_status(
printer_id: UUID,
session: SessionDep,
) -> PrinterStatus:
"""Probe the printer and update the cache."""
printer = await _get_printer_or_404(session, printer_id)
"""Return the latest cached status for a printer; no sync SNMP probe."""
await _get_printer_or_404(session, printer_id)

host: str | None = printer.connection.get("host") if printer.connection else None
if not host:
raise HTTPException(
status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
detail=f"printer {printer_id} has no 'host' in connection config",
row = await cache_repo.get(session, printer_id)
if row is None or row.captured_at is None:
return PrinterStatus(
printer_id=printer_id,
online=None,
captured_at=None,
note="No probe yet — wait up to 30s for first probe cycle",
)

port: int = int(printer.connection.get("port", 9100))

try:
result = await asyncio.to_thread(_probe_status_sync, host, port)
except OSError as exc:
raise HTTPException(
status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
detail=f"printer {printer_id} unreachable: {exc}",
) from exc
parsed = row.parsed or {}
captured = row.captured_at
if captured.tzinfo is None:
captured = captured.replace(tzinfo=UTC)
age_s = int((datetime.now(UTC) - captured).total_seconds())

block = result["block"]
raw: bytes = result["raw"]
now = datetime.now(UTC)

parsed: dict[str, Any] = {
"media_width_mm": block.media_width_mm,
"media_type": block.media_type.name,
"status_type": block.status_type.name,
"phase_type": block.phase_type.name,
"errors": int(block.errors),
"tape_color": block.tape_color.name,
"text_color": block.text_color.name,
}
loaded_tape_mm = parsed.get("loaded_tape_mm")
tape_loaded = f"{loaded_tape_mm}mm" if loaded_tape_mm else None

await cache_repo.upsert(
session,
printer_id,
raw_block=raw,
parsed=parsed,
captured_at=now,
)
error_flags = parsed.get("error_flags") or []
error_state = ", ".join(error_flags) if error_flags else None

return PrinterStatus(
printer_id=printer_id,
online=True,
tape_loaded=_tape_label(block),
error_state=_error_label(block),
captured_at=now,
online=parsed.get("online"),
tape_loaded=tape_loaded,
error_state=error_state,
captured_at=row.captured_at,
last_probe_age_s=age_s,
last_error=parsed.get("last_error"),
)
Comment thread
strausmann marked this conversation as resolved.
Comment thread
strausmann marked this conversation as resolved.


Expand Down
129 changes: 126 additions & 3 deletions backend/app/db/lifespan.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,28 @@

Call order in main.py lifespan:
1. run_migrations() — apply pending Alembic revisions
2. recover_inflight_jobs() — mark stale QUEUED/PRINTING jobs as failed_restart
3. seed_templates() — upsert YAML seed templates into DB
4. ensure_printer_state() — create missing printer_state rows
1b. verify_alembic_at_head() — assert DB revision == script head (fail fast)
2. _discover_plugins() — register integration + model plugins (idempotent)
3. TemplateLoader.load_dir() — populate in-memory template cache (Cluster 1a)
4. recover_inflight_jobs() — mark stale QUEUED/PRINTING jobs as failed_restart
5. seed_templates() — YAML → DB upsert (defensive check on cache)
6. upsert_runtime_printer() — env → DB Printer row (Cluster 1b)
7. ensure_printer_state() — create missing printer_state rows per Printer

Note: steps 2 and 3 must precede step 5 — TemplateLoader.load_dir() validates
templates against IntegrationRegistry (populated in step 2), and seed_templates()
reads from the cache that load_dir() populates in step 3.
"""

from __future__ import annotations

from uuid import UUID

from sqlalchemy.ext.asyncio import AsyncSession

from app.config import Settings
from app.models.printer import Printer
from app.services.printer_identity import derive_printer_id
from app.services.template_loader import TemplateLoader


Expand Down Expand Up @@ -49,6 +62,55 @@ def _upgrade() -> None:
await asyncio.to_thread(_upgrade)


async def verify_alembic_at_head(settings: Settings) -> None:
    """Fail startup when the database revision differs from the script head.

    Compares the revision recorded in the database with the head revision of
    the Alembic script directory configured by ``backend/alembic.ini``. The
    blocking Alembic / SQLAlchemy calls run in a worker thread via
    ``asyncio.to_thread``.

    Args:
        settings: Supplies ``database_url``. Passed in explicitly so callers
            (including tests) can point the check at any database.

    Raises:
        RuntimeError: If the database revision is not equal to the script
            head (this includes an unstamped database, whose revision is
            ``None``).
    """
    import asyncio
    from pathlib import Path as _Path

    from alembic.config import Config
    from alembic.runtime.migration import MigrationContext
    from alembic.script import ScriptDirectory
    from sqlalchemy import create_engine

    # This file lives at backend/app/db/lifespan.py, so parents[2] is backend/.
    backend_root = _Path(__file__).resolve().parents[2]
    ini_path = backend_root / "alembic.ini"

    def _read_revisions() -> tuple[str | None, str | None]:
        """Return ``(database revision, script head)`` using sync APIs."""
        alembic_cfg = Config(str(ini_path))
        # Flag intended to stop alembic from calling
        # logging.config.fileConfig(), which would reconfigure the root
        # logger and break pytest caplog fixtures.
        alembic_cfg.attributes["configure_logger"] = False
        expected = ScriptDirectory.from_config(alembic_cfg).get_current_head()

        # The check uses a synchronous engine, so drop the async driver suffix.
        engine = create_engine(settings.database_url.replace("+aiosqlite", ""))
        try:
            with engine.connect() as connection:
                actual = MigrationContext.configure(connection).get_current_revision()
        finally:
            engine.dispose()

        return actual, expected

    current_rev, head_rev = await asyncio.to_thread(_read_revisions)
    if current_rev == head_rev:
        return
    raise RuntimeError(
        f"Alembic migration drift detected: DB at {current_rev!r}, expected head {head_rev!r}"
    )


async def recover_inflight_jobs(session: AsyncSession) -> int:
"""Mark any QUEUED or PRINTING jobs as FAILED_RESTART.

Expand All @@ -70,8 +132,16 @@ async def seed_templates(session: AsyncSession, loader: type[TemplateLoader]) ->
main.py can call by name, and is the natural seam for unit tests that
want to inject a mock loader without touching the real registry.

Raises RuntimeError if the loader cache is empty — calling seed_templates
without first running TemplateLoader.load_dir() is a lifespan-ordering bug.

Returns the count of rows touched (inserted or updated).
"""
if not loader._cache:
raise RuntimeError(
"seed_templates called with empty TemplateLoader cache — "
"TemplateLoader.load_dir() must run before seed_templates()."
)
return await loader.seed_db(session)


Expand Down Expand Up @@ -102,3 +172,56 @@ async def ensure_printer_state(session: AsyncSession) -> int:
await session.commit()

return created


async def upsert_runtime_printer(
    session: AsyncSession,
    settings: Settings,
) -> UUID | None:
    """Materialise one Printer row from env config; return its deterministic id.

    The row is keyed by ``derive_printer_id(model, host, port)``. When a row
    with that id already exists it is refreshed in place; otherwise a new row
    is added. Both paths write the same set of fields (``name``, ``model``,
    ``backend``, ``connection``, ``enabled``) so that a restart with a changed
    environment — for example a different ``printer_backend`` — never leaves a
    stale value behind on the existing row.

    The session is flushed but not committed; committing is left to the
    caller.

    Args:
        session: Async session used to look up and persist the Printer row.
        settings: Source of the printer model, host/port, backend and SNMP
            options.

    Returns:
        The printer id, or ``None`` when model, host or port is empty/zero
        (no runtime printer is declared in the environment).
    """
    model: str = settings.printer_model
    # Resolve host: pt750w takes precedence, ql820 is the fallback. The port
    # follows whichever host was chosen.
    host: str = settings.pt750w_host or settings.ql820_host or ""
    port: int = settings.pt750w_port if settings.pt750w_host else settings.ql820_port

    if not (model and host and port):
        return None

    printer_id: UUID = derive_printer_id(model, host, port)
    name: str = f"{model} ({host})"
    stored_model: str = model.lower()
    backend = settings.printer_backend
    connection: dict[str, object] = {
        "host": host,
        "port": port,
        "snmp": settings.printer_discover_via_snmp,
        "snmp_community": settings.printer_snmp_community,
    }

    existing = await session.get(Printer, printer_id)
    if existing is None:
        session.add(
            Printer(
                id=printer_id,
                name=name,
                model=stored_model,
                backend=backend,
                connection=connection,
                enabled=True,
            )
        )
    else:
        existing.name = name
        # model and backend are refreshed as well so the update path writes
        # exactly what the insert path would have written.
        existing.model = stored_model
        existing.backend = backend
        existing.connection = connection
        existing.enabled = True

    await session.flush()
    return printer_id
Loading
Loading