Skip to content

Commit cb57e84

Browse files
committed
observe(hls): log first playlist push + first stream.m3u8 per camera
The previous regex fix (6c83fea) didn't resolve the Pi playback issue — segments keep pushing but the player still spins after 1200+ segments. Guessing further burns time; we need ground truth from Fly logs. Add two one-shot INFO logs that fire exactly once per camera id per backend restart: - ``hls: first playlist push`` — on POST /playlist, captures raw size, rewritten size, first segment URI, and resolved codecs. Tells us instantly whether CloudNode is reaching the endpoint at all, and if so what shape its playlist is. - ``hls: first stream.m3u8 HIT/MISS`` — on GET /stream.m3u8, distinguishes between "cache hit and served" vs "CloudNode never pushed — 404 back to browser". A MISS with segment cache non-empty is the smoking gun for "segments arriving but playlist isn't". Gated behind a ``set[str]`` flag so we log once and shut up — with hls.js retrying at 400ms and CloudNode pushing at 1 Hz, unmuted INFO logs would flood. ``cleanup_camera_cache`` clears the flag so a reconnecting camera re-announces. No behavior change — only telemetry.
1 parent 6c83fea commit cb57e84

1 file changed

Lines changed: 64 additions & 1 deletion

File tree

backend/app/api/hls.py

Lines changed: 64 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,16 @@
5555
# Track playlist update count per camera — used to throttle cache eviction sweeps.
5656
_playlist_update_count: dict[str, int] = {}
5757

58+
# ── One-shot diagnostic logging ───────────────────────────────────────
59+
# On a fresh backend, every new camera id logs its first playlist push
60+
# and first stream.m3u8 fetch exactly once. This gives an operator
61+
# ground truth about what the pipeline is doing ("did CloudNode reach
62+
# /playlist at all? what's the first segment URI look like?") without
63+
# drowning Fly logs in per-segment spam. Cleared when the camera is
64+
# evicted from the cache so a reconnect relogs.
65+
_first_playlist_logged: set[str] = set()
66+
_first_stream_get_logged: set[str] = set()
67+
5868
# ── Stream access logging (rate-limited) ─────────────────────────────
5969
_ACCESS_LOG_INTERVAL = 300.0 # 5 minutes
6070
_last_access_logged: dict[tuple[str, str], float] = {}
@@ -69,6 +79,8 @@ def cleanup_camera_cache(camera_id: str):
6979
_segment_cache.pop(camera_id, None)
7080
_playlist_cache.pop(camera_id, None)
7181
_playlist_update_count.pop(camera_id, None)
82+
_first_playlist_logged.discard(camera_id)
83+
_first_stream_get_logged.discard(camera_id)
7284

7385

7486
def _evict_segment_cache(camera_id: str):
@@ -228,13 +240,36 @@ async def get_hls_playlist(
228240
# Serve from cache (populated by POST /playlist from CloudNode).
229241
cached = _playlist_cache.get(camera_id)
230242
if cached and (time.monotonic() - cached[1]) < _PLAYLIST_CACHE_MAX_AGE:
243+
if camera_id not in _first_stream_get_logged:
244+
_first_stream_get_logged.add(camera_id)
245+
logger.info(
246+
"hls: first stream.m3u8 HIT for cam=%s (playlist_age=%.1fs, bytes=%d, cached_segments=%d)",
247+
camera_id,
248+
time.monotonic() - cached[1],
249+
len(cached[0]),
250+
len(_segment_cache.get(camera_id, {})),
251+
)
231252
return Response(
232253
content=cached[0],
233254
media_type="application/vnd.apple.mpegurl",
234255
headers=headers,
235256
)
236257

237-
# No cached playlist — CloudNode hasn't pushed one yet.
258+
# No cached playlist — CloudNode hasn't pushed one yet (or it went
259+
# stale). Log once per camera so operators can tell the "CloudNode
260+
# isn't pushing playlists" case apart from "stream.m3u8 never called".
261+
# With hls.js retrying every 400ms, unmuted INFO would be a flood —
262+
# the one-shot flag keeps it to one line per camera per restart.
263+
if camera_id not in _first_stream_get_logged:
264+
_first_stream_get_logged.add(camera_id)
265+
cached_bytes = len(_segment_cache.get(camera_id, {}))
266+
logger.warning(
267+
"hls: first stream.m3u8 MISS for cam=%s (playlist_cached=%s, segment_cache_entries=%d) — "
268+
"CloudNode hasn't POST /playlist for this camera yet",
269+
camera_id,
270+
cached is not None,
271+
cached_bytes,
272+
)
238273
raise HTTPException(status_code=404, detail="Stream not started yet")
239274

240275

@@ -363,6 +398,34 @@ async def update_hls_playlist(
363398
)
364399
_playlist_cache[camera_id] = (rewritten, time.monotonic())
365400

401+
# First-push diagnostic log — capture the first raw segment URI so
402+
# we can see how FFmpeg is shaping it in production (bare basename
403+
# vs path-prefixed vs something else entirely). Lives under INFO
404+
# so it shows up in Fly's default log view, one line per camera per
405+
# backend restart — not the kind of thing you want to see at 1 Hz.
406+
if camera_id not in _first_playlist_logged:
407+
_first_playlist_logged.add(camera_id)
408+
# Sample the first non-comment, non-blank line — that's the
409+
# first segment URI (ground truth for the rewriter).
410+
sample_uri = next(
411+
(
412+
ln.strip()
413+
for ln in playlist_content.splitlines()
414+
if ln.strip() and not ln.lstrip().startswith("#")
415+
),
416+
"<none>",
417+
)
418+
logger.info(
419+
"hls: first playlist push cam=%s raw_bytes=%d rewritten_bytes=%d "
420+
"first_segment_uri=%r codecs=%s/%s",
421+
camera_id,
422+
len(playlist_content),
423+
len(rewritten),
424+
sample_uri[:200],
425+
video_codec,
426+
audio_codec,
427+
)
428+
366429
# Periodic cache eviction.
367430
count = _playlist_update_count.get(camera_id, 0) + 1
368431
_playlist_update_count[camera_id] = count

0 commit comments

Comments
 (0)