From c2f50b1b2c43feaf8ecab8978aa718de65fd78d9 Mon Sep 17 00:00:00 2001 From: enwaiax Date: Mon, 16 Mar 2026 15:25:42 +0000 Subject: [PATCH] fix: guard file unlink in audio extraction to prevent crash in library mode In library mode, audio content arrives as base64-encoded binary data (not a file path). PR #1119 added file-path support for Dataloader but left Path.unlink() unconditional, causing OSError (ENAMETOOLONG) when the base64 string (~2MB) is treated as a filename. Use a `source_file_path` sentinel so unlink only runs when content was actually resolved from an on-disk file (Dataloader/V2 API path). Fixes: NVBug 5984261 Made-with: Cursor --- .../extract/audio/audio_extraction.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/api/src/nv_ingest_api/internal/extract/audio/audio_extraction.py b/api/src/nv_ingest_api/internal/extract/audio/audio_extraction.py index 24ab60446..61b2d96ef 100644 --- a/api/src/nv_ingest_api/internal/extract/audio/audio_extraction.py +++ b/api/src/nv_ingest_api/internal/extract/audio/audio_extraction.py @@ -59,25 +59,24 @@ def _extract_from_audio(row: pd.Series, audio_client: Any, trace_info: Dict, seg raise ValueError("Row does not contain 'metadata'.") base64_audio = metadata.pop("content") + source_file_path = None try: - base64_file_path = base64_audio - if not base64_file_path: + if not base64_audio: return [row.to_list()] - base64_file_path = base64.b64decode(base64_file_path).decode("utf-8") - if not base64_file_path: - return [row.to_list()] - if Path(base64_file_path).exists(): - base64_audio = read_file_as_base64(base64_file_path) + decoded_path = base64.b64decode(base64_audio).decode("utf-8") + if decoded_path and Path(decoded_path).exists(): + source_file_path = decoded_path + base64_audio = read_file_as_base64(decoded_path) except (UnicodeDecodeError, base64.binascii.Error): pass content_metadata = metadata.get("content_metadata", {}) - # Only extract transcript if content type is audio if (content_metadata.get("type") != ContentTypeEnum.AUDIO) or (base64_audio in (None, "")): return [row.to_list()] - logger.debug(f"Removing file {base64_file_path}") - Path(base64_file_path).unlink(missing_ok=True) + if source_file_path is not None: + logger.debug(f"Removing temporary file {source_file_path}") + Path(source_file_path).unlink(missing_ok=True) # Get the result from the inference model segments, transcript = audio_client.infer(