From 907fb52c444eed6ce35edb1e118b3eea009f67cd Mon Sep 17 00:00:00 2001 From: Ben Batman Date: Tue, 8 Jul 2025 21:56:00 -0400 Subject: [PATCH 1/5] fixing character identification bug, update readme --- README.md | 50 ++++++++++++++++++++++++++-------- pyproject.toml | 14 ++++++++-- requirements.txt | 1 + src/captionalchemy/__init__.py | 2 +- src/captionalchemy/caption.py | 17 ++++++------ 5 files changed, 62 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index 9449711..9cff557 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,13 @@ A Python package for creating intelligent closed captions with face detection an ## Installation ```bash -pip install -e . +pip install captionalchemy +``` + +If you have a GPU and want to use hardware acceleration: + +```bash +pip install captionalchemy[cuda] ``` ### Prerequisites @@ -24,6 +30,8 @@ pip install -e . - CUDA-capable GPU (optional, for acceleration) - Whisper.cpp capable (optional on MacOS) +If using Whisper.cpp on MacOS, follow installation instructions [[here](https://github.com/ggml-org/whisper.cpp?tab=readme-ov-file#core-ml-support)] + ## Quick Start 1. **Set up environment variables** (create `.env` file): @@ -45,9 +53,28 @@ pip install -e . ``` 3. 
**Generate captions**: - ```bash - captionalchemy video.mp4 -f srt -o my_captions - ``` + +```bash +captionalchemy video.mp4 -f srt -o my_captions +``` + +Or, from a Python script: + +```python +from dotenv import load_dotenv +from captionalchemy import caption + +load_dotenv() + +caption.run_pipeline( + video_url_or_path="path/to/your/video.mp4", # this can be a video URL or local file + character_identification=False, # True by default + known_faces_json="path/to/known_faces.json", + embed_faces_json="path/to/embed_faces.json", # name of the output file + caption_output_path="my_captions/output", # will write output to output.srt (or .vtt/.smi) + caption_format="srt" +) +``` ## Usage @@ -140,16 +167,13 @@ John Doe: Welcome to our presentation on quantum computing. Jane Smith: Thanks John. Let's start with the basics. ``` -## Development +## Development and Contributing ### Setup Development Environment ```bash # Install in development mode -pip install -e . - -# Install development dependencies -pip install -r requirements-dev.txt +pip install -e ".[dev]" ``` ### Running Tests @@ -164,8 +188,8 @@ pytest # Linting flake8 -# Type checking -mypy src/ +# Code formatting +black src/ tests/ ``` ## Requirements @@ -205,3 +229,7 @@ MIT License - see LICENSE file for details. - Check the logs with `-v` flag for detailed error information - Ensure all dependencies are properly installed - Verify video file format compatibility + +``` + +``` diff --git a/pyproject.toml b/pyproject.toml index 33bf528..66fb175 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "captionalchemy" -version = "0.1.0" +dynamic = ["version"] description = "A Python package to create closed captions with face detection and recognition." 
readme = "README.md" license = { text = "MIT" } @@ -30,6 +30,7 @@ dependencies = [ "panns-inference==0.1.1", "librosa==0.11.0", "tqdm==4.67.1", + "torch==2.7.0", ] classifiers = [ @@ -42,7 +43,16 @@ requires-python = ">=3.10" [project.optional-dependencies] -dev = ["pytest==8.4.0", "mypy==1.16.0", "flake8==7.2.0", "black==25.1.0", "pytest-cov==6.2.1"] +dev = [ + "pytest==8.4.0", + "mypy==1.16.0", + "flake8==7.2.0", + "black==25.1.0", + "pytest-cov==6.2.1", + "setuptools-scm==8.3.1", + +] +cuda = ["onnxruntime-gpu==1.22.0"] [tool.setuptools.packages.find] where = ["src"] diff --git a/requirements.txt b/requirements.txt index 06507c0..871350b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,3 +10,4 @@ onnxruntime==1.22.0 panns-inference==0.1.1 librosa==0.11.0 tqdm==4.67.1 +torch==2.7.0 diff --git a/src/captionalchemy/__init__.py b/src/captionalchemy/__init__.py index 3dc1f76..8b13789 100644 --- a/src/captionalchemy/__init__.py +++ b/src/captionalchemy/__init__.py @@ -1 +1 @@ -__version__ = "0.1.0" + diff --git a/src/captionalchemy/caption.py b/src/captionalchemy/caption.py index 29df031..ebd825c 100644 --- a/src/captionalchemy/caption.py +++ b/src/captionalchemy/caption.py @@ -2,7 +2,7 @@ import tempfile import os import uuid -from typing import Literal +from typing import Literal, Optional import argparse from dotenv import load_dotenv, find_dotenv @@ -34,8 +34,8 @@ def run_pipeline( video_url_or_path: str, character_identification: bool = True, - known_faces_json: str = "example/known_faces.json", - embed_faces_json: str = "example/embed_faces.json", + known_faces_json: Optional[str] = None, + embed_faces_json: Optional[str] = None, caption_output_path: str = "output_captions", caption_format: Literal["vtt", "srt", "smi"] = "srt", ): @@ -56,7 +56,8 @@ def run_pipeline( caption_format: Format for output captions (srt, vtt, smi). 
""" logger.info("Embedding known faces...") - embed_faces(known_faces_json, embed_faces_json) + if character_identification: + embed_faces(known_faces_json, embed_faces_json) video_manager = VideoManager(use_file_buffer=False) if caption_format == "srt": writer = SRTCaptionWriter() @@ -231,14 +232,14 @@ def _build_arg_parser() -> argparse.ArgumentParser: parser.add_argument( "--known-faces-json", - default="example/known_faces.json", - help="Path to JSON file listing known faces to embed (default: 'example/known_faces.json').", + default=None, + help="Path to JSON file listing known faces to embed (default: 'known_faces.json').", ) parser.add_argument( "--embed-faces-json", - default="example/embed_faces.json", - help="JSON path to store face embeddings (default: 'example/embed_faces.json').", + default="embed_faces.json", + help="JSON path to store face embeddings (default: 'embed_faces.json').", ) parser.add_argument( From a8c658c139b5a3951df3abca7fe8c3bef7937312 Mon Sep 17 00:00:00 2001 From: Ben Batman Date: Tue, 8 Jul 2025 22:47:36 -0400 Subject: [PATCH 2/5] error if no known_faces.json file is found --- example/known_faces.json | 2 +- src/captionalchemy/caption.py | 17 ++++++++--------- .../tools/cv/embed_known_faces.py | 11 +++++++++-- 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/example/known_faces.json b/example/known_faces.json index 0a3d65d..b547be0 100644 --- a/example/known_faces.json +++ b/example/known_faces.json @@ -1,6 +1,6 @@ [ { "name": "SpaceTime Dude", - "image_path": "images/spacetime_dude.png" + "image_path": "example/images/spacetime_dude.png" } ] diff --git a/src/captionalchemy/caption.py b/src/captionalchemy/caption.py index ebd825c..1c95f1e 100644 --- a/src/captionalchemy/caption.py +++ b/src/captionalchemy/caption.py @@ -34,8 +34,8 @@ def run_pipeline( video_url_or_path: str, character_identification: bool = True, - known_faces_json: Optional[str] = None, - embed_faces_json: Optional[str] = None, + known_faces_json: 
str = "known_faces.json", + embed_faces_json: str = "embed_faces.json", caption_output_path: str = "output_captions", caption_format: Literal["vtt", "srt", "smi"] = "srt", ): @@ -93,17 +93,16 @@ def run_pipeline( speech_segments = get_speech_segments( audio_path, os.getenv("HF_AUTH_TOKEN", ""), device ) - logger.info(f"Speech segments detected: {speech_segments}") logger.info("Detecting non-speech segments...") non_speech_events = detect_non_speech_segments(audio_path, device=device) - print(non_speech_events) if not speech_segments: logger.warning("No speech segments detected. Exiting.") return # Diarize + logger.info("Running diarization...") diarization_result = diarize( audio_path ) # { "SPEAKER_00": {"start": 3.25409375, "end": 606.2990937500001}, ..., } @@ -232,7 +231,7 @@ def _build_arg_parser() -> argparse.ArgumentParser: parser.add_argument( "--known-faces-json", - default=None, + default="known_faces.json", help="Path to JSON file listing known faces to embed (default: 'known_faces.json').", ) @@ -254,10 +253,10 @@ def main(): parser = _build_arg_parser() args = parser.parse_args() - if args.verbose: - logging.getLogger().setLevel(logging.DEBUG) - else: - logging.getLogger().setLevel(logging.INFO) + logging.basicConfig( + level=logging.DEBUG if args.verbose else logging.INFO, + format="%(asctime)s %(levelname)s [%(name)s] %(message)s", + ) logger = logging.getLogger("captionalchemy") logger.info("Starting captionalchemy pipeline...") diff --git a/src/captionalchemy/tools/cv/embed_known_faces.py b/src/captionalchemy/tools/cv/embed_known_faces.py index d683d78..260b18c 100644 --- a/src/captionalchemy/tools/cv/embed_known_faces.py +++ b/src/captionalchemy/tools/cv/embed_known_faces.py @@ -32,8 +32,15 @@ def embed_faces( logger = logging.getLogger(__name__) if not known_faces_json: raise ValueError("known_faces.json must be provided.") - with open(known_faces_json, "r") as f: - known_list = json.load(f) + try: + with open(known_faces_json, "r") as f: + 
known_list = json.load(f) + + except FileNotFoundError as e: + logger.error(f"Error loading known faces JSON: {e}") + raise FileNotFoundError( + f"Could not find the known faces JSON file: {known_faces_json}" + ) from e insightface.model_zoo.get_model provider = ( From aa563b5cd633e961bc192a76e353727cf3bc6557 Mon Sep 17 00:00:00 2001 From: Ben Batman Date: Wed, 9 Jul 2025 08:49:26 -0400 Subject: [PATCH 3/5] update default embed json name, update readme --- README.md | 3 ++- src/captionalchemy/tools/cv/embed_known_faces.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 9cff557..e255e89 100644 --- a/README.md +++ b/README.md @@ -27,10 +27,11 @@ pip install captionalchemy[cuda] - Python 3.10+ - FFmpeg (for video/audio processing) -- CUDA-capable GPU (optional, for acceleration) +- CUDA-capable GPU (optional for acceleration, but highly recommended for diarization) - Whisper.cpp capable (optional on MacOS) If using Whisper.cpp on MacOS, follow installation instructions [[here](https://github.com/ggml-org/whisper.cpp?tab=readme-ov-file#core-ml-support)] +Clone the whisper repo into your working directory. 
## Quick Start diff --git a/src/captionalchemy/tools/cv/embed_known_faces.py b/src/captionalchemy/tools/cv/embed_known_faces.py index 260b18c..23bfcde 100644 --- a/src/captionalchemy/tools/cv/embed_known_faces.py +++ b/src/captionalchemy/tools/cv/embed_known_faces.py @@ -8,7 +8,7 @@ def embed_faces( - known_faces_json: str, output_embeddings_json: str = "embedded_faces.json" + known_faces_json: str, output_embeddings_json: str = "embed_faces.json" ) -> None: """ JSON should be structured as: From 18e400e3707d2bc415898fb1c4689eb91bc76515 Mon Sep 17 00:00:00 2001 From: Ben Batman Date: Wed, 9 Jul 2025 08:54:29 -0400 Subject: [PATCH 4/5] fixing linter check --- src/captionalchemy/__init__.py | 1 - src/captionalchemy/caption.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/captionalchemy/__init__.py b/src/captionalchemy/__init__.py index 8b13789..e69de29 100644 --- a/src/captionalchemy/__init__.py +++ b/src/captionalchemy/__init__.py @@ -1 +0,0 @@ - diff --git a/src/captionalchemy/caption.py b/src/captionalchemy/caption.py index 1c95f1e..9528cfe 100644 --- a/src/captionalchemy/caption.py +++ b/src/captionalchemy/caption.py @@ -2,7 +2,7 @@ import tempfile import os import uuid -from typing import Literal, Optional +from typing import Literal import argparse from dotenv import load_dotenv, find_dotenv From 7a8e1e1e079acc10df7d31a21bcad1051f35958a Mon Sep 17 00:00:00 2001 From: Ben Batman Date: Wed, 9 Jul 2025 09:18:53 -0400 Subject: [PATCH 5/5] updating tests --- src/captionalchemy/caption.py | 7 ++++++- tests/test_main.py | 15 ++++++++------- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/src/captionalchemy/caption.py b/src/captionalchemy/caption.py index 9528cfe..1c487d7 100644 --- a/src/captionalchemy/caption.py +++ b/src/captionalchemy/caption.py @@ -253,11 +253,16 @@ def main(): parser = _build_arg_parser() args = parser.parse_args() + level = logging.DEBUG if args.verbose else logging.INFO + logging.basicConfig( - 
level=logging.DEBUG if args.verbose else logging.INFO, + level=level, format="%(asctime)s %(levelname)s [%(name)s] %(message)s", ) + root_logger = logging.getLogger() + root_logger.setLevel(level) + logger = logging.getLogger("captionalchemy") logger.info("Starting captionalchemy pipeline...") run_pipeline( diff --git a/tests/test_main.py b/tests/test_main.py index 93289f5..0b24964 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -289,8 +289,9 @@ def test_pipeline_no_speech_segments( character_identification=False, ) + # Verify mock_embed_faces was not called + mock_embed_faces.assert_not_called() # Verify early functions were called - mock_embed_faces.assert_called_once() mock_vad.assert_called_once() @patch("captionalchemy.caption.detect_non_speech_segments") @@ -483,8 +484,8 @@ def test_pipeline_character_identification_disabled( character_identification=False, ) - # embed_faces should still be called - mock_embed_faces.assert_called_once() + # embed_faces should not be called + mock_embed_faces.assert_not_called() # Should process speech events even without face recognition assert mock_transcriber.transcribe_audio.call_count == len(speech_events) @@ -591,8 +592,8 @@ def test_build_arg_parser_default_values(self): assert args.format == "srt" assert args.output == "output_captions" assert args.character_identification is True - assert args.known_faces_json == "example/known_faces.json" - assert args.embed_faces_json == "example/embed_faces.json" + assert args.known_faces_json == "known_faces.json" + assert args.embed_faces_json == "embed_faces.json" assert args.verbose is False def test_build_arg_parser_custom_values(self): @@ -667,8 +668,8 @@ def test_main_with_default_args( mock_run_pipeline.assert_called_once_with( video_url_or_path="test_video.mp4", character_identification=True, - known_faces_json="example/known_faces.json", - embed_faces_json="example/embed_faces.json", + known_faces_json="known_faces.json", + embed_faces_json="embed_faces.json", 
caption_output_path="output_captions", caption_format="srt", )