From 907fb52c444eed6ce35edb1e118b3eea009f67cd Mon Sep 17 00:00:00 2001
From: Ben Batman <benbatman2@gmail.com>
Date: Tue, 8 Jul 2025 21:56:00 -0400
Subject: [PATCH 1/5] fixing character identification bug, update readme

---
 README.md                      | 50 ++++++++++++++++++++++++++--------
 pyproject.toml                 | 14 ++++++++--
 requirements.txt               |  1 +
 src/captionalchemy/__init__.py |  2 +-
 src/captionalchemy/caption.py  | 17 ++++++------
 5 files changed, 62 insertions(+), 22 deletions(-)

diff --git a/README.md b/README.md
index 9449711..9cff557 100644
--- a/README.md
+++ b/README.md
@@ -14,7 +14,13 @@ A Python package for creating intelligent closed captions with face detection an
 ## Installation
 
 ```bash
-pip install -e .
+pip install captionalchemy
+```
+
+If you have a GPU and want to use hardware acceleration:
+
+```bash
+pip install captionalchemy[cuda]
 ```
 
 ### Prerequisites
@@ -24,6 +30,8 @@ pip install -e .
 - CUDA-capable GPU (optional, for acceleration)
 - Whisper.cpp capable (optional on MacOS)
 
+If using Whisper.cpp on MacOS, follow installation instructions [[here](https://github.com/ggml-org/whisper.cpp?tab=readme-ov-file#core-ml-support)]
+
 ## Quick Start
 
 1. **Set up environment variables** (create `.env` file):
@@ -45,9 +53,28 @@ pip install -e .
    ```
 
 3. **Generate captions**:
-   ```bash
-   captionalchemy video.mp4 -f srt -o my_captions
-   ```
+
+```bash
+captionalchemy video.mp4 -f srt -o my_captions
+```
+
+Or in a Python script:
+
+```python
+from dotenv import load_dotenv
+from captionalchemy import caption
+
+load_dotenv()
+
+caption.run_pipeline(
+    video_url_or_path="path/to/your/video.mp4",         # this can be a video URL or local file
+    character_identification=False,                      # True by default
+    known_faces_json="path/to/known_faces.json",
+    embed_faces_json="path/to/embed_faces.json",        # name of the output file
+    caption_output_path="my_captions/output",           # will write output to output.srt (or .vtt/.smi)
+    caption_format="srt"
+)
+```
 
 ## Usage
 
@@ -140,16 +167,13 @@ John Doe: Welcome to our presentation on quantum computing.
 Jane Smith: Thanks John. Let's start with the basics.
 ```
 
-## Development
+## Development and Contributing
 
 ### Setup Development Environment
 
 ```bash
 # Install in development mode
-pip install -e .
-
-# Install development dependencies
-pip install -r requirements-dev.txt
+pip install -e ".[dev]"
 ```
 
 ### Running Tests
@@ -164,8 +188,8 @@ pytest
 # Linting
 flake8
 
-# Type checking
-mypy src/
+# Code formatting
+black src/ tests/
 ```
 
 ## Requirements
@@ -205,3 +229,7 @@ MIT License - see LICENSE file for details.
 - Check the logs with `-v` flag for detailed error information
 - Ensure all dependencies are properly installed
 - Verify video file format compatibility
+
+```
+
+```
diff --git a/pyproject.toml b/pyproject.toml
index 33bf528..66fb175 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "captionalchemy"
-version = "0.1.0"
+dynamic = ["version"]
 description = "A Python package to create closed captions with face detection and recognition."
 readme = "README.md"
 license = { text = "MIT" }
@@ -30,6 +30,7 @@ dependencies = [
     "panns-inference==0.1.1",
     "librosa==0.11.0",
     "tqdm==4.67.1",
+    "torch==2.7.0",
 ]
 
 classifiers = [
@@ -42,7 +43,16 @@ requires-python = ">=3.10"
 
 
 [project.optional-dependencies]
-dev = ["pytest==8.4.0", "mypy==1.16.0", "flake8==7.2.0", "black==25.1.0", "pytest-cov==6.2.1"]
+dev = [
+    "pytest==8.4.0",
+    "mypy==1.16.0",
+    "flake8==7.2.0",
+    "black==25.1.0",
+    "pytest-cov==6.2.1",
+    "setuptools-scm==8.3.1",
+
+]
+cuda = ["onnxruntime-gpu==1.22.0"]
 
 [tool.setuptools.packages.find]
 where = ["src"]
diff --git a/requirements.txt b/requirements.txt
index 06507c0..871350b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,3 +10,4 @@ onnxruntime==1.22.0
 panns-inference==0.1.1
 librosa==0.11.0
 tqdm==4.67.1
+torch==2.7.0
diff --git a/src/captionalchemy/__init__.py b/src/captionalchemy/__init__.py
index 3dc1f76..8b13789 100644
--- a/src/captionalchemy/__init__.py
+++ b/src/captionalchemy/__init__.py
@@ -1 +1 @@
-__version__ = "0.1.0"
+
diff --git a/src/captionalchemy/caption.py b/src/captionalchemy/caption.py
index 29df031..ebd825c 100644
--- a/src/captionalchemy/caption.py
+++ b/src/captionalchemy/caption.py
@@ -2,7 +2,7 @@
 import tempfile
 import os
 import uuid
-from typing import Literal
+from typing import Literal, Optional
 import argparse
 
 from dotenv import load_dotenv, find_dotenv
@@ -34,8 +34,8 @@
 def run_pipeline(
     video_url_or_path: str,
     character_identification: bool = True,
-    known_faces_json: str = "example/known_faces.json",
-    embed_faces_json: str = "example/embed_faces.json",
+    known_faces_json: Optional[str] = None,
+    embed_faces_json: Optional[str] = None,
     caption_output_path: str = "output_captions",
     caption_format: Literal["vtt", "srt", "smi"] = "srt",
 ):
@@ -56,7 +56,8 @@ def run_pipeline(
         caption_format: Format for output captions (srt, vtt, smi).
     """
     logger.info("Embedding known faces...")
-    embed_faces(known_faces_json, embed_faces_json)
+    if character_identification:
+        embed_faces(known_faces_json, embed_faces_json)
     video_manager = VideoManager(use_file_buffer=False)
     if caption_format == "srt":
         writer = SRTCaptionWriter()
@@ -231,14 +232,14 @@ def _build_arg_parser() -> argparse.ArgumentParser:
 
     parser.add_argument(
         "--known-faces-json",
-        default="example/known_faces.json",
-        help="Path to JSON file listing known faces to embed (default: 'example/known_faces.json').",
+        default=None,
+        help="Path to JSON file listing known faces to embed (default: 'known_faces.json').",
     )
 
     parser.add_argument(
         "--embed-faces-json",
-        default="example/embed_faces.json",
-        help="JSON path to store face embeddings (default: 'example/embed_faces.json').",
+        default="embed_faces.json",
+        help="JSON path to store face embeddings (default: 'embed_faces.json').",
     )
 
     parser.add_argument(

From a8c658c139b5a3951df3abca7fe8c3bef7937312 Mon Sep 17 00:00:00 2001
From: Ben Batman <benbatman2@gmail.com>
Date: Tue, 8 Jul 2025 22:47:36 -0400
Subject: [PATCH 2/5] error if no known_faces.json file is found

---
 example/known_faces.json                        |  2 +-
 src/captionalchemy/caption.py                   | 17 ++++++++---------
 .../tools/cv/embed_known_faces.py               | 11 +++++++++--
 3 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/example/known_faces.json b/example/known_faces.json
index 0a3d65d..b547be0 100644
--- a/example/known_faces.json
+++ b/example/known_faces.json
@@ -1,6 +1,6 @@
 [
   {
     "name": "SpaceTime Dude",
-    "image_path": "images/spacetime_dude.png"
+    "image_path": "example/images/spacetime_dude.png"
   }
 ]
diff --git a/src/captionalchemy/caption.py b/src/captionalchemy/caption.py
index ebd825c..1c95f1e 100644
--- a/src/captionalchemy/caption.py
+++ b/src/captionalchemy/caption.py
@@ -34,8 +34,8 @@
 def run_pipeline(
     video_url_or_path: str,
     character_identification: bool = True,
-    known_faces_json: Optional[str] = None,
-    embed_faces_json: Optional[str] = None,
+    known_faces_json: str = "known_faces.json",
+    embed_faces_json: str = "embed_faces.json",
     caption_output_path: str = "output_captions",
     caption_format: Literal["vtt", "srt", "smi"] = "srt",
 ):
@@ -93,17 +93,16 @@ def run_pipeline(
         speech_segments = get_speech_segments(
             audio_path, os.getenv("HF_AUTH_TOKEN", ""), device
         )
-        logger.info(f"Speech segments detected: {speech_segments}")
 
         logger.info("Detecting non-speech segments...")
         non_speech_events = detect_non_speech_segments(audio_path, device=device)
-        print(non_speech_events)
 
         if not speech_segments:
             logger.warning("No speech segments detected. Exiting.")
             return
 
         # Diarize
+        logger.info("Running diarization...")
         diarization_result = diarize(
             audio_path
         )  # { "SPEAKER_00": {"start": 3.25409375, "end": 606.2990937500001}, ..., }
@@ -232,7 +231,7 @@ def _build_arg_parser() -> argparse.ArgumentParser:
 
     parser.add_argument(
         "--known-faces-json",
-        default=None,
+        default="known_faces.json",
         help="Path to JSON file listing known faces to embed (default: 'known_faces.json').",
     )
 
@@ -254,10 +253,10 @@ def main():
     parser = _build_arg_parser()
     args = parser.parse_args()
 
-    if args.verbose:
-        logging.getLogger().setLevel(logging.DEBUG)
-    else:
-        logging.getLogger().setLevel(logging.INFO)
+    logging.basicConfig(
+        level=logging.DEBUG if args.verbose else logging.INFO,
+        format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
+    )
 
     logger = logging.getLogger("captionalchemy")
     logger.info("Starting captionalchemy pipeline...")
diff --git a/src/captionalchemy/tools/cv/embed_known_faces.py b/src/captionalchemy/tools/cv/embed_known_faces.py
index d683d78..260b18c 100644
--- a/src/captionalchemy/tools/cv/embed_known_faces.py
+++ b/src/captionalchemy/tools/cv/embed_known_faces.py
@@ -32,8 +32,15 @@ def embed_faces(
     logger = logging.getLogger(__name__)
     if not known_faces_json:
         raise ValueError("known_faces.json must be provided.")
-    with open(known_faces_json, "r") as f:
-        known_list = json.load(f)
+    try:
+        with open(known_faces_json, "r") as f:
+            known_list = json.load(f)
+
+    except FileNotFoundError as e:
+        logger.error(f"Error loading known faces JSON: {e}")
+        raise FileNotFoundError(
+            f"Could not find the known faces JSON file: {known_faces_json}"
+        ) from e
 
     insightface.model_zoo.get_model
     provider = (

From aa563b5cd633e961bc192a76e353727cf3bc6557 Mon Sep 17 00:00:00 2001
From: Ben Batman <benbatman2@gmail.com>
Date: Wed, 9 Jul 2025 08:49:26 -0400
Subject: [PATCH 3/5] update default embed json name, update readme

---
 README.md                                        | 3 ++-
 src/captionalchemy/tools/cv/embed_known_faces.py | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 9cff557..e255e89 100644
--- a/README.md
+++ b/README.md
@@ -27,10 +27,11 @@ pip install captionalchemy[cuda]
 
 - Python 3.10+
 - FFmpeg (for video/audio processing)
-- CUDA-capable GPU (optional, for acceleration)
+- CUDA-capable GPU (optional, but highly recommended for accelerating diarization)
 - Whisper.cpp capable (optional on MacOS)
 
 If using Whisper.cpp on MacOS, follow installation instructions [[here](https://github.com/ggml-org/whisper.cpp?tab=readme-ov-file#core-ml-support)]
+Clone the whisper.cpp repo into your working directory.
 
 ## Quick Start
 
diff --git a/src/captionalchemy/tools/cv/embed_known_faces.py b/src/captionalchemy/tools/cv/embed_known_faces.py
index 260b18c..23bfcde 100644
--- a/src/captionalchemy/tools/cv/embed_known_faces.py
+++ b/src/captionalchemy/tools/cv/embed_known_faces.py
@@ -8,7 +8,7 @@
 
 
 def embed_faces(
-    known_faces_json: str, output_embeddings_json: str = "embedded_faces.json"
+    known_faces_json: str, output_embeddings_json: str = "embed_faces.json"
 ) -> None:
     """
     JSON should be structured as:

From 18e400e3707d2bc415898fb1c4689eb91bc76515 Mon Sep 17 00:00:00 2001
From: Ben Batman <benbatman2@gmail.com>
Date: Wed, 9 Jul 2025 08:54:29 -0400
Subject: [PATCH 4/5] fixing linter check

---
 src/captionalchemy/__init__.py | 1 -
 src/captionalchemy/caption.py  | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/captionalchemy/__init__.py b/src/captionalchemy/__init__.py
index 8b13789..e69de29 100644
--- a/src/captionalchemy/__init__.py
+++ b/src/captionalchemy/__init__.py
@@ -1 +0,0 @@
-
diff --git a/src/captionalchemy/caption.py b/src/captionalchemy/caption.py
index 1c95f1e..9528cfe 100644
--- a/src/captionalchemy/caption.py
+++ b/src/captionalchemy/caption.py
@@ -2,7 +2,7 @@
 import tempfile
 import os
 import uuid
-from typing import Literal, Optional
+from typing import Literal
 import argparse
 
 from dotenv import load_dotenv, find_dotenv

From 7a8e1e1e079acc10df7d31a21bcad1051f35958a Mon Sep 17 00:00:00 2001
From: Ben Batman <benbatman2@gmail.com>
Date: Wed, 9 Jul 2025 09:18:53 -0400
Subject: [PATCH 5/5] updating tests

---
 src/captionalchemy/caption.py |  7 ++++++-
 tests/test_main.py            | 15 ++++++++-------
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/src/captionalchemy/caption.py b/src/captionalchemy/caption.py
index 9528cfe..1c487d7 100644
--- a/src/captionalchemy/caption.py
+++ b/src/captionalchemy/caption.py
@@ -253,11 +253,16 @@ def main():
     parser = _build_arg_parser()
     args = parser.parse_args()
 
+    level = logging.DEBUG if args.verbose else logging.INFO
+
     logging.basicConfig(
-        level=logging.DEBUG if args.verbose else logging.INFO,
+        level=level,
         format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
     )
 
+    root_logger = logging.getLogger()
+    root_logger.setLevel(level)
+
     logger = logging.getLogger("captionalchemy")
     logger.info("Starting captionalchemy pipeline...")
     run_pipeline(
diff --git a/tests/test_main.py b/tests/test_main.py
index 93289f5..0b24964 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -289,8 +289,9 @@ def test_pipeline_no_speech_segments(
             character_identification=False,
         )
 
+        # Verify mock_embed_faces was not called
+        mock_embed_faces.assert_not_called()
         # Verify early functions were called
-        mock_embed_faces.assert_called_once()
         mock_vad.assert_called_once()
 
     @patch("captionalchemy.caption.detect_non_speech_segments")
@@ -483,8 +484,8 @@ def test_pipeline_character_identification_disabled(
             character_identification=False,
         )
 
-        # embed_faces should still be called
-        mock_embed_faces.assert_called_once()
+        # embed_faces should not be called
+        mock_embed_faces.assert_not_called()
 
         # Should process speech events even without face recognition
         assert mock_transcriber.transcribe_audio.call_count == len(speech_events)
@@ -591,8 +592,8 @@ def test_build_arg_parser_default_values(self):
         assert args.format == "srt"
         assert args.output == "output_captions"
         assert args.character_identification is True
-        assert args.known_faces_json == "example/known_faces.json"
-        assert args.embed_faces_json == "example/embed_faces.json"
+        assert args.known_faces_json == "known_faces.json"
+        assert args.embed_faces_json == "embed_faces.json"
         assert args.verbose is False
 
     def test_build_arg_parser_custom_values(self):
@@ -667,8 +668,8 @@ def test_main_with_default_args(
         mock_run_pipeline.assert_called_once_with(
             video_url_or_path="test_video.mp4",
             character_identification=True,
-            known_faces_json="example/known_faces.json",
-            embed_faces_json="example/embed_faces.json",
+            known_faces_json="known_faces.json",
+            embed_faces_json="embed_faces.json",
             caption_output_path="output_captions",
             caption_format="srt",
         )