benbatman · benbatman · Jul 9, 2025 · Jul 9, 2025 · Jul 9, 2025 · Jul 9, 2025
diff --git a/README.md b/README.md
@@ -14,16 +14,25 @@ A Python package for creating intelligent closed captions with face detection an
 ## Installation
 
 ```bash
-pip install -e .
+pip install captionalchemy
+```
+
+If you have a GPU and want to use hardware acceleration:
+
+```bash
+pip install captionalchemy[cuda]
 ```
 
 ### Prerequisites
 
 - Python 3.10+
 - FFmpeg (for video/audio processing)
-- CUDA-capable GPU (optional, for acceleration)
+- CUDA-capable GPU (optional, for acceleration; highly recommended for diarization)
 - Whisper.cpp capable (optional on MacOS)
 
+If using Whisper.cpp on MacOS, follow the Core ML installation instructions [here](https://github.com/ggml-org/whisper.cpp?tab=readme-ov-file#core-ml-support).
+Then clone the whisper.cpp repo into your working directory.
+
 ## Quick Start
 
 1. **Set up environment variables** (create `.env` file):
@@ -45,9 +54,28 @@ pip install -e .
    ```
 
 3. **Generate captions**:
-   ```bash
-   captionalchemy video.mp4 -f srt -o my_captions
-   ```
+
+```bash
+captionalchemy video.mp4 -f srt -o my_captions
+```
+
+Or from a Python script:
+
+```python
+from dotenv import load_dotenv
+from captionalchemy import caption
+
+load_dotenv()
+
+caption.run_pipeline(
+    video_url_or_path="path/to/your/video.mp4",         # this can be a video URL or local file
+    character_identification=False,                      # True by default
+    known_faces_json="path/to/known_faces.json",
+    embed_faces_json="path/to/embed_faces.json",        # name of the output file
+    caption_output_path="my_captions/output",           # will write output to output.srt (or .vtt/.smi)
+    caption_format="srt"
+)
+```
 
 ## Usage
 
@@ -140,16 +168,13 @@ John Doe: Welcome to our presentation on quantum computing.
 Jane Smith: Thanks John. Let's start with the basics.
 ```
 
-## Development
+## Development and Contributing
 
 ### Setup Development Environment
 
 ```bash
 # Install in development mode
-pip install -e .
-
-# Install development dependencies
-pip install -r requirements-dev.txt
+pip install -e ".[dev]"
 ```
 
 ### Running Tests
@@ -164,8 +189,8 @@ pytest
 # Linting
 flake8
 
-# Type checking
-mypy src/
+# Code formatting
+black src/ tests/
 ```
 
 ## Requirements
@@ -205,3 +230,7 @@ MIT License - see LICENSE file for details.
 - Check the logs with `-v` flag for detailed error information
 - Ensure all dependencies are properly installed
 - Verify video file format compatibility
+
+```
+
+```
diff --git a/example/known_faces.json b/example/known_faces.json
@@ -1,6 +1,6 @@
 [
   {
     "name": "SpaceTime Dude",
-    "image_path": "images/spacetime_dude.png"
+    "image_path": "example/images/spacetime_dude.png"
   }
 ]
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "captionalchemy"
-version = "0.1.0"
+dynamic = ["version"]
 description = "A Python package to create closed captions with face detection and recognition."
 readme = "README.md"
 license = { text = "MIT" }
@@ -30,6 +30,7 @@ dependencies = [
     "panns-inference==0.1.1",
     "librosa==0.11.0",
     "tqdm==4.67.1",
+    "torch==2.7.0",
 ]
 
 classifiers = [
@@ -42,7 +43,16 @@ requires-python = ">=3.10"
 
 
 [project.optional-dependencies]
-dev = ["pytest==8.4.0", "mypy==1.16.0", "flake8==7.2.0", "black==25.1.0", "pytest-cov==6.2.1"]
+dev = [
+    "pytest==8.4.0",
+    "mypy==1.16.0",
+    "flake8==7.2.0",
+    "black==25.1.0",
+    "pytest-cov==6.2.1",
+    "setuptools-scm==8.3.1",
+
+]
+cuda = ["onnxruntime-gpu==1.22.0"]
 
 [tool.setuptools.packages.find]
 where = ["src"]

diff --git a/requirements.txt b/requirements.txt
@@ -10,3 +10,4 @@ onnxruntime==1.22.0
 panns-inference==0.1.1
 librosa==0.11.0
 tqdm==4.67.1
+torch==2.7.0
diff --git a/src/captionalchemy/__init__.py b/src/captionalchemy/__init__.py
@@ -1 +0,0 @@
-__version__ = "0.1.0"

diff --git a/src/captionalchemy/caption.py b/src/captionalchemy/caption.py
@@ -34,8 +34,8 @@
 def run_pipeline(
     video_url_or_path: str,
     character_identification: bool = True,
-    known_faces_json: str = "example/known_faces.json",
-    embed_faces_json: str = "example/embed_faces.json",
+    known_faces_json: str = "known_faces.json",
+    embed_faces_json: str = "embed_faces.json",
     caption_output_path: str = "output_captions",
     caption_format: Literal["vtt", "srt", "smi"] = "srt",
 ):
@@ -56,7 +56,8 @@ def run_pipeline(
         caption_format: Format for output captions (srt, vtt, smi).
     """
     logger.info("Embedding known faces...")
-    embed_faces(known_faces_json, embed_faces_json)
+    if character_identification:
+        embed_faces(known_faces_json, embed_faces_json)
     video_manager = VideoManager(use_file_buffer=False)
     if caption_format == "srt":
         writer = SRTCaptionWriter()
@@ -92,17 +93,16 @@ def run_pipeline(
         speech_segments = get_speech_segments(
             audio_path, os.getenv("HF_AUTH_TOKEN", ""), device
         )
-        logger.info(f"Speech segments detected: {speech_segments}")
 
         logger.info("Detecting non-speech segments...")
         non_speech_events = detect_non_speech_segments(audio_path, device=device)
-        print(non_speech_events)
 
         if not speech_segments:
             logger.warning("No speech segments detected. Exiting.")
             return
 
         # Diarize
+        logger.info("Running diarization...")
         diarization_result = diarize(
             audio_path
         )  # { "SPEAKER_00": {"start": 3.25409375, "end": 606.2990937500001}, ..., }
@@ -231,14 +231,14 @@ def _build_arg_parser() -> argparse.ArgumentParser:
 
     parser.add_argument(
         "--known-faces-json",
-        default="example/known_faces.json",
-        help="Path to JSON file listing known faces to embed (default: 'example/known_faces.json').",
+        default="known_faces.json",
+        help="Path to JSON file listing known faces to embed (default: 'known_faces.json').",
     )
 
     parser.add_argument(
         "--embed-faces-json",
-        default="example/embed_faces.json",
-        help="JSON path to store face embeddings (default: 'example/embed_faces.json').",
+        default="embed_faces.json",
+        help="JSON path to store face embeddings (default: 'embed_faces.json').",
     )
 
     parser.add_argument(
@@ -253,10 +253,15 @@ def main():
     parser = _build_arg_parser()
     args = parser.parse_args()
 
-    if args.verbose:
-        logging.getLogger().setLevel(logging.DEBUG)
-    else:
-        logging.getLogger().setLevel(logging.INFO)
+    level = logging.DEBUG if args.verbose else logging.INFO
+
+    logging.basicConfig(
+        level=level,
+        format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
+    )
+
+    root_logger = logging.getLogger()
+    root_logger.setLevel(level)
 
     logger = logging.getLogger("captionalchemy")
     logger.info("Starting captionalchemy pipeline...")

diff --git a/src/captionalchemy/tools/cv/embed_known_faces.py b/src/captionalchemy/tools/cv/embed_known_faces.py
@@ -8,7 +8,7 @@
 
 
 def embed_faces(
-    known_faces_json: str, output_embeddings_json: str = "embedded_faces.json"
+    known_faces_json: str, output_embeddings_json: str = "embed_faces.json"
 ) -> None:
     """
     JSON should be structured as:
@@ -32,8 +32,15 @@ def embed_faces(
     logger = logging.getLogger(__name__)
     if not known_faces_json:
         raise ValueError("known_faces.json must be provided.")
-    with open(known_faces_json, "r") as f:
-        known_list = json.load(f)
+    try:
+        with open(known_faces_json, "r") as f:
+            known_list = json.load(f)
+
+    except FileNotFoundError as e:
+        logger.error(f"Error loading known faces JSON: {e}")
+        raise FileNotFoundError(
+            f"Could not find the known faces JSON file: {known_faces_json}"
+        ) from e
 
     insightface.model_zoo.get_model
     provider = (

diff --git a/tests/test_main.py b/tests/test_main.py
@@ -289,8 +289,9 @@ def test_pipeline_no_speech_segments(
             character_identification=False,
         )
 
+        # Verify mock_embed_faces was not called
+        mock_embed_faces.assert_not_called()
         # Verify early functions were called
-        mock_embed_faces.assert_called_once()
         mock_vad.assert_called_once()
 
     @patch("captionalchemy.caption.detect_non_speech_segments")
@@ -483,8 +484,8 @@ def test_pipeline_character_identification_disabled(
             character_identification=False,
         )
 
-        # embed_faces should still be called
-        mock_embed_faces.assert_called_once()
+        # embed_faces should not be called
+        mock_embed_faces.assert_not_called()
 
         # Should process speech events even without face recognition
         assert mock_transcriber.transcribe_audio.call_count == len(speech_events)
@@ -591,8 +592,8 @@ def test_build_arg_parser_default_values(self):
         assert args.format == "srt"
         assert args.output == "output_captions"
         assert args.character_identification is True
-        assert args.known_faces_json == "example/known_faces.json"
-        assert args.embed_faces_json == "example/embed_faces.json"
+        assert args.known_faces_json == "known_faces.json"
+        assert args.embed_faces_json == "embed_faces.json"
         assert args.verbose is False
 
     def test_build_arg_parser_custom_values(self):
@@ -667,8 +668,8 @@ def test_main_with_default_args(
         mock_run_pipeline.assert_called_once_with(
             video_url_or_path="test_video.mp4",
             character_identification=True,
-            known_faces_json="example/known_faces.json",
-            embed_faces_json="example/embed_faces.json",
+            known_faces_json="known_faces.json",
+            embed_faces_json="embed_faces.json",
             caption_output_path="output_captions",
             caption_format="srt",
         )