Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 41 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,25 @@ A Python package for creating intelligent closed captions with face detection an
## Installation

```bash
pip install -e .
pip install captionalchemy
```

If you have a GPU and want to use hardware acceleration:

```bash
pip install captionalchemy[cuda]
```

### Prerequisites

- Python 3.10+
- FFmpeg (for video/audio processing)
- CUDA-capable GPU (optional, for acceleration)
- CUDA-capable GPU (optional; used for acceleration and highly recommended for diarization)
- Whisper.cpp support (optional, on macOS)

If using Whisper.cpp on macOS, follow the [Core ML installation instructions](https://github.com/ggml-org/whisper.cpp?tab=readme-ov-file#core-ml-support)
and clone the whisper.cpp repository into your working directory.

## Quick Start

1. **Set up environment variables** (create `.env` file):
Expand All @@ -45,9 +54,28 @@ pip install -e .
```

3. **Generate captions**:
```bash
captionalchemy video.mp4 -f srt -o my_captions
```

```bash
captionalchemy video.mp4 -f srt -o my_captions
```

Or, from a Python script:

```python
from dotenv import load_dotenv
from captionalchemy import caption

load_dotenv()

caption.run_pipeline(
video_url_or_path="path/to/your/video.mp4", # this can be a video URL or local file
character_identification=False, # True by default
known_faces_json="path/to/known_faces.json",
embed_faces_json="path/to/embed_faces.json", # name of the output file
caption_output_path="my_captions/output", # will write output to output.srt (or .vtt/.smi)
caption_format="srt"
)
```

## Usage

Expand Down Expand Up @@ -140,16 +168,13 @@ John Doe: Welcome to our presentation on quantum computing.
Jane Smith: Thanks John. Let's start with the basics.
```

## Development
## Development and Contributing

### Setup Development Environment

```bash
# Install in development mode
pip install -e .

# Install development dependencies
pip install -r requirements-dev.txt
pip install -e ".[dev]"
```

### Running Tests
Expand All @@ -164,8 +189,8 @@ pytest
# Linting
flake8

# Type checking
mypy src/
# Code formatting
black src/ tests/
```

## Requirements
Expand Down Expand Up @@ -205,3 +230,7 @@ MIT License - see LICENSE file for details.
- Check the logs with `-v` flag for detailed error information
- Ensure all dependencies are properly installed
- Verify video file format compatibility

```

```
2 changes: 1 addition & 1 deletion example/known_faces.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[
{
"name": "SpaceTime Dude",
"image_path": "images/spacetime_dude.png"
"image_path": "example/images/spacetime_dude.png"
}
]
14 changes: 12 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "captionalchemy"
version = "0.1.0"
dynamic = ["version"]
description = "A Python package to create closed captions with face detection and recognition."
readme = "README.md"
license = { text = "MIT" }
Expand All @@ -30,6 +30,7 @@ dependencies = [
"panns-inference==0.1.1",
"librosa==0.11.0",
"tqdm==4.67.1",
"torch==2.7.0",
]

classifiers = [
Expand All @@ -42,7 +43,16 @@ requires-python = ">=3.10"


[project.optional-dependencies]
dev = ["pytest==8.4.0", "mypy==1.16.0", "flake8==7.2.0", "black==25.1.0", "pytest-cov==6.2.1"]
dev = [
"pytest==8.4.0",
"mypy==1.16.0",
"flake8==7.2.0",
"black==25.1.0",
"pytest-cov==6.2.1",
"setuptools-scm==8.3.1",

]
cuda = ["onnxruntime-gpu==1.22.0"]

[tool.setuptools.packages.find]
where = ["src"]
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ onnxruntime==1.22.0
panns-inference==0.1.1
librosa==0.11.0
tqdm==4.67.1
torch==2.7.0
1 change: 0 additions & 1 deletion src/captionalchemy/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +0,0 @@
__version__ = "0.1.0"
31 changes: 18 additions & 13 deletions src/captionalchemy/caption.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@
def run_pipeline(
video_url_or_path: str,
character_identification: bool = True,
known_faces_json: str = "example/known_faces.json",
embed_faces_json: str = "example/embed_faces.json",
known_faces_json: str = "known_faces.json",
embed_faces_json: str = "embed_faces.json",
caption_output_path: str = "output_captions",
caption_format: Literal["vtt", "srt", "smi"] = "srt",
):
Expand All @@ -56,7 +56,8 @@ def run_pipeline(
caption_format: Format for output captions (srt, vtt, smi).
"""
logger.info("Embedding known faces...")
embed_faces(known_faces_json, embed_faces_json)
if character_identification:
embed_faces(known_faces_json, embed_faces_json)
video_manager = VideoManager(use_file_buffer=False)
if caption_format == "srt":
writer = SRTCaptionWriter()
Expand Down Expand Up @@ -92,17 +93,16 @@ def run_pipeline(
speech_segments = get_speech_segments(
audio_path, os.getenv("HF_AUTH_TOKEN", ""), device
)
logger.info(f"Speech segments detected: {speech_segments}")

logger.info("Detecting non-speech segments...")
non_speech_events = detect_non_speech_segments(audio_path, device=device)
print(non_speech_events)

if not speech_segments:
logger.warning("No speech segments detected. Exiting.")
return

# Diarize
logger.info("Running diarization...")
diarization_result = diarize(
audio_path
) # { "SPEAKER_00": {"start": 3.25409375, "end": 606.2990937500001}, ..., }
Expand Down Expand Up @@ -231,14 +231,14 @@ def _build_arg_parser() -> argparse.ArgumentParser:

parser.add_argument(
"--known-faces-json",
default="example/known_faces.json",
help="Path to JSON file listing known faces to embed (default: 'example/known_faces.json').",
default="known_faces.json",
help="Path to JSON file listing known faces to embed (default: 'known_faces.json').",
)

parser.add_argument(
"--embed-faces-json",
default="example/embed_faces.json",
help="JSON path to store face embeddings (default: 'example/embed_faces.json').",
default="embed_faces.json",
help="JSON path to store face embeddings (default: 'embed_faces.json').",
)

parser.add_argument(
Expand All @@ -253,10 +253,15 @@ def main():
parser = _build_arg_parser()
args = parser.parse_args()

if args.verbose:
logging.getLogger().setLevel(logging.DEBUG)
else:
logging.getLogger().setLevel(logging.INFO)
level = logging.DEBUG if args.verbose else logging.INFO

logging.basicConfig(
level=level,
format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
)

root_logger = logging.getLogger()
root_logger.setLevel(level)

logger = logging.getLogger("captionalchemy")
logger.info("Starting captionalchemy pipeline...")
Expand Down
13 changes: 10 additions & 3 deletions src/captionalchemy/tools/cv/embed_known_faces.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@


def embed_faces(
known_faces_json: str, output_embeddings_json: str = "embedded_faces.json"
known_faces_json: str, output_embeddings_json: str = "embed_faces.json"
) -> None:
"""
JSON should be structured as:
Expand All @@ -32,8 +32,15 @@ def embed_faces(
logger = logging.getLogger(__name__)
if not known_faces_json:
raise ValueError("known_faces.json must be provided.")
with open(known_faces_json, "r") as f:
known_list = json.load(f)
try:
with open(known_faces_json, "r") as f:
known_list = json.load(f)

except FileNotFoundError as e:
logger.error(f"Error loading known faces JSON: {e}")
raise FileNotFoundError(
f"Could not find the known faces JSON file: {known_faces_json}"
) from e

insightface.model_zoo.get_model
provider = (
Expand Down
15 changes: 8 additions & 7 deletions tests/test_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,8 +289,9 @@ def test_pipeline_no_speech_segments(
character_identification=False,
)

# Verify mock_embed_faces was not called
mock_embed_faces.assert_not_called()
# Verify early functions were called
mock_embed_faces.assert_called_once()
mock_vad.assert_called_once()

@patch("captionalchemy.caption.detect_non_speech_segments")
Expand Down Expand Up @@ -483,8 +484,8 @@ def test_pipeline_character_identification_disabled(
character_identification=False,
)

# embed_faces should still be called
mock_embed_faces.assert_called_once()
# embed_faces should not be called
mock_embed_faces.assert_not_called()

# Should process speech events even without face recognition
assert mock_transcriber.transcribe_audio.call_count == len(speech_events)
Expand Down Expand Up @@ -591,8 +592,8 @@ def test_build_arg_parser_default_values(self):
assert args.format == "srt"
assert args.output == "output_captions"
assert args.character_identification is True
assert args.known_faces_json == "example/known_faces.json"
assert args.embed_faces_json == "example/embed_faces.json"
assert args.known_faces_json == "known_faces.json"
assert args.embed_faces_json == "embed_faces.json"
assert args.verbose is False

def test_build_arg_parser_custom_values(self):
Expand Down Expand Up @@ -667,8 +668,8 @@ def test_main_with_default_args(
mock_run_pipeline.assert_called_once_with(
video_url_or_path="test_video.mp4",
character_identification=True,
known_faces_json="example/known_faces.json",
embed_faces_json="example/embed_faces.json",
known_faces_json="known_faces.json",
embed_faces_json="embed_faces.json",
caption_output_path="output_captions",
caption_format="srt",
)
Expand Down