diff --git a/.claude/agents/archivist.md b/.claude/agents/archivist.md new file mode 100644 index 0000000..6054b40 --- /dev/null +++ b/.claude/agents/archivist.md @@ -0,0 +1,67 @@ +--- +name: archivist +description: Metadata enrichment and curation. Use to enrich track metadata, review flagged conflicts, and curate album art quality. +model: sonnet +tools: + - Read + - Write + - Edit + - Glob + - Grep + - Bash +permissionMode: acceptEdits +skills: + - enrich +color: amber +maxTurns: 50 +memory: project +--- +# Archivist + +Metadata enrichment and curation agent for the Crate music library. + +## Role + +Curate and enrich track metadata using external APIs. Run the enrichment pipeline, review flagged conflicts, and help resolve uncertain matches interactively. + +## Capabilities + +- **Run enrichment**: Invoke `python tools/enrich_metadata.py` with appropriate flags +- **Review queue**: Read and walk through `review_queue.json` flagged tracks with the user +- **Apply corrections**: Edit `metadata_enriched.json` to apply chosen corrections +- **Re-enrich**: Re-run enrichment after manual corrections using `--resume` + +## Understanding the Pipeline + +### Confidence Scoring +- **>= 0.85**: Auto-accepted — fields updated directly +- **0.50–0.85**: Flagged for manual review +- **< 0.50**: Skipped — original metadata kept + +### Conflict Classifications +- `confirmed`: External data matches existing tags — no action needed +- `supplement`: Existing field was empty, external has data — auto-filled if confidence >= 0.50 +- `likely_correction`: Multiple sources disagree with existing tag — flagged with suggested correction +- `alternative`: One source disagrees — noted but existing kept + +### Artwork Selection +Album art scored 0–100 on resolution, source, type, and format. Only upgrades when new score exceeds old by > 10 points. + +## Process + +1. Check if `metadata_base.json` exists in the metadata directory +2. 
Run enrichment: `python tools/enrich_metadata.py --input metadata/metadata_base.json --output metadata/` +3. Review `metadata/review_queue.json` — present each flagged item to the user +4. For each flagged track, show existing vs suggested values and let the user choose +5. Apply corrections to `metadata/metadata_enriched.json` +6. If corrections were made, offer to re-run with `--resume` to fill remaining gaps + +## On Blockers + +If the MusicBrainz API is unreachable, the script falls back to offline mode (copies base metadata as-is with `status: skipped`). Report this and suggest retrying later. + +## Constraints + +- **Respect rate limits**: Never bypass the 1 req/sec MusicBrainz limit +- **Don't auto-apply review items**: Always present flagged tracks to the user for decision +- **Keep originals**: Never delete or overwrite `metadata_base.json` diff --git a/.claude/skills/enrich.md b/.claude/skills/enrich.md new file mode 100644 index 0000000..7f983ef --- /dev/null +++ b/.claude/skills/enrich.md @@ -0,0 +1,140 @@ +# Enrich — Metadata Enrichment Pipeline + +## When to Use + +- After adding new music to the library +- When tracks have incomplete or incorrect metadata +- To fetch album art for tracks missing artwork +- Periodically to re-enrich with improved matching + +## Pipeline + +``` +./tools/pipeline.sh [/path/to/new/music] +``` + +That single command handles everything: + +``` +[extract] → [upload] → [enrich] → [publish] +``` + +| Step | What it does | When it runs | +|------|-------------|--------------| +| Extract | Scans audio files for ID3/Vorbis tags | Only with a path argument | +| Upload | Uploads new audio to S3 | Only with a path argument | +| Enrich | Queries MusicBrainz + Cover Art Archive | Always | +| Publish | Uploads artwork to S3, pushes manifest | Always (unless `--skip-publish`) | + +## Common Usage + +```bash +# Re-enrich entire library (idempotent — skips already-processed tracks) +./tools/pipeline.sh + +# Add new music and enrich 
everything +./tools/pipeline.sh /path/to/new/tracks + +# Preview what enrichment would do (writes dry_run_report.json) +./tools/pipeline.sh --dry-run + +# Apply a previous dry run (reads cached results, no re-querying) +./tools/pipeline.sh + +# Re-process everything from scratch +./tools/pipeline.sh --no-resume + +# Limit to first N tracks (useful for testing) +./tools/pipeline.sh --limit 10 +``` + +## Options + +| Flag | Effect | +|------|--------| +| `--dry-run` | Preview matches, write `dry_run_report.json`, don't modify anything | +| `--skip-publish` | Enrich locally but don't push to S3 | +| `--skip-upload` | Skip uploading new audio files | +| `--skip-artwork` | Skip album art fetching | +| `--no-resume` | Re-process all tracks from scratch | +| `--limit N` | Only process first N tracks | + +## How It Works + +### Matching +1. Searches MusicBrainz by `artist + title`, then `artist + album`, then `title only` +2. Scores candidates (0.0–1.0) using weighted field similarity +3. Thresholds: **>= 0.85** auto-accept, **0.50–0.85** flag for review, **< 0.50** skip + +### Dry-Run → Real Run +- `--dry-run` saves all match results to `dry_run_report.json` +- A subsequent real run loads cached results — zero API re-queries +- After applying, the report is deleted + +### Resume +- `.enrichment_state.json` tracks processed track IDs +- `--resume` (on by default) skips already-processed tracks +- When resuming, reads from `metadata_enriched.json` to preserve prior work + +## Output Files + +| File | Purpose | +|------|---------| +| `metadata/metadata_enriched.json` | Full metadata with enrichment data per track | +| `metadata/review_queue.json` | Tracks needing human review | +| `metadata/dry_run_report.json` | Dry-run results (consumed by next real run) | +| `metadata/.enrichment_state.json` | Resume checkpoint | +| `metadata/manifest_enriched.json` | Clean manifest built during publish | +| `metadata/artwork/*_enriched.jpg` | Downloaded album art | + +## Review Queue + 
+Tracks are flagged for review when: +- Match confidence is between 0.50 and 0.85 +- Multiple sources disagree with existing tags (`likely_correction`) +- Multiple high-confidence candidates disagree with each other +- Album art upgrade available when existing art is present +- Track has neither artist nor title + +Use the **archivist agent** to walk through flagged tracks interactively. + +## Conflict Classifications + +| Classification | Meaning | Action | +|---------------|---------|--------| +| `confirmed` | External data matches existing | No change | +| `supplement` | Empty field filled | Auto-filled | +| `likely_correction` | Multiple sources disagree with tag | Flagged | +| `alternative` | One source offers different value | Noted, kept existing | + +## Individual Scripts + +For fine-grained control, run scripts directly: + +```bash +# Enrich only +python tools/enrich_metadata.py --input metadata/manifest.json --output metadata/ --resume + +# Publish only (after manual edits to metadata_enriched.json) +python tools/publish_manifest.py --metadata-dir metadata/ + +# Extract only +python tools/extract_metadata.py /path/to/audio --output metadata/ +``` + +## Rate Limits + +- MusicBrainz: 1 req/sec (enforced) +- Cover Art Archive: 1 req/sec (enforced) +- Full run: ~2-3 seconds per track +- 118 tracks ≈ 4-6 minutes + +## Troubleshooting + +| Issue | Solution | +|-------|----------| +| "MusicBrainz API is unreachable" | Check internet; falls back to offline mode | +| Many "no match" results | Tracks may have poor/missing metadata | +| Interrupted mid-run | Just re-run — `--resume` is default | +| Want to re-process one track | Remove its ID from `.enrichment_state.json` | +| Artwork not showing in app | Check CloudFront invalidation completed | diff --git a/.gitignore b/.gitignore index b4ce6c0..20ac907 100644 --- a/.gitignore +++ b/.gitignore @@ -159,6 +159,14 @@ fffff.at-archive/ # Artwork is in S3, not git metadata/artwork/ +# Enrichment pipeline output 
(regenerated by pipeline.sh) +metadata/manifest.json +metadata/metadata_enriched.json +metadata/manifest_enriched.json +metadata/review_queue.json +metadata/.enrichment_state.json +metadata/dry_run_report.json + # Local dev manifest (copy from production for testing) www/manifest.json diff --git a/CHANGELOG.md b/CHANGELOG.md index cbe5bc4..6a06e75 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,19 @@ # Changelog +## [0.2.0] - 2026-02-15T20:41:35-05:00 + +### Added +- Metadata enrichment pipeline via MusicBrainz, Cover Art Archive, and iTunes Search API +- Single entrypoint `pipeline.sh` for extract, upload, enrich, and publish steps +- Confidence-based matching with auto-accept, review, and skip thresholds +- Resume and dry-run support for idempotent re-runs +- Publish step uploads artwork to S3 and pushes enriched manifest +- Generative CSS gradient backgrounds for tracks without album artwork +- Archivist agent and enrich skill for future curation workflows + +### Changed +- `batch_upload.py` accepts enriched metadata format with `--enriched` flag + ## [0.1.0] - 2026-02-14T01:12:36+00:00 ### Added diff --git a/VERSION b/VERSION index 6e8bf73..0ea3a94 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.1.0 +0.2.0 diff --git a/tools/batch_upload.py b/tools/batch_upload.py index 8bf40a7..6e50a4e 100644 --- a/tools/batch_upload.py +++ b/tools/batch_upload.py @@ -20,6 +20,7 @@ AWS_PROFILE = os.environ.get('AWS_PROFILE', 'default') TRACKS_BUCKET = os.environ.get('TRACKS_BUCKET', '') METADATA_FILE = 'metadata_base.json' +ENRICHED_METADATA_FILE = 'metadata_enriched.json' MANIFEST_FILE = 'manifest.json' @@ -59,11 +60,35 @@ def get_content_type(filepath: Path) -> str: return types.get(ext, 'application/octet-stream') -def load_metadata(metadata_dir: Path) -> dict: - """Load metadata_base.json.""" +def load_metadata(metadata_dir: Path, enriched: bool = False) -> dict: + """Load metadata JSON. Prefers enriched if --enriched flag is set. 
+ + Normalizes tracks to dict format regardless of input shape (list or dict). + """ + if enriched: + enriched_file = metadata_dir / ENRICHED_METADATA_FILE + if enriched_file.exists(): + print(f"Using enriched metadata: {enriched_file}") + with open(enriched_file) as f: + data = json.load(f) + return _normalize_tracks(data) + print(f"Enriched metadata not found, falling back to base") metadata_file = metadata_dir / METADATA_FILE with open(metadata_file) as f: - return json.load(f) + data = json.load(f) + return _normalize_tracks(data) + + +def _normalize_tracks(data: dict) -> dict: + """Ensure tracks is a dict keyed by path/id (handles manifest list format).""" + tracks = data.get('tracks', {}) + if isinstance(tracks, list): + tracks_dict = {} + for track in tracks: + key = track.get('path') or track.get('original_path') or track['id'] + tracks_dict[key] = track + data['tracks'] = tracks_dict + return data def save_metadata(metadata_dir: Path, metadata: dict): @@ -145,12 +170,17 @@ def main(): action='store_true', help='Skip uploading artwork files' ) + parser.add_argument( + '--enriched', + action='store_true', + help='Use metadata_enriched.json instead of metadata_base.json' + ) args = parser.parse_args() # Load metadata print(f"Loading metadata from {args.metadata_dir}...") - metadata = load_metadata(args.metadata_dir) + metadata = load_metadata(args.metadata_dir, enriched=args.enriched) total_tracks = len(metadata['tracks']) print(f"Found {total_tracks} tracks in metadata") diff --git a/tools/enrich_metadata.py b/tools/enrich_metadata.py new file mode 100644 index 0000000..7a33787 --- /dev/null +++ b/tools/enrich_metadata.py @@ -0,0 +1,1123 @@ +#!/usr/bin/env python3 +""" +Crate Metadata Enrichment Pipeline + +Enriches track metadata using free external APIs (MusicBrainz, Cover Art Archive). +Auto-accepts high-confidence matches and flags uncertain ones for manual review. 
class RateLimiter:
    """Minimum-interval rate limiter for one API.

    Every call to :meth:`wait` is spaced at least ``1 / rate`` seconds after
    the previous one. This is a fixed-spacing limiter, not a token bucket:
    unused time does not accumulate into a burst allowance.

    Args:
        rate: Maximum number of calls per second. Must be positive.

    Raises:
        ValueError: If ``rate`` is zero or negative.
    """

    def __init__(self, rate: float = 1.0):
        if rate <= 0:
            raise ValueError(f"rate must be positive, got {rate!r}")
        self._rate = rate
        self._lock = threading.Lock()
        # -inf means "never called": the first wait() then never sleeps,
        # whatever the current value of the monotonic clock is.
        self._last_call = float('-inf')

    def wait(self):
        """Block until the next call is allowed, then record the call time.

        The sleep happens while holding the lock, so concurrent callers are
        released one at a time, each spaced by the minimum interval.
        """
        with self._lock:
            now = time.monotonic()
            elapsed = now - self._last_call
            wait_time = (1.0 / self._rate) - elapsed
            if wait_time > 0:
                time.sleep(wait_time)
            self._last_call = time.monotonic()
def normalize(s: str) -> str:
    """Normalize a string for comparison.

    Lower-cases the text, drops the featuring markers ``feat.``, ``ft.`` and
    ``featuring``, and collapses runs of whitespace to single spaces.
    Parenthetical text is kept; only the marker itself is removed.

    Markers are matched only when they start a word, so ordinary words that
    merely end in the same letters ("Left.", "Defeat.") are left intact.

    Args:
        s: Text to normalize. ``None`` or an empty string is accepted.

    Returns:
        The normalized string, or ``""`` for empty input.
    """
    # Local import keeps this block self-contained; `re` caches the pattern.
    import re

    if not s:
        return ""
    s = s.lower().strip()
    # (?<!\w) requires the marker to begin a word; "featuring" must also end one.
    s = re.sub(r'(?<!\w)(?:feat\.|ft\.|featuring(?!\w))', ' ', s)
    # Collapse whitespace
    return ' '.join(s.split())
class MatchScorer:
    """Scores candidate matches against existing track metadata."""

    def score(self, existing: dict, candidate: dict, source_count: int = 1) -> float:
        """Compute weighted match score (0.0–1.0).

        Each field in ``FIELD_WEIGHTS`` contributes ``weight * similarity``.
        ``year`` and ``duration`` use numeric tolerance bands; every other
        field uses ``string_similarity``. A field missing on either side
        contributes nothing.

        Args:
            existing: Current track metadata.
            candidate: Candidate metadata from an external source.
            source_count: Number of sources agreeing on the candidate. Two
                sources add 0.05 and three or more add 0.10, capped at 1.0.

        Returns:
            The score rounded to four decimal places.
        """
        total = 0.0
        for field, weight in FIELD_WEIGHTS.items():
            existing_val = existing.get(field)
            candidate_val = candidate.get(field)

            if field == 'year':
                similarity = self._year_similarity(existing_val, candidate_val)
            elif field == 'duration':
                similarity = self._duration_similarity(existing_val, candidate_val)
            else:
                similarity = string_similarity(
                    str(existing_val) if existing_val else '',
                    str(candidate_val) if candidate_val else '',
                )
            total += weight * similarity

        # Multi-source bonus
        if source_count >= 3:
            total = min(1.0, total + 0.10)
        elif source_count >= 2:
            total = min(1.0, total + 0.05)

        return round(total, 4)

    @staticmethod
    def _coerce_year(value) -> Optional[int]:
        """Return ``value`` as an integer year, or ``None`` if it has none.

        Accepts ints, numeric strings, and date-like strings whose first four
        characters are digits (for example ``"2004-05-01"``).
        """
        if value is None:
            return None
        try:
            return int(value)
        except (ValueError, TypeError):
            pass
        head = str(value).strip()[:4]
        if len(head) == 4 and head.isdigit():
            return int(head)
        return None

    @staticmethod
    def _coerce_seconds(value) -> Optional[int]:
        """Return ``value`` as whole seconds, or ``None`` if not numeric.

        Going through ``float`` first lets decimal strings such as
        ``"215.4"`` parse; the fraction is truncated.
        """
        if value is None:
            return None
        try:
            return int(float(value))
        except (ValueError, TypeError, OverflowError):
            return None

    @staticmethod
    def _year_similarity(a, b) -> float:
        """Score two years: same 1.0, off by one 0.8, within three 0.4, else 0.0."""
        year_a = MatchScorer._coerce_year(a)
        year_b = MatchScorer._coerce_year(b)
        if year_a is None or year_b is None:
            return 0.0
        diff = abs(year_a - year_b)
        if diff == 0:
            return 1.0
        if diff == 1:
            return 0.8
        if diff <= 3:
            return 0.4
        return 0.0

    @staticmethod
    def _duration_similarity(a, b) -> float:
        """Score two durations in seconds: within 2s 1.0, 5s 0.7, 10s 0.3, else 0.0."""
        secs_a = MatchScorer._coerce_seconds(a)
        secs_b = MatchScorer._coerce_seconds(b)
        if secs_a is None or secs_b is None:
            return 0.0
        diff = abs(secs_a - secs_b)
        if diff <= 2:
            return 1.0
        if diff <= 5:
            return 0.7
        if diff <= 10:
            return 0.3
        return 0.0
class MusicBrainzClient:
    """Primary lookup via MusicBrainz recording search.

    Args:
        limiter: Rate limiter whose ``wait()`` is called before every request.
    """

    def __init__(self, limiter: "RateLimiter"):
        self._limiter = limiter
        self._session = requests.Session()
        self._session.headers.update({
            'User-Agent': MB_USER_AGENT,
            'Accept': 'application/json',
        })

    def ping(self) -> bool:
        """Check if MusicBrainz API is reachable.

        Returns:
            True when a minimal recording search answers with HTTP 200,
            False on any other status or on a request error.
        """
        try:
            self._limiter.wait()
            resp = self._session.get(f"{MB_BASE_URL}/recording", params={
                'query': 'test', 'limit': 1, 'fmt': 'json',
            }, timeout=10)
            return resp.status_code == 200
        except requests.RequestException:
            return False

    @staticmethod
    def _escape_phrase(value) -> str:
        """Escape ``value`` for use inside a double-quoted Lucene phrase.

        Backslashes are doubled first, then double quotes are escaped, so a
        title containing ``"`` cannot terminate the phrase early.
        """
        return str(value).replace('\\', '\\\\').replace('"', '\\"')

    def search_recordings(self, artist: str = None, title: str = None,
                          album: str = None) -> List[dict]:
        """Search MusicBrainz for recordings matching the given fields.

        Strategies are tried in order and the search stops at the first one
        that returns candidates: artist + title, then artist + album, then
        title only.

        Args:
            artist: Artist name, or None/empty to skip artist-based queries.
            title: Track title, or None/empty to skip title-based queries.
            album: Album name, used only by the artist + album fallback.

        Returns:
            A list of candidate dicts with normalized fields (possibly empty).
        """
        esc = self._escape_phrase
        candidates = []

        # Strategy 1: artist + title (primary)
        if artist and title:
            candidates.extend(
                self._query(f'artist:"{esc(artist)}" AND recording:"{esc(title)}"')
            )

        # Strategy 2: artist + album (fallback)
        if artist and album and not candidates:
            candidates.extend(
                self._query(f'artist:"{esc(artist)}" AND release:"{esc(album)}"')
            )

        # Strategy 3: title only (last resort)
        if title and not candidates:
            candidates.extend(self._query(f'recording:"{esc(title)}"'))

        return candidates

    @staticmethod
    def _candidate_from_recording(rec: dict) -> dict:
        """Convert one MusicBrainz recording object into a candidate dict.

        Only the first artist credit and the first release are used.
        """
        candidate = {
            'title': rec.get('title'),
            'artist': None,
            'album': None,
            'year': None,
            'duration': None,
            'mb_recording_id': rec.get('id'),
            'mb_release_id': None,
            'mb_score': rec.get('score', 0),
            'source': 'musicbrainz',
        }

        # Artist from artist-credit
        credits = rec.get('artist-credit', [])
        if credits:
            first = credits[0]
            candidate['artist'] = first.get('name') or (first.get('artist') or {}).get('name')

        # Duration in seconds (MusicBrainz reports milliseconds)
        length_ms = rec.get('length')
        if length_ms:
            candidate['duration'] = length_ms // 1000

        # Release info (first release)
        releases = rec.get('releases', [])
        if releases:
            rel = releases[0]
            candidate['album'] = rel.get('title')
            candidate['mb_release_id'] = rel.get('id')
            date = rel.get('date', '')
            if date and len(date) >= 4:
                try:
                    candidate['year'] = int(date[:4])
                except ValueError:
                    pass

        return candidate

    def _query(self, query: str, limit: int = 5) -> List[dict]:
        """Execute a MusicBrainz recording search query.

        Args:
            query: Lucene query string, already escaped.
            limit: Maximum number of recordings to request.

        Returns:
            Candidate dicts, or an empty list on a non-200 response, a
            request error, or a body that is not valid JSON.
        """
        self._limiter.wait()
        try:
            resp = self._session.get(f"{MB_BASE_URL}/recording", params={
                'query': query, 'limit': limit, 'fmt': 'json',
            }, timeout=15)
            if resp.status_code != 200:
                return []
            data = resp.json()
        except (requests.RequestException, ValueError):
            # ValueError covers json.JSONDecodeError and other JSON parse errors.
            return []

        return [self._candidate_from_recording(rec) for rec in data.get('recordings', [])]
+ """ + if not release_id: + return None + + self._limiter.wait() + try: + resp = self._session.get( + f"{CAA_BASE_URL}/release/{release_id}", + timeout=15, + allow_redirects=True, + ) + if resp.status_code != 200: + return None + data = resp.json() + except (requests.RequestException, json.JSONDecodeError): + return None + + images = data.get('images', []) + if not images: + return None + + # Prefer front cover + best = None + for img in images: + if img.get('front', False): + best = img + break + if not best: + best = images[0] + + # Get the best thumbnail URL (prefer 1200, fall back to large, then original) + thumbnails = best.get('thumbnails', {}) + url = thumbnails.get('1200') or thumbnails.get('large') or best.get('image') + if not url: + return None + + # Estimate width from thumbnail key + width = 1200 if '1200' in str(thumbnails.get('1200', '')) else 500 + + art_type = 'front' if best.get('front', False) else 'unknown' + fmt = 'jpeg' if url.endswith('.jpg') or url.endswith('.jpeg') else 'png' + + return { + 'url': url, + 'width': width, + 'type': art_type, + 'format': fmt, + 'source': 'coverartarchive', + } + + def download_art(self, url: str, output_path: Path) -> bool: + """Download artwork to a file. 
class ITunesArtworkClient:
    """Fetches album art via iTunes Search API as a fallback source.

    Args:
        limiter: Rate limiter whose ``wait()`` is called before each search.
    """

    def __init__(self, limiter: "RateLimiter"):
        self._limiter = limiter
        self._session = requests.Session()
        self._session.headers.update({
            'User-Agent': MB_USER_AGENT,
            'Accept': 'application/json',
        })

    def search_artwork(self, artist: str, title: str) -> Optional[dict]:
        """Search iTunes for artwork matching artist and title.

        Args:
            artist: Artist name; required.
            title: Track title; required.

        Returns:
            Dict with 'url', 'width', 'type', 'format', 'source' for the first
            result whose artist is similar enough, or None when either input
            is empty, the request fails, or no result qualifies.
        """
        if not artist or not title:
            return None

        self._limiter.wait()
        try:
            # Build search query
            query = f"{artist} {title}"
            resp = self._session.get(
                "https://itunes.apple.com/search",
                params={
                    'term': query,
                    'media': 'music',
                    'limit': 3,
                },
                timeout=15,
            )
            if resp.status_code != 200:
                return None
            data = resp.json()
        except (requests.RequestException, ValueError):
            # ValueError covers json.JSONDecodeError and other JSON parse errors.
            return None

        # Find best matching result (validate artist similarity)
        for result in data.get('results', []):
            result_artist = result.get('artistName', '')
            if not result_artist:
                continue

            # Reject results whose artist is too different from the query
            if string_similarity(artist, result_artist) < 0.5:
                continue

            artwork_url = result.get('artworkUrl100')
            if not artwork_url:
                continue

            # Replace 100x100 with 1200x1200 to request the large rendition
            return {
                'url': artwork_url.replace('100x100', '1200x1200'),
                'width': 1200,
                'type': 'front',
                'format': 'jpeg',
                'source': 'itunes',
            }

        return None

    def download_art(self, url: str, output_path: Path) -> bool:
        """Download artwork to a file. Returns True on success.

        The body is streamed into a sibling ``<name>.part`` file and moved
        over ``output_path`` only after the download completes, so a failed
        download never truncates or replaces an existing artwork file. The
        streamed response is always closed.

        Args:
            url: Artwork URL.
            output_path: Final destination of the image.

        Returns:
            True when the file was written, False on a non-200 response or a
            request error.
        """
        target = Path(output_path)
        partial = target.with_name(target.name + '.part')
        try:
            resp = self._session.get(url, timeout=30, stream=True)
            try:
                if resp.status_code != 200:
                    return False
                with open(partial, 'wb') as f:
                    for chunk in resp.iter_content(8192):
                        f.write(chunk)
            finally:
                # stream=True keeps the connection open until closed explicitly.
                resp.close()
            partial.replace(target)
            return True
        except requests.RequestException:
            if partial.exists():
                partial.unlink()
            return False
updates} + + # Score candidates + scored = [] + for cand in candidates: + score = scorer.score(track, cand) + scored.append((score, cand)) + scored.sort(key=lambda x: x[0], reverse=True) + + best_score, best_candidate = scored[0] + enrichment['match_confidence'] = best_score + enrichment['source'] = best_candidate.get('source', 'musicbrainz') + + # Check for multiple high-confidence disagreements + if len(scored) >= 2: + second_score, second_candidate = scored[1] + if second_score >= REVIEW_THRESHOLD and best_score - second_score < 0.10: + if normalize(str(best_candidate.get('title', ''))) != normalize(str(second_candidate.get('title', ''))): + review_reasons.append('multiple_high_confidence_disagree') + + # Apply threshold decisions + if best_score >= AUTO_ACCEPT_THRESHOLD: + enrichment['status'] = 'auto_accepted' + elif best_score >= REVIEW_THRESHOLD: + enrichment['status'] = 'review_needed' + review_reasons.append('confidence_between_0.50_and_0.85') + else: + enrichment['status'] = 'below_threshold' + return {'enrichment': enrichment, 'review': None, 'updates': updates} + + # Resolve conflicts per field + for field in ['artist', 'title', 'album', 'year', 'genre']: + existing_val = track.get(field) + enriched_val = best_candidate.get(field) + + resolution = resolver.resolve(field, existing_val, enriched_val) + classification = resolution['classification'] + + if classification == 'confirmed': + enrichment['fields_confirmed'].append(field) + elif classification == 'supplement' and best_score >= REVIEW_THRESHOLD: + if not dry_run: + updates[field] = resolution['value'] + enrichment['fields_updated'].append(field) + elif classification == 'likely_correction': + enrichment['conflicts'].append({ + 'field': field, + 'existing': resolution['existing'], + 'suggested': resolution['suggested'], + 'similarity': resolution.get('similarity'), + }) + review_reasons.append(f'likely_correction:{field}') + elif classification == 'alternative': + enrichment['conflicts'].append({ + 
'field': field, + 'existing': existing_val, + 'alternative': resolution.get('alternative'), + }) + + # Auto-accept updates for high-confidence matches + if best_score >= AUTO_ACCEPT_THRESHOLD and classification == 'supplement': + pass # already handled above + if best_score >= AUTO_ACCEPT_THRESHOLD and classification in ('likely_correction', 'alternative'): + # Even auto-accepted tracks flag corrections for review + pass + + # Store MB IDs for reference + enrichment['mb_recording_id'] = best_candidate.get('mb_recording_id') + enrichment['mb_release_id'] = best_candidate.get('mb_release_id') + + # Artwork enrichment + art_info = None + if not skip_artwork and best_candidate.get('mb_release_id'): + art_info = caa_client.get_cover_art(best_candidate['mb_release_id']) + + # Fallback to iTunes if CAA has no art + if not skip_artwork and not art_info and artist and title: + art_info = itunes_client.search_artwork(artist, title) + + if not skip_artwork and art_info: + new_score = artwork_selector.score_artwork( + width=art_info.get('width', 0), + source=art_info.get('source', 'coverartarchive'), + art_type=art_info.get('type', 'unknown'), + fmt=art_info.get('format', 'jpeg'), + ) + + existing_art_score = 0 + if track.get('artwork_path'): + existing_art_score = artwork_selector.score_artwork( + width=500, source='existing', art_type='front', fmt='jpeg', + ) + + should_upgrade = artwork_selector.should_upgrade(existing_art_score, new_score) + enrichment['artwork'] = { + 'available': True, + 'new_score': new_score, + 'existing_score': existing_art_score, + 'upgrade': should_upgrade, + 'source': art_info.get('source', 'unknown'), + } + + if should_upgrade and track.get('artwork_path'): + review_reasons.append('artwork_upgrade_with_existing') + + if should_upgrade and not dry_run: + art_ext = 'jpg' if art_info['format'] in ('jpeg', 'jpg') else 'png' + art_path = output_dir / 'artwork' / f"{track_id}_enriched.{art_ext}" + art_path.parent.mkdir(parents=True, exist_ok=True) + + # 
def check_connectivity(mb_client: "MusicBrainzClient") -> bool:
    """Report whether the MusicBrainz API answers a ping.

    Prints a progress line to stdout, then either a success line to stdout
    or a failure line to stderr.

    Args:
        mb_client: Client whose ``ping()`` result decides the outcome.

    Returns:
        True if ``ping()`` is truthy, otherwise False.
    """
    print("Checking MusicBrainz connectivity...")
    reachable = mb_client.ping()
    if not reachable:
        print(" MusicBrainz API is unreachable.", file=sys.stderr)
        return False
    print(" MusicBrainz API is reachable.")
    return True
+ """ + tracks = raw.get('tracks', {}) + if isinstance(tracks, list): + # manifest.json format — convert list to dict keyed by track id + tracks_dict = {} + for track in tracks: + key = track.get('path') or track['id'] + tracks_dict[key] = track + # Ensure required fields exist + track.setdefault('original_filename', key.split('/')[-1]) + raw['tracks'] = tracks_dict + return raw, 'manifest' + return raw, 'metadata_base' + + +def _denormalize_output(metadata: dict, fmt: str) -> dict: + """Convert back to original format for output.""" + if fmt == 'manifest': + output = dict(metadata) + output['tracks'] = list(metadata['tracks'].values()) + output['generated'] = datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z') + return output + return metadata + + +def run_offline_fallback(input_path: Path, output_dir: Path): + """Copy base metadata as enriched with status=skipped.""" + print("Running in offline mode — copying base metadata as-is.") + with open(input_path) as f: + raw = json.load(f) + + metadata, fmt = _normalize_input(raw) + + for track in metadata['tracks'].values(): + track['enrichment'] = { + 'status': 'skipped', + 'reason': 'offline', + 'timestamp': datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z'), + } + + output = _denormalize_output(metadata, fmt) + out_path = output_dir / 'metadata_enriched.json' + with open(out_path, 'w') as f: + json.dump(output, f, indent=2) + print(f"Wrote {out_path} (offline fallback, {len(metadata['tracks'])} tracks)") + + +def main(): + parser = argparse.ArgumentParser( + description='Enrich track metadata using external APIs' + ) + parser.add_argument( + '--input', type=Path, required=True, + help='Path to metadata_base.json', + ) + parser.add_argument( + '--output', type=Path, required=True, + help='Output directory for enriched metadata and artwork', + ) + parser.add_argument( + '--resume', action='store_true', + help='Resume from previous enrichment state', + ) + parser.add_argument( + '--dry-run', 
action='store_true', + help='Score and match without writing updates', + ) + parser.add_argument( + '--skip-artwork', action='store_true', + help='Skip album art fetching', + ) + parser.add_argument( + '--discogs-token', type=str, default=os.environ.get('DISCOGS_TOKEN'), + help='Discogs API token (optional, for future use)', + ) + parser.add_argument( + '--lastfm-key', type=str, default=os.environ.get('LASTFM_API_KEY'), + help='Last.fm API key (optional, for future use)', + ) + parser.add_argument( + '--limit', type=int, default=0, + help='Limit number of tracks to process (0 = unlimited)', + ) + + args = parser.parse_args() + + # Validate input + if not args.input.exists(): + print(f"Error: Input file not found: {args.input}", file=sys.stderr) + return 1 + + args.output.mkdir(parents=True, exist_ok=True) + + # When resuming, prefer the enriched output (preserves prior work) + enriched_path = args.output / 'metadata_enriched.json' + if args.resume and enriched_path.exists(): + print(f"Resuming from enriched output: {enriched_path}") + with open(enriched_path) as f: + raw = json.load(f) + else: + print(f"Loading metadata from {args.input}...") + with open(args.input) as f: + raw = json.load(f) + + metadata, input_format = _normalize_input(raw) + total_tracks = len(metadata.get('tracks', {})) + print(f"Found {total_tracks} tracks (format: {input_format})") + + # Initialize clients + mb_limiter = RateLimiter(rate=1.0) + mb_client = MusicBrainzClient(mb_limiter) + + # Connectivity check + if not check_connectivity(mb_client): + run_offline_fallback(args.input, args.output) + return 0 + + caa_limiter = RateLimiter(rate=1.0) + caa_client = CoverArtArchiveClient(caa_limiter) + itunes_limiter = RateLimiter(rate=1.0) + itunes_client = ITunesArtworkClient(itunes_limiter) + scorer = MatchScorer() + resolver = ConflictResolver() + artwork_selector = ArtworkSelector() + + # Resume state + state_file = args.output / '.enrichment_state.json' + state = EnrichmentState(state_file) 
if args.resume else EnrichmentState(state_file) + if args.resume: + print(f"Resuming: {state.count} tracks already processed") + + # Load dry-run report if available (replay cached results instead of re-querying) + dry_run_cache = {} + if not args.dry_run: + report_path = args.output / 'dry_run_report.json' + if report_path.exists(): + try: + with open(report_path) as f: + report = json.load(f) + if report.get('mode') == 'dry_run': + for entry in report.get('tracks', []): + tid = entry.get('track_id') + if tid: + dry_run_cache[tid] = entry + print(f"Loaded dry-run report: {len(dry_run_cache)} cached results") + except (json.JSONDecodeError, KeyError): + pass + + # Prepare track list + track_items = list(metadata['tracks'].items()) + if args.limit > 0: + track_items = track_items[:args.limit] + + # Process tracks + review_queue = [] + dry_run_results = [] # Collect proposed changes for dry-run report + processed = 0 + skipped = 0 + auto_accepted = 0 + flagged = 0 + no_match = 0 + from_cache = 0 + + for i, (file_key, track) in enumerate(track_items): + track_id = track.get('id', '') + + # Skip already processed (resume) + if args.resume and state.is_processed(track_id): + skipped += 1 + continue + + print(f"[{i+1}/{len(track_items)}] {track.get('original_filename', track_id)[:60]}...") + + # Check dry-run cache first (avoid re-querying APIs) + cached = dry_run_cache.get(track_id) if not args.dry_run else None + if cached: + enrichment = cached['enrichment'] + enrichment['timestamp'] = datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z') + updates = dict(cached.get('proposed_updates', {})) + + # Artwork was skipped during dry-run — download now if upgrade was flagged + art_info = enrichment.get('artwork', {}) + if (not args.skip_artwork and art_info.get('upgrade') + and enrichment.get('mb_release_id') + and 'artwork_path' not in updates): + caa_art = caa_client.get_cover_art(enrichment['mb_release_id']) + if caa_art: + art_ext = 'jpg' if caa_art['format'] in 
('jpeg', 'jpg') else 'png' + art_path = args.output / 'artwork' / f"{track_id}_enriched.{art_ext}" + art_path.parent.mkdir(parents=True, exist_ok=True) + if caa_client.download_art(caa_art['url'], art_path): + updates['artwork_path'] = str(art_path) + enrichment.setdefault('fields_updated', []).append('artwork') + + result = { + 'enrichment': enrichment, + 'updates': updates, + 'review': cached.get('review'), + } + from_cache += 1 + else: + try: + result = enrich_track( + track, mb_client, caa_client, itunes_client, scorer, resolver, + artwork_selector, args.output, + skip_artwork=args.skip_artwork, + dry_run=args.dry_run, + ) + except Exception as e: + print(f" Error: {e}", file=sys.stderr) + metadata['tracks'][file_key]['enrichment'] = { + 'status': 'error', + 'error': str(e), + 'timestamp': datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z'), + } + state.mark_processed(track_id) + continue + + # Apply enrichment + enrichment = result['enrichment'] + metadata['tracks'][file_key]['enrichment'] = enrichment + + # Apply field updates (unless dry-run) + if not args.dry_run: + for field, value in result['updates'].items(): + metadata['tracks'][file_key][field] = value + + # Collect dry-run proposed changes (store full result for replay) + if args.dry_run: + dry_run_results.append({ + 'track_id': track_id, + 'filename': track.get('original_filename', ''), + 'enrichment': enrichment, + 'proposed_updates': result['updates'], + 'review': result['review'], + }) + + # Collect review items + if result['review']: + review_queue.append(result['review']) + flagged += 1 + + # Stats + status = enrichment.get('status', '') + if status == 'auto_accepted': + auto_accepted += 1 + conf = enrichment.get('match_confidence', 0) + updated = enrichment.get('fields_updated', []) + print(f" ✓ Auto-accepted (confidence: {conf:.2f}, updated: {updated})") + elif status == 'review_needed': + conf = enrichment.get('match_confidence', 0) + print(f" ? 
Review needed (confidence: {conf:.2f})") + elif status == 'no_match': + no_match += 1 + print(f" - No match found") + elif status == 'no_metadata': + print(f" ! No artist or title — flagged") + elif status == 'below_threshold': + no_match += 1 + conf = enrichment.get('match_confidence', 0) + print(f" - Below threshold (confidence: {conf:.2f})") + + state.mark_processed(track_id) + processed += 1 + + # Checkpoint every 25 tracks + if processed % 25 == 0: + if args.dry_run: + _save_dry_run_report(dry_run_results, args.output) + else: + _save_checkpoint(metadata, review_queue, state, args.output, input_format) + print(f" Checkpoint saved ({processed} processed)") + + # Final save + if args.dry_run: + _save_dry_run_report(dry_run_results, args.output) + else: + _save_checkpoint(metadata, review_queue, state, args.output, input_format) + # Clean up dry-run report after successful apply + report_path = args.output / 'dry_run_report.json' + if report_path.exists(): + report_path.unlink() + print("Dry-run report consumed and removed.") + + # Summary + print(f"\n{'=' * 50}") + print(f"Enrichment complete{' [DRY RUN]' if args.dry_run else ''}") + print(f" Total tracks: {len(track_items)}") + print(f" Processed: {processed}") + if from_cache: + print(f" From cache: {from_cache} (via dry-run report)") + print(f" Skipped (resume):{skipped}") + print(f" Auto-accepted: {auto_accepted}") + print(f" Flagged review: {flagged}") + print(f" No match: {no_match}") + print(f"\nOutput:") + if args.dry_run: + print(f" {args.output / 'dry_run_report.json'} ({len(dry_run_results)} tracks with proposed changes)") + else: + print(f" {args.output / 'metadata_enriched.json'}") + if review_queue: + print(f" {args.output / 'review_queue.json'} ({len(review_queue)} items)") + print(f" {state_file}") + + return 0 + + +def _save_checkpoint(metadata: dict, review_queue: list, state: EnrichmentState, + output_dir: Path, fmt: str = 'metadata_base'): + """Save enriched metadata, review queue, and 
state.""" + output = _denormalize_output(metadata, fmt) + enriched_path = output_dir / 'metadata_enriched.json' + with open(enriched_path, 'w') as f: + json.dump(output, f, indent=2) + + if review_queue: + review_path = output_dir / 'review_queue.json' + with open(review_path, 'w') as f: + json.dump({ + 'version': 1, + 'generated': datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z'), + 'items': review_queue, + }, f, indent=2) + + state.save() + + +def _save_dry_run_report(results: list, output_dir: Path): + """Save dry-run report with all proposed changes.""" + report_path = output_dir / 'dry_run_report.json' + with open(report_path, 'w') as f: + json.dump({ + 'version': 1, + 'generated': datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z'), + 'mode': 'dry_run', + 'summary': { + 'total': len(results), + 'auto_accept': sum(1 for r in results if r['enrichment'].get('status') == 'auto_accepted'), + 'review_needed': sum(1 for r in results if r['enrichment'].get('status') == 'review_needed'), + 'with_updates': sum(1 for r in results if r['proposed_updates']), + 'with_conflicts': sum(1 for r in results if r['enrichment'].get('conflicts')), + }, + 'tracks': results, + }, f, indent=2) + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/tools/pipeline.sh b/tools/pipeline.sh new file mode 100755 index 0000000..5955e44 --- /dev/null +++ b/tools/pipeline.sh @@ -0,0 +1,166 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Crate Music Pipeline +# +# Single entrypoint for the full metadata pipeline: +# extract → upload → enrich → publish +# +# Usage: +# ./tools/pipeline.sh /path/to/new/music # Ingest new files, enrich, publish +# ./tools/pipeline.sh # Re-enrich + publish existing library +# ./tools/pipeline.sh --dry-run # Preview without changes + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +PROJECT_DIR="$(cd "$SCRIPT_DIR/.." 
&& pwd)" +METADATA_DIR="${METADATA_DIR:-$PROJECT_DIR/metadata}" +export AWS_PROFILE="${AWS_PROFILE:-personal}" +export TRACKS_BUCKET="${TRACKS_BUCKET:-crate-tracks.rmzi.world}" + +# --- Defaults --- +DRY_RUN=false +SKIP_UPLOAD=false +SKIP_ARTWORK=false +SKIP_PUBLISH=false +RESUME=true # Always resume by default (idempotent) +LIMIT=0 +INPUT_DIR="" + +# --- Parse args --- +while [[ $# -gt 0 ]]; do + case "$1" in + --dry-run) DRY_RUN=true; shift ;; + --skip-upload) SKIP_UPLOAD=true; shift ;; + --skip-artwork) SKIP_ARTWORK=true; shift ;; + --skip-publish) SKIP_PUBLISH=true; shift ;; + --no-resume) RESUME=false; shift ;; + --limit) LIMIT="$2"; shift 2 ;; + --help|-h) + echo "Usage: $(basename "$0") [OPTIONS] [/path/to/new/music]" + echo "" + echo "With a path: extract → upload → enrich → publish to S3" + echo "Without a path: re-enrich existing library → publish to S3" + echo "" + echo "Options:" + echo " --dry-run Preview all steps without writing or uploading" + echo " --skip-upload Skip uploading new audio files to S3" + echo " --skip-publish Enrich only, don't publish manifest to S3" + echo " --skip-artwork Skip album art fetching during enrichment" + echo " --no-resume Re-process all tracks (ignore previous state)" + echo " --limit N Limit enrichment to N tracks" + echo " -h, --help Show this help" + echo "" + echo "Environment:" + echo " METADATA_DIR Override metadata directory (default: ./metadata)" + echo " TRACKS_BUCKET S3 bucket (default: crate-tracks.rmzi.world)" + echo " AWS_PROFILE AWS profile (default: personal)" + exit 0 + ;; + -*) + echo "Unknown option: $1" >&2 + exit 1 + ;; + *) + INPUT_DIR="$1"; shift ;; + esac +done + +mkdir -p "$METADATA_DIR" + +echo "=== Crate Pipeline ===" +echo " Metadata dir: $METADATA_DIR" +[[ -n "$INPUT_DIR" ]] && echo " Input dir: $INPUT_DIR" +echo " Dry run: $DRY_RUN" +echo "" + +# --- Step 1: Extract (only if new music provided) --- +if [[ -n "$INPUT_DIR" ]]; then + if [[ ! 
-d "$INPUT_DIR" ]]; then + echo "Error: $INPUT_DIR is not a directory" >&2 + exit 1 + fi + + echo "--- Step 1: Extract metadata ---" + python3 "$SCRIPT_DIR/extract_metadata.py" "$INPUT_DIR" --output "$METADATA_DIR" --resume + echo "" + + # --- Step 2: Upload new tracks to S3 --- + if [[ "$SKIP_UPLOAD" == false && "$DRY_RUN" == false ]]; then + echo "--- Step 2: Upload new tracks to S3 ---" + python3 "$SCRIPT_DIR/batch_upload.py" --metadata-dir "$METADATA_DIR" + echo "" + else + echo "--- Step 2: Upload (skipped) ---" + echo "" + fi +else + echo "--- No input directory — skipping extract and upload ---" + echo "" +fi + +# --- Step 3: Enrich entire library --- +# Find the best available metadata source +ENRICH_INPUT="" +if [[ -f "$METADATA_DIR/metadata_base.json" ]]; then + ENRICH_INPUT="$METADATA_DIR/metadata_base.json" +elif [[ -f "$METADATA_DIR/manifest.json" ]]; then + ENRICH_INPUT="$METADATA_DIR/manifest.json" +else + echo "No local metadata found — pulling manifest.json from S3..." + if aws s3 cp "s3://${TRACKS_BUCKET}/manifest.json" "$METADATA_DIR/manifest.json" 2>/dev/null; then + ENRICH_INPUT="$METADATA_DIR/manifest.json" + TRACK_COUNT=$(python3 -c "import json; print(len(json.load(open('$METADATA_DIR/manifest.json')).get('tracks', [])))" 2>/dev/null || echo '?') + echo " Downloaded manifest.json ($TRACK_COUNT tracks)" + else + echo "No metadata found locally or in S3. Run extract_metadata.py first." 
+ exit 1 + fi +fi +echo "" + +echo "--- Step 3: Enrich metadata (entire library) ---" +echo " Input: $ENRICH_INPUT" +ENRICH_ARGS=(--input "$ENRICH_INPUT" --output "$METADATA_DIR") + +[[ "$DRY_RUN" == true ]] && ENRICH_ARGS+=(--dry-run) +[[ "$RESUME" == true ]] && ENRICH_ARGS+=(--resume) +[[ "$SKIP_ARTWORK" == true ]] && ENRICH_ARGS+=(--skip-artwork) +[[ "$LIMIT" -gt 0 ]] && ENRICH_ARGS+=(--limit "$LIMIT") + +python3 "$SCRIPT_DIR/enrich_metadata.py" "${ENRICH_ARGS[@]}" +echo "" + +# --- Step 4: Publish to S3 --- +if [[ "$SKIP_PUBLISH" == false && -f "$METADATA_DIR/metadata_enriched.json" ]]; then + echo "--- Step 4: Publish enriched metadata to S3 ---" + PUBLISH_ARGS=(--metadata-dir "$METADATA_DIR") + [[ "$DRY_RUN" == true ]] && PUBLISH_ARGS+=(--dry-run) + + python3 "$SCRIPT_DIR/publish_manifest.py" "${PUBLISH_ARGS[@]}" + echo "" +elif [[ "$DRY_RUN" == true && -f "$METADATA_DIR/dry_run_report.json" ]]; then + echo "--- Step 4: Publish (skipped — dry run, no enriched file) ---" + echo "" +else + echo "--- Step 4: Publish (skipped) ---" + echo "" +fi + +# --- Summary --- +echo "=== Pipeline complete ===" +echo "" +echo "Files:" +for f in metadata_base.json manifest.json metadata_enriched.json manifest_enriched.json dry_run_report.json review_queue.json; do + [[ -f "$METADATA_DIR/$f" ]] && echo " $METADATA_DIR/$f" +done +echo "" + +if [[ -f "$METADATA_DIR/review_queue.json" ]]; then + REVIEW_COUNT=$(python3 -c "import json; print(len(json.load(open('$METADATA_DIR/review_queue.json')).get('items', [])))" 2>/dev/null || echo "?") + echo "$REVIEW_COUNT tracks flagged for review." + echo " Inspect: cat metadata/review_queue.json | python3 -m json.tool" + echo " Or run the archivist agent to review interactively." 
+ echo "" +fi + +echo "To re-run: ./tools/pipeline.sh" diff --git a/tools/publish_manifest.py b/tools/publish_manifest.py new file mode 100644 index 0000000..d09c0f4 --- /dev/null +++ b/tools/publish_manifest.py @@ -0,0 +1,203 @@ +#!/usr/bin/env python3 +""" +Publish enriched metadata to S3. + +Reads metadata_enriched.json, uploads new artwork to S3, +builds a clean manifest.json, and pushes it to the tracks bucket. + +Usage: + python tools/publish_manifest.py --metadata-dir metadata/ + python tools/publish_manifest.py --metadata-dir metadata/ --dry-run +""" + +import argparse +import json +import os +import sys +from datetime import datetime, timezone +from pathlib import Path + +import boto3 +from botocore.exceptions import ClientError + +AWS_PROFILE = os.environ.get('AWS_PROFILE', 'personal') +TRACKS_BUCKET = os.environ.get('TRACKS_BUCKET', 'crate-tracks.rmzi.world') + + +def get_s3_client(): + session = boto3.Session(profile_name=AWS_PROFILE) + return session.client('s3') + + +def upload_file(s3_client, local_path: Path, s3_key: str, content_type: str) -> bool: + try: + s3_client.upload_file( + str(local_path), TRACKS_BUCKET, s3_key, + ExtraArgs={'ContentType': content_type}, + ) + return True + except ClientError as e: + print(f" Error uploading {local_path.name}: {e}", file=sys.stderr) + return False + + +def load_enriched(metadata_dir: Path) -> dict: + """Load enriched metadata, falling back to manifest.json.""" + enriched = metadata_dir / 'metadata_enriched.json' + if enriched.exists(): + with open(enriched) as f: + return json.load(f) + manifest = metadata_dir / 'manifest.json' + if manifest.exists(): + with open(manifest) as f: + return json.load(f) + raise FileNotFoundError(f"No metadata_enriched.json or manifest.json in {metadata_dir}") + + +def normalize_tracks(data: dict) -> list: + """Return tracks as a list regardless of input format.""" + tracks = data.get('tracks', []) + if isinstance(tracks, dict): + return list(tracks.values()) + return tracks + 
+ +def build_manifest(tracks: list) -> dict: + """Build a clean manifest.json from enriched tracks.""" + clean_tracks = [] + for track in tracks: + clean = { + 'id': track['id'], + 'path': track.get('path') or track.get('s3_path', ''), + 'artist': track.get('artist'), + 'album': track.get('album'), + 'title': track.get('title'), + 'year': track.get('year'), + 'duration': track.get('duration'), + 'artwork': track.get('artwork'), + 'tagged': track.get('tagged', False), + 'original_filename': track.get('original_filename', ''), + 'uploaded': track.get('uploaded', True), + } + # Include genre if present (enrichment may have added it) + if track.get('genre'): + clean['genre'] = track['genre'] + clean_tracks.append(clean) + + return { + 'version': 1, + 'generated': datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z'), + 'tracks': clean_tracks, + } + + +def main(): + parser = argparse.ArgumentParser(description='Publish enriched metadata to S3') + parser.add_argument( + '--metadata-dir', type=Path, required=True, + help='Directory containing metadata_enriched.json and artwork/', + ) + parser.add_argument( + '--dry-run', action='store_true', + help='Show what would be uploaded without actually uploading', + ) + args = parser.parse_args() + + # Load enriched data + print(f"Loading enriched metadata from {args.metadata_dir}...") + data = load_enriched(args.metadata_dir) + tracks = normalize_tracks(data) + print(f"Found {len(tracks)} tracks") + + artwork_dir = args.metadata_dir / 'artwork' + artwork_uploaded = 0 + artwork_skipped = 0 + + if not args.dry_run: + s3_client = get_s3_client() + else: + s3_client = None + + # Step 1: Upload enriched artwork to S3 and update track references + print("\n--- Uploading enriched artwork ---") + for track in tracks: + artwork_path = track.get('artwork_path', '') + + # Check if this is a local enriched artwork file + if not artwork_path or not artwork_path.endswith(('.jpg', '.jpeg', '.png')): + continue + + local_path = 
Path(artwork_path) + if not local_path.exists(): + # Maybe it's already an S3 path + continue + + # This is a local file — upload it + s3_key = f"artwork/{local_path.name}" + content_type = 'image/jpeg' if local_path.suffix in ('.jpg', '.jpeg') else 'image/png' + + if args.dry_run: + print(f" Would upload: {local_path.name} -> s3://{TRACKS_BUCKET}/{s3_key}") + track['artwork'] = s3_key + artwork_uploaded += 1 + else: + if upload_file(s3_client, local_path, s3_key, content_type): + print(f" Uploaded: {local_path.name} -> {s3_key}") + track['artwork'] = s3_key + artwork_uploaded += 1 + else: + artwork_skipped += 1 + + # Step 2: Ensure existing artwork references are preserved + for track in tracks: + # If track already had artwork from a previous upload (s3_artwork_path) + if not track.get('artwork') and track.get('s3_artwork_path'): + track['artwork'] = track['s3_artwork_path'] + + print(f" Artwork uploaded: {artwork_uploaded}, skipped: {artwork_skipped}") + + # Step 3: Build clean manifest + print("\n--- Building manifest ---") + manifest = build_manifest(tracks) + manifest_path = args.metadata_dir / 'manifest_enriched.json' + with open(manifest_path, 'w') as f: + json.dump(manifest, f, indent=2) + print(f" Wrote {manifest_path}") + + # Count stats + with_art = sum(1 for t in manifest['tracks'] if t.get('artwork')) + without_art = sum(1 for t in manifest['tracks'] if not t.get('artwork')) + print(f" Tracks with artwork: {with_art}") + print(f" Tracks without artwork: {without_art}") + + # Step 4: Upload manifest to S3 + print("\n--- Publishing manifest.json ---") + if args.dry_run: + print(f" Would upload: manifest.json -> s3://{TRACKS_BUCKET}/manifest.json") + else: + try: + s3_client.put_object( + Bucket=TRACKS_BUCKET, + Key='manifest.json', + Body=json.dumps(manifest, indent=2), + ContentType='application/json', + ) + print(f" Uploaded manifest.json ({len(manifest['tracks'])} tracks)") + except ClientError as e: + print(f" Error uploading manifest: {e}", 
file=sys.stderr) + return 1 + + # Also update the local enriched file with correct artwork paths + if not args.dry_run: + enriched_path = args.metadata_dir / 'metadata_enriched.json' + if enriched_path.exists(): + with open(enriched_path, 'w') as f: + json.dump(data, f, indent=2) + print(f" Updated {enriched_path} with S3 artwork paths") + + print(f"\nDone! {'[DRY RUN]' if args.dry_run else 'Published to S3.'}") + return 0 + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/www/js/player.js b/www/js/player.js index f9cb49c..920890a 100644 --- a/www/js/player.js +++ b/www/js/player.js @@ -6,7 +6,7 @@ import { CONFIG } from './config.js'; import { state, isSecretMode } from './state.js'; import { elements } from './elements.js'; -import { formatTime, getMediaUrl } from './utils.js'; +import { formatTime, getMediaUrl, generateTrackGradient } from './utils.js'; import { trackEvent } from './analytics.js'; import { setSignedCookies, clearAllCookies } from './cookies.js'; import { loadHeardTracks, loadFavoriteTracks, saveFavoriteTracks, setSecretUnlocked } from './storage.js'; @@ -118,10 +118,14 @@ export function updateArtwork(track) { elements.artworkImage.src = getMediaUrl(track.artwork); elements.artworkImage.alt = `${track.artist || 'Unknown'} - ${track.album || 'Unknown'}`; elements.artworkContainer.classList.remove('no-art'); + // Clear any background gradient + elements.artworkContainer.style.background = ''; } else { elements.artworkImage.src = ''; elements.artworkImage.alt = ''; elements.artworkContainer.classList.add('no-art'); + // Generate and apply gradient background + elements.artworkContainer.style.background = generateTrackGradient(track.id); } } diff --git a/www/js/tracks.js b/www/js/tracks.js index 5a20b58..ecb6f01 100644 --- a/www/js/tracks.js +++ b/www/js/tracks.js @@ -5,7 +5,7 @@ import { state, isSecretMode } from './state.js'; import { elements } from './elements.js'; -import { seededRandom, escapeHtml, getMediaUrl } from 
'./utils.js'; +import { seededRandom, escapeHtml, getMediaUrl, generateTrackGradient } from './utils.js'; import { saveHeardTracks } from './storage.js'; import { trackEvent } from './analytics.js'; import { showAuthError } from './ui.js'; @@ -164,10 +164,12 @@ export function renderTrackList() { const year = track.year || ''; const artworkSrc = track.artwork ? getMediaUrl(track.artwork) : ''; const thumbClass = artworkSrc ? '' : 'no-art'; + // Generate gradient for tracks without artwork + const thumbStyle = artworkSrc ? '' : `style="background: ${generateTrackGradient(track.id)}"`; return `