YouTube-Ripper/ripper.py at main · gonkey42/YouTube-Ripper · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
"""Core logic for YouTube audio/video extraction, transcription, and text generation."""

import os
import queue
import re
import subprocess
import threading
from dataclasses import dataclass
from datetime import datetime
from glob import escape as glob_escape
from pathlib import Path

import yt_dlp
from yt_dlp.utils import DownloadError

DEFAULT_OUTPUT_DIR = Path(__file__).parent / "output"


def _configured_output_dir() -> Path:
    configured = os.environ.get("YOUTUBE_RIPPER_OUTPUT_DIR")
    if configured:
        return Path(configured).expanduser()
    return DEFAULT_OUTPUT_DIR


def _whisper_model_name() -> str:
    return os.environ.get("YOUTUBE_RIPPER_WHISPER_MODEL", "base")


def _whisper_device() -> str:
    return os.environ.get("YOUTUBE_RIPPER_WHISPER_DEVICE", "cpu")


def _whisper_compute_type() -> str:
    return os.environ.get("YOUTUBE_RIPPER_WHISPER_COMPUTE_TYPE", "int8")


# Resolve the output directory once, at import time, and make sure it exists
# (missing parent directories are created as well).
OUTPUT_DIR = _configured_output_dir()
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Browser cookie source used when none of the cookie environment variables is
# set, written as "<browser>:<profile>".
DEFAULT_COOKIES_FROM_BROWSER = "chrome:Profile 1"


@dataclass(frozen=True)
class MediaInfo:
    """Immutable title/id pair identifying one media item."""

    title: str  # human-readable title; sanitized to build output filenames
    video_id: str  # item id; appended in brackets to the output filename stem


@dataclass(frozen=True)
class DownloadedMedia:
    """Immutable record pairing a downloaded file's path with its media info."""

    path: Path  # location of the downloaded file
    info: MediaInfo  # title/id of the item the file was downloaded from


# Module-wide cache for the loaded model; populated on first request.
_whisper_model = None


def _get_whisper_model():
    """Return the shared faster-whisper model, creating it on the first call."""
    global _whisper_model
    if _whisper_model is not None:
        return _whisper_model

    # Deferred import: faster_whisper is only imported once a model is needed.
    from faster_whisper import WhisperModel

    model_settings = {
        "compute_type": _whisper_compute_type(),
        "device": _whisper_device(),
    }
    _whisper_model = WhisperModel(_whisper_model_name(), **model_settings)
    return _whisper_model


def _sanitize_filename(title: str) -> str:
    """Convert a video title into a safe filename."""
    name = re.sub(r'[<>:"/\\|?*]', "", title)
    name = re.sub(r"\s+", " ", name).strip()
    return name[:200] if name else "untitled"


def _output_stem(info: MediaInfo) -> str:
    """Build the extension-less filename shared by all outputs of one item.

    The stem is the sanitized title followed by the video id in brackets.
    """
    safe_title = _sanitize_filename(info.title)
    return "{} [{}]".format(safe_title, info.video_id)


def _format_bytes(num_bytes) -> str:
    """Format bytes into human-readable string."""
    if not num_bytes:
        return "?"
    for unit in ("B", "KB", "MB", "GB"):
        if abs(num_bytes) < 1024:
            return f"{num_bytes:.1f}{unit}"
        num_bytes /= 1024
    return f"{num_bytes:.1f}TB"


def _format_speed(bps) -> str:
    """Format bytes/sec into human-readable speed."""
    if not bps:
        return "?"
    return f"{_format_bytes(bps)}/s"


def _format_eta(seconds) -> str:
    """Format seconds into M:SS or H:MM:SS."""
    if seconds is None:
        return "?"
    seconds = int(seconds)
    if seconds < 3600:
        return f"{seconds // 60}:{seconds % 60:02d}"
    return f"{seconds // 3600}:{(seconds % 3600) // 60:02d}:{seconds % 60:02d}"


# Maps the user-facing quality label to the string passed to yt-dlp as its
# "format" option. Each entry asks for separate video and audio streams first
# and, after the "/", names a single combined stream as the alternative; the
# height-limited entries apply the same cap to both halves.
QUALITY_FORMATS = {
    "1080p": "bestvideo[height<=1080]+bestaudio/best[height<=1080]",
    "720p": "bestvideo[height<=720]+bestaudio/best[height<=720]",
    "480p": "bestvideo[height<=480]+bestaudio/best[height<=480]",
    "best": "bestvideo+bestaudio/best",
}


def _parse_cookies_from_browser(value: str) -> tuple[str, str | None, str | None, str | None] | None:
    value = value.strip()
    if not value or value.lower() in {"0", "false", "none", "off"}:
        return None

    browser, _, profile = value.partition(":")
    browser = browser.strip()
    profile = profile.strip() or None
    return (browser, profile, None, None)


def _yt_dlp_opts(extra_opts: dict | None = None, *, use_cookies: bool = True) -> dict:
    """Build common yt-dlp options, including YouTube auth cookies."""
    opts = dict(extra_opts or {})

    if not use_cookies:
        return opts

    cookie_file = (
        os.environ.get("YOUTUBE_RIPPER_COOKIES")
        or os.environ.get("YT_DLP_COOKIES")
    )
    if cookie_file:
        opts["cookiefile"] = cookie_file
        return opts

    cookies_from_browser = (
        os.environ.get("YOUTUBE_RIPPER_COOKIES_FROM_BROWSER")
        or os.environ.get("YT_DLP_COOKIES_FROM_BROWSER")
        or DEFAULT_COOKIES_FROM_BROWSER
    )
    parsed = _parse_cookies_from_browser(cookies_from_browser)
    if parsed:
        opts["cookiesfrombrowser"] = parsed

    return opts


def _has_cookie_options(opts: dict) -> bool:
    return "cookiefile" in opts or "cookiesfrombrowser" in opts


def _run_ytdlp_with_cookie_fallback(base_opts: dict, action):
    """Run *action* against a YoutubeDL instance, retrying once without cookies.

    The first attempt uses the options from ``_yt_dlp_opts(base_opts)``. If it
    raises ``DownloadError`` and those options included a cookie source, the
    action is run again with cookies disabled; otherwise the error propagates.
    Returns whatever *action* returns.
    """

    def _attempt(options: dict):
        with yt_dlp.YoutubeDL(options) as client:
            return action(client)

    primary_opts = _yt_dlp_opts(base_opts)
    try:
        return _attempt(primary_opts)
    except DownloadError:
        if not _has_cookie_options(primary_opts):
            raise
        # The retry stays inside the handler so a second failure is still
        # chained to the first one.
        return _attempt(_yt_dlp_opts(base_opts, use_cookies=False))


def _fetch_media_info(url: str) -> MediaInfo:
    """Fetch the title and id for *url* without downloading any media.

    Falls back to ``"Untitled"`` / ``"unknown"`` when the extracted info has
    no usable title or id.
    """
    base_opts = {"quiet": True, "no_warnings": True, "skip_download": True}
    info = _run_ytdlp_with_cookie_fallback(
        base_opts,
        lambda ydl: ydl.extract_info(url, download=False),
    )

    # dict.get's default only applies to a missing key; `or` also covers a key
    # that is present but None or empty, which would otherwise break filename
    # generation further down.
    return MediaInfo(
        title=info.get("title") or "Untitled",
        video_id=info.get("id") or "unknown",
    )


def download_video(
    url: str,
    quality: str = "1080p",
    progress_queue: queue.Queue | None = None,
) -> DownloadedMedia:
    """Download video from a YouTube URL as MP4.

    Args:
        url: Address of the video to download.
        quality: Key into ``QUALITY_FORMATS``; unknown values fall back to the
            ``"1080p"`` entry.
        progress_queue: Optional queue that receives progress dicts with a
            ``"status"`` of ``"downloading"`` (plus ``percent``, ``speed``,
            ``eta``), ``"finished"`` or ``"merging"``.

    Returns:
        The downloaded file's path together with the item's title and id.
    """
    info = _fetch_media_info(url)
    output_stem = _output_stem(info)
    output_path = OUTPUT_DIR / f"{output_stem}.mp4"

    # yt-dlp reads "%" in the output template as the start of a field, so a
    # literal percent sign from the title or directory has to be doubled.
    template_stem = str(OUTPUT_DIR / output_stem).replace("%", "%%")

    def progress_hook(d):
        if progress_queue is None:
            return
        status = d.get("status")
        if status == "downloading":
            total = d.get("total_bytes") or d.get("total_bytes_estimate") or 0
            # `or 0` also covers a key that is present but None.
            downloaded = d.get("downloaded_bytes") or 0
            percent = (downloaded / total * 100) if total > 0 else 0
            progress_queue.put({
                "status": "downloading",
                "percent": round(percent, 1),
                "speed": _format_speed(d.get("speed")),
                "eta": _format_eta(d.get("eta")),
            })
        elif status == "finished":
            progress_queue.put({"status": "finished"})

    def postprocessor_hook(d):
        if progress_queue is None:
            return
        if d.get("status") == "started" and "Merger" in d.get("postprocessor", ""):
            progress_queue.put({"status": "merging"})

    fmt = QUALITY_FORMATS.get(quality, QUALITY_FORMATS["1080p"])
    dl_opts = {
        "format": fmt,
        "merge_output_format": "mp4",
        "outtmpl": f"{template_stem}.%(ext)s",
        "progress_hooks": [progress_hook],
        "postprocessor_hooks": [postprocessor_hook],
        "quiet": True,
        "no_warnings": True,
    }
    _run_ytdlp_with_cookie_fallback(dl_opts, lambda ydl: ydl.download([url]))

    # When no merge happens the file keeps its source extension, so the
    # expected ".mp4" may not exist; look for the file that was written.
    if not output_path.exists():
        video_exts = {".mp4", ".mkv", ".webm"}
        for candidate in sorted(OUTPUT_DIR.glob(f"{glob_escape(output_stem)}.*")):
            if candidate.suffix.lower() in video_exts:
                output_path = candidate
                break

    return DownloadedMedia(path=output_path, info=info)


def extract_audio_from_video(video_path: Path) -> Path:
    """Extract temporary audio from a downloaded video for transcription.

    Runs ``ffmpeg`` to write a 128k AAC track next to the video as
    ``<video stem>.transcription.m4a``, overwriting any existing file.

    Returns:
        Path of the extracted audio file.

    Raises:
        RuntimeError: If ``ffmpeg`` cannot be started or exits with an error.
            Any partially written audio file is removed first.
    """
    audio_path = video_path.with_name(f"{video_path.stem}.transcription.m4a")

    cmd = [
        "ffmpeg",
        "-y",
        "-i",
        str(video_path),
        "-vn",
        "-acodec",
        "aac",
        "-b:a",
        "128k",
        str(audio_path),
    ]

    try:
        subprocess.run(
            cmd,
            check=True,
            # Give ffmpeg an empty stdin so it never reads from the parent's
            # standard input.
            stdin=subprocess.DEVNULL,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            # Undecodable bytes in ffmpeg's output must not mask the real error.
            errors="replace",
        )
    except FileNotFoundError as exc:
        audio_path.unlink(missing_ok=True)
        raise RuntimeError("ffmpeg is required to extract audio from downloaded video.") from exc
    except subprocess.CalledProcessError as exc:
        audio_path.unlink(missing_ok=True)
        message = (exc.stderr or exc.stdout or str(exc)).strip()
        raise RuntimeError(f"ffmpeg audio extraction failed: {message}") from exc

    return audio_path


def download_audio(url: str) -> DownloadedMedia:
    """Download audio from a YouTube URL as M4A.

    Returns:
        The audio file's path together with the item's title and id. If the
        expected ``.m4a`` file is missing, the first file with the same stem
        and a known audio extension is returned instead.
    """
    info = _fetch_media_info(url)
    output_stem = _output_stem(info)
    output_path = OUTPUT_DIR / f"{output_stem}.m4a"

    # yt-dlp reads "%" in the output template as the start of a field, so a
    # literal percent sign from the title or directory has to be doubled.
    template_stem = str(OUTPUT_DIR / output_stem).replace("%", "%%")

    dl_opts = {
        "format": "bestaudio[ext=m4a]/bestaudio",
        "outtmpl": f"{template_stem}.%(ext)s",
        "postprocessors": [
            {
                "key": "FFmpegExtractAudio",
                "preferredcodec": "m4a",
            }
        ],
        "quiet": True,
        "no_warnings": True,
    }
    _run_ytdlp_with_cookie_fallback(dl_opts, lambda ydl: ydl.download([url]))

    # Find the actual output file (extension may vary depending on source).
    if not output_path.exists():
        audio_exts = {".m4a", ".webm", ".opus", ".mp3", ".ogg", ".wav"}
        # Sorted so the pick is deterministic when several files match.
        for candidate in sorted(OUTPUT_DIR.glob(f"{glob_escape(output_stem)}.*")):
            if candidate.suffix.lower() in audio_exts:
                output_path = candidate
                break

    return DownloadedMedia(path=output_path, info=info)


def transcribe_audio(filepath: Path):
    """Transcribe *filepath* with the shared Whisper model.

    Transcription is requested in English with a beam size of 5. The
    segments produced by the model are collected into a list and returned.
    """
    whisper = _get_whisper_model()
    segment_iter, _ = whisper.transcribe(str(filepath), beam_size=5, language="en")
    return [segment for segment in segment_iter]


def _run_transcription_with_keepalive(filepath: Path):
    """Run transcription in a thread, yielding keepalive SSE messages.

    Yields ``("status", "Still transcribing...")`` every three seconds while
    the worker is busy, then exactly one final event: ``("error", message)``
    if transcription raised, or ``("_done", segments)`` on success.
    """
    result_holder = {"segments": None, "error": None}
    done_event = threading.Event()

    def _transcribe():
        try:
            result_holder["segments"] = transcribe_audio(filepath)
        except Exception as e:
            # Some exceptions carry no message; fall back to the class name so
            # the failure is never recorded as an empty string.
            result_holder["error"] = str(e) or type(e).__name__
        finally:
            done_event.set()

    thread = threading.Thread(target=_transcribe, daemon=True)
    thread.start()

    while not done_event.wait(timeout=3):
        yield ("status", "Still transcribing...")

    thread.join(timeout=5)

    # Compare against None rather than truthiness so that any recorded error
    # is reported instead of being mistaken for success.
    if result_holder["error"] is not None:
        yield ("error", f"Transcription failed: {result_holder['error']}")
        return

    yield ("_done", result_holder["segments"])


def format_transcript(segments, title: str, url: str) -> str:
    """Build the plain-text transcript document for a list of segments.

    The output is a header (title, source URL, today's date) followed by the
    spoken text, split into sentences and grouped four to a paragraph. A
    placeholder line is used when no segment contains any text.
    """
    spoken = []
    for seg in segments:
        if seg.text.strip():
            spoken.append(seg.text.strip())
    full_text = " ".join(spoken) or "(No speech detected in this video.)"

    # A sentence ends at ., ! or ? followed by whitespace.
    sentences = re.split(r"(?<=[.!?])\s+", full_text)

    # Four sentences per paragraph; blank groups are dropped.
    paragraphs = []
    for start in range(0, len(sentences), 4):
        paragraph = " ".join(sentences[start : start + 4]).strip()
        if paragraph:
            paragraphs.append(paragraph)

    date_str = datetime.now().strftime("%B %d, %Y")
    header = f"{title}\n\nSource: {url}\nTranscribed: {date_str}"
    return header + "\n\n" + "\n\n".join(paragraphs) + "\n"


def generate_text(transcript: str, info: MediaInfo) -> Path:
    """Save *transcript* as a UTF-8 ``.txt`` file in the output directory.

    The file is named after the item's output stem; its path is returned.
    """
    destination = OUTPUT_DIR / (_output_stem(info) + ".txt")
    destination.write_text(transcript, encoding="utf-8")
    return destination


def _transcribe_to_text_file(audio_path: Path, info: MediaInfo, url: str):
    """Transcribe *audio_path* and save the transcript, streaming events.

    Status and error events from the transcription step are passed through.
    The stream ends after the first ``("error", ...)`` event, or with
    ``("_done", text_path)`` once the transcript file has been written.
    """
    yield ("status", "Transcribing with Whisper... (this may take a minute)")

    segments = None
    for event in _run_transcription_with_keepalive(audio_path):
        kind, payload = event
        if kind == "_done":
            # Keep the result for ourselves; it is not a user-facing event.
            segments = payload
            continue
        yield (kind, payload)
        if kind == "error":
            return

    yield ("status", "Generating text file...")

    try:
        document = format_transcript(segments, info.title, url)
        text_path = generate_text(document, info)
    except Exception as exc:
        yield ("error", f"Text generation failed: {exc}")
        return

    yield ("_done", text_path)


def _run_video_download_with_progress(url: str, quality: str):
    """Run download_video in a thread, yielding SSE status messages for progress.

    Progress dicts from the worker are turned into ``("status", text)``
    events. The stream ends with ``("error", message)`` if the download
    raised, or with ``("_done", media)`` carrying the worker's result.
    """
    prog_queue = queue.Queue()
    result_holder = {"media": None, "error": None}

    def _download():
        try:
            result_holder["media"] = download_video(url, quality, progress_queue=prog_queue)
        except Exception as e:
            # Some exceptions carry no message; fall back to the class name so
            # the failure is never recorded as an empty string.
            result_holder["error"] = str(e) or type(e).__name__
        finally:
            prog_queue.put(None)  # sentinel

    thread = threading.Thread(target=_download, daemon=True)
    thread.start()

    last_percent = -1
    while True:
        try:
            update = prog_queue.get(timeout=0.5)
        except queue.Empty:
            continue

        if update is None:
            break

        status = update.get("status")
        if status == "downloading":
            percent = update["percent"]
            # Throttle: report only every 2 percentage points, except near the
            # end where every update is passed on.
            if percent - last_percent >= 2 or percent >= 99:
                last_percent = percent
                msg = f"Downloading video... {percent}%"
                if update["speed"] != "?":
                    msg += f" ({update['speed']}"
                    if update["eta"] != "?":
                        msg += f", ETA {update['eta']}"
                    msg += ")"
                yield ("status", msg)
        elif status == "merging":
            yield ("status", "Merging video and audio streams...")
        elif status == "finished":
            yield ("status", "Download complete, finalizing...")

    thread.join(timeout=10)

    # Compare against None rather than truthiness so that any recorded error
    # is reported instead of being mistaken for success.
    if result_holder["error"] is not None:
        yield ("error", f"Video download failed: {result_holder['error']}")
        return

    media = result_holder["media"]

    if media and media.path.exists():
        size = _format_bytes(media.path.stat().st_size)
        yield ("status", f"Video saved: {media.info.title} ({size})")

    yield ("_done", media)


def process(url: str, mode: str, quality: str = "1080p"):
    """Main orchestrator. Yields (type, data) tuples for SSE streaming.

    mode: 'audio', 'text', 'both', 'video', or 'video_text'

    Event types are ``"status"`` (progress text), ``"error"`` (message; always
    the last event of the stream) and ``"result"`` (dict mapping ``"audio"``,
    ``"video"`` and/or ``"text"`` to the produced file names). *quality* is
    only used by the two video modes.
    """
    # --- Video and Video + Text modes share the download step ---
    if mode in ("video", "video_text"):
        yield ("status", "Fetching video info...")

        downloaded_video = None
        for msg_type, msg_data in _run_video_download_with_progress(url, quality):
            if msg_type == "_done":
                downloaded_video = msg_data
            else:
                yield (msg_type, msg_data)
                if msg_type == "error":
                    return

        if downloaded_video is None:
            yield ("error", "Video download produced no output.")
            return

        if mode == "video":
            yield ("status", "Done! Your files are ready.")
            yield ("result", {"video": downloaded_video.path.name})
            return

        yield ("status", "Extracting audio for transcription...")
        try:
            audio_path = extract_audio_from_video(downloaded_video.path)
        except Exception as e:
            yield ("error", f"Audio extraction failed: {e}")
            return

        try:
            text_path = None
            for msg_type, msg_data in _transcribe_to_text_file(audio_path, downloaded_video.info, url):
                if msg_type == "_done":
                    text_path = msg_data
                else:
                    yield (msg_type, msg_data)
                    if msg_type == "error":
                        return

            if text_path is None:
                yield ("error", "Text generation produced no output.")
                return
        except Exception as e:
            yield ("error", f"Text generation failed: {e}")
            return
        finally:
            # The extracted audio is scratch data; remove it on every exit path.
            audio_path.unlink(missing_ok=True)

        yield ("status", "Done! Your files are ready.")
        yield ("result", {"video": downloaded_video.path.name, "text": text_path.name})
        return

    # --- Audio / Text / Both modes ---
    yield ("status", "Fetching video info and downloading audio...")

    try:
        downloaded_audio = download_audio(url)
        audio_path = downloaded_audio.path
        info = downloaded_audio.info
    except Exception as e:
        yield ("error", f"Download failed: {e}")
        return

    yield ("status", f"Downloaded: {info.title}")

    result = {}

    if mode in ("audio", "both"):
        result["audio"] = audio_path.name

    if mode in ("text", "both"):
        text_path = None
        text_ready = False
        try:
            for msg_type, msg_data in _transcribe_to_text_file(audio_path, info, url):
                if msg_type == "_done":
                    text_path = msg_data
                else:
                    yield (msg_type, msg_data)
                    if msg_type == "error":
                        return

            if text_path is None:
                yield ("error", "Text generation produced no output.")
                return

            text_ready = True
        finally:
            # In text-only mode the audio is only an intermediate file, so it
            # must not be left behind when transcription fails or is abandoned.
            if mode == "text" and not text_ready:
                audio_path.unlink(missing_ok=True)

        result["text"] = text_path.name

    # If text-only, clean up the audio file
    if mode == "text" and audio_path.exists():
        audio_path.unlink()
        yield ("status", "Audio file cleaned up (text-only mode).")

    yield ("status", "Done! Your files are ready.")
    yield ("result", result)