Spaces:
Runtime error
Runtime error
# YouTubeSummerizer.py — full, updated script (yt-dlp captions + DistilBART summary + Gradio UI)
| import requests | |
| import webvtt | |
| from yt_dlp import YoutubeDL | |
| import gradio as gr | |
| import torch | |
| from transformers import pipeline | |
| # ---------------- Summarizer ---------------- | |
| # NOTE: Uses CPU by default. If you hit RAM errors, restart and it will stream-load weights. | |
# Module-level summarization pipeline shared by summarize_text().
# The weights are requested in bfloat16 via torch_dtype.
text_summary = pipeline(
    task="summarization",
    model="sshleifer/distilbart-cnn-12-6",
    torch_dtype=torch.bfloat16,
)
def summarize_text(text: str) -> str:
    """Summarize arbitrarily long text with the module-level ``text_summary`` pipeline.

    The text is split into chunks of 350 whitespace-separated words, each
    chunk is summarized on its own, and the partial summaries are joined
    with spaces. If the joined result is still longer than 320 words it is
    summarized once more to produce the final answer.

    Args:
        text: The text to summarize (for example a caption transcript).

    Returns:
        The summary, or an empty string when ``text`` contains no words.
    """
    words = text.split()
    if not words:
        # Nothing to summarize; avoid calling the model on empty input.
        return ""

    step = 350
    chunks = [" ".join(words[i:i + step]) for i in range(0, len(words), step)]

    # truncation=True asks the pipeline to clip any over-long input to the
    # model's maximum input length instead of failing on it.
    partial = [
        text_summary(
            chunk,
            max_length=180,
            min_length=60,
            do_sample=False,
            truncation=True,
        )[0]["summary_text"]
        for chunk in chunks
    ]
    merged = " ".join(partial)

    if len(merged.split()) > 320:
        # For long transcripts the merged partial summaries can exceed what
        # the model accepts in one pass, so truncation is required here;
        # without it the second pass can fail on long videos.
        final = text_summary(
            merged,
            max_length=200,
            min_length=80,
            do_sample=False,
            truncation=True,
        )
        return final[0]["summary_text"]
    return merged
| # ---------------- Helpers to get captions via yt-dlp ---------------- | |
| def _pick_caption_url(info, preferred=("en", "en-US", "en-GB")): | |
| subs = info.get("subtitles") or {} | |
| autos = info.get("automatic_captions") or {} | |
| # Prefer manual β auto β any | |
| for d in (subs, autos): | |
| for code in preferred: | |
| if code in d and d[code]: | |
| return d[code][0]["url"] | |
| for d in (subs, autos): | |
| for tracks in d.values(): | |
| if tracks: | |
| return tracks[0]["url"] | |
| return None | |
def _fetch_caption_text(video_url: str) -> "str | None":
    """Download a video's captions and return them as one plain-text string.

    Video metadata is extracted with yt-dlp (quiet, no media download) using
    several cookie sources in turn: Edge, Firefox, Chrome "Default",
    Chrome "Profile 1", and finally no cookies. For each attempt a caption
    URL is chosen with ``_pick_caption_url``, fetched over HTTP and parsed
    as WebVTT. The first attempt that yields non-empty text wins.

    Args:
        video_url: URL of the video whose captions should be fetched.

    Returns:
        The caption lines joined with single spaces, or None when every
        attempt failed or produced no text.
    """
    # The return annotation is quoted so the module still imports on
    # interpreters that cannot evaluate ``str | None`` at definition time.
    import logging  # local import keeps this block self-contained

    log = logging.getLogger(__name__)
    base_opts = {"skip_download": True, "quiet": True}
    cookie_sources = [
        ("edge",),
        ("firefox",),
        ("chrome", "Default"),
        ("chrome", "Profile 1"),
        None,  # final attempt: no browser cookies
    ]

    for source in cookie_sources:
        opts = dict(base_opts)
        if source is not None:
            opts["cookiesfrombrowser"] = source
        try:
            with YoutubeDL(opts) as ydl:
                info = ydl.extract_info(video_url, download=False)
            vtt_url = _pick_caption_url(info)
            if not vtt_url:
                continue
            response = requests.get(vtt_url, timeout=20)
            response.raise_for_status()

            # Parse WebVTT text in-memory. Rolling auto-captions repeat the
            # previous line in the next cue, so consecutive duplicate lines
            # are dropped to avoid feeding repeated text to the summarizer.
            lines = []
            for cue in webvtt.from_string(response.text):
                for raw_line in cue.text.splitlines():
                    line = raw_line.strip()
                    if line and (not lines or lines[-1] != line):
                        lines.append(line)

            result = " ".join(lines).strip()
            if result:
                return result
        except Exception as exc:
            # Best-effort by design: a failing cookie source or download must
            # not abort the remaining attempts, but leave a trace for debugging.
            log.debug("Caption attempt (cookies=%s) failed: %s", source, exc)
            continue
    return None
| # ---------------- Main function used by Gradio ---------------- | |
def get_youtube_transcript(video_url: str) -> str:
    """Fetch the captions for ``video_url`` and return their summary.

    Returns a fixed explanatory message when no caption text could be
    fetched, and a ``"Summarizer error: ..."`` message when summarization
    raises.
    """
    captions = _fetch_caption_text(video_url)
    if captions:
        try:
            summary = summarize_text(captions)
        except Exception as err:
            summary = f"Summarizer error: {err}"
        return summary
    return "No captions available or captions endpoint blocked. Try another video, network, or export cookies to a cookies.txt file."
| # ---------------- Gradio UI ---------------- | |
# Close any Gradio apps left running before building a new one.
gr.close_all()

# Single-line URL input and a ten-line box for the resulting summary.
_url_box = gr.Textbox(
    label="Input YouTube Url to summarize",
    lines=1,
    placeholder="https://www.youtube.com/watch?v=...",
)
_summary_box = gr.Textbox(label="Summarized text", lines=10)

demo = gr.Interface(
    fn=get_youtube_transcript,
    inputs=[_url_box],
    outputs=[_summary_box],
    title="@Sahibhim GenAI Project 2: YouTube Script Summarizer",
    description="Paste a YouTube link. App fetches captions (manual or auto) and summarizes them.",
)

# Start the web UI only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()