# Hugging Face Space upload by Sahibhim — commit "Create app.py" (1b8009f, verified)
# YouTubeSummerizer.py β€” full, updated script (yt-dlp captions + DistilBART summary + Gradio UI)
import requests
import webvtt
from yt_dlp import YoutubeDL
import gradio as gr
import torch
from transformers import pipeline
# ---------------- Summarizer ----------------
# NOTE: Uses CPU by default. If you hit RAM errors, restart and it will stream-load weights.
# Module-level HF summarization pipeline (DistilBART distilled from BART,
# fine-tuned on CNN/DailyMail) — loaded once at import time and shared by
# summarize_text().
# NOTE(review): bfloat16 halves weight memory, but bf16 inference on CPU can be
# slow or unsupported on older hardware — confirm on the deployment target.
text_summary = pipeline(
    "summarization",
    model="sshleifer/distilbart-cnn-12-6",
    torch_dtype=torch.bfloat16
)
def summarize_text(text: str, *, summarizer=None) -> str:
    """Summarize *text*, chunking long inputs to fit the model window.

    The text is split into ~350-word chunks (a safe margin under
    DistilBART's ~1024-token input limit), each chunk is summarized
    independently, and if the merged partial summaries are still long
    (> 320 words) they are compressed with one more summarization pass.

    Args:
        text: Raw caption/transcript text.
        summarizer: Optional callable with the HF summarization-pipeline
            interface (``summarizer(text, max_length=..., ...) ->
            [{"summary_text": ...}]``). Defaults to the module-level
            ``text_summary`` pipeline; injectable for testing.

    Returns:
        The summary string, or "" when *text* contains no words.
    """
    if summarizer is None:
        summarizer = text_summary
    words = text.split()
    if not words:
        # Nothing to summarize — avoid calling the model on empty input.
        return ""
    step = 350  # chunk size in words
    chunks = [" ".join(words[i:i + step]) for i in range(0, len(words), step)]
    partial = []
    for chunk in chunks:
        out = summarizer(chunk, max_length=180, min_length=60, do_sample=False)
        partial.append(out[0]["summary_text"])
    merged = " ".join(partial)
    # Second pass: compress the concatenated partial summaries if still long.
    if len(merged.split()) > 320:
        return summarizer(merged, max_length=200, min_length=80, do_sample=False)[0]["summary_text"]
    return merged
# ---------------- Helpers to get captions via yt-dlp ----------------
def _pick_caption_url(info, preferred=("en", "en-US", "en-GB")):
subs = info.get("subtitles") or {}
autos = info.get("automatic_captions") or {}
# Prefer manual β†’ auto β†’ any
for d in (subs, autos):
for code in preferred:
if code in d and d[code]:
return d[code][0]["url"]
for d in (subs, autos):
for tracks in d.values():
if tracks:
return tracks[0]["url"]
return None
def _fetch_caption_text(video_url: str) -> str | None:
    """Fetch a video's captions and flatten them into one plain-text string.

    Tries several yt-dlp configurations in order — Edge, Firefox, Chrome
    (Default then "Profile 1") browser cookies, and finally no cookies —
    swallowing failures quietly so a blocked cookie store doesn't spam errors.

    Args:
        video_url: YouTube URL to extract captions from.

    Returns:
        Caption text with cues joined by spaces, or None when every attempt
        fails or yields no usable captions.
    """
    base_opts = {"skip_download": True, "quiet": True}
    # Cookie sources in preference order; None means "no cookies at all".
    cookie_sources = [
        ("edge",),
        ("firefox",),
        ("chrome", "Default"),
        ("chrome", "Profile 1"),
        None,
    ]
    for source in cookie_sources:
        opts = dict(base_opts)
        if source is not None:
            opts["cookiesfrombrowser"] = source
        try:
            with YoutubeDL(opts) as ydl:
                info = ydl.extract_info(video_url, download=False)
            vtt_url = _pick_caption_url(info)
            if not vtt_url:
                continue
            resp = requests.get(vtt_url, timeout=20)
            resp.raise_for_status()
            # Parse the WebVTT payload in-memory and flatten cue text.
            cue_texts = [
                cue.text.strip().replace("\n", " ")
                for cue in webvtt.from_string(resp.text)
            ]
            joined = " ".join(t for t in cue_texts if t).strip()
            if joined:
                return joined
        except Exception:
            # Best-effort by design: move on to the next configuration.
            continue
    return None
# ---------------- Main function used by Gradio ----------------
def get_youtube_transcript(video_url: str) -> str:
    """Gradio entry point: fetch captions for *video_url* and summarize them.

    NOTE(review): despite the name, this returns a *summary*, not the raw
    transcript — name kept because the Gradio Interface references it.

    Args:
        video_url: YouTube URL supplied by the user.

    Returns:
        The summary text, or a human-readable error message on failure.
    """
    captions = _fetch_caption_text(video_url)
    if not captions:
        return "No captions available or captions endpoint blocked. Try another video, network, or export cookies to a cookies.txt file."
    try:
        return summarize_text(captions)
    except Exception as exc:
        return f"Summarizer error: {exc}"
# ---------------- Gradio UI ----------------
# Close any Gradio servers left over from a previous run (relevant in
# notebooks / Space restarts where the old port may still be bound).
gr.close_all()
# Single-input, single-output interface: URL textbox in, summary textbox out.
demo = gr.Interface(
    fn=get_youtube_transcript,
    inputs=[gr.Textbox(label="Input YouTube Url to summarize", lines=1, placeholder="https://www.youtube.com/watch?v=...")],
    outputs=[gr.Textbox(label="Summarized text", lines=10)],
    title="@Sahibhim GenAI Project 2: YouTube Script Summarizer",
    description="Paste a YouTube link. App fetches captions (manual or auto) and summarizes them."
)
# Launch only when run as a script (Spaces/`python app.py`), not on import.
if __name__ == "__main__":
    demo.launch()