# Hugging Face Space upload by Sahibhim — commit "Create app.py" (1b8009f, verified)
# YouTubeSummerizer.py β€” full, updated script (yt-dlp captions + DistilBART summary + Gradio UI)
import requests
import webvtt
from yt_dlp import YoutubeDL
import gradio as gr
import torch
from transformers import pipeline
# ---------------- Summarizer ----------------
# NOTE: Uses CPU by default. If you hit RAM errors, restart and it will stream-load weights.
# Module-level HF summarization pipeline (DistilBART distilled from BART,
# fine-tuned on CNN/DailyMail) — loaded once at import time and shared by
# summarize_text().
# NOTE(review): bfloat16 halves weight memory, but bf16 inference on CPU can be
# slow or unsupported on older hardware — confirm on the deployment target.
text_summary = pipeline(
    "summarization",
    model="sshleifer/distilbart-cnn-12-6",
    torch_dtype=torch.bfloat16
)
def summarize_text(text: str, *, summarizer=None) -> str:
    """Summarize *text*, chunking long inputs to fit the model window.

    The text is split into ~350-word chunks (a safe margin under
    DistilBART's ~1024-token input limit), each chunk is summarized
    independently, and if the merged partial summaries are still long
    (> 320 words) they are compressed with one more summarization pass.

    Args:
        text: Raw caption/transcript text.
        summarizer: Optional callable with the HF summarization-pipeline
            interface (``summarizer(text, max_length=..., ...) ->
            [{"summary_text": ...}]``). Defaults to the module-level
            ``text_summary`` pipeline; injectable for testing.

    Returns:
        The summary string, or "" when *text* contains no words.
    """
    if summarizer is None:
        summarizer = text_summary
    words = text.split()
    if not words:
        # Nothing to summarize — avoid calling the model on empty input.
        return ""
    step = 350  # chunk size in words
    chunks = [" ".join(words[i:i + step]) for i in range(0, len(words), step)]
    partial = []
    for chunk in chunks:
        out = summarizer(chunk, max_length=180, min_length=60, do_sample=False)
        partial.append(out[0]["summary_text"])
    merged = " ".join(partial)
    # Second pass: compress the concatenated partial summaries if still long.
    if len(merged.split()) > 320:
        return summarizer(merged, max_length=200, min_length=80, do_sample=False)[0]["summary_text"]
    return merged
# ---------------- Helpers to get captions via yt-dlp ----------------
def _pick_caption_url(info, preferred=("en", "en-US", "en-GB")):
subs = info.get("subtitles") or {}
autos = info.get("automatic_captions") or {}
# Prefer manual β†’ auto β†’ any
for d in (subs, autos):
for code in preferred:
if code in d and d[code]:
return d[code][0]["url"]
for d in (subs, autos):
for tracks in d.values():
if tracks:
return tracks[0]["url"]
return None
def _fetch_caption_text(video_url: str) -> str | None:
    """Fetch a video's captions and flatten them into one plain-text string.

    Tries several yt-dlp configurations in order — Edge, Firefox, Chrome
    (Default then "Profile 1") browser cookies, and finally no cookies —
    swallowing failures quietly so a blocked cookie store doesn't spam errors.

    Args:
        video_url: YouTube URL to extract captions from.

    Returns:
        Caption text with cues joined by spaces, or None when every attempt
        fails or yields no usable captions.
    """
    base_opts = {"skip_download": True, "quiet": True}
    # Cookie sources in preference order; None means "no cookies at all".
    cookie_sources = [
        ("edge",),
        ("firefox",),
        ("chrome", "Default"),
        ("chrome", "Profile 1"),
        None,
    ]
    for source in cookie_sources:
        opts = dict(base_opts)
        if source is not None:
            opts["cookiesfrombrowser"] = source
        try:
            with YoutubeDL(opts) as ydl:
                info = ydl.extract_info(video_url, download=False)
            vtt_url = _pick_caption_url(info)
            if not vtt_url:
                continue
            resp = requests.get(vtt_url, timeout=20)
            resp.raise_for_status()
            # Parse the WebVTT payload in-memory and flatten cue text.
            cue_texts = [
                cue.text.strip().replace("\n", " ")
                for cue in webvtt.from_string(resp.text)
            ]
            joined = " ".join(t for t in cue_texts if t).strip()
            if joined:
                return joined
        except Exception:
            # Best-effort by design: move on to the next configuration.
            continue
    return None
# ---------------- Main function used by Gradio ----------------
def get_youtube_transcript(video_url: str) -> str:
    """Gradio entry point: fetch captions for *video_url* and summarize them.

    NOTE(review): despite the name, this returns a *summary*, not the raw
    transcript — name kept because the Gradio Interface references it.

    Args:
        video_url: YouTube URL supplied by the user.

    Returns:
        The summary text, or a human-readable error message on failure.
    """
    captions = _fetch_caption_text(video_url)
    if not captions:
        return "No captions available or captions endpoint blocked. Try another video, network, or export cookies to a cookies.txt file."
    try:
        return summarize_text(captions)
    except Exception as exc:
        return f"Summarizer error: {exc}"
# ---------------- Gradio UI ----------------
# Close any Gradio servers left over from a previous run (relevant in
# notebooks / Space restarts where the old port may still be bound).
gr.close_all()
# Single-input, single-output interface: URL textbox in, summary textbox out.
demo = gr.Interface(
    fn=get_youtube_transcript,
    inputs=[gr.Textbox(label="Input YouTube Url to summarize", lines=1, placeholder="https://www.youtube.com/watch?v=...")],
    outputs=[gr.Textbox(label="Summarized text", lines=10)],
    title="@Sahibhim GenAI Project 2: YouTube Script Summarizer",
    description="Paste a YouTube link. App fetches captions (manual or auto) and summarizes them."
)
# Launch only when run as a script (Spaces/`python app.py`), not on import.
if __name__ == "__main__":
    demo.launch()