Spaces:

fcyber
/

YouTubeScriptMaster

Sleeping

fcyber-labs

first commit

0cf3992 4 months ago

10.6 kB

	import warnings
	from core.state import AgenticState
	import json
	from loguru import logger


	# Configuration
	# Model identifier handed to the transformers summarization pipeline and,
	# in the fallback path of load_bart_summarizer, to the tokenizer/model
	# ``from_pretrained`` calls.
	MODEL_NAME = "sshleifer/distilbart-cnn-12-6"

	@logger.catch
	def load_bart_summarizer():
	    """Load the summarization model named by ``MODEL_NAME``.

	    The ``transformers`` summarization pipeline is tried first. If building
	    it raises, the tokenizer and seq2seq model are loaded directly and
	    wrapped in a small callable that returns the same
	    ``[{"summary_text": ...}]`` shape.

	    Returns:
	        A callable that accepts the text to summarize plus generation
	        keyword arguments (``max_length``, ``min_length``, ...).
	    """
	    # transformers and torch are imported here rather than at module level.
	    from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
	    import torch

	    # NOTE: this silences every warning process-wide, not only the ones
	    # emitted while the model is being loaded.
	    warnings.filterwarnings("ignore")

	    # 0 selects the first CUDA device, -1 selects the CPU.
	    device = 0 if torch.cuda.is_available() else -1

	    logger.info("Loading model {}", MODEL_NAME)
	    try:
	        return pipeline(
	            "summarization",
	            model=MODEL_NAME,
	            device=device,
	            truncation=True,
	        )
	    except Exception as exc:
	        # Keep the best-effort fallback, but record why the pipeline failed.
	        logger.warning(
	            "Summarization pipeline failed ({}); loading the model directly",
	            exc,
	        )

	    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
	    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
	    if device == 0:
	        model = model.cuda()

	    class BartSummarizer:
	        """Callable wrapper around a tokenizer/model pair."""

	        def __init__(self, model, tokenizer, device):
	            self.model = model
	            self.tokenizer = tokenizer
	            self.device = device

	        def __call__(self, text, max_length=150, min_length=30, **kwargs):
	            """Summarize ``text`` and return ``[{"summary_text": str}]``."""
	            inputs = self.tokenizer(
	                text,
	                return_tensors="pt",
	                truncation=True,
	                max_length=1024,
	            )
	            if self.device == 0:
	                inputs = {k: v.cuda() for k, v in inputs.items()}

	            # Caller-supplied kwargs override these defaults instead of
	            # colliding with them as duplicate keyword arguments.
	            generation_args = {
	                "num_beams": 4,
	                "early_stopping": True,
	                "no_repeat_ngram_size": 3,
	                **kwargs,
	            }

	            with torch.no_grad():
	                summary_ids = self.model.generate(
	                    inputs["input_ids"],
	                    # Pass the mask explicitly so generate() does not have
	                    # to infer it from the input ids.
	                    attention_mask=inputs.get("attention_mask"),
	                    max_length=max_length,
	                    min_length=min_length,
	                    **generation_args,
	                )

	            summary = self.tokenizer.decode(
	                summary_ids[0],
	                skip_special_tokens=True,
	            )
	            return [{"summary_text": summary}]

	    return BartSummarizer(model, tokenizer, device)


	@logger.catch
	def clean_summary_output(result):
	    """Extract the summary text from a summarizer response.

	    Args:
	        result: A list (only its first item is inspected), a dict, or any
	            other object.

	    Returns:
	        The ``summary_text`` value when present, otherwise the
	        ``generated_text`` value, otherwise ``str()`` of the inspected
	        object. An empty list is returned as ``str(result)``.
	    """
	    # A non-empty list is unwrapped to its first element; everything else
	    # is inspected as-is.
	    payload = result[0] if isinstance(result, list) and result else result

	    # Only dicts are searched for the known keys. A plain string element
	    # must not be substring-tested and then indexed with a str key.
	    if isinstance(payload, dict):
	        for key in ("summary_text", "generated_text"):
	            if key in payload:
	                return payload[key]

	    return str(payload)


	@logger.catch
	def synthesize_executive_summary(sections, summarizer):
	    """Create one executive summary from every section's summary and key points.

	    Args:
	        sections: Iterable of dicts; the ``summary`` and ``key_points``
	            entries of each are used.
	        summarizer: Callable invoked as
	            ``summarizer(text, max_length=180, min_length=80, do_sample=False)``.

	    Returns:
	        The cleaned summarizer output, ``"No summary available."`` when the
	        sections contain no text, or the first section's summary when the
	        summarizer raises.
	    """
	    # Gather non-blank text only; str() guards against non-string entries
	    # that would otherwise break the join.
	    pieces = []
	    for section in sections:
	        candidates = [section.get('summary'), *(section.get('key_points') or [])]
	        for candidate in candidates:
	            text = str(candidate).strip() if candidate else ""
	            if text:
	                pieces.append(text)

	    # Cap the model input at 2000 characters.
	    combined = " ".join(pieces)[:2000]

	    if not combined:
	        logger.error("Node 5: No summary available")
	        return "No summary available."

	    # Create a prompt-like input for better summaries
	    prompt = f"Summarize the following interview content: {combined}"

	    try:
	        result = summarizer(
	            prompt,
	            max_length=180,
	            min_length=80,
	            do_sample=False,
	        )
	        return clean_summary_output(result)
	    except Exception as exc:
	        logger.warning("Executive summary generation failed: {}", exc)
	        # Fallback to first section summary. A non-empty `combined` means
	        # at least one section exists.
	        return sections[0].get('summary') or 'Summary unavailable.'


	@logger.catch
	def generate_tldr(sections, summarizer):
	    """Generate a concise one-sentence TL;DR from the first section.

	    Args:
	        sections: Sequence of section dicts; only the first one is used.
	        summarizer: Callable invoked as
	            ``summarizer(text, max_length=40, min_length=15, do_sample=False)``.

	    Returns:
	        The summarizer output terminated with sentence punctuation,
	        ``"Video summary."`` when there is no first-section summary, or the
	        first section's title plus ``"."`` when summarization fails.
	    """
	    if not sections:
	        return "Video summary."

	    lead = sections[0]
	    # Only the first 300 characters of the first summary feed the model.
	    first_section = (lead.get('summary') or '')[:300]
	    if not first_section:
	        return "Video summary."

	    try:
	        result = summarizer(
	            first_section,
	            max_length=40,
	            min_length=15,
	            do_sample=False,
	        )
	        tldr = clean_summary_output(result)

	        # Ensure it's a complete sentence
	        if not tldr.endswith(('.', '!', '?')):
	            tldr += '.'
	        return tldr
	    except Exception as exc:
	        logger.warning("TL;DR generation failed: {}", exc)
	        return (lead.get('title') or 'Summary') + '.'


	@logger.catch
	def format_section_content(section, index, video_id):
	    """Format a single section as a markdown fragment.

	    Args:
	        section: Dict read for ``title``, ``summary``, ``key_points``,
	            ``explanation`` and ``content``.
	        index: Number shown in the heading and used in fallback titles.
	        video_id: Accepted for interface compatibility; not used here.

	    Returns:
	        Markdown with a ``###`` heading, a "Summary" part, an optional
	        "Explanation" part and a "Key Insights" bullet list.
	    """
	    title = section.get('title', f'Section {index}')
	    title = (title or '').replace('-', '').strip()
	    # Titles shorter than five characters are replaced by a generic label.
	    if not title or len(title) < 5:
	        title = f"Part {index}"

	    # A missing or empty summary gets the placeholder rather than a lone ".".
	    summary = section.get('summary') or 'No summary available.'
	    if not summary.endswith(('.', '!', '?')):
	        summary += '.'

	    # Drop blank points first so the fallback also applies when every
	    # supplied point is empty.
	    key_points = [p for p in (section.get('key_points') or []) if p]
	    if not key_points:
	        key_points = ["Key insights from this section"]
	    points_formatted = "\n".join(f"- {p}" for p in key_points)

	    # The explanation is shown only when it is longer than the summary.
	    explanation = section.get('explanation') or section.get('content') or ''
	    if explanation and len(explanation) > len(summary):
	        explanation_section = f"\nExplanation\n\n{explanation}\n"
	    else:
	        explanation_section = ""

	    # Built with explicit newlines so the output does not depend on how
	    # this source file is indented.
	    return (
	        f"\n### {index}. {title}\n"
	        "\n"
	        "Summary\n"
	        "\n"
	        f"{summary}\n"
	        f"{explanation_section}\n"
	        "Key Insights\n"
	        "\n"
	        f"{points_formatted}\n"
	    )


	def _format_topics(topics):
	    """Render up to eight topics longer than three characters as bullets."""
	    cleaned = [t.strip() for t in topics if t and len(t) > 3]
	    if not cleaned:
	        return ""
	    return "## Main Topics\n\n" + "\n".join(f"- {t}" for t in cleaned[:8])


	def _format_toc(sections):
	    """Render a numbered table of contents from the section titles."""
	    lines = ["## Table of Contents\n"]
	    for number, sec in enumerate(sections, 1):
	        heading = (sec.get('title', f'Section {number}') or '')
	        heading = heading.replace('-', '').strip()
	        if heading and not heading.startswith('The following'):
	            lines.append(f"{number}. {heading}")
	    return "\n".join(lines) + "\n"


	def _dedupe_quotes(quotes):
	    """Strip surrounding double quotes; drop short, empty or repeated quotes."""
	    unique = {}
	    for quote in quotes:
	        if quote and len(quote) > 10:
	            cleaned = quote.strip('"').strip()
	            if cleaned:
	                # dict keys keep first-seen order and give O(1) membership.
	                unique.setdefault(cleaned, None)
	    return list(unique)


	def _dedupe_entities(entities):
	    """Drop empty, one-character and repeated entities, keeping order."""
	    return list(dict.fromkeys(e for e in entities if e and len(e) > 1))


	@logger.catch
	async def node_5_beautiful_presentation(state: AgenticState) -> AgenticState:
	    """Node 5: build the final markdown presentation with a local model.

	    Reads ``structured_script`` and the video metadata from ``state``
	    (``video_metadata``, ``video_id``, ``title``, ``channel``,
	    ``duration_human``, ``upload_date``), generates an executive summary
	    and a TL;DR with the local summarizer, and assembles one markdown
	    document.

	    Args:
	        state: Pipeline state. On success ``final_formatted_markdown`` and
	            ``presentation_complete`` are set; when no sections exist a
	            ``{"type": "missing_structure"}`` entry is appended to
	            ``state.errors`` instead.

	    Returns:
	        The same ``state`` object.
	    """
	    logger.info("🚀 Node 5: Beautiful Presentation (Local) started...")

	    import asyncio
	    import re

	    structured = getattr(state, "structured_script", None) or {}
	    sections = structured.get("sections") or []

	    if not sections:
	        state.errors.append({"type": "missing_structure"})
	        logger.error("Node 5: Missing structure")
	        return state

	    # Get metadata directly from state attributes
	    video_metadata = getattr(state, "video_metadata", None) or {}

	    def from_state(field):
	        """Prefer the state attribute, then the metadata dict entry."""
	        return getattr(state, field, None) or video_metadata.get(field)

	    video_id = from_state("video_id") or "UNKNOWN"
	    channel = from_state("channel") or "Unknown Channel"
	    duration_human = from_state("duration_human")
	    upload_date = from_state("upload_date")

	    # A missing or placeholder title falls back to the first section's
	    # title, and only then to the generic default, so an empty section
	    # title can never replace the default.
	    title = from_state("title")
	    if not title or title == "Unknown Title":
	        title = sections[0].get("title") or "YouTube Video Summary"

	    # Model loading and inference are blocking, so all of it runs in the
	    # default executor to keep the event loop responsive.
	    loop = asyncio.get_running_loop()
	    summarizer = await loop.run_in_executor(None, load_bart_summarizer)

	    logger.info(" Generating executive summary...")
	    exec_summary = (
	        await loop.run_in_executor(
	            None, synthesize_executive_summary, sections, summarizer
	        )
	    ) or "No summary available."

	    logger.info(" Generating TL;DR...")
	    tldr = (
	        await loop.run_in_executor(None, generate_tldr, sections, summarizer)
	    ) or "Video summary."

	    topics_md = _format_topics(structured.get("main_topics") or [])
	    toc = _format_toc(sections)

	    logger.info(" Formatting {sections} sections...", sections=len(sections))
	    formatted = (
	        format_section_content(sec, i, video_id)
	        for i, sec in enumerate(sections, 1)
	    )
	    # A section whose formatting failed comes back as None; skip it.
	    sections_md = [md for md in formatted if md]

	    clean_quotes = _dedupe_quotes(structured.get("key_quotes") or [])
	    quotes_md = ""
	    if clean_quotes:
	        quotes_md = "## Key Quotes\n\n" + "".join(
	            f'> "{q}"\n\n' for q in clean_quotes[:6]
	        )

	    clean_entities = _dedupe_entities(structured.get("mentioned_entities") or [])
	    entities_md = ""
	    if clean_entities:
	        entities_md = "## Key Mentions\n\n" + "".join(
	            f"- {ent}\n" for ent in sorted(clean_entities)[:20]
	        )

	    # Header: metadata lines that are unknown are left out instead of
	    # being rendered as "None".
	    header_lines = [
	        f"# {title}",
	        "",
	        f"Channel: {channel}",
	        f"Video ID: {video_id}",
	    ]
	    if duration_human:
	        header_lines.append(f"Duration: {duration_human}")
	    if upload_date:
	        header_lines.append(f"Updated: {upload_date}")

	    parts = [
	        "\n".join(header_lines),
	        f"## Executive Summary\n{exec_summary}\n\n## TL;DR\n{tldr}",
	        topics_md,
	        toc,
	        "\n".join(sections_md),
	        quotes_md,
	        entities_md,
	        "Generated with YouTubeScriptMaster (Local AI Mode)",
	    ]
	    # Empty optional parts are skipped so no two separators end up stacked.
	    final_md = "\n\n---\n\n".join(p.strip("\n") for p in parts if p.strip()) + "\n"

	    # Collapse every run of blank lines to a single blank line.
	    # NOTE(review): the previous code also called .replace(' ', ' '), a
	    # no-op as written; possibly meant to collapse double spaces — confirm.
	    final_md = re.sub(r"\n{3,}", "\n\n", final_md)

	    state.final_formatted_markdown = final_md
	    state.presentation_complete = True

	    logger.info("✅ Node 5 complete")

	    logger.info(f" Sections: {len(sections_md)}")
	    logger.info(f" Executive summary: {exec_summary[:100] if exec_summary else ''}...")
	    logger.info(f" TL;DR: {tldr}")
	    logger.info(f" Quotes: {len(clean_quotes)}")
	    logger.info(f" Entities: {len(clean_entities)}")

	    return state