import asyncio
import json
import re
import warnings

from loguru import logger

from core.state import AgenticState

# Configuration
MODEL_NAME = "sshleifer/distilbart-cnn-12-6"

# Characters that mark a finished sentence when tidying generated text.
_SENTENCE_ENDINGS = ('.', '!', '?')


class _BartSummarizer:
    """Minimal stand-in for the transformers summarization pipeline.

    Calling an instance returns ``[{"summary_text": ...}]``, the same shape
    ``clean_summary_output`` extracts from pipeline results.
    """

    def __init__(self, model, tokenizer, device):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device

    def __call__(self, text, max_length=150, min_length=30, **kwargs):
        import torch

        inputs = self.tokenizer(
            text, return_tensors="pt", truncation=True, max_length=1024
        )
        if self.device == 0:
            inputs = {k: v.cuda() for k, v in inputs.items()}

        # Defaults first so caller-supplied kwargs override them instead of
        # raising "got multiple values for keyword argument".
        generation_args = {
            "num_beams": 4,
            "early_stopping": True,
            "no_repeat_ngram_size": 3,
            **kwargs,
        }
        with torch.no_grad():
            summary_ids = self.model.generate(
                inputs['input_ids'],
                attention_mask=inputs.get('attention_mask'),
                max_length=max_length,
                min_length=min_length,
                **generation_args,
            )
        summary = self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        return [{"summary_text": summary}]


@logger.catch
def load_bart_summarizer():
    """Load the summarization model named by ``MODEL_NAME``.

    The transformers ``pipeline`` is tried first; if it fails for any reason
    the tokenizer and model are loaded directly and wrapped in
    ``_BartSummarizer``.

    Returns:
        A callable taking text plus generation kwargs and returning a list
        of ``{"summary_text": str}`` dicts. Because of ``@logger.catch``,
        ``None`` is returned if both loading strategies fail.
    """
    # Imported lazily so importing this module does not pull in torch.
    from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
    import torch

    warnings.filterwarnings("ignore")
    device = 0 if torch.cuda.is_available() else -1
    logger.info("Loading model {}", MODEL_NAME)

    try:
        # Try pipeline first
        summarizer = pipeline(
            "summarization",
            model=MODEL_NAME,
            device=device,
            truncation=True,
        )
    except Exception as exc:
        # Fallback to direct model loading
        logger.warning(
            "Pipeline loading failed ({}); falling back to direct model loading",
            exc,
        )
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
        if device == 0:
            model = model.cuda()
        summarizer = _BartSummarizer(model, tokenizer, device)

    logger.info("The model {} is loaded", MODEL_NAME)
    return summarizer


@logger.catch
def clean_summary_output(result):
    """Extract a clean string from the various summarizer response formats.

    Accepts a list (its first element is used), a dict, or any other object.
    Dicts are searched for ``summary_text`` then ``generated_text``; anything
    else is stringified. ``None`` and an empty list yield ``""``.
    """
    if result is None:
        return ""
    if isinstance(result, list):
        if not result:
            return ""
        result = result[0]
    if isinstance(result, dict):
        for key in ('summary_text', 'generated_text'):
            if key in result:
                return result[key]
    return str(result)


@logger.catch
def synthesize_executive_summary(sections, summarizer):
    """Create a flowing executive summary from all sections.

    Args:
        sections: List of section dicts; ``summary`` and ``key_points`` are read.
        summarizer: Callable as returned by ``load_bart_summarizer``.

    Returns:
        The generated summary, the first section's summary if generation
        fails, or ``"No summary available."`` when there is no content.
    """
    # Collect all section summaries and key points, skipping empty entries so
    # a run of blank fields cannot masquerade as content.
    pieces = []
    for section in sections:
        summary = section.get('summary')
        if summary:
            pieces.append(str(summary))
        pieces.extend(str(p) for p in (section.get('key_points') or []) if p)

    combined = " ".join(pieces)[:2000].strip()
    if not combined:
        logger.error("Node 5: No summary available")
        return "No summary available."

    # Create a prompt-like input for better summaries
    prompt = f"Summarize the following interview content: {combined}"

    text = ""
    try:
        result = summarizer(prompt, max_length=180, min_length=80, do_sample=False)
    except Exception as exc:
        logger.warning("Node 5: executive summary generation failed: {}", exc)
    else:
        text = clean_summary_output(result)

    if text:
        return text
    # Fallback to first section summary
    return sections[0].get('summary') or 'Summary unavailable.'


@logger.catch
def generate_tldr(sections, summarizer):
    """Generate a concise one-sentence TL;DR from the first section.

    Returns ``"Video summary."`` when there is no first-section summary, and
    the first section's title (plus a period) when generation fails.
    """
    # Use first section as base
    first_section = (sections[0].get('summary') or '')[:300] if sections else ''
    if not first_section:
        return "Video summary."

    tldr = ""
    try:
        result = summarizer(first_section, max_length=40, min_length=15, do_sample=False)
    except Exception as exc:
        logger.warning("Node 5: TL;DR generation failed: {}", exc)
    else:
        tldr = (clean_summary_output(result) or "").strip()

    if not tldr:
        return (sections[0].get('title') or 'Summary') + '.'

    # Ensure it's a complete sentence
    if not tldr.endswith(_SENTENCE_ENDINGS):
        tldr += '.'
    return tldr


@logger.catch
def format_section_content(section, index, video_id):
    """Format a single section as a Markdown fragment.

    Args:
        section: Section dict; ``title``, ``summary``, ``key_points`` and
            ``explanation`` (or ``content``) are read.
        index: 1-based section number used in the heading.
        video_id: Accepted for interface compatibility; not used here.

    Returns:
        Markdown text with a heading, summary, optional explanation and a
        bullet list of key insights.
    """
    title = (section.get('title') or f'Section {index}').replace('-', '').strip()
    if len(title) < 5:
        title = f"Part {index}"

    summary = section.get('summary') or 'No summary available.'
    if not summary.endswith(_SENTENCE_ENDINGS):
        summary += '.'

    key_points = [p for p in (section.get('key_points') or []) if p]
    if not key_points:
        key_points = ["Key insights from this section"]
    points_formatted = "\n".join(f"- {p}" for p in key_points)

    # Explanation: only shown when it adds more than the summary already says.
    explanation = section.get('explanation') or section.get('content') or ''
    explanation_section = (
        f"\n**Explanation**\n\n{explanation}\n"
        if len(explanation) > len(summary)
        else ""
    )

    return f"""
### {index}. {title}

**Summary**

{summary}
{explanation_section}
**Key Insights**

{points_formatted}
"""


def _resolve_metadata(state, sections):
    """Return (title, channel, video_id, duration, upload_date) for the header.

    State attributes win over ``state.video_metadata``; the first section's
    title replaces a missing or placeholder video title.
    """
    video_metadata = getattr(state, "video_metadata", {}) or {}

    def lookup(name):
        return getattr(state, name, None) or video_metadata.get(name)

    title = lookup("title")
    if not title or title in ("Unknown Title", "YouTube Video Summary"):
        section_title = sections[0].get("title") if sections else None
        title = section_title or "YouTube Video Summary"

    return (
        title,
        lookup("channel") or "Unknown Channel",
        lookup("video_id") or "UNKNOWN",
        lookup("duration_human") or None,
        lookup("upload_date") or None,
    )


def _build_topics_md(structured):
    """Render up to eight main topics as a Markdown list ('' if none)."""
    topics = structured.get("main_topics") or []
    clean_topics = [t.strip() for t in topics if t and len(t) > 3]
    if not clean_topics:
        return ""
    return "## Main Topics\n\n" + "\n".join(f"- {t}" for t in clean_topics[:8])


def _build_toc_md(sections):
    """Render the table of contents, numbering entries by section position."""
    toc = "## Table of Contents\n\n"
    for i, sec in enumerate(sections, 1):
        title_text = sec.get('title') or f'Section {i}'
        clean_title = title_text.replace('-', '').strip()
        if clean_title and not clean_title.startswith('The following'):
            toc += f"{i}. {clean_title}\n"
    return toc


def _build_quotes_md(structured):
    """Return (markdown, unique_quotes); at most six quotes are rendered."""
    clean_quotes = []
    for q in structured.get("key_quotes") or []:
        if q and len(q) > 10:
            # Remove any surrounding quotes
            q = q.strip('"').strip()
            if q not in clean_quotes:
                clean_quotes.append(q)
    if not clean_quotes:
        return "", clean_quotes
    quotes_md = "## Key Quotes\n\n" + "".join(f'> "{q}"\n\n' for q in clean_quotes[:6])
    return quotes_md, clean_quotes


def _build_entities_md(structured):
    """Return (markdown, unique_entities); at most twenty are rendered, sorted."""
    clean_entities = []
    for e in structured.get("mentioned_entities") or []:
        if e and len(e) > 1 and e not in clean_entities:
            clean_entities.append(e)
    if not clean_entities:
        return "", clean_entities
    entities_md = "## Key Mentions\n\n" + "".join(
        f"- {ent}\n" for ent in sorted(clean_entities)[:20]
    )
    return entities_md, clean_entities


@logger.catch
async def node_5_beautiful_presentation(state: AgenticState) -> AgenticState:
    """
    Node 5: Beautiful Presentation with local model

    Reads ``state.structured_script`` and the video metadata, generates an
    executive summary and TL;DR with the local summarizer, and stores the
    assembled Markdown in ``state.final_formatted_markdown`` (also setting
    ``state.presentation_complete``). If no sections are present, an error
    entry is appended to ``state.errors`` and the state is returned unchanged.
    """
    logger.info("🚀 Node 5: Beautiful Presentation (Local) started...")

    structured = getattr(state, "structured_script", None) or {}
    sections = structured.get("sections") or []
    if not sections:
        state.errors.append({"type": "missing_structure"})
        logger.error("Node 5: Missing structure")
        return state

    # Get metadata directly from state attributes
    title, channel, video_id, duration_human, upload_date = _resolve_metadata(
        state, sections
    )

    # Model loading and inference are blocking, so every such call goes
    # through the default executor to keep the event loop responsive.
    loop = asyncio.get_running_loop()
    summarizer = await loop.run_in_executor(None, load_bart_summarizer)

    # Executive Summary - Synthesize all sections
    logger.info(" Generating executive summary...")
    exec_summary = await loop.run_in_executor(
        None, synthesize_executive_summary, sections, summarizer
    )
    # The helpers return None when @logger.catch swallows an error.
    exec_summary = exec_summary or "No summary available."

    # TL;DR - One sentence
    logger.info(" Generating TL;DR...")
    tldr = await loop.run_in_executor(None, generate_tldr, sections, summarizer)
    tldr = tldr or "Video summary."

    # Main Topics - From structured data
    topics_md = _build_topics_md(structured)

    # Table of Contents
    toc = _build_toc_md(sections)

    # Sections - Properly formatted
    logger.info(" Formatting {sections} sections...", sections=len(sections))
    sections_md = [
        md
        for md in (
            format_section_content(sec, i, video_id)
            for i, sec in enumerate(sections, 1)
        )
        if md
    ]

    # Quotes and entities - Clean and deduplicate
    quotes_md, clean_quotes = _build_quotes_md(structured)
    entities_md, clean_entities = _build_entities_md(structured)

    # Header: optional fields are left out rather than rendered as "None".
    meta_lines = [f"*Channel: {channel}*", f"*Video ID: {video_id}*"]
    if duration_human:
        meta_lines.append(f"*Duration: {duration_human}*")
    if upload_date:
        meta_lines.append(f"*Updated: {upload_date}*")

    # Final Markdown: empty optional parts are dropped so the document never
    # contains back-to-back horizontal rules.
    parts = [
        f"# {title}\n\n" + "\n".join(meta_lines),
        f"## Executive Summary\n\n{exec_summary}\n\n## TL;DR\n\n{tldr}",
        topics_md,
        toc,
        "\n".join(sections_md),
        quotes_md,
        entities_md,
        "*Generated with YouTubeScriptMaster (Local AI Mode)*",
    ]
    final_md = "\n\n---\n\n".join(p.strip() for p in parts if p and p.strip()) + "\n"

    # Clean up any remaining formatting issues: collapse repeated spaces and
    # runs of blank lines.
    final_md = re.sub(r" {2,}", " ", final_md)
    final_md = re.sub(r"\n{3,}", "\n\n", final_md)

    state.final_formatted_markdown = final_md
    state.presentation_complete = True

    logger.info("✅ Node 5 complete")
    logger.info(" Sections: {}", len(sections_md))
    logger.info(" Executive summary: {}...", exec_summary[:100])
    logger.info(" TL;DR: {}", tldr)
    logger.info(" Quotes: {}", len(clean_quotes))
    logger.info(" Entities: {}", len(clean_entities))

    return state