import time

from generation.script_generator import get_generator
from processing.pdf_reader import extract_text_from_pdf
from processing.url_fetcher import fetch_paper_from_url
from synthesis.tts_engine import get_tts_engine
from utils.config import (
    MAX_CONTEXT_CHARS,
)
from utils.history import save_to_history


class PodcastAgent:
    """Agent that turns research papers (URL or PDF) into a two-voice podcast.

    Both ``process`` and ``process_multiple`` are generators: they ``yield``
    timestamped log lines as work progresses and, on completion, ``return``
    a ``(audio_path_or_None, joined_log_string)`` tuple (available to callers
    via ``StopIteration.value`` or ``yield from``).
    """

    def __init__(
        self,
        provider_mode="own_inference",
        own_base_url=None,
        own_api_key=None,
        own_model=None,
        openai_key=None,
        openai_model=None,
        tts_provider="elevenlabs",
        elevenlabs_key=None,
        host_voice=None,
        guest_voice=None,
        max_tokens=None,
        target_dialogue_count=15,
        context_limit=None,
        persona_mode="friendly_explainer",
    ):
        """
        Initialize PodcastAgent with user-provided settings (BYOK).

        Args:
            provider_mode: "own_inference" or "openai"
            own_base_url: Base URL for own inference server
            own_api_key: API key for own inference server
            own_model: Model name for own inference server
            openai_key: OpenAI API key
            openai_model: OpenAI model name
            tts_provider: "elevenlabs" (ElevenLabs required)
            elevenlabs_key: ElevenLabs API key (required)
            host_voice: Voice ID for host
            guest_voice: Voice ID for guest
            max_tokens: Maximum tokens for generation
            target_dialogue_count: Target number of dialogue exchanges (default: 15)
            context_limit: Maximum characters for multi-paper processing
                (default: MAX_CONTEXT_CHARS)
            persona_mode: Podcast persona mode (default: "friendly_explainer")
        """
        self.logs = []
        self.provider_mode = provider_mode  # "own_inference" or "openai"
        self.own_base_url = own_base_url
        self.own_api_key = own_api_key
        self.own_model = own_model
        self.openai_key = openai_key
        self.openai_model = openai_model
        self.tts_provider = tts_provider
        self.elevenlabs_key = elevenlabs_key
        self.host_voice = host_voice
        self.guest_voice = guest_voice
        self.max_tokens = max_tokens
        self.target_dialogue_count = target_dialogue_count
        # Fall back to the project-wide default when no explicit limit is given.
        self.context_limit = context_limit if context_limit else MAX_CONTEXT_CHARS
        self.persona_mode = persona_mode

    def log(self, message):
        """Timestamp *message*, print it, append it to self.logs, and return it."""
        timestamp = time.strftime("%H:%M:%S")
        entry = f"[{timestamp}] {message}"
        print(entry)
        self.logs.append(entry)
        return entry

    # ------------------------------------------------------------------
    # Private helpers shared by process() and process_multiple()
    # ------------------------------------------------------------------

    def _describe_provider(self):
        """Return a one-line human-readable description of the LLM backend."""
        if self.provider_mode == "demo":
            return "Using Demo Inference"
        if self.provider_mode == "own_inference":
            return f"Using Own Inference: {self.own_base_url}"
        return f"Using OpenAI ({self.openai_model or 'gpt-4o-mini'})"

    def _build_generator(self):
        """Construct the script generator from the stored BYOK credentials."""
        return get_generator(
            provider_mode=self.provider_mode,
            own_base_url=self.own_base_url,
            own_api_key=self.own_api_key,
            own_model=self.own_model,
            openai_key=self.openai_key,
            openai_model=self.openai_model,
            max_tokens=self.max_tokens,
        )

    def _tts_name(self):
        """Return the display name of the configured TTS backend."""
        return (
            "ElevenLabs TTS"
            if self.tts_provider == "elevenlabs"
            else "Supertonic TTS (CPU)"
        )

    def _build_tts(self):
        """Construct the TTS engine; the ElevenLabs key is only passed when used."""
        return get_tts_engine(
            tts_provider=self.tts_provider,
            custom_api_key=self.elevenlabs_key
            if self.tts_provider == "elevenlabs"
            else None,
            host_voice=self.host_voice,
            guest_voice=self.guest_voice,
        )

    # ------------------------------------------------------------------
    # Public pipeline entry points
    # ------------------------------------------------------------------

    def process(self, url: str = None, pdf_file=None):
        """
        Orchestrates the conversion from URL or uploaded PDF to Podcast.

        Args:
            url: Paper URL (arXiv or medRxiv)
            pdf_file: Uploaded PDF file object

        Yields:
            Timestamped log lines describing progress.

        Returns:
            Tuple of (audio_path or None, newline-joined log string).
        """
        # --- Determine source -------------------------------------------------
        if pdf_file:
            yield self.log(
                f"Received uploaded PDF: {pdf_file.name if hasattr(pdf_file, 'name') else 'file'}"
            )
            pdf_path = pdf_file.name if hasattr(pdf_file, "name") else pdf_file
            source_ref = "Uploaded PDF"
        elif url:
            yield self.log(f"Received request for URL: {url}")
            # Step 1: Fetch Paper
            yield self.log("Thinking: I need to download the paper first.")
            yield self.log(f"Tool Call: fetch_paper({url})")
            pdf_path = fetch_paper_from_url(url)
            if not pdf_path:
                yield self.log("Error: Failed to download paper.")
                return None, "\n".join(self.logs)
            yield self.log(f"Paper downloaded to: {pdf_path}")
            source_ref = url
        else:
            yield self.log(
                "Error: No input provided. Please provide either a URL or upload a PDF."
            )
            return None, "\n".join(self.logs)

        # --- Step 2: Read PDF -------------------------------------------------
        yield self.log("Thinking: Now I need to extract the text content.")
        yield self.log(f"Tool Call: read_pdf({pdf_path})")
        text = extract_text_from_pdf(pdf_path)
        if not text:
            yield self.log("Error: Failed to extract text.")
            # FIX: previously returned the raw list self.logs here, unlike
            # every other failure path which returns a joined string.
            return None, "\n".join(self.logs)

        text_length = len(text)
        yield self.log(f"Extracted {text_length:,} characters.")

        # Check context limit before spending LLM tokens on an oversized paper.
        if text_length > self.context_limit:
            yield self.log("āš ļø Context limit exceeded!")
            yield self.log(f"Paper size: {text_length:,} characters")
            yield self.log(f"Maximum allowed: {self.context_limit:,} characters")
            yield self.log("")
            yield self.log("āŒ Error: Paper is too large to process with current settings.")
            yield self.log("")
            yield self.log("šŸ“‹ How to fix:")
            yield self.log(" 1. Open 'Advanced Options' below")
            yield self.log(" 2. Enable 'Batch Mode'")
            yield self.log(f" 3. Adjust 'Max Context Limit' slider to at least {text_length:,} characters")
            yield self.log(" 4. Try again")
            yield self.log("")
            return None, "\n".join(self.logs)

        # --- Step 3: Generate Script -----------------------------------------
        yield self.log(
            "Thinking: The text is ready. I will now generate a podcast script using the LLM."
        )
        yield self.log(self._describe_provider())
        yield self.log("Tool Call: generate_script(...)")
        generator = self._build_generator()
        script = generator.generate_podcast_script(
            text,
            target_dialogue_count=self.target_dialogue_count,
            persona_mode=self.persona_mode,
        )
        if not script:
            yield self.log("Error: Failed to generate script.")
            return None, "\n".join(self.logs)  # FIX: was raw list self.logs
        yield self.log(
            f"Generated script with {len(script)} dialogue turns (target: {self.target_dialogue_count})."
        )

        # --- Step 4: Synthesize Audio ----------------------------------------
        yield self.log("Thinking: The script looks good. Sending it to the TTS engine.")
        yield self.log(f"Using {self._tts_name()}")
        yield self.log("Tool Call: synthesize_podcast(...)")
        tts = self._build_tts()
        audio_path = tts.synthesize_dialogue(script)
        if not audio_path:
            yield self.log("Error: Failed to synthesize audio.")
            return None, "\n".join(self.logs)  # FIX: was raw list self.logs
        yield self.log(f"Podcast generated successfully at: {audio_path}")

        # Save to history
        save_to_history(source_ref, audio_path, len(script))
        yield self.log("āœ“ Saved to history")

        return audio_path, "\n".join(self.logs)

    def process_multiple(self, urls: list = None, pdf_files: list = None):
        """
        Orchestrates the conversion from multiple URLs or PDFs to a single
        comprehensive Podcast.

        Args:
            urls: List of paper URLs (arXiv or medRxiv)
            pdf_files: List of uploaded PDF file objects

        Yields:
            Timestamped log lines describing progress.

        Returns:
            Tuple of (audio_path or None, newline-joined log string).
        """
        all_texts = []
        source_refs = []
        total_chars = 0

        # --- Process URLs -----------------------------------------------------
        if urls:
            yield self.log(f"Received {len(urls)} URLs to process.")
            yield self.log(f"Context limit: {self.context_limit:,} characters")

            for i, url in enumerate(urls, 1):
                yield self.log(f"\n=== Processing Paper {i}/{len(urls)} ===")
                yield self.log(f"URL: {url}")

                # Step 1: Fetch Paper (failures skip the paper, not the batch)
                yield self.log(f"Tool Call: fetch_paper({url})")
                pdf_path = fetch_paper_from_url(url)
                if not pdf_path:
                    yield self.log(f"Warning: Failed to download paper {i}, skipping.")
                    continue
                yield self.log(f"Paper {i} downloaded successfully.")

                # Step 2: Read PDF
                yield self.log(f"Tool Call: read_pdf({pdf_path})")
                text = extract_text_from_pdf(pdf_path)
                if not text:
                    yield self.log(
                        f"Warning: Failed to extract text from paper {i}, skipping."
                    )
                    continue

                text_length = len(text)
                yield self.log(f"Extracted {text_length:,} characters from paper {i}.")

                # Stop accumulating once the next paper would blow the budget.
                if total_chars + text_length > self.context_limit:
                    yield self.log("āš ļø Context limit reached!")
                    yield self.log(
                        f"Current total: {total_chars:,} chars + Paper {i}: {text_length:,} chars = {total_chars + text_length:,} chars"
                    )
                    yield self.log(f"Maximum allowed: {self.context_limit:,} chars")
                    yield self.log(
                        f"Stopping at {len(all_texts)} papers. Remaining papers will be skipped."
                    )
                    break

                all_texts.append(f"=== PAPER {i} ===\n{text}\n")
                source_refs.append(url)
                total_chars += text_length
                yield self.log(
                    f"āœ“ Paper {i} added. Total context: {total_chars:,} chars ({(total_chars / self.context_limit) * 100:.1f}% of limit)"
                )

        # --- Process PDFs -----------------------------------------------------
        if pdf_files:
            yield self.log(f"\nReceived {len(pdf_files)} PDF files to process.")
            if not urls:  # Only show limit if we didn't already show it for URLs
                yield self.log(f"Context limit: {self.context_limit:,} characters")

            for i, pdf_file in enumerate(pdf_files, 1):
                # Calculate paper number (continues from URL count)
                paper_num = (len(urls) if urls else 0) + i
                yield self.log(
                    f"\n=== Processing PDF {i}/{len(pdf_files)} (Paper {paper_num}) ==="
                )

                pdf_name = pdf_file.name if hasattr(pdf_file, "name") else f"file_{i}"
                yield self.log(f"File: {pdf_name}")
                pdf_path = pdf_file.name if hasattr(pdf_file, "name") else pdf_file

                # Read PDF
                yield self.log(f"Tool Call: read_pdf({pdf_path})")
                text = extract_text_from_pdf(pdf_path)
                if not text:
                    yield self.log(
                        f"Warning: Failed to extract text from PDF {i}, skipping."
                    )
                    continue

                text_length = len(text)
                yield self.log(f"Extracted {text_length:,} characters from PDF {i}.")

                # Same budget check as the URL loop above.
                if total_chars + text_length > self.context_limit:
                    yield self.log("āš ļø Context limit reached!")
                    yield self.log(
                        f"Current total: {total_chars:,} chars + PDF {i}: {text_length:,} chars = {total_chars + text_length:,} chars"
                    )
                    yield self.log(f"Maximum allowed: {self.context_limit:,} chars")
                    yield self.log(
                        f"Stopping at {len(all_texts)} files. Remaining items will be skipped."
                    )
                    break

                all_texts.append(f"=== PAPER {paper_num} ===\n{text}\n")
                source_refs.append(f"Uploaded PDF: {pdf_name}")
                total_chars += text_length
                yield self.log(
                    f"āœ“ PDF {i} added. Total context: {total_chars:,} chars ({(total_chars / self.context_limit) * 100:.1f}% of limit)"
                )

        if not all_texts:
            yield self.log("Error: No papers were successfully processed.")
            return None, "\n".join(self.logs)

        # --- Combine all texts ------------------------------------------------
        yield self.log(f"\nāœ“ Successfully processed {len(all_texts)} papers")
        yield self.log(
            f"Total context: {total_chars:,} characters ({(total_chars / self.context_limit) * 100:.1f}% of limit)"
        )
        yield self.log(
            "Thinking: Now I'll combine all papers into a comprehensive podcast script."
        )
        combined_text = "\n\n".join(all_texts)

        # --- Step 3: Generate Comprehensive Script ---------------------------
        yield self.log(
            "\nThinking: Creating a comprehensive podcast script covering all papers."
        )
        yield self.log(self._describe_provider())
        yield self.log("Tool Call: generate_script(...)")
        generator = self._build_generator()

        # Add instruction for multi-paper script
        multi_paper_prompt = (
            f"[MULTIPLE PAPERS - {len(all_texts)} papers total. Create a comprehensive "
            f"podcast discussing all papers.]\n\n{combined_text}"
        )
        script = generator.generate_podcast_script(
            multi_paper_prompt,
            target_dialogue_count=self.target_dialogue_count,
            persona_mode=self.persona_mode,
        )
        if not script:
            yield self.log("Error: Failed to generate script.")
            return None, "\n".join(self.logs)  # FIX: was raw list self.logs
        yield self.log(
            f"Generated comprehensive script with {len(script)} dialogue turns."
        )

        # --- Step 4: Synthesize Audio ----------------------------------------
        yield self.log(
            "\nThinking: The script looks good. Sending it to the TTS engine."
        )
        yield self.log(f"Using {self._tts_name()}")
        yield self.log("Tool Call: synthesize_podcast(...)")
        tts = self._build_tts()
        audio_path = tts.synthesize_dialogue(script)
        if not audio_path:
            yield self.log("Error: Failed to synthesize audio.")
            return None, "\n".join(self.logs)  # FIX: was raw list self.logs
        yield self.log(f"Podcast generated successfully at: {audio_path}")

        # Save to history (truncate the source list to the first three refs)
        source_ref = f"Multiple papers: {', '.join(source_refs[:3])}{'...' if len(source_refs) > 3 else ''}"
        save_to_history(source_ref, audio_path, len(script))
        yield self.log("āœ“ Saved to history")

        return audio_path, "\n".join(self.logs)