File size: 7,368 Bytes
3ac5c08 61caafb b65edab 3ac5c08 eab7fca b65edab 25acce6 b65edab eab7fca b65edab eab7fca 3ac5c08 b65edab 61caafb b65edab 3ac5c08 b65edab 61caafb b65edab 61caafb 3ac5c08 b65edab 3ac5c08 b65edab 3ac5c08 b65edab 61caafb b65edab 03eec30 b65edab 03eec30 cea6d8c 3ac5c08 cea6d8c b65edab cea6d8c b65edab cea6d8c 7c4326e b65edab 3ac5c08 b65edab cea6d8c b65edab 3ac5c08 b65edab 7c4326e b65edab 3ac5c08 b65edab 03eec30 b65edab 3ac5c08 b65edab 61caafb b65edab 61caafb b65edab 61caafb b65edab 3ac5c08 b65edab |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 |
import os
import tempfile
import gradio as gr
import shutil
import ast
import numpy as np
import soundfile as sf
import warnings
try:
from moshi.models.tts import TTSModel
except ImportError:
print("Moshi TTSModel not available β install Kyutaiβs version via pip.")
TTSModel = None
from notebook_lm_kokoro import (
generate_podcast_script,
generate_audio_from_script,
generate_audio_kyutai,
KPipeline,
)
warnings.filterwarnings("ignore")
def process_segment(entry, voice_map):
speaker, dialogue = entry
chosen_voice = voice_map.get(speaker, "af_heart")
pipeline = KPipeline(lang_code="a", repo_id="hexgrad/Kokoro-82M")
generator = pipeline(dialogue, voice=chosen_voice)
return np.concatenate([audio for _, _, audio in generator], axis=0) if generator else None
def generate_audio_from_script_with_voices(script, speaker1_voice, speaker2_voice, output_file):
print("[DEBUG] Raw transcript string:")
print(script)
voice_map = {"Speaker 1": speaker1_voice, "Speaker 2": speaker2_voice}
try:
transcript_list = ast.literal_eval(script)
if not isinstance(transcript_list, list):
raise ValueError("Transcript is not a list")
entries = [entry for entry in transcript_list if isinstance(entry, tuple) and len(entry) == 2]
results = [process_segment(entry, voice_map) for entry in entries if entry is not None]
if not results:
return None
sample_rate = 24000
pause = np.zeros(sample_rate, dtype=np.float32)
final_audio = results[0]
for seg in results[1:]:
final_audio = np.concatenate((final_audio, pause, seg), axis=0)
sf.write(output_file, final_audio, sample_rate)
return output_file
except Exception as e:
print(f"Transcript parse error: {e}")
return None
def process_pdf(pdf_file, speaker1_voice, speaker2_voice, kyutai_voice1, kyutai_voice2,
provider, openai_key=None, openrouter_key=None, openrouter_base=None, tts_engine=None):
try:
if provider == "openai" and not openai_key:
return "OpenAI API key is required", None
if provider == "openrouter" and not openrouter_key:
return "OpenRouter API key is required", None
if provider in ["openai", "kyutai"]:
os.environ["OPENAI_API_KEY"] = openai_key or ""
os.environ["OPENROUTER_API_BASE"] = "https://api.openai.com/v1"
if provider in ["openrouter", "kyutai"]:
os.environ["OPENAI_API_KEY"] = openrouter_key or ""
os.environ["OPENROUTER_API_BASE"] = openrouter_base or "https://openrouter.ai/api/v1"
if pdf_file is None:
return "No file uploaded", None
tmp_path = pdf_file.name
script_provider = "openrouter" if provider == "kyutai" and openrouter_key else provider
transcript, _ = generate_podcast_script(pdf_file.name, provider=script_provider)
if transcript is None:
return "Transcript generation failed: got None", None
if not transcript.strip().startswith("["):
return f"Malformed transcript:\n{transcript}", None
audio_path = os.path.join(os.path.dirname(tmp_path), f"audio_{os.path.basename(tmp_path).replace('.pdf', '.wav')}")
if tts_engine == "kyutai":
result = generate_audio_kyutai(transcript, kyutai_voice1, kyutai_voice2, audio_path)
else:
result = generate_audio_from_script_with_voices(transcript, speaker1_voice, speaker2_voice, audio_path)
return ("Process complete!", result) if result else ("Error generating audio", None)
except Exception as e:
print(f"process_pdf error: {e}")
return f"Error: {e}", None
def update_ui(provider, tts_engine):
return [
gr.update(visible=tts_engine == "kokoro"),
gr.update(visible=tts_engine == "kokoro"),
gr.update(visible=tts_engine == "kyutai"),
gr.update(visible=tts_engine == "kyutai"),
gr.update(visible=provider in ["openai", "kyutai"]),
gr.update(visible=provider in ["openrouter", "kyutai"]),
gr.update(visible=provider == "openrouter"),
]
def create_gradio_app():
css = ".gradio-container {max-width: 900px !important}"
with gr.Blocks(css=css, theme=gr.themes.Soft()) as app:
gr.Markdown("# π§ PDF to Podcast β NotebookLM + Kokoro/Kyutai")
pdf_input = gr.File(file_types=[".pdf"], type="filepath", label="π Upload your PDF", scale=2)
with gr.Row():
speaker1_voice = gr.Dropdown(["af_heart", "af_bella", "hf_beta"], value="af_heart", label="Speaker 1 Voice")
speaker2_voice = gr.Dropdown(["af_nicole", "af_heart", "bf_emma"], value="bf_emma", label="Speaker 2 Voice")
provider = gr.Radio(["openai", "openrouter"], value="openrouter", label="API Provider")
openai_key = gr.Textbox(type="password", label="OpenAI Key")
openrouter_key = gr.Textbox(type="password", label="OpenRouter Key")
openrouter_base = gr.Textbox(placeholder="https://openrouter.ai/api/v1", label="OpenRouter Base URL")
tts_engine = gr.Radio(["kokoro", "kyutai"], value="kokoro", label="TTS Engine")
with gr.Row():
kyutai_voice1 = gr.Dropdown([
"expresso/ex03-ex01_happy_001_channel1_334s.wav",
"expresso/ex03-ex02_narration_001_channel1_674s.wav",
"vctk/p226_023_mic1.wav"
],
value="expresso/ex03-ex01_happy_001_channel1_334s.wav",
label="Kyutai Voice 1",
visible=True)
kyutai_voice2 = gr.Dropdown([
"expresso/ex03-ex01_happy_001_channel1_334s.wav",
"expresso/ex03-ex02_narration_001_channel1_674s.wav",
"vctk/p225_023_mic1.wav"
],
value="expresso/ex03-ex02_narration_001_channel1_674s.wav",
label="Kyutai Voice 2",
visible=True)
submit_btn = gr.Button("ποΈ Generate Podcast", variant="primary")
status_output = gr.Textbox(label="π Status", interactive=False)
audio_output = gr.Audio(type="filepath", label="π΅ Your Podcast")
submit_btn.click(
process_pdf,
inputs=[pdf_input, speaker1_voice, speaker2_voice, kyutai_voice1, kyutai_voice2,
provider, openai_key, openrouter_key, openrouter_base, tts_engine],
outputs=[status_output, audio_output]
)
provider.change(update_ui, [provider, tts_engine],
[speaker1_voice, speaker2_voice, kyutai_voice1, kyutai_voice2,
openai_key, openrouter_key, openrouter_base])
tts_engine.change(update_ui, [provider, tts_engine],
[speaker1_voice, speaker2_voice, kyutai_voice1, kyutai_voice2,
openai_key, openrouter_key, openrouter_base])
gr.Markdown("""
**π Tips**
- Upload a clean, structured PDF.
- Pick your API provider and enter relevant keys.
- Choose the TTS engine and customize voices.
""")
return app
if __name__ == "__main__":
create_gradio_app().queue().launch(server_name="0.0.0.0", server_port=7860, share=True, debug=True, pwa=True)
|