open-notebooklm

Running on Zero

File size: 5,885 Bytes

import queue
import threading
import spaces
import os
import io
import soundfile as sf
import gradio as gr
import numpy as np
import time
import pymupdf
import requests
from pathlib import Path

import torch
from huggingface_hub import InferenceClient
from kokoro import KModel, KPipeline

# -----------------------------------------------------------------------------
# Download example PDF
# -----------------------------------------------------------------------------
# Fetch the example PDF referenced by EXAMPLES so it exists locally at startup.
os.makedirs("examples", exist_ok=True)
response = requests.get(
    "https://www.palantir.com/assets/xrfr7uokpv1b/1wtb4LWF7XIuJisnMwH0XW/dc37fdda646a5df6c5b86f695ce990c0/NYT_-_Our_Oppenheimer_Moment-_The_Creation_of_A.I._Weapons.pdf",
    timeout=60,  # never hang application startup on a stalled connection
)
# Fail loudly on an HTTP error instead of saving an error page as a ".pdf".
response.raise_for_status()
with open("examples/Essay_Palantir.pdf", "wb") as f:
    f.write(response.content)

# -----------------------------------------------------------------------------
# LLM that writes the script
# -----------------------------------------------------------------------------
from prompts import SYSTEM_PROMPT

# Chat client for the LLM that writes the podcast script, routed through the
# "cerebras" provider. The access token is read from the HF_TOKEN environment
# variable (None when the variable is not set).
client = InferenceClient(
    model="meta-llama/Llama-3.3-70B-Instruct",
    provider="cerebras",
    token=os.environ.get("HF_TOKEN"),
)


_DIALOGUE_START_TAG = "[JANE]"


def _extract_dialogue(full_text: str | None) -> str:
    """Return *full_text* from its first "[JANE]" tag onwards.

    Raises:
        ValueError: If the text is empty/None or contains no "[JANE]" tag.
    """
    if not full_text:
        raise ValueError("The LLM returned an empty response instead of a podcast script.")
    dialogue_start_index = full_text.find(_DIALOGUE_START_TAG)
    if dialogue_start_index == -1:
        raise ValueError(
            f"The LLM response contains no {_DIALOGUE_START_TAG} line, "
            "so no podcast dialogue could be extracted."
        )
    return full_text[dialogue_start_index:]


def generate_podcast_script(subject: str, steering_question: str | None = None) -> str:
    """Ask the LLM for a script of a podcast given by two hosts.

    Args:
        subject: Source material to discuss. Only the first 10,000 characters
            are sent to the model.
        steering_question: Optional question or topic. When it is non-blank it
            is appended as an additional user message.

    Returns:
        The model's reply starting at its first "[JANE]" tag, so any text the
        model produced before the dialogue is dropped.

    Raises:
        ValueError: If the reply is empty or contains no "[JANE]" tag. This is
            an explicit check rather than an ``assert`` so that it still runs
            when Python is started with ``-O``.
    """
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"""Please analyze this content and create an engaging podcast discussion:
{subject[:10000]}"""},
    ]
    if steering_question and steering_question.strip():
        messages.append({"role": "user", "content": f"You could focus on this question: {steering_question}"})

    response = client.chat_completion(
        messages,
        max_tokens=8156,
    )
    return _extract_dialogue(response.choices[0].message.content)

# -----------------------------------------------------------------------------
# Kokoro TTS
# -----------------------------------------------------------------------------
# Use the GPU when torch can see one, otherwise fall back to the CPU.
CUDA_AVAILABLE = torch.cuda.is_available()
_device = "cuda" if CUDA_AVAILABLE else "cpu"

# Kokoro model moved to the selected device and switched to eval mode,
# plus the pipeline that is later called with text, a voice name and a speed.
kmodel = (
    KModel(repo_id="hexgrad/Kokoro-82M")
    .to(_device)
    .eval()
)
kpipeline = KPipeline(lang_code="a")  # English voices

# Voice identifiers for the two podcast hosts.
MALE_VOICE = "am_fenrir"
FEMALE_VOICE = "af_heart"

# Load both voices once at import time to avoid first-call latency.
for voice_name in (MALE_VOICE, FEMALE_VOICE):
    kpipeline.load_voice(voice_name)

def _load_material_text(url: str | None, pdf_path: str | None) -> str:
    """Return the text of the uploaded PDF or, when no PDF is given, of the web page.

    The PDF takes precedence when both inputs are provided.

    Raises:
        gr.Error: If neither input is provided or the web page cannot be fetched.
    """
    if pdf_path:
        with pymupdf.open(pdf_path) as pdf_doc:
            return "".join(page.get_text() for page in pdf_doc)

    if url and url.strip():
        try:
            # The page is fetched through the r.jina.ai prefix rather than directly.
            response = requests.get(f"https://r.jina.ai/{url.strip()}", timeout=60)
            response.raise_for_status()
        except requests.RequestException as err:
            raise gr.Error(f"Could not fetch the webpage: {err}") from err
        return response.text

    raise gr.Error("Please provide either a URL or upload a PDF file to generate a podcast.")


def _split_speaker(line: str) -> tuple[str, str]:
    """Return ``(voice name, spoken text)`` for one script line.

    Lines starting with "[MIKE]" use the male voice and lines starting with
    "[JANE]" use the female voice; untagged lines fall back to the female voice.
    """
    for tag, voice in (("[MIKE]", MALE_VOICE), ("[JANE]", FEMALE_VOICE)):
        if line.startswith(tag):
            return voice, line[len(tag):].strip()
    return FEMALE_VOICE, line


@spaces.GPU
def generate_podcast(url: str, pdf_path: str, topic: str):
    """Generate a two-host podcast about a PDF or web page and stream its audio.

    Args:
        url: Web page to discuss; used only when no PDF is given.
        pdf_path: Path of a PDF to discuss; takes precedence over ``url``.
        topic: Optional steering question forwarded to the script-writing LLM.

    Yields:
        ``(sample_rate, audio)`` tuples, one per synthesized chunk, where
        ``sample_rate`` is 24000 and ``audio`` is the NumPy array produced by
        the Kokoro model for that chunk.

    Raises:
        gr.Error: If no source is provided, the web page cannot be fetched, or
            the source contains no text.
    """
    material_text = _load_material_text(url, pdf_path)
    if not material_text.strip():
        raise gr.Error("No text could be extracted from the provided material.")

    # Generate podcast script!
    podcast_script = generate_podcast_script(material_text, topic)

    voice_packs = {
        FEMALE_VOICE: kpipeline.load_voice(FEMALE_VOICE),
        MALE_VOICE: kpipeline.load_voice(MALE_VOICE),
    }

    speed = 1.0
    sr = 24000

    for raw_line in podcast_script.splitlines():
        # Strip first so that an indented "[MIKE]"/"[JANE]" tag is still recognised
        # instead of being read aloud by the fallback voice.
        line = raw_line.strip()
        if not line:
            continue

        # Expect "[MIKE] ..." or "[JANE] ..."
        voice, utterance = _split_speaker(line)
        if not utterance:
            continue  # a bare tag with nothing to say
        voice_pack = voice_packs[voice]

        for _, ps, _ in kpipeline(utterance, voice, speed):
            t0 = time.perf_counter()
            # NOTE(review): presumably the voice pack is indexed by phoneme-sequence
            # length to pick the style reference - confirm against Kokoro.
            ref_s = voice_pack[len(ps) - 1]
            audio_numpy = kmodel(ps, ref_s, speed).numpy()
            # Stop the clock before yielding: a generator is suspended at `yield`,
            # so timing afterwards would also count the consumer's time.
            elapsed = time.perf_counter() - t0
            print(f"PROCESSED '{utterance}' in {elapsed:.2f} seconds. {audio_numpy.shape}")
            yield (sr, audio_numpy)

# Example inputs for the interface, in the order (url, pdf_path, topic).
# Each example supplies exactly one source: a webpage URL or the bundled PDF.
EXAMPLES = [
    # The steering question must relate to the linked page; the previous one asked
    # about "inference solutions", which has nothing to do with this article.
    ["https://en.wikipedia.org/wiki/Tupac_Shakur", None, "What made his music and legacy so influential?"],
    [None, str(Path("examples/Essay_Palantir.pdf")), "Make sure to keep some critic spirit in the analysis!"],
]

# Gradio UI: three optional inputs (URL, PDF upload, steering question) feeding
# generate_podcast, whose yielded audio chunks are played by a streaming player.
demo = gr.Interface(
    title="Open NotebookLM 🎙️",
    # The Llama link uses an absolute URL; a bare "meta-llama/..." target would be
    # resolved relative to the current page and lead nowhere.
    description="""Generates a podcast discussion between two hosts about the materials of your choice. 
Upload a PDF or provide a webpage URL to create your podcast discussion.
Based on [Kokoro TTS](https://huggingface.co/hexgrad/Kokoro-82M), lightning-fast inference for [Llama-3.3-70B](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct) by Cerebras, and uses elements from a NotebookLM app by [Gabriel Chua](https://huggingface.co/spaces/gabrielchua/open-notebooklm).""",
    fn=generate_podcast,
    inputs=[
        gr.Textbox(
            label="🔗 Type a Webpage URL to discuss it (Optional)",
            placeholder="The URL you want to discuss the content for.",
        ),
        gr.File(
            label="Upload a PDF as discussion material (Optional)",
            file_types=[".pdf"],
            file_count="single",
        ),
        gr.Textbox(label="🤔 Do you have a more specific topic or question on the materials?", placeholder="You can leave this blank."),
    ],
    outputs=[
        gr.Audio(
            label="Listen to your podcast! 🔊",
            format="wav",
            streaming=True,
        ),
    ],
    theme=gr.themes.Soft(),
    submit_btn="Generate podcast 🎙️",
    examples=EXAMPLES,
    cache_examples=True,
)

# Start the web server only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()