# =============================================================
# Lecture β†’ English Podcast Generator
# β€’ Script: HF Inference API (Qwen/Qwen2.5-Coder-32B-Instruct)
# β€’ Audio: MeloTTS (English)
# =============================================================
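# =============================================================
# Assumed environment (a sketch, not a pinned requirements list):
# gradio, PyPDF2, huggingface_hub, torch, nltk, and MeloTTS
# (its README suggests `pip install git+https://github.com/myshell-ai/MeloTTS.git`
# followed by `python -m unidic download`).
# =============================================================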

import re
import tempfile
import textwrap
from pathlib import Path
from typing import List

import gradio as gr
from PyPDF2 import PdfReader
from huggingface_hub import InferenceClient

import torch
import nltk
nltk.download('averaged_perceptron_tagger_eng')
from melo.api import TTS

# ────────────────────────────────────────────────────────────────────
# 1) Setup HF client & MeloTTS for English
# ────────────────────────────────────────────────────────────────────
hf_client = InferenceClient()  # picks up a cached login / HF_TOKEN if present, otherwise anonymous

device = 'cuda' if torch.cuda.is_available() else 'cpu'
melo_en = TTS(language='EN', device=device)
speaker_ids = melo_en.hps.data.spk2id
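# NOTE: the English MeloTTS checkpoint typically exposes several accent
# variants (e.g. 'EN-US', 'EN-BR', 'EN-Default'); we simply take the first key.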
default_speaker = next(iter(speaker_ids.keys()))

# ────────────────────────────────────────────────────────────────────
# 2) Prompt template
# ────────────────────────────────────────────────────────────────────
PROMPT = textwrap.dedent("""
You are producing a lively two-host educational podcast in English.
Summarize the following lecture content into a dialogue of approximately 300 words.
Make it engaging: hosts ask questions, clarify ideas with analogies,
and wrap up with a concise recap. Preserve technical accuracy.
Use Markdown for host names (e.g., **Host 1:**).

### Lecture Content
{content}
""")

# ────────────────────────────────────────────────────────────────────
# 3) Helpers
# ────────────────────────────────────────────────────────────────────
def extract_pdf_text(pdf_path: str) -> str:
    """Concatenate the extractable text of every page in the PDF."""
    reader = PdfReader(pdf_path)
    # extract_text() can return None for image-only pages, hence the `or ""`.
    return "\n".join(page.extract_text() or "" for page in reader.pages)

def split_to_chunks(text: str, limit: int = 280) -> List[str]:
    """Greedily pack whole sentences into chunks of roughly `limit` characters
    (a single over-long sentence becomes its own chunk)."""
    sents = [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()]
    chunks, curr = [], ""
    for sent in sents:
        if curr and len(curr) + len(sent) + 1 > limit:
            chunks.append(curr)
            curr = sent
        else:
            curr = f"{curr} {sent}".strip() if curr else sent
    if curr:
        chunks.append(curr)
    return chunks
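
# Illustrative example (the helper is not called in the current pipeline,
# but is kept around for chunked TTS):
#   split_to_chunks("One. Two. Three.", limit=10)  ->  ["One. Two.", "Three."]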

# ────────────────────────────────────────────────────────────────────
# 4) Main generate function
# ────────────────────────────────────────────────────────────────────
def generate_podcast(lecture_pdf: str, progress: gr.Progress = gr.Progress()):
    if not lecture_pdf:
        raise gr.Error("Please upload a lecture PDF.")
    # 1️⃣ Extract text & build the prompt
    raw = extract_pdf_text(lecture_pdf)
    prompt = PROMPT.format(content=raw)
    # 2️⃣ HF text generation: text_generation takes the prompt positionally and
    #    generation settings as keyword arguments; with the default
    #    details=False it returns the generated text as a plain str.
    script = hf_client.text_generation(
        prompt,
        model="Qwen/Qwen2.5-Coder-32B-Instruct",
        max_new_tokens=512,
        temperature=0.5,
    )

    # 3️⃣ MeloTTS audio, written to a temporary WAV file with the default English speaker
    tmpdir = Path(tempfile.mkdtemp())
    wav_path = tmpdir / "podcast.wav"
    melo_en.tts_to_file(
        script,
        speaker_ids[default_speaker],
        str(wav_path),
        speed=1.0,
        pbar=progress.tqdm,
    )

    # Return the Markdown script and the WAV path (gr.Audio is type="filepath")
    return script, str(wav_path)
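
# NOTE (untested sketch): if very long scripts ever exceed what MeloTTS handles
# comfortably in one call, the script could be split with split_to_chunks(),
# each piece synthesized separately, and the WAV segments concatenated.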

# ────────────────────────────────────────────────────────────────────
# 5) Gradio UI
# ────────────────────────────────────────────────────────────────────
with gr.Blocks() as demo:
    gr.Markdown("## Lecture β†’ English Podcast")
    pdf_in = gr.File(label="Upload Lecture PDF", file_types=[".pdf"], type="filepath")
    btn = gr.Button("Generate Podcast")
    script_md = gr.Markdown(label="Podcast Script")
    audio_out = gr.Audio(label="Podcast Audio", type="filepath")
    btn.click(fn=generate_podcast, inputs=[pdf_in], outputs=[script_md, audio_out])

demo.launch()