import concurrent.futures as cf
import glob
import io
import os
import time
from pathlib import Path
from tempfile import NamedTemporaryFile
from typing import List, Literal

import gradio as gr
import pdfplumber
import requests
import sentry_sdk
from fastapi import FastAPI
from fastapi.staticfiles import StaticFiles
from loguru import logger
from openai import OpenAI
from promptic import llm
from pydantic import BaseModel, ValidationError
from pydub import AudioSegment
from tenacity import retry, retry_if_exception_type

# from docx import Document
# from pptx import Presentation

from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_loaders.llmsherpa import LLMSherpaFileLoader
from langchain_openai import ChatOpenAI


sentry_sdk.init(os.getenv("SENTRY_DSN"))

app = FastAPI()

app.mount("/static", StaticFiles(directory="static"), name="static")


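# One spoken line of the generated script; `voice` maps each speaker label to an
# OpenAI text-to-speech voice.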
class DialogueItem(BaseModel):
    text: str
    speaker: Literal["female-1", "male-1", "female-2"]

    @property
    def voice(self):
        return {
            "female-1": "nova",
            "male-1": "onyx",
            "female-2": "shimmer",
        }[self.speaker]


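# A complete episode: the model's planning scratchpad plus the ordered dialogue lines.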
class Dialogue(BaseModel):
    scratchpad: str
    dialogue: List[DialogueItem]


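# promptic's @llm decorator turns the docstring below into the prompt (filling in
# {text}) and parses the model's reply into the annotated Dialogue return type;
# tenacity retries the call whenever that parsing fails pydantic validation.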
@retry(retry=retry_if_exception_type(ValidationError))
@llm(model="gpt-4o")
def generate_dialogue(text: str) -> Dialogue:
    """
Your task is to take the input text provided and turn it into an engaging, informative podcast dialogue. The input text may be messy or unstructured, as it could come from a variety of sources like PDFs or web pages. Don't worry about the formatting issues or any irrelevant information; your goal is to extract the key points and interesting facts that could be fully discussed in a podcast.

Here is the input text you will be working with:

<input_text>
{text}
</input_text>

First, carefully read through the input text and identify the main topics, key points, and any interesting facts or anecdotes. Think about how you could present this information in a fun, engaging way that would be suitable for an audio podcast.

<scratchpad>
Brainstorm creative ways to discuss the main topics and key points you identified in the input text. Consider using analogies, storytelling techniques, or hypothetical scenarios to make the content more relatable and engaging for listeners.
Keep in mind that your podcast should be accessible to a general audience, so avoid using too much jargon or assuming prior knowledge of the topic. If necessary, think of ways to briefly explain any complex concepts in simple terms.

Use your imagination to fill in any gaps in the input text or to come up with thought-provoking questions that could be explored in the podcast. The goal is to create an informative and entertaining dialogue, so feel free to be creative in your approach.

Write your brainstorming ideas and a rough outline for the podcast dialogue here. Be sure to note the key insights and takeaways you want to reiterate at the end.
</scratchpad>

Now that you have brainstormed ideas and created a rough outline, it's time to write the actual podcast dialogue. Aim for a natural, conversational flow between the host and any guest speakers. Incorporate the best ideas from your brainstorming session and make sure to explain any complex topics in an easy-to-understand way.

<podcast_dialogue>
Write your engaging, informative podcast dialogue here, based on the key points and creative ideas you came up with during the brainstorming session. Use a conversational tone and include any necessary context or explanations to make the content accessible to a general audience. Use made-up names for the hosts and guests to create a more engaging and immersive experience for listeners. Design your output to be read aloud -- it will be directly converted into audio.

Make the dialogue sound like a natural conversation between Taiwanese people. Write it in colloquial Taiwanese Mandarin using Traditional Chinese (zh-Hant), with cultural references and a friendly tone that reflects how people in Taiwan typically speak to each other. Incorporate local phrases and expressions to make the conversation authentic and relatable.

Throughout the dialogue, sprinkle in new insights or interesting ideas that might arise naturally from the conversation. These could be personal anecdotes, hypothetical scenarios, or surprising facts that keep the listeners engaged.

Include emotional cues to make the conversation more engaging, such as laughter, excitement, or surprise.

Make the dialogue as long and detailed as possible, while still staying on topic and maintaining an engaging flow. Aim to use your full output capacity to create the longest podcast episode you can, while still communicating the key information from the input text in an entertaining way.

At the end of the dialogue, have the host and guest speakers naturally summarize the main insights and takeaways from their discussion. This should flow organically from the conversation, reiterating the key points in a casual, conversational manner. Avoid making it sound like an obvious recap - the goal is to reinforce the central ideas one last time before signing off.
</podcast_dialogue>
    """


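# Synthesize one dialogue line with OpenAI's tts-1 model, streaming the response into
# memory, and return the raw MP3 bytes.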
def get_mp3(text: str, voice: str, api_key: str = None) -> bytes:
    client = OpenAI(
        api_key=api_key or os.getenv("OPENAI_API_KEY"),
    )

    with client.audio.speech.with_streaming_response.create(
        model="tts-1",
        voice=voice,
        input=text,
    ) as response:
        with io.BytesIO() as file:
            for chunk in response.iter_bytes():
                file.write(chunk)
            return file.getvalue()

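# Summarize a single text chunk with LangChain's summarize chain (the default "stuff"
# strategy) running on gpt-4o.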
def summarize_chunk(chunk_text: str) -> str:
    llm = ChatOpenAI(temperature=0, model_name="gpt-4o")
    document = Document(page_content=chunk_text)
    summarize_chain = load_summarize_chain(llm=llm)
    summary = summarize_chain.invoke([document])
    return summary["output_text"]

# Function to read and extract text from a DOCX file
def get_doc_text(filename: str) -> str:
    from docx import Document as DocxDocument
    doc = DocxDocument(filename)
    full_text = [para.text for para in doc.paragraphs]
    return '\n'.join(full_text)

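# Extract text from a PDF with pdfplumber, page by page, skipping pages that yield no
# extractable text.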
def get_pdf_text(filename: str) -> str:
    full_text = []
    with pdfplumber.open(filename) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                full_text.append(text)
    return '\n'.join(full_text)

# Function to split text into smaller chunks
def split_text(text: str, chunk_size: int = 1000, chunk_overlap: int = 200) -> List[str]:
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    chunks = text_splitter.split_text(text)
    return chunks

# Function to summarize a large document with text splitting in parallel
def summarize_large_document(filename: str, chunk_size: int = 1000, chunk_overlap: int = 200) -> str:
    if filename.endswith(".docx"):
        text = get_doc_text(filename)
    elif filename.endswith(".pdf"):
        text = get_pdf_text(filename)
    else:
        raise ValueError("Unsupported file type")
    chunks = split_text(text, chunk_size, chunk_overlap)
    
    # Summarize chunks in parallel; executor.map preserves chunk order, so the
    # combined summary follows the original document.
    with cf.ThreadPoolExecutor() as executor:
        summaries = list(executor.map(summarize_chunk, chunks))
    
    # Combine all summaries into one final summary
    final_summary = "\n".join(summaries)
    return final_summary


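# Load a document from a URL: recognized office/PDF/text content types go through the
# hosted LLMSherpa parser, anything else falls back to LangChain's WebBaseLoader.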
def summarize_with_sherpa(url: str) -> str:
    # Sniff the content type with a HEAD request (follow redirects so we see the final
    # document's headers, and strip any "; charset=..." suffix before comparing).
    response = requests.head(url, allow_redirects=True, timeout=30)
    content_type = (response.headers.get("content-type") or "").split(";")[0].strip()
    allowed_types = [
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document',  # DOCX
        'application/vnd.openxmlformats-officedocument.presentationml.presentation',  # PPTX
        'text/html',  # HTML
        'text/plain',  # TXT
        'application/xml',  # XML
        'application/pdf',  # PDF
    ]
    if content_type not in allowed_types:
        # Unknown or missing content type: fall back to a generic web page loader.
        loader = WebBaseLoader(url)
    else:
        loader = LLMSherpaFileLoader(
            file_path=url,
            new_indent_parser=True,
            apply_ocr=True,
            strategy="text",
            llmsherpa_api_url="https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all",
        )
    docs = loader.load()
    return docs[0].page_content

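# End-to-end pipeline: summarize the uploaded file or URL, have gpt-4o write a podcast
# dialogue, synthesize each line to MP3 in parallel, and return the stitched audio
# path, the transcript, and the duration in seconds.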
def generate_audio(file=None, url=None, openai_api_key: str = None):

    if not (os.getenv("OPENAI_API_KEY") or openai_api_key):
        raise gr.Error("OpenAI API key is required")

    text = ""
    if file:
        try:
            text = summarize_large_document(file)
        except ValueError as e:
            raise gr.Error(str(e))
    elif url:
        try:
            text = summarize_with_sherpa(url)
        except Exception as e:
            raise gr.Error(str(e))

    print(text)

    llm_output = generate_dialogue(text)

    audio = b""
    transcript = ""

    characters = 0

    with cf.ThreadPoolExecutor() as executor:
        futures = []
        for line in llm_output.dialogue:
            transcript_line = f"{line.speaker}: {line.text}"
            future = executor.submit(get_mp3, line.text, line.voice, openai_api_key)
            futures.append((future, transcript_line))
            characters += len(line.text)

        for future, transcript_line in futures:
            audio_chunk = future.result()
            audio += audio_chunk
            transcript += transcript_line + "\n\n"

    logger.info(f"Generated {characters} characters of audio")

    temporary_directory = "./gradio_cached_examples/tmp/"
    os.makedirs(temporary_directory, exist_ok=True)

    # we use a temporary file because Gradio's audio component doesn't work with raw bytes in Safari
    temporary_file = NamedTemporaryFile(
        dir=temporary_directory,
        delete=False,
        suffix=".mp3",
    )
    temporary_file.write(audio)
    temporary_file.close()

    audio_segment = AudioSegment.from_file(temporary_file.name)
    duration = len(audio_segment) / 1000.0  # duration in seconds

    # Delete any MP3s in the temp directory that are more than a day old.
    for old_mp3 in glob.glob(f"{temporary_directory}*.mp3"):
        if os.path.isfile(old_mp3) and time.time() - os.path.getmtime(old_mp3) > 24 * 60 * 60:
            os.remove(old_mp3)

    return temporary_file.name, transcript, duration
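
# A direct (non-UI) call would look roughly like this, with a hypothetical URL:
#   mp3_path, transcript, duration = generate_audio(url="https://example.com/paper.pdf")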


demo = gr.Interface(
    title="Anything to Podcast",
    description=Path("description.md").read_text(),
    fn=generate_audio,
    examples=[],
    inputs=[
        gr.File(
            label="PDF or DOCX file",
        ),
        gr.Textbox(
            label="URL",
            placeholder="Enter URL of a PDF, DOCX, or PPTX file",
        ),
        gr.Textbox(
            label="OpenAI API Key",
            visible=not os.getenv("OPENAI_API_KEY"),
        ),
    ],
    outputs=[
        gr.Audio(label="Audio", format="mp3"),
        gr.Textbox(label="Transcript"),
        gr.Number(label="Duration (seconds)"),
    ],
    allow_flagging="never",
    clear_btn=None,
    head="Anything to Podcast",
    cache_examples="lazy",
    api_name="anything-to-podcast",
)


demo = demo.queue(
    max_size=20,
    default_concurrency_limit=20,
)

app = gr.mount_gradio_app(app, demo, path="/")

if __name__ == "__main__":
    demo.launch(show_api=True)
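
# To run this locally (assuming the file is saved as app.py and OPENAI_API_KEY is set):
#   python app.py      # launches the Gradio UI via demo.launch()
#   uvicorn app:app    # serves the mounted FastAPI app instead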