PierreBrunelle committed on
Commit
201bb3e
·
verified ·
1 Parent(s): 703043d

Upload 2 files

Browse files
Files changed (2) hide show
  1. src/interface.py +139 -0
  2. src/processor.py +117 -0
src/interface.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from .processor import process_document
3
+
4
def create_interface():
    """Build the Gradio Blocks UI for the document-to-audio synthesis demo.

    Lays out the input controls (PDF upload, API key, voice/style choices,
    processing sliders), the results tabs, and wires the "Process Document"
    button to `process_document`.

    Returns:
        gr.Blocks: the assembled (not yet launched) Gradio app.
    """
    with gr.Blocks(theme=gr.themes.Base()) as demo:
        # Header: logo + title.
        gr.HTML(
            """
            <div style="margin-bottom: 1rem;">
                <img src="https://raw.githubusercontent.com/pixeltable/pixeltable/main/docs/source/data/pixeltable-logo-large.png"
                     alt="Pixeltable" style="max-width: 150px;" />
                <h1>Document to Audio Synthesis</h1>
            </div>
            """
        )

        # Two side-by-side informational accordions.
        with gr.Row():
            with gr.Column():
                with gr.Accordion("What does it do?", open=True):
                    gr.Markdown("""
                    - PDF document processing and text extraction
                    - Intelligent content transformation and summarization
                    - High-quality audio synthesis with voice selection
                    - Configurable processing parameters
                    - Downloadable audio output
                    """)
            with gr.Column():
                with gr.Accordion("How does it work?", open=True):
                    gr.Markdown("""
                    1. **Document Processing**
                       - Chunks document using token-based segmentation
                       - Maintains document structure and context

                    2. **Content Processing**
                       - Transforms text using LLM optimization
                       - Generates optimized audio scripts

                    3. **Audio Synthesis**
                       - Converts scripts to natural speech
                       - Multiple voice models available
                    """)

        # Input controls.
        with gr.Row():
            with gr.Column():
                api_key = gr.Textbox(
                    label="OpenAI API Key",
                    placeholder="sk-...",
                    type="password"
                )
                file_input = gr.File(
                    label="Input Document (PDF)",
                    file_types=[".pdf"]
                )

                with gr.Accordion("Synthesis Parameters", open=True):
                    voice_select = gr.Radio(
                        choices=["alloy", "echo", "fable", "onyx", "nova", "shimmer"],
                        value="onyx",
                        label="Voice Model",
                        info="TTS voice model selection"
                    )
                    style_select = gr.Radio(
                        choices=["Technical", "Narrative", "Instructional", "Descriptive"],
                        value="Technical",
                        label="Processing Style",
                        info="Content processing approach"
                    )

                with gr.Accordion("Processing Parameters", open=False):
                    chunk_size = gr.Slider(
                        minimum=100, maximum=1000, value=300, step=50,
                        label="Chunk Size (tokens)",
                        info="Text segmentation size"
                    )
                    temperature = gr.Slider(
                        minimum=0, maximum=1, value=0.7, step=0.1,
                        label="Temperature",
                        info="LLM randomness factor"
                    )
                    max_tokens = gr.Slider(
                        minimum=100, maximum=1000, value=300, step=50,
                        label="Max Tokens",
                        info="Maximum output token limit"
                    )

                process_btn = gr.Button("Process Document", variant="primary")
                status_output = gr.Textbox(label="Status")

        # Results: processed text table + synthesized audio.
        with gr.Tabs():
            with gr.TabItem("Content Processing"):
                output_table = gr.Dataframe(
                    headers=["Segment", "Processed Content", "Audio Script"],
                    wrap=True
                )
            with gr.TabItem("Audio Output"):
                audio_output = gr.Audio(
                    label="Synthesized Audio",
                    type="filepath",
                    show_download_button=True
                )

        gr.Markdown("""
        ### Technical Notes
        - Token limit affects processing speed and memory usage
        - Temperature values > 0.8 may introduce content variations
        - Audio synthesis has a 4096 token limit per segment

        ### Performance Considerations
        - Chunk size directly impacts processing time
        - Higher temperatures increase LLM compute time
        - Audio synthesis scales with script length
        """)

        gr.HTML(
            """
            <div style="text-align: center; margin-top: 1rem; padding-top: 1rem; border-top: 1px solid #ccc;">
                <p style="margin: 0; color: #666; font-size: 0.8em;">
                    Powered by <a href="https://github.com/pixeltable/pixeltable" target="_blank" style="color: #F25022; text-decoration: none;">Pixeltable</a>
                    | <a href="https://docs.pixeltable.io" target="_blank" style="color: #666;">Documentation</a>
                    | <a href="https://huggingface.co/spaces/Pixeltable/document-to-audio-synthesis" target="_blank" style="color: #666;">Hugging Face Space</a>
                </p>
            </div>
            """
        )

        # Pass process_document directly instead of wrapping it: Gradio
        # injects a progress tracker only when it can see the
        # `progress=gr.Progress()` parameter on the event handler itself.
        # The previous positional wrapper hid that parameter and silently
        # disabled all progress reporting in the processor.
        process_btn.click(
            process_document,
            inputs=[
                file_input, api_key, voice_select, style_select,
                chunk_size, temperature, max_tokens
            ],
            outputs=[output_table, audio_output, status_output]
        )

    return demo
src/processor.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pixeltable as pxt
2
+ from pixeltable.iterators import DocumentSplitter
3
+ from pixeltable.functions import openai
4
+ import os
5
+ import requests
6
+ import tempfile
7
+ import gradio as gr
8
+
9
def process_document(pdf_file, api_key, voice_choice, style_choice, chunk_size, temperature, max_tokens, progress=gr.Progress()):
    """Run the PDF -> summary -> audio-script -> speech pipeline on Pixeltable.

    Args:
        pdf_file: uploaded file object exposing a ``.name`` path (from gr.File).
        api_key: OpenAI API key, used for both chat completions and TTS.
        voice_choice: TTS voice model name (e.g. "onyx").
        style_choice: script style injected into the generation prompt.
        chunk_size: token limit per document chunk.
        temperature: LLM sampling temperature.
        max_tokens: max tokens per LLM response.
        progress: Gradio progress tracker (injected by Gradio when this
            function is registered directly as an event handler).

    Returns:
        tuple: (display_data, audio_path, status_message). On failure the
        first two elements are None and the message carries the error text.
    """
    # Guard clauses: fail fast with a readable message instead of letting
    # the broad except below surface a confusing traceback string
    # (e.g. "'NoneType' object has no attribute 'name'").
    if pdf_file is None:
        return None, None, "Error: please upload a PDF document"
    if not api_key:
        return None, None, "Error: OpenAI API key is required"

    try:
        # Pixeltable's openai functions read the key from the environment.
        os.environ['OPENAI_API_KEY'] = api_key

        progress(0.1, desc="Initializing...")
        # Start from a clean slate so re-runs don't collide with old state.
        pxt.drop_dir('document_audio', force=True)
        pxt.create_dir('document_audio')

        docs = pxt.create_table(
            'document_audio.documents',
            {
                'document': pxt.Document,
                'voice': pxt.String,
                'style': pxt.String
            }
        )

        progress(0.2, desc="Processing document...")
        docs.insert([{'document': pdf_file.name, 'voice': voice_choice, 'style': style_choice}])

        # One row per token-limited chunk of the document.
        chunks = pxt.create_view(
            'document_audio.chunks',
            docs,
            iterator=DocumentSplitter.create(
                document=docs.document,
                separators='token_limit',
                limit=chunk_size
            )
        )

        progress(0.4, desc="Text processing...")
        # Stage 1: condense each chunk into structured content.
        chunks['content_response'] = openai.chat_completions(
            messages=[
                {
                    'role': 'system',
                    'content': """Transform this text segment into clear, concise content.
                    Structure:
                    1. Core concepts and points
                    2. Supporting details
                    3. Key takeaways"""
                },
                {'role': 'user', 'content': chunks.text}
            ],
            model='gpt-4o-mini-2024-07-18',
            max_tokens=max_tokens,
            temperature=temperature
        )
        chunks['content'] = chunks.content_response['choices'][0]['message']['content']

        progress(0.6, desc="Script generation...")
        # Stage 2: turn the condensed content into a spoken-word script.
        # NOTE(review): the f-string below evaluates {docs.style} at column
        # definition time, embedding the column expression's repr rather
        # than the per-row style value — verify against Pixeltable docs
        # whether string.format should be used here instead.
        chunks['script_response'] = openai.chat_completions(
            messages=[
                {
                    'role': 'system',
                    'content': f"""Convert content to audio script.
                    Style: {docs.style}
                    Format:
                    - Clear sentence structures
                    - Natural pauses (...)
                    - Term definitions when needed
                    - Proper transitions"""
                },
                {'role': 'user', 'content': chunks.content}
            ],
            model='gpt-4o-mini-2024-07-18',
            max_tokens=max_tokens,
            temperature=temperature
        )
        chunks['script'] = chunks.script_response['choices'][0]['message']['content']

        progress(0.8, desc="Audio synthesis...")

        @pxt.udf(return_type=pxt.Audio)
        def generate_audio(script: str, voice: str):
            """Synthesize one script chunk to an mp3 via the OpenAI TTS API.

            Returns the temp-file path of the audio, or None on any failure
            (missing input, HTTP error, or request exception).
            """
            if not script or not voice:
                return None
            try:
                response = requests.post(
                    "https://api.openai.com/v1/audio/speech",
                    headers={"Authorization": f"Bearer {api_key}"},
                    json={"model": "tts-1", "input": script, "voice": voice}
                )
                if response.status_code == 200:
                    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3')
                    temp_file.write(response.content)
                    temp_file.close()
                    return temp_file.name
                # Previously a non-200 response was swallowed silently;
                # surface it so failed synthesis is diagnosable.
                print(f"Audio synthesis failed with HTTP {response.status_code}: {response.text}")
            except Exception as e:
                print(f"Error in audio synthesis: {e}")
            return None

        chunks['audio'] = generate_audio(chunks.script, docs.voice)

        # Single playable file: the audio of the last chunk.
        audio_path = chunks.select(chunks.audio).tail(1)['audio'][0]

        results = chunks.select(
            chunks.content,
            chunks.script
        ).collect()

        # Rows for the UI dataframe: [label, condensed content, script].
        display_data = [
            [f"Segment {idx + 1}", row['content'], row['script']]
            for idx, row in enumerate(results)
        ]

        progress(1.0, desc="Complete")
        return display_data, audio_path, "Processing complete"

    except Exception as e:
        # Top-level boundary: report the failure to the UI status box.
        return None, None, f"Error: {str(e)}"