edwko committed on
Commit
bf642c7
·
verified ·
1 Parent(s): 80a96cc

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +160 -0
app.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ import outetts
4
+ from outetts.version.v1.interface import _DEFAULT_SPEAKERS
5
+
6
# Model configuration: the OuteTTS-0.2-500M checkpoint, starting in English.
model_config = outetts.HFModelConfig_v1(model_path="OuteAI/OuteTTS-0.2-500M", language="en")

# Single module-level interface instance; the callbacks defined in this file
# read and mutate it (language switching, speaker loading, generation).
interface = outetts.InterfaceHF(model_version="0.2", cfg=model_config)
11
+
12
def get_available_speakers(language):
    """Return the speaker choices offered for ``language``.

    The result is an empty list when ``language`` is not among
    ``interface.languages``. Otherwise it is the entry "None" followed by
    the keys of ``_DEFAULT_SPEAKERS[language]``.
    """
    if language in interface.languages:
        # "None" always comes first so the user can opt out of a fixed speaker.
        return ["None", *_DEFAULT_SPEAKERS[language]]
    return []
19
+
20
def change_interface_language(language):
    """Switch the TTS interface to ``language`` and refresh the speaker list.

    Returns:
        A pair of ``gr.update`` objects. The first carries the new speaker
        choices and the selected value; the second carries a visibility
        flag. When ``ValueError`` is raised while switching, the choices are
        reset to just "None" and the visibility flag is ``False``.
    """
    try:
        interface.change_language(language)
        speakers = get_available_speakers(language)
    except ValueError:
        return gr.update(choices=["None"], value="None"), gr.update(visible=False)

    # Prefer "male_1", but never select a value that is absent from the
    # choices: fall back to the first real speaker, then to whatever is
    # listed, then to no selection at all.
    if "male_1" in speakers:
        selected = "male_1"
    elif speakers:
        selected = next((name for name in speakers if name != "None"), speakers[0])
    else:
        selected = None

    return gr.update(choices=speakers, value=selected), gr.update(visible=True)
28
+
29
def generate_tts(
    text, temperature, repetition_penalty, language,
    speaker_selection, reference_audio, reference_text
):
    """Synthesize ``text`` and return ``(audio_path, error_message)``.

    Speaker resolution, in priority order:
      1. ``reference_audio`` given: clone a speaker from it. The file must
         exist and ``reference_text`` must be a non-blank transcription.
      2. ``speaker_selection`` other than "None": load that default speaker.
      3. Otherwise no speaker is passed to the model.

    Args:
        text: Text to synthesize; must contain non-whitespace characters.
        temperature: Sampling temperature forwarded to ``interface.generate``.
        repetition_penalty: Repetition penalty forwarded to
            ``interface.generate``.
        language: Not used here; the interface language is switched by
            ``change_interface_language``.
        speaker_selection: Name of a default speaker, or "None".
        reference_audio: File path of the reference recording, or a falsy
            value when no recording was supplied.
        reference_text: Transcription of ``reference_audio``.

    Returns:
        ``("output.wav", None)`` on success, or ``(None, message)`` when any
        step raises. Exceptions are never propagated to the caller.
    """
    try:
        # Reject blank input up front instead of handing it to the model.
        if not text or not text.strip():
            raise ValueError("Text to synthesize is required")

        if reference_audio:
            # A reference recording was supplied: validate it fully rather
            # than silently ignoring it when the transcription is missing.
            if not os.path.exists(reference_audio):
                raise ValueError("Reference audio file not found")
            if not reference_text or not reference_text.strip():
                raise ValueError("Reference transcription text is required")
            speaker = interface.create_speaker(reference_audio, reference_text)
        elif speaker_selection and speaker_selection != "None":
            speaker = interface.load_default_speaker(speaker_selection)
        else:
            # No speaker: the model is called with speaker=None.
            speaker = None

        output = interface.generate(
            text=text,
            speaker=speaker,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
            max_length=4096,
        )

        if output.audio is None:
            raise ValueError("Model failed to generate audio. This may be due to input length constraints or early EOS token.")

        # Relative path: the file is written to the current working directory.
        output_path = "output.wav"
        output.save(output_path)
        return output_path, None

    except Exception as e:
        # UI boundary: report the failure as text for the error box.
        return None, str(e)
71
+
72
# Build the demo page: input controls in the first column, the generated
# audio player in the second.
with gr.Blocks() as demo:
    gr.Markdown("# OuteTTS-0.2-500M Text-to-Speech Demo")

    # Created hidden; its visibility is toggled after each generation attempt.
    error_box = gr.Textbox(label="Error Messages", visible=False)

    with gr.Row():
        with gr.Column():
            # --- Language and speaker ---
            language_dropdown = gr.Dropdown(
                choices=list(interface.languages),
                value="en",
                label="Interface Language",
            )
            speaker_dropdown = gr.Dropdown(
                choices=get_available_speakers("en"),
                value="male_1",
                label="Speaker Selection",
            )

            # --- Text and sampling controls ---
            text_input = gr.Textbox(
                label="Text to Synthesize",
                placeholder="Enter text here...",
            )
            temperature = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.1,
                label="Temperature (lower = more stable tone, higher = more expressive)",
            )
            repetition_penalty = gr.Slider(
                minimum=0.5,
                maximum=2.0,
                value=1.1,
                label="Repetition Penalty",
            )

            # --- Voice cloning inputs ---
            gr.Markdown("""
                ### Voice Cloning Guidelines:
                - Use 10-15 seconds of clear, noise-free audio
                - Provide accurate transcription
                - Longer audio clips will reduce maximum output length
                - Custom speaker overrides speaker selection
                """)
            reference_audio = gr.Audio(
                label="Reference Audio (for voice cloning)",
                type="filepath",
            )
            reference_text = gr.Textbox(
                label="Reference Transcription Text",
                placeholder="Enter exact transcription of reference audio",
            )

            submit_button = gr.Button("Generate Speech")

        with gr.Column():
            audio_output = gr.Audio(label="Generated Audio", type="filepath")

    # Changing the language refreshes the speaker dropdown. Both returned
    # updates are routed to the same component.
    language_dropdown.change(
        fn=change_interface_language,
        inputs=[language_dropdown],
        outputs=[speaker_dropdown, speaker_dropdown],
    )

    # Generate first, then show the error box only if a message was produced.
    tts_inputs = [
        text_input, temperature, repetition_penalty, language_dropdown,
        speaker_dropdown, reference_audio, reference_text,
    ]
    submit_button.click(
        fn=generate_tts,
        inputs=tts_inputs,
        outputs=[audio_output, error_box],
    ).then(
        fn=lambda x: gr.update(visible=bool(x)),
        inputs=[error_box],
        outputs=[error_box],
    )

demo.launch()