Pendrokar committed on
Commit
8852839
•
1 Parent(s): 00b88cc

gradio client file

Browse files
Files changed (1) hide show
  1. gr_client.py +349 -0
gr_client.py ADDED
@@ -0,0 +1,349 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import copy
4
+ import time
5
+ import requests
6
+ import json
7
+ from huggingface_hub import hf_hub_download
8
+ import gradio as gr
9
+ from gradio_client import Client
10
+
11
# (label shown in the UI, voice model id sent to the backend) pairs offered
# by the "Voice" radio component.
voice_models = [
    ("Male #6671", "ccby_nvidia_hifi_6671_M"),
    ("Male #6670", "ccby_nvidia_hifi_6670_M"),
    ("Male #9017", "ccby_nvidia_hifi_9017_M"),
    ("Male #6097", "ccby_nvidia_hifi_6097_M"),
    ("Female #92", "ccby_nvidia_hifi_92_F"),
    ("Female #11697", "ccby_nvidia_hifi_11697_F"),
    ("Female #12787", "ccby_nvidia_hifi_12787_F"),
    ("Female #11614", "ccby_nv_hifi_11614_F"),
    ("Female #8051", "ccby_nvidia_hifi_8051_F"),
    ("Female #9136", "ccby_nvidia_hifi_9136_F"),
]
current_voice_model = None
24
+
25
# (label, language code) pairs for the "Language" radio component.
# Order is ranked by similarity to English, because xVASynth uses ARPAbet
# instead of IPA.
languages = [
    ("🇬🇧 EN", "en"),
    ("🇩🇪 DE", "de"),
    ("🇪🇸 ES", "es"),
    ("🇮🇹 IT", "it"),
    ("🇳🇱 NL", "nl"),
    ("🇵🇹 PT", "pt"),
    ("🇵🇱 PL", "pl"),
    ("🇷🇴 RO", "ro"),
    ("🇸🇪 SV", "sv"),
    ("🇩🇰 DA", "da"),
    ("🇫🇮 FI", "fi"),
    ("🇭🇺 HU", "hu"),
    ("🇬🇷 EL", "el"),
    ("🇫🇷 FR", "fr"),
    ("🇷🇺 RU", "ru"),
    ("🇺🇦 UK", "uk"),
    ("🇹🇷 TR", "tr"),
    ("🇸🇦 AR", "ar"),
    ("🇮🇳 HI", "hi"),
    ("🇯🇵 JP", "jp"),
    ("🇰🇷 KO", "ko"),
    ("🇨🇳 ZH", "zh"),
    ("🇻🇳 VI", "vi"),
    ("🇻🇦 LA", "la"),
    ("HA", "ha"),
    ("SW", "sw"),
    ("🇳🇬 YO", "yo"),
    ("WO", "wo"),
]
56
+
57
# Sample sentence per language code, shown in the input textbox when the
# language selection changes.
# Translated from English by DeepMind's Gemini Pro
default_text = {
    "ar": "هذا هو صوتي.",
    "da": "Sådan lyder min stemme.",
    "de": "So klingt meine Stimme.",
    "el": "Έτσι ακούγεται η φωνή μου.",
    "en": "This is what my voice sounds like.",
    "es": "Así suena mi voz.",
    "fi": "Näin ääneni kuulostaa.",
    "fr": "Voici à quoi ressemble ma voix.",
    "ha": "Wannan ne muryata ke.",
    "hi": "यह मेरी आवाज़ कैसी लगती है।",
    "hu": "Így hangzik a hangom.",
    "it": "Così suona la mia voce.",
    "jp": "これが私の声です。",
    "ko": "여기 제 목소리가 어떤지 들어보세요.",
    "la": "Haec est vox mea sonans.",
    "nl": "Dit is hoe mijn stem klinkt.",
    "pl": "Tak brzmi mój głos.",
    "pt": "É assim que minha voz soa.",
    "ro": "Așa sună vocea mea.",
    "ru": "Вот как звучит мой голос.",
    "sv": "Såhär låter min röst.",
    "sw": "Sauti yangu inasikika hivi.",
    "tr": "Benim sesimin sesi böyle.",
    "uk": "Ось як звучить мій голос.",
    "vi": "Đây là giọng nói của tôi.",
    "wo": "Ndox li neen xewnaal ma.",
    "yo": "Ìyí ni ohùn mi ńlá.",
    "zh": "这是我的声音。",
}
88
+
89
+
90
def predict(
    input_text,
    voice,
    lang,
    pacing,
    pitch,
    energy,
    anger,
    happy,
    sad,
    surprise,
    deepmoji_checked
):
    """Forward one synthesis request to the remote app and unpack its reply.

    All arguments are passed positionally, in order, to ``client.predict``
    with ``api_name="/predict"``. The call is expected to return a
    ``(wav_path, response)`` pair where ``response`` is a JSON string.

    Returns:
        A list ``[wav_path, arpabet, angry, happy, sad, surprise]``: the audio
        path as returned by the client, the ARPAbet transcription with
        ``|<PAD>|`` markers replaced by spaces, and the first element of each
        ``em_*`` array from the response rounded to two decimals.

    Raises:
        NameError: if the module-level ``client`` has not been assigned.
        KeyError / json.JSONDecodeError: if the response is not the expected
            JSON document.
    """
    # NOTE(review): `client` is a module global that this file only assigns
    # inside the `if __name__ == "__main__":` guard, so this function only
    # works when the file is run as a script.
    wav_path, response = client.predict(
        input_text,  # str in 'Input Text' Textbox component
        voice,  # Literal['ccby_nvidia_hifi_6670_M', 'ccby_nv_hifi_11614_F', 'ccby_nvidia_hifi_11697_F', 'ccby_nvidia_hifi_12787_F', 'ccby_nvidia_hifi_6097_M', 'ccby_nvidia_hifi_6671_M', 'ccby_nvidia_hifi_8051_F', 'ccby_nvidia_hifi_9017_M', 'ccby_nvidia_hifi_9136_F', 'ccby_nvidia_hifi_92_F'] in 'Voice' Radio component
        lang,  # Literal['en', 'de', 'es', 'it', 'fr', 'ru', 'tr', 'la', 'ro', 'da', 'vi', 'ha', 'nl', 'zh', 'ar', 'uk', 'hi', 'ko', 'pl', 'sw', 'fi', 'hu', 'pt', 'yo', 'sv', 'el', 'wo', 'jp'] in 'Language' Radio component
        pacing,  # float (numeric value between 0.5 and 2.0) in 'Duration' Slider component
        pitch,  # float (numeric value between 0 and 1.0) in 'Pitch' Slider component
        energy,  # float (numeric value between 0.1 and 1.0) in 'Energy' Slider component
        anger,  # float (numeric value between 0 and 1.0) in '😠 Anger' Slider component
        happy,  # float (numeric value between 0 and 1.0) in '😃 Happiness' Slider component
        sad,  # float (numeric value between 0 and 1.0) in '😭 Sadness' Slider component
        surprise,  # float (numeric value between 0 and 1.0) in '😮 Surprise' Slider component
        deepmoji_checked,  # bool
        api_name="/predict"
    )
    json_data = json.loads(response)
    # The backend separates phoneme groups with a literal '|<PAD>|' token.
    arpabet = json_data['arpabet'].replace('|<PAD>|', ' ')

    return [
        wav_path,
        arpabet,
        round(json_data['em_angry'][0], 2),
        round(json_data['em_happy'][0], 2),
        round(json_data['em_sad'][0], 2),
        round(json_data['em_surprise'][0], 2)
    ]
128
+
129
# Module-level component instances.
# NOTE(review): every name assigned here (input_textbox, the sliders and
# voice_radio) is assigned again inside the `with gr.Blocks() as demo:`
# section of this file, so these instances appear to be unused leftovers;
# confirm before deleting them.
input_textbox = gr.Textbox(
    label="Input Text",
    value="This is what my voice sounds like.",
    info="Also accepts ARPAbet symbols placed within {} brackets.",
    lines=1,
    max_lines=5,
    autofocus=True
)
pacing_slider = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Duration")
pitch_slider = gr.Slider(0, 1.0, value=0.5, step=0.05, label="Pitch", visible=False)
energy_slider = gr.Slider(0.1, 1.0, value=1.0, step=0.05, label="Energy", visible=False)
anger_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😠 Anger", info="Tread lightly beyond 0.9")
happy_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😃 Happiness", info="Tread lightly beyond 0.7")
sad_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😭 Sadness", info="Duration increased when beyond 0.2")
surprise_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😮 Surprise", info="Does not play well with Happiness with either being beyond 0.3")
voice_radio = gr.Radio(
    voice_models,
    value="ccby_nvidia_hifi_6671_M",
    label="Voice",
    info="NVIDIA HIFI CC-BY-4.0 xVAPitch voice model"
)
150
+
151
def set_default_text(lang, deepmoji_checked):
    """Return the sample sentence for *lang* and a matching DeepMoji checkbox.

    Args:
        lang: language code; must be a key of ``default_text``.
        deepmoji_checked: current value of the DeepMoji checkbox.

    Returns:
        A ``(text, checkbox)`` tuple. For ``'en'`` the checkbox keeps the
        passed-in value and stays interactive; for any other language it is
        unchecked and non-interactive.

    Raises:
        KeyError: if *lang* is not present in ``default_text``.
    """
    # DeepMoji only works on English text, so the checkbox is locked off for
    # every other language.
    if lang == 'en':
        checkbox_enabled = gr.Checkbox(
            label="Use DeepMoji",
            info="Auto adjust emotional values",
            value=deepmoji_checked,
            interactive=True
        )
    else:
        checkbox_enabled = gr.Checkbox(
            label="Use DeepMoji",
            info="Works only with English!",
            value=False,
            interactive=False
        )

    return default_text[lang], checkbox_enabled
173
+
174
def reset_em_sliders(
    deepmoji_enabled,
    anger,
    happy,
    sad,
    surprise
):
    """Return the emotion slider values, zeroed when DeepMoji is enabled.

    Args:
        deepmoji_enabled: truthy when the DeepMoji checkbox is ticked.
        anger, happy, sad, surprise: current slider values.

    Returns:
        ``(0, 0, 0, 0)`` when *deepmoji_enabled* is truthy, otherwise the
        four values unchanged as ``(anger, happy, sad, surprise)``.
    """
    if deepmoji_enabled:
        return (0, 0, 0, 0)
    return (anger, happy, sad, surprise)
190
+
191
def toggle_deepmoji(
    checked,
    anger,
    happy,
    sad,
    surprise
):
    """Return the emotion slider values after the DeepMoji checkbox changes.

    Args:
        checked: new state of the DeepMoji checkbox.
        anger, happy, sad, surprise: current slider values.

    Returns:
        ``(0, 0, 0, 0)`` when *checked* is truthy, otherwise the four values
        unchanged as ``(anger, happy, sad, surprise)``.
    """
    if checked:
        return (0, 0, 0, 0)
    return (anger, happy, sad, surprise)
207
+
208
# NOTE(review): `language_radio` is assigned again inside the
# `with gr.Blocks() as demo:` section of this file, so this module-level
# instance appears to be an unused leftover; confirm before deleting it.
language_radio = gr.Radio(
    languages,
    value="en",
    label="Language",
    info="Will be more monotone and have an English accent. Tested mostly by a native Briton."
)
214
+
215
# UI layout and event wiring.
# NOTE(review): the nesting below was reconstructed from the context-manager
# statements and their comments because the original indentation was lost;
# confirm the row/column layout against the rendered app.
with gr.Blocks() as demo:
    gr.Markdown("# xVASynth TTS")

    with gr.Row():  # Main row: text/language inputs next to voice controls
        with gr.Column():  # Input column
            input_textbox = gr.Textbox(
                label="Input Text",
                value="This is what my voice sounds like.",
                info="Also accepts ARPAbet symbols placed within {} brackets.",
                lines=1,
                max_lines=5,
                autofocus=True
            )
            language_radio = gr.Radio(
                languages,
                value="en",
                label="Language",
                info="Will be more monotone and have an English accent. Tested mostly by a native Briton."
            )
            pacing_slider = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Duration")
        with gr.Column():  # Control column
            voice_radio = gr.Radio(
                voice_models,
                value="ccby_nvidia_hifi_6671_M",
                label="Voice",
                info="NVIDIA HIFI CC-BY-4.0 xVAPitch voice model"
            )
            # Pitch and energy are still sent to the backend but hidden in the UI.
            pitch_slider = gr.Slider(0, 1.0, value=0.5, step=0.05, label="Pitch", visible=False)
            energy_slider = gr.Slider(0.1, 1.0, value=1.0, step=0.05, label="Energy", visible=False)
            with gr.Row():  # Emotion sliders, two per column
                with gr.Column():
                    anger_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😠 Anger", info="Tread lightly beyond 0.9")
                    sad_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😭 Sadness", info="Duration increased when beyond 0.2")
                with gr.Column():
                    happy_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😃 Happiness", info="Tread lightly beyond 0.7")
                    surprise_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😮 Surprise", info="Can oversaturate Happiness")
            deepmoji_checkbox = gr.Checkbox(label="Use DeepMoji", info="Auto adjust emotional values")

    # Event handling using click
    btn = gr.Button("Generate")

    with gr.Row():  # Output row: audio next to the ARPAbet transcription
        with gr.Column():
            output_wav = gr.Audio(label="22kHz audio output", type="filepath", editable=False)
        with gr.Column():
            output_arpabet = gr.Textbox(label="ARPAbet", interactive=False)

    # Generate: the order of `inputs` must match predict()'s parameter order,
    # and `outputs` must match the six-element list it returns.
    btn.click(
        fn=predict,
        inputs=[
            input_textbox,
            voice_radio,
            language_radio,
            pacing_slider,
            pitch_slider,
            energy_slider,
            anger_slider,
            happy_slider,
            sad_slider,
            surprise_slider,
            deepmoji_checkbox
        ],
        outputs=[
            output_wav,
            output_arpabet,
            anger_slider,
            happy_slider,
            sad_slider,
            surprise_slider
        ]
    )

    # Changing the language swaps in that language's sample sentence and
    # updates the DeepMoji checkbox.
    language_radio.change(
        set_default_text,
        inputs=[language_radio, deepmoji_checkbox],
        outputs=[input_textbox, deepmoji_checkbox]
    )

    # Ticking DeepMoji zeroes the emotion sliders.
    deepmoji_checkbox.change(
        toggle_deepmoji,
        inputs=[
            deepmoji_checkbox,
            anger_slider,
            happy_slider,
            sad_slider,
            surprise_slider
        ],
        outputs=[
            anger_slider,
            happy_slider,
            sad_slider,
            surprise_slider
        ]
    )

    # New text or a new voice zeroes the emotion sliders while DeepMoji is on.
    input_textbox.change(
        reset_em_sliders,
        inputs=[
            deepmoji_checkbox,
            anger_slider,
            happy_slider,
            sad_slider,
            surprise_slider
        ],
        outputs=[
            anger_slider,
            happy_slider,
            sad_slider,
            surprise_slider
        ]
    )

    voice_radio.change(
        reset_em_sliders,
        inputs=[
            deepmoji_checkbox,
            anger_slider,
            happy_slider,
            sad_slider,
            surprise_slider
        ],
        outputs=[
            anger_slider,
            happy_slider,
            sad_slider,
            surprise_slider
        ]
    )
343
+
344
if __name__ == "__main__":
    print('running Gradio interface')
    # predict() reads this module-level `client`, so it has to be bound
    # before the interface starts serving requests.
    client = Client("Pendrokar/xVASynth")

    demo.launch()