hysts (HF staff) committed
Commit: 748b9ed
Parent: b691933

Migrate to gradio 4.x

Files changed (3)
  1. app.py +258 -337
  2. requirements.txt +1 -1
  3. style.css +1 -1
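
Most of the app.py change tracks two Gradio 4.x API differences: `gr.Audio` no longer takes a single `source=` string (4.x uses a `sources=` list, and the migrated app simply drops the microphone/file toggle and relies on the default), and `gr.on()` can bind one handler to several triggers such as `Textbox.submit` and `Button.click`. A minimal sketch of both patterns outside this app (component names and the echo function below are illustrative, not from the commit):

import gradio as gr

def echo(text: str) -> str:
    return text

with gr.Blocks() as sketch:
    # Gradio 3.x used gr.Audio(source="microphone") / gr.Audio(source="upload");
    # Gradio 4.x replaces the single `source` string with a `sources` list.
    audio = gr.Audio(type="filepath", sources=["upload", "microphone"])

    text = gr.Textbox(label="Input")
    btn = gr.Button("Run")
    out = gr.Textbox(label="Output")

    # Gradio 4.x: one handler bound to several triggers via gr.on().
    gr.on(
        triggers=[text.submit, btn.click],
        fn=echo,
        inputs=text,
        outputs=out,
    )

if __name__ == "__main__":
    sketch.launch()
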
app.py CHANGED
@@ -1,6 +1,7 @@
  from __future__ import annotations

  import os

  import gradio as gr
  import numpy as np
@@ -17,26 +18,20 @@ from lang_list import (
      TEXT_SOURCE_LANGUAGE_NAMES,
  )

- snapshot_download(repo_id="meta-private/M4Tv2", repo_type="model", local_dir="models")

- DESCRIPTION = """# SeamlessM4T

  [SeamlessM4T](https://github.com/facebookresearch/seamless_communication) is designed to provide high-quality
  translation, allowing people from different linguistic communities to communicate effortlessly through speech and text.
-
  This unified model enables multiple tasks like Speech-to-Speech (S2ST), Speech-to-Text (S2TT), Text-to-Speech (T2ST)
  translation and more, without relying on multiple separate models.
  """

  CACHE_EXAMPLES = os.getenv("CACHE_EXAMPLES") == "1" and torch.cuda.is_available()

- TASK_NAMES = [
-     "S2ST (Speech to Speech translation)",
-     "S2TT (Speech to Text translation)",
-     "T2ST (Text to Speech translation)",
-     "T2TT (Text to Text translation)",
-     "ASR (Automatic Speech Recognition)",
- ]
  AUDIO_SAMPLE_RATE = 16000.0
  MAX_INPUT_AUDIO_LENGTH = 60  # in seconds
  DEFAULT_TARGET_LANGUAGE = "French"
@@ -55,388 +50,314 @@ translator = Translator(
  )


- def predict(
-     task_name: str,
-     audio_source: str,
-     input_audio_mic: str | None,
-     input_audio_file: str | None,
-     input_text: str | None,
-     source_language: str | None,
-     target_language: str,
- ) -> tuple[tuple[int, np.ndarray] | None, str]:
-     task_name = task_name.split()[0]
-     source_language_code = LANGUAGE_NAME_TO_CODE[source_language] if source_language else None
-     target_language_code = LANGUAGE_NAME_TO_CODE[target_language]

-     if task_name in ["S2ST", "S2TT", "ASR"]:
-         if audio_source == "microphone":
-             input_data = input_audio_mic
-         else:
-             input_data = input_audio_file
-
-         arr, org_sr = torchaudio.load(input_data)
-         new_arr = torchaudio.functional.resample(arr, orig_freq=org_sr, new_freq=AUDIO_SAMPLE_RATE)
-         max_length = int(MAX_INPUT_AUDIO_LENGTH * AUDIO_SAMPLE_RATE)
-         if new_arr.shape[1] > max_length:
-             new_arr = new_arr[:, :max_length]
-             gr.Warning(f"Input audio is too long. Only the first {MAX_INPUT_AUDIO_LENGTH} seconds is used.")
-         torchaudio.save(input_data, new_arr, sample_rate=int(AUDIO_SAMPLE_RATE))
-     else:
-         input_data = input_text
      out_texts, out_audios = translator.predict(
-         input=input_data,
-         task_str=task_name,
          tgt_lang=target_language_code,
-         src_lang=source_language_code,
      )
      out_text = str(out_texts[0])
-
-     if task_name in ["S2ST", "T2ST"]:
-         out_wav = out_audios.audio_wavs[0]
-         return (int(AUDIO_SAMPLE_RATE), out_wav.cpu().detach().numpy()), out_text
-     else:
-         return None, out_text
-
-
- def process_s2st_example(input_audio_file: str, target_language: str) -> tuple[tuple[int, np.ndarray] | None, str]:
-     return predict(
-         task_name="S2ST",
-         audio_source="file",
-         input_audio_mic=None,
-         input_audio_file=input_audio_file,
-         input_text=None,
-         source_language=None,
-         target_language=target_language,
-     )


- def process_s2tt_example(input_audio_file: str, target_language: str) -> tuple[tuple[int, np.ndarray] | None, str]:
-     return predict(
-         task_name="S2TT",
-         audio_source="file",
-         input_audio_mic=None,
-         input_audio_file=input_audio_file,
-         input_text=None,
-         source_language=None,
-         target_language=target_language,
-     )
-
-
- def process_t2st_example(
-     input_text: str, source_language: str, target_language: str
- ) -> tuple[tuple[int, np.ndarray] | None, str]:
-     return predict(
-         task_name="T2ST",
-         audio_source="",
-         input_audio_mic=None,
-         input_audio_file=None,
-         input_text=input_text,
-         source_language=source_language,
-         target_language=target_language,
      )


- def process_t2tt_example(
-     input_text: str, source_language: str, target_language: str
- ) -> tuple[tuple[int, np.ndarray] | None, str]:
-     return predict(
-         task_name="T2TT",
-         audio_source="",
-         input_audio_mic=None,
-         input_audio_file=None,
-         input_text=input_text,
-         source_language=source_language,
-         target_language=target_language,
      )


- def process_asr_example(input_audio_file: str, target_language: str) -> tuple[tuple[int, np.ndarray] | None, str]:
-     return predict(
-         task_name="ASR",
-         audio_source="file",
-         input_audio_mic=None,
-         input_audio_file=input_audio_file,
-         input_text=None,
-         source_language=None,
-         target_language=target_language,
      )


- def update_audio_ui(audio_source: str) -> tuple[dict, dict]:
-     mic = audio_source == "microphone"
-     return (
-         gr.update(visible=mic, value=None),  # input_audio_mic
-         gr.update(visible=not mic, value=None),  # input_audio_file
      )


- def update_input_ui(task_name: str) -> tuple[dict, dict, dict, dict]:
-     task_name = task_name.split()[0]
-     if task_name == "S2ST":
-         return (
-             gr.update(visible=True),  # audio_box
-             gr.update(visible=False),  # input_text
-             gr.update(visible=False),  # source_language
-             gr.update(
-                 visible=True, choices=S2ST_TARGET_LANGUAGE_NAMES, value=DEFAULT_TARGET_LANGUAGE
-             ),  # target_language
-         )
-     elif task_name == "S2TT":
-         return (
-             gr.update(visible=True),  # audio_box
-             gr.update(visible=False),  # input_text
-             gr.update(visible=False),  # source_language
-             gr.update(
-                 visible=True, choices=S2TT_TARGET_LANGUAGE_NAMES, value=DEFAULT_TARGET_LANGUAGE
-             ),  # target_language
-         )
-     elif task_name == "T2ST":
-         return (
-             gr.update(visible=False),  # audio_box
-             gr.update(visible=True),  # input_text
-             gr.update(visible=True),  # source_language
-             gr.update(
-                 visible=True, choices=S2ST_TARGET_LANGUAGE_NAMES, value=DEFAULT_TARGET_LANGUAGE
-             ),  # target_language
-         )
-     elif task_name == "T2TT":
-         return (
-             gr.update(visible=False),  # audio_box
-             gr.update(visible=True),  # input_text
-             gr.update(visible=True),  # source_language
-             gr.update(
-                 visible=True, choices=T2TT_TARGET_LANGUAGE_NAMES, value=DEFAULT_TARGET_LANGUAGE
-             ),  # target_language
          )
-     elif task_name == "ASR":
-         return (
-             gr.update(visible=True),  # audio_box
-             gr.update(visible=False),  # input_text
-             gr.update(visible=False),  # source_language
-             gr.update(
-                 visible=True, choices=S2TT_TARGET_LANGUAGE_NAMES, value=DEFAULT_TARGET_LANGUAGE
-             ),  # target_language
          )
-     else:
-         raise ValueError(f"Unknown task: {task_name}")


- def update_output_ui(task_name: str) -> tuple[dict, dict]:
-     task_name = task_name.split()[0]
-     if task_name in ["S2ST", "T2ST"]:
-         return (
-             gr.update(visible=True, value=None),  # output_audio
-             gr.update(value=None),  # output_text
-         )
-     elif task_name in ["S2TT", "T2TT", "ASR"]:
-         return (
-             gr.update(visible=False, value=None),  # output_audio
-             gr.update(value=None),  # output_text
          )
-     else:
-         raise ValueError(f"Unknown task: {task_name}")
-
-
- def update_example_ui(task_name: str) -> tuple[dict, dict, dict, dict, dict]:
-     task_name = task_name.split()[0]
-     return (
-         gr.update(visible=task_name == "S2ST"),  # s2st_example_row
-         gr.update(visible=task_name == "S2TT"),  # s2tt_example_row
-         gr.update(visible=task_name == "T2ST"),  # t2st_example_row
-         gr.update(visible=task_name == "T2TT"),  # t2tt_example_row
-         gr.update(visible=task_name == "ASR"),  # asr_example_row
      )

-
- with gr.Blocks(css="style.css") as demo:
-     gr.Markdown(DESCRIPTION)
-     gr.DuplicateButton(
-         value="Duplicate Space for private use",
-         elem_id="duplicate-button",
-         visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1",
      )
      with gr.Group():
-         task_name = gr.Dropdown(
-             label="Task",
-             choices=TASK_NAMES,
-             value=TASK_NAMES[0],
-         )
          with gr.Row():
              source_language = gr.Dropdown(
                  label="Source language",
                  choices=TEXT_SOURCE_LANGUAGE_NAMES,
                  value="English",
-                 visible=False,
              )
              target_language = gr.Dropdown(
                  label="Target language",
-                 choices=S2ST_TARGET_LANGUAGE_NAMES,
                  value=DEFAULT_TARGET_LANGUAGE,
              )
-         with gr.Row() as audio_box:
-             audio_source = gr.Radio(
-                 label="Audio source",
-                 choices=["file", "microphone"],
-                 value="file",
-             )
-             input_audio_mic = gr.Audio(
-                 label="Input speech",
-                 type="filepath",
-                 source="microphone",
-                 visible=False,
-             )
-             input_audio_file = gr.Audio(
-                 label="Input speech",
-                 type="filepath",
-                 source="upload",
-                 visible=True,
-             )
-         input_text = gr.Textbox(label="Input text", visible=False)
          btn = gr.Button("Translate")
-     with gr.Column():
-         output_audio = gr.Audio(
-             label="Translated speech",
-             autoplay=False,
-             streaming=False,
-             type="numpy",
-         )
-         output_text = gr.Textbox(label="Translated text")
-
-     with gr.Row(visible=True) as s2st_example_row:
-         s2st_examples = gr.Examples(
-             examples=[
-                 ["assets/sample_input.mp3", "French"],
-                 ["assets/sample_input.mp3", "Mandarin Chinese"],
-                 ["assets/sample_input_2.mp3", "Hindi"],
-                 ["assets/sample_input_2.mp3", "Spanish"],
-             ],
-             inputs=[input_audio_file, target_language],
-             outputs=[output_audio, output_text],
-             fn=process_s2st_example,
-             cache_examples=CACHE_EXAMPLES,
          )
-     with gr.Row(visible=False) as s2tt_example_row:
-         s2tt_examples = gr.Examples(
-             examples=[
-                 ["assets/sample_input.mp3", "French"],
-                 ["assets/sample_input.mp3", "Mandarin Chinese"],
-                 ["assets/sample_input_2.mp3", "Hindi"],
-                 ["assets/sample_input_2.mp3", "Spanish"],
              ],
-             inputs=[input_audio_file, target_language],
-             outputs=[output_audio, output_text],
-             fn=process_s2tt_example,
-             cache_examples=CACHE_EXAMPLES,
-         )
-     with gr.Row(visible=False) as t2st_example_row:
-         t2st_examples = gr.Examples(
-             examples=[
-                 ["My favorite animal is the elephant.", "English", "French"],
-                 ["My favorite animal is the elephant.", "English", "Mandarin Chinese"],
-                 [
-                     "Meta AI's Seamless M4T model is democratising spoken communication across language barriers",
-                     "English",
-                     "Hindi",
-                 ],
-                 [
-                     "Meta AI's Seamless M4T model is democratising spoken communication across language barriers",
-                     "English",
-                     "Spanish",
-                 ],
              ],
-             inputs=[input_text, source_language, target_language],
-             outputs=[output_audio, output_text],
-             fn=process_t2st_example,
-             cache_examples=CACHE_EXAMPLES,
-         )
-     with gr.Row(visible=False) as t2tt_example_row:
-         t2tt_examples = gr.Examples(
-             examples=[
-                 ["My favorite animal is the elephant.", "English", "French"],
-                 ["My favorite animal is the elephant.", "English", "Mandarin Chinese"],
-                 [
-                     "Meta AI's Seamless M4T model is democratising spoken communication across language barriers",
-                     "English",
-                     "Hindi",
-                 ],
-                 [
-                     "Meta AI's Seamless M4T model is democratising spoken communication across language barriers",
-                     "English",
-                     "Spanish",
-                 ],
              ],
-             inputs=[input_text, source_language, target_language],
-             outputs=[output_audio, output_text],
-             fn=process_t2tt_example,
-             cache_examples=CACHE_EXAMPLES,
-         )
-     with gr.Row(visible=False) as asr_example_row:
-         asr_examples = gr.Examples(
-             examples=[
-                 ["assets/sample_input.mp3", "English"],
-                 ["assets/sample_input_2.mp3", "English"],
              ],
-             inputs=[input_audio_file, target_language],
-             outputs=[output_audio, output_text],
-             fn=process_asr_example,
-             cache_examples=CACHE_EXAMPLES,
-         )
-
-     audio_source.change(
-         fn=update_audio_ui,
-         inputs=audio_source,
-         outputs=[
-             input_audio_mic,
-             input_audio_file,
          ],
-         queue=False,
          api_name=False,
      )
-     task_name.change(
-         fn=update_input_ui,
-         inputs=task_name,
-         outputs=[
-             audio_box,
-             input_text,
-             source_language,
-             target_language,
-         ],
-         queue=False,
-         api_name=False,
-     ).then(
-         fn=update_output_ui,
-         inputs=task_name,
          outputs=[output_audio, output_text],
-         queue=False,
          api_name=False,
-     ).then(
-         fn=update_example_ui,
-         inputs=task_name,
-         outputs=[
-             s2st_example_row,
-             s2tt_example_row,
-             t2st_example_row,
-             t2tt_example_row,
-             asr_example_row,
          ],
-         queue=False,
          api_name=False,
      )

      btn.click(
-         fn=predict,
-         inputs=[
-             task_name,
-             audio_source,
-             input_audio_mic,
-             input_audio_file,
-             input_text,
-             source_language,
-             target_language,
-         ],
-         outputs=[output_audio, output_text],
-         api_name="run",
      )

  if __name__ == "__main__":
      demo.queue(max_size=50).launch()
@@ -1,6 +1,7 @@
  from __future__ import annotations

  import os
+ import pathlib

  import gradio as gr
  import numpy as np
@@ -17,26 +18,20 @@ from lang_list import (
      TEXT_SOURCE_LANGUAGE_NAMES,
  )

+ if not pathlib.Path("models").exists():
+     snapshot_download(repo_id="meta-private/M4Tv2", repo_type="model", local_dir="models")

+ DESCRIPTION = """\
+ # SeamlessM4T

  [SeamlessM4T](https://github.com/facebookresearch/seamless_communication) is designed to provide high-quality
  translation, allowing people from different linguistic communities to communicate effortlessly through speech and text.
  This unified model enables multiple tasks like Speech-to-Speech (S2ST), Speech-to-Text (S2TT), Text-to-Speech (T2ST)
  translation and more, without relying on multiple separate models.
  """

  CACHE_EXAMPLES = os.getenv("CACHE_EXAMPLES") == "1" and torch.cuda.is_available()

  AUDIO_SAMPLE_RATE = 16000.0
  MAX_INPUT_AUDIO_LENGTH = 60  # in seconds
  DEFAULT_TARGET_LANGUAGE = "French"
@@ -55,388 +50,314 @@ translator = Translator(
  )


+ def preprocess_audio(input_audio: str) -> None:
+     arr, org_sr = torchaudio.load(input_audio)
+     new_arr = torchaudio.functional.resample(arr, orig_freq=org_sr, new_freq=AUDIO_SAMPLE_RATE)
+     max_length = int(MAX_INPUT_AUDIO_LENGTH * AUDIO_SAMPLE_RATE)
+     if new_arr.shape[1] > max_length:
+         new_arr = new_arr[:, :max_length]
+         gr.Warning(f"Input audio is too long. Only the first {MAX_INPUT_AUDIO_LENGTH} seconds is used.")
+     torchaudio.save(input_audio, new_arr, sample_rate=int(AUDIO_SAMPLE_RATE))

+
+ def run_s2st(input_audio: str, target_language: str) -> tuple[tuple[int, np.ndarray] | None, str]:
+     preprocess_audio(input_audio)
+     target_language_code = LANGUAGE_NAME_TO_CODE[target_language]
      out_texts, out_audios = translator.predict(
+         input=input_audio,
+         task_str="S2ST",
          tgt_lang=target_language_code,
      )
      out_text = str(out_texts[0])
+     out_wav = out_audios.audio_wavs[0].cpu().detach().numpy()
+     return (int(AUDIO_SAMPLE_RATE), out_wav), out_text


+ def run_s2tt(input_audio: str, target_language: str) -> str:
+     preprocess_audio(input_audio)
+     target_language_code = LANGUAGE_NAME_TO_CODE[target_language]
+     out_texts, _ = translator.predict(
+         input=input_audio,
+         task_str="S2TT",
+         tgt_lang=target_language_code,
      )
+     return str(out_texts[0])


+ def run_t2st(input_text: str, source_language: str, target_language: str) -> tuple[tuple[int, np.ndarray] | None, str]:
+     source_language_code = LANGUAGE_NAME_TO_CODE[source_language]
+     target_language_code = LANGUAGE_NAME_TO_CODE[target_language]
+     out_texts, out_audios = translator.predict(
+         input=input_text,
+         task_str="T2ST",
+         tgt_lang=target_language_code,
+         src_lang=source_language_code,
      )
+     out_text = str(out_texts[0])
+     out_wav = out_audios.audio_wavs[0].cpu().detach().numpy()
+     return (int(AUDIO_SAMPLE_RATE), out_wav), out_text


+ def run_t2tt(input_text: str, source_language: str, target_language: str) -> str:
+     source_language_code = LANGUAGE_NAME_TO_CODE[source_language]
+     target_language_code = LANGUAGE_NAME_TO_CODE[target_language]
+     out_texts, _ = translator.predict(
+         input=input_text,
+         task_str="T2TT",
+         tgt_lang=target_language_code,
+         src_lang=source_language_code,
      )
+     return str(out_texts[0])


+ def run_asr(input_audio: str, target_language: str) -> str:
+     preprocess_audio(input_audio)
+     target_language_code = LANGUAGE_NAME_TO_CODE[target_language]
+     out_texts, _ = translator.predict(
+         input=input_audio,
+         task_str="ASR",
+         tgt_lang=target_language_code,
      )
+     return str(out_texts[0])


+ with gr.Blocks() as demo_s2st:
+     with gr.Group():
+         target_language = gr.Dropdown(
+             label="Target language",
+             choices=S2ST_TARGET_LANGUAGE_NAMES,
+             value=DEFAULT_TARGET_LANGUAGE,
          )
+         input_audio = gr.Audio(label="Input speech", type="filepath")
+         btn = gr.Button("Translate")
+         output_audio = gr.Audio(
+             label="Translated speech",
+             autoplay=False,
+             streaming=False,
+             type="numpy",
          )
+         output_text = gr.Textbox(label="Translated text")
+
+     gr.Examples(
+         examples=[
+             ["assets/sample_input.mp3", "French"],
+             ["assets/sample_input.mp3", "Mandarin Chinese"],
+             ["assets/sample_input_2.mp3", "Hindi"],
+             ["assets/sample_input_2.mp3", "Spanish"],
+         ],
+         inputs=[input_audio, target_language],
+         outputs=[output_audio, output_text],
+         fn=run_s2st,
+         cache_examples=CACHE_EXAMPLES,
+         api_name=False,
+     )

+     btn.click(
+         fn=run_s2st,
+         inputs=[input_audio, target_language],
+         outputs=[output_audio, output_text],
+         api_name="s2st",
+     )

+ with gr.Blocks() as demo_s2tt:
+     with gr.Group():
+         target_language = gr.Dropdown(
+             label="Target language",
+             choices=S2TT_TARGET_LANGUAGE_NAMES,
+             value=DEFAULT_TARGET_LANGUAGE,
          )
+         input_audio = gr.Audio(label="Input speech", type="filepath")
+         btn = gr.Button("Translate")
+         output_text = gr.Textbox(label="Translated text")
+
+     gr.Examples(
+         examples=[
+             ["assets/sample_input.mp3", "French"],
+             ["assets/sample_input.mp3", "Mandarin Chinese"],
+             ["assets/sample_input_2.mp3", "Hindi"],
+             ["assets/sample_input_2.mp3", "Spanish"],
+         ],
+         inputs=[input_audio, target_language],
+         outputs=output_text,
+         fn=run_s2tt,
+         cache_examples=CACHE_EXAMPLES,
+         api_name=False,
      )

+     btn.click(
+         fn=run_s2tt,
+         inputs=[input_audio, target_language],
+         outputs=output_text,
+         api_name="s2tt",
      )
+
+ with gr.Blocks() as demo_t2st:
      with gr.Group():
          with gr.Row():
              source_language = gr.Dropdown(
                  label="Source language",
                  choices=TEXT_SOURCE_LANGUAGE_NAMES,
                  value="English",
              )
              target_language = gr.Dropdown(
                  label="Target language",
+                 choices=T2TT_TARGET_LANGUAGE_NAMES,
                  value=DEFAULT_TARGET_LANGUAGE,
              )
+         input_text = gr.Textbox(label="Input text")
          btn = gr.Button("Translate")
+         output_audio = gr.Audio(
+             label="Translated speech",
+             autoplay=False,
+             streaming=False,
+             type="numpy",
          )
+         output_text = gr.Textbox(label="Translated text")
+
+     gr.Examples(
+         examples=[
+             [
+                 "My favorite animal is the elephant.",
+                 "English",
+                 "French",
              ],
+             [
+                 "My favorite animal is the elephant.",
+                 "English",
+                 "Mandarin Chinese",
              ],
+             [
+                 "Meta AI's Seamless M4T model is democratising spoken communication across language barriers",
+                 "English",
+                 "Hindi",
              ],
+             [
+                 "Meta AI's Seamless M4T model is democratising spoken communication across language barriers",
+                 "English",
+                 "Spanish",
              ],
          ],
+         inputs=[input_text, source_language, target_language],
+         outputs=[output_audio, output_text],
+         fn=run_t2st,
+         cache_examples=CACHE_EXAMPLES,
          api_name=False,
      )
+
+     gr.on(
+         triggers=[input_text.submit, btn.click],
+         fn=run_t2st,
+         inputs=[input_text, source_language, target_language],
          outputs=[output_audio, output_text],
+         api_name="t2st",
+     )
+
+ with gr.Blocks() as demo_t2tt:
+     with gr.Group():
+         with gr.Row():
+             source_language = gr.Dropdown(
+                 label="Source language",
+                 choices=TEXT_SOURCE_LANGUAGE_NAMES,
+                 value="English",
+             )
+             target_language = gr.Dropdown(
+                 label="Target language",
+                 choices=T2TT_TARGET_LANGUAGE_NAMES,
+                 value=DEFAULT_TARGET_LANGUAGE,
+             )
+         input_text = gr.Textbox(label="Input text")
+         btn = gr.Button("Translate")
+         output_text = gr.Textbox(label="Translated text")
+
+     gr.Examples(
+         examples=[
+             [
+                 "My favorite animal is the elephant.",
+                 "English",
+                 "French",
+             ],
+             [
+                 "My favorite animal is the elephant.",
+                 "English",
+                 "Mandarin Chinese",
+             ],
+             [
+                 "Meta AI's Seamless M4T model is democratising spoken communication across language barriers",
+                 "English",
+                 "Hindi",
+             ],
+             [
+                 "Meta AI's Seamless M4T model is democratising spoken communication across language barriers",
+                 "English",
+                 "Spanish",
+             ],
+         ],
+         inputs=[input_text, source_language, target_language],
+         outputs=output_text,
+         fn=run_t2tt,
+         cache_examples=CACHE_EXAMPLES,
          api_name=False,
+     )
+
+     gr.on(
+         triggers=[input_text.submit, btn.click],
+         fn=run_t2tt,
+         inputs=[input_text, source_language, target_language],
+         outputs=output_text,
+         api_name="t2tt",
+     )
+
+ with gr.Blocks() as demo_asr:
+     with gr.Group():
+         target_language = gr.Dropdown(
+             label="Target language",
+             choices=S2ST_TARGET_LANGUAGE_NAMES,
+             value=DEFAULT_TARGET_LANGUAGE,
+         )
+         input_audio = gr.Audio(label="Input speech", type="filepath")
+         btn = gr.Button("Translate")
+         output_text = gr.Textbox(label="Translated text")
+
+     gr.Examples(
+         examples=[
+             ["assets/sample_input.mp3", "English"],
+             ["assets/sample_input_2.mp3", "English"],
          ],
+         inputs=[input_audio, target_language],
+         outputs=output_text,
+         fn=run_asr,
+         cache_examples=CACHE_EXAMPLES,
          api_name=False,
      )

      btn.click(
+         fn=run_asr,
+         inputs=[input_audio, target_language],
+         outputs=output_text,
+         api_name="asr",
+     )
+
+
+ with gr.Blocks(css="style.css") as demo:
+     gr.Markdown(DESCRIPTION)
+     gr.DuplicateButton(
+         value="Duplicate Space for private use",
+         elem_id="duplicate-button",
+         visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1",
      )

+     with gr.Tabs():
+         with gr.Tab(label="S2ST"):
+             demo_s2st.render()
+         with gr.Tab(label="S2TT"):
+             demo_s2tt.render()
+         with gr.Tab(label="T2ST"):
+             demo_t2st.render()
+         with gr.Tab(label="T2TT"):
+             demo_t2tt.render()
+         with gr.Tab(label="ASR"):
+             demo_asr.render()
+
+
  if __name__ == "__main__":
      demo.queue(max_size=50).launch()
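
The rewritten UI declares one `gr.Blocks` per task (demo_s2st, demo_s2tt, demo_t2st, demo_t2tt, demo_asr) and mounts each inside a `gr.Tab` of the top-level demo via `.render()`. A minimal sketch of that composition pattern with two toy sub-demos (the names and functions here are illustrative, not from the app):

import gradio as gr

def shout(text: str) -> str:
    return text.upper()

def whisper(text: str) -> str:
    return text.lower()

# Each sub-demo is declared as its own Blocks...
with gr.Blocks() as demo_shout:
    inp = gr.Textbox(label="Input")
    out = gr.Textbox(label="Output")
    inp.submit(fn=shout, inputs=inp, outputs=out)

with gr.Blocks() as demo_whisper:
    inp = gr.Textbox(label="Input")
    out = gr.Textbox(label="Output")
    inp.submit(fn=whisper, inputs=inp, outputs=out)

# ...and rendered inside a tab of the top-level app.
with gr.Blocks() as app:
    with gr.Tabs():
        with gr.Tab(label="Shout"):
            demo_shout.render()
        with gr.Tab(label="Whisper"):
            demo_whisper.render()

if __name__ == "__main__":
    app.launch()
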
requirements.txt CHANGED
@@ -1,4 +1,4 @@
- gradio==3.50.2
+ gradio==4.3.0
  omegaconf==2.3.0
  torch==2.1.0
  torchaudio==2.1.0
style.css CHANGED
@@ -9,7 +9,7 @@ h1 {
    border-radius: 100vh;
  }

- #component-0 {
+ .contain {
    max-width: 730px;
    margin: auto;
    padding-top: 1.5rem;