Mark Duppenthaler committed
Commit 5cae5d7
1 Parent(s): 108017c

Update with streaming input

Files changed (3):
  1. Dockerfile +5 -1
  2. app.py +54 -407
  3. requirements.txt +3 -1
Dockerfile CHANGED
@@ -53,4 +53,8 @@ ENV PYTHONPATH=${HOME}/app \
      GRADIO_SERVER_NAME=0.0.0.0 \
      GRADIO_THEME=huggingface \
      SYSTEM=spaces
- CMD ["python", "app.py"]
+
+ # gradio instead of python for reload on file save with a mounted pwd volume:
+ # docker run -p 7860:7860 -v $(pwd):/home/user/app seamless_m4t_text
+ CMD ["gradio", "app.py"]
+ # CMD ["python", "app.py"]
app.py CHANGED
@@ -8,428 +8,75 @@ import torch
  import torchaudio
  from seamless_communication.models.inference.translator import Translator

- from lang_list import (
-     LANGUAGE_NAME_TO_CODE,
-     S2ST_TARGET_LANGUAGE_NAMES,
-     S2TT_TARGET_LANGUAGE_NAMES,
-     T2TT_TARGET_LANGUAGE_NAMES,
-     TEXT_SOURCE_LANGUAGE_NAMES,
- )
-
- DESCRIPTION = """# SeamlessM4T
-
- [SeamlessM4T](https://github.com/facebookresearch/seamless_communication) is designed to provide high-quality
- translation, allowing people from different linguistic communities to communicate effortlessly through speech and text.
-
- This unified model enables multiple tasks like Speech-to-Speech (S2ST), Speech-to-Text (S2TT), Text-to-Speech (T2ST)
- translation and more, without relying on multiple separate models.
- """
-
- CACHE_EXAMPLES = os.getenv("CACHE_EXAMPLES") == "1"
-
- TASK_NAMES = [
-     "S2ST (Speech to Speech translation)",
-     "S2TT (Speech to Text translation)",
-     "T2ST (Text to Speech translation)",
-     "T2TT (Text to Text translation)",
-     "ASR (Automatic Speech Recognition)",
- ]
- AUDIO_SAMPLE_RATE = 16000.0
- MAX_INPUT_AUDIO_LENGTH = 60  # in seconds
- DEFAULT_TARGET_LANGUAGE = "French"
-
- device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
- translator = Translator(
-     model_name_or_card="seamlessM4T_large",
-     vocoder_name_or_card="vocoder_36langs",
-     device=device,
-     dtype=torch.float16,
- )
-
-
- def predict(
-     task_name: str,
-     audio_source: str,
-     input_audio_mic: str | None,
-     input_audio_file: str | None,
-     input_text: str | None,
-     source_language: str | None,
-     target_language: str,
- ) -> tuple[tuple[int, np.ndarray] | None, str]:
-     task_name = task_name.split()[0]
-     source_language_code = LANGUAGE_NAME_TO_CODE[source_language] if source_language else None
-     target_language_code = LANGUAGE_NAME_TO_CODE[target_language]
-
-     if task_name in ["S2ST", "S2TT", "ASR"]:
-         if audio_source == "microphone":
-             input_data = input_audio_mic
-         else:
-             input_data = input_audio_file
-
-         arr, org_sr = torchaudio.load(input_data)
-         new_arr = torchaudio.functional.resample(arr, orig_freq=org_sr, new_freq=AUDIO_SAMPLE_RATE)
-         max_length = int(MAX_INPUT_AUDIO_LENGTH * AUDIO_SAMPLE_RATE)
-         if new_arr.shape[1] > max_length:
-             new_arr = new_arr[:, :max_length]
-             gr.Warning(f"Input audio is too long. Only the first {MAX_INPUT_AUDIO_LENGTH} seconds is used.")
-         torchaudio.save(input_data, new_arr, sample_rate=int(AUDIO_SAMPLE_RATE))
-     else:
-         input_data = input_text
-     text_out, wav, sr = translator.predict(
-         input=input_data,
-         task_str=task_name,
-         tgt_lang=target_language_code,
-         src_lang=source_language_code,
-         ngram_filtering=True,
-         sample_rate=AUDIO_SAMPLE_RATE,
-     )
-     if task_name in ["S2ST", "T2ST"]:
-         return (sr, wav.cpu().detach().numpy()), text_out
-     else:
-         return None, text_out
-
-
- def process_s2st_example(input_audio_file: str, target_language: str) -> tuple[tuple[int, np.ndarray] | None, str]:
-     return predict(
-         task_name="S2ST",
-         audio_source="file",
-         input_audio_mic=None,
-         input_audio_file=input_audio_file,
-         input_text=None,
-         source_language=None,
-         target_language=target_language,
-     )
-
-
- def process_s2tt_example(input_audio_file: str, target_language: str) -> tuple[tuple[int, np.ndarray] | None, str]:
-     return predict(
-         task_name="S2TT",
-         audio_source="file",
-         input_audio_mic=None,
-         input_audio_file=input_audio_file,
-         input_text=None,
-         source_language=None,
-         target_language=target_language,
-     )
-
-
- def process_t2st_example(
-     input_text: str, source_language: str, target_language: str
- ) -> tuple[tuple[int, np.ndarray] | None, str]:
-     return predict(
-         task_name="T2ST",
-         audio_source="",
-         input_audio_mic=None,
-         input_audio_file=None,
-         input_text=input_text,
-         source_language=source_language,
-         target_language=target_language,
-     )
-
-
- def process_t2tt_example(
-     input_text: str, source_language: str, target_language: str
- ) -> tuple[tuple[int, np.ndarray] | None, str]:
-     return predict(
-         task_name="T2TT",
-         audio_source="",
-         input_audio_mic=None,
-         input_audio_file=None,
-         input_text=input_text,
-         source_language=source_language,
-         target_language=target_language,
-     )
-
-
- def process_asr_example(input_audio_file: str, target_language: str) -> tuple[tuple[int, np.ndarray] | None, str]:
-     return predict(
-         task_name="ASR",
-         audio_source="file",
-         input_audio_mic=None,
-         input_audio_file=input_audio_file,
-         input_text=None,
-         source_language=None,
-         target_language=target_language,
-     )
-
-
- def update_audio_ui(audio_source: str) -> tuple[dict, dict]:
-     mic = audio_source == "microphone"
-     return (
-         gr.update(visible=mic, value=None),  # input_audio_mic
-         gr.update(visible=not mic, value=None),  # input_audio_file
-     )
-
-
- def update_input_ui(task_name: str) -> tuple[dict, dict, dict, dict]:
-     task_name = task_name.split()[0]
-     if task_name == "S2ST":
-         return (
-             gr.update(visible=True),  # audio_box
-             gr.update(visible=False),  # input_text
-             gr.update(visible=False),  # source_language
-             gr.update(
-                 visible=True, choices=S2ST_TARGET_LANGUAGE_NAMES, value=DEFAULT_TARGET_LANGUAGE
-             ),  # target_language
-         )
-     elif task_name == "S2TT":
-         return (
-             gr.update(visible=True),  # audio_box
-             gr.update(visible=False),  # input_text
-             gr.update(visible=False),  # source_language
-             gr.update(
-                 visible=True, choices=S2TT_TARGET_LANGUAGE_NAMES, value=DEFAULT_TARGET_LANGUAGE
-             ),  # target_language
-         )
-     elif task_name == "T2ST":
-         return (
-             gr.update(visible=False),  # audio_box
-             gr.update(visible=True),  # input_text
-             gr.update(visible=True),  # source_language
-             gr.update(
-                 visible=True, choices=S2ST_TARGET_LANGUAGE_NAMES, value=DEFAULT_TARGET_LANGUAGE
-             ),  # target_language
-         )
-     elif task_name == "T2TT":
-         return (
-             gr.update(visible=False),  # audio_box
-             gr.update(visible=True),  # input_text
-             gr.update(visible=True),  # source_language
-             gr.update(
-                 visible=True, choices=T2TT_TARGET_LANGUAGE_NAMES, value=DEFAULT_TARGET_LANGUAGE
-             ),  # target_language
-         )
-     elif task_name == "ASR":
-         return (
-             gr.update(visible=True),  # audio_box
-             gr.update(visible=False),  # input_text
-             gr.update(visible=False),  # source_language
-             gr.update(
-                 visible=True, choices=S2TT_TARGET_LANGUAGE_NAMES, value=DEFAULT_TARGET_LANGUAGE
-             ),  # target_language
-         )
-     else:
-         raise ValueError(f"Unknown task: {task_name}")
-
-
- def update_output_ui(task_name: str) -> tuple[dict, dict]:
-     task_name = task_name.split()[0]
-     if task_name in ["S2ST", "T2ST"]:
-         return (
-             gr.update(visible=True, value=None),  # output_audio
-             gr.update(value=None),  # output_text
-         )
-     elif task_name in ["S2TT", "T2TT", "ASR"]:
-         return (
-             gr.update(visible=False, value=None),  # output_audio
-             gr.update(value=None),  # output_text
-         )
-     else:
-         raise ValueError(f"Unknown task: {task_name}")
-
-
- def update_example_ui(task_name: str) -> tuple[dict, dict, dict, dict, dict]:
-     task_name = task_name.split()[0]
-     return (
-         gr.update(visible=task_name == "S2ST"),  # s2st_example_row
-         gr.update(visible=task_name == "S2TT"),  # s2tt_example_row
-         gr.update(visible=task_name == "T2ST"),  # t2st_example_row
-         gr.update(visible=task_name == "T2TT"),  # t2tt_example_row
-         gr.update(visible=task_name == "ASR"),  # asr_example_row
-     )
-
-
- with gr.Blocks(css="style.css") as demo:
-     gr.Markdown(DESCRIPTION)
-     gr.DuplicateButton(
-         value="Duplicate Space for private use",
-         elem_id="duplicate-button",
-         visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1",
-     )
-     with gr.Group():
-         task_name = gr.Dropdown(
-             label="Task",
-             choices=TASK_NAMES,
-             value=TASK_NAMES[0],
-         )
-         with gr.Row():
-             source_language = gr.Dropdown(
-                 label="Source language",
-                 choices=TEXT_SOURCE_LANGUAGE_NAMES,
-                 value="English",
-                 visible=False,
-             )
-             target_language = gr.Dropdown(
-                 label="Target language",
-                 choices=S2ST_TARGET_LANGUAGE_NAMES,
-                 value=DEFAULT_TARGET_LANGUAGE,
-             )
-         with gr.Row() as audio_box:
-             audio_source = gr.Radio(
-                 label="Audio source",
-                 choices=["file", "microphone"],
-                 value="file",
-             )
-             input_audio_mic = gr.Audio(
-                 label="Input speech",
-                 type="filepath",
-                 source="microphone",
-                 visible=False,
-             )
-             input_audio_file = gr.Audio(
-                 label="Input speech",
-                 type="filepath",
-                 source="upload",
-                 visible=True,
-             )
-         input_text = gr.Textbox(label="Input text", visible=False)
-         btn = gr.Button("Translate")
-         with gr.Column():
-             output_audio = gr.Audio(
-                 label="Translated speech",
-                 autoplay=False,
-                 streaming=False,
-                 type="numpy",
-             )
-             output_text = gr.Textbox(label="Translated text")
-
-     with gr.Row(visible=True) as s2st_example_row:
-         s2st_examples = gr.Examples(
-             examples=[
-                 ["assets/sample_input.mp3", "French"],
-                 ["assets/sample_input.mp3", "Mandarin Chinese"],
-                 ["assets/sample_input_2.mp3", "Hindi"],
-                 ["assets/sample_input_2.mp3", "Spanish"],
-             ],
-             inputs=[input_audio_file, target_language],
-             outputs=[output_audio, output_text],
-             fn=process_s2st_example,
-             cache_examples=CACHE_EXAMPLES,
-         )
-     with gr.Row(visible=False) as s2tt_example_row:
-         s2tt_examples = gr.Examples(
-             examples=[
-                 ["assets/sample_input.mp3", "French"],
-                 ["assets/sample_input.mp3", "Mandarin Chinese"],
-                 ["assets/sample_input_2.mp3", "Hindi"],
-                 ["assets/sample_input_2.mp3", "Spanish"],
-             ],
-             inputs=[input_audio_file, target_language],
-             outputs=[output_audio, output_text],
-             fn=process_s2tt_example,
-             cache_examples=CACHE_EXAMPLES,
-         )
-     with gr.Row(visible=False) as t2st_example_row:
-         t2st_examples = gr.Examples(
-             examples=[
-                 ["My favorite animal is the elephant.", "English", "French"],
-                 ["My favorite animal is the elephant.", "English", "Mandarin Chinese"],
-                 [
-                     "Meta AI's Seamless M4T model is democratising spoken communication across language barriers",
-                     "English",
-                     "Hindi",
-                 ],
-                 [
-                     "Meta AI's Seamless M4T model is democratising spoken communication across language barriers",
-                     "English",
-                     "Spanish",
-                 ],
-             ],
-             inputs=[input_text, source_language, target_language],
-             outputs=[output_audio, output_text],
-             fn=process_t2st_example,
-             cache_examples=CACHE_EXAMPLES,
-         )
-     with gr.Row(visible=False) as t2tt_example_row:
-         t2tt_examples = gr.Examples(
-             examples=[
-                 ["My favorite animal is the elephant.", "English", "French"],
-                 ["My favorite animal is the elephant.", "English", "Mandarin Chinese"],
-                 [
-                     "Meta AI's Seamless M4T model is democratising spoken communication across language barriers",
-                     "English",
-                     "Hindi",
-                 ],
-                 [
-                     "Meta AI's Seamless M4T model is democratising spoken communication across language barriers",
-                     "English",
-                     "Spanish",
-                 ],
-             ],
-             inputs=[input_text, source_language, target_language],
-             outputs=[output_audio, output_text],
-             fn=process_t2tt_example,
-             cache_examples=CACHE_EXAMPLES,
-         )
-     with gr.Row(visible=False) as asr_example_row:
-         asr_examples = gr.Examples(
-             examples=[
-                 ["assets/sample_input.mp3", "English"],
-                 ["assets/sample_input_2.mp3", "English"],
-             ],
-             inputs=[input_audio_file, target_language],
-             outputs=[output_audio, output_text],
-             fn=process_asr_example,
-             cache_examples=CACHE_EXAMPLES,
-         )
-
-     audio_source.change(
-         fn=update_audio_ui,
-         inputs=audio_source,
-         outputs=[
-             input_audio_mic,
-             input_audio_file,
-         ],
-         queue=False,
-         api_name=False,
-     )
-     task_name.change(
-         fn=update_input_ui,
-         inputs=task_name,
-         outputs=[
-             audio_box,
-             input_text,
-             source_language,
-             target_language,
-         ],
-         queue=False,
-         api_name=False,
-     ).then(
-         fn=update_output_ui,
-         inputs=task_name,
-         outputs=[output_audio, output_text],
-         queue=False,
-         api_name=False,
-     ).then(
-         fn=update_example_ui,
-         inputs=task_name,
-         outputs=[
-             s2st_example_row,
-             s2tt_example_row,
-             t2st_example_row,
-             t2tt_example_row,
-             asr_example_row,
-         ],
-         queue=False,
-         api_name=False,
-     )
-
-     btn.click(
-         fn=predict,
-         inputs=[
-             task_name,
-             audio_source,
-             input_audio_mic,
-             input_audio_file,
-             input_text,
-             source_language,
-             target_language,
-         ],
-         outputs=[output_audio, output_text],
-         api_name="run",
-     )
-     demo.queue(max_size=50).launch()
-
- # Linking models to the space
- # 'facebook/seamless-m4t-large'
- # 'facebook/SONAR'
+ from transformers import pipeline
+
+ p = pipeline("automatic-speech-recognition")
+
+ from pydub import AudioSegment
+ import time
+ from time import sleep
+
+
+ def transcribe(audio, state=""):
+     # sleep(2)
+     print('state', state)
+     text = p(audio)["text"]
+     state += text + " "
+     return state
+
+
+ def blocks():
+     with gr.Blocks() as demo:
+         total_audio_bytes_state = gr.State(bytes())
+         total_text_state = gr.State("")
+
+         # input_audio = gr.Audio(label="Input Audio", type="filepath", format="mp3")
+         input_audio = gr.Audio(label="Input Audio", type="filepath", format="mp3", source="microphone", streaming=True)
+         with gr.Row():
+             with gr.Column():
+                 stream_as_bytes_btn = gr.Button("Stream as Bytes")
+                 stream_as_bytes_output = gr.Audio(format="bytes", streaming=True)
+                 stream_output_text = gr.Textbox(label="Translated text")
+
+
+         def stream_bytes(audio_file, total_audio_bytes_state, total_text_state):
+             chunk_size = 30000
+
+             print(f"audio_file {audio_file}, size {os.path.getsize(audio_file)}")
+             with open(audio_file, "rb") as f:
+
+                 while True:
+                     chunk = f.read(chunk_size)
+                     if chunk:
+                         total_audio_bytes_state += chunk
+                         print('yielding chunk', len(chunk))
+                         print('total audio bytes', len(total_audio_bytes_state))
+                         print(f"Text state: {total_text_state}")
+
+                         # This re-transcribes the whole input every time
+                         # total_text = transcribe(chunk, "")
+                         # yield total_audio_bytes_state, total_text, total_audio_bytes_state, total_text_state
+
+                         # This transcribes just the new chunk every time
+                         total_text_state = transcribe(chunk, total_text_state)
+                         total_text = total_text_state
+                         # total_text = transcribe(chunk, total_text)
+                         yield total_audio_bytes_state, total_text, total_audio_bytes_state, total_text_state
+                         # sleep(3)
+                     else:
+                         break
+
+         def clear():
+             print('clearing')
+             return [bytes(), ""]
+
+         stream_as_bytes_btn.click(stream_bytes, [input_audio, total_audio_bytes_state, total_text_state], [stream_as_bytes_output, stream_output_text, total_audio_bytes_state, total_text_state])
+
+         input_audio.change(stream_bytes, [input_audio, total_audio_bytes_state, total_text_state], [stream_as_bytes_output, stream_output_text, total_audio_bytes_state, total_text_state])
+         input_audio.clear(clear, None, [total_audio_bytes_state, total_text_state])
+         input_audio.start_recording(clear, None, [total_audio_bytes_state, total_text_state])
+
+     demo.queue().launch()
+
+ # if __name__ == "__main__":
+ blocks()
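The heart of the new app.py is stream_bytes: it reads the recorded file in fixed-size byte chunks, transcribes only the newly read chunk, and appends the result to an accumulated text state instead of re-transcribing everything on each pass. Below is a minimal sketch of that pattern outside the Gradio UI. The 30000-byte chunk size and the assets/sample_input.mp3 path come from this repo; the default ASR checkpoint, ffmpeg being available, and each MP3 chunk decoding on its own are assumptions.

```python
# Sketch of the incremental transcription loop used by stream_bytes(), without Gradio.
import os
from transformers import pipeline

asr = pipeline("automatic-speech-recognition")  # default checkpoint download assumed
CHUNK_SIZE = 30000  # bytes, as in stream_bytes()


def transcribe_chunk(audio_bytes: bytes, state: str = "") -> str:
    """Transcribe one chunk and append it to the accumulated text state."""
    text = asr(audio_bytes)["text"]
    return state + text + " "


def stream_file(path: str):
    """Yield (total_bytes, total_text) after each chunk, mirroring stream_bytes()."""
    total_audio = bytes()
    total_text = ""
    with open(path, "rb") as f:
        while True:
            chunk = f.read(CHUNK_SIZE)
            if not chunk:
                break
            total_audio += chunk
            # Only the new chunk is transcribed; earlier text is kept in state.
            total_text = transcribe_chunk(chunk, total_text)
            yield total_audio, total_text


if __name__ == "__main__":
    sample = "assets/sample_input.mp3"  # sample file shipped with the Space
    print(f"size: {os.path.getsize(sample)} bytes")
    for audio_so_far, text_so_far in stream_file(sample):
        print(f"{len(audio_so_far):>7} bytes -> {text_so_far}")
```

Transcribing each chunk independently keeps the per-update cost constant (the commented-out alternative re-transcribes the whole input every time), at the price of possibly splitting a word at a chunk boundary.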
 
requirements.txt CHANGED
@@ -1,6 +1,8 @@
  fairseq2==0.1.0
  git+https://github.com/facebookresearch/seamless_communication
- gradio==3.40.1
+ gradio==3.41.0
  huggingface_hub==0.16.4
  torch==2.0.1
  torchaudio==2.0.2
+ transformers==4.32.1
+ pydub
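The two new pins back the streaming input: transformers provides the pipeline("automatic-speech-recognition") used by transcribe(), and pydub is imported in app.py as AudioSegment (not yet exercised in the streaming path shown above). A quick smoke test of both, as a sketch: the sample path comes from the repo's assets, and downloading the default checkpoint plus having ffmpeg on PATH are assumptions.

```python
# Sanity-check the new dependencies from requirements.txt.
from pydub import AudioSegment
from transformers import pipeline

asr = pipeline("automatic-speech-recognition")  # default English ASR checkpoint

clip = AudioSegment.from_file("assets/sample_input.mp3")  # decode via pydub/ffmpeg
print(f"duration: {clip.duration_seconds:.1f}s")

print(asr("assets/sample_input.mp3")["text"])  # the pipeline also accepts a file path
```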