update with HF implementation
- Dockerfile +0 -56
- README.md +3 -2
- app.py +23 -19
- lang_list.py +148 -0
- requirements.txt +2 -5
Dockerfile
DELETED
@@ -1,56 +0,0 @@
-FROM nvidia/cuda:11.7.1-cudnn8-devel-ubuntu22.04
-ENV DEBIAN_FRONTEND=noninteractive
-RUN apt-get update && \
-    apt-get upgrade -y && \
-    apt-get install -y --no-install-recommends \
-    git \
-    git-lfs \
-    wget \
-    curl \
-    # python build dependencies \
-    build-essential \
-    libssl-dev \
-    zlib1g-dev \
-    libbz2-dev \
-    libreadline-dev \
-    libsqlite3-dev \
-    libncursesw5-dev \
-    xz-utils \
-    tk-dev \
-    libxml2-dev \
-    libxmlsec1-dev \
-    libffi-dev \
-    liblzma-dev \
-    # gradio dependencies \
-    ffmpeg \
-    # fairseq2 dependencies \
-    libsndfile-dev && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
-
-RUN useradd -m -u 1000 user
-USER user
-ENV HOME=/home/user \
-    PATH=/home/user/.local/bin:${PATH}
-WORKDIR ${HOME}/app
-
-RUN curl https://pyenv.run | bash
-ENV PATH=${HOME}/.pyenv/shims:${HOME}/.pyenv/bin:${PATH}
-ARG PYTHON_VERSION=3.10.12
-RUN pyenv install ${PYTHON_VERSION} && \
-    pyenv global ${PYTHON_VERSION} && \
-    pyenv rehash && \
-    pip install --no-cache-dir -U pip setuptools wheel
-
-COPY --chown=1000 ./requirements.txt /tmp/requirements.txt
-RUN pip install --no-cache-dir --upgrade -r /tmp/requirements.txt
-
-COPY --chown=1000 . ${HOME}/app
-ENV PYTHONPATH=${HOME}/app \
-    PYTHONUNBUFFERED=1 \
-    GRADIO_ALLOW_FLAGGING=never \
-    GRADIO_NUM_PORTS=1 \
-    GRADIO_SERVER_NAME=0.0.0.0 \
-    GRADIO_THEME=huggingface \
-    SYSTEM=spaces
-CMD ["python", "app.py"]
README.md
CHANGED
@@ -3,9 +3,10 @@ title: Seamless M4T
 emoji: 📞
 colorFrom: blue
 colorTo: yellow
-sdk: docker
+sdk: gradio
+app_file: app.py
 pinned: false
 suggested_hardware: t4-medium
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
CHANGED
@@ -6,7 +6,7 @@ import gradio as gr
 import numpy as np
 import torch
 import torchaudio
-from
+from transformers import AutoProcessor, SeamlessM4TModel
 
 from lang_list import (
     LANGUAGE_NAME_TO_CODE,
@@ -14,13 +14,12 @@ from lang_list import (
     S2TT_TARGET_LANGUAGE_NAMES,
     T2TT_TARGET_LANGUAGE_NAMES,
     TEXT_SOURCE_LANGUAGE_NAMES,
+    LANG_TO_SPKR_ID,
 )
 
 DESCRIPTION = """# SeamlessM4T
-
 [SeamlessM4T](https://github.com/facebookresearch/seamless_communication) is designed to provide high-quality
 translation, allowing people from different linguistic communities to communicate effortlessly through speech and text.
-
 This unified model enables multiple tasks like Speech-to-Speech (S2ST), Speech-to-Text (S2TT), Text-to-Speech (T2ST)
 translation and more, without relying on multiple separate models.
 """
@@ -39,11 +38,9 @@ MAX_INPUT_AUDIO_LENGTH = 60 # in seconds
 DEFAULT_TARGET_LANGUAGE = "French"
 
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-
-
-
-    device=device,
-)
+
+processor = AutoProcessor.from_pretrained("ylacombe/hf-seamless-m4t-large")
+model = SeamlessM4TModel.from_pretrained("ylacombe/hf-seamless-m4t-large").to(device)
 
 
 def predict(
@@ -71,18 +68,25 @@
         if new_arr.shape[1] > max_length:
             new_arr = new_arr[:, :max_length]
             gr.Warning(f"Input audio is too long. Only the first {MAX_INPUT_AUDIO_LENGTH} seconds is used.")
-
+
+
+        input_data = processor(audios = new_arr, sampling_rate=AUDIO_SAMPLE_RATE, return_tensors="pt").to(device)
     else:
-        input_data = input_text
-
-
-
-            tgt_lang=target_language_code,
-
-
-
+        input_data = processor(text = input_text, src_lang=source_language_code, return_tensors="pt").to(device)
+
+
+    if task_name in ["S2TT", "T2TT"]:
+        tokens_ids = model.generate(**input_data, generate_speech=False, tgt_lang=target_language_code, num_beams=5, do_sample=True)[0].cpu().squeeze().detach().tolist()
+    else:
+        output = model.generate(**input_data, return_intermediate_token_ids=True, tgt_lang=target_language_code, num_beams=5, do_sample=True, spkr_id=LANG_TO_SPKR_ID[target_language_code][0])
+
+        waveform = output.waveform.cpu().squeeze().detach().numpy()
+        tokens_ids = output.sequences.cpu().squeeze().detach().tolist()
+
+    text_out = processor.decode(tokens_ids, skip_special_tokens=True)
+
     if task_name in ["S2ST", "T2ST"]:
-        return (
+        return (AUDIO_SAMPLE_RATE, waveform), text_out
     else:
         return None, text_out
 
@@ -430,4 +434,4 @@ demo.queue(max_size=50).launch()
 
 # Linking models to the space
 # 'facebook/seamless-m4t-large'
-# 'facebook/SONAR'
+# 'facebook/SONAR'
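For reference, the new inference path in app.py boils down to the standalone sketch below. It is hedged rather than an exact copy: the checkpoint id, processor/model calls, and generate() arguments are taken from the diff above, while the example strings, the hard-coded French speaker id, and the assumed 16 kHz mono output are illustrative.

import torch
from transformers import AutoProcessor, SeamlessM4TModel

CHECKPOINT = "ylacombe/hf-seamless-m4t-large"  # same checkpoint the Space loads
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

processor = AutoProcessor.from_pretrained(CHECKPOINT)
model = SeamlessM4TModel.from_pretrained(CHECKPOINT).to(device)

# T2ST: text in, translated speech plus text out (the "else" branch in the diff).
inputs = processor(text="Hello, my dog is cute.", src_lang="eng", return_tensors="pt").to(device)
output = model.generate(
    **inputs,
    tgt_lang="fra",
    return_intermediate_token_ids=True,  # also return the text tokens, not just audio
    num_beams=5,
    do_sample=True,
    spkr_id=15,  # LANG_TO_SPKR_ID["fra"][0] from lang_list.py
)
waveform = output.waveform.cpu().squeeze().detach().numpy()  # mono float array, 16 kHz assumed
text_out = processor.decode(output.sequences.cpu().squeeze().tolist(), skip_special_tokens=True)
print(text_out, waveform.shape)

# S2TT/T2TT: text-only output, skipping the vocoder (the generate_speech=False branch).
token_ids = model.generate(**inputs, tgt_lang="fra", generate_speech=False)[0]
print(processor.decode(token_ids.cpu().squeeze().tolist(), skip_special_tokens=True))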
lang_list.py
CHANGED
@@ -252,3 +252,151 @@ S2ST_TARGET_LANGUAGE_NAMES = sorted([language_code_to_name[code] for code in s2s
 S2TT_TARGET_LANGUAGE_NAMES = TEXT_SOURCE_LANGUAGE_NAMES
 # T2TT
 T2TT_TARGET_LANGUAGE_NAMES = TEXT_SOURCE_LANGUAGE_NAMES
+
+
+LANG_TO_SPKR_ID = {
+    "arb": [0],
+    "ben": [2, 1],
+    "cat": [3],
+    "ces": [4],
+    "cmn": [5],
+    "cym": [6],
+    "dan": [7, 8],
+    "deu": [9],
+    "eng": [10],
+    "est": [11, 12, 13],
+    "fin": [14],
+    "fra": [15],
+    "hin": [16],
+    "ind": [17, 24, 18, 20, 19, 21, 23, 27, 26, 22, 25],
+    "ita": [29, 28],
+    "jpn": [30],
+    "kor": [31],
+    "mlt": [32, 33, 34],
+    "nld": [35],
+    "pes": [36],
+    "pol": [37],
+    "por": [38],
+    "ron": [39],
+    "rus": [40],
+    "slk": [41],
+    "spa": [42],
+    "swe": [43, 45, 44],
+    "swh": [46, 48, 47],
+    "tel": [49],
+    "tgl": [50],
+    "tha": [51, 54, 55, 52, 53],
+    "tur": [58, 57, 56],
+    "ukr": [59],
+    "urd": [60, 61, 62],
+    "uzn": [63, 64, 65],
+    "vie": [66, 67, 70, 71, 68, 69],
+}
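The new table maps each speech target language (ISO 639-3 code) to speaker ids for the model's speech synthesis; app.py passes the first entry as spkr_id. A minimal usage sketch (the helper name is mine):

from lang_list import LANG_TO_SPKR_ID

def default_speaker_id(tgt_lang: str) -> int:
    """First registered speaker id for a target language, as app.py uses it
    via LANG_TO_SPKR_ID[target_language_code][0]."""
    if tgt_lang not in LANG_TO_SPKR_ID:
        raise ValueError(f"No speech synthesis speaker registered for {tgt_lang!r}")
    return LANG_TO_SPKR_ID[tgt_lang][0]

print(default_speaker_id("fra"))  # 15
print(default_speaker_id("vie"))  # 66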
requirements.txt
CHANGED
@@ -1,6 +1,3 @@
-
-git+https://github.com/facebookresearch/seamless_communication
-gradio==3.40.1
-huggingface_hub==0.16.4
-torch==2.0.1
+git+https://github.com/huggingface/transformers
 torchaudio==2.0.2
+sentencepiece
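A quick, hedged sanity check of the new pins (assuming they are installed): the transformers build must be recent enough to ship the SeamlessM4T classes app.py imports, and sentencepiece is presumably pulled in for the tokenizer that AutoProcessor loads for this model.

# Import-only check: no weights are downloaded.
import sentencepiece  # noqa: F401  (assumption: needed by the SeamlessM4T tokenizer)
from transformers import AutoProcessor, SeamlessM4TModel  # fails on older transformers releases

print("transformers exposes", SeamlessM4TModel.__name__, "and", AutoProcessor.__name__)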