Create app.py
app.py
ADDED
@@ -0,0 +1,92 @@
import gradio as gr
import torch
import torchaudio

from transformers import AutoProcessor, SeamlessM4TModel

# Load the SeamlessM4T medium checkpoint and move the model to the GPU
# (a CUDA device is assumed to be available throughout).
processor = AutoProcessor.from_pretrained("facebook/hf-seamless-m4t-medium")
model = SeamlessM4TModel.from_pretrained("facebook/hf-seamless-m4t-medium")
model.to("cuda")

# Display names shown in the UI mapped to SeamlessM4T language codes.
language_dict = {
    "Modern Standard Arabic": "arb",
    "Bengali": "ben",
    "Catalan": "cat",
    "Czech": "ces",
    "Mandarin Chinese": "cmn",
    "Welsh": "cym",
    "Danish": "dan",
    "German": "deu",
    "English": "eng",
    "Estonian": "est",
    "Finnish": "fin",
    "French": "fra",
    "Hindi": "hin",
    "Indonesian": "ind",
    "Italian": "ita",
    "Japanese": "jpn",
    "Korean": "kor",
    "Maltese": "mlt",
    "Dutch": "nld",
    "Western Persian": "pes",
    "Polish": "pol",
    "Portuguese": "por",
    "Romanian": "ron",
    "Russian": "rus",
    "Slovak": "slk",
    "Spanish": "spa",
    "Swedish": "swe",
    "Swahili": "swh",
    "Telugu": "tel",
    "Tagalog": "tgl",
    "Thai": "tha",
    "Turkish": "tur",
    "Ukrainian": "ukr",
    "Urdu": "urd",
    "Northern Uzbek": "uzn",
    "Vietnamese": "vie",
}
languages = list(language_dict.keys())


def translate(source_lang, target_lang, audio, text):
    source_lang_code = language_dict[source_lang]
    target_lang_code = language_dict[target_lang]

    if audio is None:
        # No recording supplied: translate the typed text instead.
        processed_inputs = processor(text, src_lang=source_lang_code, return_tensors="pt")
    else:
        # gr.Audio provides (sample_rate, numpy array); convert to float32 and
        # resample to the 16 kHz expected by the model.
        sample_rate, audio_data = audio
        audio_tensor = torch.from_numpy(audio_data).to(torch.float32)
        audio_tensor = torchaudio.functional.resample(audio_tensor, orig_freq=sample_rate, new_freq=16_000)
        processed_inputs = processor(audios=audio_tensor, sampling_rate=16_000, return_tensors="pt")

    processed_inputs = processed_inputs.to("cuda")

    # First pass generates the translated speech waveform; a second,
    # text-only pass produces the translated text tokens.
    generated_audio = model.generate(**processed_inputs, tgt_lang=target_lang_code)[0].cpu().numpy().squeeze()
    output_tokens = model.generate(**processed_inputs, tgt_lang=target_lang_code, generate_speech=False)
    generated_text = processor.decode(output_tokens[0].tolist()[0], skip_special_tokens=True)

    return (16_000, generated_audio), generated_text


iface = gr.Interface(
    translate,
    inputs=[
        gr.Dropdown(languages, label="Source Language"),
        gr.Dropdown(languages, label="Target Language"),
        gr.Audio(),
        gr.Textbox(label="Enter Text in Source Language"),
    ],
    outputs=[
        gr.Audio(label="Translated Audio"),
        gr.Textbox(label="Translated Text"),
    ],
    title="Language Translation App",
    description="Select source and target languages for translation.",
)

iface.launch(debug=True)
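Not part of the committed file, but a quick way to sanity-check the text-only path is to call translate() directly in a Python session on the same hardware; this sketch assumes the CUDA device used above is available and the model has finished loading.

# Hypothetical smoke test: text-only translation from English to French.
(sample_rate, waveform), translated_text = translate("English", "French", None, "Hello, how are you?")
print(translated_text)               # translated sentence as a string
print(sample_rate, waveform.shape)   # 16 kHz mono waveform as a numpy array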