File size: 3,505 Bytes
c51ea2c
 
 
 
 
 
 
606152f
c51ea2c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
606152f
c51ea2c
 
606152f
c51ea2c
 
 
606152f
c51ea2c
 
 
 
 
 
5207054
 
 
 
a984016
5207054
 
c51ea2c
 
 
 
 
 
 
 
 
 
 
 
 
a984016
 
914d75d
 
a984016
 
c51ea2c
 
 
bc6475a
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import gradio as gr
import torch
import torchaudio

from transformers import AutoProcessor, SeamlessM4TModel
# Single source of truth for the checkpoint so the processor and the model
# can never be loaded from different repositories by accident.
_CHECKPOINT = "facebook/hf-seamless-m4t-medium"

# Loaded once at import time and shared by every request.
processor = AutoProcessor.from_pretrained(_CHECKPOINT)
model = SeamlessM4TModel.from_pretrained(_CHECKPOINT)
# Inference runs on CPU; to use a GPU, move the model with model.to('cuda')
# and move the processed inputs to the same device before generating.

# (display name, language code) pairs, in the order they are listed in the
# language dropdowns.
_LANGUAGE_TABLE = (
    ("Modern Standard Arabic", "arb"),
    ("Bengali", "ben"),
    ("Catalan", "cat"),
    ("Czech", "ces"),
    ("Mandarin Chinese", "cmn"),
    ("Welsh", "cym"),
    ("Danish", "dan"),
    ("German", "deu"),
    ("English", "eng"),
    ("Estonian", "est"),
    ("Finnish", "fin"),
    ("French", "fra"),
    ("Hindi", "hin"),
    ("Indonesian", "ind"),
    ("Italian", "ita"),
    ("Japanese", "jpn"),
    ("Korean", "kor"),
    ("Maltese", "mlt"),
    ("Dutch", "nld"),
    ("Western Persian", "pes"),
    ("Polish", "pol"),
    ("Portuguese", "por"),
    ("Romanian", "ron"),
    ("Russian", "rus"),
    ("Slovak", "slk"),
    ("Spanish", "spa"),
    ("Swedish", "swe"),
    ("Swahili", "swh"),
    ("Telugu", "tel"),
    ("Tagalog", "tgl"),
    ("Thai", "tha"),
    ("Turkish", "tur"),
    ("Ukrainian", "ukr"),
    ("Urdu", "urd"),
    ("Northern Uzbek", "uzn"),
    ("Vietnamese", "vie"),
)

# Maps the human-readable language name to the code passed to the model.
language_dict = dict(_LANGUAGE_TABLE)

# Display names only, in table order, used as the dropdown choices.
languages = [display_name for display_name, _code in _LANGUAGE_TABLE]


def png(source_lang, target_lang, audio, text):
    """Translate speech or text into both speech and text.

    When ``audio`` is provided it is used as the input and ``text`` is
    ignored; otherwise ``text`` is translated.

    Args:
        source_lang: Display name of the input language (a key of
            ``language_dict``). Only used for text input.
        target_lang: Display name of the output language (a key of
            ``language_dict``).
        audio: ``None`` or a ``(sample_rate, samples)`` tuple as delivered by
            ``gr.Audio``; ``samples`` is a NumPy array, mono ``(n,)`` or
            multi-channel ``(n, channels)``.
        text: Source-language text, used only when ``audio`` is ``None``.

    Returns:
        ``((16000, waveform), translated_text)`` where ``waveform`` is a NumPy
        array holding the synthesized speech.

    Raises:
        gr.Error: If a language has not been selected, or if neither audio
            nor non-empty text was supplied.
    """
    # Unselected dropdowns arrive as None; fail with a readable message
    # instead of a bare KeyError.
    if source_lang not in language_dict or target_lang not in language_dict:
        raise gr.Error("Please select both a source and a target language.")

    source_lang_code = language_dict[source_lang]
    target_lang_code = language_dict[target_lang]

    if audio is None:
        if not text or not text.strip():
            raise gr.Error("Please provide either audio or text to translate.")
        processed_inputs = processor(text, src_lang=source_lang_code, return_tensors="pt")
    else:
        sample_rate, audio_data = audio
        waveform = torch.from_numpy(audio_data)

        if waveform.is_floating_point():
            waveform = waveform.to(torch.float32)
        else:
            # Integer PCM (Gradio normally yields int16): scale to [-1, 1]
            # floats, the usual range for waveform input.
            full_scale = torch.iinfo(waveform.dtype).max
            waveform = waveform.to(torch.float32) / full_scale

        if waveform.ndim == 2:
            # Multi-channel audio is (samples, channels). Resampling works on
            # the last axis, so downmix to mono first; otherwise the channel
            # axis would be resampled instead of time.
            waveform = waveform.mean(dim=1)

        waveform = torchaudio.functional.resample(
            waveform, orig_freq=sample_rate, new_freq=16_000
        )
        processed_inputs = processor(audios=waveform, sampling_rate=16000, return_tensors="pt")

    # One generation pass yields both the waveform and the intermediate text
    # tokens, instead of running the model a second time for the text.
    generation = model.generate(
        **processed_inputs,
        tgt_lang=target_lang_code,
        return_intermediate_token_ids=True,
    )
    generated_audio = generation.waveform.cpu().numpy().squeeze()
    generated_text = processor.decode(
        generation.sequences[0].tolist(), skip_special_tokens=True
    )

    return (16000, generated_audio), generated_text

# Module-level headline and blurb for the demo. Note that the gr.Interface
# call in this file passes its own title/description literals.
title = "36 Language Translator"
description = """
This Demo can translate either Speech or Text from any of the selected SOURCE language among 36 languages to both Speech and Text in any of the selected TARGET language.
This Demo is powered by "facebook/hf-seamless-m4t-medium" model. Thanks for checking out.
Select source and target languages for translation.
"""

# Gradio UI: two language dropdowns plus an audio and a text input feed
# `png`, which returns the translated audio and the translated text.
iface = gr.Interface(
    png,
    inputs=[
        gr.Dropdown(languages, label="Source Language"),
        gr.Dropdown(languages, label="Target Language"),
        # `png` uses the audio whenever it is present and only falls back to
        # the textbox when no audio is supplied, so say so in the label.
        gr.Audio(label="Source Audio (optional; used instead of text when provided)"),
        gr.Textbox(label="Enter Text in Source Language"),
    ],
    outputs=[
        gr.Audio(label="Translated Audio"),
        gr.Textbox(label="Translated Text"),
    ],
    title="Language Translation App",
    description="""
    This Demo can translate either Speech or Text from any of the selected SOURCE language among 36 languages to both Speech and Text in any of the selected TARGET language.
    This Demo is powered by "facebook/hf-seamless-m4t-medium" model. This Demo can take 3-4 mins as it is running on a CPU. Try this Google Colab Notebook with GPU for faster processing.
    https://colab.research.google.com/drive/1NJ_FUl0RPQI7XHHj-5p34id13A4sREV4?usp=sharing . Thanks for checking out.
    Select source and target languages for translation.
    """,
)


# Start the web server only when executed as a script, not when imported.
if __name__ == "__main__":
    iface.launch()