# Gradio demo: speech/text translation with SeamlessM4T (medium).
import gradio as gr
import torch
import torchaudio
from transformers import AutoProcessor, SeamlessM4TModel
# Load the SeamlessM4T processor and model once at module import time.
processor = AutoProcessor.from_pretrained("facebook/hf-seamless-m4t-medium")
model = SeamlessM4TModel.from_pretrained("facebook/hf-seamless-m4t-medium")
# Fall back to CPU when no CUDA device is available instead of crashing at import.
model.to("cuda" if torch.cuda.is_available() else "cpu")
# Maps the human-readable language names shown in the UI dropdowns to the
# three-letter language codes SeamlessM4T expects (e.g. for `src_lang` /
# `tgt_lang` in processor/generate calls).
language_dict = {
"Modern Standard Arabic" : "arb",
"Bengali" : "ben",
"Catalan" : "cat",
"Czech" : "ces",
"Mandarin Chinese" : "cmn",
"Welsh" : "cym",
"Danish" : "dan",
"German" : "deu",
"English" : "eng",
"Estonian" : "est",
"Finnish" : "fin",
"French" : "fra",
"Hindi" : "hin",
"Indonesian" : "ind",
"Italian" : "ita",
"Japanese" : "jpn",
"Korean" : "kor",
"Maltese" : "mlt",
"Dutch" : "nld",
"Western Persian" : "pes",
"Polish" : "pol",
"Portuguese" : "por",
"Romanian" : "ron",
"Russian" : "rus",
"Slovak" : "slk",
"Spanish" : "spa",
"Swedish" : "swe",
"Swahili" : "swh",
"Telugu" : "tel",
"Tagalog" : "tgl",
"Thai" : "tha",
"Turkish" : "tur",
"Ukrainian" : "ukr",
"Urdu" : "urd",
"Northern Uzbek" : "uzn",
"Vietnamese" : "vie"
}
# Display names in insertion order — this is the choice list for both dropdowns.
languages = list(language_dict.keys())
def png(source_lang, target_lang, audio, text):
    """Translate speech or text into the target language.

    Parameters
    ----------
    source_lang, target_lang : str
        Human-readable language names; must be keys of ``language_dict``.
    audio : tuple | None
        Gradio audio input as ``(sample_rate, samples_ndarray)``; ``None``
        when the user supplied text instead.
    text : str
        Source-language text; used only when ``audio`` is None.

    Returns
    -------
    tuple
        ``((16000, translated_waveform), translated_text)`` — the formats the
        Gradio Audio and Textbox outputs expect.
    """
    source_lang_code = language_dict[source_lang]
    target_lang_code = language_dict[target_lang]
    # Use whatever device the model actually lives on instead of hardcoding
    # CUDA, so the function also works on CPU-only hosts.
    device = next(model.parameters()).device
    if audio is None:  # `is None`, not `== None` (PEP 8)
        processed_inputs = processor(text, src_lang=source_lang_code, return_tensors="pt")
    else:
        sample_rate, audio_data = audio
        # The model expects float32 audio at 16 kHz. Resample on CPU directly;
        # the original CPU->CUDA->CPU round-trip around resample was wasted work.
        waveform = torch.from_numpy(audio_data).to(torch.float32)
        waveform = torchaudio.functional.resample(waveform, orig_freq=sample_rate, new_freq=16_000)
        # NOTE(review): assumes mono samples; stereo gradio input may need a
        # channel transpose/mixdown — confirm against the Audio component config.
        processed_inputs = processor(audios=waveform, sampling_rate=16000, return_tensors="pt")
    processed_inputs = processed_inputs.to(device)
    # First pass synthesizes the translated waveform; the second pass with
    # generate_speech=False yields token ids that decode into the text.
    generated_audio = model.generate(**processed_inputs, tgt_lang=target_lang_code)[0].cpu().numpy().squeeze()
    output_tokens = model.generate(**processed_inputs, tgt_lang=target_lang_code, generate_speech=False)
    generated_text = processor.decode(output_tokens[0].tolist()[0], skip_special_tokens=True)
    return (16000, generated_audio), generated_text
# Assemble the Gradio UI: two language pickers plus audio/text inputs,
# and translated audio/text outputs, wired to `png`.
input_widgets = [
    gr.Dropdown(languages, label="Source Language"),
    gr.Dropdown(languages, label="Target Language"),
    gr.Audio(),
    gr.Textbox(label="Enter Text in Source Language"),
]
output_widgets = [
    gr.Audio(label="Translated Audio"),
    gr.Textbox(label="Translated Text"),
]
iface = gr.Interface(
    png,
    inputs=input_widgets,
    outputs=output_widgets,
    title="Language Translation App",
    description="Select source and target languages for translation.",
)
iface.launch(debug=True)