"""Gradio demo that translates speech or text between languages with SeamlessM4T."""

import gradio as gr
import torch
import torchaudio
from transformers import AutoProcessor, SeamlessM4TModel

MODEL_NAME = "facebook/hf-seamless-m4t-medium"
# Rate that input audio is resampled to before it reaches the processor, and
# the rate reported to Gradio for the generated speech.
SAMPLE_RATE = 16_000
# Fall back to CPU so the app still starts on machines without a GPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

processor = AutoProcessor.from_pretrained(MODEL_NAME)
model = SeamlessM4TModel.from_pretrained(MODEL_NAME)
model.to(DEVICE)

# Display name shown in the dropdowns -> language code passed to the model.
language_dict = {
    "Modern Standard Arabic": "arb",
    "Bengali": "ben",
    "Catalan": "cat",
    "Czech": "ces",
    "Mandarin Chinese": "cmn",
    "Welsh": "cym",
    "Danish": "dan",
    "German": "deu",
    "English": "eng",
    "Estonian": "est",
    "Finnish": "fin",
    "French": "fra",
    "Hindi": "hin",
    "Indonesian": "ind",
    "Italian": "ita",
    "Japanese": "jpn",
    "Korean": "kor",
    "Maltese": "mlt",
    "Dutch": "nld",
    "Western Persian": "pes",
    "Polish": "pol",
    "Portuguese": "por",
    "Romanian": "ron",
    "Russian": "rus",
    "Slovak": "slk",
    "Spanish": "spa",
    "Swedish": "swe",
    "Swahili": "swh",
    "Telugu": "tel",
    "Tagalog": "tgl",
    "Thai": "tha",
    "Turkish": "tur",
    "Ukrainian": "ukr",
    "Urdu": "urd",
    "Northern Uzbek": "uzn",
    "Vietnamese": "vie",
}

languages = list(language_dict)


def _prepare_waveform(sample_rate, audio_data):
    """Return *audio_data* as a mono float32 tensor resampled to SAMPLE_RATE.

    Args:
        sample_rate: Sampling rate of ``audio_data`` in Hz.
        audio_data: NumPy array of samples, either 1-D or 2-D. A 2-D array is
            assumed to be laid out as (samples, channels), which is how the
            Gradio Audio component is documented to deliver multi-channel
            recordings -- TODO confirm for the installed Gradio version.

    Returns:
        A 1-D ``torch.float32`` tensor on the CPU.
    """
    waveform = torch.from_numpy(audio_data)
    if waveform.is_floating_point():
        waveform = waveform.to(torch.float32)
    else:
        # Integer PCM: scale into [-1, 1] instead of keeping raw sample
        # magnitudes, which is presumably what the processor expects.
        full_scale = float(torch.iinfo(waveform.dtype).max)
        waveform = waveform.to(torch.float32) / full_scale
    if waveform.ndim == 2:
        # resample() works along the last dimension, so a (samples, channels)
        # array must be collapsed to mono first or it would be resampled
        # across channels rather than across time.
        waveform = waveform.mean(dim=1)
    return torchaudio.functional.resample(
        waveform, orig_freq=sample_rate, new_freq=SAMPLE_RATE
    )


def png(source_lang, target_lang, audio, text):
    """Translate audio or text and return the translation as speech and text.

    Audio takes precedence: ``text`` is only used when ``audio`` is ``None``.

    Args:
        source_lang: Display name of the input language (a key of
            ``language_dict``). Only used for text input.
        target_lang: Display name of the output language (a key of
            ``language_dict``).
        audio: ``(sample_rate, samples)`` tuple, or ``None`` when no audio
            was supplied.
        text: Text in the source language.

    Returns:
        ``((SAMPLE_RATE, waveform), translated_text)`` where ``waveform`` is a
        NumPy array holding the generated speech.

    Raises:
        gr.Error: If a language has not been selected, or if neither audio
            nor non-blank text was provided.
    """
    if source_lang not in language_dict or target_lang not in language_dict:
        raise gr.Error("Please select both a source and a target language.")
    source_lang_code = language_dict[source_lang]
    target_lang_code = language_dict[target_lang]

    if audio is not None:
        sample_rate, audio_data = audio
        waveform = _prepare_waveform(sample_rate, audio_data)
        processed_inputs = processor(
            audios=waveform, sampling_rate=SAMPLE_RATE, return_tensors="pt"
        )
    elif text and text.strip():
        processed_inputs = processor(
            text, src_lang=source_lang_code, return_tensors="pt"
        )
    else:
        raise gr.Error("Please provide audio or text to translate.")

    processed_inputs = processed_inputs.to(DEVICE)

    # One generation pass yields both the speech waveform and the intermediate
    # text tokens, instead of running the model a second time with
    # generate_speech=False just to get the text.
    output = model.generate(
        **processed_inputs,
        tgt_lang=target_lang_code,
        return_intermediate_token_ids=True,
    )
    generated_audio = output.waveform.cpu().numpy().squeeze()
    generated_text = processor.decode(
        output.sequences[0].tolist(), skip_special_tokens=True
    )

    return (SAMPLE_RATE, generated_audio), generated_text


iface = gr.Interface(
    png,
    inputs=[
        gr.Dropdown(languages, label="Source Language"),
        gr.Dropdown(languages, label="Target Language"),
        gr.Audio(),
        gr.Textbox(label="Enter Text in Source Language"),
    ],
    outputs=[
        gr.Audio(label="Translated Audio"),
        gr.Textbox(label="Translated Text"),
    ],
    title="Language Translation App",
    description="Select source and target languages for translation.",
)

iface.launch(debug=True)