"""Gradio demo: transcribe recorded speech and translate the transcript with mBART-50."""

import gradio as gr
from transformers import pipeline
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

# Checkpoint and language codes used for translation. The tokenizer is
# configured with SOURCE_LANG; generation is forced to start with TARGET_LANG.
MODEL_NAME = "facebook/mbart-large-50-one-to-many-mmt"
SOURCE_LANG = "en_XX"
TARGET_LANG = "hi_IN"

# Loaded once at module level so every request reuses the same objects.
# NOTE(review): no ASR checkpoint is named here, so the pipeline falls back to
# the library's default for this task - consider pinning one explicitly.
transcribe = pipeline("automatic-speech-recognition")
model = MBartForConditionalGeneration.from_pretrained(MODEL_NAME)
tokenizer = MBart50TokenizerFast.from_pretrained(MODEL_NAME, src_lang=SOURCE_LANG)


def speech_to_text(audio):
    """Transcribe an audio recording and return its translation as a string.

    Args:
        audio: Path to the recorded audio file, as supplied by the
            ``gr.Audio(type="filepath")`` input. ``None`` (nothing recorded)
            is accepted.

    Returns:
        The translated text for the recording, or an empty string when there
        is no audio or the transcript is empty.
    """
    # Nothing was recorded: avoid passing None to the ASR pipeline.
    if audio is None:
        return ""

    text = transcribe(audio)["text"]
    # An empty transcript has nothing to translate.
    if not text or not text.strip():
        return ""

    model_inputs = tokenizer(
        text, return_tensors="pt", padding=True, truncation=True
    )
    generated_tokens = model.generate(
        **model_inputs,
        # Force the first generated token to be the target-language code.
        forced_bos_token_id=tokenizer.lang_code_to_id[TARGET_LANG],
    )
    translation = tokenizer.batch_decode(
        generated_tokens, skip_special_tokens=True
    )
    # batch_decode returns one string per sequence; a single input was given,
    # so hand the "text" output one plain string rather than a list.
    return translation[0] if translation else ""


# NOTE(review): `source="microphone"` is kept exactly as in the original call;
# confirm it matches the installed Gradio version's `gr.Audio` signature.
gr.Interface(
    fn=speech_to_text,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs="text",
).launch()