import gradio as gr

# Local helper modules wrapping the models (see the sketches at the bottom
# of this file).
import speech_to_text
import text_to_speech
import translation

# Languages exposed in the demo: Mooré (mos), French (fra), English (eng).
language_list = ["mos", "fra", "eng"]

demo = gr.Blocks()

# Speech-to-text tab: record or upload audio, pick a language, get a transcript.
mms_stt = gr.Interface(
    fn=speech_to_text.transcribe,
    inputs=[
        gr.Audio(sources=["microphone", "upload"], type="filepath"),
        gr.Dropdown(language_list, label="Language"),
    ],
    outputs="text",
    title="Speech-to-text",
)

# Text-to-speech tab: type text, pick a language, get synthesized audio.
mms_tts = gr.Interface(
    fn=text_to_speech.synthesize_facebook,
    inputs=[
        gr.Text(label="Input text"),
        gr.Dropdown(language_list, label="Language"),
    ],
    outputs=[gr.Audio(label="Generated Audio", type="numpy")],
    title="Text-to-speech",
)

# Translation tab: uses NLLB-style language codes (script suffix included).
mms_translate = gr.Interface(
    fn=translation.translation,
    inputs=[
        gr.Textbox(label="Text", placeholder="Yaa sõama"),
        gr.Dropdown(label="Source Language", choices=["eng_Latn", "fra_Latn", "mos_Latn"]),
        gr.Dropdown(label="Target Language", choices=["eng_Latn", "fra_Latn", "mos_Latn"]),
    ],
    outputs=["text"],
    examples=[["Building a translation demo with Gradio is so easy!", "eng_Latn", "mos_Latn"]],
    title="Translation Demo",
)

# Assemble the three interfaces into a single tabbed app.
with demo:
    gr.TabbedInterface(
        [mms_translate, mms_tts, mms_stt],
        ["Translation", "Text-to-speech", "Speech-to-text"],
    )

demo.launch()
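# ---------------------------------------------------------------------------
# The helper modules imported above are not part of this file. The sketches
# below show one plausible way to implement them with Hugging Face
# Transformers; the checkpoint names ("facebook/mms-1b-all",
# "facebook/mms-tts-<lang>", "facebook/nllb-200-distilled-600M") are
# assumptions, not confirmed by this repo, so swap in whatever models the
# project actually uses. They are kept as comments so this file stays a pure
# Gradio entry point.
#
# speech_to_text.py -- MMS ASR with per-language adapters:
#
#     import torch
#     import librosa
#     from transformers import AutoProcessor, Wav2Vec2ForCTC
#
#     model_id = "facebook/mms-1b-all"  # assumed MMS ASR checkpoint
#     processor = AutoProcessor.from_pretrained(model_id)
#     model = Wav2Vec2ForCTC.from_pretrained(model_id)
#
#     def transcribe(audio_path, lang):
#         speech, _ = librosa.load(audio_path, sr=16_000)  # MMS expects 16 kHz mono
#         processor.tokenizer.set_target_lang(lang)        # switch output vocabulary
#         model.load_adapter(lang)                         # switch adapter weights
#         inputs = processor(speech, sampling_rate=16_000, return_tensors="pt")
#         with torch.no_grad():
#             logits = model(**inputs).logits
#         return processor.decode(torch.argmax(logits, dim=-1)[0])
#
# text_to_speech.py -- MMS VITS synthesis, one checkpoint per language
# (assuming a checkpoint exists for each code in language_list; VITS sampling
# is stochastic, so transformers.set_seed can be called first for
# reproducible audio):
#
#     import torch
#     from transformers import VitsModel, VitsTokenizer
#
#     def synthesize_facebook(text, lang):
#         model_id = f"facebook/mms-tts-{lang}"
#         tokenizer = VitsTokenizer.from_pretrained(model_id)
#         model = VitsModel.from_pretrained(model_id)
#         inputs = tokenizer(text, return_tensors="pt")
#         with torch.no_grad():
#             waveform = model(**inputs).waveform[0]
#         # gr.Audio(type="numpy") expects a (sample_rate, ndarray) tuple
#         return model.config.sampling_rate, waveform.numpy()
#
# translation.py -- NLLB translation via the pipeline API:
#
#     from transformers import pipeline
#
#     def translation(text, src_lang, tgt_lang):
#         translator = pipeline(
#             "translation",
#             model="facebook/nllb-200-distilled-600M",  # assumed NLLB checkpoint
#             src_lang=src_lang,
#             tgt_lang=tgt_lang,
#         )
#         return translator(text)[0]["translation_text"]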