"""Gradio demo: synthesize speech from text with the facebook/mms-tts-crh model."""

import gradio as gr
import scipy
from transformers import VitsModel, AutoTokenizer
import torch

MODEL_ID = "facebook/mms-tts-crh"

# Loaded once at import time so every request reuses the same weights.
model = VitsModel.from_pretrained(MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)


def predict(image):
    """Synthesize speech for the submitted text.

    Args:
        image: The text typed into the input textbox. The parameter name is
            historical (kept for backward compatibility); the value is a
            string, not an image.

    Returns:
        A ``(sample_rate, waveform)`` tuple, where ``waveform`` is a 1-D
        NumPy array, or ``None`` when the input is empty or only whitespace
        (Gradio then leaves the audio output empty).
    """
    text = image
    # Nothing to synthesize: skip the model instead of feeding it an
    # empty token sequence.
    if not text or not text.strip():
        return None

    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        output = model(**inputs).waveform

    # The waveform is batched; a single prompt yields one item, so take the
    # first and hand Gradio a plain NumPy array together with the model's
    # sampling rate.
    waveform = output[0].cpu().numpy()
    return model.config.sampling_rate, waveform


demo = gr.Interface(
    predict,
    inputs=gr.Textbox(value="", label="Text to synthesize"),
    outputs="audio",
    title="MMS Text-to-Speech (facebook/mms-tts-crh)",
)
demo.launch()