import gradio as gr
import numpy as np
import torch
import scipy.io.wavfile as wavfile
from transformers import AutoProcessor, SeamlessM4TModel, pipeline

# Earlier text-to-speech experiment with SeamlessM4T, kept for reference:
# tokenizer = AutoProcessor.from_pretrained("facebook/hf-seamless-m4t-medium")
# model = SeamlessM4TModel.from_pretrained("facebook/hf-seamless-m4t-medium")
# text = "some example text in the English language"
#
# def greet(text):
#     inputs = tokenizer(text, return_tensors="pt")
#     with torch.no_grad():
#         output = model(**inputs, decoder_input_ids=inputs["input_ids"]).waveform
#     out = output[0]
#     wavfile.write("tmp.wav", rate=16000, data=out)
#     return open("tmp.wav", "rb").read()

# Load the ASR pipeline once at startup instead of on every request.
transcriber = pipeline("automatic-speech-recognition", model="facebook/hubert-large-ls960-ft")


def stt(audio):
    # Gradio's audio input yields a (sample_rate, data) tuple with int16 samples.
    sample_rate, data = audio
    # Convert to mono float32 in [-1, 1], the format the pipeline expects.
    data = data.astype(np.float32)
    if data.ndim > 1:
        data = data.mean(axis=1)
    peak = np.max(np.abs(data))
    if peak > 0:
        data /= peak
    result = transcriber({"sampling_rate": sample_rate, "raw": data})
    return result["text"]


iface = gr.Interface(fn=stt, inputs="audio", outputs="text")
iface.launch()
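
# A minimal, hedged sketch (kept commented out so it does not affect the running
# app): if the text-to-speech path above is ever revived, SeamlessM4T's
# generate() method is the documented way to obtain a waveform. The src_lang /
# tgt_lang values, the tts() helper name, and the 16 kHz output rate are
# assumptions to verify against the facebook/hf-seamless-m4t-medium model card.
#
# def tts(text):
#     processor = AutoProcessor.from_pretrained("facebook/hf-seamless-m4t-medium")
#     model = SeamlessM4TModel.from_pretrained("facebook/hf-seamless-m4t-medium")
#     inputs = processor(text=text, src_lang="eng", return_tensors="pt")
#     with torch.no_grad():
#         waveform = model.generate(**inputs, tgt_lang="eng")[0].cpu().numpy().squeeze()
#     # Gradio's audio output component accepts a (sample_rate, numpy_array) tuple.
#     return 16000, waveform
#
# tts_iface = gr.Interface(fn=tts, inputs="text", outputs="audio")
# tts_iface.launch()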