"""Gradio demo that transcribes an uploaded WAV file with Mozilla DeepSpeech 0.9.3."""

import os
import urllib.request
import wave

import gradio as gr
import numpy as np
from deepspeech import Model

model_file_path = "deepspeech-0.9.3-models.pbmm"
lm_file_path = "deepspeech-0.9.3-models.scorer"
url = "https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/"


def _ensure_downloaded(file_name):
    """Download *file_name* from the release URL unless it already exists locally.

    The file is fetched under a temporary ``.part`` name and renamed once the
    transfer finishes, so an interrupted download is never mistaken for a
    complete file on the next start.
    """
    if os.path.exists(file_name):
        return
    partial_name = file_name + ".part"
    urllib.request.urlretrieve(url + file_name, filename=partial_name)
    os.replace(partial_name, file_name)


_ensure_downloaded(model_file_path)
_ensure_downloaded(lm_file_path)

# Decoder settings applied to the model below.
beam_width = 100
lm_alpha = 0.93
lm_beta = 1.18

model = Model(model_file_path)
model.enableExternalScorer(lm_file_path)
model.setScorerAlphaBeta(lm_alpha, lm_beta)
model.setBeamWidth(beam_width)


def reformat_freq(sr, y):
    """Convert audio samples to the 16 kHz rate used for recognition.

    Args:
        sr: Sample rate of ``y`` in Hz; must be 16000 or 48000.
        y: Array of audio samples (treated as a flat, single-channel signal).

    Returns:
        A ``(sample_rate, samples)`` tuple. 16 kHz input is returned
        unchanged. 48 kHz input is peak-normalised to the int16 range and
        reduced to 16 kHz by averaging each group of three samples.

    Raises:
        ValueError: If ``sr`` is neither 16000 nor 48000.
    """
    if sr not in (48000, 16000):
        # DeepSpeech only supports 16k (we convert 48k -> 16k).
        raise ValueError("Unsupported rate", sr)
    if sr == 48000:
        samples = np.asarray(y, dtype=np.float64).ravel()
        # Drop the 0-2 trailing samples that do not fill a group of three;
        # otherwise the reshape below raises.
        samples = samples[: samples.size - samples.size % 3]
        # Normalise by the absolute peak: using only the positive maximum
        # lets a larger negative excursion overflow int16.
        peak = max(np.max(np.abs(samples)), 1) if samples.size else 1
        y = (
            ((samples / peak) * 32767)
            .reshape((-1, 3))
            .mean(axis=1)
            .astype("int16")
        )
        sr = 16000
    return sr, y


def transcribe(audio_file):
    """Return the DeepSpeech transcript of a WAV file.

    Args:
        audio_file: Path to a 16-bit PCM WAV file sampled at 16 kHz or
            48 kHz. A falsy value (no file supplied) yields an empty
            transcript.

    Returns:
        The text returned by ``model.stt`` for the decoded audio.

    Raises:
        ValueError: If the file is not 16-bit or has an unsupported rate.
        wave.Error: If the file is not a WAV file the ``wave`` module can read.
    """
    if not audio_file:
        return ""

    # model.stt needs the decoded samples, not the path, so read the file here.
    with wave.open(audio_file, "rb") as wav:
        sample_width = wav.getsampwidth()
        if sample_width != 2:
            raise ValueError("Unsupported sample width", sample_width)
        sample_rate = wav.getframerate()
        channel_count = wav.getnchannels()
        raw_frames = wav.readframes(wav.getnframes())

    # WAV PCM data is little-endian; convert to the native int16 layout.
    samples = np.frombuffer(raw_frames, dtype="<i2").astype(np.int16)
    if channel_count > 1:
        # Frames are interleaved per channel; average them into one channel.
        samples = (
            samples.reshape((-1, channel_count)).mean(axis=1).astype(np.int16)
        )

    _, samples = reformat_freq(sample_rate, samples)
    return model.stt(samples)


demo = gr.Interface(
    transcribe,
    gr.Audio(label="Upload Audio File", source="upload", type="filepath"),
    outputs=gr.Textbox(label="Transcript"),
)

if __name__ == "__main__":
    demo.launch()