import gradio as gr

# Load the hosted model from the Hugging Face Hub as a callable Gradio app.
whisper = gr.load("models/openai/whisper-small")


def inference(audio):
    # The hosted endpoint sometimes returns the repr of the pipeline output,
    # e.g. "AutomaticSpeechRecognitionOutput(text='...', chunks=None)",
    # rather than the plain transcript, so strip that wrapper.
    result = whisper(audio)
    return result.replace("AutomaticSpeechRecognitionOutput(text='", "").replace("', chunks=None)", "")


title = "Whisper Speech Recognition"

description = """
This demo showcases the speech recognition (ASR) capability of the
<b>openai/whisper-small</b> model. It uses the original model without any fine-tuning.

The default output language is Chinese; Whisper transcribes it as Traditional Chinese.

Whisper ships in several sizes: in principle, larger models recognize speech more
accurately, while smaller models run faster.

<b>Usage:</b> upload an audio file or record audio directly on the page. The audio
is converted to mono and resampled to 16 kHz before being passed to the model.
"""

article = """
## References

- [Innev GitHub](https://github.com/innev)
"""

# One entry per example, matching the interface's single audio input.
examples = [
    ["examples/zhiqi.wav"],
    ["examples/zhichu.wav"],
    ["examples/hmm_i_dont_know.wav"],
    ["examples/henry5.mp3"],
    ["examples/yearn_for_time.mp3"],
    ["examples/see_in_eyes.wav"],
]

gr.Interface(
    fn=inference,
    inputs=[
        gr.Audio(label="Record audio", type="filepath")
    ],
    outputs=[
        gr.Textbox(label="Recognized text")
    ],
    title=title,
    description=description,
    article=article,
    examples=examples,
    submit_btn="Submit",
    clear_btn="Clear",
).launch()
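
# The original file carried commented-out code for running Whisper locally with
# the openai-whisper package instead of the hosted endpoint. Below is a corrected,
# self-contained sketch of that path, kept as a comment so it does not run after
# launch(). It is an illustration, not part of the app: it assumes
# `pip install openai-whisper`, uses the "small" checkpoint to match the hosted
# model above, and imports under an alias so it does not shadow the `whisper`
# handle returned by gr.load().
#
# import whisper as openai_whisper
#
# model = openai_whisper.load_model("small")
#
# def local_inference(audio):
#     # Load the file, then pad/trim it to the 30-second window Whisper expects.
#     audio = openai_whisper.load_audio(audio)
#     audio = openai_whisper.pad_or_trim(audio)
#     # Compute the log-Mel spectrogram on the same device as the model.
#     mel = openai_whisper.log_mel_spectrogram(audio).to(model.device)
#     # Detect the spoken language from the spectrogram.
#     _, probs = model.detect_language(mel)
#     print(f"Detected language: {max(probs, key=probs.get)}")
#     # Decode without fp16 so the sketch also runs on CPU.
#     options = openai_whisper.DecodingOptions(fp16=False)
#     result = openai_whisper.decode(model, mel, options)
#     return result.text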
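
# A minimal sketch of querying this app programmatically with gradio_client,
# which ships alongside Gradio. Assumptions: the app is running locally on
# Gradio's default port, gradio_client >= 1.0 (for handle_file), and "/predict"
# as the default endpoint name of a single-function Interface.
#
# from gradio_client import Client, handle_file
#
# client = Client("http://127.0.0.1:7860/")
# transcript = client.predict(handle_file("examples/zhiqi.wav"), api_name="/predict")
# print(transcript)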