# whisper / app.py
# Author: innev
# Last commit: "Replace output result" (6365962)
import gradio as gr
whisper = gr.load("models/openai/whisper-small")
def inference(audio):
# audio = whisper.load_audio(audio)
# audio = whisper.pad_or_trim(audio)
# mel = whisper.log_mel_spectrogram(audio).to(model.device)
# _, probs = model.detect_language(mel)
# options = whisper.DecodingOptions(fp16 = False)
# result = whisper.decode(model, mel, options)
# print(result.text)
# return result.text
return whisper(audio).replace("AutomaticSpeechRecognitionOutput(text='", "").replace("', chunks=None)", "")
title = "Whisper Speech Recognition"
# User-facing copy rendered by the Gradio UI (HTML/Markdown allowed).
# Fixed: the text previously said "openai/whisper-base", but the model this
# app actually loads is openai/whisper-small.
description = """
本例用于演示 <b>openai/whisper-small</b> 模型的语音识别(ASR)能力。基于原始模型开发,没有对模型做微调。 本例默认输出为中文,Whisper识别出的是繁体中文。
Whisper包含多个不同大小的版本,理论来讲模型越大识别效果越好,模型越小速度越快
<b>使用方法:</b> 上传一个音频文件或直接在页面中录制音频。音频会在传递到模型之前转换为单声道并重新采样为16 kHz。
"""
article = """
## 参考
- [Innev GitHub](https://github.com/innev)
"""
# Sample clips bundled with the Space.  Each row is a 3-column record with
# only the middle (filepath) column populated — NOTE(review): the Interface
# below declares a single Audio input, so the None padding columns look like
# a leftover from an earlier multi-input layout; verify against gr.Interface.
examples = [
    [None, clip, None]
    for clip in (
        "examples/zhiqi.wav",
        "examples/zhichu.wav",
        "examples/hmm_i_dont_know.wav",
        "examples/henry5.mp3",
        "examples/yearn_for_time.mp3",
        "examples/see_in_eyes.wav",
    )
]
# Build and launch the Gradio UI.  (An earlier commented-out variant of this
# Interface — without description/examples and with an api_name kwarg — was
# removed as dead code; this live call supersedes it.)
gr.Interface(
    fn=inference,
    inputs=[
        gr.Audio(label="录制语音", type="filepath")
    ],
    outputs=[
        gr.Textbox(label="识别出的文字")
    ],
    title=title,
    description=description,
    article=article,
    examples=examples,
    submit_btn="提交",
    clear_btn="清除",
).launch()