import re

import gradio as gr

# Load the openai/whisper-small model hosted on the Hugging Face Hub.
whisper = gr.load("models/openai/whisper-small")

def inference(audio):
    """Transcribe an audio file with the hosted Whisper model."""
    # The hosted endpoint may return a string of the form
    # "AutomaticSpeechRecognitionOutput(text='...', chunks=None)".
    # Extract the transcription instead of relying on exact
    # prefix/suffix matches; fall back to the raw result otherwise.
    result = whisper(audio)
    match = re.search(r"text='(.*)'", result, re.DOTALL)
    return match.group(1) if match else result
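
# For reference: a minimal sketch of the local-inference path that the
# commented-out code in earlier revisions of this function pointed at, assuming
# the openai-whisper package is installed (pip install openai-whisper). It is
# illustrative and never called; the demo transcribes via the hosted model.
def local_inference(audio_path):
    import whisper as openai_whisper  # aliased to avoid clashing with the gr.load handle above
    model = openai_whisper.load_model("small")
    # Load the audio, then pad or trim it to Whisper's 30-second window.
    audio = openai_whisper.load_audio(audio_path)
    audio = openai_whisper.pad_or_trim(audio)
    # Compute the log-Mel spectrogram on the model's device.
    mel = openai_whisper.log_mel_spectrogram(audio).to(model.device)
    # Detect the spoken language (probs maps language codes to probabilities).
    _, probs = model.detect_language(mel)
    # Decode without fp16 so the sketch also runs on CPU.
    options = openai_whisper.DecodingOptions(fp16=False)
    result = openai_whisper.decode(model, mel, options)
    return result.text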

title = "Whisper Speech Recognition"

description = """
This demo showcases the speech recognition (ASR) capability of the <b>openai/whisper-small</b> model. It is built on the original model with no fine-tuning. By default the output is Chinese, and Whisper produces Traditional Chinese.

Whisper ships in several sizes; in principle, larger models transcribe more accurately and smaller models run faster.

<b>Usage:</b> upload an audio file or record audio directly on the page. The audio is converted to mono and resampled to 16 kHz before being passed to the model.
"""

article = """
## References
- [Innev GitHub](https://github.com/innev)
"""

# One entry per example; the interface has a single Audio input.
examples = [
    ["examples/zhiqi.wav"],
    ["examples/zhichu.wav"],
    ["examples/hmm_i_dont_know.wav"],
    ["examples/henry5.mp3"],
    ["examples/yearn_for_time.mp3"],
    ["examples/see_in_eyes.wav"],
]


gr.Interface(
    fn=inference,
    inputs=[
        gr.Audio(label="Record or upload audio", type="filepath")
    ],
    outputs=[
        gr.Textbox(label="Recognized text")
    ],
    title=title,
    description=description,
    article=article,
    examples=examples,
    submit_btn="Submit",
    clear_btn="Clear",
).launch()
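
# Note: when running outside a Hugging Face Space, launch(share=True) can be
# used to expose a temporary public URL for the demo.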