"""Gradio demo: speech-to-text through the Hugging Face Inference API.

Records audio from the microphone, posts the recording to a selectable
OpenAI Whisper model hosted on the Inference API, and displays the
transcription together with a running log of Audio-component events.
"""

import json
import time
import traceback
from datetime import datetime

import gradio as gr
import requests

API_URL = "https://api-inference.huggingface.co/models/"

# Upper bound (seconds) for a single Inference API call, so that a stalled
# connection cannot occupy a queue worker forever.
REQUEST_TIMEOUT = 120


def date_now():
    """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    return datetime.now().strftime("%Y-%m-%d %H:%M:%S")


def record_opt(msg):
    """Return *msg* as one timestamped, newline-terminated log line."""
    return f"{date_now()} {msg}\n"


def speech_recognize(audio, model_name, hf_token, opt):
    """Transcribe a recorded audio file with the Hugging Face Inference API.

    This is a generator used as a Gradio event handler. It yields twice:
    first a "please wait" placeholder, then the final result.

    Args:
        audio: Path of the recorded audio file to upload.
        model_name: Model identifier appended to ``API_URL``.
        hf_token: Hugging Face access token sent as a Bearer token.
        opt: Operation-history text accumulated so far; new log lines are
            appended to it.

    Yields:
        ``(text, opt)`` tuples, where ``text`` is the placeholder, the
        transcription, or a failure message containing the traceback, and
        ``opt`` is the updated operation history.
    """
    opt += record_opt("转录开始 ...")
    yield "转录中,请稍等...", opt

    start = time.monotonic()
    try:
        # The file read lives inside the ``try`` so that a missing or
        # unreadable recording (e.g. ``audio`` is None) is reported in the
        # result box instead of raising out of the handler.
        with open(audio, "rb") as f:
            data = f.read()

        url = API_URL + model_name
        print(f">>> url is {url}")
        headers = {"Authorization": f"Bearer {hf_token}"}
        response = requests.request(
            "POST", url, headers=headers, data=data, timeout=REQUEST_TIMEOUT
        )
        payload = json.loads(response.content.decode("utf-8"))
        print(f">>> text is {payload}")

        # A response without a "text" field is not a transcription; surface
        # the payload itself rather than a bare ``KeyError: 'text'``.
        if not isinstance(payload, dict) or "text" not in payload:
            raise RuntimeError(f"unexpected API response: {payload!r}")
        text = payload["text"]
    except Exception:
        # Best-effort UI: show the failure to the user instead of crashing.
        text = f"转录失败:\n{traceback.format_exc()}"

    cost = time.monotonic() - start
    opt += record_opt(f"转录结束,耗时{cost:.3f}s")
    yield text, opt


with gr.Blocks() as demo:
    gr.HTML("""

Automatic Speech Recognition (OpenAI Whisper with Inference API)

""")
    with gr.Row():
        gr.Markdown(
            """🤗 调用 huggingface API,使用 OpenAI Whisper 模型进行语音识别,也可以称为语音转文本(Speech to Text, STT) 👉 目的是练习使用 Gradio Audio 组件和探索使用 Huggingface Inference API > 💡提示:需要填写 Huggingface token 来调用 Huggingface Inference API """
        )
    with gr.Row():
        with gr.Column():
            audio = gr.Audio(source="microphone", type="filepath")
            model_name = gr.Dropdown(
                label="选择模型",
                choices=[
                    "openai/whisper-large-v3",
                    "openai/whisper-large-v2",
                    "openai/whisper-large",
                    "openai/whisper-medium",
                    "openai/whisper-small",
                    "openai/whisper-base",
                    "openai/whisper-tiny",
                ],
                value="openai/whisper-large-v2",
            )
            hf_token = gr.Textbox(label="Huggingface token")
        with gr.Column():
            output = gr.Textbox(label="转录结果")
            operation = gr.Textbox(label="组件操作历史")

    # Each Audio event appends one timestamped line to the history box.
    audio.start_recording(
        lambda x: x + record_opt("开始录音 ..."),
        inputs=operation,
        outputs=operation,
    )
    audio.play(
        lambda x: x + record_opt("播放录音"),
        inputs=operation,
        outputs=operation,
    )
    audio.pause(
        lambda x: x + record_opt("暂停播放"),
        inputs=operation,
        outputs=operation,
    )
    audio.stop(
        lambda x: x + record_opt("停止播放"),
        inputs=operation,
        outputs=operation,
    )
    audio.end(
        lambda x: x + record_opt("播放完毕"),
        inputs=operation,
        outputs=operation,
    )
    # Finishing a recording triggers the transcription.
    audio.stop_recording(
        speech_recognize,
        inputs=[audio, model_name, hf_token, operation],
        outputs=[output, operation],
    )

demo.queue(max_size=4, concurrency_count=4)
demo.launch()