"""Gradio demo: speech-to-text through the Hugging Face Inference API.

The page records audio from the microphone, posts the recording to the
selected OpenAI Whisper model on the Inference API and shows the returned
transcription next to a timestamped history of component events.
"""

import json
import time
import traceback
from datetime import datetime

import gradio as gr
import requests

API_URL = "https://api-inference.huggingface.co/models/"

# Upper bound, in seconds, on a single Inference API call. Without a timeout
# ``requests`` waits indefinitely, so a stalled connection would hold one of
# the queue workers forever.
REQUEST_TIMEOUT_SECONDS = 120


def date_now():
    """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    return datetime.now().strftime("%Y-%m-%d %H:%M:%S")


def record_opt(msg):
    """Return *msg* as one timestamped, newline-terminated history line."""
    return f"{date_now()} {msg}\n"


def _history_appender(message):
    """Build an event handler that appends *message* to the history text.

    The returned callable takes the current history string and returns it
    with a timestamped line for *message* added at the end.
    """

    def append(history):
        return history + record_opt(message)

    return append


def speech_recognize(audio, model_name, access_token, opt):
    """Transcribe a recorded audio file with the Hugging Face Inference API.

    This is a generator so the UI can show progress: it first yields a
    "please wait" placeholder, then the final result.

    Args:
        audio: Path of the recorded audio file to upload.
        model_name: Model identifier appended to ``API_URL``.
        access_token: Hugging Face access token sent as a bearer token.
        opt: Current operation-history text; new entries are appended to it.

    Yields:
        ``(text, history)`` tuples. ``text`` is the placeholder, the
        transcription, or a "Transcription failed" message containing the
        traceback; ``history`` is the updated operation history.
    """
    opt += record_opt("Transcription starts ...")
    yield "Transcribing, please wait..", opt

    start = time.monotonic()
    try:
        # Reading the file inside the ``try`` reports a missing or unreadable
        # recording in the output box instead of crashing the generator.
        with open(audio, "rb") as f:
            data = f.read()

        url = API_URL + model_name
        print(f">>> url is {url}")
        # Strip the token: whitespace pasted along with it would otherwise
        # produce an invalid Authorization header.
        headers = {"Authorization": f"Bearer {access_token.strip()}"}
        response = requests.post(
            url,
            headers=headers,
            data=data,
            timeout=REQUEST_TIMEOUT_SECONDS,
        )
        payload = json.loads(response.content.decode("utf-8"))
        print(f">>> text is {payload}")
        # A payload without a "text" key raises KeyError here and is reported
        # through the handler below.
        text = payload["text"]
    except Exception:
        # Boundary handler: any failure is shown to the user rather than
        # raised. ``Exception`` (not a bare ``except``) lets KeyboardInterrupt,
        # SystemExit and GeneratorExit propagate.
        text = f"Transcription failed:\n{traceback.format_exc()}"

    cost = time.monotonic() - start
    opt += record_opt(f"Transcription ends, time consuming{cost:.3f}s")
    yield text, opt


with gr.Blocks() as demo:
    # NOTE(review): the heading below contains no HTML tags; presumably the
    # original markup was lost. Confirm and restore it if so.
    gr.HTML("""

Automatic Speech Recognition (OpenAI Whisper with Inference API)

""")
    with gr.Row():
        # Adjacent literals keep the description text unchanged while
        # avoiding one very long source line.
        gr.Markdown(
            "🤗 Call the huggingface API and use the OpenAI Whisper model for "
            "speech recognition, which can also be called speech to text"
            "(Speech to Text, STT) "
            "👉 The purpose is to practice using the Gradio Audio component "
            "and explore using the Huggingface Inference API "
            "> 💡Tip: You need to fill in the Huggingface access token to "
            "call the Huggingface Inference API "
        )
    with gr.Row():
        with gr.Column():
            audio = gr.Audio(source="microphone", type="filepath")
            model_name = gr.Dropdown(
                label="Select model",
                choices=[
                    "openai/whisper-large-v3",
                    "openai/whisper-large-v2",
                    "openai/whisper-large",
                    "openai/whisper-medium",
                    "openai/whisper-small",
                    "openai/whisper-base",
                    "openai/whisper-tiny",
                ],
                value="openai/whisper-large-v3",
            )
            # Masked input so the token is not displayed in clear text.
            access_token = gr.Textbox(
                label="Huggingface access token",
                type="password",
            )
        with gr.Column():
            output = gr.Textbox(label="Transcription results")
            operation = gr.Textbox(label="Component operation history")

    # Each of these audio events only appends a line to the history box.
    history_events = (
        (audio.start_recording, "Start recording ..."),
        (audio.play, "Play recording"),
        (audio.pause, "Pause playback"),
        (audio.stop, "Stop play"),
        (audio.end, "Finished playing"),
    )
    for register, message in history_events:
        register(
            _history_appender(message),
            inputs=operation,
            outputs=operation,
        )

    # Finishing a recording triggers the transcription itself.
    audio.stop_recording(
        speech_recognize,
        inputs=[audio, model_name, access_token, operation],
        outputs=[output, operation],
    )

demo.queue(max_size=4, concurrency_count=4)
demo.launch()