"""Gradio demo for TWASR: Chinese (Taiwan) automatic speech recognition.

Offers two transcription paths for an uploaded or recorded audio file:
local inference through ``transcribe_audio_local`` (imported from ``model``)
and remote inference through the Hugging Face Inference API.
"""

import base64
import subprocess
import tempfile
from pathlib import Path

import gradio as gr
import requests
from huggingface_hub.utils import get_token

from model import model_id, transcribe_audio_local

# Hugging Face access token; ``get_token`` may return None when the user is
# not logged in, which is handled when the request headers are built.
token = get_token()

# Upper bound (seconds) for one Inference API call. Generous because the
# request sets "x-wait-for-model", so a first call may wait for the model to
# load.
_REQUEST_TIMEOUT_SECONDS = 300


def read_file_as_base64(file_path: str) -> str:
    """Return the contents of *file_path* encoded as a base64 string."""
    with open(file_path, "rb") as f:
        return base64.b64encode(f.read()).decode()


def _resample_to_16k_mono(src: str, dst: str) -> None:
    """Convert *src* into a 16 kHz mono file at *dst* using ffmpeg.

    Raises:
        gr.Error: If ffmpeg is not installed or the conversion fails.
    """
    command = ["ffmpeg", "-y", "-i", src, "-ac", "1", "-ar", "16000", dst]
    try:
        subprocess.run(command, check=True)
    except FileNotFoundError as err:
        raise gr.Error("ffmpeg is not installed or not on PATH.") from err
    except subprocess.CalledProcessError as err:
        raise gr.Error(
            "Could not convert the audio with ffmpeg. Is it a valid audio file?"
        ) from err


def transcribe_audio(audio: str) -> str:
    """Transcribe an audio file through the Hugging Face Inference API.

    The audio is first resampled to 16 kHz mono to reduce the upload size,
    then sent base64-encoded to the hosted model identified by ``model_id``.

    Args:
        audio: Path to the audio file, as supplied by the ``gr.Audio``
            component (``None`` while the upload has not finished).

    Returns:
        The value of the ``"text"`` field of the API response.

    Raises:
        gr.Error: If no audio is available yet, the ffmpeg conversion fails,
            or the Inference API request fails or returns an unexpected
            payload.
    """
    print(f"{audio=}")
    if audio is None:
        raise gr.Error(
            "Please wait a moment for the audio to be uploaded, then click the button again."
        )

    # Resample to 16k mono to reduce file size. The output goes into a
    # throwaway directory and is named from the input's stem, so the output
    # path can never equal the input path (which happened for non-.mp3
    # inputs such as .wav recordings) and the file is removed afterwards.
    with tempfile.TemporaryDirectory() as workdir:
        resampled = Path(workdir) / f"{Path(audio).stem}_resampled.mp3"
        _resample_to_16k_mono(audio, str(resampled))
        b64 = read_file_as_base64(str(resampled))

    url = f"https://api-inference.huggingface.co/models/{model_id}"
    headers = {
        "Content-Type": "application/json",
        "x-wait-for-model": "true",
    }
    if token:
        # Only authenticate when a token exists; avoids sending "Bearer None".
        headers["Authorization"] = f"Bearer {token}"
    data = {
        "inputs": b64,
        "parameters": {
            "generate_kwargs": {
                # NOTE(review): presumably required for long-form audio;
                # confirm against the model's pipeline configuration.
                "return_timestamps": True,
            }
        },
    }

    try:
        response = requests.post(
            url, headers=headers, json=data, timeout=_REQUEST_TIMEOUT_SECONDS
        )
    except requests.RequestException as err:
        raise gr.Error(f"Could not reach the Inference API: {err}") from err
    print(f"{response.text=}")

    if not response.ok:
        raise gr.Error(
            f"Inference API returned HTTP {response.status_code}: {response.text}"
        )
    try:
        out = response.json()
    except ValueError as err:
        raise gr.Error("Inference API returned a non-JSON response.") from err
    print(f"{out=}")

    # Error payloads carry no "text" key; report them instead of a KeyError.
    if not isinstance(out, dict) or "text" not in out:
        raise gr.Error(f"Unexpected Inference API response: {out}")
    return out["text"]


with gr.Blocks() as demo:
    gr.Markdown("# TWASR: Chinese (Taiwan) Automatic Speech Recognition.")
    gr.Markdown("Upload an audio file or record your voice to transcribe it to text.")
    gr.Markdown(
        "First load may take a while to initialize the model, following requests will be faster."
    )

    with gr.Row():
        audio_input = gr.Audio(
            label="Audio", type="filepath", show_download_button=True
        )
        text_output = gr.Textbox(label="Transcription")

    transcribe_local_button = gr.Button(
        "Transcribe with Transformers", variant="primary"
    )
    transcribe_button = gr.Button("Transcribe with Inference API", variant="secondary")

    # Both buttons read the same audio input and write to the same textbox.
    transcribe_local_button.click(
        fn=transcribe_audio_local, inputs=[audio_input], outputs=[text_output]
    )
    transcribe_button.click(
        fn=transcribe_audio, inputs=[audio_input], outputs=[text_output]
    )

    gr.Examples(
        [
            ["./examples/audio1.mp3"],
            ["./examples/audio2.mp3"],
        ],
        inputs=[audio_input],
        outputs=[text_output],
        fn=transcribe_audio_local,
        cache_examples=True,
        cache_mode="lazy",
        run_on_click=True,
    )

    gr.Markdown(
        f"Current model: {model_id}. For more information, visit the [model hub](https://huggingface.co/{model_id})."
    )


if __name__ == "__main__":
    demo.launch()