import base64
import os
import subprocess

import gradio as gr
import requests
from huggingface_hub.utils import get_token

from model import model_id, transcribe_audio_local

# Resolve the Hugging Face token from the HF_TOKEN env var or the local login cache.
token = get_token()

def read_file_as_base64(file_path: str) -> str:
    with open(file_path, "rb") as f:
        return base64.b64encode(f.read()).decode()
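
# Note: the serverless Inference API accepts audio either as raw bytes in the request
# body or, as done below, base64-encoded inside a JSON payload (the JSON form is used
# here so that extra generation parameters can be sent alongside the audio).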

def transcribe_audio(audio: str) -> str:
    print(f"{audio=}")
    if audio is None:
        raise gr.Error(
            "Please wait a moment for the audio to be uploaded, then click the button again."
        )

    # Resample to 16 kHz mono to keep the uploaded payload small.
    base, _ = os.path.splitext(audio)
    audio_resampled = f"{base}_resampled.mp3"
    subprocess.run(
        [
            "ffmpeg",
            "-y",
            "-i",
            audio,
            "-ac",
            "1",
            "-ar",
            "16000",
            audio_resampled,
        ],
        check=True,
    )
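    # The call above is equivalent to:
    #   ffmpeg -y -i <input> -ac 1 -ar 16000 <input>_resampled.mp3
    # i.e. downmix to one channel, resample to 16 kHz, and overwrite any existing output.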
    b64 = read_file_as_base64(audio_resampled)

    url = f"https://api-inference.huggingface.co/models/{model_id}"
    headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json",
        # Ask the Inference API to wait for the model to load instead of returning a 503.
        "x-wait-for-model": "true",
    }
    data = {
        "inputs": b64,
        "parameters": {
            "generate_kwargs": {
                "return_timestamps": True,
            }
        },
    }
    response = requests.post(url, headers=headers, json=data)
    print(f"{response.text=}")
    out = response.json()
    print(f"{out=}")
    return out["text"]
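
# For reference, a successful response for an ASR model is JSON of the form
# {"text": "..."}; when return_timestamps is enabled it may also carry a "chunks"
# list with per-segment timestamps, but only "text" is used above.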

with gr.Blocks() as demo:
    gr.Markdown("# TWASR: Chinese (Taiwan) Automatic Speech Recognition")
    gr.Markdown("Upload an audio file or record your voice to transcribe it to text.")
    gr.Markdown(
        "The first request may take a while to initialize the model; subsequent requests will be faster."
    )
    with gr.Row():
        audio_input = gr.Audio(
            label="Audio", type="filepath", show_download_button=True
        )
        text_output = gr.Textbox(label="Transcription")
    transcribe_local_button = gr.Button(
        "Transcribe with Transformers", variant="primary"
    )
    transcribe_button = gr.Button("Transcribe with Inference API", variant="secondary")

    transcribe_local_button.click(
        fn=transcribe_audio_local, inputs=[audio_input], outputs=[text_output]
    )
    transcribe_button.click(
        fn=transcribe_audio, inputs=[audio_input], outputs=[text_output]
    )
    gr.Examples(
        [
            ["./examples/audio1.mp3"],
            ["./examples/audio2.mp3"],
        ],
        inputs=[audio_input],
        outputs=[text_output],
        fn=transcribe_audio_local,
        cache_examples=True,
        cache_mode="lazy",
        run_on_click=True,
    )
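    # cache_examples with cache_mode="lazy" defers caching: each example's transcription
    # is computed and stored the first time it is requested rather than at startup.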
    gr.Markdown(
        f"Current model: {model_id}. For more information, visit the [model hub](https://huggingface.co/{model_id})."
    )

if __name__ == "__main__":
    demo.launch()
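
# A minimal client-side sketch (assumes the app is reachable at this URL and that the
# endpoint name follows Gradio's default of the wired function's name; check /?view=api
# on the running app for the actual endpoint names):
#
#   from gradio_client import Client, handle_file
#   client = Client("http://127.0.0.1:7860/")
#   text = client.predict(handle_file("examples/audio1.mp3"), api_name="/transcribe_audio_local")
#   print(text)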