"""Gradio demo: Dutch automatic speech recognition with punctuation restoration.

Audio from an uploaded file or the microphone is transcribed by a
wav2vec2-BERT ASR pipeline, and the raw transcript is then passed through a
punctuation / capitalization / sentence-segmentation model.
"""

from typing import Optional, Tuple

import gradio as gr
import librosa
import numpy as np
from punctuators.models import PunctCapSegModelONNX
from transformers import pipeline

# Sample rate handed to the ASR pipeline; input audio is resampled to it.
TARGET_SAMPLE_RATE = 16000

transcriber = pipeline(
    "automatic-speech-recognition",
    model="Oysiyl/w2v-bert-2.0-dutch-colab-CV16.0",
)
punct_cap_model = PunctCapSegModelONNX.from_pretrained(
    "1-800-BAD-CODE/xlm-roberta_punctuation_fullstop_truecase"
)


def transcribe(audio: Optional[Tuple[int, np.ndarray]]) -> str:
    """Transcribe an audio clip and restore punctuation and capitalization.

    Args:
        audio: ``(sample_rate, samples)`` as delivered by ``gr.Audio``, or
            ``None`` when nothing was recorded or uploaded. ``samples`` may be
            1-D (mono) or 2-D ``(samples, channels)``.

    Returns:
        The punctuated, capitalized transcript, or an empty string when there
        is no audio or nothing was recognized.
    """
    # Gradio passes None when the user submits without providing any audio.
    if audio is None:
        return ""

    sr, y = audio
    y = np.asarray(y)

    # Down-mix multi-channel audio to mono; the resampler works along the last
    # axis, so a (samples, channels) array would otherwise be resampled across
    # channels instead of across time.
    if y.ndim > 1:
        y = y.mean(axis=1)
    y = y.astype(np.float32)

    if y.size == 0:
        return ""

    # Peak-normalize, but skip it for an all-zero (silent) signal: dividing by
    # a zero peak would turn every sample into NaN.
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak

    if sr != TARGET_SAMPLE_RATE:
        y = librosa.resample(y, orig_sr=sr, target_sr=TARGET_SAMPLE_RATE)

    transcribed_text = transcriber(
        {"sampling_rate": TARGET_SAMPLE_RATE, "raw": y}
    )["text"]

    # Nothing recognized: there is nothing to punctuate.
    if not transcribed_text.strip():
        return ""

    # With apply_sbd=True the model returns, per input text, a list of
    # segmented sentences. Join them all so that sentences after the first are
    # not dropped from the output.
    sentences = punct_cap_model.infer(texts=[transcribed_text], apply_sbd=True)[0]
    return " ".join(sentences)


demo = gr.Interface(
    transcribe,
    gr.Audio(sources=["upload", "microphone"]),
    outputs="text",
    title="Automatic Speech Recognition for Dutch language demo",
    description="Click on the example below, upload audio from file or say something in microphone!",
    examples=[["examples/example1.wav"], ["examples/example2.wav"]],
    cache_examples=True,
)

demo.launch()