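# NOTE: the three lines below look like web-page header text captured with this file, not code.
# comodoro's picture
# Debug
# 6829dc6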
from transformers import pipeline, AutoFeatureExtractor, AutoTokenizer, Wav2Vec2ForCTC
import gradio as gr
import time
# Checkpoint identifier shared by every from_pretrained() call below.
model_id = 'comodoro/wav2vec2-xls-r-300m-cs-250'

# Load the feature extractor, the CTC model and the tokenizer separately so
# they can be handed to the pipeline explicitly.
feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)
model = Wav2Vec2ForCTC.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Speech-recognition pipeline used by transcribe(); chunk_length_s=5 is
# forwarded to the pipeline as its chunking setting.
p = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=tokenizer,
    feature_extractor=feature_extractor,
    chunk_length_s=5,
)
def transcribe(audio, state=""):
    """Transcribe one recording and append it to the running transcript.

    Args:
        audio: Audio input handed to the speech-recognition pipeline ``p``
            (the interface in this file supplies a file path), or ``None``
            when there is no recording to process.
        state: Transcript accumulated by earlier calls. ``None`` is treated
            as an empty transcript.

    Returns:
        A ``(transcript, transcript)`` tuple containing the updated
        transcript twice: once for display and once to be passed back in
        as ``state`` on the next call.
    """
    # A missing state would make ``state += ...`` raise TypeError.
    if state is None:
        state = ""

    # No audio to recognise: keep the transcript unchanged instead of
    # passing None to the pipeline. Checked before the sleep so this path
    # returns immediately.
    if audio is None:
        return state, state

    # NOTE(review): fixed 2-second pause kept from the original; presumably
    # it throttles live calls — confirm it is still wanted.
    time.sleep(2)
    text = p(audio)["text"]
    state += text + " "
    return state, state
# Build the demo and start serving it: microphone audio (delivered as a file
# path) plus the carried-over transcript go into transcribe(); its two return
# values feed the textbox and the state slot. live=True is passed so the
# interface runs without an explicit submit step.
gr.Interface(
    fn=transcribe,
    inputs=[gr.inputs.Audio(source="microphone", type="filepath"), "state"],
    outputs=["textbox", "state"],
    live=True,
).launch(debug=True)