"""Live microphone speech-to-text demo built with Gradio and Transformers.

Loads a Wav2Vec2 CTC checkpoint, wraps it in an automatic-speech-recognition
pipeline and serves a live Gradio interface that appends each newly
transcribed piece of audio to a running transcript.
"""

import time

import gradio as gr
from transformers import (
    AutoFeatureExtractor,
    AutoTokenizer,
    Wav2Vec2ForCTC,
    pipeline,
)

model_id = 'comodoro/wav2vec2-xls-r-300m-cs-250'

# Load the three parts of the checkpoint explicitly so they can be handed to
# the pipeline as ready-made objects.
feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)
model = Wav2Vec2ForCTC.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# chunk_length_s=5 asks the pipeline to process the audio in 5-second chunks.
p = pipeline(
    "automatic-speech-recognition",
    chunk_length_s=5,
    model=model,
    tokenizer=tokenizer,
    feature_extractor=feature_extractor,
)


def transcribe(audio, state=""):
    """Transcribe one recording and append it to the running transcript.

    Args:
        audio: Path to the recorded audio file (the Audio input is declared
            with ``type="filepath"``), or ``None`` when there is no recording.
        state: Transcript accumulated so far for this session.

    Returns:
        A ``(textbox_value, new_state)`` tuple; both elements are the same
        updated transcript string.
    """
    # NOTE(review): defensive normalization - depending on the Gradio version
    # the session state may arrive as None rather than the "" default.
    transcript = state or ""

    # Nothing was recorded: keep the transcript as it is instead of handing
    # None to the pipeline.
    if audio is None:
        return transcript, transcript

    # Pause before running inference; presumably throttles how often the live
    # interface triggers the model - confirm before removing.
    time.sleep(2)

    text = p(audio)["text"]
    transcript = transcript + text + " "
    return transcript, transcript


gr.Interface(
    fn=transcribe,
    inputs=[
        gr.inputs.Audio(source="microphone", type="filepath"),
        "state",
    ],
    outputs=[
        "textbox",
        "state",
    ],
    live=True,
).launch(debug=True)