"""Gradio demo: record speech from the microphone and run it through a
wav2vec2 XLS-R speech encoder-decoder checkpoint."""

import os

import gradio as gr
import librosa
from transformers import SpeechEncoderDecoderModel, Wav2Vec2Processor

MODEL_ID = "facebook/wav2vec2-xls-r-300m-en-to-15"

# Sample rate the audio is converted to before it is handed to the processor.
TARGET_SAMPLE_RATE = 16000

# Upper bound on the length of the generated token sequence.
MAX_OUTPUT_LENGTH = 30

# SECURITY: the access token must never be committed to source control.
# It is read from the environment instead; ``None`` (variable unset) means
# "no token". Any token that was previously hard-coded in this file should
# be treated as leaked and revoked.
HF_AUTH_TOKEN = os.environ.get("HF_TOKEN")

processor = Wav2Vec2Processor.from_pretrained(
    MODEL_ID, use_auth_token=HF_AUTH_TOKEN
)
model = SpeechEncoderDecoderModel.from_pretrained(
    MODEL_ID, use_auth_token=HF_AUTH_TOKEN
)


def process_audio_file(file):
    """Load an audio file and convert it into model input values.

    Args:
        file: Path to the audio file to load.

    Returns:
        The ``input_values`` produced by ``processor`` for the audio,
        requested as PyTorch tensors (``return_tensors="pt"``).
    """
    # Let librosa resample while loading so the waveform is always at
    # TARGET_SAMPLE_RATE; this replaces a separate load + resample step.
    data, _ = librosa.load(file, sr=TARGET_SAMPLE_RATE)
    print(data.shape)  # debug aid: number of samples after resampling
    # Passing the sampling rate lets the feature extractor validate it
    # instead of silently assuming the audio matches.
    input_values = processor(
        data, sampling_rate=TARGET_SAMPLE_RATE, return_tensors="pt"
    ).input_values
    return input_values


def transcribe(file):
    """Run the model on a recorded audio file and return the decoded text.

    Args:
        file: Path to the recorded audio file, or ``None``/empty when the
            user submitted without recording anything.

    Returns:
        The decoded text of the first generated sequence, or an empty
        string when no audio was provided.
    """
    if not file:
        # Nothing was recorded; avoid crashing inside librosa.
        return ""

    input_values = process_audio_file(file)
    sequences = model.generate(
        input_values, num_beams=1, max_length=MAX_OUTPUT_LENGTH
    )
    # Drop special tokens so they do not show up in the user-facing text.
    transcription = processor.batch_decode(sequences, skip_special_tokens=True)
    return transcription[0]


# NOTE(review): ``gr.inputs.Audio`` is kept as in the original; confirm it
# matches the installed Gradio version.
iface = gr.Interface(
    fn=transcribe,
    inputs=gr.inputs.Audio(source="microphone", type="filepath"),
    outputs="text",
)

iface.launch()