import gradio as gr
import librosa
import numpy as np
import soundfile as sf
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

# load the pretrained model and its processor (feature extractor + tokenizer)
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

def speech2text(audio):
    sr, data = audio
    # Gradio's microphone component returns int16 samples: a 1-D array for
    # mono input, (samples, channels) for stereo, so keep the first channel.
    if data.ndim > 1:
        data = data[:, 0]
    # resample to 16 kHz, the rate wav2vec2-base-960h was trained on
    data_16khz = librosa.resample(data.astype(np.float32), orig_sr=sr, target_sr=16000)
    # tokenize; the processor also normalizes the waveform for the model
    input_values = processor(
        [data_16khz], sampling_rate=16000, return_tensors="pt", padding="longest"
    ).input_values  # batch size 1
    # retrieve logits without tracking gradients (inference only)
    with torch.no_grad():
        logits = model(input_values).logits
    # take argmax over the vocabulary and decode to text
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)
    return transcription[0]  # batch size 1

iface = gr.Interface(fn=speech2text, inputs="microphone", outputs="text")
iface.launch()
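
# Optional offline sanity check -- a sketch, assuming a local speech clip at
# "sample.wav" (hypothetical path; any mono WAV works). Run it in place of
# iface.launch() above, since launch() blocks the script.
#
#   data, sr = sf.read("sample.wav", dtype="float32")  # soundfile returns (data, sr)
#   print(speech2text((sr, data)))                     # the function expects (sr, data)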