import gradio as gr from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC from datasets import load_dataset import soundfile as sf import torch # load model and tokenizer processor = Wav2Vec2Processor.from_pretrained("nguyenvulebinh/wav2vec2-base-vietnamese-250h") model = Wav2Vec2ForCTC.from_pretrained("nguyenvulebinh/wav2vec2-base-vietnamese-250h") # define function to read in sound file def map_to_array(batch): speech, _ = sf.read(batch["file"]) batch["speech"] = speech return batch # tokenize def recognize_speech(audio): ds = map_to_array({ "file": audio }) input_values = processor(ds["speech"], return_tensors="pt", padding="longest").input_values # Batch size 1 logits = model(input_values).logits predicted_ids = torch.argmax(logits, dim=-1) transcription = processor.batch_decode(predicted_ids) return transcription[0] # launch a simple UI gr.Interface(fn=recognize_speech, inputs="microphone", outputs="text").launch()