import nltk
import librosa
import torch
import gradio as gr
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

nltk.download("punkt")

# Loading the pre-trained model and the processor
model_name = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)


# Loading the speech input as a mono waveform at the 16 kHz
# sampling rate the model expects
def load_data(input_file):
    speech, sample_rate = librosa.load(input_file, sr=16000, mono=True)
    return speech


# Correcting the letter casing by capitalizing the first letter
# of each sentence
def correct_casing(input_sentence):
    sentences = nltk.sent_tokenize(input_sentence)
    return " ".join(s[:1].upper() + s[1:] for s in sentences)


# Getting a transcript of the audio input
def asr_transcript(input_file):
    speech = load_data(input_file)
    # Extract input features from the raw waveform
    input_values = processor(
        speech, sampling_rate=16000, return_tensors="pt"
    ).input_values
    # Run the model without tracking gradients (inference only)
    with torch.no_grad():
        logits = model(input_values).logits
    # Greedy CTC decoding: take the most likely token at each time step
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0])
    return correct_casing(transcription.lower())


# Creating a UI for the model using gr.Interface
gr.Interface(
    fn=asr_transcript,
    inputs=gr.Audio(type="filepath", label="Speaker"),
    outputs=gr.Textbox(label="Output Text"),
    title="ASR using Wav2Vec 2.0",
    description="This application displays transcribed text for given audio input",
    examples=[["Test_File1.wav"], ["Test_File2.wav"], ["Test_File3.wav"]],
).launch()
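
# A minimal sketch of exercising the transcription function directly, without
# the web UI. It assumes one of the example files listed above (e.g.
# Test_File1.wav) exists locally; run it in place of the launch() call:
#
#     print(asr_transcript("Test_File1.wav"))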