"""Gradio demo that transcribes an audio clip with a pretrained Wav2Vec2 CTC model."""

import functools

import gradio as gr
import IPython.display as display
import librosa
import soundfile as sf
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
from transformers import pipeline

# Checkpoint used for both the tokenizer and the acoustic model.
MODEL_NAME = "facebook/wav2vec2-base-960h"

# Sample rate (Hz) the audio is resampled to before being fed to the model.
SAMPLE_RATE = 16000


@functools.lru_cache(maxsize=1)
def _load_model():
    """Load the tokenizer and model once and reuse them for every request.

    Returns:
        A ``(tokenizer, model)`` tuple for ``MODEL_NAME``.
    """
    tokenizer = Wav2Vec2Tokenizer.from_pretrained(MODEL_NAME)
    model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
    return tokenizer, model


def speech_text(audio_file):
    """Transcribe an audio file to text.

    Args:
        audio_file: Audio source accepted by ``librosa.load`` (e.g. a file
            path). ``None`` (no audio supplied) yields an empty transcription.
            NOTE(review): depending on the installed Gradio version, the
            ``"audio"`` input shortcut may pass a ``(sample_rate, samples)``
            tuple instead of a path -- confirm against the Gradio version in
            use.

    Returns:
        The decoded transcription string, or ``""`` when no audio was given.
    """
    if audio_file is None:
        return ""

    tokenizer, model = _load_model()

    # librosa resamples to SAMPLE_RATE while loading.
    speech, _ = librosa.load(audio_file, sr=SAMPLE_RATE)

    input_values = tokenizer(speech, return_tensors="pt").input_values

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        # Logits are the non-normalized per-frame predictions.
        logits = model(input_values).logits

    # Greedy decoding: most likely token id per frame.
    predicted_ids = torch.argmax(logits, dim=-1)

    # Single clip in the batch, so decode the first (only) sequence.
    return tokenizer.decode(predicted_ids[0])


iface = gr.Interface(
    speech_text,
    inputs="audio",
    outputs="text",
    title='Sakil Transcription',
    description="Transcription",
)
iface.launch(inline=False)