# Sakil's picture
# Update app.py
# 7b3323f
# raw
# history blame
# 937 Bytes
import librosa
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
import gradio as gr
from transformers import pipeline
import IPython.display as display
import soundfile as sf
_ASR_CHECKPOINT = "facebook/wav2vec2-base-960h"
_ASR_SAMPLE_RATE = 16000  # rate every clip is resampled to before tokenization
_asr_components = {}  # filled on first use so the checkpoint is loaded only once


def _get_asr_components():
    """Return ``(tokenizer, model)``, loading the checkpoint on the first call only."""
    if not _asr_components:
        # Load both into locals first so a failure cannot leave a half-filled cache.
        tokenizer = Wav2Vec2Tokenizer.from_pretrained(_ASR_CHECKPOINT)
        model = Wav2Vec2ForCTC.from_pretrained(_ASR_CHECKPOINT)
        _asr_components["tokenizer"] = tokenizer
        _asr_components["model"] = model
    return _asr_components["tokenizer"], _asr_components["model"]


def _load_waveform(audio_file):
    """Return a mono float waveform sampled at ``_ASR_SAMPLE_RATE``.

    Accepts anything ``librosa.load`` can read (e.g. a file path), or a
    ``(sample_rate, samples)`` tuple of already-decoded audio.
    """
    if isinstance(audio_file, tuple):
        import numpy as np

        source_rate, samples = audio_file
        samples = np.asarray(samples)
        if np.issubdtype(samples.dtype, np.signedinteger):
            # Integer PCM: scale into roughly [-1.0, 1.0] like librosa.load does.
            scale = float(np.iinfo(samples.dtype).max)
            samples = samples.astype(np.float32) / scale
        else:
            samples = samples.astype(np.float32)
        if samples.ndim > 1:
            # assumes a (samples, channels) layout — TODO confirm against the caller
            samples = samples.mean(axis=1)
        if source_rate != _ASR_SAMPLE_RATE:
            samples = librosa.resample(
                samples, orig_sr=source_rate, target_sr=_ASR_SAMPLE_RATE
            )
        return samples

    speech, _ = librosa.load(audio_file, sr=_ASR_SAMPLE_RATE)
    return speech


def speech_text(audio_file):
    """Transcribe an audio clip with the wav2vec2 CTC model.

    Args:
        audio_file: Audio to transcribe. Either a value ``librosa.load``
            accepts (such as a file path), a ``(sample_rate, samples)``
            tuple, or ``None`` when no audio was supplied.

    Returns:
        The decoded transcription string, or ``""`` when ``audio_file`` is
        ``None``.
    """
    if audio_file is None:
        # Nothing was recorded/uploaded; avoid loading the model just to fail.
        return ""

    speech = _load_waveform(audio_file)
    tokenizer, model = _get_asr_components()
    input_values = tokenizer(speech, return_tensors="pt").input_values

    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        # Logits are the non-normalized per-frame token scores.
        logits = model(input_values).logits

    # Greedy decoding: take the highest-scoring token id at every frame.
    predicted_ids = torch.argmax(logits, dim=-1)
    return tokenizer.decode(predicted_ids[0])
# Web UI: a single audio input is passed to speech_text and the returned
# transcription is shown as text.
iface = gr.Interface(
    fn=speech_text,
    inputs="audio",
    outputs="text",
    title="Sakil Transcription",
    description="Transcription",
)
# Start serving as soon as the script runs.
iface.launch(inline=False)