# speech2text / app.py
# rensdimmendaal
# lower text..
# 400fcfb
# raw history blame
# No virus
# 920 Bytes
import librosa
import gradio as gr
import numpy as np
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import soundfile as sf
import torch
# Load the pretrained processor and CTC model once at import time; both are
# fetched from the same checkpoint so they stay in sync with each other.
_CHECKPOINT = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(_CHECKPOINT)
model = Wav2Vec2ForCTC.from_pretrained(_CHECKPOINT)
def speech2text(audio):
    """Transcribe a recorded audio clip to lower-case text.

    Args:
        audio: A ``(sample_rate, samples)`` pair, or ``None``. ``samples`` is
            an array that is either 1-D (mono) or 2-D with the channels on
            the second axis; for 2-D input only the first channel is used.

    Returns:
        The lower-cased transcription of the clip, or ``""`` when ``audio``
        is ``None`` or contains no samples.
    """
    target_sr = 16000  # sample rate (Hz) the audio is resampled to

    # NOTE(review): presumably the UI passes None when nothing was recorded;
    # return an empty transcription instead of failing on the unpack below.
    if audio is None:
        return ""

    sr, data = audio
    data = np.asarray(data)
    if data.size == 0:
        # Nothing to resample or feed to the model.
        return ""

    # Mono recordings arrive as a 1-D array and have no channel axis to index;
    # for multi-channel input keep only the first channel.
    if data.ndim > 1:
        data = data[:, 0]
    samples = data.astype(np.float32)

    # Resample to 16 kHz. The rates are passed by keyword because newer
    # librosa releases no longer accept them positionally.
    data_16k = librosa.resample(samples, orig_sr=sr, target_sr=target_sr)

    # Tokenize (batch size 1); state the sample rate explicitly so the
    # processor can check it against what it expects.
    input_values = processor(
        [data_16k],
        sampling_rate=target_sr,
        return_tensors="pt",
        padding="longest",
    ).input_values

    # Inference only: skip autograd bookkeeping while retrieving the logits.
    with torch.no_grad():
        logits = model(input_values).logits

    # Take the argmax over the last dimension and decode the ids to text.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)
    return transcription[0].lower()  # batch size 1
# Build the web UI: microphone audio in, transcribed text out.
iface = gr.Interface(
    fn=speech2text,
    inputs="microphone",
    outputs="text",
)
# Start serving the interface.
iface.launch()