import gradio as gr
from transformers import Wav2Vec2Processor
from transformers import AutoModelForCTC
from conversationalnlp.models.wav2vec2 import Wav2Vec2Predict
from conversationalnlp.models.wav2vec2 import ModelLoader
from conversationalnlp.utils import *
import soundfile as sf
import os
"""
run gradio with
>>python app.py
"""
audiosavepath = os.getcwd()  # directory where the temporary recording is written
pretrained_model = "codenamewei/speech-to-text"
processor = Wav2Vec2Processor.from_pretrained(pretrained_model)
model = AutoModelForCTC.from_pretrained(pretrained_model)
modelloader = ModelLoader(model, processor)
predictor = Wav2Vec2Predict(modelloader)
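# The predictor's file-batch API is used in greet() below; a quick smoke test
# against the bundled examples might look like the following (predictfiles and
# its returned keys are inferred from the usage in greet, otherwise assumed):
# results = predictor.predictfiles(examples)
# print(results["predicted_text"], results["corrected_text"])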
examples = ["example1.flac", "example2.flac", "example3.flac"]
def greet(audioarray):
    """
    Transcribe audio from the Gradio audio component.

    `audioarray` is a (samplerate, data) tuple, e.g.:
    (16000, array([ -5277184, 326400, -120320, ..., -5970432, -12745216,
           -6934528], dtype=int32))
    """
    audioabspath = os.path.join(audiosavepath, "temp.wav")

    # WORKAROUND: save to file and reread to get the array shape
    # expected by the predictor
    sf.write(audioabspath, audioarray[1], audioarray[0])
    print(f"Audio saved to {audioabspath}")

    predictiontexts = predictor.predictfiles([audioabspath])

    outputtext = predictiontexts["predicted_text"][-1] + \
        "\n" + predictiontexts["corrected_text"][-1]

    return outputtext
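# Hypothetical alternative to the save-and-reread workaround in greet(): feed
# the in-memory array straight to the processor/model pair via the standard
# transformers Wav2Vec2 CTC API. A minimal sketch only; it assumes the model
# expects 16 kHz mono input, that Gradio delivers an integer PCM array, and it
# is untested against this Space's checkpoint (greet_inmemory is not wired
# into the Interface below).
def greet_inmemory(audioarray):
    import numpy as np
    import torch

    samplerate, data = audioarray
    # Scale integer PCM to float32 in [-1, 1]; np.iinfo assumes an int dtype
    floatdata = data.astype(np.float32) / np.iinfo(data.dtype).max
    # The feature extractor raises if samplerate differs from its configured
    # 16 kHz, so mismatched recordings would need resampling first
    inputs = processor(floatdata, sampling_rate=samplerate, return_tensors="pt")
    with torch.no_grad():
        logits = model(inputs.input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(predicted_ids)[0]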
demo = gr.Interface(fn=greet,
                    inputs="audio",
                    outputs="text",
                    title="Speech-to-Text",
                    examples=examples)
demo.launch()  # pass share=True for a public link