# speech-to-text / app.py
# micknikolic: "Update app.py" (640b224), 2.32 kB
import io
import logging
import time

import gradio as gr
import librosa
import soundfile as sf
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
# Checkpoint shared by the model, the processor and the pipeline.
model_id = "openai/whisper-large-v3"

# Use the GPU in half precision when CUDA is available; otherwise fall back to
# the CPU in full precision instead of failing on a hard-coded "cuda" device.
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Instantiating the model object.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    pretrained_model_name_or_path=model_id,
    torch_dtype=torch_dtype,
    use_safetensors=True,
)
model = model.to(device)

# Instantiating the processor object.
processor = AutoProcessor.from_pretrained(pretrained_model_name_or_path=model_id)

# Instantiating the transformers pipeline object.
# The already-loaded `model` is handed over (rather than the checkpoint name)
# so the weights are not loaded a second time.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=16,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)
# Defining speech-to-text function.
logger = logging.getLogger(__name__)


def convert(audio, state=""):
    """
    Transcribe a recording and append the text to the running transcript.

    This function is used as the callback of the Gradio Interface.

    Parameters:
    - audio: the recording handed over by the Gradio audio input. That input is
      configured with type="filepath", so this is a path to the recorded file,
      or None when nothing has been recorded.
    - state: a string with the text accumulated by previous conversions. None
      is treated as an empty transcript.

    Returns:
    A (textbox_text, state) tuple. On success both items hold the updated
    transcript. On failure the first item is an error message and the second
    is the transcript, unchanged.
    """
    error_message = "Error processing audio: Please start recording!"
    state = state or ""

    # Nothing recorded: answer immediately instead of sleeping and then
    # provoking an exception inside the pipeline.
    if audio is None:
        return error_message, state

    # NOTE(review): fixed delay kept from the original implementation; its
    # purpose is not evident from this file -- confirm before removing.
    time.sleep(3)

    try:
        transcribed_text = pipe(audio)["text"]
    except Exception:
        # UI boundary: keep the app responsive, but record the real cause so
        # failures other than "no recording" can be diagnosed.
        logger.exception("Speech-to-text conversion failed")
        return error_message, state

    state += transcribed_text + " "
    return state, state
# Building the Gradio user interface.
# Microphone recorder whose output is configured as a file path.
microphone_input = gr.Audio(
    type="filepath",
    sources="microphone",
    label="Please Record Your Speech Here!",
)

# The "state" entry on both sides carries the accumulated transcript from one
# call of `convert` to the next; the textbox displays it.
gr_interface = gr.Interface(
    fn=convert,
    inputs=[microphone_input, "state"],
    outputs=["textbox", "state"],
    title="Automatic Speech-to-Text",
    description="### Record your speech and watch it get converted to text!",
    live=True,
)

# Launching the app with Gradio's default launch settings.
gr_interface.launch()