RASMUS's picture
Update app.py
3d187fe verified
raw
history blame contribute delete
No virus
4.2 kB
import gradio as gr
import librosa
import soundfile as sf
import torch
import warnings
import os
from transformers import Wav2Vec2ProcessorWithLM, Wav2Vec2CTCTokenizer
warnings.filterwarnings("ignore")
#load wav2vec2 tokenizer and model
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from fastapi import FastAPI, HTTPException, File
from transformers import pipeline
pipe_95m = pipeline(model="Finnish-NLP/wav2vec2-base-fi-voxpopuli-v2-finetuned",chunk_length_s=20, stride_length_s=(3, 3))
pipe_300m = pipeline(model="Finnish-NLP/wav2vec2-large-uralic-voxpopuli-v2-finnish",chunk_length_s=20, stride_length_s=(3, 3))
pipe_1b = pipeline(model="Finnish-NLP/wav2vec2-xlsr-1b-finnish-lm-v2",chunk_length_s=20, stride_length_s=(3, 3))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_checkpoint = 'Finnish-NLP/t5-small-nl24-casing-punctuation-correction'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)#, use_auth_token=os.environ.get('hf_token'))
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, from_flax=False, torch_dtype=torch.float32, use_auth_token=os.environ.get('hf_token')).to(device)
# define speech-to-text function
def asr_transcript(audio, audio_microphone, model_params):
audio = audio_microphone if audio_microphone else audio
if audio == None and audio_microphone == None:
return "Please provide audio (wav or mp3) by uploading a file or by recording audio using microphone by pressing Record (And allow usage of microphone)", "Please provide audio by uploading a file or by recording audio using microphone by pressing Record (And allow usage of microphone)"
text = ""
if audio:
if model_params == "1 billion":
text = pipe_1b(audio.name)
elif model_params == "300 million":
text = pipe_300m(audio.name)
elif model_params == "95 million":
text = pipe_95m(audio.name)
input_ids = tokenizer(text['text'], return_tensors="pt").input_ids.to(device)
outputs = model.generate(input_ids, max_length=128)
case_corrected_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
return text['text'], case_corrected_text
else:
return "File not valid"
gradio_ui = gr.Interface(
fn=asr_transcript,
title="Finnish Automatic Speech Recognition",
description="Upload an audio clip or record from browser using microphone, and let AI do the hard work of transcribing.",
article = """
This demo includes 2 kinds of models that are run together. First selected ASR model does speech recognition which produces lowercase text without punctuation.
After that we run a sequence-to-sequence model which tries to correct casing and punctuation which produces the final output.
You can select one of two speech recognition models listed below
1. 1 billion, best accuracy but slowest by big margin. Based on multilingual wav2vec2-xlsr model by Meta. More info here https://huggingface.co/Finnish-NLP/wav2vec2-xlsr-1b-finnish-lm-v2
3. 300 million, at bar in accuracy as 1. but a lot faster. Based on Uralic wav2vec2 model by Meta. More info here https://huggingface.co/Finnish-NLP/wav2vec2-large-uralic-voxpopuli-v2-finnish
3. 95 million, almost as accurate as 1. but really much faster. Based on Finnish wav2vec2 model by Meta. More info here https://huggingface.co/Finnish-NLP/wav2vec2-base-fi-voxpopuli-v2-finetuned
More info about the casing+punctuation correction model can be found here https://huggingface.co/Finnish-NLP/t5-small-nl24-casing-punctuation-correction
""",
inputs=[gr.inputs.Audio(label="Upload Audio File", type="file", optional=True), gr.inputs.Audio(source="microphone", type="file", optional=True, label="Record from microphone"), gr.inputs.Dropdown(choices=["95 million","300 million", "1 billion"], type="value", default="300 million", label="Select speech recognition model parameter amount", optional=False)],
outputs=[gr.outputs.Textbox(label="Recognized speech"),gr.outputs.Textbox(label="Recognized speech with case correction and punctuation")]
)
gradio_ui.launch()