# acr / app.py
# roman
# whisper small large uk
# f6eb858
import streamlit as st
import librosa
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
import tempfile
from pydub import AudioSegment
# Speech-to-text checkpoints offered in the model dropdown.
available_models = ["Yehor/whisper-small-ukrainian", "arampacha/whisper-large-uk-2"]
# Disabled alternatives, kept for reference:
# , "Yehor/wav2vec2-xls-r-300m-uk-with-3gram-news-lm", "Yehor/wav2vec2-xls-r-300m-uk-with-wiki-lm"
# available_models = ["theodotus/stt_uk_squeezeformer_ctc_sm", "arampacha/whisper-large-uk-2"]


@st.cache_resource
def _load_model(model_name):
    """Load the processor and seq2seq model for *model_name*, cached per name.

    Streamlit re-executes this script on every widget interaction; without
    caching, both ``from_pretrained`` calls would run again on each rerun.

    Args:
        model_name: Hugging Face model identifier to load.

    Returns:
        A ``(processor, model)`` tuple.
    """
    return (
        AutoProcessor.from_pretrained(model_name),
        AutoModelForSpeechSeq2Seq.from_pretrained(model_name),
    )


st.title("Voice Recognition App")
# Model selection dropdown
model_choice = st.selectbox("Choose a model", available_models)
# Module-level names read by map_to_pred below.
processor, model = _load_model(model_choice)
uploaded_file = st.file_uploader("Choose file", type=["wav", "mp3"])
def map_to_pred(file_path):
    """Transcribe the audio file at *file_path* with the selected model.

    Uses the module-level ``processor`` and ``model``.

    Args:
        file_path: Path to an audio file that ``librosa.load`` can read.

    Returns:
        The normalized transcription of the first decoded sequence.
    """
    # Resample while loading so the waveform really is at the rate declared
    # to the processor; librosa's default (22050 Hz) does not match 16000.
    target_rate = 16000
    audio, _ = librosa.load(file_path, sr=target_rate)
    # Convert the waveform into model input features.
    input_features = processor(
        [audio], return_tensors="pt", sampling_rate=target_rate
    ).input_features
    generated_ids = model.generate(inputs=input_features)
    transcription = processor.batch_decode(
        generated_ids, normalize=True, skip_special_tokens=True
    )
    # NOTE(review): batch_decode already ran with normalize=True, so this
    # extra pass through the tokenizer's private _normalize may be redundant;
    # kept to preserve the existing output - confirm before removing.
    text = processor.tokenizer._normalize(transcription[0])
    return text
if uploaded_file is not None:
    # Work in a private scratch directory: concurrent sessions never share a
    # path, and every intermediate file is removed when the block exits.
    with tempfile.TemporaryDirectory() as work_dir:
        # Save the uploaded bytes to disk so pydub can read them by path.
        with tempfile.NamedTemporaryFile(dir=work_dir, delete=False) as raw_file:
            raw_file.write(uploaded_file.getbuffer())
        # Reserve a .wav path; the handle is closed so export can reopen it.
        with tempfile.NamedTemporaryFile(
            dir=work_dir, suffix=".wav", delete=False
        ) as wav_file:
            temp_wav_path = wav_file.name
        # Convert the upload (wav or mp3) to WAV. export() returns the open
        # output handle, so close it explicitly before the directory cleanup.
        audio = AudioSegment.from_file(raw_file.name)
        audio.export(temp_wav_path, format="wav").close()
        # Play back the original upload using its own MIME type.
        st.audio(uploaded_file, format=uploaded_file.type or "audio/wav")
        # Transcribe the converted WAV rather than the raw upload.
        text = map_to_pred(temp_wav_path)
    # display results
    st.write('Input audio:', uploaded_file.name)
    st.write('Predicted standard:', text)