import tempfile

import librosa
import streamlit as st
from pydub import AudioSegment
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

# Available seq2seq (Whisper-style) models for Ukrainian speech recognition
available_models = [
    "Yehor/whisper-small-ukrainian",
    "arampacha/whisper-large-uk-2",
]
# Other candidates such as "Yehor/wav2vec2-xls-r-300m-uk-with-3gram-news-lm",
# "Yehor/wav2vec2-xls-r-300m-uk-with-wiki-lm", and
# "theodotus/stt_uk_squeezeformer_ctc_sm" are CTC models and would need a
# different loading path (e.g. AutoModelForCTC) rather than AutoModelForSpeechSeq2Seq.

st.title("Voice Recognition App")

# Model selection dropdown
model_choice = st.selectbox("Choose a model", available_models)


@st.cache_resource
def load_model(model_name):
    # Cache the processor and model so they are not reloaded on every Streamlit rerun
    processor = AutoProcessor.from_pretrained(model_name)
    model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name)
    return processor, model


processor, model = load_model(model_choice)

uploaded_file = st.file_uploader("Choose file", type=["wav", "mp3"])


def map_to_pred(file_path):
    # Load the audio and resample to 16 kHz, the rate Whisper models expect
    audio, _ = librosa.load(file_path, sr=16000)
    # Extract log-mel input features and generate output token ids
    input_features = processor(audio, sampling_rate=16000, return_tensors="pt").input_features
    generated_ids = model.generate(input_features)
    # Decode ids to text; normalize=True applies the tokenizer's text normalization
    transcription = processor.batch_decode(generated_ids, normalize=True, skip_special_tokens=True)
    return transcription[0]


if uploaded_file is not None:
    # Persist the uploaded bytes to a temporary file so pydub can read it
    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
        temp_file.write(uploaded_file.getbuffer())
        temp_file_path = temp_file.name

    # Convert to WAV so mp3 uploads are decoded before transcription
    audio = AudioSegment.from_file(temp_file_path)
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as wav_file:
        temp_wav_path = wav_file.name
    audio.export(temp_wav_path, format="wav")

    # Play back the converted audio and transcribe it
    with open(temp_wav_path, "rb") as f:
        st.audio(f.read(), format="audio/wav")
    text = map_to_pred(temp_wav_path)

    # Display results
    st.write("Input audio:", uploaded_file.name)
    st.write("Predicted transcription:", text)
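
# Usage note: launch with the Streamlit CLI, e.g. `streamlit run app.py`
# (app.py is an assumed filename; substitute the name this script is saved
# under). Decoding mp3 uploads via pydub requires ffmpeg to be installed and
# on the system PATH.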