|
import streamlit as st |
|
import librosa |
|
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq |
|
import tempfile |
|
from pydub import AudioSegment |
|
|
|
|
|
# Ukrainian Whisper checkpoints the user can choose between.
available_models = ["Yehor/whisper-small-ukrainian", "arampacha/whisper-large-uk-2"]


@st.cache_resource
def _load_speech_model(model_name):
    """Load and cache the processor/model pair for *model_name*.

    Streamlit re-executes this whole script on every widget interaction;
    without caching, the (large) model would be re-instantiated each time.
    ``st.cache_resource`` keeps one instance per model name for the session.
    """
    processor = AutoProcessor.from_pretrained(model_name)
    model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name)
    return processor, model


st.title("Voice Recognition App")

model_choice = st.selectbox("Choose a model", available_models)

# Keep the same module-level names the rest of the script relies on.
processor, model = _load_speech_model(model_choice)

uploaded_file = st.file_uploader("Choose file", type=["wav", "mp3"])
|
|
|
def map_to_pred(file_path):
    """Transcribe the audio file at *file_path* with the selected Whisper model.

    Parameters
    ----------
    file_path : str
        Path to an audio file readable by librosa.

    Returns
    -------
    str
        The normalized transcription text.
    """
    # BUG FIX: librosa.load defaults to sr=22050, but the features below are
    # built with sampling_rate=16000 (what Whisper models expect). Resample
    # explicitly to 16 kHz so the declared rate matches the actual audio.
    audio, _ = librosa.load(file_path, sr=16000)

    input_features = processor(
        [audio], return_tensors="pt", sampling_rate=16000
    ).input_features

    generated_ids = model.generate(inputs=input_features)
    transcription = processor.batch_decode(
        generated_ids, normalize=True, skip_special_tokens=True
    )
    # _normalize applies Whisper's text normalization (lowercase, punctuation
    # stripping) so output is consistent across models.
    text = processor.tokenizer._normalize(transcription[0])

    return text
|
if uploaded_file is not None:
    # Persist the raw upload to a temp file (with its real extension, so
    # pydub/ffmpeg can sniff the format) instead of blindly naming it .wav —
    # the upload may be an mp3.
    suffix = "." + uploaded_file.name.rsplit(".", 1)[-1]
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
        temp_file.write(uploaded_file.getbuffer())
        temp_file_path = temp_file.name

    # Convert whatever was uploaded to a real WAV so downstream loading is
    # uniform. NamedTemporaryFile replaces the deprecated/insecure
    # tempfile.mktemp used previously.
    audio = AudioSegment.from_file(temp_file_path)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as wav_file:
        temp_wav_path = wav_file.name
    audio.export(temp_wav_path, format="wav")

    st.audio(uploaded_file, format="audio/wav")

    # BUG FIX: the original code converted the file to WAV but then
    # transcribed the raw upload ('./temp.wav') — the conversion result was
    # never used. Transcribe the converted WAV instead.
    text = map_to_pred(temp_wav_path)

    st.write('Input audio:', uploaded_file.name)
    st.write('Predicted standard:', text)