"""Streamlit app: transcribe an uploaded Ukrainian WAV file with Whisper."""

import tempfile

import streamlit as st
import librosa
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

# Whisper checkpoints are trained on 16 kHz audio; everything below must agree.
SAMPLE_RATE = 16_000

uploaded_file = st.file_uploader("上传文件", type="wav")

# NOTE(review): these load on every Streamlit rerun; wrapping the loads in a
# @st.cache_resource function would avoid repeated downloads/initialization.
processor = AutoProcessor.from_pretrained("Yehor/whisper-small-ukrainian")
model = AutoModelForSpeechSeq2Seq.from_pretrained("Yehor/whisper-small-ukrainian")


def map_to_pred(file_path):
    """Transcribe the WAV file at *file_path* and return the normalized text.

    Args:
        file_path: path to a WAV file on disk.

    Returns:
        The model transcription, normalized by the tokenizer.
    """
    # Bug fix: force 16 kHz on load. The original call used librosa's default
    # (22 050 Hz) while declaring sampling_rate=16_000 to the processor, so
    # the features were computed from mismatched audio.
    audio, _ = librosa.load(file_path, sr=SAMPLE_RATE)

    # Preprocess audio into log-mel input features for the model.
    input_features = processor(
        [audio], return_tensors="pt", sampling_rate=SAMPLE_RATE
    ).input_features

    # `inputs=` is deprecated in transformers' generate(); pass positionally.
    generated_ids = model.generate(input_features)
    transcription = processor.batch_decode(
        generated_ids, normalize=True, skip_special_tokens=True
    )
    # NOTE(review): _normalize is a private tokenizer API and may change
    # between transformers versions.
    return processor.tokenizer._normalize(transcription[0])


if uploaded_file is not None:
    # librosa.load needs a real path, so persist the uploaded bytes to a
    # collision-safe temp file instead of a fixed './temp.wav' in the CWD.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        f.write(uploaded_file.getbuffer())
        file_path = f.name

    text = map_to_pred(file_path)

    # Display results.
    st.write('Input audio:', uploaded_file.name)
    st.write('Predicted standard:', text)