"""Streamlit app: transcribe an uploaded Ukrainian WAV file with Whisper."""

import tempfile

import streamlit as st
import librosa
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

# Whisper checkpoints are trained on 16 kHz audio; everything below must agree.
SAMPLE_RATE = 16_000

uploaded_file = st.file_uploader("上传文件", type="wav")

# NOTE(review): these load on every Streamlit rerun; wrapping the loads in a
# @st.cache_resource function would avoid repeated downloads/initialization.
processor = AutoProcessor.from_pretrained("Yehor/whisper-small-ukrainian")
model = AutoModelForSpeechSeq2Seq.from_pretrained("Yehor/whisper-small-ukrainian")


def map_to_pred(file_path):
    """Transcribe the WAV file at *file_path* and return the normalized text.

    Args:
        file_path: path to a WAV file on disk.

    Returns:
        The model transcription, normalized by the tokenizer.
    """
    # Bug fix: force 16 kHz on load. The original call used librosa's default
    # (22 050 Hz) while declaring sampling_rate=16_000 to the processor, so
    # the features were computed from mismatched audio.
    audio, _ = librosa.load(file_path, sr=SAMPLE_RATE)

    # Preprocess audio into log-mel input features for the model.
    input_features = processor(
        [audio], return_tensors="pt", sampling_rate=SAMPLE_RATE
    ).input_features

    # `inputs=` is deprecated in transformers' generate(); pass positionally.
    generated_ids = model.generate(input_features)
    transcription = processor.batch_decode(
        generated_ids, normalize=True, skip_special_tokens=True
    )
    # NOTE(review): _normalize is a private tokenizer API and may change
    # between transformers versions.
    return processor.tokenizer._normalize(transcription[0])


if uploaded_file is not None:
    # librosa.load needs a real path, so persist the uploaded bytes to a
    # collision-safe temp file instead of a fixed './temp.wav' in the CWD.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        f.write(uploaded_file.getbuffer())
        file_path = f.name

    text = map_to_pred(file_path)

    # Display results.
    st.write('Input audio:', uploaded_file.name)
    st.write('Predicted standard:', text)