import streamlit as st
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
import torch
import tempfile
from pydub import AudioSegment
import numpy as np

# Define available models
available_models = [
    "Yehor/whisper-small-ukrainian",
    # Alternative speech-to-text checkpoints:
    # "facebook/s2t-small-mustc-en-fr-st",
    # "facebook/s2t-medium-mustc-en-fr-st",
    # "facebook/s2t-large-mustc-en-fr-st",
]

st.title("Voice Recognition App using SpeechSeq2Seq")
st.write("Upload an audio file and choose a model to transcribe it to text.")

# Model selection dropdown
model_choice = st.selectbox("Choose a SpeechSeq2Seq model", available_models)

# Load the selected model and processor (cached so they are only loaded once per model)
@st.cache_resource
def load_model_and_processor(model_name):
    model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name)
    processor = AutoProcessor.from_pretrained(model_name)
    return model, processor

st.write(f"Loading {model_choice} model...")
model, processor = load_model_and_processor(model_choice)
st.write(f"{model_choice} model loaded successfully.")

# File uploader for the audio file
uploaded_file = st.file_uploader("Choose an audio file", type=["wav", "mp3", "m4a"])

if uploaded_file is not None:
    # Save the uploaded file temporarily
    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
        temp_file.write(uploaded_file.read())
        temp_file_path = temp_file.name

    # Convert the audio to 16 kHz mono 16-bit WAV, the format the processor expects
    audio = AudioSegment.from_file(temp_file_path)
    audio = audio.set_frame_rate(16000).set_channels(1).set_sample_width(2)
    temp_wav_path = tempfile.mktemp(suffix=".wav")
    audio.export(temp_wav_path, format="wav")

    st.audio(uploaded_file, format="audio/wav")
    st.write("Transcribing audio...")

    # Convert samples to float32 in [-1, 1], as the feature extractor expects
    audio_input = np.array(audio.get_array_of_samples()).astype(np.float32)
    audio_input /= np.iinfo(np.int16).max

    # Extract input features for the model
    input_features = processor(
        audio_input, sampling_rate=16000, return_tensors="pt"
    ).input_features

    # Generate the transcription
    with torch.no_grad():
        predicted_ids = model.generate(input_features)

    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

    st.write("Transcription:")
    st.write(transcription)
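
# Usage note (a minimal sketch; the filename app.py is an assumption, not given in the original):
#   streamlit run app.py
# Streamlit then serves the app locally, by default at http://localhost:8501.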