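# app3.py: a Streamlit speech-to-text demo.
# Launch with: streamlit run app3.py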
import streamlit as st
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
import torch
import tempfile
from pydub import AudioSegment
import numpy as np
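# NOTE: pydub decodes mp3/m4a input via an ffmpeg installation on the host machine.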
# Define available models
# available_models = [
# "facebook/s2t-small-mustc-en-fr-st",
# "facebook/s2t-medium-mustc-en-fr-st",
# "facebook/s2t-large-mustc-en-fr-st"
# ]
available_models = ["Yehor/whisper-small-ukrainian"]
st.title("Voice Recognition App using SpeechSeq2Seq")
st.write("Upload an audio file and choose a model to transcribe it to text.")
# Model selection dropdown
model_choice = st.selectbox("Choose a SpeechSeq2Seq model", available_models)
# Load the selected model and processor
@st.cache_resource
def load_model_and_processor(model_name):
    # Cached so the model and processor are loaded only once per session
    model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name)
    processor = AutoProcessor.from_pretrained(model_name)
    return model, processor
st.write(f"Loading {model_choice} model...")
model, processor = load_model_and_processor(model_choice)
st.write(f"{model_choice} model loaded successfully.")
# File uploader for audio file
uploaded_file = st.file_uploader("Choose an audio file", type=["wav", "mp3", "m4a"])
if uploaded_file is not None:
    # Save the uploaded file to a temporary file on disk.
    # getvalue() leaves the upload buffer's read position untouched,
    # so st.audio() below can still play the file.
    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
        temp_file.write(uploaded_file.getvalue())
        temp_file_path = temp_file.name

    # Re-encode to WAV so the audio can be loaded uniformly below
    # (NamedTemporaryFile replaces the deprecated, race-prone tempfile.mktemp)
    audio = AudioSegment.from_file(temp_file_path)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_wav:
        temp_wav_path = temp_wav.name
    audio.export(temp_wav_path, format="wav")

    st.audio(uploaded_file, format="audio/wav")
    st.write("Transcribing audio...")
    # Load the converted audio as 16 kHz mono 16-bit, the input format Whisper expects
    audio_input = (
        AudioSegment.from_file(temp_wav_path)
        .set_frame_rate(16000)
        .set_channels(1)
        .set_sample_width(2)
    )
    # Scale int16 samples to float32 in [-1.0, 1.0]; the Whisper feature
    # extractor applies its own normalization, so no mean/std scaling is needed
    samples = np.array(audio_input.get_array_of_samples()).astype(np.float32) / 32768.0

    # Whisper processors expose log-mel spectrograms as `input_features`
    input_features = processor(samples, sampling_rate=16000, return_tensors="pt").input_features

    # Generate token IDs and decode them to text
    # (HF seq2seq models have no .transcribe() method; generate() is the API)
    with torch.no_grad():
        predicted_ids = model.generate(input_features.to(model.device))
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

    st.write("Transcription:")
    st.write(transcription)