from svoice.separate import * import scipy.io as sio from scipy.io.wavfile import write import gradio as gr import os from transformers import AutoProcessor, pipeline from optimum.onnxruntime import ORTModelForSpeechSeq2Seq from glob import glob load_model() BASE_PATH = os.path.dirname(os.path.abspath(__file__)) os.makedirs('input', exist_ok=True) os.makedirs('separated', exist_ok=True) os.makedirs('whisper_checkpoint', exist_ok=True) print("Loading ASR model...") processor = AutoProcessor.from_pretrained("openai/whisper-small") if not os.path.exists("whisper_checkpoint"): model = ORTModelForSpeechSeq2Seq.from_pretrained("openai/whisper-small", from_transformers=True) speech_recognition_pipeline = pipeline( "automatic-speech-recognition", model=model, feature_extractor=processor.feature_extractor, tokenizer=processor.tokenizer, ) model.save_pretrained("whisper_checkpoint") else: model = ORTModelForSpeechSeq2Seq.from_pretrained("whisper_checkpoint", from_transformers=False) speech_recognition_pipeline = pipeline( "automatic-speech-recognition", model=model, feature_extractor=processor.feature_extractor, tokenizer=processor.tokenizer, ) print("Whisper ASR model loaded.") def separator(audio, rec_audio): outputs= {} if audio: write('input/original.wav', audio[0], audio[1]) elif rec_audio: write('input/original.wav', rec_audio[0], rec_audio[1]) separate_demo(mix_dir="./input") separated_files = glob(os.path.join('separated', "*.wav")) separated_files = [f for f in separated_files if "original.wav" not in f] outputs['transcripts'] = [] for file in sorted(separated_files): separated_audio = sio.wavfile.read(file) outputs['transcripts'].append(speech_recognition_pipeline(separated_audio[1])['text']) return sorted(separated_files) + outputs['transcripts'] def set_example_audio(example: list) -> dict: return gr.Audio.update(value=example[0]) demo = gr.Blocks() with demo: gr.Markdown('''
This is a demo for the multiple voice separation algorithm. The algorithm is trained on the LibriMix7 dataset and can be used to separate multiple voices from a single audio file.