import gradio as gr
import numpy as np
import scipy.io.wavfile
import torch
from whisperspeech.pipeline import Pipeline

# Load the WhisperSpeech pipeline once at startup instead of on every request.
pipe = Pipeline(s2a_ref='collabora/whisperspeech:s2a-q4-base-en+pl.model')


def process_audio(audio_elem, text="This is a test voice generation"):
    # Gradio's audio component passes a (sample_rate, data) tuple; write it
    # out as a WAV file so the pipeline can read it back.
    scipy.io.wavfile.write('input.wav', audio_elem[0], audio_elem[1])

    # Extract the speaker embedding from the uploaded audio.
    speaker = pipe.extract_spk_emb("input.wav")

    # Move the tensor from the GPU to the CPU, convert it to a numpy array,
    # and save it locally as speaker.npz so it can be reused later.
    speaker_np = speaker.cpu().numpy()
    np.savez_compressed("speaker", features=speaker_np)

    try:
        pipe.generate_to_file('test.wav', text, lang='en', cps=10.5, speaker=speaker)
    except Exception as e:
        print("Error:", e)

    return "speaker.npz", "test.wav"


# Define the Gradio interface: audio and text in, speaker embedding file and
# generated audio out.
iface = gr.Interface(
    fn=process_audio,
    inputs=["audio", "text"],
    outputs=["file", "audio"],
)

if __name__ == "__main__":
    iface.launch()
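
# Illustrative sketch (an assumption, not part of the original script): the
# saved speaker.npz can be reloaded later to synthesize more speech without
# re-extracting the embedding. The function name and the default paths below
# are placeholders; the Pipeline calls mirror the ones used in process_audio.
def generate_from_saved_speaker(text, out_path="reuse.wav", npz_path="speaker.npz"):
    # np.load returns the "features" array written by np.savez_compressed above.
    features = np.load(npz_path)["features"]
    # extract_spk_emb returned a torch tensor, so convert back before passing
    # it to generate_to_file (assumes a CPU tensor is accepted here).
    speaker = torch.from_numpy(features)
    pipe.generate_to_file(out_path, text, lang='en', cps=10.5, speaker=speaker)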