import os
import gc

import gradio as gr
import numpy as np
import torch
import scipy.signal as sps
import openai as ai
from speechbrain.pretrained import EncoderClassifier
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

from examples import *
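
# Load the SpeechT5 TTS checkpoint, its processor, and the HiFi-GAN vocoder
# that turns generated spectrograms back into waveforms.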
checkpoint = "microsoft/speecht5_tts"
processor = SpeechT5Processor.from_pretrained(checkpoint)
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Read the key from the environment rather than hard-coding a secret.
ai.api_key = os.environ.get("OPENAI_API_KEY")
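
# x-vector speaker encoder; its embedding of the reference audio conditions
# SpeechT5 on the target voice.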
spk_model_name = "speechbrain/spkrec-xvect-voxceleb"

device = "cuda" if torch.cuda.is_available() else "cpu"
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    run_opts={"device": device},
    savedir=os.path.join("/tmp", spk_model_name),
)


def create_speaker_embedding(waveform):
    """Encode a mono waveform into a normalized 512-dim x-vector."""
    with torch.no_grad():
        speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform))
        speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
        speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
    return speaker_embeddings


def prepare_data(temp_text, audio_prompt):
    rate, audio_data = audio_prompt

    # Gradio delivers int16 PCM at the recording's native rate; convert to
    # float32 in [-1, 1], down-mix to mono, and resample to the 16 kHz that
    # both the processor and the speaker encoder expect.
    if audio_data.dtype == np.int16:
        audio_data = audio_data.astype(np.float32) / 32768.0
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)
    if rate != 16000:
        audio_data = sps.resample(audio_data, round(len(audio_data) * 16000 / rate))

    example = processor(
        text=temp_text,
        audio_target=audio_data,
        sampling_rate=16000,
        return_attention_mask=False,
    )
    example["speaker_embeddings"] = create_speaker_embedding(audio_data)
    example_embeddings = torch.tensor(example["speaker_embeddings"]).unsqueeze(0)
    return example_embeddings


def predict(temp_text, temp_audio, record_audio_prompt, prompt_text):
    # Prefer the uploaded template audio; fall back to the microphone recording.
    if temp_audio is not None:
        audio_prompt = temp_audio
    else:
        audio_prompt = record_audio_prompt

    text = prompt_text
    embeddings = prepare_data(temp_text, audio_prompt)
    inputs = processor(text=text, return_tensors="pt")
    spectrogram = model.generate_speech(inputs["input_ids"], embeddings)

    with torch.no_grad():
        speech = vocoder(spectrogram)

    # Scale the float waveform to int16 PCM, the format gr.Audio expects for numpy output.
    speech = (speech.numpy() * 32767).astype(np.int16)
    speech = (16000, speech)

    # Drop intermediates so memory is reclaimed between requests.
    del temp_text, temp_audio, record_audio_prompt, prompt_text, audio_prompt, embeddings, inputs, spectrogram
    gc.collect()
    return text, speech
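

# Gradio UI: template text and reference audio on the left, generated speech on the right.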
app = gr.Blocks()
with app:
    with gr.Row():
        with gr.Column():
            temp_text = gr.Text(label="Template Text")
            temp_audio = gr.Audio(label="Template Speech", type="numpy")
            prompt_text = gr.Text(label="Input Text")
            record_audio_prompt = gr.Audio(label="Recorded Audio Prompt", source="microphone", type="numpy")
        with gr.Column():
            text = gr.Textbox(label="Message")
            speech = gr.Audio(label="Generated Speech", type="numpy")
            btn = gr.Button("Generate!")
            btn.click(predict,
                      inputs=[temp_text, temp_audio, record_audio_prompt, prompt_text],
                      outputs=[text, speech])
    gr.Examples(examples=infer_from_audio_examples,
                inputs=[temp_text, temp_audio, record_audio_prompt, prompt_text],
                outputs=[text, speech],
                fn=predict,
                cache_examples=False)

app.launch()