Spaces:

jsra2
/

prosody-speech

Build error

App Files Files Community

Santiago Roman committed on Mar 1, 2023

Commit

a1f7e54

•

1 Parent(s): 9d111b0

new app

Browse files

Files changed (2) hide show

app.py +52 -63
requirements.txt +4 -0

app.py CHANGED Viewed

@@ -1,87 +1,76 @@
-from matplotlib.pyplot import text
 import numpy as np
-import yaml
 from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
-import torch
-import soundfile as sf
-from datasets import load_dataset
-import gradio as gr
-# Original: https://huggingface.co/spaces/StevenLimcorn/fastspeech2-TTS/blob/main/app.py
-MODEL_NAMES = [
-    "SpeechT5",
-    "Custom"
-]
-speecht5 = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
-vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
-# load xvector containing speaker's voice characteristics from a dataset
-embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
-speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
-MODEL_DICT = {
-    "SpeechT5" : speecht5,
-    "Custom" : None
-}
-def inference(input_text, model_type):
-    # text2mel_name, vocoder_name = model_type.split(" + ")
-    # text2mel_model, vocoder_model = MODEL_DICT[text2mel_name], MODEL_DICT[vocoder_name]
-    text2mel_name = model_type
-    text2mel_model = MODEL_DICT[text2mel_name]
-    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
-    inputs = processor(text=input_text, return_tensors="pt")
-    if text2mel_name == "SpeechT5":
-        speech = text2mel_model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
-    else:
-        raise ValueError("Only SpeechT5 is supported")
-    # # vocoder part
-    # if vocoder_name == "Melgan":
-    #     audio = vocoder_model(mel_outputs)[0, :, 0]
-    # elif vocoder_name == "MB-Melgan":
-    #     audio = vocoder_model(mel_outputs)[0, :, 0]
-    # else:
-    #     raise ValueError("Only MELGAN, MELGAN-STFT and MB_MELGAN are supported on vocoder_name")
-    sound_file = "./speech.wav"
-    sf.write(sound_file, speech.numpy(), samplerate=16000)
-    # if text2mel_name == "TACOTRON":
-    #     return mel_outputs.numpy(), alignment_history.numpy(), audio.numpy()
-    # else:
-    #     return mel_outputs.numpy(), audio.numpy()
-    # sf.write('./audio_after.wav', audio, 22050, "PCM_16")
-    return sound_file
-inputs = [
-    gr.inputs.Textbox(lines=5, label="Input Text"),
-    gr.inputs.Radio(label="Pick a TTS Model",choices=MODEL_NAMES,value = MODEL_NAMES[0])
-]
-outputs =  gr.outputs.Audio(type="filepath", label="Output Audio")
-title = "Prosody Project"
-description = "Gradio demo for Prosody Project"
-# article = "<p style='text-align: center'><a href='https://tensorspeech.github.io/TensorFlowTTS/'>TensorFlowTTS: Real-Time State-of-the-art Speech Synthesis for Tensorflow 2</a> | <a href='https://github.com/TensorSpeech/TensorFlowTTS'>Github Repo</a></p><p>An extension to akhaliq's implementation <a href='https://huggingface.co/spaces/akhaliq/TensorFlowTTS'></p>"
-article = None
 examples = [
-    ["Hi, my name is Santiago."],
-    ["Two bros, chilling in a hot tub, five feet apart because they are not gay."]
 ]
-gr.Interface(inference, inputs, outputs, title=title, description=description, article=article, examples=examples).launch()

+import gradio as gr
+import librosa
 import numpy as np
+import torch
 from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+# Checkpoint identifier passed to both the processor and the TTS model below.
+checkpoint = "microsoft/speecht5_tts"
+processor = SpeechT5Processor.from_pretrained(checkpoint)
+model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
+# Vocoder handed to model.generate_speech() inside predict().
+vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+# NOTE(review): default_voice is not referenced anywhere in the visible code;
+# the Radio widget below repeats the same string literal instead.
+default_voice = "CLB (female)"
+# Maps a three-letter speaker code to the path of a .npy file that predict()
+# reads with np.load(). predict() derives the code from the first three
+# characters of the speaker label it receives.
+speaker_embeddings = {
+    "BDL": "spkemb/cmu_us_bdl_arctic-wav-arctic_a0009.npy",
+    "CLB": "spkemb/cmu_us_clb_arctic-wav-arctic_a0144.npy",
+    "KSP": "spkemb/cmu_us_ksp_arctic-wav-arctic_b0087.npy",
+    "RMS": "spkemb/cmu_us_rms_arctic-wav-arctic_b0353.npy",
+    "SLT": "spkemb/cmu_us_slt_arctic-wav-arctic_a0508.npy",
+}
+def predict(text, speaker):
+    """Synthesize speech for ``text`` with the voice named by ``speaker``.
+
+    Args:
+        text: Text to synthesize. Blank or whitespace-only text produces an
+            empty result without running the model.
+        speaker: Speaker label. Only its first three characters are used, as
+            the key into ``speaker_embeddings``; a label whose prefix is not
+            a key raises ``KeyError``.
+
+    Returns:
+        A ``(16000, samples)`` tuple where ``samples`` is an int16 NumPy
+        array. The first element is presumably the sample rate in Hz expected
+        by the ``gr.Audio(type="numpy")`` output -- TODO confirm.
+    """
+    # Nothing to synthesize: return a zero-length int16 array.
+    if len(text.strip()) == 0:
+        return (16000, np.zeros(0).astype(np.int16))
+    inputs = processor(text=text, return_tensors="pt")
+    # limit input length
+    # Keeps at most model.config.max_text_positions entries along the last axis.
+    input_ids = inputs["input_ids"]
+    input_ids = input_ids[..., :model.config.max_text_positions]
+    # Load the speaker's embedding file and add a leading dimension of size 1.
+    speaker_embedding = np.load(speaker_embeddings[speaker[:3]])
+    speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)
+    speech = model.generate_speech(input_ids, speaker_embedding, vocoder=vocoder)
+    # Scale by 32767 and cast to int16.
+    # NOTE(review): presumably the waveform is float in [-1, 1]; samples
+    # outside that range would not fit in int16 after scaling -- confirm.
+    speech = (speech.numpy() * 32767).astype(np.int16)
+    return (16000, speech)
+# Title and description strings passed to gr.Interface below.
+title = "SpeechT5: Speech Synthesis"
+description = """
+This is the Prosody Project for DT2112 Speech Technology
+"""
+# Each example row is [input text, speaker label], in the same order as the
+# parameters of predict().
 examples = [
+    ["Hi, my name is Santiago", "CLB (female)"],
+    ["Two bros, chilling in a hot tub, five feet apart because they are not gay.", "CLB (female)"]
 ]
+# Build the interface around predict() and launch it.
+gr.Interface(
+    fn=predict,
+    inputs=[
+        gr.Text(label="Input Text"),
+        # NOTE(review): only one speaker is offered here although
+        # speaker_embeddings defines five entries.
+        gr.Radio(label="Speaker", choices=[
+            "CLB (female)"
+        ],
+        value="CLB (female)"),
+    ],
+    outputs=[
+        gr.Audio(label="Generated Speech", type="numpy"),
+    ],
+    title=title,
+    description=description,
+    article=None,
+    examples=examples,
+).launch()

requirements.txt CHANGED Viewed

@@ -3,4 +3,8 @@ sentencepiece
 datasets
 soundfile
 torch
 gradio

 datasets
 soundfile
 torch
+torchaudio
+samplerate
+librosa
+resampy
 gradio