Spaces:
Build error
Build error
Santiago Roman
committed on
Commit
•
a1f7e54
1
Parent(s):
9d111b0
new app
Browse files
- app.py +52 -63
- requirements.txt +4 -0
app.py
CHANGED
@@ -1,87 +1,76 @@
|
|
1 |
-
|
|
|
2 |
import numpy as np
|
3 |
-
import
|
4 |
|
5 |
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
|
6 |
-
import torch
|
7 |
-
import soundfile as sf
|
8 |
-
from datasets import load_dataset
|
9 |
|
10 |
-
import gradio as gr
|
11 |
|
|
|
|
|
|
|
|
|
12 |
|
13 |
-
|
14 |
|
15 |
-
MODEL_NAMES = [
|
16 |
-
"SpeechT5",
|
17 |
-
"Custom"
|
18 |
-
]
|
19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
|
22 |
-
|
23 |
-
|
|
|
24 |
|
25 |
-
|
26 |
-
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
|
27 |
-
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
|
28 |
|
|
|
|
|
|
|
29 |
|
30 |
-
MODEL_DICT = {
|
31 |
-
"SpeechT5" : speecht5,
|
32 |
-
"Custom" : None
|
33 |
-
}
|
34 |
-
|
35 |
-
def inference(input_text, model_type):
|
36 |
-
# text2mel_name, vocoder_name = model_type.split(" + ")
|
37 |
-
# text2mel_model, vocoder_model = MODEL_DICT[text2mel_name], MODEL_DICT[vocoder_name]
|
38 |
-
text2mel_name = model_type
|
39 |
-
text2mel_model = MODEL_DICT[text2mel_name]
|
40 |
-
|
41 |
-
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
|
42 |
-
inputs = processor(text=input_text, return_tensors="pt")
|
43 |
-
|
44 |
-
if text2mel_name == "SpeechT5":
|
45 |
-
speech = text2mel_model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
|
46 |
-
else:
|
47 |
-
raise ValueError("Only SpeechT5 is supported")
|
48 |
-
|
49 |
-
# # vocoder part
|
50 |
-
# if vocoder_name == "Melgan":
|
51 |
-
# audio = vocoder_model(mel_outputs)[0, :, 0]
|
52 |
-
# elif vocoder_name == "MB-Melgan":
|
53 |
-
# audio = vocoder_model(mel_outputs)[0, :, 0]
|
54 |
-
# else:
|
55 |
-
# raise ValueError("Only MELGAN, MELGAN-STFT and MB_MELGAN are supported on vocoder_name")
|
56 |
|
|
|
57 |
|
58 |
-
|
59 |
-
sf.write(sound_file, speech.numpy(), samplerate=16000)
|
60 |
|
61 |
-
|
62 |
-
# return mel_outputs.numpy(), alignment_history.numpy(), audio.numpy()
|
63 |
-
# else:
|
64 |
-
# return mel_outputs.numpy(), audio.numpy()
|
65 |
|
66 |
-
|
67 |
-
return
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
gr.inputs.Radio(label="Pick a TTS Model",choices=MODEL_NAMES,value = MODEL_NAMES[0])
|
72 |
-
]
|
73 |
|
74 |
-
|
|
|
|
|
75 |
|
76 |
|
77 |
-
title = "Prosody Project"
|
78 |
-
description = "Gradio demo for Prosody Project"
|
79 |
-
# article = "<p style='text-align: center'><a href='https://tensorspeech.github.io/TensorFlowTTS/'>TensorFlowTTS: Real-Time State-of-the-art Speech Synthesis for Tensorflow 2</a> | <a href='https://github.com/TensorSpeech/TensorFlowTTS'>Github Repo</a></p><p>An extension to akhaliq's implementation <a href='https://huggingface.co/spaces/akhaliq/TensorFlowTTS'></p>"
|
80 |
-
article = None
|
81 |
|
82 |
examples = [
|
83 |
-
["Hi, my name is Santiago
|
84 |
-
["Two bros, chilling in a hot tub, five feet apart because they are not gay."]
|
85 |
]
|
86 |
|
87 |
-
gr.Interface(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import librosa
|
3 |
import numpy as np
|
4 |
+
import torch
|
5 |
|
6 |
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
|
|
|
|
|
|
|
7 |
|
|
|
8 |
|
9 |
+
checkpoint = "microsoft/speecht5_tts"
|
10 |
+
processor = SpeechT5Processor.from_pretrained(checkpoint)
|
11 |
+
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
|
12 |
+
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
|
13 |
|
14 |
+
default_voice = "CLB (female)"
|
15 |
|
|
|
|
|
|
|
|
|
16 |
|
17 |
+
speaker_embeddings = {
|
18 |
+
"BDL": "spkemb/cmu_us_bdl_arctic-wav-arctic_a0009.npy",
|
19 |
+
"CLB": "spkemb/cmu_us_clb_arctic-wav-arctic_a0144.npy",
|
20 |
+
"KSP": "spkemb/cmu_us_ksp_arctic-wav-arctic_b0087.npy",
|
21 |
+
"RMS": "spkemb/cmu_us_rms_arctic-wav-arctic_b0353.npy",
|
22 |
+
"SLT": "spkemb/cmu_us_slt_arctic-wav-arctic_a0508.npy",
|
23 |
+
}
|
24 |
|
25 |
|
26 |
+
def predict(text, speaker):
    """Synthesize speech for ``text`` in the voice selected by ``speaker``.

    Args:
        text: Text to speak. ``None``, empty, or whitespace-only input
            yields an empty clip instead of running the model.
        speaker: Speaker label; its first three characters are used as the
            key into ``speaker_embeddings`` (e.g. ``"CLB (female)"`` ->
            ``"CLB"``).

    Returns:
        A ``(sample_rate, samples)`` tuple where ``sample_rate`` is 16000
        and ``samples`` is a NumPy ``int16`` array.

    Raises:
        KeyError: If the three-character speaker prefix is not a key of
            ``speaker_embeddings``.
    """
    # Nothing to synthesize: hand back an empty int16 clip. The ``not text``
    # guard also covers ``None``, which would otherwise fail on ``.strip()``.
    if not text or not text.strip():
        return (16000, np.zeros(0, dtype=np.int16))

    inputs = processor(text=text, return_tensors="pt")

    # Limit input length to what the model's configuration allows.
    input_ids = inputs["input_ids"][..., :model.config.max_text_positions]

    # The embedding is stored on disk as a .npy file; add a leading axis
    # before passing it to the model.
    speaker_embedding = np.load(speaker_embeddings[speaker[:3]])
    speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)

    speech = model.generate_speech(input_ids, speaker_embedding, vocoder=vocoder)

    # Scale to the int16 range. This assumes the vocoder emits floats
    # nominally in [-1, 1] (TODO confirm); clipping first keeps any stray
    # out-of-range sample from wrapping around in the integer cast.
    waveform = np.clip(speech.numpy(), -1.0, 1.0)
    speech = (waveform * 32767).astype(np.int16)
    return (16000, speech)
|
45 |
+
|
46 |
+
|
47 |
+
title = "SpeechT5: Speech Synthesis"
|
|
|
|
|
48 |
|
49 |
+
description = """
|
50 |
+
This is the Prosody Project for DT2112 Speech Technology
|
51 |
+
"""
|
52 |
|
53 |
|
|
|
|
|
|
|
|
|
54 |
|
55 |
examples = [
|
56 |
+
["Hi, my name is Santiago", "CLB (female)"],
|
57 |
+
["Two bros, chilling in a hot tub, five feet apart because they are not gay.", "CLB (female)"]
|
58 |
]
|
59 |
|
60 |
+
gr.Interface(
|
61 |
+
fn=predict,
|
62 |
+
inputs=[
|
63 |
+
gr.Text(label="Input Text"),
|
64 |
+
gr.Radio(label="Speaker", choices=[
|
65 |
+
"CLB (female)"
|
66 |
+
],
|
67 |
+
value="CLB (female)"),
|
68 |
+
],
|
69 |
+
outputs=[
|
70 |
+
gr.Audio(label="Generated Speech", type="numpy"),
|
71 |
+
],
|
72 |
+
title=title,
|
73 |
+
description=description,
|
74 |
+
article=None,
|
75 |
+
examples=examples,
|
76 |
+
).launch()
|
requirements.txt
CHANGED
@@ -3,4 +3,8 @@ sentencepiece
|
|
3 |
datasets
|
4 |
soundfile
|
5 |
torch
|
|
|
|
|
|
|
|
|
6 |
gradio
|
|
|
3 |
datasets
|
4 |
soundfile
|
5 |
torch
|
6 |
+
torchaudio
|
7 |
+
samplerate
|
8 |
+
librosa
|
9 |
+
resampy
|
10 |
gradio
|