Santiago Roman committed on
Commit
a1f7e54
1 Parent(s): 9d111b0
Files changed (2) hide show
  1. app.py +52 -63
  2. requirements.txt +4 -0
app.py CHANGED
@@ -1,87 +1,76 @@
1
- from matplotlib.pyplot import text
 
2
  import numpy as np
3
- import yaml
4
 
5
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
6
- import torch
7
- import soundfile as sf
8
- from datasets import load_dataset
9
 
10
- import gradio as gr
11
 
 
 
 
 
12
 
13
- # Original: https://huggingface.co/spaces/StevenLimcorn/fastspeech2-TTS/blob/main/app.py
14
 
15
- MODEL_NAMES = [
16
- "SpeechT5",
17
- "Custom"
18
- ]
19
 
 
 
 
 
 
 
 
20
 
21
 
22
- speecht5 = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
23
- vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
 
24
 
25
- # load xvector containing speaker's voice characteristics from a dataset
26
- embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
27
- speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
28
 
 
 
 
29
 
30
- MODEL_DICT = {
31
- "SpeechT5" : speecht5,
32
- "Custom" : None
33
- }
34
-
35
- def inference(input_text, model_type):
36
- # text2mel_name, vocoder_name = model_type.split(" + ")
37
- # text2mel_model, vocoder_model = MODEL_DICT[text2mel_name], MODEL_DICT[vocoder_name]
38
- text2mel_name = model_type
39
- text2mel_model = MODEL_DICT[text2mel_name]
40
-
41
- processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
42
- inputs = processor(text=input_text, return_tensors="pt")
43
-
44
- if text2mel_name == "SpeechT5":
45
- speech = text2mel_model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
46
- else:
47
- raise ValueError("Only SpeechT5 is supported")
48
-
49
- # # vocoder part
50
- # if vocoder_name == "Melgan":
51
- # audio = vocoder_model(mel_outputs)[0, :, 0]
52
- # elif vocoder_name == "MB-Melgan":
53
- # audio = vocoder_model(mel_outputs)[0, :, 0]
54
- # else:
55
- # raise ValueError("Only MELGAN, MELGAN-STFT and MB_MELGAN are supported on vocoder_name")
56
 
 
57
 
58
- sound_file = "./speech.wav"
59
- sf.write(sound_file, speech.numpy(), samplerate=16000)
60
 
61
- # if text2mel_name == "TACOTRON":
62
- # return mel_outputs.numpy(), alignment_history.numpy(), audio.numpy()
63
- # else:
64
- # return mel_outputs.numpy(), audio.numpy()
65
 
66
- # sf.write('./audio_after.wav', audio, 22050, "PCM_16")
67
- return sound_file
68
-
69
- inputs = [
70
- gr.inputs.Textbox(lines=5, label="Input Text"),
71
- gr.inputs.Radio(label="Pick a TTS Model",choices=MODEL_NAMES,value = MODEL_NAMES[0])
72
- ]
73
 
74
- outputs = gr.outputs.Audio(type="filepath", label="Output Audio")
 
 
75
 
76
 
77
- title = "Prosody Project"
78
- description = "Gradio demo for Prosody Project"
79
- # article = "<p style='text-align: center'><a href='https://tensorspeech.github.io/TensorFlowTTS/'>TensorFlowTTS: Real-Time State-of-the-art Speech Synthesis for Tensorflow 2</a> | <a href='https://github.com/TensorSpeech/TensorFlowTTS'>Github Repo</a></p><p>An extension to akhaliq's implementation <a href='https://huggingface.co/spaces/akhaliq/TensorFlowTTS'></p>"
80
- article = None
81
 
82
  examples = [
83
- ["Hi, my name is Santiago."],
84
- ["Two bros, chilling in a hot tub, five feet apart because they are not gay."]
85
  ]
86
 
87
- gr.Interface(inference, inputs, outputs, title=title, description=description, article=article, examples=examples).launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import librosa
3
  import numpy as np
4
+ import torch
5
 
6
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 
 
 
7
 
 
8
 
9
# Hub id used for both the SpeechT5 processor and the text-to-speech model.
checkpoint = "microsoft/speecht5_tts"

# Pretrained components, loaded once at import time and reused by every request.
processor = SpeechT5Processor.from_pretrained(checkpoint)
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Label of the voice treated as the default choice.
default_voice = "CLB (female)"

# Three-letter speaker code -> path of the stored speaker embedding (.npy file).
speaker_embeddings = dict(
    BDL="spkemb/cmu_us_bdl_arctic-wav-arctic_a0009.npy",
    CLB="spkemb/cmu_us_clb_arctic-wav-arctic_a0144.npy",
    KSP="spkemb/cmu_us_ksp_arctic-wav-arctic_b0087.npy",
    RMS="spkemb/cmu_us_rms_arctic-wav-arctic_b0353.npy",
    SLT="spkemb/cmu_us_slt_arctic-wav-arctic_a0508.npy",
)
24
 
25
 
26
def predict(text, speaker):
    """Synthesize speech for ``text`` using the voice named by ``speaker``.

    Args:
        text: Text to speak. ``None`` or whitespace-only text produces an
            empty clip instead of running the model.
        speaker: Voice label coming from the UI; only its first three
            characters are used, as the key into ``speaker_embeddings``.

    Returns:
        A ``(sample_rate, samples)`` tuple where ``sample_rate`` is 16000 and
        ``samples`` is a one-dimensional NumPy ``int16`` array.

    Raises:
        KeyError: If ``speaker[:3]`` is not a key of ``speaker_embeddings``.
    """
    sample_rate = 16000

    # Nothing to say: hand back an empty clip. The explicit None check keeps
    # a cleared/unset text field from crashing on ``.strip()``.
    if text is None or not text.strip():
        return (sample_rate, np.zeros(0).astype(np.int16))

    inputs = processor(text=text, return_tensors="pt")

    # Limit input length to what the model's text position table supports.
    input_ids = inputs["input_ids"][..., :model.config.max_text_positions]

    # Load the stored embedding for the chosen speaker and add a batch axis.
    speaker_embedding = np.load(speaker_embeddings[speaker[:3]])
    speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)

    speech = model.generate_speech(input_ids, speaker_embedding, vocoder=vocoder)

    # Assumes the vocoder emits float samples nominally within [-1, 1]
    # (TODO confirm). Clip before scaling so any stray out-of-range sample
    # saturates instead of wrapping around in the int16 cast.
    samples = np.clip(speech.numpy(), -1.0, 1.0)
    samples = (samples * 32767).astype(np.int16)
    return (sample_rate, samples)
45
+
46
+
47
# Heading and blurb displayed at the top of the demo page.
title = "SpeechT5: Speech Synthesis"

description = """
This is the Prosody Project for DT2112 Speech Technology
"""

# Each example row is [input text, speaker label]; all rows use the same voice.
examples = [
    [sentence, "CLB (female)"]
    for sentence in (
        "Hi, my name is Santiago",
        "Two bros, chilling in a hot tub, five feet apart because they are not gay.",
    )
]

# Build the individual widgets first, then wire them to predict().
text_input = gr.Text(label="Input Text")
speaker_input = gr.Radio(
    label="Speaker",
    choices=["CLB (female)"],
    value="CLB (female)",
)
audio_output = gr.Audio(label="Generated Speech", type="numpy")

demo = gr.Interface(
    fn=predict,
    inputs=[text_input, speaker_input],
    outputs=[audio_output],
    title=title,
    description=description,
    article=None,
    examples=examples,
)
demo.launch()
requirements.txt CHANGED
@@ -3,4 +3,8 @@ sentencepiece
3
  datasets
4
  soundfile
5
  torch
 
 
 
 
6
  gradio
 
3
  datasets
4
  soundfile
5
  torch
6
+ torchaudio
7
+ samplerate
8
+ librosa
9
+ resampy
10
  gradio