Spaces: Runtime error
divakaivan committed • Commit 6742dfa • 1 parent: 4f76169
Update app.py

app.py CHANGED
@@ -8,18 +8,7 @@ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5Hif
 #.
 checkpoint = "microsoft/speecht5_tts"
 processor = SpeechT5Processor.from_pretrained(checkpoint)
-model = SpeechT5ForTextToSpeech.from_pretrained(
-vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
-
-
-speaker_embeddings = {
-    "BDL": "spkemb/cmu_us_bdl_arctic-wav-arctic_a0009.npy",
-    "CLB": "spkemb/cmu_us_clb_arctic-wav-arctic_a0144.npy",
-    "KSP": "spkemb/cmu_us_ksp_arctic-wav-arctic_b0087.npy",
-    "RMS": "spkemb/cmu_us_rms_arctic-wav-arctic_b0353.npy",
-    "SLT": "spkemb/cmu_us_slt_arctic-wav-arctic_a0508.npy",
-}
-
+model = SpeechT5ForTextToSpeech.from_pretrained("divakaivan/glaswegian_tts")
 
 from datasets import load_dataset, Audio
 
@@ -137,18 +126,11 @@ def predict(text, speaker):
 
     ### ### ###
     example = dataset['test'][11]
-
-
-    # Ensure the speaker_embedding has the correct dimensions
-    if speaker_embedding.dim() == 2:
-        speaker_embedding = speaker_embedding.unsqueeze(1).expand(-1, inputs["input_ids"].size(1), -1)
-    elif speaker_embedding.dim() == 3:
-        speaker_embedding = speaker_embedding.expand(-1, inputs["input_ids"].size(1), -1)
+    speaker_embeddings = torch.tensor(example["speaker_embeddings"]).unsqueeze(0)
 
-    spectrogram = model.generate_speech(inputs["input_ids"]
+    spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)
     with torch.no_grad():
         speech = vocoder(spectrogram)
-    # speech = model.generate_speech(input_ids, speaker_embedding, vocoder=vocoder)
 
     speech = (speech.numpy() * 32767).astype(np.int16)
     return (16000, speech)
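Read as a whole, the top of the file now loads the processor from the base checkpoint and the model from the fine-tuned divakaivan/glaswegian_tts weights. A minimal sketch of that loading section follows; the vocoder line is an assumption, since predict() still calls vocoder(spectrogram) even though this commit removes the line that originally loaded it.

import torch
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

checkpoint = "microsoft/speecht5_tts"

# Tokenizer / feature extractor from the base SpeechT5 checkpoint.
processor = SpeechT5Processor.from_pretrained(checkpoint)

# Fine-tuned Glaswegian TTS weights (the "+" line in the diff above).
model = SpeechT5ForTextToSpeech.from_pretrained("divakaivan/glaswegian_tts")

# Assumption: still required because predict() calls vocoder(spectrogram);
# this is the loading line the commit removes.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")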
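The new predict() path is likewise short enough to sketch end to end. The processor(text=...) call is an assumption (only inputs["input_ids"] is visible in the diff), and dataset is assumed to be created earlier with load_dataset (its name is not visible here); the speaker argument appears unused after this commit, since the per-speaker .npy embeddings were removed in favour of one embedding taken from the test split.

import numpy as np
import torch

def predict(text, speaker):
    # Assumed: tokenize the input text; the diff only shows inputs["input_ids"].
    inputs = processor(text=text, return_tensors="pt")

    # `speaker` is unused after this commit; the embedding is fixed to test
    # example 11. unsqueeze(0) adds the batch dimension generate_speech expects.
    example = dataset["test"][11]
    speaker_embeddings = torch.tensor(example["speaker_embeddings"]).unsqueeze(0)

    # Text + speaker embedding -> log-mel spectrogram.
    spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)

    # HiFi-GAN vocoder: spectrogram -> waveform at SpeechT5's 16 kHz rate.
    with torch.no_grad():
        speech = vocoder(spectrogram)

    # Scale float audio in [-1, 1] to 16-bit PCM; Gradio's Audio component
    # accepts a (sample_rate, numpy array) tuple.
    speech = (speech.numpy() * 32767).astype(np.int16)
    return (16000, speech)

The commented-out line the commit removes, model.generate_speech(input_ids, speaker_embedding, vocoder=vocoder), points at a shortcut the transformers API offers: passing vocoder= makes generate_speech return the waveform directly, so the explicit vocoder call could be dropped.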