Update app.py
Browse files
app.py
CHANGED
@@ -14,36 +14,6 @@ model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
|
|
14 |
tokenizer = processor.tokenizer
|
15 |
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
|
16 |
|
17 |
-
def prepare_dataset(example):
|
18 |
-
audio = example["audio"]
|
19 |
-
|
20 |
-
example = processor(
|
21 |
-
text=transString(example["sentence"]),
|
22 |
-
audio_target=audio["array"],
|
23 |
-
sampling_rate=audio["sampling_rate"],
|
24 |
-
return_attention_mask=False,
|
25 |
-
)
|
26 |
-
|
27 |
-
# strip off the batch dimension
|
28 |
-
example["labels"] = example["labels"][0]
|
29 |
-
|
30 |
-
# use SpeechBrain to obtain x-vector
|
31 |
-
example["speaker_embeddings"] = create_speaker_embedding(audio["array"])
|
32 |
-
|
33 |
-
return example
|
34 |
-
|
35 |
-
# Set the authentication token
|
36 |
-
config.HF_DATASETS_CUSTOM_HEADERS = {
|
37 |
-
"Authorization": "Bearer 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"
|
38 |
-
}
|
39 |
-
from huggingface_hub import notebook_login
|
40 |
-
|
41 |
-
notebook_login()
|
42 |
-
|
43 |
-
test_dataset = load_dataset("mozilla-foundation/common_voice_13_0", "ur", split="test")
|
44 |
-
test_dataset = test_dataset.cast_column("audio", Audio(sampling_rate=16000))
|
45 |
-
test_dataset = test_dataset.map(prepare_dataset, remove_columns=test_dataset.column_names)
|
46 |
-
|
47 |
|
48 |
# Buckwalter to Unicode mapping
|
49 |
buck2uni = {
|
@@ -122,9 +92,7 @@ def generate_audio(text):
|
|
122 |
inputs = processor(text=roman_urdu, return_tensors="pt")
|
123 |
|
124 |
# Generate audio from the SpeechT5 model
|
125 |
-
|
126 |
-
speaker_embeddings = torch.tensor(example["speaker_embeddings"]).unsqueeze(0)
|
127 |
-
|
128 |
speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
|
129 |
|
130 |
return speech
|
|
|
14 |
tokenizer = processor.tokenizer
|
15 |
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
|
16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
|
18 |
# Buckwalter to Unicode mapping
|
19 |
buck2uni = {
|
|
|
92 |
inputs = processor(text=roman_urdu, return_tensors="pt")
|
93 |
|
94 |
# Generate audio from the SpeechT5 model
|
95 |
+
speaker_embeddings = torch.tensor(np.load("speaker_embeddings.npy"))
|
|
|
|
|
96 |
speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
|
97 |
|
98 |
return speech
|