Spaces:

arham061
/

urdu_TTS

Running

App Files Files Community

arham061 committed on Jul 16, 2023

Commit

e38db12

•

1 Parent(s): 27c1d79

Update app.py

Browse files

Files changed (1) hide show

app.py +31 -6

app.py CHANGED Viewed

@@ -13,6 +13,28 @@ model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
 tokenizer = processor.tokenizer
 vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
 # Buckwalter to Unicode mapping
 buck2uni = {
@@ -88,11 +110,13 @@ def generate_audio(text):
     roman_urdu = transString(text)
     # Tokenize the input text
-    inputs = tokenizer(roman_urdu, return_tensors="pt")
     # Generate audio from the SpeechT5 model
-    with torch.no_grad():
-        speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
     return speech
@@ -101,10 +125,11 @@ def text_to_speech(text):
     audio_output = generate_audio(text)
     # Save audio as a .wav file
-    output_path = "output.wav"
-    sf.write(output_path, audio_output, 16000, 'PCM_16')
-    return output_path
 # Define the Gradio interface
 inputs = gr.inputs.Textbox(label="Enter text in Urdu")

 tokenizer = processor.tokenizer
 vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+def prepare_dataset(example):  # turn one dataset row into processor outputs plus a speaker embedding
+    audio = example["audio"]  # read below for its "array" and "sampling_rate" entries
+    example = processor(  # rebinds `example`: the input row is replaced by the processor output
+        text=transString(example["sentence"]),  # sentence goes through transString before being processed
+        audio_target=audio["array"],
+        sampling_rate=audio["sampling_rate"],
+        return_attention_mask=False,
+    )
+    # strip off the batch dimension (keep only the first entry of "labels")
+    example["labels"] = example["labels"][0]
+    # use SpeechBrain to obtain x-vector (create_speaker_embedding is defined outside this view -- TODO confirm)
+    example["speaker_embeddings"] = create_speaker_embedding(audio["array"])
+    return example  # the processor output with "labels" and "speaker_embeddings" set as above
+test_dataset = load_dataset("mozilla-foundation/common_voice_13_0", "ur", split="test")  # "ur" config, test split; runs at module level
+test_dataset = test_dataset.cast_column("audio", Audio(sampling_rate=16000))  # audio column cast to Audio(sampling_rate=16000)
+test_dataset = test_dataset.map(prepare_dataset, remove_columns=test_dataset.column_names)  # applies prepare_dataset to every row and drops the original columns; NOTE(review): the generate_audio hunk in this diff only reads test_dataset[22] -- confirm the full map is needed
 # Buckwalter to Unicode mapping
 buck2uni = {
     roman_urdu = transString(text)
     # Tokenize the input text
+    inputs = processor(text=roman_urdu, return_tensors="pt")
     # Generate audio from the SpeechT5 model
+    example = test_dataset[22]
+    speaker_embeddings = torch.tensor(example["speaker_embeddings"]).unsqueeze(0)
+    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
     return speech
     audio_output = generate_audio(text)
     # Save audio as a .wav file
+    from IPython.display import Audio
+    audio = Audio(audio_output.numpy(), rate=16000)
+    return audio
 # Define the Gradio interface
 inputs = gr.inputs.Textbox(label="Enter text in Urdu")