arham061 committed on
Commit
e38db12
1 Parent(s): 27c1d79

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -6
app.py CHANGED
@@ -13,6 +13,28 @@ model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
13
  tokenizer = processor.tokenizer
14
  vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
  # Buckwalter to Unicode mapping
18
  buck2uni = {
@@ -88,11 +110,13 @@ def generate_audio(text):
88
  roman_urdu = transString(text)
89
 
90
  # Tokenize the input text
91
- inputs = tokenizer(roman_urdu, return_tensors="pt")
92
 
93
  # Generate audio from the SpeechT5 model
94
- with torch.no_grad():
95
- speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
 
 
96
 
97
  return speech
98
 
@@ -101,10 +125,11 @@ def text_to_speech(text):
101
  audio_output = generate_audio(text)
102
 
103
  # Save audio as a .wav file
104
- output_path = "output.wav"
105
- sf.write(output_path, audio_output, 16000, 'PCM_16')
 
106
 
107
- return output_path
108
 
109
  # Define the Gradio interface
110
  inputs = gr.inputs.Textbox(label="Enter text in Urdu")
 
13
  tokenizer = processor.tokenizer
14
  vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
15
 
16
+ def prepare_dataset(example):
17
+ audio = example["audio"]
18
+
19
+ example = processor(
20
+ text=transString(example["sentence"]),
21
+ audio_target=audio["array"],
22
+ sampling_rate=audio["sampling_rate"],
23
+ return_attention_mask=False,
24
+ )
25
+
26
+ # strip off the batch dimension
27
+ example["labels"] = example["labels"][0]
28
+
29
+ # use SpeechBrain to obtain x-vector
30
+ example["speaker_embeddings"] = create_speaker_embedding(audio["array"])
31
+
32
+ return example
33
+
34
+ test_dataset = load_dataset("mozilla-foundation/common_voice_13_0", "ur", split="test")
35
+ test_dataset = test_dataset.cast_column("audio", Audio(sampling_rate=16000))
36
+ test_dataset = test_dataset.map(prepare_dataset, remove_columns=test_dataset.column_names)
37
+
38
 
39
  # Buckwalter to Unicode mapping
40
  buck2uni = {
 
110
  roman_urdu = transString(text)
111
 
112
  # Tokenize the input text
113
+ inputs = processor(text=roman_urdu, return_tensors="pt")
114
 
115
  # Generate audio from the SpeechT5 model
116
+ example = test_dataset[22]
117
+ speaker_embeddings = torch.tensor(example["speaker_embeddings"]).unsqueeze(0)
118
+
119
+ speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
120
 
121
  return speech
122
 
 
125
  audio_output = generate_audio(text)
126
 
127
  # Save audio as a .wav file
128
+ from IPython.display import Audio
129
+
130
+ audio = Audio(audio_output.numpy(), rate=16000)
131
 
132
+ return audio
133
 
134
  # Define the Gradio interface
135
  inputs = gr.inputs.Textbox(label="Enter text in Urdu")