arham061 committed on
Commit
bf20a09
1 Parent(s): a59ef1c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -9
app.py CHANGED
@@ -1,5 +1,5 @@
1
  import torch
2
- from transformers import SpeechT5ForTextToSpeech, SpeechT5Processor
3
  import soundfile as sf
4
  import gradio as gr
5
  import scipy.io.wavfile as wav
@@ -10,6 +10,7 @@ checkpoint = "arham061/speecht5_finetuned_voxpopuli_nl" # Replace with your act
10
  processor = SpeechT5Processor.from_pretrained(checkpoint)
11
  model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
12
  tokenizer = processor.tokenizer
 
13
 
14
 
15
  # Buckwalter to Unicode mapping
@@ -87,17 +88,14 @@ def generate_audio(text):
87
  roman_urdu = transString(text)
88
 
89
  # Tokenize the input text
90
- inputs = tokenizer(roman_urdu, return_tensors="pt").input_values
91
 
92
- # Generate speech from the model
93
  with torch.no_grad():
94
- logits = model(inputs).logits
95
 
96
- # Convert logits to audio waveform
97
- predicted_ids = torch.argmax(logits, dim=-1)
98
- audio = tokenizer.decode(predicted_ids[0], skip_special_tokens=True)
99
 
100
- return audio
101
 
102
 
103
  def text_to_speech(text):
@@ -105,11 +103,12 @@ def text_to_speech(text):
105
  audio_output = generate_audio(text)
106
 
107
  # Save audio as a .wav file
108
- wav.write("output.wav", 16000, audio_output.astype(np.int16))
109
 
110
  return "output.wav"
111
 
112
 
 
113
  # Define the Gradio interface
114
  inputs = gr.inputs.Textbox(label="Enter text in Urdu")
115
  outputs = gr.outputs.Audio(label="Audio")
 
1
  import torch
2
+ from transformers import SpeechT5ForTextToSpeech, SpeechT5Processor, SpeechT5HifiGan
3
  import soundfile as sf
4
  import gradio as gr
5
  import scipy.io.wavfile as wav
 
10
  processor = SpeechT5Processor.from_pretrained(checkpoint)
11
  model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
12
  tokenizer = processor.tokenizer
13
+ vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
14
 
15
 
16
  # Buckwalter to Unicode mapping
 
88
  roman_urdu = transString(text)
89
 
90
  # Tokenize the input text
91
+ inputs = tokenizer(roman_urdu, return_tensors="pt")
92
 
93
+ # Generate speech from the SpeechT5 model
94
  with torch.no_grad():
95
+ speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
96
 
97
+ return speech
 
 
98
 
 
99
 
100
 
101
  def text_to_speech(text):
 
103
  audio_output = generate_audio(text)
104
 
105
  # Save audio as a .wav file
106
+ sf.write("output.wav", audio_output.numpy(), samplerate=16000)
107
 
108
  return "output.wav"
109
 
110
 
111
+
112
  # Define the Gradio interface
113
  inputs = gr.inputs.Textbox(label="Enter text in Urdu")
114
  outputs = gr.outputs.Audio(label="Audio")