Aboubacar OUATTARA - kaira committed on
Commit
35053bd
1 Parent(s): 4c04c43

add audio files

Browse files
Files changed (1) hide show
  1. app.py +15 -37
app.py CHANGED
@@ -33,7 +33,7 @@ def translate_to_bambara(text, src_lang):
33
 
34
 
35
  # Function to convert text to speech
36
- def text_to_speech(bambara_text, reference_audio: Optional[Tuple] = None):
37
  if reference_audio is not None:
38
  ref_sr, ref_audio = reference_audio
39
  ref_audio = torch.from_numpy(ref_audio)
@@ -53,8 +53,8 @@ def text_to_speech(bambara_text, reference_audio: Optional[Tuple] = None):
53
  # Clean up the temporary file
54
  os.unlink(tmp_path)
55
  else:
56
- # If no reference audio provided, proceed with the default
57
- sr, audio = tts.text_to_speech(bambara_text)
58
 
59
  audio = audio.mean(dim=0)
60
  return audio, sr
@@ -91,36 +91,12 @@ def enhance_speech(audio_array, sampling_rate, solver, nfe, tau, denoise_before_
91
  return (new_sr1, denoised_audio.cpu().numpy()), (new_sr2, enhanced_audio.cpu().numpy())
92
 
93
 
94
- def resample_audio(audio_tensor, orig_sr, target_sr):
95
- """
96
- Resample audio tensor to a new sampling rate.
97
-
98
- Args:
99
- audio_tensor (torch.Tensor): Audio data tensor.
100
- orig_sr (int): Original sampling rate of the audio tensor.
101
- target_sr (int): Target sampling rate to resample the audio tensor to.
102
-
103
- Returns:
104
- torch.Tensor: Resampled audio tensor.
105
- """
106
- # Make sure the input tensor is in the shape (channels, time)
107
- if audio_tensor.ndim == 1:
108
- audio_tensor = audio_tensor.unsqueeze(0)
109
-
110
- # Initialize the resample transform
111
- resample_transform = torchaudio.transforms.Resample(orig_sr, target_sr)
112
-
113
- # Perform the resampling
114
- resampled_audio_tensor = resample_transform(audio_tensor)
115
-
116
- return resampled_audio_tensor.mean(dim=0)
117
-
118
-
119
  # Define the Gradio interface
120
  @spaces.GPU
121
  def _fn(
122
  src_lang,
123
  text,
 
124
  reference_audio=None,
125
  solver="Midpoint",
126
  nfe=64,
@@ -128,15 +104,19 @@ def _fn(
128
  denoise_before_enhancement=False
129
  ):
130
  source_lang = flores_codes[src_lang]
 
131
 
132
  # Step 1: Translate the text to Bambara
133
  bambara_text = translate_to_bambara(text, source_lang)
 
134
 
135
  # Step 2: Convert the translated text to speech with reference audio
136
  if reference_audio is not None:
137
  audio_array, sampling_rate = text_to_speech(bambara_text, reference_audio)
138
  else:
139
- audio_array, sampling_rate = text_to_speech(bambara_text)
 
 
140
 
141
  # Step 3: Enhance the audio
142
  denoised_audio, enhanced_audio = enhance_speech(
@@ -148,24 +128,22 @@ def _fn(
148
  denoise_before_enhancement
149
  )
150
 
151
- # Return all outputs
152
- return (
153
- bambara_text,
154
- (sampling_rate, audio_array.numpy()),
155
- denoised_audio,
156
- enhanced_audio
157
- )
158
 
159
 
160
  def main():
161
  lang_codes = list(flores_codes.keys())
162
 
 
 
 
163
  # Build Gradio app
164
  app = gr.Interface(
165
  fn=_fn,
166
  inputs=[
167
  gr.Dropdown(label="Source Language", choices=lang_codes, value='French'),
168
  gr.Textbox(label="Text to Translate", lines=3),
 
169
  gr.Audio(label="Clone your voice (optional)", type="numpy", format="wav"),
170
  gr.Dropdown(
171
  choices=["Midpoint", "RK4", "Euler"], value="Midpoint",
@@ -179,7 +157,7 @@ def main():
179
  gr.Textbox(label="Translated Text"),
180
  gr.Audio(label="Original TTS Audio", format='wav'),
181
  gr.Audio(label="Denoised Audio", format='wav'),
182
- gr.Audio(label="Enhanced Audio")
183
  ],
184
  title="Bambara Translation and Text to Speech with Audio Enhancement",
185
  description="Translate text to Bambara and convert it to speech with options to enhance audio quality."
 
33
 
34
 
35
  # Function to convert text to speech
36
+ def text_to_speech(bambara_text, reference_speaker: str, reference_audio: Optional[Tuple] = None):
37
  if reference_audio is not None:
38
  ref_sr, ref_audio = reference_audio
39
  ref_audio = torch.from_numpy(ref_audio)
 
53
  # Clean up the temporary file
54
  os.unlink(tmp_path)
55
  else:
56
+ # If no reference audio provided, proceed with the reference_speaker
57
+ sr, audio = tts.text_to_speech(bambara_text, speaker_reference_wav_path=reference_speaker)
58
 
59
  audio = audio.mean(dim=0)
60
  return audio, sr
 
91
  return (new_sr1, denoised_audio.cpu().numpy()), (new_sr2, enhanced_audio.cpu().numpy())
92
 
93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  # Define the Gradio interface
95
  @spaces.GPU
96
  def _fn(
97
  src_lang,
98
  text,
99
+ reference_speaker,
100
  reference_audio=None,
101
  solver="Midpoint",
102
  nfe=64,
 
104
  denoise_before_enhancement=False
105
  ):
106
  source_lang = flores_codes[src_lang]
107
+ reference_speaker = os.path.join("./audios", reference_speaker)
108
 
109
  # Step 1: Translate the text to Bambara
110
  bambara_text = translate_to_bambara(text, source_lang)
111
+ yield bambara_text, None, None, None
112
 
113
  # Step 2: Convert the translated text to speech with reference audio
114
  if reference_audio is not None:
115
  audio_array, sampling_rate = text_to_speech(bambara_text, reference_audio)
116
  else:
117
+ audio_array, sampling_rate = text_to_speech(bambara_text, reference_speaker=reference_speaker)
118
+
119
+ yield bambara_text, (sampling_rate, audio_array.numpy()), None, None
120
 
121
  # Step 3: Enhance the audio
122
  denoised_audio, enhanced_audio = enhance_speech(
 
128
  denoise_before_enhancement
129
  )
130
 
131
+ yield bambara_text, (sampling_rate, audio_array.numpy()), denoised_audio, enhanced_audio
 
 
 
 
 
 
132
 
133
 
134
  def main():
135
  lang_codes = list(flores_codes.keys())
136
 
137
+ # List all files in the ./audios directory for the dropdown
138
+ audio_files = [f for f in os.listdir('./audios') if os.path.isfile(os.path.join('./audios', f))]
139
+
140
  # Build Gradio app
141
  app = gr.Interface(
142
  fn=_fn,
143
  inputs=[
144
  gr.Dropdown(label="Source Language", choices=lang_codes, value='French'),
145
  gr.Textbox(label="Text to Translate", lines=3),
146
+ gr.Dropdown(label="Voice", choices=audio_files, value=audio_files[0]),
147
  gr.Audio(label="Clone your voice (optional)", type="numpy", format="wav"),
148
  gr.Dropdown(
149
  choices=["Midpoint", "RK4", "Euler"], value="Midpoint",
 
157
  gr.Textbox(label="Translated Text"),
158
  gr.Audio(label="Original TTS Audio", format='wav'),
159
  gr.Audio(label="Denoised Audio", format='wav'),
160
+ gr.Audio(label="Enhanced Audio", format='wav')
161
  ],
162
  title="Bambara Translation and Text to Speech with Audio Enhancement",
163
  description="Translate text to Bambara and convert it to speech with options to enhance audio quality."