Spaces:

oza75
/

bambara-mt

Running on Zero

App Files Files Community

Aboubacar OUATTARA - kaira committed on Apr 22

Commit

35053bd

•

1 Parent(s): 4c04c43

add audios files

Browse files

Files changed (1) hide show

app.py +15 -37

app.py CHANGED Viewed

@@ -33,7 +33,7 @@ def translate_to_bambara(text, src_lang):
 # Function to convert text to speech
-def text_to_speech(bambara_text, reference_audio: Optional[Tuple] = None):
     if reference_audio is not None:
         ref_sr, ref_audio = reference_audio
         ref_audio = torch.from_numpy(ref_audio)
@@ -53,8 +53,8 @@ def text_to_speech(bambara_text, reference_audio: Optional[Tuple] = None):
         # Clean up the temporary file
         os.unlink(tmp_path)
     else:
-        # If no reference audio provided, proceed with the default
-        sr, audio = tts.text_to_speech(bambara_text)
     audio = audio.mean(dim=0)
     return audio, sr
@@ -91,36 +91,12 @@ def enhance_speech(audio_array, sampling_rate, solver, nfe, tau, denoise_before_
         return (new_sr1, denoised_audio.cpu().numpy()), (new_sr2, enhanced_audio.cpu().numpy())
-def resample_audio(audio_tensor, orig_sr, target_sr):
-    """
-    Resample audio tensor to a new sampling rate.
-    Args:
-        audio_tensor (torch.Tensor): Audio data tensor.
-        orig_sr (int): Original sampling rate of the audio tensor.
-        target_sr (int): Target sampling rate to resample the audio tensor to.
-    Returns:
-        torch.Tensor: Resampled audio tensor.
-    """
-    # Make sure the input tensor is in the shape (channels, time)
-    if audio_tensor.ndim == 1:
-        audio_tensor = audio_tensor.unsqueeze(0)
-    # Initialize the resample transform
-    resample_transform = torchaudio.transforms.Resample(orig_sr, target_sr)
-    # Perform the resampling
-    resampled_audio_tensor = resample_transform(audio_tensor)
-    return resampled_audio_tensor.mean(dim=0)
 # Define the Gradio interface
 @spaces.GPU
 def _fn(
         src_lang,
         text,
         reference_audio=None,
         solver="Midpoint",
         nfe=64,
@@ -128,15 +104,19 @@ def _fn(
         denoise_before_enhancement=False
 ):
     source_lang = flores_codes[src_lang]
     # Step 1: Translate the text to Bambara
     bambara_text = translate_to_bambara(text, source_lang)
     # Step 2: Convert the translated text to speech with reference audio
     if reference_audio is not None:
         audio_array, sampling_rate = text_to_speech(bambara_text, reference_audio)
     else:
-        audio_array, sampling_rate = text_to_speech(bambara_text)
     # Step 3: Enhance the audio
     denoised_audio, enhanced_audio = enhance_speech(
@@ -148,24 +128,22 @@ def _fn(
         denoise_before_enhancement
     )
-    # Return all outputs
-    return (
-        bambara_text,
-        (sampling_rate, audio_array.numpy()),
-        denoised_audio,
-        enhanced_audio
-    )
 def main():
     lang_codes = list(flores_codes.keys())
     # Build Gradio app
     app = gr.Interface(
         fn=_fn,
         inputs=[
             gr.Dropdown(label="Source Language", choices=lang_codes, value='French'),
             gr.Textbox(label="Text to Translate", lines=3),
             gr.Audio(label="Clone your voice (optional)", type="numpy", format="wav"),
             gr.Dropdown(
                 choices=["Midpoint", "RK4", "Euler"], value="Midpoint",
@@ -179,7 +157,7 @@ def main():
             gr.Textbox(label="Translated Text"),
             gr.Audio(label="Original TTS Audio", format='wav'),
             gr.Audio(label="Denoised Audio", format='wav'),
-            gr.Audio(label="Enhanced Audio")
         ],
         title="Bambara Translation and Text to Speech with Audio Enhancement",
         description="Translate text to Bambara and convert it to speech with options to enhance audio quality."

 # Function to convert text to speech
+def text_to_speech(bambara_text, reference_speaker: str, reference_audio: Optional[Tuple] = None):
     if reference_audio is not None:
         ref_sr, ref_audio = reference_audio
         ref_audio = torch.from_numpy(ref_audio)
         # Clean up the temporary file
         os.unlink(tmp_path)
     else:
+        # If no reference audio provided, proceed with the reference_speaker
+        sr, audio = tts.text_to_speech(bambara_text, speaker_reference_wav_path=reference_speaker)
     audio = audio.mean(dim=0)
     return audio, sr
         return (new_sr1, denoised_audio.cpu().numpy()), (new_sr2, enhanced_audio.cpu().numpy())
 # Define the Gradio interface
 @spaces.GPU
 def _fn(
         src_lang,
         text,
+        reference_speaker,
         reference_audio=None,
         solver="Midpoint",
         nfe=64,
         denoise_before_enhancement=False
 ):
     source_lang = flores_codes[src_lang]
+    reference_speaker = os.path.join("./audios", reference_speaker)
     # Step 1: Translate the text to Bambara
     bambara_text = translate_to_bambara(text, source_lang)
+    yield bambara_text, None, None, None
     # Step 2: Convert the translated text to speech with reference audio
     if reference_audio is not None:
         audio_array, sampling_rate = text_to_speech(bambara_text, reference_audio)
     else:
+        audio_array, sampling_rate = text_to_speech(bambara_text, reference_speaker=reference_speaker)
+    yield bambara_text, (sampling_rate, audio_array.numpy()), None, None
     # Step 3: Enhance the audio
     denoised_audio, enhanced_audio = enhance_speech(
         denoise_before_enhancement
     )
+    yield bambara_text, (sampling_rate, audio_array.numpy()), denoised_audio, enhanced_audio
 def main():
     lang_codes = list(flores_codes.keys())
+    # List all files in the ./audios directory for the dropdown
+    audio_files = [f for f in os.listdir('./audios') if os.path.isfile(os.path.join('./audios', f))]
     # Build Gradio app
     app = gr.Interface(
         fn=_fn,
         inputs=[
             gr.Dropdown(label="Source Language", choices=lang_codes, value='French'),
             gr.Textbox(label="Text to Translate", lines=3),
+            gr.Dropdown(label="Voice", choices=audio_files, value=audio_files[0]),
             gr.Audio(label="Clone your voice (optional)", type="numpy", format="wav"),
             gr.Dropdown(
                 choices=["Midpoint", "RK4", "Euler"], value="Midpoint",
             gr.Textbox(label="Translated Text"),
             gr.Audio(label="Original TTS Audio", format='wav'),
             gr.Audio(label="Denoised Audio", format='wav'),
+            gr.Audio(label="Enhanced Audio", format='wav')
         ],
         title="Bambara Translation and Text to Speech with Audio Enhancement",
         description="Translate text to Bambara and convert it to speech with options to enhance audio quality."