Spaces:
Running
on
Zero
Running
on
Zero
Aboubacar OUATTARA - kaira
committed on
Commit
•
35053bd
1
Parent(s):
4c04c43
add audios files
Browse files
app.py
CHANGED
@@ -33,7 +33,7 @@ def translate_to_bambara(text, src_lang):
|
|
33 |
|
34 |
|
35 |
# Function to convert text to speech
|
36 |
-
def text_to_speech(bambara_text, reference_audio: Optional[Tuple] = None):
|
37 |
if reference_audio is not None:
|
38 |
ref_sr, ref_audio = reference_audio
|
39 |
ref_audio = torch.from_numpy(ref_audio)
|
@@ -53,8 +53,8 @@ def text_to_speech(bambara_text, reference_audio: Optional[Tuple] = None):
|
|
53 |
# Clean up the temporary file
|
54 |
os.unlink(tmp_path)
|
55 |
else:
|
56 |
-
# If no reference audio provided, proceed with the
|
57 |
-
sr, audio = tts.text_to_speech(bambara_text)
|
58 |
|
59 |
audio = audio.mean(dim=0)
|
60 |
return audio, sr
|
@@ -91,36 +91,12 @@ def enhance_speech(audio_array, sampling_rate, solver, nfe, tau, denoise_before_
|
|
91 |
return (new_sr1, denoised_audio.cpu().numpy()), (new_sr2, enhanced_audio.cpu().numpy())
|
92 |
|
93 |
|
94 |
-
def resample_audio(audio_tensor, orig_sr, target_sr):
|
95 |
-
"""
|
96 |
-
Resample audio tensor to a new sampling rate.
|
97 |
-
|
98 |
-
Args:
|
99 |
-
audio_tensor (torch.Tensor): Audio data tensor.
|
100 |
-
orig_sr (int): Original sampling rate of the audio tensor.
|
101 |
-
target_sr (int): Target sampling rate to resample the audio tensor to.
|
102 |
-
|
103 |
-
Returns:
|
104 |
-
torch.Tensor: Resampled audio tensor.
|
105 |
-
"""
|
106 |
-
# Make sure the input tensor is in the shape (channels, time)
|
107 |
-
if audio_tensor.ndim == 1:
|
108 |
-
audio_tensor = audio_tensor.unsqueeze(0)
|
109 |
-
|
110 |
-
# Initialize the resample transform
|
111 |
-
resample_transform = torchaudio.transforms.Resample(orig_sr, target_sr)
|
112 |
-
|
113 |
-
# Perform the resampling
|
114 |
-
resampled_audio_tensor = resample_transform(audio_tensor)
|
115 |
-
|
116 |
-
return resampled_audio_tensor.mean(dim=0)
|
117 |
-
|
118 |
-
|
119 |
# Define the Gradio interface
|
120 |
@spaces.GPU
|
121 |
def _fn(
|
122 |
src_lang,
|
123 |
text,
|
|
|
124 |
reference_audio=None,
|
125 |
solver="Midpoint",
|
126 |
nfe=64,
|
@@ -128,15 +104,19 @@ def _fn(
|
|
128 |
denoise_before_enhancement=False
|
129 |
):
|
130 |
source_lang = flores_codes[src_lang]
|
|
|
131 |
|
132 |
# Step 1: Translate the text to Bambara
|
133 |
bambara_text = translate_to_bambara(text, source_lang)
|
|
|
134 |
|
135 |
# Step 2: Convert the translated text to speech with reference audio
|
136 |
if reference_audio is not None:
|
137 |
audio_array, sampling_rate = text_to_speech(bambara_text, reference_audio)
|
138 |
else:
|
139 |
-
audio_array, sampling_rate = text_to_speech(bambara_text)
|
|
|
|
|
140 |
|
141 |
# Step 3: Enhance the audio
|
142 |
denoised_audio, enhanced_audio = enhance_speech(
|
@@ -148,24 +128,22 @@ def _fn(
|
|
148 |
denoise_before_enhancement
|
149 |
)
|
150 |
|
151 |
-
|
152 |
-
return (
|
153 |
-
bambara_text,
|
154 |
-
(sampling_rate, audio_array.numpy()),
|
155 |
-
denoised_audio,
|
156 |
-
enhanced_audio
|
157 |
-
)
|
158 |
|
159 |
|
160 |
def main():
|
161 |
lang_codes = list(flores_codes.keys())
|
162 |
|
|
|
|
|
|
|
163 |
# Build Gradio app
|
164 |
app = gr.Interface(
|
165 |
fn=_fn,
|
166 |
inputs=[
|
167 |
gr.Dropdown(label="Source Language", choices=lang_codes, value='French'),
|
168 |
gr.Textbox(label="Text to Translate", lines=3),
|
|
|
169 |
gr.Audio(label="Clone your voice (optional)", type="numpy", format="wav"),
|
170 |
gr.Dropdown(
|
171 |
choices=["Midpoint", "RK4", "Euler"], value="Midpoint",
|
@@ -179,7 +157,7 @@ def main():
|
|
179 |
gr.Textbox(label="Translated Text"),
|
180 |
gr.Audio(label="Original TTS Audio", format='wav'),
|
181 |
gr.Audio(label="Denoised Audio", format='wav'),
|
182 |
-
gr.Audio(label="Enhanced Audio")
|
183 |
],
|
184 |
title="Bambara Translation and Text to Speech with Audio Enhancement",
|
185 |
description="Translate text to Bambara and convert it to speech with options to enhance audio quality."
|
|
|
33 |
|
34 |
|
35 |
# Function to convert text to speech
|
36 |
+
def text_to_speech(bambara_text, reference_speaker: str, reference_audio: Optional[Tuple] = None):
|
37 |
if reference_audio is not None:
|
38 |
ref_sr, ref_audio = reference_audio
|
39 |
ref_audio = torch.from_numpy(ref_audio)
|
|
|
53 |
# Clean up the temporary file
|
54 |
os.unlink(tmp_path)
|
55 |
else:
|
56 |
+
# If no reference audio provided, proceed with the reference_speaker
|
57 |
+
sr, audio = tts.text_to_speech(bambara_text, speaker_reference_wav_path=reference_speaker)
|
58 |
|
59 |
audio = audio.mean(dim=0)
|
60 |
return audio, sr
|
|
|
91 |
return (new_sr1, denoised_audio.cpu().numpy()), (new_sr2, enhanced_audio.cpu().numpy())
|
92 |
|
93 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
# Define the Gradio interface
|
95 |
@spaces.GPU
|
96 |
def _fn(
|
97 |
src_lang,
|
98 |
text,
|
99 |
+
reference_speaker,
|
100 |
reference_audio=None,
|
101 |
solver="Midpoint",
|
102 |
nfe=64,
|
|
|
104 |
denoise_before_enhancement=False
|
105 |
):
|
106 |
source_lang = flores_codes[src_lang]
|
107 |
+
reference_speaker = os.path.join("./audios", reference_speaker)
|
108 |
|
109 |
# Step 1: Translate the text to Bambara
|
110 |
bambara_text = translate_to_bambara(text, source_lang)
|
111 |
+
yield bambara_text, None, None, None
|
112 |
|
113 |
# Step 2: Convert the translated text to speech with reference audio
|
114 |
if reference_audio is not None:
|
115 |
audio_array, sampling_rate = text_to_speech(bambara_text, reference_audio)
|
116 |
else:
|
117 |
+
audio_array, sampling_rate = text_to_speech(bambara_text, reference_speaker=reference_speaker)
|
118 |
+
|
119 |
+
yield bambara_text, (sampling_rate, audio_array.numpy()), None, None
|
120 |
|
121 |
# Step 3: Enhance the audio
|
122 |
denoised_audio, enhanced_audio = enhance_speech(
|
|
|
128 |
denoise_before_enhancement
|
129 |
)
|
130 |
|
131 |
+
yield bambara_text, (sampling_rate, audio_array.numpy()), denoised_audio, enhanced_audio
|
|
|
|
|
|
|
|
|
|
|
|
|
132 |
|
133 |
|
134 |
def main():
|
135 |
lang_codes = list(flores_codes.keys())
|
136 |
|
137 |
+
# List all files in the ./audios directory for the dropdown
|
138 |
+
audio_files = [f for f in os.listdir('./audios') if os.path.isfile(os.path.join('./audios', f))]
|
139 |
+
|
140 |
# Build Gradio app
|
141 |
app = gr.Interface(
|
142 |
fn=_fn,
|
143 |
inputs=[
|
144 |
gr.Dropdown(label="Source Language", choices=lang_codes, value='French'),
|
145 |
gr.Textbox(label="Text to Translate", lines=3),
|
146 |
+
gr.Dropdown(label="Voice", choices=audio_files, value=audio_files[0]),
|
147 |
gr.Audio(label="Clone your voice (optional)", type="numpy", format="wav"),
|
148 |
gr.Dropdown(
|
149 |
choices=["Midpoint", "RK4", "Euler"], value="Midpoint",
|
|
|
157 |
gr.Textbox(label="Translated Text"),
|
158 |
gr.Audio(label="Original TTS Audio", format='wav'),
|
159 |
gr.Audio(label="Denoised Audio", format='wav'),
|
160 |
+
gr.Audio(label="Enhanced Audio", format='wav')
|
161 |
],
|
162 |
title="Bambara Translation and Text to Speech with Audio Enhancement",
|
163 |
description="Translate text to Bambara and convert it to speech with options to enhance audio quality."
|