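"""Hugging Face Space app: translate text into Bambara, synthesize the
translation as speech, and optionally denoise/enhance the audio with
resemble-enhance.
"""
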
import concurrent.futures

import gradio as gr
import torch
import torchaudio
from transformers import pipeline
from resemble_enhance.enhancer.inference import denoise, enhance

from flore200_codes import flores_codes

# Use CUDA if available, otherwise fall back to CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Translation pipeline (source language -> Bambara)
translation_model = "oza75/nllb-600M-mt-french-bambara"
translator = pipeline("translation", model=translation_model, max_length=512)

# Text-to-speech pipeline for Bambara
tts_model = "oza75/bambara-tts-male-001"
tts = pipeline("text-to-speech", model=tts_model, device=device)


# Function to translate text to Bambara
def translate_to_bambara(text, src_lang):
    translation = translator(text, src_lang=src_lang, tgt_lang="bam_Latn")
    return translation[0]['translation_text']


# Function to convert text to speech
def text_to_speech(bambara_text):
    speech = tts(bambara_text)
    audio, sr = speech['audio'], speech['sampling_rate']
    # Downmix to mono (average over channels) and return a 1-D tensor
    audio = torch.from_numpy(audio).mean(dim=0)
    return audio, sr


# Function to denoise and enhance the synthesized speech
def enhance_speech(audio_array, sampling_rate, solver, nfe, tau, denoise_before_enhancement):
    solver = solver.lower()
    nfe = int(nfe)
    # lambd is the denoise strength used inside enhance(): stronger (0.9)
    # when the user asked to denoise before enhancement, light (0.1) otherwise
    lambd = 0.9 if denoise_before_enhancement else 0.1

    def denoise_audio():
        return denoise(audio_array, sampling_rate, device)

    def enhance_audio():
        return enhance(audio_array, sampling_rate, device, nfe=nfe, solver=solver, lambd=lambd, tau=tau)

    # Run denoising and enhancement in parallel
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_denoise = executor.submit(denoise_audio)
        future_enhance = executor.submit(enhance_audio)
        denoised_audio, new_sr1 = future_denoise.result()
        enhanced_audio, new_sr2 = future_enhance.result()

    # Convert to numpy and return (sample_rate, waveform) pairs for Gradio
    return (new_sr1, denoised_audio.cpu().numpy()), (new_sr2, enhanced_audio.cpu().numpy())


# Gradio callback: translate, synthesize, then denoise/enhance
def _fn(src_lang, text, solver="Midpoint", nfe=64, prior_temp=0.5, denoise_before_enhancement=False):
    source_lang = flores_codes[src_lang]

    # Step 1: Translate the text to Bambara
    bambara_text = translate_to_bambara(text, source_lang)

    # Step 2: Convert the translated text to speech
    audio_array, sampling_rate = text_to_speech(bambara_text)

    # Step 3: Enhance the audio
    denoised_audio, enhanced_audio = enhance_speech(
        audio_array,
        sampling_rate,
        solver,
        nfe,
        prior_temp,
        denoise_before_enhancement
    )

    # Return the translation, the raw TTS audio, and both processed versions
    return bambara_text, (sampling_rate, audio_array.cpu().numpy()), denoised_audio, enhanced_audio
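

# A minimal usage sketch (not part of the Gradio UI): the pipeline above can be
# called directly, e.g. from a Python shell, assuming the models have been
# downloaded and "French" is one of the keys in flores_codes:
#
#   bambara_text, tts_audio, denoised, enhanced = _fn(
#       "French", "Bonjour tout le monde", solver="Midpoint", nfe=64
#   )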


def main():
    lang_codes = list(flores_codes.keys())

    # Build Gradio app
    app = gr.Interface(
        fn=_fn,
        inputs=[
            gr.Dropdown(label="Source Language", choices=lang_codes, value='French'),
            gr.Textbox(label="Text to Translate"),
            gr.Dropdown(
                choices=["Midpoint", "RK4", "Euler"], value="Midpoint",
                label="ODE Solver (Midpoint is recommended)"
            ),
            gr.Slider(minimum=1, maximum=128, value=64, step=1, label="Number of Function Evaluations"),
            gr.Slider(minimum=0, maximum=1, value=0.5, step=0.01, label="Prior Temperature"),
            gr.Checkbox(value=False, label="Denoise Before Enhancement")
        ],
        outputs=[
            gr.Textbox(label="Translated Text"),
            gr.Audio(label="Original TTS Audio"),
            gr.Audio(label="Denoised Audio"),
            gr.Audio(label="Enhanced Audio")
        ],
        title="Bambara Translation and Text to Speech with Audio Enhancement",
        description="Translate text to Bambara and convert it to speech with options to enhance audio quality."
    )
    app.launch()


if __name__ == "__main__":
    main()