Aboubacar OUATTARA - kaira committed
Commit 1b0b842
1 Parent(s): 05fb637

use custom tts

Files changed (4)
  1. .gitattributes +1 -0
  2. app.py +49 -13
  3. requirements.txt +8 -5
  4. tts.py +395 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -1,4 +1,8 @@
 import concurrent
+import os
+import tempfile
+from typing import Optional, Tuple
+
 import spaces
 from transformers import pipeline
 import gradio as gr
@@ -7,6 +11,7 @@ import torchaudio
 from resemble_enhance.enhancer.inference import denoise, enhance
 
 from flore200_codes import flores_codes
+from tts import BambaraTTS
 
 # Check if CUDA is available
 device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -16,8 +21,8 @@ translation_model = "oza75/nllb-600M-mt-french-bambara"
 translator = pipeline("translation", model=translation_model, max_length=512)
 
 # Text-to-Speech pipeline
-tts_model = "oza75/bambara-tts-male-001"
-tts = pipeline("text-to-speech", model=tts_model, device=device)
+tts_model = "oza75/bambara-tts"
+tts = BambaraTTS(tts_model)
 
 
 # Function to translate text to Bambara
@@ -29,11 +34,30 @@ def translate_to_bambara(text, src_lang):
 
 # Function to convert text to speech
 @spaces.GPU
-def text_to_speech(bambara_text):
-    speech = tts(bambara_text)
-    audio, sr = speech['audio'], speech['sampling_rate']
-    audio = torch.from_numpy(audio).mean(dim=0)
-
+def text_to_speech(bambara_text, reference_audio: Optional[Tuple] = None):
+    if reference_audio is not None:
+        ref_sr, ref_audio = reference_audio
+        ref_audio = torch.from_numpy(ref_audio)
+
+        # Add a channel dimension if the audio is 1D
+        if ref_audio.ndim == 1:
+            ref_audio = ref_audio.unsqueeze(0)
+
+        # Save the reference audio to a temporary file
+        with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp:
+            torchaudio.save(tmp.name, ref_audio, ref_sr)
+            tmp_path = tmp.name
+
+        # Use the temporary file as the speaker reference
+        sr, audio = tts.text_to_speech(bambara_text, speaker_reference_wav_path=tmp_path)
+
+        # Clean up the temporary file
+        os.unlink(tmp_path)
+    else:
+        # If no reference audio is provided, proceed with the default
+        sr, audio = tts.text_to_speech(bambara_text)
+
+    audio = audio.mean(dim=0)
     return audio, sr
 
 
@@ -64,14 +88,25 @@ def enhance_speech(audio_array, sampling_rate, solver, nfe, tau, denoise_before_
 
 
 # Define the Gradio interface
-def _fn(src_lang, text, solver="Midpoint", nfe=64, prior_temp=0.5, denoise_before_enhancement=False):
+def _fn(
+        src_lang,
+        text,
+        reference_audio=None,
+        solver="Midpoint",
+        nfe=64,
+        prior_temp=0.5,
+        denoise_before_enhancement=False
+):
     source_lang = flores_codes[src_lang]
 
     # Step 1: Translate the text to Bambara
    bambara_text = translate_to_bambara(text, source_lang)
 
-    # Step 2: Convert the translated text to speech
-    audio_array, sampling_rate = text_to_speech(bambara_text)
+    # Step 2: Convert the translated text to speech with reference audio
+    if reference_audio is not None:
+        audio_array, sampling_rate = text_to_speech(bambara_text, reference_audio)
+    else:
+        audio_array, sampling_rate = text_to_speech(bambara_text)
 
     # Step 3: Enhance the audio
     denoised_audio, enhanced_audio = enhance_speech(
@@ -95,13 +130,14 @@ def main():
         fn=_fn,
         inputs=[
             gr.Dropdown(label="Source Language", choices=lang_codes, value='French'),
-            gr.Textbox(label="Text to Translate"),
+            gr.Textbox(label="Text to Translate", lines=3),
+            gr.Audio(label="Clone your voice (optional)", type="numpy", format="wav"),
             gr.Dropdown(
                 choices=["Midpoint", "RK4", "Euler"], value="Midpoint",
                 label="ODE Solver (Midpoint is recommended)"
             ),
             gr.Slider(minimum=1, maximum=128, value=64, step=1, label="Number of Function Evaluations"),
-            gr.Slider(minimum=0, maximum=1, value=0.5, step=0.01, label="Prior Temperature"),
+            gr.Slider(minimum=0.1, maximum=1, value=0.5, step=0.01, label="Prior Temperature"),
             gr.Checkbox(value=False, label="Denoise Before Enhancement")
         ],
         outputs=[
@@ -114,7 +150,7 @@ def main():
         description="Translate text to Bambara and convert it to speech with options to enhance audio quality."
     )
 
-    app.launch()
+    app.launch(share=False)
 
 
 if __name__ == "__main__":
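Note on the new voice-cloning path: the Gradio Audio input hands app.py a (sample_rate, numpy_array) tuple, while BambaraTTS.text_to_speech expects a file path as its speaker reference, hence the temporary-WAV round trip in the hunk above. A minimal standalone sketch of that conversion; the save_reference_to_wav helper and the synthetic sine reference are illustrative, not part of the commit:

    import os
    import tempfile

    import numpy as np
    import torch
    import torchaudio

    def save_reference_to_wav(reference_audio):
        """Persist a Gradio (sample_rate, ndarray) tuple as a temporary WAV file."""
        ref_sr, ref_audio = reference_audio
        ref_audio = torch.from_numpy(ref_audio).float()
        if ref_audio.ndim == 1:  # mono: add the channel dimension torchaudio.save expects
            ref_audio = ref_audio.unsqueeze(0)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
            torchaudio.save(tmp.name, ref_audio, ref_sr)
            return tmp.name

    # One second of a 440 Hz sine standing in for a recorded voice sample:
    sr = 16000
    t = np.linspace(0, 1, sr, endpoint=False)
    wav_path = save_reference_to_wav((sr, np.sin(2 * np.pi * 440 * t).astype(np.float32)))
    # ...pass wav_path as speaker_reference_wav_path, then clean up as app.py does:
    os.unlink(wav_path)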
requirements.txt CHANGED
@@ -1,6 +1,9 @@
-transformers
-gradio
-torch
-torchaudio
-spaces
+transformers>=4.33.0
+gradio~=4.8.0
+torch~=2.1.1
+torchaudio~=2.1.1
+spaces~=0.26.1
+deepspeed~=0.12.1
+requests~=2.31.0
 resemble-enhance==0.0.2.dev240104122303
+git+https://github.com/oza75/coqui-TTS.git@prod
tts.py ADDED
@@ -0,0 +1,395 @@
+import os
+import re
+import time
+
+import numpy as np
+import requests
+import torch
+from typing import Optional, Tuple
+
+from TTS.tts.configs.xtts_config import XttsConfig
+from TTS.tts.models.xtts import Xtts
+from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer, basic_cleaners
+from coqpit import Coqpit
+from huggingface_hub import hf_hub_download, hf_hub_url
+from tqdm import tqdm
+
+
+def download_file_with_progress(url: str, destination: str):
+    """
+    Downloads a file from a web URL with a progress bar.
+    """
+    # Streaming GET request
+    response = requests.get(url, stream=True)
+
+    # Total size in bytes, set to zero if missing
+    total_size = int(response.headers.get('content-length', 0))
+
+    # Using tqdm to display progress
+    with open(destination, 'wb') as file, tqdm(desc=destination, total=total_size, unit='B', unit_scale=True,
+                                               unit_divisor=1024) as bar:
+        for data in response.iter_content(chunk_size=1024):
+            size = file.write(data)
+            bar.update(size)
+
+
+class VoiceBambaraTextPreprocessor:
+    def preprocess_batch(self, texts):
+        return [self.preprocess(text) for text in texts]
+
+    def preprocess(self, text: str) -> str:
+        text = text.lower()
+        text = self.expand_number(text)
+        text = self.transliterate_bambara(text)
+
+        return text
+
+    def transliterate_bambara(self, text):
+        """
+        Transliterate Bambara text using a specified mapping of special characters.
+
+        Parameters:
+        - text (str): The original Bambara text.
+
+        Returns:
+        - str: The transliterated text.
+        """
+        bambara_transliteration = {
+            'ɲ': 'ny',
+            'ɛ': 'è',
+            'ɔ': 'o',
+            'ŋ': 'ng',
+            'ɟ': 'j',
+            'ʔ': "'",
+            'ɣ': 'gh',
+            'ʃ': 'sh',
+            'ߒ': 'n',
+            'ߎ': "u",
+        }
+
+        # Perform the transliteration
+        transliterated_text = "".join(bambara_transliteration.get(char, char) for char in text)
+
+        return transliterated_text
+
+    def expand_number(self, text):
+        """
+        Normalize Bambara text for TTS by replacing numerical figures with their word equivalents.
+
+        Args:
+            text (str): The text to be normalized.
+
+        Returns:
+            str: The normalized Bambara text.
+        """
+
+        # A regex pattern to match all numbers
+        number_pattern = re.compile(r'\b\d+\b')
+
+        # Function to replace each number with its Bambara text
+        def replace_number_with_text(match):
+            number = int(match.group())
+            return self.number_to_bambara(number)
+
+        # Replace each number in the text with its Bambara word equivalent
+        normalized_text = number_pattern.sub(replace_number_with_text, text)
+
+        return normalized_text
+
+    def number_to_bambara(self, n):
+        """
+        Convert a number into its textual representation in Bambara using recursion.
+
+        Args:
+            n (int): The number to be converted.
+
+        Returns:
+            str: The number expressed in Bambara text.
+
+        Examples:
+            >>> number_to_bambara(123)
+            'kɛmɛ ni mugan ni saba'
+
+        Notes:
+            This function assumes that 'n' is a non-negative integer.
+        """
+
+        # Bambara numbering rules
+        units = ["", "kɛlɛn", "fila", "saba", "naani", "duuru", "wɔrɔ", "wòlonwula", "sɛɛgin", "kɔnɔntɔn"]
+        tens = ["", "tan", "mugan", "bisaba", "binaani", "biduuru", "biwɔrɔ", "biwòlonfila", "bisɛɛgin", "bikɔnɔntɔn"]
+        hundreds = ["", "kɛmɛ"]
+        thousands = ["", "waga"]
+        millions = ["", "milyɔn"]
+
+        # Handle zero explicitly
+        if n == 0:
+            return ""  # Bambara does not support zero
+
+        if n < 10:
+            return units[n]
+        elif n < 100:
+            return tens[n // 10] + (" ni " + self.number_to_bambara(n % 10) if n % 10 > 0 else "")
+        elif n < 1000:
+            return hundreds[1] + (" " + self.number_to_bambara(n // 100) if n >= 200 else "") + (
+                " ni " + self.number_to_bambara(n % 100) if n % 100 > 0 else "")
+        elif n < 1_000_000:
+            return thousands[1] + " " + self.number_to_bambara(n // 1000) + (
+                " ni " + self.number_to_bambara(n % 1000) if n % 1000 > 0 else "")
+        else:
+            return millions[1] + " " + self.number_to_bambara(n // 1_000_000) + (
+                " ni " + self.number_to_bambara(n % 1_000_000) if n % 1_000_000 > 0 else "")
+
+
+class BambaraTokenizer(VoiceBpeTokenizer):
+    """
+    A tokenizer for the Bambara language that extends the VoiceBpeTokenizer.
+
+    Attributes:
+        preprocessor: An instance of VoiceBambaraTextPreprocessor for text preprocessing.
+        char_limits: A dictionary to hold character limits for languages.
+    """
+
+    def __init__(self, vocab_file: Optional[str] = None):
+        """
+        Initializes the BambaraTokenizer with a given vocabulary file.
+
+        Args:
+            vocab_file: The path to the vocabulary file, defaults to None.
+        """
+        super().__init__(vocab_file)
+        self.preprocessor = VoiceBambaraTextPreprocessor()
+        self.char_limits['bm'] = 200  # Set character limit for the Bambara language
+
+    def preprocess_text(self, txt: str, lang: str) -> str:
+        """
+        Preprocesses the input text based on the language.
+
+        Args:
+            txt: The text to preprocess.
+            lang: The language code of the text.
+
+        Returns:
+            The preprocessed text.
+        """
+        # Delegate preprocessing to the parent class for non-Bambara languages
+        if lang != "bm":
+            return super().preprocess_text(txt, lang)
+
+        # Apply Bambara-specific preprocessing
+        txt = self.preprocessor.preprocess(txt)
+        txt = basic_cleaners(txt)
+        return txt
+
+
+class BambaraXtts(Xtts):
+    """
+    A class for the Bambara language that extends the Xtts class.
+
+    Attributes:
+        tokenizer: An instance of BambaraTokenizer.
+    """
+
+    def __init__(self, config: Coqpit):
+        """
+        Initializes the BambaraXtts with the provided configuration.
+
+        Args:
+            config: An instance of Coqpit containing configuration settings.
+        """
+        super().__init__(config)
+        self.tokenizer = BambaraTokenizer()  # Initialize tokenizer for Bambara
+        self.init_models()
+
+    @classmethod
+    def init_from_config(cls, config: "XttsConfig", **kwargs) -> "BambaraXtts":
+        """
+        Class method to create an instance of BambaraXtts from a configuration object.
+
+        Args:
+            config: An instance of XttsConfig containing configuration settings.
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            An instance of BambaraXtts.
+        """
+        return cls(config)
+
+
+class BambaraTTS:
+    """
+    Bambara Text-to-Speech (TTS) class that initializes and uses a TTS model for the Bambara language.
+
+    Attributes:
+        language_code (str): The ISO language code for Bambara.
+        checkpoint_repo_or_dir (str): URL or local path to the model checkpoint directory.
+        local_dir (str): The directory to store downloaded checkpoints.
+        paths (dict): A dictionary of paths to model components.
+        config (XttsConfig): Configuration object for the TTS model.
+        model (BambaraXtts): The TTS model instance.
+    """
+
+    def __init__(self, checkpoint_repo_or_dir: str, local_dir: Optional[str] = None):
+        """
+        Initialize the BambaraTTS instance.
+
+        Args:
+            checkpoint_repo_or_dir: A string that represents either a Hugging Face hub repository
+                                    or a local directory where the TTS model checkpoint is located.
+            local_dir: An optional string representing a local directory path where model checkpoints
+                       will be downloaded. If not specified, a default local directory is used based
+                       on `checkpoint_repo_or_dir`.
+
+        The initialization process involves setting up local directories for model components,
+        ensuring the model checkpoint is available, and loading the model configuration and tokenizer.
+        """
+
+        # Set the language code for Bambara
+        self.language_code = 'bm'
+
+        # Store the checkpoint location and local directory path
+        self.checkpoint_repo_or_dir = checkpoint_repo_or_dir
+        # If no local directory is provided, use the default based on the checkpoint
+        self.local_dir = local_dir if local_dir else self.default_local_dir(checkpoint_repo_or_dir)
+
+        # Initialize the paths for model components
+        self.paths = self.init_paths(self.local_dir)
+
+        # Ensure the model checkpoint is available locally
+        self.ensure_checkpoint_is_downloaded()
+
+        # Load the model configuration from a JSON file
+        self.config = XttsConfig()
+        self.config.load_json(self.paths['config.json'])
+
+        # Initialize the TTS model with the loaded configuration
+        self.model = BambaraXtts(self.config)
+
+        # Set up the tokenizer for the model, using the vocabulary file path
+        self.model.tokenizer = BambaraTokenizer(vocab_file=self.paths['vocab.json'])
+
+        # Load the model checkpoint into the initialized model
+        self.model.load_checkpoint(
+            self.config,
+            vocab_path="fake_vocab.json",
+            # The 'fake_vocab.json' is specified because the base model class might
+            # attempt to override our tokenizer if a vocab file is present
+            checkpoint_dir=self.local_dir,
+            use_deepspeed=torch.cuda.is_available()  # Utilize DeepSpeed if CUDA is available
+        )
+
+        # Move the model to GPU if CUDA is available
+        if torch.cuda.is_available():
+            self.model.cuda()
+
+        self.log_tokenizer()
+
+    def ensure_checkpoint_is_downloaded(self):
+        """
+        Ensures that the model checkpoint is downloaded and available locally.
+        """
+        if os.path.exists(self.checkpoint_repo_or_dir):
+            return
+
+        os.makedirs(self.local_dir, exist_ok=True)
+        self.log("Downloading checkpoint from the hub...")
+
+        for filename, filepath in self.paths.items():
+            if os.path.exists(filepath):
+                self.log(f"File {filepath} already exists. Skipping...")
+                continue
+
+            file_url = hf_hub_url(repo_id=self.checkpoint_repo_or_dir, filename=filename)
+            self.log(f"Downloading {filename} from {file_url}")
+            download_file_with_progress(file_url, filepath)
+
+        self.log("Checkpoint downloaded successfully!")
+
+    def default_local_dir(self, checkpoint_repo_or_dir: str) -> str:
+        """
+        Generates a default local directory path for storing the model checkpoint.
+
+        Args:
+            checkpoint_repo_or_dir: The original checkpoint repository or directory path.
+
+        Returns:
+            The default local directory path.
+        """
+        if os.path.exists(checkpoint_repo_or_dir):
+            return checkpoint_repo_or_dir
+
+        model_path = f"models--{checkpoint_repo_or_dir.replace('/', '--')}"
+        local_dir = os.path.join(os.path.expanduser('~'), ".cache", "huggingface", "hub", model_path)
+        return local_dir.lower()
+
+    @staticmethod
+    def init_paths(local_dir: str) -> dict:
+        """
+        Initializes paths to various model components based on the local directory.
+
+        Args:
+            local_dir: The local directory where model components are stored.
+
+        Returns:
+            A dictionary with keys as component names and values as file paths.
+        """
+        components = ['model.pth', 'config.json', 'vocab.json', 'dvae.pth', 'mel_stats.pth']
+        return {name: os.path.join(local_dir, name) for name in components}
+
+    def text_to_speech(
+            self,
+            text: str,
+            speaker_reference_wav_path: Optional[str] = None,
+            temperature: Optional[float] = 0.1,
+            enable_text_splitting: bool = False
+    ) -> Tuple[int, torch.Tensor]:
+        """
+        Converts text into speech audio.
+
+        Args:
+            text: The input text to be converted into speech.
+            speaker_reference_wav_path: A path to a reference WAV file for the speaker.
+            temperature: The temperature parameter for sampling.
+            enable_text_splitting: Flag to enable or disable text splitting.
+
+        Returns:
+            A tuple containing the sampling rate and the generated audio tensor.
+        """
+        if speaker_reference_wav_path is None:
+            speaker_reference_wav_path = "reference_audios/male_2.wav"
+            self.log("Using default speaker reference reference_audios/male_2.wav.")
+
+        self.log("Computing speaker latents...")
+        gpt_cond_latent, speaker_embedding = self.model.get_conditioning_latents(
+            audio_path=[speaker_reference_wav_path]
+        )
+
+        self.log("Starting inference...")
+        start_time = time.time()
+        out = self.model.inference(
+            text,
+            self.language_code,
+            gpt_cond_latent,
+            speaker_embedding,
+            temperature=temperature,
+            enable_text_splitting=enable_text_splitting
+        )
+        end_time = time.time()
+
+        audio = torch.tensor(out["wav"]).unsqueeze(0)
+        sampling_rate = self.config.model_args.output_sample_rate
+
+        self.log(f"Speech generated in {end_time - start_time:.2f} seconds.")
+
+        return sampling_rate, audio
+
+    def log(self, message: str):
+        """
+        Logs a message to the console with a uniform format.
+
+        Args:
+            message: The message to be logged.
+        """
+        print(f"[BambaraTTS] {message}")
+
+    def log_tokenizer(self):
+        """
+        Logs the tokenizer information.
+        """
+        self.log(f"Tokenizer: {self.model.tokenizer}")
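End-to-end, tts.py exposes BambaraTTS as the single entry point. A hedged usage sketch, assuming the oza75/bambara-tts checkpoint is reachable on the Hub and a reference_audios/male_2.wav file exists for the default voice:

    import torchaudio

    from tts import BambaraTTS

    # First use downloads model.pth, config.json, vocab.json, dvae.pth and
    # mel_stats.pth into the local Hugging Face cache.
    tts = BambaraTTS("oza75/bambara-tts")

    # Numbers are expanded to Bambara words and special characters are
    # transliterated by VoiceBambaraTextPreprocessor before tokenization.
    sampling_rate, audio = tts.text_to_speech(
        "Aw ni ce",      # a short Bambara greeting
        temperature=0.1,
    )

    # audio has shape (1, n_samples), ready for torchaudio.
    torchaudio.save("output.wav", audio, sampling_rate)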