bambara-mt / tts.py
Aboubacar OUATTARA - kaira
add audios files
e5f089f
import os
import re
import time
import numpy as np
import requests
import torch
from typing import Optional, Tuple
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer, basic_cleaners
from coqpit import Coqpit
from huggingface_hub import hf_hub_download, hf_hub_url
from tqdm import tqdm
def download_file_with_progress(url: str, destination: str, timeout: float = 60.0, chunk_size: int = 64 * 1024):
    """
    Downloads a file from a web URL with a progress bar.

    The payload is streamed into ``destination + ".part"`` and only moved to
    ``destination`` once the transfer has finished, so an interrupted download
    never leaves a truncated file under the final name.

    Args:
        url (str): The URL to fetch.
        destination (str): The path of the file to create.
        timeout (float): Timeout in seconds handed to ``requests.get``.
        chunk_size (int): Number of bytes requested per streamed chunk.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    partial_path = destination + ".part"
    finished = False
    # Streaming GET request; the context manager releases the connection
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Fail loudly instead of saving an HTML error page as the payload
        response.raise_for_status()
        # Total size in bytes; None (unknown) when the header is missing
        total_size = int(response.headers.get('content-length', 0)) or None
        try:
            # Using tqdm to display progress
            with open(partial_path, 'wb') as file, tqdm(desc=destination, total=total_size, unit='B',
                                                        unit_scale=True, unit_divisor=1024) as bar:
                for data in response.iter_content(chunk_size=chunk_size):
                    bar.update(file.write(data))
            finished = True
        finally:
            # Drop the partial file when the transfer was interrupted
            if not finished and os.path.exists(partial_path):
                os.remove(partial_path)
    # Publish the complete file under its final name
    os.replace(partial_path, destination)
class VoiceBambaraTextPreprocessor:
    """
    Text normalizer for Bambara: lower-cases the input, spells out integers
    in Bambara words and replaces special characters using a fixed mapping.
    """

    # Matches standalone runs of digits; compiled once for all calls
    _NUMBER_PATTERN = re.compile(r'\b\d+\b')

    # Character-level replacement table used by transliterate_bambara
    _TRANSLITERATION = {
        'ɲ': 'ny',
        'ɛ': 'è',
        'ɔ': 'o',
        'ŋ': 'ng',
        'ɟ': 'j',
        'ʔ': "'",
        'ɣ': 'gh',
        'ʃ': 'sh',
        'ߒ': 'n',
        'ߎ': "u",
    }

    # Bambara numbering rules (index == digit; index 0 is unused)
    _UNITS = ("", "kɛlɛn", "fila", "saba", "naani", "duuru", "wɔrɔ", "wòlonwula", "sɛɛgin", "kɔnɔntɔn")
    _TENS = ("", "tan", "mugan", "bisaba", "binaani", "biduuru", "biwɔrɔ", "biwòlonfila", "bisɛɛgin", "bikɔnɔntɔn")
    _HUNDRED = "kɛmɛ"
    _THOUSAND = "waga"
    _MILLION = "milyɔn"

    def preprocess_batch(self, texts):
        """
        Apply `preprocess` to every text of an iterable.

        Args:
            texts: An iterable of strings.

        Returns:
            list: The preprocessed strings, in the same order.
        """
        return [self.preprocess(text) for text in texts]

    def preprocess(self, text: str) -> str:
        """
        Lower-case the text, expand its numbers, then transliterate it.

        Args:
            text (str): The raw input text.

        Returns:
            str: The normalized text.
        """
        text = text.lower()
        text = self.expand_number(text)
        # Runs last so that the generated number words are transliterated too
        text = self.transliterate_bambara(text)
        return text

    def transliterate_bambara(self, text):
        """
        Transliterate Bambara text using a specified mapping of special characters.

        Parameters:
        - text (str): The original Bambara text.

        Returns:
        - str: The transliterated text; unmapped characters are kept as they are.
        """
        mapping = self._TRANSLITERATION
        return "".join(mapping.get(char, char) for char in text)

    def expand_number(self, text):
        """
        Normalize Bambara text for TTS by replacing numerical figures with their word equivalents.

        Args:
            text (str): The text to be normalized.

        Returns:
            str: The normalized Bambara text.
        """
        # Replace each number in the text with its Bambara word equivalent
        return self._NUMBER_PATTERN.sub(
            lambda match: self.number_to_bambara(int(match.group())),
            text,
        )

    def number_to_bambara(self, n):
        """
        Convert a number into its textual representation in Bambara using recursion.

        Args:
            n (int): The number to be converted; must be non-negative.

        Returns:
            str: The number expressed in Bambara text (an empty string for 0).

        Raises:
            ValueError: If `n` is negative.

        Examples:
            >>> VoiceBambaraTextPreprocessor().number_to_bambara(123)
            'kɛmɛ ni mugan ni saba'
        """
        if n < 0:
            # A negative value would otherwise index the word tables from the end
            raise ValueError(f"number_to_bambara expects a non-negative integer, got {n}")
        # Handle zero explicitly
        if n == 0:
            return ""  # bambara does not support zero
        if n < 10:
            return self._UNITS[n]

        # Split the number into a leading part and a remainder joined by " ni "
        if n < 100:
            head = self._TENS[n // 10]
            rest = n % 10
        elif n < 1000:
            # The multiplier is only spoken from 200 upwards
            head = self._HUNDRED + (" " + self.number_to_bambara(n // 100) if n >= 200 else "")
            rest = n % 100
        elif n < 1_000_000:
            head = self._THOUSAND + " " + self.number_to_bambara(n // 1000)
            rest = n % 1000
        else:
            head = self._MILLION + " " + self.number_to_bambara(n // 1_000_000)
            rest = n % 1_000_000

        if rest > 0:
            return head + " ni " + self.number_to_bambara(rest)
        return head
class BambaraTokenizer(VoiceBpeTokenizer):
    """
    VoiceBpeTokenizer subclass that adds Bambara ('bm') text preprocessing.

    Attributes:
        preprocessor: The VoiceBambaraTextPreprocessor applied to Bambara text.
        char_limits: Per-language character limits; presumably created by the
            parent constructor, extended here with an entry for 'bm'.
    """

    def __init__(self, vocab_file: Optional[str] = None):
        """
        Build the tokenizer and register the Bambara-specific settings.

        Args:
            vocab_file: Path to the vocabulary file, forwarded to the parent
                constructor. Defaults to None.
        """
        super().__init__(vocab_file)
        # Character limit applied to Bambara input
        self.char_limits['bm'] = 200
        self.preprocessor = VoiceBambaraTextPreprocessor()

    def preprocess_text(self, txt: str, lang: str) -> str:
        """
        Clean up `txt` according to its language.

        Args:
            txt: The text to preprocess.
            lang: The language code of the text.

        Returns:
            The preprocessed text.
        """
        if lang == "bm":
            # Bambara: custom normalization followed by the basic cleaners
            normalized = self.preprocessor.preprocess(txt)
            return basic_cleaners(normalized)
        # Every other language is handled by the parent implementation
        return super().preprocess_text(txt, lang)
class BambaraXtts(Xtts):
    """
    Xtts variant whose tokenizer is a BambaraTokenizer.

    Attributes:
        tokenizer: The BambaraTokenizer installed by the constructor.
    """

    def __init__(self, config: Coqpit):
        """
        Build the model from `config` and install the Bambara tokenizer.

        Args:
            config: A Coqpit instance holding the configuration settings.
        """
        super().__init__(config)
        # Install the Bambara tokenizer, then run init_models()
        bambara_tokenizer = BambaraTokenizer()
        self.tokenizer = bambara_tokenizer
        self.init_models()

    @classmethod
    def init_from_config(cls, config: "XttsConfig", **kwargs) -> "BambaraXtts":
        """
        Alternate constructor that creates the model from a configuration object.

        Args:
            config: An XttsConfig instance holding the configuration settings.
            **kwargs: Accepted for signature compatibility; not used.

        Returns:
            A new instance of `cls`.
        """
        instance = cls(config)
        return instance
class BambaraTTS:
    """
    Bambara Text-to-Speech (TTS) class that initializes and uses a TTS model for the Bambara language.

    Attributes:
        language_code (str): The ISO language code for Bambara.
        checkpoint_repo_or_dir (str): URL or local path to the model checkpoint directory.
        local_dir (str): The directory to store downloaded checkpoints.
        paths (dict): A dictionary of paths to model components.
        config (XttsConfig): Configuration object for the TTS model.
        model (BambaraXtts): The TTS model instance.
    """

    # Speaker reference used when the caller does not provide one
    # (relative path, resolved against the current working directory)
    DEFAULT_SPEAKER_REFERENCE = "./audios/male_2.wav"

    def __init__(self, checkpoint_repo_or_dir: str, local_dir: Optional[str] = None):
        """
        Initialize the BambaraTTS instance.

        Args:
            checkpoint_repo_or_dir: A string that represents either a Hugging Face hub repository
                                    or a local directory where the TTS model checkpoint is located.
            local_dir: An optional string representing a local directory path where model checkpoints
                       will be downloaded. If not specified, a default local directory is used based
                       on `checkpoint_repo_or_dir`.

        The initialization process involves setting up local directories for model components,
        ensuring the model checkpoint is available, and loading the model configuration and tokenizer.
        """
        # Set the language code for Bambara
        self.language_code = 'bm'
        # Store the checkpoint location and local directory path
        self.checkpoint_repo_or_dir = checkpoint_repo_or_dir
        # If no local directory is provided, use the default based on the checkpoint
        # NOTE(review): when `checkpoint_repo_or_dir` is an existing local path and a
        # different `local_dir` is given, nothing is downloaded and the files are
        # read from `local_dir` - confirm this is the intended behavior.
        self.local_dir = local_dir if local_dir else self.default_local_dir(checkpoint_repo_or_dir)
        # Initialize the paths for model components
        self.paths = self.init_paths(self.local_dir)
        # Ensure the model checkpoint is available locally
        self.ensure_checkpoint_is_downloaded()

        # Load the model configuration from a JSON file
        self.config = XttsConfig()
        self.config.load_json(self.paths['config.json'])

        # Initialize the TTS model with the loaded configuration
        self.model = BambaraXtts(self.config)
        # Set up the tokenizer for the model, using the vocabulary file path
        self.model.tokenizer = BambaraTokenizer(vocab_file=self.paths['vocab.json'])

        # Load the model checkpoint into the initialized model
        self.model.load_checkpoint(
            self.config,
            # The 'fake_vocab.json' is specified because the base model class might
            # attempt to override our tokenizer if a vocab file is present
            vocab_path="fake_vocab.json",
            checkpoint_dir=self.local_dir,
            # DeepSpeed stays disabled because it fails on Hugging Face Spaces
            use_deepspeed=False,
        )

        # Move the model to GPU if CUDA is available
        if torch.cuda.is_available():
            self.model.cuda()

        self.log_tokenizer()

    def ensure_checkpoint_is_downloaded(self):
        """
        Ensures that the model checkpoint is downloaded and available locally.

        Nothing is downloaded when `checkpoint_repo_or_dir` is an existing local
        path. Otherwise every file listed in `paths` that is not yet present
        is fetched from the Hugging Face hub repository.
        """
        if os.path.exists(self.checkpoint_repo_or_dir):
            return

        os.makedirs(self.local_dir, exist_ok=True)
        self.log("Downloading checkpoint from the hub...")

        for filename, filepath in self.paths.items():
            if os.path.exists(filepath):
                self.log(f"File {filepath} already exists. Skipping...")
                continue

            file_url = hf_hub_url(repo_id=self.checkpoint_repo_or_dir, filename=filename)
            self.log(f"Downloading {filename} from {file_url}")
            download_file_with_progress(file_url, filepath)

        self.log("Checkpoint downloaded successfully!")

    def default_local_dir(self, checkpoint_repo_or_dir: str) -> str:
        """
        Generates a default local directory path for storing the model checkpoint.

        Args:
            checkpoint_repo_or_dir: The original checkpoint repository or directory path.

        Returns:
            `checkpoint_repo_or_dir` itself when it exists on disk, otherwise
            `~/bambara_tts/models--<repo id with '/' replaced by '--', lower-cased>`.
        """
        if os.path.exists(checkpoint_repo_or_dir):
            return checkpoint_repo_or_dir

        # Lower-case only the folder derived from the repo id: lower-casing the
        # whole path would corrupt a home directory containing uppercase
        # characters on case-sensitive filesystems.
        model_path = f"models--{checkpoint_repo_or_dir.replace('/', '--')}".lower()
        return os.path.join(os.path.expanduser('~'), "bambara_tts", model_path)

    @staticmethod
    def init_paths(local_dir: str) -> dict:
        """
        Initializes paths to various model components based on the local directory.

        Args:
            local_dir: The local directory where model components are stored.

        Returns:
            A dictionary with keys as component names and values as file paths.
        """
        components = ['model.pth', 'config.json', 'vocab.json', 'dvae.pth', 'mel_stats.pth']
        return {name: os.path.join(local_dir, name) for name in components}

    def text_to_speech(
            self,
            text: str,
            speaker_reference_wav_path: Optional[str] = None,
            temperature: Optional[float] = 0.1,
            enable_text_splitting: bool = False
    ) -> Tuple[int, torch.Tensor]:
        """
        Converts text into speech audio.

        Args:
            text: The input text to be converted into speech.
            speaker_reference_wav_path: A path to a reference WAV file for the speaker.
                When None, `DEFAULT_SPEAKER_REFERENCE` is used.
            temperature: The temperature parameter for sampling.
            enable_text_splitting: Flag to enable or disable text splitting.

        Returns:
            A tuple containing the sampling rate and the generated audio tensor
            (on CPU, with a leading dimension added by `unsqueeze(0)`).
        """
        if speaker_reference_wav_path is None:
            speaker_reference_wav_path = self.DEFAULT_SPEAKER_REFERENCE
            self.log(f"Using default speaker reference {speaker_reference_wav_path}.")

        self.log("Computing speaker latents...")
        gpt_cond_latent, speaker_embedding = self.model.get_conditioning_latents(
            audio_path=[speaker_reference_wav_path]
        )

        self.log("Starting inference...")
        # perf_counter is monotonic, so the measured duration cannot go negative
        start_time = time.perf_counter()
        out = self.model.inference(
            text,
            self.language_code,
            gpt_cond_latent,
            speaker_embedding,
            temperature=temperature,
            enable_text_splitting=enable_text_splitting
        )
        elapsed = time.perf_counter() - start_time

        audio = torch.tensor(out["wav"]).unsqueeze(0).cpu()
        sampling_rate = int(self.config.model_args.output_sample_rate)
        self.log(f"Speech generated in {elapsed:.2f} seconds.")

        return sampling_rate, audio

    def log(self, message: str):
        """
        Logs a message to the console with a uniform format.

        Args:
            message: The message to be logged.
        """
        print(f"[BambaraTTS] {message}")

    def log_tokenizer(self):
        """
        Logs the tokenizer information.
        """
        self.log(f"Tokenizer: {self.model.tokenizer}")