"""Text-to-speech helpers built on a Coqui XTTS model.

Importing this module loads the XTTS checkpoint and the pre-computed speaker
conditioning from disk; ``get_audio`` is the public entry point.
"""
import os
import pickle
import re

import nltk
import numpy as np
import torch
import torchaudio
from nltk.tokenize import sent_tokenize
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

# Sample rate handed to torchaudio.save for the generated wav.
# NOTE(review): presumably the XTTS output rate -- confirm against the model config.
_OUTPUT_SAMPLE_RATE = 24000

# Matches http/https links up to the next whitespace character.
_URL_PATTERN = re.compile(r'http[s]?://\S+')

# Punctuation that may be glued to the end of a link and must survive its
# replacement so that sentence boundaries are not lost.
_TRAILING_PUNCTUATION = '.,;:!?'

# language -> (spoken replacement for links, maximum characters per sentence).
# NOTE(review): the limits look like per-language XTTS character limits -- confirm.
_LANGUAGE_SETTINGS = {
    'en': ('the following link', 250),
    'es': ('el siguiente link', 239),
}
# Any language other than 'en'/'es' falls back to the Portuguese settings.
_DEFAULT_LANGUAGE_SETTINGS = ('o seguinte link', 203)


def _load_array(filename):
    """
    Opens a pickled file and returns its content. Used for the stored speaker
    latent and speaker embedding files.

    :param filename: path of the pickled file
    :return: the unpickled object
    """
    # SECURITY: pickle.load can execute arbitrary code; only use this with
    # trusted, locally generated files.
    with open(filename, 'rb') as f:
        return pickle.load(f)


os.environ['COQUI_TOS_AGREED'] = '1'

# Sentence tokenizer data used by sent_tokenize
nltk.download('punkt')

# Model used to generate audio based on a sample
model_path = "../tts_model"
config = XttsConfig()
config.load_json(os.path.join(model_path, "config.json"))
model = Xtts.init_from_config(config)

# The device is chosen before loading the checkpoint so that DeepSpeed is only
# requested when CUDA is available; the CPU fallback below would otherwise be
# unusable. NOTE(review): assumes DeepSpeed inference requires CUDA -- confirm.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.load_checkpoint(
    config,
    checkpoint_path=os.path.join(model_path, "model.pth"),
    vocab_path=os.path.join(model_path, "vocab.json"),
    eval=True,
    use_deepspeed=torch.cuda.is_available(),
)
model.to(device)

# Speaker latent
path_latents = 'assets/gpt_cond_latent.npy'
gpt_cond_latent = _load_array(path_latents)

# Speaker embedding
path_embedding = 'assets/speaker_embedding.npy'
speaker_embedding = _load_array(path_embedding)


def get_audio(text: str, language: str = 'es', saving_path: str = 'output') -> None:
    """
    Creates an audio file with the given text.

    :param text: text to convert to audio
    :param language: 'es', 'en' or 'pt', language used for the audio file
    :param saving_path: path of the audio file without extension, '.wav' is appended
    :return: None
    """
    _save_audio(text, language, saving_path)


def _save_audio(text: str, language: str, path_audio: str) -> None:
    """
    Splits the text into sentences, cleans them and creates an audio for each one, then
    concatenates all the audios and saves them into '<path_audio>.wav'.

    NOTE(review): a text that produces no sentences leaves nothing to concatenate and
    torch.cat will presumably reject the empty list -- confirm the desired handling.

    :param text: input text
    :param language: language used in the audio
    :param path_audio: saving path of the audio, without the '.wav' extension
    :return: None
    """
    # Split the text into sentences and clean it
    sentences = _get_clean_text(text, language)

    # Get the voice of each sentence
    audio_segments = [torch.tensor(_get_voice(sentence, language)) for sentence in sentences]

    # Concatenate and save all audio segments
    concatenated_audio = torch.cat(audio_segments, dim=0)
    torchaudio.save(f'{path_audio}.wav', concatenated_audio.unsqueeze(0), _OUTPUT_SAMPLE_RATE)


def _get_voice(sentence: str, language: str) -> np.ndarray:
    """
    Runs the model on a single sentence using the module level speaker latent and
    speaker embedding.

    :param sentence: input sentence
    :param language: language used in the audio
    :return: the 'wav' entry of the model output
    """
    out = model.inference(
        sentence,
        language=language,
        gpt_cond_latent=gpt_cond_latent,
        speaker_embedding=speaker_embedding,
        temperature=0.1
    )
    return out['wav']


def _replace_links(text: str, placeholder: str) -> str:
    """
    Replaces every link in the text with a spoken placeholder, keeping punctuation that
    was attached to the end of the link (e.g. the period that closes a sentence).

    :param text: input text
    :param placeholder: words used instead of each link
    :return: text without links
    """
    def _substitute(match: re.Match) -> str:
        url = match.group(0)
        # Length of the link once trailing punctuation is removed; the rest is kept
        kept = len(url.rstrip(_TRAILING_PUNCTUATION))
        return placeholder + url[kept:]

    return _URL_PATTERN.sub(_substitute, text)


def _get_clean_text(text: str, language: str) -> list[str]:
    """
    Splits the text into smaller sentences using nltk and removes links.

    :param text: input text for the audio
    :param language: language used for the audio ('es', 'en', 'pt'), any other value
        uses the Portuguese settings
    :return: list of sentences
    """
    placeholder, max_characters = _LANGUAGE_SETTINGS.get(language, _DEFAULT_LANGUAGE_SETTINGS)

    # Replace the links with a spoken placeholder
    clean_answer = _replace_links(text, placeholder)

    # Change the name from Bella to Bela
    clean_answer = clean_answer.replace('Bella', 'Bela')

    # Remove Florida and zipcode
    clean_answer = re.sub(r', FL \d+', "", clean_answer)

    # Split the answer into sentences with nltk and make sure they are not longer than
    # the maximum possible characters
    sentences = []
    for sentence in sent_tokenize(clean_answer):
        if len(sentence) > max_characters:
            sentences.extend(_split_sentence(sentence, max_characters))
        else:
            sentences.append(sentence)
    return sentences


def _split_sentence(sentence: str, max_characters: int) -> list[str]:
    """
    Used when the sentences are still too long. The split point is the nearest comma to
    the middle of the sentence, if there is no comma then the nearest space is used, or
    just the middle. Any piece that is still too long is split again, and empty pieces
    are dropped.

    :param sentence: sentence to be split
    :param max_characters: max number of characters a sentence can have
    :return: list of sentences
    """
    # Nothing left to split; also guarantees the recursion stops for any limit
    if len(sentence) < 2:
        return [sentence]

    # Candidate split points: commas first, spaces otherwise
    candidates = [i for i, c in enumerate(sentence) if c == ',']
    if not candidates:
        candidates = [i for i, c in enumerate(sentence) if c == ' ']

    middle = len(sentence) // 2
    if candidates:
        # Nearest candidate to the middle; the separator itself is dropped and the
        # surrounding whitespace is stripped instead of assuming ', ' was used
        split_point = min(candidates, key=lambda i: abs(i - middle))
        left = sentence[:split_point].rstrip()
        right = sentence[split_point + 1:].lstrip()
    else:
        # No commas or spaces, split it in the middle
        left, right = sentence[:middle], sentence[middle:]

    sentences = []
    for piece in (left, right):
        if not piece:
            continue
        if len(piece) > max_characters:
            sentences.extend(_split_sentence(piece, max_characters))
        else:
            sentences.append(piece)
    return sentences