SoniTranslate_CPU

Running

File size: 51,380 Bytes

b152010

from gtts import gTTS
import edge_tts, asyncio, json, glob # noqa
from tqdm import tqdm
import librosa, os, re, torch, gc, subprocess # noqa
from .language_configuration import (
    fix_code_language,
    BARK_VOICES_LIST,
    VITS_VOICES_LIST,
)
from .utils import (
    download_manager,
    create_directories,
    copy_files,
    rename_file,
    remove_directory_contents,
    remove_files,
    run_command,
)
import numpy as np
from typing import Any, Dict
from pathlib import Path
import soundfile as sf
import platform
import logging
import traceback
from .logging_setup import logger


class TTS_OperationError(Exception):
    """Error raised when a TTS operation does not finish successfully.

    The description is stored on ``message`` and also handed to the base
    ``Exception`` so that ``str(error)`` yields the same text.
    """

    def __init__(self, message="The operation did not complete successfully."):
        super().__init__(message)
        self.message = message


def verify_saved_file_and_size(filename):
    """Check that ``filename`` exists on disk and is not empty.

    Returns ``None`` when the file is present with a non-zero size.

    Raises:
        TTS_OperationError: if the path does not exist, or exists with a
            size of zero bytes.
    """
    was_saved = os.path.exists(filename)
    if was_saved and os.path.getsize(filename) != 0:
        return

    if not was_saved:
        raise TTS_OperationError(f"File '{filename}' was not saved.")

    raise TTS_OperationError(
        f"File '{filename}' has a zero size. "
        "Related to incorrect TTS for the target language"
    )


def error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename):
    """Write substitute audio for a segment whose primary TTS failed.

    The original ``error`` is logged, then gTTS is used as an auxiliary
    engine to synthesize ``segment["text"]`` into ``filename`` (OGG/Vorbis).
    If that also fails, silent audio lasting ``segment["end"] -
    segment["start"]`` seconds is written instead.

    Args:
        error: the exception raised by the primary TTS engine.
        segment: dict read for "text", "tts_name", "start" and "end".
        TRANSLATE_AUDIO_TO: target language, passed to
            ``fix_code_language`` before being given to gTTS.
        filename: destination path of the OGG file.

    Raises:
        TTS_OperationError: if even the silent replacement file is missing
            or empty after being written.
    """
    traceback.print_exc()
    logger.error(f"Error: {str(error)}")
    try:
        from tempfile import TemporaryFile

        tts = gTTS(segment["text"], lang=fix_code_language(TRANSLATE_AUDIO_TO))
        # The context manager closes the temporary file even when gTTS or
        # soundfile raises part-way through.
        with TemporaryFile() as buffer:
            tts.write_to_fp(buffer)
            # Rewind so soundfile reads from the beginning of the data.
            buffer.seek(0)
            audio_data, samplerate = sf.read(buffer)

        sf.write(
            filename, audio_data, samplerate, format="ogg", subtype="vorbis"
        )

        logger.warning(
            'TTS auxiliary will be utilized '
            f'rather than TTS: {segment["tts_name"]}'
        )
        verify_saved_file_and_size(filename)
    except Exception as fallback_error:
        # A distinct name keeps the ``error`` parameter from being shadowed.
        logger.critical(f"Error: {str(fallback_error)}")
        sample_rate_aux = 22050
        duration = float(segment["end"]) - float(segment["start"])
        # Clamp at zero: a negative length would make np.zeros raise inside
        # this last-resort path.
        num_samples = max(0, int(sample_rate_aux * duration))
        data = np.zeros(num_samples, dtype=np.float32)
        sf.write(
            filename, data, sample_rate_aux, format="ogg", subtype="vorbis"
        )
        logger.error("Audio will be replaced -> [silent audio].")
        verify_saved_file_and_size(filename)


def pad_array(array, sr):
    """Trim near-silent samples from both ends, keeping a 0.1 s margin.

    Samples whose absolute value exceeds 0.001 are treated as signal. The
    returned slice starts ``int(0.1 * sr)`` samples before the first such
    sample and ends the same distance after the last one, clipped to the
    array bounds.

    Args:
        array: audio samples as a list or a NumPy array.
        sr: sample rate used to size the margin.

    Returns:
        The trimmed array, or the (possibly converted) input unchanged when
        no sample exceeds the threshold or the trimming itself fails.

    Raises:
        ValueError: if the input has no samples.
    """
    samples = np.array(array) if isinstance(array, list) else array

    if samples.shape[0] == 0:
        raise ValueError("The generated audio does not contain any data")

    audible = np.nonzero(np.abs(samples) > 0.001)[0]
    if audible.size == 0:
        logger.debug(f"No valid indices: {samples}")
        return samples

    try:
        margin = int(0.1 * sr)
        first = max(0, audible[0] - margin)
        last = min(len(samples), audible[-1] + 1 + margin)
        return samples[first:last]
    except Exception as error:
        logger.error(str(error))
        return samples


# =====================================
# EDGE TTS
# =====================================


def edge_tts_voices_list():
    """Return the Edge TTS voices formatted as ``"<name>-<gender>"``.

    The ``edge-tts --list-voices`` command is tried first and its
    ``Name: `` / ``Gender: `` lines are parsed. When that yields nothing,
    ``edge_tts.list_voices()`` is awaited on a private event loop and the
    ``ShortName`` / ``Gender`` fields are used, sorted alphabetically.

    Returns:
        A list of voice identifiers; empty if neither method produced any.
    """
    try:
        completed_process = subprocess.run(
            ["edge-tts", "--list-voices"], capture_output=True, text=True
        )
        lines = completed_process.stdout.strip().split("\n")
    except Exception as error:
        logger.debug(str(error))
        lines = []

    voices = []
    # ``voice_entry`` stays None until a "Name: " line is seen, so a stray
    # "Gender: " line cannot hit an undefined variable.
    voice_entry = None
    for line in lines:
        if line.startswith("Name: "):
            voice_entry = {"Name": line.split(": ")[1]}
        elif line.startswith("Gender: ") and voice_entry is not None:
            voice_entry["Gender"] = line.split(": ")[1]
            voices.append(voice_entry)
            voice_entry = None

    formatted_voices = [
        f"{entry['Name']}-{entry['Gender']}" for entry in voices
    ]

    if not formatted_voices:
        logger.warning(
            "The list of Edge TTS voices could not be obtained, "
            "switching to an alternative method"
        )
        loop = asyncio.new_event_loop()
        try:
            tts_voice_list = loop.run_until_complete(edge_tts.list_voices())
        finally:
            # Release the loop's resources whether or not the call succeeded.
            loop.close()
        formatted_voices = sorted(
            [f"{v['ShortName']}-{v['Gender']}" for v in tts_voice_list]
        )

    if not formatted_voices:
        logger.error("Can't get EDGE TTS - list voices")

    return formatted_voices


def segments_egde_tts(filtered_edge_segments, TRANSLATE_AUDIO_TO, is_gui):
    """Synthesize every segment with Edge TTS into ``audio/<start>.ogg``.

    Each segment is first rendered to an MP3 next to the target file, then
    trimmed with ``pad_array`` and re-encoded as OGG/Vorbis. Any failure for
    a segment is delegated to ``error_handling_in_tts``.

    Args:
        filtered_edge_segments: dict whose "segments" list holds dicts with
            "text", "start" and "tts_name".
        TRANSLATE_AUDIO_TO: target language, only used by the fallback.
        is_gui: when true the ``edge_tts`` library is called in-process;
            otherwise the ``edge-tts`` command-line tool is executed.
    """
    for segment in tqdm(filtered_edge_segments["segments"]):
        text = segment["text"]
        start = segment["start"]
        tts_name = segment["tts_name"]

        # make the tts audio
        filename = f"audio/{start}.ogg"
        temp_file = filename[:-3] + "mp3"

        logger.info(f"{text} >> {filename}")
        try:
            if is_gui:
                asyncio.run(
                    edge_tts.Communicate(
                        text, "-".join(tts_name.split("-")[:-1])
                    ).save(temp_file)
                )
            else:
                voice = tts_name.replace("-Male", "").replace("-Female", "")
                # Pass an argument list (no shell string) so quotes or other
                # special characters in the text cannot break or inject
                # arguments. The ``--opt=value`` form also keeps text that
                # starts with "-" from being parsed as an option.
                subprocess.run(
                    [
                        "edge-tts",
                        f"--text={text}",
                        f"--voice={voice}",
                        f"--write-media={temp_file}",
                    ],
                    check=True,
                    capture_output=True,
                )
            verify_saved_file_and_size(temp_file)

            data, sample_rate = sf.read(temp_file)
            data = pad_array(data, sample_rate)

            # Save file
            sf.write(
                file=filename,
                samplerate=sample_rate,
                data=data,
                format="ogg",
                subtype="vorbis",
            )
            verify_saved_file_and_size(filename)

        except Exception as error:
            error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename)


# =====================================
# BARK TTS
# =====================================


def segments_bark_tts(
    filtered_bark_segments, TRANSLATE_AUDIO_TO, model_id_bark="suno/bark-small"
):
    """Synthesize every segment with a Bark model into ``audio/<start>.ogg``.

    The device is read from the ``SONITR_DEVICE`` environment variable;
    float16 weights and the BetterTransformer conversion are used only when
    it equals "cuda". Any failure for a segment (including preparing its
    inputs) is delegated to ``error_handling_in_tts``.

    Args:
        filtered_bark_segments: dict whose "segments" list holds dicts with
            "text", "start" and "tts_name" (a key of ``BARK_VOICES_LIST``).
        TRANSLATE_AUDIO_TO: target language, only used by the fallback.
        model_id_bark: model identifier given to ``from_pretrained``.
    """
    from transformers import AutoProcessor, BarkModel
    from optimum.bettertransformer import BetterTransformer

    device = os.environ.get("SONITR_DEVICE")
    torch_dtype_env = torch.float16 if device == "cuda" else torch.float32

    # load model bark (moved to the device once)
    model = BarkModel.from_pretrained(
        model_id_bark, torch_dtype=torch_dtype_env
    ).to(device)
    processor = AutoProcessor.from_pretrained(
        model_id_bark, return_tensors="pt"
    )
    if device == "cuda":
        # convert to bettertransformer
        model = BetterTransformer.transform(model, keep_original_model=False)
    sampling_rate = model.generation_config.sample_rate

    for segment in tqdm(filtered_bark_segments["segments"]):
        text = segment["text"]
        start = segment["start"]
        tts_name = segment["tts_name"]

        # make the tts audio
        filename = f"audio/{start}.ogg"
        logger.info(f"{text} >> {filename}")
        try:
            # Preparing the inputs is inside the try so an unknown voice
            # preset or a processor error falls back per segment instead of
            # aborting the whole run.
            inputs = processor(
                text, voice_preset=BARK_VOICES_LIST[tts_name]
            ).to(device)

            # Infer
            with torch.inference_mode():
                speech_output = model.generate(
                    **inputs,
                    do_sample=True,
                    fine_temperature=0.4,
                    coarse_temperature=0.8,
                    pad_token_id=processor.tokenizer.pad_token_id,
                )
            # Save file
            data_tts = pad_array(
                speech_output.cpu().numpy().squeeze().astype(np.float32),
                sampling_rate,
            )
            sf.write(
                file=filename,
                samplerate=sampling_rate,
                data=data_tts,
                format="ogg",
                subtype="vorbis",
            )
            verify_saved_file_and_size(filename)
        except Exception as error:
            error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename)
        gc.collect()
        torch.cuda.empty_cache()

    # Drop the references before collecting so the memory can be reclaimed.
    del processor
    del model
    gc.collect()
    torch.cuda.empty_cache()


# =====================================
# VITS TTS
# =====================================


def uromanize(input_string):
    """Convert non-Roman strings to Roman using the `uroman` perl package.

    If ``./uroman`` does not exist, the repository is cloned with git first.
    The text is then piped through ``perl ./uroman/bin/uroman.pl``.

    Args:
        input_string: text to romanize.

    Returns:
        The script output with a single trailing newline removed.

    Raises:
        ValueError: if cloning the repository or running the perl script
            exits with a non-zero status.
    """
    if not os.path.exists("./uroman"):
        logger.info(
            "Clonning repository uroman https://github.com/isi-nlp/uroman.git"
            " for romanize the text"
        )
        clone = subprocess.run(
            ["git", "clone", "https://github.com/isi-nlp/uroman.git"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        # Report a failed clone directly instead of letting the perl call
        # fail later with a less helpful message.
        if clone.returncode != 0:
            raise ValueError(
                f"Error {clone.returncode}: "
                f"{clone.stderr.decode(errors='replace')}"
            )
    script_path = os.path.join("./uroman", "bin", "uroman.pl")

    # Execute the perl command
    process = subprocess.run(
        ["perl", script_path],
        input=input_string.encode(),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )

    if process.returncode != 0:
        raise ValueError(
            f"Error {process.returncode}: "
            f"{process.stderr.decode(errors='replace')}"
        )

    # Strip only an actual trailing newline; never drop a real character.
    output = process.stdout.decode()
    return output[:-1] if output.endswith("\n") else output


def segments_vits_tts(filtered_vits_segments, TRANSLATE_AUDIO_TO):
    """Synthesize every segment with a VITS model into ``audio/<start>.ogg``.

    Segments are processed sorted by "tts_name" so each model/tokenizer pair
    is loaded only when the voice changes. Text is romanized with
    ``uromanize`` when the tokenizer reports ``is_uroman``. Any failure for
    a segment (tokenization, romanization, inference or saving) is delegated
    to ``error_handling_in_tts``.

    Args:
        filtered_vits_segments: dict whose "segments" list holds dicts with
            "text", "start" and "tts_name" (a key of ``VITS_VOICES_LIST``).
        TRANSLATE_AUDIO_TO: target language, only used by the fallback.
    """
    from transformers import VitsModel, AutoTokenizer

    # Sorting the segments by 'tts_name'
    sorted_segments = sorted(
        filtered_vits_segments["segments"], key=lambda x: x["tts_name"]
    )
    logger.debug(sorted_segments)

    # Pre-bind so the final cleanup is valid even when there are no segments.
    model = None
    tokenizer = None
    sampling_rate = None
    model_name_key = None
    for segment in tqdm(sorted_segments):
        text = segment["text"]
        start = segment["start"]
        tts_name = segment["tts_name"]

        if tts_name != model_name_key:
            model_name_key = tts_name
            model = VitsModel.from_pretrained(VITS_VOICES_LIST[tts_name])
            tokenizer = AutoTokenizer.from_pretrained(
                VITS_VOICES_LIST[tts_name]
            )
            sampling_rate = model.config.sampling_rate

        # make the tts audio
        filename = f"audio/{start}.ogg"
        logger.info(f"{text} >> {filename}")
        try:
            # Input preparation is inside the try so a romanization or
            # tokenizer failure falls back per segment.
            if tokenizer.is_uroman:
                romanize_text = uromanize(text)
                logger.debug(f"Romanize text: {romanize_text}")
                inputs = tokenizer(romanize_text, return_tensors="pt")
            else:
                inputs = tokenizer(text, return_tensors="pt")

            # Infer
            with torch.no_grad():
                speech_output = model(**inputs).waveform

            data_tts = pad_array(
                speech_output.cpu().numpy().squeeze().astype(np.float32),
                sampling_rate,
            )
            # Save file
            sf.write(
                file=filename,
                samplerate=sampling_rate,
                data=data_tts,
                format="ogg",
                subtype="vorbis",
            )
            verify_saved_file_and_size(filename)
        except Exception as error:
            error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename)
        gc.collect()
        torch.cuda.empty_cache()

    # Drop the references before collecting so the memory can be reclaimed.
    del tokenizer
    del model
    gc.collect()
    torch.cuda.empty_cache()


# =====================================
# Coqui XTTS
# =====================================


def coqui_xtts_voices_list():
    """List the reference-voice audio files available in ``_XTTS_``.

    The result always starts with ``"_XTTS_/AUTOMATIC.wav"``, followed by
    every regular file in the folder ending in .wav, .mp3, .ogg or .m4a,
    except names starting with ``AUTOMATIC_SPEAKER_<digits>.wav``.
    """
    main_folder = "_XTTS_"
    pattern_coqui = re.compile(r".+\.(wav|mp3|ogg|m4a)$")
    pattern_automatic_speaker = re.compile(r"AUTOMATIC_SPEAKER_\d+\.wav$")

    voices = ["_XTTS_/AUTOMATIC.wav"]
    for entry in os.listdir(main_folder):
        # Directories and other non-regular entries are never voices.
        if not os.path.isfile(os.path.join(main_folder, entry)):
            continue
        if pattern_coqui.match(entry) is None:
            continue
        # Skip AUTOMATIC_SPEAKER_00.wav, AUTOMATIC_SPEAKER_01.wav, etc.
        if pattern_automatic_speaker.match(entry):
            continue
        voices.append("_XTTS_/" + entry)

    return voices


def seconds_to_hhmmss_ms(seconds):
    """Format a duration in seconds as ``HH:MM:SS.mmm``.

    The value is rounded to the nearest millisecond before being split, so
    binary floating-point error cannot drop a millisecond (for example
    ``1.001`` yields ``00:00:01.001`` rather than ``00:00:01.000``).

    Args:
        seconds: non-negative duration as an int or float.

    Returns:
        The formatted timestamp string.
    """
    total_ms = int(round(float(seconds) * 1000))
    total_seconds, milliseconds = divmod(total_ms, 1000)
    total_minutes, secs = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return "%02d:%02d:%02d.%03d" % (hours, minutes, secs, milliseconds)


def audio_trimming(audio_path, destination, start, end):
    """Cut ``audio_path`` with ffmpeg and save it as ``<name>_trim.wav``.

    Args:
        audio_path: path of the source audio file.
        destination: output directory; when falsy, the directory of
            ``audio_path`` is used.
        start: trim start, in seconds (int/float) or as a string already
            formatted for ffmpeg; ``None`` or "" means "from the beginning".
        end: trim end, same formats as ``start``; ``None`` or "" means
            "until the end".

    Returns:
        The path of the trimmed WAV file.
    """
    if isinstance(start, (int, float)):
        start = seconds_to_hhmmss_ms(start)
    if isinstance(end, (int, float)):
        end = seconds_to_hhmmss_ms(end)

    file_directory = destination if destination else os.path.dirname(audio_path)

    file_name = os.path.splitext(os.path.basename(audio_path))[0]
    output_path = os.path.join(file_directory, f"{file_name}_trim.wav")

    # Only emit -ss / -to for bounds that were actually given; interpolating
    # a missing bound would produce an invalid "-ss None" / "-to None".
    time_options = ""
    if start not in (None, ""):
        time_options += f" -ss {start}"
    if end not in (None, ""):
        time_options += f" -to {end}"

    # -t (duration from -ss) | -to (time stop) | -af silenceremove=1:0:-50dB (remove silence)
    command = f'ffmpeg -y -loglevel error -i "{audio_path}"{time_options} -acodec pcm_s16le -f wav "{output_path}"'
    run_command(command)

    return output_path


def convert_to_xtts_good_sample(audio_path: str = "", destination: str = ""):
    """Convert audio to a mono, 22050 Hz, 16-bit WAV using ffmpeg.

    The output is named ``<name>_good_sample.wav`` and is placed in
    ``destination`` when given, otherwise next to ``audio_path``.

    Returns:
        The path of the converted file.
    """
    target_dir = destination or os.path.dirname(audio_path)
    stem, _ = os.path.splitext(os.path.basename(audio_path))
    mono_path = os.path.join(target_dir, f"{stem}_good_sample.wav")

    command = (
        "ffmpeg -y -loglevel error "
        f'-i "{audio_path}" '
        "-ac 1 -ar 22050 -sample_fmt s16 "
        f'-f wav "{mono_path}"'
    )
    run_command(command)

    return mono_path


def sanitize_file_name(file_name):
    """Return ``file_name`` with unsafe characters replaced by underscores.

    The name is NFKD-normalized first, which splits accented characters into
    a base character plus combining marks. Every character that is not a
    word character, whitespace, a dot or a hyphen is then replaced with
    ``_``.
    """
    import unicodedata

    decomposed = unicodedata.normalize("NFKD", file_name)
    return re.sub(r"[^\w\s.-]", "_", decomposed)


def create_wav_file_vc(
    sample_name="",  # name final file
    audio_wav="",  # path
    start=None,  # trim start
    end=None,  # trim end
    output_final_path="_XTTS_",
    get_vocals_dereverb=True,
):
    """Build a reference-voice WAV sample and copy it to a folder.

    Steps: optionally trim ``audio_wav`` (when ``start`` or ``end`` is
    truthy), try to isolate the vocals with ``process_uvr_task`` (a failure
    is logged and the unprocessed audio is used), convert the result with
    ``convert_to_xtts_good_sample``, rename it to ``<sample_name>.wav`` and
    copy it into ``output_final_path``.

    Args:
        sample_name: base name of the final file; "default_name" if empty.
        audio_wav: a path string, or an object exposing the path as ``.name``.
        start: trim start passed to ``audio_trimming``.
        end: trim end passed to ``audio_trimming``.
        output_final_path: folder receiving the final sample.
        get_vocals_dereverb: forwarded as ``dereverb`` to the vocal isolation.

    Returns:
        The path of the final sample.

    Raises:
        Exception: if the final sample does not exist after copying.
    """
    sample_name = sanitize_file_name(sample_name or "default_name")
    if not isinstance(audio_wav, str):
        audio_wav = audio_wav.name

    # Trimmed audio is written under ./clean_song_output
    output_dir = os.path.join(".", "clean_song_output")

    # Use the complete file unless a trim range was requested.
    audio_segment = audio_wav
    if start or end:
        audio_segment = audio_trimming(audio_wav, output_dir, start, end)

    from .mdx_net import process_uvr_task

    try:
        _, _, _, _, audio_segment = process_uvr_task(
            orig_song_path=audio_segment,
            main_vocals=True,
            dereverb=get_vocals_dereverb,
        )
    except Exception as error:
        logger.error(str(error))

    converted = convert_to_xtts_good_sample(audio_segment)

    sample_name = f"{sample_name}.wav"
    renamed = rename_file(converted, sample_name)
    copy_files(renamed, output_final_path)

    final_sample = os.path.join(output_final_path, sample_name)
    if not os.path.exists(final_sample):
        raise Exception(f"Error wav: {final_sample}")

    logger.info(final_sample)
    return final_sample


def create_new_files_for_vc(
    speakers_coqui,
    segments_base,
    dereverb_automatic=True
):
    """Create the automatic reference WAV for each speaker that needs one.

    The ``./clean_song_output`` folder is emptied first. For every speaker
    whose first considered segment uses the "_XTTS_/AUTOMATIC.wav" voice and
    that has no ``_XTTS_/AUTOMATIC_<speaker>.wav`` yet, a sample is cut from
    "audio.wav": preferably a segment lasting between 7 and 12 seconds
    (with one second removed at each end), otherwise the start of the first
    segment limited to between 2 and 9 seconds.

    Args:
        speakers_coqui: iterable of speaker identifiers.
        segments_base: list of segment dicts with "speaker", "tts_name",
            "start", "end" and "text".
        dereverb_automatic: forwarded to ``create_wav_file_vc``.
    """
    # before function delete automatic delete_previous_automatic
    output_dir = os.path.join(".", "clean_song_output")  # remove content
    remove_directory_contents(output_dir)

    for speaker in speakers_coqui:
        filtered_speaker = [
            segment
            for segment in segments_base
            if segment["speaker"] == speaker
        ]
        # A speaker without segments has nothing to sample from; skipping it
        # avoids an IndexError on the lookup below.
        if not filtered_speaker:
            logger.debug(f"No segments for speaker {speaker}")
            continue
        # With more than four segments the first one is not considered.
        if len(filtered_speaker) > 4:
            filtered_speaker = filtered_speaker[1:]
        if filtered_speaker[0]["tts_name"] != "_XTTS_/AUTOMATIC.wav":
            continue

        name_automatic_wav = f"AUTOMATIC_{speaker}"
        if os.path.exists(f"_XTTS_/{name_automatic_wav}.wav"):
            logger.info(f"WAV automatic {speaker} exists")
            continue

        # create wav
        wav_ok = False
        for seg in filtered_speaker:
            duration = float(seg["end"]) - float(seg["start"])
            if 7.0 < duration < 12.0:
                logger.info(
                    f'Processing segment: {seg["start"]}, {seg["end"]}, {seg["speaker"]}, {duration}, {seg["text"]}'
                )
                create_wav_file_vc(
                    sample_name=name_automatic_wav,
                    audio_wav="audio.wav",
                    start=(float(seg["start"]) + 1.0),
                    end=(float(seg["end"]) - 1.0),
                    get_vocals_dereverb=dereverb_automatic,
                )
                wav_ok = True
                break

        if not wav_ok:
            logger.info("Taking the first segment")
            seg = filtered_speaker[0]
            logger.info(
                f'Processing segment: {seg["start"]}, {seg["end"]}, {seg["speaker"]}, {seg["text"]}'
            )
            max_duration = float(seg["end"]) - float(seg["start"])
            max_duration = max(2.0, min(max_duration, 9.0))

            create_wav_file_vc(
                sample_name=name_automatic_wav,
                audio_wav="audio.wav",
                start=(float(seg["start"])),
                end=(float(seg["start"]) + max_duration),
                get_vocals_dereverb=dereverb_automatic,
            )


def segments_coqui_tts(
    filtered_coqui_segments,
    TRANSLATE_AUDIO_TO,
    model_id_coqui="tts_models/multilingual/multi-dataset/xtts_v2",
    speakers_coqui=None,
    delete_previous_automatic=True,
    dereverb_automatic=True,
    emotion=None,
):
    """XTTS
    Install:
    pip install -q TTS==0.21.1
    pip install -q numpy==1.23.5

    Notes:
    - tts_name is the wav|mp3|ogg|m4a file for VC

    Synthesizes every segment into ``audio/<start>.ogg`` using the segment's
    reference voice file. Segments using "_XTTS_/AUTOMATIC.wav" are mapped
    to ``_XTTS_/AUTOMATIC_<speaker>.wav``, which is created beforehand by
    ``create_new_files_for_vc``. Any failure for a segment is delegated to
    ``error_handling_in_tts``.

    Args:
        filtered_coqui_segments: dict whose "segments" list holds dicts with
            "speaker", "text", "start" and "tts_name".
        TRANSLATE_AUDIO_TO: target language; converted with
            ``fix_code_language(..., syntax="coqui")``.
        model_id_coqui: model identifier given to ``TTS``.
        speakers_coqui: speakers needing reference files; ``None`` is
            treated as no speakers.
        delete_previous_automatic: remove existing automatic samples first.
        dereverb_automatic: forwarded to ``create_new_files_for_vc``.
        emotion: unused.

    Raises:
        TTS_OperationError: if the target language is not supported.
    """
    from TTS.api import TTS

    TRANSLATE_AUDIO_TO = fix_code_language(TRANSLATE_AUDIO_TO, syntax="coqui")
    supported_lang_coqui = [
        "zh-cn",
        "en",
        "fr",
        "de",
        "it",
        "pt",
        "pl",
        "tr",
        "ru",
        "nl",
        "cs",
        "ar",
        "es",
        "hu",
        "ko",
        "ja",
    ]
    if TRANSLATE_AUDIO_TO not in supported_lang_coqui:
        raise TTS_OperationError(
            f"'{TRANSLATE_AUDIO_TO}' is not a supported language for Coqui XTTS"
        )
    # Emotion and speed can only be used with Coqui Studio models. discontinued
    # emotions = ["Neutral", "Happy", "Sad", "Angry", "Dull"]

    # The default None must not be iterated below.
    if speakers_coqui is None:
        speakers_coqui = []

    if delete_previous_automatic:
        for spk in speakers_coqui:
            remove_files(f"_XTTS_/AUTOMATIC_{spk}.wav")

    directory_audios_vc = "_XTTS_"
    create_directories(directory_audios_vc)
    create_new_files_for_vc(
        speakers_coqui,
        filtered_coqui_segments["segments"],
        dereverb_automatic,
    )

    # Init TTS
    device = os.environ.get("SONITR_DEVICE")
    model = TTS(model_id_coqui).to(device)
    sampling_rate = 24000

    for segment in tqdm(filtered_coqui_segments["segments"]):
        speaker = segment["speaker"]
        text = segment["text"]
        start = segment["start"]
        tts_name = segment["tts_name"]
        if tts_name == "_XTTS_/AUTOMATIC.wav":
            tts_name = f"_XTTS_/AUTOMATIC_{speaker}.wav"

        # make the tts audio
        filename = f"audio/{start}.ogg"
        logger.info(f"{text} >> {filename}")
        try:
            # Infer
            wav = model.tts(
                text=text, speaker_wav=tts_name, language=TRANSLATE_AUDIO_TO
            )
            data_tts = pad_array(
                wav,
                sampling_rate,
            )
            # Save file
            sf.write(
                file=filename,
                samplerate=sampling_rate,
                data=data_tts,
                format="ogg",
                subtype="vorbis",
            )
            verify_saved_file_and_size(filename)
        except Exception as error:
            error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename)
        gc.collect()
        torch.cuda.empty_cache()

    # Drop the reference before collecting so the memory can be reclaimed.
    del model
    gc.collect()
    torch.cuda.empty_cache()


# =====================================
# PIPER TTS
# =====================================


def piper_tts_voices_list():
    """Return the Piper voice identifiers, each suffixed with " VITS-onnx".

    The ``voices.json`` catalog is obtained through ``download_manager``
    (target folder ``./PIPER_MODELS``) and its top-level keys are used as
    the voice identifiers.
    """
    catalog_path = download_manager(
        url="https://huggingface.co/rhasspy/piper-voices/resolve/main/voices.json",
        path="./PIPER_MODELS",
    )

    with open(catalog_path, "r", encoding="utf8") as handle:
        catalog = json.load(handle)

    return [voice_id + " VITS-onnx" for voice_id in catalog.keys()]


def replace_text_in_json(file_path, key_to_replace, new_text, condition=None):
    """Overwrite one top-level value of a JSON file, optionally conditionally.

    Args:
        file_path: path of the JSON file, rewritten in place.
        key_to_replace: top-level key whose value may be replaced.
        new_text: replacement value.
        condition: when truthy, the value is replaced only if it currently
            equals ``condition``; when falsy, the current value is compared
            with itself.

    The file is always written back (indent of 2), even if nothing changed.
    """
    with open(file_path, "r", encoding="utf-8") as source:
        content = json.load(source)

    if key_to_replace in content:
        current = content[key_to_replace]
        expected = condition if condition else current
        if current == expected:
            content[key_to_replace] = new_text

    # Write the (possibly modified) content back, indented for readability.
    with open(file_path, "w") as target:
        json.dump(content, target, indent=2)


def load_piper_model(
    model: str,
    data_dir: list,
    download_dir: str = "",
    update_voices: bool = False,
):
    """Load a Piper voice, downloading it first when it is not a local path.

    CUDA is requested only when onnxruntime reports a GPU device and the
    ``SONITR_DEVICE`` environment variable equals "cuda"; it is always
    disabled on Windows.

    Args:
        model: voice name, or path of an existing model file.
        data_dir: list of directories searched for downloaded voices.
        download_dir: when empty, ``data_dir[0]`` is the download folder;
            otherwise voices are searched in ``data_dir[0]/download_dir``.
        update_voices: forwarded to ``get_voices``.

    Returns:
        The loaded ``PiperVoice``.

    Raises:
        TTS_OperationError: if onnxruntime cannot be imported or queried.
    """
    from piper import PiperVoice
    from piper.download import ensure_voice_exists, find_voice, get_voices

    try:
        import onnxruntime as rt

        if rt.get_device() == "GPU" and os.environ.get("SONITR_DEVICE") == "cuda":
            logger.debug("onnxruntime device > GPU")
            cuda = True
        else:
            logger.info(
                "onnxruntime device > CPU"
            )  # try pip install onnxruntime-gpu
            cuda = False
    except Exception as error:
        raise TTS_OperationError(f"onnxruntime error: {str(error)}")

    # Disable CUDA in Windows
    if platform.system() == "Windows":
        logger.info("Employing CPU exclusivity with Piper TTS")
        cuda = False

    if not download_dir:
        # Download to first data directory by default
        download_dir = data_dir[0]
    else:
        data_dir = [os.path.join(data_dir[0], download_dir)]

    # Bound up-front so the load below is valid when ``model`` is already an
    # existing file and the download branch is skipped.
    # NOTE(review): PiperVoice.load is expected to fall back to
    # "<model>.json" when config_path is None - confirm for the installed
    # piper version.
    config = None

    # Download voice if file doesn't exist
    model_path = Path(model)
    if not model_path.exists():
        # Load voice info
        voices_info = get_voices(download_dir, update_voices=update_voices)

        # Resolve aliases for backwards compatibility with old voice names
        aliases_info: Dict[str, Any] = {}
        for voice_info in voices_info.values():
            for voice_alias in voice_info.get("aliases", []):
                aliases_info[voice_alias] = {"_is_alias": True, **voice_info}

        voices_info.update(aliases_info)
        ensure_voice_exists(model, data_dir, download_dir, voices_info)
        model, config = find_voice(model, data_dir)

        # Rewrite "phoneme_type" only when it holds "PhonemeType.ESPEAK".
        replace_text_in_json(
            config, "phoneme_type", "espeak", "PhonemeType.ESPEAK"
        )

    # Load voice
    voice = PiperVoice.load(model, config_path=config, use_cuda=cuda)

    return voice


def synthesize_text_to_audio_np_array(voice, text, synthesize_args):
    """Synthesize ``text`` with ``voice`` and return the audio as int16.

    Args:
        voice: object exposing ``synthesize_stream_raw(text, **kwargs)``
            that yields chunks of raw audio bytes.
        text: text to synthesize.
        synthesize_args: keyword arguments forwarded to the voice.

    Returns:
        A one-dimensional NumPy array of dtype int16 built from all chunks.
    """
    audio_stream = voice.synthesize_stream_raw(text, **synthesize_args)

    # Join once instead of repeatedly concatenating bytes, which copies the
    # growing buffer on every chunk.
    audio_data = b"".join(audio_stream)

    # Interpret the raw bytes as 16-bit samples.
    return np.frombuffer(audio_data, dtype=np.int16)


def segments_vits_onnx_tts(filtered_onnx_vits_segments, TRANSLATE_AUDIO_TO):
    """
    Install:
    pip install -q piper-tts==1.2.0 onnxruntime-gpu # for cuda118

    Synthesizes every segment with a Piper voice into ``audio/<start>.ogg``.
    Segments are processed sorted by "tts_name" so a voice is loaded only
    when it changes. Any failure during synthesis or saving of a segment is
    delegated to ``error_handling_in_tts``.
    """
    # Data directory to check for downloaded models: current directory
    data_dir = [str(Path.cwd())]
    download_dir = "PIPER_MODELS"
    # Download latest voices.json during startup
    update_voices = True

    synthesize_args = dict(
        speaker_id=None,
        length_scale=1.0,
        noise_scale=0.667,
        noise_w=0.8,
        sentence_silence=0.0,
    )

    # Group segments of the same voice together
    ordered_segments = sorted(
        filtered_onnx_vits_segments["segments"],
        key=lambda item: item["tts_name"],
    )
    logger.debug(ordered_segments)

    model_name_key = None
    for segment in tqdm(ordered_segments):
        _speaker = segment["speaker"]  # noqa
        text = segment["text"]
        start = segment["start"]
        # The UI suffix is not part of the Piper voice name
        tts_name = segment["tts_name"].replace(" VITS-onnx", "")

        if model_name_key != tts_name:
            model_name_key = tts_name
            model = load_piper_model(
                tts_name, data_dir, download_dir, update_voices
            )
            sampling_rate = model.config.sample_rate

        # make the tts audio
        filename = f"audio/{start}.ogg"
        logger.info(f"{text} >> {filename}")
        try:
            # Infer
            raw_audio = synthesize_text_to_audio_np_array(
                model, text, synthesize_args
            )
            data_tts = pad_array(raw_audio, sampling_rate)
            # Save file
            sf.write(
                file=filename,
                samplerate=sampling_rate,
                data=data_tts,
                format="ogg",
                subtype="vorbis",
            )
            verify_saved_file_and_size(filename)
        except Exception as error:
            error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename)
        gc.collect()
        torch.cuda.empty_cache()

    # ``model`` is unbound when there were no segments; that case is logged.
    try:
        del model
        gc.collect()
        torch.cuda.empty_cache()
    except Exception as error:
        logger.error(str(error))
        gc.collect()
        torch.cuda.empty_cache()


# =====================================
# OPENAI TTS
# =====================================


def segments_openai_tts(
    filtered_openai_tts_segments, TRANSLATE_AUDIO_TO
):
    """Synthesize every segment with the OpenAI speech API.

    Each segment is requested as WAV, interpreted as int16 samples with the
    first 240 samples discarded, trimmed with ``pad_array`` and written to
    ``audio/<start>.ogg`` at 24000 Hz. The "tts-1-hd" model is used when
    "HD" appears in the segment's "tts_name", otherwise "tts-1". Any failure
    for a segment is delegated to ``error_handling_in_tts``.
    """
    from openai import OpenAI

    client = OpenAI()
    sampling_rate = 24000

    for segment in tqdm(filtered_openai_tts_segments["segments"]):
        _speaker = segment["speaker"]  # noqa
        text = segment["text"].strip()
        start = segment["start"]
        tts_name = segment["tts_name"]

        # make the tts audio
        filename = f"audio/{start}.ogg"
        logger.info(f"{text} >> {filename}")

        try:
            # Request
            model_name = "tts-1-hd" if "HD" in tts_name else "tts-1"
            # Voice is the first word of tts_name minus its first character
            voice_name = tts_name.split()[0][1:]
            response = client.audio.speech.create(
                model=model_name,
                voice=voice_name,
                response_format="wav",
                input=text
            )

            audio_bytes = b"".join(response.iter_bytes(chunk_size=4096))
            speech_output = np.frombuffer(audio_bytes, dtype=np.int16)

            # Save file
            data_tts = pad_array(speech_output[240:], sampling_rate)
            sf.write(
                file=filename,
                samplerate=sampling_rate,
                data=data_tts,
                format="ogg",
                subtype="vorbis",
            )
            verify_saved_file_and_size(filename)

        except Exception as error:
            error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename)


# =====================================
# Select task TTS
# =====================================


def find_spkr(pattern, speaker_to_voice, segments):
    """Return the speakers whose voice matches ``pattern`` and who speak.

    Args:
        pattern: compiled regular expression applied with ``match`` to each
            voice name.
        speaker_to_voice: mapping of speaker identifier to voice name.
        segments: list of segment dicts carrying a "speaker" key.

    Returns:
        Speakers, in mapping order, whose voice matches and who appear in at
        least one segment.
    """
    matching = []
    for speaker, voice in speaker_to_voice.items():
        if not pattern.match(voice):
            continue
        for segment in segments:
            if segment["speaker"] == speaker:
                matching.append(speaker)
                break
    return matching


def filter_by_speaker(speakers, segments):
    """Keep only the segments spoken by one of ``speakers``.

    Returns:
        A dict with a single "segments" key holding the matching segments
        in their original order.
    """
    selected = []
    for segment in segments:
        if segment["speaker"] in speakers:
            selected.append(segment)
    return {"segments": selected}


def audio_segmentation_to_voice(
    result_diarize,
    TRANSLATE_AUDIO_TO,
    is_gui,
    tts_voice00,
    tts_voice01="",
    tts_voice02="",
    tts_voice03="",
    tts_voice04="",
    tts_voice05="",
    tts_voice06="",
    tts_voice07="",
    tts_voice08="",
    tts_voice09="",
    tts_voice10="",
    tts_voice11="",
    dereverb_automatic=True,
    model_id_bark="suno/bark-small",
    model_id_coqui="tts_models/multilingual/multi-dataset/xtts_v2",
    delete_previous_automatic=True,
):
    """Dispatch every diarized segment to the TTS engine of its speaker.

    The "audio" directory is emptied first. Each segment is temporarily
    tagged with a "tts_name" (the voice assigned to its speaker); the tag
    is removed again before returning. Segments without a "speaker" key
    are assigned "SPEAKER_00" in place.

    Args:
        result_diarize: Dict with a "segments" list of segment dicts.
        TRANSLATE_AUDIO_TO: Target language, forwarded to the TTS backends.
        is_gui: Forwarded to the Edge TTS backend.
        tts_voice00 .. tts_voice11: Voice name for SPEAKER_00..SPEAKER_11.
            The suffix / extension of the name selects the engine.
        dereverb_automatic: Forwarded to the Coqui backend.
        model_id_bark: Forwarded to the Bark backend.
        model_id_coqui: Forwarded to the Coqui backend.
        delete_previous_automatic: Forwarded to the Coqui backend.

    Returns:
        List of six speaker lists, in the order: edge, bark, vits, coqui,
        vits-onnx, openai.

    Raises:
        KeyError: If a segment's speaker is outside SPEAKER_00..SPEAKER_11.
    """
    remove_directory_contents("audio")

    # Speaker labels are "SPEAKER_NN", numbered like the voice parameters.
    voices_in_order = (
        tts_voice00,
        tts_voice01,
        tts_voice02,
        tts_voice03,
        tts_voice04,
        tts_voice05,
        tts_voice06,
        tts_voice07,
        tts_voice08,
        tts_voice09,
        tts_voice10,
        tts_voice11,
    )
    speaker_to_voice = {
        f"SPEAKER_{index:02d}": voice
        for index, voice in enumerate(voices_in_order)
    }

    all_segments = result_diarize["segments"]

    for segment in all_segments:
        # Segments the diarizer left unlabelled fall back to the first voice.
        if "speaker" not in segment:
            segment["speaker"] = "SPEAKER_00"
            logger.warning(
                "NO SPEAKER DETECT IN SEGMENT: First TTS will be used in the"
                f" segment time {segment['start'], segment['text']}"
            )
        segment["tts_name"] = speaker_to_voice[segment["speaker"]]

    # One pattern per engine; the order here fixes the order of the result.
    engine_patterns = (
        re.compile(r".*-(Male|Female)$"),        # edge
        re.compile(r".* BARK$"),                 # bark
        re.compile(r".* VITS$"),                 # vits
        re.compile(r".+\.(wav|mp3|ogg|m4a)$"),   # coqui (reference audio)
        re.compile(r".* VITS-onnx$"),            # vits-onnx
        re.compile(r".* OpenAI-TTS$"),           # openai
    )
    (
        speakers_edge,
        speakers_bark,
        speakers_vits,
        speakers_coqui,
        speakers_vits_onnx,
        speakers_openai_tts,
    ) = [
        find_spkr(engine_pattern, speaker_to_voice, all_segments)
        for engine_pattern in engine_patterns
    ]

    # Split the segments per engine before any synthesis starts.
    filtered_edge = filter_by_speaker(speakers_edge, all_segments)
    filtered_bark = filter_by_speaker(speakers_bark, all_segments)
    filtered_vits = filter_by_speaker(speakers_vits, all_segments)
    filtered_coqui = filter_by_speaker(speakers_coqui, all_segments)
    filtered_vits_onnx = filter_by_speaker(speakers_vits_onnx, all_segments)
    filtered_openai_tts = filter_by_speaker(speakers_openai_tts, all_segments)

    # Run each engine only when it has something to synthesize.
    if filtered_edge["segments"]:
        logger.info(f"EDGE TTS: {speakers_edge}")
        segments_egde_tts(filtered_edge, TRANSLATE_AUDIO_TO, is_gui)
    if filtered_bark["segments"]:
        logger.info(f"BARK TTS: {speakers_bark}")
        segments_bark_tts(filtered_bark, TRANSLATE_AUDIO_TO, model_id_bark)
    if filtered_vits["segments"]:
        logger.info(f"VITS TTS: {speakers_vits}")
        segments_vits_tts(filtered_vits, TRANSLATE_AUDIO_TO)
    if filtered_coqui["segments"]:
        logger.info(f"Coqui TTS: {speakers_coqui}")
        segments_coqui_tts(
            filtered_coqui,
            TRANSLATE_AUDIO_TO,
            model_id_coqui,
            speakers_coqui,
            delete_previous_automatic,
            dereverb_automatic,
        )
    if filtered_vits_onnx["segments"]:
        logger.info(f"PIPER TTS: {speakers_vits_onnx}")
        segments_vits_onnx_tts(filtered_vits_onnx, TRANSLATE_AUDIO_TO)
    if filtered_openai_tts["segments"]:
        logger.info(f"OpenAI TTS: {speakers_openai_tts}")
        segments_openai_tts(filtered_openai_tts, TRANSLATE_AUDIO_TO)

    # The temporary engine tag must not leak back to the caller.
    for segment in result_diarize["segments"]:
        segment.pop("tts_name", None)

    return [
        speakers_edge,
        speakers_bark,
        speakers_vits,
        speakers_coqui,
        speakers_vits_onnx,
        speakers_openai_tts,
    ]


def accelerate_segments(
    result_diarize,
    max_accelerate_audio,
    valid_speakers,
    acceleration_rate_regulation=False,
    folder_output="audio2",
):
    """Time-stretch each TTS clip so it fits its segment's time slot.

    For every segment the clip "audio/{start}.ogg" is read, a tempo factor
    (TTS duration / segment duration) is computed and clamped, and the
    result is written to "{folder_output}/audio/{start}.ogg" - either by a
    plain copy (factor 1.0 and OGG input) or through ffmpeg's `atempo`.

    Args:
        result_diarize: Dict with a "segments" list; each segment needs
            "start", "end" and "speaker" ("SPEAKER_NN").
        max_accelerate_audio: Upper bound for the tempo factor.
        valid_speakers: The six speaker lists returned by
            `audio_segmentation_to_voice` (edge, bark, vits, coqui,
            vits-onnx, openai).
        acceleration_rate_regulation: When true, factors >= 1.3 are relaxed
            by borrowing part of the silence before the next segment.
        folder_output: Root folder receiving the processed clips.

    Returns:
        Tuple `(audio_files, speakers_list)`: output paths and matching
        "TTS Speaker NN" labels (1-based), one entry per segment.
    """
    logger.info("Apply acceleration")

    # Only the Edge speakers are consulted below; unpacking all six still
    # validates that `valid_speakers` has the expected shape.
    (
        speakers_edge,
        speakers_bark,
        speakers_vits,
        speakers_coqui,
        speakers_vits_onnx,
        speakers_openai_tts
    ) = valid_speakers

    create_directories(f"{folder_output}/audio/")
    remove_directory_contents(f"{folder_output}/audio/")

    audio_files = []
    speakers_list = []

    segments = result_diarize["segments"]
    max_count_segments_idx = len(segments) - 1

    # tqdm wraps the list (not the enumerate object) so it knows the total.
    for i, segment in enumerate(tqdm(segments)):
        start = segment["start"]
        end = segment["end"]
        speaker = segment["speaker"]

        # Every TTS backend writes its clip as audio/{start}.ogg
        filename = f"audio/{start}.ogg"
        output_file = f"{folder_output}/{filename}"

        # duration
        duration_true = end - start
        # NOTE(review): newer librosa releases rename `filename` to `path`;
        # confirm against the pinned librosa version.
        duration_tts = librosa.get_duration(filename=filename)

        # Tempo factor. A zero-length slot would divide by zero; treat it
        # as "as fast as allowed" and let the clamp below bound it.
        if duration_true == 0:
            acc_percentage = float("inf")
        else:
            acc_percentage = duration_tts / duration_true

        # Smooth: borrow part of the gap before the next segment
        if acceleration_rate_regulation and acc_percentage >= 1.3:
            try:
                # For the last segment this resolves to the segment itself,
                # which yields no extra time.
                next_segment = segments[min(max_count_segments_idx, i + 1)]
                next_start = next_segment["start"]
                next_speaker = next_segment["speaker"]
                duration_with_next_start = next_start - start

                if duration_with_next_start > duration_true:
                    extra_time = duration_with_next_start - duration_true

                    if speaker == next_speaker:
                        # same speaker continues: take half of the gap
                        smoth_duration = duration_true + (extra_time * 0.5)
                    else:
                        # speaker changes: take 7/10 of the gap
                        smoth_duration = duration_true + (extra_time * 0.7)
                    logger.debug(
                        f"Base acc: {acc_percentage}, "
                        f"smoth acc: {duration_tts / smoth_duration}"
                    )
                    acc_percentage = max(1.2, (duration_tts / smoth_duration))

            except Exception as error:
                # Smoothing is best effort; keep the unsmoothed factor.
                logger.error(str(error))

        # Clamp: cap at the maximum, snap near-1 factors to exactly 1.0 and
        # never slow down below 0.8.
        if acc_percentage > max_accelerate_audio:
            acc_percentage = max_accelerate_audio
        elif 0.8 <= acc_percentage <= 1.15:
            acc_percentage = 1.0
        elif acc_percentage < 0.8:
            acc_percentage = 0.8

        # Round
        acc_percentage = round(acc_percentage + 0.0, 1)

        # Edge clips are probed for their real container format; every
        # other backend is taken to have written OGG.
        if speaker in speakers_edge:
            info_enc = sf.info(filename).format
        else:
            info_enc = "OGG"

        # Apply the tempo change (or a plain copy) into folder_output
        if acc_percentage == 1.0 and info_enc == "OGG":
            copy_files(filename, f"{folder_output}{os.sep}audio")
        else:
            # Argument list instead of a shell string: no quoting issues
            # and no shell interpretation of path components.
            command = [
                "ffmpeg",
                "-y",
                "-loglevel",
                "panic",
                "-i",
                filename,
                "-filter:a",
                f"atempo={acc_percentage}",
                output_file,
            ]
            try:
                completed = subprocess.run(command, check=False)
                if completed.returncode != 0:
                    logger.error(
                        f"ffmpeg exited with code {completed.returncode} "
                        f"for {filename}"
                    )
            except OSError as error:
                # e.g. the ffmpeg executable is missing; keep going like
                # the previous os.system based call did.
                logger.error(f"Could not run ffmpeg: {str(error)}")

        if logger.isEnabledFor(logging.DEBUG):
            duration_create = librosa.get_duration(filename=output_file)
            logger.debug(
                f"acc_percen is {acc_percentage}, tts duration "
                f"is {duration_tts}, new duration is {duration_create}"
                f", for {filename}"
            )

        audio_files.append(output_file)
        # "SPEAKER_03" -> "TTS Speaker 04"
        speaker = "TTS Speaker {:02d}".format(int(speaker[-2:]) + 1)
        speakers_list.append(speaker)

    return audio_files, speakers_list


# =====================================
# Tone color converter
# =====================================


def se_process_audio_segments(
    source_seg, tone_color_converter, device, remove_previous_processed=True
):
    """Return the speaker embedding for the wav clips in `source_seg`.

    A cached "se.pth" inside `source_seg` is reused when present; otherwise
    the embedding is extracted from the clips with
    `tone_color_converter.extract_se`, which is also handed the cache path.

    Args:
        source_seg: Directory containing the "*.wav" reference clips.
        tone_color_converter: Object exposing
            `extract_se(audio_paths, se_save_path)`.
        device: Device the cached embedding is loaded onto.
        remove_previous_processed: Currently unused; kept for interface
            compatibility.

    Returns:
        The speaker embedding.

    Raises:
        ValueError: If `source_seg` contains no wav files.
    """
    # list wav seg
    source_audio_segs = glob.glob(f"{source_seg}/*.wav")
    if not source_audio_segs:
        # Report the directory that was searched (the glob result itself
        # is always an empty list here).
        raise ValueError(
            f"No audio segments found in {str(source_seg)}"
        )

    source_se_path = os.path.join(source_seg, "se.pth")

    # Reuse the cached embedding when it exists
    if os.path.isfile(source_se_path):
        # map_location lets an embedding saved on another device (e.g. a
        # GPU) be loaded on this host before it is moved to `device`.
        se = torch.load(source_se_path, map_location=device).to(device)
        logger.debug(f"Previous created {source_se_path}")
    else:
        se = tone_color_converter.extract_se(source_audio_segs, source_se_path)

    return se


def _export_vc_sample_pair(
    seg, clip_start, clip_end, original_dir, dubbed_dir, get_vocals_dereverb
):
    """Write the original-voice clip and its dubbed counterpart for `seg`."""
    create_wav_file_vc(
        sample_name=str(seg["start"]),
        audio_wav="audio.wav",
        start=clip_start,
        end=clip_end,
        output_final_path=original_dir,
        get_vocals_dereverb=get_vocals_dereverb,
    )
    convert_to_xtts_good_sample(
        f"audio2/audio/{str(seg['start'])}.ogg", dubbed_dir
    )


def create_wav_vc(
    valid_speakers,
    segments_base,
    audio_name,
    max_segments=10,
    target_dir="processed",
    get_vocals_dereverb=False,
):
    """Prepare per-speaker reference clips for voice conversion.

    For each speaker two directories are created under `target_dir`:
    "{speaker}{audio_name}" (clips cut from "audio.wav") and
    "tts{speaker}{audio_name}" (the matching dubbed clips taken from
    "audio2/audio"). Segments longer than 3 s and shorter than 18 s are
    used, up to `max_segments`; when none qualifies, the speaker's first
    segment is used instead.

    Args:
        valid_speakers: Speaker labels to process.
        segments_base: Segment dicts with "start", "end", "speaker", "text".
        audio_name: Suffix appended to the per-speaker directory names.
        max_segments: Number of qualifying segments after which the scan
            of a speaker stops.
        target_dir: Root directory for the generated clips.
        get_vocals_dereverb: Forwarded to `create_wav_file_vc`.

    Returns:
        Tuple `(path_source_segments, path_target_segments)`: the dubbed
        ("tts...") directories and the original-voice directories, one per
        speaker and in the order of `valid_speakers`.
    """
    base_dir = os.path.join(".", target_dir)

    path_source_segments = []
    path_target_segments = []

    for speaker in valid_speakers:
        speaker_segments = [
            item for item in segments_base if item["speaker"] == speaker
        ]
        # With more than four segments, the very first one is dropped.
        if len(speaker_segments) > 4:
            speaker_segments = speaker_segments[1:]

        original_dir = os.path.join(base_dir, speaker + audio_name)
        dubbed_dir = os.path.join(base_dir, "tts" + speaker + audio_name)
        create_directories([original_dir, dubbed_dir])

        path_target_segments.append(original_dir)
        path_source_segments.append(dubbed_dir)

        used_count = 0
        for seg in speaker_segments:
            duration = float(seg["end"]) - float(seg["start"])
            if not (duration > 3.0 and duration < 18.0):
                continue

            logger.info(
                f'Processing segment: {seg["start"]}, {seg["end"]}, {seg["speaker"]}, {duration}, {seg["text"]}'
            )

            existing_clip = os.path.join(
                original_dir, f"{str(seg['start'])}.wav"
            )
            if os.path.exists(existing_clip):
                # Already exported by a previous run; it still counts
                # towards the limit.
                logger.debug(
                    "Segment vc source exists: "
                    f"{existing_clip}"
                )
            else:
                # One second is trimmed from each end of the segment.
                _export_vc_sample_pair(
                    seg,
                    float(seg["start"]) + 1.0,
                    float(seg["end"]) - 1.0,
                    original_dir,
                    dubbed_dir,
                    get_vocals_dereverb,
                )

            used_count += 1
            if used_count == max_segments:
                break

        if used_count == 0:
            # No segment qualified: fall back to the first one, with its
            # length clamped to the 1-18 s range.
            logger.info("Taking the first segment")
            seg = speaker_segments[0]
            logger.info(
                f'Processing segment: {seg["start"]}, {seg["end"]}, {seg["speaker"]}, {seg["text"]}'
            )
            clip_length = float(seg["end"]) - float(seg["start"])
            clip_length = max(1.0, min(clip_length, 18.0))

            _export_vc_sample_pair(
                seg,
                float(seg["start"]),
                float(seg["start"]) + clip_length,
                original_dir,
                dubbed_dir,
                get_vocals_dereverb,
            )

    logger.debug(f"Base: {str(path_source_segments)}")
    logger.debug(f"Target: {str(path_target_segments)}")

    return path_source_segments, path_target_segments


def toneconverter_openvoice(
    result_diarize,
    preprocessor_max_segments,
    remove_previous_process=True,
    get_vocals_dereverb=False,
    model="openvoice",
):
    """Convert the dubbed clips to the original speakers' tone (OpenVoice).

    Reference clips are prepared with `create_wav_vc`, the converter
    config and checkpoint are fetched through `download_manager`, and each
    "audio2/audio/{start}.ogg" clip is converted and overwritten in place.

    Args:
        result_diarize: Dict with a "segments" list of segment dicts.
        preprocessor_max_segments: Passed to `create_wav_vc` as
            `max_segments`.
        remove_previous_process: Empty the "processed" directory first.
        get_vocals_dereverb: Forwarded to `create_wav_vc`.
        model: Model name; the v2 converter is used when it contains "v2".
    """
    audio_path = "audio.wav"
    target_dir = "processed"
    create_directories(target_dir)

    from openvoice import se_extractor
    from openvoice.api import ToneColorConverter

    # Directory suffix: "<audio stem>_<hash from se_extractor>"
    audio_stem = os.path.basename(audio_path).rsplit('.', 1)[0]
    audio_hash = se_extractor.hash_numpy_array(audio_path)
    audio_name = f"{audio_stem}_{audio_hash}"

    segments = result_diarize["segments"]
    valid_speakers = list({item["speaker"] for item in segments})

    logger.info("Openvoice preprocessor...")

    if remove_previous_process:
        remove_directory_contents(target_dir)

    # Reference clips for both the dubbed voice and the original voice
    path_source_segments, path_target_segments = create_wav_vc(
        valid_speakers,
        segments,
        audio_name,
        max_segments=preprocessor_max_segments,
        get_vocals_dereverb=get_vocals_dereverb,
    )

    logger.info("Openvoice loading model...")
    model_path_openvoice = "./OPENVOICE_MODELS"
    url_model_openvoice = "https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/converter"

    use_v2 = "v2" in model
    model_path = os.path.join(model_path_openvoice, "v2" if use_v2 else "v1")
    if use_v2:
        # The v2 URL uses another repository name and has no
        # "checkpoints/" path component.
        url_model_openvoice = url_model_openvoice.replace(
            "OpenVoice", "OpenVoiceV2"
        ).replace("checkpoints/", "")
    create_directories(model_path)

    config_path = download_manager(
        url=f"{url_model_openvoice}/config.json", path=model_path
    )
    checkpoint_path = download_manager(
        url=f"{url_model_openvoice}/checkpoint.pth", path=model_path
    )

    device = os.environ.get("SONITR_DEVICE")
    converter = ToneColorConverter(config_path, device=device)
    converter.load_ckpt(checkpoint_path)

    logger.info("Openvoice tone color converter:")
    global_progress_bar = tqdm(total=len(segments), desc="Progress")

    encode_message = "@MyShell"
    for dubbed_dir, original_dir, speaker in zip(
        path_source_segments, path_target_segments, valid_speakers
    ):
        # Embeddings of the dubbed voice (source) and original voice (target)
        source_se = se_process_audio_segments(dubbed_dir, converter, device)
        target_se = se_process_audio_segments(original_dir, converter, device)

        # Iterate through this speaker's segments
        for seg in result_diarize["segments"]:
            if seg["speaker"] != speaker:
                continue

            # The converted audio overwrites the dubbed clip.
            clip_path = f"audio2/audio/{str(seg['start'])}.ogg"
            logger.debug(f"{clip_path}")

            converter.convert(
                audio_src_path=clip_path,
                src_se=source_se,
                tgt_se=target_se,
                output_path=clip_path,
                message=encode_message,
            )

            global_progress_bar.update(1)

    global_progress_bar.close()

    # Release the model and cached memory
    try:
        del converter
        gc.collect()
        torch.cuda.empty_cache()
    except Exception as error:
        logger.error(str(error))
        gc.collect()
        torch.cuda.empty_cache()


def toneconverter_freevc(
    result_diarize,
    remove_previous_process=True,
    get_vocals_dereverb=False,
):
    """Convert the dubbed clips to the original speakers' voice (FreeVC).

    One reference clip per speaker is prepared with `create_wav_vc`; every
    "audio2/audio/{start}.ogg" clip is then converted against its
    speaker's reference and overwritten in place. If the FreeVC model
    cannot be loaded, the error is logged and the function returns without
    converting anything.

    Args:
        result_diarize: Dict with a "segments" list of segment dicts.
        remove_previous_process: Empty the "processed" directory first.
        get_vocals_dereverb: Forwarded to `create_wav_vc`.
    """
    audio_path = "audio.wav"
    target_dir = "processed"
    create_directories(target_dir)

    from openvoice import se_extractor

    # Directory suffix: "<audio stem>_<hash from se_extractor>"
    audio_stem = os.path.basename(audio_path).rsplit('.', 1)[0]
    audio_hash = se_extractor.hash_numpy_array(audio_path)
    audio_name = f"{audio_stem}_{audio_hash}"

    # The original audio is the target, the dubbing is the source
    segments = result_diarize["segments"]
    valid_speakers = list({item["speaker"] for item in segments})

    logger.info("FreeVC preprocessor...")

    if remove_previous_process:
        remove_directory_contents(target_dir)

    path_source_segments, path_target_segments = create_wav_vc(
        valid_speakers,
        segments,
        audio_name,
        max_segments=1,
        get_vocals_dereverb=get_vocals_dereverb,
    )

    logger.info("FreeVC loading model...")
    device_id = os.environ.get("SONITR_DEVICE")
    if device_id == "cpu":
        device = None
    else:
        device = device_id
    try:
        from TTS.api import TTS
        tts = TTS(
            model_name="voice_conversion_models/multilingual/vctk/freevc24",
            progress_bar=False
        ).to(device)
    except Exception as error:
        logger.error(str(error))
        logger.error("Error loading the FreeVC model.")
        return

    logger.info("FreeVC process:")
    global_progress_bar = tqdm(total=len(segments), desc="Progress")

    for _dubbed_dir, original_dir, speaker in zip(
        path_source_segments, path_target_segments, valid_speakers
    ):
        # First wav listed in the speaker's directory is the reference.
        wav_files = [
            entry
            for entry in os.listdir(original_dir)
            if entry.endswith(".wav")
        ]
        original_wav_audio_segment = os.path.join(original_dir, wav_files[0])

        for seg in result_diarize["segments"]:
            if seg["speaker"] != speaker:
                continue

            # The converted audio overwrites the dubbed clip.
            clip_path = f"audio2/audio/{str(seg['start'])}.ogg"
            logger.debug(f"{clip_path} - {original_wav_audio_segment}")

            converted_wav = tts.voice_conversion(
                source_wav=clip_path,
                target_wav=original_wav_audio_segment,
            )

            sf.write(
                file=clip_path,
                samplerate=tts.voice_converter.vc_config.audio.output_sample_rate,
                data=converted_wav,
                format="ogg",
                subtype="vorbis",
            )

            global_progress_bar.update(1)

    global_progress_bar.close()

    # Release the model and cached memory
    try:
        del tts
        gc.collect()
        torch.cuda.empty_cache()
    except Exception as error:
        logger.error(str(error))
        gc.collect()
        torch.cuda.empty_cache()


def toneconverter(
    result_diarize,
    preprocessor_max_segments,
    remove_previous_process=True,
    get_vocals_dereverb=False,
    method_vc="freevc"
):
    """Run the voice-conversion backend selected by `method_vc`.

    Args:
        result_diarize: Dict with a "segments" list of segment dicts.
        preprocessor_max_segments: Reference segments per speaker; only
            forwarded to the OpenVoice backend.
        remove_previous_process: Forwarded to the backend.
        get_vocals_dereverb: Forwarded to the backend.
        method_vc: "freevc", or any name containing "openvoice".

    Returns:
        Whatever the selected backend returns; None when `method_vc`
        matches neither backend.
    """
    shared_options = {
        "remove_previous_process": remove_previous_process,
        "get_vocals_dereverb": get_vocals_dereverb,
    }

    if method_vc == "freevc":
        if preprocessor_max_segments > 1:
            logger.info("FreeVC only uses one segment.")
        return toneconverter_freevc(result_diarize, **shared_options)

    if "openvoice" in method_vc:
        return toneconverter_openvoice(
            result_diarize,
            preprocessor_max_segments,
            model=method_vc,
            **shared_options,
        )

    return None


if __name__ == "__main__":
    # Manual smoke run; expects a local `segments` module that provides a
    # `result_diarize` dict.
    from segments import result_diarize

    # `max_accelerate_audio` is a parameter of accelerate_segments, not of
    # audio_segmentation_to_voice; passing it here raised a TypeError.
    audio_segmentation_to_voice(
        result_diarize,
        TRANSLATE_AUDIO_TO="en",
        is_gui=True,
        tts_voice00="en-facebook-mms VITS",
        tts_voice01="en-CA-ClaraNeural-Female",
        tts_voice02="en-GB-ThomasNeural-Male",
        tts_voice03="en-GB-SoniaNeural-Female",
        tts_voice04="en-NZ-MitchellNeural-Male",
        tts_voice05="en-GB-MaisieNeural-Female",
    )