# SoniTranslate_CPU / soni_translate/text_to_speech.py
# Uploaded by r3gm - v0.5.0 (commit b152010), 51.4 kB
from gtts import gTTS
import edge_tts, asyncio, json, glob # noqa
from tqdm import tqdm
import librosa, os, re, torch, gc, subprocess # noqa
from .language_configuration import (
fix_code_language,
BARK_VOICES_LIST,
VITS_VOICES_LIST,
)
from .utils import (
download_manager,
create_directories,
copy_files,
rename_file,
remove_directory_contents,
remove_files,
run_command,
)
import numpy as np
from typing import Any, Dict
from pathlib import Path
import soundfile as sf
import platform
import logging
import traceback
from .logging_setup import logger
class TTS_OperationError(Exception):
    """Error raised when a text-to-speech step does not complete.

    The text is stored on ``self.message`` and is also handed to
    ``Exception`` so that ``str(error)`` returns it.
    """

    def __init__(self, message="The operation did not complete successfully."):
        super().__init__(message)
        self.message = message
def verify_saved_file_and_size(filename):
    """Check that ``filename`` exists on disk and is not empty.

    Raises:
        TTS_OperationError: when the path does not exist, or when the file
            exists but has a size of zero bytes.
    """
    file_missing = not os.path.exists(filename)
    if file_missing:
        raise TTS_OperationError(f"File '{filename}' was not saved.")

    size_in_bytes = os.path.getsize(filename)
    if size_in_bytes == 0:
        raise TTS_OperationError(
            f"File '{filename}' has a zero size. "
            "Related to incorrect TTS for the target language"
        )
def error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename):
    """Write a fallback audio file after a TTS engine failed on a segment.

    The failure is logged, then gTTS is asked to speak ``segment["text"]``
    and its output is re-encoded as OGG/Vorbis into ``filename``. If that
    auxiliary path fails too, ``filename`` is filled with silence lasting
    ``segment["end"] - segment["start"]`` seconds.

    Args:
        error: Exception raised by the primary TTS engine.
        segment: Segment dict; the keys "text", "tts_name", "start" and
            "end" are read.
        TRANSLATE_AUDIO_TO: Target language code; it is passed through
            ``fix_code_language`` before being given to gTTS.
        filename: Path of the OGG file to create.

    Raises:
        TTS_OperationError: if the fallback file is missing or empty.
    """
    traceback.print_exc()
    logger.error(f"Error: {str(error)}")
    try:
        from tempfile import TemporaryFile

        tts = gTTS(segment["text"], lang=fix_code_language(TRANSLATE_AUDIO_TO))
        # tts.save(filename)
        # The context manager closes the temporary file even when gTTS or
        # soundfile raises halfway through.
        with TemporaryFile() as tmp_audio:
            tts.write_to_fp(tmp_audio)
            # Reset the file pointer to the beginning of the file
            tmp_audio.seek(0)
            # Read audio data from the TemporaryFile using soundfile
            audio_data, samplerate = sf.read(tmp_audio)

        sf.write(
            filename, audio_data, samplerate, format="ogg", subtype="vorbis"
        )
        logger.warning(
            'TTS auxiliary will be utilized '
            f'rather than TTS: {segment["tts_name"]}'
        )
        verify_saved_file_and_size(filename)
    except Exception as aux_error:
        # A distinct name keeps the caller's `error` argument intact.
        logger.critical(f"Error: {str(aux_error)}")
        sample_rate_aux = 22050
        # Clamp so an inverted segment cannot request a negative length.
        duration = max(0.0, float(segment["end"]) - float(segment["start"]))
        data = np.zeros(int(sample_rate_aux * duration)).astype(np.float32)
        sf.write(
            filename, data, sample_rate_aux, format="ogg", subtype="vorbis"
        )
        logger.error("Audio will be replaced -> [silent audio].")

    verify_saved_file_and_size(filename)
def pad_array(array, sr):
    """Trim quiet samples from both ends, keeping a 0.1 second margin.

    Samples whose absolute value is above 0.001 count as audible. The
    result spans from ``int(0.1 * sr)`` samples before the first audible
    sample to the same amount after the last one, clipped to the array
    bounds. A list input is converted to a NumPy array first.

    Args:
        array: Audio samples (list or NumPy array).
        sr: Sample rate used to size the margin.

    Returns:
        The trimmed array, or the untrimmed array when nothing is audible
        or when trimming raises.

    Raises:
        ValueError: if the array has no elements along its first axis.
    """
    samples = np.array(array) if isinstance(array, list) else array

    if not samples.shape[0]:
        raise ValueError("The generated audio does not contain any data")

    audible = np.nonzero(np.abs(samples) > 0.001)[0]
    if len(audible) == 0:
        logger.debug(f"No valid indices: {samples}")
        return samples

    try:
        margin = int(0.1 * sr)
        first = max(0, audible[0] - margin)
        last = min(len(samples), audible[-1] + 1 + margin)
        return samples[first:last]
    except Exception as error:
        logger.error(str(error))
        return samples
# =====================================
# EDGE TTS
# =====================================
def edge_tts_voices_list():
    """Return the Edge TTS voices formatted as "<name>-<gender>".

    The ``edge-tts --list-voices`` command is tried first and its
    "Name: " / "Gender: " lines are parsed. When that yields nothing, the
    voices are requested through ``edge_tts.list_voices()`` and the result
    is sorted.

    Returns:
        list[str]: Voice labels; empty when no voice could be obtained
        (an error is logged in that case).
    """
    try:
        completed_process = subprocess.run(
            ["edge-tts", "--list-voices"], capture_output=True, text=True
        )
        lines = completed_process.stdout.strip().split("\n")
    except Exception as error:
        logger.debug(str(error))
        lines = []

    voices = []
    voice_entry = None
    for line in lines:
        if line.startswith("Name: "):
            voice_entry = {"Name": line.split(": ", 1)[1]}
        elif line.startswith("Gender: ") and voice_entry is not None:
            # A "Gender" line is only meaningful after its "Name" line;
            # without the guard a stray one raised UnboundLocalError.
            voice_entry["Gender"] = line.split(": ", 1)[1]
            voices.append(voice_entry)
            voice_entry = None

    formatted_voices = [
        f"{entry['Name']}-{entry['Gender']}" for entry in voices
    ]

    if not formatted_voices:
        logger.warning(
            "The list of Edge TTS voices could not be obtained, "
            "switching to an alternative method"
        )
        loop = asyncio.new_event_loop()
        try:
            tts_voice_list = loop.run_until_complete(edge_tts.list_voices())
        finally:
            # Close the private loop so it is not leaked.
            loop.close()
        formatted_voices = sorted(
            [f"{v['ShortName']}-{v['Gender']}" for v in tts_voice_list]
        )

    if not formatted_voices:
        logger.error("Can't get EDGE TTS - list voices")

    return formatted_voices
def segments_egde_tts(filtered_edge_segments, TRANSLATE_AUDIO_TO, is_gui):
    """Synthesize every segment with Edge TTS into ``audio/<start>.ogg``.

    Edge TTS first writes ``audio/<start>.mp3``; that file is read back,
    trimmed with ``pad_array`` and re-encoded as OGG/Vorbis. Any failure
    for a segment is delegated to ``error_handling_in_tts``.

    Args:
        filtered_edge_segments: Dict with a "segments" list; each segment
            provides "text", "start" and "tts_name".
        TRANSLATE_AUDIO_TO: Target language, forwarded to the fallback.
        is_gui: When true the ``edge_tts`` Python API is used, otherwise
            the ``edge-tts`` command line tool is invoked.
    """
    for segment in tqdm(filtered_edge_segments["segments"]):
        text = segment["text"]
        start = segment["start"]
        tts_name = segment["tts_name"]

        # make the tts audio
        filename = f"audio/{start}.ogg"
        temp_file = filename[:-3] + "mp3"

        logger.info(f"{text} >> {filename}")
        try:
            # Labels look like "<voice>-<Gender>"; both branches now drop
            # the trailing gender part the same way.
            voice = "-".join(tts_name.split("-")[:-1])
            if is_gui:
                asyncio.run(
                    edge_tts.Communicate(text, voice).save(temp_file)
                )
            else:
                # Pass an argument list (no shell string) so quotes or
                # other special characters in the text cannot break or
                # alter the command. "--text=" keeps a leading "-" in the
                # text from being parsed as an option.
                subprocess.run(
                    [
                        "edge-tts",
                        f"--text={text}",
                        "--voice",
                        voice,
                        "--write-media",
                        temp_file,
                    ],
                    check=True,
                )
            verify_saved_file_and_size(temp_file)

            data, sample_rate = sf.read(temp_file)
            data = pad_array(data, sample_rate)
            # os.remove(temp_file)

            # Save file
            sf.write(
                file=filename,
                samplerate=sample_rate,
                data=data,
                format="ogg",
                subtype="vorbis",
            )
            verify_saved_file_and_size(filename)

        except Exception as error:
            error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename)
# =====================================
# BARK TTS
# =====================================
def segments_bark_tts(
    filtered_bark_segments, TRANSLATE_AUDIO_TO, model_id_bark="suno/bark-small"
):
    """Synthesize every segment with a Bark model into ``audio/<start>.ogg``.

    The device comes from the ``SONITR_DEVICE`` environment variable;
    float16 is used on "cuda" and float32 otherwise. Each segment's voice
    preset is looked up in ``BARK_VOICES_LIST`` by its "tts_name". Any
    failure for a segment is delegated to ``error_handling_in_tts``.

    Args:
        filtered_bark_segments: Dict with a "segments" list; each segment
            provides "text", "start" and "tts_name".
        TRANSLATE_AUDIO_TO: Target language, forwarded to the fallback.
        model_id_bark: Model identifier given to ``from_pretrained``.
    """
    from transformers import AutoProcessor, BarkModel
    from optimum.bettertransformer import BetterTransformer

    device = os.environ.get("SONITR_DEVICE")
    torch_dtype_env = torch.float16 if device == "cuda" else torch.float32

    # load model bark (moved to the device once; the original did it twice)
    model = BarkModel.from_pretrained(
        model_id_bark, torch_dtype=torch_dtype_env
    ).to(device)
    processor = AutoProcessor.from_pretrained(
        model_id_bark, return_tensors="pt"
    )  # , padding=True
    if device == "cuda":
        # convert to bettertransformer
        model = BetterTransformer.transform(model, keep_original_model=False)
        # enable CPU offload
        # model.enable_cpu_offload()
    sampling_rate = model.generation_config.sample_rate

    for segment in tqdm(filtered_bark_segments["segments"]):
        text = segment["text"]
        start = segment["start"]
        tts_name = segment["tts_name"]

        # make the tts audio
        filename = f"audio/{start}.ogg"
        logger.info(f"{text} >> {filename}")
        try:
            # Input preparation is inside the try so that a bad preset or
            # text uses the per-segment fallback instead of aborting the
            # whole run.
            inputs = processor(
                text, voice_preset=BARK_VOICES_LIST[tts_name]
            ).to(device)

            # Infer
            with torch.inference_mode():
                speech_output = model.generate(
                    **inputs,
                    do_sample=True,
                    fine_temperature=0.4,
                    coarse_temperature=0.8,
                    pad_token_id=processor.tokenizer.pad_token_id,
                )
            # Save file
            data_tts = pad_array(
                speech_output.cpu().numpy().squeeze().astype(np.float32),
                sampling_rate,
            )
            sf.write(
                file=filename,
                samplerate=sampling_rate,
                data=data_tts,
                format="ogg",
                subtype="vorbis",
            )
            verify_saved_file_and_size(filename)
        except Exception as error:
            error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename)
        gc.collect()
        torch.cuda.empty_cache()

    # Release the model and processor before the next engine runs.
    try:
        del processor
        del model
        gc.collect()
        torch.cuda.empty_cache()
    except Exception as error:
        logger.error(str(error))
        gc.collect()
        torch.cuda.empty_cache()
# =====================================
# VITS TTS
# =====================================
def uromanize(input_string):
    """Convert non-Roman strings to Roman using the `uroman` perl package.

    If ``./uroman`` does not exist, the uroman repository is cloned with
    git first. The text is then piped through ``perl uroman/bin/uroman.pl``.

    Args:
        input_string: Text to romanize.

    Returns:
        The script output without its trailing newline.

    Raises:
        ValueError: if cloning the repository or running the perl script
            exits with a non-zero status.
    """
    # script_path = os.path.join(uroman_path, "bin", "uroman.pl")

    if not os.path.exists("./uroman"):
        logger.info(
            "Clonning repository uroman https://github.com/isi-nlp/uroman.git"
            " for romanize the text"
        )
        clone = subprocess.run(
            ["git", "clone", "https://github.com/isi-nlp/uroman.git"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        # Report a failed clone directly instead of letting the perl call
        # fail later on a missing script.
        if clone.returncode != 0:
            raise ValueError(
                f"Error {clone.returncode}: "
                f"{clone.stderr.decode(errors='replace')}"
            )

    script_path = os.path.join("./uroman", "bin", "uroman.pl")

    # Execute the perl command
    result = subprocess.run(
        ["perl", script_path],
        input=input_string.encode(),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )

    if result.returncode != 0:
        raise ValueError(
            f"Error {result.returncode}: "
            f"{result.stderr.decode(errors='replace')}"
        )

    # Drop the final new-line only when there is one, so a real character
    # is never cut off.
    output = result.stdout.decode()
    return output[:-1] if output.endswith("\n") else output
def segments_vits_tts(filtered_vits_segments, TRANSLATE_AUDIO_TO):
    """Synthesize every segment with a VITS model into ``audio/<start>.ogg``.

    Segments are sorted by "tts_name" so each model/tokenizer pair (looked
    up in ``VITS_VOICES_LIST``) is loaded once per run of equal names.
    When the tokenizer reports ``is_uroman`` the text is romanized with
    ``uromanize`` first. Any failure for a segment is delegated to
    ``error_handling_in_tts``.

    Args:
        filtered_vits_segments: Dict with a "segments" list; each segment
            provides "text", "start" and "tts_name".
        TRANSLATE_AUDIO_TO: Target language, forwarded to the fallback.
    """
    from transformers import VitsModel, AutoTokenizer

    filtered_segments = filtered_vits_segments["segments"]
    # Sorting the segments by 'tts_name'
    sorted_segments = sorted(filtered_segments, key=lambda x: x["tts_name"])
    logger.debug(sorted_segments)

    # Pre-bound so the cleanup below also works when there are no segments.
    model = None
    tokenizer = None
    model_name_key = None
    for segment in tqdm(sorted_segments):
        text = segment["text"]
        start = segment["start"]
        tts_name = segment["tts_name"]

        if tts_name != model_name_key:
            model_name_key = tts_name
            model = VitsModel.from_pretrained(VITS_VOICES_LIST[tts_name])
            tokenizer = AutoTokenizer.from_pretrained(
                VITS_VOICES_LIST[tts_name]
            )
            sampling_rate = model.config.sampling_rate

        # make the tts audio
        filename = f"audio/{start}.ogg"
        logger.info(f"{text} >> {filename}")
        try:
            # Romanization and tokenization are inside the try so that a
            # failure there uses the per-segment fallback instead of
            # aborting the whole run.
            if tokenizer.is_uroman:
                romanize_text = uromanize(text)
                logger.debug(f"Romanize text: {romanize_text}")
                inputs = tokenizer(romanize_text, return_tensors="pt")
            else:
                inputs = tokenizer(text, return_tensors="pt")

            # Infer
            with torch.no_grad():
                speech_output = model(**inputs).waveform

            data_tts = pad_array(
                speech_output.cpu().numpy().squeeze().astype(np.float32),
                sampling_rate,
            )
            # Save file
            sf.write(
                file=filename,
                samplerate=sampling_rate,
                data=data_tts,
                format="ogg",
                subtype="vorbis",
            )
            verify_saved_file_and_size(filename)
        except Exception as error:
            error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename)
        gc.collect()
        torch.cuda.empty_cache()

    # Release the model and tokenizer before the next engine runs.
    try:
        del tokenizer
        del model
        gc.collect()
        torch.cuda.empty_cache()
    except Exception as error:
        logger.error(str(error))
        gc.collect()
        torch.cuda.empty_cache()
# =====================================
# Coqui XTTS
# =====================================
def coqui_xtts_voices_list():
    """List the reference audio files available to Coqui XTTS.

    Files directly inside ``_XTTS_`` ending in .wav/.mp3/.ogg/.m4a are
    returned, except generated ``AUTOMATIC_SPEAKER_<n>.wav`` samples. The
    "_XTTS_/AUTOMATIC.wav" entry is always first.

    Returns:
        list[str]: Paths of the form "_XTTS_/<file>". Only the automatic
        entry is returned when the ``_XTTS_`` folder does not exist.
    """
    main_folder = "_XTTS_"
    pattern_coqui = re.compile(r".+\.(wav|mp3|ogg|m4a)$")
    pattern_automatic_speaker = re.compile(r"AUTOMATIC_SPEAKER_\d+\.wav$")

    voices = ["_XTTS_/AUTOMATIC.wav"]

    # A missing folder just means no custom voices yet; os.listdir would
    # raise FileNotFoundError here.
    if not os.path.isdir(main_folder):
        return voices

    # List only files in the directory matching the pattern but not matching
    # AUTOMATIC_SPEAKER_00.wav, AUTOMATIC_SPEAKER_01.wav, etc.
    for entry in os.listdir(main_folder):
        if not os.path.isfile(os.path.join(main_folder, entry)):
            continue
        if pattern_coqui.match(entry) and not pattern_automatic_speaker.match(
            entry
        ):
            voices.append("_XTTS_/" + entry)

    return voices
def seconds_to_hhmmss_ms(seconds):
    """Format a number of seconds as "HH:MM:SS.mmm".

    The value is rounded to whole milliseconds before being split, which
    avoids float truncation (1.001 used to come out as ".000") and carries
    correctly into seconds/minutes/hours.

    Args:
        seconds: Non-negative duration in seconds (int or float).

    Returns:
        str: The formatted timestamp.
    """
    total_ms = int(round(seconds * 1000))
    total_seconds, milliseconds = divmod(total_ms, 1000)
    total_minutes, secs = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return "%02d:%02d:%02d.%03d" % (hours, minutes, secs, milliseconds)
def audio_trimming(audio_path, destination, start, end):
    """Cut a time range out of an audio file with ffmpeg.

    The result is written as 16-bit PCM WAV named ``<stem>_trim.wav`` in
    ``destination`` (or next to the input when ``destination`` is empty).

    Args:
        audio_path: Input audio file.
        destination: Output directory; falsy means the input's directory.
        start: Range start; numbers are seconds, other values are passed
            to ffmpeg as given. ``None`` omits ``-ss``.
        end: Range end, same conventions as ``start``. ``None`` omits
            ``-to``.

    Returns:
        str: Path of the trimmed file.
    """
    if isinstance(start, (int, float)):
        start = seconds_to_hhmmss_ms(start)
    if isinstance(end, (int, float)):
        end = seconds_to_hhmmss_ms(end)

    file_directory = destination if destination else os.path.dirname(audio_path)
    if file_directory:
        # ffmpeg does not create missing output directories.
        os.makedirs(file_directory, exist_ok=True)

    file_name = os.path.splitext(os.path.basename(audio_path))[0]
    file_ = f"{file_name}_trim.wav"
    # file_ = f'{os.path.splitext(audio_path)[0]}_trim.wav'
    output_path = os.path.join(file_directory, file_)

    # A missing bound used to be rendered as the literal text "None".
    time_args = ""
    if start is not None:
        time_args += f" -ss {start}"
    if end is not None:
        time_args += f" -to {end}"

    # -t (duration from -ss) | -to (time stop) | -af silenceremove=1:0:-50dB (remove silence)
    command = f'ffmpeg -y -loglevel error -i "{audio_path}"{time_args} -acodec pcm_s16le -f wav "{output_path}"'
    run_command(command)

    return output_path
def convert_to_xtts_good_sample(audio_path: str = "", destination: str = ""):
    """Convert an audio file to mono, 22050 Hz, 16-bit WAV with ffmpeg.

    The output is named ``<stem>_good_sample.wav`` and placed in
    ``destination``, or next to the input when ``destination`` is empty.

    Returns:
        str: Path of the converted file.
    """
    target_dir = destination or os.path.dirname(audio_path)
    stem, _extension = os.path.splitext(os.path.basename(audio_path))
    # file_ = f'{os.path.splitext(audio_path)[0]}_good_sample.wav'
    mono_path = os.path.join(target_dir, f"{stem}_good_sample.wav")  # get root

    run_command(
        f'ffmpeg -y -loglevel error -i "{audio_path}" -ac 1 -ar 22050 -sample_fmt s16 -f wav "{mono_path}"'
    )

    return mono_path
def sanitize_file_name(file_name):
    """Return ``file_name`` with unsafe characters replaced by "_".

    The name is first normalized to NFKD, which separates combined
    characters into base characters and diacritics. Every character that
    is not a word character, whitespace, "." or "-" is then replaced with
    an underscore.
    """
    from unicodedata import normalize

    decomposed = normalize("NFKD", file_name)
    unsafe_chars = re.compile(r"[^\w\s.-]")
    return unsafe_chars.sub("_", decomposed)
def create_wav_file_vc(
    sample_name="",  # name final file
    audio_wav="",  # path
    start=None,  # trim start
    end=None,  # trim end
    output_final_path="_XTTS_",
    get_vocals_dereverb=True,
):
    """Build a voice-cloning reference WAV from (part of) an audio file.

    Steps: optionally trim ``audio_wav`` to ``start``/``end``, run
    ``process_uvr_task`` on it (a failure there is only logged), convert
    the result with ``convert_to_xtts_good_sample``, rename it to
    ``<sample_name>.wav`` and copy it into ``output_final_path``.

    Args:
        sample_name: Name for the final file; "default_name" when empty.
            It is passed through ``sanitize_file_name``.
        audio_wav: Path string, or an object whose ``name`` is the path.
        start: Trim start; trimming happens when ``start`` or ``end`` is
            truthy.
        end: Trim end.
        output_final_path: Directory that receives the final sample.
        get_vocals_dereverb: Forwarded as ``dereverb`` to
            ``process_uvr_task``.

    Returns:
        str: Path of the final sample.

    Raises:
        Exception: if the final sample does not exist after copying.
    """
    clean_name = sanitize_file_name(sample_name or "default_name")
    source_path = audio_wav if isinstance(audio_wav, str) else audio_wav.name

    # os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    work_dir = os.path.join(".", "clean_song_output")  # remove content
    # remove_directory_contents(output_dir)

    # Cut the file when a range was given, otherwise use the complete file
    if start or end:
        audio_segment = audio_trimming(source_path, work_dir, start, end)
    else:
        audio_segment = source_path

    from .mdx_net import process_uvr_task

    try:
        uvr_outputs = process_uvr_task(
            orig_song_path=audio_segment,
            main_vocals=True,
            dereverb=get_vocals_dereverb,
        )
        # Only the fifth returned item is used from here on.
        _, _, _, _, audio_segment = uvr_outputs
    except Exception as error:
        logger.error(str(error))

    wav_name = f"{clean_name}.wav"
    converted = convert_to_xtts_good_sample(audio_segment)
    renamed = rename_file(converted, wav_name)
    copy_files(renamed, output_final_path)

    final_sample = os.path.join(output_final_path, wav_name)
    if not os.path.exists(final_sample):
        raise Exception(f"Error wav: {final_sample}")

    logger.info(final_sample)
    return final_sample
def create_new_files_for_vc(
    speakers_coqui,
    segments_base,
    dereverb_automatic=True
):
    """Create the automatic XTTS reference sample for each speaker.

    For every speaker whose first considered segment uses
    "_XTTS_/AUTOMATIC.wav", a sample ``_XTTS_/AUTOMATIC_<speaker>.wav`` is
    cut from "audio.wav" unless it already exists. The first segment
    lasting between 7 and 12 seconds is preferred (with one second removed
    from each end); otherwise the first segment is used, starting at its
    own start and lasting between 2 and 9 seconds.

    Args:
        speakers_coqui: Speaker labels to process.
        segments_base: Segment dicts with "speaker", "tts_name", "start",
            "end" and "text".
        dereverb_automatic: Forwarded to ``create_wav_file_vc`` as
            ``get_vocals_dereverb``.
    """
    # before function delete automatic delete_previous_automatic
    output_dir = os.path.join(".", "clean_song_output")  # remove content
    remove_directory_contents(output_dir)

    for speaker in speakers_coqui:
        filtered_speaker = [
            segment
            for segment in segments_base
            if segment["speaker"] == speaker
        ]
        # Nothing to sample from; indexing [0] below would raise IndexError.
        if not filtered_speaker:
            logger.debug(f"No segments for {speaker}")
            continue

        # When more than four segments exist, the first one is not used.
        if len(filtered_speaker) > 4:
            filtered_speaker = filtered_speaker[1:]

        if filtered_speaker[0]["tts_name"] != "_XTTS_/AUTOMATIC.wav":
            continue

        name_automatic_wav = f"AUTOMATIC_{speaker}"
        if os.path.exists(f"_XTTS_/{name_automatic_wav}.wav"):
            logger.info(f"WAV automatic {speaker} exists")
            continue

        # create wav
        wav_ok = False
        for seg in filtered_speaker:
            duration = float(seg["end"]) - float(seg["start"])
            if duration > 7.0 and duration < 12.0:
                logger.info(
                    f'Processing segment: {seg["start"]}, {seg["end"]}, {seg["speaker"]}, {duration}, {seg["text"]}'
                )
                create_wav_file_vc(
                    sample_name=name_automatic_wav,
                    audio_wav="audio.wav",
                    start=(float(seg["start"]) + 1.0),
                    end=(float(seg["end"]) - 1.0),
                    get_vocals_dereverb=dereverb_automatic,
                )
                wav_ok = True
                break

        if wav_ok:
            continue

        logger.info("Taking the first segment")
        seg = filtered_speaker[0]
        logger.info(
            f'Processing segment: {seg["start"]}, {seg["end"]}, {seg["speaker"]}, {seg["text"]}'
        )
        max_duration = float(seg["end"]) - float(seg["start"])
        max_duration = max(2.0, min(max_duration, 9.0))

        create_wav_file_vc(
            sample_name=name_automatic_wav,
            audio_wav="audio.wav",
            start=(float(seg["start"])),
            end=(float(seg["start"]) + max_duration),
            get_vocals_dereverb=dereverb_automatic,
        )
def segments_coqui_tts(
    filtered_coqui_segments,
    TRANSLATE_AUDIO_TO,
    model_id_coqui="tts_models/multilingual/multi-dataset/xtts_v2",
    speakers_coqui=None,
    delete_previous_automatic=True,
    dereverb_automatic=True,
    emotion=None,
):
    """XTTS

    Synthesize every segment with Coqui XTTS into ``audio/<start>.ogg``,
    using each segment's "tts_name" file as the reference voice. Segments
    that use "_XTTS_/AUTOMATIC.wav" are mapped to the per-speaker sample
    "_XTTS_/AUTOMATIC_<speaker>.wav", which is created beforehand. Any
    failure for a segment is delegated to ``error_handling_in_tts``.

    Install:
    pip install -q TTS==0.21.1
    pip install -q numpy==1.23.5

    Notes:
    - tts_name is the wav|mp3|ogg|m4a file for VC

    Args:
        filtered_coqui_segments: Dict with a "segments" list; each segment
            provides "speaker", "text", "start" and "tts_name".
        TRANSLATE_AUDIO_TO: Target language; converted with
            ``fix_code_language(..., syntax="coqui")``.
        model_id_coqui: Model identifier given to ``TTS``.
        speakers_coqui: Speaker labels to prepare automatic samples for.
            ``None`` means the speakers found in the segments.
        delete_previous_automatic: Remove existing automatic samples first.
        dereverb_automatic: Forwarded to ``create_new_files_for_vc``.
        emotion: Not used by this function.

    Raises:
        TTS_OperationError: if the language is not in the supported list.
    """
    from TTS.api import TTS

    TRANSLATE_AUDIO_TO = fix_code_language(TRANSLATE_AUDIO_TO, syntax="coqui")
    supported_lang_coqui = [
        "zh-cn",
        "en",
        "fr",
        "de",
        "it",
        "pt",
        "pl",
        "tr",
        "ru",
        "nl",
        "cs",
        "ar",
        "es",
        "hu",
        "ko",
        "ja",
    ]
    if TRANSLATE_AUDIO_TO not in supported_lang_coqui:
        raise TTS_OperationError(
            f"'{TRANSLATE_AUDIO_TO}' is not a supported language for Coqui XTTS"
        )
    # Emotion and speed can only be used with Coqui Studio models. discontinued
    # emotions = ["Neutral", "Happy", "Sad", "Angry", "Dull"]

    segments = filtered_coqui_segments["segments"]

    # The declared default (None) used to fail with TypeError when it was
    # iterated below; fall back to the speakers present in the segments,
    # keeping first-seen order.
    if speakers_coqui is None:
        speakers_coqui = list(
            dict.fromkeys(segment["speaker"] for segment in segments)
        )

    if delete_previous_automatic:
        for spk in speakers_coqui:
            remove_files(f"_XTTS_/AUTOMATIC_{spk}.wav")

    directory_audios_vc = "_XTTS_"
    create_directories(directory_audios_vc)
    create_new_files_for_vc(
        speakers_coqui,
        segments,
        dereverb_automatic,
    )

    # Init TTS
    device = os.environ.get("SONITR_DEVICE")
    model = TTS(model_id_coqui).to(device)
    sampling_rate = 24000

    # filtered_segments = filtered_coqui_segments['segments']
    # Sorting the segments by 'tts_name'
    # sorted_segments = sorted(filtered_segments, key=lambda x: x['tts_name'])
    # logger.debug(sorted_segments)

    for segment in tqdm(segments):
        speaker = segment["speaker"]
        text = segment["text"]
        start = segment["start"]
        tts_name = segment["tts_name"]
        if tts_name == "_XTTS_/AUTOMATIC.wav":
            tts_name = f"_XTTS_/AUTOMATIC_{speaker}.wav"

        # make the tts audio
        filename = f"audio/{start}.ogg"
        logger.info(f"{text} >> {filename}")
        try:
            # Infer
            wav = model.tts(
                text=text, speaker_wav=tts_name, language=TRANSLATE_AUDIO_TO
            )
            data_tts = pad_array(
                wav,
                sampling_rate,
            )
            # Save file
            sf.write(
                file=filename,
                samplerate=sampling_rate,
                data=data_tts,
                format="ogg",
                subtype="vorbis",
            )
            verify_saved_file_and_size(filename)
        except Exception as error:
            error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename)
        gc.collect()
        torch.cuda.empty_cache()

    # Release the model before the next engine runs.
    try:
        del model
        gc.collect()
        torch.cuda.empty_cache()
    except Exception as error:
        logger.error(str(error))
        gc.collect()
        torch.cuda.empty_cache()
# =====================================
# PIPER TTS
# =====================================
def piper_tts_voices_list():
    """Return the Piper voice ids, each suffixed with " VITS-onnx".

    The ``voices.json`` catalogue is obtained through ``download_manager``
    (target folder ``./PIPER_MODELS``) and its top-level keys are used as
    the voice ids.
    """
    catalogue_path = download_manager(
        url="https://huggingface.co/rhasspy/piper-voices/resolve/main/voices.json",
        path="./PIPER_MODELS",
    )

    with open(catalogue_path, "r", encoding="utf8") as handle:
        catalogue = json.load(handle)

    piper_id_models = []
    for voice_id in catalogue.keys():
        piper_id_models.append(voice_id + " VITS-onnx")
    return piper_id_models
def replace_text_in_json(file_path, key_to_replace, new_text, condition=None):
    """Replace the value of a top-level key in a JSON file.

    Args:
        file_path: Path of the JSON file (read and written as UTF-8).
        key_to_replace: Top-level key whose value may be replaced.
        new_text: New value for the key.
        condition: When truthy, the value is replaced only if it currently
            equals ``condition``; otherwise it is replaced unconditionally.

    The file is rewritten (indent of 2) only when the value changes, so a
    call that has nothing to do leaves the file untouched.
    """
    # Read the JSON file
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    if key_to_replace not in data:
        return

    current_value = data[key_to_replace]
    expected_value = condition if condition else current_value
    if current_value != expected_value or current_value == new_text:
        return

    # Modify the specified key's value with the new text
    data[key_to_replace] = new_text

    # Write back with the same encoding that was used for reading.
    with open(file_path, "w", encoding="utf-8") as file:
        json.dump(data, file, indent=2)
def load_piper_model(
    model: str,
    data_dir: list,
    download_dir: str = "",
    update_voices: bool = False,
):
    """Load a Piper voice, downloading it first when needed.

    CUDA is requested only when onnxruntime reports a "GPU" device and
    ``SONITR_DEVICE`` is "cuda"; it is always disabled on Windows.

    Args:
        model: Voice name, or path to an existing model file.
        data_dir: Directories to look for voices in; only the first entry
            is used when ``download_dir`` is given.
        download_dir: Sub-folder of ``data_dir[0]`` for downloads; empty
            means ``data_dir[0]`` itself.
        update_voices: Forwarded to piper's ``get_voices``.

    Returns:
        The loaded ``PiperVoice``.

    Raises:
        TTS_OperationError: if onnxruntime cannot be imported or queried.
    """
    from piper import PiperVoice
    from piper.download import ensure_voice_exists, find_voice, get_voices

    try:
        import onnxruntime as rt

        if rt.get_device() == "GPU" and os.environ.get("SONITR_DEVICE") == "cuda":
            logger.debug("onnxruntime device > GPU")
            cuda = True
        else:
            logger.info(
                "onnxruntime device > CPU"
            )  # try pip install onnxruntime-gpu
            cuda = False
    except Exception as error:
        # Keep the original exception attached as the cause.
        raise TTS_OperationError(f"onnxruntime error: {str(error)}") from error

    # Disable CUDA in Windows
    if platform.system() == "Windows":
        logger.info("Employing CPU exclusivity with Piper TTS")
        cuda = False

    if not download_dir:
        # Download to first data directory by default
        download_dir = data_dir[0]
    else:
        data_dir = [os.path.join(data_dir[0], download_dir)]

    # When `model` is already a file on disk no config is resolved here;
    # None is passed so that PiperVoice.load picks its own default
    # (assumed to be "<model>.json" - confirm against the installed piper).
    # Previously `config` was unbound on this path.
    config = None

    # Download voice if file doesn't exist
    model_path = Path(model)
    if not model_path.exists():
        # Load voice info
        voices_info = get_voices(download_dir, update_voices=update_voices)

        # Resolve aliases for backwards compatibility with old voice names
        aliases_info: Dict[str, Any] = {}
        for voice_info in voices_info.values():
            for voice_alias in voice_info.get("aliases", []):
                aliases_info[voice_alias] = {"_is_alias": True, **voice_info}

        voices_info.update(aliases_info)
        ensure_voice_exists(model, data_dir, download_dir, voices_info)
        model, config = find_voice(model, data_dir)

        # Rewrite the enum-style value in the voice config to "espeak".
        replace_text_in_json(
            config, "phoneme_type", "espeak", "PhonemeType.ESPEAK"
        )

    # Load voice
    voice = PiperVoice.load(model, config_path=config, use_cuda=cuda)

    return voice
def synthesize_text_to_audio_np_array(voice, text, synthesize_args):
    """Synthesize ``text`` and return the audio as an int16 NumPy array.

    Args:
        voice: Object exposing ``synthesize_stream_raw(text, **kwargs)``
            that yields chunks of raw audio bytes.
        text: Text to synthesize.
        synthesize_args: Keyword arguments forwarded to the voice.

    Returns:
        numpy.ndarray: All chunks concatenated and viewed as ``int16``.
    """
    audio_stream = voice.synthesize_stream_raw(text, **synthesize_args)

    # Join once instead of growing a bytes object chunk by chunk.
    audio_data = b"".join(audio_stream)

    # Ensure correct data type and convert audio bytes to NumPy array
    return np.frombuffer(audio_data, dtype=np.int16)
def segments_vits_onnx_tts(filtered_onnx_vits_segments, TRANSLATE_AUDIO_TO):
    """
    Synthesize every segment with a Piper (VITS-onnx) voice into
    ``audio/<start>.ogg``.

    Segments are sorted by "tts_name" so each voice is loaded once per run
    of equal names; the " VITS-onnx" suffix is removed to obtain the Piper
    voice name. Any failure for a segment is delegated to
    ``error_handling_in_tts``.

    Install:
    pip install -q piper-tts==1.2.0 onnxruntime-gpu # for cuda118
    """
    # Data directory to check for downloaded models (current directory)
    models_root = [str(Path.cwd())]
    piper_folder = "PIPER_MODELS"
    # model_name = "en_US-lessac-medium" tts_name in a dict like VITS
    refresh_voices = True  # Download latest voices.json during startup
    synthesis_options = dict(
        speaker_id=None,
        length_scale=1.0,
        noise_scale=0.667,
        noise_w=0.8,
        sentence_silence=0.0,
    )

    # Sorting the segments by 'tts_name'
    ordered_segments = sorted(
        filtered_onnx_vits_segments["segments"],
        key=lambda item: item["tts_name"],
    )
    logger.debug(ordered_segments)

    loaded_voice_name = None
    for segment in tqdm(ordered_segments):
        speaker = segment["speaker"]  # noqa
        text = segment["text"]
        start = segment["start"]
        tts_name = segment["tts_name"].replace(" VITS-onnx", "")

        # Load a voice only when the name differs from the previous one
        if loaded_voice_name != tts_name:
            loaded_voice_name = tts_name
            model = load_piper_model(
                tts_name, models_root, piper_folder, refresh_voices
            )
            sampling_rate = model.config.sample_rate

        # make the tts audio
        filename = f"audio/{start}.ogg"
        logger.info(f"{text} >> {filename}")
        try:
            # Infer
            raw_audio = synthesize_text_to_audio_np_array(
                model, text, synthesis_options
            )
            trimmed_audio = pad_array(raw_audio, sampling_rate)
            # Save file
            sf.write(
                file=filename,
                samplerate=sampling_rate,
                data=trimmed_audio,
                format="ogg",
                subtype="vorbis",
            )
            verify_saved_file_and_size(filename)
        except Exception as error:
            error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename)
        gc.collect()
        torch.cuda.empty_cache()

    # Release the voice before the next engine runs.
    try:
        del model
        gc.collect()
        torch.cuda.empty_cache()
    except Exception as error:
        logger.error(str(error))
        gc.collect()
        torch.cuda.empty_cache()
# =====================================
# OPENAI TTS
# =====================================
def segments_openai_tts(
    filtered_openai_tts_segments, TRANSLATE_AUDIO_TO
):
    """Synthesize every segment with the OpenAI speech API into
    ``audio/<start>.ogg``.

    The model is "tts-1-hd" when "HD" appears in the segment's "tts_name",
    otherwise "tts-1"; the voice is the first word of "tts_name" without
    its first character. The "wav" response is read as int16 samples and
    the first 240 samples are dropped before trimming. Any failure for a
    segment is delegated to ``error_handling_in_tts``.
    """
    from openai import OpenAI

    client = OpenAI()
    sampling_rate = 24000

    # filtered_segments = filtered_openai_tts_segments['segments']
    # Sorting the segments by 'tts_name'
    # sorted_segments = sorted(filtered_segments, key=lambda x: x['tts_name'])

    for segment in tqdm(filtered_openai_tts_segments["segments"]):
        speaker = segment["speaker"]  # noqa
        text = segment["text"].strip()
        start = segment["start"]
        tts_name = segment["tts_name"]

        # make the tts audio
        filename = f"audio/{start}.ogg"
        logger.info(f"{text} >> {filename}")

        try:
            # Request
            if "HD" in tts_name:
                model_name = "tts-1-hd"
            else:
                model_name = "tts-1"
            response = client.audio.speech.create(
                model=model_name,
                voice=tts_name.split()[0][1:],
                response_format="wav",
                input=text
            )

            audio_bytes = b"".join(response.iter_bytes(chunk_size=4096))
            samples = np.frombuffer(audio_bytes, dtype=np.int16)

            # Save file
            trimmed_audio = pad_array(samples[240:], sampling_rate)
            sf.write(
                file=filename,
                samplerate=sampling_rate,
                data=trimmed_audio,
                format="ogg",
                subtype="vorbis",
            )
            verify_saved_file_and_size(filename)

        except Exception as error:
            error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename)
# =====================================
# Select task TTS
# =====================================
def find_spkr(pattern, speaker_to_voice, segments):
    """Return the speakers whose voice matches ``pattern`` and who appear
    in at least one segment.

    Args:
        pattern: Compiled regular expression applied with ``match``.
        speaker_to_voice: Mapping of speaker label to voice name.
        segments: Segment dicts with a "speaker" key.

    Returns:
        list: Speaker labels, in the mapping's iteration order.
    """
    matched_speakers = []
    for speaker, voice in speaker_to_voice.items():
        if not pattern.match(voice):
            continue
        # Segments are only inspected for speakers whose voice matched.
        if any(segment["speaker"] == speaker for segment in segments):
            matched_speakers.append(speaker)
    return matched_speakers
def filter_by_speaker(speakers, segments):
    """Keep only the segments spoken by one of ``speakers``.

    Returns:
        dict: ``{"segments": [...]}`` with the matching segments in their
        original order.
    """
    selected = list(
        filter(lambda segment: segment["speaker"] in speakers, segments)
    )
    return {"segments": selected}
def audio_segmentation_to_voice(
    result_diarize,
    TRANSLATE_AUDIO_TO,
    is_gui,
    tts_voice00,
    tts_voice01="",
    tts_voice02="",
    tts_voice03="",
    tts_voice04="",
    tts_voice05="",
    tts_voice06="",
    tts_voice07="",
    tts_voice08="",
    tts_voice09="",
    tts_voice10="",
    tts_voice11="",
    dereverb_automatic=True,
    model_id_bark="suno/bark-small",
    model_id_coqui="tts_models/multilingual/multi-dataset/xtts_v2",
    delete_previous_automatic=True,
):
    """Route every diarized segment to the TTS engine of its speaker.

    The "audio" folder is emptied, each segment gets a temporary
    "tts_name" taken from the voice assigned to its speaker (segments
    without a speaker become "SPEAKER_00"), and the engine is chosen from
    the shape of the voice name (suffix "-Male"/"-Female", " BARK",
    " VITS", " VITS-onnx", " OpenAI-TTS", or an audio file extension for
    Coqui). The "tts_name" keys are removed again before returning.

    Returns:
        list: Six speaker lists, in the order edge, bark, vits, coqui,
        vits-onnx, openai.
    """
    remove_directory_contents("audio")

    # Mapping speakers to voice variables
    assigned_voices = (
        tts_voice00,
        tts_voice01,
        tts_voice02,
        tts_voice03,
        tts_voice04,
        tts_voice05,
        tts_voice06,
        tts_voice07,
        tts_voice08,
        tts_voice09,
        tts_voice10,
        tts_voice11,
    )
    speaker_to_voice = {
        f"SPEAKER_{index:02d}": voice
        for index, voice in enumerate(assigned_voices)
    }

    all_segments = result_diarize["segments"]

    for segment in all_segments:
        # Assign 'SPEAKER_00' to segments without a 'speaker' key
        if "speaker" not in segment:
            segment["speaker"] = "SPEAKER_00"
            logger.warning(
                "NO SPEAKER DETECT IN SEGMENT: First TTS will be used in the"
                f" segment time {segment['start'], segment['text']}"
            )
        # Assign the TTS name
        segment["tts_name"] = speaker_to_voice[segment["speaker"]]

    def select(voice_regex):
        # Speakers whose voice matches the regex, plus their segments.
        speakers = find_spkr(
            re.compile(voice_regex), speaker_to_voice, all_segments
        )
        return speakers, filter_by_speaker(speakers, all_segments)

    # Find TTS method and filter method in segments
    speakers_edge, filtered_edge = select(r".*-(Male|Female)$")
    speakers_bark, filtered_bark = select(r".* BARK$")
    speakers_vits, filtered_vits = select(r".* VITS$")
    speakers_coqui, filtered_coqui = select(r".+\.(wav|mp3|ogg|m4a)$")
    speakers_vits_onnx, filtered_vits_onnx = select(r".* VITS-onnx$")
    speakers_openai_tts, filtered_openai_tts = select(r".* OpenAI-TTS$")

    # Infer
    if filtered_edge["segments"]:
        logger.info(f"EDGE TTS: {speakers_edge}")
        segments_egde_tts(filtered_edge, TRANSLATE_AUDIO_TO, is_gui)  # mp3
    if filtered_bark["segments"]:
        logger.info(f"BARK TTS: {speakers_bark}")
        segments_bark_tts(
            filtered_bark, TRANSLATE_AUDIO_TO, model_id_bark
        )  # wav
    if filtered_vits["segments"]:
        logger.info(f"VITS TTS: {speakers_vits}")
        segments_vits_tts(filtered_vits, TRANSLATE_AUDIO_TO)  # wav
    if filtered_coqui["segments"]:
        logger.info(f"Coqui TTS: {speakers_coqui}")
        segments_coqui_tts(
            filtered_coqui,
            TRANSLATE_AUDIO_TO,
            model_id_coqui,
            speakers_coqui,
            delete_previous_automatic,
            dereverb_automatic,
        )  # wav
    if filtered_vits_onnx["segments"]:
        logger.info(f"PIPER TTS: {speakers_vits_onnx}")
        segments_vits_onnx_tts(filtered_vits_onnx, TRANSLATE_AUDIO_TO)  # wav
    if filtered_openai_tts["segments"]:
        logger.info(f"OpenAI TTS: {speakers_openai_tts}")
        segments_openai_tts(filtered_openai_tts, TRANSLATE_AUDIO_TO)  # wav

    # Drop the temporary routing key from every segment
    for result in result_diarize["segments"]:
        result.pop("tts_name", None)

    return [
        speakers_edge,
        speakers_bark,
        speakers_vits,
        speakers_coqui,
        speakers_vits_onnx,
        speakers_openai_tts
    ]
def accelerate_segments(
    result_diarize,
    max_accelerate_audio,
    valid_speakers,
    acceleration_rate_regulation=False,
    folder_output="audio2",
):
    """Speed up or slow down each TTS clip to fit its segment's time slot.

    For every segment the clip ``audio/<start>.ogg`` is compared with the
    slot ``end - start``. The resulting tempo factor is limited to
    ``max_accelerate_audio`` at the top and 0.8 at the bottom, snapped to
    1.0 between 0.8 and 1.15, and rounded to one decimal. The adjusted
    clip is written to ``<folder_output>/audio/<start>.ogg`` (copied
    unchanged when the factor is 1.0 and the source is OGG, otherwise
    produced with ffmpeg's ``atempo`` filter).

    Args:
        result_diarize: Dict with a "segments" list; each segment provides
            "start", "end" and "speaker".
        max_accelerate_audio: Upper limit for the tempo factor.
        valid_speakers: The six speaker lists returned by
            ``audio_segmentation_to_voice``.
        acceleration_rate_regulation: When true, factors of 1.3 or more
            are softened by letting the clip use part of the gap before
            the next segment (half of it for the same speaker, 70% of it
            otherwise), never going below 1.2.
        folder_output: Root folder for the adjusted clips.

    Returns:
        tuple: ``(audio_files, speakers_list)`` - output paths and labels
        of the form "TTS Speaker NN", one per segment.
    """
    logger.info("Apply acceleration")

    (
        speakers_edge,
        speakers_bark,
        speakers_vits,
        speakers_coqui,
        speakers_vits_onnx,
        speakers_openai_tts
    ) = valid_speakers

    create_directories(f"{folder_output}/audio/")
    remove_directory_contents(f"{folder_output}/audio/")

    audio_files = []
    speakers_list = []

    segments = result_diarize["segments"]
    max_count_segments_idx = len(segments) - 1

    # tqdm wraps the list (not the enumerate object) so it knows the total.
    for i, segment in enumerate(tqdm(segments)):
        start = segment["start"]
        end = segment["end"]
        speaker = segment["speaker"]

        # find name audio
        # if speaker in speakers_edge:
        filename = f"audio/{start}.ogg"
        # elif speaker in speakers_bark + speakers_vits + speakers_coqui + speakers_vits_onnx:
        #    filename = f"audio/{start}.wav" # wav
        output_file = f"{folder_output}/{filename}"

        # duration
        duration_true = end - start
        duration_tts = librosa.get_duration(filename=filename)

        # Accelerate percentage
        if duration_true > 0:
            acc_percentage = duration_tts / duration_true
        else:
            # A zero-length slot used to raise ZeroDivisionError; treat it
            # as needing the maximum allowed speed-up instead.
            acc_percentage = max_accelerate_audio

        # Smoth
        if acceleration_rate_regulation and acc_percentage >= 1.3:
            try:
                next_segment = segments[min(max_count_segments_idx, i + 1)]
                next_start = next_segment["start"]
                next_speaker = next_segment["speaker"]
                duration_with_next_start = next_start - start

                if duration_with_next_start > duration_true:
                    extra_time = duration_with_next_start - duration_true

                    if speaker == next_speaker:
                        # half
                        smoth_duration = duration_true + (extra_time * 0.5)
                    else:
                        # 7/10
                        smoth_duration = duration_true + (extra_time * 0.7)
                    logger.debug(
                        f"Base acc: {acc_percentage}, "
                        f"smoth acc: {duration_tts / smoth_duration}"
                    )
                    acc_percentage = max(1.2, (duration_tts / smoth_duration))

            except Exception as error:
                logger.error(str(error))

        if acc_percentage > max_accelerate_audio:
            acc_percentage = max_accelerate_audio
        elif acc_percentage <= 1.15 and acc_percentage >= 0.8:
            acc_percentage = 1.0
        elif acc_percentage <= 0.79:
            acc_percentage = 0.8

        # Round
        acc_percentage = round(acc_percentage + 0.0, 1)

        # Format read if need
        if speaker in speakers_edge:
            info_enc = sf.info(filename).format
        else:
            info_enc = "OGG"

        # Apply aceleration or opposite to the audio file in folder_output folder
        if acc_percentage == 1.0 and info_enc == "OGG":
            copy_files(filename, f"{folder_output}{os.sep}audio")
        else:
            # Argument list instead of a shell string, so paths containing
            # spaces or shell characters are passed through intact.
            try:
                subprocess.run(
                    [
                        "ffmpeg",
                        "-y",
                        "-loglevel",
                        "panic",
                        "-i",
                        filename,
                        "-filter:a",
                        f"atempo={acc_percentage}",
                        output_file,
                    ]
                )
            except OSError as error:
                # os.system never raised when ffmpeg could not be started;
                # keep going and log it instead.
                logger.error(str(error))

        if logger.isEnabledFor(logging.DEBUG):
            duration_create = librosa.get_duration(filename=output_file)
            logger.debug(
                f"acc_percen is {acc_percentage}, tts duration "
                f"is {duration_tts}, new duration is {duration_create}"
                f", for {filename}"
            )

        audio_files.append(output_file)
        # "SPEAKER_00" -> "TTS Speaker 01"
        speaker = "TTS Speaker {:02d}".format(int(speaker[-2:]) + 1)
        speakers_list.append(speaker)

    return audio_files, speakers_list
# =====================================
# Tone color converter
# =====================================
def se_process_audio_segments(
    source_seg, tone_color_converter, device, remove_previous_processed=True
):
    """Load or extract the "se" object for a folder of WAV segments.

    If ``<source_seg>/se.pth`` exists it is loaded with ``torch.load`` and
    moved to ``device``; otherwise ``tone_color_converter.extract_se`` is
    called with the folder's ``*.wav`` files and that target path.

    Args:
        source_seg: Folder containing the WAV segments.
        tone_color_converter: Object exposing
            ``extract_se(audio_paths, save_path)``.
        device: Device the loaded object is moved to.
        remove_previous_processed: Not used by this function.

    Returns:
        Whatever ``torch.load(...).to(device)`` or ``extract_se`` returns.

    Raises:
        ValueError: if the folder contains no ``*.wav`` files.
    """
    # list wav seg
    source_audio_segs = glob.glob(f"{source_seg}/*.wav")
    if not source_audio_segs:
        # Name the folder that was searched; the message used to print the
        # (always empty) result list.
        raise ValueError(
            f"No audio segments found in {str(source_seg)}"
        )

    source_se_path = os.path.join(source_seg, "se.pth")

    # if exist not create wav
    if os.path.isfile(source_se_path):
        # NOTE(review): torch.load unpickles the file; only point this at
        # se.pth files produced by this pipeline.
        se = torch.load(source_se_path).to(device)
        logger.debug(f"Previous created {source_se_path}")
    else:
        se = tone_color_converter.extract_se(source_audio_segs, source_se_path)

    return se
def _export_vc_sample_pair(
    seg,
    start,
    end,
    dir_path_speaker,
    dir_path_speaker_tts,
    get_vocals_dereverb,
):
    """Cut the original-audio clip for ``seg`` and prepare its TTS sample."""
    create_wav_file_vc(
        sample_name=str(seg["start"]),
        audio_wav="audio.wav",
        start=start,
        end=end,
        output_final_path=dir_path_speaker,
        get_vocals_dereverb=get_vocals_dereverb,
    )
    file_name_tts = f"audio2/audio/{str(seg['start'])}.ogg"
    convert_to_xtts_good_sample(file_name_tts, dir_path_speaker_tts)


def create_wav_vc(
    valid_speakers,
    segments_base,
    audio_name,
    max_segments=10,
    target_dir="processed",
    get_vocals_dereverb=False,
):
    """Build per-speaker reference folders for voice conversion.

    For every speaker two folders are created under ``./<target_dir>``:
    ``<speaker><audio_name>`` receives clips cut from ``audio.wav`` and
    ``tts<speaker><audio_name>`` receives samples derived from the matching
    ``audio2/audio/<start>.ogg`` files.

    Segments longer than 3 s and shorter than 18 s are used, trimmed by one
    second on each side, until ``max_segments`` of them were counted. A
    segment whose clip already exists on disk is counted but not regenerated.
    If no segment qualifies, the speaker's first segment is used instead with
    its length clamped to the 1-18 s range.

    Args:
        valid_speakers: Iterable of speaker labels to process.
        segments_base: List of segment dicts; the keys ``speaker``,
            ``start``, ``end`` and ``text`` are read.
        audio_name: Suffix appended to the per-speaker folder names.
        max_segments: Number of qualifying segments after which a speaker's
            loop stops.
        target_dir: Folder (relative to the working directory) that holds
            the generated sub-folders.
        get_vocals_dereverb: Forwarded to ``create_wav_file_vc``.

    Returns:
        Tuple ``(path_source_segments, path_target_segments)``: the TTS
        folders and the original-audio folders, both in the order of
        ``valid_speakers``.
    """
    output_dir = os.path.join(".", target_dir)

    path_source_segments = []
    path_target_segments = []

    for speaker in valid_speakers:
        filtered_speaker = [
            segment
            for segment in segments_base
            if segment["speaker"] == speaker
        ]
        # With more than four segments the first one is dropped.
        # NOTE(review): the reason is not visible here - presumably the
        # opening segment is considered less reliable; confirm.
        if len(filtered_speaker) > 4:
            filtered_speaker = filtered_speaker[1:]

        dir_path_speaker = os.path.join(output_dir, speaker + audio_name)
        dir_path_speaker_tts = os.path.join(
            output_dir, "tts" + speaker + audio_name
        )
        create_directories([dir_path_speaker, dir_path_speaker_tts])

        # Register the folders before any early exit so both lists stay
        # aligned with ``valid_speakers``.
        path_target_segments.append(dir_path_speaker)
        path_source_segments.append(dir_path_speaker_tts)

        if not filtered_speaker:
            # Indexing the first segment below would raise IndexError.
            logger.warning(f"No segments found for speaker: {speaker}")
            continue

        max_segments_count = 0
        for seg in filtered_speaker:
            duration = float(seg["end"]) - float(seg["start"])
            if not (duration > 3.0 and duration < 18.0):
                continue

            logger.info(
                f'Processing segment: {seg["start"]}, {seg["end"]}, {seg["speaker"]}, {duration}, {seg["text"]}'
            )
            check_segment_audio_target_file = os.path.join(
                dir_path_speaker, f"{str(seg['start'])}.wav"
            )

            if os.path.exists(check_segment_audio_target_file):
                logger.debug(
                    "Segment vc source exists: "
                    f"{check_segment_audio_target_file}"
                )
            else:
                # Trim one second at both ends of the segment.
                _export_vc_sample_pair(
                    seg,
                    float(seg["start"]) + 1.0,
                    float(seg["end"]) - 1.0,
                    dir_path_speaker,
                    dir_path_speaker_tts,
                    get_vocals_dereverb,
                )

            max_segments_count += 1
            if max_segments_count == max_segments:
                break

        if max_segments_count == 0:
            # No segment had a usable length: fall back to the first one.
            logger.info("Taking the first segment")
            seg = filtered_speaker[0]
            logger.info(
                f'Processing segment: {seg["start"]}, {seg["end"]}, {seg["speaker"]}, {seg["text"]}'
            )
            max_duration = float(seg["end"]) - float(seg["start"])
            max_duration = max(1.0, min(max_duration, 18.0))

            _export_vc_sample_pair(
                seg,
                float(seg["start"]),
                float(seg["start"]) + max_duration,
                dir_path_speaker,
                dir_path_speaker_tts,
                get_vocals_dereverb,
            )

    logger.debug(f"Base: {str(path_source_segments)}")
    logger.debug(f"Target: {str(path_target_segments)}")

    return path_source_segments, path_target_segments
def toneconverter_openvoice(
    result_diarize,
    preprocessor_max_segments,
    remove_previous_process=True,
    get_vocals_dereverb=False,
    model="openvoice",
):
    """Apply OpenVoice tone-color conversion to the dubbed segments in place.

    The function prepares per-speaker reference folders with
    ``create_wav_vc``, downloads the converter ``config.json`` and
    ``checkpoint.pth`` into ``./OPENVOICE_MODELS/v1`` or ``.../v2``, computes
    a source (dubbing) and a target (original voice) embedding for each
    speaker and then overwrites every ``audio2/audio/<start>.ogg`` file with
    its converted version.

    Args:
        result_diarize: Dict with a ``"segments"`` list; each segment is read
            for ``"speaker"`` and ``"start"`` here.
        preprocessor_max_segments: Passed to ``create_wav_vc`` as
            ``max_segments``.
        remove_previous_process: When true, the ``processed`` folder is
            emptied before the reference clips are generated.
        get_vocals_dereverb: Forwarded to ``create_wav_vc``.
        model: A name containing ``"v2"`` selects the OpenVoiceV2 converter;
            anything else selects v1.

    Returns:
        None.
    """
    audio_path = "audio.wav"
    target_dir = "processed"
    create_directories(target_dir)

    from openvoice import se_extractor
    from openvoice.api import ToneColorConverter

    base_name = os.path.basename(audio_path).rsplit(".", 1)[0]
    audio_name = f"{base_name}_{se_extractor.hash_numpy_array(audio_path)}"

    # Unique speaker labels; the resulting order is what create_wav_vc and
    # the zip() below both rely on.
    valid_speakers = list(
        {item["speaker"] for item in result_diarize["segments"]}
    )

    logger.info("Openvoice preprocessor...")

    if remove_previous_process:
        remove_directory_contents(target_dir)

    path_source_segments, path_target_segments = create_wav_vc(
        valid_speakers,
        result_diarize["segments"],
        audio_name,
        max_segments=preprocessor_max_segments,
        get_vocals_dereverb=get_vocals_dereverb,
    )

    logger.info("Openvoice loading model...")
    model_path_openvoice = "./OPENVOICE_MODELS"
    url_model_openvoice = "https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/converter"

    if "v2" in model:
        model_path = os.path.join(model_path_openvoice, "v2")
        url_model_openvoice = url_model_openvoice.replace(
            "OpenVoice", "OpenVoiceV2"
        ).replace("checkpoints/", "")
    else:
        model_path = os.path.join(model_path_openvoice, "v1")
    create_directories(model_path)

    config_url = f"{url_model_openvoice}/config.json"
    checkpoint_url = f"{url_model_openvoice}/checkpoint.pth"

    config_path = download_manager(url=config_url, path=model_path)
    checkpoint_path = download_manager(
        url=checkpoint_url, path=model_path
    )

    # NOTE(review): None is passed on when SONITR_DEVICE is unset -
    # presumably the application always sets it; confirm.
    device = os.environ.get("SONITR_DEVICE")
    tone_color_converter = ToneColorConverter(config_path, device=device)
    tone_color_converter.load_ckpt(checkpoint_path)

    logger.info("Openvoice tone color converter:")
    encode_message = "@MyShell"
    global_progress_bar = tqdm(
        total=len(result_diarize["segments"]), desc="Progress"
    )

    try:
        for source_seg, target_seg, speaker in zip(
            path_source_segments, path_target_segments, valid_speakers
        ):
            source_se = se_process_audio_segments(
                source_seg, tone_color_converter, device
            )
            target_se = se_process_audio_segments(
                target_seg, tone_color_converter, device
            )

            filtered_speaker = [
                segment
                for segment in result_diarize["segments"]
                if segment["speaker"] == speaker
            ]
            for seg in filtered_speaker:
                # The converted audio overwrites the dubbed segment file.
                segment_path = f"audio2/audio/{str(seg['start'])}.ogg"
                logger.debug(f"{segment_path}")

                tone_color_converter.convert(
                    audio_src_path=segment_path,
                    src_se=source_se,
                    tgt_se=target_se,
                    output_path=segment_path,
                    message=encode_message,
                )

                global_progress_bar.update(1)
    finally:
        # Always close the bar and release the model, even when a
        # conversion fails part-way through.
        global_progress_bar.close()
        del tone_color_converter
        gc.collect()
        try:
            torch.cuda.empty_cache()
        except Exception as error:
            logger.error(str(error))
def toneconverter_freevc(
    result_diarize,
    remove_previous_process=True,
    get_vocals_dereverb=False,
):
    """Apply FreeVC voice conversion to the dubbed segments in place.

    One reference clip per speaker is prepared with ``create_wav_vc``
    (``max_segments=1``). Every ``audio2/audio/<start>.ogg`` file of that
    speaker is then converted towards the reference clip and written back as
    OGG/Vorbis over the same path.

    Args:
        result_diarize: Dict with a ``"segments"`` list; each segment is read
            for ``"speaker"`` and ``"start"`` here.
        remove_previous_process: When true, the ``processed`` folder is
            emptied before the reference clips are generated.
        get_vocals_dereverb: Forwarded to ``create_wav_vc``.

    Returns:
        None. If the FreeVC model cannot be loaded the error is logged and
        the function returns without converting anything.
    """
    audio_path = "audio.wav"
    target_dir = "processed"
    create_directories(target_dir)

    # Only the audio hashing helper is taken from openvoice here.
    from openvoice import se_extractor

    base_name = os.path.basename(audio_path).rsplit(".", 1)[0]
    audio_name = f"{base_name}_{se_extractor.hash_numpy_array(audio_path)}"

    # create wav seg; original is target and dubbing is source
    valid_speakers = list(
        {item["speaker"] for item in result_diarize["segments"]}
    )

    logger.info("FreeVC preprocessor...")

    if remove_previous_process:
        remove_directory_contents(target_dir)

    path_source_segments, path_target_segments = create_wav_vc(
        valid_speakers,
        result_diarize["segments"],
        audio_name,
        max_segments=1,
        get_vocals_dereverb=get_vocals_dereverb,
    )

    logger.info("FreeVC loading model...")
    device_id = os.environ.get("SONITR_DEVICE")
    device = None if device_id == "cpu" else device_id
    try:
        from TTS.api import TTS

        tts = TTS(
            model_name="voice_conversion_models/multilingual/vctk/freevc24",
            progress_bar=False
        ).to(device)
    except Exception as error:
        logger.error(str(error))
        logger.error("Error loading the FreeVC model.")
        return

    logger.info("FreeVC process:")
    global_progress_bar = tqdm(
        total=len(result_diarize["segments"]), desc="Progress"
    )

    try:
        for source_seg, target_seg, speaker in zip(
            path_source_segments, path_target_segments, valid_speakers
        ):
            filtered_speaker = [
                segment
                for segment in result_diarize["segments"]
                if segment["speaker"] == speaker
            ]

            # Sorted so the chosen reference does not depend on the
            # arbitrary order returned by os.listdir.
            wav_files = sorted(
                file
                for file in os.listdir(target_seg)
                if file.endswith(".wav")
            )
            if not wav_files:
                # Without a reference clip this speaker cannot be converted;
                # leave the dubbed audio untouched and go on.
                logger.error(
                    f"No reference wav found in {target_seg}, "
                    f"skipping voice conversion for {speaker}"
                )
                global_progress_bar.update(len(filtered_speaker))
                continue
            original_wav_audio_segment = os.path.join(
                target_seg, wav_files[0]
            )

            for seg in filtered_speaker:
                # The converted audio overwrites the dubbed segment file.
                segment_path = f"audio2/audio/{str(seg['start'])}.ogg"
                logger.debug(f"{segment_path} - {original_wav_audio_segment}")

                wav = tts.voice_conversion(
                    source_wav=segment_path,
                    target_wav=original_wav_audio_segment,
                )

                sf.write(
                    file=segment_path,
                    samplerate=tts.voice_converter.vc_config.audio.output_sample_rate,
                    data=wav,
                    format="ogg",
                    subtype="vorbis",
                )

                global_progress_bar.update(1)
    finally:
        # Always close the bar and release the model, even when a
        # conversion fails part-way through.
        global_progress_bar.close()
        del tts
        gc.collect()
        try:
            torch.cuda.empty_cache()
        except Exception as error:
            logger.error(str(error))
def toneconverter(
    result_diarize,
    preprocessor_max_segments,
    remove_previous_process=True,
    get_vocals_dereverb=False,
    method_vc="freevc"
):
    """Dispatch voice conversion to the backend named by ``method_vc``.

    Args:
        result_diarize: Diarization result forwarded to the backend.
        preprocessor_max_segments: Forwarded to the OpenVoice backend; the
            FreeVC backend does not receive it and a note is logged when the
            value is greater than 1.
        remove_previous_process: Forwarded to the backend.
        get_vocals_dereverb: Forwarded to the backend.
        method_vc: ``"freevc"`` selects FreeVC; any value containing
            ``"openvoice"`` selects OpenVoice and is passed on as its
            ``model`` argument.

    Returns:
        Whatever the selected backend returns; ``None`` when ``method_vc``
        matches no backend.
    """
    if method_vc == "freevc":
        if preprocessor_max_segments > 1:
            logger.info("FreeVC only uses one segment.")
        return toneconverter_freevc(
            result_diarize,
            remove_previous_process=remove_previous_process,
            get_vocals_dereverb=get_vocals_dereverb,
        )

    if "openvoice" in method_vc:
        return toneconverter_openvoice(
            result_diarize,
            preprocessor_max_segments,
            remove_previous_process=remove_previous_process,
            get_vocals_dereverb=get_vocals_dereverb,
            model=method_vc,
        )

    # Previously an unknown method fell through without any diagnostic.
    logger.warning(
        f"Unknown voice conversion method: {method_vc}. "
        "No conversion was applied."
    )
    return None
if __name__ == "__main__":
    # Manual run: synthesize speech for a sample diarization result.
    from segments import result_diarize

    # One TTS voice per speaker slot.
    speaker_voices = {
        "tts_voice00": "en-facebook-mms VITS",
        "tts_voice01": "en-CA-ClaraNeural-Female",
        "tts_voice02": "en-GB-ThomasNeural-Male",
        "tts_voice03": "en-GB-SoniaNeural-Female",
        "tts_voice04": "en-NZ-MitchellNeural-Male",
        "tts_voice05": "en-GB-MaisieNeural-Female",
    }

    audio_segmentation_to_voice(
        result_diarize,
        TRANSLATE_AUDIO_TO="en",
        max_accelerate_audio=2.1,
        is_gui=True,
        **speaker_voices,
    )