import sys

sys.path.append("..")

import os

now_dir = os.getcwd()

from dotenv import load_dotenv

from lib.infer.modules.vc.modules import VC
from assets.configs.config import Config

load_dotenv()
config = Config()
vc = VC(config)

import shutil
import numpy as np
import torch

import soundfile as sf
from gtts import gTTS
import edge_tts
import asyncio
import scipy.io.wavfile as wavfile
import nltk

nltk.download("punkt", quiet=True)
from nltk.tokenize import sent_tokenize
from bark import SAMPLE_RATE

import json
import ssl
from typing import Any, Dict, List, Optional
import aiohttp
import certifi

VOICE_LIST = (
    "https://speech.platform.bing.com/consumer/speech/synthesize/"
    "readaloud/voices/list?trustedclienttoken="
    "6A5AA1D4EAFF4E9FB37E23D68491D6F4"
)
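
# NOTE: this is the public voice-list endpoint used by Microsoft Edge's
# "Read Aloud" feature (the same endpoint the edge-tts package queries);
# the trustedclienttoken above is Edge's well-known shared token, not a
# user secret.
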
def get_bark_voice():
    mensaje = """
v2/en_speaker_0 English Male
v2/en_speaker_1 English Male
v2/en_speaker_2 English Male
v2/en_speaker_3 English Male
v2/en_speaker_4 English Male
v2/en_speaker_5 English Male
v2/en_speaker_6 English Male
v2/en_speaker_7 English Male
v2/en_speaker_8 English Male
v2/en_speaker_9 English Female
v2/zh_speaker_0 Chinese (Simplified) Male
v2/zh_speaker_1 Chinese (Simplified) Male
v2/zh_speaker_2 Chinese (Simplified) Male
v2/zh_speaker_3 Chinese (Simplified) Male
v2/zh_speaker_4 Chinese (Simplified) Female
v2/zh_speaker_5 Chinese (Simplified) Male
v2/zh_speaker_6 Chinese (Simplified) Female
v2/zh_speaker_7 Chinese (Simplified) Female
v2/zh_speaker_8 Chinese (Simplified) Male
v2/zh_speaker_9 Chinese (Simplified) Female
v2/fr_speaker_0 French Male
v2/fr_speaker_1 French Female
v2/fr_speaker_2 French Female
v2/fr_speaker_3 French Male
v2/fr_speaker_4 French Male
v2/fr_speaker_5 French Female
v2/fr_speaker_6 French Male
v2/fr_speaker_7 French Male
v2/fr_speaker_8 French Male
v2/fr_speaker_9 French Male
v2/de_speaker_0 German Male
v2/de_speaker_1 German Male
v2/de_speaker_2 German Male
v2/de_speaker_3 German Female
v2/de_speaker_4 German Male
v2/de_speaker_5 German Male
v2/de_speaker_6 German Male
v2/de_speaker_7 German Male
v2/de_speaker_8 German Female
v2/de_speaker_9 German Male
v2/hi_speaker_0 Hindi Female
v2/hi_speaker_1 Hindi Female
v2/hi_speaker_2 Hindi Male
v2/hi_speaker_3 Hindi Female
v2/hi_speaker_4 Hindi Female
v2/hi_speaker_5 Hindi Male
v2/hi_speaker_6 Hindi Male
v2/hi_speaker_7 Hindi Male
v2/hi_speaker_8 Hindi Male
v2/hi_speaker_9 Hindi Female
v2/it_speaker_0 Italian Male
v2/it_speaker_1 Italian Male
v2/it_speaker_2 Italian Female
v2/it_speaker_3 Italian Male
v2/it_speaker_4 Italian Male
v2/it_speaker_5 Italian Male
v2/it_speaker_6 Italian Male
v2/it_speaker_7 Italian Female
v2/it_speaker_8 Italian Male
v2/it_speaker_9 Italian Female
v2/ja_speaker_0 Japanese Female
v2/ja_speaker_1 Japanese Female
v2/ja_speaker_2 Japanese Male
v2/ja_speaker_3 Japanese Female
v2/ja_speaker_4 Japanese Female
v2/ja_speaker_5 Japanese Female
v2/ja_speaker_6 Japanese Male
v2/ja_speaker_7 Japanese Female
v2/ja_speaker_8 Japanese Female
v2/ja_speaker_9 Japanese Female
v2/ko_speaker_0 Korean Female
v2/ko_speaker_1 Korean Male
v2/ko_speaker_2 Korean Male
v2/ko_speaker_3 Korean Male
v2/ko_speaker_4 Korean Male
v2/ko_speaker_5 Korean Male
v2/ko_speaker_6 Korean Male
v2/ko_speaker_7 Korean Male
v2/ko_speaker_8 Korean Male
v2/ko_speaker_9 Korean Male
v2/pl_speaker_0 Polish Male
v2/pl_speaker_1 Polish Male
v2/pl_speaker_2 Polish Male
v2/pl_speaker_3 Polish Male
v2/pl_speaker_4 Polish Female
v2/pl_speaker_5 Polish Male
v2/pl_speaker_6 Polish Female
v2/pl_speaker_7 Polish Male
v2/pl_speaker_8 Polish Male
v2/pl_speaker_9 Polish Female
v2/pt_speaker_0 Portuguese Male
v2/pt_speaker_1 Portuguese Male
v2/pt_speaker_2 Portuguese Male
v2/pt_speaker_3 Portuguese Male
v2/pt_speaker_4 Portuguese Male
v2/pt_speaker_5 Portuguese Male
v2/pt_speaker_6 Portuguese Male
v2/pt_speaker_7 Portuguese Male
v2/pt_speaker_8 Portuguese Male
v2/pt_speaker_9 Portuguese Male
v2/ru_speaker_0 Russian Male
v2/ru_speaker_1 Russian Male
v2/ru_speaker_2 Russian Male
v2/ru_speaker_3 Russian Male
v2/ru_speaker_4 Russian Male
v2/ru_speaker_5 Russian Female
v2/ru_speaker_6 Russian Female
v2/ru_speaker_7 Russian Male
v2/ru_speaker_8 Russian Male
v2/ru_speaker_9 Russian Female
v2/es_speaker_0 Spanish Male
v2/es_speaker_1 Spanish Male
v2/es_speaker_2 Spanish Male
v2/es_speaker_3 Spanish Male
v2/es_speaker_4 Spanish Male
v2/es_speaker_5 Spanish Male
v2/es_speaker_6 Spanish Male
v2/es_speaker_7 Spanish Male
v2/es_speaker_8 Spanish Female
v2/es_speaker_9 Spanish Female
v2/tr_speaker_0 Turkish Male
v2/tr_speaker_1 Turkish Male
v2/tr_speaker_2 Turkish Male
v2/tr_speaker_3 Turkish Male
v2/tr_speaker_4 Turkish Female
v2/tr_speaker_5 Turkish Female
v2/tr_speaker_6 Turkish Male
v2/tr_speaker_7 Turkish Male
v2/tr_speaker_8 Turkish Male
v2/tr_speaker_9 Turkish Male
"""

    lineas = mensaje.split("\n")
    datos_deseados = []
    for linea in lineas:
        # The preset id is the first token and the gender the last, so split
        # on any whitespace; this also tolerates tab-separated columns.
        partes = linea.split()
        if len(partes) >= 3:
            clave, genero = partes[0], partes[-1]
            datos_deseados.append(f"{clave}-{genero}")

    return datos_deseados
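
# get_bark_voice() returns dash-joined "preset-gender" strings, e.g.
# ["v2/en_speaker_0-Male", ..., "v2/tr_speaker_9-Male"].
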
async def list_voices(*, proxy: Optional[str] = None) -> Any:
    """
    List all available voices and their attributes.

    This pulls data from the URL used by Microsoft Edge to return a list of
    all available voices.

    Returns:
        list: A list of dicts describing each voice's attributes.
    """
    ssl_ctx = ssl.create_default_context(cafile=certifi.where())
    async with aiohttp.ClientSession(trust_env=True) as session:
        async with session.get(
            VOICE_LIST,
            headers={
                "Authority": "speech.platform.bing.com",
                "Sec-CH-UA": '" Not;A Brand";v="99", "Microsoft Edge";v="91", "Chromium";v="91"',
                "Sec-CH-UA-Mobile": "?0",
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                "(KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36 Edg/91.0.864.41",
                "Accept": "*/*",
                "Sec-Fetch-Site": "none",
                "Sec-Fetch-Mode": "cors",
                "Sec-Fetch-Dest": "empty",
                "Accept-Encoding": "gzip, deflate, br",
                "Accept-Language": "en-US,en;q=0.9",
            },
            proxy=proxy,
            ssl=ssl_ctx,
        ) as response:
            data = json.loads(await response.text())
    return data
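
# Minimal usage sketch (requires network access; run outside an existing
# event loop):
#   voices = asyncio.run(list_voices())
#   print(voices[0]["ShortName"], voices[0]["Gender"])
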
async def create(custom_voices: Optional[List[Dict[str, Any]]] = None) -> List[Dict[str, Any]]:
    """
    Create a list of all available voices, reduced to their key attributes.
    """
    voices = await list_voices() if custom_voices is None else custom_voices
    voices = [
        {**voice, "Language": voice["Locale"].split("-")[0]}
        for voice in voices
    ]
    simplified_voices = [
        {"ShortName": voice["ShortName"], "Gender": voice["Gender"]}
        for voice in voices
    ]
    return simplified_voices
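
# Each simplified entry looks like, for example:
#   {"ShortName": "en-US-AriaNeural", "Gender": "Female"}
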
async def loop_main():
    voices = await create()
    voices_json = json.dumps(voices)
    return voices_json


def get_edge_voice():
    # asyncio.run replaces the deprecated get_event_loop()/run_until_complete
    # pattern; this function is expected to be called from synchronous code.
    voices_json = asyncio.run(loop_main())
    voices = json.loads(voices_json)
    tts_voice = []
    for voice in voices:
        short_name = voice["ShortName"]
        gender = voice["Gender"]
        formatted_entry = f"{short_name}-{gender}"
        tts_voice.append(formatted_entry)

    return tts_voice
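
# Entries come out as e.g. "en-US-AriaNeural-Female"; use_tts below strips
# the trailing gender with "-".join(tts_voice.split("-")[:-1]) to recover
# the edge-tts ShortName.
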
# Both lists are built once at import time; get_edge_voice performs a
# network request.
set_bark_voice = get_bark_voice()
set_edge_voice = get_edge_voice()


def update_tts_methods_voice(select_value):
    if select_value == "Edge-tts":
        return {"choices": set_edge_voice, "value": "", "__type__": "update"}
    elif select_value == "Bark-tts":
        return {"choices": set_bark_voice, "value": "", "__type__": "update"}
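
# The returned dict mirrors the update payload a Gradio component accepts
# (hence the "__type__": "update" key); any other select_value falls
# through and returns None.
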
def custom_voice(
    _values,
    audio_files,
    model_voice_path="",
    transpose=0,
    f0method="pm",
    index_rate_=0.66,
    crepe_hop_length_=64,
    f0_autotune=False,
    file_index="",
    file_index2="",
):
    # Converts each selected audio file with the loaded RVC model and
    # overwrites the file in place with the converted result.
    vc.get_vc(model_voice_path)

    for _value_item in _values:
        filename = (
            os.path.join("assets", "audios", "audio-outputs", audio_files[_value_item])
            if _value_item != "converted_tts"
            else audio_files[0]
        )

        try:
            print(audio_files[_value_item], model_voice_path)
        except Exception:
            pass

        info_, (sample_, audio_output_) = vc.vc_single_dont_save(
            sid=0,
            input_audio_path1=filename,
            f0_up_key=transpose,
            f0_file=None,
            f0_method=f0method,
            file_index=file_index,
            file_index2=file_index2,
            index_rate=index_rate_,
            filter_radius=3,
            resample_sr=0,
            rms_mix_rate=0.25,
            protect=0.33,
            crepe_hop_length=crepe_hop_length_,
            f0_autotune=f0_autotune,
            f0_min=50,
            note_min=50,
            f0_max=1100,
            note_max=1100,
        )

        sf.write(
            file=filename,
            samplerate=sample_,
            data=audio_output_,
        )


def cast_to_device(tensor, device):
    # Move a tensor to the target device, falling back to the original
    # tensor if the transfer fails (e.g. out of GPU memory).
    try:
        return tensor.to(device)
    except Exception as e:
        print(e)
        return tensor


def __bark__(text, voice_preset):
    os.makedirs(os.path.join(now_dir, "tts"), exist_ok=True)
    from transformers import AutoProcessor, BarkModel

    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    dtype = torch.float32 if "cpu" in device else torch.float16
    bark_processor = AutoProcessor.from_pretrained(
        "suno/bark",
        cache_dir=os.path.join(now_dir, "tts", "suno/bark"),
        torch_dtype=dtype,
    )
    bark_model = BarkModel.from_pretrained(
        "suno/bark",
        cache_dir=os.path.join(now_dir, "tts", "suno/bark"),
        torch_dtype=dtype,
    ).to(device)

    inputs = bark_processor(text=[text], return_tensors="pt", voice_preset=voice_preset)
    tensor_dict = {
        k: cast_to_device(v, device) if hasattr(v, "to") else v
        for k, v in inputs.items()
    }
    speech_values = bark_model.generate(**tensor_dict, do_sample=True)
    sampling_rate = bark_model.generation_config.sample_rate
    speech = speech_values.cpu().numpy().squeeze()
    return speech, sampling_rate
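
# Usage sketch (the first call downloads the Bark weights into ./tts; the
# preset must be one of the ids listed in get_bark_voice):
#   audio, sr = __bark__("Hello there.", "v2/en_speaker_0")
#   sf.write(file="bark_demo.wav", samplerate=sr, data=audio)
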
def use_tts(
    tts_text,
    tts_voice,
    model_path,
    index_path,
    transpose,
    f0_method,
    index_rate,
    crepe_hop_length,
    f0_autotune,
    tts_method,
):
    if tts_voice is None:
        return

    output_folder = "assets/audios/audio-outputs"
    os.makedirs(output_folder, exist_ok=True)

    # Probe for the first free output index so repeated runs never overwrite
    # earlier results.
    output_count = 1
    while True:
        converted_tts_filename = os.path.join(output_folder, f"tts_out_{output_count}.wav")
        bark_out_filename = os.path.join(output_folder, f"bark_out_{output_count}.wav")

        if not os.path.exists(converted_tts_filename) and not os.path.exists(bark_out_filename):
            break
        output_count += 1

    if os.getenv("DEMO") == "SET_LIMIT":
        if len(tts_text) > 60:
            tts_text = tts_text[:60]
            print("DEMO; limiting input to 60 characters")

    language = tts_voice[:2]
    if tts_method == "Edge-tts":
        try:
            # Strip the trailing "-Gender" suffix to recover the edge-tts
            # voice ShortName.
            asyncio.run(
                edge_tts.Communicate(
                    tts_text, "-".join(tts_voice.split("-")[:-1])
                ).save(converted_tts_filename)
            )
        except Exception:
            try:
                tts = gTTS(tts_text, lang=language)
                tts.save(converted_tts_filename)
                print(
                    f"No audio was received from edge-tts for {tts_voice}; falling back to gTTS."
                )
            except Exception:
                tts = gTTS("a", lang=language)
                tts.save(converted_tts_filename)
                print("Error: audio will be replaced with a placeholder.")

        try:
            vc.get_vc(model_path)
            info_, (sample_, audio_output_) = vc.vc_single_dont_save(
                sid=0,
                input_audio_path1=converted_tts_filename,
                f0_up_key=transpose,
                f0_file=None,
                f0_method=f0_method,
                file_index="",
                file_index2=index_path,
                index_rate=index_rate,
                filter_radius=3,
                resample_sr=0,
                rms_mix_rate=0.25,
                protect=0.33,
                crepe_hop_length=crepe_hop_length,
                f0_autotune=f0_autotune,
                f0_min=50,
                note_min=50,
                f0_max=1100,
                note_max=1100,
            )

            vc_output_filename = os.path.join(output_folder, f"converted_tts_{output_count}.wav")

            wavfile.write(
                vc_output_filename,
                rate=sample_,
                data=audio_output_,
            )

            return vc_output_filename, converted_tts_filename
        except Exception as e:
            print(f"{e}")
            return None, None

    elif tts_method == "Bark-tts":
        try:
            script = tts_text.replace("\n", " ").strip()
            sentences = sent_tokenize(script)
            print(sentences)
            # Synthesize sentence by sentence, separated by 0.25 s of silence.
            silence = np.zeros(int(0.25 * SAMPLE_RATE))
            pieces = []
            for sentence in sentences:
                audio_array, _ = __bark__(sentence, tts_voice.split("-")[0])
                pieces += [audio_array, silence.copy()]

            sf.write(
                file=bark_out_filename, samplerate=SAMPLE_RATE, data=np.concatenate(pieces)
            )
            vc.get_vc(model_path)
            info_, (sample_, audio_output_) = vc.vc_single_dont_save(
                sid=0,
                input_audio_path1=bark_out_filename,
                f0_up_key=transpose,
                f0_file=None,
                f0_method=f0_method,
                file_index="",
                file_index2=index_path,
                index_rate=index_rate,
                filter_radius=3,
                resample_sr=0,
                rms_mix_rate=0.25,
                protect=0.33,
                crepe_hop_length=crepe_hop_length,
                f0_autotune=f0_autotune,
                f0_min=50,
                note_min=50,
                f0_max=1100,
                note_max=1100,
            )

            vc_output_filename = os.path.join(output_folder, f"converted_bark_{output_count}.wav")

            wavfile.write(
                vc_output_filename,
                rate=sample_,
                data=audio_output_,
            )

            return vc_output_filename, bark_out_filename

        except Exception as e:
            print(f"{e}")
            return None, None
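
# End-to-end usage sketch (the model/index paths are hypothetical
# placeholders for a trained RVC model and its feature index):
#   out_wav, raw_tts = use_tts(
#       "Hello world",
#       "en-US-AriaNeural-Female",
#       "logs/weights/my_model.pth",
#       "logs/my_model/added.index",
#       transpose=0,
#       f0_method="pm",
#       index_rate=0.66,
#       crepe_hop_length=64,
#       f0_autotune=False,
#       tts_method="Edge-tts",
#   )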