import sys sys.path.append("..") import os now_dir = os.getcwd() from dotenv import load_dotenv from lib.infer.modules.vc.modules import VC from assets.configs.config import Config load_dotenv() config = Config() vc = VC(config) import shutil import numpy as np import torch import soundfile as sf from gtts import gTTS import edge_tts import asyncio import scipy.io.wavfile as wavfile import nltk nltk.download("punkt", quiet=True) from nltk.tokenize import sent_tokenize from bark import SAMPLE_RATE import json import ssl from typing import Any, Dict, List, Optional import asyncio import aiohttp import certifi VOICE_LIST = ( "https://speech.platform.bing.com/consumer/speech/synthesize/" + "readaloud/voices/list?trustedclienttoken=" + "6A5AA1D4EAFF4E9FB37E23D68491D6F4" ) def get_bark_voice(): mensaje = """ v2/en_speaker_0 English Male v2/en_speaker_1 English Male v2/en_speaker_2 English Male v2/en_speaker_3 English Male v2/en_speaker_4 English Male v2/en_speaker_5 English Male v2/en_speaker_6 English Male v2/en_speaker_7 English Male v2/en_speaker_8 English Male v2/en_speaker_9 English Female v2/zh_speaker_0 Chinese (Simplified) Male v2/zh_speaker_1 Chinese (Simplified) Male v2/zh_speaker_2 Chinese (Simplified) Male v2/zh_speaker_3 Chinese (Simplified) Male v2/zh_speaker_4 Chinese (Simplified) Female v2/zh_speaker_5 Chinese (Simplified) Male v2/zh_speaker_6 Chinese (Simplified) Female v2/zh_speaker_7 Chinese (Simplified) Female v2/zh_speaker_8 Chinese (Simplified) Male v2/zh_speaker_9 Chinese (Simplified) Female v2/fr_speaker_0 French Male v2/fr_speaker_1 French Female v2/fr_speaker_2 French Female v2/fr_speaker_3 French Male v2/fr_speaker_4 French Male v2/fr_speaker_5 French Female v2/fr_speaker_6 French Male v2/fr_speaker_7 French Male v2/fr_speaker_8 French Male v2/fr_speaker_9 French Male v2/de_speaker_0 German Male v2/de_speaker_1 German Male v2/de_speaker_2 German Male v2/de_speaker_3 German Female v2/de_speaker_4 German Male v2/de_speaker_5 German Male v2/de_speaker_6 German Male v2/de_speaker_7 German Male v2/de_speaker_8 German Female v2/de_speaker_9 German Male v2/hi_speaker_0 Hindi Female v2/hi_speaker_1 Hindi Female v2/hi_speaker_2 Hindi Male v2/hi_speaker_3 Hindi Female v2/hi_speaker_4 Hindi Female v2/hi_speaker_5 Hindi Male v2/hi_speaker_6 Hindi Male v2/hi_speaker_7 Hindi Male v2/hi_speaker_8 Hindi Male v2/hi_speaker_9 Hindi Female v2/it_speaker_0 Italian Male v2/it_speaker_1 Italian Male v2/it_speaker_2 Italian Female v2/it_speaker_3 Italian Male v2/it_speaker_4 Italian Male v2/it_speaker_5 Italian Male v2/it_speaker_6 Italian Male v2/it_speaker_7 Italian Female v2/it_speaker_8 Italian Male v2/it_speaker_9 Italian Female v2/ja_speaker_0 Japanese Female v2/ja_speaker_1 Japanese Female v2/ja_speaker_2 Japanese Male v2/ja_speaker_3 Japanese Female v2/ja_speaker_4 Japanese Female v2/ja_speaker_5 Japanese Female v2/ja_speaker_6 Japanese Male v2/ja_speaker_7 Japanese Female v2/ja_speaker_8 Japanese Female v2/ja_speaker_9 Japanese Female v2/ko_speaker_0 Korean Female v2/ko_speaker_1 Korean Male v2/ko_speaker_2 Korean Male v2/ko_speaker_3 Korean Male v2/ko_speaker_4 Korean Male v2/ko_speaker_5 Korean Male v2/ko_speaker_6 Korean Male v2/ko_speaker_7 Korean Male v2/ko_speaker_8 Korean Male v2/ko_speaker_9 Korean Male v2/pl_speaker_0 Polish Male v2/pl_speaker_1 Polish Male v2/pl_speaker_2 Polish Male v2/pl_speaker_3 Polish Male v2/pl_speaker_4 Polish Female v2/pl_speaker_5 Polish Male v2/pl_speaker_6 Polish Female v2/pl_speaker_7 Polish Male v2/pl_speaker_8 Polish Male v2/pl_speaker_9 Polish Female v2/pt_speaker_0 Portuguese Male v2/pt_speaker_1 Portuguese Male v2/pt_speaker_2 Portuguese Male v2/pt_speaker_3 Portuguese Male v2/pt_speaker_4 Portuguese Male v2/pt_speaker_5 Portuguese Male v2/pt_speaker_6 Portuguese Male v2/pt_speaker_7 Portuguese Male v2/pt_speaker_8 Portuguese Male v2/pt_speaker_9 Portuguese Male v2/ru_speaker_0 Russian Male v2/ru_speaker_1 Russian Male v2/ru_speaker_2 Russian Male v2/ru_speaker_3 Russian Male v2/ru_speaker_4 Russian Male v2/ru_speaker_5 Russian Female v2/ru_speaker_6 Russian Female v2/ru_speaker_7 Russian Male v2/ru_speaker_8 Russian Male v2/ru_speaker_9 Russian Female v2/es_speaker_0 Spanish Male v2/es_speaker_1 Spanish Male v2/es_speaker_2 Spanish Male v2/es_speaker_3 Spanish Male v2/es_speaker_4 Spanish Male v2/es_speaker_5 Spanish Male v2/es_speaker_6 Spanish Male v2/es_speaker_7 Spanish Male v2/es_speaker_8 Spanish Female v2/es_speaker_9 Spanish Female v2/tr_speaker_0 Turkish Male v2/tr_speaker_1 Turkish Male v2/tr_speaker_2 Turkish Male v2/tr_speaker_3 Turkish Male v2/tr_speaker_4 Turkish Female v2/tr_speaker_5 Turkish Female v2/tr_speaker_6 Turkish Male v2/tr_speaker_7 Turkish Male v2/tr_speaker_8 Turkish Male v2/tr_speaker_9 Turkish Male """ # Dividir el mensaje en líneas lineas = mensaje.split("\n") datos_deseados = [] for linea in lineas: partes = linea.split("\t") if len(partes) == 3: clave, _, genero = partes datos_deseados.append(f"{clave}-{genero}") return datos_deseados # ||-----------------------------------------------------------------------------------|| # || Obtained from dependency edge_tts || # ||-----------------------------------------------------------------------------------|| async def list_voices(*, proxy: Optional[str] = None) -> Any: """ List all available voices and their attributes. This pulls data from the URL used by Microsoft Edge to return a list of all available voices. Returns: dict: A dictionary of voice attributes. """ ssl_ctx = ssl.create_default_context(cafile=certifi.where()) async with aiohttp.ClientSession(trust_env=True) as session: async with session.get( VOICE_LIST, headers={ "Authority": "speech.platform.bing.com", "Sec-CH-UA": '" Not;A Brand";v="99", "Microsoft Edge";v="91", "Chromium";v="91"', "Sec-CH-UA-Mobile": "?0", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " "(KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36 Edg/91.0.864.41", "Accept": "*/*", "Sec-Fetch-Site": "none", "Sec-Fetch-Mode": "cors", "Sec-Fetch-Dest": "empty", "Accept-Encoding": "gzip, deflate, br", "Accept-Language": "en-US,en;q=0.9", }, proxy=proxy, ssl=ssl_ctx, ) as url: data = json.loads(await url.text()) return data async def create(custom_voices: Optional[List[Dict[str, Any]]] = None) -> List[Dict[str, Any]]: """ Creates a list of voices with all available voices and their attributes. """ voices = await list_voices() if custom_voices is None else custom_voices voices = [ {**voice, **{"Language": voice["Locale"].split("-")[0]}} for voice in voices ] simplified_voices = [ {'ShortName': voice['ShortName'], 'Gender': voice['Gender']} for voice in voices ] return simplified_voices async def loop_main(): voices = await create() voices_json = json.dumps(voices) return voices_json def get_edge_voice(): loop = asyncio.get_event_loop() voices_json = loop.run_until_complete(loop_main()) voices = json.loads(voices_json) tts_voice = [] for voice in voices: short_name = voice['ShortName'] gender = voice['Gender'] formatted_entry = f"{short_name}-{gender}" tts_voice.append(formatted_entry) # print(f"{short_name}-{gender}") return tts_voice set_bark_voice = get_bark_voice() set_edge_voice = get_edge_voice() def update_tts_methods_voice(select_value): # ["Edge-tts", "RVG-tts", "Bark-tts"] if select_value == "Edge-tts": return {"choices": set_edge_voice, "value": "", "__type__": "update"} elif select_value == "Bark-tts": return {"choices": set_bark_voice, "value": "", "__type__": "update"} def custom_voice( _values, # filter indices audio_files, # all audio files model_voice_path="", transpose=0, f0method="pm", index_rate_=float(0.66), crepe_hop_length_=float(64), f0_autotune=False, file_index="", file_index2="", ): vc.get_vc(model_voice_path) for _value_item in _values: filename = ( "assets/audios/audio_outputs" + audio_files[_value_item] if _value_item != "converted_tts" else audio_files[0] ) # filename = "audio2/"+audio_files[_value_item] try: print(audio_files[_value_item], model_voice_path) except: pass info_, (sample_, audio_output_) = vc.vc_single_dont_save( sid=0, input_audio_path1=filename, # f"audio2/{filename}", f0_up_key=transpose, # transpose for m to f and reverse 0 12 f0_file=None, f0_method=f0method, file_index=file_index, # dir pwd? file_index2=file_index2, # file_big_npy1, index_rate=index_rate_, filter_radius=int(3), resample_sr=int(0), rms_mix_rate=float(0.25), protect=float(0.33), crepe_hop_length=crepe_hop_length_, f0_autotune=f0_autotune, f0_min=50, note_min=50, f0_max=1100, note_max=1100, ) sf.write( file=filename, # f"audio2/{filename}", samplerate=sample_, data=audio_output_, ) def cast_to_device(tensor, device): try: return tensor.to(device) except Exception as e: print(e) return tensor def __bark__(text, voice_preset): os.makedirs(os.path.join(now_dir, "tts"), exist_ok=True) from transformers import AutoProcessor, BarkModel device = "cuda:0" if torch.cuda.is_available() else "cpu" dtype = torch.float32 if "cpu" in device else torch.float16 bark_processor = AutoProcessor.from_pretrained( "suno/bark", cache_dir=os.path.join(now_dir, "tts", "suno/bark"), torch_dtype=dtype, ) bark_model = BarkModel.from_pretrained( "suno/bark", cache_dir=os.path.join(now_dir, "tts", "suno/bark"), torch_dtype=dtype, ).to(device) # bark_model.enable_cpu_offload() inputs = bark_processor(text=[text], return_tensors="pt", voice_preset=voice_preset) tensor_dict = { k: cast_to_device(v, device) if hasattr(v, "to") else v for k, v in inputs.items() } speech_values = bark_model.generate(**tensor_dict, do_sample=True) sampling_rate = bark_model.generation_config.sample_rate speech = speech_values.cpu().numpy().squeeze() return speech, sampling_rate def use_tts( tts_text, tts_voice, model_path, index_path, transpose, f0_method, index_rate, crepe_hop_length, f0_autotune, tts_method, ): if tts_voice == None: return output_folder = "assets/audios/audio-outputs" os.makedirs(output_folder, exist_ok=True) output_count = 1 # Contador para nombres de archivo únicos while True: converted_tts_filename = os.path.join(output_folder, f"tts_out_{output_count}.wav") bark_out_filename = os.path.join(output_folder, f"bark_out_{output_count}.wav") if not os.path.exists(converted_tts_filename) and not os.path.exists(bark_out_filename): break output_count += 1 if "SET_LIMIT" == os.getenv("DEMO"): if len(tts_text) > 60: tts_text = tts_text[:60] print("DEMO; limit to 60 characters") language = tts_voice[:2] if tts_method == "Edge-tts": try: # nest_asyncio.apply() # gradio;not asyncio.run( edge_tts.Communicate( tts_text, "-".join(tts_voice.split("-")[:-1]) ).save(converted_tts_filename) ) except: try: tts = gTTS(tts_text, lang=language) tts.save(converted_tts_filename) tts.save print( f"No audio was received. Please change the tts voice for {tts_voice}. USING gTTS." ) except: tts = gTTS("a", lang=language) tts.save(converted_tts_filename) print("Error: Audio will be replaced.") try: vc.get_vc(model_path) info_, (sample_, audio_output_) = vc.vc_single_dont_save( sid=0, input_audio_path1=converted_tts_filename, f0_up_key=transpose, f0_file=None, f0_method=f0_method, file_index="", file_index2=index_path, index_rate=index_rate, filter_radius=int(3), resample_sr=int(0), rms_mix_rate=float(0.25), protect=float(0.33), crepe_hop_length=crepe_hop_length, f0_autotune=f0_autotune, f0_min=50, note_min=50, f0_max=1100, note_max=1100, ) # Genera un nombre de archivo único para el archivo procesado por vc.vc_single_dont_save vc_output_filename = os.path.join(output_folder, f"converted_tts_{output_count}.wav") # Guarda el archivo de audio procesado por vc.vc_single_dont_save wavfile.write( vc_output_filename, rate=sample_, data=audio_output_, ) return vc_output_filename,converted_tts_filename except Exception as e: print(f"{e}") return None, None elif tts_method == "Bark-tts": try: script = tts_text.replace("\n", " ").strip() sentences = sent_tokenize(script) print(sentences) silence = np.zeros(int(0.25 * SAMPLE_RATE)) pieces = [] for sentence in sentences: audio_array, _ = __bark__(sentence, tts_voice.split("-")[0]) pieces += [audio_array, silence.copy()] sf.write( file=bark_out_filename, samplerate=SAMPLE_RATE, data=np.concatenate(pieces) ) vc.get_vc(model_path) info_, (sample_, audio_output_) = vc.vc_single_dont_save( sid=0, input_audio_path1=os.path.join( now_dir, "assets", "audios", "audio-outputs", "bark_out.wav" ), # f"audio2/{filename}", f0_up_key=transpose, # transpose for m to f and reverse 0 12 f0_file=None, f0_method=f0_method, file_index="", # dir pwd? file_index2=index_path, # file_big_npy1, index_rate=index_rate, filter_radius=int(3), resample_sr=int(0), rms_mix_rate=float(0.25), protect=float(0.33), crepe_hop_length=crepe_hop_length, f0_autotune=f0_autotune, f0_min=50, note_min=50, f0_max=1100, note_max=1100, ) vc_output_filename = os.path.join(output_folder, f"converted_bark_{output_count}.wav") # Guarda el archivo de audio procesado por vc.vc_single_dont_save wavfile.write( vc_output_filename, rate=sample_, data=audio_output_, ) return vc_output_filename, bark_out_filename except Exception as e: print(f"{e}") return None, None