SimpleRVC / tts /conversion.py
xJuuzouYTx
[ADD] edge tts
eb3ba2e
raw
history blame
4.79 kB
import os
import uuid
import numpy as np
import torch
import soundfile as sf
from gtts import gTTS
import edge_tts
from inference import Inference
import asyncio
#git+https://github.com/suno-ai/bark.git
# from transformers import AutoProcessor, BarkModel
# import nltk
# from nltk.tokenize import sent_tokenize
# from bark import SAMPLE_RATE
# now_dir = os.getcwd()
def cast_to_device(tensor, device):
try:
return tensor.to(device)
except Exception as e:
print(e)
return tensor
# Buscar la forma de evitar descargar el archivo de 4gb cada vez que crea una instancia
# def _bark_conversion_(text, voice_preset):
# os.makedirs(os.path.join(now_dir, "tts"), exist_ok=True)
# device = "cuda:0" if torch.cuda.is_available() else "cpu"
# dtype = torch.float32 if "cpu" in device else torch.float16
# bark_processor = AutoProcessor.from_pretrained(
# "suno/bark",
# cache_dir=os.path.join(now_dir, "tts", "suno/bark"),
# torch_dtype=dtype,
# )
# bark_model = BarkModel.from_pretrained(
# "suno/bark",
# cache_dir=os.path.join(now_dir, "tts", "suno/bark"),
# torch_dtype=dtype,
# ).to(device)
# # bark_model.enable_cpu_offload()
# inputs = bark_processor(text=[text], return_tensors="pt", voice_preset=voice_preset)
# tensor_dict = {
# k: cast_to_device(v, device) if hasattr(v, "to") else v
# for k, v in inputs.items()
# }
# speech_values = bark_model.generate(**tensor_dict, do_sample=True)
# sampling_rate = bark_model.generation_config.sample_rate
# speech = speech_values.cpu().numpy().squeeze()
# return speech, sampling_rate
def tts_infer(tts_text, model_url, tts_method, tts_model):
print("*****************")
print(tts_text)
print(model_url)
if not tts_text:
return 'Primero escribe el texto que quieres convertir.', None
if not tts_model:
return 'Selecciona un modelo TTS antes de convertir.', None
if not model_url:
return 'Escribe la url de modelo que quieres usar antes de convertir.', None
f0_method = "harvest"
output_folder = "audios"
os.makedirs(output_folder, exist_ok=True)
converted_tts_filename = os.path.join(output_folder, f"tts_out_{uuid.uuid4()}.wav")
success = False
if len(tts_text) > 60:
tts_text = tts_text[:60]
print("DEMO; limit to 60 characters")
language = tts_model[:2]
if tts_method == "Edge-tts":
try:
asyncio.run(
edge_tts.Communicate(
tts_text, "-".join(tts_model.split("-")[:-1])
).save(converted_tts_filename)
)
success = True
except Exception as e:
print("ERROR", e)
try:
tts = gTTS(tts_text, lang=language)
tts.save(converted_tts_filename)
print(
f"No audio was received. Please change the tts voice for {tts_model}. USING gTTS."
)
success = True
except:
tts = gTTS("a", lang=language)
tts.save(converted_tts_filename)
print("Error: Audio will be replaced.")
success = False
# elif tts_method == "Bark-tts":
# try:
# script = tts_text.replace("\n", " ").strip()
# sentences = sent_tokenize(script)
# silence = np.zeros(int(0.25 * SAMPLE_RATE))
# pieces = []
# for sentence in sentences:
# audio_array, _ = _bark_conversion_(sentence, tts_model.split("-")[0])
# pieces += [audio_array, silence.copy()]
# sf.write(
# file=converted_tts_filename, samplerate=SAMPLE_RATE, data=np.concatenate(pieces)
# )
# except Exception as e:
# print(f"{e}")
# return None, None
if success:
inference = Inference(
model_name=model_url,
f0_method=f0_method,
source_audio_path=converted_tts_filename,
output_file_name=os.path.join("./audio-outputs", os.path.basename(converted_tts_filename)),
)
output = inference.run()
if os.path.exists(converted_tts_filename):
os.remove(converted_tts_filename)
if os.path.exists(os.path.join("weights", inference.model_name)):
os.remove(os.path.join("weights", inference.model_name))
if 'success' in output and output['success']:
return output, output['file']
else:
return output, None
else:
return "Ocurri贸 un error durante la conversi贸n", None