"""Gradio web UI that synthesises speech from text and converts it with RVC.

Pipeline: input text -> remote TTS service (mp3) -> RVC voice conversion ->
final audio.  Importing/running this module downloads the required model
files, loads the HuBERT and RMVPE models, builds the UI and launches it.
"""
import asyncio
import datetime
import logging
import os
import time
import traceback
import shutil
import urllib.request
import zipfile
import gdown
from argparse import ArgumentParser
import requests
import random


def _aria2_download(url, out_dir, out_name):
    """Fetch *url* into ``out_dir/out_name`` with aria2c.

    The exit status is ignored (as before): aria2c's ``-c`` flag resumes or
    skips files that already exist, and a missing file surfaces later when
    the model is loaded.
    """
    os.system(
        "aria2c --console-log-level=error -c -x 16 -s 16 -k 1M "
        f"{url} -d {out_dir} -o {out_name}"
    )


# (url, target directory, target file name) for every file fetched at startup.
_STARTUP_DOWNLOADS = (
    ("https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt", ".", "hubert_base.pt"),
    ("https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/rmvpe.pt", ".", "rmvpe.pt"),
    # Optional voice models, currently disabled:
    # ("https://huggingface.co/sail-rvc/yoimiya-jp/resolve/main/model.pth", "./weights/yoimiya", "yoimiya.pth"),
    # ("https://huggingface.co/sail-rvc/yoimiya-jp/resolve/main/model.index", "./weights/yoimiya", "yoimiya.index"),
    # ("https://huggingface.co/sail-rvc/hitzeed-ch/resolve/main/model.pth", "./weights/hitzeed", "hitzeed.pth"),
    # ("https://huggingface.co/sail-rvc/hitzeed-ch/resolve/main/model.index", "./weights/hitzeed", "hitzeed.index"),
    # ("https://huggingface.co/sail-rvc/Eminem2333333/blob/main/model.pth", "./weights/Eminem", "Eminem.pth"),
    # ("https://huggingface.co/sail-rvc/hitzeed-ch/resolve/main/model.index", "./weights/Eminem", "Eminem.index"),
    ("https://huggingface.co/zhuowen999/yutou/resolve/main/yutou.index", "./weights/yutou", "yutou.index"),
    ("https://huggingface.co/zhuowen999/yutou/resolve/main/yutou.pth", "./weights/yutou", "yutou.pth"),
)

for _url, _out_dir, _out_name in _STARTUP_DOWNLOADS:
    _aria2_download(_url, _out_dir, _out_name)

BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# NOTE(review): not used below; models are discovered via `model_root`,
# which is relative to the current working directory.
rvc_models_dir = os.path.join(BASE_DIR, 'weights')

# These imports intentionally stay after the downloads, matching the original
# execution order of the script.
import edge_tts
import gradio as gr
import librosa
import torch
from fairseq import checkpoint_utils

from config import Config
from lib.infer_pack.models import (
    SynthesizerTrnMs256NSFsid,
    SynthesizerTrnMs256NSFsid_nono,
    SynthesizerTrnMs768NSFsid,
    SynthesizerTrnMs768NSFsid_nono,
)
from rmvpe import RMVPE
from vc_infer_pipeline import VC

# Silence chatty third-party loggers.
logging.getLogger("fairseq").setLevel(logging.WARNING)
logging.getLogger("numba").setLevel(logging.WARNING)
logging.getLogger("markdown_it").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)
logging.getLogger("matplotlib").setLevel(logging.WARNING)

# True when the SYSTEM environment variable is "spaces"; enables the text
# and audio length limits enforced in `tts`.
limitation = os.getenv("SYSTEM") == "spaces"

config = Config()

# Fixed path the intermediate TTS mp3 is written to (shared by all requests).
edge_output_filename = "edge_output.mp3"
# NOTE(review): the result is not used anywhere else in this file, yet it
# costs a network round trip at startup — confirm whether it is still needed.
tts_voice_list = asyncio.get_event_loop().run_until_complete(edge_tts.list_voices())
tts_voices = [
    'zh-CN-XiaoxiaoMultilingualNeural',
    'zh-CN-YunyiMultilingualNeural',
    'zh-CN-XiaoyuMultilingualNeural',
    'zh-CN-XiaochenMultilingualNeural',
]

# Seconds to wait for each HTTP call to the remote TTS service.
TTS_REQUEST_TIMEOUT = 60

# Every sub-directory of `model_root` is offered as a selectable model.
model_root = "weights"
models = [
    d for d in os.listdir(model_root) if os.path.isdir(os.path.join(model_root, d))
]
if not models:
    raise ValueError("No model found in `weights` folder")
models.sort()


def tts_new(text, path, voice='zh-CN-XiaoxiaoMultilingualNeural', rate=-8):
    """Synthesise *text* via text-to-speech.cn and save the mp3 to *path*.

    Args:
        text: Text to speak.
        path: Destination file for the downloaded mp3.
        voice: Voice identifier sent to the service.
        rate: Speech-rate value sent to the service.

    Raises:
        requests.RequestException: On network errors, timeouts or HTTP
            error statuses.
        RuntimeError: If the service response carries no download URL.
    """
    url = "https://www.text-to-speech.cn/getSpeek.php"
    payload = {
        "user_id": str(random.randint(120100, 2000000)),
        "language": "中文(普通话,简体)",
        "voice": voice,
        "text": text,
        "role": "0",
        "style": "0",
        "styledegree": "1",
        "volume": "75",
        "predict": "0",
        "rate": rate,
        "pitch": "0",
        "kbitrate": "audio-16khz-128kbitrate-mono-mp3",
        "silence": "",
        "yzm": "^\"",
    }
    headers = {
        "Referer": "https://www.text-to-speech.cn/",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0",
        "accept": "*/*",
        "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
        "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
        "origin": "https://www.text-to-speech.cn",
        "referer": "https://www.text-to-speech.cn/",
    }

    # A timeout keeps a stalled service from blocking the UI worker forever.
    response = requests.post(
        url, data=payload, headers=headers, timeout=TTS_REQUEST_TIMEOUT
    )
    response.raise_for_status()
    result = response.json()
    print(result)

    download_url = result.get("download") if isinstance(result, dict) else None
    if not download_url:
        raise RuntimeError(f"TTS service returned no download URL: {result!r}")

    mp3 = requests.get(download_url, timeout=TTS_REQUEST_TIMEOUT)
    mp3.raise_for_status()
    with open(path, "wb") as f:
        f.write(mp3.content)


def model_data(model_name):
    """Load the RVC model stored in ``weights/<model_name>``.

    Args:
        model_name: Name of a sub-directory of `model_root`.

    Returns:
        Tuple ``(tgt_sr, net_g, vc, version, index_file, if_f0)`` where
        ``index_file`` is ``""`` when the directory has no ``.index`` file.

    Raises:
        ValueError: If no ``.pth`` file exists or the checkpoint version is
            neither ``"v1"`` nor ``"v2"``.
    """
    model_dir = os.path.join(model_root, model_name)
    # Sorted so the chosen file is deterministic when several are present
    # (os.listdir order is arbitrary).
    file_names = sorted(os.listdir(model_dir))

    pth_files = [os.path.join(model_dir, f) for f in file_names if f.endswith(".pth")]
    if not pth_files:
        raise ValueError(f"No pth file found in {model_root}/{model_name}")
    pth_path = pth_files[0]
    print(f"Loading {pth_path}")

    # SECURITY NOTE: torch.load unpickles the file; only load checkpoints
    # from sources you trust.
    cpt = torch.load(pth_path, map_location="cpu")
    tgt_sr = cpt["config"][-1]
    cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
    if_f0 = cpt.get("f0", 1)
    version = cpt.get("version", "v1")

    if version == "v1":
        if if_f0 == 1:
            net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=config.is_half)
        else:
            net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
    elif version == "v2":
        if if_f0 == 1:
            net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=config.is_half)
        else:
            net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
    else:
        raise ValueError("Unknown version")

    del net_g.enc_q
    net_g.load_state_dict(cpt["weight"], strict=False)
    print("Model loaded")
    net_g.eval().to(config.device)
    net_g = net_g.half() if config.is_half else net_g.float()
    vc = VC(tgt_sr, config)

    index_files = [
        os.path.join(model_dir, f) for f in file_names if f.endswith(".index")
    ]
    if not index_files:
        print("No index file found")
        index_file = ""
    else:
        index_file = index_files[0]
        print(f"Index file found: {index_file}")

    return tgt_sr, net_g, vc, version, index_file, if_f0


def load_hubert():
    """Load ``hubert_base.pt`` onto the configured device.

    Stores the model in the module-level ``hubert_model`` and returns it in
    eval mode, in half or full precision according to ``config.is_half``.
    """
    global hubert_model
    ensemble, _, _ = checkpoint_utils.load_model_ensemble_and_task(
        ["hubert_base.pt"],
        suffix="",
    )
    hubert_model = ensemble[0]
    hubert_model = hubert_model.to(config.device)
    if config.is_half:
        hubert_model = hubert_model.half()
    else:
        hubert_model = hubert_model.float()
    return hubert_model.eval()


print("Loading hubert model...")
hubert_model = load_hubert()
print("Hubert model loaded.")

print("Loading rmvpe model...")
rmvpe_model = RMVPE("rmvpe.pt", config.is_half, config.device)
print("rmvpe model loaded.")


def tts(
    model_name,
    speed,
    tts_text,
    tts_voice,
    f0_up_key,
    f0_method,
    index_rate,
    protect,
    filter_radius=3,
    resample_sr=0,
    rms_mix_rate=0.25,
):
    """Gradio handler: synthesise *tts_text* and convert it with an RVC model.

    Args:
        model_name: Sub-directory of `model_root` holding the RVC model.
        speed: Speech speed from the UI slider; forwarded to `tts_new` as
            its ``rate``.
        tts_text: Text to speak.
        tts_voice: Voice identifier forwarded to `tts_new`.
        f0_up_key: Transpose value; converted with ``int()``.
        f0_method: Pitch extraction method; ``"rmvpe"`` uses the preloaded
            RMVPE model.
        index_rate, protect, filter_radius, resample_sr, rms_mix_rate:
            Passed through to ``vc.pipeline``.

    Returns:
        Tuple ``(info, tts_mp3_path_or_None, (sample_rate, audio)_or_None)``.
        Failures are reported through ``info`` rather than raised, so the
        message is shown in the UI.
    """
    print("------------------")
    print(datetime.datetime.now())
    print("tts_text:")
    print(tts_text)
    print(f"tts_voice: {tts_voice}")
    print(f"Model name: {model_name}")
    print(f"F0: {f0_method}, Key: {f0_up_key}, Index: {index_rate}, Protect: {protect}")
    try:
        if limitation and len(tts_text) > 280:
            print("Error: Text too long")
            return (
                f"Text characters should be at most 280 in this huggingface space, but got {len(tts_text)} characters.",
                None,
                None,
            )
        tgt_sr, net_g, vc, version, index_file, if_f0 = model_data(model_name)

        # Speech is produced by the remote service wrapped in `tts_new`
        # (this replaced the earlier edge-tts call).
        t0 = time.time()
        tts_new(tts_text, edge_output_filename, tts_voice, speed)
        t1 = time.time()
        edge_time = t1 - t0

        audio, sr = librosa.load(edge_output_filename, sr=16000, mono=True)
        duration = len(audio) / sr
        print(f"Audio duration: {duration}s")
        if limitation and duration >= 20:
            print("Error: Audio too long")
            return (
                f"Audio should be less than 20 seconds in this huggingface space, but got {duration}s.",
                edge_output_filename,
                None,
            )

        f0_up_key = int(f0_up_key)

        if not hubert_model:
            load_hubert()
        if f0_method == "rmvpe":
            vc.model_rmvpe = rmvpe_model
        times = [0, 0, 0]
        audio_opt = vc.pipeline(
            hubert_model,
            net_g,
            0,
            audio,
            edge_output_filename,
            times,
            f0_up_key,
            f0_method,
            index_file,
            # file_big_npy,
            index_rate,
            if_f0,
            filter_radius,
            tgt_sr,
            resample_sr,
            rms_mix_rate,
            version,
            protect,
            None,
        )
        # Chained comparison: true when resample_sr differs from tgt_sr AND
        # resample_sr is at least 16000; the output then uses resample_sr.
        if tgt_sr != resample_sr >= 16000:
            tgt_sr = resample_sr
        info = f"Success. Time: edge-tts: {edge_time}s, npy: {times[0]}s, f0: {times[1]}s, infer: {times[2]}s"
        print(info)
        return (
            info,
            edge_output_filename,
            (tgt_sr, audio_opt),
        )
    except EOFError:
        info = (
            "It seems that the edge-tts output is not valid. "
            "This may occur when the input text and the speaker do not match. "
            "For example, maybe you entered Japanese (without alphabets) text but chose non-Japanese speaker?"
        )
        print(info)
        return info, None, None
    except Exception:
        # UI boundary: show the traceback to the user instead of crashing,
        # but let KeyboardInterrupt/SystemExit propagate.
        info = traceback.format_exc()
        print(info)
        return info, None, None


initial_md = """
# RVC TTS HF 🤗

[![open in clab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Blane187/rvc-tts/blob/main/rvc_tts.ipynb)

This is a text-to-speech webui of RVC models.

Input text ➡[(edge-tts)](https://github.com/rany2/edge-tts)➡ Speech mp3 file ➡[(RVC)](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI)➡ Final output
"""

Another_md = """
RVC TTS → [🌐 Github](https://github.com/Blane187/rvc-tts.git)
"""

app = gr.Blocks(title="RVC-TTS")
with app:
    gr.Markdown(initial_md)
    gr.Markdown(Another_md)
    with gr.Row():
        with gr.Column():
            model_name = gr.Dropdown(label="Model", choices=models, value=models[0])
            f0_key_up = gr.Number(
                label="Transpose (the best value depends on the models and speakers)",
                value=0,
            )
        with gr.Column():
            f0_method = gr.Radio(
                label="Pitch extraction method (Rmvpe is default)",
                choices=["rmvpe", "crepe"],  # harvest is too slow
                value="rmvpe",
                interactive=True,
            )
            index_rate = gr.Slider(
                minimum=0,
                maximum=1,
                label="Index rate",
                value=1,
                interactive=True,
            )
            protect0 = gr.Slider(
                minimum=0,
                maximum=0.5,
                label="Protect",
                value=0.33,
                step=0.01,
                interactive=True,
            )
    # NOTE(review): the column nesting of this row was reconstructed from a
    # source whose indentation was lost — confirm against the intended layout.
    with gr.Row():
        with gr.Column():
            tts_voice = gr.Dropdown(
                label="Edge-tts speaker (format: language-Country-Name-Gender)",
                choices=tts_voices,
                allow_custom_value=False,
                value="zh-CN-XiaoxiaoMultilingualNeural",
            )
            speed = gr.Slider(
                minimum=-100,
                maximum=100,
                label="Speech speed (%)",
                value=0,
                step=10,
                interactive=True,
            )
        with gr.Column():
            tts_text = gr.Textbox(label="Input Text", value="很高兴见到你")
        with gr.Column():
            but0 = gr.Button("Convert", variant="primary")
            info_text = gr.Textbox(label="Output info")
        with gr.Column():
            edge_tts_output = gr.Audio(label="Edge Voice", type="filepath")
        with gr.Column():
            tts_output = gr.Audio(label="Result")
        but0.click(
            tts,
            [
                model_name,
                speed,
                tts_text,
                tts_voice,
                f0_key_up,
                f0_method,
                index_rate,
                protect0,
            ],
            [info_text, edge_tts_output, tts_output],
        )
    with gr.Row():
        # Example voices must be entries of `tts_voices`: the dropdown has
        # allow_custom_value=False, and `tts_new` sends the voice name
        # verbatim, so the former "<voice>-Female" style names could not work.
        examples = gr.Examples(
            examples_per_page=100,
            examples=[
                ["これは日本語テキストから音声への変換デモです。", tts_voices[0]],
                [
                    "This is an English text to speech conversation demo.",
                    tts_voices[0],
                ],
            ],
            inputs=[tts_text, tts_voice],
        )

app.launch()