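"""Gradio demo for BERT-ISTFT-VITS Chinese text-to-speech.

Compiles the monotonic_align extension if needed, downloads the
chinese-roberta-wwm-ext-large BERT model and a BERT-ISTFT-VITS checkpoint
(G_25000.pth) from the Hugging Face Hub, and serves a UI that synthesizes
speech and displays the intermediate phone, tone, and word2ph sequences.
"""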
import glob
import os
import subprocess
# Compile the monotonic_align extension on first run
def compile_monotonic_align():
    # os.path.exists does not expand wildcards, so use glob to check for the built .so
    if not glob.glob("monotonic_align/monotonic_align/core.cpython-*.so"):
        print("Compiling monotonic_align...")
        if not os.path.exists("monotonic_align"):
            raise FileNotFoundError("monotonic_align folder not found! Make sure it exists in the project root.")
        os.chdir("monotonic_align")
        os.makedirs("monotonic_align", exist_ok=True)
        subprocess.run(["python", "setup.py", "build_ext", "--inplace"], check=True)
        os.chdir("..")
        print("monotonic_align compiled successfully!")
    else:
        print("monotonic_align already compiled, skipping...")
compile_monotonic_align()
import gradio as gr
import torch
import numpy as np
from scipy.io.wavfile import write
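# commons, utils, models, and text are assumed to be local modules shipped with
# this Space (from the BERT-ISTFT-VITS codebase), not PyPI packages.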
import commons
import utils
from models import SynthesizerTrn
from text.symbols import symbols
from text import get_bert, cleaned_text_to_sequence
from text.cleaner import clean_text
from huggingface_hub import hf_hub_download, snapshot_download
# Model configuration
MODEL_CONFIG = {
"roberta": {
"repo_id": "hfl/chinese-roberta-wwm-ext-large"
},
"vits": {
"repo_id": "guetLzy/BERT-ISTFT-VITS-Model",
"files": ["G_25000.pth"]
}
}
# Device selection
device = "cuda" if torch.cuda.is_available() else "cpu"
# Available model options
MODEL_OPTIONS = {
"VITS_Model": "models/G_25000.pth",
}
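# download_models() fetches the BERT encoder and the VITS checkpoint declared in
# MODEL_CONFIG into ./bert and ./models, so the relative paths in MODEL_OPTIONS
# resolve locally.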
def download_models():
os.makedirs("./bert/chinese-roberta-wwm-ext-large", exist_ok=True)
os.makedirs("./models", exist_ok=True)
roberta_path = snapshot_download(
repo_id=MODEL_CONFIG["roberta"]["repo_id"],
local_dir="./bert/chinese-roberta-wwm-ext-large",
resume_download=True
)
roberta_paths = {"repo_dir": roberta_path}
vits_paths = {}
for model_name, model_path in MODEL_OPTIONS.items():
path = hf_hub_download(
repo_id=MODEL_CONFIG["vits"]["repo_id"],
filename=os.path.basename(model_path),
local_dir="./models",
resume_download=True
)
vits_paths[model_name] = path
return {
"roberta": roberta_paths,
"vits": vits_paths
}
model_paths = download_models()
# Load the configuration and model
hps = utils.get_hparams_from_file("configs/1.json")
net_g = SynthesizerTrn(
len(symbols),
hps.data.filter_length // 2 + 1,
hps.train.segment_size // hps.data.hop_length,
n_speakers=hps.data.n_speakers,
**hps.model,
).to(device)
_ = net_g.eval()
_ = utils.load_checkpoint(model_paths["vits"]["VITS_Model"], net_g, None)
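# Speaker IDs such as "SSB0005" come from hps.data.spk2id in configs/1.json;
# the same mapping populates the speaker dropdown below.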
def get_text(text, hps, language_str="ZH"):
"""处理输入文本,生成语音所需的序列,并返回音素、音调和word2ph序列"""
# 清理文本,获取初始 phone, tone 和 word2ph
norm_text, phone, tone, word2ph = clean_text(text, language_str)
# 保存处理前的 phone, tone 和 word2ph 用于显示
phone_list = phone.copy() # 音素序列
tone_list = tone.copy() # 音调序列
word2ph_list = word2ph.copy() # word2ph序列
    # Convert to integer ID sequences
phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)
    # Intersperse blanks if the config requires it; word2ph is doubled below so the
    # BERT features stay aligned with the blank-padded phone sequence
if hps.data.add_blank:
phone = commons.intersperse(phone, 0)
tone = commons.intersperse(tone, 0)
language = commons.intersperse(language, 0)
for i in range(len(word2ph)):
word2ph[i] = word2ph[i] * 2
word2ph[0] += 1
    # Obtain BERT features
if hps.data.use_bert:
bert = get_bert(norm_text, word2ph, language_str, device)
del word2ph
assert bert.shape[-1] == len(phone)
if language_str == "ZH":
bert = bert
else:
bert = torch.zeros(1024, len(phone))
else:
bert = torch.zeros(1024, len(phone))
    # Convert to tensors
phone = torch.LongTensor(phone)
tone = torch.LongTensor(tone)
language = torch.LongTensor(language)
return bert, phone, tone, language, phone_list, tone_list, word2ph_list
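# Synthesis entry point wired to the Gradio button below; the phone/tone/word2ph
# lists are returned only so the UI can display them.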
def generate_audio(text, noise_scale=1.0, noise_scale_w=0.8, length_scale=1.0, speaker_id="SSB0005"):
"""生成音频文件并返回音素、音调和word2ph序列"""
bert, phones, tones, language_id, phone_list, tone_list, word2ph_list = get_text(text, hps)
with torch.no_grad():
x_tst = phones.to(device).unsqueeze(0)
tones = tones.to(device).unsqueeze(0)
language_id = language_id.to(device).unsqueeze(0)
bert = bert.to(device).unsqueeze(0)
x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
sid = torch.LongTensor([hps.data.spk2id[speaker_id]]).to(device)
audio = (
net_g.infer(
x_tst,
x_tst_lengths,
sid,
tones,
language_id,
bert,
noise_scale=noise_scale,
noise_scale_w=noise_scale_w,
length_scale=length_scale,
)[0][0, 0]
.data.cpu()
.float()
.numpy()
)
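    # The output sample rate is hardcoded to 22050 Hz below; it is assumed to match
    # hps.data.sampling_rate in configs/1.json.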
output_path = "output.wav"
write(output_path, 22050, (audio * 32767.0).astype(np.int16))
return output_path, phone_list, tone_list, word2ph_list
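# Example of calling the synthesis function directly, bypassing the UI
# (hypothetical usage; the speaker ID must exist in hps.data.spk2id):
#
#     wav_path, phones, tones, word2ph = generate_audio(
#         "桂林电子科技大学", noise_scale=0.667, speaker_id="SSB0005"
#     )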
with gr.Blocks(
title="BERT-ISTFT-VITS中文语音合成系统",
theme="gstaff/sketch"
) as interface:
gr.Markdown("# BERT-ISTFT-VITS中文语音合成系统")
gr.Markdown("输入中文文本,调整参数并选择说话人以生成语音。查看处理后的音素、音调和word2ph序列。")
with gr.Row():
with gr.Column(scale=1):
text_input = gr.Textbox(
label="输入文本",
value="桂林电子科技大学",
placeholder="请输入中文文本...",
lines=5,
)
with gr.Group():
gr.Markdown("### 参数调整")
noise_scale = gr.Slider(
minimum=0.1,
maximum=1,
step=0.1,
value=0.667,
label="噪声比例",
info="控制生成音频的噪声水平"
)
noise_scale_w = gr.Slider(
minimum=0.1,
maximum=1,
step=0.1,
value=1.0,
label="噪声比例 W",
info="控制音调的噪声影响"
)
length_scale = gr.Slider(
minimum=0.1,
maximum=1,
step=0.1,
value=1.0,
label="语速比例",
info="调整语音的播放速度"
)
speaker_id = gr.Dropdown(
choices=list(hps.data.spk2id.keys()),
label="选择说话人",
value="SSB0005",
info="选择生成语音的说话人"
)
with gr.Column(scale=1):
audio_output = gr.Audio(
label="生成的音频",
type="filepath",
interactive=False
)
phoneme_output = gr.Textbox(
label="音素序列 (Phones)",
placeholder="处理后的音素序列将显示在此...",
interactive=False
)
tone_output = gr.Textbox(
label="音调序列 (Tones)",
placeholder="处理后的音调序列将显示在此...",
interactive=False
)
word2ph_output = gr.Textbox(
label="Word-to-Phoneme序列 (Word2ph)",
placeholder="处理后的word2ph序列将显示在此...",
interactive=False
)
generate_btn = gr.Button("生成语音", variant="primary")
generate_btn.click(
fn=generate_audio,
inputs=[text_input, noise_scale, noise_scale_w, length_scale, speaker_id],
outputs=[audio_output, phoneme_output, tone_output, word2ph_output]
)
interface.launch()