Spaces:
Running
on
Zero
Running
on
Zero
final
Browse files
app.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
import os
|
2 |
import sys
|
3 |
import spaces
|
|
|
4 |
cnhubert_base_path = "GPT_SoVITS/pretrained_models/chinese-hubert-base"
|
5 |
# Forward slashes, matching cnhubert_base_path: the backslash form relied on
# invalid escape sequences (\p, \c) and is only a path separator on Windows.
bert_path = "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large"
|
6 |
os.environ["version"] = 'v2'
|
@@ -134,11 +135,9 @@ def get_spepc(hps, filename):
|
|
134 |
audio=load_audio(filename,int(hps.data.sampling_rate))
|
135 |
audio = audio / np.max(np.abs(audio))
|
136 |
audio=torch.FloatTensor(audio)
|
137 |
-
print(torch.max(torch.abs(audio)))
|
138 |
audio_norm = audio
|
139 |
# audio_norm = audio / torch.max(torch.abs(audio))
|
140 |
audio_norm = audio_norm.unsqueeze(0)
|
141 |
-
print(torch.max(torch.abs(audio_norm)))
|
142 |
spec = spectrogram_torch(audio_norm, hps.data.filter_length,hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,center=False)
|
143 |
return spec
|
144 |
|
@@ -164,11 +163,8 @@ def create_tts_fn(vq_model, ssl_model, t2s_model, hps, config, hz, max_sec):
|
|
164 |
wav16k = wav16k.float()
|
165 |
if(is_half==True):wav16k=wav16k.half().to(device)
|
166 |
else:wav16k=wav16k.to(device)
|
167 |
-
print(wav16k.shape) # 读取16k音频
|
168 |
ssl_content = ssl_model.model(wav16k.unsqueeze(0))["last_hidden_state"].transpose(1, 2)#.float()
|
169 |
-
print(ssl_content.shape)
|
170 |
codes = vq_model.extract_latent(ssl_content)
|
171 |
-
print(codes.shape)
|
172 |
prompt_semantic = codes[0, 0]
|
173 |
t1 = ttime()
|
174 |
phones1, word2ph1, norm_text1 = clean_text(prompt_text, prompt_language)
|
@@ -183,7 +179,6 @@ def create_tts_fn(vq_model, ssl_model, t2s_model, hps, config, hz, max_sec):
|
|
183 |
if(len(phones2) == 1 and phones2[0] == ""):
|
184 |
continue
|
185 |
#phones2, word2ph2, norm_text2 = clean_text(text, text_language)
|
186 |
-
print(phones2)
|
187 |
phones2 = cleaned_text_to_sequence(phones2)
|
188 |
#if(prompt_language=="zh"):bert1 = get_bert_feature(norm_text1, word2ph1).to(device)
|
189 |
bert1 = torch.zeros((1024, len(phones1)),dtype=torch.float16 if is_half==True else torch.float32).to(device)
|
@@ -236,6 +231,7 @@ def get_str_list_from_phone(text, text_language):
|
|
236 |
# raw文本过g2p得到音素列表,再转成字符串
|
237 |
# 注意,这里的text是一个段落,可能包含多个句子
|
238 |
# 段落间\n分割,音素间空格分割
|
|
|
239 |
texts=text.split("\n")
|
240 |
phone_list = []
|
241 |
for text in texts:
|
@@ -280,7 +276,7 @@ models_info = json.load(open("./models/models_info.json", "r", encoding="utf-8")
|
|
280 |
|
281 |
|
282 |
|
283 |
-
for i, info in models_info.items():
|
284 |
title = info['title']
|
285 |
cover = info['cover']
|
286 |
gpt_weight = info['gpt_weight']
|
|
|
1 |
import os
|
2 |
import sys
|
3 |
import spaces
|
4 |
+
import tqdm
|
5 |
cnhubert_base_path = "GPT_SoVITS/pretrained_models/chinese-hubert-base"
|
6 |
# Forward slashes, matching cnhubert_base_path: the backslash form relied on
# invalid escape sequences (\p, \c) and is only a path separator on Windows.
bert_path = "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large"
|
7 |
os.environ["version"] = 'v2'
|
|
|
135 |
audio=load_audio(filename,int(hps.data.sampling_rate))
|
136 |
audio = audio / np.max(np.abs(audio))
|
137 |
audio=torch.FloatTensor(audio)
|
|
|
138 |
audio_norm = audio
|
139 |
# audio_norm = audio / torch.max(torch.abs(audio))
|
140 |
audio_norm = audio_norm.unsqueeze(0)
|
|
|
141 |
spec = spectrogram_torch(audio_norm, hps.data.filter_length,hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,center=False)
|
142 |
return spec
|
143 |
|
|
|
163 |
wav16k = wav16k.float()
|
164 |
if(is_half==True):wav16k=wav16k.half().to(device)
|
165 |
else:wav16k=wav16k.to(device)
|
|
|
166 |
ssl_content = ssl_model.model(wav16k.unsqueeze(0))["last_hidden_state"].transpose(1, 2)#.float()
|
|
|
167 |
codes = vq_model.extract_latent(ssl_content)
|
|
|
168 |
prompt_semantic = codes[0, 0]
|
169 |
t1 = ttime()
|
170 |
phones1, word2ph1, norm_text1 = clean_text(prompt_text, prompt_language)
|
|
|
179 |
if(len(phones2) == 1 and phones2[0] == ""):
|
180 |
continue
|
181 |
#phones2, word2ph2, norm_text2 = clean_text(text, text_language)
|
|
|
182 |
phones2 = cleaned_text_to_sequence(phones2)
|
183 |
#if(prompt_language=="zh"):bert1 = get_bert_feature(norm_text1, word2ph1).to(device)
|
184 |
bert1 = torch.zeros((1024, len(phones1)),dtype=torch.float16 if is_half==True else torch.float32).to(device)
|
|
|
231 |
# raw文本过g2p得到音素列表,再转成字符串
|
232 |
# 注意,这里的text是一个段落,可能包含多个句子
|
233 |
# 段落间\n分割,音素间空格分割
|
234 |
+
print(text)
|
235 |
texts=text.split("\n")
|
236 |
phone_list = []
|
237 |
for text in texts:
|
|
|
276 |
|
277 |
|
278 |
|
279 |
+
for i, info in tqdm.tqdm(models_info.items()):
|
280 |
title = info['title']
|
281 |
cover = info['cover']
|
282 |
gpt_weight = info['gpt_weight']
|