AkitoP committed on
Commit
52d3468
1 Parent(s): c8ad912
Files changed (1) hide show
  1. app.py +3 -7
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import os
2
  import sys
3
  import spaces
 
4
  cnhubert_base_path = "GPT_SoVITS/pretrained_models/chinese-hubert-base"
5
  bert_path = "GPT_SoVITS\pretrained_models\chinese-roberta-wwm-ext-large"
6
  os.environ["version"] = 'v2'
@@ -134,11 +135,9 @@ def get_spepc(hps, filename):
134
  audio=load_audio(filename,int(hps.data.sampling_rate))
135
  audio = audio / np.max(np.abs(audio))
136
  audio=torch.FloatTensor(audio)
137
- print(torch.max(torch.abs(audio)))
138
  audio_norm = audio
139
  # audio_norm = audio / torch.max(torch.abs(audio))
140
  audio_norm = audio_norm.unsqueeze(0)
141
- print(torch.max(torch.abs(audio_norm)))
142
  spec = spectrogram_torch(audio_norm, hps.data.filter_length,hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,center=False)
143
  return spec
144
 
@@ -164,11 +163,8 @@ def create_tts_fn(vq_model, ssl_model, t2s_model, hps, config, hz, max_sec):
164
  wav16k = wav16k.float()
165
  if(is_half==True):wav16k=wav16k.half().to(device)
166
  else:wav16k=wav16k.to(device)
167
- print(wav16k.shape) # 读取16k音频
168
  ssl_content = ssl_model.model(wav16k.unsqueeze(0))["last_hidden_state"].transpose(1, 2)#.float()
169
- print(ssl_content.shape)
170
  codes = vq_model.extract_latent(ssl_content)
171
- print(codes.shape)
172
  prompt_semantic = codes[0, 0]
173
  t1 = ttime()
174
  phones1, word2ph1, norm_text1 = clean_text(prompt_text, prompt_language)
@@ -183,7 +179,6 @@ def create_tts_fn(vq_model, ssl_model, t2s_model, hps, config, hz, max_sec):
183
  if(len(phones2) == 1 and phones2[0] == ""):
184
  continue
185
  #phones2, word2ph2, norm_text2 = clean_text(text, text_language)
186
- print(phones2)
187
  phones2 = cleaned_text_to_sequence(phones2)
188
  #if(prompt_language=="zh"):bert1 = get_bert_feature(norm_text1, word2ph1).to(device)
189
  bert1 = torch.zeros((1024, len(phones1)),dtype=torch.float16 if is_half==True else torch.float32).to(device)
@@ -236,6 +231,7 @@ def get_str_list_from_phone(text, text_language):
236
  # raw文本过g2p得到音素列表,再转成字符串
237
  # 注意,这里的text是一个段落,可能包含多个句子
238
  # 段落间\n分割,音素间空格分割
 
239
  texts=text.split("\n")
240
  phone_list = []
241
  for text in texts:
@@ -280,7 +276,7 @@ models_info = json.load(open("./models/models_info.json", "r", encoding="utf-8")
280
 
281
 
282
 
283
- for i, info in models_info.items():
284
  title = info['title']
285
  cover = info['cover']
286
  gpt_weight = info['gpt_weight']
 
1
  import os
2
  import sys
3
  import spaces
4
+ import tqdm
5
  cnhubert_base_path = "GPT_SoVITS/pretrained_models/chinese-hubert-base"
6
  bert_path = "GPT_SoVITS\pretrained_models\chinese-roberta-wwm-ext-large"
7
  os.environ["version"] = 'v2'
 
135
  audio=load_audio(filename,int(hps.data.sampling_rate))
136
  audio = audio / np.max(np.abs(audio))
137
  audio=torch.FloatTensor(audio)
 
138
  audio_norm = audio
139
  # audio_norm = audio / torch.max(torch.abs(audio))
140
  audio_norm = audio_norm.unsqueeze(0)
 
141
  spec = spectrogram_torch(audio_norm, hps.data.filter_length,hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,center=False)
142
  return spec
143
 
 
163
  wav16k = wav16k.float()
164
  if(is_half==True):wav16k=wav16k.half().to(device)
165
  else:wav16k=wav16k.to(device)
 
166
  ssl_content = ssl_model.model(wav16k.unsqueeze(0))["last_hidden_state"].transpose(1, 2)#.float()
 
167
  codes = vq_model.extract_latent(ssl_content)
 
168
  prompt_semantic = codes[0, 0]
169
  t1 = ttime()
170
  phones1, word2ph1, norm_text1 = clean_text(prompt_text, prompt_language)
 
179
  if(len(phones2) == 1 and phones2[0] == ""):
180
  continue
181
  #phones2, word2ph2, norm_text2 = clean_text(text, text_language)
 
182
  phones2 = cleaned_text_to_sequence(phones2)
183
  #if(prompt_language=="zh"):bert1 = get_bert_feature(norm_text1, word2ph1).to(device)
184
  bert1 = torch.zeros((1024, len(phones1)),dtype=torch.float16 if is_half==True else torch.float32).to(device)
 
231
  # raw文本过g2p得到音素列表,再转成字符串
232
  # 注意,这里的text是一个段落,可能包含多个句子
233
  # 段落间\n分割,音素间空格分割
234
+ print(text)
235
  texts=text.split("\n")
236
  phone_list = []
237
  for text in texts:
 
276
 
277
 
278
 
279
+ for i, info in tqdm.tqdm(models_info.items()):
280
  title = info['title']
281
  cover = info['cover']
282
  gpt_weight = info['gpt_weight']