Spaces:

AkitoP
/

GPT-SoVITS-V2-Gakuen_Idolmaster

Build error

App Files Files Community

AkitoP committed on Sep 8, 2024

Commit

52d3468

•

1 Parent(s): c8ad912

final

Browse files

Files changed (1) hide show

app.py +3 -7

app.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import os
 import sys
 import spaces
 cnhubert_base_path = "GPT_SoVITS/pretrained_models/chinese-hubert-base"
 bert_path = "GPT_SoVITS\pretrained_models\chinese-roberta-wwm-ext-large"
 os.environ["version"] = 'v2'
@@ -134,11 +135,9 @@ def get_spepc(hps, filename):
     audio=load_audio(filename,int(hps.data.sampling_rate))
     audio = audio / np.max(np.abs(audio))
     audio=torch.FloatTensor(audio)
-    print(torch.max(torch.abs(audio)))
     audio_norm = audio
     # audio_norm = audio / torch.max(torch.abs(audio))
     audio_norm = audio_norm.unsqueeze(0)
-    print(torch.max(torch.abs(audio_norm)))
     spec = spectrogram_torch(audio_norm, hps.data.filter_length,hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,center=False)
     return spec
@@ -164,11 +163,8 @@ def create_tts_fn(vq_model, ssl_model, t2s_model, hps, config, hz, max_sec):
             wav16k = wav16k.float()
             if(is_half==True):wav16k=wav16k.half().to(device)
             else:wav16k=wav16k.to(device)
-            print(wav16k.shape) # 读取16k音频
             ssl_content = ssl_model.model(wav16k.unsqueeze(0))["last_hidden_state"].transpose(1, 2)#.float()
-            print(ssl_content.shape)
             codes = vq_model.extract_latent(ssl_content)
-            print(codes.shape)
             prompt_semantic = codes[0, 0]
         t1 = ttime()
         phones1, word2ph1, norm_text1 = clean_text(prompt_text, prompt_language)
@@ -183,7 +179,6 @@ def create_tts_fn(vq_model, ssl_model, t2s_model, hps, config, hz, max_sec):
             if(len(phones2) == 1 and phones2[0] == ""):
                 continue
             #phones2, word2ph2, norm_text2 = clean_text(text, text_language)
-            print(phones2)
             phones2 = cleaned_text_to_sequence(phones2)
             #if(prompt_language=="zh"):bert1 = get_bert_feature(norm_text1, word2ph1).to(device)
             bert1 = torch.zeros((1024, len(phones1)),dtype=torch.float16 if is_half==True else torch.float32).to(device)
@@ -236,6 +231,7 @@ def get_str_list_from_phone(text, text_language):
     # raw文本过g2p得到音素列表，再转成字符串
     # 注意，这里的text是一个段落，可能包含多个句子
     # 段落间\n分割，音素间空格分割
     texts=text.split("\n")
     phone_list = []
     for text in texts:
@@ -280,7 +276,7 @@ models_info = json.load(open("./models/models_info.json", "r", encoding="utf-8")
-for i, info in models_info.items():
     title = info['title']
     cover = info['cover']
     gpt_weight = info['gpt_weight']

 import os
 import sys
 import spaces
+import tqdm
 cnhubert_base_path = "GPT_SoVITS/pretrained_models/chinese-hubert-base"
 bert_path = "GPT_SoVITS\pretrained_models\chinese-roberta-wwm-ext-large"
 os.environ["version"] = 'v2'
     audio=load_audio(filename,int(hps.data.sampling_rate))
     audio = audio / np.max(np.abs(audio))
     audio=torch.FloatTensor(audio)
     audio_norm = audio
     # audio_norm = audio / torch.max(torch.abs(audio))
     audio_norm = audio_norm.unsqueeze(0)
     spec = spectrogram_torch(audio_norm, hps.data.filter_length,hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,center=False)
     return spec
             wav16k = wav16k.float()
             if(is_half==True):wav16k=wav16k.half().to(device)
             else:wav16k=wav16k.to(device)
             ssl_content = ssl_model.model(wav16k.unsqueeze(0))["last_hidden_state"].transpose(1, 2)#.float()
             codes = vq_model.extract_latent(ssl_content)
             prompt_semantic = codes[0, 0]
         t1 = ttime()
         phones1, word2ph1, norm_text1 = clean_text(prompt_text, prompt_language)
             if(len(phones2) == 1 and phones2[0] == ""):
                 continue
             #phones2, word2ph2, norm_text2 = clean_text(text, text_language)
             phones2 = cleaned_text_to_sequence(phones2)
             #if(prompt_language=="zh"):bert1 = get_bert_feature(norm_text1, word2ph1).to(device)
             bert1 = torch.zeros((1024, len(phones1)),dtype=torch.float16 if is_half==True else torch.float32).to(device)
     # raw文本过g2p得到音素列表，再转成字符串
     # 注意，这里的text是一个段落，可能包含多个句子
     # 段落间\n分割，音素间空格分割
+    print(text)
     texts=text.split("\n")
     phone_list = []
     for text in texts:
+for i, info in tqdm.tqdm(models_info.items()):
     title = info['title']
     cover = info['cover']
     gpt_weight = info['gpt_weight']