PhoenixStormJr committed on
Commit
eef93a1
·
verified ·
1 Parent(s): 9e1b0bc

Upload folder using huggingface_hub

Browse files
infer/infer-pm-index256.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+
3
+ 对源特征进行检索
4
+ """
5
+ import torch, pdb, os, parselmouth
6
+
7
+ os.environ["CUDA_VISIBLE_DEVICES"] = "0"
8
+ import numpy as np
9
+ import soundfile as sf
10
+
11
+ # from models import SynthesizerTrn256#hifigan_nonsf
12
+ # from infer_pack.models import SynthesizerTrn256NSF as SynthesizerTrn256#hifigan_nsf
13
+ from infer_pack.models import (
14
+ SynthesizerTrnMs256NSFsid as SynthesizerTrn256,
15
+ ) # hifigan_nsf
16
+
17
+ # from infer_pack.models import SynthesizerTrnMs256NSFsid_sim as SynthesizerTrn256#hifigan_nsf
18
+ # from models import SynthesizerTrn256NSFsim as SynthesizerTrn256#hifigan_nsf
19
+ # from models import SynthesizerTrn256NSFsimFlow as SynthesizerTrn256#hifigan_nsf
20
+
21
+
22
+ from scipy.io import wavfile
23
+ from fairseq import checkpoint_utils
24
+
25
+ # import pyworld
26
+ import librosa
27
+ import torch.nn.functional as F
28
+ import scipy.signal as signal
29
+
30
+ # import torchcrepe
31
+ from time import time as ttime
32
+
# ---------------------------------------------------------------------------
# Model setup: load the feature extractor checkpoint and the synthesiser.
# ---------------------------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Feature-extractor checkpoint, loaded through fairseq.
model_path = r"E:\codes\py39\vits_vc_gpu_train\hubert_base.pt"
print("load model(s) from {}".format(model_path))
models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
    [model_path],
    suffix="",
)
# Only the first model of the returned ensemble is used.
model = models[0]
model = model.to(device)
# NOTE(review): the model is cast to fp16 even when `device` is the CPU;
# some ops may lack fp16 CPU kernels -- confirm before relying on the CPU path.
model = model.half()
model.eval()

# Earlier synthesiser configurations, kept for reference:
# net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],183,256,is_half=True)#hifigan#512#256
# net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],109,256,is_half=True)#hifigan#512#256
# The positional arguments follow the constructor signature in
# infer_pack.models, which is not visible from this script. The ninth value
# is 0 here where the variants above use 0.1 (the original author tagged
# this configuration "no_dropout").
net_g = SynthesizerTrn256(
    1025,
    32,
    192,
    192,
    768,
    2,
    6,
    3,
    0,
    "1",
    [3, 7, 11],
    [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
    [10, 10, 2, 2],
    512,
    [16, 16, 4, 4],
    183,
    256,
    is_half=True,
)  # hifigan#512#256#no_dropout
# net_g = SynthesizerTrn256(1025,32,192,192,768,2,3,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],0)#ts3
# net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2],512,[16,16,4],0)#hifigan-ps-sr
#
# net_g = SynthesizerTrn(1025, 32, 192, 192, 768, 2, 6, 3, 0.1, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [5,5], 512, [15,15], 0)#ms
# net_g = SynthesizerTrn(1025, 32, 192, 192, 768, 2, 6, 3, 0.1, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10,10], 512, [16,16], 0)#idwt2

# Earlier weight files, kept for reference:
# weights=torch.load("infer/ft-mi_1k-noD.pt")
# weights=torch.load("infer/ft-mi-freeze-vocoder-flow-enc_q_1k.pt")
# weights=torch.load("infer/ft-mi-freeze-vocoder_true_1k.pt")
# weights=torch.load("infer/ft-mi-sim1k.pt")
# map_location="cpu" lets the file be read on machines without the device the
# tensors were saved from; load_state_dict copies the values into net_g's own
# parameters, and net_g is moved to `device` right below.
weights = torch.load("infer/ft-mi-no_opt-no_dropout.pt", map_location="cpu")
# strict=True: every key must match; the result of load_state_dict is printed.
print(net_g.load_state_dict(weights, strict=True))

net_g.eval().to(device)
net_g.half()
82
+
83
+
def get_f0(x, p_len, f0_up_key=0):
    """Extract a pitch track from 16 kHz audio and quantise it to coarse bins.

    Args:
        x: Audio samples; passed to ``parselmouth.Sound`` together with a
            sampling rate of 16000.
        p_len: Desired number of frames. A shorter pitch track is zero-padded
            (padding split between both ends) up to this length; a longer
            track is returned as-is.
        f0_up_key: Transposition in semitones (``f0 *= 2 ** (key / 12)``).

    Returns:
        ``(f0_coarse, f0bak)``: the pitch mapped onto integer mel-scale bins
        in ``[1, 255]``, and a copy of the transposed pitch in Hz taken before
        quantisation.
    """
    # Frame hop expressed in milliseconds: 160 samples at 16 kHz.
    time_step = 160 / 16000 * 1000
    f0_min = 50
    f0_max = 1100
    f0_mel_min = 1127 * np.log(1 + f0_min / 700)
    f0_mel_max = 1127 * np.log(1 + f0_max / 700)

    f0 = (
        parselmouth.Sound(x, 16000)
        .to_pitch_ac(
            time_step=time_step / 1000,  # back to seconds for the API call
            voicing_threshold=0.6,
            pitch_floor=f0_min,
            pitch_ceiling=f0_max,
        )
        .selected_array["frequency"]
    )

    # Zero-pad both ends so the track has p_len frames; when the deficit is
    # odd the extra frame goes to the front.
    pad_size = (p_len - len(f0) + 1) // 2
    if pad_size > 0 or p_len - len(f0) - pad_size > 0:
        f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
    f0 *= pow(2, f0_up_key / 12)
    f0bak = f0.copy()

    # Hz -> mel, then rescale the non-zero frames linearly so that
    # f0_min maps to 1 and f0_max maps to 255.
    f0_mel = 1127 * np.log(1 + f0 / 700)
    f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
        f0_mel_max - f0_mel_min
    ) + 1
    # Clamp: zero-frequency frames and anything below the floor land in bin 1.
    f0_mel[f0_mel <= 1] = 1
    f0_mel[f0_mel > 255] = 255
    # f0_mel[f0_mel > 188] = 188
    # `np.int` was only an alias of the builtin `int` and no longer exists in
    # current NumPy releases; the builtin gives the same dtype.
    f0_coarse = np.rint(f0_mel).astype(int)
    return f0_coarse, f0bak
117
+
118
+
import faiss

# Retrieval assets: a prebuilt faiss index and the feature matrix whose rows
# are looked up with the ids the index returns.
index = faiss.read_index("infer/added_IVF512_Flat_mi_baseline_src_feat.index")
big_npy = np.load("infer/big_src_feature_mi.npy")


def _synced_time():
    """Return the current time, after waiting for pending CUDA work if any."""
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    return ttime()


# Accumulated seconds per stage: feature extraction + retrieval,
# pitch extraction, synthesis.
ta0 = ta1 = ta2 = 0
for name in [
    "冬之花clip1.wav",
]:
    wav_path = "todo-songs/%s" % name
    f0_up_key = -2  # transposition in semitones handed to get_f0
    audio, sampling_rate = sf.read(wav_path)
    if len(audio.shape) > 1:
        # Multi-channel input: transpose so librosa can mix it down to mono.
        audio = librosa.to_mono(audio.transpose(1, 0))
    if sampling_rate != 16000:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)

    feats = torch.from_numpy(audio).float()
    if feats.dim() == 2:  # double channels
        feats = feats.mean(-1)
    assert feats.dim() == 1, feats.dim()
    feats = feats.view(1, -1)
    padding_mask = torch.BoolTensor(feats.shape).fill_(False)
    inputs = {
        "source": feats.half().to(device),
        "padding_mask": padding_mask.to(device),
        "output_layer": 9,  # layer 9
    }
    t0 = _synced_time()
    with torch.no_grad():
        logits = model.extract_features(**inputs)
        feats = model.final_proj(logits[0])

    # Index optimisation: replace every extracted frame with its nearest
    # neighbour from big_npy.
    npy = feats[0].cpu().numpy().astype("float32")
    _, neighbours = index.search(npy, 1)
    # Take column 0 rather than squeeze(): squeeze() would also drop the frame
    # axis when there is exactly one frame and break the permute below.
    # NOTE(review): ids returned by the search are used as-is; confirm the
    # index can never report a missing neighbour for this data.
    feats = (
        torch.from_numpy(big_npy[neighbours[:, 0]].astype("float16"))
        .unsqueeze(0)
        .to(device)
    )

    # Double the number of frames along the time axis.
    feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
    t1 = _synced_time()
    # Cap the length at 10000 frames: anything larger exhausts GPU memory.
    p_len = min(feats.shape[1], 10000)
    pitch, pitchf = get_f0(audio, p_len, f0_up_key)
    # get_f0 may return a different number of frames; use the common minimum.
    p_len = min(feats.shape[1], 10000, pitch.shape[0])
    t2 = _synced_time()
    feats = feats[:, :p_len, :]
    pitch = pitch[:p_len]
    pitchf = pitchf[:p_len]
    p_len = torch.LongTensor([p_len]).to(device)
    pitch = torch.LongTensor(pitch).unsqueeze(0).to(device)
    sid = torch.LongTensor([0]).to(device)
    pitchf = torch.FloatTensor(pitchf).unsqueeze(0).to(device)
    with torch.no_grad():
        audio = (
            net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]
            .data.cpu()
            .float()
            .numpy()
        )  # nsf
    t3 = _synced_time()
    ta0 += t1 - t0
    ta1 += t2 - t1
    ta2 += t3 - t2
    # wavfile.write("ft-mi_1k-index256-noD-%s.wav"%name, 40000, audio)##
    # wavfile.write("ft-mi-freeze-vocoder-flow-enc_q_1k-%s.wav"%name, 40000, audio)##
    # wavfile.write("ft-mi-sim1k-%s.wav"%name, 40000, audio)##
    # The result is written with a sample rate of 40000.
    wavfile.write("ft-mi-no_opt-no_dropout-%s.wav" % name, 40000, audio)


print(ta0, ta1, ta2)
infer/train-index -v2.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
Format: cid is used directly as the built-in index position; aid no longer
fits, so it is looked up through a dictionary -- there are only about 50k of
them anyway.
"""
import faiss
import numpy as np
import os

# ---- Gather the per-file features (raw features must be saved first) ----
inp_root = r"./logs/nene/3_feature768"
npys = []
# Sorted so the concatenation order does not depend on directory order.
for name in sorted(os.listdir(inp_root)):
    phone = np.load("%s/%s" % (inp_root, name))
    npys.append(phone)
big_npy = np.concatenate(npys, 0)
# Shuffle the rows before saving and indexing.
big_npy_idx = np.arange(big_npy.shape[0])
np.random.shuffle(big_npy_idx)
big_npy = big_npy[big_npy_idx]
print(big_npy.shape)  # original author's sample output: (6196072, 192)#fp32#4.43G
np.save("infer/big_src_feature_mi.npy", big_npy)

# ---- train + add ----
# big_npy=np.load("/bili-coeus/jupyter/jupyterhub-liujing04/vits_ch/inference_f0/big_src_feature_mi.npy")
# Number of IVF lists grows with the data size. Clamp to at least 1: with
# fewer than 39 rows the integer division yields 0 and the factory string
# would become "IVF0,Flat".
n_ivf = max(
    1, min(int(16 * np.sqrt(big_npy.shape[0])), big_npy.shape[0] // 39)
)
index = faiss.index_factory(768, "IVF%s,Flat" % n_ivf)  # mi
print("training")
index_ivf = faiss.extract_index_ivf(index)
index_ivf.nprobe = 1
index.train(big_npy)
faiss.write_index(
    index, "infer/trained_IVF%s_Flat_baseline_src_feat_v2.index" % (n_ivf)
)
print("adding")
# Add the vectors in fixed-size batches instead of in one call.
batch_size_add = 8192
for i in range(0, big_npy.shape[0], batch_size_add):
    index.add(big_npy[i : i + batch_size_add])
faiss.write_index(index, "infer/added_IVF%s_Flat_mi_baseline_src_feat.index" % (n_ivf))
# Author's size notes (kept verbatim below). In English:
#   sizes, all FP32: big_src_feature 2.95G with shape (3098036, 256);
#   big_emb 4.43G with shape (6196072, 192);
#   big_emb has twice the rows because the features are repeated before
#   pitch is added.
"""
大小(都是FP32)
big_src_feature 2.95G
(3098036, 256)
big_emb 4.43G
(6196072, 192)
big_emb双倍是因为求特征要repeat后再加pitch

"""
infer/train-index.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
Format: cid is used directly as the built-in index position; aid no longer
fits, so it is looked up through a dictionary -- there are only about 50k of
them anyway.
"""
import faiss
import numpy as np
import os

# ---- Gather the per-file features (raw features must be saved first) ----
inp_root = r"E:\codes\py39\dataset\mi\2-co256"
feature_files = sorted(os.listdir(inp_root))
npys = [np.load("%s/%s" % (inp_root, name)) for name in feature_files]
big_npy = np.concatenate(npys, 0)
print(big_npy.shape)  # original author's sample output: (6196072, 192)#fp32#4.43G
np.save("infer/big_src_feature_mi.npy", big_npy)

# ---- train + add ----
# big_npy=np.load("/bili-coeus/jupyter/jupyterhub-liujing04/vits_ch/inference_f0/big_src_feature_mi.npy")
print(big_npy.shape)
# 256-dimensional vectors, IVF with 512 lists and flat storage.
index = faiss.index_factory(256, "IVF512,Flat")  # mi
print("training")
index_ivf = faiss.extract_index_ivf(index)
index_ivf.nprobe = 9
index.train(big_npy)
# Save the trained (still empty) index.
faiss.write_index(index, "infer/trained_IVF512_Flat_mi_baseline_src_feat.index")
print("adding")
index.add(big_npy)
# Save the index again, now with the vectors added.
faiss.write_index(index, "infer/added_IVF512_Flat_mi_baseline_src_feat.index")
# Author's size notes (kept verbatim below). In English:
#   sizes, all FP32: big_src_feature 2.95G with shape (3098036, 256);
#   big_emb 4.43G with shape (6196072, 192);
#   big_emb has twice the rows because the features are repeated before
#   pitch is added.
"""
大小(都是FP32)
big_src_feature 2.95G
(3098036, 256)
big_emb 4.43G
(6196072, 192)
big_emb双倍是因为求特征要repeat后再加pitch

"""
infer/trans_weights.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Convert a training checkpoint's "model" state dict to fp16 and save it."""
import torch
import pdb

# Earlier checkpoints converted with this script, kept for reference:
# a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-suc\G_1000.pth")["model"]#sim_nsf#
# a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-freeze-vocoder-flow-enc_q\G_1000.pth")["model"]#sim_nsf#
# a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-freeze-vocoder\G_1000.pth")["model"]#sim_nsf#
# a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-test\G_1000.pth")["model"]#sim_nsf#
# map_location="cpu": the conversion then does not need the device the
# checkpoint was saved from, and the exported file holds CPU tensors.
a = torch.load(
    r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-no_opt-no_dropout\G_1000.pth",
    map_location="cpu",
)[
    "model"
]  # sim_nsf#
# Cast every entry to half precision in place (keys are unchanged).
# NOTE(review): this also casts any non-floating-point entries -- confirm
# the state dict has none that must keep their dtype.
for key in a.keys():
    a[key] = a[key].half()
# torch.save(a,"ft-mi-freeze-vocoder_true_1k.pt")#
# torch.save(a,"ft-mi-sim1k.pt")#
torch.save(a, "ft-mi-no_opt-no_dropout.pt")