maxmax20160403 committed on
Commit 755994c
1 Parent(s): c24b656
app.py CHANGED
@@ -1,3 +1,5 @@
+import sys,os
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from vits.models import SynthesizerInfer
 from omegaconf import OmegaConf
 import torchcrepe
@@ -72,7 +74,7 @@ model = SynthesizerInfer(
     hp.data.filter_length // 2 + 1,
     hp.data.segment_size // hp.data.hop_length,
     hp)
-load_svc_model("vits_pretrain/sovits5.0_bigvgan.pth", model)
+load_svc_model("vits_pretrain/sovits5.0_bigvgan_mix.pth", model)
 model.eval()
 model.to(device)
 
@@ -81,6 +83,8 @@ def svc_change(argswave, argsspk):
 
     argsppg = "svc_tmp.ppg.npy"
     os.system(f"python whisper/inference.py -w {argswave} -p {argsppg}")
+    argsvec = "svc_tmp.vec.npy"
+    os.system(f"python hubert/inference.py -w {argswave} -v {argsvec}")
 
     spk = np.load(argsspk)
     spk = torch.FloatTensor(spk)
@@ -89,13 +93,20 @@ def svc_change(argswave, argsspk):
     ppg = np.repeat(ppg, 2, 0)  # 320 PPG -> 160 * 2
    ppg = torch.FloatTensor(ppg)
 
+    vec = np.load(argsvec)
+    vec = np.repeat(vec, 2, 0)  # 320 PPG -> 160 * 2
+    vec = torch.FloatTensor(vec)
+
     pit = compute_f0_nn(argswave, device)
     pit = torch.FloatTensor(pit)
 
     len_pit = pit.size()[0]
+    len_vec = vec.size()[0]
     len_ppg = ppg.size()[0]
-    len_min = min(len_pit, len_ppg)
+    len_min = min(len_pit, len_vec)
+    len_min = min(len_min, len_ppg)
     pit = pit[:len_min]
+    vec = vec[:len_min, :]
     ppg = ppg[:len_min, :]
 
     with torch.no_grad():
@@ -129,11 +140,12 @@ def svc_change(argswave, argsspk):
         cut_e_out = -1 * hop_frame * hop_size
 
         sub_ppg = ppg[cut_s:cut_e, :].unsqueeze(0).to(device)
+        sub_vec = vec[cut_s:cut_e, :].unsqueeze(0).to(device)
         sub_pit = pit[cut_s:cut_e].unsqueeze(0).to(device)
         sub_len = torch.LongTensor([cut_e - cut_s]).to(device)
         sub_har = source[:, :, cut_s *
                          hop_size:cut_e * hop_size].to(device)
-        sub_out = model.inference(sub_ppg, sub_pit, spk, sub_len, sub_har)
+        sub_out = model.inference(sub_ppg, sub_vec, sub_pit, spk, sub_len, sub_har)
         sub_out = sub_out[0, 0].data.cpu().detach().numpy()
 
         sub_out = sub_out[cut_s_out:cut_e_out]
@@ -148,10 +160,11 @@ def svc_change(argswave, argsspk):
            cut_s = 0
            cut_s_out = 0
        sub_ppg = ppg[cut_s:, :].unsqueeze(0).to(device)
+        sub_vec = vec[cut_s:, :].unsqueeze(0).to(device)
        sub_pit = pit[cut_s:].unsqueeze(0).to(device)
        sub_len = torch.LongTensor([all_frame - cut_s]).to(device)
        sub_har = source[:, :, cut_s * hop_size:].to(device)
-        sub_out = model.inference(sub_ppg, sub_pit, spk, sub_len, sub_har)
+        sub_out = model.inference(sub_ppg, sub_vec, sub_pit, spk, sub_len, sub_har)
        sub_out = sub_out[0, 0].data.cpu().detach().numpy()
 
        sub_out = sub_out[cut_s_out:]
@@ -187,6 +200,8 @@ with app:
 
    https://github.com/Multi-Singer/Multi-Singer.github.io
 
+    mix_encoder: whisper + hubert, 提升跨语言能力和纯对白语音训练的效果
+
    [轻度伴奏可以无需去伴奏]就能直接进行歌声转换的SVC库
    """)
    sid = gr.Dropdown(label="音色", choices=[
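In short, svc_change now runs two content extractors per input wave — whisper/inference.py for the PPG and the new hubert/inference.py for a HuBERT vector — trims PPG, vec and F0 to a common frame count, and passes both features to model.inference. The added Markdown line reads, in English: "mix_encoder: whisper + hubert, improves cross-lingual ability and results when training on plain spoken (dialogue) data"; the surrounding note says the SVC library converts singing directly even over light accompaniment. A minimal sketch of the alignment step, not part of the commit (align_features is an illustrative name):

# Illustrative sketch only: align whisper PPG, hubert vec and F0 to one frame
# length before chunked inference, as app.py now does.
import numpy as np
import torch

def align_features(ppg: np.ndarray, vec: np.ndarray, pit: np.ndarray):
    ppg = torch.FloatTensor(np.repeat(ppg, 2, 0))  # 320 hop -> 160 hop, as in the diff
    vec = torch.FloatTensor(np.repeat(vec, 2, 0))
    pit = torch.FloatTensor(pit)
    len_min = min(pit.size(0), vec.size(0), ppg.size(0))
    return ppg[:len_min, :], vec[:len_min, :], pit[:len_min]

The aligned tensors are then chunked exactly as before, with sub_vec added alongside sub_ppg in every model.inference call.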
configs/base.yaml CHANGED
@@ -28,11 +28,12 @@ data:
 #############################
 vits:
   ppg_dim: 1024
+  vec_dim: 256
   spk_dim: 256
   gin_channels: 256
   inter_channels: 192
   hidden_channels: 192
-  filter_channels: 512
+  filter_channels: 640
 #############################
 gen:
   upsample_input: 192
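The new vec_dim: 256 is presumably the dimensionality of the features written by hubert/inference.py, and filter_channels grows from 512 to 640 in the same block. A hedged sketch of how the config reaches the model, mirroring the constructor call already visible in app.py (the assert merely illustrates the two changed keys):

# Sketch: load the updated config and build the inference model as app.py does.
from omegaconf import OmegaConf
from vits.models import SynthesizerInfer

hp = OmegaConf.load("configs/base.yaml")
assert hp.vits.vec_dim == 256 and hp.vits.filter_channels == 640  # illustrative check
model = SynthesizerInfer(
    hp.data.filter_length // 2 + 1,
    hp.data.segment_size // hp.data.hop_length,
    hp)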
vits/data_utils.py CHANGED
@@ -29,13 +29,15 @@ class TextAudioSpeakerSet(torch.utils.data.Dataset):
         items_new = []
         items_min = int(self.segment_size / self.hop_length * 4)  # 1 S
         items_max = int(self.segment_size / self.hop_length * 16)  # 4 S
-        for wavpath, spec, pitch, ppg, spk in self.items:
+        for wavpath, spec, pitch, vec, ppg, spk in self.items:
             if not os.path.isfile(wavpath):
                 continue
             if not os.path.isfile(spec):
                 continue
             if not os.path.isfile(pitch):
                 continue
+            if not os.path.isfile(vec):
+                continue
             if not os.path.isfile(ppg):
                 continue
             if not os.path.isfile(spk):
@@ -46,7 +48,7 @@ class TextAudioSpeakerSet(torch.utils.data.Dataset):
                 continue
             if (usel >= items_max):
                 usel = items_max
-            items_new.append([wavpath, spec, pitch, ppg, spk, usel])
+            items_new.append([wavpath, spec, pitch, vec, ppg, spk, usel])
             lengths.append(usel)
         self.items = items_new
         self.lengths = lengths
@@ -70,28 +72,35 @@ class TextAudioSpeakerSet(torch.utils.data.Dataset):
         wav = item[0]
         spe = item[1]
         pit = item[2]
-        ppg = item[3]
-        spk = item[4]
-        use = item[5]
+        vec = item[3]
+        ppg = item[4]
+        spk = item[5]
+        use = item[6]
 
         wav = self.read_wav(wav)
         spe = torch.load(spe)
 
         pit = np.load(pit)
+        vec = np.load(vec)
+        vec = np.repeat(vec, 2, 0)  # 320 PPG -> 160 * 2
         ppg = np.load(ppg)
         ppg = np.repeat(ppg, 2, 0)  # 320 PPG -> 160 * 2
         spk = np.load(spk)
 
         pit = torch.FloatTensor(pit)
+        vec = torch.FloatTensor(vec)
         ppg = torch.FloatTensor(ppg)
         spk = torch.FloatTensor(spk)
 
         len_pit = pit.size()[0]
+        len_vec = vec.size()[0] - 2  # for safe
         len_ppg = ppg.size()[0] - 2  # for safe
-        len_min = min(len_pit, len_ppg)
+        len_min = min(len_pit, len_vec)
+        len_min = min(len_min, len_ppg)
         len_wav = len_min * self.hop_length
 
         pit = pit[:len_min]
+        vec = vec[:len_min, :]
         ppg = ppg[:len_min, :]
         spe = spe[:, :len_min]
         wav = wav[:, :len_wav]
@@ -101,6 +110,7 @@ class TextAudioSpeakerSet(torch.utils.data.Dataset):
         frame_end = frame_start + use
 
         pit = pit[frame_start:frame_end]
+        vec = vec[frame_start:frame_end, :]
         ppg = ppg[frame_start:frame_end, :]
         spe = spe[:, frame_start:frame_end]
 
@@ -112,7 +122,7 @@ class TextAudioSpeakerSet(torch.utils.data.Dataset):
         # print(ppg.shape)
         # print(pit.shape)
         # print(spk.shape)
-        return spe, wav, ppg, pit, spk
+        return spe, wav, ppg, vec, pit, spk
 
 
 class TextAudioSpeakerCollate:
@@ -143,10 +153,13 @@ class TextAudioSpeakerCollate:
         ppg_lengths = torch.FloatTensor(len(batch))
         ppg_padded = torch.FloatTensor(
             len(batch), max_ppg_len, batch[0][2].size(1))
+        vec_padded = torch.FloatTensor(
+            len(batch), max_ppg_len, batch[0][3].size(1))
         pit_padded = torch.FloatTensor(len(batch), max_ppg_len)
         ppg_padded.zero_()
+        vec_padded.zero_()
         pit_padded.zero_()
-        spk = torch.FloatTensor(len(batch), batch[0][4].size(0))
+        spk = torch.FloatTensor(len(batch), batch[0][5].size(0))
 
         for i in range(len(ids_sorted_decreasing)):
             row = batch[ids_sorted_decreasing[i]]
@@ -163,10 +176,13 @@ class TextAudioSpeakerCollate:
             ppg_padded[i, : ppg.size(0), :] = ppg
             ppg_lengths[i] = ppg.size(0)
 
-            pit = row[3]
+            vec = row[3]
+            vec_padded[i, : vec.size(0), :] = vec
+
+            pit = row[4]
             pit_padded[i, : pit.size(0)] = pit
 
-            spk[i] = row[4]
+            spk[i] = row[5]
         # print(ppg_padded.shape)
         # print(ppg_lengths.shape)
         # print(pit_padded.shape)
@@ -178,6 +194,7 @@ class TextAudioSpeakerCollate:
         return (
             ppg_padded,
             ppg_lengths,
+            vec_padded,
             pit_padded,
             spk,
             spe_padded,
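After this change each dataset item is (spe, wav, ppg, vec, pit, spk) and the collate function returns vec_padded between ppg_lengths and pit_padded. A sketch of consuming the new batch layout; the TextAudioSpeakerSet constructor arguments and DataLoader settings are assumptions, only the unpacking order comes from the return tuple above:

# Sketch only: dataset/loader construction is hypothetical; the unpacking order
# follows TextAudioSpeakerCollate's return value, with vec_padded in position 2.
from torch.utils.data import DataLoader
from omegaconf import OmegaConf
from vits.data_utils import TextAudioSpeakerSet, TextAudioSpeakerCollate

hp = OmegaConf.load("configs/base.yaml")
dataset = TextAudioSpeakerSet("files/train.txt", hp.data)  # hypothetical filelist path
loader = DataLoader(dataset, batch_size=8, collate_fn=TextAudioSpeakerCollate())

for ppg, ppg_lengths, vec, pit, spk, spec, *rest in loader:
    # ppg: [B, T, 1024], vec: [B, T, 256] (new), pit: [B, T], spk: [B, 256]
    break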
vits/models.py CHANGED
@@ -14,6 +14,7 @@ from vits.modules_grl import SpeakerClassifier
 class TextEncoder(nn.Module):
     def __init__(self,
                  in_channels,
+                 vec_channels,
                  out_channels,
                  hidden_channels,
                  filter_channels,
@@ -24,6 +25,7 @@ class TextEncoder(nn.Module):
         super().__init__()
         self.out_channels = out_channels
         self.pre = nn.Conv1d(in_channels, hidden_channels, kernel_size=5, padding=2)
+        self.hub = nn.Conv1d(vec_channels, hidden_channels, kernel_size=5, padding=2)
         self.pit = nn.Embedding(256, hidden_channels)
         self.enc = attentions.Encoder(
             hidden_channels,
@@ -34,13 +36,15 @@ class TextEncoder(nn.Module):
             p_dropout)
         self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
 
-    def forward(self, x, x_lengths, f0):
+    def forward(self, x, x_lengths, v, f0):
         x = torch.transpose(x, 1, -1)  # [b, h, t]
         x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
             x.dtype
         )
         x = self.pre(x) * x_mask
-        x = x + self.pit(f0).transpose(1, 2)
+        v = torch.transpose(v, 1, -1)  # [b, h, t]
+        v = self.hub(v) * x_mask
+        x = x + v + self.pit(f0).transpose(1, 2)
         x = self.enc(x * x_mask, x_mask)
         stats = self.proj(x) * x_mask
         m, logs = torch.split(stats, self.out_channels, dim=1)
@@ -144,6 +148,7 @@ class SynthesizerTrn(nn.Module):
         self.emb_g = nn.Linear(hp.vits.spk_dim, hp.vits.gin_channels)
         self.enc_p = TextEncoder(
             hp.vits.ppg_dim,
+            hp.vits.vec_dim,
             hp.vits.inter_channels,
             hp.vits.hidden_channels,
             hp.vits.filter_channels,
@@ -175,11 +180,12 @@ class SynthesizerTrn(nn.Module):
         )
         self.dec = Generator(hp=hp)
 
-    def forward(self, ppg, pit, spec, spk, ppg_l, spec_l):
-        ppg = ppg + torch.randn_like(ppg)  # Perturbation
+    def forward(self, ppg, vec, pit, spec, spk, ppg_l, spec_l):
+        ppg = ppg + torch.randn_like(ppg) * 1  # Perturbation
+        vec = vec + torch.randn_like(vec) * 2  # Perturbation
         g = self.emb_g(F.normalize(spk)).unsqueeze(-1)
         z_p, m_p, logs_p, ppg_mask, x = self.enc_p(
-            ppg, ppg_l, f0=f0_to_coarse(pit))
+            ppg, ppg_l, vec, f0=f0_to_coarse(pit))
         z_q, m_q, logs_q, spec_mask = self.enc_q(spec, spec_l, g=g)
 
         z_slice, pit_slice, ids_slice = commons.rand_slice_segments_with_pitch(
@@ -193,10 +199,10 @@ class SynthesizerTrn(nn.Module):
         spk_preds = self.speaker_classifier(x)
         return audio, ids_slice, spec_mask, (z_f, z_r, z_p, m_p, logs_p, z_q, m_q, logs_q, logdet_f, logdet_r), spk_preds
 
-    def infer(self, ppg, pit, spk, ppg_l):
+    def infer(self, ppg, vec, pit, spk, ppg_l):
         ppg = ppg + torch.randn_like(ppg) * 0.0001  # Perturbation
         z_p, m_p, logs_p, ppg_mask, x = self.enc_p(
-            ppg, ppg_l, f0=f0_to_coarse(pit))
+            ppg, ppg_l, vec, f0=f0_to_coarse(pit))
         z, _ = self.flow(z_p, ppg_mask, g=spk, reverse=True)
         o = self.dec(spk, z * ppg_mask, f0=pit)
         return o
@@ -213,6 +219,7 @@ class SynthesizerInfer(nn.Module):
         self.segment_size = segment_size
         self.enc_p = TextEncoder(
             hp.vits.ppg_dim,
+            hp.vits.vec_dim,
             hp.vits.inter_channels,
             hp.vits.hidden_channels,
             hp.vits.filter_channels,
@@ -241,9 +248,9 @@ class SynthesizerInfer(nn.Module):
     def source2wav(self, source):
         return self.dec.source2wav(source)
 
-    def inference(self, ppg, pit, spk, ppg_l, source):
+    def inference(self, ppg, vec, pit, spk, ppg_l, source):
         z_p, m_p, logs_p, ppg_mask, x = self.enc_p(
-            ppg, ppg_l, f0=f0_to_coarse(pit))
+            ppg, ppg_l, vec, f0=f0_to_coarse(pit))
         z, _ = self.flow(z_p, ppg_mask, g=spk, reverse=True)
         o = self.dec.inference(spk, z * ppg_mask, source)
         return o
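The architectural core of the commit is TextEncoder.forward: the HuBERT vector v gets its own Conv1d projection (self.hub) and is summed with the projected PPG and the coarse-F0 embedding before the attention encoder. A standalone, simplified sketch of that fusion, not the repo's class (dimensions follow the updated config):

# Standalone sketch of the whisper+hubert fusion in TextEncoder (simplified, hypothetical).
import torch
import torch.nn as nn

class MixFusion(nn.Module):
    def __init__(self, ppg_dim=1024, vec_dim=256, hidden=192):
        super().__init__()
        self.pre = nn.Conv1d(ppg_dim, hidden, kernel_size=5, padding=2)  # whisper PPG branch
        self.hub = nn.Conv1d(vec_dim, hidden, kernel_size=5, padding=2)  # hubert vec branch
        self.pit = nn.Embedding(256, hidden)                             # coarse F0 branch

    def forward(self, ppg, vec, f0_coarse):
        # ppg: [B, T, ppg_dim], vec: [B, T, vec_dim], f0_coarse: [B, T] int64 in [0, 255]
        x = self.pre(ppg.transpose(1, 2))   # [B, hidden, T]
        v = self.hub(vec.transpose(1, 2))   # [B, hidden, T]
        return x + v + self.pit(f0_coarse).transpose(1, 2)

fusion = MixFusion()
out = fusion(torch.randn(1, 100, 1024), torch.randn(1, 100, 256),
             torch.randint(0, 256, (1, 100)))
print(out.shape)  # torch.Size([1, 192, 100])

The heavier training-time perturbation on vec (* 2 versus * 1 for ppg) suggests the HuBERT branch is deliberately noised more, presumably so the whisper PPG remains the primary content source.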
vits_pretrain/{sovits5.0_bigvgan.pth → sovits5.0_bigvgan_mix.pth} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ffed3845044b8bef076d72272da19791e1344ad3b750a02d6e4980acf6cb0a0b
-size 74825605
+oid sha256:6b941958b20d2eb91abdb6ff9d1344e056ec2c78116e4c3a1e2b23b022d32db1
+size 79352005
whisper/inference.py CHANGED
@@ -1,4 +1,5 @@
-import os
+import sys,os
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 import numpy as np
 import argparse
 import torch
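Both app.py and whisper/inference.py now prepend the repository root to sys.path so the scripts resolve package imports when launched as separate processes, which is how svc_change invokes them. A sketch of the two extraction calls as issued from app.py (hubert/inference.py itself is not shown in this diff; its -v flag is taken from the app.py hunk above):

# Sketch: generate both content features for one wave file, as app.py's svc_change does.
import os

argswave = "test.wav"          # hypothetical input file
argsppg = "svc_tmp.ppg.npy"    # whisper PPG output
argsvec = "svc_tmp.vec.npy"    # hubert vec output
os.system(f"python whisper/inference.py -w {argswave} -p {argsppg}")
os.system(f"python hubert/inference.py -w {argswave} -v {argsvec}")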