Adorable-Qin committed on
Commit
2d4dca5
1 Parent(s): 7c4481d

Change F0 extractor to Crepe

Browse files
ckpts/svc/vocalist_l1_contentvec+whisper/args.json CHANGED
@@ -140,7 +140,7 @@
140
  "pin_memory": true,
141
  "pitch_bin": 256,
142
  "pitch_dir": "pitches",
143
- "pitch_extractor": "parselmouth",
144
  "pitch_max": 1100.0,
145
  "pitch_min": 50.0,
146
  "processed_dir": "ckpts/svc/vocalist_l1_contentvec+whisper/data",
 
140
  "pin_memory": true,
141
  "pitch_bin": 256,
142
  "pitch_dir": "pitches",
143
+ "pitch_extractor": "crepe",
144
  "pitch_max": 1100.0,
145
  "pitch_min": 50.0,
146
  "processed_dir": "ckpts/svc/vocalist_l1_contentvec+whisper/data",
utils/f0.py CHANGED
@@ -207,7 +207,7 @@ def get_f0_features_using_harvest(audio, mel_len, fs, hop_length, f0_min, f0_max
207
  return f0
208
 
209
 
210
- def get_f0_features_using_crepe(
211
  audio, mel_len, fs, hop_length, hop_length_new, f0_min, f0_max, threshold=0.3
212
  ):
213
  """Using torchcrepe to extract the f0 feature.
@@ -259,6 +259,25 @@ def get_f0_features_using_crepe(
259
  f0 = np.interp(time_frame, time_org, f0, left=f0[0], right=f0[-1])
260
  return f0
261
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
262
 
263
  def get_f0(audio, cfg):
264
  if cfg.pitch_extractor == "dio":
@@ -267,6 +286,8 @@ def get_f0(audio, cfg):
267
  f0 = get_f0_features_using_pyin(audio, cfg)
268
  elif cfg.pitch_extractor == "parselmouth":
269
  f0, _ = get_f0_features_using_parselmouth(audio, cfg)
 
 
270
  # elif cfg.data.f0_extractor == 'cwt': # todo
271
 
272
  return f0
 
207
  return f0
208
 
209
 
210
+ def get_f0_features_using_crepe_legacy(
211
  audio, mel_len, fs, hop_length, hop_length_new, f0_min, f0_max, threshold=0.3
212
  ):
213
  """Using torchcrepe to extract the f0 feature.
 
259
  f0 = np.interp(time_frame, time_org, f0, left=f0[0], right=f0[-1])
260
  return f0
261
 
262
+ def get_f0_features_using_crepe(audio, cfg):
263
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
264
+ audio_torch = torch.FloatTensor(audio).unsqueeze(0).to(device)
265
+
266
+ crepe_pitch, pd = torchcrepe.predict(audio_torch, cfg.sample_rate, cfg.hop_size, fmin=cfg.f0_min, fmax=cfg.f0_max, return_periodicity=True)
267
+
268
+ threshold = 0.3
269
+
270
+ # Filter, de-silence, set up threshold for unvoiced part
271
+ pd = torchcrepe.filter.median(pd, 3)
272
+ pd = torchcrepe.threshold.Silence(-60.0)(pd, audio_torch, cfg.sample_rate, 256)
273
+ crepe_pitch = torchcrepe.threshold.At(threshold)(crepe_pitch, pd)
274
+ crepe_pitch = torchcrepe.filter.mean(crepe_pitch, 3)
275
+
276
+ # Convert unvoiced part to 0hz
277
+ crepe_pitch = torch.where(torch.isnan(crepe_pitch), torch.full_like(crepe_pitch, 0), crepe_pitch)
278
+
279
+ return crepe_pitch[0].cpu().numpy()
280
+
281
 
282
  def get_f0(audio, cfg):
283
  if cfg.pitch_extractor == "dio":
 
286
  f0 = get_f0_features_using_pyin(audio, cfg)
287
  elif cfg.pitch_extractor == "parselmouth":
288
  f0, _ = get_f0_features_using_parselmouth(audio, cfg)
289
+ elif cfg.pitch_extractor == "crepe":
290
+ f0 = get_f0_features_using_crepe(audio, cfg)
291
  # elif cfg.data.f0_extractor == 'cwt': # todo
292
 
293
  return f0