| |
| |
| |
| |
|
|
| import librosa |
| import numpy as np |
| import torch |
| import parselmouth |
| import torchcrepe |
| import pyworld as pw |
|
|
|
|
def get_bin_index(f0, m, M, n_bins, use_log_scale):
    """Quantize F0 values into uniformly spaced bin indices.

    WARNING: to abandon!

    The caller's tensor is left untouched; the log transform is applied to a
    separate tensor.

    Args:
        f0: tensor whose shape is (N, frame_len); a value of 0 is mapped to
            bin 0
        m: lower bound of the F0 range (same unit as f0)
        M: upper bound of the F0 range (same unit as f0)
        n_bins: total number of bins, bin 0 included
        use_log_scale: if True, bins are uniform in log(f0) instead of f0
    Returns:
        index: long tensor whose shape is the same as f0
    """
    raw_f0 = f0
    raw_m, raw_M = m, M

    if use_log_scale:
        # Replace zeros by 1 so that log() yields 0 instead of -inf. This is
        # done out of place so the caller's tensor is never modified.
        f0 = torch.log(torch.where(f0 == 0, torch.ones_like(f0), f0))
        m, M = float(np.log(m)), float(np.log(M))

    # n_bins - 1 bins cover [m, M]; the epsilon keeps f0 == M inside the last one.
    width = (M + 1e-7 - m) / (n_bins - 1)
    index = (f0 - m) // width + 1

    # Values that are 0 after the (optional) log transform go to bin 0.
    index[f0 == 0] = 0

    # Clamp out-of-range values and report them.
    too_high = raw_f0 > raw_M
    if torch.any(too_high):
        print("F0 Warning: too high f0: {}".format(raw_f0[too_high]))
        index[too_high] = n_bins - 1
    if torch.any(raw_f0 < raw_m):
        too_low = f0 < m
        print("F0 Warning: too low f0: {}".format(raw_f0[too_low]))
        index[too_low] = 0

    return torch.as_tensor(index, dtype=torch.long, device=f0.device)
|
|
|
|
def f0_to_coarse(f0, pitch_bin, pitch_min, pitch_max):
    """Quantize F0 into integer bins that are uniform on the mel scale.

    Frames whose mel value is not positive (e.g. f0 == 0) end up in bin 1;
    all other frames are mapped to [1, pitch_bin - 1], with out-of-range
    values clipped to the nearest end.

    Args:
        f0: numpy array or torch tensor of F0 values
        pitch_bin: number of bins; the largest returned index is pitch_bin - 1
        pitch_min: F0 that maps to bin 1
        pitch_max: F0 that maps to bin pitch_bin - 1
    Returns:
        f0_coarse: bin indices with the same shape as f0 (torch long tensor
            for tensor input, int32 numpy array otherwise)
    """
    # Plain Python floats so the arithmetic below behaves the same for numpy
    # arrays and torch tensors.
    f0_mel_min = float(1127 * np.log(1 + pitch_min / 700))
    f0_mel_max = float(1127 * np.log(1 + pitch_max / 700))

    is_torch = isinstance(f0, torch.Tensor)
    f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700)

    # Rescale only the frames with a positive mel value to [1, pitch_bin - 1].
    positive = f0_mel > 0
    f0_mel[positive] = (f0_mel[positive] - f0_mel_min) * (pitch_bin - 2) / (
        f0_mel_max - f0_mel_min
    ) + 1

    # Clip to the valid bin range.
    f0_mel[f0_mel <= 1] = 1
    f0_mel[f0_mel > pitch_bin - 1] = pitch_bin - 1
    f0_coarse = (f0_mel + 0.5).long() if is_torch else np.rint(f0_mel).astype(np.int32)

    # Sanity check against the requested bin count (previously hard-coded to
    # 255, which rejected any pitch_bin > 256). Skipped for empty input, where
    # max()/min() are undefined.
    n_frames = f0_coarse.numel() if is_torch else f0_coarse.size
    if n_frames > 0:
        assert f0_coarse.max() <= pitch_bin - 1 and f0_coarse.min() >= 1, (
            f0_coarse.max(),
            f0_coarse.min(),
        )
    return f0_coarse
|
|
|
|
def interpolate(f0):
    """Interpolate the unvoiced part. Thus the f0 can be passed to a subtractive synthesizer.

    The input array is not modified; a copy is interpolated and returned.

    Args:
        f0: A numpy array of shape (seq_len,); 0 marks an unvoiced frame
    Returns:
        f0: Interpolated f0 of shape (seq_len,)
        uv: Float unvoiced mask of shape (seq_len,). When at least one frame
            is voiced, a frame is flagged (1.0) only if it and both of its
            neighbours are unvoiced, and the first and last frames are always
            0. When no frame is voiced, every frame is flagged.
    """
    f0 = np.array(f0)  # private copy: never write into the caller's array
    unvoiced = f0 == 0
    uv = unvoiced.astype("float")

    # Nothing to interpolate from (all frames unvoiced, or empty input).
    if len(f0[~unvoiced]) == 0:
        return f0, uv

    # Fill the unvoiced frames by linear interpolation between voiced ones.
    f0[unvoiced] = np.interp(
        np.where(unvoiced)[0], np.where(~unvoiced)[0], f0[~unvoiced]
    )

    # Shrink each unvoiced run by one frame on both sides. Building the
    # result at full length keeps the shape right for very short inputs.
    eroded = np.zeros_like(uv)
    if uv.size > 2:
        eroded[1:-1] = np.min(np.array([uv[:-2], uv[1:-1], uv[2:]]), axis=0)
    return f0, eroded
|
|
|
|
def get_log_f0(f0):
    """Return the natural log of f0, mapping zero-valued frames to 0.

    Zeros are replaced by 1 before taking the log, so they come out as
    log(1) = 0 instead of -inf. The input array is not modified.

    Args:
        f0: numpy array of F0 values
    Returns:
        log_f0: numpy array with the same shape as f0
    """
    safe_f0 = np.array(f0)  # private copy: never write into the caller's array
    safe_f0[safe_f0 == 0] = 1
    return np.log(safe_f0)
|
|
|
|
| |
|
|
|
|
def get_f0_features_using_pyin(audio, cfg):
    """Using pyin to extract the f0 feature.

    Args:
        audio: waveform passed to librosa.pyin as ``y``
        cfg: config object providing f0_min, f0_max, sample_rate, win_size
            and hop_size
    Returns:
        f0: array returned by librosa.pyin, with every frame that pyin does
            not flag as voiced set to 0
    """
    pitch, is_voiced, _ = librosa.pyin(
        y=audio,
        sr=cfg.sample_rate,
        fmin=cfg.f0_min,
        fmax=cfg.f0_max,
        win_length=cfg.win_size,
        hop_length=cfg.hop_size,
    )

    # Zero out the frames that are not flagged as voiced.
    not_voiced = np.logical_not(is_voiced)
    pitch[not_voiced] = 0
    return pitch
|
|
|
|
def get_f0_features_using_parselmouth(audio, cfg, speed=1):
    """Using parselmouth to extract the f0 feature.

    Args:
        audio: waveform passed to parselmouth.Sound
        cfg: config object providing hop_size, sample_rate, f0_min, f0_max
            and pitch_bin
        speed(default=1): factor applied to cfg.hop_size before the analysis
            time step is derived
    Returns:
        f0: the "frequency" field of the pitch object's selected_array
        pitch_coarse: f0 quantized by f0_to_coarse into cfg.pitch_bin bins
    """
    scaled_hop = int(np.round(cfg.hop_size * speed))

    # Hop duration in milliseconds; converted back to seconds for time_step.
    step_ms = scaled_hop / cfg.sample_rate * 1000

    sound = parselmouth.Sound(audio, cfg.sample_rate)
    pitch = sound.to_pitch_ac(
        time_step=step_ms / 1000,
        voicing_threshold=0.6,
        pitch_floor=cfg.f0_min,
        pitch_ceiling=cfg.f0_max,
    )
    f0 = pitch.selected_array["frequency"]

    pitch_coarse = f0_to_coarse(f0, cfg.pitch_bin, cfg.f0_min, cfg.f0_max)
    return f0, pitch_coarse
|
|
|
|
def get_f0_features_using_dio(audio, cfg):
    """Using dio to extract the f0 feature.

    Args:
        audio: numpy waveform; converted to double before analysis
        cfg: config object providing sample_rate, hop_size, f0_min and f0_max
    Returns:
        f0: output of pyworld's stonemask applied to the dio estimate
    """
    signal = audio.astype("double")
    frame_period_ms = 1000 * cfg.hop_size / cfg.sample_rate

    # First pass: dio estimate and the frame time axis.
    coarse_f0, time_axis = pw.dio(
        signal,
        cfg.sample_rate,
        f0_floor=cfg.f0_min,
        f0_ceil=cfg.f0_max,
        channels_in_octave=2,
        frame_period=frame_period_ms,
    )

    # Second pass: stonemask on the dio estimate.
    return pw.stonemask(signal, coarse_f0, time_axis, cfg.sample_rate)
|
|
|
|
def get_f0_features_using_harvest(audio, mel_len, fs, hop_length, f0_min, f0_max):
    """Using harvest to extract the f0 feature.

    Args:
        audio: numpy waveform; converted to double before analysis
        mel_len: maximum number of frames to keep
        fs: sampling rate of audio
        hop_length: hop size in samples; the frame period passed to harvest
            is 1000 * hop_length / fs
        f0_min: value passed as f0_floor
        f0_max: value passed as f0_ceil
    Returns:
        f0: float numpy array truncated to at most mel_len frames
    """
    frame_period_ms = 1000 * hop_length / fs
    contour, _time_axis = pw.harvest(
        audio.astype("double"),
        fs,
        f0_floor=f0_min,
        f0_ceil=f0_max,
        frame_period=frame_period_ms,
    )
    # Keep at most mel_len frames.
    return contour.astype("float")[:mel_len]
|
|
|
|
def get_f0_features_using_crepe(
    audio, mel_len, fs, hop_length, hop_length_new, f0_min, f0_max, threshold=0.3
):
    """Using torchcrepe to extract the f0 feature.

    The audio is resampled to 16 kHz, analysed with torchcrepe, and the
    frames that survive the periodicity/silence thresholds are linearly
    interpolated onto the target frame grid (mel_len frames spaced
    hop_length / fs seconds apart).

    Args:
        audio: numpy waveform sampled at fs
        mel_len: number of output frames
        fs: sampling rate of audio
        hop_length: hop size (in samples at fs) of the output frame grid
        hop_length_new: hop size (in samples at 16 kHz) used by torchcrepe
        f0_min: lowest F0 passed to torchcrepe
        f0_max: highest F0 passed to torchcrepe
        threshold(default=0.3): periodicity threshold passed to
            torchcrepe.threshold.At
    Returns:
        f0: numpy array of shape (mel_len,); all zeros when torchcrepe finds
            no voiced frame
    """
    crepe_sr = 16000  # the audio is resampled to this rate before analysis

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    audio_16k = librosa.resample(audio, orig_sr=fs, target_sr=crepe_sr)
    audio_16k_torch = torch.FloatTensor(audio_16k).unsqueeze(0).to(device)

    # Pitch and periodicity per frame.
    f0, periodicity = torchcrepe.predict(
        audio_16k_torch,
        crepe_sr,
        hop_length_new,
        f0_min,
        f0_max,
        pad=True,
        model="full",
        batch_size=1024,
        device=device,
        return_periodicity=True,
    )

    # Smooth the periodicity, then drop silent and low-periodicity frames.
    periodicity = torchcrepe.filter.median(periodicity, 3)
    periodicity = torchcrepe.threshold.Silence(-60.0)(
        periodicity, audio_16k_torch, crepe_sr, hop_length_new
    )
    f0 = torchcrepe.threshold.At(threshold)(f0, periodicity)
    f0 = torchcrepe.filter.mean(f0, 3)

    # Rejected frames come back as NaN; mark them with 0 instead.
    f0 = torch.where(torch.isnan(f0), torch.full_like(f0, 0), f0)

    # Keep only the non-zero frames. squeeze(-1) keeps the index 1-D even
    # when a single frame survives.
    nzindex = torch.nonzero(f0[0]).squeeze(-1)
    if nzindex.numel() == 0:
        # Nothing voiced: there is nothing to interpolate from.
        return np.zeros(mel_len)
    voiced_f0 = torch.index_select(f0[0], dim=0, index=nzindex).cpu().numpy()

    # Timestamp of each kept frame, derived from the actual analysis hop
    # (equals the former hard-coded 0.005 s when hop_length_new == 80).
    hop_seconds = hop_length_new / crepe_sr
    time_org = hop_seconds * nzindex.cpu().numpy()

    # Resample onto the target frame grid, holding the edge values.
    time_frame = np.arange(mel_len) * hop_length / fs
    return np.interp(
        time_frame, time_org, voiced_f0, left=voiced_f0[0], right=voiced_f0[-1]
    )
|
|
|
|
def get_f0(audio, cfg):
    """Extract f0 with the extractor named by cfg.pitch_extractor.

    Args:
        audio: waveform forwarded to the selected extractor
        cfg: config object; cfg.pitch_extractor must be one of "dio", "pyin"
            or "parselmouth", and cfg is forwarded to the selected extractor
    Returns:
        f0: the f0 array produced by the selected extractor
    Raises:
        ValueError: if cfg.pitch_extractor is not a supported extractor
    """
    extractor = cfg.pitch_extractor
    if extractor == "dio":
        f0 = get_f0_features_using_dio(audio, cfg)
    elif extractor == "pyin":
        f0 = get_f0_features_using_pyin(audio, cfg)
    elif extractor == "parselmouth":
        # The parselmouth helper also returns the coarse pitch; only f0 is used.
        f0, _ = get_f0_features_using_parselmouth(audio, cfg)
    else:
        # Fail with a clear message instead of an UnboundLocalError on return.
        raise ValueError(
            "Unsupported pitch extractor: {!r} "
            "(expected 'dio', 'pyin' or 'parselmouth')".format(extractor)
        )
    return f0
|
|
|
|
def get_cents(f0_hz):
    """Convert the non-zero frames of an F0 contour from Hz to cents.

    F_{cent} = 1200 * log2 (F/440)

    Frames equal to 0 are dropped, so the result can be shorter than the
    input.

    Reference:
        APSIPA'17, Perceptual Evaluation of Singing Quality
    """
    is_voiced = f0_hz != 0
    ratio_to_a4 = f0_hz[is_voiced] / 440
    return np.log2(ratio_to_a4) * 1200
|
|
|
|
def get_pitch_derivatives(f0_hz):
    """Return the frame-to-frame pitch differences, in cents.

    f0_hz: (,T)

    The contour is first converted with get_cents, so the differences are
    taken between consecutive entries of its output.
    """
    cents = get_cents(f0_hz)
    # First-order difference: cents[i + 1] - cents[i].
    return np.diff(cents)
|
|
|
|
def get_pitch_sub_median(f0_hz):
    """Return the pitch contour in cents, centred on its own median.

    f0_hz: (,T)

    The contour is first converted with get_cents; the median is computed
    over its output.
    """
    cents = get_cents(f0_hz)
    median_cent = np.median(cents)
    return cents - median_cent
|
|