|
import librosa |
|
import numpy as np |
|
import parselmouth |
|
import pyworld as pw |
|
import torch |
|
|
|
|
|
|
|
def norm_f0(f0, uv=None):
    """
    Map an F0 contour to the log2 domain, marking unvoiced frames with -inf.

    :param f0: F0 values; it is indexed with ``uv`` below, so an array is expected
    :param uv: Mask that is True on unvoiced frames. Defaults to ``f0 == 0``
    :return: New array of ``log2(f0)`` with ``-inf`` wherever ``uv`` is set
    """
    if uv is None:
        uv = f0 == 0
    # Adding the mask turns the zeros on unvoiced frames into ones, so log2
    # is never evaluated at 0 there; those slots are overwritten right after.
    log_f0 = np.log2(f0 + uv)
    log_f0[uv] = -np.inf
    return log_f0
|
|
|
|
|
def denorm_f0(f0, uv, pitch_padding=None):
    """
    Undo the log2 mapping of an F0 contour (computes ``2 ** f0``).

    :param f0: F0 values in the log2 domain
    :param uv: Optional unvoiced mask; entries where ``uv > 0`` are set to 0.
        Pass None to skip this step
    :param pitch_padding: Optional index/mask of positions to set to 0
    :return: New array holding the de-normalised F0
    """
    hz = 2 ** f0
    if uv is not None:
        # `uv > 0` lets the mask be boolean or numeric (non-zero = unvoiced)
        hz[uv > 0] = 0
    if pitch_padding is not None:
        hz[pitch_padding] = 0
    return hz
|
|
|
|
|
def interp_f0(f0, uv=None):
    """
    Fill the unvoiced frames of an F0 contour by linear interpolation.

    The interpolation is carried out on the log2 contour produced by
    ``norm_f0`` and the result is mapped back with ``denorm_f0``.

    :param f0: F0 values, 0 on unvoiced frames
    :param uv: Boolean mask, True on unvoiced frames. Defaults to ``f0 == 0``
    :return: (interpolated f0, uv) -- ``uv`` is the mask that was used
    """
    if uv is None:
        uv = f0 == 0
    log_f0 = norm_f0(f0, uv)
    # Interpolation needs at least one voiced anchor and one gap to fill.
    # With no voiced frame the contour stays at -inf and comes back as 0.
    if uv.any() and not uv.all():
        voiced = ~uv
        log_f0[uv] = np.interp(
            np.nonzero(uv)[0], np.nonzero(voiced)[0], log_f0[voiced]
        )
    # uv=None: the filled-in frames must not be zeroed again
    return denorm_f0(log_f0, uv=None), uv
|
|
|
|
|
def get_pitch(wav_data, length, hparams, speed=1, interp_uv=False):
    """
    Extract F0 with the pitch extractor selected by ``hparams['pe']``.

    :param wav_data: Waveform samples, forwarded to the extractor
    :param length: Expected number of frames
    :param hparams: Mapping with the extractor settings; ``'pe'`` picks the
        backend ('parselmouth' or 'harvest') and defaults to 'parselmouth'
    :param speed: Forwarded to the extractor
    :param interp_uv: Forwarded to the extractor
    :return: Whatever the selected extractor returns (f0, uv)
    :raises ValueError: If ``hparams['pe']`` names an unknown extractor
    """
    extractor = hparams.get('pe', 'parselmouth')
    if extractor == 'parselmouth':
        return get_pitch_parselmouth(wav_data, length, hparams, speed=speed, interp_uv=interp_uv)
    if extractor == 'harvest':
        return get_pitch_harvest(wav_data, length, hparams, speed=speed, interp_uv=interp_uv)
    raise ValueError(f" [x] Unknown pitch extractor: {extractor}")
|
|
|
|
|
def get_pitch_parselmouth(wav_data, length, hparams, speed=1, interp_uv=False):
    """
    Extract F0 with parselmouth's autocorrelation pitch tracker (``to_pitch_ac``).

    :param wav_data: [T]
    :param length: Expected number of frames
    :param hparams: Mapping providing 'hop_size', 'audio_sample_rate',
        'f0_min' and 'f0_max'
    :param speed: Change the speed (scales the hop size)
    :param interp_uv: Interpolate unvoiced parts
    :return: f0, uv -- or (None, None) when every frame is unvoiced
    """
    sample_rate = hparams['audio_sample_rate']
    hop_size = int(np.round(hparams['hop_size'] * speed))
    time_step = hop_size / sample_rate
    f0_min = hparams['f0_min']
    f0_max = hparams['f0_max']

    # Left padding lasts 1.5 / f0_min seconds (rounded up to whole samples);
    # the assertion below checks that the first frame lands at that time.
    l_pad = int(np.ceil(1.5 / f0_min * sample_rate))
    # Right padding rounds the signal up to a whole number of hops and then
    # adds another l_pad + 1 samples.
    r_pad = hop_size * ((len(wav_data) - 1) // hop_size + 1) - len(wav_data) + l_pad + 1
    wav_data = np.pad(wav_data, (l_pad, r_pad))

    s = parselmouth.Sound(wav_data, sampling_frequency=sample_rate).to_pitch_ac(
        time_step=time_step, voicing_threshold=0.6,
        pitch_floor=f0_min, pitch_ceiling=f0_max)
    assert np.abs(s.t1 - 1.5 / f0_min) < 0.001, \
        f"first pitch frame at {s.t1}s, expected about {1.5 / f0_min}s"
    f0 = s.selected_array['frequency'].astype(np.float32)

    # Force exactly `length` frames: zero-pad the tail if short, then truncate.
    if len(f0) < length:
        f0 = np.pad(f0, (0, length - len(f0)))
    f0 = f0[:length]

    uv = f0 == 0
    if uv.all():
        return None, None
    if interp_uv:
        f0, uv = interp_f0(f0, uv)
    return f0, uv
|
|
|
def get_pitch_harvest(wav_data, length, hparams, speed=1, interp_uv=False):
    """
    Extract F0 with pyworld's Harvest estimator.

    :param wav_data: Waveform samples; cast to float64 before analysis
    :param length: Expected number of frames
    :param hparams: Mapping providing 'hop_size', 'audio_sample_rate',
        'f0_min' and 'f0_max'
    :param speed: Change the speed (scales the hop size)
    :param interp_uv: Interpolate unvoiced parts
    :return: f0, uv -- or (None, None) when every frame is unvoiced
    """
    sample_rate = hparams['audio_sample_rate']
    hop_size = int(np.round(hparams['hop_size'] * speed))
    # Hop expressed in milliseconds, passed as Harvest's frame_period
    frame_period = 1000 * hop_size / sample_rate

    f0, _ = pw.harvest(
        wav_data.astype(np.float64),
        sample_rate,
        f0_floor=hparams['f0_min'],
        f0_ceil=hparams['f0_max'],
        frame_period=frame_period,
    )
    f0 = f0.astype(np.float32)

    # Force exactly `length` frames: zero-pad the tail if short, then truncate.
    if f0.size < length:
        f0 = np.pad(f0, (0, length - f0.size))
    f0 = f0[:length]

    uv = f0 == 0
    if uv.all():
        return None, None
    if interp_uv:
        f0, uv = interp_f0(f0, uv)
    return f0, uv