import os
import random

import librosa
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset
def f0_to_coarse(f0, hparams):
    """Quantize F0 (Hz) into `f0_bin` coarse bins on the mel scale.

    Bin 0 is reserved for unvoiced frames (f0 == 0); voiced frames map to
    bins 1 .. f0_bin - 1, with f0_min -> bin 1 and f0_max -> bin f0_bin - 1.
    """
    f0_bin = hparams['f0_bin']
    f0_max = hparams['f0_max']
    f0_min = hparams['f0_min']
    is_torch = isinstance(f0, torch.Tensor)
    # to mel scale
    f0_mel_min = 1127 * np.log(1 + f0_min / 700)
    f0_mel_max = 1127 * np.log(1 + f0_max / 700)
    f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700)
    unvoiced = (f0_mel == 0)
    # Linearly rescale voiced frames to [1, f0_bin - 1]; bin 0 stays unvoiced.
    f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1
    f0_mel[f0_mel <= 1] = 1
    f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1
    f0_mel[unvoiced] = 0
    # Round to the nearest bin.
    f0_coarse = (f0_mel + 0.5).long() if is_torch else np.rint(f0_mel).astype(int)
    assert f0_coarse.max() <= f0_bin - 1 and f0_coarse.min() >= 0, (f0_coarse.max(), f0_coarse.min())
    return f0_coarse

def log_f0(f0, hparams):
    """Quantize F0 (Hz) into `f0_bin` bins on a log (semitone) scale.

    Despite the `f0_mel` name, the intermediate scale here is semitones
    above `f0_min`, not mel. Bin 0 is reserved for unvoiced frames.
    """
    f0_bin = hparams['f0_bin']
    f0_max = hparams['f0_max']
    f0_min = hparams['f0_min']
    f0_mel = np.zeros_like(f0)
    # Semitone distance from f0_min, offset by 1 so voiced values stay > 0.
    f0_mel[f0 != 0] = 12 * np.log2(f0[f0 != 0] / f0_min) + 1
    f0_mel_min = 12 * np.log2(f0_min / f0_min) + 1  # == 1
    f0_mel_max = 12 * np.log2(f0_max / f0_min) + 1
    unvoiced = (f0_mel == 0)
    # Linearly rescale voiced frames to [1, f0_bin - 1]; bin 0 stays unvoiced.
    f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1
    f0_mel[f0_mel <= 1] = 1
    f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1
    f0_mel[unvoiced] = 0
    f0_coarse = np.rint(f0_mel).astype(int)
    assert f0_coarse.max() <= (f0_bin - 1) and f0_coarse.min() >= 0, (f0_coarse.max(), f0_coarse.min())
    return f0_coarse

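# Hedged sanity-check sketch (not part of the original file). With the
# hyperparameters used below (f0_bin=345, f0_min=C2, f0_max=C#6), the voiced
# range spans 49 semitones and (f0_bin - 2) / 49 == 343 / 49 == 7, i.e.
# log_f0 resolves 7 bins per semitone (~14.3 cents). The helper name
# `_check_f0_quantizers` is hypothetical, for illustration only.
def _check_f0_quantizers():
    f0 = np.array([0.0, librosa.note_to_hz('C2'), 220.0, 440.0, librosa.note_to_hz('C6')])
    mel_bins = f0_to_coarse(f0, {'f0_bin': 256,
                                 'f0_min': librosa.note_to_hz('C2'),
                                 'f0_max': librosa.note_to_hz('C6')})
    log_bins = log_f0(f0, {'f0_bin': 345,
                           'f0_min': librosa.note_to_hz('C2'),
                           'f0_max': librosa.note_to_hz('C#6')})
    # Unvoiced frames stay in bin 0; voiced bins increase monotonically with Hz.
    assert mel_bins[0] == 0 and log_bins[0] == 0
    assert list(mel_bins[1:]) == sorted(mel_bins[1:])
    assert list(log_bins[1:]) == sorted(log_bins[1:])
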
# Dataset for training the "average voice" encoder.
class VCDecLPCDataset(Dataset):
    def __init__(self, data_dir, subset, content_dir='lpc_mel_512', extract_emb=False,
                 f0_type='bins'):
        self.path = data_dir
        # meta_fix.csv is expected to provide 'subset', 'folder', 'subfolder'
        # and 'file_name' columns (see __getitem__).
        meta = pd.read_csv(data_dir + 'meta_fix.csv')
        self.meta = meta[meta['subset'] == subset]
        self.content_dir = content_dir
        self.extract_emb = extract_emb
        self.f0_type = f0_type
    def get_vc_data(self, audio_path, mel_id):
        # Feature directories sit next to the 'vocal' directory: replacing
        # 'vocal' in the path yields the mel / embed / f0 / content folders,
        # each holding one .npy file per utterance.
        mel_dir = audio_path.replace('vocal', 'mel')
        embed_dir = audio_path.replace('vocal', 'embed')
        pitch_dir = audio_path.replace('vocal', 'f0')
        content_dir = audio_path.replace('vocal', self.content_dir)
        mel = np.load(os.path.join(mel_dir, mel_id + '.npy'))
        if self.extract_emb:
            embed = np.load(os.path.join(embed_dir, mel_id + '.npy'))
        else:
            embed = np.zeros(1)
        pitch = np.load(os.path.join(pitch_dir, mel_id + '.npy'))
        content = np.load(os.path.join(content_dir, mel_id + '.npy'))
        # Replace NaNs from the pitch tracker with 0 (unvoiced) before quantizing.
        pitch = np.nan_to_num(pitch)
        if self.f0_type == 'bins':
            pitch = f0_to_coarse(pitch, {'f0_bin': 256,
                                         'f0_min': librosa.note_to_hz('C2'),
                                         'f0_max': librosa.note_to_hz('C6')})
        elif self.f0_type == 'log':
            pitch = log_f0(pitch, {'f0_bin': 345,
                                   'f0_min': librosa.note_to_hz('C2'),
                                   'f0_max': librosa.note_to_hz('C#6')})
        mel = torch.from_numpy(mel).float()
        embed = torch.from_numpy(embed).float()
        pitch = torch.from_numpy(pitch).float()
        content = torch.from_numpy(content).float()
        return (mel, embed, pitch, content)
    def __getitem__(self, index):
        row = self.meta.iloc[index]
        mel_id = row['file_name']
        audio_path = self.path + row['folder'] + row['subfolder']
        mel, embed, pitch, content = self.get_vc_data(audio_path, mel_id)
        item = {'mel': mel, 'embed': embed, 'f0': pitch, 'content': content}
        return item

    def __len__(self):
        return len(self.meta)

class VCDecLPCBatchCollate(object):
    """Collate: crops two random `train_frames`-long windows from each
    utterance (mel1/mel2), with f0 and content aligned to the mel1 window."""
    def __init__(self, train_frames, eps=1e-5):
        self.train_frames = train_frames
        self.eps = eps

    def __call__(self, batch):
        train_frames = self.train_frames
        eps = self.eps
        B = len(batch)
        embed = torch.stack([item['embed'] for item in batch], 0)
        n_mels = batch[0]['mel'].shape[0]
        content_dim = batch[0]['content'].shape[0]
        # Pad with np.log(eps), the log-mel value of an all-zero waveform.
        mels1 = torch.ones((B, n_mels, train_frames), dtype=torch.float32) * np.log(eps)
        mels2 = torch.ones((B, n_mels, train_frames), dtype=torch.float32) * np.log(eps)
        # ! need to deal with empty frames here
        contents1 = torch.ones((B, content_dim, train_frames), dtype=torch.float32) * np.log(eps)
        f0s1 = torch.zeros((B, train_frames), dtype=torch.float32)
        max_starts = [max(item['mel'].shape[-1] - train_frames, 0)
                      for item in batch]
        # random.randint is inclusive on both ends, so a window may start
        # anywhere in [0, mel_length - train_frames].
        starts1 = [random.randint(0, m) for m in max_starts]
        starts2 = [random.randint(0, m) for m in max_starts]
        mel_lengths = []
        for i, item in enumerate(batch):
            mel = item['mel']
            f0 = item['f0']
            content = item['content']
            mel_length = min(mel.shape[-1], train_frames)
            mels1[i, :, :mel_length] = mel[:, starts1[i]:starts1[i] + mel_length]
            f0s1[i, :mel_length] = f0[starts1[i]:starts1[i] + mel_length]
            contents1[i, :, :mel_length] = content[:, starts1[i]:starts1[i] + mel_length]
            mels2[i, :, :mel_length] = mel[:, starts2[i]:starts2[i] + mel_length]
            mel_lengths.append(mel_length)
        mel_lengths = torch.LongTensor(mel_lengths)
        return {'mel1': mels1, 'mel2': mels2, 'mel_lengths': mel_lengths,
                'embed': embed,
                'f0_1': f0s1,
                'content1': contents1}

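# Hedged usage sketch (not from the original file): wiring the dataset and
# collate into a DataLoader. `data_dir`, `train_frames` and `batch_size` are
# illustrative values, not values prescribed by this repo.
def _train_loader_example(data_dir='data/', train_frames=128, batch_size=8):
    from torch.utils.data import DataLoader
    dataset = VCDecLPCDataset(data_dir, subset='train')
    collate = VCDecLPCBatchCollate(train_frames)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True,
                        collate_fn=collate)
    batch = next(iter(loader))
    # batch['mel1'], batch['mel2']: (B, n_mels, train_frames) random crops;
    # batch['f0_1'] and batch['content1'] are aligned with the mel1 crop.
    return batch
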
class VCDecLPCTest(Dataset):
    """Test pairs from meta_test.csv: a content utterance (mel1, f0, content)
    and a timbre reference utterance (mel2, embed)."""
    def __init__(self, data_dir, subset='test', eps=1e-5, test_frames=256,
                 content_dir='lpc_mel_512', extract_emb=False, f0_type='bins'):
        self.path = data_dir
        meta = pd.read_csv(data_dir + 'meta_test.csv')
        self.meta = meta[meta['subset'] == subset]
        self.content_dir = content_dir
        self.extract_emb = extract_emb
        self.eps = eps
        self.test_frames = test_frames
        self.f0_type = f0_type
    def get_vc_data(self, audio_path, mel_id, pitch_shift):
        # Same layout as VCDecLPCDataset.get_vc_data, with an extra
        # multiplicative pitch shift applied before quantization.
        mel_dir = audio_path.replace('vocal', 'mel')
        embed_dir = audio_path.replace('vocal', 'embed')
        pitch_dir = audio_path.replace('vocal', 'f0')
        content_dir = audio_path.replace('vocal', self.content_dir)
        mel = np.load(os.path.join(mel_dir, mel_id + '.npy'))
        if self.extract_emb:
            embed = np.load(os.path.join(embed_dir, mel_id + '.npy'))
        else:
            embed = np.zeros(1)
        pitch = np.load(os.path.join(pitch_dir, mel_id + '.npy'))
        content = np.load(os.path.join(content_dir, mel_id + '.npy'))
        pitch = np.nan_to_num(pitch)
        # Shift the source F0 toward the target singer's range; unvoiced
        # frames (f0 == 0) are unaffected by the multiplication.
        pitch = pitch * pitch_shift
        if self.f0_type == 'bins':
            pitch = f0_to_coarse(pitch, {'f0_bin': 256,
                                         'f0_min': librosa.note_to_hz('C2'),
                                         'f0_max': librosa.note_to_hz('C6')})
        elif self.f0_type == 'log':
            pitch = log_f0(pitch, {'f0_bin': 345,
                                   'f0_min': librosa.note_to_hz('C2'),
                                   'f0_max': librosa.note_to_hz('C#6')})
        mel = torch.from_numpy(mel).float()
        embed = torch.from_numpy(embed).float()
        pitch = torch.from_numpy(pitch).float()
        content = torch.from_numpy(content).float()
        return (mel, embed, pitch, content)
    def __getitem__(self, index):
        row = self.meta.iloc[index]
        # Content utterance: supplies mel1, f0 and content features.
        mel_id = row['content_file_name']
        audio_path = self.path + row['content_folder'] + row['content_subfolder']
        pitch_shift = row['pitch_shift']
        mel1, _, f0, content = self.get_vc_data(audio_path, mel_id, pitch_shift)
        # Timbre utterance: supplies mel2 and the speaker embedding.
        mel_id = row['timbre_file_name']
        audio_path = self.path + row['timbre_folder'] + row['timbre_subfolder']
        mel2, embed, _, _ = self.get_vc_data(audio_path, mel_id, pitch_shift)
        n_mels = mel1.shape[0]
        content_dim = content.shape[0]
        # Pad/crop everything to test_frames, padding mels with np.log(eps).
        mels1 = torch.ones((n_mels, self.test_frames), dtype=torch.float32) * np.log(self.eps)
        mels2 = torch.ones((n_mels, self.test_frames), dtype=torch.float32) * np.log(self.eps)
        lpcs1 = torch.ones((content_dim, self.test_frames), dtype=torch.float32) * np.log(self.eps)
        f0s1 = torch.zeros(self.test_frames, dtype=torch.float32)
        mel_length = min(mel1.shape[-1], self.test_frames)
        mels1[:, :mel_length] = mel1[:, :mel_length]
        f0s1[:mel_length] = f0[:mel_length]
        lpcs1[:, :mel_length] = content[:, :mel_length]
        mel_length = min(mel2.shape[-1], self.test_frames)
        mels2[:, :mel_length] = mel2[:, :mel_length]
        return {'mel1': mels1, 'mel2': mels2, 'embed': embed, 'f0_1': f0s1, 'content1': lpcs1}
    def __len__(self):
        return len(self.meta)

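# Hedged usage sketch (not from the original file): test items are already
# padded/cropped to a fixed length, so PyTorch's default collate works.
# `data_dir` is an illustrative value.
def _test_loader_example(data_dir='data/'):
    from torch.utils.data import DataLoader
    test_set = VCDecLPCTest(data_dir, subset='test')
    loader = DataLoader(test_set, batch_size=1, shuffle=False)
    return next(iter(loader))
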
if __name__ == '__main__':
    f0 = np.array([110.0, 220.0, librosa.note_to_hz('C2'), 0, librosa.note_to_hz('E3'), librosa.note_to_hz('C6')])
    # C2..C#6 covers 50 MIDI notes, i.e. 49 semitones, so
    # f0_mel_max = 12 * log2(f0_max / f0_min) + 1 = 50.
    pitch = log_f0(f0, {'f0_bin': 345,
                        'f0_min': librosa.note_to_hz('C2'),
                        'f0_max': librosa.note_to_hz('C#6')})
    print(pitch)