Spaces:

atsushieee
/

sovits-test

Sleeping

App Files Files Community

sovits-test / pitch /inference.py

atsushieee

Update pitch/inference.py

e417f48 10 months ago

raw

history blame

4.14 kB

	import sys,os
	sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
	import torch
	import librosa
	import argparse
	import numpy as np
	import crepe


	def move_average(a, n, mode="same"):
	    """Smooth *a* with a length-*n* box filter.

	    Args:
	        a: 1-D sequence of values to smooth.
	        n: Number of taps in the averaging window.
	        mode: Passed straight through to ``np.convolve``
	            (``"same"`` by default).

	    Returns:
	        The result of convolving *a* with ``n`` equal weights of ``1 / n``.
	    """
	    box_kernel = np.ones((n,)) / n
	    smoothed = np.convolve(a, box_kernel, mode=mode)
	    return smoothed


	def compute_f0_mouth(path, device):
	    """Extract an F0 contour from an audio file with Praat (parselmouth).

	    The file is loaded at 16 kHz, analysed with ``to_pitch_ac`` using a
	    160-sample time step, and the resulting frequency track is padded with
	    ``1024 // 160`` zero-valued frames on each side.

	    Args:
	        path: Audio file to analyse.
	        device: Unused by this function.

	    Returns:
	        The zero-padded frequency array.
	    """
	    # Imported lazily; install with: pip install praat-parselmouth
	    import parselmouth

	    hop_samples = 160
	    waveform, sr = librosa.load(path, sr=16000)
	    assert sr == 16000

	    # Same amount of padding on the left and on the right.
	    # NOTE(review): presumably compensates for a 1024-sample analysis
	    # window used elsewhere -- confirm against the consumer of this F0.
	    edge_frames = 1024 // hop_samples

	    sound = parselmouth.Sound(waveform, sr)
	    pitch_track = sound.to_pitch_ac(
	        time_step=hop_samples / sr,
	        voicing_threshold=0.5,
	        pitch_floor=30,
	        pitch_ceiling=1000,
	    )
	    frequencies = pitch_track.selected_array['frequency']
	    return np.pad(frequencies, [[edge_frames, edge_frames]], mode='constant')


	def compute_f0_salience(filename, device):
	    """Extract an F0 contour with the project's salience-based estimator.

	    The file is loaded at 16 kHz and analysed by
	    ``pitch.core.salience.salience`` with a 320-sample hop. Each value is
	    then duplicated and the result is smoothed with a 3-tap moving average.

	    Args:
	        filename: Audio file to analyse.
	        device: Unused by this function.

	    Returns:
	        The smoothed, frame-doubled F0 array.
	    """
	    from pitch.core.salience import salience

	    waveform, sr = librosa.load(filename, sr=16000)
	    assert sr == 16000

	    # Only the first of the three returned values is needed here.
	    contour, _, _ = salience(
	        waveform,
	        Fs=sr,
	        H=320,
	        N=2048,
	        F_min=45.0,
	        F_max=1760.0,
	    )

	    # 320 -> 160 * 2: repeat every value twice along the last axis.
	    doubled = np.repeat(contour, 2, -1)
	    return move_average(doubled, 3)


	def compute_f0_voice(filename, device):
	    """Extract an F0 contour with CREPE, zeroing low-periodicity frames.

	    The file is loaded at 16 kHz, a small amount of random noise is added,
	    and ``crepe.predict`` is run with a 160-sample (10 ms) hop. Frames whose
	    filtered periodicity is below 0.5 are set to 0.

	    Args:
	        filename: Audio file to analyse.
	        device: Device identifier forwarded to ``crepe.predict``.

	    Returns:
	        The pitch result with its leading dimension squeezed away.
	    """
	    waveform, sr = librosa.load(filename, sr=16000)
	    assert sr == 16000

	    # Add a leading dimension, then mix in low-level random noise.
	    # Because of the noise, repeated runs are not bit-identical.
	    signal = torch.tensor(np.copy(waveform))[None]
	    signal = signal + torch.randn_like(signal) * 0.001

	    # Positional arguments: hop of 160 samples (10 ms), fmin 50, fmax 1000,
	    # model "full".
	    pitch, periodicity = crepe.predict(
	        signal,
	        sr,
	        160,
	        50,
	        1000,
	        "full",
	        batch_size=512,
	        device=device,
	        return_periodicity=True,
	    )

	    # CREPE was not trained on silent audio, so its output on silence is
	    # unreliable; filter both tracks and mute frames with low periodicity.
	    periodicity = crepe.filter.median(periodicity, 7)
	    pitch = crepe.filter.mean(pitch, 5)
	    pitch[periodicity < 0.5] = 0
	    return pitch.squeeze(0)


	def compute_f0_sing(filename, device):
	    """Extract an F0 contour with CREPE using a 20 ms hop.

	    The file is loaded at 16 kHz, a small amount of random noise is added,
	    and ``crepe.predict`` is run with a 320-sample hop. Every value is then
	    repeated twice and the track is passed through ``crepe.filter.mean``.

	    Args:
	        filename: Audio file to analyse.
	        device: Device identifier forwarded to ``crepe.predict``.

	    Returns:
	        The filtered pitch result with its leading dimension squeezed away.
	    """
	    waveform, sr = librosa.load(filename, sr=16000)
	    assert sr == 16000

	    # Add a leading dimension, then mix in low-level random noise.
	    signal = torch.tensor(np.copy(waveform))[None]
	    signal = signal + torch.randn_like(signal) * 0.001

	    # Positional arguments: hop of 320 samples (20 ms), fmin 50, fmax 1000,
	    # model "full".
	    coarse_pitch = crepe.predict(
	        signal,
	        sr,
	        320,
	        50,
	        1000,
	        "full",
	        batch_size=512,
	        device=device,
	        return_periodicity=False,
	    )

	    # 320 -> 160 * 2: repeat every value twice along the last axis.
	    # NOTE(review): np.repeat is applied to whatever crepe.predict returns;
	    # if that is a torch tensor this relies on NumPy's conversion -- confirm.
	    dense_pitch = np.repeat(coarse_pitch, 2, -1)
	    dense_pitch = crepe.filter.mean(dense_pitch, 5)
	    return dense_pitch.squeeze(0)


	def save_csv_pitch(pitch, path):
	    """Write a pitch track to *path* as ``<time>,<pitch>`` text lines.

	    Frame ``k`` is stamped at ``k * 10`` milliseconds, formatted as
	    ``"{minutes}m {seconds}s {milliseconds}"`` with the millisecond field
	    right-aligned to width 3. Each pitch value is truncated with ``int``.

	    Args:
	        pitch: Sequence of per-frame pitch values.
	        path: Destination file; it is overwritten.
	    """
	    with open(path, "w", encoding='utf-8') as out:
	        for frame_index, value in enumerate(pitch):
	            elapsed_ms = frame_index * 10
	            minute, remainder = divmod(elapsed_ms, 60000)
	            seconds, millisecond = divmod(remainder, 1000)
	            row = f"{minute}m {seconds}s {millisecond:3d},{int(value)}"
	            out.write(row + "\n")


	def load_csv_pitch(path):
	    """Read the pitch column back from a file written by ``save_csv_pitch``.

	    Only the text after the last comma on each line is used; it is parsed
	    with ``int``. Blank lines (for example a trailing empty line left by a
	    spreadsheet or text editor) are skipped instead of raising.

	    Args:
	        path: UTF-8 text file with one ``<time>,<pitch>`` entry per line.

	    Returns:
	        A list of integer pitch values, in file order.

	    Raises:
	        ValueError: If a non-blank line's last field is not an integer.
	    """
	    pitch = []
	    with open(path, "r", encoding='utf-8') as pitch_file:
	        # Iterate lazily rather than materialising every line up front.
	        for line in pitch_file:
	            entry = line.strip()
	            if not entry:
	                # int('') would raise; an empty line carries no frame.
	                continue
	            pitch.append(int(entry.split(",")[-1]))
	    return pitch


	if __name__ == "__main__":
	    # Script entry point: read a wav file, estimate its pitch with
	    # compute_f0_voice, and write the result with save_csv_pitch.
	    cli = argparse.ArgumentParser()
	    cli.add_argument("-w", "--wav", help="wav", dest="wav", required=True)
	    cli.add_argument("-p", "--pit", help="pit", dest="pit", required=True)  # csv for excel
	    options = cli.parse_args()

	    # Echo the input and output paths, one per line.
	    for shown_path in (options.wav, options.pit):
	        print(shown_path)

	    if torch.cuda.is_available():
	        device = "cuda"
	    else:
	        device = "cpu"

	    # compute_f0_sing(options.wav, device) is the alternative extractor.
	    pitch = compute_f0_voice(options.wav, device)
	    save_csv_pitch(pitch, options.pit)