# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import os

import librosa
import numpy as np
import soundfile as sf
import torch
import torch.nn.functional as F
from tqdm import tqdm

from .models.RawNetBasicBlock import Bottle2neck
from .models.RawNetModel import RawNet3


def extract_speaker_embd(
    model, fn: str, n_samples: int, n_segments: int = 10, gpu: bool = False
) -> torch.Tensor:
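    """Extract RawNet3 speaker embeddings for a single audio file.

    The utterance is cut into `n_segments` evenly spaced windows of
    `n_samples` samples each, and the model embeds all windows as one
    batch, returning a tensor of shape (n_segments, embedding_dim).
    """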
audio, sample_rate = sf.read(fn)
if len(audio.shape) > 1:
raise ValueError(
f"RawNet3 supports mono input only. Input data has a shape of {audio.shape}."
)
    if sample_rate != 16000:
        # RawNet3 expects 16 kHz input; resample if necessary.
        audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=16000)
    if len(audio) < n_samples:
        # RawNet3 was trained on 3-second utterances (48000 samples at 16 kHz);
        # pad shorter inputs by wrapping.
        shortage = n_samples - len(audio) + 1
        audio = np.pad(audio, (0, shortage), "wrap")
    # Cut n_segments evenly spaced windows of n_samples across the utterance.
    audios = []
    start_frames = np.linspace(0, len(audio) - n_samples, num=n_segments)
    for start in start_frames:
        audios.append(audio[int(start) : int(start) + n_samples])
audios = torch.from_numpy(np.stack(audios, axis=0).astype(np.float32))
if gpu:
audios = audios.to("cuda")
with torch.no_grad():
output = model(audios)
return output


def extract_speaker_similarity(target_path, reference_path):
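    """Return the cosine similarity between the mean RawNet3 embeddings
    of all audio files under `target_path` and `reference_path`."""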
model = RawNet3(
Bottle2neck,
model_scale=8,
context=True,
summed=True,
encoder_type="ECA",
nOut=256,
out_bn=False,
sinc_stride=10,
log_sinc=True,
norm_sinc="mean",
grad_mult=1,
)
gpu = False
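    # Load the pretrained checkpoint on CPU; the model is moved to the GPU
    # below when CUDA is available.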
model.load_state_dict(
torch.load(
"pretrained/rawnet3/model.pt",
map_location=lambda storage, loc: storage,
)["model"]
)
model.eval()
print("RawNet3 initialised & weights loaded!")
if torch.cuda.is_available():
print("Cuda available, conducting inference on GPU")
model = model.to("cuda")
gpu = True
    # Extract embeddings for every target file and average them.
    target_embeddings = []
for file in tqdm(os.listdir(target_path)):
output = extract_speaker_embd(
model,
fn=os.path.join(target_path, file),
n_samples=48000,
n_segments=10,
gpu=gpu,
).mean(0)
target_embeddings.append(output.detach().cpu().numpy())
target_embeddings = np.array(target_embeddings)
target_embedding = np.mean(target_embeddings, axis=0)
# print(f"Extracting embeddings for reference singer...")
reference_embeddings = []
for file in tqdm(os.listdir(reference_path)):
output = extract_speaker_embd(
model,
fn=os.path.join(reference_path, file),
n_samples=48000,
n_segments=10,
gpu=gpu,
).mean(0)
reference_embeddings.append(output.detach().cpu().numpy())
    reference_embeddings = np.array(reference_embeddings)
    reference_embedding = np.mean(reference_embeddings, axis=0)
    # Cosine similarity between the mean target and reference embeddings.
    cos_sim = F.cosine_similarity(
        torch.from_numpy(target_embedding).unsqueeze(0),
        torch.from_numpy(reference_embedding).unsqueeze(0),
        dim=1,
    )
    return cos_sim.item()
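

# Minimal usage sketch (an illustration, not part of the original module).
# The directory paths below are hypothetical placeholders for folders of mono
# WAV files, and the pretrained checkpoint is expected at
# pretrained/rawnet3/model.pt. Because this file uses relative imports, run it
# as a module (python -m <package>.<this_module>) rather than as a script.
if __name__ == "__main__":
    similarity = extract_speaker_similarity(
        target_path="data/target_singer",  # hypothetical directory
        reference_path="data/reference_singer",  # hypothetical directory
    )
    print(f"Speaker cosine similarity: {similarity:.4f}")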