# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import os

import librosa
import numpy as np
import soundfile as sf
import torch
import torch.nn.functional as F
from tqdm import tqdm

from .models.RawNetBasicBlock import Bottle2neck
from .models.RawNetModel import RawNet3


def extract_speaker_embd(
    model, fn: str, n_samples: int, n_segments: int = 10, gpu: bool = False
) -> torch.Tensor:
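    """Extract RawNet3 speaker embeddings for a single audio file.

    The utterance is cut into `n_segments` evenly spaced windows of
    `n_samples` samples each, and the model embeds all windows as one
    batch, returning a tensor of shape (n_segments, embedding_dim).
    """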
audio, sample_rate = sf.read(fn)
if len(audio.shape) > 1:
raise ValueError(
f"RawNet3 supports mono input only. Input data has a shape of {audio.shape}."
)
    if sample_rate != 16000:
        # RawNet3 expects 16 kHz input; resample if necessary.
        audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=16000)
    if len(audio) < n_samples:
        # RawNet3 was trained on 3-second utterances (48000 samples at 16 kHz);
        # pad shorter inputs by wrapping.
        shortage = n_samples - len(audio) + 1
        audio = np.pad(audio, (0, shortage), "wrap")
    # Cut n_segments evenly spaced windows of n_samples across the utterance.
    audios = []
    start_frames = np.linspace(0, len(audio) - n_samples, num=n_segments)
    for start in start_frames:
        audios.append(audio[int(start) : int(start) + n_samples])
audios = torch.from_numpy(np.stack(audios, axis=0).astype(np.float32))
if gpu:
audios = audios.to("cuda")
with torch.no_grad():
output = model(audios)
return output


def extract_speaker_similarity(target_path, reference_path):
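    """Return the cosine similarity between the mean RawNet3 embeddings
    of all audio files under `target_path` and `reference_path`."""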
model = RawNet3(
Bottle2neck,
model_scale=8,
context=True,
summed=True,
encoder_type="ECA",
nOut=256,
out_bn=False,
sinc_stride=10,
log_sinc=True,
norm_sinc="mean",
grad_mult=1,
)
gpu = False
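    # Load the pretrained checkpoint on CPU; the model is moved to the GPU
    # below when CUDA is available.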
model.load_state_dict(
torch.load(
"pretrained/rawnet3/model.pt",
map_location=lambda storage, loc: storage,
)["model"]
)
model.eval()
print("RawNet3 initialised & weights loaded!")
if torch.cuda.is_available():
print("Cuda available, conducting inference on GPU")
model = model.to("cuda")
gpu = True
    # Extract embeddings for every target file and average them.
    target_embeddings = []
for file in tqdm(os.listdir(target_path)):
output = extract_speaker_embd(
model,
fn=os.path.join(target_path, file),
n_samples=48000,
n_segments=10,
gpu=gpu,
).mean(0)
target_embeddings.append(output.detach().cpu().numpy())
target_embeddings = np.array(target_embeddings)
target_embedding = np.mean(target_embeddings, axis=0)
# print(f"Extracting embeddings for reference singer...")
reference_embeddings = []
for file in tqdm(os.listdir(reference_path)):
output = extract_speaker_embd(
model,
fn=os.path.join(reference_path, file),
n_samples=48000,
n_segments=10,
gpu=gpu,
).mean(0)
reference_embeddings.append(output.detach().cpu().numpy())
    reference_embeddings = np.array(reference_embeddings)
    reference_embedding = np.mean(reference_embeddings, axis=0)
    # Cosine similarity between the mean target and reference embeddings.
    cos_sim = F.cosine_similarity(
        torch.from_numpy(target_embedding).unsqueeze(0),
        torch.from_numpy(reference_embedding).unsqueeze(0),
        dim=1,
    )
    return cos_sim.item()
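

# Minimal usage sketch (an illustration, not part of the original module).
# The directory paths below are hypothetical placeholders for folders of mono
# WAV files, and the pretrained checkpoint is expected at
# pretrained/rawnet3/model.pt. Because this file uses relative imports, run it
# as a module (python -m <package>.<this_module>) rather than as a script.
if __name__ == "__main__":
    similarity = extract_speaker_similarity(
        target_path="data/target_singer",  # hypothetical directory
        reference_path="data/reference_singer",  # hypothetical directory
    )
    print(f"Speaker cosine similarity: {similarity:.4f}")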