Spaces:

amphion
/

NaturalSpeech2

Sleeping

File size: 3,575 Bytes

b725c5a

# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os

import numpy as np
import soundfile as sf
import torch
import torch.nn.functional as F
from tqdm import tqdm
import librosa

from .models.RawNetModel import RawNet3
from .models.RawNetBasicBlock import Bottle2neck


def extract_speaker_embd(
    model, fn: str, n_samples: int, n_segments: int = 10, gpu: bool = False
) -> torch.Tensor:
    """Compute speaker embeddings for evenly spaced segments of one audio file.

    The file is read with ``soundfile``, resampled to 16 kHz when its sample
    rate differs, and wrap-padded when it is shorter than ``n_samples``.
    ``n_segments`` windows of ``n_samples`` samples each, whose start offsets
    are evenly spaced over the signal, are stacked into a single float32 batch
    and passed through ``model`` with gradient tracking disabled.

    Args:
        model: Callable applied to the ``(n_segments, n_samples)`` float32
            batch. Callers in this file pass a RawNet3 instance.
        fn: Path of the audio file to read.
        n_samples: Length of each segment, in samples at 16 kHz.
        n_segments: Number of segments cut from the file.
        gpu: When true, the batch is moved to ``"cuda"`` before the forward
            pass; the model is expected to already live on that device.

    Returns:
        The output of ``model`` for the batch, returned unchanged. Callers in
        this file treat it as a tensor whose first axis indexes segments.

    Raises:
        ValueError: If the audio has more than one channel, contains no
            samples, or if ``n_samples`` / ``n_segments`` is not positive.
    """
    if n_samples < 1:
        raise ValueError(f"n_samples must be positive, got {n_samples}.")
    if n_segments < 1:
        raise ValueError(f"n_segments must be positive, got {n_segments}.")

    audio, sample_rate = sf.read(fn)
    if audio.ndim > 1:
        raise ValueError(
            f"RawNet3 supports mono input only. Input data has a shape of {audio.shape}."
        )
    if len(audio) == 0:
        # Wrap-padding cannot extend an empty signal; fail with a clear message.
        raise ValueError(f"Audio file {fn!r} contains no samples.")

    if sample_rate != 16000:
        # Resample to 16 kHz, the rate the rest of this function assumes.
        audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=16000)

    if len(audio) < n_samples:  # RawNet3 was trained using utterances of 3 seconds
        # Repeat the signal from its start until it is one sample longer than a
        # single segment, so every window below is exactly n_samples long.
        shortage = n_samples - len(audio) + 1
        audio = np.pad(audio, (0, shortage), "wrap")

    # Start offsets spread evenly from the first to the last valid window.
    start_frames = np.linspace(0, len(audio) - n_samples, num=n_segments)
    segments = [
        audio[int(start) : int(start) + n_samples] for start in start_frames
    ]

    audios = torch.from_numpy(np.stack(segments, axis=0).astype(np.float32))
    if gpu:
        audios = audios.to("cuda")
    with torch.no_grad():
        output = model(audios)

    return output


def _embed_directory(model, directory, gpu):
    """Return one segment-averaged speaker embedding per file in ``directory``."""
    # Sorted so that the processing order does not depend on the filesystem.
    file_names = sorted(os.listdir(directory))
    if not file_names:
        raise ValueError(f"No audio files found in directory {directory!r}.")

    embeddings = []
    for file_name in tqdm(file_names):
        # 48000 samples at 16 kHz = 3-second segments; average over segments.
        embedding = extract_speaker_embd(
            model,
            fn=os.path.join(directory, file_name),
            n_samples=48000,
            n_segments=10,
            gpu=gpu,
        ).mean(0)
        embeddings.append(embedding.detach().cpu().numpy())
    return np.array(embeddings)


def extract_speaker_similarity(
    target_path, reference_path, model_path="pretrained/rawnet3/model.pt"
):
    """Compute the cosine similarity between two sets of utterances.

    A RawNet3 model is built and its weights are loaded from ``model_path``.
    Every file in each directory is embedded (10 segments of 48000 samples,
    averaged per file), the per-file embeddings are averaged per directory,
    and the cosine similarity of the two directory-level means is returned.
    Inference runs on CUDA when it is available, otherwise on CPU.

    Args:
        target_path: Directory whose files are all read as target audio.
        reference_path: Directory whose files are all read as reference audio.
        model_path: Checkpoint file holding the RawNet3 weights under the
            ``"model"`` key. Defaults to the path relative to the current
            working directory that this function has always used.

    Returns:
        The cosine similarity of the two mean embeddings, as a Python float.

    Raises:
        ValueError: If either directory contains no files.
    """
    model = RawNet3(
        Bottle2neck,
        model_scale=8,
        context=True,
        summed=True,
        encoder_type="ECA",
        nOut=256,
        out_bn=False,
        sinc_stride=10,
        log_sinc=True,
        norm_sinc="mean",
        grad_mult=1,
    )

    # Always deserialize onto the CPU; the model is moved to the GPU afterwards.
    checkpoint = torch.load(model_path, map_location="cpu")
    model.load_state_dict(checkpoint["model"])
    model.eval()
    print("RawNet3 initialised & weights loaded!")

    gpu = torch.cuda.is_available()
    if gpu:
        print("Cuda available, conducting inference on GPU")
        model = model.to("cuda")

    target_mean = np.mean(_embed_directory(model, target_path, gpu), axis=0)
    reference_mean = np.mean(_embed_directory(model, reference_path, gpu), axis=0)

    # Add a leading batch axis so the similarity is taken over the embedding axis.
    cos_sim = F.cosine_similarity(
        torch.from_numpy(target_mean).unsqueeze(0),
        torch.from_numpy(reference_mean).unsqueeze(0),
        dim=1,
    )

    return cos_sim.item()