File size: 2,578 Bytes
32bac05
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f6c8d4d
32bac05
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
from tsmnet.modules import Autoencoder

from torchvision.transforms.functional import resize
from torchvision.transforms import InterpolationMode
from pathlib import Path
import yaml
import torch
import os


def get_default_device():
    """Return "cuda" when torch reports an available CUDA device, else "cpu"."""
    return "cuda" if torch.cuda.is_available() else "cpu"


def load_model(path, device=None):
    """
    Build an Autoencoder and load its weights from a checkpoint file.

    The hyper-parameters (compress_ratios, ngf, n_residual_layers) are read
    from an "args.yml" file located in the same directory as the checkpoint.

    Args:
        path (str or Path): path to the checkpoint file holding the Autoencoder state dict
        device (str or torch.device, optional): device to load the model on.
            When None, get_default_device() is queried at call time.
    Returns:
        Autoencoder: the model placed on `device` with the checkpoint weights loaded
    """
    # Resolve the default lazily so the device is picked when the function is
    # called, not once at import time.
    if device is None:
        device = get_default_device()

    args_file = Path(path).parent / "args.yml"
    with open(args_file, "r") as f:
        # SECURITY: yaml.unsafe_load can construct arbitrary Python objects.
        # `args` is accessed through attributes below, so the file presumably
        # stores a dumped namespace object - only load args.yml from trusted sources.
        args = yaml.unsafe_load(f)

    compress_ratios = [int(n) for n in args.compress_ratios]
    netA = Autoencoder(compress_ratios, args.ngf, args.n_residual_layers).to(device)
    # NOTE(review): depending on the torch version, torch.load may unpickle
    # arbitrary objects - the checkpoint must come from a trusted source too.
    netA.load_state_dict(torch.load(path, map_location=device))
    return netA


class Neuralgram:
    """Converts audio to a neuralgram and back using a loaded Autoencoder."""

    def __init__(self, path, device=None):
        """
        Args:
            path: checkpoint path handed to load_model
            device (str or torch.device, optional): device used for the model and
                its inputs; get_default_device() is used when None
        """
        self.device = get_default_device() if device is None else device
        self.netA = load_model(path, self.device)

    @torch.no_grad()
    def __call__(self, audio):
        """
        Encodes audio into a neuralgram (See Autoencoder.encoder in tsmnet/modules.py)
        Args:
            audio (torch.tensor): PyTorch tensor containing audio (batch_size, timesteps)
        Returns:
            torch.tensor: neuralgram computed on input audio (batch_size, channels, timesteps)
        """
        waveform = torch.as_tensor(audio)
        # Insert a channel axis at dim 1 before moving the input to the model device.
        encoder_input = waveform.unsqueeze(1).to(self.device)
        return self.netA.encoder(encoder_input)

    @torch.no_grad()
    def inverse(self, neu):
        """
        Decodes a neuralgram back into audio
        Args:
            neu (torch.tensor): PyTorch tensor containing neuralgram (batch_size, channels, timesteps)
        Returns:
            torch.tensor:  Inverted raw audio (batch_size, timesteps)
        """
        decoded = self.netA.decoder(neu.to(self.device))
        # Drop the channel axis at dim 1 (only removed when it has size 1).
        return decoded.squeeze(1)

class Stretcher:
    """Time-scales audio by resizing the last axis of its neuralgram."""

    def __init__(self, path, device=None):
        """
        Args:
            path: checkpoint path forwarded to Neuralgram
            device (str or torch.device, optional): device forwarded to Neuralgram
        """
        self.neuralgram = Neuralgram(path, device)

    @torch.no_grad()
    def __call__(self, audio, rate, interpolation=InterpolationMode.BICUBIC):
        """
        Encode the audio, resize the neuralgram's last axis by 1/rate, and decode it.

        Args:
            audio: audio accepted by Neuralgram.__call__
            rate (float): scaling rate, must be > 0. The neuralgram's last axis
                is resized to int(length * (1 / rate)) steps (at least 1), so a
                rate above 1 presumably yields shorter audio.
            interpolation (InterpolationMode): NEAREST | BILINEAR | BICUBIC
        Returns:
            For rate == 1 the input is returned without going through the model
            (a torch.Tensor is converted to a numpy array, anything else is
            returned as is). Otherwise the torch.tensor produced by
            Neuralgram.inverse.
        Raises:
            ValueError: if rate is not strictly positive
        """
        if rate <= 0:
            raise ValueError(f"rate must be positive, got {rate!r}")
        if rate == 1:
            if isinstance(audio, torch.Tensor):
                # detach()/cpu() keep the numpy conversion from failing on
                # tensors that require grad or live on a CUDA device.
                return audio.detach().cpu().numpy()
            return audio
        neu = self.neuralgram(audio)
        # Clamp to one step so a very large rate cannot request an empty output.
        target_steps = max(1, int(neu.shape[-1] * (1 / rate)))
        # resize() works on the last two axes; every axis between the first and
        # the last keeps its size, only the last one is rescaled.
        neu_resized = resize(
            neu,
            (*neu.shape[1:-1], target_steps),
            interpolation,
        )
        return self.neuralgram.inverse(neu_resized)