ernestchu committed
Commit 32bac05
1 Parent(s): 19b2e5e
.gitignore ADDED
@@ -0,0 +1,7 @@
+flagged
+__pycache__
+.DS_Store
+*.swp
+*.egg-info
+build
+
app.py ADDED
@@ -0,0 +1,123 @@
+import os
+from tsmnet import Stretcher
+import gradio as gr
+from gradio import processing_utils
+import torch
+import torchaudio
+
+model_root = './weights'
+available_models = ['general', 'pop-music', 'classical-music', 'speech']
+working_sr = 22050
+
+def prepare_models():
+    # Load one Stretcher per available weight file.
+    return {
+        weight: Stretcher(os.path.join(model_root, f'{weight}.pt'))
+        for weight in available_models
+    }
+
+def prepare_audio_file(rec, audio_file, yt_url):
+    # Return whichever input source the user actually provided.
+    if rec is not None:
+        return rec
+    if audio_file is not None:
+        return audio_file
+    if yt_url != '':
+        # The YouTube tab is disabled in the UI below; this branch is a stub.
+        raise gr.Error('YouTube support is under construction!')
+    raise gr.Error('No audio found!')
+
+
+def run(rec, audio_file, yt_url, speed, model, start_time, end_time):
+    audio_file = prepare_audio_file(rec, audio_file, yt_url)
+    if speed == 1:
+        return processing_utils.audio_from_file(audio_file)
+
+    model = models[model]
+
+    x, sr = torchaudio.load(audio_file)
+    x = torchaudio.transforms.Resample(orig_freq=sr, new_freq=working_sr)(x)
+    sr = working_sr
+    # Trim to the requested segment before stretching.
+    x = x[..., int(start_time * sr):int(end_time * sr)]
+
+    x = model(x, speed).cpu()
+
+    torchaudio.save(audio_file, x, sr)
+
+    return processing_utils.audio_from_file(audio_file)
+
+
+# @@@@@@@ Start of the program @@@@@@@@
+
+models = prepare_models()
+
+with gr.Blocks() as demo:
+    gr.Markdown('# TSM-Net')
+    gr.Markdown('---')
+    with gr.Row():
+        with gr.Column():
+            with gr.Tab('From microphone'):
+                rec_box = gr.Audio(label='Recording', source='microphone', type='filepath')
+            with gr.Tab('From file'):
+                audio_file_box = gr.Audio(label='Audio sample', type='filepath')
+            with gr.Tab('From YouTube'):
+                yt_url_box = gr.Textbox(label='YouTube URL', placeholder='Under Construction', interactive=False)
+
+            # Keep the three input sources mutually exclusive.
+            rec_box.change(lambda: [None] * 2, outputs=[audio_file_box, yt_url_box])
+            audio_file_box.change(lambda: [None] * 2, outputs=[rec_box, yt_url_box])
+            yt_url_box.input(lambda: [None] * 2, outputs=[rec_box, audio_file_box])
+
+            speed_box = gr.Slider(label='Playback speed', minimum=0, maximum=2, value=1)
+            with gr.Accordion('Fine-grained settings', open=False):
+                with gr.Row():
+                    gr.Textbox(label='', value='Trim audio sample', interactive=False)
+                    start_time_box = gr.Number(label='Start', value=0)
+                    end_time_box = gr.Number(label='End', value=20)
+
+                model_box = gr.Dropdown(label='Model weight', choices=available_models, value=available_models[0])
+
+            submit_btn = gr.Button('Submit')
+
+        with gr.Column():
+            with gr.Accordion('Hint', open=False):
+                gr.Markdown('You can find more settings under **Fine-grained settings**.')
+                gr.Markdown('- Feeling slow? Try adjusting the start/end timestamps.')
+                gr.Markdown('- Low audio quality? Try switching to a more suitable model weight.')
+            outputs = gr.Audio(label='Output')
+
+    submit_btn.click(fn=run, inputs=[
+        rec_box,
+        audio_file_box,
+        yt_url_box,
+        speed_box,
+        model_box,
+        start_time_box,
+        end_time_box,
+    ], outputs=outputs)
+
+    with gr.Accordion('Read more ...', open=False):
+        gr.Markdown('---')
+        gr.Markdown(
+            'We propose a novel approach to time-scale modification '
+            'of audio signals. Traditional methods rely on the framing technique, '
+            'while spectral approaches use the short-time Fourier transform to '
+            'preserve frequency during temporal stretching. TSM-Net, our '
+            'neural-network model, encodes the raw audio into a high-level latent '
+            'representation we call the Neuralgram, in which one vector represents '
+            '1024 audio samples. It is inspired by the framing technique but '
+            'addresses the clipping artifacts. Since the Neuralgram is a '
+            'two-dimensional matrix of real values, we can apply existing '
+            'image-resizing techniques to it and decode the result with our neural '
+            'decoder to obtain the time-scaled audio. Both the encoder and decoder '
+            'are trained with GANs, which show fair generalization ability on the '
+            'scaled Neuralgrams. Our method yields few artifacts and opens a new '
+            'possibility in the research of modern time-scale modification. '
+            'Please find more details in our '
+            '<a href="https://arxiv.org/abs/2210.17152" target="_blank">paper</a>.'
+        )
+
+demo.queue(4)
+demo.launch(server_name='0.0.0.0')
+
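The YouTube tab above is an acknowledged stub. Since requirements.txt below already ships yt-dlp, a download helper along these lines could presumably back prepare_audio_file one day; download_youtube_audio and the output name are hypothetical, and only documented yt-dlp options are used:

import yt_dlp

def download_youtube_audio(url, out='yt-audio'):
    # Hypothetical helper: grab the best audio-only stream and let ffmpeg
    # (listed in packages.txt) extract it to wav.
    opts = {
        'format': 'bestaudio/best',
        'outtmpl': out + '.%(ext)s',
        'postprocessors': [{'key': 'FFmpegExtractAudio', 'preferredcodec': 'wav'}],
    }
    with yt_dlp.YoutubeDL(opts) as ydl:
        ydl.download([url])
    return out + '.wav'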
packages.txt ADDED
@@ -0,0 +1 @@
+ffmpeg
requirements.txt ADDED
@@ -0,0 +1,7 @@
+./tsmnet
+torch
+torchvision
+torchaudio
+yt-dlp
+wget
+
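Note: ./tsmnet is a local path dependency, so pip builds the bundled package from tsmnet/setup.py below; that is what makes "from tsmnet import Stretcher" in app.py importable. yt-dlp and wget appear to be staged for the still-unfinished YouTube tab.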
tsmnet/setup.py ADDED
@@ -0,0 +1,7 @@
+from setuptools import setup
+
+setup(
+    name='tsmnet',
+    version='1.0.0',
+    packages=['tsmnet'],
+)
tsmnet/tsmnet/__init__.py ADDED
@@ -0,0 +1 @@
+from tsmnet.interface import load_model, Neuralgram, Stretcher
tsmnet/tsmnet/dataset.py ADDED
@@ -0,0 +1,79 @@
+import torch
+import torch.utils.data
+import torch.nn.functional as F
+import torchaudio
+
+
+from pathlib import Path
+import numpy as np
+import random
+
+
+def files_to_list(filename):
+    """
+    Takes a text file of filenames and makes a list of filenames
+    """
+    with open(filename, encoding="utf-8") as f:
+        files = f.readlines()
+
+    files = [f.rstrip() for f in files]
+    return files
+
+
+class AudioDataset(torch.utils.data.Dataset):
+    """
+    Loads audio files and returns fixed-length, randomly cropped segments.
+    """
+
+    def __init__(self, training_files, segment_length, sampling_rate, augment=True):
+        self.sampling_rate = sampling_rate
+        self.segment_length = segment_length
+        self.audio_files = files_to_list(training_files)
+        self.audio_files = [Path(training_files).parent / x for x in self.audio_files]
+        random.seed(1234)
+        random.shuffle(self.audio_files)
+        self.augment = augment
+
+    def __getitem__(self, index):
+        # Read audio
+        filename = self.audio_files[index]
+        try:
+            audio, sampling_rate = self.load_wav_to_torch(filename)
+        except RuntimeError:
+            # there are lots of corrupted files in FMA
+            print(f'Found corrupted file: {filename}, using empty data instead')
+            audio = torch.tensor([])
+        # Take a random segment, zero-padding if the file is too short
+        if audio.size(0) >= self.segment_length:
+            max_audio_start = audio.size(0) - self.segment_length
+            audio_start = random.randint(0, max_audio_start)
+            audio = audio[audio_start : audio_start + self.segment_length]
+        else:
+            audio = F.pad(
+                audio, (0, self.segment_length - audio.size(0)), "constant"
+            ).data
+
+        # audio = audio / 32768.0
+        return audio.unsqueeze(0)
+
+    def __len__(self):
+        return len(self.audio_files)
+
+    def load_wav_to_torch(self, full_path):
+        """
+        Loads audio into a torch array, resampled to self.sampling_rate
+        """
+        data, sampling_rate = torchaudio.load(str(full_path))
+        data = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=self.sampling_rate)(data)
+        sampling_rate = self.sampling_rate
+
+        if len(data.shape) > 1:
+            # convert to mono by picking a random channel
+            data = data[random.randint(0, data.shape[0]-1)]
+
+        if self.augment:
+            # random amplitude augmentation
+            amplitude = np.random.uniform(low=0.3, high=1.0)
+            data = data * amplitude
+
+        return data.float(), sampling_rate
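For reference, a short sketch of how this dataset would be consumed; train_files.txt is a hypothetical file list (one wav path per line, relative to the list's own directory), and the numbers mirror weights/args.yml:

from torch.utils.data import DataLoader
from tsmnet.dataset import AudioDataset

dataset = AudioDataset('train_files.txt', segment_length=8192, sampling_rate=22050)
loader = DataLoader(dataset, batch_size=2, shuffle=True)   # batch_size from args.yml
for batch in loader:
    print(batch.shape)   # torch.Size([2, 1, 8192]): a batch of mono segments
    break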
tsmnet/tsmnet/interface.py ADDED
@@ -0,0 +1,81 @@
+from tsmnet.modules import Autoencoder
+
+from torchvision.transforms.functional import resize
+from torchvision.transforms import InterpolationMode
+from pathlib import Path
+import yaml
+import torch
+import os
+
+
+def get_default_device():
+    if torch.cuda.is_available():
+        return "cuda"
+    else:
+        return "cpu"
+
+
+def load_model(path, device=get_default_device()):
+    """
+    Args:
+        path (str or Path): path to an autoencoder checkpoint; an args.yml
+            with the training hyperparameters must sit in the same folder
+        device (str or torch.device): device to load the model
+    """
+    with open(os.path.join(os.path.dirname(path), "args.yml"), "r") as f:
+        args = yaml.unsafe_load(f)
+    netA = Autoencoder([int(n) for n in args.compress_ratios], args.ngf, args.n_residual_layers).to(device)
+    netA.load_state_dict(torch.load(path, map_location=device))
+    return netA
+
+
+class Neuralgram:
+    def __init__(
+        self,
+        path,
+        device=None,
+    ):
+        if device is None:
+            device = get_default_device()
+        self.device = device
+        self.netA = load_model(path, device)
+
+    def __call__(self, audio):
+        """
+        Performs audio to Neuralgram conversion (see Autoencoder.encoder in tsmnet/modules.py)
+        Args:
+            audio (torch.tensor): PyTorch tensor containing audio (batch_size, timesteps)
+        Returns:
+            torch.tensor: Neuralgram computed on input audio (batch_size, channels, timesteps)
+        """
+        with torch.no_grad():
+            return self.netA.encoder(torch.as_tensor(audio).unsqueeze(1).to(self.device))
+
+    def inverse(self, neu):
+        """
+        Performs Neuralgram to audio conversion
+        Args:
+            neu (torch.tensor): PyTorch tensor containing Neuralgram (batch_size, channels, timesteps)
+        Returns:
+            torch.tensor: Inverted raw audio (batch_size, timesteps)
+        """
+        with torch.no_grad():
+            return self.netA.decoder(neu.to(self.device)).squeeze(1)
+
+
+class Stretcher:
+    def __init__(self, path, device=None):
+        self.neuralgram = Neuralgram(path, device)
+
+    @torch.no_grad()
+    def __call__(self, audio, rate, interpolation=InterpolationMode.NEAREST):  # NEAREST | BILINEAR | BICUBIC
+        if rate == 1:
+            return audio.numpy() if isinstance(audio, torch.Tensor) else audio
+        neu = self.neuralgram(audio)
+        # Resize the Neuralgram along time; rate > 1 shortens, rate < 1 lengthens
+        neu_resized = resize(
+            neu,
+            (*neu.shape[1:-1], int(neu.shape[-1] * (1 / rate))),
+            interpolation
+        )
+        return self.neuralgram.inverse(neu_resized)
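For reference, a minimal offline sketch of the interface above; the wav file names are placeholders, and the calls mirror what app.py does:

import torch
import torchaudio
from tsmnet import Stretcher

stretcher = Stretcher('./weights/general.pt')   # also reads weights/args.yml next to it
x, sr = torchaudio.load('input.wav')            # x: (channels, samples)
x = torchaudio.transforms.Resample(orig_freq=sr, new_freq=22050)(x)
y = stretcher(x, 1.25)                          # rate > 1 plays faster at the same pitch
# rate == 1 returns numpy, so normalize to a tensor before saving
torchaudio.save('output.wav', torch.as_tensor(y).cpu(), 22050)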
tsmnet/tsmnet/modules.py ADDED
@@ -0,0 +1,186 @@
+import torch.nn as nn
+import torch.nn.functional as F
+import torch
+from torch.nn.utils import weight_norm
+import numpy as np
+
+
+def weights_init(m):
+    classname = m.__class__.__name__
+    if classname.find("Conv") != -1:
+        m.weight.data.normal_(0.0, 0.02)
+    elif classname.find("BatchNorm2d") != -1:
+        m.weight.data.normal_(1.0, 0.02)
+        m.bias.data.fill_(0)
+
+
+def WNConv1d(*args, **kwargs):
+    return weight_norm(nn.Conv1d(*args, **kwargs))
+
+
+def WNConvTranspose1d(*args, **kwargs):
+    return weight_norm(nn.ConvTranspose1d(*args, **kwargs))
+
+
+class ResnetBlock(nn.Module):
+    def __init__(self, dim, dilation=1):
+        super().__init__()
+        self.block = nn.Sequential(
+            nn.Tanh(),
+            nn.ReflectionPad1d(dilation),
+            WNConv1d(dim, dim, kernel_size=3, dilation=dilation),
+            nn.Tanh(),
+            WNConv1d(dim, dim, kernel_size=1),
+        )
+        self.shortcut = WNConv1d(dim, dim, kernel_size=1)
+
+    def forward(self, x):
+        return self.shortcut(x) + self.block(x)
+
+
+class Autoencoder(nn.Module):
+    def __init__(self, compress_ratios, ngf, n_residual_layers):
+        super().__init__()
+
+        self.encoder = self.makeEncoder(compress_ratios, ngf, n_residual_layers)
+        self.decoder = self.makeDecoder([r for r in reversed(compress_ratios)], ngf, n_residual_layers)
+
+        self.apply(weights_init)
+
+    def makeEncoder(self, ratios, ngf, n_residual_layers):
+        mult = 1
+
+        model = [
+            nn.ReflectionPad1d(3),
+            WNConv1d(1, ngf, kernel_size=7, padding=0),
+            nn.Tanh(),
+        ]
+
+        # Downsample to Neuralgram scale
+        for i, r in enumerate(ratios):
+            mult *= 2
+
+            for j in range(n_residual_layers - 1, -1, -1):
+                model += [ResnetBlock(mult * ngf // 2, dilation=3 ** j)]
+
+            model += [
+                nn.Tanh(),
+                WNConv1d(
+                    mult * ngf // 2,
+                    mult * ngf,
+                    kernel_size=r * 2,
+                    stride=r,
+                    padding=r // 2 + r % 2
+                ),
+            ]
+
+        model += [nn.Tanh()]
+
+        return nn.Sequential(*model)
+
+    def makeDecoder(self, ratios, ngf, n_residual_layers):
+        mult = int(2 ** len(ratios))
+
+        model = []
+
+        # Upsample back to raw-audio scale
+        for i, r in enumerate(ratios):
+            model += [
+                nn.Tanh(),
+                WNConvTranspose1d(
+                    mult * ngf,
+                    mult * ngf // 2,
+                    kernel_size=r * 2,
+                    stride=r,
+                    padding=r // 2 + r % 2,
+                    output_padding=r % 2
+                ),
+            ]
+
+            for j in range(n_residual_layers):
+                model += [ResnetBlock(mult * ngf // 2, dilation=3 ** j)]
+
+            mult //= 2
+
+        model += [
+            nn.Tanh(),
+            nn.ReflectionPad1d(3),
+            WNConv1d(ngf, 1, kernel_size=7, padding=0),
+            nn.Tanh(),
+        ]
+
+        return nn.Sequential(*model)
+
+    def forward(self, x):
+        return self.decoder(self.encoder(x))
+
+
+class NLayerDiscriminator(nn.Module):
+    def __init__(self, ndf, n_layers, downsampling_factor):
+        super().__init__()
+        model = nn.ModuleDict()
+
+        model["layer_0"] = nn.Sequential(
+            nn.ReflectionPad1d(7),
+            WNConv1d(1, ndf, kernel_size=15),
+            nn.Tanh(),
+        )
+
+        nf = ndf
+        stride = downsampling_factor
+        for n in range(1, n_layers + 1):
+            nf_prev = nf
+            nf = min(nf * stride, 1024)
+
+            model["layer_%d" % n] = nn.Sequential(
+                WNConv1d(
+                    nf_prev,
+                    nf,
+                    kernel_size=stride * 10 + 1,
+                    stride=stride,
+                    padding=stride * 5,
+                    groups=nf_prev // 4,
+                ),
+                nn.Tanh(),
+            )
+
+        nf_prev = nf  # keep channel counts consistent after the loop
+        nf = min(nf * 2, 1024)
+        model["layer_%d" % (n_layers + 1)] = nn.Sequential(
+            WNConv1d(nf_prev, nf, kernel_size=5, stride=1, padding=2),
+            nn.Tanh(),
+        )
+
+        model["layer_%d" % (n_layers + 2)] = WNConv1d(
+            nf, 1, kernel_size=3, stride=1, padding=1
+        )
+
+        self.model = model
+
+    def forward(self, x):
+        results = []
+        for key, layer in self.model.items():
+            x = layer(x)
+            results.append(x)
+        return results
+
+
+class Discriminator(nn.Module):
+    def __init__(self, num_D, ndf, n_layers, downsampling_factor):
+        super().__init__()
+        self.model = nn.ModuleDict()
+        for i in range(num_D):
+            self.model[f"disc_{i}"] = NLayerDiscriminator(
+                ndf, n_layers, downsampling_factor
+            )
+
+        # Each sub-discriminator sees the audio at half the previous resolution
+        self.downsample = nn.AvgPool1d(4, stride=2, padding=1, count_include_pad=False)
+        self.apply(weights_init)
+
+    def forward(self, x):
+        results = []
+        for key, disc in self.model.items():
+            results.append(disc(x))
+            x = self.downsample(x)
+        return results
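As a sanity check on the architecture, a small sketch assuming the hyperparameters shipped in weights/args.yml (compress_ratios '22488', ngf 32, n_residual_layers 1). The strides multiply to 2*2*4*8*8 = 1024, so one Neuralgram vector covers 1024 samples:

import torch
from tsmnet.modules import Autoencoder

netA = Autoencoder([2, 2, 4, 8, 8], ngf=32, n_residual_layers=1)
x = torch.randn(1, 1, 8192)   # (batch, 1, samples); seq_len from args.yml
neu = netA.encoder(x)
print(neu.shape)              # torch.Size([1, 1024, 8]): 1024x shorter in time
y = netA(x)                   # full roundtrip through encoder + decoder
print(y.shape)                # torch.Size([1, 1, 8192]): original length restored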
tsmnet/tsmnet/utils.py ADDED
@@ -0,0 +1,13 @@
+import scipy.io.wavfile
+
+
+def save_sample(file_path, sampling_rate, audio):
+    """Helper function to save a sample
+
+    Args:
+        file_path (str or pathlib.Path): save file path
+        sampling_rate (int): sampling rate of audio (usually 22050)
+        audio (torch.FloatTensor): torch array containing audio in [-1, 1]
+    """
+    audio = (audio.numpy() * 32768).astype("int16")
+    scipy.io.wavfile.write(file_path, sampling_rate, audio)
weights/args.yml ADDED
@@ -0,0 +1,24 @@
+!!python/object:argparse.Namespace
+batch_size: 2
+compress_ratios: '22488'
+cond_disc: false
+data_path: !!python/object/apply:pathlib.PosixPath
+- /
+- home
+- b073040018
+- Datasets
+downsamp_factor: 4
+epochs: 3000
+lambda_feat: 10
+load_path: logs-all/weights
+log_interval: 100
+n_layers_D: 4
+n_residual_layers: 1
+n_test_samples: 8
+ndf: 16
+ngf: 32
+num_D: 3
+project: tsmnet-all
+save_interval: 1000
+save_path: logs-all2
+seq_len: 8192
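For orientation: load_model in tsmnet/tsmnet/interface.py parses compress_ratios digit by digit, so the string '22488' becomes the ratio list [2, 2, 4, 8, 8]; the product 2*2*4*8*8 = 1024 is exactly the "one vector represents 1024 audio samples" figure quoted in app.py. seq_len 8192 is the training segment length consumed by tsmnet/tsmnet/dataset.py.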
weights/classical-music.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c736e5c7414354ad2789b4d8dd6d3ab2d5813f52fb0982818a4fff8887d2eeba
+size 100400811
weights/general.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e70b0ca672ab2008da3517ae3eb524135a1ef5685d59cc034084316a665f69f6
+size 100400920
weights/pop-music.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f3010d34e0d538ecb4c63c8bc89ad4023630dc36e2746bb71b799026d2b03ad4
+size 100400898
weights/speech.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e29674ce2312e1ba8f9071348de84031e8afbb08412cbc8088b7365f2162f497
+size 100400879