Spaces:

descript
/

vampnet

Runtime error

App Files Files Community

harp

#10

by hugggof - opened Oct 10, 2023

base: refs/heads/main

←

from: refs/pr/10

Discussion Files changed

+377

-34

Files changed (9) hide show

.gitignore +4 -0
app.py +71 -11
conf/lora/lora.yml +2 -2
conf/vampnet.yml +1 -1
scripts/exp/train.py +7 -7
scripts/utils/{augment.py → data/augment.py} +1 -1
scripts/utils/{maestro-reorg.py → data/maestro-reorg.py} +0 -0
scripts/utils/gtzan_embeddings.py +263 -0
vampnet/modules/transformer.py +28 -12

.gitignore CHANGED Viewed

@@ -182,3 +182,7 @@ models.zip
 .git-old
 conf/generated/*
 runs*/

 .git-old
 conf/generated/*
 runs*/
+gtzan.zip
+.gtzan_emb_cache

app.py CHANGED Viewed

@@ -21,8 +21,7 @@ import gradio as gr
 from vampnet.interface import Interface
 from vampnet import mask as pmask
-# Interface = argbind.bind(Interface)
-# AudioLoader = argbind.bind(at.data.datasets.AudioLoader)
@@ -54,13 +53,6 @@ def load_interface():
 interface = load_interface()
-# dataset = at.data.datasets.AudioDataset(
-#     loader,
-#     sample_rate=interface.codec.sample_rate,
-#     duration=interface.coarse.chunk_size_s,
-#     n_examples=5000,
-#     without_replacement=True,
-# )
 OUT_DIR = Path("gradio-outputs")
 OUT_DIR.mkdir(exist_ok=True, parents=True)
@@ -250,6 +242,46 @@ def save_vamp(data):
     return f"saved! your save code is {out_dir.stem}", zip_path
 with gr.Blocks() as demo:
@@ -373,7 +405,7 @@ with gr.Blocks() as demo:
                     minimum=0,
                     maximum=128,
                     step=1,
-                    value=5,
                 )
@@ -386,7 +418,7 @@ with gr.Blocks() as demo:
                 )
                 beat_mask_width = gr.Slider(
-                    label="beat mask width (in milliseconds)",
                     minimum=0,
                     maximum=200,
                     value=0,
@@ -546,6 +578,14 @@ with gr.Blocks() as demo:
         # mask settings
         with gr.Column():
             vamp_button = gr.Button("generate (vamp)!!!")
             output_audio = gr.Audio(
                 label="output audio",
@@ -620,4 +660,24 @@ with gr.Blocks() as demo:
         outputs=[thank_you, download_file]
     )
 demo.launch()

 from vampnet.interface import Interface
 from vampnet import mask as pmask
+from pyharp import ModelCard, build_endpoint
 interface = load_interface()
 OUT_DIR = Path("gradio-outputs")
 OUT_DIR.mkdir(exist_ok=True, parents=True)
     return f"saved! your save code is {out_dir.stem}", zip_path
+def harp_vamp(_input_audio, _beat_mask_width, _sampletemp):
+    out_dir = OUT_DIR / str(uuid.uuid4())
+    out_dir.mkdir()
+    sig = at.AudioSignal(_input_audio)
+    sig = interface.preprocess(sig)
+    z = interface.encode(sig)
+    # build the mask
+    mask = pmask.linear_random(z, 1.0)
+    if _beat_mask_width > 0:
+        beat_mask = interface.make_beat_mask(
+            sig,
+            after_beat_s=(_beat_mask_width/1000),
+        )
+        mask = pmask.mask_and(mask, beat_mask)
+    # save the mask as a txt file
+    zv, mask_z = interface.coarse_vamp(
+        z,
+        mask=mask,
+        sampling_temperature=_sampletemp,
+        return_mask=True,
+        gen_fn=interface.coarse.generate,
+    )
+    zv = interface.coarse_to_fine(
+        zv,
+        sampling_temperature=_sampletemp,
+        mask=mask,
+    )
+    sig = interface.to_signal(zv).cpu()
+    print("done")
+    sig.write(out_dir / "output.wav")
+    return sig.path_to_file
 with gr.Blocks() as demo:
                     minimum=0,
                     maximum=128,
                     step=1,
+                    value=3,
                 )
                 )
                 beat_mask_width = gr.Slider(
+                    label="beat prompt (ms)",
                     minimum=0,
                     maximum=200,
                     value=0,
         # mask settings
         with gr.Column():
+            # lora_choice = gr.Dropdown(
+            #     label="lora choice",
+            #     choices=list(loras.keys()),
+            #     value=LORA_NONE,
+            #     visible=False
+            # )
             vamp_button = gr.Button("generate (vamp)!!!")
             output_audio = gr.Audio(
                 label="output audio",
         outputs=[thank_you, download_file]
     )
+    # harp stuff
+    harp_inputs = [
+        input_audio,
+        beat_mask_width,
+        sampletemp,
+    ]
+    build_endpoint(
+        inputs=harp_inputs,
+        output=output_audio,
+        process_fn=harp_vamp,
+        card=ModelCard(
+            name="vampnet",
+            description="Generate variations on music input, based on small prompts around the beat.",
+            author="Hugo Flores García",
+            tags=["music", "generative"]
+        ),
+        visible=False
+    )
 demo.launch()

conf/lora/lora.yml CHANGED Viewed

@@ -9,9 +9,9 @@ val/AudioDataset.n_examples: 500
 NoamScheduler.warmup: 500
-batch_size: 7
 num_workers: 7
-save_iters: [10000, 20000, 30000, 40000, 50000]
 sample_freq: 1000
 val_freq: 500

 NoamScheduler.warmup: 500
+batch_size: 6
 num_workers: 7
+save_iters: [10000, 20000, 30000, 40000, 50000, 100000]
 sample_freq: 1000
 val_freq: 500

conf/vampnet.yml CHANGED Viewed

@@ -32,7 +32,7 @@ VampNet.n_heads: 20
 VampNet.flash_attn: false
 VampNet.dropout: 0.1
-AudioLoader.relative_path: /data/
 AudioDataset.loudness_cutoff: -30.0
 AudioDataset.without_replacement: true
 AudioLoader.shuffle: true

 VampNet.flash_attn: false
 VampNet.dropout: 0.1
+AudioLoader.relative_path: ""
 AudioDataset.loudness_cutoff: -30.0
 AudioDataset.without_replacement: true
 AudioLoader.shuffle: true

scripts/exp/train.py CHANGED Viewed

@@ -224,7 +224,7 @@ def train_loop(state: State, batch: dict, accel: Accelerator):
         dtype = torch.bfloat16 if accel.amp else None
         with accel.autocast(dtype=dtype):
-            z_hat = state.model(z_mask_latent, r)
         target = codebook_flatten(
             z[:, vn.n_conditioning_codebooks :, :],
@@ -289,7 +289,7 @@ def val_loop(state: State, batch: dict, accel: Accelerator):
     z_mask_latent = vn.embedding.from_codes(z_mask, state.codec)
-    z_hat = state.model(z_mask_latent, r)
     target = codebook_flatten(
         z[:, vn.n_conditioning_codebooks :, :],
@@ -408,19 +408,19 @@ def save_imputation(state, z, val_idx, writer):
     for i in range(len(val_idx)):
         imputed_noisy[i].cpu().write_audio_to_tb(
-            f"imputed_noisy/{i}",
             writer,
             step=state.tracker.step,
             plot_fn=None,
         )
         imputed[i].cpu().write_audio_to_tb(
-            f"imputed/{i}",
             writer,
             step=state.tracker.step,
             plot_fn=None,
         )
         imputed_true[i].cpu().write_audio_to_tb(
-            f"imputed_true/{i}",
             writer,
             step=state.tracker.step,
             plot_fn=None,
@@ -450,7 +450,7 @@ def save_samples(state: State, val_idx: int, writer: SummaryWriter):
     z_mask_latent = vn.embedding.from_codes(z_mask, state.codec)
-    z_hat = state.model(z_mask_latent, r)
     z_pred = torch.softmax(z_hat, dim=1).argmax(dim=1)
     z_pred = codebook_unflatten(z_pred, n_c=vn.n_predict_codebooks)
@@ -469,7 +469,7 @@ def save_samples(state: State, val_idx: int, writer: SummaryWriter):
         }
         for k, v in audio_dict.items():
             v.cpu().write_audio_to_tb(
-                f"samples/_{i}.r={r[i]:0.2f}/{k}",
                 writer,
                 step=state.tracker.step,
                 plot_fn=None,

         dtype = torch.bfloat16 if accel.amp else None
         with accel.autocast(dtype=dtype):
+            z_hat = state.model(z_mask_latent)
         target = codebook_flatten(
             z[:, vn.n_conditioning_codebooks :, :],
     z_mask_latent = vn.embedding.from_codes(z_mask, state.codec)
+    z_hat = state.model(z_mask_latent)
     target = codebook_flatten(
         z[:, vn.n_conditioning_codebooks :, :],
     for i in range(len(val_idx)):
         imputed_noisy[i].cpu().write_audio_to_tb(
+            f"inpainted_prompt/{i}",
             writer,
             step=state.tracker.step,
             plot_fn=None,
         )
         imputed[i].cpu().write_audio_to_tb(
+            f"inpainted_middle/{i}",
             writer,
             step=state.tracker.step,
             plot_fn=None,
         )
         imputed_true[i].cpu().write_audio_to_tb(
+            f"reconstructed/{i}",
             writer,
             step=state.tracker.step,
             plot_fn=None,
     z_mask_latent = vn.embedding.from_codes(z_mask, state.codec)
+    z_hat = state.model(z_mask_latent)
     z_pred = torch.softmax(z_hat, dim=1).argmax(dim=1)
     z_pred = codebook_unflatten(z_pred, n_c=vn.n_predict_codebooks)
         }
         for k, v in audio_dict.items():
             v.cpu().write_audio_to_tb(
+                f"onestep/_{i}.r={r[i]:0.2f}/{k}",
                 writer,
                 step=state.tracker.step,
                 plot_fn=None,

scripts/utils/{augment.py → data/augment.py} RENAMED Viewed

@@ -64,4 +64,4 @@ if __name__ == "__main__":
     args = argbind.parse_args()
     with argbind.scope(args):
-        augment()

     args = argbind.parse_args()
     with argbind.scope(args):
+        augment()

scripts/utils/{maestro-reorg.py → data/maestro-reorg.py} RENAMED Viewed

File without changes

scripts/utils/gtzan_embeddings.py ADDED Viewed

	@@ -0,0 +1,263 @@

+"""
+TODO: train a linear probe
+usage:
+   python gtzan_embeddings.py --args.load conf/interface.yml --Interface.device cuda --path_to_gtzan /path/to/gtzan/genres_original  --output_dir /path/to/output
+"""
+from pathlib import Path
+from typing import List
+import audiotools as at
+from audiotools import AudioSignal
+import argbind
+import torch
+import numpy as np
+import zipfile
+import json
+from vampnet.interface import Interface
+import tqdm
+# bind the Interface to argbind
+Interface = argbind.bind(Interface)
+DEBUG = False
+def smart_plotly_export(fig, save_path):
+    img_format = save_path.split('.')[-1]
+    if img_format == 'html':
+        fig.write_html(save_path)
+    elif img_format == 'bytes':
+        return fig.to_image(format='png')
+    #TODO: come back and make this prettier
+    elif img_format == 'numpy':
+        import io
+        from PIL import Image
+        def plotly_fig2array(fig):
+            #convert Plotly fig to  an array
+            fig_bytes = fig.to_image(format="png", width=1200, height=700)
+            buf = io.BytesIO(fig_bytes)
+            img = Image.open(buf)
+            return np.asarray(img)
+        return plotly_fig2array(fig)
+    elif img_format == 'jpeg' or 'png' or 'webp':
+        fig.write_image(save_path)
+    else:
+        raise ValueError("invalid image format")
+def dim_reduce(emb, labels, save_path, n_components=3, method='tsne', title=''):
+    """
+    dimensionality reduction for visualization!
+    saves an html plotly figure to save_path
+    parameters:
+        emb (np.ndarray): the samples to be reduces with shape (samples, features)
+        labels (list): list of labels for embedding
+        save_path (str): path where u wanna save ur figure
+        method (str): umap, tsne, or pca
+        title (str): title for ur figure
+    returns:
+        proj (np.ndarray): projection vector with shape (samples, dimensions)
+    """
+    import pandas as pd
+    import plotly.express as px
+    if method == 'umap':
+        reducer = umap.UMAP(n_components=n_components)
+    elif method == 'tsne':
+        from sklearn.manifold import TSNE
+        reducer = TSNE(n_components=n_components)
+    elif method == 'pca':
+        from sklearn.decomposition import PCA
+        reducer = PCA(n_components=n_components)
+    else:
+        raise ValueError
+    proj = reducer.fit_transform(emb)
+    if n_components == 2:
+        df = pd.DataFrame(dict(
+            x=proj[:, 0],
+            y=proj[:, 1],
+            instrument=labels
+        ))
+        fig = px.scatter(df, x='x', y='y', color='instrument',
+                        title=title+f"_{method}")
+    elif n_components == 3:
+        df = pd.DataFrame(dict(
+            x=proj[:, 0],
+            y=proj[:, 1],
+            z=proj[:, 2],
+            instrument=labels
+        ))
+        fig = px.scatter_3d(df, x='x', y='y', z='z',
+                        color='instrument',
+                        title=title)
+    else:
+        raise ValueError("cant plot more than 3 components")
+    fig.update_traces(marker=dict(size=6,
+                                  line=dict(width=1,
+                                            color='DarkSlateGrey')),
+                      selector=dict(mode='markers'))
+    return smart_plotly_export(fig, save_path)
+# per JukeMIR, we want the emebddings from the middle layer?
+def vampnet_embed(sig: AudioSignal, interface: Interface, layer=10):
+    with torch.inference_mode():
+        # preprocess the signal
+        sig = interface.preprocess(sig)
+        # get the coarse vampnet model
+        vampnet = interface.coarse
+        # get the tokens
+        z = interface.encode(sig)[:, :vampnet.n_codebooks, :]
+        z_latents = vampnet.embedding.from_codes(z, interface.codec)
+        # do a forward pass through the model, get the embeddings
+        _z, embeddings = vampnet(z_latents, return_activations=True)
+        # print(f"got embeddings with shape {embeddings.shape}")
+        # [layer, batch, time, n_dims]
+        # [20, 1, 600ish, 768]
+        # squeeze batch dim (1 bc layer should be dim 0)
+        assert embeddings.shape[1] == 1, f"expected batch dim to be 1, got {embeddings.shape[0]}"
+        embeddings = embeddings.squeeze(1)
+        num_layers = embeddings.shape[0]
+        assert layer < num_layers, f"layer {layer} is out of bounds for model with {num_layers} layers"
+        # do meanpooling over the time dimension
+        embeddings = embeddings.mean(dim=-2)
+        # [20, 768]
+        # return the embeddings
+        return embeddings
+from dataclasses import dataclass, fields
+@dataclass
+class Embedding:
+    genre: str
+    filename: str
+    embedding: np.ndarray
+    def save(self, path):
+        """Save the Embedding object to a given path as a zip file."""
+        with zipfile.ZipFile(path, 'w') as archive:
+            # Save numpy array
+            with archive.open('embedding.npy', 'w') as f:
+                np.save(f, self.embedding)
+            # Save non-numpy data as json
+            non_numpy_data = {f.name: getattr(self, f.name) for f in fields(self) if f.name != 'embedding'}
+            with archive.open('data.json', 'w') as f:
+                f.write(json.dumps(non_numpy_data).encode('utf-8'))
+    @classmethod
+    def load(cls, path):
+        """Load the Embedding object from a given zip path."""
+        with zipfile.ZipFile(path, 'r') as archive:
+            # Load numpy array
+            with archive.open('embedding.npy') as f:
+                embedding = np.load(f)
+            # Load non-numpy data from json
+            with archive.open('data.json') as f:
+                data = json.loads(f.read().decode('utf-8'))
+        return cls(embedding=embedding, **data)
+@argbind.bind(without_prefix=True)
+def main(
+    path_to_gtzan: str = None,
+    cache_dir: str = "./.gtzan_emb_cache",
+    output_dir: str = "./gtzan_vampnet_embeddings",
+    layers: List[int] = [1, 3, 5, 7, 9, 11, 13, 15, 17, 19]
+):
+    path_to_gtzan = Path(path_to_gtzan)
+    assert path_to_gtzan.exists(), f"{path_to_gtzan} does not exist"
+    cache_dir = Path(cache_dir)
+    output_dir = Path(output_dir)
+    output_dir.mkdir(exist_ok=True, parents=True)
+    # load our interface
+    # argbind will automatically load the default config,
+    interface = Interface()
+    # gtzan should have a folder for each genre, so let's get the list of genres
+    genres = [Path(x).name for x in path_to_gtzan.iterdir() if x.is_dir()]
+    print(f"Found {len(genres)} genres")
+    print(f"genres: {genres}")
+    # collect audio files, genres, and embeddings
+    data = []
+    for genre in genres:
+        audio_files = list(at.util.find_audio(path_to_gtzan / genre))
+        print(f"Found {len(audio_files)} audio files for genre {genre}")
+        for audio_file in tqdm.tqdm(audio_files, desc=f"embedding genre {genre}"):
+            # check if we have a cached embedding for this file
+            cached_path = (cache_dir / f"{genre}_{audio_file.stem}.emb")
+            if cached_path.exists():
+                # if so, load it
+                if DEBUG:
+                    print(f"loading cached embedding for {cached_path.stem}")
+                embedding = Embedding.load(cached_path)
+                data.append(embedding)
+            else:
+                try:
+                    sig = AudioSignal(audio_file)
+                except Exception as e:
+                    print(f"failed to load {audio_file.name} with error {e}")
+                    print(f"skipping {audio_file.name}")
+                    continue
+                # gets the embedding
+                emb = vampnet_embed(sig, interface).cpu().numpy()
+                # create an embedding we can save/load
+                embedding = Embedding(
+                    genre=genre,
+                    filename=audio_file.name,
+                    embedding=emb
+                )
+                # cache the embeddings
+                cached_path.parent.mkdir(exist_ok=True, parents=True)
+                embedding.save(cached_path)
+    # now, let's do a dim reduction on the embeddings
+    # and visualize them.
+    # collect a list of embeddings and labels
+    embeddings = [d.embedding for d in data]
+    labels = [d.genre for d in data]
+    # convert the embeddings to a numpy array
+    embeddings = np.stack(embeddings)
+    # do dimensionality reduction for each layer we're given
+    for layer in tqdm.tqdm(layers, desc="dim reduction"):
+        dim_reduce(
+            embeddings[:, layer, :], labels,
+            save_path=str(output_dir / f'vampnet-gtzan-layer={layer}.html'),
+            n_components=2, method='tsne',
+            title=f'vampnet-gtzan-layer={layer}'
+        )
+if __name__ == "__main__":
+    args = argbind.parse_args()
+    with argbind.scope(args):
+        main()

vampnet/modules/transformer.py CHANGED Viewed

@@ -410,7 +410,9 @@ class TransformerStack(nn.Module):
     def subsequent_mask(self, size):
         return torch.ones(1, size, size).tril().bool()
-    def forward(self, x, x_mask, cond=None, src=None, src_mask=None):
         """Computes a full transformer stack
         Parameters
         ----------
@@ -437,6 +439,8 @@ class TransformerStack(nn.Module):
         encoder_decoder_position_bias = None
         # Compute transformer layers
         for layer in self.layers:
             x, position_bias, encoder_decoder_position_bias = layer(
                 x=x,
@@ -447,8 +451,15 @@ class TransformerStack(nn.Module):
                 position_bias=position_bias,
                 encoder_decoder_position_bias=encoder_decoder_position_bias,
             )
-        return self.norm(x) if self.norm is not None else x
 class VampNet(at.ml.BaseModel):
@@ -456,7 +467,7 @@ class VampNet(at.ml.BaseModel):
         self,
         n_heads: int = 20,
         n_layers: int = 16,
-        r_cond_dim: int = 64,
         n_codebooks: int = 9,
         n_conditioning_codebooks: int = 0,
         latent_dim: int = 8,
@@ -467,6 +478,7 @@ class VampNet(at.ml.BaseModel):
         dropout: float = 0.1
     ):
         super().__init__()
         self.n_heads = n_heads
         self.n_layers = n_layers
         self.r_cond_dim = r_cond_dim
@@ -513,21 +525,25 @@ class VampNet(at.ml.BaseModel):
             ),
         )
-    def forward(self, x, cond):
         x = self.embedding(x)
         x_mask = torch.ones_like(x, dtype=torch.bool)[:, :1, :].squeeze(1)
-        cond = self.r_embed(cond)
         x = rearrange(x, "b d n -> b n d")
-        out = self.transformer(x=x, x_mask=x_mask, cond=cond)
         out = rearrange(out, "b n d -> b d n")
-        out = self.classifier(out, cond)
         out = rearrange(out, "b (p c) t -> b p (t c)", c=self.n_predict_codebooks)
-        return out
     def r_embed(self, r, max_positions=10000):
         if self.r_cond_dim > 0:
@@ -589,7 +605,7 @@ class VampNet(at.ml.BaseModel):
         top_p=None,
         return_signal=True,
         seed: int = None,
-        sample_cutoff: float = 0.5,
     ):
         if seed is not None:
             at.util.seed(seed)
@@ -660,7 +676,7 @@ class VampNet(at.ml.BaseModel):
             # infer from latents
             # NOTE: this collapses the codebook dimension into the sequence dimension
-            logits = self.forward(latents, r) # b, prob, seq
             logits = logits.permute(0, 2, 1)  # b, seq, prob
             b = logits.shape[0]
@@ -921,7 +937,7 @@ if __name__ == "__main__":
         z_mask_latent = torch.rand(
             batch_size, model.latent_dim * model.n_codebooks, seq_len
         ).to(device)
-        z_hat = model(z_mask_latent, r)
         pred = z_hat.argmax(dim=1)
         pred = model.embedding.unflatten(pred, n_codebooks=model.n_predict_codebooks)

     def subsequent_mask(self, size):
         return torch.ones(1, size, size).tril().bool()
+    def forward(self, x, x_mask, cond=None, src=None, src_mask=None,
+                return_activations: bool = False
+        ):
         """Computes a full transformer stack
         Parameters
         ----------
         encoder_decoder_position_bias = None
         # Compute transformer layers
+        if return_activations:
+            activations = []
         for layer in self.layers:
             x, position_bias, encoder_decoder_position_bias = layer(
                 x=x,
                 position_bias=position_bias,
                 encoder_decoder_position_bias=encoder_decoder_position_bias,
             )
+            if return_activations:
+                activations.append(x.detach())
+        out = self.norm(x) if self.norm is not None else x
+        if return_activations:
+            return out, torch.stack(activations)
+        else:
+            return out
 class VampNet(at.ml.BaseModel):
         self,
         n_heads: int = 20,
         n_layers: int = 16,
+        r_cond_dim: int = 0,
         n_codebooks: int = 9,
         n_conditioning_codebooks: int = 0,
         latent_dim: int = 8,
         dropout: float = 0.1
     ):
         super().__init__()
+        assert r_cond_dim == 0, f"r_cond_dim must be 0 (not supported), but got {r_cond_dim}"
         self.n_heads = n_heads
         self.n_layers = n_layers
         self.r_cond_dim = r_cond_dim
             ),
         )
+    def forward(self, x, return_activations: bool = False):
         x = self.embedding(x)
         x_mask = torch.ones_like(x, dtype=torch.bool)[:, :1, :].squeeze(1)
         x = rearrange(x, "b d n -> b n d")
+        out = self.transformer(x=x, x_mask=x_mask, return_activations=return_activations)
+        if return_activations:
+            out, activations = out
         out = rearrange(out, "b n d -> b d n")
+        out = self.classifier(out, None) # no cond here!
         out = rearrange(out, "b (p c) t -> b p (t c)", c=self.n_predict_codebooks)
+        if return_activations:
+            return out, activations
+        else:
+            return out
     def r_embed(self, r, max_positions=10000):
         if self.r_cond_dim > 0:
         top_p=None,
         return_signal=True,
         seed: int = None,
+        sample_cutoff: float = 1.0,
     ):
         if seed is not None:
             at.util.seed(seed)
             # infer from latents
             # NOTE: this collapses the codebook dimension into the sequence dimension
+            logits = self.forward(latents) # b, prob, seq
             logits = logits.permute(0, 2, 1)  # b, seq, prob
             b = logits.shape[0]
         z_mask_latent = torch.rand(
             batch_size, model.latent_dim * model.n_codebooks, seq_len
         ).to(device)
+        z_hat = model(z_mask_latent)
         pred = z_hat.argmax(dim=1)
         pred = model.embedding.unflatten(pred, n_codebooks=model.n_predict_codebooks)