Spaces:

descript
/

vampnet

Sleeping

App Files Community

Hugo Flores Garcia committed on May 27, 2023

Commit

e3ca5f7

1 Parent(s): 7aa3063

refactor masking, interface, demo

Browse files

Files changed (16) hide show

.dockerignore +0 -2
README.md +5 -1
demo.py +137 -324
env/alias.sh +0 -3
env/data.sh +0 -36
env/entry_script.sh +0 -41
env/setup.py +0 -94
scripts/exp/train.py +23 -19
scripts/utils/vamp_folder.py +0 -13
vampnet/interface.py +63 -296
vampnet/mask.py +184 -0
vampnet/modules/base.py +0 -412
vampnet/modules/layers.py +0 -17
vampnet/modules/transformer.py +282 -4
vampnet/signal.py +0 -5
vampnet/util.py +15 -1

.dockerignore DELETED Viewed

	@@ -1,2 +0,0 @@
1	- *.wav
2	- runs/

README.md CHANGED Viewed

@@ -27,7 +27,7 @@ git clone https://github.com/hugofloresgarcia/vampnet2.git
 pip install -e ./vampnet2
 ```
-## A note on Argbind
 This repository relies on [argbind](https://github.com/pseeth/argbind) to manage CLIs and config files.
 Config files are stored in the `conf/` folder.
@@ -56,6 +56,10 @@ You just need to provide a list of audio files // folders to fine-tune on, then
 python scripts/exp/train.py --args.load conf/lora/birds.yml --save_path /path/to/checkpoints
 ```
 ## Launching the Gradio Interface
 ```bash
 python demo.py --args.load conf/interface/spotdl.yml --Interface.device cuda

 pip install -e ./vampnet2
 ```
+## A note on argbind
 This repository relies on [argbind](https://github.com/pseeth/argbind) to manage CLIs and config files.
 Config files are stored in the `conf/` folder.
 python scripts/exp/train.py --args.load conf/lora/birds.yml --save_path /path/to/checkpoints
 ```
+## Getting the Pretrained Models
 ## Launching the Gradio Interface
 ```bash
 python demo.py --args.load conf/interface/spotdl.yml --Interface.device cuda

demo.py CHANGED Viewed

@@ -3,6 +3,7 @@ from typing import Tuple
 import yaml
 import tempfile
 import uuid
 import numpy as np
 import audiotools as at
@@ -10,6 +11,7 @@ import argbind
 import gradio as gr
 from vampnet.interface import Interface
 Interface = argbind.bind(Interface)
 AudioLoader = argbind.bind(at.data.datasets.AudioLoader)
@@ -60,132 +62,61 @@ def load_random_audio():
     return sig.path_to_file
-def ez_vamp(
-    input_audio, init_temp, final_temp,
-    mask_periodic_amt, mask_periodic_width, num_steps,
-    stretch_factor,
-):
-    print(input_audio)
-    sig = at.AudioSignal(input_audio)
-    print(f"running standard vampnet with {num_vamps} vamps")
-    zv = interface.coarse_vamp(
-        sig,
-        sampling_steps=num_steps,
-        temperature=(init_temp, final_temp),
-        prefix_dur_s=0.0,
-        suffix_dur_s=0.0,
-        num_vamps=1,
-        downsample_factor=mask_periodic_amt,
-        stretch_factor=stretch_factor,
-        periodic_width=mask_periodic_width,
-        periodic_dropout=0.0,
-        periodic_width_dropout=0.0,
-        n_conditioning_codebooks=None,
-        intensity=1.0,
-        ext_mask=None,
     )
-    zv = interface.coarse_to_fine(zv)
     sig = interface.to_signal(zv).cpu()
     print("done")
     out_dir = OUT_DIR / str(uuid.uuid4())
     out_dir.mkdir()
-    sig.write(out_dir / "output.wav")
-    # mask.write(out_dir / "mask.wav")
-    # return sig.path_to_file, mask.path_to_file
-    return sig.path_to_file
-def vamp(
-    input_audio, init_temp, final_temp,
-    prefix_s, suffix_s, rand_mask_intensity,
-    mask_periodic_amt, beat_unmask_dur,
-    mask_dwn_chk, dwn_factor,
-    mask_up_chk, up_factor,
-    num_vamps, mode, use_beats, num_steps, snap_to_beats,
-    beat_unmask_drop,  mask_periodic_width,
-    mask_periodic_dropout, mask_periodic_width_dropout,
-    n_conditioning_codebooks, use_coarse2fine, stretch_factor,
-):
-    # try:
-        print(input_audio)
-        sig = at.AudioSignal(input_audio)
-        if snap_to_beats:
-            old_sig = sig.clone()
-            sig = interface.snap_to_beats(sig)
-            if sig.duration < (sig.duration / 4): # we cut off too much
-                sig = old_sig
-                print(f"new sig duration is {sig.duration} which is too short, reverting to old sig")
-            print(f"new sig duration is {sig.duration}")
-        if beat_unmask_dur > 0.0 and use_beats:
-            beat_mask = interface.make_beat_mask(
-                sig,
-                before_beat_s=0.0,
-                after_beat_s=beat_unmask_dur,
-                mask_downbeats=mask_dwn_chk,
-                mask_upbeats=mask_up_chk,
-                downbeat_downsample_factor=dwn_factor if dwn_factor > 0 else None,
-                beat_downsample_factor=up_factor if up_factor > 0 else None,
-                dropout=beat_unmask_drop,
-                invert=True
-            )
-            print(beat_mask)
-        else:
-            beat_mask = None
-        if mode == "standard":
-            print(f"running standard vampnet with {num_vamps} vamps")
-            zv, mask_z = interface.coarse_vamp(
-                sig,
-                sampling_steps=num_steps,
-                temperature=(init_temp, final_temp),
-                prefix_dur_s=prefix_s,
-                suffix_dur_s=suffix_s,
-                num_vamps=num_vamps,
-                downsample_factor=mask_periodic_amt,
-                stretch_factor=stretch_factor,
-                periodic_width=mask_periodic_width,
-                periodic_dropout=mask_periodic_dropout,
-                periodic_width_dropout=mask_periodic_width_dropout,
-                n_conditioning_codebooks=n_conditioning_codebooks if n_conditioning_codebooks > 0 else None,
-                intensity=rand_mask_intensity,
-                ext_mask=beat_mask,
-                verbose=True,
-                return_mask=True
-            )
-            if use_coarse2fine:
-                zv = interface.coarse_to_fine(zv)
-            mask = interface.to_signal(mask_z).cpu()
-            sig = interface.to_signal(zv).cpu()
-            print("done")
-        out_dir = OUT_DIR / str(uuid.uuid4())
-        out_dir.mkdir()
-        sig.write(out_dir / "output.wav")
-        mask.write(out_dir / "mask.wav")
-        return sig.path_to_file, mask.path_to_file
-        # return sig.path_to_file, mask_z
-    # except Exception as e:
-    #     raise gr.Error(f"failed with error: {e}")
-def save_vamp(
-    input_audio, init_temp, final_temp,
-    prefix_s, suffix_s, rand_mask_intensity,
-    mask_periodic_amt, beat_unmask_dur,
-    mask_dwn_chk, dwn_factor,
-    mask_up_chk, up_factor,
-    num_vamps, mode, output_audio, notes, use_beats, num_steps, snap_to_beats,
-    beat_unmask_drop, mask_periodic_width, mask_periodic_dropout, mask_periodic_width_dropout,
-    n_conditioning_codebooks, use_coarse2fine, stretch_factor
-):
     out_dir = OUT_DIR / "saved" / str(uuid.uuid4())
     out_dir.mkdir(parents=True, exist_ok=True)
@@ -196,30 +127,18 @@ def save_vamp(
     sig_out.write(out_dir / "output.wav")
     data = {
-        "init_temp": init_temp,
-        "final_temp": final_temp,
-        "prefix_s": prefix_s,
-        "suffix_s": suffix_s,
-        "rand_mask_intensity": rand_mask_intensity,
-        "mask_periodic_amt": mask_periodic_amt,
-        "use_beats": use_beats,
-        "beat_unmask_dur": beat_unmask_dur,
-        "mask_dwn_chk": mask_dwn_chk,
-        "dwn_factor": dwn_factor,
-        "mask_up_chk": mask_up_chk,
-        "up_factor": up_factor,
-        "num_vamps": num_vamps,
-        "num_steps": num_steps,
-        "snap_to_beats": snap_to_beats,
-        "mode": mode,
-        "notes": notes,
-        "beat_unmask_drop": beat_unmask_drop,
-        "mask_periodic_width": mask_periodic_width,
-        "mask_periodic_dropout": mask_periodic_dropout,
-        "mask_periodic_width_dropout": mask_periodic_width_dropout,
-        "n_conditioning_codebooks": n_conditioning_codebooks,
-        "use_coarse2fine": use_coarse2fine,
-        "stretch_factor": stretch_factor,
     }
     # save with yaml
@@ -234,68 +153,16 @@ def save_vamp(
     return f"saved! your save code is {out_dir.stem}", zip_path
-with gr.Blocks() as demo:
-    with gr.Row():
-        # input audio
-        with gr.Column():
-            gr.Markdown("""
-            # Vampnet
-            **Instructions**:
-            1. Upload some audio (or click the load random audio button)
-            2. Adjust the mask hints. The more hints, the more the generated music will follow the input music
-            3. Adjust the vampnet parameters. The more vamps, the longer the generated music will be
-            4. Click the "vamp" button
-            5. Listen to the generated audio
-            6. If you noticed something you liked, write some notes, click the "save vamp" button, and copy the save code
-            """)
-            gr.Markdown("## Input Audio")
-        with gr.Column():
-            gr.Markdown("""
-            ### Tips
-            - use the beat hint button so the output audio has the same beat structure as the input audio
-            - if you want more beat structure:
-                - enable beat hints
-            - if you want a more "random" generation:
-                - increase the periodic unmasking to 12 or more
-                - increase the temperatures!
-                - uncheck the beat hint button (or reduce the beat unmask duration)
-            - if you want the generated audio to sound like the original, but with a different beat structure:
-                - uncheck the beat hint button
-                - decrease the periodic unmasking to anywhere from 2 to 20
-                - slightly decrease the random intensity, to like .95
-            """)
-        with gr.Column():
-            gr.Markdown("""
-            ## Mask Hints
-            - most of the original audio will be masked and replaced with audio generated by vampnet
-            - mask hints are used to guide vampnet to generate audio that sounds like the original
-            - the more hints you give, the more the generated audio will sound like the original
-            """)
     with gr.Row():
         with gr.Column():
-            mode = gr.Radio(
-                label="**mode**. note that loop mode requires a prefix and suffix longer than 0",
-                choices=["standard",],
-                value="standard"
-            )
             use_coarse2fine = gr.Checkbox(
                 label="use coarse2fine",
                 value=True
             )
-            num_vamps = gr.Number(
-                label="number of vamps. more vamps = longer generated audio",
-                value=1,
-                precision=0,
-                visible=False
-            )
             manual_audio_upload = gr.File(
                 label=f"upload some audio (will be randomly trimmed to max of {interface.coarse.chunk_size_s:.2f}s)",
@@ -328,57 +195,46 @@ with gr.Blocks() as demo:
                 outputs=[ input_audio]
             )
         # mask settings
         with gr.Column():
-            n_conditioning_codebooks = gr.Number(
-                label="number of conditioning codebooks. probably 0",
-                value=0,
-                precision=0,
-            )
-            stretch_factor = gr.Slider(
-                label="time stretch factor",
-                minimum=0,
-                maximum=64,
-                step=1,
-                value=1,
             )
-            mask_periodic_amt = gr.Slider(
-                label="periodic hint  (0.0 means no hint, 2 - lots of hints, 8 - a couple of hints, 16 - occasional hint, 32 - very occasional hint, etc)",
                 minimum=0,
-                maximum=64,
                 step=1,
                 value=9,
             )
-            mask_periodic_width = gr.Slider(
-                label="periodic hint width (steps, 1 step ~= 10milliseconds",
                 minimum=1,
-                maximum=100,
                 step=1,
                 value=1,
             )
-            mask_periodic_dropout = gr.Slider(
-                label="periodic hint dropout (0.0 means no dropout, 1.0 means all dropout)",
-                minimum=0.0,
-                maximum=1.0,
-                value=0.0,
-            )
-            mask_periodic_width_dropout = gr.Slider(
-                label="periodic hint width dropout (0.0 means no dropout, 1.0 means all dropout)",
-                minimum=0.0,
-                maximum=1.0,
-                value=0.0,
-            )
-            rand_mask_intensity = gr.Slider(
-                label="random mask intensity. (If this is less than 1, scatters tiny hints throughout the audio, should be between 0.9 and 1.0)",
-                minimum=0.8,
-                maximum=1.0,
-                value=1.0
-            )
             with gr.Accordion("prefix/suffix hints", open=False):
                 prefix_s = gr.Slider(
@@ -408,15 +264,6 @@ with gr.Blocks() as demo:
                     value=1.0
                 )
-            use_beats = gr.Checkbox(
-                label="use beat hints (helps the output stick to the beat structure of the input)",
-                value=False
-            )
-            snap_to_beats = gr.Checkbox(
-                label="trim to beat markers (uncheck if the output audio is too short.)",
-                value=True
-            )
             num_steps = gr.Slider(
                 label="number of steps (should normally be between 12 and 36)",
@@ -426,6 +273,14 @@ with gr.Blocks() as demo:
                 value=36
             )
             vamp_button = gr.Button("vamp!!!")
             output_audio = gr.Audio(
@@ -434,59 +289,26 @@ with gr.Blocks() as demo:
                 type="filepath"
             )
-            # gr.Markdown("**NOTE**: for loop mode, both prefix and suffix must be greater than 0.")
-            # compute_mask_button = gr.Button("compute mask")
-            # mask_output = gr.Audio(
-            #     label="masked audio",
-            #     interactive=False,
-            #     visible=False
-            # )
-            # mask_output_viz = gr.Video(
-            #     label="masked audio",
-            #     interactive=False
-            # )
-        with gr.Column():
-            with gr.Accordion(label="beat unmask (how much time around the beat should be hinted?)"):
-                beat_unmask_dur = gr.Slider(
-                    label="duration",
-                    minimum=0.0,
-                    maximum=3.0,
-                    value=0.07
-                )
-                beat_unmask_drop = gr.Slider(
-                    label="dropout (within beat)",
-                    minimum=0.0,
-                    maximum=1.0,
-                    value=0.0
-                )
-                with gr.Accordion("downbeat settings", open=False):
-                    mask_dwn_chk = gr.Checkbox(
-                        label="hint downbeats",
-                        value=True
-                    )
-                    dwn_factor = gr.Slider(
-                        label="downbeat downsample factor (hint only every Nth downbeat)",
-                        value=0,
-                        minimum=0,
-                        maximum=16,
-                        step=1
-                    )
-                with gr.Accordion("upbeat settings", open=False):
-                    mask_up_chk = gr.Checkbox(
-                        label="hint upbeats",
-                        value=True
-                    )
-                    up_factor = gr.Slider(
-                        label="upbeat downsample factor (hint only every Nth upbeat)",
-                        value=0,
-                        minimum=0,
-                        maximum=16,
-                        step=1
-                    )
             notes_text = gr.Textbox(
                 label="type any notes about the generated audio here",
@@ -499,52 +321,43 @@ with gr.Blocks() as demo:
                 interactive=False
             )
             thank_you = gr.Markdown("")
     # connect widgets
     vamp_button.click(
         fn=vamp,
-        inputs=[input_audio, init_temp,final_temp,
-            prefix_s, suffix_s, rand_mask_intensity,
-            mask_periodic_amt, beat_unmask_dur,
-            mask_dwn_chk, dwn_factor,
-            mask_up_chk, up_factor,
-            num_vamps, mode, use_beats, num_steps, snap_to_beats,
-            beat_unmask_drop, mask_periodic_width,
-            mask_periodic_dropout, mask_periodic_width_dropout,
-            n_conditioning_codebooks, use_coarse2fine, stretch_factor
-        ],
         outputs=[output_audio, audio_mask],
         api_name="vamp"
     )
     save_button.click(
         fn=save_vamp,
-        inputs=[
-            input_audio, init_temp, final_temp,
-            prefix_s, suffix_s, rand_mask_intensity,
-            mask_periodic_amt, beat_unmask_dur,
-            mask_dwn_chk, dwn_factor,
-            mask_up_chk, up_factor,
-            num_vamps, mode,
-            output_audio,
-            notes_text, use_beats, num_steps, snap_to_beats,
-            beat_unmask_drop, mask_periodic_width,
-            mask_periodic_dropout, mask_periodic_width_dropout,
-            n_conditioning_codebooks, use_coarse2fine, stretch_factor
-        ],
         outputs=[thank_you, download_file]
     )
-    ez_vamp_button = gr.Button("ez vamp")
-    ez_vamp_button.click(
-        fn=ez_vamp,
-        inputs=[input_audio, init_temp, final_temp, mask_periodic_amt,
-                mask_periodic_width, num_steps, stretch_factor ],
-        outputs=[output_audio],
-        api_name="ez_vamp"
-    )
 demo.launch(share=True, enable_queue=False, debug=True)

 import yaml
 import tempfile
 import uuid
+from dataclasses import dataclass, asdict
 import numpy as np
 import audiotools as at
 import gradio as gr
 from vampnet.interface import Interface
+from vampnet import mask as pmask
 Interface = argbind.bind(Interface)
 AudioLoader = argbind.bind(at.data.datasets.AudioLoader)
     return sig.path_to_file
+def vamp(data):
+    print(data[input_audio])
+    sig = at.AudioSignal(data[input_audio])
+    z = interface.encode(sig)
+    ncc = data[n_conditioning_codebooks]
+    # build the mask
+    mask = pmask.linear_random(z, data[rand_mask_intensity])
+    mask = pmask.mask_and(
+        mask, pmask.inpaint(
+            z,
+            interface.s2t(data[prefix_s]),
+            interface.s2t(data[suffix_s])
+        )
     )
+    mask = pmask.mask_and(
+        mask, pmask.periodic_mask(
+            z,
+            data[periodic_p],
+            data[periodic_w],
+            random_roll=True
+        )
+    )
+    mask = pmask.dropout(mask, data[dropout])
+    mask = pmask.codebook_unmask(mask, ncc)
+    print(f"created mask with: linear random {data[rand_mask_intensity]}, inpaint {data[prefix_s]}:{data[suffix_s]}, periodic {data[periodic_p]}:{data[periodic_w]}, dropout {data[dropout]}")
+    zv, mask_z = interface.coarse_vamp(
+        z,
+        mask=mask,
+        sampling_steps=data[num_steps],
+        temperature=(data[init_temp], data[final_temp]),
+        return_mask=True
+    )
+    if use_coarse2fine:
+        zv = interface.coarse_to_fine(zv)
+    mask = interface.to_signal(mask_z).cpu()
     sig = interface.to_signal(zv).cpu()
     print("done")
     out_dir = OUT_DIR / str(uuid.uuid4())
     out_dir.mkdir()
+    sig.write(out_dir / "output.wav")
+    mask.write(out_dir / "mask.wav")
+    return sig.path_to_file, mask.path_to_file
+def save_vamp(data):
     out_dir = OUT_DIR / "saved" / str(uuid.uuid4())
     out_dir.mkdir(parents=True, exist_ok=True)
     sig_out.write(out_dir / "output.wav")
     data = {
+        "init_temp": data[init_temp],
+        "final_temp": data[final_temp],
+        "prefix_s": data[prefix_s],
+        "suffix_s": data[suffix_s],
+        "rand_mask_intensity": data[rand_mask_intensity],
+        "num_steps": data[num_steps],
+        "notes": data[notes_text],
+        "periodic_period": data[periodic_p],
+        "periodic_width": data[periodic_w],
+        "n_conditioning_codebooks": data[n_conditioning_codebooks],
+        "use_coarse2fine": data[use_coarse2fine],
+        "stretch_factor": data[stretch_factor],
     }
     # save with yaml
     return f"saved! your save code is {out_dir.stem}", zip_path
+with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column():
             use_coarse2fine = gr.Checkbox(
                 label="use coarse2fine",
                 value=True
             )
             manual_audio_upload = gr.File(
                 label=f"upload some audio (will be randomly trimmed to max of {interface.coarse.chunk_size_s:.2f}s)",
                 outputs=[ input_audio]
             )
         # mask settings
         with gr.Column():
+            rand_mask_intensity = gr.Slider(
+                label="random mask intensity. (If this is less than 1, scatters prompts throughout the audio, should be between 0.9 and 1.0)",
+                minimum=0.0,
+                maximum=1.0,
+                value=1.0
             )
+            periodic_p = gr.Slider(
+                label="periodic prompt  (0.0 means no hint, 2 - lots of hints, 8 - a couple of hints, 16 - occasional hint, 32 - very occasional hint, etc)",
                 minimum=0,
+                maximum=128,
                 step=1,
                 value=9,
             )
+            periodic_w = gr.Slider(
+                label="periodic prompt width (steps, 1 step ~= 10milliseconds)",
                 minimum=1,
+                maximum=20,
                 step=1,
                 value=1,
             )
+            with gr.Accordion("extras ", open=False):
+                n_conditioning_codebooks = gr.Number(
+                    label="number of conditioning codebooks. probably 0",
+                    value=0,
+                    precision=0,
+                )
+                stretch_factor = gr.Slider(
+                    label="time stretch factor",
+                    minimum=0,
+                    maximum=64,
+                    step=1,
+                    value=1,
+                )
             with gr.Accordion("prefix/suffix hints", open=False):
                 prefix_s = gr.Slider(
                     value=1.0
                 )
             num_steps = gr.Slider(
                 label="number of steps (should normally be between 12 and 36)",
                 value=36
             )
+            dropout = gr.Slider(
+                label="mask dropout",
+                minimum=0.0,
+                maximum=1.0,
+                step=0.01,
+                value=0.0
+            )
             vamp_button = gr.Button("vamp!!!")
             output_audio = gr.Audio(
                 type="filepath"
             )
+        # with gr.Column():
+        #     with gr.Accordion(label="beat unmask (how much time around the beat should be hinted?)"):
+        #         use_beats = gr.Checkbox(
+        #             label="use beat hints (helps the output stick to the beat structure of the input)",
+        #             value=False
+        #         )
+        #         snap_to_beats = gr.Checkbox(
+        #             label="trim to beat markers (uncheck if the output audio is too short.)",
+        #             value=True
+        #         )
+        #         beat_unmask_dur = gr.Slider(
+        #             label="duration",
+        #             minimum=0.0,
+        #             maximum=3.0,
+        #             value=0.07
+        #         )
             notes_text = gr.Textbox(
                 label="type any notes about the generated audio here",
                 interactive=False
             )
             thank_you = gr.Markdown("")
     # connect widgets
     vamp_button.click(
         fn=vamp,
+        inputs={
+            input_audio,
+            num_steps,
+            init_temp, final_temp,
+            prefix_s, suffix_s,
+            rand_mask_intensity,
+            periodic_p, periodic_w,
+            n_conditioning_codebooks,
+            dropout,
+            use_coarse2fine,
+            stretch_factor
+        },
         outputs=[output_audio, audio_mask],
         api_name="vamp"
     )
     save_button.click(
         fn=save_vamp,
+        inputs={
+            input_audio,
+            num_steps,
+            init_temp, final_temp,
+            prefix_s, suffix_s,
+            rand_mask_intensity,
+            periodic_p, periodic_w,
+            n_conditioning_codebooks,
+            dropout,
+            use_coarse2fine,
+            stretch_factor,
+            notes_text
+        },
         outputs=[thank_you, download_file]
     )
 demo.launch(share=True, enable_queue=False, debug=True)

env/alias.sh DELETED Viewed

@@ -1,3 +0,0 @@
-alias cleanup="pkill python && echo -en '\e[?25h'"
-alias stage="python ./scripts/utils/stage.py"
-alias fix_cursor="echo -en '\e[?25h'"

env/data.sh DELETED Viewed

@@ -1,36 +0,0 @@
-export PATH_TO_DATA=~/data
-if [[ $(hostname) == "oon17" ]]; then
-    export PATH_TO_DATA=/data/
-fi
-if [[ $(hostname) == "oon19" ]]; then
-    export PATH_TO_DATA=/home/prem/shared/data/
-fi
-if [[ $(hostname) == "lucas-ssound-trt-vm" ]]; then
-    export PATH_TO_DATA=~/data
-fi
-if [[ $(hostname) == "a100-ssound" ]]; then
-    export PATH_TO_DATA=~/data
-fi
-if [[ $(hostname) == "oon25" ]]; then
-    export PATH_TO_DATA=/data
-fi
-if [[ $(hostname) == "macbook-pro-2.lan" ]]; then
-    export PATH_TO_DATA=~/data
-fi
-if [[ $(hostname) == "oon11" ]]; then
-    export PATH_TO_DATA=/data2/syncthing_lucas/data
-fi
-if [[ $(hostname) == "oon12" ]]; then
-    export PATH_TO_DATA=/data
-fi
-if [[ $(hostname) == "oon26" ]]; then
-    export PATH_TO_DATA=/data
-fi

env/entry_script.sh DELETED Viewed

@@ -1,41 +0,0 @@
-#!/bin/bash
-set -e
-if [ -z "${USER}" ]; then
-  echo "We need USER to be set!"; exit 100
-fi
-# check if host uid and gid are set
-if [ -z "${HOST_USER_ID}" ]; then
-    echo "Please set HOST_USER_ID env. variables to continue." ; exit 0
-fi
-if [ -z "${HOST_USER_GID}" ]; then
-    echo "Please set HOST_USER_GID env. variables to continue." ; exit 0
-fi
-USER_ID=$HOST_USER_ID
-USER_GID=$HOST_USER_GID
-USER_HOME=/u/home
-# modify uid and gid to match host
-sed -i -e "s/^${USER}:\([^:]*\):[0-9]*:[0-9]*/${USER}:\1:${USER_ID}:${USER_GID}/"  /etc/passwd
-# create a group for host gid
-groupadd -f --gid "${USER_GID}" "host_group"
-chown $USER_ID $USER_HOME
-chown $USER_ID /u/home/.zshrc
-chown $USER_ID /u/home/.oh-my-zsh
-mkdir -p /u/home/.cache
-chown -R $USER_ID:$USER_GID /u/home/.cache/
-_term() {
-  echo "Caught SIGTERM signal!"
-  kill -TERM "$child" 2>/dev/null
-}
-trap _term SIGTERM
-su -p "${USER}"

env/setup.py DELETED Viewed

@@ -1,94 +0,0 @@
-# This script guides the user through setting up their env.sh
-# if env.sh does not exist. Should have no dependencies other
-# than Python standard library.
-import shlex
-import socket
-import subprocess
-import textwrap
-def run(cmd):
-    return subprocess.check_output(shlex.split(cmd)).decode("utf-8")
-print()
-print("4. Setting up paths.")
-print("--------------------")
-PATH_TO_RUNS = input("Where runs should go (default:./runs/): ") or "./runs/"
-TENSORBOARD_PATH = (
-    input("Bucket/dir for tensorboard logs (default=PATH_TO_RUNS): ") or PATH_TO_RUNS
-)
-with open("env/data.sh") as f:
-    data_script = f.read()
-write_to_data_sh = False
-if socket.gethostname() not in data_script:
-    print("Looks like the data path for this machine is not setup.")
-    PATH_TO_DATA = input(f"Path to data on {socket.gethostname()}: ") or "~/data"
-    data_command = f"""
-if [[ $(hostname) == "{socket.gethostname()}" ]]; then
-    export PATH_TO_DATA={PATH_TO_DATA}
-fi
-    """
-    write_to_data_sh = True
-print()
-print("5. Setting up Papaya")
-print("-----------------------------------------")
-PAPAYA_USER_TOKEN = input("Papaya user token: ") or "undefined"
-env_script = f"""
-source env/alias.sh
-source env/data.sh
-export GITHUB_TOKEN={GITHUB_TOKEN}
-export PAPAYA_USER_TOKEN={PAPAYA_USER_TOKEN}
-export HOST_USER_ID=$(id -u)
-export HOST_USER_GID=$(id -g)
-export JUPYTER_TOKEN={JUPYTER_TOKEN}
-export JUPYTER_PORT={JUPYTER_PORT}
-export TENSORBOARD_PORT={TENSORBOARD_PORT}
-export PATH_TO_RUNS={PATH_TO_RUNS}
-export TENSORBOARD_PATH={TENSORBOARD_PATH}
-"""
-print()
-print("6. Potential file contents.")
-print("---------------------------")
-print("env/env.sh: \n")
-print("##################")
-print(env_script)
-print("##################")
-if write_to_data_sh:
-    data_script += data_command
-print("env/data.sh:")
-print("##################")
-print(data_script)
-print("##################")
-print()
-write_to_files = input("Write to file [yn]? ") or "n"
-if write_to_files == "y":
-    with open("env/env.sh", "w") as f:
-        f.write(env_script.strip())
-    with open("env/data.sh", "w") as f:
-        f.write(data_script.strip())
-print()
-print("8. Finalize setup.")
-print("------------------")
-print("Run the following command to complete setup.")
-print("source env/env.sh")

scripts/exp/train.py CHANGED Viewed

@@ -18,6 +18,8 @@ from tensorboardX import SummaryWriter
 import vampnet
 from vampnet.modules.transformer import VampNet
 from lac.model.lac import LAC
@@ -322,7 +324,10 @@ def train(
                 n_batch = z.shape[0]
                 r = rng.draw(n_batch)[:, 0].to(accel.device)
-                z_mask, mask = vn.add_noise(z, r)
                 z_mask_latent = vn.embedding.from_codes(z_mask, codec)
                 dtype = torch.bfloat16 if accel.amp else None
@@ -331,14 +336,12 @@ def train(
                     # for mask mode
                     z_hat = vn.add_truth_to_logits(z, z_hat, mask)
-                target = vn.embedding.flatten(
                     z[:, vn.n_conditioning_codebooks :, :],
-                    n_codebooks=vn.n_predict_codebooks,
                 )
-                flat_mask = vn.embedding.flatten(
                     mask[:, vn.n_conditioning_codebooks :, :],
-                    n_codebooks=vn.n_predict_codebooks,
                 )
                 if vn.noise_mode == "mask":
@@ -398,21 +401,22 @@ def train(
             n_batch = z.shape[0]
             r = rng.draw(n_batch)[:, 0].to(accel.device)
-            z_mask, mask = vn.add_noise(z, r)
             z_mask_latent = vn.embedding.from_codes(z_mask, codec)
             z_hat = model(z_mask_latent, r)
             # for mask mode
             z_hat = vn.add_truth_to_logits(z, z_hat, mask)
-            target = vn.embedding.flatten(
                 z[:, vn.n_conditioning_codebooks :, :],
-                n_codebooks=vn.n_predict_codebooks,
             )
-            flat_mask = vn.embedding.flatten(
-                mask[:, vn.n_conditioning_codebooks :, :],
-                n_codebooks=vn.n_predict_codebooks,
             )
             output = {}
@@ -514,14 +518,12 @@ def train(
         def save_imputation(self, z: torch.Tensor):
             n_prefix = int(z.shape[-1] * 0.25)
             n_suffix = int(z.shape[-1] *  0.25)
-            downsample_factor = None
             vn = accel.unwrap(model)
-            z_mask, mask = vn.add_noise(
-                z, r=0.0, n_prefix=n_prefix, n_suffix=n_suffix,
-                downsample_factor=downsample_factor
-            )
             imputed_noisy = vn.to_signal(z_mask, codec)
             imputed_true = vn.to_signal(z, codec)
@@ -574,9 +576,11 @@ def train(
             r = torch.linspace(0.1, 0.95, len(val_idx)).to(accel.device)
-            n_batch = z.shape[0]
-            z_mask, mask = vn.add_noise(z, r)
             z_mask_latent = vn.embedding.from_codes(z_mask, codec)
             z_hat = model(z_mask_latent, r)
@@ -584,7 +588,7 @@ def train(
             z_hat = vn.add_truth_to_logits(z, z_hat, mask)
             z_pred = torch.softmax(z_hat, dim=1).argmax(dim=1)
-            z_pred = vn.embedding.unflatten(z_pred, n_codebooks=vn.n_predict_codebooks)
             z_pred = torch.cat([z[:, : vn.n_conditioning_codebooks, :], z_pred], dim=1)
             generated = vn.to_signal(z_pred, codec)

 import vampnet
 from vampnet.modules.transformer import VampNet
+from vampnet.util import codebook_unflatten, codebook_flatten
+from vampnet import mask as pmask
 from lac.model.lac import LAC
                 n_batch = z.shape[0]
                 r = rng.draw(n_batch)[:, 0].to(accel.device)
+                mask = pmask.random(z, r)
+                mask = pmask.codebook_unmask(mask, vn.n_conditioning_codebooks)
+                z_mask, mask = pmask.apply_mask(z, mask, vn.mask_token)
                 z_mask_latent = vn.embedding.from_codes(z_mask, codec)
                 dtype = torch.bfloat16 if accel.amp else None
                     # for mask mode
                     z_hat = vn.add_truth_to_logits(z, z_hat, mask)
+                target = codebook_flatten(
                     z[:, vn.n_conditioning_codebooks :, :],
                 )
+                flat_mask = codebook_flatten(
                     mask[:, vn.n_conditioning_codebooks :, :],
                 )
                 if vn.noise_mode == "mask":
             n_batch = z.shape[0]
             r = rng.draw(n_batch)[:, 0].to(accel.device)
+            mask = pmask.random(z, r)
+            mask = pmask.codebook_unmask(mask, vn.n_conditioning_codebooks)
+            z_mask, mask = pmask.apply_mask(z, mask, vn.mask_token)
             z_mask_latent = vn.embedding.from_codes(z_mask, codec)
             z_hat = model(z_mask_latent, r)
             # for mask mode
             z_hat = vn.add_truth_to_logits(z, z_hat, mask)
+            target = codebook_flatten(
                 z[:, vn.n_conditioning_codebooks :, :],
             )
+            flat_mask = codebook_flatten(
+                mask[:, vn.n_conditioning_codebooks :, :]
             )
             output = {}
         def save_imputation(self, z: torch.Tensor):
             n_prefix = int(z.shape[-1] * 0.25)
             n_suffix = int(z.shape[-1] *  0.25)
             vn = accel.unwrap(model)
+            mask = pmask.inpaint(z, n_prefix, n_suffix)
+            mask = pmask.codebook_unmask(mask, vn.n_conditioning_codebooks)
+            z_mask, mask = pmask.apply_mask(z, mask, vn.mask_token)
             imputed_noisy = vn.to_signal(z_mask, codec)
             imputed_true = vn.to_signal(z, codec)
             r = torch.linspace(0.1, 0.95, len(val_idx)).to(accel.device)
+            mask = pmask.random(z, r)
+            mask = pmask.codebook_unmask(mask, vn.n_conditioning_codebooks)
+            z_mask, mask = pmask.apply_mask(z, mask, vn.mask_token)
             z_mask_latent = vn.embedding.from_codes(z_mask, codec)
             z_hat = model(z_mask_latent, r)
             z_hat = vn.add_truth_to_logits(z, z_hat, mask)
             z_pred = torch.softmax(z_hat, dim=1).argmax(dim=1)
+            z_pred = codebook_unflatten(z_pred, n_c=vn.n_predict_codebooks)
             z_pred = torch.cat([z[:, : vn.n_conditioning_codebooks, :], z_pred], dim=1)
             generated = vn.to_signal(z_pred, codec)

scripts/utils/vamp_folder.py CHANGED Viewed

@@ -95,19 +95,6 @@ def opus(sig, interface, bitrate=128):
         )
     return sig
-def token_noise(ratio=1.0):
-    def wrapper(sig, interface):
-        z = interface.encode(sig)
-        r = interface.coarse.invgamma(ratio).to(interface.device)
-        print(f'adding noise with ratio {ratio}')
-        z, mask = interface.coarse.add_noise(
-            z,
-            r,
-            noise_mode="random"
-        )
-        return interface.to_signal(z)
-    return wrapper
 def mask_ratio_1_step(ratio=1.0):
     def wrapper(sig, interface):
         r = interface.coarse.invgamma(ratio).to(interface.device)

         )
     return sig
 def mask_ratio_1_step(ratio=1.0):
     def wrapper(sig, interface):
         r = interface.coarse.invgamma(ratio).to(interface.device)

vampnet/interface.py CHANGED Viewed

@@ -9,6 +9,8 @@ import tqdm
 from .modules.transformer import VampNet
 from .beats import WaveBeat
 from lac.model.lac import LAC
@@ -20,14 +22,6 @@ def signal_concat(
     return AudioSignal(audio_data, sample_rate=audio_signals[0].sample_rate)
-class SignalPrompt:
-    def __init__(self, signal: AudioSignal):
-        self.sig = signal
 class Interface(torch.nn.Module):
     def __init__(
         self,
@@ -100,10 +94,6 @@ class Interface(torch.nn.Module):
     def to_signal(self, z: torch.Tensor):
         return self.coarse.to_signal(z, self.codec)
-    def autoencode(self, signal: AudioSignal):
-        z = self.encode(signal)
-        return self.to_signal(z)
     def preprocess(self, signal: AudioSignal):
         signal = (
             signal.clone()
@@ -249,182 +239,30 @@ class Interface(torch.nn.Module):
         fine_z = torch.cat(fine_z, dim=-1)
         return fine_z[:, :, :length].clone()
     def coarse_vamp(
         self,
-        signal,
-        prefix_dur_s: float = 0.0,
-        suffix_dur_s: float = 0.0,
-        num_vamps: int = 1,
-        downsample_factor: int = None,
-        stretch_factor: int = None,
-        periodic_width: int = 1,
-        periodic_dropout=0.0,
-        periodic_width_dropout=0.0,
-        intensity: float = 1.0,
-        debug=False,
-        swap_prefix_suffix=False,
-        ext_mask=None,
-        n_conditioning_codebooks=None,
-        verbose=False,
         return_mask=False,
         **kwargs
     ):
-        z = self.encode(signal)
         # coarse z
         cz = z[:, : self.coarse.n_codebooks, :].clone()
-        c_seq_len = cz.shape[-1]
-        n_prefix = self.s2t(prefix_dur_s)
-        n_suffix = self.s2t(suffix_dur_s)
-        # hmm, should be a better way to do this? think we just need a mask builder class
-        add_random_periodic_offset = True
-        if stretch_factor is not None and stretch_factor > 1:
-            print(f"stretching by {stretch_factor}")
-            assert stretch_factor >= 1, "stretch factor must be >= 1"
-            cz = cz.repeat_interleave(stretch_factor, dim=-1)
-            # the downsample factor is now relative to the stretched sequence
-            assert downsample_factor is None or downsample_factor <= 2, "downsample_factor must be None when stretch_factor is not None"
-            downsample_factor = stretch_factor
-            add_random_periodic_offset = False
-            assert n_prefix == 0 and n_suffix == 0, "prefix and suffix must be 0 when stretch_factor is not None"
-            assert ext_mask is None, "ext_mask must be None when stretch_factor is not None"
-            # trim cz to the original length
-            cz = cz[:, :, :c_seq_len]
         assert cz.shape[-1] <= self.s2t(self.coarse.chunk_size_s), f"the sequence of tokens provided must match the one specified in the coarse chunk size, but got {cz.shape[-1]} and {self.s2t(self.coarse.chunk_size_s)}"
-        assert n_prefix + n_suffix < c_seq_len, "prefix and suffix must be smaller than the chunk size"
-        if swap_prefix_suffix:
-            # swap the prefix and suffix
-            assert n_prefix == n_suffix, "prefix and suffix must be the same size for now"
-            cz[:, :, :n_prefix], cz[:, :, c_seq_len-n_suffix:] = cz[:, :, c_seq_len-n_suffix:], cz[:, :, :n_prefix].clone()
-        # we'll keep the final codes sequence here
-        c_vamp = {
-            'prefix': [cz[:, :, :n_prefix].clone()],
-            'suffix': [cz[:, :, c_seq_len-n_suffix:].clone()]
-        }
-        _cz = cz.clone()
-        cz_mask = None
-        range_fn = tqdm.trange if verbose else range
-        for _ in range_fn(num_vamps):
-            # add noise
-            cz_masked, cz_mask = self.coarse.add_noise(
-                _cz, r=1.0-intensity,
-                n_prefix=n_prefix,
-                n_suffix=n_suffix,
-                downsample_factor=downsample_factor,
-                periodic_width=periodic_width,
-                periodic_dropout=periodic_dropout,
-                add_random_periodic_offset=add_random_periodic_offset,
-                periodic_width_dropout=periodic_width_dropout,
-                mask=cz_mask,
-                ext_mask=ext_mask,
-                n_conditioning_codebooks=n_conditioning_codebooks
-            )
-            if debug:
-                print("tokens to infer")
-                self.to_signal(cz_masked).cpu().widget()
-            # sample!
-            if debug:
-                print(f"mask: {cz_mask[:,0,:]}")
-                print(f"z: {_cz[:,0,:]}")
-            cz_sampled = self.coarse.sample(
-                codec=self.codec,
-                time_steps=_cz.shape[-1],
-                start_tokens=_cz,
-                mask=cz_mask,
-                return_signal=False,
-                **kwargs
-            )
-            if debug:
-                print("tokens sampled")
-                self.to_signal(cz_sampled).cpu().widget()
-            # the z that was generated
-            cz_generated = cz_sampled[:, :, n_prefix:c_seq_len-n_suffix].clone()
-            n_generated = cz_generated.shape[-1]
-            # create the new prefix and suffix
-            # we'll make sure that the number of prefix and suffix
-            # tokens is the same as the original
-            # but we do want to advance the sequence as much as we can
-            if n_prefix > 0 and n_suffix > 0:
-                # we have both prefix and suffix, so we'll split the generated
-                # codes in two halves
-                prefix_start_idx = n_generated // 2
-                prefix_stop_idx = prefix_start_idx + n_prefix
-                assert prefix_start_idx >= 0, "internal error"
-                suffix_start_idx = n_prefix + n_generated // 2
-                suffix_stop_idx = suffix_start_idx + n_suffix
-                assert suffix_stop_idx <= cz_sampled.shape[-1], "internal error"
-                cz_new_prefix = cz_sampled[:, :, prefix_start_idx:prefix_stop_idx].clone()
-                cz_new_suffix = cz_sampled[:, :, suffix_start_idx:suffix_stop_idx].clone()
-                c_vamp['prefix'].append(cz_generated[:,:,:n_generated//2])
-                c_vamp['suffix'].insert(0, cz_generated[:,:,n_generated//2:])
-            elif n_prefix > 0:
-                # we only have a prefix
-                prefix_start_idx = n_generated
-                prefix_stop_idx = prefix_start_idx + n_prefix
-                cz_new_prefix = cz_sampled[:, :, prefix_start_idx:prefix_stop_idx].clone()
-                cz_new_suffix = _cz[:, :, :0].clone()
-                c_vamp['prefix'].append(cz_generated)
-            elif n_suffix > 0:
-                # we only have a suffix, so everything starting at 0 is generated
-                suffix_stop_idx = max(n_generated, n_suffix)
-                suffix_start_idx = suffix_stop_idx - n_suffix
-                cz_new_prefix = _cz[:, :, :0].clone()
-                cz_new_suffix = cz_sampled[:, :, suffix_start_idx:suffix_stop_idx].clone()
-                c_vamp['suffix'].insert(0, cz_generated)
-            else:
-                # we have no prefix or suffix, so we'll just use the generated
-                # codes as the new prefix and suffix
-                cz_new_prefix = cz_generated.clone()
-                cz_new_suffix = _cz[:, :, :0].clone()
-                c_vamp['prefix'].append(cz_generated)
-            n_to_insert = c_seq_len - (cz_new_prefix.shape[-1] + cz_new_suffix.shape[-1])
-            to_insert = torch.zeros(cz_new_prefix.shape[0], cz_new_prefix.shape[1], n_to_insert).long().to(self.device)
-            _cz = torch.cat([cz_new_prefix, to_insert, cz_new_suffix], dim=-1)
-            to_insert_mask = torch.zeros_like(_cz).long().to(self.device)
-            to_insert_mask[:, :, cz_new_prefix.shape[-1]:cz_new_prefix.shape[-1]+n_to_insert] = 1
-            cz_mask = (cz_mask + to_insert_mask).bool().long()
-            if debug:
-                print("tokens to infer next round (area to insert in the middle)")
-                self.to_signal(_cz).cpu().widget()
-        prefix_codes = torch.cat(c_vamp['prefix'], dim=-1)
-        suffix_codes = torch.cat(c_vamp['suffix'], dim=-1)
-        c_vamp = torch.cat([prefix_codes, suffix_codes], dim=-1)
         # replace the mask token in cz_masked with random tokens
         # so that we can decode it
@@ -433,132 +271,61 @@ class Interface(torch.nn.Module):
         return c_vamp
-    # create a variation of an audio signal
-    def variation(
-        self,
-        signal: AudioSignal,
-        verbose: bool = False,
-        beat_mask: bool = False,
-        beat_mask_kwargs: dict = {},
-        **kwargs
-    ):
-        signal = signal.clone()
-        # autoencode first, so the samples get rounded up to the nearest tokens
-        signal = self.autoencode(signal).cpu()
-        # pad the signal to the nearest chunk size
-        req_len = (
-            math.ceil(signal.duration / self.coarse.chunk_size_s)
-            * self.coarse.chunk_size_s
-        )
-        # eventually we DO want overlap, but we want overlap-replace not
-        # overlap-add
-        overlap_hop_ratio = 1.0
-        hop_duration = self.coarse.chunk_size_s * overlap_hop_ratio
-        original_length = signal.length
-        signal.zero_pad_to(req_len)
-        # window the signal
-        signal = signal.collect_windows(
-            window_duration=self.coarse.chunk_size_s,
-            hop_duration=hop_duration,
-        )
-        # output = []
-        range_fn = range if not verbose else tqdm.trange
-        for i in range_fn(signal.batch_size):
-            sig = AudioSignal(
-                signal.samples[i,...], signal.sample_rate
-            )
-            sig.to(self.device)
-            if beat_mask:
-                ext_mask = self.make_beat_mask(sig, **beat_mask_kwargs)
-            else:
-                ext_mask = None
-            out_z = self.coarse_vamp(
-                sig,
-                num_vamps=1,
-                swap_prefix_suffix=False,
-                ext_mask=ext_mask,
-                verbose=verbose,
-                **kwargs
-            )
-            if self.c2f is not None:
-                out_z = self.coarse_to_fine(out_z)
-            out_sig = self.to_signal(out_z).cpu()
-            signal.samples[i] = out_sig.samples
-        output = signal.overlap_and_add(hop_duration)
-        output.truncate_samples(original_length)
-        return output
-    # create a loop of a single region with variations
-    # TODO: this would work nicer if we could trim at the beat
-    # otherwise the model has to awkwardly fill up space that won't match
-    # the beat unless the signal is exactly the right length
-    def loop(
-        self,
-        signal: AudioSignal,
-        prefix_dur_s: float = 0.0,
-        suffix_dur_s: float = 0.0,
-        num_loops: int = 4,
-        # overlap_hop_ratio: float = 1.0, # TODO: should this be fixed to 1.0?  or should we overlap and replace instead of overlap add
-        verbose: bool = False,
-        return_mask: bool = False,
-        **kwargs,
-    ):
-        assert prefix_dur_s >= 0.0, "prefix duration must be >= 0"
-        assert suffix_dur_s >= 0.0, "suffix duration must be >= 0"
-        signal = self.preprocess(signal)
-        suffix_len_samples = int(suffix_dur_s * signal.sample_rate)
-        prefix_len_tokens = self.s2t(prefix_dur_s)
-        suffix_len_tokens = self.s2t(suffix_dur_s)
-        loops = [
-            # add everything but the suffix a the beggining
-            self.encode(signal.clone().trim(before=0, after=suffix_len_samples))
-        ]
-        range_fn = range if not verbose else tqdm.trange
-        for i in range_fn(num_loops):
-            is_flipped = i % 2 == 0
-            vamped = self.coarse_vamp(
-                        signal,
-                        prefix_dur_s=prefix_dur_s,
-                        suffix_dur_s=suffix_dur_s,
-                        swap_prefix_suffix=is_flipped,
-                        return_mask=return_mask,
-                        **kwargs
-                )
-            if return_mask:
-                vamped, mask = vamped
-            # if we're flipped, we trim the prefix off of the end
-            # otherwise we trim the suffix off of the end
-            trim_len = prefix_len_tokens if is_flipped else suffix_len_tokens
-            vamped = vamped[:, :, :vamped.shape[-1]-trim_len]
-            loops.append(vamped)
-        if is_flipped:
-            loops.append(
-                # add everything but the prefix at the end
-                self.encode(signal.clone())
-            )
-        if self.c2f is not None:
-            loops = [self.coarse_to_fine(l) for l in loops]
-        loops = [self.to_signal(l) for l in loops]
-        if return_mask:
-            return signal_concat(loops), self.to_signal(mask)
-        return signal_concat(loops)

 from .modules.transformer import VampNet
 from .beats import WaveBeat
+from .mask import *
 from lac.model.lac import LAC
     return AudioSignal(audio_data, sample_rate=audio_signals[0].sample_rate)
 class Interface(torch.nn.Module):
     def __init__(
         self,
     def to_signal(self, z: torch.Tensor):
         return self.coarse.to_signal(z, self.codec)
     def preprocess(self, signal: AudioSignal):
         signal = (
             signal.clone()
         fine_z = torch.cat(fine_z, dim=-1)
         return fine_z[:, :, :length].clone()
     def coarse_vamp(
         self,
+        z,
+        mask,
         return_mask=False,
         **kwargs
     ):
         # coarse z
         cz = z[:, : self.coarse.n_codebooks, :].clone()
         assert cz.shape[-1] <= self.s2t(self.coarse.chunk_size_s), f"the sequence of tokens provided must match the one specified in the coarse chunk size, but got {cz.shape[-1]} and {self.s2t(self.coarse.chunk_size_s)}"
+        mask = mask[:, : self.coarse.n_codebooks, :]
+        cz_masked, mask = apply_mask(cz, mask, self.coarse.mask_token)
+        cz_masked = cz_masked[:, : self.coarse.n_codebooks, :]
+        c_vamp = self.coarse.sample(
+            codec=self.codec,
+            time_steps=cz.shape[-1],
+            start_tokens=cz,
+            mask=mask,
+            return_signal=False,
+            **kwargs
+        )
         # replace the mask token in cz_masked with random tokens
         # so that we can decode it
         return c_vamp
+if __name__ == "__main__":
+    # Manual smoke test: build a composite mask for a 10 s clip, run one
+    # coarse sampling pass, and write the result plus the mask to disk.
+    # Requires the checkpoint files and 'cali.mp3' referenced below.
+    import audiotools as at
+    interface = Interface(
+        coarse_ckpt="./models/spotdl/coarse.pth",
+        coarse2fine_ckpt="./models/spotdl/c2f.pth",
+        codec_ckpt="./models/spotdl/codec.pth",
+        device="cpu"
+    )
+    sig = at.AudioSignal('cali.mp3', duration=10)
+    z = interface.encode(sig)
+    # start from a random mask (second argument is the masking probability)
+    mask = linear_random(z, 0.8)
+    print(mask)
+    # mask_and keeps a position masked only if both masks mask it, so the
+    # prefix/suffix that inpaint() leaves unmasked stay unmasked here
+    mask = mask_and(
+        mask, inpaint(
+            z,
+            interface.s2t(3),
+            interface.s2t(3)
+        )
+    )
+    print(mask)
+    # additionally unmask periodic positions (period 7, width 1)
+    mask = mask_and(
+        mask, periodic_mask(
+            z,
+            7,
+            1,
+            random_roll=True
+        )
+    )
+    # p=0.0 and n_conditioning_codebooks=0 leave the mask unchanged; they
+    # are kept here as knobs to experiment with
+    mask = dropout(mask, 0.0)
+    mask = codebook_unmask(mask, 0)
+    zv, mask_z = interface.coarse_vamp(
+        z,
+        mask=mask,
+        sampling_steps=1,
+        temperature=(0.8,1),
+        return_mask=True
+    )
+    # flip to True to also run the coarse-to-fine model on the sampled codes
+    use_coarse2fine = False
+    if use_coarse2fine:
+        zv = interface.coarse_to_fine(zv)
+    print(mask_z)
+    # decode both the sampled codes and the returned mask to audio
+    mask = interface.to_signal(mask_z).cpu()
+    sig = interface.to_signal(zv).cpu()
+    print("done")
+    sig.write("output.wav")
+    mask.write("mask.wav")

vampnet/mask.py ADDED Viewed

	@@ -0,0 +1,184 @@

+from typing import Optional
+import torch
+from .util import scalar_to_batch_tensor
+def _gamma(r):
+    """Return cos(r * pi / 2), computed elementwise on the tensor `r`."""
+    angle = r * torch.pi / 2
+    return angle.cos()
+def _invgamma(y):
+    """Return 2 * acos(y) / pi, i.e. the `r` for which `_gamma(r) == y`.
+    A non-tensor `y` is first wrapped into a 1-element tensor.
+    """
+    y_t = y if torch.is_tensor(y) else torch.tensor(y)[None]
+    return 2 * y_t.acos() / torch.pi
+def full_mask(x: torch.Tensor):
+    """Return a long tensor of ones shaped like `x` (1 = masked in apply_mask)."""
+    assert x.ndim == 3, "x must be (batch, n_codebooks, seq)"
+    return torch.ones_like(x, dtype=torch.long)
+def empty_mask(x: torch.Tensor):
+    """Return a long tensor of zeros shaped like `x` (0 = kept in apply_mask)."""
+    assert x.ndim == 3, "x must be (batch, n_codebooks, seq)"
+    return torch.zeros_like(x, dtype=torch.long)
+def apply_mask(
+        x: torch.Tensor,
+        mask: torch.Tensor,
+        mask_token: int
+    ):
+    """Overwrite the positions of `x` where `mask` is 1 with `mask_token`.
+    Args:
+        x: token tensor; must have the same 3-D shape as `mask`.
+        mask: long tensor of 0s and 1s; 1 means "replace this position".
+        mask_token: value written into the masked positions.
+    Returns:
+        (masked_x, mask): the masked copy of `x` and the mask, unchanged.
+    """
+    assert mask.ndim == 3, "mask must be (batch, n_codebooks, seq)"
+    assert mask.shape == x.shape, "mask must be same shape as x"
+    assert mask.dtype == torch.long, "mask must be long dtype"
+    assert not torch.any(mask > 1), "mask must be binary"
+    assert not torch.any(mask < 0), "mask must be binary"
+    # arithmetic blend: kept positions come from x, masked ones from the token
+    keep = 1 - mask
+    masked_x = x * keep + torch.full_like(x, mask_token) * mask
+    return masked_x, mask
+def random(
+    x: torch.Tensor,
+    r: torch.Tensor
+):
+    """Sample a binary mask where each entry is 1 with probability `_gamma(r)`.
+    Args:
+        x: tensor of shape (batch, n_codebooks, seq); sets the mask's shape.
+        r: one value per batch item, or a scalar that is repeated per item.
+    Returns:
+        long tensor shaped like `x` with values in {0, 1}.
+    """
+    assert x.ndim == 3, "x must be (batch, n_codebooks, seq)"
+    if not isinstance(r, torch.Tensor):
+        r = scalar_to_batch_tensor(r, x.shape[0]).to(x.device)
+    # add codebook and seq axes so each batch item gets its own probability
+    per_item_prob = _gamma(r)[:, None, None]
+    probs = torch.ones_like(x) * per_item_prob
+    sampled = torch.bernoulli(probs)
+    return sampled.round().long()
+def linear_random(
+    x: torch.Tensor,
+    r: torch.Tensor,
+):
+    """Sample a binary mask where each entry is 1 with probability `r`.
+    Unlike `random`, `r` is used directly as the Bernoulli probability,
+    with no `_gamma` remapping.
+    Args:
+        x: tensor of shape (batch, n_codebooks, seq); sets the mask's shape.
+        r: a Python scalar, or a tensor that broadcasts against `x`
+            (tensor inputs are used as given).
+    Returns:
+        long tensor shaped like `x` with values in {0, 1}.
+    """
+    assert x.ndim == 3, "x must be (batch, n_codebooks, seq)"
+    if not isinstance(r, torch.Tensor):
+        r = scalar_to_batch_tensor(r, x.shape[0]).to(x.device).float()
+        # The helper is assumed to return one value per batch item (`random`
+        # indexes its result the same way). Add codebook and seq axes so it
+        # lines up with the batch dim instead of broadcasting along seq,
+        # which failed whenever batch != seq and batch != 1.
+        r = r[:, None, None]
+    probs = torch.ones_like(x, dtype=torch.float) * r
+    return torch.bernoulli(probs).long()
+def inpaint(x: torch.Tensor,
+    n_prefix,
+    n_suffix,
+):
+    """Build a mask that is 1 everywhere except a kept prefix and suffix.
+    Args:
+        x: tensor of shape (batch, n_codebooks, seq); sets the mask's shape.
+        n_prefix: number of leading timesteps to leave unmasked; a scalar
+            (applied to every batch item) or a 1-D tensor, one per item.
+        n_suffix: same, for trailing timesteps.
+    Returns:
+        long tensor shaped like `x`; 0 in the prefix/suffix, 1 elsewhere.
+    """
+    assert n_prefix is not None
+    assert n_suffix is not None
+    mask = full_mask(x)
+    # Normalize to per-item lengths before comparing anything: testing
+    # `n_prefix > 0` on a multi-element tensor raises "Boolean value of
+    # Tensor ... is ambiguous".
+    if not isinstance(n_prefix, torch.Tensor):
+        n_prefix = scalar_to_batch_tensor(n_prefix, x.shape[0]).to(x.device)
+    if not isinstance(n_suffix, torch.Tensor):
+        n_suffix = scalar_to_batch_tensor(n_suffix, x.shape[0]).to(x.device)
+    for i, n in enumerate(n_prefix):
+        n = int(n)
+        if n > 0:
+            mask[i, :, :n] = 0
+    for i, n in enumerate(n_suffix):
+        n = int(n)
+        # guard n == 0: `-0:` would select (and unmask) the whole sequence
+        if n > 0:
+            mask[i, :, -n:] = 0
+    return mask
+def periodic_mask(x: torch.Tensor,
+                period: int, width: int = 1,
+                random_roll=False,
+    ):
+    """Build a mask that is 1 everywhere except around every `period`-th step.
+    Args:
+        x: tensor of shape (batch, n_codebooks, seq); sets the mask's shape.
+        period: spacing between unmasked positions; a scalar, or a 1-D
+            tensor with one period per batch item. 0 leaves that item
+            fully masked.
+        width: each unmasked position j is widened to
+            [j - width // 2, j + width // 2], clipped to the sequence.
+        random_roll: if True, circularly shift the whole mask along the
+            time axis by a random offset in [0, period of the first item).
+    Returns:
+        long tensor shaped like `x` with values in {0, 1}.
+    """
+    mask = full_mask(x)
+    if not isinstance(period, torch.Tensor):
+        if period == 0:
+            return mask
+        period = scalar_to_batch_tensor(period, x.shape[0])
+    seq_len = mask.shape[-1]
+    half_width = width // 2
+    for i, factor in enumerate(period):
+        step = abs(int(factor))
+        if step == 0:
+            continue
+        # visit the multiples of the period directly instead of testing
+        # every timestep with a modulo
+        for j in range(0, seq_len, step):
+            j_start = max(0, j - half_width)
+            j_end = min(seq_len - 1, j + half_width) + 1
+            mask[i, :, j_start:j_end] = 0
+    first_period = int(period[0])
+    # randint(0, 0) raises, so only roll when there is a non-empty range
+    if random_roll and first_period > 0:
+        offset = torch.randint(0, first_period, (1,))
+        mask = torch.roll(mask, offset.item(), dims=-1)
+    return mask
+def codebook_unmask(
+    mask: torch.Tensor,
+    n_conditioning_codebooks: int
+):
+    """Return a copy of `mask` with the first codebooks set to 0 (unmasked).
+    Args:
+        mask: tensor indexed as (batch, n_codebooks, seq).
+        n_conditioning_codebooks: how many leading codebooks to unmask;
+            None returns `mask` itself, untouched.
+    Returns:
+        the input mask when the count is None, otherwise a modified clone.
+    """
+    # identity check: `== None` is unidiomatic and would compare elementwise
+    # if a tensor were ever passed here
+    if n_conditioning_codebooks is None:
+        return mask
+    # if we have any conditioning codebooks, set their mask to 0
+    mask = mask.clone()
+    mask[:, :n_conditioning_codebooks, :] = 0
+    return mask
+def mask_and(
+    mask1: torch.Tensor,
+    mask2: torch.Tensor
+):
+    """Elementwise AND of two masks: a position is 1 only if both are 1."""
+    assert mask1.shape == mask2.shape, "masks must be same shape"
+    # for 0/1 masks the elementwise minimum is the logical AND
+    return torch.minimum(mask1, mask2)
+def dropout(
+    mask: torch.Tensor,
+    p: float,
+):
+    """Randomly clear entries of `mask`: each 1 is kept with probability 1 - p."""
+    keep_prob = (torch.ones_like(mask) * (1 - p)).float()
+    keep = torch.bernoulli(keep_prob).long()
+    return keep * mask
+def mask_or(
+    mask1: torch.Tensor,
+    mask2: torch.Tensor
+):
+    """Elementwise OR of two binary masks: 1 where either mask is 1."""
+    assert mask1.shape == mask2.shape, "masks must be same shape"
+    assert mask1.max() <= 1, "mask1 must be binary"
+    assert mask2.max() <= 1, "mask2 must be binary"
+    assert mask1.min() >= 0, "mask1 must be binary"
+    assert mask2.min() >= 0, "mask2 must be binary"
+    # the sum is 2 where both masks are set; clamp folds that back to 1
+    combined = mask1 + mask2
+    return torch.clamp(combined, min=0, max=1)
+def time_stretch_mask(
+    x: torch.Tensor,
+    stretch_factor: int,
+    mask_token: int
+):
+    """Stretch `x` in time and mask all but every `stretch_factor`-th step.
+    Each timestep is repeated `stretch_factor` times, the result is trimmed
+    to the original length, and a periodic mask with that period is applied.
+    Returns:
+        whatever `apply_mask` returns for the stretched tokens and the mask.
+    """
+    assert stretch_factor >= 1, "stretch factor must be >= 1"
+    orig_len = x.shape[-1]
+    # repeat along time, then trim back to the original length
+    stretched = x.repeat_interleave(stretch_factor, dim=-1)[:, :, :orig_len]
+    keep_every = periodic_mask(stretched, stretch_factor, width=1)
+    return apply_mask(stretched, keep_every, mask_token)
+if __name__ == "__main__":
+    # Debug aid: raise the summarization threshold so tensors with up to
+    # 10000 elements are printed in full.
+    torch.set_printoptions(threshold=10000)

vampnet/modules/base.py DELETED Viewed

@@ -1,412 +0,0 @@
-import math
-from typing import Optional
-from typing import Tuple
-from typing import Union
-import audiotools as at
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from einops import rearrange
-from tqdm import tqdm
-from ..util import scalar_to_batch_tensor
-def log(t, eps=1e-20):
-    return torch.log(t + eps)
-def gumbel_noise(t):
-    noise = torch.zeros_like(t).uniform_(0, 1)
-    return -log(-log(noise))
-def gumbel_sample(t, temperature=1.0, dim=-1):
-    return ((t / max(temperature, 1e-10)) + gumbel_noise(t)).argmax(dim=dim)
-class VampBase(at.ml.BaseModel):
-    def forward(self, x: torch.Tensor, r: torch.Tensor):
-        raise NotImplementedError
-    def add_noise(
-        self,
-        x: torch.Tensor,
-        r: torch.Tensor,
-        random_x: Optional[torch.Tensor] = None,
-        mask: Optional[torch.Tensor] = None,
-        ext_mask: Optional[torch.Tensor] = None,
-        n_prefix: Optional[torch.Tensor] = None,
-        n_suffix: Optional[torch.Tensor] = None,
-        downsample_factor: Optional[int] = None,
-        periodic_width: int = 1,
-        periodic_width_dropout: float = 0.0,
-        periodic_dropout: float = 0.0,
-        add_random_periodic_offset: bool = False,  # TODO: should be always false lol this is hacky
-        n_conditioning_codebooks: Optional[int] = None,
-        noise_mode: str = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        assert x.ndim == 3, "x must be (batch, n_codebooks, seq)"
-        if mask is None:
-            if not isinstance(r, torch.Tensor):
-                r = scalar_to_batch_tensor(r, x.shape[0]).to(x.device)
-            r = self.gamma(r)[:, None, None]
-            probs = torch.ones_like(x) * r
-            # if we have a prefix or suffix, set their mask prob to 0
-            if n_prefix is not None:
-                if not isinstance(n_prefix, torch.Tensor):
-                    n_prefix = scalar_to_batch_tensor(n_prefix, x.shape[0]).to(x.device)
-                for i, n in enumerate(n_prefix):
-                    if n > 0:
-                        probs[i, :, :n] = 0.0
-            if n_suffix is not None:
-                if not isinstance(n_suffix, torch.Tensor):
-                    n_suffix = scalar_to_batch_tensor(n_suffix, x.shape[0]).to(x.device)
-                for i, n in enumerate(n_suffix):
-                    if n > 0:
-                        probs[i, :, -n:] = 0.0
-            # if we have a downsample factor, set the mask prob to 0
-            if downsample_factor is not None and downsample_factor > 0:
-                if not isinstance(downsample_factor, torch.Tensor):
-                    downsample_factor = scalar_to_batch_tensor(downsample_factor, x.shape[0])
-                for i, factor in enumerate(downsample_factor):
-                    if factor == 0:
-                        continue
-                    for j in range(probs.shape[-1]):
-                        if j % factor == 0:
-                            # if we have periodic dropout
-                            if periodic_dropout > 0:
-                                # flip a coin
-                                if torch.bernoulli(torch.tensor(periodic_dropout)).item() == 1:
-                                    # if we win, skip
-                                    continue
-                            # figure out how wide the mask should be
-                            j_start = max(0, j - periodic_width // 2)
-                            j_end = min(probs.shape[-1] - 1, j + periodic_width // 2) + 1
-                            # flip a coin for each position in the mask
-                            j_mask = torch.bernoulli(torch.ones(j_end - j_start) * periodic_width_dropout)
-                            j_fill = torch.ones_like(j_mask) * (1 - j_mask)
-                            # fill
-                            probs[i, :, j_start:j_end] = 1 - j_fill
-                if add_random_periodic_offset:
-                    # add a random offset to the mask
-                    offset = torch.randint(0, downsample_factor[0], (1,))
-                    probs = torch.roll(probs, offset.item(), dims=-1)
-            mask = torch.bernoulli(probs)
-            mask = mask.round().long()
-            # if we have any conditioning codebooks, set their mask  to 0
-            n_conditioning_codebooks = n_conditioning_codebooks or self.n_conditioning_codebooks
-            mask[:, :n_conditioning_codebooks, :] = 0
-        else:
-            assert mask.ndim == 3, "mask must be (batch, n_codebooks, seq)"
-            assert mask.shape == x.shape, "mask must be same shape as x"
-        if random_x is None:
-            random_x = torch.randint_like(x, 0, self.vocab_size)
-        noise_mode = noise_mode if noise_mode is not None else self.noise_mode
-        if noise_mode == "mask":
-            random_x = torch.full_like(x, self.mask_token)
-        elif noise_mode == "random":
-            if random_x is None:
-                random_x = torch.randint_like(x, 0, self.vocab_size)
-        else:
-            raise ValueError(f"invalid noise mode {noise_mode}")
-        # add the external mask if we were given one
-        if ext_mask is not None:
-            assert ext_mask.ndim == 3, "mask must be (batch, n_codebooks, seq)"
-            mask = (mask * ext_mask).bool().long()
-        x = x * (1 - mask) + random_x * mask
-        return x, mask
-    def add_truth_to_logits(
-        self,
-        z_true,
-        z_hat,
-        mask,
-    ):
-        if self.noise_mode == "mask":
-            z_true = z_true[:, self.n_conditioning_codebooks :, :]
-            mask = mask[:, self.n_conditioning_codebooks :, :]
-            truth = F.one_hot(z_true, self.vocab_size)
-            mask = mask[:, :, :, None].expand(-1, -1, -1, self.vocab_size)
-            z_hat = rearrange(
-                z_hat,
-                "b p (t c) -> b c t p",
-                c=self.n_codebooks - self.n_conditioning_codebooks,
-            )
-            z_hat = z_hat * mask + truth * (1 - mask)
-            z_hat = rearrange(z_hat, "b c t p -> b p (t c)")
-        else:
-            raise ValueError(f"invalid noise mode for adding truth to logits {self.noise_mode}")
-        return z_hat
-    def gamma(self, r):
-        return (r * torch.pi / 2).cos()
-    def invgamma(self, y):
-        if not torch.is_tensor(y):
-            y = torch.tensor(y)[None]
-        return 2 * y.acos() / torch.pi
-    def r_embed(self, r, max_positions=10000):
-        """ """
-        assert hasattr(self, "r_cond_dim"), "must set r_cond_dim before calling r_embed"
-        if self.r_cond_dim > 0:
-            dtype = r.dtype
-            r = self.gamma(r) * max_positions
-            half_dim = self.r_cond_dim // 2
-            emb = math.log(max_positions) / (half_dim - 1)
-            emb = torch.arange(half_dim, device=r.device).float().mul(-emb).exp()
-            emb = r[:, None] * emb[None, :]
-            emb = torch.cat([emb.sin(), emb.cos()], dim=1)
-            if self.r_cond_dim % 2 == 1:  # zero pad
-                emb = nn.functional.pad(emb, (0, 1), mode="constant")
-            return emb.to(dtype)
-        else:
-            return r
-    @torch.no_grad()
-    def to_signal(self, z, codec):
-        """
-        convert a sequence of latents to a signal.
-        """
-        if z.ndim == 2:
-            z = self.embedding.unflatten(z)
-        assert z.ndim == 3
-        signal = at.AudioSignal(
-            codec.decode(
-                codec.quantizer.from_latents(self.embedding.from_codes(z, codec))[0]
-            )["audio"],
-            codec.sample_rate,
-        )
-        # find where the mask token is and replace it with silence in the audio
-        for tstep in range(z.shape[-1]):
-            if torch.any(z[:, :, tstep] == self.mask_token):
-                sample_idx_0 = tstep * codec.hop_length
-                sample_idx_1 = sample_idx_0 + codec.hop_length
-                signal.samples[:, :, sample_idx_0:sample_idx_1] = 0.0
-        return signal
-    @torch.no_grad()
-    def sample(
-        self,
-        codec,
-        time_steps: int = 300,
-        sampling_steps: int = 36,
-        start_tokens: Optional[torch.Tensor] = None,
-        mask: Optional[torch.Tensor] = None,
-        temperature: Union[float, Tuple[float, float]] = 0.8,
-        top_k: int = None,
-        sample: str = "gumbel",
-        typical_filtering=True,
-        typical_mass=0.2,
-        typical_min_tokens=1,
-        return_signal=True,
-    ):
-        if isinstance(temperature, float):
-            temperature = torch.tensor(temperature).repeat(sampling_steps)
-        elif isinstance(temperature, tuple):
-            assert len(temperature) == 2
-            l, h = temperature
-            temperature = torch.linspace(l, h, sampling_steps)
-        else:
-            raise TypeError(f"invalid type for temperature")
-        def flatten(codes):
-            return rearrange(codes, "b c t -> b (t c)")
-        def unflatten(codes, c):
-            return rearrange(codes, "b (t c) -> b c t", c=c)
-        z = start_tokens
-        if z is None:
-            z = torch.full((1, self.n_codebooks, time_steps), self.mask_token).to(
-                self.device
-            )
-        if mask is None:
-            mask = torch.ones_like(z).to(self.device).int()
-            mask[:, : self.n_conditioning_codebooks, :] = 0.0
-        if mask.ndim == 2:
-            mask = mask[:, None, :].repeat(1, z.shape[1], 1)
-        # figure out which timesteps we're keeping
-        keep_mask = 1 - mask
-        # any conditioning codebook levels need to be in the keep mask
-        # if self.n_conditioning_codebooks > 0:
-        #     cond_mask = torch.ones(z.shape[0], self.n_conditioning_codebooks, z.shape[-1]).to(z.device)
-        #     keep_mask = torch.cat([cond_mask, keep_mask], dim=1)
-        # flatten
-        keep_mask = flatten(keep_mask)
-        # our r steps
-        r_steps = torch.linspace(0, 1, sampling_steps + 1)[1:].to(self.device)
-        # how many tokens did we keep on init?
-        num_kept_on_init = keep_mask.sum()
-        # how many codebooks are we inferring vs conditioning on?
-        n_infer_codebooks = self.n_codebooks - self.n_conditioning_codebooks
-        for i in range(sampling_steps):
-            # our current temperature
-            tmpt = temperature[i]
-            # our current schedule step
-            r = r_steps[i : i + 1]
-            with torch.inference_mode():
-                # mask our z
-                keep_mask_unflat = unflatten(keep_mask, c=self.n_codebooks)
-                z_masked = z.masked_fill(~keep_mask_unflat.bool(), self.mask_token)
-                # get latents
-                latents = self.embedding.from_codes(z_masked, codec)
-                # infer from latents
-                logits = self.forward(latents, r)
-                logits = logits.permute(0, 2, 1)  # b, seq, prob
-                # the schedule determines how many samples to keep
-                num_tokens_to_infer = (z.shape[-1] * z.shape[-2]) - num_kept_on_init
-                num_to_keep = num_kept_on_init + int(
-                    num_tokens_to_infer * (self.gamma(1 - r))
-                )
-                # figure out which logits we wanna keep
-                if num_to_keep > 0:
-                    probs = logits.softmax(dim=-1)
-                    # do mod self.vocab_size to make sure we don't sample from the mask token
-                    # in case the mask token was in the og z
-                    keep_probs = F.one_hot(z%self.vocab_size, self.vocab_size)[:, :, :]
-                    probs = rearrange(
-                        probs, "b (t c) p -> b c t p", c=n_infer_codebooks
-                    )
-                    probs = torch.cat(
-                        [keep_probs[:, : self.n_conditioning_codebooks, ...], probs],
-                        dim=1,
-                    )
-                    keep_probs = rearrange(
-                        keep_probs, "b c t p -> b (t c) p", c=self.n_codebooks
-                    )
-                    probs = rearrange(probs, "b c t p -> b (t c) p", c=self.n_codebooks)
-                    keep_prob_mask = keep_mask.unsqueeze(-1).repeat(
-                        1, 1, self.vocab_size
-                    )
-                    probs = (keep_prob_mask.long() * keep_probs) + (
-                        1 - keep_prob_mask.long()
-                    ) * probs
-                    highest_probs = probs.max(dim=-1, keepdim=False)[0]
-                    v, _ = highest_probs.topk(num_to_keep, dim=-1)
-                    keep_mask = torch.ones_like(keep_mask).bool().clone()
-                    keep_mask[highest_probs < v[..., [-1]]] = 0
-                logits = torch.log(probs)
-                z_inferred = self.sample_from_logits(
-                    logits=logits,
-                    top_k=top_k,
-                    temperature=tmpt,
-                    sample=sample,
-                    typical_filtering=typical_filtering,
-                    typical_mass=typical_mass,
-                    typical_min_tokens=typical_min_tokens,
-                )
-                z = rearrange(z_inferred, "b (t c) -> b c t", c=self.n_codebooks)
-                # add conditioning codebooks back
-                # z = torch.cat([z[:, :self.n_conditioning_codebooks, :], z_inferred], dim=1)
-        if return_signal:
-            return self.to_signal(z, codec)
-        else:
-            return z
-    def sample_from_logits(
-        self,
-        logits,
-        top_k: int = None,
-        temperature: float = 1.0,
-        sample: str = "multinomial",
-        typical_filtering=False,
-        typical_mass=0.2,
-        typical_min_tokens=1,
-    ):
-        # add temperature
-        logits = logits / temperature
-        # add topk
-        if top_k is not None:
-            v, topk_idx = logits.topk(top_k)
-            logits[logits < v[..., [-1]]] = -float("inf")
-        if typical_filtering:
-            assert top_k is None
-            nb, nt, _ = logits.shape
-            x_flat = rearrange(logits, "b t l -> (b t ) l")
-            x_flat_norm = torch.nn.functional.log_softmax(x_flat, dim=-1)
-            x_flat_norm_p = torch.exp(x_flat_norm)
-            entropy = -(x_flat_norm * x_flat_norm_p).nansum(-1, keepdim=True)
-            c_flat_shifted = torch.abs((-x_flat_norm) - entropy)
-            c_flat_sorted, x_flat_indices = torch.sort(c_flat_shifted, descending=False)
-            x_flat_cumsum = (
-                x_flat.gather(-1, x_flat_indices).softmax(dim=-1).cumsum(dim=-1)
-            )
-            last_ind = (x_flat_cumsum < typical_mass).sum(dim=-1)
-            sorted_indices_to_remove = c_flat_sorted > c_flat_sorted.gather(
-                1, last_ind.view(-1, 1)
-            )
-            if typical_min_tokens > 1:
-                sorted_indices_to_remove[..., :typical_min_tokens] = 0
-            indices_to_remove = sorted_indices_to_remove.scatter(
-                1, x_flat_indices, sorted_indices_to_remove
-            )
-            x_flat = x_flat.masked_fill(indices_to_remove, -float("Inf"))
-            logits = rearrange(x_flat, "(b t) l -> b t l", t=nt)
-        if sample == "multinomial":
-            probs = torch.softmax(logits, dim=-1)
-            inferred = torch.stack([pr.multinomial(1).squeeze(-1) for pr in probs])
-        elif sample == "argmax":
-            inferred = torch.softmax(logits, dim=-1).argmax(dim=-1)
-        elif sample == "gumbel":
-            inferred = gumbel_sample(logits, dim=-1)
-        else:
-            raise ValueError(f"invalid sampling method: {sample}")
-        return inferred

vampnet/modules/layers.py CHANGED Viewed

@@ -162,20 +162,3 @@ class CodebookEmbedding(nn.Module):
         x = self.out_proj(latents)
         return x
-    def flatten(self, tokens: torch.Tensor, n_codebooks: int = None):
-        """
-        flatten a sequence of tokens from (batch, codebook, time) to (batch, codebook * time)
-        """
-        n_c = n_codebooks if n_codebooks is not None else self.n_codebooks
-        return rearrange(tokens, "b c t -> b (t c)", c=n_c)
-    def unflatten(self, flat_tokens: torch.Tensor, n_codebooks: int = None):
-        """
-        unflatten a sequence of tokens from (batch, codebook * time) to (batch, codebook, time)
-        """
-        nb, nt = flat_tokens.shape
-        n_c = n_codebooks if n_codebooks is not None else self.n_codebooks
-        tokens = rearrange(flat_tokens, "b (t c) -> b c t", c=n_c)
-        return tokens


162	x = self.out_proj(latents)
163	return x
164

vampnet/modules/transformer.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import math
 import numpy as np
 import torch
@@ -6,16 +7,30 @@ import torch.nn as nn
 import torch.nn.functional as F
 from einops import rearrange
 import loralib as lora
-from .base import VampBase
 from .activations import get_activation
 from .layers import CodebookEmbedding
 from .layers import FiLM
 from .layers import SequentialWithFiLM
 from .layers import WNConv1d
 LORA_R = 8
 class RMSNorm(nn.Module):
     def __init__(self, hidden_size: int, eps=1e-6):
@@ -435,7 +450,7 @@ class TransformerStack(nn.Module):
         return self.norm(x) if self.norm is not None else x
-class VampNet(VampBase):
     def __init__(
         self,
         n_heads: int = 20,
@@ -519,6 +534,270 @@ class VampNet(VampBase):
         out = rearrange(out, "b (p c) t -> b p (t c)", c=self.n_predict_codebooks)
         return out
 if __name__ == "__main__":
@@ -538,8 +817,7 @@ if __name__ == "__main__":
         ).to(device)
         r = torch.zeros(batch_size).to(device)
-        z_mask, mask = model.add_noise(z, r)
         z_mask_latent = torch.rand(
             batch_size, model.latent_dim * model.n_codebooks, seq_len
         ).to(device)

 import math
+from typing import Optional, Tuple, Union
 import numpy as np
 import torch
 import torch.nn.functional as F
 from einops import rearrange
 import loralib as lora
+import audiotools as at
 from .activations import get_activation
 from .layers import CodebookEmbedding
 from .layers import FiLM
 from .layers import SequentialWithFiLM
 from .layers import WNConv1d
+from ..util import scalar_to_batch_tensor, codebook_flatten, codebook_unflatten
+from ..mask import _gamma
 LORA_R = 8
+def log(t, eps=1e-20):
+    """Natural log of `t + eps`; the small offset keeps log(0) finite."""
+    shifted = t + eps
+    return torch.log(shifted)
+def gumbel_noise(t):
+    """
+    Draw noise shaped like `t`: uniform samples u in [0, 1) mapped through
+    -log(-log(u)), using the eps-stabilized `log` helper.
+    """
+    uniform = torch.zeros_like(t).uniform_(0, 1)
+    inner = log(uniform)
+    return -log(-inner)
+def gumbel_sample(t, temperature=1.0, dim=-1):
+    """
+    Sample indices along `dim` by adding Gumbel noise to `t / temperature`
+    and taking the argmax. The temperature is floored at 1e-10 to avoid
+    dividing by zero.
+    """
+    safe_temperature = max(temperature, 1e-10)
+    perturbed = (t / safe_temperature) + gumbel_noise(t)
+    return perturbed.argmax(dim=dim)
 class RMSNorm(nn.Module):
     def __init__(self, hidden_size: int, eps=1e-6):
         return self.norm(x) if self.norm is not None else x
+class VampNet(at.ml.BaseModel):
     def __init__(
         self,
         n_heads: int = 20,
         out = rearrange(out, "b (p c) t -> b p (t c)", c=self.n_predict_codebooks)
         return out
+    def r_embed(self, r, max_positions=10000):
+        """
+        Sinusoidal embedding of the schedule value `r`.
+        When `self.r_cond_dim` is not positive, `r` is returned unchanged.
+        Otherwise `r` is passed through `_gamma`, scaled by `max_positions`,
+        and multiplied by `r_cond_dim // 2` exponentially decaying
+        frequencies; the sines and cosines are concatenated along dim 1 and
+        zero-padded by one column when `r_cond_dim` is odd. The result is cast
+        back to the dtype of `r`.
+        NOTE(review): the frequency step divides by `half_dim - 1`, so
+        r_cond_dim of 2 or 3 would divide by zero -- confirm such sizes are
+        never configured.
+        """
+        if not self.r_cond_dim > 0:
+            return r
+        in_dtype = r.dtype
+        r = _gamma(r) * max_positions
+        half_dim = self.r_cond_dim // 2
+        step = math.log(max_positions) / (half_dim - 1)
+        freqs = torch.arange(half_dim, device=r.device).float().mul(-step).exp()
+        angles = r[:, None] * freqs[None, :]
+        emb = torch.cat([angles.sin(), angles.cos()], dim=1)
+        if self.r_cond_dim % 2 == 1:  # zero pad
+            emb = nn.functional.pad(emb, (0, 1), mode="constant")
+        return emb.to(in_dtype)
+    @torch.no_grad()
+    def to_signal(self, z, codec):
+        """
+        Decode a 3-dimensional tensor of codes `z` into an audio signal.
+        The codes are turned into latents by `self.embedding.from_codes`,
+        re-quantized and decoded by `codec`, and wrapped in an
+        `at.AudioSignal` at `codec.sample_rate`. Any step along the last axis
+        of `z` that contains `self.mask_token` (in any batch item or
+        codebook) has its `codec.hop_length`-sample span zeroed in the output.
+        """
+        assert z.ndim == 3
+        latents = self.embedding.from_codes(z, codec)
+        quantized = codec.quantizer.from_latents(latents)[0]
+        audio = codec.decode(quantized)["audio"]
+        signal = at.AudioSignal(audio, codec.sample_rate)
+        # find where the mask token is and replace it with silence in the audio
+        for tstep in range(z.shape[-1]):
+            if not torch.any(z[:, :, tstep] == self.mask_token):
+                continue
+            start = tstep * codec.hop_length
+            stop = start + codec.hop_length
+            signal.samples[:, :, start:stop] = 0.0
+        return signal
+    def add_truth_to_logits(
+        self,
+        z_true,
+        z_hat,
+        mask,
+    ):
+        """
+        Overwrite the predictions at unmasked positions with the ground truth.
+        Only supported when `self.noise_mode == "mask"`; any other mode
+        raises ValueError. The conditioning codebooks are dropped from
+        `z_true` and `mask`, `z_true` is one-hot encoded over
+        `self.vocab_size`, and `z_hat` is blended so that positions with
+        mask == 1 keep the prediction while positions with mask == 0 take
+        the one-hot truth. The result is returned in the layout `z_hat`
+        came in, "b p (t c)".
+        """
+        if self.noise_mode != "mask":
+            raise ValueError(f"invalid noise mode for adding truth to logits {self.noise_mode}")
+        n_cond = self.n_conditioning_codebooks
+        z_true = z_true[:, n_cond:, :]
+        mask = mask[:, n_cond:, :]
+        truth = F.one_hot(z_true, self.vocab_size)
+        # broadcast the mask over the vocabulary axis
+        mask = mask[:, :, :, None].expand(-1, -1, -1, self.vocab_size)
+        z_hat = rearrange(
+            z_hat,
+            "b p (t c) -> b c t p",
+            c=self.n_codebooks - n_cond,
+        )
+        blended = z_hat * mask + truth * (1 - mask)
+        return rearrange(blended, "b c t p -> b p (t c)")
+    @torch.no_grad()
+    def sample(
+        self,
+        codec,
+        time_steps: int = 300,
+        sampling_steps: int = 36,
+        start_tokens: Optional[torch.Tensor] = None,
+        mask: Optional[torch.Tensor] = None,
+        temperature: Union[float, Tuple[float, float]] = 0.8,
+        top_k: int = None,
+        sample: str = "gumbel",
+        typical_filtering=True,
+        typical_mass=0.2,
+        typical_min_tokens=1,
+        return_signal=True,
+    ):
+        """
+        Iteratively generate tokens over `sampling_steps` refinement steps.
+        Args:
+            codec: codec passed to `self.embedding.from_codes` and, when
+                `return_signal` is True, to `self.to_signal`.
+            time_steps: sequence length used only when `start_tokens` is None.
+            sampling_steps: number of refinement iterations.
+            start_tokens: initial tokens; if None, a single-item batch filled
+                with `self.mask_token` is created.
+            mask: 1 marks positions to generate, 0 marks positions to keep.
+                Defaults to generating everything except the conditioning
+                codebooks. A 2-dimensional mask is repeated across codebooks.
+            temperature: a single number used for every step, or a
+                (low, high) tuple interpolated linearly across the steps.
+            top_k, sample, typical_filtering, typical_mass,
+            typical_min_tokens: forwarded to `sample_from_logits`.
+            return_signal: if True return `self.to_signal(z, codec)`,
+                otherwise return the tokens.
+        Raises:
+            TypeError: if `temperature` is neither a number nor a tuple.
+        """
+        # accept plain ints (e.g. temperature=1) as well as floats
+        if isinstance(temperature, (int, float)):
+            temperature = torch.tensor(float(temperature)).repeat(sampling_steps)
+        elif isinstance(temperature, tuple):
+            assert len(temperature) == 2
+            l, h = temperature
+            temperature = torch.linspace(l, h, sampling_steps)
+        else:
+            raise TypeError("invalid type for temperature")
+        z = start_tokens
+        if z is None:
+            z = torch.full((1, self.n_codebooks, time_steps), self.mask_token).to(
+                self.device
+            )
+        if mask is None:
+            mask = torch.ones_like(z).to(self.device).int()
+            mask[:, : self.n_conditioning_codebooks, :] = 0.0
+        if mask.ndim == 2:
+            mask = mask[:, None, :].repeat(1, z.shape[1], 1)
+        # figure out which timesteps we're keeping
+        keep_mask = 1 - mask
+        # flatten
+        keep_mask = codebook_flatten(keep_mask)
+        # our r steps
+        r_steps = torch.linspace(0, 1, sampling_steps + 1)[1:].to(self.device)
+        # how many tokens did we keep on init?
+        num_kept_on_init = keep_mask.sum()
+        # how many codebooks are we inferring vs conditioning on?
+        n_infer_codebooks = self.n_codebooks - self.n_conditioning_codebooks
+        for i in range(sampling_steps):
+            # our current temperature
+            tmpt = temperature[i]
+            # our current schedule step
+            r = r_steps[i : i + 1]
+            with torch.inference_mode():
+                # mask our z
+                keep_mask_unflat = codebook_unflatten(keep_mask, n_c=self.n_codebooks)
+                z_masked = z.masked_fill(~keep_mask_unflat.bool(), self.mask_token)
+                # get latents
+                latents = self.embedding.from_codes(z_masked, codec)
+                # infer from latents
+                logits = self.forward(latents, r)
+                logits = logits.permute(0, 2, 1)  # b, seq, prob
+                # the schedule determines how many samples to keep
+                num_tokens_to_infer = (z.shape[-1] * z.shape[-2]) - num_kept_on_init
+                num_to_keep = num_kept_on_init + int(
+                    num_tokens_to_infer * (_gamma(1 - r))
+                )
+                # build the full-codebook probabilities on every step, so that
+                # `probs` always exists for the log below even when there is
+                # nothing to keep yet (num_to_keep == 0)
+                probs = logits.softmax(dim=-1)
+                # do mod self.vocab_size to make sure we don't sample from the mask token
+                # in case the mask token was in the og z
+                keep_probs = F.one_hot(z % self.vocab_size, self.vocab_size)
+                probs = rearrange(
+                    probs, "b (t c) p -> b c t p", c=n_infer_codebooks
+                )
+                # conditioning codebooks are never predicted: use their one-hot codes
+                probs = torch.cat(
+                    [keep_probs[:, : self.n_conditioning_codebooks, ...], probs],
+                    dim=1,
+                )
+                keep_probs = rearrange(
+                    keep_probs, "b c t p -> b (t c) p", c=self.n_codebooks
+                )
+                probs = rearrange(probs, "b c t p -> b (t c) p", c=self.n_codebooks)
+                keep_prob_mask = keep_mask.unsqueeze(-1).repeat(
+                    1, 1, self.vocab_size
+                )
+                # kept positions get their one-hot code, the rest the model output
+                probs = (keep_prob_mask.long() * keep_probs) + (
+                    1 - keep_prob_mask.long()
+                ) * probs
+                # figure out which logits we wanna keep
+                if num_to_keep > 0:
+                    highest_probs = probs.max(dim=-1, keepdim=False)[0]
+                    v, _ = highest_probs.topk(num_to_keep, dim=-1)
+                    keep_mask = torch.ones_like(keep_mask).bool().clone()
+                    keep_mask[highest_probs < v[..., [-1]]] = 0
+                logits = torch.log(probs)
+                z_inferred = self.sample_from_logits(
+                    logits=logits,
+                    top_k=top_k,
+                    temperature=tmpt,
+                    sample=sample,
+                    typical_filtering=typical_filtering,
+                    typical_mass=typical_mass,
+                    typical_min_tokens=typical_min_tokens,
+                )
+                z = codebook_unflatten(z_inferred, n_c=self.n_codebooks)
+        if return_signal:
+            return self.to_signal(z, codec)
+        else:
+            return z
+    def sample_from_logits(
+        self,
+        logits,
+        top_k: int = None,
+        temperature: float = 1.0,
+        sample: str = "multinomial",
+        typical_filtering=False,
+        typical_mass=0.2,
+        typical_min_tokens=1,
+    ):
+        """
+        Pick one token per position from `logits`.
+        Args:
+            logits: 3-dimensional tensor whose last axis is the vocabulary.
+            top_k: if given, everything below the k-th largest logit of each
+                position is set to -inf. Must be None with typical filtering.
+            temperature: divisor applied to the logits first.
+            sample: "multinomial", "argmax" or "gumbel".
+            typical_filtering: if True, keep only the tokens whose
+                information content is closest to the entropy of the
+                distribution, up to a cumulative mass of `typical_mass`.
+            typical_mass: cumulative probability mass kept by the filter.
+            typical_min_tokens: when > 1, that many closest tokens are
+                always kept by the filter.
+        Returns:
+            Tensor of chosen token indices, with the vocabulary axis removed.
+        Raises:
+            ValueError: if `sample` is not one of the supported methods.
+        """
+        # add temperature
+        logits = logits / temperature
+        # add topk
+        if top_k is not None:
+            v, _ = logits.topk(top_k)
+            logits[logits < v[..., [-1]]] = -float("inf")
+        if typical_filtering:
+            assert top_k is None
+            _, nt, _ = logits.shape
+            x_flat = rearrange(logits, "b t l -> (b t ) l")
+            x_flat_norm = torch.nn.functional.log_softmax(x_flat, dim=-1)
+            x_flat_norm_p = torch.exp(x_flat_norm)
+            # nansum: -inf * 0 terms produce nan and must not poison the entropy
+            entropy = -(x_flat_norm * x_flat_norm_p).nansum(-1, keepdim=True)
+            # distance of each token's information content from the entropy
+            c_flat_shifted = torch.abs((-x_flat_norm) - entropy)
+            c_flat_sorted, x_flat_indices = torch.sort(c_flat_shifted, descending=False)
+            x_flat_cumsum = (
+                x_flat.gather(-1, x_flat_indices).softmax(dim=-1).cumsum(dim=-1)
+            )
+            last_ind = (x_flat_cumsum < typical_mass).sum(dim=-1)
+            # if the cumulative mass never reaches typical_mass (rounding, or
+            # typical_mass >= 1) last_ind equals the vocabulary size, which
+            # would index out of range in the gather below
+            last_ind = last_ind.clamp(max=c_flat_sorted.shape[-1] - 1)
+            sorted_indices_to_remove = c_flat_sorted > c_flat_sorted.gather(
+                1, last_ind.view(-1, 1)
+            )
+            if typical_min_tokens > 1:
+                sorted_indices_to_remove[..., :typical_min_tokens] = 0
+            # map the removal flags from sorted order back to vocabulary order
+            indices_to_remove = sorted_indices_to_remove.scatter(
+                1, x_flat_indices, sorted_indices_to_remove
+            )
+            x_flat = x_flat.masked_fill(indices_to_remove, -float("Inf"))
+            logits = rearrange(x_flat, "(b t) l -> b t l", t=nt)
+        if sample == "multinomial":
+            probs = torch.softmax(logits, dim=-1)
+            inferred = torch.stack([pr.multinomial(1).squeeze(-1) for pr in probs])
+        elif sample == "argmax":
+            inferred = torch.softmax(logits, dim=-1).argmax(dim=-1)
+        elif sample == "gumbel":
+            inferred = gumbel_sample(logits, dim=-1)
+        else:
+            raise ValueError(f"invalid sampling method: {sample}")
+        return inferred
 if __name__ == "__main__":
         ).to(device)
         r = torch.zeros(batch_size).to(device)
         z_mask_latent = torch.rand(
             batch_size, model.latent_dim * model.n_codebooks, seq_len
         ).to(device)

vampnet/signal.py DELETED Viewed

@@ -1,5 +0,0 @@
-import torch
-from typing import Optional, Tuple
-from .util import scalar_to_batch_tensor

vampnet/util.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import tqdm
 import torch
 def scalar_to_batch_tensor(x, batch_size):
     return torch.tensor(x).repeat(batch_size)
@@ -29,4 +30,17 @@ def parallelize(
     elif parallel == "single":
         return [fn(x) for x in tqdm.tqdm(*iterables)]
     else:
-        raise ValueError(f"parallel must be one of 'thread_map', 'process_map', 'single', but got {parallel}")

 import tqdm
 import torch
+from einops import rearrange
 def scalar_to_batch_tensor(x, batch_size):
     return torch.tensor(x).repeat(batch_size)
     elif parallel == "single":
         return [fn(x) for x in tqdm.tqdm(*iterables)]
     else:
+        raise ValueError(f"parallel must be one of 'thread_map', 'process_map', 'single', but got {parallel}")
+def codebook_flatten(tokens: torch.Tensor):
+    """
+    Collapse (batch, codebook, time) tokens into (batch, time * codebook).
+    The codebook index varies fastest in the flattened axis, i.e. all
+    codebooks of step 0 come first, then all codebooks of step 1, and so on.
+    """
+    pattern = "b c t -> b (t c)"
+    return rearrange(tokens, pattern)
+def codebook_unflatten(flat_tokens: torch.Tensor, n_c: int = None):
+    """
+    Expand (batch, time * codebook) tokens back into (batch, codebook, time).
+    `n_c` is the number of codebooks and is handed to einops as the length
+    of the `c` axis.
+    NOTE(review): the default of None is passed straight through to
+    `rearrange`; callers presumably always need to supply `n_c` -- confirm.
+    """
+    pattern = "b (t c) -> b c t"
+    return rearrange(flat_tokens, pattern, c=n_c)