Hugo Flores Garcia committed
Commit c6f0e5a · 1 parent: a022742

new stuff

Changed files:
- app.py +136 -32
- conf/generated/church-bells/c2f.yml +15 -0
- conf/generated/church-bells/coarse.yml +8 -0
- conf/generated/church-bells/interface.yml +6 -0
- conf/generated/copepod/c2f.yml +15 -0
- conf/generated/copepod/coarse.yml +8 -0
- conf/generated/copepod/interface.yml +6 -0
- conf/generated/growl/c2f.yml +16 -0
- conf/generated/growl/coarse.yml +9 -0
- conf/generated/growl/interface.yml +7 -0
- conf/generated/sample-instrument/c2f.yml +15 -0
- conf/generated/sample-instrument/coarse.yml +8 -0
- conf/generated/sample-instrument/interface.yml +6 -0
- models/models/vampnet/c2f.pth +3 -0
- models/models/vampnet/coarse.pth +3 -0
- models/models/vampnet/codec.pth +3 -0
- models/models/wavebeat.pth +3 -0
- scripts/utils/split.py +5 -3
- vampnet/interface.py +101 -12
- vampnet/mask.py +10 -3
- vampnet/modules/transformer.py +211 -9
app.py
CHANGED
@@ -7,13 +7,15 @@ import audiotools as at
 import argbind
 import shutil
 import torch
+from datetime import datetime
 
 import gradio as gr
-from vampnet.interface import Interface
+from vampnet.interface import Interface, signal_concat
 from vampnet import mask as pmask
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
+
 interface = Interface(
     device=device,
     coarse_ckpt="models/vampnet/coarse.pth",
@@ -78,11 +80,20 @@ def shift_pitch(signal, interval: int):
     )
     return signal
 
-def _vamp(data, return_mask=False):
+def _vamp(data, api: bool=False, # bypasses mask audio generation and other things to be faster
+    ):
+
+    _seed = data[seed] if data[seed] > 0 else None
+    if _seed is None:
+        # create a random seed
+        _seed = int(torch.randint(0, 2**32, (1,)).item())
+    at.util.seed(_seed)
 
-    …
+    datentime = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
+    out_dir = OUT_DIR / f"{Path(data[input_audio]).stem}-{datentime}-seed-{_seed}-model-{data[model_choice]}"
     out_dir.mkdir(parents=True)
     sig = at.AudioSignal(data[input_audio])
+    sig.write(out_dir / "input.wav")
 
     # reload the model if necessary
     interface.reload(
@@ -96,18 +107,21 @@ def _vamp(data, return_mask=False):
     if data[pitch_shift_amt] != 0:
         sig = shift_pitch(sig, data[pitch_shift_amt])
 
+    _p2 = data[periodic_p] if data[p2] == 0 else data[p2]
+    _n_codebooks_2 = data[n_mask_codebooks] if data[n_mask_codebooks_2] == 0 else data[n_mask_codebooks_2]
     build_mask_kwargs = dict(
         rand_mask_intensity=data[rand_mask_intensity],
         prefix_s=data[prefix_s],
         suffix_s=data[suffix_s],
         periodic_prompt=data[periodic_p],
+        periodic_prompt2=_p2,
         periodic_prompt_width=data[periodic_w],
         onset_mask_width=data[onset_mask_width],
         _dropout=data[dropout],
-        upper_codebook_mask=int(data[n_mask_codebooks])
+        upper_codebook_mask=int(data[n_mask_codebooks]),
+        upper_codebook_mask_2=int(_n_codebooks_2),
     )
 
-    _seed = data[seed] if data[seed] > 0 else None
     vamp_kwargs = dict(
         # _sampling_steps=[data[num_steps], 8, 8, 4, 4, 2, 2, 1, 1],
         mask_temperature=data[masktemp]*10,
@@ -121,31 +135,81 @@ def _vamp(data, return_mask=False):
     )
 
     # save the mask as a txt file
-    …
+    interface.set_chunk_size(data[win_dur])
+    sig, mask, codes = interface.ez_vamp(
         sig,
-        batch_size=1,
+        batch_size=4 if not api else 1,
         feedback_steps=data[num_feedback_steps],
         time_stretch_factor=data[stretch_factor],
         build_mask_kwargs=build_mask_kwargs,
         vamp_kwargs=vamp_kwargs,
-        return_mask=…
+        return_mask=True,
     )
 
-    …
+    if api:
+        sig.write(out_dir / "out.wav")
 
-    if return_mask:
-        mask = interface.to_signal(mask.cuda()).cpu()
-        mask.write(out_dir / "mask.wav")
-        return sig.path_to_file, mask.path_to_file
-    else:
         return sig.path_to_file
 
+    if not api:
+        # write codes to numpy file
+        np.save(out_dir / "codes.npy", codes.cpu().numpy())
+        metadata = {}
+        metadata["seed"] = _seed
+        metadata["model_choice"] = data[model_choice]
+        metadata["mask_kwargs"] = build_mask_kwargs
+        metadata["vamp_kwargs"] = vamp_kwargs
+        metadata["loudness"] = loudness
+        # save the metadata
+        with open(out_dir / "metadata.yml", "w") as f:
+            yaml.dump(metadata, f)
+
+        sig0 = sig[0].write(out_dir / "out1.wav")
+        sig1 = sig[1].write(out_dir / "out2.wav")
+        sig2 = sig[2].write(out_dir / "out3.wav")
+        sig3 = sig[3].write(out_dir / "out4.wav")
+
+        # write the mask to txt
+        with open(out_dir / "mask.txt", "w") as f:
+            m = mask[0].cpu().numpy()
+            # write to txt, each time step on a new line
+            for i in range(m.shape[-1]):
+                f.write(f"{m[:, i]}\n")
+
+
+        import matplotlib.pyplot as plt
+        plt.clf()
+        interface.visualize_codes(mask)
+        plt.savefig(out_dir / "mask.png")
+        plt.clf()
+        interface.visualize_codes(codes)
+        plt.savefig(out_dir / "codes.png")
+        plt.close()
+
+        # zip out dir, and return the path to the zip
+        shutil.make_archive(out_dir, 'zip', out_dir)
+
+        # chunk in groups of 1024 timesteps
+        _mask_sigs = []
+        for i in range(0, mask.shape[-1], 1024):
+            _mask_sigs.append(interface.to_signal(mask[:, :, i:i+1024].to(interface.device)).cpu())
+        mask = signal_concat(_mask_sigs)
+        mask.write(out_dir / "mask.wav")
+
+
+
+
+        return (
+            sig0.path_to_file, sig1.path_to_file,
+            sig2.path_to_file, sig3.path_to_file,
+            mask.path_to_file, str(out_dir.with_suffix(".zip")), out_dir / "mask.png"
+        )
 
 def vamp(data):
-    return _vamp(data, …
+    return _vamp(data, api=False)
 
 def api_vamp(data):
-    return _vamp(data, …
+    return _vamp(data, api=True)
 
 with gr.Blocks() as demo:
     with gr.Row():
@@ -193,7 +257,13 @@ with gr.Blocks() as demo:
             step=1,
             value=3,
         )
-        …
+        p2 = gr.Slider(
+            label="periodic prompt 2 (0 - same as p1, 2 - lots of hints, 8 - a couple of hints, 16 - occasional hint, 32 - very occasional hint, etc)",
+            minimum=0,
+            maximum=128,
+            step=1,
+            value=0,
+        )
 
         onset_mask_width = gr.Slider(
             label="onset mask width (multiplies with the periodic mask, 1 step ~= 10milliseconds) ",
@@ -204,9 +274,13 @@ with gr.Blocks() as demo:
         )
 
         n_mask_codebooks = gr.Number(
-            label="…
+            label="compression prompt (masks entire upper codebook levels above the specified level)",
            value=3,
        )
+        n_mask_codebooks_2 = gr.Number(
+            label="compression prompt 2 via linear interpolation (0 == constant)",
+            value=0,
+        )
 
         with gr.Accordion("extras ", open=False):
             pitch_shift_amt = gr.Slider(
@@ -337,17 +411,45 @@ with gr.Blocks() as demo:
             value=1
         )
 
+        win_dur= gr.Slider(
+            label="window duration (seconds)",
+            minimum=2,
+            maximum=10,
+            value=6)
+
+
         vamp_button = gr.Button("generate (vamp)!!!")
-        …
-            label="…
+        maskimg = gr.Image(
+            label="mask image",
+            interactive=False,
+            type="filepath"
+        )
+        out1 = gr.Audio(
+            label="output audio 1",
+            interactive=False,
+            type="filepath"
+        )
+        out2 = gr.Audio(
+            label="output audio 2",
+            interactive=False,
+            type="filepath"
+        )
+        out3 = gr.Audio(
+            label="output audio 3",
+            interactive=False,
+            type="filepath"
+        )
+        out4 = gr.Audio(
+            label="output audio 4",
             interactive=False,
             type="filepath"
         )
-        …
-        use_as_input_button = gr.Button("use output as input")
 
         thank_you = gr.Markdown("")
 
+        # download all the outputs
+        download = gr.File(type="filepath", label="download outputs")
+
 
     _inputs = {
         input_audio,
@@ -368,29 +470,31 @@ with gr.Blocks() as demo:
         n_mask_codebooks,
         pitch_shift_amt,
         sample_cutoff,
-        num_feedback_steps
+        num_feedback_steps,
+        p2,
+        n_mask_codebooks_2,
+        win_dur
     }
 
     # connect widgets
     vamp_button.click(
         fn=vamp,
         inputs=_inputs,
-        outputs=[…
+        outputs=[out1, out2, out3, out4, audio_mask, download, maskimg],
     )
 
     api_vamp_button = gr.Button("api vamp", visible=False)
     api_vamp_button.click(
         fn=api_vamp,
         inputs=_inputs,
-        outputs=[…
+        outputs=[out1],
         api_name="vamp"
    )
 
-    use_as_input_button.click(
-        fn=lambda x: x,
-        inputs=[output_audio],
-        outputs=[input_audio]
-    )
 
-…
-demo.…
+try:
+    demo.queue()
+    demo.launch(share=True, debug=True)
+except KeyboardInterrupt:
+    shutil.rmtree("gradio-outputs", ignore_errors=True)
+    raise
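Note: `_vamp` now resolves its random seed up front, so the seed that actually produced the output is baked into the run's directory name and recorded in `metadata.yml`. A minimal sketch of that idiom (standalone; `resolve_seed` is an illustrative name, not a function in this repo, and the app itself seeds via `at.util.seed`):

import torch

def resolve_seed(requested: int) -> int:
    # non-positive means "pick one for me", but always return the seed
    # that was actually used, so the run stays reproducible
    if requested > 0:
        return requested
    return int(torch.randint(0, 2**32, (1,)).item())

seed = resolve_seed(0)
torch.manual_seed(seed)
print(f"my-input-{seed}")  # the seed lands in the output dir name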
conf/generated/church-bells/c2f.yml
ADDED
@@ -0,0 +1,15 @@
+$include:
+- conf/lora/lora.yml
+AudioDataset.duration: 3.0
+AudioDataset.loudness_cutoff: -40.0
+VampNet.embedding_dim: 1280
+VampNet.n_codebooks: 14
+VampNet.n_conditioning_codebooks: 4
+VampNet.n_heads: 20
+VampNet.n_layers: 16
+fine_tune: true
+fine_tune_checkpoint: ./models/vampnet/c2f.pth
+save_path: ./runs/church-bells/c2f
+train/AudioLoader.sources: &id001
+- data/church-bells
+val/AudioLoader.sources: *id001
conf/generated/church-bells/coarse.yml
ADDED
@@ -0,0 +1,8 @@
+$include:
+- conf/lora/lora.yml
+fine_tune: true
+fine_tune_checkpoint: ./models/vampnet/coarse.pth
+save_path: ./runs/church-bells/coarse
+train/AudioLoader.sources: &id001
+- data/church-bells
+val/AudioLoader.sources: *id001
conf/generated/church-bells/interface.yml
ADDED
@@ -0,0 +1,6 @@
+AudioLoader.sources:
+- - data/church-bells
+Interface.coarse2fine_ckpt: ./runs/church-bells/c2f/latest/vampnet/weights.pth
+Interface.coarse_ckpt: ./runs/church-bells/coarse/latest/vampnet/weights.pth
+Interface.codec_ckpt: ./models/vampnet/codec.pth
+Interface.wavebeat_ckpt: ./models/wavebeat.pth
conf/generated/copepod/c2f.yml
ADDED
@@ -0,0 +1,15 @@
+$include:
+- conf/lora/lora.yml
+AudioDataset.duration: 3.0
+AudioDataset.loudness_cutoff: -40.0
+VampNet.embedding_dim: 1280
+VampNet.n_codebooks: 14
+VampNet.n_conditioning_codebooks: 4
+VampNet.n_heads: 20
+VampNet.n_layers: 16
+fine_tune: true
+fine_tune_checkpoint: ./models/vampnet/c2f.pth
+save_path: ./runs/copepod/c2f
+train/AudioLoader.sources: &id001
+- data/copepod
+val/AudioLoader.sources: *id001
conf/generated/copepod/coarse.yml
ADDED
@@ -0,0 +1,8 @@
+$include:
+- conf/lora/lora.yml
+fine_tune: true
+fine_tune_checkpoint: ./models/vampnet/coarse.pth
+save_path: ./runs/copepod/coarse
+train/AudioLoader.sources: &id001
+- data/copepod
+val/AudioLoader.sources: *id001
conf/generated/copepod/interface.yml
ADDED
@@ -0,0 +1,6 @@
+AudioLoader.sources:
+- - data/copepod
+Interface.coarse2fine_ckpt: ./runs/copepod/c2f/latest/vampnet/weights.pth
+Interface.coarse_ckpt: ./runs/copepod/coarse/latest/vampnet/weights.pth
+Interface.codec_ckpt: ./models/vampnet/codec.pth
+Interface.wavebeat_ckpt: ./models/wavebeat.pth
conf/generated/growl/c2f.yml
ADDED
@@ -0,0 +1,16 @@
+$include:
+- conf/lora/lora.yml
+AudioDataset.duration: 3.0
+AudioDataset.loudness_cutoff: -40.0
+VampNet.embedding_dim: 1280
+VampNet.n_codebooks: 14
+VampNet.n_conditioning_codebooks: 4
+VampNet.n_heads: 20
+VampNet.n_layers: 16
+fine_tune: true
+fine_tune_checkpoint: ./models/vampnet/c2f.pth
+save_path: ./runs/growl/c2f
+train/AudioLoader.sources: &id001
+- data/growly
+- animals/
+val/AudioLoader.sources: *id001
conf/generated/growl/coarse.yml
ADDED
@@ -0,0 +1,9 @@
+$include:
+- conf/lora/lora.yml
+fine_tune: true
+fine_tune_checkpoint: ./models/vampnet/coarse.pth
+save_path: ./runs/growl/coarse
+train/AudioLoader.sources: &id001
+- data/growly
+- animals/
+val/AudioLoader.sources: *id001
conf/generated/growl/interface.yml
ADDED
@@ -0,0 +1,7 @@
+AudioLoader.sources:
+- - data/growly
+  - animals/
+Interface.coarse2fine_ckpt: ./runs/growl/c2f/latest/vampnet/weights.pth
+Interface.coarse_ckpt: ./runs/growl/coarse/latest/vampnet/weights.pth
+Interface.codec_ckpt: ./models/vampnet/codec.pth
+Interface.wavebeat_ckpt: ./models/wavebeat.pth
conf/generated/sample-instrument/c2f.yml
ADDED
@@ -0,0 +1,15 @@
+$include:
+- conf/lora/lora.yml
+AudioDataset.duration: 3.0
+AudioDataset.loudness_cutoff: -40.0
+VampNet.embedding_dim: 1280
+VampNet.n_codebooks: 14
+VampNet.n_conditioning_codebooks: 4
+VampNet.n_heads: 20
+VampNet.n_layers: 16
+fine_tune: true
+fine_tune_checkpoint: ./models/vampnet/c2f.pth
+save_path: ./runs/sample-instrument/c2f
+train/AudioLoader.sources: &id001
+- data/sample-instrument/
+val/AudioLoader.sources: *id001
conf/generated/sample-instrument/coarse.yml
ADDED
@@ -0,0 +1,8 @@
+$include:
+- conf/lora/lora.yml
+fine_tune: true
+fine_tune_checkpoint: ./models/vampnet/coarse.pth
+save_path: ./runs/sample-instrument/coarse
+train/AudioLoader.sources: &id001
+- data/sample-instrument/
+val/AudioLoader.sources: *id001
conf/generated/sample-instrument/interface.yml
ADDED
@@ -0,0 +1,6 @@
+AudioLoader.sources:
+- - data/sample-instrument/
+Interface.coarse2fine_ckpt: ./runs/sample-instrument/c2f/latest/vampnet/weights.pth
+Interface.coarse_ckpt: ./runs/sample-instrument/coarse/latest/vampnet/weights.pth
+Interface.codec_ckpt: ./models/vampnet/codec.pth
+Interface.wavebeat_ckpt: ./models/wavebeat.pth
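Note: the generated configs rely on a YAML anchor/alias pair (`&id001` / `*id001`) so the train and val loaders always share a single source list; editing the anchored list updates both. A quick sketch of the alias resolving (assumes PyYAML is installed; not part of the repo):

import yaml

doc = """
train/AudioLoader.sources: &id001
- data/church-bells
val/AudioLoader.sources: *id001
"""

cfg = yaml.safe_load(doc)
# the alias resolves to the very same list object as the anchor
assert cfg["train/AudioLoader.sources"] is cfg["val/AudioLoader.sources"]
print(cfg["val/AudioLoader.sources"])  # ['data/church-bells']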
models/models/vampnet/c2f.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b10ea2d45459d34edb773cbacd71f40f7baa1f4e75ac8bcd93b022ac69f8fa63
+size 1101898865
models/models/vampnet/coarse.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:78e4ad4f8398e8ec3651bc5e5c6ea2995e1080b6226be186723ccf4320c9756c
+size 1332182321
models/models/vampnet/codec.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3db3fa43ab5d160439ddb81fc540b5573ad5ae962230de3fc5b47d218845b855
+size 600996465
models/models/wavebeat.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5ff1066a4470cb98b20edf1e489f6995b19e0435b9cfd5a190bf90a954d0cadb
+size 33248861
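Note: the .pth entries above are Git LFS pointer files, not the weights themselves; the repo tracks only a spec version, a sha256 oid, and a byte size, and git-lfs fetches the real payload at checkout. A tiny parser for the pointer format (a sketch for illustration; the format is defined by the git-lfs spec URL in each file):

def parse_lfs_pointer(text: str) -> dict:
    # each line of a pointer file is "key value", e.g. "size 33248861"
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    return {
        "version": fields["version"],
        "sha256": fields["oid"].split(":", 1)[1],
        "size_bytes": int(fields["size"]),
    }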
scripts/utils/split.py
CHANGED
@@ -16,12 +16,11 @@ def train_test_split(
     audio_folder: str = ".",
     test_size: float = 0.2,
     seed: int = 42,
-    pattern: str = "**/*.mp3",
 ):
     print(f"finding audio")
 
     audio_folder = Path(audio_folder)
-    audio_files = …
+    audio_files = util.find_audio(audio_folder)
     print(f"found {len(audio_files)} audio files")
 
     # split according to test_size
@@ -49,7 +48,10 @@ def train_test_split(
     for file in tqdm(files):
         out_file = audio_folder.parent / f"{audio_folder.name}-{split}" / Path(file).name
         out_file.parent.mkdir(exist_ok=True, parents=True)
-        …
+        try:
+            os.symlink(file, out_file)
+        except FileExistsError:
+            print(f"File {out_file} already exists, skipping")
 
     # save split as json
     with open(Path(audio_folder) / f"{split}.json", "w") as f:
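Note: with the `pattern` argument gone, file discovery is delegated to `util.find_audio`, and the split itself is a seeded shuffle-and-slice followed by symlinks into `<folder>-train` / `<folder>-test`, so no audio is copied. A self-contained sketch of the split step (illustrative names, not the script's exact code):

import random

def split_files(files, test_size=0.2, seed=42):
    files = sorted(files)               # stable order before shuffling
    random.Random(seed).shuffle(files)  # seeded, so the split is reproducible
    n_test = int(len(files) * test_size)
    return {"test": files[:n_test], "train": files[n_test:]}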
vampnet/interface.py
CHANGED
@@ -273,11 +273,15 @@ class Interface(torch.nn.Module):
         else:
             mask = mask.repeat(1, self.coarse.n_codebooks, 1)
         return mask
+
+    def set_chunk_size(self, chunk_size_s: float):
+        self.coarse.chunk_size_s = chunk_size_s
 
     def coarse_to_fine(
         self,
         z: torch.Tensor,
         mask: torch.Tensor = None,
+        return_mask: bool = False,
         **kwargs
     ):
         assert self.c2f is not None, "No coarse2fine model loaded"
@@ -320,6 +324,9 @@ class Interface(torch.nn.Module):
             fine_z.append(chunk)
 
         fine_z = torch.cat(fine_z, dim=-1)
+        if return_mask:
+            return fine_z[:, :, :length].clone(), apply_mask(fine_z, mask, self.c2f.mask_token)[0][:, :, :length].clone()
+
         return fine_z[:, :, :length].clone()
 
     def coarse_vamp(
@@ -397,10 +404,12 @@ class Interface(torch.nn.Module):
         prefix_s: float = 0.0,
         suffix_s: float = 0.0,
         periodic_prompt: int = 7,
+        periodic_prompt2: int = 7,
         periodic_prompt_width: int = 1,
         onset_mask_width: int = 0,
         _dropout: float = 0.0,
         upper_codebook_mask: int = 3,
+        upper_codebook_mask_2: int = None,
         ncc: int = 0,
     ):
@@ -409,10 +418,17 @@ class Interface(torch.nn.Module):
             mask,
             inpaint(z, self.s2t(prefix_s), self.s2t(suffix_s)),
         )
-        …
-        …
-        …
-        …
+
+        pmask1 = periodic_mask(z, periodic_prompt, periodic_prompt_width, random_roll=True)
+        pmask2 = periodic_mask(z, periodic_prompt2, periodic_prompt_width, random_roll=True)
+        # interpolate the two masks
+        pmask = torch.round(
+            pmask1 * torch.linspace(1, 0, pmask1.shape[-1], device=pmask1.device) +
+            pmask2 * torch.linspace(0, 1, pmask2.shape[-1], device=pmask2.device)
+        ).long()
+
+        mask = mask_and(mask, pmask)
+
         if onset_mask_width > 0:
             assert sig is not None, f"must provide a signal to use onset mask"
             mask = mask_or(
@@ -424,7 +440,8 @@ class Interface(torch.nn.Module):
 
         mask = dropout(mask, _dropout)
         mask = codebook_unmask(mask, ncc)
-        …
+
+        mask = codebook_mask(mask, int(upper_codebook_mask), upper_codebook_mask_2)
         return mask
 
     def ez_vamp(
@@ -440,8 +457,8 @@ class Interface(torch.nn.Module):
         build_mask_kwargs = build_mask_kwargs or {}
         vamp_kwargs = vamp_kwargs or {}
 
-        sig = self.preprocess(sig)
         loudness = sig.loudness()
+        sig = self.preprocess(sig)
 
         z = self.encode(sig)
 
@@ -488,26 +505,60 @@ class Interface(torch.nn.Module):
 
             # now, coarse2fine
             print(f"coarse2fine!")
-            zv = self.coarse_to_fine(
+            zv, fine_zv_mask = self.coarse_to_fine(
                 zv,
                 mask=mask,
                 **vamp_kwargs,
-                _sampling_steps=[…
+                _sampling_steps=[2, 2, 1, 1],
+                return_mask=True
+            )
+            mask_z = torch.cat(
+                [mask_z[:, :self.coarse.n_codebooks, :], fine_zv_mask[:, self.coarse.n_codebooks:, :]],
+                dim=1
             )
 
             prev_zvs.append(zv)
             z = zv
 
-        …
+        # perform to_signal batch item by batch
+        sigs = []
+        for zv in prev_zvs:
+            # do it in timestep chunks of 1024
+            _sigs = []
+            for i in range(0, zv.shape[-1], 1024):
+                _sigs.append(self.to_signal(zv[:, :, i:i+1024]).cpu())
+            sigs.append(signal_concat(_sigs))
         print("done")
 
+        sig = AudioSignal.batch(sigs)
+
+        # sig = self.to_signal(zv).cpu()
+        # print("done")
+
         sig = sig.normalize(loudness)
 
         if return_mask:
-            return sig, …
+            return sig, mask_z.cpu(), zv.cpu()
         else:
             return sig
 
+    def visualize_codes(self, z: torch.Tensor):
+        import matplotlib.pyplot as plt
+        # make sure the figsize is square when imshow is called
+        fig = plt.figure(figsize=(10, 7))
+        # in subplots, plot z[0] and the mask
+        # set title to "codes" and "mask"
+        fig.add_subplot(2, 1, 1)
+        plt.imshow(z[0].cpu().numpy(), aspect='auto', origin='lower', cmap="tab20")
+        plt.title("codes")
+        plt.ylabel("codebook index")
+        # set the xticks to seconds
+        plt.xticks(
+            np.arange(0, z.shape[-1], self.s2t(1)),
+            np.arange(0, self.t2s(z.shape[-1]), 1)
+        )
+        plt.xlabel("time (s)")
+
 if __name__ == "__main__":
     import audiotools as at
     import logging
@@ -528,8 +579,6 @@ if __name__ == "__main__":
     sig = at.AudioSignal('assets/example.wav')
 
     z = interface.encode(sig)
-    breakpoint()
-
     # mask = linear_random(z, 1.0)
     # mask = mask_and(
     #     mask, periodic_mask(
@@ -569,3 +618,43 @@ if __name__ == "__main__":
     print("done")
 
 
+
+
+    # example plotting code
+    # import matplotlib.pyplot as plt
+    # from pathlib import Path
+    # Path(".vampnet").mkdir(exist_ok=True)
+    # plt.clf()
+    # # close all figs
+    # plt.close('all')
+    # # set the fig size
+    # plt.subplot(4, 1, 1)
+    # # sig = self.to_signal(sampled_z, codec)
+    # # sig.cpu().specshow()
+
+    # plt.subplot(4, 1, 2)
+    # # since z_masked is a codebook, we want to plot the colormap
+    # # with distinct colors for each codebook index
+    # # plt.imshow(_debug_z_masked_before_forward[0].cpu().numpy(), aspect='auto', origin='lower', cmap="tab20")
+    # # make it so that anywhere where the mask is 1, we make that pixel black
+    # plt.imshow(_debug_z_masked_before_forward[0].cpu().numpy(), aspect='auto', origin='lower', cmap='gray_r',)
+
+
+    # plt.subplot(4, 1, 3)
+    # # plot the mask (which is a matrix)
+    # plt.imshow(mask[0].cpu().numpy(), aspect='auto', origin='lower', cmap='gray_r')
+    # plt.subplot(4, 1, 4)
+    # # replace any inf or -inf with 0
+    # _selected_probs = torch.where(
+    #     selected_probs == torch.inf, torch.zeros_like(selected_probs), selected_probs
+    # )
+    # _selected_probs = torch.where(
+    #     selected_probs == -torch.inf, torch.zeros_like(selected_probs), selected_probs
+    # )
+    # # fig = plt.gcf()
+    # # fig.set_figheight(15)
+    # # fig.set_figwidth(15)
+    # plt.imshow(codebook_unflatten(_selected_probs, n_infer_codebooks)[0].cpu().numpy(), aspect='auto', origin='lower', cmap="viridis" )
+    # # plt.show()
+    # plt.savefig(f".vampnet/c={codebook_level}_{i}.png")
+    # plt.close('all')
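Note: the key addition in `build_mask` above is a time-varying periodic prompt. Two periodic masks are crossfaded with opposing `torch.linspace` ramps, so the density of prompt hints drifts from `periodic_prompt` at the start of the clip to `periodic_prompt2` at the end. The same crossfade, standalone (assuming vampnet's mask convention of 1 = regenerate, 0 = keep as prompt):

import torch

def crossfade_masks(m1: torch.Tensor, m2: torch.Tensor) -> torch.Tensor:
    # m1, m2: (batch, n_codebooks, time) binary masks
    t = m1.shape[-1]
    ramp_down = torch.linspace(1, 0, t, device=m1.device)  # m1 dominates early
    ramp_up = torch.linspace(0, 1, t, device=m2.device)    # m2 dominates late
    # blend, then round back to a hard 0/1 mask
    return torch.round(m1 * ramp_down + m2 * ramp_up).long()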
vampnet/mask.py
CHANGED
@@ -60,6 +60,7 @@ def linear_random(
     assert x.ndim == 3, "x must be (batch, n_codebooks, seq)"
     if not isinstance(r, torch.Tensor):
         r = scalar_to_batch_tensor(r, x.shape[0]).to(x.device).float()
+    r = r[:, None, None]
 
     probs = torch.ones_like(x).to(x.device).float()
     # expand to batch and codebook dims
@@ -98,7 +99,7 @@ def inpaint(x: torch.Tensor,
     return mask
 
 def periodic_mask(x: torch.Tensor,
-    period: int,
+    period: int, width: int = 1,
     random_roll=False,
 ):
     mask = full_mask(x)
@@ -140,9 +141,15 @@ def codebook_unmask(
     mask[:, :n_conditioning_codebooks, :] = 0
     return mask
 
-def codebook_mask(mask: torch.Tensor, …
+def codebook_mask(mask: torch.Tensor, val1: int, val2: int = None):
     mask = mask.clone()
-    mask[:, …
+    mask[:, val1:, :] = 1
+    # val2 = val2 or val1
+    # vs = torch.linspace(val1, val2, mask.shape[1])
+    # for t, v in enumerate(vs):
+    #     v = int(v)
+    #     mask[:, v:, t] = 1
+
     return mask
 
 def mask_and(
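Note: `codebook_mask(mask, val1, val2)` currently just forces every codebook level from `val1` upward to be regenerated; the commented-out lines sketch the planned linear ramp toward `val2` over time. In the repo's convention (1 = masked/regenerate, 0 = kept as prompt, as consumed by `masked_fill` in the transformer), combining it with a periodic prompt looks like this (illustrative values):

import torch

b, n_codebooks, t = 1, 14, 12
mask = torch.ones(b, n_codebooks, t, dtype=torch.long)  # start fully masked

mask[:, :, ::4] = 0  # periodic prompt: keep every 4th timestep as a hint
mask[:, 3:, :] = 1   # codebook mask: regenerate levels 3 and up everywhere,
                     # even at the hint positions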
vampnet/modules/transformer.py
CHANGED
@@ -572,6 +572,8 @@ class VampNet(at.ml.BaseModel):
         """
         assert z.ndim == 3
 
+        # remove mask token
+        z = z.masked_fill(z == self.mask_token, 0)
         signal = at.AudioSignal(
             codec.decode(
                 codec.quantizer.from_latents(self.embedding.from_codes(z, codec))[0]
@@ -581,15 +583,13 @@ class VampNet(at.ml.BaseModel):
 
         # find where the mask token is and replace it with silence in the audio
         for tstep in range(z.shape[-1]):
-            if torch.…
+            if torch.all(z[:, :, tstep] == self.mask_token):
                 sample_idx_0 = tstep * codec.hop_length
                 sample_idx_1 = sample_idx_0 + codec.hop_length
                 signal.samples[:, :, sample_idx_0:sample_idx_1] = 0.0
 
         return signal
-
-
-
+
     @torch.no_grad()
     def generate(
         self,
@@ -600,17 +600,37 @@ class VampNet(at.ml.BaseModel):
         sampling_temperature: float = 1.0,
         mask: Optional[torch.Tensor] = None,
         mask_temperature: float = 10.5,
-        typical_filtering=…
+        typical_filtering=True,
         typical_mass=0.2,
         typical_min_tokens=1,
-        top_p=…
+        top_p=0.9,
         seed: int = None,
-        sample_cutoff: float = …
+        sample_cutoff: float = 0.9,
         return_signal=True,
         debug=False,
         causal_weight: float = 0.0,
+        use_og_method: bool = False,
     ):
-        …
+        if use_og_method:
+            return self.og_method(
+                codec,
+                time_steps,
+                _sampling_steps,
+                start_tokens,
+                sampling_temperature,
+                mask,
+                mask_temperature,
+                typical_filtering,
+                typical_mass,
+                typical_min_tokens,
+                top_p,
+                seed,
+                sample_cutoff,
+                return_signal,
+                debug,
+                causal_weight,
+            )
+
         if seed is not None:
             at.util.seed(seed)
 
@@ -749,7 +769,7 @@ class VampNet(at.ml.BaseModel):
                     num_to_mask
                 )
             )
-            …
+            logging.debug(f"will mask {num_to_mask.sum()} tokens")
             mask = codebook_flatten(mask)
 
             # ignore any tokens that weren't masked
@@ -812,6 +832,188 @@ class VampNet(at.ml.BaseModel):
         else:
             return sampled_z
 
+
+
+    def og_method(
+        self,
+        codec,
+        time_steps: int = 300,
+        _sampling_steps: List[int] = [16, 8, 8, 2, 2, 2, 2, 1, 1],
+        start_tokens: Optional[torch.Tensor] = None,
+        sampling_temperature: float = 1.0,
+        mask: Optional[torch.Tensor] = None,
+        mask_temperature: float = 10.5,
+        typical_filtering=True,
+        typical_mass=0.2,
+        typical_min_tokens=1,
+        top_p=0.9,
+        seed: int = None,
+        sample_cutoff: float = 0.75,
+        return_signal=True,
+        debug=False,
+        causal_weight: float = 0.0,
+    ):
+        if seed is not None:
+            at.util.seed(seed)
+        sampling_steps = sum(_sampling_steps)
+        logging.debug(f"beginning generation with {sampling_steps} steps")
+
+
+        #####################
+        # resolve initial z #
+        #####################
+        z = start_tokens
+
+        if z is None:
+            z = torch.full((1, self.n_codebooks, time_steps), self.mask_token).to(
+                self.device
+            )
+
+        logging.debug(f"created z with shape {z.shape}")
+
+
+        #################
+        # resolve mask #
+        #################
+
+        if mask is None:
+            mask = torch.ones_like(z).to(self.device).int()
+            mask[:, : self.n_conditioning_codebooks, :] = 0.0
+        if mask.ndim == 2:
+            mask = mask[:, None, :].repeat(1, z.shape[1], 1)
+        # init_mask = mask.clone()
+
+        logging.debug(f"created mask with shape {mask.shape}")
+
+
+        ###########
+        # set up #
+        ##########
+        # apply the mask to z
+        z_masked = z.masked_fill(mask.bool(), self.mask_token)
+        # logging.debug(f"z_masked: {z_masked}")
+
+        # how many mask tokens to begin with?
+        num_mask_tokens_at_start = (z_masked == self.mask_token).sum()
+        logging.debug(f"num mask tokens at start: {num_mask_tokens_at_start}")
+
+        # how many codebooks are we inferring vs conditioning on?
+        n_infer_codebooks = self.n_codebooks - self.n_conditioning_codebooks
+        logging.debug(f"n infer codebooks: {n_infer_codebooks}")
+
+        #################
+        # begin sampling #
+        #################
+
+        for i in range(sampling_steps):
+            logging.debug(f"step {i} of {sampling_steps}")
+
+            # our current schedule step
+            r = scalar_to_batch_tensor(
+                (i + 1) / sampling_steps,
+                z.shape[0]
+            ).to(z.device)
+            logging.debug(f"r: {r}")
+
+            # get latents
+            latents = self.embedding.from_codes(z_masked, codec)
+            logging.debug(f"computed latents with shape: {latents.shape}")
+
+
+            # infer from latents
+            # NOTE: this collapses the codebook dimension into the sequence dimension
+            logits = self.forward(latents)  # b, prob, seq
+            logits = logits.permute(0, 2, 1)  # b, seq, prob
+            b = logits.shape[0]
+
+            logging.debug(f"permuted logits with shape: {logits.shape}")
+
+            sampled_z, selected_probs = sample_from_logits(
+                logits, sample=(
+                    (i / sampling_steps) <= sample_cutoff
+                ),
+                temperature=sampling_temperature,
+                typical_filtering=typical_filtering, typical_mass=typical_mass,
+                typical_min_tokens=typical_min_tokens,
+                top_k=None, top_p=top_p, return_probs=True,
+            )
+
+            logging.debug(f"sampled z with shape: {sampled_z.shape}")
+
+            # flatten z_masked and mask, so we can deal with the sampling logic
+            # we'll unflatten them at the end of the loop for the next forward pass
+            # remove conditioning codebooks, we'll add them back at the end
+            z_masked = codebook_flatten(z_masked[:, self.n_conditioning_codebooks:, :])
+
+            mask = (z_masked == self.mask_token).int()
+
+            # update the mask, remove conditioning codebooks from the mask
+            logging.debug(f"updated mask with shape: {mask.shape}")
+            # add z back into sampled z where the mask was false
+            sampled_z = torch.where(
+                mask.bool(), sampled_z, z_masked
+            )
+            logging.debug(f"added z back into sampled z with shape: {sampled_z.shape}")
+
+            # ignore any tokens that weren't masked
+            selected_probs = torch.where(
+                mask.bool(), selected_probs, torch.inf
+            )
+
+            # get the num tokens to mask, according to the schedule
+            num_to_mask = torch.floor(_gamma(r) * num_mask_tokens_at_start).unsqueeze(1).long()
+            logging.debug(f"num to mask: {num_to_mask}")
+
+            if i != (sampling_steps - 1):
+                num_to_mask = torch.maximum(
+                    torch.tensor(1),
+                    torch.minimum(
+                        mask.sum(dim=-1, keepdim=True) - 1,
+                        num_to_mask
+                    )
+                )
+
+
+            # get our new mask
+            mask = mask_by_random_topk(
+                num_to_mask, selected_probs, mask_temperature * (1-r)
+            )
+
+            # update the mask
+            z_masked = torch.where(
+                mask.bool(), self.mask_token, sampled_z
+            )
+            logging.debug(f"updated z_masked with shape: {z_masked.shape}")
+
+            z_masked = codebook_unflatten(z_masked, n_infer_codebooks)
+            mask = codebook_unflatten(mask, n_infer_codebooks)
+            logging.debug(f"unflattened z_masked with shape: {z_masked.shape}")
+
+            # add conditioning codebooks back to z_masked
+            z_masked = torch.cat(
+                (z[:, :self.n_conditioning_codebooks, :], z_masked), dim=1
+            )
+            logging.debug(f"added conditioning codebooks back to z_masked with shape: {z_masked.shape}")
+
+
+        # add conditioning codebooks back to sampled_z
+        sampled_z = codebook_unflatten(sampled_z, n_infer_codebooks)
+        sampled_z = torch.cat(
+            (z[:, :self.n_conditioning_codebooks, :], sampled_z), dim=1
+        )
+
+        logging.debug(f"finished sampling")
+
+        if return_signal:
+            return self.to_signal(sampled_z, codec)
+        else:
+            return sampled_z
+
+
+
+
 def sample_from_logits(
     logits,
     sample: bool = True,
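Note: `og_method` restores the original MaskGIT-style loop: sample every masked position, keep the most confident predictions, and re-mask the rest according to `_gamma(r)` before the next pass. A sketch of how the re-mask budget shrinks over steps (assuming the usual cosine schedule, gamma(r) = cos(r * pi / 2), typical of MaskGIT-style samplers; the repo's own `_gamma` may differ):

import math

def gamma(r: float) -> float:
    # fraction of the initially-masked tokens still masked at schedule position r
    return math.cos(r * math.pi / 2)

num_mask_tokens_at_start = 1000
sampling_steps = 8
for i in range(sampling_steps):
    r = (i + 1) / sampling_steps
    print(f"step {i}: re-mask {math.floor(gamma(r) * num_mask_tokens_at_start)} tokens")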