Hugo Flores Garcia committed
Commit 51f416f
1 parent: a689560
Files changed (4)
  1. app.py +3 -3
  2. vampnet/interface.py +271 -42
  3. vampnet/mask.py +11 -4
  4. vampnet/modules/transformer.py +262 -12
app.py CHANGED
@@ -494,7 +494,7 @@ with gr.Blocks() as demo:
         minimum=1,
         maximum=16,
         step=1,
-        value=1
+        value=3
     )
 
     win_dur= gr.Slider(
@@ -580,8 +580,8 @@ with gr.Blocks() as demo:
     from pyharp import ModelCard, build_endpoint
 
     model_card = ModelCard(
-        name="salad bowl",
-        description="sounds",
+        name="nesquik",
+        description="the ultimate 8-bit crusher",
         author="hugo flores garcía",
         tags=["generative","sound"],
     )
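For orientation, the slider hunk above amounts to the following minimal sketch; the variable name and label are hypothetical, since the surrounding app code is not part of this diff:

import gradio as gr

with gr.Blocks() as demo:
    num_vamps = gr.Slider(
        label="number of vamps",  # hypothetical label
        minimum=1,
        maximum=16,
        step=1,
        value=3,  # the new default introduced by this commit
    )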
vampnet/interface.py CHANGED
@@ -110,26 +110,26 @@ class Interface(torch.nn.Module):
         # check if we already loaded, if so, don't reload
         if self.coarse_path == Path(coarse_ckpt):
             print(f"already loaded {coarse_ckpt}")
-            return
-        self.coarse = _load_model(
-            ckpt=coarse_ckpt,
-            device=self.device,
-            chunk_size_s=self.coarse.chunk_size_s,
-        )
-        self.coarse_path = Path(coarse_ckpt)
-        print(f"loaded {coarse_ckpt}")
+        else:
+            self.coarse = _load_model(
+                ckpt=coarse_ckpt,
+                device=self.device,
+                chunk_size_s=self.coarse.chunk_size_s,
+            )
+            self.coarse_path = Path(coarse_ckpt)
+            print(f"loaded {coarse_ckpt}")
 
         if c2f_ckpt is not None:
             if self.c2f_path == Path(c2f_ckpt):
                 print(f"already loaded {c2f_ckpt}")
-                return
-            self.c2f = _load_model(
-                ckpt=c2f_ckpt,
-                device=self.device,
-                chunk_size_s=self.c2f.chunk_size_s,
-            )
-            self.c2f_path = Path(c2f_ckpt)
-            print(f"loaded {c2f_ckpt}")
+            else:
+                self.c2f = _load_model(
+                    ckpt=c2f_ckpt,
+                    device=self.device,
+                    chunk_size_s=self.c2f.chunk_size_s,
+                )
+                self.c2f_path = Path(c2f_ckpt)
+                print(f"loaded {c2f_ckpt}")
 
     def s2t(self, seconds: float):
         """seconds to tokens"""
@@ -273,11 +273,15 @@ class Interface(torch.nn.Module):
         else:
             mask = mask.repeat(1, self.coarse.n_codebooks, 1)
         return mask
+
+    def set_chunk_size(self, chunk_size_s: float):
+        self.coarse.chunk_size_s = chunk_size_s
 
     def coarse_to_fine(
         self,
         z: torch.Tensor,
         mask: torch.Tensor = None,
+        return_mask: bool = False,
         **kwargs
     ):
         assert self.c2f is not None, "No coarse2fine model loaded"
@@ -289,7 +293,7 @@ class Interface(torch.nn.Module):
         if length % chunk_len != 0:
             pad_len = chunk_len - (length % chunk_len)
             z = torch.nn.functional.pad(z, (0, pad_len))
-            mask = torch.nn.functional.pad(mask, (0, pad_len)) if mask is not None else None
+            mask = torch.nn.functional.pad(mask, (0, pad_len), value=1) if mask is not None else None
 
         n_codebooks_to_append = self.c2f.n_codebooks - z.shape[1]
         if n_codebooks_to_append > 0:
@@ -297,6 +301,7 @@ class Interface(torch.nn.Module):
                 z,
                 torch.zeros(z.shape[0], n_codebooks_to_append, z.shape[-1]).long().to(self.device)
             ], dim=1)
+            print(f"appended {n_codebooks_to_append} codebooks to z")
 
         # set the mask to 0 for all conditioning codebooks
         if mask is not None:
@@ -319,6 +324,9 @@ class Interface(torch.nn.Module):
             fine_z.append(chunk)
 
         fine_z = torch.cat(fine_z, dim=-1)
+        if return_mask:
+            return fine_z[:, :, :length].clone(), apply_mask(fine_z, mask, self.c2f.mask_token)[0][:, :, :length].clone()
+
         return fine_z[:, :, :length].clone()
 
     def coarse_vamp(
@@ -331,22 +339,52 @@ class Interface(torch.nn.Module):
     ):
         # coarse z
         cz = z[:, : self.coarse.n_codebooks, :].clone()
-        assert cz.shape[-1] <= self.s2t(self.coarse.chunk_size_s), f"the sequence of tokens provided must match the one specified in the coarse chunk size, but got {cz.shape[-1]} and {self.s2t(self.coarse.chunk_size_s)}"
-
         mask = mask[:, : self.coarse.n_codebooks, :]
+        # assert cz.shape[-1] <= self.s2t(self.coarse.chunk_size_s), f"the sequence of tokens provided must match the one specified in the coarse chunk size, but got {cz.shape[-1]} and {self.s2t(self.coarse.chunk_size_s)}"
 
-        cz_masked, mask = apply_mask(cz, mask, self.coarse.mask_token)
-        cz_masked = cz_masked[:, : self.coarse.n_codebooks, :]
-
-        gen_fn = gen_fn or self.coarse.generate
-        c_vamp = gen_fn(
-            codec=self.codec,
-            time_steps=cz.shape[-1],
-            start_tokens=cz,
-            mask=mask,
-            return_signal=False,
-            **kwargs
-        )
+        # cut into chunks, keep the last chunk separate if it's too small
+        chunk_len = self.s2t(self.coarse.chunk_size_s)
+        n_chunks = math.ceil(cz.shape[-1] / chunk_len)
+        last_chunk_len = cz.shape[-1] % chunk_len
+
+        cz_chunks = []
+        mask_chunks = []
+        for i in range(n_chunks):
+            chunk = cz[:, :, i * chunk_len : (i + 1) * chunk_len]
+            mask_chunk = mask[:, :, i * chunk_len : (i + 1) * chunk_len]
+
+            # make sure that the very first and last timestep of each chunk is 0 so that we don't get a weird
+            # discontinuity when we stitch the chunks back together
+            # only if there's already a 0 somewhere in the chunk
+            if torch.any(mask_chunk == 0):
+                mask_chunk[:, :, 0] = 0
+                mask_chunk[:, :, -1] = 0
+
+            cz_chunks.append(chunk)
+            mask_chunks.append(mask_chunk)
+
+        # now vamp each chunk
+        cz_masked_chunks = []
+        cz_vamped_chunks = []
+        for chunk, mask_chunk in zip(cz_chunks, mask_chunks):
+            cz_masked_chunk, mask_chunk = apply_mask(chunk, mask_chunk, self.coarse.mask_token)
+            cz_masked_chunk = cz_masked_chunk[:, : self.coarse.n_codebooks, :]
+            cz_masked_chunks.append(cz_masked_chunk)
+
+            gen_fn = gen_fn or self.coarse.generate
+            c_vamp_chunk = gen_fn(
+                codec=self.codec,
+                time_steps=chunk_len,
+                start_tokens=cz_masked_chunk,
+                return_signal=False,
+                mask=mask_chunk,
+                **kwargs
+            )
+            cz_vamped_chunks.append(c_vamp_chunk)
+
+        # stitch the chunks back together
+        cz_masked = torch.cat(cz_masked_chunks, dim=-1)
+        c_vamp = torch.cat(cz_vamped_chunks, dim=-1)
 
         # add the fine codes back in
         c_vamp = torch.cat(
@@ -358,16 +396,169 @@ class Interface(torch.nn.Module):
             return c_vamp, cz_masked
 
         return c_vamp
-
-    # def chunked_coarse_vamp(
-    #     self,
-    #     z,
-    #     mask,
-    #     return_mask=False,
-    #     gen_fn=None,
-    #     **kwargs
-    # )
+
+    def build_mask(self,
+        z: torch.Tensor,
+        sig: AudioSignal = None,
+        rand_mask_intensity: float = 1.0,
+        prefix_s: float = 0.0,
+        suffix_s: float = 0.0,
+        periodic_prompt: int = 7,
+        periodic_prompt2: int = 7,
+        periodic_prompt_width: int = 1,
+        onset_mask_width: int = 0,
+        _dropout: float = 0.0,
+        upper_codebook_mask: int = 3,
+        upper_codebook_mask_2: int = None,
+        ncc: int = 0,
+    ):
+
+        mask = linear_random(z, rand_mask_intensity)
+        mask = mask_and(
+            mask,
+            inpaint(z, self.s2t(prefix_s), self.s2t(suffix_s)),
+        )
+
+        pmask1 = periodic_mask(z, periodic_prompt, periodic_prompt_width, random_roll=True)
+        pmask2 = periodic_mask(z, periodic_prompt2, periodic_prompt_width, random_roll=True)
+        # interpolate the two masks
+        pmask = torch.round(
+            pmask1 * torch.linspace(1, 0, pmask1.shape[-1], device=pmask1.device) +
+            pmask2 * torch.linspace(0, 1, pmask2.shape[-1], device=pmask2.device)
+        ).long()
+
+        mask = mask_and(mask, pmask)
+
+        if onset_mask_width > 0:
+            assert sig is not None, f"must provide a signal to use onset mask"
+            mask = mask_or(
+                mask, onset_mask(
+                    sig, z, interface,
+                    width=onset_mask_width
+                )
+            )
+
+        mask = dropout(mask, _dropout)
+        mask = codebook_unmask(mask, ncc)
+
+        mask = codebook_mask(mask, int(upper_codebook_mask), upper_codebook_mask_2)
+        return mask
+
+    def ez_vamp(
+        self,
+        sig: AudioSignal,
+        batch_size: int = 4,
+        feedback_steps: int = 1,
+        time_stretch_factor: int = 1,
+        return_mask: bool = False,
+        build_mask_kwargs: dict = None,
+        vamp_kwargs: dict = None,
+    ):
+        feedback_steps = int(feedback_steps)
+        build_mask_kwargs = build_mask_kwargs or {}
+        vamp_kwargs = vamp_kwargs or {}
+
+        loudness = sig.loudness()
+        sig = self.preprocess(sig)
+
+        z = self.encode(sig)
+
+        # expand z to batch size
+        z = z.expand(batch_size, -1, -1)
+        mask = self.build_mask(
+            z=z,
+            **build_mask_kwargs
+        )
+        mask = mask.expand(batch_size, -1, -1)
+
+        # stretch mask and z to match the time stretch factor
+        # we'll add (stretch_factor - 1) mask tokens in between each timestep of z
+        # and we'll make the mask 1 in all the new slots we added
+        if time_stretch_factor > 1:
+            z = z.repeat_interleave(time_stretch_factor, dim=-1)
+            mask = mask.repeat_interleave(time_stretch_factor, dim=-1)
+            added_mask = torch.ones_like(mask)
+            added_mask[:, :, ::time_stretch_factor] = 0
+            mask = mask.bool() | added_mask.bool()
+            mask = mask.long()
+
+        prev_zvs = []
+        for i in tqdm.tqdm(range(feedback_steps), desc="feedback steps"):
+            print(z.shape)
+
+            vamp_kwargs.pop("mask", None)
+            vamp_kwargs.pop('return_mask', None)
+            print("coarse!")
+            zv, mask_z = self.coarse_vamp(
+                z,
+                mask=mask,
+                return_mask=True,
+                **vamp_kwargs
+            )
+
+            # add the top codebooks back in
+            if zv.shape[1] < z.shape[1]:
+                print(f"adding {z.shape[1] - zv.shape[1]} codebooks back in")
+                zv = torch.cat(
+                    [zv, z[:, self.coarse.n_codebooks :, :]],
+                    dim=1
+                )
+
+            # now, coarse2fine
+            print(f"coarse2fine!")
+            zv, fine_zv_mask = self.coarse_to_fine(
+                zv,
+                mask=mask,
+                **vamp_kwargs,
+                _sampling_steps=[2, 2, 1, 1],
+                return_mask=True
+            )
+            mask_z = torch.cat(
+                [mask_z[:, :self.coarse.n_codebooks, :], fine_zv_mask[:, self.coarse.n_codebooks:, :]],
+                dim=1
+            )
+
+            prev_zvs.append(zv)
+            z = zv
+
+        # perform to_signal batch item by batch
+        sigs = []
+        for zv in prev_zvs:
+            # do it in timestep chunks of 1024
+            _sigs = []
+            for i in range(0, zv.shape[-1], 1024):
+                _sigs.append(self.to_signal(zv[:, :, i:i+1024]).cpu())
+            sigs.append(signal_concat(_sigs))
+        print("done")
+
+        sig = AudioSignal.batch(sigs)
+
+        # sig = self.to_signal(zv).cpu()
+        # print("done")
+
+        sig = sig.normalize(loudness)
+
+        if return_mask:
+            return sig, mask_z.cpu(), zv.cpu()
+        else:
+            return sig
+
+    def visualize_codes(self, z: torch.Tensor):
+        import matplotlib.pyplot as plt
+        # make sure the figsize is square when imshow is called
+        fig = plt.figure(figsize=(10, 7))
+        # in subplots, plot z[0] and the mask
+        # set title to "codes" and "mask"
+        fig.add_subplot(2, 1, 1)
+        plt.imshow(z[0].cpu().numpy(), aspect='auto', origin='lower', cmap="tab20")
+        plt.title("codes")
+        plt.ylabel("codebook index")
+        # set the xticks to seconds
+        plt.xticks(
+            np.arange(0, z.shape[-1], self.s2t(1)),
+            np.arange(0, self.t2s(z.shape[-1]), 1)
+        )
+        plt.xlabel("time (s)")
 
 if __name__ == "__main__":
     import audiotools as at
@@ -389,8 +580,6 @@ if __name__ == "__main__":
     sig = at.AudioSignal('assets/example.wav')
 
     z = interface.encode(sig)
-    breakpoint()
-
     # mask = linear_random(z, 1.0)
     # mask = mask_and(
     #     mask, periodic_mask(
@@ -429,4 +618,44 @@ if __name__ == "__main__":
     sig = interface.to_signal(zv).cpu()
     print("done")
 
-
+
+
+
+    # example plotting code
+    # import matplotlib.pyplot as plt
+    # from pathlib import Path
+    # Path(".vampnet").mkdir(exist_ok=True)
+    # plt.clf()
+    # # close all figs
+    # plt.close('all')
+    # # set the fig size
+    # plt.subplot(4, 1, 1)
+    # # sig = self.to_signal(sampled_z, codec)
+    # # sig.cpu().specshow()
+
+    # plt.subplot(4, 1, 2)
+    # # since z_masked is a codebook, we want to plot the colormap
+    # # with distinct colors for each codebook index
+    # # plt.imshow(_debug_z_masked_before_forward[0].cpu().numpy(), aspect='auto', origin='lower', cmap="tab20")
+    # # make it so that anywhere where the mask is 1, we make that pixel black
+    # plt.imshow(_debug_z_masked_before_forward[0].cpu().numpy(), aspect='auto', origin='lower', cmap='gray_r',)
+
+
+    # plt.subplot(4, 1, 3)
+    # # plot the mask (which is a matrix)
+    # plt.imshow(mask[0].cpu().numpy(), aspect='auto', origin='lower', cmap='gray_r')
+    # plt.subplot(4, 1, 4)
+    # # replace any inf or -inf with 0
+    # _selected_probs = torch.where(
+    #     selected_probs == torch.inf, torch.zeros_like(selected_probs), selected_probs
+    # )
+    # _selected_probs = torch.where(
+    #     selected_probs == -torch.inf, torch.zeros_like(selected_probs), selected_probs
+    # )
+    # # fig = plt.gcf()
+    # # fig.set_figheight(15)
+    # # fig.set_figwidth(15)
+    # plt.imshow(codebook_unflatten(_selected_probs, n_infer_codebooks)[0].cpu().numpy(), aspect='auto', origin='lower', cmap="viridis" )
+    # # plt.show()
+    # plt.savefig(f".vampnet/c={codebook_level}_{i}.png")
+    # plt.close('all')
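To make the seam handling in the new chunked coarse_vamp concrete: the first and last timestep of each chunk's mask are pinned to 0 (kept) so stitched chunks share real tokens at their boundaries, but only when the chunk already keeps something. A small self-contained sketch of that logic on toy tensors (standalone torch, not the repo's API):

import torch

chunk_len = 4
mask = torch.ones(1, 2, 8).long()   # (batch, codebooks, time), everything masked
mask[:, :, 2] = 0                   # pretend a prompt kept timestep 2

for i in range(mask.shape[-1] // chunk_len):
    mask_chunk = mask[:, :, i * chunk_len : (i + 1) * chunk_len]  # a view into mask
    # only pin the edges if something in this chunk is already kept
    if torch.any(mask_chunk == 0):
        mask_chunk[:, :, 0] = 0
        mask_chunk[:, :, -1] = 0

print(mask[0, 0].tolist())  # [0, 1, 0, 0, 1, 1, 1, 1] -> the all-masked second chunk is untouched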
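The time-stretch block in ez_vamp can be hard to picture; here is the same tensor manipulation on toy shapes, so the interleaving is visible (standalone torch, not the repo's API):

import torch

time_stretch_factor = 3
z = torch.arange(4).reshape(1, 1, 4)   # (batch, codebooks, time)
mask = torch.zeros_like(z)             # keep every original token

z = z.repeat_interleave(time_stretch_factor, dim=-1)
mask = mask.repeat_interleave(time_stretch_factor, dim=-1)

added_mask = torch.ones_like(mask)
added_mask[:, :, ::time_stretch_factor] = 0          # original positions stay unmasked
mask = (mask.bool() | added_mask.bool()).long()      # new in-between slots get mask=1

print(z[0, 0].tolist())     # [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]
print(mask[0, 0].tolist())  # [0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1]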
vampnet/mask.py CHANGED
@@ -60,6 +60,7 @@ def linear_random(
     assert x.ndim == 3, "x must be (batch, n_codebooks, seq)"
     if not isinstance(r, torch.Tensor):
         r = scalar_to_batch_tensor(r, x.shape[0]).to(x.device).float()
+    r = r[:, None, None]
 
     probs = torch.ones_like(x).to(x.device).float()
     # expand to batch and codebook dims
@@ -98,7 +99,7 @@ def inpaint(x: torch.Tensor,
     return mask
 
 def periodic_mask(x: torch.Tensor,
-                  period: int, width: int = 1,
+                  period: int,width: int = 1,
                   random_roll=False,
                   ):
     mask = full_mask(x)
@@ -140,9 +141,15 @@ def codebook_unmask(
     mask[:, :n_conditioning_codebooks, :] = 0
     return mask
 
-def codebook_mask(mask: torch.Tensor, start: int):
+def codebook_mask(mask: torch.Tensor, val1: int, val2: int = None):
     mask = mask.clone()
-    mask[:, start:, :] = 1
+    mask[:, val1:, :] = 1
+    # val2 = val2 or val1
+    # vs = torch.linspace(val1, val2, mask.shape[1])
+    # for t, v in enumerate(vs):
+    #     v = int(v)
+    #     mask[:, v:, t] = 1
+
     return mask
 
 def mask_and(
@@ -239,4 +246,4 @@ def onset_mask(
 
 
 if __name__ == "__main__":
-    pass
+    pass
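The one-line addition in linear_random is a broadcasting repair; a sketch of the failure it avoids, assuming r scales per-position mask probabilities as the surrounding code suggests:

import torch

batch, n_codebooks, seq = 2, 3, 5
r = torch.tensor([0.2, 0.9])            # one mask intensity per batch item
probs = torch.ones(batch, n_codebooks, seq)

# probs * r                             # RuntimeError: (2, 3, 5) vs (2,) don't broadcast
probs = probs * r[:, None, None]        # (2, 1, 1) broadcasts over codebooks and time
print(probs[0, 0, 0].item(), probs[1, 0, 0].item())  # 0.2 0.9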
 
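As committed, codebook_mask keeps only the flat behavior (the ramp between val1 and val2 is commented out): everything at or above codebook level val1 is forced to 1, i.e. regenerated. On toy tensors:

import torch

mask = torch.zeros(1, 6, 4).long()  # (batch, codebooks, time), nothing masked
val1 = 3
mask[:, val1:, :] = 1               # mask codebooks 3..5 at every timestep
print(mask[0, :, 0].tolist())       # [0, 0, 0, 1, 1, 1]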
vampnet/modules/transformer.py CHANGED
@@ -1,6 +1,6 @@
 import math
 import logging
-from typing import Optional, Tuple, Union
+from typing import Optional, Tuple, Union, List
 
 import numpy as np
 import torch
@@ -572,6 +572,8 @@ class VampNet(at.ml.BaseModel):
         """
         assert z.ndim == 3
 
+        # remove mask token
+        z = z.masked_fill(z == self.mask_token, 0)
         signal = at.AudioSignal(
             codec.decode(
                 codec.quantizer.from_latents(self.embedding.from_codes(z, codec))[0]
@@ -581,34 +583,279 @@ class VampNet(at.ml.BaseModel):
 
         # find where the mask token is and replace it with silence in the audio
         for tstep in range(z.shape[-1]):
-            if torch.any(z[:, :, tstep] == self.mask_token):
+            if torch.all(z[:, :, tstep] == self.mask_token):
                 sample_idx_0 = tstep * codec.hop_length
                 sample_idx_1 = sample_idx_0 + codec.hop_length
                 signal.samples[:, :, sample_idx_0:sample_idx_1] = 0.0
 
         return signal
-
-
+
     @torch.no_grad()
     def generate(
         self,
         codec,
         time_steps: int = 300,
-        sampling_steps: int = 36,
+        _sampling_steps: List[int] = [16, 8, 8, 2, 2, 2, 2, 1, 1],
         start_tokens: Optional[torch.Tensor] = None,
         sampling_temperature: float = 1.0,
         mask: Optional[torch.Tensor] = None,
         mask_temperature: float = 10.5,
-        typical_filtering=False,
+        typical_filtering=True,
         typical_mass=0.2,
         typical_min_tokens=1,
-        top_p=None,
-        return_signal=True,
+        top_p=0.9,
         seed: int = None,
-        sample_cutoff: float = 1.0,
+        sample_cutoff: float = 0.9,
+        return_signal=True,
+        debug=False,
+        causal_weight: float = 0.0,
+        use_og_method: bool = False,
+    ):
+        if use_og_method:
+            return self.og_method(
+                codec,
+                time_steps,
+                _sampling_steps,
+                start_tokens,
+                sampling_temperature,
+                mask,
+                mask_temperature,
+                typical_filtering,
+                typical_mass,
+                typical_min_tokens,
+                top_p,
+                seed,
+                sample_cutoff,
+                return_signal,
+                debug,
+                causal_weight,
+            )
+
+        if seed is not None:
+            at.util.seed(seed)
+
+        #####################
+        # resolve initial z #
+        #####################
+        z = start_tokens
+
+        if z is None:
+            z = torch.full((1, self.n_codebooks, time_steps), self.mask_token).to(
+                self.device
+            )
+
+        logging.debug(f"created z with shape {z.shape}")
+
+        #################
+        # resolve mask #
+        #################
+
+        if mask is None:
+            mask = torch.ones_like(z).to(self.device).int()
+            mask[:, : self.n_conditioning_codebooks, :] = 0.0
+        if mask.ndim == 2:
+            mask = mask[:, None, :].repeat(1, z.shape[1], 1)
+        orig_mask = mask
+        logging.debug(f"created mask with shape {mask.shape}")
+
+        ###########
+        # set up #
+        ##########
+        # apply the mask to z
+        z_masked = z.masked_fill(mask.bool(), self.mask_token)
+
+        # how many codebooks are we inferring vs conditioning on?
+        n_infer_codebooks = self.n_codebooks - self.n_conditioning_codebooks
+        logging.debug(f"n infer codebooks: {n_infer_codebooks}")
+
+        #################
+        # begin sampling #
+        #################
+        # add one sampling step for each codebook level
+        logging.debug(f"initial mask: {mask}")
+        logging.debug(f"adding {n_infer_codebooks} sampling steps")
+        steps = _sampling_steps + [1 for _ in range(n_infer_codebooks - len(_sampling_steps))]
+        # truncate if we have too many
+        steps = steps[:n_infer_codebooks]
+        for codebook_level, nsteps in enumerate(steps):
+
+            # apply the orig mask to z_masked, only in the current codebook level
+            # this is crucial due to the stemgen random masking we did during training
+            # which ensures all upper codebooks are masked while inferring the bottom ones.
+            z_masked[:, codebook_level, :] = torch.where(
+                orig_mask[:, codebook_level, :].bool(),
+                self.mask_token,
+                z_masked[:, codebook_level, :]
+            )
+
+            # how many mask tokens to begin with?
+            num_mask_tokens_at_start = (z_masked[:, codebook_level, :] == self.mask_token).sum(dim=-1)
+            logging.debug(f"num mask tokens at start: {num_mask_tokens_at_start}")
+
+            for i in range(nsteps):
+                logging.debug(f"processing cb level {codebook_level} of {len(steps)}")
+                logging.debug(f"step {i} of {nsteps}")
+
+                # our current schedule step
+                r = scalar_to_batch_tensor(
+                    (i + 1) / nsteps,
+                    z.shape[0]
+                ).to(z.device)
+                logging.debug(f"r: {r}")
+
+                # get latents
+                latents = self.embedding.from_codes(z_masked, codec)
+                logging.debug(f"computed latents with shape: {latents.shape}")
+
+                # infer from latents
+                # NOTE: this collapses the codebook dimension into the sequence dimension
+                logits = self.forward(
+                    latents,
+                )  # b, prob, seq
+                logits = logits.permute(0, 2, 1)  # b, seq, prob
+                logging.debug(f"permuted logits with shape: {logits.shape}")
+
+                sampled_z, selected_probs = sample_from_logits(
+                    logits, sample=(
+                        (i / nsteps) <= sample_cutoff
+                    ),
+                    temperature=sampling_temperature,
+                    typical_filtering=typical_filtering, typical_mass=typical_mass,
+                    typical_min_tokens=typical_min_tokens,
+                    top_k=None, top_p=top_p, return_probs=True,
+                )
+
+                # fill selected probs with -inf if we're not in the codebook level we are sampling from
+                # find out which codebook we are sampling from
+                selected_probs = codebook_unflatten(selected_probs, n_infer_codebooks)
+                selected_probs[:, codebook_level+1:, :,] = -float("inf")  # all the ones above
+                # selected_probs[:, :codebook_level, :,] = -float("inf")
+                logging.debug(f"masking all but codebook {codebook_level}")
+                logging.debug(f"selected probs: {selected_probs}")
+                logging.debug(mask)
+                selected_probs = codebook_flatten(selected_probs)
+
+                logging.debug(f"sampled z with shape: {sampled_z.shape}")
+
+                # flatten z_masked and mask, so we can deal with the sampling logic
+                # we'll unflatten them at the end of the loop for the next forward pass
+                # remove conditioning codebooks, we'll add them back at the end
+                z_masked = codebook_flatten(z_masked[:, self.n_conditioning_codebooks:, :])
+
+                mask = (z_masked == self.mask_token).int()
+                logging.debug(f"mask now: {mask}")
+
+                # update the mask, remove conditioning codebooks from the mask
+                logging.debug(f"updated mask with shape: {mask.shape}")
+
+                # add z back into sampled z where the mask was false
+                sampled_z = torch.where(
+                    mask.bool(), sampled_z, z_masked
+                )
+                logging.debug(f"added z back into sampled z with shape: {sampled_z.shape}")
+
+                # get the num tokens to mask, according to the schedule
+                num_to_mask = torch.floor(_gamma(r) * num_mask_tokens_at_start).unsqueeze(1).long()
+                # num_to_mask = torch.floor(r * num_mask_tokens_at_start).unsqueeze(1).long()  # doesn't work at all this way
+                logging.debug(f"num to mask: {num_to_mask}")
+                logging.debug(f"masking {num_to_mask.sum()} tokens")
+
+                if i != (nsteps - 1):
+                    mask = codebook_unflatten(mask, n_infer_codebooks)
+                    num_to_mask = torch.maximum(
+                        torch.tensor(1),
+                        torch.minimum(
+                            mask[:, codebook_level, :].sum(dim=-1, keepdim=True) - 1,
+                            num_to_mask
+                        )
+                    )
+                    logging.debug(f"will mask {num_to_mask.sum()} tokens")
+                    mask = codebook_flatten(mask)
+
+                # ignore any tokens that weren't masked
+                selected_probs = torch.where(
+                    mask.bool(), selected_probs, torch.inf
+                )
+
+                # add a causal weight to the selected probs
+                # NOTE: some experiments i did showed that this didn't help.
+                # set it to 0 until further eval
+                causal_probs = torch.linspace(1, 0, z_masked.shape[-1], device=z_masked.device)
+                causal_probs = causal_probs.repeat(z_masked.shape[0], 1)
+                selected_probs = selected_probs + causal_probs * causal_weight
+
+                # # get our new mask
+                ############
+                mask = codebook_unflatten(mask, n_infer_codebooks)
+                selected_probs = codebook_unflatten(selected_probs, n_infer_codebooks)
+
+                # only consider probs at current level
+                selected_probs_cur_level = selected_probs[:, codebook_level, :]
+                mask_cur_level = mask_by_random_topk(
+                    num_to_mask, selected_probs_cur_level, mask_temperature * (1-r.unsqueeze(1))
+                )
+                mask[:, codebook_level, :] = mask_cur_level
+
+                mask = codebook_flatten(mask)
+                selected_probs = codebook_flatten(selected_probs)
+                ###############
+
+                # update the mask
+                z_masked = torch.where(
+                    mask.bool(), self.mask_token, sampled_z
+                )
+                logging.debug(f"updated z_masked with shape: {z_masked.shape}")
+
+                z_masked = codebook_unflatten(z_masked, n_infer_codebooks)
+                mask = codebook_unflatten(mask, n_infer_codebooks)
+                logging.debug(f"unflattened z_masked with shape: {z_masked.shape}")
+
+                # add conditioning codebooks back to z_masked
+                z_masked = torch.cat(
+                    (z[:, :self.n_conditioning_codebooks, :], z_masked), dim=1
+                )
+                logging.debug(f"added conditioning codebooks back to z_masked with shape: {z_masked.shape}")
+
+        # add conditioning codebooks back to sampled_z
+        sampled_z = codebook_unflatten(sampled_z, n_infer_codebooks)
+        sampled_z = torch.cat(
+            (z[:, :self.n_conditioning_codebooks, :], sampled_z), dim=1
+        )
+
+        logging.debug(f"finished sampling")
+
+        if return_signal:
+            return self.to_signal(sampled_z, codec)
+        else:
+            return sampled_z
+
+
+    def og_method(
+        self,
+        codec,
+        time_steps: int = 300,
+        _sampling_steps: List[int] = [16, 8, 8, 2, 2, 2, 2, 1, 1],
+        start_tokens: Optional[torch.Tensor] = None,
+        sampling_temperature: float = 1.0,
+        mask: Optional[torch.Tensor] = None,
+        mask_temperature: float = 10.5,
+        typical_filtering=True,
+        typical_mass=0.2,
+        typical_min_tokens=1,
+        top_p=0.9,
+        seed: int = None,
+        sample_cutoff: float = 0.75,
+        return_signal=True,
+        debug=False,
+        causal_weight: float = 0.0,
     ):
         if seed is not None:
             at.util.seed(seed)
+        sampling_steps = sum(_sampling_steps)
         logging.debug(f"beginning generation with {sampling_steps} steps")
 
 
@@ -763,6 +1010,10 @@ class VampNet(at.ml.BaseModel):
         else:
             return sampled_z
 
+
+
+
+
 def sample_from_logits(
     logits,
     sample: bool = True,
@@ -942,12 +1193,11 @@ if __name__ == "__main__":
     pred = z_hat.argmax(dim=1)
     pred = model.embedding.unflatten(pred, n_codebooks=model.n_predict_codebooks)
 
-    print(f"model has {num_params(model)/1e6:<.3f}M parameters")
-    print(f"prediction has shape {pred.shape}")
+    logging.debug(f"model has {num_params(model)/1e6:<.3f}M parameters")
+    logging.debug(f"prediction has shape {pred.shape}")
     breakpoint()
 
     args = argbind.parse_args()
     with argbind.scope(args):
         try_model()
-
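The new generate() walks codebook levels with a per-level step budget; resolving that budget is just pad-then-truncate, shown here standalone:

_sampling_steps = [16, 8, 8, 2]
n_infer_codebooks = 9

# pad with 1-step levels for any codebook beyond the list
# (if the list is longer than the codebooks, the range is empty and nothing is appended)
steps = _sampling_steps + [1 for _ in range(n_infer_codebooks - len(_sampling_steps))]
# then truncate in case the list had more entries than codebooks
steps = steps[:n_infer_codebooks]
print(steps)  # [16, 8, 8, 2, 1, 1, 1, 1, 1]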
 
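The per-step re-masking count follows _gamma(r); assuming the usual MaskGIT-style cosine schedule gamma(r) = cos(r * pi / 2) (the definition of _gamma is not shown in this diff), the fraction of tokens kept masked shrinks toward 0 as r approaches 1:

import math

def gamma(r: float) -> float:
    # assumed cosine schedule, as in MaskGIT-style samplers
    return math.cos(r * math.pi / 2)

num_mask_tokens_at_start = 100
nsteps = 4
for i in range(nsteps):
    r = (i + 1) / nsteps
    num_to_mask = math.floor(gamma(r) * num_mask_tokens_at_start)
    print(f"step {i}: r={r:.2f} -> re-mask {num_to_mask}/100 tokens")
# step 0: r=0.25 -> re-mask 92/100 tokens
# step 3: r=1.00 -> re-mask 0/100 tokens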