Dionyssos committed on
Commit d9889a1
1 Parent(s): 27d24be

duplicate xN_DRAW - for long gen

audiocraft/builders.py CHANGED
@@ -7,7 +7,7 @@
 import typing as tp
 import omegaconf
 import torch
-from .encodec import CompressionModel, EncodecModel
+from .encodec import EncodecModel
 from .lm import LMModel
 from .seanet import SEANetDecoder
 from .codebooks_patterns import DelayedPatternProvider
audiocraft/codebooks_patterns.py CHANGED
@@ -46,84 +46,12 @@ class Pattern:
     n_q: int
 
     def __post_init__(self):
-        assert len(self.layout) > 0
-        self._validate_layout()
+        # assert len(self.layout) > 0
+        # self._validate_layout()
         self._build_reverted_sequence_scatter_indexes = self._build_reverted_sequence_scatter_indexes
         self._build_pattern_sequence_scatter_indexes = self._build_pattern_sequence_scatter_indexes
         print("New pattern, time steps: %d, sequence steps: %d", self.timesteps, len(self.layout))
 
-    def _validate_layout(self):
-        """Runs checks on the layout to ensure a valid pattern is defined.
-        A pattern is considered invalid if:
-            - Multiple timesteps for the same codebook are defined in the same sequence step
-            - The timesteps for a given codebook are not in ascending order as we advance in the sequence
-              (this would mean that we have future timesteps before past timesteps).
-        """
-        q_timesteps = {q: 0 for q in range(self.n_q)}
-        for s, seq_coords in enumerate(self.layout):
-            if len(seq_coords) > 0:
-                qs = set()
-                for coord in seq_coords:
-                    qs.add(coord.q)
-                    last_q_timestep = q_timesteps[coord.q]
-                    assert coord.t >= last_q_timestep, \
-                        f"Past timesteps are found in the sequence for codebook = {coord.q} at step {s}"
-                    q_timesteps[coord.q] = coord.t
-                # each sequence step contains at most 1 coordinate per codebook
-                assert len(qs) == len(seq_coords), \
-                    f"Multiple entries for the same codebook are found at step {s}"
-                print(f'{qs=}\n\n\n\n QS VALIDATE LAYOUT')  # this prints 0,1,2,3, although
-                # if q_timesteps contains special_index, does this show something different from 0123?
-                # observed output: qs={0, 1}, then qs={0, 1, 2}, then qs={0, 1, 2, 3} for every later step
-
-    @property
-    def num_sequence_steps(self):
-        return len(self.layout) - 1
-
     @property
     def max_delay(self):
         max_t_in_seq_coords = 0
@@ -289,36 +217,6 @@ class Pattern:
 
-    # def revert_pattern_logits(self, logits,
-    #                           special_token,
-    #                           keep_only_valid_steps=False):
-    #     """Similar to ``revert_pattern_sequence`` with the following specificities:
-    #     1. It is designed to work with the extra cardinality dimension
-    #     2. We return the logits for the first sequence item that matches the special_token and
-    #        whose matching target in the original sequence is the first item of the sequence,
-    #        while we skip the last logits as there is no matching target
-    #     """
-    #     B, card, K, S = logits.shape
-    #     indexes, mask = self._build_reverted_sequence_scatter_indexes(
-    #         S, K, keep_only_valid_steps, is_model_output=True, device=logits.device
-    #     )
-    #     logits = logits.reshape(B, card, -1)
-    #     # we append the special token as the last index of our flattened z tensor
-    #     logits = torch.cat([logits, torch.zeros_like(logits[:, :, :1]) + special_token], dim=-1)  # [B, card, K x S]
-    #     values = logits[:, :, indexes.view(-1)]
-    #     values = values.view(B, card, K, indexes.shape[-1])
-    #     return values, indexes, mask
 
 
 class DelayedPatternProvider():
@@ -352,6 +250,7 @@ class DelayedPatternProvider():
         self.n_q = n_q
         if delays is None:
             delays = list(range(n_q))
+        print(f'{delays=} PATTERN __ini')
         self.delays = delays
         self.flatten_first = flatten_first
         self.empty_initial = empty_initial
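
Note: with delays = list(range(n_q)), codebook q at timestep t is emitted at sequence step t + q, which is why the validate-layout prints above grow codebook by codebook ({0, 1}, then {0, 1, 2}, then {0, 1, 2, 3}). A minimal sketch of that staggered layout, independent of the library code:

    # sketch only - (t, q) coordinates per sequence step for the delayed pattern
    n_q, timesteps = 4, 4
    layout = [[] for _ in range(timesteps + n_q)]
    for t in range(timesteps):
        for q in range(n_q):
            layout[t + q].append((t, q))
    for s, coords in enumerate(layout):
        print(s, coords)  # early steps miss the later codebooks, e.g. step 1 -> [(1, 0), (0, 1)]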
audiocraft/encodec.py CHANGED
@@ -1,100 +1,14 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-"""Compression models or wrappers around existing models.
-Also defines the main interface that a model must follow to be usable as an audio tokenizer.
-"""
-
-from abc import ABC, abstractmethod
-import logging
-from pathlib import Path
 import typing as tp
-
 from einops import rearrange
 import numpy as np
 import torch
 from torch import nn
-from transformers import EncodecModel as HFEncodecModel
-
-
-logger = logging.getLogger()
-
-
-class CompressionModel(ABC, nn.Module):
-    """Base API for all compression models that aim at being used as audio tokenizers
-    with a language model.
-    """
-
-    @abstractmethod
-    def decode(self, codes: torch.Tensor, scale: tp.Optional[torch.Tensor] = None):
-        """See `EncodecModel.decode`."""
-        ...
-
-    @abstractmethod
-    def decode_latent(self, codes: torch.Tensor):
-        """Decode from the discrete codes to continuous latent space."""
-        ...
-
-    @property
-    @abstractmethod
-    def channels(self) -> int:
-        ...
-
-    @property
-    @abstractmethod
-    def frame_rate(self) -> float:
-        ...
-
-    @property
-    @abstractmethod
-    def sample_rate(self) -> int:
-        ...
-
-    @property
-    @abstractmethod
-    def cardinality(self) -> int:
-        ...
-
-    @property
-    @abstractmethod
-    def num_codebooks(self) -> int:
-        ...
-
-    @property
-    @abstractmethod
-    def total_codebooks(self) -> int:
-        ...
-
-    @abstractmethod
-    def set_num_codebooks(self, n: int):
-        """Set the active number of codebooks used by the quantizer."""
-        ...
-
-
-class EncodecModel(CompressionModel):
-    """Encodec model operating on the raw waveform.
-
-    Args:
-        encoder (nn.Module): Encoder network.
-        decoder (nn.Module): Decoder network.
-        quantizer (qt.BaseQuantizer): Quantizer network.
-        frame_rate (int): Frame rate for the latent representation.
-        sample_rate (int): Audio sample rate.
-        channels (int): Number of audio channels.
-        causal (bool): Whether to use a causal version of the model.
-        renormalize (bool): Whether to renormalize the audio before running the model.
-    """
-    # we need assignment to override the property in the abstract class,
-    # I couldn't find a better way...
-    frame_rate: float = 0
-    sample_rate: int = 0
-    channels: int = 0
+class EncodecModel(nn.Module):
 
     def __init__(self,
                  decoder=None,
@@ -104,8 +18,11 @@ class EncodecModel(CompressionModel):
                  channels=None,
                  causal=False,
                  renormalize=False):
         super().__init__()
+        self.frame_rate = 0
+        self.sample_rate = 0
+        self.channels = 0
         self.decoder = decoder
         self.quantizer = quantizer
         self.frame_rate = frame_rate
@@ -117,6 +34,7 @@ class EncodecModel(CompressionModel):
         # we force disabling here to avoid handling linear overlap of segments
         # as supported in original EnCodec codebase.
         assert not self.renormalize, 'Causal model does not support renormalize'
+
 
     @property
     def total_codebooks(self):
@@ -128,7 +46,7 @@ class EncodecModel(CompressionModel):
         """Active number of codebooks used by the quantizer."""
         return self.quantizer.num_codebooks
 
-    def set_num_codebooks(self, n: int):
+    def set_num_codebooks(self, n):
         """Set the active number of codebooks used by the quantizer."""
         self.quantizer.set_num_codebooks(n)
audiocraft/genmodel.py CHANGED
@@ -3,7 +3,7 @@ import omegaconf
 import torch
 
 from abc import ABC, abstractmethod
-from .encodec import CompressionModel
+
 from .lm import LMModel
 from .conditioners import ConditioningAttributes
 from .utils.autocast import TorchAutocast
@@ -18,7 +18,7 @@ class BaseGenModel(ABC):
         lm (LMModel): Language model over discrete representations.
         max_duration (float, optional): As we use the top-250 token draw(), we can generate xN sequences.
     """
-    def __init__(self, name: str, compression_model: CompressionModel, lm: LMModel,
+    def __init__(self, name: str, compression_model, lm: LMModel,
                  max_duration: tp.Optional[float] = None):
         self.name = name
         self.compression_model = compression_model
@@ -131,6 +131,9 @@ class BaseGenModel(ABC):
                                           **self.generation_params)
         else:
             print('<>Long gen ?<>')
+            # print(f'{gen_tokens.shape=}')  # [5, 4, 35]
+            # FLATTEN BATCH AS EXTRA SEQUENCE (THE BATCH IS VIRTUAL - JUST MULTINOMIAL SAMPLING OF N_DRAW TOKENS)
+            gen_tokens = gen_tokens.transpose(0, 1).reshape(4, -1)[None, :, :]
         return gen_tokens
 
     def generate_audio(self, gen_tokens: torch.Tensor) -> torch.Tensor:
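
Note: a small sketch of the reshape above, assuming n_draw=5 draws, K=4 codebooks and T=35 steps as in the commented shape (the hard-coded 4 in the diff is the codebook count):

    import torch
    gen_tokens = torch.randint(0, 2048, (5, 4, 35))          # [n_draw, K, T]
    flat = gen_tokens.transpose(0, 1).reshape(4, -1)[None, :, :]
    print(flat.shape)  # torch.Size([1, 4, 175]) - the 5 draws concatenated in time, per codebook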
audiocraft/lm.py CHANGED
@@ -148,7 +148,7 @@ class LMModel(StreamingModule):
         super().__init__()
         self.cfg_coef = cfg_coef
 
-
+        self.n_draw = 20
         self.condition_provider = condition_provider
         self.fuser = fuser
         self.card = card  # 2048 ?
@@ -255,23 +255,7 @@ class LMModel(StreamingModule):
                            top_p: float = 0.0,
                            cfg_coef: tp.Optional[float] = None,
                            two_step_cfg: tp.Optional[bool] = None) -> torch.Tensor:
-        """Sample next token from the model given a sequence and a set of conditions. The model supports
-        multiple sampling strategies (greedy sampling, softmax, top-k, top-p...).
-
-        Args:
-            sequence (torch.Tensor): Current sequence of shape [B, K, S]
-                with K corresponding to the number of codebooks and S the number of sequence steps.
-                S = 1 in streaming mode, except for the first step that contains a bigger prompt.
-            condition_tensors (dict[str, ConditionType]): Set of conditions. If CFG is used,
-                should be twice the batch size, being the concatenation of the conditions + null conditions.
-            use_sampling (bool): Whether to use a sampling strategy or not.
-            temp (float): Sampling temperature.
-            top_k (int): K for "top-k" sampling.
-            top_p (float): P for "top-p" sampling.
-            cfg_coef (float, optional): Classifier-free guidance coefficient.
-        Returns:
-            next_token (torch.Tensor): Next token tensor of shape [B, K, 1].
-        """
+        """Draws self.n_draw next tokens per step."""
         B = sequence.shape[0]
         cfg_coef = self.cfg_coef if cfg_coef is None else cfg_coef
         model = self if self._fsdp is None else self._fsdp
@@ -283,9 +267,11 @@ class LMModel(StreamingModule):
         assert isinstance(cfg_conditions, dict)
         condition_tensors = cfg_conditions
         if condition_tensors:
-            # print('\nD\n')
-            # Preparing for CFG, predicting both conditional and unconditional logits.
-            sequence = torch.cat([sequence, sequence], dim=0)
+            print('\nDcat\n')  # enters here
+
+            sequence = torch.cat([sequence, sequence], dim=0)
+            # concatenates in batch, but we only want to run the 1st sequence (the continuation);
+            # the other paths would be built "blindly"
         all_logits = model(
             sequence,
             conditions=[], condition_tensors=condition_tensors)
@@ -298,24 +284,25 @@ class LMModel(StreamingModule):
             print('\nF!\n')
 
 
-        logits = logits.permute(0, 1, 3, 2)  # [B, K, card, T]
-        logits = logits[..., -1]  # [B x K x card]
+        logits = logits.permute(0, 1, 3, 2)  # [1, 4, 2048, 1]
+        # no crop - this is just a squeeze() of the time axis
+        logits = logits[..., -1]  # [1, 4, 2048]
+
 
         # Apply softmax for sampling if temp > 0. Else, do greedy sampling to avoid zero division error.
-        if use_sampling and temp > 0.0:
-            # print(f'\nR {temp=} {top_p=} {top_k=}\n')  # -------> R temp=1.0 top_p=0.0 top_k=250
-            probs = torch.softmax(logits / temp, dim=-1)
-            if top_p > 0.0:
-                next_token = utils.sample_top_p(probs, p=top_p)
-            elif top_k > 0:
-                next_token = utils.sample_top_k(probs, k=top_k)
-            else:
-                next_token = utils.multinomial(probs, num_samples=1)
-        else:
-            print('\nNeverHere\n')
-
+
+        # print(f'\nR {temp=} {top_p=} {top_k=}\n')  # -------> R temp=1.0 top_p=0.0 top_k=250
+        probs = torch.softmax(logits / temp, dim=-1)
+
+        next_token = utils.sample_top_k(probs, k=top_k, n_draw=self.n_draw)
+
+        # the decoder will smooth the transitions; if we draw a 2nd token we still
+        # need it for the replicas later, so keep the draws as a batch and reshape
+        # at the final time-inversion
+
+        # returns multiple tokens here (batch_size = n_draw)
         return next_token
 
     # GENERATE class revert_codebook_patterns()
@@ -324,7 +311,7 @@ class LMModel(StreamingModule):
                  prompt=None,
                  conditions=[],
                  num_samples=1,  # THIS IS HOW MANY GENERATIONS - A SAMPLE IS A FULL WAV
-                 max_gen_len: int = 256,
+                 max_gen_len=256,  # unduplicated sequence length - the actual length will be n_draw * max_gen_len
                  use_sampling: bool = True,
                  temp: float = 1.0,
                  top_k: int = 250,
@@ -335,6 +322,7 @@ class LMModel(StreamingModule):
                  check: bool = False,
                  callback: tp.Optional[tp.Callable[[int, int], None]] = None,
                  **kwargs) -> torch.Tensor:
+
         print(f'{num_samples=}')
         first_param = next(iter(self.parameters()))
         device = first_param.device
@@ -364,10 +352,10 @@ class LMModel(StreamingModule):
 
         B, K, T = prompt.shape
         start_offset = T
-        assert start_offset < max_gen_len
+
 
-        pattern = self.pattern_provider.get_pattern(max_gen_len)
-        # this token is used as default value for codes that are not generated yet
+        pattern = self.pattern_provider.get_pattern(max_gen_len)  # duplicate sequence
+        # this token is used as default value for codes that are not generated yet ?
        unknown_token = -1
 
 
@@ -375,32 +363,46 @@ class LMModel(StreamingModule):
 
         gen_codes[..., :start_offset] = prompt  # place 0
 
-        gen_sequence, _, mask = pattern.build_pattern_sequence(gen_codes, self.special_token_id)
-
-        start_offset_sequence = pattern.get_first_step_with_timesteps(start_offset)
-        # print('\n=', start_offset_sequence, '\n=')  # 1
-        assert start_offset_sequence is not None
+        _gen_sequence, _, mask = pattern.build_pattern_sequence(gen_codes, self.special_token_id)
+
 
         with self.streaming():
+
             unconditional_state = self.get_streaming_state()
             prev_offset = 0
-            gen_sequence_len = gen_sequence.shape[-1]  # gen_sequence shape is [B, K, S]
+            gen_sequence_len = _gen_sequence.shape[-1]  # gen_sequence shape is [B, K, S]
 
             # --
             # print(mask.shape, mask.sum(), 'MSK LM')
             # torch.Size([4, 39]) tensor(140, device='cuda:0') MSK LM ? fully 1, normal, no special token
             # --
+            duplicate_draw = [
+                _gen_sequence[:, :, 0:1].repeat(self.n_draw, 1, 1)
+            ]
+            # list holding the next tokens - we draw multiple tokens at each time-step
+            # but continue the sequence with only a single next token
 
-            for offset in range(start_offset_sequence, gen_sequence_len):
-                # get current sequence (note that the streaming API is providing the caching over previous offsets)
-
-                curr_sequence = gen_sequence[..., prev_offset:offset]
-                curr_mask = mask[None, ..., prev_offset:offset].expand(B, -1, -1)
+            for offset in range(1, gen_sequence_len):  # start_offset_sequence=1
+                print(f'{offset=}')
+                # starts from 1, not 0, and thus uses 0:1 as the current sequence,
+                # although this is empty / contains -1 ?
+
+                curr_sequence = _gen_sequence[..., prev_offset:offset]
+
 
                 next_token = self._sample_next_token(
-                    curr_sequence, cfg_conditions, unconditional_state, use_sampling, temp, top_k, top_p,
-                    cfg_coef=cfg_coef, two_step_cfg=two_step_cfg)
+                    curr_sequence,
+                    cfg_conditions,
+                    unconditional_state,
+                    use_sampling,
+                    temp, top_k, top_p,
+                    cfg_coef=cfg_coef,
+                    two_step_cfg=two_step_cfg)  # [n_draw, 4, 1], e.g. [5, 4, 1]
+                print(f'{next_token.shape=}')
+                # replicates the sequence to hold n_draw sequences, as we generate n_draw tokens per step
 
@@ -418,23 +420,32 @@ class LMModel(StreamingModule):
 
                 # next_token[:] = self.special_token_id  # seanet.embed torch.embedding does not have this - out of bounds in detokenize
 
-
-                # ensure we don't overwrite prompt tokens, we only write over unknown tokens
-
-                gen_sequence[..., offset:offset+1] = torch.where(
-                    gen_sequence[..., offset:offset+1] == unknown_token,
-                    next_token, gen_sequence[..., offset:offset+1]
-                )
+                _gen_sequence[..., offset:offset+1] = next_token[0, :, :]  # gen_sequence.shape=torch.Size([1, 4, 39])
+                # only concatenate 1 token to the 1 continued sequence - preserve the duplicates in
+                duplicate_draw.append(next_token)
                 prev_offset = offset
 
 
             unconditional_state.clear()
+
+        gen_sequence = torch.cat(duplicate_draw, 2)  # [self.n_draw, 4, len_seq]
+
+        # revert the codes as a "batch"
+
+        # in the decoder - flatten:
+        # _, tokd, len_seq = gen_sequence.shape
+        # gen_sequence = gen_sequence.transpose(0, 1).reshape(tokd, self.n_draw * len_seq)[None, :, :]
+
+        print(f' <=> BEFORE CODES {gen_sequence.shape=} {_gen_sequence.shape=}\n')  # ARRIVES here also if special
 
         # revert_pattern_logits ~ NOT CALLED EXPLICITLY
-        out_codes, _, _ = pattern.revert_pattern_sequence(gen_sequence, special_token=unknown_token)
+        out_codes, _, _ = pattern.revert_pattern_sequence(gen_sequence,
+                                                          special_token=unknown_token)
+
 
         # set(out_codes.unique().tolist()) - set(gen_sequence.unique().tolist())  # set()
 
@@ -448,7 +459,7 @@ class LMModel(StreamingModule):
         # -> unknown token = -1 or 2048
         # unknown_token=-1
 
-        # print(f' <=> CODES {out_codes.shape=} {out_codes.min()} {out_codes.max()}\n')  # ARRIVES here also if special
+        print(f' <=> CODES {out_codes.shape=} {out_codes.min()} {out_codes.max()}\n')  # ARRIVES here also if special
 
         # unknown_token=-1 gen_sequence.shape=torch.Size([1, 4, 39]) out_codes.shape=torch.Size([1, 4, 35])
         # <=> CODES out_codes.shape=torch.Size([1, 4, 35]) 30 2024
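
Note: a toy version of the duplicate_draw bookkeeping above, assuming n_draw=3 and K=4 (random tokens stand in for _sample_next_token):

    import torch
    n_draw, K, seq_len = 3, 4, 6
    gen = torch.full((1, K, seq_len), -1)              # the single "real" sequence
    draws = [gen[:, :, 0:1].repeat(n_draw, 1, 1)]      # step 0 replicated per draw
    for offset in range(1, seq_len):
        next_token = torch.randint(0, 2048, (n_draw, K, 1))
        gen[..., offset:offset + 1] = next_token[0]    # continue the sequence with draw 0 only
        draws.append(next_token)                       # but keep every draw
    gen_sequence = torch.cat(draws, 2)
    print(gen_sequence.shape)  # torch.Size([3, 4, 6]) - n_draw parallel sequences to revert as a "batch"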
audiocraft/loaders.py CHANGED
@@ -29,7 +29,7 @@ import torch
 
 import audiocraft
 from . import builders
-from .encodec import CompressionModel
+from .encodec import EncodecModel
 
 
 def get_audiocraft_cache_dir() -> tp.Optional[str]:
@@ -75,7 +75,7 @@ def load_compression_model_ckpt(file_or_url_or_id: tp.Union[Path, str], cache_di
 def load_compression_model(file_or_url_or_id: tp.Union[Path, str], device='cpu', cache_dir: tp.Optional[str] = None):
     pkg = load_compression_model_ckpt(file_or_url_or_id, cache_dir=cache_dir)
     if 'pretrained' in pkg:
-        return CompressionModel.get_pretrained(pkg['pretrained'], device=device)
+        return EncodecModel.get_pretrained(pkg['pretrained'], device=device)
     cfg = OmegaConf.create(pkg['xp.cfg'])
     cfg.device = str(device)
     model = builders.get_compression_model(cfg)
audiocraft/transformer.py CHANGED
@@ -661,34 +661,39 @@ class StreamingTransformer(StreamingModule):
 
     def _apply_layer(self, layer, *args, **kwargs):
         method = self.checkpointing
+        print(f'{method=}')
         if method == 'none':
-            return layer(*args, **kwargs)
-        elif method == 'torch':
-            return torch_checkpoint(layer, *args, use_reentrant=False, **kwargs)
-        elif method.startswith('xformers'):
-            from xformers.checkpoint_fairinternal import checkpoint, _get_default_policy
-            if method == 'xformers_default':
-                # those operations will be saved, and not recomputed.
-                # According to Francisco we can get smarter policies but this is a good start.
-                allow_list = [
-                    "xformers.efficient_attention_forward_cutlass.default",
-                    "xformers_flash.flash_fwd.default",
-                    "aten.addmm.default",
-                    "aten.mm.default",
-                ]
-            elif method == 'xformers_mm':
-                # those operations will be saved, and not recomputed.
-                # According to Francisco we can get smarter policies but this is a good start.
-                allow_list = [
-                    "aten.addmm.default",
-                    "aten.mm.default",
-                ]
-            else:
-                raise ValueError(f"xformers checkpointing xformers policy {method} is not known.")
-            policy_fn = _get_default_policy(allow_list)
-            return checkpoint(layer, *args, policy_fn=policy_fn, **kwargs)
-        else:
-            raise ValueError(f"Checkpointing method {method} is unknown.")
+            print([i.shape for i in args])
+            x = layer(*args, **kwargs)  # [10, 1, 1536] - probably does not detect the batch somewhere
+            return x
+        # elif method == 'torch':
+        #     print('TORCH')
+        #     return torch_checkpoint(layer, *args, use_reentrant=False, **kwargs)
+        # elif method.startswith('xformers'):
+        #     print('XFORMERS')
+        #     from xformers.checkpoint_fairinternal import checkpoint, _get_default_policy
+        #     if method == 'xformers_default':
+        #         # those operations will be saved, and not recomputed.
+        #         # According to Francisco we can get smarter policies but this is a good start.
+        #         allow_list = [
+        #             "xformers.efficient_attention_forward_cutlass.default",
+        #             "xformers_flash.flash_fwd.default",
+        #             "aten.addmm.default",
+        #             "aten.mm.default",
+        #         ]
+        #     elif method == 'xformers_mm':
+        #         # those operations will be saved, and not recomputed.
+        #         # According to Francisco we can get smarter policies but this is a good start.
+        #         allow_list = [
+        #             "aten.addmm.default",
+        #             "aten.mm.default",
+        #         ]
+        #     else:
+        #         raise ValueError(f"xformers checkpointing xformers policy {method} is not known.")
+        #     policy_fn = _get_default_policy(allow_list)
+        #     return checkpoint(layer, *args, policy_fn=policy_fn, **kwargs)
+        # else:
+        #     raise ValueError(f"Checkpointing method {method} is unknown.")
 
     def forward(self, x: torch.Tensor, *args, **kwargs):
         B, T, C = x.shape
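
Note: one reading of the [10, 1, 1536] shape printed above - an assumption, not confirmed by the commit - is n_draw sequences doubled for classifier-free guidance at a single streaming step:

    # assumption only: batch entering each layer = n_draw draws x 2 for CFG
    n_draw, cfg_factor, d_model = 5, 2, 1536
    B = n_draw * cfg_factor
    print([B, 1, d_model])  # [10, 1, 1536] -> [B, T=1, C] at one streaming step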
audiocraft/utils/cluster.py DELETED
@@ -1,75 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-
-"""
-Utility functions for SLURM configuration and cluster settings.
-"""
-
-from enum import Enum
-import os
-import socket
-import typing as tp
-
-import omegaconf
-
-
-class ClusterType(Enum):
-    AWS = "aws"
-    FAIR = "fair"
-    RSC = "rsc"
-    LOCAL_DARWIN = "darwin"
-    DEFAULT = "default"  # used for any other cluster.
-
-
-def _guess_cluster_type() -> ClusterType:
-    uname = os.uname()
-    fqdn = socket.getfqdn()
-    if uname.sysname == "Linux" and (uname.release.endswith("-aws") or ".ec2" in fqdn):
-        return ClusterType.AWS
-
-    if fqdn.endswith(".fair"):
-        return ClusterType.FAIR
-
-    if fqdn.endswith(".facebook.com"):
-        return ClusterType.RSC
-
-    if uname.sysname == "Darwin":
-        return ClusterType.LOCAL_DARWIN
-
-    return ClusterType.DEFAULT
-
-
-def get_cluster_type(
-    cluster_type: tp.Optional[ClusterType] = None,
-) -> tp.Optional[ClusterType]:
-    if cluster_type is None:
-        return _guess_cluster_type()
-
-    return cluster_type
-
-
-def get_slurm_parameters(
-    cfg: omegaconf.DictConfig, cluster_type: tp.Optional[ClusterType] = None
-) -> omegaconf.DictConfig:
-    """Update SLURM parameters in configuration based on cluster type.
-    If the cluster type is not specified, it is inferred automatically.
-    """
-    from ..environment import AudioCraftEnvironment
-    cluster_type = get_cluster_type(cluster_type)
-    # apply cluster-specific adjustments
-    if cluster_type == ClusterType.AWS:
-        cfg["mem_per_gpu"] = None
-        cfg["constraint"] = None
-        cfg["setup"] = []
-    elif cluster_type == ClusterType.RSC:
-        cfg["mem_per_gpu"] = None
-        cfg["setup"] = []
-        cfg["constraint"] = None
-        cfg["partition"] = "learn"
-        slurm_exclude = AudioCraftEnvironment.get_slurm_exclude()
-        if slurm_exclude is not None:
-            cfg["exclude"] = slurm_exclude
-    return cfg
audiocraft/utils/deadlock.py DELETED
@@ -1,58 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-
-import logging
-import os
-from queue import Queue, Empty
-import signal
-import sys
-import threading
-import traceback
-
-logger = logging.getLogger(__name__)
-
-
-class DeadlockDetect:
-    def __init__(self, use: bool = False, timeout: float = 120.):
-        self.use = use
-        self.timeout = timeout
-        self._queue: Queue = Queue()
-
-    def update(self, stage: str):
-        if self.use:
-            self._queue.put(stage)
-
-    def __enter__(self):
-        if self.use:
-            self._thread = threading.Thread(target=self._detector_thread)
-            self._thread.start()
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        if self.use:
-            self._queue.put(None)
-            self._thread.join()
-
-    def _detector_thread(self):
-        logger.debug("Deadlock detector started")
-        last_stage = "init"
-        while True:
-            try:
-                stage = self._queue.get(timeout=self.timeout)
-            except Empty:
-                break
-            if stage is None:
-                logger.debug("Exiting deadlock detector thread")
-                return
-            else:
-                last_stage = stage
-        logger.error("Deadlock detector timed out, last stage was %s", last_stage)
-        for th in threading.enumerate():
-            print(th, file=sys.stderr)
-            traceback.print_stack(sys._current_frames()[th.ident])
-            print(file=sys.stderr)
-        sys.stdout.flush()
-        sys.stderr.flush()
-        os.kill(os.getpid(), signal.SIGKILL)
audiocraft/utils/utils.py CHANGED
@@ -86,47 +86,24 @@ def get_dataset_from_loader(dataloader):
     return dataset
 
 
-def multinomial(input: torch.Tensor, num_samples: int, replacement=False, *, generator=None):
-    """torch.multinomial with arbitrary number of dimensions, and number of candidates on the last dimension.
-
-    Args:
-        input (torch.Tensor): The input tensor containing probabilities.
-        num_samples (int): Number of samples to draw.
-        replacement (bool): Whether to draw with replacement or not.
-    Keyword args:
-        generator (torch.Generator): A pseudorandom number generator for sampling.
-    Returns:
-        torch.Tensor: Last dimension contains num_samples indices
-            sampled from the multinomial probability distribution
-            located in the last dimension of tensor input.
-    """
-    input_ = input.reshape(-1, input.shape[-1])
-    output_ = torch.multinomial(input_, num_samples=num_samples, replacement=replacement, generator=generator)
-    output = output_.reshape(*list(input.shape[:-1]), -1)
-    # print('MULTINOmial', input.shape, output.shape)  # MULTINOmial torch.Size([1, 4, 2048]) torch.Size([1, 4, 1])
-    # output = input[..., 0:1]
-    return output
 
 
-def sample_top_k(probs: torch.Tensor, k: int) -> torch.Tensor:
-    """Sample next token from top K values along the last dimension of the input probs tensor.
-
-    Args:
-        probs (torch.Tensor): Input probabilities with token candidates on the last dimension.
-        k (int): The k in "top-k".
-    Returns:
-        torch.Tensor: Sampled tokens.
+def sample_top_k(p, k, n_draw=None):
+    """
+    p: probabilities over the 2048-token cardinality
+    n_draw: how many tokens to sample (for duplicate elongation)
     """
-    top_k_value, i250 = torch.topk(probs, k, dim=-1)  # probs: [1, 4, 2048]
+    top_k_value, i250 = torch.topk(p, k, dim=-1)  # probs: [1, 4, 2048]
     min_value_top_k = top_k_value[..., [-1]]
-    probs *= (probs >= min_value_top_k).float()  # multiply everything >= min top-k by 1, zeroing the others
-    probs.div_(probs.sum(dim=-1, keepdim=True))  # renormalize by the sum so multinomial gets a distribution
-    next_token = multinomial(probs, num_samples=1)
-    # instead of choosing via multinomial, what happens if we take all 250 top-k tokens?
-    # probs.shape=torch.Size([1, 4, 2048]); print(next_token, f'{probs.shape=}', 'h')  # [1, 4, 1], next token is 4 tokens
-    # next_token = i250
-    return next_token
+    p *= (p >= min_value_top_k).float()
+    p.div_(p.sum(dim=-1, keepdim=True))
+    # -- next_token = multinomial(probs, num_samples=n_draw)
+    p_ = p.reshape(-1, p.shape[-1])
+    out = torch.multinomial(p_,
+                            num_samples=n_draw,
+                            replacement=False)  # [4, n_draw]
+    return out.transpose(0, 1)[:, :, None]  # [n_draw, 4, 1]
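
Note: a usage sketch for the rewritten sample_top_k (assumed in scope as defined above), with probabilities over card=2048 tokens for K=4 codebooks at a single step; shapes follow the comments in the diff:

    import torch
    logits = torch.randn(1, 4, 2048)
    probs = torch.softmax(logits, dim=-1)
    tokens = sample_top_k(probs, k=250, n_draw=5)   # draws 5 distinct tokens per codebook
    print(tokens.shape)  # torch.Size([5, 4, 1]) - [n_draw, K, 1], draw 0 continues the sequence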