nvidia/Llama-3_1-Nemotron-51B-Instruct

@@ -34,18 +34,19 @@ class VariableCache(Cache_4_44_2, Cache):
     def __init__(
             self,
             config: DeciLMConfig,
             batch_size: int = None,
             max_cache_len: int = None,
-            device: torch.device = None,
             dtype: torch.dtype = torch.float32,
             max_batch_size: Optional[int] = None,
             **kwargs: Any,
     ) -> None:
         Cache_4_44_2.__init__(self)
-        self.config = config
         self.max_batch_size = batch_size or max_batch_size
         self.max_cache_len = config.max_position_embeddings if max_cache_len is None else max_cache_len
         self.dtype = dtype

     def __init__(
             self,
+            *,  # key-word only, no positional args allowed to avoid mix-ups with newer transformers versions
             config: DeciLMConfig,
             batch_size: int = None,
             max_cache_len: int = None,
             dtype: torch.dtype = torch.float32,
             max_batch_size: Optional[int] = None,
             **kwargs: Any,
     ) -> None:
         Cache_4_44_2.__init__(self)
+        self.config = deepcopy(config)
         self.max_batch_size = batch_size or max_batch_size
+        self.batch_size = self.max_batch_size
         self.max_cache_len = config.max_position_embeddings if max_cache_len is None else max_cache_len
         self.dtype = dtype