Text Generation
Transformers
PyTorch
English
llama
Inference Endpoints
text-generation-inference
arshzahed committed on
Commit
061211f
1 Parent(s): 3c84db1

Add _support_flash_attn_2 to Llama 2 32k (#37)

- Add _support_flash_attn_2 to Llama 2 32k (c761ba2f083d2de002465b0b74c438b8af1561aa)

Files changed (1):
  1. modeling_flash_llama.py +1 -0
modeling_flash_llama.py:

@@ -499,6 +499,7 @@ class LlamaPreTrainedModel(PreTrainedModel):
     supports_gradient_checkpointing = True
     _no_split_modules = ["LlamaDecoderLayer"]
     _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn_2 = True
 
     def _init_weights(self, module):
         std = self.config.initializer_range
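The one-line change in this commit flips a class-level capability flag that the transformers loading machinery consults before enabling Flash Attention 2 for a model class; if the flag is absent or False, the request is rejected. A minimal sketch of that gating pattern, using hypothetical stand-in classes rather than the real transformers code:

```python
# Hypothetical stand-ins illustrating the `_supports_flash_attn_2` gate;
# these are NOT the real transformers classes.
class PreTrainedModelStub:
    # Default: the architecture has not been validated for Flash Attention 2.
    _supports_flash_attn_2 = False

    @classmethod
    def check_flash_attn_2(cls):
        """Raise if this model class does not declare FA2 support."""
        if not cls._supports_flash_attn_2:
            raise ValueError(
                f"{cls.__name__} does not support Flash Attention 2"
            )
        return True


class FlashLlamaStub(PreTrainedModelStub):
    # Mirrors the one-line addition in the diff above.
    _supports_flash_attn_2 = True
```

Because the flag is a plain class attribute, subclasses opt in by shadowing it, while the base-class default keeps unvalidated architectures safely rejected.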