robinzixuan committed
Commit ab3b316 • 1 Parent(s): 3590b7f
Upload modeling_opt.py

modeling_opt.py CHANGED (+23 -9)
@@ -17,32 +17,37 @@
 from typing import List, Optional, Tuple, Union

 import torch
+
 import torch.nn.functional as F
 import torch.utils.checkpoint
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

-from
-from
-from
+from transformers.activations import ACT2FN
+from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
+from transformers.modeling_outputs import (
     BaseModelOutputWithPast,
     CausalLMOutputWithPast,
     QuestionAnsweringModelOutput,
     SequenceClassifierOutputWithPast,
 )
-
-from
+
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import (
     add_code_sample_docstrings,
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
+
     is_flash_attn_2_available,
     is_flash_attn_greater_or_equal_2_10,
     logging,
     replace_return_docstrings,
+
 )
 from .configuration_opt import OPTConfig


+
 if is_flash_attn_2_available():
     from flash_attn import flash_attn_func, flash_attn_varlen_func
     from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
@@ -128,6 +133,16 @@ def softmax_1(input: torch.Tensor, dim=-1, dtype=torch.float32) -> torch.Tensor:
     output = softmax_n_shifted_zeros(input, 1, dim=dim)
     return output if dtype is None else output.type(dtype=dtype)

+def clipped_softmax(data, dim=1, eta=1.1, gamma=-0.1, **kw):
+    sm_out = torch.nn.functional.softmax(data, dim=dim, **kw)
+    stretched_out = sm_out * (eta - gamma) + gamma
+    return torch.clip(stretched_out, 0, 1)
+
+
+def clipped_softmax1(data, dim=1, eta=1.1, gamma=-0.1, **kw):
+    sm_out = softmax_1(data, dim=dim, **kw)
+    stretched_out = sm_out * (eta - gamma) + gamma
+    return torch.clip(stretched_out, 0, 1)

 class OPTAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
@@ -147,7 +162,7 @@ class OPTAttention(nn.Module):

         self.head_dim = self.embed_dim // self.num_heads
         self.is_causal = True
-
+        self.softmax_fn = clipped_softmax1
         if (self.head_dim * self.num_heads) != self.embed_dim:
             raise ValueError(
                 f"embed_dim must be divisible by num_heads (got `embed_dim`: {
@@ -251,10 +266,10 @@ class OPTAttention(nn.Module):

         # upcast to fp32 if the weights are in fp16. Please see https://github.com/huggingface/transformers/pull/17437
         if attn_weights.dtype == torch.float16:
-            attn_weights =
+            attn_weights = self.softmax_fn(
                 attn_weights, dim=-1, dtype=torch.float32).to(torch.float16)
         else:
-            attn_weights =
+            attn_weights = self.softmax_fn(attn_weights, dim=-1)

         if layer_head_mask is not None:
             if layer_head_mask.size() != (self.num_heads,):
@@ -306,7 +321,6 @@ class OPTAttention(nn.Module):



-
 class OptFlashAttention2(OPTAttention):
     """
     OPT flash attention module. This module inherits from `OPTAttention` as the weights of the module stays untouched.