olivierdehaene
/

optimized-santacoder

Text Generation

text-generation-inference

Model card Files Files and versions Community

OlivierDehaene committed on Jan 26, 2023

Commit

7d2ded6

•

1 Parent(s): 979206f

use torch.nn.functional.gelu instead

Files changed (1) hide show

modeling_gpt2_mq.py +2 -14

modeling_gpt2_mq.py CHANGED Viewed

@@ -71,26 +71,14 @@ def prepare_attn_mask(
     return combined_attention_mask
-@torch.jit.script
-def gelu_forward(x: torch.Tensor) -> torch.Tensor:
-    """
-    Custom bias GELU function. Adapted from Megatron-DeepSpeed code. Here we use a simple implementation (inference) to
-    make the model jitable.
-    Args:
-        x (`torch.tensor`, *required*):
-            input hidden states
-    """
-    return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))
 class LinearGPT2MLP(nn.Module):
     def __init__(self, intermediate_size, config):
         super().__init__()
         embed_dim = config.hidden_size
         self.c_fc = nn.Linear(embed_dim, intermediate_size)
         self.c_proj = nn.Linear(intermediate_size, embed_dim)
-        self.act = ACT2FN[config.activation_function] if "gelu" not in config.activation_function else gelu_forward
         self.dropout = nn.Dropout(config.resid_pdrop)
     def forward(self, hidden_states: Optional[Tuple[torch.FloatTensor]]) -> torch.FloatTensor:

     return combined_attention_mask
 class LinearGPT2MLP(nn.Module):
     def __init__(self, intermediate_size, config):
         super().__init__()
         embed_dim = config.hidden_size
         self.c_fc = nn.Linear(embed_dim, intermediate_size)
         self.c_proj = nn.Linear(intermediate_size, embed_dim)
+        self.act = ACT2FN[config.activation_function] if "gelu" not in config.activation_function else lambda \
+            x: torch.nn.functional.gelu(x, approximate="tanh")
         self.dropout = nn.Dropout(config.resid_pdrop)
     def forward(self, hidden_states: Optional[Tuple[torch.FloatTensor]]) -> torch.FloatTensor: