Commit 7fcc809 by svakhreev
Parent: 818a52a

Upload GPTRefactForCausalLM

Files changed (3):
  1. config.json +1 -1
  2. modeling_gpt_refact.py +9 -13
  3. pytorch_model.bin +2 -2
config.json CHANGED
@@ -20,7 +20,7 @@
   "n_layer": 32,
   "n_positions": 4096,
   "scale_attention_softmax_in_fp32": true,
-  "torch_dtype": "float16",
+  "torch_dtype": "bfloat16",
   "transformers_version": "4.31.0",
   "use_cache": true,
   "vocab_size": 49216
modeling_gpt_refact.py CHANGED
@@ -101,7 +101,6 @@ def get_alibi_biases(
     # Multiply them pair-wise to get the AliBi bias matrix
     biases = distance[:, :, None] * m[None, None, :]
     biases = biases.permute(2, 0, 1)[None, :, :T, :T]
-    biases = biases.repeat(B, 1, 1, 1)
     return biases.contiguous()
 
 
@@ -132,8 +131,7 @@ class Attention(nn.Module):
         self.attention_bias_in_fp32 = config.attention_bias_in_fp32
 
         self.q = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
-        self.k = nn.Linear(self.embed_dim, self.head_dim, bias=False)
-        self.v = nn.Linear(self.embed_dim, self.head_dim, bias=False)
+        self.kv = nn.Linear(self.embed_dim, self.head_dim * 2, bias=False)
         self.c_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
 
     def _get_mask_value(self, device, dtype):
@@ -200,8 +198,8 @@ class Attention(nn.Module):
             Tuple[torch.Tensor, Optional[torch.Tensor], Tuple[torch.Tensor, ...]],
         ]:
         query = self.q(hidden_states)
-        key = self.k(hidden_states)
-        value = self.v(hidden_states)
+        kv = self.kv(hidden_states)
+        key, value = kv.split(self.head_dim, dim=-1)
 
         if layer_past is not None:
             past_key, past_value = layer_past
@@ -231,15 +229,14 @@ class MLP(nn.Module):
         embed_dim = config.hidden_size
         hidden_dim = intermediate_size
         hidden_dim = int(2 * hidden_dim / 3)
-        hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
-        self.linear_1 = nn.Linear(embed_dim, hidden_dim, bias=False)
-        self.linear_3 = nn.Linear(embed_dim, hidden_dim, bias=False)
-        self.c_proj = nn.Linear(hidden_dim, embed_dim, bias=False)
+        self.hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
+        self.gate_up_proj = nn.Linear(embed_dim, self.hidden_dim * 2, bias=False)
+        self.c_proj = nn.Linear(self.hidden_dim, embed_dim, bias=False)
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        x1 = F.silu(self.linear_1(x))
-        x2 = self.linear_3(x)
-        x = self.c_proj(x1 * x2)
+        up_proj = self.gate_up_proj(x)
+        x1, x2 = torch.split(up_proj, self.hidden_dim, dim=-1)
+        x = self.c_proj(F.silu(x1) * x2)
         return x
 
 
@@ -264,7 +261,6 @@ class GPTRefactBlock(nn.Module):
         self.ln_1 = LayerNormNoBias(hidden_size, eps=config.layer_norm_epsilon)
         self.attn = Attention(config, layer_idx=layer_idx)
         self.ln_2 = LayerNormNoBias(hidden_size, eps=config.layer_norm_epsilon)
-
         self.mlp = MLP(self.inner_dim, config)
 
     def forward(
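The key/value projections are fused into a single `kv` linear layer, and the SwiGLU gate/up projections into a single `gate_up_proj` layer; splitting the fused output reproduces the previous two-layer computation exactly when the old weights are concatenated along the output dimension, which is why the checkpoint below is re-uploaded. The dropped `biases.repeat(B, 1, 1, 1)` is likewise redundant, since the ALiBi bias broadcasts over the batch dimension when added to the attention scores. A small standalone sketch of these equivalences, with made-up shapes rather than the model's real configuration:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

torch.manual_seed(0)
embed_dim, head_dim, hidden_dim = 16, 4, 8  # illustrative sizes only
B, T, n_head = 2, 5, 4
x = torch.randn(B, T, embed_dim)

# Fused key/value projection: concatenating the old k/v weights along the
# output dimension gives the same key and value tensors after a split.
k = nn.Linear(embed_dim, head_dim, bias=False)
v = nn.Linear(embed_dim, head_dim, bias=False)
kv = nn.Linear(embed_dim, head_dim * 2, bias=False)
with torch.no_grad():
    kv.weight.copy_(torch.cat([k.weight, v.weight], dim=0))
key, value = kv(x).split(head_dim, dim=-1)
assert torch.allclose(key, k(x), atol=1e-6)
assert torch.allclose(value, v(x), atol=1e-6)

# Fused SwiGLU gate/up projection: same idea for the MLP's old linear_1/linear_3.
linear_1 = nn.Linear(embed_dim, hidden_dim, bias=False)  # old gate projection
linear_3 = nn.Linear(embed_dim, hidden_dim, bias=False)  # old up projection
gate_up_proj = nn.Linear(embed_dim, hidden_dim * 2, bias=False)
with torch.no_grad():
    gate_up_proj.weight.copy_(torch.cat([linear_1.weight, linear_3.weight], dim=0))
x1, x2 = torch.split(gate_up_proj(x), hidden_dim, dim=-1)
assert torch.allclose(F.silu(x1) * x2, F.silu(linear_1(x)) * linear_3(x), atol=1e-6)

# ALiBi bias broadcasting: a [1, n_head, T, T] bias added to [B, n_head, T, T]
# scores broadcasts over the batch dimension, so repeat(B, 1, 1, 1) is not needed.
scores = torch.randn(B, n_head, T, T)
biases = torch.randn(1, n_head, T, T)
assert torch.equal(scores + biases, scores + biases.repeat(B, 1, 1, 1))
```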
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1092c5efe56fe5b04360ba0d4ac231e8b03f9d1d0b8633b8ed678f73bdcb021a
-size 3171776281
+oid sha256:6bf4dc20907069119671fdaf9f7b79d0260cd36ab94626f4af4fdd5a157d0205
+size 3171755929
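With the checkpoint now stored in `bfloat16`, loading it in that dtype avoids an implicit cast. A minimal loading sketch, assuming a placeholder repository id and the standard `transformers` API; `trust_remote_code=True` is needed because the model ships the custom `modeling_gpt_refact.py` above:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholder repository id -- substitute the actual model repo.
repo_id = "<namespace>/<gpt-refact-model>"

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    torch_dtype=torch.bfloat16,  # matches the updated "torch_dtype" in config.json
    trust_remote_code=True,      # repo provides a custom GPTRefactForCausalLM
)
```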