thesephist committed on
Commit aa7c671
Parent: 73a7349

Upload BottleneckT5LMWithPerturb

bottleneck_t5.py ADDED
@@ -0,0 +1,426 @@
+ import copy
+ import warnings
+ import torch
+ import torch.nn as nn
+ from torch.nn import CrossEntropyLoss
+ from torch.nn import functional as F
+ from typing import Optional, Tuple, Union
+
+ from transformers import T5Config, T5Tokenizer, T5ForConditionalGeneration
+ from transformers.models.t5.modeling_t5 import (
+     T5LayerNorm,
+     T5LayerFF,
+     T5Attention,
+     T5LayerSelfAttention,
+     T5LayerCrossAttention,
+     T5Block,
+     T5Stack,
+ )
+ from transformers.modeling_outputs import BaseModelOutput, Seq2SeqLMOutput
+ from transformers.pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer  # used by prune_heads
+
+ # Deprecation notice, adapted from transformers.models.t5.modeling_t5.
+ _HEAD_MASK_WARNING_MSG = """
+ The input argument `head_mask` was split into two arguments `head_mask` and `decoder_head_mask`. Currently,
+ `decoder_head_mask` is set to copy `head_mask`, but this feature is deprecated and will be removed in future versions.
+ If you do not want to use any `decoder_head_mask` now, please set `decoder_head_mask = torch.ones(num_layers,
+ num_heads)`.
+ """
+
+ class BottleneckCrossAttentionGate(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.gate = nn.Linear(2 * config.d_model, config.d_model, bias=False)
+         self.act = nn.Sigmoid()
+
+     def forward(self, query_states, latents):
+         # Broadcast the (batch, d_model) latent across the query sequence and
+         # compute a per-position, per-channel gate in (0, 2) from the
+         # concatenated [query; latent] features.
+         latents = latents.unsqueeze(1).expand(query_states.shape)
+         query_latents = torch.cat([query_states, latents], dim=-1)
+         return 2 * self.act(self.gate(query_latents))
+
+ class BottleneckT5Attention(T5Attention):
+     def __init__(self, config: T5Config, has_relative_attention_bias=False):
+         super(T5Attention, self).__init__()
+         self.is_decoder = config.is_decoder
+         self.has_relative_attention_bias = has_relative_attention_bias
+         self.relative_attention_num_buckets = config.relative_attention_num_buckets
+         self.relative_attention_max_distance = config.relative_attention_max_distance
+         self.d_model = config.d_model
+         self.key_value_proj_dim = config.d_kv
+         self.n_heads = config.num_heads
+         self.dropout = config.dropout_rate
+         self.inner_dim = self.n_heads * self.key_value_proj_dim
+
+         # Mesh TensorFlow initialization to avoid scaling before softmax.
+         # Unlike T5Attention, there are no q/k projections here: with the
+         # encoder states replaced by a broadcast latent, attention weights are
+         # determined entirely by the (relative) position bias and the mask.
+         self.v = nn.Linear(self.d_model, self.inner_dim, bias=False)
+         self.o = nn.Linear(self.inner_dim, self.d_model, bias=False)
+
+         if self.has_relative_attention_bias:
+             self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
+         self.pruned_heads = set()
+         self.gradient_checkpointing = False
+
+     def prune_heads(self, heads):
+         if len(heads) == 0:
+             return
+         heads, index = find_pruneable_heads_and_indices(
+             heads, self.n_heads, self.key_value_proj_dim, self.pruned_heads
+         )
+         # Prune linear layers
+         self.v = prune_linear_layer(self.v, index)
+         self.o = prune_linear_layer(self.o, index, dim=1)
+         # Update hyper params
+         self.n_heads = self.n_heads - len(heads)
+         self.inner_dim = self.key_value_proj_dim * self.n_heads
+         self.pruned_heads = self.pruned_heads.union(heads)
+
+     def forward(
+         self,
+         hidden_states,
+         mask=None,
+         key_value_states=None,
+         position_bias=None,
+         past_key_value=None,
+         layer_head_mask=None,
+         query_length=None,
+         use_cache=False,
+         output_attentions=False,
+     ):
+         """
+         Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).
+         """
+         # Input is (batch_size, seq_length, dim)
+         # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length)
+         # past_key_value[0] is (batch_size, n_heads, q_len - 1, dim_per_head)
+         batch_size, seq_length = hidden_states.shape[:2]
+
+         real_seq_length = seq_length
+
+         if past_key_value is not None:
+             assert (
+                 len(past_key_value) == 2
+             ), f"past_key_value should have 2 past states: keys and values. Got {len(past_key_value)} past states"
+             real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length
+
+         key_length = real_seq_length if key_value_states is None else key_value_states.shape[1]
+
+         def shape(states):
+             """projection"""
+             return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)
+
+         def unshape(states):
+             """reshape"""
+             return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim)
+
+         def project(hidden_states, proj_layer, key_value_states, past_key_value):
+             """projects hidden states correctly to key/query states"""
+             if key_value_states is None:
+                 # self-attn
+                 # (batch_size, n_heads, seq_length, dim_per_head)
+                 hidden_states = shape(proj_layer(hidden_states))
+             elif past_key_value is None:
+                 # cross-attn
+                 # (batch_size, n_heads, seq_length, dim_per_head)
+                 hidden_states = shape(proj_layer(key_value_states))
+
+             if past_key_value is not None:
+                 if key_value_states is None:
+                     # self-attn
+                     # (batch_size, n_heads, key_length, dim_per_head)
+                     hidden_states = torch.cat([past_key_value, hidden_states], dim=2)
+                 else:
+                     # cross-attn
+                     hidden_states = past_key_value
+             return hidden_states
+
+         # key/value states. key_states is a dummy all-zeros tensor, kept only
+         # so the returned present_key_value_state has the usual (keys, values)
+         # structure; there is no key projection to compute.
+         key_states = torch.zeros((batch_size, self.n_heads, seq_length, key_length), device=hidden_states.device)
+         value_states = project(
+             hidden_states, self.v, key_value_states, past_key_value[1] if past_key_value is not None else None
+         )
+
+         # compute scores: a constant tensor of ones, so after position_bias is
+         # added below, the softmax distribution depends only on the position
+         # bias and the attention mask (a constant offset cancels in softmax)
+         scores = torch.ones((batch_size, self.n_heads, seq_length, key_length), device=hidden_states.device)
+
+         if position_bias is None:
+             if not self.has_relative_attention_bias:
+                 position_bias = torch.zeros(
+                     (1, self.n_heads, real_seq_length, key_length), device=scores.device, dtype=scores.dtype
+                 )
+                 if self.gradient_checkpointing and self.training:
+                     position_bias.requires_grad = True
+             else:
+                 position_bias = self.compute_bias(real_seq_length, key_length, device=scores.device)
+
+             # if key and values are already calculated
+             # we want only the last query position bias
+             if past_key_value is not None:
+                 position_bias = position_bias[:, :, -hidden_states.size(1):, :]
+
+             if mask is not None:
+                 position_bias = position_bias + mask  # (batch_size, n_heads, seq_length, key_length)
+
+         if self.pruned_heads:
+             mask = torch.ones(position_bias.shape[1])
+             mask[list(self.pruned_heads)] = 0
+             position_bias_masked = position_bias[:, mask.bool()]
+         else:
+             position_bias_masked = position_bias
+
+         scores += position_bias_masked
+         attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(
+             scores
+         )  # (batch_size, n_heads, seq_length, key_length)
+         attn_weights = nn.functional.dropout(
+             attn_weights, p=self.dropout, training=self.training
+         )  # (batch_size, n_heads, seq_length, key_length)
+
+         # Mask heads if we want to
+         if layer_head_mask is not None:
+             attn_weights = attn_weights * layer_head_mask
+
+         attn_output = unshape(torch.matmul(attn_weights, value_states))  # (batch_size, seq_length, dim)
+         attn_output = self.o(attn_output)
+
+         present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None
+         outputs = (attn_output,) + (present_key_value_state,) + (position_bias,)
+
+         if output_attentions:
+             outputs = outputs + (attn_weights,)
+         return outputs
+
+ class BottleneckT5LayerCrossAttention(T5LayerCrossAttention):
+     def __init__(self, config):
+         super(T5LayerCrossAttention, self).__init__()
+         self.EncDecAttention = BottleneckT5Attention(config, has_relative_attention_bias=False)
+         self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
+         self.gate = BottleneckCrossAttentionGate(config)
+         self.dropout = nn.Dropout(config.dropout_rate)
+
+     def forward(
+         self,
+         hidden_states,
+         key_value_states,
+         attention_mask=None,
+         position_bias=None,
+         layer_head_mask=None,
+         past_key_value=None,
+         use_cache=False,
+         query_length=None,
+         output_attentions=False,
+     ):
+         normed_hidden_states = self.layer_norm(hidden_states)
+         attention_output = self.EncDecAttention(
+             normed_hidden_states,
+             mask=attention_mask,
+             key_value_states=key_value_states,
+             position_bias=position_bias,
+             layer_head_mask=layer_head_mask,
+             past_key_value=past_key_value,
+             use_cache=use_cache,
+             query_length=query_length,
+             output_attentions=output_attentions,
+         )
+         # key_value_states is the latent broadcast to every position, so
+         # position 0 is the latent itself; the gate decides how much of the
+         # attended latent flows into each decoder position.
+         latents = key_value_states[:, 0]
+         layer_output = hidden_states + self.dropout(self.gate(normed_hidden_states, latents) * attention_output[0])
+         outputs = (layer_output,) + attention_output[1:]  # add attentions if we output them
+         return outputs
+
+ class BottleneckT5Block(T5Block):
+     def __init__(self, config, has_relative_attention_bias=False):
+         super(T5Block, self).__init__()
+         self.is_decoder = config.is_decoder
+         self.layer = nn.ModuleList()
+         self.layer.append(T5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias))
+         if self.is_decoder:
+             self.layer.append(BottleneckT5LayerCrossAttention(config))
+
+         self.layer.append(T5LayerFF(config))
+
+ class BottleneckT5Stack(T5Stack):
+     def __init__(self, config, embed_tokens=None):
+         super(T5Stack, self).__init__(config)
+
+         self.embed_tokens = embed_tokens
+         self.is_decoder = config.is_decoder
+
+         self.block = nn.ModuleList(
+             [BottleneckT5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)]
+         )
+         self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
+         self.dropout = nn.Dropout(config.dropout_rate)
+
+         # Initialize weights and apply final processing
+         self.post_init()
+         # Model parallel
+         self.model_parallel = False
+         self.device_map = None
+         self.gradient_checkpointing = False
+
+ class BottleneckT5LMWithPerturb(T5ForConditionalGeneration):
+     def __init__(self, config: T5Config):
+         super(T5ForConditionalGeneration, self).__init__(config)
+         self.model_dim = config.d_model
+
+         self.shared = nn.Embedding(config.vocab_size, config.d_model)
+         encoder_config = copy.deepcopy(config)
+         encoder_config.is_decoder = False
+         encoder_config.use_cache = False
+         encoder_config.is_encoder_decoder = False
+         self.encoder = T5Stack(encoder_config, self.shared)
+
+         # New in Contra: MHA bottleneck block
+         self.num_heads = config.num_heads
+         self.bottleneck = nn.MultiheadAttention(
+             config.d_model,
+             num_heads=config.num_heads,
+             dropout=config.dropout_rate,
+             bias=False,
+             batch_first=True,
+         )
+         self.bottleneck_scale = nn.Parameter(torch.ones(1))
+
+         self.dec_emb = nn.Embedding(config.vocab_size, config.d_model)
+         decoder_config = copy.deepcopy(config)
+         decoder_config.is_decoder = True
+         decoder_config.is_encoder_decoder = False
+         decoder_config.num_layers = config.num_decoder_layers
+         self.decoder = BottleneckT5Stack(decoder_config, self.dec_emb)
+
+         self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
+
+         # Initialize weights and apply final processing
+         self.post_init()
+
+         # Model parallel
+         self.model_parallel = False
+         self.device_map = None
+
+     def forward(
+         self,
+         input_ids: Optional[torch.LongTensor] = None,
+         attention_mask: Optional[torch.FloatTensor] = None,
+         decoder_input_ids: Optional[torch.LongTensor] = None,
+         decoder_attention_mask: Optional[torch.BoolTensor] = None,
+         head_mask: Optional[torch.FloatTensor] = None,
+         decoder_head_mask: Optional[torch.FloatTensor] = None,
+         cross_attn_head_mask: Optional[torch.Tensor] = None,
+         encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None,
+         past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+         labels: Optional[torch.LongTensor] = None,
+         use_cache: Optional[bool] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+         perturb_vector: Optional[torch.FloatTensor] = None,
+         encode_only: Optional[bool] = None,
+     ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]:
+         # Note: the perturb_vector argument is accepted in the signature but is
+         # not read in this body; perturbation is applied via the perturb_vector
+         # *attribute*, checked with hasattr() further down.
+         use_cache = use_cache if use_cache is not None else self.config.use_cache
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
+         if head_mask is not None and decoder_head_mask is None:
+             if self.config.num_layers == self.config.num_decoder_layers:
+                 warnings.warn(_HEAD_MASK_WARNING_MSG, FutureWarning)
+                 decoder_head_mask = head_mask
+
+         # Encode if needed (training, first prediction pass)
+         if encoder_outputs is None:
+             # Convert encoder inputs into embeddings if needed
+             encoder_outputs = self.encoder(
+                 input_ids=input_ids,
+                 attention_mask=attention_mask,
+                 inputs_embeds=inputs_embeds,
+                 head_mask=head_mask,
+                 output_attentions=output_attentions,
+                 output_hidden_states=output_hidden_states,
+                 return_dict=return_dict,
+             )
+         elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+             encoder_outputs = BaseModelOutput(
+                 last_hidden_state=encoder_outputs[0],
+                 hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+                 attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+             )
+
+         hidden_states = encoder_outputs[0]
+
+         # MHA across token embeddings + embedding normalization + broadcast
+         # (during contrastive search, the attention mask can have a larger
+         # batch size than hidden_states, so tile hidden_states to match)
+         hidden_states = hidden_states.repeat(
+             attention_mask.shape[0] // hidden_states.shape[0], 1, 1
+         )
+         mask_expanded = attention_mask.float().unsqueeze(-1).expand(hidden_states.shape)
+         mean_pooled_embedding = torch.sum(hidden_states * mask_expanded, 1) / torch.clamp(mask_expanded.sum(1), min=1e-9)
+         unscaled_latent, attn_weights = self.bottleneck(
+             mean_pooled_embedding.unsqueeze(1), hidden_states, hidden_states,
+             need_weights=False,
+             # torch MHA attn_mask has opposite signs to HF T5 masks... sigh
+             attn_mask=attention_mask.float().unsqueeze(1).repeat_interleave(self.num_heads, dim=0),
+         )
+         latent = self.bottleneck_scale * F.normalize(unscaled_latent, p=2, dim=2)
+         if encode_only:
+             return latent.squeeze(1)
+         hidden_states = latent.expand(hidden_states.shape)
+
+         # Perturbation hook: if a perturb_vector attribute has been set on the
+         # model, add it to the broadcast latent and re-normalize.
+         if hasattr(self, 'perturb_vector'):
+             hidden_states = self.bottleneck_scale * F.normalize(hidden_states + self.perturb_vector, p=2, dim=2)
+
+         if self.model_parallel:
+             torch.cuda.set_device(self.decoder.first_device)
+
+         if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
+             # get decoder inputs from shifting lm labels to the right
+             decoder_input_ids = self._shift_right(labels)
+
+         # Set device for model parallelism
+         if self.model_parallel:
+             torch.cuda.set_device(self.decoder.first_device)
+             hidden_states = hidden_states.to(self.decoder.first_device)
+             if decoder_input_ids is not None:
+                 decoder_input_ids = decoder_input_ids.to(self.decoder.first_device)
+             if attention_mask is not None:
+                 attention_mask = attention_mask.to(self.decoder.first_device)
+             if decoder_attention_mask is not None:
+                 decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device)
+
+         # Decode
+         decoder_outputs = self.decoder(
+             input_ids=decoder_input_ids,
+             attention_mask=decoder_attention_mask,
+             inputs_embeds=decoder_inputs_embeds,
+             past_key_values=past_key_values,
+             encoder_hidden_states=hidden_states,
+             encoder_attention_mask=attention_mask,
+             head_mask=decoder_head_mask,
+             cross_attn_head_mask=cross_attn_head_mask,
+             use_cache=use_cache,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+
+         sequence_output = decoder_outputs[0]
+
+         # Set device for model parallelism
+         if self.model_parallel:
+             torch.cuda.set_device(self.encoder.first_device)
+             self.lm_head = self.lm_head.to(self.encoder.first_device)
+             sequence_output = sequence_output.to(self.lm_head.weight.device)
+
+         if self.config.tie_word_embeddings:
+             # Rescale output before projecting on vocab
+             # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586
+             sequence_output = sequence_output * (self.model_dim**-0.5)
+
+         lm_logits = self.lm_head(sequence_output)
+
+         loss = None
+         if labels is not None:
+             loss_fct = CrossEntropyLoss(ignore_index=-100)
+             loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1))
+             # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666
+
+         if not return_dict:
+             output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs
+             return ((loss,) + output) if loss is not None else output
+
+         return Seq2SeqLMOutput(
+             loss=loss,
+             logits=lm_logits,
+             past_key_values=decoder_outputs.past_key_values,
+             decoder_hidden_states=decoder_outputs.hidden_states,
+             decoder_attentions=decoder_outputs.attentions,
+             cross_attentions=decoder_outputs.cross_attentions,
+             encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+             encoder_hidden_states=encoder_outputs.hidden_states,
+             encoder_attentions=encoder_outputs.attentions,
+         )
+
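For orientation, a minimal usage sketch (not part of the commit). The repository id is hypothetical, and it assumes tokenizer files are published alongside these weights; `trust_remote_code=True` is what lets the `auto_map` entry in config.json below route `AutoModelForCausalLM` to `BottleneckT5LMWithPerturb`.

    import torch
    from transformers import AutoModelForCausalLM, T5Tokenizer

    repo = "thesephist/bottleneck-t5"  # hypothetical repo id; substitute the real one

    tokenizer = T5Tokenizer.from_pretrained(repo)
    model = AutoModelForCausalLM.from_pretrained(repo, trust_remote_code=True).eval()

    inputs = tokenizer("The quick brown fox jumps over the lazy dog.", return_tensors="pt")

    # encode_only=True short-circuits forward() and returns the sentence latent:
    # shape (batch, d_model), L2-normalized and scaled by the learned bottleneck_scale.
    with torch.no_grad():
        latent = model(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            encode_only=True,
        )

    # Perturbation goes through an attribute, not the forward argument: forward()
    # checks hasattr(self, "perturb_vector"), adds it to the broadcast latent, and
    # re-normalizes before decoding. A 1-D (d_model,) vector broadcasts cleanly.
    model.perturb_vector = 0.1 * torch.randn(model.config.d_model)
    out = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=64,
    )
    print(tokenizer.decode(out[0], skip_special_tokens=True))
    del model.perturb_vector  # remove the hook to decode unperturbed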
config.json ADDED
@@ -0,0 +1,35 @@
+ {
+   "_name_or_path": "./bottleneck-t5",
+   "architectures": [
+     "BottleneckT5LMWithPerturb"
+   ],
+   "auto_map": {
+     "AutoModelForCausalLM": "bottleneck_t5.BottleneckT5LMWithPerturb"
+   },
+   "classifier_dropout": 0.0,
+   "d_ff": 5120,
+   "d_kv": 64,
+   "d_model": 2048,
+   "decoder_start_token_id": 0,
+   "dense_act_fn": "gelu_new",
+   "dropout_rate": 0.1,
+   "eos_token_id": 1,
+   "feed_forward_proj": "gated-gelu",
+   "initializer_factor": 1.0,
+   "is_encoder_decoder": true,
+   "is_gated_act": true,
+   "layer_norm_epsilon": 1e-06,
+   "model_type": "t5",
+   "num_decoder_layers": 24,
+   "num_heads": 32,
+   "num_layers": 24,
+   "output_past": true,
+   "pad_token_id": 0,
+   "relative_attention_max_distance": 128,
+   "relative_attention_num_buckets": 32,
+   "tie_word_embeddings": false,
+   "torch_dtype": "float32",
+   "transformers_version": "4.33.3",
+   "use_cache": true,
+   "vocab_size": 32128
+ }
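A quick sanity-check sketch of the geometry this config implies (illustrative, not part of the commit): the bottleneck latent lives in d_model dimensions, and the per-head attention dimensions multiply back to the model width.

    from transformers import T5Config

    config = T5Config.from_json_file("config.json")
    assert config.d_model == 2048                          # the sentence-embedding dimension
    assert config.num_heads * config.d_kv == config.d_model  # 32 * 64 == 2048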
generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "_from_model_config": true,
+   "decoder_start_token_id": 0,
+   "eos_token_id": 1,
+   "pad_token_id": 0,
+   "transformers_version": "4.33.3"
+ }
pytorch_model-00001-of-00002.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4e72248cb47fb6d3beb3be52ea489713be3747619ff212b07097862d9dc49b07
+ size 9989765967
pytorch_model-00002-of-00002.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fe9fd3ea1ec5ef435768e3c49fba43b4f09d35fad310ba5c87eb2fbc3bc00114
+ size 1739761475
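The two LFS pointers above stand in for the actual shard binaries; the index file added below maps each parameter name to its shard. A minimal lookup sketch (illustrative, not part of the commit), run from the checkout directory:

    import json

    with open("pytorch_model.bin.index.json") as f:
        index = json.load(f)

    assert index["metadata"]["total_size"] == 11729330180  # bytes across both shards
    print(index["weight_map"]["bottleneck.in_proj_weight"])
    # -> pytorch_model-00001-of-00002.bin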
pytorch_model.bin.index.json ADDED
@@ -0,0 +1,547 @@
+ {
+   "metadata": {
+     "total_size": 11729330180
+   },
+   "weight_map": {
+     "bottleneck.in_proj_weight": "pytorch_model-00001-of-00002.bin",
+     "bottleneck.out_proj.weight": "pytorch_model-00001-of-00002.bin",
+     "bottleneck_scale": "pytorch_model-00001-of-00002.bin",
+     "dec_emb.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.0.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.0.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.0.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.0.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.0.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.0.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.0.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.0.layer.1.gate.gate.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.0.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.0.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.0.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.0.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.0.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.1.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.1.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.1.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.1.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.1.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.1.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.1.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.1.layer.1.gate.gate.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.1.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.1.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.1.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.1.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.1.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.10.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.10.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.10.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.10.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.10.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.10.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.10.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.10.layer.1.gate.gate.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.10.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.10.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.10.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.10.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.10.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.11.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.11.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.11.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.11.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.11.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.11.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.11.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.11.layer.1.gate.gate.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.11.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.11.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.11.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.11.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.11.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.12.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.12.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.12.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.12.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.12.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.12.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.12.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.12.layer.1.gate.gate.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.12.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.12.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.12.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.12.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.12.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.13.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.13.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.13.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.13.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.13.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.13.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.13.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.13.layer.1.gate.gate.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.13.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.13.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.13.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.13.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.13.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.14.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.14.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.14.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.14.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.14.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.14.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.14.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.14.layer.1.gate.gate.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.14.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.14.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.14.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.14.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.14.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.15.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.15.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.15.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.15.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.15.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.15.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.15.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.15.layer.1.gate.gate.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.15.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.15.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.15.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.15.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.15.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.16.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.16.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.16.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.16.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.16.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.16.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.16.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.16.layer.1.gate.gate.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.16.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.16.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.16.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.16.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.16.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.17.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.17.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.17.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.17.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.17.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.17.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.17.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.17.layer.1.gate.gate.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.17.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.17.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.17.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.17.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.17.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.18.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.18.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.18.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.18.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.18.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.18.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.18.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.18.layer.1.gate.gate.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.18.layer.1.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.18.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.18.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.18.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.18.layer.2.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.19.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.19.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.19.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.19.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.19.layer.0.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.19.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.19.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.19.layer.1.gate.gate.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.19.layer.1.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.19.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.19.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.19.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.19.layer.2.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.2.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.2.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.2.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.2.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.2.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.2.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.2.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.2.layer.1.gate.gate.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.2.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.2.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.2.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.2.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.2.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.20.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.20.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.20.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.20.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.20.layer.0.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.20.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.20.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.20.layer.1.gate.gate.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.20.layer.1.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.20.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.20.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.20.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.20.layer.2.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.21.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.21.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.21.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.21.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.21.layer.0.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.21.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.21.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.21.layer.1.gate.gate.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.21.layer.1.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.21.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.21.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.21.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.21.layer.2.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.22.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.22.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.22.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.22.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.22.layer.0.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.22.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.22.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.22.layer.1.gate.gate.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.22.layer.1.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.22.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.22.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.22.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.22.layer.2.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.23.layer.0.SelfAttention.k.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.23.layer.0.SelfAttention.o.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.23.layer.0.SelfAttention.q.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.23.layer.0.SelfAttention.v.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.23.layer.0.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.23.layer.1.EncDecAttention.o.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.23.layer.1.EncDecAttention.v.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.23.layer.1.gate.gate.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.23.layer.1.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.23.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.23.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.23.layer.2.DenseReluDense.wo.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.23.layer.2.layer_norm.weight": "pytorch_model-00002-of-00002.bin",
+     "decoder.block.3.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.3.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.3.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.3.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.3.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.3.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.3.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.3.layer.1.gate.gate.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.3.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.3.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.3.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.3.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.3.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.4.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.4.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.4.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.4.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.4.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.4.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.4.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.4.layer.1.gate.gate.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.4.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.4.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.4.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.4.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.4.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.5.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.5.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.5.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.5.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.5.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.5.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.5.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.5.layer.1.gate.gate.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.5.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.5.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.5.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.5.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.5.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.6.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.6.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.6.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.6.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.6.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.6.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.6.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.6.layer.1.gate.gate.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.6.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.6.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.6.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.6.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.6.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.7.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.7.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.7.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.7.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.7.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.7.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.7.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.7.layer.1.gate.gate.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.7.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.7.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.7.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.7.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.7.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.8.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.8.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.8.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.8.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.8.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.8.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.8.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.8.layer.1.gate.gate.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.8.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.8.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.8.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.8.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.8.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.9.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.9.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.9.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.9.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.9.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.9.layer.1.EncDecAttention.o.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.9.layer.1.EncDecAttention.v.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.9.layer.1.gate.gate.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.9.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.9.layer.2.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.9.layer.2.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.9.layer.2.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.block.9.layer.2.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.embed_tokens.weight": "pytorch_model-00001-of-00002.bin",
+     "decoder.final_layer_norm.weight": "pytorch_model-00002-of-00002.bin",
+     "encoder.block.0.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.0.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.0.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.0.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.0.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.0.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.0.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.0.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.0.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.1.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.1.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.1.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.1.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.1.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.1.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.1.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.1.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.1.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.10.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.10.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.10.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.10.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.10.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.10.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.10.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.10.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.10.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.11.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.11.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.11.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.11.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.11.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.11.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.11.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.11.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.11.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.12.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.12.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.12.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.12.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.12.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.12.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.12.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.12.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.12.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.13.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.13.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.13.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.13.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.13.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.13.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.13.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.13.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.13.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.14.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.14.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.14.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.14.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.14.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.14.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.14.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.14.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.14.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.15.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.15.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.15.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.15.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.15.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.15.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.15.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.15.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.15.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.16.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.16.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.16.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.16.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.16.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.16.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.16.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.16.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.16.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.17.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.17.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.17.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.17.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.17.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.17.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.17.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.17.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.17.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.18.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.18.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.18.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.18.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.18.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.18.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.18.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.18.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.18.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.19.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.19.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.19.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
+     "encoder.block.19.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
429
+ "encoder.block.19.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
430
+ "encoder.block.19.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
431
+ "encoder.block.19.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
432
+ "encoder.block.19.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
433
+ "encoder.block.19.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
434
+ "encoder.block.2.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
435
+ "encoder.block.2.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
436
+ "encoder.block.2.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
437
+ "encoder.block.2.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
438
+ "encoder.block.2.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
439
+ "encoder.block.2.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
440
+ "encoder.block.2.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
441
+ "encoder.block.2.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
442
+ "encoder.block.2.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
443
+ "encoder.block.20.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
444
+ "encoder.block.20.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
445
+ "encoder.block.20.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
446
+ "encoder.block.20.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
447
+ "encoder.block.20.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
448
+ "encoder.block.20.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
449
+ "encoder.block.20.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
450
+ "encoder.block.20.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
451
+ "encoder.block.20.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
452
+ "encoder.block.21.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
453
+ "encoder.block.21.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
454
+ "encoder.block.21.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
455
+ "encoder.block.21.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
456
+ "encoder.block.21.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
457
+ "encoder.block.21.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
458
+ "encoder.block.21.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
459
+ "encoder.block.21.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
460
+ "encoder.block.21.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
461
+ "encoder.block.22.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
462
+ "encoder.block.22.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
463
+ "encoder.block.22.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
464
+ "encoder.block.22.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
465
+ "encoder.block.22.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
466
+ "encoder.block.22.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
467
+ "encoder.block.22.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
468
+ "encoder.block.22.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
469
+ "encoder.block.22.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
470
+ "encoder.block.23.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
471
+ "encoder.block.23.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
472
+ "encoder.block.23.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
473
+ "encoder.block.23.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
474
+ "encoder.block.23.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
475
+ "encoder.block.23.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
476
+ "encoder.block.23.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
477
+ "encoder.block.23.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
478
+ "encoder.block.23.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
479
+ "encoder.block.3.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
480
+ "encoder.block.3.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
481
+ "encoder.block.3.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
482
+ "encoder.block.3.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
483
+ "encoder.block.3.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
484
+ "encoder.block.3.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
485
+ "encoder.block.3.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
486
+ "encoder.block.3.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
487
+ "encoder.block.3.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
488
+ "encoder.block.4.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
489
+ "encoder.block.4.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
490
+ "encoder.block.4.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
491
+ "encoder.block.4.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
492
+ "encoder.block.4.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
493
+ "encoder.block.4.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
494
+ "encoder.block.4.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
495
+ "encoder.block.4.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
496
+ "encoder.block.4.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
497
+ "encoder.block.5.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
498
+ "encoder.block.5.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
499
+ "encoder.block.5.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
500
+ "encoder.block.5.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
501
+ "encoder.block.5.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
502
+ "encoder.block.5.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
503
+ "encoder.block.5.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
504
+ "encoder.block.5.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
505
+ "encoder.block.5.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
506
+ "encoder.block.6.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
507
+ "encoder.block.6.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
508
+ "encoder.block.6.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
509
+ "encoder.block.6.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
510
+ "encoder.block.6.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
511
+ "encoder.block.6.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
512
+ "encoder.block.6.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
513
+ "encoder.block.6.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
514
+ "encoder.block.6.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
515
+ "encoder.block.7.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
516
+ "encoder.block.7.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
517
+ "encoder.block.7.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
518
+ "encoder.block.7.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
519
+ "encoder.block.7.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
520
+ "encoder.block.7.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
521
+ "encoder.block.7.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
522
+ "encoder.block.7.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
523
+ "encoder.block.7.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
524
+ "encoder.block.8.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
525
+ "encoder.block.8.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
526
+ "encoder.block.8.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
527
+ "encoder.block.8.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
528
+ "encoder.block.8.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
529
+ "encoder.block.8.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
530
+ "encoder.block.8.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
531
+ "encoder.block.8.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
532
+ "encoder.block.8.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
533
+ "encoder.block.9.layer.0.SelfAttention.k.weight": "pytorch_model-00001-of-00002.bin",
534
+ "encoder.block.9.layer.0.SelfAttention.o.weight": "pytorch_model-00001-of-00002.bin",
535
+ "encoder.block.9.layer.0.SelfAttention.q.weight": "pytorch_model-00001-of-00002.bin",
536
+ "encoder.block.9.layer.0.SelfAttention.v.weight": "pytorch_model-00001-of-00002.bin",
537
+ "encoder.block.9.layer.0.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
538
+ "encoder.block.9.layer.1.DenseReluDense.wi_0.weight": "pytorch_model-00001-of-00002.bin",
539
+ "encoder.block.9.layer.1.DenseReluDense.wi_1.weight": "pytorch_model-00001-of-00002.bin",
540
+ "encoder.block.9.layer.1.DenseReluDense.wo.weight": "pytorch_model-00001-of-00002.bin",
541
+ "encoder.block.9.layer.1.layer_norm.weight": "pytorch_model-00001-of-00002.bin",
542
+ "encoder.embed_tokens.weight": "pytorch_model-00001-of-00002.bin",
543
+ "encoder.final_layer_norm.weight": "pytorch_model-00001-of-00002.bin",
544
+ "lm_head.weight": "pytorch_model-00002-of-00002.bin",
545
+ "shared.weight": "pytorch_model-00001-of-00002.bin"
546
+ }
547
+ }
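
The weight_map above assigns every parameter tensor of the uploaded model to one of the two checkpoint shards; every tensor lives in pytorch_model-00001-of-00002.bin except lm_head.weight, which sits alone in the second shard. As a minimal sketch of how a loader consumes this index (assuming the index and both shard files have been downloaded to the current directory; each .bin shard is an ordinary PyTorch state-dict fragment):

    import json
    import torch

    # Read the shard index written alongside the checkpoint.
    with open("pytorch_model.bin.index.json") as f:
        index = json.load(f)

    # weight_map maps each parameter name to the shard file holding it;
    # loading every distinct shard and merging reconstructs the full
    # state dict that the model can then load with load_state_dict.
    state_dict = {}
    for shard in sorted(set(index["weight_map"].values())):
        state_dict.update(torch.load(shard, map_location="cpu"))

In practice Transformers' from_pretrained performs this shard resolution automatically when it finds pytorch_model.bin.index.json in the repository, so the sketch is only illustrative.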