liang.zhao committed on
Commit
8c1c087
1 Parent(s): fccd7e8

update model and config

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ pytorch_model-00001-of-00002.bin filter=lfs diff=lfs merge=lfs -text
37
+ pytorch_model-00002-of-00002.bin filter=lfs diff=lfs merge=lfs -text
config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "SkyworkForCausalLM"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_skywork.SkyworkConfig",
7
+ "AutoModelForCausalLM": "modeling_skywork.SkyworkForCausalLM"
8
+ },
9
+ "quantization_config": {
10
+ "bnb_4bit_compute_dtype": "float32",
11
+ "bnb_4bit_quant_type": "fp4",
12
+ "bnb_4bit_use_double_quant": false,
13
+ "llm_int8_enable_fp32_cpu_offload": false,
14
+ "llm_int8_has_fp16_weight": false,
15
+ "llm_int8_skip_modules": null,
16
+ "llm_int8_threshold": 6.0,
17
+ "load_in_4bit": false,
18
+ "load_in_8bit": true,
19
+ "quant_method": "bitsandbytes"
20
+ },
21
+ "bos_token_id": 1,
22
+ "eos_token_id": 2,
23
+ "pad_token_id": 0,
24
+ "hidden_act": "silu",
25
+ "hidden_size": 4608,
26
+ "initializer_range": 0.01,
27
+ "intermediate_size": 12288,
28
+ "max_position_embeddings": 4096,
29
+ "model_type": "skywork",
30
+ "num_attention_heads": 36,
31
+ "num_hidden_layers": 52,
32
+ "num_key_value_heads": 36,
33
+ "rms_norm_eps": 1e-06,
34
+ "tie_word_embeddings": false,
35
+ "torch_dtype": "bfloat16",
36
+ "transformers_version": "4.33.1",
37
+ "use_cache": true,
38
+ "vocab_size": 65519
39
+ }
configuration_skywork.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) SkyworkAI and the HuggingFace Inc. team. All rights reserved.
2
+ # This code is built upon Huggingface's transformers repository.
3
+
4
+ from transformers.configuration_utils import PretrainedConfig
5
+ from transformers.utils import logging
6
+
7
+
8
+ logger = logging.get_logger(__name__)
9
+
10
+ Skywork_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
11
+
12
+
13
class SkyworkConfig(PretrainedConfig):
    """Configuration holding the architecture hyper-parameters of a Skywork model.

    The constructor stores every architecture argument on the instance and
    forwards the special-token ids, ``tie_word_embeddings`` and any remaining
    keyword arguments to ``PretrainedConfig.__init__``.

    Args:
        vocab_size: Size of the token vocabulary.
        hidden_size: Width of the hidden states.
        intermediate_size: Width of the MLP's inner projection.
        num_hidden_layers: Number of decoder layers.
        num_attention_heads: Number of query heads per attention layer.
        num_key_value_heads: Number of key/value heads. ``None`` falls back to
            ``num_attention_heads``.
        hidden_act: Name of the MLP activation function.
        max_position_embeddings: Sequence length the rotary cache is built for.
        initializer_range: Stored as-is (presumably the std used for weight
            initialisation; confirm against the model's init code).
        rms_norm_eps: Epsilon used by the RMS normalisation layers.
        use_cache: Whether key/value states should be returned for reuse.
        pad_token_id: Id of the padding token.
        bos_token_id: Id of the beginning-of-sequence token.
        eos_token_id: Id of the end-of-sequence token.
        pretraining_tp: When greater than 1, projections are computed slice by
            slice over this many weight shards.
        tie_word_embeddings: Forwarded to ``PretrainedConfig``.
        rope_scaling: ``None`` or a dict with the keys ``"type"``
            (``"linear"`` or ``"dynamic"``) and ``"factor"``.
        rope_theta: Base of the rotary position embedding frequencies.
        attention_bias: Whether the attention projections carry a bias term.
        use_flash_attention: When truthy, the constructor verifies that
            ``flash_attn`` and ``einops`` can be imported.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.

    Raises:
        ValueError: If ``use_flash_attention`` is set but ``flash_attn`` or
            ``einops`` cannot be imported.
    """

    model_type = "skywork"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=32000,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_scaling=None,
        rope_theta=10000.0,
        attention_bias=False,
        use_flash_attention=False,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_scaling = rope_scaling
        self.rope_theta = rope_theta
        self.attention_bias = attention_bias
        self.use_flash_attention = use_flash_attention
        if self.use_flash_attention:
            # The imports are availability probes only; the names are unused here.
            try:
                from flash_attn.flash_attn_interface import flash_attn_varlen_func  # noqa: F401
                from einops import rearrange  # noqa: F401
            except Exception as err:
                # Keep raising ValueError for callers, but no longer swallow
                # KeyboardInterrupt/SystemExit, and keep the root cause chained.
                raise ValueError(
                    "`use_flash_attention` requires Flash Attention 2+ and einops.\n"
                    "Try `pip install einops` and installing Flash Attention from "
                    "https://github.com/Dao-AILab/flash-attention"
                ) from err

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
generation_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "do_sample": true,
4
+ "eos_token_id": 2,
5
+ "max_length": 4096,
6
+ "pad_token_id": 0,
7
+ "temperature": 0.6,
8
+ "top_p": 0.9,
9
+ "transformers_version": "4.34.0"
10
+ }
modeling_skywork.py ADDED
@@ -0,0 +1,1111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) SkyworkAI and the HuggingFace Inc. team. All rights reserved.
2
+ # This code is built upon Huggingface's transformers repository.
3
+ import math
4
+ from typing import List, Optional, Tuple, Union
5
+
6
+ import torch
7
+ import torch.nn.functional as F
8
+ import torch.utils.checkpoint
9
+ from torch import nn
10
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
11
+
12
+ from transformers.activations import ACT2FN
13
+ from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
14
+ from transformers.modeling_utils import PreTrainedModel
15
+ from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
16
+ from transformers.utils import (
17
+ add_start_docstrings,
18
+ add_start_docstrings_to_model_forward,
19
+ is_flash_attn_available,
20
+ logging,
21
+ replace_return_docstrings,
22
+ )
23
+ from .configuration_skywork import SkyworkConfig
24
+
25
+
26
+ if is_flash_attn_available():
27
+ from flash_attn import flash_attn_func, flash_attn_varlen_func
28
+ from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
29
+
30
+
31
+ logger = logging.get_logger(__name__)
32
+
33
+ _CONFIG_FOR_DOC = "SkyworkConfig"
34
+
35
+
36
+ def _get_unpad_data(padding_mask):
37
+ seqlens_in_batch = padding_mask.sum(dim=-1, dtype=torch.int32)
38
+ indices = torch.nonzero(padding_mask.flatten(), as_tuple=False).flatten()
39
+ max_seqlen_in_batch = seqlens_in_batch.max().item()
40
+ cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0))
41
+ return (
42
+ indices,
43
+ cu_seqlens,
44
+ max_seqlen_in_batch,
45
+ )
46
+
47
+
48
+ # Copied from transformers.models.bart.modeling_bart._make_causal_mask
49
+ def _make_causal_mask(
50
+ input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
51
+ ):
52
+ """
53
+ Make causal mask used for bi-directional self-attention.
54
+ """
55
+ bsz, tgt_len = input_ids_shape
56
+ mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
57
+ mask_cond = torch.arange(mask.size(-1), device=device)
58
+ mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
59
+ mask = mask.to(dtype)
60
+
61
+ if past_key_values_length > 0:
62
+ mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
63
+ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
64
+
65
+
66
+ # Copied from transformers.models.bart.modeling_bart._expand_mask
67
+ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
68
+ """
69
+ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
70
+ """
71
+ bsz, src_len = mask.size()
72
+ tgt_len = tgt_len if tgt_len is not None else src_len
73
+
74
+ expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
75
+
76
+ inverted_mask = 1.0 - expanded_mask
77
+
78
+ return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
79
+
80
+
81
class SkyworkRMSNorm(nn.Module):
    """Root-mean-square layer normalisation with a learnable per-channel scale."""

    def __init__(self, hidden_size, eps=1e-6):
        """
        SkyworkRMSNorm is equivalent to T5LayerNorm

        Args:
            hidden_size: size of the last (normalised) dimension.
            eps: constant added to the mean square before taking the reciprocal square root.
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        """Normalise `hidden_states` over its last dimension and apply the learned scale."""
        input_dtype = hidden_states.dtype
        # The statistics are computed in float32 and cast back before scaling.
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)
96
+
97
+
98
+ ALL_LAYERNORM_LAYERS.append(SkyworkRMSNorm)
99
+
100
+
101
class SkyworkRotaryEmbedding(nn.Module):
    """Caches the cosine/sine tables used by rotary position embeddings.

    Args:
        dim: size of the rotary dimension (the tables have `dim` columns).
        max_position_embeddings: number of positions cached at construction time.
        base: base of the geometric progression of inverse frequencies.
        device: device on which the inverse frequencies are created.
    """

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
        super().__init__()

        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        # Build here to make `torch.jit.trace` work.
        self._set_cos_sin_cache(
            seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
        )

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """(Re)build the `cos_cached` / `sin_cached` buffers for `seq_len` positions."""
        self.max_seq_len_cached = seq_len
        t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)

        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)

    def forward(self, x, seq_len=None):
        """Return the `(cos, sin)` tables for the first `seq_len` positions, cast to `x.dtype`.

        Args:
            x: tensor of shape `[bs, num_attention_heads, seq_len, head_size]`; only its
                dtype, device and (when `seq_len` is omitted) sequence axis are used.
            seq_len: number of positions needed. When `None`, the sequence axis of `x`
                is used instead of failing on the comparison below.
        """
        # x: [bs, num_attention_heads, seq_len, head_size]
        if seq_len is None:
            seq_len = x.shape[-2]
        if seq_len > self.max_seq_len_cached:
            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)

        return (
            self.cos_cached[:seq_len].to(dtype=x.dtype),
            self.sin_cached[:seq_len].to(dtype=x.dtype),
        )
135
+
136
+
137
class SkyworkLinearScalingRotaryEmbedding(SkyworkRotaryEmbedding):
    """SkyworkRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev

    Position indices are divided by `scaling_factor` before the cos/sin tables are built.
    """

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
        # Must be set before the parent constructor runs, because that constructor
        # calls `_set_cos_sin_cache`, which reads `self.scaling_factor`.
        self.scaling_factor = scaling_factor
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """(Re)build the cos/sin buffers for `seq_len` positions with scaled position indices."""
        self.max_seq_len_cached = seq_len
        t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
        t = t / self.scaling_factor

        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
154
+
155
+
156
class SkyworkDynamicNTKScalingRotaryEmbedding(SkyworkRotaryEmbedding):
    """SkyworkRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla

    When the requested length exceeds `max_position_embeddings`, the inverse frequencies are
    recomputed from an enlarged base before the cos/sin tables are rebuilt.
    """

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
        # Must be set before the parent constructor runs, because that constructor
        # calls `_set_cos_sin_cache`.
        self.scaling_factor = scaling_factor
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """(Re)build the cos/sin buffers for `seq_len` positions, rescaling the base if needed."""
        self.max_seq_len_cached = seq_len

        if seq_len > self.max_position_embeddings:
            base = self.base * (
                (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
            ) ** (self.dim / (self.dim - 2))
            inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
            # NOTE(review): the rescaled `inv_freq` replaces the buffer and is not restored
            # by this method on later calls with a shorter `seq_len`.
            self.register_buffer("inv_freq", inv_freq, persistent=False)

        t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)

        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
        # Different from paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
180
+
181
+
182
def rotate_half(x):
    """Rotates half the hidden dims of the input.

    The last dimension is split into a first half `x1` and a second half `x2`
    (the second half gets the extra element when the size is odd) and the
    result is the concatenation `(-x2, x1)`.
    """
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)
187
+
188
+
189
+ # Copied from transformers.models.gpt_neox.modeling_gpt_neox.apply_rotary_pos_emb
190
def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
    """Apply rotary position embeddings to the query and key tensors.

    Args:
        q: query tensor.
        k: key tensor.
        cos: cosine table of shape `[seq_len, dim]`.
        sin: sine table of shape `[seq_len, dim]`.
        position_ids: indices used to select the rows of `cos` / `sin` for each token.

    Returns:
        The tuple `(q_embed, k_embed)` of rotated query and key tensors.
    """
    cos = cos[position_ids].unsqueeze(1)  # [seq_len, dim] -> [batch_size, 1, seq_len, head_dim]
    sin = sin[position_ids].unsqueeze(1)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed
196
+
197
+
198
class SkyworkMLP(nn.Module):
    """Gated feed-forward block: `down_proj(act_fn(gate_proj(x)) * up_proj(x))`.

    Reads `hidden_size`, `intermediate_size`, `hidden_act` and (in `forward`)
    `pretraining_tp` from `config`. None of the three projections has a bias.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        """Run the gated MLP on `x`.

        When `config.pretraining_tp > 1` the projection weights are split into that
        many slices and the result is assembled from the per-slice products; this
        path splits the activations along dim 2, so it expects a 3-D input.
        """
        tp = self.config.pretraining_tp
        if tp > 1:
            # Named `slice_size` so the builtin `slice` is not shadowed.
            slice_size = self.intermediate_size // tp
            gate_proj_slices = self.gate_proj.weight.split(slice_size, dim=0)
            up_proj_slices = self.up_proj.weight.split(slice_size, dim=0)
            down_proj_slices = self.down_proj.weight.split(slice_size, dim=1)

            gate_proj = torch.cat([F.linear(x, gate_proj_slices[i]) for i in range(tp)], dim=-1)
            up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(tp)], dim=-1)

            intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice_size, dim=2)
            down_proj = [F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(tp)]
            down_proj = sum(down_proj)
        else:
            down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))

        return down_proj
230
+
231
+
232
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)

    When `n_rep == 1` the input tensor itself is returned, without a copy.
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    # Insert a repeat axis after the head axis, broadcast it, then fold it into the head axis.
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
242
+
243
+
244
class SkyworkAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper

    Queries use `num_attention_heads` heads while keys and values use
    `num_key_value_heads` heads, which are repeated to match the query heads.
    Rotary position embeddings are applied to queries and keys.
    """

    def __init__(self, config: "SkyworkConfig"):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta

        if (self.head_dim * self.num_heads) != self.hidden_size:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {self.num_heads})."
            )
        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
        self._init_rope()

    def _init_rope(self):
        """Create `self.rotary_emb` according to `config.rope_scaling`.

        Raises:
            ValueError: if `rope_scaling["type"]` is neither `"linear"` nor `"dynamic"`.
        """
        if self.config.rope_scaling is None:
            self.rotary_emb = SkyworkRotaryEmbedding(
                self.head_dim,
                max_position_embeddings=self.max_position_embeddings,
                base=self.rope_theta,
            )
        else:
            scaling_type = self.config.rope_scaling["type"]
            scaling_factor = self.config.rope_scaling["factor"]
            if scaling_type == "linear":
                self.rotary_emb = SkyworkLinearScalingRotaryEmbedding(
                    self.head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
                    base=self.rope_theta,
                )
            elif scaling_type == "dynamic":
                self.rotary_emb = SkyworkDynamicNTKScalingRotaryEmbedding(
                    self.head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
                    base=self.rope_theta,
                )
            else:
                raise ValueError(f"Unknown RoPE scaling type {scaling_type}")

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        """Reshape `tensor` to `(bsz, num_heads, seq_len, head_dim)` as a contiguous tensor."""
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        padding_mask: Optional[torch.LongTensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Compute self-attention over `hidden_states`.

        Args:
            hidden_states: input of shape `(bsz, q_len, hidden_size)`.
            attention_mask: optional additive mask of shape `(bsz, 1, q_len, kv_seq_len)`.
            position_ids: indices used to select the rotary cos/sin rows.
            past_key_value: optional `(key, value)` cache that the new keys/values are appended to.
            output_attentions: whether to return the attention probabilities.
            use_cache: whether to return the updated `(key, value)` cache.
            padding_mask: accepted for interface parity with the flash-attention variant; unused here.

        Returns:
            `(attn_output, attn_weights, past_key_value)`; `attn_weights` is `None` unless
            `output_attentions` is set and `past_key_value` is `None` unless `use_cache` is set.

        Raises:
            ValueError: if the attention weights, the mask or the attention output
                do not have the expected shape.
        """
        bsz, q_len, _ = hidden_states.size()

        if self.config.pretraining_tp > 1:
            # Compute the projections slice by slice over `pretraining_tp` weight shards.
            key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
            query_slices = self.q_proj.weight.split(
                (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
            )
            key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
            value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)

            query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)]
            query_states = torch.cat(query_states, dim=-1)

            key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)]
            key_states = torch.cat(key_states, dim=-1)

            value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)]
            value_states = torch.cat(value_states, dim=-1)

        else:
            query_states = self.q_proj(hidden_states)
            key_states = self.k_proj(hidden_states)
            value_states = self.v_proj(hidden_states)

        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        kv_seq_len = key_states.shape[-2]
        if past_key_value is not None:
            kv_seq_len += past_key_value[0].shape[-2]
        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

        if past_key_value is not None:
            # reuse k, v, self_attention
            key_states = torch.cat([past_key_value[0], key_states], dim=2)
            value_states = torch.cat([past_key_value[1], value_states], dim=2)

        past_key_value = (key_states, value_states) if use_cache else None

        # Repeat the key/value heads so they line up with the query heads.
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)

        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights + attention_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.transpose(1, 2).contiguous()

        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)

        if self.config.pretraining_tp > 1:
            attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2)
            o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1)
            attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)])
        else:
            attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value
393
+
394
+
395
class SkyworkFlashAttention2(SkyworkAttention):
    """
    Skywork flash attention module. This module inherits from `SkyworkAttention` as the weights of the module stays
    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
    flash attention and deal with padding tokens in case the input contains any of them.
    """

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        padding_mask: Optional[torch.LongTensor] = None,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Compute self-attention through the Flash Attention kernels.

        Args:
            hidden_states: input of shape `(bsz, q_len, hidden_size)`.
            attention_mask: accepted for interface parity; not used by this implementation.
            position_ids: indices used to select the rotary cos/sin rows.
            past_key_value: optional `(key, value)` cache that the new keys/values are appended to.
            output_attentions: ignored; attention weights are never returned.
            use_cache: whether to return the updated `(key, value)` cache.
            padding_mask: optional `(batch_size, seq_len)` mask with 0 at padding positions.

        Returns:
            `(attn_output, None, past_key_value)`.
        """
        # SkyworkFlashAttention2 attention does not support output_attentions
        output_attentions = False

        bsz, q_len, _ = hidden_states.size()

        # NOTE(review): unlike `SkyworkAttention.forward`, this path does not honour
        # `config.pretraining_tp > 1`; the plain projections are always used.
        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        # Flash attention requires the input to have the shape
        # batch_size x seq_length x num_heads x head_dim.
        # The tensors are moved to (batch, heads, seq, head_dim) here for the rotary
        # embedding and the cache, and transposed back just before the kernel call.
        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        kv_seq_len = key_states.shape[-2]
        if past_key_value is not None:
            kv_seq_len += past_key_value[0].shape[-2]

        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)

        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

        if past_key_value is not None:
            # reuse k, v, self_attention
            key_states = torch.cat([past_key_value[0], key_states], dim=2)
            value_states = torch.cat([past_key_value[1], value_states], dim=2)

        past_key_value = (key_states, value_states) if use_cache else None

        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        # TODO: skywork does not have dropout in the config??
        # It is recommended to use dropout with FA according to the docs
        # when training.
        dropout_rate = 0.0  # if not self.training else self.attn_dropout

        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
        # therefore the input hidden states gets silently casted in float32. Hence, we need
        # cast them back in float16 just to be sure everything works as expected.
        # This might slowdown training & inference so it is recommended to not cast the LayerNorms
        # in fp32. (SkyworkRMSNorm handles it correctly)
        # NOTE(review): the cast target is hard-coded to float16 even when the model is
        # configured for another half-precision dtype (e.g. bfloat16) - confirm intended.
        input_dtype = query_states.dtype
        if input_dtype == torch.float32:
            logger.warning_once(
                "The input hidden states seems to be silently casted in float32, this might be related to"
                " the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
                " float16."
            )

            query_states = query_states.to(torch.float16)
            key_states = key_states.to(torch.float16)
            value_states = value_states.to(torch.float16)

        attn_output = self._flash_attention_forward(
            query_states, key_states, value_states, padding_mask, q_len, dropout=dropout_rate
        )

        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
        attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value

    def _flash_attention_forward(
        self, query_states, key_states, value_states, padding_mask, query_length, dropout=0.0, softmax_scale=None
    ):
        """
        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
        first unpad the input, then computes the attention scores and pad the final attention scores.

        Args:
            query_states (`torch.Tensor`):
                Input query states to be passed to Flash Attention API
            key_states (`torch.Tensor`):
                Input key states to be passed to Flash Attention API
            value_states (`torch.Tensor`):
                Input value states to be passed to Flash Attention API
            padding_mask (`torch.Tensor`):
                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
                position of padding tokens and 1 for the position of non-padding tokens.
            query_length (`int`):
                Length of the query sequence; used when unpadding and when padding the output back.
            dropout (`float`, *optional*):
                Attention dropout
            softmax_scale (`float`, *optional*):
                The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
        """
        # Contains at least one padding token in the sequence
        if padding_mask is not None:
            batch_size = query_states.shape[0]
            query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
                query_states, key_states, value_states, padding_mask, query_length
            )

            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
            max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens

            attn_output_unpad = flash_attn_varlen_func(
                query_states,
                key_states,
                value_states,
                cu_seqlens_q=cu_seqlens_q,
                cu_seqlens_k=cu_seqlens_k,
                max_seqlen_q=max_seqlen_in_batch_q,
                max_seqlen_k=max_seqlen_in_batch_k,
                dropout_p=dropout,
                softmax_scale=softmax_scale,
                causal=True,
            )

            attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
        else:
            attn_output = flash_attn_func(
                query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=True
            )

        return attn_output

    def _upad_input(self, query_layer, key_layer, value_layer, padding_mask, query_length):
        """Remove padding tokens from the query/key/value tensors.

        Returns:
            `(query_layer, key_layer, value_layer, indices_q, (cu_seqlens_q, cu_seqlens_k),
            (max_seqlen_in_batch_q, max_seqlen_in_batch_k))` - the unpadded tensors plus the
            indexing data needed by the variable-length kernel and to re-pad its output.
        """
        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(padding_mask)
        batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape

        key_layer = index_first_axis(
            key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
        )
        value_layer = index_first_axis(
            value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
        )
        if query_length == kv_seq_len:
            # Queries cover the same positions as the keys, so the key indexing is reused.
            query_layer = index_first_axis(
                query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
            )
            cu_seqlens_q = cu_seqlens_k
            max_seqlen_in_batch_q = max_seqlen_in_batch_k
            indices_q = indices_k
        elif query_length == 1:
            # One query token per row: every row contributes exactly one query.
            max_seqlen_in_batch_q = 1
            cu_seqlens_q = torch.arange(
                batch_size + 1, dtype=torch.int32, device=query_layer.device
            )  # There is a memcpy here, that is very bad.
            indices_q = cu_seqlens_q[:-1]
            query_layer = query_layer.squeeze(1)
        else:
            # The -q_len: slice assumes left padding.
            padding_mask = padding_mask[:, -query_length:]
            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, padding_mask)

        return (
            query_layer,
            key_layer,
            value_layer,
            indices_q,
            (cu_seqlens_q, cu_seqlens_k),
            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
        )
571
+
572
+
573
class SkyworkDecoderLayer(nn.Module):
    """One decoder block: RMS-normed self-attention and RMS-normed MLP, each wrapped in a residual connection."""

    def __init__(self, config: SkyworkConfig):
        super().__init__()
        self.hidden_size = config.hidden_size

        # The flash-attention implementation is used only when the config carries a truthy
        # `_flash_attn_2_enabled` flag; otherwise the regular attention module is built.
        if getattr(config, "_flash_attn_2_enabled", False):
            attention = SkyworkFlashAttention2(config=config)
        else:
            attention = SkyworkAttention(config=config)
        self.self_attn = attention

        self.mlp = SkyworkMLP(config)
        norm_eps = config.rms_norm_eps
        self.input_layernorm = SkyworkRMSNorm(config.hidden_size, eps=norm_eps)
        self.post_attention_layernorm = SkyworkRMSNorm(config.hidden_size, eps=norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        padding_mask: Optional[torch.LongTensor] = None,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Run the block on `hidden_states`.

        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            position_ids (`torch.LongTensor`, *optional*): forwarded unchanged to the attention module.
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to include the attention weights in the returned tuple.
            use_cache (`bool`, *optional*):
                If set to `True`, the present key/value states are included in the returned tuple so they can be
                reused to speed up decoding.
            padding_mask (`torch.LongTensor`, *optional*): forwarded unchanged to the attention module.

        Returns:
            A tuple starting with the new hidden states, followed by the attention weights when
            `output_attentions` is set and then by the present key/value states when `use_cache` is set.
        """
        # Self-attention sub-block (pre-norm, then residual add).
        attn_output, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=self.input_layernorm(hidden_states),
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            padding_mask=padding_mask,
        )
        hidden_states = hidden_states + attn_output

        # Feed-forward sub-block (pre-norm, then residual add).
        mlp_output = self.mlp(self.post_attention_layernorm(hidden_states))
        hidden_states = hidden_states + mlp_output

        collected = [hidden_states]
        if output_attentions:
            collected.append(self_attn_weights)
        if use_cache:
            collected.append(present_key_value)
        return tuple(collected)
641
+
642
class SkyworkPreTrainedModel(PreTrainedModel):
    """Shared base class for the Skywork models: class-level loading settings plus weight initialisation."""

    config_class = SkyworkConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["SkyworkDecoderLayer"]
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn_2 = True

    def _init_weights(self, module):
        """Initialise `nn.Linear` and `nn.Embedding` weights from N(0, `config.initializer_range`)."""
        init_std = self.config.initializer_range

        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=init_std)
            bias = module.bias
            if bias is not None:
                bias.data.zero_()
            return

        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=init_std)
            pad_row = module.padding_idx
            if pad_row is not None:
                # The padding row is reset to zeros after the random draw.
                module.weight.data[pad_row].zero_()

    def _set_gradient_checkpointing(self, module, value=False):
        """Set the `gradient_checkpointing` flag on `module` when it is a `SkyworkModel`."""
        if not isinstance(module, SkyworkModel):
            return
        module.gradient_checkpointing = value
664
+
665
class SkyworkModel(SkyworkPreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`SkyworkDecoderLayer`]

    Args:
        config: SkyworkConfig
    """

    def __init__(self, config: SkyworkConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList([SkyworkDecoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.norm = SkyworkRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the token embedding module."""
        return self.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the token embedding module with `value`."""
        self.embed_tokens = value

    # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
    def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
        """
        Combine the causal mask with the expanded padding mask.

        Returns the sum of both masks when both exist, whichever one exists otherwise, or `None` when neither is
        built (single-token input and no `attention_mask`).
        """
        # create causal mask
        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
        combined_attention_mask = None
        if input_shape[-1] > 1:
            combined_attention_mask = _make_causal_mask(
                input_shape,
                inputs_embeds.dtype,
                device=inputs_embeds.device,
                past_key_values_length=past_key_values_length,
            )

        if attention_mask is not None:
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
                inputs_embeds.device
            )
            combined_attention_mask = (
                expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
            )

        return combined_attention_mask

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        """
        Embed the inputs, run every decoder layer and apply the final norm.

        Args:
            input_ids: token ids of shape `(batch, seq_len)`; mutually exclusive with `inputs_embeds`.
            attention_mask: optional 2D mask over past and current positions; when omitted an all-ones mask of
                shape `(batch, seq_len + past_length)` is used.
            position_ids: optional position indices; when omitted they are
                `arange(past_length, past_length + seq_len)` with a leading batch dimension of 1.
            past_key_values: optional per-layer cache; the past length is read from
                `past_key_values[0][0].shape[2]`.
            inputs_embeds: pre-computed embeddings of shape `(batch, seq_len, hidden)`, used instead of
                `input_ids`.
            use_cache: whether to collect and return each layer's present key/value states.
            output_attentions: whether to collect and return each layer's attention weights.
            output_hidden_states: whether to collect the hidden states before every layer and after the final
                norm.
            return_dict: return a `BaseModelOutputWithPast` instead of a plain tuple.

        Raises:
            ValueError: if both or neither of `input_ids` and `inputs_embeds` are given.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # retrieve input_ids and inputs_embeds
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            batch_size, seq_length = input_ids.shape
        elif inputs_embeds is not None:
            batch_size, seq_length, _ = inputs_embeds.shape
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        seq_length_with_past = seq_length
        past_key_values_length = 0

        if past_key_values is not None:
            past_key_values_length = past_key_values[0][0].shape[2]
            seq_length_with_past = seq_length_with_past + past_key_values_length

        if position_ids is None:
            device = input_ids.device if input_ids is not None else inputs_embeds.device
            position_ids = torch.arange(
                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
            )
            position_ids = position_ids.unsqueeze(0)

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)
        # embed positions
        if attention_mask is None:
            attention_mask = torch.ones(
                (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
            )
            padding_mask = None
        else:
            # The raw 2D mask is forwarded to the layers only when it actually masks something.
            if 0 in attention_mask:
                padding_mask = attention_mask
            else:
                padding_mask = None

        attention_mask = self._prepare_decoder_attention_mask(
            attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
        )

        hidden_states = inputs_embeds

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = () if use_cache else None

        for idx, decoder_layer in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            past_key_value = past_key_values[idx] if past_key_values is not None else None

            if self.gradient_checkpointing and self.training:

                def create_custom_forward(module, layer_past):
                    # `layer_past` is bound per layer on purpose: checkpointing calls `custom_forward` again
                    # during backward, after this loop has finished, so reading the loop variable from the
                    # enclosing scope would hand every recomputed layer the last layer's cache entry.
                    def custom_forward(*inputs):
                        # positional order: hidden_states, attention_mask, position_ids, past_key_value,
                        # output_attentions (use_cache keeps its default)
                        return module(*inputs, layer_past, output_attentions, padding_mask=padding_mask)

                    return custom_forward

                layer_outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(decoder_layer, past_key_value),
                    hidden_states,
                    attention_mask,
                    position_ids,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=attention_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_value,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                    padding_mask=padding_mask,
                )

            hidden_states = layer_outputs[0]

            if use_cache:
                # The cache entry follows the attention weights when those are returned too.
                next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None
        if not return_dict:
            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )
844
+
845
+
846
class SkyworkForCausalLM(SkyworkPreTrainedModel):
    """`SkyworkModel` decoder followed by a bias-free linear head that projects hidden states to vocabulary logits."""

    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model = SkyworkModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the decoder's token embedding module."""
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the decoder's token embedding module with `value`."""
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        """Return the language-modeling head."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """Replace the language-modeling head with `new_embeddings`."""
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        """Replace the wrapped decoder with `decoder`."""
        self.model = decoder

    def get_decoder(self):
        """Return the wrapped decoder."""
        return self.model

    @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        Args:
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Returns:

        Example:

        ```python
        >>> from transformers import AutoModelForCausalLM, AutoTokenizer

        >>> # The model class is resolved through the `auto_map` entry of the checkpoint's config.json.
        >>> model = AutoModelForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS, trust_remote_code=True)
        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER, trust_remote_code=True)

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""

        # Fall back to the config for every flag the caller left unset.
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]
        tp_degree = self.config.pretraining_tp
        if tp_degree > 1:
            # Project with row-slices of the head weight and concatenate the partial logits.
            weight_slices = self.lm_head.weight.split(self.vocab_size // tp_degree, dim=0)
            partial_logits = [F.linear(hidden_states, weight_slice) for weight_slice in weight_slices[:tp_degree]]
            logits = torch.cat(partial_logits, dim=-1)
        else:
            logits = self.lm_head(hidden_states)
        logits = logits.float()

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n, then flatten the tokens
            predictions = logits[..., :-1, :].contiguous().view(-1, self.config.vocab_size)
            # Enable model parallelism
            targets = labels[..., 1:].contiguous().view(-1).to(predictions.device)
            loss = CrossEntropyLoss()(predictions, targets)

        if return_dict:
            return CausalLMOutputWithPast(
                loss=loss,
                logits=logits,
                past_key_values=outputs.past_key_values,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
            )

        output = (logits,) + outputs[1:]
        if loss is None:
            return output
        return (loss,) + output

    def prepare_inputs_for_generation(
        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
    ):
        """
        Build the keyword arguments for one generation step.

        With a non-empty cache only the last token of `input_ids` is kept. When no `position_ids` are passed but an
        `attention_mask` is, positions are derived from the mask's running sum (masked slots get position 1).
        `inputs_embeds` is used only when `past_key_values` is `None`, i.e. for the first step.
        """
        if past_key_values:
            input_ids = input_ids[:, -1:]

        position_ids = kwargs.get("position_ids", None)
        if position_ids is None and attention_mask is not None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, -1].unsqueeze(-1)

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        first_step_with_embeds = inputs_embeds is not None and past_key_values is None
        if first_step_with_embeds:
            primary_key, primary_value = "inputs_embeds", inputs_embeds
        else:
            primary_key, primary_value = "input_ids", input_ids

        return {
            primary_key: primary_value,
            "position_ids": position_ids,
            "past_key_values": past_key_values,
            "use_cache": kwargs.get("use_cache"),
            "attention_mask": attention_mask,
        }

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        """Select the `beam_idx` rows (dim 0) of every cached tensor, keeping the per-layer tuple layout."""
        return tuple(
            tuple(cached.index_select(0, beam_idx.to(cached.device)) for cached in layer_past)
            for layer_past in past_key_values
        )
1007
+
1008
class SkyworkForSequenceClassification(SkyworkPreTrainedModel):
    """`SkyworkModel` decoder with a bias-free linear scoring head; one token position per sequence is pooled."""

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = SkyworkModel(config)
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the decoder's token embedding module."""
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the decoder's token embedding module with `value`."""
        self.model.embed_tokens = value

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        transformer_outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        logits = self.score(transformer_outputs[0])

        batch_size = (input_ids if input_ids is not None else inputs_embeds).shape[0]

        pad_token_id = self.config.pad_token_id
        if pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")

        if pad_token_id is not None and input_ids is not None:
            # Index of the token just before the first pad token in each row; a row without any pad token
            # yields -1, which selects the last position.
            first_pad = torch.eq(input_ids, pad_token_id).long().argmax(-1)
            sequence_lengths = (first_pad - 1).to(logits.device)
        else:
            # Without a pad token (or without token ids) the last position is pooled.
            sequence_lengths = -1

        batch_rows = torch.arange(batch_size, device=logits.device)
        pooled_logits = logits[batch_rows, sequence_lengths]

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)

            # Infer the problem type once and remember it on the config.
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    inferred_type = "regression"
                elif self.num_labels > 1 and labels.dtype in (torch.long, torch.int):
                    inferred_type = "single_label_classification"
                else:
                    inferred_type = "multi_label_classification"
                self.config.problem_type = inferred_type

            problem_type = self.config.problem_type
            if problem_type == "regression":
                criterion = MSELoss()
                if self.num_labels == 1:
                    loss = criterion(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = criterion(pooled_logits, labels)
            elif problem_type == "single_label_classification":
                criterion = CrossEntropyLoss()
                loss = criterion(pooled_logits.view(-1, self.num_labels), labels.view(-1))
            elif problem_type == "multi_label_classification":
                criterion = BCEWithLogitsLoss()
                loss = criterion(pooled_logits, labels)

        if return_dict:
            return SequenceClassifierOutputWithPast(
                loss=loss,
                logits=pooled_logits,
                past_key_values=transformer_outputs.past_key_values,
                hidden_states=transformer_outputs.hidden_states,
                attentions=transformer_outputs.attentions,
            )

        output = (pooled_logits,) + transformer_outputs[1:]
        if loss is None:
            return output
        return (loss,) + output
pytorch_model-00001-of-00002.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef96286501811b2ee17470bbf8c071cafbd66f36e7b9a0d3e0f5fa43a5c6ae28
3
+ size 9983005310
pytorch_model-00002-of-00002.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8e1887f1eef677bac502bfdb6e534601b291a7586c36abb686cfcd398a7e2a7
3
+ size 4485927597
pytorch_model.bin.index.json ADDED
@@ -0,0 +1,842 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 14468637696
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "pytorch_model-00002-of-00002.bin",
7
+ "model.embed_tokens.weight": "pytorch_model-00001-of-00002.bin",
8
+ "model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
9
+ "model.layers.0.mlp.down_proj.SCB": "pytorch_model-00001-of-00002.bin",
10
+ "model.layers.0.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
11
+ "model.layers.0.mlp.gate_proj.SCB": "pytorch_model-00001-of-00002.bin",
12
+ "model.layers.0.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
13
+ "model.layers.0.mlp.up_proj.SCB": "pytorch_model-00001-of-00002.bin",
14
+ "model.layers.0.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
15
+ "model.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
16
+ "model.layers.0.self_attn.k_proj.SCB": "pytorch_model-00001-of-00002.bin",
17
+ "model.layers.0.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
18
+ "model.layers.0.self_attn.o_proj.SCB": "pytorch_model-00001-of-00002.bin",
19
+ "model.layers.0.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
20
+ "model.layers.0.self_attn.q_proj.SCB": "pytorch_model-00001-of-00002.bin",
21
+ "model.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
22
+ "model.layers.0.self_attn.v_proj.SCB": "pytorch_model-00001-of-00002.bin",
23
+ "model.layers.0.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
24
+ "model.layers.1.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
25
+ "model.layers.1.mlp.down_proj.SCB": "pytorch_model-00001-of-00002.bin",
26
+ "model.layers.1.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
27
+ "model.layers.1.mlp.gate_proj.SCB": "pytorch_model-00001-of-00002.bin",
28
+ "model.layers.1.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
29
+ "model.layers.1.mlp.up_proj.SCB": "pytorch_model-00001-of-00002.bin",
30
+ "model.layers.1.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
31
+ "model.layers.1.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
32
+ "model.layers.1.self_attn.k_proj.SCB": "pytorch_model-00001-of-00002.bin",
33
+ "model.layers.1.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
34
+ "model.layers.1.self_attn.o_proj.SCB": "pytorch_model-00001-of-00002.bin",
35
+ "model.layers.1.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
36
+ "model.layers.1.self_attn.q_proj.SCB": "pytorch_model-00001-of-00002.bin",
37
+ "model.layers.1.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
38
+ "model.layers.1.self_attn.v_proj.SCB": "pytorch_model-00001-of-00002.bin",
39
+ "model.layers.1.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
40
+ "model.layers.10.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
41
+ "model.layers.10.mlp.down_proj.SCB": "pytorch_model-00001-of-00002.bin",
42
+ "model.layers.10.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
43
+ "model.layers.10.mlp.gate_proj.SCB": "pytorch_model-00001-of-00002.bin",
44
+ "model.layers.10.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
45
+ "model.layers.10.mlp.up_proj.SCB": "pytorch_model-00001-of-00002.bin",
46
+ "model.layers.10.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
47
+ "model.layers.10.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
48
+ "model.layers.10.self_attn.k_proj.SCB": "pytorch_model-00001-of-00002.bin",
49
+ "model.layers.10.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
50
+ "model.layers.10.self_attn.o_proj.SCB": "pytorch_model-00001-of-00002.bin",
51
+ "model.layers.10.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
52
+ "model.layers.10.self_attn.q_proj.SCB": "pytorch_model-00001-of-00002.bin",
53
+ "model.layers.10.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
54
+ "model.layers.10.self_attn.v_proj.SCB": "pytorch_model-00001-of-00002.bin",
55
+ "model.layers.10.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
56
+ "model.layers.11.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
57
+ "model.layers.11.mlp.down_proj.SCB": "pytorch_model-00001-of-00002.bin",
58
+ "model.layers.11.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
59
+ "model.layers.11.mlp.gate_proj.SCB": "pytorch_model-00001-of-00002.bin",
60
+ "model.layers.11.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
61
+ "model.layers.11.mlp.up_proj.SCB": "pytorch_model-00001-of-00002.bin",
62
+ "model.layers.11.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
63
+ "model.layers.11.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
64
+ "model.layers.11.self_attn.k_proj.SCB": "pytorch_model-00001-of-00002.bin",
65
+ "model.layers.11.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
66
+ "model.layers.11.self_attn.o_proj.SCB": "pytorch_model-00001-of-00002.bin",
67
+ "model.layers.11.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
68
+ "model.layers.11.self_attn.q_proj.SCB": "pytorch_model-00001-of-00002.bin",
69
+ "model.layers.11.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
70
+ "model.layers.11.self_attn.v_proj.SCB": "pytorch_model-00001-of-00002.bin",
71
+ "model.layers.11.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
72
+ "model.layers.12.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
73
+ "model.layers.12.mlp.down_proj.SCB": "pytorch_model-00001-of-00002.bin",
74
+ "model.layers.12.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
75
+ "model.layers.12.mlp.gate_proj.SCB": "pytorch_model-00001-of-00002.bin",
76
+ "model.layers.12.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
77
+ "model.layers.12.mlp.up_proj.SCB": "pytorch_model-00001-of-00002.bin",
78
+ "model.layers.12.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
79
+ "model.layers.12.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
80
+ "model.layers.12.self_attn.k_proj.SCB": "pytorch_model-00001-of-00002.bin",
81
+ "model.layers.12.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
82
+ "model.layers.12.self_attn.o_proj.SCB": "pytorch_model-00001-of-00002.bin",
83
+ "model.layers.12.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
84
+ "model.layers.12.self_attn.q_proj.SCB": "pytorch_model-00001-of-00002.bin",
85
+ "model.layers.12.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
86
+ "model.layers.12.self_attn.v_proj.SCB": "pytorch_model-00001-of-00002.bin",
87
+ "model.layers.12.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
88
+ "model.layers.13.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
89
+ "model.layers.13.mlp.down_proj.SCB": "pytorch_model-00001-of-00002.bin",
90
+ "model.layers.13.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
91
+ "model.layers.13.mlp.gate_proj.SCB": "pytorch_model-00001-of-00002.bin",
92
+ "model.layers.13.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
93
+ "model.layers.13.mlp.up_proj.SCB": "pytorch_model-00001-of-00002.bin",
94
+ "model.layers.13.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
95
+ "model.layers.13.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
96
+ "model.layers.13.self_attn.k_proj.SCB": "pytorch_model-00001-of-00002.bin",
97
+ "model.layers.13.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
98
+ "model.layers.13.self_attn.o_proj.SCB": "pytorch_model-00001-of-00002.bin",
99
+ "model.layers.13.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
100
+ "model.layers.13.self_attn.q_proj.SCB": "pytorch_model-00001-of-00002.bin",
101
+ "model.layers.13.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
102
+ "model.layers.13.self_attn.v_proj.SCB": "pytorch_model-00001-of-00002.bin",
103
+ "model.layers.13.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
104
+ "model.layers.14.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
105
+ "model.layers.14.mlp.down_proj.SCB": "pytorch_model-00001-of-00002.bin",
106
+ "model.layers.14.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
107
+ "model.layers.14.mlp.gate_proj.SCB": "pytorch_model-00001-of-00002.bin",
108
+ "model.layers.14.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
109
+ "model.layers.14.mlp.up_proj.SCB": "pytorch_model-00001-of-00002.bin",
110
+ "model.layers.14.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
111
+ "model.layers.14.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
112
+ "model.layers.14.self_attn.k_proj.SCB": "pytorch_model-00001-of-00002.bin",
113
+ "model.layers.14.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
114
+ "model.layers.14.self_attn.o_proj.SCB": "pytorch_model-00001-of-00002.bin",
115
+ "model.layers.14.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
116
+ "model.layers.14.self_attn.q_proj.SCB": "pytorch_model-00001-of-00002.bin",
117
+ "model.layers.14.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
118
+ "model.layers.14.self_attn.v_proj.SCB": "pytorch_model-00001-of-00002.bin",
119
+ "model.layers.14.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
120
+ "model.layers.15.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
121
+ "model.layers.15.mlp.down_proj.SCB": "pytorch_model-00001-of-00002.bin",
122
+ "model.layers.15.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
123
+ "model.layers.15.mlp.gate_proj.SCB": "pytorch_model-00001-of-00002.bin",
124
+ "model.layers.15.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
125
+ "model.layers.15.mlp.up_proj.SCB": "pytorch_model-00001-of-00002.bin",
126
+ "model.layers.15.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
127
+ "model.layers.15.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
128
+ "model.layers.15.self_attn.k_proj.SCB": "pytorch_model-00001-of-00002.bin",
129
+ "model.layers.15.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
130
+ "model.layers.15.self_attn.o_proj.SCB": "pytorch_model-00001-of-00002.bin",
131
+ "model.layers.15.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
132
+ "model.layers.15.self_attn.q_proj.SCB": "pytorch_model-00001-of-00002.bin",
133
+ "model.layers.15.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
134
+ "model.layers.15.self_attn.v_proj.SCB": "pytorch_model-00001-of-00002.bin",
135
+ "model.layers.15.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
136
+ "model.layers.16.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
137
+ "model.layers.16.mlp.down_proj.SCB": "pytorch_model-00001-of-00002.bin",
138
+ "model.layers.16.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
139
+ "model.layers.16.mlp.gate_proj.SCB": "pytorch_model-00001-of-00002.bin",
140
+ "model.layers.16.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
141
+ "model.layers.16.mlp.up_proj.SCB": "pytorch_model-00001-of-00002.bin",
142
+ "model.layers.16.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
143
+ "model.layers.16.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
144
+ "model.layers.16.self_attn.k_proj.SCB": "pytorch_model-00001-of-00002.bin",
145
+ "model.layers.16.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
146
+ "model.layers.16.self_attn.o_proj.SCB": "pytorch_model-00001-of-00002.bin",
147
+ "model.layers.16.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
148
+ "model.layers.16.self_attn.q_proj.SCB": "pytorch_model-00001-of-00002.bin",
149
+ "model.layers.16.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
150
+ "model.layers.16.self_attn.v_proj.SCB": "pytorch_model-00001-of-00002.bin",
151
+ "model.layers.16.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
152
+ "model.layers.17.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
153
+ "model.layers.17.mlp.down_proj.SCB": "pytorch_model-00001-of-00002.bin",
154
+ "model.layers.17.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
155
+ "model.layers.17.mlp.gate_proj.SCB": "pytorch_model-00001-of-00002.bin",
156
+ "model.layers.17.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
157
+ "model.layers.17.mlp.up_proj.SCB": "pytorch_model-00001-of-00002.bin",
158
+ "model.layers.17.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
159
+ "model.layers.17.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
160
+ "model.layers.17.self_attn.k_proj.SCB": "pytorch_model-00001-of-00002.bin",
161
+ "model.layers.17.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
162
+ "model.layers.17.self_attn.o_proj.SCB": "pytorch_model-00001-of-00002.bin",
163
+ "model.layers.17.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
164
+ "model.layers.17.self_attn.q_proj.SCB": "pytorch_model-00001-of-00002.bin",
165
+ "model.layers.17.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
166
+ "model.layers.17.self_attn.v_proj.SCB": "pytorch_model-00001-of-00002.bin",
167
+ "model.layers.17.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
168
+ "model.layers.18.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
169
+ "model.layers.18.mlp.down_proj.SCB": "pytorch_model-00001-of-00002.bin",
170
+ "model.layers.18.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
171
+ "model.layers.18.mlp.gate_proj.SCB": "pytorch_model-00001-of-00002.bin",
172
+ "model.layers.18.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
173
+ "model.layers.18.mlp.up_proj.SCB": "pytorch_model-00001-of-00002.bin",
174
+ "model.layers.18.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
175
+ "model.layers.18.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
176
+ "model.layers.18.self_attn.k_proj.SCB": "pytorch_model-00001-of-00002.bin",
177
+ "model.layers.18.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
178
+ "model.layers.18.self_attn.o_proj.SCB": "pytorch_model-00001-of-00002.bin",
179
+ "model.layers.18.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
180
+ "model.layers.18.self_attn.q_proj.SCB": "pytorch_model-00001-of-00002.bin",
181
+ "model.layers.18.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
182
+ "model.layers.18.self_attn.v_proj.SCB": "pytorch_model-00001-of-00002.bin",
183
+ "model.layers.18.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
184
+ "model.layers.19.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
185
+ "model.layers.19.mlp.down_proj.SCB": "pytorch_model-00001-of-00002.bin",
186
+ "model.layers.19.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
187
+ "model.layers.19.mlp.gate_proj.SCB": "pytorch_model-00001-of-00002.bin",
188
+ "model.layers.19.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
189
+ "model.layers.19.mlp.up_proj.SCB": "pytorch_model-00001-of-00002.bin",
190
+ "model.layers.19.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
191
+ "model.layers.19.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
192
+ "model.layers.19.self_attn.k_proj.SCB": "pytorch_model-00001-of-00002.bin",
193
+ "model.layers.19.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
194
+ "model.layers.19.self_attn.o_proj.SCB": "pytorch_model-00001-of-00002.bin",
195
+ "model.layers.19.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
196
+ "model.layers.19.self_attn.q_proj.SCB": "pytorch_model-00001-of-00002.bin",
197
+ "model.layers.19.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
198
+ "model.layers.19.self_attn.v_proj.SCB": "pytorch_model-00001-of-00002.bin",
199
+ "model.layers.19.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
200
+ "model.layers.2.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
201
+ "model.layers.2.mlp.down_proj.SCB": "pytorch_model-00001-of-00002.bin",
202
+ "model.layers.2.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
203
+ "model.layers.2.mlp.gate_proj.SCB": "pytorch_model-00001-of-00002.bin",
204
+ "model.layers.2.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
205
+ "model.layers.2.mlp.up_proj.SCB": "pytorch_model-00001-of-00002.bin",
206
+ "model.layers.2.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
207
+ "model.layers.2.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
208
+ "model.layers.2.self_attn.k_proj.SCB": "pytorch_model-00001-of-00002.bin",
209
+ "model.layers.2.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
210
+ "model.layers.2.self_attn.o_proj.SCB": "pytorch_model-00001-of-00002.bin",
211
+ "model.layers.2.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
212
+ "model.layers.2.self_attn.q_proj.SCB": "pytorch_model-00001-of-00002.bin",
213
+ "model.layers.2.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
214
+ "model.layers.2.self_attn.v_proj.SCB": "pytorch_model-00001-of-00002.bin",
215
+ "model.layers.2.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
216
+ "model.layers.20.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
217
+ "model.layers.20.mlp.down_proj.SCB": "pytorch_model-00001-of-00002.bin",
218
+ "model.layers.20.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
219
+ "model.layers.20.mlp.gate_proj.SCB": "pytorch_model-00001-of-00002.bin",
220
+ "model.layers.20.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
221
+ "model.layers.20.mlp.up_proj.SCB": "pytorch_model-00001-of-00002.bin",
222
+ "model.layers.20.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
223
+ "model.layers.20.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
224
+ "model.layers.20.self_attn.k_proj.SCB": "pytorch_model-00001-of-00002.bin",
225
+ "model.layers.20.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
226
+ "model.layers.20.self_attn.o_proj.SCB": "pytorch_model-00001-of-00002.bin",
227
+ "model.layers.20.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
228
+ "model.layers.20.self_attn.q_proj.SCB": "pytorch_model-00001-of-00002.bin",
229
+ "model.layers.20.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
230
+ "model.layers.20.self_attn.v_proj.SCB": "pytorch_model-00001-of-00002.bin",
231
+ "model.layers.20.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
232
+ "model.layers.21.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
233
+ "model.layers.21.mlp.down_proj.SCB": "pytorch_model-00001-of-00002.bin",
234
+ "model.layers.21.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
235
+ "model.layers.21.mlp.gate_proj.SCB": "pytorch_model-00001-of-00002.bin",
236
+ "model.layers.21.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
237
+ "model.layers.21.mlp.up_proj.SCB": "pytorch_model-00001-of-00002.bin",
238
+ "model.layers.21.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
239
+ "model.layers.21.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
240
+ "model.layers.21.self_attn.k_proj.SCB": "pytorch_model-00001-of-00002.bin",
241
+ "model.layers.21.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
242
+ "model.layers.21.self_attn.o_proj.SCB": "pytorch_model-00001-of-00002.bin",
243
+ "model.layers.21.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
244
+ "model.layers.21.self_attn.q_proj.SCB": "pytorch_model-00001-of-00002.bin",
245
+ "model.layers.21.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
246
+ "model.layers.21.self_attn.v_proj.SCB": "pytorch_model-00001-of-00002.bin",
247
+ "model.layers.21.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
248
+ "model.layers.22.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
249
+ "model.layers.22.mlp.down_proj.SCB": "pytorch_model-00001-of-00002.bin",
250
+ "model.layers.22.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
251
+ "model.layers.22.mlp.gate_proj.SCB": "pytorch_model-00001-of-00002.bin",
252
+ "model.layers.22.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
253
+ "model.layers.22.mlp.up_proj.SCB": "pytorch_model-00001-of-00002.bin",
254
+ "model.layers.22.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
255
+ "model.layers.22.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
256
+ "model.layers.22.self_attn.k_proj.SCB": "pytorch_model-00001-of-00002.bin",
257
+ "model.layers.22.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
258
+ "model.layers.22.self_attn.o_proj.SCB": "pytorch_model-00001-of-00002.bin",
259
+ "model.layers.22.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
260
+ "model.layers.22.self_attn.q_proj.SCB": "pytorch_model-00001-of-00002.bin",
261
+ "model.layers.22.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
262
+ "model.layers.22.self_attn.v_proj.SCB": "pytorch_model-00001-of-00002.bin",
263
+ "model.layers.22.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
264
+ "model.layers.23.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
265
+ "model.layers.23.mlp.down_proj.SCB": "pytorch_model-00001-of-00002.bin",
266
+ "model.layers.23.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
267
+ "model.layers.23.mlp.gate_proj.SCB": "pytorch_model-00001-of-00002.bin",
268
+ "model.layers.23.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
269
+ "model.layers.23.mlp.up_proj.SCB": "pytorch_model-00001-of-00002.bin",
270
+ "model.layers.23.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
271
+ "model.layers.23.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
272
+ "model.layers.23.self_attn.k_proj.SCB": "pytorch_model-00001-of-00002.bin",
273
+ "model.layers.23.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
274
+ "model.layers.23.self_attn.o_proj.SCB": "pytorch_model-00001-of-00002.bin",
275
+ "model.layers.23.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
276
+ "model.layers.23.self_attn.q_proj.SCB": "pytorch_model-00001-of-00002.bin",
277
+ "model.layers.23.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
278
+ "model.layers.23.self_attn.v_proj.SCB": "pytorch_model-00001-of-00002.bin",
279
+ "model.layers.23.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
280
+ "model.layers.24.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
281
+ "model.layers.24.mlp.down_proj.SCB": "pytorch_model-00001-of-00002.bin",
282
+ "model.layers.24.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
283
+ "model.layers.24.mlp.gate_proj.SCB": "pytorch_model-00001-of-00002.bin",
284
+ "model.layers.24.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
285
+ "model.layers.24.mlp.up_proj.SCB": "pytorch_model-00001-of-00002.bin",
286
+ "model.layers.24.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
287
+ "model.layers.24.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
288
+ "model.layers.24.self_attn.k_proj.SCB": "pytorch_model-00001-of-00002.bin",
289
+ "model.layers.24.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
290
+ "model.layers.24.self_attn.o_proj.SCB": "pytorch_model-00001-of-00002.bin",
291
+ "model.layers.24.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
292
+ "model.layers.24.self_attn.q_proj.SCB": "pytorch_model-00001-of-00002.bin",
293
+ "model.layers.24.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
294
+ "model.layers.24.self_attn.v_proj.SCB": "pytorch_model-00001-of-00002.bin",
295
+ "model.layers.24.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
296
+ "model.layers.25.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
297
+ "model.layers.25.mlp.down_proj.SCB": "pytorch_model-00001-of-00002.bin",
298
+ "model.layers.25.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
299
+ "model.layers.25.mlp.gate_proj.SCB": "pytorch_model-00001-of-00002.bin",
300
+ "model.layers.25.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
301
+ "model.layers.25.mlp.up_proj.SCB": "pytorch_model-00001-of-00002.bin",
302
+ "model.layers.25.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
303
+ "model.layers.25.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
304
+ "model.layers.25.self_attn.k_proj.SCB": "pytorch_model-00001-of-00002.bin",
305
+ "model.layers.25.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
306
+ "model.layers.25.self_attn.o_proj.SCB": "pytorch_model-00001-of-00002.bin",
307
+ "model.layers.25.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
308
+ "model.layers.25.self_attn.q_proj.SCB": "pytorch_model-00001-of-00002.bin",
309
+ "model.layers.25.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
310
+ "model.layers.25.self_attn.v_proj.SCB": "pytorch_model-00001-of-00002.bin",
311
+ "model.layers.25.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
312
+ "model.layers.26.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
313
+ "model.layers.26.mlp.down_proj.SCB": "pytorch_model-00001-of-00002.bin",
314
+ "model.layers.26.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
315
+ "model.layers.26.mlp.gate_proj.SCB": "pytorch_model-00001-of-00002.bin",
316
+ "model.layers.26.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
317
+ "model.layers.26.mlp.up_proj.SCB": "pytorch_model-00001-of-00002.bin",
318
+ "model.layers.26.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
319
+ "model.layers.26.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
320
+ "model.layers.26.self_attn.k_proj.SCB": "pytorch_model-00001-of-00002.bin",
321
+ "model.layers.26.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
322
+ "model.layers.26.self_attn.o_proj.SCB": "pytorch_model-00001-of-00002.bin",
323
+ "model.layers.26.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
324
+ "model.layers.26.self_attn.q_proj.SCB": "pytorch_model-00001-of-00002.bin",
325
+ "model.layers.26.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
326
+ "model.layers.26.self_attn.v_proj.SCB": "pytorch_model-00001-of-00002.bin",
327
+ "model.layers.26.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
328
+ "model.layers.27.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
329
+ "model.layers.27.mlp.down_proj.SCB": "pytorch_model-00001-of-00002.bin",
330
+ "model.layers.27.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
331
+ "model.layers.27.mlp.gate_proj.SCB": "pytorch_model-00001-of-00002.bin",
332
+ "model.layers.27.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
333
+ "model.layers.27.mlp.up_proj.SCB": "pytorch_model-00001-of-00002.bin",
334
+ "model.layers.27.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
335
+ "model.layers.27.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
336
+ "model.layers.27.self_attn.k_proj.SCB": "pytorch_model-00001-of-00002.bin",
337
+ "model.layers.27.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
338
+ "model.layers.27.self_attn.o_proj.SCB": "pytorch_model-00001-of-00002.bin",
339
+ "model.layers.27.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
340
+ "model.layers.27.self_attn.q_proj.SCB": "pytorch_model-00001-of-00002.bin",
341
+ "model.layers.27.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
342
+ "model.layers.27.self_attn.v_proj.SCB": "pytorch_model-00001-of-00002.bin",
343
+ "model.layers.27.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
344
+ "model.layers.28.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
345
+ "model.layers.28.mlp.down_proj.SCB": "pytorch_model-00001-of-00002.bin",
346
+ "model.layers.28.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
347
+ "model.layers.28.mlp.gate_proj.SCB": "pytorch_model-00001-of-00002.bin",
348
+ "model.layers.28.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
349
+ "model.layers.28.mlp.up_proj.SCB": "pytorch_model-00001-of-00002.bin",
350
+ "model.layers.28.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
351
+ "model.layers.28.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
352
+ "model.layers.28.self_attn.k_proj.SCB": "pytorch_model-00001-of-00002.bin",
353
+ "model.layers.28.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
354
+ "model.layers.28.self_attn.o_proj.SCB": "pytorch_model-00001-of-00002.bin",
355
+ "model.layers.28.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
356
+ "model.layers.28.self_attn.q_proj.SCB": "pytorch_model-00001-of-00002.bin",
357
+ "model.layers.28.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
358
+ "model.layers.28.self_attn.v_proj.SCB": "pytorch_model-00001-of-00002.bin",
359
+ "model.layers.28.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
360
+ "model.layers.29.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
361
+ "model.layers.29.mlp.down_proj.SCB": "pytorch_model-00001-of-00002.bin",
362
+ "model.layers.29.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
363
+ "model.layers.29.mlp.gate_proj.SCB": "pytorch_model-00001-of-00002.bin",
364
+ "model.layers.29.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
365
+ "model.layers.29.mlp.up_proj.SCB": "pytorch_model-00001-of-00002.bin",
366
+ "model.layers.29.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
367
+ "model.layers.29.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
368
+ "model.layers.29.self_attn.k_proj.SCB": "pytorch_model-00001-of-00002.bin",
369
+ "model.layers.29.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
370
+ "model.layers.29.self_attn.o_proj.SCB": "pytorch_model-00001-of-00002.bin",
371
+ "model.layers.29.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
372
+ "model.layers.29.self_attn.q_proj.SCB": "pytorch_model-00001-of-00002.bin",
373
+ "model.layers.29.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
374
+ "model.layers.29.self_attn.v_proj.SCB": "pytorch_model-00001-of-00002.bin",
375
+ "model.layers.29.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
376
+ "model.layers.3.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
377
+ "model.layers.3.mlp.down_proj.SCB": "pytorch_model-00001-of-00002.bin",
378
+ "model.layers.3.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
379
+ "model.layers.3.mlp.gate_proj.SCB": "pytorch_model-00001-of-00002.bin",
380
+ "model.layers.3.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
381
+ "model.layers.3.mlp.up_proj.SCB": "pytorch_model-00001-of-00002.bin",
382
+ "model.layers.3.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
383
+ "model.layers.3.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
384
+ "model.layers.3.self_attn.k_proj.SCB": "pytorch_model-00001-of-00002.bin",
385
+ "model.layers.3.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
386
+ "model.layers.3.self_attn.o_proj.SCB": "pytorch_model-00001-of-00002.bin",
387
+ "model.layers.3.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
388
+ "model.layers.3.self_attn.q_proj.SCB": "pytorch_model-00001-of-00002.bin",
389
+ "model.layers.3.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
390
+ "model.layers.3.self_attn.v_proj.SCB": "pytorch_model-00001-of-00002.bin",
391
+ "model.layers.3.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
392
+ "model.layers.30.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
393
+ "model.layers.30.mlp.down_proj.SCB": "pytorch_model-00001-of-00002.bin",
394
+ "model.layers.30.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
395
+ "model.layers.30.mlp.gate_proj.SCB": "pytorch_model-00001-of-00002.bin",
396
+ "model.layers.30.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
397
+ "model.layers.30.mlp.up_proj.SCB": "pytorch_model-00001-of-00002.bin",
398
+ "model.layers.30.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
399
+ "model.layers.30.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
400
+ "model.layers.30.self_attn.k_proj.SCB": "pytorch_model-00001-of-00002.bin",
401
+ "model.layers.30.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
402
+ "model.layers.30.self_attn.o_proj.SCB": "pytorch_model-00001-of-00002.bin",
403
+ "model.layers.30.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
404
+ "model.layers.30.self_attn.q_proj.SCB": "pytorch_model-00001-of-00002.bin",
405
+ "model.layers.30.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
406
+ "model.layers.30.self_attn.v_proj.SCB": "pytorch_model-00001-of-00002.bin",
407
+ "model.layers.30.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
408
+ "model.layers.31.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
409
+ "model.layers.31.mlp.down_proj.SCB": "pytorch_model-00001-of-00002.bin",
410
+ "model.layers.31.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
411
+ "model.layers.31.mlp.gate_proj.SCB": "pytorch_model-00001-of-00002.bin",
412
+ "model.layers.31.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
413
+ "model.layers.31.mlp.up_proj.SCB": "pytorch_model-00001-of-00002.bin",
414
+ "model.layers.31.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
415
+ "model.layers.31.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
416
+ "model.layers.31.self_attn.k_proj.SCB": "pytorch_model-00001-of-00002.bin",
417
+ "model.layers.31.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
418
+ "model.layers.31.self_attn.o_proj.SCB": "pytorch_model-00001-of-00002.bin",
419
+ "model.layers.31.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
420
+ "model.layers.31.self_attn.q_proj.SCB": "pytorch_model-00001-of-00002.bin",
421
+ "model.layers.31.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
422
+ "model.layers.31.self_attn.v_proj.SCB": "pytorch_model-00001-of-00002.bin",
423
+ "model.layers.31.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
424
+ "model.layers.32.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
425
+ "model.layers.32.mlp.down_proj.SCB": "pytorch_model-00001-of-00002.bin",
426
+ "model.layers.32.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
427
+ "model.layers.32.mlp.gate_proj.SCB": "pytorch_model-00001-of-00002.bin",
428
+ "model.layers.32.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
429
+ "model.layers.32.mlp.up_proj.SCB": "pytorch_model-00001-of-00002.bin",
430
+ "model.layers.32.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
431
+ "model.layers.32.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
432
+ "model.layers.32.self_attn.k_proj.SCB": "pytorch_model-00001-of-00002.bin",
433
+ "model.layers.32.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
434
+ "model.layers.32.self_attn.o_proj.SCB": "pytorch_model-00001-of-00002.bin",
435
+ "model.layers.32.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
436
+ "model.layers.32.self_attn.q_proj.SCB": "pytorch_model-00001-of-00002.bin",
437
+ "model.layers.32.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
438
+ "model.layers.32.self_attn.v_proj.SCB": "pytorch_model-00001-of-00002.bin",
439
+ "model.layers.32.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
440
+ "model.layers.33.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
441
+ "model.layers.33.mlp.down_proj.SCB": "pytorch_model-00001-of-00002.bin",
442
+ "model.layers.33.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
443
+ "model.layers.33.mlp.gate_proj.SCB": "pytorch_model-00001-of-00002.bin",
444
+ "model.layers.33.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
445
+ "model.layers.33.mlp.up_proj.SCB": "pytorch_model-00001-of-00002.bin",
446
+ "model.layers.33.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
447
+ "model.layers.33.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
448
+ "model.layers.33.self_attn.k_proj.SCB": "pytorch_model-00001-of-00002.bin",
449
+ "model.layers.33.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
450
+ "model.layers.33.self_attn.o_proj.SCB": "pytorch_model-00001-of-00002.bin",
451
+ "model.layers.33.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
452
+ "model.layers.33.self_attn.q_proj.SCB": "pytorch_model-00001-of-00002.bin",
453
+ "model.layers.33.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
454
+ "model.layers.33.self_attn.v_proj.SCB": "pytorch_model-00001-of-00002.bin",
455
+ "model.layers.33.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
456
+ "model.layers.34.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
457
+ "model.layers.34.mlp.down_proj.SCB": "pytorch_model-00001-of-00002.bin",
458
+ "model.layers.34.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
459
+ "model.layers.34.mlp.gate_proj.SCB": "pytorch_model-00001-of-00002.bin",
460
+ "model.layers.34.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
461
+ "model.layers.34.mlp.up_proj.SCB": "pytorch_model-00001-of-00002.bin",
462
+ "model.layers.34.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
463
+ "model.layers.34.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
464
+ "model.layers.34.self_attn.k_proj.SCB": "pytorch_model-00001-of-00002.bin",
465
+ "model.layers.34.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
466
+ "model.layers.34.self_attn.o_proj.SCB": "pytorch_model-00001-of-00002.bin",
467
+ "model.layers.34.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
468
+ "model.layers.34.self_attn.q_proj.SCB": "pytorch_model-00001-of-00002.bin",
469
+ "model.layers.34.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
470
+ "model.layers.34.self_attn.v_proj.SCB": "pytorch_model-00001-of-00002.bin",
471
+ "model.layers.34.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
472
+ "model.layers.35.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
473
+ "model.layers.35.mlp.down_proj.SCB": "pytorch_model-00001-of-00002.bin",
474
+ "model.layers.35.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
475
+ "model.layers.35.mlp.gate_proj.SCB": "pytorch_model-00001-of-00002.bin",
476
+ "model.layers.35.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
477
+ "model.layers.35.mlp.up_proj.SCB": "pytorch_model-00001-of-00002.bin",
478
+ "model.layers.35.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
479
+ "model.layers.35.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
480
+ "model.layers.35.self_attn.k_proj.SCB": "pytorch_model-00001-of-00002.bin",
481
+ "model.layers.35.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
482
+ "model.layers.35.self_attn.o_proj.SCB": "pytorch_model-00001-of-00002.bin",
483
+ "model.layers.35.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
484
+ "model.layers.35.self_attn.q_proj.SCB": "pytorch_model-00001-of-00002.bin",
485
+ "model.layers.35.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
486
+ "model.layers.35.self_attn.v_proj.SCB": "pytorch_model-00001-of-00002.bin",
487
+ "model.layers.35.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
488
+ "model.layers.36.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
489
+ "model.layers.36.mlp.down_proj.SCB": "pytorch_model-00002-of-00002.bin",
490
+ "model.layers.36.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
491
+ "model.layers.36.mlp.gate_proj.SCB": "pytorch_model-00001-of-00002.bin",
492
+ "model.layers.36.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
493
+ "model.layers.36.mlp.up_proj.SCB": "pytorch_model-00001-of-00002.bin",
494
+ "model.layers.36.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
495
+ "model.layers.36.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
496
+ "model.layers.36.self_attn.k_proj.SCB": "pytorch_model-00001-of-00002.bin",
497
+ "model.layers.36.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
498
+ "model.layers.36.self_attn.o_proj.SCB": "pytorch_model-00001-of-00002.bin",
499
+ "model.layers.36.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
500
+ "model.layers.36.self_attn.q_proj.SCB": "pytorch_model-00001-of-00002.bin",
501
+ "model.layers.36.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
502
+ "model.layers.36.self_attn.v_proj.SCB": "pytorch_model-00001-of-00002.bin",
503
+ "model.layers.36.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
504
+ "model.layers.37.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
505
+ "model.layers.37.mlp.down_proj.SCB": "pytorch_model-00002-of-00002.bin",
506
+ "model.layers.37.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
507
+ "model.layers.37.mlp.gate_proj.SCB": "pytorch_model-00002-of-00002.bin",
508
+ "model.layers.37.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
509
+ "model.layers.37.mlp.up_proj.SCB": "pytorch_model-00002-of-00002.bin",
510
+ "model.layers.37.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
511
+ "model.layers.37.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
512
+ "model.layers.37.self_attn.k_proj.SCB": "pytorch_model-00002-of-00002.bin",
513
+ "model.layers.37.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
514
+ "model.layers.37.self_attn.o_proj.SCB": "pytorch_model-00002-of-00002.bin",
515
+ "model.layers.37.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
516
+ "model.layers.37.self_attn.q_proj.SCB": "pytorch_model-00002-of-00002.bin",
517
+ "model.layers.37.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
518
+ "model.layers.37.self_attn.v_proj.SCB": "pytorch_model-00002-of-00002.bin",
519
+ "model.layers.37.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
520
+ "model.layers.38.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
521
+ "model.layers.38.mlp.down_proj.SCB": "pytorch_model-00002-of-00002.bin",
522
+ "model.layers.38.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
523
+ "model.layers.38.mlp.gate_proj.SCB": "pytorch_model-00002-of-00002.bin",
524
+ "model.layers.38.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
525
+ "model.layers.38.mlp.up_proj.SCB": "pytorch_model-00002-of-00002.bin",
526
+ "model.layers.38.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
527
+ "model.layers.38.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
528
+ "model.layers.38.self_attn.k_proj.SCB": "pytorch_model-00002-of-00002.bin",
529
+ "model.layers.38.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
530
+ "model.layers.38.self_attn.o_proj.SCB": "pytorch_model-00002-of-00002.bin",
531
+ "model.layers.38.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
532
+ "model.layers.38.self_attn.q_proj.SCB": "pytorch_model-00002-of-00002.bin",
533
+ "model.layers.38.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
534
+ "model.layers.38.self_attn.v_proj.SCB": "pytorch_model-00002-of-00002.bin",
535
+ "model.layers.38.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
536
+ "model.layers.39.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
537
+ "model.layers.39.mlp.down_proj.SCB": "pytorch_model-00002-of-00002.bin",
538
+ "model.layers.39.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
539
+ "model.layers.39.mlp.gate_proj.SCB": "pytorch_model-00002-of-00002.bin",
540
+ "model.layers.39.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
541
+ "model.layers.39.mlp.up_proj.SCB": "pytorch_model-00002-of-00002.bin",
542
+ "model.layers.39.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
543
+ "model.layers.39.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
544
+ "model.layers.39.self_attn.k_proj.SCB": "pytorch_model-00002-of-00002.bin",
545
+ "model.layers.39.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
546
+ "model.layers.39.self_attn.o_proj.SCB": "pytorch_model-00002-of-00002.bin",
547
+ "model.layers.39.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
548
+ "model.layers.39.self_attn.q_proj.SCB": "pytorch_model-00002-of-00002.bin",
549
+ "model.layers.39.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
550
+ "model.layers.39.self_attn.v_proj.SCB": "pytorch_model-00002-of-00002.bin",
551
+ "model.layers.39.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
552
+ "model.layers.4.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
553
+ "model.layers.4.mlp.down_proj.SCB": "pytorch_model-00001-of-00002.bin",
554
+ "model.layers.4.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
555
+ "model.layers.4.mlp.gate_proj.SCB": "pytorch_model-00001-of-00002.bin",
556
+ "model.layers.4.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
557
+ "model.layers.4.mlp.up_proj.SCB": "pytorch_model-00001-of-00002.bin",
558
+ "model.layers.4.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
559
+ "model.layers.4.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
560
+ "model.layers.4.self_attn.k_proj.SCB": "pytorch_model-00001-of-00002.bin",
561
+ "model.layers.4.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
562
+ "model.layers.4.self_attn.o_proj.SCB": "pytorch_model-00001-of-00002.bin",
563
+ "model.layers.4.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
564
+ "model.layers.4.self_attn.q_proj.SCB": "pytorch_model-00001-of-00002.bin",
565
+ "model.layers.4.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
566
+ "model.layers.4.self_attn.v_proj.SCB": "pytorch_model-00001-of-00002.bin",
567
+ "model.layers.4.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
568
+ "model.layers.40.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
569
+ "model.layers.40.mlp.down_proj.SCB": "pytorch_model-00002-of-00002.bin",
570
+ "model.layers.40.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
571
+ "model.layers.40.mlp.gate_proj.SCB": "pytorch_model-00002-of-00002.bin",
572
+ "model.layers.40.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
573
+ "model.layers.40.mlp.up_proj.SCB": "pytorch_model-00002-of-00002.bin",
574
+ "model.layers.40.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
575
+ "model.layers.40.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
576
+ "model.layers.40.self_attn.k_proj.SCB": "pytorch_model-00002-of-00002.bin",
577
+ "model.layers.40.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
578
+ "model.layers.40.self_attn.o_proj.SCB": "pytorch_model-00002-of-00002.bin",
579
+ "model.layers.40.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
580
+ "model.layers.40.self_attn.q_proj.SCB": "pytorch_model-00002-of-00002.bin",
581
+ "model.layers.40.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
582
+ "model.layers.40.self_attn.v_proj.SCB": "pytorch_model-00002-of-00002.bin",
583
+ "model.layers.40.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
584
+ "model.layers.41.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
585
+ "model.layers.41.mlp.down_proj.SCB": "pytorch_model-00002-of-00002.bin",
586
+ "model.layers.41.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
587
+ "model.layers.41.mlp.gate_proj.SCB": "pytorch_model-00002-of-00002.bin",
588
+ "model.layers.41.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
589
+ "model.layers.41.mlp.up_proj.SCB": "pytorch_model-00002-of-00002.bin",
590
+ "model.layers.41.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
591
+ "model.layers.41.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
592
+ "model.layers.41.self_attn.k_proj.SCB": "pytorch_model-00002-of-00002.bin",
593
+ "model.layers.41.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
594
+ "model.layers.41.self_attn.o_proj.SCB": "pytorch_model-00002-of-00002.bin",
595
+ "model.layers.41.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
596
+ "model.layers.41.self_attn.q_proj.SCB": "pytorch_model-00002-of-00002.bin",
597
+ "model.layers.41.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
598
+ "model.layers.41.self_attn.v_proj.SCB": "pytorch_model-00002-of-00002.bin",
599
+ "model.layers.41.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
600
+ "model.layers.42.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
601
+ "model.layers.42.mlp.down_proj.SCB": "pytorch_model-00002-of-00002.bin",
602
+ "model.layers.42.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
603
+ "model.layers.42.mlp.gate_proj.SCB": "pytorch_model-00002-of-00002.bin",
604
+ "model.layers.42.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
605
+ "model.layers.42.mlp.up_proj.SCB": "pytorch_model-00002-of-00002.bin",
606
+ "model.layers.42.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
607
+ "model.layers.42.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
608
+ "model.layers.42.self_attn.k_proj.SCB": "pytorch_model-00002-of-00002.bin",
609
+ "model.layers.42.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
610
+ "model.layers.42.self_attn.o_proj.SCB": "pytorch_model-00002-of-00002.bin",
611
+ "model.layers.42.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
612
+ "model.layers.42.self_attn.q_proj.SCB": "pytorch_model-00002-of-00002.bin",
613
+ "model.layers.42.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
614
+ "model.layers.42.self_attn.v_proj.SCB": "pytorch_model-00002-of-00002.bin",
615
+ "model.layers.42.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
616
+ "model.layers.43.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
617
+ "model.layers.43.mlp.down_proj.SCB": "pytorch_model-00002-of-00002.bin",
618
+ "model.layers.43.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
619
+ "model.layers.43.mlp.gate_proj.SCB": "pytorch_model-00002-of-00002.bin",
620
+ "model.layers.43.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
621
+ "model.layers.43.mlp.up_proj.SCB": "pytorch_model-00002-of-00002.bin",
622
+ "model.layers.43.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
623
+ "model.layers.43.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
624
+ "model.layers.43.self_attn.k_proj.SCB": "pytorch_model-00002-of-00002.bin",
625
+ "model.layers.43.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
626
+ "model.layers.43.self_attn.o_proj.SCB": "pytorch_model-00002-of-00002.bin",
627
+ "model.layers.43.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
628
+ "model.layers.43.self_attn.q_proj.SCB": "pytorch_model-00002-of-00002.bin",
629
+ "model.layers.43.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
630
+ "model.layers.43.self_attn.v_proj.SCB": "pytorch_model-00002-of-00002.bin",
631
+ "model.layers.43.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
632
+ "model.layers.44.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
633
+ "model.layers.44.mlp.down_proj.SCB": "pytorch_model-00002-of-00002.bin",
634
+ "model.layers.44.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
635
+ "model.layers.44.mlp.gate_proj.SCB": "pytorch_model-00002-of-00002.bin",
636
+ "model.layers.44.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
637
+ "model.layers.44.mlp.up_proj.SCB": "pytorch_model-00002-of-00002.bin",
638
+ "model.layers.44.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
639
+ "model.layers.44.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
640
+ "model.layers.44.self_attn.k_proj.SCB": "pytorch_model-00002-of-00002.bin",
641
+ "model.layers.44.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
642
+ "model.layers.44.self_attn.o_proj.SCB": "pytorch_model-00002-of-00002.bin",
643
+ "model.layers.44.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
644
+ "model.layers.44.self_attn.q_proj.SCB": "pytorch_model-00002-of-00002.bin",
645
+ "model.layers.44.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
646
+ "model.layers.44.self_attn.v_proj.SCB": "pytorch_model-00002-of-00002.bin",
647
+ "model.layers.44.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
648
+ "model.layers.45.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
649
+ "model.layers.45.mlp.down_proj.SCB": "pytorch_model-00002-of-00002.bin",
650
+ "model.layers.45.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
651
+ "model.layers.45.mlp.gate_proj.SCB": "pytorch_model-00002-of-00002.bin",
652
+ "model.layers.45.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
653
+ "model.layers.45.mlp.up_proj.SCB": "pytorch_model-00002-of-00002.bin",
654
+ "model.layers.45.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
655
+ "model.layers.45.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
656
+ "model.layers.45.self_attn.k_proj.SCB": "pytorch_model-00002-of-00002.bin",
657
+ "model.layers.45.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
658
+ "model.layers.45.self_attn.o_proj.SCB": "pytorch_model-00002-of-00002.bin",
659
+ "model.layers.45.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
660
+ "model.layers.45.self_attn.q_proj.SCB": "pytorch_model-00002-of-00002.bin",
661
+ "model.layers.45.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
662
+ "model.layers.45.self_attn.v_proj.SCB": "pytorch_model-00002-of-00002.bin",
663
+ "model.layers.45.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
664
+ "model.layers.46.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
665
+ "model.layers.46.mlp.down_proj.SCB": "pytorch_model-00002-of-00002.bin",
666
+ "model.layers.46.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
667
+ "model.layers.46.mlp.gate_proj.SCB": "pytorch_model-00002-of-00002.bin",
668
+ "model.layers.46.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
669
+ "model.layers.46.mlp.up_proj.SCB": "pytorch_model-00002-of-00002.bin",
670
+ "model.layers.46.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
671
+ "model.layers.46.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
672
+ "model.layers.46.self_attn.k_proj.SCB": "pytorch_model-00002-of-00002.bin",
673
+ "model.layers.46.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
674
+ "model.layers.46.self_attn.o_proj.SCB": "pytorch_model-00002-of-00002.bin",
675
+ "model.layers.46.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
676
+ "model.layers.46.self_attn.q_proj.SCB": "pytorch_model-00002-of-00002.bin",
677
+ "model.layers.46.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
678
+ "model.layers.46.self_attn.v_proj.SCB": "pytorch_model-00002-of-00002.bin",
679
+ "model.layers.46.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
680
+ "model.layers.47.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
681
+ "model.layers.47.mlp.down_proj.SCB": "pytorch_model-00002-of-00002.bin",
682
+ "model.layers.47.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
683
+ "model.layers.47.mlp.gate_proj.SCB": "pytorch_model-00002-of-00002.bin",
684
+ "model.layers.47.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
685
+ "model.layers.47.mlp.up_proj.SCB": "pytorch_model-00002-of-00002.bin",
686
+ "model.layers.47.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
687
+ "model.layers.47.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
688
+ "model.layers.47.self_attn.k_proj.SCB": "pytorch_model-00002-of-00002.bin",
689
+ "model.layers.47.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
690
+ "model.layers.47.self_attn.o_proj.SCB": "pytorch_model-00002-of-00002.bin",
691
+ "model.layers.47.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
692
+ "model.layers.47.self_attn.q_proj.SCB": "pytorch_model-00002-of-00002.bin",
693
+ "model.layers.47.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
694
+ "model.layers.47.self_attn.v_proj.SCB": "pytorch_model-00002-of-00002.bin",
695
+ "model.layers.47.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
696
+ "model.layers.48.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
697
+ "model.layers.48.mlp.down_proj.SCB": "pytorch_model-00002-of-00002.bin",
698
+ "model.layers.48.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
699
+ "model.layers.48.mlp.gate_proj.SCB": "pytorch_model-00002-of-00002.bin",
700
+ "model.layers.48.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
701
+ "model.layers.48.mlp.up_proj.SCB": "pytorch_model-00002-of-00002.bin",
702
+ "model.layers.48.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
703
+ "model.layers.48.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
704
+ "model.layers.48.self_attn.k_proj.SCB": "pytorch_model-00002-of-00002.bin",
705
+ "model.layers.48.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
706
+ "model.layers.48.self_attn.o_proj.SCB": "pytorch_model-00002-of-00002.bin",
707
+ "model.layers.48.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
708
+ "model.layers.48.self_attn.q_proj.SCB": "pytorch_model-00002-of-00002.bin",
709
+ "model.layers.48.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
710
+ "model.layers.48.self_attn.v_proj.SCB": "pytorch_model-00002-of-00002.bin",
711
+ "model.layers.48.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
712
+ "model.layers.49.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
713
+ "model.layers.49.mlp.down_proj.SCB": "pytorch_model-00002-of-00002.bin",
714
+ "model.layers.49.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
715
+ "model.layers.49.mlp.gate_proj.SCB": "pytorch_model-00002-of-00002.bin",
716
+ "model.layers.49.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
717
+ "model.layers.49.mlp.up_proj.SCB": "pytorch_model-00002-of-00002.bin",
718
+ "model.layers.49.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
719
+ "model.layers.49.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
720
+ "model.layers.49.self_attn.k_proj.SCB": "pytorch_model-00002-of-00002.bin",
721
+ "model.layers.49.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
722
+ "model.layers.49.self_attn.o_proj.SCB": "pytorch_model-00002-of-00002.bin",
723
+ "model.layers.49.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
724
+ "model.layers.49.self_attn.q_proj.SCB": "pytorch_model-00002-of-00002.bin",
725
+ "model.layers.49.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
726
+ "model.layers.49.self_attn.v_proj.SCB": "pytorch_model-00002-of-00002.bin",
727
+ "model.layers.49.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
728
+ "model.layers.5.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
729
+ "model.layers.5.mlp.down_proj.SCB": "pytorch_model-00001-of-00002.bin",
730
+ "model.layers.5.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
731
+ "model.layers.5.mlp.gate_proj.SCB": "pytorch_model-00001-of-00002.bin",
732
+ "model.layers.5.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
733
+ "model.layers.5.mlp.up_proj.SCB": "pytorch_model-00001-of-00002.bin",
734
+ "model.layers.5.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
735
+ "model.layers.5.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
736
+ "model.layers.5.self_attn.k_proj.SCB": "pytorch_model-00001-of-00002.bin",
737
+ "model.layers.5.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
738
+ "model.layers.5.self_attn.o_proj.SCB": "pytorch_model-00001-of-00002.bin",
739
+ "model.layers.5.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
740
+ "model.layers.5.self_attn.q_proj.SCB": "pytorch_model-00001-of-00002.bin",
741
+ "model.layers.5.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
742
+ "model.layers.5.self_attn.v_proj.SCB": "pytorch_model-00001-of-00002.bin",
743
+ "model.layers.5.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
744
+ "model.layers.50.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
745
+ "model.layers.50.mlp.down_proj.SCB": "pytorch_model-00002-of-00002.bin",
746
+ "model.layers.50.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
747
+ "model.layers.50.mlp.gate_proj.SCB": "pytorch_model-00002-of-00002.bin",
748
+ "model.layers.50.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
749
+ "model.layers.50.mlp.up_proj.SCB": "pytorch_model-00002-of-00002.bin",
750
+ "model.layers.50.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
751
+ "model.layers.50.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
752
+ "model.layers.50.self_attn.k_proj.SCB": "pytorch_model-00002-of-00002.bin",
753
+ "model.layers.50.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
754
+ "model.layers.50.self_attn.o_proj.SCB": "pytorch_model-00002-of-00002.bin",
755
+ "model.layers.50.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
756
+ "model.layers.50.self_attn.q_proj.SCB": "pytorch_model-00002-of-00002.bin",
757
+ "model.layers.50.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
758
+ "model.layers.50.self_attn.v_proj.SCB": "pytorch_model-00002-of-00002.bin",
759
+ "model.layers.50.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
760
+ "model.layers.51.input_layernorm.weight": "pytorch_model-00002-of-00002.bin",
761
+ "model.layers.51.mlp.down_proj.SCB": "pytorch_model-00002-of-00002.bin",
762
+ "model.layers.51.mlp.down_proj.weight": "pytorch_model-00002-of-00002.bin",
763
+ "model.layers.51.mlp.gate_proj.SCB": "pytorch_model-00002-of-00002.bin",
764
+ "model.layers.51.mlp.gate_proj.weight": "pytorch_model-00002-of-00002.bin",
765
+ "model.layers.51.mlp.up_proj.SCB": "pytorch_model-00002-of-00002.bin",
766
+ "model.layers.51.mlp.up_proj.weight": "pytorch_model-00002-of-00002.bin",
767
+ "model.layers.51.post_attention_layernorm.weight": "pytorch_model-00002-of-00002.bin",
768
+ "model.layers.51.self_attn.k_proj.SCB": "pytorch_model-00002-of-00002.bin",
769
+ "model.layers.51.self_attn.k_proj.weight": "pytorch_model-00002-of-00002.bin",
770
+ "model.layers.51.self_attn.o_proj.SCB": "pytorch_model-00002-of-00002.bin",
771
+ "model.layers.51.self_attn.o_proj.weight": "pytorch_model-00002-of-00002.bin",
772
+ "model.layers.51.self_attn.q_proj.SCB": "pytorch_model-00002-of-00002.bin",
773
+ "model.layers.51.self_attn.q_proj.weight": "pytorch_model-00002-of-00002.bin",
774
+ "model.layers.51.self_attn.v_proj.SCB": "pytorch_model-00002-of-00002.bin",
775
+ "model.layers.51.self_attn.v_proj.weight": "pytorch_model-00002-of-00002.bin",
776
+ "model.layers.6.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
777
+ "model.layers.6.mlp.down_proj.SCB": "pytorch_model-00001-of-00002.bin",
778
+ "model.layers.6.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
779
+ "model.layers.6.mlp.gate_proj.SCB": "pytorch_model-00001-of-00002.bin",
780
+ "model.layers.6.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
781
+ "model.layers.6.mlp.up_proj.SCB": "pytorch_model-00001-of-00002.bin",
782
+ "model.layers.6.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
783
+ "model.layers.6.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
784
+ "model.layers.6.self_attn.k_proj.SCB": "pytorch_model-00001-of-00002.bin",
785
+ "model.layers.6.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
786
+ "model.layers.6.self_attn.o_proj.SCB": "pytorch_model-00001-of-00002.bin",
787
+ "model.layers.6.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
788
+ "model.layers.6.self_attn.q_proj.SCB": "pytorch_model-00001-of-00002.bin",
789
+ "model.layers.6.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
790
+ "model.layers.6.self_attn.v_proj.SCB": "pytorch_model-00001-of-00002.bin",
791
+ "model.layers.6.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
792
+ "model.layers.7.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
793
+ "model.layers.7.mlp.down_proj.SCB": "pytorch_model-00001-of-00002.bin",
794
+ "model.layers.7.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
795
+ "model.layers.7.mlp.gate_proj.SCB": "pytorch_model-00001-of-00002.bin",
796
+ "model.layers.7.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
797
+ "model.layers.7.mlp.up_proj.SCB": "pytorch_model-00001-of-00002.bin",
798
+ "model.layers.7.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
799
+ "model.layers.7.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
800
+ "model.layers.7.self_attn.k_proj.SCB": "pytorch_model-00001-of-00002.bin",
801
+ "model.layers.7.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
802
+ "model.layers.7.self_attn.o_proj.SCB": "pytorch_model-00001-of-00002.bin",
803
+ "model.layers.7.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
804
+ "model.layers.7.self_attn.q_proj.SCB": "pytorch_model-00001-of-00002.bin",
805
+ "model.layers.7.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
806
+ "model.layers.7.self_attn.v_proj.SCB": "pytorch_model-00001-of-00002.bin",
807
+ "model.layers.7.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
808
+ "model.layers.8.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
809
+ "model.layers.8.mlp.down_proj.SCB": "pytorch_model-00001-of-00002.bin",
810
+ "model.layers.8.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
811
+ "model.layers.8.mlp.gate_proj.SCB": "pytorch_model-00001-of-00002.bin",
812
+ "model.layers.8.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
813
+ "model.layers.8.mlp.up_proj.SCB": "pytorch_model-00001-of-00002.bin",
814
+ "model.layers.8.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
815
+ "model.layers.8.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
816
+ "model.layers.8.self_attn.k_proj.SCB": "pytorch_model-00001-of-00002.bin",
817
+ "model.layers.8.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
818
+ "model.layers.8.self_attn.o_proj.SCB": "pytorch_model-00001-of-00002.bin",
819
+ "model.layers.8.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
820
+ "model.layers.8.self_attn.q_proj.SCB": "pytorch_model-00001-of-00002.bin",
821
+ "model.layers.8.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
822
+ "model.layers.8.self_attn.v_proj.SCB": "pytorch_model-00001-of-00002.bin",
823
+ "model.layers.8.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
824
+ "model.layers.9.input_layernorm.weight": "pytorch_model-00001-of-00002.bin",
825
+ "model.layers.9.mlp.down_proj.SCB": "pytorch_model-00001-of-00002.bin",
826
+ "model.layers.9.mlp.down_proj.weight": "pytorch_model-00001-of-00002.bin",
827
+ "model.layers.9.mlp.gate_proj.SCB": "pytorch_model-00001-of-00002.bin",
828
+ "model.layers.9.mlp.gate_proj.weight": "pytorch_model-00001-of-00002.bin",
829
+ "model.layers.9.mlp.up_proj.SCB": "pytorch_model-00001-of-00002.bin",
830
+ "model.layers.9.mlp.up_proj.weight": "pytorch_model-00001-of-00002.bin",
831
+ "model.layers.9.post_attention_layernorm.weight": "pytorch_model-00001-of-00002.bin",
832
+ "model.layers.9.self_attn.k_proj.SCB": "pytorch_model-00001-of-00002.bin",
833
+ "model.layers.9.self_attn.k_proj.weight": "pytorch_model-00001-of-00002.bin",
834
+ "model.layers.9.self_attn.o_proj.SCB": "pytorch_model-00001-of-00002.bin",
835
+ "model.layers.9.self_attn.o_proj.weight": "pytorch_model-00001-of-00002.bin",
836
+ "model.layers.9.self_attn.q_proj.SCB": "pytorch_model-00001-of-00002.bin",
837
+ "model.layers.9.self_attn.q_proj.weight": "pytorch_model-00001-of-00002.bin",
838
+ "model.layers.9.self_attn.v_proj.SCB": "pytorch_model-00001-of-00002.bin",
839
+ "model.layers.9.self_attn.v_proj.weight": "pytorch_model-00001-of-00002.bin",
840
+ "model.norm.weight": "pytorch_model-00002-of-00002.bin"
841
+ }
842
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "unk_token": {
17
+ "content": "<unk>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
tokenization_skywork.py ADDED
@@ -0,0 +1,267 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+
21
+ """Tokenization classes for Skywork."""
22
+ import os
23
+ from shutil import copyfile
24
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
25
+
26
+ import sentencepiece as spm
27
+
28
+ from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
29
+ from transformers.utils import logging
30
+
31
+ if TYPE_CHECKING:
32
+ from transformers.pipelines.conversational import Conversation
33
+
34
+ logger = logging.get_logger(__name__)
35
+
36
# Maps the tokenizer's vocab-file argument name to the file name stored on disk.
VOCAB_FILES_NAMES = dict(vocab_file="tokenizer.model")

# U+2581 (lower one-eighth block). NOTE(review): the name suggests this is the
# SentencePiece word-boundary marker -- confirm against its uses in this file.
SPIECE_UNDERLINE = "▁"

# Opening/closing delimiters for instruction and system segments of a prompt.
# NOTE(review): presumably consumed by chat-prompt construction -- confirm.
B_INST = "[INST]"
E_INST = "[/INST]"
B_SYS = "<<SYS>>\n"
E_SYS = "\n<</SYS>>\n\n"
43
+
44
+ DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure\
45
+ that your responses are socially unbiased and positive in nature.
46
+
47
+ If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""
48
+
49
+ class SkyworkTokenizer(PreTrainedTokenizer):
50
+
51
+ vocab_files_names = VOCAB_FILES_NAMES
52
+ # pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
53
+ # max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
54
+ model_input_names = ["input_ids", "attention_mask"]
55
+
56
def __init__(
    self,
    vocab_file,
    unk_token="<unk>",
    bos_token="<s>",
    eos_token="</s>",
    pad_token=None,
    sp_model_kwargs: Optional[Dict[str, Any]] = None,
    add_bos_token=True,
    add_eos_token=False,
    clean_up_tokenization_spaces=False,
    legacy=True,
    **kwargs,
):
    """Build the tokenizer from a SentencePiece model file.

    Args:
        vocab_file: Path handed to ``SentencePieceProcessor.Load``.
        unk_token, bos_token, eos_token, pad_token: Special tokens. Plain
            strings are wrapped in ``AddedToken`` with ``lstrip`` and
            ``rstrip`` disabled; any other value is passed through as is.
        sp_model_kwargs: Keyword arguments for the ``SentencePieceProcessor``
            constructor; ``None`` means no extra arguments.
        add_bos_token, add_eos_token: Stored on the instance and forwarded
            to the base-class constructor.
        clean_up_tokenization_spaces: Forwarded to the base-class constructor.
        legacy: Stored on the instance and forwarded to the base-class
            constructor; a truthy value also logs a notice through
            ``logger.warning_once``.
        **kwargs: Forwarded unchanged to the base-class constructor.
    """

    def _as_added_token(token):
        # Only bare strings are wrapped; anything else (including None) passes through.
        if isinstance(token, str):
            return AddedToken(token, lstrip=False, rstrip=False)
        return token

    if sp_model_kwargs is None:
        sp_model_kwargs = {}
    self.sp_model_kwargs = sp_model_kwargs

    bos_token = _as_added_token(bos_token)
    eos_token = _as_added_token(eos_token)
    unk_token = _as_added_token(unk_token)
    pad_token = _as_added_token(pad_token)

    self.legacy = legacy
    self.vocab_file = vocab_file
    self.add_bos_token = add_bos_token
    self.add_eos_token = add_eos_token

    # The SentencePiece model is loaded before the base constructor runs.
    # NOTE(review): the base class may query the vocabulary while it
    # initialises -- confirm before moving this below super().__init__().
    self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
    self.sp_model.Load(vocab_file)

    super().__init__(
        bos_token=bos_token,
        eos_token=eos_token,
        unk_token=unk_token,
        pad_token=pad_token,
        add_bos_token=add_bos_token,
        add_eos_token=add_eos_token,
        sp_model_kwargs=self.sp_model_kwargs,
        clean_up_tokenization_spaces=clean_up_tokenization_spaces,
        legacy=legacy,
        **kwargs,
    )

    if not legacy:
        return
    logger.warning_once(
        f"You are using the legacy behaviour of the {self.__class__}. This means that tokens that come after special tokens will not be properly handled. "
    )
97
+
98
+
99
def __getstate__(self):
    """Return a copy of the instance state suitable for pickling.

    The live `sp_model` entry is replaced by `None`, and the processor's
    serialized model proto is stored under `sp_model_proto` so that
    `__setstate__` can rebuild the processor from it.
    """
    return {
        **self.__dict__,
        "sp_model": None,
        "sp_model_proto": self.sp_model.serialized_model_proto(),
    }
104
+
105
def __setstate__(self, d):
    """Restore the instance from a state dict produced by `__getstate__`.

    `d` becomes the instance `__dict__` itself (it is not copied). A fresh
    SentencePiece processor is then built with the stored `sp_model_kwargs`
    and loaded from the serialized proto kept in `sp_model_proto`.
    """
    self.__dict__ = d
    processor_kwargs = self.sp_model_kwargs
    self.sp_model = spm.SentencePieceProcessor(**processor_kwargs)
    self.sp_model.LoadFromSerializedProto(self.sp_model_proto)
109
+
110
@property
def vocab_size(self):
    """Number of pieces reported by the loaded SentencePiece model."""
    piece_count = self.sp_model.get_piece_size()
    return piece_count
114
+
115
def get_vocab(self):
    """Return the vocabulary as a `{token: id}` dict.

    Ids `0 .. vocab_size - 1` are mapped through `convert_ids_to_tokens`;
    the entries of `added_tokens_encoder` are applied afterwards, so they
    override a base entry with the same token string.
    """
    vocab = {}
    for token_id in range(self.vocab_size):
        vocab[self.convert_ids_to_tokens(token_id)] = token_id
    vocab.update(self.added_tokens_encoder)
    return vocab
120
+
121
# Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.tokenize
def tokenize(self, text, **kwargs) -> List[str]:
    """Tokenize `text` by delegating to the base-class `tokenize`.

    In legacy mode the text is forwarded unchanged. Otherwise every
    SPIECE_UNDERLINE inside the text is replaced by a space and a single
    SPIECE_UNDERLINE is prepended, so the marker only ever appears at the
    very beginning of what the base class receives.
    """
    if self.legacy:
        return super().tokenize(text, **kwargs)

    without_markers = text.replace(SPIECE_UNDERLINE, " ")
    return super().tokenize(SPIECE_UNDERLINE + without_markers, **kwargs)
128
+
129
# Copied from transformers.models.t5.tokenization_t5.T5Tokenizer._tokenize
def _tokenize(self, text):
    """Split `text` into SentencePiece pieces.

    In legacy mode the text is encoded as-is. Otherwise a leading
    SPIECE_UNDERLINE is removed from the text before encoding; when the text
    carried no such marker and does not start with a space, a
    SPIECE_UNDERLINE at the start of the first piece is stripped again (the
    piece is dropped entirely if it consisted of the marker alone).

    Returns:
        The list of piece strings; empty when the encoder yields no pieces.
    """
    if self.legacy:
        return self.sp_model.encode(text, out_type=str)

    had_marker = text.startswith(SPIECE_UNDERLINE)
    if had_marker:
        text = text[1:]

    tokens = self.sp_model.encode(text, out_type=str)

    # Deviation from the copied upstream code: `tokens` may be empty (for
    # example for empty `text`), so it is checked before `tokens[0]` is read.
    if tokens and not had_marker and not text.startswith(" ") and tokens[0].startswith(SPIECE_UNDERLINE):
        first_piece = tokens[0][1:]
        tokens = ([first_piece] if first_piece else []) + tokens[1:]
    return tokens
141
+
142
def _convert_token_to_id(self, token):
    """Look up the id of a token (str) in the SentencePiece model."""
    piece_id = self.sp_model.piece_to_id(token)
    return piece_id
145
+
146
def _convert_id_to_token(self, index):
    """Look up the token (str) for an index (integer) in the SentencePiece model."""
    return self.sp_model.IdToPiece(index)
150
+
151
def convert_tokens_to_string(self, tokens):
    """Converts a sequence of tokens (string) in a single string.

    Runs of ordinary tokens are decoded with the SentencePiece model, while
    special tokens are emitted verbatim and never passed to the model. A
    single space is inserted before a special token that is neither the
    first token nor directly preceded by another special token.

    Args:
        tokens: Sequence of token strings.

    Returns:
        The decoded string.
    """
    # The special-token list is the same for every token, so it is read
    # once here instead of once per loop iteration.
    special_tokens = self.all_special_tokens

    parts = []
    pending = []
    prev_is_special = False
    for i, token in enumerate(tokens):
        if token not in special_tokens:
            pending.append(token)
            prev_is_special = False
            continue

        # make sure that special tokens are not decoded using sentencepiece model
        if not prev_is_special and i != 0:
            parts.append(" ")
        parts.append(self.sp_model.decode(pending))
        parts.append(token)
        prev_is_special = True
        pending = []

    parts.append(self.sp_model.decode(pending))
    return "".join(parts)
169
+
170
def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
    """Write the SentencePiece model file into `save_directory`.

    Args:
        save_directory: Existing directory that receives the file.
        filename_prefix: Optional prefix; when truthy it is joined to the
            standard file name with a `-`.

    Returns:
        A one-element tuple with the output path, or `None` (after logging
        an error) when `save_directory` is not a directory.
    """
    if not os.path.isdir(save_directory):
        logger.error(f"Vocabulary path ({save_directory}) should be a directory")
        return

    prefix = filename_prefix + "-" if filename_prefix else ""
    out_vocab_file = os.path.join(save_directory, prefix + VOCAB_FILES_NAMES["vocab_file"])

    is_same_path = os.path.abspath(self.vocab_file) == os.path.abspath(out_vocab_file)
    source_exists = os.path.isfile(self.vocab_file)

    if source_exists and not is_same_path:
        copyfile(self.vocab_file, out_vocab_file)
    elif not source_exists:
        # The original file is gone: dump the in-memory model instead.
        with open(out_vocab_file, "wb") as handle:
            handle.write(self.sp_model.serialized_model_proto())

    return (out_vocab_file,)
186
+
187
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
    """Wrap one or two id sequences with the configured BOS/EOS ids.

    Each sequence gets `bos_token_id` in front when `add_bos_token` is set
    and `eos_token_id` at the end when `add_eos_token` is set; a second
    sequence is wrapped the same way and appended to the first.

    Returns:
        A new list of token ids.
    """
    prefix = [self.bos_token_id] if self.add_bos_token else []
    suffix = [self.eos_token_id] if self.add_eos_token else []

    wrapped_first = prefix + token_ids_0 + suffix
    if token_ids_1 is None:
        return wrapped_first
    return wrapped_first + prefix + token_ids_1 + suffix
197
+
198
def get_special_tokens_mask(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
    """Return a mask with 1 at special-token positions and 0 elsewhere.

    Args:
        token_ids_0: First sequence of ids.
        token_ids_1: Optional second sequence of ids.
        already_has_special_tokens: When true, the computation is delegated
            to the base-class implementation.

    Returns:
        A list shaped like the output of `build_inputs_with_special_tokens`
        for the same arguments: one entry per BOS/EOS slot (1) and one per
        input id (0).
    """
    if already_has_special_tokens:
        return super().get_special_tokens_mask(
            token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
        )

    bos_mask = [1] if self.add_bos_token else []
    eos_mask = [1] if self.add_eos_token else []

    mask = bos_mask + [0] * len(token_ids_0) + eos_mask
    if token_ids_1 is not None:
        mask = mask + bos_mask + [0] * len(token_ids_1) + eos_mask
    return mask
219
+
220
def create_token_type_ids_from_sequences(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
    """Build segment ids for one or two sequences.

    The first sequence, including its optional BOS/EOS slots, is marked
    with 0; the second sequence, when given, is marked with 1 and likewise
    includes its optional BOS/EOS slots.
    """
    prefix = [self.bos_token_id] if self.add_bos_token else []
    suffix = [self.eos_token_id] if self.add_eos_token else []

    segment_ids = [0] * len(prefix + token_ids_0 + suffix)
    if token_ids_1 is None:
        return segment_ids
    return segment_ids + [1] * len(prefix + token_ids_1 + suffix)
232
+
233
def _build_conversation_input_ids(self, conversation: "Conversation") -> List[int]:
    """Encode a conversation into `[INST] ... [/INST]` formatted token ids.

    Every completed user/assistant exchange becomes
    `bos + encode("[INST] user [/INST] assistant ") + eos`; the final user
    message becomes `bos + encode("[INST] user [/INST]")` with no EOS. The
    default system prompt is prepended to the first user message unless that
    message already starts with `B_SYS` and contains `E_SYS`.

    Args:
        conversation: Object providing `iter_texts()` (yielding
            `(is_user, text)` pairs) and a `past_user_inputs` list. Its
            first past user input is rewritten in place when the system
            prompt is added to it.

    Returns:
        The flat list of token ids for the whole dialogue.

    Raises:
        ValueError: If the roles do not alternate starting with the user, or
            if the conversation is empty or does not end with a user message.
    """
    dialogue = list(conversation.iter_texts())

    users_in_place = all(is_user for is_user, _ in dialogue[::2])
    assistants_in_place = all(not is_user for is_user, _ in dialogue[1::2])
    if not (users_in_place and assistants_in_place):
        raise ValueError(
            "The model only supports 'user' and 'assistant' roles, starting with user and alternating (u/a/u/a/u...)"
        )

    # Validate before mutating `conversation` or encoding anything. The
    # entries are (is_user, text) tuples, so the role is reported from the
    # boolean rather than by indexing with a 'role' key.
    if not dialogue or not dialogue[-1][0]:
        got = "assistant" if dialogue else "an empty conversation"
        raise ValueError(f"Last message must be from user, got {got}")

    if len(conversation.past_user_inputs) > 0:
        first_past = conversation.past_user_inputs[0]
        if not first_past.startswith(B_SYS) or E_SYS not in first_past:
            with_system = B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + first_past
            conversation.past_user_inputs[0] = with_system
            # `dialogue` was captured before this rewrite; keep the snapshot
            # in sync so the system prompt is part of this call's output too.
            if dialogue[0][1] == first_past:
                dialogue[0] = (dialogue[0][0], with_system)
    elif not dialogue[0][1].startswith(B_SYS) or E_SYS not in dialogue[0][1]:
        dialogue[0] = (dialogue[0][0], B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + dialogue[0][1])

    dialog_tokens: List[int] = []
    for (_, prompt), (_, answer) in zip(dialogue[::2], dialogue[1::2]):
        dialog_tokens.append(self.bos_token_id)
        dialog_tokens.extend(
            self.encode(f"{B_INST} {prompt.strip()} {E_INST} {answer.strip()} ", add_special_tokens=False)
        )
        dialog_tokens.append(self.eos_token_id)

    # The pending user message is left open (no EOS).
    dialog_tokens.append(self.bos_token_id)
    dialog_tokens.extend(self.encode(f"{B_INST} {dialogue[-1][1].strip()} {E_INST}", add_special_tokens=False))
    return dialog_tokens
tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36ec9a4d6fd7cc78fbb9e4afd89fb04cba0381b08a842ca0b60826073821f594
3
+ size 994250
tokenizer_config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "bos_token": {
5
+ "__type": "AddedToken",
6
+ "content": "<s>",
7
+ "lstrip": false,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "clean_up_tokenization_spaces": false,
13
+ "eos_token": {
14
+ "__type": "AddedToken",
15
+ "content": "</s>",
16
+ "lstrip": false,
17
+ "normalized": true,
18
+ "rstrip": false,
19
+ "single_word": false
20
+ },
21
+ "legacy": true,
22
+ "model_max_length": 1000000000000000019884624838656,
23
+ "pad_token": null,
24
+ "sp_model_kwargs": {},
25
+ "tokenizer_class": "SkyworkTokenizer",
26
+ "unk_token": {
27
+ "__type": "AddedToken",
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": true,
31
+ "rstrip": false,
32
+ "single_word": false
33
+ },
34
+ "auto_map": {
35
+ "AutoTokenizer": [
36
+ "tokenization_skywork.SkyworkTokenizer",
37
+ null
38
+ ]
39
+ }
40
+ }