ruixie committed on
Commit
0c50d7a
1 Parent(s): cbd75da

Upload folder using huggingface_hub

added_tokens.json ADDED
@@ -0,0 +1,21 @@
1
+ {
2
+ "<commit_after>": 70017,
3
+ "<commit_before>": 70015,
4
+ "<commit_msg>": 70016,
5
+ "<empty_output>": 70014,
6
+ "<filename>": 70005,
7
+ "<fim_middle>": 70002,
8
+ "<fim_pad>": 70004,
9
+ "<fim_prefix>": 70001,
10
+ "<fim_suffix>": 70003,
11
+ "<gh_stars>": 70006,
12
+ "<issue_closed>": 70009,
13
+ "<issue_comment>": 70008,
14
+ "<issue_start>": 70007,
15
+ "<jupyter_code>": 70012,
16
+ "<jupyter_output>": 70013,
17
+ "<jupyter_start>": 70010,
18
+ "<jupyter_text>": 70011,
19
+ "<reponame>": 70018,
20
+ "<|endoftext|>": 70000
21
+ }
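
The map above assigns CodeShell's special tokens (fill-in-the-middle markers, repository/commit metadata, Jupyter and issue markers) to ids 70000–70018 on top of the base vocabulary. As an illustrative sketch only, and assuming the tokenizer in this repository follows the usual StarCoder-style FIM prompt layout, a fill-in-the-middle prompt could be assembled as below; the local path is a placeholder.

```python
# Illustrative sketch: assembling a StarCoder-style FIM prompt from the special
# tokens declared above. "path/to/this/repo" is a placeholder for a local
# checkout of this repository; the FIM ordering is an assumption, not documented here.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("path/to/this/repo")

prefix = "def add(a, b):\n    "
suffix = "\n    return result\n"
prompt = f"<fim_prefix>{prefix}<fim_suffix>{suffix}<fim_middle>"

input_ids = tokenizer(prompt, return_tensors="pt").input_ids
# The prompt ends with <fim_middle>, which added_tokens.json maps to 70002.
print(input_ids[0, -1].item())
```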
config.json ADDED
@@ -0,0 +1,46 @@
1
+ {
2
+ "_name_or_path": "/nvme/xr/checkpoints/codeshell/pt_codeshell/iter_0023208/hf",
3
+ "activation_function": "gelu_pytorch_tanh",
4
+ "architectures": [
5
+ "CodeShellForCausalLM"
6
+ ],
7
+ "attention_softmax_in_fp32": true,
8
+ "attn_pdrop": 0.1,
9
+ "auto_map": {
10
+ "AutoConfig": "configuration_codeshell.CodeShellConfig",
11
+ "AutoModelForCausalLM": "modeling_codeshell.CodeShellForCausalLM"
12
+ },
13
+ "bos_token_id": 70000,
14
+ "embd_pdrop": 0.1,
15
+ "eos_token_id": 70000,
16
+ "group_query_attention": true,
17
+ "inference_runner": 0,
18
+ "initializer_range": 0.02,
19
+ "layer_norm_epsilon": 1e-05,
20
+ "max_batch_size": null,
21
+ "max_sequence_length": null,
22
+ "model_type": "codeshell",
23
+ "n_embd": 4096,
24
+ "n_head": 32,
25
+ "n_inner": 16384,
26
+ "n_layer": 42,
27
+ "n_positions": 8192,
28
+ "num_query_groups": 8,
29
+ "pad_key_length": true,
30
+ "position_embedding_type": "rope",
31
+ "pre_allocate_kv_cache": false,
32
+ "resid_pdrop": 0.1,
33
+ "rope_scaling": null,
34
+ "scale_attention_softmax_in_fp32": true,
35
+ "scale_attn_weights": true,
36
+ "summary_activation": null,
37
+ "summary_first_dropout": 0.1,
38
+ "summary_proj_to_labels": true,
39
+ "summary_type": "cls_index",
40
+ "summary_use_proj": true,
41
+ "torch_dtype": "bfloat16",
42
+ "transformers_version": "4.33.3",
43
+ "use_cache": true,
44
+ "validate_runner_input": true,
45
+ "vocab_size": 70144
46
+ }
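
Because the `auto_map` entries route `AutoConfig` and `AutoModelForCausalLM` to the `configuration_codeshell.py` and `modeling_codeshell.py` files added below, loading this checkpoint requires `trust_remote_code=True`. A minimal loading sketch, with a placeholder path standing in for this repository:

```python
# Minimal sketch, assuming a local checkout of this repository at the placeholder
# path below. trust_remote_code=True lets transformers import the CodeShellConfig
# and CodeShellForCausalLM classes referenced in auto_map.
import torch
from transformers import AutoConfig, AutoModelForCausalLM

config = AutoConfig.from_pretrained("path/to/this/repo", trust_remote_code=True)
print(config.model_type, config.n_layer, config.n_embd)  # codeshell 42 4096

model = AutoModelForCausalLM.from_pretrained(
    "path/to/this/repo",
    torch_dtype=torch.bfloat16,  # matches "torch_dtype" in config.json
    trust_remote_code=True,
)
```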
configuration_codeshell.py ADDED
@@ -0,0 +1,166 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 WisdomShell Inc. All Rights Reserved.
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ # This code is based on BigCode's GPTBigCode configuration. It has been modified from
17
+ # its original form to accommodate the minor architectural differences of the
18
+ # CodeShell model relative to the original GPTBigCode configuration.
19
+
20
+ # Copyright 2023 The BigCode team and HuggingFace Inc. team.
21
+ #
22
+ # Licensed under the Apache License, Version 2.0 (the "License");
23
+ # you may not use this file except in compliance with the License.
24
+ # You may obtain a copy of the License at
25
+ #
26
+ # http://www.apache.org/licenses/LICENSE-2.0
27
+ #
28
+ # Unless required by applicable law or agreed to in writing, software
29
+ # distributed under the License is distributed on an "AS IS" BASIS,
30
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
31
+ # See the License for the specific language governing permissions and
32
+ # limitations under the License.
33
+ """ Shell configuration"""
34
+
35
+ from transformers.configuration_utils import PretrainedConfig
36
+ from transformers.utils import logging
37
+
38
+
39
+ logger = logging.get_logger(__name__)
40
+
41
+
42
+ class CodeShellConfig(PretrainedConfig):
43
+ """
44
+ This is the configuration class to store the configuration of a [`CodeShellModel`]. It is used to instantiate a
45
+ CodeShell model according to the specified arguments, defining the model architecture.
46
+
47
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
48
+ documentation from [`PretrainedConfig`] for more information.
49
+
50
+ Args:
51
+ vocab_size (`int`, *optional*, defaults to 50257):
52
+ Vocabulary size of the CodeShell model. Defines the number of different tokens that can be represented by the
53
+ `input_ids` passed when calling [`CodeShellModel`].
54
+ n_positions (`int`, *optional*, defaults to 1024):
55
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
56
+ just in case (e.g., 512 or 1024 or 2048).
57
+ n_embd (`int`, *optional*, defaults to 768):
58
+ Dimensionality of the embeddings and hidden states.
59
+ n_layer (`int`, *optional*, defaults to 12):
60
+ Number of hidden layers in the Transformer encoder.
61
+ n_head (`int`, *optional*, defaults to 12):
62
+ Number of attention heads for each attention layer in the Transformer encoder.
63
+ n_inner (`int`, *optional*, defaults to None):
64
+ Dimensionality of the inner feed-forward layers. `None` will set it to 4 times `n_embd`.
65
+ activation_function (`str`, *optional*, defaults to `"gelu_pytorch_tanh"`):
66
+ Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new",
67
+ "gelu_pytorch_tanh"]`.
68
+ resid_pdrop (`float`, *optional*, defaults to 0.1):
69
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
70
+ embd_pdrop (`float`, *optional*, defaults to 0.1):
71
+ The dropout ratio for the embeddings.
72
+ attn_pdrop (`float`, *optional*, defaults to 0.1):
73
+ The dropout ratio for the attention.
74
+ layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
75
+ The epsilon to use in the layer normalization layers.
76
+ initializer_range (`float`, *optional*, defaults to 0.02):
77
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
78
+ scale_attn_weights (`bool`, *optional*, defaults to `True`):
79
+ Scale attention weights by dividing by sqrt(hidden_size).
80
+ use_cache (`bool`, *optional*, defaults to `True`):
81
+ Whether or not the model should return the last key/values attentions (not used by all models).
82
+ attention_softmax_in_fp32 (`bool`, *optional*, defaults to `True`):
83
+ Whether to call the fused softmax in float32.
84
+ scale_attention_softmax_in_fp32 (`bool`, *optional*, defaults to `True`):
85
+ Whether to scale the attention softmax in float32.
86
+ group_query_attention (`bool`, *optional*, defaults to `True`):
87
+ Whether to use Grouped-Query Attention (`True`) or Multi-Head Attention (`False`).
88
+ Example:
89
+
90
+ ```python
91
+ >>> from configuration_codeshell import CodeShellConfig
92
+ >>> from modeling_codeshell import CodeShellForCausalLM
93
+
94
+ >>> # Initializing a CodeShell configuration
95
+ >>> configuration = CodeShellConfig()
96
+
97
+ >>> # Initializing a model (with random weights) from the configuration
98
+ >>> model = CodeShellForCausalLM(configuration)
99
+
100
+ >>> # Accessing the model configuration
101
+ >>> configuration = model.config
102
+ ```"""
103
+
104
+ model_type = "codeshell"
105
+ keys_to_ignore_at_inference = ["past_key_values"]
106
+ attribute_map = {
107
+ "hidden_size": "n_embd",
108
+ "max_position_embeddings": "n_positions",
109
+ "num_attention_heads": "n_head",
110
+ "num_hidden_layers": "n_layer",
111
+ }
112
+
113
+ def __init__(
114
+ self,
115
+ vocab_size=70144,
116
+ n_positions=8192,
117
+ n_embd=4096,
118
+ n_layer=42,
119
+ n_head=32,
120
+ n_inner=None,
121
+ activation_function="gelu_pytorch_tanh",
122
+ resid_pdrop=0.1,
123
+ embd_pdrop=0.1,
124
+ attn_pdrop=0.1,
125
+ layer_norm_epsilon=1e-5,
126
+ initializer_range=0.02,
127
+ scale_attn_weights=True,
128
+ use_cache=True,
129
+ bos_token_id=70000,
130
+ eos_token_id=70000,
131
+ attention_softmax_in_fp32=True,
132
+ scale_attention_softmax_in_fp32=True,
133
+ group_query_attention=True,
134
+ num_query_groups=1,
135
+ position_embedding_type="learned_absolute",
136
+ rope_scaling=None,
137
+ **kwargs,
138
+ ):
139
+ self.vocab_size = vocab_size
140
+ self.n_positions = n_positions
141
+ self.n_embd = n_embd
142
+ self.n_layer = n_layer
143
+ self.n_head = n_head
144
+ self.n_inner = n_inner
145
+ self.activation_function = activation_function
146
+ self.resid_pdrop = resid_pdrop
147
+ self.embd_pdrop = embd_pdrop
148
+ self.attn_pdrop = attn_pdrop
149
+ self.layer_norm_epsilon = layer_norm_epsilon
150
+ self.initializer_range = initializer_range
151
+ self.scale_attn_weights = scale_attn_weights
152
+ self.use_cache = use_cache
153
+ self.attention_softmax_in_fp32 = attention_softmax_in_fp32
154
+ self.scale_attention_softmax_in_fp32 = scale_attention_softmax_in_fp32
155
+ self.group_query_attention = group_query_attention
156
+ self.num_query_groups = num_query_groups
157
+ self.position_embedding_type = position_embedding_type
158
+ self.rope_scaling = rope_scaling
159
+ assert self.position_embedding_type in [
160
+ "learned_absolute", "rope"
161
+ ], "position_embedding_type must be one of ['learned_absolute', 'rope']"
162
+
163
+ self.bos_token_id = bos_token_id
164
+ self.eos_token_id = eos_token_id
165
+
166
+ super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
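
For reference, a configuration equivalent to the shipped `config.json` can also be built directly from the class above. The sketch below only uses keyword arguments defined in `__init__` and mirrors the values in `config.json`:

```python
# Illustrative: a CodeShellConfig matching the values in config.json.
from configuration_codeshell import CodeShellConfig

config = CodeShellConfig(
    vocab_size=70144,
    n_positions=8192,
    n_embd=4096,
    n_layer=42,
    n_head=32,
    n_inner=16384,
    group_query_attention=True,
    num_query_groups=8,              # 32 query heads share 8 key/value groups
    position_embedding_type="rope",  # validated by the assert in __init__
    bos_token_id=70000,
    eos_token_id=70000,
)
assert config.hidden_size == 4096    # attribute_map aliases hidden_size -> n_embd
```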
generation_config.json ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 70000,
4
+ "eos_token_id": 70000,
5
+ "transformers_version": "4.33.3"
6
+ }
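
The generation config only pins `bos_token_id`/`eos_token_id` to 70000 (`<|endoftext|>`); decoding parameters are left to the caller. A hedged end-to-end sketch, with a placeholder path and arbitrary decoding settings:

```python
# Sketch of generation with this checkpoint; the path is a placeholder and the
# decoding settings are illustrative, not recommendations from this repository.
from transformers import AutoModelForCausalLM, AutoTokenizer

repo = "path/to/this/repo"
tokenizer = AutoTokenizer.from_pretrained(repo)
model = AutoModelForCausalLM.from_pretrained(repo, trust_remote_code=True)

inputs = tokenizer("# a python function that reverses a string\n", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=64, do_sample=False)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```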
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
modeling_codeshell.py ADDED
@@ -0,0 +1,884 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 WisdomShell Inc. All Rights Reserved.
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ # This code is based on BigCode's GPTBigCode model. It has been modified from
17
+ # its original form to accommodate the minor architectural differences of the
18
+ # CodeShell model relative to the original GPTBigCode model.
19
+
20
+ # Copyright 2023 The Bigcode team and HuggingFace Inc. team.
21
+ # Licensed under the Apache License, Version 2.0 (the "License");
22
+ # you may not use this file except in compliance with the License.
23
+ # You may obtain a copy of the License at
24
+ #
25
+ # http://www.apache.org/licenses/LICENSE-2.0
26
+ #
27
+ # Unless required by applicable law or agreed to in writing, software
28
+ # distributed under the License is distributed on an "AS IS" BASIS,
29
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
30
+ # See the License for the specific language governing permissions and
31
+ # limitations under the License.
32
+ """PyTorch CodeShell model."""
33
+ import math
34
+ from typing import List, Optional, Tuple, Union
35
+
36
+ import torch
37
+ import torch.utils.checkpoint
38
+ from torch import nn
39
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
40
+
41
+ from transformers.activations import ACT2FN
42
+ from transformers.modeling_outputs import (
43
+ BaseModelOutputWithPastAndCrossAttentions,
44
+ CausalLMOutputWithCrossAttentions,
45
+ )
46
+ from transformers.modeling_utils import PreTrainedModel
47
+ from transformers.utils import (
48
+ add_start_docstrings,
49
+ add_start_docstrings_to_model_forward,
50
+ )
51
+ from .configuration_codeshell import CodeShellConfig
52
+
53
+
54
+ # Fused kernels
55
+ # Use separate functions for each case because conditionals prevent kernel fusion.
56
+ # TODO: Could have better fused kernels depending on scaling, dropout and head mask.
57
+ # Is it doable without writing 32 functions?
58
+ @torch.jit.script
59
+ def upcast_masked_softmax(
60
+ x: torch.Tensor, mask: torch.Tensor, mask_value: torch.Tensor, scale: float, softmax_dtype: torch.dtype
61
+ ):
62
+ input_dtype = x.dtype
63
+ x = x.to(softmax_dtype) * scale
64
+ x = torch.where(mask, x, mask_value)
65
+ x = torch.nn.functional.softmax(x, dim=-1).to(input_dtype)
66
+ return x
67
+
68
+
69
+ @torch.jit.script
70
+ def upcast_softmax(x: torch.Tensor, scale: float, softmax_dtype: torch.dtype):
71
+ input_dtype = x.dtype
72
+ x = x.to(softmax_dtype) * scale
73
+ x = torch.nn.functional.softmax(x, dim=-1).to(input_dtype)
74
+ return x
75
+
76
+
77
+ @torch.jit.script
78
+ def masked_softmax(x: torch.Tensor, mask: torch.Tensor, mask_value: torch.Tensor):
79
+ x = torch.where(mask, x, mask_value)
80
+ x = torch.nn.functional.softmax(x, dim=-1)
81
+ return x
82
+
83
+
84
+ class CodeShellRotaryEmbedding(torch.nn.Module):
85
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
86
+ super().__init__()
87
+
88
+ self.dim = dim
89
+ self.max_position_embeddings = max_position_embeddings
90
+ self.base = base
91
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
92
+ self.register_buffer("inv_freq", inv_freq)
93
+
94
+ # Build here to make `torch.jit.trace` work.
95
+ self._set_cos_sin_cache(
96
+ seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
97
+ )
98
+
99
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
100
+ self.max_seq_len_cached = seq_len
101
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
102
+
103
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
104
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
105
+ emb = torch.cat((freqs, freqs), dim=-1)
106
+ self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
107
+ self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
108
+
109
+ def forward(self, x, seq_len=None):
110
+ # x: [bs, num_attention_heads, seq_len, head_size]
111
+ if seq_len > self.max_seq_len_cached:
112
+ self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
113
+
114
+ return (
115
+ self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
116
+ self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
117
+ )
118
+
119
+
120
+ class CodeShellLinearScalingRotaryEmbedding(CodeShellRotaryEmbedding):
121
+ """CodeShellRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
122
+
123
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
124
+ self.scaling_factor = scaling_factor
125
+ super().__init__(dim, max_position_embeddings, base, device)
126
+
127
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
128
+ self.max_seq_len_cached = seq_len
129
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
130
+ t = t / self.scaling_factor
131
+
132
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
133
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
134
+ emb = torch.cat((freqs, freqs), dim=-1)
135
+ self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
136
+ self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
137
+
138
+
139
+ class CodeShellDynamicNTKScalingRotaryEmbedding(CodeShellRotaryEmbedding):
140
+ """ShellRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
141
+
142
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
143
+ self.scaling_factor = scaling_factor
144
+ super().__init__(dim, max_position_embeddings, base, device)
145
+
146
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
147
+ self.max_seq_len_cached = seq_len
148
+
149
+ if seq_len > self.max_position_embeddings:
150
+ base = self.base * (
151
+ (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
152
+ ) ** (self.dim / (self.dim - 2))
153
+ inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
154
+ self.register_buffer("inv_freq", inv_freq)
155
+
156
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
157
+
158
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
159
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
160
+ emb = torch.cat((freqs, freqs), dim=-1)
161
+ self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
162
+ self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
163
+
164
+ def rotate_half(x):
165
+ """Rotates half the hidden dims of the input."""
166
+ x1 = x[..., : x.shape[-1] // 2]
167
+ x2 = x[..., x.shape[-1] // 2 :]
168
+ return torch.cat((-x2, x1), dim=-1)
169
+
170
+
171
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
172
+ # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
173
+ cos = cos.squeeze(1).squeeze(0) # [seq_len, dim]
174
+ sin = sin.squeeze(1).squeeze(0) # [seq_len, dim]
175
+ cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
176
+ sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
177
+ q_embed = (q * cos) + (rotate_half(q) * sin)
178
+ k_embed = (k * cos) + (rotate_half(k) * sin)
179
+ return q_embed, k_embed
180
+
181
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
182
+ """
183
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
184
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
185
+ """
186
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
187
+ if n_rep == 1:
188
+ return hidden_states
189
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
190
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
191
+
192
+ class CodeShellAttention(nn.Module):
193
+ def __init__(self, config, layer_idx=None):
194
+ super().__init__()
195
+ self.mask_value = None
196
+
197
+ self.position_embedding_type = config.position_embedding_type
198
+ self.rope_scaling = config.rope_scaling
199
+ self.max_position_embeddings = config.max_position_embeddings
200
+
201
+ self.group_query_attention = config.group_query_attention
202
+ self.num_query_groups = config.num_query_groups
203
+ self.num_key_value_groups = config.num_attention_heads // config.num_query_groups
204
+
205
+ self.embed_dim = config.hidden_size
206
+ self.num_heads = config.num_attention_heads
207
+ self.head_dim = self.embed_dim // self.num_heads
208
+ self.kv_heads = config.num_query_groups if self.group_query_attention else self.num_heads
209
+ self.kv_dim = self.kv_heads * self.head_dim
210
+ self.split_size = self.embed_dim
211
+ if self.head_dim * self.num_heads != self.embed_dim:
212
+ raise ValueError(
213
+ f"`embed_dim` must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
214
+ f" {self.num_heads})."
215
+ )
216
+
217
+ self.layer_idx = layer_idx
218
+
219
+ self.c_attn = nn.Linear(self.embed_dim, self.embed_dim + 2 * self.kv_dim)
220
+ self.c_proj = nn.Linear(self.embed_dim, self.embed_dim)
221
+
222
+ self.attn_dropout = nn.Dropout(config.attn_pdrop)
223
+ self.resid_dropout = nn.Dropout(config.resid_pdrop)
224
+
225
+ if self.position_embedding_type == "rope":
226
+ self._init_rope()
227
+
228
+ def _init_rope(self):
229
+ if self.rope_scaling is None:
230
+ self.rotary_emb = CodeShellRotaryEmbedding(self.head_dim, max_position_embeddings=self.max_position_embeddings)
231
+ else:
232
+ scaling_type = self.rope_scaling["type"]
233
+ scaling_factor = self.rope_scaling["factor"]
234
+ if scaling_type == "linear":
235
+ self.rotary_emb = CodeShellLinearScalingRotaryEmbedding(
236
+ self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=scaling_factor
237
+ )
238
+ elif scaling_type == "dynamic":
239
+ self.rotary_emb = CodeShellDynamicNTKScalingRotaryEmbedding(
240
+ self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=scaling_factor
241
+ )
242
+ else:
243
+ raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
244
+
245
+
246
+ def _get_mask_value(self, device, dtype):
247
+ # torch.where expects a tensor. We use a cache to avoid recreating it every time.
248
+ if self.mask_value is None or self.mask_value.dtype != dtype or self.mask_value.device != device:
249
+ self.mask_value = torch.full([], torch.finfo(dtype).min, dtype=dtype, device=device)
250
+ return self.mask_value
251
+
252
+ def forward(
253
+ self,
254
+ hidden_states: torch.Tensor,
255
+ layer_past: Optional[torch.Tensor] = None,
256
+ attention_mask: Optional[torch.Tensor] = None,
257
+ position_ids: Optional[torch.LongTensor] = None,
258
+ head_mask: Optional[torch.Tensor] = None,
259
+ use_cache: Optional[bool] = False,
260
+ output_attentions: Optional[bool] = False,
261
+ ) -> Union[
262
+ Tuple[torch.Tensor, Optional[torch.Tensor]],
263
+ Tuple[torch.Tensor, Optional[torch.Tensor], Tuple[torch.Tensor, ...]],
264
+ ]:
265
+ bsz, q_len, _ = hidden_states.size()
266
+ query_states, key_states, value_states = self.c_attn(hidden_states).split((self.embed_dim, self.kv_dim, self.kv_dim), dim=2)
267
+
268
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
269
+ key_states = key_states.view(bsz, q_len, self.num_query_groups, self.head_dim).transpose(1, 2)
270
+ value_states = value_states.view(bsz, q_len, self.num_query_groups, self.head_dim).transpose(1, 2)
271
+
272
+ kv_seq_len = key_states.shape[-2]
273
+ if layer_past is not None:
274
+ kv_seq_len += layer_past[0].shape[-2]
275
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
276
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
277
+
278
+ if layer_past is not None:
279
+ # reuse k, v, self_attention
280
+ key_states = torch.cat([layer_past[0], key_states], dim=2)
281
+ value_states = torch.cat([layer_past[1], value_states], dim=2)
282
+
283
+ layer_past = (key_states, value_states) if use_cache else None
284
+
285
+ # repeat k/v heads if n_kv_heads < n_heads
286
+ key_states = repeat_kv(key_states, self.num_heads // self.kv_heads)
287
+ value_states = repeat_kv(value_states, self.num_heads // self.kv_heads)
288
+
289
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
290
+
291
+ if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
292
+ raise ValueError(
293
+ f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
294
+ f" {attn_weights.size()}"
295
+ )
296
+
297
+ if attention_mask is not None:
298
+ if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
299
+ raise ValueError(
300
+ f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
301
+ )
302
+ mask_value = self._get_mask_value(attn_weights.device, attn_weights.dtype)
303
+ # The fused kernel is very slow when the key length is not a multiple of 8, so we skip fusion.
304
+ attn_weights = torch.where(attention_mask, attn_weights, mask_value)
305
+
306
+ # upcast attention to fp32
307
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
308
+ attn_weights = self.attn_dropout(attn_weights)
309
+ attn_output = torch.matmul(attn_weights, value_states)
310
+
311
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
312
+ raise ValueError(
313
+ f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
314
+ f" {attn_output.size()}"
315
+ )
316
+
317
+ attn_output = attn_output.transpose(1, 2).contiguous()
318
+ attn_output = attn_output.reshape(bsz, q_len, self.embed_dim)
319
+
320
+ attn_output = self.c_proj(attn_output)
321
+ attn_output = self.resid_dropout(attn_output)
322
+
323
+ outputs = (attn_output, layer_past)
324
+ if output_attentions:
325
+ outputs += (attn_weights,)
326
+
327
+ return outputs # a, present, (attentions)
328
+
329
+
330
+ class CodeShellMLP(nn.Module):
331
+ def __init__(self, intermediate_size, config):
332
+ super().__init__()
333
+ embed_dim = config.hidden_size
334
+ self.c_fc = nn.Linear(embed_dim, intermediate_size)
335
+ self.c_proj = nn.Linear(intermediate_size, embed_dim)
336
+ self.act = ACT2FN[config.activation_function]
337
+ self.dropout = nn.Dropout(config.resid_pdrop)
338
+
339
+ # Copied from transformers.models.gpt2.modeling_gpt2.GPT2MLP.forward
340
+ def forward(self, hidden_states: Optional[Tuple[torch.Tensor]]) -> torch.Tensor:
341
+ hidden_states = self.c_fc(hidden_states)
342
+ hidden_states = self.act(hidden_states)
343
+ hidden_states = self.c_proj(hidden_states)
344
+ hidden_states = self.dropout(hidden_states)
345
+ return hidden_states
346
+
347
+
348
+ class CodeShellBlock(nn.Module):
349
+ def __init__(self, config, layer_idx=None):
350
+ super().__init__()
351
+ hidden_size = config.hidden_size
352
+ self.inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size
353
+
354
+ self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
355
+ self.attn = CodeShellAttention(config, layer_idx=layer_idx)
356
+ self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
357
+
358
+ self.mlp = CodeShellMLP(self.inner_dim, config)
359
+
360
+ def forward(
361
+ self,
362
+ hidden_states: Optional[Tuple[torch.Tensor]],
363
+ layer_past: Optional[torch.Tensor] = None,
364
+ attention_mask: Optional[torch.Tensor] = None,
365
+ position_ids: Optional[torch.LongTensor] = None,
366
+ head_mask: Optional[torch.Tensor] = None,
367
+ encoder_hidden_states: Optional[torch.Tensor] = None,
368
+ encoder_attention_mask: Optional[torch.Tensor] = None,
369
+ use_cache: Optional[bool] = False,
370
+ output_attentions: Optional[bool] = False,
371
+ ) -> Union[
372
+ Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor, torch.Tensor, torch.Tensor]
373
+ ]:
374
+ residual = hidden_states
375
+ hidden_states = self.ln_1(hidden_states)
376
+ attn_outputs = self.attn(
377
+ hidden_states,
378
+ layer_past=layer_past,
379
+ attention_mask=attention_mask,
380
+ position_ids=position_ids,
381
+ head_mask=head_mask,
382
+ use_cache=use_cache,
383
+ output_attentions=output_attentions,
384
+ )
385
+ attn_output = attn_outputs[0] # output_attn: a, present, (attentions)
386
+
387
+ outputs = attn_outputs[1:]
388
+ # residual connection
389
+ hidden_states = attn_output + residual
390
+
391
+ residual = hidden_states
392
+ hidden_states = self.ln_2(hidden_states)
393
+ feed_forward_hidden_states = self.mlp(hidden_states)
394
+ # residual connection
395
+ hidden_states = residual + feed_forward_hidden_states
396
+
397
+ if use_cache:
398
+ outputs = (hidden_states,) + outputs
399
+ else:
400
+ outputs = (hidden_states,) + outputs[1:]
401
+
402
+ return outputs # hidden_states, present, (attentions, cross_attentions)
403
+
404
+
405
+ class CodeShellPreTrainedModel(PreTrainedModel):
406
+ """
407
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
408
+ models.
409
+ """
410
+
411
+ config_class = CodeShellConfig
412
+ base_model_prefix = "transformer"
413
+ supports_gradient_checkpointing = True
414
+ _no_split_modules = ["CodeShellBlock"]
415
+ _skip_keys_device_placement = "past_key_values"
416
+
417
+ def __init__(self, *inputs, **kwargs):
418
+ super().__init__(*inputs, **kwargs)
419
+
420
+ def _init_weights(self, module):
421
+ """Initialize the weights."""
422
+ if isinstance(module, (CodeShellMLP, CodeShellAttention)):
423
+ # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
424
+ # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
425
+ # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
426
+ # > -- GPT-2 :: https://openai.com/blog/better-language-models/
427
+ #
428
+ # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
429
+ module.c_proj.weight.data.normal_(
430
+ mean=0.0, std=(self.config.initializer_range / math.sqrt(2 * self.config.n_layer))
431
+ )
432
+ module.c_proj._is_hf_initialized = True
433
+ elif isinstance(module, nn.Linear):
434
+ # Slightly different from the TF version which uses truncated_normal for initialization
435
+ # cf https://github.com/pytorch/pytorch/pull/5617
436
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
437
+ if module.bias is not None:
438
+ module.bias.data.zero_()
439
+ elif isinstance(module, nn.Embedding):
440
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
441
+ if module.padding_idx is not None:
442
+ module.weight.data[module.padding_idx].zero_()
443
+ elif isinstance(module, nn.LayerNorm):
444
+ module.bias.data.zero_()
445
+ module.weight.data.fill_(1.0)
446
+
447
+ # Copied from transformers.models.gpt2.modeling_gpt2.GPT2PreTrainedModel._set_gradient_checkpointing with GPT2->Shell
448
+ def _set_gradient_checkpointing(self, module, value=False):
449
+ if isinstance(module, CodeShellModel):
450
+ module.gradient_checkpointing = value
451
+
452
+
453
+ GPT_BIGCODE_START_DOCSTRING = r"""
454
+
455
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
456
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
457
+ etc.)
458
+
459
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
460
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
461
+ and behavior.
462
+
463
+ Parameters:
464
+ config ([`CodeShellConfig`]): Model configuration class with all the parameters of the model.
465
+ Initializing with a config file does not load the weights associated with the model, only the
466
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
467
+ """
468
+
469
+ GPT_BIGCODE_INPUTS_DOCSTRING = r"""
470
+ Args:
471
+ input_ids (`torch.Tensor` of shape `(batch_size, input_ids_length)`):
472
+ `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
473
+ `past_key_values[0][0].shape[-2]` (`sequence_length` of input past key value states). Indices of input
474
+ sequence tokens in the vocabulary.
475
+
476
+ If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
477
+ `input_ids`.
478
+
479
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
480
+ [`PreTrainedTokenizer.__call__`] for details.
481
+
482
+ [What are input IDs?](../glossary#input-ids)
483
+ past_key_values (`Tuple[torch.Tensor]` of length `config.n_layers`):
484
+ Contains precomputed hidden-states (key and values in the attention blocks) as computed by the model (see
485
+ `past_key_values` output below). Can be used to speed up sequential decoding. The `input_ids` which have
486
+ their past given to this model should not be passed as `input_ids` as they have already been computed.
487
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
488
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
489
+
490
+ - 1 for tokens that are **not masked**,
491
+ - 0 for tokens that are **masked**.
492
+
493
+ If `past_key_values` is used, `attention_mask` needs to contain the masking strategy that was used for
494
+ `past_key_values`. In other words, the `attention_mask` always has to have the length:
495
+ `len(past_key_values) + len(input_ids)`
496
+
497
+ [What are attention masks?](../glossary#attention-mask)
498
+ token_type_ids (`torch.Tensor` of shape `(batch_size, input_ids_length)`, *optional*):
499
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
500
+ 1]`:
501
+
502
+ - 0 corresponds to a *sentence A* token,
503
+ - 1 corresponds to a *sentence B* token.
504
+
505
+ [What are token type IDs?](../glossary#token-type-ids)
506
+ position_ids (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
507
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
508
+ config.max_position_embeddings - 1]`.
509
+
510
+ [What are position IDs?](../glossary#position-ids)
511
+ head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
512
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
513
+
514
+ - 1 indicates the head is **not masked**,
515
+ - 0 indicates the head is **masked**.
516
+
517
+ inputs_embeds (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
518
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
519
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
520
+ model's internal embedding lookup matrix.
521
+
522
+ If `past_key_values` is used, optionally only the last `inputs_embeds` have to be input (see
523
+ `past_key_values`).
524
+ use_cache (`bool`, *optional*):
525
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
526
+ `past_key_values`).
527
+ output_attentions (`bool`, *optional*):
528
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
529
+ tensors for more detail.
530
+ output_hidden_states (`bool`, *optional*):
531
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
532
+ more detail.
533
+ return_dict (`bool`, *optional*):
534
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
535
+ """
536
+
537
+
538
+ @add_start_docstrings(
539
+ "The bare GPT_BIGCODE Model transformer outputting raw hidden-states without any specific head on top.",
540
+ GPT_BIGCODE_START_DOCSTRING,
541
+ )
542
+ class CodeShellModel(CodeShellPreTrainedModel):
543
+ def __init__(self, config):
544
+ super().__init__(config)
545
+ self.group_query_attention = config.group_query_attention
546
+ self.num_query_groups = config.num_query_groups
547
+ self.position_embedding_type = config.position_embedding_type
548
+ self.embed_dim = config.hidden_size
549
+
550
+ self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
551
+ if self.position_embedding_type == "learned_absolute":
552
+ self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
553
+ else:
554
+ pass
555
+
556
+ self.drop = nn.Dropout(config.embd_pdrop)
557
+ self.h = nn.ModuleList([CodeShellBlock(config, layer_idx=i) for i in range(config.num_hidden_layers)])
558
+ self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
559
+
560
+ max_positions = config.max_position_embeddings
561
+ self.register_buffer(
562
+ "bias", torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)), persistent=False
563
+ )
564
+
565
+ self.gradient_checkpointing = False
566
+
567
+ # Initialize weights and apply final processing
568
+ self.post_init()
569
+
570
+ def get_input_embeddings(self):
571
+ return self.wte
572
+
573
+ def set_input_embeddings(self, new_embeddings):
574
+ self.wte = new_embeddings
575
+
576
+ @add_start_docstrings_to_model_forward(GPT_BIGCODE_INPUTS_DOCSTRING)
577
+ def forward(
578
+ self,
579
+ input_ids: Optional[torch.Tensor] = None,
580
+ past_key_values: Optional[List[torch.Tensor]] = None,
581
+ attention_mask: Optional[torch.Tensor] = None,
582
+ token_type_ids: Optional[torch.Tensor] = None,
583
+ position_ids: Optional[torch.Tensor] = None,
584
+ head_mask: Optional[torch.Tensor] = None,
585
+ inputs_embeds: Optional[torch.Tensor] = None,
586
+ encoder_hidden_states: Optional[torch.Tensor] = None,
587
+ encoder_attention_mask: Optional[torch.Tensor] = None,
588
+ use_cache: Optional[bool] = None,
589
+ output_attentions: Optional[bool] = None,
590
+ output_hidden_states: Optional[bool] = None,
591
+ return_dict: Optional[bool] = None,
592
+ ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
593
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
594
+ output_hidden_states = (
595
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
596
+ )
597
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
598
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
599
+
600
+ if input_ids is not None and inputs_embeds is not None:
601
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
602
+ elif input_ids is not None:
603
+ input_shape = input_ids.size()
604
+ input_ids = input_ids.reshape(-1, input_shape[-1])
605
+ batch_size = input_ids.shape[0]
606
+ elif inputs_embeds is not None:
607
+ input_shape = inputs_embeds.size()[:-1]
608
+ batch_size = inputs_embeds.shape[0]
609
+ else:
610
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
611
+
612
+ if batch_size <= 0:
613
+ raise ValueError("batch_size has to be defined and > 0")
614
+
615
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
616
+
617
+ if token_type_ids is not None:
618
+ token_type_ids = token_type_ids.reshape(-1, input_shape[-1])
619
+ if position_ids is not None:
620
+ position_ids = position_ids.reshape(-1, input_shape[-1])
621
+
622
+ if past_key_values is None:
623
+ past_length = 0
624
+ past_key_values = tuple([None] * len(self.h))
625
+ else:
626
+ past_length = past_key_values[0][0].size(-2)
627
+
628
+ if attention_mask is not None and len(attention_mask.shape) == 2 and position_ids is None:
629
+ # create position_ids on the fly for batch generation
630
+ position_ids = attention_mask.long().cumsum(-1) - 1
631
+ position_ids.masked_fill_(attention_mask == 0, 1)
632
+ if past_length > 0:
633
+ position_ids = position_ids[:, past_length : input_shape[-1] + past_length]
634
+ elif position_ids is None:
635
+ position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
636
+ position_ids = position_ids.unsqueeze(0).reshape(-1, input_shape[-1])
637
+
638
+ # Self-attention mask.
639
+ query_length = input_shape[-1]
640
+ key_length = past_length + query_length
641
+ self_attention_mask = self.bias[None, key_length - query_length : key_length, :key_length]
642
+
643
+ if attention_mask is not None:
644
+ self_attention_mask = self_attention_mask * attention_mask.reshape(batch_size, 1, -1).to(
645
+ dtype=torch.bool, device=self_attention_mask.device
646
+ )
647
+
648
+ # MQA models: (batch_size, query_length, n_heads, key_length)
649
+ # MHA models: (batch_size, n_heads, query_length, key_length)
650
+ attention_mask = self_attention_mask.unsqueeze(1)
651
+
652
+ encoder_attention_mask = None
653
+
654
+ # Prepare head mask if needed
655
+ # 1.0 in head_mask indicate we keep the head
656
+ # attention_probs has shape bsz x n_heads x N x N
657
+ # head_mask has shape n_layer x batch x n_heads x N x N
658
+ head_mask = self.get_head_mask(head_mask, self.config.n_layer)
659
+
660
+ if inputs_embeds is None:
661
+ inputs_embeds = self.wte(input_ids)
662
+
663
+ hidden_states = inputs_embeds
664
+ if self.position_embedding_type == "learned_absolute":
665
+ position_embeds = self.wpe(position_ids)
666
+ hidden_states = hidden_states + position_embeds
667
+
668
+ if token_type_ids is not None:
669
+ token_type_embeds = self.wte(token_type_ids)
670
+ hidden_states = hidden_states + token_type_embeds
671
+
672
+ hidden_states = self.drop(hidden_states)
673
+
674
+ output_shape = input_shape + (hidden_states.size(-1),)
675
+
676
+ presents = [] if use_cache else None
677
+ all_self_attentions = () if output_attentions else None
678
+ all_hidden_states = () if output_hidden_states else None
679
+ for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
680
+ if output_hidden_states:
681
+ all_hidden_states = all_hidden_states + (hidden_states,)
682
+
683
+ if self.gradient_checkpointing and self.training:
684
+
685
+ def create_custom_forward(module):
686
+ def custom_forward(*inputs):
687
+ # None for past_key_value
688
+ return module(*inputs, use_cache, output_attentions)
689
+
690
+ return custom_forward
691
+
692
+ outputs = torch.utils.checkpoint.checkpoint(
693
+ create_custom_forward(block),
694
+ hidden_states,
695
+ None,
696
+ attention_mask,
697
+ position_ids,
698
+ head_mask[i],
699
+ encoder_hidden_states,
700
+ encoder_attention_mask,
701
+ )
702
+ else:
703
+ outputs = block(
704
+ hidden_states,
705
+ layer_past=layer_past,
706
+ attention_mask=attention_mask,
707
+ position_ids=position_ids,
708
+ head_mask=head_mask[i],
709
+ encoder_hidden_states=encoder_hidden_states,
710
+ encoder_attention_mask=encoder_attention_mask,
711
+ use_cache=use_cache,
712
+ output_attentions=output_attentions,
713
+ )
714
+
715
+ hidden_states = outputs[0]
716
+ if use_cache:
717
+ presents.append(outputs[1])
718
+
719
+ if output_attentions:
720
+ all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)
721
+
722
+ hidden_states = self.ln_f(hidden_states)
723
+ hidden_states = hidden_states.reshape(output_shape)
724
+ # Add last hidden state
725
+ if output_hidden_states:
726
+ all_hidden_states = all_hidden_states + (hidden_states,)
727
+
728
+
729
+ if not return_dict:
730
+ return tuple(
731
+ v
732
+ for v in [hidden_states, presents, all_hidden_states, all_self_attentions]
733
+ if v is not None
734
+ )
735
+
736
+ return BaseModelOutputWithPastAndCrossAttentions(
737
+ last_hidden_state=hidden_states,
738
+ past_key_values=presents,
739
+ hidden_states=all_hidden_states,
740
+ attentions=all_self_attentions,
741
+ )
742
+
743
+
744
+ @add_start_docstrings(
745
+ """
746
+ The CodeShell Model transformer with a language modeling head on top (linear layer with weights tied to the input
747
+ embeddings).
748
+ """,
749
+ GPT_BIGCODE_START_DOCSTRING,
750
+ )
751
+ class CodeShellForCausalLM(CodeShellPreTrainedModel):
752
+ _tied_weights_keys = ["lm_head.weight"]
753
+
754
+ def __init__(self, config):
755
+ super().__init__(config)
756
+ self.transformer = CodeShellModel(config)
757
+ self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
758
+
759
+ # Initialize weights and apply final processing
760
+ self.post_init()
761
+
762
+ def quantize(self, bits: int):
763
+ try:
764
+ import bitsandbytes
765
+ from .quantizer import quantize_online
766
+ except ImportError:
767
+ raise ImportError("Needs bitsandbytes to run quantize.")
768
+ return quantize_online(self, bits)
769
+
770
+ def get_output_embeddings(self):
771
+ return self.lm_head
772
+
773
+ def set_output_embeddings(self, new_embeddings):
774
+ self.lm_head = new_embeddings
775
+
776
+ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
777
+ token_type_ids = kwargs.get("token_type_ids", None)
778
+ # only last token for inputs_ids if past is defined in kwargs
779
+ if past_key_values:
780
+ input_ids = input_ids[:, -1].unsqueeze(-1)
781
+ if token_type_ids is not None:
782
+ token_type_ids = token_type_ids[:, -1].unsqueeze(-1)
783
+
784
+ attention_mask = kwargs.get("attention_mask", None)
785
+ position_ids = kwargs.get("position_ids", None)
786
+
787
+ if attention_mask is not None and position_ids is None:
788
+ # create position_ids on the fly for batch generation
789
+ position_ids = attention_mask.long().cumsum(-1) - 1
790
+ position_ids.masked_fill_(attention_mask == 0, 1)
791
+ if past_key_values:
792
+ position_ids = position_ids[:, -1].unsqueeze(-1)
793
+ else:
794
+ position_ids = None
795
+
796
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
797
+ if inputs_embeds is not None and past_key_values is None:
798
+ model_inputs = {"inputs_embeds": inputs_embeds}
799
+ else:
800
+ model_inputs = {"input_ids": input_ids}
801
+
802
+ model_inputs.update(
803
+ {
804
+ "past_key_values": past_key_values,
805
+ "use_cache": kwargs.get("use_cache"),
806
+ "position_ids": position_ids,
807
+ "attention_mask": attention_mask,
808
+ "token_type_ids": token_type_ids,
809
+ }
810
+ )
811
+ return model_inputs
812
+
813
+ @add_start_docstrings_to_model_forward(GPT_BIGCODE_INPUTS_DOCSTRING)
814
+ def forward(
815
+ self,
816
+ input_ids: Optional[torch.Tensor] = None,
817
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
818
+ attention_mask: Optional[torch.Tensor] = None,
819
+ token_type_ids: Optional[torch.Tensor] = None,
820
+ position_ids: Optional[torch.Tensor] = None,
821
+ head_mask: Optional[torch.Tensor] = None,
822
+ inputs_embeds: Optional[torch.Tensor] = None,
823
+ encoder_hidden_states: Optional[torch.Tensor] = None,
824
+ encoder_attention_mask: Optional[torch.Tensor] = None,
825
+ labels: Optional[torch.Tensor] = None,
826
+ use_cache: Optional[bool] = None,
827
+ output_attentions: Optional[bool] = None,
828
+ output_hidden_states: Optional[bool] = None,
829
+ return_dict: Optional[bool] = None,
830
+ ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
831
+ r"""
832
+ labels (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
833
+ Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
834
+ `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
835
+ are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
836
+ """
837
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
838
+
839
+ transformer_outputs = self.transformer(
840
+ input_ids,
841
+ past_key_values=past_key_values,
842
+ attention_mask=attention_mask,
843
+ token_type_ids=token_type_ids,
844
+ position_ids=position_ids,
845
+ head_mask=head_mask,
846
+ inputs_embeds=inputs_embeds,
847
+ encoder_hidden_states=encoder_hidden_states,
848
+ encoder_attention_mask=encoder_attention_mask,
849
+ use_cache=use_cache,
850
+ output_attentions=output_attentions,
851
+ output_hidden_states=output_hidden_states,
852
+ return_dict=return_dict,
853
+ )
854
+ hidden_states = transformer_outputs[0]
855
+ lm_logits = self.lm_head(hidden_states)
856
+ loss = None
857
+ if labels is not None:
858
+ # Shift so that tokens < n predict n
859
+ shift_logits = lm_logits[..., :-1, :].contiguous()
860
+ shift_labels = labels[..., 1:].contiguous().to(shift_logits.device)
861
+ # Flatten the tokens
862
+ loss_fct = CrossEntropyLoss()
863
+ loss = loss_fct(shift_logits.reshape(-1, shift_logits.size(-1)), shift_labels.reshape(-1))
864
+
865
+ if not return_dict:
866
+ output = (lm_logits,) + transformer_outputs[1:]
867
+ return ((loss,) + output) if loss is not None else output
868
+
869
+ return CausalLMOutputWithCrossAttentions(
870
+ loss=loss,
871
+ logits=lm_logits,
872
+ past_key_values=transformer_outputs.past_key_values,
873
+ hidden_states=transformer_outputs.hidden_states,
874
+ attentions=transformer_outputs.attentions,
875
+ )
876
+
877
+ @staticmethod
878
+ def _reorder_cache(past_key_values, beam_idx):
879
+ reordered_past = ()
880
+ for layer_past in past_key_values:
881
+ reordered_past += (
882
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
883
+ )
884
+ return reordered_past
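
The attention layer above implements grouped-query attention: 32 query heads (`n_head`) share 8 key/value groups (`num_query_groups`), and `repeat_kv` expands the cached key/value tensors back to 32 heads before the attention matmul. A shape-only sketch, with `repeat_kv` copied from the file above and sizes taken from `config.json`:

```python
# Shape bookkeeping for the grouped-query attention in CodeShellAttention.
# repeat_kv is copied from modeling_codeshell.py; the tensors are random.
import torch

def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)

bsz, q_len = 1, 16
n_head, n_kv_groups, head_dim = 32, 8, 4096 // 32  # from config.json

key_states = torch.randn(bsz, n_kv_groups, q_len, head_dim)
expanded = repeat_kv(key_states, n_head // n_kv_groups)
print(expanded.shape)  # torch.Size([1, 32, 16, 128])
```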
pytorch_model-00001-of-00002.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d620ec905db38f0a71dcd520cf5e37845ef8f82201ff961babc5e09b8e1d81f7
3
+ size 9955739573
pytorch_model-00002-of-00002.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f3e0a4089c60dc30ca3341cdf222dc1f35ed8f64d3f8900c51186b913ed8ec1
3
+ size 5420546104
pytorch_model.bin.index.json ADDED
@@ -0,0 +1,557 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 15376102656
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "pytorch_model-00001-of-00002.bin",
7
+ "transformer.h.0.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
8
+ "transformer.h.0.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
9
+ "transformer.h.0.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
10
+ "transformer.h.0.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
11
+ "transformer.h.0.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
12
+ "transformer.h.0.ln_1.bias": "pytorch_model-00001-of-00002.bin",
13
+ "transformer.h.0.ln_1.weight": "pytorch_model-00001-of-00002.bin",
14
+ "transformer.h.0.ln_2.bias": "pytorch_model-00001-of-00002.bin",
15
+ "transformer.h.0.ln_2.weight": "pytorch_model-00001-of-00002.bin",
16
+ "transformer.h.0.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
17
+ "transformer.h.0.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
18
+ "transformer.h.0.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
19
+ "transformer.h.0.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
20
+ "transformer.h.1.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
21
+ "transformer.h.1.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
22
+ "transformer.h.1.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
23
+ "transformer.h.1.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
24
+ "transformer.h.1.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
25
+ "transformer.h.1.ln_1.bias": "pytorch_model-00001-of-00002.bin",
26
+ "transformer.h.1.ln_1.weight": "pytorch_model-00001-of-00002.bin",
27
+ "transformer.h.1.ln_2.bias": "pytorch_model-00001-of-00002.bin",
28
+ "transformer.h.1.ln_2.weight": "pytorch_model-00001-of-00002.bin",
29
+ "transformer.h.1.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
30
+ "transformer.h.1.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
31
+ "transformer.h.1.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
32
+ "transformer.h.1.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
33
+ "transformer.h.10.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
34
+ "transformer.h.10.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
35
+ "transformer.h.10.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
36
+ "transformer.h.10.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
37
+ "transformer.h.10.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
38
+ "transformer.h.10.ln_1.bias": "pytorch_model-00001-of-00002.bin",
39
+ "transformer.h.10.ln_1.weight": "pytorch_model-00001-of-00002.bin",
40
+ "transformer.h.10.ln_2.bias": "pytorch_model-00001-of-00002.bin",
41
+ "transformer.h.10.ln_2.weight": "pytorch_model-00001-of-00002.bin",
42
+ "transformer.h.10.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
43
+ "transformer.h.10.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
44
+ "transformer.h.10.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
45
+ "transformer.h.10.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
46
+ "transformer.h.11.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
47
+ "transformer.h.11.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
48
+ "transformer.h.11.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
49
+ "transformer.h.11.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
50
+ "transformer.h.11.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
51
+ "transformer.h.11.ln_1.bias": "pytorch_model-00001-of-00002.bin",
52
+ "transformer.h.11.ln_1.weight": "pytorch_model-00001-of-00002.bin",
53
+ "transformer.h.11.ln_2.bias": "pytorch_model-00001-of-00002.bin",
54
+ "transformer.h.11.ln_2.weight": "pytorch_model-00001-of-00002.bin",
55
+ "transformer.h.11.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
56
+ "transformer.h.11.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
57
+ "transformer.h.11.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
58
+ "transformer.h.11.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
59
+ "transformer.h.12.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
60
+ "transformer.h.12.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
61
+ "transformer.h.12.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
62
+ "transformer.h.12.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
63
+ "transformer.h.12.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
64
+ "transformer.h.12.ln_1.bias": "pytorch_model-00001-of-00002.bin",
65
+ "transformer.h.12.ln_1.weight": "pytorch_model-00001-of-00002.bin",
66
+ "transformer.h.12.ln_2.bias": "pytorch_model-00001-of-00002.bin",
67
+ "transformer.h.12.ln_2.weight": "pytorch_model-00001-of-00002.bin",
68
+ "transformer.h.12.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
69
+ "transformer.h.12.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
70
+ "transformer.h.12.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
71
+ "transformer.h.12.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
72
+ "transformer.h.13.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
73
+ "transformer.h.13.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
74
+ "transformer.h.13.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
75
+ "transformer.h.13.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
76
+ "transformer.h.13.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
77
+ "transformer.h.13.ln_1.bias": "pytorch_model-00001-of-00002.bin",
78
+ "transformer.h.13.ln_1.weight": "pytorch_model-00001-of-00002.bin",
79
+ "transformer.h.13.ln_2.bias": "pytorch_model-00001-of-00002.bin",
80
+ "transformer.h.13.ln_2.weight": "pytorch_model-00001-of-00002.bin",
81
+ "transformer.h.13.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
82
+ "transformer.h.13.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
83
+ "transformer.h.13.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
84
+ "transformer.h.13.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
85
+ "transformer.h.14.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
86
+ "transformer.h.14.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
87
+ "transformer.h.14.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
88
+ "transformer.h.14.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
89
+ "transformer.h.14.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
90
+ "transformer.h.14.ln_1.bias": "pytorch_model-00001-of-00002.bin",
91
+ "transformer.h.14.ln_1.weight": "pytorch_model-00001-of-00002.bin",
92
+ "transformer.h.14.ln_2.bias": "pytorch_model-00001-of-00002.bin",
93
+ "transformer.h.14.ln_2.weight": "pytorch_model-00001-of-00002.bin",
94
+ "transformer.h.14.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
95
+ "transformer.h.14.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
96
+ "transformer.h.14.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
97
+ "transformer.h.14.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
98
+ "transformer.h.15.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
99
+ "transformer.h.15.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
100
+ "transformer.h.15.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
101
+ "transformer.h.15.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
102
+ "transformer.h.15.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
103
+ "transformer.h.15.ln_1.bias": "pytorch_model-00001-of-00002.bin",
104
+ "transformer.h.15.ln_1.weight": "pytorch_model-00001-of-00002.bin",
105
+ "transformer.h.15.ln_2.bias": "pytorch_model-00001-of-00002.bin",
106
+ "transformer.h.15.ln_2.weight": "pytorch_model-00001-of-00002.bin",
107
+ "transformer.h.15.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
108
+ "transformer.h.15.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
109
+ "transformer.h.15.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
110
+ "transformer.h.15.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
111
+ "transformer.h.16.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
112
+ "transformer.h.16.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
113
+ "transformer.h.16.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
114
+ "transformer.h.16.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
115
+ "transformer.h.16.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
116
+ "transformer.h.16.ln_1.bias": "pytorch_model-00001-of-00002.bin",
117
+ "transformer.h.16.ln_1.weight": "pytorch_model-00001-of-00002.bin",
118
+ "transformer.h.16.ln_2.bias": "pytorch_model-00001-of-00002.bin",
119
+ "transformer.h.16.ln_2.weight": "pytorch_model-00001-of-00002.bin",
120
+ "transformer.h.16.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
121
+ "transformer.h.16.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
122
+ "transformer.h.16.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
123
+ "transformer.h.16.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
124
+ "transformer.h.17.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
125
+ "transformer.h.17.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
126
+ "transformer.h.17.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
127
+ "transformer.h.17.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
128
+ "transformer.h.17.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
129
+ "transformer.h.17.ln_1.bias": "pytorch_model-00001-of-00002.bin",
130
+ "transformer.h.17.ln_1.weight": "pytorch_model-00001-of-00002.bin",
131
+ "transformer.h.17.ln_2.bias": "pytorch_model-00001-of-00002.bin",
132
+ "transformer.h.17.ln_2.weight": "pytorch_model-00001-of-00002.bin",
133
+ "transformer.h.17.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
134
+ "transformer.h.17.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
135
+ "transformer.h.17.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
136
+ "transformer.h.17.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
137
+ "transformer.h.18.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
138
+ "transformer.h.18.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
139
+ "transformer.h.18.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
140
+ "transformer.h.18.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
141
+ "transformer.h.18.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
142
+ "transformer.h.18.ln_1.bias": "pytorch_model-00001-of-00002.bin",
143
+ "transformer.h.18.ln_1.weight": "pytorch_model-00001-of-00002.bin",
144
+ "transformer.h.18.ln_2.bias": "pytorch_model-00001-of-00002.bin",
145
+ "transformer.h.18.ln_2.weight": "pytorch_model-00001-of-00002.bin",
146
+ "transformer.h.18.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
147
+ "transformer.h.18.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
148
+ "transformer.h.18.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
149
+ "transformer.h.18.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
150
+ "transformer.h.19.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
151
+ "transformer.h.19.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
152
+ "transformer.h.19.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
153
+ "transformer.h.19.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
154
+ "transformer.h.19.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
155
+ "transformer.h.19.ln_1.bias": "pytorch_model-00001-of-00002.bin",
156
+ "transformer.h.19.ln_1.weight": "pytorch_model-00001-of-00002.bin",
157
+ "transformer.h.19.ln_2.bias": "pytorch_model-00001-of-00002.bin",
158
+ "transformer.h.19.ln_2.weight": "pytorch_model-00001-of-00002.bin",
159
+ "transformer.h.19.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
160
+ "transformer.h.19.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
161
+ "transformer.h.19.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
162
+ "transformer.h.19.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
163
+ "transformer.h.2.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
164
+ "transformer.h.2.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
165
+ "transformer.h.2.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
166
+ "transformer.h.2.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
167
+ "transformer.h.2.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
168
+ "transformer.h.2.ln_1.bias": "pytorch_model-00001-of-00002.bin",
169
+ "transformer.h.2.ln_1.weight": "pytorch_model-00001-of-00002.bin",
170
+ "transformer.h.2.ln_2.bias": "pytorch_model-00001-of-00002.bin",
171
+ "transformer.h.2.ln_2.weight": "pytorch_model-00001-of-00002.bin",
172
+ "transformer.h.2.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
173
+ "transformer.h.2.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
174
+ "transformer.h.2.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
175
+ "transformer.h.2.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
176
+ "transformer.h.20.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
177
+ "transformer.h.20.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
178
+ "transformer.h.20.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
179
+ "transformer.h.20.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
180
+ "transformer.h.20.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
181
+ "transformer.h.20.ln_1.bias": "pytorch_model-00001-of-00002.bin",
182
+ "transformer.h.20.ln_1.weight": "pytorch_model-00001-of-00002.bin",
183
+ "transformer.h.20.ln_2.bias": "pytorch_model-00001-of-00002.bin",
184
+ "transformer.h.20.ln_2.weight": "pytorch_model-00001-of-00002.bin",
185
+ "transformer.h.20.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
186
+ "transformer.h.20.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
187
+ "transformer.h.20.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
188
+ "transformer.h.20.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
189
+ "transformer.h.21.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
190
+ "transformer.h.21.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
191
+ "transformer.h.21.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
192
+ "transformer.h.21.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
193
+ "transformer.h.21.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
194
+ "transformer.h.21.ln_1.bias": "pytorch_model-00001-of-00002.bin",
195
+ "transformer.h.21.ln_1.weight": "pytorch_model-00001-of-00002.bin",
196
+ "transformer.h.21.ln_2.bias": "pytorch_model-00001-of-00002.bin",
197
+ "transformer.h.21.ln_2.weight": "pytorch_model-00001-of-00002.bin",
198
+ "transformer.h.21.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
199
+ "transformer.h.21.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
200
+ "transformer.h.21.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
201
+ "transformer.h.21.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
202
+ "transformer.h.22.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
203
+ "transformer.h.22.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
204
+ "transformer.h.22.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
205
+ "transformer.h.22.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
206
+ "transformer.h.22.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
207
+ "transformer.h.22.ln_1.bias": "pytorch_model-00001-of-00002.bin",
208
+ "transformer.h.22.ln_1.weight": "pytorch_model-00001-of-00002.bin",
209
+ "transformer.h.22.ln_2.bias": "pytorch_model-00001-of-00002.bin",
210
+ "transformer.h.22.ln_2.weight": "pytorch_model-00001-of-00002.bin",
211
+ "transformer.h.22.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
212
+ "transformer.h.22.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
213
+ "transformer.h.22.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
214
+ "transformer.h.22.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
215
+ "transformer.h.23.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
216
+ "transformer.h.23.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
217
+ "transformer.h.23.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
218
+ "transformer.h.23.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
219
+ "transformer.h.23.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
220
+ "transformer.h.23.ln_1.bias": "pytorch_model-00001-of-00002.bin",
221
+ "transformer.h.23.ln_1.weight": "pytorch_model-00001-of-00002.bin",
222
+ "transformer.h.23.ln_2.bias": "pytorch_model-00001-of-00002.bin",
223
+ "transformer.h.23.ln_2.weight": "pytorch_model-00001-of-00002.bin",
224
+ "transformer.h.23.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
225
+ "transformer.h.23.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
226
+ "transformer.h.23.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
227
+ "transformer.h.23.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
228
+ "transformer.h.24.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
229
+ "transformer.h.24.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
230
+ "transformer.h.24.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
231
+ "transformer.h.24.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
232
+ "transformer.h.24.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
233
+ "transformer.h.24.ln_1.bias": "pytorch_model-00001-of-00002.bin",
234
+ "transformer.h.24.ln_1.weight": "pytorch_model-00001-of-00002.bin",
235
+ "transformer.h.24.ln_2.bias": "pytorch_model-00001-of-00002.bin",
236
+ "transformer.h.24.ln_2.weight": "pytorch_model-00001-of-00002.bin",
237
+ "transformer.h.24.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
238
+ "transformer.h.24.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
239
+ "transformer.h.24.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
240
+ "transformer.h.24.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
241
+ "transformer.h.25.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
242
+ "transformer.h.25.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
243
+ "transformer.h.25.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
244
+ "transformer.h.25.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
245
+ "transformer.h.25.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
246
+ "transformer.h.25.ln_1.bias": "pytorch_model-00001-of-00002.bin",
247
+ "transformer.h.25.ln_1.weight": "pytorch_model-00001-of-00002.bin",
248
+ "transformer.h.25.ln_2.bias": "pytorch_model-00001-of-00002.bin",
249
+ "transformer.h.25.ln_2.weight": "pytorch_model-00001-of-00002.bin",
250
+ "transformer.h.25.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
251
+ "transformer.h.25.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
252
+ "transformer.h.25.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
253
+ "transformer.h.25.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
254
+ "transformer.h.26.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
255
+ "transformer.h.26.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
256
+ "transformer.h.26.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
257
+ "transformer.h.26.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
258
+ "transformer.h.26.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
259
+ "transformer.h.26.ln_1.bias": "pytorch_model-00001-of-00002.bin",
260
+ "transformer.h.26.ln_1.weight": "pytorch_model-00001-of-00002.bin",
261
+ "transformer.h.26.ln_2.bias": "pytorch_model-00001-of-00002.bin",
262
+ "transformer.h.26.ln_2.weight": "pytorch_model-00001-of-00002.bin",
263
+ "transformer.h.26.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
264
+ "transformer.h.26.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
265
+ "transformer.h.26.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
266
+ "transformer.h.26.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
267
+ "transformer.h.27.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
268
+ "transformer.h.27.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
269
+ "transformer.h.27.attn.c_proj.bias": "pytorch_model-00002-of-00002.bin",
270
+ "transformer.h.27.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
271
+ "transformer.h.27.attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
272
+ "transformer.h.27.ln_1.bias": "pytorch_model-00002-of-00002.bin",
273
+ "transformer.h.27.ln_1.weight": "pytorch_model-00002-of-00002.bin",
274
+ "transformer.h.27.ln_2.bias": "pytorch_model-00002-of-00002.bin",
275
+ "transformer.h.27.ln_2.weight": "pytorch_model-00002-of-00002.bin",
276
+ "transformer.h.27.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
277
+ "transformer.h.27.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
278
+ "transformer.h.27.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
279
+ "transformer.h.27.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
280
+ "transformer.h.28.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
281
+ "transformer.h.28.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
282
+ "transformer.h.28.attn.c_proj.bias": "pytorch_model-00002-of-00002.bin",
283
+ "transformer.h.28.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
284
+ "transformer.h.28.attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
285
+ "transformer.h.28.ln_1.bias": "pytorch_model-00002-of-00002.bin",
286
+ "transformer.h.28.ln_1.weight": "pytorch_model-00002-of-00002.bin",
287
+ "transformer.h.28.ln_2.bias": "pytorch_model-00002-of-00002.bin",
288
+ "transformer.h.28.ln_2.weight": "pytorch_model-00002-of-00002.bin",
289
+ "transformer.h.28.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
290
+ "transformer.h.28.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
291
+ "transformer.h.28.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
292
+ "transformer.h.28.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
293
+ "transformer.h.29.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
294
+ "transformer.h.29.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
295
+ "transformer.h.29.attn.c_proj.bias": "pytorch_model-00002-of-00002.bin",
296
+ "transformer.h.29.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
297
+ "transformer.h.29.attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
298
+ "transformer.h.29.ln_1.bias": "pytorch_model-00002-of-00002.bin",
299
+ "transformer.h.29.ln_1.weight": "pytorch_model-00002-of-00002.bin",
300
+ "transformer.h.29.ln_2.bias": "pytorch_model-00002-of-00002.bin",
301
+ "transformer.h.29.ln_2.weight": "pytorch_model-00002-of-00002.bin",
302
+ "transformer.h.29.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
303
+ "transformer.h.29.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
304
+ "transformer.h.29.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
305
+ "transformer.h.29.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
306
+ "transformer.h.3.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
307
+ "transformer.h.3.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
308
+ "transformer.h.3.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
309
+ "transformer.h.3.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
310
+ "transformer.h.3.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
311
+ "transformer.h.3.ln_1.bias": "pytorch_model-00001-of-00002.bin",
312
+ "transformer.h.3.ln_1.weight": "pytorch_model-00001-of-00002.bin",
313
+ "transformer.h.3.ln_2.bias": "pytorch_model-00001-of-00002.bin",
314
+ "transformer.h.3.ln_2.weight": "pytorch_model-00001-of-00002.bin",
315
+ "transformer.h.3.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
316
+ "transformer.h.3.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
317
+ "transformer.h.3.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
318
+ "transformer.h.3.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
319
+ "transformer.h.30.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
320
+ "transformer.h.30.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
321
+ "transformer.h.30.attn.c_proj.bias": "pytorch_model-00002-of-00002.bin",
322
+ "transformer.h.30.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
323
+ "transformer.h.30.attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
324
+ "transformer.h.30.ln_1.bias": "pytorch_model-00002-of-00002.bin",
325
+ "transformer.h.30.ln_1.weight": "pytorch_model-00002-of-00002.bin",
326
+ "transformer.h.30.ln_2.bias": "pytorch_model-00002-of-00002.bin",
327
+ "transformer.h.30.ln_2.weight": "pytorch_model-00002-of-00002.bin",
328
+ "transformer.h.30.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
329
+ "transformer.h.30.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
330
+ "transformer.h.30.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
331
+ "transformer.h.30.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
332
+ "transformer.h.31.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
333
+ "transformer.h.31.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
334
+ "transformer.h.31.attn.c_proj.bias": "pytorch_model-00002-of-00002.bin",
335
+ "transformer.h.31.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
336
+ "transformer.h.31.attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
337
+ "transformer.h.31.ln_1.bias": "pytorch_model-00002-of-00002.bin",
338
+ "transformer.h.31.ln_1.weight": "pytorch_model-00002-of-00002.bin",
339
+ "transformer.h.31.ln_2.bias": "pytorch_model-00002-of-00002.bin",
340
+ "transformer.h.31.ln_2.weight": "pytorch_model-00002-of-00002.bin",
341
+ "transformer.h.31.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
342
+ "transformer.h.31.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
343
+ "transformer.h.31.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
344
+ "transformer.h.31.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
345
+ "transformer.h.32.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
346
+ "transformer.h.32.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
347
+ "transformer.h.32.attn.c_proj.bias": "pytorch_model-00002-of-00002.bin",
348
+ "transformer.h.32.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
349
+ "transformer.h.32.attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
350
+ "transformer.h.32.ln_1.bias": "pytorch_model-00002-of-00002.bin",
351
+ "transformer.h.32.ln_1.weight": "pytorch_model-00002-of-00002.bin",
352
+ "transformer.h.32.ln_2.bias": "pytorch_model-00002-of-00002.bin",
353
+ "transformer.h.32.ln_2.weight": "pytorch_model-00002-of-00002.bin",
354
+ "transformer.h.32.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
355
+ "transformer.h.32.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
356
+ "transformer.h.32.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
357
+ "transformer.h.32.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
358
+ "transformer.h.33.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
359
+ "transformer.h.33.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
360
+ "transformer.h.33.attn.c_proj.bias": "pytorch_model-00002-of-00002.bin",
361
+ "transformer.h.33.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
362
+ "transformer.h.33.attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
363
+ "transformer.h.33.ln_1.bias": "pytorch_model-00002-of-00002.bin",
364
+ "transformer.h.33.ln_1.weight": "pytorch_model-00002-of-00002.bin",
365
+ "transformer.h.33.ln_2.bias": "pytorch_model-00002-of-00002.bin",
366
+ "transformer.h.33.ln_2.weight": "pytorch_model-00002-of-00002.bin",
367
+ "transformer.h.33.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
368
+ "transformer.h.33.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
369
+ "transformer.h.33.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
370
+ "transformer.h.33.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
371
+ "transformer.h.34.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
372
+ "transformer.h.34.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
373
+ "transformer.h.34.attn.c_proj.bias": "pytorch_model-00002-of-00002.bin",
374
+ "transformer.h.34.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
375
+ "transformer.h.34.attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
376
+ "transformer.h.34.ln_1.bias": "pytorch_model-00002-of-00002.bin",
377
+ "transformer.h.34.ln_1.weight": "pytorch_model-00002-of-00002.bin",
378
+ "transformer.h.34.ln_2.bias": "pytorch_model-00002-of-00002.bin",
379
+ "transformer.h.34.ln_2.weight": "pytorch_model-00002-of-00002.bin",
380
+ "transformer.h.34.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
381
+ "transformer.h.34.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
382
+ "transformer.h.34.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
383
+ "transformer.h.34.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
384
+ "transformer.h.35.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
385
+ "transformer.h.35.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
386
+ "transformer.h.35.attn.c_proj.bias": "pytorch_model-00002-of-00002.bin",
387
+ "transformer.h.35.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
388
+ "transformer.h.35.attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
389
+ "transformer.h.35.ln_1.bias": "pytorch_model-00002-of-00002.bin",
390
+ "transformer.h.35.ln_1.weight": "pytorch_model-00002-of-00002.bin",
391
+ "transformer.h.35.ln_2.bias": "pytorch_model-00002-of-00002.bin",
392
+ "transformer.h.35.ln_2.weight": "pytorch_model-00002-of-00002.bin",
393
+ "transformer.h.35.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
394
+ "transformer.h.35.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
395
+ "transformer.h.35.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
396
+ "transformer.h.35.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
397
+ "transformer.h.36.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
398
+ "transformer.h.36.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
399
+ "transformer.h.36.attn.c_proj.bias": "pytorch_model-00002-of-00002.bin",
400
+ "transformer.h.36.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
401
+ "transformer.h.36.attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
402
+ "transformer.h.36.ln_1.bias": "pytorch_model-00002-of-00002.bin",
403
+ "transformer.h.36.ln_1.weight": "pytorch_model-00002-of-00002.bin",
404
+ "transformer.h.36.ln_2.bias": "pytorch_model-00002-of-00002.bin",
405
+ "transformer.h.36.ln_2.weight": "pytorch_model-00002-of-00002.bin",
406
+ "transformer.h.36.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
407
+ "transformer.h.36.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
408
+ "transformer.h.36.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
409
+ "transformer.h.36.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
410
+ "transformer.h.37.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
411
+ "transformer.h.37.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
412
+ "transformer.h.37.attn.c_proj.bias": "pytorch_model-00002-of-00002.bin",
413
+ "transformer.h.37.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
414
+ "transformer.h.37.attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
415
+ "transformer.h.37.ln_1.bias": "pytorch_model-00002-of-00002.bin",
416
+ "transformer.h.37.ln_1.weight": "pytorch_model-00002-of-00002.bin",
417
+ "transformer.h.37.ln_2.bias": "pytorch_model-00002-of-00002.bin",
418
+ "transformer.h.37.ln_2.weight": "pytorch_model-00002-of-00002.bin",
419
+ "transformer.h.37.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
420
+ "transformer.h.37.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
421
+ "transformer.h.37.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
422
+ "transformer.h.37.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
423
+ "transformer.h.38.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
424
+ "transformer.h.38.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
425
+ "transformer.h.38.attn.c_proj.bias": "pytorch_model-00002-of-00002.bin",
426
+ "transformer.h.38.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
427
+ "transformer.h.38.attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
428
+ "transformer.h.38.ln_1.bias": "pytorch_model-00002-of-00002.bin",
429
+ "transformer.h.38.ln_1.weight": "pytorch_model-00002-of-00002.bin",
430
+ "transformer.h.38.ln_2.bias": "pytorch_model-00002-of-00002.bin",
431
+ "transformer.h.38.ln_2.weight": "pytorch_model-00002-of-00002.bin",
432
+ "transformer.h.38.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
433
+ "transformer.h.38.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
434
+ "transformer.h.38.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
435
+ "transformer.h.38.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
436
+ "transformer.h.39.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
437
+ "transformer.h.39.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
438
+ "transformer.h.39.attn.c_proj.bias": "pytorch_model-00002-of-00002.bin",
439
+ "transformer.h.39.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
440
+ "transformer.h.39.attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
441
+ "transformer.h.39.ln_1.bias": "pytorch_model-00002-of-00002.bin",
442
+ "transformer.h.39.ln_1.weight": "pytorch_model-00002-of-00002.bin",
443
+ "transformer.h.39.ln_2.bias": "pytorch_model-00002-of-00002.bin",
444
+ "transformer.h.39.ln_2.weight": "pytorch_model-00002-of-00002.bin",
445
+ "transformer.h.39.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
446
+ "transformer.h.39.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
447
+ "transformer.h.39.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
448
+ "transformer.h.39.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
449
+ "transformer.h.4.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
450
+ "transformer.h.4.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
451
+ "transformer.h.4.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
452
+ "transformer.h.4.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
453
+ "transformer.h.4.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
454
+ "transformer.h.4.ln_1.bias": "pytorch_model-00001-of-00002.bin",
455
+ "transformer.h.4.ln_1.weight": "pytorch_model-00001-of-00002.bin",
456
+ "transformer.h.4.ln_2.bias": "pytorch_model-00001-of-00002.bin",
457
+ "transformer.h.4.ln_2.weight": "pytorch_model-00001-of-00002.bin",
458
+ "transformer.h.4.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
459
+ "transformer.h.4.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
460
+ "transformer.h.4.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
461
+ "transformer.h.4.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
462
+ "transformer.h.40.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
463
+ "transformer.h.40.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
464
+ "transformer.h.40.attn.c_proj.bias": "pytorch_model-00002-of-00002.bin",
465
+ "transformer.h.40.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
466
+ "transformer.h.40.attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
467
+ "transformer.h.40.ln_1.bias": "pytorch_model-00002-of-00002.bin",
468
+ "transformer.h.40.ln_1.weight": "pytorch_model-00002-of-00002.bin",
469
+ "transformer.h.40.ln_2.bias": "pytorch_model-00002-of-00002.bin",
470
+ "transformer.h.40.ln_2.weight": "pytorch_model-00002-of-00002.bin",
471
+ "transformer.h.40.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
472
+ "transformer.h.40.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
473
+ "transformer.h.40.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
474
+ "transformer.h.40.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
475
+ "transformer.h.41.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
476
+ "transformer.h.41.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
477
+ "transformer.h.41.attn.c_proj.bias": "pytorch_model-00002-of-00002.bin",
478
+ "transformer.h.41.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
479
+ "transformer.h.41.attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
480
+ "transformer.h.41.ln_1.bias": "pytorch_model-00002-of-00002.bin",
481
+ "transformer.h.41.ln_1.weight": "pytorch_model-00002-of-00002.bin",
482
+ "transformer.h.41.ln_2.bias": "pytorch_model-00002-of-00002.bin",
483
+ "transformer.h.41.ln_2.weight": "pytorch_model-00002-of-00002.bin",
484
+ "transformer.h.41.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
485
+ "transformer.h.41.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
486
+ "transformer.h.41.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
487
+ "transformer.h.41.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
488
+ "transformer.h.5.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
489
+ "transformer.h.5.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
490
+ "transformer.h.5.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
491
+ "transformer.h.5.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
492
+ "transformer.h.5.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
493
+ "transformer.h.5.ln_1.bias": "pytorch_model-00001-of-00002.bin",
494
+ "transformer.h.5.ln_1.weight": "pytorch_model-00001-of-00002.bin",
495
+ "transformer.h.5.ln_2.bias": "pytorch_model-00001-of-00002.bin",
496
+ "transformer.h.5.ln_2.weight": "pytorch_model-00001-of-00002.bin",
497
+ "transformer.h.5.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
498
+ "transformer.h.5.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
499
+ "transformer.h.5.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
500
+ "transformer.h.5.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
501
+ "transformer.h.6.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
502
+ "transformer.h.6.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
503
+ "transformer.h.6.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
504
+ "transformer.h.6.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
505
+ "transformer.h.6.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
506
+ "transformer.h.6.ln_1.bias": "pytorch_model-00001-of-00002.bin",
507
+ "transformer.h.6.ln_1.weight": "pytorch_model-00001-of-00002.bin",
508
+ "transformer.h.6.ln_2.bias": "pytorch_model-00001-of-00002.bin",
509
+ "transformer.h.6.ln_2.weight": "pytorch_model-00001-of-00002.bin",
510
+ "transformer.h.6.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
511
+ "transformer.h.6.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
512
+ "transformer.h.6.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
513
+ "transformer.h.6.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
514
+ "transformer.h.7.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
515
+ "transformer.h.7.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
516
+ "transformer.h.7.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
517
+ "transformer.h.7.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
518
+ "transformer.h.7.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
519
+ "transformer.h.7.ln_1.bias": "pytorch_model-00001-of-00002.bin",
520
+ "transformer.h.7.ln_1.weight": "pytorch_model-00001-of-00002.bin",
521
+ "transformer.h.7.ln_2.bias": "pytorch_model-00001-of-00002.bin",
522
+ "transformer.h.7.ln_2.weight": "pytorch_model-00001-of-00002.bin",
523
+ "transformer.h.7.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
524
+ "transformer.h.7.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
525
+ "transformer.h.7.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
526
+ "transformer.h.7.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
527
+ "transformer.h.8.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
528
+ "transformer.h.8.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
529
+ "transformer.h.8.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
530
+ "transformer.h.8.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
531
+ "transformer.h.8.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
532
+ "transformer.h.8.ln_1.bias": "pytorch_model-00001-of-00002.bin",
533
+ "transformer.h.8.ln_1.weight": "pytorch_model-00001-of-00002.bin",
534
+ "transformer.h.8.ln_2.bias": "pytorch_model-00001-of-00002.bin",
535
+ "transformer.h.8.ln_2.weight": "pytorch_model-00001-of-00002.bin",
536
+ "transformer.h.8.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
537
+ "transformer.h.8.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
538
+ "transformer.h.8.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
539
+ "transformer.h.8.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
540
+ "transformer.h.9.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
541
+ "transformer.h.9.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
542
+ "transformer.h.9.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
543
+ "transformer.h.9.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
544
+ "transformer.h.9.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
545
+ "transformer.h.9.ln_1.bias": "pytorch_model-00001-of-00002.bin",
546
+ "transformer.h.9.ln_1.weight": "pytorch_model-00001-of-00002.bin",
547
+ "transformer.h.9.ln_2.bias": "pytorch_model-00001-of-00002.bin",
548
+ "transformer.h.9.ln_2.weight": "pytorch_model-00001-of-00002.bin",
549
+ "transformer.h.9.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
550
+ "transformer.h.9.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
551
+ "transformer.h.9.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
552
+ "transformer.h.9.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
553
+ "transformer.ln_f.bias": "pytorch_model-00002-of-00002.bin",
554
+ "transformer.ln_f.weight": "pytorch_model-00002-of-00002.bin",
555
+ "transformer.wte.weight": "pytorch_model-00001-of-00002.bin"
556
+ }
557
+ }
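The weight map above assigns every parameter to one of the two shards: the attention, layer-norm, and mlp.c_fc tensors of layers 0–26 plus the token embedding sit in pytorch_model-00001-of-00002.bin, while h.26.mlp.c_proj, layers 27–41, and the final layer norm sit in the second shard. A minimal, hedged loading sketch (not part of this commit; the repo id is a placeholder) showing how the index is consumed:

```python
# Sketch only: from_pretrained reads pytorch_model.bin.index.json to fetch each
# parameter from the shard named in the weight map above.
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "WisdomShell/CodeShell",   # placeholder repo id, not taken from this commit
    trust_remote_code=True,    # allows the custom model code shipped with the repo
    torch_dtype="auto",        # use the dtype recorded in the checkpoint
)
```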
quantizer.py ADDED
@@ -0,0 +1,156 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 WisdomShell Inc. All Rights Reserved.
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ try:
17
+ import bitsandbytes as bnb
18
+ from bitsandbytes.nn.modules import Params4bit, Int8Params
19
+ except ImportError:
20
+ pass
21
+ import torch
22
+
23
+ def Params4bitCuda(self, device):
24
+ self.data = self.data.cuda(device)
25
+ self.quant_state[0] = self.quant_state[0].cuda(device)
26
+ self.quant_state[4][0] = self.quant_state[4][0].cuda(device)
27
+ self.quant_state[4][1][0] = self.quant_state[4][1][0].cuda(device)
28
+ self.quant_state[4][1][1] = self.quant_state[4][1][1].cuda(device)
29
+
30
+ self.quant_state[6] = self.quant_state[6].cuda(device)
31
+ return self
32
+
33
+ class Linear4bitOnline(torch.nn.Module):
34
+ def __init__(self, weight, bias, quant_type):
35
+ super().__init__()
36
+ self.weight = Params4bit(
37
+ weight.data, requires_grad=False, compress_statistics=True, quant_type=quant_type
38
+ )
39
+ self.compute_dtype = None
40
+ #self.weight.cuda(weight.device)
41
+ self.bias = bias
42
+
43
+ def forward(self, x: torch.Tensor):
44
+ # weights are cast automatically as Int8Params, but the bias has to be cast manually
45
+ if self.bias is not None and self.bias.dtype != x.dtype:
46
+ self.bias.data = self.bias.data.to(x.dtype)
47
+
48
+ if getattr(self.weight, "quant_state", None) is None:
49
+ print(
50
+ "FP4 quantization state not initialized. Please call .cuda() or .to(device) on the LinearFP4 layer first."
51
+ )
52
+ inp_dtype = x.dtype
53
+ if self.compute_dtype is not None:
54
+ x = x.to(self.compute_dtype)
55
+
56
+ bias = None if self.bias is None else self.bias.to(self.compute_dtype)
57
+ out = bnb.matmul_4bit(
58
+ x, self.weight.t(), bias=bias, quant_state=self.weight.quant_state
59
+ )
60
+
61
+ out = out.to(inp_dtype)
62
+
63
+ return out
64
+
65
+ class Linear8bitLtOnline(torch.nn.Module):
66
+ def __init__(
67
+ self,
68
+ weight,
69
+ bias,
70
+ has_fp16_weights=True,
71
+ memory_efficient_backward=False,
72
+ threshold=0.0,
73
+ index=None,
74
+ ):
75
+ super().__init__()
76
+ assert (
77
+ not memory_efficient_backward
78
+ ), "memory_efficient_backward is no longer required and the argument is deprecated in 0.37.0 and will be removed in 0.39.0"
79
+ self.state = bnb.MatmulLtState()
80
+ self.index = index
81
+
82
+ # Necessary for stacked layers
83
+ self.state.threshold = threshold
84
+ self.state.has_fp16_weights = has_fp16_weights
85
+ self.state.memory_efficient_backward = memory_efficient_backward
86
+ if threshold > 0.0 and not has_fp16_weights:
87
+ self.state.use_pool = True
88
+
89
+ self.weight = Int8Params(
90
+ weight.data,
91
+ has_fp16_weights=has_fp16_weights,
92
+ requires_grad=has_fp16_weights,
93
+ )
94
+ self.bias = bias
95
+
96
+ def init_8bit_state(self):
97
+ self.state.CB = self.weight.CB
98
+ self.state.SCB = self.weight.SCB
99
+ self.weight.CB = None
100
+ self.weight.SCB = None
101
+
102
+ def forward(self, x: torch.Tensor):
103
+ self.state.is_training = self.training
104
+ if self.weight.CB is not None:
105
+ self.init_8bit_state()
106
+
107
+ # weights are cast automatically as Int8Params, but the bias has to be cast manually
108
+ if self.bias is not None and self.bias.dtype != x.dtype:
109
+ self.bias.data = self.bias.data.to(x.dtype)
110
+
111
+ out = bnb.matmul(x, self.weight, bias=self.bias, state=self.state)
112
+
113
+ if not self.state.has_fp16_weights:
114
+ if self.state.CB is not None and self.state.CxB is not None:
115
+ # we converted 8-bit row major to turing/ampere format in the first inference pass
116
+ # we no longer need the row-major weight
117
+ del self.state.CB
118
+ self.weight.data = self.state.CxB
119
+ return out
120
+
121
+ def quantize_online(model, bits: int):
122
+ def quant(weight, bias=None):
123
+ if bits == 8:
124
+ linear = Linear8bitLtOnline(
125
+ weight,
126
+ bias,
127
+ has_fp16_weights=False,
128
+ threshold=6.0,
129
+ )
130
+ if bias is not None:
131
+ linear.bias = torch.nn.Parameter(bias)
132
+ elif bits == 4:
133
+ linear = Linear4bitOnline(
134
+ weight,
135
+ bias,
136
+ quant_type="nf4", #fp4/nf4
137
+ )
138
+ else:
139
+ raise ValueError("quantize only support 4/8 bit")
140
+ return linear
141
+
142
+ def auto_quant(layer):
143
+ if hasattr(layer,"bias"):
144
+ linear = quant(layer.weight,bias=layer.bias)
145
+ else:
146
+ linear = quant(layer.weight)
147
+ return linear
148
+
149
+ for i,layer in enumerate(model.transformer.h):
150
+ layer.mlp.c_fc = auto_quant(layer.mlp.c_fc)
151
+ layer.mlp.c_proj = auto_quant(layer.mlp.c_proj)
152
+
153
+ layer.attn.c_attn=auto_quant(layer.attn.c_attn)
154
+ layer.attn.c_proj=auto_quant(layer.attn.c_proj)
155
+
156
+ return model
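quantizer.py patches a loaded model in place: quantize_online walks model.transformer.h and swaps each attention and MLP projection for a bitsandbytes-backed 4-bit (NF4) or 8-bit linear. A hedged usage sketch, assuming bitsandbytes and a CUDA device are available and that quantizer.py is importable; the repo id is a placeholder:

```python
# Sketch only: quantize the attention/MLP linears of a loaded model in place
# using the quantize_online helper defined above.
import torch
from transformers import AutoModelForCausalLM
from quantizer import quantize_online  # assumes quantizer.py is on the Python path

model = AutoModelForCausalLM.from_pretrained(
    "WisdomShell/CodeShell",            # placeholder repo id
    trust_remote_code=True,
    torch_dtype=torch.float16,
)
model = quantize_online(model, bits=4)  # 4 -> NF4 Linear4bitOnline, 8 -> Linear8bitLtOnline
model = model.cuda().eval()             # moving to CUDA triggers the actual quantization
```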
special_tokens_map.json ADDED
@@ -0,0 +1,27 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|endoftext|>",
4
+ "<fim_prefix>",
5
+ "<fim_middle>",
6
+ "<fim_suffix>",
7
+ "<fim_pad>",
8
+ "<filename>",
9
+ "<gh_stars>",
10
+ "<issue_start>",
11
+ "<issue_comment>",
12
+ "<issue_closed>",
13
+ "<jupyter_start>",
14
+ "<jupyter_text>",
15
+ "<jupyter_code>",
16
+ "<jupyter_output>",
17
+ "<empty_output>",
18
+ "<commit_before>",
19
+ "<commit_msg>",
20
+ "<commit_after>",
21
+ "<reponame>"
22
+ ],
23
+ "bos_token": "<|endoftext|>",
24
+ "eos_token": "<|endoftext|>",
25
+ "pad_token": "<|endoftext|>",
26
+ "unk_token": "<|endoftext|>"
27
+ }
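special_tokens_map.json registers the FIM, repository-metadata, issue, Jupyter, and commit markers as additional special tokens, with <|endoftext|> serving as bos, eos, pad, and unk at once. A quick, hedged sanity check (illustrative only; the repo id is a placeholder) that these markers are kept as single tokens by the tokenizer:

```python
# Sketch only: each special token should encode to exactly one id.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("WisdomShell/CodeShell")  # placeholder repo id
for marker in ["<|endoftext|>", "<fim_prefix>", "<fim_middle>", "<fim_suffix>", "<reponame>"]:
    ids = tok(marker, add_special_tokens=False)["input_ids"]
    print(marker, ids, "ok" if len(ids) == 1 else "split by BPE")
```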
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,32 @@
1
+ {
2
+ "add_prefix_space": false,
3
+ "additional_special_tokens": [
4
+ "<|endoftext|>",
5
+ "<fim_prefix>",
6
+ "<fim_middle>",
7
+ "<fim_suffix>",
8
+ "<fim_pad>",
9
+ "<filename>",
10
+ "<gh_stars>",
11
+ "<issue_start>",
12
+ "<issue_comment>",
13
+ "<issue_closed>",
14
+ "<jupyter_start>",
15
+ "<jupyter_text>",
16
+ "<jupyter_code>",
17
+ "<jupyter_output>",
18
+ "<empty_output>",
19
+ "<commit_before>",
20
+ "<commit_msg>",
21
+ "<commit_after>",
22
+ "<reponame>"
23
+ ],
24
+ "bos_token": "<|endoftext|>",
25
+ "clean_up_tokenization_spaces": true,
26
+ "eos_token": "<|endoftext|>",
27
+ "model_max_length": 4096,
28
+ "pad_token": "<|endoftext|>",
29
+ "tokenizer_class": "GPT2Tokenizer",
30
+ "unk_token": "<|endoftext|>",
31
+ "vocab_size": 70019
32
+ }
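tokenizer_config.json wires the same special tokens into a GPT2Tokenizer with a 4096-token model_max_length. These markers are what make infilling prompts possible; below is a hedged sketch of one common prefix-suffix-middle layout (the ordering convention is an assumption on my part, not something these files specify):

```python
# Sketch only: a PSM-style fill-in-the-middle prompt built from the FIM markers.
prefix = "def add(a, b):\n    "
suffix = "\n    return result"
fim_prompt = "<fim_prefix>" + prefix + "<fim_suffix>" + suffix + "<fim_middle>"
# Tokenize fim_prompt and pass it to model.generate(...); the completion is
# expected to be the missing middle span (here, something like "result = a + b").
```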
vocab.json ADDED
The diff for this file is too large to render. See raw diff