ruixie committed
Commit ad679d1
1 Parent(s): 58bb805
License.pdf ADDED
Binary file (282 kB).
 
README.md CHANGED
@@ -1,3 +1,11 @@
  ---
- license: apache-2.0
+ language:
+ - zh
+ - en
+ tags:
+ - codeshell
+ - wisdomshell
+ - pku-kcl
+ - openbankai
  ---
+
added_tokens.json ADDED
@@ -0,0 +1,24 @@
+ {
+ "<assistant>": 70020,
+ "<commit_after>": 70017,
+ "<commit_before>": 70015,
+ "<commit_msg>": 70016,
+ "<copilot>": 70021,
+ "<empty_output>": 70014,
+ "<filename>": 70005,
+ "<fim_middle>": 70002,
+ "<fim_pad>": 70004,
+ "<fim_prefix>": 70001,
+ "<fim_suffix>": 70003,
+ "<gh_stars>": 70006,
+ "<human>": 70019,
+ "<issue_closed>": 70009,
+ "<issue_comment>": 70008,
+ "<issue_start>": 70007,
+ "<jupyter_code>": 70012,
+ "<jupyter_output>": 70013,
+ "<jupyter_start>": 70010,
+ "<jupyter_text>": 70011,
+ "<reponame>": 70018,
+ "<|endoftext|>": 70000
+ }
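These IDs extend the base vocabulary with StarCoder-style structural tokens (FIM, issue, Jupyter, and commit markers) plus chat-role tokens. A minimal sketch of how the chat-role tokens end up in a prompt, mirroring `build_chat_input` in `modeling_codeshell.py` below; the tokenizer is assumed to be loaded from a local checkout of this repo, and the path `"."` is purely illustrative:

```python
from transformers import AutoTokenizer

# Assumes this repository is checked out locally; any path or repo id that
# contains these tokenizer files works the same way.
tokenizer = AutoTokenizer.from_pretrained(".", trust_remote_code=True)

stop = "<|endoftext|>"
history = [("Hello", "Hi! How can I help you?")]
query = "Write a quicksort function in Python."

# Same layout as build_chat_input() in modeling_codeshell.py below.
prompt = ""
for q, r in history:
    prompt += f"<human>:{q}{stop}<assistant>:{r}{stop}"
prompt += f"<human>:{query}{stop}<assistant>:"

input_ids = tokenizer.encode(prompt)  # the special tokens map to the ids >= 70000 listed above
```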
config.json ADDED
@@ -0,0 +1,46 @@
+ {
+ "_name_or_path": "WisdomShell/Shell-7B",
+ "activation_function": "gelu_pytorch_tanh",
+ "architectures": [
+ "CodeShellForCausalLM"
+ ],
+ "attention_softmax_in_fp32": true,
+ "attn_pdrop": 0.1,
+ "auto_map": {
+ "AutoConfig": "configuration_codeshell.CodeShellConfig",
+ "AutoModelForCausalLM": "modeling_codeshell.CodeShellForCausalLM"
+ },
+ "bos_token_id": 70000,
+ "embd_pdrop": 0.1,
+ "eos_token_id": 70000,
+ "group_query_attention": true,
+ "inference_runner": 0,
+ "initializer_range": 0.02,
+ "layer_norm_epsilon": 1e-05,
+ "max_batch_size": null,
+ "max_sequence_length": null,
+ "model_type": "codeshell",
+ "n_embd": 4096,
+ "n_head": 32,
+ "n_inner": 16384,
+ "n_layer": 42,
+ "n_positions": 8192,
+ "num_query_groups": 8,
+ "pad_key_length": true,
+ "position_embedding_type": "rope",
+ "pre_allocate_kv_cache": false,
+ "resid_pdrop": 0.1,
+ "rope_scaling": null,
+ "scale_attention_softmax_in_fp32": true,
+ "scale_attn_weights": true,
+ "summary_activation": null,
+ "summary_first_dropout": 0.1,
+ "summary_proj_to_labels": true,
+ "summary_type": "cls_index",
+ "summary_use_proj": true,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.31.0",
+ "use_cache": true,
+ "validate_runner_input": true,
+ "vocab_size": 70144
+ }
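The `auto_map` block means the config and model classes ship with the checkpoint and are resolved at load time through `trust_remote_code`. A minimal loading sketch; the repo id is taken from `_name_or_path` above, and a local checkout of these files works equally well:

```python
import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

model_id = "WisdomShell/Shell-7B"  # from _name_or_path; substitute a local path if needed

config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)       # -> CodeShellConfig
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(                               # -> CodeShellForCausalLM
    model_id,
    torch_dtype=torch.bfloat16,  # matches "torch_dtype" in this config
    trust_remote_code=True,      # resolves configuration_codeshell.py / modeling_codeshell.py
    device_map="auto",           # optional; requires accelerate
).eval()
```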
configuration_codeshell.py ADDED
@@ -0,0 +1,166 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 WisdomShell Inc. All Rights Reserved.
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ # This code is based on BigCode's GPTBigCode configuration. It has been modified from
+ # its original form to accommodate minor architectural differences compared to
+ # the GPTBigCode configuration used to train the model.
19
+
20
+ # Copyright 2023 The BigCode team and HuggingFace Inc. team.
21
+ #
22
+ # Licensed under the Apache License, Version 2.0 (the "License");
23
+ # you may not use this file except in compliance with the License.
24
+ # You may obtain a copy of the License at
25
+ #
26
+ # http://www.apache.org/licenses/LICENSE-2.0
27
+ #
28
+ # Unless required by applicable law or agreed to in writing, software
29
+ # distributed under the License is distributed on an "AS IS" BASIS,
30
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
31
+ # See the License for the specific language governing permissions and
32
+ # limitations under the License.
33
+ """ Shell configuration"""
34
+
35
+ from transformers.configuration_utils import PretrainedConfig
36
+ from transformers.utils import logging
37
+
38
+
39
+ logger = logging.get_logger(__name__)
40
+
41
+
42
+ class CodeShellConfig(PretrainedConfig):
43
+ """
44
+ This is the configuration class to store the configuration of a [`CodeShellModel`]. It is used to instantiate a
45
+ CodeShell model according to the specified arguments, defining the model architecture.
46
+
47
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
48
+ documentation from [`PretrainedConfig`] for more information.
49
+
50
+ Args:
51
+ vocab_size (`int`, *optional*, defaults to 70144):
+ Vocabulary size of the CodeShell model. Defines the number of different tokens that can be represented by the
+ `input_ids` passed when calling [`CodeShellModel`].
54
+ n_positions (`int`, *optional*, defaults to 8192):
55
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
56
+ just in case (e.g., 512 or 1024 or 2048).
57
+ n_embd (`int`, *optional*, defaults to 4096):
58
+ Dimensionality of the embeddings and hidden states.
59
+ n_layer (`int`, *optional*, defaults to 42):
60
+ Number of hidden layers in the Transformer encoder.
61
+ n_head (`int`, *optional*, defaults to 32):
62
+ Number of attention heads for each attention layer in the Transformer encoder.
63
+ n_inner (`int`, *optional*, defaults to None):
64
+ Dimensionality of the inner feed-forward layers. `None` will set it to 4 times n_embd
65
+ activation_function (`str`, *optional*, defaults to `"gelu_pytorch_tanh"`):
66
+ Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new",
67
+ "gelu_pytorch_tanh"]`.
68
+ resid_pdrop (`float`, *optional*, defaults to 0.1):
69
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
70
+ embd_pdrop (`float`, *optional*, defaults to 0.1):
71
+ The dropout ratio for the embeddings.
72
+ attn_pdrop (`float`, *optional*, defaults to 0.1):
73
+ The dropout ratio for the attention.
74
+ layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
75
+ The epsilon to use in the layer normalization layers.
76
+ initializer_range (`float`, *optional*, defaults to 0.02):
77
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
78
+ scale_attn_weights (`bool`, *optional*, defaults to `True`):
79
+ Scale attention weights by dividing by sqrt(hidden_size).
80
+ use_cache (`bool`, *optional*, defaults to `True`):
81
+ Whether or not the model should return the last key/values attentions (not used by all models).
82
+ attention_softmax_in_fp32 (`bool`, *optional*, defaults to `True`):
83
+ Whether to call the fused softmax in float32.
84
+ scale_attention_softmax_in_fp32 (`bool`, *optional*, defaults to `True`):
85
+ Whether to scale the attention softmax in float32.
86
+ group_query_attention (`bool`, *optional*, defaults to `True`):
+ Whether to use grouped-query attention (`True`) or multi-head attention (`False`).
88
+ Example:
89
+
90
+ ```python
91
+ >>> from configuration_codeshell import CodeShellConfig
92
+ >>> from modeling_codeshell import CodeShellForCausalLM
93
+
94
+ >>> # Initializing a CodeShell configuration
95
+ >>> configuration = CodeShellConfig()
96
+
97
+ >>> # Initializing a model (with random weights) from the configuration
98
+ >>> model = CodeShellForCausalLM(configuration)
99
+
100
+ >>> # Accessing the model configuration
101
+ >>> configuration = model.config
102
+ ```"""
103
+
104
+ model_type = "codeshell"
105
+ keys_to_ignore_at_inference = ["past_key_values"]
106
+ attribute_map = {
107
+ "hidden_size": "n_embd",
108
+ "max_position_embeddings": "n_positions",
109
+ "num_attention_heads": "n_head",
110
+ "num_hidden_layers": "n_layer",
111
+ }
112
+
113
+ def __init__(
114
+ self,
115
+ vocab_size=70144,
116
+ n_positions=8192,
117
+ n_embd=4096,
118
+ n_layer=42,
119
+ n_head=32,
120
+ n_inner=None,
121
+ activation_function="gelu_pytorch_tanh",
122
+ resid_pdrop=0.1,
123
+ embd_pdrop=0.1,
124
+ attn_pdrop=0.1,
125
+ layer_norm_epsilon=1e-5,
126
+ initializer_range=0.02,
127
+ scale_attn_weights=True,
128
+ use_cache=True,
129
+ bos_token_id=70000,
130
+ eos_token_id=70000,
131
+ attention_softmax_in_fp32=True,
132
+ scale_attention_softmax_in_fp32=True,
133
+ group_query_attention=True,
134
+ num_query_groups=1,
135
+ position_embedding_type="learned_absolute",
136
+ rope_scaling=None,
137
+ **kwargs,
138
+ ):
139
+ self.vocab_size = vocab_size
140
+ self.n_positions = n_positions
141
+ self.n_embd = n_embd
142
+ self.n_layer = n_layer
143
+ self.n_head = n_head
144
+ self.n_inner = n_inner
145
+ self.activation_function = activation_function
146
+ self.resid_pdrop = resid_pdrop
147
+ self.embd_pdrop = embd_pdrop
148
+ self.attn_pdrop = attn_pdrop
149
+ self.layer_norm_epsilon = layer_norm_epsilon
150
+ self.initializer_range = initializer_range
151
+ self.scale_attn_weights = scale_attn_weights
152
+ self.use_cache = use_cache
153
+ self.attention_softmax_in_fp32 = attention_softmax_in_fp32
154
+ self.scale_attention_softmax_in_fp32 = scale_attention_softmax_in_fp32
155
+ self.group_query_attention = group_query_attention
156
+ self.num_query_groups = num_query_groups
157
+ self.position_embedding_type = position_embedding_type
158
+ self.rope_scaling = rope_scaling
159
+ assert self.position_embedding_type in [
160
+ "learned_absolute", "rope"
161
+ ], "position_embedding_type must be one of ['learned_absolute', 'rope']"
162
+
163
+ self.bos_token_id = bos_token_id
164
+ self.eos_token_id = eos_token_id
165
+
166
+ super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
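For reference, a short sketch (not part of the shipped code) showing how the values in the accompanying `config.json` map onto the grouped-query attention geometry consumed by `CodeShellAttention` in `modeling_codeshell.py`:

```python
from configuration_codeshell import CodeShellConfig

cfg = CodeShellConfig(
    vocab_size=70144,
    n_positions=8192,
    n_embd=4096,
    n_layer=42,
    n_head=32,
    group_query_attention=True,
    num_query_groups=8,
    position_embedding_type="rope",
)

head_dim = cfg.n_embd // cfg.n_head       # 4096 / 32 = 128 per head
kv_heads = cfg.num_query_groups           # 8 key/value heads
queries_per_kv = cfg.n_head // kv_heads   # 4 query heads share each KV head
print(head_dim, kv_heads, queries_per_kv)
```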
generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+ "_from_model_config": true,
+ "bos_token_id": 70000,
+ "eos_token_id": 70000,
+ "max_new_tokens": 1024,
+ "transformers_version": "4.31.0"
+ }
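Both `bos_token_id` and `eos_token_id` point at `<|endoftext|>` (70000), so generation stops on that token and emits at most 1024 new tokens by default. A minimal sketch of overriding `max_new_tokens` at call time, reusing the `model`, `tokenizer`, and `model_id` from the loading sketch above:

```python
from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained(model_id)  # reads this generation_config.json
gen_cfg.max_new_tokens = 256                          # override the default of 1024

inputs = tokenizer("def fibonacci(n):", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, generation_config=gen_cfg)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```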
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
modeling_codeshell.py ADDED
@@ -0,0 +1,1087 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 WisdomShell Inc. All Rights Reserved.
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ # This code is based on BigCode's GPTBigCode model. It has been modified from
+ # its original form to accommodate minor architectural differences compared to
+ # the GPTBigCode model used to train this model.
19
+
20
+ # Copyright 2023 The Bigcode team and HuggingFace Inc. team.
21
+ # Licensed under the Apache License, Version 2.0 (the "License");
22
+ # you may not use this file except in compliance with the License.
23
+ # You may obtain a copy of the License at
24
+ #
25
+ # http://www.apache.org/licenses/LICENSE-2.0
26
+ #
27
+ # Unless required by applicable law or agreed to in writing, software
28
+ # distributed under the License is distributed on an "AS IS" BASIS,
29
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
30
+ # See the License for the specific language governing permissions and
31
+ # limitations under the License.
32
+ """PyTorch CodeShell model."""
33
+ import os
34
+ import math
35
+ from typing import List, Optional, Tuple, Union, Callable
36
+ from threading import Thread
37
+ from queue import Queue
38
+
39
+
40
+ import torch
41
+ import torch.utils.checkpoint
42
+ from torch import nn
43
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
44
+
45
+ from transformers import LogitsProcessorList, StoppingCriteriaList, StoppingCriteria, PreTrainedModel, PretrainedConfig
46
+ from transformers.generation.utils import GenerationConfig
47
+
48
+ from transformers.activations import ACT2FN
49
+ from transformers.modeling_outputs import (
50
+ BaseModelOutputWithPastAndCrossAttentions,
51
+ CausalLMOutputWithCrossAttentions,
52
+ )
53
+ from transformers.modeling_utils import PreTrainedModel
54
+ from transformers.utils import (
55
+ add_start_docstrings,
56
+ add_start_docstrings_to_model_forward,
57
+ )
58
+ from .configuration_codeshell import CodeShellConfig
59
+
60
+ # Fused kernels
61
+ # Use separate functions for each case because conditionals prevent kernel fusion.
62
+ # TODO: Could have better fused kernels depending on scaling, dropout and head mask.
63
+ # Is it doable without writing 32 functions?
64
+ @torch.jit.script
65
+ def upcast_masked_softmax(
66
+ x: torch.Tensor, mask: torch.Tensor, mask_value: torch.Tensor, scale: float, softmax_dtype: torch.dtype
67
+ ):
68
+ input_dtype = x.dtype
69
+ x = x.to(softmax_dtype) * scale
70
+ x = torch.where(mask, x, mask_value)
71
+ x = torch.nn.functional.softmax(x, dim=-1).to(input_dtype)
72
+ return x
73
+
74
+
75
+ @torch.jit.script
76
+ def upcast_softmax(x: torch.Tensor, scale: float, softmax_dtype: torch.dtype):
77
+
78
+ input_dtype = x.dtype
79
+ x = x.to(softmax_dtype) * scale
80
+ x = torch.nn.functional.softmax(x, dim=-1).to(input_dtype)
81
+ return x
82
+
83
+
84
+ @torch.jit.script
85
+ def masked_softmax(x: torch.Tensor, mask: torch.Tensor, mask_value: torch.Tensor):
86
+ x = torch.where(mask, x, mask_value)
87
+ x = torch.nn.functional.softmax(x, dim=-1)
88
+ return x
89
+
90
+
91
+ class CodeShellRotaryEmbedding(torch.nn.Module):
92
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
93
+ super().__init__()
94
+
95
+ self.dim = dim
96
+ self.max_position_embeddings = max_position_embeddings
97
+ self.base = base
98
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
99
+ self.register_buffer("inv_freq", inv_freq)
100
+
101
+ # Build here to make `torch.jit.trace` work.
102
+ self._set_cos_sin_cache(
103
+ seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
104
+ )
105
+
106
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
107
+ self.max_seq_len_cached = seq_len
108
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
109
+
110
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
111
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
112
+ emb = torch.cat((freqs, freqs), dim=-1)
113
+ self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
114
+ self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
115
+
116
+ def forward(self, x, seq_len=None):
117
+ # x: [bs, num_attention_heads, seq_len, head_size]
118
+ if seq_len > self.max_seq_len_cached:
119
+ self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
120
+
121
+ return (
122
+ self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
123
+ self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
124
+ )
125
+
126
+
127
+ class CodeShellLinearScalingRotaryEmbedding(CodeShellRotaryEmbedding):
128
+ """CodeShellRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
129
+
130
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
131
+ self.scaling_factor = scaling_factor
132
+ super().__init__(dim, max_position_embeddings, base, device)
133
+
134
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
135
+ self.max_seq_len_cached = seq_len
136
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
137
+ t = t / self.scaling_factor
138
+
139
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
140
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
141
+ emb = torch.cat((freqs, freqs), dim=-1)
142
+ self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
143
+ self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
144
+
145
+
146
+ class CodeShellDynamicNTKScalingRotaryEmbedding(CodeShellRotaryEmbedding):
147
+ """ShellRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
148
+
149
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
150
+ self.scaling_factor = scaling_factor
151
+ super().__init__(dim, max_position_embeddings, base, device)
152
+
153
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
154
+ self.max_seq_len_cached = seq_len
155
+
156
+ if seq_len > self.max_position_embeddings:
157
+ base = self.base * (
158
+ (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
159
+ ) ** (self.dim / (self.dim - 2))
160
+ inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
161
+ self.register_buffer("inv_freq", inv_freq)
162
+
163
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
164
+
165
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
166
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
167
+ emb = torch.cat((freqs, freqs), dim=-1)
168
+ self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
169
+ self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
170
+
171
+ def rotate_half(x):
172
+ """Rotates half the hidden dims of the input."""
173
+ x1 = x[..., : x.shape[-1] // 2]
174
+ x2 = x[..., x.shape[-1] // 2 :]
175
+ return torch.cat((-x2, x1), dim=-1)
176
+
177
+
178
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
179
+ # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
180
+ cos = cos.squeeze(1).squeeze(0) # [seq_len, dim]
181
+ sin = sin.squeeze(1).squeeze(0) # [seq_len, dim]
182
+ cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
183
+ sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
184
+ q_embed = (q * cos) + (rotate_half(q) * sin)
185
+ k_embed = (k * cos) + (rotate_half(k) * sin)
186
+ return q_embed, k_embed
187
+
188
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
189
+ """
190
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
191
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
192
+ """
193
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
194
+ if n_rep == 1:
195
+ return hidden_states
196
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
197
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
198
+
199
+ class CodeShellAttention(nn.Module):
200
+ def __init__(self, config, layer_idx=None):
201
+ super().__init__()
202
+ self.mask_value = None
203
+
204
+ self.position_embedding_type = config.position_embedding_type
205
+ self.rope_scaling = config.rope_scaling
206
+ self.max_position_embeddings = config.max_position_embeddings
207
+
208
+ self.group_query_attention = config.group_query_attention
209
+ self.num_query_groups = config.num_query_groups
210
+ self.num_key_value_groups = config.num_attention_heads // config.num_query_groups
211
+
212
+ self.embed_dim = config.hidden_size
213
+ self.num_heads = config.num_attention_heads
214
+ self.head_dim = self.embed_dim // self.num_heads
215
+ self.kv_heads = config.num_query_groups if self.group_query_attention else self.num_heads
216
+ self.kv_dim = self.kv_heads * self.head_dim
217
+ self.split_size = self.embed_dim
218
+ if self.head_dim * self.num_heads != self.embed_dim:
219
+ raise ValueError(
220
+ f"`embed_dim` must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
221
+ f" {self.num_heads})."
222
+ )
223
+
224
+ self.layer_idx = layer_idx
225
+
226
+ self.c_attn = nn.Linear(self.embed_dim, self.embed_dim + 2 * self.kv_dim)
227
+ self.c_proj = nn.Linear(self.embed_dim, self.embed_dim)
228
+
229
+ self.attn_dropout = nn.Dropout(config.attn_pdrop)
230
+ self.resid_dropout = nn.Dropout(config.resid_pdrop)
231
+
232
+ if self.position_embedding_type == "rope":
233
+ self._init_rope()
234
+
235
+ def _init_rope(self):
236
+ if self.rope_scaling is None:
237
+ self.rotary_emb = CodeShellRotaryEmbedding(self.head_dim, max_position_embeddings=self.max_position_embeddings)
238
+ else:
239
+ scaling_type = self.rope_scaling["type"]
240
+ scaling_factor = self.rope_scaling["factor"]
241
+ if scaling_type == "linear":
242
+ self.rotary_emb = CodeShellLinearScalingRotaryEmbedding(
243
+ self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=scaling_factor
244
+ )
245
+ elif scaling_type == "dynamic":
246
+ self.rotary_emb = CodeShellDynamicNTKScalingRotaryEmbedding(
247
+ self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=scaling_factor
248
+ )
249
+ else:
250
+ raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
251
+
252
+
253
+ def _get_mask_value(self, device, dtype):
254
+ # torch.where expects a tensor. We use a cache to avoid recreating it every time.
255
+ if self.mask_value is None or self.mask_value.dtype != dtype or self.mask_value.device != device:
256
+ self.mask_value = torch.full([], torch.finfo(dtype).min, dtype=dtype, device=device)
257
+ return self.mask_value
258
+
259
+ def forward(
260
+ self,
261
+ hidden_states: torch.Tensor,
262
+ layer_past: Optional[torch.Tensor] = None,
263
+ attention_mask: Optional[torch.Tensor] = None,
264
+ position_ids: Optional[torch.LongTensor] = None,
265
+ head_mask: Optional[torch.Tensor] = None,
266
+ use_cache: Optional[bool] = False,
267
+ output_attentions: Optional[bool] = False,
268
+ ) -> Union[
269
+ Tuple[torch.Tensor, Optional[torch.Tensor]],
270
+ Tuple[torch.Tensor, Optional[torch.Tensor], Tuple[torch.Tensor, ...]],
271
+ ]:
272
+ bsz, q_len, _ = hidden_states.size()
273
+ query_states, key_states, value_states = self.c_attn(hidden_states).split((self.embed_dim, self.kv_dim, self.kv_dim), dim=2)
274
+
275
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
276
+ key_states = key_states.view(bsz, q_len, self.num_query_groups, self.head_dim).transpose(1, 2)
277
+ value_states = value_states.view(bsz, q_len, self.num_query_groups, self.head_dim).transpose(1, 2)
278
+
279
+ kv_seq_len = key_states.shape[-2]
280
+ if layer_past is not None:
281
+ kv_seq_len += layer_past[0].shape[-2]
282
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
283
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
284
+
285
+ if layer_past is not None:
286
+ # reuse k, v, self_attention
287
+ key_states = torch.cat([layer_past[0], key_states], dim=2)
288
+ value_states = torch.cat([layer_past[1], value_states], dim=2)
289
+
290
+ layer_past = (key_states, value_states) if use_cache else None
291
+
292
+ # repeat k/v heads if n_kv_heads < n_heads
293
+ key_states = repeat_kv(key_states, self.num_heads // self.kv_heads)
294
+ value_states = repeat_kv(value_states, self.num_heads // self.kv_heads)
295
+
296
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
297
+
298
+ if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
299
+ raise ValueError(
300
+ f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
301
+ f" {attn_weights.size()}"
302
+ )
303
+
304
+ if attention_mask is not None:
305
+ if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
306
+ raise ValueError(
307
+ f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
308
+ )
309
+ mask_value = self._get_mask_value(attn_weights.device, attn_weights.dtype)
310
+ # The fused kernel is very slow when the key length is not a multiple of 8, so we skip fusion.
311
+ attn_weights = torch.where(attention_mask, attn_weights, mask_value)
312
+
313
+ # upcast attention to fp32
314
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
315
+ attn_weights = self.attn_dropout(attn_weights)
316
+ attn_output = torch.matmul(attn_weights, value_states)
317
+
318
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
319
+ raise ValueError(
320
+ f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
321
+ f" {attn_output.size()}"
322
+ )
323
+
324
+ attn_output = attn_output.transpose(1, 2).contiguous()
325
+ attn_output = attn_output.reshape(bsz, q_len, self.embed_dim)
326
+
327
+ attn_output = self.c_proj(attn_output)
328
+ attn_output = self.resid_dropout(attn_output)
329
+
330
+ outputs = (attn_output, layer_past)
331
+ if output_attentions:
332
+ outputs += (attn_weights,)
333
+
334
+ return outputs # a, present, (attentions)
335
+
336
+
337
+ class CodeShellMLP(nn.Module):
338
+ def __init__(self, intermediate_size, config):
339
+ super().__init__()
340
+ embed_dim = config.hidden_size
341
+ self.c_fc = nn.Linear(embed_dim, intermediate_size)
342
+ self.c_proj = nn.Linear(intermediate_size, embed_dim)
343
+ self.act = ACT2FN[config.activation_function]
344
+ self.dropout = nn.Dropout(config.resid_pdrop)
345
+
346
+ # Copied from transformers.models.gpt2.modeling_gpt2.GPT2MLP.forward
347
+ def forward(self, hidden_states: Optional[Tuple[torch.Tensor]]) -> torch.Tensor:
348
+ hidden_states = self.c_fc(hidden_states)
349
+ hidden_states = self.act(hidden_states)
350
+ hidden_states = self.c_proj(hidden_states)
351
+ hidden_states = self.dropout(hidden_states)
352
+ return hidden_states
353
+
354
+
355
+ class CodeShellBlock(nn.Module):
356
+ def __init__(self, config, layer_idx=None):
357
+ super().__init__()
358
+ hidden_size = config.hidden_size
359
+ self.inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size
360
+
361
+ self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
362
+ self.attn = CodeShellAttention(config, layer_idx=layer_idx)
363
+ self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
364
+
365
+ self.mlp = CodeShellMLP(self.inner_dim, config)
366
+
367
+ def forward(
368
+ self,
369
+ hidden_states: Optional[Tuple[torch.Tensor]],
370
+ layer_past: Optional[torch.Tensor] = None,
371
+ attention_mask: Optional[torch.Tensor] = None,
372
+ position_ids: Optional[torch.LongTensor] = None,
373
+ head_mask: Optional[torch.Tensor] = None,
374
+ encoder_hidden_states: Optional[torch.Tensor] = None,
375
+ encoder_attention_mask: Optional[torch.Tensor] = None,
376
+ use_cache: Optional[bool] = False,
377
+ output_attentions: Optional[bool] = False,
378
+ ) -> Union[
379
+ Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor, torch.Tensor, torch.Tensor]
380
+ ]:
381
+ residual = hidden_states
382
+ hidden_states = self.ln_1(hidden_states)
383
+ attn_outputs = self.attn(
384
+ hidden_states,
385
+ layer_past=layer_past,
386
+ attention_mask=attention_mask,
387
+ position_ids=position_ids,
388
+ head_mask=head_mask,
389
+ use_cache=use_cache,
390
+ output_attentions=output_attentions,
391
+ )
392
+ attn_output = attn_outputs[0] # output_attn: a, present, (attentions)
393
+
394
+ outputs = attn_outputs[1:]
395
+ # residual connection
396
+ hidden_states = attn_output + residual
397
+
398
+ residual = hidden_states
399
+ hidden_states = self.ln_2(hidden_states)
400
+ feed_forward_hidden_states = self.mlp(hidden_states)
401
+ # residual connection
402
+ hidden_states = residual + feed_forward_hidden_states
403
+
404
+ if use_cache:
405
+ outputs = (hidden_states,) + outputs
406
+ else:
407
+ outputs = (hidden_states,) + outputs[1:]
408
+
409
+ return outputs # hidden_states, present, (attentions, cross_attentions)
410
+
411
+
412
+ class CodeShellPreTrainedModel(PreTrainedModel):
413
+ """
414
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
415
+ models.
416
+ """
417
+
418
+ config_class = CodeShellConfig
419
+ base_model_prefix = "transformer"
420
+ supports_gradient_checkpointing = True
421
+ _no_split_modules = ["ShellBlock"]
422
+ _skip_keys_device_placement = "past_key_values"
423
+
424
+ def __init__(self, *inputs, **kwargs):
425
+ super().__init__(*inputs, **kwargs)
426
+
427
+ def _init_weights(self, module):
428
+ """Initialize the weights."""
429
+ if isinstance(module, (CodeShellMLP, CodeShellAttention)):
430
+ # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
431
+ # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
432
+ # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
433
+ # > -- GPT-2 :: https://openai.com/blog/better-language-models/
434
+ #
435
+ # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
436
+ module.c_proj.weight.data.normal_(
437
+ mean=0.0, std=(self.config.initializer_range / math.sqrt(2 * self.config.n_layer))
438
+ )
439
+ module.c_proj._is_hf_initialized = True
440
+ elif isinstance(module, nn.Linear):
441
+ # Slightly different from the TF version which uses truncated_normal for initialization
442
+ # cf https://github.com/pytorch/pytorch/pull/5617
443
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
444
+ if module.bias is not None:
445
+ module.bias.data.zero_()
446
+ elif isinstance(module, nn.Embedding):
447
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
448
+ if module.padding_idx is not None:
449
+ module.weight.data[module.padding_idx].zero_()
450
+ elif isinstance(module, nn.LayerNorm):
451
+ module.bias.data.zero_()
452
+ module.weight.data.fill_(1.0)
453
+
454
+ # Copied from transformers.models.gpt2.modeling_gpt2.GPT2PreTrainedModel._set_gradient_checkpointing with GPT2->Shell
455
+ def _set_gradient_checkpointing(self, module, value=False):
456
+ if isinstance(module, CodeShellModel):
457
+ module.gradient_checkpointing = value
458
+
459
+
460
+ GPT_BIGCODE_START_DOCSTRING = r"""
461
+
462
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
463
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
464
+ etc.)
465
+
466
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
467
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
468
+ and behavior.
469
+
470
+ Parameters:
471
+ config ([`CodeShellConfig`]): Model configuration class with all the parameters of the model.
472
+ Initializing with a config file does not load the weights associated with the model, only the
473
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
474
+ """
475
+
476
+ GPT_BIGCODE_INPUTS_DOCSTRING = r"""
477
+ Args:
478
+ input_ids (`torch.Tensor` of shape `(batch_size, input_ids_length)`):
479
+ `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
480
+ `past_key_values[0][0].shape[-2]` (`sequence_length` of input past key value states). Indices of input
481
+ sequence tokens in the vocabulary.
482
+
483
+ If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
484
+ `input_ids`.
485
+
486
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
487
+ [`PreTrainedTokenizer.__call__`] for details.
488
+
489
+ [What are input IDs?](../glossary#input-ids)
490
+ past_key_values (`Tuple[torch.Tensor]` of length `config.n_layers`):
491
+ Contains precomputed hidden-states (key and values in the attention blocks) as computed by the model (see
492
+ `past_key_values` output below). Can be used to speed up sequential decoding. The `input_ids` which have
493
+ their past given to this model should not be passed as `input_ids` as they have already been computed.
494
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
495
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
496
+
497
+ - 1 for tokens that are **not masked**,
498
+ - 0 for tokens that are **masked**.
499
+
500
+ If `past_key_values` is used, `attention_mask` needs to contain the masking strategy that was used for
501
+ `past_key_values`. In other words, the `attention_mask` always has to have the length:
502
+ `len(past_key_values) + len(input_ids)`
503
+
504
+ [What are attention masks?](../glossary#attention-mask)
505
+ token_type_ids (`torch.Tensor` of shape `(batch_size, input_ids_length)`, *optional*):
506
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
507
+ 1]`:
508
+
509
+ - 0 corresponds to a *sentence A* token,
510
+ - 1 corresponds to a *sentence B* token.
511
+
512
+ [What are token type IDs?](../glossary#token-type-ids)
513
+ position_ids (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
514
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
515
+ config.max_position_embeddings - 1]`.
516
+
517
+ [What are position IDs?](../glossary#position-ids)
518
+ head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
519
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
520
+
521
+ - 1 indicates the head is **not masked**,
522
+ - 0 indicates the head is **masked**.
523
+
524
+ inputs_embeds (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
525
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
526
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
527
+ model's internal embedding lookup matrix.
528
+
529
+ If `past_key_values` is used, optionally only the last `inputs_embeds` have to be input (see
530
+ `past_key_values`).
531
+ use_cache (`bool`, *optional*):
532
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
533
+ `past_key_values`).
534
+ output_attentions (`bool`, *optional*):
535
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
536
+ tensors for more detail.
537
+ output_hidden_states (`bool`, *optional*):
538
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
539
+ more detail.
540
+ return_dict (`bool`, *optional*):
541
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
542
+ """
543
+
544
+
545
+ @add_start_docstrings(
546
+ "The bare GPT_BIGCODE Model transformer outputting raw hidden-states without any specific head on top.",
547
+ GPT_BIGCODE_START_DOCSTRING,
548
+ )
549
+ class CodeShellModel(CodeShellPreTrainedModel):
550
+ def __init__(self, config):
551
+ super().__init__(config)
552
+ self.group_query_attention = config.group_query_attention
553
+ self.num_query_groups = config.num_query_groups
554
+ self.position_embedding_type = config.position_embedding_type
555
+ self.embed_dim = config.hidden_size
556
+
557
+ self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
558
+ if self.position_embedding_type == "learned_absolute":
559
+ self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
560
+ else:
561
+ pass
562
+
563
+ self.drop = nn.Dropout(config.embd_pdrop)
564
+ self.h = nn.ModuleList([CodeShellBlock(config, layer_idx=i) for i in range(config.num_hidden_layers)])
565
+ self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
566
+
567
+ max_positions = config.max_position_embeddings
568
+ self.register_buffer(
569
+ "bias", torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)), persistent=False
570
+ )
571
+
572
+ self.gradient_checkpointing = False
573
+
574
+ # Initialize weights and apply final processing
575
+ self.post_init()
576
+
577
+ def get_input_embeddings(self):
578
+ return self.wte
579
+
580
+ def set_input_embeddings(self, new_embeddings):
581
+ self.wte = new_embeddings
582
+
583
+ @add_start_docstrings_to_model_forward(GPT_BIGCODE_INPUTS_DOCSTRING)
584
+ def forward(
585
+ self,
586
+ input_ids: Optional[torch.Tensor] = None,
587
+ past_key_values: Optional[List[torch.Tensor]] = None,
588
+ attention_mask: Optional[torch.Tensor] = None,
589
+ token_type_ids: Optional[torch.Tensor] = None,
590
+ position_ids: Optional[torch.Tensor] = None,
591
+ head_mask: Optional[torch.Tensor] = None,
592
+ inputs_embeds: Optional[torch.Tensor] = None,
593
+ encoder_hidden_states: Optional[torch.Tensor] = None,
594
+ encoder_attention_mask: Optional[torch.Tensor] = None,
595
+ use_cache: Optional[bool] = None,
596
+ output_attentions: Optional[bool] = None,
597
+ output_hidden_states: Optional[bool] = None,
598
+ return_dict: Optional[bool] = None,
599
+ ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
600
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
601
+ output_hidden_states = (
602
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
603
+ )
604
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
605
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
606
+
607
+ if input_ids is not None and inputs_embeds is not None:
608
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
609
+ elif input_ids is not None:
610
+ input_shape = input_ids.size()
611
+ input_ids = input_ids.reshape(-1, input_shape[-1])
612
+ batch_size = input_ids.shape[0]
613
+ elif inputs_embeds is not None:
614
+ input_shape = inputs_embeds.size()[:-1]
615
+ batch_size = inputs_embeds.shape[0]
616
+ else:
617
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
618
+
619
+ if batch_size <= 0:
620
+ raise ValueError("batch_size has to be defined and > 0")
621
+
622
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
623
+
624
+ if token_type_ids is not None:
625
+ token_type_ids = token_type_ids.reshape(-1, input_shape[-1])
626
+ if position_ids is not None:
627
+ position_ids = position_ids.reshape(-1, input_shape[-1])
628
+
629
+ if past_key_values is None:
630
+ past_length = 0
631
+ past_key_values = tuple([None] * len(self.h))
632
+ else:
633
+ past_length = past_key_values[0][0].size(-2)
634
+
635
+ if attention_mask is not None and len(attention_mask.shape) == 2 and position_ids is None:
636
+ # create position_ids on the fly for batch generation
637
+ position_ids = attention_mask.long().cumsum(-1) - 1
638
+ position_ids.masked_fill_(attention_mask == 0, 1)
639
+ if past_length > 0:
640
+ position_ids = position_ids[:, past_length : input_shape[-1] + past_length :]
641
+ elif position_ids is None:
642
+ position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
643
+ position_ids = position_ids.unsqueeze(0).reshape(-1, input_shape[-1])
644
+
645
+ # Self-attention mask.
646
+ query_length = input_shape[-1]
647
+ key_length = past_length + query_length
648
+ self_attention_mask = self.bias[None, key_length - query_length : key_length, :key_length]
649
+
650
+ if attention_mask is not None:
651
+ self_attention_mask = self_attention_mask * attention_mask.reshape(batch_size, 1, -1).to(
652
+ dtype=torch.bool, device=self_attention_mask.device
653
+ )
654
+
655
+ # MQA models: (batch_size, query_length, n_heads, key_length)
656
+ # MHA models: (batch_size, n_heads, query_length, key_length)
657
+ attention_mask = self_attention_mask.unsqueeze(1)
658
+
659
+ encoder_attention_mask = None
660
+
661
+ # Prepare head mask if needed
662
+ # 1.0 in head_mask indicate we keep the head
663
+ # attention_probs has shape bsz x n_heads x N x N
664
+ # head_mask has shape n_layer x batch x n_heads x N x N
665
+ head_mask = self.get_head_mask(head_mask, self.config.n_layer)
666
+
667
+ if inputs_embeds is None:
668
+ inputs_embeds = self.wte(input_ids)
669
+
670
+ hidden_states = inputs_embeds
671
+ if self.position_embedding_type == "learned_absolute":
672
+ position_embeds = self.wpe(position_ids)
673
+ hidden_states = hidden_states + position_embeds
674
+
675
+ if token_type_ids is not None:
676
+ token_type_embeds = self.wte(token_type_ids)
677
+ hidden_states = hidden_states + token_type_embeds
678
+
679
+ hidden_states = self.drop(hidden_states)
680
+
681
+ output_shape = input_shape + (hidden_states.size(-1),)
682
+
683
+ presents = [] if use_cache else None
684
+ all_self_attentions = () if output_attentions else None
685
+ all_hidden_states = () if output_hidden_states else None
686
+ for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
687
+ if output_hidden_states:
688
+ all_hidden_states = all_hidden_states + (hidden_states,)
689
+
690
+ if self.gradient_checkpointing and self.training:
691
+
692
+ def create_custom_forward(module):
693
+ def custom_forward(*inputs):
694
+ # None for past_key_value
695
+ return module(*inputs, use_cache, output_attentions)
696
+
697
+ return custom_forward
698
+
699
+ outputs = torch.utils.checkpoint.checkpoint(
700
+ create_custom_forward(block),
701
+ hidden_states,
702
+ None,
703
+ attention_mask,
704
+ position_ids,
705
+ head_mask[i],
706
+ encoder_hidden_states,
707
+ encoder_attention_mask,
708
+ )
709
+ else:
710
+ outputs = block(
711
+ hidden_states,
712
+ layer_past=layer_past,
713
+ attention_mask=attention_mask,
714
+ position_ids=position_ids,
715
+ head_mask=head_mask[i],
716
+ encoder_hidden_states=encoder_hidden_states,
717
+ encoder_attention_mask=encoder_attention_mask,
718
+ use_cache=use_cache,
719
+ output_attentions=output_attentions,
720
+ )
721
+
722
+ hidden_states = outputs[0]
723
+ if use_cache:
724
+ presents.append(outputs[1])
725
+
726
+ if output_attentions:
727
+ all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)
728
+
729
+ hidden_states = self.ln_f(hidden_states)
730
+ hidden_states = hidden_states.reshape(output_shape)
731
+ # Add last hidden state
732
+ if output_hidden_states:
733
+ all_hidden_states = all_hidden_states + (hidden_states,)
734
+
735
+
736
+ if not return_dict:
737
+ return tuple(
738
+ v
739
+ for v in [hidden_states, presents, all_hidden_states, all_self_attentions]
740
+ if v is not None
741
+ )
742
+
743
+ return BaseModelOutputWithPastAndCrossAttentions(
744
+ last_hidden_state=hidden_states,
745
+ past_key_values=presents,
746
+ hidden_states=all_hidden_states,
747
+ attentions=all_self_attentions,
748
+ )
749
+
750
+ class EndOfFunctionCriteria(StoppingCriteria):
751
+ """Custom `StoppingCriteria` which checks if all generated functions in the batch are completed."""
752
+ def __init__(self, input_lengths, eof_strings, tokenizer):
753
+ self.input_lengths = input_lengths
754
+ self.eof_strings = eof_strings
755
+ self.tokenizer = tokenizer
756
+
757
+ def __call__(self, input_ids, scores, **kwargs):
758
+ """Returns true if all generated sequences contain any of the end-of-function strings."""
759
+ decoded_generations = []
760
+ for _input_ids, input_length in zip(input_ids, self.input_lengths):
761
+ decoded_generations.append(self.tokenizer.decode(_input_ids[input_length:]))
762
+ done = []
763
+ for decoded_generation in decoded_generations:
764
+ done.append(
765
+ any(
766
+ [
767
+ stop_string in decoded_generation
768
+ for stop_string in self.eof_strings
769
+ ]
770
+ )
771
+ )
772
+ return all(done)
773
+
774
+ class TextIterStreamer:
775
+ def __init__(self, tokenizer, skip_prompt=False, skip_special_tokens=False):
776
+ self.tokenizer = tokenizer
777
+ self.skip_prompt = skip_prompt
778
+ self.skip_special_tokens = skip_special_tokens
779
+ self.tokens = []
780
+ self.text_queue = Queue()
781
+ self.next_tokens_are_prompt = True
782
+
783
+ def put(self, value):
784
+ if self.skip_prompt and self.next_tokens_are_prompt:
785
+ self.next_tokens_are_prompt = False
786
+ else:
787
+ if len(value.shape) > 1:
788
+ value = value[0]
789
+ self.tokens.extend(value.tolist())
790
+ self.text_queue.put(
791
+ self.tokenizer.decode(self.tokens, skip_special_tokens=self.skip_special_tokens))
792
+
793
+ def end(self):
794
+ self.text_queue.put(None)
795
+
796
+ def __iter__(self):
797
+ return self
798
+
799
+ def __next__(self):
800
+ value = self.text_queue.get()
801
+ if value is None:
802
+ raise StopIteration()
803
+ else:
804
+ return value
805
+
806
+
807
+ @add_start_docstrings(
808
+ """
809
+ The GPT_BIGCODE Model transformer with a language modeling head on top (linear layer with weights tied to the input
810
+ embeddings).
811
+ """,
812
+ GPT_BIGCODE_START_DOCSTRING,
813
+ )
814
+ class CodeShellForCausalLM(CodeShellPreTrainedModel):
815
+ _tied_weights_keys = ["lm_head.weight"]
816
+
817
+ def __init__(self, config):
818
+ super().__init__(config)
819
+ self.transformer = CodeShellModel(config)
820
+ self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
821
+
822
+ # Initialize weights and apply final processing
823
+ self.post_init()
824
+
825
+ def quantize(self, bits: int):
826
+ try:
827
+ import bitsandbytes
828
+ from .quantizer import quantize
829
+ except ImportError:
830
+ raise ImportError(f"Needs bitsandbytes to run quantize.")
831
+ return quantize(self, bits)
832
+
833
+ def get_output_embeddings(self):
834
+ return self.lm_head
835
+
836
+ def set_output_embeddings(self, new_embeddings):
837
+ self.lm_head = new_embeddings
838
+
839
+ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
840
+ token_type_ids = kwargs.get("token_type_ids", None)
841
+ # only last token for inputs_ids if past is defined in kwargs
842
+ if past_key_values:
843
+ input_ids = input_ids[:, -1].unsqueeze(-1)
844
+ if token_type_ids is not None:
845
+ token_type_ids = token_type_ids[:, -1].unsqueeze(-1)
846
+
847
+ attention_mask = kwargs.get("attention_mask", None)
848
+ position_ids = kwargs.get("position_ids", None)
849
+
850
+ if attention_mask is not None and position_ids is None:
851
+ # create position_ids on the fly for batch generation
852
+ position_ids = attention_mask.long().cumsum(-1) - 1
853
+ position_ids.masked_fill_(attention_mask == 0, 1)
854
+ if past_key_values:
855
+ position_ids = position_ids[:, -1].unsqueeze(-1)
856
+ else:
857
+ position_ids = None
858
+
859
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
860
+ if inputs_embeds is not None and past_key_values is None:
861
+ model_inputs = {"inputs_embeds": inputs_embeds}
862
+ else:
863
+ model_inputs = {"input_ids": input_ids}
864
+
865
+ model_inputs.update(
866
+ {
867
+ "past_key_values": past_key_values,
868
+ "use_cache": kwargs.get("use_cache"),
869
+ "position_ids": position_ids,
870
+ "attention_mask": attention_mask,
871
+ "token_type_ids": token_type_ids,
872
+ }
873
+ )
874
+ return model_inputs
875
+
876
+ @add_start_docstrings_to_model_forward(GPT_BIGCODE_INPUTS_DOCSTRING)
877
+ def forward(
878
+ self,
879
+ input_ids: Optional[torch.Tensor] = None,
880
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
881
+ attention_mask: Optional[torch.Tensor] = None,
882
+ token_type_ids: Optional[torch.Tensor] = None,
883
+ position_ids: Optional[torch.Tensor] = None,
884
+ head_mask: Optional[torch.Tensor] = None,
885
+ inputs_embeds: Optional[torch.Tensor] = None,
886
+ encoder_hidden_states: Optional[torch.Tensor] = None,
887
+ encoder_attention_mask: Optional[torch.Tensor] = None,
888
+ labels: Optional[torch.Tensor] = None,
889
+ use_cache: Optional[bool] = None,
890
+ output_attentions: Optional[bool] = None,
891
+ output_hidden_states: Optional[bool] = None,
892
+ return_dict: Optional[bool] = None,
893
+ ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
894
+ r"""
895
+ labels (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
896
+ Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
897
+ `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
898
+ are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
899
+ """
900
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
901
+
902
+ transformer_outputs = self.transformer(
903
+ input_ids,
904
+ past_key_values=past_key_values,
905
+ attention_mask=attention_mask,
906
+ token_type_ids=token_type_ids,
907
+ position_ids=position_ids,
908
+ head_mask=head_mask,
909
+ inputs_embeds=inputs_embeds,
910
+ encoder_hidden_states=encoder_hidden_states,
911
+ encoder_attention_mask=encoder_attention_mask,
912
+ use_cache=use_cache,
913
+ output_attentions=output_attentions,
914
+ output_hidden_states=output_hidden_states,
915
+ return_dict=return_dict,
916
+ )
917
+ hidden_states = transformer_outputs[0]
918
+ lm_logits = self.lm_head(hidden_states)
919
+ loss = None
920
+ if labels is not None:
921
+ # Shift so that tokens < n predict n
922
+ shift_logits = lm_logits[..., :-1, :].contiguous()
923
+ shift_labels = labels[..., 1:].contiguous().to(shift_logits.device)
924
+ # Flatten the tokens
925
+ loss_fct = CrossEntropyLoss()
926
+ loss = loss_fct(shift_logits.reshape(-1, shift_logits.size(-1)), shift_labels.reshape(-1))
927
+
928
+ if not return_dict:
929
+ output = (lm_logits,) + transformer_outputs[1:]
930
+ return ((loss,) + output) if loss is not None else output
931
+
932
+ return CausalLMOutputWithCrossAttentions(
933
+ loss=loss,
934
+ logits=lm_logits,
935
+ past_key_values=transformer_outputs.past_key_values,
936
+ hidden_states=transformer_outputs.hidden_states,
937
+ attentions=transformer_outputs.attentions,
938
+ )
939
+
940
+ @staticmethod
941
+ def _reorder_cache(past_key_values, beam_idx):
942
+ reordered_past = ()
943
+ for layer_past in past_key_values:
944
+ reordered_past += (
945
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
946
+ )
947
+ return reordered_past
948
+
949
+
950
+ def build_chat_input(self, query, history, tokenizer, max_new_tokens=None):
951
+ user_name = "<human>:"
952
+ ai_name = "<assistant>:"
953
+ stop = "<|endoftext|>"
954
+
955
+ prompt = ''
956
+ for q, r in history:
957
+ prompt += f"{user_name}{q}{stop}"
958
+ prompt += f"{ai_name}{r}{stop}"
959
+ prompt += f"{user_name}{query}{stop}"
960
+ prompt += ai_name.rstrip()
961
+
962
+ max_new_tokens = max_new_tokens or self.generation_config.max_new_tokens or 1024
963
+ max_input_tokens = self.config.n_positions - max_new_tokens
964
+
965
+ input_tokens = tokenizer.encode(prompt)
966
+ input_tokens = input_tokens[-max_input_tokens:] # truncate left
967
+ return torch.LongTensor([input_tokens]).to(self.device)
968
+
969
+ def chat(self, query, history, tokenizer, stream=False,
970
+ generation_config: Optional[GenerationConfig]=None):
971
+ generation_config = generation_config or self.generation_config
972
+ input_ids = self.build_chat_input(query, history, tokenizer, generation_config.max_new_tokens)
973
+ stopping_criteria = StoppingCriteriaList(
974
+ [EndOfFunctionCriteria([len(input_ids[0])], ["<|endoftext|>", "<human>:"], tokenizer)]
975
+ )
976
+
977
+ if stream:
978
+ streamer = TextIterStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
979
+ Thread(target=self.generate, kwargs=dict(
980
+ inputs=input_ids, streamer=streamer,
981
+ stopping_criteria = stopping_criteria,
982
+ generation_config=generation_config,
983
+ )).start()
984
+ return streamer
985
+ else:
986
+ outputs = self.generate(input_ids, generation_config=generation_config, stopping_criteria = stopping_criteria)
987
+ response = tokenizer.decode(outputs[0][len(input_ids[0]):], skip_special_tokens=True)
988
+ return response
989
+
990
+ def generate_stream(self, prompt, tokenizer, generation_config=None, **kwargs):
991
+ generation_config = generation_config or self.generation_config
992
+ max_input_tokens = self.config.n_positions - self.generation_config.max_new_tokens
993
+
994
+ input_ids = tokenizer.encode(prompt)
+ input_ids = input_ids[-max_input_tokens:] # truncate left
+ input_ids = torch.LongTensor([input_ids]).to(self.device) # wrap as a 1-element batch so generate() and the stopping criteria receive a tensor
+
+ stopping_criteria = StoppingCriteriaList(
+ [EndOfFunctionCriteria([len(input_ids[0])], ["<|endoftext|>", "<human>:"], tokenizer)]
999
+ )
1000
+
1001
+ streamer = TextIterStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
1002
+ Thread(target=self.generate, kwargs=dict(
1003
+ inputs=input_ids, streamer=streamer, stopping_criteria=stopping_criteria, **kwargs
1004
+ )).start()
1005
+ return streamer
1006
+
1007
+
1008
+ class CodeShell4bitForCausalLM(CodeShellForCausalLM):
1009
+ def __init__(self, config):
1010
+ CodeShellPreTrainedModel.__init__(self, config)
1011
+ self.transformer = CodeShellModel(config)
1012
+ self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
1013
+
1014
+ try:
1015
+ import bitsandbytes
1016
+ from .quantizer import quantize_offline
1017
+ quantize_offline(self)
1018
+ except ImportError:
1019
+ raise ImportError(f"Needs bitsandbytes to run quantize.")
1020
+
1021
+ self.post_init()
1022
+
1023
+ @classmethod
1024
+ def from_pretrained(
1025
+ cls,
1026
+ pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
1027
+ *model_args,
1028
+ config: Optional[Union[PretrainedConfig, str, os.PathLike]] = None,
1029
+ cache_dir: Optional[Union[str, os.PathLike]] = None,
1030
+ ignore_mismatched_sizes: bool = False,
1031
+ force_download: bool = False,
1032
+ local_files_only: bool = False,
1033
+ token: Optional[Union[str, bool]] = None,
1034
+ revision: str = "main",
1035
+ use_safetensors: bool = None,
1036
+ **kwargs,
1037
+ ):
1038
+ if not isinstance(config, PretrainedConfig):
1039
+ config_path = config if config is not None else pretrained_model_name_or_path
1040
+ config, _ = cls.config_class.from_pretrained(
1041
+ config_path,
1042
+ cache_dir=cache_dir,
1043
+ return_unused_kwargs=True,
1044
+ force_download=force_download,
1045
+ resume_download=False,
1046
+ proxies=None,
1047
+ local_files_only=local_files_only,
1048
+ token=token,
1049
+ revision=revision,
1050
+ subfolder="",
1051
+ _from_auto=False,
1052
+ _from_pipeline=None,
1053
+ **kwargs,
1054
+ )
1055
+
1056
+ # Load config if we don't provide a configuration
1057
+ from .quantizer import load_state_dict_for_qunantied_model
1058
+ model = cls(config)
1059
+ state_dict = torch.load(os.path.join(pretrained_model_name_or_path, 'pytorch_model.bin'), map_location="cpu")
1060
+ model = load_state_dict_for_qunantied_model(model, state_dict)
1061
+ model.eval()
1062
+
1063
+ # If it is a model with generation capabilities, attempt to load the generation config
1064
+ if model.can_generate():
1065
+ try:
1066
+ model.generation_config = GenerationConfig.from_pretrained(
1067
+ pretrained_model_name_or_path,
1068
+ cache_dir=cache_dir,
1069
+ force_download=force_download,
1070
+ resume_download=False,
1071
+ proxies=None,
1072
+ local_files_only=local_files_only,
1073
+ token=token,
1074
+ revision=revision,
1075
+ subfolder="",
1076
+ _from_auto=False,
1077
+ _from_pipeline=None,
1078
+ **kwargs,
1079
+ )
1080
+ except (OSError, TypeError):
1081
+ pass
1082
+
1083
+ device_map = kwargs.pop("device_map", None)
1084
+ if device_map is not None:
1085
+ model = model.to(torch.device(device_map))
1086
+
1087
+ return model
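
A minimal usage sketch for the chat interface defined above. The repository id, device, and dtype are assumptions (the id below is a placeholder), and it assumes the TextIterStreamer defined earlier in this file yields text chunks when iterated; trust_remote_code is needed so the custom modeling_codeshell.py classes are used.

# usage_chat.py -- hedged example, not part of the commit
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "WisdomShell/CodeShell-7B-Chat"  # placeholder repository id
tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    trust_remote_code=True,        # load the CodeShellForCausalLM class from this repo
    torch_dtype=torch.bfloat16,
).cuda().eval()

history = []  # list of (query, response) pairs, as expected by build_chat_input
response = model.chat("Write a binary search function in Python.", history, tokenizer)
print(response)

# streaming variant: chat(..., stream=True) returns the streamer started in a background thread
for piece in model.chat("Explain the code above.", history, tokenizer, stream=True):
    print(piece, end="", flush=True)
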
pytorch_model-00001-of-00002.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:68ef7604ab0b5810b1da75749189ed19bdd312b426cc972be217853f3ae1eb5a
+ size 9955739637
pytorch_model-00002-of-00002.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:85b11644d508f070bd3dbede750802035028e2216f7bb39c74f52d9cea558fcc
+ size 5420546104
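
The two .bin entries above are Git LFS pointer files rather than the weights themselves: each records the sha256 oid and byte size of a checkpoint shard. A short hedged sketch of verifying a downloaded shard against its pointer (local file paths are assumptions):

# verify_shard.py -- hedged example
import hashlib
import os

def verify(path, expected_oid, expected_size, chunk=1 << 20):
    # compare on-disk size and sha256 digest with the values from the LFS pointer
    assert os.path.getsize(path) == expected_size, "size mismatch"
    h = hashlib.sha256()
    with open(path, "rb") as f:
        while data := f.read(chunk):
            h.update(data)
    assert h.hexdigest() == expected_oid, "sha256 mismatch"

verify("pytorch_model-00001-of-00002.bin",
       "68ef7604ab0b5810b1da75749189ed19bdd312b426cc972be217853f3ae1eb5a",
       9955739637)
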
pytorch_model.bin.index.json ADDED
@@ -0,0 +1,557 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 15376102656
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "pytorch_model-00001-of-00002.bin",
7
+ "transformer.h.0.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
8
+ "transformer.h.0.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
9
+ "transformer.h.0.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
10
+ "transformer.h.0.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
11
+ "transformer.h.0.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
12
+ "transformer.h.0.ln_1.bias": "pytorch_model-00001-of-00002.bin",
13
+ "transformer.h.0.ln_1.weight": "pytorch_model-00001-of-00002.bin",
14
+ "transformer.h.0.ln_2.bias": "pytorch_model-00001-of-00002.bin",
15
+ "transformer.h.0.ln_2.weight": "pytorch_model-00001-of-00002.bin",
16
+ "transformer.h.0.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
17
+ "transformer.h.0.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
18
+ "transformer.h.0.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
19
+ "transformer.h.0.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
20
+ "transformer.h.1.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
21
+ "transformer.h.1.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
22
+ "transformer.h.1.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
23
+ "transformer.h.1.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
24
+ "transformer.h.1.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
25
+ "transformer.h.1.ln_1.bias": "pytorch_model-00001-of-00002.bin",
26
+ "transformer.h.1.ln_1.weight": "pytorch_model-00001-of-00002.bin",
27
+ "transformer.h.1.ln_2.bias": "pytorch_model-00001-of-00002.bin",
28
+ "transformer.h.1.ln_2.weight": "pytorch_model-00001-of-00002.bin",
29
+ "transformer.h.1.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
30
+ "transformer.h.1.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
31
+ "transformer.h.1.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
32
+ "transformer.h.1.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
33
+ "transformer.h.10.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
34
+ "transformer.h.10.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
35
+ "transformer.h.10.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
36
+ "transformer.h.10.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
37
+ "transformer.h.10.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
38
+ "transformer.h.10.ln_1.bias": "pytorch_model-00001-of-00002.bin",
39
+ "transformer.h.10.ln_1.weight": "pytorch_model-00001-of-00002.bin",
40
+ "transformer.h.10.ln_2.bias": "pytorch_model-00001-of-00002.bin",
41
+ "transformer.h.10.ln_2.weight": "pytorch_model-00001-of-00002.bin",
42
+ "transformer.h.10.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
43
+ "transformer.h.10.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
44
+ "transformer.h.10.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
45
+ "transformer.h.10.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
46
+ "transformer.h.11.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
47
+ "transformer.h.11.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
48
+ "transformer.h.11.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
49
+ "transformer.h.11.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
50
+ "transformer.h.11.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
51
+ "transformer.h.11.ln_1.bias": "pytorch_model-00001-of-00002.bin",
52
+ "transformer.h.11.ln_1.weight": "pytorch_model-00001-of-00002.bin",
53
+ "transformer.h.11.ln_2.bias": "pytorch_model-00001-of-00002.bin",
54
+ "transformer.h.11.ln_2.weight": "pytorch_model-00001-of-00002.bin",
55
+ "transformer.h.11.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
56
+ "transformer.h.11.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
57
+ "transformer.h.11.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
58
+ "transformer.h.11.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
59
+ "transformer.h.12.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
60
+ "transformer.h.12.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
61
+ "transformer.h.12.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
62
+ "transformer.h.12.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
63
+ "transformer.h.12.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
64
+ "transformer.h.12.ln_1.bias": "pytorch_model-00001-of-00002.bin",
65
+ "transformer.h.12.ln_1.weight": "pytorch_model-00001-of-00002.bin",
66
+ "transformer.h.12.ln_2.bias": "pytorch_model-00001-of-00002.bin",
67
+ "transformer.h.12.ln_2.weight": "pytorch_model-00001-of-00002.bin",
68
+ "transformer.h.12.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
69
+ "transformer.h.12.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
70
+ "transformer.h.12.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
71
+ "transformer.h.12.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
72
+ "transformer.h.13.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
73
+ "transformer.h.13.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
74
+ "transformer.h.13.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
75
+ "transformer.h.13.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
76
+ "transformer.h.13.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
77
+ "transformer.h.13.ln_1.bias": "pytorch_model-00001-of-00002.bin",
78
+ "transformer.h.13.ln_1.weight": "pytorch_model-00001-of-00002.bin",
79
+ "transformer.h.13.ln_2.bias": "pytorch_model-00001-of-00002.bin",
80
+ "transformer.h.13.ln_2.weight": "pytorch_model-00001-of-00002.bin",
81
+ "transformer.h.13.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
82
+ "transformer.h.13.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
83
+ "transformer.h.13.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
84
+ "transformer.h.13.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
85
+ "transformer.h.14.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
86
+ "transformer.h.14.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
87
+ "transformer.h.14.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
88
+ "transformer.h.14.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
89
+ "transformer.h.14.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
90
+ "transformer.h.14.ln_1.bias": "pytorch_model-00001-of-00002.bin",
91
+ "transformer.h.14.ln_1.weight": "pytorch_model-00001-of-00002.bin",
92
+ "transformer.h.14.ln_2.bias": "pytorch_model-00001-of-00002.bin",
93
+ "transformer.h.14.ln_2.weight": "pytorch_model-00001-of-00002.bin",
94
+ "transformer.h.14.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
95
+ "transformer.h.14.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
96
+ "transformer.h.14.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
97
+ "transformer.h.14.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
98
+ "transformer.h.15.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
99
+ "transformer.h.15.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
100
+ "transformer.h.15.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
101
+ "transformer.h.15.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
102
+ "transformer.h.15.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
103
+ "transformer.h.15.ln_1.bias": "pytorch_model-00001-of-00002.bin",
104
+ "transformer.h.15.ln_1.weight": "pytorch_model-00001-of-00002.bin",
105
+ "transformer.h.15.ln_2.bias": "pytorch_model-00001-of-00002.bin",
106
+ "transformer.h.15.ln_2.weight": "pytorch_model-00001-of-00002.bin",
107
+ "transformer.h.15.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
108
+ "transformer.h.15.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
109
+ "transformer.h.15.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
110
+ "transformer.h.15.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
111
+ "transformer.h.16.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
112
+ "transformer.h.16.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
113
+ "transformer.h.16.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
114
+ "transformer.h.16.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
115
+ "transformer.h.16.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
116
+ "transformer.h.16.ln_1.bias": "pytorch_model-00001-of-00002.bin",
117
+ "transformer.h.16.ln_1.weight": "pytorch_model-00001-of-00002.bin",
118
+ "transformer.h.16.ln_2.bias": "pytorch_model-00001-of-00002.bin",
119
+ "transformer.h.16.ln_2.weight": "pytorch_model-00001-of-00002.bin",
120
+ "transformer.h.16.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
121
+ "transformer.h.16.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
122
+ "transformer.h.16.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
123
+ "transformer.h.16.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
124
+ "transformer.h.17.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
125
+ "transformer.h.17.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
126
+ "transformer.h.17.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
127
+ "transformer.h.17.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
128
+ "transformer.h.17.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
129
+ "transformer.h.17.ln_1.bias": "pytorch_model-00001-of-00002.bin",
130
+ "transformer.h.17.ln_1.weight": "pytorch_model-00001-of-00002.bin",
131
+ "transformer.h.17.ln_2.bias": "pytorch_model-00001-of-00002.bin",
132
+ "transformer.h.17.ln_2.weight": "pytorch_model-00001-of-00002.bin",
133
+ "transformer.h.17.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
134
+ "transformer.h.17.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
135
+ "transformer.h.17.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
136
+ "transformer.h.17.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
137
+ "transformer.h.18.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
138
+ "transformer.h.18.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
139
+ "transformer.h.18.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
140
+ "transformer.h.18.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
141
+ "transformer.h.18.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
142
+ "transformer.h.18.ln_1.bias": "pytorch_model-00001-of-00002.bin",
143
+ "transformer.h.18.ln_1.weight": "pytorch_model-00001-of-00002.bin",
144
+ "transformer.h.18.ln_2.bias": "pytorch_model-00001-of-00002.bin",
145
+ "transformer.h.18.ln_2.weight": "pytorch_model-00001-of-00002.bin",
146
+ "transformer.h.18.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
147
+ "transformer.h.18.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
148
+ "transformer.h.18.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
149
+ "transformer.h.18.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
150
+ "transformer.h.19.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
151
+ "transformer.h.19.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
152
+ "transformer.h.19.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
153
+ "transformer.h.19.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
154
+ "transformer.h.19.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
155
+ "transformer.h.19.ln_1.bias": "pytorch_model-00001-of-00002.bin",
156
+ "transformer.h.19.ln_1.weight": "pytorch_model-00001-of-00002.bin",
157
+ "transformer.h.19.ln_2.bias": "pytorch_model-00001-of-00002.bin",
158
+ "transformer.h.19.ln_2.weight": "pytorch_model-00001-of-00002.bin",
159
+ "transformer.h.19.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
160
+ "transformer.h.19.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
161
+ "transformer.h.19.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
162
+ "transformer.h.19.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
163
+ "transformer.h.2.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
164
+ "transformer.h.2.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
165
+ "transformer.h.2.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
166
+ "transformer.h.2.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
167
+ "transformer.h.2.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
168
+ "transformer.h.2.ln_1.bias": "pytorch_model-00001-of-00002.bin",
169
+ "transformer.h.2.ln_1.weight": "pytorch_model-00001-of-00002.bin",
170
+ "transformer.h.2.ln_2.bias": "pytorch_model-00001-of-00002.bin",
171
+ "transformer.h.2.ln_2.weight": "pytorch_model-00001-of-00002.bin",
172
+ "transformer.h.2.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
173
+ "transformer.h.2.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
174
+ "transformer.h.2.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
175
+ "transformer.h.2.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
176
+ "transformer.h.20.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
177
+ "transformer.h.20.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
178
+ "transformer.h.20.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
179
+ "transformer.h.20.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
180
+ "transformer.h.20.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
181
+ "transformer.h.20.ln_1.bias": "pytorch_model-00001-of-00002.bin",
182
+ "transformer.h.20.ln_1.weight": "pytorch_model-00001-of-00002.bin",
183
+ "transformer.h.20.ln_2.bias": "pytorch_model-00001-of-00002.bin",
184
+ "transformer.h.20.ln_2.weight": "pytorch_model-00001-of-00002.bin",
185
+ "transformer.h.20.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
186
+ "transformer.h.20.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
187
+ "transformer.h.20.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
188
+ "transformer.h.20.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
189
+ "transformer.h.21.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
190
+ "transformer.h.21.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
191
+ "transformer.h.21.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
192
+ "transformer.h.21.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
193
+ "transformer.h.21.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
194
+ "transformer.h.21.ln_1.bias": "pytorch_model-00001-of-00002.bin",
195
+ "transformer.h.21.ln_1.weight": "pytorch_model-00001-of-00002.bin",
196
+ "transformer.h.21.ln_2.bias": "pytorch_model-00001-of-00002.bin",
197
+ "transformer.h.21.ln_2.weight": "pytorch_model-00001-of-00002.bin",
198
+ "transformer.h.21.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
199
+ "transformer.h.21.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
200
+ "transformer.h.21.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
201
+ "transformer.h.21.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
202
+ "transformer.h.22.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
203
+ "transformer.h.22.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
204
+ "transformer.h.22.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
205
+ "transformer.h.22.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
206
+ "transformer.h.22.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
207
+ "transformer.h.22.ln_1.bias": "pytorch_model-00001-of-00002.bin",
208
+ "transformer.h.22.ln_1.weight": "pytorch_model-00001-of-00002.bin",
209
+ "transformer.h.22.ln_2.bias": "pytorch_model-00001-of-00002.bin",
210
+ "transformer.h.22.ln_2.weight": "pytorch_model-00001-of-00002.bin",
211
+ "transformer.h.22.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
212
+ "transformer.h.22.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
213
+ "transformer.h.22.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
214
+ "transformer.h.22.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
215
+ "transformer.h.23.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
216
+ "transformer.h.23.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
217
+ "transformer.h.23.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
218
+ "transformer.h.23.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
219
+ "transformer.h.23.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
220
+ "transformer.h.23.ln_1.bias": "pytorch_model-00001-of-00002.bin",
221
+ "transformer.h.23.ln_1.weight": "pytorch_model-00001-of-00002.bin",
222
+ "transformer.h.23.ln_2.bias": "pytorch_model-00001-of-00002.bin",
223
+ "transformer.h.23.ln_2.weight": "pytorch_model-00001-of-00002.bin",
224
+ "transformer.h.23.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
225
+ "transformer.h.23.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
226
+ "transformer.h.23.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
227
+ "transformer.h.23.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
228
+ "transformer.h.24.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
229
+ "transformer.h.24.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
230
+ "transformer.h.24.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
231
+ "transformer.h.24.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
232
+ "transformer.h.24.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
233
+ "transformer.h.24.ln_1.bias": "pytorch_model-00001-of-00002.bin",
234
+ "transformer.h.24.ln_1.weight": "pytorch_model-00001-of-00002.bin",
235
+ "transformer.h.24.ln_2.bias": "pytorch_model-00001-of-00002.bin",
236
+ "transformer.h.24.ln_2.weight": "pytorch_model-00001-of-00002.bin",
237
+ "transformer.h.24.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
238
+ "transformer.h.24.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
239
+ "transformer.h.24.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
240
+ "transformer.h.24.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
241
+ "transformer.h.25.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
242
+ "transformer.h.25.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
243
+ "transformer.h.25.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
244
+ "transformer.h.25.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
245
+ "transformer.h.25.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
246
+ "transformer.h.25.ln_1.bias": "pytorch_model-00001-of-00002.bin",
247
+ "transformer.h.25.ln_1.weight": "pytorch_model-00001-of-00002.bin",
248
+ "transformer.h.25.ln_2.bias": "pytorch_model-00001-of-00002.bin",
249
+ "transformer.h.25.ln_2.weight": "pytorch_model-00001-of-00002.bin",
250
+ "transformer.h.25.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
251
+ "transformer.h.25.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
252
+ "transformer.h.25.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
253
+ "transformer.h.25.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
254
+ "transformer.h.26.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
255
+ "transformer.h.26.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
256
+ "transformer.h.26.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
257
+ "transformer.h.26.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
258
+ "transformer.h.26.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
259
+ "transformer.h.26.ln_1.bias": "pytorch_model-00001-of-00002.bin",
260
+ "transformer.h.26.ln_1.weight": "pytorch_model-00001-of-00002.bin",
261
+ "transformer.h.26.ln_2.bias": "pytorch_model-00001-of-00002.bin",
262
+ "transformer.h.26.ln_2.weight": "pytorch_model-00001-of-00002.bin",
263
+ "transformer.h.26.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
264
+ "transformer.h.26.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
265
+ "transformer.h.26.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
266
+ "transformer.h.26.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
267
+ "transformer.h.27.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
268
+ "transformer.h.27.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
269
+ "transformer.h.27.attn.c_proj.bias": "pytorch_model-00002-of-00002.bin",
270
+ "transformer.h.27.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
271
+ "transformer.h.27.attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
272
+ "transformer.h.27.ln_1.bias": "pytorch_model-00002-of-00002.bin",
273
+ "transformer.h.27.ln_1.weight": "pytorch_model-00002-of-00002.bin",
274
+ "transformer.h.27.ln_2.bias": "pytorch_model-00002-of-00002.bin",
275
+ "transformer.h.27.ln_2.weight": "pytorch_model-00002-of-00002.bin",
276
+ "transformer.h.27.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
277
+ "transformer.h.27.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
278
+ "transformer.h.27.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
279
+ "transformer.h.27.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
280
+ "transformer.h.28.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
281
+ "transformer.h.28.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
282
+ "transformer.h.28.attn.c_proj.bias": "pytorch_model-00002-of-00002.bin",
283
+ "transformer.h.28.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
284
+ "transformer.h.28.attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
285
+ "transformer.h.28.ln_1.bias": "pytorch_model-00002-of-00002.bin",
286
+ "transformer.h.28.ln_1.weight": "pytorch_model-00002-of-00002.bin",
287
+ "transformer.h.28.ln_2.bias": "pytorch_model-00002-of-00002.bin",
288
+ "transformer.h.28.ln_2.weight": "pytorch_model-00002-of-00002.bin",
289
+ "transformer.h.28.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
290
+ "transformer.h.28.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
291
+ "transformer.h.28.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
292
+ "transformer.h.28.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
293
+ "transformer.h.29.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
294
+ "transformer.h.29.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
295
+ "transformer.h.29.attn.c_proj.bias": "pytorch_model-00002-of-00002.bin",
296
+ "transformer.h.29.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
297
+ "transformer.h.29.attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
298
+ "transformer.h.29.ln_1.bias": "pytorch_model-00002-of-00002.bin",
299
+ "transformer.h.29.ln_1.weight": "pytorch_model-00002-of-00002.bin",
300
+ "transformer.h.29.ln_2.bias": "pytorch_model-00002-of-00002.bin",
301
+ "transformer.h.29.ln_2.weight": "pytorch_model-00002-of-00002.bin",
302
+ "transformer.h.29.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
303
+ "transformer.h.29.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
304
+ "transformer.h.29.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
305
+ "transformer.h.29.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
306
+ "transformer.h.3.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
307
+ "transformer.h.3.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
308
+ "transformer.h.3.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
309
+ "transformer.h.3.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
310
+ "transformer.h.3.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
311
+ "transformer.h.3.ln_1.bias": "pytorch_model-00001-of-00002.bin",
312
+ "transformer.h.3.ln_1.weight": "pytorch_model-00001-of-00002.bin",
313
+ "transformer.h.3.ln_2.bias": "pytorch_model-00001-of-00002.bin",
314
+ "transformer.h.3.ln_2.weight": "pytorch_model-00001-of-00002.bin",
315
+ "transformer.h.3.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
316
+ "transformer.h.3.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
317
+ "transformer.h.3.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
318
+ "transformer.h.3.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
319
+ "transformer.h.30.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
320
+ "transformer.h.30.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
321
+ "transformer.h.30.attn.c_proj.bias": "pytorch_model-00002-of-00002.bin",
322
+ "transformer.h.30.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
323
+ "transformer.h.30.attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
324
+ "transformer.h.30.ln_1.bias": "pytorch_model-00002-of-00002.bin",
325
+ "transformer.h.30.ln_1.weight": "pytorch_model-00002-of-00002.bin",
326
+ "transformer.h.30.ln_2.bias": "pytorch_model-00002-of-00002.bin",
327
+ "transformer.h.30.ln_2.weight": "pytorch_model-00002-of-00002.bin",
328
+ "transformer.h.30.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
329
+ "transformer.h.30.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
330
+ "transformer.h.30.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
331
+ "transformer.h.30.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
332
+ "transformer.h.31.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
333
+ "transformer.h.31.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
334
+ "transformer.h.31.attn.c_proj.bias": "pytorch_model-00002-of-00002.bin",
335
+ "transformer.h.31.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
336
+ "transformer.h.31.attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
337
+ "transformer.h.31.ln_1.bias": "pytorch_model-00002-of-00002.bin",
338
+ "transformer.h.31.ln_1.weight": "pytorch_model-00002-of-00002.bin",
339
+ "transformer.h.31.ln_2.bias": "pytorch_model-00002-of-00002.bin",
340
+ "transformer.h.31.ln_2.weight": "pytorch_model-00002-of-00002.bin",
341
+ "transformer.h.31.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
342
+ "transformer.h.31.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
343
+ "transformer.h.31.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
344
+ "transformer.h.31.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
345
+ "transformer.h.32.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
346
+ "transformer.h.32.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
347
+ "transformer.h.32.attn.c_proj.bias": "pytorch_model-00002-of-00002.bin",
348
+ "transformer.h.32.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
349
+ "transformer.h.32.attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
350
+ "transformer.h.32.ln_1.bias": "pytorch_model-00002-of-00002.bin",
351
+ "transformer.h.32.ln_1.weight": "pytorch_model-00002-of-00002.bin",
352
+ "transformer.h.32.ln_2.bias": "pytorch_model-00002-of-00002.bin",
353
+ "transformer.h.32.ln_2.weight": "pytorch_model-00002-of-00002.bin",
354
+ "transformer.h.32.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
355
+ "transformer.h.32.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
356
+ "transformer.h.32.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
357
+ "transformer.h.32.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
358
+ "transformer.h.33.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
359
+ "transformer.h.33.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
360
+ "transformer.h.33.attn.c_proj.bias": "pytorch_model-00002-of-00002.bin",
361
+ "transformer.h.33.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
362
+ "transformer.h.33.attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
363
+ "transformer.h.33.ln_1.bias": "pytorch_model-00002-of-00002.bin",
364
+ "transformer.h.33.ln_1.weight": "pytorch_model-00002-of-00002.bin",
365
+ "transformer.h.33.ln_2.bias": "pytorch_model-00002-of-00002.bin",
366
+ "transformer.h.33.ln_2.weight": "pytorch_model-00002-of-00002.bin",
367
+ "transformer.h.33.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
368
+ "transformer.h.33.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
369
+ "transformer.h.33.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
370
+ "transformer.h.33.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
371
+ "transformer.h.34.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
372
+ "transformer.h.34.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
373
+ "transformer.h.34.attn.c_proj.bias": "pytorch_model-00002-of-00002.bin",
374
+ "transformer.h.34.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
375
+ "transformer.h.34.attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
376
+ "transformer.h.34.ln_1.bias": "pytorch_model-00002-of-00002.bin",
377
+ "transformer.h.34.ln_1.weight": "pytorch_model-00002-of-00002.bin",
378
+ "transformer.h.34.ln_2.bias": "pytorch_model-00002-of-00002.bin",
379
+ "transformer.h.34.ln_2.weight": "pytorch_model-00002-of-00002.bin",
380
+ "transformer.h.34.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
381
+ "transformer.h.34.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
382
+ "transformer.h.34.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
383
+ "transformer.h.34.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
384
+ "transformer.h.35.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
385
+ "transformer.h.35.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
386
+ "transformer.h.35.attn.c_proj.bias": "pytorch_model-00002-of-00002.bin",
387
+ "transformer.h.35.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
388
+ "transformer.h.35.attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
389
+ "transformer.h.35.ln_1.bias": "pytorch_model-00002-of-00002.bin",
390
+ "transformer.h.35.ln_1.weight": "pytorch_model-00002-of-00002.bin",
391
+ "transformer.h.35.ln_2.bias": "pytorch_model-00002-of-00002.bin",
392
+ "transformer.h.35.ln_2.weight": "pytorch_model-00002-of-00002.bin",
393
+ "transformer.h.35.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
394
+ "transformer.h.35.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
395
+ "transformer.h.35.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
396
+ "transformer.h.35.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
397
+ "transformer.h.36.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
398
+ "transformer.h.36.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
399
+ "transformer.h.36.attn.c_proj.bias": "pytorch_model-00002-of-00002.bin",
400
+ "transformer.h.36.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
401
+ "transformer.h.36.attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
402
+ "transformer.h.36.ln_1.bias": "pytorch_model-00002-of-00002.bin",
403
+ "transformer.h.36.ln_1.weight": "pytorch_model-00002-of-00002.bin",
404
+ "transformer.h.36.ln_2.bias": "pytorch_model-00002-of-00002.bin",
405
+ "transformer.h.36.ln_2.weight": "pytorch_model-00002-of-00002.bin",
406
+ "transformer.h.36.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
407
+ "transformer.h.36.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
408
+ "transformer.h.36.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
409
+ "transformer.h.36.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
410
+ "transformer.h.37.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
411
+ "transformer.h.37.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
412
+ "transformer.h.37.attn.c_proj.bias": "pytorch_model-00002-of-00002.bin",
413
+ "transformer.h.37.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
414
+ "transformer.h.37.attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
415
+ "transformer.h.37.ln_1.bias": "pytorch_model-00002-of-00002.bin",
416
+ "transformer.h.37.ln_1.weight": "pytorch_model-00002-of-00002.bin",
417
+ "transformer.h.37.ln_2.bias": "pytorch_model-00002-of-00002.bin",
418
+ "transformer.h.37.ln_2.weight": "pytorch_model-00002-of-00002.bin",
419
+ "transformer.h.37.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
420
+ "transformer.h.37.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
421
+ "transformer.h.37.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
422
+ "transformer.h.37.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
423
+ "transformer.h.38.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
424
+ "transformer.h.38.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
425
+ "transformer.h.38.attn.c_proj.bias": "pytorch_model-00002-of-00002.bin",
426
+ "transformer.h.38.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
427
+ "transformer.h.38.attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
428
+ "transformer.h.38.ln_1.bias": "pytorch_model-00002-of-00002.bin",
429
+ "transformer.h.38.ln_1.weight": "pytorch_model-00002-of-00002.bin",
430
+ "transformer.h.38.ln_2.bias": "pytorch_model-00002-of-00002.bin",
431
+ "transformer.h.38.ln_2.weight": "pytorch_model-00002-of-00002.bin",
432
+ "transformer.h.38.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
433
+ "transformer.h.38.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
434
+ "transformer.h.38.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
435
+ "transformer.h.38.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
436
+ "transformer.h.39.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
437
+ "transformer.h.39.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
438
+ "transformer.h.39.attn.c_proj.bias": "pytorch_model-00002-of-00002.bin",
439
+ "transformer.h.39.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
440
+ "transformer.h.39.attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
441
+ "transformer.h.39.ln_1.bias": "pytorch_model-00002-of-00002.bin",
442
+ "transformer.h.39.ln_1.weight": "pytorch_model-00002-of-00002.bin",
443
+ "transformer.h.39.ln_2.bias": "pytorch_model-00002-of-00002.bin",
444
+ "transformer.h.39.ln_2.weight": "pytorch_model-00002-of-00002.bin",
445
+ "transformer.h.39.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
446
+ "transformer.h.39.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
447
+ "transformer.h.39.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
448
+ "transformer.h.39.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
449
+ "transformer.h.4.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
450
+ "transformer.h.4.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
451
+ "transformer.h.4.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
452
+ "transformer.h.4.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
453
+ "transformer.h.4.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
454
+ "transformer.h.4.ln_1.bias": "pytorch_model-00001-of-00002.bin",
455
+ "transformer.h.4.ln_1.weight": "pytorch_model-00001-of-00002.bin",
456
+ "transformer.h.4.ln_2.bias": "pytorch_model-00001-of-00002.bin",
457
+ "transformer.h.4.ln_2.weight": "pytorch_model-00001-of-00002.bin",
458
+ "transformer.h.4.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
459
+ "transformer.h.4.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
460
+ "transformer.h.4.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
461
+ "transformer.h.4.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
462
+ "transformer.h.40.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
463
+ "transformer.h.40.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
464
+ "transformer.h.40.attn.c_proj.bias": "pytorch_model-00002-of-00002.bin",
465
+ "transformer.h.40.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
466
+ "transformer.h.40.attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
467
+ "transformer.h.40.ln_1.bias": "pytorch_model-00002-of-00002.bin",
468
+ "transformer.h.40.ln_1.weight": "pytorch_model-00002-of-00002.bin",
469
+ "transformer.h.40.ln_2.bias": "pytorch_model-00002-of-00002.bin",
470
+ "transformer.h.40.ln_2.weight": "pytorch_model-00002-of-00002.bin",
471
+ "transformer.h.40.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
472
+ "transformer.h.40.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
473
+ "transformer.h.40.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
474
+ "transformer.h.40.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
475
+ "transformer.h.41.attn.c_attn.bias": "pytorch_model-00002-of-00002.bin",
476
+ "transformer.h.41.attn.c_attn.weight": "pytorch_model-00002-of-00002.bin",
477
+ "transformer.h.41.attn.c_proj.bias": "pytorch_model-00002-of-00002.bin",
478
+ "transformer.h.41.attn.c_proj.weight": "pytorch_model-00002-of-00002.bin",
479
+ "transformer.h.41.attn.rotary_emb.inv_freq": "pytorch_model-00002-of-00002.bin",
480
+ "transformer.h.41.ln_1.bias": "pytorch_model-00002-of-00002.bin",
481
+ "transformer.h.41.ln_1.weight": "pytorch_model-00002-of-00002.bin",
482
+ "transformer.h.41.ln_2.bias": "pytorch_model-00002-of-00002.bin",
483
+ "transformer.h.41.ln_2.weight": "pytorch_model-00002-of-00002.bin",
484
+ "transformer.h.41.mlp.c_fc.bias": "pytorch_model-00002-of-00002.bin",
485
+ "transformer.h.41.mlp.c_fc.weight": "pytorch_model-00002-of-00002.bin",
486
+ "transformer.h.41.mlp.c_proj.bias": "pytorch_model-00002-of-00002.bin",
487
+ "transformer.h.41.mlp.c_proj.weight": "pytorch_model-00002-of-00002.bin",
488
+ "transformer.h.5.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
489
+ "transformer.h.5.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
490
+ "transformer.h.5.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
491
+ "transformer.h.5.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
492
+ "transformer.h.5.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
493
+ "transformer.h.5.ln_1.bias": "pytorch_model-00001-of-00002.bin",
494
+ "transformer.h.5.ln_1.weight": "pytorch_model-00001-of-00002.bin",
495
+ "transformer.h.5.ln_2.bias": "pytorch_model-00001-of-00002.bin",
496
+ "transformer.h.5.ln_2.weight": "pytorch_model-00001-of-00002.bin",
497
+ "transformer.h.5.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
498
+ "transformer.h.5.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
499
+ "transformer.h.5.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
500
+ "transformer.h.5.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
501
+ "transformer.h.6.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
502
+ "transformer.h.6.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
503
+ "transformer.h.6.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
504
+ "transformer.h.6.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
505
+ "transformer.h.6.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
506
+ "transformer.h.6.ln_1.bias": "pytorch_model-00001-of-00002.bin",
507
+ "transformer.h.6.ln_1.weight": "pytorch_model-00001-of-00002.bin",
508
+ "transformer.h.6.ln_2.bias": "pytorch_model-00001-of-00002.bin",
509
+ "transformer.h.6.ln_2.weight": "pytorch_model-00001-of-00002.bin",
510
+ "transformer.h.6.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
511
+ "transformer.h.6.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
512
+ "transformer.h.6.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
513
+ "transformer.h.6.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
514
+ "transformer.h.7.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
515
+ "transformer.h.7.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
516
+ "transformer.h.7.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
517
+ "transformer.h.7.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
518
+ "transformer.h.7.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
519
+ "transformer.h.7.ln_1.bias": "pytorch_model-00001-of-00002.bin",
520
+ "transformer.h.7.ln_1.weight": "pytorch_model-00001-of-00002.bin",
521
+ "transformer.h.7.ln_2.bias": "pytorch_model-00001-of-00002.bin",
522
+ "transformer.h.7.ln_2.weight": "pytorch_model-00001-of-00002.bin",
523
+ "transformer.h.7.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
524
+ "transformer.h.7.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
525
+ "transformer.h.7.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
526
+ "transformer.h.7.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
527
+ "transformer.h.8.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
528
+ "transformer.h.8.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
529
+ "transformer.h.8.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
530
+ "transformer.h.8.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
531
+ "transformer.h.8.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
532
+ "transformer.h.8.ln_1.bias": "pytorch_model-00001-of-00002.bin",
533
+ "transformer.h.8.ln_1.weight": "pytorch_model-00001-of-00002.bin",
534
+ "transformer.h.8.ln_2.bias": "pytorch_model-00001-of-00002.bin",
535
+ "transformer.h.8.ln_2.weight": "pytorch_model-00001-of-00002.bin",
536
+ "transformer.h.8.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
537
+ "transformer.h.8.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
538
+ "transformer.h.8.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
539
+ "transformer.h.8.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
540
+ "transformer.h.9.attn.c_attn.bias": "pytorch_model-00001-of-00002.bin",
541
+ "transformer.h.9.attn.c_attn.weight": "pytorch_model-00001-of-00002.bin",
542
+ "transformer.h.9.attn.c_proj.bias": "pytorch_model-00001-of-00002.bin",
543
+ "transformer.h.9.attn.c_proj.weight": "pytorch_model-00001-of-00002.bin",
544
+ "transformer.h.9.attn.rotary_emb.inv_freq": "pytorch_model-00001-of-00002.bin",
545
+ "transformer.h.9.ln_1.bias": "pytorch_model-00001-of-00002.bin",
546
+ "transformer.h.9.ln_1.weight": "pytorch_model-00001-of-00002.bin",
547
+ "transformer.h.9.ln_2.bias": "pytorch_model-00001-of-00002.bin",
548
+ "transformer.h.9.ln_2.weight": "pytorch_model-00001-of-00002.bin",
549
+ "transformer.h.9.mlp.c_fc.bias": "pytorch_model-00001-of-00002.bin",
550
+ "transformer.h.9.mlp.c_fc.weight": "pytorch_model-00001-of-00002.bin",
551
+ "transformer.h.9.mlp.c_proj.bias": "pytorch_model-00001-of-00002.bin",
552
+ "transformer.h.9.mlp.c_proj.weight": "pytorch_model-00001-of-00002.bin",
553
+ "transformer.ln_f.bias": "pytorch_model-00002-of-00002.bin",
554
+ "transformer.ln_f.weight": "pytorch_model-00002-of-00002.bin",
555
+ "transformer.wte.weight": "pytorch_model-00001-of-00002.bin"
556
+ }
557
+ }
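
pytorch_model.bin.index.json ties the two shards together: "weight_map" maps every parameter name to the shard file that stores it, and "metadata.total_size" records the combined byte size. transformers resolves this automatically, but a hedged sketch of assembling the full state dict by hand (assuming the shards sit in the current directory) looks like this:

# load_sharded.py -- hedged example
import json
import torch

with open("pytorch_model.bin.index.json") as f:
    index = json.load(f)

state_dict = {}
for shard_file in sorted(set(index["weight_map"].values())):
    shard = torch.load(shard_file, map_location="cpu")
    state_dict.update(shard)  # each shard holds a disjoint subset of parameter names

missing = set(index["weight_map"]) - set(state_dict)
assert not missing, f"missing parameters: {sorted(missing)[:5]}"
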
quantizer.py ADDED
@@ -0,0 +1,263 @@
+ # coding=utf-8
+ # Copyright 2023 WisdomShell Inc. All Rights Reserved.
+
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ try:
+     import bitsandbytes as bnb
+     from bitsandbytes.nn.modules import Params4bit, Int8Params
+ except ImportError:
+     pass
+ import torch
+
+ def Params4bitCuda(self, device):
+     self.data = self.data.cuda(device)
+     if self.quant_state is not None:
+         self.quant_state[0] = self.quant_state[0].cuda(device)
+         self.quant_state[6] = self.quant_state[6].cuda(device)
+     return self
+
+ def Params4bitTo(self, *args, **kwargs):
+     device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs)
+
+     if (device is not None and device.type == "cuda" and self.data.device.type == "cpu"):
+         return self.cuda(device)
+     else:
+         if self.quant_state is not None:
+             # make sure the quantization state is on the right device
+             self.quant_state[0] = self.quant_state[0].to(device)
+             self.quant_state[6] = self.quant_state[6].to(device)
+         new_param = Params4bit(self.to(device=device, dtype=dtype, non_blocking=non_blocking),
+                                requires_grad=self.requires_grad, quant_state=self.quant_state,
+                                blocksize=self.blocksize, compress_statistics=self.compress_statistics,
+                                quant_type=self.quant_type)
+
+         return new_param
+
+ class Linear4bitOnline(torch.nn.Module):
+     def __init__(self, weight, bias, quant_type):
+         super().__init__()
+         self.weight = Params4bit(
+             weight.data, requires_grad=False, compress_statistics=True, quant_type=quant_type
+         )
+         self.compute_dtype = None
+         # self.weight.cuda(weight.device)
+         self.bias = bias
+
+     def forward(self, x: torch.Tensor):
+         # weights are cast automatically as Int8Params, but the bias has to be cast manually
+         if self.bias is not None and self.bias.dtype != x.dtype:
+             self.bias.data = self.bias.data.to(x.dtype)
+
+         if getattr(self.weight, "quant_state", None) is None:
+             print(
+                 "FP4 quantization state not initialized. Please call .cuda() or .to(device) on the LinearFP4 layer first."
+             )
+         inp_dtype = x.dtype
+         if self.compute_dtype is not None:
+             x = x.to(self.compute_dtype)
+
+         bias = None if self.bias is None else self.bias.to(self.compute_dtype)
+         out = bnb.matmul_4bit(
+             x, self.weight.t(), bias=bias, quant_state=self.weight.quant_state
+         )
+
+         out = out.to(inp_dtype)
+
+         return out
+
+ class Linear8bitLtOnline(torch.nn.Module):
+     def __init__(
+         self,
+         weight,
+         bias,
+         has_fp16_weights=True,
+         memory_efficient_backward=False,
+         threshold=0.0,
+         index=None,
+     ):
+         super().__init__()
+         assert (
+             not memory_efficient_backward
+         ), "memory_efficient_backward is no longer required and the argument is deprecated in 0.37.0 and will be removed in 0.39.0"
+         self.state = bnb.MatmulLtState()
+         self.index = index
+
+         # Necessary for stacked layers
+         self.state.threshold = threshold
+         self.state.has_fp16_weights = has_fp16_weights
+         self.state.memory_efficient_backward = memory_efficient_backward
+         if threshold > 0.0 and not has_fp16_weights:
+             self.state.use_pool = True
+
+         self.weight = Int8Params(
+             weight.data,
+             has_fp16_weights=has_fp16_weights,
+             requires_grad=has_fp16_weights,
+         )
+         self.bias = bias
+
+     def init_8bit_state(self):
+         self.state.CB = self.weight.CB
+         self.state.SCB = self.weight.SCB
+         self.weight.CB = None
+         self.weight.SCB = None
+
+     def forward(self, x: torch.Tensor):
+         self.state.is_training = self.training
+         if self.weight.CB is not None:
+             self.init_8bit_state()
+
+         # weights are cast automatically as Int8Params, but the bias has to be cast manually
+         if self.bias is not None and self.bias.dtype != x.dtype:
+             self.bias.data = self.bias.data.to(x.dtype)
+
+         out = bnb.matmul(x, self.weight, bias=self.bias, state=self.state)
+
+         if not self.state.has_fp16_weights:
+             if self.state.CB is not None and self.state.CxB is not None:
+                 # we converted 8-bit row major to turing/ampere format in the first inference pass
+                 # we no longer need the row-major weight
+                 del self.state.CB
+                 self.weight.data = self.state.CxB
+         return out
+
+ def quantize_online(model, bits: int):
+     def quant(weight, bias=None):
+         if bits == 8:
+             linear = Linear8bitLtOnline(
+                 weight,
+                 bias,
+                 has_fp16_weights=False,
+                 threshold=6.0,
+             )
+             if bias is not None:
+                 linear.bias = torch.nn.Parameter(bias)
+         elif bits == 4:
+             linear = Linear4bitOnline(
+                 weight,
+                 bias,
+                 quant_type="nf4",  # fp4/nf4
+             )
+         else:
+             raise ValueError("quantize only support 4/8 bit")
+         return linear
+
+     def auto_quant(layer):
+         if hasattr(layer, "bias"):
+             linear = quant(layer.weight, bias=layer.bias)
+         else:
+             linear = quant(layer.weight)
+         return linear
+
+     for i, layer in enumerate(model.transformer.h):
+         layer.mlp.c_fc = auto_quant(layer.mlp.c_fc)
+         layer.mlp.c_proj = auto_quant(layer.mlp.c_proj)
+
+         layer.attn.c_attn = auto_quant(layer.attn.c_attn)
+         layer.attn.c_proj = auto_quant(layer.attn.c_proj)
+
+     return model
+
+
+ general_weight_dict = {
+     "transformer.wte.weight": False,
+     "transformer.ln_f.weight": False,
+     "transformer.ln_f.bias": False,
+     "lm_head.weight": False,
+ }
+
+ layer_weight_dict = {
+     "transformer.h.{i}.ln_1.weight": False,
+     "transformer.h.{i}.ln_1.bias": False,
+     "transformer.h.{i}.attn.c_attn.weight": True,
+     "transformer.h.{i}.attn.c_attn.bias": False,
+     "transformer.h.{i}.attn.c_proj.weight": True,
+     "transformer.h.{i}.attn.c_proj.bias": False,
+     "transformer.h.{i}.attn.rotary_emb.inv_freq": False,
+     "transformer.h.{i}.ln_2.weight": False,
+     "transformer.h.{i}.ln_2.bias": False,
+     "transformer.h.{i}.mlp.c_fc.weight": True,
+     "transformer.h.{i}.mlp.c_fc.bias": False,
+     "transformer.h.{i}.mlp.c_proj.weight": True,
+     "transformer.h.{i}.mlp.c_proj.bias": False,
+ }
+ num_dict = {str(i): i for i in range(100)}
+
+ def set_value(model, name, state_dict, is_4bit):
+     keys = name.split('.')
+     parent = model
+     for key in keys[:-1]:
+         if key in num_dict:
+             parent = parent[num_dict[key]]
+         else:
+             parent = getattr(parent, key)
+     if is_4bit:
+         weight_data = state_dict[f'{name}.data']
+         weight_quant_state = state_dict[f'{name}.quant_state']
+         assert weight_data is not None, name
+         assert weight_quant_state is not None, name
+         setattr(parent, keys[-1], Params4bit(weight_data, requires_grad=False, quant_state=weight_quant_state))
+     else:
+         setattr(parent, keys[-1], state_dict[name])
+
+ def quantize_offline(model):
+     for i, layer in enumerate(model.transformer.h):
+         layer.mlp.c_fc = bnb.nn.Linear4bit(
+             layer.mlp.c_fc.weight.shape[1],
+             layer.mlp.c_fc.weight.shape[0],
+             False,
+             torch.bfloat16,
+             compress_statistics=True,
+             quant_type="nf4",
+         )
+         layer.mlp.c_proj = bnb.nn.Linear4bit(
+             layer.mlp.c_proj.weight.shape[1],
+             layer.mlp.c_proj.weight.shape[0],
+             False,
+             torch.bfloat16,
+             compress_statistics=True,
+             quant_type="nf4",
+         )
+
+         layer.attn.c_attn = bnb.nn.Linear4bit(
+             layer.attn.c_attn.weight.shape[1],
+             layer.attn.c_attn.weight.shape[0],
+             False,
+             torch.bfloat16,
+             compress_statistics=True,
+             quant_type="nf4",
+         )
+         layer.attn.c_proj = bnb.nn.Linear4bit(
+             layer.attn.c_proj.weight.shape[1],
+             layer.attn.c_proj.weight.shape[0],
+             False,
+             torch.bfloat16,
+             compress_statistics=True,
+             quant_type="nf4",
+         )
+     return model
+
+ def load_state_dict_for_qunantied_model(model, state_dict):
+     # replace Params4bit.cuda with Params4bitCuda
+     Params4bit.cuda = Params4bitCuda
+     Params4bit.to = Params4bitTo
+
+     for name, is_4bit in general_weight_dict.items():
+         set_value(model, name, state_dict, is_4bit)
+
+     for layer_i in range(len(model.transformer.h)):
+         for name, is_4bit in layer_weight_dict.items():
+             name = name.replace('{i}', str(layer_i))
+             set_value(model, name, state_dict, is_4bit)
+     return model
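
quantizer.py offers two paths: quantize_online wraps the attention and MLP projections of an already-loaded model in 4-bit or 8-bit bitsandbytes layers, while quantize_offline plus load_state_dict_for_qunantied_model rebuild a model whose checkpoint was saved in NF4 form (the path CodeShell4bitForCausalLM.from_pretrained above takes). A hedged sketch of the online path; the repository id is a placeholder and the import path of quantizer.py is an assumption:

# quantize_online_example.py -- hedged example
import torch
from transformers import AutoModelForCausalLM
from quantizer import quantize_online  # assumes quantizer.py is importable from the working directory

repo_id = "WisdomShell/CodeShell-7B-Chat"  # placeholder repository id
model = AutoModelForCausalLM.from_pretrained(
    repo_id, trust_remote_code=True, torch_dtype=torch.bfloat16
)
model = quantize_online(model, bits=4)  # or bits=8 for LLM.int8()-style quantization
model = model.cuda().eval()             # moving to GPU triggers the actual weight quantization
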
special_tokens_map.json ADDED
@@ -0,0 +1,27 @@
+ {
+   "additional_special_tokens": [
+     "<|endoftext|>",
+     "<fim_prefix>",
+     "<fim_middle>",
+     "<fim_suffix>",
+     "<fim_pad>",
+     "<filename>",
+     "<gh_stars>",
+     "<issue_start>",
+     "<issue_comment>",
+     "<issue_closed>",
+     "<jupyter_start>",
+     "<jupyter_text>",
+     "<jupyter_code>",
+     "<jupyter_output>",
+     "<empty_output>",
+     "<commit_before>",
+     "<commit_msg>",
+     "<commit_after>",
+     "<reponame>"
+   ],
+   "bos_token": "<|endoftext|>",
+   "eos_token": "<|endoftext|>",
+   "pad_token": "<|endoftext|>",
+   "unk_token": "<|endoftext|>"
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,32 @@
+ {
+   "add_prefix_space": false,
+   "additional_special_tokens": [
+     "<|endoftext|>",
+     "<fim_prefix>",
+     "<fim_middle>",
+     "<fim_suffix>",
+     "<fim_pad>",
+     "<filename>",
+     "<gh_stars>",
+     "<issue_start>",
+     "<issue_comment>",
+     "<issue_closed>",
+     "<jupyter_start>",
+     "<jupyter_text>",
+     "<jupyter_code>",
+     "<jupyter_output>",
+     "<empty_output>",
+     "<commit_before>",
+     "<commit_msg>",
+     "<commit_after>",
+     "<reponame>"
+   ],
+   "bos_token": "<|endoftext|>",
+   "clean_up_tokenization_spaces": true,
+   "eos_token": "<|endoftext|>",
+   "model_max_length": 8192,
+   "pad_token": "<|endoftext|>",
+   "tokenizer_class": "GPT2Tokenizer",
+   "unk_token": "<|endoftext|>",
+   "vocab_size": 70019
+ }
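
The special-token lists above include fill-in-the-middle markers (<fim_prefix>, <fim_suffix>, <fim_middle>) alongside the chat markers <human> and <assistant>. A hedged sketch of building a FIM-style completion prompt from these tokens; the repository id is a placeholder and the prefix/suffix/middle ordering follows the common StarCoder convention, which is an assumption here:

# fim_prompt_example.py -- hedged example
from transformers import AutoTokenizer

repo_id = "WisdomShell/CodeShell-7B-Chat"  # placeholder repository id
tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)

prefix = "def quicksort(arr):\n    if len(arr) <= 1:\n        return arr\n"
suffix = "\n    return quicksort(left) + [pivot] + quicksort(right)\n"

# the model is expected to generate the missing middle after <fim_middle>
prompt = f"<fim_prefix>{prefix}<fim_suffix>{suffix}<fim_middle>"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
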
vocab.json ADDED
The diff for this file is too large to render. See raw diff