ruixie committed on
Commit
5b5b6f5
1 Parent(s): 1dab17d

Upload folder using huggingface_hub

added_tokens.json ADDED
@@ -0,0 +1,21 @@
+ {
+   "<commit_after>": 70017,
+   "<commit_before>": 70015,
+   "<commit_msg>": 70016,
+   "<empty_output>": 70014,
+   "<filename>": 70005,
+   "<fim_middle>": 70002,
+   "<fim_pad>": 70004,
+   "<fim_prefix>": 70001,
+   "<fim_suffix>": 70003,
+   "<gh_stars>": 70006,
+   "<issue_closed>": 70009,
+   "<issue_comment>": 70008,
+   "<issue_start>": 70007,
+   "<jupyter_code>": 70012,
+   "<jupyter_output>": 70013,
+   "<jupyter_start>": 70010,
+   "<jupyter_text>": 70011,
+   "<reponame>": 70018,
+   "<|endoftext|>": 70000
+ }
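These added tokens follow the StarCoder-style special-token vocabulary: fill-in-the-middle (FIM) markers, repository/file metadata markers, and issue/Jupyter markers. The sketch below assembles a FIM prompt from `<fim_prefix>`/`<fim_suffix>`/`<fim_middle>`; it is a minimal sketch that assumes the tokenizer in this repository (repo id taken from `_name_or_path` in config.json) and the usual prefix–suffix–middle ordering, which should be verified against the model card.

```python
# Minimal FIM prompt sketch (assumptions: repo id and prefix-suffix-middle ordering).
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("WisdomShell/CodeShell")  # assumed repo id

prefix = "def add(a, b):\n    "
suffix = "\n    return result\n"
fim_prompt = f"<fim_prefix>{prefix}<fim_suffix>{suffix}<fim_middle>"

input_ids = tokenizer(fim_prompt, return_tensors="pt").input_ids
# Per added_tokens.json, <fim_prefix>/<fim_suffix>/<fim_middle> map to ids 70001/70003/70002.
print(input_ids)
```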
config.json ADDED
@@ -0,0 +1,46 @@
+ {
+   "_name_or_path": "WisdomShell/CodeShell",
+   "activation_function": "gelu_pytorch_tanh",
+   "architectures": [
+     "CodeShell4bitForCausalLM"
+   ],
+   "attention_softmax_in_fp32": true,
+   "attn_pdrop": 0.1,
+   "auto_map": {
+     "AutoConfig": "configuration_codeshell.CodeShellConfig",
+     "AutoModelForCausalLM": "modeling_codeshell.CodeShell4bitForCausalLM"
+   },
+   "bos_token_id": 70000,
+   "embd_pdrop": 0.1,
+   "eos_token_id": 70000,
+   "group_query_attention": true,
+   "inference_runner": 0,
+   "initializer_range": 0.02,
+   "layer_norm_epsilon": 1e-05,
+   "max_batch_size": null,
+   "max_sequence_length": null,
+   "model_type": "codeshell",
+   "n_embd": 4096,
+   "n_head": 32,
+   "n_inner": 16384,
+   "n_layer": 42,
+   "n_positions": 8192,
+   "num_query_groups": 8,
+   "pad_key_length": true,
+   "position_embedding_type": "rope",
+   "pre_allocate_kv_cache": false,
+   "resid_pdrop": 0.1,
+   "rope_scaling": null,
+   "scale_attention_softmax_in_fp32": true,
+   "scale_attn_weights": true,
+   "summary_activation": null,
+   "summary_first_dropout": 0.1,
+   "summary_proj_to_labels": true,
+   "summary_type": "cls_index",
+   "summary_use_proj": true,
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.31.0",
+   "use_cache": true,
+   "validate_runner_input": true,
+   "vocab_size": 70144
+ }
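Because `auto_map` routes `AutoConfig` and `AutoModelForCausalLM` to the custom `configuration_codeshell.py` and `modeling_codeshell.py` in this repo, loading requires `trust_remote_code=True` and an installed `bitsandbytes` (see `quantizer.py`). A minimal loading sketch; the local path is an assumption, chosen because the custom `from_pretrained` below reads `pytorch_model.bin` via `os.path.join` on the given path:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hypothetical local directory containing the files in this commit
# (a local snapshot rather than a bare hub id, since the custom from_pretrained
# loads pytorch_model.bin from a local path).
local_path = "./codeshell-4bit"

tokenizer = AutoTokenizer.from_pretrained(local_path, trust_remote_code=True)
# auto_map dispatches to CodeShellConfig / CodeShell4bitForCausalLM defined in this repo;
# device_map is applied via model.to(...) inside the custom from_pretrained.
model = AutoModelForCausalLM.from_pretrained(local_path, trust_remote_code=True, device_map="cuda")

inputs = tokenizer("def fibonacci(n):", return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```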
configuration_codeshell.py ADDED
@@ -0,0 +1,166 @@
+ # coding=utf-8
+ # Copyright 2023 WisdomShell Inc. All Rights Reserved.
+
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ # This code is based on BigCode's GPTBigCode configuration. It has been modified from
+ # its original form to accommodate minor architectural differences compared to
+ # the GPTBigCode configuration used to train the model.
+
+ # Copyright 2023 The BigCode team and HuggingFace Inc. team.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """CodeShell configuration"""
+
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.utils import logging
+
+
+ logger = logging.get_logger(__name__)
+
+
+ class CodeShellConfig(PretrainedConfig):
+     """
+     This is the configuration class to store the configuration of a [`CodeShellModel`]. It is used to instantiate a
+     CodeShell model according to the specified arguments, defining the model architecture.
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PretrainedConfig`] for more information.
+
+     Args:
+         vocab_size (`int`, *optional*, defaults to 70144):
+             Vocabulary size of the CodeShell model. Defines the number of different tokens that can be represented
+             by the `input_ids` passed when calling [`CodeShellModel`].
+         n_positions (`int`, *optional*, defaults to 8192):
+             The maximum sequence length that this model might ever be used with. Typically set this to something large
+             just in case (e.g., 2048, 4096 or 8192).
+         n_embd (`int`, *optional*, defaults to 4096):
+             Dimensionality of the embeddings and hidden states.
+         n_layer (`int`, *optional*, defaults to 42):
+             Number of hidden layers in the Transformer decoder.
+         n_head (`int`, *optional*, defaults to 32):
+             Number of attention heads for each attention layer in the Transformer decoder.
+         n_inner (`int`, *optional*, defaults to `None`):
+             Dimensionality of the inner feed-forward layers. `None` will set it to 4 times `n_embd`.
+         activation_function (`str`, *optional*, defaults to `"gelu_pytorch_tanh"`):
+             Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new",
+             "gelu_pytorch_tanh"]`.
+         resid_pdrop (`float`, *optional*, defaults to 0.1):
+             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+         embd_pdrop (`float`, *optional*, defaults to 0.1):
+             The dropout ratio for the embeddings.
+         attn_pdrop (`float`, *optional*, defaults to 0.1):
+             The dropout ratio for the attention.
+         layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
+             The epsilon to use in the layer normalization layers.
+         initializer_range (`float`, *optional*, defaults to 0.02):
+             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+         scale_attn_weights (`bool`, *optional*, defaults to `True`):
+             Scale attention weights by dividing by sqrt(hidden_size).
+         use_cache (`bool`, *optional*, defaults to `True`):
+             Whether or not the model should return the last key/values attentions (not used by all models).
+         attention_softmax_in_fp32 (`bool`, *optional*, defaults to `True`):
+             Whether to call the fused softmax in float32.
+         scale_attention_softmax_in_fp32 (`bool`, *optional*, defaults to `True`):
+             Whether to scale the attention softmax in float32.
+         group_query_attention (`bool`, *optional*, defaults to `True`):
+             Whether to use grouped-query attention (`True`) or multi-head attention (`False`).
+         num_query_groups (`int`, *optional*, defaults to 1):
+             The number of key/value head groups used when `group_query_attention` is enabled.
+     Example:
+
+     ```python
+     >>> from configuration_codeshell import CodeShellConfig
+     >>> from modeling_codeshell import CodeShellForCausalLM
+
+     >>> # Initializing a CodeShell configuration
+     >>> configuration = CodeShellConfig()
+
+     >>> # Initializing a model (with random weights) from the configuration
+     >>> model = CodeShellForCausalLM(configuration)
+
+     >>> # Accessing the model configuration
+     >>> configuration = model.config
+     ```"""
+
+     model_type = "codeshell"
+     keys_to_ignore_at_inference = ["past_key_values"]
+     attribute_map = {
+         "hidden_size": "n_embd",
+         "max_position_embeddings": "n_positions",
+         "num_attention_heads": "n_head",
+         "num_hidden_layers": "n_layer",
+     }
+
+     def __init__(
+         self,
+         vocab_size=70144,
+         n_positions=8192,
+         n_embd=4096,
+         n_layer=42,
+         n_head=32,
+         n_inner=None,
+         activation_function="gelu_pytorch_tanh",
+         resid_pdrop=0.1,
+         embd_pdrop=0.1,
+         attn_pdrop=0.1,
+         layer_norm_epsilon=1e-5,
+         initializer_range=0.02,
+         scale_attn_weights=True,
+         use_cache=True,
+         bos_token_id=70000,
+         eos_token_id=70000,
+         attention_softmax_in_fp32=True,
+         scale_attention_softmax_in_fp32=True,
+         group_query_attention=True,
+         num_query_groups=1,
+         position_embedding_type="learned_absolute",
+         rope_scaling=None,
+         **kwargs,
+     ):
+         self.vocab_size = vocab_size
+         self.n_positions = n_positions
+         self.n_embd = n_embd
+         self.n_layer = n_layer
+         self.n_head = n_head
+         self.n_inner = n_inner
+         self.activation_function = activation_function
+         self.resid_pdrop = resid_pdrop
+         self.embd_pdrop = embd_pdrop
+         self.attn_pdrop = attn_pdrop
+         self.layer_norm_epsilon = layer_norm_epsilon
+         self.initializer_range = initializer_range
+         self.scale_attn_weights = scale_attn_weights
+         self.use_cache = use_cache
+         self.attention_softmax_in_fp32 = attention_softmax_in_fp32
+         self.scale_attention_softmax_in_fp32 = scale_attention_softmax_in_fp32
+         self.group_query_attention = group_query_attention
+         self.num_query_groups = num_query_groups
+         self.position_embedding_type = position_embedding_type
+         self.rope_scaling = rope_scaling
+         assert self.position_embedding_type in [
+             "learned_absolute", "rope"
+         ], "position_embedding_type must be one of ['learned_absolute', 'rope']"
+
+         self.bos_token_id = bos_token_id
+         self.eos_token_id = eos_token_id
+
+         super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
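For reference, the config.json above corresponds roughly to the constructor call below; a minimal sketch with values copied from config.json, not an authoritative recipe.

```python
from configuration_codeshell import CodeShellConfig

# Mirrors the key hyperparameters in config.json: 42 layers, 4096 hidden size,
# 32 attention heads grouped into 8 KV groups, RoPE positions up to 8192 tokens.
config = CodeShellConfig(
    vocab_size=70144,
    n_positions=8192,
    n_embd=4096,
    n_layer=42,
    n_head=32,
    group_query_attention=True,
    num_query_groups=8,
    position_embedding_type="rope",
    bos_token_id=70000,
    eos_token_id=70000,
)
assert config.hidden_size == config.n_embd  # attribute_map aliases hidden_size to n_embd
```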
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
modeling_codeshell.py ADDED
@@ -0,0 +1,970 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 WisdomShell Inc. All Rights Reserved.
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ # This code is based on Bigcode's GPTBigCode model. It has been modified from
17
+ # its original form to accommodate minor architectural differences compared to
18
+ # the GPTBigCode architecture on which the model was trained.
19
+
20
+ # Copyright 2023 The Bigcode team and HuggingFace Inc. team.
21
+ # Licensed under the Apache License, Version 2.0 (the "License");
22
+ # you may not use this file except in compliance with the License.
23
+ # You may obtain a copy of the License at
24
+ #
25
+ # http://www.apache.org/licenses/LICENSE-2.0
26
+ #
27
+ # Unless required by applicable law or agreed to in writing, software
28
+ # distributed under the License is distributed on an "AS IS" BASIS,
29
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
30
+ # See the License for the specific language governing permissions and
31
+ # limitations under the License.
32
+ """PyTorch CodeShell model."""
33
+ import os
34
+ import math
35
+ from typing import List, Optional, Tuple, Union
36
+
37
+ import torch
38
+ import torch.utils.checkpoint
39
+ from torch import nn
40
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
41
+
42
+ from transformers import PreTrainedModel, PretrainedConfig
43
+ from transformers.generation.utils import GenerationConfig
44
+
45
+ from transformers.activations import ACT2FN
46
+ from transformers.modeling_outputs import (
47
+ BaseModelOutputWithPastAndCrossAttentions,
48
+ CausalLMOutputWithCrossAttentions,
49
+ )
50
+ from transformers.modeling_utils import PreTrainedModel
51
+ from transformers.utils import (
52
+ add_start_docstrings,
53
+ add_start_docstrings_to_model_forward,
54
+ )
55
+ from .configuration_codeshell import CodeShellConfig
56
+
57
+
58
+ # Fused kernels
59
+ # Use separate functions for each case because conditionals prevent kernel fusion.
60
+ # TODO: Could have better fused kernels depending on scaling, dropout and head mask.
61
+ # Is it doable without writing 32 functions?
62
+ @torch.jit.script
63
+ def upcast_masked_softmax(
64
+ x: torch.Tensor, mask: torch.Tensor, mask_value: torch.Tensor, scale: float, softmax_dtype: torch.dtype
65
+ ):
66
+ input_dtype = x.dtype
67
+ x = x.to(softmax_dtype) * scale
68
+ x = torch.where(mask, x, mask_value)
69
+ x = torch.nn.functional.softmax(x, dim=-1).to(input_dtype)
70
+ return x
71
+
72
+
73
+ @torch.jit.script
74
+ def upcast_softmax(x: torch.Tensor, scale: float, softmax_dtype: torch.dtype):
75
+ input_dtype = x.dtype
76
+ x = x.to(softmax_dtype) * scale
77
+ x = torch.nn.functional.softmax(x, dim=-1).to(input_dtype)
78
+ return x
79
+
80
+
81
+ @torch.jit.script
82
+ def masked_softmax(x: torch.Tensor, mask: torch.Tensor, mask_value: torch.Tensor):
83
+ x = torch.where(mask, x, mask_value)
84
+ x = torch.nn.functional.softmax(x, dim=-1)
85
+ return x
86
+
87
+
88
+ class CodeShellRotaryEmbedding(torch.nn.Module):
89
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
90
+ super().__init__()
91
+
92
+ self.dim = dim
93
+ self.max_position_embeddings = max_position_embeddings
94
+ self.base = base
95
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
96
+ self.register_buffer("inv_freq", inv_freq)
97
+
98
+ # Build here to make `torch.jit.trace` work.
99
+ self._set_cos_sin_cache(
100
+ seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
101
+ )
102
+
103
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
104
+ self.max_seq_len_cached = seq_len
105
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
106
+
107
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
108
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
109
+ emb = torch.cat((freqs, freqs), dim=-1)
110
+ self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
111
+ self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
112
+
113
+ def forward(self, x, seq_len=None):
114
+ # x: [bs, num_attention_heads, seq_len, head_size]
115
+ if seq_len > self.max_seq_len_cached:
116
+ self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
117
+
118
+ return (
119
+ self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
120
+ self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
121
+ )
122
+
123
+
124
+ class CodeShellLinearScalingRotaryEmbedding(CodeShellRotaryEmbedding):
125
+ """CodeShellRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
126
+
127
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
128
+ self.scaling_factor = scaling_factor
129
+ super().__init__(dim, max_position_embeddings, base, device)
130
+
131
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
132
+ self.max_seq_len_cached = seq_len
133
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
134
+ t = t / self.scaling_factor
135
+
136
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
137
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
138
+ emb = torch.cat((freqs, freqs), dim=-1)
139
+ self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
140
+ self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
141
+
142
+
143
+ class CodeShellDynamicNTKScalingRotaryEmbedding(CodeShellRotaryEmbedding):
144
+ """ShellRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
145
+
146
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
147
+ self.scaling_factor = scaling_factor
148
+ super().__init__(dim, max_position_embeddings, base, device)
149
+
150
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
151
+ self.max_seq_len_cached = seq_len
152
+
153
+ if seq_len > self.max_position_embeddings:
154
+ base = self.base * (
155
+ (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
156
+ ) ** (self.dim / (self.dim - 2))
157
+ inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
158
+ self.register_buffer("inv_freq", inv_freq)
159
+
160
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
161
+
162
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
163
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
164
+ emb = torch.cat((freqs, freqs), dim=-1)
165
+ self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False)
166
+ self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False)
167
+
168
+ def rotate_half(x):
169
+ """Rotates half the hidden dims of the input."""
170
+ x1 = x[..., : x.shape[-1] // 2]
171
+ x2 = x[..., x.shape[-1] // 2 :]
172
+ return torch.cat((-x2, x1), dim=-1)
173
+
174
+
175
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
176
+ # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
177
+ cos = cos.squeeze(1).squeeze(0) # [seq_len, dim]
178
+ sin = sin.squeeze(1).squeeze(0) # [seq_len, dim]
179
+ cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
180
+ sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
181
+ q_embed = (q * cos) + (rotate_half(q) * sin)
182
+ k_embed = (k * cos) + (rotate_half(k) * sin)
183
+ return q_embed, k_embed
184
+
185
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
186
+ """
187
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
188
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
189
+ """
190
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
191
+ if n_rep == 1:
192
+ return hidden_states
193
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
194
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
195
+
196
+ class CodeShellAttention(nn.Module):
197
+ def __init__(self, config, layer_idx=None):
198
+ super().__init__()
199
+ self.mask_value = None
200
+
201
+ self.position_embedding_type = config.position_embedding_type
202
+ self.rope_scaling = config.rope_scaling
203
+ self.max_position_embeddings = config.max_position_embeddings
204
+
205
+ self.group_query_attention = config.group_query_attention
206
+ self.num_query_groups = config.num_query_groups
207
+ self.num_key_value_groups = config.num_attention_heads // config.num_query_groups
208
+
209
+ self.embed_dim = config.hidden_size
210
+ self.num_heads = config.num_attention_heads
211
+ self.head_dim = self.embed_dim // self.num_heads
212
+ self.kv_heads = config.num_query_groups if self.group_query_attention else self.num_heads
213
+ self.kv_dim = self.kv_heads * self.head_dim
214
+ self.split_size = self.embed_dim
215
+ if self.head_dim * self.num_heads != self.embed_dim:
216
+ raise ValueError(
217
+ f"`embed_dim` must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
218
+ f" {self.num_heads})."
219
+ )
220
+
221
+ self.layer_idx = layer_idx
222
+
223
+ self.c_attn = nn.Linear(self.embed_dim, self.embed_dim + 2 * self.kv_dim)
224
+ self.c_proj = nn.Linear(self.embed_dim, self.embed_dim)
225
+
226
+ self.attn_dropout = nn.Dropout(config.attn_pdrop)
227
+ self.resid_dropout = nn.Dropout(config.resid_pdrop)
228
+
229
+ if self.position_embedding_type == "rope":
230
+ self._init_rope()
231
+
232
+ def _init_rope(self):
233
+ if self.rope_scaling is None:
234
+ self.rotary_emb = CodeShellRotaryEmbedding(self.head_dim, max_position_embeddings=self.max_position_embeddings)
235
+ else:
236
+ scaling_type = self.rope_scaling["type"]
237
+ scaling_factor = self.rope_scaling["factor"]
238
+ if scaling_type == "linear":
239
+ self.rotary_emb = CodeShellLinearScalingRotaryEmbedding(
240
+ self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=scaling_factor
241
+ )
242
+ elif scaling_type == "dynamic":
243
+ self.rotary_emb = CodeShellDynamicNTKScalingRotaryEmbedding(
244
+ self.head_dim, max_position_embeddings=self.max_position_embeddings, scaling_factor=scaling_factor
245
+ )
246
+ else:
247
+ raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
248
+
249
+
250
+ def _get_mask_value(self, device, dtype):
251
+ # torch.where expects a tensor. We use a cache to avoid recreating it every time.
252
+ if self.mask_value is None or self.mask_value.dtype != dtype or self.mask_value.device != device:
253
+ self.mask_value = torch.full([], torch.finfo(dtype).min, dtype=dtype, device=device)
254
+ return self.mask_value
255
+
256
+ def forward(
257
+ self,
258
+ hidden_states: torch.Tensor,
259
+ layer_past: Optional[torch.Tensor] = None,
260
+ attention_mask: Optional[torch.Tensor] = None,
261
+ position_ids: Optional[torch.LongTensor] = None,
262
+ head_mask: Optional[torch.Tensor] = None,
263
+ use_cache: Optional[bool] = False,
264
+ output_attentions: Optional[bool] = False,
265
+ ) -> Union[
266
+ Tuple[torch.Tensor, Optional[torch.Tensor]],
267
+ Tuple[torch.Tensor, Optional[torch.Tensor], Tuple[torch.Tensor, ...]],
268
+ ]:
269
+ bsz, q_len, _ = hidden_states.size()
270
+ query_states, key_states, value_states = self.c_attn(hidden_states).split((self.embed_dim, self.kv_dim, self.kv_dim), dim=2)
271
+
272
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
273
+ key_states = key_states.view(bsz, q_len, self.num_query_groups, self.head_dim).transpose(1, 2)
274
+ value_states = value_states.view(bsz, q_len, self.num_query_groups, self.head_dim).transpose(1, 2)
275
+
276
+ kv_seq_len = key_states.shape[-2]
277
+ if layer_past is not None:
278
+ kv_seq_len += layer_past[0].shape[-2]
279
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
280
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
281
+
282
+ if layer_past is not None:
283
+ # reuse k, v, self_attention
284
+ key_states = torch.cat([layer_past[0], key_states], dim=2)
285
+ value_states = torch.cat([layer_past[1], value_states], dim=2)
286
+
287
+ layer_past = (key_states, value_states) if use_cache else None
288
+
289
+ # repeat k/v heads if n_kv_heads < n_heads
290
+ key_states = repeat_kv(key_states, self.num_heads // self.kv_heads)
291
+ value_states = repeat_kv(value_states, self.num_heads // self.kv_heads)
292
+
293
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
294
+
295
+ if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
296
+ raise ValueError(
297
+ f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
298
+ f" {attn_weights.size()}"
299
+ )
300
+
301
+ if attention_mask is not None:
302
+ if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
303
+ raise ValueError(
304
+ f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
305
+ )
306
+ mask_value = self._get_mask_value(attn_weights.device, attn_weights.dtype)
307
+ # The fused kernel is very slow when the key length is not a multiple of 8, so we skip fusion.
308
+ attn_weights = torch.where(attention_mask, attn_weights, mask_value)
309
+
310
+ # upcast attention to fp32
311
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
312
+ attn_weights = self.attn_dropout(attn_weights)
313
+ attn_output = torch.matmul(attn_weights, value_states)
314
+
315
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
316
+ raise ValueError(
317
+ f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
318
+ f" {attn_output.size()}"
319
+ )
320
+
321
+ attn_output = attn_output.transpose(1, 2).contiguous()
322
+ attn_output = attn_output.reshape(bsz, q_len, self.embed_dim)
323
+
324
+ attn_output = self.c_proj(attn_output)
325
+ attn_output = self.resid_dropout(attn_output)
326
+
327
+ outputs = (attn_output, layer_past)
328
+ if output_attentions:
329
+ outputs += (attn_weights,)
330
+
331
+ return outputs # a, present, (attentions)
332
+
333
+
334
+ class CodeShellMLP(nn.Module):
335
+ def __init__(self, intermediate_size, config):
336
+ super().__init__()
337
+ embed_dim = config.hidden_size
338
+ self.c_fc = nn.Linear(embed_dim, intermediate_size)
339
+ self.c_proj = nn.Linear(intermediate_size, embed_dim)
340
+ self.act = ACT2FN[config.activation_function]
341
+ self.dropout = nn.Dropout(config.resid_pdrop)
342
+
343
+ # Copied from transformers.models.gpt2.modeling_gpt2.GPT2MLP.forward
344
+ def forward(self, hidden_states: Optional[Tuple[torch.Tensor]]) -> torch.Tensor:
345
+ hidden_states = self.c_fc(hidden_states)
346
+ hidden_states = self.act(hidden_states)
347
+ hidden_states = self.c_proj(hidden_states)
348
+ hidden_states = self.dropout(hidden_states)
349
+ return hidden_states
350
+
351
+
352
+ class CodeShellBlock(nn.Module):
353
+ def __init__(self, config, layer_idx=None):
354
+ super().__init__()
355
+ hidden_size = config.hidden_size
356
+ self.inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size
357
+
358
+ self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
359
+ self.attn = CodeShellAttention(config, layer_idx=layer_idx)
360
+ self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
361
+
362
+ self.mlp = CodeShellMLP(self.inner_dim, config)
363
+
364
+ def forward(
365
+ self,
366
+ hidden_states: Optional[Tuple[torch.Tensor]],
367
+ layer_past: Optional[torch.Tensor] = None,
368
+ attention_mask: Optional[torch.Tensor] = None,
369
+ position_ids: Optional[torch.LongTensor] = None,
370
+ head_mask: Optional[torch.Tensor] = None,
371
+ encoder_hidden_states: Optional[torch.Tensor] = None,
372
+ encoder_attention_mask: Optional[torch.Tensor] = None,
373
+ use_cache: Optional[bool] = False,
374
+ output_attentions: Optional[bool] = False,
375
+ ) -> Union[
376
+ Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor, torch.Tensor, torch.Tensor]
377
+ ]:
378
+ residual = hidden_states
379
+ hidden_states = self.ln_1(hidden_states)
380
+ attn_outputs = self.attn(
381
+ hidden_states,
382
+ layer_past=layer_past,
383
+ attention_mask=attention_mask,
384
+ position_ids=position_ids,
385
+ head_mask=head_mask,
386
+ use_cache=use_cache,
387
+ output_attentions=output_attentions,
388
+ )
389
+ attn_output = attn_outputs[0] # output_attn: a, present, (attentions)
390
+
391
+ outputs = attn_outputs[1:]
392
+ # residual connection
393
+ hidden_states = attn_output + residual
394
+
395
+ residual = hidden_states
396
+ hidden_states = self.ln_2(hidden_states)
397
+ feed_forward_hidden_states = self.mlp(hidden_states)
398
+ # residual connection
399
+ hidden_states = residual + feed_forward_hidden_states
400
+
401
+ if use_cache:
402
+ outputs = (hidden_states,) + outputs
403
+ else:
404
+ outputs = (hidden_states,) + outputs[1:]
405
+
406
+ return outputs # hidden_states, present, (attentions, cross_attentions)
407
+
408
+
409
+ class CodeShellPreTrainedModel(PreTrainedModel):
410
+ """
411
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
412
+ models.
413
+ """
414
+
415
+ config_class = CodeShellConfig
416
+ base_model_prefix = "transformer"
417
+ supports_gradient_checkpointing = True
418
+ _no_split_modules = ["CodeShellBlock"]
419
+ _skip_keys_device_placement = "past_key_values"
420
+
421
+ def __init__(self, *inputs, **kwargs):
422
+ super().__init__(*inputs, **kwargs)
423
+
424
+ def _init_weights(self, module):
425
+ """Initialize the weights."""
426
+ if isinstance(module, (CodeShellMLP, CodeShellAttention)):
427
+ # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
428
+ # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
429
+ # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
430
+ # > -- GPT-2 :: https://openai.com/blog/better-language-models/
431
+ #
432
+ # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
433
+ module.c_proj.weight.data.normal_(
434
+ mean=0.0, std=(self.config.initializer_range / math.sqrt(2 * self.config.n_layer))
435
+ )
436
+ module.c_proj._is_hf_initialized = True
437
+ elif isinstance(module, nn.Linear):
438
+ # Slightly different from the TF version which uses truncated_normal for initialization
439
+ # cf https://github.com/pytorch/pytorch/pull/5617
440
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
441
+ if module.bias is not None:
442
+ module.bias.data.zero_()
443
+ elif isinstance(module, nn.Embedding):
444
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
445
+ if module.padding_idx is not None:
446
+ module.weight.data[module.padding_idx].zero_()
447
+ elif isinstance(module, nn.LayerNorm):
448
+ module.bias.data.zero_()
449
+ module.weight.data.fill_(1.0)
450
+
451
+ # Copied from transformers.models.gpt2.modeling_gpt2.GPT2PreTrainedModel._set_gradient_checkpointing with GPT2->Shell
452
+ def _set_gradient_checkpointing(self, module, value=False):
453
+ if isinstance(module, CodeShellModel):
454
+ module.gradient_checkpointing = value
455
+
456
+
457
+ GPT_BIGCODE_START_DOCSTRING = r"""
458
+
459
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
460
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
461
+ etc.)
462
+
463
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
464
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
465
+ and behavior.
466
+
467
+ Parameters:
468
+ config ([`CodeShellConfig`]): Model configuration class with all the parameters of the model.
469
+ Initializing with a config file does not load the weights associated with the model, only the
470
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
471
+ """
472
+
473
+ GPT_BIGCODE_INPUTS_DOCSTRING = r"""
474
+ Args:
475
+ input_ids (`torch.Tensor` of shape `(batch_size, input_ids_length)`):
476
+ `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
477
+ `past_key_values[0][0].shape[-2]` (`sequence_length` of input past key value states). Indices of input
478
+ sequence tokens in the vocabulary.
479
+
480
+ If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
481
+ `input_ids`.
482
+
483
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
484
+ [`PreTrainedTokenizer.__call__`] for details.
485
+
486
+ [What are input IDs?](../glossary#input-ids)
487
+ past_key_values (`Tuple[torch.Tensor]` of length `config.n_layers`):
488
+ Contains precomputed hidden-states (key and values in the attention blocks) as computed by the model (see
489
+ `past_key_values` output below). Can be used to speed up sequential decoding. The `input_ids` which have
490
+ their past given to this model should not be passed as `input_ids` as they have already been computed.
491
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
492
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
493
+
494
+ - 1 for tokens that are **not masked**,
495
+ - 0 for tokens that are **masked**.
496
+
497
+ If `past_key_values` is used, `attention_mask` needs to contain the masking strategy that was used for
498
+ `past_key_values`. In other words, the `attention_mask` always has to have the length:
499
+ `len(past_key_values) + len(input_ids)`
500
+
501
+ [What are attention masks?](../glossary#attention-mask)
502
+ token_type_ids (`torch.Tensor` of shape `(batch_size, input_ids_length)`, *optional*):
503
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
504
+ 1]`:
505
+
506
+ - 0 corresponds to a *sentence A* token,
507
+ - 1 corresponds to a *sentence B* token.
508
+
509
+ [What are token type IDs?](../glossary#token-type-ids)
510
+ position_ids (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
511
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
512
+ config.max_position_embeddings - 1]`.
513
+
514
+ [What are position IDs?](../glossary#position-ids)
515
+ head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
516
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
517
+
518
+ - 1 indicates the head is **not masked**,
519
+ - 0 indicates the head is **masked**.
520
+
521
+ inputs_embeds (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
522
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
523
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
524
+ model's internal embedding lookup matrix.
525
+
526
+ If `past_key_values` is used, optionally only the last `inputs_embeds` have to be input (see
527
+ `past_key_values`).
528
+ use_cache (`bool`, *optional*):
529
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
530
+ `past_key_values`).
531
+ output_attentions (`bool`, *optional*):
532
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
533
+ tensors for more detail.
534
+ output_hidden_states (`bool`, *optional*):
535
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
536
+ more detail.
537
+ return_dict (`bool`, *optional*):
538
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
539
+ """
540
+
541
+
542
+ @add_start_docstrings(
543
+ "The bare GPT_BIGCODE Model transformer outputting raw hidden-states without any specific head on top.",
544
+ GPT_BIGCODE_START_DOCSTRING,
545
+ )
546
+ class CodeShellModel(CodeShellPreTrainedModel):
547
+ def __init__(self, config):
548
+ super().__init__(config)
549
+ self.group_query_attention = config.group_query_attention
550
+ self.num_query_groups = config.num_query_groups
551
+ self.position_embedding_type = config.position_embedding_type
552
+ self.embed_dim = config.hidden_size
553
+
554
+ self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
555
+ if self.position_embedding_type == "learned_absolute":
556
+ self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
557
+ else:
558
+ pass
559
+
560
+ self.drop = nn.Dropout(config.embd_pdrop)
561
+ self.h = nn.ModuleList([CodeShellBlock(config, layer_idx=i) for i in range(config.num_hidden_layers)])
562
+ self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
563
+
564
+ max_positions = config.max_position_embeddings
565
+ self.register_buffer(
566
+ "bias", torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)), persistent=False
567
+ )
568
+
569
+ self.gradient_checkpointing = False
570
+
571
+ # Initialize weights and apply final processing
572
+ self.post_init()
573
+
574
+ def get_input_embeddings(self):
575
+ return self.wte
576
+
577
+ def set_input_embeddings(self, new_embeddings):
578
+ self.wte = new_embeddings
579
+
580
+ @add_start_docstrings_to_model_forward(GPT_BIGCODE_INPUTS_DOCSTRING)
581
+ def forward(
582
+ self,
583
+ input_ids: Optional[torch.Tensor] = None,
584
+ past_key_values: Optional[List[torch.Tensor]] = None,
585
+ attention_mask: Optional[torch.Tensor] = None,
586
+ token_type_ids: Optional[torch.Tensor] = None,
587
+ position_ids: Optional[torch.Tensor] = None,
588
+ head_mask: Optional[torch.Tensor] = None,
589
+ inputs_embeds: Optional[torch.Tensor] = None,
590
+ encoder_hidden_states: Optional[torch.Tensor] = None,
591
+ encoder_attention_mask: Optional[torch.Tensor] = None,
592
+ use_cache: Optional[bool] = None,
593
+ output_attentions: Optional[bool] = None,
594
+ output_hidden_states: Optional[bool] = None,
595
+ return_dict: Optional[bool] = None,
596
+ ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
597
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
598
+ output_hidden_states = (
599
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
600
+ )
601
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
602
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
603
+
604
+ if input_ids is not None and inputs_embeds is not None:
605
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
606
+ elif input_ids is not None:
607
+ input_shape = input_ids.size()
608
+ input_ids = input_ids.reshape(-1, input_shape[-1])
609
+ batch_size = input_ids.shape[0]
610
+ elif inputs_embeds is not None:
611
+ input_shape = inputs_embeds.size()[:-1]
612
+ batch_size = inputs_embeds.shape[0]
613
+ else:
614
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
615
+
616
+ if batch_size <= 0:
617
+ raise ValueError("batch_size has to be defined and > 0")
618
+
619
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
620
+
621
+ if token_type_ids is not None:
622
+ token_type_ids = token_type_ids.reshape(-1, input_shape[-1])
623
+ if position_ids is not None:
624
+ position_ids = position_ids.reshape(-1, input_shape[-1])
625
+
626
+ if past_key_values is None:
627
+ past_length = 0
628
+ past_key_values = tuple([None] * len(self.h))
629
+ else:
630
+ past_length = past_key_values[0][0].size(-2)
631
+
632
+ if attention_mask is not None and len(attention_mask.shape) == 2 and position_ids is None:
633
+ # create position_ids on the fly for batch generation
634
+ position_ids = attention_mask.long().cumsum(-1) - 1
635
+ position_ids.masked_fill_(attention_mask == 0, 1)
636
+ if past_length > 0:
637
+ position_ids = position_ids[:, past_length : input_shape[-1] + past_length]
638
+ elif position_ids is None:
639
+ position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
640
+ position_ids = position_ids.unsqueeze(0).reshape(-1, input_shape[-1])
641
+
642
+ # Self-attention mask.
643
+ query_length = input_shape[-1]
644
+ key_length = past_length + query_length
645
+ self_attention_mask = self.bias[None, key_length - query_length : key_length, :key_length]
646
+
647
+ if attention_mask is not None:
648
+ self_attention_mask = self_attention_mask * attention_mask.reshape(batch_size, 1, -1).to(
649
+ dtype=torch.bool, device=self_attention_mask.device
650
+ )
651
+
652
+ # MQA models: (batch_size, query_length, n_heads, key_length)
653
+ # MHA models: (batch_size, n_heads, query_length, key_length)
654
+ attention_mask = self_attention_mask.unsqueeze(1)
655
+
656
+ encoder_attention_mask = None
657
+
658
+ # Prepare head mask if needed
659
+ # 1.0 in head_mask indicate we keep the head
660
+ # attention_probs has shape bsz x n_heads x N x N
661
+ # head_mask has shape n_layer x batch x n_heads x N x N
662
+ head_mask = self.get_head_mask(head_mask, self.config.n_layer)
663
+
664
+ if inputs_embeds is None:
665
+ inputs_embeds = self.wte(input_ids)
666
+
667
+ hidden_states = inputs_embeds
668
+ if self.position_embedding_type == "learned_absolute":
669
+ position_embeds = self.wpe(position_ids)
670
+ hidden_states = hidden_states + position_embeds
671
+
672
+ if token_type_ids is not None:
673
+ token_type_embeds = self.wte(token_type_ids)
674
+ hidden_states = hidden_states + token_type_embeds
675
+
676
+ hidden_states = self.drop(hidden_states)
677
+
678
+ output_shape = input_shape + (hidden_states.size(-1),)
679
+
680
+ presents = [] if use_cache else None
681
+ all_self_attentions = () if output_attentions else None
682
+ all_hidden_states = () if output_hidden_states else None
683
+ for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
684
+ if output_hidden_states:
685
+ all_hidden_states = all_hidden_states + (hidden_states,)
686
+
687
+ if self.gradient_checkpointing and self.training:
688
+
689
+ def create_custom_forward(module):
690
+ def custom_forward(*inputs):
691
+ # None for past_key_value
692
+ return module(*inputs, use_cache, output_attentions)
693
+
694
+ return custom_forward
695
+
696
+ outputs = torch.utils.checkpoint.checkpoint(
697
+ create_custom_forward(block),
698
+ hidden_states,
699
+ None,
700
+ attention_mask,
701
+ position_ids,
702
+ head_mask[i],
703
+ encoder_hidden_states,
704
+ encoder_attention_mask,
705
+ )
706
+ else:
707
+ outputs = block(
708
+ hidden_states,
709
+ layer_past=layer_past,
710
+ attention_mask=attention_mask,
711
+ position_ids=position_ids,
712
+ head_mask=head_mask[i],
713
+ encoder_hidden_states=encoder_hidden_states,
714
+ encoder_attention_mask=encoder_attention_mask,
715
+ use_cache=use_cache,
716
+ output_attentions=output_attentions,
717
+ )
718
+
719
+ hidden_states = outputs[0]
720
+ if use_cache:
721
+ presents.append(outputs[1])
722
+
723
+ if output_attentions:
724
+ all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)
725
+
726
+ hidden_states = self.ln_f(hidden_states)
727
+ hidden_states = hidden_states.reshape(output_shape)
728
+ # Add last hidden state
729
+ if output_hidden_states:
730
+ all_hidden_states = all_hidden_states + (hidden_states,)
731
+
732
+
733
+ if not return_dict:
734
+ return tuple(
735
+ v
736
+ for v in [hidden_states, presents, all_hidden_states, all_self_attentions]
737
+ if v is not None
738
+ )
739
+
740
+ return BaseModelOutputWithPastAndCrossAttentions(
741
+ last_hidden_state=hidden_states,
742
+ past_key_values=presents,
743
+ hidden_states=all_hidden_states,
744
+ attentions=all_self_attentions,
745
+ )
746
+
747
+
748
+ @add_start_docstrings(
749
+ """
750
+ The GPT_BIGCODE Model transformer with a language modeling head on top (linear layer with weights tied to the input
751
+ embeddings).
752
+ """,
753
+ GPT_BIGCODE_START_DOCSTRING,
754
+ )
755
+ class CodeShellForCausalLM(CodeShellPreTrainedModel):
756
+ _tied_weights_keys = ["lm_head.weight"]
757
+
758
+ def __init__(self, config):
759
+ super().__init__(config)
760
+ self.transformer = CodeShellModel(config)
761
+ self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
762
+
763
+ # Initialize weights and apply final processing
764
+ self.post_init()
765
+
766
+ def quantize(self, bits: int):
767
+ try:
768
+ import bitsandbytes
769
+ from .quantizer import quantize
770
+ except ImportError:
771
+ raise ImportError("Needs bitsandbytes to run quantize.")
772
+ return quantize(self, bits)
773
+
774
+ def get_output_embeddings(self):
775
+ return self.lm_head
776
+
777
+ def set_output_embeddings(self, new_embeddings):
778
+ self.lm_head = new_embeddings
779
+
780
+ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
781
+ token_type_ids = kwargs.get("token_type_ids", None)
782
+ # only last token for inputs_ids if past is defined in kwargs
783
+ if past_key_values:
784
+ input_ids = input_ids[:, -1].unsqueeze(-1)
785
+ if token_type_ids is not None:
786
+ token_type_ids = token_type_ids[:, -1].unsqueeze(-1)
787
+
788
+ attention_mask = kwargs.get("attention_mask", None)
789
+ position_ids = kwargs.get("position_ids", None)
790
+
791
+ if attention_mask is not None and position_ids is None:
792
+ # create position_ids on the fly for batch generation
793
+ position_ids = attention_mask.long().cumsum(-1) - 1
794
+ position_ids.masked_fill_(attention_mask == 0, 1)
795
+ if past_key_values:
796
+ position_ids = position_ids[:, -1].unsqueeze(-1)
797
+ else:
798
+ position_ids = None
799
+
800
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
801
+ if inputs_embeds is not None and past_key_values is None:
802
+ model_inputs = {"inputs_embeds": inputs_embeds}
803
+ else:
804
+ model_inputs = {"input_ids": input_ids}
805
+
806
+ model_inputs.update(
807
+ {
808
+ "past_key_values": past_key_values,
809
+ "use_cache": kwargs.get("use_cache"),
810
+ "position_ids": position_ids,
811
+ "attention_mask": attention_mask,
812
+ "token_type_ids": token_type_ids,
813
+ }
814
+ )
815
+ return model_inputs
816
+
817
+ @add_start_docstrings_to_model_forward(GPT_BIGCODE_INPUTS_DOCSTRING)
818
+ def forward(
819
+ self,
820
+ input_ids: Optional[torch.Tensor] = None,
821
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
822
+ attention_mask: Optional[torch.Tensor] = None,
823
+ token_type_ids: Optional[torch.Tensor] = None,
824
+ position_ids: Optional[torch.Tensor] = None,
825
+ head_mask: Optional[torch.Tensor] = None,
826
+ inputs_embeds: Optional[torch.Tensor] = None,
827
+ encoder_hidden_states: Optional[torch.Tensor] = None,
828
+ encoder_attention_mask: Optional[torch.Tensor] = None,
829
+ labels: Optional[torch.Tensor] = None,
830
+ use_cache: Optional[bool] = None,
831
+ output_attentions: Optional[bool] = None,
832
+ output_hidden_states: Optional[bool] = None,
833
+ return_dict: Optional[bool] = None,
834
+ ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
835
+ r"""
836
+ labels (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
837
+ Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
838
+ `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
839
+ are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
840
+ """
841
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
842
+
843
+ transformer_outputs = self.transformer(
844
+ input_ids,
845
+ past_key_values=past_key_values,
846
+ attention_mask=attention_mask,
847
+ token_type_ids=token_type_ids,
848
+ position_ids=position_ids,
849
+ head_mask=head_mask,
850
+ inputs_embeds=inputs_embeds,
851
+ encoder_hidden_states=encoder_hidden_states,
852
+ encoder_attention_mask=encoder_attention_mask,
853
+ use_cache=use_cache,
854
+ output_attentions=output_attentions,
855
+ output_hidden_states=output_hidden_states,
856
+ return_dict=return_dict,
857
+ )
858
+ hidden_states = transformer_outputs[0]
859
+ lm_logits = self.lm_head(hidden_states)
860
+ loss = None
861
+ if labels is not None:
862
+ # Shift so that tokens < n predict n
863
+ shift_logits = lm_logits[..., :-1, :].contiguous()
864
+ shift_labels = labels[..., 1:].contiguous().to(shift_logits.device)
865
+ # Flatten the tokens
866
+ loss_fct = CrossEntropyLoss()
867
+ loss = loss_fct(shift_logits.reshape(-1, shift_logits.size(-1)), shift_labels.reshape(-1))
868
+
869
+ if not return_dict:
870
+ output = (lm_logits,) + transformer_outputs[1:]
871
+ return ((loss,) + output) if loss is not None else output
872
+
873
+ return CausalLMOutputWithCrossAttentions(
874
+ loss=loss,
875
+ logits=lm_logits,
876
+ past_key_values=transformer_outputs.past_key_values,
877
+ hidden_states=transformer_outputs.hidden_states,
878
+ attentions=transformer_outputs.attentions,
879
+ )
880
+
881
+ @staticmethod
882
+ def _reorder_cache(past_key_values, beam_idx):
883
+ reordered_past = ()
884
+ for layer_past in past_key_values:
885
+ reordered_past += (
886
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
887
+ )
888
+ return reordered_past
889
+
890
+ class CodeShell4bitForCausalLM(CodeShellForCausalLM):
891
+ def __init__(self, config):
892
+ CodeShellPreTrainedModel.__init__(self, config)
893
+ self.transformer = CodeShellModel(config)
894
+ self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
895
+
896
+ try:
897
+ import bitsandbytes
898
+ from .quantizer import quantize_offline
899
+ quantize_offline(self)
900
+ except ImportError:
901
+ raise ImportError("Needs bitsandbytes to run quantize.")
902
+
903
+ self.post_init()
904
+
905
+ @classmethod
906
+ def from_pretrained(
907
+ cls,
908
+ pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
909
+ *model_args,
910
+ config: Optional[Union[PretrainedConfig, str, os.PathLike]] = None,
911
+ cache_dir: Optional[Union[str, os.PathLike]] = None,
912
+ ignore_mismatched_sizes: bool = False,
913
+ force_download: bool = False,
914
+ local_files_only: bool = False,
915
+ token: Optional[Union[str, bool]] = None,
916
+ revision: str = "main",
917
+ use_safetensors: bool = None,
918
+ **kwargs,
919
+ ):
920
+ if not isinstance(config, PretrainedConfig):
921
+ config_path = config if config is not None else pretrained_model_name_or_path
922
+ config, _ = cls.config_class.from_pretrained(
923
+ config_path,
924
+ cache_dir=cache_dir,
925
+ return_unused_kwargs=True,
926
+ force_download=force_download,
927
+ resume_download=False,
928
+ proxies=None,
929
+ local_files_only=local_files_only,
930
+ token=token,
931
+ revision=revision,
932
+ subfolder="",
933
+ _from_auto=False,
934
+ _from_pipeline=None,
935
+ **kwargs,
936
+ )
937
+
938
+ # Load config if we don't provide a configuration
939
+ from .quantizer import load_state_dict_for_qunantied_model
940
+ model = cls(config)
941
+ state_dict = torch.load(os.path.join(pretrained_model_name_or_path, 'pytorch_model.bin'), map_location="cpu")
942
+ model = load_state_dict_for_qunantied_model(model, state_dict)
943
+ model.eval()
944
+
945
+ # If it is a model with generation capabilities, attempt to load the generation config
946
+ if model.can_generate():
947
+ try:
948
+ model.generation_config = GenerationConfig.from_pretrained(
949
+ pretrained_model_name_or_path,
950
+ cache_dir=cache_dir,
951
+ force_download=force_download,
952
+ resume_download=False,
953
+ proxies=None,
954
+ local_files_only=local_files_only,
955
+ token=token,
956
+ revision=revision,
957
+ subfolder="",
958
+ _from_auto=False,
959
+ _from_pipeline=None,
960
+ **kwargs,
961
+ )
962
+ except (OSError, TypeError):
963
+ pass
964
+
965
+ device_map = kwargs.pop("device_map", None)
966
+ if device_map is not None:
967
+ model = model.to(torch.device(device_map))
968
+
969
+ return model
970
+
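The attention module above implements grouped-query attention: `c_attn` projects to 32 query heads plus a shared set of 8 key/value heads, and `repeat_kv` broadcasts the KV heads before the standard scaled-dot-product step. A small shape check, with dimensions taken from config.json (head_dim = 4096 // 32 = 128); the `repeat_kv` body is copied from the file above:

```python
import torch

bsz, q_len, num_heads, num_kv_heads, head_dim = 2, 16, 32, 8, 128

def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    # Same logic as repeat_kv in modeling_codeshell.py.
    batch, num_key_value_heads, slen, hd = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, hd)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, hd)

# Key states as produced after splitting the fused c_attn projection.
key_states = torch.randn(bsz, num_kv_heads, q_len, head_dim)
expanded = repeat_kv(key_states, num_heads // num_kv_heads)
print(expanded.shape)  # torch.Size([2, 32, 16, 128]) -- 8 KV heads shared across 32 query heads
```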
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1196cbde268398d9991c41f302f86685930d5efa62621c8b72e1b1967ce8c8de
+ size 4740708865
quantizer.py ADDED
@@ -0,0 +1,263 @@
+ # coding=utf-8
+ # Copyright 2023 WisdomShell Inc. All Rights Reserved.
+
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ try:
+     import bitsandbytes as bnb
+     from bitsandbytes.nn.modules import Params4bit, Int8Params
+ except ImportError:
+     pass
+ import torch
+
+ # Replacement for Params4bit.cuda: move the packed weight together with the
+ # tensors held in its quant_state tuple onto the target GPU.
+ def Params4bitCuda(self, device):
+     self.data = self.data.cuda(device)
+     if self.quant_state is not None:
+         self.quant_state[0] = self.quant_state[0].cuda(device)
+         self.quant_state[6] = self.quant_state[6].cuda(device)
+     return self
+
+ # Replacement for Params4bit.to with the same quant_state handling.
+ def Params4bitTo(self, *args, **kwargs):
+     device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs)
+
+     if (device is not None and device.type == "cuda" and self.data.device.type == "cpu"):
+         return self.cuda(device)
+     else:
+         if self.quant_state is not None:
+             # make sure the quantization state is on the right device
+             self.quant_state[0] = self.quant_state[0].to(device)
+             self.quant_state[6] = self.quant_state[6].to(device)
+         new_param = Params4bit(self.to(device=device, dtype=dtype, non_blocking=non_blocking),
+                                requires_grad=self.requires_grad, quant_state=self.quant_state,
+                                blocksize=self.blocksize, compress_statistics=self.compress_statistics,
+                                quant_type=self.quant_type)
+
+         return new_param
+
+ class Linear4bitOnline(torch.nn.Module):
+     def __init__(self, weight, bias, quant_type):
+         super().__init__()
+         self.weight = Params4bit(
+             weight.data, requires_grad=False, compress_statistics=True, quant_type=quant_type
+         )
+         self.compute_dtype = None
+         # self.weight.cuda(weight.device)
+         self.bias = bias
+
+     def forward(self, x: torch.Tensor):
+         # weights are cast automatically as Int8Params, but the bias has to be cast manually
+         if self.bias is not None and self.bias.dtype != x.dtype:
+             self.bias.data = self.bias.data.to(x.dtype)
+
+         if getattr(self.weight, "quant_state", None) is None:
+             print(
+                 "FP4 quantization state not initialized. Please call .cuda() or .to(device) on the LinearFP4 layer first."
+             )
+         inp_dtype = x.dtype
+         if self.compute_dtype is not None:
+             x = x.to(self.compute_dtype)
+
+         bias = None if self.bias is None else self.bias.to(self.compute_dtype)
+         out = bnb.matmul_4bit(
+             x, self.weight.t(), bias=bias, quant_state=self.weight.quant_state
+         )
+
+         out = out.to(inp_dtype)
+
+         return out
+
+ class Linear8bitLtOnline(torch.nn.Module):
+     def __init__(
+         self,
+         weight,
+         bias,
+         has_fp16_weights=True,
+         memory_efficient_backward=False,
+         threshold=0.0,
+         index=None,
+     ):
+         super().__init__()
+         assert (
+             not memory_efficient_backward
+         ), "memory_efficient_backward is no longer required and the argument is deprecated in 0.37.0 and will be removed in 0.39.0"
+         self.state = bnb.MatmulLtState()
+         self.index = index
+
+         # Necessary for stacked layers
+         self.state.threshold = threshold
+         self.state.has_fp16_weights = has_fp16_weights
+         self.state.memory_efficient_backward = memory_efficient_backward
+         if threshold > 0.0 and not has_fp16_weights:
+             self.state.use_pool = True
+
+         self.weight = Int8Params(
+             weight.data,
+             has_fp16_weights=has_fp16_weights,
+             requires_grad=has_fp16_weights,
+         )
+         self.bias = bias
+
+     def init_8bit_state(self):
+         self.state.CB = self.weight.CB
+         self.state.SCB = self.weight.SCB
+         self.weight.CB = None
+         self.weight.SCB = None
+
+     def forward(self, x: torch.Tensor):
+         self.state.is_training = self.training
+         if self.weight.CB is not None:
+             self.init_8bit_state()
+
+         # weights are cast automatically as Int8Params, but the bias has to be cast manually
+         if self.bias is not None and self.bias.dtype != x.dtype:
+             self.bias.data = self.bias.data.to(x.dtype)
+
+         out = bnb.matmul(x, self.weight, bias=self.bias, state=self.state)
+
+         if not self.state.has_fp16_weights:
+             if self.state.CB is not None and self.state.CxB is not None:
+                 # we converted 8-bit row major to turing/ampere format in the first inference pass
+                 # we no longer need the row-major weight
+                 del self.state.CB
+                 self.weight.data = self.state.CxB
+         return out
+
+ # Wrap the attention and MLP projections of every transformer block with
+ # on-the-fly 4-bit (NF4) or 8-bit linear layers.
+ def quantize_online(model, bits: int):
+     def quant(weight, bias=None):
+         if bits == 8:
+             linear = Linear8bitLtOnline(
+                 weight,
+                 bias,
+                 has_fp16_weights=False,
+                 threshold=6.0,
+             )
+             if bias is not None:
+                 linear.bias = torch.nn.Parameter(bias)
+         elif bits == 4:
+             linear = Linear4bitOnline(
+                 weight,
+                 bias,
+                 quant_type="nf4",  # fp4/nf4
+             )
+         else:
+             raise ValueError("quantize only support 4/8 bit")
+         return linear
+
+     def auto_quant(layer):
+         if hasattr(layer, "bias"):
+             linear = quant(layer.weight, bias=layer.bias)
+         else:
+             linear = quant(layer.weight)
+         return linear
+
+     for i, layer in enumerate(model.transformer.h):
+         layer.mlp.c_fc = auto_quant(layer.mlp.c_fc)
+         layer.mlp.c_proj = auto_quant(layer.mlp.c_proj)
+
+         layer.attn.c_attn = auto_quant(layer.attn.c_attn)
+         layer.attn.c_proj = auto_quant(layer.attn.c_proj)
+
+     return model
+
+
+ # Which checkpoint entries are stored as packed 4-bit tensors (True) versus
+ # plain tensors (False).
+ general_weight_dict = {
+     "transformer.wte.weight": False,
+     "transformer.ln_f.weight": False,
+     "transformer.ln_f.bias": False,
+     "lm_head.weight": False,
+ }
+
+ layer_weight_dict = {
+     "transformer.h.{i}.ln_1.weight": False,
+     "transformer.h.{i}.ln_1.bias": False,
+     "transformer.h.{i}.attn.c_attn.weight": True,
+     "transformer.h.{i}.attn.c_attn.bias": False,
+     "transformer.h.{i}.attn.c_proj.weight": True,
+     "transformer.h.{i}.attn.c_proj.bias": False,
+     "transformer.h.{i}.attn.rotary_emb.inv_freq": False,
+     "transformer.h.{i}.ln_2.weight": False,
+     "transformer.h.{i}.ln_2.bias": False,
+     "transformer.h.{i}.mlp.c_fc.weight": True,
+     "transformer.h.{i}.mlp.c_fc.bias": False,
+     "transformer.h.{i}.mlp.c_proj.weight": True,
+     "transformer.h.{i}.mlp.c_proj.bias": False,
+ }
+ num_dict = {str(i): i for i in range(100)}
+
+ # Walk the module tree along a dotted parameter name and set the leaf either to
+ # a Params4bit rebuilt from "<name>.data" / "<name>.quant_state" or to the plain tensor.
+ def set_value(model, name, state_dict, is_4bit):
+     keys = name.split('.')
+     parent = model
+     for key in keys[:-1]:
+         if key in num_dict:
+             parent = parent[num_dict[key]]
+         else:
+             parent = getattr(parent, key)
+     if is_4bit:
+         weight_data = state_dict[f'{name}.data']
+         weight_quant_state = state_dict[f'{name}.quant_state']
+         assert weight_data is not None, name
+         assert weight_quant_state is not None, name
+         setattr(parent, keys[-1], Params4bit(weight_data, requires_grad=False, quant_state=weight_quant_state))
+     else:
+         setattr(parent, keys[-1], state_dict[name])
+
+ # Rebuild the attention and MLP projections as bnb.nn.Linear4bit modules so the
+ # pre-quantized checkpoint can be loaded into them.
+ def quantize_offline(model):
+     for i, layer in enumerate(model.transformer.h):
+         layer.mlp.c_fc = bnb.nn.Linear4bit(
+             layer.mlp.c_fc.weight.shape[1],
+             layer.mlp.c_fc.weight.shape[0],
+             False,
+             torch.bfloat16,
+             compress_statistics=True,
+             quant_type="nf4",
+         )
+         layer.mlp.c_proj = bnb.nn.Linear4bit(
+             layer.mlp.c_proj.weight.shape[1],
+             layer.mlp.c_proj.weight.shape[0],
+             False,
+             torch.bfloat16,
+             compress_statistics=True,
+             quant_type="nf4",
+         )
+
+         layer.attn.c_attn = bnb.nn.Linear4bit(
+             layer.attn.c_attn.weight.shape[1],
+             layer.attn.c_attn.weight.shape[0],
+             False,
+             torch.bfloat16,
+             compress_statistics=True,
+             quant_type="nf4",
+         )
+         layer.attn.c_proj = bnb.nn.Linear4bit(
+             layer.attn.c_proj.weight.shape[1],
+             layer.attn.c_proj.weight.shape[0],
+             False,
+             torch.bfloat16,
+             compress_statistics=True,
+             quant_type="nf4",
+         )
+     return model
+
+ def load_state_dict_for_qunantied_model(model, state_dict):
+     # replace Params4bit.cuda with Params4bitCuda so .cuda()/.to() also move quant_state
+     Params4bit.cuda = Params4bitCuda
+     Params4bit.to = Params4bitTo
+
+     for name, is_4bit in general_weight_dict.items():
+         set_value(model, name, state_dict, is_4bit)
+
+     for layer_i in range(len(model.transformer.h)):
+         for name, is_4bit in layer_weight_dict.items():
+             name = name.replace('{i}', str(layer_i))
+             set_value(model, name, state_dict, is_4bit)
+     return model
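Taken together, quantizer.py supports two paths: quantize_online wraps a full-precision model's linear projections in 4-/8-bit layers at load time, while quantize_offline plus load_state_dict_for_qunantied_model rebuild the projections as bnb.nn.Linear4bit and pour an already-quantized state dict into them, patching Params4bit.cuda/.to so the quantization state follows the weights across devices. A rough sketch of the offline path; the class names CodeShellConfig and CodeShellForCausalLM and the local paths are assumptions for illustration, not confirmed by this file:

    import torch
    from configuration_codeshell import CodeShellConfig        # assumed import
    from modeling_codeshell import CodeShellForCausalLM        # assumed class name
    from quantizer import quantize_offline, load_state_dict_for_qunantied_model

    config = CodeShellConfig.from_pretrained("./")              # this repo's config
    model = CodeShellForCausalLM(config)                        # full-precision skeleton
    model = quantize_offline(model)                             # swap projections for bnb.nn.Linear4bit
    state_dict = torch.load("pytorch_model.bin", map_location="cpu")
    model = load_state_dict_for_qunantied_model(model, state_dict)
    model = model.cuda().eval()                                 # patched Params4bit.cuda moves quant_state too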
special_tokens_map.json ADDED
@@ -0,0 +1,27 @@
+ {
+   "additional_special_tokens": [
+     "<|endoftext|>",
+     "<fim_prefix>",
+     "<fim_middle>",
+     "<fim_suffix>",
+     "<fim_pad>",
+     "<filename>",
+     "<gh_stars>",
+     "<issue_start>",
+     "<issue_comment>",
+     "<issue_closed>",
+     "<jupyter_start>",
+     "<jupyter_text>",
+     "<jupyter_code>",
+     "<jupyter_output>",
+     "<empty_output>",
+     "<commit_before>",
+     "<commit_msg>",
+     "<commit_after>",
+     "<reponame>"
+   ],
+   "bos_token": "<|endoftext|>",
+   "eos_token": "<|endoftext|>",
+   "pad_token": "<|endoftext|>",
+   "unk_token": "<|endoftext|>"
+ }
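These are the StarCoder-family sentinel tokens; the fim_* entries in particular support fill-in-the-middle prompting, where the model generates the code that belongs between a given prefix and suffix. A small sketch of how such a prompt is conventionally assembled (the repository id is a placeholder, and the prefix-suffix-middle ordering is the usual convention rather than something specified by this file):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("WisdomShell/CodeShell-7B-Chat-int4", trust_remote_code=True)  # placeholder id

    prefix = "def add(a, b):\n    "
    suffix = "\n    return result\n"
    # conventional FIM layout: the completion for the gap is generated after <fim_middle>
    fim_prompt = f"<fim_prefix>{prefix}<fim_suffix>{suffix}<fim_middle>"
    inputs = tok(fim_prompt, return_tensors="pt")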
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,186 @@
+ {
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "70000": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "70001": {
+       "content": "<fim_prefix>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "70002": {
+       "content": "<fim_middle>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "70003": {
+       "content": "<fim_suffix>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "70004": {
+       "content": "<fim_pad>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "70005": {
+       "content": "<filename>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "70006": {
+       "content": "<gh_stars>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "70007": {
+       "content": "<issue_start>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "70008": {
+       "content": "<issue_comment>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "70009": {
+       "content": "<issue_closed>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "70010": {
+       "content": "<jupyter_start>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "70011": {
+       "content": "<jupyter_text>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "70012": {
+       "content": "<jupyter_code>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "70013": {
+       "content": "<jupyter_output>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "70014": {
+       "content": "<empty_output>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "70015": {
+       "content": "<commit_before>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "70016": {
+       "content": "<commit_msg>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "70017": {
+       "content": "<commit_after>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "70018": {
+       "content": "<reponame>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "additional_special_tokens": [
+     "<|endoftext|>",
+     "<fim_prefix>",
+     "<fim_middle>",
+     "<fim_suffix>",
+     "<fim_pad>",
+     "<filename>",
+     "<gh_stars>",
+     "<issue_start>",
+     "<issue_comment>",
+     "<issue_closed>",
+     "<jupyter_start>",
+     "<jupyter_text>",
+     "<jupyter_code>",
+     "<jupyter_output>",
+     "<empty_output>",
+     "<commit_before>",
+     "<commit_msg>",
+     "<commit_after>",
+     "<reponame>"
+   ],
+   "bos_token": "<|endoftext|>",
+   "clean_up_tokenization_spaces": true,
+   "eos_token": "<|endoftext|>",
+   "model_max_length": 4096,
+   "pad_token": "<|endoftext|>",
+   "tokenizer_class": "GPT2Tokenizer",
+   "unk_token": "<|endoftext|>",
+   "vocab_size": 70000
+ }
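A quick sanity check that a loaded tokenizer reflects this configuration: a GPT2-style BPE tokenizer with the sentinel tokens starting at id 70000, <|endoftext|> serving as bos/eos/pad/unk, and a 4096-token model_max_length (the repository id is again a placeholder):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("WisdomShell/CodeShell-7B-Chat-int4", trust_remote_code=True)  # placeholder id
    assert tok.bos_token == tok.eos_token == tok.pad_token == tok.unk_token == "<|endoftext|>"
    assert tok.convert_tokens_to_ids("<|endoftext|>") == 70000
    assert tok.convert_tokens_to_ids("<fim_prefix>") == 70001
    assert tok.model_max_length == 4096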
vocab.json ADDED
The diff for this file is too large to render. See raw diff