TheBloke committed on
Commit
36e6fa5
1 Parent(s): 36c11fc

Initial GPTQ model commit

Browse files
Files changed (1) hide show
  1. quantizer.py +211 -0
quantizer.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import bitsandbytes as bnb
2
+ from accelerate import init_empty_weights
3
+ from bitsandbytes.nn.modules import Params4bit, Int8Params
4
+ import torch
5
+
6
def Params4bitCuda(self, device):
    """Move a 4-bit parameter and the tensors inside its quant_state to CUDA.

    ``init_model_weight_int4`` in this file installs this function as
    ``Params4bit.cuda``, so ``self`` is expected to be a ``Params4bit``
    whose ``quant_state`` is already populated.

    Args:
        self: Object exposing ``data`` and an indexable, mutable
            ``quant_state``.
        device: Forwarded unchanged to every ``.cuda(device)`` call.

    Returns:
        ``self``, after its entries have been replaced in place.
    """
    self.data = self.data.cuda(device)

    # NOTE(review): the slots touched below are presumably absmax (0), the
    # compressed-statistics pair (4) and the quantization code table (6) of
    # the bitsandbytes release this was written against -- confirm before
    # upgrading bitsandbytes, since the layout is positional.
    state = self.quant_state
    state[0] = state[0].cuda(device)

    nested = state[4]
    nested[0] = nested[0].cuda(device)
    inner = nested[1]
    inner[0] = inner[0].cuda(device)
    inner[1] = inner[1].cuda(device)

    state[6] = state[6].cuda(device)
    return self
15
+
16
class Linear4bitOnline(torch.nn.Module):
    """Linear layer backed by a bitsandbytes ``Params4bit`` weight.

    The layer is built from an existing weight tensor (and optional bias)
    rather than from feature sizes, and its forward pass goes through
    ``bnb.matmul_4bit``.
    """

    def __init__(self, weight, bias, quant_type):
        """Wrap ``weight.data`` in a ``Params4bit`` and keep ``bias`` as given.

        Args:
            weight: Object whose ``.data`` is handed to ``Params4bit``.
            bias: Bias tensor or ``None``; stored without modification.
            quant_type: Passed through to ``Params4bit`` as ``quant_type``.
        """
        super().__init__()
        packed = Params4bit(
            weight.data,
            requires_grad=False,
            compress_statistics=True,
            quant_type=quant_type,
        )
        self.weight = packed
        # No dtype override by default; forward() skips the input cast
        # while this stays None.
        self.compute_dtype = None
        self.bias = bias

    def forward(self, x: torch.Tensor):
        """Compute the 4-bit matmul of ``x`` and return it in ``x``'s dtype."""
        original_dtype = x.dtype
        has_bias = self.bias is not None

        # The bias is not converted for us, so bring the stored bias to the
        # input dtype here (this mutates self.bias in place).
        if has_bias and self.bias.dtype != original_dtype:
            self.bias.data = self.bias.data.to(original_dtype)

        # Only a message is printed when the quant state is missing;
        # execution continues into the matmul regardless.
        if getattr(self.weight, "quant_state", None) is None:
            print(
                "FP4 quantization state not initialized. Please call .cuda() or .to(device) on the LinearFP4 layer first."
            )

        target_dtype = self.compute_dtype
        if target_dtype is not None:
            x = x.to(target_dtype)

        if has_bias:
            bias = self.bias.to(target_dtype)
        else:
            bias = None

        result = bnb.matmul_4bit(
            x, self.weight.t(), bias=bias, quant_state=self.weight.quant_state
        )

        # Hand the result back in the dtype the caller supplied.
        return result.to(original_dtype)
47
+
48
class Linear8bitLtOnline(torch.nn.Module):
    """Linear layer backed by a bitsandbytes ``Int8Params`` weight.

    The layer is built from an existing weight tensor (and optional bias)
    and multiplies through ``bnb.matmul`` using a ``bnb.MatmulLtState``.
    """

    def __init__(
        self,
        weight,
        bias,
        has_fp16_weights=True,
        memory_efficient_backward=False,
        threshold=0.0,
        index=None,
    ):
        """Create the matmul state and wrap ``weight.data`` in ``Int8Params``.

        Args:
            weight: Object whose ``.data`` is handed to ``Int8Params``.
            bias: Bias tensor or ``None``; stored without modification.
            has_fp16_weights: Copied onto the state and passed to
                ``Int8Params`` (also used as its ``requires_grad``).
            memory_efficient_backward: Must be falsy; asserted below.
            threshold: Copied onto the state as ``threshold``.
            index: Stored on the layer as ``index``.

        Raises:
            AssertionError: If ``memory_efficient_backward`` is truthy.
        """
        super().__init__()
        assert (
            not memory_efficient_backward
        ), "memory_efficient_backward is no longer required and the argument is deprecated in 0.37.0 and will be removed in 0.39.0"

        self.index = index

        # Necessary for stacked layers
        state = bnb.MatmulLtState()
        state.threshold = threshold
        state.has_fp16_weights = has_fp16_weights
        state.memory_efficient_backward = memory_efficient_backward
        if not has_fp16_weights and threshold > 0.0:
            state.use_pool = True
        self.state = state

        self.weight = Int8Params(
            weight.data,
            has_fp16_weights=has_fp16_weights,
            requires_grad=has_fp16_weights,
        )
        self.bias = bias

    def init_8bit_state(self):
        """Hand ``CB``/``SCB`` from the weight over to the matmul state."""
        weight, state = self.weight, self.state
        state.CB, state.SCB = weight.CB, weight.SCB
        # The weight no longer holds these once the state has them.
        weight.CB = weight.SCB = None

    def forward(self, x: torch.Tensor):
        """Run ``bnb.matmul`` on ``x`` with this layer's weight, bias and state."""
        state = self.state
        state.is_training = self.training

        # A non-None CB on the weight means the hand-over to the state has
        # not happened yet.
        if self.weight.CB is not None:
            self.init_8bit_state()

        # weights are cast automatically as Int8Params, but the bias has to be cast manually
        bias = self.bias
        if bias is not None and bias.dtype != x.dtype:
            bias.data = bias.data.to(x.dtype)

        out = bnb.matmul(x, self.weight, bias=self.bias, state=state)

        if state.has_fp16_weights:
            return out

        if state.CB is not None and state.CxB is not None:
            # we converted 8-bit row major to turing/ampere format in the first inference pass
            # we no longer need the row-major weight
            del state.CB
            self.weight.data = state.CxB
        return out
103
+
104
def quantize_offline(model, bits: int):
    """Swap every decoder layer's projections for fresh ``bnb.nn.Linear4bit`` modules.

    For each layer in ``model.model.layers`` the five projection modules
    (``self_attn.W_pack``, ``self_attn.o_proj``, ``mlp.gate_proj``,
    ``mlp.down_proj``, ``mlp.up_proj``) are replaced by newly constructed
    ``Linear4bit`` modules sized from the existing weight's shape. The
    existing weight values are not copied into the new modules here.

    Args:
        model: Object exposing ``model.layers``; mutated in place.
        bits: Must equal 4.

    Returns:
        The same ``model`` object.

    Raises:
        AssertionError: If ``bits`` is not 4.
    """
    assert (bits == 4), f'bits: {bits} is not supported'

    # (parent attribute, projection attribute), in replacement order.
    targets = (
        ("self_attn", "W_pack"),
        ("self_attn", "o_proj"),
        ("mlp", "gate_proj"),
        ("mlp", "down_proj"),
        ("mlp", "up_proj"),
    )

    for layer in model.model.layers:
        for parent_name, proj_name in targets:
            parent = getattr(layer, parent_name)
            shape = getattr(parent, proj_name).weight.shape
            # shape[1] / shape[0] are passed as the first two positional
            # arguments; the following False and torch.float16 are
            # presumably `bias` and `compute_dtype` -- confirm against the
            # installed bitsandbytes signature.
            replacement = bnb.nn.Linear4bit(
                shape[1],
                shape[0],
                False,
                torch.float16,
                compress_statistics=True,
                quant_type="nf4",
            )
            setattr(parent, proj_name, replacement)
    return model
150
+
151
def quantize_online(model, bits: int):
    """Replace every decoder layer's projections with 4- or 8-bit wrappers.

    For each layer in ``model.model.layers`` the five projection modules
    (``self_attn.W_pack``, ``self_attn.o_proj``, ``mlp.gate_proj``,
    ``mlp.down_proj``, ``mlp.up_proj``) are replaced by a
    ``Linear8bitLtOnline`` (``bits == 8``) or ``Linear4bitOnline``
    (``bits == 4``) built from the existing module's ``weight``. No bias is
    forwarded from the existing modules.

    Args:
        model: Object exposing ``model.layers``; mutated in place.
        bits: Bit width, 4 or 8.

    Returns:
        The same ``model`` object.

    Raises:
        ValueError: If ``bits`` is neither 4 nor 8. This is checked before
            any layer is touched, so it is raised even when the model has
            no layers.
    """
    # Validate once, up front: an unsupported width is rejected before the
    # model is inspected instead of only when the first layer is converted.
    if bits not in (4, 8):
        raise ValueError("quantize only support 4/8 bit")

    def quant(weight, bias=None):
        """Build the wrapper module for one weight at the requested width."""
        if bits == 8:
            linear = Linear8bitLtOnline(
                weight,
                bias,
                has_fp16_weights=False,
                threshold=6.0,
            )
            if bias is not None:
                linear.bias = torch.nn.Parameter(bias)
            return linear
        return Linear4bitOnline(
            weight,
            bias,
            quant_type="nf4",  # fp4/nf4
        )

    # (parent attribute, projection attribute), in replacement order.
    targets = (
        ("self_attn", "W_pack"),
        ("self_attn", "o_proj"),
        ("mlp", "gate_proj"),
        ("mlp", "down_proj"),
        ("mlp", "up_proj"),
    )

    for layer in model.model.layers:
        for parent_name, proj_name in targets:
            parent = getattr(layer, parent_name)
            setattr(parent, proj_name, quant(getattr(parent, proj_name).weight))
    return model
179
+
180
def init_model_weight_int4(config, model, state_dict):
    """Install pre-quantized 4-bit weights from ``state_dict`` into ``model``.

    As a module-level side effect, ``Params4bit.cuda`` is replaced with
    ``Params4bitCuda`` for the rest of the process.

    For each of the ``config.num_hidden_layers`` layers, every projection's
    ``weight`` is rebuilt as a ``Params4bit`` from the ``...weight.data``
    and ``...weight.quant_state`` entries, and the two layer-norm weights
    are assigned directly from ``state_dict``. The embedding, final norm
    and ``lm_head`` weights are assigned the same way afterwards.

    Args:
        config: Object exposing ``num_hidden_layers``.
        model: Object exposing ``model.layers``, ``model.embed_tokens``,
            ``model.norm`` and ``lm_head``; mutated in place.
        state_dict: Mapping containing the keys described above.

    Returns:
        The same ``model`` object.

    Raises:
        KeyError: If ``state_dict`` lacks an expected key (for a plain
            ``dict``).
    """
    # replace Params4bit.cuda with Params4bitCuda
    Params4bit.cuda = Params4bitCuda

    # (parent attribute, projection attribute), in assignment order.
    projections = (
        ("self_attn", "W_pack"),
        ("self_attn", "o_proj"),
        ("mlp", "gate_proj"),
        ("mlp", "up_proj"),
        ("mlp", "down_proj"),
    )
    norm_names = ("input_layernorm", "post_attention_layernorm")

    for i in range(config.num_hidden_layers):
        prefix = f'model.layers.{i}'

        for parent_name, proj_name in projections:
            key = f'{prefix}.{parent_name}.{proj_name}.weight'
            packed = Params4bit(
                state_dict[f'{key}.data'],
                requires_grad=False,
                quant_state=state_dict[f'{key}.quant_state'],
            )
            parent = getattr(model.model.layers[i], parent_name)
            getattr(parent, proj_name).weight = packed

        for norm_name in norm_names:
            norm = getattr(model.model.layers[i], norm_name)
            norm.weight = state_dict[f'{prefix}.{norm_name}.weight']

    model.model.embed_tokens.weight = state_dict['model.embed_tokens.weight']
    model.model.norm.weight = state_dict['model.norm.weight']
    model.lm_head.weight = state_dict['lm_head.weight']
    return model