cicdatopea committed
Commit f4bbbab · verified · Parent: 7c8c9c6

Update README.md

Files changed (1): README.md (+9, -12)
README.md CHANGED
@@ -29,7 +29,7 @@ intel-extension-for-transformers: faster repacking, slower inference,higher accuracy

intel-extension-for-pytorch: much slower repacking, faster inference, lower accuracy

- ~~python
+ ~~~python
from auto_round import AutoRoundConfig ##must import for autoround format
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
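
The hunk above shows only the first lines of the CPU inference snippet whose code fence this commit repairs. For reference, a minimal sketch of such a loading path is given below; the repository id and the generation arguments are assumptions for illustration, not copied from the README.

```python
# Minimal sketch of loading this INT4 checkpoint on CPU via the auto-round format.
# The model id below is an assumption; substitute the actual repository.
from auto_round import AutoRoundConfig  # must import to register the auto-round format
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_id = "OPEA/DeepSeek-V3-int4-sym"  # hypothetical repo id
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="cpu",
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

prompt = "Please give a brief introduction of DeepSeek company."
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=50, do_sample=False)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```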
@@ -161,7 +161,7 @@ prompt = "There is a girl who likes adventure,"
prompt = "Please give a brief introduction of DeepSeek company."
##INT4:
"""DeepSeek Artificial Intelligence Co., Ltd. (referred to as "DeepSeek" or "深度求索") , founded in 2023, is a Chinese company dedicated to making AGI a reality"""
- ~~
+ ~~~

### INT4 Inference on CUDA(have not tested, maybe need 8X80G GPU)

@@ -217,7 +217,7 @@ we have no enough resource to evaluate the model

We discovered that the inputs and outputs of certain layers in this model are very large and even exceed the FP16 range when tested with a few prompts. It is recommended to exclude these layers from quantization—particularly the 'down_proj' in layer 60—and run them using BF16 precision instead. However, we have not implemented this in this int4 model as in cpu, the compute dtype for int4 is bf16 or FP32.

- ~~python
+ ~~~python
model.layers.60.mlp.experts.150.down_proj tensor(1144.) tensor(2122.9451)
model.layers.60.mlp.experts.231.down_proj tensor(25856.) tensor(12827.9980)
model.layers.60.mlp.shared_experts.down_proj tensor(1880.) tensor(3156.7344)
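
The paragraph quoted in the hunk above recommends excluding the overflow-prone `down_proj` layers from quantization and running them in BF16, which was not done for the released int4 model. A minimal sketch of how that could be expressed is given below, assuming auto-round accepts a per-layer `layer_config` with a 16-bit setting; option names may differ across versions.

```python
# Sketch only: mark the overflow-prone projections to stay unquantized (16-bit)
# during tuning. Assumes auto-round's per-layer `layer_config` with a "bits" entry.
overflow_layers = [
    "model.layers.59.mlp.experts.138.down_proj",
    "model.layers.60.mlp.experts.150.down_proj",
    "model.layers.60.mlp.experts.231.down_proj",
    "model.layers.60.mlp.shared_experts.down_proj",
    "model.layers.60.mlp.experts.81.down_proj",
    "model.layers.60.mlp.experts.92.down_proj",
]
layer_config = {name: {"bits": 16} for name in overflow_layers}
# layer_config would then be passed to AutoRound(...); see the tuning sketch
# after the "3 tuning" hunk below.
```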
@@ -227,17 +227,14 @@ model.layers.59.mlp.experts.138.down_proj tensor(1568.) tensor(190.8769)
model.layers.60.mlp.experts.81.down_proj tensor(7360.) tensor(10024.4531)
model.layers.60.mlp.experts.92.down_proj tensor(116224.) tensor(55192.4180)

- ~~
-
-
+ ~~~

**1 add meta data to bf16 model** https://huggingface.co/opensourcerelease/DeepSeek-V3-bf16

- ~~python
+ ~~~python
import safetensors
from safetensors.torch import save_file

-
for i in range(1, 164):
    idx_str = "0" * (5-len(str(i))) + str(i)
    safetensors_path = f"model-{idx_str}-of-000163.safetensors"
@@ -247,7 +244,7 @@ for i in range(1, 164):
        for key in f.keys():
            tensors[key] = f.get_tensor(key)
    save_file(tensors, safetensors_path, metadata={'format': 'pt'})
- ~~
+ ~~~



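The two hunks above show only parts of the metadata-patching loop; the lines that open each shard fall outside the diff context. A self-contained sketch of the same idea follows; the `safe_open` block and the `tensors` dict are assumptions based on the standard safetensors API, not copies of the elided README lines.

```python
# Sketch: rewrite every safetensors shard so it carries {'format': 'pt'} metadata.
# The safe_open/tensors lines are assumed; only the surrounding loop is visible
# in the diff above.
from safetensors import safe_open
from safetensors.torch import save_file

for i in range(1, 164):
    idx_str = "0" * (5 - len(str(i))) + str(i)  # zero-pad to five digits, e.g. "00001"
    safetensors_path = f"model-{idx_str}-of-000163.safetensors"
    tensors = {}
    with safe_open(safetensors_path, framework="pt") as f:  # assumed shard-opening step
        for key in f.keys():
            tensors[key] = f.get_tensor(key)
    save_file(tensors, safetensors_path, metadata={'format': 'pt'})
```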
@@ -259,9 +256,9 @@ https://github.com/intel/auto-round/blob/deepseekv3/modeling_deepseek.py

**3 tuning**

- ~~
+ ```bash
git clone https://github.com/intel/auto-round.git && cd auto-round && git checkout deepseekv3
- ~~
+ ```

```bash
python3 -m auto_round --model "/models/DeepSeek-V3-bf16/" --group_size 128 --format "auto_gptq" --iters 200 --devices 0,1,2,3,4 --nsamples 512 --batch_size 8 --seqlen 512 --low_gpu_mem_usage --output_dir "tmp_autoround" --disable_eval e 2>&1 | tee -a seekv3.txt
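
For readers who prefer auto-round's Python API over the CLI command quoted above, a roughly equivalent tuning call is sketched below. The keyword arguments mirror the command-line flags; device placement and the `layer_config` hook from the earlier sketch are assumptions, not the exact recipe used for this model.

```python
# Rough Python-API counterpart of the `python3 -m auto_round ...` command above.
# A sketch under the assumption that the constructor arguments mirror the CLI flags.
from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_round import AutoRound

model_name = "/models/DeepSeek-V3-bf16/"
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

autoround = AutoRound(
    model,
    tokenizer,
    bits=4,
    group_size=128,
    iters=200,
    nsamples=512,
    batch_size=8,
    seqlen=512,
    low_gpu_mem_usage=True,
    # layer_config=layer_config,  # optionally keep overflow-prone layers in 16-bit
)
autoround.quantize()
autoround.save_quantized("tmp_autoround", format="auto_gptq")
```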
@@ -289,4 +286,4 @@ The license on this model does not constitute legal advice. We are not responsible

@article{cheng2023optimize, title={Optimize weight rounding via signed gradient descent for the quantization of llms}, author={Cheng, Wenhua and Zhang, Weiwei and Shen, Haihao and Cai, Yiyang and He, Xin and Lv, Kaokao and Liu, Yi}, journal={arXiv preprint arXiv:2309.05516}, year={2023} }

- [arxiv](https://arxiv.org/abs/2309.05516) [github](https://github.com/intel/auto-round)
+ [arxiv](https://arxiv.org/abs/2309.05516) [github](https://github.com/intel/auto-round)
 