wenhuach committed
Commit d455069
1 Parent(s): dee7204

update model generated by v0.2


Signed-off-by: wenhuach <wenhuach87@gmail.com>

Files changed (5)
  1. README.md +21 -36
  2. config.json +7 -6
  3. model.safetensors +2 -2
  4. quantize_config.json +3 -3
  5. tokenizer.json +2 -2
README.md CHANGED
@@ -1,10 +1,3 @@
----
-license: apache-2.0
-datasets:
-- NeelNanda/pile-10k
-language:
-- en
----
 
 
 
@@ -21,11 +14,8 @@ This model is an int4 model with group_size 128 of [google/gemma-2b](https://hug
 
 ### INT4 Inference with AutoGPTQ's kernel
 
-Install the latest [AutoGPTQ](https://github.com/AutoGPTQ/AutoGPTQ) from source first
-
 ```python
-##pip install auto-gptq[triton]
-##pip install triton==2.2.0
+##pip install auto-gptq
 from transformers import AutoModelForCausalLM, AutoTokenizer
 quantized_model_dir = "Intel/gemma-2b-int4-inc"
 tokenizer = AutoTokenizer.from_pretrained(quantized_model_dir)
@@ -37,41 +27,40 @@ tokenizer = AutoTokenizer.from_pretrained(quantized_model_dir, use_fast=True)
 text = "There is a girl who likes adventure,"
 inputs = tokenizer(text, return_tensors="pt").to(model.device)
 print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50, do_sample=True)[0]))
+##<bos>There is a girl who likes adventure, and she is a girl who likes to travel. She is a girl who likes to explore the world and see new things. She is a girl who likes to meet new people and learn about their cultures. She is a girl who likes to take risks
 ```
 
 
 
 ### Evaluate the model
 
-Install [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness.git) from source, and the git id we used is 96d185fa6232a5ab685ba7c43e45d1dbb3bb906d
+pip3 install lm-eval==0.4.2
 
-pip install auto-gptq[triton]
-pip install triton==2.2.0
+pip install auto-gptq
 
 Please note that there is a discrepancy between the baseline result and the official data, which is a known issue within the official model card community.
 
 ```bash
-lm_eval --model hf --model_args pretrained="Intel/gemma-2b-int4-inc",autogptq=True,gptq_use_triton=True --device cuda:0 --tasks lambada_openai,hellaswag,piqa,winogrande,truthfulqa_mc1,openbookqa,boolq,rte,arc_easy,arc_challenge,mmlu --batch_size 16
+lm_eval --model hf --model_args pretrained="Intel/gemma-2b-int4-inc",autogptq=True,gptq_use_triton=True --device cuda:0 --tasks lambada_openai,hellaswag,piqa,winogrande,truthfulqa_mc1,openbookqa,boolq,arc_easy,arc_challenge,mmlu --batch_size 16
 ```
 
-| Metric         | FP16   | int4   |
-| -------------- | ------ | ------ |
-| Avg.           | 0.5383 | 0.5338 |
-| mmlu           | 0.3337 | 0.3276 |
-| lambada_openai | 0.6398 | 0.6319 |
-| hellaswag      | 0.5271 | 0.5161 |
-| winogrande     | 0.6472 | 0.6472 |
-| piqa           | 0.7699 | 0.7622 |
-| truthfulqa_mc1 | 0.2203 | 0.2191 |
-| openbookqa     | 0.3020 | 0.2980 |
-| boolq          | 0.6939 | 0.6939 |
-| rte            | 0.6426 | 0.6498 |
-| arc_easy       | 0.7424 | 0.7348 |
-| arc_challenge  | 0.4019 | 0.3908 |
 
 
+| Metric         | BF16   | FP16   | AutoRound v0.1 | AutoRound v0.2 |
+| -------------- | ------ | ------ | -------------- | -------------- |
+| Avg.           | 0.5263 | 0.5277 | 0.5235         | 0.5248         |
+| mmlu           | 0.3287 | 0.3287 | 0.3297         | 0.3309         |
+| lambada_openai | 0.6344 | 0.6375 | 0.6307         | 0.6379         |
+| hellaswag      | 0.5273 | 0.5281 | 0.5159         | 0.5184         |
+| winogrande     | 0.6504 | 0.6488 | 0.6543         | 0.6575         |
+| piqa           | 0.7671 | 0.7720 | 0.7612         | 0.7606         |
+| truthfulqa_mc1 | 0.2203 | 0.2203 | 0.2203         | 0.2191         |
+| openbookqa     | 0.2980 | 0.3020 | 0.3000         | 0.3060         |
+| boolq          | 0.6927 | 0.6936 | 0.6939         | 0.6966         |
+| arc_easy       | 0.7420 | 0.7403 | 0.7353         | 0.7357         |
+| arc_challenge  | 0.4019 | 0.4061 | 0.3933         | 0.3857         |
+
 
-### Reproduce the model
 
 Here is the sample command to reproduce the model
 
@@ -85,6 +74,8 @@ python3 main.py \
 --group_size 128 \
 --bits 4 \
 --iters 400 \
+--use_quant_input \
+--model_dtype "float16" \
 --deployment_device 'gpu' \
 --output_dir "./tmp_autoround"
 
@@ -111,9 +102,3 @@ Here are a couple of useful links to learn more about Intel's AI software:
 
 The license on this model does not constitute legal advice. We are not responsible for the actions of third parties who use this model. Please consult an attorney before using this model for commercial purposes.
 
-
-## Cite
-
-@article{cheng2023optimize, title={Optimize weight rounding via signed gradient descent for the quantization of llms}, author={Cheng, Wenhua and Zhang, Weiwei and Shen, Haihao and Cai, Yiyang and He, Xin and Lv, Kaokao}, journal={arXiv preprint arXiv:2309.05516}, year={2023} }
-
-[arxiv](https://arxiv.org/abs/2309.05516) [github](https://github.com/intel/auto-round)
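For convenience, here is the updated README's inference snippet assembled from the diff above into one runnable script. The model-loading line sits in unchanged context that the diff does not show, so the `device_map="auto"` argument below is an assumption, not taken from the commit.

```python
# pip install auto-gptq  (kernel used for int4 inference, per the updated README)
from transformers import AutoModelForCausalLM, AutoTokenizer

quantized_model_dir = "Intel/gemma-2b-int4-inc"
tokenizer = AutoTokenizer.from_pretrained(quantized_model_dir)

# Assumption: the loading call is elided diff context; device_map="auto"
# is a guess, not part of the commit.
model = AutoModelForCausalLM.from_pretrained(quantized_model_dir, device_map="auto")

text = "There is a girl who likes adventure,"
inputs = tokenizer(text, return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50, do_sample=True)[0]))
```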
config.json CHANGED
@@ -9,6 +9,7 @@
   "eos_token_id": 1,
   "head_dim": 256,
   "hidden_act": "gelu",
+  "hidden_activation": null,
   "hidden_size": 2048,
   "initializer_range": 0.02,
   "intermediate_size": 16384,
@@ -19,11 +20,12 @@
   "num_key_value_heads": 1,
   "pad_token_id": 0,
   "quantization_config": {
-    "autoround_version": "0.1",
+    "autoround_version": "0.2.0.dev",
     "bits": 4,
     "damp_percent": 0.01,
     "desc_act": false,
     "enable_minmax_tuning": true,
+    "enable_quanted_input": true,
     "group_size": 128,
     "is_marlin_format": false,
     "iters": 400,
@@ -32,17 +34,16 @@
     "model_file_base_name": "model",
     "model_name_or_path": null,
     "quant_method": "gptq",
-    "scale_dtype": "torch.float32",
+    "scale_dtype": "float16",
     "static_groups": false,
     "sym": false,
-    "true_sequential": false,
-    "use_quant_input": true
+    "true_sequential": false
   },
   "rms_norm_eps": 1e-06,
   "rope_scaling": null,
   "rope_theta": 10000.0,
-  "torch_dtype": "bfloat16",
-  "transformers_version": "4.38.2",
+  "torch_dtype": "float16",
+  "transformers_version": "4.40.2",
   "use_cache": true,
   "vocab_size": 256000
 }
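The renamed key in `quantization_config` (`use_quant_input` → `enable_quanted_input`) and the new `scale_dtype` value can be checked without downloading the 3 GB weights. A minimal sketch using `transformers.AutoConfig`; treating `quantization_config` as a plain dict is an assumption that holds in current transformers releases:

```python
from transformers import AutoConfig

# Fetches config.json only, not the safetensors weights.
config = AutoConfig.from_pretrained("Intel/gemma-2b-int4-inc")
qcfg = config.quantization_config  # dict parsed straight from config.json

# Fields touched by this commit; a KeyError on "enable_quanted_input"
# would indicate an older (v0.1) revision of the repo.
print(qcfg["autoround_version"])     # "0.2.0.dev"
print(qcfg["enable_quanted_input"])  # True (named "use_quant_input" in v0.1)
print(qcfg["scale_dtype"])           # "float16" (was "torch.float32")
print(config.torch_dtype)            # "float16"/torch.float16, depending on version
```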
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0ee3302033aa9b45890be54a999acad6b7531ff900408b325eafd6a21bc20399
-size 3130472776
+oid sha256:fcbf563b9667464d9217348e712763af0ae6acd26c5b53dc15483c18d51e910d
+size 3130472744
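Both LFS pointer files in this commit (this one and tokenizer.json below) change only the `oid`/`size` pair, so a downloaded snapshot can be verified against the new pointer. A small sketch using only the standard library; the local file path is a placeholder:

```python
import hashlib

# New pointer values from this commit (model.safetensors shown;
# tokenizer.json works the same way with its own oid/size).
EXPECTED_SHA256 = "fcbf563b9667464d9217348e712763af0ae6acd26c5b53dc15483c18d51e910d"
EXPECTED_SIZE = 3130472744

def verify(path: str, expected_sha256: str, expected_size: int) -> bool:
    """Hash the file in 1 MiB chunks and compare against the LFS pointer."""
    h = hashlib.sha256()
    size = 0
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
            size += len(chunk)
    return h.hexdigest() == expected_sha256 and size == expected_size

# Placeholder path: wherever the snapshot was downloaded.
print(verify("model.safetensors", EXPECTED_SHA256, EXPECTED_SIZE))
```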
quantize_config.json CHANGED
@@ -10,11 +10,11 @@
   "model_file_base_name": "model",
   "is_marlin_format": false,
   "quant_method": "intel/auto-round",
-  "autoround_version": "0.1",
+  "autoround_version": "0.2.0.dev",
   "iters": 400,
   "lr": 0.0025,
   "minmax_lr": 0.0025,
   "enable_minmax_tuning": true,
-  "use_quant_input": true,
-  "scale_dtype": "torch.float32"
+  "enable_quanted_input": true,
+  "scale_dtype": "float16"
 }
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d0d908b4f9326e0998815690e325b6abbd378978553e10627924dd825db7e243
-size 17477553
+oid sha256:4db21bfaffa1fd75fd741df2d95dc51e539d5cc38b07934bae0d7d129db90662
+size 17477581