add fp16 benchmark
Browse files
README.md
CHANGED
@@ -22,20 +22,19 @@ This is an <a href="https://github.com/mobiusml/hqq/">HQQ</a> all 4-bit (group-s
|
|
22 |
| Decoding* - short seq (tokens/sec)| OOM | 10.7 (tokens/sec) |
|
23 |
| Decoding* - long seq (tokens/sec)| OOM | 9.7 (tokens/sec)|
|
24 |
|
25 |
-
*:
|
26 |
|
27 |
## Performance
|
28 |
|
29 |
-
| Models | HQQ 4-bit/gs-64 (no calib) |
|
30 |
-
|
31 |
-
| ARC (25-shot) | 70.22 |
|
32 |
-
| HellaSwag (10-shot)| 86.39 |
|
33 |
-
| MMLU (5-shot) | 81.04 |
|
34 |
-
| TruthfulQA-MC2 | 60.39 |
|
35 |
-
| Winogrande (5-shot)| 84.53 |
|
36 |
-
| GSM8K (5-shot) | 89.92 |
|
37 |
-
| Average | 78.75 |
|
38 |
-
|
39 |
|
40 |
You can reproduce the results above via `pip install lm-eval==0.4.3`
|
41 |
|
@@ -58,7 +57,7 @@ from hqq.utils.generation_hf import HFGenerator
|
|
58 |
|
59 |
#Load the model
|
60 |
###################################################
|
61 |
-
model_id = 'mobiuslabsgmbh/Llama-3.1-70b-instruct_4bitgs64_hqq'
|
62 |
|
63 |
compute_dtype = torch.bfloat16 #bfloat16 for torchao, float16 for bitblas
|
64 |
cache_dir = '.'
|
|
|
22 |
| Decoding* - short seq (tokens/sec)| OOM | 10.7 (tokens/sec) |
|
23 |
| Decoding* - long seq (tokens/sec)| OOM | 9.7 (tokens/sec)|
|
24 |
|
25 |
+
*: measured on 1x A100 80GB GPU
|
26 |
|
27 |
## Performance
|
28 |
|
29 |
+
| Models | fp16 | HQQ 4-bit/gs-64 (no calib) |
|
30 |
+
|:-------------------:|:--------:|:--------:|
|
31 |
+
| ARC (25-shot) | 70.31 | 70.22 |
|
32 |
+
| HellaSwag (10-shot)| 86.40 | 86.39 |
|
33 |
+
| MMLU (5-shot) | 81.84 | 81.04 |
|
34 |
+
| TruthfulQA-MC2 | 59.83 | 60.39 |
|
35 |
+
| Winogrande (5-shot)| 84.85 | 84.53 |
|
36 |
+
| GSM8K (5-shot) | 88.25 | 89.92 |
|
37 |
+
| Average | 78.58 | 78.75 |
|
|
|
38 |
|
39 |
You can reproduce the results above via `pip install lm-eval==0.4.3`
|
40 |
|
|
|
57 |
|
58 |
#Load the model
|
59 |
###################################################
|
60 |
+
model_id = 'mobiuslabsgmbh/Llama-3.1-70b-instruct_4bitgs64_hqq'
|
61 |
|
62 |
compute_dtype = torch.bfloat16 #bfloat16 for torchao, float16 for bitblas
|
63 |
cache_dir = '.'
|