Vasily Alexeev committed on
Commit 0c79aa9 • 1 Parent(s): 1f3dc6b

add info in readme

Files changed (1): README.md (+225, -0)

README.md CHANGED
---
base_model: NousResearch/Meta-Llama-3-70B-Instruct
model_type: llama
pipeline_tag: text-generation
quantized_by: Compressa
license: other
license_name: llama3
license_link: https://llama.meta.com/llama3/license
tags:
- llama3
- omniquant
- gptq
- triton
---
15

# Llama 3 70B Instruct – OmniQuant

Based on [Llama 3 70B Instruct](https://huggingface.co/NousResearch/Meta-Llama-3-70B-Instruct).

Quantized with [OmniQuant](https://github.com/OpenGVLab/OmniQuant).

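The checkpoint stores its quantization parameters in the `quantization_config` section of `config.json` (the loading code below reads `bits` and `group_size` from it). A quick way to inspect them, assuming only the repo id from this card:

```python
from transformers import AutoConfig

# Read the quantization parameters shipped with the checkpoint.
config = AutoConfig.from_pretrained(
    "compressa-ai/Llama-3-70B-Instruct-OmniQuant", trust_remote_code=True
)
print(config.quantization_config["bits"], config.quantization_config["group_size"])
```
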
24
## Evaluation

### PPL (↓)

|           | wiki |
| --------- | ---- |
| FP        | 5.33 |
| Quantized | 5.90 |

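The exact perplexity protocol is not spelled out on this card; the sketch below shows one common way such a WikiText number can be computed (the WikiText-2 test split and non-overlapping 2048-token windows are assumptions, not necessarily the setup used here):

```python
import torch
from datasets import load_dataset

def wikitext_ppl(model, tokenizer, window=2048):
    # Tokenize the concatenated raw test split once.
    data = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
    ids = tokenizer("\n\n".join(data["text"]), return_tensors="pt").input_ids

    nlls = []
    for start in range(0, ids.shape[1] - window, window):
        chunk = ids[:, start:start + window].to(model.device)
        with torch.no_grad():
            # With labels == inputs the model returns the mean NLL over the window.
            nlls.append(model(chunk, labels=chunk).loss)

    return torch.exp(torch.stack(nlls).mean()).item()
```
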
34
### Accuracy on English Benchmarks, % (↑)

|           | piqa | arc_easy | arc_challenge | boolq | hellaswag | winogrande |
| --------- | ---- | -------- | ------------- | ----- | --------- | ---------- |
| FP        | 81.5 | 86.2     | 61.9          | 87.4  | 63.7      | 75.8       |
| Quantized | 80.7 | 85.8     | 61.4          | 87.0  | 62.7      | 73.0       |

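The card does not state which harness produced these accuracies. One common choice is `lm-evaluation-harness`, roughly as sketched below for the FP baseline (the quantized checkpoint would first need the custom loader from the Examples section; the exact harness version and settings are assumptions):

```python
import lm_eval

# Evaluate the FP baseline on the same six tasks as in the table above.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=NousResearch/Meta-Llama-3-70B-Instruct,dtype=float16",
    tasks=["piqa", "arc_easy", "arc_challenge", "boolq", "hellaswag", "winogrande"],
    batch_size=4,
)
print(results["results"])
```
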
42
### Summary

|               | Avg acc diff on Eng, % (↑) | Occupied disk space, % (↓) |
| ------------- | -------------------------- | -------------------------- |
| FP            | 0                          | 100                        |
| **Quantized** | -1.0                       | 28.2                       |

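The disk-space column is presumably just the ratio of checkpoint sizes. Assuming both checkpoints are downloaded locally and store their weights as `*.safetensors` shards (the paths below are placeholders), it can be reproduced like this:

```python
from pathlib import Path

def weights_size_gb(path):
    # Total size of all weight shards in a checkpoint directory.
    return sum(f.stat().st_size for f in Path(path).glob("*.safetensors")) / 1024**3

fp_gb = weights_size_gb("Meta-Llama-3-70B-Instruct")
quant_gb = weights_size_gb("Llama-3-70B-Instruct-OmniQuant")
print(f"{100 * quant_gb / fp_gb:.1f}% of the FP checkpoint size")
```
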
50
## Examples

### Imports and Model Loading

<details>
<summary>Expand</summary>

```python
import gc

import auto_gptq.nn_modules.qlinear.qlinear_cuda as qlinear_cuda
import auto_gptq.nn_modules.qlinear.qlinear_triton as qlinear_triton
import torch

from accelerate import (
    init_empty_weights,
    infer_auto_device_map,
    load_checkpoint_in_model,
)
from tqdm import tqdm
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
)


def get_named_linears(model):
    # All nn.Linear submodules, keyed by their dotted names.
    return {
        name: module for name, module in model.named_modules()
        if isinstance(module, torch.nn.Linear)
    }


def set_module(model, name, module):
    # Walk the dotted module path and swap in the replacement module.
    parent = model
    levels = name.split('.')

    for i in range(len(levels) - 1):
        cur_name = levels[i]

        if cur_name.isdigit():
            parent = parent[int(cur_name)]
        else:
            parent = getattr(parent, cur_name)

    setattr(parent, levels[-1], module)


def load_model(model_path):
    # Based on: https://github.com/OpenGVLab/OmniQuant/blob/main/runing_quantized_mixtral_7bx8.ipynb

    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)

    if not hasattr(config, 'quantization_config'):
        raise AttributeError(
            f'No quantization info found in model config "{model_path}"'
            f' (`quantization_config` section is missing).'
        )

    wbits = config.quantization_config['bits']
    group_size = config.quantization_config['group_size']

    # We are going to init an ordinary model and then manually replace all Linears with QuantLinears
    del config.quantization_config

    with init_empty_weights():
        model = AutoModelForCausalLM.from_config(config=config, torch_dtype=torch.float16, trust_remote_code=True)

    layers = model.model.layers

    for i in tqdm(range(len(layers))):
        layer = layers[i]
        named_linears = get_named_linears(layer)

        for name, module in named_linears.items():
            params = (
                wbits, group_size,
                module.in_features, module.out_features,
                module.bias is not None
            )

            # Pick the auto_gptq kernel that matches the bit width.
            if wbits in [2, 4]:
                q_linear = qlinear_triton.QuantLinear(*params)
            elif wbits == 3:
                q_linear = qlinear_cuda.QuantLinear(*params)
            else:
                raise NotImplementedError("Only 2, 3 and 4 bits are supported.")

            q_linear.to(next(layer.parameters()).device)
            set_module(layer, name, q_linear)

        torch.cuda.empty_cache()
        gc.collect()

    model.tie_weights()
    device_map = infer_auto_device_map(model)

    print("Loading pre-computed quantized weights...")

    load_checkpoint_in_model(
        model, checkpoint=model_path,
        device_map=device_map, offload_state_dict=True,
    )

    print("Model loaded successfully!")

    return model
```
160
</details>

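An optional sanity check after loading, using only the classes already imported above, confirms that the linears were actually replaced and reports the approximate in-memory size of the quantized model:

```python
model = load_model("compressa-ai/Llama-3-70B-Instruct-OmniQuant")

# Count the swapped-in quantized linears and report the total weight footprint.
n_quant = sum(
    isinstance(m, (qlinear_triton.QuantLinear, qlinear_cuda.QuantLinear))
    for m in model.modules()
)
print(f"{n_quant} quantized linears, {model.get_memory_footprint() / 1024**3:.1f} GiB")
```
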
163
### Inference

```python
model_path = "compressa-ai/Llama-3-70B-Instruct-OmniQuant"

model = load_model(model_path).cuda()
tokenizer = AutoTokenizer.from_pretrained(
    model_path, use_fast=False, trust_remote_code=True
)

# Llama 3 "specifics"
# https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/discussions/4
terminators = [
    tokenizer.convert_tokens_to_ids("<|end_of_text|>"),
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

system_message = "You are a friendly chatbot who responds as if you are the Sandy Cheeks squirrel from the SpongeBob SquarePants cartoon."
user_message = "Do squirrels communicate with birds?"
messages = [
    {"role": "system", "content": system_message},
    {"role": "user", "content": user_message},
]
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

inputs = tokenizer(prompt, return_tensors="pt")
inputs = {k: v.cuda() for k, v in inputs.items()}

outputs = model.generate(
    **inputs, max_new_tokens=512,
    do_sample=True, temperature=0.7, top_p=0.95,
    eos_token_id=terminators,
)

response = tokenizer.decode(outputs[0])
continuation = response.removeprefix(prompt).removesuffix(tokenizer.eos_token)

print(f'Prompt:\n{prompt}')
print(f'Continuation:\n{continuation}\n')
```
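For interactive use it is often nicer to stream tokens as they are produced. A minimal variant of the `generate` call above using `transformers.TextStreamer` (it reuses `model`, `tokenizer`, `inputs` and `terminators` from the previous block):

```python
from transformers import TextStreamer

# Print the continuation token by token instead of waiting for generate() to finish.
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

_ = model.generate(
    **inputs, max_new_tokens=512,
    do_sample=True, temperature=0.7, top_p=0.95,
    eos_token_id=terminators,
    streamer=streamer,
)
```
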
205

### Inference Using Pipeline

```python
pipe = pipeline(
    "text-generation",
    model=model, tokenizer=tokenizer,
    eos_token_id=terminators,
    max_new_tokens=512, do_sample=True,
    temperature=0.7, top_p=0.95,
    device=0,
)

prompt = pipe.tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

outputs = pipe(prompt)

response = outputs[0]["generated_text"]
continuation = response.removeprefix(prompt)

print(f'Prompt:\n{prompt}')
print(f'Continuation:\n{continuation}\n')
```
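The chat can also be continued with the same pipeline by growing the `messages` list. A short follow-up sketch (the extra user question is just an illustration):

```python
# Feed the assistant reply back in, dropping the end-of-turn token, then ask a follow-up.
messages.append({"role": "assistant", "content": continuation.removesuffix("<|eot_id|>").strip()})
messages.append({"role": "user", "content": "And how do they build their nests?"})

prompt = pipe.tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
outputs = pipe(prompt)
print(outputs[0]["generated_text"].removeprefix(prompt))
```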