mgoin committed · Commit 955b6a1 · verified · 1 Parent(s): cd23aab

Create README.md

Files changed (1): README.md +61 -0
README.md ADDED
@@ -0,0 +1,61 @@
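
The script below quantizes mistralai/Mixtral-8x7B-Instruct-v0.1 to FP8 with llmcompressor: it shards the model across two GPUs, calibrates on 512 samples of the open_platypus dataset, skips the lm_head and the MoE router gates, and saves a compressed checkpoint.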
```python
from typing import List

from transformers import AutoTokenizer

from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
from llmcompressor.transformers.compression.helpers import calculate_offload_device_map

MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1"
NUM_GPUS = 2

# Build a device map that spreads the model across NUM_GPUS, reserving memory
# for the Hessians used during calibration. Adjust NUM_GPUS to match your hardware.
device_map = calculate_offload_device_map(
    MODEL_ID, reserve_for_hessians=True, num_gpus=NUM_GPUS, torch_dtype="auto"
)

model = SparseAutoModelForCausalLM.from_pretrained(
    MODEL_ID, device_map=device_map, torch_dtype="auto"
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Dataset config parameters
DATASET_ID = "open_platypus"
MAX_SEQ_LENGTH = 2048
NUM_CALIBRATION_SAMPLES = 512

# Save location of the quantized model
OUTPUT_DIR = f"{MODEL_ID.split('/')[-1]}-FP8"
SAVE_COMPRESSED = True

layers_to_ignore: List[str] = [
    "lm_head",
    "re:.*block_sparse_moe.gate",  # the MoE router gates do not quantize well
]

# FP8-quantize every Linear layer except the ones listed above
recipe = QuantizationModifier(
    scheme="FP8", targets="Linear", ignore=layers_to_ignore
)

# Run one-shot calibration and quantization, then save the compressed checkpoint
oneshot(
    model=model,
    tokenizer=tokenizer,
    dataset=DATASET_ID,
    recipe=recipe,
    max_seq_length=MAX_SEQ_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    save_compressed=SAVE_COMPRESSED,
    overwrite_output_dir=True,
    output_dir=OUTPUT_DIR,
)

# Confirm that generations from the quantized model look sane.
print("========== SAMPLE GENERATION ==============")
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=20)
print(tokenizer.decode(output[0]))
print("==========================================")
```
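
Once saved, the compressed checkpoint can be loaded by an FP8-capable runtime. The sketch below is a minimal example using vLLM, not part of the original commit: it assumes a vLLM build with FP8 support, FP8-capable GPUs, and that the model was saved to the `OUTPUT_DIR` path from the script above.

```python
from vllm import LLM, SamplingParams

# Assumed local path: matches OUTPUT_DIR from the quantization script above.
llm = LLM(
    model="Mixtral-8x7B-Instruct-v0.1-FP8",
    tensor_parallel_size=2,  # split across the same 2 GPUs used for calibration
)

# Mirror the sanity-check generation from the quantization script.
outputs = llm.generate(["Hello my name is"], SamplingParams(max_tokens=20))
print(outputs[0].outputs[0].text)
```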