Update README.md
README.md (CHANGED)
````diff
@@ -6,6 +6,7 @@ config version: 1
 torchao version: 0.14.dev
 ```
 
+# Generate Quantized Model
 ```
 import logging
 
@@ -106,4 +107,55 @@ output_text = tokenizer.batch_decode(
 print("Response:", output_text[0][len(prompt) :])
 
 assert(correct_output_text == output_text)
+```
+
+
+# Test Loading
+```
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+)
+from torchao.quantization import (
+    Float8Tensor,
+    Int4TilePackedTo4dTensor,
+    IntxUnpackedToInt8Tensor,
+)
+import torch
+
+model_name = "torchao-testing/opt-125m-ModuleFqnToConfig-v1-regex-0.14.0.dev"
+device = "cuda"
+input_text = "What are we having for dinner?"
+max_new_tokens = 10
+
+quantized_model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    device_map=device,
+    dtype=torch.bfloat16,
+)
+print("quantized model:", quantized_model)
+
+# Check the per-module quantization layout: layer 3's q_proj is int4
+# (tile packed), every other q_proj and all k_proj are float8, v_proj is
+# left in high precision, and out_proj is intx (unpacked to int8).
+for i in range(12):
+    if i == 3:
+        assert isinstance(quantized_model.model.decoder.layers[i].self_attn.q_proj.weight, Int4TilePackedTo4dTensor)
+    else:
+        assert isinstance(quantized_model.model.decoder.layers[i].self_attn.q_proj.weight, Float8Tensor)
+    assert isinstance(quantized_model.model.decoder.layers[i].self_attn.k_proj.weight, Float8Tensor)
+    assert not isinstance(quantized_model.model.decoder.layers[i].self_attn.v_proj.weight, Float8Tensor)
+    assert isinstance(quantized_model.model.decoder.layers[i].self_attn.out_proj.weight, IntxUnpackedToInt8Tensor)
+
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+input_ids = tokenizer(input_text, return_tensors="pt").to(device)
+
+output = quantized_model.generate(**input_ids, max_new_tokens=max_new_tokens)
+EXPECTED_OUTPUT = [
+    "What are we having for dinner?\n\nJessica: (smiling)",
+    "What are we having for dinner?\n\nJess: (smiling) I",
+]
+# assert tokenizer.decode(output[0], skip_special_tokens=True) in EXPECTED_OUTPUT
+
 ```
````
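For reference, the body of the "Generate Quantized Model" section is collapsed in this diff, so the config that produced the checkpoint is not visible here. Below is a minimal, hypothetical sketch of how a per-module scheme matching the "Test Loading" assertions could be written with torchao's `ModuleFqnToConfig`; the `re:` regex-key prefix, the specific config classes, and all argument values are assumptions based on torchao 0.14.dev, not code from this commit.

```
# Hypothetical sketch, not the commit's actual generation code.
import torch
from transformers import AutoModelForCausalLM, TorchAoConfig
from torchao.quantization import (
    Float8DynamicActivationFloat8WeightConfig,
    Int4WeightOnlyConfig,
    IntxWeightOnlyConfig,
    ModuleFqnToConfig,
)

quant_config = ModuleFqnToConfig({
    # Exact-FQN keys are assumed to take precedence over regex keys, so
    # layer 3's q_proj gets int4 while the regex below covers the rest.
    "model.decoder.layers.3.self_attn.q_proj": Int4WeightOnlyConfig(
        group_size=128,
        int4_packing_format="tile_packed_to_4d",  # assumed to yield Int4TilePackedTo4dTensor
    ),
    # All other q_proj and every k_proj become Float8Tensor.
    r"re:model\.decoder\.layers\.\d+\.self_attn\.[qk]_proj": Float8DynamicActivationFloat8WeightConfig(),
    # out_proj becomes an intx weight; the exact arguments are an assumption.
    r"re:model\.decoder\.layers\.\d+\.self_attn\.out_proj": IntxWeightOnlyConfig(
        weight_dtype=torch.int4,
    ),
    # v_proj matches no key and stays in bfloat16, matching the
    # `assert not isinstance(..., Float8Tensor)` check above.
})

model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-125m",
    device_map="cuda",
    dtype=torch.bfloat16,
    quantization_config=TorchAoConfig(quant_type=quant_config),
)

# torchao tensor subclasses are not safetensors-compatible, so save with
# safe_serialization=False before uploading the checkpoint.
model.save_pretrained("opt-125m-quantized", safe_serialization=False)
```

The appeal of regex keys (presumably the "regex" in the checkpoint name) is that a single entry covers all twelve decoder layers, while an exact-FQN entry overrides it for the one layer that needs a different scheme.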