Upload README.md with huggingface_hub
Browse files
README.md
CHANGED
@@ -37,16 +37,43 @@ outputs = model.generate(input_ids, max_new_tokens=128)
|
|
37 |
print(tokenizer.decode(outputs[0]))
|
38 |
```
|
39 |
|
40 |
-
How to use
|
41 |
```python
|
42 |
-
from
|
|
|
43 |
model_path = 'vita-group/llama-2-7b_wanda_2_4_gptq_4bit_128g'
|
|
|
44 |
model = AutoGPTQForCausalLM.from_quantized(
|
45 |
model_path,
|
46 |
# inject_fused_attention=False, # or
|
47 |
disable_exllama=True,
|
48 |
device_map='auto',
|
|
|
49 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
```
|
51 |
|
52 |
|
|
|
37 |
print(tokenizer.decode(outputs[0]))
|
38 |
```
|
39 |
|
40 |
+
How to use wanda+gptq models
|
41 |
```python
|
42 |
+
from transformers import AutoTokenizer
|
43 |
+
from auto_gptq import AutoGPTQForCausalLM
|
44 |
model_path = 'vita-group/llama-2-7b_wanda_2_4_gptq_4bit_128g'
|
45 |
+
tokenizer_path = 'meta-llama/Llama-2-7b-hf'
|
46 |
model = AutoGPTQForCausalLM.from_quantized(
|
47 |
model_path,
|
48 |
# inject_fused_attention=False, # or
|
49 |
disable_exllama=True,
|
50 |
device_map='auto',
|
51 |
+
revision='4bit_128g',
|
52 |
)
|
53 |
+
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True)
|
54 |
+
input_ids = tokenizer('Hello! I am a VITA-compressed-LLM chatbot!', return_tensors='pt').input_ids.to('cuda')
|
55 |
+
outputs = model.generate(input_ids=input_ids, max_length=128)
|
56 |
+
tokenizer.decode(outputs[0])
|
57 |
+
```
|
58 |
+
|
59 |
+
How to use gptq models
|
60 |
+
```python
|
61 |
+
from transformers import AutoTokenizer
|
62 |
+
from auto_gptq import AutoGPTQForCausalLM
|
63 |
+
model_path = 'vita-group/vicuna-7b-v1.3_gptq'
|
64 |
+
tokenizer_path = 'lmsys/vicuna-7b-v1.3'
|
65 |
+
revision = '2bit_128g'
|
66 |
+
model = AutoGPTQForCausalLM.from_quantized(
|
67 |
+
model_path,
|
68 |
+
# inject_fused_attention=False, # or
|
69 |
+
disable_exllama=True,
|
70 |
+
device_map='auto',
|
71 |
+
revision=revision,
|
72 |
+
)
|
73 |
+
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True)
|
74 |
+
input_ids = tokenizer('Hello! I am a VITA-compressed-LLM chatbot!', return_tensors='pt').input_ids.to('cuda')
|
75 |
+
outputs = model.generate(input_ids=input_ids, max_length=128)
|
76 |
+
tokenizer.decode(outputs[0])
|
77 |
```
|
78 |
|
79 |
|