jyhong836 commited on
Commit
6bcf825
1 Parent(s): 3810abb

Upload README.md with huggingface_hub

Browse files
Files changed (1) hide show
  1. README.md +29 -2
README.md CHANGED
@@ -37,16 +37,43 @@ outputs = model.generate(input_ids, max_new_tokens=128)
37
  print(tokenizer.decode(outputs[0]))
38
  ```
39
 
40
- How to use quantized models
41
  ```python
42
- from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
 
43
  model_path = 'vita-group/llama-2-7b_wanda_2_4_gptq_4bit_128g'
 
44
  model = AutoGPTQForCausalLM.from_quantized(
45
  model_path,
46
  # inject_fused_attention=False, # or
47
  disable_exllama=True,
48
  device_map='auto',
 
49
  )
50
  ```
51
 
52
 
 
37
  print(tokenizer.decode(outputs[0]))
38
  ```
39
 
40
+ How to use wanda+gptq models
41
  ```python
42
+ from transformers import AutoTokenizer
43
+ from auto_gptq import AutoGPTQForCausalLM
44
  model_path = 'vita-group/llama-2-7b_wanda_2_4_gptq_4bit_128g'
45
+ tokenizer_path = 'meta-llama/Llama-2-7b-hf'
46
  model = AutoGPTQForCausalLM.from_quantized(
47
  model_path,
48
  # inject_fused_attention=False, # or
49
  disable_exllama=True,
50
  device_map='auto',
51
+ revision='2bit_128g',
52
  )
53
+ tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True)
54
+ input_ids = tokenizer('Hello! I am a VITA-compressed-LLM chatbot!', return_tensors='pt').input_ids.to('cuda')
55
+ outputs = model.generate(input_ids=input_ids, max_length=128)
56
+ tokenizer.decode(outputs[0])
57
+ ```
58
+
59
+ How to use gptq models
60
+ ```python
61
+ from transformers import AutoTokenizer
62
+ from auto_gptq import AutoGPTQForCausalLM
63
+ model_path = 'vita-group/vicuna-7b-v1.3_gptq'
64
+ tokenizer_path = 'lmsys/vicuna-7b-v1.3'
65
+ revision = '2bit_128g'
66
+ model = AutoGPTQForCausalLM.from_quantized(
67
+ model_path,
68
+ # inject_fused_attention=False, # or
69
+ disable_exllama=True,
70
+ device_map='auto',
71
+ revision=revision,
72
+ )
73
+ tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True)
74
+ input_ids = tokenizer('Hello! I am a VITA-compressed-LLM chatbot!', return_tensors='pt').input_ids.to('cuda')
75
+ outputs = model.generate(input_ids=input_ids, max_length=128)
76
+ tokenizer.decode(outputs[0])
77
  ```
78
 
79