TheBloke committed
Commit 71cdf2d · 1 Parent(s): 1b4c932

Update README.md

Files changed (1)
  1. README.md +44 -12
README.md CHANGED
@@ -90,24 +90,56 @@ pip install einops
 
  You can then run this example code:
  ```python
- import torch
- from transformers import AutoTokenizer
- from auto_gptq import AutoGPTQForCausalLM
-
- # Download the model from HF and store it locally, then reference its location here:
- quantized_model_dir = "/path/to/falcon7b-instruct-gptq"
-
- from transformers import AutoTokenizer
- tokenizer = AutoTokenizer.from_pretrained(quantized_model_dir, use_fast=False)
-
- model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir, device="cuda:0", use_triton=False, use_safetensors=True, torch_dtype=torch.float32, trust_remote_code=True)
-
- prompt = "Write a story about llamas"
- prompt_template = f"### Instruction: {prompt}\n### Response:"
-
- tokens = tokenizer(prompt_template, return_tensors="pt").to("cuda:0").input_ids
- output = model.generate(input_ids=tokens, max_new_tokens=100, do_sample=True, temperature=0.8)
+ from transformers import AutoTokenizer, pipeline, logging
+ from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
+ import argparse
+
+ model_name_or_path = "TheBloke/falcon-7b-instruct-GPTQ"
+ # You could also download the model locally, and access it there
+ # model_name_or_path = "/path/to/TheBloke_falcon-7b-instruct-GPTQ"
+
+ model_basename = "gptq_model-4bit-64g"
+
+ use_triton = False
+
+ tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
+
+ model = AutoGPTQForCausalLM.from_quantized(model_name_or_path,
+         model_basename=model_basename,
+         use_safetensors=True,
+         trust_remote_code=True,
+         device="cuda:0",
+         use_triton=use_triton,
+         quantize_config=None)
+
+ prompt = "Tell me about AI"
+ prompt_template=f'''### Human: {prompt}
+ ### Assistant:'''
+
+ print("\n\n*** Generate:")
+
+ input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.cuda()
+ output = model.generate(inputs=input_ids, temperature=0.7, max_new_tokens=512)
  print(tokenizer.decode(output[0]))
+
+ # Inference can also be done using transformers' pipeline
+ # Note that if you use pipeline, you will see a spurious error message saying the model type is not supported
+ # This can be ignored! Or you can hide it with the following logging line:
+ # Prevent printing spurious transformers error when using pipeline with AutoGPTQ
+ logging.set_verbosity(logging.CRITICAL)
+
+ print("*** Pipeline:")
+ pipe = pipeline(
+     "text-generation",
+     model=model,
+     tokenizer=tokenizer,
+     max_new_tokens=512,
+     temperature=0.7,
+     top_p=0.95,
+     repetition_penalty=1.15
+ )
+
+ print(pipe(prompt_template)[0]['generated_text'])
  ```
 
  ## Provided files
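
The updated example streams the weights from the Hub by repo id. For the locally-downloaded route mentioned in the new `model_name_or_path` comment, here is a minimal sketch (not part of the commit above; it assumes the `huggingface_hub` package is installed, and any other download method that puts the repo's files on disk works the same way):

```python
# Sketch only, not part of the commit: fetch the repo into the local HF cache,
# then point AutoGPTQ at the resulting directory instead of the repo id.
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

# Downloads (or reuses) the snapshot and returns its local path
local_dir = snapshot_download(repo_id="TheBloke/falcon-7b-instruct-GPTQ")

tokenizer = AutoTokenizer.from_pretrained(local_dir, use_fast=True)
model = AutoGPTQForCausalLM.from_quantized(local_dir,
        model_basename="gptq_model-4bit-64g",
        use_safetensors=True,
        trust_remote_code=True,
        device="cuda:0",
        use_triton=False,
        quantize_config=None)
```

From here the prompt formatting, `model.generate()` call and `pipeline` usage are exactly as in the updated example above.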