I'm getting an error: AttributeError: module 'accelerate.utils' has no attribute 'modeling'. What should I do?

#21 · opened by Dhairye

This is my code:
from transformers import AutoTokenizer, pipeline, logging
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
model_name_or_path = "TheBloke/Llama-2-13B-chat-GPTQ"
model_basename = "gptq_model-4bit-128g"

use_triton = False

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

model = AutoGPTQForCausalLM.from_quantized(
    model_name_or_path,
    model_basename=model_basename,
    use_safetensors=True,
    trust_remote_code=True,
    device="cuda:0",
    use_triton=use_triton,
    quantize_config=None,
)

prompt = "Tell me about AI"
system_message = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."
prompt_template = f'''[INST] <<SYS>>
{system_message}
<</SYS>>

{prompt} [/INST]'''

print("\n\n*** Generate:")

input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.cuda()
output = model.generate(inputs=input_ids, temperature=0.7, max_new_tokens=512)
print(tokenizer.decode(output[0]))

# Inference can also be done using transformers' pipeline

# Prevent printing spurious transformers error when using pipeline with AutoGPTQ

logging.set_verbosity(logging.CRITICAL)

print("*** Pipeline:")
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    temperature=0.7,
    top_p=0.95,
    repetition_penalty=1.15,
)

print(pipe(prompt_template)[0]['generated_text'])

It gives me the following error:
AttributeError Traceback (most recent call last)
Cell In[23], line 13
9 use_triton = False
11 tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
---> 13 model = AutoGPTQForCausalLM.from_quantized(model_name_or_path,
14 model_basename=model_basename,
15 use_safetensors=True,
16 trust_remote_code=True,
17 device="cuda:0",
18 use_triton=use_triton,
19 quantize_config=None)
21 """
22 To download from a specific branch, use the revision parameter, as in this example:
23
(...)
30 quantize_config=None)
31 """
33 prompt = "Tell me about AI"

File /usr/local/lib/python3.10/dist-packages/auto_gptq/modeling/auto.py:94, in AutoGPTQForCausalLM.from_quantized(cls, model_name_or_path, save_dir, device_map, max_memory, device, low_cpu_mem_usage, use_triton, inject_fused_attention, inject_fused_mlp, use_cuda_fp16, quantize_config, model_basename, use_safetensors, trust_remote_code, warmup_triton, trainable, **kwargs)
88 quant_func = GPTQ_CAUSAL_LM_MODEL_MAP[model_type].from_quantized
89 keywords = {
90 key: kwargs[key]
91 for key in signature(quant_func).parameters
92 if key in kwargs
93 }
---> 94 return quant_func(
95 model_name_or_path=model_name_or_path,
96 save_dir=save_dir,
97 device_map=device_map,
98 max_memory=max_memory,
99 device=device,
100 low_cpu_mem_usage=low_cpu_mem_usage,
101 use_triton=use_triton,
102 inject_fused_attention=inject_fused_attention,
103 inject_fused_mlp=inject_fused_mlp,
104 use_cuda_fp16=use_cuda_fp16,
105 quantize_config=quantize_config,
106 model_basename=model_basename,
107 use_safetensors=use_safetensors,
108 trust_remote_code=trust_remote_code,
109 warmup_triton=warmup_triton,
110 trainable=trainable,
111 **keywords
112 )

File /usr/local/lib/python3.10/dist-packages/auto_gptq/modeling/_base.py:793, in BaseGPTQForCausalLM.from_quantized(cls, model_name_or_path, save_dir, device_map, max_memory, device, low_cpu_mem_usage, use_triton, torch_dtype, inject_fused_attention, inject_fused_mlp, use_cuda_fp16, quantize_config, model_basename, use_safetensors, trust_remote_code, warmup_triton, trainable, **kwargs)
790 if low_cpu_mem_usage:
791 make_sure_no_tensor_in_meta_device(model, use_triton, quantize_config.desc_act, quantize_config.group_size)
--> 793 accelerate.utils.modeling.load_checkpoint_in_model(
794 model,
795 checkpoint=model_save_name,
796 device_map=device_map,
797 offload_state_dict=True,
798 offload_buffers=True
799 )
800 model = simple_dispatch_model(model, device_map)
802 # == step4: set seqlen == #

AttributeError: module 'accelerate.utils' has no attribute 'modeling'
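
For anyone hitting the same thing, here is a minimal diagnostic sketch (not a fix) to check which accelerate and auto-gptq versions are installed and whether the submodule the traceback expects can actually be imported; it assumes the standard pip distribution names "accelerate" and "auto-gptq":

# Minimal diagnostic sketch: print installed versions and confirm that the
# submodule auto_gptq tries to use (accelerate.utils.modeling) is importable.
import importlib
import importlib.metadata

print("accelerate:", importlib.metadata.version("accelerate"))
print("auto-gptq:", importlib.metadata.version("auto-gptq"))

# auto_gptq calls accelerate.utils.modeling.load_checkpoint_in_model; importing
# the submodule explicitly shows whether it exists even when it is not exposed
# as an attribute of accelerate.utils.
modeling = importlib.import_module("accelerate.utils.modeling")
print("load_checkpoint_in_model present:", hasattr(modeling, "load_checkpoint_in_model"))

If that import itself fails, the accelerate install is likely broken or very old, rather than anything being wrong in the calling code.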
