'LlamaAWQForCausalLM' object has no attribute 'config'

#1 by havocy28 - opened

When running the demo code, the initial model.generate() inference works, but building the transformers pipeline then fails with the following error:


AttributeError Traceback (most recent call last)
Input In [2], in <cell line: 44>()
41 from transformers import pipeline
43 print("*** Pipeline:")
---> 44 pipe = pipeline(
45 "text-generation",
46 model=model,
47 tokenizer=tokenizer,
48 max_new_tokens=512,
49 do_sample=True,
50 temperature=0.7,
51 top_p=0.95,
52 top_k=40,
53 repetition_penalty=1.1
54 )
56 print(pipe(prompt_template)[0]['generated_text'])

File /usr/local/lib/python3.9/dist-packages/transformers/pipelines/__init__.py:880, in pipeline(task, model, config, tokenizer, feature_extractor, image_processor, framework, revision, use_fast, token, device, device_map, torch_dtype, trust_remote_code, model_kwargs, pipeline_class, **kwargs)
869 model_classes = {"tf": targeted_task["tf"], "pt": targeted_task["pt"]}
870 framework, model = infer_framework_load_model(
871 model,
872 model_classes=model_classes,
(...)
877 **model_kwargs,
878 )
--> 880 model_config = model.config
881 hub_kwargs["_commit_hash"] = model.config._commit_hash
882 load_tokenizer = type(model_config) in TOKENIZER_MAPPING or model_config.tokenizer_class is not None

File /usr/local/lib/python3.9/dist-packages/torch/nn/modules/module.py:1695, in Module.__getattr__(self, name)
1693 if name in modules:
1694 return modules[name]
-> 1695 raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")

AttributeError: 'LlamaAWQForCausalLM' object has no attribute 'config'
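
Looking at the traceback, pipeline() dies at the model_config = model.config lookup: the AutoAWQ wrapper apparently doesn't expose the config of the transformers model it wraps. A quick probe like the one below shows the config sitting one level down; note that the .model attribute is my assumption about how the wrapper stores the underlying HF model (based on AutoAWQ's BaseAWQForCausalLM), not something confirmed by its docs:

# Probe sketch: `.model` is assumed to hold the wrapped transformers model
print(hasattr(model, "config"))        # False -> this is what raises the AttributeError
print(hasattr(model.model, "config"))  # True  -> the transformers config lives here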

Here's the demo code for reference:

from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

model_name_or_path = "TheBloke/fin-llama-33B-AWQ"

# Load model
model = AutoAWQForCausalLM.from_quantized(model_name_or_path, fuse_layers=True,
                                          trust_remote_code=False, safetensors=True)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=False)

prompt = "Tell me about AI"
prompt_template=f'''Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{prompt}

### Response:

'''

print("\n\n*** Generate:")

tokens = tokenizer(
    prompt_template,
    return_tensors='pt'
).input_ids.cuda()

# Generate output

generation_output = model.generate(
    tokens,
    do_sample=True,
    temperature=0.7,
    top_p=0.95,
    top_k=40,
    max_new_tokens=512
)

print("Output: ", tokenizer.decode(generation_output[0]))

# Inference can also be done using transformers' pipeline

from transformers import pipeline

print("*** Pipeline:")
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    do_sample=True,
    temperature=0.7,
    top_p=0.95,
    top_k=40,
    repetition_penalty=1.1
)

print(pipe(prompt_template)[0]['generated_text'])
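
A possible workaround, sketched under the same (unconfirmed) assumption that the wrapper keeps the plain transformers model at model.model: hand pipeline() the underlying HF model rather than the AWQ wrapper, so the model.config lookup succeeds. The pipeline may still make other assumptions about the model, so treat this as a sketch rather than a proper fix:

from transformers import pipeline

# Workaround sketch, not an official fix: pass the wrapped transformers
# model (assumed to live at model.model) so pipeline() can read .config.
pipe = pipeline(
    "text-generation",
    model=model.model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    do_sample=True,
    temperature=0.7,
    top_p=0.95,
    top_k=40,
    repetition_penalty=1.1,
)
print(pipe(prompt_template)[0]['generated_text'])

Copying the config onto the wrapper first (model.config = model.model.config) would target the same missing attribute, if you'd rather keep generating through the AWQ object.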
