```python
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

model_path = 'efficient-llm/llama-2-13b-chat-gptq'
# All Llama-2 sizes share the same tokenizer, so the 7B tokenizer works for the 13B model.
tokenizer_path = 'meta-llama/Llama-2-7b-hf'

# Load the GPTQ-quantized weights; `revision` selects the quantization branch.
model = AutoGPTQForCausalLM.from_quantized(
    model_path,
    # inject_fused_attention=False,  # or disable_exllama=True
    device_map='auto',
    revision='3bit_128g',
)

tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True)

input_ids = tokenizer('How are you?', return_tensors='pt').input_ids.to('cuda')
outputs = model.generate(input_ids=input_ids, max_length=128)
print(tokenizer.decode(outputs[0]))
```
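
A note on the commented-out arguments: AutoGPTQ's ExLlama kernels support only 4-bit weights, so sub-4-bit revisions such as `3bit_128g` typically require passing `disable_exllama=True` to `from_quantized`; likewise, `inject_fused_attention=False` falls back to the unfused attention path when the fused kernels are incompatible with the checkpoint. Which flag you need (if any) depends on your auto-gptq version, so treat the combination above as a starting point rather than a fixed recipe.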