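The snippet below loads the 3-bit GPTQ checkpoint of Llama-2-13B-chat from the Hugging Face Hub and runs a short generation. `disable_exllama=True` is needed because the ExLlama kernel only supports 4-bit GPTQ weights; the base 7B tokenizer is reused since all Llama-2 sizes share the same tokenizer.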
```
from auto_gptq import AutoGPTQForCausalLM
from transformers import AutoTokenizer

model_path = 'efficient-llm/llama-2-13b-chat-gptq'
# All Llama-2 sizes share one tokenizer, so the base 7B tokenizer works here.
tokenizer_path = 'meta-llama/Llama-2-7b-hf'

model = AutoGPTQForCausalLM.from_quantized(
    model_path,
    # inject_fused_attention=False,  # either this, or disable ExLlama below
    disable_exllama=True,  # ExLlama kernels only support 4-bit; this checkpoint is 3-bit
    device_map='auto',
    revision='3bit_128g',  # Hub branch holding the 3-bit, group-size-128 weights
)

tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True)
input_ids = tokenizer('How are you?', return_tensors='pt').input_ids.to('cuda')
outputs = model.generate(input_ids=input_ids, max_length=128)
print(tokenizer.decode(outputs[0]))
```
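Running this assumes a CUDA-capable GPU (the prompt tensor is moved to `'cuda'` explicitly) and the `auto-gptq` and `transformers` packages installed via pip. Other bit-width and group-size variants, if published for this model, live on separate Hub branches selectable through the `revision` argument.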
|
|