import os

# GPU setup must happen before torch initializes CUDA: order devices by
# PCI bus ID and expose four GPUs to this process.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
)
from peft import PeftModel

# Load the base model sharded across the visible GPUs; "balanced_low_0"
# keeps GPU 0 lightly loaded so it has headroom for generation buffers.
model = AutoModelForCausalLM.from_pretrained(
    "codellama/CodeLlama-7b-Instruct-hf",
    device_map="balanced_low_0",
    torch_dtype=torch.float16,  # half precision; assumes fp16-capable GPUs
)

# Load the fine-tuned tokenizer. Tokenizers are CPU-side objects, so
# device_map does not apply here; padding and truncation are per-call
# arguments, not load-time options.
tokenizer = AutoTokenizer.from_pretrained("llama_prompt_model/tokenizer")

# The fine-tuned tokenizer may contain added tokens, so resize the
# embedding matrix to match before attaching the adapter.
model.resize_token_embeddings(len(tokenizer))

# Attach the PEFT adapter, then fold its weights into the base model so
# inference runs on a single plain model with no adapter indirection.
model = PeftModel.from_pretrained(model, "llama_prompt_model/model")
model = model.merge_and_unload()
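
# Optional aside: if the merged model will be reused, saving it once lets
# later runs load it directly and skip the merge step. The "llama_merged"
# output path is hypothetical, not part of the original script.
# model.save_pretrained("llama_merged")
# tokenizer.save_pretrained("llama_merged")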

# Build a generation pipeline. Llama tokenizers ship without a pad token,
# so reuse the EOS id (2 in the stock Llama vocabulary) for padding.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=2000,
    pad_token_id=tokenizer.eos_token_id,
)

# Wrap the prompt in the Llama-2/CodeLlama instruction format.
prompt = "What's your name?"

result = pipe(f"<s>[INST] {prompt} [/INST]")[0]["generated_text"]
print(result)
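
# A hedged aside: generated_text echoes the prompt by default. The standard
# return_full_text=False pipeline argument returns only the completion:
completion = pipe(f"<s>[INST] {prompt} [/INST]", return_full_text=False)[0]["generated_text"]
print(completion)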