import argparse

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel, PeftConfig

# Base model identifier and the PEFT adapter checkpoint applied on top of it.
model_id = "TinyPixel/Llama-2-7B-bf16-sharded"
peft_model_id = "checkpoint-3690"

# Read the adapter configuration before anything else. The result is not
# referenced again in this script, but the call fails early when the
# checkpoint path is unusable, i.e. before the base model is loaded.
config = PeftConfig.from_pretrained(peft_model_id)

# Quantization settings: 4-bit NF4 weights, no double quantization, float16
# compute. 8-bit loading is disabled; the llm_int8_* options are still passed
# explicitly so the full configuration is visible in one place.
bnb_config = BitsAndBytesConfig(
    load_in_8bit=False,
    load_in_4bit=True,
    llm_int8_threshold=6.0,
    llm_int8_skip_modules=None,
    llm_int8_enable_fp32_cpu_offload=False,
    llm_int8_has_fp16_weight=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype="float16",
)

# device_map={"": 0} places the whole model on device index 0.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map={"": 0},
)
# Wrap the quantized base model with the adapter weights from the checkpoint.
model = PeftModel.from_pretrained(model, peft_model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Inference only: switch the model to evaluation mode.
model.eval()

# Prompt template; the single %s is filled with the instruction text by gen().
prompt = "Below is an instruction that describes a task. Write a response that appropriately completes the request. ### Instruction: %s ### Response: "

def gen(x):
    """Generate a sampled response for the instruction *x*.

    The instruction is inserted into the module-level ``prompt`` template,
    tokenized, and passed to ``model.generate`` with sampling enabled and a
    limit of 128 new tokens.

    Args:
        x: Instruction text substituted for the ``%s`` in ``prompt``.

    Returns:
        The decoded text of the newly generated tokens only: the prompt and
        any special tokens (e.g. BOS/EOS markers) are not included.
    """
    q = prompt % (x,)

    # Move the inputs to the device the model actually lives on instead of a
    # hard-coded "cuda", so the two can never disagree.
    inputs = tokenizer(
        q,
        return_tensors="pt",
        return_token_type_ids=False,
    ).to(model.device)

    # `early_stopping` only affects beam-based decoding; with single-beam
    # sampling it is a no-op (and newer transformers versions warn about it),
    # so it is not passed here.
    gened = model.generate(
        **inputs,
        max_new_tokens=128,
        do_sample=True,
    )

    # A causal LM returns prompt tokens followed by the continuation. Slice
    # the prompt off by token count rather than by string replacement, which
    # leaves special tokens behind and silently fails whenever the decoded
    # text does not reproduce the prompt exactly.
    prompt_len = inputs["input_ids"].shape[1]
    return tokenizer.decode(gened[0][prompt_len:], skip_special_tokens=True)

if __name__ == "__main__":
    # Command-line entry point: one positional argument, the instruction text.
    parser = argparse.ArgumentParser(description="Generate responses based on instructions.")
    parser.add_argument("instruction", type=str, help="The instruction for generating a response.")
    args = parser.parse_args()

    # Run a single generation and print it to stdout.
    response = gen(args.instruction)
    print("Generated Response:", response)