import argparse

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel, PeftConfig
| model_id = "TinyPixel/Llama-2-7B-bf16-sharded" | |
| peft_model_id = "checkpoint-3690" | |
| config = PeftConfig.from_pretrained(peft_model_id) | |

# Load the base model with 4-bit NF4 quantization so it fits on a single GPU.
bnb_config = BitsAndBytesConfig(
    load_in_8bit=False,
    load_in_4bit=True,
    llm_int8_threshold=6.0,
    llm_int8_skip_modules=None,
    llm_int8_enable_fp32_cpu_offload=False,
    llm_int8_has_fp16_weight=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map={"": 0},  # place the whole model on GPU 0
)
# Attach the fine-tuned PEFT adapter on top of the quantized base model.
model = PeftModel.from_pretrained(model, peft_model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)
model.eval()
| prompt = "Below is an instruction that describes a task. Write a response that appropriately completes the request. ### Instruction: %s ### Response: " | |

def gen(x):
    q = prompt % (x,)
    inputs = tokenizer(q, return_tensors="pt", return_token_type_ids=False).to(model.device)
    with torch.no_grad():
        gened = model.generate(
            **inputs,
            max_new_tokens=128,
            do_sample=True,
        )
    # Decode only the newly generated tokens; string-replacing the prompt out of
    # the full decode is unreliable once special tokens such as <s> appear in it.
    return tokenizer.decode(gened[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Generate responses based on instructions.")
    parser.add_argument("instruction", type=str, help="The instruction for generating a response.")
    args = parser.parse_args()
    response = gen(args.instruction)
    print("Generated Response:", response)