import argparse

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel, PeftConfig

model_id = "TinyPixel/Llama-2-7B-bf16-sharded"
peft_model_id = "checkpoint-3690"

# Adapter config; its base_model_name_or_path should match model_id above.
config = PeftConfig.from_pretrained(peft_model_id)

# 4-bit NF4 quantization (QLoRA-style) with fp16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_8bit=False,
    load_in_4bit=True,
    llm_int8_threshold=6.0,
    llm_int8_skip_modules=None,
    llm_int8_enable_fp32_cpu_offload=False,
    llm_int8_has_fp16_weight=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.float16,
)

# Load the quantized base model onto GPU 0, then attach the LoRA adapter.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map={"": 0},
)
model = PeftModel.from_pretrained(model, peft_model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)
model.eval()

prompt = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request. "
    "### Instruction: %s ### Response: "
)


def gen(x):
    q = prompt % (x,)
    inputs = tokenizer(q, return_tensors="pt", return_token_type_ids=False).to("cuda")
    with torch.no_grad():
        generated = model.generate(
            **inputs,
            max_new_tokens=128,
            do_sample=True,
        )
    # Decode only the newly generated tokens; slicing by input length is more
    # robust than string-replacing the prompt out of the decoded text, which
    # can fail when the tokenizer does not round-trip the prompt exactly.
    new_tokens = generated[0][inputs["input_ids"].shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Generate responses based on instructions.")
    parser.add_argument("instruction", type=str, help="The instruction for generating a response.")
    args = parser.parse_args()
    response = gen(args.instruction)
    print("Generated Response:", response)
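# Usage sketch (hypothetical): assuming the script is saved as generate.py,
# it can be run from the command line with an instruction of your choosing,
# or gen() can be called directly from Python. The instruction strings below
# are illustrative examples, not from the original source:
#
#   python generate.py "Explain what a LoRA adapter is in one paragraph."
#
#   >>> from generate import gen
#   >>> print(gen("List three uses of 4-bit quantization."))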