## Inference Code

```
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

# Load the base model in 4-bit NF4 with bfloat16 compute, matching the training setup.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)
device_map = {"": 0}

tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-40b")
# trust_remote_code=True is required because Falcon ships its own modeling code.
model = AutoModelForCausalLM.from_pretrained(
    "tiiuae/falcon-40b",
    quantization_config=bnb_config,
    device_map=device_map,
    trust_remote_code=True,
)

# Attach the fine-tuned LoRA adapter on top of the quantized base model.
model_id = "smangrul/falcon-40B-int4-peft-lora-sfttrainer"
model = PeftModel.from_pretrained(model, model_id)

text = "### Human: Write a tweet celebrating the Apache-2 release of Falcon models which are generative Large Language Models (LLMs) on which you have been finetuned. Previously, it was under a bit of a restrictive license. Make the tweet punchy, energetic, exciting and marketable.### Assistant:"

# Move the prompt to the same device as the model before generating.
input_ids = tokenizer(text, return_tensors="pt").input_ids.to("cuda")
outputs = model.generate(
    input_ids=input_ids,
    max_new_tokens=256,
    temperature=0.7,
    top_p=0.9,
    do_sample=True,
)
print(tokenizer.batch_decode(outputs))
```
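For long generations it can be nicer to watch tokens appear as they are produced instead of waiting for the full output. The sketch below shows one way to do this with `transformers.TextStreamer`; it reuses the `model`, `tokenizer`, and `text` defined above and is an optional convenience, not part of the original script.

```
from transformers import TextStreamer

# Stream decoded tokens to stdout as they are generated,
# skipping the prompt so only the model's reply is printed.
streamer = TextStreamer(tokenizer, skip_prompt=True)

input_ids = tokenizer(text, return_tensors="pt").input_ids.to("cuda")
model.generate(
    input_ids=input_ids,
    streamer=streamer,
    max_new_tokens=256,
    temperature=0.7,
    top_p=0.9,
    do_sample=True,
)
```

`generate` still returns the full token sequence, so the streamer can be combined with the `batch_decode` call from the main script if you also want the final string.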