## Inference Code

```
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

# Load the base model in 4-bit NF4 with bfloat16 compute, matching the training setup.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)
device_map = {"": 0}

tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-40b")
# trust_remote_code=True is required because Falcon ships its own modeling code.
model = AutoModelForCausalLM.from_pretrained(
    "tiiuae/falcon-40b",
    quantization_config=bnb_config,
    device_map=device_map,
    trust_remote_code=True,
)

# Attach the fine-tuned LoRA adapter on top of the quantized base model.
model_id = "smangrul/falcon-40B-int4-peft-lora-sfttrainer"
model = PeftModel.from_pretrained(model, model_id)

text = "### Human: Write a tweet celebrating the Apache-2 release of Falcon models which are generative Large Language Models (LLMs) on which you have been finetuned. Previously, it was under a bit of a restrictive license. Make the tweet punchy, energetic, exciting and marketable.### Assistant:"

# Move the prompt to the same device as the model before generating.
input_ids = tokenizer(text, return_tensors="pt").input_ids.to("cuda")
outputs = model.generate(
    input_ids=input_ids,
    max_new_tokens=256,
    temperature=0.7,
    top_p=0.9,
    do_sample=True,
)
print(tokenizer.batch_decode(outputs))
```
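For long generations it can be nicer to watch tokens appear as they are produced instead of waiting for the full output. The sketch below shows one way to do this with `transformers.TextStreamer`; it reuses the `model`, `tokenizer`, and `text` defined above and is an optional convenience, not part of the original script.

```
from transformers import TextStreamer

# Stream decoded tokens to stdout as they are generated,
# skipping the prompt so only the model's reply is printed.
streamer = TextStreamer(tokenizer, skip_prompt=True)

input_ids = tokenizer(text, return_tensors="pt").input_ids.to("cuda")
model.generate(
    input_ids=input_ids,
    streamer=streamer,
    max_new_tokens=256,
    temperature=0.7,
    top_p=0.9,
    do_sample=True,
)
```

`generate` still returns the full token sequence, so the streamer can be combined with the `batch_decode` call from the main script if you also want the final string.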