import time
from typing import Any, Dict

import torch
from peft import PeftConfig, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig


class EndpointHandler:
    def __init__(self, path="samadeniyi/lora_lesson_plan_model"):
        # Load the PEFT adapter configuration to find the base model
        config = PeftConfig.from_pretrained(path)

        # Define the 4-bit quantization configuration
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
        )

        # Load the base model with 4-bit quantization
        # (quantization_config already carries load_in_4bit, so it is not passed twice)
        self.model = AutoModelForCausalLM.from_pretrained(
            config.base_model_name_or_path,
            return_dict=True,
            device_map={"": 0},
            trust_remote_code=True,
            quantization_config=bnb_config,
        )

        # Load the tokenizer and make sure a pad token is set
        self.tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
        self.tokenizer.pad_token = self.tokenizer.eos_token

        # Attach the LoRA (PEFT) adapter weights to the base model
        self.model = PeftModel.from_pretrained(self.model, path)
    def __call__(self, data: Any) -> Dict[str, Any]:
        """
        Args:
            data (:obj:`dict`): The request payload, either
                {"inputs": {"instruction": "...", "input": "..."}} or a bare
                {"instruction": "...", "input": "..."} dict:
                - "instruction": The instruction describing what to generate.
                - "input": Context to guide the generation.

        Returns:
            A :obj:`dict` containing {"generated_text": "...", "time": "..."}:
                - "generated_text": The generated lesson plan.
                - "time": The time taken to generate the output.
        """
        # Parse the input payload
        inputs = data.pop("inputs", data)
        instruction = inputs.get("instruction", "")
        input_context = inputs.get("input", "")
        # Build the lesson-plan prompt in the same format used during fine-tuning
        lesson_prompt = f"""Below is an instruction that describes how to create a lesson plan, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{instruction}
### Input:
{input_context}
### Response:
"""
        # Tokenize the prompt
        batch = self.tokenizer(
            lesson_prompt,
            padding=True,
            truncation=True,
            return_tensors="pt",
        )
        batch = batch.to("cuda:0")
        # Configure generation settings
        generation_config = self.model.generation_config
        generation_config.do_sample = True  # required for top_p/temperature to take effect
        generation_config.top_p = 0.7
        generation_config.temperature = 0.7
        generation_config.max_new_tokens = 256
        generation_config.num_return_sequences = 1
        generation_config.pad_token_id = self.tokenizer.eos_token_id
        generation_config.eos_token_id = self.tokenizer.eos_token_id
        # Time the prediction
        start = time.time()
        with torch.cuda.amp.autocast():
            output_tokens = self.model.generate(
                input_ids=batch.input_ids,
                attention_mask=batch.attention_mask,
                generation_config=generation_config,
            )
        end = time.time()

        # Decode the generated tokens into text
        generated_text = self.tokenizer.decode(output_tokens[0], skip_special_tokens=True)

        # Return the generated text and the time taken
        return {"generated_text": generated_text, "time": f"{(end - start):.2f} s"}