Context / handler.py
Jefferson de Azevedo
Create handler.py
fdd590b verified
from unsloth import FastLanguageModel
import torch
class EndpointHandler:
    """Callable endpoint handler that serves chat completions from an Unsloth model.

    The model is loaded once at construction time (4-bit weights, float16
    compute dtype) and switched to Unsloth's inference mode. Each call turns
    the request text into a single-turn chat prompt, generates a completion
    and returns only the newly generated text.
    """

    def __init__(self, path=""):
        """Load the model and tokenizer found at ``path``.

        Args:
            path: Model name or directory handed to
                ``FastLanguageModel.from_pretrained``.
        """
        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
            model_name=path,
            max_seq_length=2048,
            dtype=torch.float16,
            load_in_4bit=True,
        )
        FastLanguageModel.for_inference(self.model)

    def __call__(self, data: dict):
        """Generate a reply for one request payload.

        Args:
            data: Request payload. ``data["inputs"]`` is the user message
                (defaults to ``""``). ``data["parameters"]`` is an optional
                dict that may carry ``max_new_tokens`` (default 512) and
                ``temperature`` (default 0.7). A temperature that is ``None``
                or not strictly positive selects greedy decoding instead of
                sampling.

        Returns:
            ``{"generated_text": <reply>}`` where the reply holds only the
            tokens produced by the model, without the prompt.
        """
        # Read without popping so the caller's payload is left untouched, and
        # tolerate an explicit ``"parameters": null``.
        inputs_text = data.get("inputs", "")
        parameters = data.get("parameters") or {}

        # Format the message with the tokenizer's chat template (LLaMA 3).
        messages = [{"role": "user", "content": inputs_text}]
        formatted = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )

        # The rendered template already contains its special tokens, so do not
        # let the tokenizer prepend them a second time (duplicate BOS).
        inputs = self.tokenizer(
            formatted,
            return_tensors="pt",
            add_special_tokens=False,
        ).to("cuda")

        generation_kwargs = {
            "max_new_tokens": parameters.get("max_new_tokens", 512),
            "pad_token_id": self.tokenizer.eos_token_id,
        }
        temperature = parameters.get("temperature", 0.7)
        if temperature is not None and temperature > 0:
            generation_kwargs["do_sample"] = True
            generation_kwargs["temperature"] = temperature
        else:
            # Sampling requires a strictly positive temperature; fall back to
            # greedy decoding rather than letting generate() raise.
            generation_kwargs["do_sample"] = False

        outputs = self.model.generate(**inputs, **generation_kwargs)

        # Return only the answer: the generated sequence starts with the
        # prompt tokens, so skip them before decoding.
        prompt_length = inputs["input_ids"].shape[-1]
        completion_ids = outputs[0][prompt_length:]
        decoded = self.tokenizer.decode(completion_ids, skip_special_tokens=True)
        return {"generated_text": decoded}