from unsloth import FastLanguageModel
import torch


class EndpointHandler:
    """Callable request handler that serves a chat model loaded via Unsloth.

    The model and tokenizer are loaded once at construction time; each call
    then formats a single user message with the tokenizer's chat template,
    generates a completion, and returns only the newly generated text.
    """

    def __init__(self, path=""):
        """Load the model and tokenizer from ``path`` and prepare for inference.

        Args:
            path: Model name or directory forwarded to
                ``FastLanguageModel.from_pretrained`` as ``model_name``.
        """
        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
            model_name=path,
            max_seq_length=2048,
            dtype=torch.float16,
            load_in_4bit=True,
        )
        # Switch the freshly loaded model to Unsloth's inference setup.
        FastLanguageModel.for_inference(self.model)

    def __call__(self, data: dict) -> dict:
        """Generate a reply for one request payload.

        Args:
            data: Request payload. ``"inputs"`` holds the user message
                (defaults to ``""``) and ``"parameters"`` optionally holds
                ``max_new_tokens`` (default 512) and ``temperature``
                (default 0.7). Both keys are removed from ``data``.

        Returns:
            ``{"generated_text": <reply>}`` where the reply contains only the
            tokens produced by the model, not the prompt.
        """
        inputs_text = data.pop("inputs", "")
        # A payload with `"parameters": null` yields None; treat it as empty.
        parameters = data.pop("parameters", None) or {}

        # Wrap the message in the tokenizer's chat template (the original
        # author targets the LLaMA 3 template).
        messages = [{"role": "user", "content": inputs_text}]
        formatted = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        inputs = self.tokenizer(formatted, return_tensors="pt").to("cuda")

        generation_kwargs = {
            "max_new_tokens": parameters.get("max_new_tokens", 512),
            "pad_token_id": self.tokenizer.eos_token_id,
        }
        temperature = parameters.get("temperature", 0.7)
        if temperature is not None and temperature > 0:
            generation_kwargs["do_sample"] = True
            generation_kwargs["temperature"] = temperature
        else:
            # Sampling requires a strictly positive temperature; a request
            # for temperature 0 is served with greedy decoding instead of
            # failing inside generate().
            generation_kwargs["do_sample"] = False

        outputs = self.model.generate(**inputs, **generation_kwargs)

        # The output sequence starts with the prompt tokens; drop them so
        # only the model's reply is decoded and returned.
        prompt_length = inputs["input_ids"].shape[1]
        generated_ids = outputs[0][prompt_length:]
        decoded = self.tokenizer.decode(generated_ids, skip_special_tokens=True)
        return {"generated_text": decoded}