import re
from typing import Dict, List, Any

from unsloth import FastLanguageModel


class EndpointHandler:
    """Custom inference handler that serves a chat model loaded through Unsloth.

    The model and tokenizer are loaded once in ``__init__``; each call renders
    the incoming messages with the tokenizer's chat template, generates a
    completion, and returns the reply text with the prompt removed.
    """

    # Matches every "[INST] ... [/INST]" prompt segment (across newlines) so
    # that the echoed prompt can be removed from the decoded output.
    _PROMPT_PATTERN = re.compile(r'\[INST\].*?\[/INST\]', flags=re.DOTALL)

    def __init__(self, path=""):
        """Load the model and tokenizer found at ``path``.

        Args:
            path: Model name or directory handed to
                ``FastLanguageModel.from_pretrained``.
        """
        max_seq_length = 2048
        dtype = None
        load_in_4bit = True

        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
            model_name=path,  # the model that was used for training
            max_seq_length=max_seq_length,
            dtype=dtype,
            load_in_4bit=load_in_4bit,
        )
        FastLanguageModel.for_inference(self.model)  # Enable native 2x faster inference

    def __call__(self, data: Dict[str, Any]) -> str:
        """Generate a reply for one conversation.

        Args:
            data: Request payload. The conversation is read from (and popped
                off) the ``"inputs"`` key; if that key is absent the whole
                payload is used as the conversation. Example conversation::

                    [{"from": "human",
                      "value": "What is a famous tall tower in Paris?"}]

        Returns:
            The generated text with ``[INST] ... [/INST]`` prompt segments and
            the tokenizer's special tokens removed, stripped of surrounding
            whitespace.
        """
        messages = data.pop("inputs", data)

        inputs = self.tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,  # Must add for generation
            return_tensors="pt",
        ).to("cuda")

        outputs = self.model.generate(
            input_ids=inputs,
            max_new_tokens=1000,
            use_cache=True,
        )

        # `batch_decode` returns a list of strings, which cannot be passed to
        # `re.sub` / `str.replace`. Decode the generated sequence to a single
        # string instead.
        # NOTE(review): assumes one conversation per request, so only the
        # first row of `outputs` is decoded - confirm against callers.
        decoded = self.tokenizer.decode(outputs[0])
        return self._clean_generation(decoded)

    def _clean_generation(self, text: str) -> str:
        """Remove the echoed prompt and special tokens from decoded text."""
        # Drop the prompt first, while the [INST] markers are still present.
        text = self._PROMPT_PATTERN.sub('', text)

        # The previous `.replace('', '')` calls were no-ops (their token
        # literals were empty), so markers such as BOS/EOS leaked into the
        # reply. Strip whatever special tokens this tokenizer defines.
        for token in self.tokenizer.all_special_tokens:
            if token:
                text = text.replace(token, '')

        return text.strip()