from typing import Any, Dict, List

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


class EndpointHandler:
    def __init__(self, path: str = ""):
        # Preload everything needed at inference time.
        self.model = AutoModelForCausalLM.from_pretrained(
            path,
            trust_remote_code=True,
            quantization_config=None,
            torch_dtype=torch.float,  # load weights in full (fp32) precision
            device_map="auto",
        )
        self.tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
        self.tokenizer.padding_side = "left"
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.tokenizer.add_eos_token = True

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Args:
            data: request payload with an "inputs" key holding the prompt string.
        Return:
            A list with a single dict, {"outputs": <generated text>}, which is
            serialized and returned to the caller.
        """
        inputs = data.pop("inputs", data)

        # Wrap the raw prompt in the model's chat template.
        messages = [
            {"role": "user", "content": inputs},
        ]
        encodeds = self.tokenizer.apply_chat_template(messages, return_tensors="pt")
        encoded_length = len(encodeds[0])
        model_inputs = encodeds.to(self.model.device)

        # Greedy decoding; scores, attentions, and hidden states are returned in
        # `result`, but only the generated token ids are used below.
        result = self.model.generate(
            model_inputs,
            do_sample=False,
            output_scores=True,
            return_dict_in_generate=True,
            output_attentions=True,
            output_hidden_states=True,
            # num_beams=3,
            # no_repeat_ngram_size=1,
            early_stopping=True,  # only takes effect with beam search (num_beams > 1)
            # top_k=0,
            max_new_tokens=400,
        )
        x, logits_gen = result.sequences, result.scores

        # Drop the prompt tokens so only the newly generated text is decoded.
        x = x[:, encoded_length:]
        decoded = self.tokenizer.batch_decode(x)
        return [{"outputs": decoded[0]}]
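

# A minimal local smoke test (a sketch, not part of the handler contract): the
# "sjster/test_medium" checkpoint id and the example prompt are illustrative
# assumptions. Hugging Face Inference Endpoints instantiate EndpointHandler and
# call it with the same {"inputs": ...} payload shape.
if __name__ == "__main__":
    handler = EndpointHandler(path="sjster/test_medium")
    response = handler({"inputs": "Summarize what an inference endpoint does."})
    print(response[0]["outputs"])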