import io
import urllib.request
from typing import Any, Dict, List

from PIL import Image
from transformers import AutoModel, AutoTokenizer


class EndpointHandler():
    """Inference handler that answers a text prompt about a single image.

    The model and tokenizer are loaded once in ``__init__`` and reused for
    every request handled by ``__call__``.
    """

    # Checkpoint used for both the model and its tokenizer.
    MODEL_ID = 'openbmb/MiniCPM-Llama3-V-2_5-int4'

    # Upper bound on how long a remote image download may block a request.
    FETCH_TIMEOUT_SECONDS = 30

    def __init__(self, path=""):
        """Preload everything needed at inference time.

        Args:
            path: Accepted for interface compatibility; it is not used,
                because the checkpoint is always loaded from ``MODEL_ID``.
        """
        # trust_remote_code=True executes Python shipped with the checkpoint;
        # NOTE(review): consider pinning a revision so that code cannot
        # change underneath the deployment.
        self.model = AutoModel.from_pretrained(
            self.MODEL_ID, trust_remote_code=True
        )
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.MODEL_ID, trust_remote_code=True
        )

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Run one chat turn about the image referenced in ``data``.

        Args:
            data: Request payload. Must contain ``"image_url"`` (an
                http(s) URL or a local file path) and ``"message"`` (the
                user prompt). The dict is not modified.

        Returns:
            A one-element list ``[{"generated_text": answer}]`` where
            ``answer`` is whatever ``self.model.chat`` returned.

        Raises:
            KeyError: If ``"image_url"`` or ``"message"`` is missing.
        """
        for required_key in ("image_url", "message"):
            if required_key not in data:
                raise KeyError(
                    f"request payload is missing required key "
                    f"'{required_key}'"
                )

        image = self._load_image(data["image_url"])
        messages = [{'role': 'user', 'content': data["message"]}]

        answer = self.model.chat(
            image=image,
            msgs=messages,
            tokenizer=self.tokenizer,
            sampling=True,  # if sampling=False, beam_search will be used by default
            temperature=0.7,
            # system_prompt=''  # pass system_prompt if needed
        )
        # Wrap the answer so the result matches the declared return type.
        return [{"generated_text": answer}]

    def _load_image(self, source):
        """Return the image at ``source`` as an RGB ``PIL.Image``.

        ``source`` is downloaded when it starts with ``http://`` or
        ``https://``; any other value is handed to ``Image.open`` as is.
        """
        if isinstance(source, str) and source.lower().startswith(
            ("http://", "https://")
        ):
            # SECURITY: the URL comes from the request, so this endpoint will
            # fetch arbitrary http(s) locations (SSRF risk). Only http(s) is
            # accepted here; restrict hosts upstream if the endpoint is
            # exposed to untrusted callers.
            with urllib.request.urlopen(
                source, timeout=self.FETCH_TIMEOUT_SECONDS
            ) as response:
                payload = response.read()
            source = io.BytesIO(payload)

        # convert() returns a new, fully decoded image, so the underlying
        # file or buffer can be closed as soon as it has run.
        with Image.open(source) as raw_image:
            return raw_image.convert("RGB")