import io
from typing import Dict, List, Any
from urllib.request import urlopen

from PIL import Image
from transformers import AutoModel, AutoTokenizer

class EndpointHandler():
    """Inference handler that sends one image plus one user message to the model.

    The model and tokenizer are loaded once in ``__init__``; every call to
    the instance opens the requested image, wraps the message as a single
    user turn and forwards both to ``self.model.chat``.
    """

    # Hub repository id shared by the model and its tokenizer.
    MODEL_ID = 'openbmb/MiniCPM-Llama3-V-2_5-int4'
    # Seconds to wait for a remote image before giving up.
    FETCH_TIMEOUT = 30

    def __init__(self, path=""):
        """Load the model and tokenizer.

        Args:
            path: Accepted for interface compatibility but not used; the
                weights are always loaded from ``MODEL_ID``.
        """
        # Preload all the elements you are going to need at inference.
        # NOTE(review): trust_remote_code=True executes Python code shipped
        # in the model repository.
        self.model = AutoModel.from_pretrained(self.MODEL_ID, trust_remote_code=True)
        self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL_ID, trust_remote_code=True)

    def __call__(self, data: Dict[str, Any]) -> Any:
        """Answer ``data["message"]`` about the image at ``data["image_url"]``.

        Args:
            data: Request payload. ``"image_url"`` is an http(s) URL, a local
                path, or a file object; ``"message"`` is the user's prompt.
                Both keys are removed from ``data``.

        Returns:
            Whatever ``self.model.chat`` returns, unmodified (presumably the
            generated text -- confirm against the model card).

        Raises:
            KeyError: If ``"image_url"`` or ``"message"`` is missing.
        """
        # Read both required keys before any I/O so a malformed request
        # fails before an image is fetched or opened.
        image_url = data.pop("image_url")
        message = data.pop("message")

        image = self._load_image(image_url)
        msgs = [{'role': 'user', 'content': message}]
        return self.model.chat(
            image=image,
            msgs=msgs,
            tokenizer=self.tokenizer,
            sampling=True,  # if sampling=False, beam_search will be used by default
            temperature=0.7,
            # system_prompt='' # pass system_prompt if needed
        )

    def _load_image(self, source):
        """Open ``source`` (http(s) URL, local path or file object) as RGB."""
        is_remote = isinstance(source, str) and source.lower().startswith(
            ("http://", "https://")
        )
        if is_remote:
            # Image.open() cannot read URLs, so download the bytes into
            # memory and let it read from the buffer instead.
            with urlopen(source, timeout=self.FETCH_TIMEOUT) as response:
                source = io.BytesIO(response.read())
        return Image.open(source).convert("RGB")