from typing import Any, Dict, List

import requests
from PIL import Image
from transformers import BlipForConditionalGeneration, BlipProcessor


class EndpointHandler:
    """Callable handler that captions an image fetched from a URL using BLIP.

    The processor and model are loaded once at construction time and reused
    for every call.
    """

    # Checkpoint identifier used for both the processor and the model.
    MODEL_ID = "Salesforce/blip-image-captioning-large"

    # Seconds `requests` waits for the connection and for each socket read.
    # Without a timeout an unresponsive host would block the handler forever.
    REQUEST_TIMEOUT = 10

    def __init__(self, path=""):
        """Load the BLIP processor and captioning model.

        Args:
            path: Accepted for interface compatibility with the caller; it is
                not used, the weights are always loaded from ``MODEL_ID``.
        """
        self.processor = BlipProcessor.from_pretrained(self.MODEL_ID)
        self.model = BlipForConditionalGeneration.from_pretrained(self.MODEL_ID)

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Generate a caption for the image referenced by ``data["image_url"]``.

        Args:
            data: Request payload. The ``image_url`` key (a URL string) is
                popped from it, so the dict is mutated.

        Returns:
            A one-element list holding either ``{"caption": <text>}`` on
            success or ``{"error": <message>}`` when the URL is missing or the
            image cannot be downloaded/decoded. Errors are reported in the
            payload rather than raised.
        """
        image_url = data.pop("image_url", None)
        if image_url is None:
            return [{"error": "image_url not provided"}]

        try:
            # The context manager releases the connection even when decoding
            # fails; the original left the streamed response open.
            with requests.get(
                image_url, stream=True, timeout=self.REQUEST_TIMEOUT
            ) as response:
                # Fail early on 4xx/5xx instead of handing an error page to PIL.
                response.raise_for_status()
                # `raw` skips content-encoding handling unless asked to decode.
                response.raw.decode_content = True
                # convert() forces the pixel data to be read while the
                # response is still open.
                raw_image = Image.open(response.raw).convert("RGB")
        except Exception:
            # Deliberately broad: network, HTTP and image-decoding failures all
            # map to the same error payload. Unlike a bare `except`, this lets
            # KeyboardInterrupt/SystemExit propagate.
            return [{"error": "unable to load image from the provided URL"}]

        # Unconditional captioning: only the image is passed, no text prompt.
        inputs = self.processor(raw_image, return_tensors="pt")
        out = self.model.generate(**inputs)

        caption = self.processor.decode(out[0], skip_special_tokens=True)
        return [{"caption": caption}]