# blip2-image-to-text / handler.py
from typing import Any, Dict
from io import BytesIO
import base64
import re

import torch
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration

class EndpointHandler:
    def __init__(self, path=""):
        # load the BLIP-2 processor and model once at startup;
        # device_map="auto" places the weights on the available device(s)
        self.processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
        self.model = Blip2ForConditionalGeneration.from_pretrained(
            "Salesforce/blip2-opt-2.7b", device_map="auto"
        )

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Args:
            data (:obj:`dict`):
                Includes the input data and the parameters for the inference.
        Return:
            A :obj:`dict` with a single list, e.g. {"captions": ["A hugging face at the office"]}, containing:
            - "captions": A list with the generated caption string.
        """
        # decode the base64 payload, stripping an optional data-URI prefix
        image_bytes = base64.b64decode(re.sub('^data:image/.+;base64,', '', data['inputs']))
        raw_image = Image.open(BytesIO(image_bytes)).convert('RGB')

        # preprocess the image and move the tensors to the model's device
        processed_image = self.processor(images=raw_image, return_tensors="pt").to(self.model.device)

        # generate and decode the caption
        out = self.model.generate(**processed_image)
        caption = self.processor.decode(out[0], skip_special_tokens=True).strip()
        return {"captions": [caption]}
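

if __name__ == "__main__":
    # Minimal local smoke test; a sketch only. The Inference Endpoints
    # toolkit instantiates EndpointHandler itself in production, so this
    # block never runs there. The image path below is a hypothetical
    # placeholder; point it at any image on disk.
    handler = EndpointHandler()
    with open("test.jpg", "rb") as f:  # hypothetical sample image
        payload = {"inputs": base64.b64encode(f.read()).decode("utf-8")}
    print(handler(payload))  # e.g. {"captions": ["..."]}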