# blip_captioning/handler.py
from typing import Dict, Any
from io import BytesIO
import os

import torch
from PIL import Image
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode

from models.blip_decoder import blip_decoder

# Run on GPU when available, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
class EndpointHandler():
    def __init__(self, path=""):
        # Load the BLIP large captioning checkpoint shipped with the repository.
        self.model_path = os.path.join(path, 'model_large_caption.pth')
        self.model = blip_decoder(
            pretrained=self.model_path,
            image_size=384,
            vit='large',
            med_config=os.path.join(path, 'configs/med_config.json')
        )
        self.model.eval()
        self.model = self.model.to(device)

        # Preprocessing: resize to the model's 384x384 input and normalize with
        # the mean/std used during BLIP training.
        image_size = 384
        self.transform = transforms.Compose([
            transforms.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
            transforms.ToTensor(),
            transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
        ])
    def __call__(self, data: Any) -> Dict[str, Any]:
        """
        Args:
            data (:obj:`dict`):
                Includes the input data and the parameters for the inference.
        Return:
            A :obj:`dict` with a single key, e.g. {"caption": ["A hugging face at the office"]}, where:
            - "caption": a list containing the generated caption string.
        """
        inputs = data.pop("inputs", data)
        parameters = data.pop("parameters", {})

        # Decode the raw image bytes and convert to RGB so the 3-channel
        # normalization defined above always applies.
        image = Image.open(BytesIO(inputs['image'])).convert('RGB')
        image = self.transform(image).unsqueeze(0).to(device)

        # Generate a caption with nucleus sampling by default; callers can
        # override the decoding parameters via "parameters".
        with torch.no_grad():
            caption = self.model.generate(
                image,
                sample=parameters.get('sample', True),
                top_p=parameters.get('top_p', 0.9),
                max_length=parameters.get('max_length', 20),
                min_length=parameters.get('min_length', 5)
            )
        return {"caption": caption}