File size: 2,230 Bytes

a583978
 
 
 
cb05228
f3126f3
cb05228
a79f819
a583978
 
 
 
 
 
a7a4721
a583978
f0b2295
baa2ff5
 
 
 
 
 
2a79ef4
8576dce
2a79ef4
 
 
 
 
 
 
 
a583978
 
a7a4721
a583978
 
 
 
 
baa2ff5
 
a583978
cb05228
baa2ff5
a583978
cb05228
 
8576dce
2a79ef4
baa2ff5
 
 
 
 
 
 
a583978
baa2ff5

from typing import  Dict, List, Any
from PIL import Image
import requests
import torch
import base64
import os
from io import BytesIO
from models.blip_decoder import blip_decoder
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode

# Pick the compute device once at import time: use CUDA when torch reports it
# as available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

class PreTrainedPipeline():
    """Image-captioning pipeline built around a BLIP decoder checkpoint.

    The checkpoint is loaded once at construction time. Each call decodes a
    base64-encoded image, preprocesses it and returns the output of
    ``self.model.generate`` for that single image.
    """

    # Side length (in pixels) used both to build the model and to resize
    # incoming images; kept in one place so the two cannot drift apart.
    IMAGE_SIZE = 384

    # Per-channel mean / std handed to ``transforms.Normalize``.
    NORM_MEAN = (0.48145466, 0.4578275, 0.40821073)
    NORM_STD = (0.26862954, 0.26130258, 0.27577711)

    def __init__(self, path=""):
        """Load the model and build the preprocessing transform.

        Args:
            path: Directory containing ``model_large_caption.pth`` and
                ``configs/med_config.json``. Defaults to the current
                working directory (empty string).
        """
        # load the optimized model
        self.model_path = os.path.join(path, 'model_large_caption.pth')
        self.model = blip_decoder(
            pretrained=self.model_path,
            image_size=self.IMAGE_SIZE,
            vit='large',
            med_config=os.path.join(path, 'configs/med_config.json'),
        )
        self.model.eval()
        self.model = self.model.to(device)

        self.transform = transforms.Compose([
            transforms.Resize(
                (self.IMAGE_SIZE, self.IMAGE_SIZE),
                interpolation=InterpolationMode.BICUBIC,
            ),
            transforms.ToTensor(),
            transforms.Normalize(self.NORM_MEAN, self.NORM_STD),
        ])

    @staticmethod
    def _split_request(data):
        """Return ``(inputs, parameters)`` from *data* without mutating it."""
        # Fall back to the payload itself when there is no "inputs" wrapper.
        inputs = data.get("inputs", data)
        # A missing or explicit ``None`` "parameters" entry means "defaults".
        parameters = data.get("parameters") or {}
        return inputs, parameters

    def __call__(self, data: Any) -> Dict[str, Any]:
        """Generate a caption for one base64-encoded image.

        Args:
            data: Request mapping. ``data["inputs"]`` (or ``data`` itself
                when that key is absent) must provide an ``"image"`` entry
                holding the base64-encoded image bytes. The optional
                ``data["parameters"]`` mapping may override ``sample``
                (default ``True``), ``top_p`` (``0.9``), ``max_length``
                (``20``) and ``min_length`` (``5``). The mapping is not
                modified.

        Returns:
            A dict ``{"caption": <output of self.model.generate>}``; the
            generated value is passed through unchanged.

        Raises:
            KeyError: If no ``"image"`` entry is present in the inputs.
        """
        inputs, parameters = self._split_request(data)

        # decode base64 image to PIL
        raw_bytes = base64.b64decode(inputs['image'])
        # Normalize is configured with three channel statistics, so images in
        # any other mode (RGBA, grayscale, palette, ...) are converted to RGB
        # before preprocessing instead of failing on a channel mismatch.
        image = Image.open(BytesIO(raw_bytes)).convert('RGB')

        # Add a leading batch dimension and move to the inference device.
        batch = self.transform(image).unsqueeze(0).to(device)
        with torch.no_grad():
            caption = self.model.generate(
                batch,
                sample=parameters.get('sample', True),
                top_p=parameters.get('top_p', 0.9),
                max_length=parameters.get('max_length', 20),
                min_length=parameters.get('min_length', 5),
            )
        # postprocess the prediction
        return {"caption": caption}