sooh-j
/

blip-image-captioning-base

image-captioning

Model card Files Files and versions

blip-image-captioning-base / handler.py

sooh-j's picture

Upload folder using huggingface_hub

b2d68d7 verified over 1 year ago

history blame contribute delete

2 kB

	import numpy as np
	from transformers import AutoProcessor, AutoTokenizer, AutoImageProcessor, AutoModelForCausalLM, BlipForConditionalGeneration, VisionEncoderDecoderModel
	from typing import Dict, List, Any
	from PIL import Image
	from transformers import pipeline
	import requests
	import torch
	from io import BytesIO
	import base64

	class EndpointHandler():
		"""Image-captioning request handler backed by a BLIP checkpoint.

		The processor and model are loaded once in ``__init__``; every call
		then turns one request payload into one generated caption.
		"""

		# Seconds to wait for a remote image, so a stalled server cannot block
		# the handler indefinitely.
		REQUEST_TIMEOUT = 10
		# Upper bound on the number of tokens generated for one caption.
		MAX_NEW_TOKENS = 50

		def __init__(self, path=""):
			"""Load the processor and the captioning model.

			Args:
				path: Accepted for interface compatibility but not used; the
					checkpoint is always resolved from ``self.model_name``.
			"""
			self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
			print("device:", self.device)
			self.model_name = "sooh-j/blip-image-captioning-base"
			self.processor = AutoProcessor.from_pretrained(self.model_name)
			# ``__call__`` moves the processed inputs to ``self.device``, so the
			# model has to live on the same device or ``generate`` fails with a
			# device mismatch whenever CUDA is available.
			self.model = BlipForConditionalGeneration.from_pretrained(
				self.model_name,
			).to(self.device)

		@staticmethod
		def _is_url(value: str) -> bool:
			"""Return True when *value* is an ``http://`` or ``https://`` URL."""
			return value.strip().lower().startswith(("http://", "https://"))

		@staticmethod
		def _decode_base64_image(value: str) -> bytes:
			"""Decode a base64 image given either bare or as a data URI.

			A data URI has the form ``data:<mime>;base64,<payload>``. The base64
			alphabet contains no comma, so everything after the last comma is
			the payload; a bare base64 string has no comma and is used whole.
			"""
			payload = value.rpartition(",")[2]
			return base64.b64decode(payload)

		def _load_image(self, source: str) -> "Image.Image":
			"""Open the image referenced by *source* (URL or base64 text).

			Raises:
				requests.HTTPError: If the URL answers with an error status.
			"""
			if self._is_url(source):
				response = requests.get(source.strip(), timeout=self.REQUEST_TIMEOUT)
				# Fail on 4xx/5xx instead of handing an error page to PIL.
				response.raise_for_status()
				raw = response.content
			else:
				raw = self._decode_base64_image(source)
			return Image.open(BytesIO(raw))

		def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
			"""Generate a caption for the image described by *data*.

			Args:
				data: Request payload. ``data["inputs"]`` is either a dict whose
					``"image"`` entry is the image reference, or the image
					reference itself. The reference is a string holding an
					http(s) URL, a base64-encoded image, or a base64 data URI.

			Returns:
				A one-element list ``[{"answer": <caption>, "score": 0}]``. The
				score is a constant placeholder; no confidence is computed.

			Raises:
				ValueError: If no non-empty image string is present in *data*.
			"""
			inputs = data.get("inputs")
			source = inputs.get("image") if isinstance(inputs, dict) else inputs
			if not isinstance(source, str) or not source.strip():
				raise ValueError(
					'Expected data["inputs"]["image"] (or data["inputs"]) to be a '
					"non-empty string containing an image URL or base64 data."
				)

			image = self._load_image(source)
			processed = self.processor(images=image, return_tensors="pt").to(self.device)

			# Inference only: skip gradient tracking.
			with torch.no_grad():
				out = self.model.generate(**processed, max_new_tokens=self.MAX_NEW_TOKENS)

			text_output = self.processor.decode(out[0], skip_special_tokens=True)
			return [{"answer": text_output, "score": 0}]