CiaranMn committed
Commit b3f0971 (0 parents)

initial commit

Files changed (3)
  1. README.md +25 -0
  2. handler.py +50 -0
  3. requirements.txt +1 -0
README.md ADDED
@@ -0,0 +1,25 @@
+ ---
+ tags:
+ - image-to-text
+ - image-captioning
+ - endpoints-template
+ license: bsd-3-clause
+ library_name: generic
+ ---
+
+ # Image captioning
+ For deployment as an inference endpoint, using a Custom task type – a fixed version of [this repo](https://huggingface.co/florentgbelidji/blip_captioning).
+
+ ## Request payload
+ ```json
+ {
+   "inputs": ["/9j/4AAQSkZJRgABAQEBLAEsAAD/2wBDAAMCAgICAgMC...."] // base64-encoded image(s)
+ }
+ ```
+
+ ## Response payload
+ ```json
+ {
+   "captions": ["inferred caption for image"]
+ }
+ ```
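For reference, here is a minimal client sketch that matches the request and response payloads above. It is not part of the commit; the endpoint URL, token, and image path are placeholders to replace with your own values, and it assumes the `requests` package is installed.

```python
import base64
import requests

ENDPOINT_URL = "https://<your-endpoint>.endpoints.huggingface.cloud"  # placeholder
HF_TOKEN = "hf_..."  # placeholder access token

# read a local image and base64-encode it, as the endpoint expects
with open("image.jpg", "rb") as f:  # placeholder image path
    encoded = base64.b64encode(f.read()).decode("utf-8")

# POST the payload described in "Request payload"
response = requests.post(
    ENDPOINT_URL,
    headers={"Authorization": f"Bearer {HF_TOKEN}"},
    json={"inputs": [encoded]},
)

# expected shape: {"captions": ["inferred caption for image"]}
print(response.json())
```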
handler.py ADDED
@@ -0,0 +1,50 @@
+ from typing import Dict, Any
+ from PIL import Image
+ import base64
+ import torch
+ from io import BytesIO
+ from transformers import BlipForConditionalGeneration, BlipProcessor
+
+ # run on GPU when available, otherwise fall back to CPU
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+
+ class EndpointHandler():
+     def __init__(self, path=""):
+         # load the BLIP processor and captioning model once, at endpoint startup
+         self.processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+         self.model = BlipForConditionalGeneration.from_pretrained(
+             "Salesforce/blip-image-captioning-base"
+         ).to(device)
+         self.model.eval()
+
+     def __call__(self, data: Any) -> Dict[str, Any]:
+         """
+         Args:
+             data (:obj:`dict`):
+                 includes the input data and the parameters for the inference.
+         Return:
+             A :obj:`dict` with a single list, e.g. {"captions": ["A hugging face at the office"]}, containing:
+             - "captions": one generated caption string per input image.
+         """
+         inputs = data.pop("inputs", data)
+         parameters = data.pop("parameters", {})
+
+         # decode the base64-encoded payloads into PIL images
+         raw_images = [Image.open(BytesIO(base64.b64decode(_img))) for _img in inputs]
+
+         # preprocess the batch and move it to the target device
+         processed_image = self.processor(images=raw_images, return_tensors="pt")
+         processed_image["pixel_values"] = processed_image["pixel_values"].to(device)
+         processed_image = {**processed_image, **parameters}
+
+         # generate caption token ids, then decode them to strings
+         with torch.no_grad():
+             out = self.model.generate(**processed_image)
+         captions = self.processor.batch_decode(out, skip_special_tokens=True)
+
+         return {"captions": captions}
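As a quick local smoke test of the handler above (a sketch, not part of the commit): the image path is an arbitrary example, and the first run downloads the BLIP weights from the Hub.

```python
import base64

from handler import EndpointHandler

# instantiate the handler exactly as the endpoint runtime would
handler = EndpointHandler()

# build the same payload shape the endpoint receives
with open("cats.jpg", "rb") as f:  # placeholder image path
    payload = {
        "inputs": [base64.b64encode(f.read()).decode("utf-8")],
        # optional generation parameters are forwarded to model.generate()
        "parameters": {"max_new_tokens": 20},
    }

result = handler(payload)
print(result)  # {"captions": ["..."]}
```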
requirements.txt ADDED
@@ -0,0 +1 @@
+ git+https://github.com/huggingface/transformers.git@main