sergeipetrov and florentgbelidji (HF staff) committed
Commit 42762fe
0 Parent(s):

Duplicate from florentgbelidji/blip_captioning


Co-authored-by: Florent Gbelidji <florentgbelidji@users.noreply.huggingface.co>

Files changed (4)
  1. .gitattributes +31 -0
  2. README.md +97 -0
  3. handler.py +49 -0
  4. requirements.txt +0 -0
.gitattributes ADDED
@@ -0,0 +1,31 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,97 @@
+ ---
+ tags:
+ - image-to-text
+ - image-captioning
+ - endpoints-template
+ license: bsd-3-clause
+ library_name: generic
+ ---
+
+ # Fork of [salesforce/BLIP](https://github.com/salesforce/BLIP) for an `image-captioning` task on 🤗 Inference Endpoints
+
+ This repository implements a `custom` task for `image-captioning` on 🤗 Inference Endpoints. The code for the customized pipeline is in [pipeline.py](https://huggingface.co/florentgbelidji/blip_captioning/blob/main/pipeline.py).
+ To deploy this model as an Inference Endpoint, you have to select `Custom` as the task so that the `pipeline.py` file is used. -> _double check that it is selected_
+ ### Expected request payload
+ ```json
+ {
+   "inputs": ["/9j/4AAQSkZJRgABAQEBLAEsAAD/2wBDAAMCAgICAgMC...."] // list of base64-encoded images
+ }
+ ```
+ Below is an example of how to run a request using Python and `requests`.
+ ## Run Request
+ 1. Prepare an image.
+ ```bash
+ wget https://huggingface.co/datasets/mishig/sample_images/resolve/main/palace.jpg
+ ```
+ 2. Run the request.
+
+ ```python
+ import base64
+ import requests as r
+
+ ENDPOINT_URL = ""
+ HF_TOKEN = ""
+
+ def predict(path_to_image: str = None):
+     with open(path_to_image, "rb") as i:
+         # base64-encode the image so it can be sent in a JSON payload
+         image = base64.b64encode(i.read()).decode("utf-8")
+     payload = {
+         "inputs": [image],
+         "parameters": {
+             "do_sample": True,
+             "top_p": 0.9,
+             "min_length": 5,
+             "max_length": 20
+         }
+     }
+     response = r.post(
+         ENDPOINT_URL, headers={"Authorization": f"Bearer {HF_TOKEN}"}, json=payload
+     )
+     return response.json()
+
+ prediction = predict(path_to_image="palace.jpg")
+ ```
+ Example parameters depending on the decoding strategy:
+
+ 1. Beam search
+
+ ```
+ "parameters": {
+     "num_beams": 5,
+     "max_length": 20
+ }
+ ```
+
+ 2. Nucleus sampling
+
+ ```
+ "parameters": {
+     "num_beams": 1,
+     "max_length": 20,
+     "do_sample": True,
+     "top_k": 50,
+     "top_p": 0.95
+ }
+ ```
+
+ 3. Contrastive search
+
+ ```
+ "parameters": {
+     "penalty_alpha": 0.6,
+     "top_k": 4,
+     "max_length": 512
+ }
+ ```
+
+ See the [generate()](https://huggingface.co/docs/transformers/v4.25.1/en/main_classes/text_generation#transformers.GenerationMixin.generate) documentation for additional details.
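+
+ Any of the parameter sets above can be sent through the `"parameters"` field of the request payload. Below is a minimal sketch; the `predict_with_params` helper is not part of this repository, it is just the `predict` example above with the generation parameters made an argument:
+
+ ```python
+ import base64
+ import requests as r
+
+ ENDPOINT_URL = ""  # your endpoint URL
+ HF_TOKEN = ""      # your Hugging Face token
+
+ def predict_with_params(path_to_image: str, parameters: dict):
+     # same request as the predict() example above, with the generation
+     # parameters supplied by the caller instead of being hard-coded
+     with open(path_to_image, "rb") as i:
+         image = base64.b64encode(i.read()).decode("utf-8")
+     payload = {"inputs": [image], "parameters": parameters}
+     response = r.post(
+         ENDPOINT_URL, headers={"Authorization": f"Bearer {HF_TOKEN}"}, json=payload
+     )
+     return response.json()
+
+ # e.g. beam search decoding
+ prediction = predict_with_params("palace.jpg", {"num_beams": 5, "max_length": 20})
+ ```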
+
+ Expected output:
+ ```python
+ ['buckingham palace with flower beds and red flowers']
+ ```
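+
+ The captioning logic itself lives in `handler.py`, so it can also be exercised locally before deploying. A minimal sketch, assuming `torch`, `transformers`, and `Pillow` are installed and the code is run from the repository root:
+
+ ```python
+ from handler import EndpointHandler
+
+ # instantiate the handler; this downloads the BLIP checkpoint on first use
+ handler = EndpointHandler()
+
+ # the handler expects a dict with an "inputs" list of images
+ # (raw bytes or base64-encoded strings) plus optional generation "parameters"
+ with open("palace.jpg", "rb") as f:
+     data = {"inputs": [f.read()], "parameters": {"max_length": 20}}
+
+ print(handler(data))  # -> {"captions": ["..."]}
+ ```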
handler.py ADDED
@@ -0,0 +1,49 @@
+ from typing import Dict, Any
+ from io import BytesIO
+ import base64
+
+ import torch
+ from PIL import Image
+ from transformers import BlipForConditionalGeneration, BlipProcessor
+
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+ class EndpointHandler():
+     def __init__(self, path=""):
+         # load the BLIP processor and captioning model
+         self.processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+         self.model = BlipForConditionalGeneration.from_pretrained(
+             "Salesforce/blip-image-captioning-base"
+         ).to(device)
+         self.model.eval()
+
+     def __call__(self, data: Any) -> Dict[str, Any]:
+         """
+         Args:
+             data (:obj:`dict`):
+                 includes the input data ("inputs") and the parameters for the inference ("parameters").
+         Return:
+             A :obj:`dict` with a single key, e.g. {"captions": ["A hugging face at the office"]}, containing:
+             - "captions": a list with one generated caption per input image.
+         """
+         inputs = data.pop("inputs", data)
+         parameters = data.pop("parameters", {})
+
+         # each input may be raw image bytes or a base64-encoded string (e.g. from a JSON payload)
+         raw_images = [
+             Image.open(BytesIO(base64.b64decode(_img) if isinstance(_img, str) else _img))
+             for _img in inputs
+         ]
+
+         processed_image = self.processor(images=raw_images, return_tensors="pt")
+         processed_image["pixel_values"] = processed_image["pixel_values"].to(device)
+         # forward any generation parameters (num_beams, top_p, max_length, ...) to generate()
+         processed_image = {**processed_image, **parameters}
+
+         with torch.no_grad():
+             out = self.model.generate(**processed_image)
+         # decode the generated token ids into caption strings
+         captions = self.processor.batch_decode(out, skip_special_tokens=True)
+         return {"captions": captions}
requirements.txt ADDED
File without changes