austinmw committed on
Commit
298d752
1 Parent(s): 611e172

Upload tool

Files changed (4)
  1. app.py +4 -0
  2. blip_tool.py +69 -0
  3. requirements.txt +2 -0
  4. tool_config.json +5 -0
app.py ADDED
@@ -0,0 +1,4 @@
+ from transformers import launch_gradio_demo
+ from blip_tool import InstructBLIPImageQuestionAnsweringTool
+
+ launch_gradio_demo(InstructBLIPImageQuestionAnsweringTool)
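For reference, `launch_gradio_demo` instantiates the tool and builds a Gradio interface from its declared `inputs` and `outputs`. Outside the demo, the tool can also be called directly. A minimal sketch, assuming a local image file (the path is a placeholder, not part of this commit) and enough GPU memory for the 4-bit vicuna-13b checkpoint:

from PIL import Image

from blip_tool import InstructBLIPImageQuestionAnsweringTool

# The checkpoint is downloaded and loaded lazily on the first call (PipelineTool.setup).
tool = InstructBLIPImageQuestionAnsweringTool()

image = Image.open("photo.jpg")  # placeholder image path
answer = tool(image=image, question="What is happening in this picture?")
print(answer)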
blip_tool.py ADDED
@@ -0,0 +1,69 @@
+ import torch
+ from transformers import AutoModelForVision2Seq, AutoProcessor
+ from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration
+ from transformers.tools import PipelineTool
+ from transformers.tools.base import get_default_device
+ from transformers.utils import requires_backends
+
+ class InstructBLIPImageQuestionAnsweringTool(PipelineTool):
+     #default_checkpoint = "Salesforce/blip2-opt-2.7b"
+     #default_checkpoint = "Salesforce/instructblip-flan-t5-xl"
+     #default_checkpoint = "Salesforce/instructblip-vicuna-7b"
+     default_checkpoint = "Salesforce/instructblip-vicuna-13b"
+
+     description = (
+         "This is a tool that answers a question about an image. It takes an input named `image` which should be the "
+         "image containing the information, as well as a `question` which should be the question in English. It "
+         "returns a text that is the answer to the question."
+     )
+     name = "image_qa"
+     pre_processor_class = AutoProcessor
+     model_class = AutoModelForVision2Seq
+     inputs = ["image", "text"]
+     outputs = ["text"]
+
+     def __init__(self, *args, **kwargs):
+         requires_backends(self, ["vision"])
+         super().__init__(*args, **kwargs)
+
+     def setup(self):
+         """
+         Instantiates the `pre_processor`, `model` and `post_processor` if necessary.
+         """
+         if isinstance(self.pre_processor, str):
+             self.pre_processor = self.pre_processor_class.from_pretrained(self.pre_processor, **self.hub_kwargs)
+
+         if isinstance(self.model, str):
+             self.model = self.model_class.from_pretrained(self.model, **self.model_kwargs, **self.hub_kwargs, load_in_4bit=True, torch_dtype=torch.float16)
+
+         if self.post_processor is None:
+             self.post_processor = self.pre_processor
+         elif isinstance(self.post_processor, str):
+             self.post_processor = self.post_processor_class.from_pretrained(self.post_processor, **self.hub_kwargs)
+
+         if self.device is None:
+             if self.device_map is not None:
+                 self.device = list(self.model.hf_device_map.values())[0]
+             else:
+                 self.device = get_default_device()
+
+         self.is_initialized = True
+
+     def encode(self, image, question: str):
+         return self.pre_processor(images=image, text=question, return_tensors="pt").to(device="cuda", dtype=torch.float16)
+
+     def forward(self, inputs):
+         outputs = self.model.generate(
+             **inputs,
+             num_beams=5,
+             max_new_tokens=256,
+             min_length=1,
+             top_p=0.9,
+             repetition_penalty=1.5,
+             length_penalty=1.0,
+             temperature=0.7,
+         )
+         return outputs
+
+     def decode(self, outputs):
+         return self.pre_processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
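The class is a thin `PipelineTool` wrapper around the standard InstructBLIP generation loop: `encode` runs the processor, `forward` calls `model.generate` with the beam-search settings above, and `decode` strips special tokens from the first returned sequence. Roughly the same behaviour without the wrapper looks like the sketch below (checkpoint and generation settings copied from the tool, the image path is illustrative, and 4-bit loading additionally needs `bitsandbytes` and `accelerate`, which are not listed in requirements.txt):

import torch
from PIL import Image
from transformers import AutoModelForVision2Seq, AutoProcessor

checkpoint = "Salesforce/instructblip-vicuna-13b"
processor = AutoProcessor.from_pretrained(checkpoint)
model = AutoModelForVision2Seq.from_pretrained(checkpoint, load_in_4bit=True, torch_dtype=torch.float16)

image = Image.open("photo.jpg")  # placeholder path
inputs = processor(images=image, text="What is happening in this picture?", return_tensors="pt").to(
    device="cuda", dtype=torch.float16
)

# Same generation settings as the tool's forward().
output_ids = model.generate(
    **inputs,
    num_beams=5,
    max_new_tokens=256,
    min_length=1,
    top_p=0.9,
    repetition_penalty=1.5,
    length_penalty=1.0,
    temperature=0.7,
)
print(processor.batch_decode(output_ids, skip_special_tokens=True)[0].strip())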
requirements.txt ADDED
@@ -0,0 +1,2 @@
+ transformers
+ torch
tool_config.json ADDED
@@ -0,0 +1,5 @@
+ {
+   "description": "This is a tool that answers a question about an image. It takes an input named `image` which should be the image containing the information, as well as a `question` which should be the question in English. It returns a text that is the answer to the question.",
+   "name": "image_qa",
+   "tool_class": "blip_tool.InstructBLIPImageQuestionAnsweringTool"
+ }
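tool_config.json is what lets `load_tool` resolve the implementation class when the repository is pulled from the Hub. A short sketch of consuming the pushed tool; the repo id below is a placeholder assumption, since the actual Space name is not shown in this commit:

from transformers import HfAgent, load_tool

# Placeholder repo id; substitute the "<user>/<space>" this commit was pushed to.
tool = load_tool("austinmw/instructblip-image-qa")
print(tool.name)  # "image_qa"

# The loaded tool can also be handed to an agent alongside the default toolbox.
agent = HfAgent(
    "https://api-inference.huggingface.co/models/bigcode/starcoder",
    additional_tools=[tool],
)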