macadeliccc commited on
Commit
a5153ba
1 Parent(s): 1059cff
app.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import requests
3
+ from PIL import Image, ImageDraw, ImageFont
4
+ import random
5
+ from transformers import AutoProcessor, AutoModelForVision2Seq
6
+
7
# Load the model and processor once at import time (reused by every request).
# Kosmos-2 is a grounded vision-language model: its generations carry
# <phrase>/<object> markup with patch-index tokens that the processor's
# post-processing converts into normalised bounding boxes.
model = AutoModelForVision2Seq.from_pretrained("microsoft/kosmos-2-patch14-224")
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")
10
+
11
+
12
def draw_bounding_boxes(image: Image.Image, entities):
    """Draw a labelled bounding box on *image* for every grounded entity.

    Args:
        image: PIL image to draw on (mutated in place).
        entities: iterable of ``(label, span, boxes)`` tuples as produced by
            ``processor.post_process_generation``; each box is
            ``(x0, y0, x1, y1)`` normalised to ``[0, 1]``.

    Returns:
        The same image object, with boxes and labels drawn.
    """
    draw = ImageDraw.Draw(image)
    width, height = image.size

    color_bank = [
        "#0AC2FF", "#30D5C8", "#F3C300", "#47FF0A", "#C2FF0A"
    ]

    font_size = 20
    try:
        font = ImageFont.truetype("assets/arial.ttf", font_size)
    except IOError:
        # Bundled font missing — fall back to PIL's built-in bitmap font.
        font = ImageFont.load_default()

    for entity in entities:
        label, _, boxes = entity
        for box in boxes:
            # Scale normalised [0, 1] coordinates up to pixel space.
            box_coords = [
                box[0] * width, box[1] * height,
                box[2] * width, box[3] * height
            ]

            # One colour per box, reused for its label so the pairing is
            # visually unambiguous (previously the two were chosen
            # independently and usually mismatched).
            color = random.choice(color_bank)

            draw.rectangle(box_coords, outline=color, width=4)
            # Clamp the label's y so it stays on-canvas when the box starts
            # near the top edge (previously it could be drawn off-image).
            text_position = (
                box_coords[0] + 5,
                max(0, box_coords[1] - font_size - 5),
            )
            draw.text(text_position, label, fill=color, font=font)

    return image
43
+
44
def highlight_entities(text, entities):
    """Return *text* with each entity label wrapped in asterisks.

    Args:
        text: caption text produced by the model.
        entities: iterable of ``(label, span, boxes)`` tuples; only the
            label is used here.

    Returns:
        The text with every occurrence of each distinct label enclosed in
        ``*...*``. Labels are de-duplicated first: previously a label that
        appeared twice in *entities* was wrapped twice (``**label**``).
        Empty labels are skipped (``str.replace("")`` would corrupt the text).
    """
    seen = set()
    for entity in entities:
        label = entity[0]
        if not label or label in seen:
            continue
        seen.add(label)
        text = text.replace(label, f"*{label}*")
    return text
49
+
50
def process_image(image, prompt_option, custom_prompt):
    """Run Kosmos-2 on *image* and return the annotated results.

    Args:
        image: a PIL image, or anything ``Image.open`` accepts.
        prompt_option: "Brief", "Detailed", or "Custom".
        custom_prompt: prompt text used when *prompt_option* is "Custom".

    Returns:
        A 4-tuple ``(annotated_image, processed_text, entities,
        highlighted_text)``.
    """
    if not isinstance(image, Image.Image):
        image = Image.open(image)

    # Map the preset choices to their grounding prompts; any other option
    # (i.e. "Custom") falls through to the user-supplied prompt.
    preset_prompts = {
        "Brief": "<grounding>An image of",
        "Detailed": "<grounding> Describe this image in detail:",
    }
    prompt = preset_prompts.get(prompt_option, custom_prompt)

    inputs = processor(text=prompt, images=image, return_tensors="pt")
    generated_ids = model.generate(
        pixel_values=inputs["pixel_values"],
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        image_embeds=None,
        image_embeds_position_mask=inputs["image_embeds_position_mask"],
        use_cache=True,
        max_new_tokens=128,
    )
    decoded = processor.batch_decode(generated_ids, skip_special_tokens=True)
    processed_text, entities = processor.post_process_generation(decoded[0])

    # Annotate a copy so the caller's original image stays untouched.
    annotated_image = draw_bounding_boxes(image.copy(), entities)
    highlighted = highlight_entities(processed_text, entities)

    return annotated_image, processed_text, entities, highlighted
81
+
82
def clear_interface():
    """Reset all four wired components (input image, output image, text, entities)."""
    return (None,) * 4
84
+
85
+
86
# Build the Gradio UI. Layout: image in/out on top, prompt options and
# buttons in the middle, text + entity JSON below, examples at the bottom.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Kosmos-2 VQA Demo")
    gr.Markdown("Run this space on your own hardware with this command: ```docker run -it```")

    with gr.Row(equal_height=True):
        image_input = gr.Image(type="pil", label="Upload Image")
        processed_image_output = gr.Image(label="Processed Image")
    with gr.Row(equal_height=True):
        with gr.Column():
            with gr.Accordion("Prompt Options"):
                prompt_option = gr.Radio(choices=["Brief", "Detailed", "Custom"], label="Select Prompt Option", value="Brief")
                custom_prompt_input = gr.Textbox(label="Custom Prompt", visible=False)

            def show_custom_prompt_input(prompt_option):
                # BUG FIX: returning a bare bool set the Textbox *value* to
                # True/False; a visibility update is what shows/hides it.
                return gr.update(visible=prompt_option == "Custom")

            prompt_option.change(show_custom_prompt_input, inputs=[prompt_option], outputs=[custom_prompt_input])

    with gr.Row(equal_height=True):
        submit_button = gr.Button("Run Model")
        clear_button = gr.Button("Clear", elem_id="clear_button")

    with gr.Row(equal_height=True):
        with gr.Column():
            highlighted_entities = gr.Textbox(label="Processed Text")
        with gr.Column():
            with gr.Accordion("Entities"):
                entities_output = gr.JSON(label="Entities", elem_id="entities_output")

    # Define examples
    examples = [
        ["assets/snowman.jpg", "Custom", "<grounding> Question: Where is<phrase> the fire</phrase><object><patch_index_0005><patch_index_0911></object> next to? Answer:"],
        ["assets/traffic.jpg", "Detailed", "<grounding> Describe this image in detail:"],
        ["assets/umbrellas.jpg", "Brief", "<grounding>An image of"],
    ]
    gr.Examples(examples, inputs=[image_input, prompt_option, custom_prompt_input])

    with gr.Row(equal_height=True):
        with gr.Accordion("Additional Info"):
            gr.Markdown("This demo uses the [Kosmos-2]")

    def run_model(image, prompt_option, custom_prompt):
        # BUG FIX: process_image returns 4 values but only 3 components are
        # wired up, which makes Gradio raise on every click. Drop the raw
        # caption and surface the highlighted version in the textbox.
        annotated, _raw_text, entities, highlighted = process_image(
            image, prompt_option, custom_prompt
        )
        return annotated, highlighted, entities

    submit_button.click(
        fn=run_model,
        inputs=[image_input, prompt_option, custom_prompt_input],
        outputs=[processed_image_output, highlighted_entities, entities_output]
    )

    clear_button.click(
        fn=clear_interface,
        inputs=[],
        outputs=[image_input, processed_image_output, highlighted_entities, entities_output]
    )


demo.launch()
assets/arial.ttf ADDED
Binary file (276 kB). View file
 
assets/snowman.jpg ADDED
assets/traffic.jpg ADDED
assets/umbrellas.jpg ADDED
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ git+https://github.com/huggingface/transformers.git
2
+ torch
3
+ datasets
4
+ accelerate
5
+ numpy
6
+ Pillow
7
+ pydantic
8
+ scipy