macadeliccc committed
Commit • a5153ba
1 Parent(s): 1059cff
init
Browse files
- app.py  +141 -0
- assets/arial.ttf  +0 -0
- assets/snowman.jpg  +0 -0
- assets/traffic.jpg  +0 -0
- assets/umbrellas.jpg  +0 -0
- requirements.txt  +9 -0
app.py
ADDED
@@ -0,0 +1,141 @@
import gradio as gr
import requests
from PIL import Image, ImageDraw, ImageFont
import random
from transformers import AutoProcessor, AutoModelForVision2Seq

# Load the model and processor
model = AutoModelForVision2Seq.from_pretrained("microsoft/kosmos-2-patch14-224")
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")


def draw_bounding_boxes(image: Image.Image, entities):
    """Draw a labeled bounding box on the image for every grounded entity."""
    draw = ImageDraw.Draw(image)
    width, height = image.size

    color_bank = [
        "#0AC2FF", "#30D5C8", "#F3C300", "#47FF0A", "#C2FF0A"
    ]

    try:
        font_size = 20
        font = ImageFont.truetype("assets/arial.ttf", font_size)
    except IOError:
        font_size = 20
        font = ImageFont.load_default()

    for entity in entities:
        label, _, boxes = entity
        for box in boxes:
            # Kosmos-2 returns normalized coordinates; scale them to pixel values.
            box_coords = [
                box[0] * width, box[1] * height,
                box[2] * width, box[3] * height
            ]

            outline_color = random.choice(color_bank)
            text_fill_color = random.choice(color_bank)

            draw.rectangle(box_coords, outline=outline_color, width=4)
            text_position = (box_coords[0] + 5, box_coords[1] - font_size - 5)
            draw.text(text_position, label, fill=text_fill_color, font=font)

    return image


def highlight_entities(text, entities):
    """Highlight each entity label in the text by enclosing it in asterisks."""
    for entity in entities:
        label = entity[0]
        text = text.replace(label, f"*{label}*")
    return text


def process_image(image, prompt_option, custom_prompt):
    if not isinstance(image, Image.Image):
        image = Image.open(image)

    # Use the selected prompt option
    if prompt_option == "Brief":
        prompt = "<grounding>An image of"
    elif prompt_option == "Detailed":
        prompt = "<grounding> Describe this image in detail:"
    else:  # Custom
        prompt = custom_prompt

    inputs = processor(text=prompt, images=image, return_tensors="pt")
    generated_ids = model.generate(
        pixel_values=inputs["pixel_values"],
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        image_embeds=None,
        image_embeds_position_mask=inputs["image_embeds_position_mask"],
        use_cache=True,
        max_new_tokens=128,
    )
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    processed_text, entities = processor.post_process_generation(generated_text)

    # Draw bounding boxes on a copy of the image
    processed_image = draw_bounding_boxes(image.copy(), entities)

    highlighted_text = highlight_entities(processed_text, entities)

    # The return order must match the outputs wired to submit_button.click below.
    return processed_image, highlighted_text, entities


def clear_interface():
    return None, None, None, None


with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Kosmos-2 VQA Demo")
    gr.Markdown("Run this space on your own hardware with this command: ```docker run -it```")

    with gr.Row(equal_height=True):
        image_input = gr.Image(type="pil", label="Upload Image")
        processed_image_output = gr.Image(label="Processed Image")

    with gr.Row(equal_height=True):
        with gr.Column():
            with gr.Accordion("Prompt Options"):
                prompt_option = gr.Radio(choices=["Brief", "Detailed", "Custom"], label="Select Prompt Option", value="Brief")
                custom_prompt_input = gr.Textbox(label="Custom Prompt", visible=False)

            def show_custom_prompt_input(prompt_option):
                # Toggle the custom prompt box's visibility instead of returning a bare bool.
                return gr.update(visible=(prompt_option == "Custom"))

            prompt_option.change(show_custom_prompt_input, inputs=[prompt_option], outputs=[custom_prompt_input])

    with gr.Row(equal_height=True):
        submit_button = gr.Button("Run Model")
        clear_button = gr.Button("Clear", elem_id="clear_button")

    with gr.Row(equal_height=True):
        with gr.Column():
            highlighted_entities = gr.Textbox(label="Processed Text")
        with gr.Column():
            with gr.Accordion("Entities"):
                entities_output = gr.JSON(label="Entities", elem_id="entities_output")

    # Define examples
    examples = [
        ["assets/snowman.jpg", "Custom", "<grounding> Question: Where is<phrase> the fire</phrase><object><patch_index_0005><patch_index_0911></object> next to? Answer:"],
        ["assets/traffic.jpg", "Detailed", "<grounding> Describe this image in detail:"],
        ["assets/umbrellas.jpg", "Brief", "<grounding>An image of"],
    ]
    gr.Examples(examples, inputs=[image_input, prompt_option, custom_prompt_input])

    with gr.Row(equal_height=True):
        with gr.Accordion("Additional Info"):
            gr.Markdown("This demo uses the [Kosmos-2]")

    submit_button.click(
        fn=process_image,
        inputs=[image_input, prompt_option, custom_prompt_input],
        outputs=[processed_image_output, highlighted_entities, entities_output]
    )

    clear_button.click(
        fn=clear_interface,
        inputs=[],
        outputs=[image_input, processed_image_output, highlighted_entities, entities_output]
    )


demo.launch()
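Note on the entity format that draw_bounding_boxes consumes: processor.post_process_generation returns the cleaned caption together with a list of (label, (start, end), boxes) tuples, with each box normalized to [0, 1]. The minimal sketch below shows only that box-scaling step on a made-up entity; the label and coordinates are illustrative, not real Kosmos-2 output.

from PIL import Image, ImageDraw

# Illustrative entity in the (label, (start, end), [boxes]) shape returned by
# processor.post_process_generation; the values here are made up.
entities = [("a snowman", (12, 21), [(0.39, 0.20, 0.65, 0.74)])]

image = Image.new("RGB", (224, 224), "white")  # placeholder canvas
draw = ImageDraw.Draw(image)
width, height = image.size

for label, _, boxes in entities:
    for x0, y0, x1, y1 in boxes:
        # Scale the normalized corners to pixels, as draw_bounding_boxes does in app.py.
        draw.rectangle([x0 * width, y0 * height, x1 * width, y1 * height],
                       outline="#0AC2FF", width=4)
        draw.text((x0 * width + 5, y0 * height + 5), label, fill="#F3C300")

image.save("boxes_preview.jpg")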
assets/arial.ttf
ADDED
Binary file (276 kB)
assets/snowman.jpg
ADDED
assets/traffic.jpg
ADDED
assets/umbrellas.jpg
ADDED
requirements.txt
ADDED
@@ -0,0 +1,9 @@
git+https://github.com/huggingface/transformers.git
torch
datasets
accelerate
numpy
Pillow
pydantic
numpy
scipy
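For a quick check of the inference path outside the Gradio UI, here is a minimal sketch that mirrors what process_image in app.py does, run against one of the assets added in this commit; it assumes the working directory is the repository root and that the requirements above are installed.

from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq

# Same checkpoint as app.py
model = AutoModelForVision2Seq.from_pretrained("microsoft/kosmos-2-patch14-224")
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")

image = Image.open("assets/snowman.jpg")   # asset added in this commit
prompt = "<grounding>An image of"          # the "Brief" prompt from app.py

inputs = processor(text=prompt, images=image, return_tensors="pt")
generated_ids = model.generate(
    pixel_values=inputs["pixel_values"],
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    image_embeds=None,
    image_embeds_position_mask=inputs["image_embeds_position_mask"],
    use_cache=True,
    max_new_tokens=128,
)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
caption, entities = processor.post_process_generation(generated_text)
print(caption)
print(entities)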