Yan committed on
Commit 2ca44ee
1 Parent(s): 0227876

added test script and data for local handler testing, fixed syntax error in handler script

Files changed (4):
  1. .gitattributes +1 -0
  2. handler.py +54 -28
  3. test.png +3 -0
  4. test.py +12 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
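(The added line is what `git lfs track "*.png"` writes to .gitattributes, so the new 6.62 MB test.png introduced below is stored as a Git LFS object rather than committed directly to the repository.)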
handler.py CHANGED
@@ -4,12 +4,10 @@ from PIL import Image
 from io import BytesIO
 import numpy as np
 import os
-import requests
 import torch
 import torchvision.transforms as T
 from transformers import AutoProcessor, AutoModelForVision2Seq
 import cv2
-import ast
 
 # set device
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
@@ -18,15 +16,43 @@ if device.type != 'cuda':
 # set mixed precision dtype
 dtype = torch.bfloat16 if torch.cuda.get_device_capability()[0] == 8 else torch.float16
 
+colors = [
+    (0, 255, 0),
+    (0, 0, 255),
+    (255, 255, 0),
+    (255, 0, 255),
+    (0, 255, 255),
+    (114, 128, 250),
+    (0, 165, 255),
+    (0, 128, 0),
+    (144, 238, 144),
+    (238, 238, 175),
+    (255, 191, 0),
+    (0, 128, 0),
+    (226, 43, 138),
+    (255, 0, 255),
+    (0, 215, 255),
+    (255, 0, 0),
+]
+
+color_map = {
+    f"{color_id}": f"#{hex(color[2])[2:].zfill(2)}{hex(color[1])[2:].zfill(2)}{hex(color[0])[2:].zfill(2)}" for color_id, color in enumerate(colors)
+}
+
 
 class EndpointHandler():
     def __init__(self, path=""):
         self.ckpt_id = "ydshieh/kosmos-2-patch14-224"
 
-        self.model = AutoModelForVision2Seq.from_pretrained(ckpt_id, trust_remote_code=True).to("cuda")
-        self.processor = AutoProcessor.from_pretrained(ckpt, trust_remote_code=True)
+        self.model = AutoModelForVision2Seq.from_pretrained(self.ckpt_id, trust_remote_code=True).to("cuda")
+        self.processor = AutoProcessor.from_pretrained(self.ckpt_id, trust_remote_code=True)
+
+    def is_overlapping(self, rect1, rect2):
+        x1, y1, x2, y2 = rect1
+        x3, y3, x4, y4 = rect2
+        return not (x2 < x3 or x1 > x4 or y2 < y3 or y1 > y4)
 
-    def draw_entity_boxes_on_image(image, entities, show=False, save_path=None, entity_index=-1):
+    def draw_entity_boxes_on_image(self, image, entities, show=False, save_path=None, entity_index=-1):
         """_summary_
         Args:
             image (_type_): image or image path
@@ -56,17 +82,17 @@ class EndpointHandler():
             image = np.array(pil_img)[:, :, [2, 1, 0]]
         else:
             raise ValueError(f"invaild image format, {type(image)} for {image}")
-
+
         if len(entities) == 0:
            return image
-
+
         indices = list(range(len(entities)))
         if entity_index >= 0:
             indices = [entity_index]
-
+
         # Not to show too many bboxes
         entities = entities[:len(color_map)]
-
+
         new_image = image.copy()
         previous_bboxes = []
         # size of text
@@ -78,10 +104,10 @@ class EndpointHandler():
         base_height = int(text_height * 0.675)
         text_offset_original = text_height - base_height
         text_spaces = 3
-
+
         # num_bboxes = sum(len(x[-1]) for x in entities)
         used_colors = colors # random.sample(colors, k=num_bboxes)
-
+
         color_id = -1
         for entity_idx, (entity_name, (start, end), bboxes) in enumerate(entities):
             color_id += 1
@@ -91,37 +117,37 @@ class EndpointHandler():
                 # if start is None and bbox_id > 0:
                 #     color_id += 1
                 orig_x1, orig_y1, orig_x2, orig_y2 = int(x1_norm * image_w), int(y1_norm * image_h), int(x2_norm * image_w), int(y2_norm * image_h)
-
+
                 # draw bbox
                 # random color
                 color = used_colors[color_id] # tuple(np.random.randint(0, 255, size=3).tolist())
                 new_image = cv2.rectangle(new_image, (orig_x1, orig_y1), (orig_x2, orig_y2), color, box_line)
-
+
                 l_o, r_o = box_line // 2 + box_line % 2, box_line // 2 + box_line % 2 + 1
-
+
                 x1 = orig_x1 - l_o
                 y1 = orig_y1 - l_o
-
+
                 if y1 < text_height + text_offset_original + 2 * text_spaces:
                     y1 = orig_y1 + r_o + text_height + text_offset_original + 2 * text_spaces
                     x1 = orig_x1 + r_o
-
+
                 # add text background
                 (text_width, text_height), _ = cv2.getTextSize(f" {entity_name}", cv2.FONT_HERSHEY_COMPLEX, text_size, text_line)
                 text_bg_x1, text_bg_y1, text_bg_x2, text_bg_y2 = x1, y1 - (text_height + text_offset_original + 2 * text_spaces), x1 + text_width, y1
-
+
                 for prev_bbox in previous_bboxes:
-                    while is_overlapping((text_bg_x1, text_bg_y1, text_bg_x2, text_bg_y2), prev_bbox):
+                    while self.is_overlapping((text_bg_x1, text_bg_y1, text_bg_x2, text_bg_y2), prev_bbox):
                         text_bg_y1 += (text_height + text_offset_original + 2 * text_spaces)
                         text_bg_y2 += (text_height + text_offset_original + 2 * text_spaces)
                         y1 += (text_height + text_offset_original + 2 * text_spaces)
-
+
                         if text_bg_y2 >= image_h:
                             text_bg_y1 = max(0, image_h - (text_height + text_offset_original + 2 * text_spaces))
                             text_bg_y2 = image_h
                             y1 = image_h
                             break
-
+
                 alpha = 0.5
                 for i in range(text_bg_y1, text_bg_y2):
                     for j in range(text_bg_x1, text_bg_x2):
@@ -133,19 +159,19 @@ class EndpointHandler():
                         # white
                         bg_color = [255, 255, 255]
                         new_image[i, j] = (alpha * new_image[i, j] + (1 - alpha) * np.array(bg_color)).astype(np.uint8)
-
+
                 cv2.putText(
                     new_image, f" {entity_name}", (x1, y1 - text_offset_original - 1 * text_spaces), cv2.FONT_HERSHEY_COMPLEX, text_size, (0, 0, 0), text_line, cv2.LINE_AA
                 )
                 # previous_locations.append((x1, y1))
                 previous_bboxes.append((text_bg_x1, text_bg_y1, text_bg_x2, text_bg_y2))
-
+
         pil_image = Image.fromarray(new_image[:, :, [2, 1, 0]])
         if save_path:
             pil_image.save(save_path)
         if show:
             pil_image.show()
-
+
         return pil_image
 
 
@@ -161,13 +187,13 @@ class EndpointHandler():
         # (https://github.com/microsoft/unilm/blob/f4695ed0244a275201fff00bee495f76670fbe70/kosmos-2/demo/gradio_app.py#L345-L346)
         user_image_path = "/tmp/user_input_test_image.jpg"
         image_input.save(user_image_path)
-
+
         # This might give different results from the original argument `image_input`
         image_input = Image.open(user_image_path)
         text_input = "<grounding>Describe this image in detail:"
         #text_input = f"<grounding>{text_input}"
 
-        inputs = processor(text=text_input, images=image_input, return_tensors="pt")
+        inputs = self.processor(text=text_input, images=image_input, return_tensors="pt")
 
         generated_ids = self.model.generate(
             pixel_values=inputs["pixel_values"].to("cuda"),
@@ -181,7 +207,7 @@ class EndpointHandler():
         generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
         # By default, the generated text is cleanup and the entities are extracted.
-        processed_text, entities = processor.post_process_generation(generated_text)
+        processed_text, entities = self.processor.post_process_generation(generated_text)
 
         annotated_image = self.draw_entity_boxes_on_image(image_input, entities, show=False)
 
@@ -213,10 +239,10 @@ class EndpointHandler():
         colored_text.append((processed_text[end:len(processed_text)], None))
 
         return annotated_image, colored_text, str(filtered_entities)
-
+
     # helper to decode input image
     def decode_base64_image(self, image_string):
         base64_image = base64.b64decode(image_string)
         buffer = BytesIO(base64_image)
         image = Image.open(buffer)
-        return image
+        return image
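For local experimentation, note that the `entities` argument unpacked in `draw_entity_boxes_on_image` above is a list of `(entity_name, (start, end), bboxes)` tuples, where `(start, end)` is a character span into the processed text and each bbox is a normalized `(x1, y1, x2, y2)` tuple; this is the structure `self.processor.post_process_generation` hands back before it is passed to the drawing code. A minimal sketch of driving the drawing code with hand-written entities (hypothetical names and coordinates; constructing the handler still loads the model onto a CUDA device) might look like:

from PIL import Image
from handler import EndpointHandler

handler = EndpointHandler(path=".")

# Hypothetical entities in the (entity_name, (start, end), [normalized bboxes])
# format that draw_entity_boxes_on_image iterates over.
entities = [
    ("a snowman", (0, 9), [(0.39, 0.22, 0.72, 0.92)]),
    ("a fire", (12, 18), [(0.12, 0.64, 0.33, 0.81)]),
]

# Draws labelled boxes on the image and saves the annotated copy.
annotated = handler.draw_entity_boxes_on_image(
    Image.open("test.png"), entities, show=False, save_path="annotated_test.png"
)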
test.png ADDED

Git LFS Details

  • SHA256: 91905662c8deed94cf19ca41c71b48ab013eda1a88d4c5d9cdb97cda96b04f54
  • Pointer size: 132 Bytes
  • Size of remote file: 6.62 MB
test.py ADDED
@@ -0,0 +1,12 @@
+from handler import EndpointHandler
+from PIL import Image
+import base64
+
+# init handler
+my_handler = EndpointHandler(path=".")
+
+# prepare sample payload
+image = Image.open("test.png")
+payload = {"image": base64.b64encode(image)}
+
+pred=my_handler(payload)
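One caveat when running test.py as committed: `base64.b64encode` expects a bytes-like object, so passing the `PIL.Image` returned by `Image.open` raises a `TypeError`. The handler's `decode_base64_image` reverses `base64.b64decode` followed by `Image.open(BytesIO(...))`, which suggests encoding the raw file bytes instead. A sketch of that variant, assuming the handler's `__call__` (not shown in this diff) pulls the image out of `payload["image"]` and routes it through `decode_base64_image`:

import base64
from handler import EndpointHandler

# init handler (loads the Kosmos-2 model onto a CUDA device)
my_handler = EndpointHandler(path=".")

# Encode the raw PNG file bytes; decode_base64_image can reverse exactly this.
with open("test.png", "rb") as f:
    payload = {"image": base64.b64encode(f.read())}

pred = my_handler(payload)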