liuyizhang committed on
Commit
7a7f9d8
1 Parent(s): 5d0da89
app.py CHANGED
@@ -44,7 +44,7 @@ from lama_cleaner.model_manager import ModelManager
44
  from lama_cleaner.schema import Config
45
 
46
  # segment anything
47
- from segment_anything import build_sam, SamPredictor
48
 
49
  # diffusers
50
  import PIL
@@ -238,6 +238,7 @@ groundingdino_model = load_model_hf(config_file, ckpt_repo_id, ckpt_filenmae)
238
  # initialize SAM
239
  logger.info(f"initialize SAM model...")
240
  sam_predictor = SamPredictor(build_sam(checkpoint=sam_checkpoint))
241
 
242
  # initialize stable-diffusion-inpainting
243
  logger.info(f"initialize stable-diffusion-inpainting...")
@@ -319,11 +320,168 @@ def lama_cleaner_process(image, mask):
319
  image = Image.open(io.BytesIO(numpy_to_bytes(res_np_img, 'png')))
320
  return image
321
 
322
  mask_source_draw = "draw a mask on input image"
323
  mask_source_segment = "type what to detect below"
324
 
325
- def run_grounded_sam(input_image, text_prompt, task_type, inpaint_prompt, box_threshold, text_threshold,
326
- iou_threshold, inpaint_mode, mask_source_radio, remove_mode, remove_mask_extend):
327
  text_prompt = text_prompt.strip()
328
  if not ((task_type == 'inpainting' or task_type == 'remove') and mask_source_radio == mask_source_draw):
329
  if text_prompt == '':
@@ -333,7 +491,7 @@ def run_grounded_sam(input_image, text_prompt, task_type, inpaint_prompt, box_th
333
  return [], gr.Gallery.update(label='Please upload an image!😂😂😂😂')
334
 
335
  file_temp = int(time.time())
336
- logger.info(f'run_grounded_sam_[{file_temp}]_{task_type}/{inpaint_mode}/[{mask_source_radio}]/{remove_mode}/{remove_mask_extend}_[{text_prompt}]/[{inpaint_prompt}]___1_')
337
 
338
  # load image
339
  input_mask_pil = input_image['mask']
@@ -364,7 +522,7 @@ def run_grounded_sam(input_image, text_prompt, task_type, inpaint_prompt, box_th
364
  groundingdino_model, image, text_prompt, box_threshold, text_threshold, device=groundingdino_device
365
  )
366
  if boxes_filt.size(0) == 0:
367
- logger.info(f'run_grounded_sam_[{file_temp}]_{task_type}_[{text_prompt}]_1_[No objects detected, please try others.]_')
368
  return [], gr.Gallery.update(label='No objects detected, please try others.😂😂😂😂')
369
  boxes_filt_ori = copy.deepcopy(boxes_filt)
370
 
@@ -380,7 +538,7 @@ def run_grounded_sam(input_image, text_prompt, task_type, inpaint_prompt, box_th
380
  os.remove(image_path)
381
  output_images.append(detection_image_result)
382
 
383
- logger.info(f'run_grounded_sam_[{file_temp}]_{task_type}_2_')
384
  if task_type == 'segment' or ((task_type == 'inpainting' or task_type == 'remove') and mask_source_radio == mask_source_segment):
385
  image = np.array(input_image['image'])
386
  sam_predictor.set_image(image)
@@ -416,15 +574,15 @@ def run_grounded_sam(input_image, text_prompt, task_type, inpaint_prompt, box_th
416
  os.remove(image_path)
417
  output_images.append(segment_image_result)
418
 
419
- logger.info(f'run_grounded_sam_[{file_temp}]_{task_type}_3_')
420
  if task_type == 'detection' or task_type == 'segment':
421
- logger.info(f'run_grounded_sam_[{file_temp}]_{task_type}_9_')
422
  return output_images, gr.Gallery.update(label='result images')
423
  elif task_type == 'inpainting' or task_type == 'remove':
424
  if inpaint_prompt.strip() == '' and mask_source_radio == mask_source_segment:
425
  task_type = 'remove'
426
 
427
- logger.info(f'run_grounded_sam_[{file_temp}]_{task_type}_4_')
428
  if mask_source_radio == mask_source_draw:
429
  mask_pil = input_mask_pil
430
  mask = input_mask
@@ -437,6 +595,8 @@ def run_grounded_sam(input_image, text_prompt, task_type, inpaint_prompt, box_th
437
  mask_pil = Image.fromarray(mask)
438
 
439
  image_path = os.path.join(output_dir, f"image_mask_{file_temp}.jpg")
440
  mask_pil.convert("RGB").save(image_path)
441
  image_result = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
442
  os.remove(image_path)
@@ -480,6 +640,8 @@ def run_grounded_sam(input_image, text_prompt, task_type, inpaint_prompt, box_th
480
  mask_pil = mix_masks(mask_imgs)
481
 
482
  image_path = os.path.join(output_dir, f"image_mask_{file_temp}.jpg")
483
  mask_pil.convert("RGB").save(image_path)
484
  image_result = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
485
  os.remove(image_path)
@@ -492,25 +654,35 @@ def run_grounded_sam(input_image, text_prompt, task_type, inpaint_prompt, box_th
492
  image_inpainting.save(image_path)
493
  image_result = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
494
  os.remove(image_path)
495
- logger.info(f'run_grounded_sam_[{file_temp}]_{task_type}_9_')
496
  output_images.append(image_result)
497
  return output_images, gr.Gallery.update(label='result images')
498
  else:
499
  logger.info(f"task_type:{task_type} error!")
500
- logger.info(f'run_grounded_sam_[{file_temp}]_9_9_')
501
  return output_images, gr.Gallery.update(label='result images')
502
 
503
- def change_radio_display(task_type, mask_source_radio):
504
  text_prompt_visible = True
505
  inpaint_prompt_visible = False
506
  mask_source_radio_visible = False
507
  if task_type == "inpainting":
508
  inpaint_prompt_visible = True
509
  if task_type == "inpainting" or task_type == "remove":
510
  mask_source_radio_visible = True
511
  if mask_source_radio == mask_source_draw:
512
  text_prompt_visible = False
513
- return gr.Textbox.update(visible=text_prompt_visible), gr.Textbox.update(visible=inpaint_prompt_visible), gr.Radio.update(visible=mask_source_radio_visible)
514
 
515
  if __name__ == "__main__":
516
  parser = argparse.ArgumentParser("Grounded SAM demo", add_help=True)
@@ -525,15 +697,16 @@ if __name__ == "__main__":
525
  with gr.Row():
526
  with gr.Column():
527
  input_image = gr.Image(source='upload', elem_id="image_upload", tool='sketch', type='pil', label="Upload")
528
- task_type = gr.Radio(["detection", "segment", "inpainting", "remove"], value="detection",
529
- label='Task type',interactive=True, visible=True)
530
  mask_source_radio = gr.Radio([mask_source_draw, mask_source_segment],
531
  value=mask_source_segment, label="Mask from",
532
- interactive=True, visible=False)
533
  text_prompt = gr.Textbox(label="Detection Prompt[To detect multiple objects, seperating each name with '.', like this: cat . dog . chair ]", placeholder="Cannot be empty")
534
  inpaint_prompt = gr.Textbox(label="Inpaint Prompt (if this is empty, then remove)", visible=False)
535
  run_button = gr.Button(label="Run")
536
- with gr.Accordion("Advanced options", open=False):
537
  box_threshold = gr.Slider(
538
  label="Box Threshold", minimum=0.0, maximum=1.0, value=0.3, step=0.001
539
  )
@@ -551,14 +724,16 @@ if __name__ == "__main__":
551
  remove_mask_extend = gr.Textbox(label="remove_mask_extend", value='10')
552
 
553
  with gr.Column():
554
- gallery = gr.Gallery(
555
- label="result images", show_label=True, elem_id="gallery"
556
- ).style(grid=[2], full_width=True, full_height=True)
557
-
558
- run_button.click(fn=run_grounded_sam, inputs=[
559
- input_image, text_prompt, task_type, inpaint_prompt, box_threshold, text_threshold, iou_threshold, inpaint_mode, mask_source_radio, remove_mode, remove_mask_extend], outputs=[gallery, gallery])
560
- task_type.change(fn=change_radio_display, inputs=[task_type, mask_source_radio], outputs=[text_prompt, inpaint_prompt, mask_source_radio])
561
- mask_source_radio.change(fn=change_radio_display, inputs=[task_type, mask_source_radio], outputs=[text_prompt, inpaint_prompt, mask_source_radio])
562
 
563
  DESCRIPTION = '### This demo is from [Grounded-Segment-Anything](https://github.com/IDEA-Research/Grounded-Segment-Anything). Thanks for their excellent work.'
564
  DESCRIPTION += f'<p>For faster inference without waiting in queue, you may duplicate the space and upgrade to GPU in settings. <a href="https://huggingface.co/spaces/yizhangliu/Grounded-Segment-Anything?duplicate=true"><img style="display: inline; margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space" /></a></p>'
44
  from lama_cleaner.schema import Config
45
 
46
  # segment anything
47
+ from segment_anything import build_sam, SamPredictor, SamAutomaticMaskGenerator
48
 
49
  # diffusers
50
  import PIL
238
  # initialize SAM
239
  logger.info(f"initialize SAM model...")
240
  sam_predictor = SamPredictor(build_sam(checkpoint=sam_checkpoint))
241
+ sam_mask_generator = SamAutomaticMaskGenerator(sam_predictor.model)  # the automatic mask generator expects the underlying Sam model, not the SamPredictor wrapper
242
 
243
  # initialize stable-diffusion-inpainting
244
  logger.info(f"initialize stable-diffusion-inpainting...")
320
  image = Image.open(io.BytesIO(numpy_to_bytes(res_np_img, 'png')))
321
  return image
322
 
323
+ # relate anything
324
+ from ram_utils import iou, sort_and_deduplicate, relation_classes, MLP, show_anns, show_mask
325
+ from ram_train_eval import RamModel, RamPredictor
326
+ from mmengine.config import Config as mmengine_Config  # aliased so it does not shadow lama_cleaner.schema.Config imported above
327
+ input_size = 512
328
+ hidden_size = 256
329
+ num_classes = 56
330
+
331
+ # load ram model
332
+ model_path = "./checkpoints/ram_epoch12.pth"
333
+ config = dict(
334
+ model=dict(
335
+ pretrained_model_name_or_path='bert-base-uncased',
336
+ load_pretrained_weights=False,
337
+ num_transformer_layer=2,
338
+ input_feature_size=256,
339
+ output_feature_size=768,
340
+ cls_feature_size=512,
341
+ num_relation_classes=56,
342
+ pred_type='attention',
343
+ loss_type='multi_label_ce',
344
+ ),
345
+ load_from=model_path,
346
+ )
347
+ config = mmengine_Config(config)
348
+
349
+ class Predictor(RamPredictor):
350
+ def __init__(self, config, device='cpu'):
351
+ self.config = config
352
+ self.device = torch.device(device)
353
+ self._build_model()
354
+
355
+ def _build_model(self):
356
+ self.model = RamModel(**self.config.model).to(self.device)
357
+ if self.config.load_from is not None:
358
+ self.model.load_state_dict(torch.load(self.config.load_from, map_location=self.device))
359
+ self.model.train()
360
+ ram_model = Predictor(config, device)
361
+
362
+ # visualization
363
+ def draw_selected_mask(mask, draw):
364
+ color = (255, 0, 0, 153)
365
+ nonzero_coords = np.transpose(np.nonzero(mask))
366
+ for coord in nonzero_coords:
367
+ draw.point(coord[::-1], fill=color)
368
+
369
+ def draw_object_mask(mask, draw):
370
+ color = (0, 0, 255, 153)
371
+ nonzero_coords = np.transpose(np.nonzero(mask))
372
+ for coord in nonzero_coords:
373
+ draw.point(coord[::-1], fill=color)
374
+
375
+ def create_title_image(word1, word2, word3, width, font_path='./assets/OpenSans-Bold.ttf'):
376
+ # Define the colors to use for each word
377
+ color_red = (255, 0, 0)
378
+ color_black = (0, 0, 0)
379
+ color_blue = (0, 0, 255)
380
+
381
+ # Define the initial font size and spacing between words
382
+ font_size = 40
383
+
384
+ # Create a new image with the specified width and white background
385
+ image = Image.new('RGB', (width, 60), (255, 255, 255))
386
+
387
+ # Load the specified font
388
+ font = ImageFont.truetype(font_path, font_size)
389
+
390
+ # Keep shrinking the font size until all three words fit within the desired width
391
+ while True:
392
+ # Create a draw object for the image
393
+ draw = ImageDraw.Draw(image)
394
+
395
+ word_spacing = font_size / 2
396
+ # Draw each word in the appropriate color
397
+ x_offset = word_spacing
398
+ draw.text((x_offset, 0), word1, color_red, font=font)
399
+ x_offset += font.getsize(word1)[0] + word_spacing
400
+ draw.text((x_offset, 0), word2, color_black, font=font)
401
+ x_offset += font.getsize(word2)[0] + word_spacing
402
+ draw.text((x_offset, 0), word3, color_blue, font=font)
403
+
404
+ word_sizes = [font.getsize(word) for word in [word1, word2, word3]]
405
+ total_width = sum([size[0] for size in word_sizes]) + word_spacing * 3
406
+
407
+ # Stop shrinking once the rendered text fits within the desired width
408
+ if total_width <= width:
409
+ break
410
+
411
+ # Shrink the font and rebuild the image and draw objects
412
+ font_size -= 1
413
+ image = Image.new('RGB', (width, 50), (255, 255, 255))
414
+ font = ImageFont.truetype(font_path, font_size)
415
+ draw = None
416
+
417
+ return image
418
+
419
+ def concatenate_images_vertical(image1, image2):
420
+ # Get the dimensions of the two images
421
+ width1, height1 = image1.size
422
+ width2, height2 = image2.size
423
+
424
+ # Create a new image with the combined height and the maximum width
425
+ new_image = Image.new('RGBA', (max(width1, width2), height1 + height2))
426
+
427
+ # Paste the first image at the top of the new image
428
+ new_image.paste(image1, (0, 0))
429
+
430
+ # Paste the second image below the first image
431
+ new_image.paste(image2, (0, height1))
432
+
433
+ return new_image
434
+
435
+ def relate_anything(input_image, k):
436
+ w, h = input_image.size
437
+ max_edge = 1500
438
+ if w > max_edge or h > max_edge:
439
+ ratio = max(w, h) / max_edge
440
+ new_size = (int(w / ratio), int(h / ratio))
441
+ input_image.thumbnail(new_size)
442
+
443
+ # load image
444
+ pil_image = input_image.convert('RGBA')
445
+ image = np.array(input_image)
446
+ sam_masks = sam_mask_generator.generate(image)
447
+ filtered_masks = sort_and_deduplicate(sam_masks)
448
+
449
+ feat_list = []
450
+ for fm in filtered_masks:
451
+ feat = torch.Tensor(fm['feat']).unsqueeze(0).unsqueeze(0).to(device)
452
+ feat_list.append(feat)
453
+ feat = torch.cat(feat_list, dim=1).to(device)
454
+ matrix_output, rel_triplets = ram_model.predict(feat)
455
+
456
+ pil_image_list = []
457
+ for i, rel in enumerate(rel_triplets[:k]):
458
+ s,o,r = int(rel[0]),int(rel[1]),int(rel[2])
459
+ relation = relation_classes[r]
460
+
461
+ mask_image = Image.new('RGBA', pil_image.size, color=(0, 0, 0, 0))
462
+ mask_draw = ImageDraw.Draw(mask_image)
463
+
464
+ draw_selected_mask(filtered_masks[s]['segmentation'], mask_draw)
465
+ draw_object_mask(filtered_masks[o]['segmentation'], mask_draw)
466
+
467
+ current_pil_image = pil_image.copy()
468
+ current_pil_image.alpha_composite(mask_image)
469
+
470
+ title_image = create_title_image('Red', relation, 'Blue', current_pil_image.size[0])
471
+ concate_pil_image = concatenate_images_vertical(current_pil_image, title_image)
472
+ pil_image_list.append(concate_pil_image)
473
+
474
+ yield pil_image_list
475
+
476
+
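A note on the inputs consumed by relate_anything above: sam_mask_generator.generate() returns one record per mask (with keys such as 'segmentation' and 'area'), but the 'feat' field read in the loop is not produced by the stock segment_anything SamAutomaticMaskGenerator; the code assumes a generator modified, as in the relate-anything project, to attach a per-mask feature vector whose size matches the RAM config's input_feature_size of 256. A rough sketch of the record shape this function expects, with hypothetical values:

import numpy as np

# Hypothetical mask record as relate_anything() expects it; 'segmentation' and 'area'
# are standard SamAutomaticMaskGenerator outputs, while 'feat' is assumed to come from
# a modified mask generator and is not part of the stock segment_anything API.
example_mask = {
    'segmentation': np.zeros((512, 512), dtype=bool),  # boolean mask at image resolution
    'area': 0,                                          # number of True pixels in the mask
    'feat': np.zeros(256, dtype=np.float32),            # per-mask feature fed to the RAM model
}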
477
  mask_source_draw = "draw a mask on input image"
478
  mask_source_segment = "type what to detect below"
479
 
480
+ def run_anything_task(input_image, text_prompt, task_type, inpaint_prompt, box_threshold, text_threshold,
481
+ iou_threshold, inpaint_mode, mask_source_radio, remove_mode, remove_mask_extend, num_relation):
482
+ if task_type == "relate anything":
483
+ return relate_anything(input_image['image'], num_relation)
484
+
485
  text_prompt = text_prompt.strip()
486
  if not ((task_type == 'inpainting' or task_type == 'remove') and mask_source_radio == mask_source_draw):
487
  if text_prompt == '':
491
  return [], gr.Gallery.update(label='Please upload an image!😂😂😂😂')
492
 
493
  file_temp = int(time.time())
494
+ logger.info(f'run_anything_task_[{file_temp}]_{task_type}/{inpaint_mode}/[{mask_source_radio}]/{remove_mode}/{remove_mask_extend}_[{text_prompt}]/[{inpaint_prompt}]___1_')
495
 
496
  # load image
497
  input_mask_pil = input_image['mask']
522
  groundingdino_model, image, text_prompt, box_threshold, text_threshold, device=groundingdino_device
523
  )
524
  if boxes_filt.size(0) == 0:
525
+ logger.info(f'run_anything_task_[{file_temp}]_{task_type}_[{text_prompt}]_1_[No objects detected, please try others.]_')
526
  return [], gr.Gallery.update(label='No objects detected, please try others.😂😂😂😂')
527
  boxes_filt_ori = copy.deepcopy(boxes_filt)
528
 
538
  os.remove(image_path)
539
  output_images.append(detection_image_result)
540
 
541
+ logger.info(f'run_anything_task_[{file_temp}]_{task_type}_2_')
542
  if task_type == 'segment' or ((task_type == 'inpainting' or task_type == 'remove') and mask_source_radio == mask_source_segment):
543
  image = np.array(input_image['image'])
544
  sam_predictor.set_image(image)
574
  os.remove(image_path)
575
  output_images.append(segment_image_result)
576
 
577
+ logger.info(f'run_anything_task_[{file_temp}]_{task_type}_3_')
578
  if task_type == 'detection' or task_type == 'segment':
579
+ logger.info(f'run_anything_task_[{file_temp}]_{task_type}_9_')
580
  return output_images, gr.Gallery.update(label='result images')
581
  elif task_type == 'inpainting' or task_type == 'remove':
582
  if inpaint_prompt.strip() == '' and mask_source_radio == mask_source_segment:
583
  task_type = 'remove'
584
 
585
+ logger.info(f'run_anything_task_[{file_temp}]_{task_type}_4_')
586
  if mask_source_radio == mask_source_draw:
587
  mask_pil = input_mask_pil
588
  mask = input_mask
595
  mask_pil = Image.fromarray(mask)
596
 
597
  image_path = os.path.join(output_dir, f"image_mask_{file_temp}.jpg")
598
+ # if reverse_mask:
599
+ # mask_pil = mask_pil.point(lambda _: 255-_)
600
  mask_pil.convert("RGB").save(image_path)
601
  image_result = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
602
  os.remove(image_path)
640
  mask_pil = mix_masks(mask_imgs)
641
 
642
  image_path = os.path.join(output_dir, f"image_mask_{file_temp}.jpg")
643
+ # if reverse_mask:
644
+ # mask_pil = mask_pil.point(lambda _: 255-_)
645
  mask_pil.convert("RGB").save(image_path)
646
  image_result = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
647
  os.remove(image_path)
654
  image_inpainting.save(image_path)
655
  image_result = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
656
  os.remove(image_path)
657
+ logger.info(f'run_anything_task_[{file_temp}]_{task_type}_9_')
658
  output_images.append(image_result)
659
  return output_images, gr.Gallery.update(label='result images')
660
  else:
661
  logger.info(f"task_type:{task_type} error!")
662
+ logger.info(f'run_anything_task_[{file_temp}]_9_9_')
663
  return output_images, gr.Gallery.update(label='result images')
664
 
665
+ def change_radio_display(task_type, mask_source_radio, num_relation): #, gsa_gallery, ram_gallery):
666
  text_prompt_visible = True
667
  inpaint_prompt_visible = False
668
  mask_source_radio_visible = False
669
+ num_relation_visible = False
670
+ # gsa_gallery_visible = True
671
+ # ram_gallery_visible = False
672
  if task_type == "inpainting":
673
  inpaint_prompt_visible = True
674
  if task_type == "inpainting" or task_type == "remove":
675
  mask_source_radio_visible = True
676
  if mask_source_radio == mask_source_draw:
677
  text_prompt_visible = False
678
+ if task_type == "relate anything":
679
+ text_prompt_visible = False
680
+ num_relation_visible = True
681
+ # gsa_gallery_visible = False
682
+ # ram_gallery_visible = True
683
+ return gr.Textbox.update(visible=text_prompt_visible), gr.Textbox.update(visible=inpaint_prompt_visible), \
684
+ gr.Radio.update(visible=mask_source_radio_visible), gr.Slider.update(visible=num_relation_visible)
685
+ # gr.Gallery.update(visible=gas_gallery_visible), gr.Gallery.update(visible=ram_gallery_visible)
686
 
687
  if __name__ == "__main__":
688
  parser = argparse.ArgumentParser("Grounded SAM demo", add_help=True)
697
  with gr.Row():
698
  with gr.Column():
699
  input_image = gr.Image(source='upload', elem_id="image_upload", tool='sketch', type='pil', label="Upload")
700
+ task_type = gr.Radio(["detection", "segment", "inpainting", "remove", "relate anything"], value="detection",
701
+ label='Task type', visible=True)
702
  mask_source_radio = gr.Radio([mask_source_draw, mask_source_segment],
703
  value=mask_source_segment, label="Mask from",
704
+ visible=False)
705
  text_prompt = gr.Textbox(label="Detection Prompt[To detect multiple objects, seperating each name with '.', like this: cat . dog . chair ]", placeholder="Cannot be empty")
706
  inpaint_prompt = gr.Textbox(label="Inpaint Prompt (if this is empty, then remove)", visible=False)
707
+ num_relation = gr.Slider(label="How many relations do you want to see", minimum=1, maximum=20, value=5, step=1, visible=False)
708
  run_button = gr.Button(label="Run")
709
+ with gr.Accordion("Advanced options", open=False) as advanced_options:
710
  box_threshold = gr.Slider(
711
  label="Box Threshold", minimum=0.0, maximum=1.0, value=0.3, step=0.001
712
  )
724
  remove_mask_extend = gr.Textbox(label="remove_mask_extend", value='10')
725
 
726
  with gr.Column():
727
+ # gsa_gallery = gr.Gallery(
728
+ # label="result images", show_label=True, elem_id="gsa_gallery"
729
+ # ).style(grid=[2], full_width=True, full_height=True)
730
+ gallery = gr.Gallery(label="Your Result", show_label=True, elem_id="gallery").style(preview=True, columns=5, object_fit="scale-down")
731
+
732
+
733
+ run_button.click(fn=run_anything_task, inputs=[
734
+ input_image, text_prompt, task_type, inpaint_prompt, box_threshold, text_threshold, iou_threshold, inpaint_mode, mask_source_radio, remove_mode, remove_mask_extend, num_relation], outputs=[gallery, gallery])  # the gallery component defined above; gsa_gallery is commented out
735
+ task_type.change(fn=change_radio_display, inputs=[task_type, mask_source_radio, num_relation], outputs=[text_prompt, inpaint_prompt, mask_source_radio, num_relation])
736
+ mask_source_radio.change(fn=change_radio_display, inputs=[task_type, mask_source_radio, num_relation], outputs=[text_prompt, inpaint_prompt, mask_source_radio, num_relation])
737
 
738
  DESCRIPTION = '### This demo is from [Grounded-Segment-Anything](https://github.com/IDEA-Research/Grounded-Segment-Anything). Thanks for their excellent work.'
739
  DESCRIPTION += f'<p>For faster inference without waiting in queue, you may duplicate the space and upgrade to GPU in settings. <a href="https://huggingface.co/spaces/yizhangliu/Grounded-Segment-Anything?duplicate=true"><img style="display: inline; margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space" /></a></p>'
assets/OpenSans-Bold.ttf ADDED
Binary file (225 kB).
checkpoints/ram_epoch12.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:036ddbb89e3376b61cb548c8cac3007c3ab7236fb6ac82207d4ccf4039654297
3
+ size 333991817
ram_train_eval.py ADDED
@@ -0,0 +1,416 @@
1
+ import os
2
+ import time
3
+ from datetime import timedelta
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+
9
+ from mmengine.config import Config
10
+ from mmengine.utils import ProgressBar
11
+ from transformers import AutoConfig, AutoModel
12
+
13
+ class RamDataset(torch.utils.data.Dataset):
14
+ def __init__(self, data_path, is_train=True, num_relation_classes=56):
15
+ super().__init__()
16
+ self.num_relation_classes = num_relation_classes
17
+ data = np.load(data_path, allow_pickle=True)
18
+ self.samples = data["arr_0"]
19
+ sample_num = self.samples.size
20
+ self.sample_idx_list = []
21
+ for idx in range(sample_num):
22
+ if self.samples[idx]["is_train"] == is_train:
23
+ self.sample_idx_list.append(idx)
24
+
25
+ def __getitem__(self, idx):
26
+ sample = self.samples[self.sample_idx_list[idx]]
27
+ object_num = sample["feat"].shape[0]
28
+ embedding = torch.from_numpy(sample["feat"])
29
+ gt_rels = sample["relations"]
30
+ rel_target = self._get_target(object_num, gt_rels)
31
+ return embedding, rel_target, gt_rels
32
+
33
+ def __len__(self):
34
+ return len(self.sample_idx_list)
35
+
36
+ def _get_target(self, object_num, gt_rels):
37
+ rel_target = torch.zeros([self.num_relation_classes, object_num, object_num])
38
+ for ii, jj, cls_relationship in gt_rels:
39
+ rel_target[cls_relationship, ii, jj] = 1
40
+ return rel_target
41
+
42
+
43
+ class RamModel(nn.Module):
44
+ def __init__(
45
+ self,
46
+ pretrained_model_name_or_path,
47
+ load_pretrained_weights=True,
48
+ num_transformer_layer=2,
49
+ input_feature_size=256,
50
+ output_feature_size=768,
51
+ cls_feature_size=512,
52
+ num_relation_classes=56,
53
+ pred_type="attention",
54
+ loss_type="bce",
55
+ ):
56
+ super().__init__()
57
+ # 0. config
58
+ self.cls_feature_size = cls_feature_size
59
+ self.num_relation_classes = num_relation_classes
60
+ self.pred_type = pred_type
61
+ self.loss_type = loss_type
62
+
63
+ # 1. fc input and output
64
+ self.fc_input = nn.Sequential(
65
+ nn.Linear(input_feature_size, output_feature_size),
66
+ nn.LayerNorm(output_feature_size),
67
+ )
68
+ self.fc_output = nn.Sequential(
69
+ nn.Linear(output_feature_size, output_feature_size),
70
+ nn.LayerNorm(output_feature_size),
71
+ )
72
+ # 2. transformer model
73
+ if load_pretrained_weights:
74
+ self.model = AutoModel.from_pretrained(pretrained_model_name_or_path)
75
+ else:
76
+ config = AutoConfig.from_pretrained(pretrained_model_name_or_path)
77
+ self.model = AutoModel.from_config(config)
78
+ if num_transformer_layer != "all" and isinstance(num_transformer_layer, int):
79
+ self.model.encoder.layer = self.model.encoder.layer[:num_transformer_layer]
80
+ # 3. predict head
81
+ self.cls_sub = nn.Linear(output_feature_size, cls_feature_size * num_relation_classes)
82
+ self.cls_obj = nn.Linear(output_feature_size, cls_feature_size * num_relation_classes)
83
+ # 4. loss
84
+ if self.loss_type == "bce":
85
+ self.bce_loss = nn.BCEWithLogitsLoss()
86
+ elif self.loss_type == "multi_label_ce":
87
+ print("Use Multi Label Cross Entropy Loss.")
88
+
89
+ def forward(self, embeds, attention_mask=None):
90
+ """
91
+ embeds: (batch_size, token_num, feature_size)
92
+ attention_mask: (batch_size, token_num)
93
+ """
94
+ # 1. fc input
95
+ embeds = self.fc_input(embeds)
96
+ # 2. transformer model
97
+ position_ids = torch.ones([1, embeds.shape[1]]).to(embeds.device).to(torch.long)
98
+ outputs = self.model.forward(inputs_embeds=embeds, attention_mask=attention_mask, position_ids=position_ids)
99
+ embeds = outputs["last_hidden_state"]
100
+ # 3. fc output
101
+ embeds = self.fc_output(embeds)
102
+ # 4. predict head
103
+ batch_size, token_num, feature_size = embeds.shape
104
+ sub_embeds = self.cls_sub(embeds).reshape([batch_size, token_num, self.num_relation_classes, self.cls_feature_size]).permute([0, 2, 1, 3])
105
+ obj_embeds = self.cls_obj(embeds).reshape([batch_size, token_num, self.num_relation_classes, self.cls_feature_size]).permute([0, 2, 1, 3])
106
+ if self.pred_type == "attention":
107
+ cls_pred = sub_embeds @ torch.transpose(obj_embeds, 2, 3) / self.cls_feature_size**0.5 # noqa
108
+ elif self.pred_type == "einsum":
109
+ cls_pred = torch.einsum("nrsc,nroc->nrso", sub_embeds, obj_embeds)
110
+ return cls_pred
111
+
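As a rough shape sketch of the prediction head above (illustrative only, with made-up batch and object counts), the 'attention' branch scores every (relation, subject, object) pair by a scaled dot product between the reshaped subject and object embeddings:

import torch

B, T, R, C = 2, 5, 56, 512            # batch size, objects, relation classes, cls_feature_size
sub_embeds = torch.randn(B, R, T, C)  # cls_sub output after reshape + permute
obj_embeds = torch.randn(B, R, T, C)  # cls_obj output after reshape + permute
cls_pred = sub_embeds @ obj_embeds.transpose(2, 3) / C ** 0.5
print(cls_pred.shape)                 # torch.Size([2, 56, 5, 5])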
112
+ def loss(self, pred, target, attention_mask):
113
+ loss_dict = dict()
114
+ batch_size, relation_num, _, _ = pred.shape
115
+
116
+ mask = torch.zeros_like(pred).to(pred.device)
117
+ for idx in range(batch_size):
118
+ n = torch.sum(attention_mask[idx]).to(torch.int)
119
+ mask[idx, :, :n, :n] = 1
120
+ pred = pred * mask - 9999 * (1 - mask)
121
+
122
+ if self.loss_type == "bce":
123
+ loss = self.bce_loss(pred, target)
124
+ elif self.loss_type == "multi_label_ce":
125
+ input_tensor = torch.permute(pred, (1, 0, 2, 3))
126
+ target_tensor = torch.permute(target, (1, 0, 2, 3))
127
+ input_tensor = pred.reshape([relation_num, -1])
128
+ target_tensor = target.reshape([relation_num, -1])
129
+ loss = self.multilabel_categorical_crossentropy(target_tensor, input_tensor)
130
+ weight = loss / loss.max()
131
+ loss = loss * weight
132
+ loss = loss.mean()
133
+ loss_dict["loss"] = loss
134
+
135
+ # running metric
136
+ recall_20 = get_recall_N(pred, target, object_num=20)
137
+ loss_dict["recall@20"] = recall_20
138
+ return loss_dict
139
+
140
+ def multilabel_categorical_crossentropy(self, y_true, y_pred):
141
+ """
142
+ https://kexue.fm/archives/7359
143
+ """
144
+ y_pred = (1 - 2 * y_true) * y_pred
145
+ y_pred_neg = y_pred - y_true * 9999
146
+ y_pred_pos = y_pred - (1 - y_true) * 9999
147
+ zeros = torch.zeros_like(y_pred[..., :1])
148
+ y_pred_neg = torch.cat([y_pred_neg, zeros], dim=-1)
149
+ y_pred_pos = torch.cat([y_pred_pos, zeros], dim=-1)
150
+ neg_loss = torch.logsumexp(y_pred_neg, dim=-1)
151
+ pos_loss = torch.logsumexp(y_pred_pos, dim=-1)
152
+ return neg_loss + pos_loss
153
+
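The trick above is the multi-label softmax cross entropy from the kexue.fm post cited in the docstring: multiplying y_pred by (1 - 2*y_true) flips the sign of the positive scores, the -9999 masking drops the other group from each term, and the appended zeros column supplies the constant 1 inside each logsumexp, so per relation class the loss reduces to

\mathcal{L} = \log\Big(1 + \sum_{i \in \text{neg}} e^{s_i}\Big) + \log\Big(1 + \sum_{j \in \text{pos}} e^{-s_j}\Big)

which pushes negative-pair scores below 0 and positive-pair scores above 0.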
154
+
155
+ def get_recall_N(y_pred, y_true, object_num=20):
156
+ """
157
+ y_pred: [batch_size, 56, object_num, object_num]
158
+ y_true: [batch_size, 56, object_num, object_num]
159
+ """
160
+
161
+ device = y_pred.device
162
+ recall_list = []
163
+
164
+ for idx in range(len(y_true)):
165
+ sample_y_true = []
166
+ sample_y_pred = []
167
+
168
+ # find topk
169
+ _, topk_indices = torch.topk(
170
+ y_true[idx : idx + 1].reshape(
171
+ [
172
+ -1,
173
+ ]
174
+ ),
175
+ k=object_num,
176
+ )
177
+ for index in topk_indices:
178
+ pred_cls = index // (y_true.shape[2] ** 2)
179
+ index_subject_object = index % (y_true.shape[2] ** 2)
180
+ pred_subject = index_subject_object // y_true.shape[2]
181
+ pred_object = index_subject_object % y_true.shape[2]
182
+ if y_true[idx, pred_cls, pred_subject, pred_object] == 0:
183
+ continue
184
+ sample_y_true.append([pred_subject, pred_object, pred_cls])
185
+
186
+ # find topk
187
+ _, topk_indices = torch.topk(
188
+ y_pred[idx : idx + 1].reshape(
189
+ [
190
+ -1,
191
+ ]
192
+ ),
193
+ k=object_num,
194
+ )
195
+ for index in topk_indices:
196
+ pred_cls = index // (y_pred.shape[2] ** 2)
197
+ index_subject_object = index % (y_pred.shape[2] ** 2)
198
+ pred_subject = index_subject_object // y_pred.shape[2]
199
+ pred_object = index_subject_object % y_pred.shape[2]
200
+ sample_y_pred.append([pred_subject, pred_object, pred_cls])
201
+
202
+ recall = len([x for x in sample_y_pred if x in sample_y_true]) / (len(sample_y_true) + 1e-8)
203
+ recall_list.append(recall)
204
+
205
+ recall = torch.tensor(recall_list).to(device).mean() * 100
206
+ return recall
207
+
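get_recall_N above (and RamPredictor.predict further down) recovers (subject, object, relation) triplets by integer arithmetic on indices into the flattened [num_classes, N, N] score tensor. A small standalone sanity check, with made-up sizes, showing the decoding agrees with np.unravel_index:

import numpy as np

C, N = 56, 7                              # relation classes, object count (example values only)
scores = np.random.rand(C, N, N)
flat_idx = int(scores.argmax())           # index into the flattened [C, N, N] tensor
rel = flat_idx // (N * N)                 # which relation class
sub = (flat_idx % (N * N)) // N           # subject index
obj = flat_idx % N                        # object index
assert (rel, sub, obj) == tuple(int(v) for v in np.unravel_index(flat_idx, (C, N, N)))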
208
+
209
+ class RamTrainer(object):
210
+ def __init__(self, config):
211
+ self.config = config
212
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
213
+ self._build_dataset()
214
+ self._build_dataloader()
215
+ self._build_model()
216
+ self._build_optimizer()
217
+ self._build_lr_scheduler()
218
+
219
+ def _build_dataset(self):
220
+ self.dataset = RamDataset(**self.config.dataset)
221
+
222
+ def _build_dataloader(self):
223
+ self.dataloader = torch.utils.data.DataLoader(
224
+ self.dataset,
225
+ batch_size=self.config.dataloader.batch_size,
226
+ shuffle=True if self.config.dataset.is_train else False,
227
+ )
228
+
229
+ def _build_model(self):
230
+ self.model = RamModel(**self.config.model).to(self.device)
231
+ if self.config.load_from is not None:
232
+ self.model.load_state_dict(torch.load(self.config.load_from))
233
+ self.model.train()
234
+
235
+ def _build_optimizer(self):
236
+ self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=self.config.optim.lr, weight_decay=self.config.optim.weight_decay, eps=self.config.optim.eps, betas=self.config.optim.betas)
237
+
238
+ def _build_lr_scheduler(self):
239
+ self.lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(self.optimizer, milestones=self.config.optim.lr_scheduler.step, gamma=self.config.optim.lr_scheduler.gamma)
240
+
241
+ def train(self):
242
+ t_start = time.time()
243
+ running_avg_loss = 0
244
+ for epoch_idx in range(self.config.num_epoch):
245
+ for batch_idx, batch_data in enumerate(self.dataloader):
246
+ batch_embeds = batch_data[0].to(torch.float32).to(self.device)
247
+ batch_target = batch_data[1].to(torch.float32).to(self.device)
248
+ attention_mask = batch_embeds.new_ones((batch_embeds.shape[0], batch_embeds.shape[1]))
249
+ batch_pred = self.model.forward(batch_embeds, attention_mask)
250
+ loss_dict = self.model.loss(batch_pred, batch_target, attention_mask)
251
+ loss = loss_dict["loss"]
252
+ recall_20 = loss_dict["recall@20"]
253
+ self.optimizer.zero_grad()
254
+ loss.backward()
255
+ torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config.optim.max_norm, self.config.optim.norm_type)
256
+ self.optimizer.step()
257
+ running_avg_loss += loss.item()
258
+
259
+ if batch_idx % 100 == 0:
260
+ t_current = time.time()
261
+ num_finished_step = epoch_idx * len(self.dataloader) + batch_idx + 1
262
+ num_to_do_step = (self.config.num_epoch - epoch_idx - 1) * len(self.dataloader) + (len(self.dataloader) - batch_idx - 1)
263
+ avg_speed = num_finished_step / (t_current - t_start)
264
+ eta = num_to_do_step / avg_speed
265
+ print(
266
+ "ETA={:0>8}, Epoch={}, Batch={}/{}, LR={}, Loss={:.4f}, RunningAvgLoss={:.4f}, Recall@20={:.2f}%".format(
267
+ str(timedelta(seconds=int(eta))), epoch_idx + 1, batch_idx, len(self.dataloader), self.lr_scheduler.get_last_lr()[0], loss.item(), running_avg_loss / num_finished_step, recall_20.item()
268
+ )
269
+ )
270
+ self.lr_scheduler.step()
271
+ if not os.path.exists(self.config.output_dir):
272
+ os.makedirs(self.config.output_dir)
273
+ save_path = os.path.join(self.config.output_dir, "epoch_{}.pth".format(epoch_idx + 1))
274
+ print("Save epoch={} checkpoint to {}".format(epoch_idx + 1, save_path))
275
+ torch.save(self.model.state_dict(), save_path)
276
+ return save_path
277
+
278
+
279
+ class RamPredictor(object):
280
+ def __init__(self, config):
281
+ self.config = config
282
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
283
+ self._build_dataset()
284
+ self._build_dataloader()
285
+ self._build_model()
286
+
287
+ def _build_dataset(self):
288
+ self.dataset = RamDataset(**self.config.dataset)
289
+
290
+ def _build_dataloader(self):
291
+ self.dataloader = torch.utils.data.DataLoader(self.dataset, batch_size=self.config.dataloader.batch_size, shuffle=False)
292
+
293
+ def _build_model(self):
294
+ self.model = RamModel(**self.config.model).to(self.device)
295
+ if self.config.load_from is not None:
296
+ self.model.load_state_dict(torch.load(self.config.load_from))
297
+ self.model.eval()
298
+
299
+ def predict(self, batch_embeds, pred_keep_num=100):
300
+ """
301
+ Parameters
302
+ ----------
303
+ batch_embeds: (batch_size=1, token_num, feature_size)
304
+ pred_keep_num: int
305
+ Returns
306
+ -------
307
+ batch_pred: (batch_size, relation_num, object_num, object_num)
308
+ pred_rels: [[sub_id, obj_id, rel_id], ...]
309
+ """
310
+ if not isinstance(batch_embeds, torch.Tensor):
311
+ batch_embeds = torch.asarray(batch_embeds)
312
+ batch_embeds = batch_embeds.to(torch.float32).to(self.device)
313
+ attention_mask = batch_embeds.new_ones((batch_embeds.shape[0], batch_embeds.shape[1]))
314
+ batch_pred = self.model.forward(batch_embeds, attention_mask)
315
+ for idx_i in range(batch_pred.shape[2]):
316
+ batch_pred[:, :, idx_i, idx_i] = -9999
317
+ batch_pred = batch_pred.sigmoid()
318
+
319
+ pred_rels = []
320
+ _, topk_indices = torch.topk(
321
+ batch_pred.reshape(
322
+ [
323
+ -1,
324
+ ]
325
+ ),
326
+ k=pred_keep_num,
327
+ )
328
+
329
+ # subject, object, relation
330
+ for index in topk_indices:
331
+ pred_relation = index // (batch_pred.shape[2] ** 2)
332
+ index_subject_object = index % (batch_pred.shape[2] ** 2)
333
+ pred_subject = index_subject_object // batch_pred.shape[2]
334
+ pred_object = index_subject_object % batch_pred.shape[2]
335
+ pred = [pred_subject.item(), pred_object.item(), pred_relation.item()]
336
+ pred_rels.append(pred)
337
+ return batch_pred, pred_rels
338
+
339
+ def eval(self):
340
+ sum_recall_20 = 0.0
341
+ sum_recall_50 = 0.0
342
+ sum_recall_100 = 0.0
343
+ prog_bar = ProgressBar(len(self.dataloader))
344
+ for batch_idx, batch_data in enumerate(self.dataloader):
345
+ batch_embeds = batch_data[0]
346
+ batch_target = batch_data[1]
347
+ gt_rels = batch_data[2]
348
+ batch_pred, pred_rels = self.predict(batch_embeds)
349
+ this_recall_20 = get_recall_N(batch_pred, batch_target, object_num=20)
350
+ this_recall_50 = get_recall_N(batch_pred, batch_target, object_num=50)
351
+ this_recall_100 = get_recall_N(batch_pred, batch_target, object_num=100)
352
+ sum_recall_20 += this_recall_20.item()
353
+ sum_recall_50 += this_recall_50.item()
354
+ sum_recall_100 += this_recall_100.item()
355
+ prog_bar.update()
356
+ recall_20 = sum_recall_20 / len(self.dataloader)
357
+ recall_50 = sum_recall_50 / len(self.dataloader)
358
+ recall_100 = sum_recall_100 / len(self.dataloader)
359
+ metric = {
360
+ "recall_20": recall_20,
361
+ "recall_50": recall_50,
362
+ "recall_100": recall_100,
363
+ }
364
+ return metric
365
+
366
+
367
+ if __name__ == "__main__":
368
+ # Config
369
+ config = dict(
370
+ dataset=dict(
371
+ data_path="./data/feat_0420.npz",
372
+ is_train=True,
373
+ num_relation_classes=56,
374
+ ),
375
+ dataloader=dict(
376
+ batch_size=4,
377
+ ),
378
+ model=dict(
379
+ pretrained_model_name_or_path="bert-base-uncased",
380
+ load_pretrained_weights=True,
381
+ num_transformer_layer=2,
382
+ input_feature_size=256,
383
+ output_feature_size=768,
384
+ cls_feature_size=512,
385
+ num_relation_classes=56,
386
+ pred_type="attention",
387
+ loss_type="multi_label_ce",
388
+ ),
389
+ optim=dict(
390
+ lr=1e-4,
391
+ weight_decay=0.05,
392
+ eps=1e-8,
393
+ betas=(0.9, 0.999),
394
+ max_norm=0.01,
395
+ norm_type=2,
396
+ lr_scheduler=dict(
397
+ step=[6, 10],
398
+ gamma=0.1,
399
+ ),
400
+ ),
401
+ num_epoch=12,
402
+ output_dir="./work_dirs",
403
+ load_from=None,
404
+ )
405
+
406
+ # Train
407
+ config = Config(config)
408
+ trainer = RamTrainer(config)
409
+ last_model_path = trainer.train()
410
+
411
+ # Test/Eval
412
+ config.dataset.is_train = False
413
+ config.load_from = last_model_path
414
+ predictor = RamPredictor(config)
415
+ metric = predictor.eval()
416
+ print(metric)
ram_utils.py ADDED
@@ -0,0 +1,152 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.optim as optim
4
+ import numpy as np
5
+ import torch.nn.functional as F
+ import matplotlib.pyplot as plt  # needed by show_anns below (plt.gca), missing from the original imports
6
+
7
+
8
+ class MLP(nn.Module):
9
+ def __init__(self, input_size, hidden_size, num_classes, dropout_prob=0.1):
10
+ super(MLP, self).__init__()
11
+ self.fc1 = nn.Linear(input_size, hidden_size)
12
+ self.relu = nn.ReLU()
13
+ self.dropout = nn.Dropout(dropout_prob)
14
+ self.fc2 = nn.Linear(hidden_size, num_classes)
15
+
16
+ def forward(self, x):
17
+ out = self.fc1(x)
18
+ out = self.relu(out)
19
+ out = self.dropout(out)
20
+ out = self.fc2(out)
21
+ return out
22
+
23
+
24
+ def show_anns(anns, color_code='auto'):
25
+ if len(anns) == 0:
26
+ return
27
+ sorted_anns = sorted(anns, key=(lambda x: x['area']), reverse=True)
28
+ ax = plt.gca()
29
+ ax.set_autoscale_on(False)
30
+ polygons = []
31
+ color = []
32
+ for ann in sorted_anns:
33
+ m = ann['segmentation']
34
+ img = np.ones((m.shape[0], m.shape[1], 3))
35
+ color_mask = np.random.random((1, 3)).tolist()[0]
36
+ if color_code == 'auto':
37
+ for i in range(3):
38
+ img[:,:,i] = color_mask[i]
39
+ elif color_code == 'red':
40
+ for i in range(3):
41
+ img[:,:,0] = 1
42
+ img[:,:,1] = 0
43
+ img[:,:,2] = 0
44
+ else:
45
+ for i in range(3):
46
+ img[:,:,0] = 0
47
+ img[:,:,1] = 0
48
+ img[:,:,2] = 1
49
+ return np.dstack((img, m*0.35))
50
+
51
+
52
+ def show_points(coords, labels, ax, marker_size=375):
53
+ pos_points = coords[labels==1]
54
+ neg_points = coords[labels==0]
55
+ ax.scatter(pos_points[:, 0], pos_points[:, 1], color='green', marker='*',
56
+ s=marker_size, edgecolor='white', linewidth=1.25)
57
+ ax.scatter(neg_points[:, 0], neg_points[:, 1], color='red', marker='*',
58
+ s=marker_size, edgecolor='white', linewidth=1.25)
59
+
60
+ def show_mask(m):
61
+ img = np.ones((m.shape[0], m.shape[1], 3))
62
+ color_mask = np.random.random((1, 3)).tolist()[0]
63
+ for i in range(3):
64
+ img[:,:,0] = 1
65
+ img[:,:,1] = 0
66
+ img[:,:,2] = 0
67
+
68
+ return np.dstack((img, m*0.35))
69
+
70
+
71
+ def iou(mask1, mask2):
72
+ intersection = np.logical_and(mask1, mask2)
73
+ union = np.logical_or(mask1, mask2)
74
+ iou_score = np.sum(intersection) / np.sum(union)
75
+ return iou_score
76
+
77
+
78
+ def sort_and_deduplicate(sam_masks, iou_threshold=0.8):
79
+ # Sort the sam_masks list based on the area value
80
+ sorted_masks = sorted(sam_masks, key=lambda x: x['area'], reverse=True)
81
+
82
+ # Deduplicate masks based on the given iou_threshold
83
+ filtered_masks = []
84
+ for mask in sorted_masks:
85
+ duplicate = False
86
+ for filtered_mask in filtered_masks:
87
+ if iou(mask['segmentation'], filtered_mask['segmentation']) > iou_threshold:
88
+ duplicate = True
89
+ break
90
+
91
+ if not duplicate:
92
+ filtered_masks.append(mask)
93
+
94
+ return filtered_masks
95
+
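A tiny usage sketch for iou() and sort_and_deduplicate() above (assuming those functions are in scope), with hypothetical masks: two masks whose IoU exceeds the 0.8 threshold collapse to the larger one.

import numpy as np

a = np.zeros((4, 4), dtype=bool); a[:3, :3] = True   # 9-pixel mask
b = a.copy(); b[3, 3] = True                         # 10-pixel mask, IoU with a = 9/10
masks = [{'segmentation': a, 'area': int(a.sum())},
         {'segmentation': b, 'area': int(b.sum())}]
print(len(sort_and_deduplicate(masks)))              # -> 1 (the smaller duplicate is dropped)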
96
+
97
+ relation_classes = ['over',
98
+ 'in front of',
99
+ 'beside',
100
+ 'on',
101
+ 'in',
102
+ 'attached to',
103
+ 'hanging from',
104
+ 'on back of',
105
+ 'falling off',
106
+ 'going down',
107
+ 'painted on',
108
+ 'walking on',
109
+ 'running on',
110
+ 'crossing',
111
+ 'standing on',
112
+ 'lying on',
113
+ 'sitting on',
114
+ 'flying over',
115
+ 'jumping over',
116
+ 'jumping from',
117
+ 'wearing',
118
+ 'holding',
119
+ 'carrying',
120
+ 'looking at',
121
+ 'guiding',
122
+ 'kissing',
123
+ 'eating',
124
+ 'drinking',
125
+ 'feeding',
126
+ 'biting',
127
+ 'catching',
128
+ 'picking',
129
+ 'playing with',
130
+ 'chasing',
131
+ 'climbing',
132
+ 'cleaning',
133
+ 'playing',
134
+ 'touching',
135
+ 'pushing',
136
+ 'pulling',
137
+ 'opening',
138
+ 'cooking',
139
+ 'talking to',
140
+ 'throwing',
141
+ 'slicing',
142
+ 'driving',
143
+ 'riding',
144
+ 'parked on',
145
+ 'driving on',
146
+ 'about to hit',
147
+ 'kicking',
148
+ 'swinging',
149
+ 'entering',
150
+ 'exiting',
151
+ 'enclosing',
152
+ 'leaning on',]
requirements.txt CHANGED
@@ -22,11 +22,6 @@ yapf
22
  numba
23
  segment_anything
24
 
25
- # ftfy
26
- # uuid
27
- # psutil
28
- # facexlib
29
  lama-cleaner==0.25.0
30
- # tensorflow
31
- # easydict
32
-
22
  numba
23
  segment_anything
24
 
25
  lama-cleaner==0.25.0
26
+ openmim==0.1.5
27
+ mmcv==2.0.0