Awiny committed on
Commit
c3a1897
•
1 Parent(s): 353fa54

first version submission

This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
Files changed (50)
  1. app.py +59 -4
  2. main_gradio.py +84 -0
  3. models/__pycache__/blip2_model.cpython-38.pyc +0 -0
  4. models/__pycache__/blip2_model.cpython-39.pyc +0 -0
  5. models/__pycache__/controlnet_model.cpython-38.pyc +0 -0
  6. models/__pycache__/gpt_model.cpython-38.pyc +0 -0
  7. models/__pycache__/grit_model.cpython-38.pyc +0 -0
  8. models/__pycache__/image_text_transformation.cpython-38.pyc +0 -0
  9. models/__pycache__/image_text_transformation.cpython-39.pyc +0 -0
  10. models/__pycache__/region_semantic.cpython-38.pyc +0 -0
  11. models/blip2_model.py +38 -0
  12. models/controlnet_model.py +51 -0
  13. models/gpt_model.py +40 -0
  14. models/grit_model.py +26 -0
  15. models/grit_src/__pycache__/image_dense_captions.cpython-38.pyc +0 -0
  16. models/grit_src/configs/Base.yaml +77 -0
  17. models/grit_src/configs/GRiT_B_DenseCap.yaml +20 -0
  18. models/grit_src/configs/GRiT_B_DenseCap_ObjectDet.yaml +23 -0
  19. models/grit_src/configs/GRiT_B_ObjectDet.yaml +20 -0
  20. models/grit_src/configs/GRiT_H_ObjectDet.yaml +21 -0
  21. models/grit_src/configs/GRiT_L_ObjectDet.yaml +20 -0
  22. models/grit_src/grit/__init__.py +7 -0
  23. models/grit_src/grit/__pycache__/__init__.cpython-38.pyc +0 -0
  24. models/grit_src/grit/__pycache__/config.cpython-38.pyc +0 -0
  25. models/grit_src/grit/__pycache__/predictor.cpython-38.pyc +0 -0
  26. models/grit_src/grit/config.py +50 -0
  27. models/grit_src/grit/custom_solver.py +88 -0
  28. models/grit_src/grit/data/__pycache__/custom_build_augmentation.cpython-38.pyc +0 -0
  29. models/grit_src/grit/data/__pycache__/custom_dataset_mapper.cpython-38.pyc +0 -0
  30. models/grit_src/grit/data/custom_build_augmentation.py +44 -0
  31. models/grit_src/grit/data/custom_dataset_dataloader.py +250 -0
  32. models/grit_src/grit/data/custom_dataset_mapper.py +149 -0
  33. models/grit_src/grit/data/datasets/__pycache__/grit_coco.cpython-38.pyc +0 -0
  34. models/grit_src/grit/data/datasets/__pycache__/object365.cpython-38.pyc +0 -0
  35. models/grit_src/grit/data/datasets/__pycache__/vg.cpython-38.pyc +0 -0
  36. models/grit_src/grit/data/datasets/grit_coco.py +112 -0
  37. models/grit_src/grit/data/datasets/object365.py +111 -0
  38. models/grit_src/grit/data/datasets/vg.py +98 -0
  39. models/grit_src/grit/data/transforms/__pycache__/custom_augmentation_impl.cpython-38.pyc +0 -0
  40. models/grit_src/grit/data/transforms/__pycache__/custom_transform.cpython-38.pyc +0 -0
  41. models/grit_src/grit/data/transforms/custom_augmentation_impl.py +52 -0
  42. models/grit_src/grit/data/transforms/custom_transform.py +115 -0
  43. models/grit_src/grit/evaluation/eval.py +156 -0
  44. models/grit_src/grit/modeling/__pycache__/soft_nms.cpython-38.pyc +0 -0
  45. models/grit_src/grit/modeling/backbone/__pycache__/utils.cpython-38.pyc +0 -0
  46. models/grit_src/grit/modeling/backbone/__pycache__/vit.cpython-38.pyc +0 -0
  47. models/grit_src/grit/modeling/backbone/utils.py +186 -0
  48. models/grit_src/grit/modeling/backbone/vit.py +538 -0
  49. models/grit_src/grit/modeling/meta_arch/__pycache__/grit.cpython-38.pyc +0 -0
  50. models/grit_src/grit/modeling/meta_arch/grit.py +66 -0
app.py CHANGED
@@ -1,7 +1,62 @@
  import gradio as gr
- def greet(name):
-     return "Hello " + name + "!!"
- iface = gr.Interface(fn=greet, inputs="text", outputs="text")
- iface.launch()
+ import cv2
+ import numpy as np
+ from PIL import Image
+ import base64
+ from io import BytesIO
+ from models.image_text_transformation import ImageTextTransformation
+
+ def pil_image_to_base64(image):
+     buffered = BytesIO()
+     image.save(buffered, format="JPEG")
+     img_str = base64.b64encode(buffered.getvalue()).decode()
+     return img_str
+
+ def add_logo():
+     with open("examples/logo.png", "rb") as f:
+         logo_base64 = base64.b64encode(f.read()).decode()
+     return logo_base64
+
+ def process_image(image_src, processor):
+     gen_text = processor.image_to_text(image_src)
+     gen_image = processor.text_to_image(gen_text)
+     gen_image_str = pil_image_to_base64(gen_image)
+     # Combine the outputs into a single HTML output
+     custom_output = f'''
+     <h2>Image->Text->Image:</h2>
+     <div style="display: flex; flex-wrap: wrap;">
+         <div style="flex: 1;">
+             <h3>Image2Text</h3>
+             <p>{gen_text}</p>
+         </div>
+         <div style="flex: 1;">
+             <h3>Text2Image</h3>
+             <img src="data:image/jpeg;base64,{gen_image_str}" width="100%" />
+         </div>
+     </div>
+     '''
+
+     return custom_output
+
+ processor = ImageTextTransformation()
+
+ # Create Gradio input and output components
+ image_input = gr.inputs.Image(type='filepath', label="Input Image")
+
+ logo_base64 = add_logo()
+ # Create the title with the logo
+ title_with_logo = f'<img src="data:image/jpeg;base64,{logo_base64}" width="400" style="vertical-align: middle;"> Understanding Image with Text'
+
+ # Create Gradio interface
+ interface = gr.Interface(
+     fn=lambda image: process_image(image, processor),  # Pass the processor object using a lambda function
+     inputs=image_input,
+     outputs=gr.outputs.HTML(),
+     title=title_with_logo,
+     description="""
+     This demo supports image-to-text transformation. The generated text can then be used for retrieval, question answering, and other zero-shot tasks.
+     """
+ )
+
+ # Launch the interface
+ interface.launch()
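Editor's note on the Gradio API used above: `gr.inputs.Image` and `gr.outputs.HTML` come from the deprecated Gradio 3.x `inputs`/`outputs` namespaces, which later releases removed. The sketch below shows the same wiring with the top-level components; the stub callback and the assumption about the installed Gradio version are mine, not part of this commit.

```python
# Minimal sketch, assuming a Gradio version where gr.Image / gr.HTML are top-level components.
import gradio as gr

def process_image_stub(image_path):
    # Placeholder for process_image(image, processor) defined in app.py above.
    return f"<p>Received: {image_path}</p>"

demo = gr.Interface(
    fn=process_image_stub,
    inputs=gr.Image(type="filepath", label="Input Image"),
    outputs=gr.HTML(),
    title="Understanding Image with Text",
)

if __name__ == "__main__":
    demo.launch()
```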
main_gradio.py ADDED
@@ -0,0 +1,84 @@
import gradio as gr
import cv2
import numpy as np
from PIL import Image
import base64
from io import BytesIO
from models.image_text_transformation import ImageTextTransformation

def pil_image_to_base64(image):
    buffered = BytesIO()
    image.save(buffered, format="JPEG")
    img_str = base64.b64encode(buffered.getvalue()).decode()
    return img_str

def add_logo():
    with open("examples/logo.png", "rb") as f:
        logo_base64 = base64.b64encode(f.read()).decode()
    return logo_base64

def process_image(image_src, processor):
    gen_text = processor.image_to_text(image_src)
    gen_image = processor.text_to_image(gen_text)
    gen_image_str = pil_image_to_base64(gen_image)
    # Combine the outputs into a single HTML output
    custom_output = f'''
    <h2>Image->Text->Image:</h2>
    <div style="display: flex; flex-wrap: wrap;">
        <div style="flex: 1;">
            <h3>Image2Text</h3>
            <p>{gen_text}</p>
        </div>
        <div style="flex: 1;">
            <h3>Text2Image</h3>
            <img src="data:image/jpeg;base64,{gen_image_str}" width="100%" />
        </div>
    </div>
    <h2>Using Source Image to do Retrieval on COCO:</h2>
    <div style="display: flex; flex-wrap: wrap;">
        <div style="flex: 1;">
            <h3>Retrieval Top-3 Text</h3>
            <p>{gen_text}</p>
        </div>
        <div style="flex: 1;">
            <h3>Retrieval Top-3 Image</h3>
            <img src="data:image/jpeg;base64,{gen_image_str}" width="100%" />
        </div>
    </div>
    <h2>Using Generated Texts to do Retrieval on COCO:</h2>
    <div style="display: flex; flex-wrap: wrap;">
        <div style="flex: 1;">
            <h3>Retrieval Top-3 Text</h3>
            <p>{gen_text}</p>
        </div>
        <div style="flex: 1;">
            <h3>Retrieval Top-3 Image</h3>
            <img src="data:image/jpeg;base64,{gen_image_str}" width="100%" />
        </div>
    </div>
    '''

    return custom_output

processor = ImageTextTransformation()

# Create Gradio input and output components
image_input = gr.inputs.Image(type='filepath', label="Input Image")

logo_base64 = add_logo()
# Create the title with the logo
title_with_logo = f'<img src="data:image/jpeg;base64,{logo_base64}" width="400" style="vertical-align: middle;"> Understanding Image with Text'

# Create Gradio interface
interface = gr.Interface(
    fn=lambda image: process_image(image, processor),  # Pass the processor object using a lambda function
    inputs=image_input,
    outputs=gr.outputs.HTML(),
    title=title_with_logo,
    description="""
    This demo supports image-to-text transformation. The generated text can then be used for retrieval, question answering, and other zero-shot tasks.
    """
)

# Launch the interface
interface.launch()
models/__pycache__/blip2_model.cpython-38.pyc ADDED
Binary file (1.88 kB).
models/__pycache__/blip2_model.cpython-39.pyc ADDED
Binary file (1.88 kB).
models/__pycache__/controlnet_model.cpython-38.pyc ADDED
Binary file (1.88 kB).
models/__pycache__/gpt_model.cpython-38.pyc ADDED
Binary file (2.28 kB).
models/__pycache__/grit_model.cpython-38.pyc ADDED
Binary file (1.38 kB).
models/__pycache__/image_text_transformation.cpython-38.pyc ADDED
Binary file (2.55 kB).
models/__pycache__/image_text_transformation.cpython-39.pyc ADDED
Binary file (2.55 kB).
models/__pycache__/region_semantic.cpython-38.pyc ADDED
Binary file (2.2 kB).
models/blip2_model.py ADDED
@@ -0,0 +1,38 @@
from PIL import Image
import requests
from transformers import Blip2Processor, Blip2ForConditionalGeneration
import torch


class ImageCaptioning:
    def __init__(self) -> None:
        self.device = None
        # self.processor, self.model = None, None
        self.processor, self.model = self.initialize_model()

    def initialize_model(self):
        # device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = "cpu"  # for low gpu memory devices
        if self.device == 'cpu':
            self.data_type = torch.float32
        else:
            self.data_type = torch.float16
        processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
        model = Blip2ForConditionalGeneration.from_pretrained(
            "Salesforce/blip2-opt-2.7b", torch_dtype=self.data_type
        )
        model.to(self.device)
        return processor, model

    def image_caption(self, image_src):
        image = Image.open(image_src)
        inputs = self.processor(images=image, return_tensors="pt").to(self.device, self.data_type)
        generated_ids = self.model.generate(**inputs)
        generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
        print('*' * 100 + '\nStep1, BLIP2 caption:')
        print(generated_text)
        print('\n' + '*' * 100)
        return generated_text

    def image_caption_debug(self, image_src):
        return "A dish with salmon, broccoli, and something yellow."
models/controlnet_model.py ADDED
@@ -0,0 +1,51 @@
import cv2
import torch
import numpy as np
from PIL import Image
from diffusers import (
    StableDiffusionControlNetPipeline,
    ControlNetModel,
    UniPCMultistepScheduler,
)


class TextToImage:
    def __init__(self):
        # self.model = None
        self.model = self.initialize_model()

    def initialize_model(self):
        controlnet = ControlNetModel.from_pretrained(
            "fusing/stable-diffusion-v1-5-controlnet-canny",
            torch_dtype=torch.float16,
        )
        pipeline = StableDiffusionControlNetPipeline.from_pretrained(
            "runwayml/stable-diffusion-v1-5",
            controlnet=controlnet,
            safety_checker=None,
            torch_dtype=torch.float16,
        )
        pipeline.scheduler = UniPCMultistepScheduler.from_config(
            pipeline.scheduler.config
        )
        pipeline.enable_model_cpu_offload()
        return pipeline

    @staticmethod
    def preprocess_image(image):
        image = np.array(image)
        low_threshold = 100
        high_threshold = 200
        image = cv2.Canny(image, low_threshold, high_threshold)
        image = np.stack([image, image, image], axis=2)
        image = Image.fromarray(image)
        return image

    def text_to_image(self, text, image):
        image = self.preprocess_image(image)
        generated_image = self.model(text, image, num_inference_steps=20).images[0]
        return generated_image

    def text_to_image_debug(self, text, image):
        print("text_to_image_debug")
        return image
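The pipeline above conditions Stable Diffusion on the Canny edges of a source image, so the generated picture keeps the original layout. A minimal usage sketch; the prompt and image path are placeholders, and the diffusers weights download from the Hub on first use.

```python
# Usage sketch for models/controlnet_model.py (prompt and path are hypothetical).
from PIL import Image
from models.controlnet_model import TextToImage

t2i = TextToImage()
source = Image.open("examples/test.jpg").convert("RGB")
result = t2i.text_to_image("a dish with salmon and broccoli on a plate", source)
result.save("generated.jpg")
```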
models/gpt_model.py ADDED
@@ -0,0 +1,40 @@
import openai

class ImageToText:
    def __init__(self, api_key):
        self.template = self.initialize_template()
        openai.api_key = api_key

    def initialize_template(self):
        prompt_prefix_1 = """Generate only an informative and natural paragraph based on the given information (a, b, c, d):\n"""
        prompt_prefix_2 = """\n a. Image Resolution: """
        prompt_prefix_3 = """\n b. Image Caption: """
        prompt_prefix_4 = """\n c. Dense Caption: """
        prompt_prefix_5 = """\n d. Region Semantic: """
        prompt_suffix = """\n There are some rules:
        Show object, color and position.
        Use nouns rather than coordinates to show position information of each object.
        No more than 7 sentences.
        Only use one paragraph.
        Do not include numbers.
        """
        template = f"{prompt_prefix_1}{prompt_prefix_2}{{width}}X{{height}}{prompt_prefix_3}{{caption}}{prompt_prefix_4}{{dense_caption}}{prompt_prefix_5}{{region_semantic}}{prompt_suffix}"
        return template

    def paragraph_summary_with_gpt(self, caption, dense_caption, region_semantic, width, height):
        question = self.template.format(width=width, height=height, caption=caption, dense_caption=dense_caption, region_semantic=region_semantic)
        print('*' * 100)
        print("question:", question)
        completion = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "user", "content": question}]
        )
        print("chatgpt response:", completion['choices'][0]['message']['content'])
        print('*' * 100)
        return completion['choices'][0]['message']['content']

    def paragraph_summary_with_gpt_debug(self, caption, dense_caption, width, height):
        question = self.template.format(width=width, height=height, caption=caption, dense_caption=dense_caption)
        print("paragraph_summary_with_gpt_debug:")
        return question
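`ImageToText` fills the template with the caption, dense caption, region semantics, and resolution, then sends a single chat completion request. A usage sketch; the API key and all example inputs below are placeholders.

```python
# Usage sketch for models/gpt_model.py (key and inputs are placeholders).
from models.gpt_model import ImageToText

summarizer = ImageToText(api_key="sk-...")  # placeholder OpenAI key
paragraph = summarizer.paragraph_summary_with_gpt(
    caption="a dish with salmon and broccoli",
    dense_caption="1. the broccoli is green, [0, 0, 333, 325];",
    region_semantic="plate: white; salmon: orange",
    width=640,
    height=480,
)
print(paragraph)
```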
models/grit_model.py ADDED
@@ -0,0 +1,26 @@
import os
from models.grit_src.image_dense_captions import image_caption_api

class DenseCaptioning():
    def __init__(self) -> None:
        self.model = None


    def initialize_model(self):
        pass

    def image_dense_caption_debug(self, image_src):
        dense_caption = """
        1. the broccoli is green, [0, 0, 333, 325];
        2. a piece of broccoli, [0, 147, 143, 324];
        3. silver fork on plate, [4, 547, 252, 612];
        """
        return dense_caption

    def image_dense_caption(self, image_src):
        dense_caption = image_caption_api(image_src)
        print("Step2, Dense Caption:\n")
        print(dense_caption)
        print('\n' + '*' * 100)
        return dense_caption
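`image_dense_caption_debug` returns a canned result without loading GRiT, which is handy for wiring tests. A short sketch, assuming the repo root is importable; the image path is a placeholder and is not actually read by the debug method.

```python
# Quick wiring test for models/grit_model.py that avoids downloading GRiT weights.
from models.grit_model import DenseCaptioning

dc = DenseCaptioning()
print(dc.image_dense_caption_debug("examples/test.jpg"))  # hypothetical path; unused by the debug path
```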
models/grit_src/__pycache__/image_dense_captions.cpython-38.pyc ADDED
Binary file (2.54 kB).
models/grit_src/configs/Base.yaml ADDED
@@ -0,0 +1,77 @@
MODEL:
  META_ARCHITECTURE: "GRiT"
  MASK_ON: True
  PROPOSAL_GENERATOR:
    NAME: "CenterNet"
  FPN:
    IN_FEATURES: ["layer3", "layer4", "layer5"]
  PIXEL_MEAN: [123.675, 116.280, 103.530]
  PIXEL_STD: [58.395, 57.12, 57.375]
  ROI_HEADS:
    NAME: GRiTROIHeadsAndTextDecoder
    IN_FEATURES: ["p3", "p4", "p5"]
    IOU_THRESHOLDS: [0.6]
    NUM_CLASSES: 1
    SCORE_THRESH_TEST: 0.02
    NMS_THRESH_TEST: 0.5
    OBJECT_FEAT_POOLER_RES: 14
  ROI_BOX_CASCADE_HEAD:
    IOUS: [0.6, 0.7, 0.8]
  ROI_BOX_HEAD:
    NAME: "FastRCNNConvFCHead"
    NUM_FC: 2
    POOLER_RESOLUTION: 7
    CLS_AGNOSTIC_BBOX_REG: True
    MULT_PROPOSAL_SCORE: True
  ROI_MASK_HEAD:
    NAME: "MaskRCNNConvUpsampleHead"
    NUM_CONV: 4
    POOLER_RESOLUTION: 14
    CLS_AGNOSTIC_MASK: True
  CENTERNET:
    NUM_CLASSES: 1
    REG_WEIGHT: 1.
    NOT_NORM_REG: True
    ONLY_PROPOSAL: True
    WITH_AGN_HM: True
    INFERENCE_TH: 0.0001
    PRE_NMS_TOPK_TRAIN: 4000
    POST_NMS_TOPK_TRAIN: 2000
    PRE_NMS_TOPK_TEST: 1000
    POST_NMS_TOPK_TEST: 256
    NMS_TH_TRAIN: 0.9
    NMS_TH_TEST: 0.9
    POS_WEIGHT: 0.5
    NEG_WEIGHT: 0.5
    IGNORE_HIGH_FP: 0.85
DATASETS:
  TRAIN: ("coco_2017_train",)
  TEST: ("coco_2017_val",)
DATALOADER:
  SAMPLER_TRAIN: "MultiDatasetSampler"
  DATASET_RATIO: [1]
  DATASET_INPUT_SIZE: [1024]
  DATASET_INPUT_SCALE: [[0.1, 2.0]]
  FILTER_EMPTY_ANNOTATIONS: False
  NUM_WORKERS: 8
TEST:
  DETECTIONS_PER_IMAGE: 256
SOLVER:
  LR_SCHEDULER_NAME: "WarmupCosineLR"
  CHECKPOINT_PERIOD: 10000
  WARMUP_ITERS: 1000
  WARMUP_FACTOR: 0.001
  USE_CUSTOM_SOLVER: True
  OPTIMIZER: "ADAMW"
  MAX_ITER: 180000
  IMS_PER_BATCH: 64
  BASE_LR: 0.00008
  VIT_LAYER_DECAY: True
  CLIP_GRADIENTS:
    ENABLED: True
INPUT:
  FORMAT: RGB
  CUSTOM_AUG: EfficientDetResizeCrop
  TRAIN_SIZE: 640
USE_ACT_CHECKPOINT: True
VERSION: 2
models/grit_src/configs/GRiT_B_DenseCap.yaml ADDED
@@ -0,0 +1,20 @@
_BASE_: "Base.yaml"
MODEL:
  TRAIN_TASK: ["DenseCap"]
  TEST_TASK: "DenseCap"
  MASK_ON: False
  ROI_HEADS:
    SOFT_NMS_ENABLED: False
  BEAM_SIZE: 1
  WEIGHTS: "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_base.pth"
  BACKBONE:
    NAME: build_vit_fpn_backbone
  VIT_LAYERS: 12
SOLVER:
  VIT_LAYER_DECAY_RATE: 0.7
DATASETS:
  TRAIN: ("vg_train",)
  TEST: ("vg_test",)
DATALOADER:
  DATASET_BS: 2
OUTPUT_DIR: "./output/GRiT_B_DenseCap"
models/grit_src/configs/GRiT_B_DenseCap_ObjectDet.yaml ADDED
@@ -0,0 +1,23 @@
_BASE_: "Base.yaml"
MODEL:
  TRAIN_TASK: ["ObjectDet", "DenseCap"]
  TEST_TASK: "DenseCap"  # DenseCap or ObjectDet: Choose one for testing
  MASK_ON: True
  ROI_HEADS:
    SOFT_NMS_ENABLED: False
  BEAM_SIZE: 1
  WEIGHTS: "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_base.pth"
  BACKBONE:
    NAME: build_vit_fpn_backbone
  VIT_LAYERS: 12
SOLVER:
  VIT_LAYER_DECAY_RATE: 0.7
DATASETS:
  TRAIN: ("GRiT_coco2017_train", "vg_train")
  TEST: ("coco_2017_test-dev",)
DATALOADER:
  DATASET_RATIO: [1, 1]
  DATASET_BS: 2
  DATASET_INPUT_SIZE: [1024, 1024]
  DATASET_INPUT_SCALE: [[0.1, 2.0], [0.1, 2.0]]
OUTPUT_DIR: "./output/GRiT_B_DenseCap_ObjectDet"
models/grit_src/configs/GRiT_B_ObjectDet.yaml ADDED
@@ -0,0 +1,20 @@
_BASE_: "Base.yaml"
MODEL:
  TRAIN_TASK: ["ObjectDet"]
  TEST_TASK: "ObjectDet"
  MASK_ON: True
  ROI_HEADS:
    SOFT_NMS_ENABLED: True
  BEAM_SIZE: 3
  WEIGHTS: "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_base.pth"
  BACKBONE:
    NAME: build_vit_fpn_backbone
  VIT_LAYERS: 12
SOLVER:
  VIT_LAYER_DECAY_RATE: 0.7
DATASETS:
  TRAIN: ("GRiT_coco2017_train",)
  TEST: ("coco_2017_val",)
DATALOADER:
  DATASET_BS: 2
OUTPUT_DIR: "./output/GRiT_B_ObjectDet"
models/grit_src/configs/GRiT_H_ObjectDet.yaml ADDED
@@ -0,0 +1,21 @@
_BASE_: "Base.yaml"
MODEL:
  TRAIN_TASK: ["ObjectDet"]
  TEST_TASK: "ObjectDet"
  MASK_ON: True
  ROI_HEADS:
    SOFT_NMS_ENABLED: True
  BEAM_SIZE: 3
  WEIGHTS: "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_huge_p14to16.pth"
  BACKBONE:
    NAME: build_vit_fpn_backbone_huge
  VIT_LAYERS: 32
SOLVER:
  MAX_ITER: 135000
  VIT_LAYER_DECAY_RATE: 0.9
DATASETS:
  TRAIN: ("GRiT_coco2017_train",)
  TEST: ("coco_2017_val",)
DATALOADER:
  DATASET_BS: 1
OUTPUT_DIR: "./output/GRiT_H_ObjectDet"
models/grit_src/configs/GRiT_L_ObjectDet.yaml ADDED
@@ -0,0 +1,20 @@
_BASE_: "Base.yaml"
MODEL:
  TRAIN_TASK: ["ObjectDet"]
  TEST_TASK: "ObjectDet"
  MASK_ON: True
  ROI_HEADS:
    SOFT_NMS_ENABLED: True
  BEAM_SIZE: 3
  WEIGHTS: "detectron2://ImageNetPretrained/MAE/mae_pretrain_vit_large.pth"
  BACKBONE:
    NAME: build_vit_fpn_backbone_large
  VIT_LAYERS: 24
SOLVER:
  VIT_LAYER_DECAY_RATE: 0.8
DATASETS:
  TRAIN: ("GRiT_coco2017_train",)
  TEST: ("coco_2017_val",)
DATALOADER:
  DATASET_BS: 1
OUTPUT_DIR: "./output/GRiT_L_ObjectDet"
models/grit_src/grit/__init__.py ADDED
@@ -0,0 +1,7 @@
from .modeling.meta_arch import grit
from .modeling.roi_heads import grit_roi_heads
from .modeling.backbone import vit

from .data.datasets import object365
from .data.datasets import vg
from .data.datasets import grit_coco
models/grit_src/grit/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (405 Bytes).
models/grit_src/grit/__pycache__/config.cpython-38.pyc ADDED
Binary file (1.4 kB).
models/grit_src/grit/__pycache__/predictor.cpython-38.pyc ADDED
Binary file (2.65 kB).
models/grit_src/grit/config.py ADDED
@@ -0,0 +1,50 @@
from detectron2.config import CfgNode as CN


def add_grit_config(cfg):
    _C = cfg

    _C.MODEL.BEAM_SIZE = 1
    _C.MODEL.TRAIN_TASK = ["ObjectDet", "DenseCap"]
    _C.MODEL.TEST_TASK = "DenseCap"  # This can be varied if the model is jointly trained on multiple tasks

    _C.MODEL.ROI_BOX_HEAD.USE_BIAS = 0.0  # >= 0: not use
    _C.MODEL.ROI_BOX_HEAD.MULT_PROPOSAL_SCORE = False

    _C.MODEL.ROI_HEADS.MASK_WEIGHT = 1.0
    _C.MODEL.ROI_HEADS.OBJECT_FEAT_POOLER_RES = 14
    _C.MODEL.ROI_HEADS.SOFT_NMS_ENABLED = False

    # Backbones
    _C.MODEL.VIT_LAYERS = 12

    # Text Decoder
    _C.TEXT_DECODER = CN()
    _C.TEXT_DECODER.VOCAB_SIZE = 30522
    _C.TEXT_DECODER.HIDDEN_SIZE = 768
    _C.TEXT_DECODER.NUM_LAYERS = 6
    _C.TEXT_DECODER.ATTENTION_HEADS = 12
    _C.TEXT_DECODER.FEEDFORWARD_SIZE = 768 * 4

    # Multi-dataset dataloader
    _C.DATALOADER.DATASET_RATIO = [1, 1]  # sample ratio
    _C.DATALOADER.DATASET_BS = 1
    _C.DATALOADER.DATASET_INPUT_SIZE = [1024, 1024]
    _C.DATALOADER.DATASET_INPUT_SCALE = [(0.1, 2.0), (0.1, 2.0)]
    _C.DATALOADER.DATASET_MIN_SIZES = [(640, 800), (640, 800)]
    _C.DATALOADER.DATASET_MAX_SIZES = [1333, 1333]

    _C.SOLVER.USE_CUSTOM_SOLVER = True
    _C.SOLVER.OPTIMIZER = 'ADAMW'
    _C.SOLVER.VIT_LAYER_DECAY = True
    _C.SOLVER.VIT_LAYER_DECAY_RATE = 0.7

    _C.INPUT.CUSTOM_AUG = 'EfficientDetResizeCrop'
    _C.INPUT.TRAIN_SIZE = 1024
    _C.INPUT.TEST_SIZE = 1024
    _C.INPUT.SCALE_RANGE = (0.1, 2.)
    # 'default' for fixed short / long edge
    _C.INPUT.TEST_INPUT_TYPE = 'default'

    _C.FIND_UNUSED_PARAM = True
    _C.USE_ACT_CHECKPOINT = True
models/grit_src/grit/custom_solver.py ADDED
@@ -0,0 +1,88 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# Modified by Jialian Wu from https://github.com/facebookresearch/Detic/blob/main/detic/custom_solver.py
import itertools
from typing import Any, Callable, Dict, Iterable, List, Set, Type, Union
import torch

from detectron2.config import CfgNode

from detectron2.solver.build import maybe_add_gradient_clipping


def build_custom_optimizer(cfg: CfgNode, model: torch.nn.Module) -> torch.optim.Optimizer:
    params: List[Dict[str, Any]] = []
    memo: Set[torch.nn.parameter.Parameter] = set()
    optimizer_type = cfg.SOLVER.OPTIMIZER

    for key, value in model.named_parameters(recurse=True):
        if not value.requires_grad:
            continue
        # Avoid duplicating parameters
        if value in memo:
            continue
        memo.add(value)
        lr = cfg.SOLVER.BASE_LR
        weight_decay = cfg.SOLVER.WEIGHT_DECAY

        if cfg.SOLVER.VIT_LAYER_DECAY:
            lr = lr * get_vit_lr_decay_rate(key, cfg.SOLVER.VIT_LAYER_DECAY_RATE, cfg.MODEL.VIT_LAYERS)

        param = {"params": [value], "lr": lr}
        if optimizer_type != 'ADAMW':
            param['weight_decay'] = weight_decay
        params += [param]

    def maybe_add_full_model_gradient_clipping(optim):  # optim: the optimizer class
        # detectron2 doesn't have full model gradient clipping now
        clip_norm_val = cfg.SOLVER.CLIP_GRADIENTS.CLIP_VALUE
        enable = (
            cfg.SOLVER.CLIP_GRADIENTS.ENABLED
            and cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE == "full_model"
            and clip_norm_val > 0.0
        )

        class FullModelGradientClippingOptimizer(optim):
            def step(self, closure=None):
                all_params = itertools.chain(*[x["params"] for x in self.param_groups])
                torch.nn.utils.clip_grad_norm_(all_params, clip_norm_val)
                super().step(closure=closure)

        return FullModelGradientClippingOptimizer if enable else optim

    if optimizer_type == 'SGD':
        optimizer = maybe_add_full_model_gradient_clipping(torch.optim.SGD)(
            params, cfg.SOLVER.BASE_LR, momentum=cfg.SOLVER.MOMENTUM,
            nesterov=cfg.SOLVER.NESTEROV
        )
    elif optimizer_type == 'ADAMW':
        optimizer = maybe_add_full_model_gradient_clipping(torch.optim.AdamW)(
            params, cfg.SOLVER.BASE_LR,
            weight_decay=cfg.SOLVER.WEIGHT_DECAY
        )
    else:
        raise NotImplementedError(f"no optimizer type {optimizer_type}")
    if not cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE == "full_model":
        optimizer = maybe_add_gradient_clipping(cfg, optimizer)
    return optimizer


def get_vit_lr_decay_rate(name, lr_decay_rate=1.0, num_layers=12):
    """
    Calculate lr decay rate for different ViT blocks.
    Args:
        name (string): parameter name.
        lr_decay_rate (float): base lr decay rate.
        num_layers (int): number of ViT blocks.

    Returns:
        lr decay rate for the given parameter.
    """
    layer_id = num_layers + 1
    if name.startswith("backbone"):
        if ".pos_embed" in name or ".patch_embed" in name:
            layer_id = 0
        elif ".blocks." in name and ".residual." not in name:
            layer_id = int(name[name.find(".blocks.") :].split(".")[2]) + 1

    return lr_decay_rate ** (num_layers + 1 - layer_id)
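To make the layer-wise decay concrete, here is a small worked example of the arithmetic in `get_vit_lr_decay_rate` (not part of the commit; the parameter name is illustrative):

```python
# With lr_decay_rate=0.7 and num_layers=12, a parameter named
# "backbone.net.blocks.3.attn.qkv.weight" gets layer_id = 3 + 1 = 4,
# so its LR multiplier is 0.7 ** (12 + 1 - 4) = 0.7 ** 9.
# A non-backbone parameter keeps layer_id = 13 and multiplier 0.7 ** 0 = 1.0.
multiplier = 0.7 ** (12 + 1 - 4)
print(round(multiplier, 4))  # ~0.0404
```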
models/grit_src/grit/data/__pycache__/custom_build_augmentation.cpython-38.pyc ADDED
Binary file (1.21 kB).
models/grit_src/grit/data/__pycache__/custom_dataset_mapper.cpython-38.pyc ADDED
Binary file (5.68 kB).
models/grit_src/grit/data/custom_build_augmentation.py ADDED
@@ -0,0 +1,44 @@
# Copyright (c) Facebook, Inc. and its affiliates.
from detectron2.data import transforms as T
from .transforms.custom_augmentation_impl import EfficientDetResizeCrop


def build_custom_augmentation(cfg, is_train, scale=None, size=None, \
                              min_size=None, max_size=None):
    """
    Create a list of default :class:`Augmentation` from config.
    Now it includes resizing and flipping.

    Returns:
        list[Augmentation]
    """
    if cfg.INPUT.CUSTOM_AUG == 'ResizeShortestEdge':
        if is_train:
            min_size = cfg.INPUT.MIN_SIZE_TRAIN if min_size is None else min_size
            max_size = cfg.INPUT.MAX_SIZE_TRAIN if max_size is None else max_size
            sample_style = cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING
        else:
            min_size = cfg.INPUT.MIN_SIZE_TEST
            max_size = cfg.INPUT.MAX_SIZE_TEST
            sample_style = "choice"
        augmentation = [T.ResizeShortestEdge(min_size, max_size, sample_style)]
    elif cfg.INPUT.CUSTOM_AUG == 'EfficientDetResizeCrop':
        if is_train:
            scale = cfg.INPUT.SCALE_RANGE if scale is None else scale
            size = cfg.INPUT.TRAIN_SIZE if size is None else size
        else:
            scale = (1, 1)
            size = cfg.INPUT.TEST_SIZE
        augmentation = [EfficientDetResizeCrop(size, scale)]
    else:
        assert 0, cfg.INPUT.CUSTOM_AUG

    if is_train:
        augmentation.append(T.RandomFlip())
    return augmentation


build_custom_transform_gen = build_custom_augmentation
"""
Alias for backward-compatibility.
"""
models/grit_src/grit/data/custom_dataset_dataloader.py ADDED
@@ -0,0 +1,250 @@
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Jialian Wu from https://github.com/facebookresearch/Detic/blob/main/detic/data/custom_dataset_dataloader.py
import operator
import torch
import torch.utils.data
from detectron2.utils.comm import get_world_size

from detectron2.config import configurable
from torch.utils.data.sampler import BatchSampler, Sampler
from detectron2.data.common import DatasetFromList, MapDataset
from detectron2.data.dataset_mapper import DatasetMapper
from detectron2.data.build import get_detection_dataset_dicts, build_batch_data_loader
from detectron2.data.samplers import TrainingSampler
from detectron2.data.build import worker_init_reset_seed, print_instances_class_histogram
from detectron2.data.build import filter_images_with_only_crowd_annotations
from detectron2.data.build import filter_images_with_few_keypoints
from detectron2.data.build import check_metadata_consistency
from detectron2.data.catalog import MetadataCatalog, DatasetCatalog
from detectron2.utils import comm
import itertools
from typing import Optional


def _custom_train_loader_from_config(cfg, mapper=None, *, dataset=None, sampler=None):
    sampler_name = cfg.DATALOADER.SAMPLER_TRAIN
    if 'MultiDataset' in sampler_name:
        dataset_dicts = get_detection_dataset_dicts_with_source(
            cfg.DATASETS.TRAIN,
            filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS,
            min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE
            if cfg.MODEL.KEYPOINT_ON else 0,
            proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None,
        )
    else:
        dataset_dicts = get_detection_dataset_dicts(
            cfg.DATASETS.TRAIN,
            filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS,
            min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE
            if cfg.MODEL.KEYPOINT_ON else 0,
            proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None,
        )

    if mapper is None:
        mapper = DatasetMapper(cfg, True)

    if sampler is not None:
        pass
    elif sampler_name == "TrainingSampler":
        sampler = TrainingSampler(len(dataset))
    elif sampler_name == "MultiDatasetSampler":
        sampler = MultiDatasetSampler(
            dataset_dicts,
            dataset_ratio=cfg.DATALOADER.DATASET_RATIO,
        )
    else:
        raise ValueError("Unknown training sampler: {}".format(sampler_name))

    return {
        "dataset": dataset_dicts,
        "sampler": sampler,
        "mapper": mapper,
        "total_batch_size": cfg.SOLVER.IMS_PER_BATCH,
        "num_workers": cfg.DATALOADER.NUM_WORKERS,
        'dataset_bs': cfg.DATALOADER.DATASET_BS,
        'num_datasets': len(cfg.DATASETS.TRAIN)
    }


@configurable(from_config=_custom_train_loader_from_config)
def build_custom_train_loader(
        dataset, *, mapper, sampler,
        total_batch_size=16,
        num_workers=0,
        num_datasets=1,
        dataset_bs=1
):

    if isinstance(dataset, list):
        dataset = DatasetFromList(dataset, copy=False)
    if mapper is not None:
        dataset = MapDataset(dataset, mapper)
    if sampler is None:
        sampler = TrainingSampler(len(dataset))
    assert isinstance(sampler, torch.utils.data.sampler.Sampler)

    return build_dataset_batch_data_loader(
        dataset_bs,
        dataset,
        sampler,
        total_batch_size,
        num_datasets=num_datasets,
        num_workers=num_workers,
    )


def build_dataset_batch_data_loader(
        dataset_bs, dataset, sampler, total_batch_size, num_datasets, num_workers=0
):

    world_size = get_world_size()
    assert (
        total_batch_size > 0 and total_batch_size % world_size == 0
    ), "Total batch size ({}) must be divisible by the number of gpus ({}).".format(
        total_batch_size, world_size
    )

    data_loader = torch.utils.data.DataLoader(
        dataset,
        sampler=sampler,
        num_workers=num_workers,
        batch_sampler=None,
        collate_fn=operator.itemgetter(0),  # don't batch, but yield individual elements
        worker_init_fn=worker_init_reset_seed,
    )

    if num_datasets > 1:
        return MultiDatasets(data_loader, dataset_bs, num_datasets)
    else:
        return SingleDataset(data_loader, dataset_bs)


def get_detection_dataset_dicts_with_source(
        dataset_names, filter_empty=True, min_keypoints=0, proposal_files=None
):
    assert len(dataset_names)
    dataset_dicts = [DatasetCatalog.get(dataset_name) for dataset_name in dataset_names]
    for dataset_name, dicts in zip(dataset_names, dataset_dicts):
        assert len(dicts), "Dataset '{}' is empty!".format(dataset_name)

    for source_id, (dataset_name, dicts) in \
            enumerate(zip(dataset_names, dataset_dicts)):
        assert len(dicts), "Dataset '{}' is empty!".format(dataset_name)
        for d in dicts:
            d['dataset_source'] = source_id

        if "annotations" in dicts[0]:
            try:
                class_names = MetadataCatalog.get(dataset_name).thing_classes
                check_metadata_consistency("thing_classes", dataset_name)
                print_instances_class_histogram(dicts, class_names)
            except AttributeError:  # class names are not available for this dataset
                pass

    assert proposal_files is None

    dataset_dicts = list(itertools.chain.from_iterable(dataset_dicts))

    has_instances = "annotations" in dataset_dicts[0]
    if filter_empty and has_instances:
        dataset_dicts = filter_images_with_only_crowd_annotations(dataset_dicts)
    if min_keypoints > 0 and has_instances:
        dataset_dicts = filter_images_with_few_keypoints(dataset_dicts, min_keypoints)

    return dataset_dicts


class MultiDatasetSampler(Sampler):
    def __init__(
            self,
            dataset_dicts,
            dataset_ratio,
            seed: Optional[int] = None,
    ):
        sizes = [0 for _ in range(len(dataset_ratio))]
        for d in dataset_dicts:
            sizes[d['dataset_source']] += 1
        print('dataset sizes', sizes)
        self.sizes = sizes
        assert len(dataset_ratio) == len(sizes), \
            'length of dataset ratio {} should be equal to the number of datasets {}'.format(
                len(dataset_ratio), len(sizes)
            )
        if seed is None:
            seed = comm.shared_random_seed()
        self._seed = int(seed)
        self._rank = comm.get_rank()
        self._world_size = comm.get_world_size()

        self.dataset_ids = torch.tensor(
            [d['dataset_source'] for d in dataset_dicts], dtype=torch.long)
        self.dataset_ratio = dataset_ratio

        dataset_weight = [torch.ones(s) * max(sizes) / s * r / sum(dataset_ratio) \
                          for i, (r, s) in enumerate(zip(dataset_ratio, sizes))]
        dataset_weight = torch.cat(dataset_weight)

        self.weights = dataset_weight
        self.sample_epoch_size = len(self.weights)

    def __iter__(self):
        start = self._rank
        yield from itertools.islice(
            self._infinite_indices(), start, None, self._world_size)

    def _infinite_indices(self):
        g = torch.Generator()
        g.manual_seed(self._seed)
        while True:
            if len(self.dataset_ratio) > 1:
                # multiple datasets
                ids = torch.multinomial(
                    self.weights, self.sample_epoch_size, generator=g,
                    replacement=True)
                nums = [(self.dataset_ids[ids] == i).sum().int().item() \
                        for i in range(len(self.sizes))]
                yield from ids
            else:
                # single dataset
                yield from torch.randperm(self.sizes[0], generator=g).tolist()


class SingleDataset(torch.utils.data.IterableDataset):
    def __init__(self, dataset, batch_sizes):
        self.dataset = dataset
        self.batch_sizes = batch_sizes
        self._buckets = [[] for _ in range(2)]

    def __iter__(self):
        for d in self.dataset:
            w, h = d["width"], d["height"]
            aspect_ratio_bucket_id = 0 if w > h else 1
            bucket_id = aspect_ratio_bucket_id
            bucket = self._buckets[bucket_id]
            bucket.append(d)
            if len(bucket) == self.batch_sizes:
                yield bucket[:]
                del bucket[:]


class MultiDatasets(torch.utils.data.IterableDataset):
    def __init__(self, dataset, batch_sizes, num_datasets):
        self.dataset = dataset
        self.batch_sizes = batch_sizes
        self._buckets = [[] for _ in range(2 * num_datasets)]
        self.iter_idx = 0
        self.num_datasets = num_datasets

    def __iter__(self):
        for d in self.dataset:
            w, h = d["width"], d["height"]
            aspect_ratio_bucket_id = 0 if w > h else 1
            bucket_id = d['dataset_source'] * 2 + aspect_ratio_bucket_id
            bucket = self._buckets[bucket_id]
            if len(bucket) < self.batch_sizes:
                bucket.append(d)
            selected_dataset = self.iter_idx % self.num_datasets
            if len(bucket) == self.batch_sizes and selected_dataset == d['dataset_source']:
                self.iter_idx += 1
                yield bucket[:]
                del bucket[:]
models/grit_src/grit/data/custom_dataset_mapper.py ADDED
@@ -0,0 +1,149 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# Modified by Jialian Wu from https://github.com/facebookresearch/Detic/blob/main/detic/data/custom_dataset_mapper.py
import copy
import numpy as np
import torch

from detectron2.config import configurable

from detectron2.data import detection_utils as utils
from detectron2.data import transforms as T
from detectron2.data.dataset_mapper import DatasetMapper
from .custom_build_augmentation import build_custom_augmentation
from itertools import compress
import logging

__all__ = ["CustomDatasetMapper", "ObjDescription"]
logger = logging.getLogger(__name__)


class CustomDatasetMapper(DatasetMapper):
    @configurable
    def __init__(self, is_train: bool,
                 dataset_augs=[],
                 **kwargs):
        if is_train:
            self.dataset_augs = [T.AugmentationList(x) for x in dataset_augs]
        super().__init__(is_train, **kwargs)

    @classmethod
    def from_config(cls, cfg, is_train: bool = True):
        ret = super().from_config(cfg, is_train)
        if is_train:
            if cfg.INPUT.CUSTOM_AUG == 'EfficientDetResizeCrop':
                dataset_scales = cfg.DATALOADER.DATASET_INPUT_SCALE
                dataset_sizes = cfg.DATALOADER.DATASET_INPUT_SIZE
                ret['dataset_augs'] = [
                    build_custom_augmentation(cfg, True, scale, size) \
                    for scale, size in zip(dataset_scales, dataset_sizes)]
            else:
                assert cfg.INPUT.CUSTOM_AUG == 'ResizeShortestEdge'
                min_sizes = cfg.DATALOADER.DATASET_MIN_SIZES
                max_sizes = cfg.DATALOADER.DATASET_MAX_SIZES
                ret['dataset_augs'] = [
                    build_custom_augmentation(
                        cfg, True, min_size=mi, max_size=ma) \
                    for mi, ma in zip(min_sizes, max_sizes)]
        else:
            ret['dataset_augs'] = []

        return ret

    def __call__(self, dataset_dict):
        dataset_dict_out = self.prepare_data(dataset_dict)

        # When augmented image is too small, do re-augmentation
        retry = 0
        while (dataset_dict_out["image"].shape[1] < 32 or dataset_dict_out["image"].shape[2] < 32):
            retry += 1
            if retry == 100:
                logger.info('Retry 100 times for augmentation. Make sure the image size is not too small.')
                logger.info('Find image information below')
                logger.info(dataset_dict)
            dataset_dict_out = self.prepare_data(dataset_dict)

        return dataset_dict_out

    def prepare_data(self, dataset_dict_in):
        dataset_dict = copy.deepcopy(dataset_dict_in)
        if 'file_name' in dataset_dict:
            ori_image = utils.read_image(
                dataset_dict["file_name"], format=self.image_format)
        else:
            ori_image, _, _ = self.tar_dataset[dataset_dict["tar_index"]]
            ori_image = utils._apply_exif_orientation(ori_image)
            ori_image = utils.convert_PIL_to_numpy(ori_image, self.image_format)
        utils.check_image_size(dataset_dict, ori_image)

        aug_input = T.AugInput(copy.deepcopy(ori_image), sem_seg=None)
        if self.is_train:
            transforms = \
                self.dataset_augs[dataset_dict['dataset_source']](aug_input)
        else:
            transforms = self.augmentations(aug_input)
        image, sem_seg_gt = aug_input.image, aug_input.sem_seg

        image_shape = image.shape[:2]
        dataset_dict["image"] = torch.as_tensor(
            np.ascontiguousarray(image.transpose(2, 0, 1)))

        if not self.is_train:
            # USER: Modify this if you want to keep them for some reason.
            dataset_dict.pop("annotations", None)
            return dataset_dict

        if "annotations" in dataset_dict:
            if len(dataset_dict["annotations"]) > 0:
                object_descriptions = [an['object_description'] for an in dataset_dict["annotations"]]
            else:
                object_descriptions = []
            # USER: Modify this if you want to keep them for some reason.
            for anno in dataset_dict["annotations"]:
                if not self.use_instance_mask:
                    anno.pop("segmentation", None)
                if not self.use_keypoint:
                    anno.pop("keypoints", None)

            all_annos = [
                (utils.transform_instance_annotations(
                    obj, transforms, image_shape,
                    keypoint_hflip_indices=self.keypoint_hflip_indices,
                ), obj.get("iscrowd", 0))
                for obj in dataset_dict.pop("annotations")
            ]
            annos = [ann[0] for ann in all_annos if ann[1] == 0]
            instances = utils.annotations_to_instances(
                annos, image_shape, mask_format=self.instance_mask_format
            )

            instances.gt_object_descriptions = ObjDescription(object_descriptions)

            del all_annos
            if self.recompute_boxes:
                instances.gt_boxes = instances.gt_masks.get_bounding_boxes()
            dataset_dict["instances"] = utils.filter_empty_instances(instances)

        return dataset_dict


class ObjDescription:
    def __init__(self, object_descriptions):
        self.data = object_descriptions

    def __getitem__(self, item):
        assert type(item) == torch.Tensor
        assert item.dim() == 1
        if len(item) > 0:
            assert item.dtype == torch.int64 or item.dtype == torch.bool
            if item.dtype == torch.int64:
                return ObjDescription([self.data[x.item()] for x in item])
            elif item.dtype == torch.bool:
                return ObjDescription(list(compress(self.data, item)))

        return ObjDescription(list(compress(self.data, item)))

    def __len__(self):
        return len(self.data)

    def __repr__(self):
        return "ObjDescription({})".format(self.data)
models/grit_src/grit/data/datasets/__pycache__/grit_coco.cpython-38.pyc ADDED
Binary file (3.94 kB).
models/grit_src/grit/data/datasets/__pycache__/object365.cpython-38.pyc ADDED
Binary file (3.7 kB).
models/grit_src/grit/data/datasets/__pycache__/vg.cpython-38.pyc ADDED
Binary file (3.28 kB).
models/grit_src/grit/data/datasets/grit_coco.py ADDED
@@ -0,0 +1,112 @@
import logging
import os
from fvcore.common.timer import Timer
from detectron2.structures import BoxMode
from fvcore.common.file_io import PathManager
from detectron2.data import DatasetCatalog, MetadataCatalog
from lvis import LVIS

logger = logging.getLogger(__name__)

__all__ = ["load_GRiTcoco_json", "register_GRiTcoco_instances"]


def register_GRiTcoco_instances(name, metadata, json_file, image_root):
    """
    """
    DatasetCatalog.register(name, lambda: load_GRiTcoco_json(
        json_file, image_root, name))
    MetadataCatalog.get(name).set(
        json_file=json_file, image_root=image_root,
        evaluator_type="coco", **metadata
    )


def get_GRiTcoco_meta():
    categories = [{'supercategory': 'object', 'id': 1, 'name': 'object'}]
    categories = sorted(categories, key=lambda x: x["id"])
    thing_classes = [k["name"] for k in categories]
    meta = {"thing_classes": thing_classes}
    return meta


def load_GRiTcoco_json(json_file, image_root, dataset_name=None):
    '''
    Load COCO class name text for object description for GRiT
    '''

    json_file = PathManager.get_local_path(json_file)

    timer = Timer()
    lvis_api = LVIS(json_file)
    if timer.seconds() > 1:
        logger.info("Loading {} takes {:.2f} seconds.".format(
            json_file, timer.seconds()))

    class_names = {}
    sort_cat = sorted(lvis_api.dataset['categories'], key=lambda x: x['id'])
    for x in sort_cat:
        class_names[x['id']] = x['name']

    img_ids = sorted(lvis_api.imgs.keys())
    imgs = lvis_api.load_imgs(img_ids)
    anns = [lvis_api.img_ann_map[img_id] for img_id in img_ids]

    ann_ids = [ann["id"] for anns_per_image in anns for ann in anns_per_image]
    assert len(set(ann_ids)) == len(ann_ids), \
        "Annotation ids in '{}' are not unique".format(json_file)

    imgs_anns = list(zip(imgs, anns))
    logger.info("Loaded {} images in the LVIS v1 format from {}".format(
        len(imgs_anns), json_file))

    dataset_dicts = []

    for (img_dict, anno_dict_list) in imgs_anns:
        record = {}
        if "file_name" in img_dict:
            file_name = img_dict["file_name"]
            record["file_name"] = os.path.join(image_root, file_name)

        record["height"] = int(img_dict["height"])
        record["width"] = int(img_dict["width"])
        image_id = record["image_id"] = img_dict["id"]

        objs = []
        for anno in anno_dict_list:
            assert anno["image_id"] == image_id
            if anno.get('iscrowd', 0) > 0:
                continue
            obj = {"bbox": anno["bbox"], "bbox_mode": BoxMode.XYWH_ABS}
            obj["category_id"] = 0
            obj["object_description"] = class_names[anno['category_id']]
            if 'segmentation' in anno:
                segm = anno["segmentation"]
                valid_segm = [poly for poly in segm \
                              if len(poly) % 2 == 0 and len(poly) >= 6]
                if not len(segm) == len(valid_segm):
                    print('Annotation contains an invalid polygon with < 3 points')
                assert len(segm) > 0
                obj["segmentation"] = segm
            objs.append(obj)
        record["annotations"] = objs
        if len(record["annotations"]) == 0:
            continue
        record["task"] = "ObjectDet"
        dataset_dicts.append(record)

    return dataset_dicts


_CUSTOM_SPLITS_LVIS = {
    "GRiT_coco2017_train": ("coco/train2017/", "coco/annotations/instances_train2017.json"),
}


for key, (image_root, json_file) in _CUSTOM_SPLITS_LVIS.items():
    register_GRiTcoco_instances(
        key,
        get_GRiTcoco_meta(),
        os.path.join("datasets", json_file) if "://" not in json_file else json_file,
        os.path.join("datasets", image_root),
    )
models/grit_src/grit/data/datasets/object365.py ADDED
@@ -0,0 +1,111 @@
import logging
import os
from fvcore.common.timer import Timer
from detectron2.structures import BoxMode
from fvcore.common.file_io import PathManager
from detectron2.data import DatasetCatalog, MetadataCatalog
from lvis import LVIS

logger = logging.getLogger(__name__)

__all__ = ["load_o365_json", "register_o365_instances"]


def register_o365_instances(name, metadata, json_file, image_root):
    DatasetCatalog.register(name, lambda: load_o365_json(
        json_file, image_root, name))
    MetadataCatalog.get(name).set(
        json_file=json_file, image_root=image_root,
        evaluator_type="lvis", **metadata
    )


def get_o365_meta():
    categories = [{'supercategory': 'object', 'id': 1, 'name': 'object'}]
    o365_categories = sorted(categories, key=lambda x: x["id"])
    thing_classes = [k["name"] for k in o365_categories]
    meta = {"thing_classes": thing_classes}
    return meta


def load_o365_json(json_file, image_root, dataset_name=None):
    '''
    Load Object365 class name text for object description for GRiT
    '''

    json_file = PathManager.get_local_path(json_file)

    timer = Timer()
    lvis_api = LVIS(json_file)
    if timer.seconds() > 1:
        logger.info("Loading {} takes {:.2f} seconds.".format(
            json_file, timer.seconds()))

    class_names = {}
    sort_cat = sorted(lvis_api.dataset['categories'], key=lambda x: x['id'])
    for x in sort_cat:
        if '/' in x['name']:
            text = ''
            for xx in x['name'].split('/'):
                text += xx
                text += ' '
            text = text[:-1]
        else:
            text = x['name']
        class_names[x['id']] = text

    img_ids = sorted(lvis_api.imgs.keys())
    imgs = lvis_api.load_imgs(img_ids)
    anns = [lvis_api.img_ann_map[img_id] for img_id in img_ids]

    ann_ids = [ann["id"] for anns_per_image in anns for ann in anns_per_image]
    assert len(set(ann_ids)) == len(ann_ids), \
        "Annotation ids in '{}' are not unique".format(json_file)

    imgs_anns = list(zip(imgs, anns))
    logger.info("Loaded {} images in the LVIS v1 format from {}".format(
        len(imgs_anns), json_file))

    dataset_dicts = []

    for (img_dict, anno_dict_list) in imgs_anns:
        record = {}
        if "file_name" in img_dict:
            file_name = img_dict["file_name"]
            record["file_name"] = os.path.join(image_root, file_name)

        record["height"] = int(img_dict["height"])
        record["width"] = int(img_dict["width"])
        image_id = record["image_id"] = img_dict["id"]

        objs = []
        for anno in anno_dict_list:
            assert anno["image_id"] == image_id
            if anno.get('iscrowd', 0) > 0:
                continue
            obj = {"bbox": anno["bbox"], "bbox_mode": BoxMode.XYWH_ABS}
            obj["category_id"] = 0
            obj["object_description"] = class_names[anno['category_id']]

            objs.append(obj)
        record["annotations"] = objs
        if len(record["annotations"]) == 0:
            continue
        record["task"] = "ObjectDet"
        dataset_dicts.append(record)

    return dataset_dicts


_CUSTOM_SPLITS_LVIS = {
    "object365_train": ("object365/images/train/", "object365/annotations/train_v1.json"),
}


for key, (image_root, json_file) in _CUSTOM_SPLITS_LVIS.items():
    register_o365_instances(
        key,
        get_o365_meta(),
        os.path.join("datasets", json_file) if "://" not in json_file else json_file,
        os.path.join("datasets", image_root),
    )
models/grit_src/grit/data/datasets/vg.py ADDED
@@ -0,0 +1,98 @@
import logging
import os
from fvcore.common.timer import Timer
from detectron2.structures import BoxMode
from fvcore.common.file_io import PathManager
from detectron2.data import DatasetCatalog, MetadataCatalog
from lvis import LVIS

logger = logging.getLogger(__name__)

__all__ = ["load_vg_json", "register_vg_instances"]


def register_vg_instances(name, metadata, json_file, image_root):
    """
    """
    DatasetCatalog.register(name, lambda: load_vg_json(
        json_file, image_root, name))
    MetadataCatalog.get(name).set(
        json_file=json_file, image_root=image_root,
        evaluator_type="vg", **metadata
    )


def get_vg_meta():
    categories = [{'supercategory': 'object', 'id': 1, 'name': 'object'}]
    vg_categories = sorted(categories, key=lambda x: x["id"])
    thing_classes = [k["name"] for k in vg_categories]
    meta = {"thing_classes": thing_classes}
    return meta


def load_vg_json(json_file, image_root, dataset_name=None):

    json_file = PathManager.get_local_path(json_file)

    timer = Timer()
    lvis_api = LVIS(json_file)
    if timer.seconds() > 1:
        logger.info("Loading {} takes {:.2f} seconds.".format(
            json_file, timer.seconds()))

    img_ids = sorted(lvis_api.imgs.keys())
    imgs = lvis_api.load_imgs(img_ids)
    anns = [lvis_api.img_ann_map[img_id] for img_id in img_ids]

    ann_ids = [ann["id"] for anns_per_image in anns for ann in anns_per_image]
    assert len(set(ann_ids)) == len(ann_ids), \
        "Annotation ids in '{}' are not unique".format(json_file)

    imgs_anns = list(zip(imgs, anns))
    logger.info("Loaded {} images in the LVIS v1 format from {}".format(
        len(imgs_anns), json_file))

    dataset_dicts = []

    for (img_dict, anno_dict_list) in imgs_anns:
        record = {}
        if "file_name" in img_dict:
            file_name = img_dict["file_name"]
            record["file_name"] = os.path.join(image_root, file_name)

        record["height"] = int(img_dict["height"])
        record["width"] = int(img_dict["width"])
        image_id = record["image_id"] = img_dict["id"]

        objs = []
        for anno in anno_dict_list:
            assert anno["image_id"] == image_id
            if anno.get('iscrowd', 0) > 0:
                continue
            obj = {"bbox": anno["bbox"], "bbox_mode": BoxMode.XYWH_ABS}
            obj["category_id"] = 0
            obj["object_description"] = anno["caption"]

            objs.append(obj)
        record["annotations"] = objs
        if len(record["annotations"]) == 0:
            continue
        record["task"] = "DenseCap"
        dataset_dicts.append(record)

    return dataset_dicts


_CUSTOM_SPLITS_LVIS = {
    "vg_train": ("vg/images", "vg/annotations/train.json"),
    "vg_test": ("vg/images", "vg/annotations/test.json"),
}


for key, (image_root, json_file) in _CUSTOM_SPLITS_LVIS.items():
    register_vg_instances(
        key,
        get_vg_meta(),
        os.path.join("datasets", json_file) if "://" not in json_file else json_file,
        os.path.join("datasets", image_root),
    )
models/grit_src/grit/data/transforms/__pycache__/custom_augmentation_impl.cpython-38.pyc ADDED
Binary file (1.73 kB).
models/grit_src/grit/data/transforms/__pycache__/custom_transform.cpython-38.pyc ADDED
Binary file (3.89 kB).
models/grit_src/grit/data/transforms/custom_augmentation_impl.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
3
+ # Part of the code is from https://github.com/rwightman/efficientdet-pytorch/blob/master/effdet/data/transforms.py
4
+ # Modified by Xingyi Zhou
5
+ # The original code is under Apache-2.0 License
6
+ import numpy as np
7
+ from PIL import Image
8
+
9
+ from detectron2.data.transforms.augmentation import Augmentation
10
+ from .custom_transform import EfficientDetResizeCropTransform
11
+
12
+ __all__ = [
13
+ "EfficientDetResizeCrop",
14
+ ]
15
+
16
+
17
+ class EfficientDetResizeCrop(Augmentation):
18
+ """
19
+ Scale the shorter edge to the given size, with a limit of `max_size` on the longer edge.
20
+ If `max_size` is reached, then downscale so that the longer edge does not exceed max_size.
21
+ """
22
+
23
+ def __init__(
24
+ self, size, scale, interp=Image.BILINEAR
25
+ ):
26
+ """
27
+ """
28
+ super().__init__()
29
+ self.target_size = (size, size)
30
+ self.scale = scale
31
+ self.interp = interp
32
+
33
+ def get_transform(self, img):
34
+ # Select a random scale factor.
35
+ scale_factor = np.random.uniform(*self.scale)
36
+ scaled_target_height = scale_factor * self.target_size[0]
37
+ scaled_target_width = scale_factor * self.target_size[1]
38
+ # Recompute the accurate scale_factor using rounded scaled image size.
39
+ width, height = img.shape[1], img.shape[0]
40
+ img_scale_y = scaled_target_height / height
41
+ img_scale_x = scaled_target_width / width
42
+ img_scale = min(img_scale_y, img_scale_x)
43
+
44
+ # Select non-zero random offset (x, y) if scaled image is larger than target size
45
+ scaled_h = int(height * img_scale)
46
+ scaled_w = int(width * img_scale)
47
+ offset_y = scaled_h - self.target_size[0]
48
+ offset_x = scaled_w - self.target_size[1]
49
+ offset_y = int(max(0.0, float(offset_y)) * np.random.uniform(0, 1))
50
+ offset_x = int(max(0.0, float(offset_x)) * np.random.uniform(0, 1))
51
+ return EfficientDetResizeCropTransform(
52
+ scaled_h, scaled_w, offset_y, offset_x, img_scale, self.target_size, self.interp)
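
A small sketch of applying this augmentation directly; the image shape and the size/scale values below are illustrative, not taken from the repo's configs:

import numpy as np
from models.grit_src.grit.data.transforms.custom_augmentation_impl import EfficientDetResizeCrop

aug = EfficientDetResizeCrop(size=1024, scale=(0.1, 2.0))       # illustrative values
img = np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8)  # HWC uint8 image
tfm = aug.get_transform(img)   # an EfficientDetResizeCropTransform (defined in the next file)
out = tfm.apply_image(img)     # resize by a random scale, then crop toward 1024x1024
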
models/grit_src/grit/data/transforms/custom_transform.py ADDED
@@ -0,0 +1,115 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
3
+ # Part of the code is from https://github.com/rwightman/efficientdet-pytorch/blob/master/effdet/data/transforms.py
4
+ # Modified by Xingyi Zhou
5
+ # The original code is under Apache-2.0 License
6
+ import numpy as np
7
+ import torch
8
+ import torch.nn.functional as F
9
+ from fvcore.transforms.transform import (
10
+ CropTransform,
11
+ HFlipTransform,
12
+ NoOpTransform,
13
+ Transform,
14
+ TransformList,
15
+ )
16
+ from PIL import Image
17
+
18
+ try:
19
+ import cv2 # noqa
20
+ except ImportError:
21
+ # OpenCV is an optional dependency at the moment
22
+ pass
23
+
24
+ __all__ = [
25
+ "EfficientDetResizeCropTransform",
26
+ ]
27
+
28
+
29
+ class EfficientDetResizeCropTransform(Transform):
30
+ """
31
+ """
32
+
33
+ def __init__(self, scaled_h, scaled_w, offset_y, offset_x, img_scale, \
34
+ target_size, interp=None):
35
+ """
36
+ Args:
37
+ h, w (int): original image size
38
+ new_h, new_w (int): new image size
39
+ interp: PIL interpolation methods, defaults to bilinear.
40
+ """
41
+ # TODO decide on PIL vs opencv
42
+ super().__init__()
43
+ if interp is None:
44
+ interp = Image.BILINEAR
45
+ self._set_attributes(locals())
46
+
47
+ def apply_image(self, img, interp=None):
48
+ assert len(img.shape) <= 4
49
+
50
+ if img.dtype == np.uint8:
51
+ pil_image = Image.fromarray(img)
52
+ interp_method = interp if interp is not None else self.interp
53
+ pil_image = pil_image.resize((self.scaled_w, self.scaled_h), interp_method)
54
+ ret = np.asarray(pil_image)
55
+ right = min(self.scaled_w, self.offset_x + self.target_size[1])
56
+ lower = min(self.scaled_h, self.offset_y + self.target_size[0])
57
+ if len(ret.shape) <= 3:
58
+ ret = ret[self.offset_y: lower, self.offset_x: right]
59
+ else:
60
+ ret = ret[..., self.offset_y: lower, self.offset_x: right, :]
61
+ else:
62
+ # PIL only supports uint8
63
+ img = torch.from_numpy(img)
64
+ shape = list(img.shape)
65
+ shape_4d = shape[:2] + [1] * (4 - len(shape)) + shape[2:]
66
+ img = img.view(shape_4d).permute(2, 3, 0, 1) # hw(c) -> nchw
67
+ _PIL_RESIZE_TO_INTERPOLATE_MODE = {Image.BILINEAR: "bilinear", Image.BICUBIC: "bicubic"}
68
+ mode = _PIL_RESIZE_TO_INTERPOLATE_MODE[self.interp]
69
+ img = F.interpolate(img, (self.scaled_h, self.scaled_w), mode=mode, align_corners=False)
70
+ shape[:2] = (self.scaled_h, self.scaled_w)
71
+ ret = img.permute(2, 3, 0, 1).view(shape).numpy() # nchw -> hw(c)
72
+ right = min(self.scaled_w, self.offset_x + self.target_size[1])
73
+ lower = min(self.scaled_h, self.offset_y + self.target_size[0])
74
+ if len(ret.shape) <= 3:
75
+ ret = ret[self.offset_y: lower, self.offset_x: right]
76
+ else:
77
+ ret = ret[..., self.offset_y: lower, self.offset_x: right, :]
78
+ return ret
79
+
80
+
81
+ def apply_coords(self, coords):
82
+ coords[:, 0] = coords[:, 0] * self.img_scale
83
+ coords[:, 1] = coords[:, 1] * self.img_scale
84
+ coords[:, 0] -= self.offset_x
85
+ coords[:, 1] -= self.offset_y
86
+ return coords
87
+
88
+
89
+ def apply_segmentation(self, segmentation):
90
+ segmentation = self.apply_image(segmentation, interp=Image.NEAREST)
91
+ return segmentation
92
+
93
+
94
+ def inverse(self):
95
+ raise NotImplementedError
96
+
97
+
98
+ def inverse_apply_coords(self, coords):
99
+ coords[:, 0] += self.offset_x
100
+ coords[:, 1] += self.offset_y
101
+ coords[:, 0] = coords[:, 0] / self.img_scale
102
+ coords[:, 1] = coords[:, 1] / self.img_scale
103
+ return coords
104
+
105
+
106
+ def inverse_apply_box(self, box: np.ndarray) -> np.ndarray:
107
+ """
108
+ """
109
+ idxs = np.array([(0, 1), (2, 1), (0, 3), (2, 3)]).flatten()
110
+ coords = np.asarray(box).reshape(-1, 4)[:, idxs].reshape(-1, 2)
111
+ coords = self.inverse_apply_coords(coords).reshape((-1, 4, 2))
112
+ minxy = coords.min(axis=1)
113
+ maxxy = coords.max(axis=1)
114
+ trans_boxes = np.concatenate((minxy, maxxy), axis=1)
115
+ return trans_boxes
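
The inverse_* helpers map predictions made in the resized/cropped frame back to original-image coordinates; a sketch, reusing the hypothetical `tfm` from the augmentation example above:

import numpy as np

boxes = np.array([[50.0, 60.0, 200.0, 220.0]])   # XYXY boxes in the original image
boxes_t = tfm.apply_box(boxes.copy())            # into the transformed frame (fvcore Transform.apply_box)
boxes_back = tfm.inverse_apply_box(boxes_t)      # back to original coordinates
assert np.allclose(boxes, boxes_back, atol=1e-3)
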
models/grit_src/grit/evaluation/eval.py ADDED
@@ -0,0 +1,156 @@
1
+ import itertools
2
+ import json
3
+ import os
4
+ from detectron2.structures import Boxes, BoxMode, pairwise_iou
5
+ from detectron2.utils.file_io import PathManager
6
+ import numpy as np
7
+ import pycocotools.mask as mask_util
8
+ from detectron2.evaluation.coco_evaluation import COCOEvaluator
9
+ from detectron2.evaluation.coco_evaluation import _evaluate_predictions_on_coco
10
+
11
+
12
+ class GRiTCOCOEvaluator(COCOEvaluator):
13
+ def process(self, inputs, outputs):
14
+ for input, output in zip(inputs, outputs):
15
+ prediction = {"image_id": input["image_id"]}
16
+
17
+ if "instances" in output:
18
+ instances = output["instances"].to(self._cpu_device)
19
+ prediction["instances"] = instances_to_coco_json(instances, input["image_id"])
20
+
21
+ if len(prediction) > 1:
22
+ self._predictions.append(prediction)
23
+
24
+ def _eval_predictions(self, predictions, img_ids=None):
25
+ self._logger.info("Preparing results for COCO format ...")
26
+ coco_results = list(itertools.chain(*[x["instances"] for x in predictions]))
27
+ tasks = self._tasks or self._tasks_from_predictions(coco_results)
28
+
29
+ if self._output_dir:
30
+ file_path = os.path.join(self._output_dir, "coco_instances_results.json")
31
+ self._logger.info("Saving results to {}".format(file_path))
32
+ with PathManager.open(file_path, "w") as f:
33
+ f.write(json.dumps(coco_results))
34
+ f.flush()
35
+
36
+ if not self._do_evaluation:
37
+ self._logger.info("Annotations are not available for evaluation.")
38
+ return
39
+
40
+ self._logger.info(
41
+ "Evaluating predictions with {} COCO API...".format(
42
+ "unofficial" if self._use_fast_impl else "official"
43
+ )
44
+ )
45
+
46
+ coco_results = self.convert_classname_to_id(coco_results)
47
+
48
+ for task in sorted(tasks):
49
+ assert task in {"bbox", "segm", "keypoints"}, f"Got unknown task: {task}!"
50
+ coco_eval = (
51
+ _evaluate_predictions_on_coco(
52
+ self._coco_api,
53
+ coco_results,
54
+ task,
55
+ kpt_oks_sigmas=self._kpt_oks_sigmas,
56
+ use_fast_impl=self._use_fast_impl,
57
+ img_ids=img_ids,
58
+ max_dets_per_image=self._max_dets_per_image,
59
+ )
60
+ if len(coco_results) > 0
61
+ else None # cocoapi does not handle empty results very well
62
+ )
63
+
64
+ res = self._derive_coco_results(
65
+ coco_eval, task, class_names=self._metadata.get("thing_classes")
66
+ )
67
+ self._results[task] = res
68
+
69
+ def convert_classname_to_id(self, results):
70
+ outputs = []
71
+ class_name_to_id = {}
72
+ categories = sorted(self._coco_api.dataset['categories'], key=lambda x: x['id'])
73
+
74
+ for cat in categories:
75
+ class_name_to_id[cat['name']] = cat['id']
76
+
77
+ for pred in results:
78
+ if pred['object_descriptions'] in class_name_to_id:
79
+ pred['category_id'] = class_name_to_id[pred['object_descriptions']]
80
+ del pred['object_descriptions']
81
+ outputs.append(pred)
82
+
83
+ return outputs
84
+
85
+
86
+ class GRiTVGEvaluator(COCOEvaluator):
87
+ def process(self, inputs, outputs):
88
+ for input, output in zip(inputs, outputs):
89
+ assert input["image_id"] == int(input['file_name'].split('/')[-1].split('.')[0])
90
+ prediction = {"image_id": input["image_id"]}
91
+
92
+ if "instances" in output:
93
+ instances = output["instances"].to(self._cpu_device)
94
+ prediction["instances"] = instances_to_coco_json(instances, input["image_id"], output_logits=True)
95
+ h = input['height']
96
+ w = input['width']
97
+ scale = 720.0 / max(h, w)
98
+ scaled_inst = []
99
+ for inst in prediction["instances"]:
100
+ inst['bbox'][0] = inst['bbox'][0] * scale
101
+ inst['bbox'][1] = inst['bbox'][1] * scale
102
+ inst['bbox'][2] = inst['bbox'][2] * scale
103
+ inst['bbox'][3] = inst['bbox'][3] * scale
104
+ scaled_inst.append(inst)
105
+ if len(scaled_inst) > 0:
106
+ prediction["instances"] = scaled_inst
107
+ if len(prediction) > 1:
108
+ self._predictions.append(prediction)
109
+
110
+ def _eval_predictions(self, predictions, img_ids=None):
111
+ '''
112
+ This only saves the VG predictions to a JSON file; no metrics are computed here.
113
+ '''
114
+ self._logger.info("Preparing results for COCO format ...")
115
+ coco_results = list(itertools.chain(*[x["instances"] for x in predictions]))
116
+
117
+ if self._output_dir:
118
+ file_path = os.path.join(self._output_dir, "vg_instances_results.json")
119
+ self._logger.info("Saving results to {}".format(file_path))
120
+ with PathManager.open(file_path, "w") as f:
121
+ f.write(json.dumps(coco_results))
122
+ f.flush()
123
+
124
+
125
+ def instances_to_coco_json(instances, img_id, output_logits=False):
126
+ """
127
+ Add object_descriptions and logit (if applicable) to
128
+ detectron2's instances_to_coco_json
129
+ """
130
+ num_instance = len(instances)
131
+ if num_instance == 0:
132
+ return []
133
+
134
+ boxes = instances.pred_boxes.tensor.numpy()
135
+ boxes = BoxMode.convert(boxes, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS)
136
+ boxes = boxes.tolist()
137
+ scores = instances.scores.tolist()
138
+ classes = instances.pred_classes.tolist()
139
+ object_descriptions = instances.pred_object_descriptions.data
140
+ if output_logits:
141
+ logits = instances.logits.tolist()
142
+
143
+ results = []
144
+ for k in range(num_instance):
145
+ result = {
146
+ "image_id": img_id,
147
+ "category_id": classes[k],
148
+ "bbox": boxes[k],
149
+ "score": scores[k],
150
+ 'object_descriptions': object_descriptions[k],
151
+ }
152
+ if output_logits:
153
+ result["logit"] = logits[k]
154
+
155
+ results.append(result)
156
+ return results
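
A hedged sketch of plugging the VG evaluator into detectron2's standard evaluation loop; `cfg` and `model` are assumed to be a fully built GRiT config and model (not constructed here), and the VG annotations registered in vg.py are assumed to be present on disk:

from detectron2.data import build_detection_test_loader
from detectron2.evaluation import inference_on_dataset
from models.grit_src.grit.evaluation.eval import GRiTVGEvaluator

evaluator = GRiTVGEvaluator("vg_test", output_dir="./output")  # writes output/vg_instances_results.json
loader = build_detection_test_loader(cfg, "vg_test")           # cfg: assumed GRiT config
inference_on_dataset(model, loader, evaluator)                 # model: assumed GRiT model in eval mode
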
models/grit_src/grit/modeling/__pycache__/soft_nms.cpython-38.pyc ADDED
Binary file (5.99 kB). View file
 
models/grit_src/grit/modeling/backbone/__pycache__/utils.cpython-38.pyc ADDED
Binary file (6.12 kB). View file
 
models/grit_src/grit/modeling/backbone/__pycache__/vit.cpython-38.pyc ADDED
Binary file (15.6 kB). View file
 
models/grit_src/grit/modeling/backbone/utils.py ADDED
@@ -0,0 +1,186 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
2
+ # This code is from https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/backbone/utils.py
3
+ import math
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+
8
+ __all__ = [
9
+ "window_partition",
10
+ "window_unpartition",
11
+ "add_decomposed_rel_pos",
12
+ "get_abs_pos",
13
+ "PatchEmbed",
14
+ ]
15
+
16
+ def window_partition(x, window_size):
17
+ """
18
+ Partition into non-overlapping windows with padding if needed.
19
+ Args:
20
+ x (tensor): input tokens with [B, H, W, C].
21
+ window_size (int): window size.
22
+
23
+ Returns:
24
+ windows: windows after partition with [B * num_windows, window_size, window_size, C].
25
+ (Hp, Wp): padded height and width before partition
26
+ """
27
+ B, H, W, C = x.shape
28
+
29
+ pad_h = (window_size - H % window_size) % window_size
30
+ pad_w = (window_size - W % window_size) % window_size
31
+ if pad_h > 0 or pad_w > 0:
32
+ x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h))
33
+ Hp, Wp = H + pad_h, W + pad_w
34
+
35
+ x = x.view(B, Hp // window_size, window_size, Wp // window_size, window_size, C)
36
+ windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
37
+ return windows, (Hp, Wp)
38
+
39
+
40
+ def window_unpartition(windows, window_size, pad_hw, hw):
41
+ """
42
+ Reverse the window partition into the original sequence and remove padding.
+ Args:
+ windows (tensor): input tokens with [B * num_windows, window_size, window_size, C].
45
+ window_size (int): window size.
46
+ pad_hw (Tuple): padded height and width (Hp, Wp).
47
+ hw (Tuple): original height and width (H, W) before padding.
48
+
49
+ Returns:
50
+ x: unpartitioned sequences with [B, H, W, C].
51
+ """
52
+ Hp, Wp = pad_hw
53
+ H, W = hw
54
+ B = windows.shape[0] // (Hp * Wp // window_size // window_size)
55
+ x = windows.view(B, Hp // window_size, Wp // window_size, window_size, window_size, -1)
56
+ x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1)
57
+
58
+ if Hp > H or Wp > W:
59
+ x = x[:, :H, :W, :].contiguous()
60
+ return x
61
+
62
+
63
+ def get_rel_pos(q_size, k_size, rel_pos):
64
+ """
65
+ Get relative positional embeddings according to the relative positions of
66
+ query and key sizes.
67
+ Args:
68
+ q_size (int): size of query q.
69
+ k_size (int): size of key k.
70
+ rel_pos (Tensor): relative position embeddings (L, C).
71
+
72
+ Returns:
73
+ Extracted positional embeddings according to relative positions.
74
+ """
75
+ max_rel_dist = int(2 * max(q_size, k_size) - 1)
76
+ # Interpolate rel pos if needed.
77
+ if rel_pos.shape[0] != max_rel_dist:
78
+ # Interpolate rel pos.
79
+ rel_pos_resized = F.interpolate(
80
+ rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1),
81
+ size=max_rel_dist,
82
+ mode="linear",
83
+ )
84
+ rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0)
85
+ else:
86
+ rel_pos_resized = rel_pos
87
+
88
+ # Scale the coords with short length if shapes for q and k are different.
89
+ q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0)
90
+ k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0)
91
+ relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0)
92
+
93
+ return rel_pos_resized[relative_coords.long()]
94
+
95
+
96
+ def add_decomposed_rel_pos(attn, q, rel_pos_h, rel_pos_w, q_size, k_size):
97
+ """
98
+ Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
99
+ https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py # noqa B950
100
+ Args:
101
+ attn (Tensor): attention map.
102
+ q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).
103
+ rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis.
104
+ rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis.
105
+ q_size (Tuple): spatial sequence size of query q with (q_h, q_w).
106
+ k_size (Tuple): spatial sequence size of key k with (k_h, k_w).
107
+
108
+ Returns:
109
+ attn (Tensor): attention map with added relative positional embeddings.
110
+ """
111
+ q_h, q_w = q_size
112
+ k_h, k_w = k_size
113
+ Rh = get_rel_pos(q_h, k_h, rel_pos_h)
114
+ Rw = get_rel_pos(q_w, k_w, rel_pos_w)
115
+
116
+ B, _, dim = q.shape
117
+ r_q = q.reshape(B, q_h, q_w, dim)
118
+ rel_h = torch.einsum("bhwc,hkc->bhwk", r_q, Rh)
119
+ rel_w = torch.einsum("bhwc,wkc->bhwk", r_q, Rw)
120
+
121
+ attn = (
122
+ attn.view(B, q_h, q_w, k_h, k_w) + rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :]
123
+ ).view(B, q_h * q_w, k_h * k_w)
124
+
125
+ return attn
126
+
127
+
128
+ def get_abs_pos(abs_pos, has_cls_token, hw):
129
+ """
130
+ Calculate absolute positional embeddings. If needed, resize embeddings and remove cls_token
131
+ dimension for the original embeddings.
132
+ Args:
133
+ abs_pos (Tensor): absolute positional embeddings with (1, num_position, C).
134
+ has_cls_token (bool): If true, has 1 embedding in abs_pos for cls token.
135
+ hw (Tuple): size of input image tokens.
136
+
137
+ Returns:
138
+ Absolute positional embeddings after processing with shape (1, H, W, C)
139
+ """
140
+ h, w = hw
141
+ if has_cls_token:
142
+ abs_pos = abs_pos[:, 1:]
143
+ xy_num = abs_pos.shape[1]
144
+ size = int(math.sqrt(xy_num))
145
+ assert size * size == xy_num
146
+
147
+ if size != h or size != w:
148
+ new_abs_pos = F.interpolate(
149
+ abs_pos.reshape(1, size, size, -1).permute(0, 3, 1, 2),
150
+ size=(h, w),
151
+ mode="bicubic",
152
+ align_corners=False,
153
+ )
154
+
155
+ return new_abs_pos.permute(0, 2, 3, 1)
156
+ else:
157
+ return abs_pos.reshape(1, h, w, -1)
158
+
159
+
160
+ class PatchEmbed(nn.Module):
161
+ """
162
+ Image to Patch Embedding.
163
+ """
164
+
165
+ def __init__(
166
+ self, kernel_size=(16, 16), stride=(16, 16), padding=(0, 0), in_chans=3, embed_dim=768
167
+ ):
168
+ """
169
+ Args:
170
+ kernel_size (Tuple): kernel size of the projection layer.
171
+ stride (Tuple): stride of the projection layer.
172
+ padding (Tuple): padding size of the projection layer.
173
+ in_chans (int): Number of input image channels.
174
+ embed_dim (int): Patch embedding dimension.
175
+ """
176
+ super().__init__()
177
+
178
+ self.proj = nn.Conv2d(
179
+ in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding
180
+ )
181
+
182
+ def forward(self, x):
183
+ x = self.proj(x)
184
+ # B C H W -> B H W C
185
+ x = x.permute(0, 2, 3, 1)
186
+ return x
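
A quick round-trip check for the window helpers above; the token shape and window size are illustrative:

import torch
from models.grit_src.grit.modeling.backbone.utils import window_partition, window_unpartition

x = torch.randn(2, 50, 60, 768)             # [B, H, W, C] patch tokens
windows, pad_hw = window_partition(x, 14)   # pads to (56, 70), i.e. 4*5 windows of 14x14 per image
y = window_unpartition(windows, 14, pad_hw, (50, 60))
assert torch.equal(x, y)                    # zero padding is stripped exactly on the way back
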
models/grit_src/grit/modeling/backbone/vit.py ADDED
@@ -0,0 +1,538 @@
1
+ # Modified by Jialian Wu from https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/backbone/vit.py
2
+ import logging
3
+ import math
4
+ import fvcore.nn.weight_init as weight_init
5
+ import torch
6
+ import torch.nn as nn
7
+ from functools import partial
8
+
9
+ from detectron2.layers import CNNBlockBase, Conv2d, get_norm
10
+ from detectron2.modeling.backbone.build import BACKBONE_REGISTRY
11
+ from detectron2.layers import ShapeSpec
12
+ from centernet.modeling.backbone.fpn_p5 import LastLevelP6P7_P5
13
+
14
+ import torch.utils.checkpoint as checkpoint
15
+ from timm.models.layers import DropPath, Mlp, trunc_normal_
16
+
17
+ from detectron2.modeling.backbone.backbone import Backbone
18
+ from .utils import (
19
+ PatchEmbed,
20
+ add_decomposed_rel_pos,
21
+ get_abs_pos,
22
+ window_partition,
23
+ window_unpartition,
24
+ )
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
+ __all__ = ["ViT"]
30
+
31
+
32
+ class Attention(nn.Module):
33
+ """Multi-head Attention block with relative position embeddings."""
34
+
35
+ def __init__(
36
+ self,
37
+ dim,
38
+ num_heads=8,
39
+ qkv_bias=True,
40
+ use_rel_pos=False,
41
+ rel_pos_zero_init=True,
42
+ input_size=None,
43
+ ):
44
+ """
45
+ Args:
46
+ dim (int): Number of input channels.
47
+ num_heads (int): Number of attention heads.
48
+ qkv_bias (bool: If True, add a learnable bias to query, key, value.
49
+ rel_pos (bool): If True, add relative positional embeddings to the attention map.
50
+ rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
51
+ input_size (int or None): Input resolution for calculating the relative positional
52
+ parameter size.
53
+ """
54
+ super().__init__()
55
+ self.num_heads = num_heads
56
+ head_dim = dim // num_heads
57
+ self.scale = head_dim**-0.5
58
+
59
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
60
+ self.proj = nn.Linear(dim, dim)
61
+
62
+ self.use_rel_pos = use_rel_pos
63
+ if self.use_rel_pos:
64
+ # initialize relative positional embeddings
65
+ self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim))
66
+ self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim))
67
+
68
+ if not rel_pos_zero_init:
69
+ trunc_normal_(self.rel_pos_h, std=0.02)
70
+ trunc_normal_(self.rel_pos_w, std=0.02)
71
+
72
+ def forward(self, x):
73
+ B, H, W, _ = x.shape
74
+ # qkv with shape (3, B, nHead, H * W, C)
75
+ qkv = self.qkv(x).reshape(B, H * W, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
76
+ # q, k, v with shape (B * nHead, H * W, C)
77
+ q, k, v = qkv.reshape(3, B * self.num_heads, H * W, -1).unbind(0)
78
+
79
+ attn = (q * self.scale) @ k.transpose(-2, -1)
80
+
81
+ if self.use_rel_pos:
82
+ attn = add_decomposed_rel_pos(attn, q, self.rel_pos_h, self.rel_pos_w, (H, W), (H, W))
83
+
84
+ attn = attn.softmax(dim=-1)
85
+ x = (attn @ v).view(B, self.num_heads, H, W, -1).permute(0, 2, 3, 1, 4).reshape(B, H, W, -1)
86
+ x = self.proj(x)
87
+
88
+ return x
89
+
90
+
91
+ class ResBottleneckBlock(CNNBlockBase):
92
+ """
93
+ The standard bottleneck residual block without the last activation layer.
94
+ It contains 3 conv layers with kernels 1x1, 3x3, 1x1.
95
+ """
96
+
97
+ def __init__(
98
+ self,
99
+ in_channels,
100
+ out_channels,
101
+ bottleneck_channels,
102
+ norm="LN",
103
+ act_layer=nn.GELU,
104
+ ):
105
+ """
106
+ Args:
107
+ in_channels (int): Number of input channels.
108
+ out_channels (int): Number of output channels.
109
+ bottleneck_channels (int): number of output channels for the 3x3
110
+ "bottleneck" conv layers.
111
+ norm (str or callable): normalization for all conv layers.
112
+ See :func:`layers.get_norm` for supported format.
113
+ act_layer (callable): activation for all conv layers.
114
+ """
115
+ super().__init__(in_channels, out_channels, 1)
116
+
117
+ self.conv1 = Conv2d(in_channels, bottleneck_channels, 1, bias=False)
118
+ self.norm1 = get_norm(norm, bottleneck_channels)
119
+ self.act1 = act_layer()
120
+
121
+ self.conv2 = Conv2d(
122
+ bottleneck_channels,
123
+ bottleneck_channels,
124
+ 3,
125
+ padding=1,
126
+ bias=False,
127
+ )
128
+ self.norm2 = get_norm(norm, bottleneck_channels)
129
+ self.act2 = act_layer()
130
+
131
+ self.conv3 = Conv2d(bottleneck_channels, out_channels, 1, bias=False)
132
+ self.norm3 = get_norm(norm, out_channels)
133
+
134
+ for layer in [self.conv1, self.conv2, self.conv3]:
135
+ weight_init.c2_msra_fill(layer)
136
+ for layer in [self.norm1, self.norm2]:
137
+ layer.weight.data.fill_(1.0)
138
+ layer.bias.data.zero_()
139
+ # zero init last norm layer.
140
+ self.norm3.weight.data.zero_()
141
+ self.norm3.bias.data.zero_()
142
+
143
+ def forward(self, x):
144
+ out = x
145
+ for layer in self.children():
146
+ out = layer(out)
147
+
148
+ out = x + out
149
+ return out
150
+
151
+
152
+ class Block(nn.Module):
153
+ """Transformer blocks with support of window attention and residual propagation blocks"""
154
+
155
+ def __init__(
156
+ self,
157
+ dim,
158
+ num_heads,
159
+ mlp_ratio=4.0,
160
+ qkv_bias=True,
161
+ drop_path=0.0,
162
+ norm_layer=nn.LayerNorm,
163
+ act_layer=nn.GELU,
164
+ use_rel_pos=False,
165
+ rel_pos_zero_init=True,
166
+ window_size=0,
167
+ use_residual_block=False,
168
+ input_size=None,
169
+ ):
170
+ """
171
+ Args:
172
+ dim (int): Number of input channels.
173
+ num_heads (int): Number of attention heads in each ViT block.
174
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
175
+ qkv_bias (bool): If True, add a learnable bias to query, key, value.
176
+ drop_path (float): Stochastic depth rate.
177
+ norm_layer (nn.Module): Normalization layer.
178
+ act_layer (nn.Module): Activation layer.
179
+ use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
180
+ rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
181
+ window_size (int): Window size for window attention blocks. If it equals 0, then not
182
+ use window attention.
183
+ use_residual_block (bool): If True, use a residual block after the MLP block.
184
+ input_size (int or None): Input resolution for calculating the relative positional
185
+ parameter size.
186
+ """
187
+ super().__init__()
188
+ self.norm1 = norm_layer(dim)
189
+ self.attn = Attention(
190
+ dim,
191
+ num_heads=num_heads,
192
+ qkv_bias=qkv_bias,
193
+ use_rel_pos=use_rel_pos,
194
+ rel_pos_zero_init=rel_pos_zero_init,
195
+ input_size=input_size if window_size == 0 else (window_size, window_size),
196
+ )
197
+
198
+ self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
199
+ self.norm2 = norm_layer(dim)
200
+ self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer)
201
+
202
+ self.window_size = window_size
203
+
204
+ self.use_residual_block = use_residual_block
205
+ if use_residual_block:
206
+ # Use a residual block with bottleneck channel as dim // 2
207
+ self.residual = ResBottleneckBlock(
208
+ in_channels=dim,
209
+ out_channels=dim,
210
+ bottleneck_channels=dim // 2,
211
+ norm="LN",
212
+ act_layer=act_layer,
213
+ )
214
+
215
+ def forward(self, x):
216
+ shortcut = x
217
+ x = self.norm1(x)
218
+ # Window partition
219
+ if self.window_size > 0:
220
+ H, W = x.shape[1], x.shape[2]
221
+ x, pad_hw = window_partition(x, self.window_size)
222
+
223
+ x = self.attn(x)
224
+ # Reverse window partition
225
+ if self.window_size > 0:
226
+ x = window_unpartition(x, self.window_size, pad_hw, (H, W))
227
+
228
+ x = shortcut + self.drop_path(x)
229
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
230
+
231
+ if self.use_residual_block:
232
+ x = self.residual(x.permute(0, 3, 1, 2)).permute(0, 2, 3, 1)
233
+
234
+ return x
235
+
236
+
237
+ class ViT(Backbone):
238
+ """
239
+ This module implements Vision Transformer (ViT) backbone in :paper:`vitdet`.
240
+ "Exploring Plain Vision Transformer Backbones for Object Detection",
241
+ https://arxiv.org/abs/2203.16527
242
+ """
243
+
244
+ def __init__(
245
+ self,
246
+ img_size=1024,
247
+ patch_size=16,
248
+ in_chans=3,
249
+ embed_dim=768,
250
+ depth=12,
251
+ num_heads=12,
252
+ mlp_ratio=4.0,
253
+ qkv_bias=True,
254
+ drop_path_rate=0.0,
255
+ norm_layer=nn.LayerNorm,
256
+ act_layer=nn.GELU,
257
+ use_abs_pos=True,
258
+ use_rel_pos=False,
259
+ rel_pos_zero_init=True,
260
+ window_size=0,
261
+ window_block_indexes=(),
262
+ residual_block_indexes=(),
263
+ use_act_checkpoint=True,
264
+ pretrain_img_size=224,
265
+ pretrain_use_cls_token=True,
266
+ out_feature="last_feat",
267
+ ):
268
+ """
269
+ Args:
270
+ img_size (int): Input image size.
271
+ patch_size (int): Patch size.
272
+ in_chans (int): Number of input image channels.
273
+ embed_dim (int): Patch embedding dimension.
274
+ depth (int): Depth of ViT.
275
+ num_heads (int): Number of attention heads in each ViT block.
276
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
277
+ qkv_bias (bool): If True, add a learnable bias to query, key, value.
278
+ drop_path_rate (float): Stochastic depth rate.
279
+ norm_layer (nn.Module): Normalization layer.
280
+ act_layer (nn.Module): Activation layer.
281
+ use_abs_pos (bool): If True, use absolute positional embeddings.
282
+ use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
283
+ rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
284
+ window_size (int): Window size for window attention blocks.
285
+ window_block_indexes (list): Indexes for blocks using window attention.
286
+ residual_block_indexes (list): Indexes for blocks using conv propagation.
287
+ use_act_checkpoint (bool): If True, use activation checkpointing.
288
+ pretrain_img_size (int): input image size for pretraining models.
289
+ pretrain_use_cls_token (bool): If True, pretrainig models use class token.
290
+ out_feature (str): name of the feature from the last block.
291
+ """
292
+ super().__init__()
293
+ self.pretrain_use_cls_token = pretrain_use_cls_token
294
+ self.use_act_checkpoint = use_act_checkpoint
295
+
296
+ self.patch_embed = PatchEmbed(
297
+ kernel_size=(patch_size, patch_size),
298
+ stride=(patch_size, patch_size),
299
+ in_chans=in_chans,
300
+ embed_dim=embed_dim,
301
+ )
302
+
303
+ if use_abs_pos:
304
+ # Initialize absolute positional embedding with pretrain image size.
305
+ num_patches = (pretrain_img_size // patch_size) * (pretrain_img_size // patch_size)
306
+ num_positions = (num_patches + 1) if pretrain_use_cls_token else num_patches
307
+ self.pos_embed = nn.Parameter(torch.zeros(1, num_positions, embed_dim))
308
+ else:
309
+ self.pos_embed = None
310
+
311
+ # stochastic depth decay rule
312
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]
313
+
314
+ self.blocks = nn.ModuleList()
315
+ for i in range(depth):
316
+ block = Block(
317
+ dim=embed_dim,
318
+ num_heads=num_heads,
319
+ mlp_ratio=mlp_ratio,
320
+ qkv_bias=qkv_bias,
321
+ drop_path=dpr[i],
322
+ norm_layer=norm_layer,
323
+ act_layer=act_layer,
324
+ use_rel_pos=use_rel_pos,
325
+ rel_pos_zero_init=rel_pos_zero_init,
326
+ window_size=window_size if i in window_block_indexes else 0,
327
+ use_residual_block=i in residual_block_indexes,
328
+ input_size=(img_size // patch_size, img_size // patch_size),
329
+ )
330
+ self.blocks.append(block)
331
+
332
+ self._out_feature_channels = {out_feature: embed_dim}
333
+ self._out_feature_strides = {out_feature: patch_size}
334
+ self._out_features = [out_feature]
335
+
336
+ if self.pos_embed is not None:
337
+ trunc_normal_(self.pos_embed, std=0.02)
338
+
339
+ self.apply(self._init_weights)
340
+
341
+ def _init_weights(self, m):
342
+ if isinstance(m, nn.Linear):
343
+ trunc_normal_(m.weight, std=0.02)
344
+ if isinstance(m, nn.Linear) and m.bias is not None:
345
+ nn.init.constant_(m.bias, 0)
346
+ elif isinstance(m, nn.LayerNorm):
347
+ nn.init.constant_(m.bias, 0)
348
+ nn.init.constant_(m.weight, 1.0)
349
+
350
+ def forward(self, x):
351
+ x = self.patch_embed(x)
352
+ if self.pos_embed is not None:
353
+ x = x + get_abs_pos(
354
+ self.pos_embed, self.pretrain_use_cls_token, (x.shape[1], x.shape[2])
355
+ )
356
+
357
+ for blk in self.blocks:
358
+ if self.use_act_checkpoint:
359
+ x = checkpoint.checkpoint(blk, x)
360
+ else:
361
+ x = blk(x)
362
+
363
+ return x.permute(0, 3, 1, 2)
364
+
365
+
366
+ class ViT_FPN(Backbone):
367
+ def __init__(self, bottom_up=None, top_block=None, out_channels=None, strides=None, vit_out_dim=None):
368
+ super(ViT_FPN, self).__init__()
369
+ assert isinstance(bottom_up, Backbone)
370
+ self.bottom_up = bottom_up
371
+ self.top_block = top_block
372
+
373
+ self._out_feature_strides = {"p{}".format(int(math.log2(s))): s for s in strides}
374
+ self._out_features = list(self._out_feature_strides.keys())
375
+ self._out_feature_channels = {k: out_channels for k in self._out_features}
376
+ self._size_divisibility = strides[2]
377
+
378
+ self.maxpool = nn.MaxPool2d(2, stride=2)
379
+ self.fpn_stride_16_8 = nn.ConvTranspose2d(vit_out_dim, vit_out_dim, 2, stride=2, bias=False)
380
+ self.fpn_stride8_conv1 = nn.Conv2d(in_channels=vit_out_dim, out_channels=out_channels, kernel_size=1, bias=False)
381
+ self.fpn_stride8_norm1 = nn.LayerNorm(out_channels)
382
+ self.fpn_stride8_conv2 = nn.Conv2d(in_channels=out_channels, out_channels=out_channels, kernel_size=3, stride=1, padding=1, bias=False)
383
+ self.fpn_stride8_norm2 = nn.LayerNorm(out_channels)
384
+
385
+ self.fpn_stride16_conv1 = nn.Conv2d(in_channels=vit_out_dim, out_channels=out_channels, kernel_size=1, bias=False)
386
+ self.fpn_stride16_norm1 = nn.LayerNorm(out_channels)
387
+ self.fpn_stride16_conv2 = nn.Conv2d(in_channels=out_channels, out_channels=out_channels, kernel_size=3, stride=1, padding=1, bias=False)
388
+ self.fpn_stride16_norm2 = nn.LayerNorm(out_channels)
389
+
390
+ self.fpn_stride32_conv1 = nn.Conv2d(in_channels=vit_out_dim, out_channels=out_channels, kernel_size=1, bias=False)
391
+ self.fpn_stride32_norm1 = nn.LayerNorm(out_channels)
392
+ self.fpn_stride32_conv2 = nn.Conv2d(in_channels=out_channels, out_channels=out_channels, kernel_size=3, stride=1, padding=1, bias=False)
393
+ self.fpn_stride32_norm2 = nn.LayerNorm(out_channels)
394
+
395
+ def forward(self, x):
396
+ vit_output_featuremap = self.bottom_up(x)
397
+
398
+ stride8_feature = self.fpn_stride_16_8(vit_output_featuremap)
399
+ stride8_feature = self.fpn_stride8_norm1(self.fpn_stride8_conv1(stride8_feature)
400
+ .permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
401
+ stride8_feature = self.fpn_stride8_norm2(self.fpn_stride8_conv2(stride8_feature)
402
+ .permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
403
+
404
+ stride32_feature = self.maxpool(vit_output_featuremap)
405
+ stride32_feature = self.fpn_stride32_norm1(self.fpn_stride32_conv1(stride32_feature)
406
+ .permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
407
+ stride32_feature = self.fpn_stride32_norm2(self.fpn_stride32_conv2(stride32_feature)
408
+ .permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
409
+
410
+ stride16_feature = self.fpn_stride16_norm1(self.fpn_stride16_conv1(vit_output_featuremap).
411
+ permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
412
+ stride16_feature = self.fpn_stride16_norm2(self.fpn_stride16_conv2(stride16_feature)
413
+ .permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
414
+
415
+ results = [stride8_feature, stride16_feature, stride32_feature]
416
+
417
+ results.extend(self.top_block(stride32_feature))
418
+
419
+ assert len(self._out_features) == len(results)
420
+ fpn_out = {f: res for f, res in zip(self._out_features, results)}
421
+
422
+ return fpn_out
423
+ @property
424
+ def size_divisibility(self):
425
+ return self._size_divisibility
426
+
427
+ def output_shape(self):
428
+ return {
429
+ name: ShapeSpec(
430
+ channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
431
+ )
432
+ for name in self._out_features
433
+ }
434
+
435
+
436
+ @BACKBONE_REGISTRY.register()
437
+ def build_vit_fpn_backbone(cfg, input_shape: ShapeSpec):
438
+ embed_dim = 768
439
+ vit_out_dim = embed_dim
440
+ bottom_up = ViT( # Single-scale ViT backbone
441
+ img_size=1024,
442
+ patch_size=16,
443
+ embed_dim=embed_dim,
444
+ depth=12,
445
+ num_heads=12,
446
+ drop_path_rate=0.1,
447
+ window_size=14,
448
+ mlp_ratio=4,
449
+ qkv_bias=True,
450
+ norm_layer=partial(nn.LayerNorm, eps=1e-6),
451
+ window_block_indexes=[
452
+ # 2, 5, 8 11 for global attention
453
+ 0,
454
+ 1,
455
+ 3,
456
+ 4,
457
+ 6,
458
+ 7,
459
+ 9,
460
+ 10,
461
+ ],
462
+ residual_block_indexes=[],
463
+ use_act_checkpoint=cfg.USE_ACT_CHECKPOINT,
464
+ use_rel_pos=True,
465
+ out_feature="last_feat",)
466
+
467
+ out_channels = cfg.MODEL.FPN.OUT_CHANNELS
468
+ assert out_channels == 256 or out_channels == 768 or out_channels == 1024
469
+ backbone = ViT_FPN(bottom_up=bottom_up,
470
+ top_block=LastLevelP6P7_P5(out_channels, out_channels),
471
+ out_channels=out_channels,
472
+ strides=[8, 16, 32, 64, 128],
473
+ vit_out_dim=vit_out_dim)
474
+ return backbone
475
+
476
+
477
+ @BACKBONE_REGISTRY.register()
478
+ def build_vit_fpn_backbone_large(cfg, input_shape: ShapeSpec):
479
+ window_block_indexes = (list(range(0, 5)) + list(range(6, 11)) + list(range(12, 17)) + list(range(18, 23)))
480
+ embed_dim = 1024
481
+ vit_out_dim = embed_dim
482
+ bottom_up = ViT( # Single-scale ViT backbone
483
+ img_size=1024,
484
+ patch_size=16,
485
+ embed_dim=embed_dim,
486
+ depth=24,
487
+ num_heads=16,
488
+ drop_path_rate=0.4,
489
+ window_size=14,
490
+ mlp_ratio=4,
491
+ qkv_bias=True,
492
+ norm_layer=partial(nn.LayerNorm, eps=1e-6),
493
+ window_block_indexes=window_block_indexes,
494
+ residual_block_indexes=[],
495
+ use_act_checkpoint=cfg.USE_ACT_CHECKPOINT,
496
+ use_rel_pos=True,
497
+ out_feature="last_feat",)
498
+
499
+ out_channels = cfg.MODEL.FPN.OUT_CHANNELS
500
+ assert out_channels == 256 or out_channels == 768 or out_channels == 1024
501
+ backbone = ViT_FPN(bottom_up=bottom_up,
502
+ top_block=LastLevelP6P7_P5(out_channels, out_channels),
503
+ out_channels=out_channels,
504
+ strides=[8, 16, 32, 64, 128],
505
+ vit_out_dim=vit_out_dim)
506
+ return backbone
507
+
508
+
509
+ @BACKBONE_REGISTRY.register()
510
+ def build_vit_fpn_backbone_huge(cfg, input_shape: ShapeSpec):
511
+ window_block_indexes = (list(range(0, 7)) + list(range(8, 15)) + list(range(16, 23)) + list(range(24, 31)))
512
+ embed_dim = 1280
513
+ vit_out_dim = embed_dim
514
+ bottom_up = ViT( # Single-scale ViT backbone
515
+ img_size=1024,
516
+ patch_size=16,
517
+ embed_dim=embed_dim,
518
+ depth=32,
519
+ num_heads=16,
520
+ drop_path_rate=0.5,
521
+ window_size=14,
522
+ mlp_ratio=4,
523
+ qkv_bias=True,
524
+ norm_layer=partial(nn.LayerNorm, eps=1e-6),
525
+ window_block_indexes=window_block_indexes,
526
+ residual_block_indexes=[],
527
+ use_act_checkpoint=cfg.USE_ACT_CHECKPOINT,
528
+ use_rel_pos=True,
529
+ out_feature="last_feat",)
530
+
531
+ out_channels = cfg.MODEL.FPN.OUT_CHANNELS
532
+ assert out_channels == 256 or out_channels == 768 or out_channels == 1024
533
+ backbone = ViT_FPN(bottom_up=bottom_up,
534
+ top_block=LastLevelP6P7_P5(out_channels, out_channels),
535
+ out_channels=out_channels,
536
+ strides=[8, 16, 32, 64, 128],
537
+ vit_out_dim=vit_out_dim)
538
+ return backbone
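
A sketch of building and running the base ViT-FPN backbone on its own. It assumes detectron2, timm and the repo's bundled CenterNet2 package (for LastLevelP6P7_P5) are importable; USE_ACT_CHECKPOINT is an assumed custom config key, normally added by the GRiT config helper, so it is set by hand here:

import torch
from detectron2.config import get_cfg
from detectron2.layers import ShapeSpec
from models.grit_src.grit.modeling.backbone.vit import build_vit_fpn_backbone

cfg = get_cfg()
cfg.MODEL.FPN.OUT_CHANNELS = 256   # must be 256, 768, or 1024 per the assert above
cfg.USE_ACT_CHECKPOINT = False     # assumed key, normally set by the GRiT config
backbone = build_vit_fpn_backbone(cfg, ShapeSpec(channels=3))

with torch.no_grad():
    feats = backbone(torch.randn(1, 3, 224, 224))      # side divisible by size_divisibility (32)
print({k: tuple(v.shape) for k, v in feats.items()})    # p3 ... p7 pyramid levels
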
models/grit_src/grit/modeling/meta_arch/__pycache__/grit.cpython-38.pyc ADDED
Binary file (2.49 kB). View file
 
models/grit_src/grit/modeling/meta_arch/grit.py ADDED
@@ -0,0 +1,66 @@
1
+ from typing import Dict, List, Optional, Tuple
2
+ import torch
3
+ from detectron2.config import configurable
4
+ from detectron2.structures import ImageList, Instances, Boxes
5
+ from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY
6
+ from detectron2.modeling.meta_arch.rcnn import GeneralizedRCNN
7
+
8
+
9
+ @META_ARCH_REGISTRY.register()
10
+ class GRiT(GeneralizedRCNN):
11
+ @configurable
12
+ def __init__(
13
+ self,
14
+ **kwargs):
15
+ super().__init__(**kwargs)
16
+ assert self.proposal_generator is not None
17
+
18
+ @classmethod
19
+ def from_config(cls, cfg):
20
+ ret = super().from_config(cfg)
21
+ return ret
22
+
23
+ def inference(
24
+ self,
25
+ batched_inputs: Tuple[Dict[str, torch.Tensor]],
26
+ detected_instances: Optional[List[Instances]] = None,
27
+ do_postprocess: bool = True,
28
+ ):
29
+ assert not self.training
30
+ assert detected_instances is None
31
+
32
+ images = self.preprocess_image(batched_inputs)
33
+ features = self.backbone(images.tensor)
34
+ proposals, _ = self.proposal_generator(images, features, None)
35
+ results, _ = self.roi_heads(features, proposals)
36
+ if do_postprocess:
37
+ assert not torch.jit.is_scripting(), \
38
+ "Scripting is not supported for postprocess."
39
+ return GRiT._postprocess(
40
+ results, batched_inputs, images.image_sizes)
41
+ else:
42
+ return results
43
+
44
+ def forward(self, batched_inputs: List[Dict[str, torch.Tensor]]):
45
+ if not self.training:
46
+ return self.inference(batched_inputs)
47
+
48
+ images = self.preprocess_image(batched_inputs)
49
+
50
+ gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
51
+
52
+ targets_task = batched_inputs[0]['task']
53
+ for anno_per_image in batched_inputs:
54
+ assert targets_task == anno_per_image['task']
55
+
56
+ features = self.backbone(images.tensor)
57
+ proposals, proposal_losses = self.proposal_generator(
58
+ images, features, gt_instances)
59
+ proposals, roihead_textdecoder_losses = self.roi_heads(
60
+ features, proposals, gt_instances, targets_task=targets_task)
61
+
62
+ losses = {}
63
+ losses.update(roihead_textdecoder_losses)
64
+ losses.update(proposal_losses)
65
+
66
+ return losses
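
A hedged inference sketch for the meta-architecture above; `cfg` is assumed to be a complete GRiT config (built from the GRiT yaml configs in this repo), `bgr_image` is an assumed HWC uint8 numpy array, and the checkpoint filename is hypothetical:

import torch
from detectron2.modeling import build_model
from detectron2.checkpoint import DetectionCheckpointer

model = build_model(cfg)   # cfg.MODEL.META_ARCHITECTURE == "GRiT"; cfg assumed, not built here
DetectionCheckpointer(model).load("grit_b_densecap_objectdet.pth")  # hypothetical weight file
model.eval()

image = torch.as_tensor(bgr_image.transpose(2, 0, 1))   # HWC -> CHW tensor
with torch.no_grad():
    (pred,) = model([{"image": image, "height": image.shape[1], "width": image.shape[2]}])
print(pred["instances"].pred_object_descriptions.data)  # one description per detected box
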