dereksiyuanli
/

masa

masa

Model card Files Files and versions Community

dereksiyuanli committed on Jun 12

Commit

ab0701b

•

1 Parent(s): b456849

Push model using huggingface_hub.

Browse files

Files changed (3) hide show

README.md +10 -3
config.json +513 -0
model.safetensors +3 -0

README.md CHANGED Viewed

@@ -1,3 +1,10 @@
----
-license: apache-2.0
----

+---
+library_name: masa
+tags:
+- pytorch_model_hub_mixin
+- model_hub_mixin
+---
+This model has been pushed to the Hub using the [PyTorchModelHubMixin](https://huggingface.co/docs/huggingface_hub/package_reference/mixins#huggingface_hub.PyTorchModelHubMixin) integration:
+- Library: https://github.com/siyuanliii/masa
+- Docs: [More Information Needed]

config.json ADDED Viewed

	@@ -0,0 +1,513 @@

+{
+  "backbone": null,
+  "benchmark": "tao",
+  "data_preprocessor": {
+    "bgr_to_rgb": true,
+    "mean": [
+      123.675,
+      116.28,
+      103.53
+    ],
+    "pad_mask": false,
+    "pad_size_divisor": 32,
+    "std": [
+      58.395,
+      57.12,
+      57.375
+    ],
+    "type": "TrackDataPreprocessor"
+  },
+  "detector": {
+    "as_two_stage": true,
+    "backbone": {
+      "attn_drop_rate": 0.0,
+      "convert_weights": false,
+      "depths": [
+        2,
+        2,
+        18,
+        2
+      ],
+      "drop_path_rate": 0.3,
+      "drop_rate": 0.0,
+      "embed_dims": 128,
+      "mlp_ratio": 4,
+      "num_heads": [
+        4,
+        8,
+        16,
+        32
+      ],
+      "out_indices": [
+        1,
+        2,
+        3
+      ],
+      "patch_norm": true,
+      "pretrain_img_size": 384,
+      "qk_scale": null,
+      "qkv_bias": true,
+      "type": "SwinTransformer",
+      "window_size": 12,
+      "with_cp": false
+    },
+    "bbox_head": {
+      "as_two_stage": true,
+      "contrastive_cfg": {
+        "max_text_len": 256
+      },
+      "loss_bbox": {
+        "loss_weight": 5.0,
+        "type": "L1Loss"
+      },
+      "loss_cls": {
+        "alpha": 0.25,
+        "gamma": 2.0,
+        "loss_weight": 1.0,
+        "type": "FocalLoss",
+        "use_sigmoid": true
+      },
+      "num_classes": 80,
+      "num_pred_layer": 7,
+      "share_pred_layer": false,
+      "sync_cls_avg_factor": true,
+      "test_cfg": {
+        "max_per_img": 300
+      },
+      "train_cfg": null,
+      "type": "GroundingDINOHead"
+    },
+    "decoder": {
+      "layer_cfg": {
+        "cross_attn_cfg": {
+          "batch_first": true,
+          "dropout": 0.0,
+          "embed_dims": 256,
+          "num_heads": 8
+        },
+        "cross_attn_text_cfg": {
+          "batch_first": true,
+          "dropout": 0.0,
+          "embed_dims": 256,
+          "num_heads": 8
+        },
+        "ffn_cfg": {
+          "embed_dims": 256,
+          "feedforward_channels": 2048,
+          "ffn_drop": 0.0
+        },
+        "self_attn_cfg": {
+          "batch_first": true,
+          "dropout": 0.0,
+          "embed_dims": 256,
+          "num_heads": 8
+        }
+      },
+      "num_layers": 6,
+      "post_norm_cfg": null,
+      "return_intermediate": true
+    },
+    "dn_cfg": {
+      "box_noise_scale": 1.0,
+      "embed_dims": 256,
+      "group_cfg": {
+        "dynamic": true,
+        "num_dn_queries": 100,
+        "num_groups": null
+      },
+      "label_noise_scale": 0.5,
+      "num_classes": 80,
+      "num_matching_queries": 900
+    },
+    "encoder": {
+      "fusion_layer_cfg": {
+        "embed_dim": 1024,
+        "init_values": 0.0001,
+        "l_dim": 256,
+        "num_heads": 4,
+        "v_dim": 256
+      },
+      "layer_cfg": {
+        "ffn_cfg": {
+          "embed_dims": 256,
+          "feedforward_channels": 2048,
+          "ffn_drop": 0.0
+        },
+        "self_attn_cfg": {
+          "batch_first": true,
+          "dropout": 0.0,
+          "embed_dims": 256,
+          "num_levels": 4
+        }
+      },
+      "num_layers": 6,
+      "text_layer_cfg": {
+        "ffn_cfg": {
+          "embed_dims": 256,
+          "feedforward_channels": 1024,
+          "ffn_drop": 0.0
+        },
+        "self_attn_cfg": {
+          "batch_first": true,
+          "dropout": 0.0,
+          "embed_dims": 256,
+          "num_heads": 4
+        }
+      }
+    },
+    "init_cfg": {
+      "checkpoint": "saved_models/tsa_models/groundingdino_swinb_cogcoor_mmdet-55949c9c.pth",
+      "type": "Pretrained"
+    },
+    "language_model": {
+      "add_pooling_layer": true,
+      "name": "bert-base-uncased",
+      "pad_to_max": false,
+      "special_tokens_list": [
+        "[CLS]",
+        "[SEP]",
+        ".",
+        "?"
+      ],
+      "type": "BertModel",
+      "use_sub_sentence_represent": true
+    },
+    "neck": {
+      "act_cfg": null,
+      "bias": true,
+      "in_channels": [
+        256,
+        512,
+        1024
+      ],
+      "kernel_size": 1,
+      "norm_cfg": {
+        "num_groups": 32,
+        "type": "GN"
+      },
+      "num_outs": 4,
+      "out_channels": 256,
+      "type": "ChannelMapper"
+    },
+    "num_queries": 900,
+    "positional_encoding": {
+      "normalize": true,
+      "num_feats": 128,
+      "offset": 0.0,
+      "temperature": 20
+    },
+    "test_cfg": {
+      "max_per_img": 300
+    },
+    "train_cfg": null,
+    "type": "GroundingDINOMasa",
+    "with_box_refine": true
+  },
+  "end_pkl_name": ".pth",
+  "freeze_detector": true,
+  "freeze_masa_adapter": false,
+  "freeze_masa_backbone": false,
+  "freeze_object_prior_distillation": false,
+  "given_dets": false,
+  "init_cfg": null,
+  "load_public_dets": false,
+  "masa_adapter": [
+    {
+      "in_channels": [
+        256,
+        512,
+        1024
+      ],
+      "norm_cfg": {
+        "requires_grad": true,
+        "type": "SyncBN"
+      },
+      "num_outs": 5,
+      "out_channels": 256,
+      "type": "FPN"
+    },
+    {
+      "in_channels": 256,
+      "num_blocks": 3,
+      "out_channels": 256,
+      "type": "DeformFusion"
+    }
+  ],
+  "public_det_path": "results/public_dets/tao_val_dets/teta_50_internms/teter_swinT_tao_val_internms_50/",
+  "roi_head": {
+    "bbox_head": {
+      "bbox_coder": {
+        "target_means": [
+          0.0,
+          0.0,
+          0.0,
+          0.0
+        ],
+        "target_stds": [
+          0.1,
+          0.1,
+          0.2,
+          0.2
+        ],
+        "type": "DeltaXYWHBBoxCoder"
+      },
+      "fc_out_channels": 1024,
+      "in_channels": 256,
+      "loss_bbox": {
+        "loss_weight": 1.0,
+        "type": "L1Loss"
+      },
+      "loss_cls": {
+        "loss_weight": 1.0,
+        "type": "CrossEntropyLoss",
+        "use_sigmoid": false
+      },
+      "num_classes": 1,
+      "reg_class_agnostic": true,
+      "roi_feat_size": 7,
+      "type": "Shared2FCBBoxHead"
+    },
+    "bbox_roi_extractor": {
+      "featmap_strides": [
+        8,
+        16,
+        32
+      ],
+      "out_channels": 256,
+      "roi_layer": {
+        "output_size": 7,
+        "sampling_ratio": 0,
+        "type": "RoIAlign"
+      },
+      "type": "SingleRoIExtractor"
+    },
+    "test_cfg": {
+      "mask_thr_binary": 0.5,
+      "max_per_img": 50,
+      "nms": {
+        "class_agnostic": true,
+        "iou_threshold": 0.5,
+        "split_thr": 100000,
+        "type": "nms"
+      },
+      "score_thr": 0.02
+    },
+    "train_cfg": {
+      "assigner": {
+        "ignore_iof_thr": -1,
+        "match_low_quality": false,
+        "min_pos_iou": 0.5,
+        "neg_iou_thr": 0.5,
+        "pos_iou_thr": 0.5,
+        "type": "MaxIoUAssigner"
+      },
+      "debug": false,
+      "pos_weight": -1,
+      "sampler": {
+        "add_gt_as_proposals": true,
+        "neg_pos_ub": -1,
+        "num": 512,
+        "pos_fraction": 0.25,
+        "type": "RandomSampler"
+      }
+    },
+    "type": "StandardRoIHead"
+  },
+  "rpn_head": {
+    "anchor_generator": {
+      "ratios": [
+        0.5,
+        1.0,
+        2.0
+      ],
+      "scales": [
+        8
+      ],
+      "strides": [
+        8,
+        16,
+        32,
+        64,
+        128
+      ],
+      "type": "AnchorGenerator"
+    },
+    "bbox_coder": {
+      "target_means": [
+        0.0,
+        0.0,
+        0.0,
+        0.0
+      ],
+      "target_stds": [
+        1.0,
+        1.0,
+        1.0,
+        1.0
+      ],
+      "type": "DeltaXYWHBBoxCoder"
+    },
+    "feat_channels": 256,
+    "in_channels": 256,
+    "loss_bbox": {
+      "beta": 0.1111111111111111,
+      "loss_weight": 1.0,
+      "type": "SmoothL1Loss"
+    },
+    "loss_cls": {
+      "loss_weight": 1.0,
+      "type": "CrossEntropyLoss",
+      "use_sigmoid": true
+    },
+    "type": "RPNHead"
+  },
+  "test_cfg": {
+    "rcnn": {
+      "mask_thr_binary": 0.5,
+      "max_per_img": 50,
+      "nms": {
+        "class_agnostic": true,
+        "iou_threshold": 0.5,
+        "split_thr": 100000,
+        "type": "nms"
+      },
+      "score_thr": 0.02
+    },
+    "rpn": {
+      "max_per_img": 1000,
+      "min_bbox_size": 0,
+      "nms": {
+        "iou_threshold": 0.7,
+        "type": "nms"
+      },
+      "nms_pre": 1000
+    }
+  },
+  "track_head": {
+    "embed_head": {
+      "embed_channels": 256,
+      "loss_track": {
+        "loss_weight": 0.25,
+        "type": "UnbiasedContrastLoss"
+      },
+      "loss_track_aux": {
+        "hard_mining": true,
+        "loss_weight": 1.0,
+        "neg_margin": 0.1,
+        "neg_pos_ub": 3,
+        "pos_margin": 0,
+        "type": "MarginL2Loss"
+      },
+      "norm_cfg": {
+        "num_groups": 32,
+        "type": "GN"
+      },
+      "num_convs": 4,
+      "num_fcs": 1,
+      "type": "QuasiDenseEmbedHead"
+    },
+    "roi_extractor": {
+      "featmap_strides": [
+        8,
+        16,
+        32
+      ],
+      "out_channels": 256,
+      "roi_layer": {
+        "output_size": 7,
+        "sampling_ratio": 0,
+        "type": "RoIAlign"
+      },
+      "type": "SingleRoIExtractor"
+    },
+    "train_cfg": {
+      "assigner": {
+        "ignore_iof_thr": -1,
+        "match_low_quality": false,
+        "min_pos_iou": 0.5,
+        "neg_iou_thr": 0.3,
+        "pos_iou_thr": 0.7,
+        "type": "MaxIoUAssigner"
+      },
+      "sampler": {
+        "add_gt_as_proposals": true,
+        "neg_pos_ub": 3,
+        "neg_sampler": {
+          "type": "RandomSampler"
+        },
+        "num": 512,
+        "pos_fraction": 0.5,
+        "pos_sampler": {
+          "type": "InstanceBalancedPosSampler"
+        },
+        "type": "CombinedSampler"
+      }
+    },
+    "type": "QuasiDenseTrackHead"
+  },
+  "tracker": {
+    "fps": 30,
+    "init_score_thr": 0.1,
+    "match_metric": "bisoftmax",
+    "match_score_thr": 0.5,
+    "max_distance": 100,
+    "memo_momentum": 0.8,
+    "memo_tracklet_frames": 10,
+    "obj_score_thr": 0.01,
+    "type": "MasaTaoTracker",
+    "with_cats": false
+  },
+  "train_cfg": {
+    "rcnn": {
+      "assigner": {
+        "ignore_iof_thr": -1,
+        "match_low_quality": false,
+        "min_pos_iou": 0.5,
+        "neg_iou_thr": 0.5,
+        "pos_iou_thr": 0.5,
+        "type": "MaxIoUAssigner"
+      },
+      "debug": false,
+      "pos_weight": -1,
+      "sampler": {
+        "add_gt_as_proposals": true,
+        "neg_pos_ub": -1,
+        "num": 512,
+        "pos_fraction": 0.25,
+        "type": "RandomSampler"
+      }
+    },
+    "rpn": {
+      "allowed_border": -1,
+      "assigner": {
+        "ignore_iof_thr": -1,
+        "match_low_quality": true,
+        "min_pos_iou": 0.3,
+        "neg_iou_thr": 0.3,
+        "pos_iou_thr": 0.7,
+        "type": "MaxIoUAssigner"
+      },
+      "debug": false,
+      "pos_weight": -1,
+      "sampler": {
+        "add_gt_as_proposals": false,
+        "neg_pos_ub": -1,
+        "num": 256,
+        "pos_fraction": 0.5,
+        "type": "RandomSampler"
+      }
+    },
+    "rpn_proposal": {
+      "max_per_img": 1000,
+      "min_bbox_size": 0,
+      "nms": {
+        "iou_threshold": 0.7,
+        "type": "nms"
+      },
+      "nms_pre": 2000
+    }
+  },
+  "unified_backbone": true,
+  "use_masa_backbone": false,
+  "with_segm": false
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e7b2ca27e21a2ea49ef0304864f1ae5d0c41852121f4640833f606692aaea0f0
+size 1090774448