dereksiyuanli committed on
Commit
ab0701b
1 Parent(s): b456849

Push model using huggingface_hub.

Browse files
Files changed (3) hide show
  1. README.md +10 -3
  2. config.json +513 -0
  3. model.safetensors +3 -0
README.md CHANGED
@@ -1,3 +1,10 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: masa
3
+ tags:
4
+ - pytorch_model_hub_mixin
5
+ - model_hub_mixin
6
+ ---
7
+
8
+ This model has been pushed to the Hub using the [PyTorchModelHubMixin](https://huggingface.co/docs/huggingface_hub/package_reference/mixins#huggingface_hub.PyTorchModelHubMixin) integration:
9
+ - Library: https://github.com/siyuanliii/masa
10
+ - Docs: [More Information Needed]
config.json ADDED
@@ -0,0 +1,513 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backbone": null,
3
+ "benchmark": "tao",
4
+ "data_preprocessor": {
5
+ "bgr_to_rgb": true,
6
+ "mean": [
7
+ 123.675,
8
+ 116.28,
9
+ 103.53
10
+ ],
11
+ "pad_mask": false,
12
+ "pad_size_divisor": 32,
13
+ "std": [
14
+ 58.395,
15
+ 57.12,
16
+ 57.375
17
+ ],
18
+ "type": "TrackDataPreprocessor"
19
+ },
20
+ "detector": {
21
+ "as_two_stage": true,
22
+ "backbone": {
23
+ "attn_drop_rate": 0.0,
24
+ "convert_weights": false,
25
+ "depths": [
26
+ 2,
27
+ 2,
28
+ 18,
29
+ 2
30
+ ],
31
+ "drop_path_rate": 0.3,
32
+ "drop_rate": 0.0,
33
+ "embed_dims": 128,
34
+ "mlp_ratio": 4,
35
+ "num_heads": [
36
+ 4,
37
+ 8,
38
+ 16,
39
+ 32
40
+ ],
41
+ "out_indices": [
42
+ 1,
43
+ 2,
44
+ 3
45
+ ],
46
+ "patch_norm": true,
47
+ "pretrain_img_size": 384,
48
+ "qk_scale": null,
49
+ "qkv_bias": true,
50
+ "type": "SwinTransformer",
51
+ "window_size": 12,
52
+ "with_cp": false
53
+ },
54
+ "bbox_head": {
55
+ "as_two_stage": true,
56
+ "contrastive_cfg": {
57
+ "max_text_len": 256
58
+ },
59
+ "loss_bbox": {
60
+ "loss_weight": 5.0,
61
+ "type": "L1Loss"
62
+ },
63
+ "loss_cls": {
64
+ "alpha": 0.25,
65
+ "gamma": 2.0,
66
+ "loss_weight": 1.0,
67
+ "type": "FocalLoss",
68
+ "use_sigmoid": true
69
+ },
70
+ "num_classes": 80,
71
+ "num_pred_layer": 7,
72
+ "share_pred_layer": false,
73
+ "sync_cls_avg_factor": true,
74
+ "test_cfg": {
75
+ "max_per_img": 300
76
+ },
77
+ "train_cfg": null,
78
+ "type": "GroundingDINOHead"
79
+ },
80
+ "decoder": {
81
+ "layer_cfg": {
82
+ "cross_attn_cfg": {
83
+ "batch_first": true,
84
+ "dropout": 0.0,
85
+ "embed_dims": 256,
86
+ "num_heads": 8
87
+ },
88
+ "cross_attn_text_cfg": {
89
+ "batch_first": true,
90
+ "dropout": 0.0,
91
+ "embed_dims": 256,
92
+ "num_heads": 8
93
+ },
94
+ "ffn_cfg": {
95
+ "embed_dims": 256,
96
+ "feedforward_channels": 2048,
97
+ "ffn_drop": 0.0
98
+ },
99
+ "self_attn_cfg": {
100
+ "batch_first": true,
101
+ "dropout": 0.0,
102
+ "embed_dims": 256,
103
+ "num_heads": 8
104
+ }
105
+ },
106
+ "num_layers": 6,
107
+ "post_norm_cfg": null,
108
+ "return_intermediate": true
109
+ },
110
+ "dn_cfg": {
111
+ "box_noise_scale": 1.0,
112
+ "embed_dims": 256,
113
+ "group_cfg": {
114
+ "dynamic": true,
115
+ "num_dn_queries": 100,
116
+ "num_groups": null
117
+ },
118
+ "label_noise_scale": 0.5,
119
+ "num_classes": 80,
120
+ "num_matching_queries": 900
121
+ },
122
+ "encoder": {
123
+ "fusion_layer_cfg": {
124
+ "embed_dim": 1024,
125
+ "init_values": 0.0001,
126
+ "l_dim": 256,
127
+ "num_heads": 4,
128
+ "v_dim": 256
129
+ },
130
+ "layer_cfg": {
131
+ "ffn_cfg": {
132
+ "embed_dims": 256,
133
+ "feedforward_channels": 2048,
134
+ "ffn_drop": 0.0
135
+ },
136
+ "self_attn_cfg": {
137
+ "batch_first": true,
138
+ "dropout": 0.0,
139
+ "embed_dims": 256,
140
+ "num_levels": 4
141
+ }
142
+ },
143
+ "num_layers": 6,
144
+ "text_layer_cfg": {
145
+ "ffn_cfg": {
146
+ "embed_dims": 256,
147
+ "feedforward_channels": 1024,
148
+ "ffn_drop": 0.0
149
+ },
150
+ "self_attn_cfg": {
151
+ "batch_first": true,
152
+ "dropout": 0.0,
153
+ "embed_dims": 256,
154
+ "num_heads": 4
155
+ }
156
+ }
157
+ },
158
+ "init_cfg": {
159
+ "checkpoint": "saved_models/tsa_models/groundingdino_swinb_cogcoor_mmdet-55949c9c.pth",
160
+ "type": "Pretrained"
161
+ },
162
+ "language_model": {
163
+ "add_pooling_layer": true,
164
+ "name": "bert-base-uncased",
165
+ "pad_to_max": false,
166
+ "special_tokens_list": [
167
+ "[CLS]",
168
+ "[SEP]",
169
+ ".",
170
+ "?"
171
+ ],
172
+ "type": "BertModel",
173
+ "use_sub_sentence_represent": true
174
+ },
175
+ "neck": {
176
+ "act_cfg": null,
177
+ "bias": true,
178
+ "in_channels": [
179
+ 256,
180
+ 512,
181
+ 1024
182
+ ],
183
+ "kernel_size": 1,
184
+ "norm_cfg": {
185
+ "num_groups": 32,
186
+ "type": "GN"
187
+ },
188
+ "num_outs": 4,
189
+ "out_channels": 256,
190
+ "type": "ChannelMapper"
191
+ },
192
+ "num_queries": 900,
193
+ "positional_encoding": {
194
+ "normalize": true,
195
+ "num_feats": 128,
196
+ "offset": 0.0,
197
+ "temperature": 20
198
+ },
199
+ "test_cfg": {
200
+ "max_per_img": 300
201
+ },
202
+ "train_cfg": null,
203
+ "type": "GroundingDINOMasa",
204
+ "with_box_refine": true
205
+ },
206
+ "end_pkl_name": ".pth",
207
+ "freeze_detector": true,
208
+ "freeze_masa_adapter": false,
209
+ "freeze_masa_backbone": false,
210
+ "freeze_object_prior_distillation": false,
211
+ "given_dets": false,
212
+ "init_cfg": null,
213
+ "load_public_dets": false,
214
+ "masa_adapter": [
215
+ {
216
+ "in_channels": [
217
+ 256,
218
+ 512,
219
+ 1024
220
+ ],
221
+ "norm_cfg": {
222
+ "requires_grad": true,
223
+ "type": "SyncBN"
224
+ },
225
+ "num_outs": 5,
226
+ "out_channels": 256,
227
+ "type": "FPN"
228
+ },
229
+ {
230
+ "in_channels": 256,
231
+ "num_blocks": 3,
232
+ "out_channels": 256,
233
+ "type": "DeformFusion"
234
+ }
235
+ ],
236
+ "public_det_path": "results/public_dets/tao_val_dets/teta_50_internms/teter_swinT_tao_val_internms_50/",
237
+ "roi_head": {
238
+ "bbox_head": {
239
+ "bbox_coder": {
240
+ "target_means": [
241
+ 0.0,
242
+ 0.0,
243
+ 0.0,
244
+ 0.0
245
+ ],
246
+ "target_stds": [
247
+ 0.1,
248
+ 0.1,
249
+ 0.2,
250
+ 0.2
251
+ ],
252
+ "type": "DeltaXYWHBBoxCoder"
253
+ },
254
+ "fc_out_channels": 1024,
255
+ "in_channels": 256,
256
+ "loss_bbox": {
257
+ "loss_weight": 1.0,
258
+ "type": "L1Loss"
259
+ },
260
+ "loss_cls": {
261
+ "loss_weight": 1.0,
262
+ "type": "CrossEntropyLoss",
263
+ "use_sigmoid": false
264
+ },
265
+ "num_classes": 1,
266
+ "reg_class_agnostic": true,
267
+ "roi_feat_size": 7,
268
+ "type": "Shared2FCBBoxHead"
269
+ },
270
+ "bbox_roi_extractor": {
271
+ "featmap_strides": [
272
+ 8,
273
+ 16,
274
+ 32
275
+ ],
276
+ "out_channels": 256,
277
+ "roi_layer": {
278
+ "output_size": 7,
279
+ "sampling_ratio": 0,
280
+ "type": "RoIAlign"
281
+ },
282
+ "type": "SingleRoIExtractor"
283
+ },
284
+ "test_cfg": {
285
+ "mask_thr_binary": 0.5,
286
+ "max_per_img": 50,
287
+ "nms": {
288
+ "class_agnostic": true,
289
+ "iou_threshold": 0.5,
290
+ "split_thr": 100000,
291
+ "type": "nms"
292
+ },
293
+ "score_thr": 0.02
294
+ },
295
+ "train_cfg": {
296
+ "assigner": {
297
+ "ignore_iof_thr": -1,
298
+ "match_low_quality": false,
299
+ "min_pos_iou": 0.5,
300
+ "neg_iou_thr": 0.5,
301
+ "pos_iou_thr": 0.5,
302
+ "type": "MaxIoUAssigner"
303
+ },
304
+ "debug": false,
305
+ "pos_weight": -1,
306
+ "sampler": {
307
+ "add_gt_as_proposals": true,
308
+ "neg_pos_ub": -1,
309
+ "num": 512,
310
+ "pos_fraction": 0.25,
311
+ "type": "RandomSampler"
312
+ }
313
+ },
314
+ "type": "StandardRoIHead"
315
+ },
316
+ "rpn_head": {
317
+ "anchor_generator": {
318
+ "ratios": [
319
+ 0.5,
320
+ 1.0,
321
+ 2.0
322
+ ],
323
+ "scales": [
324
+ 8
325
+ ],
326
+ "strides": [
327
+ 8,
328
+ 16,
329
+ 32,
330
+ 64,
331
+ 128
332
+ ],
333
+ "type": "AnchorGenerator"
334
+ },
335
+ "bbox_coder": {
336
+ "target_means": [
337
+ 0.0,
338
+ 0.0,
339
+ 0.0,
340
+ 0.0
341
+ ],
342
+ "target_stds": [
343
+ 1.0,
344
+ 1.0,
345
+ 1.0,
346
+ 1.0
347
+ ],
348
+ "type": "DeltaXYWHBBoxCoder"
349
+ },
350
+ "feat_channels": 256,
351
+ "in_channels": 256,
352
+ "loss_bbox": {
353
+ "beta": 0.1111111111111111,
354
+ "loss_weight": 1.0,
355
+ "type": "SmoothL1Loss"
356
+ },
357
+ "loss_cls": {
358
+ "loss_weight": 1.0,
359
+ "type": "CrossEntropyLoss",
360
+ "use_sigmoid": true
361
+ },
362
+ "type": "RPNHead"
363
+ },
364
+ "test_cfg": {
365
+ "rcnn": {
366
+ "mask_thr_binary": 0.5,
367
+ "max_per_img": 50,
368
+ "nms": {
369
+ "class_agnostic": true,
370
+ "iou_threshold": 0.5,
371
+ "split_thr": 100000,
372
+ "type": "nms"
373
+ },
374
+ "score_thr": 0.02
375
+ },
376
+ "rpn": {
377
+ "max_per_img": 1000,
378
+ "min_bbox_size": 0,
379
+ "nms": {
380
+ "iou_threshold": 0.7,
381
+ "type": "nms"
382
+ },
383
+ "nms_pre": 1000
384
+ }
385
+ },
386
+ "track_head": {
387
+ "embed_head": {
388
+ "embed_channels": 256,
389
+ "loss_track": {
390
+ "loss_weight": 0.25,
391
+ "type": "UnbiasedContrastLoss"
392
+ },
393
+ "loss_track_aux": {
394
+ "hard_mining": true,
395
+ "loss_weight": 1.0,
396
+ "neg_margin": 0.1,
397
+ "neg_pos_ub": 3,
398
+ "pos_margin": 0,
399
+ "type": "MarginL2Loss"
400
+ },
401
+ "norm_cfg": {
402
+ "num_groups": 32,
403
+ "type": "GN"
404
+ },
405
+ "num_convs": 4,
406
+ "num_fcs": 1,
407
+ "type": "QuasiDenseEmbedHead"
408
+ },
409
+ "roi_extractor": {
410
+ "featmap_strides": [
411
+ 8,
412
+ 16,
413
+ 32
414
+ ],
415
+ "out_channels": 256,
416
+ "roi_layer": {
417
+ "output_size": 7,
418
+ "sampling_ratio": 0,
419
+ "type": "RoIAlign"
420
+ },
421
+ "type": "SingleRoIExtractor"
422
+ },
423
+ "train_cfg": {
424
+ "assigner": {
425
+ "ignore_iof_thr": -1,
426
+ "match_low_quality": false,
427
+ "min_pos_iou": 0.5,
428
+ "neg_iou_thr": 0.3,
429
+ "pos_iou_thr": 0.7,
430
+ "type": "MaxIoUAssigner"
431
+ },
432
+ "sampler": {
433
+ "add_gt_as_proposals": true,
434
+ "neg_pos_ub": 3,
435
+ "neg_sampler": {
436
+ "type": "RandomSampler"
437
+ },
438
+ "num": 512,
439
+ "pos_fraction": 0.5,
440
+ "pos_sampler": {
441
+ "type": "InstanceBalancedPosSampler"
442
+ },
443
+ "type": "CombinedSampler"
444
+ }
445
+ },
446
+ "type": "QuasiDenseTrackHead"
447
+ },
448
+ "tracker": {
449
+ "fps": 30,
450
+ "init_score_thr": 0.1,
451
+ "match_metric": "bisoftmax",
452
+ "match_score_thr": 0.5,
453
+ "max_distance": 100,
454
+ "memo_momentum": 0.8,
455
+ "memo_tracklet_frames": 10,
456
+ "obj_score_thr": 0.01,
457
+ "type": "MasaTaoTracker",
458
+ "with_cats": false
459
+ },
460
+ "train_cfg": {
461
+ "rcnn": {
462
+ "assigner": {
463
+ "ignore_iof_thr": -1,
464
+ "match_low_quality": false,
465
+ "min_pos_iou": 0.5,
466
+ "neg_iou_thr": 0.5,
467
+ "pos_iou_thr": 0.5,
468
+ "type": "MaxIoUAssigner"
469
+ },
470
+ "debug": false,
471
+ "pos_weight": -1,
472
+ "sampler": {
473
+ "add_gt_as_proposals": true,
474
+ "neg_pos_ub": -1,
475
+ "num": 512,
476
+ "pos_fraction": 0.25,
477
+ "type": "RandomSampler"
478
+ }
479
+ },
480
+ "rpn": {
481
+ "allowed_border": -1,
482
+ "assigner": {
483
+ "ignore_iof_thr": -1,
484
+ "match_low_quality": true,
485
+ "min_pos_iou": 0.3,
486
+ "neg_iou_thr": 0.3,
487
+ "pos_iou_thr": 0.7,
488
+ "type": "MaxIoUAssigner"
489
+ },
490
+ "debug": false,
491
+ "pos_weight": -1,
492
+ "sampler": {
493
+ "add_gt_as_proposals": false,
494
+ "neg_pos_ub": -1,
495
+ "num": 256,
496
+ "pos_fraction": 0.5,
497
+ "type": "RandomSampler"
498
+ }
499
+ },
500
+ "rpn_proposal": {
501
+ "max_per_img": 1000,
502
+ "min_bbox_size": 0,
503
+ "nms": {
504
+ "iou_threshold": 0.7,
505
+ "type": "nms"
506
+ },
507
+ "nms_pre": 2000
508
+ }
509
+ },
510
+ "unified_backbone": true,
511
+ "use_masa_backbone": false,
512
+ "with_segm": false
513
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7b2ca27e21a2ea49ef0304864f1ae5d0c41852121f4640833f606692aaea0f0
3
+ size 1090774448