YaTharThShaRma999 committed on
Commit d279346
1 Parent(s): fba9c83

Rename config.json to swint.py

Files changed (2)
  1. config.json +0 -37
  2. swint.py +118 -0
config.json DELETED
@@ -1,37 +0,0 @@
- {
-   "_name_or_path": "mistralai/Mistral-7B-v0.1",
-   "architectures": [
-     "LlavaMistralForCausalLM"
-   ],
-   "bos_token_id": 1,
-   "eos_token_id": 2,
-   "freeze_mm_mlp_adapter": false,
-   "hidden_act": "silu",
-   "hidden_size": 4096,
-   "image_aspect_ratio": "pad",
-   "image_grid_pinpoints": null,
-   "initializer_range": 0.02,
-   "intermediate_size": 14336,
-   "max_position_embeddings": 32768,
-   "mm_hidden_size": 1024,
-   "mm_projector_type": "mlp2x_gelu",
-   "mm_use_im_patch_token": false,
-   "mm_use_im_start_end": false,
-   "mm_vision_select_feature": "patch",
-   "mm_vision_select_layer": -2,
-   "mm_vision_tower": "openai/clip-vit-large-patch14-336",
-   "model_type": "llava_mistral",
-   "num_attention_heads": 32,
-   "num_hidden_layers": 32,
-   "num_key_value_heads": 8,
-   "rms_norm_eps": 1e-05,
-   "rope_theta": 10000.0,
-   "sliding_window": 4096,
-   "tie_word_embeddings": false,
-   "torch_dtype": "bfloat16",
-   "transformers_version": "4.35.0.dev0",
-   "tune_mm_mlp_adapter": false,
-   "use_cache": true,
-   "use_mm_proj": true,
-   "vocab_size": 32000
- }
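For context, the deleted config.json is a standard transformers-style configuration for a LLaVA variant built on Mistral-7B. A minimal sketch of inspecting such a file as plain JSON (the local path is assumed; loading it via transformers' AutoConfig would additionally require the LLaVA code that registers the llava_mistral model type):

import json

# Assumed local path; the diff only tells us the file was named config.json.
with open("config.json") as f:
    cfg = json.load(f)

# Fields that identify the model family and its vision tower
# (values copied from the deleted file above).
print(cfg["model_type"])       # llava_mistral
print(cfg["_name_or_path"])    # mistralai/Mistral-7B-v0.1
print(cfg["mm_vision_tower"])  # openai/clip-vit-large-patch14-336
print(cfg["hidden_size"], cfg["num_hidden_layers"])  # 4096 32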
swint.py ADDED
@@ -0,0 +1,118 @@
+ data_aug_scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800]
+ data_aug_max_size = 1333
+ data_aug_scales2_resize = [400, 500, 600]
+ data_aug_scales2_crop = [384, 600]
+ data_aug_scale_overlap = None
+ batch_size = 4
+ modelname = 'groundingdino'
+ backbone = 'swin_T_224_1k'
+ position_embedding = 'sine'
+ pe_temperatureH = 20
+ pe_temperatureW = 20
+ return_interm_indices = [1, 2, 3]
+ enc_layers = 6
+ dec_layers = 6
+ pre_norm = False
+ dim_feedforward = 2048
+ hidden_dim = 256
+ dropout = 0.0
+ nheads = 8
+ num_queries = 900
+ query_dim = 4
+ num_patterns = 0
+ num_feature_levels = 4
+ enc_n_points = 4
+ dec_n_points = 4
+ two_stage_type = 'standard'
+ two_stage_bbox_embed_share = False
+ two_stage_class_embed_share = False
+ transformer_activation = 'relu'
+ dec_pred_bbox_embed_share = True
+ dn_box_noise_scale = 1.0
+ dn_label_noise_ratio = 0.5
+ dn_label_coef = 1.0
+ dn_bbox_coef = 1.0
+ embed_init_tgt = True
+ dn_labelbook_size = 91
+ max_text_len = 256
+ text_encoder_type = "bert-base-uncased"
+ use_text_enhancer = True
+ use_fusion_layer = True
+ use_checkpoint = True
+ use_transformer_ckpt = True
+ use_text_cross_attention = True
+ text_dropout = 0.0
+ fusion_dropout = 0.0
+ fusion_droppath = 0.1
+ sub_sentence_present = True
+ max_labels = 50  # pos + neg
+ lr = 0.0001  # base learning rate
+ backbone_freeze_keywords = None  # only for the gdino backbone
+ freeze_keywords = ['bert']  # for the whole model, e.g. ['backbone.0', 'bert'] freezes both the visual and text encoders
+ lr_backbone = 1e-05  # backbone-specific learning rate
+ lr_backbone_names = ['backbone.0', 'bert']
+ lr_linear_proj_mult = 1e-05
+ lr_linear_proj_names = ['ref_point_head', 'sampling_offsets']
+ weight_decay = 0.0001
+ param_dict_type = 'ddetr_in_mmdet'
+ ddetr_lr_param = False
+ epochs = 15
+ lr_drop = 4
+ save_checkpoint_interval = 1
+ clip_max_norm = 0.1
+ onecyclelr = False
+ multi_step_lr = False
+ lr_drop_list = [4, 8]
+ frozen_weights = None
+ dilation = False
+ pdetr3_bbox_embed_diff_each_layer = False
+ pdetr3_refHW = -1
+ random_refpoints_xy = False
+ fix_refpoints_hw = -1
+ dabdetr_yolo_like_anchor_update = False
+ dabdetr_deformable_encoder = False
+ dabdetr_deformable_decoder = False
+ use_deformable_box_attn = False
+ box_attn_type = 'roi_align'
+ dec_layer_number = None
+ decoder_layer_noise = False
+ dln_xy_noise = 0.2
+ dln_hw_noise = 0.2
+ add_channel_attention = False
+ add_pos_value = False
+ two_stage_pat_embed = 0
+ two_stage_add_query_num = 0
+ two_stage_learn_wh = False
+ two_stage_default_hw = 0.05
+ two_stage_keep_all_tokens = False
+ num_select = 300
+ batch_norm_type = 'FrozenBatchNorm2d'
+ masks = False
+ aux_loss = True
+ set_cost_class = 1.0
+ set_cost_bbox = 5.0
+ set_cost_giou = 2.0
+ cls_loss_coef = 2.0
+ bbox_loss_coef = 5.0
+ giou_loss_coef = 2.0
+ enc_loss_coef = 1.0
+ interm_loss_coef = 1.0
+ no_interm_box_loss = False
+ mask_loss_coef = 1.0
+ dice_loss_coef = 1.0
+ focal_alpha = 0.25
+ focal_gamma = 2.0
+ decoder_sa_type = 'sa'
+ matcher_type = 'HungarianMatcher'
+ decoder_module_seq = ['sa', 'ca', 'ffn']
+ nms_iou_threshold = -1
+ dec_pred_class_embed_share = True
+
+
+ match_unstable_error = True
+ use_ema = False
+ ema_decay = 0.9997
+ ema_epoch = 0
+ use_detached_boxes_dec_out = False
+ use_coco_eval = True
+ dn_scalar = 100