YaTharThShaRma999
committed on
Commit
•
d279346
1
Parent(s):
fba9c83
Rename config.json to swint.py
Browse files
- config.json +0 -37
- swint.py +118 -0
config.json
DELETED
@@ -1,37 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"_name_or_path": "mistralai/Mistral-7B-v0.1",
|
3 |
-
"architectures": [
|
4 |
-
"LlavaMistralForCausalLM"
|
5 |
-
],
|
6 |
-
"bos_token_id": 1,
|
7 |
-
"eos_token_id": 2,
|
8 |
-
"freeze_mm_mlp_adapter": false,
|
9 |
-
"hidden_act": "silu",
|
10 |
-
"hidden_size": 4096,
|
11 |
-
"image_aspect_ratio": "pad",
|
12 |
-
"image_grid_pinpoints": null,
|
13 |
-
"initializer_range": 0.02,
|
14 |
-
"intermediate_size": 14336,
|
15 |
-
"max_position_embeddings": 32768,
|
16 |
-
"mm_hidden_size": 1024,
|
17 |
-
"mm_projector_type": "mlp2x_gelu",
|
18 |
-
"mm_use_im_patch_token": false,
|
19 |
-
"mm_use_im_start_end": false,
|
20 |
-
"mm_vision_select_feature": "patch",
|
21 |
-
"mm_vision_select_layer": -2,
|
22 |
-
"mm_vision_tower": "openai/clip-vit-large-patch14-336",
|
23 |
-
"model_type": "llava_mistral",
|
24 |
-
"num_attention_heads": 32,
|
25 |
-
"num_hidden_layers": 32,
|
26 |
-
"num_key_value_heads": 8,
|
27 |
-
"rms_norm_eps": 1e-05,
|
28 |
-
"rope_theta": 10000.0,
|
29 |
-
"sliding_window": 4096,
|
30 |
-
"tie_word_embeddings": false,
|
31 |
-
"torch_dtype": "bfloat16",
|
32 |
-
"transformers_version": "4.35.0.dev0",
|
33 |
-
"tune_mm_mlp_adapter": false,
|
34 |
-
"use_cache": true,
|
35 |
-
"use_mm_proj": true,
|
36 |
-
"vocab_size": 32000
|
37 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
swint.py
ADDED
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
data_aug_scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800]
|
2 |
+
data_aug_max_size = 1333
|
3 |
+
data_aug_scales2_resize = [400, 500, 600]
|
4 |
+
data_aug_scales2_crop = [384, 600]
|
5 |
+
data_aug_scale_overlap = None
|
6 |
+
batch_size = 4
|
7 |
+
modelname = 'groundingdino'
|
8 |
+
backbone = 'swin_T_224_1k'
|
9 |
+
position_embedding = 'sine'
|
10 |
+
pe_temperatureH = 20
|
11 |
+
pe_temperatureW = 20
|
12 |
+
return_interm_indices = [1, 2, 3]
|
13 |
+
enc_layers = 6
|
14 |
+
dec_layers = 6
|
15 |
+
pre_norm = False
|
16 |
+
dim_feedforward = 2048
|
17 |
+
hidden_dim = 256
|
18 |
+
dropout = 0.0
|
19 |
+
nheads = 8
|
20 |
+
num_queries = 900
|
21 |
+
query_dim = 4
|
22 |
+
num_patterns = 0
|
23 |
+
num_feature_levels = 4
|
24 |
+
enc_n_points = 4
|
25 |
+
dec_n_points = 4
|
26 |
+
two_stage_type = 'standard'
|
27 |
+
two_stage_bbox_embed_share = False
|
28 |
+
two_stage_class_embed_share = False
|
29 |
+
transformer_activation = 'relu'
|
30 |
+
dec_pred_bbox_embed_share = True
|
31 |
+
dn_box_noise_scale = 1.0
|
32 |
+
dn_label_noise_ratio = 0.5
|
33 |
+
dn_label_coef = 1.0
|
34 |
+
dn_bbox_coef = 1.0
|
35 |
+
embed_init_tgt = True
|
36 |
+
dn_labelbook_size = 91
|
37 |
+
max_text_len = 256
|
38 |
+
text_encoder_type = "bert-base-uncased"
|
39 |
+
use_text_enhancer = True
|
40 |
+
use_fusion_layer = True
|
41 |
+
use_checkpoint = True
|
42 |
+
use_transformer_ckpt = True
|
43 |
+
use_text_cross_attention = True
|
44 |
+
text_dropout = 0.0
|
45 |
+
fusion_dropout = 0.0
|
46 |
+
fusion_droppath = 0.1
|
47 |
+
sub_sentence_present = True
|
48 |
+
max_labels = 50 # pos + neg
|
49 |
+
lr = 0.0001 # base learning rate
|
50 |
+
backbone_freeze_keywords = None # only for gdino backbone
|
51 |
+
freeze_keywords = ['bert'] # applies to the whole model, e.g. ['backbone.0', 'bert'] to freeze the visual encoder and the text encoder
|
52 |
+
lr_backbone = 1e-05 # specific learning rate
|
53 |
+
lr_backbone_names = ['backbone.0', 'bert']
|
54 |
+
lr_linear_proj_mult = 1e-05
|
55 |
+
lr_linear_proj_names = ['ref_point_head', 'sampling_offsets']
|
56 |
+
weight_decay = 0.0001
|
57 |
+
param_dict_type = 'ddetr_in_mmdet'
|
58 |
+
ddetr_lr_param = False
|
59 |
+
epochs = 15
|
60 |
+
lr_drop = 4
|
61 |
+
save_checkpoint_interval = 1
|
62 |
+
clip_max_norm = 0.1
|
63 |
+
onecyclelr = False
|
64 |
+
multi_step_lr = False
|
65 |
+
lr_drop_list = [4, 8]
|
66 |
+
frozen_weights = None
|
67 |
+
dilation = False
|
68 |
+
pdetr3_bbox_embed_diff_each_layer = False
|
69 |
+
pdetr3_refHW = -1
|
70 |
+
random_refpoints_xy = False
|
71 |
+
fix_refpoints_hw = -1
|
72 |
+
dabdetr_yolo_like_anchor_update = False
|
73 |
+
dabdetr_deformable_encoder = False
|
74 |
+
dabdetr_deformable_decoder = False
|
75 |
+
use_deformable_box_attn = False
|
76 |
+
box_attn_type = 'roi_align'
|
77 |
+
dec_layer_number = None
|
78 |
+
decoder_layer_noise = False
|
79 |
+
dln_xy_noise = 0.2
|
80 |
+
dln_hw_noise = 0.2
|
81 |
+
add_channel_attention = False
|
82 |
+
add_pos_value = False
|
83 |
+
two_stage_pat_embed = 0
|
84 |
+
two_stage_add_query_num = 0
|
85 |
+
two_stage_learn_wh = False
|
86 |
+
two_stage_default_hw = 0.05
|
87 |
+
two_stage_keep_all_tokens = False
|
88 |
+
num_select = 300
|
89 |
+
batch_norm_type = 'FrozenBatchNorm2d'
|
90 |
+
masks = False
|
91 |
+
aux_loss = True
|
92 |
+
set_cost_class = 1.0
|
93 |
+
set_cost_bbox = 5.0
|
94 |
+
set_cost_giou = 2.0
|
95 |
+
cls_loss_coef = 2.0
|
96 |
+
bbox_loss_coef = 5.0
|
97 |
+
giou_loss_coef = 2.0
|
98 |
+
enc_loss_coef = 1.0
|
99 |
+
interm_loss_coef = 1.0
|
100 |
+
no_interm_box_loss = False
|
101 |
+
mask_loss_coef = 1.0
|
102 |
+
dice_loss_coef = 1.0
|
103 |
+
focal_alpha = 0.25
|
104 |
+
focal_gamma = 2.0
|
105 |
+
decoder_sa_type = 'sa'
|
106 |
+
matcher_type = 'HungarianMatcher'
|
107 |
+
decoder_module_seq = ['sa', 'ca', 'ffn']
|
108 |
+
nms_iou_threshold = -1
|
109 |
+
dec_pred_class_embed_share = True
|
110 |
+
|
111 |
+
|
112 |
+
match_unstable_error = True
|
113 |
+
use_ema = False
|
114 |
+
ema_decay = 0.9997
|
115 |
+
ema_epoch = 0
|
116 |
+
use_detached_boxes_dec_out = False
|
117 |
+
use_coco_eval = True
|
118 |
+
dn_scalar = 100
|