Jiqing committed on
Commit
a78e902
1 Parent(s): ac550ec

Upload 5 files

Browse files

TVP model for temporal video grounding

Files changed (5) hide show
  1. config.json +47 -0
  2. preprocessor_config.json +13 -0
  3. pytorch_model.bin +3 -0
  4. tokenizer.json +0 -0
  5. vocab.txt +0 -0
config.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "tvp",
3
+ "alpha": 1.0,
4
+ "beta": 0.1,
5
+ "vp_type": "framepad",
6
+ "vp_apply": "replace",
7
+ "max_img_size": 448,
8
+ "pad_size": 96,
9
+ "num_frm": 48,
10
+ "attention_probs_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "classifier": "mlp",
13
+ "classifier_dropout": null,
14
+ "cls_hidden_scale": 2,
15
+ "hidden_act": "gelu",
16
+ "hidden_dropout_prob": 0.1,
17
+ "initializer_range": 0.02,
18
+ "intermediate_size": 3072,
19
+ "layer_norm_eps": 1e-12,
20
+ "max_grid_col_position_embeddings": 100,
21
+ "max_grid_row_position_embeddings": 100,
22
+ "max_position_embeddings": 512,
23
+ "num_attention_heads": 12,
24
+ "num_hidden_layers": 12,
25
+ "pad_token_id": 0,
26
+ "position_embedding_type": "absolute",
27
+ "transformers_version": "4.29.2",
28
+ "type_vocab_size": 2,
29
+ "use_cache": true,
30
+ "vocab_size": 30522,
31
+ "max_text_length": 100,
32
+ "vision_config": {
33
+ "model_type": "tvp_vision_model",
34
+ "input_format": "BGR",
35
+ "features": ["res5"],
36
+ "resnets_depth": 50,
37
+ "resnets_num_groups": 1,
38
+ "resnets_width_per_group": 64,
39
+ "resnets_stem_input_channels": 3,
40
+ "resnets_stem_out_channels": 64,
41
+ "resnets_res_out_channels": 256,
42
+ "resnets_res_dilation": 1,
43
+ "backbone_freeze_at": 2,
44
+ "grid_encoder_conv_input_size": 2048,
45
+ "grid_encoder_conv_output_size": 768
46
+ }
47
+ }
preprocessor_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_center_crop": false,
3
+ "do_normalize": true,
4
+ "do_resize": true,
5
+ "do_rescale": false,
6
+ "do_padding": true,
7
+ "image_mean": [8.2381, 7.3115, 6.6981],
8
+ "image_std": [9.6335, 9.0659, 8.7213],
9
+ "processor_class": "TVPProcessor",
10
+ "max_size": 448,
11
+ "padding_size": {"height": 448, "width": 448},
12
+ "tokenizer": "bert-base-uncased"
13
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fcae30a6c715639d1f5d4255a53ce0b479d058f96b3d2f7c554a1fa78410289a
3
+ size 673739769
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
vocab.txt ADDED
The diff for this file is too large to render. See raw diff