Transformers
PyTorch
flava
pretraining
Inference Endpoints
aps committed on
Commit
06305f7
1 Parent(s): ba70dbe

Update according to latest comments on the main PR

Browse files
config.json CHANGED
@@ -1,11 +1,80 @@
1
  {
2
  "architectures": [
3
- "FLAVAForPretraining"
4
  ],
5
  "ce_ignore_index": -100,
6
  "global_backprop_contrastive": true,
7
  "global_contrastive_weight": 1.0,
8
  "hidden_size": 768,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  "image_config": {
10
  "_name_or_path": "",
11
  "add_cross_attention": false,
@@ -77,12 +146,13 @@
77
  "top_p": 1.0,
78
  "torch_dtype": null,
79
  "torchscript": false,
80
- "transformers_version": "4.18.0.dev0",
81
  "typical_p": 1.0,
82
  "use_bfloat16": false,
83
  "vocab_size": 8192
84
  },
85
- "image_config_dict": null,
 
86
  "initializer_factor": 1.0,
87
  "initializer_range": 0.02,
88
  "itm_weight": 1.0,
@@ -160,13 +230,15 @@
160
  "top_p": 1.0,
161
  "torch_dtype": null,
162
  "torchscript": false,
163
- "transformers_version": "4.18.0.dev0",
164
  "typical_p": 1.0,
165
  "use_bfloat16": false,
166
  "use_cls_token": true
167
  },
168
- "multimodal_config_dict": null,
169
  "projection_dim": 768,
 
 
170
  "text_config": {
171
  "_name_or_path": "",
172
  "add_cross_attention": false,
@@ -236,13 +308,13 @@
236
  "top_p": 1.0,
237
  "torch_dtype": null,
238
  "torchscript": false,
239
- "transformers_version": "4.18.0.dev0",
240
  "type_vocab_size": 2,
241
  "typical_p": 1.0,
242
  "use_bfloat16": false,
243
  "vocab_size": 30522
244
  },
245
- "text_config_dict": null,
246
  "torch_dtype": "float32",
247
  "transformers_version": null
248
  }
1
  {
2
  "architectures": [
3
+ "FlavaForPreTraining"
4
  ],
5
  "ce_ignore_index": -100,
6
  "global_backprop_contrastive": true,
7
  "global_contrastive_weight": 1.0,
8
  "hidden_size": 768,
9
+ "image_codebook_config": {
10
+ "_name_or_path": "",
11
+ "add_cross_attention": false,
12
+ "architectures": null,
13
+ "bad_words_ids": null,
14
+ "bos_token_id": null,
15
+ "chunk_size_feed_forward": 0,
16
+ "cross_attention_hidden_size": null,
17
+ "decoder_start_token_id": null,
18
+ "diversity_penalty": 0.0,
19
+ "do_sample": false,
20
+ "early_stopping": false,
21
+ "encoder_no_repeat_ngram_size": 0,
22
+ "eos_token_id": null,
23
+ "exponential_decay_length_penalty": null,
24
+ "finetuning_task": null,
25
+ "forced_bos_token_id": null,
26
+ "forced_eos_token_id": null,
27
+ "freeze": true,
28
+ "hidden_size": 256,
29
+ "id2label": {
30
+ "0": "LABEL_0",
31
+ "1": "LABEL_1"
32
+ },
33
+ "initializer_range": 0.02,
34
+ "input_channels": 3,
35
+ "is_decoder": false,
36
+ "is_encoder_decoder": false,
37
+ "label2id": {
38
+ "LABEL_0": 0,
39
+ "LABEL_1": 1
40
+ },
41
+ "length_penalty": 1.0,
42
+ "max_length": 20,
43
+ "min_length": 0,
44
+ "model_type": "flava_image_codebook",
45
+ "no_repeat_ngram_size": 0,
46
+ "num_beam_groups": 1,
47
+ "num_beams": 1,
48
+ "num_blocks_per_group": 2,
49
+ "num_groups": 4,
50
+ "num_return_sequences": 1,
51
+ "output_attentions": false,
52
+ "output_hidden_states": false,
53
+ "output_scores": false,
54
+ "pad_token_id": null,
55
+ "prefix": null,
56
+ "problem_type": null,
57
+ "pruned_heads": {},
58
+ "remove_invalid_values": false,
59
+ "repetition_penalty": 1.0,
60
+ "return_dict": true,
61
+ "return_dict_in_generate": false,
62
+ "sep_token_id": null,
63
+ "task_specific_params": null,
64
+ "temperature": 1.0,
65
+ "tie_encoder_decoder": false,
66
+ "tie_word_embeddings": true,
67
+ "tokenizer_class": null,
68
+ "top_k": 50,
69
+ "top_p": 1.0,
70
+ "torch_dtype": null,
71
+ "torchscript": false,
72
+ "transformers_version": "4.19.0.dev0",
73
+ "typical_p": 1.0,
74
+ "use_bfloat16": false,
75
+ "vocab_size": 8192
76
+ },
77
+ "image_codebook_config_dict": {},
78
  "image_config": {
79
  "_name_or_path": "",
80
  "add_cross_attention": false,
146
  "top_p": 1.0,
147
  "torch_dtype": null,
148
  "torchscript": false,
149
+ "transformers_version": "4.19.0.dev0",
150
  "typical_p": 1.0,
151
  "use_bfloat16": false,
152
  "vocab_size": 8192
153
  },
154
+ "image_config_dict": {},
155
+ "init_codebook": true,
156
  "initializer_factor": 1.0,
157
  "initializer_range": 0.02,
158
  "itm_weight": 1.0,
230
  "top_p": 1.0,
231
  "torch_dtype": null,
232
  "torchscript": false,
233
+ "transformers_version": "4.19.0.dev0",
234
  "typical_p": 1.0,
235
  "use_bfloat16": false,
236
  "use_cls_token": true
237
  },
238
+ "multimodal_config_dict": {},
239
  "projection_dim": 768,
240
+ "return_loss": true,
241
+ "skip_unmasked_multimodal_encoder": true,
242
  "text_config": {
243
  "_name_or_path": "",
244
  "add_cross_attention": false,
308
  "top_p": 1.0,
309
  "torch_dtype": null,
310
  "torchscript": false,
311
+ "transformers_version": "4.19.0.dev0",
312
  "type_vocab_size": 2,
313
  "typical_p": 1.0,
314
  "use_bfloat16": false,
315
  "vocab_size": 30522
316
  },
317
+ "text_config_dict": {},
318
  "torch_dtype": "float32",
319
  "transformers_version": null
320
  }
preprocessor_config.json CHANGED
@@ -1,9 +1,26 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "crop_size": 224,
3
  "do_center_crop": true,
4
  "do_normalize": true,
5
  "do_resize": true,
6
- "feature_extractor_type": "FLAVAFeatureExtractor",
7
  "image_mean": [
8
  0.48145466,
9
  0.4578275,
@@ -19,7 +36,7 @@
19
  "mask_group_max_patches": null,
20
  "mask_group_min_aspect_ratio": 0.3,
21
  "mask_group_min_patches": 16,
22
- "processor_class": "FLAVAProcessor",
23
  "resample": 3,
24
  "size": 224,
25
  "total_mask_patches": 75
1
  {
2
+ "codebook_crop_size": 112,
3
+ "codebook_do_center_crop": true,
4
+ "codebook_do_map_pixels": true,
5
+ "codebook_do_normalize": true,
6
+ "codebook_do_resize": true,
7
+ "codebook_image_mean": [
8
+ 0.0,
9
+ 0.0,
10
+ 0.0
11
+ ],
12
+ "codebook_image_std": [
13
+ 1.0,
14
+ 1.0,
15
+ 1.0
16
+ ],
17
+ "codebook_resample": 1,
18
+ "codebook_size": 112,
19
  "crop_size": 224,
20
  "do_center_crop": true,
21
  "do_normalize": true,
22
  "do_resize": true,
23
+ "feature_extractor_type": "FlavaFeatureExtractor",
24
  "image_mean": [
25
  0.48145466,
26
  0.4578275,
36
  "mask_group_max_patches": null,
37
  "mask_group_min_aspect_ratio": 0.3,
38
  "mask_group_min_patches": 16,
39
+ "processor_class": "FlavaProcessor",
40
  "resample": 3,
41
  "size": 224,
42
  "total_mask_patches": 75
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aa8315bfc8aa41a9f50a525a8bd9ab00bb9352aa33af9a4fa883101ca7ea00c1
3
- size 1215620615
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de8cb370b6412f4b3c5a7d89d563e357004b252225dadea42abe54d484c036e6
3
+ size 1430791411
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
tokenizer_config.json CHANGED
@@ -1 +1 @@
1
- {"do_lower_case": true, "do_basic_tokenize": true, "never_split": null, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "bert-base-uncased", "tokenizer_class": "BertTokenizer", "processor_class": "FLAVAProcessor"}
1
+ {"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "bert-base-uncased", "tokenizer_class": "BertTokenizer", "processor_class": "FlavaProcessor"}