Update P model
Browse files- config.json +12 -4
config.json
CHANGED
@@ -3,12 +3,12 @@
|
|
3 |
"PerceiverForImageClassification"
|
4 |
],
|
5 |
"attention_probs_dropout_prob": 0.1,
|
|
|
6 |
"cross_attention_shape_for_attention": "kv",
|
7 |
"cross_attention_widening_factor": 1,
|
8 |
"d_latents": 1024,
|
9 |
"d_model": 512,
|
10 |
"hidden_act": "gelu",
|
11 |
-
"hidden_dropout_prob": 0.1,
|
12 |
"id2label": {
|
13 |
"0": "tench, Tinca tinca",
|
14 |
"1": "goldfish, Carassius auratus",
|
@@ -1011,6 +1011,7 @@
|
|
1011 |
"998": "ear, spike, capitulum",
|
1012 |
"999": "toilet tissue, toilet paper, bathroom tissue"
|
1013 |
},
|
|
|
1014 |
"initializer_range": 0.02,
|
1015 |
"label2id": {
|
1016 |
"Afghan hound, Afghan": 160,
|
@@ -2014,22 +2015,29 @@
|
|
2014 |
"zucchini, courgette": 939
|
2015 |
},
|
2016 |
"layer_norm_eps": 1e-12,
|
|
|
2017 |
"model_type": "perceiver",
|
2018 |
"num_blocks": 8,
|
2019 |
"num_cross_attention_heads": 1,
|
|
|
2020 |
"num_latents": 512,
|
2021 |
"num_self_attends_per_block": 6,
|
2022 |
"num_self_attention_heads": 8,
|
2023 |
-
"
|
|
|
|
|
|
|
|
|
|
|
2024 |
"qk_channels": null,
|
|
|
2025 |
"self_attention_widening_factor": 1,
|
2026 |
-
"seq_len": 2048,
|
2027 |
"torch_dtype": "float32",
|
2028 |
"train_size": [
|
2029 |
368,
|
2030 |
496
|
2031 |
],
|
2032 |
-
"transformers_version": "4.
|
2033 |
"use_query_residual": true,
|
2034 |
"v_channels": null,
|
2035 |
"vocab_size": 262
|
|
|
3 |
"PerceiverForImageClassification"
|
4 |
],
|
5 |
"attention_probs_dropout_prob": 0.1,
|
6 |
+
"audio_samples_per_frame": 1920,
|
7 |
"cross_attention_shape_for_attention": "kv",
|
8 |
"cross_attention_widening_factor": 1,
|
9 |
"d_latents": 1024,
|
10 |
"d_model": 512,
|
11 |
"hidden_act": "gelu",
|
|
|
12 |
"id2label": {
|
13 |
"0": "tench, Tinca tinca",
|
14 |
"1": "goldfish, Carassius auratus",
|
|
|
1011 |
"998": "ear, spike, capitulum",
|
1012 |
"999": "toilet tissue, toilet paper, bathroom tissue"
|
1013 |
},
|
1014 |
+
"image_size": 224,
|
1015 |
"initializer_range": 0.02,
|
1016 |
"label2id": {
|
1017 |
"Afghan hound, Afghan": 160,
|
|
|
2015 |
"zucchini, courgette": 939
|
2016 |
},
|
2017 |
"layer_norm_eps": 1e-12,
|
2018 |
+
"max_position_embeddings": 2048,
|
2019 |
"model_type": "perceiver",
|
2020 |
"num_blocks": 8,
|
2021 |
"num_cross_attention_heads": 1,
|
2022 |
+
"num_frames": 16,
|
2023 |
"num_latents": 512,
|
2024 |
"num_self_attends_per_block": 6,
|
2025 |
"num_self_attention_heads": 8,
|
2026 |
+
"output_shape": [
|
2027 |
+
1,
|
2028 |
+
16,
|
2029 |
+
224,
|
2030 |
+
224
|
2031 |
+
],
|
2032 |
"qk_channels": null,
|
2033 |
+
"samples_per_patch": 16,
|
2034 |
"self_attention_widening_factor": 1,
|
|
|
2035 |
"torch_dtype": "float32",
|
2036 |
"train_size": [
|
2037 |
368,
|
2038 |
496
|
2039 |
],
|
2040 |
+
"transformers_version": "4.12.0.dev0",
|
2041 |
"use_query_residual": true,
|
2042 |
"v_channels": null,
|
2043 |
"vocab_size": 262
|