robin luo committed on
Commit d4559dc
1 Parent(s): b43e686
config.json CHANGED
@@ -1,38 +1,2026 @@
  {
- "architecture": "vit_small_patch16_224",
- "num_classes": 1000,
- "num_features": 384,
- "global_pool": "token",
- "pretrained_cfg": {
- "tag": "augreg_in21k_ft_in1k",
- "custom_load": true,
- "input_size": [
- 3,
- 224,
- 224
- ],
- "fixed_input_size": true,
- "interpolation": "bicubic",
- "crop_pct": 0.9,
- "crop_mode": "center",
-
- "mean": [
- 0.5,
- 0.5,
- 0.5
- ],
- "std": [
- 0.5,
- 0.5,
- 0.5
- ],
- "num_classes": 1000,
- "pool_size": null,
- "first_conv": "patch_embed.proj",
- "classifier": "head"
  },
- "_name_or_path": "magicslabnu/OutEffHop_vit_small_patch16_224",
- "auto_map":
- {"AutoModel": "vision_transformer.VisionTransformer"
- }
- }
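Note on the removed config: the old config.json was in timm format — a top-level "architecture" name plus a "pretrained_cfg" block carrying the eval transform (3x224x224 input, bicubic interpolation, 0.9 center crop, mean/std 0.5) — and its "auto_map" pointed at the custom vision_transformer.py deleted below. A minimal sketch of how a timm-format config like this is consumed, assuming the timm package is installed; the model tag mirrors the deleted "augreg_in21k_ft_in1k" value, and pretrained=True fetches timm's own weights, not this repo's:

import timm

model = timm.create_model('vit_small_patch16_224.augreg_in21k_ft_in1k', pretrained=True)
cfg = timm.data.resolve_data_config({}, model=model)  # resolves input_size, mean/std, crop_pct
transform = timm.data.create_transform(**cfg)         # eval transform matching pretrained_cfg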
1
  {
2
+ "architectures": [
3
+ "ViTForImageClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.0,
6
+ "encoder_stride": 16,
7
+ "hidden_act": "gelu",
8
+ "hidden_dropout_prob": 0.0,
9
+ "hidden_size": 384,
10
+ "id2label": {
11
+ "0": "n01440764",
12
+ "1": "n01443537",
13
+ "2": "n01484850",
14
+ "3": "n01491361",
15
+ "4": "n01494475",
16
+ "5": "n01496331",
17
+ "6": "n01498041",
18
+ "7": "n01514668",
19
+ "8": "n01514859",
20
+ "9": "n01518878",
21
+ "10": "n01530575",
22
+ "11": "n01531178",
23
+ "12": "n01532829",
24
+ "13": "n01534433",
25
+ "14": "n01537544",
26
+ "15": "n01558993",
27
+ "16": "n01560419",
28
+ "17": "n01580077",
29
+ "18": "n01582220",
30
+ "19": "n01592084",
31
+ "20": "n01601694",
32
+ "21": "n01608432",
33
+ "22": "n01614925",
34
+ "23": "n01616318",
35
+ "24": "n01622779",
36
+ "25": "n01629819",
37
+ "26": "n01630670",
38
+ "27": "n01631663",
39
+ "28": "n01632458",
40
+ "29": "n01632777",
41
+ "30": "n01641577",
42
+ "31": "n01644373",
43
+ "32": "n01644900",
44
+ "33": "n01664065",
45
+ "34": "n01665541",
46
+ "35": "n01667114",
47
+ "36": "n01667778",
48
+ "37": "n01669191",
49
+ "38": "n01675722",
50
+ "39": "n01677366",
51
+ "40": "n01682714",
52
+ "41": "n01685808",
53
+ "42": "n01687978",
54
+ "43": "n01688243",
55
+ "44": "n01689811",
56
+ "45": "n01692333",
57
+ "46": "n01693334",
58
+ "47": "n01694178",
59
+ "48": "n01695060",
60
+ "49": "n01697457",
61
+ "50": "n01698640",
62
+ "51": "n01704323",
63
+ "52": "n01728572",
64
+ "53": "n01728920",
65
+ "54": "n01729322",
66
+ "55": "n01729977",
67
+ "56": "n01734418",
68
+ "57": "n01735189",
69
+ "58": "n01737021",
70
+ "59": "n01739381",
71
+ "60": "n01740131",
72
+ "61": "n01742172",
73
+ "62": "n01744401",
74
+ "63": "n01748264",
75
+ "64": "n01749939",
76
+ "65": "n01751748",
77
+ "66": "n01753488",
78
+ "67": "n01755581",
79
+ "68": "n01756291",
80
+ "69": "n01768244",
81
+ "70": "n01770081",
82
+ "71": "n01770393",
83
+ "72": "n01773157",
84
+ "73": "n01773549",
85
+ "74": "n01773797",
86
+ "75": "n01774384",
87
+ "76": "n01774750",
88
+ "77": "n01775062",
89
+ "78": "n01776313",
90
+ "79": "n01784675",
91
+ "80": "n01795545",
92
+ "81": "n01796340",
93
+ "82": "n01797886",
94
+ "83": "n01798484",
95
+ "84": "n01806143",
96
+ "85": "n01806567",
97
+ "86": "n01807496",
98
+ "87": "n01817953",
99
+ "88": "n01818515",
100
+ "89": "n01819313",
101
+ "90": "n01820546",
102
+ "91": "n01824575",
103
+ "92": "n01828970",
104
+ "93": "n01829413",
105
+ "94": "n01833805",
106
+ "95": "n01843065",
107
+ "96": "n01843383",
108
+ "97": "n01847000",
109
+ "98": "n01855032",
110
+ "99": "n01855672",
111
+ "100": "n01860187",
112
+ "101": "n01871265",
113
+ "102": "n01872401",
114
+ "103": "n01873310",
115
+ "104": "n01877812",
116
+ "105": "n01882714",
117
+ "106": "n01883070",
118
+ "107": "n01910747",
119
+ "108": "n01914609",
120
+ "109": "n01917289",
121
+ "110": "n01924916",
122
+ "111": "n01930112",
123
+ "112": "n01943899",
124
+ "113": "n01944390",
125
+ "114": "n01945685",
126
+ "115": "n01950731",
127
+ "116": "n01955084",
128
+ "117": "n01968897",
129
+ "118": "n01978287",
130
+ "119": "n01978455",
131
+ "120": "n01980166",
132
+ "121": "n01981276",
133
+ "122": "n01983481",
134
+ "123": "n01984695",
135
+ "124": "n01985128",
136
+ "125": "n01986214",
137
+ "126": "n01990800",
138
+ "127": "n02002556",
139
+ "128": "n02002724",
140
+ "129": "n02006656",
141
+ "130": "n02007558",
142
+ "131": "n02009229",
143
+ "132": "n02009912",
144
+ "133": "n02011460",
145
+ "134": "n02012849",
146
+ "135": "n02013706",
147
+ "136": "n02017213",
148
+ "137": "n02018207",
149
+ "138": "n02018795",
150
+ "139": "n02025239",
151
+ "140": "n02027492",
152
+ "141": "n02028035",
153
+ "142": "n02033041",
154
+ "143": "n02037110",
155
+ "144": "n02051845",
156
+ "145": "n02056570",
157
+ "146": "n02058221",
158
+ "147": "n02066245",
159
+ "148": "n02071294",
160
+ "149": "n02074367",
161
+ "150": "n02077923",
162
+ "151": "n02085620",
163
+ "152": "n02085782",
164
+ "153": "n02085936",
165
+ "154": "n02086079",
166
+ "155": "n02086240",
167
+ "156": "n02086646",
168
+ "157": "n02086910",
169
+ "158": "n02087046",
170
+ "159": "n02087394",
171
+ "160": "n02088094",
172
+ "161": "n02088238",
173
+ "162": "n02088364",
174
+ "163": "n02088466",
175
+ "164": "n02088632",
176
+ "165": "n02089078",
177
+ "166": "n02089867",
178
+ "167": "n02089973",
179
+ "168": "n02090379",
180
+ "169": "n02090622",
181
+ "170": "n02090721",
182
+ "171": "n02091032",
183
+ "172": "n02091134",
184
+ "173": "n02091244",
185
+ "174": "n02091467",
186
+ "175": "n02091635",
187
+ "176": "n02091831",
188
+ "177": "n02092002",
189
+ "178": "n02092339",
190
+ "179": "n02093256",
191
+ "180": "n02093428",
192
+ "181": "n02093647",
193
+ "182": "n02093754",
194
+ "183": "n02093859",
195
+ "184": "n02093991",
196
+ "185": "n02094114",
197
+ "186": "n02094258",
198
+ "187": "n02094433",
199
+ "188": "n02095314",
200
+ "189": "n02095570",
201
+ "190": "n02095889",
202
+ "191": "n02096051",
203
+ "192": "n02096177",
204
+ "193": "n02096294",
205
+ "194": "n02096437",
206
+ "195": "n02096585",
207
+ "196": "n02097047",
208
+ "197": "n02097130",
209
+ "198": "n02097209",
210
+ "199": "n02097298",
211
+ "200": "n02097474",
212
+ "201": "n02097658",
213
+ "202": "n02098105",
214
+ "203": "n02098286",
215
+ "204": "n02098413",
216
+ "205": "n02099267",
217
+ "206": "n02099429",
218
+ "207": "n02099601",
219
+ "208": "n02099712",
220
+ "209": "n02099849",
221
+ "210": "n02100236",
222
+ "211": "n02100583",
223
+ "212": "n02100735",
224
+ "213": "n02100877",
225
+ "214": "n02101006",
226
+ "215": "n02101388",
227
+ "216": "n02101556",
228
+ "217": "n02102040",
229
+ "218": "n02102177",
230
+ "219": "n02102318",
231
+ "220": "n02102480",
232
+ "221": "n02102973",
233
+ "222": "n02104029",
234
+ "223": "n02104365",
235
+ "224": "n02105056",
236
+ "225": "n02105162",
237
+ "226": "n02105251",
238
+ "227": "n02105412",
239
+ "228": "n02105505",
240
+ "229": "n02105641",
241
+ "230": "n02105855",
242
+ "231": "n02106030",
243
+ "232": "n02106166",
244
+ "233": "n02106382",
245
+ "234": "n02106550",
246
+ "235": "n02106662",
247
+ "236": "n02107142",
248
+ "237": "n02107312",
249
+ "238": "n02107574",
250
+ "239": "n02107683",
251
+ "240": "n02107908",
252
+ "241": "n02108000",
253
+ "242": "n02108089",
254
+ "243": "n02108422",
255
+ "244": "n02108551",
256
+ "245": "n02108915",
257
+ "246": "n02109047",
258
+ "247": "n02109525",
259
+ "248": "n02109961",
260
+ "249": "n02110063",
261
+ "250": "n02110185",
262
+ "251": "n02110341",
263
+ "252": "n02110627",
264
+ "253": "n02110806",
265
+ "254": "n02110958",
266
+ "255": "n02111129",
267
+ "256": "n02111277",
268
+ "257": "n02111500",
269
+ "258": "n02111889",
270
+ "259": "n02112018",
271
+ "260": "n02112137",
272
+ "261": "n02112350",
273
+ "262": "n02112706",
274
+ "263": "n02113023",
275
+ "264": "n02113186",
276
+ "265": "n02113624",
277
+ "266": "n02113712",
278
+ "267": "n02113799",
279
+ "268": "n02113978",
280
+ "269": "n02114367",
281
+ "270": "n02114548",
282
+ "271": "n02114712",
283
+ "272": "n02114855",
284
+ "273": "n02115641",
285
+ "274": "n02115913",
286
+ "275": "n02116738",
287
+ "276": "n02117135",
288
+ "277": "n02119022",
289
+ "278": "n02119789",
290
+ "279": "n02120079",
291
+ "280": "n02120505",
292
+ "281": "n02123045",
293
+ "282": "n02123159",
294
+ "283": "n02123394",
295
+ "284": "n02123597",
296
+ "285": "n02124075",
297
+ "286": "n02125311",
298
+ "287": "n02127052",
299
+ "288": "n02128385",
300
+ "289": "n02128757",
301
+ "290": "n02128925",
302
+ "291": "n02129165",
303
+ "292": "n02129604",
304
+ "293": "n02130308",
305
+ "294": "n02132136",
306
+ "295": "n02133161",
307
+ "296": "n02134084",
308
+ "297": "n02134418",
309
+ "298": "n02137549",
310
+ "299": "n02138441",
311
+ "300": "n02165105",
312
+ "301": "n02165456",
313
+ "302": "n02167151",
314
+ "303": "n02168699",
315
+ "304": "n02169497",
316
+ "305": "n02172182",
317
+ "306": "n02174001",
318
+ "307": "n02177972",
319
+ "308": "n02190166",
320
+ "309": "n02206856",
321
+ "310": "n02219486",
322
+ "311": "n02226429",
323
+ "312": "n02229544",
324
+ "313": "n02231487",
325
+ "314": "n02233338",
326
+ "315": "n02236044",
327
+ "316": "n02256656",
328
+ "317": "n02259212",
329
+ "318": "n02264363",
330
+ "319": "n02268443",
331
+ "320": "n02268853",
332
+ "321": "n02276258",
333
+ "322": "n02277742",
334
+ "323": "n02279972",
335
+ "324": "n02280649",
336
+ "325": "n02281406",
337
+ "326": "n02281787",
338
+ "327": "n02317335",
339
+ "328": "n02319095",
340
+ "329": "n02321529",
341
+ "330": "n02325366",
342
+ "331": "n02326432",
343
+ "332": "n02328150",
344
+ "333": "n02342885",
345
+ "334": "n02346627",
346
+ "335": "n02356798",
347
+ "336": "n02361337",
348
+ "337": "n02363005",
349
+ "338": "n02364673",
350
+ "339": "n02389026",
351
+ "340": "n02391049",
352
+ "341": "n02395406",
353
+ "342": "n02396427",
354
+ "343": "n02397096",
355
+ "344": "n02398521",
356
+ "345": "n02403003",
357
+ "346": "n02408429",
358
+ "347": "n02410509",
359
+ "348": "n02412080",
360
+ "349": "n02415577",
361
+ "350": "n02417914",
362
+ "351": "n02422106",
363
+ "352": "n02422699",
364
+ "353": "n02423022",
365
+ "354": "n02437312",
366
+ "355": "n02437616",
367
+ "356": "n02441942",
368
+ "357": "n02442845",
369
+ "358": "n02443114",
370
+ "359": "n02443484",
371
+ "360": "n02444819",
372
+ "361": "n02445715",
373
+ "362": "n02447366",
374
+ "363": "n02454379",
375
+ "364": "n02457408",
376
+ "365": "n02480495",
377
+ "366": "n02480855",
378
+ "367": "n02481823",
379
+ "368": "n02483362",
380
+ "369": "n02483708",
381
+ "370": "n02484975",
382
+ "371": "n02486261",
383
+ "372": "n02486410",
384
+ "373": "n02487347",
385
+ "374": "n02488291",
386
+ "375": "n02488702",
387
+ "376": "n02489166",
388
+ "377": "n02490219",
389
+ "378": "n02492035",
390
+ "379": "n02492660",
391
+ "380": "n02493509",
392
+ "381": "n02493793",
393
+ "382": "n02494079",
394
+ "383": "n02497673",
395
+ "384": "n02500267",
396
+ "385": "n02504013",
397
+ "386": "n02504458",
398
+ "387": "n02509815",
399
+ "388": "n02510455",
400
+ "389": "n02514041",
401
+ "390": "n02526121",
402
+ "391": "n02536864",
403
+ "392": "n02606052",
404
+ "393": "n02607072",
405
+ "394": "n02640242",
406
+ "395": "n02641379",
407
+ "396": "n02643566",
408
+ "397": "n02655020",
409
+ "398": "n02666196",
410
+ "399": "n02667093",
411
+ "400": "n02669723",
412
+ "401": "n02672831",
413
+ "402": "n02676566",
414
+ "403": "n02687172",
415
+ "404": "n02690373",
416
+ "405": "n02692877",
417
+ "406": "n02699494",
418
+ "407": "n02701002",
419
+ "408": "n02704792",
420
+ "409": "n02708093",
421
+ "410": "n02727426",
422
+ "411": "n02730930",
423
+ "412": "n02747177",
424
+ "413": "n02749479",
425
+ "414": "n02769748",
426
+ "415": "n02776631",
427
+ "416": "n02777292",
428
+ "417": "n02782093",
429
+ "418": "n02783161",
430
+ "419": "n02786058",
431
+ "420": "n02787622",
432
+ "421": "n02788148",
433
+ "422": "n02790996",
434
+ "423": "n02791124",
435
+ "424": "n02791270",
436
+ "425": "n02793495",
437
+ "426": "n02794156",
438
+ "427": "n02795169",
439
+ "428": "n02797295",
440
+ "429": "n02799071",
441
+ "430": "n02802426",
442
+ "431": "n02804414",
443
+ "432": "n02804610",
444
+ "433": "n02807133",
445
+ "434": "n02808304",
446
+ "435": "n02808440",
447
+ "436": "n02814533",
448
+ "437": "n02814860",
449
+ "438": "n02815834",
450
+ "439": "n02817516",
451
+ "440": "n02823428",
452
+ "441": "n02823750",
453
+ "442": "n02825657",
454
+ "443": "n02834397",
455
+ "444": "n02835271",
456
+ "445": "n02837789",
457
+ "446": "n02840245",
458
+ "447": "n02841315",
459
+ "448": "n02843684",
460
+ "449": "n02859443",
461
+ "450": "n02860847",
462
+ "451": "n02865351",
463
+ "452": "n02869837",
464
+ "453": "n02870880",
465
+ "454": "n02871525",
466
+ "455": "n02877765",
467
+ "456": "n02879718",
468
+ "457": "n02883205",
469
+ "458": "n02892201",
470
+ "459": "n02892767",
471
+ "460": "n02894605",
472
+ "461": "n02895154",
473
+ "462": "n02906734",
474
+ "463": "n02909870",
475
+ "464": "n02910353",
476
+ "465": "n02916936",
477
+ "466": "n02917067",
478
+ "467": "n02927161",
479
+ "468": "n02930766",
480
+ "469": "n02939185",
481
+ "470": "n02948072",
482
+ "471": "n02950826",
483
+ "472": "n02951358",
484
+ "473": "n02951585",
485
+ "474": "n02963159",
486
+ "475": "n02965783",
487
+ "476": "n02966193",
488
+ "477": "n02966687",
489
+ "478": "n02971356",
490
+ "479": "n02974003",
491
+ "480": "n02977058",
492
+ "481": "n02978881",
493
+ "482": "n02979186",
494
+ "483": "n02980441",
495
+ "484": "n02981792",
496
+ "485": "n02988304",
497
+ "486": "n02992211",
498
+ "487": "n02992529",
499
+ "488": "n02999410",
500
+ "489": "n03000134",
501
+ "490": "n03000247",
502
+ "491": "n03000684",
503
+ "492": "n03014705",
504
+ "493": "n03016953",
505
+ "494": "n03017168",
506
+ "495": "n03018349",
507
+ "496": "n03026506",
508
+ "497": "n03028079",
509
+ "498": "n03032252",
510
+ "499": "n03041632",
511
+ "500": "n03042490",
512
+ "501": "n03045698",
513
+ "502": "n03047690",
514
+ "503": "n03062245",
515
+ "504": "n03063599",
516
+ "505": "n03063689",
517
+ "506": "n03065424",
518
+ "507": "n03075370",
519
+ "508": "n03085013",
520
+ "509": "n03089624",
521
+ "510": "n03095699",
522
+ "511": "n03100240",
523
+ "512": "n03109150",
524
+ "513": "n03110669",
525
+ "514": "n03124043",
526
+ "515": "n03124170",
527
+ "516": "n03125729",
528
+ "517": "n03126707",
529
+ "518": "n03127747",
530
+ "519": "n03127925",
531
+ "520": "n03131574",
532
+ "521": "n03133878",
533
+ "522": "n03134739",
534
+ "523": "n03141823",
535
+ "524": "n03146219",
536
+ "525": "n03160309",
537
+ "526": "n03179701",
538
+ "527": "n03180011",
539
+ "528": "n03187595",
540
+ "529": "n03188531",
541
+ "530": "n03196217",
542
+ "531": "n03197337",
543
+ "532": "n03201208",
544
+ "533": "n03207743",
545
+ "534": "n03207941",
546
+ "535": "n03208938",
547
+ "536": "n03216828",
548
+ "537": "n03218198",
549
+ "538": "n03220513",
550
+ "539": "n03223299",
551
+ "540": "n03240683",
552
+ "541": "n03249569",
553
+ "542": "n03250847",
554
+ "543": "n03255030",
555
+ "544": "n03259280",
556
+ "545": "n03271574",
557
+ "546": "n03272010",
558
+ "547": "n03272562",
559
+ "548": "n03290653",
560
+ "549": "n03291819",
561
+ "550": "n03297495",
562
+ "551": "n03314780",
563
+ "552": "n03325584",
564
+ "553": "n03337140",
565
+ "554": "n03344393",
566
+ "555": "n03345487",
567
+ "556": "n03347037",
568
+ "557": "n03355925",
569
+ "558": "n03372029",
570
+ "559": "n03376595",
571
+ "560": "n03379051",
572
+ "561": "n03384352",
573
+ "562": "n03388043",
574
+ "563": "n03388183",
575
+ "564": "n03388549",
576
+ "565": "n03393912",
577
+ "566": "n03394916",
578
+ "567": "n03400231",
579
+ "568": "n03404251",
580
+ "569": "n03417042",
581
+ "570": "n03424325",
582
+ "571": "n03425413",
583
+ "572": "n03443371",
584
+ "573": "n03444034",
585
+ "574": "n03445777",
586
+ "575": "n03445924",
587
+ "576": "n03447447",
588
+ "577": "n03447721",
589
+ "578": "n03450230",
590
+ "579": "n03452741",
591
+ "580": "n03457902",
592
+ "581": "n03459775",
593
+ "582": "n03461385",
594
+ "583": "n03467068",
595
+ "584": "n03476684",
596
+ "585": "n03476991",
597
+ "586": "n03478589",
598
+ "587": "n03481172",
599
+ "588": "n03482405",
600
+ "589": "n03483316",
601
+ "590": "n03485407",
602
+ "591": "n03485794",
603
+ "592": "n03492542",
604
+ "593": "n03494278",
605
+ "594": "n03495258",
606
+ "595": "n03496892",
607
+ "596": "n03498962",
608
+ "597": "n03527444",
609
+ "598": "n03529860",
610
+ "599": "n03530642",
611
+ "600": "n03532672",
612
+ "601": "n03534580",
613
+ "602": "n03535780",
614
+ "603": "n03538406",
615
+ "604": "n03544143",
616
+ "605": "n03584254",
617
+ "606": "n03584829",
618
+ "607": "n03590841",
619
+ "608": "n03594734",
620
+ "609": "n03594945",
621
+ "610": "n03595614",
622
+ "611": "n03598930",
623
+ "612": "n03599486",
624
+ "613": "n03602883",
625
+ "614": "n03617480",
626
+ "615": "n03623198",
627
+ "616": "n03627232",
628
+ "617": "n03630383",
629
+ "618": "n03633091",
630
+ "619": "n03637318",
631
+ "620": "n03642806",
632
+ "621": "n03649909",
633
+ "622": "n03657121",
634
+ "623": "n03658185",
635
+ "624": "n03661043",
636
+ "625": "n03662601",
637
+ "626": "n03666591",
638
+ "627": "n03670208",
639
+ "628": "n03673027",
640
+ "629": "n03676483",
641
+ "630": "n03680355",
642
+ "631": "n03690938",
643
+ "632": "n03691459",
644
+ "633": "n03692522",
645
+ "634": "n03697007",
646
+ "635": "n03706229",
647
+ "636": "n03709823",
648
+ "637": "n03710193",
649
+ "638": "n03710637",
650
+ "639": "n03710721",
651
+ "640": "n03717622",
652
+ "641": "n03720891",
653
+ "642": "n03721384",
654
+ "643": "n03724870",
655
+ "644": "n03729826",
656
+ "645": "n03733131",
657
+ "646": "n03733281",
658
+ "647": "n03733805",
659
+ "648": "n03742115",
660
+ "649": "n03743016",
661
+ "650": "n03759954",
662
+ "651": "n03761084",
663
+ "652": "n03763968",
664
+ "653": "n03764736",
665
+ "654": "n03769881",
666
+ "655": "n03770439",
667
+ "656": "n03770679",
668
+ "657": "n03773504",
669
+ "658": "n03775071",
670
+ "659": "n03775546",
671
+ "660": "n03776460",
672
+ "661": "n03777568",
673
+ "662": "n03777754",
674
+ "663": "n03781244",
675
+ "664": "n03782006",
676
+ "665": "n03785016",
677
+ "666": "n03786901",
678
+ "667": "n03787032",
679
+ "668": "n03788195",
680
+ "669": "n03788365",
681
+ "670": "n03791053",
682
+ "671": "n03792782",
683
+ "672": "n03792972",
684
+ "673": "n03793489",
685
+ "674": "n03794056",
686
+ "675": "n03796401",
687
+ "676": "n03803284",
688
+ "677": "n03804744",
689
+ "678": "n03814639",
690
+ "679": "n03814906",
691
+ "680": "n03825788",
692
+ "681": "n03832673",
693
+ "682": "n03837869",
694
+ "683": "n03838899",
695
+ "684": "n03840681",
696
+ "685": "n03841143",
697
+ "686": "n03843555",
698
+ "687": "n03854065",
699
+ "688": "n03857828",
700
+ "689": "n03866082",
701
+ "690": "n03868242",
702
+ "691": "n03868863",
703
+ "692": "n03871628",
704
+ "693": "n03873416",
705
+ "694": "n03874293",
706
+ "695": "n03874599",
707
+ "696": "n03876231",
708
+ "697": "n03877472",
709
+ "698": "n03877845",
710
+ "699": "n03884397",
711
+ "700": "n03887697",
712
+ "701": "n03888257",
713
+ "702": "n03888605",
714
+ "703": "n03891251",
715
+ "704": "n03891332",
716
+ "705": "n03895866",
717
+ "706": "n03899768",
718
+ "707": "n03902125",
719
+ "708": "n03903868",
720
+ "709": "n03908618",
721
+ "710": "n03908714",
722
+ "711": "n03916031",
723
+ "712": "n03920288",
724
+ "713": "n03924679",
725
+ "714": "n03929660",
726
+ "715": "n03929855",
727
+ "716": "n03930313",
728
+ "717": "n03930630",
729
+ "718": "n03933933",
730
+ "719": "n03935335",
731
+ "720": "n03937543",
732
+ "721": "n03938244",
733
+ "722": "n03942813",
734
+ "723": "n03944341",
735
+ "724": "n03947888",
736
+ "725": "n03950228",
737
+ "726": "n03954731",
738
+ "727": "n03956157",
739
+ "728": "n03958227",
740
+ "729": "n03961711",
741
+ "730": "n03967562",
742
+ "731": "n03970156",
743
+ "732": "n03976467",
744
+ "733": "n03976657",
745
+ "734": "n03977966",
746
+ "735": "n03980874",
747
+ "736": "n03982430",
748
+ "737": "n03983396",
749
+ "738": "n03991062",
750
+ "739": "n03992509",
751
+ "740": "n03995372",
752
+ "741": "n03998194",
753
+ "742": "n04004767",
754
+ "743": "n04005630",
755
+ "744": "n04008634",
756
+ "745": "n04009552",
757
+ "746": "n04019541",
758
+ "747": "n04023962",
759
+ "748": "n04026417",
760
+ "749": "n04033901",
761
+ "750": "n04033995",
762
+ "751": "n04037443",
763
+ "752": "n04039381",
764
+ "753": "n04040759",
765
+ "754": "n04041544",
766
+ "755": "n04044716",
767
+ "756": "n04049303",
768
+ "757": "n04065272",
769
+ "758": "n04067472",
770
+ "759": "n04069434",
771
+ "760": "n04070727",
772
+ "761": "n04074963",
773
+ "762": "n04081281",
774
+ "763": "n04086273",
775
+ "764": "n04090263",
776
+ "765": "n04099969",
777
+ "766": "n04111531",
778
+ "767": "n04116512",
779
+ "768": "n04118538",
780
+ "769": "n04118776",
781
+ "770": "n04120489",
782
+ "771": "n04125021",
783
+ "772": "n04127249",
784
+ "773": "n04131690",
785
+ "774": "n04133789",
786
+ "775": "n04136333",
787
+ "776": "n04141076",
788
+ "777": "n04141327",
789
+ "778": "n04141975",
790
+ "779": "n04146614",
791
+ "780": "n04147183",
792
+ "781": "n04149813",
793
+ "782": "n04152593",
794
+ "783": "n04153751",
795
+ "784": "n04154565",
796
+ "785": "n04162706",
797
+ "786": "n04179913",
798
+ "787": "n04192698",
799
+ "788": "n04200800",
800
+ "789": "n04201297",
801
+ "790": "n04204238",
802
+ "791": "n04204347",
803
+ "792": "n04208210",
804
+ "793": "n04209133",
805
+ "794": "n04209239",
806
+ "795": "n04228054",
807
+ "796": "n04229816",
808
+ "797": "n04235860",
809
+ "798": "n04238763",
810
+ "799": "n04239074",
811
+ "800": "n04243546",
812
+ "801": "n04251144",
813
+ "802": "n04252077",
814
+ "803": "n04252225",
815
+ "804": "n04254120",
816
+ "805": "n04254680",
817
+ "806": "n04254777",
818
+ "807": "n04258138",
819
+ "808": "n04259630",
820
+ "809": "n04263257",
821
+ "810": "n04264628",
822
+ "811": "n04265275",
823
+ "812": "n04266014",
824
+ "813": "n04270147",
825
+ "814": "n04273569",
826
+ "815": "n04275548",
827
+ "816": "n04277352",
828
+ "817": "n04285008",
829
+ "818": "n04286575",
830
+ "819": "n04296562",
831
+ "820": "n04310018",
832
+ "821": "n04311004",
833
+ "822": "n04311174",
834
+ "823": "n04317175",
835
+ "824": "n04325704",
836
+ "825": "n04326547",
837
+ "826": "n04328186",
838
+ "827": "n04330267",
839
+ "828": "n04332243",
840
+ "829": "n04335435",
841
+ "830": "n04336792",
842
+ "831": "n04344873",
843
+ "832": "n04346328",
844
+ "833": "n04347754",
845
+ "834": "n04350905",
846
+ "835": "n04355338",
847
+ "836": "n04355933",
848
+ "837": "n04356056",
849
+ "838": "n04357314",
850
+ "839": "n04366367",
851
+ "840": "n04367480",
852
+ "841": "n04370456",
853
+ "842": "n04371430",
854
+ "843": "n04371774",
855
+ "844": "n04372370",
856
+ "845": "n04376876",
857
+ "846": "n04380533",
858
+ "847": "n04389033",
859
+ "848": "n04392985",
860
+ "849": "n04398044",
861
+ "850": "n04399382",
862
+ "851": "n04404412",
863
+ "852": "n04409515",
864
+ "853": "n04417672",
865
+ "854": "n04418357",
866
+ "855": "n04423845",
867
+ "856": "n04428191",
868
+ "857": "n04429376",
869
+ "858": "n04435653",
870
+ "859": "n04442312",
871
+ "860": "n04443257",
872
+ "861": "n04447861",
873
+ "862": "n04456115",
874
+ "863": "n04458633",
875
+ "864": "n04461696",
876
+ "865": "n04462240",
877
+ "866": "n04465501",
878
+ "867": "n04467665",
879
+ "868": "n04476259",
880
+ "869": "n04479046",
881
+ "870": "n04482393",
882
+ "871": "n04483307",
883
+ "872": "n04485082",
884
+ "873": "n04486054",
885
+ "874": "n04487081",
886
+ "875": "n04487394",
887
+ "876": "n04493381",
888
+ "877": "n04501370",
889
+ "878": "n04505470",
890
+ "879": "n04507155",
891
+ "880": "n04509417",
892
+ "881": "n04515003",
893
+ "882": "n04517823",
894
+ "883": "n04522168",
895
+ "884": "n04523525",
896
+ "885": "n04525038",
897
+ "886": "n04525305",
898
+ "887": "n04532106",
899
+ "888": "n04532670",
900
+ "889": "n04536866",
901
+ "890": "n04540053",
902
+ "891": "n04542943",
903
+ "892": "n04548280",
904
+ "893": "n04548362",
905
+ "894": "n04550184",
906
+ "895": "n04552348",
907
+ "896": "n04553703",
908
+ "897": "n04554684",
909
+ "898": "n04557648",
910
+ "899": "n04560804",
911
+ "900": "n04562935",
912
+ "901": "n04579145",
913
+ "902": "n04579432",
914
+ "903": "n04584207",
915
+ "904": "n04589890",
916
+ "905": "n04590129",
917
+ "906": "n04591157",
918
+ "907": "n04591713",
919
+ "908": "n04592741",
920
+ "909": "n04596742",
921
+ "910": "n04597913",
922
+ "911": "n04599235",
923
+ "912": "n04604644",
924
+ "913": "n04606251",
925
+ "914": "n04612504",
926
+ "915": "n04613696",
927
+ "916": "n06359193",
928
+ "917": "n06596364",
929
+ "918": "n06785654",
930
+ "919": "n06794110",
931
+ "920": "n06874185",
932
+ "921": "n07248320",
933
+ "922": "n07565083",
934
+ "923": "n07579787",
935
+ "924": "n07583066",
936
+ "925": "n07584110",
937
+ "926": "n07590611",
938
+ "927": "n07613480",
939
+ "928": "n07614500",
940
+ "929": "n07615774",
941
+ "930": "n07684084",
942
+ "931": "n07693725",
943
+ "932": "n07695742",
944
+ "933": "n07697313",
945
+ "934": "n07697537",
946
+ "935": "n07711569",
947
+ "936": "n07714571",
948
+ "937": "n07714990",
949
+ "938": "n07715103",
950
+ "939": "n07716358",
951
+ "940": "n07716906",
952
+ "941": "n07717410",
953
+ "942": "n07717556",
954
+ "943": "n07718472",
955
+ "944": "n07718747",
956
+ "945": "n07720875",
957
+ "946": "n07730033",
958
+ "947": "n07734744",
959
+ "948": "n07742313",
960
+ "949": "n07745940",
961
+ "950": "n07747607",
962
+ "951": "n07749582",
963
+ "952": "n07753113",
964
+ "953": "n07753275",
965
+ "954": "n07753592",
966
+ "955": "n07754684",
967
+ "956": "n07760859",
968
+ "957": "n07768694",
969
+ "958": "n07802026",
970
+ "959": "n07831146",
971
+ "960": "n07836838",
972
+ "961": "n07860988",
973
+ "962": "n07871810",
974
+ "963": "n07873807",
975
+ "964": "n07875152",
976
+ "965": "n07880968",
977
+ "966": "n07892512",
978
+ "967": "n07920052",
979
+ "968": "n07930864",
980
+ "969": "n07932039",
981
+ "970": "n09193705",
982
+ "971": "n09229709",
983
+ "972": "n09246464",
984
+ "973": "n09256479",
985
+ "974": "n09288635",
986
+ "975": "n09332890",
987
+ "976": "n09399592",
988
+ "977": "n09421951",
989
+ "978": "n09428293",
990
+ "979": "n09468604",
991
+ "980": "n09472597",
992
+ "981": "n09835506",
993
+ "982": "n10148035",
994
+ "983": "n10565667",
995
+ "984": "n11879895",
996
+ "985": "n11939491",
997
+ "986": "n12057211",
998
+ "987": "n12144580",
999
+ "988": "n12267677",
1000
+ "989": "n12620546",
1001
+ "990": "n12768682",
1002
+ "991": "n12985857",
1003
+ "992": "n12998815",
1004
+ "993": "n13037406",
1005
+ "994": "n13040303",
1006
+ "995": "n13044778",
1007
+ "996": "n13052670",
1008
+ "997": "n13054560",
1009
+ "998": "n13133613",
1010
+ "999": "n15075141"
1011
  },
1012
+ "image_size": 224,
1013
+ "initializer_range": 0.02,
1014
+ "intermediate_size": 1536,
1015
+ "label2id": {
1016
+ "n01440764": 0,
1017
+ "n01443537": 1,
1018
+ "n01484850": 2,
1019
+ "n01491361": 3,
1020
+ "n01494475": 4,
1021
+ "n01496331": 5,
1022
+ "n01498041": 6,
1023
+ "n01514668": 7,
1024
+ "n01514859": 8,
1025
+ "n01518878": 9,
1026
+ "n01530575": 10,
1027
+ "n01531178": 11,
1028
+ "n01532829": 12,
1029
+ "n01534433": 13,
1030
+ "n01537544": 14,
1031
+ "n01558993": 15,
1032
+ "n01560419": 16,
1033
+ "n01580077": 17,
1034
+ "n01582220": 18,
1035
+ "n01592084": 19,
1036
+ "n01601694": 20,
1037
+ "n01608432": 21,
1038
+ "n01614925": 22,
1039
+ "n01616318": 23,
1040
+ "n01622779": 24,
1041
+ "n01629819": 25,
1042
+ "n01630670": 26,
1043
+ "n01631663": 27,
1044
+ "n01632458": 28,
1045
+ "n01632777": 29,
1046
+ "n01641577": 30,
1047
+ "n01644373": 31,
1048
+ "n01644900": 32,
1049
+ "n01664065": 33,
1050
+ "n01665541": 34,
1051
+ "n01667114": 35,
1052
+ "n01667778": 36,
1053
+ "n01669191": 37,
1054
+ "n01675722": 38,
1055
+ "n01677366": 39,
1056
+ "n01682714": 40,
1057
+ "n01685808": 41,
1058
+ "n01687978": 42,
1059
+ "n01688243": 43,
1060
+ "n01689811": 44,
1061
+ "n01692333": 45,
1062
+ "n01693334": 46,
1063
+ "n01694178": 47,
1064
+ "n01695060": 48,
1065
+ "n01697457": 49,
1066
+ "n01698640": 50,
1067
+ "n01704323": 51,
1068
+ "n01728572": 52,
1069
+ "n01728920": 53,
1070
+ "n01729322": 54,
1071
+ "n01729977": 55,
1072
+ "n01734418": 56,
1073
+ "n01735189": 57,
1074
+ "n01737021": 58,
1075
+ "n01739381": 59,
1076
+ "n01740131": 60,
1077
+ "n01742172": 61,
1078
+ "n01744401": 62,
1079
+ "n01748264": 63,
1080
+ "n01749939": 64,
1081
+ "n01751748": 65,
1082
+ "n01753488": 66,
1083
+ "n01755581": 67,
1084
+ "n01756291": 68,
1085
+ "n01768244": 69,
1086
+ "n01770081": 70,
1087
+ "n01770393": 71,
1088
+ "n01773157": 72,
1089
+ "n01773549": 73,
1090
+ "n01773797": 74,
1091
+ "n01774384": 75,
1092
+ "n01774750": 76,
1093
+ "n01775062": 77,
1094
+ "n01776313": 78,
1095
+ "n01784675": 79,
1096
+ "n01795545": 80,
1097
+ "n01796340": 81,
1098
+ "n01797886": 82,
1099
+ "n01798484": 83,
1100
+ "n01806143": 84,
1101
+ "n01806567": 85,
1102
+ "n01807496": 86,
1103
+ "n01817953": 87,
1104
+ "n01818515": 88,
1105
+ "n01819313": 89,
1106
+ "n01820546": 90,
1107
+ "n01824575": 91,
1108
+ "n01828970": 92,
1109
+ "n01829413": 93,
1110
+ "n01833805": 94,
1111
+ "n01843065": 95,
1112
+ "n01843383": 96,
1113
+ "n01847000": 97,
1114
+ "n01855032": 98,
1115
+ "n01855672": 99,
1116
+ "n01860187": 100,
1117
+ "n01871265": 101,
1118
+ "n01872401": 102,
1119
+ "n01873310": 103,
1120
+ "n01877812": 104,
1121
+ "n01882714": 105,
1122
+ "n01883070": 106,
1123
+ "n01910747": 107,
1124
+ "n01914609": 108,
1125
+ "n01917289": 109,
1126
+ "n01924916": 110,
1127
+ "n01930112": 111,
1128
+ "n01943899": 112,
1129
+ "n01944390": 113,
1130
+ "n01945685": 114,
1131
+ "n01950731": 115,
1132
+ "n01955084": 116,
1133
+ "n01968897": 117,
1134
+ "n01978287": 118,
1135
+ "n01978455": 119,
1136
+ "n01980166": 120,
1137
+ "n01981276": 121,
1138
+ "n01983481": 122,
1139
+ "n01984695": 123,
1140
+ "n01985128": 124,
1141
+ "n01986214": 125,
1142
+ "n01990800": 126,
1143
+ "n02002556": 127,
1144
+ "n02002724": 128,
1145
+ "n02006656": 129,
1146
+ "n02007558": 130,
1147
+ "n02009229": 131,
1148
+ "n02009912": 132,
1149
+ "n02011460": 133,
1150
+ "n02012849": 134,
1151
+ "n02013706": 135,
1152
+ "n02017213": 136,
1153
+ "n02018207": 137,
1154
+ "n02018795": 138,
1155
+ "n02025239": 139,
1156
+ "n02027492": 140,
1157
+ "n02028035": 141,
1158
+ "n02033041": 142,
1159
+ "n02037110": 143,
1160
+ "n02051845": 144,
1161
+ "n02056570": 145,
1162
+ "n02058221": 146,
1163
+ "n02066245": 147,
1164
+ "n02071294": 148,
1165
+ "n02074367": 149,
1166
+ "n02077923": 150,
1167
+ "n02085620": 151,
1168
+ "n02085782": 152,
1169
+ "n02085936": 153,
1170
+ "n02086079": 154,
1171
+ "n02086240": 155,
1172
+ "n02086646": 156,
1173
+ "n02086910": 157,
1174
+ "n02087046": 158,
1175
+ "n02087394": 159,
1176
+ "n02088094": 160,
1177
+ "n02088238": 161,
1178
+ "n02088364": 162,
1179
+ "n02088466": 163,
1180
+ "n02088632": 164,
1181
+ "n02089078": 165,
1182
+ "n02089867": 166,
1183
+ "n02089973": 167,
1184
+ "n02090379": 168,
1185
+ "n02090622": 169,
1186
+ "n02090721": 170,
1187
+ "n02091032": 171,
1188
+ "n02091134": 172,
1189
+ "n02091244": 173,
1190
+ "n02091467": 174,
1191
+ "n02091635": 175,
1192
+ "n02091831": 176,
1193
+ "n02092002": 177,
1194
+ "n02092339": 178,
1195
+ "n02093256": 179,
1196
+ "n02093428": 180,
1197
+ "n02093647": 181,
1198
+ "n02093754": 182,
1199
+ "n02093859": 183,
1200
+ "n02093991": 184,
1201
+ "n02094114": 185,
1202
+ "n02094258": 186,
1203
+ "n02094433": 187,
1204
+ "n02095314": 188,
1205
+ "n02095570": 189,
1206
+ "n02095889": 190,
1207
+ "n02096051": 191,
1208
+ "n02096177": 192,
1209
+ "n02096294": 193,
1210
+ "n02096437": 194,
1211
+ "n02096585": 195,
1212
+ "n02097047": 196,
1213
+ "n02097130": 197,
1214
+ "n02097209": 198,
1215
+ "n02097298": 199,
1216
+ "n02097474": 200,
1217
+ "n02097658": 201,
1218
+ "n02098105": 202,
1219
+ "n02098286": 203,
1220
+ "n02098413": 204,
1221
+ "n02099267": 205,
1222
+ "n02099429": 206,
1223
+ "n02099601": 207,
1224
+ "n02099712": 208,
1225
+ "n02099849": 209,
1226
+ "n02100236": 210,
1227
+ "n02100583": 211,
1228
+ "n02100735": 212,
1229
+ "n02100877": 213,
1230
+ "n02101006": 214,
1231
+ "n02101388": 215,
1232
+ "n02101556": 216,
1233
+ "n02102040": 217,
1234
+ "n02102177": 218,
1235
+ "n02102318": 219,
1236
+ "n02102480": 220,
1237
+ "n02102973": 221,
1238
+ "n02104029": 222,
1239
+ "n02104365": 223,
1240
+ "n02105056": 224,
1241
+ "n02105162": 225,
1242
+ "n02105251": 226,
1243
+ "n02105412": 227,
1244
+ "n02105505": 228,
1245
+ "n02105641": 229,
1246
+ "n02105855": 230,
1247
+ "n02106030": 231,
1248
+ "n02106166": 232,
1249
+ "n02106382": 233,
1250
+ "n02106550": 234,
1251
+ "n02106662": 235,
1252
+ "n02107142": 236,
1253
+ "n02107312": 237,
1254
+ "n02107574": 238,
1255
+ "n02107683": 239,
1256
+ "n02107908": 240,
1257
+ "n02108000": 241,
1258
+ "n02108089": 242,
1259
+ "n02108422": 243,
1260
+ "n02108551": 244,
1261
+ "n02108915": 245,
1262
+ "n02109047": 246,
1263
+ "n02109525": 247,
1264
+ "n02109961": 248,
1265
+ "n02110063": 249,
1266
+ "n02110185": 250,
1267
+ "n02110341": 251,
1268
+ "n02110627": 252,
1269
+ "n02110806": 253,
1270
+ "n02110958": 254,
1271
+ "n02111129": 255,
1272
+ "n02111277": 256,
1273
+ "n02111500": 257,
1274
+ "n02111889": 258,
1275
+ "n02112018": 259,
1276
+ "n02112137": 260,
1277
+ "n02112350": 261,
1278
+ "n02112706": 262,
1279
+ "n02113023": 263,
1280
+ "n02113186": 264,
1281
+ "n02113624": 265,
1282
+ "n02113712": 266,
1283
+ "n02113799": 267,
1284
+ "n02113978": 268,
1285
+ "n02114367": 269,
1286
+ "n02114548": 270,
1287
+ "n02114712": 271,
1288
+ "n02114855": 272,
1289
+ "n02115641": 273,
1290
+ "n02115913": 274,
1291
+ "n02116738": 275,
1292
+ "n02117135": 276,
1293
+ "n02119022": 277,
1294
+ "n02119789": 278,
1295
+ "n02120079": 279,
1296
+ "n02120505": 280,
1297
+ "n02123045": 281,
1298
+ "n02123159": 282,
1299
+ "n02123394": 283,
1300
+ "n02123597": 284,
1301
+ "n02124075": 285,
1302
+ "n02125311": 286,
1303
+ "n02127052": 287,
1304
+ "n02128385": 288,
1305
+ "n02128757": 289,
1306
+ "n02128925": 290,
1307
+ "n02129165": 291,
1308
+ "n02129604": 292,
1309
+ "n02130308": 293,
1310
+ "n02132136": 294,
1311
+ "n02133161": 295,
1312
+ "n02134084": 296,
1313
+ "n02134418": 297,
1314
+ "n02137549": 298,
1315
+ "n02138441": 299,
1316
+ "n02165105": 300,
1317
+ "n02165456": 301,
1318
+ "n02167151": 302,
1319
+ "n02168699": 303,
1320
+ "n02169497": 304,
1321
+ "n02172182": 305,
1322
+ "n02174001": 306,
1323
+ "n02177972": 307,
1324
+ "n02190166": 308,
1325
+ "n02206856": 309,
1326
+ "n02219486": 310,
1327
+ "n02226429": 311,
1328
+ "n02229544": 312,
1329
+ "n02231487": 313,
1330
+ "n02233338": 314,
1331
+ "n02236044": 315,
1332
+ "n02256656": 316,
1333
+ "n02259212": 317,
1334
+ "n02264363": 318,
1335
+ "n02268443": 319,
1336
+ "n02268853": 320,
1337
+ "n02276258": 321,
1338
+ "n02277742": 322,
1339
+ "n02279972": 323,
1340
+ "n02280649": 324,
1341
+ "n02281406": 325,
1342
+ "n02281787": 326,
1343
+ "n02317335": 327,
1344
+ "n02319095": 328,
1345
+ "n02321529": 329,
1346
+ "n02325366": 330,
1347
+ "n02326432": 331,
1348
+ "n02328150": 332,
1349
+ "n02342885": 333,
1350
+ "n02346627": 334,
1351
+ "n02356798": 335,
1352
+ "n02361337": 336,
1353
+ "n02363005": 337,
1354
+ "n02364673": 338,
1355
+ "n02389026": 339,
1356
+ "n02391049": 340,
1357
+ "n02395406": 341,
1358
+ "n02396427": 342,
1359
+ "n02397096": 343,
1360
+ "n02398521": 344,
1361
+ "n02403003": 345,
1362
+ "n02408429": 346,
1363
+ "n02410509": 347,
1364
+ "n02412080": 348,
1365
+ "n02415577": 349,
1366
+ "n02417914": 350,
1367
+ "n02422106": 351,
1368
+ "n02422699": 352,
1369
+ "n02423022": 353,
1370
+ "n02437312": 354,
1371
+ "n02437616": 355,
1372
+ "n02441942": 356,
1373
+ "n02442845": 357,
1374
+ "n02443114": 358,
1375
+ "n02443484": 359,
1376
+ "n02444819": 360,
1377
+ "n02445715": 361,
1378
+ "n02447366": 362,
1379
+ "n02454379": 363,
1380
+ "n02457408": 364,
1381
+ "n02480495": 365,
1382
+ "n02480855": 366,
1383
+ "n02481823": 367,
1384
+ "n02483362": 368,
1385
+ "n02483708": 369,
1386
+ "n02484975": 370,
1387
+ "n02486261": 371,
1388
+ "n02486410": 372,
1389
+ "n02487347": 373,
1390
+ "n02488291": 374,
1391
+ "n02488702": 375,
1392
+ "n02489166": 376,
1393
+ "n02490219": 377,
1394
+ "n02492035": 378,
1395
+ "n02492660": 379,
1396
+ "n02493509": 380,
1397
+ "n02493793": 381,
1398
+ "n02494079": 382,
1399
+ "n02497673": 383,
1400
+ "n02500267": 384,
1401
+ "n02504013": 385,
1402
+ "n02504458": 386,
1403
+ "n02509815": 387,
1404
+ "n02510455": 388,
1405
+ "n02514041": 389,
1406
+ "n02526121": 390,
1407
+ "n02536864": 391,
1408
+ "n02606052": 392,
1409
+ "n02607072": 393,
1410
+ "n02640242": 394,
1411
+ "n02641379": 395,
1412
+ "n02643566": 396,
1413
+ "n02655020": 397,
1414
+ "n02666196": 398,
1415
+ "n02667093": 399,
1416
+ "n02669723": 400,
1417
+ "n02672831": 401,
1418
+ "n02676566": 402,
1419
+ "n02687172": 403,
1420
+ "n02690373": 404,
1421
+ "n02692877": 405,
1422
+ "n02699494": 406,
1423
+ "n02701002": 407,
1424
+ "n02704792": 408,
1425
+ "n02708093": 409,
1426
+ "n02727426": 410,
1427
+ "n02730930": 411,
1428
+ "n02747177": 412,
1429
+ "n02749479": 413,
1430
+ "n02769748": 414,
1431
+ "n02776631": 415,
1432
+ "n02777292": 416,
1433
+ "n02782093": 417,
1434
+ "n02783161": 418,
1435
+ "n02786058": 419,
1436
+ "n02787622": 420,
1437
+ "n02788148": 421,
1438
+ "n02790996": 422,
1439
+ "n02791124": 423,
1440
+ "n02791270": 424,
1441
+ "n02793495": 425,
1442
+ "n02794156": 426,
1443
+ "n02795169": 427,
1444
+ "n02797295": 428,
1445
+ "n02799071": 429,
1446
+ "n02802426": 430,
1447
+ "n02804414": 431,
1448
+ "n02804610": 432,
1449
+ "n02807133": 433,
1450
+ "n02808304": 434,
1451
+ "n02808440": 435,
1452
+ "n02814533": 436,
1453
+ "n02814860": 437,
1454
+ "n02815834": 438,
1455
+ "n02817516": 439,
1456
+ "n02823428": 440,
1457
+ "n02823750": 441,
1458
+ "n02825657": 442,
1459
+ "n02834397": 443,
1460
+ "n02835271": 444,
1461
+ "n02837789": 445,
1462
+ "n02840245": 446,
1463
+ "n02841315": 447,
1464
+ "n02843684": 448,
1465
+ "n02859443": 449,
1466
+ "n02860847": 450,
1467
+ "n02865351": 451,
1468
+ "n02869837": 452,
1469
+ "n02870880": 453,
1470
+ "n02871525": 454,
1471
+ "n02877765": 455,
1472
+ "n02879718": 456,
1473
+ "n02883205": 457,
1474
+ "n02892201": 458,
1475
+ "n02892767": 459,
1476
+ "n02894605": 460,
1477
+ "n02895154": 461,
1478
+ "n02906734": 462,
1479
+ "n02909870": 463,
1480
+ "n02910353": 464,
1481
+ "n02916936": 465,
1482
+ "n02917067": 466,
1483
+ "n02927161": 467,
1484
+ "n02930766": 468,
1485
+ "n02939185": 469,
1486
+ "n02948072": 470,
1487
+ "n02950826": 471,
1488
+ "n02951358": 472,
1489
+ "n02951585": 473,
1490
+ "n02963159": 474,
1491
+ "n02965783": 475,
1492
+ "n02966193": 476,
1493
+ "n02966687": 477,
1494
+ "n02971356": 478,
1495
+ "n02974003": 479,
1496
+ "n02977058": 480,
1497
+ "n02978881": 481,
1498
+ "n02979186": 482,
1499
+ "n02980441": 483,
1500
+ "n02981792": 484,
1501
+ "n02988304": 485,
1502
+ "n02992211": 486,
1503
+ "n02992529": 487,
1504
+ "n02999410": 488,
1505
+ "n03000134": 489,
1506
+ "n03000247": 490,
1507
+ "n03000684": 491,
1508
+ "n03014705": 492,
1509
+ "n03016953": 493,
1510
+ "n03017168": 494,
1511
+ "n03018349": 495,
1512
+ "n03026506": 496,
1513
+ "n03028079": 497,
1514
+ "n03032252": 498,
1515
+ "n03041632": 499,
1516
+ "n03042490": 500,
1517
+ "n03045698": 501,
1518
+ "n03047690": 502,
1519
+ "n03062245": 503,
1520
+ "n03063599": 504,
1521
+ "n03063689": 505,
1522
+ "n03065424": 506,
1523
+ "n03075370": 507,
1524
+ "n03085013": 508,
1525
+ "n03089624": 509,
1526
+ "n03095699": 510,
1527
+ "n03100240": 511,
1528
+ "n03109150": 512,
1529
+ "n03110669": 513,
1530
+ "n03124043": 514,
1531
+ "n03124170": 515,
1532
+ "n03125729": 516,
1533
+ "n03126707": 517,
1534
+ "n03127747": 518,
1535
+ "n03127925": 519,
1536
+ "n03131574": 520,
1537
+ "n03133878": 521,
1538
+ "n03134739": 522,
1539
+ "n03141823": 523,
1540
+ "n03146219": 524,
1541
+ "n03160309": 525,
1542
+ "n03179701": 526,
1543
+ "n03180011": 527,
1544
+ "n03187595": 528,
1545
+ "n03188531": 529,
1546
+ "n03196217": 530,
1547
+ "n03197337": 531,
1548
+ "n03201208": 532,
1549
+ "n03207743": 533,
1550
+ "n03207941": 534,
1551
+ "n03208938": 535,
1552
+ "n03216828": 536,
1553
+ "n03218198": 537,
1554
+ "n03220513": 538,
1555
+ "n03223299": 539,
1556
+ "n03240683": 540,
1557
+ "n03249569": 541,
1558
+ "n03250847": 542,
1559
+ "n03255030": 543,
1560
+ "n03259280": 544,
1561
+ "n03271574": 545,
1562
+ "n03272010": 546,
1563
+ "n03272562": 547,
1564
+ "n03290653": 548,
1565
+ "n03291819": 549,
1566
+ "n03297495": 550,
1567
+ "n03314780": 551,
1568
+ "n03325584": 552,
1569
+ "n03337140": 553,
1570
+ "n03344393": 554,
1571
+ "n03345487": 555,
1572
+ "n03347037": 556,
1573
+ "n03355925": 557,
1574
+ "n03372029": 558,
1575
+ "n03376595": 559,
1576
+ "n03379051": 560,
1577
+ "n03384352": 561,
1578
+ "n03388043": 562,
1579
+ "n03388183": 563,
1580
+ "n03388549": 564,
1581
+ "n03393912": 565,
1582
+ "n03394916": 566,
1583
+ "n03400231": 567,
1584
+ "n03404251": 568,
1585
+ "n03417042": 569,
1586
+ "n03424325": 570,
1587
+ "n03425413": 571,
1588
+ "n03443371": 572,
1589
+ "n03444034": 573,
1590
+ "n03445777": 574,
1591
+ "n03445924": 575,
1592
+ "n03447447": 576,
1593
+ "n03447721": 577,
1594
+ "n03450230": 578,
1595
+ "n03452741": 579,
1596
+ "n03457902": 580,
1597
+ "n03459775": 581,
1598
+ "n03461385": 582,
1599
+ "n03467068": 583,
1600
+ "n03476684": 584,
1601
+ "n03476991": 585,
1602
+ "n03478589": 586,
1603
+ "n03481172": 587,
1604
+ "n03482405": 588,
1605
+ "n03483316": 589,
1606
+ "n03485407": 590,
1607
+ "n03485794": 591,
1608
+ "n03492542": 592,
1609
+ "n03494278": 593,
1610
+ "n03495258": 594,
1611
+ "n03496892": 595,
1612
+ "n03498962": 596,
1613
+ "n03527444": 597,
1614
+ "n03529860": 598,
1615
+ "n03530642": 599,
1616
+ "n03532672": 600,
1617
+ "n03534580": 601,
1618
+ "n03535780": 602,
1619
+ "n03538406": 603,
1620
+ "n03544143": 604,
1621
+ "n03584254": 605,
1622
+ "n03584829": 606,
1623
+ "n03590841": 607,
1624
+ "n03594734": 608,
1625
+ "n03594945": 609,
1626
+ "n03595614": 610,
1627
+ "n03598930": 611,
1628
+ "n03599486": 612,
1629
+ "n03602883": 613,
1630
+ "n03617480": 614,
1631
+ "n03623198": 615,
1632
+ "n03627232": 616,
1633
+ "n03630383": 617,
1634
+ "n03633091": 618,
1635
+ "n03637318": 619,
1636
+ "n03642806": 620,
1637
+ "n03649909": 621,
1638
+ "n03657121": 622,
1639
+ "n03658185": 623,
1640
+ "n03661043": 624,
1641
+ "n03662601": 625,
1642
+ "n03666591": 626,
1643
+ "n03670208": 627,
1644
+ "n03673027": 628,
1645
+ "n03676483": 629,
1646
+ "n03680355": 630,
1647
+ "n03690938": 631,
1648
+ "n03691459": 632,
1649
+ "n03692522": 633,
1650
+ "n03697007": 634,
1651
+ "n03706229": 635,
1652
+ "n03709823": 636,
1653
+ "n03710193": 637,
1654
+ "n03710637": 638,
1655
+ "n03710721": 639,
1656
+ "n03717622": 640,
1657
+ "n03720891": 641,
1658
+ "n03721384": 642,
1659
+ "n03724870": 643,
1660
+ "n03729826": 644,
1661
+ "n03733131": 645,
1662
+ "n03733281": 646,
1663
+ "n03733805": 647,
1664
+ "n03742115": 648,
1665
+ "n03743016": 649,
1666
+ "n03759954": 650,
1667
+ "n03761084": 651,
1668
+ "n03763968": 652,
1669
+ "n03764736": 653,
1670
+ "n03769881": 654,
1671
+ "n03770439": 655,
1672
+ "n03770679": 656,
1673
+ "n03773504": 657,
1674
+ "n03775071": 658,
1675
+ "n03775546": 659,
1676
+ "n03776460": 660,
1677
+ "n03777568": 661,
1678
+ "n03777754": 662,
1679
+ "n03781244": 663,
1680
+ "n03782006": 664,
1681
+ "n03785016": 665,
1682
+ "n03786901": 666,
1683
+ "n03787032": 667,
1684
+ "n03788195": 668,
1685
+ "n03788365": 669,
1686
+ "n03791053": 670,
1687
+ "n03792782": 671,
1688
+ "n03792972": 672,
1689
+ "n03793489": 673,
1690
+ "n03794056": 674,
1691
+ "n03796401": 675,
1692
+ "n03803284": 676,
1693
+ "n03804744": 677,
1694
+ "n03814639": 678,
1695
+ "n03814906": 679,
1696
+ "n03825788": 680,
1697
+ "n03832673": 681,
1698
+ "n03837869": 682,
1699
+ "n03838899": 683,
1700
+ "n03840681": 684,
1701
+ "n03841143": 685,
1702
+ "n03843555": 686,
1703
+ "n03854065": 687,
1704
+ "n03857828": 688,
1705
+ "n03866082": 689,
1706
+ "n03868242": 690,
1707
+ "n03868863": 691,
1708
+ "n03871628": 692,
1709
+ "n03873416": 693,
1710
+ "n03874293": 694,
1711
+ "n03874599": 695,
1712
+ "n03876231": 696,
1713
+ "n03877472": 697,
1714
+ "n03877845": 698,
1715
+ "n03884397": 699,
1716
+ "n03887697": 700,
1717
+ "n03888257": 701,
1718
+ "n03888605": 702,
1719
+ "n03891251": 703,
1720
+ "n03891332": 704,
1721
+ "n03895866": 705,
1722
+ "n03899768": 706,
1723
+ "n03902125": 707,
1724
+ "n03903868": 708,
1725
+ "n03908618": 709,
1726
+ "n03908714": 710,
1727
+ "n03916031": 711,
1728
+ "n03920288": 712,
1729
+ "n03924679": 713,
1730
+ "n03929660": 714,
1731
+ "n03929855": 715,
1732
+ "n03930313": 716,
1733
+ "n03930630": 717,
1734
+ "n03933933": 718,
1735
+ "n03935335": 719,
1736
+ "n03937543": 720,
1737
+ "n03938244": 721,
1738
+ "n03942813": 722,
1739
+ "n03944341": 723,
1740
+ "n03947888": 724,
1741
+ "n03950228": 725,
1742
+ "n03954731": 726,
1743
+ "n03956157": 727,
1744
+ "n03958227": 728,
1745
+ "n03961711": 729,
1746
+ "n03967562": 730,
1747
+ "n03970156": 731,
1748
+ "n03976467": 732,
1749
+ "n03976657": 733,
1750
+ "n03977966": 734,
1751
+ "n03980874": 735,
1752
+ "n03982430": 736,
1753
+ "n03983396": 737,
1754
+ "n03991062": 738,
1755
+ "n03992509": 739,
1756
+ "n03995372": 740,
1757
+ "n03998194": 741,
1758
+ "n04004767": 742,
1759
+ "n04005630": 743,
1760
+ "n04008634": 744,
1761
+ "n04009552": 745,
1762
+ "n04019541": 746,
1763
+ "n04023962": 747,
1764
+ "n04026417": 748,
1765
+ "n04033901": 749,
1766
+ "n04033995": 750,
1767
+ "n04037443": 751,
1768
+ "n04039381": 752,
1769
+ "n04040759": 753,
1770
+ "n04041544": 754,
1771
+ "n04044716": 755,
1772
+ "n04049303": 756,
1773
+ "n04065272": 757,
1774
+ "n04067472": 758,
1775
+ "n04069434": 759,
1776
+ "n04070727": 760,
1777
+ "n04074963": 761,
1778
+ "n04081281": 762,
1779
+ "n04086273": 763,
1780
+ "n04090263": 764,
1781
+ "n04099969": 765,
1782
+ "n04111531": 766,
1783
+ "n04116512": 767,
1784
+ "n04118538": 768,
1785
+ "n04118776": 769,
1786
+ "n04120489": 770,
1787
+ "n04125021": 771,
1788
+ "n04127249": 772,
1789
+ "n04131690": 773,
1790
+ "n04133789": 774,
1791
+ "n04136333": 775,
1792
+ "n04141076": 776,
1793
+ "n04141327": 777,
1794
+ "n04141975": 778,
1795
+ "n04146614": 779,
1796
+ "n04147183": 780,
1797
+ "n04149813": 781,
1798
+ "n04152593": 782,
1799
+ "n04153751": 783,
1800
+ "n04154565": 784,
1801
+ "n04162706": 785,
1802
+ "n04179913": 786,
1803
+ "n04192698": 787,
1804
+ "n04200800": 788,
1805
+ "n04201297": 789,
1806
+ "n04204238": 790,
1807
+ "n04204347": 791,
1808
+ "n04208210": 792,
1809
+ "n04209133": 793,
1810
+ "n04209239": 794,
1811
+ "n04228054": 795,
1812
+ "n04229816": 796,
1813
+ "n04235860": 797,
1814
+ "n04238763": 798,
1815
+ "n04239074": 799,
1816
+ "n04243546": 800,
1817
+ "n04251144": 801,
1818
+ "n04252077": 802,
1819
+ "n04252225": 803,
1820
+ "n04254120": 804,
1821
+ "n04254680": 805,
1822
+ "n04254777": 806,
1823
+ "n04258138": 807,
1824
+ "n04259630": 808,
1825
+ "n04263257": 809,
1826
+ "n04264628": 810,
1827
+ "n04265275": 811,
1828
+ "n04266014": 812,
1829
+ "n04270147": 813,
1830
+ "n04273569": 814,
1831
+ "n04275548": 815,
1832
+ "n04277352": 816,
1833
+ "n04285008": 817,
1834
+ "n04286575": 818,
1835
+ "n04296562": 819,
1836
+ "n04310018": 820,
1837
+ "n04311004": 821,
1838
+ "n04311174": 822,
1839
+ "n04317175": 823,
1840
+ "n04325704": 824,
1841
+ "n04326547": 825,
1842
+ "n04328186": 826,
1843
+ "n04330267": 827,
1844
+ "n04332243": 828,
1845
+ "n04335435": 829,
1846
+ "n04336792": 830,
1847
+ "n04344873": 831,
1848
+ "n04346328": 832,
1849
+ "n04347754": 833,
1850
+ "n04350905": 834,
1851
+ "n04355338": 835,
1852
+ "n04355933": 836,
1853
+ "n04356056": 837,
1854
+ "n04357314": 838,
1855
+ "n04366367": 839,
1856
+ "n04367480": 840,
1857
+ "n04370456": 841,
1858
+ "n04371430": 842,
1859
+ "n04371774": 843,
1860
+ "n04372370": 844,
1861
+ "n04376876": 845,
1862
+ "n04380533": 846,
1863
+ "n04389033": 847,
1864
+ "n04392985": 848,
1865
+ "n04398044": 849,
1866
+ "n04399382": 850,
1867
+ "n04404412": 851,
1868
+ "n04409515": 852,
1869
+ "n04417672": 853,
1870
+ "n04418357": 854,
1871
+ "n04423845": 855,
1872
+ "n04428191": 856,
1873
+ "n04429376": 857,
1874
+ "n04435653": 858,
1875
+ "n04442312": 859,
1876
+ "n04443257": 860,
1877
+ "n04447861": 861,
1878
+ "n04456115": 862,
1879
+ "n04458633": 863,
1880
+ "n04461696": 864,
1881
+ "n04462240": 865,
1882
+ "n04465501": 866,
1883
+ "n04467665": 867,
1884
+ "n04476259": 868,
1885
+ "n04479046": 869,
1886
+ "n04482393": 870,
1887
+ "n04483307": 871,
1888
+ "n04485082": 872,
1889
+ "n04486054": 873,
1890
+ "n04487081": 874,
1891
+ "n04487394": 875,
1892
+ "n04493381": 876,
1893
+ "n04501370": 877,
1894
+ "n04505470": 878,
1895
+ "n04507155": 879,
1896
+ "n04509417": 880,
1897
+ "n04515003": 881,
1898
+ "n04517823": 882,
1899
+ "n04522168": 883,
1900
+ "n04523525": 884,
1901
+ "n04525038": 885,
1902
+ "n04525305": 886,
1903
+ "n04532106": 887,
1904
+ "n04532670": 888,
1905
+ "n04536866": 889,
1906
+ "n04540053": 890,
1907
+ "n04542943": 891,
1908
+ "n04548280": 892,
1909
+ "n04548362": 893,
1910
+ "n04550184": 894,
1911
+ "n04552348": 895,
1912
+ "n04553703": 896,
1913
+ "n04554684": 897,
1914
+ "n04557648": 898,
1915
+ "n04560804": 899,
1916
+ "n04562935": 900,
1917
+ "n04579145": 901,
1918
+ "n04579432": 902,
1919
+ "n04584207": 903,
1920
+ "n04589890": 904,
1921
+ "n04590129": 905,
1922
+ "n04591157": 906,
1923
+ "n04591713": 907,
1924
+ "n04592741": 908,
1925
+ "n04596742": 909,
1926
+ "n04597913": 910,
1927
+ "n04599235": 911,
1928
+ "n04604644": 912,
1929
+ "n04606251": 913,
1930
+ "n04612504": 914,
1931
+ "n04613696": 915,
1932
+ "n06359193": 916,
1933
+ "n06596364": 917,
1934
+ "n06785654": 918,
1935
+ "n06794110": 919,
1936
+ "n06874185": 920,
1937
+ "n07248320": 921,
1938
+ "n07565083": 922,
1939
+ "n07579787": 923,
1940
+ "n07583066": 924,
1941
+ "n07584110": 925,
1942
+ "n07590611": 926,
1943
+ "n07613480": 927,
1944
+ "n07614500": 928,
1945
+ "n07615774": 929,
1946
+ "n07684084": 930,
1947
+ "n07693725": 931,
1948
+ "n07695742": 932,
1949
+ "n07697313": 933,
1950
+ "n07697537": 934,
1951
+ "n07711569": 935,
1952
+ "n07714571": 936,
1953
+ "n07714990": 937,
1954
+ "n07715103": 938,
1955
+ "n07716358": 939,
1956
+ "n07716906": 940,
1957
+ "n07717410": 941,
1958
+ "n07717556": 942,
1959
+ "n07718472": 943,
1960
+ "n07718747": 944,
1961
+ "n07720875": 945,
1962
+ "n07730033": 946,
1963
+ "n07734744": 947,
1964
+ "n07742313": 948,
1965
+ "n07745940": 949,
1966
+ "n07747607": 950,
1967
+ "n07749582": 951,
1968
+ "n07753113": 952,
1969
+ "n07753275": 953,
1970
+ "n07753592": 954,
1971
+ "n07754684": 955,
1972
+ "n07760859": 956,
1973
+ "n07768694": 957,
1974
+ "n07802026": 958,
1975
+ "n07831146": 959,
1976
+ "n07836838": 960,
1977
+ "n07860988": 961,
1978
+ "n07871810": 962,
1979
+ "n07873807": 963,
1980
+ "n07875152": 964,
1981
+ "n07880968": 965,
1982
+ "n07892512": 966,
1983
+ "n07920052": 967,
1984
+ "n07930864": 968,
1985
+ "n07932039": 969,
1986
+ "n09193705": 970,
1987
+ "n09229709": 971,
1988
+ "n09246464": 972,
1989
+ "n09256479": 973,
1990
+ "n09288635": 974,
1991
+ "n09332890": 975,
1992
+ "n09399592": 976,
1993
+ "n09421951": 977,
1994
+ "n09428293": 978,
1995
+ "n09468604": 979,
1996
+ "n09472597": 980,
1997
+ "n09835506": 981,
1998
+ "n10148035": 982,
1999
+ "n10565667": 983,
2000
+ "n11879895": 984,
2001
+ "n11939491": 985,
2002
+ "n12057211": 986,
2003
+ "n12144580": 987,
2004
+ "n12267677": 988,
2005
+ "n12620546": 989,
2006
+ "n12768682": 990,
2007
+ "n12985857": 991,
2008
+ "n12998815": 992,
2009
+ "n13037406": 993,
2010
+ "n13040303": 994,
2011
+ "n13044778": 995,
2012
+ "n13052670": 996,
2013
+ "n13054560": 997,
2014
+ "n13133613": 998,
2015
+ "n15075141": 999
2016
+ },
2017
+ "layer_norm_eps": 1e-12,
2018
+ "model_type": "vit",
2019
+ "num_attention_heads": 6,
2020
+ "num_channels": 3,
2021
+ "num_hidden_layers": 12,
2022
+ "patch_size": 16,
2023
+ "qkv_bias": true,
2024
+ "torch_dtype": "float32",
2025
+ "transformers_version": "4.41.2"
2026
+ }
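Note on the replacement config: this is a stock Hugging Face transformers ViT config ("model_type": "vit"; hidden_size 384, 6 attention heads, 12 layers, patch 16 — i.e. ViT-Small/16 at 224px), with the 1000 ImageNet-1k WordNet synset ids inlined as id2label/label2id. With "auto_map" gone and vision_transformer.py deleted below, the checkpoint should load through the built-in ViT classes with no trust_remote_code. A minimal sketch; the repo id here is taken from the deleted config's "_name_or_path" and may not match this repository exactly:

from transformers import ViTForImageClassification

model = ViTForImageClassification.from_pretrained("magicslabnu/OutEffHop_vit_small_patch16_224")
assert model.config.hidden_size == 384 and model.config.num_attention_heads == 6
print(model.config.id2label[0])  # "n01440764" (WordNet synset id for class 0, tench)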
pytorch_model.bin → model.safetensors RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:8a99d3d0eab5d260edfa1246d753fa4f7dabf308f32b856401f0968d7bd6d013
- size 88204542

  version https://git-lfs.github.com/spec/v1
+ oid sha256:4372789ccc964b99b87bb8b4b39a37d57bce55360183a3208d7d37b48b40c309
+ size 88225584
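Note: only the Git LFS pointer changes here — the weights move from a pickled pytorch_model.bin to model.safetensors (the small size difference is plausibly just the change in serialization overhead). A sketch of reading the new file directly, assuming the safetensors package and a local copy of the file:

from safetensors.torch import load_file

state_dict = load_file("model.safetensors")  # zero-copy load, no pickle code execution
print(len(state_dict), "tensors")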
preprocessor_config.json ADDED
@@ -0,0 +1,36 @@
1
+ {
2
+ "_valid_processor_keys": [
3
+ "images",
4
+ "do_resize",
5
+ "size",
6
+ "resample",
7
+ "do_rescale",
8
+ "rescale_factor",
9
+ "do_normalize",
10
+ "image_mean",
11
+ "image_std",
12
+ "return_tensors",
13
+ "data_format",
14
+ "input_data_format"
15
+ ],
16
+ "do_normalize": true,
17
+ "do_rescale": true,
18
+ "do_resize": true,
19
+ "image_mean": [
20
+ 0.5,
21
+ 0.5,
22
+ 0.5
23
+ ],
24
+ "image_processor_type": "ViTImageProcessor",
25
+ "image_std": [
26
+ 0.5,
27
+ 0.5,
28
+ 0.5
29
+ ],
30
+ "resample": 2,
31
+ "rescale_factor": 0.00392156862745098,
32
+ "size": {
33
+ "height": 224,
34
+ "width": 224
35
+ }
36
+ }
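The added preprocessor pins inference-time preprocessing to a 224x224 resize with PIL bilinear filtering (resample=2), a rescale by 1/255 (0.00392156862745098), and per-channel normalization with mean and std 0.5. A minimal usage sketch; the image path is a placeholder:

# Minimal sketch of the preprocessing defined above.
from PIL import Image
from transformers import ViTImageProcessor

processor = ViTImageProcessor(
    size={"height": 224, "width": 224},
    resample=2,                  # PIL.Image.BILINEAR
    rescale_factor=1 / 255,      # == 0.00392156862745098
    image_mean=[0.5, 0.5, 0.5],
    image_std=[0.5, 0.5, 0.5],
)
inputs = processor(images=Image.open("example.jpg"), return_tensors="pt")
print(inputs["pixel_values"].shape)  # torch.Size([1, 3, 224, 224])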
vision_transformer.py DELETED
@@ -1,1853 +0,0 @@
1
- """ Vision Transformer (ViT) in PyTorch
2
-
3
- A PyTorch implementation of Vision Transformers, as described in:
4
-
5
- 'An Image Is Worth 16 x 16 Words: Transformers for Image Recognition at Scale'
6
- - https://arxiv.org/abs/2010.11929
7
-
8
- `How to train your ViT? Data, Augmentation, and Regularization in Vision Transformers`
9
- - https://arxiv.org/abs/2106.10270
10
-
11
- `FlexiViT: One Model for All Patch Sizes`
12
- - https://arxiv.org/abs/2212.08013
13
-
14
- The official jax code is released and available at
15
- * https://github.com/google-research/vision_transformer
16
- * https://github.com/google-research/big_vision
17
-
18
- Acknowledgments:
19
- * The paper authors for releasing code and weights, thanks!
20
- * I fixed my class token impl based on Phil Wang's https://github.com/lucidrains/vit-pytorch
21
- * Simple transformer style inspired by Andrej Karpathy's https://github.com/karpathy/minGPT
22
- * Bert reference code checks against Huggingface Transformers and Tensorflow Bert
23
-
24
- Hacked together by / Copyright 2020, Ross Wightman
25
- """
26
- import logging
27
- import math
28
- from collections import OrderedDict
29
- from functools import partial
30
- from typing import Optional, List, Tuple
31
-
32
- import torch
33
- import torch.nn as nn
34
- import torch.nn.functional as F
35
- import torch.utils.checkpoint
36
-
37
- from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD, \
38
- OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
39
- from timm.layers import PatchEmbed, Mlp, DropPath, trunc_normal_, lecun_normal_, resample_patch_embed, \
40
- resample_abs_pos_embed
41
- from timm.models._builder import build_model_with_cfg
42
- from timm.models._manipulate import named_apply, checkpoint_seq, adapt_input_conv
43
- from timm.models._pretrained import generate_default_cfgs
44
- from timm.models._registry import register_model
48
- import argparse
49
- import json
51
- import os
52
-
53
- import numpy as np
60
- from torch.jit import Final
61
- from quantization.utils import BaseEnumOptions
62
- from transformers_language.models.softmax import clipped_softmax, clipped_softmax1
63
-
64
- __all__ = ['VisionTransformer'] # model_registry will add each entrypoint fn to this
65
-
66
-
67
- _logger = logging.getLogger(__name__)
68
-
69
-
71
- # Set to True if exporting a model with Same padding via ONNX
72
- _EXPORTABLE = False
73
-
74
- # Set to True if wanting to use torch.jit.script on a model
75
- _SCRIPTABLE = False
76
-
77
-
78
- # use torch.scaled_dot_product_attention where possible
79
- _HAS_FUSED_ATTN = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
80
- if 'TIMM_FUSED_ATTN' in os.environ:
81
- _USE_FUSED_ATTN = int(os.environ['TIMM_FUSED_ATTN'])
82
- else:
83
- _USE_FUSED_ATTN = 1 # 0 == off, 1 == on (for tested use), 2 == on (for experimental use)
84
-
85
- def logit(p, eps=1e-16):
86
- p = np.clip(p, eps, 1 - eps)
87
- return -np.log(1 / p - 1)
88
-
89
-
90
- class AttentionGateType(BaseEnumOptions):
91
- none = 0
92
- unconditional_per_head = 1
93
- conditional_per_head = 2
94
- conditional_per_token = 3
95
-
96
- def use_fused_attn(experimental: bool = False) -> bool:
97
- # NOTE: ONNX export cannot handle F.scaled_dot_product_attention as of pytorch 2.0
98
- if not _HAS_FUSED_ATTN or _EXPORTABLE:
99
- return False
100
- if experimental:
101
- return _USE_FUSED_ATTN > 1
102
- return _USE_FUSED_ATTN > 0
103
-
104
- def scaled_dot_product_attention(query, key, value, softmax_fn, attn_mask=None, dropout_p=0.0, is_causal=False, scale=None) -> torch.Tensor:
105
-     # Reference (non-fused) implementation equivalent to
106
-     # F.scaled_dot_product_attention, generalized to take a pluggable softmax_fn
107
- L, S = query.size(-2), key.size(-2)
108
- scale_factor = 1 / math.sqrt(query.size(-1)) if scale is None else scale
109
- attn_bias = torch.zeros(L, S, dtype=query.dtype, device=query.device)
110
- if is_causal:
111
- assert attn_mask is None
112
- temp_mask = torch.ones(L, S, dtype=torch.bool).tril(diagonal=0)
113
- attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf"))
114
-         attn_bias = attn_bias.to(query.dtype)  # .to() is not in-place; keep the cast
115
-
116
- if attn_mask is not None:
117
- if attn_mask.dtype == torch.bool:
118
- attn_mask.masked_fill_(attn_mask.logical_not(), float("-inf"))
119
- else:
120
- attn_bias += attn_mask
121
- attn_weight = query @ key.transpose(-2, -1) * scale_factor
122
- attn_weight += attn_bias
123
- attn_weight = softmax_fn(attn_weight, dim=-1)
124
-     attn_weight = torch.dropout(attn_weight, dropout_p, train=True)  # callers pass dropout_p=0. when not training
125
- return attn_weight @ value
126
-
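The function above is the reference (non-fused) scaled dot-product attention, generalized so the softmax can be swapped out. A tiny usage sketch; softmax1 here is an illustrative stand-in for the clipped_softmax variants imported at the top of this file (it adds an implicit zero logit to the softmax denominator), not the repo's actual implementation:

# Usage sketch for the manual scaled_dot_product_attention above.
import torch
import torch.nn.functional as F

def softmax1(x, dim=-1):
    # exp(x_i) / (1 + sum_j exp(x_j)), computed with a max-shift for stability
    m = x.amax(dim=dim, keepdim=True)
    ex = torch.exp(x - m)
    return ex / (ex.sum(dim=dim, keepdim=True) + torch.exp(-m))

q = k = v = torch.randn(2, 6, 197, 64)  # (batch, heads, tokens, head_dim)
out = scaled_dot_product_attention(q, k, v, softmax_fn=F.softmax)
alt = scaled_dot_product_attention(q, k, v, softmax_fn=softmax1)
print(out.shape, alt.shape)  # torch.Size([2, 6, 197, 64]) twice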
127
- class Attention(nn.Module):
128
- fused_attn: Final[bool]
129
-
130
- def __init__(
131
- self,
132
- dim: int,
133
- num_heads: int = 8,
134
- qkv_bias: bool = False,
135
- qk_norm: bool = False,
136
- attn_drop: float = 0.,
137
- proj_drop: float = 0.,
138
- norm_layer: nn.Module = nn.LayerNorm,
139
- softmax_fn=torch.nn.functional.softmax,
140
- gamma=None,
141
- ssm_eps=None,
142
- tau=None,
143
- skip_attn=False,
144
- attn_gate_type=AttentionGateType.none,
145
- attn_gate_init=None,
146
- attn_gate_mlp=False,
147
- attn_gate_mlp2=False,
148
- attn_gate_linear_all_features=False,
149
- fine_tuning=False,
150
- max_seq_length=None,
151
-
152
- ) -> None:
153
- super().__init__()
154
- assert dim % num_heads == 0, 'dim should be divisible by num_heads'
155
- self.num_attention_heads = num_heads
156
- self.attention_head_size = dim // num_heads
157
- self.scale = self.attention_head_size ** -0.5
158
- self.fused_attn = use_fused_attn()
159
- self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
160
- self.q_norm = norm_layer(self.attention_head_size) if qk_norm else nn.Identity()
161
- self.k_norm = norm_layer(self.attention_head_size) if qk_norm else nn.Identity()
162
- self.attn_drop = nn.Dropout(attn_drop)
163
- self.proj = nn.Linear(dim, dim)
164
- self.proj_drop = nn.Dropout(proj_drop)
165
-
166
- self.attn_scores = nn.Identity() # before attention mask
167
- self.attn_probs_before_dropout = nn.Identity()
168
- self.attn_probs_after_dropout = nn.Identity()
169
-
170
- self.gamma = gamma
171
- self.ssm_eps = ssm_eps
172
- self.tau = tau
173
- self.max_seq_length = max_seq_length
174
-
175
- # define softmax function
176
-
177
- self.softmax_fn = softmax_fn
178
-
179
- self.skip_attn = skip_attn
180
-
181
- # attention gating
182
- self.last_gate_avg_prob = None
183
- self.last_gate_all_probs = None
184
-
185
- self.attn_gate_type = attn_gate_type
186
- self.attn_gate_init = attn_gate_init
187
- self.attn_gate_mlp = attn_gate_mlp
188
- self.attn_gate_mlp2 = attn_gate_mlp2
189
- self.attn_gate_linear_all_features = attn_gate_linear_all_features
190
-
191
- self.alpha = None
192
- self.gate_fn = torch.sigmoid
193
-         self.pooling_fn = partial(torch.mean, dim=1, keepdim=True)  # torch spells it keepdim, not numpy's keepdims
194
-
195
- self.fine_tuning = fine_tuning
196
-
197
- # gate scaling factor
198
- self.gate_scaling_factor = 1.0
199
- if self.fine_tuning and self.attn_gate_init is not None:
200
- self.gate_scaling_factor = 1.0 / self.attn_gate_init
201
-
202
- # define gate
203
- if self.attn_gate_type == AttentionGateType.unconditional_per_head:
204
- init_alpha = torch.zeros(size=(self.num_attention_heads,))
205
- self.alpha = nn.Parameter(init_alpha, requires_grad=True)
206
-
207
- elif self.attn_gate_type in (
208
- AttentionGateType.conditional_per_head,
209
- AttentionGateType.conditional_per_token,
210
- ):
211
- if self.attn_gate_linear_all_features:
212
-             self.alpha = nn.Linear(self.num_attention_heads * self.attention_head_size, self.num_attention_heads, bias=True)  # input is all head features concatenated
213
-
214
- else: # separate predictors for each head
215
- module_list = []
216
- for _ in range(self.num_attention_heads):
217
- if self.attn_gate_mlp:
218
- fc = nn.Sequential(
219
- nn.Linear(
220
- self.attention_head_size, self.attention_head_size // 4, bias=True
221
- ),
222
- nn.ReLU(),
223
- nn.Linear(self.attention_head_size // 4, 1, bias=True),
224
- )
225
- elif self.attn_gate_mlp2:
226
- fc = nn.Sequential(
227
- nn.Linear(
228
- self.attention_head_size, self.attention_head_size, bias=True
229
- ),
230
- nn.ReLU(),
231
- nn.Linear(self.attention_head_size, 1, bias=True),
232
- )
233
- else:
234
- fc = nn.Linear(self.attention_head_size, 1, bias=True)
235
-
236
- if self.attn_gate_init is not None:
237
- init_bias = logit(self.attn_gate_init)
238
- torch.nn.init.constant_(fc.bias, init_bias)
239
-
240
- if self.fine_tuning:
241
-                     # init to very small values
242
- torch.nn.init.normal_(fc.weight, mean=0.0, std=0.01)
243
-
244
- module_list.append(fc)
245
- self.alpha = nn.ModuleList(module_list)
246
-
247
- def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
248
- new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
249
- x = x.view(new_x_shape)
250
- return x.permute(0, 2, 1, 3)
251
-
252
- def forward(self, x: torch.Tensor) -> torch.Tensor:
253
- hidden_states = x
254
- B, N, C = x.shape
255
- qkv = self.qkv(x).reshape(B, N, 3, self.num_attention_heads, self.attention_head_size).permute(2, 0, 3, 1, 4)
256
- q, k, v = qkv.unbind(0)
257
- q, k = self.q_norm(q), self.k_norm(k)
258
-
259
- if self.fused_attn:
260
- context_layer = scaled_dot_product_attention(
261
- q, k, v, self.softmax_fn,
262
- dropout_p=self.attn_drop.p if self.training else 0.,
263
- )
264
- else:
265
- q = q * self.scale
266
- attn = q @ k.transpose(-2, -1)
267
-
268
- attn = self.softmax_fn(attn, dim=-1)
269
- attn = self.attn_probs_before_dropout(attn)
270
- attn = self.attn_drop(attn)
271
- attn = self.attn_probs_after_dropout(attn)
272
- context_layer = attn @ v
273
-
274
-
275
- # *** Gating ***
276
- if self.attn_gate_type == AttentionGateType.unconditional_per_head:
277
- gate = self.gate_fn(self.alpha) # (H,)
278
- context_layer *= gate.view(-1, 1, 1) # (B, H, T, d_head)
279
-
280
- self.last_gate_avg_prob = gate.view(-1)
281
-
282
- elif self.attn_gate_type in (
283
- AttentionGateType.conditional_per_head,
284
- AttentionGateType.conditional_per_token,
285
- ):
286
-
287
- x = hidden_states
288
-
289
- if self.attn_gate_linear_all_features: # assume per_token
290
- alpha = self.alpha(x) # (B, T, H)
291
- gate = self.gate_fn(alpha)
292
- gate = gate.permute(0, 2, 1).contiguous() # (B, H, T)
293
- gate = gate.unsqueeze(3) # (B, H, T, 1)
294
-
295
- else:
296
- x = self.transpose_for_scores(x) # (B, H, T, d_head)
297
-
298
- alpha = []
299
- for head_idx in range(self.num_attention_heads):
300
- x_head = x[:, head_idx, ...] # (B, T, d_head)
301
- fc_head = self.alpha[head_idx]
302
- alpha_head = fc_head(x_head) # (B, T, 1)
303
- if self.attn_gate_type == AttentionGateType.conditional_per_head:
304
- alpha_head = self.pooling_fn(alpha_head) # (B, 1, 1)
305
- alpha.append(alpha_head)
306
- alpha = torch.stack(alpha, dim=1) # (B, H, *, 1)
307
- gate = self.gate_fn(alpha)
308
-
309
- context_layer *= gate * self.gate_scaling_factor
310
-
311
- self.last_gate_all_probs = gate # all gates to see the distributions
312
- avg_gate = gate.mean(dim=0)
313
- self.last_gate_avg_prob = avg_gate.view(self.num_attention_heads, -1).mean(dim=1)
314
-
315
-
316
- x = context_layer.transpose(1, 2).reshape(B, N, C)
317
- x = self.proj(x)
318
- x = self.proj_drop(x)
319
- return x
320
-
321
-
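This Attention module augments the standard timm block with a per-head output gate (a sigmoid of a learned alpha, optionally conditioned on the token features). A hedged instantiation sketch with ViT-Small shapes, assuming the module's own imports resolve:

# Hedged sketch: Attention with unconditional per-head gating.
import torch

attn = Attention(
    dim=384,               # ViT-Small embed dim
    num_heads=6,
    qkv_bias=True,
    attn_gate_type=AttentionGateType.unconditional_per_head,
)
tokens = torch.randn(2, 197, 384)  # (batch, 196 patches + cls token, dim)
out = attn(tokens)
print(out.shape, attn.last_gate_avg_prob)  # gates start at sigmoid(0) = 0.5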
322
- class LayerScale(nn.Module):
323
- def __init__(self, dim, init_values=1e-5, inplace=False):
324
- super().__init__()
325
- self.inplace = inplace
326
- self.gamma = nn.Parameter(init_values * torch.ones(dim))
327
-
328
- def forward(self, x):
329
- return x.mul_(self.gamma) if self.inplace else x * self.gamma
330
-
331
-
332
- class Block(nn.Module):
333
-
334
- def __init__(
335
- self,
336
- dim,
337
- num_heads,
338
- mlp_ratio=4.,
339
- qkv_bias=False,
340
- drop=0.,
341
- attn_drop=0.,
342
- init_values=None,
343
- drop_path=0.,
344
- act_layer=nn.GELU,
345
- norm_layer=nn.LayerNorm
346
- ):
347
- super().__init__()
348
- self.norm1 = norm_layer(dim)
349
- self.attn = Attention(dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
350
- self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
351
- # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
352
- self.drop_path1 = DropPath(drop_path) if drop_path > 0. else nn.Identity()
353
-
354
- self.norm2 = norm_layer(dim)
355
- self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop)
356
- self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
357
- self.drop_path2 = DropPath(drop_path) if drop_path > 0. else nn.Identity()
358
-
359
- def forward(self, x):
360
- x = x + self.drop_path1(self.ls1(self.attn(self.norm1(x))))
361
- x = x + self.drop_path2(self.ls2(self.mlp(self.norm2(x))))
362
- return x
363
-
364
-
365
- class ResPostBlock(nn.Module):
366
-
367
- def __init__(
368
- self,
369
- dim,
370
- num_heads,
371
- mlp_ratio=4.,
372
- qkv_bias=False,
373
- drop=0.,
374
- attn_drop=0.,
375
- init_values=None,
376
- drop_path=0.,
377
- act_layer=nn.GELU,
378
- norm_layer=nn.LayerNorm
379
- ):
380
- super().__init__()
381
- self.init_values = init_values
382
-
383
- self.attn = Attention(dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
384
- self.norm1 = norm_layer(dim)
385
- self.drop_path1 = DropPath(drop_path) if drop_path > 0. else nn.Identity()
386
-
387
- self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop)
388
- self.norm2 = norm_layer(dim)
389
- self.drop_path2 = DropPath(drop_path) if drop_path > 0. else nn.Identity()
390
-
391
- self.init_weights()
392
-
393
- def init_weights(self):
394
- # NOTE this init overrides that base model init with specific changes for the block type
395
- if self.init_values is not None:
396
- nn.init.constant_(self.norm1.weight, self.init_values)
397
- nn.init.constant_(self.norm2.weight, self.init_values)
398
-
399
- def forward(self, x):
400
- x = x + self.drop_path1(self.norm1(self.attn(x)))
401
- x = x + self.drop_path2(self.norm2(self.mlp(x)))
402
- return x
403
-
404
-
405
- class ParallelBlock(nn.Module):
406
-
407
- def __init__(
408
- self,
409
- dim,
410
- num_heads,
411
- num_parallel=2,
412
- mlp_ratio=4.,
413
- qkv_bias=False,
414
- init_values=None,
415
- drop=0.,
416
- attn_drop=0.,
417
- drop_path=0.,
418
- act_layer=nn.GELU,
419
- norm_layer=nn.LayerNorm
420
- ):
421
- super().__init__()
422
- self.num_parallel = num_parallel
423
- self.attns = nn.ModuleList()
424
- self.ffns = nn.ModuleList()
425
- for _ in range(num_parallel):
426
- self.attns.append(nn.Sequential(OrderedDict([
427
- ('norm', norm_layer(dim)),
428
- ('attn', Attention(dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)),
429
- ('ls', LayerScale(dim, init_values=init_values) if init_values else nn.Identity()),
430
- ('drop_path', DropPath(drop_path) if drop_path > 0. else nn.Identity())
431
- ])))
432
- self.ffns.append(nn.Sequential(OrderedDict([
433
- ('norm', norm_layer(dim)),
434
- ('mlp', Mlp(dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop)),
435
- ('ls', LayerScale(dim, init_values=init_values) if init_values else nn.Identity()),
436
- ('drop_path', DropPath(drop_path) if drop_path > 0. else nn.Identity())
437
- ])))
438
-
439
- def _forward_jit(self, x):
440
- x = x + torch.stack([attn(x) for attn in self.attns]).sum(dim=0)
441
- x = x + torch.stack([ffn(x) for ffn in self.ffns]).sum(dim=0)
442
- return x
443
-
444
- @torch.jit.ignore
445
- def _forward(self, x):
446
- x = x + sum(attn(x) for attn in self.attns)
447
- x = x + sum(ffn(x) for ffn in self.ffns)
448
- return x
449
-
450
- def forward(self, x):
451
- if torch.jit.is_scripting() or torch.jit.is_tracing():
452
- return self._forward_jit(x)
453
- else:
454
- return self._forward(x)
455
-
456
-
457
- class VisionTransformer(nn.Module):
458
- """ Vision Transformer
459
-
460
-     A PyTorch impl of: `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale`
461
- - https://arxiv.org/abs/2010.11929
462
- """
463
-
464
- def __init__(
465
- self,
466
- img_size=224,
467
- patch_size=16,
468
- in_chans=3,
469
- num_classes=1000,
470
- global_pool='token',
471
- embed_dim=768,
472
- depth=12,
473
- num_heads=12,
474
- mlp_ratio=4.,
475
- qkv_bias=True,
476
- init_values=None,
477
- class_token=True,
478
- no_embed_class=False,
479
- pre_norm=False,
480
- fc_norm=None,
481
- drop_rate=0.,
482
- attn_drop_rate=0.,
483
- drop_path_rate=0.,
484
- weight_init='',
485
- embed_layer=PatchEmbed,
486
- norm_layer=None,
487
- act_layer=None,
488
- block_fn=Block,
489
- ):
490
- """
491
- Args:
492
- img_size (int, tuple): input image size
493
- patch_size (int, tuple): patch size
494
- in_chans (int): number of input channels
495
- num_classes (int): number of classes for classification head
496
- global_pool (str): type of global pooling for final sequence (default: 'token')
497
- embed_dim (int): embedding dimension
498
- depth (int): depth of transformer
499
- num_heads (int): number of attention heads
500
-             mlp_ratio (float): ratio of mlp hidden dim to embedding dim
501
- qkv_bias (bool): enable bias for qkv if True
502
- init_values: (float): layer-scale init values
503
- class_token (bool): use class token
504
-             fc_norm (Optional[bool]): apply a pre-classifier norm after pooling; if None, enabled when global_pool == 'avg' (default: None)
505
- drop_rate (float): dropout rate
506
- attn_drop_rate (float): attention dropout rate
507
- drop_path_rate (float): stochastic depth rate
508
- weight_init (str): weight init scheme
509
- embed_layer (nn.Module): patch embedding layer
510
- norm_layer: (nn.Module): normalization layer
511
- act_layer: (nn.Module): MLP activation layer
512
- """
513
- super().__init__()
514
- assert global_pool in ('', 'avg', 'token')
515
- assert class_token or global_pool != 'token'
516
- use_fc_norm = global_pool == 'avg' if fc_norm is None else fc_norm
517
- norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
518
- act_layer = act_layer or nn.GELU
519
-
520
- self.num_classes = num_classes
521
- self.global_pool = global_pool
522
- self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
523
- self.num_prefix_tokens = 1 if class_token else 0
524
- self.no_embed_class = no_embed_class
525
- self.grad_checkpointing = False
526
-
527
- self.patch_embed = embed_layer(
528
- img_size=img_size,
529
- patch_size=patch_size,
530
- in_chans=in_chans,
531
- embed_dim=embed_dim,
532
- bias=not pre_norm, # disable bias if pre-norm is used (e.g. CLIP)
533
- )
534
- num_patches = self.patch_embed.num_patches
535
-
536
- self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) if class_token else None
537
- embed_len = num_patches if no_embed_class else num_patches + self.num_prefix_tokens
538
- self.pos_embed = nn.Parameter(torch.randn(1, embed_len, embed_dim) * .02)
539
- self.pos_drop = nn.Dropout(p=drop_rate)
540
- self.norm_pre = norm_layer(embed_dim) if pre_norm else nn.Identity()
541
-
542
- dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule
543
- self.blocks = nn.Sequential(*[
544
- block_fn(
545
- dim=embed_dim,
546
- num_heads=num_heads,
547
- mlp_ratio=mlp_ratio,
548
- qkv_bias=qkv_bias,
549
- init_values=init_values,
550
- drop=drop_rate,
551
- attn_drop=attn_drop_rate,
552
- drop_path=dpr[i],
553
- norm_layer=norm_layer,
554
- act_layer=act_layer
555
- )
556
- for i in range(depth)])
557
- self.norm = norm_layer(embed_dim) if not use_fc_norm else nn.Identity()
558
-
559
- # Classifier Head
560
- self.fc_norm = norm_layer(embed_dim) if use_fc_norm else nn.Identity()
561
- self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
562
-
563
- if weight_init != 'skip':
564
- self.init_weights(weight_init)
565
-
566
- def init_weights(self, mode=''):
567
- assert mode in ('jax', 'jax_nlhb', 'moco', '')
568
- head_bias = -math.log(self.num_classes) if 'nlhb' in mode else 0.
569
- trunc_normal_(self.pos_embed, std=.02)
570
- if self.cls_token is not None:
571
- nn.init.normal_(self.cls_token, std=1e-6)
572
- named_apply(get_init_weights_vit(mode, head_bias), self)
573
-
574
- def _init_weights(self, m):
575
- # this fn left here for compat with downstream users
576
- init_weights_vit_timm(m)
577
-
578
- @torch.jit.ignore()
579
- def load_pretrained(self, checkpoint_path, prefix=''):
580
- _load_weights(self, checkpoint_path, prefix)
581
-
582
- @torch.jit.ignore
583
- def no_weight_decay(self):
584
- return {'pos_embed', 'cls_token', 'dist_token'}
585
-
586
- @torch.jit.ignore
587
- def group_matcher(self, coarse=False):
588
- return dict(
589
- stem=r'^cls_token|pos_embed|patch_embed', # stem and embed
590
- blocks=[(r'^blocks\.(\d+)', None), (r'^norm', (99999,))]
591
- )
592
-
593
- @torch.jit.ignore
594
- def set_grad_checkpointing(self, enable=True):
595
- self.grad_checkpointing = enable
596
-
597
- @torch.jit.ignore
598
- def get_classifier(self):
599
- return self.head
600
-
601
- def reset_classifier(self, num_classes: int, global_pool=None):
602
- self.num_classes = num_classes
603
- if global_pool is not None:
604
- assert global_pool in ('', 'avg', 'token')
605
- self.global_pool = global_pool
606
- self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
607
-
608
- def _pos_embed(self, x):
609
- if self.no_embed_class:
610
- # deit-3, updated JAX (big vision)
611
- # position embedding does not overlap with class token, add then concat
612
- x = x + self.pos_embed
613
- if self.cls_token is not None:
614
- x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
615
- else:
616
- # original timm, JAX, and deit vit impl
617
- # pos_embed has entry for class token, concat then add
618
- if self.cls_token is not None:
619
- x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
620
- x = x + self.pos_embed
621
- return self.pos_drop(x)
622
-
623
- def forward_features(self, x):
624
- x = self.patch_embed(x)
625
- x = self._pos_embed(x)
626
- x = self.norm_pre(x)
627
- if self.grad_checkpointing and not torch.jit.is_scripting():
628
- x = checkpoint_seq(self.blocks, x)
629
- else:
630
- x = self.blocks(x)
631
- x = self.norm(x)
632
- return x
633
-
634
- def forward_head(self, x, pre_logits: bool = False):
635
- if self.global_pool:
636
- x = x[:, self.num_prefix_tokens:].mean(dim=1) if self.global_pool == 'avg' else x[:, 0]
637
- x = self.fc_norm(x)
638
- return x if pre_logits else self.head(x)
639
-
640
- def forward(self, x):
641
- x = self.forward_features(x)
642
- x = self.forward_head(x)
643
- return x
644
-
645
-
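Putting the pieces together, a hedged end-to-end sketch of the class above with the ViT-S/16 hyperparameters this repo's config describes (384-dim, 6 heads, 12 layers):

# Hedged sketch: constructing and running the VisionTransformer above.
import torch

model = VisionTransformer(
    img_size=224, patch_size=16, embed_dim=384,
    depth=12, num_heads=6, num_classes=1000,
)
x = torch.randn(1, 3, 224, 224)
print(model(x).shape)                   # torch.Size([1, 1000])
print(model.forward_features(x).shape)  # torch.Size([1, 197, 384])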
646
- def init_weights_vit_timm(module: nn.Module, name: str = ''):
647
- """ ViT weight initialization, original timm impl (for reproducibility) """
648
- if isinstance(module, nn.Linear):
649
- trunc_normal_(module.weight, std=.02)
650
- if module.bias is not None:
651
- nn.init.zeros_(module.bias)
652
- elif hasattr(module, 'init_weights'):
653
- module.init_weights()
654
-
655
-
656
- def init_weights_vit_jax(module: nn.Module, name: str = '', head_bias: float = 0.):
657
- """ ViT weight initialization, matching JAX (Flax) impl """
658
- if isinstance(module, nn.Linear):
659
- if name.startswith('head'):
660
- nn.init.zeros_(module.weight)
661
- nn.init.constant_(module.bias, head_bias)
662
- else:
663
- nn.init.xavier_uniform_(module.weight)
664
- if module.bias is not None:
665
- nn.init.normal_(module.bias, std=1e-6) if 'mlp' in name else nn.init.zeros_(module.bias)
666
- elif isinstance(module, nn.Conv2d):
667
- lecun_normal_(module.weight)
668
- if module.bias is not None:
669
- nn.init.zeros_(module.bias)
670
- elif hasattr(module, 'init_weights'):
671
- module.init_weights()
672
-
673
-
674
- def init_weights_vit_moco(module: nn.Module, name: str = ''):
675
- """ ViT weight initialization, matching moco-v3 impl minus fixed PatchEmbed """
676
- if isinstance(module, nn.Linear):
677
- if 'qkv' in name:
678
- # treat the weights of Q, K, V separately
679
- val = math.sqrt(6. / float(module.weight.shape[0] // 3 + module.weight.shape[1]))
680
- nn.init.uniform_(module.weight, -val, val)
681
- else:
682
- nn.init.xavier_uniform_(module.weight)
683
- if module.bias is not None:
684
- nn.init.zeros_(module.bias)
685
- elif hasattr(module, 'init_weights'):
686
- module.init_weights()
687
-
688
-
689
- def get_init_weights_vit(mode='jax', head_bias: float = 0.):
690
- if 'jax' in mode:
691
- return partial(init_weights_vit_jax, head_bias=head_bias)
692
- elif 'moco' in mode:
693
- return init_weights_vit_moco
694
- else:
695
- return init_weights_vit_timm
696
-
697
-
698
- def resize_pos_embed(
699
- posemb,
700
- posemb_new,
701
- num_prefix_tokens=1,
702
- gs_new=(),
703
- interpolation='bicubic',
704
- antialias=False,
705
- ):
706
- """ Rescale the grid of position embeddings when loading from state_dict.
707
-
708
- *DEPRECATED* This function is being deprecated in favour of resample_abs_pos_embed
709
-
710
- Adapted from:
711
- https://github.com/google-research/vision_transformer/blob/00883dd691c63a6830751563748663526e811cee/vit_jax/checkpoint.py#L224
712
- """
713
- ntok_new = posemb_new.shape[1]
714
- if num_prefix_tokens:
715
- posemb_prefix, posemb_grid = posemb[:, :num_prefix_tokens], posemb[0, num_prefix_tokens:]
716
- ntok_new -= num_prefix_tokens
717
- else:
718
- posemb_prefix, posemb_grid = posemb[:, :0], posemb[0]
719
- gs_old = int(math.sqrt(len(posemb_grid)))
720
- if not len(gs_new): # backwards compatibility
721
- gs_new = [int(math.sqrt(ntok_new))] * 2
722
- assert len(gs_new) >= 2
723
- _logger.info(f'Resized position embedding: {posemb.shape} ({[gs_old, gs_old]}) to {posemb_new.shape} ({gs_new}).')
724
- posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2)
725
- posemb_grid = F.interpolate(posemb_grid, size=gs_new, mode=interpolation, antialias=antialias, align_corners=False)
726
- posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_new[0] * gs_new[1], -1)
727
- posemb = torch.cat([posemb_prefix, posemb_grid], dim=1)
728
- return posemb
729
-
730
-
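The deprecated helper above interpolates the 2D grid of position embeddings while carrying the class-token entry through unchanged. An illustrative sketch, resizing a 14x14 grid (224px / patch 16) to 24x24 (384px / patch 16):

# Illustrative sketch of resize_pos_embed's grid interpolation.
import torch

old = torch.randn(1, 1 + 14 * 14, 384)     # cls token + 196 positions
target = torch.zeros(1, 1 + 24 * 24, 384)  # only the target shape is used
resized = resize_pos_embed(old, target, num_prefix_tokens=1, gs_new=(24, 24))
print(resized.shape)  # torch.Size([1, 577, 384])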
731
- @torch.no_grad()
732
- def _load_weights(model: VisionTransformer, checkpoint_path: str, prefix: str = ''):
733
- """ Load weights from .npz checkpoints for official Google Brain Flax implementation
734
- """
735
- import numpy as np
736
-
737
- def _n2p(w, t=True):
738
- if w.ndim == 4 and w.shape[0] == w.shape[1] == w.shape[2] == 1:
739
- w = w.flatten()
740
- if t:
741
- if w.ndim == 4:
742
- w = w.transpose([3, 2, 0, 1])
743
- elif w.ndim == 3:
744
- w = w.transpose([2, 0, 1])
745
- elif w.ndim == 2:
746
- w = w.transpose([1, 0])
747
- return torch.from_numpy(w)
748
-
749
- w = np.load(checkpoint_path)
750
- interpolation = 'bilinear'
751
- antialias = False
752
- big_vision = False
753
- if not prefix:
754
- if 'opt/target/embedding/kernel' in w:
755
- prefix = 'opt/target/'
756
- elif 'params/embedding/kernel' in w:
757
- prefix = 'params/'
758
- big_vision = True
759
-
760
- if hasattr(model.patch_embed, 'backbone'):
761
- # hybrid
762
- backbone = model.patch_embed.backbone
763
- stem_only = not hasattr(backbone, 'stem')
764
- stem = backbone if stem_only else backbone.stem
765
- stem.conv.weight.copy_(adapt_input_conv(stem.conv.weight.shape[1], _n2p(w[f'{prefix}conv_root/kernel'])))
766
- stem.norm.weight.copy_(_n2p(w[f'{prefix}gn_root/scale']))
767
- stem.norm.bias.copy_(_n2p(w[f'{prefix}gn_root/bias']))
768
- if not stem_only:
769
- for i, stage in enumerate(backbone.stages):
770
- for j, block in enumerate(stage.blocks):
771
- bp = f'{prefix}block{i + 1}/unit{j + 1}/'
772
- for r in range(3):
773
- getattr(block, f'conv{r + 1}').weight.copy_(_n2p(w[f'{bp}conv{r + 1}/kernel']))
774
- getattr(block, f'norm{r + 1}').weight.copy_(_n2p(w[f'{bp}gn{r + 1}/scale']))
775
- getattr(block, f'norm{r + 1}').bias.copy_(_n2p(w[f'{bp}gn{r + 1}/bias']))
776
- if block.downsample is not None:
777
- block.downsample.conv.weight.copy_(_n2p(w[f'{bp}conv_proj/kernel']))
778
- block.downsample.norm.weight.copy_(_n2p(w[f'{bp}gn_proj/scale']))
779
- block.downsample.norm.bias.copy_(_n2p(w[f'{bp}gn_proj/bias']))
780
- embed_conv_w = _n2p(w[f'{prefix}embedding/kernel'])
781
- else:
782
- embed_conv_w = adapt_input_conv(
783
- model.patch_embed.proj.weight.shape[1], _n2p(w[f'{prefix}embedding/kernel']))
784
- if embed_conv_w.shape[-2:] != model.patch_embed.proj.weight.shape[-2:]:
785
- embed_conv_w = resample_patch_embed(
786
- embed_conv_w,
787
- model.patch_embed.proj.weight.shape[-2:],
788
- interpolation=interpolation,
789
- antialias=antialias,
790
- verbose=True,
791
- )
792
-
793
- model.patch_embed.proj.weight.copy_(embed_conv_w)
794
- model.patch_embed.proj.bias.copy_(_n2p(w[f'{prefix}embedding/bias']))
795
- if model.cls_token is not None:
796
- model.cls_token.copy_(_n2p(w[f'{prefix}cls'], t=False))
797
- if big_vision:
798
- pos_embed_w = _n2p(w[f'{prefix}pos_embedding'], t=False)
799
- else:
800
- pos_embed_w = _n2p(w[f'{prefix}Transformer/posembed_input/pos_embedding'], t=False)
801
- if pos_embed_w.shape != model.pos_embed.shape:
802
- old_shape = pos_embed_w.shape
803
- num_prefix_tokens = 0 if getattr(model, 'no_embed_class', False) else getattr(model, 'num_prefix_tokens', 1)
804
- pos_embed_w = resample_abs_pos_embed( # resize pos embedding when different size from pretrained weights
805
- pos_embed_w,
806
- new_size=model.patch_embed.grid_size,
807
- num_prefix_tokens=num_prefix_tokens,
808
- interpolation=interpolation,
809
- antialias=antialias,
810
- verbose=True,
811
- )
812
- model.pos_embed.copy_(pos_embed_w)
813
- model.norm.weight.copy_(_n2p(w[f'{prefix}Transformer/encoder_norm/scale']))
814
- model.norm.bias.copy_(_n2p(w[f'{prefix}Transformer/encoder_norm/bias']))
815
- if isinstance(model.head, nn.Linear) and model.head.bias.shape[0] == w[f'{prefix}head/bias'].shape[-1]:
816
- model.head.weight.copy_(_n2p(w[f'{prefix}head/kernel']))
817
- model.head.bias.copy_(_n2p(w[f'{prefix}head/bias']))
818
- # NOTE representation layer has been removed, not used in latest 21k/1k pretrained weights
819
- # if isinstance(getattr(model.pre_logits, 'fc', None), nn.Linear) and f'{prefix}pre_logits/bias' in w:
820
- # model.pre_logits.fc.weight.copy_(_n2p(w[f'{prefix}pre_logits/kernel']))
821
- # model.pre_logits.fc.bias.copy_(_n2p(w[f'{prefix}pre_logits/bias']))
822
- mha_sub, b_sub, ln1_sub = (0, 0, 1) if big_vision else (1, 3, 2)
823
- for i, block in enumerate(model.blocks.children()):
824
- block_prefix = f'{prefix}Transformer/encoderblock_{i}/'
825
- mha_prefix = block_prefix + f'MultiHeadDotProductAttention_{mha_sub}/'
826
- block.norm1.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/scale']))
827
- block.norm1.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/bias']))
828
- block.attn.qkv.weight.copy_(torch.cat([
829
- _n2p(w[f'{mha_prefix}{n}/kernel'], t=False).flatten(1).T for n in ('query', 'key', 'value')]))
830
- block.attn.qkv.bias.copy_(torch.cat([
831
- _n2p(w[f'{mha_prefix}{n}/bias'], t=False).reshape(-1) for n in ('query', 'key', 'value')]))
832
- block.attn.proj.weight.copy_(_n2p(w[f'{mha_prefix}out/kernel']).flatten(1))
833
- block.attn.proj.bias.copy_(_n2p(w[f'{mha_prefix}out/bias']))
834
- for r in range(2):
835
- getattr(block.mlp, f'fc{r + 1}').weight.copy_(_n2p(w[f'{block_prefix}MlpBlock_{b_sub}/Dense_{r}/kernel']))
836
- getattr(block.mlp, f'fc{r + 1}').bias.copy_(_n2p(w[f'{block_prefix}MlpBlock_{b_sub}/Dense_{r}/bias']))
837
- block.norm2.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_{ln1_sub}/scale']))
838
- block.norm2.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_{ln1_sub}/bias']))
839
-
840
-
841
- def _convert_openai_clip(state_dict, model):
842
- out_dict = {}
843
- swaps = [
844
- ('visual.', ''), ('conv1', 'patch_embed.proj'), ('positional_embedding', 'pos_embed'),
845
- ('transformer.resblocks.', 'blocks.'), ('ln_pre', 'norm_pre'), ('ln_post', 'norm'), ('ln_', 'norm'),
846
- ('in_proj_', 'qkv.'), ('out_proj', 'proj'), ('mlp.c_fc', 'mlp.fc1'), ('mlp.c_proj', 'mlp.fc2'),
847
- ]
848
- for k, v in state_dict.items():
849
- if not k.startswith('visual.'):
850
- continue
851
- for sp in swaps:
852
- k = k.replace(sp[0], sp[1])
853
-
854
- if k == 'proj':
855
- k = 'head.weight'
856
- v = v.transpose(0, 1)
857
- out_dict['head.bias'] = torch.zeros(v.shape[0])
858
- elif k == 'class_embedding':
859
- k = 'cls_token'
860
- v = v.unsqueeze(0).unsqueeze(1)
861
- elif k == 'pos_embed':
862
- v = v.unsqueeze(0)
863
- if v.shape[1] != model.pos_embed.shape[1]:
864
- # To resize pos embedding when using model at different size from pretrained weights
865
- v = resize_pos_embed(
866
- v,
867
- model.pos_embed,
868
- 0 if getattr(model, 'no_embed_class') else getattr(model, 'num_prefix_tokens', 1),
869
- model.patch_embed.grid_size
870
- )
871
- out_dict[k] = v
872
- return out_dict
873
-
874
-
875
- def checkpoint_filter_fn(
876
- state_dict,
877
- model,
878
- adapt_layer_scale=False,
879
- interpolation='bicubic',
880
- antialias=True,
881
- ):
882
- """ convert patch embedding weight from manual patchify + linear proj to conv"""
883
- import re
884
- out_dict = {}
885
- if 'model' in state_dict:
886
- # For deit models
887
- state_dict = state_dict['model']
888
-
889
- if 'visual.class_embedding' in state_dict:
890
- return _convert_openai_clip(state_dict, model)
891
-
892
- for k, v in state_dict.items():
893
- if 'patch_embed.proj.weight' in k:
894
- O, I, H, W = model.patch_embed.proj.weight.shape
895
- if len(v.shape) < 4:
896
- # For old models that I trained prior to conv based patchification
897
-                 # (O, I, H, W already unpacked above)
898
- v = v.reshape(O, -1, H, W)
899
- if v.shape[-1] != W or v.shape[-2] != H:
900
- v = resample_patch_embed(
901
- v,
902
- (H, W),
903
- interpolation=interpolation,
904
- antialias=antialias,
905
- verbose=True,
906
- )
907
- elif k == 'pos_embed' and v.shape[1] != model.pos_embed.shape[1]:
908
- # To resize pos embedding when using model at different size from pretrained weights
909
- num_prefix_tokens = 0 if getattr(model, 'no_embed_class', False) else getattr(model, 'num_prefix_tokens', 1)
910
- v = resample_abs_pos_embed(
911
- v,
912
- new_size=model.patch_embed.grid_size,
913
- num_prefix_tokens=num_prefix_tokens,
914
- interpolation=interpolation,
915
- antialias=antialias,
916
- verbose=True,
917
- )
918
- elif adapt_layer_scale and 'gamma_' in k:
919
- # remap layer-scale gamma into sub-module (deit3 models)
920
- k = re.sub(r'gamma_([0-9])', r'ls\1.gamma', k)
921
- elif 'pre_logits' in k:
922
- # NOTE representation layer removed as not used in latest 21k/1k pretrained weights
923
- continue
924
- out_dict[k] = v
925
- return out_dict
926
-
927
-
928
- def _cfg(url='', **kwargs):
929
- return {
930
- 'url': url,
931
- 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None,
932
- 'crop_pct': .9, 'interpolation': 'bicubic', 'fixed_input_size': True,
933
- 'mean': IMAGENET_INCEPTION_MEAN, 'std': IMAGENET_INCEPTION_STD,
934
- 'first_conv': 'patch_embed.proj', 'classifier': 'head',
935
- **kwargs
936
- }
937
-
938
-
939
- default_cfgs = generate_default_cfgs({
940
-
941
- # re-finetuned augreg 21k FT on in1k weights
942
- 'vit_base_patch16_224.augreg2_in21k_ft_in1k': _cfg(
943
- hf_hub_id='timm/'),
944
- 'vit_base_patch16_384.augreg2_in21k_ft_in1k': _cfg(),
945
- 'vit_base_patch8_224.augreg2_in21k_ft_in1k': _cfg(
946
- hf_hub_id='timm/'),
947
-
948
- # How to train your ViT (augreg) weights, pretrained on 21k FT on in1k
949
- 'vit_tiny_patch16_224.augreg_in21k_ft_in1k': _cfg(
950
- url='https://storage.googleapis.com/vit_models/augreg/Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_224.npz',
951
- hf_hub_id='timm/',
952
- custom_load=True),
953
- 'vit_tiny_patch16_384.augreg_in21k_ft_in1k': _cfg(
954
- url='https://storage.googleapis.com/vit_models/augreg/Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_384.npz',
955
- hf_hub_id='timm/',
956
- custom_load=True, input_size=(3, 384, 384), crop_pct=1.0),
957
- 'vit_small_patch32_224.augreg_in21k_ft_in1k': _cfg(
958
- url='https://storage.googleapis.com/vit_models/augreg/S_32-i21k-300ep-lr_0.001-aug_light1-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_224.npz',
959
- hf_hub_id='timm/',
960
- custom_load=True),
961
- 'vit_small_patch32_384.augreg_in21k_ft_in1k': _cfg(
962
- url='https://storage.googleapis.com/vit_models/augreg/S_32-i21k-300ep-lr_0.001-aug_light1-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_384.npz',
963
- hf_hub_id='timm/',
964
- custom_load=True, input_size=(3, 384, 384), crop_pct=1.0),
965
- 'vit_small_patch16_224.augreg_in21k_ft_in1k': _cfg(
966
- url='https://storage.googleapis.com/vit_models/augreg/S_16-i21k-300ep-lr_0.001-aug_light1-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_224.npz',
967
- hf_hub_id='timm/',
968
- custom_load=True),
969
- 'vit_small_patch16_384.augreg_in21k_ft_in1k': _cfg(
970
- url='https://storage.googleapis.com/vit_models/augreg/S_16-i21k-300ep-lr_0.001-aug_light1-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_384.npz',
971
- hf_hub_id='timm/',
972
- custom_load=True, input_size=(3, 384, 384), crop_pct=1.0),
973
- 'vit_base_patch32_224.augreg_in21k_ft_in1k': _cfg(
974
- url='https://storage.googleapis.com/vit_models/augreg/B_32-i21k-300ep-lr_0.001-aug_medium1-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_224.npz',
975
- hf_hub_id='timm/',
976
- custom_load=True),
977
- 'vit_base_patch32_384.augreg_in21k_ft_in1k': _cfg(
978
- url='https://storage.googleapis.com/vit_models/augreg/B_32-i21k-300ep-lr_0.001-aug_light1-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_384.npz',
979
- hf_hub_id='timm/',
980
- custom_load=True, input_size=(3, 384, 384), crop_pct=1.0),
981
- 'vit_base_patch16_224.augreg_in21k_ft_in1k': _cfg(
982
- url='https://storage.googleapis.com/vit_models/augreg/B_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.01-res_224.npz',
983
- hf_hub_id='timm/',
984
- custom_load=True),
985
- 'vit_base_patch16_384.augreg_in21k_ft_in1k': _cfg(
986
- url='https://storage.googleapis.com/vit_models/augreg/B_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.01-res_384.npz',
987
- hf_hub_id='timm/',
988
- custom_load=True, input_size=(3, 384, 384), crop_pct=1.0),
989
- 'vit_base_patch8_224.augreg_in21k_ft_in1k': _cfg(
990
- url='https://storage.googleapis.com/vit_models/augreg/B_8-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.01-res_224.npz',
991
- hf_hub_id='timm/',
992
- custom_load=True),
993
- 'vit_large_patch16_224.augreg_in21k_ft_in1k': _cfg(
994
- url='https://storage.googleapis.com/vit_models/augreg/L_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.1-sd_0.1--imagenet2012-steps_20k-lr_0.01-res_224.npz',
995
- hf_hub_id='timm/',
996
- custom_load=True),
997
- 'vit_large_patch16_384.augreg_in21k_ft_in1k': _cfg(
998
- url='https://storage.googleapis.com/vit_models/augreg/L_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.1-sd_0.1--imagenet2012-steps_20k-lr_0.01-res_384.npz',
999
- hf_hub_id='timm/',
1000
- custom_load=True, input_size=(3, 384, 384), crop_pct=1.0),
1001
-
1002
- # patch models (weights from official Google JAX impl) pretrained on in21k FT on in1k
1003
- 'vit_base_patch16_224.orig_in21k_ft_in1k': _cfg(
1004
- url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_p16_224-80ecf9dd.pth',
1005
- hf_hub_id='timm/'),
1006
- 'vit_base_patch16_384.orig_in21k_ft_in1k': _cfg(
1007
- url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_p16_384-83fb41ba.pth',
1008
- hf_hub_id='timm/',
1009
- input_size=(3, 384, 384), crop_pct=1.0),
1010
- 'vit_large_patch32_384.orig_in21k_ft_in1k': _cfg(
1011
- url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_p32_384-9b920ba8.pth',
1012
- hf_hub_id='timm/',
1013
- input_size=(3, 384, 384), crop_pct=1.0),
1014
-
1015
- # How to train your ViT (augreg) weights trained on in1k only
1016
- 'vit_small_patch16_224.augreg_in1k': _cfg(
1017
- url='https://storage.googleapis.com/vit_models/augreg/S_16-i1k-300ep-lr_0.001-aug_medium2-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.01-res_224.npz',
1018
- hf_hub_id='timm/',
1019
- custom_load=True),
1020
- 'vit_small_patch16_384.augreg_in1k': _cfg(
1021
- url='https://storage.googleapis.com/vit_models/augreg/S_16-i1k-300ep-lr_0.001-aug_medium2-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.01-res_384.npz',
1022
- hf_hub_id='timm/',
1023
- custom_load=True, input_size=(3, 384, 384), crop_pct=1.0),
1024
- 'vit_base_patch32_224.augreg_in1k': _cfg(
1025
- url='https://storage.googleapis.com/vit_models/augreg/B_32-i1k-300ep-lr_0.001-aug_medium2-wd_0.1-do_0.1-sd_0.1--imagenet2012-steps_20k-lr_0.01-res_224.npz',
1026
- hf_hub_id='timm/',
1027
- custom_load=True),
1028
- 'vit_base_patch32_384.augreg_in1k': _cfg(
1029
- url='https://storage.googleapis.com/vit_models/augreg/B_32-i1k-300ep-lr_0.001-aug_medium2-wd_0.1-do_0.1-sd_0.1--imagenet2012-steps_20k-lr_0.01-res_384.npz',
1030
- hf_hub_id='timm/',
1031
- custom_load=True, input_size=(3, 384, 384), crop_pct=1.0),
1032
- 'vit_base_patch16_224.augreg_in1k': _cfg(
1033
- url='https://storage.googleapis.com/vit_models/augreg/B_16-i1k-300ep-lr_0.001-aug_strong2-wd_0.1-do_0.1-sd_0.1--imagenet2012-steps_20k-lr_0.01-res_224.npz',
1034
- hf_hub_id='timm/',
1035
- custom_load=True),
1036
- 'vit_base_patch16_384.augreg_in1k': _cfg(
1037
- url='https://storage.googleapis.com/vit_models/augreg/B_16-i1k-300ep-lr_0.001-aug_strong2-wd_0.1-do_0.1-sd_0.1--imagenet2012-steps_20k-lr_0.01-res_384.npz',
1038
- hf_hub_id='timm/',
1039
- custom_load=True, input_size=(3, 384, 384), crop_pct=1.0),
1040
-
1041
- 'vit_large_patch14_224.untrained': _cfg(url=''),
1042
- 'vit_huge_patch14_224.untrained': _cfg(url=''),
1043
- 'vit_giant_patch14_224.untrained': _cfg(url=''),
1044
- 'vit_gigantic_patch14_224.untrained': _cfg(url=''),
1045
-
1046
- # patch models, imagenet21k (weights from official Google JAX impl)
1047
- 'vit_large_patch32_224.orig_in21k': _cfg(
1048
- url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_patch32_224_in21k-9046d2e7.pth',
1049
- hf_hub_id='timm/',
1050
- num_classes=21843),
1051
- 'vit_huge_patch14_224.orig_in21k': _cfg(
1052
- url='https://storage.googleapis.com/vit_models/imagenet21k/ViT-H_14.npz',
1053
- hf_hub_id='timm/',
1054
- custom_load=True, num_classes=21843),
1055
-
1056
- # How to train your ViT (augreg) weights, pretrained on in21k
1057
- 'vit_tiny_patch16_224.augreg_in21k': _cfg(
1058
- url='https://storage.googleapis.com/vit_models/augreg/Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0.npz',
1059
- hf_hub_id='timm/',
1060
- custom_load=True, num_classes=21843),
1061
- 'vit_small_patch32_224.augreg_in21k': _cfg(
1062
- url='https://storage.googleapis.com/vit_models/augreg/S_32-i21k-300ep-lr_0.001-aug_light1-wd_0.03-do_0.0-sd_0.0.npz',
1063
- hf_hub_id='timm/',
1064
- custom_load=True, num_classes=21843),
1065
- 'vit_small_patch16_224.augreg_in21k': _cfg(
1066
- url='https://storage.googleapis.com/vit_models/augreg/S_16-i21k-300ep-lr_0.001-aug_light1-wd_0.03-do_0.0-sd_0.0.npz',
1067
- hf_hub_id='timm/',
1068
- custom_load=True, num_classes=21843),
1069
- 'vit_base_patch32_224.augreg_in21k': _cfg(
1070
- url='https://storage.googleapis.com/vit_models/augreg/B_32-i21k-300ep-lr_0.001-aug_medium1-wd_0.03-do_0.0-sd_0.0.npz',
1071
- hf_hub_id='timm/',
1072
- custom_load=True, num_classes=21843),
1073
- 'vit_base_patch16_224.augreg_in21k': _cfg(
1074
- url='https://storage.googleapis.com/vit_models/augreg/B_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.0-sd_0.0.npz',
1075
- hf_hub_id='timm/',
1076
- custom_load=True, num_classes=21843),
1077
- 'vit_base_patch8_224.augreg_in21k': _cfg(
1078
- url='https://storage.googleapis.com/vit_models/augreg/B_8-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.0-sd_0.0.npz',
1079
- hf_hub_id='timm/',
1080
- custom_load=True, num_classes=21843),
1081
- 'vit_large_patch16_224.augreg_in21k': _cfg(
1082
- url='https://storage.googleapis.com/vit_models/augreg/L_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.1-sd_0.1.npz',
1083
- hf_hub_id='timm/',
1084
- custom_load=True, num_classes=21843),
1085
-
1086
- # SAM trained models (https://arxiv.org/abs/2106.01548)
1087
- 'vit_base_patch32_224.sam': _cfg(
1088
- url='https://storage.googleapis.com/vit_models/sam/ViT-B_32.npz', custom_load=True,
1089
- hf_hub_id='timm/'),
1090
- 'vit_base_patch16_224.sam': _cfg(
1091
- url='https://storage.googleapis.com/vit_models/sam/ViT-B_16.npz', custom_load=True,
1092
- hf_hub_id='timm/'),
1093
-
1094
- # DINO pretrained - https://arxiv.org/abs/2104.14294 (no classifier head, for fine-tune only)
1095
- 'vit_small_patch16_224.dino': _cfg(
1096
- url='https://dl.fbaipublicfiles.com/dino/dino_deitsmall16_pretrain/dino_deitsmall16_pretrain.pth',
1097
- hf_hub_id='timm/',
1098
- mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD, num_classes=0),
1099
- 'vit_small_patch8_224.dino': _cfg(
1100
- url='https://dl.fbaipublicfiles.com/dino/dino_deitsmall8_pretrain/dino_deitsmall8_pretrain.pth',
1101
- hf_hub_id='timm/',
1102
- mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD, num_classes=0),
1103
- 'vit_base_patch16_224.dino': _cfg(
1104
- url='https://dl.fbaipublicfiles.com/dino/dino_vitbase16_pretrain/dino_vitbase16_pretrain.pth',
1105
- hf_hub_id='timm/',
1106
- mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD, num_classes=0),
1107
- 'vit_base_patch8_224.dino': _cfg(
1108
- url='https://dl.fbaipublicfiles.com/dino/dino_vitbase8_pretrain/dino_vitbase8_pretrain.pth',
1109
- hf_hub_id='timm/',
1110
- mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD, num_classes=0),
1111
-
1112
- # ViT ImageNet-21K-P pretraining by MILL
1113
- 'vit_base_patch16_224_miil.in21k': _cfg(
1114
- url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/vit_base_patch16_224_in21k_miil-887286df.pth',
1115
- hf_hub_id='timm/',
1116
- mean=(0., 0., 0.), std=(1., 1., 1.), crop_pct=0.875, interpolation='bilinear', num_classes=11221),
1117
- 'vit_base_patch16_224_miil.in21k_ft_in1k': _cfg(
1118
- url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/vit_base_patch16_224_1k_miil_84_4-2deb18e3.pth',
1119
- hf_hub_id='timm/',
1120
- mean=(0., 0., 0.), std=(1., 1., 1.), crop_pct=0.875, interpolation='bilinear'),
1121
-
1122
- # Custom timm variants
1123
- 'vit_base_patch16_rpn_224.in1k': _cfg(
1124
- url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/vit_base_patch16_rpn_224-sw-3b07e89d.pth',
1125
- hf_hub_id='timm/'),
1126
- 'vit_medium_patch16_gap_240.in12k': _cfg(
1127
- hf_hub_id='timm/',
1128
- input_size=(3, 240, 240), crop_pct=0.95, num_classes=11821),
1129
- 'vit_medium_patch16_gap_256.in12k_ft_in1k': _cfg(
1130
- hf_hub_id='timm/',
1131
- input_size=(3, 256, 256), crop_pct=0.95),
1132
- 'vit_medium_patch16_gap_384.in12k_ft_in1k': _cfg(
1133
- hf_hub_id='timm/',
1134
- input_size=(3, 384, 384), crop_pct=0.95, crop_mode='squash'),
1135
- 'vit_base_patch16_gap_224': _cfg(),
1136
-
1137
- # CLIP pretrained image tower and related fine-tuned weights
1138
- 'vit_base_patch32_clip_224.laion2b_ft_in12k_in1k': _cfg(
1139
- hf_hub_id='timm/',
1140
- mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD),
1141
- 'vit_base_patch32_clip_384.laion2b_ft_in12k_in1k': _cfg(
1142
- hf_hub_id='timm/',
1143
- mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, input_size=(3, 384, 384)),
1144
- 'vit_base_patch32_clip_448.laion2b_ft_in12k_in1k': _cfg(
1145
- hf_hub_id='timm/',
1146
- mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, input_size=(3, 448, 448)),
1147
- 'vit_base_patch16_clip_224.laion2b_ft_in12k_in1k': _cfg(
1148
- hf_hub_id='timm/',
1149
- mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=0.95),
1150
- 'vit_base_patch16_clip_384.laion2b_ft_in12k_in1k': _cfg(
1151
- hf_hub_id='timm/',
1152
- mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
1153
- crop_pct=1.0, input_size=(3, 384, 384), crop_mode='squash'),
1154
- 'vit_large_patch14_clip_224.laion2b_ft_in12k_in1k': _cfg(
1155
- hf_hub_id='timm/',
1156
- mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, crop_pct=1.0),
1157
- 'vit_large_patch14_clip_336.laion2b_ft_in12k_in1k': _cfg(
1158
- hf_hub_id='timm/',
1159
- mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD,
1160
- crop_pct=1.0, input_size=(3, 336, 336), crop_mode='squash'),
1161
- 'vit_huge_patch14_clip_224.laion2b_ft_in12k_in1k': _cfg(
1162
- hf_hub_id='timm/',
1163
- mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0),
1164
- 'vit_huge_patch14_clip_336.laion2b_ft_in12k_in1k': _cfg(
1165
- hf_hub_id='timm/',
1166
- mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
1167
- crop_pct=1.0, input_size=(3, 336, 336), crop_mode='squash'),
1168
-
1169
- 'vit_base_patch32_clip_224.openai_ft_in12k_in1k': _cfg(
1170
- # hf_hub_id='timm/vit_base_patch32_clip_224.openai_ft_in12k_in1k',
1171
- mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD),
1172
- 'vit_base_patch32_clip_384.openai_ft_in12k_in1k': _cfg(
1173
- hf_hub_id='timm/',
1174
- mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
1175
- crop_pct=0.95, input_size=(3, 384, 384), crop_mode='squash'),
1176
- 'vit_base_patch16_clip_224.openai_ft_in12k_in1k': _cfg(
1177
- hf_hub_id='timm/',
1178
- mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=0.95),
1179
- 'vit_base_patch16_clip_384.openai_ft_in12k_in1k': _cfg(
1180
- hf_hub_id='timm/',
1181
- mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
1182
- crop_pct=0.95, input_size=(3, 384, 384), crop_mode='squash'),
1183
- 'vit_large_patch14_clip_224.openai_ft_in12k_in1k': _cfg(
1184
- hf_hub_id='timm/',
1185
- mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0),
1186
- 'vit_large_patch14_clip_336.openai_ft_in12k_in1k': _cfg(
1187
- hf_hub_id='timm/',
1188
- mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
1189
- crop_pct=1.0, input_size=(3, 336, 336), crop_mode='squash'),
1190
-
1191
- 'vit_base_patch32_clip_224.laion2b_ft_in1k': _cfg(
1192
- hf_hub_id='timm/',
1193
- mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD),
1194
- 'vit_base_patch16_clip_224.laion2b_ft_in1k': _cfg(
1195
- hf_hub_id='timm/',
1196
- mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0),
1197
- 'vit_base_patch16_clip_384.laion2b_ft_in1k': _cfg(
1198
- hf_hub_id='timm/',
1199
- mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
1200
- crop_pct=1.0, input_size=(3, 384, 384), crop_mode='squash'),
1201
- 'vit_large_patch14_clip_224.laion2b_ft_in1k': _cfg(
1202
- hf_hub_id='timm/',
1203
- mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, crop_pct=1.0),
1204
- 'vit_large_patch14_clip_336.laion2b_ft_in1k': _cfg(
1205
- hf_hub_id='timm/',
1206
- mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD,
1207
- crop_pct=1.0, input_size=(3, 336, 336), crop_mode='squash'),
1208
- 'vit_huge_patch14_clip_224.laion2b_ft_in1k': _cfg(
1209
- hf_hub_id='timm/',
1210
- mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0),
1211
- 'vit_huge_patch14_clip_336.laion2b_ft_in1k': _cfg(
1212
- hf_hub_id='',
1213
- mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
1214
- crop_pct=1.0, input_size=(3, 336, 336), crop_mode='squash'),
1215
-
1216
- 'vit_base_patch32_clip_224.openai_ft_in1k': _cfg(
1217
- hf_hub_id='timm/',
1218
- mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD),
1219
- 'vit_base_patch16_clip_224.openai_ft_in1k': _cfg(
1220
- hf_hub_id='timm/',
1221
- mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD),
1222
- 'vit_base_patch16_clip_384.openai_ft_in1k': _cfg(
1223
- hf_hub_id='timm/',
1224
- mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
1225
- crop_pct=1.0, input_size=(3, 384, 384), crop_mode='squash'),
1226
- 'vit_large_patch14_clip_224.openai_ft_in1k': _cfg(
1227
- hf_hub_id='timm/',
1228
- mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0),
1229
-
1230
- 'vit_base_patch32_clip_224.laion2b_ft_in12k': _cfg(
1231
- #hf_hub_id='timm/vit_base_patch32_clip_224.laion2b_ft_in12k',
1232
- mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821),
1233
- 'vit_base_patch16_clip_224.laion2b_ft_in12k': _cfg(
1234
- hf_hub_id='timm/',
1235
- mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821),
1236
- 'vit_large_patch14_clip_224.laion2b_ft_in12k': _cfg(
1237
- hf_hub_id='timm/',
1238
-         mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, crop_pct=1.0, num_classes=11821),
-     'vit_huge_patch14_clip_224.laion2b_ft_in12k': _cfg(
-         hf_hub_id='timm/',
-         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=11821),
-
-     'vit_base_patch32_clip_224.openai_ft_in12k': _cfg(
-         # hf_hub_id='timm/vit_base_patch32_clip_224.openai_ft_in12k',
-         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821),
-     'vit_base_patch16_clip_224.openai_ft_in12k': _cfg(
-         hf_hub_id='timm/',
-         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821),
-     'vit_large_patch14_clip_224.openai_ft_in12k': _cfg(
-         hf_hub_id='timm/',
-         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=11821),
-
-     'vit_base_patch32_clip_224.laion2b': _cfg(
-         hf_hub_id='laion/CLIP-ViT-B-32-laion2B-s34B-b79K',
-         hf_hub_filename='open_clip_pytorch_model.bin',
-         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
-     'vit_base_patch16_clip_224.laion2b': _cfg(
-         # hf_hub_id='laion/CLIP-ViT-B-16-laion2B-s34B-b88K',
-         hf_hub_filename='open_clip_pytorch_model.bin',
-         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
-     'vit_large_patch14_clip_224.laion2b': _cfg(
-         hf_hub_id='laion/CLIP-ViT-L-14-laion2B-s32B-b82K',
-         hf_hub_filename='open_clip_pytorch_model.bin',
-         mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, crop_pct=1.0, num_classes=768),
-     'vit_huge_patch14_clip_224.laion2b': _cfg(
-         hf_hub_id='laion/CLIP-ViT-H-14-laion2B-s32B-b79K',
-         hf_hub_filename='open_clip_pytorch_model.bin',
-         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1024),
-     'vit_giant_patch14_clip_224.laion2b': _cfg(
-         hf_hub_id='laion/CLIP-ViT-g-14-laion2B-s12B-b42K',
-         hf_hub_filename='open_clip_pytorch_model.bin',
-         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1024),
-
-     'vit_base_patch32_clip_224.openai': _cfg(
-         hf_hub_id='timm/',
-         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
-     'vit_base_patch16_clip_224.openai': _cfg(
-         hf_hub_id='timm/',
-         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
-     'vit_large_patch14_clip_224.openai': _cfg(
-         hf_hub_id='timm/',
-         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768),
-
-     # experimental (may be removed)
-     'vit_base_patch32_plus_256': _cfg(url='', input_size=(3, 256, 256), crop_pct=0.95),
-     'vit_base_patch16_plus_240': _cfg(url='', input_size=(3, 240, 240), crop_pct=0.95),
-     'vit_small_patch16_36x1_224': _cfg(url=''),
-     'vit_small_patch16_18x2_224': _cfg(url=''),
-     'vit_base_patch16_18x2_224': _cfg(url=''),
-
-     # EVA fine-tuned weights from MAE style MIM - EVA-CLIP target pretrain
-     # https://github.com/baaivision/EVA/blob/7ecf2c0a370d97967e86d047d7af9188f78d2df3/eva/README.md#eva-l-learning-better-mim-representations-from-eva-clip
-     'eva_large_patch14_196.in22k_ft_in22k_in1k': _cfg(
-         # hf_hub_id='BAAI/EVA', hf_hub_filename='eva_l_psz14_196px_21k_to_1k_ft_88p6.pt',
-         hf_hub_id='timm/',
-         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
-         input_size=(3, 196, 196), crop_pct=1.0),
-     'eva_large_patch14_336.in22k_ft_in22k_in1k': _cfg(
-         # hf_hub_id='BAAI/EVA', hf_hub_filename='eva_l_psz14_336px_21k_to_1k_ft_89p2.pt',
-         hf_hub_id='timm/',
-         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
-         input_size=(3, 336, 336), crop_pct=1.0, crop_mode='squash'),
-     'eva_large_patch14_196.in22k_ft_in1k': _cfg(
-         # hf_hub_id='BAAI/EVA', hf_hub_filename='eva_l_psz14_196px_1k_ft_88p0.pt',
-         hf_hub_id='timm/',
-         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
-         input_size=(3, 196, 196), crop_pct=1.0),
-     'eva_large_patch14_336.in22k_ft_in1k': _cfg(
-         # hf_hub_id='BAAI/EVA', hf_hub_filename='eva_l_psz14_336px_1k_ft_88p65.pt',
-         hf_hub_id='timm/',
-         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
-         input_size=(3, 336, 336), crop_pct=1.0, crop_mode='squash'),
-
-     'flexivit_small.1200ep_in1k': _cfg(
-         url='https://storage.googleapis.com/big_vision/flexivit/flexivit_s_i1k.npz', custom_load=True,
-         hf_hub_id='timm/',
-         input_size=(3, 240, 240), crop_pct=0.95),
-     'flexivit_small.600ep_in1k': _cfg(
-         url='https://storage.googleapis.com/big_vision/flexivit/flexivit_s_i1k_600ep.npz', custom_load=True,
-         hf_hub_id='timm/',
-         input_size=(3, 240, 240), crop_pct=0.95),
-     'flexivit_small.300ep_in1k': _cfg(
-         url='https://storage.googleapis.com/big_vision/flexivit/flexivit_s_i1k_300ep.npz', custom_load=True,
-         hf_hub_id='timm/',
-         input_size=(3, 240, 240), crop_pct=0.95),
-
-     'flexivit_base.1200ep_in1k': _cfg(
-         url='https://storage.googleapis.com/big_vision/flexivit/flexivit_b_i1k.npz', custom_load=True,
-         hf_hub_id='timm/',
-         input_size=(3, 240, 240), crop_pct=0.95),
-     'flexivit_base.600ep_in1k': _cfg(
-         url='https://storage.googleapis.com/big_vision/flexivit/flexivit_b_i1k_600ep.npz', custom_load=True,
-         hf_hub_id='timm/',
-         input_size=(3, 240, 240), crop_pct=0.95),
-     'flexivit_base.300ep_in1k': _cfg(
-         url='https://storage.googleapis.com/big_vision/flexivit/flexivit_b_i1k_300ep.npz', custom_load=True,
-         hf_hub_id='timm/',
-         input_size=(3, 240, 240), crop_pct=0.95),
-     'flexivit_base.1000ep_in21k': _cfg(
-         url='https://storage.googleapis.com/big_vision/flexivit/flexivit_b_i21k_1000ep.npz', custom_load=True,
-         hf_hub_id='timm/',
-         input_size=(3, 240, 240), crop_pct=0.95, num_classes=21843),
-     'flexivit_base.300ep_in21k': _cfg(
-         url='https://storage.googleapis.com/big_vision/flexivit/flexivit_b_i21k_300ep.npz', custom_load=True,
-         hf_hub_id='timm/',
-         input_size=(3, 240, 240), crop_pct=0.95, num_classes=21843),
-
-     'flexivit_large.1200ep_in1k': _cfg(
-         url='https://storage.googleapis.com/big_vision/flexivit/flexivit_l_i1k.npz', custom_load=True,
-         hf_hub_id='timm/',
-         input_size=(3, 240, 240), crop_pct=0.95),
-     'flexivit_large.600ep_in1k': _cfg(
-         url='https://storage.googleapis.com/big_vision/flexivit/flexivit_l_i1k_600ep.npz', custom_load=True,
-         hf_hub_id='timm/',
-         input_size=(3, 240, 240), crop_pct=0.95),
-     'flexivit_large.300ep_in1k': _cfg(
-         url='https://storage.googleapis.com/big_vision/flexivit/flexivit_l_i1k_300ep.npz', custom_load=True,
-         hf_hub_id='timm/',
-         input_size=(3, 240, 240), crop_pct=0.95),
-
-     'flexivit_base.patch16_in21k': _cfg(
-         url='https://storage.googleapis.com/big_vision/flexivit/vit_b16_i21k_300ep.npz', custom_load=True,
-         hf_hub_id='timm/',
-         input_size=(3, 240, 240), crop_pct=0.95, num_classes=21843),
-     'flexivit_base.patch30_in21k': _cfg(
-         url='https://storage.googleapis.com/big_vision/flexivit/vit_b30_i21k_300ep.npz', custom_load=True,
-         hf_hub_id='timm/',
-         input_size=(3, 240, 240), crop_pct=0.95, num_classes=21843),
- })
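For orientation: the mean/std/input_size/crop values carried by these _cfg entries drive eval-time preprocessing. A minimal sketch of how they are consumed, assuming a recent timm release that provides resolve_model_data_config and create_transform:

import timm
from timm.data import resolve_model_data_config, create_transform

# a CLIP-normalized variant from the cfgs above; pretrained=False keeps this offline
model = timm.create_model('vit_base_patch16_clip_224.openai', pretrained=False)
data_cfg = resolve_model_data_config(model)          # mean/std/input_size/crop_pct from the cfg
transform = create_transform(**data_cfg, is_training=False)
print(data_cfg['mean'], data_cfg['std'])             # the OPENAI_CLIP_MEAN / OPENAI_CLIP_STD values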
-
-
- def _create_vision_transformer(variant, pretrained=False, **kwargs):
-     if kwargs.get('features_only', None):
-         raise RuntimeError('features_only not implemented for Vision Transformer models.')
-
-     if 'flexi' in variant:
-         # FIXME Google FlexiViT pretrained models have a strong preference for bilinear patch / embed
-         # interpolation; other pretrained models resize better w/ anti-aliased bicubic interpolation.
-         _filter_fn = partial(checkpoint_filter_fn, interpolation='bilinear', antialias=False)
-     else:
-         _filter_fn = checkpoint_filter_fn
-
-     return build_model_with_cfg(
-         VisionTransformer, variant, pretrained,
-         pretrained_filter_fn=_filter_fn,
-         **kwargs,
-     )
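This factory is the funnel for every @register_model entrypoint that follows: timm.create_model resolves the variant name in the registry, and build_model_with_cfg attaches the matching default_cfgs entry and optionally loads weights. A hedged usage sketch:

import timm

# resolves to the registered vit_small_patch16_224 entrypoint below, which calls
# _create_vision_transformer -> build_model_with_cfg under the hood
model = timm.create_model('vit_small_patch16_224', pretrained=False, num_classes=10)
print(type(model).__name__)  # VisionTransformer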
-
-
- @register_model
- def vit_tiny_patch16_224(pretrained=False, **kwargs):
-     """ ViT-Tiny (ViT-Ti/16)
-     """
-     model_kwargs = dict(patch_size=16, embed_dim=192, depth=12, num_heads=3)
-     model = _create_vision_transformer('vit_tiny_patch16_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
-     return model
-
-
- @register_model
- def vit_tiny_patch16_384(pretrained=False, **kwargs):
-     """ ViT-Tiny (ViT-Ti/16) @ 384x384.
-     """
-     model_kwargs = dict(patch_size=16, embed_dim=192, depth=12, num_heads=3)
-     model = _create_vision_transformer('vit_tiny_patch16_384', pretrained=pretrained, **dict(model_kwargs, **kwargs))
-     return model
-
-
- @register_model
- def vit_small_patch32_224(pretrained=False, **kwargs):
-     """ ViT-Small (ViT-S/32)
-     """
-     model_kwargs = dict(patch_size=32, embed_dim=384, depth=12, num_heads=6)
-     model = _create_vision_transformer('vit_small_patch32_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
-     return model
-
-
- @register_model
- def vit_small_patch32_384(pretrained=False, **kwargs):
-     """ ViT-Small (ViT-S/32) @ 384x384.
-     """
-     model_kwargs = dict(patch_size=32, embed_dim=384, depth=12, num_heads=6)
-     model = _create_vision_transformer('vit_small_patch32_384', pretrained=pretrained, **dict(model_kwargs, **kwargs))
-     return model
-
-
- @register_model
- def vit_small_patch16_224(pretrained=False, **kwargs):
-     """ ViT-Small (ViT-S/16)
-     """
-     model_kwargs = dict(patch_size=16, embed_dim=384, depth=12, num_heads=6)
-     model = _create_vision_transformer('vit_small_patch16_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
-     return model
-
-
- @register_model
- def vit_small_patch16_384(pretrained=False, **kwargs):
-     """ ViT-Small (ViT-S/16) @ 384x384.
-     """
-     model_kwargs = dict(patch_size=16, embed_dim=384, depth=12, num_heads=6)
-     model = _create_vision_transformer('vit_small_patch16_384', pretrained=pretrained, **dict(model_kwargs, **kwargs))
-     return model
-
-
- @register_model
- def vit_small_patch8_224(pretrained=False, **kwargs):
-     """ ViT-Small (ViT-S/8)
-     """
-     model_kwargs = dict(patch_size=8, embed_dim=384, depth=12, num_heads=6)
-     model = _create_vision_transformer('vit_small_patch8_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
-     return model
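Each entrypoint merges its defaults with caller kwargs via dict(model_kwargs, **kwargs), so caller arguments win. A quick forward-pass sanity check for one of the small variants (shapes follow from the configs above):

import torch
import timm

model = timm.create_model('vit_small_patch16_224', pretrained=False).eval()
with torch.no_grad():
    logits = model(torch.randn(1, 3, 224, 224))
print(logits.shape)  # torch.Size([1, 1000]) with the default 1000-class head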
-
-
- @register_model
- def vit_base_patch32_224(pretrained=False, **kwargs):
-     """ ViT-Base (ViT-B/32) from original paper (https://arxiv.org/abs/2010.11929).
-     ImageNet-1k weights fine-tuned from in21k, source https://github.com/google-research/vision_transformer.
-     """
-     model_kwargs = dict(patch_size=32, embed_dim=768, depth=12, num_heads=12)
-     model = _create_vision_transformer('vit_base_patch32_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
-     return model
-
-
- @register_model
- def vit_base_patch32_384(pretrained=False, **kwargs):
-     """ ViT-Base model (ViT-B/32) from original paper (https://arxiv.org/abs/2010.11929).
-     ImageNet-1k weights fine-tuned from in21k @ 384x384, source https://github.com/google-research/vision_transformer.
-     """
-     model_kwargs = dict(patch_size=32, embed_dim=768, depth=12, num_heads=12)
-     model = _create_vision_transformer('vit_base_patch32_384', pretrained=pretrained, **dict(model_kwargs, **kwargs))
-     return model
-
-
- @register_model
- def vit_base_patch16_224(pretrained=False, **kwargs):
-     """ ViT-Base (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929).
-     ImageNet-1k weights fine-tuned from in21k @ 224x224, source https://github.com/google-research/vision_transformer.
-     """
-     model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12)
-     model = _create_vision_transformer('vit_base_patch16_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
-     return model
-
-
- @register_model
- def vit_base_patch16_384(pretrained=False, **kwargs):
-     """ ViT-Base model (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929).
-     ImageNet-1k weights fine-tuned from in21k @ 384x384, source https://github.com/google-research/vision_transformer.
-     """
-     model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12)
-     model = _create_vision_transformer('vit_base_patch16_384', pretrained=pretrained, **dict(model_kwargs, **kwargs))
-     return model
-
-
- @register_model
- def vit_base_patch8_224(pretrained=False, **kwargs):
-     """ ViT-Base (ViT-B/8) from original paper (https://arxiv.org/abs/2010.11929).
-     ImageNet-1k weights fine-tuned from in21k @ 224x224, source https://github.com/google-research/vision_transformer.
-     """
-     model_kwargs = dict(patch_size=8, embed_dim=768, depth=12, num_heads=12)
-     model = _create_vision_transformer('vit_base_patch8_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
-     return model
-
-
- @register_model
- def vit_large_patch32_224(pretrained=False, **kwargs):
-     """ ViT-Large model (ViT-L/32) from original paper (https://arxiv.org/abs/2010.11929). No pretrained weights.
-     """
-     model_kwargs = dict(patch_size=32, embed_dim=1024, depth=24, num_heads=16)
-     model = _create_vision_transformer('vit_large_patch32_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
-     return model
-
-
- @register_model
- def vit_large_patch32_384(pretrained=False, **kwargs):
-     """ ViT-Large model (ViT-L/32) from original paper (https://arxiv.org/abs/2010.11929).
-     ImageNet-1k weights fine-tuned from in21k @ 384x384, source https://github.com/google-research/vision_transformer.
-     """
-     model_kwargs = dict(patch_size=32, embed_dim=1024, depth=24, num_heads=16)
-     model = _create_vision_transformer('vit_large_patch32_384', pretrained=pretrained, **dict(model_kwargs, **kwargs))
-     return model
-
-
- @register_model
- def vit_large_patch16_224(pretrained=False, **kwargs):
-     """ ViT-Large model (ViT-L/16) from original paper (https://arxiv.org/abs/2010.11929).
-     ImageNet-1k weights fine-tuned from in21k @ 224x224, source https://github.com/google-research/vision_transformer.
-     """
-     model_kwargs = dict(patch_size=16, embed_dim=1024, depth=24, num_heads=16)
-     model = _create_vision_transformer('vit_large_patch16_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
-     return model
-
-
- @register_model
- def vit_large_patch16_384(pretrained=False, **kwargs):
-     """ ViT-Large model (ViT-L/16) from original paper (https://arxiv.org/abs/2010.11929).
-     ImageNet-1k weights fine-tuned from in21k @ 384x384, source https://github.com/google-research/vision_transformer.
-     """
-     model_kwargs = dict(patch_size=16, embed_dim=1024, depth=24, num_heads=16)
-     model = _create_vision_transformer('vit_large_patch16_384', pretrained=pretrained, **dict(model_kwargs, **kwargs))
-     return model
-
-
- @register_model
- def vit_large_patch14_224(pretrained=False, **kwargs):
-     """ ViT-Large model (ViT-L/14)
-     """
-     model_kwargs = dict(patch_size=14, embed_dim=1024, depth=24, num_heads=16)
-     model = _create_vision_transformer('vit_large_patch14_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
-     return model
-
-
- @register_model
- def vit_huge_patch14_224(pretrained=False, **kwargs):
-     """ ViT-Huge model (ViT-H/14) from original paper (https://arxiv.org/abs/2010.11929).
-     """
-     model_kwargs = dict(patch_size=14, embed_dim=1280, depth=32, num_heads=16)
-     model = _create_vision_transformer('vit_huge_patch14_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
-     return model
-
-
- @register_model
- def vit_giant_patch14_224(pretrained=False, **kwargs):
-     """ ViT-Giant (little-g) model (ViT-g/14) from `Scaling Vision Transformers` - https://arxiv.org/abs/2106.04560
-     """
-     model_kwargs = dict(patch_size=14, embed_dim=1408, mlp_ratio=48/11, depth=40, num_heads=16)
-     model = _create_vision_transformer('vit_giant_patch14_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
-     return model
-
-
- @register_model
- def vit_gigantic_patch14_224(pretrained=False, **kwargs):
-     """ ViT-Gigantic (big-G) model (ViT-G/14) from `Scaling Vision Transformers` - https://arxiv.org/abs/2106.04560
-     """
-     model_kwargs = dict(patch_size=14, embed_dim=1664, mlp_ratio=64/13, depth=48, num_heads=16)
-     model = _create_vision_transformer(
-         'vit_gigantic_patch14_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
-     return model
-
-
- @register_model
- def vit_base_patch16_224_miil(pretrained=False, **kwargs):
-     """ ViT-Base (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929).
-     Weights taken from: https://github.com/Alibaba-MIIL/ImageNet21K
-     """
-     model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, qkv_bias=False)
-     model = _create_vision_transformer(
-         'vit_base_patch16_224_miil', pretrained=pretrained, **dict(model_kwargs, **kwargs))
-     return model
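The fractional mlp_ratio values for the g/14 and G/14 variants are not arbitrary: they make the MLP hidden width land on a clean size. A plain arithmetic check, no timm calls involved:

# ViT-g/14: embed_dim * mlp_ratio = 1408 * 48/11 = 6144 hidden units
# ViT-G/14: embed_dim * mlp_ratio = 1664 * 64/13 = 8192 hidden units
assert int(1408 * 48 / 11) == 6144
assert int(1664 * 64 / 13) == 8192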
-
-
- @register_model
- def vit_medium_patch16_gap_240(pretrained=False, **kwargs):
-     """ ViT-Medium (ViT-M/16) w/o class token, w/ avg-pool @ 240x240
-     """
-     model_kwargs = dict(
-         patch_size=16, embed_dim=512, depth=12, num_heads=8, class_token=False,
-         global_pool='avg', qkv_bias=False, init_values=1e-6, fc_norm=False)
-     model = _create_vision_transformer(
-         'vit_medium_patch16_gap_240', pretrained=pretrained, **dict(model_kwargs, **kwargs))
-     return model
-
-
- @register_model
- def vit_medium_patch16_gap_256(pretrained=False, **kwargs):
-     """ ViT-Medium (ViT-M/16) w/o class token, w/ avg-pool @ 256x256
-     """
-     model_kwargs = dict(
-         patch_size=16, embed_dim=512, depth=12, num_heads=8, class_token=False,
-         global_pool='avg', qkv_bias=False, init_values=1e-6, fc_norm=False)
-     model = _create_vision_transformer(
-         'vit_medium_patch16_gap_256', pretrained=pretrained, **dict(model_kwargs, **kwargs))
-     return model
-
-
- @register_model
- def vit_medium_patch16_gap_384(pretrained=False, **kwargs):
-     """ ViT-Medium (ViT-M/16) w/o class token, w/ avg-pool @ 384x384
-     """
-     model_kwargs = dict(
-         patch_size=16, embed_dim=512, depth=12, num_heads=8, class_token=False,
-         global_pool='avg', qkv_bias=False, init_values=1e-6, fc_norm=False)
-     model = _create_vision_transformer(
-         'vit_medium_patch16_gap_384', pretrained=pretrained, **dict(model_kwargs, **kwargs))
-     return model
-
-
- @register_model
- def vit_base_patch16_gap_224(pretrained=False, **kwargs):
-     """ ViT-Base (ViT-B/16) w/o class token, w/ avg-pool @ 224x224
-     """
-     model_kwargs = dict(
-         patch_size=16, embed_dim=768, depth=12, num_heads=16, class_token=False, global_pool='avg', fc_norm=False)
-     model = _create_vision_transformer(
-         'vit_base_patch16_gap_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
-     return model
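The GAP variants drop the class token (class_token=False) and average-pool patch tokens (global_pool='avg'), so forward_features should return patch tokens only. A sketch, assuming a 256x256 input for the gap_256 variant:

import torch
import timm

model = timm.create_model('vit_medium_patch16_gap_256', pretrained=False).eval()
feats = model.forward_features(torch.randn(1, 3, 256, 256))
print(feats.shape)  # expected torch.Size([1, 256, 512]): (256/16)**2 tokens, no class token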
-
-
- @register_model
- def vit_base_patch32_clip_224(pretrained=False, **kwargs):
-     """ ViT-B/32 CLIP image tower @ 224x224
-     """
-     model_kwargs = dict(
-         patch_size=32, embed_dim=768, depth=12, num_heads=12, pre_norm=True, norm_layer=nn.LayerNorm)
-     model = _create_vision_transformer(
-         'vit_base_patch32_clip_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
-     return model
-
-
- @register_model
- def vit_base_patch32_clip_384(pretrained=False, **kwargs):
-     """ ViT-B/32 CLIP image tower @ 384x384
-     """
-     model_kwargs = dict(
-         patch_size=32, embed_dim=768, depth=12, num_heads=12, pre_norm=True, norm_layer=nn.LayerNorm)
-     model = _create_vision_transformer(
-         'vit_base_patch32_clip_384', pretrained=pretrained, **dict(model_kwargs, **kwargs))
-     return model
-
-
- @register_model
- def vit_base_patch32_clip_448(pretrained=False, **kwargs):
-     """ ViT-B/32 CLIP image tower @ 448x448
-     """
-     model_kwargs = dict(
-         patch_size=32, embed_dim=768, depth=12, num_heads=12, pre_norm=True, norm_layer=nn.LayerNorm)
-     model = _create_vision_transformer(
-         'vit_base_patch32_clip_448', pretrained=pretrained, **dict(model_kwargs, **kwargs))
-     return model
-
-
- @register_model
- def vit_base_patch16_clip_224(pretrained=False, **kwargs):
-     """ ViT-B/16 CLIP image tower
-     """
-     model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, pre_norm=True, norm_layer=nn.LayerNorm)
-     model = _create_vision_transformer(
-         'vit_base_patch16_clip_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
-     return model
-
-
- @register_model
- def vit_base_patch16_clip_384(pretrained=False, **kwargs):
-     """ ViT-B/16 CLIP image tower @ 384x384
-     """
-     model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, pre_norm=True, norm_layer=nn.LayerNorm)
-     model = _create_vision_transformer(
-         'vit_base_patch16_clip_384', pretrained=pretrained, **dict(model_kwargs, **kwargs))
-     return model
-
-
- @register_model
- def vit_large_patch14_clip_224(pretrained=False, **kwargs):
-     """ ViT-Large model (ViT-L/14) CLIP image tower
-     """
-     model_kwargs = dict(patch_size=14, embed_dim=1024, depth=24, num_heads=16, pre_norm=True, norm_layer=nn.LayerNorm)
-     model = _create_vision_transformer(
-         'vit_large_patch14_clip_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
-     return model
-
-
- @register_model
- def vit_large_patch14_clip_336(pretrained=False, **kwargs):
-     """ ViT-Large model (ViT-L/14) CLIP image tower @ 336x336
-     """
-     model_kwargs = dict(patch_size=14, embed_dim=1024, depth=24, num_heads=16, pre_norm=True, norm_layer=nn.LayerNorm)
-     model = _create_vision_transformer(
-         'vit_large_patch14_clip_336', pretrained=pretrained, **dict(model_kwargs, **kwargs))
-     return model
-
-
- @register_model
- def vit_huge_patch14_clip_224(pretrained=False, **kwargs):
-     """ ViT-Huge model (ViT-H/14) CLIP image tower.
-     """
-     model_kwargs = dict(patch_size=14, embed_dim=1280, depth=32, num_heads=16, pre_norm=True, norm_layer=nn.LayerNorm)
-     model = _create_vision_transformer(
-         'vit_huge_patch14_clip_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
-     return model
-
-
- @register_model
- def vit_huge_patch14_clip_336(pretrained=False, **kwargs):
-     """ ViT-Huge model (ViT-H/14) CLIP image tower @ 336x336
-     """
-     model_kwargs = dict(patch_size=14, embed_dim=1280, depth=32, num_heads=16, pre_norm=True, norm_layer=nn.LayerNorm)
-     model = _create_vision_transformer(
-         'vit_huge_patch14_clip_336', pretrained=pretrained, **dict(model_kwargs, **kwargs))
-     return model
-
-
- @register_model
- def vit_giant_patch14_clip_224(pretrained=False, **kwargs):
-     """ ViT-Giant (little-g) model (ViT-g/14) from `Scaling Vision Transformers` - https://arxiv.org/abs/2106.04560
-     Pretrained weights from CLIP image tower.
-     """
-     model_kwargs = dict(
-         patch_size=14, embed_dim=1408, mlp_ratio=48/11, depth=40, num_heads=16, pre_norm=True, norm_layer=nn.LayerNorm)
-     model = _create_vision_transformer(
-         'vit_giant_patch14_clip_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
-     return model
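For the CLIP towers, the num_classes values in the cfgs earlier (512/768/1024) are the CLIP projection width, not the transformer width; passing num_classes=0 at creation strips that head and exposes pooled backbone features instead. A hedged sketch:

import timm

# With the projection head removed, the ViT-B/16 tower exposes its 768-dim embed width.
tower = timm.create_model('vit_base_patch16_clip_224.openai', pretrained=False, num_classes=0)
print(tower.num_features)  # 768, independent of the 512-dim CLIP projection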
-
-
- # Experimental models below
-
- @register_model
- def vit_base_patch32_plus_256(pretrained=False, **kwargs):
-     """ ViT-Base (ViT-B/32+)
-     """
-     model_kwargs = dict(patch_size=32, embed_dim=896, depth=12, num_heads=14, init_values=1e-5)
-     model = _create_vision_transformer(
-         'vit_base_patch32_plus_256', pretrained=pretrained, **dict(model_kwargs, **kwargs))
-     return model
-
-
- @register_model
- def vit_base_patch16_plus_240(pretrained=False, **kwargs):
-     """ ViT-Base (ViT-B/16+)
-     """
-     model_kwargs = dict(patch_size=16, embed_dim=896, depth=12, num_heads=14, init_values=1e-5)
-     model = _create_vision_transformer(
-         'vit_base_patch16_plus_240', pretrained=pretrained, **dict(model_kwargs, **kwargs))
-     return model
-
-
- @register_model
- def vit_base_patch16_rpn_224(pretrained=False, **kwargs):
-     """ ViT-Base (ViT-B/16) w/ residual post-norm
-     """
-     model_kwargs = dict(
-         patch_size=16, embed_dim=768, depth=12, num_heads=12, qkv_bias=False, init_values=1e-5,
-         class_token=False, block_fn=ResPostBlock, global_pool='avg')
-     model = _create_vision_transformer(
-         'vit_base_patch16_rpn_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
-     return model
-
-
- @register_model
- def vit_small_patch16_36x1_224(pretrained=False, **kwargs):
-     """ ViT-Small w/ LayerScale + 36 x 1 (36 block serial) config. Experimental, may remove.
-     Based on `Three things everyone should know about Vision Transformers` - https://arxiv.org/abs/2203.09795
-     Paper focuses on 24x2 + 48x1 for 'Small' width but those are extremely slow.
-     """
-     model_kwargs = dict(patch_size=16, embed_dim=384, depth=36, num_heads=6, init_values=1e-5)
-     model = _create_vision_transformer(
-         'vit_small_patch16_36x1_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
-     return model
-
-
- @register_model
- def vit_small_patch16_18x2_224(pretrained=False, **kwargs):
-     """ ViT-Small w/ LayerScale + 18 x 2 (36 block parallel) config. Experimental, may remove.
-     Based on `Three things everyone should know about Vision Transformers` - https://arxiv.org/abs/2203.09795
-     Paper focuses on 24x2 + 48x1 for 'Small' width but those are extremely slow.
-     """
-     model_kwargs = dict(
-         patch_size=16, embed_dim=384, depth=18, num_heads=6, init_values=1e-5, block_fn=ParallelBlock)
-     model = _create_vision_transformer(
-         'vit_small_patch16_18x2_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
-     return model
-
-
- @register_model
- def vit_base_patch16_18x2_224(pretrained=False, **kwargs):
-     """ ViT-Base w/ LayerScale + 18 x 2 (36 block parallel) config. Experimental, may remove.
-     Based on `Three things everyone should know about Vision Transformers` - https://arxiv.org/abs/2203.09795
-     """
-     model_kwargs = dict(patch_size=16, embed_dim=768, depth=18, num_heads=12, init_values=1e-5, block_fn=ParallelBlock)
-     model = _create_vision_transformer(
-         'vit_base_patch16_18x2_224', pretrained=pretrained, **dict(model_kwargs, **kwargs))
-     return model
-
-
- @register_model
- def eva_large_patch14_196(pretrained=False, **kwargs):
-     """ EVA-large model https://arxiv.org/abs/2211.07636 via MAE MIM pretrain"""
-     model_kwargs = dict(patch_size=14, embed_dim=1024, depth=24, num_heads=16, global_pool='avg')
-     model = _create_vision_transformer(
-         'eva_large_patch14_196', pretrained=pretrained, **dict(model_kwargs, **kwargs))
-     return model
-
-
- @register_model
- def eva_large_patch14_336(pretrained=False, **kwargs):
-     """ EVA-large model https://arxiv.org/abs/2211.07636 via MAE MIM pretrain"""
-     model_kwargs = dict(patch_size=14, embed_dim=1024, depth=24, num_heads=16, global_pool='avg')
-     model = _create_vision_transformer('eva_large_patch14_336', pretrained=pretrained, **dict(model_kwargs, **kwargs))
-     return model
-
-
- @register_model
- def flexivit_small(pretrained=False, **kwargs):
-     """ FlexiViT-Small
-     """
-     model_kwargs = dict(patch_size=16, embed_dim=384, depth=12, num_heads=6, no_embed_class=True)
-     model = _create_vision_transformer('flexivit_small', pretrained=pretrained, **dict(model_kwargs, **kwargs))
-     return model
-
-
- @register_model
- def flexivit_base(pretrained=False, **kwargs):
-     """ FlexiViT-Base
-     """
-     model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, no_embed_class=True)
-     model = _create_vision_transformer('flexivit_base', pretrained=pretrained, **dict(model_kwargs, **kwargs))
-     return model
-
-
- @register_model
- def flexivit_large(pretrained=False, **kwargs):
-     """ FlexiViT-Large
-     """
-     model_kwargs = dict(patch_size=16, embed_dim=1024, depth=24, num_heads=16, no_embed_class=True)
-     model = _create_vision_transformer('flexivit_large', pretrained=pretrained, **dict(model_kwargs, **kwargs))
-     return model
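FlexiViT's premise is patch-size flexibility: the kwargs merge in each entrypoint lets patch_size be overridden at creation time, and the FIXME in _create_vision_transformer notes that pretrained patch embeddings get resized with bilinear interpolation. A sketch of the override under that assumption, with pretrained weight resizing left untested here:

import timm

# hypothetical override of the default 16x16 patches; 240/30 gives a clean 8x8 grid
model = timm.create_model('flexivit_base', pretrained=False, patch_size=30)
print(model.patch_embed.proj.kernel_size)  # (30, 30)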