pbarker committed on
Commit 7a91659 · verified · 1 Parent(s): 11845ba

Upload folder using huggingface_hub

Files changed (49)
  1. .gitattributes +1 -0
  2. added_tokens.json +428 -0
  3. config.json +33 -0
  4. config_molmo.py +60 -0
  5. generation_config.json +6 -0
  6. global_step412/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
  7. global_step412/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +3 -0
  8. global_step412/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3 -0
  9. global_step412/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +3 -0
  10. global_step412/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +3 -0
  11. global_step412/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +3 -0
  12. global_step412/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +3 -0
  13. global_step412/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3 -0
  14. global_step412/zero_pp_rank_0_mp_rank_00_model_states.pt +3 -0
  15. global_step412/zero_pp_rank_1_mp_rank_00_model_states.pt +3 -0
  16. global_step412/zero_pp_rank_2_mp_rank_00_model_states.pt +3 -0
  17. global_step412/zero_pp_rank_3_mp_rank_00_model_states.pt +3 -0
  18. global_step412/zero_pp_rank_4_mp_rank_00_model_states.pt +3 -0
  19. global_step412/zero_pp_rank_5_mp_rank_00_model_states.pt +3 -0
  20. global_step412/zero_pp_rank_6_mp_rank_00_model_states.pt +3 -0
  21. global_step412/zero_pp_rank_7_mp_rank_00_model_states.pt +3 -0
  22. image_preprocessing_molmo.py +546 -0
  23. latest +1 -0
  24. merges.txt +0 -0
  25. model-00001-of-00004.safetensors +3 -0
  26. model-00002-of-00004.safetensors +3 -0
  27. model-00003-of-00004.safetensors +3 -0
  28. model-00004-of-00004.safetensors +3 -0
  29. model.safetensors.index.json +592 -0
  30. modeling_molmo.py +2367 -0
  31. preprocessing_molmo.py +192 -0
  32. preprocessor_config.json +32 -0
  33. processor_config.json +6 -0
  34. rng_state_0.pth +3 -0
  35. rng_state_1.pth +3 -0
  36. rng_state_2.pth +3 -0
  37. rng_state_3.pth +3 -0
  38. rng_state_4.pth +3 -0
  39. rng_state_5.pth +3 -0
  40. rng_state_6.pth +3 -0
  41. rng_state_7.pth +3 -0
  42. sft_args.json +302 -0
  43. special_tokens_map.json +435 -0
  44. tokenizer.json +3 -0
  45. tokenizer_config.json +3853 -0
  46. trainer_state.json +890 -0
  47. training_args.bin +3 -0
  48. vocab.json +0 -0
  49. zero_to_fp32.py +760 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
added_tokens.json ADDED
@@ -0,0 +1,428 @@
1
+ {
2
+ "<im_col>": 152067,
3
+ "<im_end>": 152065,
4
+ "<im_patch>": 152066,
5
+ "<im_start>": 152064,
6
+ "<|endoftext|>": 151643,
7
+ "<|im_end|>": 151645,
8
+ "<|im_start|>": 151644,
9
+ "<|image|>": 152068,
10
+ "|<EXTRA_TOKENS_0>|": 151646,
11
+ "|<EXTRA_TOKENS_100>|": 151746,
12
+ "|<EXTRA_TOKENS_101>|": 151747,
13
+ "|<EXTRA_TOKENS_102>|": 151748,
14
+ "|<EXTRA_TOKENS_103>|": 151749,
15
+ "|<EXTRA_TOKENS_104>|": 151750,
16
+ "|<EXTRA_TOKENS_105>|": 151751,
17
+ "|<EXTRA_TOKENS_106>|": 151752,
18
+ "|<EXTRA_TOKENS_107>|": 151753,
19
+ "|<EXTRA_TOKENS_108>|": 151754,
20
+ "|<EXTRA_TOKENS_109>|": 151755,
21
+ "|<EXTRA_TOKENS_10>|": 151656,
22
+ "|<EXTRA_TOKENS_110>|": 151756,
23
+ "|<EXTRA_TOKENS_111>|": 151757,
24
+ "|<EXTRA_TOKENS_112>|": 151758,
25
+ "|<EXTRA_TOKENS_113>|": 151759,
26
+ "|<EXTRA_TOKENS_114>|": 151760,
27
+ "|<EXTRA_TOKENS_115>|": 151761,
28
+ "|<EXTRA_TOKENS_116>|": 151762,
29
+ "|<EXTRA_TOKENS_117>|": 151763,
30
+ "|<EXTRA_TOKENS_118>|": 151764,
31
+ "|<EXTRA_TOKENS_119>|": 151765,
32
+ "|<EXTRA_TOKENS_11>|": 151657,
33
+ "|<EXTRA_TOKENS_120>|": 151766,
34
+ "|<EXTRA_TOKENS_121>|": 151767,
35
+ "|<EXTRA_TOKENS_122>|": 151768,
36
+ "|<EXTRA_TOKENS_123>|": 151769,
37
+ "|<EXTRA_TOKENS_124>|": 151770,
38
+ "|<EXTRA_TOKENS_125>|": 151771,
39
+ "|<EXTRA_TOKENS_126>|": 151772,
40
+ "|<EXTRA_TOKENS_127>|": 151773,
41
+ "|<EXTRA_TOKENS_128>|": 151774,
42
+ "|<EXTRA_TOKENS_129>|": 151775,
43
+ "|<EXTRA_TOKENS_12>|": 151658,
44
+ "|<EXTRA_TOKENS_130>|": 151776,
45
+ "|<EXTRA_TOKENS_131>|": 151777,
46
+ "|<EXTRA_TOKENS_132>|": 151778,
47
+ "|<EXTRA_TOKENS_133>|": 151779,
48
+ "|<EXTRA_TOKENS_134>|": 151780,
49
+ "|<EXTRA_TOKENS_135>|": 151781,
50
+ "|<EXTRA_TOKENS_136>|": 151782,
51
+ "|<EXTRA_TOKENS_137>|": 151783,
52
+ "|<EXTRA_TOKENS_138>|": 151784,
53
+ "|<EXTRA_TOKENS_139>|": 151785,
54
+ "|<EXTRA_TOKENS_13>|": 151659,
55
+ "|<EXTRA_TOKENS_140>|": 151786,
56
+ "|<EXTRA_TOKENS_141>|": 151787,
57
+ "|<EXTRA_TOKENS_142>|": 151788,
58
+ "|<EXTRA_TOKENS_143>|": 151789,
59
+ "|<EXTRA_TOKENS_144>|": 151790,
60
+ "|<EXTRA_TOKENS_145>|": 151791,
61
+ "|<EXTRA_TOKENS_146>|": 151792,
62
+ "|<EXTRA_TOKENS_147>|": 151793,
63
+ "|<EXTRA_TOKENS_148>|": 151794,
64
+ "|<EXTRA_TOKENS_149>|": 151795,
65
+ "|<EXTRA_TOKENS_14>|": 151660,
66
+ "|<EXTRA_TOKENS_150>|": 151796,
67
+ "|<EXTRA_TOKENS_151>|": 151797,
68
+ "|<EXTRA_TOKENS_152>|": 151798,
69
+ "|<EXTRA_TOKENS_153>|": 151799,
70
+ "|<EXTRA_TOKENS_154>|": 151800,
71
+ "|<EXTRA_TOKENS_155>|": 151801,
72
+ "|<EXTRA_TOKENS_156>|": 151802,
73
+ "|<EXTRA_TOKENS_157>|": 151803,
74
+ "|<EXTRA_TOKENS_158>|": 151804,
75
+ "|<EXTRA_TOKENS_159>|": 151805,
76
+ "|<EXTRA_TOKENS_15>|": 151661,
77
+ "|<EXTRA_TOKENS_160>|": 151806,
78
+ "|<EXTRA_TOKENS_161>|": 151807,
79
+ "|<EXTRA_TOKENS_162>|": 151808,
80
+ "|<EXTRA_TOKENS_163>|": 151809,
81
+ "|<EXTRA_TOKENS_164>|": 151810,
82
+ "|<EXTRA_TOKENS_165>|": 151811,
83
+ "|<EXTRA_TOKENS_166>|": 151812,
84
+ "|<EXTRA_TOKENS_167>|": 151813,
85
+ "|<EXTRA_TOKENS_168>|": 151814,
86
+ "|<EXTRA_TOKENS_169>|": 151815,
87
+ "|<EXTRA_TOKENS_16>|": 151662,
88
+ "|<EXTRA_TOKENS_170>|": 151816,
89
+ "|<EXTRA_TOKENS_171>|": 151817,
90
+ "|<EXTRA_TOKENS_172>|": 151818,
91
+ "|<EXTRA_TOKENS_173>|": 151819,
92
+ "|<EXTRA_TOKENS_174>|": 151820,
93
+ "|<EXTRA_TOKENS_175>|": 151821,
94
+ "|<EXTRA_TOKENS_176>|": 151822,
95
+ "|<EXTRA_TOKENS_177>|": 151823,
96
+ "|<EXTRA_TOKENS_178>|": 151824,
97
+ "|<EXTRA_TOKENS_179>|": 151825,
98
+ "|<EXTRA_TOKENS_17>|": 151663,
99
+ "|<EXTRA_TOKENS_180>|": 151826,
100
+ "|<EXTRA_TOKENS_181>|": 151827,
101
+ "|<EXTRA_TOKENS_182>|": 151828,
102
+ "|<EXTRA_TOKENS_183>|": 151829,
103
+ "|<EXTRA_TOKENS_184>|": 151830,
104
+ "|<EXTRA_TOKENS_185>|": 151831,
105
+ "|<EXTRA_TOKENS_186>|": 151832,
106
+ "|<EXTRA_TOKENS_187>|": 151833,
107
+ "|<EXTRA_TOKENS_188>|": 151834,
108
+ "|<EXTRA_TOKENS_189>|": 151835,
109
+ "|<EXTRA_TOKENS_18>|": 151664,
110
+ "|<EXTRA_TOKENS_190>|": 151836,
111
+ "|<EXTRA_TOKENS_191>|": 151837,
112
+ "|<EXTRA_TOKENS_192>|": 151838,
113
+ "|<EXTRA_TOKENS_193>|": 151839,
114
+ "|<EXTRA_TOKENS_194>|": 151840,
115
+ "|<EXTRA_TOKENS_195>|": 151841,
116
+ "|<EXTRA_TOKENS_196>|": 151842,
117
+ "|<EXTRA_TOKENS_197>|": 151843,
118
+ "|<EXTRA_TOKENS_198>|": 151844,
119
+ "|<EXTRA_TOKENS_199>|": 151845,
120
+ "|<EXTRA_TOKENS_19>|": 151665,
121
+ "|<EXTRA_TOKENS_1>|": 151647,
122
+ "|<EXTRA_TOKENS_200>|": 151846,
123
+ "|<EXTRA_TOKENS_201>|": 151847,
124
+ "|<EXTRA_TOKENS_202>|": 151848,
125
+ "|<EXTRA_TOKENS_203>|": 151849,
126
+ "|<EXTRA_TOKENS_204>|": 151850,
127
+ "|<EXTRA_TOKENS_205>|": 151851,
128
+ "|<EXTRA_TOKENS_206>|": 151852,
129
+ "|<EXTRA_TOKENS_207>|": 151853,
130
+ "|<EXTRA_TOKENS_208>|": 151854,
131
+ "|<EXTRA_TOKENS_209>|": 151855,
132
+ "|<EXTRA_TOKENS_20>|": 151666,
133
+ "|<EXTRA_TOKENS_210>|": 151856,
134
+ "|<EXTRA_TOKENS_211>|": 151857,
135
+ "|<EXTRA_TOKENS_212>|": 151858,
136
+ "|<EXTRA_TOKENS_213>|": 151859,
137
+ "|<EXTRA_TOKENS_214>|": 151860,
138
+ "|<EXTRA_TOKENS_215>|": 151861,
139
+ "|<EXTRA_TOKENS_216>|": 151862,
140
+ "|<EXTRA_TOKENS_217>|": 151863,
141
+ "|<EXTRA_TOKENS_218>|": 151864,
142
+ "|<EXTRA_TOKENS_219>|": 151865,
143
+ "|<EXTRA_TOKENS_21>|": 151667,
144
+ "|<EXTRA_TOKENS_220>|": 151866,
145
+ "|<EXTRA_TOKENS_221>|": 151867,
146
+ "|<EXTRA_TOKENS_222>|": 151868,
147
+ "|<EXTRA_TOKENS_223>|": 151869,
148
+ "|<EXTRA_TOKENS_224>|": 151870,
149
+ "|<EXTRA_TOKENS_225>|": 151871,
150
+ "|<EXTRA_TOKENS_226>|": 151872,
151
+ "|<EXTRA_TOKENS_227>|": 151873,
152
+ "|<EXTRA_TOKENS_228>|": 151874,
153
+ "|<EXTRA_TOKENS_229>|": 151875,
154
+ "|<EXTRA_TOKENS_22>|": 151668,
155
+ "|<EXTRA_TOKENS_230>|": 151876,
156
+ "|<EXTRA_TOKENS_231>|": 151877,
157
+ "|<EXTRA_TOKENS_232>|": 151878,
158
+ "|<EXTRA_TOKENS_233>|": 151879,
159
+ "|<EXTRA_TOKENS_234>|": 151880,
160
+ "|<EXTRA_TOKENS_235>|": 151881,
161
+ "|<EXTRA_TOKENS_236>|": 151882,
162
+ "|<EXTRA_TOKENS_237>|": 151883,
163
+ "|<EXTRA_TOKENS_238>|": 151884,
164
+ "|<EXTRA_TOKENS_239>|": 151885,
165
+ "|<EXTRA_TOKENS_23>|": 151669,
166
+ "|<EXTRA_TOKENS_240>|": 151886,
167
+ "|<EXTRA_TOKENS_241>|": 151887,
168
+ "|<EXTRA_TOKENS_242>|": 151888,
169
+ "|<EXTRA_TOKENS_243>|": 151889,
170
+ "|<EXTRA_TOKENS_244>|": 151890,
171
+ "|<EXTRA_TOKENS_245>|": 151891,
172
+ "|<EXTRA_TOKENS_246>|": 151892,
173
+ "|<EXTRA_TOKENS_247>|": 151893,
174
+ "|<EXTRA_TOKENS_248>|": 151894,
175
+ "|<EXTRA_TOKENS_249>|": 151895,
176
+ "|<EXTRA_TOKENS_24>|": 151670,
177
+ "|<EXTRA_TOKENS_250>|": 151896,
178
+ "|<EXTRA_TOKENS_251>|": 151897,
179
+ "|<EXTRA_TOKENS_252>|": 151898,
180
+ "|<EXTRA_TOKENS_253>|": 151899,
181
+ "|<EXTRA_TOKENS_254>|": 151900,
182
+ "|<EXTRA_TOKENS_255>|": 151901,
183
+ "|<EXTRA_TOKENS_256>|": 151902,
184
+ "|<EXTRA_TOKENS_257>|": 151903,
185
+ "|<EXTRA_TOKENS_258>|": 151904,
186
+ "|<EXTRA_TOKENS_259>|": 151905,
187
+ "|<EXTRA_TOKENS_25>|": 151671,
188
+ "|<EXTRA_TOKENS_260>|": 151906,
189
+ "|<EXTRA_TOKENS_261>|": 151907,
190
+ "|<EXTRA_TOKENS_262>|": 151908,
191
+ "|<EXTRA_TOKENS_263>|": 151909,
192
+ "|<EXTRA_TOKENS_264>|": 151910,
193
+ "|<EXTRA_TOKENS_265>|": 151911,
194
+ "|<EXTRA_TOKENS_266>|": 151912,
195
+ "|<EXTRA_TOKENS_267>|": 151913,
196
+ "|<EXTRA_TOKENS_268>|": 151914,
197
+ "|<EXTRA_TOKENS_269>|": 151915,
198
+ "|<EXTRA_TOKENS_26>|": 151672,
199
+ "|<EXTRA_TOKENS_270>|": 151916,
200
+ "|<EXTRA_TOKENS_271>|": 151917,
201
+ "|<EXTRA_TOKENS_272>|": 151918,
202
+ "|<EXTRA_TOKENS_273>|": 151919,
203
+ "|<EXTRA_TOKENS_274>|": 151920,
204
+ "|<EXTRA_TOKENS_275>|": 151921,
205
+ "|<EXTRA_TOKENS_276>|": 151922,
206
+ "|<EXTRA_TOKENS_277>|": 151923,
207
+ "|<EXTRA_TOKENS_278>|": 151924,
208
+ "|<EXTRA_TOKENS_279>|": 151925,
209
+ "|<EXTRA_TOKENS_27>|": 151673,
210
+ "|<EXTRA_TOKENS_280>|": 151926,
211
+ "|<EXTRA_TOKENS_281>|": 151927,
212
+ "|<EXTRA_TOKENS_282>|": 151928,
213
+ "|<EXTRA_TOKENS_283>|": 151929,
214
+ "|<EXTRA_TOKENS_284>|": 151930,
215
+ "|<EXTRA_TOKENS_285>|": 151931,
216
+ "|<EXTRA_TOKENS_286>|": 151932,
217
+ "|<EXTRA_TOKENS_287>|": 151933,
218
+ "|<EXTRA_TOKENS_288>|": 151934,
219
+ "|<EXTRA_TOKENS_289>|": 151935,
220
+ "|<EXTRA_TOKENS_28>|": 151674,
221
+ "|<EXTRA_TOKENS_290>|": 151936,
222
+ "|<EXTRA_TOKENS_291>|": 151937,
223
+ "|<EXTRA_TOKENS_292>|": 151938,
224
+ "|<EXTRA_TOKENS_293>|": 151939,
225
+ "|<EXTRA_TOKENS_294>|": 151940,
226
+ "|<EXTRA_TOKENS_295>|": 151941,
227
+ "|<EXTRA_TOKENS_296>|": 151942,
228
+ "|<EXTRA_TOKENS_297>|": 151943,
229
+ "|<EXTRA_TOKENS_298>|": 151944,
230
+ "|<EXTRA_TOKENS_299>|": 151945,
231
+ "|<EXTRA_TOKENS_29>|": 151675,
232
+ "|<EXTRA_TOKENS_2>|": 151648,
233
+ "|<EXTRA_TOKENS_300>|": 151946,
234
+ "|<EXTRA_TOKENS_301>|": 151947,
235
+ "|<EXTRA_TOKENS_302>|": 151948,
236
+ "|<EXTRA_TOKENS_303>|": 151949,
237
+ "|<EXTRA_TOKENS_304>|": 151950,
238
+ "|<EXTRA_TOKENS_305>|": 151951,
239
+ "|<EXTRA_TOKENS_306>|": 151952,
240
+ "|<EXTRA_TOKENS_307>|": 151953,
241
+ "|<EXTRA_TOKENS_308>|": 151954,
242
+ "|<EXTRA_TOKENS_309>|": 151955,
243
+ "|<EXTRA_TOKENS_30>|": 151676,
244
+ "|<EXTRA_TOKENS_310>|": 151956,
245
+ "|<EXTRA_TOKENS_311>|": 151957,
246
+ "|<EXTRA_TOKENS_312>|": 151958,
247
+ "|<EXTRA_TOKENS_313>|": 151959,
248
+ "|<EXTRA_TOKENS_314>|": 151960,
249
+ "|<EXTRA_TOKENS_315>|": 151961,
250
+ "|<EXTRA_TOKENS_316>|": 151962,
251
+ "|<EXTRA_TOKENS_317>|": 151963,
252
+ "|<EXTRA_TOKENS_318>|": 151964,
253
+ "|<EXTRA_TOKENS_319>|": 151965,
254
+ "|<EXTRA_TOKENS_31>|": 151677,
255
+ "|<EXTRA_TOKENS_320>|": 151966,
256
+ "|<EXTRA_TOKENS_321>|": 151967,
257
+ "|<EXTRA_TOKENS_322>|": 151968,
258
+ "|<EXTRA_TOKENS_323>|": 151969,
259
+ "|<EXTRA_TOKENS_324>|": 151970,
260
+ "|<EXTRA_TOKENS_325>|": 151971,
261
+ "|<EXTRA_TOKENS_326>|": 151972,
262
+ "|<EXTRA_TOKENS_327>|": 151973,
263
+ "|<EXTRA_TOKENS_328>|": 151974,
264
+ "|<EXTRA_TOKENS_329>|": 151975,
265
+ "|<EXTRA_TOKENS_32>|": 151678,
266
+ "|<EXTRA_TOKENS_330>|": 151976,
267
+ "|<EXTRA_TOKENS_331>|": 151977,
268
+ "|<EXTRA_TOKENS_332>|": 151978,
269
+ "|<EXTRA_TOKENS_333>|": 151979,
270
+ "|<EXTRA_TOKENS_334>|": 151980,
271
+ "|<EXTRA_TOKENS_335>|": 151981,
272
+ "|<EXTRA_TOKENS_336>|": 151982,
273
+ "|<EXTRA_TOKENS_337>|": 151983,
274
+ "|<EXTRA_TOKENS_338>|": 151984,
275
+ "|<EXTRA_TOKENS_339>|": 151985,
276
+ "|<EXTRA_TOKENS_33>|": 151679,
277
+ "|<EXTRA_TOKENS_340>|": 151986,
278
+ "|<EXTRA_TOKENS_341>|": 151987,
279
+ "|<EXTRA_TOKENS_342>|": 151988,
280
+ "|<EXTRA_TOKENS_343>|": 151989,
281
+ "|<EXTRA_TOKENS_344>|": 151990,
282
+ "|<EXTRA_TOKENS_345>|": 151991,
283
+ "|<EXTRA_TOKENS_346>|": 151992,
284
+ "|<EXTRA_TOKENS_347>|": 151993,
285
+ "|<EXTRA_TOKENS_348>|": 151994,
286
+ "|<EXTRA_TOKENS_349>|": 151995,
287
+ "|<EXTRA_TOKENS_34>|": 151680,
288
+ "|<EXTRA_TOKENS_350>|": 151996,
289
+ "|<EXTRA_TOKENS_351>|": 151997,
290
+ "|<EXTRA_TOKENS_352>|": 151998,
291
+ "|<EXTRA_TOKENS_353>|": 151999,
292
+ "|<EXTRA_TOKENS_354>|": 152000,
293
+ "|<EXTRA_TOKENS_355>|": 152001,
294
+ "|<EXTRA_TOKENS_356>|": 152002,
295
+ "|<EXTRA_TOKENS_357>|": 152003,
296
+ "|<EXTRA_TOKENS_358>|": 152004,
297
+ "|<EXTRA_TOKENS_359>|": 152005,
298
+ "|<EXTRA_TOKENS_35>|": 151681,
299
+ "|<EXTRA_TOKENS_360>|": 152006,
300
+ "|<EXTRA_TOKENS_361>|": 152007,
301
+ "|<EXTRA_TOKENS_362>|": 152008,
302
+ "|<EXTRA_TOKENS_363>|": 152009,
303
+ "|<EXTRA_TOKENS_364>|": 152010,
304
+ "|<EXTRA_TOKENS_365>|": 152011,
305
+ "|<EXTRA_TOKENS_366>|": 152012,
306
+ "|<EXTRA_TOKENS_367>|": 152013,
307
+ "|<EXTRA_TOKENS_368>|": 152014,
308
+ "|<EXTRA_TOKENS_369>|": 152015,
309
+ "|<EXTRA_TOKENS_36>|": 151682,
310
+ "|<EXTRA_TOKENS_370>|": 152016,
311
+ "|<EXTRA_TOKENS_371>|": 152017,
312
+ "|<EXTRA_TOKENS_372>|": 152018,
313
+ "|<EXTRA_TOKENS_373>|": 152019,
314
+ "|<EXTRA_TOKENS_374>|": 152020,
315
+ "|<EXTRA_TOKENS_375>|": 152021,
316
+ "|<EXTRA_TOKENS_376>|": 152022,
317
+ "|<EXTRA_TOKENS_377>|": 152023,
318
+ "|<EXTRA_TOKENS_378>|": 152024,
319
+ "|<EXTRA_TOKENS_379>|": 152025,
320
+ "|<EXTRA_TOKENS_37>|": 151683,
321
+ "|<EXTRA_TOKENS_380>|": 152026,
322
+ "|<EXTRA_TOKENS_381>|": 152027,
323
+ "|<EXTRA_TOKENS_382>|": 152028,
324
+ "|<EXTRA_TOKENS_383>|": 152029,
325
+ "|<EXTRA_TOKENS_384>|": 152030,
326
+ "|<EXTRA_TOKENS_385>|": 152031,
327
+ "|<EXTRA_TOKENS_386>|": 152032,
328
+ "|<EXTRA_TOKENS_387>|": 152033,
329
+ "|<EXTRA_TOKENS_388>|": 152034,
330
+ "|<EXTRA_TOKENS_389>|": 152035,
331
+ "|<EXTRA_TOKENS_38>|": 151684,
332
+ "|<EXTRA_TOKENS_390>|": 152036,
333
+ "|<EXTRA_TOKENS_391>|": 152037,
334
+ "|<EXTRA_TOKENS_392>|": 152038,
335
+ "|<EXTRA_TOKENS_393>|": 152039,
336
+ "|<EXTRA_TOKENS_394>|": 152040,
337
+ "|<EXTRA_TOKENS_395>|": 152041,
338
+ "|<EXTRA_TOKENS_396>|": 152042,
339
+ "|<EXTRA_TOKENS_397>|": 152043,
340
+ "|<EXTRA_TOKENS_398>|": 152044,
341
+ "|<EXTRA_TOKENS_399>|": 152045,
342
+ "|<EXTRA_TOKENS_39>|": 151685,
343
+ "|<EXTRA_TOKENS_3>|": 151649,
344
+ "|<EXTRA_TOKENS_400>|": 152046,
345
+ "|<EXTRA_TOKENS_401>|": 152047,
346
+ "|<EXTRA_TOKENS_402>|": 152048,
347
+ "|<EXTRA_TOKENS_403>|": 152049,
348
+ "|<EXTRA_TOKENS_404>|": 152050,
349
+ "|<EXTRA_TOKENS_405>|": 152051,
350
+ "|<EXTRA_TOKENS_406>|": 152052,
351
+ "|<EXTRA_TOKENS_407>|": 152053,
352
+ "|<EXTRA_TOKENS_408>|": 152054,
353
+ "|<EXTRA_TOKENS_409>|": 152055,
354
+ "|<EXTRA_TOKENS_40>|": 151686,
355
+ "|<EXTRA_TOKENS_410>|": 152056,
356
+ "|<EXTRA_TOKENS_411>|": 152057,
357
+ "|<EXTRA_TOKENS_412>|": 152058,
358
+ "|<EXTRA_TOKENS_413>|": 152059,
359
+ "|<EXTRA_TOKENS_414>|": 152060,
360
+ "|<EXTRA_TOKENS_415>|": 152061,
361
+ "|<EXTRA_TOKENS_416>|": 152062,
362
+ "|<EXTRA_TOKENS_417>|": 152063,
363
+ "|<EXTRA_TOKENS_41>|": 151687,
364
+ "|<EXTRA_TOKENS_42>|": 151688,
365
+ "|<EXTRA_TOKENS_43>|": 151689,
366
+ "|<EXTRA_TOKENS_44>|": 151690,
367
+ "|<EXTRA_TOKENS_45>|": 151691,
368
+ "|<EXTRA_TOKENS_46>|": 151692,
369
+ "|<EXTRA_TOKENS_47>|": 151693,
370
+ "|<EXTRA_TOKENS_48>|": 151694,
371
+ "|<EXTRA_TOKENS_49>|": 151695,
372
+ "|<EXTRA_TOKENS_4>|": 151650,
373
+ "|<EXTRA_TOKENS_50>|": 151696,
374
+ "|<EXTRA_TOKENS_51>|": 151697,
375
+ "|<EXTRA_TOKENS_52>|": 151698,
376
+ "|<EXTRA_TOKENS_53>|": 151699,
377
+ "|<EXTRA_TOKENS_54>|": 151700,
378
+ "|<EXTRA_TOKENS_55>|": 151701,
379
+ "|<EXTRA_TOKENS_56>|": 151702,
380
+ "|<EXTRA_TOKENS_57>|": 151703,
381
+ "|<EXTRA_TOKENS_58>|": 151704,
382
+ "|<EXTRA_TOKENS_59>|": 151705,
383
+ "|<EXTRA_TOKENS_5>|": 151651,
384
+ "|<EXTRA_TOKENS_60>|": 151706,
385
+ "|<EXTRA_TOKENS_61>|": 151707,
386
+ "|<EXTRA_TOKENS_62>|": 151708,
387
+ "|<EXTRA_TOKENS_63>|": 151709,
388
+ "|<EXTRA_TOKENS_64>|": 151710,
389
+ "|<EXTRA_TOKENS_65>|": 151711,
390
+ "|<EXTRA_TOKENS_66>|": 151712,
391
+ "|<EXTRA_TOKENS_67>|": 151713,
392
+ "|<EXTRA_TOKENS_68>|": 151714,
393
+ "|<EXTRA_TOKENS_69>|": 151715,
394
+ "|<EXTRA_TOKENS_6>|": 151652,
395
+ "|<EXTRA_TOKENS_70>|": 151716,
396
+ "|<EXTRA_TOKENS_71>|": 151717,
397
+ "|<EXTRA_TOKENS_72>|": 151718,
398
+ "|<EXTRA_TOKENS_73>|": 151719,
399
+ "|<EXTRA_TOKENS_74>|": 151720,
400
+ "|<EXTRA_TOKENS_75>|": 151721,
401
+ "|<EXTRA_TOKENS_76>|": 151722,
402
+ "|<EXTRA_TOKENS_77>|": 151723,
403
+ "|<EXTRA_TOKENS_78>|": 151724,
404
+ "|<EXTRA_TOKENS_79>|": 151725,
405
+ "|<EXTRA_TOKENS_7>|": 151653,
406
+ "|<EXTRA_TOKENS_80>|": 151726,
407
+ "|<EXTRA_TOKENS_81>|": 151727,
408
+ "|<EXTRA_TOKENS_82>|": 151728,
409
+ "|<EXTRA_TOKENS_83>|": 151729,
410
+ "|<EXTRA_TOKENS_84>|": 151730,
411
+ "|<EXTRA_TOKENS_85>|": 151731,
412
+ "|<EXTRA_TOKENS_86>|": 151732,
413
+ "|<EXTRA_TOKENS_87>|": 151733,
414
+ "|<EXTRA_TOKENS_88>|": 151734,
415
+ "|<EXTRA_TOKENS_89>|": 151735,
416
+ "|<EXTRA_TOKENS_8>|": 151654,
417
+ "|<EXTRA_TOKENS_90>|": 151736,
418
+ "|<EXTRA_TOKENS_91>|": 151737,
419
+ "|<EXTRA_TOKENS_92>|": 151738,
420
+ "|<EXTRA_TOKENS_93>|": 151739,
421
+ "|<EXTRA_TOKENS_94>|": 151740,
422
+ "|<EXTRA_TOKENS_95>|": 151741,
423
+ "|<EXTRA_TOKENS_96>|": 151742,
424
+ "|<EXTRA_TOKENS_97>|": 151743,
425
+ "|<EXTRA_TOKENS_98>|": 151744,
426
+ "|<EXTRA_TOKENS_99>|": 151745,
427
+ "|<EXTRA_TOKENS_9>|": 151655
428
+ }
config.json ADDED
@@ -0,0 +1,33 @@
+ {
+ "_name_or_path": "/root/.cache/huggingface/hub/models--pbarker--ComputerBase-v0.1-M-3epoch/snapshots/915dc4d8809028264506a79c6e71387c8a26aa35",
+ "architectures": [
+ "MolmoForCausalLM"
+ ],
+ "attention_layer_norm": false,
+ "auto_map": {
+ "AutoConfig": "config_molmo.MolmoConfig",
+ "AutoModelForCausalLM": "modeling_molmo.MolmoForCausalLM"
+ },
+ "clip_qkv": null,
+ "embedding_size": 152064,
+ "hidden_size": 3584,
+ "initializer_range": 0.02,
+ "intermediate_size": 37888,
+ "layer_norm_eps": 1e-06,
+ "layer_norm_type": "rms",
+ "max_position_embeddings": 4096,
+ "model_type": "molmo",
+ "norm_after": false,
+ "num_attention_heads": 28,
+ "num_hidden_layers": 28,
+ "num_key_value_heads": 4,
+ "qkv_bias": true,
+ "rope_theta": 1000000.0,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.47.1",
+ "use_cache": true,
+ "use_position_ids": true,
+ "vocab_size": 152064,
+ "weight_tying": false
+ }
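
A minimal loading sketch (illustrative, not part of the commit): because auto_map points AutoConfig and AutoModelForCausalLM at the custom config_molmo.py / modeling_molmo.py modules, the checkpoint has to be loaded with trust_remote_code=True. The repository id below is a placeholder.

    from transformers import AutoModelForCausalLM, AutoProcessor

    repo = "pbarker/<this-repo>"  # placeholder; substitute the actual repository name
    model = AutoModelForCausalLM.from_pretrained(
        repo,
        trust_remote_code=True,   # resolves config_molmo.MolmoConfig / modeling_molmo.MolmoForCausalLM
        torch_dtype="auto",       # config.json specifies bfloat16
    )
    processor = AutoProcessor.from_pretrained(repo, trust_remote_code=True)
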
config_molmo.py ADDED
@@ -0,0 +1,60 @@
+ from typing import List
+
+ from transformers import PretrainedConfig, AutoTokenizer
+
+
+ class MolmoConfig(PretrainedConfig):
+ model_type = "molmo"
+ keys_to_ignore_at_inference = ["past_key_values"]
+
+ def __init__(
+ self,
+ vocab_size=50304,
+ embedding_size=50304,
+ hidden_size=4096,
+ intermediate_size=11008,
+ num_hidden_layers=32,
+ num_attention_heads=32,
+ num_key_value_heads=None,
+ max_position_embeddings=2048,
+ initializer_range=0.02,
+ use_cache=True,
+ layer_norm_eps: float = 1e-5,
+ rope_theta=10000.0,
+ clip_qkv=None,
+ qkv_bias: bool = False,
+ weight_tying: bool = False,
+ use_position_ids: bool=True,
+ tie_word_embeddings: bool=True,
+ attention_layer_norm: bool=False,
+ norm_after: bool = False,
+ layer_norm_type: str="rms",
+ **kwargs,
+ ):
+ self.vocab_size = vocab_size
+ self.embedding_size = embedding_size
+ self.max_position_embeddings = max_position_embeddings
+ self.hidden_size = hidden_size
+ self.intermediate_size = intermediate_size
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.layer_norm_eps = layer_norm_eps
+ self.weight_tying = weight_tying
+ self.use_position_ids = use_position_ids
+ self.attention_layer_norm = attention_layer_norm
+ self.num_key_value_heads = num_key_value_heads
+ self.initializer_range = initializer_range
+ self.use_cache = use_cache
+ self.rope_theta = rope_theta
+ self.clip_qkv = clip_qkv
+ self.qkv_bias = qkv_bias
+ self.norm_after = norm_after
+ self.tie_word_embeddings = tie_word_embeddings
+ self.layer_norm_type = layer_norm_type
+
+ super().__init__(
+ tie_word_embeddings=tie_word_embeddings,
+ **kwargs,
+ )
+
+ MolmoConfig.register_for_auto_class()
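
For reference, an illustrative construction of this config class with the values that config.json in this commit ships (the class defaults above differ, so the relevant fields are overridden explicitly):

    config = MolmoConfig(
        vocab_size=152064,
        embedding_size=152064,
        hidden_size=3584,
        intermediate_size=37888,
        num_hidden_layers=28,
        num_attention_heads=28,
        num_key_value_heads=4,
        max_position_embeddings=4096,
        layer_norm_eps=1e-6,
        layer_norm_type="rms",
        rope_theta=1000000.0,
        qkv_bias=True,
        tie_word_embeddings=False,
    )
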
generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+ "eos_token_id": 151643,
+ "max_new_tokens": 2048,
+ "pad_token_id": 151643,
+ "transformers_version": "4.47.1"
+ }
global_step412/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:234760e1ab29f222f2af1ae5fa66e7d1ca9a91fd0a2c569d17ebf5ba4188f78e
+ size 12031542784
global_step412/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:69f3d6b1f0a357d8a10328ee4a5fd4ccd7ca23f3ed2e107b1a455a87ddde29e5
+ size 12031542784
global_step412/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6c5a05c596fa5ec381c0b0bbaadb93ea6fa0deebbb3a7eb10ec006d9a1a3ecb3
+ size 12031542784
global_step412/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:441f71e09bd1ead621e4a0c8e709aa039149eb884a97ba6812a6f33ea3b614b2
+ size 12031542784
global_step412/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7205e4226c5b4e55dbf5c6fd2cafaf53c14c0c7737b45b0d5e59231ab630f209
+ size 12031542784
global_step412/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:88329221dacbe4dc3f2d55aa1f19498d48f3dc9e8458c67670f1c7a488a7db43
+ size 12031542784
global_step412/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:58103b7810deb16532f5e06c81d05b4181baf1bb61e8ab62832c23e959d195ce
+ size 12031542784
global_step412/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5497eceffeb58e079a239af29b687b6df1f1124e08809718f03aa7648cd99c51
+ size 12031542784
global_step412/zero_pp_rank_0_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2e9ea986416373bd24e5175cebc60278e072c0b67dc0a72c18456d1bcb36d0da
+ size 328563
global_step412/zero_pp_rank_1_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8354ea1e5ccefb48cdf18bd9006343f1cdcfb637ba5368efc1041079517464d2
+ size 328563
global_step412/zero_pp_rank_2_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:50627b8a21f8fbd0072a80f576737eace787095e07ac906ef57e33ba15a675d5
+ size 328563
global_step412/zero_pp_rank_3_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:98cb72b8672ff264dc5044eb1bfdf0d9d076ce33a6c957f4b400f4133dfa6104
+ size 328563
global_step412/zero_pp_rank_4_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:61cadc80a73bb54a82b2d907853ec419a9565dbea3cdbe439d866d62a3827f6b
+ size 328563
global_step412/zero_pp_rank_5_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9b8117239a0ed06722fc8df0ce89498835f1cffd9f08cb634b5cf58f6dfcbd9b
+ size 328563
global_step412/zero_pp_rank_6_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:49bb6fa62d1627bd34748f03249b227ac633ce21cffa4b024d7f100b1f1a3a50
+ size 328563
global_step412/zero_pp_rank_7_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7aa3f91a7b354c710c864623f970f0b4bc84c3a22a81685a3d53112621f16563
+ size 328563
image_preprocessing_molmo.py ADDED
@@ -0,0 +1,546 @@
1
+ """Image processor class for Molmo"""
2
+ from typing import List, Optional, Union, Mapping
3
+
4
+ import numpy as np
5
+ import einops
6
+ import torch
7
+ import torchvision.transforms
8
+ from torchvision.transforms import InterpolationMode
9
+ from torchvision.transforms.functional import convert_image_dtype
10
+
11
+ from transformers.image_utils import (
12
+ OPENAI_CLIP_MEAN,
13
+ OPENAI_CLIP_STD,
14
+ ImageInput,
15
+ is_valid_image,
16
+ )
17
+ from transformers.processing_utils import ImagesKwargs
18
+ from transformers.image_processing_utils import BaseImageProcessor
19
+ from transformers.utils import logging
20
+
21
+
22
+ logger = logging.get_logger(__name__)
23
+
24
+
25
+ def pad_to_bounding_box(
26
+ image, offset_height, offset_width, target_height,
27
+ target_width, value=0
28
+ ):
29
+ height, width = image.shape[:2]
30
+ after_padding_width = target_width - offset_width - width
31
+ after_padding_height = target_height - offset_height - height
32
+ return np.pad(image, [
33
+ [offset_height, after_padding_height],
34
+ [offset_width, after_padding_width],
35
+ [0, 0]
36
+ ], constant_values=value)
37
+
38
+
39
+ def normalize_image(image, offset, scale):
40
+ image -= np.array(offset, dtype=np.float32)[None, None, :]
41
+ image /= np.array(scale, dtype=np.float32)[None, None, :]
42
+ return image
43
+
44
+
45
+ def resize_and_pad(
46
+ image,
47
+ desired_output_size,
48
+ resize_method="torch-bilinear",
49
+ pad_value=0,
50
+ normalize=True,
51
+ image_mean=OPENAI_CLIP_MEAN,
52
+ image_std=OPENAI_CLIP_STD,
53
+ ):
54
+ desired_height, desired_width = desired_output_size
55
+ height, width = image.shape[:2]
56
+
57
+ # Cast into float32 since the training code did this in float32 and it (very rarely) affects
58
+ # the results after rounding.
59
+ image_scale_y = np.array(desired_height, np.float32) / np.array(height, np.float32)
60
+ image_scale_x = np.array(desired_width, np.float32) / np.array(width, np.float32)
61
+ image_scale = min(image_scale_x, image_scale_y)
62
+ scaled_height = int(np.array(height, np.float32) * image_scale)
63
+ scaled_width = int(np.array(width, np.float32) * image_scale)
64
+
65
+ if resize_method == "tensorflow":
66
+ # This is how the original training code did resizing; it can produce slightly different
67
+ # results than using torch resize, so we keep it just in case
68
+ import tensorflow as tf
69
+ image = tf.image.convert_image_dtype(tf.constant(image), dtype=tf.float32)
70
+ image = tf.image.resize(
71
+ image,
72
+ [scaled_height, scaled_width],
73
+ method=tf.image.ResizeMethod.BILINEAR,
74
+ antialias=True,
75
+ )
76
+ image = tf.clip_by_value(image, 0.0, 1.0)
77
+ image = image.numpy()
78
+ elif resize_method == "torch-bilinear":
79
+ image = torch.permute(torch.from_numpy(image), [2, 0, 1])
80
+ image = convert_image_dtype(image) # resize in float32 to match the training code
81
+ image = torchvision.transforms.Resize(
82
+ [scaled_height, scaled_width], InterpolationMode.BILINEAR, antialias=True
83
+ )(image)
84
+ image = torch.clip(image, 0.0, 1.0)
85
+ image = torch.permute(image, [1, 2, 0]).numpy()
86
+ else:
87
+ raise NotImplementedError(resize_method)
88
+
89
+ top_pad = (desired_height - scaled_height) // 2
90
+ left_pad = (desired_width - scaled_width) // 2
91
+ padding = [
92
+ [top_pad, desired_height - scaled_height - top_pad],
93
+ [left_pad, desired_width - scaled_width - left_pad],
94
+ [0, 0]
95
+ ]
96
+ image_mask = np.pad(np.ones_like(image[:, :, 0], dtype=bool), padding[:2])
97
+ image = np.pad(image, padding, constant_values=pad_value)
98
+ if normalize:
99
+ image = normalize_image(image, offset=image_mean, scale=image_std)
100
+ return image, image_mask
101
+
102
+
103
+ def select_tiling(h, w, patch_size, max_num_patches):
104
+ """Decide how best to divide in image of size [w, h] in up to max_num_patches of size patch_size"""
105
+ original_size = np.stack([h, w]) # [1, 2]
106
+ original_res = h * w
107
+ tilings = []
108
+ for i in range(1, max_num_patches+1):
109
+ for j in range(1, max_num_patches+1):
110
+ if i*j <= max_num_patches:
111
+ tilings.append((i, j))
112
+ # sort so argmin and argmax favour smaller tilings in the event of a tie
113
+ tilings.sort(key=lambda x: (x[0]*x[1], x[0]))
114
+ candidate_tilings = np.array(tilings, dtype=np.int32) # [n_resolutions, 2]
115
+ candidate_resolutions = candidate_tilings * patch_size # [n_resolutions, 2]
116
+
117
+ # How much we would need to scale the image to fit exactly in each tiling
118
+ original_size = np.stack([h, w], dtype=np.float32) # [1, 2]
119
+ required_scale_d = candidate_resolutions.astype(np.float32) / original_size
120
+ required_scale = np.min(required_scale_d, axis=-1, keepdims=True) # [n_resolutions, 1]
121
+ if np.all(required_scale < 1):
122
+ # We are forced to downscale, so try to minimize the amount of downscaling
123
+ ix = np.argmax(required_scale)
124
+ else:
125
+ # Pick the resolution that required the least upscaling so that it most closely fits the image
126
+ required_scale = np.where(required_scale < 1.0, 10e9, required_scale)
127
+ ix = np.argmin(required_scale)
128
+ return candidate_tilings[ix]
129
+
130
+
131
+ class MolmoImagesKwargs(ImagesKwargs, total=False):
132
+ max_crops: Optional[int]
133
+ overlap_margins: Optional[List[int]]
134
+ base_image_input_size: Optional[List[int]]
135
+ image_token_length_w: Optional[int]
136
+ image_token_length_h: Optional[int]
137
+ image_patch_size: Optional[int]
138
+ image_padding_mask: Optional[bool]
139
+
140
+
141
+ class MolmoImageProcessor(BaseImageProcessor):
142
+ """Preprocess images and multi-model inputs"""
143
+
144
+ def __init__(
145
+ self,
146
+ max_crops: int = 12,
147
+ overlap_margins: List[int] = (4, 4),
148
+ base_image_input_size: List[int] = (336, 336),
149
+ image_token_length_w: int = 12,
150
+ image_token_length_h: int = 12,
151
+ image_patch_size: int = 14,
152
+ image_padding_mask: bool = True,
153
+ do_normalize: bool = True,
154
+ image_mean: Optional[Union[float, List[float]]] = None,
155
+ image_std: Optional[Union[float, List[float]]] = None,
156
+ **kwargs,
157
+ ):
158
+ super().__init__(**kwargs)
159
+ self.max_crops = max_crops
160
+ self.overlap_margins = overlap_margins
161
+ self.base_image_input_size = base_image_input_size
162
+ self.image_token_length_w = image_token_length_w
163
+ self.image_token_length_h = image_token_length_h
164
+ self.image_patch_size = image_patch_size
165
+ self.image_padding_mask = image_padding_mask
166
+ self.do_normalize = do_normalize
167
+ self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
168
+ self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
169
+
170
+ def image_to_patches_and_tokens(
171
+ self,
172
+ image: ImageInput,
173
+ image_patch_token_id: int,
174
+ image_col_token_id: int,
175
+ image_start_token_id: int,
176
+ image_end_token_id: int,
177
+ max_crops: Optional[int] = None,
178
+ overlap_margins: Optional[List[int]] = None,
179
+ base_image_input_size: Optional[Union[int, List[int]]] = None,
180
+ image_token_length_w: Optional[int] = None,
181
+ image_token_length_h: Optional[int] = None,
182
+ image_patch_size: Optional[int] = None,
183
+ ):
184
+ if isinstance(base_image_input_size, int):
185
+ base_image_input_size = (base_image_input_size, base_image_input_size)
186
+
187
+ base_image_input_d = image_patch_size
188
+ tokens_per_image = image_token_length_w * image_token_length_h
189
+ image_base_patch_w = base_image_input_size[1] // base_image_input_d
190
+ image_base_patch_h = base_image_input_size[0] // base_image_input_d
191
+
192
+ original_image_h, original_image_w = image.shape[:2]
193
+ crop_size = base_image_input_size[0]
194
+
195
+ # Discard this many patches from the (left/top, right/bottom) of crops
196
+ left_margin, right_margin = overlap_margins
197
+ # left_margin, right_margin = 2, 2
198
+ assert left_margin % 2 == 0 # Required for compatibility with 2x2 pooling
199
+ total_margin_pixels = base_image_input_d*(right_margin + left_margin) # pixels removed per dim
200
+ crop_patches = base_image_input_size[0] // base_image_input_d # patches per crop dim
201
+ crop_window_patches = crop_patches - (right_margin + left_margin) # usable patches
202
+ crop_window_size = crop_window_patches * base_image_input_d
203
+ tiling = select_tiling(
204
+ original_image_h - total_margin_pixels,
205
+ original_image_w - total_margin_pixels,
206
+ crop_window_size,
207
+ max_crops
208
+ )
209
+ src, img_mask = resize_and_pad(
210
+ image,
211
+ [tiling[0]*crop_window_size+total_margin_pixels, tiling[1]*crop_window_size+total_margin_pixels]
212
+ )
213
+
214
+ # Now we have to split the image into crops, while keeping track of how each patch in
215
+ # each crop should be ordered in the global image; this requires a lot of tricky bookkeeping
216
+ n_crops = tiling[0] * tiling[1]
217
+ patches_arr = []
218
+ mask_arr = []
219
+ patch_ordering_arr = []
220
+
221
+ # We assume 2x2 pooling, but can allow padding the right/bottom with extra
222
+ # patches if the number of patches per side is not even
223
+ assert (crop_patches+1)//2 == image_token_length_h
224
+ assert (crop_patches+1)//2 == image_token_length_w
225
+ on = 0
226
+ on_patch = 0
227
+ for i in range(tiling[0]):
228
+ y0 = i*crop_window_size
229
+ if i == 0:
230
+ crop_y0 = 0
231
+ else:
232
+ crop_y0 = left_margin // 2
233
+
234
+ crop_h = image_base_patch_h - (right_margin + left_margin)
235
+ if i == 0:
236
+ crop_h += left_margin
237
+ if i == (tiling[0]-1):
238
+ crop_h += right_margin
239
+ for j in range(tiling[1]):
240
+ x0 = j*crop_window_size
241
+ if j == 0:
242
+ crop_x0 = 0
243
+ else:
244
+ crop_x0 = left_margin // 2
245
+
246
+ crop_w = image_base_patch_w - (right_margin + left_margin)
247
+ if j == 0:
248
+ crop_w += left_margin
249
+ if j == (tiling[1]-1):
250
+ crop_w += right_margin
251
+
252
+ pooled_w = (crop_w + 1) // 2
253
+ pooled_h = (crop_h + 1) // 2
254
+ patch_ordering_arr.append(
255
+ pad_to_bounding_box(
256
+ np.reshape(np.arange(on, on+pooled_h*pooled_w, dtype=np.int32), (pooled_h, pooled_w, 1)),
257
+ crop_y0, crop_x0, image_token_length_h, image_token_length_w, value=-1
258
+ )[:, :, 0]
259
+ )
260
+ patches_arr.append(src[y0:y0+crop_size, x0:x0+crop_size])
261
+ mask_arr.append(img_mask[y0:y0+crop_size, x0:x0+crop_size])
262
+
263
+ on += pooled_h*pooled_w
264
+ on_patch += 1
265
+ patches = np.stack(patches_arr)
266
+ patch_ordering = np.stack(patch_ordering_arr)
267
+ img_mask = np.stack(mask_arr)
268
+
269
+ # Switch to [n_crops, n_patches, pixels_per_patch] format
270
+ image_layout_impatch_w, image_layout_impatch_h = tiling[0], tiling[1]
271
+ patches = einops.rearrange(
272
+ patches, 'p (h dh) (w dw) c -> p (h w) (dh dw c)',
273
+ dh=base_image_input_d,
274
+ dw=base_image_input_d,
275
+ h=image_base_patch_h,
276
+ w=image_base_patch_w
277
+ )
278
+ img_mask = einops.rearrange(
279
+ img_mask, 'p (h dh) (w dw) -> p (h w) (dh dw)',
280
+ dh=base_image_input_d,
281
+ dw=base_image_input_d,
282
+ h=image_base_patch_h,
283
+ w=image_base_patch_w
284
+ )
285
+
286
+ img_mask = img_mask.astype(np.float32).mean(axis=-1)
287
+ patch_ordering = np.reshape(patch_ordering, [-1])
288
+ valid = patch_ordering >= 0
289
+
290
+ # Transpose order, to get left-to-right order instead of crop-by-crop order
291
+ patch_ordering_rh = np.reshape(
292
+ patch_ordering,
293
+ [tiling[0], tiling[1], image_token_length_h, image_token_length_w]
294
+ )
295
+ patch_ordering_rh = np.transpose(patch_ordering_rh, [0, 2, 1, 3])
296
+ patch_ordering_rh = np.reshape(patch_ordering_rh, [-1])
297
+
298
+ # The transpose will screw up which patches are masked, project the
299
+ # new order into sparse structure of `patch_ordering` to fix this
300
+ patch_ordering[valid] = patch_ordering_rh[patch_ordering_rh >= 0]
301
+
302
+ # Now build the output tokens
303
+ h = tiling[0] * crop_window_patches + (right_margin+left_margin)
304
+ w = tiling[1] * crop_window_patches + (right_margin+left_margin)
305
+ per_row = np.full(
306
+ ((w+1)//2,),
307
+ image_patch_token_id,
308
+ )
309
+ per_row = np.concatenate([per_row, [image_col_token_id]], 0)
310
+
311
+ joint = np.tile(per_row, [(h+1)//2])
312
+ joint = [
313
+ [image_start_token_id],
314
+ joint,
315
+ [image_end_token_id]
316
+ ]
317
+
318
+ # Finally do the same for the global image
319
+ resized, _ = resize_and_pad(image, base_image_input_size)
320
+ resized = einops.rearrange(
321
+ resized, '(h dh) (w dw) c -> (h w) (dh dw c)',
322
+ dh=base_image_input_d,
323
+ dw=base_image_input_d,
324
+ h=image_base_patch_h,
325
+ w=image_base_patch_w
326
+ )
327
+ patches = np.concatenate([np.expand_dims(resized, 0), patches], 0)
328
+
329
+ # Global image goes first, so the order of patches in previous crops gets increased
330
+ patch_ordering = np.where(
331
+ patch_ordering >= 0,
332
+ patch_ordering + tokens_per_image,
333
+ -1
334
+ )
335
+ patch_ordering = np.concatenate([np.arange(0, tokens_per_image), patch_ordering], 0)
336
+ per_row = np.full(
337
+ (image_token_length_w,),
338
+ image_patch_token_id,
339
+ )
340
+ per_row = np.concatenate([per_row, [image_col_token_id]], 0)
341
+ extra_tokens = np.tile(per_row, [image_token_length_h])
342
+ joint = [
343
+ [image_start_token_id],
344
+ extra_tokens,
345
+ [image_end_token_id],
346
+ ] + joint
347
+
348
+ joint = np.concatenate(joint, 0)
349
+ img_mask = np.pad(img_mask, [[0, 1], [0, 0]], constant_values=-1)
350
+ return patches, joint, patch_ordering, img_mask
351
+
352
+ def build_image_input_idx(
353
+ self,
354
+ image_tokens: np.ndarray,
355
+ patch_order: np.ndarray,
356
+ image_patch_token_id: int,
357
+ no_image: Optional[bool] = None,
358
+ image_token_length_w: Optional[int] = None,
359
+ image_token_length_h: Optional[int] = None,
360
+ ):
361
+ """Converts `patch_order` into a mapping of token_id -> patch_id"""
362
+
363
+ tokens_per_image = image_token_length_w * image_token_length_h
364
+ if no_image is not None and no_image:
365
+ return np.zeros((0, tokens_per_image), np.int32)
366
+
367
+ # Indices to insert the patches
368
+ image_input_idx = image_tokens == image_patch_token_id
369
+ image_input_idx = np.nonzero(image_input_idx)[0].astype(np.int32)
370
+
371
+ if patch_order is not None:
372
+ n_tokens = image_input_idx.shape[0]
373
+ patch_order = np.reshape(patch_order, [-1])
374
+ n_patches = patch_order.shape[0]
375
+
376
+ valid = patch_order >= 0
377
+ n_valid_patches = valid.sum()
378
+ assert len(image_input_idx) == n_valid_patches
379
+
380
+ sorted_patch_ixs = np.zeros([n_tokens], np.int32)
381
+ sorted_patch_ixs[patch_order[valid]] = np.arange(n_valid_patches, dtype=np.int32)
382
+
383
+ # Project the inverted mapping into same sparse structure
384
+ sorted_patch_ixs_ex = np.full(np.shape(patch_order), -1)
385
+ sorted_patch_ixs_ex[valid] = sorted_patch_ixs
386
+
387
+ # Do the gather and then re-mask outputs that were masked in `sorted_patch_ixs`
388
+ valid = (sorted_patch_ixs_ex >= 0).astype(np.int32)
389
+ image_input_idx = image_input_idx[sorted_patch_ixs_ex*valid]
390
+ image_input_idx = image_input_idx*valid - 100*(1 - valid)
391
+ image_input_idx = np.reshape(image_input_idx, [-1, tokens_per_image])
392
+ return image_input_idx
393
+
394
+ def preprocess(
395
+ self,
396
+ image: np.ndarray,
397
+ image_patch_token_id: int,
398
+ image_col_token_id: int,
399
+ image_start_token_id: int,
400
+ image_end_token_id: int,
401
+ max_crops: Optional[int] = None,
402
+ overlap_margins: Optional[List[int]] = None,
403
+ base_image_input_size: Optional[Union[int, List[int]]] = None,
404
+ image_token_length_w: Optional[int] = None,
405
+ image_token_length_h: Optional[int] = None,
406
+ image_patch_size: Optional[int] = None,
407
+ **kwargs,
408
+ ):
409
+ """Preprocesses an image
410
+
411
+ Returns:
412
+ crops: (n_crops, n_patches, patch_dim) individual crops, `n_crops` might
413
+ change between images but the other dimensions are fixed
414
+ tokens: (n_tokens,) int32 tokens, pad tokens indicate where to insert the
415
+ patch features, might include other special tokens as well
416
+ image_idx: (n_crops, n_patches) index in `tokens` to put the patch features from the
417
+ crops after pooling, negative values indicate patch features to exclude
418
+ padding_mask: (n_crops, n_patches) what percent of each crop is padding, can be None
419
+ if the image mask is not being used.
420
+ """
421
+
422
+ max_crops = max_crops or self.max_crops
423
+ overlap_margins = overlap_margins or self.overlap_margins
424
+ base_image_input_size = base_image_input_size or self.base_image_input_size
425
+ image_token_length_w = image_token_length_w or self.image_token_length_w
426
+ image_token_length_h = image_token_length_h or self.image_token_length_h
427
+ image_patch_size = image_patch_size or self.image_patch_size
428
+
429
+ crops, image_tokens, patch_ordering, img_mask = self.image_to_patches_and_tokens(
430
+ image,
431
+ image_patch_token_id,
432
+ image_col_token_id,
433
+ image_start_token_id,
434
+ image_end_token_id,
435
+ max_crops,
436
+ overlap_margins,
437
+ base_image_input_size,
438
+ image_token_length_w,
439
+ image_token_length_h,
440
+ image_patch_size,
441
+ )
442
+ patch_idx = self.build_image_input_idx(
443
+ image_tokens,
444
+ patch_ordering,
445
+ image_patch_token_id,
446
+ image_token_length_w=image_token_length_w,
447
+ image_token_length_h=image_token_length_h,
448
+ )
449
+ return crops, image_tokens, patch_idx, img_mask
450
+
451
+ def multimodal_preprocess(
452
+ self,
453
+ images: np.ndarray,
454
+ tokens: List[int],
455
+ image_idx: np.ndarray,
456
+ sequence_length: int,
457
+ image_patch_token_id: int,
458
+ image_col_token_id: int,
459
+ image_start_token_id: int,
460
+ image_end_token_id: int,
461
+ **kwargs,
462
+ ):
463
+ """Merge images and text tokens into multi-modal features for the model
464
+
465
+ :param images: images to use as input
466
+ :param tokens: input text tokens
467
+ :param image_idx: where to insert the images into `tokens`
468
+ :params image_patch_token_id: id to use of tokens that will contain image features
469
+ :params image_col_token_id: token id for image column special tokens
470
+ :params image_start_token_id: token id for image start special tokens
471
+ :params image_end_token_id: token id for image end special tokens
472
+ :params kwargs: override preprocessor default args
473
+ """
474
+ max_total_crops = kwargs.get("max_crops") or self.max_crops
475
+ image_token_length_w = kwargs.get("image_token_length_w") or self.image_token_length_w
476
+ image_token_length_h = kwargs.get("image_token_length_h") or self.image_token_length_h
477
+ image_patch_size = kwargs.get("image_patch_size") or self.image_patch_size
478
+ base_image_input_size = kwargs.get("base_image_input_size") or self.base_image_input_size
479
+ image_num_patch = (
480
+ base_image_input_size[0] // image_patch_size,
481
+ base_image_input_size[1] // image_patch_size,
482
+ )
483
+ image_padding_mask = kwargs.get("image_padding_mask") or self.image_padding_mask
484
+
485
+ tokens_per_image = image_token_length_w * image_token_length_h
486
+ n_pixels = image_patch_size * image_patch_size * 3
487
+ n_patches = image_num_patch[0] * image_num_patch[1]
488
+
489
+ if images is None:
490
+ return {
491
+ "input_ids": tokens,
492
+ }
493
+ else:
494
+ n = len(images)
495
+ all_crops = []
496
+ all_image_idx = []
497
+ out_tokens = []
498
+ all_crop_masks = []
499
+
500
+ for ix in range(n):
501
+ token_ix = image_idx[ix]
502
+ crops, image_tokens, patch_idx, img_mask = self.preprocess(
503
+ images[ix],
504
+ image_patch_token_id,
505
+ image_col_token_id,
506
+ image_start_token_id,
507
+ image_end_token_id,
508
+ **kwargs,
509
+ )
510
+
511
+ if token_ix == -1: # -1 is an image inserted at the very start
512
+ start = 0
513
+ token_ix = 0
514
+ end = 0
515
+ else:
516
+ start = 0 if ix == 0 else image_idx[ix-1] + 1
517
+ end = token_ix + 1
518
+
519
+ all_image_idx.append(patch_idx + token_ix)
520
+ all_crops.append(crops)
521
+ out_tokens.append(tokens[start:token_ix])
522
+ out_tokens.append(image_tokens)
523
+ if ix == (n - 1):
524
+ out_tokens.append(tokens[end:])
525
+ if image_padding_mask:
526
+ all_crop_masks.append(img_mask)
527
+
528
+ input_ids = np.concatenate(out_tokens, 0)
529
+ images = np.concatenate(all_crops, 0)
530
+ image_input_idx = np.concatenate(all_image_idx, 0)
531
+ if image_padding_mask:
532
+ image_masks = np.concatenate(all_crop_masks, 0)
533
+ else:
534
+ image_masks = None
535
+
536
+ out = {
537
+ "input_ids": input_ids,
538
+ "images": images,
539
+ "image_input_idx": image_input_idx
540
+ }
541
+ if image_masks is not None:
542
+ out["image_masks"] = image_masks
543
+ return out
544
+
545
+
546
+ MolmoImageProcessor.register_for_auto_class()
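
A rough usage sketch of the processor above (illustrative; assumes the module and its numpy/torch/torchvision/einops dependencies are importable; the token ids are the Molmo special tokens from added_tokens.json):

    import numpy as np

    proc = MolmoImageProcessor()                     # defaults: 336x336 crops, 14px patches, up to 12 crops
    image = np.zeros((480, 640, 3), dtype=np.uint8)  # any HxWx3 image
    crops, image_tokens, patch_idx, img_mask = proc.preprocess(
        image,
        image_patch_token_id=152066,  # <im_patch>
        image_col_token_id=152067,    # <im_col>
        image_start_token_id=152064,  # <im_start>
        image_end_token_id=152065,    # <im_end>
    )
    # crops: (n_crops, 576, 588) flattened 14x14x3 patches (global view first);
    # image_tokens hold the placeholder special tokens, and patch_idx says where the
    # pooled patch features get spliced into the text sequence
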
latest ADDED
@@ -0,0 +1 @@
+ global_step412
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a64680d5cfae52d6e47f4b5457ba03badc95879615c052ecd7ed6379cce51397
+ size 4981346544
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:89430b3d75c77933d8124e7286e936cc69b74d99bff38f35baa7056fafdd90f6
+ size 4991475304
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:54e51ce161091a621052e11bb9d274636dcda751d8a300ec22de26cef998a253
+ size 4169357528
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:14f6cbc2ec98dd95fb05cde73a27887b6376cd060dc4a460f8f64bd45cb3d90d
+ size 1899952568
model.safetensors.index.json ADDED
@@ -0,0 +1,592 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 16042050560
4
+ },
5
+ "weight_map": {
6
+ "model.transformer.blocks.0.att_proj.bias": "model-00001-of-00004.safetensors",
7
+ "model.transformer.blocks.0.att_proj.weight": "model-00001-of-00004.safetensors",
8
+ "model.transformer.blocks.0.attn_norm.weight": "model-00001-of-00004.safetensors",
9
+ "model.transformer.blocks.0.attn_out.weight": "model-00001-of-00004.safetensors",
10
+ "model.transformer.blocks.0.ff_norm.weight": "model-00001-of-00004.safetensors",
11
+ "model.transformer.blocks.0.ff_out.weight": "model-00001-of-00004.safetensors",
12
+ "model.transformer.blocks.0.ff_proj.weight": "model-00001-of-00004.safetensors",
13
+ "model.transformer.blocks.1.att_proj.bias": "model-00001-of-00004.safetensors",
14
+ "model.transformer.blocks.1.att_proj.weight": "model-00001-of-00004.safetensors",
15
+ "model.transformer.blocks.1.attn_norm.weight": "model-00001-of-00004.safetensors",
16
+ "model.transformer.blocks.1.attn_out.weight": "model-00001-of-00004.safetensors",
17
+ "model.transformer.blocks.1.ff_norm.weight": "model-00001-of-00004.safetensors",
18
+ "model.transformer.blocks.1.ff_out.weight": "model-00001-of-00004.safetensors",
19
+ "model.transformer.blocks.1.ff_proj.weight": "model-00001-of-00004.safetensors",
20
+ "model.transformer.blocks.10.att_proj.bias": "model-00002-of-00004.safetensors",
21
+ "model.transformer.blocks.10.att_proj.weight": "model-00002-of-00004.safetensors",
22
+ "model.transformer.blocks.10.attn_norm.weight": "model-00002-of-00004.safetensors",
23
+ "model.transformer.blocks.10.attn_out.weight": "model-00002-of-00004.safetensors",
24
+ "model.transformer.blocks.10.ff_norm.weight": "model-00002-of-00004.safetensors",
25
+ "model.transformer.blocks.10.ff_out.weight": "model-00002-of-00004.safetensors",
26
+ "model.transformer.blocks.10.ff_proj.weight": "model-00002-of-00004.safetensors",
27
+ "model.transformer.blocks.11.att_proj.bias": "model-00002-of-00004.safetensors",
28
+ "model.transformer.blocks.11.att_proj.weight": "model-00002-of-00004.safetensors",
29
+ "model.transformer.blocks.11.attn_norm.weight": "model-00002-of-00004.safetensors",
30
+ "model.transformer.blocks.11.attn_out.weight": "model-00002-of-00004.safetensors",
31
+ "model.transformer.blocks.11.ff_norm.weight": "model-00002-of-00004.safetensors",
32
+ "model.transformer.blocks.11.ff_out.weight": "model-00002-of-00004.safetensors",
33
+ "model.transformer.blocks.11.ff_proj.weight": "model-00002-of-00004.safetensors",
34
+ "model.transformer.blocks.12.att_proj.bias": "model-00002-of-00004.safetensors",
35
+ "model.transformer.blocks.12.att_proj.weight": "model-00002-of-00004.safetensors",
36
+ "model.transformer.blocks.12.attn_norm.weight": "model-00002-of-00004.safetensors",
37
+ "model.transformer.blocks.12.attn_out.weight": "model-00002-of-00004.safetensors",
38
+ "model.transformer.blocks.12.ff_norm.weight": "model-00002-of-00004.safetensors",
39
+ "model.transformer.blocks.12.ff_out.weight": "model-00002-of-00004.safetensors",
40
+ "model.transformer.blocks.12.ff_proj.weight": "model-00002-of-00004.safetensors",
41
+ "model.transformer.blocks.13.att_proj.bias": "model-00002-of-00004.safetensors",
42
+ "model.transformer.blocks.13.att_proj.weight": "model-00002-of-00004.safetensors",
43
+ "model.transformer.blocks.13.attn_norm.weight": "model-00002-of-00004.safetensors",
44
+ "model.transformer.blocks.13.attn_out.weight": "model-00002-of-00004.safetensors",
45
+ "model.transformer.blocks.13.ff_norm.weight": "model-00002-of-00004.safetensors",
46
+ "model.transformer.blocks.13.ff_out.weight": "model-00002-of-00004.safetensors",
47
+ "model.transformer.blocks.13.ff_proj.weight": "model-00002-of-00004.safetensors",
48
+ "model.transformer.blocks.14.att_proj.bias": "model-00002-of-00004.safetensors",
49
+ "model.transformer.blocks.14.att_proj.weight": "model-00002-of-00004.safetensors",
50
+ "model.transformer.blocks.14.attn_norm.weight": "model-00002-of-00004.safetensors",
51
+ "model.transformer.blocks.14.attn_out.weight": "model-00002-of-00004.safetensors",
52
+ "model.transformer.blocks.14.ff_norm.weight": "model-00002-of-00004.safetensors",
53
+ "model.transformer.blocks.14.ff_out.weight": "model-00002-of-00004.safetensors",
54
+ "model.transformer.blocks.14.ff_proj.weight": "model-00002-of-00004.safetensors",
55
+ "model.transformer.blocks.15.att_proj.bias": "model-00002-of-00004.safetensors",
56
+ "model.transformer.blocks.15.att_proj.weight": "model-00002-of-00004.safetensors",
57
+ "model.transformer.blocks.15.attn_norm.weight": "model-00002-of-00004.safetensors",
58
+ "model.transformer.blocks.15.attn_out.weight": "model-00002-of-00004.safetensors",
59
+ "model.transformer.blocks.15.ff_norm.weight": "model-00002-of-00004.safetensors",
60
+ "model.transformer.blocks.15.ff_out.weight": "model-00002-of-00004.safetensors",
61
+ "model.transformer.blocks.15.ff_proj.weight": "model-00002-of-00004.safetensors",
62
+ "model.transformer.blocks.16.att_proj.bias": "model-00002-of-00004.safetensors",
63
+ "model.transformer.blocks.16.att_proj.weight": "model-00002-of-00004.safetensors",
64
+ "model.transformer.blocks.16.attn_norm.weight": "model-00002-of-00004.safetensors",
65
+ "model.transformer.blocks.16.attn_out.weight": "model-00002-of-00004.safetensors",
66
+ "model.transformer.blocks.16.ff_norm.weight": "model-00002-of-00004.safetensors",
67
+ "model.transformer.blocks.16.ff_out.weight": "model-00002-of-00004.safetensors",
68
+ "model.transformer.blocks.16.ff_proj.weight": "model-00002-of-00004.safetensors",
69
+ "model.transformer.blocks.17.att_proj.bias": "model-00002-of-00004.safetensors",
70
+ "model.transformer.blocks.17.att_proj.weight": "model-00002-of-00004.safetensors",
71
+ "model.transformer.blocks.17.attn_norm.weight": "model-00002-of-00004.safetensors",
72
+ "model.transformer.blocks.17.attn_out.weight": "model-00002-of-00004.safetensors",
73
+ "model.transformer.blocks.17.ff_norm.weight": "model-00002-of-00004.safetensors",
74
+ "model.transformer.blocks.17.ff_out.weight": "model-00002-of-00004.safetensors",
75
+ "model.transformer.blocks.17.ff_proj.weight": "model-00002-of-00004.safetensors",
76
+ "model.transformer.blocks.18.att_proj.bias": "model-00002-of-00004.safetensors",
77
+ "model.transformer.blocks.18.att_proj.weight": "model-00002-of-00004.safetensors",
78
+ "model.transformer.blocks.18.attn_norm.weight": "model-00002-of-00004.safetensors",
79
+ "model.transformer.blocks.18.attn_out.weight": "model-00002-of-00004.safetensors",
80
+ "model.transformer.blocks.18.ff_norm.weight": "model-00002-of-00004.safetensors",
81
+ "model.transformer.blocks.18.ff_out.weight": "model-00002-of-00004.safetensors",
82
+ "model.transformer.blocks.18.ff_proj.weight": "model-00002-of-00004.safetensors",
83
+ "model.transformer.blocks.19.att_proj.bias": "model-00003-of-00004.safetensors",
84
+ "model.transformer.blocks.19.att_proj.weight": "model-00003-of-00004.safetensors",
85
+ "model.transformer.blocks.19.attn_norm.weight": "model-00003-of-00004.safetensors",
86
+ "model.transformer.blocks.19.attn_out.weight": "model-00002-of-00004.safetensors",
87
+ "model.transformer.blocks.19.ff_norm.weight": "model-00003-of-00004.safetensors",
88
+ "model.transformer.blocks.19.ff_out.weight": "model-00003-of-00004.safetensors",
89
+ "model.transformer.blocks.19.ff_proj.weight": "model-00003-of-00004.safetensors",
90
+ "model.transformer.blocks.2.att_proj.bias": "model-00001-of-00004.safetensors",
91
+ "model.transformer.blocks.2.att_proj.weight": "model-00001-of-00004.safetensors",
92
+ "model.transformer.blocks.2.attn_norm.weight": "model-00001-of-00004.safetensors",
93
+ "model.transformer.blocks.2.attn_out.weight": "model-00001-of-00004.safetensors",
94
+ "model.transformer.blocks.2.ff_norm.weight": "model-00001-of-00004.safetensors",
95
+ "model.transformer.blocks.2.ff_out.weight": "model-00001-of-00004.safetensors",
96
+ "model.transformer.blocks.2.ff_proj.weight": "model-00001-of-00004.safetensors",
97
+ "model.transformer.blocks.20.att_proj.bias": "model-00003-of-00004.safetensors",
98
+ "model.transformer.blocks.20.att_proj.weight": "model-00003-of-00004.safetensors",
99
+ "model.transformer.blocks.20.attn_norm.weight": "model-00003-of-00004.safetensors",
100
+ "model.transformer.blocks.20.attn_out.weight": "model-00003-of-00004.safetensors",
101
+ "model.transformer.blocks.20.ff_norm.weight": "model-00003-of-00004.safetensors",
102
+ "model.transformer.blocks.20.ff_out.weight": "model-00003-of-00004.safetensors",
103
+ "model.transformer.blocks.20.ff_proj.weight": "model-00003-of-00004.safetensors",
104
+ "model.transformer.blocks.21.att_proj.bias": "model-00003-of-00004.safetensors",
105
+ "model.transformer.blocks.21.att_proj.weight": "model-00003-of-00004.safetensors",
106
+ "model.transformer.blocks.21.attn_norm.weight": "model-00003-of-00004.safetensors",
107
+ "model.transformer.blocks.21.attn_out.weight": "model-00003-of-00004.safetensors",
108
+ "model.transformer.blocks.21.ff_norm.weight": "model-00003-of-00004.safetensors",
109
+ "model.transformer.blocks.21.ff_out.weight": "model-00003-of-00004.safetensors",
110
+ "model.transformer.blocks.21.ff_proj.weight": "model-00003-of-00004.safetensors",
111
+ "model.transformer.blocks.22.att_proj.bias": "model-00003-of-00004.safetensors",
112
+ "model.transformer.blocks.22.att_proj.weight": "model-00003-of-00004.safetensors",
113
+ "model.transformer.blocks.22.attn_norm.weight": "model-00003-of-00004.safetensors",
114
+ "model.transformer.blocks.22.attn_out.weight": "model-00003-of-00004.safetensors",
115
+ "model.transformer.blocks.22.ff_norm.weight": "model-00003-of-00004.safetensors",
116
+ "model.transformer.blocks.22.ff_out.weight": "model-00003-of-00004.safetensors",
117
+ "model.transformer.blocks.22.ff_proj.weight": "model-00003-of-00004.safetensors",
118
+ "model.transformer.blocks.23.att_proj.bias": "model-00003-of-00004.safetensors",
119
+ "model.transformer.blocks.23.att_proj.weight": "model-00003-of-00004.safetensors",
120
+ "model.transformer.blocks.23.attn_norm.weight": "model-00003-of-00004.safetensors",
121
+ "model.transformer.blocks.23.attn_out.weight": "model-00003-of-00004.safetensors",
122
+ "model.transformer.blocks.23.ff_norm.weight": "model-00003-of-00004.safetensors",
123
+ "model.transformer.blocks.23.ff_out.weight": "model-00003-of-00004.safetensors",
124
+ "model.transformer.blocks.23.ff_proj.weight": "model-00003-of-00004.safetensors",
125
+ "model.transformer.blocks.24.att_proj.bias": "model-00003-of-00004.safetensors",
126
+ "model.transformer.blocks.24.att_proj.weight": "model-00003-of-00004.safetensors",
127
+ "model.transformer.blocks.24.attn_norm.weight": "model-00003-of-00004.safetensors",
128
+ "model.transformer.blocks.24.attn_out.weight": "model-00003-of-00004.safetensors",
129
+ "model.transformer.blocks.24.ff_norm.weight": "model-00003-of-00004.safetensors",
130
+ "model.transformer.blocks.24.ff_out.weight": "model-00003-of-00004.safetensors",
131
+ "model.transformer.blocks.24.ff_proj.weight": "model-00003-of-00004.safetensors",
132
+ "model.transformer.blocks.25.att_proj.bias": "model-00003-of-00004.safetensors",
133
+ "model.transformer.blocks.25.att_proj.weight": "model-00003-of-00004.safetensors",
134
+ "model.transformer.blocks.25.attn_norm.weight": "model-00003-of-00004.safetensors",
135
+ "model.transformer.blocks.25.attn_out.weight": "model-00003-of-00004.safetensors",
136
+ "model.transformer.blocks.25.ff_norm.weight": "model-00003-of-00004.safetensors",
137
+ "model.transformer.blocks.25.ff_out.weight": "model-00003-of-00004.safetensors",
138
+ "model.transformer.blocks.25.ff_proj.weight": "model-00003-of-00004.safetensors",
139
+ "model.transformer.blocks.26.att_proj.bias": "model-00003-of-00004.safetensors",
140
+ "model.transformer.blocks.26.att_proj.weight": "model-00003-of-00004.safetensors",
141
+ "model.transformer.blocks.26.attn_norm.weight": "model-00003-of-00004.safetensors",
142
+ "model.transformer.blocks.26.attn_out.weight": "model-00003-of-00004.safetensors",
143
+ "model.transformer.blocks.26.ff_norm.weight": "model-00003-of-00004.safetensors",
144
+ "model.transformer.blocks.26.ff_out.weight": "model-00003-of-00004.safetensors",
145
+ "model.transformer.blocks.26.ff_proj.weight": "model-00003-of-00004.safetensors",
146
+ "model.transformer.blocks.27.att_proj.bias": "model-00003-of-00004.safetensors",
147
+ "model.transformer.blocks.27.att_proj.weight": "model-00003-of-00004.safetensors",
148
+ "model.transformer.blocks.27.attn_norm.weight": "model-00003-of-00004.safetensors",
149
+ "model.transformer.blocks.27.attn_out.weight": "model-00003-of-00004.safetensors",
150
+ "model.transformer.blocks.27.ff_norm.weight": "model-00003-of-00004.safetensors",
151
+ "model.transformer.blocks.27.ff_out.weight": "model-00003-of-00004.safetensors",
152
+ "model.transformer.blocks.27.ff_proj.weight": "model-00003-of-00004.safetensors",
153
+ "model.transformer.blocks.3.att_proj.bias": "model-00001-of-00004.safetensors",
154
+ "model.transformer.blocks.3.att_proj.weight": "model-00001-of-00004.safetensors",
155
+ "model.transformer.blocks.3.attn_norm.weight": "model-00001-of-00004.safetensors",
156
+ "model.transformer.blocks.3.attn_out.weight": "model-00001-of-00004.safetensors",
157
+ "model.transformer.blocks.3.ff_norm.weight": "model-00001-of-00004.safetensors",
158
+ "model.transformer.blocks.3.ff_out.weight": "model-00001-of-00004.safetensors",
159
+ "model.transformer.blocks.3.ff_proj.weight": "model-00001-of-00004.safetensors",
160
+ "model.transformer.blocks.4.att_proj.bias": "model-00001-of-00004.safetensors",
161
+ "model.transformer.blocks.4.att_proj.weight": "model-00001-of-00004.safetensors",
162
+ "model.transformer.blocks.4.attn_norm.weight": "model-00001-of-00004.safetensors",
163
+ "model.transformer.blocks.4.attn_out.weight": "model-00001-of-00004.safetensors",
164
+ "model.transformer.blocks.4.ff_norm.weight": "model-00001-of-00004.safetensors",
165
+ "model.transformer.blocks.4.ff_out.weight": "model-00001-of-00004.safetensors",
166
+ "model.transformer.blocks.4.ff_proj.weight": "model-00001-of-00004.safetensors",
167
+ "model.transformer.blocks.5.att_proj.bias": "model-00001-of-00004.safetensors",
168
+ "model.transformer.blocks.5.att_proj.weight": "model-00001-of-00004.safetensors",
169
+ "model.transformer.blocks.5.attn_norm.weight": "model-00001-of-00004.safetensors",
170
+ "model.transformer.blocks.5.attn_out.weight": "model-00001-of-00004.safetensors",
171
+ "model.transformer.blocks.5.ff_norm.weight": "model-00001-of-00004.safetensors",
172
+ "model.transformer.blocks.5.ff_out.weight": "model-00001-of-00004.safetensors",
173
+ "model.transformer.blocks.5.ff_proj.weight": "model-00001-of-00004.safetensors",
174
+ "model.transformer.blocks.6.att_proj.bias": "model-00001-of-00004.safetensors",
175
+ "model.transformer.blocks.6.att_proj.weight": "model-00001-of-00004.safetensors",
176
+ "model.transformer.blocks.6.attn_norm.weight": "model-00001-of-00004.safetensors",
177
+ "model.transformer.blocks.6.attn_out.weight": "model-00001-of-00004.safetensors",
178
+ "model.transformer.blocks.6.ff_norm.weight": "model-00001-of-00004.safetensors",
179
+ "model.transformer.blocks.6.ff_out.weight": "model-00001-of-00004.safetensors",
180
+ "model.transformer.blocks.6.ff_proj.weight": "model-00001-of-00004.safetensors",
181
+ "model.transformer.blocks.7.att_proj.bias": "model-00001-of-00004.safetensors",
182
+ "model.transformer.blocks.7.att_proj.weight": "model-00001-of-00004.safetensors",
183
+ "model.transformer.blocks.7.attn_norm.weight": "model-00001-of-00004.safetensors",
184
+ "model.transformer.blocks.7.attn_out.weight": "model-00001-of-00004.safetensors",
185
+ "model.transformer.blocks.7.ff_norm.weight": "model-00001-of-00004.safetensors",
186
+ "model.transformer.blocks.7.ff_out.weight": "model-00001-of-00004.safetensors",
187
+ "model.transformer.blocks.7.ff_proj.weight": "model-00001-of-00004.safetensors",
188
+ "model.transformer.blocks.8.att_proj.bias": "model-00002-of-00004.safetensors",
189
+ "model.transformer.blocks.8.att_proj.weight": "model-00002-of-00004.safetensors",
190
+ "model.transformer.blocks.8.attn_norm.weight": "model-00001-of-00004.safetensors",
191
+ "model.transformer.blocks.8.attn_out.weight": "model-00001-of-00004.safetensors",
192
+ "model.transformer.blocks.8.ff_norm.weight": "model-00001-of-00004.safetensors",
193
+ "model.transformer.blocks.8.ff_out.weight": "model-00001-of-00004.safetensors",
194
+ "model.transformer.blocks.8.ff_proj.weight": "model-00002-of-00004.safetensors",
195
+ "model.transformer.blocks.9.att_proj.bias": "model-00002-of-00004.safetensors",
196
+ "model.transformer.blocks.9.att_proj.weight": "model-00002-of-00004.safetensors",
197
+ "model.transformer.blocks.9.attn_norm.weight": "model-00002-of-00004.safetensors",
198
+ "model.transformer.blocks.9.attn_out.weight": "model-00002-of-00004.safetensors",
199
+ "model.transformer.blocks.9.ff_norm.weight": "model-00002-of-00004.safetensors",
200
+ "model.transformer.blocks.9.ff_out.weight": "model-00002-of-00004.safetensors",
201
+ "model.transformer.blocks.9.ff_proj.weight": "model-00002-of-00004.safetensors",
202
+ "model.transformer.ff_out.weight": "model-00004-of-00004.safetensors",
203
+ "model.transformer.ln_f.weight": "model-00001-of-00004.safetensors",
204
+ "model.transformer.wte.embedding": "model-00001-of-00004.safetensors",
205
+ "model.transformer.wte.new_embedding": "model-00001-of-00004.safetensors",
206
+ "model.vision_backbone.image_pooling_2d.wk.bias": "model-00004-of-00004.safetensors",
207
+ "model.vision_backbone.image_pooling_2d.wk.weight": "model-00004-of-00004.safetensors",
208
+ "model.vision_backbone.image_pooling_2d.wo.bias": "model-00004-of-00004.safetensors",
209
+ "model.vision_backbone.image_pooling_2d.wo.weight": "model-00004-of-00004.safetensors",
210
+ "model.vision_backbone.image_pooling_2d.wq.bias": "model-00004-of-00004.safetensors",
211
+ "model.vision_backbone.image_pooling_2d.wq.weight": "model-00004-of-00004.safetensors",
212
+ "model.vision_backbone.image_pooling_2d.wv.bias": "model-00004-of-00004.safetensors",
213
+ "model.vision_backbone.image_pooling_2d.wv.weight": "model-00004-of-00004.safetensors",
214
+ "model.vision_backbone.image_projector.w1.weight": "model-00004-of-00004.safetensors",
215
+ "model.vision_backbone.image_projector.w2.weight": "model-00004-of-00004.safetensors",
216
+ "model.vision_backbone.image_projector.w3.weight": "model-00004-of-00004.safetensors",
217
+ "model.vision_backbone.image_vit.class_embedding": "model-00004-of-00004.safetensors",
218
+ "model.vision_backbone.image_vit.patch_embedding.weight": "model-00004-of-00004.safetensors",
219
+ "model.vision_backbone.image_vit.positional_embedding": "model-00004-of-00004.safetensors",
220
+ "model.vision_backbone.image_vit.pre_ln.bias": "model-00004-of-00004.safetensors",
221
+ "model.vision_backbone.image_vit.pre_ln.weight": "model-00004-of-00004.safetensors",
222
+ "model.vision_backbone.image_vit.transformer.resblocks.0.attention.wk.bias": "model-00004-of-00004.safetensors",
223
+ "model.vision_backbone.image_vit.transformer.resblocks.0.attention.wk.weight": "model-00004-of-00004.safetensors",
224
+ "model.vision_backbone.image_vit.transformer.resblocks.0.attention.wo.bias": "model-00004-of-00004.safetensors",
225
+ "model.vision_backbone.image_vit.transformer.resblocks.0.attention.wo.weight": "model-00004-of-00004.safetensors",
226
+ "model.vision_backbone.image_vit.transformer.resblocks.0.attention.wq.bias": "model-00004-of-00004.safetensors",
227
+ "model.vision_backbone.image_vit.transformer.resblocks.0.attention.wq.weight": "model-00004-of-00004.safetensors",
228
+ "model.vision_backbone.image_vit.transformer.resblocks.0.attention.wv.bias": "model-00004-of-00004.safetensors",
229
+ "model.vision_backbone.image_vit.transformer.resblocks.0.attention.wv.weight": "model-00004-of-00004.safetensors",
230
+ "model.vision_backbone.image_vit.transformer.resblocks.0.attention_norm.bias": "model-00004-of-00004.safetensors",
231
+ "model.vision_backbone.image_vit.transformer.resblocks.0.attention_norm.weight": "model-00004-of-00004.safetensors",
232
+ "model.vision_backbone.image_vit.transformer.resblocks.0.feed_forward.w1.bias": "model-00004-of-00004.safetensors",
233
+ "model.vision_backbone.image_vit.transformer.resblocks.0.feed_forward.w1.weight": "model-00004-of-00004.safetensors",
234
+ "model.vision_backbone.image_vit.transformer.resblocks.0.feed_forward.w2.bias": "model-00004-of-00004.safetensors",
235
+ "model.vision_backbone.image_vit.transformer.resblocks.0.feed_forward.w2.weight": "model-00004-of-00004.safetensors",
236
+ "model.vision_backbone.image_vit.transformer.resblocks.0.ffn_norm.bias": "model-00004-of-00004.safetensors",
237
+ "model.vision_backbone.image_vit.transformer.resblocks.0.ffn_norm.weight": "model-00004-of-00004.safetensors",
238
+ "model.vision_backbone.image_vit.transformer.resblocks.1.attention.wk.bias": "model-00004-of-00004.safetensors",
239
+ "model.vision_backbone.image_vit.transformer.resblocks.1.attention.wk.weight": "model-00004-of-00004.safetensors",
240
+ "model.vision_backbone.image_vit.transformer.resblocks.1.attention.wo.bias": "model-00004-of-00004.safetensors",
241
+ "model.vision_backbone.image_vit.transformer.resblocks.1.attention.wo.weight": "model-00004-of-00004.safetensors",
242
+ "model.vision_backbone.image_vit.transformer.resblocks.1.attention.wq.bias": "model-00004-of-00004.safetensors",
243
+ "model.vision_backbone.image_vit.transformer.resblocks.1.attention.wq.weight": "model-00004-of-00004.safetensors",
244
+ "model.vision_backbone.image_vit.transformer.resblocks.1.attention.wv.bias": "model-00004-of-00004.safetensors",
245
+ "model.vision_backbone.image_vit.transformer.resblocks.1.attention.wv.weight": "model-00004-of-00004.safetensors",
246
+ "model.vision_backbone.image_vit.transformer.resblocks.1.attention_norm.bias": "model-00004-of-00004.safetensors",
247
+ "model.vision_backbone.image_vit.transformer.resblocks.1.attention_norm.weight": "model-00004-of-00004.safetensors",
248
+ "model.vision_backbone.image_vit.transformer.resblocks.1.feed_forward.w1.bias": "model-00004-of-00004.safetensors",
249
+ "model.vision_backbone.image_vit.transformer.resblocks.1.feed_forward.w1.weight": "model-00004-of-00004.safetensors",
250
+ "model.vision_backbone.image_vit.transformer.resblocks.1.feed_forward.w2.bias": "model-00004-of-00004.safetensors",
251
+ "model.vision_backbone.image_vit.transformer.resblocks.1.feed_forward.w2.weight": "model-00004-of-00004.safetensors",
252
+ "model.vision_backbone.image_vit.transformer.resblocks.1.ffn_norm.bias": "model-00004-of-00004.safetensors",
253
+ "model.vision_backbone.image_vit.transformer.resblocks.1.ffn_norm.weight": "model-00004-of-00004.safetensors",
254
+ "model.vision_backbone.image_vit.transformer.resblocks.10.attention.wk.bias": "model-00004-of-00004.safetensors",
255
+ "model.vision_backbone.image_vit.transformer.resblocks.10.attention.wk.weight": "model-00004-of-00004.safetensors",
256
+ "model.vision_backbone.image_vit.transformer.resblocks.10.attention.wo.bias": "model-00004-of-00004.safetensors",
257
+ "model.vision_backbone.image_vit.transformer.resblocks.10.attention.wo.weight": "model-00004-of-00004.safetensors",
258
+ "model.vision_backbone.image_vit.transformer.resblocks.10.attention.wq.bias": "model-00004-of-00004.safetensors",
259
+ "model.vision_backbone.image_vit.transformer.resblocks.10.attention.wq.weight": "model-00004-of-00004.safetensors",
260
+ "model.vision_backbone.image_vit.transformer.resblocks.10.attention.wv.bias": "model-00004-of-00004.safetensors",
261
+ "model.vision_backbone.image_vit.transformer.resblocks.10.attention.wv.weight": "model-00004-of-00004.safetensors",
262
+ "model.vision_backbone.image_vit.transformer.resblocks.10.attention_norm.bias": "model-00004-of-00004.safetensors",
263
+ "model.vision_backbone.image_vit.transformer.resblocks.10.attention_norm.weight": "model-00004-of-00004.safetensors",
264
+ "model.vision_backbone.image_vit.transformer.resblocks.10.feed_forward.w1.bias": "model-00004-of-00004.safetensors",
265
+ "model.vision_backbone.image_vit.transformer.resblocks.10.feed_forward.w1.weight": "model-00004-of-00004.safetensors",
266
+ "model.vision_backbone.image_vit.transformer.resblocks.10.feed_forward.w2.bias": "model-00004-of-00004.safetensors",
267
+ "model.vision_backbone.image_vit.transformer.resblocks.10.feed_forward.w2.weight": "model-00004-of-00004.safetensors",
268
+ "model.vision_backbone.image_vit.transformer.resblocks.10.ffn_norm.bias": "model-00004-of-00004.safetensors",
269
+ "model.vision_backbone.image_vit.transformer.resblocks.10.ffn_norm.weight": "model-00004-of-00004.safetensors",
270
+ "model.vision_backbone.image_vit.transformer.resblocks.11.attention.wk.bias": "model-00004-of-00004.safetensors",
271
+ "model.vision_backbone.image_vit.transformer.resblocks.11.attention.wk.weight": "model-00004-of-00004.safetensors",
272
+ "model.vision_backbone.image_vit.transformer.resblocks.11.attention.wo.bias": "model-00004-of-00004.safetensors",
273
+ "model.vision_backbone.image_vit.transformer.resblocks.11.attention.wo.weight": "model-00004-of-00004.safetensors",
274
+ "model.vision_backbone.image_vit.transformer.resblocks.11.attention.wq.bias": "model-00004-of-00004.safetensors",
275
+ "model.vision_backbone.image_vit.transformer.resblocks.11.attention.wq.weight": "model-00004-of-00004.safetensors",
276
+ "model.vision_backbone.image_vit.transformer.resblocks.11.attention.wv.bias": "model-00004-of-00004.safetensors",
277
+ "model.vision_backbone.image_vit.transformer.resblocks.11.attention.wv.weight": "model-00004-of-00004.safetensors",
278
+ "model.vision_backbone.image_vit.transformer.resblocks.11.attention_norm.bias": "model-00004-of-00004.safetensors",
279
+ "model.vision_backbone.image_vit.transformer.resblocks.11.attention_norm.weight": "model-00004-of-00004.safetensors",
280
+ "model.vision_backbone.image_vit.transformer.resblocks.11.feed_forward.w1.bias": "model-00004-of-00004.safetensors",
281
+ "model.vision_backbone.image_vit.transformer.resblocks.11.feed_forward.w1.weight": "model-00004-of-00004.safetensors",
282
+ "model.vision_backbone.image_vit.transformer.resblocks.11.feed_forward.w2.bias": "model-00004-of-00004.safetensors",
283
+ "model.vision_backbone.image_vit.transformer.resblocks.11.feed_forward.w2.weight": "model-00004-of-00004.safetensors",
284
+ "model.vision_backbone.image_vit.transformer.resblocks.11.ffn_norm.bias": "model-00004-of-00004.safetensors",
285
+ "model.vision_backbone.image_vit.transformer.resblocks.11.ffn_norm.weight": "model-00004-of-00004.safetensors",
286
+ "model.vision_backbone.image_vit.transformer.resblocks.12.attention.wk.bias": "model-00004-of-00004.safetensors",
287
+ "model.vision_backbone.image_vit.transformer.resblocks.12.attention.wk.weight": "model-00004-of-00004.safetensors",
288
+ "model.vision_backbone.image_vit.transformer.resblocks.12.attention.wo.bias": "model-00004-of-00004.safetensors",
289
+ "model.vision_backbone.image_vit.transformer.resblocks.12.attention.wo.weight": "model-00004-of-00004.safetensors",
290
+ "model.vision_backbone.image_vit.transformer.resblocks.12.attention.wq.bias": "model-00004-of-00004.safetensors",
291
+ "model.vision_backbone.image_vit.transformer.resblocks.12.attention.wq.weight": "model-00004-of-00004.safetensors",
292
+ "model.vision_backbone.image_vit.transformer.resblocks.12.attention.wv.bias": "model-00004-of-00004.safetensors",
293
+ "model.vision_backbone.image_vit.transformer.resblocks.12.attention.wv.weight": "model-00004-of-00004.safetensors",
294
+ "model.vision_backbone.image_vit.transformer.resblocks.12.attention_norm.bias": "model-00004-of-00004.safetensors",
295
+ "model.vision_backbone.image_vit.transformer.resblocks.12.attention_norm.weight": "model-00004-of-00004.safetensors",
296
+ "model.vision_backbone.image_vit.transformer.resblocks.12.feed_forward.w1.bias": "model-00004-of-00004.safetensors",
297
+ "model.vision_backbone.image_vit.transformer.resblocks.12.feed_forward.w1.weight": "model-00004-of-00004.safetensors",
298
+ "model.vision_backbone.image_vit.transformer.resblocks.12.feed_forward.w2.bias": "model-00004-of-00004.safetensors",
299
+ "model.vision_backbone.image_vit.transformer.resblocks.12.feed_forward.w2.weight": "model-00004-of-00004.safetensors",
300
+ "model.vision_backbone.image_vit.transformer.resblocks.12.ffn_norm.bias": "model-00004-of-00004.safetensors",
301
+ "model.vision_backbone.image_vit.transformer.resblocks.12.ffn_norm.weight": "model-00004-of-00004.safetensors",
302
+ "model.vision_backbone.image_vit.transformer.resblocks.13.attention.wk.bias": "model-00004-of-00004.safetensors",
303
+ "model.vision_backbone.image_vit.transformer.resblocks.13.attention.wk.weight": "model-00004-of-00004.safetensors",
304
+ "model.vision_backbone.image_vit.transformer.resblocks.13.attention.wo.bias": "model-00004-of-00004.safetensors",
305
+ "model.vision_backbone.image_vit.transformer.resblocks.13.attention.wo.weight": "model-00004-of-00004.safetensors",
306
+ "model.vision_backbone.image_vit.transformer.resblocks.13.attention.wq.bias": "model-00004-of-00004.safetensors",
307
+ "model.vision_backbone.image_vit.transformer.resblocks.13.attention.wq.weight": "model-00004-of-00004.safetensors",
308
+ "model.vision_backbone.image_vit.transformer.resblocks.13.attention.wv.bias": "model-00004-of-00004.safetensors",
309
+ "model.vision_backbone.image_vit.transformer.resblocks.13.attention.wv.weight": "model-00004-of-00004.safetensors",
310
+ "model.vision_backbone.image_vit.transformer.resblocks.13.attention_norm.bias": "model-00004-of-00004.safetensors",
311
+ "model.vision_backbone.image_vit.transformer.resblocks.13.attention_norm.weight": "model-00004-of-00004.safetensors",
312
+ "model.vision_backbone.image_vit.transformer.resblocks.13.feed_forward.w1.bias": "model-00004-of-00004.safetensors",
313
+ "model.vision_backbone.image_vit.transformer.resblocks.13.feed_forward.w1.weight": "model-00004-of-00004.safetensors",
314
+ "model.vision_backbone.image_vit.transformer.resblocks.13.feed_forward.w2.bias": "model-00004-of-00004.safetensors",
315
+ "model.vision_backbone.image_vit.transformer.resblocks.13.feed_forward.w2.weight": "model-00004-of-00004.safetensors",
316
+ "model.vision_backbone.image_vit.transformer.resblocks.13.ffn_norm.bias": "model-00004-of-00004.safetensors",
317
+ "model.vision_backbone.image_vit.transformer.resblocks.13.ffn_norm.weight": "model-00004-of-00004.safetensors",
318
+ "model.vision_backbone.image_vit.transformer.resblocks.14.attention.wk.bias": "model-00004-of-00004.safetensors",
319
+ "model.vision_backbone.image_vit.transformer.resblocks.14.attention.wk.weight": "model-00004-of-00004.safetensors",
320
+ "model.vision_backbone.image_vit.transformer.resblocks.14.attention.wo.bias": "model-00004-of-00004.safetensors",
321
+ "model.vision_backbone.image_vit.transformer.resblocks.14.attention.wo.weight": "model-00004-of-00004.safetensors",
322
+ "model.vision_backbone.image_vit.transformer.resblocks.14.attention.wq.bias": "model-00004-of-00004.safetensors",
323
+ "model.vision_backbone.image_vit.transformer.resblocks.14.attention.wq.weight": "model-00004-of-00004.safetensors",
324
+ "model.vision_backbone.image_vit.transformer.resblocks.14.attention.wv.bias": "model-00004-of-00004.safetensors",
325
+ "model.vision_backbone.image_vit.transformer.resblocks.14.attention.wv.weight": "model-00004-of-00004.safetensors",
326
+ "model.vision_backbone.image_vit.transformer.resblocks.14.attention_norm.bias": "model-00004-of-00004.safetensors",
327
+ "model.vision_backbone.image_vit.transformer.resblocks.14.attention_norm.weight": "model-00004-of-00004.safetensors",
328
+ "model.vision_backbone.image_vit.transformer.resblocks.14.feed_forward.w1.bias": "model-00004-of-00004.safetensors",
329
+ "model.vision_backbone.image_vit.transformer.resblocks.14.feed_forward.w1.weight": "model-00004-of-00004.safetensors",
330
+ "model.vision_backbone.image_vit.transformer.resblocks.14.feed_forward.w2.bias": "model-00004-of-00004.safetensors",
331
+ "model.vision_backbone.image_vit.transformer.resblocks.14.feed_forward.w2.weight": "model-00004-of-00004.safetensors",
332
+ "model.vision_backbone.image_vit.transformer.resblocks.14.ffn_norm.bias": "model-00004-of-00004.safetensors",
333
+ "model.vision_backbone.image_vit.transformer.resblocks.14.ffn_norm.weight": "model-00004-of-00004.safetensors",
334
+ "model.vision_backbone.image_vit.transformer.resblocks.15.attention.wk.bias": "model-00004-of-00004.safetensors",
335
+ "model.vision_backbone.image_vit.transformer.resblocks.15.attention.wk.weight": "model-00004-of-00004.safetensors",
336
+ "model.vision_backbone.image_vit.transformer.resblocks.15.attention.wo.bias": "model-00004-of-00004.safetensors",
337
+ "model.vision_backbone.image_vit.transformer.resblocks.15.attention.wo.weight": "model-00004-of-00004.safetensors",
338
+ "model.vision_backbone.image_vit.transformer.resblocks.15.attention.wq.bias": "model-00004-of-00004.safetensors",
339
+ "model.vision_backbone.image_vit.transformer.resblocks.15.attention.wq.weight": "model-00004-of-00004.safetensors",
340
+ "model.vision_backbone.image_vit.transformer.resblocks.15.attention.wv.bias": "model-00004-of-00004.safetensors",
341
+ "model.vision_backbone.image_vit.transformer.resblocks.15.attention.wv.weight": "model-00004-of-00004.safetensors",
342
+ "model.vision_backbone.image_vit.transformer.resblocks.15.attention_norm.bias": "model-00004-of-00004.safetensors",
343
+ "model.vision_backbone.image_vit.transformer.resblocks.15.attention_norm.weight": "model-00004-of-00004.safetensors",
344
+ "model.vision_backbone.image_vit.transformer.resblocks.15.feed_forward.w1.bias": "model-00004-of-00004.safetensors",
345
+ "model.vision_backbone.image_vit.transformer.resblocks.15.feed_forward.w1.weight": "model-00004-of-00004.safetensors",
346
+ "model.vision_backbone.image_vit.transformer.resblocks.15.feed_forward.w2.bias": "model-00004-of-00004.safetensors",
347
+ "model.vision_backbone.image_vit.transformer.resblocks.15.feed_forward.w2.weight": "model-00004-of-00004.safetensors",
348
+ "model.vision_backbone.image_vit.transformer.resblocks.15.ffn_norm.bias": "model-00004-of-00004.safetensors",
349
+ "model.vision_backbone.image_vit.transformer.resblocks.15.ffn_norm.weight": "model-00004-of-00004.safetensors",
350
+ "model.vision_backbone.image_vit.transformer.resblocks.16.attention.wk.bias": "model-00004-of-00004.safetensors",
351
+ "model.vision_backbone.image_vit.transformer.resblocks.16.attention.wk.weight": "model-00004-of-00004.safetensors",
352
+ "model.vision_backbone.image_vit.transformer.resblocks.16.attention.wo.bias": "model-00004-of-00004.safetensors",
353
+ "model.vision_backbone.image_vit.transformer.resblocks.16.attention.wo.weight": "model-00004-of-00004.safetensors",
354
+ "model.vision_backbone.image_vit.transformer.resblocks.16.attention.wq.bias": "model-00004-of-00004.safetensors",
355
+ "model.vision_backbone.image_vit.transformer.resblocks.16.attention.wq.weight": "model-00004-of-00004.safetensors",
356
+ "model.vision_backbone.image_vit.transformer.resblocks.16.attention.wv.bias": "model-00004-of-00004.safetensors",
357
+ "model.vision_backbone.image_vit.transformer.resblocks.16.attention.wv.weight": "model-00004-of-00004.safetensors",
358
+ "model.vision_backbone.image_vit.transformer.resblocks.16.attention_norm.bias": "model-00004-of-00004.safetensors",
359
+ "model.vision_backbone.image_vit.transformer.resblocks.16.attention_norm.weight": "model-00004-of-00004.safetensors",
360
+ "model.vision_backbone.image_vit.transformer.resblocks.16.feed_forward.w1.bias": "model-00004-of-00004.safetensors",
361
+ "model.vision_backbone.image_vit.transformer.resblocks.16.feed_forward.w1.weight": "model-00004-of-00004.safetensors",
362
+ "model.vision_backbone.image_vit.transformer.resblocks.16.feed_forward.w2.bias": "model-00004-of-00004.safetensors",
363
+ "model.vision_backbone.image_vit.transformer.resblocks.16.feed_forward.w2.weight": "model-00004-of-00004.safetensors",
364
+ "model.vision_backbone.image_vit.transformer.resblocks.16.ffn_norm.bias": "model-00004-of-00004.safetensors",
365
+ "model.vision_backbone.image_vit.transformer.resblocks.16.ffn_norm.weight": "model-00004-of-00004.safetensors",
366
+ "model.vision_backbone.image_vit.transformer.resblocks.17.attention.wk.bias": "model-00004-of-00004.safetensors",
367
+ "model.vision_backbone.image_vit.transformer.resblocks.17.attention.wk.weight": "model-00004-of-00004.safetensors",
368
+ "model.vision_backbone.image_vit.transformer.resblocks.17.attention.wo.bias": "model-00004-of-00004.safetensors",
369
+ "model.vision_backbone.image_vit.transformer.resblocks.17.attention.wo.weight": "model-00004-of-00004.safetensors",
370
+ "model.vision_backbone.image_vit.transformer.resblocks.17.attention.wq.bias": "model-00004-of-00004.safetensors",
371
+ "model.vision_backbone.image_vit.transformer.resblocks.17.attention.wq.weight": "model-00004-of-00004.safetensors",
372
+ "model.vision_backbone.image_vit.transformer.resblocks.17.attention.wv.bias": "model-00004-of-00004.safetensors",
373
+ "model.vision_backbone.image_vit.transformer.resblocks.17.attention.wv.weight": "model-00004-of-00004.safetensors",
374
+ "model.vision_backbone.image_vit.transformer.resblocks.17.attention_norm.bias": "model-00004-of-00004.safetensors",
375
+ "model.vision_backbone.image_vit.transformer.resblocks.17.attention_norm.weight": "model-00004-of-00004.safetensors",
376
+ "model.vision_backbone.image_vit.transformer.resblocks.17.feed_forward.w1.bias": "model-00004-of-00004.safetensors",
377
+ "model.vision_backbone.image_vit.transformer.resblocks.17.feed_forward.w1.weight": "model-00004-of-00004.safetensors",
378
+ "model.vision_backbone.image_vit.transformer.resblocks.17.feed_forward.w2.bias": "model-00004-of-00004.safetensors",
379
+ "model.vision_backbone.image_vit.transformer.resblocks.17.feed_forward.w2.weight": "model-00004-of-00004.safetensors",
380
+ "model.vision_backbone.image_vit.transformer.resblocks.17.ffn_norm.bias": "model-00004-of-00004.safetensors",
381
+ "model.vision_backbone.image_vit.transformer.resblocks.17.ffn_norm.weight": "model-00004-of-00004.safetensors",
382
+ "model.vision_backbone.image_vit.transformer.resblocks.18.attention.wk.bias": "model-00004-of-00004.safetensors",
383
+ "model.vision_backbone.image_vit.transformer.resblocks.18.attention.wk.weight": "model-00004-of-00004.safetensors",
384
+ "model.vision_backbone.image_vit.transformer.resblocks.18.attention.wo.bias": "model-00004-of-00004.safetensors",
385
+ "model.vision_backbone.image_vit.transformer.resblocks.18.attention.wo.weight": "model-00004-of-00004.safetensors",
386
+ "model.vision_backbone.image_vit.transformer.resblocks.18.attention.wq.bias": "model-00004-of-00004.safetensors",
387
+ "model.vision_backbone.image_vit.transformer.resblocks.18.attention.wq.weight": "model-00004-of-00004.safetensors",
388
+ "model.vision_backbone.image_vit.transformer.resblocks.18.attention.wv.bias": "model-00004-of-00004.safetensors",
389
+ "model.vision_backbone.image_vit.transformer.resblocks.18.attention.wv.weight": "model-00004-of-00004.safetensors",
390
+ "model.vision_backbone.image_vit.transformer.resblocks.18.attention_norm.bias": "model-00004-of-00004.safetensors",
391
+ "model.vision_backbone.image_vit.transformer.resblocks.18.attention_norm.weight": "model-00004-of-00004.safetensors",
392
+ "model.vision_backbone.image_vit.transformer.resblocks.18.feed_forward.w1.bias": "model-00004-of-00004.safetensors",
393
+ "model.vision_backbone.image_vit.transformer.resblocks.18.feed_forward.w1.weight": "model-00004-of-00004.safetensors",
394
+ "model.vision_backbone.image_vit.transformer.resblocks.18.feed_forward.w2.bias": "model-00004-of-00004.safetensors",
395
+ "model.vision_backbone.image_vit.transformer.resblocks.18.feed_forward.w2.weight": "model-00004-of-00004.safetensors",
396
+ "model.vision_backbone.image_vit.transformer.resblocks.18.ffn_norm.bias": "model-00004-of-00004.safetensors",
397
+ "model.vision_backbone.image_vit.transformer.resblocks.18.ffn_norm.weight": "model-00004-of-00004.safetensors",
398
+ "model.vision_backbone.image_vit.transformer.resblocks.19.attention.wk.bias": "model-00004-of-00004.safetensors",
399
+ "model.vision_backbone.image_vit.transformer.resblocks.19.attention.wk.weight": "model-00004-of-00004.safetensors",
400
+ "model.vision_backbone.image_vit.transformer.resblocks.19.attention.wo.bias": "model-00004-of-00004.safetensors",
401
+ "model.vision_backbone.image_vit.transformer.resblocks.19.attention.wo.weight": "model-00004-of-00004.safetensors",
402
+ "model.vision_backbone.image_vit.transformer.resblocks.19.attention.wq.bias": "model-00004-of-00004.safetensors",
403
+ "model.vision_backbone.image_vit.transformer.resblocks.19.attention.wq.weight": "model-00004-of-00004.safetensors",
404
+ "model.vision_backbone.image_vit.transformer.resblocks.19.attention.wv.bias": "model-00004-of-00004.safetensors",
405
+ "model.vision_backbone.image_vit.transformer.resblocks.19.attention.wv.weight": "model-00004-of-00004.safetensors",
406
+ "model.vision_backbone.image_vit.transformer.resblocks.19.attention_norm.bias": "model-00004-of-00004.safetensors",
407
+ "model.vision_backbone.image_vit.transformer.resblocks.19.attention_norm.weight": "model-00004-of-00004.safetensors",
408
+ "model.vision_backbone.image_vit.transformer.resblocks.19.feed_forward.w1.bias": "model-00004-of-00004.safetensors",
409
+ "model.vision_backbone.image_vit.transformer.resblocks.19.feed_forward.w1.weight": "model-00004-of-00004.safetensors",
410
+ "model.vision_backbone.image_vit.transformer.resblocks.19.feed_forward.w2.bias": "model-00004-of-00004.safetensors",
411
+ "model.vision_backbone.image_vit.transformer.resblocks.19.feed_forward.w2.weight": "model-00004-of-00004.safetensors",
412
+ "model.vision_backbone.image_vit.transformer.resblocks.19.ffn_norm.bias": "model-00004-of-00004.safetensors",
413
+ "model.vision_backbone.image_vit.transformer.resblocks.19.ffn_norm.weight": "model-00004-of-00004.safetensors",
414
+ "model.vision_backbone.image_vit.transformer.resblocks.2.attention.wk.bias": "model-00004-of-00004.safetensors",
415
+ "model.vision_backbone.image_vit.transformer.resblocks.2.attention.wk.weight": "model-00004-of-00004.safetensors",
416
+ "model.vision_backbone.image_vit.transformer.resblocks.2.attention.wo.bias": "model-00004-of-00004.safetensors",
417
+ "model.vision_backbone.image_vit.transformer.resblocks.2.attention.wo.weight": "model-00004-of-00004.safetensors",
418
+ "model.vision_backbone.image_vit.transformer.resblocks.2.attention.wq.bias": "model-00004-of-00004.safetensors",
419
+ "model.vision_backbone.image_vit.transformer.resblocks.2.attention.wq.weight": "model-00004-of-00004.safetensors",
420
+ "model.vision_backbone.image_vit.transformer.resblocks.2.attention.wv.bias": "model-00004-of-00004.safetensors",
421
+ "model.vision_backbone.image_vit.transformer.resblocks.2.attention.wv.weight": "model-00004-of-00004.safetensors",
422
+ "model.vision_backbone.image_vit.transformer.resblocks.2.attention_norm.bias": "model-00004-of-00004.safetensors",
423
+ "model.vision_backbone.image_vit.transformer.resblocks.2.attention_norm.weight": "model-00004-of-00004.safetensors",
424
+ "model.vision_backbone.image_vit.transformer.resblocks.2.feed_forward.w1.bias": "model-00004-of-00004.safetensors",
425
+ "model.vision_backbone.image_vit.transformer.resblocks.2.feed_forward.w1.weight": "model-00004-of-00004.safetensors",
426
+ "model.vision_backbone.image_vit.transformer.resblocks.2.feed_forward.w2.bias": "model-00004-of-00004.safetensors",
427
+ "model.vision_backbone.image_vit.transformer.resblocks.2.feed_forward.w2.weight": "model-00004-of-00004.safetensors",
428
+ "model.vision_backbone.image_vit.transformer.resblocks.2.ffn_norm.bias": "model-00004-of-00004.safetensors",
429
+ "model.vision_backbone.image_vit.transformer.resblocks.2.ffn_norm.weight": "model-00004-of-00004.safetensors",
430
+ "model.vision_backbone.image_vit.transformer.resblocks.20.attention.wk.bias": "model-00004-of-00004.safetensors",
431
+ "model.vision_backbone.image_vit.transformer.resblocks.20.attention.wk.weight": "model-00004-of-00004.safetensors",
432
+ "model.vision_backbone.image_vit.transformer.resblocks.20.attention.wo.bias": "model-00004-of-00004.safetensors",
433
+ "model.vision_backbone.image_vit.transformer.resblocks.20.attention.wo.weight": "model-00004-of-00004.safetensors",
434
+ "model.vision_backbone.image_vit.transformer.resblocks.20.attention.wq.bias": "model-00004-of-00004.safetensors",
435
+ "model.vision_backbone.image_vit.transformer.resblocks.20.attention.wq.weight": "model-00004-of-00004.safetensors",
436
+ "model.vision_backbone.image_vit.transformer.resblocks.20.attention.wv.bias": "model-00004-of-00004.safetensors",
437
+ "model.vision_backbone.image_vit.transformer.resblocks.20.attention.wv.weight": "model-00004-of-00004.safetensors",
438
+ "model.vision_backbone.image_vit.transformer.resblocks.20.attention_norm.bias": "model-00004-of-00004.safetensors",
439
+ "model.vision_backbone.image_vit.transformer.resblocks.20.attention_norm.weight": "model-00004-of-00004.safetensors",
440
+ "model.vision_backbone.image_vit.transformer.resblocks.20.feed_forward.w1.bias": "model-00004-of-00004.safetensors",
441
+ "model.vision_backbone.image_vit.transformer.resblocks.20.feed_forward.w1.weight": "model-00004-of-00004.safetensors",
442
+ "model.vision_backbone.image_vit.transformer.resblocks.20.feed_forward.w2.bias": "model-00004-of-00004.safetensors",
443
+ "model.vision_backbone.image_vit.transformer.resblocks.20.feed_forward.w2.weight": "model-00004-of-00004.safetensors",
444
+ "model.vision_backbone.image_vit.transformer.resblocks.20.ffn_norm.bias": "model-00004-of-00004.safetensors",
445
+ "model.vision_backbone.image_vit.transformer.resblocks.20.ffn_norm.weight": "model-00004-of-00004.safetensors",
446
+ "model.vision_backbone.image_vit.transformer.resblocks.21.attention.wk.bias": "model-00004-of-00004.safetensors",
447
+ "model.vision_backbone.image_vit.transformer.resblocks.21.attention.wk.weight": "model-00004-of-00004.safetensors",
448
+ "model.vision_backbone.image_vit.transformer.resblocks.21.attention.wo.bias": "model-00004-of-00004.safetensors",
449
+ "model.vision_backbone.image_vit.transformer.resblocks.21.attention.wo.weight": "model-00004-of-00004.safetensors",
450
+ "model.vision_backbone.image_vit.transformer.resblocks.21.attention.wq.bias": "model-00004-of-00004.safetensors",
451
+ "model.vision_backbone.image_vit.transformer.resblocks.21.attention.wq.weight": "model-00004-of-00004.safetensors",
452
+ "model.vision_backbone.image_vit.transformer.resblocks.21.attention.wv.bias": "model-00004-of-00004.safetensors",
453
+ "model.vision_backbone.image_vit.transformer.resblocks.21.attention.wv.weight": "model-00004-of-00004.safetensors",
454
+ "model.vision_backbone.image_vit.transformer.resblocks.21.attention_norm.bias": "model-00004-of-00004.safetensors",
455
+ "model.vision_backbone.image_vit.transformer.resblocks.21.attention_norm.weight": "model-00004-of-00004.safetensors",
456
+ "model.vision_backbone.image_vit.transformer.resblocks.21.feed_forward.w1.bias": "model-00004-of-00004.safetensors",
457
+ "model.vision_backbone.image_vit.transformer.resblocks.21.feed_forward.w1.weight": "model-00004-of-00004.safetensors",
458
+ "model.vision_backbone.image_vit.transformer.resblocks.21.feed_forward.w2.bias": "model-00004-of-00004.safetensors",
459
+ "model.vision_backbone.image_vit.transformer.resblocks.21.feed_forward.w2.weight": "model-00004-of-00004.safetensors",
460
+ "model.vision_backbone.image_vit.transformer.resblocks.21.ffn_norm.bias": "model-00004-of-00004.safetensors",
461
+ "model.vision_backbone.image_vit.transformer.resblocks.21.ffn_norm.weight": "model-00004-of-00004.safetensors",
462
+ "model.vision_backbone.image_vit.transformer.resblocks.22.attention.wk.bias": "model-00004-of-00004.safetensors",
463
+ "model.vision_backbone.image_vit.transformer.resblocks.22.attention.wk.weight": "model-00004-of-00004.safetensors",
464
+ "model.vision_backbone.image_vit.transformer.resblocks.22.attention.wo.bias": "model-00004-of-00004.safetensors",
465
+ "model.vision_backbone.image_vit.transformer.resblocks.22.attention.wo.weight": "model-00004-of-00004.safetensors",
466
+ "model.vision_backbone.image_vit.transformer.resblocks.22.attention.wq.bias": "model-00004-of-00004.safetensors",
467
+ "model.vision_backbone.image_vit.transformer.resblocks.22.attention.wq.weight": "model-00004-of-00004.safetensors",
468
+ "model.vision_backbone.image_vit.transformer.resblocks.22.attention.wv.bias": "model-00004-of-00004.safetensors",
469
+ "model.vision_backbone.image_vit.transformer.resblocks.22.attention.wv.weight": "model-00004-of-00004.safetensors",
470
+ "model.vision_backbone.image_vit.transformer.resblocks.22.attention_norm.bias": "model-00004-of-00004.safetensors",
471
+ "model.vision_backbone.image_vit.transformer.resblocks.22.attention_norm.weight": "model-00004-of-00004.safetensors",
472
+ "model.vision_backbone.image_vit.transformer.resblocks.22.feed_forward.w1.bias": "model-00004-of-00004.safetensors",
473
+ "model.vision_backbone.image_vit.transformer.resblocks.22.feed_forward.w1.weight": "model-00004-of-00004.safetensors",
474
+ "model.vision_backbone.image_vit.transformer.resblocks.22.feed_forward.w2.bias": "model-00004-of-00004.safetensors",
475
+ "model.vision_backbone.image_vit.transformer.resblocks.22.feed_forward.w2.weight": "model-00004-of-00004.safetensors",
476
+ "model.vision_backbone.image_vit.transformer.resblocks.22.ffn_norm.bias": "model-00004-of-00004.safetensors",
477
+ "model.vision_backbone.image_vit.transformer.resblocks.22.ffn_norm.weight": "model-00004-of-00004.safetensors",
478
+ "model.vision_backbone.image_vit.transformer.resblocks.3.attention.wk.bias": "model-00004-of-00004.safetensors",
479
+ "model.vision_backbone.image_vit.transformer.resblocks.3.attention.wk.weight": "model-00004-of-00004.safetensors",
480
+ "model.vision_backbone.image_vit.transformer.resblocks.3.attention.wo.bias": "model-00004-of-00004.safetensors",
481
+ "model.vision_backbone.image_vit.transformer.resblocks.3.attention.wo.weight": "model-00004-of-00004.safetensors",
482
+ "model.vision_backbone.image_vit.transformer.resblocks.3.attention.wq.bias": "model-00004-of-00004.safetensors",
483
+ "model.vision_backbone.image_vit.transformer.resblocks.3.attention.wq.weight": "model-00004-of-00004.safetensors",
484
+ "model.vision_backbone.image_vit.transformer.resblocks.3.attention.wv.bias": "model-00004-of-00004.safetensors",
485
+ "model.vision_backbone.image_vit.transformer.resblocks.3.attention.wv.weight": "model-00004-of-00004.safetensors",
486
+ "model.vision_backbone.image_vit.transformer.resblocks.3.attention_norm.bias": "model-00004-of-00004.safetensors",
487
+ "model.vision_backbone.image_vit.transformer.resblocks.3.attention_norm.weight": "model-00004-of-00004.safetensors",
488
+ "model.vision_backbone.image_vit.transformer.resblocks.3.feed_forward.w1.bias": "model-00004-of-00004.safetensors",
489
+ "model.vision_backbone.image_vit.transformer.resblocks.3.feed_forward.w1.weight": "model-00004-of-00004.safetensors",
490
+ "model.vision_backbone.image_vit.transformer.resblocks.3.feed_forward.w2.bias": "model-00004-of-00004.safetensors",
491
+ "model.vision_backbone.image_vit.transformer.resblocks.3.feed_forward.w2.weight": "model-00004-of-00004.safetensors",
492
+ "model.vision_backbone.image_vit.transformer.resblocks.3.ffn_norm.bias": "model-00004-of-00004.safetensors",
493
+ "model.vision_backbone.image_vit.transformer.resblocks.3.ffn_norm.weight": "model-00004-of-00004.safetensors",
494
+ "model.vision_backbone.image_vit.transformer.resblocks.4.attention.wk.bias": "model-00004-of-00004.safetensors",
495
+ "model.vision_backbone.image_vit.transformer.resblocks.4.attention.wk.weight": "model-00004-of-00004.safetensors",
496
+ "model.vision_backbone.image_vit.transformer.resblocks.4.attention.wo.bias": "model-00004-of-00004.safetensors",
497
+ "model.vision_backbone.image_vit.transformer.resblocks.4.attention.wo.weight": "model-00004-of-00004.safetensors",
498
+ "model.vision_backbone.image_vit.transformer.resblocks.4.attention.wq.bias": "model-00004-of-00004.safetensors",
499
+ "model.vision_backbone.image_vit.transformer.resblocks.4.attention.wq.weight": "model-00004-of-00004.safetensors",
500
+ "model.vision_backbone.image_vit.transformer.resblocks.4.attention.wv.bias": "model-00004-of-00004.safetensors",
501
+ "model.vision_backbone.image_vit.transformer.resblocks.4.attention.wv.weight": "model-00004-of-00004.safetensors",
502
+ "model.vision_backbone.image_vit.transformer.resblocks.4.attention_norm.bias": "model-00004-of-00004.safetensors",
503
+ "model.vision_backbone.image_vit.transformer.resblocks.4.attention_norm.weight": "model-00004-of-00004.safetensors",
504
+ "model.vision_backbone.image_vit.transformer.resblocks.4.feed_forward.w1.bias": "model-00004-of-00004.safetensors",
505
+ "model.vision_backbone.image_vit.transformer.resblocks.4.feed_forward.w1.weight": "model-00004-of-00004.safetensors",
506
+ "model.vision_backbone.image_vit.transformer.resblocks.4.feed_forward.w2.bias": "model-00004-of-00004.safetensors",
507
+ "model.vision_backbone.image_vit.transformer.resblocks.4.feed_forward.w2.weight": "model-00004-of-00004.safetensors",
508
+ "model.vision_backbone.image_vit.transformer.resblocks.4.ffn_norm.bias": "model-00004-of-00004.safetensors",
509
+ "model.vision_backbone.image_vit.transformer.resblocks.4.ffn_norm.weight": "model-00004-of-00004.safetensors",
510
+ "model.vision_backbone.image_vit.transformer.resblocks.5.attention.wk.bias": "model-00004-of-00004.safetensors",
511
+ "model.vision_backbone.image_vit.transformer.resblocks.5.attention.wk.weight": "model-00004-of-00004.safetensors",
512
+ "model.vision_backbone.image_vit.transformer.resblocks.5.attention.wo.bias": "model-00004-of-00004.safetensors",
513
+ "model.vision_backbone.image_vit.transformer.resblocks.5.attention.wo.weight": "model-00004-of-00004.safetensors",
514
+ "model.vision_backbone.image_vit.transformer.resblocks.5.attention.wq.bias": "model-00004-of-00004.safetensors",
515
+ "model.vision_backbone.image_vit.transformer.resblocks.5.attention.wq.weight": "model-00004-of-00004.safetensors",
516
+ "model.vision_backbone.image_vit.transformer.resblocks.5.attention.wv.bias": "model-00004-of-00004.safetensors",
517
+ "model.vision_backbone.image_vit.transformer.resblocks.5.attention.wv.weight": "model-00004-of-00004.safetensors",
518
+ "model.vision_backbone.image_vit.transformer.resblocks.5.attention_norm.bias": "model-00004-of-00004.safetensors",
519
+ "model.vision_backbone.image_vit.transformer.resblocks.5.attention_norm.weight": "model-00004-of-00004.safetensors",
520
+ "model.vision_backbone.image_vit.transformer.resblocks.5.feed_forward.w1.bias": "model-00004-of-00004.safetensors",
521
+ "model.vision_backbone.image_vit.transformer.resblocks.5.feed_forward.w1.weight": "model-00004-of-00004.safetensors",
522
+ "model.vision_backbone.image_vit.transformer.resblocks.5.feed_forward.w2.bias": "model-00004-of-00004.safetensors",
523
+ "model.vision_backbone.image_vit.transformer.resblocks.5.feed_forward.w2.weight": "model-00004-of-00004.safetensors",
524
+ "model.vision_backbone.image_vit.transformer.resblocks.5.ffn_norm.bias": "model-00004-of-00004.safetensors",
525
+ "model.vision_backbone.image_vit.transformer.resblocks.5.ffn_norm.weight": "model-00004-of-00004.safetensors",
526
+ "model.vision_backbone.image_vit.transformer.resblocks.6.attention.wk.bias": "model-00004-of-00004.safetensors",
527
+ "model.vision_backbone.image_vit.transformer.resblocks.6.attention.wk.weight": "model-00004-of-00004.safetensors",
528
+ "model.vision_backbone.image_vit.transformer.resblocks.6.attention.wo.bias": "model-00004-of-00004.safetensors",
529
+ "model.vision_backbone.image_vit.transformer.resblocks.6.attention.wo.weight": "model-00004-of-00004.safetensors",
530
+ "model.vision_backbone.image_vit.transformer.resblocks.6.attention.wq.bias": "model-00004-of-00004.safetensors",
531
+ "model.vision_backbone.image_vit.transformer.resblocks.6.attention.wq.weight": "model-00004-of-00004.safetensors",
532
+ "model.vision_backbone.image_vit.transformer.resblocks.6.attention.wv.bias": "model-00004-of-00004.safetensors",
533
+ "model.vision_backbone.image_vit.transformer.resblocks.6.attention.wv.weight": "model-00004-of-00004.safetensors",
534
+ "model.vision_backbone.image_vit.transformer.resblocks.6.attention_norm.bias": "model-00004-of-00004.safetensors",
535
+ "model.vision_backbone.image_vit.transformer.resblocks.6.attention_norm.weight": "model-00004-of-00004.safetensors",
536
+ "model.vision_backbone.image_vit.transformer.resblocks.6.feed_forward.w1.bias": "model-00004-of-00004.safetensors",
537
+ "model.vision_backbone.image_vit.transformer.resblocks.6.feed_forward.w1.weight": "model-00004-of-00004.safetensors",
538
+ "model.vision_backbone.image_vit.transformer.resblocks.6.feed_forward.w2.bias": "model-00004-of-00004.safetensors",
539
+ "model.vision_backbone.image_vit.transformer.resblocks.6.feed_forward.w2.weight": "model-00004-of-00004.safetensors",
540
+ "model.vision_backbone.image_vit.transformer.resblocks.6.ffn_norm.bias": "model-00004-of-00004.safetensors",
541
+ "model.vision_backbone.image_vit.transformer.resblocks.6.ffn_norm.weight": "model-00004-of-00004.safetensors",
542
+ "model.vision_backbone.image_vit.transformer.resblocks.7.attention.wk.bias": "model-00004-of-00004.safetensors",
543
+ "model.vision_backbone.image_vit.transformer.resblocks.7.attention.wk.weight": "model-00004-of-00004.safetensors",
544
+ "model.vision_backbone.image_vit.transformer.resblocks.7.attention.wo.bias": "model-00004-of-00004.safetensors",
545
+ "model.vision_backbone.image_vit.transformer.resblocks.7.attention.wo.weight": "model-00004-of-00004.safetensors",
546
+ "model.vision_backbone.image_vit.transformer.resblocks.7.attention.wq.bias": "model-00004-of-00004.safetensors",
547
+ "model.vision_backbone.image_vit.transformer.resblocks.7.attention.wq.weight": "model-00004-of-00004.safetensors",
548
+ "model.vision_backbone.image_vit.transformer.resblocks.7.attention.wv.bias": "model-00004-of-00004.safetensors",
549
+ "model.vision_backbone.image_vit.transformer.resblocks.7.attention.wv.weight": "model-00004-of-00004.safetensors",
550
+ "model.vision_backbone.image_vit.transformer.resblocks.7.attention_norm.bias": "model-00004-of-00004.safetensors",
551
+ "model.vision_backbone.image_vit.transformer.resblocks.7.attention_norm.weight": "model-00004-of-00004.safetensors",
552
+ "model.vision_backbone.image_vit.transformer.resblocks.7.feed_forward.w1.bias": "model-00004-of-00004.safetensors",
553
+ "model.vision_backbone.image_vit.transformer.resblocks.7.feed_forward.w1.weight": "model-00004-of-00004.safetensors",
554
+ "model.vision_backbone.image_vit.transformer.resblocks.7.feed_forward.w2.bias": "model-00004-of-00004.safetensors",
555
+ "model.vision_backbone.image_vit.transformer.resblocks.7.feed_forward.w2.weight": "model-00004-of-00004.safetensors",
556
+ "model.vision_backbone.image_vit.transformer.resblocks.7.ffn_norm.bias": "model-00004-of-00004.safetensors",
557
+ "model.vision_backbone.image_vit.transformer.resblocks.7.ffn_norm.weight": "model-00004-of-00004.safetensors",
558
+ "model.vision_backbone.image_vit.transformer.resblocks.8.attention.wk.bias": "model-00004-of-00004.safetensors",
559
+ "model.vision_backbone.image_vit.transformer.resblocks.8.attention.wk.weight": "model-00004-of-00004.safetensors",
560
+ "model.vision_backbone.image_vit.transformer.resblocks.8.attention.wo.bias": "model-00004-of-00004.safetensors",
561
+ "model.vision_backbone.image_vit.transformer.resblocks.8.attention.wo.weight": "model-00004-of-00004.safetensors",
562
+ "model.vision_backbone.image_vit.transformer.resblocks.8.attention.wq.bias": "model-00004-of-00004.safetensors",
563
+ "model.vision_backbone.image_vit.transformer.resblocks.8.attention.wq.weight": "model-00004-of-00004.safetensors",
564
+ "model.vision_backbone.image_vit.transformer.resblocks.8.attention.wv.bias": "model-00004-of-00004.safetensors",
565
+ "model.vision_backbone.image_vit.transformer.resblocks.8.attention.wv.weight": "model-00004-of-00004.safetensors",
566
+ "model.vision_backbone.image_vit.transformer.resblocks.8.attention_norm.bias": "model-00004-of-00004.safetensors",
567
+ "model.vision_backbone.image_vit.transformer.resblocks.8.attention_norm.weight": "model-00004-of-00004.safetensors",
568
+ "model.vision_backbone.image_vit.transformer.resblocks.8.feed_forward.w1.bias": "model-00004-of-00004.safetensors",
569
+ "model.vision_backbone.image_vit.transformer.resblocks.8.feed_forward.w1.weight": "model-00004-of-00004.safetensors",
570
+ "model.vision_backbone.image_vit.transformer.resblocks.8.feed_forward.w2.bias": "model-00004-of-00004.safetensors",
571
+ "model.vision_backbone.image_vit.transformer.resblocks.8.feed_forward.w2.weight": "model-00004-of-00004.safetensors",
572
+ "model.vision_backbone.image_vit.transformer.resblocks.8.ffn_norm.bias": "model-00004-of-00004.safetensors",
573
+ "model.vision_backbone.image_vit.transformer.resblocks.8.ffn_norm.weight": "model-00004-of-00004.safetensors",
574
+ "model.vision_backbone.image_vit.transformer.resblocks.9.attention.wk.bias": "model-00004-of-00004.safetensors",
575
+ "model.vision_backbone.image_vit.transformer.resblocks.9.attention.wk.weight": "model-00004-of-00004.safetensors",
576
+ "model.vision_backbone.image_vit.transformer.resblocks.9.attention.wo.bias": "model-00004-of-00004.safetensors",
577
+ "model.vision_backbone.image_vit.transformer.resblocks.9.attention.wo.weight": "model-00004-of-00004.safetensors",
578
+ "model.vision_backbone.image_vit.transformer.resblocks.9.attention.wq.bias": "model-00004-of-00004.safetensors",
579
+ "model.vision_backbone.image_vit.transformer.resblocks.9.attention.wq.weight": "model-00004-of-00004.safetensors",
580
+ "model.vision_backbone.image_vit.transformer.resblocks.9.attention.wv.bias": "model-00004-of-00004.safetensors",
581
+ "model.vision_backbone.image_vit.transformer.resblocks.9.attention.wv.weight": "model-00004-of-00004.safetensors",
582
+ "model.vision_backbone.image_vit.transformer.resblocks.9.attention_norm.bias": "model-00004-of-00004.safetensors",
583
+ "model.vision_backbone.image_vit.transformer.resblocks.9.attention_norm.weight": "model-00004-of-00004.safetensors",
584
+ "model.vision_backbone.image_vit.transformer.resblocks.9.feed_forward.w1.bias": "model-00004-of-00004.safetensors",
585
+ "model.vision_backbone.image_vit.transformer.resblocks.9.feed_forward.w1.weight": "model-00004-of-00004.safetensors",
586
+ "model.vision_backbone.image_vit.transformer.resblocks.9.feed_forward.w2.bias": "model-00004-of-00004.safetensors",
587
+ "model.vision_backbone.image_vit.transformer.resblocks.9.feed_forward.w2.weight": "model-00004-of-00004.safetensors",
588
+ "model.vision_backbone.image_vit.transformer.resblocks.9.ffn_norm.bias": "model-00004-of-00004.safetensors",
589
+ "model.vision_backbone.image_vit.transformer.resblocks.9.ffn_norm.weight": "model-00004-of-00004.safetensors",
590
+ "model.vision_backbone.pad_embed": "model-00004-of-00004.safetensors"
591
+ }
592
+ }
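
Note: the weight map above assigns every parameter name to the safetensors shard that stores it. The snippet below is a minimal sketch (not part of the uploaded files) of how such an index can be resolved by hand; the local path is illustrative and the key shown is simply one entry taken from the map above.

import json
from safetensors import safe_open

repo_dir = "."  # local snapshot of this repository (illustrative path)

# Read the index that maps parameter names to shard files.
with open(f"{repo_dir}/model.safetensors.index.json") as f:
    index = json.load(f)

name = "model.vision_backbone.pad_embed"   # any key from the weight map
shard = index["weight_map"][name]          # e.g. "model-00004-of-00004.safetensors"

# Open only the shard that holds this tensor and fetch it.
with safe_open(f"{repo_dir}/{shard}", framework="pt") as f:
    tensor = f.get_tensor(name)

print(name, tuple(tensor.shape))
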
modeling_molmo.py ADDED
@@ -0,0 +1,2367 @@
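
Note: because the repository ships its own modeling, config, and preprocessing code (modeling_molmo.py, config_molmo.py, preprocessing_molmo.py), loading it through transformers normally requires trust_remote_code. A minimal sketch, not part of the uploaded files, with illustrative paths and argument choices:

from transformers import AutoModelForCausalLM, AutoProcessor

repo_dir = "."  # local snapshot of this repository (illustrative path)

# trust_remote_code tells transformers to use the bundled modeling_molmo.py / preprocessing_molmo.py
model = AutoModelForCausalLM.from_pretrained(repo_dir, trust_remote_code=True, torch_dtype="auto")
processor = AutoProcessor.from_pretrained(repo_dir, trust_remote_code=True)
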
1
+ import logging
2
+ import math
3
+ from copy import deepcopy
4
+ from dataclasses import fields, dataclass, replace
5
+ from enum import Enum
6
+ from typing import List, Optional, Tuple, Union, Dict, Any, Sequence, Callable, cast, MutableMapping
7
+
8
+ import torch
9
+ from einops import einsum, einops
10
+ from transformers import PreTrainedModel, GenerationConfig
11
+ from transformers.cache_utils import Cache
12
+ from transformers.modeling_outputs import CausalLMOutputWithPast, ModelOutput
13
+ from transformers.models.auto import AutoModelForCausalLM
14
+ from torch import nn
15
+
16
+ from .config_molmo import MolmoConfig
17
+ from torch.nn import functional as F
18
+
19
+
20
+ log = logging.getLogger(__name__)
21
+
22
+
23
+ class BufferCache(dict, MutableMapping[str, torch.Tensor]):
24
+ """
25
+ Cache for attention biases and other things that would normally be stored as buffers.
26
+ We avoid using buffers because we've run into various issues doing so with FSDP.
27
+ In general it appears the way FSDP handles buffers is not well-defined.
28
+ It doesn't shard them but apparently it does synchronize them across processes, which we want to avoid
29
+ since (A) it isn't necessary, and (B) we sometimes have `-inf` in these biases which might get turned into
30
+ NaNs when they're synchronized due to casting or some other issue.
31
+ """
32
+
33
+
34
+ class StrEnum(str, Enum):
35
+ def __str__(self) -> str:
36
+ return self.value
37
+
38
+ def __repr__(self) -> str:
39
+ return f"'{str(self)}'"
40
+
41
+
42
+ class ImageProjectType(StrEnum):
43
+ mlp = "mlp"
44
+ mlpx2 = "2mlp"
45
+ linear = "linear"
46
+
47
+
48
+ class ImagePooling2DType(StrEnum):
49
+ attention = "attention"
50
+ attention_meanq = "attention-meanq"
51
+ attention_2wide = "attention_2wide"
52
+ attention_v2 = "attention-v2"
53
+ none = "none"
54
+ stack = "stack"
55
+
56
+
57
+ class ActivationType(StrEnum):
58
+ quick_gelu = "quick_gelu"
59
+ gelu = "gelu"
60
+ gelu_tanh = "gelu_tanh"
61
+ relu = "relu"
62
+ silu = "silu"
63
+ llama_geglu = "llama_geglu"
64
+ llama_geglu_tanh = "llama_geglu_tanh"
65
+ llama_swiglu = "llama_swiglu"
66
+ swiglu = "swiglu"
67
+
68
+
69
+ def ensure_finite_(x: torch.Tensor, check_neg_inf: bool = True, check_pos_inf: bool = False):
70
+ """
71
+ Modify ``x`` in place to replace ``float("-inf")`` with the minimum value of the dtype when ``check_neg_inf``
72
+ is ``True`` and to replace ``float("inf")`` with the maximum value of the dtype when ``check_pos_inf`` is ``True``.
73
+ """
74
+ if check_neg_inf:
75
+ x.masked_fill_(x == float("-inf"), torch.finfo(x.dtype).min)
76
+ if check_pos_inf:
77
+ x.masked_fill_(x == float("inf"), torch.finfo(x.dtype).max)
78
+
79
+
80
+ class MolmoConfigurationError(Exception):
81
+ pass
82
+
83
+
84
+ def _non_meta_init_device(config) -> torch.device:
85
+ if config.init_device is not None and config.init_device != "meta":
86
+ return torch.device(config.init_device)
87
+ else:
88
+ return torch.device("cuda" if torch.cuda.is_available() else "cpu")
89
+
90
+
91
+ class RotaryEmbedding(nn.Module):
92
+ """
93
+ [Rotary positional embeddings (RoPE)](https://arxiv.org/abs/2104.09864).
94
+ """
95
+
96
+ def __init__(self, config: MolmoConfig, cache: BufferCache):
97
+ super().__init__()
98
+ self.config = config
99
+ self.__cache = cache
100
+ # Warm up cache.
101
+ self.get_rotary_embedding(
102
+ config.max_position_embeddings or config.max_sequence_length,
103
+ _non_meta_init_device(config)
104
+ )
105
+
106
+ def get_rotary_embedding(self, seq_len: int, device: torch.device) -> Tuple[torch.Tensor, torch.Tensor]:
107
+ if (
108
+ (pos_sin := self.__cache.get("rope_pos_sin")) is not None
109
+ and (pos_cos := self.__cache.get("rope_pos_cos")) is not None
110
+ and pos_sin.shape[-2] >= seq_len
111
+ and pos_cos.shape[-2] >= seq_len
112
+ ):
113
+ if pos_sin.device != device:
114
+ pos_sin = pos_sin.to(device)
115
+ self.__cache["rope_pos_sin"] = pos_sin
116
+ if pos_cos.device != device:
117
+ pos_cos = pos_cos.to(device)
118
+ self.__cache["rope_pos_cos"] = pos_cos
119
+ return pos_sin[:, :, :seq_len, :], pos_cos[:, :, :seq_len, :]
120
+
121
+ with torch.autocast(device.type, enabled=False):
122
+ dim = self.config.d_model // self.config.n_heads
123
+ inv_freq = 1.0 / (self.config.rope_theta ** (torch.arange(0, dim, 2, device=device, dtype=torch.float) / dim))
124
+ seq = torch.arange(seq_len, device=device, dtype=torch.float)
125
+ freqs = torch.einsum("i , j -> i j", seq, inv_freq)
126
+ if self.config.rope_impl == "interleave":
127
+ positions = freqs.repeat_interleave(2, dim=-1)
128
+ else:
129
+ positions = torch.cat((freqs, freqs), dim=-1)
130
+ pos_sin, pos_cos = positions.sin()[None, None, :, :], positions.cos()[None, None, :, :]
131
+ self.__cache["rope_pos_sin"] = pos_sin
132
+ self.__cache["rope_pos_cos"] = pos_cos
133
+ return pos_sin, pos_cos
134
+
135
+ def rotate_half(self, x: torch.Tensor) -> torch.Tensor:
136
+ B, nh, T, hs = x.size()
137
+ x = x.view(B, nh, T, 2, hs // 2)
138
+ x1, x2 = x.unbind(dim=-2)
139
+ return torch.cat((-x2, x1), dim=-1)
140
+
141
+ def rotate_every_two(self, x: torch.Tensor) -> torch.Tensor:
142
+ B, nh, T, hs = x.size()
143
+ x = x.view(B, nh, T, hs // 2, 2)
144
+ x1, x2 = x.unbind(dim=-1)
145
+ x = torch.stack((-x2, x1), dim=-1)
146
+ return x.view(B, nh, T, hs)
147
+
148
+ def apply_rotary_pos_emb(self, pos_sin: torch.Tensor, pos_cos: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
149
+ if self.config.rope_impl == "interleave":
150
+ return ((t * pos_cos) + (self.rotate_every_two(t) * pos_sin)).to(t.dtype)
151
+ else:
152
+ return ((t * pos_cos) + (self.rotate_half(t) * pos_sin)).to(t.dtype)
153
+
154
+ def forward(
155
+ self,
156
+ q: torch.Tensor,
157
+ k: torch.Tensor,
158
+ position_ids: Optional[torch.Tensor] = None
159
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
160
+ if self.config.rope_full_precision:
161
+ q_, k_ = q.float(), k.float()
162
+ else:
163
+ q_, k_ = q, k
164
+
165
+ with torch.autocast(q.device.type, enabled=False):
166
+ batch_size = q_.shape[0]
167
+ query_len, key_len = q_.shape[-2], k_.shape[-2] # could be different if layer_past not None
168
+ if position_ids is not None:
169
+ freqs_cis_len = (self.config.max_position_embeddings or self.config.max_sequence_length)
170
+ else:
171
+ freqs_cis_len = key_len
172
+ pos_sin, pos_cos = self.get_rotary_embedding(freqs_cis_len, q_.device)
173
+ pos_sin = pos_sin.type_as(q_)
174
+ pos_cos = pos_cos.type_as(q_)
175
+ if position_ids is not None:
176
+ assert query_len == key_len, "Query and key lengths must be equal when using position IDs."
177
+ pos_sin = pos_sin[0, 0][position_ids].view(
178
+ (batch_size, 1, key_len, pos_sin.shape[-1])
179
+ )
180
+ pos_cos = pos_cos[0, 0][position_ids].view(
181
+ (batch_size, 1, key_len, pos_cos.shape[-1])
182
+ )
183
+ q_ = self.apply_rotary_pos_emb(
184
+ pos_sin[:, :, key_len - query_len : key_len, :],
185
+ pos_cos[:, :, key_len - query_len : key_len, :],
186
+ q_,
187
+ )
188
+ k_ = self.apply_rotary_pos_emb(pos_sin, pos_cos, k_)
189
+ return q_.type_as(q), k_.type_as(k)
190
+
191
+
192
+ class MolmoBlock(nn.Module):
193
+ """
194
+ A base class for transformer block implementations.
195
+ """
196
+
197
+ def __init__(self, layer_id: int, config: MolmoConfig, cache: BufferCache):
198
+ super().__init__()
199
+ self.layer_id = layer_id
200
+ self.config = config
201
+ self.hidden_size = (
202
+ config.mlp_hidden_size if config.mlp_hidden_size is not None else config.mlp_ratio * config.d_model
203
+ )
204
+ self.__cache = cache
205
+ self._activation_checkpoint_fn = None
206
+
207
+ # Dropout.
208
+ self.dropout = Dropout(config.residual_dropout)
209
+
210
+ # Layer norms.
211
+ self.k_norm: Optional[LayerNormBase] = None
212
+ self.q_norm: Optional[LayerNormBase] = None
213
+ if config.attention_layer_norm:
214
+ assert config.effective_n_kv_heads is not None
215
+ self.k_norm = LayerNormBase.build(
216
+ config,
217
+ size=(config.d_model // config.n_heads) * config.effective_n_kv_heads,
218
+ elementwise_affine=config.attention_layer_norm_with_affine,
219
+ )
220
+ self.q_norm = LayerNormBase.build(config, elementwise_affine=config.attention_layer_norm_with_affine)
221
+
222
+ # Make sure QKV clip coefficient is positive, otherwise it's not well-defined.
223
+ if config.clip_qkv is not None:
224
+ assert config.clip_qkv > 0
225
+
226
+ # Activation function.
227
+ self.act = Activation.build(config)
228
+ assert (self.act.output_multiplier * self.hidden_size) % 1 == 0
229
+
230
+ # Attention output projection.
231
+ input_dim = config.d_model
232
+ self.attn_out = nn.Linear(
233
+ input_dim, config.d_model,
234
+ bias=config.include_bias,
235
+ device=config.init_device
236
+ )
237
+
238
+ # Feed-forward output projection.
239
+ self.ff_out = nn.Linear(
240
+ int(self.act.output_multiplier * self.hidden_size),
241
+ config.d_model,
242
+ bias=config.include_bias,
243
+ device=config.init_device,
244
+ )
245
+ self.ff_out._is_residual = True # type: ignore
246
+
247
+ # Rotary embeddings.
248
+ if self.config.rope:
249
+ self.rotary_emb = RotaryEmbedding(config, self.__cache)
250
+
251
+ self.flash_attn_func = None
252
+ if config.attention_type == "flash":
253
+ try:
254
+ from flash_attn import flash_attn_func # type: ignore
255
+
256
+ self.flash_attn_func = flash_attn_func
257
+ except ModuleNotFoundError:
258
+ pass
259
+
260
+ def reset_parameters(self):
261
+ if self.k_norm is not None:
262
+ self.k_norm.reset_parameters()
263
+ if self.q_norm is not None:
264
+ self.q_norm.reset_parameters()
265
+ init_weights(
266
+ self.config,
267
+ self.attn_out,
268
+ d=self.config.d_model,
269
+ layer_id=self.layer_id,
270
+ type_of_module=ModuleType.out_module,
271
+ )
272
+ init_weights(
273
+ self.config,
274
+ self.ff_out,
275
+ d=self.ff_out.in_features,
276
+ layer_id=self.layer_id,
277
+ type_of_module=ModuleType.out_module,
278
+ )
279
+
280
+ @classmethod
281
+ def _cast_attn_bias(cls, bias: torch.Tensor, input_dtype: torch.dtype) -> torch.Tensor:
282
+ target_dtype = input_dtype
283
+ # NOTE: `is_autocast_enabled()` only checks for CUDA autocast, so we use the separate function
284
+ # `is_autocast_cpu_enabled()` for CPU autocast.
285
+ # See https://github.com/pytorch/pytorch/issues/110966.
286
+ if bias.device.type == "cuda" and torch.is_autocast_enabled():
287
+ target_dtype = torch.get_autocast_gpu_dtype()
288
+ elif bias.device.type == "cpu" and torch.is_autocast_cpu_enabled():
289
+ target_dtype = torch.get_autocast_cpu_dtype()
290
+ if bias.dtype != target_dtype:
291
+ bias = bias.to(target_dtype)
292
+ ensure_finite_(bias, check_neg_inf=True, check_pos_inf=False)
293
+ return bias
294
+
295
+ def _scaled_dot_product_attention(
296
+ self,
297
+ q: torch.Tensor,
298
+ k: torch.Tensor,
299
+ v: torch.Tensor,
300
+ attn_mask: Optional[torch.Tensor] = None,
301
+ dropout_p: float = 0.0,
302
+ response_dropout_p: float = 0.0,
303
+ is_causal: bool = False,
304
+ ) -> torch.Tensor:
305
+ """
306
+ Computes scaled dot product attention on query, key and value tensors, using an optional
307
+ attention mask if passed, and applying dropout if a probability greater than 0.0 is specified.
308
+ """
309
+ if attn_mask is not None:
310
+ attn_mask = attn_mask.to(q.device)
311
+
312
+ if self.flash_attn_func is not None and attn_mask is None:
313
+ r = self.flash_attn_func(
314
+ q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), dropout_p=dropout_p, causal=is_causal
315
+ )
316
+ return r.transpose(1, 2)
317
+ else:
318
+ # torch's sdpa doesn't support GQA, so we're doing this
319
+ assert k.size(1) == v.size(1)
320
+ num_kv_heads = k.size(1)
321
+ num_q_heads = q.size(1)
322
+ if num_q_heads != num_kv_heads:
323
+ assert num_q_heads % num_kv_heads == 0
324
+ k = k.repeat_interleave(num_q_heads // num_kv_heads, dim=1, output_size=num_q_heads)
325
+ v = v.repeat_interleave(num_q_heads // num_kv_heads, dim=1, output_size=num_q_heads)
326
+
327
+ return F.scaled_dot_product_attention(
328
+ q,
329
+ k,
330
+ v,
331
+ attn_mask=attn_mask,
332
+ dropout_p=dropout_p,
333
+ is_causal=is_causal,
334
+ )
335
+
336
+ def attention(
337
+ self,
338
+ q: torch.Tensor,
339
+ k: torch.Tensor,
340
+ v: torch.Tensor,
341
+ attention_bias: Optional[torch.Tensor] = None,
342
+ position_ids: Optional[torch.Tensor] = None,
343
+ layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
344
+ use_cache: bool = False,
345
+ ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
346
+ B, T, C = q.size() # batch size, sequence length, d_model
347
+ dtype = k.dtype
348
+
349
+ # Optionally apply layer norm to keys and queries.
350
+ if self.q_norm is not None and self.k_norm is not None:
351
+ q = self.q_norm(q).to(dtype=dtype)
352
+ k = self.k_norm(k).to(dtype=dtype)
353
+
354
+ # Move head forward to be next to the batch dim.
355
+ # shape: (B, nh, T, hs)
356
+ q = q.view(B, T, self.config.n_heads, C // self.config.n_heads).transpose(1, 2)
357
+ # shape: (B, n_kv_h, T, hs)
358
+ k = k.view(B, T, self.config.effective_n_kv_heads, C // self.config.n_heads).transpose(1, 2)
359
+ # shape: (B, n_kv_h, T, hs)
360
+ v = v.view(B, T, self.config.effective_n_kv_heads, C // self.config.n_heads).transpose(1, 2)
361
+
362
+ if self.config.use_position_ids and self.config.rope:
363
+ # Apply rotary embeddings
364
+ q, k = self.rotary_emb(q, k, position_ids=position_ids)
365
+
366
+ if layer_past is not None:
367
+ past_key, past_value = layer_past
368
+ k = torch.cat((past_key.to(k.device), k), dim=-2)
369
+ v = torch.cat((past_value.to(v.device), v), dim=-2)
370
+
371
+ present = (k, v) if use_cache else None
372
+ query_len, key_len = q.shape[-2], k.shape[-2] # could be different if layer_past not None
373
+
374
+ if not self.config.use_position_ids and self.config.rope:
375
+ # Apply rotary embeddings
376
+ q, k = self.rotary_emb(q, k)
377
+
378
+ if attention_bias is not None:
379
+ # Resize and cast attention bias.
380
+ # The current dtype of the attention bias might not match the dtype that the SDP attn function will
381
+ # run in if AMP is enabled, and this can be a problem if some tokens are masked out due to padding
382
+ # as down-casting the attention bias to the autocast precision will result in -infs, which will
383
+ # cause the SDP attn function to produce NaNs.
384
+ attention_bias = self._cast_attn_bias(
385
+ attention_bias[:, :, key_len - query_len : key_len, :key_len], dtype
386
+ )
387
+
388
+ # Get the attention scores.
389
+ # shape: (B, nh, T, hs)
390
+ att = self._scaled_dot_product_attention(
391
+ q,
392
+ k,
393
+ v,
394
+ attn_mask=attention_bias,
395
+ dropout_p=0.0 if not self.training else self.config.attention_dropout,
396
+ response_dropout_p=0.0 if not self.training else self.config.response_attention_dropout,
397
+ is_causal=attention_bias is None,
398
+ )
399
+
400
+ # Re-assemble all head outputs side-by-side.
401
+ att = att.transpose(1, 2).contiguous().view(B, T, C)
402
+
403
+ # Apply output projection.
404
+ return self.attn_out(att), present
405
+
406
+ def forward(
407
+ self,
408
+ x: torch.Tensor,
409
+ attention_bias: Optional[torch.FloatTensor] = None,
410
+ position_ids: Optional[torch.Tensor] = None,
411
+ layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
412
+ use_cache: bool = False,
413
+ ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
414
+ raise NotImplementedError
415
+
416
+ @classmethod
417
+ def build(cls, layer_id: int, config: MolmoConfig, cache: BufferCache):
418
+ return MolmoSequentialBlock(layer_id, config, cache)
419
+
420
+
421
+ class MolmoSequentialBlock(MolmoBlock):
422
+ """
423
+ This is a typical transformer block where the output is computed as ``MLP(LN(x + Attention(LN(x))))``
424
+ (plus another skip connection).
425
+ """
426
+
427
+ def __init__(self, layer_id: int, config: MolmoConfig, cache: BufferCache):
428
+ super().__init__(layer_id, config, cache)
429
+ # Layer norms.
430
+ self.attn_norm = LayerNorm.build(config)
431
+ self.ff_norm = LayerNorm.build(config)
432
+ # Attention input projection. Projects x -> (q, k, v)
433
+
434
+ head_dim = config.d_model // config.n_heads
435
+ self.fused_dims = (
436
+ config.d_model,
437
+ config.effective_n_kv_heads * head_dim,
438
+ config.effective_n_kv_heads * head_dim,
439
+ )
440
+ self.att_proj = nn.Linear(
441
+ config.d_model, sum(self.fused_dims),
442
+ bias=config.include_bias or config.qkv_bias,
443
+ device=config.init_device
444
+ )
445
+ # Feed-forward input projection.
446
+ self.ff_proj = nn.Linear(
447
+ config.d_model, self.hidden_size, bias=config.include_bias, device=config.init_device
448
+ )
449
+
450
+ def reset_parameters(self):
451
+ super().reset_parameters()
452
+ self.attn_norm.reset_parameters()
453
+ self.ff_norm.reset_parameters()
454
+ # NOTE: the standard deviation for these weights does not depend on the layer.
455
+ init_weights(
456
+ self.config, self.att_proj, d=self.config.d_model, layer_id=None, type_of_module=ModuleType.in_module
457
+ )
458
+ init_weights(
459
+ self.config, self.ff_proj, d=self.config.d_model, layer_id=None, type_of_module=ModuleType.in_module
460
+ )
461
+
462
+ def forward(
463
+ self,
464
+ x: torch.Tensor,
465
+ attention_bias: Optional[torch.Tensor] = None,
466
+ position_ids: Optional[torch.Tensor] = None,
467
+ layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
468
+ use_cache: bool = False,
469
+ ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
470
+ # Get query, key, value projections.
471
+ # shape:
472
+ # - for regular attn q, k, v: (batch_size, seq_len, d_model)
473
+ # - for multi-query attn q: (batch_size, seq_len, d_model)
474
+ # k, v: (batch_size, seq_len, d_model // n_heads)
475
+ # - for group query attn q: (batch_size, seq_len, d_model)
476
+ # k, v: (batch_size, seq_len, d_model // n_kv_heads)
477
+
478
+ if not self.config.norm_after:
479
+ if self._activation_checkpoint_fn is not None:
480
+ atten_in = self._activation_checkpoint_fn(self.attn_norm, x)
481
+ else:
482
+ atten_in = self.attn_norm(x)
483
+ else:
484
+ atten_in = x
485
+ qkv = self.att_proj(atten_in)
486
+
487
+ if self.config.clip_qkv is not None:
488
+ qkv.clamp_(min=-self.config.clip_qkv, max=self.config.clip_qkv)
489
+
490
+ q, k, v = qkv.split(self.fused_dims, dim=-1)
491
+
492
+ # Get attention scores.
493
+ if self._activation_checkpoint_fn is not None:
494
+ att, cache = self._activation_checkpoint_fn( # type: ignore
495
+ self.attention, q, k, v, attention_bias, position_ids=position_ids, layer_past=layer_past, use_cache=use_cache
496
+ )
497
+ else:
498
+ att, cache = self.attention(q, k, v, attention_bias, position_ids=position_ids, layer_past=layer_past, use_cache=use_cache)
499
+
500
+ if self.config.norm_after:
501
+ if self._activation_checkpoint_fn is not None:
502
+ att = self._activation_checkpoint_fn(self.attn_norm, att)
503
+ else:
504
+ att = self.attn_norm(att)
505
+
506
+ # Add attention scores.
507
+ # shape: (B, T, C)
508
+ x = x + self.dropout(att)
509
+
510
+ # Add feed-forward projection.
511
+ # shape: (batch_size, seq_len, d_model)
512
+ og_x = x
513
+
514
+ if not self.config.norm_after:
515
+ if self._activation_checkpoint_fn is not None:
516
+ x = self._activation_checkpoint_fn(self.ff_norm, x) # type: ignore
517
+ else:
518
+ x = self.ff_norm(x)
519
+
520
+ x = self.ff_proj(x)
521
+ if self._activation_checkpoint_fn is not None:
522
+ x = self._activation_checkpoint_fn(self.act, x) # type: ignore
523
+ else:
524
+ x = self.act(x)
525
+ x = self.ff_out(x)
526
+
527
+ if self.config.norm_after:
528
+ if self._activation_checkpoint_fn is not None:
529
+ x = self._activation_checkpoint_fn(self.ff_norm, x) # type: ignore
530
+ else:
531
+ x = self.ff_norm(x)
532
+
533
+ x = self.dropout(x)
534
+ x = og_x + x
535
+
536
+ return x, cache
537
+
538
+
539
+ class Embedding(nn.Module):
540
+ def __init__(
541
+ self,
542
+ num_embeddings: int,
543
+ num_new_embeddings: int,
544
+ features: int,
545
+ device: Union[str, torch.device],
546
+ initializer_range: float = 0.02,
547
+ new_embed_initializer_range: float = 0.02,
548
+ ):
549
+ super().__init__()
550
+ self.initializer_range = initializer_range
551
+ self.new_embed_initializer_range = new_embed_initializer_range
552
+ self.embedding = nn.Parameter(
553
+ torch.zeros(num_embeddings, features, device=device),
554
+ )
555
+ self.new_embedding = nn.Parameter(
556
+ torch.zeros(num_new_embeddings, features, device=device),
557
+ )
558
+
559
+ def reset_parameters(self):
560
+ nn.init.normal_(self.embedding, std=self.initializer_range)
561
+ nn.init.normal_(self.new_embedding, std=self.new_embed_initializer_range)
562
+
563
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
564
+ return F.embedding(x, torch.cat([self.embedding, self.new_embedding], dim=0))
565
+
566
+
567
+ class Dropout(nn.Dropout):
568
+ def __init__(
569
+ self,
570
+ p: float = 0.5,
571
+ inplace: bool = False,
572
+ mask_p: float = 0,
573
+ broadcast_dims: Sequence[int] = (),
574
+ ):
575
+ super().__init__(p, inplace)
576
+ self.mask_p = mask_p
577
+ self.broadcast_dims = broadcast_dims
578
+
579
+ def forward(self, input: torch.Tensor) -> torch.Tensor:
580
+ """
581
+ :param input: A tensor of shape `(batch_size, seq_len, embed_dim)`
582
+ """
583
+ if self.p == 0.0 and (self.mask_p is None or self.mask_p == 0.0):
584
+ return input
585
+ else:
586
+ if self.p > 0. and len(self.broadcast_dims) > 0 and self.training:
587
+ keep_prob = 1.0 - self.p
588
+ dropout_shape = list(input.shape)
589
+ for dim in self.broadcast_dims:
590
+ dropout_shape[dim] = 1
591
+ keep = input.new_empty(dropout_shape).bernoulli_(keep_prob)
592
+ multiplier = keep.broadcast_to(input.shape)
593
+ multiplier.div_(keep_prob)
594
+ input = input * multiplier
595
+ else:
596
+ return F.dropout(input, self.p, self.training, self.inplace)
597
+
598
+
599
+ @dataclass
600
+ class VisionBackboneConfig:
601
+ image_default_input_size: Tuple[int, int] = (336, 336)
602
+ image_patch_size: int = 14
603
+ image_pos_patch_size: int = 14
604
+ image_emb_dim: int = 1024
605
+ image_num_heads: int = 16
606
+ image_num_key_value_heads: int = 16
607
+ image_num_layers: int = 24
608
+ image_head_dim: int = 64
609
+ image_mlp_dim: int = 4096
610
+ image_mlp_activations: str = "gelu"
611
+ image_dropout_rate: float = 0.0
612
+ image_num_pos: int = 577
613
+ image_norm_eps: float = 1e-5
614
+ attention_dropout: float = 0.0
615
+ residual_dropout: float = 0.0
616
+ initializer_range: float = 0.02
617
+ fsdp_wrap: bool = False
618
+ resize_mode: str = "default"
619
+
620
+ def __post_init__(self):
621
+ self.image_default_input_size = tuple(self.image_default_input_size) # type: ignore[assignment]
622
+
623
+ @property
624
+ def image_num_patch(self):
625
+ h, w = self.image_default_input_size
626
+ return h // self.image_patch_size, w // self.image_patch_size
627
+
628
+
629
+ @dataclass
630
+ class FullMolmoConfig:
631
+ d_model: int = 768
632
+ n_heads: int = 12
633
+ n_kv_heads: Optional[int] = None
634
+ qkv_bias: bool = False
635
+ clip_qkv: Optional[float] = None
636
+ n_layers: int = 12
637
+ mlp_ratio: int = 4
638
+ mlp_hidden_size: Optional[int] = None
639
+ activation_type: str = "swiglu"
640
+ block_group_size: int = 1
641
+ rope: bool = True
642
+ rope_full_precision: bool = True
643
+ rope_theta: float = 10000.
644
+ rope_impl: str = "interleave"
645
+ vision_backbone: Optional[VisionBackboneConfig] = None
646
+ attention_type: str = "sdpa"
647
+ float32_attention: bool = True
648
+ attention_dropout: float = 0.1
649
+ response_attention_dropout: float = 0.0
650
+ multi_query_attention: Optional[bool] = None
651
+ attention_layer_norm: bool = False
652
+ residual_dropout: float = 0.1
653
+ embedding_dropout: float = 0.1
654
+ layer_norm_type: str = "default"
655
+ layer_norm_with_affine: bool = True
656
+ layer_norm_eps: Optional[float] = None
657
+ attention_layer_norm_with_affine: bool = True
658
+ max_sequence_length: int = 1024
659
+ max_position_embeddings: Optional[int] = None
660
+ include_bias: bool = True
661
+ bias_for_layer_norm: Optional[bool] = None
662
+ scale_logits: bool = False
663
+ vocab_size: int = 50257
664
+ embedding_size: Optional[int] = 50304
665
+ additional_vocab_size: Optional[int] = None
666
+ new_embedding_init_range: float = 0.02
667
+ weight_tying: bool = True
668
+ pad_token_id: int = -1
669
+ init_device: Optional[str] = None
670
+ init_std: float = 0.02
671
+ init_cutoff_factor: Optional[float] = None
672
+ norm_after: bool = False
673
+ precision: Optional[str] = None
674
+ image_padding_embed: Optional[str] = None
675
+ vit_layers: Tuple = (-1,)
676
+ image_pooling_h: int = 2
677
+ image_pooling_w: int = 2
678
+ image_pooling_2d: str = "attention"
679
+ image_projector: str = "mlp"
680
+ image_feature_dropout: float = 0.0
681
+ initializer_range: float = 0.02
682
+ normalize_input_embeds: bool = False
683
+ use_position_ids: bool = True
684
+
685
+ @property
686
+ def effective_n_kv_heads(self) -> int:
687
+ if self.n_kv_heads is None:
688
+ if self.multi_query_attention is True:
689
+ return 1
690
+ else:
691
+ return self.n_heads
692
+ else:
693
+ if self.multi_query_attention is None:
694
+ return self.n_kv_heads
695
+ if self.multi_query_attention:
696
+ n_kv_heads_should_be = 1
697
+ else:
698
+ n_kv_heads_should_be = self.n_heads
699
+ if self.n_kv_heads == n_kv_heads_should_be:
700
+ return n_kv_heads_should_be
701
+ else:
702
+ raise MolmoConfigurationError(
703
+ "You can't set `multi_query_attention` and `n_kv_heads` at the same time."
704
+ )
705
+
706
+ @property
707
+ def image_num_patch(self):
708
+ assert self.vision_backbone is not None
709
+ return self.vision_backbone.image_num_patch
710
+
711
+ @property
712
+ def image_patch_size(self):
713
+ assert self.vision_backbone is not None
714
+ return self.vision_backbone.image_patch_size
715
+
716
+ def llm_patches_per_crop(self):
717
+ h, w = self.image_num_patch
718
+ # Round up in case we need to pad the image features for pooling
719
+ h = (h + self.image_pooling_h - 1) // self.image_pooling_h
720
+ w = (w + self.image_pooling_w - 1) // self.image_pooling_w
721
+ return h, w
722
+
723
+
724
+ def _expand_token(token, batch_size: int):
725
+ return token.view(1, 1, -1).expand(batch_size, -1, -1)
726
+
727
+
728
+ class ViTMLP(nn.Module):
729
+ def __init__(self, config: FullMolmoConfig):
730
+ super().__init__()
731
+ self.config = config
732
+ v_cfg = config.vision_backbone
733
+
734
+ self.w1 = nn.Linear(
735
+ v_cfg.image_emb_dim,
736
+ v_cfg.image_mlp_dim,
737
+ bias=True,
738
+ device=config.init_device,
739
+ )
740
+ # Activation function.
741
+ cfg = deepcopy(config)
742
+ cfg.activation_type = v_cfg.image_mlp_activations
743
+ self.act = Activation.build(cfg)
744
+ self.w2 = nn.Linear(
745
+ v_cfg.image_mlp_dim,
746
+ v_cfg.image_emb_dim,
747
+ bias=True,
748
+ device=config.init_device,
749
+ )
750
+
751
+ def reset_parameters(self):
752
+ v_cfg = self.config.vision_backbone
753
+ nn.init.trunc_normal_(self.w1.weight, std=math.sqrt(1 / v_cfg.image_emb_dim), a=-2.0, b=2.0)
754
+ nn.init.trunc_normal_(self.w2.weight, std=math.sqrt(1 / v_cfg.image_mlp_dim), a=-2.0, b=2.0)
755
+ nn.init.zeros_(self.w1.bias)
756
+ nn.init.zeros_(self.w2.bias)
757
+
758
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
759
+ x = self.w1(x)
760
+ x = self.act(x)
761
+ x = self.w2(x)
762
+ return x
763
+
764
+
765
+ class ResidualAttentionBlock(nn.Module):
766
+
767
+ def __init__(self, config: FullMolmoConfig):
768
+ super().__init__()
769
+ self.config = config
770
+
771
+ v_cfg = config.vision_backbone
772
+ self.attention = MultiHeadDotProductAttention(config)
773
+ self.feed_forward = ViTMLP(config)
774
+ self.attention_norm = nn.LayerNorm(
775
+ v_cfg.image_emb_dim,
776
+ eps=v_cfg.image_norm_eps,
777
+ device=config.init_device,
778
+ )
779
+ self.ffn_norm = nn.LayerNorm(
780
+ v_cfg.image_emb_dim,
781
+ eps=v_cfg.image_norm_eps,
782
+ device=config.init_device,
783
+ )
784
+
785
+ def reset_parameters(self):
786
+ self.attention.reset_parameters()
787
+ self.feed_forward.reset_parameters()
788
+ self.attention_norm.reset_parameters()
789
+ self.ffn_norm.reset_parameters()
790
+
791
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
792
+ x = x + self.attention(self.attention_norm(x))
793
+ x = x + self.feed_forward(self.ffn_norm(x))
794
+ return x
795
+
796
+
797
+ class BlockCollection(nn.Module):
798
+
799
+ def __init__(self, config: FullMolmoConfig):
800
+ super().__init__()
801
+ self.config = config
802
+ self.grad_checkpointing: bool = False
803
+
804
+ v_cfg = config.vision_backbone
805
+ self.resblocks = nn.ModuleList([
806
+ ResidualAttentionBlock(config) for _ in range(v_cfg.image_num_layers)
807
+ ])
808
+
809
+ def reset_parameters(self):
810
+ for r in self.resblocks:
811
+ r.reset_parameters()
812
+
813
+ def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
814
+ hidden_states = []
815
+ for r in self.resblocks:
816
+ x = r(x)
817
+ hidden_states.append(x)
818
+ return hidden_states
819
+
820
+
821
+ class LayerNormFp32(nn.LayerNorm):
822
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
823
+ orig_type = x.dtype
824
+ x = F.layer_norm(x.to(torch.float32), self.normalized_shape, self.weight.to(torch.float32),
825
+ self.bias.to(torch.float32), self.eps)
826
+ return x.to(orig_type)
827
+
828
+
829
+ class VisionTransformer(nn.Module):
830
+
831
+ def __init__(self, config: FullMolmoConfig):
832
+ super().__init__()
833
+ self.config = config
834
+
835
+ v_cfg = config.vision_backbone
836
+ # class embeddings and positional embeddings
837
+ self.scale = v_cfg.image_emb_dim ** -0.5
838
+ self.class_embedding = nn.Parameter(
839
+ torch.zeros(v_cfg.image_emb_dim, device=config.init_device),
840
+ )
841
+ self.num_prefix_tokens: int = 1
842
+ self.positional_embedding = nn.Parameter(
843
+ torch.zeros(v_cfg.image_num_pos, v_cfg.image_emb_dim, device=config.init_device),
844
+ )
845
+
846
+ image_patch_size = v_cfg.image_patch_size
847
+ self.patch_embedding = nn.Linear(
848
+ image_patch_size * image_patch_size * 3,
849
+ v_cfg.image_emb_dim,
850
+ bias=False,
851
+ device=config.init_device,
852
+ )
853
+
854
+ self.pre_ln = LayerNormFp32(
855
+ v_cfg.image_emb_dim,
856
+ eps=v_cfg.image_norm_eps,
857
+ )
858
+
859
+ self.transformer = BlockCollection(config)
860
+
861
+ @torch.jit.ignore
862
+ def set_grad_checkpointing(self, enable=True):
863
+ self.transformer.grad_checkpointing = enable
864
+
865
+ def reset_parameters(self):
866
+ nn.init.normal_(self.class_embedding, std=self.scale)
867
+ nn.init.normal_(self.positional_embedding, std=self.scale)
868
+ nn.init.normal_(self.patch_embedding.weight, std=0.02)
869
+ self.pre_ln.reset_parameters()
870
+ self.transformer.reset_parameters()
871
+
872
+ def add_pos_emb(self, x: torch.Tensor, patch_num: int) -> torch.Tensor:
873
+ cls_emb = self.positional_embedding[0:1]
874
+ pos_emb = self.positional_embedding[1:]
875
+
876
+ pos_emb = pos_emb.reshape(
877
+ (int(math.sqrt(pos_emb.shape[0])), int(math.sqrt(pos_emb.shape[0])), pos_emb.shape[1])
878
+ )
879
+
880
+ (patch_num_0, patch_num_1) = patch_num
881
+
882
+ if pos_emb.shape[0] != patch_num_0 or pos_emb.shape[1] != patch_num_1:
883
+ # Derived from https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py
884
+ # antialias: default True in jax.image.resize
885
+ pos_emb = pos_emb.unsqueeze(0).permute(0, 3, 1, 2)
886
+ pos_emb = F.interpolate(
887
+ pos_emb, size=(patch_num_0, patch_num_1), mode="bicubic", align_corners=False, antialias=True,
888
+ )
889
+ pos_emb = pos_emb.permute(0, 2, 3, 1).squeeze(0)
890
+
891
+ pos_emb = pos_emb.reshape(-1, pos_emb.shape[-1])
892
+ x = x + torch.cat([cls_emb[None, :, :], pos_emb[None, :, :]], dim=1).to(x.dtype)
893
+ return x
894
+
895
+ def forward(self, x: torch.Tensor, patch_num: Optional[Tuple[int, int]] = None) -> List[torch.Tensor]:
896
+ """
897
+ : param x: (batch_size, num_patch, n_pixels)
898
+ """
899
+ if patch_num is None:
900
+ patch_num = self.config.vision_backbone.image_num_patch
901
+ B, N, D = x.shape
902
+
903
+ x = self.patch_embedding(x)
904
+
905
+ # class embeddings and positional embeddings
906
+ x = torch.cat([_expand_token(self.class_embedding, x.shape[0]).to(x.dtype), x], dim=1)
907
+ x = self.add_pos_emb(x, patch_num)
908
+
909
+ x = self.pre_ln(x)
910
+
911
+ hidden_states = self.transformer(x)
912
+ return hidden_states
913
+
914
+
915
+ class MultiHeadDotProductAttention(nn.Module):
916
+ def __init__(self, config: FullMolmoConfig, use_bias: bool = True, is_vit_layer: Optional[bool] = True):
917
+ super().__init__()
918
+ self.config = config
919
+ self.use_bias = use_bias
920
+
921
+ v_cfg = config.vision_backbone
922
+ self.embed_dim = v_cfg.image_emb_dim
923
+ self.num_heads = v_cfg.image_num_heads
924
+ self.head_dim = v_cfg.image_head_dim
925
+ self.num_key_value_heads = v_cfg.image_num_key_value_heads
926
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
927
+ self.initializer_range = v_cfg.initializer_range
928
+ self.is_vit_layer = is_vit_layer
929
+
930
+ nlayers = 1 if (is_vit_layer or config.vit_layers is None) else len(config.vit_layers)
931
+
932
+ self.wq = nn.Linear(
933
+ nlayers * self.embed_dim,
934
+ self.num_heads * self.head_dim,
935
+ bias=use_bias,
936
+ device=config.init_device,
937
+ )
938
+ self.wk = nn.Linear(
939
+ nlayers * self.embed_dim,
940
+ self.num_key_value_heads * self.head_dim,
941
+ bias=use_bias,
942
+ device=config.init_device,
943
+ )
944
+ self.wv = nn.Linear(
945
+ nlayers * self.embed_dim,
946
+ self.num_key_value_heads * self.head_dim,
947
+ bias=use_bias,
948
+ device=config.init_device,
949
+ )
950
+ self.wo = nn.Linear(
951
+ self.num_heads * self.head_dim,
952
+ self.embed_dim,
953
+ bias=use_bias,
954
+ device=config.init_device,
955
+ )
956
+ self.attention_dropout: Optional[Dropout] = None
957
+ if v_cfg.attention_dropout > 0:
958
+ self.attention_dropout = Dropout(v_cfg.attention_dropout, broadcast_dims=(0, 1))
959
+ self.residual_dropout = Dropout(v_cfg.residual_dropout)
960
+
961
+ def reset_parameters(self):
962
+ nn.init.normal_(self.wq.weight, std=self.initializer_range)
963
+ nn.init.normal_(self.wk.weight, std=self.initializer_range)
964
+ nn.init.normal_(self.wv.weight, std=self.initializer_range)
965
+ nn.init.normal_(self.wo.weight, std=self.initializer_range)
966
+ if self.use_bias:
967
+ nn.init.constant_(self.wq.bias, 0)
968
+ nn.init.constant_(self.wk.bias, 0)
969
+ nn.init.constant_(self.wv.bias, 0)
970
+ nn.init.constant_(self.wo.bias, 0)
971
+
972
+ def _split_heads(self, hidden_states, num_heads) -> torch.Tensor:
973
+ return hidden_states.reshape(hidden_states.shape[:2] + (num_heads, self.head_dim))
974
+
975
+ def _merge_heads(self, hidden_states) -> torch.Tensor:
976
+ return hidden_states.reshape(hidden_states.shape[:2] + (self.embed_dim,))
977
+
978
+ def forward(self, inputs_q: torch.Tensor, inputs_kv: Optional[torch.Tensor] = None) -> torch.Tensor:
979
+
980
+ if inputs_kv is not None:
981
+ inputs_k = inputs_kv
982
+ inputs_v = inputs_kv
983
+ else:
984
+ inputs_k = inputs_q
985
+ inputs_v = inputs_q
986
+
987
+ xq, xk, xv = self.wq(inputs_q), self.wk(inputs_k), self.wv(inputs_v)
988
+
989
+ xq = self._split_heads(xq, self.num_heads)
990
+ xk = self._split_heads(xk, self.num_key_value_heads)
991
+ xv = self._split_heads(xv, self.num_key_value_heads)
992
+
993
+ if self.num_heads != self.num_key_value_heads:
994
+ xk = xk.repeat_interleave(self.num_key_value_groups, dim=2, output_size=self.num_heads)
995
+ xv = xv.repeat_interleave(self.num_key_value_groups, dim=2, output_size=self.num_heads)
996
+
997
+ og_dtype = xq.dtype
998
+
999
+ if self.config.float32_attention:
1000
+ xq = xq.to(torch.float)
1001
+ xk = xk.to(torch.float)
1002
+
1003
+ if self.config.attention_type == "direct":
1004
+ attn_weights = torch.einsum("...qhd,...khd->...hqk", xq / math.sqrt(xq.size(-1)), xk)
1005
+ attn_weights = F.softmax(attn_weights, dim=-1, dtype=torch.float32).to(xq.dtype)
1006
+ if self.attention_dropout is not None:
1007
+ attn_weights = self.attention_dropout(attn_weights)
1008
+ attn_output = torch.einsum("...hqk,...khd->...qhd", attn_weights.to(xv.dtype), xv)
1009
+
1010
+ elif self.config.attention_type == "sdpa":
1011
+ if self.config.float32_attention and not torch.is_autocast_enabled():
1012
+ xv = xv.to(torch.float32)
1013
+ attn_output = F.scaled_dot_product_attention(
1014
+ xq.transpose(1, 2).contiguous(),
1015
+ xk.transpose(1, 2).contiguous(),
1016
+ xv.transpose(1, 2).contiguous(),
1017
+ is_causal=False,
1018
+ dropout_p=self.config.vision_backbone.attention_dropout
1019
+ ).transpose(1, 2)
1020
+ else:
1021
+ raise NotImplementedError(self.config.attention_type)
1022
+ attn_output = attn_output.to(og_dtype)
1023
+ attn_output = self._merge_heads(attn_output)
1024
+ attn_output = self.wo(attn_output)
1025
+ attn_output = self.residual_dropout(attn_output)
1026
+
1027
+ return attn_output
1028
+
1029
+
1030
+ class MultiHeadAttentionPool(nn.Module):
1031
+ def __init__(
1032
+ self,
1033
+ config: FullMolmoConfig,
1034
+ factor: int = 1,
1035
+ use_bias: bool = True,
1036
+ dropout: bool = True,
1037
+ output_layer: bool = True,
1038
+ mean_residual: bool = False,
1039
+ query: str = "mean",
1040
+ is_vit_layer: Optional[bool] = True
1041
+ ):
1042
+ super().__init__()
1043
+ self.config = config
1044
+ self.factor = factor
1045
+ self.use_bias = use_bias
1046
+ self.dropout = dropout
1047
+ self.output_layer = output_layer
1048
+ self.mean_residual = mean_residual
1049
+ self.query = query
1050
+
1051
+ v_cfg = config.vision_backbone
1052
+ input_dim = v_cfg.image_emb_dim
1053
+ self.embed_dim = v_cfg.image_emb_dim * factor
1054
+ self.num_heads = v_cfg.image_num_heads
1055
+ self.head_dim = v_cfg.image_head_dim * factor
1056
+ self.num_key_value_heads = v_cfg.image_num_key_value_heads
1057
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
1058
+ self.initializer_range = v_cfg.initializer_range
1059
+
1060
+ nlayers = 1 if (is_vit_layer or config.vit_layers is None) else len(config.vit_layers)
1061
+
1062
+ if query != "vector":
1063
+ self.wq = nn.Linear(
1064
+ nlayers * input_dim,
1065
+ self.num_heads * self.head_dim,
1066
+ bias=use_bias,
1067
+ device=config.init_device,
1068
+ )
1069
+ self.wk = nn.Linear(
1070
+ nlayers * input_dim,
1071
+ self.num_key_value_heads * self.head_dim,
1072
+ bias=use_bias,
1073
+ device=config.init_device,
1074
+ )
1075
+ self.wv = nn.Linear(
1076
+ nlayers * input_dim,
1077
+ self.num_key_value_heads * self.head_dim,
1078
+ bias=use_bias,
1079
+ device=config.init_device,
1080
+ )
1081
+
1082
+ if query == "vector":
1083
+ self.attention_query = nn.Parameter(
1084
+ torch.zeros(
1085
+ 1, self.num_key_value_heads * self.head_dim, device=config.init_device,
1086
+ ),
1087
+ )
1088
+
1089
+ if output_layer:
1090
+ self.wo = nn.Linear(
1091
+ self.num_heads * self.head_dim,
1092
+ self.embed_dim,
1093
+ bias=use_bias,
1094
+ device=config.init_device,
1095
+ )
1096
+ self.attention_dropout = Dropout(v_cfg.attention_dropout, broadcast_dims=(0, 1))
1097
+ if dropout:
1098
+ self.residual_dropout = Dropout(v_cfg.residual_dropout)
1099
+
1100
+ def reset_parameters(self):
1101
+ if self.query != "vector":
1102
+ nn.init.normal_(self.wq.weight, std=self.initializer_range)
1103
+ nn.init.normal_(self.wk.weight, std=self.initializer_range)
1104
+ nn.init.normal_(self.wv.weight, std=self.initializer_range)
1105
+ if self.output_layer:
1106
+ nn.init.normal_(self.wo.weight, std=self.initializer_range)
1107
+ if self.use_bias:
1108
+ if self.query != "vector":
1109
+ nn.init.constant_(self.wq.bias, 0)
1110
+ nn.init.constant_(self.wk.bias, 0)
1111
+ nn.init.constant_(self.wv.bias, 0)
1112
+ if self.output_layer:
1113
+ nn.init.constant_(self.wo.bias, 0)
1114
+ if self.query == "vector":
1115
+ nn.init.normal_(self.attention_query, std=self.initializer_range)
1116
+
1117
+ def _split_heads(self, hidden_states, num_heads):
1118
+ return hidden_states.reshape(hidden_states.shape[:2] + (num_heads, self.head_dim))
1119
+
1120
+ def _merge_heads(self, hidden_states):
1121
+ return hidden_states.reshape(hidden_states.shape[:2] + (self.embed_dim,))
1122
+
1123
+ def forward(self, inputs_kv: torch.Tensor) -> torch.Tensor:
1124
+
1125
+ xk, xv = self.wk(inputs_kv), self.wv(inputs_kv)
1126
+
1127
+ if self.query == "mean":
1128
+ inputs_q = inputs_kv.mean(dim=1, keepdim=True)
1129
+ xq = self.wq(inputs_q)
1130
+ elif self.query == "first":
1131
+ inputs_q = inputs_kv[:, :1]
1132
+ xq = self.wq(inputs_q)
1133
+ elif self.query == "vector":
1134
+ xq = self.attention_query.expand(inputs_kv.size(0), -1, -1)
1135
+ elif self.query == "constant":
1136
+ inputs_q = torch.ones_like(inputs_kv[:, :1]) / math.sqrt(inputs_kv.shape[-1])
1137
+ xq = self.wq(inputs_q)
1138
+ else:
1139
+ raise ValueError(f"Unknown query type: {self.query}")
1140
+
1141
+ xq = self._split_heads(xq, self.num_heads)
1142
+ xk = self._split_heads(xk, self.num_key_value_heads)
1143
+ xv = self._split_heads(xv, self.num_key_value_heads)
1144
+
1145
+ if self.num_heads != self.num_key_value_heads:
1146
+ xk = xk.repeat_interleave(self.num_key_value_groups, dim=2, output_size=self.num_heads)
1147
+ xv = xv.repeat_interleave(self.num_key_value_groups, dim=2, output_size=self.num_heads)
1148
+
1149
+ xq = xq.to(torch.float)
1150
+ xk = xk.to(torch.float)
1151
+
1152
+ xq = xq / math.sqrt(xq.size(-1))
1153
+ attn_weights = torch.einsum("...qhd,...khd->...hqk", xq, xk)
1154
+
1155
+ attn_weights = F.softmax(attn_weights, dim=-1).to(xq.dtype)
1156
+
1157
+ attn_weights = self.attention_dropout(attn_weights).to(xv.dtype)
1158
+
1159
+ attn_output = torch.einsum("...hqk,...khd->...qhd", attn_weights, xv)
1160
+ attn_output = self._merge_heads(attn_output)
1161
+ if self.output_layer:
1162
+ attn_output = self.wo(attn_output)
1163
+ if self.dropout:
1164
+ attn_output = self.residual_dropout(attn_output)
1165
+ if self.mean_residual:
1166
+ attn_output += inputs_kv.mean(dim=1, keepdim=True)
1167
+
1168
+ return attn_output
1169
+
1170
+
1171
+ class MLP(nn.Module):
1172
+ def __init__(self, config: FullMolmoConfig, input_dim: int, dropout: float = 0.0):
1173
+ super().__init__()
1174
+ self.config = config
1175
+ self.hidden_size = (
1176
+ config.mlp_hidden_size if config.mlp_hidden_size is not None else config.mlp_ratio * config.d_model
1177
+ )
1178
+ self.initializer_range = config.initializer_range
1179
+
1180
+ self.w1 = nn.Linear(
1181
+ input_dim,
1182
+ self.hidden_size // 2,
1183
+ bias=False,
1184
+ device=config.init_device,
1185
+ )
1186
+ self.w2 = nn.Linear(
1187
+ self.hidden_size // 2,
1188
+ config.d_model,
1189
+ bias=False,
1190
+ device=config.init_device,
1191
+ )
1192
+ self.w3 = nn.Linear(
1193
+ input_dim,
1194
+ self.hidden_size // 2,
1195
+ bias=False,
1196
+ device=config.init_device,
1197
+ )
1198
+ # Activation function.
1199
+ self.act = Activation.build(config)
1200
+ self.dropout = Dropout(dropout)
1201
+
1202
+ def reset_parameters(self):
1203
+ nn.init.normal_(self.w1.weight, std=self.initializer_range)
1204
+ nn.init.normal_(self.w2.weight, std=self.initializer_range)
1205
+ nn.init.normal_(self.w3.weight, std=self.initializer_range)
1206
+
1207
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
1208
+ x = self.w2(self.act(self.w1(x), self.w3(x)))
1209
+ x = self.dropout(x)
1210
+ return x
1211
+
1212
+
1213
+ class Residual(nn.Module):
1214
+ def __init__(self, submodule: nn.Module):
1215
+ super().__init__()
1216
+ self.submodule = submodule
1217
+
1218
+ def reset_parameters(self):
1219
+ self.submodule.reset_parameters()
1220
+
1221
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
1222
+ return x + self.submodule(x)
1223
+
1224
+
1225
+ class OLMoVisionBackbone(nn.Module):
1226
+ def __init__(self, config: FullMolmoConfig):
1227
+ super().__init__()
1228
+ self.config = config
1229
+ self.image_vit = VisionTransformer(config)
1230
+
1231
+ input_dim: int = None
1232
+ self.image_pooling_2d: nn.Module = None
1233
+ if config.image_pooling_2d in {ImagePooling2DType.attention, ImagePooling2DType.attention_meanq}:
1234
+ self.image_pooling_2d = MultiHeadDotProductAttention(config, is_vit_layer=False)
1235
+ input_dim = config.vision_backbone.image_emb_dim
1236
+ elif config.image_pooling_2d == ImagePooling2DType.attention_2wide:
1237
+ cfg = deepcopy(config)
1238
+ cfg.vision_backbone.image_emb_dim *= 2
1239
+ cfg.vision_backbone.image_head_dim *= 2
1240
+ self.image_pooling_2d = MultiHeadDotProductAttention(cfg, is_vit_layer=False)
1241
+ input_dim = cfg.vision_backbone.image_emb_dim
1242
+ elif config.image_pooling_2d == ImagePooling2DType.attention_v2:
1243
+ assert config.vit_layers is not None
1244
+ use_bias = True
1245
+ dropout = True
1246
+ output_layer = True
1247
+ query = "mean"
1248
+ mean_residual = False
1249
+ factor = len(config.vit_layers)
1250
+ self.image_pooling_2d = MultiHeadAttentionPool(
1251
+ config,
1252
+ factor=factor,
1253
+ use_bias=use_bias,
1254
+ dropout=dropout,
1255
+ output_layer=output_layer,
1256
+ mean_residual=mean_residual,
1257
+ query=query,
1258
+ is_vit_layer=False,
1259
+ )
1260
+ input_dim = config.vision_backbone.image_emb_dim * factor
1261
+ elif config.image_pooling_2d in [ImagePooling2DType.none, ImagePooling2DType.stack]:
1262
+ self.image_pooling_2d = None
1263
+ nlayers = 1 if config.vit_layers is None else len(config.vit_layers)
1264
+ input_dim = nlayers * config.vision_backbone.image_emb_dim
1265
+ else:
1266
+ raise NotImplementedError(f"Unknown image pooling 2D method: {config.image_pooling_2d}")
1267
+
1268
+ self.input_dim = input_dim
1269
+
1270
+ # `MLP` assumes the activation takes two inputs, so it must be a 'llama' version
1271
+ if config.activation_type == ActivationType.swiglu:
1272
+ mlp_config = replace(config, activation_type=ActivationType.llama_swiglu)
1273
+ elif config.activation_type == ActivationType.gelu:
1274
+ mlp_config = replace(config, activation_type=ActivationType.llama_geglu)
1275
+ else:
1276
+ mlp_config = config
1277
+ if config.image_projector == ImageProjectType.mlpx2:
1278
+ self.image_projector = nn.ModuleList(
1279
+ [MLP(mlp_config, input_dim), Residual(MLP(config, input_dim))]
1280
+ )
1281
+ elif config.image_projector == ImageProjectType.mlp:
1282
+ self.image_projector = MLP(mlp_config, input_dim)
1283
+ elif config.image_projector == ImageProjectType.linear:
1284
+ self.image_projector = nn.Linear(
1285
+ input_dim,
1286
+ config.d_model,
1287
+ bias=False,
1288
+ device=config.init_device,
1289
+ )
1290
+ else:
1291
+ raise NotImplementedError(f"Unknown image projector: {config.image_projector}")
1292
+
1293
+ self.image_feature_dropout = Dropout(config.image_feature_dropout)
1294
+
1295
+ def reset_parameters(self):
1296
+ if self.image_pooling_2d is not None:
1297
+ self.image_pooling_2d.reset_parameters()
1298
+ if self.config.image_projector == "2mlp":
1299
+ for module in self.image_projector:
1300
+ module.reset_parameters()
1301
+ elif self.config.image_projector == "linear":
1302
+ nn.init.xavier_uniform_(self.image_projector.weight)
1303
+ else:
1304
+ self.image_projector.reset_parameters()
1305
+
1306
+ def forward(self, images: torch.Tensor, image_masks: torch.Tensor) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
1307
+ raise NotImplementedError
1308
+
1309
+
1310
+ class OLMoPretrainedVisionBackbone(OLMoVisionBackbone):
1311
+ def __init__(self, config: FullMolmoConfig):
1312
+ super().__init__(config)
1313
+ v_cfg = self.config.vision_backbone
1314
+ self.grad_checkpointing = False
1315
+
1316
+ self.num_prefix_tokens = self.image_vit.num_prefix_tokens
1317
+ assert self.num_prefix_tokens in {0, 1}, "Only 0 or 1 prefix tokens are supported"
1318
+
1319
+ self.pad_embed = None
1320
+ if config.image_padding_embed:
1321
+ image_dim = v_cfg.image_emb_dim*len(self.config.vit_layers)
1322
+ if config.image_padding_embed in ["pad_embed", "regress"]:
1323
+ self.pad_embed = nn.Parameter(
1324
+ torch.zeros((image_dim,), device=config.init_device))
1325
+ elif config.image_padding_embed == "pad_and_partial_pad":
1326
+ self.pad_embed = nn.Parameter(
1327
+ torch.zeros((2, image_dim), device=config.init_device))
1328
+ else:
1329
+ raise ValueError(config.image_padding_embed)
1330
+
1331
+ def reset_parameters(self):
1332
+ super().reset_parameters()
1333
+ self.image_vit.reset_parameters()
1334
+
1335
+ def encode_image(self, images: torch.Tensor) -> torch.Tensor:
1336
+ """
1337
+ : param images: (batch_size, num_crops, num_patch, n_pixels)
1338
+ """
1339
+ cfg = self.config
1340
+ v_cfg = self.config.vision_backbone
1341
+ B, T, N, D = images.shape
1342
+
1343
+ mask = ~torch.all(images.view(B * T, N, D) == -1, dim=(1, 2), keepdim=True)
1344
+
1345
+ # Output all hidden states
1346
+ # n_layers x (batch_num_crops, (1+)n_tokens, image_emb_dim)
1347
+ images = images.view(B * T, N, D)
1348
+ image_features = self.image_vit(images)
1349
+
1350
+ if cfg.vit_layers is not None:
1351
+ features = []
1352
+ for layer in cfg.vit_layers:
1353
+ features.append(image_features[layer])
1354
+ image_features = torch.cat(features, dim=-1)
1355
+ else:
1356
+ image_features = image_features[-1]
1357
+
1358
+ cls_embed: torch.Tensor = None
1359
+ if self.num_prefix_tokens > 0:
1360
+ cls_embed = image_features[:, 0]
1361
+ image_features = image_features[:, 1:]
1362
+
1363
+ image_features = image_features * mask
1364
+ image_features = image_features.view(B, T, N, -1)
1365
+
1366
+ cls_embed = cls_embed.view(B, T, -1) if cls_embed is not None else None
1367
+
1368
+ return image_features, cls_embed
1369
+
1370
+ def forward(self, images: torch.Tensor, image_masks: torch.Tensor) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
1371
+ cfg = self.config
1372
+
1373
+ # image_features: (batch_size, num_crops(=num_image), num_patch, nximage_emb_dim)
1374
+ batch_size, num_image = images.shape[:2]
1375
+ image_features, cls_embed = self.encode_image(images)
1376
+
1377
+ if cfg.image_padding_embed:
1378
+ assert image_masks is not None
1379
+ if cfg.image_padding_embed == "pad_embed":
1380
+ all_pad = (image_masks == 0).to(dtype=torch.float32)
1381
+ pad_embed = self.pad_embed[None, None, None, :]
1382
+ image_features = image_features + pad_embed * torch.unsqueeze(all_pad, -1)
1383
+ elif cfg.image_padding_embed == "regress":
1384
+ pad_embed = self.pad_embed[None, None, None, :]
1385
+ image_features = image_features + pad_embed * torch.unsqueeze(torch.maximum(image_masks, torch.zeros_like(image_masks)), -1)
1386
+ elif cfg.image_padding_embed == "pad_and_partial_pad":
1387
+ pad_embed = self.pad_embed[:, None, None, None, :]
1388
+ all_pad = image_masks == 0
1389
+ partial_pad = torch.logical_and(image_masks < 1, torch.logical_not(all_pad)).to(dtype=image_features.dtype)
1390
+ all_pad = all_pad.to(dtype=image_features.dtype)
1391
+ image_features = image_features + pad_embed[0] * torch.unsqueeze(all_pad, -1)
1392
+ image_features = image_features + pad_embed[1] * torch.unsqueeze(partial_pad, -1)
1393
+ else:
1394
+ raise ValueError(cfg.image_padding_embed)
1395
+
1396
+ image_features = self.image_feature_dropout(image_features)
1397
+ if cls_embed is not None:
1398
+ cls_embed = self.image_feature_dropout(cls_embed)
1399
+
1400
+ image_features = image_features.reshape(
1401
+ (batch_size, num_image) + cfg.image_num_patch + (-1,),
1402
+ )
1403
+
1404
+ if cfg.image_num_patch[0] % cfg.image_pooling_h == 1:
1405
+ # Pad so we can still pool 2x2 patches
1406
+ image_features = F.pad(
1407
+ image_features,
1408
+ (0, 0, 0, 1, 0, 1, 0, 0, 0, 0),
1409
+ )
1410
+
1411
+ # image pooling
1412
+ image_features = einops.rearrange(
1413
+ image_features,
1414
+ 'b n (h dh) (w dw) c -> (b n h w) (dh dw) c',
1415
+ dh=cfg.image_pooling_h,
1416
+ dw=cfg.image_pooling_w,
1417
+ )
1418
+
1419
+ if cfg.image_pooling_2d == ImagePooling2DType.attention_meanq:
1420
+ query = image_features.mean(-2, keepdim=True)
1421
+ image_features = self.image_pooling_2d(query, image_features)
1422
+ elif cfg.image_pooling_2d not in {ImagePooling2DType.none, ImagePooling2DType.stack}:
1423
+ if self.grad_checkpointing:
1424
+ from torch.utils.checkpoint import checkpoint
1425
+ image_features = checkpoint(self.image_pooling_2d, image_features[:, :1, :], image_features, use_reentrant=False)
1426
+ else:
1427
+ image_features = self.image_pooling_2d(image_features[:, :1, :], image_features)
1428
+
1429
+ h, w = cfg.llm_patches_per_crop()
1430
+ image_features = image_features.reshape(batch_size, num_image, h * w, -1)
1431
+
1432
+ # MLP layer to map the feature.
1433
+ if self.grad_checkpointing:
1434
+ from torch.utils.checkpoint import checkpoint
1435
+ image_features = checkpoint(self.image_projector, image_features, use_reentrant=False)
1436
+ else:
1437
+ image_features = self.image_projector(image_features)
1438
+
1439
+ # image_features: (batch_size, num_image, num_patch, d_model)
1440
+ # cls_embed: (batch_size, num_image, d_model)
1441
+ return image_features, cls_embed
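To make the pooling above concrete: with the 336-pixel crops and 14-pixel patches configured later in this file, each crop is a 24x24 patch grid, and the 2x2 attention pooling reduces it to the 12x12 token grid that image_token_length_w/h in the processor config refer to. A rough arithmetic sketch, assuming those defaults:

# Sketch of the token-count arithmetic for one crop, assuming the defaults
# used elsewhere in this upload (336x336 crops, 14px patches, 2x2 pooling).
crop_size, patch_size = 336, 14
patches_per_side = crop_size // patch_size        # 24
pool_h = pool_w = 2
tokens_per_side = patches_per_side // pool_h      # 12
tokens_per_crop = tokens_per_side ** 2            # 144 pooled tokens per crop
print(patches_per_side, tokens_per_crop)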
1442
+
1443
+
1444
+ class ModuleType(str, Enum):
1445
+ in_module = "in"
1446
+ out_module = "out"
1447
+ emb = "emb"
1448
+ final_out = "final_out"
1449
+
1450
+
1451
+ def init_weights(
1452
+ config: FullMolmoConfig,
1453
+ module: Union[nn.Linear, nn.Embedding],
1454
+ d: Optional[int] = None,
1455
+ layer_id: Optional[int] = None,
1456
+ std_factor: float = 1.0,
1457
+ type_of_module: Optional[ModuleType] = None,
1458
+ ) -> None:
1459
+ d = d if d is not None else config.d_model
1460
+ std = config.init_std * std_factor
1461
+ if config.init_cutoff_factor is not None:
1462
+ cutoff_value = config.init_cutoff_factor * std
1463
+ nn.init.trunc_normal_(module.weight, mean=0.0, std=std, a=-cutoff_value, b=cutoff_value)
1464
+ else:
1465
+ nn.init.normal_(module.weight, mean=0.0, std=std)
1466
+
1467
+
1468
+ class LlamaSwiGLU(nn.Module):
1469
+ def forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor:
1470
+ return F.silu(x1) * x2
1471
+
1472
+ @property
1473
+ def output_multiplier(self) -> float:
1474
+ return 0.5
1475
+
1476
+
1477
+ class SwiGLU(nn.Module):
1478
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
1479
+ x, gate = x.chunk(2, dim=-1)
1480
+ return F.silu(gate) * x
1481
+
1482
+ @property
1483
+ def output_multiplier(self) -> float:
1484
+ return 0.5
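output_multiplier = 0.5 records that SwiGLU chunks its input in two along the last axis, so the activation emits half as many features as the MLP's up-projection produces. A standalone check of that behaviour:

# Standalone sketch: SwiGLU halves the last dimension of its input.
import torch
import torch.nn.functional as F

x = torch.randn(2, 8)            # pretend up-projection of width 8
x_half, gate = x.chunk(2, dim=-1)
out = F.silu(gate) * x_half
print(out.shape)                 # torch.Size([2, 4]); hence output_multiplier = 0.5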
1485
+
1486
+
1487
+ class Activation(nn.Module):
1488
+ def __init__(self, config: FullMolmoConfig):
1489
+ super().__init__()
1490
+ self.config = config
1491
+
1492
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
1493
+ raise NotImplementedError
1494
+
1495
+ @property
1496
+ def output_multiplier(self) -> float:
1497
+ raise NotImplementedError
1498
+
1499
+ @classmethod
1500
+ def build(cls, config: FullMolmoConfig) -> 'Activation':
1501
+ if config.activation_type == "quick_gelu":
1502
+ return QuickGELU(config)
1503
+ elif config.activation_type == "gelu":
1504
+ return cast(Activation, GELU(approximate="none"))
1505
+ elif config.activation_type == "gelu_tanh":
1506
+ return cast(Activation, GELU(approximate="tanh"))
1507
+ elif config.activation_type == "relu":
1508
+ return cast(Activation, ReLU(inplace=False))
1509
+ elif config.activation_type == "silu":
1510
+ return cast(Activation, SiLU(inplace=False))
1511
+ # elif config.activation_type == "llama_geglu":
1512
+ # return LlamaGEGLU(config)
1513
+ # elif config.activation_type == "llama_geglu_tanh":
1514
+ # return LlamaGEGLUTanh(config)
1515
+ elif config.activation_type == "llama_swiglu":
1516
+ return LlamaSwiGLU()
1517
+ elif config.activation_type == "swiglu":
1518
+ return SwiGLU()
1519
+ else:
1520
+ raise NotImplementedError(f"Unknown activation: '{config.activation_type}'")
1521
+
1522
+
1523
+ class QuickGELU(Activation):
1524
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
1525
+ return x * torch.sigmoid(1.702 * x)
1526
+
1527
+ @property
1528
+ def output_multiplier(self) -> float:
1529
+ return 1.0
1530
+
1531
+
1532
+ class GELU(nn.GELU):
1533
+ @property
1534
+ def output_multiplier(self) -> float:
1535
+ return 1.0
1536
+
1537
+
1538
+ class ReLU(nn.ReLU):
1539
+ @property
1540
+ def output_multiplier(self) -> float:
1541
+ return 1.0
1542
+
1543
+
1544
+ class SiLU(nn.SiLU):
1545
+ @property
1546
+ def output_multiplier(self) -> float:
1547
+ return 1.0
1548
+
1549
+
1550
+ def causal_attention_bias(seq_len: int, device: torch.device) -> torch.FloatTensor:
1551
+ att_bias = torch.triu(
1552
+ torch.ones(seq_len, seq_len, device=device, dtype=torch.float),
1553
+ diagonal=1,
1554
+ )
1555
+ att_bias.masked_fill_(att_bias == 1, torch.finfo(att_bias.dtype).min)
1556
+ return att_bias.view(1, 1, seq_len, seq_len) # type: ignore
1557
+
1558
+
1559
+ def get_causal_attention_bias(cache: BufferCache, seq_len: int, device: torch.device) -> torch.Tensor:
1560
+ if (causal_bias := cache.get("causal_attention_bias")) is not None and causal_bias.shape[-1] >= seq_len:
1561
+ if causal_bias.device != device:
1562
+ causal_bias = causal_bias.to(device)
1563
+ cache["causal_attention_bias"] = causal_bias
1564
+ return causal_bias
1565
+ with torch.autocast(device.type, enabled=False):
1566
+ causal_bias = causal_attention_bias(seq_len, device)
1567
+ cache["causal_attention_bias"] = causal_bias
1568
+ return causal_bias
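The cached bias is additive: zeros on and below the diagonal, the dtype's most negative value above it, shaped (1, 1, seq_len, seq_len) so it broadcasts over batch and heads. A quick illustration for seq_len=3:

# Quick illustration of the additive causal bias for seq_len=3.
import torch

seq_len = 3
bias = torch.triu(torch.ones(seq_len, seq_len), diagonal=1)
bias.masked_fill_(bias == 1, torch.finfo(bias.dtype).min)
print(bias)                                     # 0 on/below the diagonal, ~-3.4e38 above it
print(bias.view(1, 1, seq_len, seq_len).shape)  # broadcastable over batch and heads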
1569
+
1570
+
1571
+ class LayerNormBase(nn.Module):
1572
+ def __init__(
1573
+ self,
1574
+ config: MolmoConfig,
1575
+ *,
1576
+ size: Optional[int] = None,
1577
+ elementwise_affine: Optional[bool] = True,
1578
+ eps: float = 1e-05,
1579
+ weight_initializer: Optional[Callable] = torch.ones,
1580
+ bias_initializer: Optional[Callable] = torch.zeros,
1581
+ ):
1582
+ super().__init__()
1583
+ self.config = config
1584
+ self.eps = self.config.layer_norm_eps or eps
1585
+ self.normalized_shape = (size or config.d_model,)
1586
+ if elementwise_affine or (elementwise_affine is None and self.config.layer_norm_with_affine):
1587
+ self.weight = nn.Parameter(weight_initializer(self.normalized_shape, device=config.init_device))
1588
+ use_bias = self.config.bias_for_layer_norm
1589
+ if use_bias is None:
1590
+ use_bias = self.config.include_bias
1591
+ if use_bias:
1592
+ self.bias = nn.Parameter(bias_initializer(self.normalized_shape, device=config.init_device))
1593
+ else:
1594
+ self.register_parameter("bias", None)
1595
+ else:
1596
+ self.register_parameter("bias", None)
1597
+ self.register_parameter("weight", None)
1598
+
1599
+ @classmethod
1600
+ def build(cls, config: FullMolmoConfig, size: Optional[int] = None, **kwargs):
1601
+ if config.layer_norm_type == "default":
1602
+ return LayerNorm(config, size=size, low_precision=False, **kwargs)
1603
+ elif config.layer_norm_type == "low_precision":
1604
+ return LayerNorm(config, size=size, low_precision=True, **kwargs)
1605
+ elif config.layer_norm_type == "rms":
1606
+ return RMSLayerNorm(config, size=size, **kwargs)
1607
+ else:
1608
+ raise NotImplementedError(f"Unknown LayerNorm type: '{config.layer_norm_type}'")
1609
+
1610
+
1611
+ class RMSLayerNorm(LayerNormBase):
1612
+ """
1613
+ RMS layer norm, a simplified :class:`LayerNorm` implementation
1614
+ """
1615
+
1616
+ def __init__(
1617
+ self,
1618
+ config: FullMolmoConfig,
1619
+ size: Optional[int] = None,
1620
+ elementwise_affine: Optional[bool] = None,
1621
+ eps: float = 1e-5,
1622
+ ):
1623
+ super().__init__(config, size=size, elementwise_affine=elementwise_affine, eps=eps)
1624
+
1625
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
1626
+ with torch.autocast(enabled=False, device_type=x.device.type):
1627
+ og_dtype = x.dtype
1628
+ x = x.to(torch.float32)
1629
+ variance = x.pow(2).mean(-1, keepdim=True)
1630
+ x = x * torch.rsqrt(variance + self.eps)
1631
+ x = x.to(og_dtype)
1632
+
1633
+ if self.weight is not None:
1634
+ if self.bias is not None:
1635
+ return self.weight * x + self.bias
1636
+ else:
1637
+ return self.weight * x
1638
+ else:
1639
+ return x
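As a sanity check, the autocast-disabled math above is the usual RMSNorm, x / sqrt(mean(x**2) + eps), optionally scaled by the affine weight; a minimal numeric comparison in plain tensor ops:

# Minimal numeric check of the RMSNorm math used above (no config plumbing).
import torch

x = torch.randn(4, 16)
eps = 1e-5
weight = torch.ones(16)

as_implemented = weight * (x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps))
textbook = weight * x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + eps)
print(torch.allclose(as_implemented, textbook))   # True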
1640
+
1641
+
1642
+ class LayerNorm(LayerNormBase):
1643
+ """
1644
+ The default :class:`LayerNorm` implementation which can optionally run in low precision.
1645
+ """
1646
+
1647
+ def __init__(
1648
+ self,
1649
+ config: FullMolmoConfig,
1650
+ size: Optional[int] = None,
1651
+ low_precision: bool = False,
1652
+ elementwise_affine: Optional[bool] = None,
1653
+ eps: float = 1e-05,
1654
+ ):
1655
+ super().__init__(config, size=size, elementwise_affine=elementwise_affine, eps=eps)
1656
+ self.low_precision = low_precision
1657
+
1658
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
1659
+ if self.low_precision:
1660
+ module_device = x.device
1661
+ downcast_x = self._cast_if_autocast_enabled(x)
1662
+ downcast_weight = (
1663
+ self._cast_if_autocast_enabled(self.weight) if self.weight is not None else self.weight
1664
+ )
1665
+ downcast_bias = self._cast_if_autocast_enabled(self.bias) if self.bias is not None else self.bias
1666
+ with torch.autocast(enabled=False, device_type=module_device.type):
1667
+ return F.layer_norm(
1668
+ downcast_x, self.normalized_shape, weight=downcast_weight, bias=downcast_bias, eps=self.eps
1669
+ )
1670
+ else:
1671
+ return F.layer_norm(x, self.normalized_shape, weight=self.weight, bias=self.bias, eps=self.eps)
1672
+
1673
+
1674
+ class Molmo(nn.Module):
1675
+ def __init__(self, config: FullMolmoConfig, init_params: bool = True):
1676
+ super().__init__()
1677
+ self.config = config
1678
+ self.__cache = BufferCache()
1679
+
1680
+ # Validate config.
1681
+ if self.config.embedding_size is not None and self.config.embedding_size != self.config.vocab_size:
1682
+ if self.config.embedding_size < self.config.vocab_size:
1683
+ raise MolmoConfigurationError("embedding size should be at least as big as vocab size")
1684
+ elif self.config.embedding_size % 128 != 0:
1685
+ import warnings
1686
+
1687
+ warnings.warn(
1688
+ "Embedding size is not a multiple of 128! This could hurt throughput performance.", UserWarning
1689
+ )
1690
+ torch.backends.cuda.enable_flash_sdp(True)
1691
+ torch.backends.cuda.enable_mem_efficient_sdp(False) # this is super slow so make sure torch won't use it
1692
+
1693
+ wte = None
1694
+ if self.config.additional_vocab_size is not None:
1695
+ wte = Embedding(
1696
+ config.embedding_size or config.vocab_size,
1697
+ config.additional_vocab_size,
1698
+ config.d_model,
1699
+ device=config.init_device,
1700
+ initializer_range=config.initializer_range,
1701
+ new_embed_initializer_range=config.new_embedding_init_range
1702
+ )
1703
+ else:
1704
+ wte=nn.Embedding(
1705
+ config.embedding_size or config.vocab_size, config.d_model, device=config.init_device
1706
+ )
1707
+
1708
+ self.transformer = nn.ModuleDict(
1709
+ dict(
1710
+ wte=wte,
1711
+ emb_drop=Dropout(config.embedding_dropout),
1712
+ ln_f=LayerNorm.build(config),
1713
+ )
1714
+ )
1715
+
1716
+ blocks = [MolmoBlock.build(i, config, self.__cache) for i in range(config.n_layers)]
1717
+ if self.config.block_group_size > 1:
1718
+ raise NotImplementedError()
1719
+ else:
1720
+ self.transformer.update({"blocks": nn.ModuleList(blocks)})
1721
+
1722
+ if not self.config.rope:
1723
+ self.transformer.update(
1724
+ {"wpe": nn.Embedding(config.max_sequence_length, config.d_model, device=config.init_device)}
1725
+ )
1726
+ if not config.weight_tying:
1727
+ self.transformer.update(
1728
+ {
1729
+ "ff_out": nn.Linear(
1730
+ config.d_model,
1731
+ config.embedding_size or config.vocab_size,
1732
+ bias=config.include_bias,
1733
+ device=config.init_device,
1734
+ )
1735
+ }
1736
+ )
1737
+
1738
+ self.vision_backbone: Optional[OLMoVisionBackbone] = None
1739
+ if config.vision_backbone is not None:
1740
+ self.vision_backbone = OLMoPretrainedVisionBackbone(config)
1741
+
1742
+ self.__num_fwd_flops: Optional[int] = None
1743
+
1744
+ def reset_parameters(self):
1745
+ if self.vision_backbone is not None:
1746
+ self.vision_backbone.reset_parameters()
1747
+ self.reset_non_vision_parameters()
1748
+
1749
+ def reset_non_vision_parameters(self):
1750
+ self.transformer.wte.reset_parameters()
1751
+ if hasattr(self.transformer.wte, "new_embedding"):
1752
+ nn.init.normal_(self.transformer.wte.new_embedding, std=self.config.new_embedding_init_range)
1753
+
1754
+ if hasattr(self.transformer, "wpe"):
1755
+ nn.init.normal_(self.transformer.wpe.weight, mean=0.0, std=1.0)
1756
+
1757
+ self.transformer.ln_f.reset_parameters() # type: ignore
1758
+
1759
+ if hasattr(self.transformer, "ff_out"):
1760
+ nn.init.normal_(self.transformer.ff_out.weight, mean=0.0, std=0.02)
1761
+
1762
+ if self.config.block_group_size == 1:
1763
+ for block in self.transformer.blocks:
1764
+ block.reset_parameters()
1765
+ else:
1766
+ for block_group in self.transformer.block_groups:
1767
+ block_group.reset_parameters()
1768
+
1769
+
1770
+ def forward(
1771
+ self,
1772
+ input_ids: torch.LongTensor,
1773
+ input_embeddings: Optional[torch.FloatTensor] = None,
1774
+ attention_mask: Optional[torch.Tensor] = None,
1775
+ attention_bias: Optional[torch.Tensor] = None,
1776
+ response_mask: Optional[torch.Tensor] = None,
1777
+ images: Optional[torch.Tensor] = None,
1778
+ image_masks: Optional[torch.Tensor] = None,
1779
+ image_input_idx: Optional[torch.Tensor] = None,
1780
+ subsegment_ids: Optional[torch.Tensor] = None,
1781
+ position_ids: Optional[torch.Tensor] = None,
1782
+ past_key_values: Optional[Sequence[Tuple[torch.Tensor, torch.Tensor]]] = None,
1783
+ use_cache: bool = False,
1784
+ last_logits_only: bool = False,
1785
+ output_hidden_states: Optional[bool] = None,
1786
+ append_last_valid_logits: Optional[torch.Tensor] = None,
1787
+ ) -> ModelOutput:
1788
+ """
1789
+ :param input_ids: A tensor of shape `(batch_size, seq_len)`.
1790
+ :param input_embeddings: A tensor of shape `(batch_size, seq_len, d_model)` with input
1791
+ embeddings. When provided, it is treated as the output of the input embedding layer.
1792
+ :param attention_mask: A tensor of shape `(batch_size, seq_len)` that indicates
1793
+ which input IDs are masked. A `1` value in the mask means that
1794
+ the corresponding input ID should *not* be ignored. A `0` means
1795
+ that the corresponding input ID is masked.
1796
+
1797
+ This has the same meaning as the `attention_mask` in HuggingFace's `transformers`
1798
+ library.
1799
+ :param attention_bias: A tensor of shape `(batch_size, 1, seq_len, seq_len)`,
1800
+ `(1, 1, seq_len, seq_len)`, or `(seq_len, seq_len)`. This is used
1801
+ to introduce causal or other biases.
1802
+
1803
+ If the tensor is a bool or byte tensor, a `True` or `1` at `attention_bias[:, :, i, j]`
1804
+ indicates that the i-th element in the sequence is allowed to attend to the j-th
1805
+ element in the sequence.
1806
+
1807
+ If the tensor is a float tensor, it will just be added to the attention
1808
+ scores before the softmax.
1809
+
1810
+ The default is causal, which corresponds to a lower-diagonal byte matrix of ones.
1811
+ :param response_mask: A tensor of shape `(batch_size, seq_len)` that indicates
1812
+ the response mask. A `1` value in the mask means that the corresponding token
1813
+ is a response token. A `0` means that the corresponding token is not
1814
+ a response token.
1815
+ :param past_key_values: Pre-computed keys and values for each attention block.
1816
+ Can be used to speed up sequential decoding. The `input_ids` which have
1817
+ their past given to this model should not be passed as `input_ids` as they have already been computed.
1818
+ :param use_cache: If `True`, return key and value tensors for each block.
1819
+ :param last_logits_only: If `True`, only compute the logits for the last token of each sequence.
1820
+ This can speed up decoding when you only care about the next token.
1821
+ """
1822
+ output_hidden_states = output_hidden_states if output_hidden_states is not None else False
1823
+
1824
+ if past_key_values:
1825
+ assert len(past_key_values) == self.config.n_layers
1826
+
1827
+ has_image = images is not None
1828
+
1829
+ assert not (has_image and input_embeddings is not None), "Cannot provide both images and input embeddings."
1830
+ assert not (has_image and past_key_values is not None), "Cached key and values should not be used with images."
1831
+
1832
+ batch_size, seq_len = input_ids.size() if input_embeddings is None else input_embeddings.size()[:2]
1833
+ if past_key_values is None:
1834
+ past_length = 0
1835
+ else:
1836
+ past_length = past_key_values[0][0].size(-2)
1837
+
1838
+ if self.config.use_position_ids and attention_mask is None:
1839
+ attention_mask = input_ids != -1
1840
+
1841
+ if subsegment_ids is not None:
1842
+ assert not use_cache, "Subsegment_ids cannot be used with cache."
1843
+ subsegment_mask = subsegment_ids.unsqueeze(2) <= subsegment_ids.unsqueeze(1)
1844
+ attention_mask = (
1845
+ subsegment_mask.to(attention_mask.dtype) *
1846
+ attention_mask.unsqueeze(2) *
1847
+ attention_mask.unsqueeze(1))
1848
+ if position_ids is None:
1849
+ raise ValueError(f"Positioned ids must be given if using subsegment_ids")
1850
+ else:
1851
+ if self.config.use_position_ids and position_ids is None:
1852
+ position_ids = torch.clamp(
1853
+ torch.cumsum(attention_mask.to(torch.int32), dim=-1) - 1,
1854
+ min=0,
1855
+ ).broadcast_to((batch_size, attention_mask.shape[-1]))
1856
+
1857
+ # Get embeddings of input.
1858
+ # shape: (batch_size, seq_len, d_model)
1859
+ if input_ids is not None:
1860
+ input_ids = input_ids * (input_ids != -1).to(input_ids.dtype)
1861
+ x = self.transformer.wte(input_ids) if input_embeddings is None else input_embeddings # type: ignore
1862
+
1863
+ num_image: Optional[int] = None
1864
+ if images is not None:
1865
+ # shape: (batch_size, num_image, num_patch, d_model)
1866
+ # cls_embed: (batch_size, num_image, d_model)
1867
+ image_features, cls_embed = self.vision_backbone(images, image_masks)
1868
+ num_image, num_patch = image_features.shape[1:3]
1869
+ assert image_input_idx.shape == (batch_size, num_image, num_patch)
1870
+
1871
+ # Insert the image features into the input embeddings.
1872
+ image_features = image_features.view(batch_size, num_image * num_patch, -1)
1873
+ image_input_idx = image_input_idx.view(batch_size, num_image * num_patch)
1874
+
1875
+ valid = image_input_idx >= 0
1876
+ batch_idx = torch.arange(batch_size, device=x.device)
1877
+ batch_idx = torch.tile(batch_idx[:, None], [1, image_features.shape[1]])
1878
+
1879
+ # For hf demo/endpoint
1880
+ image_features = image_features.to(x.device)
1881
+
1882
+ x[batch_idx[valid], image_input_idx[valid]] += image_features[valid]
1883
+
1884
+ if not self.config.rope:
1885
+ # Get positional embeddings.
1886
+ # shape: (1, seq_len)
1887
+ pos = torch.arange(past_length, past_length + seq_len, dtype=torch.long, device=x.device).unsqueeze(0)
1888
+ # shape: (1, seq_len, d_model)
1889
+ pos_emb = self.transformer.wpe(pos) # type: ignore
1890
+ x = pos_emb + x
1891
+
1892
+ # Add input + positional embeddings and apply dropout.
1893
+ # shape: (batch_size, seq_len, d_model)
1894
+ x = self.transformer.emb_drop(x) # type: ignore
1895
+
1896
+ # normalized
1897
+ if self.config.normalize_input_embeds:
1898
+ x = x * (self.config.d_model ** 0.5)
1899
+
1900
+ # Transform the attention mask into what the blocks expect.
1901
+ if attention_mask is not None:
1902
+ # shape: (batch_size, 1, 1, seq_len)
1903
+ if len(attention_mask.shape) == 2:
1904
+ attention_mask = attention_mask[:, :past_length + seq_len]
1905
+ attention_mask = attention_mask.to(dtype=torch.float).view(batch_size, -1)[:, None, None, :]
1906
+ else:
1907
+ attention_mask = attention_mask.unsqueeze(1).to(dtype=torch.float)
1908
+ attention_mask = (1.0 - attention_mask) * torch.finfo(attention_mask.dtype).min
1909
+
1910
+ # Merge attention mask with attention bias.
1911
+ if (
1912
+ attention_bias is not None
1913
+ or attention_mask is not None
1914
+ # NOTE (epwalsh): we need to initialize the attn bias in order for attn to work properly
1915
+ # with key+value cache. Otherwise `F.scaled_dot_product_attention()` doesn't seem to compute
1916
+ # scores correctly.
1917
+ or past_key_values is not None
1918
+ ):
1919
+ if attention_bias is None:
1920
+ attention_bias = get_causal_attention_bias(self.__cache, past_length + seq_len, x.device)
1921
+ elif attention_bias.dtype in (torch.int8, torch.bool):
1922
+ attention_bias = attention_bias.to(dtype=torch.float)
1923
+ attention_bias.masked_fill_(attention_bias == 0.0, torch.finfo(attention_bias.dtype).min)
1924
+
1925
+ # Transform to the right shape and data type.
1926
+ mask_len = seq_len
1927
+ if attention_mask is not None:
1928
+ mask_len = attention_mask.shape[-1]
1929
+ elif past_key_values is not None:
1930
+ mask_len = past_key_values[0][0].shape[-2] + seq_len
1931
+ attention_bias = attention_bias[:, :, :mask_len, :mask_len].to(dtype=torch.float)
1932
+
1933
+ # Add in the masking bias.
1934
+ if attention_mask is not None:
1935
+ attention_bias = attention_bias + attention_mask
1936
+ # Might get -infs after adding attention mask, since dtype.min + dtype.min = -inf.
1937
+ # `F.scaled_dot_product_attention()` doesn't handle -inf like you'd expect, instead
1938
+ # it can produce NaNs.
1939
+ ensure_finite_(attention_bias, check_neg_inf=True, check_pos_inf=False)
1940
+
1941
+ attn_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = [] if use_cache else None
1942
+
1943
+ # decoder layers
1944
+ all_hidden_states = []
1945
+
1946
+ # Apply blocks one-by-one.
1947
+ if self.config.block_group_size == 1:
1948
+ for block_idx, block in enumerate(self.transformer.blocks):
1949
+ if output_hidden_states:
1950
+ # add hidden states
1951
+ all_hidden_states.append(x)
1952
+
1953
+ layer_past = None if past_key_values is None else past_key_values[block_idx]
1954
+ x, cache = block(x, attention_bias=attention_bias, position_ids=position_ids, layer_past=layer_past, use_cache=use_cache)
1955
+
1956
+ if attn_key_values is not None:
1957
+ assert cache is not None
1958
+ attn_key_values.append(cache)
1959
+ else:
1960
+ for group_idx, block_group in enumerate(self.transformer.block_groups):
1961
+ if output_hidden_states:
1962
+ # add hidden states
1963
+ all_hidden_states.append(x)
1964
+
1965
+ layers_past = (
1966
+ None
1967
+ if past_key_values is None
1968
+ else past_key_values[
1969
+ group_idx * self.config.block_group_size : (group_idx + 1) * self.config.block_group_size
1970
+ ]
1971
+ )
1972
+ x, cache = block_group(
1973
+ x, attention_bias=attention_bias, position_ids=position_ids, layers_past=layers_past, use_cache=use_cache
1974
+ )
1975
+ if attn_key_values is not None:
1976
+ assert cache is not None
1977
+ attn_key_values.extend(cache)
1978
+
1979
+ if last_logits_only:
1980
+ # shape: (batch_size, 1, d_model)
1981
+ if append_last_valid_logits is not None:
1982
+ last_valid_output = x[
1983
+ torch.arange(x.shape[0], device=x.device), append_last_valid_logits.to(x.device)]
1984
+ x = last_valid_output.unsqueeze(1)
1985
+ else:
1986
+ x = x[:, -1, :].unsqueeze(1)
1987
+
1988
+ # Apply final layer norm.
1989
+ # shape: (batch_size, seq_len or 1, d_model)
1990
+ x = self.transformer.ln_f(x) # type: ignore
1991
+ if output_hidden_states:
1992
+ # add final hidden state post-final-layernorm, following HuggingFace's convention
1993
+ all_hidden_states.append(x)
1994
+
1995
+ # Get logits.
1996
+ # shape: (batch_size, seq_len or 1, vocab_size)
1997
+ if self.config.weight_tying:
1998
+ logits = F.linear(x, self.transformer.wte.weight, None) # type: ignore
1999
+ else:
2000
+ logits = self.transformer.ff_out(x) # type: ignore
2001
+ if self.config.scale_logits:
2002
+ logits.mul_(1 / math.sqrt(self.config.d_model))
2003
+
2004
+ if not last_logits_only and append_last_valid_logits is not None:
2005
+ last_valid_logit = logits[
2006
+ torch.arange(logits.shape[0], device=logits.device), append_last_valid_logits]
2007
+ logits = torch.cat([logits[:, :-1], last_valid_logit[:, None]], dim=1)
2008
+
2009
+ return ModelOutput(logits=logits, attn_key_values=attn_key_values, hidden_states=tuple(all_hidden_states) if output_hidden_states else None) # type: ignore[arg-type]
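The scatter above, x[batch_idx[valid], image_input_idx[valid]] += image_features[valid], is what splices the projected image patches into the token embedding sequence: image_input_idx holds, for every pooled patch, the position of its placeholder token, with -1 marking unused slots. A tiny self-contained sketch of that indexing, with made-up shapes:

# Tiny sketch of how image features are scattered into the token embeddings.
# Shapes and indices are made up for illustration; real ones come from the processor.
import torch

batch_size, seq_len, d_model = 1, 10, 4
num_patches = 3

x = torch.zeros(batch_size, seq_len, d_model)               # token embeddings
image_features = torch.randn(batch_size, num_patches, d_model)
image_input_idx = torch.tensor([[2, 3, -1]])                 # -1 = unused slot

valid = image_input_idx >= 0
batch_idx = torch.arange(batch_size)[:, None].expand_as(image_input_idx)
x[batch_idx[valid], image_input_idx[valid]] += image_features[valid]

print(x[0, 2], x[0, 3])    # now hold the first two patch features
print(x[0, 4])             # untouched positions stay zero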
2010
+
2011
+
2012
+ class MolmoForCausalLM(PreTrainedModel):
2013
+ config_class = MolmoConfig
2014
+ base_model_prefix = "model"
2015
+ _no_split_modules = ["MolmoBlock"]
2016
+
2017
+ def __init__(self, config: MolmoConfig, model: Optional[Molmo] = None, init_params: bool = False):
2018
+ super().__init__(config)
2019
+
2020
+ if not model:
2021
+ full_config = FullMolmoConfig(
2022
+ image_padding_embed="pad_and_partial_pad",
2023
+ image_pooling_2d="attention-meanq",
2024
+ attention_layer_norm=config.attention_layer_norm,
2025
+ rope_impl="llama",
2026
+ vocab_size=config.vocab_size,
2027
+ max_sequence_length=config.max_position_embeddings,
2028
+ qkv_bias=config.qkv_bias,
2029
+ norm_after=config.norm_after,
2030
+ embedding_size=config.embedding_size,
2031
+ attention_type="sdpa",
2032
+ embedding_dropout=0,
2033
+ attention_dropout=0,
2034
+ residual_dropout=0,
2035
+ rope=True,
2036
+ weight_tying=False,
2037
+ include_bias=False,
2038
+ d_model=config.hidden_size,
2039
+ mlp_hidden_size=config.intermediate_size,
2040
+ n_layers=config.num_hidden_layers,
2041
+ additional_vocab_size=128,
2042
+ n_heads=config.num_attention_heads,
2043
+ n_kv_heads=config.num_key_value_heads,
2044
+ rope_theta=config.rope_theta,
2045
+ layer_norm_eps=config.layer_norm_eps,
2046
+ layer_norm_type=config.layer_norm_type,
2047
+ vit_layers=[-2, -9],
2048
+ vision_backbone=VisionBackboneConfig(
2049
+ image_default_input_size=(336, 336),
2050
+ image_patch_size=14,
2051
+ image_pos_patch_size=14,
2052
+ image_emb_dim=1024,
2053
+ image_num_heads=16,
2054
+ image_num_key_value_heads=16,
2055
+ image_num_layers=23,
2056
+ image_head_dim=64,
2057
+ image_mlp_dim=4096,
2058
+ image_mlp_activations="quick_gelu",
2059
+ image_dropout_rate=0.0,
2060
+ image_num_pos=577,
2061
+ image_norm_eps=1e-5,
2062
+ attention_dropout=0.0,
2063
+ residual_dropout=0.0,
2064
+ initializer_range=0.02,
2065
+ )
2066
+ )
2067
+ self.model = Molmo(full_config, init_params=init_params)
2068
+ else:
2069
+ self.model = model
2070
+
2071
+
2072
+ def forward(
2073
+ self,
2074
+ input_ids: torch.LongTensor = None,
2075
+ inputs_embeds: Optional[torch.FloatTensor] = None,
2076
+ attention_mask: Optional[torch.Tensor] = None,
2077
+ attention_bias: Optional[torch.Tensor] = None,
2078
+ response_mask: Optional[torch.Tensor] = None,
2079
+ images: Optional[torch.Tensor] = None,
2080
+ image_masks: Optional[torch.Tensor] = None,
2081
+ image_input_idx: Optional[torch.Tensor] = None,
2082
+ subsegment_ids: Optional[torch.Tensor] = None,
2083
+ position_ids: Optional[torch.Tensor] = None,
2084
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
2085
+ labels: Optional[torch.LongTensor] = None,
2086
+ loss_masks: Optional[torch.Tensor] = None,
2087
+ use_cache: Optional[bool] = None,
2088
+ last_logits_only: Optional[bool] = None,
2089
+ output_attentions: Optional[bool] = None,
2090
+ output_hidden_states: Optional[bool] = None,
2091
+ append_last_valid_logits: Optional[torch.Tensor] = None,
2092
+ return_dict: Optional[bool] = None,
2093
+ cache_position: Optional[
2094
+ Cache
2095
+ ] = None, # This is a hack mitigation of an issue in transformers `4.39.x` https://github.com/huggingface/transformers/issues/29426
2096
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
2097
+ if use_cache is None:
2098
+ use_cache = self.config.use_cache
2099
+
2100
+ if output_attentions:
2101
+ raise ValueError("output_attentions is not yet supported in Molmo")
2102
+
2103
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
2104
+
2105
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
2106
+ outputs = self.model.forward(
2107
+ input_ids=input_ids,
2108
+ input_embeddings=inputs_embeds,
2109
+ attention_mask=attention_mask,
2110
+ attention_bias=attention_bias,
2111
+ response_mask=response_mask,
2112
+ images=images,
2113
+ image_masks=image_masks,
2114
+ image_input_idx=image_input_idx,
2115
+ subsegment_ids=subsegment_ids,
2116
+ position_ids=position_ids,
2117
+ past_key_values=past_key_values,
2118
+ use_cache=use_cache,
2119
+ last_logits_only=last_logits_only,
2120
+ output_hidden_states=output_hidden_states,
2121
+ append_last_valid_logits=append_last_valid_logits,
2122
+ )
2123
+
2124
+ logits = outputs.logits
2125
+ hidden_states = outputs.hidden_states
2126
+
2127
+ loss = None
2128
+ if labels is not None:
2129
+ if loss_masks is not None:
2130
+ loss_masks = loss_masks * (loss_masks > 0)
2131
+ batch_size_in_tokens = max(loss_masks.sum().item(), 1)
2132
+ labels = labels.long()
2133
+ labels.masked_fill_(~(loss_masks > 0), -100)
2134
+ labels = labels.view(-1)
2135
+ logits_for_loss = logits.to(torch.float32).view(-1, logits.size(-1))
2136
+ loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100, reduction='none')
2137
+ loss = loss_fct(logits_for_loss, labels)
2138
+ loss = loss.view(input_ids.shape[0], -1)
2139
+ loss = loss * loss_masks
2140
+ loss = loss.sum() / batch_size_in_tokens
2141
+ use_zloss = getattr(self.config, "softmax_auxiliary_loss", False)
2142
+ if use_zloss:
2143
+ z_squared = logits_for_loss.logsumexp(-1).pow(2)
2144
+ z_loss = self.config.softmax_auxiliary_loss_scale * z_squared
2145
+ z_loss = z_loss.view(input_ids.shape[0], -1)
2146
+ z_loss = z_loss * loss_masks
2147
+ z_loss = z_loss.sum() / batch_size_in_tokens
2148
+ loss += z_loss
2149
+ else:
2150
+ # Shift so that tokens < n predict n
2151
+ shift_logits = logits[..., :-1, :].contiguous()
2152
+ shift_labels = labels[..., 1:].contiguous()
2153
+ # Flatten the tokens
2154
+ loss_fct = torch.nn.CrossEntropyLoss()
2155
+ shift_logits = shift_logits.view(-1, self.config.embedding_size)
2156
+ shift_labels = shift_labels.view(-1)
2157
+ # Enable model parallelism
2158
+ shift_labels = shift_labels.to(shift_logits.device)
2159
+ loss = loss_fct(shift_logits, shift_labels)
2160
+
2161
+ if not return_dict:
2162
+ output = (logits,) + outputs[1:]
2163
+ return (loss,) + output if loss is not None else output
2164
+
2165
+ return CausalLMOutputWithPast(
2166
+ loss=loss,
2167
+ logits=logits,
2168
+ past_key_values=outputs.attn_key_values,
2169
+ hidden_states=hidden_states,
2170
+ )
2171
+
2172
+ def can_generate(self) -> bool:
2173
+ return True
2174
+
2175
+ @torch.no_grad()
2176
+ def generate_from_batch(
2177
+ self,
2178
+ batch: Dict[str, Any],
2179
+ generation_config: Optional[GenerationConfig] = None,
2180
+ **kwargs,
2181
+ ):
2182
+ if generation_config is not None:
2183
+ assert generation_config.use_cache
2184
+
2185
+ images = batch.get("images")
2186
+ image_masks = batch.get("image_masks")
2187
+ image_input_idx = batch.get("image_input_idx")
2188
+
2189
+ # Validate inputs.
2190
+ input_ids = batch["input_ids"]
2191
+ batch_size, seq_len = input_ids.shape
2192
+ attention_mask = batch.get("attention_mask", None)
2193
+ max_new_tokens = generation_config.max_new_tokens
2194
+ assert max_new_tokens is not None
2195
+ mask_len = seq_len + max_new_tokens if self.config.use_position_ids else seq_len
2196
+ position_ids: Optional[torch.Tensor] = None
2197
+ append_last_valid_logits: Optional[torch.Tensor] = None
2198
+ if self.config.use_position_ids and attention_mask is None:
2199
+ attention_mask = input_ids != -1
2200
+ position_ids = torch.clamp(
2201
+ torch.cumsum(attention_mask.to(torch.int32), dim=-1) - 1,
2202
+ min=0
2203
+ )
2204
+ append_last_valid_logits = attention_mask.long().sum(dim=-1) - 1
2205
+ attention_mask = torch.cat(
2206
+ [attention_mask, attention_mask.new_ones((batch_size, max_new_tokens))],
2207
+ dim=1,
2208
+ )
2209
+ if attention_mask is not None:
2210
+ assert attention_mask.shape == (batch_size, mask_len)
2211
+
2212
+ out = super().generate(
2213
+ batch["input_ids"],
2214
+ generation_config,
2215
+ attention_mask=attention_mask,
2216
+ images=images,
2217
+ image_masks=image_masks,
2218
+ image_input_idx=image_input_idx,
2219
+ position_ids=position_ids,
2220
+ append_last_valid_logits=append_last_valid_logits,
2221
+ **kwargs,
2222
+ )
2223
+
2224
+ return out
2225
+
2226
+ def prepare_inputs_for_generation(
2227
+ self, input_ids: torch.LongTensor, past_key_values: Optional[List[Tuple]] = None, **kwargs
2228
+ ):
2229
+ if past_key_values:
2230
+ # This is because we want the model to only process the last generated token.
2231
+ input_ids = input_ids[:, -1:]
2232
+
2233
+ if self.config.use_position_ids:
2234
+ attention_mask = kwargs.get("attention_mask")
2235
+ images = kwargs.get("images")
2236
+ image_masks = kwargs.get("image_masks")
2237
+ image_input_idx = kwargs.get("image_input_idx")
2238
+ position_ids = kwargs.get("position_ids")
2239
+ append_last_valid_logits = kwargs.get("append_last_valid_logits")
2240
+ model_inputs = {
2241
+ "input_ids": input_ids,
2242
+ "attention_mask": attention_mask,
2243
+ "position_ids": position_ids,
2244
+ "past_key_values": past_key_values,
2245
+ "use_cache": True,
2246
+ "last_logits_only": True,
2247
+ }
2248
+ if past_key_values is None:
2249
+ model_inputs["images"] = images
2250
+ model_inputs["image_masks"] = image_masks
2251
+ model_inputs["image_input_idx"] = image_input_idx
2252
+ model_inputs["append_last_valid_logits"] = append_last_valid_logits
2253
+ else:
2254
+ model_inputs = {"input_ids": input_ids, "past_key_values": past_key_values}
2255
+
2256
+ model_inputs.update(kwargs)
2257
+ model_inputs["use_cache"] = kwargs.pop("use_cache", self.config.use_cache)
2258
+ return model_inputs
2259
+
2260
+ def _update_model_kwargs_for_generation(
2261
+ self,
2262
+ outputs: ModelOutput,
2263
+ model_kwargs: Dict[str, Any],
2264
+ is_encoder_decoder: bool = False,
2265
+ num_new_tokens: int = 1,
2266
+ ) -> Dict[str, Any]:
2267
+ if self.config.use_position_ids:
2268
+ model_kwargs["position_ids"] = model_kwargs["position_ids"][:, -1:] + 1
2269
+ if "append_last_valid_logits" in model_kwargs:
2270
+ del model_kwargs["append_last_valid_logits"]
2271
+ if "images" in model_kwargs:
2272
+ del model_kwargs["images"]
2273
+ del model_kwargs["image_masks"]
2274
+ del model_kwargs["image_input_idx"]
2275
+ cache_name, cache = super()._extract_past_from_model_output(outputs)
2276
+ model_kwargs[cache_name] = cache
2277
+ model_kwargs["cache_position"] = model_kwargs["cache_position"][-1:] + num_new_tokens
2278
+ return model_kwargs
2279
+
2280
+ def get_input_embeddings(self) -> torch.nn.Module:
2281
+ return self.model.transformer.wte
2282
+
2283
+ def set_input_embeddings(self, value: torch.nn.Module):
2284
+ self.model.transformer.wte = value
2285
+
2286
+ def get_output_embeddings(self):
2287
+ if self.config.weight_tying:
2288
+ return self.model.transformer.wte
2289
+ else:
2290
+ return self.model.transformer.ff_out
2291
+
2292
+ def set_output_embeddings(self, value: torch.nn.Module):
2293
+ if self.config.weight_tying:
2294
+ self.model.transformer.wte = value
2295
+ else:
2296
+ self.model.transformer.ff_out = value
2297
+
2298
+ def tie_weights(self):
2299
+ """
2300
+ This function is intentionally left as a no-op.
2301
+
2302
+ Weight tying is handled as follows:
2303
+ - When the model is initialized, the `ff_out` layer is conditionally defined based on the `weight_tying` configuration.
2304
+ See: `if not config.weight_tying: self.transformer.update(...)` in `olmo/model.py`.
2305
+ - When computing logits, the `wte` weights are used directly if `weight_tying` is enabled.
2306
+ See: `if self.config.weight_tying: logits = F.linear(x, self.transformer.wte.weight, None)` in the `forward` method.
2307
+
2308
+ Therefore, there is no need to explicitly tie the weights in this function.
2309
+ """
2310
+ pass
2311
+
2312
+ def resize_token_embeddings(
2313
+ self, new_num_tokens: Optional[int] = None, pad_to_multiple_of: Optional[int] = None
2314
+ ) -> torch.nn.Embedding:
2315
+ """
2316
+ Resizes input token embeddings matrix of the model if `new_num_tokens != config.embedding_size`.
2317
+
2318
+ Takes care of tying weights embeddings afterwards if the model class has a `tie_weights()` method.
2319
+
2320
+ Arguments:
2321
+ new_num_tokens (`int`, *optional*):
2322
+ The new number of tokens in the embedding matrix. Increasing the size will add newly initialized
2323
+ vectors at the end. Reducing the size will remove vectors from the end. If not provided or `None`, just
2324
+ returns a pointer to the input tokens `torch.nn.Embedding` module of the model without doing anything.
2325
+ pad_to_multiple_of (`int`, *optional*):
2326
+ If set will pad the embedding matrix to a multiple of the provided value. If `new_num_tokens` is set to
2327
+ `None` will just pad the embedding to a multiple of `pad_to_multiple_of`.
2328
+
2329
+ This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
2330
+ `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. For more
2331
+ details about this, or help on choosing the correct value for resizing, refer to this guide:
2332
+ https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc
2333
+
2334
+ Return:
2335
+ `torch.nn.Embedding`: Pointer to the input tokens Embeddings Module of the model.
2336
+
2337
+ Note:
2338
+ This method differs from the base class implementation by resizing the `embedding_size` attribute of the
2339
+ model configuration instead of the `vocab_size`. It also includes a warning if the resized `embedding_size`
2340
+ is less than the `vocab_size`. In OLMo, `embedding_size` refers to the dimensionality of the model's token
2341
+ embeddings, while `vocab_size` refers to the number of unique tokens in the vocabulary.
2342
+ """
2343
+ model_embeds = self._resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
2344
+ if new_num_tokens is None and pad_to_multiple_of is None:
2345
+ return model_embeds
2346
+
2347
+ # Update base model and current model config
2348
+ self.config.embedding_size = model_embeds.weight.shape[0]
2349
+ self.model.config.embedding_size = model_embeds.weight.shape[0]
2350
+
2351
+ # Check if the embedding size is less than the vocab size
2352
+ if self.config.embedding_size < self.config.vocab_size:
2353
+ warning_message = (
2354
+ f"Resizing token embeddings to size {self.config.embedding_size}, which is less than the vocab size "
2355
+ f"{self.config.vocab_size} defined in the model configuration. Make sure your tokenizer's vocabulary "
2356
+ "size is less than or equal to the new token embedding size."
2357
+ )
2358
+ log.warning(warning_message)
2359
+
2360
+ # Tie weights again if needed
2361
+ self.tie_weights()
2362
+
2363
+ return model_embeds
2364
+
2365
+
2366
+ # Always register for multi-modal features
2367
+ AutoModelForCausalLM.register(MolmoConfig, MolmoForCausalLM)
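End-to-end usage is not shown in this file, so here is a minimal sketch of how a checkpoint packaged like this one is typically loaded and run; the repo path is a placeholder and the generation options are illustrative, not prescribed by this upload:

# Minimal usage sketch (not part of the checkpoint). "<path-to-this-repo>" is a
# placeholder for wherever these files live; adjust dtype/device as needed.
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig

repo = "<path-to-this-repo>"
processor = AutoProcessor.from_pretrained(repo, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    repo, trust_remote_code=True, torch_dtype=torch.bfloat16, device_map="auto"
)

# Build a single-example batch and move it onto the model's device.
inputs = processor.process(text="Describe this image.", images=Image.open("example.png"))
inputs = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}

output = model.generate_from_batch(
    inputs,
    GenerationConfig(max_new_tokens=128, stop_strings="<|endoftext|>"),
    tokenizer=processor.tokenizer,
)
generated = output[0, inputs["input_ids"].shape[1]:]
print(processor.tokenizer.decode(generated, skip_special_tokens=True))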
preprocessing_molmo.py ADDED
@@ -0,0 +1,192 @@
1
+ """
2
+ Processor class for Molmo.
3
+ """
4
+
5
+ from typing import Optional
6
+
7
+ import PIL
8
+ from PIL import ImageOps
9
+ from PIL.Image import Image
10
+
11
+ try:
12
+ from typing import Unpack
13
+ except ImportError:
14
+ from typing_extensions import Unpack
15
+
16
+ import numpy as np
17
+ import torch
18
+
19
+ from transformers.image_utils import ImageInput
20
+ from transformers.processing_utils import (
21
+ TextKwargs,
22
+ ProcessingKwargs,
23
+ ProcessorMixin,
24
+ )
25
+
26
+ from transformers.tokenization_utils_base import TextInput, PreTokenizedInput
27
+ from transformers.utils import logging
28
+
29
+ from transformers import AutoTokenizer
30
+ from .image_preprocessing_molmo import MolmoImagesKwargs, MolmoImageProcessor
31
+
32
+
33
+ logger = logging.get_logger(__name__)
34
+
35
+
36
+ DEFAULT_IMAGE_PATCH_TOKEN = f"<im_patch>"
37
+ DEFAULT_IM_START_TOKEN = f"<im_start>"
38
+ DEFAULT_IM_END_TOKEN = f"<im_end>"
39
+ DEFAULT_IM_COL_TOKEN = f"<im_col>"
40
+ IMAGE_PROMPT = "<|image|>"
41
+
42
+ EXTRA_TOKENS = (DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_COL_TOKEN, IMAGE_PROMPT)
43
+
44
+
45
+ def get_special_token_ids(tokenizer):
46
+ ids = tokenizer.encode("".join(EXTRA_TOKENS), add_special_tokens=False)
47
+ assert len(ids) == len(EXTRA_TOKENS)
48
+ return {k: i for k, i in zip(EXTRA_TOKENS, ids)}
49
+
50
+
51
+ class MolmoTextKwargs(TextKwargs, total=False):
52
+ style: Optional[str]
53
+ system_prompt: Optional[str]
54
+ message_format: Optional[str]
55
+ always_start_with_space: Optional[bool]
56
+ sequence_length: Optional[int]
57
+
58
+
59
+ class MolmoProcessorKwargs(ProcessingKwargs, total=False):
60
+ text_kwargs: MolmoTextKwargs
61
+ images_kwargs: MolmoImagesKwargs
62
+ _defaults = {
63
+ "images_kwargs": {
64
+ "max_crops": 12,
65
+ "overlap_margins": [4, 4],
66
+ "base_image_input_size": [336, 336],
67
+ "image_token_length_w": 12,
68
+ "image_token_length_h": 12,
69
+ "image_patch_size": 14,
70
+ "image_padding_mask": True,
71
+ },
72
+ "text_kwargs": {
73
+ "style": "long_caption",
74
+ "system_prompt": "none",
75
+ "message_format": "role",
76
+ "always_start_with_space": True,
77
+ "sequence_length": 1536,
78
+ "padding": False,
79
+ },
80
+ }
81
+
82
+
83
+ class MolmoProcessor(ProcessorMixin):
84
+ attributes = ["image_processor", "tokenizer"]
85
+ image_processor_class = "AutoImageProcessor"
86
+ tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
87
+
88
+ def __init__(self, image_processor: MolmoImageProcessor = None, tokenizer : AutoTokenizer = None, **kwargs):
89
+ # self.image_processor = image_processor
90
+ # self.tokenizer = tokenizer
91
+ super().__init__(image_processor, tokenizer)
92
+ self._special_tokens = None
93
+
94
+ @property
95
+ def special_token_ids(self):
96
+ if self._special_tokens is None:
97
+ self._special_tokens = get_special_token_ids(self.tokenizer)
98
+ return self._special_tokens
99
+
100
+ def get_tokens_input(self, prompt, message_format, always_start_with_space):
101
+ if message_format == "none" or message_format is None:
102
+ pass
103
+ elif message_format == "role":
104
+ prompt = "User: " + prompt + " Assistant:"
105
+ else:
106
+ raise NotImplementedError(f"Message format {message_format} not implemented")
107
+
108
+ if always_start_with_space:
109
+ prompt = " " + prompt
110
+
111
+ tokens = self.tokenizer.encode(prompt, add_special_tokens=False)
112
+
113
+ return tokens
114
+
115
+ def process(
116
+ self,
117
+ text: TextInput = None,
118
+ images: ImageInput = None,
119
+ *,
120
+ tokens: Optional[PreTokenizedInput] = None,
121
+ **kwargs: Unpack[MolmoProcessorKwargs],
122
+ ):
123
+ output_kwargs = self._merge_kwargs(
124
+ MolmoProcessorKwargs,
125
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
126
+ **kwargs,
127
+ )
128
+
129
+ if tokens is None:
130
+ tokens = self.get_tokens_input(
131
+ text,
132
+ output_kwargs["text_kwargs"]["message_format"],
133
+ output_kwargs["text_kwargs"]["always_start_with_space"],
134
+ )
135
+
136
+ image_token_id = self.special_token_ids[IMAGE_PROMPT]
137
+
138
+ if images is not None:
139
+ if not isinstance(images, (list, tuple)):
140
+ images = [images]
141
+ image_arrays = []
142
+ for image in images:
143
+ if isinstance(image, Image):
144
+ image = image.convert("RGB")
145
+ # Handle images with EXIF orientation tags, which PIL will ignore by default
146
+ # https://github.com/python-pillow/Pillow/issues/4703
147
+ image = ImageOps.exif_transpose(image)
148
+ image_arrays.append(np.array(image))
149
+ else:
150
+ assert len(image.shape) == 3 and image.shape[-1] == 3
151
+ image_arrays.append(image.astype(np.uint8))
152
+ images = image_arrays
153
+ # For now only support inserting images at the start
154
+ image_idx = [-1]*len(images)
155
+ else:
156
+ image_idx = None
157
+
158
+ sequence_length = output_kwargs["text_kwargs"]["sequence_length"]
159
+
160
+ image_patch_token_id = self.special_token_ids[DEFAULT_IMAGE_PATCH_TOKEN]
161
+ image_col_token_id = self.special_token_ids[DEFAULT_IM_COL_TOKEN]
162
+ image_start_token_id = self.special_token_ids[DEFAULT_IM_START_TOKEN]
163
+ image_end_token_id = self.special_token_ids[DEFAULT_IM_END_TOKEN]
164
+ out = self.image_processor.multimodal_preprocess(
165
+ images=images,
166
+ image_idx=image_idx,
167
+ tokens=np.asarray(tokens).astype(np.int32),
168
+ sequence_length=sequence_length,
169
+ image_patch_token_id=image_patch_token_id,
170
+ image_col_token_id=image_col_token_id,
171
+ image_start_token_id=image_start_token_id,
172
+ image_end_token_id=image_end_token_id,
173
+ **output_kwargs["images_kwargs"]
174
+ )
175
+
176
+ # Prepend BOS
177
+ # Qwen2 and OLMo do not have a BOS and instead use EOS as a generic separator token.
178
+ bos = self.tokenizer.bos_token_id or self.tokenizer.eos_token_id
179
+ decoder_input_tokens = np.pad(out["input_ids"], [[1, 0]], constant_values=bos)
180
+ out["input_ids"] = decoder_input_tokens
181
+ if "image_input_idx" in out:
182
+ # Shift patch mapping up by one since we added BOS
183
+ image_input_idx = out["image_input_idx"]
184
+ out["image_input_idx"] = np.where(image_input_idx < 0, image_input_idx, image_input_idx + 1)
185
+
186
+ for k, v in out.items():
187
+ out[k] = torch.from_numpy(v)
188
+
189
+ return out
190
+
191
+
192
+ MolmoProcessor.register_for_auto_class()
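Because a BOS/EOS token is prepended to input_ids after multimodal preprocessing, every non-negative entry of image_input_idx must shift right by one while padded (-1) entries stay put. A tiny numeric sketch of that shift, with a placeholder token id:

# Tiny numeric sketch of the BOS shift applied to image_input_idx above.
import numpy as np

input_ids = np.array([15, 16, 17], dtype=np.int32)
image_input_idx = np.array([1, 2, -1], dtype=np.int32)   # -1 marks unused slots

bos = 151643  # placeholder id; the real value comes from the tokenizer
input_ids = np.pad(input_ids, [[1, 0]], constant_values=bos)
image_input_idx = np.where(image_input_idx < 0, image_input_idx, image_input_idx + 1)

print(input_ids)        # [151643 15 16 17]
print(image_input_idx)  # [ 2  3 -1]  -> padded entries stay -1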
preprocessor_config.json ADDED
@@ -0,0 +1,32 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoImageProcessor": "image_preprocessing_molmo.MolmoImageProcessor",
4
+ "AutoProcessor": "preprocessing_molmo.MolmoProcessor"
5
+ },
6
+ "base_image_input_size": [
7
+ 336,
8
+ 336
9
+ ],
10
+ "do_normalize": true,
11
+ "image_mean": [
12
+ 0.48145466,
13
+ 0.4578275,
14
+ 0.40821073
15
+ ],
16
+ "image_padding_mask": true,
17
+ "image_patch_size": 14,
18
+ "image_processor_type": "MolmoImageProcessor",
19
+ "image_std": [
20
+ 0.26862954,
21
+ 0.26130258,
22
+ 0.27577711
23
+ ],
24
+ "image_token_length_h": 12,
25
+ "image_token_length_w": 12,
26
+ "max_crops": 12,
27
+ "overlap_margins": [
28
+ 4,
29
+ 4
30
+ ],
31
+ "processor_class": "MolmoProcessor"
32
+ }
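do_normalize with these image_mean/image_std values is the usual CLIP-style per-channel normalization; a minimal sketch of what it does to an RGB array, with the constants copied from this config:

# Minimal sketch of the normalization implied by do_normalize/image_mean/image_std.
import numpy as np

image_mean = np.array([0.48145466, 0.4578275, 0.40821073], dtype=np.float32)
image_std = np.array([0.26862954, 0.26130258, 0.27577711], dtype=np.float32)

pixels = np.random.randint(0, 256, size=(336, 336, 3)).astype(np.float32) / 255.0
normalized = (pixels - image_mean) / image_std
print(normalized.shape, normalized.mean(axis=(0, 1)))   # roughly zero-centered channels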
processor_config.json ADDED
@@ -0,0 +1,6 @@
 
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "preprocessing_molmo.MolmoProcessor"
4
+ },
5
+ "processor_class": "MolmoProcessor"
6
+ }
rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da661aad2e9ab98676885cda2d296e7d5781572d0062fef9c91ad25c971522e1
3
+ size 15920
rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76066b4424ebb894fbf93616ab2e9648b9b421dcd3b26e99900e877a4b1aef69
3
+ size 15984
rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1573cf092a799f5b9d7a1ea62ab9b1b58065859e3ab6d98cc28dc4083afdcfdd
3
+ size 15984
rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cbf4bea5d1ec717842d4dc103e72d1adb2a2b31afc91aefe38bcfcba578f77c6
3
+ size 15984
rng_state_4.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d885754fb7b8ce47bda620803ac75712487e2c508ad1b8100c7f9d38da7c661
3
+ size 15984
rng_state_5.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6862d26c10da6510d3a0336dac2f26b1e85421b284a42237463e13cc78ef3df1
3
+ size 16048
rng_state_6.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c6b9cfcbe810c109da95d448566518364e3ee79c9bb31a904613d5a69c8b367
3
+ size 15920
rng_state_7.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f52b3be52613e7b518e640203ac12f79eb7f2fdfae165af3bb755b5db080c178
3
+ size 15920
sft_args.json ADDED
@@ -0,0 +1,302 @@
1
+ {
2
+ "model_type": "molmo-7b-d",
3
+ "model_id_or_path": "pbarker/ComputerBase-v0.1-M-3epoch",
4
+ "model_revision": "main",
5
+ "full_determinism": false,
6
+ "sft_type": "full",
7
+ "freeze_parameters": [],
8
+ "freeze_vit": false,
9
+ "freeze_parameters_ratio": 0.0,
10
+ "additional_trainable_parameters": [],
11
+ "tuner_backend": "peft",
12
+ "template_type": "molmo",
13
+ "output_dir": "/workspace/output/molmo-7b-d/v1-20250103-233013",
14
+ "add_output_dir_suffix": true,
15
+ "ddp_backend": "nccl",
16
+ "ddp_find_unused_parameters": null,
17
+ "ddp_broadcast_buffers": null,
18
+ "ddp_timeout": 1800,
19
+ "seed": 42,
20
+ "resume_from_checkpoint": null,
21
+ "resume_only_model": false,
22
+ "ignore_data_skip": false,
23
+ "dtype": "bf16",
24
+ "packing": false,
25
+ "train_backend": "transformers",
26
+ "tp": 1,
27
+ "pp": 1,
28
+ "min_lr": null,
29
+ "sequence_parallel": false,
30
+ "model_kwargs": {},
31
+ "loss_name": null,
32
+ "dataset": [
33
+ "/workspace/train.jsonl"
34
+ ],
35
+ "val_dataset": [
36
+ "/workspace/val.jsonl"
37
+ ],
38
+ "dataset_seed": 42,
39
+ "dataset_test_ratio": 0.0,
40
+ "use_loss_scale": false,
41
+ "loss_scale_config_path": "/workspace/miniconda/lib/python3.12/site-packages/swift/llm/agent/default_loss_scale_config.json",
42
+ "system": null,
43
+ "tools_prompt": "react_en",
44
+ "max_length": 4096,
45
+ "truncation_strategy": "delete",
46
+ "check_dataset_strategy": "none",
47
+ "streaming": false,
48
+ "streaming_val_size": 0,
49
+ "streaming_buffer_size": 16384,
50
+ "model_name": [
51
+ null,
52
+ null
53
+ ],
54
+ "model_author": [
55
+ null,
56
+ null
57
+ ],
58
+ "quant_method": null,
59
+ "quantization_bit": 0,
60
+ "hqq_axis": 0,
61
+ "hqq_dynamic_config_path": null,
62
+ "bnb_4bit_comp_dtype": "bf16",
63
+ "bnb_4bit_quant_type": "nf4",
64
+ "bnb_4bit_use_double_quant": true,
65
+ "bnb_4bit_quant_storage": null,
66
+ "rescale_image": -1,
67
+ "target_modules": "^(model.transformer)(?!.*(lm_head|output|emb|wte|shared)).*",
68
+ "target_regex": null,
69
+ "modules_to_save": [],
70
+ "lora_rank": 8,
71
+ "lora_alpha": 32,
72
+ "lora_dropout": 0.05,
73
+ "lora_bias_trainable": "none",
74
+ "lora_dtype": "AUTO",
75
+ "lora_lr_ratio": null,
76
+ "use_rslora": false,
77
+ "use_dora": false,
78
+ "init_lora_weights": "true",
79
+ "fourier_n_frequency": 2000,
80
+ "fourier_scaling": 300.0,
81
+ "rope_scaling": null,
82
+ "boft_block_size": 4,
83
+ "boft_block_num": 0,
84
+ "boft_n_butterfly_factor": 1,
85
+ "boft_dropout": 0.0,
86
+ "vera_rank": 256,
87
+ "vera_projection_prng_key": 0,
88
+ "vera_dropout": 0.0,
89
+ "vera_d_initial": 0.1,
90
+ "adapter_act": "gelu",
91
+ "adapter_length": 128,
92
+ "use_galore": false,
93
+ "galore_target_modules": null,
94
+ "galore_rank": 128,
95
+ "galore_update_proj_gap": 50,
96
+ "galore_scale": 1.0,
97
+ "galore_proj_type": "std",
98
+ "galore_optim_per_parameter": false,
99
+ "galore_with_embedding": false,
100
+ "galore_quantization": false,
101
+ "galore_proj_quant": false,
102
+ "galore_proj_bits": 4,
103
+ "galore_proj_group_size": 256,
104
+ "galore_cos_threshold": 0.4,
105
+ "galore_gamma_proj": 2,
106
+ "galore_queue_size": 5,
107
+ "adalora_target_r": 8,
108
+ "adalora_init_r": 12,
109
+ "adalora_tinit": 0,
110
+ "adalora_tfinal": 0,
111
+ "adalora_deltaT": 1,
112
+ "adalora_beta1": 0.85,
113
+ "adalora_beta2": 0.85,
114
+ "adalora_orth_reg_weight": 0.5,
115
+ "ia3_feedforward_modules": [],
116
+ "llamapro_num_new_blocks": 4,
117
+ "llamapro_num_groups": null,
118
+ "neftune_noise_alpha": null,
119
+ "neftune_backend": "transformers",
120
+ "lisa_activated_layers": 0,
121
+ "lisa_step_interval": 20,
122
+ "reft_layer_key": null,
123
+ "reft_layers": null,
124
+ "reft_rank": 4,
125
+ "reft_intervention_type": "LoreftIntervention",
126
+ "reft_args": null,
127
+ "use_liger": false,
128
+ "gradient_checkpointing": false,
129
+ "vit_use_gc": true,
130
+ "deepspeed": {
131
+ "fp16": {
132
+ "enabled": "auto",
133
+ "loss_scale": 0,
134
+ "loss_scale_window": 1000,
135
+ "initial_scale_power": 16,
136
+ "hysteresis": 2,
137
+ "min_loss_scale": 1
138
+ },
139
+ "bf16": {
140
+ "enabled": "auto"
141
+ },
142
+ "optimizer": {
143
+ "type": "AdamW",
144
+ "params": {
145
+ "lr": "auto",
146
+ "betas": "auto",
147
+ "eps": "auto",
148
+ "weight_decay": "auto"
149
+ }
150
+ },
151
+ "scheduler": {
152
+ "type": "WarmupCosineLR",
153
+ "params": {
154
+ "total_num_steps": "auto",
155
+ "warmup_num_steps": "auto"
156
+ }
157
+ },
158
+ "zero_optimization": {
159
+ "stage": 3,
160
+ "offload_optimizer": {
161
+ "device": "none",
162
+ "pin_memory": true
163
+ },
164
+ "offload_param": {
165
+ "device": "none",
166
+ "pin_memory": true
167
+ },
168
+ "overlap_comm": true,
169
+ "contiguous_gradients": true,
170
+ "sub_group_size": 1000000000.0,
171
+ "reduce_bucket_size": "auto",
172
+ "stage3_prefetch_bucket_size": "auto",
173
+ "stage3_param_persistence_threshold": "auto",
174
+ "stage3_max_live_parameters": 1000000000.0,
175
+ "stage3_max_reuse_distance": 1000000000.0,
176
+ "stage3_gather_16bit_weights_on_model_save": true
177
+ },
178
+ "gradient_accumulation_steps": "auto",
179
+ "gradient_clipping": "auto",
180
+ "steps_per_print": 2000,
181
+ "train_batch_size": "auto",
182
+ "train_micro_batch_size_per_gpu": "auto",
183
+ "wall_clock_breakdown": false
184
+ },
185
+ "batch_size": 1,
186
+ "eval_batch_size": 1,
187
+ "auto_find_batch_size": false,
188
+ "num_train_epochs": 4,
189
+ "max_steps": -1,
190
+ "optim": "adamw_torch",
191
+ "adam_beta1": 0.9,
192
+ "adam_beta2": 0.95,
193
+ "adam_epsilon": 1e-08,
194
+ "learning_rate": 1e-05,
195
+ "weight_decay": 0.1,
196
+ "gradient_accumulation_steps": 2,
197
+ "max_grad_norm": 1,
198
+ "predict_with_generate": false,
199
+ "lr_scheduler_type": "cosine",
200
+ "lr_scheduler_kwargs": {},
201
+ "warmup_ratio": 0.05,
202
+ "warmup_steps": 0,
203
+ "eval_steps": 200,
204
+ "save_steps": 200,
205
+ "save_only_model": false,
206
+ "save_total_limit": 5,
207
+ "logging_steps": 5,
208
+ "acc_steps": 1,
209
+ "dataloader_num_workers": 1,
210
+ "dataloader_pin_memory": true,
211
+ "dataloader_drop_last": false,
212
+ "push_to_hub": false,
213
+ "hub_model_id": null,
214
+ "hub_token": null,
215
+ "hub_private_repo": false,
216
+ "hub_strategy": "every_save",
217
+ "test_oom_error": false,
218
+ "disable_tqdm": false,
219
+ "lazy_tokenize": true,
220
+ "preprocess_num_proc": 1,
221
+ "use_flash_attn": null,
222
+ "ignore_args_error": false,
223
+ "check_model_is_latest": true,
224
+ "logging_dir": "/workspace/output/molmo-7b-d/v1-20250103-233013/runs",
225
+ "report_to": [
226
+ "wandb"
227
+ ],
228
+ "acc_strategy": "token",
229
+ "save_on_each_node": false,
230
+ "evaluation_strategy": "epoch",
231
+ "save_strategy": "epoch",
232
+ "save_safetensors": true,
233
+ "gpu_memory_fraction": null,
234
+ "include_num_input_tokens_seen": false,
235
+ "local_repo_path": null,
236
+ "custom_register_path": null,
237
+ "custom_dataset_info": null,
238
+ "device_map_config": null,
239
+ "device_max_memory": [],
240
+ "max_new_tokens": 2048,
241
+ "do_sample": null,
242
+ "temperature": null,
243
+ "top_k": null,
244
+ "top_p": null,
245
+ "repetition_penalty": null,
246
+ "num_beams": 1,
247
+ "fsdp": "",
248
+ "fsdp_config": null,
249
+ "sequence_parallel_size": 1,
250
+ "model_layer_cls_name": null,
251
+ "metric_warmup_step": 0,
252
+ "fsdp_num": 1,
253
+ "per_device_train_batch_size": null,
254
+ "per_device_eval_batch_size": null,
255
+ "eval_strategy": "epoch",
256
+ "self_cognition_sample": 0,
257
+ "train_dataset_mix_ratio": 0.0,
258
+ "train_dataset_mix_ds": [
259
+ "ms-bench"
260
+ ],
261
+ "train_dataset_sample": -1,
262
+ "val_dataset_sample": null,
263
+ "safe_serialization": null,
264
+ "only_save_model": null,
265
+ "neftune_alpha": null,
266
+ "deepspeed_config_path": null,
267
+ "model_cache_dir": null,
268
+ "lora_dropout_p": null,
269
+ "lora_target_modules": [],
270
+ "lora_target_regex": null,
271
+ "lora_modules_to_save": [],
272
+ "boft_target_modules": [],
273
+ "boft_modules_to_save": [],
274
+ "vera_target_modules": [],
275
+ "vera_modules_to_save": [],
276
+ "ia3_target_modules": [],
277
+ "ia3_modules_to_save": [],
278
+ "custom_train_dataset_path": [],
279
+ "custom_val_dataset_path": [],
280
+ "device_map_config_path": null,
281
+ "push_hub_strategy": null,
282
+ "use_self_cognition": false,
283
+ "is_multimodal": true,
284
+ "is_vision": true,
285
+ "lora_use_embedding": false,
286
+ "lora_use_all": false,
287
+ "lora_m2s_use_embedding": false,
288
+ "lora_m2s_use_ln": false,
289
+ "torch_dtype": "torch.bfloat16",
290
+ "fp16": false,
291
+ "bf16": true,
292
+ "rank": 0,
293
+ "local_rank": 0,
294
+ "world_size": 8,
295
+ "local_world_size": 8,
296
+ "bnb_4bit_compute_dtype": "torch.bfloat16",
297
+ "load_in_4bit": false,
298
+ "load_in_8bit": false,
299
+ "train_sampler_random": true,
300
+ "train_type": "sft",
301
+ "training_args": "Seq2SeqTrainingArguments(output_dir='/workspace/output/molmo-7b-d/v1-20250103-233013', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.EPOCH: 'epoch'>, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=2, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=1e-05, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1, num_train_epochs=4, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs={}, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/workspace/output/molmo-7b-d/v1-20250103-233013/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.EPOCH: 'epoch'>, save_steps=200, save_total_limit=5, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend='nccl', tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=200, dataloader_num_workers=1, dataloader_prefetch_factor=None, past_index=-1, run_name='/workspace/output/molmo-7b-d/v1-20250103-233013', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'optimizer': {'type': 'AdamW', 'params': {'lr': 'auto', 'betas': 'auto', 'eps': 'auto', 'weight_decay': 'auto'}}, 'scheduler': {'type': 'WarmupCosineLR', 'params': {'total_num_steps': 'auto', 'warmup_num_steps': 'auto'}}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH: 'adamw_torch'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['wandb'], 
ddp_find_unused_parameters=True, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=True, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=False, hub_always_push=False, gradient_checkpointing=False, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy=None, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=False, include_num_input_tokens_seen=False, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=False, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=GenerationConfig {\n \"eos_token_id\": 151643,\n \"max_new_tokens\": 2048,\n \"pad_token_id\": 151643\n}\n, acc_strategy='token', loss_name=None, additional_saved_files=[], train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1)"
302
+ }
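For reference, the `deepspeed` block above is a ZeRO stage-3 configuration in which every `"auto"` value is resolved by the Hugging Face Trainer from the top-level arguments (learning rate, betas, warmup, batch sizes) at launch time. Below is a minimal sketch of how a config like this is typically wired into `Seq2SeqTrainingArguments`; this is not the exact launcher used for this run, the output path is illustrative, and the dict is abbreviated to the keys shown above.

```python
# Minimal sketch, assuming transformers + deepspeed are installed.
# The dict mirrors the "deepspeed" block in sft_args.json (abbreviated);
# "auto" entries are filled in by the HF Trainer from the arguments below.
from transformers import Seq2SeqTrainingArguments

ds_config = {
    "bf16": {"enabled": "auto"},
    "optimizer": {"type": "AdamW",
                  "params": {"lr": "auto", "betas": "auto",
                             "eps": "auto", "weight_decay": "auto"}},
    "scheduler": {"type": "WarmupCosineLR",
                  "params": {"total_num_steps": "auto", "warmup_num_steps": "auto"}},
    "zero_optimization": {"stage": 3,
                          "overlap_comm": True,
                          "contiguous_gradients": True,
                          "stage3_gather_16bit_weights_on_model_save": True},
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
}

training_args = Seq2SeqTrainingArguments(
    output_dir="./output/molmo-7b-d",   # illustrative path, not the original one
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    learning_rate=1e-5,
    weight_decay=0.1,
    adam_beta1=0.9,
    adam_beta2=0.95,
    num_train_epochs=4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    max_grad_norm=1.0,
    bf16=True,
    deepspeed=ds_config,                 # the ZeRO-3 config shown above
)
```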
special_tokens_map.json ADDED
@@ -0,0 +1,435 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "|<EXTRA_TOKENS_0>|",
4
+ "|<EXTRA_TOKENS_1>|",
5
+ "|<EXTRA_TOKENS_2>|",
6
+ "|<EXTRA_TOKENS_3>|",
7
+ "|<EXTRA_TOKENS_4>|",
8
+ "|<EXTRA_TOKENS_5>|",
9
+ "|<EXTRA_TOKENS_6>|",
10
+ "|<EXTRA_TOKENS_7>|",
11
+ "|<EXTRA_TOKENS_8>|",
12
+ "|<EXTRA_TOKENS_9>|",
13
+ "|<EXTRA_TOKENS_10>|",
14
+ "|<EXTRA_TOKENS_11>|",
15
+ "|<EXTRA_TOKENS_12>|",
16
+ "|<EXTRA_TOKENS_13>|",
17
+ "|<EXTRA_TOKENS_14>|",
18
+ "|<EXTRA_TOKENS_15>|",
19
+ "|<EXTRA_TOKENS_16>|",
20
+ "|<EXTRA_TOKENS_17>|",
21
+ "|<EXTRA_TOKENS_18>|",
22
+ "|<EXTRA_TOKENS_19>|",
23
+ "|<EXTRA_TOKENS_20>|",
24
+ "|<EXTRA_TOKENS_21>|",
25
+ "|<EXTRA_TOKENS_22>|",
26
+ "|<EXTRA_TOKENS_23>|",
27
+ "|<EXTRA_TOKENS_24>|",
28
+ "|<EXTRA_TOKENS_25>|",
29
+ "|<EXTRA_TOKENS_26>|",
30
+ "|<EXTRA_TOKENS_27>|",
31
+ "|<EXTRA_TOKENS_28>|",
32
+ "|<EXTRA_TOKENS_29>|",
33
+ "|<EXTRA_TOKENS_30>|",
34
+ "|<EXTRA_TOKENS_31>|",
35
+ "|<EXTRA_TOKENS_32>|",
36
+ "|<EXTRA_TOKENS_33>|",
37
+ "|<EXTRA_TOKENS_34>|",
38
+ "|<EXTRA_TOKENS_35>|",
39
+ "|<EXTRA_TOKENS_36>|",
40
+ "|<EXTRA_TOKENS_37>|",
41
+ "|<EXTRA_TOKENS_38>|",
42
+ "|<EXTRA_TOKENS_39>|",
43
+ "|<EXTRA_TOKENS_40>|",
44
+ "|<EXTRA_TOKENS_41>|",
45
+ "|<EXTRA_TOKENS_42>|",
46
+ "|<EXTRA_TOKENS_43>|",
47
+ "|<EXTRA_TOKENS_44>|",
48
+ "|<EXTRA_TOKENS_45>|",
49
+ "|<EXTRA_TOKENS_46>|",
50
+ "|<EXTRA_TOKENS_47>|",
51
+ "|<EXTRA_TOKENS_48>|",
52
+ "|<EXTRA_TOKENS_49>|",
53
+ "|<EXTRA_TOKENS_50>|",
54
+ "|<EXTRA_TOKENS_51>|",
55
+ "|<EXTRA_TOKENS_52>|",
56
+ "|<EXTRA_TOKENS_53>|",
57
+ "|<EXTRA_TOKENS_54>|",
58
+ "|<EXTRA_TOKENS_55>|",
59
+ "|<EXTRA_TOKENS_56>|",
60
+ "|<EXTRA_TOKENS_57>|",
61
+ "|<EXTRA_TOKENS_58>|",
62
+ "|<EXTRA_TOKENS_59>|",
63
+ "|<EXTRA_TOKENS_60>|",
64
+ "|<EXTRA_TOKENS_61>|",
65
+ "|<EXTRA_TOKENS_62>|",
66
+ "|<EXTRA_TOKENS_63>|",
67
+ "|<EXTRA_TOKENS_64>|",
68
+ "|<EXTRA_TOKENS_65>|",
69
+ "|<EXTRA_TOKENS_66>|",
70
+ "|<EXTRA_TOKENS_67>|",
71
+ "|<EXTRA_TOKENS_68>|",
72
+ "|<EXTRA_TOKENS_69>|",
73
+ "|<EXTRA_TOKENS_70>|",
74
+ "|<EXTRA_TOKENS_71>|",
75
+ "|<EXTRA_TOKENS_72>|",
76
+ "|<EXTRA_TOKENS_73>|",
77
+ "|<EXTRA_TOKENS_74>|",
78
+ "|<EXTRA_TOKENS_75>|",
79
+ "|<EXTRA_TOKENS_76>|",
80
+ "|<EXTRA_TOKENS_77>|",
81
+ "|<EXTRA_TOKENS_78>|",
82
+ "|<EXTRA_TOKENS_79>|",
83
+ "|<EXTRA_TOKENS_80>|",
84
+ "|<EXTRA_TOKENS_81>|",
85
+ "|<EXTRA_TOKENS_82>|",
86
+ "|<EXTRA_TOKENS_83>|",
87
+ "|<EXTRA_TOKENS_84>|",
88
+ "|<EXTRA_TOKENS_85>|",
89
+ "|<EXTRA_TOKENS_86>|",
90
+ "|<EXTRA_TOKENS_87>|",
91
+ "|<EXTRA_TOKENS_88>|",
92
+ "|<EXTRA_TOKENS_89>|",
93
+ "|<EXTRA_TOKENS_90>|",
94
+ "|<EXTRA_TOKENS_91>|",
95
+ "|<EXTRA_TOKENS_92>|",
96
+ "|<EXTRA_TOKENS_93>|",
97
+ "|<EXTRA_TOKENS_94>|",
98
+ "|<EXTRA_TOKENS_95>|",
99
+ "|<EXTRA_TOKENS_96>|",
100
+ "|<EXTRA_TOKENS_97>|",
101
+ "|<EXTRA_TOKENS_98>|",
102
+ "|<EXTRA_TOKENS_99>|",
103
+ "|<EXTRA_TOKENS_100>|",
104
+ "|<EXTRA_TOKENS_101>|",
105
+ "|<EXTRA_TOKENS_102>|",
106
+ "|<EXTRA_TOKENS_103>|",
107
+ "|<EXTRA_TOKENS_104>|",
108
+ "|<EXTRA_TOKENS_105>|",
109
+ "|<EXTRA_TOKENS_106>|",
110
+ "|<EXTRA_TOKENS_107>|",
111
+ "|<EXTRA_TOKENS_108>|",
112
+ "|<EXTRA_TOKENS_109>|",
113
+ "|<EXTRA_TOKENS_110>|",
114
+ "|<EXTRA_TOKENS_111>|",
115
+ "|<EXTRA_TOKENS_112>|",
116
+ "|<EXTRA_TOKENS_113>|",
117
+ "|<EXTRA_TOKENS_114>|",
118
+ "|<EXTRA_TOKENS_115>|",
119
+ "|<EXTRA_TOKENS_116>|",
120
+ "|<EXTRA_TOKENS_117>|",
121
+ "|<EXTRA_TOKENS_118>|",
122
+ "|<EXTRA_TOKENS_119>|",
123
+ "|<EXTRA_TOKENS_120>|",
124
+ "|<EXTRA_TOKENS_121>|",
125
+ "|<EXTRA_TOKENS_122>|",
126
+ "|<EXTRA_TOKENS_123>|",
127
+ "|<EXTRA_TOKENS_124>|",
128
+ "|<EXTRA_TOKENS_125>|",
129
+ "|<EXTRA_TOKENS_126>|",
130
+ "|<EXTRA_TOKENS_127>|",
131
+ "|<EXTRA_TOKENS_128>|",
132
+ "|<EXTRA_TOKENS_129>|",
133
+ "|<EXTRA_TOKENS_130>|",
134
+ "|<EXTRA_TOKENS_131>|",
135
+ "|<EXTRA_TOKENS_132>|",
136
+ "|<EXTRA_TOKENS_133>|",
137
+ "|<EXTRA_TOKENS_134>|",
138
+ "|<EXTRA_TOKENS_135>|",
139
+ "|<EXTRA_TOKENS_136>|",
140
+ "|<EXTRA_TOKENS_137>|",
141
+ "|<EXTRA_TOKENS_138>|",
142
+ "|<EXTRA_TOKENS_139>|",
143
+ "|<EXTRA_TOKENS_140>|",
144
+ "|<EXTRA_TOKENS_141>|",
145
+ "|<EXTRA_TOKENS_142>|",
146
+ "|<EXTRA_TOKENS_143>|",
147
+ "|<EXTRA_TOKENS_144>|",
148
+ "|<EXTRA_TOKENS_145>|",
149
+ "|<EXTRA_TOKENS_146>|",
150
+ "|<EXTRA_TOKENS_147>|",
151
+ "|<EXTRA_TOKENS_148>|",
152
+ "|<EXTRA_TOKENS_149>|",
153
+ "|<EXTRA_TOKENS_150>|",
154
+ "|<EXTRA_TOKENS_151>|",
155
+ "|<EXTRA_TOKENS_152>|",
156
+ "|<EXTRA_TOKENS_153>|",
157
+ "|<EXTRA_TOKENS_154>|",
158
+ "|<EXTRA_TOKENS_155>|",
159
+ "|<EXTRA_TOKENS_156>|",
160
+ "|<EXTRA_TOKENS_157>|",
161
+ "|<EXTRA_TOKENS_158>|",
162
+ "|<EXTRA_TOKENS_159>|",
163
+ "|<EXTRA_TOKENS_160>|",
164
+ "|<EXTRA_TOKENS_161>|",
165
+ "|<EXTRA_TOKENS_162>|",
166
+ "|<EXTRA_TOKENS_163>|",
167
+ "|<EXTRA_TOKENS_164>|",
168
+ "|<EXTRA_TOKENS_165>|",
169
+ "|<EXTRA_TOKENS_166>|",
170
+ "|<EXTRA_TOKENS_167>|",
171
+ "|<EXTRA_TOKENS_168>|",
172
+ "|<EXTRA_TOKENS_169>|",
173
+ "|<EXTRA_TOKENS_170>|",
174
+ "|<EXTRA_TOKENS_171>|",
175
+ "|<EXTRA_TOKENS_172>|",
176
+ "|<EXTRA_TOKENS_173>|",
177
+ "|<EXTRA_TOKENS_174>|",
178
+ "|<EXTRA_TOKENS_175>|",
179
+ "|<EXTRA_TOKENS_176>|",
180
+ "|<EXTRA_TOKENS_177>|",
181
+ "|<EXTRA_TOKENS_178>|",
182
+ "|<EXTRA_TOKENS_179>|",
183
+ "|<EXTRA_TOKENS_180>|",
184
+ "|<EXTRA_TOKENS_181>|",
185
+ "|<EXTRA_TOKENS_182>|",
186
+ "|<EXTRA_TOKENS_183>|",
187
+ "|<EXTRA_TOKENS_184>|",
188
+ "|<EXTRA_TOKENS_185>|",
189
+ "|<EXTRA_TOKENS_186>|",
190
+ "|<EXTRA_TOKENS_187>|",
191
+ "|<EXTRA_TOKENS_188>|",
192
+ "|<EXTRA_TOKENS_189>|",
193
+ "|<EXTRA_TOKENS_190>|",
194
+ "|<EXTRA_TOKENS_191>|",
195
+ "|<EXTRA_TOKENS_192>|",
196
+ "|<EXTRA_TOKENS_193>|",
197
+ "|<EXTRA_TOKENS_194>|",
198
+ "|<EXTRA_TOKENS_195>|",
199
+ "|<EXTRA_TOKENS_196>|",
200
+ "|<EXTRA_TOKENS_197>|",
201
+ "|<EXTRA_TOKENS_198>|",
202
+ "|<EXTRA_TOKENS_199>|",
203
+ "|<EXTRA_TOKENS_200>|",
204
+ "|<EXTRA_TOKENS_201>|",
205
+ "|<EXTRA_TOKENS_202>|",
206
+ "|<EXTRA_TOKENS_203>|",
207
+ "|<EXTRA_TOKENS_204>|",
208
+ "|<EXTRA_TOKENS_205>|",
209
+ "|<EXTRA_TOKENS_206>|",
210
+ "|<EXTRA_TOKENS_207>|",
211
+ "|<EXTRA_TOKENS_208>|",
212
+ "|<EXTRA_TOKENS_209>|",
213
+ "|<EXTRA_TOKENS_210>|",
214
+ "|<EXTRA_TOKENS_211>|",
215
+ "|<EXTRA_TOKENS_212>|",
216
+ "|<EXTRA_TOKENS_213>|",
217
+ "|<EXTRA_TOKENS_214>|",
218
+ "|<EXTRA_TOKENS_215>|",
219
+ "|<EXTRA_TOKENS_216>|",
220
+ "|<EXTRA_TOKENS_217>|",
221
+ "|<EXTRA_TOKENS_218>|",
222
+ "|<EXTRA_TOKENS_219>|",
223
+ "|<EXTRA_TOKENS_220>|",
224
+ "|<EXTRA_TOKENS_221>|",
225
+ "|<EXTRA_TOKENS_222>|",
226
+ "|<EXTRA_TOKENS_223>|",
227
+ "|<EXTRA_TOKENS_224>|",
228
+ "|<EXTRA_TOKENS_225>|",
229
+ "|<EXTRA_TOKENS_226>|",
230
+ "|<EXTRA_TOKENS_227>|",
231
+ "|<EXTRA_TOKENS_228>|",
232
+ "|<EXTRA_TOKENS_229>|",
233
+ "|<EXTRA_TOKENS_230>|",
234
+ "|<EXTRA_TOKENS_231>|",
235
+ "|<EXTRA_TOKENS_232>|",
236
+ "|<EXTRA_TOKENS_233>|",
237
+ "|<EXTRA_TOKENS_234>|",
238
+ "|<EXTRA_TOKENS_235>|",
239
+ "|<EXTRA_TOKENS_236>|",
240
+ "|<EXTRA_TOKENS_237>|",
241
+ "|<EXTRA_TOKENS_238>|",
242
+ "|<EXTRA_TOKENS_239>|",
243
+ "|<EXTRA_TOKENS_240>|",
244
+ "|<EXTRA_TOKENS_241>|",
245
+ "|<EXTRA_TOKENS_242>|",
246
+ "|<EXTRA_TOKENS_243>|",
247
+ "|<EXTRA_TOKENS_244>|",
248
+ "|<EXTRA_TOKENS_245>|",
249
+ "|<EXTRA_TOKENS_246>|",
250
+ "|<EXTRA_TOKENS_247>|",
251
+ "|<EXTRA_TOKENS_248>|",
252
+ "|<EXTRA_TOKENS_249>|",
253
+ "|<EXTRA_TOKENS_250>|",
254
+ "|<EXTRA_TOKENS_251>|",
255
+ "|<EXTRA_TOKENS_252>|",
256
+ "|<EXTRA_TOKENS_253>|",
257
+ "|<EXTRA_TOKENS_254>|",
258
+ "|<EXTRA_TOKENS_255>|",
259
+ "|<EXTRA_TOKENS_256>|",
260
+ "|<EXTRA_TOKENS_257>|",
261
+ "|<EXTRA_TOKENS_258>|",
262
+ "|<EXTRA_TOKENS_259>|",
263
+ "|<EXTRA_TOKENS_260>|",
264
+ "|<EXTRA_TOKENS_261>|",
265
+ "|<EXTRA_TOKENS_262>|",
266
+ "|<EXTRA_TOKENS_263>|",
267
+ "|<EXTRA_TOKENS_264>|",
268
+ "|<EXTRA_TOKENS_265>|",
269
+ "|<EXTRA_TOKENS_266>|",
270
+ "|<EXTRA_TOKENS_267>|",
271
+ "|<EXTRA_TOKENS_268>|",
272
+ "|<EXTRA_TOKENS_269>|",
273
+ "|<EXTRA_TOKENS_270>|",
274
+ "|<EXTRA_TOKENS_271>|",
275
+ "|<EXTRA_TOKENS_272>|",
276
+ "|<EXTRA_TOKENS_273>|",
277
+ "|<EXTRA_TOKENS_274>|",
278
+ "|<EXTRA_TOKENS_275>|",
279
+ "|<EXTRA_TOKENS_276>|",
280
+ "|<EXTRA_TOKENS_277>|",
281
+ "|<EXTRA_TOKENS_278>|",
282
+ "|<EXTRA_TOKENS_279>|",
283
+ "|<EXTRA_TOKENS_280>|",
284
+ "|<EXTRA_TOKENS_281>|",
285
+ "|<EXTRA_TOKENS_282>|",
286
+ "|<EXTRA_TOKENS_283>|",
287
+ "|<EXTRA_TOKENS_284>|",
288
+ "|<EXTRA_TOKENS_285>|",
289
+ "|<EXTRA_TOKENS_286>|",
290
+ "|<EXTRA_TOKENS_287>|",
291
+ "|<EXTRA_TOKENS_288>|",
292
+ "|<EXTRA_TOKENS_289>|",
293
+ "|<EXTRA_TOKENS_290>|",
294
+ "|<EXTRA_TOKENS_291>|",
295
+ "|<EXTRA_TOKENS_292>|",
296
+ "|<EXTRA_TOKENS_293>|",
297
+ "|<EXTRA_TOKENS_294>|",
298
+ "|<EXTRA_TOKENS_295>|",
299
+ "|<EXTRA_TOKENS_296>|",
300
+ "|<EXTRA_TOKENS_297>|",
301
+ "|<EXTRA_TOKENS_298>|",
302
+ "|<EXTRA_TOKENS_299>|",
303
+ "|<EXTRA_TOKENS_300>|",
304
+ "|<EXTRA_TOKENS_301>|",
305
+ "|<EXTRA_TOKENS_302>|",
306
+ "|<EXTRA_TOKENS_303>|",
307
+ "|<EXTRA_TOKENS_304>|",
308
+ "|<EXTRA_TOKENS_305>|",
309
+ "|<EXTRA_TOKENS_306>|",
310
+ "|<EXTRA_TOKENS_307>|",
311
+ "|<EXTRA_TOKENS_308>|",
312
+ "|<EXTRA_TOKENS_309>|",
313
+ "|<EXTRA_TOKENS_310>|",
314
+ "|<EXTRA_TOKENS_311>|",
315
+ "|<EXTRA_TOKENS_312>|",
316
+ "|<EXTRA_TOKENS_313>|",
317
+ "|<EXTRA_TOKENS_314>|",
318
+ "|<EXTRA_TOKENS_315>|",
319
+ "|<EXTRA_TOKENS_316>|",
320
+ "|<EXTRA_TOKENS_317>|",
321
+ "|<EXTRA_TOKENS_318>|",
322
+ "|<EXTRA_TOKENS_319>|",
323
+ "|<EXTRA_TOKENS_320>|",
324
+ "|<EXTRA_TOKENS_321>|",
325
+ "|<EXTRA_TOKENS_322>|",
326
+ "|<EXTRA_TOKENS_323>|",
327
+ "|<EXTRA_TOKENS_324>|",
328
+ "|<EXTRA_TOKENS_325>|",
329
+ "|<EXTRA_TOKENS_326>|",
330
+ "|<EXTRA_TOKENS_327>|",
331
+ "|<EXTRA_TOKENS_328>|",
332
+ "|<EXTRA_TOKENS_329>|",
333
+ "|<EXTRA_TOKENS_330>|",
334
+ "|<EXTRA_TOKENS_331>|",
335
+ "|<EXTRA_TOKENS_332>|",
336
+ "|<EXTRA_TOKENS_333>|",
337
+ "|<EXTRA_TOKENS_334>|",
338
+ "|<EXTRA_TOKENS_335>|",
339
+ "|<EXTRA_TOKENS_336>|",
340
+ "|<EXTRA_TOKENS_337>|",
341
+ "|<EXTRA_TOKENS_338>|",
342
+ "|<EXTRA_TOKENS_339>|",
343
+ "|<EXTRA_TOKENS_340>|",
344
+ "|<EXTRA_TOKENS_341>|",
345
+ "|<EXTRA_TOKENS_342>|",
346
+ "|<EXTRA_TOKENS_343>|",
347
+ "|<EXTRA_TOKENS_344>|",
348
+ "|<EXTRA_TOKENS_345>|",
349
+ "|<EXTRA_TOKENS_346>|",
350
+ "|<EXTRA_TOKENS_347>|",
351
+ "|<EXTRA_TOKENS_348>|",
352
+ "|<EXTRA_TOKENS_349>|",
353
+ "|<EXTRA_TOKENS_350>|",
354
+ "|<EXTRA_TOKENS_351>|",
355
+ "|<EXTRA_TOKENS_352>|",
356
+ "|<EXTRA_TOKENS_353>|",
357
+ "|<EXTRA_TOKENS_354>|",
358
+ "|<EXTRA_TOKENS_355>|",
359
+ "|<EXTRA_TOKENS_356>|",
360
+ "|<EXTRA_TOKENS_357>|",
361
+ "|<EXTRA_TOKENS_358>|",
362
+ "|<EXTRA_TOKENS_359>|",
363
+ "|<EXTRA_TOKENS_360>|",
364
+ "|<EXTRA_TOKENS_361>|",
365
+ "|<EXTRA_TOKENS_362>|",
366
+ "|<EXTRA_TOKENS_363>|",
367
+ "|<EXTRA_TOKENS_364>|",
368
+ "|<EXTRA_TOKENS_365>|",
369
+ "|<EXTRA_TOKENS_366>|",
370
+ "|<EXTRA_TOKENS_367>|",
371
+ "|<EXTRA_TOKENS_368>|",
372
+ "|<EXTRA_TOKENS_369>|",
373
+ "|<EXTRA_TOKENS_370>|",
374
+ "|<EXTRA_TOKENS_371>|",
375
+ "|<EXTRA_TOKENS_372>|",
376
+ "|<EXTRA_TOKENS_373>|",
377
+ "|<EXTRA_TOKENS_374>|",
378
+ "|<EXTRA_TOKENS_375>|",
379
+ "|<EXTRA_TOKENS_376>|",
380
+ "|<EXTRA_TOKENS_377>|",
381
+ "|<EXTRA_TOKENS_378>|",
382
+ "|<EXTRA_TOKENS_379>|",
383
+ "|<EXTRA_TOKENS_380>|",
384
+ "|<EXTRA_TOKENS_381>|",
385
+ "|<EXTRA_TOKENS_382>|",
386
+ "|<EXTRA_TOKENS_383>|",
387
+ "|<EXTRA_TOKENS_384>|",
388
+ "|<EXTRA_TOKENS_385>|",
389
+ "|<EXTRA_TOKENS_386>|",
390
+ "|<EXTRA_TOKENS_387>|",
391
+ "|<EXTRA_TOKENS_388>|",
392
+ "|<EXTRA_TOKENS_389>|",
393
+ "|<EXTRA_TOKENS_390>|",
394
+ "|<EXTRA_TOKENS_391>|",
395
+ "|<EXTRA_TOKENS_392>|",
396
+ "|<EXTRA_TOKENS_393>|",
397
+ "|<EXTRA_TOKENS_394>|",
398
+ "|<EXTRA_TOKENS_395>|",
399
+ "|<EXTRA_TOKENS_396>|",
400
+ "|<EXTRA_TOKENS_397>|",
401
+ "|<EXTRA_TOKENS_398>|",
402
+ "|<EXTRA_TOKENS_399>|",
403
+ "|<EXTRA_TOKENS_400>|",
404
+ "|<EXTRA_TOKENS_401>|",
405
+ "|<EXTRA_TOKENS_402>|",
406
+ "|<EXTRA_TOKENS_403>|",
407
+ "|<EXTRA_TOKENS_404>|",
408
+ "|<EXTRA_TOKENS_405>|",
409
+ "|<EXTRA_TOKENS_406>|",
410
+ "|<EXTRA_TOKENS_407>|",
411
+ "|<EXTRA_TOKENS_408>|",
412
+ "|<EXTRA_TOKENS_409>|",
413
+ "|<EXTRA_TOKENS_410>|",
414
+ "|<EXTRA_TOKENS_411>|",
415
+ "|<EXTRA_TOKENS_412>|",
416
+ "|<EXTRA_TOKENS_413>|",
417
+ "|<EXTRA_TOKENS_414>|",
418
+ "|<EXTRA_TOKENS_415>|",
419
+ "|<EXTRA_TOKENS_416>|",
420
+ "|<EXTRA_TOKENS_417>|",
421
+ "<im_start>",
422
+ "<im_end>",
423
+ "<im_patch>",
424
+ "<im_col>",
425
+ "<|image|>"
426
+ ],
427
+ "eos_token": "<|endoftext|>",
428
+ "pad_token": {
429
+ "content": "<|endoftext|>",
430
+ "lstrip": false,
431
+ "normalized": false,
432
+ "rstrip": false,
433
+ "single_word": false
434
+ }
435
+ }
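The `additional_special_tokens` above combine 418 `|<EXTRA_TOKENS_n>|` placeholders with the Molmo image markers (`<im_start>`, `<im_end>`, `<im_patch>`, `<im_col>`, `<|image|>`). A hedged sketch of how tokens like these are normally registered on a Hugging Face tokenizer follows; loading from the repo root is an assumption, and in this checkpoint the tokens already exist (ids 151646 and up in tokenizer_config.json), so the call is effectively a no-op.

```python
# Sketch only: registering the extra/image special tokens listed above.
# In this checkpoint they are already in the vocab, so add_special_tokens() returns 0.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(".", trust_remote_code=True)  # repo root assumed

extra = [f"|<EXTRA_TOKENS_{i}>|" for i in range(418)]
image_markers = ["<im_start>", "<im_end>", "<im_patch>", "<im_col>", "<|image|>"]
num_added = tok.add_special_tokens({"additional_special_tokens": extra + image_markers})

print(num_added, tok.convert_tokens_to_ids("<|image|>"))
```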
tokenizer.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6248048a83152ce87663c799492fe7e60c8086f3ae51ce7bd255ccc445746fc0
3
+ size 11501432
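tokenizer.json is tracked with Git LFS, so the diff records only the pointer (spec version, sha256 oid, byte size) rather than the file contents. A small sketch for checking that a locally downloaded copy matches this pointer; the local path is an assumption.

```python
# Sketch: verify a downloaded tokenizer.json against the LFS pointer above.
import hashlib
import os

path = "tokenizer.json"  # assumed local path
expected_oid = "6248048a83152ce87663c799492fe7e60c8086f3ae51ce7bd255ccc445746fc0"
expected_size = 11501432

with open(path, "rb") as f:
    digest = hashlib.sha256(f.read()).hexdigest()

assert os.path.getsize(path) == expected_size, "size mismatch (still an LFS pointer?)"
assert digest == expected_oid, "sha256 mismatch"
print("tokenizer.json matches its LFS pointer")
```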
tokenizer_config.json ADDED
@@ -0,0 +1,3853 @@
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "151643": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "151644": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "151645": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "151646": {
29
+ "content": "|<EXTRA_TOKENS_0>|",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "151647": {
37
+ "content": "|<EXTRA_TOKENS_1>|",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "151648": {
45
+ "content": "|<EXTRA_TOKENS_2>|",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "151649": {
53
+ "content": "|<EXTRA_TOKENS_3>|",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "151650": {
61
+ "content": "|<EXTRA_TOKENS_4>|",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "151651": {
69
+ "content": "|<EXTRA_TOKENS_5>|",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "151652": {
77
+ "content": "|<EXTRA_TOKENS_6>|",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "151653": {
85
+ "content": "|<EXTRA_TOKENS_7>|",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "151654": {
93
+ "content": "|<EXTRA_TOKENS_8>|",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "151655": {
101
+ "content": "|<EXTRA_TOKENS_9>|",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "151656": {
109
+ "content": "|<EXTRA_TOKENS_10>|",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ },
116
+ "151657": {
117
+ "content": "|<EXTRA_TOKENS_11>|",
118
+ "lstrip": false,
119
+ "normalized": false,
120
+ "rstrip": false,
121
+ "single_word": false,
122
+ "special": true
123
+ },
124
+ "151658": {
125
+ "content": "|<EXTRA_TOKENS_12>|",
126
+ "lstrip": false,
127
+ "normalized": false,
128
+ "rstrip": false,
129
+ "single_word": false,
130
+ "special": true
131
+ },
132
+ "151659": {
133
+ "content": "|<EXTRA_TOKENS_13>|",
134
+ "lstrip": false,
135
+ "normalized": false,
136
+ "rstrip": false,
137
+ "single_word": false,
138
+ "special": true
139
+ },
140
+ "151660": {
141
+ "content": "|<EXTRA_TOKENS_14>|",
142
+ "lstrip": false,
143
+ "normalized": false,
144
+ "rstrip": false,
145
+ "single_word": false,
146
+ "special": true
147
+ },
148
+ "151661": {
149
+ "content": "|<EXTRA_TOKENS_15>|",
150
+ "lstrip": false,
151
+ "normalized": false,
152
+ "rstrip": false,
153
+ "single_word": false,
154
+ "special": true
155
+ },
156
+ "151662": {
157
+ "content": "|<EXTRA_TOKENS_16>|",
158
+ "lstrip": false,
159
+ "normalized": false,
160
+ "rstrip": false,
161
+ "single_word": false,
162
+ "special": true
163
+ },
164
+ "151663": {
165
+ "content": "|<EXTRA_TOKENS_17>|",
166
+ "lstrip": false,
167
+ "normalized": false,
168
+ "rstrip": false,
169
+ "single_word": false,
170
+ "special": true
171
+ },
172
+ "151664": {
173
+ "content": "|<EXTRA_TOKENS_18>|",
174
+ "lstrip": false,
175
+ "normalized": false,
176
+ "rstrip": false,
177
+ "single_word": false,
178
+ "special": true
179
+ },
180
+ "151665": {
181
+ "content": "|<EXTRA_TOKENS_19>|",
182
+ "lstrip": false,
183
+ "normalized": false,
184
+ "rstrip": false,
185
+ "single_word": false,
186
+ "special": true
187
+ },
188
+ "151666": {
189
+ "content": "|<EXTRA_TOKENS_20>|",
190
+ "lstrip": false,
191
+ "normalized": false,
192
+ "rstrip": false,
193
+ "single_word": false,
194
+ "special": true
195
+ },
196
+ "151667": {
197
+ "content": "|<EXTRA_TOKENS_21>|",
198
+ "lstrip": false,
199
+ "normalized": false,
200
+ "rstrip": false,
201
+ "single_word": false,
202
+ "special": true
203
+ },
204
+ "151668": {
205
+ "content": "|<EXTRA_TOKENS_22>|",
206
+ "lstrip": false,
207
+ "normalized": false,
208
+ "rstrip": false,
209
+ "single_word": false,
210
+ "special": true
211
+ },
212
+ "151669": {
213
+ "content": "|<EXTRA_TOKENS_23>|",
214
+ "lstrip": false,
215
+ "normalized": false,
216
+ "rstrip": false,
217
+ "single_word": false,
218
+ "special": true
219
+ },
220
+ "151670": {
221
+ "content": "|<EXTRA_TOKENS_24>|",
222
+ "lstrip": false,
223
+ "normalized": false,
224
+ "rstrip": false,
225
+ "single_word": false,
226
+ "special": true
227
+ },
228
+ "151671": {
229
+ "content": "|<EXTRA_TOKENS_25>|",
230
+ "lstrip": false,
231
+ "normalized": false,
232
+ "rstrip": false,
233
+ "single_word": false,
234
+ "special": true
235
+ },
236
+ "151672": {
237
+ "content": "|<EXTRA_TOKENS_26>|",
238
+ "lstrip": false,
239
+ "normalized": false,
240
+ "rstrip": false,
241
+ "single_word": false,
242
+ "special": true
243
+ },
244
+ "151673": {
245
+ "content": "|<EXTRA_TOKENS_27>|",
246
+ "lstrip": false,
247
+ "normalized": false,
248
+ "rstrip": false,
249
+ "single_word": false,
250
+ "special": true
251
+ },
252
+ "151674": {
253
+ "content": "|<EXTRA_TOKENS_28>|",
254
+ "lstrip": false,
255
+ "normalized": false,
256
+ "rstrip": false,
257
+ "single_word": false,
258
+ "special": true
259
+ },
260
+ "151675": {
261
+ "content": "|<EXTRA_TOKENS_29>|",
262
+ "lstrip": false,
263
+ "normalized": false,
264
+ "rstrip": false,
265
+ "single_word": false,
266
+ "special": true
267
+ },
268
+ "151676": {
269
+ "content": "|<EXTRA_TOKENS_30>|",
270
+ "lstrip": false,
271
+ "normalized": false,
272
+ "rstrip": false,
273
+ "single_word": false,
274
+ "special": true
275
+ },
276
+ "151677": {
277
+ "content": "|<EXTRA_TOKENS_31>|",
278
+ "lstrip": false,
279
+ "normalized": false,
280
+ "rstrip": false,
281
+ "single_word": false,
282
+ "special": true
283
+ },
284
+ "151678": {
285
+ "content": "|<EXTRA_TOKENS_32>|",
286
+ "lstrip": false,
287
+ "normalized": false,
288
+ "rstrip": false,
289
+ "single_word": false,
290
+ "special": true
291
+ },
292
+ "151679": {
293
+ "content": "|<EXTRA_TOKENS_33>|",
294
+ "lstrip": false,
295
+ "normalized": false,
296
+ "rstrip": false,
297
+ "single_word": false,
298
+ "special": true
299
+ },
300
+ "151680": {
301
+ "content": "|<EXTRA_TOKENS_34>|",
302
+ "lstrip": false,
303
+ "normalized": false,
304
+ "rstrip": false,
305
+ "single_word": false,
306
+ "special": true
307
+ },
308
+ "151681": {
309
+ "content": "|<EXTRA_TOKENS_35>|",
310
+ "lstrip": false,
311
+ "normalized": false,
312
+ "rstrip": false,
313
+ "single_word": false,
314
+ "special": true
315
+ },
316
+ "151682": {
317
+ "content": "|<EXTRA_TOKENS_36>|",
318
+ "lstrip": false,
319
+ "normalized": false,
320
+ "rstrip": false,
321
+ "single_word": false,
322
+ "special": true
323
+ },
324
+ "151683": {
325
+ "content": "|<EXTRA_TOKENS_37>|",
326
+ "lstrip": false,
327
+ "normalized": false,
328
+ "rstrip": false,
329
+ "single_word": false,
330
+ "special": true
331
+ },
332
+ "151684": {
333
+ "content": "|<EXTRA_TOKENS_38>|",
334
+ "lstrip": false,
335
+ "normalized": false,
336
+ "rstrip": false,
337
+ "single_word": false,
338
+ "special": true
339
+ },
340
+ "151685": {
341
+ "content": "|<EXTRA_TOKENS_39>|",
342
+ "lstrip": false,
343
+ "normalized": false,
344
+ "rstrip": false,
345
+ "single_word": false,
346
+ "special": true
347
+ },
348
+ "151686": {
349
+ "content": "|<EXTRA_TOKENS_40>|",
350
+ "lstrip": false,
351
+ "normalized": false,
352
+ "rstrip": false,
353
+ "single_word": false,
354
+ "special": true
355
+ },
356
+ "151687": {
357
+ "content": "|<EXTRA_TOKENS_41>|",
358
+ "lstrip": false,
359
+ "normalized": false,
360
+ "rstrip": false,
361
+ "single_word": false,
362
+ "special": true
363
+ },
364
+ "151688": {
365
+ "content": "|<EXTRA_TOKENS_42>|",
366
+ "lstrip": false,
367
+ "normalized": false,
368
+ "rstrip": false,
369
+ "single_word": false,
370
+ "special": true
371
+ },
372
+ "151689": {
373
+ "content": "|<EXTRA_TOKENS_43>|",
374
+ "lstrip": false,
375
+ "normalized": false,
376
+ "rstrip": false,
377
+ "single_word": false,
378
+ "special": true
379
+ },
380
+ "151690": {
381
+ "content": "|<EXTRA_TOKENS_44>|",
382
+ "lstrip": false,
383
+ "normalized": false,
384
+ "rstrip": false,
385
+ "single_word": false,
386
+ "special": true
387
+ },
388
+ "151691": {
389
+ "content": "|<EXTRA_TOKENS_45>|",
390
+ "lstrip": false,
391
+ "normalized": false,
392
+ "rstrip": false,
393
+ "single_word": false,
394
+ "special": true
395
+ },
396
+ "151692": {
397
+ "content": "|<EXTRA_TOKENS_46>|",
398
+ "lstrip": false,
399
+ "normalized": false,
400
+ "rstrip": false,
401
+ "single_word": false,
402
+ "special": true
403
+ },
404
+ "151693": {
405
+ "content": "|<EXTRA_TOKENS_47>|",
406
+ "lstrip": false,
407
+ "normalized": false,
408
+ "rstrip": false,
409
+ "single_word": false,
410
+ "special": true
411
+ },
412
+ "151694": {
413
+ "content": "|<EXTRA_TOKENS_48>|",
414
+ "lstrip": false,
415
+ "normalized": false,
416
+ "rstrip": false,
417
+ "single_word": false,
418
+ "special": true
419
+ },
420
+ "151695": {
421
+ "content": "|<EXTRA_TOKENS_49>|",
422
+ "lstrip": false,
423
+ "normalized": false,
424
+ "rstrip": false,
425
+ "single_word": false,
426
+ "special": true
427
+ },
428
+ "151696": {
429
+ "content": "|<EXTRA_TOKENS_50>|",
430
+ "lstrip": false,
431
+ "normalized": false,
432
+ "rstrip": false,
433
+ "single_word": false,
434
+ "special": true
435
+ },
436
+ "151697": {
437
+ "content": "|<EXTRA_TOKENS_51>|",
438
+ "lstrip": false,
439
+ "normalized": false,
440
+ "rstrip": false,
441
+ "single_word": false,
442
+ "special": true
443
+ },
444
+ "151698": {
445
+ "content": "|<EXTRA_TOKENS_52>|",
446
+ "lstrip": false,
447
+ "normalized": false,
448
+ "rstrip": false,
449
+ "single_word": false,
450
+ "special": true
451
+ },
452
+ "151699": {
453
+ "content": "|<EXTRA_TOKENS_53>|",
454
+ "lstrip": false,
455
+ "normalized": false,
456
+ "rstrip": false,
457
+ "single_word": false,
458
+ "special": true
459
+ },
460
+ "151700": {
461
+ "content": "|<EXTRA_TOKENS_54>|",
462
+ "lstrip": false,
463
+ "normalized": false,
464
+ "rstrip": false,
465
+ "single_word": false,
466
+ "special": true
467
+ },
468
+ "151701": {
469
+ "content": "|<EXTRA_TOKENS_55>|",
470
+ "lstrip": false,
471
+ "normalized": false,
472
+ "rstrip": false,
473
+ "single_word": false,
474
+ "special": true
475
+ },
476
+ "151702": {
477
+ "content": "|<EXTRA_TOKENS_56>|",
478
+ "lstrip": false,
479
+ "normalized": false,
480
+ "rstrip": false,
481
+ "single_word": false,
482
+ "special": true
483
+ },
484
+ "151703": {
485
+ "content": "|<EXTRA_TOKENS_57>|",
486
+ "lstrip": false,
487
+ "normalized": false,
488
+ "rstrip": false,
489
+ "single_word": false,
490
+ "special": true
491
+ },
492
+ "151704": {
493
+ "content": "|<EXTRA_TOKENS_58>|",
494
+ "lstrip": false,
495
+ "normalized": false,
496
+ "rstrip": false,
497
+ "single_word": false,
498
+ "special": true
499
+ },
500
+ "151705": {
501
+ "content": "|<EXTRA_TOKENS_59>|",
502
+ "lstrip": false,
503
+ "normalized": false,
504
+ "rstrip": false,
505
+ "single_word": false,
506
+ "special": true
507
+ },
508
+ "151706": {
509
+ "content": "|<EXTRA_TOKENS_60>|",
510
+ "lstrip": false,
511
+ "normalized": false,
512
+ "rstrip": false,
513
+ "single_word": false,
514
+ "special": true
515
+ },
516
+ "151707": {
517
+ "content": "|<EXTRA_TOKENS_61>|",
518
+ "lstrip": false,
519
+ "normalized": false,
520
+ "rstrip": false,
521
+ "single_word": false,
522
+ "special": true
523
+ },
524
+ "151708": {
525
+ "content": "|<EXTRA_TOKENS_62>|",
526
+ "lstrip": false,
527
+ "normalized": false,
528
+ "rstrip": false,
529
+ "single_word": false,
530
+ "special": true
531
+ },
532
+ "151709": {
533
+ "content": "|<EXTRA_TOKENS_63>|",
534
+ "lstrip": false,
535
+ "normalized": false,
536
+ "rstrip": false,
537
+ "single_word": false,
538
+ "special": true
539
+ },
540
+ "151710": {
541
+ "content": "|<EXTRA_TOKENS_64>|",
542
+ "lstrip": false,
543
+ "normalized": false,
544
+ "rstrip": false,
545
+ "single_word": false,
546
+ "special": true
547
+ },
548
+ "151711": {
549
+ "content": "|<EXTRA_TOKENS_65>|",
550
+ "lstrip": false,
551
+ "normalized": false,
552
+ "rstrip": false,
553
+ "single_word": false,
554
+ "special": true
555
+ },
556
+ "151712": {
557
+ "content": "|<EXTRA_TOKENS_66>|",
558
+ "lstrip": false,
559
+ "normalized": false,
560
+ "rstrip": false,
561
+ "single_word": false,
562
+ "special": true
563
+ },
564
+ "151713": {
565
+ "content": "|<EXTRA_TOKENS_67>|",
566
+ "lstrip": false,
567
+ "normalized": false,
568
+ "rstrip": false,
569
+ "single_word": false,
570
+ "special": true
571
+ },
572
+ "151714": {
573
+ "content": "|<EXTRA_TOKENS_68>|",
574
+ "lstrip": false,
575
+ "normalized": false,
576
+ "rstrip": false,
577
+ "single_word": false,
578
+ "special": true
579
+ },
580
+ "151715": {
581
+ "content": "|<EXTRA_TOKENS_69>|",
582
+ "lstrip": false,
583
+ "normalized": false,
584
+ "rstrip": false,
585
+ "single_word": false,
586
+ "special": true
587
+ },
588
+ "151716": {
589
+ "content": "|<EXTRA_TOKENS_70>|",
590
+ "lstrip": false,
591
+ "normalized": false,
592
+ "rstrip": false,
593
+ "single_word": false,
594
+ "special": true
595
+ },
596
+ "151717": {
597
+ "content": "|<EXTRA_TOKENS_71>|",
598
+ "lstrip": false,
599
+ "normalized": false,
600
+ "rstrip": false,
601
+ "single_word": false,
602
+ "special": true
603
+ },
604
+ "151718": {
605
+ "content": "|<EXTRA_TOKENS_72>|",
606
+ "lstrip": false,
607
+ "normalized": false,
608
+ "rstrip": false,
609
+ "single_word": false,
610
+ "special": true
611
+ },
612
+ "151719": {
613
+ "content": "|<EXTRA_TOKENS_73>|",
614
+ "lstrip": false,
615
+ "normalized": false,
616
+ "rstrip": false,
617
+ "single_word": false,
618
+ "special": true
619
+ },
620
+ "151720": {
621
+ "content": "|<EXTRA_TOKENS_74>|",
622
+ "lstrip": false,
623
+ "normalized": false,
624
+ "rstrip": false,
625
+ "single_word": false,
626
+ "special": true
627
+ },
628
+ "151721": {
629
+ "content": "|<EXTRA_TOKENS_75>|",
630
+ "lstrip": false,
631
+ "normalized": false,
632
+ "rstrip": false,
633
+ "single_word": false,
634
+ "special": true
635
+ },
636
+ "151722": {
637
+ "content": "|<EXTRA_TOKENS_76>|",
638
+ "lstrip": false,
639
+ "normalized": false,
640
+ "rstrip": false,
641
+ "single_word": false,
642
+ "special": true
643
+ },
644
+ "151723": {
645
+ "content": "|<EXTRA_TOKENS_77>|",
646
+ "lstrip": false,
647
+ "normalized": false,
648
+ "rstrip": false,
649
+ "single_word": false,
650
+ "special": true
651
+ },
652
+ "151724": {
653
+ "content": "|<EXTRA_TOKENS_78>|",
654
+ "lstrip": false,
655
+ "normalized": false,
656
+ "rstrip": false,
657
+ "single_word": false,
658
+ "special": true
659
+ },
660
+ "151725": {
661
+ "content": "|<EXTRA_TOKENS_79>|",
662
+ "lstrip": false,
663
+ "normalized": false,
664
+ "rstrip": false,
665
+ "single_word": false,
666
+ "special": true
667
+ },
668
+ "151726": {
669
+ "content": "|<EXTRA_TOKENS_80>|",
670
+ "lstrip": false,
671
+ "normalized": false,
672
+ "rstrip": false,
673
+ "single_word": false,
674
+ "special": true
675
+ },
676
+ "151727": {
677
+ "content": "|<EXTRA_TOKENS_81>|",
678
+ "lstrip": false,
679
+ "normalized": false,
680
+ "rstrip": false,
681
+ "single_word": false,
682
+ "special": true
683
+ },
684
+ "151728": {
685
+ "content": "|<EXTRA_TOKENS_82>|",
686
+ "lstrip": false,
687
+ "normalized": false,
688
+ "rstrip": false,
689
+ "single_word": false,
690
+ "special": true
691
+ },
692
+ "151729": {
693
+ "content": "|<EXTRA_TOKENS_83>|",
694
+ "lstrip": false,
695
+ "normalized": false,
696
+ "rstrip": false,
697
+ "single_word": false,
698
+ "special": true
699
+ },
700
+ "151730": {
701
+ "content": "|<EXTRA_TOKENS_84>|",
702
+ "lstrip": false,
703
+ "normalized": false,
704
+ "rstrip": false,
705
+ "single_word": false,
706
+ "special": true
707
+ },
708
+ "151731": {
709
+ "content": "|<EXTRA_TOKENS_85>|",
710
+ "lstrip": false,
711
+ "normalized": false,
712
+ "rstrip": false,
713
+ "single_word": false,
714
+ "special": true
715
+ },
716
+ "151732": {
717
+ "content": "|<EXTRA_TOKENS_86>|",
718
+ "lstrip": false,
719
+ "normalized": false,
720
+ "rstrip": false,
721
+ "single_word": false,
722
+ "special": true
723
+ },
724
+ "151733": {
725
+ "content": "|<EXTRA_TOKENS_87>|",
726
+ "lstrip": false,
727
+ "normalized": false,
728
+ "rstrip": false,
729
+ "single_word": false,
730
+ "special": true
731
+ },
732
+ "151734": {
733
+ "content": "|<EXTRA_TOKENS_88>|",
734
+ "lstrip": false,
735
+ "normalized": false,
736
+ "rstrip": false,
737
+ "single_word": false,
738
+ "special": true
739
+ },
740
+ "151735": {
741
+ "content": "|<EXTRA_TOKENS_89>|",
742
+ "lstrip": false,
743
+ "normalized": false,
744
+ "rstrip": false,
745
+ "single_word": false,
746
+ "special": true
747
+ },
748
+ "151736": {
749
+ "content": "|<EXTRA_TOKENS_90>|",
750
+ "lstrip": false,
751
+ "normalized": false,
752
+ "rstrip": false,
753
+ "single_word": false,
754
+ "special": true
755
+ },
756
+ "151737": {
757
+ "content": "|<EXTRA_TOKENS_91>|",
758
+ "lstrip": false,
759
+ "normalized": false,
760
+ "rstrip": false,
761
+ "single_word": false,
762
+ "special": true
763
+ },
764
+ "151738": {
765
+ "content": "|<EXTRA_TOKENS_92>|",
766
+ "lstrip": false,
767
+ "normalized": false,
768
+ "rstrip": false,
769
+ "single_word": false,
770
+ "special": true
771
+ },
772
+ "151739": {
773
+ "content": "|<EXTRA_TOKENS_93>|",
774
+ "lstrip": false,
775
+ "normalized": false,
776
+ "rstrip": false,
777
+ "single_word": false,
778
+ "special": true
779
+ },
780
+ "151740": {
781
+ "content": "|<EXTRA_TOKENS_94>|",
782
+ "lstrip": false,
783
+ "normalized": false,
784
+ "rstrip": false,
785
+ "single_word": false,
786
+ "special": true
787
+ },
788
+ "151741": {
789
+ "content": "|<EXTRA_TOKENS_95>|",
790
+ "lstrip": false,
791
+ "normalized": false,
792
+ "rstrip": false,
793
+ "single_word": false,
794
+ "special": true
795
+ },
796
+ "151742": {
797
+ "content": "|<EXTRA_TOKENS_96>|",
798
+ "lstrip": false,
799
+ "normalized": false,
800
+ "rstrip": false,
801
+ "single_word": false,
802
+ "special": true
803
+ },
804
+ "151743": {
805
+ "content": "|<EXTRA_TOKENS_97>|",
806
+ "lstrip": false,
807
+ "normalized": false,
808
+ "rstrip": false,
809
+ "single_word": false,
810
+ "special": true
811
+ },
812
+ "151744": {
813
+ "content": "|<EXTRA_TOKENS_98>|",
814
+ "lstrip": false,
815
+ "normalized": false,
816
+ "rstrip": false,
817
+ "single_word": false,
818
+ "special": true
819
+ },
820
+ "151745": {
821
+ "content": "|<EXTRA_TOKENS_99>|",
822
+ "lstrip": false,
823
+ "normalized": false,
824
+ "rstrip": false,
825
+ "single_word": false,
826
+ "special": true
827
+ },
828
+ "151746": {
829
+ "content": "|<EXTRA_TOKENS_100>|",
830
+ "lstrip": false,
831
+ "normalized": false,
832
+ "rstrip": false,
833
+ "single_word": false,
834
+ "special": true
835
+ },
836
+ "151747": {
837
+ "content": "|<EXTRA_TOKENS_101>|",
838
+ "lstrip": false,
839
+ "normalized": false,
840
+ "rstrip": false,
841
+ "single_word": false,
842
+ "special": true
843
+ },
844
+ "151748": {
845
+ "content": "|<EXTRA_TOKENS_102>|",
846
+ "lstrip": false,
847
+ "normalized": false,
848
+ "rstrip": false,
849
+ "single_word": false,
850
+ "special": true
851
+ },
852
+ "151749": {
853
+ "content": "|<EXTRA_TOKENS_103>|",
854
+ "lstrip": false,
855
+ "normalized": false,
856
+ "rstrip": false,
857
+ "single_word": false,
858
+ "special": true
859
+ },
860
+ "151750": {
861
+ "content": "|<EXTRA_TOKENS_104>|",
862
+ "lstrip": false,
863
+ "normalized": false,
864
+ "rstrip": false,
865
+ "single_word": false,
866
+ "special": true
867
+ },
868
+ "151751": {
869
+ "content": "|<EXTRA_TOKENS_105>|",
870
+ "lstrip": false,
871
+ "normalized": false,
872
+ "rstrip": false,
873
+ "single_word": false,
874
+ "special": true
875
+ },
876
+ "151752": {
877
+ "content": "|<EXTRA_TOKENS_106>|",
878
+ "lstrip": false,
879
+ "normalized": false,
880
+ "rstrip": false,
881
+ "single_word": false,
882
+ "special": true
883
+ },
884
+ "151753": {
885
+ "content": "|<EXTRA_TOKENS_107>|",
886
+ "lstrip": false,
887
+ "normalized": false,
888
+ "rstrip": false,
889
+ "single_word": false,
890
+ "special": true
891
+ },
892
+ "151754": {
893
+ "content": "|<EXTRA_TOKENS_108>|",
894
+ "lstrip": false,
895
+ "normalized": false,
896
+ "rstrip": false,
897
+ "single_word": false,
898
+ "special": true
899
+ },
900
+ "151755": {
901
+ "content": "|<EXTRA_TOKENS_109>|",
902
+ "lstrip": false,
903
+ "normalized": false,
904
+ "rstrip": false,
905
+ "single_word": false,
906
+ "special": true
907
+ },
908
+ "151756": {
909
+ "content": "|<EXTRA_TOKENS_110>|",
910
+ "lstrip": false,
911
+ "normalized": false,
912
+ "rstrip": false,
913
+ "single_word": false,
914
+ "special": true
915
+ },
916
+ "151757": {
917
+ "content": "|<EXTRA_TOKENS_111>|",
918
+ "lstrip": false,
919
+ "normalized": false,
920
+ "rstrip": false,
921
+ "single_word": false,
922
+ "special": true
923
+ },
924
+ "151758": {
925
+ "content": "|<EXTRA_TOKENS_112>|",
926
+ "lstrip": false,
927
+ "normalized": false,
928
+ "rstrip": false,
929
+ "single_word": false,
930
+ "special": true
931
+ },
932
+ "151759": {
933
+ "content": "|<EXTRA_TOKENS_113>|",
934
+ "lstrip": false,
935
+ "normalized": false,
936
+ "rstrip": false,
937
+ "single_word": false,
938
+ "special": true
939
+ },
940
+ "151760": {
941
+ "content": "|<EXTRA_TOKENS_114>|",
942
+ "lstrip": false,
943
+ "normalized": false,
944
+ "rstrip": false,
945
+ "single_word": false,
946
+ "special": true
947
+ },
948
+ "151761": {
949
+ "content": "|<EXTRA_TOKENS_115>|",
950
+ "lstrip": false,
951
+ "normalized": false,
952
+ "rstrip": false,
953
+ "single_word": false,
954
+ "special": true
955
+ },
956
+ "151762": {
957
+ "content": "|<EXTRA_TOKENS_116>|",
958
+ "lstrip": false,
959
+ "normalized": false,
960
+ "rstrip": false,
961
+ "single_word": false,
962
+ "special": true
963
+ },
964
+ "151763": {
965
+ "content": "|<EXTRA_TOKENS_117>|",
966
+ "lstrip": false,
967
+ "normalized": false,
968
+ "rstrip": false,
969
+ "single_word": false,
970
+ "special": true
971
+ },
972
+ "151764": {
973
+ "content": "|<EXTRA_TOKENS_118>|",
974
+ "lstrip": false,
975
+ "normalized": false,
976
+ "rstrip": false,
977
+ "single_word": false,
978
+ "special": true
979
+ },
980
+ "151765": {
981
+ "content": "|<EXTRA_TOKENS_119>|",
982
+ "lstrip": false,
983
+ "normalized": false,
984
+ "rstrip": false,
985
+ "single_word": false,
986
+ "special": true
987
+ },
988
+ "151766": {
989
+ "content": "|<EXTRA_TOKENS_120>|",
990
+ "lstrip": false,
991
+ "normalized": false,
992
+ "rstrip": false,
993
+ "single_word": false,
994
+ "special": true
995
+ },
996
+ "151767": {
997
+ "content": "|<EXTRA_TOKENS_121>|",
998
+ "lstrip": false,
999
+ "normalized": false,
1000
+ "rstrip": false,
1001
+ "single_word": false,
1002
+ "special": true
1003
+ },
1004
+ "151768": {
1005
+ "content": "|<EXTRA_TOKENS_122>|",
1006
+ "lstrip": false,
1007
+ "normalized": false,
1008
+ "rstrip": false,
1009
+ "single_word": false,
1010
+ "special": true
1011
+ },
1012
+ "151769": {
1013
+ "content": "|<EXTRA_TOKENS_123>|",
1014
+ "lstrip": false,
1015
+ "normalized": false,
1016
+ "rstrip": false,
1017
+ "single_word": false,
1018
+ "special": true
1019
+ },
1020
+ "151770": {
1021
+ "content": "|<EXTRA_TOKENS_124>|",
1022
+ "lstrip": false,
1023
+ "normalized": false,
1024
+ "rstrip": false,
1025
+ "single_word": false,
1026
+ "special": true
1027
+ },
1028
+ "151771": {
1029
+ "content": "|<EXTRA_TOKENS_125>|",
1030
+ "lstrip": false,
1031
+ "normalized": false,
1032
+ "rstrip": false,
1033
+ "single_word": false,
1034
+ "special": true
1035
+ },
1036
+ "151772": {
1037
+ "content": "|<EXTRA_TOKENS_126>|",
1038
+ "lstrip": false,
1039
+ "normalized": false,
1040
+ "rstrip": false,
1041
+ "single_word": false,
1042
+ "special": true
1043
+ },
1044
+ "151773": {
1045
+ "content": "|<EXTRA_TOKENS_127>|",
1046
+ "lstrip": false,
1047
+ "normalized": false,
1048
+ "rstrip": false,
1049
+ "single_word": false,
1050
+ "special": true
1051
+ },
1052
+ "151774": {
1053
+ "content": "|<EXTRA_TOKENS_128>|",
1054
+ "lstrip": false,
1055
+ "normalized": false,
1056
+ "rstrip": false,
1057
+ "single_word": false,
1058
+ "special": true
1059
+ },
1060
+ "151775": {
1061
+ "content": "|<EXTRA_TOKENS_129>|",
1062
+ "lstrip": false,
1063
+ "normalized": false,
1064
+ "rstrip": false,
1065
+ "single_word": false,
1066
+ "special": true
1067
+ },
1068
+ "151776": {
1069
+ "content": "|<EXTRA_TOKENS_130>|",
1070
+ "lstrip": false,
1071
+ "normalized": false,
1072
+ "rstrip": false,
1073
+ "single_word": false,
1074
+ "special": true
1075
+ },
1076
+ "151777": {
1077
+ "content": "|<EXTRA_TOKENS_131>|",
1078
+ "lstrip": false,
1079
+ "normalized": false,
1080
+ "rstrip": false,
1081
+ "single_word": false,
1082
+ "special": true
1083
+ },
1084
+ "151778": {
1085
+ "content": "|<EXTRA_TOKENS_132>|",
1086
+ "lstrip": false,
1087
+ "normalized": false,
1088
+ "rstrip": false,
1089
+ "single_word": false,
1090
+ "special": true
1091
+ },
1092
+ "151779": {
1093
+ "content": "|<EXTRA_TOKENS_133>|",
1094
+ "lstrip": false,
1095
+ "normalized": false,
1096
+ "rstrip": false,
1097
+ "single_word": false,
1098
+ "special": true
1099
+ },
1100
+ "151780": {
1101
+ "content": "|<EXTRA_TOKENS_134>|",
1102
+ "lstrip": false,
1103
+ "normalized": false,
1104
+ "rstrip": false,
1105
+ "single_word": false,
1106
+ "special": true
1107
+ },
1108
+ "151781": {
1109
+ "content": "|<EXTRA_TOKENS_135>|",
1110
+ "lstrip": false,
1111
+ "normalized": false,
1112
+ "rstrip": false,
1113
+ "single_word": false,
1114
+ "special": true
1115
+ },
1116
+ "151782": {
1117
+ "content": "|<EXTRA_TOKENS_136>|",
1118
+ "lstrip": false,
1119
+ "normalized": false,
1120
+ "rstrip": false,
1121
+ "single_word": false,
1122
+ "special": true
1123
+ },
1124
+ "151783": {
1125
+ "content": "|<EXTRA_TOKENS_137>|",
1126
+ "lstrip": false,
1127
+ "normalized": false,
1128
+ "rstrip": false,
1129
+ "single_word": false,
1130
+ "special": true
1131
+ },
1132
+ "151784": {
1133
+ "content": "|<EXTRA_TOKENS_138>|",
1134
+ "lstrip": false,
1135
+ "normalized": false,
1136
+ "rstrip": false,
1137
+ "single_word": false,
1138
+ "special": true
1139
+ },
1140
+ "151785": {
1141
+ "content": "|<EXTRA_TOKENS_139>|",
1142
+ "lstrip": false,
1143
+ "normalized": false,
1144
+ "rstrip": false,
1145
+ "single_word": false,
1146
+ "special": true
1147
+ },
1148
+ "151786": {
1149
+ "content": "|<EXTRA_TOKENS_140>|",
1150
+ "lstrip": false,
1151
+ "normalized": false,
1152
+ "rstrip": false,
1153
+ "single_word": false,
1154
+ "special": true
1155
+ },
1156
+ "151787": {
1157
+ "content": "|<EXTRA_TOKENS_141>|",
1158
+ "lstrip": false,
1159
+ "normalized": false,
1160
+ "rstrip": false,
1161
+ "single_word": false,
1162
+ "special": true
1163
+ },
1164
+ "151788": {
1165
+ "content": "|<EXTRA_TOKENS_142>|",
1166
+ "lstrip": false,
1167
+ "normalized": false,
1168
+ "rstrip": false,
1169
+ "single_word": false,
1170
+ "special": true
1171
+ },
1172
+ "151789": {
1173
+ "content": "|<EXTRA_TOKENS_143>|",
1174
+ "lstrip": false,
1175
+ "normalized": false,
1176
+ "rstrip": false,
1177
+ "single_word": false,
1178
+ "special": true
1179
+ },
1180
+ "151790": {
1181
+ "content": "|<EXTRA_TOKENS_144>|",
1182
+ "lstrip": false,
1183
+ "normalized": false,
1184
+ "rstrip": false,
1185
+ "single_word": false,
1186
+ "special": true
1187
+ },
1188
+ "151791": {
1189
+ "content": "|<EXTRA_TOKENS_145>|",
1190
+ "lstrip": false,
1191
+ "normalized": false,
1192
+ "rstrip": false,
1193
+ "single_word": false,
1194
+ "special": true
1195
+ },
1196
+ "151792": {
1197
+ "content": "|<EXTRA_TOKENS_146>|",
1198
+ "lstrip": false,
1199
+ "normalized": false,
1200
+ "rstrip": false,
1201
+ "single_word": false,
1202
+ "special": true
1203
+ },
1204
+ "151793": {
1205
+ "content": "|<EXTRA_TOKENS_147>|",
1206
+ "lstrip": false,
1207
+ "normalized": false,
1208
+ "rstrip": false,
1209
+ "single_word": false,
1210
+ "special": true
1211
+ },
1212
+ "151794": {
1213
+ "content": "|<EXTRA_TOKENS_148>|",
1214
+ "lstrip": false,
1215
+ "normalized": false,
1216
+ "rstrip": false,
1217
+ "single_word": false,
1218
+ "special": true
1219
+ },
1220
+ "151795": {
1221
+ "content": "|<EXTRA_TOKENS_149>|",
1222
+ "lstrip": false,
1223
+ "normalized": false,
1224
+ "rstrip": false,
1225
+ "single_word": false,
1226
+ "special": true
1227
+ },
1228
+ "151796": {
1229
+ "content": "|<EXTRA_TOKENS_150>|",
1230
+ "lstrip": false,
1231
+ "normalized": false,
1232
+ "rstrip": false,
1233
+ "single_word": false,
1234
+ "special": true
1235
+ },
1236
+ "151797": {
1237
+ "content": "|<EXTRA_TOKENS_151>|",
1238
+ "lstrip": false,
1239
+ "normalized": false,
1240
+ "rstrip": false,
1241
+ "single_word": false,
1242
+ "special": true
1243
+ },
1244
+ "151798": {
1245
+ "content": "|<EXTRA_TOKENS_152>|",
1246
+ "lstrip": false,
1247
+ "normalized": false,
1248
+ "rstrip": false,
1249
+ "single_word": false,
1250
+ "special": true
1251
+ },
1252
+ "151799": {
1253
+ "content": "|<EXTRA_TOKENS_153>|",
1254
+ "lstrip": false,
1255
+ "normalized": false,
1256
+ "rstrip": false,
1257
+ "single_word": false,
1258
+ "special": true
1259
+ },
1260
+ "151800": {
1261
+ "content": "|<EXTRA_TOKENS_154>|",
1262
+ "lstrip": false,
1263
+ "normalized": false,
1264
+ "rstrip": false,
1265
+ "single_word": false,
1266
+ "special": true
1267
+ },
1268
+ "151801": {
1269
+ "content": "|<EXTRA_TOKENS_155>|",
1270
+ "lstrip": false,
1271
+ "normalized": false,
1272
+ "rstrip": false,
1273
+ "single_word": false,
1274
+ "special": true
1275
+ },
1276
+ "151802": {
1277
+ "content": "|<EXTRA_TOKENS_156>|",
1278
+ "lstrip": false,
1279
+ "normalized": false,
1280
+ "rstrip": false,
1281
+ "single_word": false,
1282
+ "special": true
1283
+ },
1284
+ "151803": {
1285
+ "content": "|<EXTRA_TOKENS_157>|",
1286
+ "lstrip": false,
1287
+ "normalized": false,
1288
+ "rstrip": false,
1289
+ "single_word": false,
1290
+ "special": true
1291
+ },
1292
+ "151804": {
1293
+ "content": "|<EXTRA_TOKENS_158>|",
1294
+ "lstrip": false,
1295
+ "normalized": false,
1296
+ "rstrip": false,
1297
+ "single_word": false,
1298
+ "special": true
1299
+ },
1300
+ "151805": {
1301
+ "content": "|<EXTRA_TOKENS_159>|",
1302
+ "lstrip": false,
1303
+ "normalized": false,
1304
+ "rstrip": false,
1305
+ "single_word": false,
1306
+ "special": true
1307
+ },
1308
+ "151806": {
1309
+ "content": "|<EXTRA_TOKENS_160>|",
1310
+ "lstrip": false,
1311
+ "normalized": false,
1312
+ "rstrip": false,
1313
+ "single_word": false,
1314
+ "special": true
1315
+ },
1316
+ "151807": {
1317
+ "content": "|<EXTRA_TOKENS_161>|",
1318
+ "lstrip": false,
1319
+ "normalized": false,
1320
+ "rstrip": false,
1321
+ "single_word": false,
1322
+ "special": true
1323
+ },
1324
+ "151808": {
1325
+ "content": "|<EXTRA_TOKENS_162>|",
1326
+ "lstrip": false,
1327
+ "normalized": false,
1328
+ "rstrip": false,
1329
+ "single_word": false,
1330
+ "special": true
1331
+ },
1332
+ "151809": {
1333
+ "content": "|<EXTRA_TOKENS_163>|",
1334
+ "lstrip": false,
1335
+ "normalized": false,
1336
+ "rstrip": false,
1337
+ "single_word": false,
1338
+ "special": true
1339
+ },
1340
+ "151810": {
1341
+ "content": "|<EXTRA_TOKENS_164>|",
1342
+ "lstrip": false,
1343
+ "normalized": false,
1344
+ "rstrip": false,
1345
+ "single_word": false,
1346
+ "special": true
1347
+ },
1348
+ "151811": {
1349
+ "content": "|<EXTRA_TOKENS_165>|",
1350
+ "lstrip": false,
1351
+ "normalized": false,
1352
+ "rstrip": false,
1353
+ "single_word": false,
1354
+ "special": true
1355
+ },
1356
+ "151812": {
1357
+ "content": "|<EXTRA_TOKENS_166>|",
1358
+ "lstrip": false,
1359
+ "normalized": false,
1360
+ "rstrip": false,
1361
+ "single_word": false,
1362
+ "special": true
1363
+ },
1364
+ "151813": {
1365
+ "content": "|<EXTRA_TOKENS_167>|",
1366
+ "lstrip": false,
1367
+ "normalized": false,
1368
+ "rstrip": false,
1369
+ "single_word": false,
1370
+ "special": true
1371
+ },
1372
+ "151814": {
1373
+ "content": "|<EXTRA_TOKENS_168>|",
1374
+ "lstrip": false,
1375
+ "normalized": false,
1376
+ "rstrip": false,
1377
+ "single_word": false,
1378
+ "special": true
1379
+ },
1380
+ "151815": {
1381
+ "content": "|<EXTRA_TOKENS_169>|",
1382
+ "lstrip": false,
1383
+ "normalized": false,
1384
+ "rstrip": false,
1385
+ "single_word": false,
1386
+ "special": true
1387
+ },
1388
+ "151816": {
1389
+ "content": "|<EXTRA_TOKENS_170>|",
1390
+ "lstrip": false,
1391
+ "normalized": false,
1392
+ "rstrip": false,
1393
+ "single_word": false,
1394
+ "special": true
1395
+ },
1396
+ "151817": {
1397
+ "content": "|<EXTRA_TOKENS_171>|",
1398
+ "lstrip": false,
1399
+ "normalized": false,
1400
+ "rstrip": false,
1401
+ "single_word": false,
1402
+ "special": true
1403
+ },
1404
+ "151818": {
1405
+ "content": "|<EXTRA_TOKENS_172>|",
1406
+ "lstrip": false,
1407
+ "normalized": false,
1408
+ "rstrip": false,
1409
+ "single_word": false,
1410
+ "special": true
1411
+ },
1412
+ "151819": {
1413
+ "content": "|<EXTRA_TOKENS_173>|",
1414
+ "lstrip": false,
1415
+ "normalized": false,
1416
+ "rstrip": false,
1417
+ "single_word": false,
1418
+ "special": true
1419
+ },
1420
+ "151820": {
1421
+ "content": "|<EXTRA_TOKENS_174>|",
1422
+ "lstrip": false,
1423
+ "normalized": false,
1424
+ "rstrip": false,
1425
+ "single_word": false,
1426
+ "special": true
1427
+ },
1428
+ "151821": {
1429
+ "content": "|<EXTRA_TOKENS_175>|",
1430
+ "lstrip": false,
1431
+ "normalized": false,
1432
+ "rstrip": false,
1433
+ "single_word": false,
1434
+ "special": true
1435
+ },
1436
+ "151822": {
1437
+ "content": "|<EXTRA_TOKENS_176>|",
1438
+ "lstrip": false,
1439
+ "normalized": false,
1440
+ "rstrip": false,
1441
+ "single_word": false,
1442
+ "special": true
1443
+ },
1444
+ "151823": {
1445
+ "content": "|<EXTRA_TOKENS_177>|",
1446
+ "lstrip": false,
1447
+ "normalized": false,
1448
+ "rstrip": false,
1449
+ "single_word": false,
1450
+ "special": true
1451
+ },
1452
+ "151824": {
1453
+ "content": "|<EXTRA_TOKENS_178>|",
1454
+ "lstrip": false,
1455
+ "normalized": false,
1456
+ "rstrip": false,
1457
+ "single_word": false,
1458
+ "special": true
1459
+ },
1460
+ "151825": {
1461
+ "content": "|<EXTRA_TOKENS_179>|",
1462
+ "lstrip": false,
1463
+ "normalized": false,
1464
+ "rstrip": false,
1465
+ "single_word": false,
1466
+ "special": true
1467
+ },
1468
+ "151826": {
1469
+ "content": "|<EXTRA_TOKENS_180>|",
1470
+ "lstrip": false,
1471
+ "normalized": false,
1472
+ "rstrip": false,
1473
+ "single_word": false,
1474
+ "special": true
1475
+ },
1476
+ "151827": {
1477
+ "content": "|<EXTRA_TOKENS_181>|",
1478
+ "lstrip": false,
1479
+ "normalized": false,
1480
+ "rstrip": false,
1481
+ "single_word": false,
1482
+ "special": true
1483
+ },
1484
+ "151828": {
1485
+ "content": "|<EXTRA_TOKENS_182>|",
1486
+ "lstrip": false,
1487
+ "normalized": false,
1488
+ "rstrip": false,
1489
+ "single_word": false,
1490
+ "special": true
1491
+ },
1492
+ "151829": {
1493
+ "content": "|<EXTRA_TOKENS_183>|",
1494
+ "lstrip": false,
1495
+ "normalized": false,
1496
+ "rstrip": false,
1497
+ "single_word": false,
1498
+ "special": true
1499
+ },
1500
+ "151830": {
1501
+ "content": "|<EXTRA_TOKENS_184>|",
1502
+ "lstrip": false,
1503
+ "normalized": false,
1504
+ "rstrip": false,
1505
+ "single_word": false,
1506
+ "special": true
1507
+ },
1508
+ "151831": {
1509
+ "content": "|<EXTRA_TOKENS_185>|",
1510
+ "lstrip": false,
1511
+ "normalized": false,
1512
+ "rstrip": false,
1513
+ "single_word": false,
1514
+ "special": true
1515
+ },
1516
+ "151832": {
1517
+ "content": "|<EXTRA_TOKENS_186>|",
1518
+ "lstrip": false,
1519
+ "normalized": false,
1520
+ "rstrip": false,
1521
+ "single_word": false,
1522
+ "special": true
1523
+ },
1524
+ "151833": {
1525
+ "content": "|<EXTRA_TOKENS_187>|",
1526
+ "lstrip": false,
1527
+ "normalized": false,
1528
+ "rstrip": false,
1529
+ "single_word": false,
1530
+ "special": true
1531
+ },
1532
+ "151834": {
1533
+ "content": "|<EXTRA_TOKENS_188>|",
1534
+ "lstrip": false,
1535
+ "normalized": false,
1536
+ "rstrip": false,
1537
+ "single_word": false,
1538
+ "special": true
1539
+ },
1540
+ "151835": {
1541
+ "content": "|<EXTRA_TOKENS_189>|",
1542
+ "lstrip": false,
1543
+ "normalized": false,
1544
+ "rstrip": false,
1545
+ "single_word": false,
1546
+ "special": true
1547
+ },
1548
+ "151836": {
1549
+ "content": "|<EXTRA_TOKENS_190>|",
1550
+ "lstrip": false,
1551
+ "normalized": false,
1552
+ "rstrip": false,
1553
+ "single_word": false,
1554
+ "special": true
1555
+ },
1556
+ "151837": {
1557
+ "content": "|<EXTRA_TOKENS_191>|",
1558
+ "lstrip": false,
1559
+ "normalized": false,
1560
+ "rstrip": false,
1561
+ "single_word": false,
1562
+ "special": true
1563
+ },
1564
+ "151838": {
1565
+ "content": "|<EXTRA_TOKENS_192>|",
1566
+ "lstrip": false,
1567
+ "normalized": false,
1568
+ "rstrip": false,
1569
+ "single_word": false,
1570
+ "special": true
1571
+ },
1572
+ "151839": {
1573
+ "content": "|<EXTRA_TOKENS_193>|",
1574
+ "lstrip": false,
1575
+ "normalized": false,
1576
+ "rstrip": false,
1577
+ "single_word": false,
1578
+ "special": true
1579
+ },
1580
+ "151840": {
1581
+ "content": "|<EXTRA_TOKENS_194>|",
1582
+ "lstrip": false,
1583
+ "normalized": false,
1584
+ "rstrip": false,
1585
+ "single_word": false,
1586
+ "special": true
1587
+ },
1588
+ "151841": {
1589
+ "content": "|<EXTRA_TOKENS_195>|",
1590
+ "lstrip": false,
1591
+ "normalized": false,
1592
+ "rstrip": false,
1593
+ "single_word": false,
1594
+ "special": true
1595
+ },
1596
+ "151842": {
1597
+ "content": "|<EXTRA_TOKENS_196>|",
1598
+ "lstrip": false,
1599
+ "normalized": false,
1600
+ "rstrip": false,
1601
+ "single_word": false,
1602
+ "special": true
1603
+ },
1604
+ "151843": {
1605
+ "content": "|<EXTRA_TOKENS_197>|",
1606
+ "lstrip": false,
1607
+ "normalized": false,
1608
+ "rstrip": false,
1609
+ "single_word": false,
1610
+ "special": true
1611
+ },
1612
+ "151844": {
1613
+ "content": "|<EXTRA_TOKENS_198>|",
1614
+ "lstrip": false,
1615
+ "normalized": false,
1616
+ "rstrip": false,
1617
+ "single_word": false,
1618
+ "special": true
1619
+ },
1620
+ "151845": {
1621
+ "content": "|<EXTRA_TOKENS_199>|",
1622
+ "lstrip": false,
1623
+ "normalized": false,
1624
+ "rstrip": false,
1625
+ "single_word": false,
1626
+ "special": true
1627
+ },
1628
+ "151846": {
1629
+ "content": "|<EXTRA_TOKENS_200>|",
1630
+ "lstrip": false,
1631
+ "normalized": false,
1632
+ "rstrip": false,
1633
+ "single_word": false,
1634
+ "special": true
1635
+ },
1636
+ "151847": {
1637
+ "content": "|<EXTRA_TOKENS_201>|",
1638
+ "lstrip": false,
1639
+ "normalized": false,
1640
+ "rstrip": false,
1641
+ "single_word": false,
1642
+ "special": true
1643
+ },
1644
+ "151848": {
1645
+ "content": "|<EXTRA_TOKENS_202>|",
1646
+ "lstrip": false,
1647
+ "normalized": false,
1648
+ "rstrip": false,
1649
+ "single_word": false,
1650
+ "special": true
1651
+ },
1652
+ "151849": {
1653
+ "content": "|<EXTRA_TOKENS_203>|",
1654
+ "lstrip": false,
1655
+ "normalized": false,
1656
+ "rstrip": false,
1657
+ "single_word": false,
1658
+ "special": true
1659
+ },
1660
+ "151850": {
1661
+ "content": "|<EXTRA_TOKENS_204>|",
1662
+ "lstrip": false,
1663
+ "normalized": false,
1664
+ "rstrip": false,
1665
+ "single_word": false,
1666
+ "special": true
1667
+ },
1668
+ "151851": {
1669
+ "content": "|<EXTRA_TOKENS_205>|",
1670
+ "lstrip": false,
1671
+ "normalized": false,
1672
+ "rstrip": false,
1673
+ "single_word": false,
1674
+ "special": true
1675
+ },
1676
+ "151852": {
1677
+ "content": "|<EXTRA_TOKENS_206>|",
1678
+ "lstrip": false,
1679
+ "normalized": false,
1680
+ "rstrip": false,
1681
+ "single_word": false,
1682
+ "special": true
1683
+ },
1684
+ "151853": {
1685
+ "content": "|<EXTRA_TOKENS_207>|",
1686
+ "lstrip": false,
1687
+ "normalized": false,
1688
+ "rstrip": false,
1689
+ "single_word": false,
1690
+ "special": true
1691
+ },
1692
+ "151854": {
1693
+ "content": "|<EXTRA_TOKENS_208>|",
1694
+ "lstrip": false,
1695
+ "normalized": false,
1696
+ "rstrip": false,
1697
+ "single_word": false,
1698
+ "special": true
1699
+ },
1700
+ "151855": {
1701
+ "content": "|<EXTRA_TOKENS_209>|",
1702
+ "lstrip": false,
1703
+ "normalized": false,
1704
+ "rstrip": false,
1705
+ "single_word": false,
1706
+ "special": true
1707
+ },
1708
+ "151856": {
1709
+ "content": "|<EXTRA_TOKENS_210>|",
1710
+ "lstrip": false,
1711
+ "normalized": false,
1712
+ "rstrip": false,
1713
+ "single_word": false,
1714
+ "special": true
1715
+ },
1716
+ "151857": {
1717
+ "content": "|<EXTRA_TOKENS_211>|",
1718
+ "lstrip": false,
1719
+ "normalized": false,
1720
+ "rstrip": false,
1721
+ "single_word": false,
1722
+ "special": true
1723
+ },
1724
+ "151858": {
1725
+ "content": "|<EXTRA_TOKENS_212>|",
1726
+ "lstrip": false,
1727
+ "normalized": false,
1728
+ "rstrip": false,
1729
+ "single_word": false,
1730
+ "special": true
1731
+ },
1732
+ "151859": {
1733
+ "content": "|<EXTRA_TOKENS_213>|",
1734
+ "lstrip": false,
1735
+ "normalized": false,
1736
+ "rstrip": false,
1737
+ "single_word": false,
1738
+ "special": true
1739
+ },
1740
+ "151860": {
1741
+ "content": "|<EXTRA_TOKENS_214>|",
1742
+ "lstrip": false,
1743
+ "normalized": false,
1744
+ "rstrip": false,
1745
+ "single_word": false,
1746
+ "special": true
1747
+ },
1748
+ "151861": {
1749
+ "content": "|<EXTRA_TOKENS_215>|",
1750
+ "lstrip": false,
1751
+ "normalized": false,
1752
+ "rstrip": false,
1753
+ "single_word": false,
1754
+ "special": true
1755
+ },
1756
+ "151862": {
1757
+ "content": "|<EXTRA_TOKENS_216>|",
1758
+ "lstrip": false,
1759
+ "normalized": false,
1760
+ "rstrip": false,
1761
+ "single_word": false,
1762
+ "special": true
1763
+ },
1764
+ "151863": {
1765
+ "content": "|<EXTRA_TOKENS_217>|",
1766
+ "lstrip": false,
1767
+ "normalized": false,
1768
+ "rstrip": false,
1769
+ "single_word": false,
1770
+ "special": true
1771
+ },
1772
+ "151864": {
1773
+ "content": "|<EXTRA_TOKENS_218>|",
1774
+ "lstrip": false,
1775
+ "normalized": false,
1776
+ "rstrip": false,
1777
+ "single_word": false,
1778
+ "special": true
1779
+ },
1780
+ "151865": {
1781
+ "content": "|<EXTRA_TOKENS_219>|",
1782
+ "lstrip": false,
1783
+ "normalized": false,
1784
+ "rstrip": false,
1785
+ "single_word": false,
1786
+ "special": true
1787
+ },
1788
+ "151866": {
1789
+ "content": "|<EXTRA_TOKENS_220>|",
1790
+ "lstrip": false,
1791
+ "normalized": false,
1792
+ "rstrip": false,
1793
+ "single_word": false,
1794
+ "special": true
1795
+ },
1796
+ "151867": {
1797
+ "content": "|<EXTRA_TOKENS_221>|",
1798
+ "lstrip": false,
1799
+ "normalized": false,
1800
+ "rstrip": false,
1801
+ "single_word": false,
1802
+ "special": true
1803
+ },
1804
+ "151868": {
1805
+ "content": "|<EXTRA_TOKENS_222>|",
1806
+ "lstrip": false,
1807
+ "normalized": false,
1808
+ "rstrip": false,
1809
+ "single_word": false,
1810
+ "special": true
1811
+ },
1812
+ "151869": {
1813
+ "content": "|<EXTRA_TOKENS_223>|",
1814
+ "lstrip": false,
1815
+ "normalized": false,
1816
+ "rstrip": false,
1817
+ "single_word": false,
1818
+ "special": true
1819
+ },
1820
+ "151870": {
1821
+ "content": "|<EXTRA_TOKENS_224>|",
1822
+ "lstrip": false,
1823
+ "normalized": false,
1824
+ "rstrip": false,
1825
+ "single_word": false,
1826
+ "special": true
1827
+ },
1828
+ "151871": {
1829
+ "content": "|<EXTRA_TOKENS_225>|",
1830
+ "lstrip": false,
1831
+ "normalized": false,
1832
+ "rstrip": false,
1833
+ "single_word": false,
1834
+ "special": true
1835
+ },
1836
+ "151872": {
1837
+ "content": "|<EXTRA_TOKENS_226>|",
1838
+ "lstrip": false,
1839
+ "normalized": false,
1840
+ "rstrip": false,
1841
+ "single_word": false,
1842
+ "special": true
1843
+ },
1844
+ "151873": {
1845
+ "content": "|<EXTRA_TOKENS_227>|",
1846
+ "lstrip": false,
1847
+ "normalized": false,
1848
+ "rstrip": false,
1849
+ "single_word": false,
1850
+ "special": true
1851
+ },
1852
+ "151874": {
1853
+ "content": "|<EXTRA_TOKENS_228>|",
1854
+ "lstrip": false,
1855
+ "normalized": false,
1856
+ "rstrip": false,
1857
+ "single_word": false,
1858
+ "special": true
1859
+ },
1860
+ "151875": {
1861
+ "content": "|<EXTRA_TOKENS_229>|",
1862
+ "lstrip": false,
1863
+ "normalized": false,
1864
+ "rstrip": false,
1865
+ "single_word": false,
1866
+ "special": true
1867
+ },
1868
+ "151876": {
1869
+ "content": "|<EXTRA_TOKENS_230>|",
1870
+ "lstrip": false,
1871
+ "normalized": false,
1872
+ "rstrip": false,
1873
+ "single_word": false,
1874
+ "special": true
1875
+ },
1876
+ "151877": {
1877
+ "content": "|<EXTRA_TOKENS_231>|",
1878
+ "lstrip": false,
1879
+ "normalized": false,
1880
+ "rstrip": false,
1881
+ "single_word": false,
1882
+ "special": true
1883
+ },
1884
+ "151878": {
1885
+ "content": "|<EXTRA_TOKENS_232>|",
1886
+ "lstrip": false,
1887
+ "normalized": false,
1888
+ "rstrip": false,
1889
+ "single_word": false,
1890
+ "special": true
1891
+ },
1892
+ "151879": {
1893
+ "content": "|<EXTRA_TOKENS_233>|",
1894
+ "lstrip": false,
1895
+ "normalized": false,
1896
+ "rstrip": false,
1897
+ "single_word": false,
1898
+ "special": true
1899
+ },
1900
+ "151880": {
1901
+ "content": "|<EXTRA_TOKENS_234>|",
1902
+ "lstrip": false,
1903
+ "normalized": false,
1904
+ "rstrip": false,
1905
+ "single_word": false,
1906
+ "special": true
1907
+ },
1908
+ "151881": {
1909
+ "content": "|<EXTRA_TOKENS_235>|",
1910
+ "lstrip": false,
1911
+ "normalized": false,
1912
+ "rstrip": false,
1913
+ "single_word": false,
1914
+ "special": true
1915
+ },
1916
+ "151882": {
1917
+ "content": "|<EXTRA_TOKENS_236>|",
1918
+ "lstrip": false,
1919
+ "normalized": false,
1920
+ "rstrip": false,
1921
+ "single_word": false,
1922
+ "special": true
1923
+ },
1924
+ "151883": {
1925
+ "content": "|<EXTRA_TOKENS_237>|",
1926
+ "lstrip": false,
1927
+ "normalized": false,
1928
+ "rstrip": false,
1929
+ "single_word": false,
1930
+ "special": true
1931
+ },
1932
+ "151884": {
1933
+ "content": "|<EXTRA_TOKENS_238>|",
1934
+ "lstrip": false,
1935
+ "normalized": false,
1936
+ "rstrip": false,
1937
+ "single_word": false,
1938
+ "special": true
1939
+ },
1940
+ "151885": {
1941
+ "content": "|<EXTRA_TOKENS_239>|",
1942
+ "lstrip": false,
1943
+ "normalized": false,
1944
+ "rstrip": false,
1945
+ "single_word": false,
1946
+ "special": true
1947
+ },
1948
+ "151886": {
1949
+ "content": "|<EXTRA_TOKENS_240>|",
1950
+ "lstrip": false,
1951
+ "normalized": false,
1952
+ "rstrip": false,
1953
+ "single_word": false,
1954
+ "special": true
1955
+ },
1956
+ "151887": {
1957
+ "content": "|<EXTRA_TOKENS_241>|",
1958
+ "lstrip": false,
1959
+ "normalized": false,
1960
+ "rstrip": false,
1961
+ "single_word": false,
1962
+ "special": true
1963
+ },
1964
+ "151888": {
1965
+ "content": "|<EXTRA_TOKENS_242>|",
1966
+ "lstrip": false,
1967
+ "normalized": false,
1968
+ "rstrip": false,
1969
+ "single_word": false,
1970
+ "special": true
1971
+ },
1972
+ "151889": {
1973
+ "content": "|<EXTRA_TOKENS_243>|",
1974
+ "lstrip": false,
1975
+ "normalized": false,
1976
+ "rstrip": false,
1977
+ "single_word": false,
1978
+ "special": true
1979
+ },
1980
+ "151890": {
1981
+ "content": "|<EXTRA_TOKENS_244>|",
1982
+ "lstrip": false,
1983
+ "normalized": false,
1984
+ "rstrip": false,
1985
+ "single_word": false,
1986
+ "special": true
1987
+ },
1988
+ "151891": {
1989
+ "content": "|<EXTRA_TOKENS_245>|",
1990
+ "lstrip": false,
1991
+ "normalized": false,
1992
+ "rstrip": false,
1993
+ "single_word": false,
1994
+ "special": true
1995
+ },
1996
+ "151892": {
1997
+ "content": "|<EXTRA_TOKENS_246>|",
1998
+ "lstrip": false,
1999
+ "normalized": false,
2000
+ "rstrip": false,
2001
+ "single_word": false,
2002
+ "special": true
2003
+ },
2004
+ "151893": {
2005
+ "content": "|<EXTRA_TOKENS_247>|",
2006
+ "lstrip": false,
2007
+ "normalized": false,
2008
+ "rstrip": false,
2009
+ "single_word": false,
2010
+ "special": true
2011
+ },
2012
+ "151894": {
2013
+ "content": "|<EXTRA_TOKENS_248>|",
2014
+ "lstrip": false,
2015
+ "normalized": false,
2016
+ "rstrip": false,
2017
+ "single_word": false,
2018
+ "special": true
2019
+ },
2020
+ "151895": {
2021
+ "content": "|<EXTRA_TOKENS_249>|",
2022
+ "lstrip": false,
2023
+ "normalized": false,
2024
+ "rstrip": false,
2025
+ "single_word": false,
2026
+ "special": true
2027
+ },
2028
+ "151896": {
2029
+ "content": "|<EXTRA_TOKENS_250>|",
2030
+ "lstrip": false,
2031
+ "normalized": false,
2032
+ "rstrip": false,
2033
+ "single_word": false,
2034
+ "special": true
2035
+ },
2036
+ "151897": {
2037
+ "content": "|<EXTRA_TOKENS_251>|",
2038
+ "lstrip": false,
2039
+ "normalized": false,
2040
+ "rstrip": false,
2041
+ "single_word": false,
2042
+ "special": true
2043
+ },
2044
+ "151898": {
2045
+ "content": "|<EXTRA_TOKENS_252>|",
2046
+ "lstrip": false,
2047
+ "normalized": false,
2048
+ "rstrip": false,
2049
+ "single_word": false,
2050
+ "special": true
2051
+ },
2052
+ "151899": {
2053
+ "content": "|<EXTRA_TOKENS_253>|",
2054
+ "lstrip": false,
2055
+ "normalized": false,
2056
+ "rstrip": false,
2057
+ "single_word": false,
2058
+ "special": true
2059
+ },
2060
+ "151900": {
2061
+ "content": "|<EXTRA_TOKENS_254>|",
2062
+ "lstrip": false,
2063
+ "normalized": false,
2064
+ "rstrip": false,
2065
+ "single_word": false,
2066
+ "special": true
2067
+ },
2068
+ "151901": {
2069
+ "content": "|<EXTRA_TOKENS_255>|",
2070
+ "lstrip": false,
2071
+ "normalized": false,
2072
+ "rstrip": false,
2073
+ "single_word": false,
2074
+ "special": true
2075
+ },
2076
+ "151902": {
2077
+ "content": "|<EXTRA_TOKENS_256>|",
2078
+ "lstrip": false,
2079
+ "normalized": false,
2080
+ "rstrip": false,
2081
+ "single_word": false,
2082
+ "special": true
2083
+ },
2084
+ "151903": {
2085
+ "content": "|<EXTRA_TOKENS_257>|",
2086
+ "lstrip": false,
2087
+ "normalized": false,
2088
+ "rstrip": false,
2089
+ "single_word": false,
2090
+ "special": true
2091
+ },
2092
+ "151904": {
2093
+ "content": "|<EXTRA_TOKENS_258>|",
2094
+ "lstrip": false,
2095
+ "normalized": false,
2096
+ "rstrip": false,
2097
+ "single_word": false,
2098
+ "special": true
2099
+ },
2100
+ "151905": {
2101
+ "content": "|<EXTRA_TOKENS_259>|",
2102
+ "lstrip": false,
2103
+ "normalized": false,
2104
+ "rstrip": false,
2105
+ "single_word": false,
2106
+ "special": true
2107
+ },
2108
+ "151906": {
2109
+ "content": "|<EXTRA_TOKENS_260>|",
2110
+ "lstrip": false,
2111
+ "normalized": false,
2112
+ "rstrip": false,
2113
+ "single_word": false,
2114
+ "special": true
2115
+ },
2116
+ "151907": {
2117
+ "content": "|<EXTRA_TOKENS_261>|",
2118
+ "lstrip": false,
2119
+ "normalized": false,
2120
+ "rstrip": false,
2121
+ "single_word": false,
2122
+ "special": true
2123
+ },
2124
+ "151908": {
2125
+ "content": "|<EXTRA_TOKENS_262>|",
2126
+ "lstrip": false,
2127
+ "normalized": false,
2128
+ "rstrip": false,
2129
+ "single_word": false,
2130
+ "special": true
2131
+ },
2132
+ "151909": {
2133
+ "content": "|<EXTRA_TOKENS_263>|",
2134
+ "lstrip": false,
2135
+ "normalized": false,
2136
+ "rstrip": false,
2137
+ "single_word": false,
2138
+ "special": true
2139
+ },
2140
+ "151910": {
2141
+ "content": "|<EXTRA_TOKENS_264>|",
2142
+ "lstrip": false,
2143
+ "normalized": false,
2144
+ "rstrip": false,
2145
+ "single_word": false,
2146
+ "special": true
2147
+ },
2148
+ "151911": {
2149
+ "content": "|<EXTRA_TOKENS_265>|",
2150
+ "lstrip": false,
2151
+ "normalized": false,
2152
+ "rstrip": false,
2153
+ "single_word": false,
2154
+ "special": true
2155
+ },
2156
+ "151912": {
2157
+ "content": "|<EXTRA_TOKENS_266>|",
2158
+ "lstrip": false,
2159
+ "normalized": false,
2160
+ "rstrip": false,
2161
+ "single_word": false,
2162
+ "special": true
2163
+ },
2164
+ "151913": {
2165
+ "content": "|<EXTRA_TOKENS_267>|",
2166
+ "lstrip": false,
2167
+ "normalized": false,
2168
+ "rstrip": false,
2169
+ "single_word": false,
2170
+ "special": true
2171
+ },
2172
+ "151914": {
2173
+ "content": "|<EXTRA_TOKENS_268>|",
2174
+ "lstrip": false,
2175
+ "normalized": false,
2176
+ "rstrip": false,
2177
+ "single_word": false,
2178
+ "special": true
2179
+ },
2180
+ "151915": {
2181
+ "content": "|<EXTRA_TOKENS_269>|",
2182
+ "lstrip": false,
2183
+ "normalized": false,
2184
+ "rstrip": false,
2185
+ "single_word": false,
2186
+ "special": true
2187
+ },
2188
+ "151916": {
2189
+ "content": "|<EXTRA_TOKENS_270>|",
2190
+ "lstrip": false,
2191
+ "normalized": false,
2192
+ "rstrip": false,
2193
+ "single_word": false,
2194
+ "special": true
2195
+ },
2196
+ "151917": {
2197
+ "content": "|<EXTRA_TOKENS_271>|",
2198
+ "lstrip": false,
2199
+ "normalized": false,
2200
+ "rstrip": false,
2201
+ "single_word": false,
2202
+ "special": true
2203
+ },
2204
+ "151918": {
2205
+ "content": "|<EXTRA_TOKENS_272>|",
2206
+ "lstrip": false,
2207
+ "normalized": false,
2208
+ "rstrip": false,
2209
+ "single_word": false,
2210
+ "special": true
2211
+ },
2212
+ "151919": {
2213
+ "content": "|<EXTRA_TOKENS_273>|",
2214
+ "lstrip": false,
2215
+ "normalized": false,
2216
+ "rstrip": false,
2217
+ "single_word": false,
2218
+ "special": true
2219
+ },
2220
+ "151920": {
2221
+ "content": "|<EXTRA_TOKENS_274>|",
2222
+ "lstrip": false,
2223
+ "normalized": false,
2224
+ "rstrip": false,
2225
+ "single_word": false,
2226
+ "special": true
2227
+ },
2228
+ "151921": {
2229
+ "content": "|<EXTRA_TOKENS_275>|",
2230
+ "lstrip": false,
2231
+ "normalized": false,
2232
+ "rstrip": false,
2233
+ "single_word": false,
2234
+ "special": true
2235
+ },
2236
+ "151922": {
2237
+ "content": "|<EXTRA_TOKENS_276>|",
2238
+ "lstrip": false,
2239
+ "normalized": false,
2240
+ "rstrip": false,
2241
+ "single_word": false,
2242
+ "special": true
2243
+ },
2244
+ "151923": {
2245
+ "content": "|<EXTRA_TOKENS_277>|",
2246
+ "lstrip": false,
2247
+ "normalized": false,
2248
+ "rstrip": false,
2249
+ "single_word": false,
2250
+ "special": true
2251
+ },
2252
+ "151924": {
2253
+ "content": "|<EXTRA_TOKENS_278>|",
2254
+ "lstrip": false,
2255
+ "normalized": false,
2256
+ "rstrip": false,
2257
+ "single_word": false,
2258
+ "special": true
2259
+ },
2260
+ "151925": {
2261
+ "content": "|<EXTRA_TOKENS_279>|",
2262
+ "lstrip": false,
2263
+ "normalized": false,
2264
+ "rstrip": false,
2265
+ "single_word": false,
2266
+ "special": true
2267
+ },
2268
+ "151926": {
2269
+ "content": "|<EXTRA_TOKENS_280>|",
2270
+ "lstrip": false,
2271
+ "normalized": false,
2272
+ "rstrip": false,
2273
+ "single_word": false,
2274
+ "special": true
2275
+ },
2276
+ "151927": {
2277
+ "content": "|<EXTRA_TOKENS_281>|",
2278
+ "lstrip": false,
2279
+ "normalized": false,
2280
+ "rstrip": false,
2281
+ "single_word": false,
2282
+ "special": true
2283
+ },
2284
+ "151928": {
2285
+ "content": "|<EXTRA_TOKENS_282>|",
2286
+ "lstrip": false,
2287
+ "normalized": false,
2288
+ "rstrip": false,
2289
+ "single_word": false,
2290
+ "special": true
2291
+ },
2292
+ "151929": {
2293
+ "content": "|<EXTRA_TOKENS_283>|",
2294
+ "lstrip": false,
2295
+ "normalized": false,
2296
+ "rstrip": false,
2297
+ "single_word": false,
2298
+ "special": true
2299
+ },
2300
+ "151930": {
2301
+ "content": "|<EXTRA_TOKENS_284>|",
2302
+ "lstrip": false,
2303
+ "normalized": false,
2304
+ "rstrip": false,
2305
+ "single_word": false,
2306
+ "special": true
2307
+ },
2308
+ "151931": {
2309
+ "content": "|<EXTRA_TOKENS_285>|",
2310
+ "lstrip": false,
2311
+ "normalized": false,
2312
+ "rstrip": false,
2313
+ "single_word": false,
2314
+ "special": true
2315
+ },
2316
+ "151932": {
2317
+ "content": "|<EXTRA_TOKENS_286>|",
2318
+ "lstrip": false,
2319
+ "normalized": false,
2320
+ "rstrip": false,
2321
+ "single_word": false,
2322
+ "special": true
2323
+ },
2324
+ "151933": {
2325
+ "content": "|<EXTRA_TOKENS_287>|",
2326
+ "lstrip": false,
2327
+ "normalized": false,
2328
+ "rstrip": false,
2329
+ "single_word": false,
2330
+ "special": true
2331
+ },
2332
+ "151934": {
2333
+ "content": "|<EXTRA_TOKENS_288>|",
2334
+ "lstrip": false,
2335
+ "normalized": false,
2336
+ "rstrip": false,
2337
+ "single_word": false,
2338
+ "special": true
2339
+ },
2340
+ "151935": {
2341
+ "content": "|<EXTRA_TOKENS_289>|",
2342
+ "lstrip": false,
2343
+ "normalized": false,
2344
+ "rstrip": false,
2345
+ "single_word": false,
2346
+ "special": true
2347
+ },
2348
+ "151936": {
2349
+ "content": "|<EXTRA_TOKENS_290>|",
2350
+ "lstrip": false,
2351
+ "normalized": false,
2352
+ "rstrip": false,
2353
+ "single_word": false,
2354
+ "special": true
2355
+ },
2356
+ "151937": {
2357
+ "content": "|<EXTRA_TOKENS_291>|",
2358
+ "lstrip": false,
2359
+ "normalized": false,
2360
+ "rstrip": false,
2361
+ "single_word": false,
2362
+ "special": true
2363
+ },
2364
+ "151938": {
2365
+ "content": "|<EXTRA_TOKENS_292>|",
2366
+ "lstrip": false,
2367
+ "normalized": false,
2368
+ "rstrip": false,
2369
+ "single_word": false,
2370
+ "special": true
2371
+ },
2372
+ "151939": {
2373
+ "content": "|<EXTRA_TOKENS_293>|",
2374
+ "lstrip": false,
2375
+ "normalized": false,
2376
+ "rstrip": false,
2377
+ "single_word": false,
2378
+ "special": true
2379
+ },
2380
+ "151940": {
2381
+ "content": "|<EXTRA_TOKENS_294>|",
2382
+ "lstrip": false,
2383
+ "normalized": false,
2384
+ "rstrip": false,
2385
+ "single_word": false,
2386
+ "special": true
2387
+ },
2388
+ "151941": {
2389
+ "content": "|<EXTRA_TOKENS_295>|",
2390
+ "lstrip": false,
2391
+ "normalized": false,
2392
+ "rstrip": false,
2393
+ "single_word": false,
2394
+ "special": true
2395
+ },
2396
+ "151942": {
2397
+ "content": "|<EXTRA_TOKENS_296>|",
2398
+ "lstrip": false,
2399
+ "normalized": false,
2400
+ "rstrip": false,
2401
+ "single_word": false,
2402
+ "special": true
2403
+ },
2404
+ "151943": {
2405
+ "content": "|<EXTRA_TOKENS_297>|",
2406
+ "lstrip": false,
2407
+ "normalized": false,
2408
+ "rstrip": false,
2409
+ "single_word": false,
2410
+ "special": true
2411
+ },
2412
+ "151944": {
2413
+ "content": "|<EXTRA_TOKENS_298>|",
2414
+ "lstrip": false,
2415
+ "normalized": false,
2416
+ "rstrip": false,
2417
+ "single_word": false,
2418
+ "special": true
2419
+ },
2420
+ "151945": {
2421
+ "content": "|<EXTRA_TOKENS_299>|",
2422
+ "lstrip": false,
2423
+ "normalized": false,
2424
+ "rstrip": false,
2425
+ "single_word": false,
2426
+ "special": true
2427
+ },
2428
+ "151946": {
2429
+ "content": "|<EXTRA_TOKENS_300>|",
2430
+ "lstrip": false,
2431
+ "normalized": false,
2432
+ "rstrip": false,
2433
+ "single_word": false,
2434
+ "special": true
2435
+ },
2436
+ "151947": {
2437
+ "content": "|<EXTRA_TOKENS_301>|",
2438
+ "lstrip": false,
2439
+ "normalized": false,
2440
+ "rstrip": false,
2441
+ "single_word": false,
2442
+ "special": true
2443
+ },
2444
+ "151948": {
2445
+ "content": "|<EXTRA_TOKENS_302>|",
2446
+ "lstrip": false,
2447
+ "normalized": false,
2448
+ "rstrip": false,
2449
+ "single_word": false,
2450
+ "special": true
2451
+ },
2452
+ "151949": {
2453
+ "content": "|<EXTRA_TOKENS_303>|",
2454
+ "lstrip": false,
2455
+ "normalized": false,
2456
+ "rstrip": false,
2457
+ "single_word": false,
2458
+ "special": true
2459
+ },
2460
+ "151950": {
2461
+ "content": "|<EXTRA_TOKENS_304>|",
2462
+ "lstrip": false,
2463
+ "normalized": false,
2464
+ "rstrip": false,
2465
+ "single_word": false,
2466
+ "special": true
2467
+ },
2468
+ "151951": {
2469
+ "content": "|<EXTRA_TOKENS_305>|",
2470
+ "lstrip": false,
2471
+ "normalized": false,
2472
+ "rstrip": false,
2473
+ "single_word": false,
2474
+ "special": true
2475
+ },
2476
+ "151952": {
2477
+ "content": "|<EXTRA_TOKENS_306>|",
2478
+ "lstrip": false,
2479
+ "normalized": false,
2480
+ "rstrip": false,
2481
+ "single_word": false,
2482
+ "special": true
2483
+ },
2484
+ "151953": {
2485
+ "content": "|<EXTRA_TOKENS_307>|",
2486
+ "lstrip": false,
2487
+ "normalized": false,
2488
+ "rstrip": false,
2489
+ "single_word": false,
2490
+ "special": true
2491
+ },
2492
+ "151954": {
2493
+ "content": "|<EXTRA_TOKENS_308>|",
2494
+ "lstrip": false,
2495
+ "normalized": false,
2496
+ "rstrip": false,
2497
+ "single_word": false,
2498
+ "special": true
2499
+ },
2500
+ "151955": {
2501
+ "content": "|<EXTRA_TOKENS_309>|",
2502
+ "lstrip": false,
2503
+ "normalized": false,
2504
+ "rstrip": false,
2505
+ "single_word": false,
2506
+ "special": true
2507
+ },
2508
+ "151956": {
2509
+ "content": "|<EXTRA_TOKENS_310>|",
2510
+ "lstrip": false,
2511
+ "normalized": false,
2512
+ "rstrip": false,
2513
+ "single_word": false,
2514
+ "special": true
2515
+ },
2516
+ "151957": {
2517
+ "content": "|<EXTRA_TOKENS_311>|",
2518
+ "lstrip": false,
2519
+ "normalized": false,
2520
+ "rstrip": false,
2521
+ "single_word": false,
2522
+ "special": true
2523
+ },
2524
+ "151958": {
2525
+ "content": "|<EXTRA_TOKENS_312>|",
2526
+ "lstrip": false,
2527
+ "normalized": false,
2528
+ "rstrip": false,
2529
+ "single_word": false,
2530
+ "special": true
2531
+ },
2532
+ "151959": {
2533
+ "content": "|<EXTRA_TOKENS_313>|",
2534
+ "lstrip": false,
2535
+ "normalized": false,
2536
+ "rstrip": false,
2537
+ "single_word": false,
2538
+ "special": true
2539
+ },
2540
+ "151960": {
2541
+ "content": "|<EXTRA_TOKENS_314>|",
2542
+ "lstrip": false,
2543
+ "normalized": false,
2544
+ "rstrip": false,
2545
+ "single_word": false,
2546
+ "special": true
2547
+ },
2548
+ "151961": {
2549
+ "content": "|<EXTRA_TOKENS_315>|",
2550
+ "lstrip": false,
2551
+ "normalized": false,
2552
+ "rstrip": false,
2553
+ "single_word": false,
2554
+ "special": true
2555
+ },
2556
+ "151962": {
2557
+ "content": "|<EXTRA_TOKENS_316>|",
2558
+ "lstrip": false,
2559
+ "normalized": false,
2560
+ "rstrip": false,
2561
+ "single_word": false,
2562
+ "special": true
2563
+ },
2564
+ "151963": {
2565
+ "content": "|<EXTRA_TOKENS_317>|",
2566
+ "lstrip": false,
2567
+ "normalized": false,
2568
+ "rstrip": false,
2569
+ "single_word": false,
2570
+ "special": true
2571
+ },
2572
+ "151964": {
2573
+ "content": "|<EXTRA_TOKENS_318>|",
2574
+ "lstrip": false,
2575
+ "normalized": false,
2576
+ "rstrip": false,
2577
+ "single_word": false,
2578
+ "special": true
2579
+ },
2580
+ "151965": {
2581
+ "content": "|<EXTRA_TOKENS_319>|",
2582
+ "lstrip": false,
2583
+ "normalized": false,
2584
+ "rstrip": false,
2585
+ "single_word": false,
2586
+ "special": true
2587
+ },
2588
+ "151966": {
2589
+ "content": "|<EXTRA_TOKENS_320>|",
2590
+ "lstrip": false,
2591
+ "normalized": false,
2592
+ "rstrip": false,
2593
+ "single_word": false,
2594
+ "special": true
2595
+ },
2596
+ "151967": {
2597
+ "content": "|<EXTRA_TOKENS_321>|",
2598
+ "lstrip": false,
2599
+ "normalized": false,
2600
+ "rstrip": false,
2601
+ "single_word": false,
2602
+ "special": true
2603
+ },
2604
+ "151968": {
2605
+ "content": "|<EXTRA_TOKENS_322>|",
2606
+ "lstrip": false,
2607
+ "normalized": false,
2608
+ "rstrip": false,
2609
+ "single_word": false,
2610
+ "special": true
2611
+ },
2612
+ "151969": {
2613
+ "content": "|<EXTRA_TOKENS_323>|",
2614
+ "lstrip": false,
2615
+ "normalized": false,
2616
+ "rstrip": false,
2617
+ "single_word": false,
2618
+ "special": true
2619
+ },
2620
+ "151970": {
2621
+ "content": "|<EXTRA_TOKENS_324>|",
2622
+ "lstrip": false,
2623
+ "normalized": false,
2624
+ "rstrip": false,
2625
+ "single_word": false,
2626
+ "special": true
2627
+ },
2628
+ "151971": {
2629
+ "content": "|<EXTRA_TOKENS_325>|",
2630
+ "lstrip": false,
2631
+ "normalized": false,
2632
+ "rstrip": false,
2633
+ "single_word": false,
2634
+ "special": true
2635
+ },
2636
+ "151972": {
2637
+ "content": "|<EXTRA_TOKENS_326>|",
2638
+ "lstrip": false,
2639
+ "normalized": false,
2640
+ "rstrip": false,
2641
+ "single_word": false,
2642
+ "special": true
2643
+ },
2644
+ "151973": {
2645
+ "content": "|<EXTRA_TOKENS_327>|",
2646
+ "lstrip": false,
2647
+ "normalized": false,
2648
+ "rstrip": false,
2649
+ "single_word": false,
2650
+ "special": true
2651
+ },
2652
+ "151974": {
2653
+ "content": "|<EXTRA_TOKENS_328>|",
2654
+ "lstrip": false,
2655
+ "normalized": false,
2656
+ "rstrip": false,
2657
+ "single_word": false,
2658
+ "special": true
2659
+ },
2660
+ "151975": {
2661
+ "content": "|<EXTRA_TOKENS_329>|",
2662
+ "lstrip": false,
2663
+ "normalized": false,
2664
+ "rstrip": false,
2665
+ "single_word": false,
2666
+ "special": true
2667
+ },
2668
+ "151976": {
2669
+ "content": "|<EXTRA_TOKENS_330>|",
2670
+ "lstrip": false,
2671
+ "normalized": false,
2672
+ "rstrip": false,
2673
+ "single_word": false,
2674
+ "special": true
2675
+ },
2676
+ "151977": {
2677
+ "content": "|<EXTRA_TOKENS_331>|",
2678
+ "lstrip": false,
2679
+ "normalized": false,
2680
+ "rstrip": false,
2681
+ "single_word": false,
2682
+ "special": true
2683
+ },
2684
+ "151978": {
2685
+ "content": "|<EXTRA_TOKENS_332>|",
2686
+ "lstrip": false,
2687
+ "normalized": false,
2688
+ "rstrip": false,
2689
+ "single_word": false,
2690
+ "special": true
2691
+ },
2692
+ "151979": {
2693
+ "content": "|<EXTRA_TOKENS_333>|",
2694
+ "lstrip": false,
2695
+ "normalized": false,
2696
+ "rstrip": false,
2697
+ "single_word": false,
2698
+ "special": true
2699
+ },
2700
+ "151980": {
2701
+ "content": "|<EXTRA_TOKENS_334>|",
2702
+ "lstrip": false,
2703
+ "normalized": false,
2704
+ "rstrip": false,
2705
+ "single_word": false,
2706
+ "special": true
2707
+ },
2708
+ "151981": {
2709
+ "content": "|<EXTRA_TOKENS_335>|",
2710
+ "lstrip": false,
2711
+ "normalized": false,
2712
+ "rstrip": false,
2713
+ "single_word": false,
2714
+ "special": true
2715
+ },
2716
+ "151982": {
2717
+ "content": "|<EXTRA_TOKENS_336>|",
2718
+ "lstrip": false,
2719
+ "normalized": false,
2720
+ "rstrip": false,
2721
+ "single_word": false,
2722
+ "special": true
2723
+ },
2724
+ "151983": {
2725
+ "content": "|<EXTRA_TOKENS_337>|",
2726
+ "lstrip": false,
2727
+ "normalized": false,
2728
+ "rstrip": false,
2729
+ "single_word": false,
2730
+ "special": true
2731
+ },
2732
+ "151984": {
2733
+ "content": "|<EXTRA_TOKENS_338>|",
2734
+ "lstrip": false,
2735
+ "normalized": false,
2736
+ "rstrip": false,
2737
+ "single_word": false,
2738
+ "special": true
2739
+ },
2740
+ "151985": {
2741
+ "content": "|<EXTRA_TOKENS_339>|",
2742
+ "lstrip": false,
2743
+ "normalized": false,
2744
+ "rstrip": false,
2745
+ "single_word": false,
2746
+ "special": true
2747
+ },
2748
+ "151986": {
2749
+ "content": "|<EXTRA_TOKENS_340>|",
2750
+ "lstrip": false,
2751
+ "normalized": false,
2752
+ "rstrip": false,
2753
+ "single_word": false,
2754
+ "special": true
2755
+ },
2756
+ "151987": {
2757
+ "content": "|<EXTRA_TOKENS_341>|",
2758
+ "lstrip": false,
2759
+ "normalized": false,
2760
+ "rstrip": false,
2761
+ "single_word": false,
2762
+ "special": true
2763
+ },
2764
+ "151988": {
2765
+ "content": "|<EXTRA_TOKENS_342>|",
2766
+ "lstrip": false,
2767
+ "normalized": false,
2768
+ "rstrip": false,
2769
+ "single_word": false,
2770
+ "special": true
2771
+ },
2772
+ "151989": {
2773
+ "content": "|<EXTRA_TOKENS_343>|",
2774
+ "lstrip": false,
2775
+ "normalized": false,
2776
+ "rstrip": false,
2777
+ "single_word": false,
2778
+ "special": true
2779
+ },
2780
+ "151990": {
2781
+ "content": "|<EXTRA_TOKENS_344>|",
2782
+ "lstrip": false,
2783
+ "normalized": false,
2784
+ "rstrip": false,
2785
+ "single_word": false,
2786
+ "special": true
2787
+ },
2788
+ "151991": {
2789
+ "content": "|<EXTRA_TOKENS_345>|",
2790
+ "lstrip": false,
2791
+ "normalized": false,
2792
+ "rstrip": false,
2793
+ "single_word": false,
2794
+ "special": true
2795
+ },
2796
+ "151992": {
2797
+ "content": "|<EXTRA_TOKENS_346>|",
2798
+ "lstrip": false,
2799
+ "normalized": false,
2800
+ "rstrip": false,
2801
+ "single_word": false,
2802
+ "special": true
2803
+ },
2804
+ "151993": {
2805
+ "content": "|<EXTRA_TOKENS_347>|",
2806
+ "lstrip": false,
2807
+ "normalized": false,
2808
+ "rstrip": false,
2809
+ "single_word": false,
2810
+ "special": true
2811
+ },
2812
+ "151994": {
2813
+ "content": "|<EXTRA_TOKENS_348>|",
2814
+ "lstrip": false,
2815
+ "normalized": false,
2816
+ "rstrip": false,
2817
+ "single_word": false,
2818
+ "special": true
2819
+ },
2820
+ "151995": {
2821
+ "content": "|<EXTRA_TOKENS_349>|",
2822
+ "lstrip": false,
2823
+ "normalized": false,
2824
+ "rstrip": false,
2825
+ "single_word": false,
2826
+ "special": true
2827
+ },
2828
+ "151996": {
2829
+ "content": "|<EXTRA_TOKENS_350>|",
2830
+ "lstrip": false,
2831
+ "normalized": false,
2832
+ "rstrip": false,
2833
+ "single_word": false,
2834
+ "special": true
2835
+ },
2836
+ "151997": {
2837
+ "content": "|<EXTRA_TOKENS_351>|",
2838
+ "lstrip": false,
2839
+ "normalized": false,
2840
+ "rstrip": false,
2841
+ "single_word": false,
2842
+ "special": true
2843
+ },
2844
+ "151998": {
2845
+ "content": "|<EXTRA_TOKENS_352>|",
2846
+ "lstrip": false,
2847
+ "normalized": false,
2848
+ "rstrip": false,
2849
+ "single_word": false,
2850
+ "special": true
2851
+ },
2852
+ "151999": {
2853
+ "content": "|<EXTRA_TOKENS_353>|",
2854
+ "lstrip": false,
2855
+ "normalized": false,
2856
+ "rstrip": false,
2857
+ "single_word": false,
2858
+ "special": true
2859
+ },
2860
+ "152000": {
2861
+ "content": "|<EXTRA_TOKENS_354>|",
2862
+ "lstrip": false,
2863
+ "normalized": false,
2864
+ "rstrip": false,
2865
+ "single_word": false,
2866
+ "special": true
2867
+ },
2868
+ "152001": {
2869
+ "content": "|<EXTRA_TOKENS_355>|",
2870
+ "lstrip": false,
2871
+ "normalized": false,
2872
+ "rstrip": false,
2873
+ "single_word": false,
2874
+ "special": true
2875
+ },
2876
+ "152002": {
2877
+ "content": "|<EXTRA_TOKENS_356>|",
2878
+ "lstrip": false,
2879
+ "normalized": false,
2880
+ "rstrip": false,
2881
+ "single_word": false,
2882
+ "special": true
2883
+ },
2884
+ "152003": {
2885
+ "content": "|<EXTRA_TOKENS_357>|",
2886
+ "lstrip": false,
2887
+ "normalized": false,
2888
+ "rstrip": false,
2889
+ "single_word": false,
2890
+ "special": true
2891
+ },
2892
+ "152004": {
2893
+ "content": "|<EXTRA_TOKENS_358>|",
2894
+ "lstrip": false,
2895
+ "normalized": false,
2896
+ "rstrip": false,
2897
+ "single_word": false,
2898
+ "special": true
2899
+ },
2900
+ "152005": {
2901
+ "content": "|<EXTRA_TOKENS_359>|",
2902
+ "lstrip": false,
2903
+ "normalized": false,
2904
+ "rstrip": false,
2905
+ "single_word": false,
2906
+ "special": true
2907
+ },
2908
+ "152006": {
2909
+ "content": "|<EXTRA_TOKENS_360>|",
2910
+ "lstrip": false,
2911
+ "normalized": false,
2912
+ "rstrip": false,
2913
+ "single_word": false,
2914
+ "special": true
2915
+ },
2916
+ "152007": {
2917
+ "content": "|<EXTRA_TOKENS_361>|",
2918
+ "lstrip": false,
2919
+ "normalized": false,
2920
+ "rstrip": false,
2921
+ "single_word": false,
2922
+ "special": true
2923
+ },
2924
+ "152008": {
2925
+ "content": "|<EXTRA_TOKENS_362>|",
2926
+ "lstrip": false,
2927
+ "normalized": false,
2928
+ "rstrip": false,
2929
+ "single_word": false,
2930
+ "special": true
2931
+ },
2932
+ "152009": {
2933
+ "content": "|<EXTRA_TOKENS_363>|",
2934
+ "lstrip": false,
2935
+ "normalized": false,
2936
+ "rstrip": false,
2937
+ "single_word": false,
2938
+ "special": true
2939
+ },
2940
+ "152010": {
2941
+ "content": "|<EXTRA_TOKENS_364>|",
2942
+ "lstrip": false,
2943
+ "normalized": false,
2944
+ "rstrip": false,
2945
+ "single_word": false,
2946
+ "special": true
2947
+ },
2948
+ "152011": {
2949
+ "content": "|<EXTRA_TOKENS_365>|",
2950
+ "lstrip": false,
2951
+ "normalized": false,
2952
+ "rstrip": false,
2953
+ "single_word": false,
2954
+ "special": true
2955
+ },
2956
+ "152012": {
2957
+ "content": "|<EXTRA_TOKENS_366>|",
2958
+ "lstrip": false,
2959
+ "normalized": false,
2960
+ "rstrip": false,
2961
+ "single_word": false,
2962
+ "special": true
2963
+ },
2964
+ "152013": {
2965
+ "content": "|<EXTRA_TOKENS_367>|",
2966
+ "lstrip": false,
2967
+ "normalized": false,
2968
+ "rstrip": false,
2969
+ "single_word": false,
2970
+ "special": true
2971
+ },
2972
+ "152014": {
2973
+ "content": "|<EXTRA_TOKENS_368>|",
2974
+ "lstrip": false,
2975
+ "normalized": false,
2976
+ "rstrip": false,
2977
+ "single_word": false,
2978
+ "special": true
2979
+ },
2980
+ "152015": {
2981
+ "content": "|<EXTRA_TOKENS_369>|",
2982
+ "lstrip": false,
2983
+ "normalized": false,
2984
+ "rstrip": false,
2985
+ "single_word": false,
2986
+ "special": true
2987
+ },
2988
+ "152016": {
2989
+ "content": "|<EXTRA_TOKENS_370>|",
2990
+ "lstrip": false,
2991
+ "normalized": false,
2992
+ "rstrip": false,
2993
+ "single_word": false,
2994
+ "special": true
2995
+ },
2996
+ "152017": {
2997
+ "content": "|<EXTRA_TOKENS_371>|",
2998
+ "lstrip": false,
2999
+ "normalized": false,
3000
+ "rstrip": false,
3001
+ "single_word": false,
3002
+ "special": true
3003
+ },
3004
+ "152018": {
3005
+ "content": "|<EXTRA_TOKENS_372>|",
3006
+ "lstrip": false,
3007
+ "normalized": false,
3008
+ "rstrip": false,
3009
+ "single_word": false,
3010
+ "special": true
3011
+ },
3012
+ "152019": {
3013
+ "content": "|<EXTRA_TOKENS_373>|",
3014
+ "lstrip": false,
3015
+ "normalized": false,
3016
+ "rstrip": false,
3017
+ "single_word": false,
3018
+ "special": true
3019
+ },
3020
+ "152020": {
3021
+ "content": "|<EXTRA_TOKENS_374>|",
3022
+ "lstrip": false,
3023
+ "normalized": false,
3024
+ "rstrip": false,
3025
+ "single_word": false,
3026
+ "special": true
3027
+ },
3028
+ "152021": {
3029
+ "content": "|<EXTRA_TOKENS_375>|",
3030
+ "lstrip": false,
3031
+ "normalized": false,
3032
+ "rstrip": false,
3033
+ "single_word": false,
3034
+ "special": true
3035
+ },
3036
+ "152022": {
3037
+ "content": "|<EXTRA_TOKENS_376>|",
3038
+ "lstrip": false,
3039
+ "normalized": false,
3040
+ "rstrip": false,
3041
+ "single_word": false,
3042
+ "special": true
3043
+ },
3044
+ "152023": {
3045
+ "content": "|<EXTRA_TOKENS_377>|",
3046
+ "lstrip": false,
3047
+ "normalized": false,
3048
+ "rstrip": false,
3049
+ "single_word": false,
3050
+ "special": true
3051
+ },
3052
+ "152024": {
3053
+ "content": "|<EXTRA_TOKENS_378>|",
3054
+ "lstrip": false,
3055
+ "normalized": false,
3056
+ "rstrip": false,
3057
+ "single_word": false,
3058
+ "special": true
3059
+ },
3060
+ "152025": {
3061
+ "content": "|<EXTRA_TOKENS_379>|",
3062
+ "lstrip": false,
3063
+ "normalized": false,
3064
+ "rstrip": false,
3065
+ "single_word": false,
3066
+ "special": true
3067
+ },
3068
+ "152026": {
3069
+ "content": "|<EXTRA_TOKENS_380>|",
3070
+ "lstrip": false,
3071
+ "normalized": false,
3072
+ "rstrip": false,
3073
+ "single_word": false,
3074
+ "special": true
3075
+ },
3076
+ "152027": {
3077
+ "content": "|<EXTRA_TOKENS_381>|",
3078
+ "lstrip": false,
3079
+ "normalized": false,
3080
+ "rstrip": false,
3081
+ "single_word": false,
3082
+ "special": true
3083
+ },
3084
+ "152028": {
3085
+ "content": "|<EXTRA_TOKENS_382>|",
3086
+ "lstrip": false,
3087
+ "normalized": false,
3088
+ "rstrip": false,
3089
+ "single_word": false,
3090
+ "special": true
3091
+ },
3092
+ "152029": {
3093
+ "content": "|<EXTRA_TOKENS_383>|",
3094
+ "lstrip": false,
3095
+ "normalized": false,
3096
+ "rstrip": false,
3097
+ "single_word": false,
3098
+ "special": true
3099
+ },
3100
+ "152030": {
3101
+ "content": "|<EXTRA_TOKENS_384>|",
3102
+ "lstrip": false,
3103
+ "normalized": false,
3104
+ "rstrip": false,
3105
+ "single_word": false,
3106
+ "special": true
3107
+ },
3108
+ "152031": {
3109
+ "content": "|<EXTRA_TOKENS_385>|",
3110
+ "lstrip": false,
3111
+ "normalized": false,
3112
+ "rstrip": false,
3113
+ "single_word": false,
3114
+ "special": true
3115
+ },
3116
+ "152032": {
3117
+ "content": "|<EXTRA_TOKENS_386>|",
3118
+ "lstrip": false,
3119
+ "normalized": false,
3120
+ "rstrip": false,
3121
+ "single_word": false,
3122
+ "special": true
3123
+ },
3124
+ "152033": {
3125
+ "content": "|<EXTRA_TOKENS_387>|",
3126
+ "lstrip": false,
3127
+ "normalized": false,
3128
+ "rstrip": false,
3129
+ "single_word": false,
3130
+ "special": true
3131
+ },
3132
+ "152034": {
3133
+ "content": "|<EXTRA_TOKENS_388>|",
3134
+ "lstrip": false,
3135
+ "normalized": false,
3136
+ "rstrip": false,
3137
+ "single_word": false,
3138
+ "special": true
3139
+ },
3140
+ "152035": {
3141
+ "content": "|<EXTRA_TOKENS_389>|",
3142
+ "lstrip": false,
3143
+ "normalized": false,
3144
+ "rstrip": false,
3145
+ "single_word": false,
3146
+ "special": true
3147
+ },
3148
+ "152036": {
3149
+ "content": "|<EXTRA_TOKENS_390>|",
3150
+ "lstrip": false,
3151
+ "normalized": false,
3152
+ "rstrip": false,
3153
+ "single_word": false,
3154
+ "special": true
3155
+ },
3156
+ "152037": {
3157
+ "content": "|<EXTRA_TOKENS_391>|",
3158
+ "lstrip": false,
3159
+ "normalized": false,
3160
+ "rstrip": false,
3161
+ "single_word": false,
3162
+ "special": true
3163
+ },
3164
+ "152038": {
3165
+ "content": "|<EXTRA_TOKENS_392>|",
3166
+ "lstrip": false,
3167
+ "normalized": false,
3168
+ "rstrip": false,
3169
+ "single_word": false,
3170
+ "special": true
3171
+ },
3172
+ "152039": {
3173
+ "content": "|<EXTRA_TOKENS_393>|",
3174
+ "lstrip": false,
3175
+ "normalized": false,
3176
+ "rstrip": false,
3177
+ "single_word": false,
3178
+ "special": true
3179
+ },
3180
+ "152040": {
3181
+ "content": "|<EXTRA_TOKENS_394>|",
3182
+ "lstrip": false,
3183
+ "normalized": false,
3184
+ "rstrip": false,
3185
+ "single_word": false,
3186
+ "special": true
3187
+ },
3188
+ "152041": {
3189
+ "content": "|<EXTRA_TOKENS_395>|",
3190
+ "lstrip": false,
3191
+ "normalized": false,
3192
+ "rstrip": false,
3193
+ "single_word": false,
3194
+ "special": true
3195
+ },
3196
+ "152042": {
3197
+ "content": "|<EXTRA_TOKENS_396>|",
3198
+ "lstrip": false,
3199
+ "normalized": false,
3200
+ "rstrip": false,
3201
+ "single_word": false,
3202
+ "special": true
3203
+ },
3204
+ "152043": {
3205
+ "content": "|<EXTRA_TOKENS_397>|",
3206
+ "lstrip": false,
3207
+ "normalized": false,
3208
+ "rstrip": false,
3209
+ "single_word": false,
3210
+ "special": true
3211
+ },
3212
+ "152044": {
3213
+ "content": "|<EXTRA_TOKENS_398>|",
3214
+ "lstrip": false,
3215
+ "normalized": false,
3216
+ "rstrip": false,
3217
+ "single_word": false,
3218
+ "special": true
3219
+ },
3220
+ "152045": {
3221
+ "content": "|<EXTRA_TOKENS_399>|",
3222
+ "lstrip": false,
3223
+ "normalized": false,
3224
+ "rstrip": false,
3225
+ "single_word": false,
3226
+ "special": true
3227
+ },
3228
+ "152046": {
3229
+ "content": "|<EXTRA_TOKENS_400>|",
3230
+ "lstrip": false,
3231
+ "normalized": false,
3232
+ "rstrip": false,
3233
+ "single_word": false,
3234
+ "special": true
3235
+ },
3236
+ "152047": {
3237
+ "content": "|<EXTRA_TOKENS_401>|",
3238
+ "lstrip": false,
3239
+ "normalized": false,
3240
+ "rstrip": false,
3241
+ "single_word": false,
3242
+ "special": true
3243
+ },
3244
+ "152048": {
3245
+ "content": "|<EXTRA_TOKENS_402>|",
3246
+ "lstrip": false,
3247
+ "normalized": false,
3248
+ "rstrip": false,
3249
+ "single_word": false,
3250
+ "special": true
3251
+ },
3252
+ "152049": {
3253
+ "content": "|<EXTRA_TOKENS_403>|",
3254
+ "lstrip": false,
3255
+ "normalized": false,
3256
+ "rstrip": false,
3257
+ "single_word": false,
3258
+ "special": true
3259
+ },
3260
+ "152050": {
3261
+ "content": "|<EXTRA_TOKENS_404>|",
3262
+ "lstrip": false,
3263
+ "normalized": false,
3264
+ "rstrip": false,
3265
+ "single_word": false,
3266
+ "special": true
3267
+ },
3268
+ "152051": {
3269
+ "content": "|<EXTRA_TOKENS_405>|",
3270
+ "lstrip": false,
3271
+ "normalized": false,
3272
+ "rstrip": false,
3273
+ "single_word": false,
3274
+ "special": true
3275
+ },
3276
+ "152052": {
3277
+ "content": "|<EXTRA_TOKENS_406>|",
3278
+ "lstrip": false,
3279
+ "normalized": false,
3280
+ "rstrip": false,
3281
+ "single_word": false,
3282
+ "special": true
3283
+ },
3284
+ "152053": {
3285
+ "content": "|<EXTRA_TOKENS_407>|",
3286
+ "lstrip": false,
3287
+ "normalized": false,
3288
+ "rstrip": false,
3289
+ "single_word": false,
3290
+ "special": true
3291
+ },
3292
+ "152054": {
3293
+ "content": "|<EXTRA_TOKENS_408>|",
3294
+ "lstrip": false,
3295
+ "normalized": false,
3296
+ "rstrip": false,
3297
+ "single_word": false,
3298
+ "special": true
3299
+ },
3300
+ "152055": {
3301
+ "content": "|<EXTRA_TOKENS_409>|",
3302
+ "lstrip": false,
3303
+ "normalized": false,
3304
+ "rstrip": false,
3305
+ "single_word": false,
3306
+ "special": true
3307
+ },
3308
+ "152056": {
3309
+ "content": "|<EXTRA_TOKENS_410>|",
3310
+ "lstrip": false,
3311
+ "normalized": false,
3312
+ "rstrip": false,
3313
+ "single_word": false,
3314
+ "special": true
3315
+ },
3316
+ "152057": {
3317
+ "content": "|<EXTRA_TOKENS_411>|",
3318
+ "lstrip": false,
3319
+ "normalized": false,
3320
+ "rstrip": false,
3321
+ "single_word": false,
3322
+ "special": true
3323
+ },
3324
+ "152058": {
3325
+ "content": "|<EXTRA_TOKENS_412>|",
3326
+ "lstrip": false,
3327
+ "normalized": false,
3328
+ "rstrip": false,
3329
+ "single_word": false,
3330
+ "special": true
3331
+ },
3332
+ "152059": {
3333
+ "content": "|<EXTRA_TOKENS_413>|",
3334
+ "lstrip": false,
3335
+ "normalized": false,
3336
+ "rstrip": false,
3337
+ "single_word": false,
3338
+ "special": true
3339
+ },
3340
+ "152060": {
3341
+ "content": "|<EXTRA_TOKENS_414>|",
3342
+ "lstrip": false,
3343
+ "normalized": false,
3344
+ "rstrip": false,
3345
+ "single_word": false,
3346
+ "special": true
3347
+ },
3348
+ "152061": {
3349
+ "content": "|<EXTRA_TOKENS_415>|",
3350
+ "lstrip": false,
3351
+ "normalized": false,
3352
+ "rstrip": false,
3353
+ "single_word": false,
3354
+ "special": true
3355
+ },
3356
+ "152062": {
3357
+ "content": "|<EXTRA_TOKENS_416>|",
3358
+ "lstrip": false,
3359
+ "normalized": false,
3360
+ "rstrip": false,
3361
+ "single_word": false,
3362
+ "special": true
3363
+ },
3364
+ "152063": {
3365
+ "content": "|<EXTRA_TOKENS_417>|",
3366
+ "lstrip": false,
3367
+ "normalized": false,
3368
+ "rstrip": false,
3369
+ "single_word": false,
3370
+ "special": true
3371
+ },
3372
+ "152064": {
3373
+ "content": "<im_start>",
3374
+ "lstrip": false,
3375
+ "normalized": false,
3376
+ "rstrip": false,
3377
+ "single_word": false,
3378
+ "special": true
3379
+ },
3380
+ "152065": {
3381
+ "content": "<im_end>",
3382
+ "lstrip": false,
3383
+ "normalized": false,
3384
+ "rstrip": false,
3385
+ "single_word": false,
3386
+ "special": true
3387
+ },
3388
+ "152066": {
3389
+ "content": "<im_patch>",
3390
+ "lstrip": false,
3391
+ "normalized": false,
3392
+ "rstrip": false,
3393
+ "single_word": false,
3394
+ "special": true
3395
+ },
3396
+ "152067": {
3397
+ "content": "<im_col>",
3398
+ "lstrip": false,
3399
+ "normalized": false,
3400
+ "rstrip": false,
3401
+ "single_word": false,
3402
+ "special": true
3403
+ },
3404
+ "152068": {
3405
+ "content": "<|image|>",
3406
+ "lstrip": false,
3407
+ "normalized": false,
3408
+ "rstrip": false,
3409
+ "single_word": false,
3410
+ "special": true
3411
+ }
3412
+ },
3413
+ "additional_special_tokens": [
3414
+ "|<EXTRA_TOKENS_0>|",
3415
+ "|<EXTRA_TOKENS_1>|",
3416
+ "|<EXTRA_TOKENS_2>|",
3417
+ "|<EXTRA_TOKENS_3>|",
3418
+ "|<EXTRA_TOKENS_4>|",
3419
+ "|<EXTRA_TOKENS_5>|",
3420
+ "|<EXTRA_TOKENS_6>|",
3421
+ "|<EXTRA_TOKENS_7>|",
3422
+ "|<EXTRA_TOKENS_8>|",
3423
+ "|<EXTRA_TOKENS_9>|",
3424
+ "|<EXTRA_TOKENS_10>|",
3425
+ "|<EXTRA_TOKENS_11>|",
3426
+ "|<EXTRA_TOKENS_12>|",
3427
+ "|<EXTRA_TOKENS_13>|",
3428
+ "|<EXTRA_TOKENS_14>|",
3429
+ "|<EXTRA_TOKENS_15>|",
3430
+ "|<EXTRA_TOKENS_16>|",
3431
+ "|<EXTRA_TOKENS_17>|",
3432
+ "|<EXTRA_TOKENS_18>|",
3433
+ "|<EXTRA_TOKENS_19>|",
3434
+ "|<EXTRA_TOKENS_20>|",
3435
+ "|<EXTRA_TOKENS_21>|",
3436
+ "|<EXTRA_TOKENS_22>|",
3437
+ "|<EXTRA_TOKENS_23>|",
3438
+ "|<EXTRA_TOKENS_24>|",
3439
+ "|<EXTRA_TOKENS_25>|",
3440
+ "|<EXTRA_TOKENS_26>|",
3441
+ "|<EXTRA_TOKENS_27>|",
3442
+ "|<EXTRA_TOKENS_28>|",
3443
+ "|<EXTRA_TOKENS_29>|",
3444
+ "|<EXTRA_TOKENS_30>|",
3445
+ "|<EXTRA_TOKENS_31>|",
3446
+ "|<EXTRA_TOKENS_32>|",
3447
+ "|<EXTRA_TOKENS_33>|",
3448
+ "|<EXTRA_TOKENS_34>|",
3449
+ "|<EXTRA_TOKENS_35>|",
3450
+ "|<EXTRA_TOKENS_36>|",
3451
+ "|<EXTRA_TOKENS_37>|",
3452
+ "|<EXTRA_TOKENS_38>|",
3453
+ "|<EXTRA_TOKENS_39>|",
3454
+ "|<EXTRA_TOKENS_40>|",
3455
+ "|<EXTRA_TOKENS_41>|",
3456
+ "|<EXTRA_TOKENS_42>|",
3457
+ "|<EXTRA_TOKENS_43>|",
3458
+ "|<EXTRA_TOKENS_44>|",
3459
+ "|<EXTRA_TOKENS_45>|",
3460
+ "|<EXTRA_TOKENS_46>|",
3461
+ "|<EXTRA_TOKENS_47>|",
3462
+ "|<EXTRA_TOKENS_48>|",
3463
+ "|<EXTRA_TOKENS_49>|",
3464
+ "|<EXTRA_TOKENS_50>|",
3465
+ "|<EXTRA_TOKENS_51>|",
3466
+ "|<EXTRA_TOKENS_52>|",
3467
+ "|<EXTRA_TOKENS_53>|",
3468
+ "|<EXTRA_TOKENS_54>|",
3469
+ "|<EXTRA_TOKENS_55>|",
3470
+ "|<EXTRA_TOKENS_56>|",
3471
+ "|<EXTRA_TOKENS_57>|",
3472
+ "|<EXTRA_TOKENS_58>|",
3473
+ "|<EXTRA_TOKENS_59>|",
3474
+ "|<EXTRA_TOKENS_60>|",
3475
+ "|<EXTRA_TOKENS_61>|",
3476
+ "|<EXTRA_TOKENS_62>|",
3477
+ "|<EXTRA_TOKENS_63>|",
3478
+ "|<EXTRA_TOKENS_64>|",
3479
+ "|<EXTRA_TOKENS_65>|",
3480
+ "|<EXTRA_TOKENS_66>|",
3481
+ "|<EXTRA_TOKENS_67>|",
3482
+ "|<EXTRA_TOKENS_68>|",
3483
+ "|<EXTRA_TOKENS_69>|",
3484
+ "|<EXTRA_TOKENS_70>|",
3485
+ "|<EXTRA_TOKENS_71>|",
3486
+ "|<EXTRA_TOKENS_72>|",
3487
+ "|<EXTRA_TOKENS_73>|",
3488
+ "|<EXTRA_TOKENS_74>|",
3489
+ "|<EXTRA_TOKENS_75>|",
3490
+ "|<EXTRA_TOKENS_76>|",
3491
+ "|<EXTRA_TOKENS_77>|",
3492
+ "|<EXTRA_TOKENS_78>|",
3493
+ "|<EXTRA_TOKENS_79>|",
3494
+ "|<EXTRA_TOKENS_80>|",
3495
+ "|<EXTRA_TOKENS_81>|",
3496
+ "|<EXTRA_TOKENS_82>|",
3497
+ "|<EXTRA_TOKENS_83>|",
3498
+ "|<EXTRA_TOKENS_84>|",
3499
+ "|<EXTRA_TOKENS_85>|",
3500
+ "|<EXTRA_TOKENS_86>|",
3501
+ "|<EXTRA_TOKENS_87>|",
3502
+ "|<EXTRA_TOKENS_88>|",
3503
+ "|<EXTRA_TOKENS_89>|",
3504
+ "|<EXTRA_TOKENS_90>|",
3505
+ "|<EXTRA_TOKENS_91>|",
3506
+ "|<EXTRA_TOKENS_92>|",
3507
+ "|<EXTRA_TOKENS_93>|",
3508
+ "|<EXTRA_TOKENS_94>|",
3509
+ "|<EXTRA_TOKENS_95>|",
3510
+ "|<EXTRA_TOKENS_96>|",
3511
+ "|<EXTRA_TOKENS_97>|",
3512
+ "|<EXTRA_TOKENS_98>|",
3513
+ "|<EXTRA_TOKENS_99>|",
3514
+ "|<EXTRA_TOKENS_100>|",
3515
+ "|<EXTRA_TOKENS_101>|",
3516
+ "|<EXTRA_TOKENS_102>|",
3517
+ "|<EXTRA_TOKENS_103>|",
3518
+ "|<EXTRA_TOKENS_104>|",
3519
+ "|<EXTRA_TOKENS_105>|",
3520
+ "|<EXTRA_TOKENS_106>|",
3521
+ "|<EXTRA_TOKENS_107>|",
3522
+ "|<EXTRA_TOKENS_108>|",
3523
+ "|<EXTRA_TOKENS_109>|",
3524
+ "|<EXTRA_TOKENS_110>|",
3525
+ "|<EXTRA_TOKENS_111>|",
3526
+ "|<EXTRA_TOKENS_112>|",
3527
+ "|<EXTRA_TOKENS_113>|",
3528
+ "|<EXTRA_TOKENS_114>|",
3529
+ "|<EXTRA_TOKENS_115>|",
3530
+ "|<EXTRA_TOKENS_116>|",
3531
+ "|<EXTRA_TOKENS_117>|",
3532
+ "|<EXTRA_TOKENS_118>|",
3533
+ "|<EXTRA_TOKENS_119>|",
3534
+ "|<EXTRA_TOKENS_120>|",
3535
+ "|<EXTRA_TOKENS_121>|",
3536
+ "|<EXTRA_TOKENS_122>|",
3537
+ "|<EXTRA_TOKENS_123>|",
3538
+ "|<EXTRA_TOKENS_124>|",
3539
+ "|<EXTRA_TOKENS_125>|",
3540
+ "|<EXTRA_TOKENS_126>|",
3541
+ "|<EXTRA_TOKENS_127>|",
3542
+ "|<EXTRA_TOKENS_128>|",
3543
+ "|<EXTRA_TOKENS_129>|",
3544
+ "|<EXTRA_TOKENS_130>|",
3545
+ "|<EXTRA_TOKENS_131>|",
3546
+ "|<EXTRA_TOKENS_132>|",
3547
+ "|<EXTRA_TOKENS_133>|",
3548
+ "|<EXTRA_TOKENS_134>|",
3549
+ "|<EXTRA_TOKENS_135>|",
3550
+ "|<EXTRA_TOKENS_136>|",
3551
+ "|<EXTRA_TOKENS_137>|",
3552
+ "|<EXTRA_TOKENS_138>|",
3553
+ "|<EXTRA_TOKENS_139>|",
3554
+ "|<EXTRA_TOKENS_140>|",
3555
+ "|<EXTRA_TOKENS_141>|",
3556
+ "|<EXTRA_TOKENS_142>|",
3557
+ "|<EXTRA_TOKENS_143>|",
3558
+ "|<EXTRA_TOKENS_144>|",
3559
+ "|<EXTRA_TOKENS_145>|",
3560
+ "|<EXTRA_TOKENS_146>|",
3561
+ "|<EXTRA_TOKENS_147>|",
3562
+ "|<EXTRA_TOKENS_148>|",
3563
+ "|<EXTRA_TOKENS_149>|",
3564
+ "|<EXTRA_TOKENS_150>|",
3565
+ "|<EXTRA_TOKENS_151>|",
3566
+ "|<EXTRA_TOKENS_152>|",
3567
+ "|<EXTRA_TOKENS_153>|",
3568
+ "|<EXTRA_TOKENS_154>|",
3569
+ "|<EXTRA_TOKENS_155>|",
3570
+ "|<EXTRA_TOKENS_156>|",
3571
+ "|<EXTRA_TOKENS_157>|",
3572
+ "|<EXTRA_TOKENS_158>|",
3573
+ "|<EXTRA_TOKENS_159>|",
3574
+ "|<EXTRA_TOKENS_160>|",
3575
+ "|<EXTRA_TOKENS_161>|",
3576
+ "|<EXTRA_TOKENS_162>|",
3577
+ "|<EXTRA_TOKENS_163>|",
3578
+ "|<EXTRA_TOKENS_164>|",
3579
+ "|<EXTRA_TOKENS_165>|",
3580
+ "|<EXTRA_TOKENS_166>|",
3581
+ "|<EXTRA_TOKENS_167>|",
3582
+ "|<EXTRA_TOKENS_168>|",
3583
+ "|<EXTRA_TOKENS_169>|",
3584
+ "|<EXTRA_TOKENS_170>|",
3585
+ "|<EXTRA_TOKENS_171>|",
3586
+ "|<EXTRA_TOKENS_172>|",
3587
+ "|<EXTRA_TOKENS_173>|",
3588
+ "|<EXTRA_TOKENS_174>|",
3589
+ "|<EXTRA_TOKENS_175>|",
3590
+ "|<EXTRA_TOKENS_176>|",
3591
+ "|<EXTRA_TOKENS_177>|",
3592
+ "|<EXTRA_TOKENS_178>|",
3593
+ "|<EXTRA_TOKENS_179>|",
3594
+ "|<EXTRA_TOKENS_180>|",
3595
+ "|<EXTRA_TOKENS_181>|",
3596
+ "|<EXTRA_TOKENS_182>|",
3597
+ "|<EXTRA_TOKENS_183>|",
3598
+ "|<EXTRA_TOKENS_184>|",
3599
+ "|<EXTRA_TOKENS_185>|",
3600
+ "|<EXTRA_TOKENS_186>|",
3601
+ "|<EXTRA_TOKENS_187>|",
3602
+ "|<EXTRA_TOKENS_188>|",
3603
+ "|<EXTRA_TOKENS_189>|",
3604
+ "|<EXTRA_TOKENS_190>|",
3605
+ "|<EXTRA_TOKENS_191>|",
3606
+ "|<EXTRA_TOKENS_192>|",
3607
+ "|<EXTRA_TOKENS_193>|",
3608
+ "|<EXTRA_TOKENS_194>|",
3609
+ "|<EXTRA_TOKENS_195>|",
3610
+ "|<EXTRA_TOKENS_196>|",
3611
+ "|<EXTRA_TOKENS_197>|",
3612
+ "|<EXTRA_TOKENS_198>|",
3613
+ "|<EXTRA_TOKENS_199>|",
3614
+ "|<EXTRA_TOKENS_200>|",
3615
+ "|<EXTRA_TOKENS_201>|",
3616
+ "|<EXTRA_TOKENS_202>|",
3617
+ "|<EXTRA_TOKENS_203>|",
3618
+ "|<EXTRA_TOKENS_204>|",
3619
+ "|<EXTRA_TOKENS_205>|",
3620
+ "|<EXTRA_TOKENS_206>|",
3621
+ "|<EXTRA_TOKENS_207>|",
3622
+ "|<EXTRA_TOKENS_208>|",
3623
+ "|<EXTRA_TOKENS_209>|",
3624
+ "|<EXTRA_TOKENS_210>|",
3625
+ "|<EXTRA_TOKENS_211>|",
3626
+ "|<EXTRA_TOKENS_212>|",
3627
+ "|<EXTRA_TOKENS_213>|",
3628
+ "|<EXTRA_TOKENS_214>|",
3629
+ "|<EXTRA_TOKENS_215>|",
3630
+ "|<EXTRA_TOKENS_216>|",
3631
+ "|<EXTRA_TOKENS_217>|",
3632
+ "|<EXTRA_TOKENS_218>|",
3633
+ "|<EXTRA_TOKENS_219>|",
3634
+ "|<EXTRA_TOKENS_220>|",
3635
+ "|<EXTRA_TOKENS_221>|",
3636
+ "|<EXTRA_TOKENS_222>|",
3637
+ "|<EXTRA_TOKENS_223>|",
3638
+ "|<EXTRA_TOKENS_224>|",
3639
+ "|<EXTRA_TOKENS_225>|",
3640
+ "|<EXTRA_TOKENS_226>|",
3641
+ "|<EXTRA_TOKENS_227>|",
3642
+ "|<EXTRA_TOKENS_228>|",
3643
+ "|<EXTRA_TOKENS_229>|",
3644
+ "|<EXTRA_TOKENS_230>|",
3645
+ "|<EXTRA_TOKENS_231>|",
3646
+ "|<EXTRA_TOKENS_232>|",
3647
+ "|<EXTRA_TOKENS_233>|",
3648
+ "|<EXTRA_TOKENS_234>|",
3649
+ "|<EXTRA_TOKENS_235>|",
3650
+ "|<EXTRA_TOKENS_236>|",
3651
+ "|<EXTRA_TOKENS_237>|",
3652
+ "|<EXTRA_TOKENS_238>|",
3653
+ "|<EXTRA_TOKENS_239>|",
3654
+ "|<EXTRA_TOKENS_240>|",
3655
+ "|<EXTRA_TOKENS_241>|",
3656
+ "|<EXTRA_TOKENS_242>|",
3657
+ "|<EXTRA_TOKENS_243>|",
3658
+ "|<EXTRA_TOKENS_244>|",
3659
+ "|<EXTRA_TOKENS_245>|",
3660
+ "|<EXTRA_TOKENS_246>|",
3661
+ "|<EXTRA_TOKENS_247>|",
3662
+ "|<EXTRA_TOKENS_248>|",
3663
+ "|<EXTRA_TOKENS_249>|",
3664
+ "|<EXTRA_TOKENS_250>|",
3665
+ "|<EXTRA_TOKENS_251>|",
3666
+ "|<EXTRA_TOKENS_252>|",
3667
+ "|<EXTRA_TOKENS_253>|",
3668
+ "|<EXTRA_TOKENS_254>|",
3669
+ "|<EXTRA_TOKENS_255>|",
3670
+ "|<EXTRA_TOKENS_256>|",
3671
+ "|<EXTRA_TOKENS_257>|",
3672
+ "|<EXTRA_TOKENS_258>|",
3673
+ "|<EXTRA_TOKENS_259>|",
3674
+ "|<EXTRA_TOKENS_260>|",
3675
+ "|<EXTRA_TOKENS_261>|",
3676
+ "|<EXTRA_TOKENS_262>|",
3677
+ "|<EXTRA_TOKENS_263>|",
3678
+ "|<EXTRA_TOKENS_264>|",
3679
+ "|<EXTRA_TOKENS_265>|",
3680
+ "|<EXTRA_TOKENS_266>|",
3681
+ "|<EXTRA_TOKENS_267>|",
3682
+ "|<EXTRA_TOKENS_268>|",
3683
+ "|<EXTRA_TOKENS_269>|",
3684
+ "|<EXTRA_TOKENS_270>|",
3685
+ "|<EXTRA_TOKENS_271>|",
3686
+ "|<EXTRA_TOKENS_272>|",
3687
+ "|<EXTRA_TOKENS_273>|",
3688
+ "|<EXTRA_TOKENS_274>|",
3689
+ "|<EXTRA_TOKENS_275>|",
3690
+ "|<EXTRA_TOKENS_276>|",
3691
+ "|<EXTRA_TOKENS_277>|",
3692
+ "|<EXTRA_TOKENS_278>|",
3693
+ "|<EXTRA_TOKENS_279>|",
3694
+ "|<EXTRA_TOKENS_280>|",
3695
+ "|<EXTRA_TOKENS_281>|",
3696
+ "|<EXTRA_TOKENS_282>|",
3697
+ "|<EXTRA_TOKENS_283>|",
3698
+ "|<EXTRA_TOKENS_284>|",
3699
+ "|<EXTRA_TOKENS_285>|",
3700
+ "|<EXTRA_TOKENS_286>|",
3701
+ "|<EXTRA_TOKENS_287>|",
3702
+ "|<EXTRA_TOKENS_288>|",
3703
+ "|<EXTRA_TOKENS_289>|",
3704
+ "|<EXTRA_TOKENS_290>|",
3705
+ "|<EXTRA_TOKENS_291>|",
3706
+ "|<EXTRA_TOKENS_292>|",
3707
+ "|<EXTRA_TOKENS_293>|",
3708
+ "|<EXTRA_TOKENS_294>|",
3709
+ "|<EXTRA_TOKENS_295>|",
3710
+ "|<EXTRA_TOKENS_296>|",
3711
+ "|<EXTRA_TOKENS_297>|",
3712
+ "|<EXTRA_TOKENS_298>|",
3713
+ "|<EXTRA_TOKENS_299>|",
3714
+ "|<EXTRA_TOKENS_300>|",
3715
+ "|<EXTRA_TOKENS_301>|",
3716
+ "|<EXTRA_TOKENS_302>|",
3717
+ "|<EXTRA_TOKENS_303>|",
3718
+ "|<EXTRA_TOKENS_304>|",
3719
+ "|<EXTRA_TOKENS_305>|",
3720
+ "|<EXTRA_TOKENS_306>|",
3721
+ "|<EXTRA_TOKENS_307>|",
3722
+ "|<EXTRA_TOKENS_308>|",
3723
+ "|<EXTRA_TOKENS_309>|",
3724
+ "|<EXTRA_TOKENS_310>|",
3725
+ "|<EXTRA_TOKENS_311>|",
3726
+ "|<EXTRA_TOKENS_312>|",
3727
+ "|<EXTRA_TOKENS_313>|",
3728
+ "|<EXTRA_TOKENS_314>|",
3729
+ "|<EXTRA_TOKENS_315>|",
3730
+ "|<EXTRA_TOKENS_316>|",
3731
+ "|<EXTRA_TOKENS_317>|",
3732
+ "|<EXTRA_TOKENS_318>|",
3733
+ "|<EXTRA_TOKENS_319>|",
3734
+ "|<EXTRA_TOKENS_320>|",
3735
+ "|<EXTRA_TOKENS_321>|",
3736
+ "|<EXTRA_TOKENS_322>|",
3737
+ "|<EXTRA_TOKENS_323>|",
3738
+ "|<EXTRA_TOKENS_324>|",
3739
+ "|<EXTRA_TOKENS_325>|",
3740
+ "|<EXTRA_TOKENS_326>|",
3741
+ "|<EXTRA_TOKENS_327>|",
3742
+ "|<EXTRA_TOKENS_328>|",
3743
+ "|<EXTRA_TOKENS_329>|",
3744
+ "|<EXTRA_TOKENS_330>|",
3745
+ "|<EXTRA_TOKENS_331>|",
3746
+ "|<EXTRA_TOKENS_332>|",
3747
+ "|<EXTRA_TOKENS_333>|",
3748
+ "|<EXTRA_TOKENS_334>|",
3749
+ "|<EXTRA_TOKENS_335>|",
3750
+ "|<EXTRA_TOKENS_336>|",
3751
+ "|<EXTRA_TOKENS_337>|",
3752
+ "|<EXTRA_TOKENS_338>|",
3753
+ "|<EXTRA_TOKENS_339>|",
3754
+ "|<EXTRA_TOKENS_340>|",
3755
+ "|<EXTRA_TOKENS_341>|",
3756
+ "|<EXTRA_TOKENS_342>|",
3757
+ "|<EXTRA_TOKENS_343>|",
3758
+ "|<EXTRA_TOKENS_344>|",
3759
+ "|<EXTRA_TOKENS_345>|",
3760
+ "|<EXTRA_TOKENS_346>|",
3761
+ "|<EXTRA_TOKENS_347>|",
3762
+ "|<EXTRA_TOKENS_348>|",
3763
+ "|<EXTRA_TOKENS_349>|",
3764
+ "|<EXTRA_TOKENS_350>|",
3765
+ "|<EXTRA_TOKENS_351>|",
3766
+ "|<EXTRA_TOKENS_352>|",
3767
+ "|<EXTRA_TOKENS_353>|",
3768
+ "|<EXTRA_TOKENS_354>|",
3769
+ "|<EXTRA_TOKENS_355>|",
3770
+ "|<EXTRA_TOKENS_356>|",
3771
+ "|<EXTRA_TOKENS_357>|",
3772
+ "|<EXTRA_TOKENS_358>|",
3773
+ "|<EXTRA_TOKENS_359>|",
3774
+ "|<EXTRA_TOKENS_360>|",
3775
+ "|<EXTRA_TOKENS_361>|",
3776
+ "|<EXTRA_TOKENS_362>|",
3777
+ "|<EXTRA_TOKENS_363>|",
3778
+ "|<EXTRA_TOKENS_364>|",
3779
+ "|<EXTRA_TOKENS_365>|",
3780
+ "|<EXTRA_TOKENS_366>|",
3781
+ "|<EXTRA_TOKENS_367>|",
3782
+ "|<EXTRA_TOKENS_368>|",
3783
+ "|<EXTRA_TOKENS_369>|",
3784
+ "|<EXTRA_TOKENS_370>|",
3785
+ "|<EXTRA_TOKENS_371>|",
3786
+ "|<EXTRA_TOKENS_372>|",
3787
+ "|<EXTRA_TOKENS_373>|",
3788
+ "|<EXTRA_TOKENS_374>|",
3789
+ "|<EXTRA_TOKENS_375>|",
3790
+ "|<EXTRA_TOKENS_376>|",
3791
+ "|<EXTRA_TOKENS_377>|",
3792
+ "|<EXTRA_TOKENS_378>|",
3793
+ "|<EXTRA_TOKENS_379>|",
3794
+ "|<EXTRA_TOKENS_380>|",
3795
+ "|<EXTRA_TOKENS_381>|",
3796
+ "|<EXTRA_TOKENS_382>|",
3797
+ "|<EXTRA_TOKENS_383>|",
3798
+ "|<EXTRA_TOKENS_384>|",
3799
+ "|<EXTRA_TOKENS_385>|",
3800
+ "|<EXTRA_TOKENS_386>|",
3801
+ "|<EXTRA_TOKENS_387>|",
3802
+ "|<EXTRA_TOKENS_388>|",
3803
+ "|<EXTRA_TOKENS_389>|",
3804
+ "|<EXTRA_TOKENS_390>|",
3805
+ "|<EXTRA_TOKENS_391>|",
3806
+ "|<EXTRA_TOKENS_392>|",
3807
+ "|<EXTRA_TOKENS_393>|",
3808
+ "|<EXTRA_TOKENS_394>|",
3809
+ "|<EXTRA_TOKENS_395>|",
3810
+ "|<EXTRA_TOKENS_396>|",
3811
+ "|<EXTRA_TOKENS_397>|",
3812
+ "|<EXTRA_TOKENS_398>|",
3813
+ "|<EXTRA_TOKENS_399>|",
3814
+ "|<EXTRA_TOKENS_400>|",
3815
+ "|<EXTRA_TOKENS_401>|",
3816
+ "|<EXTRA_TOKENS_402>|",
3817
+ "|<EXTRA_TOKENS_403>|",
3818
+ "|<EXTRA_TOKENS_404>|",
3819
+ "|<EXTRA_TOKENS_405>|",
3820
+ "|<EXTRA_TOKENS_406>|",
3821
+ "|<EXTRA_TOKENS_407>|",
3822
+ "|<EXTRA_TOKENS_408>|",
3823
+ "|<EXTRA_TOKENS_409>|",
3824
+ "|<EXTRA_TOKENS_410>|",
3825
+ "|<EXTRA_TOKENS_411>|",
3826
+ "|<EXTRA_TOKENS_412>|",
3827
+ "|<EXTRA_TOKENS_413>|",
3828
+ "|<EXTRA_TOKENS_414>|",
3829
+ "|<EXTRA_TOKENS_415>|",
3830
+ "|<EXTRA_TOKENS_416>|",
3831
+ "|<EXTRA_TOKENS_417>|",
3832
+ "<im_start>",
3833
+ "<im_end>",
3834
+ "<im_patch>",
3835
+ "<im_col>",
3836
+ "<|image|>"
3837
+ ],
3838
+ "auto_map": {
3839
+ "AutoProcessor": "preprocessing_molmo.MolmoProcessor"
3840
+ },
3841
+ "bos_token": null,
3842
+ "chat_template": "{% for message in messages -%}\n {%- if (loop.index % 2 == 1 and message['role'] != 'user') or \n (loop.index % 2 == 0 and message['role'].lower() != 'assistant') -%}\n {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}\n {%- endif -%}\n {{ message['role'].capitalize() + ': ' + message['content'] }}\n {%- if not loop.last -%}\n {{ ' ' }}\n {%- endif %}\n {%- endfor -%}\n {%- if add_generation_prompt -%}\n {{ ' Assistant:' }}\n {%- endif %}",
3843
+ "clean_up_tokenization_spaces": false,
3844
+ "eos_token": "<|endoftext|>",
3845
+ "errors": "replace",
3846
+ "extra_special_tokens": {},
3847
+ "model_max_length": 32768,
3848
+ "pad_token": "<|endoftext|>",
3849
+ "processor_class": "MolmoProcessor",
3850
+ "split_special_tokens": false,
3851
+ "tokenizer_class": "Qwen2Tokenizer",
3852
+ "unk_token": null
3853
+ }
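A minimal usage sketch for the processor and chat template configured above, assuming this folder has been pushed to a Hugging Face repo. The repo id below is a placeholder, not part of this commit; trust_remote_code=True is needed because auto_map resolves the custom preprocessing_molmo.MolmoProcessor.

from transformers import AutoProcessor, AutoTokenizer

repo_id = "your-org/molmo-7b-d-sft"  # placeholder; substitute the actual upload target

# The processor class is pulled in through auto_map, so remote code must be trusted.
processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True)

# The chat template renders alternating "User: ..." / "Assistant: ..." turns.
tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
messages = [{"role": "user", "content": "Point to the red mug."}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)  # roughly: "User: Point to the red mug. Assistant:"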
trainer_state.json ADDED
@@ -0,0 +1,890 @@
1
+ {
2
+ "best_metric": 0.21225065,
3
+ "best_model_checkpoint": "/workspace/output/molmo-7b-d/v1-20250103-233013/checkpoint-414",
4
+ "epoch": 3.0,
5
+ "eval_steps": 200,
6
+ "global_step": 414,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "acc": 0.93268561,
13
+ "epoch": 0.007272727272727273,
14
+ "grad_norm": 4.360905168604235,
15
+ "learning_rate": 0.0,
16
+ "loss": 0.15919656,
17
+ "memory(GiB)": 131.1,
18
+ "step": 1,
19
+ "train_speed(iter/s)": 0.051814
20
+ },
21
+ {
22
+ "acc": 0.93592656,
23
+ "epoch": 0.03636363636363636,
24
+ "grad_norm": 5.722024176509264,
25
+ "learning_rate": 4.829949384917788e-06,
26
+ "loss": 0.16946605,
27
+ "memory(GiB)": 131.7,
28
+ "step": 5,
29
+ "train_speed(iter/s)": 0.164947
30
+ },
31
+ {
32
+ "acc": 0.92745094,
33
+ "epoch": 0.07272727272727272,
34
+ "grad_norm": 5.449760109713864,
35
+ "learning_rate": 6.910095361682884e-06,
36
+ "loss": 0.19423571,
37
+ "memory(GiB)": 131.7,
38
+ "step": 10,
39
+ "train_speed(iter/s)": 0.228063
40
+ },
41
+ {
42
+ "acc": 0.9191308,
43
+ "epoch": 0.10909090909090909,
44
+ "grad_norm": 5.591882854054257,
45
+ "learning_rate": 8.126902754116446e-06,
46
+ "loss": 0.2132081,
47
+ "memory(GiB)": 131.7,
48
+ "step": 15,
49
+ "train_speed(iter/s)": 0.262462
50
+ },
51
+ {
52
+ "acc": 0.91665707,
53
+ "epoch": 0.14545454545454545,
54
+ "grad_norm": 8.643333147328232,
55
+ "learning_rate": 8.990241338447979e-06,
56
+ "loss": 0.2489913,
57
+ "memory(GiB)": 132.85,
58
+ "step": 20,
59
+ "train_speed(iter/s)": 0.283686
60
+ },
61
+ {
62
+ "acc": 0.92767944,
63
+ "epoch": 0.18181818181818182,
64
+ "grad_norm": 5.521300358752013,
65
+ "learning_rate": 9.659898769835576e-06,
66
+ "loss": 0.20275159,
67
+ "memory(GiB)": 100.16,
68
+ "step": 25,
69
+ "train_speed(iter/s)": 0.297772
70
+ },
71
+ {
72
+ "acc": 0.91237392,
73
+ "epoch": 0.21818181818181817,
74
+ "grad_norm": 4.691287833576054,
75
+ "learning_rate": 9.999635040777627e-06,
76
+ "loss": 0.24152677,
77
+ "memory(GiB)": 100.16,
78
+ "step": 30,
79
+ "train_speed(iter/s)": 0.306552
80
+ },
81
+ {
82
+ "acc": 0.89811802,
83
+ "epoch": 0.2545454545454545,
84
+ "grad_norm": 2.3267326177072176,
85
+ "learning_rate": 9.995529861428146e-06,
86
+ "loss": 0.2682821,
87
+ "memory(GiB)": 100.16,
88
+ "step": 35,
89
+ "train_speed(iter/s)": 0.312152
90
+ },
91
+ {
92
+ "acc": 0.91184635,
93
+ "epoch": 0.2909090909090909,
94
+ "grad_norm": 3.241573273096398,
95
+ "learning_rate": 9.986867061882612e-06,
96
+ "loss": 0.23578806,
97
+ "memory(GiB)": 100.16,
98
+ "step": 40,
99
+ "train_speed(iter/s)": 0.314978
100
+ },
101
+ {
102
+ "acc": 0.9012291,
103
+ "epoch": 0.32727272727272727,
104
+ "grad_norm": 2.3552152207973713,
105
+ "learning_rate": 9.973654546348053e-06,
106
+ "loss": 0.25761139,
107
+ "memory(GiB)": 100.16,
108
+ "step": 45,
109
+ "train_speed(iter/s)": 0.319208
110
+ },
111
+ {
112
+ "acc": 0.9016325,
113
+ "epoch": 0.36363636363636365,
114
+ "grad_norm": 3.1153181076119703,
115
+ "learning_rate": 9.955904370333514e-06,
116
+ "loss": 0.24715631,
117
+ "memory(GiB)": 100.16,
118
+ "step": 50,
119
+ "train_speed(iter/s)": 0.32296
120
+ },
121
+ {
122
+ "acc": 0.89749699,
123
+ "epoch": 0.4,
124
+ "grad_norm": 2.4498466601081943,
125
+ "learning_rate": 9.933632729650212e-06,
126
+ "loss": 0.25689688,
127
+ "memory(GiB)": 100.16,
128
+ "step": 55,
129
+ "train_speed(iter/s)": 0.325846
130
+ },
131
+ {
132
+ "acc": 0.88724833,
133
+ "epoch": 0.43636363636363634,
134
+ "grad_norm": 4.364723865759911,
135
+ "learning_rate": 9.906859945633999e-06,
136
+ "loss": 0.28743353,
137
+ "memory(GiB)": 100.16,
138
+ "step": 60,
139
+ "train_speed(iter/s)": 0.328247
140
+ },
141
+ {
142
+ "acc": 0.90578156,
143
+ "epoch": 0.4727272727272727,
144
+ "grad_norm": 3.243778418144708,
145
+ "learning_rate": 9.875610446603524e-06,
146
+ "loss": 0.26308877,
147
+ "memory(GiB)": 100.16,
148
+ "step": 65,
149
+ "train_speed(iter/s)": 0.330485
150
+ },
151
+ {
152
+ "acc": 0.89676228,
153
+ "epoch": 0.509090909090909,
154
+ "grad_norm": 3.4165598224968274,
155
+ "learning_rate": 9.83991274557109e-06,
156
+ "loss": 0.26372042,
157
+ "memory(GiB)": 127.96,
158
+ "step": 70,
159
+ "train_speed(iter/s)": 0.332413
160
+ },
161
+ {
162
+ "acc": 0.9054903,
163
+ "epoch": 0.5454545454545454,
164
+ "grad_norm": 3.814636181453338,
165
+ "learning_rate": 9.7997994142265e-06,
166
+ "loss": 0.25466361,
167
+ "memory(GiB)": 127.96,
168
+ "step": 75,
169
+ "train_speed(iter/s)": 0.334379
170
+ },
171
+ {
172
+ "acc": 0.90086946,
173
+ "epoch": 0.5818181818181818,
174
+ "grad_norm": 3.9972259822599243,
175
+ "learning_rate": 9.755307053217622e-06,
176
+ "loss": 0.27588401,
177
+ "memory(GiB)": 127.96,
178
+ "step": 80,
179
+ "train_speed(iter/s)": 0.336004
180
+ },
181
+ {
182
+ "acc": 0.89949837,
183
+ "epoch": 0.6181818181818182,
184
+ "grad_norm": 5.998240972031008,
185
+ "learning_rate": 9.706476258754834e-06,
186
+ "loss": 0.25472341,
187
+ "memory(GiB)": 127.96,
188
+ "step": 85,
189
+ "train_speed(iter/s)": 0.337291
190
+ },
191
+ {
192
+ "acc": 0.88558121,
193
+ "epoch": 0.6545454545454545,
194
+ "grad_norm": 2.7186082929792574,
195
+ "learning_rate": 9.653351585569786e-06,
196
+ "loss": 0.28254557,
197
+ "memory(GiB)": 127.96,
198
+ "step": 90,
199
+ "train_speed(iter/s)": 0.337576
200
+ },
201
+ {
202
+ "acc": 0.90562687,
203
+ "epoch": 0.6909090909090909,
204
+ "grad_norm": 1.6880555029124777,
205
+ "learning_rate": 9.595981506262264e-06,
206
+ "loss": 0.25460241,
207
+ "memory(GiB)": 127.96,
208
+ "step": 95,
209
+ "train_speed(iter/s)": 0.338319
210
+ },
211
+ {
212
+ "acc": 0.90238457,
213
+ "epoch": 0.7272727272727273,
214
+ "grad_norm": 1.824873702466673,
215
+ "learning_rate": 9.534418367072303e-06,
216
+ "loss": 0.25135682,
217
+ "memory(GiB)": 127.96,
218
+ "step": 100,
219
+ "train_speed(iter/s)": 0.33935
220
+ },
221
+ {
222
+ "acc": 0.90719824,
223
+ "epoch": 0.7636363636363637,
224
+ "grad_norm": 3.0523518026276926,
225
+ "learning_rate": 9.468718340117846e-06,
226
+ "loss": 0.23181794,
227
+ "memory(GiB)": 127.96,
228
+ "step": 105,
229
+ "train_speed(iter/s)": 0.340475
230
+ },
231
+ {
232
+ "acc": 0.89296656,
233
+ "epoch": 0.8,
234
+ "grad_norm": 3.6744833597367514,
235
+ "learning_rate": 9.398941372141562e-06,
236
+ "loss": 0.27924564,
237
+ "memory(GiB)": 127.96,
238
+ "step": 110,
239
+ "train_speed(iter/s)": 0.341456
240
+ },
241
+ {
242
+ "acc": 0.89754677,
243
+ "epoch": 0.8363636363636363,
244
+ "grad_norm": 3.250222318126925,
245
+ "learning_rate": 9.325151129813582e-06,
246
+ "loss": 0.26513102,
247
+ "memory(GiB)": 127.96,
248
+ "step": 115,
249
+ "train_speed(iter/s)": 0.342153
250
+ },
251
+ {
252
+ "acc": 0.88903837,
253
+ "epoch": 0.8727272727272727,
254
+ "grad_norm": 2.376728799007849,
255
+ "learning_rate": 9.247414941640045e-06,
256
+ "loss": 0.30169072,
257
+ "memory(GiB)": 133.76,
258
+ "step": 120,
259
+ "train_speed(iter/s)": 0.342998
260
+ },
261
+ {
262
+ "acc": 0.89329395,
263
+ "epoch": 0.9090909090909091,
264
+ "grad_norm": 4.889478322316845,
265
+ "learning_rate": 9.165803736530492e-06,
266
+ "loss": 0.28302565,
267
+ "memory(GiB)": 100.58,
268
+ "step": 125,
269
+ "train_speed(iter/s)": 0.343779
270
+ },
271
+ {
272
+ "acc": 0.89977417,
273
+ "epoch": 0.9454545454545454,
274
+ "grad_norm": 2.0057917841024633,
275
+ "learning_rate": 9.080391979080116e-06,
276
+ "loss": 0.2668047,
277
+ "memory(GiB)": 100.58,
278
+ "step": 130,
279
+ "train_speed(iter/s)": 0.344351
280
+ },
281
+ {
282
+ "acc": 0.90148487,
283
+ "epoch": 0.9818181818181818,
284
+ "grad_norm": 2.470715179920895,
285
+ "learning_rate": 8.991257601625973e-06,
286
+ "loss": 0.25751991,
287
+ "memory(GiB)": 100.58,
288
+ "step": 135,
289
+ "train_speed(iter/s)": 0.345171
290
+ },
291
+ {
292
+ "epoch": 1.0,
293
+ "eval_acc": 0.9078246620237608,
294
+ "eval_loss": 0.2361508309841156,
295
+ "eval_runtime": 10.278,
296
+ "eval_samples_per_second": 11.286,
297
+ "eval_steps_per_second": 1.459,
298
+ "step": 138
299
+ },
300
+ {
301
+ "acc": 0.8134038,
302
+ "epoch": 1.0145454545454546,
303
+ "grad_norm": 1.9385369249323439,
304
+ "learning_rate": 8.917324354080927e-06,
305
+ "loss": 0.254459,
306
+ "memory(GiB)": 100.58,
307
+ "step": 140,
308
+ "train_speed(iter/s)": 0.309598
309
+ },
310
+ {
311
+ "acc": 0.90728855,
312
+ "epoch": 1.050909090909091,
313
+ "grad_norm": 76.54794008048425,
314
+ "learning_rate": 8.82169644486897e-06,
315
+ "loss": 0.23623853,
316
+ "memory(GiB)": 100.58,
317
+ "step": 145,
318
+ "train_speed(iter/s)": 0.311044
319
+ },
320
+ {
321
+ "acc": 0.91997566,
322
+ "epoch": 1.0872727272727274,
323
+ "grad_norm": 1.727673298537959,
324
+ "learning_rate": 8.722581957483633e-06,
325
+ "loss": 0.21817675,
326
+ "memory(GiB)": 100.58,
327
+ "step": 150,
328
+ "train_speed(iter/s)": 0.31275
329
+ },
330
+ {
331
+ "acc": 0.91184559,
332
+ "epoch": 1.1236363636363635,
333
+ "grad_norm": 2.4370845690665974,
334
+ "learning_rate": 8.620071327057833e-06,
335
+ "loss": 0.22411692,
336
+ "memory(GiB)": 100.58,
337
+ "step": 155,
338
+ "train_speed(iter/s)": 0.314364
339
+ },
340
+ {
341
+ "acc": 0.91105995,
342
+ "epoch": 1.16,
343
+ "grad_norm": 4.474578962221848,
344
+ "learning_rate": 8.514258087470745e-06,
345
+ "loss": 0.22455444,
346
+ "memory(GiB)": 100.58,
347
+ "step": 160,
348
+ "train_speed(iter/s)": 0.315941
349
+ },
350
+ {
351
+ "acc": 0.92596989,
352
+ "epoch": 1.1963636363636363,
353
+ "grad_norm": 2.27714865436083,
354
+ "learning_rate": 8.405238786004592e-06,
355
+ "loss": 0.19618599,
356
+ "memory(GiB)": 100.58,
357
+ "step": 165,
358
+ "train_speed(iter/s)": 0.317423
359
+ },
360
+ {
361
+ "acc": 0.91807003,
362
+ "epoch": 1.2327272727272727,
363
+ "grad_norm": 3.476526282944283,
364
+ "learning_rate": 8.293112895251915e-06,
365
+ "loss": 0.21812358,
366
+ "memory(GiB)": 100.58,
367
+ "step": 170,
368
+ "train_speed(iter/s)": 0.318837
369
+ },
370
+ {
371
+ "acc": 0.91757879,
372
+ "epoch": 1.269090909090909,
373
+ "grad_norm": 2.812345046742586,
374
+ "learning_rate": 8.177982722353686e-06,
375
+ "loss": 0.20932765,
376
+ "memory(GiB)": 100.58,
377
+ "step": 175,
378
+ "train_speed(iter/s)": 0.319897
379
+ },
380
+ {
381
+ "acc": 0.9130724,
382
+ "epoch": 1.3054545454545454,
383
+ "grad_norm": 1.909403498812979,
384
+ "learning_rate": 8.059953315651102e-06,
385
+ "loss": 0.22100675,
386
+ "memory(GiB)": 100.58,
387
+ "step": 180,
388
+ "train_speed(iter/s)": 0.320821
389
+ },
390
+ {
391
+ "acc": 0.91083689,
392
+ "epoch": 1.3418181818181818,
393
+ "grad_norm": 3.7534483781265853,
394
+ "learning_rate": 7.93913236883622e-06,
395
+ "loss": 0.22075479,
396
+ "memory(GiB)": 100.58,
397
+ "step": 185,
398
+ "train_speed(iter/s)": 0.321724
399
+ },
400
+ {
401
+ "acc": 0.90749474,
402
+ "epoch": 1.3781818181818182,
403
+ "grad_norm": 3.0657460772043805,
404
+ "learning_rate": 7.815630122688893e-06,
405
+ "loss": 0.22630196,
406
+ "memory(GiB)": 100.58,
407
+ "step": 190,
408
+ "train_speed(iter/s)": 0.3226
409
+ },
410
+ {
411
+ "acc": 0.92584915,
412
+ "epoch": 1.4145454545454546,
413
+ "grad_norm": 5.821099128946982,
414
+ "learning_rate": 7.689559264489661e-06,
415
+ "loss": 0.21087196,
416
+ "memory(GiB)": 100.58,
417
+ "step": 195,
418
+ "train_speed(iter/s)": 0.32333
419
+ },
420
+ {
421
+ "acc": 0.90973835,
422
+ "epoch": 1.450909090909091,
423
+ "grad_norm": 1.830285233435649,
424
+ "learning_rate": 7.5610348252003814e-06,
425
+ "loss": 0.24081864,
426
+ "memory(GiB)": 100.58,
427
+ "step": 200,
428
+ "train_speed(iter/s)": 0.323755
429
+ },
430
+ {
431
+ "acc": 0.91908627,
432
+ "epoch": 1.4872727272727273,
433
+ "grad_norm": 3.46434543645635,
434
+ "learning_rate": 7.43017407450641e-06,
435
+ "loss": 0.21430855,
436
+ "memory(GiB)": 100.58,
437
+ "step": 205,
438
+ "train_speed(iter/s)": 0.324304
439
+ },
440
+ {
441
+ "acc": 0.90855217,
442
+ "epoch": 1.5236363636363637,
443
+ "grad_norm": 1.6445934060533671,
444
+ "learning_rate": 7.2970964138161006e-06,
445
+ "loss": 0.2204694,
446
+ "memory(GiB)": 100.58,
447
+ "step": 210,
448
+ "train_speed(iter/s)": 0.325137
449
+ },
450
+ {
451
+ "acc": 0.9202652,
452
+ "epoch": 1.56,
453
+ "grad_norm": 2.685739587728944,
454
+ "learning_rate": 7.161923267315262e-06,
455
+ "loss": 0.20784543,
456
+ "memory(GiB)": 100.58,
457
+ "step": 215,
458
+ "train_speed(iter/s)": 0.325877
459
+ },
460
+ {
461
+ "acc": 0.92430801,
462
+ "epoch": 1.5963636363636362,
463
+ "grad_norm": 3.4665236755524202,
464
+ "learning_rate": 7.0247779711759566e-06,
465
+ "loss": 0.2091445,
466
+ "memory(GiB)": 100.58,
467
+ "step": 220,
468
+ "train_speed(iter/s)": 0.326598
469
+ },
470
+ {
471
+ "acc": 0.91858587,
472
+ "epoch": 1.6327272727272728,
473
+ "grad_norm": 3.0400419237318674,
474
+ "learning_rate": 6.885785661020759e-06,
475
+ "loss": 0.22234173,
476
+ "memory(GiB)": 100.58,
477
+ "step": 225,
478
+ "train_speed(iter/s)": 0.32754
479
+ },
480
+ {
481
+ "acc": 0.91896229,
482
+ "epoch": 1.669090909090909,
483
+ "grad_norm": 2.50023791606214,
484
+ "learning_rate": 6.7450731577451255e-06,
485
+ "loss": 0.20558548,
486
+ "memory(GiB)": 100.58,
487
+ "step": 230,
488
+ "train_speed(iter/s)": 0.328407
489
+ },
490
+ {
491
+ "acc": 0.92307997,
492
+ "epoch": 1.7054545454545456,
493
+ "grad_norm": 2.789509587118081,
494
+ "learning_rate": 6.602768851802077e-06,
495
+ "loss": 0.21382501,
496
+ "memory(GiB)": 100.58,
497
+ "step": 235,
498
+ "train_speed(iter/s)": 0.329247
499
+ },
500
+ {
501
+ "acc": 0.91400127,
502
+ "epoch": 1.7418181818181817,
503
+ "grad_norm": 2.3889266426439173,
504
+ "learning_rate": 6.45900258605477e-06,
505
+ "loss": 0.21889751,
506
+ "memory(GiB)": 100.58,
507
+ "step": 240,
508
+ "train_speed(iter/s)": 0.330086
509
+ },
510
+ {
511
+ "acc": 0.90683708,
512
+ "epoch": 1.7781818181818183,
513
+ "grad_norm": 3.3107240552086465,
514
+ "learning_rate": 6.313905537303837e-06,
515
+ "loss": 0.21690502,
516
+ "memory(GiB)": 100.58,
517
+ "step": 245,
518
+ "train_speed(iter/s)": 0.330898
519
+ },
520
+ {
521
+ "acc": 0.91603336,
522
+ "epoch": 1.8145454545454545,
523
+ "grad_norm": 2.8852486239120547,
524
+ "learning_rate": 6.167610096597601e-06,
525
+ "loss": 0.2154119,
526
+ "memory(GiB)": 100.58,
527
+ "step": 250,
528
+ "train_speed(iter/s)": 0.331673
529
+ },
530
+ {
531
+ "acc": 0.91818409,
532
+ "epoch": 1.850909090909091,
533
+ "grad_norm": 2.0440810660323585,
534
+ "learning_rate": 6.020249748434384e-06,
535
+ "loss": 0.21951377,
536
+ "memory(GiB)": 100.58,
537
+ "step": 255,
538
+ "train_speed(iter/s)": 0.332356
539
+ },
540
+ {
541
+ "acc": 0.90970173,
542
+ "epoch": 1.8872727272727272,
543
+ "grad_norm": 3.8117037313040574,
544
+ "learning_rate": 5.871958948967106e-06,
545
+ "loss": 0.23594971,
546
+ "memory(GiB)": 100.58,
547
+ "step": 260,
548
+ "train_speed(iter/s)": 0.33293
549
+ },
550
+ {
551
+ "acc": 0.92123165,
552
+ "epoch": 1.9236363636363636,
553
+ "grad_norm": 3.4855685769436375,
554
+ "learning_rate": 5.722873003321322e-06,
555
+ "loss": 0.21117101,
556
+ "memory(GiB)": 100.58,
557
+ "step": 265,
558
+ "train_speed(iter/s)": 0.333662
559
+ },
560
+ {
561
+ "acc": 0.91777382,
562
+ "epoch": 1.96,
563
+ "grad_norm": 2.497000906964384,
564
+ "learning_rate": 5.573127942138622e-06,
565
+ "loss": 0.21624155,
566
+ "memory(GiB)": 100.58,
567
+ "step": 270,
568
+ "train_speed(iter/s)": 0.334225
569
+ },
570
+ {
571
+ "acc": 0.9166666,
572
+ "epoch": 1.9963636363636363,
573
+ "grad_norm": 4.782654736901845,
574
+ "learning_rate": 5.422860397458064e-06,
575
+ "loss": 0.21392875,
576
+ "memory(GiB)": 100.58,
577
+ "step": 275,
578
+ "train_speed(iter/s)": 0.334671
579
+ },
580
+ {
581
+ "epoch": 2.0,
582
+ "eval_acc": 0.9098730028676771,
583
+ "eval_loss": 0.2191523164510727,
584
+ "eval_runtime": 10.1618,
585
+ "eval_samples_per_second": 11.415,
586
+ "eval_steps_per_second": 1.476,
587
+ "step": 276
588
+ },
589
+ {
590
+ "acc": 0.84443541,
591
+ "epoch": 2.0290909090909093,
592
+ "grad_norm": 3.015403395241152,
593
+ "learning_rate": 5.27220747804885e-06,
594
+ "loss": 0.17099829,
595
+ "memory(GiB)": 100.58,
596
+ "step": 280,
597
+ "train_speed(iter/s)": 0.317633
598
+ },
599
+ {
600
+ "acc": 0.93253222,
601
+ "epoch": 2.0654545454545454,
602
+ "grad_norm": 2.167435558475328,
603
+ "learning_rate": 5.121306644308045e-06,
604
+ "loss": 0.18818057,
605
+ "memory(GiB)": 100.58,
606
+ "step": 285,
607
+ "train_speed(iter/s)": 0.3185
608
+ },
609
+ {
610
+ "acc": 0.94647446,
611
+ "epoch": 2.101818181818182,
612
+ "grad_norm": 2.1487311628542898,
613
+ "learning_rate": 4.9702955828374385e-06,
614
+ "loss": 0.15134431,
615
+ "memory(GiB)": 100.58,
616
+ "step": 290,
617
+ "train_speed(iter/s)": 0.319277
618
+ },
619
+ {
620
+ "acc": 0.93036728,
621
+ "epoch": 2.138181818181818,
622
+ "grad_norm": 4.174051904681519,
623
+ "learning_rate": 4.8193120808140185e-06,
624
+ "loss": 0.16832316,
625
+ "memory(GiB)": 100.58,
626
+ "step": 295,
627
+ "train_speed(iter/s)": 0.320077
628
+ },
629
+ {
630
+ "acc": 0.93621769,
631
+ "epoch": 2.174545454545455,
632
+ "grad_norm": 2.3866390406657896,
633
+ "learning_rate": 4.668493900268684e-06,
634
+ "loss": 0.16947901,
635
+ "memory(GiB)": 100.58,
636
+ "step": 300,
637
+ "train_speed(iter/s)": 0.320854
638
+ },
639
+ {
640
+ "acc": 0.93184824,
641
+ "epoch": 2.210909090909091,
642
+ "grad_norm": 2.7745369730901595,
643
+ "learning_rate": 4.517978652387882e-06,
644
+ "loss": 0.16975009,
645
+ "memory(GiB)": 100.58,
646
+ "step": 305,
647
+ "train_speed(iter/s)": 0.321626
648
+ },
649
+ {
650
+ "acc": 0.93711929,
651
+ "epoch": 2.247272727272727,
652
+ "grad_norm": 4.606104787695004,
653
+ "learning_rate": 4.367903671952906e-06,
654
+ "loss": 0.16885712,
655
+ "memory(GiB)": 100.58,
656
+ "step": 310,
657
+ "train_speed(iter/s)": 0.322203
658
+ },
659
+ {
660
+ "acc": 0.93099174,
661
+ "epoch": 2.2836363636363637,
662
+ "grad_norm": 8.944877147631175,
663
+ "learning_rate": 4.218405892031366e-06,
664
+ "loss": 0.17090337,
665
+ "memory(GiB)": 100.58,
666
+ "step": 315,
667
+ "train_speed(iter/s)": 0.322833
668
+ },
669
+ {
670
+ "acc": 0.93137035,
671
+ "epoch": 2.32,
672
+ "grad_norm": 4.336121777570645,
673
+ "learning_rate": 4.069621719035229e-06,
674
+ "loss": 0.1658249,
675
+ "memory(GiB)": 100.58,
676
+ "step": 320,
677
+ "train_speed(iter/s)": 0.323508
678
+ },
679
+ {
680
+ "acc": 0.9393259,
681
+ "epoch": 2.3563636363636364,
682
+ "grad_norm": 6.921537975970479,
683
+ "learning_rate": 3.921686908259354e-06,
684
+ "loss": 0.15576041,
685
+ "memory(GiB)": 100.58,
686
+ "step": 325,
687
+ "train_speed(iter/s)": 0.324182
688
+ },
689
+ {
690
+ "acc": 0.93962708,
691
+ "epoch": 2.3927272727272726,
692
+ "grad_norm": 3.5886891547630877,
693
+ "learning_rate": 3.7747364400141726e-06,
694
+ "loss": 0.16867373,
695
+ "memory(GiB)": 100.58,
696
+ "step": 330,
697
+ "train_speed(iter/s)": 0.324849
698
+ },
699
+ {
700
+ "acc": 0.93609505,
701
+ "epoch": 2.429090909090909,
702
+ "grad_norm": 2.686999433312404,
703
+ "learning_rate": 3.6289043964654526e-06,
704
+ "loss": 0.15810946,
705
+ "memory(GiB)": 100.58,
706
+ "step": 335,
707
+ "train_speed(iter/s)": 0.325493
708
+ },
709
+ {
710
+ "acc": 0.92649899,
711
+ "epoch": 2.4654545454545453,
712
+ "grad_norm": 2.591872854237207,
713
+ "learning_rate": 3.484323839293575e-06,
714
+ "loss": 0.17918372,
715
+ "memory(GiB)": 100.58,
716
+ "step": 340,
717
+ "train_speed(iter/s)": 0.326123
718
+ },
719
+ {
720
+ "acc": 0.93626881,
721
+ "epoch": 2.501818181818182,
722
+ "grad_norm": 2.5738296672570233,
723
+ "learning_rate": 3.341126688283922e-06,
724
+ "loss": 0.16855428,
725
+ "memory(GiB)": 100.58,
726
+ "step": 345,
727
+ "train_speed(iter/s)": 0.326743
728
+ },
729
+ {
730
+ "acc": 0.93825417,
731
+ "epoch": 2.538181818181818,
732
+ "grad_norm": 2.7529925608546466,
733
+ "learning_rate": 3.19944360095919e-06,
734
+ "loss": 0.16165339,
735
+ "memory(GiB)": 100.58,
736
+ "step": 350,
737
+ "train_speed(iter/s)": 0.327363
738
+ },
739
+ {
740
+ "acc": 0.94702225,
741
+ "epoch": 2.5745454545454547,
742
+ "grad_norm": 2.9545927202945315,
743
+ "learning_rate": 3.059403853363393e-06,
744
+ "loss": 0.14523516,
745
+ "memory(GiB)": 100.58,
746
+ "step": 355,
747
+ "train_speed(iter/s)": 0.327926
748
+ },
749
+ {
750
+ "acc": 0.94346981,
751
+ "epoch": 2.610909090909091,
752
+ "grad_norm": 4.047109124196383,
753
+ "learning_rate": 2.9211352221063987e-06,
754
+ "loss": 0.14715908,
755
+ "memory(GiB)": 100.58,
756
+ "step": 360,
757
+ "train_speed(iter/s)": 0.328285
758
+ },
759
+ {
760
+ "acc": 0.94318542,
761
+ "epoch": 2.6472727272727274,
762
+ "grad_norm": 2.3923230638690143,
763
+ "learning_rate": 2.7847638677765936e-06,
764
+ "loss": 0.1494684,
765
+ "memory(GiB)": 100.58,
766
+ "step": 365,
767
+ "train_speed(iter/s)": 0.328722
768
+ },
769
+ {
770
+ "acc": 0.95623245,
771
+ "epoch": 2.6836363636363636,
772
+ "grad_norm": 2.457260493406828,
773
+ "learning_rate": 2.650414219828032e-06,
774
+ "loss": 0.11759402,
775
+ "memory(GiB)": 100.58,
776
+ "step": 370,
777
+ "train_speed(iter/s)": 0.329264
778
+ },
779
+ {
780
+ "acc": 0.94435921,
781
+ "epoch": 2.7199999999999998,
782
+ "grad_norm": 1.5322367904545142,
783
+ "learning_rate": 2.5182088630471517e-06,
784
+ "loss": 0.13577256,
785
+ "memory(GiB)": 100.58,
786
+ "step": 375,
787
+ "train_speed(iter/s)": 0.329788
788
+ },
789
+ {
790
+ "acc": 0.94585953,
791
+ "epoch": 2.7563636363636363,
792
+ "grad_norm": 2.8650025435958666,
793
+ "learning_rate": 2.388268425702614e-06,
794
+ "loss": 0.14076474,
795
+ "memory(GiB)": 100.58,
796
+ "step": 380,
797
+ "train_speed(iter/s)": 0.330302
798
+ },
799
+ {
800
+ "acc": 0.9413455,
801
+ "epoch": 2.792727272727273,
802
+ "grad_norm": 4.510750432829035,
803
+ "learning_rate": 2.2607114694803263e-06,
804
+ "loss": 0.1642381,
805
+ "memory(GiB)": 100.58,
806
+ "step": 385,
807
+ "train_speed(iter/s)": 0.330731
808
+ },
809
+ {
810
+ "acc": 0.93006382,
811
+ "epoch": 2.829090909090909,
812
+ "grad_norm": 2.908591189518448,
813
+ "learning_rate": 2.1356543813040863e-06,
814
+ "loss": 0.17094066,
815
+ "memory(GiB)": 100.58,
816
+ "step": 390,
817
+ "train_speed(iter/s)": 0.331119
818
+ },
819
+ {
820
+ "acc": 0.94227448,
821
+ "epoch": 2.8654545454545453,
822
+ "grad_norm": 2.331626905910975,
823
+ "learning_rate": 2.0132112671405244e-06,
824
+ "loss": 0.14904225,
825
+ "memory(GiB)": 100.58,
826
+ "step": 395,
827
+ "train_speed(iter/s)": 0.331532
828
+ },
829
+ {
830
+ "acc": 0.93090382,
831
+ "epoch": 2.901818181818182,
832
+ "grad_norm": 4.223665768837086,
833
+ "learning_rate": 1.8934938478853108e-06,
834
+ "loss": 0.17768097,
835
+ "memory(GiB)": 100.58,
836
+ "step": 400,
837
+ "train_speed(iter/s)": 0.331963
838
+ },
839
+ {
840
+ "acc": 0.93722563,
841
+ "epoch": 2.9381818181818184,
842
+ "grad_norm": 2.7247775486261734,
843
+ "learning_rate": 1.7766113574255145e-06,
844
+ "loss": 0.15059752,
845
+ "memory(GiB)": 100.58,
846
+ "step": 405,
847
+ "train_speed(iter/s)": 0.332266
848
+ },
849
+ {
850
+ "acc": 0.94374504,
851
+ "epoch": 2.9745454545454546,
852
+ "grad_norm": 2.9951618135706055,
853
+ "learning_rate": 1.6626704429712411e-06,
854
+ "loss": 0.14953468,
855
+ "memory(GiB)": 100.58,
856
+ "step": 410,
857
+ "train_speed(iter/s)": 0.332599
858
+ },
859
+ {
860
+ "epoch": 3.0,
861
+ "eval_acc": 0.9192953707496927,
862
+ "eval_loss": 0.21225064992904663,
863
+ "eval_runtime": 9.5239,
864
+ "eval_samples_per_second": 12.18,
865
+ "eval_steps_per_second": 1.575,
866
+ "step": 414
867
+ }
868
+ ],
869
+ "logging_steps": 5,
870
+ "max_steps": 548,
871
+ "num_input_tokens_seen": 0,
872
+ "num_train_epochs": 4,
873
+ "save_steps": 200,
874
+ "stateful_callbacks": {
875
+ "TrainerControl": {
876
+ "args": {
877
+ "should_epoch_stop": false,
878
+ "should_evaluate": false,
879
+ "should_log": false,
880
+ "should_save": true,
881
+ "should_training_stop": false
882
+ },
883
+ "attributes": {}
884
+ }
885
+ },
886
+ "total_flos": 2.931788793840435e+16,
887
+ "train_batch_size": 1,
888
+ "trial_name": null,
889
+ "trial_params": null
890
+ }
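A short sketch of inspecting the trainer state logged above; it assumes trainer_state.json is read from this checkpoint directory.

import json

with open("trainer_state.json") as f:  # assumed path inside the checkpoint folder
    state = json.load(f)

# Best checkpoint as recorded by the trainer: eval_loss 0.21225065 at checkpoint-414.
print(state["best_metric"], state["best_model_checkpoint"])

# Per-epoch evaluation entries are the log_history items that carry eval_loss.
for e in (x for x in state["log_history"] if "eval_loss" in x):
    print(f'epoch {e["epoch"]}: eval_loss={e["eval_loss"]:.4f}, eval_acc={e["eval_acc"]:.4f}')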
training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff027f5be53ec7518dcbefa171fe5337bc61486082a24c11c0c08315322b5a87
3
+ size 10680
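training_args.bin is stored as a Git LFS pointer to a pickled TrainingArguments object. A hedged sketch of loading it for inspection, assuming the pickle is trusted and torch/transformers are installed:

import torch

# weights_only=False because the file pickles a TrainingArguments instance, not plain tensors.
args = torch.load("training_args.bin", map_location="cpu", weights_only=False)
print(type(args).__name__)
print(args.learning_rate, args.num_train_epochs, args.per_device_train_batch_size)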
vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
zero_to_fp32.py ADDED
@@ -0,0 +1,760 @@
1
+ #!/usr/bin/env python
2
+
3
+ # Copyright (c) Microsoft Corporation.
4
+ # SPDX-License-Identifier: Apache-2.0
5
+
6
+ # DeepSpeed Team
7
+
8
+ # This script extracts fp32 consolidated weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
9
+ # copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
10
+ # the future. Once extracted, the weights don't require DeepSpeed and can be used in any
11
+ # application.
12
+ #
13
+ # example:
14
+ # python zero_to_fp32.py . output_dir/
15
+ # or
16
+ # python zero_to_fp32.py . output_dir/ --safe_serialization
17
+
18
+ import argparse
19
+ import torch
20
+ import glob
21
+ import math
22
+ import os
23
+ import re
24
+ import gc
25
+ import json
26
+ import numpy as np
27
+ from tqdm import tqdm
28
+ from collections import OrderedDict
29
+ from dataclasses import dataclass
30
+
31
+ # while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
32
+ # DeepSpeed data structures it has to be available in the current python environment.
33
+ from deepspeed.utils import logger
34
+ from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
35
+ FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
36
+ FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
37
+
38
+
39
+ @dataclass
40
+ class zero_model_state:
41
+ buffers: dict()
42
+ param_shapes: dict()
43
+ shared_params: list
44
+ ds_version: int
45
+ frozen_param_shapes: dict()
46
+ frozen_param_fragments: dict()
47
+
48
+
49
+ debug = 0
50
+
51
+ # load to cpu
52
+ device = torch.device('cpu')
53
+
54
+
55
+ def atoi(text):
56
+ return int(text) if text.isdigit() else text
57
+
58
+
59
+ def natural_keys(text):
60
+ '''
61
+ alist.sort(key=natural_keys) sorts in human order
62
+ http://nedbatchelder.com/blog/200712/human_sorting.html
63
+ (See Toothy's implementation in the comments)
64
+ '''
65
+ return [atoi(c) for c in re.split(r'(\d+)', text)]
66
+
67
+
68
+ def get_model_state_file(checkpoint_dir, zero_stage):
69
+ if not os.path.isdir(checkpoint_dir):
70
+ raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
71
+
72
+ # there should be only one file
73
+ if zero_stage <= 2:
74
+ file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
75
+ elif zero_stage == 3:
76
+ file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
77
+
78
+ if not os.path.exists(file):
79
+ raise FileNotFoundError(f"can't find model states file at '{file}'")
80
+
81
+ return file
82
+
83
+
84
+ def get_checkpoint_files(checkpoint_dir, glob_pattern):
85
+ # XXX: need to test that this simple glob rule works for multi-node setup too
86
+ ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
87
+
88
+ if len(ckpt_files) == 0:
89
+ raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
90
+
91
+ return ckpt_files
92
+
93
+
94
+ def get_optim_files(checkpoint_dir):
95
+ return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
96
+
97
+
98
+ def get_model_state_files(checkpoint_dir):
99
+ return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
100
+
101
+
102
+ def parse_model_states(files):
103
+ zero_model_states = []
104
+ for file in files:
105
+ state_dict = torch.load(file, map_location=device, weights_only=False)
106
+
107
+ if BUFFER_NAMES not in state_dict:
108
+ raise ValueError(f"{file} is not a model state checkpoint")
109
+ buffer_names = state_dict[BUFFER_NAMES]
110
+ if debug:
111
+ print("Found buffers:", buffer_names)
112
+
113
+ # recover just the buffers while restoring them to fp32 if they were saved in fp16
114
+ buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
115
+ param_shapes = state_dict[PARAM_SHAPES]
116
+
117
+ # collect parameters that are included in param_shapes
118
+ param_names = []
119
+ for s in param_shapes:
120
+ for name in s.keys():
121
+ param_names.append(name)
122
+
123
+ # update with frozen parameters
124
+ frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
125
+ if frozen_param_shapes is not None:
126
+ if debug:
127
+ print(f"Found frozen_param_shapes: {frozen_param_shapes}")
128
+ param_names += list(frozen_param_shapes.keys())
129
+
130
+ # handle shared params
131
+ shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
132
+
133
+ ds_version = state_dict.get(DS_VERSION, None)
134
+
135
+ frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
136
+
137
+ z_model_state = zero_model_state(buffers=buffers,
138
+ param_shapes=param_shapes,
139
+ shared_params=shared_params,
140
+ ds_version=ds_version,
141
+ frozen_param_shapes=frozen_param_shapes,
142
+ frozen_param_fragments=frozen_param_fragments)
143
+ zero_model_states.append(z_model_state)
144
+
145
+ return zero_model_states
146
+
147
+
148
+ def parse_optim_states(files, ds_checkpoint_dir):
149
+ total_files = len(files)
150
+ state_dicts = []
151
+ for f in tqdm(files, desc='Loading checkpoint shards'):
152
+ state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False)
153
+ # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
154
+ # and also handle the case where it was already removed by another helper script
155
+ state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
156
+ state_dicts.append(state_dict)
157
+
158
+ if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
159
+ raise ValueError(f"{files[0]} is not a zero checkpoint")
160
+ zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
161
+ world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
162
+
163
+ # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
164
+ # parameters can be different from data parallelism for non-expert parameters. So we can just
165
+ # use the max of the partition_count to get the dp world_size.
166
+
167
+ if type(world_size) is list:
168
+ world_size = max(world_size)
169
+
170
+ if world_size != total_files:
171
+ raise ValueError(
172
+ f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
173
+ "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
174
+ )
175
+
176
+ # the groups are named differently in each stage
177
+ if zero_stage <= 2:
178
+ fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
179
+ elif zero_stage == 3:
180
+ fp32_groups_key = FP32_FLAT_GROUPS
181
+ else:
182
+ raise ValueError(f"unknown zero stage {zero_stage}")
183
+
184
+ fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
185
+ return zero_stage, world_size, fp32_flat_groups
186
+
187
+
188
+ def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
189
+ """
190
+ Returns fp32 state_dict reconstructed from ds checkpoint
191
+
192
+ Args:
193
+ - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
194
+
195
+ """
196
+ print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
197
+
198
+ optim_files = get_optim_files(ds_checkpoint_dir)
199
+ zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
200
+ print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
201
+
202
+ model_files = get_model_state_files(ds_checkpoint_dir)
203
+
204
+ zero_model_states = parse_model_states(model_files)
205
+ print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
206
+
207
+ if zero_stage <= 2:
208
+ return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
209
+ exclude_frozen_parameters)
210
+ elif zero_stage == 3:
211
+ return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
212
+ exclude_frozen_parameters)
213
+
214
+
215
+ def _zero2_merge_frozen_params(state_dict, zero_model_states):
216
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
217
+ return
218
+
219
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
220
+ frozen_param_fragments = zero_model_states[0].frozen_param_fragments
221
+
222
+ if debug:
223
+ num_elem = sum(s.numel() for s in frozen_param_shapes.values())
224
+ print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
225
+
226
+ wanted_params = len(frozen_param_shapes)
227
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
228
+ avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
229
+ print(f'Frozen params: Have {avail_numel} numels to process.')
230
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
231
+
232
+ total_params = 0
233
+ total_numel = 0
234
+ for name, shape in frozen_param_shapes.items():
235
+ total_params += 1
236
+ unpartitioned_numel = shape.numel()
237
+ total_numel += unpartitioned_numel
238
+
239
+ state_dict[name] = frozen_param_fragments[name]
240
+
241
+ if debug:
242
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
243
+
244
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
245
+
246
+
247
+ def _has_callable(obj, fn):
248
+ attr = getattr(obj, fn, None)
249
+ return callable(attr)
250
+
251
+
252
+ def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
253
+ param_shapes = zero_model_states[0].param_shapes
254
+
255
+ # Reconstruction protocol:
256
+ #
257
+ # XXX: document this
258
+
259
+ if debug:
260
+ for i in range(world_size):
261
+ for j in range(len(fp32_flat_groups[0])):
262
+ print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
263
+
264
+ # XXX: memory usage doubles here (zero2)
265
+ num_param_groups = len(fp32_flat_groups[0])
266
+ merged_single_partition_of_fp32_groups = []
267
+ for i in range(num_param_groups):
268
+ merged_partitions = [sd[i] for sd in fp32_flat_groups]
269
+ full_single_fp32_vector = torch.cat(merged_partitions, 0)
270
+ merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
271
+ avail_numel = sum(
272
+ [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
273
+
274
+ if debug:
275
+ wanted_params = sum([len(shapes) for shapes in param_shapes])
276
+ wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
277
+ # not asserting if there is a mismatch due to possible padding
278
+ print(f"Have {avail_numel} numels to process.")
279
+ print(f"Need {wanted_numel} numels in {wanted_params} params.")
280
+
281
+ # params
282
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
283
+ # out-of-core computing solution
284
+ total_numel = 0
285
+ total_params = 0
286
+ for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
287
+ offset = 0
288
+ avail_numel = full_single_fp32_vector.numel()
289
+ for name, shape in shapes.items():
290
+
291
+ unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
292
+ total_numel += unpartitioned_numel
293
+ total_params += 1
294
+
295
+ if debug:
296
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
297
+ state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
298
+ offset += unpartitioned_numel
299
+
300
+ # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
301
+ # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
302
+ # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
303
+ # live optimizer object, so we are checking that the numbers are within the right range
304
+ align_to = 2 * world_size
305
+
306
+ def zero2_align(x):
307
+ return align_to * math.ceil(x / align_to)
308
+
309
+ if debug:
310
+ print(f"original offset={offset}, avail_numel={avail_numel}")
311
+
312
+ offset = zero2_align(offset)
313
+ avail_numel = zero2_align(avail_numel)
314
+
315
+ if debug:
316
+ print(f"aligned offset={offset}, avail_numel={avail_numel}")
317
+
318
+ # Sanity check
319
+ if offset != avail_numel:
320
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
321
+
322
+ print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
323
+
324
+
325
+ def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
326
+ exclude_frozen_parameters):
327
+ state_dict = OrderedDict()
328
+
329
+ # buffers
330
+ buffers = zero_model_states[0].buffers
331
+ state_dict.update(buffers)
332
+ if debug:
333
+ print(f"added {len(buffers)} buffers")
334
+
335
+ if not exclude_frozen_parameters:
336
+ _zero2_merge_frozen_params(state_dict, zero_model_states)
337
+
338
+ _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
339
+
340
+ # recover shared parameters
341
+ for pair in zero_model_states[0].shared_params:
342
+ if pair[1] in state_dict:
343
+ state_dict[pair[0]] = state_dict[pair[1]]
344
+
345
+ return state_dict
346
+
347
+
348
+ def zero3_partitioned_param_info(unpartitioned_numel, world_size):
349
+ remainder = unpartitioned_numel % world_size
350
+ padding_numel = (world_size - remainder) if remainder else 0
351
+ partitioned_numel = math.ceil(unpartitioned_numel / world_size)
352
+ return partitioned_numel, padding_numel
353
+
354
+
355
+ def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
356
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
357
+ return
358
+
359
+ if debug:
360
+ for i in range(world_size):
361
+ num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
362
+ print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
363
+
364
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
365
+ wanted_params = len(frozen_param_shapes)
366
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
367
+ avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
368
+ print(f'Frozen params: Have {avail_numel} numels to process.')
369
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
370
+
371
+ total_params = 0
372
+ total_numel = 0
373
+ for name, shape in zero_model_states[0].frozen_param_shapes.items():
374
+ total_params += 1
375
+ unpartitioned_numel = shape.numel()
376
+ total_numel += unpartitioned_numel
377
+
378
+ param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
379
+ state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
380
+
381
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
382
+
383
+ if debug:
384
+ print(
385
+ f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
386
+ )
387
+
388
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
389
+
390
+
391
+ class GatheredTensor:
392
+ """
393
+ A pseudo tensor that collects partitioned weights.
394
+ It is more memory efficient when there are multiple groups.
395
+ """
396
+
397
+ def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
398
+ self.flat_groups = flat_groups
399
+ self.flat_groups_offset = flat_groups_offset
400
+ self.offset = offset
401
+ self.partitioned_numel = partitioned_numel
402
+ self.shape = shape
403
+ self.dtype = self.flat_groups[0][0].dtype
404
+
405
+ def contiguous(self):
406
+ """
407
+ Merge partitioned weights from flat_groups into a single tensor.
408
+ """
409
+ end_idx = self.offset + self.partitioned_numel
410
+ world_size = len(self.flat_groups)
411
+ pad_flat_param_chunks = []
412
+
413
+ for rank_i in range(world_size):
414
+ # for each rank, we need to collect weights from related group/groups
415
+ flat_groups_at_rank_i = self.flat_groups[rank_i]
416
+ start_group_id = None
417
+ end_group_id = None
418
+ for group_id in range(len(self.flat_groups_offset)):
419
+ if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]:
420
+ start_group_id = group_id
421
+ if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]:
422
+ end_group_id = group_id
423
+ break
424
+ # collect weights from related group/groups
425
+ for group_id in range(start_group_id, end_group_id + 1):
426
+ flat_tensor = flat_groups_at_rank_i[group_id]
427
+ start_offset = self.offset - self.flat_groups_offset[group_id]
428
+ end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id]
429
+ pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])
430
+
431
+ # collect weights from all ranks
432
+ pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
433
+ param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
434
+ return param
435
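+ # Illustrative usage (an assumption about how callers consume this class):
+ # values placed in the state_dict by _zero3_merge_trainable_params are
+ # GatheredTensor instances, so nothing is materialized until .contiguous()
+ # is called, e.g.
+ #   lazy = state_dict["transformer.wte.weight"]  # hypothetical parameter name
+ #   dense = lazy.contiguous()                    # concatenates the rank shards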
+
436
+
437
+ def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
438
+ param_shapes = zero_model_states[0].param_shapes
439
+ avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size
440
+
441
+ # Reconstruction protocol: For zero3 we need to zip the partitions together at the boundary of each
442
+ # param, re-consolidating each param, while dealing with padding if any
443
+
444
+ # merge list of dicts, preserving order
445
+ param_shapes = {k: v for d in param_shapes for k, v in d.items()}
446
+
447
+ if debug:
448
+ for i in range(world_size):
449
+ print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
450
+
451
+ wanted_params = len(param_shapes)
452
+ wanted_numel = sum(shape.numel() for shape in param_shapes.values())
453
+ # not asserting if there is a mismatch due to possible padding
454
+ avail_numel = fp32_flat_groups[0].numel() * world_size
455
+ print(f"Trainable params: Have {avail_numel} numels to process.")
456
+ print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
457
+
458
+ # params
459
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
460
+ # an out-of-core computing solution
461
+ offset = 0
462
+ total_numel = 0
463
+ total_params = 0
464
+ flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
465
+ for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
466
+ unpartitioned_numel = shape.numel()
467
+ total_numel += unpartitioned_numel
468
+ total_params += 1
469
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
470
+
471
+ if debug:
472
+ print(
473
+ f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
474
+ )
475
+
476
+ # memory efficient tensor
477
+ tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
478
+ state_dict[name] = tensor
479
+ offset += partitioned_numel
480
+
481
+ offset *= world_size
482
+
483
+ # Sanity check
484
+ if offset != avail_numel:
485
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
486
+
487
+ print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
488
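+ # Worked example for the sanity check above (illustrative numbers): two params
+ # of 10 and 6 elements on world_size=4 advance offset by ceil(10/4)=3 and
+ # ceil(6/4)=2, so offset=5 after the loop; offset * world_size = 20 must equal
+ # avail_numel, the padded size of the flat groups summed across ranks.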
+
489
+
490
+ def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
491
+ exclude_frozen_parameters):
492
+ state_dict = OrderedDict()
493
+
494
+ # buffers
495
+ buffers = zero_model_states[0].buffers
496
+ state_dict.update(buffers)
497
+ if debug:
498
+ print(f"added {len(buffers)} buffers")
499
+
500
+ if not exclude_frozen_parameters:
501
+ _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
502
+
503
+ _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
504
+
505
+ # recover shared parameters
506
+ for pair in zero_model_states[0].shared_params:
507
+ if pair[1] in state_dict:
508
+ state_dict[pair[0]] = state_dict[pair[1]]
509
+
510
+ return state_dict
511
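+ # Note on the shared-parameter loop above: shared_params holds (alias, source)
+ # name pairs recorded at save time; pointing the alias at the reconstructed
+ # source tensor keeps tied weights tied in the consolidated state_dict.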
+
512
+
513
+ def to_torch_tensor(state_dict, return_empty_tensor=False):
514
+ """
515
+ Convert state_dict of GatheredTensor to torch tensor
516
+ """
517
+ torch_state_dict = {}
518
+ converted_tensors = {}
519
+ for name, tensor in state_dict.items():
520
+ tensor_id = id(tensor)
521
+ if tensor_id in converted_tensors: # shared tensors
522
+ shared_tensor = torch_state_dict[converted_tensors[tensor_id]]
523
+ torch_state_dict[name] = shared_tensor
524
+ else:
525
+ converted_tensors[tensor_id] = name
526
+ if return_empty_tensor:
527
+ torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype)
528
+ else:
529
+ torch_state_dict[name] = tensor.contiguous()
530
+ return torch_state_dict
531
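+ # Illustrative note: the id()-based dedup above means entries that point at the
+ # same GatheredTensor (e.g. tied weights recovered via shared_params) are
+ # materialized once and reused under both names.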
+
532
+
533
+ def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
534
+ tag=None,
535
+ exclude_frozen_parameters=False,
536
+ lazy_mode=False):
537
+ """
538
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
539
+ ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
540
+ via a model hub.
541
+
542
+ Args:
543
+ - ``checkpoint_dir``: path to the desired checkpoint folder
544
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided, will attempt to load the tag from the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
545
+ - ``exclude_frozen_parameters``: exclude frozen parameters
546
+ - ``lazy_mode``: get the state_dict in lazy mode. It returns a dict of pseudo tensors instead of torch tensors, which is more memory efficient.
547
+ Convert a pseudo tensor to a torch tensor by calling ``.contiguous()`` on it
548
+
549
+ Returns:
550
+ - pytorch ``state_dict``
551
+
552
+ A typical usage might be ::
553
+
554
+ from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
555
+ # do the training and checkpoint saving
556
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
557
+ model = model.cpu() # move to cpu
558
+ model.load_state_dict(state_dict)
559
+ # submit to model hub or save the model to share with others
560
+
561
+ In this example the ``model`` will no longer be usable in the deepspeed context of the same
562
+ application, i.e. you will need to re-initialize the deepspeed engine, since
563
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
564
+
565
+ If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
566
+
567
+ Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
568
+ You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
569
+ the checkpoint. Or you can load state_dict in lazy mode ::
570
+
571
+ from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
572
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
573
+ for name, lazy_tensor in state_dict.items():
574
+ tensor = lazy_tensor.contiguous() # to cpu
575
+ print(name, tensor)
576
+ # del tensor to release memory if it is no longer in use
577
+ """
578
+ if tag is None:
579
+ latest_path = os.path.join(checkpoint_dir, 'latest')
580
+ if os.path.isfile(latest_path):
581
+ with open(latest_path, 'r') as fd:
582
+ tag = fd.read().strip()
583
+ else:
584
+ raise ValueError(f"Unable to find 'latest' file at {latest_path}")
585
+
586
+ ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
587
+
588
+ if not os.path.isdir(ds_checkpoint_dir):
589
+ raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
590
+
591
+ state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
592
+ if lazy_mode:
593
+ return state_dict
594
+ else:
595
+ return to_torch_tensor(state_dict)
596
+
597
+
598
+ def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
599
+ output_dir,
600
+ max_shard_size="5GB",
601
+ safe_serialization=False,
602
+ tag=None,
603
+ exclude_frozen_parameters=False):
604
+ """
605
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
606
+ loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
607
+
608
+ Args:
609
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
610
+ - ``output_dir``: directory for the pytorch fp32 state_dict output files
611
+ - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB
612
+ - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
613
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided, will attempt to load the tag from the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
614
+ - ``exclude_frozen_parameters``: exclude frozen parameters
615
+ """
616
+
617
+ # Dependency pre-check
618
+ if safe_serialization:
619
+ try:
620
+ from safetensors.torch import save_file
621
+ except ImportError:
622
+ print('If you want to use `safe_serialization`, please `pip install safetensors`')
623
+ raise
624
+ if max_shard_size is not None:
625
+ try:
626
+ from huggingface_hub import split_torch_state_dict_into_shards
627
+ except ImportError:
628
+ print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
629
+ raise
630
+
631
+ # Convert zero checkpoint to state_dict
632
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
633
+ tag,
634
+ exclude_frozen_parameters,
635
+ lazy_mode=True)
636
+
637
+ # Shard the model if it is too big.
638
+ weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
639
+ if max_shard_size is not None:
640
+ filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
641
+ # a memory-efficient approach to sharding
642
+ empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
643
+ state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
644
+ filename_pattern=filename_pattern,
645
+ max_shard_size=max_shard_size)
646
+ else:
647
+ from collections import namedtuple
648
+ StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
649
+ state_dict_split = StateDictSplit(is_sharded=False,
650
+ filename_to_tensors={weights_name: list(state_dict.keys())})
651
+
652
+ # Save the model shard by shard
653
+ os.makedirs(output_dir, exist_ok=True)
654
+ filename_to_tensors = state_dict_split.filename_to_tensors.items()
655
+ for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
656
+ shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
657
+ shard_state_dict = to_torch_tensor(shard_state_dict)
658
+ output_path = os.path.join(output_dir, shard_file)
659
+ if safe_serialization:
660
+ save_file(shard_state_dict, output_path, metadata={"format": "pt"})
661
+ else:
662
+ torch.save(shard_state_dict, output_path)
663
+ # release the memory of current shard
664
+ for tensor_name in list(shard_state_dict.keys()):
665
+ del state_dict[tensor_name]
666
+ del shard_state_dict[tensor_name]
667
+ del shard_state_dict
668
+ gc.collect()
669
+
670
+ # Save index if sharded
671
+ if state_dict_split.is_sharded:
672
+ index = {
673
+ "metadata": state_dict_split.metadata,
674
+ "weight_map": state_dict_split.tensor_to_filename,
675
+ }
676
+ save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
677
+ save_index_file = os.path.join(output_dir, save_index_file)
678
+ with open(save_index_file, "w", encoding="utf-8") as f:
679
+ content = json.dumps(index, indent=2, sort_keys=True) + "\n"
680
+ f.write(content)
681
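+ # Minimal usage sketch (paths are placeholders and the import assumes this
+ # script is on the working path, both assumptions for illustration only):
+ #   from zero_to_fp32 import convert_zero_checkpoint_to_fp32_state_dict
+ #   convert_zero_checkpoint_to_fp32_state_dict("path/checkpoint-12",
+ #                                               "path/checkpoint-12-output",
+ #                                               safe_serialization=True)
+ # This writes the consolidated weights into output_dir as one or more shards,
+ # plus an index file when the result is sharded.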
+
682
+
683
+ def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
684
+ """
685
+ 1. Put the provided model on cpu
686
+ 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
687
+ 3. Load it into the provided model
688
+
689
+ Args:
690
+ - ``model``: the model object to update
691
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
692
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided, will attempt to load the tag from the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
693
+
694
+ Returns:
695
+ - ``model``: modified model
696
+
697
+ Make sure you have plenty of CPU memory available before you call this function. If you don't
698
+ have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
699
+ conveniently placed for you in the checkpoint folder.
700
+
701
+ A typical usage might be ::
702
+
703
+ from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
704
+ model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
705
+ # submit to model hub or save the model to share with others
706
+
707
+ Note that once this has been run, the ``model`` will no longer be usable in the deepspeed context
708
+ of the same application, i.e. you will need to re-initialize the deepspeed engine, since
709
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
710
+
711
+ """
712
+ logger.info(f"Extracting fp32 weights")
713
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
714
+
715
+ logger.info(f"Overwriting model with fp32 weights")
716
+ model = model.cpu()
717
+ model.load_state_dict(state_dict, strict=False)
718
+
719
+ return model
720
+
721
+
722
+ if __name__ == "__main__":
723
+ parser = argparse.ArgumentParser()
724
+ parser.add_argument("checkpoint_dir",
725
+ type=str,
726
+ help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
727
+ parser.add_argument("output_dir",
728
+ type=str,
729
+ help="directory to the pytorch fp32 state_dict output files"
730
+ "(e.g. path/checkpoint-12-output/)")
731
+ parser.add_argument(
732
+ "--max_shard_size",
733
+ type=str,
734
+ default="5GB",
735
+ help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size"
736
+ "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`"
737
+ "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances"
738
+ "without CPU OOM issues.")
739
+ parser.add_argument(
740
+ "--safe_serialization",
741
+ default=False,
742
+ action='store_true',
743
+ help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
744
+ parser.add_argument("-t",
745
+ "--tag",
746
+ type=str,
747
+ default=None,
748
+ help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
749
+ parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
750
+ parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
751
+ args = parser.parse_args()
752
+
753
+ debug = args.debug
754
+
755
+ convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
756
+ args.output_dir,
757
+ max_shard_size=args.max_shard_size,
758
+ safe_serialization=args.safe_serialization,
759
+ tag=args.tag,
760
+ exclude_frozen_parameters=args.exclude_frozen_parameters)
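+ # Example invocation (illustrative; paths are placeholders):
+ #   python zero_to_fp32.py path/checkpoint-12 path/checkpoint-12-output \
+ #       --safe_serialization --max_shard_size 5GB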