pbarker committed on
Commit 7a91659 · verified · 1 Parent(s): 11845ba

Upload folder using huggingface_hub

Files changed (49)
  1. .gitattributes +1 -0
  2. added_tokens.json +428 -0
  3. config.json +33 -0
  4. config_molmo.py +60 -0
  5. generation_config.json +6 -0
  6. global_step412/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
  7. global_step412/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +3 -0
  8. global_step412/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3 -0
  9. global_step412/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +3 -0
  10. global_step412/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +3 -0
  11. global_step412/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +3 -0
  12. global_step412/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +3 -0
  13. global_step412/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3 -0
  14. global_step412/zero_pp_rank_0_mp_rank_00_model_states.pt +3 -0
  15. global_step412/zero_pp_rank_1_mp_rank_00_model_states.pt +3 -0
  16. global_step412/zero_pp_rank_2_mp_rank_00_model_states.pt +3 -0
  17. global_step412/zero_pp_rank_3_mp_rank_00_model_states.pt +3 -0
  18. global_step412/zero_pp_rank_4_mp_rank_00_model_states.pt +3 -0
  19. global_step412/zero_pp_rank_5_mp_rank_00_model_states.pt +3 -0
  20. global_step412/zero_pp_rank_6_mp_rank_00_model_states.pt +3 -0
  21. global_step412/zero_pp_rank_7_mp_rank_00_model_states.pt +3 -0
  22. image_preprocessing_molmo.py +546 -0
  23. latest +1 -0
  24. merges.txt +0 -0
  25. model-00001-of-00004.safetensors +3 -0
  26. model-00002-of-00004.safetensors +3 -0
  27. model-00003-of-00004.safetensors +3 -0
  28. model-00004-of-00004.safetensors +3 -0
  29. model.safetensors.index.json +592 -0
  30. modeling_molmo.py +2367 -0
  31. preprocessing_molmo.py +192 -0
  32. preprocessor_config.json +32 -0
  33. processor_config.json +6 -0
  34. rng_state_0.pth +3 -0
  35. rng_state_1.pth +3 -0
  36. rng_state_2.pth +3 -0
  37. rng_state_3.pth +3 -0
  38. rng_state_4.pth +3 -0
  39. rng_state_5.pth +3 -0
  40. rng_state_6.pth +3 -0
  41. rng_state_7.pth +3 -0
  42. sft_args.json +302 -0
  43. special_tokens_map.json +435 -0
  44. tokenizer.json +3 -0
  45. tokenizer_config.json +3853 -0
  46. trainer_state.json +890 -0
  47. training_args.bin +3 -0
  48. vocab.json +0 -0
  49. zero_to_fp32.py +760 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
added_tokens.json ADDED
@@ -0,0 +1,428 @@
1
+ {
2
+ "<im_col>": 152067,
3
+ "<im_end>": 152065,
4
+ "<im_patch>": 152066,
5
+ "<im_start>": 152064,
6
+ "<|endoftext|>": 151643,
7
+ "<|im_end|>": 151645,
8
+ "<|im_start|>": 151644,
9
+ "<|image|>": 152068,
10
+ "|<EXTRA_TOKENS_0>|": 151646,
11
+ "|<EXTRA_TOKENS_100>|": 151746,
12
+ "|<EXTRA_TOKENS_101>|": 151747,
13
+ "|<EXTRA_TOKENS_102>|": 151748,
14
+ "|<EXTRA_TOKENS_103>|": 151749,
15
+ "|<EXTRA_TOKENS_104>|": 151750,
16
+ "|<EXTRA_TOKENS_105>|": 151751,
17
+ "|<EXTRA_TOKENS_106>|": 151752,
18
+ "|<EXTRA_TOKENS_107>|": 151753,
19
+ "|<EXTRA_TOKENS_108>|": 151754,
20
+ "|<EXTRA_TOKENS_109>|": 151755,
21
+ "|<EXTRA_TOKENS_10>|": 151656,
22
+ "|<EXTRA_TOKENS_110>|": 151756,
23
+ "|<EXTRA_TOKENS_111>|": 151757,
24
+ "|<EXTRA_TOKENS_112>|": 151758,
25
+ "|<EXTRA_TOKENS_113>|": 151759,
26
+ "|<EXTRA_TOKENS_114>|": 151760,
27
+ "|<EXTRA_TOKENS_115>|": 151761,
28
+ "|<EXTRA_TOKENS_116>|": 151762,
29
+ "|<EXTRA_TOKENS_117>|": 151763,
30
+ "|<EXTRA_TOKENS_118>|": 151764,
31
+ "|<EXTRA_TOKENS_119>|": 151765,
32
+ "|<EXTRA_TOKENS_11>|": 151657,
33
+ "|<EXTRA_TOKENS_120>|": 151766,
34
+ "|<EXTRA_TOKENS_121>|": 151767,
35
+ "|<EXTRA_TOKENS_122>|": 151768,
36
+ "|<EXTRA_TOKENS_123>|": 151769,
37
+ "|<EXTRA_TOKENS_124>|": 151770,
38
+ "|<EXTRA_TOKENS_125>|": 151771,
39
+ "|<EXTRA_TOKENS_126>|": 151772,
40
+ "|<EXTRA_TOKENS_127>|": 151773,
41
+ "|<EXTRA_TOKENS_128>|": 151774,
42
+ "|<EXTRA_TOKENS_129>|": 151775,
43
+ "|<EXTRA_TOKENS_12>|": 151658,
44
+ "|<EXTRA_TOKENS_130>|": 151776,
45
+ "|<EXTRA_TOKENS_131>|": 151777,
46
+ "|<EXTRA_TOKENS_132>|": 151778,
47
+ "|<EXTRA_TOKENS_133>|": 151779,
48
+ "|<EXTRA_TOKENS_134>|": 151780,
49
+ "|<EXTRA_TOKENS_135>|": 151781,
50
+ "|<EXTRA_TOKENS_136>|": 151782,
51
+ "|<EXTRA_TOKENS_137>|": 151783,
52
+ "|<EXTRA_TOKENS_138>|": 151784,
53
+ "|<EXTRA_TOKENS_139>|": 151785,
54
+ "|<EXTRA_TOKENS_13>|": 151659,
55
+ "|<EXTRA_TOKENS_140>|": 151786,
56
+ "|<EXTRA_TOKENS_141>|": 151787,
57
+ "|<EXTRA_TOKENS_142>|": 151788,
58
+ "|<EXTRA_TOKENS_143>|": 151789,
59
+ "|<EXTRA_TOKENS_144>|": 151790,
60
+ "|<EXTRA_TOKENS_145>|": 151791,
61
+ "|<EXTRA_TOKENS_146>|": 151792,
62
+ "|<EXTRA_TOKENS_147>|": 151793,
63
+ "|<EXTRA_TOKENS_148>|": 151794,
64
+ "|<EXTRA_TOKENS_149>|": 151795,
65
+ "|<EXTRA_TOKENS_14>|": 151660,
66
+ "|<EXTRA_TOKENS_150>|": 151796,
67
+ "|<EXTRA_TOKENS_151>|": 151797,
68
+ "|<EXTRA_TOKENS_152>|": 151798,
69
+ "|<EXTRA_TOKENS_153>|": 151799,
70
+ "|<EXTRA_TOKENS_154>|": 151800,
71
+ "|<EXTRA_TOKENS_155>|": 151801,
72
+ "|<EXTRA_TOKENS_156>|": 151802,
73
+ "|<EXTRA_TOKENS_157>|": 151803,
74
+ "|<EXTRA_TOKENS_158>|": 151804,
75
+ "|<EXTRA_TOKENS_159>|": 151805,
76
+ "|<EXTRA_TOKENS_15>|": 151661,
77
+ "|<EXTRA_TOKENS_160>|": 151806,
78
+ "|<EXTRA_TOKENS_161>|": 151807,
79
+ "|<EXTRA_TOKENS_162>|": 151808,
80
+ "|<EXTRA_TOKENS_163>|": 151809,
81
+ "|<EXTRA_TOKENS_164>|": 151810,
82
+ "|<EXTRA_TOKENS_165>|": 151811,
83
+ "|<EXTRA_TOKENS_166>|": 151812,
84
+ "|<EXTRA_TOKENS_167>|": 151813,
85
+ "|<EXTRA_TOKENS_168>|": 151814,
86
+ "|<EXTRA_TOKENS_169>|": 151815,
87
+ "|<EXTRA_TOKENS_16>|": 151662,
88
+ "|<EXTRA_TOKENS_170>|": 151816,
89
+ "|<EXTRA_TOKENS_171>|": 151817,
90
+ "|<EXTRA_TOKENS_172>|": 151818,
91
+ "|<EXTRA_TOKENS_173>|": 151819,
92
+ "|<EXTRA_TOKENS_174>|": 151820,
93
+ "|<EXTRA_TOKENS_175>|": 151821,
94
+ "|<EXTRA_TOKENS_176>|": 151822,
95
+ "|<EXTRA_TOKENS_177>|": 151823,
96
+ "|<EXTRA_TOKENS_178>|": 151824,
97
+ "|<EXTRA_TOKENS_179>|": 151825,
98
+ "|<EXTRA_TOKENS_17>|": 151663,
99
+ "|<EXTRA_TOKENS_180>|": 151826,
100
+ "|<EXTRA_TOKENS_181>|": 151827,
101
+ "|<EXTRA_TOKENS_182>|": 151828,
102
+ "|<EXTRA_TOKENS_183>|": 151829,
103
+ "|<EXTRA_TOKENS_184>|": 151830,
104
+ "|<EXTRA_TOKENS_185>|": 151831,
105
+ "|<EXTRA_TOKENS_186>|": 151832,
106
+ "|<EXTRA_TOKENS_187>|": 151833,
107
+ "|<EXTRA_TOKENS_188>|": 151834,
108
+ "|<EXTRA_TOKENS_189>|": 151835,
109
+ "|<EXTRA_TOKENS_18>|": 151664,
110
+ "|<EXTRA_TOKENS_190>|": 151836,
111
+ "|<EXTRA_TOKENS_191>|": 151837,
112
+ "|<EXTRA_TOKENS_192>|": 151838,
113
+ "|<EXTRA_TOKENS_193>|": 151839,
114
+ "|<EXTRA_TOKENS_194>|": 151840,
115
+ "|<EXTRA_TOKENS_195>|": 151841,
116
+ "|<EXTRA_TOKENS_196>|": 151842,
117
+ "|<EXTRA_TOKENS_197>|": 151843,
118
+ "|<EXTRA_TOKENS_198>|": 151844,
119
+ "|<EXTRA_TOKENS_199>|": 151845,
120
+ "|<EXTRA_TOKENS_19>|": 151665,
121
+ "|<EXTRA_TOKENS_1>|": 151647,
122
+ "|<EXTRA_TOKENS_200>|": 151846,
123
+ "|<EXTRA_TOKENS_201>|": 151847,
124
+ "|<EXTRA_TOKENS_202>|": 151848,
125
+ "|<EXTRA_TOKENS_203>|": 151849,
126
+ "|<EXTRA_TOKENS_204>|": 151850,
127
+ "|<EXTRA_TOKENS_205>|": 151851,
128
+ "|<EXTRA_TOKENS_206>|": 151852,
129
+ "|<EXTRA_TOKENS_207>|": 151853,
130
+ "|<EXTRA_TOKENS_208>|": 151854,
131
+ "|<EXTRA_TOKENS_209>|": 151855,
132
+ "|<EXTRA_TOKENS_20>|": 151666,
133
+ "|<EXTRA_TOKENS_210>|": 151856,
134
+ "|<EXTRA_TOKENS_211>|": 151857,
135
+ "|<EXTRA_TOKENS_212>|": 151858,
136
+ "|<EXTRA_TOKENS_213>|": 151859,
137
+ "|<EXTRA_TOKENS_214>|": 151860,
138
+ "|<EXTRA_TOKENS_215>|": 151861,
139
+ "|<EXTRA_TOKENS_216>|": 151862,
140
+ "|<EXTRA_TOKENS_217>|": 151863,
141
+ "|<EXTRA_TOKENS_218>|": 151864,
142
+ "|<EXTRA_TOKENS_219>|": 151865,
143
+ "|<EXTRA_TOKENS_21>|": 151667,
144
+ "|<EXTRA_TOKENS_220>|": 151866,
145
+ "|<EXTRA_TOKENS_221>|": 151867,
146
+ "|<EXTRA_TOKENS_222>|": 151868,
147
+ "|<EXTRA_TOKENS_223>|": 151869,
148
+ "|<EXTRA_TOKENS_224>|": 151870,
149
+ "|<EXTRA_TOKENS_225>|": 151871,
150
+ "|<EXTRA_TOKENS_226>|": 151872,
151
+ "|<EXTRA_TOKENS_227>|": 151873,
152
+ "|<EXTRA_TOKENS_228>|": 151874,
153
+ "|<EXTRA_TOKENS_229>|": 151875,
154
+ "|<EXTRA_TOKENS_22>|": 151668,
155
+ "|<EXTRA_TOKENS_230>|": 151876,
156
+ "|<EXTRA_TOKENS_231>|": 151877,
157
+ "|<EXTRA_TOKENS_232>|": 151878,
158
+ "|<EXTRA_TOKENS_233>|": 151879,
159
+ "|<EXTRA_TOKENS_234>|": 151880,
160
+ "|<EXTRA_TOKENS_235>|": 151881,
161
+ "|<EXTRA_TOKENS_236>|": 151882,
162
+ "|<EXTRA_TOKENS_237>|": 151883,
163
+ "|<EXTRA_TOKENS_238>|": 151884,
164
+ "|<EXTRA_TOKENS_239>|": 151885,
165
+ "|<EXTRA_TOKENS_23>|": 151669,
166
+ "|<EXTRA_TOKENS_240>|": 151886,
167
+ "|<EXTRA_TOKENS_241>|": 151887,
168
+ "|<EXTRA_TOKENS_242>|": 151888,
169
+ "|<EXTRA_TOKENS_243>|": 151889,
170
+ "|<EXTRA_TOKENS_244>|": 151890,
171
+ "|<EXTRA_TOKENS_245>|": 151891,
172
+ "|<EXTRA_TOKENS_246>|": 151892,
173
+ "|<EXTRA_TOKENS_247>|": 151893,
174
+ "|<EXTRA_TOKENS_248>|": 151894,
175
+ "|<EXTRA_TOKENS_249>|": 151895,
176
+ "|<EXTRA_TOKENS_24>|": 151670,
177
+ "|<EXTRA_TOKENS_250>|": 151896,
178
+ "|<EXTRA_TOKENS_251>|": 151897,
179
+ "|<EXTRA_TOKENS_252>|": 151898,
180
+ "|<EXTRA_TOKENS_253>|": 151899,
181
+ "|<EXTRA_TOKENS_254>|": 151900,
182
+ "|<EXTRA_TOKENS_255>|": 151901,
183
+ "|<EXTRA_TOKENS_256>|": 151902,
184
+ "|<EXTRA_TOKENS_257>|": 151903,
185
+ "|<EXTRA_TOKENS_258>|": 151904,
186
+ "|<EXTRA_TOKENS_259>|": 151905,
187
+ "|<EXTRA_TOKENS_25>|": 151671,
188
+ "|<EXTRA_TOKENS_260>|": 151906,
189
+ "|<EXTRA_TOKENS_261>|": 151907,
190
+ "|<EXTRA_TOKENS_262>|": 151908,
191
+ "|<EXTRA_TOKENS_263>|": 151909,
192
+ "|<EXTRA_TOKENS_264>|": 151910,
193
+ "|<EXTRA_TOKENS_265>|": 151911,
194
+ "|<EXTRA_TOKENS_266>|": 151912,
195
+ "|<EXTRA_TOKENS_267>|": 151913,
196
+ "|<EXTRA_TOKENS_268>|": 151914,
197
+ "|<EXTRA_TOKENS_269>|": 151915,
198
+ "|<EXTRA_TOKENS_26>|": 151672,
199
+ "|<EXTRA_TOKENS_270>|": 151916,
200
+ "|<EXTRA_TOKENS_271>|": 151917,
201
+ "|<EXTRA_TOKENS_272>|": 151918,
202
+ "|<EXTRA_TOKENS_273>|": 151919,
203
+ "|<EXTRA_TOKENS_274>|": 151920,
204
+ "|<EXTRA_TOKENS_275>|": 151921,
205
+ "|<EXTRA_TOKENS_276>|": 151922,
206
+ "|<EXTRA_TOKENS_277>|": 151923,
207
+ "|<EXTRA_TOKENS_278>|": 151924,
208
+ "|<EXTRA_TOKENS_279>|": 151925,
209
+ "|<EXTRA_TOKENS_27>|": 151673,
210
+ "|<EXTRA_TOKENS_280>|": 151926,
211
+ "|<EXTRA_TOKENS_281>|": 151927,
212
+ "|<EXTRA_TOKENS_282>|": 151928,
213
+ "|<EXTRA_TOKENS_283>|": 151929,
214
+ "|<EXTRA_TOKENS_284>|": 151930,
215
+ "|<EXTRA_TOKENS_285>|": 151931,
216
+ "|<EXTRA_TOKENS_286>|": 151932,
217
+ "|<EXTRA_TOKENS_287>|": 151933,
218
+ "|<EXTRA_TOKENS_288>|": 151934,
219
+ "|<EXTRA_TOKENS_289>|": 151935,
220
+ "|<EXTRA_TOKENS_28>|": 151674,
221
+ "|<EXTRA_TOKENS_290>|": 151936,
222
+ "|<EXTRA_TOKENS_291>|": 151937,
223
+ "|<EXTRA_TOKENS_292>|": 151938,
224
+ "|<EXTRA_TOKENS_293>|": 151939,
225
+ "|<EXTRA_TOKENS_294>|": 151940,
226
+ "|<EXTRA_TOKENS_295>|": 151941,
227
+ "|<EXTRA_TOKENS_296>|": 151942,
228
+ "|<EXTRA_TOKENS_297>|": 151943,
229
+ "|<EXTRA_TOKENS_298>|": 151944,
230
+ "|<EXTRA_TOKENS_299>|": 151945,
231
+ "|<EXTRA_TOKENS_29>|": 151675,
232
+ "|<EXTRA_TOKENS_2>|": 151648,
233
+ "|<EXTRA_TOKENS_300>|": 151946,
234
+ "|<EXTRA_TOKENS_301>|": 151947,
235
+ "|<EXTRA_TOKENS_302>|": 151948,
236
+ "|<EXTRA_TOKENS_303>|": 151949,
237
+ "|<EXTRA_TOKENS_304>|": 151950,
238
+ "|<EXTRA_TOKENS_305>|": 151951,
239
+ "|<EXTRA_TOKENS_306>|": 151952,
240
+ "|<EXTRA_TOKENS_307>|": 151953,
241
+ "|<EXTRA_TOKENS_308>|": 151954,
242
+ "|<EXTRA_TOKENS_309>|": 151955,
243
+ "|<EXTRA_TOKENS_30>|": 151676,
244
+ "|<EXTRA_TOKENS_310>|": 151956,
245
+ "|<EXTRA_TOKENS_311>|": 151957,
246
+ "|<EXTRA_TOKENS_312>|": 151958,
247
+ "|<EXTRA_TOKENS_313>|": 151959,
248
+ "|<EXTRA_TOKENS_314>|": 151960,
249
+ "|<EXTRA_TOKENS_315>|": 151961,
250
+ "|<EXTRA_TOKENS_316>|": 151962,
251
+ "|<EXTRA_TOKENS_317>|": 151963,
252
+ "|<EXTRA_TOKENS_318>|": 151964,
253
+ "|<EXTRA_TOKENS_319>|": 151965,
254
+ "|<EXTRA_TOKENS_31>|": 151677,
255
+ "|<EXTRA_TOKENS_320>|": 151966,
256
+ "|<EXTRA_TOKENS_321>|": 151967,
257
+ "|<EXTRA_TOKENS_322>|": 151968,
258
+ "|<EXTRA_TOKENS_323>|": 151969,
259
+ "|<EXTRA_TOKENS_324>|": 151970,
260
+ "|<EXTRA_TOKENS_325>|": 151971,
261
+ "|<EXTRA_TOKENS_326>|": 151972,
262
+ "|<EXTRA_TOKENS_327>|": 151973,
263
+ "|<EXTRA_TOKENS_328>|": 151974,
264
+ "|<EXTRA_TOKENS_329>|": 151975,
265
+ "|<EXTRA_TOKENS_32>|": 151678,
266
+ "|<EXTRA_TOKENS_330>|": 151976,
267
+ "|<EXTRA_TOKENS_331>|": 151977,
268
+ "|<EXTRA_TOKENS_332>|": 151978,
269
+ "|<EXTRA_TOKENS_333>|": 151979,
270
+ "|<EXTRA_TOKENS_334>|": 151980,
271
+ "|<EXTRA_TOKENS_335>|": 151981,
272
+ "|<EXTRA_TOKENS_336>|": 151982,
273
+ "|<EXTRA_TOKENS_337>|": 151983,
274
+ "|<EXTRA_TOKENS_338>|": 151984,
275
+ "|<EXTRA_TOKENS_339>|": 151985,
276
+ "|<EXTRA_TOKENS_33>|": 151679,
277
+ "|<EXTRA_TOKENS_340>|": 151986,
278
+ "|<EXTRA_TOKENS_341>|": 151987,
279
+ "|<EXTRA_TOKENS_342>|": 151988,
280
+ "|<EXTRA_TOKENS_343>|": 151989,
281
+ "|<EXTRA_TOKENS_344>|": 151990,
282
+ "|<EXTRA_TOKENS_345>|": 151991,
283
+ "|<EXTRA_TOKENS_346>|": 151992,
284
+ "|<EXTRA_TOKENS_347>|": 151993,
285
+ "|<EXTRA_TOKENS_348>|": 151994,
286
+ "|<EXTRA_TOKENS_349>|": 151995,
287
+ "|<EXTRA_TOKENS_34>|": 151680,
288
+ "|<EXTRA_TOKENS_350>|": 151996,
289
+ "|<EXTRA_TOKENS_351>|": 151997,
290
+ "|<EXTRA_TOKENS_352>|": 151998,
291
+ "|<EXTRA_TOKENS_353>|": 151999,
292
+ "|<EXTRA_TOKENS_354>|": 152000,
293
+ "|<EXTRA_TOKENS_355>|": 152001,
294
+ "|<EXTRA_TOKENS_356>|": 152002,
295
+ "|<EXTRA_TOKENS_357>|": 152003,
296
+ "|<EXTRA_TOKENS_358>|": 152004,
297
+ "|<EXTRA_TOKENS_359>|": 152005,
298
+ "|<EXTRA_TOKENS_35>|": 151681,
299
+ "|<EXTRA_TOKENS_360>|": 152006,
300
+ "|<EXTRA_TOKENS_361>|": 152007,
301
+ "|<EXTRA_TOKENS_362>|": 152008,
302
+ "|<EXTRA_TOKENS_363>|": 152009,
303
+ "|<EXTRA_TOKENS_364>|": 152010,
304
+ "|<EXTRA_TOKENS_365>|": 152011,
305
+ "|<EXTRA_TOKENS_366>|": 152012,
306
+ "|<EXTRA_TOKENS_367>|": 152013,
307
+ "|<EXTRA_TOKENS_368>|": 152014,
308
+ "|<EXTRA_TOKENS_369>|": 152015,
309
+ "|<EXTRA_TOKENS_36>|": 151682,
310
+ "|<EXTRA_TOKENS_370>|": 152016,
311
+ "|<EXTRA_TOKENS_371>|": 152017,
312
+ "|<EXTRA_TOKENS_372>|": 152018,
313
+ "|<EXTRA_TOKENS_373>|": 152019,
314
+ "|<EXTRA_TOKENS_374>|": 152020,
315
+ "|<EXTRA_TOKENS_375>|": 152021,
316
+ "|<EXTRA_TOKENS_376>|": 152022,
317
+ "|<EXTRA_TOKENS_377>|": 152023,
318
+ "|<EXTRA_TOKENS_378>|": 152024,
319
+ "|<EXTRA_TOKENS_379>|": 152025,
320
+ "|<EXTRA_TOKENS_37>|": 151683,
321
+ "|<EXTRA_TOKENS_380>|": 152026,
322
+ "|<EXTRA_TOKENS_381>|": 152027,
323
+ "|<EXTRA_TOKENS_382>|": 152028,
324
+ "|<EXTRA_TOKENS_383>|": 152029,
325
+ "|<EXTRA_TOKENS_384>|": 152030,
326
+ "|<EXTRA_TOKENS_385>|": 152031,
327
+ "|<EXTRA_TOKENS_386>|": 152032,
328
+ "|<EXTRA_TOKENS_387>|": 152033,
329
+ "|<EXTRA_TOKENS_388>|": 152034,
330
+ "|<EXTRA_TOKENS_389>|": 152035,
331
+ "|<EXTRA_TOKENS_38>|": 151684,
332
+ "|<EXTRA_TOKENS_390>|": 152036,
333
+ "|<EXTRA_TOKENS_391>|": 152037,
334
+ "|<EXTRA_TOKENS_392>|": 152038,
335
+ "|<EXTRA_TOKENS_393>|": 152039,
336
+ "|<EXTRA_TOKENS_394>|": 152040,
337
+ "|<EXTRA_TOKENS_395>|": 152041,
338
+ "|<EXTRA_TOKENS_396>|": 152042,
339
+ "|<EXTRA_TOKENS_397>|": 152043,
340
+ "|<EXTRA_TOKENS_398>|": 152044,
341
+ "|<EXTRA_TOKENS_399>|": 152045,
342
+ "|<EXTRA_TOKENS_39>|": 151685,
343
+ "|<EXTRA_TOKENS_3>|": 151649,
344
+ "|<EXTRA_TOKENS_400>|": 152046,
345
+ "|<EXTRA_TOKENS_401>|": 152047,
346
+ "|<EXTRA_TOKENS_402>|": 152048,
347
+ "|<EXTRA_TOKENS_403>|": 152049,
348
+ "|<EXTRA_TOKENS_404>|": 152050,
349
+ "|<EXTRA_TOKENS_405>|": 152051,
350
+ "|<EXTRA_TOKENS_406>|": 152052,
351
+ "|<EXTRA_TOKENS_407>|": 152053,
352
+ "|<EXTRA_TOKENS_408>|": 152054,
353
+ "|<EXTRA_TOKENS_409>|": 152055,
354
+ "|<EXTRA_TOKENS_40>|": 151686,
355
+ "|<EXTRA_TOKENS_410>|": 152056,
356
+ "|<EXTRA_TOKENS_411>|": 152057,
357
+ "|<EXTRA_TOKENS_412>|": 152058,
358
+ "|<EXTRA_TOKENS_413>|": 152059,
359
+ "|<EXTRA_TOKENS_414>|": 152060,
360
+ "|<EXTRA_TOKENS_415>|": 152061,
361
+ "|<EXTRA_TOKENS_416>|": 152062,
362
+ "|<EXTRA_TOKENS_417>|": 152063,
363
+ "|<EXTRA_TOKENS_41>|": 151687,
364
+ "|<EXTRA_TOKENS_42>|": 151688,
365
+ "|<EXTRA_TOKENS_43>|": 151689,
366
+ "|<EXTRA_TOKENS_44>|": 151690,
367
+ "|<EXTRA_TOKENS_45>|": 151691,
368
+ "|<EXTRA_TOKENS_46>|": 151692,
369
+ "|<EXTRA_TOKENS_47>|": 151693,
370
+ "|<EXTRA_TOKENS_48>|": 151694,
371
+ "|<EXTRA_TOKENS_49>|": 151695,
372
+ "|<EXTRA_TOKENS_4>|": 151650,
373
+ "|<EXTRA_TOKENS_50>|": 151696,
374
+ "|<EXTRA_TOKENS_51>|": 151697,
375
+ "|<EXTRA_TOKENS_52>|": 151698,
376
+ "|<EXTRA_TOKENS_53>|": 151699,
377
+ "|<EXTRA_TOKENS_54>|": 151700,
378
+ "|<EXTRA_TOKENS_55>|": 151701,
379
+ "|<EXTRA_TOKENS_56>|": 151702,
380
+ "|<EXTRA_TOKENS_57>|": 151703,
381
+ "|<EXTRA_TOKENS_58>|": 151704,
382
+ "|<EXTRA_TOKENS_59>|": 151705,
383
+ "|<EXTRA_TOKENS_5>|": 151651,
384
+ "|<EXTRA_TOKENS_60>|": 151706,
385
+ "|<EXTRA_TOKENS_61>|": 151707,
386
+ "|<EXTRA_TOKENS_62>|": 151708,
387
+ "|<EXTRA_TOKENS_63>|": 151709,
388
+ "|<EXTRA_TOKENS_64>|": 151710,
389
+ "|<EXTRA_TOKENS_65>|": 151711,
390
+ "|<EXTRA_TOKENS_66>|": 151712,
391
+ "|<EXTRA_TOKENS_67>|": 151713,
392
+ "|<EXTRA_TOKENS_68>|": 151714,
393
+ "|<EXTRA_TOKENS_69>|": 151715,
394
+ "|<EXTRA_TOKENS_6>|": 151652,
395
+ "|<EXTRA_TOKENS_70>|": 151716,
396
+ "|<EXTRA_TOKENS_71>|": 151717,
397
+ "|<EXTRA_TOKENS_72>|": 151718,
398
+ "|<EXTRA_TOKENS_73>|": 151719,
399
+ "|<EXTRA_TOKENS_74>|": 151720,
400
+ "|<EXTRA_TOKENS_75>|": 151721,
401
+ "|<EXTRA_TOKENS_76>|": 151722,
402
+ "|<EXTRA_TOKENS_77>|": 151723,
403
+ "|<EXTRA_TOKENS_78>|": 151724,
404
+ "|<EXTRA_TOKENS_79>|": 151725,
405
+ "|<EXTRA_TOKENS_7>|": 151653,
406
+ "|<EXTRA_TOKENS_80>|": 151726,
407
+ "|<EXTRA_TOKENS_81>|": 151727,
408
+ "|<EXTRA_TOKENS_82>|": 151728,
409
+ "|<EXTRA_TOKENS_83>|": 151729,
410
+ "|<EXTRA_TOKENS_84>|": 151730,
411
+ "|<EXTRA_TOKENS_85>|": 151731,
412
+ "|<EXTRA_TOKENS_86>|": 151732,
413
+ "|<EXTRA_TOKENS_87>|": 151733,
414
+ "|<EXTRA_TOKENS_88>|": 151734,
415
+ "|<EXTRA_TOKENS_89>|": 151735,
416
+ "|<EXTRA_TOKENS_8>|": 151654,
417
+ "|<EXTRA_TOKENS_90>|": 151736,
418
+ "|<EXTRA_TOKENS_91>|": 151737,
419
+ "|<EXTRA_TOKENS_92>|": 151738,
420
+ "|<EXTRA_TOKENS_93>|": 151739,
421
+ "|<EXTRA_TOKENS_94>|": 151740,
422
+ "|<EXTRA_TOKENS_95>|": 151741,
423
+ "|<EXTRA_TOKENS_96>|": 151742,
424
+ "|<EXTRA_TOKENS_97>|": 151743,
425
+ "|<EXTRA_TOKENS_98>|": 151744,
426
+ "|<EXTRA_TOKENS_99>|": 151745,
427
+ "|<EXTRA_TOKENS_9>|": 151655
428
+ }
config.json ADDED
@@ -0,0 +1,33 @@
+ {
+ "_name_or_path": "/root/.cache/huggingface/hub/models--pbarker--ComputerBase-v0.1-M-3epoch/snapshots/915dc4d8809028264506a79c6e71387c8a26aa35",
+ "architectures": [
+ "MolmoForCausalLM"
+ ],
+ "attention_layer_norm": false,
+ "auto_map": {
+ "AutoConfig": "config_molmo.MolmoConfig",
+ "AutoModelForCausalLM": "modeling_molmo.MolmoForCausalLM"
+ },
+ "clip_qkv": null,
+ "embedding_size": 152064,
+ "hidden_size": 3584,
+ "initializer_range": 0.02,
+ "intermediate_size": 37888,
+ "layer_norm_eps": 1e-06,
+ "layer_norm_type": "rms",
+ "max_position_embeddings": 4096,
+ "model_type": "molmo",
+ "norm_after": false,
+ "num_attention_heads": 28,
+ "num_hidden_layers": 28,
+ "num_key_value_heads": 4,
+ "qkv_bias": true,
+ "rope_theta": 1000000.0,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.47.1",
+ "use_cache": true,
+ "use_position_ids": true,
+ "vocab_size": 152064,
+ "weight_tying": false
+ }
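
A minimal loading sketch (illustrative, not part of the commit): because auto_map points AutoConfig and AutoModelForCausalLM at the custom config_molmo.py / modeling_molmo.py modules, the checkpoint has to be loaded with trust_remote_code=True. The repository id below is a placeholder.

    from transformers import AutoModelForCausalLM, AutoProcessor

    repo = "pbarker/<this-repo>"  # placeholder; substitute the actual repository name
    model = AutoModelForCausalLM.from_pretrained(
        repo,
        trust_remote_code=True,   # resolves config_molmo.MolmoConfig / modeling_molmo.MolmoForCausalLM
        torch_dtype="auto",       # config.json specifies bfloat16
    )
    processor = AutoProcessor.from_pretrained(repo, trust_remote_code=True)
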
config_molmo.py ADDED
@@ -0,0 +1,60 @@
+ from typing import List
+
+ from transformers import PretrainedConfig, AutoTokenizer
+
+
+ class MolmoConfig(PretrainedConfig):
+ model_type = "molmo"
+ keys_to_ignore_at_inference = ["past_key_values"]
+
+ def __init__(
+ self,
+ vocab_size=50304,
+ embedding_size=50304,
+ hidden_size=4096,
+ intermediate_size=11008,
+ num_hidden_layers=32,
+ num_attention_heads=32,
+ num_key_value_heads=None,
+ max_position_embeddings=2048,
+ initializer_range=0.02,
+ use_cache=True,
+ layer_norm_eps: float = 1e-5,
+ rope_theta=10000.0,
+ clip_qkv=None,
+ qkv_bias: bool = False,
+ weight_tying: bool = False,
+ use_position_ids: bool=True,
+ tie_word_embeddings: bool=True,
+ attention_layer_norm: bool=False,
+ norm_after: bool = False,
+ layer_norm_type: str="rms",
+ **kwargs,
+ ):
+ self.vocab_size = vocab_size
+ self.embedding_size = embedding_size
+ self.max_position_embeddings = max_position_embeddings
+ self.hidden_size = hidden_size
+ self.intermediate_size = intermediate_size
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.layer_norm_eps = layer_norm_eps
+ self.weight_tying = weight_tying
+ self.use_position_ids = use_position_ids
+ self.attention_layer_norm = attention_layer_norm
+ self.num_key_value_heads = num_key_value_heads
+ self.initializer_range = initializer_range
+ self.use_cache = use_cache
+ self.rope_theta = rope_theta
+ self.clip_qkv = clip_qkv
+ self.qkv_bias = qkv_bias
+ self.norm_after = norm_after
+ self.tie_word_embeddings = tie_word_embeddings
+ self.layer_norm_type = layer_norm_type
+
+ super().__init__(
+ tie_word_embeddings=tie_word_embeddings,
+ **kwargs,
+ )
+
+ MolmoConfig.register_for_auto_class()
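
For reference, an illustrative construction of this config class with the values that config.json in this commit ships (the class defaults above differ, so the relevant fields are overridden explicitly):

    config = MolmoConfig(
        vocab_size=152064,
        embedding_size=152064,
        hidden_size=3584,
        intermediate_size=37888,
        num_hidden_layers=28,
        num_attention_heads=28,
        num_key_value_heads=4,
        max_position_embeddings=4096,
        layer_norm_eps=1e-6,
        layer_norm_type="rms",
        rope_theta=1000000.0,
        qkv_bias=True,
        tie_word_embeddings=False,
    )
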
generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+ "eos_token_id": 151643,
+ "max_new_tokens": 2048,
+ "pad_token_id": 151643,
+ "transformers_version": "4.47.1"
+ }
global_step412/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:234760e1ab29f222f2af1ae5fa66e7d1ca9a91fd0a2c569d17ebf5ba4188f78e
+ size 12031542784
global_step412/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:69f3d6b1f0a357d8a10328ee4a5fd4ccd7ca23f3ed2e107b1a455a87ddde29e5
+ size 12031542784
global_step412/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6c5a05c596fa5ec381c0b0bbaadb93ea6fa0deebbb3a7eb10ec006d9a1a3ecb3
+ size 12031542784
global_step412/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:441f71e09bd1ead621e4a0c8e709aa039149eb884a97ba6812a6f33ea3b614b2
+ size 12031542784
global_step412/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7205e4226c5b4e55dbf5c6fd2cafaf53c14c0c7737b45b0d5e59231ab630f209
+ size 12031542784
global_step412/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:88329221dacbe4dc3f2d55aa1f19498d48f3dc9e8458c67670f1c7a488a7db43
+ size 12031542784
global_step412/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:58103b7810deb16532f5e06c81d05b4181baf1bb61e8ab62832c23e959d195ce
+ size 12031542784
global_step412/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5497eceffeb58e079a239af29b687b6df1f1124e08809718f03aa7648cd99c51
+ size 12031542784
global_step412/zero_pp_rank_0_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2e9ea986416373bd24e5175cebc60278e072c0b67dc0a72c18456d1bcb36d0da
+ size 328563
global_step412/zero_pp_rank_1_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8354ea1e5ccefb48cdf18bd9006343f1cdcfb637ba5368efc1041079517464d2
+ size 328563
global_step412/zero_pp_rank_2_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:50627b8a21f8fbd0072a80f576737eace787095e07ac906ef57e33ba15a675d5
+ size 328563
global_step412/zero_pp_rank_3_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:98cb72b8672ff264dc5044eb1bfdf0d9d076ce33a6c957f4b400f4133dfa6104
+ size 328563
global_step412/zero_pp_rank_4_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:61cadc80a73bb54a82b2d907853ec419a9565dbea3cdbe439d866d62a3827f6b
+ size 328563
global_step412/zero_pp_rank_5_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9b8117239a0ed06722fc8df0ce89498835f1cffd9f08cb634b5cf58f6dfcbd9b
+ size 328563
global_step412/zero_pp_rank_6_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:49bb6fa62d1627bd34748f03249b227ac633ce21cffa4b024d7f100b1f1a3a50
+ size 328563
global_step412/zero_pp_rank_7_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7aa3f91a7b354c710c864623f970f0b4bc84c3a22a81685a3d53112621f16563
+ size 328563
image_preprocessing_molmo.py ADDED
@@ -0,0 +1,546 @@
1
+ """Image processor class for Molmo"""
2
+ from typing import List, Optional, Union, Mapping
3
+
4
+ import numpy as np
5
+ import einops
6
+ import torch
7
+ import torchvision.transforms
8
+ from torchvision.transforms import InterpolationMode
9
+ from torchvision.transforms.functional import convert_image_dtype
10
+
11
+ from transformers.image_utils import (
12
+ OPENAI_CLIP_MEAN,
13
+ OPENAI_CLIP_STD,
14
+ ImageInput,
15
+ is_valid_image,
16
+ )
17
+ from transformers.processing_utils import ImagesKwargs
18
+ from transformers.image_processing_utils import BaseImageProcessor
19
+ from transformers.utils import logging
20
+
21
+
22
+ logger = logging.get_logger(__name__)
23
+
24
+
25
+ def pad_to_bounding_box(
26
+ image, offset_height, offset_width, target_height,
27
+ target_width, value=0
28
+ ):
29
+ height, width = image.shape[:2]
30
+ after_padding_width = target_width - offset_width - width
31
+ after_padding_height = target_height - offset_height - height
32
+ return np.pad(image, [
33
+ [offset_height, after_padding_height],
34
+ [offset_width, after_padding_width],
35
+ [0, 0]
36
+ ], constant_values=value)
37
+
38
+
39
+ def normalize_image(image, offset, scale):
40
+ image -= np.array(offset, dtype=np.float32)[None, None, :]
41
+ image /= np.array(scale, dtype=np.float32)[None, None, :]
42
+ return image
43
+
44
+
45
+ def resize_and_pad(
46
+ image,
47
+ desired_output_size,
48
+ resize_method="torch-bilinear",
49
+ pad_value=0,
50
+ normalize=True,
51
+ image_mean=OPENAI_CLIP_MEAN,
52
+ image_std=OPENAI_CLIP_STD,
53
+ ):
54
+ desired_height, desired_width = desired_output_size
55
+ height, width = image.shape[:2]
56
+
57
+ # Cast into float32 since the training code did this in float32 and it (very rarely) affects
58
+ # the results after rounding.
59
+ image_scale_y = np.array(desired_height, np.float32) / np.array(height, np.float32)
60
+ image_scale_x = np.array(desired_width, np.float32) / np.array(width, np.float32)
61
+ image_scale = min(image_scale_x, image_scale_y)
62
+ scaled_height = int(np.array(height, np.float32) * image_scale)
63
+ scaled_width = int(np.array(width, np.float32) * image_scale)
64
+
65
+ if resize_method == "tensorflow":
66
+ # This is how the original training code did resizing; it can produce slightly different
67
+ # results than using torch resize, so we keep it just in case
68
+ import tensorflow as tf
69
+ image = tf.image.convert_image_dtype(tf.constant(image), dtype=tf.float32)
70
+ image = tf.image.resize(
71
+ image,
72
+ [scaled_height, scaled_width],
73
+ method=tf.image.ResizeMethod.BILINEAR,
74
+ antialias=True,
75
+ )
76
+ image = tf.clip_by_value(image, 0.0, 1.0)
77
+ image = image.numpy()
78
+ elif resize_method == "torch-bilinear":
79
+ image = torch.permute(torch.from_numpy(image), [2, 0, 1])
80
+ image = convert_image_dtype(image) # resize in float32 to match the training code
81
+ image = torchvision.transforms.Resize(
82
+ [scaled_height, scaled_width], InterpolationMode.BILINEAR, antialias=True
83
+ )(image)
84
+ image = torch.clip(image, 0.0, 1.0)
85
+ image = torch.permute(image, [1, 2, 0]).numpy()
86
+ else:
87
+ raise NotImplementedError(resize_method)
88
+
89
+ top_pad = (desired_height - scaled_height) // 2
90
+ left_pad = (desired_width - scaled_width) // 2
91
+ padding = [
92
+ [top_pad, desired_height - scaled_height - top_pad],
93
+ [left_pad, desired_width - scaled_width - left_pad],
94
+ [0, 0]
95
+ ]
96
+ image_mask = np.pad(np.ones_like(image[:, :, 0], dtype=bool), padding[:2])
97
+ image = np.pad(image, padding, constant_values=pad_value)
98
+ if normalize:
99
+ image = normalize_image(image, offset=image_mean, scale=image_std)
100
+ return image, image_mask
101
+
102
+
103
+ def select_tiling(h, w, patch_size, max_num_patches):
104
+ """Decide how best to divide in image of size [w, h] in up to max_num_patches of size patch_size"""
105
+ original_size = np.stack([h, w]) # [1, 2]
106
+ original_res = h * w
107
+ tilings = []
108
+ for i in range(1, max_num_patches+1):
109
+ for j in range(1, max_num_patches+1):
110
+ if i*j <= max_num_patches:
111
+ tilings.append((i, j))
112
+ # sort so argmin and argmax favour smaller tilings in the event of a tie
113
+ tilings.sort(key=lambda x: (x[0]*x[1], x[0]))
114
+ candidate_tilings = np.array(tilings, dtype=np.int32) # [n_resolutions, 2]
115
+ candidate_resolutions = candidate_tilings * patch_size # [n_resolutions, 2]
116
+
117
+ # How much we would need to scale the image to fit exactly in each tiling
118
+ original_size = np.stack([h, w], dtype=np.float32) # [1, 2]
119
+ required_scale_d = candidate_resolutions.astype(np.float32) / original_size
120
+ required_scale = np.min(required_scale_d, axis=-1, keepdims=True) # [n_resolutions, 1]
121
+ if np.all(required_scale < 1):
122
+ # We are forced to downscale, so try to minimize the amount of downscaling
123
+ ix = np.argmax(required_scale)
124
+ else:
125
+ # Pick the resolution that required the least upscaling so that it most closely fits the image
126
+ required_scale = np.where(required_scale < 1.0, 10e9, required_scale)
127
+ ix = np.argmin(required_scale)
128
+ return candidate_tilings[ix]
129
+
130
+
131
+ class MolmoImagesKwargs(ImagesKwargs, total=False):
132
+ max_crops: Optional[int]
133
+ overlap_margins: Optional[List[int]]
134
+ base_image_input_size: Optional[List[int]]
135
+ image_token_length_w: Optional[int]
136
+ image_token_length_h: Optional[int]
137
+ image_patch_size: Optional[int]
138
+ image_padding_mask: Optional[bool]
139
+
140
+
141
+ class MolmoImageProcessor(BaseImageProcessor):
142
+ """Preprocess images and multi-model inputs"""
143
+
144
+ def __init__(
145
+ self,
146
+ max_crops: int = 12,
147
+ overlap_margins: List[int] = (4, 4),
148
+ base_image_input_size: List[int] = (336, 336),
149
+ image_token_length_w: int = 12,
150
+ image_token_length_h: int = 12,
151
+ image_patch_size: int = 14,
152
+ image_padding_mask: bool = True,
153
+ do_normalize: bool = True,
154
+ image_mean: Optional[Union[float, List[float]]] = None,
155
+ image_std: Optional[Union[float, List[float]]] = None,
156
+ **kwargs,
157
+ ):
158
+ super().__init__(**kwargs)
159
+ self.max_crops = max_crops
160
+ self.overlap_margins = overlap_margins
161
+ self.base_image_input_size = base_image_input_size
162
+ self.image_token_length_w = image_token_length_w
163
+ self.image_token_length_h = image_token_length_h
164
+ self.image_patch_size = image_patch_size
165
+ self.image_padding_mask = image_padding_mask
166
+ self.do_normalize = do_normalize
167
+ self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
168
+ self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
169
+
170
+ def image_to_patches_and_tokens(
171
+ self,
172
+ image: ImageInput,
173
+ image_patch_token_id: int,
174
+ image_col_token_id: int,
175
+ image_start_token_id: int,
176
+ image_end_token_id: int,
177
+ max_crops: Optional[int] = None,
178
+ overlap_margins: Optional[List[int]] = None,
179
+ base_image_input_size: Optional[Union[int, List[int]]] = None,
180
+ image_token_length_w: Optional[int] = None,
181
+ image_token_length_h: Optional[int] = None,
182
+ image_patch_size: Optional[int] = None,
183
+ ):
184
+ if isinstance(base_image_input_size, int):
185
+ base_image_input_size = (base_image_input_size, base_image_input_size)
186
+
187
+ base_image_input_d = image_patch_size
188
+ tokens_per_image = image_token_length_w * image_token_length_h
189
+ image_base_patch_w = base_image_input_size[1] // base_image_input_d
190
+ image_base_patch_h = base_image_input_size[0] // base_image_input_d
191
+
192
+ original_image_h, original_image_w = image.shape[:2]
193
+ crop_size = base_image_input_size[0]
194
+
195
+ # Discard this many patches from the (left/top, right/bottom) of crops
196
+ left_margin, right_margin = overlap_margins
197
+ # left_margin, right_margin = 2, 2
198
+ assert left_margin % 2 == 0 # Required for compatibility with 2x2 pooling
199
+ total_margin_pixels = base_image_input_d*(right_margin + left_margin) # pixels removed per dim
200
+ crop_patches = base_image_input_size[0] // base_image_input_d # patches per crop dim
201
+ crop_window_patches = crop_patches - (right_margin + left_margin) # usable patches
202
+ crop_window_size = crop_window_patches * base_image_input_d
203
+ tiling = select_tiling(
204
+ original_image_h - total_margin_pixels,
205
+ original_image_w - total_margin_pixels,
206
+ crop_window_size,
207
+ max_crops
208
+ )
209
+ src, img_mask = resize_and_pad(
210
+ image,
211
+ [tiling[0]*crop_window_size+total_margin_pixels, tiling[1]*crop_window_size+total_margin_pixels]
212
+ )
213
+
214
+ # Now we have to split the image into crops, while keeping track of how each patch in
215
+ # each crop should be ordered in the global image; this requires a lot of tricky bookkeeping
216
+ n_crops = tiling[0] * tiling[1]
217
+ patches_arr = []
218
+ mask_arr = []
219
+ patch_ordering_arr = []
220
+
221
+ # We assume 2x2 pooling, but can allow padding the right/bottom with extra
222
+ # patches if the number of patches per side is not even
223
+ assert (crop_patches+1)//2 == image_token_length_h
224
+ assert (crop_patches+1)//2 == image_token_length_w
225
+ on = 0
226
+ on_patch = 0
227
+ for i in range(tiling[0]):
228
+ y0 = i*crop_window_size
229
+ if i == 0:
230
+ crop_y0 = 0
231
+ else:
232
+ crop_y0 = left_margin // 2
233
+
234
+ crop_h = image_base_patch_h - (right_margin + left_margin)
235
+ if i == 0:
236
+ crop_h += left_margin
237
+ if i == (tiling[0]-1):
238
+ crop_h += right_margin
239
+ for j in range(tiling[1]):
240
+ x0 = j*crop_window_size
241
+ if j == 0:
242
+ crop_x0 = 0
243
+ else:
244
+ crop_x0 = left_margin // 2
245
+
246
+ crop_w = image_base_patch_w - (right_margin + left_margin)
247
+ if j == 0:
248
+ crop_w += left_margin
249
+ if j == (tiling[1]-1):
250
+ crop_w += right_margin
251
+
252
+ pooled_w = (crop_w + 1) // 2
253
+ pooled_h = (crop_h + 1) // 2
254
+ patch_ordering_arr.append(
255
+ pad_to_bounding_box(
256
+ np.reshape(np.arange(on, on+pooled_h*pooled_w, dtype=np.int32), (pooled_h, pooled_w, 1)),
257
+ crop_y0, crop_x0, image_token_length_h, image_token_length_w, value=-1
258
+ )[:, :, 0]
259
+ )
260
+ patches_arr.append(src[y0:y0+crop_size, x0:x0+crop_size])
261
+ mask_arr.append(img_mask[y0:y0+crop_size, x0:x0+crop_size])
262
+
263
+ on += pooled_h*pooled_w
264
+ on_patch += 1
265
+ patches = np.stack(patches_arr)
266
+ patch_ordering = np.stack(patch_ordering_arr)
267
+ img_mask = np.stack(mask_arr)
268
+
269
+ # Switch to [n_crops, n_patches, pixels_per_patch] format
270
+ image_layout_impatch_w, image_layout_impatch_h = tiling[0], tiling[1]
271
+ patches = einops.rearrange(
272
+ patches, 'p (h dh) (w dw) c -> p (h w) (dh dw c)',
273
+ dh=base_image_input_d,
274
+ dw=base_image_input_d,
275
+ h=image_base_patch_h,
276
+ w=image_base_patch_w
277
+ )
278
+ img_mask = einops.rearrange(
279
+ img_mask, 'p (h dh) (w dw) -> p (h w) (dh dw)',
280
+ dh=base_image_input_d,
281
+ dw=base_image_input_d,
282
+ h=image_base_patch_h,
283
+ w=image_base_patch_w
284
+ )
285
+
286
+ img_mask = img_mask.astype(np.float32).mean(axis=-1)
287
+ patch_ordering = np.reshape(patch_ordering, [-1])
288
+ valid = patch_ordering >= 0
289
+
290
+ # Transpose order, to get left-to-right order instead of crop-by-crop order
291
+ patch_ordering_rh = np.reshape(
292
+ patch_ordering,
293
+ [tiling[0], tiling[1], image_token_length_h, image_token_length_w]
294
+ )
295
+ patch_ordering_rh = np.transpose(patch_ordering_rh, [0, 2, 1, 3])
296
+ patch_ordering_rh = np.reshape(patch_ordering_rh, [-1])
297
+
298
+ # The transpose will screw up which patches are masked, project the
299
+ # new order into sparse structure of `patch_ordering` to fix this
300
+ patch_ordering[valid] = patch_ordering_rh[patch_ordering_rh >= 0]
301
+
302
+ # Now build the output tokens
303
+ h = tiling[0] * crop_window_patches + (right_margin+left_margin)
304
+ w = tiling[1] * crop_window_patches + (right_margin+left_margin)
305
+ per_row = np.full(
306
+ ((w+1)//2,),
307
+ image_patch_token_id,
308
+ )
309
+ per_row = np.concatenate([per_row, [image_col_token_id]], 0)
310
+
311
+ joint = np.tile(per_row, [(h+1)//2])
312
+ joint = [
313
+ [image_start_token_id],
314
+ joint,
315
+ [image_end_token_id]
316
+ ]
317
+
318
+ # Finally do the same for the global image
319
+ resized, _ = resize_and_pad(image, base_image_input_size)
320
+ resized = einops.rearrange(
321
+ resized, '(h dh) (w dw) c -> (h w) (dh dw c)',
322
+ dh=base_image_input_d,
323
+ dw=base_image_input_d,
324
+ h=image_base_patch_h,
325
+ w=image_base_patch_w
326
+ )
327
+ patches = np.concatenate([np.expand_dims(resized, 0), patches], 0)
328
+
329
+ # Global image goes first, so the order of patches in previous crops gets increased
330
+ patch_ordering = np.where(
331
+ patch_ordering >= 0,
332
+ patch_ordering + tokens_per_image,
333
+ -1
334
+ )
335
+ patch_ordering = np.concatenate([np.arange(0, tokens_per_image), patch_ordering], 0)
336
+ per_row = np.full(
337
+ (image_token_length_w,),
338
+ image_patch_token_id,
339
+ )
340
+ per_row = np.concatenate([per_row, [image_col_token_id]], 0)
341
+ extra_tokens = np.tile(per_row, [image_token_length_h])
342
+ joint = [
343
+ [image_start_token_id],
344
+ extra_tokens,
345
+ [image_end_token_id],
346
+ ] + joint
347
+
348
+ joint = np.concatenate(joint, 0)
349
+ img_mask = np.pad(img_mask, [[0, 1], [0, 0]], constant_values=-1)
350
+ return patches, joint, patch_ordering, img_mask
351
+
352
+ def build_image_input_idx(
353
+ self,
354
+ image_tokens: np.ndarray,
355
+ patch_order: np.ndarray,
356
+ image_patch_token_id: int,
357
+ no_image: Optional[bool] = None,
358
+ image_token_length_w: Optional[int] = None,
359
+ image_token_length_h: Optional[int] = None,
360
+ ):
361
+ """Converts `patch_order` into a mapping of token_id -> patch_id"""
362
+
363
+ tokens_per_image = image_token_length_w * image_token_length_h
364
+ if no_image is not None and no_image:
365
+ return np.zeros((0, tokens_per_image), np.int32)
366
+
367
+ # Indices to insert the patches
368
+ image_input_idx = image_tokens == image_patch_token_id
369
+ image_input_idx = np.nonzero(image_input_idx)[0].astype(np.int32)
370
+
371
+ if patch_order is not None:
372
+ n_tokens = image_input_idx.shape[0]
373
+ patch_order = np.reshape(patch_order, [-1])
374
+ n_patches = patch_order.shape[0]
375
+
376
+ valid = patch_order >= 0
377
+ n_valid_patches = valid.sum()
378
+ assert len(image_input_idx) == n_valid_patches
379
+
380
+ sorted_patch_ixs = np.zeros([n_tokens], np.int32)
381
+ sorted_patch_ixs[patch_order[valid]] = np.arange(n_valid_patches, dtype=np.int32)
382
+
383
+ # Project the inverted mapping into same sparse structure
384
+ sorted_patch_ixs_ex = np.full(np.shape(patch_order), -1)
385
+ sorted_patch_ixs_ex[valid] = sorted_patch_ixs
386
+
387
+ # Do the gather and then re-mask outputs that were masked in `sorted_patch_ixs`
388
+ valid = (sorted_patch_ixs_ex >= 0).astype(np.int32)
389
+ image_input_idx = image_input_idx[sorted_patch_ixs_ex*valid]
390
+ image_input_idx = image_input_idx*valid - 100*(1 - valid)
391
+ image_input_idx = np.reshape(image_input_idx, [-1, tokens_per_image])
392
+ return image_input_idx
393
+
394
+ def preprocess(
395
+ self,
396
+ image: np.ndarray,
397
+ image_patch_token_id: int,
398
+ image_col_token_id: int,
399
+ image_start_token_id: int,
400
+ image_end_token_id: int,
401
+ max_crops: Optional[int] = None,
402
+ overlap_margins: Optional[List[int]] = None,
403
+ base_image_input_size: Optional[Union[int, List[int]]] = None,
404
+ image_token_length_w: Optional[int] = None,
405
+ image_token_length_h: Optional[int] = None,
406
+ image_patch_size: Optional[int] = None,
407
+ **kwargs,
408
+ ):
409
+ """Preprocesses an image
410
+
411
+ Returns:
412
+ crops: (n_crops, n_patches, patch_dim) individual crops, `n_crops` might
413
+ change between images but the other dimensions are fixed
414
+ tokens: (n_tokens,) int32 tokens, pad tokens indicate where to insert the
415
+ patch features, might include other special tokens as well
416
+ image_idx: (n_crops, n_patches) index in `tokens` to put the patch features from the
417
+ crops after pooling, negative values indicate patch features to exclude
418
+ padding_mask: (n_crops, n_patches) what percent of each crop is padding, can be None
419
+ if the image mask is not being used.
420
+ """
421
+
422
+ max_crops = max_crops or self.max_crops
423
+ overlap_margins = overlap_margins or self.overlap_margins
424
+ base_image_input_size = base_image_input_size or self.base_image_input_size
425
+ image_token_length_w = image_token_length_w or self.image_token_length_w
426
+ image_token_length_h = image_token_length_h or self.image_token_length_h
427
+ image_patch_size = image_patch_size or self.image_patch_size
428
+
429
+ crops, image_tokens, patch_ordering, img_mask = self.image_to_patches_and_tokens(
430
+ image,
431
+ image_patch_token_id,
432
+ image_col_token_id,
433
+ image_start_token_id,
434
+ image_end_token_id,
435
+ max_crops,
436
+ overlap_margins,
437
+ base_image_input_size,
438
+ image_token_length_w,
439
+ image_token_length_h,
440
+ image_patch_size,
441
+ )
442
+ patch_idx = self.build_image_input_idx(
443
+ image_tokens,
444
+ patch_ordering,
445
+ image_patch_token_id,
446
+ image_token_length_w=image_token_length_w,
447
+ image_token_length_h=image_token_length_h,
448
+ )
449
+ return crops, image_tokens, patch_idx, img_mask
450
+
451
+ def multimodal_preprocess(
452
+ self,
453
+ images: np.ndarray,
454
+ tokens: List[int],
455
+ image_idx: np.ndarray,
456
+ sequence_length: int,
457
+ image_patch_token_id: int,
458
+ image_col_token_id: int,
459
+ image_start_token_id: int,
460
+ image_end_token_id: int,
461
+ **kwargs,
462
+ ):
463
+ """Merge images and text tokens into multi-modal features for the model
464
+
465
+ :param images: images to use as input
466
+ :param tokens: input text tokens
467
+ :param image_idx: where to insert the images into `tokens`
468
+ :params image_patch_token_id: id to use of tokens that will contain image features
469
+ :params image_col_token_id: token id for image column special tokens
470
+ :params image_start_token_id: token id for image start special tokens
471
+ :params image_end_token_id: token id for image end special tokens
472
+ :params kwargs: override preprocessor default args
473
+ """
474
+ max_total_crops = kwargs.get("max_crops") or self.max_crops
475
+ image_token_length_w = kwargs.get("image_token_length_w") or self.image_token_length_w
476
+ image_token_length_h = kwargs.get("image_token_length_h") or self.image_token_length_h
477
+ image_patch_size = kwargs.get("image_patch_size") or self.image_patch_size
478
+ base_image_input_size = kwargs.get("base_image_input_size") or self.base_image_input_size
479
+ image_num_patch = (
480
+ base_image_input_size[0] // image_patch_size,
481
+ base_image_input_size[1] // image_patch_size,
482
+ )
483
+ image_padding_mask = kwargs.get("image_padding_mask") or self.image_padding_mask
484
+
485
+ tokens_per_image = image_token_length_w * image_token_length_h
486
+ n_pixels = image_patch_size * image_patch_size * 3
487
+ n_patches = image_num_patch[0] * image_num_patch[1]
488
+
489
+ if images is None:
490
+ return {
491
+ "input_ids": tokens,
492
+ }
493
+ else:
494
+ n = len(images)
495
+ all_crops = []
496
+ all_image_idx = []
497
+ out_tokens = []
498
+ all_crop_masks = []
499
+
500
+ for ix in range(n):
501
+ token_ix = image_idx[ix]
502
+ crops, image_tokens, patch_idx, img_mask = self.preprocess(
503
+ images[ix],
504
+ image_patch_token_id,
505
+ image_col_token_id,
506
+ image_start_token_id,
507
+ image_end_token_id,
508
+ **kwargs,
509
+ )
510
+
511
+ if token_ix == -1: # -1 is an image inserted at the very start
512
+ start = 0
513
+ token_ix = 0
514
+ end = 0
515
+ else:
516
+ start = 0 if ix == 0 else image_idx[ix-1] + 1
517
+ end = token_ix + 1
518
+
519
+ all_image_idx.append(patch_idx + token_ix)
520
+ all_crops.append(crops)
521
+ out_tokens.append(tokens[start:token_ix])
522
+ out_tokens.append(image_tokens)
523
+ if ix == (n - 1):
524
+ out_tokens.append(tokens[end:])
525
+ if image_padding_mask:
526
+ all_crop_masks.append(img_mask)
527
+
528
+ input_ids = np.concatenate(out_tokens, 0)
529
+ images = np.concatenate(all_crops, 0)
530
+ image_input_idx = np.concatenate(all_image_idx, 0)
531
+ if image_padding_mask:
532
+ image_masks = np.concatenate(all_crop_masks, 0)
533
+ else:
534
+ image_masks = None
535
+
536
+ out = {
537
+ "input_ids": input_ids,
538
+ "images": images,
539
+ "image_input_idx": image_input_idx
540
+ }
541
+ if image_masks is not None:
542
+ out["image_masks"] = image_masks
543
+ return out
544
+
545
+
546
+ MolmoImageProcessor.register_for_auto_class()
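
A rough usage sketch of the processor above (illustrative; assumes the module and its numpy/torch/torchvision/einops dependencies are importable; the token ids are the Molmo special tokens from added_tokens.json):

    import numpy as np

    proc = MolmoImageProcessor()                     # defaults: 336x336 crops, 14px patches, up to 12 crops
    image = np.zeros((480, 640, 3), dtype=np.uint8)  # any HxWx3 image
    crops, image_tokens, patch_idx, img_mask = proc.preprocess(
        image,
        image_patch_token_id=152066,  # <im_patch>
        image_col_token_id=152067,    # <im_col>
        image_start_token_id=152064,  # <im_start>
        image_end_token_id=152065,    # <im_end>
    )
    # crops: (n_crops, 576, 588) flattened 14x14x3 patches (global view first);
    # image_tokens hold the placeholder special tokens, and patch_idx says where the
    # pooled patch features get spliced into the text sequence
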
latest ADDED
@@ -0,0 +1 @@
+ global_step412
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a64680d5cfae52d6e47f4b5457ba03badc95879615c052ecd7ed6379cce51397
+ size 4981346544
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:89430b3d75c77933d8124e7286e936cc69b74d99bff38f35baa7056fafdd90f6
+ size 4991475304
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:54e51ce161091a621052e11bb9d274636dcda751d8a300ec22de26cef998a253
+ size 4169357528
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:14f6cbc2ec98dd95fb05cde73a27887b6376cd060dc4a460f8f64bd45cb3d90d
+ size 1899952568
model.safetensors.index.json ADDED
@@ -0,0 +1,592 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 16042050560
4
+ },
5
+ "weight_map": {
6
+ "model.transformer.blocks.0.att_proj.bias": "model-00001-of-00004.safetensors",
7
+ "model.transformer.blocks.0.att_proj.weight": "model-00001-of-00004.safetensors",
8
+ "model.transformer.blocks.0.attn_norm.weight": "model-00001-of-00004.safetensors",
9
+ "model.transformer.blocks.0.attn_out.weight": "model-00001-of-00004.safetensors",
10
+ "model.transformer.blocks.0.ff_norm.weight": "model-00001-of-00004.safetensors",
11
+ "model.transformer.blocks.0.ff_out.weight": "model-00001-of-00004.safetensors",
12
+ "model.transformer.blocks.0.ff_proj.weight": "model-00001-of-00004.safetensors",
13
+ "model.transformer.blocks.1.att_proj.bias": "model-00001-of-00004.safetensors",
14
+ "model.transformer.blocks.1.att_proj.weight": "model-00001-of-00004.safetensors",
15
+ "model.transformer.blocks.1.attn_norm.weight": "model-00001-of-00004.safetensors",
16
+ "model.transformer.blocks.1.attn_out.weight": "model-00001-of-00004.safetensors",
17
+ "model.transformer.blocks.1.ff_norm.weight": "model-00001-of-00004.safetensors",
18
+ "model.transformer.blocks.1.ff_out.weight": "model-00001-of-00004.safetensors",
19
+ "model.transformer.blocks.1.ff_proj.weight": "model-00001-of-00004.safetensors",
20
+ "model.transformer.blocks.10.att_proj.bias": "model-00002-of-00004.safetensors",
21
+ "model.transformer.blocks.10.att_proj.weight": "model-00002-of-00004.safetensors",
22
+ "model.transformer.blocks.10.attn_norm.weight": "model-00002-of-00004.safetensors",
23
+ "model.transformer.blocks.10.attn_out.weight": "model-00002-of-00004.safetensors",
24
+ "model.transformer.blocks.10.ff_norm.weight": "model-00002-of-00004.safetensors",
25
+ "model.transformer.blocks.10.ff_out.weight": "model-00002-of-00004.safetensors",
26
+ "model.transformer.blocks.10.ff_proj.weight": "model-00002-of-00004.safetensors",
27
+ "model.transformer.blocks.11.att_proj.bias": "model-00002-of-00004.safetensors",
28
+ "model.transformer.blocks.11.att_proj.weight": "model-00002-of-00004.safetensors",
29
+ "model.transformer.blocks.11.attn_norm.weight": "model-00002-of-00004.safetensors",
30
+ "model.transformer.blocks.11.attn_out.weight": "model-00002-of-00004.safetensors",
31
+ "model.transformer.blocks.11.ff_norm.weight": "model-00002-of-00004.safetensors",
32
+ "model.transformer.blocks.11.ff_out.weight": "model-00002-of-00004.safetensors",
33
+ "model.transformer.blocks.11.ff_proj.weight": "model-00002-of-00004.safetensors",
34
+ "model.transformer.blocks.12.att_proj.bias": "model-00002-of-00004.safetensors",
35
+ "model.transformer.blocks.12.att_proj.weight": "model-00002-of-00004.safetensors",
36
+ "model.transformer.blocks.12.attn_norm.weight": "model-00002-of-00004.safetensors",
37
+ "model.transformer.blocks.12.attn_out.weight": "model-00002-of-00004.safetensors",
38
+ "model.transformer.blocks.12.ff_norm.weight": "model-00002-of-00004.safetensors",
39
+ "model.transformer.blocks.12.ff_out.weight": "model-00002-of-00004.safetensors",
40
+ "model.transformer.blocks.12.ff_proj.weight": "model-00002-of-00004.safetensors",
41
+ "model.transformer.blocks.13.att_proj.bias": "model-00002-of-00004.safetensors",
42
+ "model.transformer.blocks.13.att_proj.weight": "model-00002-of-00004.safetensors",
43
+ "model.transformer.blocks.13.attn_norm.weight": "model-00002-of-00004.safetensors",
44
+ "model.transformer.blocks.13.attn_out.weight": "model-00002-of-00004.safetensors",
45
+ "model.transformer.blocks.13.ff_norm.weight": "model-00002-of-00004.safetensors",
46
+ "model.transformer.blocks.13.ff_out.weight": "model-00002-of-00004.safetensors",
47
+ "model.transformer.blocks.13.ff_proj.weight": "model-00002-of-00004.safetensors",
48
+ "model.transformer.blocks.14.att_proj.bias": "model-00002-of-00004.safetensors",
49
+ "model.transformer.blocks.14.att_proj.weight": "model-00002-of-00004.safetensors",
50
+ "model.transformer.blocks.14.attn_norm.weight": "model-00002-of-00004.safetensors",
51
+ "model.transformer.blocks.14.attn_out.weight": "model-00002-of-00004.safetensors",
52
+ "model.transformer.blocks.14.ff_norm.weight": "model-00002-of-00004.safetensors",
53
+ "model.transformer.blocks.14.ff_out.weight": "model-00002-of-00004.safetensors",
54
+ "model.transformer.blocks.14.ff_proj.weight": "model-00002-of-00004.safetensors",
55
+ "model.transformer.blocks.15.att_proj.bias": "model-00002-of-00004.safetensors",
56
+ "model.transformer.blocks.15.att_proj.weight": "model-00002-of-00004.safetensors",
57
+ "model.transformer.blocks.15.attn_norm.weight": "model-00002-of-00004.safetensors",
58
+ "model.transformer.blocks.15.attn_out.weight": "model-00002-of-00004.safetensors",
59
+ "model.transformer.blocks.15.ff_norm.weight": "model-00002-of-00004.safetensors",
60
+ "model.transformer.blocks.15.ff_out.weight": "model-00002-of-00004.safetensors",
61
+ "model.transformer.blocks.15.ff_proj.weight": "model-00002-of-00004.safetensors",
62
+ "model.transformer.blocks.16.att_proj.bias": "model-00002-of-00004.safetensors",
63
+ "model.transformer.blocks.16.att_proj.weight": "model-00002-of-00004.safetensors",
64
+ "model.transformer.blocks.16.attn_norm.weight": "model-00002-of-00004.safetensors",
65
+ "model.transformer.blocks.16.attn_out.weight": "model-00002-of-00004.safetensors",
66
+ "model.transformer.blocks.16.ff_norm.weight": "model-00002-of-00004.safetensors",
67
+ "model.transformer.blocks.16.ff_out.weight": "model-00002-of-00004.safetensors",
68
+ "model.transformer.blocks.16.ff_proj.weight": "model-00002-of-00004.safetensors",
69
+ "model.transformer.blocks.17.att_proj.bias": "model-00002-of-00004.safetensors",
70
+ "model.transformer.blocks.17.att_proj.weight": "model-00002-of-00004.safetensors",
71
+ "model.transformer.blocks.17.attn_norm.weight": "model-00002-of-00004.safetensors",
72
+ "model.transformer.blocks.17.attn_out.weight": "model-00002-of-00004.safetensors",
73
+ "model.transformer.blocks.17.ff_norm.weight": "model-00002-of-00004.safetensors",
74
+ "model.transformer.blocks.17.ff_out.weight": "model-00002-of-00004.safetensors",
75
+ "model.transformer.blocks.17.ff_proj.weight": "model-00002-of-00004.safetensors",
76
+ "model.transformer.blocks.18.att_proj.bias": "model-00002-of-00004.safetensors",
77
+ "model.transformer.blocks.18.att_proj.weight": "model-00002-of-00004.safetensors",
78
+ "model.transformer.blocks.18.attn_norm.weight": "model-00002-of-00004.safetensors",
79
+ "model.transformer.blocks.18.attn_out.weight": "model-00002-of-00004.safetensors",
80
+ "model.transformer.blocks.18.ff_norm.weight": "model-00002-of-00004.safetensors",
81
+ "model.transformer.blocks.18.ff_out.weight": "model-00002-of-00004.safetensors",
82
+ "model.transformer.blocks.18.ff_proj.weight": "model-00002-of-00004.safetensors",
83
+ "model.transformer.blocks.19.att_proj.bias": "model-00003-of-00004.safetensors",
84
+ "model.transformer.blocks.19.att_proj.weight": "model-00003-of-00004.safetensors",
85
+ "model.transformer.blocks.19.attn_norm.weight": "model-00003-of-00004.safetensors",
86
+ "model.transformer.blocks.19.attn_out.weight": "model-00002-of-00004.safetensors",
87
+ "model.transformer.blocks.19.ff_norm.weight": "model-00003-of-00004.safetensors",
88
+ "model.transformer.blocks.19.ff_out.weight": "model-00003-of-00004.safetensors",
89
+ "model.transformer.blocks.19.ff_proj.weight": "model-00003-of-00004.safetensors",
90
+ "model.transformer.blocks.2.att_proj.bias": "model-00001-of-00004.safetensors",
91
+ "model.transformer.blocks.2.att_proj.weight": "model-00001-of-00004.safetensors",
92
+ "model.transformer.blocks.2.attn_norm.weight": "model-00001-of-00004.safetensors",
93
+ "model.transformer.blocks.2.attn_out.weight": "model-00001-of-00004.safetensors",
94
+ "model.transformer.blocks.2.ff_norm.weight": "model-00001-of-00004.safetensors",
95
+ "model.transformer.blocks.2.ff_out.weight": "model-00001-of-00004.safetensors",
96
+ "model.transformer.blocks.2.ff_proj.weight": "model-00001-of-00004.safetensors",
97
+ "model.transformer.blocks.20.att_proj.bias": "model-00003-of-00004.safetensors",
98
+ "model.transformer.blocks.20.att_proj.weight": "model-00003-of-00004.safetensors",
99
+ "model.transformer.blocks.20.attn_norm.weight": "model-00003-of-00004.safetensors",
100
+ "model.transformer.blocks.20.attn_out.weight": "model-00003-of-00004.safetensors",
101
+ "model.transformer.blocks.20.ff_norm.weight": "model-00003-of-00004.safetensors",
102
+ "model.transformer.blocks.20.ff_out.weight": "model-00003-of-00004.safetensors",
103
+ "model.transformer.blocks.20.ff_proj.weight": "model-00003-of-00004.safetensors",
104
+ "model.transformer.blocks.21.att_proj.bias": "model-00003-of-00004.safetensors",
105
+ "model.transformer.blocks.21.att_proj.weight": "model-00003-of-00004.safetensors",
106
+ "model.transformer.blocks.21.attn_norm.weight": "model-00003-of-00004.safetensors",
107
+ "model.transformer.blocks.21.attn_out.weight": "model-00003-of-00004.safetensors",
108
+ "model.transformer.blocks.21.ff_norm.weight": "model-00003-of-00004.safetensors",
109
+ "model.transformer.blocks.21.ff_out.weight": "model-00003-of-00004.safetensors",
110
+ "model.transformer.blocks.21.ff_proj.weight": "model-00003-of-00004.safetensors",
111
+ "model.transformer.blocks.22.att_proj.bias": "model-00003-of-00004.safetensors",
112
+ "model.transformer.blocks.22.att_proj.weight": "model-00003-of-00004.safetensors",
113
+ "model.transformer.blocks.22.attn_norm.weight": "model-00003-of-00004.safetensors",
114
+ "model.transformer.blocks.22.attn_out.weight": "model-00003-of-00004.safetensors",
115
+ "model.transformer.blocks.22.ff_norm.weight": "model-00003-of-00004.safetensors",
116
+ "model.transformer.blocks.22.ff_out.weight": "model-00003-of-00004.safetensors",
117
+ "model.transformer.blocks.22.ff_proj.weight": "model-00003-of-00004.safetensors",
118
+ "model.transformer.blocks.23.att_proj.bias": "model-00003-of-00004.safetensors",
119
+ "model.transformer.blocks.23.att_proj.weight": "model-00003-of-00004.safetensors",
120
+ "model.transformer.blocks.23.attn_norm.weight": "model-00003-of-00004.safetensors",
121
+ "model.transformer.blocks.23.attn_out.weight": "model-00003-of-00004.safetensors",
122
+ "model.transformer.blocks.23.ff_norm.weight": "model-00003-of-00004.safetensors",
123
+ "model.transformer.blocks.23.ff_out.weight": "model-00003-of-00004.safetensors",
124
+ "model.transformer.blocks.23.ff_proj.weight": "model-00003-of-00004.safetensors",
125
+ "model.transformer.blocks.24.att_proj.bias": "model-00003-of-00004.safetensors",
126
+ "model.transformer.blocks.24.att_proj.weight": "model-00003-of-00004.safetensors",
127
+ "model.transformer.blocks.24.attn_norm.weight": "model-00003-of-00004.safetensors",
128
+ "model.transformer.blocks.24.attn_out.weight": "model-00003-of-00004.safetensors",
129
+ "model.transformer.blocks.24.ff_norm.weight": "model-00003-of-00004.safetensors",
130
+ "model.transformer.blocks.24.ff_out.weight": "model-00003-of-00004.safetensors",
131
+ "model.transformer.blocks.24.ff_proj.weight": "model-00003-of-00004.safetensors",
132
+ "model.transformer.blocks.25.att_proj.bias": "model-00003-of-00004.safetensors",
133
+ "model.transformer.blocks.25.att_proj.weight": "model-00003-of-00004.safetensors",
134
+ "model.transformer.blocks.25.attn_norm.weight": "model-00003-of-00004.safetensors",
135
+ "model.transformer.blocks.25.attn_out.weight": "model-00003-of-00004.safetensors",
136
+ "model.transformer.blocks.25.ff_norm.weight": "model-00003-of-00004.safetensors",
137
+ "model.transformer.blocks.25.ff_out.weight": "model-00003-of-00004.safetensors",
138
+ "model.transformer.blocks.25.ff_proj.weight": "model-00003-of-00004.safetensors",
139
+ "model.transformer.blocks.26.att_proj.bias": "model-00003-of-00004.safetensors",
140
+ "model.transformer.blocks.26.att_proj.weight": "model-00003-of-00004.safetensors",
141
+ "model.transformer.blocks.26.attn_norm.weight": "model-00003-of-00004.safetensors",
142
+ "model.transformer.blocks.26.attn_out.weight": "model-00003-of-00004.safetensors",
143
+ "model.transformer.blocks.26.ff_norm.weight": "model-00003-of-00004.safetensors",
144
+ "model.transformer.blocks.26.ff_out.weight": "model-00003-of-00004.safetensors",
145
+ "model.transformer.blocks.26.ff_proj.weight": "model-00003-of-00004.safetensors",
146
+ "model.transformer.blocks.27.att_proj.bias": "model-00003-of-00004.safetensors",
147
+ "model.transformer.blocks.27.att_proj.weight": "model-00003-of-00004.safetensors",
148
+ "model.transformer.blocks.27.attn_norm.weight": "model-00003-of-00004.safetensors",
149
+ "model.transformer.blocks.27.attn_out.weight": "model-00003-of-00004.safetensors",
150
+ "model.transformer.blocks.27.ff_norm.weight": "model-00003-of-00004.safetensors",
151
+ "model.transformer.blocks.27.ff_out.weight": "model-00003-of-00004.safetensors",
152
+ "model.transformer.blocks.27.ff_proj.weight": "model-00003-of-00004.safetensors",
153
+ "model.transformer.blocks.3.att_proj.bias": "model-00001-of-00004.safetensors",
154
+ "model.transformer.blocks.3.att_proj.weight": "model-00001-of-00004.safetensors",
155
+ "model.transformer.blocks.3.attn_norm.weight": "model-00001-of-00004.safetensors",
156
+ "model.transformer.blocks.3.attn_out.weight": "model-00001-of-00004.safetensors",
157
+ "model.transformer.blocks.3.ff_norm.weight": "model-00001-of-00004.safetensors",
158
+ "model.transformer.blocks.3.ff_out.weight": "model-00001-of-00004.safetensors",
159
+ "model.transformer.blocks.3.ff_proj.weight": "model-00001-of-00004.safetensors",
160
+ "model.transformer.blocks.4.att_proj.bias": "model-00001-of-00004.safetensors",
161
+ "model.transformer.blocks.4.att_proj.weight": "model-00001-of-00004.safetensors",
162
+ "model.transformer.blocks.4.attn_norm.weight": "model-00001-of-00004.safetensors",
163
+ "model.transformer.blocks.4.attn_out.weight": "model-00001-of-00004.safetensors",
164
+ "model.transformer.blocks.4.ff_norm.weight": "model-00001-of-00004.safetensors",
165
+ "model.transformer.blocks.4.ff_out.weight": "model-00001-of-00004.safetensors",
166
+ "model.transformer.blocks.4.ff_proj.weight": "model-00001-of-00004.safetensors",
167
+ "model.transformer.blocks.5.att_proj.bias": "model-00001-of-00004.safetensors",
168
+ "model.transformer.blocks.5.att_proj.weight": "model-00001-of-00004.safetensors",
169
+ "model.transformer.blocks.5.attn_norm.weight": "model-00001-of-00004.safetensors",
170
+ "model.transformer.blocks.5.attn_out.weight": "model-00001-of-00004.safetensors",
171
+ "model.transformer.blocks.5.ff_norm.weight": "model-00001-of-00004.safetensors",
172
+ "model.transformer.blocks.5.ff_out.weight": "model-00001-of-00004.safetensors",
173
+ "model.transformer.blocks.5.ff_proj.weight": "model-00001-of-00004.safetensors",
174
+ "model.transformer.blocks.6.att_proj.bias": "model-00001-of-00004.safetensors",
175
+ "model.transformer.blocks.6.att_proj.weight": "model-00001-of-00004.safetensors",
176
+ "model.transformer.blocks.6.attn_norm.weight": "model-00001-of-00004.safetensors",
177
+ "model.transformer.blocks.6.attn_out.weight": "model-00001-of-00004.safetensors",
178
+ "model.transformer.blocks.6.ff_norm.weight": "model-00001-of-00004.safetensors",
179
+ "model.transformer.blocks.6.ff_out.weight": "model-00001-of-00004.safetensors",
180
+ "model.transformer.blocks.6.ff_proj.weight": "model-00001-of-00004.safetensors",
181
+ "model.transformer.blocks.7.att_proj.bias": "model-00001-of-00004.safetensors",
182
+ "model.transformer.blocks.7.att_proj.weight": "model-00001-of-00004.safetensors",
183
+ "model.transformer.blocks.7.attn_norm.weight": "model-00001-of-00004.safetensors",
184
+ "model.transformer.blocks.7.attn_out.weight": "model-00001-of-00004.safetensors",
185
+ "model.transformer.blocks.7.ff_norm.weight": "model-00001-of-00004.safetensors",
186
+ "model.transformer.blocks.7.ff_out.weight": "model-00001-of-00004.safetensors",
187
+ "model.transformer.blocks.7.ff_proj.weight": "model-00001-of-00004.safetensors",
188
+ "model.transformer.blocks.8.att_proj.bias": "model-00002-of-00004.safetensors",
189
+ "model.transformer.blocks.8.att_proj.weight": "model-00002-of-00004.safetensors",
190
+ "model.transformer.blocks.8.attn_norm.weight": "model-00001-of-00004.safetensors",
191
+ "model.transformer.blocks.8.attn_out.weight": "model-00001-of-00004.safetensors",
192
+ "model.transformer.blocks.8.ff_norm.weight": "model-00001-of-00004.safetensors",
193
+ "model.transformer.blocks.8.ff_out.weight": "model-00001-of-00004.safetensors",
194
+ "model.transformer.blocks.8.ff_proj.weight": "model-00002-of-00004.safetensors",
195
+ "model.transformer.blocks.9.att_proj.bias": "model-00002-of-00004.safetensors",
196
+ "model.transformer.blocks.9.att_proj.weight": "model-00002-of-00004.safetensors",
197
+ "model.transformer.blocks.9.attn_norm.weight": "model-00002-of-00004.safetensors",
198
+ "model.transformer.blocks.9.attn_out.weight": "model-00002-of-00004.safetensors",
199
+ "model.transformer.blocks.9.ff_norm.weight": "model-00002-of-00004.safetensors",
200
+ "model.transformer.blocks.9.ff_out.weight": "model-00002-of-00004.safetensors",
201
+ "model.transformer.blocks.9.ff_proj.weight": "model-00002-of-00004.safetensors",
202
+ "model.transformer.ff_out.weight": "model-00004-of-00004.safetensors",
203
+ "model.transformer.ln_f.weight": "model-00001-of-00004.safetensors",
204
+ "model.transformer.wte.embedding": "model-00001-of-00004.safetensors",
205
+ "model.transformer.wte.new_embedding": "model-00001-of-00004.safetensors",
206
+ "model.vision_backbone.image_pooling_2d.wk.bias": "model-00004-of-00004.safetensors",
207
+ "model.vision_backbone.image_pooling_2d.wk.weight": "model-00004-of-00004.safetensors",
208
+ "model.vision_backbone.image_pooling_2d.wo.bias": "model-00004-of-00004.safetensors",
209
+ "model.vision_backbone.image_pooling_2d.wo.weight": "model-00004-of-00004.safetensors",
210
+ "model.vision_backbone.image_pooling_2d.wq.bias": "model-00004-of-00004.safetensors",
211
+ "model.vision_backbone.image_pooling_2d.wq.weight": "model-00004-of-00004.safetensors",
212
+ "model.vision_backbone.image_pooling_2d.wv.bias": "model-00004-of-00004.safetensors",
213
+ "model.vision_backbone.image_pooling_2d.wv.weight": "model-00004-of-00004.safetensors",
214
+ "model.vision_backbone.image_projector.w1.weight": "model-00004-of-00004.safetensors",
215
+ "model.vision_backbone.image_projector.w2.weight": "model-00004-of-00004.safetensors",
216
+ "model.vision_backbone.image_projector.w3.weight": "model-00004-of-00004.safetensors",
217
+ "model.vision_backbone.image_vit.class_embedding": "model-00004-of-00004.safetensors",
218
+ "model.vision_backbone.image_vit.patch_embedding.weight": "model-00004-of-00004.safetensors",
219
+ "model.vision_backbone.image_vit.positional_embedding": "model-00004-of-00004.safetensors",
220
+ "model.vision_backbone.image_vit.pre_ln.bias": "model-00004-of-00004.safetensors",
221
+ "model.vision_backbone.image_vit.pre_ln.weight": "model-00004-of-00004.safetensors",
222
+ "model.vision_backbone.image_vit.transformer.resblocks.0.attention.wk.bias": "model-00004-of-00004.safetensors",
223
+ "model.vision_backbone.image_vit.transformer.resblocks.0.attention.wk.weight": "model-00004-of-00004.safetensors",
224
+ "model.vision_backbone.image_vit.transformer.resblocks.0.attention.wo.bias": "model-00004-of-00004.safetensors",
225
+ "model.vision_backbone.image_vit.transformer.resblocks.0.attention.wo.weight": "model-00004-of-00004.safetensors",
226
+ "model.vision_backbone.image_vit.transformer.resblocks.0.attention.wq.bias": "model-00004-of-00004.safetensors",
227
+ "model.vision_backbone.image_vit.transformer.resblocks.0.attention.wq.weight": "model-00004-of-00004.safetensors",
228
+ "model.vision_backbone.image_vit.transformer.resblocks.0.attention.wv.bias": "model-00004-of-00004.safetensors",
229
+ "model.vision_backbone.image_vit.transformer.resblocks.0.attention.wv.weight": "model-00004-of-00004.safetensors",
230
+ "model.vision_backbone.image_vit.transformer.resblocks.0.attention_norm.bias": "model-00004-of-00004.safetensors",
231
+ "model.vision_backbone.image_vit.transformer.resblocks.0.attention_norm.weight": "model-00004-of-00004.safetensors",
232
+ "model.vision_backbone.image_vit.transformer.resblocks.0.feed_forward.w1.bias": "model-00004-of-00004.safetensors",
233
+ "model.vision_backbone.image_vit.transformer.resblocks.0.feed_forward.w1.weight": "model-00004-of-00004.safetensors",
234
+ "model.vision_backbone.image_vit.transformer.resblocks.0.feed_forward.w2.bias": "model-00004-of-00004.safetensors",
235
+ "model.vision_backbone.image_vit.transformer.resblocks.0.feed_forward.w2.weight": "model-00004-of-00004.safetensors",
236
+ "model.vision_backbone.image_vit.transformer.resblocks.0.ffn_norm.bias": "model-00004-of-00004.safetensors",
237
+ "model.vision_backbone.image_vit.transformer.resblocks.0.ffn_norm.weight": "model-00004-of-00004.safetensors",
238
+ "model.vision_backbone.image_vit.transformer.resblocks.1.attention.wk.bias": "model-00004-of-00004.safetensors",
239
+ "model.vision_backbone.image_vit.transformer.resblocks.1.attention.wk.weight": "model-00004-of-00004.safetensors",
240
+ "model.vision_backbone.image_vit.transformer.resblocks.1.attention.wo.bias": "model-00004-of-00004.safetensors",
241
+ "model.vision_backbone.image_vit.transformer.resblocks.1.attention.wo.weight": "model-00004-of-00004.safetensors",
242
+ "model.vision_backbone.image_vit.transformer.resblocks.1.attention.wq.bias": "model-00004-of-00004.safetensors",
243
+ "model.vision_backbone.image_vit.transformer.resblocks.1.attention.wq.weight": "model-00004-of-00004.safetensors",
244
+ "model.vision_backbone.image_vit.transformer.resblocks.1.attention.wv.bias": "model-00004-of-00004.safetensors",
245
+ "model.vision_backbone.image_vit.transformer.resblocks.1.attention.wv.weight": "model-00004-of-00004.safetensors",
246
+ "model.vision_backbone.image_vit.transformer.resblocks.1.attention_norm.bias": "model-00004-of-00004.safetensors",
247
+ "model.vision_backbone.image_vit.transformer.resblocks.1.attention_norm.weight": "model-00004-of-00004.safetensors",
248
+ "model.vision_backbone.image_vit.transformer.resblocks.1.feed_forward.w1.bias": "model-00004-of-00004.safetensors",
249
+ "model.vision_backbone.image_vit.transformer.resblocks.1.feed_forward.w1.weight": "model-00004-of-00004.safetensors",
250
+ "model.vision_backbone.image_vit.transformer.resblocks.1.feed_forward.w2.bias": "model-00004-of-00004.safetensors",
251
+ "model.vision_backbone.image_vit.transformer.resblocks.1.feed_forward.w2.weight": "model-00004-of-00004.safetensors",
252
+ "model.vision_backbone.image_vit.transformer.resblocks.1.ffn_norm.bias": "model-00004-of-00004.safetensors",
253
+ "model.vision_backbone.image_vit.transformer.resblocks.1.ffn_norm.weight": "model-00004-of-00004.safetensors",
254
+ "model.vision_backbone.image_vit.transformer.resblocks.10.attention.wk.bias": "model-00004-of-00004.safetensors",
255
+ "model.vision_backbone.image_vit.transformer.resblocks.10.attention.wk.weight": "model-00004-of-00004.safetensors",
256
+ "model.vision_backbone.image_vit.transformer.resblocks.10.attention.wo.bias": "model-00004-of-00004.safetensors",
257
+ "model.vision_backbone.image_vit.transformer.resblocks.10.attention.wo.weight": "model-00004-of-00004.safetensors",
258
+ "model.vision_backbone.image_vit.transformer.resblocks.10.attention.wq.bias": "model-00004-of-00004.safetensors",
259
+ "model.vision_backbone.image_vit.transformer.resblocks.10.attention.wq.weight": "model-00004-of-00004.safetensors",
260
+ "model.vision_backbone.image_vit.transformer.resblocks.10.attention.wv.bias": "model-00004-of-00004.safetensors",
261
+ "model.vision_backbone.image_vit.transformer.resblocks.10.attention.wv.weight": "model-00004-of-00004.safetensors",
262
+ "model.vision_backbone.image_vit.transformer.resblocks.10.attention_norm.bias": "model-00004-of-00004.safetensors",
263
+ "model.vision_backbone.image_vit.transformer.resblocks.10.attention_norm.weight": "model-00004-of-00004.safetensors",
264
+ "model.vision_backbone.image_vit.transformer.resblocks.10.feed_forward.w1.bias": "model-00004-of-00004.safetensors",
265
+ "model.vision_backbone.image_vit.transformer.resblocks.10.feed_forward.w1.weight": "model-00004-of-00004.safetensors",
266
+ "model.vision_backbone.image_vit.transformer.resblocks.10.feed_forward.w2.bias": "model-00004-of-00004.safetensors",
267
+ "model.vision_backbone.image_vit.transformer.resblocks.10.feed_forward.w2.weight": "model-00004-of-00004.safetensors",
268
+ "model.vision_backbone.image_vit.transformer.resblocks.10.ffn_norm.bias": "model-00004-of-00004.safetensors",
269
+ "model.vision_backbone.image_vit.transformer.resblocks.10.ffn_norm.weight": "model-00004-of-00004.safetensors",
270
+ "model.vision_backbone.image_vit.transformer.resblocks.11.attention.wk.bias": "model-00004-of-00004.safetensors",
271
+ "model.vision_backbone.image_vit.transformer.resblocks.11.attention.wk.weight": "model-00004-of-00004.safetensors",
272
+ "model.vision_backbone.image_vit.transformer.resblocks.11.attention.wo.bias": "model-00004-of-00004.safetensors",
273
+ "model.vision_backbone.image_vit.transformer.resblocks.11.attention.wo.weight": "model-00004-of-00004.safetensors",
274
+ "model.vision_backbone.image_vit.transformer.resblocks.11.attention.wq.bias": "model-00004-of-00004.safetensors",
275
+ "model.vision_backbone.image_vit.transformer.resblocks.11.attention.wq.weight": "model-00004-of-00004.safetensors",
276
+ "model.vision_backbone.image_vit.transformer.resblocks.11.attention.wv.bias": "model-00004-of-00004.safetensors",
277
+ "model.vision_backbone.image_vit.transformer.resblocks.11.attention.wv.weight": "model-00004-of-00004.safetensors",
278
+ "model.vision_backbone.image_vit.transformer.resblocks.11.attention_norm.bias": "model-00004-of-00004.safetensors",
279
+ "model.vision_backbone.image_vit.transformer.resblocks.11.attention_norm.weight": "model-00004-of-00004.safetensors",
280
+ "model.vision_backbone.image_vit.transformer.resblocks.11.feed_forward.w1.bias": "model-00004-of-00004.safetensors",
281
+ "model.vision_backbone.image_vit.transformer.resblocks.11.feed_forward.w1.weight": "model-00004-of-00004.safetensors",
282
+ "model.vision_backbone.image_vit.transformer.resblocks.11.feed_forward.w2.bias": "model-00004-of-00004.safetensors",
283
+ "model.vision_backbone.image_vit.transformer.resblocks.11.feed_forward.w2.weight": "model-00004-of-00004.safetensors",
284
+ "model.vision_backbone.image_vit.transformer.resblocks.11.ffn_norm.bias": "model-00004-of-00004.safetensors",
285
+ "model.vision_backbone.image_vit.transformer.resblocks.11.ffn_norm.weight": "model-00004-of-00004.safetensors",
286
+ "model.vision_backbone.image_vit.transformer.resblocks.12.attention.wk.bias": "model-00004-of-00004.safetensors",
287
+ "model.vision_backbone.image_vit.transformer.resblocks.12.attention.wk.weight": "model-00004-of-00004.safetensors",
288
+ "model.vision_backbone.image_vit.transformer.resblocks.12.attention.wo.bias": "model-00004-of-00004.safetensors",
289
+ "model.vision_backbone.image_vit.transformer.resblocks.12.attention.wo.weight": "model-00004-of-00004.safetensors",
290
+ "model.vision_backbone.image_vit.transformer.resblocks.12.attention.wq.bias": "model-00004-of-00004.safetensors",
291
+ "model.vision_backbone.image_vit.transformer.resblocks.12.attention.wq.weight": "model-00004-of-00004.safetensors",
292
+ "model.vision_backbone.image_vit.transformer.resblocks.12.attention.wv.bias": "model-00004-of-00004.safetensors",
293
+ "model.vision_backbone.image_vit.transformer.resblocks.12.attention.wv.weight": "model-00004-of-00004.safetensors",
294
+ "model.vision_backbone.image_vit.transformer.resblocks.12.attention_norm.bias": "model-00004-of-00004.safetensors",
295
+ "model.vision_backbone.image_vit.transformer.resblocks.12.attention_norm.weight": "model-00004-of-00004.safetensors",
296
+ "model.vision_backbone.image_vit.transformer.resblocks.12.feed_forward.w1.bias": "model-00004-of-00004.safetensors",
297
+ "model.vision_backbone.image_vit.transformer.resblocks.12.feed_forward.w1.weight": "model-00004-of-00004.safetensors",
298
+ "model.vision_backbone.image_vit.transformer.resblocks.12.feed_forward.w2.bias": "model-00004-of-00004.safetensors",
299
+ "model.vision_backbone.image_vit.transformer.resblocks.12.feed_forward.w2.weight": "model-00004-of-00004.safetensors",
300
+ "model.vision_backbone.image_vit.transformer.resblocks.12.ffn_norm.bias": "model-00004-of-00004.safetensors",
301
+ "model.vision_backbone.image_vit.transformer.resblocks.12.ffn_norm.weight": "model-00004-of-00004.safetensors",
302
+ "model.vision_backbone.image_vit.transformer.resblocks.13.attention.wk.bias": "model-00004-of-00004.safetensors",
303
+ "model.vision_backbone.image_vit.transformer.resblocks.13.attention.wk.weight": "model-00004-of-00004.safetensors",
304
+ "model.vision_backbone.image_vit.transformer.resblocks.13.attention.wo.bias": "model-00004-of-00004.safetensors",
305
+ "model.vision_backbone.image_vit.transformer.resblocks.13.attention.wo.weight": "model-00004-of-00004.safetensors",
306
+ "model.vision_backbone.image_vit.transformer.resblocks.13.attention.wq.bias": "model-00004-of-00004.safetensors",
307
+ "model.vision_backbone.image_vit.transformer.resblocks.13.attention.wq.weight": "model-00004-of-00004.safetensors",
308
+ "model.vision_backbone.image_vit.transformer.resblocks.13.attention.wv.bias": "model-00004-of-00004.safetensors",
309
+ "model.vision_backbone.image_vit.transformer.resblocks.13.attention.wv.weight": "model-00004-of-00004.safetensors",
310
+ "model.vision_backbone.image_vit.transformer.resblocks.13.attention_norm.bias": "model-00004-of-00004.safetensors",
311
+ "model.vision_backbone.image_vit.transformer.resblocks.13.attention_norm.weight": "model-00004-of-00004.safetensors",
312
+ "model.vision_backbone.image_vit.transformer.resblocks.13.feed_forward.w1.bias": "model-00004-of-00004.safetensors",
313
+ "model.vision_backbone.image_vit.transformer.resblocks.13.feed_forward.w1.weight": "model-00004-of-00004.safetensors",
314
+ "model.vision_backbone.image_vit.transformer.resblocks.13.feed_forward.w2.bias": "model-00004-of-00004.safetensors",
315
+ "model.vision_backbone.image_vit.transformer.resblocks.13.feed_forward.w2.weight": "model-00004-of-00004.safetensors",
316
+ "model.vision_backbone.image_vit.transformer.resblocks.13.ffn_norm.bias": "model-00004-of-00004.safetensors",
317
+ "model.vision_backbone.image_vit.transformer.resblocks.13.ffn_norm.weight": "model-00004-of-00004.safetensors",
318
+ "model.vision_backbone.image_vit.transformer.resblocks.14.attention.wk.bias": "model-00004-of-00004.safetensors",
319
+ "model.vision_backbone.image_vit.transformer.resblocks.14.attention.wk.weight": "model-00004-of-00004.safetensors",
320
+ "model.vision_backbone.image_vit.transformer.resblocks.14.attention.wo.bias": "model-00004-of-00004.safetensors",
321
+ "model.vision_backbone.image_vit.transformer.resblocks.14.attention.wo.weight": "model-00004-of-00004.safetensors",
322
+ "model.vision_backbone.image_vit.transformer.resblocks.14.attention.wq.bias": "model-00004-of-00004.safetensors",
323
+ "model.vision_backbone.image_vit.transformer.resblocks.14.attention.wq.weight": "model-00004-of-00004.safetensors",
324
+ "model.vision_backbone.image_vit.transformer.resblocks.14.attention.wv.bias": "model-00004-of-00004.safetensors",
325
+ "model.vision_backbone.image_vit.transformer.resblocks.14.attention.wv.weight": "model-00004-of-00004.safetensors",
326
+ "model.vision_backbone.image_vit.transformer.resblocks.14.attention_norm.bias": "model-00004-of-00004.safetensors",
327
+ "model.vision_backbone.image_vit.transformer.resblocks.14.attention_norm.weight": "model-00004-of-00004.safetensors",
328
+ "model.vision_backbone.image_vit.transformer.resblocks.14.feed_forward.w1.bias": "model-00004-of-00004.safetensors",
329
+ "model.vision_backbone.image_vit.transformer.resblocks.14.feed_forward.w1.weight": "model-00004-of-00004.safetensors",
330
+ "model.vision_backbone.image_vit.transformer.resblocks.14.feed_forward.w2.bias": "model-00004-of-00004.safetensors",
331
+ "model.vision_backbone.image_vit.transformer.resblocks.14.feed_forward.w2.weight": "model-00004-of-00004.safetensors",
332
+ "model.vision_backbone.image_vit.transformer.resblocks.14.ffn_norm.bias": "model-00004-of-00004.safetensors",
333
+ "model.vision_backbone.image_vit.transformer.resblocks.14.ffn_norm.weight": "model-00004-of-00004.safetensors",
334
+ "model.vision_backbone.image_vit.transformer.resblocks.15.attention.wk.bias": "model-00004-of-00004.safetensors",
335
+ "model.vision_backbone.image_vit.transformer.resblocks.15.attention.wk.weight": "model-00004-of-00004.safetensors",
336
+ "model.vision_backbone.image_vit.transformer.resblocks.15.attention.wo.bias": "model-00004-of-00004.safetensors",
337
+ "model.vision_backbone.image_vit.transformer.resblocks.15.attention.wo.weight": "model-00004-of-00004.safetensors",
338
+ "model.vision_backbone.image_vit.transformer.resblocks.15.attention.wq.bias": "model-00004-of-00004.safetensors",
339
+ "model.vision_backbone.image_vit.transformer.resblocks.15.attention.wq.weight": "model-00004-of-00004.safetensors",
340
+ "model.vision_backbone.image_vit.transformer.resblocks.15.attention.wv.bias": "model-00004-of-00004.safetensors",
341
+ "model.vision_backbone.image_vit.transformer.resblocks.15.attention.wv.weight": "model-00004-of-00004.safetensors",
342
+ "model.vision_backbone.image_vit.transformer.resblocks.15.attention_norm.bias": "model-00004-of-00004.safetensors",
343
+ "model.vision_backbone.image_vit.transformer.resblocks.15.attention_norm.weight": "model-00004-of-00004.safetensors",
344
+ "model.vision_backbone.image_vit.transformer.resblocks.15.feed_forward.w1.bias": "model-00004-of-00004.safetensors",
345
+ "model.vision_backbone.image_vit.transformer.resblocks.15.feed_forward.w1.weight": "model-00004-of-00004.safetensors",
346
+ "model.vision_backbone.image_vit.transformer.resblocks.15.feed_forward.w2.bias": "model-00004-of-00004.safetensors",
347
+ "model.vision_backbone.image_vit.transformer.resblocks.15.feed_forward.w2.weight": "model-00004-of-00004.safetensors",
348
+ "model.vision_backbone.image_vit.transformer.resblocks.15.ffn_norm.bias": "model-00004-of-00004.safetensors",
349
+ "model.vision_backbone.image_vit.transformer.resblocks.15.ffn_norm.weight": "model-00004-of-00004.safetensors",
350
+ "model.vision_backbone.image_vit.transformer.resblocks.16.attention.wk.bias": "model-00004-of-00004.safetensors",
351
+ "model.vision_backbone.image_vit.transformer.resblocks.16.attention.wk.weight": "model-00004-of-00004.safetensors",
352
+ "model.vision_backbone.image_vit.transformer.resblocks.16.attention.wo.bias": "model-00004-of-00004.safetensors",
353
+ "model.vision_backbone.image_vit.transformer.resblocks.16.attention.wo.weight": "model-00004-of-00004.safetensors",
354
+ "model.vision_backbone.image_vit.transformer.resblocks.16.attention.wq.bias": "model-00004-of-00004.safetensors",
355
+ "model.vision_backbone.image_vit.transformer.resblocks.16.attention.wq.weight": "model-00004-of-00004.safetensors",
356
+ "model.vision_backbone.image_vit.transformer.resblocks.16.attention.wv.bias": "model-00004-of-00004.safetensors",
357
+ "model.vision_backbone.image_vit.transformer.resblocks.16.attention.wv.weight": "model-00004-of-00004.safetensors",
358
+ "model.vision_backbone.image_vit.transformer.resblocks.16.attention_norm.bias": "model-00004-of-00004.safetensors",
359
+ "model.vision_backbone.image_vit.transformer.resblocks.16.attention_norm.weight": "model-00004-of-00004.safetensors",
360
+ "model.vision_backbone.image_vit.transformer.resblocks.16.feed_forward.w1.bias": "model-00004-of-00004.safetensors",
361
+ "model.vision_backbone.image_vit.transformer.resblocks.16.feed_forward.w1.weight": "model-00004-of-00004.safetensors",
362
+ "model.vision_backbone.image_vit.transformer.resblocks.16.feed_forward.w2.bias": "model-00004-of-00004.safetensors",
363
+ "model.vision_backbone.image_vit.transformer.resblocks.16.feed_forward.w2.weight": "model-00004-of-00004.safetensors",
364
+ "model.vision_backbone.image_vit.transformer.resblocks.16.ffn_norm.bias": "model-00004-of-00004.safetensors",
365
+ "model.vision_backbone.image_vit.transformer.resblocks.16.ffn_norm.weight": "model-00004-of-00004.safetensors",
366
+ "model.vision_backbone.image_vit.transformer.resblocks.17.attention.wk.bias": "model-00004-of-00004.safetensors",
367
+ "model.vision_backbone.image_vit.transformer.resblocks.17.attention.wk.weight": "model-00004-of-00004.safetensors",
368
+ "model.vision_backbone.image_vit.transformer.resblocks.17.attention.wo.bias": "model-00004-of-00004.safetensors",
369
+ "model.vision_backbone.image_vit.transformer.resblocks.17.attention.wo.weight": "model-00004-of-00004.safetensors",
370
+ "model.vision_backbone.image_vit.transformer.resblocks.17.attention.wq.bias": "model-00004-of-00004.safetensors",
371
+ "model.vision_backbone.image_vit.transformer.resblocks.17.attention.wq.weight": "model-00004-of-00004.safetensors",
372
+ "model.vision_backbone.image_vit.transformer.resblocks.17.attention.wv.bias": "model-00004-of-00004.safetensors",
373
+ "model.vision_backbone.image_vit.transformer.resblocks.17.attention.wv.weight": "model-00004-of-00004.safetensors",
374
+ "model.vision_backbone.image_vit.transformer.resblocks.17.attention_norm.bias": "model-00004-of-00004.safetensors",
375
+ "model.vision_backbone.image_vit.transformer.resblocks.17.attention_norm.weight": "model-00004-of-00004.safetensors",
376
+ "model.vision_backbone.image_vit.transformer.resblocks.17.feed_forward.w1.bias": "model-00004-of-00004.safetensors",
377
+ "model.vision_backbone.image_vit.transformer.resblocks.17.feed_forward.w1.weight": "model-00004-of-00004.safetensors",
378
+ "model.vision_backbone.image_vit.transformer.resblocks.17.feed_forward.w2.bias": "model-00004-of-00004.safetensors",
379
+ "model.vision_backbone.image_vit.transformer.resblocks.17.feed_forward.w2.weight": "model-00004-of-00004.safetensors",
380
+ "model.vision_backbone.image_vit.transformer.resblocks.17.ffn_norm.bias": "model-00004-of-00004.safetensors",
381
+ "model.vision_backbone.image_vit.transformer.resblocks.17.ffn_norm.weight": "model-00004-of-00004.safetensors",
382
+ "model.vision_backbone.image_vit.transformer.resblocks.18.attention.wk.bias": "model-00004-of-00004.safetensors",
383
+ "model.vision_backbone.image_vit.transformer.resblocks.18.attention.wk.weight": "model-00004-of-00004.safetensors",
384
+ "model.vision_backbone.image_vit.transformer.resblocks.18.attention.wo.bias": "model-00004-of-00004.safetensors",
385
+ "model.vision_backbone.image_vit.transformer.resblocks.18.attention.wo.weight": "model-00004-of-00004.safetensors",
386
+ "model.vision_backbone.image_vit.transformer.resblocks.18.attention.wq.bias": "model-00004-of-00004.safetensors",
387
+ "model.vision_backbone.image_vit.transformer.resblocks.18.attention.wq.weight": "model-00004-of-00004.safetensors",
388
+ "model.vision_backbone.image_vit.transformer.resblocks.18.attention.wv.bias": "model-00004-of-00004.safetensors",
389
+ "model.vision_backbone.image_vit.transformer.resblocks.18.attention.wv.weight": "model-00004-of-00004.safetensors",
390
+ "model.vision_backbone.image_vit.transformer.resblocks.18.attention_norm.bias": "model-00004-of-00004.safetensors",
391
+ "model.vision_backbone.image_vit.transformer.resblocks.18.attention_norm.weight": "model-00004-of-00004.safetensors",
392
+ "model.vision_backbone.image_vit.transformer.resblocks.18.feed_forward.w1.bias": "model-00004-of-00004.safetensors",
393
+ "model.vision_backbone.image_vit.transformer.resblocks.18.feed_forward.w1.weight": "model-00004-of-00004.safetensors",
394
+ "model.vision_backbone.image_vit.transformer.resblocks.18.feed_forward.w2.bias": "model-00004-of-00004.safetensors",
395
+ "model.vision_backbone.image_vit.transformer.resblocks.18.feed_forward.w2.weight": "model-00004-of-00004.safetensors",
396
+ "model.vision_backbone.image_vit.transformer.resblocks.18.ffn_norm.bias": "model-00004-of-00004.safetensors",
397
+ "model.vision_backbone.image_vit.transformer.resblocks.18.ffn_norm.weight": "model-00004-of-00004.safetensors",
398
+ "model.vision_backbone.image_vit.transformer.resblocks.19.attention.wk.bias": "model-00004-of-00004.safetensors",
399
+ "model.vision_backbone.image_vit.transformer.resblocks.19.attention.wk.weight": "model-00004-of-00004.safetensors",
400
+ "model.vision_backbone.image_vit.transformer.resblocks.19.attention.wo.bias": "model-00004-of-00004.safetensors",
401
+ "model.vision_backbone.image_vit.transformer.resblocks.19.attention.wo.weight": "model-00004-of-00004.safetensors",
402
+ "model.vision_backbone.image_vit.transformer.resblocks.19.attention.wq.bias": "model-00004-of-00004.safetensors",
403
+ "model.vision_backbone.image_vit.transformer.resblocks.19.attention.wq.weight": "model-00004-of-00004.safetensors",
404
+ "model.vision_backbone.image_vit.transformer.resblocks.19.attention.wv.bias": "model-00004-of-00004.safetensors",
405
+ "model.vision_backbone.image_vit.transformer.resblocks.19.attention.wv.weight": "model-00004-of-00004.safetensors",
406
+ "model.vision_backbone.image_vit.transformer.resblocks.19.attention_norm.bias": "model-00004-of-00004.safetensors",
407
+ "model.vision_backbone.image_vit.transformer.resblocks.19.attention_norm.weight": "model-00004-of-00004.safetensors",
408
+ "model.vision_backbone.image_vit.transformer.resblocks.19.feed_forward.w1.bias": "model-00004-of-00004.safetensors",
409
+ "model.vision_backbone.image_vit.transformer.resblocks.19.feed_forward.w1.weight": "model-00004-of-00004.safetensors",
410
+ "model.vision_backbone.image_vit.transformer.resblocks.19.feed_forward.w2.bias": "model-00004-of-00004.safetensors",
411
+ "model.vision_backbone.image_vit.transformer.resblocks.19.feed_forward.w2.weight": "model-00004-of-00004.safetensors",
412
+ "model.vision_backbone.image_vit.transformer.resblocks.19.ffn_norm.bias": "model-00004-of-00004.safetensors",
413
+ "model.vision_backbone.image_vit.transformer.resblocks.19.ffn_norm.weight": "model-00004-of-00004.safetensors",
414
+ "model.vision_backbone.image_vit.transformer.resblocks.2.attention.wk.bias": "model-00004-of-00004.safetensors",
415
+ "model.vision_backbone.image_vit.transformer.resblocks.2.attention.wk.weight": "model-00004-of-00004.safetensors",
416
+ "model.vision_backbone.image_vit.transformer.resblocks.2.attention.wo.bias": "model-00004-of-00004.safetensors",
417
+ "model.vision_backbone.image_vit.transformer.resblocks.2.attention.wo.weight": "model-00004-of-00004.safetensors",
418
+ "model.vision_backbone.image_vit.transformer.resblocks.2.attention.wq.bias": "model-00004-of-00004.safetensors",
419
+ "model.vision_backbone.image_vit.transformer.resblocks.2.attention.wq.weight": "model-00004-of-00004.safetensors",
420
+ "model.vision_backbone.image_vit.transformer.resblocks.2.attention.wv.bias": "model-00004-of-00004.safetensors",
421
+ "model.vision_backbone.image_vit.transformer.resblocks.2.attention.wv.weight": "model-00004-of-00004.safetensors",
422
+ "model.vision_backbone.image_vit.transformer.resblocks.2.attention_norm.bias": "model-00004-of-00004.safetensors",
423
+ "model.vision_backbone.image_vit.transformer.resblocks.2.attention_norm.weight": "model-00004-of-00004.safetensors",
424
+ "model.vision_backbone.image_vit.transformer.resblocks.2.feed_forward.w1.bias": "model-00004-of-00004.safetensors",
425
+ "model.vision_backbone.image_vit.transformer.resblocks.2.feed_forward.w1.weight": "model-00004-of-00004.safetensors",
426
+ "model.vision_backbone.image_vit.transformer.resblocks.2.feed_forward.w2.bias": "model-00004-of-00004.safetensors",
427
+ "model.vision_backbone.image_vit.transformer.resblocks.2.feed_forward.w2.weight": "model-00004-of-00004.safetensors",
428
+ "model.vision_backbone.image_vit.transformer.resblocks.2.ffn_norm.bias": "model-00004-of-00004.safetensors",
429
+ "model.vision_backbone.image_vit.transformer.resblocks.2.ffn_norm.weight": "model-00004-of-00004.safetensors",
430
+ "model.vision_backbone.image_vit.transformer.resblocks.20.attention.wk.bias": "model-00004-of-00004.safetensors",
431
+ "model.vision_backbone.image_vit.transformer.resblocks.20.attention.wk.weight": "model-00004-of-00004.safetensors",
432
+ "model.vision_backbone.image_vit.transformer.resblocks.20.attention.wo.bias": "model-00004-of-00004.safetensors",
433
+ "model.vision_backbone.image_vit.transformer.resblocks.20.attention.wo.weight": "model-00004-of-00004.safetensors",
434
+ "model.vision_backbone.image_vit.transformer.resblocks.20.attention.wq.bias": "model-00004-of-00004.safetensors",
435
+ "model.vision_backbone.image_vit.transformer.resblocks.20.attention.wq.weight": "model-00004-of-00004.safetensors",
436
+ "model.vision_backbone.image_vit.transformer.resblocks.20.attention.wv.bias": "model-00004-of-00004.safetensors",
437
+ "model.vision_backbone.image_vit.transformer.resblocks.20.attention.wv.weight": "model-00004-of-00004.safetensors",
438
+ "model.vision_backbone.image_vit.transformer.resblocks.20.attention_norm.bias": "model-00004-of-00004.safetensors",
439
+ "model.vision_backbone.image_vit.transformer.resblocks.20.attention_norm.weight": "model-00004-of-00004.safetensors",
440
+ "model.vision_backbone.image_vit.transformer.resblocks.20.feed_forward.w1.bias": "model-00004-of-00004.safetensors",
441
+ "model.vision_backbone.image_vit.transformer.resblocks.20.feed_forward.w1.weight": "model-00004-of-00004.safetensors",
442
+ "model.vision_backbone.image_vit.transformer.resblocks.20.feed_forward.w2.bias": "model-00004-of-00004.safetensors",
443
+ "model.vision_backbone.image_vit.transformer.resblocks.20.feed_forward.w2.weight": "model-00004-of-00004.safetensors",
444
+ "model.vision_backbone.image_vit.transformer.resblocks.20.ffn_norm.bias": "model-00004-of-00004.safetensors",
445
+ "model.vision_backbone.image_vit.transformer.resblocks.20.ffn_norm.weight": "model-00004-of-00004.safetensors",
446
+ "model.vision_backbone.image_vit.transformer.resblocks.21.attention.wk.bias": "model-00004-of-00004.safetensors",
447
+ "model.vision_backbone.image_vit.transformer.resblocks.21.attention.wk.weight": "model-00004-of-00004.safetensors",
448
+ "model.vision_backbone.image_vit.transformer.resblocks.21.attention.wo.bias": "model-00004-of-00004.safetensors",
449
+ "model.vision_backbone.image_vit.transformer.resblocks.21.attention.wo.weight": "model-00004-of-00004.safetensors",
450
+ "model.vision_backbone.image_vit.transformer.resblocks.21.attention.wq.bias": "model-00004-of-00004.safetensors",
451
+ "model.vision_backbone.image_vit.transformer.resblocks.21.attention.wq.weight": "model-00004-of-00004.safetensors",
452
+ "model.vision_backbone.image_vit.transformer.resblocks.21.attention.wv.bias": "model-00004-of-00004.safetensors",
453
+ "model.vision_backbone.image_vit.transformer.resblocks.21.attention.wv.weight": "model-00004-of-00004.safetensors",
454
+ "model.vision_backbone.image_vit.transformer.resblocks.21.attention_norm.bias": "model-00004-of-00004.safetensors",
455
+ "model.vision_backbone.image_vit.transformer.resblocks.21.attention_norm.weight": "model-00004-of-00004.safetensors",
456
+ "model.vision_backbone.image_vit.transformer.resblocks.21.feed_forward.w1.bias": "model-00004-of-00004.safetensors",
457
+ "model.vision_backbone.image_vit.transformer.resblocks.21.feed_forward.w1.weight": "model-00004-of-00004.safetensors",
458
+ "model.vision_backbone.image_vit.transformer.resblocks.21.feed_forward.w2.bias": "model-00004-of-00004.safetensors",
459
+ "model.vision_backbone.image_vit.transformer.resblocks.21.feed_forward.w2.weight": "model-00004-of-00004.safetensors",
460
+ "model.vision_backbone.image_vit.transformer.resblocks.21.ffn_norm.bias": "model-00004-of-00004.safetensors",
461
+ "model.vision_backbone.image_vit.transformer.resblocks.21.ffn_norm.weight": "model-00004-of-00004.safetensors",
462
+ "model.vision_backbone.image_vit.transformer.resblocks.22.attention.wk.bias": "model-00004-of-00004.safetensors",
463
+ "model.vision_backbone.image_vit.transformer.resblocks.22.attention.wk.weight": "model-00004-of-00004.safetensors",
464
+ "model.vision_backbone.image_vit.transformer.resblocks.22.attention.wo.bias": "model-00004-of-00004.safetensors",
465
+ "model.vision_backbone.image_vit.transformer.resblocks.22.attention.wo.weight": "model-00004-of-00004.safetensors",
466
+ "model.vision_backbone.image_vit.transformer.resblocks.22.attention.wq.bias": "model-00004-of-00004.safetensors",
467
+ "model.vision_backbone.image_vit.transformer.resblocks.22.attention.wq.weight": "model-00004-of-00004.safetensors",
468
+ "model.vision_backbone.image_vit.transformer.resblocks.22.attention.wv.bias": "model-00004-of-00004.safetensors",
469
+ "model.vision_backbone.image_vit.transformer.resblocks.22.attention.wv.weight": "model-00004-of-00004.safetensors",
470
+ "model.vision_backbone.image_vit.transformer.resblocks.22.attention_norm.bias": "model-00004-of-00004.safetensors",
471
+ "model.vision_backbone.image_vit.transformer.resblocks.22.attention_norm.weight": "model-00004-of-00004.safetensors",
472
+ "model.vision_backbone.image_vit.transformer.resblocks.22.feed_forward.w1.bias": "model-00004-of-00004.safetensors",
473
+ "model.vision_backbone.image_vit.transformer.resblocks.22.feed_forward.w1.weight": "model-00004-of-00004.safetensors",
474
+ "model.vision_backbone.image_vit.transformer.resblocks.22.feed_forward.w2.bias": "model-00004-of-00004.safetensors",
475
+ "model.vision_backbone.image_vit.transformer.resblocks.22.feed_forward.w2.weight": "model-00004-of-00004.safetensors",
476
+ "model.vision_backbone.image_vit.transformer.resblocks.22.ffn_norm.bias": "model-00004-of-00004.safetensors",
477
+ "model.vision_backbone.image_vit.transformer.resblocks.22.ffn_norm.weight": "model-00004-of-00004.safetensors",
478
+ "model.vision_backbone.image_vit.transformer.resblocks.3.attention.wk.bias": "model-00004-of-00004.safetensors",
479
+ "model.vision_backbone.image_vit.transformer.resblocks.3.attention.wk.weight": "model-00004-of-00004.safetensors",
480
+ "model.vision_backbone.image_vit.transformer.resblocks.3.attention.wo.bias": "model-00004-of-00004.safetensors",
481
+ "model.vision_backbone.image_vit.transformer.resblocks.3.attention.wo.weight": "model-00004-of-00004.safetensors",
482
+ "model.vision_backbone.image_vit.transformer.resblocks.3.attention.wq.bias": "model-00004-of-00004.safetensors",
483
+ "model.vision_backbone.image_vit.transformer.resblocks.3.attention.wq.weight": "model-00004-of-00004.safetensors",
484
+ "model.vision_backbone.image_vit.transformer.resblocks.3.attention.wv.bias": "model-00004-of-00004.safetensors",
485
+ "model.vision_backbone.image_vit.transformer.resblocks.3.attention.wv.weight": "model-00004-of-00004.safetensors",
486
+ "model.vision_backbone.image_vit.transformer.resblocks.3.attention_norm.bias": "model-00004-of-00004.safetensors",
487
+ "model.vision_backbone.image_vit.transformer.resblocks.3.attention_norm.weight": "model-00004-of-00004.safetensors",
488
+ "model.vision_backbone.image_vit.transformer.resblocks.3.feed_forward.w1.bias": "model-00004-of-00004.safetensors",
489
+ "model.vision_backbone.image_vit.transformer.resblocks.3.feed_forward.w1.weight": "model-00004-of-00004.safetensors",
490
+ "model.vision_backbone.image_vit.transformer.resblocks.3.feed_forward.w2.bias": "model-00004-of-00004.safetensors",
491
+ "model.vision_backbone.image_vit.transformer.resblocks.3.feed_forward.w2.weight": "model-00004-of-00004.safetensors",
492
+ "model.vision_backbone.image_vit.transformer.resblocks.3.ffn_norm.bias": "model-00004-of-00004.safetensors",
493
+ "model.vision_backbone.image_vit.transformer.resblocks.3.ffn_norm.weight": "model-00004-of-00004.safetensors",
494
+ "model.vision_backbone.image_vit.transformer.resblocks.4.attention.wk.bias": "model-00004-of-00004.safetensors",
495
+ "model.vision_backbone.image_vit.transformer.resblocks.4.attention.wk.weight": "model-00004-of-00004.safetensors",
496
+ "model.vision_backbone.image_vit.transformer.resblocks.4.attention.wo.bias": "model-00004-of-00004.safetensors",
497
+ "model.vision_backbone.image_vit.transformer.resblocks.4.attention.wo.weight": "model-00004-of-00004.safetensors",
498
+ "model.vision_backbone.image_vit.transformer.resblocks.4.attention.wq.bias": "model-00004-of-00004.safetensors",
499
+ "model.vision_backbone.image_vit.transformer.resblocks.4.attention.wq.weight": "model-00004-of-00004.safetensors",
500
+ "model.vision_backbone.image_vit.transformer.resblocks.4.attention.wv.bias": "model-00004-of-00004.safetensors",
501
+ "model.vision_backbone.image_vit.transformer.resblocks.4.attention.wv.weight": "model-00004-of-00004.safetensors",
502
+ "model.vision_backbone.image_vit.transformer.resblocks.4.attention_norm.bias": "model-00004-of-00004.safetensors",
503
+ "model.vision_backbone.image_vit.transformer.resblocks.4.attention_norm.weight": "model-00004-of-00004.safetensors",
504
+ "model.vision_backbone.image_vit.transformer.resblocks.4.feed_forward.w1.bias": "model-00004-of-00004.safetensors",
505
+ "model.vision_backbone.image_vit.transformer.resblocks.4.feed_forward.w1.weight": "model-00004-of-00004.safetensors",
506
+ "model.vision_backbone.image_vit.transformer.resblocks.4.feed_forward.w2.bias": "model-00004-of-00004.safetensors",
507
+ "model.vision_backbone.image_vit.transformer.resblocks.4.feed_forward.w2.weight": "model-00004-of-00004.safetensors",
508
+ "model.vision_backbone.image_vit.transformer.resblocks.4.ffn_norm.bias": "model-00004-of-00004.safetensors",
509
+ "model.vision_backbone.image_vit.transformer.resblocks.4.ffn_norm.weight": "model-00004-of-00004.safetensors",
510
+ "model.vision_backbone.image_vit.transformer.resblocks.5.attention.wk.bias": "model-00004-of-00004.safetensors",
511
+ "model.vision_backbone.image_vit.transformer.resblocks.5.attention.wk.weight": "model-00004-of-00004.safetensors",
512
+ "model.vision_backbone.image_vit.transformer.resblocks.5.attention.wo.bias": "model-00004-of-00004.safetensors",
513
+ "model.vision_backbone.image_vit.transformer.resblocks.5.attention.wo.weight": "model-00004-of-00004.safetensors",
514
+ "model.vision_backbone.image_vit.transformer.resblocks.5.attention.wq.bias": "model-00004-of-00004.safetensors",
515
+ "model.vision_backbone.image_vit.transformer.resblocks.5.attention.wq.weight": "model-00004-of-00004.safetensors",
516
+ "model.vision_backbone.image_vit.transformer.resblocks.5.attention.wv.bias": "model-00004-of-00004.safetensors",
517
+ "model.vision_backbone.image_vit.transformer.resblocks.5.attention.wv.weight": "model-00004-of-00004.safetensors",
518
+ "model.vision_backbone.image_vit.transformer.resblocks.5.attention_norm.bias": "model-00004-of-00004.safetensors",
519
+ "model.vision_backbone.image_vit.transformer.resblocks.5.attention_norm.weight": "model-00004-of-00004.safetensors",
520
+ "model.vision_backbone.image_vit.transformer.resblocks.5.feed_forward.w1.bias": "model-00004-of-00004.safetensors",
521
+ "model.vision_backbone.image_vit.transformer.resblocks.5.feed_forward.w1.weight": "model-00004-of-00004.safetensors",
522
+ "model.vision_backbone.image_vit.transformer.resblocks.5.feed_forward.w2.bias": "model-00004-of-00004.safetensors",
523
+ "model.vision_backbone.image_vit.transformer.resblocks.5.feed_forward.w2.weight": "model-00004-of-00004.safetensors",
524
+ "model.vision_backbone.image_vit.transformer.resblocks.5.ffn_norm.bias": "model-00004-of-00004.safetensors",
525
+ "model.vision_backbone.image_vit.transformer.resblocks.5.ffn_norm.weight": "model-00004-of-00004.safetensors",
526
+ "model.vision_backbone.image_vit.transformer.resblocks.6.attention.wk.bias": "model-00004-of-00004.safetensors",
527
+ "model.vision_backbone.image_vit.transformer.resblocks.6.attention.wk.weight": "model-00004-of-00004.safetensors",
528
+ "model.vision_backbone.image_vit.transformer.resblocks.6.attention.wo.bias": "model-00004-of-00004.safetensors",
529
+ "model.vision_backbone.image_vit.transformer.resblocks.6.attention.wo.weight": "model-00004-of-00004.safetensors",
530
+ "model.vision_backbone.image_vit.transformer.resblocks.6.attention.wq.bias": "model-00004-of-00004.safetensors",
531
+ "model.vision_backbone.image_vit.transformer.resblocks.6.attention.wq.weight": "model-00004-of-00004.safetensors",
532
+ "model.vision_backbone.image_vit.transformer.resblocks.6.attention.wv.bias": "model-00004-of-00004.safetensors",
533
+ "model.vision_backbone.image_vit.transformer.resblocks.6.attention.wv.weight": "model-00004-of-00004.safetensors",
534
+ "model.vision_backbone.image_vit.transformer.resblocks.6.attention_norm.bias": "model-00004-of-00004.safetensors",
535
+ "model.vision_backbone.image_vit.transformer.resblocks.6.attention_norm.weight": "model-00004-of-00004.safetensors",
536
+ "model.vision_backbone.image_vit.transformer.resblocks.6.feed_forward.w1.bias": "model-00004-of-00004.safetensors",
537
+ "model.vision_backbone.image_vit.transformer.resblocks.6.feed_forward.w1.weight": "model-00004-of-00004.safetensors",
538
+ "model.vision_backbone.image_vit.transformer.resblocks.6.feed_forward.w2.bias": "model-00004-of-00004.safetensors",
539
+ "model.vision_backbone.image_vit.transformer.resblocks.6.feed_forward.w2.weight": "model-00004-of-00004.safetensors",
540
+ "model.vision_backbone.image_vit.transformer.resblocks.6.ffn_norm.bias": "model-00004-of-00004.safetensors",
541
+ "model.vision_backbone.image_vit.transformer.resblocks.6.ffn_norm.weight": "model-00004-of-00004.safetensors",
542
+ "model.vision_backbone.image_vit.transformer.resblocks.7.attention.wk.bias": "model-00004-of-00004.safetensors",
543
+ "model.vision_backbone.image_vit.transformer.resblocks.7.attention.wk.weight": "model-00004-of-00004.safetensors",
544
+ "model.vision_backbone.image_vit.transformer.resblocks.7.attention.wo.bias": "model-00004-of-00004.safetensors",
545
+ "model.vision_backbone.image_vit.transformer.resblocks.7.attention.wo.weight": "model-00004-of-00004.safetensors",
546
+ "model.vision_backbone.image_vit.transformer.resblocks.7.attention.wq.bias": "model-00004-of-00004.safetensors",
547
+ "model.vision_backbone.image_vit.transformer.resblocks.7.attention.wq.weight": "model-00004-of-00004.safetensors",
548
+ "model.vision_backbone.image_vit.transformer.resblocks.7.attention.wv.bias": "model-00004-of-00004.safetensors",
549
+ "model.vision_backbone.image_vit.transformer.resblocks.7.attention.wv.weight": "model-00004-of-00004.safetensors",
550
+ "model.vision_backbone.image_vit.transformer.resblocks.7.attention_norm.bias": "model-00004-of-00004.safetensors",
551
+ "model.vision_backbone.image_vit.transformer.resblocks.7.attention_norm.weight": "model-00004-of-00004.safetensors",
552
+ "model.vision_backbone.image_vit.transformer.resblocks.7.feed_forward.w1.bias": "model-00004-of-00004.safetensors",
553
+ "model.vision_backbone.image_vit.transformer.resblocks.7.feed_forward.w1.weight": "model-00004-of-00004.safetensors",
554
+ "model.vision_backbone.image_vit.transformer.resblocks.7.feed_forward.w2.bias": "model-00004-of-00004.safetensors",
555
+ "model.vision_backbone.image_vit.transformer.resblocks.7.feed_forward.w2.weight": "model-00004-of-00004.safetensors",
556
+ "model.vision_backbone.image_vit.transformer.resblocks.7.ffn_norm.bias": "model-00004-of-00004.safetensors",
557
+ "model.vision_backbone.image_vit.transformer.resblocks.7.ffn_norm.weight": "model-00004-of-00004.safetensors",
558
+ "model.vision_backbone.image_vit.transformer.resblocks.8.attention.wk.bias": "model-00004-of-00004.safetensors",
559
+ "model.vision_backbone.image_vit.transformer.resblocks.8.attention.wk.weight": "model-00004-of-00004.safetensors",
560
+ "model.vision_backbone.image_vit.transformer.resblocks.8.attention.wo.bias": "model-00004-of-00004.safetensors",
561
+ "model.vision_backbone.image_vit.transformer.resblocks.8.attention.wo.weight": "model-00004-of-00004.safetensors",
562
+ "model.vision_backbone.image_vit.transformer.resblocks.8.attention.wq.bias": "model-00004-of-00004.safetensors",
563
+ "model.vision_backbone.image_vit.transformer.resblocks.8.attention.wq.weight": "model-00004-of-00004.safetensors",
564
+ "model.vision_backbone.image_vit.transformer.resblocks.8.attention.wv.bias": "model-00004-of-00004.safetensors",
565
+ "model.vision_backbone.image_vit.transformer.resblocks.8.attention.wv.weight": "model-00004-of-00004.safetensors",
566
+ "model.vision_backbone.image_vit.transformer.resblocks.8.attention_norm.bias": "model-00004-of-00004.safetensors",
567
+ "model.vision_backbone.image_vit.transformer.resblocks.8.attention_norm.weight": "model-00004-of-00004.safetensors",
568
+ "model.vision_backbone.image_vit.transformer.resblocks.8.feed_forward.w1.bias": "model-00004-of-00004.safetensors",
569
+ "model.vision_backbone.image_vit.transformer.resblocks.8.feed_forward.w1.weight": "model-00004-of-00004.safetensors",
570
+ "model.vision_backbone.image_vit.transformer.resblocks.8.feed_forward.w2.bias": "model-00004-of-00004.safetensors",
571
+ "model.vision_backbone.image_vit.transformer.resblocks.8.feed_forward.w2.weight": "model-00004-of-00004.safetensors",
572
+ "model.vision_backbone.image_vit.transformer.resblocks.8.ffn_norm.bias": "model-00004-of-00004.safetensors",
573
+ "model.vision_backbone.image_vit.transformer.resblocks.8.ffn_norm.weight": "model-00004-of-00004.safetensors",
574
+ "model.vision_backbone.image_vit.transformer.resblocks.9.attention.wk.bias": "model-00004-of-00004.safetensors",
575
+ "model.vision_backbone.image_vit.transformer.resblocks.9.attention.wk.weight": "model-00004-of-00004.safetensors",
576
+ "model.vision_backbone.image_vit.transformer.resblocks.9.attention.wo.bias": "model-00004-of-00004.safetensors",
577
+ "model.vision_backbone.image_vit.transformer.resblocks.9.attention.wo.weight": "model-00004-of-00004.safetensors",
578
+ "model.vision_backbone.image_vit.transformer.resblocks.9.attention.wq.bias": "model-00004-of-00004.safetensors",
579
+ "model.vision_backbone.image_vit.transformer.resblocks.9.attention.wq.weight": "model-00004-of-00004.safetensors",
580
+ "model.vision_backbone.image_vit.transformer.resblocks.9.attention.wv.bias": "model-00004-of-00004.safetensors",
581
+ "model.vision_backbone.image_vit.transformer.resblocks.9.attention.wv.weight": "model-00004-of-00004.safetensors",
582
+ "model.vision_backbone.image_vit.transformer.resblocks.9.attention_norm.bias": "model-00004-of-00004.safetensors",
583
+ "model.vision_backbone.image_vit.transformer.resblocks.9.attention_norm.weight": "model-00004-of-00004.safetensors",
584
+ "model.vision_backbone.image_vit.transformer.resblocks.9.feed_forward.w1.bias": "model-00004-of-00004.safetensors",
585
+ "model.vision_backbone.image_vit.transformer.resblocks.9.feed_forward.w1.weight": "model-00004-of-00004.safetensors",
586
+ "model.vision_backbone.image_vit.transformer.resblocks.9.feed_forward.w2.bias": "model-00004-of-00004.safetensors",
587
+ "model.vision_backbone.image_vit.transformer.resblocks.9.feed_forward.w2.weight": "model-00004-of-00004.safetensors",
588
+ "model.vision_backbone.image_vit.transformer.resblocks.9.ffn_norm.bias": "model-00004-of-00004.safetensors",
589
+ "model.vision_backbone.image_vit.transformer.resblocks.9.ffn_norm.weight": "model-00004-of-00004.safetensors",
590
+ "model.vision_backbone.pad_embed": "model-00004-of-00004.safetensors"
591
+ }
592
+ }
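
Note: the weight map above assigns every parameter name to the safetensors shard that stores it. The snippet below is a minimal sketch (not part of the uploaded files) of how such an index can be resolved by hand; the local path is illustrative and the key shown is simply one entry taken from the map above.

import json
from safetensors import safe_open

repo_dir = "."  # local snapshot of this repository (illustrative path)

# Read the index that maps parameter names to shard files.
with open(f"{repo_dir}/model.safetensors.index.json") as f:
    index = json.load(f)

name = "model.vision_backbone.pad_embed"   # any key from the weight map
shard = index["weight_map"][name]          # e.g. "model-00004-of-00004.safetensors"

# Open only the shard that holds this tensor and fetch it.
with safe_open(f"{repo_dir}/{shard}", framework="pt") as f:
    tensor = f.get_tensor(name)

print(name, tuple(tensor.shape))
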
modeling_molmo.py ADDED
@@ -0,0 +1,2367 @@
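
Note: because the repository ships its own modeling, config, and preprocessing code (modeling_molmo.py, config_molmo.py, preprocessing_molmo.py), loading it through transformers normally requires trust_remote_code. A minimal sketch, not part of the uploaded files, with illustrative paths and argument choices:

from transformers import AutoModelForCausalLM, AutoProcessor

repo_dir = "."  # local snapshot of this repository (illustrative path)

# trust_remote_code tells transformers to use the bundled modeling_molmo.py / preprocessing_molmo.py
model = AutoModelForCausalLM.from_pretrained(repo_dir, trust_remote_code=True, torch_dtype="auto")
processor = AutoProcessor.from_pretrained(repo_dir, trust_remote_code=True)
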
1
+ import logging
2
+ import math
3
+ from copy import deepcopy
4
+ from dataclasses import fields, dataclass, replace
5
+ from enum import Enum
6
+ from typing import List, Optional, Tuple, Union, Dict, Any, Sequence, Callable, cast, MutableMapping
7
+
8
+ import torch
9
+ from einops import einsum, einops
10
+ from transformers import PreTrainedModel, GenerationConfig
11
+ from transformers.cache_utils import Cache
12
+ from transformers.modeling_outputs import CausalLMOutputWithPast, ModelOutput
13
+ from transformers.models.auto import AutoModelForCausalLM
14
+ from torch import nn
15
+
16
+ from .config_molmo import MolmoConfig
17
+ from torch.nn import functional as F
18
+
19
+
20
+ log = logging.getLogger(__name__)
21
+
22
+
23
+ class BufferCache(dict, MutableMapping[str, torch.Tensor]):
24
+ """
25
+ Cache for attention biases and other things that would normally be stored as buffers.
26
+ We avoid using buffers because we've run into various issues doing so with FSDP.
27
+ In general it appears the way FSDP handles buffers is not well-defined.
28
+ It doesn't shard them but apparently it does synchronize them across processes, which we want to avoid
29
+ since (A) it isn't necessary, and (B) we sometimes have `-inf` in these biases which might get turned into
30
+ NaNs when they're synchronized due to casting or some other issue.
31
+ """
32
+
33
+
34
+ class StrEnum(str, Enum):
35
+ def __str__(self) -> str:
36
+ return self.value
37
+
38
+ def __repr__(self) -> str:
39
+ return f"'{str(self)}'"
40
+
41
+
42
+ class ImageProjectType(StrEnum):
43
+ mlp = "mlp"
44
+ mlpx2 = "2mlp"
45
+ linear = "linear"
46
+
47
+
48
+ class ImagePooling2DType(StrEnum):
49
+ attention = "attention"
50
+ attention_meanq = "attention-meanq"
51
+ attention_2wide = "attention_2wide"
52
+ attention_v2 = "attention-v2"
53
+ none = "none"
54
+ stack = "stack"
55
+
56
+
57
+ class ActivationType(StrEnum):
58
+ quick_gelu = "quick_gelu"
59
+ gelu = "gelu"
60
+ gelu_tanh = "gelu_tanh"
61
+ relu = "relu"
62
+ silu = "silu"
63
+ llama_geglu = "llama_geglu"
64
+ llama_geglu_tanh = "llama_geglu_tanh"
65
+ llama_swiglu = "llama_swiglu"
66
+ swiglu = "swiglu"
67
+
68
+
69
+ def ensure_finite_(x: torch.Tensor, check_neg_inf: bool = True, check_pos_inf: bool = False):
70
+ """
71
+ Modify ``x`` in place to replace ``float("-inf")`` with the minimum value of the dtype when ``check_neg_inf``
72
+ is ``True`` and to replace ``float("inf")`` with the maximum value of the dtype when ``check_pos_inf`` is ``True``.
73
+ """
74
+ if check_neg_inf:
75
+ x.masked_fill_(x == float("-inf"), torch.finfo(x.dtype).min)
76
+ if check_pos_inf:
77
+ x.masked_fill_(x == float("inf"), torch.finfo(x.dtype).max)
78
+
79
+
80
+ class MolmoConfigurationError(Exception):
81
+ pass
82
+
83
+
84
+ def _non_meta_init_device(config) -> torch.device:
85
+ if config.init_device is not None and config.init_device != "meta":
86
+ return torch.device(config.init_device)
87
+ else:
88
+ return torch.device("cuda" if torch.cuda.is_available() else "cpu")
89
+
90
+
91
+ class RotaryEmbedding(nn.Module):
92
+ """
93
+ [Rotary positional embeddings (RoPE)](https://arxiv.org/abs/2104.09864).
94
+ """
95
+
96
+ def __init__(self, config: MolmoConfig, cache: BufferCache):
97
+ super().__init__()
98
+ self.config = config
99
+ self.__cache = cache
100
+ # Warm up cache.
101
+ self.get_rotary_embedding(
102
+ config.max_position_embeddings or config.max_sequence_length,
103
+ _non_meta_init_device(config)
104
+ )
105
+
106
+ def get_rotary_embedding(self, seq_len: int, device: torch.device) -> Tuple[torch.Tensor, torch.Tensor]:
107
+ if (
108
+ (pos_sin := self.__cache.get("rope_pos_sin")) is not None
109
+ and (pos_cos := self.__cache.get("rope_pos_cos")) is not None
110
+ and pos_sin.shape[-2] >= seq_len
111
+ and pos_cos.shape[-2] >= seq_len
112
+ ):
113
+ if pos_sin.device != device:
114
+ pos_sin = pos_sin.to(device)
115
+ self.__cache["rope_pos_sin"] = pos_sin
116
+ if pos_cos.device != device:
117
+ pos_cos = pos_cos.to(device)
118
+ self.__cache["rope_pos_cos"] = pos_cos
119
+ return pos_sin[:, :, :seq_len, :], pos_cos[:, :, :seq_len, :]
120
+
121
+ with torch.autocast(device.type, enabled=False):
122
+ dim = self.config.d_model // self.config.n_heads
123
+ inv_freq = 1.0 / (self.config.rope_theta ** (torch.arange(0, dim, 2, device=device, dtype=torch.float) / dim))
124
+ seq = torch.arange(seq_len, device=device, dtype=torch.float)
125
+ freqs = torch.einsum("i , j -> i j", seq, inv_freq)
126
+ if self.config.rope_impl == "interleave":
127
+ positions = freqs.repeat_interleave(2, dim=-1)
128
+ else:
129
+ positions = torch.cat((freqs, freqs), dim=-1)
130
+ pos_sin, pos_cos = positions.sin()[None, None, :, :], positions.cos()[None, None, :, :]
131
+ self.__cache["rope_pos_sin"] = pos_sin
132
+ self.__cache["rope_pos_cos"] = pos_cos
133
+ return pos_sin, pos_cos
134
+
135
+ def rotate_half(self, x: torch.Tensor) -> torch.Tensor:
136
+ B, nh, T, hs = x.size()
137
+ x = x.view(B, nh, T, 2, hs // 2)
138
+ x1, x2 = x.unbind(dim=-2)
139
+ return torch.cat((-x2, x1), dim=-1)
140
+
141
+ def rotate_every_two(self, x: torch.Tensor) -> torch.Tensor:
142
+ B, nh, T, hs = x.size()
143
+ x = x.view(B, nh, T, hs // 2, 2)
144
+ x1, x2 = x.unbind(dim=-1)
145
+ x = torch.stack((-x2, x1), dim=-1)
146
+ return x.view(B, nh, T, hs)
147
+
148
+ def apply_rotary_pos_emb(self, pos_sin: torch.Tensor, pos_cos: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
149
+ if self.config.rope_impl == "interleave":
150
+ return ((t * pos_cos) + (self.rotate_every_two(t) * pos_sin)).to(t.dtype)
151
+ else:
152
+ return ((t * pos_cos) + (self.rotate_half(t) * pos_sin)).to(t.dtype)
153
+
154
+ def forward(
155
+ self,
156
+ q: torch.Tensor,
157
+ k: torch.Tensor,
158
+ position_ids: Optional[torch.Tensor] = None
159
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
160
+ if self.config.rope_full_precision:
161
+ q_, k_ = q.float(), k.float()
162
+ else:
163
+ q_, k_ = q, k
164
+
165
+ with torch.autocast(q.device.type, enabled=False):
166
+ batch_size = q_.shape[0]
167
+ query_len, key_len = q_.shape[-2], k_.shape[-2] # could be different if layer_past not None
168
+ if position_ids is not None:
169
+ freqs_cis_len = (self.config.max_position_embeddings or self.config.max_sequence_length)
170
+ else:
171
+ freqs_cis_len = key_len
172
+ pos_sin, pos_cos = self.get_rotary_embedding(freqs_cis_len, q_.device)
173
+ pos_sin = pos_sin.type_as(q_)
174
+ pos_cos = pos_cos.type_as(q_)
175
+ if position_ids is not None:
176
+ assert query_len == key_len, "Query and key lengths must be equal when using position IDs."
177
+ pos_sin = pos_sin[0, 0][position_ids].view(
178
+ (batch_size, 1, key_len, pos_sin.shape[-1])
179
+ )
180
+ pos_cos = pos_cos[0, 0][position_ids].view(
181
+ (batch_size, 1, key_len, pos_cos.shape[-1])
182
+ )
183
+ q_ = self.apply_rotary_pos_emb(
184
+ pos_sin[:, :, key_len - query_len : key_len, :],
185
+ pos_cos[:, :, key_len - query_len : key_len, :],
186
+ q_,
187
+ )
188
+ k_ = self.apply_rotary_pos_emb(pos_sin, pos_cos, k_)
189
+ return q_.type_as(q), k_.type_as(k)
190
+
191
+
192
+ class MolmoBlock(nn.Module):
193
+ """
194
+ A base class for transformer block implementations.
195
+ """
196
+
197
+ def __init__(self, layer_id: int, config: MolmoConfig, cache: BufferCache):
198
+ super().__init__()
199
+ self.layer_id = layer_id
200
+ self.config = config
201
+ self.hidden_size = (
202
+ config.mlp_hidden_size if config.mlp_hidden_size is not None else config.mlp_ratio * config.d_model
203
+ )
204
+ self.__cache = cache
205
+ self._activation_checkpoint_fn = None
206
+
207
+ # Dropout.
208
+ self.dropout = Dropout(config.residual_dropout)
209
+
210
+ # Layer norms.
211
+ self.k_norm: Optional[LayerNormBase] = None
212
+ self.q_norm: Optional[LayerNormBase] = None
213
+ if config.attention_layer_norm:
214
+ assert config.effective_n_kv_heads is not None
215
+ self.k_norm = LayerNormBase.build(
216
+ config,
217
+ size=(config.d_model // config.n_heads) * config.effective_n_kv_heads,
218
+ elementwise_affine=config.attention_layer_norm_with_affine,
219
+ )
220
+ self.q_norm = LayerNormBase.build(config, elementwise_affine=config.attention_layer_norm_with_affine)
221
+
222
+ # Make sure QKV clip coefficient is positive, otherwise it's not well-defined.
223
+ if config.clip_qkv is not None:
224
+ assert config.clip_qkv > 0
225
+
226
+ # Activation function.
227
+ self.act = Activation.build(config)
228
+ assert (self.act.output_multiplier * self.hidden_size) % 1 == 0
229
+
230
+ # Attention output projection.
231
+ input_dim = config.d_model
232
+ self.attn_out = nn.Linear(
233
+ input_dim, config.d_model,
234
+ bias=config.include_bias,
235
+ device=config.init_device
236
+ )
237
+
238
+ # Feed-forward output projection.
239
+ self.ff_out = nn.Linear(
240
+ int(self.act.output_multiplier * self.hidden_size),
241
+ config.d_model,
242
+ bias=config.include_bias,
243
+ device=config.init_device,
244
+ )
245
+ self.ff_out._is_residual = True # type: ignore
246
+
247
+ # Rotary embeddings.
248
+ if self.config.rope:
249
+ self.rotary_emb = RotaryEmbedding(config, self.__cache)
250
+
251
+ self.flash_attn_func = None
252
+ if config.attention_type == "flash":
253
+ try:
254
+ from flash_attn import flash_attn_func # type: ignore
255
+
256
+ self.flash_attn_func = flash_attn_func
257
+ except ModuleNotFoundError:
258
+ pass
259
+
260
+ def reset_parameters(self):
261
+ if self.k_norm is not None:
262
+ self.k_norm.reset_parameters()
263
+ if self.q_norm is not None:
264
+ self.q_norm.reset_parameters()
265
+ init_weights(
266
+ self.config,
267
+ self.attn_out,
268
+ d=self.config.d_model,
269
+ layer_id=self.layer_id,
270
+ type_of_module=ModuleType.out_module,
271
+ )
272
+ init_weights(
273
+ self.config,
274
+ self.ff_out,
275
+ d=self.ff_out.in_features,
276
+ layer_id=self.layer_id,
277
+ type_of_module=ModuleType.out_module,
278
+ )
279
+
280
+ @classmethod
281
+ def _cast_attn_bias(cls, bias: torch.Tensor, input_dtype: torch.dtype) -> torch.Tensor:
282
+ target_dtype = input_dtype
283
+ # NOTE: `is_autocast_enabled()` only checks for CUDA autocast, so we use the separate function
284
+ # `is_autocast_cpu_enabled()` for CPU autocast.
285
+ # See https://github.com/pytorch/pytorch/issues/110966.
286
+ if bias.device.type == "cuda" and torch.is_autocast_enabled():
287
+ target_dtype = torch.get_autocast_gpu_dtype()
288
+ elif bias.device.type == "cpu" and torch.is_autocast_cpu_enabled():
289
+ target_dtype = torch.get_autocast_cpu_dtype()
290
+ if bias.dtype != target_dtype:
291
+ bias = bias.to(target_dtype)
292
+ ensure_finite_(bias, check_neg_inf=True, check_pos_inf=False)
293
+ return bias
294
+
295
+ def _scaled_dot_product_attention(
296
+ self,
297
+ q: torch.Tensor,
298
+ k: torch.Tensor,
299
+ v: torch.Tensor,
300
+ attn_mask: Optional[torch.Tensor] = None,
301
+ dropout_p: float = 0.0,
302
+ response_dropout_p: float = 0.0,
303
+ is_causal: bool = False,
304
+ ) -> torch.Tensor:
305
+ """
306
+ Computes scaled dot product attention on query, key and value tensors, using an optional
307
+ attention mask if passed, and applying dropout if a probability greater than 0.0 is specified.
308
+ """
309
+ if attn_mask is not None:
310
+ attn_mask = attn_mask.to(q.device)
311
+
312
+ if self.flash_attn_func is not None and attn_mask is None:
313
+ r = self.flash_attn_func(
314
+ q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), dropout_p=dropout_p, causal=is_causal
315
+ )
316
+ return r.transpose(1, 2)
317
+ else:
318
+ # torch's sdpa doesn't support GQA, so we're doing this
319
+ assert k.size(1) == v.size(1)
320
+ num_kv_heads = k.size(1)
321
+ num_q_heads = q.size(1)
322
+ if num_q_heads != num_kv_heads:
323
+ assert num_q_heads % num_kv_heads == 0
324
+ k = k.repeat_interleave(num_q_heads // num_kv_heads, dim=1, output_size=num_q_heads)
325
+ v = v.repeat_interleave(num_q_heads // num_kv_heads, dim=1, output_size=num_q_heads)
326
+
327
+ return F.scaled_dot_product_attention(
328
+ q,
329
+ k,
330
+ v,
331
+ attn_mask=attn_mask,
332
+ dropout_p=dropout_p,
333
+ is_causal=is_causal,
334
+ )
335
+
336
+ def attention(
337
+ self,
338
+ q: torch.Tensor,
339
+ k: torch.Tensor,
340
+ v: torch.Tensor,
341
+ attention_bias: Optional[torch.Tensor] = None,
342
+ position_ids: Optional[torch.Tensor] = None,
343
+ layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
344
+ use_cache: bool = False,
345
+ ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
346
+ B, T, C = q.size() # batch size, sequence length, d_model
347
+ dtype = k.dtype
348
+
349
+ # Optionally apply layer norm to keys and queries.
350
+ if self.q_norm is not None and self.k_norm is not None:
351
+ q = self.q_norm(q).to(dtype=dtype)
352
+ k = self.k_norm(k).to(dtype=dtype)
353
+
354
+ # Move head forward to be next to the batch dim.
355
+ # shape: (B, nh, T, hs)
356
+ q = q.view(B, T, self.config.n_heads, C // self.config.n_heads).transpose(1, 2)
357
+ # shape: (B, n_kv_h, T, hs)
358
+ k = k.view(B, T, self.config.effective_n_kv_heads, C // self.config.n_heads).transpose(1, 2)
359
+ # shape: (B, n_kv_h, T, hs)
360
+ v = v.view(B, T, self.config.effective_n_kv_heads, C // self.config.n_heads).transpose(1, 2)
361
+
362
+ if self.config.use_position_ids and self.config.rope:
363
+ # Apply rotary embeddings
364
+ q, k = self.rotary_emb(q, k, position_ids=position_ids)
365
+
366
+ if layer_past is not None:
367
+ past_key, past_value = layer_past
368
+ k = torch.cat((past_key.to(k.device), k), dim=-2)
369
+ v = torch.cat((past_value.to(v.device), v), dim=-2)
370
+
371
+ present = (k, v) if use_cache else None
372
+ query_len, key_len = q.shape[-2], k.shape[-2] # could be different if layer_past not None
373
+
374
+ if not self.config.use_position_ids and self.config.rope:
375
+ # Apply rotary embeddings
376
+ q, k = self.rotary_emb(q, k)
377
+
378
+ if attention_bias is not None:
379
+ # Resize and cast attention bias.
380
+ # The current dtype of the attention bias might not match the dtype that the SDP attn function will
381
+ # run in if AMP is enabled, and this can be a problem if some tokens are masked out due to padding
382
+ # as down-casting the attention bias to the autocast precision will result in -infs, which will
383
+ # cause the SDP attn function to produce NaNs.
384
+ attention_bias = self._cast_attn_bias(
385
+ attention_bias[:, :, key_len - query_len : key_len, :key_len], dtype
386
+ )
387
+
388
+ # Get the attention scores.
389
+ # shape: (B, nh, T, hs)
390
+ att = self._scaled_dot_product_attention(
391
+ q,
392
+ k,
393
+ v,
394
+ attn_mask=attention_bias,
395
+ dropout_p=0.0 if not self.training else self.config.attention_dropout,
396
+ response_dropout_p=0.0 if not self.training else self.config.response_attention_dropout,
397
+ is_causal=attention_bias is None,
398
+ )
399
+
400
+ # Re-assemble all head outputs side-by-side.
401
+ att = att.transpose(1, 2).contiguous().view(B, T, C)
402
+
403
+ # Apply output projection.
404
+ return self.attn_out(att), present
405
+
406
+ def forward(
407
+ self,
408
+ x: torch.Tensor,
409
+ attention_bias: Optional[torch.FloatTensor] = None,
410
+ position_ids: Optional[torch.Tensor] = None,
411
+ layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
412
+ use_cache: bool = False,
413
+ ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
414
+ raise NotImplementedError
415
+
416
+ @classmethod
417
+ def build(cls, layer_id: int, config: MolmoConfig, cache: BufferCache):
418
+ return MolmoSequentialBlock(layer_id, config, cache)
419
+
420
+
421
+ class MolmoSequentialBlock(MolmoBlock):
422
+ """
423
+ This is a typical transformer block where the output is computed as ``MLP(LN(x + Attention(LN(x))))``
424
+ (plus another skip connection).
425
+ """
426
+
427
+ def __init__(self, layer_id: int, config: MolmoConfig, cache: BufferCache):
428
+ super().__init__(layer_id, config, cache)
429
+ # Layer norms.
430
+ self.attn_norm = LayerNorm.build(config)
431
+ self.ff_norm = LayerNorm.build(config)
432
+ # Attention input projection. Projects x -> (q, k, v)
433
+
434
+ head_dim = config.d_model // config.n_heads
435
+ self.fused_dims = (
436
+ config.d_model,
437
+ config.effective_n_kv_heads * head_dim,
438
+ config.effective_n_kv_heads * head_dim,
439
+ )
440
+ self.att_proj = nn.Linear(
441
+ config.d_model, sum(self.fused_dims),
442
+ bias=config.include_bias or config.qkv_bias,
443
+ device=config.init_device
444
+ )
445
+ # Feed-forward input projection.
446
+ self.ff_proj = nn.Linear(
447
+ config.d_model, self.hidden_size, bias=config.include_bias, device=config.init_device
448
+ )
449
+
450
+ def reset_parameters(self):
451
+ super().reset_parameters()
452
+ self.attn_norm.reset_parameters()
453
+ self.ff_norm.reset_parameters()
454
+ # NOTE: the standard deviation for these weights does not depend on the layer.
455
+ init_weights(
456
+ self.config, self.att_proj, d=self.config.d_model, layer_id=None, type_of_module=ModuleType.in_module
457
+ )
458
+ init_weights(
459
+ self.config, self.ff_proj, d=self.config.d_model, layer_id=None, type_of_module=ModuleType.in_module
460
+ )
461
+
462
+ def forward(
463
+ self,
464
+ x: torch.Tensor,
465
+ attention_bias: Optional[torch.Tensor] = None,
466
+ position_ids: Optional[torch.Tensor] = None,
467
+ layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
468
+ use_cache: bool = False,
469
+ ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
470
+ # Get query, key, value projections.
471
+ # shape:
472
+ # - for regular attn q, k, v: (batch_size, seq_len, d_model)
473
+ # - for multi-query attn q: (batch_size, seq_len, d_model)
474
+ # k, v: (batch_size, seq_len, d_model // n_heads)
475
+ # - for group query attn q: (batch_size, seq_len, d_model)
476
+ # k, v: (batch_size, seq_len, d_model // n_kv_heads)
477
+
478
+ if not self.config.norm_after:
479
+ if self._activation_checkpoint_fn is not None:
480
+ atten_in = self._activation_checkpoint_fn(self.attn_norm, x)
481
+ else:
482
+ atten_in = self.attn_norm(x)
483
+ else:
484
+ atten_in = x
485
+ qkv = self.att_proj(atten_in)
486
+
487
+ if self.config.clip_qkv is not None:
488
+ qkv.clamp_(min=-self.config.clip_qkv, max=self.config.clip_qkv)
489
+
490
+ q, k, v = qkv.split(self.fused_dims, dim=-1)
491
+
492
+ # Get attention scores.
493
+ if self._activation_checkpoint_fn is not None:
494
+ att, cache = self._activation_checkpoint_fn( # type: ignore
495
+ self.attention, q, k, v, attention_bias, position_ids=position_ids, layer_past=layer_past, use_cache=use_cache
496
+ )
497
+ else:
498
+ att, cache = self.attention(q, k, v, attention_bias, position_ids=position_ids, layer_past=layer_past, use_cache=use_cache)
499
+
500
+ if self.config.norm_after:
501
+ if self._activation_checkpoint_fn is not None:
502
+ att = self._activation_checkpoint_fn(self.attn_norm, att)
503
+ else:
504
+ att = self.attn_norm(att)
505
+
506
+ # Add attention scores.
507
+ # shape: (B, T, C)
508
+ x = x + self.dropout(att)
509
+
510
+ # Add feed-forward projection.
511
+ # shape: (batch_size, seq_len, d_model)
512
+ og_x = x
513
+
514
+ if not self.config.norm_after:
515
+ if self._activation_checkpoint_fn is not None:
516
+ x = self._activation_checkpoint_fn(self.ff_norm, x) # type: ignore
517
+ else:
518
+ x = self.ff_norm(x)
519
+
520
+ x = self.ff_proj(x)
521
+ if self._activation_checkpoint_fn is not None:
522
+ x = self._activation_checkpoint_fn(self.act, x) # type: ignore
523
+ else:
524
+ x = self.act(x)
525
+ x = self.ff_out(x)
526
+
527
+ if self.config.norm_after:
528
+ if self._activation_checkpoint_fn is not None:
529
+ x = self._activation_checkpoint_fn(self.ff_norm, x) # type: ignore
530
+ else:
531
+ x = self.ff_norm(x)
532
+
533
+ x = self.dropout(x)
534
+ x = og_x + x
535
+
536
+ return x, cache
537
+
538
+
539
+ class Embedding(nn.Module):
540
+ def __init__(
541
+ self,
542
+ num_embeddings: int,
543
+ num_new_embeddings: int,
544
+ features: int,
545
+ device: Union[str, torch.device],
546
+ initializer_range: float = 0.02,
547
+ new_embed_initializer_range: float = 0.02,
548
+ ):
549
+ super().__init__()
550
+ self.initializer_range = initializer_range
551
+ self.new_embed_initializer_range = new_embed_initializer_range
552
+ self.embedding = nn.Parameter(
553
+ torch.zeros(num_embeddings, features, device=device),
554
+ )
555
+ self.new_embedding = nn.Parameter(
556
+ torch.zeros(num_new_embeddings, features, device=device),
557
+ )
558
+
559
+ def reset_parameters(self):
560
+ nn.init.normal_(self.embedding, std=self.initializer_range)
561
+ nn.init.normal_(self.new_embedding, std=self.new_embed_initializer_range)
562
+
563
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
564
+ return F.embedding(x, torch.cat([self.embedding, self.new_embedding], dim=0))
565
+
566
+
567
+ class Dropout(nn.Dropout):
568
+ def __init__(
569
+ self,
570
+ p: float = 0.5,
571
+ inplace: bool = False,
572
+ mask_p: float = 0,
573
+ broadcast_dims: Sequence[int] = (),
574
+ ):
575
+ super().__init__(p, inplace)
576
+ self.mask_p = mask_p
577
+ self.broadcast_dims = broadcast_dims
578
+
579
+ def forward(self, input: torch.Tensor) -> torch.Tensor:
580
+ """
581
+ :param input: A tensor of shape `(batch_size, seq_len, embed_dim)`
582
+ """
583
+ if self.p == 0.0 and (self.mask_p is None or self.mask_p == 0.0):
584
+ return input
585
+ else:
586
+ if self.p > 0. and len(self.broadcast_dims) > 0 and self.training:
587
+ keep_prob = 1.0 - self.p
588
+ dropout_shape = list(input.shape)
589
+ for dim in self.broadcast_dims:
590
+ dropout_shape[dim] = 1
591
+ keep = input.new_empty(dropout_shape).bernoulli_(keep_prob)
592
+ multiplier = keep.broadcast_to(input.shape)
593
+ multiplier.div_(keep_prob)
594
+ input = input * multiplier
595
+ else:
596
+ return F.dropout(input, self.p, self.training, self.inplace)
597
+
598
+
599
+ @dataclass
600
+ class VisionBackboneConfig:
601
+ image_default_input_size: Tuple[int, int] = (336, 336)
602
+ image_patch_size: int = 14
603
+ image_pos_patch_size: int = 14
604
+ image_emb_dim: int = 1024
605
+ image_num_heads: int = 16
606
+ image_num_key_value_heads: int = 16
607
+ image_num_layers: int = 24
608
+ image_head_dim: int = 64
609
+ image_mlp_dim: int = 4096
610
+ image_mlp_activations: str = "gelu"
611
+ image_dropout_rate: float = 0.0
612
+ image_num_pos: int = 577
613
+ image_norm_eps: float = 1e-5
614
+ attention_dropout: float = 0.0
615
+ residual_dropout: float = 0.0
616
+ initializer_range: float = 0.02
617
+ fsdp_wrap: bool = False
618
+ resize_mode: str = "default"
619
+
620
+ def __post_init__(self):
621
+ self.image_default_input_size = tuple(self.image_default_input_size) # type: ignore[assignment]
622
+
623
+ @property
624
+ def image_num_patch(self):
625
+ h, w = self.image_default_input_size
626
+ return h // self.image_patch_size, w // self.image_patch_size
627
+
628
+
629
+ @dataclass
630
+ class FullMolmoConfig:
631
+ d_model: int = 768
632
+ n_heads: int = 12
633
+ n_kv_heads: Optional[int] = None
634
+ qkv_bias: bool = False
635
+ clip_qkv: Optional[float] = None
636
+ n_layers: int = 12
637
+ mlp_ratio: int = 4
638
+ mlp_hidden_size: Optional[int] = None
639
+ activation_type: str = "swiglu"
640
+ block_group_size: int = 1
641
+ rope: bool = True
642
+ rope_full_precision: bool = True
643
+ rope_theta: float = 10000.
644
+ rope_impl: str = "interleave"
645
+ vision_backbone: Optional[VisionBackboneConfig] = None
646
+ attention_type: str = "sdpa"
647
+ float32_attention: bool = True
648
+ attention_dropout: float = 0.1
649
+ response_attention_dropout: float = 0.0
650
+ multi_query_attention: Optional[bool] = None
651
+ attention_layer_norm: bool = False
652
+ residual_dropout: float = 0.1
653
+ embedding_dropout: float = 0.1
654
+ layer_norm_type: str = "default"
655
+ layer_norm_with_affine: bool = True
656
+ layer_norm_eps: Optional[float] = None
657
+ attention_layer_norm_with_affine: bool = True
658
+ max_sequence_length: int = 1024
659
+ max_position_embeddings: Optional[int] = None
660
+ include_bias: bool = True
661
+ bias_for_layer_norm: Optional[bool] = None
662
+ scale_logits: bool = False
663
+ vocab_size: int = 50257
664
+ embedding_size: Optional[int] = 50304
665
+ additional_vocab_size: Optional[int] = None
666
+ new_embedding_init_range: float = 0.02
667
+ weight_tying: bool = True
668
+ pad_token_id: int = -1
669
+ init_device: Optional[str] = None
670
+ init_std: float = 0.02
671
+ init_cutoff_factor: Optional[float] = None
672
+ norm_after: bool = False
673
+ precision: Optional[str] = None
674
+ image_padding_embed: Optional[str] = None
675
+ vit_layers: Tuple = (-1,)
676
+ image_pooling_h: int = 2
677
+ image_pooling_w: int = 2
678
+ image_pooling_2d: str = "attention"
679
+ image_projector: str = "mlp"
680
+ image_feature_dropout: float = 0.0
681
+ initializer_range: float = 0.02
682
+ normalize_input_embeds: bool = False
683
+ use_position_ids: bool = True
684
+
685
+ @property
686
+ def effective_n_kv_heads(self) -> int:
687
+ if self.n_kv_heads is None:
688
+ if self.multi_query_attention is True:
689
+ return 1
690
+ else:
691
+ return self.n_heads
692
+ else:
693
+ if self.multi_query_attention is None:
694
+ return self.n_kv_heads
695
+ if self.multi_query_attention:
696
+ n_kv_heads_should_be = 1
697
+ else:
698
+ n_kv_heads_should_be = self.n_heads
699
+ if self.n_kv_heads == n_kv_heads_should_be:
700
+ return n_kv_heads_should_be
701
+ else:
702
+ raise MolmoConfigurationError(
703
+ "You can't set `multi_query_attention` and `n_kv_heads` at the same time."
704
+ )
705
+
706
+ @property
707
+ def image_num_patch(self):
708
+ assert self.vision_backbone is not None
709
+ return self.vision_backbone.image_num_patch
710
+
711
+ @property
712
+ def image_patch_size(self):
713
+ assert self.vision_backbone is not None
714
+ return self.vision_backbone.image_patch_size
715
+
716
+ def llm_patches_per_crop(self):
717
+ h, w = self.image_num_patch
718
+ # Round up in case we need to pad the image features for pooling
719
+ h = (h + self.image_pooling_h - 1) // self.image_pooling_h
720
+ w = (w + self.image_pooling_w - 1) // self.image_pooling_w
721
+ return h, w
722
+
723
+
724
+ def _expand_token(token, batch_size: int):
725
+ return token.view(1, 1, -1).expand(batch_size, -1, -1)
726
+
727
+
728
+ class ViTMLP(nn.Module):
729
+ def __init__(self, config: FullMolmoConfig):
730
+ super().__init__()
731
+ self.config = config
732
+ v_cfg = config.vision_backbone
733
+
734
+ self.w1 = nn.Linear(
735
+ v_cfg.image_emb_dim,
736
+ v_cfg.image_mlp_dim,
737
+ bias=True,
738
+ device=config.init_device,
739
+ )
740
+ # Activation function.
741
+ cfg = deepcopy(config)
742
+ cfg.activation_type = v_cfg.image_mlp_activations
743
+ self.act = Activation.build(cfg)
744
+ self.w2 = nn.Linear(
745
+ v_cfg.image_mlp_dim,
746
+ v_cfg.image_emb_dim,
747
+ bias=True,
748
+ device=config.init_device,
749
+ )
750
+
751
+ def reset_parameters(self):
752
+ v_cfg = self.config.vision_backbone
753
+ nn.init.trunc_normal_(self.w1.weight, std=math.sqrt(1 / v_cfg.image_emb_dim), a=-2.0, b=2.0)
754
+ nn.init.trunc_normal_(self.w2.weight, std=math.sqrt(1 / v_cfg.image_mlp_dim), a=-2.0, b=2.0)
755
+ nn.init.zeros_(self.w1.bias)
756
+ nn.init.zeros_(self.w2.bias)
757
+
758
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
759
+ x = self.w1(x)
760
+ x = self.act(x)
761
+ x = self.w2(x)
762
+ return x
763
+
764
+
765
+ class ResidualAttentionBlock(nn.Module):
766
+
767
+ def __init__(self, config: FullMolmoConfig):
768
+ super().__init__()
769
+ self.config = config
770
+
771
+ v_cfg = config.vision_backbone
772
+ self.attention = MultiHeadDotProductAttention(config)
773
+ self.feed_forward = ViTMLP(config)
774
+ self.attention_norm = nn.LayerNorm(
775
+ v_cfg.image_emb_dim,
776
+ eps=v_cfg.image_norm_eps,
777
+ device=config.init_device,
778
+ )
779
+ self.ffn_norm = nn.LayerNorm(
780
+ v_cfg.image_emb_dim,
781
+ eps=v_cfg.image_norm_eps,
782
+ device=config.init_device,
783
+ )
784
+
785
+ def reset_parameters(self):
786
+ self.attention.reset_parameters()
787
+ self.feed_forward.reset_parameters()
788
+ self.attention_norm.reset_parameters()
789
+ self.ffn_norm.reset_parameters()
790
+
791
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
792
+ x = x + self.attention(self.attention_norm(x))
793
+ x = x + self.feed_forward(self.ffn_norm(x))
794
+ return x
795
+
796
+
797
+ class BlockCollection(nn.Module):
798
+
799
+ def __init__(self, config: FullMolmoConfig):
800
+ super().__init__()
801
+ self.config = config
802
+ self.grad_checkpointing: bool = False
803
+
804
+ v_cfg = config.vision_backbone
805
+ self.resblocks = nn.ModuleList([
806
+ ResidualAttentionBlock(config) for _ in range(v_cfg.image_num_layers)
807
+ ])
808
+
809
+ def reset_parameters(self):
810
+ for r in self.resblocks:
811
+ r.reset_parameters()
812
+
813
+ def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
814
+ hidden_states = []
815
+ for r in self.resblocks:
816
+ x = r(x)
817
+ hidden_states.append(x)
818
+ return hidden_states
819
+
820
+
821
+ class LayerNormFp32(nn.LayerNorm):
822
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
823
+ orig_type = x.dtype
824
+ x = F.layer_norm(x.to(torch.float32), self.normalized_shape, self.weight.to(torch.float32),
825
+ self.bias.to(torch.float32), self.eps)
826
+ return x.to(orig_type)
827
+
828
+
829
+ class VisionTransformer(nn.Module):
830
+
831
+ def __init__(self, config: FullMolmoConfig):
832
+ super().__init__()
833
+ self.config = config
834
+
835
+ v_cfg = config.vision_backbone
836
+ # class embeddings and positional embeddings
837
+ self.scale = v_cfg.image_emb_dim ** -0.5
838
+ self.class_embedding = nn.Parameter(
839
+ torch.zeros(v_cfg.image_emb_dim, device=config.init_device),
840
+ )
841
+ self.num_prefix_tokens: int = 1
842
+ self.positional_embedding = nn.Parameter(
843
+ torch.zeros(v_cfg.image_num_pos, v_cfg.image_emb_dim, device=config.init_device),
844
+ )
845
+
846
+ image_patch_size = v_cfg.image_patch_size
847
+ self.patch_embedding = nn.Linear(
848
+ image_patch_size * image_patch_size * 3,
849
+ v_cfg.image_emb_dim,
850
+ bias=False,
851
+ device=config.init_device,
852
+ )
853
+
854
+ self.pre_ln = LayerNormFp32(
855
+ v_cfg.image_emb_dim,
856
+ eps=v_cfg.image_norm_eps,
857
+ )
858
+
859
+ self.transformer = BlockCollection(config)
860
+
861
+ @torch.jit.ignore
862
+ def set_grad_checkpointing(self, enable=True):
863
+ self.transformer.grad_checkpointing = enable
864
+
865
+ def reset_parameters(self):
866
+ nn.init.normal_(self.class_embedding, std=self.scale)
867
+ nn.init.normal_(self.positional_embedding, std=self.scale)
868
+ nn.init.normal_(self.patch_embedding.weight, std=0.02)
869
+ self.pre_ln.reset_parameters()
870
+ self.transformer.reset_parameters()
871
+
872
+ def add_pos_emb(self, x: torch.Tensor, patch_num: int) -> torch.Tensor:
873
+ cls_emb = self.positional_embedding[0:1]
874
+ pos_emb = self.positional_embedding[1:]
875
+
876
+ pos_emb = pos_emb.reshape(
877
+ (int(math.sqrt(pos_emb.shape[0])), int(math.sqrt(pos_emb.shape[0])), pos_emb.shape[1])
878
+ )
879
+
880
+ (patch_num_0, patch_num_1) = patch_num
881
+
882
+ if pos_emb.shape[0] != patch_num_0 or pos_emb.shape[1] != patch_num_1:
883
+ # Derived from https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py
884
+ # antialias: default True in jax.image.resize
885
+ pos_emb = pos_emb.unsqueeze(0).permute(0, 3, 1, 2)
886
+ pos_emb = F.interpolate(
887
+ pos_emb, size=(patch_num_0, patch_num_1), mode="bicubic", align_corners=False, antialias=True,
888
+ )
889
+ pos_emb = pos_emb.permute(0, 2, 3, 1).squeeze(0)
890
+
891
+ pos_emb = pos_emb.reshape(-1, pos_emb.shape[-1])
892
+ x = x + torch.cat([cls_emb[None, :, :], pos_emb[None, :, :]], dim=1).to(x.dtype)
893
+ return x
894
+
895
+ def forward(self, x: torch.Tensor, patch_num: Optional[Tuple[int, int]] = None) -> List[torch.Tensor]:
896
+ """
897
+ : param x: (batch_size, num_patch, n_pixels)
898
+ """
899
+ if patch_num is None:
900
+ patch_num = self.config.vision_backbone.image_num_patch
901
+ B, N, D = x.shape
902
+
903
+ x = self.patch_embedding(x)
904
+
905
+ # class embeddings and positional embeddings
906
+ x = torch.cat([_expand_token(self.class_embedding, x.shape[0]).to(x.dtype), x], dim=1)
907
+ x = self.add_pos_emb(x, patch_num)
908
+
909
+ x = self.pre_ln(x)
910
+
911
+ hidden_states = self.transformer(x)
912
+ return hidden_states
913
+
914
+
915
+ class MultiHeadDotProductAttention(nn.Module):
916
+ def __init__(self, config: FullMolmoConfig, use_bias: bool = True, is_vit_layer: Optional[bool] = True):
917
+ super().__init__()
918
+ self.config = config
919
+ self.use_bias = use_bias
920
+
921
+ v_cfg = config.vision_backbone
922
+ self.embed_dim = v_cfg.image_emb_dim
923
+ self.num_heads = v_cfg.image_num_heads
924
+ self.head_dim = v_cfg.image_head_dim
925
+ self.num_key_value_heads = v_cfg.image_num_key_value_heads
926
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
927
+ self.initializer_range = v_cfg.initializer_range
928
+ self.is_vit_layer = is_vit_layer
929
+
930
+ nlayers = 1 if (is_vit_layer or config.vit_layers is None) else len(config.vit_layers)
931
+
932
+ self.wq = nn.Linear(
933
+ nlayers * self.embed_dim,
934
+ self.num_heads * self.head_dim,
935
+ bias=use_bias,
936
+ device=config.init_device,
937
+ )
938
+ self.wk = nn.Linear(
939
+ nlayers * self.embed_dim,
940
+ self.num_key_value_heads * self.head_dim,
941
+ bias=use_bias,
942
+ device=config.init_device,
943
+ )
944
+ self.wv = nn.Linear(
945
+ nlayers * self.embed_dim,
946
+ self.num_key_value_heads * self.head_dim,
947
+ bias=use_bias,
948
+ device=config.init_device,
949
+ )
950
+ self.wo = nn.Linear(
951
+ self.num_heads * self.head_dim,
952
+ self.embed_dim,
953
+ bias=use_bias,
954
+ device=config.init_device,
955
+ )
956
+ self.attention_dropout: Optional[Dropout] = None
957
+ if v_cfg.attention_dropout > 0:
958
+ self.attention_dropout = Dropout(v_cfg.attention_dropout, broadcast_dims=(0, 1))
959
+ self.residual_dropout = Dropout(v_cfg.residual_dropout)
960
+
961
+ def reset_parameters(self):
962
+ nn.init.normal_(self.wq.weight, std=self.initializer_range)
963
+ nn.init.normal_(self.wk.weight, std=self.initializer_range)
964
+ nn.init.normal_(self.wv.weight, std=self.initializer_range)
965
+ nn.init.normal_(self.wo.weight, std=self.initializer_range)
966
+ if self.use_bias:
967
+ nn.init.constant_(self.wq.bias, 0)
968
+ nn.init.constant_(self.wk.bias, 0)
969
+ nn.init.constant_(self.wv.bias, 0)
970
+ nn.init.constant_(self.wo.bias, 0)
971
+
972
+ def _split_heads(self, hidden_states, num_heads) -> torch.Tensor:
973
+ return hidden_states.reshape(hidden_states.shape[:2] + (num_heads, self.head_dim))
974
+
975
+ def _merge_heads(self, hidden_states) -> torch.Tensor:
976
+ return hidden_states.reshape(hidden_states.shape[:2] + (self.embed_dim,))
977
+
978
+ def forward(self, inputs_q: torch.Tensor, inputs_kv: Optional[torch.Tensor] = None) -> torch.Tensor:
979
+
980
+ if inputs_kv is not None:
981
+ inputs_k = inputs_kv
982
+ inputs_v = inputs_kv
983
+ else:
984
+ inputs_k = inputs_q
985
+ inputs_v = inputs_q
986
+
987
+ xq, xk, xv = self.wq(inputs_q), self.wk(inputs_k), self.wv(inputs_v)
988
+
989
+ xq = self._split_heads(xq, self.num_heads)
990
+ xk = self._split_heads(xk, self.num_key_value_heads)
991
+ xv = self._split_heads(xv, self.num_key_value_heads)
992
+
993
+ if self.num_heads != self.num_key_value_heads:
994
+ xk = xk.repeat_interleave(self.num_key_value_groups, dim=2, output_size=self.num_heads)
995
+ xv = xv.repeat_interleave(self.num_key_value_groups, dim=2, output_size=self.num_heads)
996
+
997
+ og_dtype = xq.dtype
998
+
999
+ if self.config.float32_attention:
1000
+ xq = xq.to(torch.float)
1001
+ xk = xk.to(torch.float)
1002
+
1003
+ if self.config.attention_type == "direct":
1004
+ attn_weights = torch.einsum("...qhd,...khd->...hqk", xq / math.sqrt(xq.size(-1)), xk)
1005
+ attn_weights = F.softmax(attn_weights, dim=-1, dtype=torch.float32).to(xq.dtype)
1006
+ if self.attention_dropout is not None:
1007
+ attn_weights = self.attention_dropout(attn_weights)
1008
+ attn_output = torch.einsum("...hqk,...khd->...qhd", attn_weights.to(xv.dtype), xv)
1009
+
1010
+ elif self.config.attention_type == "sdpa":
1011
+ if self.config.float32_attention and not torch.is_autocast_enabled():
1012
+ xv = xv.to(torch.float32)
1013
+ attn_output = F.scaled_dot_product_attention(
1014
+ xq.transpose(1, 2).contiguous(),
1015
+ xk.transpose(1, 2).contiguous(),
1016
+ xv.transpose(1, 2).contiguous(),
1017
+ is_causal=False,
1018
+ dropout_p=self.config.vision_backbone.attention_dropout
1019
+ ).transpose(1, 2)
1020
+ else:
1021
+ raise NotImplementedError(self.config.attention_type)
1022
+ attn_output = attn_output.to(og_dtype)
1023
+ attn_output = self._merge_heads(attn_output)
1024
+ attn_output = self.wo(attn_output)
1025
+ attn_output = self.residual_dropout(attn_output)
1026
+
1027
+ return attn_output
1028
+
1029
+
1030
+ class MultiHeadAttentionPool(nn.Module):
1031
+ def __init__(
1032
+ self,
1033
+ config: FullMolmoConfig,
1034
+ factor: int = 1,
1035
+ use_bias: bool = True,
1036
+ dropout: bool = True,
1037
+ output_layer: bool = True,
1038
+ mean_residual: bool = False,
1039
+ query: str = "mean",
1040
+ is_vit_layer: Optional[bool] = True
1041
+ ):
1042
+ super().__init__()
1043
+ self.config = config
1044
+ self.factor = factor
1045
+ self.use_bias = use_bias
1046
+ self.dropout = dropout
1047
+ self.output_layer = output_layer
1048
+ self.mean_residual = mean_residual
1049
+ self.query = query
1050
+
1051
+ v_cfg = config.vision_backbone
1052
+ input_dim = v_cfg.image_emb_dim
1053
+ self.embed_dim = v_cfg.image_emb_dim * factor
1054
+ self.num_heads = v_cfg.image_num_heads
1055
+ self.head_dim = v_cfg.image_head_dim * factor
1056
+ self.num_key_value_heads = v_cfg.image_num_key_value_heads
1057
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
1058
+ self.initializer_range = v_cfg.initializer_range
1059
+
1060
+ nlayers = 1 if (is_vit_layer or config.vit_layers is None) else len(config.vit_layers)
1061
+
1062
+ if query != "vector":
1063
+ self.wq = nn.Linear(
1064
+ nlayers * input_dim,
1065
+ self.num_heads * self.head_dim,
1066
+ bias=use_bias,
1067
+ device=config.init_device,
1068
+ )
1069
+ self.wk = nn.Linear(
1070
+ nlayers * input_dim,
1071
+ self.num_key_value_heads * self.head_dim,
1072
+ bias=use_bias,
1073
+ device=config.init_device,
1074
+ )
1075
+ self.wv = nn.Linear(
1076
+ nlayers * input_dim,
1077
+ self.num_key_value_heads * self.head_dim,
1078
+ bias=use_bias,
1079
+ device=config.init_device,
1080
+ )
1081
+
1082
+ if query == "vector":
1083
+ self.attention_query = nn.Parameter(
1084
+ torch.zeros(
1085
+ 1, self.num_key_value_heads * self.head_dim, device=config.init_device,
1086
+ ),
1087
+ )
1088
+
1089
+ if output_layer:
1090
+ self.wo = nn.Linear(
1091
+ self.num_heads * self.head_dim,
1092
+ self.embed_dim,
1093
+ bias=use_bias,
1094
+ device=config.init_device,
1095
+ )
1096
+ self.attention_dropout = Dropout(v_cfg.attention_dropout, broadcast_dims=(0, 1))
1097
+ if dropout:
1098
+ self.residual_dropout = Dropout(v_cfg.residual_dropout)
1099
+
1100
+ def reset_parameters(self):
1101
+ if self.query != "vector":
1102
+ nn.init.normal_(self.wq.weight, std=self.initializer_range)
1103
+ nn.init.normal_(self.wk.weight, std=self.initializer_range)
1104
+ nn.init.normal_(self.wv.weight, std=self.initializer_range)
1105
+ if self.output_layer:
1106
+ nn.init.normal_(self.wo.weight, std=self.initializer_range)
1107
+ if self.use_bias:
1108
+ if self.query != "vector":
1109
+ nn.init.constant_(self.wq.bias, 0)
1110
+ nn.init.constant_(self.wk.bias, 0)
1111
+ nn.init.constant_(self.wv.bias, 0)
1112
+ if self.output_layer:
1113
+ nn.init.constant_(self.wo.bias, 0)
1114
+ if self.query == "vector":
1115
+ nn.init.normal_(self.attention_query, std=self.initializer_range)
1116
+
1117
+ def _split_heads(self, hidden_states, num_heads):
1118
+ return hidden_states.reshape(hidden_states.shape[:2] + (num_heads, self.head_dim))
1119
+
1120
+ def _merge_heads(self, hidden_states):
1121
+ return hidden_states.reshape(hidden_states.shape[:2] + (self.embed_dim,))
1122
+
1123
+ def forward(self, inputs_kv: torch.Tensor) -> torch.Tensor:
1124
+
1125
+ xk, xv = self.wk(inputs_kv), self.wv(inputs_kv)
1126
+
1127
+ if self.query == "mean":
1128
+ inputs_q = inputs_kv.mean(dim=1, keepdim=True)
1129
+ xq = self.wq(inputs_q)
1130
+ elif self.query == "first":
1131
+ inputs_q = inputs_kv[:, :1]
1132
+ xq = self.wq(inputs_q)
1133
+ elif self.query == "vector":
1134
+ xq = self.attention_query.expand(inputs_kv.size(0), -1, -1)
1135
+ elif self.query == "constant":
1136
+ inputs_q = torch.ones_like(inputs_kv[:, :1]) / math.sqrt(inputs_kv.shape[-1])
1137
+ xq = self.wq(inputs_q)
1138
+ else:
1139
+ raise ValueError(f"Unknown query type: {self.query}")
1140
+
1141
+ xq = self._split_heads(xq, self.num_heads)
1142
+ xk = self._split_heads(xk, self.num_key_value_heads)
1143
+ xv = self._split_heads(xv, self.num_key_value_heads)
1144
+
1145
+ if self.num_heads != self.num_key_value_heads:
1146
+ xk = xk.repeat_interleave(self.num_key_value_groups, dim=2, output_size=self.num_heads)
1147
+ xv = xv.repeat_interleave(self.num_key_value_groups, dim=2, output_size=self.num_heads)
1148
+
1149
+ xq = xq.to(torch.float)
1150
+ xk = xk.to(torch.float)
1151
+
1152
+ xq = xq / math.sqrt(xq.size(-1))
1153
+ attn_weights = torch.einsum("...qhd,...khd->...hqk", xq, xk)
1154
+
1155
+ attn_weights = F.softmax(attn_weights, dim=-1).to(xq.dtype)
1156
+
1157
+ attn_weights = self.attention_dropout(attn_weights).to(xv.dtype)
1158
+
1159
+ attn_output = torch.einsum("...hqk,...khd->...qhd", attn_weights, xv)
1160
+ attn_output = self._merge_heads(attn_output)
1161
+ if self.output_layer:
1162
+ attn_output = self.wo(attn_output)
1163
+ if self.dropout:
1164
+ attn_output = self.residual_dropout(attn_output)
1165
+ if self.mean_residual:
1166
+ attn_output += inputs_kv.mean(dim=1, keepdim=True)
1167
+
1168
+ return attn_output
1169
+
1170
+
1171
+ class MLP(nn.Module):
1172
+ def __init__(self, config: FullMolmoConfig, input_dim: int, dropout: float = 0.0):
1173
+ super().__init__()
1174
+ self.config = config
1175
+ self.hidden_size = (
1176
+ config.mlp_hidden_size if config.mlp_hidden_size is not None else config.mlp_ratio * config.d_model
1177
+ )
1178
+ self.initializer_range = config.initializer_range
1179
+
1180
+ self.w1 = nn.Linear(
1181
+ input_dim,
1182
+ self.hidden_size // 2,
1183
+ bias=False,
1184
+ device=config.init_device,
1185
+ )
1186
+ self.w2 = nn.Linear(
1187
+ self.hidden_size // 2,
1188
+ config.d_model,
1189
+ bias=False,
1190
+ device=config.init_device,
1191
+ )
1192
+ self.w3 = nn.Linear(
1193
+ input_dim,
1194
+ self.hidden_size // 2,
1195
+ bias=False,
1196
+ device=config.init_device,
1197
+ )
1198
+ # Activation function.
1199
+ self.act = Activation.build(config)
1200
+ self.dropout = Dropout(dropout)
1201
+
1202
+ def reset_parameters(self):
1203
+ nn.init.normal_(self.w1.weight, std=self.initializer_range)
1204
+ nn.init.normal_(self.w2.weight, std=self.initializer_range)
1205
+ nn.init.normal_(self.w3.weight, std=self.initializer_range)
1206
+
1207
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
1208
+ x = self.w2(self.act(self.w1(x), self.w3(x)))
1209
+ x = self.dropout(x)
1210
+ return x
1211
+
1212
+
1213
+ class Residual(nn.Module):
1214
+ def __init__(self, submodule: nn.Module):
1215
+ super().__init__()
1216
+ self.submodule = submodule
1217
+
1218
+ def reset_parameters(self):
1219
+ self.submodule.reset_parameters()
1220
+
1221
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
1222
+ return x + self.submodule(x)
1223
+
1224
+
1225
+ class OLMoVisionBackbone(nn.Module):
1226
+ def __init__(self, config: FullMolmoConfig):
1227
+ super().__init__()
1228
+ self.config = config
1229
+ self.image_vit = VisionTransformer(config)
1230
+
1231
+ input_dim: int = None
1232
+ self.image_pooling_2d: nn.Module = None
1233
+ if config.image_pooling_2d in {ImagePooling2DType.attention, ImagePooling2DType.attention_meanq}:
1234
+ self.image_pooling_2d = MultiHeadDotProductAttention(config, is_vit_layer=False)
1235
+ input_dim = config.vision_backbone.image_emb_dim
1236
+ elif config.image_pooling_2d == ImagePooling2DType.attention_2wide:
1237
+ cfg = deepcopy(config)
1238
+ cfg.vision_backbone.image_emb_dim *= 2
1239
+ cfg.vision_backbone.image_head_dim *= 2
1240
+ self.image_pooling_2d = MultiHeadDotProductAttention(cfg, is_vit_layer=False)
1241
+ input_dim = cfg.vision_backbone.image_emb_dim
1242
+ elif config.image_pooling_2d == ImagePooling2DType.attention_v2:
1243
+ assert config.vit_layers is not None
1244
+ use_bias = True
1245
+ dropout = True
1246
+ output_layer = True
1247
+ query = "mean"
1248
+ mean_residual = False
1249
+ factor = len(config.vit_layers)
1250
+ self.image_pooling_2d = MultiHeadAttentionPool(
1251
+ config,
1252
+ factor=factor,
1253
+ use_bias=use_bias,
1254
+ dropout=dropout,
1255
+ output_layer=output_layer,
1256
+ mean_residual=mean_residual,
1257
+ query=query,
1258
+ is_vit_layer=False,
1259
+ )
1260
+ input_dim = config.vision_backbone.image_emb_dim * factor
1261
+ elif config.image_pooling_2d in [ImagePooling2DType.none, ImagePooling2DType.stack]:
1262
+ self.image_pooling_2d = None
1263
+ nlayers = 1 if config.vit_layers is None else len(config.vit_layers)
1264
+ input_dim = nlayers * config.vision_backbone.image_emb_dim
1265
+ else:
1266
+ raise NotImplementedError(f"Unknown image pooling 2D method: {config.image_pooling_2d}")
1267
+
1268
+ self.input_dim = input_dim
1269
+
1270
+ # `MLP` assumes the activation takes two inputs, so it must be a 'llama' version
1271
+ if config.activation_type == ActivationType.swiglu:
1272
+ mlp_config = replace(config, activation_type=ActivationType.llama_swiglu)
1273
+ elif config.activation_type == ActivationType.gelu:
1274
+ mlp_config = replace(config, activation_type=ActivationType.llama_geglu)
1275
+ else:
1276
+ mlp_config = config
1277
+ if config.image_projector == ImageProjectType.mlpx2:
1278
+ self.image_projector = nn.ModuleList(
1279
+ [MLP(mlp_config, input_dim), Residual(MLP(config, input_dim))]
1280
+ )
1281
+ elif config.image_projector == ImageProjectType.mlp:
1282
+ self.image_projector = MLP(mlp_config, input_dim)
1283
+ elif config.image_projector == ImageProjectType.linear:
1284
+ self.image_projector = nn.Linear(
1285
+ input_dim,
1286
+ config.d_model,
1287
+ bias=False,
1288
+ device=config.init_device,
1289
+ )
1290
+ else:
1291
+ raise NotImplementedError(f"Unknown image projector: {config.image_projector}")
1292
+
1293
+ self.image_feature_dropout = Dropout(config.image_feature_dropout)
1294
+
1295
+ def reset_parameters(self):
1296
+ if self.image_pooling_2d is not None:
1297
+ self.image_pooling_2d.reset_parameters()
1298
+ if self.config.image_projector == "2mlp":
1299
+ for module in self.image_projector:
1300
+ module.reset_parameters()
1301
+ elif self.config.image_projector == "linear":
1302
+ nn.init.xavier_uniform_(self.image_projector.weight)
1303
+ else:
1304
+ self.image_projector.reset_parameters()
1305
+
1306
+ def forward(self, images: torch.Tensor, image_masks: torch.Tensor) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
1307
+ raise NotImplementedError
1308
+
1309
+
1310
+ class OLMoPretrainedVisionBackbone(OLMoVisionBackbone):
1311
+ def __init__(self, config: FullMolmoConfig):
1312
+ super().__init__(config)
1313
+ v_cfg = self.config.vision_backbone
1314
+ self.grad_checkpointing = False
1315
+
1316
+ self.num_prefix_tokens = self.image_vit.num_prefix_tokens
1317
+ assert self.num_prefix_tokens in {0, 1}, "Only 0 or 1 prefix tokens are supported"
1318
+
1319
+ self.pad_embed = None
1320
+ if config.image_padding_embed:
1321
+ image_dim = v_cfg.image_emb_dim*len(self.config.vit_layers)
1322
+ if config.image_padding_embed in ["pad_embed", "regress"]:
1323
+ self.pad_embed = nn.Parameter(
1324
+ torch.zeros((image_dim,), device=config.init_device))
1325
+ elif config.image_padding_embed == "pad_and_partial_pad":
1326
+ self.pad_embed = nn.Parameter(
1327
+ torch.zeros((2, image_dim), device=config.init_device))
1328
+ else:
1329
+ raise ValueError(config.image_padding_embed)
1330
+
1331
+ def reset_parameters(self):
1332
+ super().reset_parameters()
1333
+ self.image_vit.reset_parameters()
1334
+
1335
+ def encode_image(self, images: torch.Tensor) -> torch.Tensor:
1336
+ """
1337
+ : param images: (batch_size, num_crops, num_patch, n_pixels)
1338
+ """
1339
+ cfg = self.config
1340
+ v_cfg = self.config.vision_backbone
1341
+ B, T, N, D = images.shape
1342
+
1343
+ mask = ~torch.all(images.view(B * T, N, D) == -1, dim=(1, 2), keepdim=True)
1344
+
1345
+ # Output all hidden states
1346
+ # n_layers x (batch_num_crops, (1+)n_tokens, image_emb_dim)
1347
+ images = images.view(B * T, N, D)
1348
+ image_features = self.image_vit(images)
1349
+
1350
+ if cfg.vit_layers is not None:
1351
+ features = []
1352
+ for layer in cfg.vit_layers:
1353
+ features.append(image_features[layer])
1354
+ image_features = torch.cat(features, dim=-1)
1355
+ else:
1356
+ image_features = image_features[-1]
1357
+
1358
+ cls_embed: torch.Tensor = None
1359
+ if self.num_prefix_tokens > 0:
1360
+ cls_embed = image_features[:, 0]
1361
+ image_features = image_features[:, 1:]
1362
+
1363
+ image_features = image_features * mask
1364
+ image_features = image_features.view(B, T, N, -1)
1365
+
1366
+ cls_embed = cls_embed.view(B, T, -1) if cls_embed is not None else None
1367
+
1368
+ return image_features, cls_embed
1369
+
1370
+ def forward(self, images: torch.Tensor, image_masks: torch.Tensor) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
1371
+ cfg = self.config
1372
+
1373
+ # image_features: (batch_size, num_crops(=num_image), num_patch, nximage_emb_dim)
1374
+ batch_size, num_image = images.shape[:2]
1375
+ image_features, cls_embed = self.encode_image(images)
1376
+
1377
+ if cfg.image_padding_embed:
1378
+ assert image_masks is not None
1379
+ if cfg.image_padding_embed == "pad_embed":
1380
+ all_pad = (image_masks == 0).to(dtype=torch.float32)
1381
+ pad_embed = self.pad_embed[None, None, None, :]
1382
+ image_features = image_features + pad_embed * torch.unsqueeze(all_pad, -1)
1383
+ elif cfg.image_padding_embed == "regress":
1384
+ pad_embed = self.pad_embed[None, None, None, :]
1385
+ image_features = image_features + pad_embed * torch.unsqueeze(torch.maximum(image_masks, torch.zeros_like(image_masks)), -1)
1386
+ elif cfg.image_padding_embed == "pad_and_partial_pad":
1387
+ pad_embed = self.pad_embed[:, None, None, None, :]
1388
+ all_pad = image_masks == 0
1389
+ partial_pad = torch.logical_and(image_masks < 1, torch.logical_not(all_pad)).to(dtype=image_features.dtype)
1390
+ all_pad = all_pad.to(dtype=image_features.dtype)
1391
+ image_features = image_features + pad_embed[0] * torch.unsqueeze(all_pad, -1)
1392
+ image_features = image_features + pad_embed[1] * torch.unsqueeze(partial_pad, -1)
1393
+ else:
1394
+ raise ValueError(cfg.image_padding_embed)
1395
+
1396
+ image_features = self.image_feature_dropout(image_features)
1397
+ if cls_embed is not None:
1398
+ cls_embed = self.image_feature_dropout(cls_embed)
1399
+
1400
+ image_features = image_features.reshape(
1401
+ (batch_size, num_image) + cfg.image_num_patch + (-1,),
1402
+ )
1403
+
1404
+ if cfg.image_num_patch[0] % cfg.image_pooling_h == 1:
1405
+ # Pad so we can still pool 2x2 patches
1406
+ image_features = F.pad(
1407
+ image_features,
1408
+ (0, 0, 0, 1, 0, 1, 0, 0, 0, 0),
1409
+ )
1410
+
1411
+ # image pooling
1412
+ image_features = einops.rearrange(
1413
+ image_features,
1414
+ 'b n (h dh) (w dw) c -> (b n h w) (dh dw) c',
1415
+ dh=cfg.image_pooling_h,
1416
+ dw=cfg.image_pooling_w,
1417
+ )
1418
+
1419
+ if cfg.image_pooling_2d == ImagePooling2DType.attention_meanq:
1420
+ query = image_features.mean(-2, keepdim=True)
1421
+ image_features = self.image_pooling_2d(query, image_features)
1422
+ elif cfg.image_pooling_2d not in {ImagePooling2DType.none, ImagePooling2DType.stack}:
1423
+ if self.grad_checkpointing:
1424
+ from torch.utils.checkpoint import checkpoint
1425
+ image_features = checkpoint(self.image_pooling_2d, image_features[:, :1, :], image_features, use_reentrant=False)
1426
+ else:
1427
+ image_features = self.image_pooling_2d(image_features[:, :1, :], image_features)
1428
+
1429
+ h, w = cfg.llm_patches_per_crop()
1430
+ image_features = image_features.reshape(batch_size, num_image, h * w, -1)
1431
+
1432
+ # MLP layer to map the feature.
1433
+ if self.grad_checkpointing:
1434
+ from torch.utils.checkpoint import checkpoint
1435
+ image_features = checkpoint(self.image_projector, image_features, use_reentrant=False)
1436
+ else:
1437
+ image_features = self.image_projector(image_features)
1438
+
1439
+ # image_features: (batch_size, num_image, num_patch, d_model)
1440
+ # cls_embed: (batch_size, num_image, d_model)
1441
+ return image_features, cls_embed
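To make the pooling above concrete: with the 336-pixel crops and 14-pixel patches configured later in this file, each crop is a 24x24 patch grid, and the 2x2 attention pooling reduces it to the 12x12 token grid that image_token_length_w/h in the processor config refer to. A rough arithmetic sketch, assuming those defaults:

# Sketch of the token-count arithmetic for one crop, assuming the defaults
# used elsewhere in this upload (336x336 crops, 14px patches, 2x2 pooling).
crop_size, patch_size = 336, 14
patches_per_side = crop_size // patch_size        # 24
pool_h = pool_w = 2
tokens_per_side = patches_per_side // pool_h      # 12
tokens_per_crop = tokens_per_side ** 2            # 144 pooled tokens per crop
print(patches_per_side, tokens_per_crop)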
1442
+
1443
+
1444
+ class ModuleType(str, Enum):
1445
+ in_module = "in"
1446
+ out_module = "out"
1447
+ emb = "emb"
1448
+ final_out = "final_out"
1449
+
1450
+
1451
+ def init_weights(
1452
+ config: FullMolmoConfig,
1453
+ module: Union[nn.Linear, nn.Embedding],
1454
+ d: Optional[int] = None,
1455
+ layer_id: Optional[int] = None,
1456
+ std_factor: float = 1.0,
1457
+ type_of_module: Optional[ModuleType] = None,
1458
+ ) -> None:
1459
+ d = d if d is not None else config.d_model
1460
+ std = config.init_std * std_factor
1461
+ if config.init_cutoff_factor is not None:
1462
+ cutoff_value = config.init_cutoff_factor * std
1463
+ nn.init.trunc_normal_(module.weight, mean=0.0, std=std, a=-cutoff_value, b=cutoff_value)
1464
+ else:
1465
+ nn.init.normal_(module.weight, mean=0.0, std=std)
1466
+
1467
+
1468
+ class LlamaSwiGLU(nn.Module):
1469
+ def forward(self, x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor:
1470
+ return F.silu(x1) * x2
1471
+
1472
+ @property
1473
+ def output_multiplier(self) -> float:
1474
+ return 0.5
1475
+
1476
+
1477
+ class SwiGLU(nn.Module):
1478
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
1479
+ x, gate = x.chunk(2, dim=-1)
1480
+ return F.silu(gate) * x
1481
+
1482
+ @property
1483
+ def output_multiplier(self) -> float:
1484
+ return 0.5
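output_multiplier = 0.5 records that SwiGLU chunks its input in two along the last axis, so the activation emits half as many features as the MLP's up-projection produces. A standalone check of that behaviour:

# Standalone sketch: SwiGLU halves the last dimension of its input.
import torch
import torch.nn.functional as F

x = torch.randn(2, 8)            # pretend up-projection of width 8
x_half, gate = x.chunk(2, dim=-1)
out = F.silu(gate) * x_half
print(out.shape)                 # torch.Size([2, 4]); hence output_multiplier = 0.5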
1485
+
1486
+
1487
+ class Activation(nn.Module):
1488
+ def __init__(self, config: FullMolmoConfig):
1489
+ super().__init__()
1490
+ self.config = config
1491
+
1492
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
1493
+ raise NotImplementedError
1494
+
1495
+ @property
1496
+ def output_multiplier(self) -> float:
1497
+ raise NotImplementedError
1498
+
1499
+ @classmethod
1500
+ def build(cls, config: FullMolmoConfig) -> 'Activation':
1501
+ if config.activation_type == "quick_gelu":
1502
+ return QuickGELU(config)
1503
+ elif config.activation_type == "gelu":
1504
+ return cast(Activation, GELU(approximate="none"))
1505
+ elif config.activation_type == "gelu_tanh":
1506
+ return cast(Activation, GELU(approximate="tanh"))
1507
+ elif config.activation_type == "relu":
1508
+ return cast(Activation, ReLU(inplace=False))
1509
+ elif config.activation_type == "silu":
1510
+ return cast(Activation, SiLU(inplace=False))
1511
+ # elif config.activation_type == "llama_geglu":
1512
+ # return LlamaGEGLU(config)
1513
+ # elif config.activation_type == "llama_geglu_tanh":
1514
+ # return LlamaGEGLUTanh(config)
1515
+ elif config.activation_type == "llama_swiglu":
1516
+ return LlamaSwiGLU()
1517
+ elif config.activation_type == "swiglu":
1518
+ return SwiGLU()
1519
+ else:
1520
+ raise NotImplementedError(f"Unknown activation: '{config.activation_type}'")
1521
+
1522
+
1523
+ class QuickGELU(Activation):
1524
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
1525
+ return x * torch.sigmoid(1.702 * x)
1526
+
1527
+ @property
1528
+ def output_multiplier(self) -> float:
1529
+ return 1.0
1530
+
1531
+
1532
+ class GELU(nn.GELU):
1533
+ @property
1534
+ def output_multiplier(self) -> float:
1535
+ return 1.0
1536
+
1537
+
1538
+ class ReLU(nn.ReLU):
1539
+ @property
1540
+ def output_multiplier(self) -> float:
1541
+ return 1.0
1542
+
1543
+
1544
+ class SiLU(nn.SiLU):
1545
+ @property
1546
+ def output_multiplier(self) -> float:
1547
+ return 1.0
1548
+
1549
+
1550
+ def causal_attention_bias(seq_len: int, device: torch.device) -> torch.FloatTensor:
1551
+ att_bias = torch.triu(
1552
+ torch.ones(seq_len, seq_len, device=device, dtype=torch.float),
1553
+ diagonal=1,
1554
+ )
1555
+ att_bias.masked_fill_(att_bias == 1, torch.finfo(att_bias.dtype).min)
1556
+ return att_bias.view(1, 1, seq_len, seq_len) # type: ignore
1557
+
1558
+
1559
+ def get_causal_attention_bias(cache: BufferCache, seq_len: int, device: torch.device) -> torch.Tensor:
1560
+ if (causal_bias := cache.get("causal_attention_bias")) is not None and causal_bias.shape[-1] >= seq_len:
1561
+ if causal_bias.device != device:
1562
+ causal_bias = causal_bias.to(device)
1563
+ cache["causal_attention_bias"] = causal_bias
1564
+ return causal_bias
1565
+ with torch.autocast(device.type, enabled=False):
1566
+ causal_bias = causal_attention_bias(seq_len, device)
1567
+ cache["causal_attention_bias"] = causal_bias
1568
+ return causal_bias
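The cached bias is additive: zeros on and below the diagonal, the dtype's most negative value above it, shaped (1, 1, seq_len, seq_len) so it broadcasts over batch and heads. A quick illustration for seq_len=3:

# Quick illustration of the additive causal bias for seq_len=3.
import torch

seq_len = 3
bias = torch.triu(torch.ones(seq_len, seq_len), diagonal=1)
bias.masked_fill_(bias == 1, torch.finfo(bias.dtype).min)
print(bias)                                     # 0 on/below the diagonal, ~-3.4e38 above it
print(bias.view(1, 1, seq_len, seq_len).shape)  # broadcastable over batch and heads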
1569
+
1570
+
1571
+ class LayerNormBase(nn.Module):
1572
+ def __init__(
1573
+ self,
1574
+ config: MolmoConfig,
1575
+ *,
1576
+ size: Optional[int] = None,
1577
+ elementwise_affine: Optional[bool] = True,
1578
+ eps: float = 1e-05,
1579
+ weight_initializer: Optional[Callable] = torch.ones,
1580
+ bias_initializer: Optional[Callable] = torch.zeros,
1581
+ ):
1582
+ super().__init__()
1583
+ self.config = config
1584
+ self.eps = self.config.layer_norm_eps or eps
1585
+ self.normalized_shape = (size or config.d_model,)
1586
+ if elementwise_affine or (elementwise_affine is None and self.config.layer_norm_with_affine):
1587
+ self.weight = nn.Parameter(weight_initializer(self.normalized_shape, device=config.init_device))
1588
+ use_bias = self.config.bias_for_layer_norm
1589
+ if use_bias is None:
1590
+ use_bias = self.config.include_bias
1591
+ if use_bias:
1592
+ self.bias = nn.Parameter(bias_initializer(self.normalized_shape, device=config.init_device))
1593
+ else:
1594
+ self.register_parameter("bias", None)
1595
+ else:
1596
+ self.register_parameter("bias", None)
1597
+ self.register_parameter("weight", None)
1598
+
1599
+ @classmethod
1600
+ def build(cls, config: FullMolmoConfig, size: Optional[int] = None, **kwargs):
1601
+ if config.layer_norm_type == "default":
1602
+ return LayerNorm(config, size=size, low_precision=False, **kwargs)
1603
+ elif config.layer_norm_type == "low_precision":
1604
+ return LayerNorm(config, size=size, low_precision=True, **kwargs)
1605
+ elif config.layer_norm_type == "rms":
1606
+ return RMSLayerNorm(config, size=size, **kwargs)
1607
+ else:
1608
+ raise NotImplementedError(f"Unknown LayerNorm type: '{config.layer_norm_type}'")
1609
+
1610
+
1611
+ class RMSLayerNorm(LayerNormBase):
1612
+ """
1613
+ RMS layer norm, a simplified :class:`LayerNorm` implementation
1614
+ """
1615
+
1616
+ def __init__(
1617
+ self,
1618
+ config: FullMolmoConfig,
1619
+ size: Optional[int] = None,
1620
+ elementwise_affine: Optional[bool] = None,
1621
+ eps: float = 1e-5,
1622
+ ):
1623
+ super().__init__(config, size=size, elementwise_affine=elementwise_affine, eps=eps)
1624
+
1625
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
1626
+ with torch.autocast(enabled=False, device_type=x.device.type):
1627
+ og_dtype = x.dtype
1628
+ x = x.to(torch.float32)
1629
+ variance = x.pow(2).mean(-1, keepdim=True)
1630
+ x = x * torch.rsqrt(variance + self.eps)
1631
+ x = x.to(og_dtype)
1632
+
1633
+ if self.weight is not None:
1634
+ if self.bias is not None:
1635
+ return self.weight * x + self.bias
1636
+ else:
1637
+ return self.weight * x
1638
+ else:
1639
+ return x
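As a sanity check, the autocast-disabled math above is the usual RMSNorm, x / sqrt(mean(x**2) + eps), optionally scaled by the affine weight; a minimal numeric comparison in plain tensor ops:

# Minimal numeric check of the RMSNorm math used above (no config plumbing).
import torch

x = torch.randn(4, 16)
eps = 1e-5
weight = torch.ones(16)

as_implemented = weight * (x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps))
textbook = weight * x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + eps)
print(torch.allclose(as_implemented, textbook))   # True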
1640
+
1641
+
1642
+ class LayerNorm(LayerNormBase):
1643
+ """
1644
+ The default :class:`LayerNorm` implementation which can optionally run in low precision.
1645
+ """
1646
+
1647
+ def __init__(
1648
+ self,
1649
+ config: FullMolmoConfig,
1650
+ size: Optional[int] = None,
1651
+ low_precision: bool = False,
1652
+ elementwise_affine: Optional[bool] = None,
1653
+ eps: float = 1e-05,
1654
+ ):
1655
+ super().__init__(config, size=size, elementwise_affine=elementwise_affine, eps=eps)
1656
+ self.low_precision = low_precision
1657
+
1658
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
1659
+ if self.low_precision:
1660
+ module_device = x.device
1661
+ downcast_x = self._cast_if_autocast_enabled(x)
1662
+ downcast_weight = (
1663
+ self._cast_if_autocast_enabled(self.weight) if self.weight is not None else self.weight
1664
+ )
1665
+ downcast_bias = self._cast_if_autocast_enabled(self.bias) if self.bias is not None else self.bias
1666
+ with torch.autocast(enabled=False, device_type=module_device.type):
1667
+ return F.layer_norm(
1668
+ downcast_x, self.normalized_shape, weight=downcast_weight, bias=downcast_bias, eps=self.eps
1669
+ )
1670
+ else:
1671
+ return F.layer_norm(x, self.normalized_shape, weight=self.weight, bias=self.bias, eps=self.eps)
1672
+
1673
+
1674
+ class Molmo(nn.Module):
1675
+ def __init__(self, config: FullMolmoConfig, init_params: bool = True):
1676
+ super().__init__()
1677
+ self.config = config
1678
+ self.__cache = BufferCache()
1679
+
1680
+ # Validate config.
1681
+ if self.config.embedding_size is not None and self.config.embedding_size != self.config.vocab_size:
1682
+ if self.config.embedding_size < self.config.vocab_size:
1683
+ raise MolmoConfigurationError("embedding size should be at least as big as vocab size")
1684
+ elif self.config.embedding_size % 128 != 0:
1685
+ import warnings
1686
+
1687
+ warnings.warn(
1688
+ "Embedding size is not a multiple of 128! This could hurt throughput performance.", UserWarning
1689
+ )
1690
+ torch.backends.cuda.enable_flash_sdp(True)
1691
+ torch.backends.cuda.enable_mem_efficient_sdp(False) # this is super slow so make sure torch won't use it
1692
+
1693
+ wte = None
1694
+ if self.config.additional_vocab_size is not None:
1695
+ wte = Embedding(
1696
+ config.embedding_size or config.vocab_size,
1697
+ config.additional_vocab_size,
1698
+ config.d_model,
1699
+ device=config.init_device,
1700
+ initializer_range=config.initializer_range,
1701
+ new_embed_initializer_range=config.new_embedding_init_range
1702
+ )
1703
+ else:
1704
+ wte=nn.Embedding(
1705
+ config.embedding_size or config.vocab_size, config.d_model, device=config.init_device
1706
+ )
1707
+
1708
+ self.transformer = nn.ModuleDict(
1709
+ dict(
1710
+ wte=wte,
1711
+ emb_drop=Dropout(config.embedding_dropout),
1712
+ ln_f=LayerNorm.build(config),
1713
+ )
1714
+ )
1715
+
1716
+ blocks = [MolmoBlock.build(i, config, self.__cache) for i in range(config.n_layers)]
1717
+ if self.config.block_group_size > 1:
1718
+ raise NotImplementedError()
1719
+ else:
1720
+ self.transformer.update({"blocks": nn.ModuleList(blocks)})
1721
+
1722
+ if not self.config.rope:
1723
+ self.transformer.update(
1724
+ {"wpe": nn.Embedding(config.max_sequence_length, config.d_model, device=config.init_device)}
1725
+ )
1726
+ if not config.weight_tying:
1727
+ self.transformer.update(
1728
+ {
1729
+ "ff_out": nn.Linear(
1730
+ config.d_model,
1731
+ config.embedding_size or config.vocab_size,
1732
+ bias=config.include_bias,
1733
+ device=config.init_device,
1734
+ )
1735
+ }
1736
+ )
1737
+
1738
+ self.vision_backbone: Optional[OLMoVisionBackbone] = None
1739
+ if config.vision_backbone is not None:
1740
+ self.vision_backbone = OLMoPretrainedVisionBackbone(config)
1741
+
1742
+ self.__num_fwd_flops: Optional[int] = None
1743
+
1744
+ def reset_parameters(self):
1745
+ if self.vision_backbone is not None:
1746
+ self.vision_backbone.reset_parameters()
1747
+ self.reset_non_vision_parameters()
1748
+
1749
+ def reset_non_vision_parameters(self):
1750
+ self.transformer.wte.reset_parameters()
1751
+ if hasattr(self.transformer.wte, "new_embedding"):
1752
+ nn.init.normal_(self.transformer.wte.new_embedding, std=self.config.new_embedding_init_range)
1753
+
1754
+ if hasattr(self.transformer, "wpe"):
1755
+ nn.init.normal_(self.transformer.wpe.weight, mean=0.0, std=1.0)
1756
+
1757
+ self.transformer.ln_f.reset_parameters() # type: ignore
1758
+
1759
+ if hasattr(self.transformer, "ff_out"):
1760
+ nn.init.normal_(self.transformer.ff_out.weight, mean=0.0, std=0.02)
1761
+
1762
+ if self.config.block_group_size == 1:
1763
+ for block in self.transformer.blocks:
1764
+ block.reset_parameters()
1765
+ else:
1766
+ for block_group in self.transformer.block_groups:
1767
+ block_group.reset_parameters()
1768
+
1769
+
1770
+ def forward(
1771
+ self,
1772
+ input_ids: torch.LongTensor,
1773
+ input_embeddings: Optional[torch.FloatTensor] = None,
1774
+ attention_mask: Optional[torch.Tensor] = None,
1775
+ attention_bias: Optional[torch.Tensor] = None,
1776
+ response_mask: Optional[torch.Tensor] = None,
1777
+ images: Optional[torch.Tensor] = None,
1778
+ image_masks: Optional[torch.Tensor] = None,
1779
+ image_input_idx: Optional[torch.Tensor] = None,
1780
+ subsegment_ids: Optional[torch.Tensor] = None,
1781
+ position_ids: Optional[torch.Tensor] = None,
1782
+ past_key_values: Optional[Sequence[Tuple[torch.Tensor, torch.Tensor]]] = None,
1783
+ use_cache: bool = False,
1784
+ last_logits_only: bool = False,
1785
+ output_hidden_states: Optional[bool] = None,
1786
+ append_last_valid_logits: Optional[torch.Tensor] = None,
1787
+ ) -> ModelOutput:
1788
+ """
1789
+ :param input_ids: A tensor of shape `(batch_size, seq_len)`.
1790
+ :param input_embeddings: A tensor of shape `(batch_size, seq_len, d_model)` with input
1791
+ embeddings. When provided, it is treated as the output of the input embedding layer.
1792
+ :param attention_mask: A tensor of shape `(batch_size, seq_len)` that indicates
1793
+ which input IDs are masked. A `1` value in the mask means that
1794
+ the corresponding input ID should *not* be ignored. A `0` means
1795
+ that the corresponding input ID is masked.
1796
+
1797
+ This has the same meaning as the `attention_mask` in HuggingFace's `transformers`
1798
+ library.
1799
+ :param attention_bias: A tensor of shape `(batch_size, 1, seq_len, seq_len)`,
1800
+ `(1, 1, seq_len, seq_len)`, or `(seq_len, seq_len)`. This is used
1801
+ to introduce causal or other biases.
1802
+
1803
+ If the tensor is a bool or byte tensor, a `True` or `1` at `attention_bias[:, :, i, j]`
1804
+ indicates that the i-th element in the sequence is allowed to attend to the j-th
1805
+ element in the sequence.
1806
+
1807
+ If the tensor is a float tensor, it will just be added to the attention
1808
+ scores before the softmax.
1809
+
1810
+ The default is causal, which corresponds to a lower-diagonal byte matrix of ones.
1811
+ :param response_mask: A tensor of shape `(batch_size, seq_len)` that indicates
1812
+ the response mask. A `1` value in the mask means that the corresponding token
1813
+ is a response token. A `0` means that the corresponding token is not
1814
+ a response token.
1815
+ :param past_key_values: Pre-computed keys and values for each attention block.
1816
+ Can be used to speed up sequential decoding. The `input_ids` which have
1817
+ their past given to this model should not be passed as `input_ids` as they have already been computed.
1818
+ :param use_cache: If `True`, return key and value tensors for each block.
1819
+ :param last_logits_only: If `True`, only compute the logits for the last token of each sequence.
1820
+ This can speed up decoding when you only care about the next token.
1821
+ """
1822
+ output_hidden_states = output_hidden_states if output_hidden_states is not None else False
1823
+
1824
+ if past_key_values:
1825
+ assert len(past_key_values) == self.config.n_layers
1826
+
1827
+ has_image = images is not None
1828
+
1829
+ assert not (has_image and input_embeddings is not None), "Cannot provide both images and input embeddings."
1830
+ assert not (has_image and past_key_values is not None), "Cached key and values should not be used with images."
1831
+
1832
+ batch_size, seq_len = input_ids.size() if input_embeddings is None else input_embeddings.size()[:2]
1833
+ if past_key_values is None:
1834
+ past_length = 0
1835
+ else:
1836
+ past_length = past_key_values[0][0].size(-2)
1837
+
1838
+ if self.config.use_position_ids and attention_mask is None:
1839
+ attention_mask = input_ids != -1
1840
+
1841
+ if subsegment_ids is not None:
1842
+ assert not use_cache, "Subsegment_ids cannot be used with cache."
1843
+ subsegment_mask = subsegment_ids.unsqueeze(2) <= subsegment_ids.unsqueeze(1)
1844
+ attention_mask = (
1845
+ subsegment_mask.to(attention_mask.dtype) *
1846
+ attention_mask.unsqueeze(2) *
1847
+ attention_mask.unsqueeze(1))
1848
+ if position_ids is None:
1849
+ raise ValueError(f"Positioned ids must be given if using subsegment_ids")
1850
+ else:
1851
+ if self.config.use_position_ids and position_ids is None:
1852
+ position_ids = torch.clamp(
1853
+ torch.cumsum(attention_mask.to(torch.int32), dim=-1) - 1,
1854
+ min=0,
1855
+ ).broadcast_to((batch_size, attention_mask.shape[-1]))
1856
+
1857
+ # Get embeddings of input.
1858
+ # shape: (batch_size, seq_len, d_model)
1859
+ if input_ids is not None:
1860
+ input_ids = input_ids * (input_ids != -1).to(input_ids.dtype)
1861
+ x = self.transformer.wte(input_ids) if input_embeddings is None else input_embeddings # type: ignore
1862
+
1863
+ num_image: Optional[int] = None
1864
+ if images is not None:
1865
+ # shape: (batch_size, num_image, num_patch, d_model)
1866
+ # cls_embed: (batch_size, num_image, d_model)
1867
+ image_features, cls_embed = self.vision_backbone(images, image_masks)
1868
+ num_image, num_patch = image_features.shape[1:3]
1869
+ assert image_input_idx.shape == (batch_size, num_image, num_patch)
1870
+
1871
+ # Insert the image features into the input embeddings.
1872
+ image_features = image_features.view(batch_size, num_image * num_patch, -1)
1873
+ image_input_idx = image_input_idx.view(batch_size, num_image * num_patch)
1874
+
1875
+ valid = image_input_idx >= 0
1876
+ batch_idx = torch.arange(batch_size, device=x.device)
1877
+ batch_idx = torch.tile(batch_idx[:, None], [1, image_features.shape[1]])
1878
+
1879
+ # For hf demo/endpoint
1880
+ image_features = image_features.to(x.device)
1881
+
1882
+ x[batch_idx[valid], image_input_idx[valid]] += image_features[valid]
1883
+
1884
+ if not self.config.rope:
1885
+ # Get positional embeddings.
1886
+ # shape: (1, seq_len)
1887
+ pos = torch.arange(past_length, past_length + seq_len, dtype=torch.long, device=x.device).unsqueeze(0)
1888
+ # shape: (1, seq_len, d_model)
1889
+ pos_emb = self.transformer.wpe(pos) # type: ignore
1890
+ x = pos_emb + x
1891
+
1892
+ # Add input + positional embeddings and apply dropout.
1893
+ # shape: (batch_size, seq_len, d_model)
1894
+ x = self.transformer.emb_drop(x) # type: ignore
1895
+
1896
+ # normalized
1897
+ if self.config.normalize_input_embeds:
1898
+ x = x * (self.config.d_model ** 0.5)
1899
+
1900
+ # Transform the attention mask into what the blocks expect.
1901
+ if attention_mask is not None:
1902
+ # shape: (batch_size, 1, 1, seq_len)
1903
+ if len(attention_mask.shape) == 2:
1904
+ attention_mask = attention_mask[:, :past_length + seq_len]
1905
+ attention_mask = attention_mask.to(dtype=torch.float).view(batch_size, -1)[:, None, None, :]
1906
+ else:
1907
+ attention_mask = attention_mask.unsqueeze(1).to(dtype=torch.float)
1908
+ attention_mask = (1.0 - attention_mask) * torch.finfo(attention_mask.dtype).min
1909
+
1910
+ # Merge attention mask with attention bias.
1911
+ if (
1912
+ attention_bias is not None
1913
+ or attention_mask is not None
1914
+ # NOTE (epwalsh): we need to initialize the attn bias in order for attn to work properly
1915
+ # with key+value cache. Otherwise `F.scaled_dot_product_attention()` doesn't seem to compute
1916
+ # scores correctly.
1917
+ or past_key_values is not None
1918
+ ):
1919
+ if attention_bias is None:
1920
+ attention_bias = get_causal_attention_bias(self.__cache, past_length + seq_len, x.device)
1921
+ elif attention_bias.dtype in (torch.int8, torch.bool):
1922
+ attention_bias = attention_bias.to(dtype=torch.float)
1923
+ attention_bias.masked_fill_(attention_bias == 0.0, torch.finfo(attention_bias.dtype).min)
1924
+
1925
+ # Transform to the right shape and data type.
1926
+ mask_len = seq_len
1927
+ if attention_mask is not None:
1928
+ mask_len = attention_mask.shape[-1]
1929
+ elif past_key_values is not None:
1930
+ mask_len = past_key_values[0][0].shape[-2] + seq_len
1931
+ attention_bias = attention_bias[:, :, :mask_len, :mask_len].to(dtype=torch.float)
1932
+
1933
+ # Add in the masking bias.
1934
+ if attention_mask is not None:
1935
+ attention_bias = attention_bias + attention_mask
1936
+ # Might get -infs after adding attention mask, since dtype.min + dtype.min = -inf.
1937
+ # `F.scaled_dot_product_attention()` doesn't handle -inf like you'd expect, instead
1938
+ # it can produce NaNs.
1939
+ ensure_finite_(attention_bias, check_neg_inf=True, check_pos_inf=False)
1940
+
1941
+ attn_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = [] if use_cache else None
1942
+
1943
+ # decoder layers
1944
+ all_hidden_states = []
1945
+
1946
+ # Apply blocks one-by-one.
1947
+ if self.config.block_group_size == 1:
1948
+ for block_idx, block in enumerate(self.transformer.blocks):
1949
+ if output_hidden_states:
1950
+ # add hidden states
1951
+ all_hidden_states.append(x)
1952
+
1953
+ layer_past = None if past_key_values is None else past_key_values[block_idx]
1954
+ x, cache = block(x, attention_bias=attention_bias, position_ids=position_ids, layer_past=layer_past, use_cache=use_cache)
1955
+
1956
+ if attn_key_values is not None:
1957
+ assert cache is not None
1958
+ attn_key_values.append(cache)
1959
+ else:
1960
+ for group_idx, block_group in enumerate(self.transformer.block_groups):
1961
+ if output_hidden_states:
1962
+ # add hidden states
1963
+ all_hidden_states.append(x)
1964
+
1965
+ layers_past = (
1966
+ None
1967
+ if past_key_values is None
1968
+ else past_key_values[
1969
+ group_idx * self.config.block_group_size : (group_idx + 1) * self.config.block_group_size
1970
+ ]
1971
+ )
1972
+ x, cache = block_group(
1973
+ x, attention_bias=attention_bias, position_ids=position_ids, layers_past=layers_past, use_cache=use_cache
1974
+ )
1975
+ if attn_key_values is not None:
1976
+ assert cache is not None
1977
+ attn_key_values.extend(cache)
1978
+
1979
+ if last_logits_only:
1980
+ # shape: (batch_size, 1, d_model)
1981
+ if append_last_valid_logits is not None:
1982
+ last_valid_output = x[
1983
+ torch.arange(x.shape[0], device=x.device), append_last_valid_logits.to(x.device)]
1984
+ x = last_valid_output.unsqueeze(1)
1985
+ else:
1986
+ x = x[:, -1, :].unsqueeze(1)
1987
+
1988
+ # Apply final layer norm.
1989
+ # shape: (batch_size, seq_len or 1, d_model)
1990
+ x = self.transformer.ln_f(x) # type: ignore
1991
+ if output_hidden_states:
1992
+ # add final hidden state post-final-layernorm, following HuggingFace's convention
1993
+ all_hidden_states.append(x)
1994
+
1995
+ # Get logits.
1996
+ # shape: (batch_size, seq_len or 1, vocab_size)
1997
+ if self.config.weight_tying:
1998
+ logits = F.linear(x, self.transformer.wte.weight, None) # type: ignore
1999
+ else:
2000
+ logits = self.transformer.ff_out(x) # type: ignore
2001
+ if self.config.scale_logits:
2002
+ logits.mul_(1 / math.sqrt(self.config.d_model))
2003
+
2004
+ if not last_logits_only and append_last_valid_logits is not None:
2005
+ last_valid_logit = logits[
2006
+ torch.arange(logits.shape[0], device=logits.device), append_last_valid_logits]
2007
+ logits = torch.cat([logits[:, :-1], last_valid_logit[:, None]], dim=1)
2008
+
2009
+ return ModelOutput(logits=logits, attn_key_values=attn_key_values, hidden_states=tuple(all_hidden_states) if output_hidden_states else None) # type: ignore[arg-type]
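The scatter above, x[batch_idx[valid], image_input_idx[valid]] += image_features[valid], is what splices the projected image patches into the token embedding sequence: image_input_idx holds, for every pooled patch, the position of its placeholder token, with -1 marking unused slots. A tiny self-contained sketch of that indexing, with made-up shapes:

# Tiny sketch of how image features are scattered into the token embeddings.
# Shapes and indices are made up for illustration; real ones come from the processor.
import torch

batch_size, seq_len, d_model = 1, 10, 4
num_patches = 3

x = torch.zeros(batch_size, seq_len, d_model)               # token embeddings
image_features = torch.randn(batch_size, num_patches, d_model)
image_input_idx = torch.tensor([[2, 3, -1]])                 # -1 = unused slot

valid = image_input_idx >= 0
batch_idx = torch.arange(batch_size)[:, None].expand_as(image_input_idx)
x[batch_idx[valid], image_input_idx[valid]] += image_features[valid]

print(x[0, 2], x[0, 3])    # now hold the first two patch features
print(x[0, 4])             # untouched positions stay zero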
2010
+
2011
+
2012
+ class MolmoForCausalLM(PreTrainedModel):
2013
+ config_class = MolmoConfig
2014
+ base_model_prefix = "model"
2015
+ _no_split_modules = ["MolmoBlock"]
2016
+
2017
+ def __init__(self, config: MolmoConfig, model: Optional[Molmo] = None, init_params: bool = False):
2018
+ super().__init__(config)
2019
+
2020
+ if not model:
2021
+ full_config = FullMolmoConfig(
2022
+ image_padding_embed="pad_and_partial_pad",
2023
+ image_pooling_2d="attention-meanq",
2024
+ attention_layer_norm=config.attention_layer_norm,
2025
+ rope_impl="llama",
2026
+ vocab_size=config.vocab_size,
2027
+ max_sequence_length=config.max_position_embeddings,
2028
+ qkv_bias=config.qkv_bias,
2029
+ norm_after=config.norm_after,
2030
+ embedding_size=config.embedding_size,
2031
+ attention_type="sdpa",
2032
+ embedding_dropout=0,
2033
+ attention_dropout=0,
2034
+ residual_dropout=0,
2035
+ rope=True,
2036
+ weight_tying=False,
2037
+ include_bias=False,
2038
+ d_model=config.hidden_size,
2039
+ mlp_hidden_size=config.intermediate_size,
2040
+ n_layers=config.num_hidden_layers,
2041
+ additional_vocab_size=128,
2042
+ n_heads=config.num_attention_heads,
2043
+ n_kv_heads=config.num_key_value_heads,
2044
+ rope_theta=config.rope_theta,
2045
+ layer_norm_eps=config.layer_norm_eps,
2046
+ layer_norm_type=config.layer_norm_type,
2047
+ vit_layers=[-2, -9],
2048
+ vision_backbone=VisionBackboneConfig(
2049
+ image_default_input_size=(336, 336),
2050
+ image_patch_size=14,
2051
+ image_pos_patch_size=14,
2052
+ image_emb_dim=1024,
2053
+ image_num_heads=16,
2054
+ image_num_key_value_heads=16,
2055
+ image_num_layers=23,
2056
+ image_head_dim=64,
2057
+ image_mlp_dim=4096,
2058
+ image_mlp_activations="quick_gelu",
2059
+ image_dropout_rate=0.0,
2060
+ image_num_pos=577,
2061
+ image_norm_eps=1e-5,
2062
+ attention_dropout=0.0,
2063
+ residual_dropout=0.0,
2064
+ initializer_range=0.02,
2065
+ )
2066
+ )
2067
+ self.model = Molmo(full_config, init_params=init_params)
2068
+ else:
2069
+ self.model = model
2070
+
2071
+
2072
+ def forward(
2073
+ self,
2074
+ input_ids: torch.LongTensor = None,
2075
+ inputs_embeds: Optional[torch.FloatTensor] = None,
2076
+ attention_mask: Optional[torch.Tensor] = None,
2077
+ attention_bias: Optional[torch.Tensor] = None,
2078
+ response_mask: Optional[torch.Tensor] = None,
2079
+ images: Optional[torch.Tensor] = None,
2080
+ image_masks: Optional[torch.Tensor] = None,
2081
+ image_input_idx: Optional[torch.Tensor] = None,
2082
+ subsegment_ids: Optional[torch.Tensor] = None,
2083
+ position_ids: Optional[torch.Tensor] = None,
2084
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
2085
+ labels: Optional[torch.LongTensor] = None,
2086
+ loss_masks: Optional[torch.Tensor] = None,
2087
+ use_cache: Optional[bool] = None,
2088
+ last_logits_only: Optional[bool] = None,
2089
+ output_attentions: Optional[bool] = None,
2090
+ output_hidden_states: Optional[bool] = None,
2091
+ append_last_valid_logits: Optional[torch.Tensor] = None,
2092
+ return_dict: Optional[bool] = None,
2093
+ cache_position: Optional[
2094
+ Cache
2095
+ ] = None, # This is a hack mitigation of an issue in transformers `4.39.x` https://github.com/huggingface/transformers/issues/29426
2096
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
2097
+ if use_cache is None:
2098
+ use_cache = self.config.use_cache
2099
+
2100
+ if output_attentions:
2101
+ raise ValueError("output_attentions is not yet supported in Molmo")
2102
+
2103
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
2104
+
2105
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
2106
+ outputs = self.model.forward(
2107
+ input_ids=input_ids,
2108
+ input_embeddings=inputs_embeds,
2109
+ attention_mask=attention_mask,
2110
+ attention_bias=attention_bias,
2111
+ response_mask=response_mask,
2112
+ images=images,
2113
+ image_masks=image_masks,
2114
+ image_input_idx=image_input_idx,
2115
+ subsegment_ids=subsegment_ids,
2116
+ position_ids=position_ids,
2117
+ past_key_values=past_key_values,
2118
+ use_cache=use_cache,
2119
+ last_logits_only=last_logits_only,
2120
+ output_hidden_states=output_hidden_states,
2121
+ append_last_valid_logits=append_last_valid_logits,
2122
+ )
2123
+
2124
+ logits = outputs.logits
2125
+ hidden_states = outputs.hidden_states
2126
+
2127
+ loss = None
2128
+ if labels is not None:
2129
+ if loss_masks is not None:
2130
+ loss_masks = loss_masks * (loss_masks > 0)
2131
+ batch_size_in_tokens = max(loss_masks.sum().item(), 1)
2132
+ labels = labels.long()
2133
+ labels.masked_fill_(~(loss_masks > 0), -100)
2134
+ labels = labels.view(-1)
2135
+ logits_for_loss = logits.to(torch.float32).view(-1, logits.size(-1))
2136
+ loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100, reduction='none')
2137
+ loss = loss_fct(logits_for_loss, labels)
2138
+ loss = loss.view(input_ids.shape[0], -1)
2139
+ loss = loss * loss_masks
2140
+ loss = loss.sum() / batch_size_in_tokens
2141
+ use_zloss = getattr(self.config, "softmax_auxiliary_loss", False)
2142
+ if use_zloss:
2143
+ z_squared = logits_for_loss.logsumexp(-1).pow(2)
2144
+ z_loss = self.config.softmax_auxiliary_loss_scale * z_squared
2145
+ z_loss = z_loss.view(input_ids.shape[0], -1)
2146
+ z_loss = z_loss * loss_masks
2147
+ z_loss = z_loss.sum() / batch_size_in_tokens
2148
+ loss += z_loss
2149
+ else:
2150
+ # Shift so that tokens < n predict n
2151
+ shift_logits = logits[..., :-1, :].contiguous()
2152
+ shift_labels = labels[..., 1:].contiguous()
2153
+ # Flatten the tokens
2154
+ loss_fct = torch.nn.CrossEntropyLoss()
2155
+ shift_logits = shift_logits.view(-1, self.config.embedding_size)
2156
+ shift_labels = shift_labels.view(-1)
2157
+ # Enable model parallelism
2158
+ shift_labels = shift_labels.to(shift_logits.device)
2159
+ loss = loss_fct(shift_logits, shift_labels)
2160
+
2161
+ if not return_dict:
2162
+ output = (logits,) + outputs[1:]
2163
+ return (loss,) + output if loss is not None else output
2164
+
2165
+ return CausalLMOutputWithPast(
2166
+ loss=loss,
2167
+ logits=logits,
2168
+ past_key_values=outputs.attn_key_values,
2169
+ hidden_states=hidden_states,
2170
+ )
2171
+
2172
+ def can_generate(self) -> bool:
2173
+ return True
2174
+
2175
+ @torch.no_grad()
2176
+ def generate_from_batch(
2177
+ self,
2178
+ batch: Dict[str, Any],
2179
+ generation_config: Optional[GenerationConfig] = None,
2180
+ **kwargs,
2181
+ ):
2182
+ if generation_config is not None:
2183
+ assert generation_config.use_cache
2184
+
2185
+ images = batch.get("images")
2186
+ image_masks = batch.get("image_masks")
2187
+ image_input_idx = batch.get("image_input_idx")
2188
+
2189
+ # Validate inputs.
2190
+ input_ids = batch["input_ids"]
2191
+ batch_size, seq_len = input_ids.shape
2192
+ attention_mask = batch.get("attention_mask", None)
2193
+ max_new_tokens = generation_config.max_new_tokens
2194
+ assert max_new_tokens is not None
2195
+ mask_len = seq_len + max_new_tokens if self.config.use_position_ids else seq_len
2196
+ position_ids: Optional[torch.Tensor] = None
2197
+ append_last_valid_logits: Optional[torch.Tensor] = None
2198
+ if self.config.use_position_ids and attention_mask is None:
2199
+ attention_mask = input_ids != -1
2200
+ position_ids = torch.clamp(
2201
+ torch.cumsum(attention_mask.to(torch.int32), dim=-1) - 1,
2202
+ min=0
2203
+ )
2204
+ append_last_valid_logits = attention_mask.long().sum(dim=-1) - 1
2205
+ attention_mask = torch.cat(
2206
+ [attention_mask, attention_mask.new_ones((batch_size, max_new_tokens))],
2207
+ dim=1,
2208
+ )
2209
+ if attention_mask is not None:
2210
+ assert attention_mask.shape == (batch_size, mask_len)
2211
+
2212
+ out = super().generate(
2213
+ batch["input_ids"],
2214
+ generation_config,
2215
+ attention_mask=attention_mask,
2216
+ images=images,
2217
+ image_masks=image_masks,
2218
+ image_input_idx=image_input_idx,
2219
+ position_ids=position_ids,
2220
+ append_last_valid_logits=append_last_valid_logits,
2221
+ **kwargs,
2222
+ )
2223
+
2224
+ return out
2225
+
2226
+ def prepare_inputs_for_generation(
2227
+ self, input_ids: torch.LongTensor, past_key_values: Optional[List[Tuple]] = None, **kwargs
2228
+ ):
2229
+ if past_key_values:
2230
+ # This is because we want the model to only process the last generated token.
2231
+ input_ids = input_ids[:, -1:]
2232
+
2233
+ if self.config.use_position_ids:
2234
+ attention_mask = kwargs.get("attention_mask")
2235
+ images = kwargs.get("images")
2236
+ image_masks = kwargs.get("image_masks")
2237
+ image_input_idx = kwargs.get("image_input_idx")
2238
+ position_ids = kwargs.get("position_ids")
2239
+ append_last_valid_logits = kwargs.get("append_last_valid_logits")
2240
+ model_inputs = {
2241
+ "input_ids": input_ids,
2242
+ "attention_mask": attention_mask,
2243
+ "position_ids": position_ids,
2244
+ "past_key_values": past_key_values,
2245
+ "use_cache": True,
2246
+ "last_logits_only": True,
2247
+ }
2248
+ if past_key_values is None:
2249
+ model_inputs["images"] = images
2250
+ model_inputs["image_masks"] = image_masks
2251
+ model_inputs["image_input_idx"] = image_input_idx
2252
+ model_inputs["append_last_valid_logits"] = append_last_valid_logits
2253
+ else:
2254
+ model_inputs = {"input_ids": input_ids, "past_key_values": past_key_values}
2255
+
2256
+ model_inputs.update(kwargs)
2257
+ model_inputs["use_cache"] = kwargs.pop("use_cache", self.config.use_cache)
2258
+ return model_inputs
2259
+
2260
+ def _update_model_kwargs_for_generation(
2261
+ self,
2262
+ outputs: ModelOutput,
2263
+ model_kwargs: Dict[str, Any],
2264
+ is_encoder_decoder: bool = False,
2265
+ num_new_tokens: int = 1,
2266
+ ) -> Dict[str, Any]:
2267
+ if self.config.use_position_ids:
2268
+ model_kwargs["position_ids"] = model_kwargs["position_ids"][:, -1:] + 1
2269
+ if "append_last_valid_logits" in model_kwargs:
2270
+ del model_kwargs["append_last_valid_logits"]
2271
+ if "images" in model_kwargs:
2272
+ del model_kwargs["images"]
2273
+ del model_kwargs["image_masks"]
2274
+ del model_kwargs["image_input_idx"]
2275
+ cache_name, cache = super()._extract_past_from_model_output(outputs)
2276
+ model_kwargs[cache_name] = cache
2277
+ model_kwargs["cache_position"] = model_kwargs["cache_position"][-1:] + num_new_tokens
2278
+ return model_kwargs
2279
+
2280
+ def get_input_embeddings(self) -> torch.nn.Module:
2281
+ return self.model.transformer.wte
2282
+
2283
+ def set_input_embeddings(self, value: torch.nn.Module):
2284
+ self.model.transformer.wte = value
2285
+
2286
+ def get_output_embeddings(self):
2287
+ if self.config.weight_tying:
2288
+ return self.model.transformer.wte
2289
+ else:
2290
+ return self.model.transformer.ff_out
2291
+
2292
+ def set_output_embeddings(self, value: torch.nn.Module):
2293
+ if self.config.weight_tying:
2294
+ self.model.transformer.wte = value
2295
+ else:
2296
+ self.model.transformer.ff_out = value
2297
+
2298
+ def tie_weights(self):
2299
+ """
2300
+ This function is intentionally left as a no-op.
2301
+
2302
+ Weight tying is handled as follows:
2303
+ - When the model is initialized, the `ff_out` layer is conditionally defined based on the `weight_tying` configuration.
2304
+ See: `if not config.weight_tying: self.transformer.update(...)` in `olmo/model.py`.
2305
+ - When computing logits, the `wte` weights are used directly if `weight_tying` is enabled.
2306
+ See: `if self.config.weight_tying: logits = F.linear(x, self.transformer.wte.weight, None)` in the `forward` method.
2307
+
2308
+ Therefore, there is no need to explicitly tie the weights in this function.
2309
+ """
2310
+ pass
2311
+
2312
+ def resize_token_embeddings(
2313
+ self, new_num_tokens: Optional[int] = None, pad_to_multiple_of: Optional[int] = None
2314
+ ) -> torch.nn.Embedding:
2315
+ """
2316
+ Resizes input token embeddings matrix of the model if `new_num_tokens != config.embedding_size`.
2317
+
2318
+ Takes care of tying weights embeddings afterwards if the model class has a `tie_weights()` method.
2319
+
2320
+ Arguments:
2321
+ new_num_tokens (`int`, *optional*):
2322
+ The new number of tokens in the embedding matrix. Increasing the size will add newly initialized
2323
+ vectors at the end. Reducing the size will remove vectors from the end. If not provided or `None`, just
2324
+ returns a pointer to the input tokens `torch.nn.Embedding` module of the model without doing anything.
2325
+ pad_to_multiple_of (`int`, *optional*):
2326
+ If set will pad the embedding matrix to a multiple of the provided value. If `new_num_tokens` is set to
2327
+ `None` will just pad the embedding to a multiple of `pad_to_multiple_of`.
2328
+
2329
+ This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
2330
+ `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. For more
2331
+ details about this, or help on choosing the correct value for resizing, refer to this guide:
2332
+ https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc
2333
+
2334
+ Return:
2335
+ `torch.nn.Embedding`: Pointer to the input tokens Embeddings Module of the model.
2336
+
2337
+ Note:
2338
+ This method differs from the base class implementation by resizing the `embedding_size` attribute of the
2339
+ model configuration instead of the `vocab_size`. It also includes a warning if the resized `embedding_size`
2340
+ is less than the `vocab_size`. In OLMo, `embedding_size` refers to the dimensionality of the model's token
2341
+ embeddings, while `vocab_size` refers to the number of unique tokens in the vocabulary.
2342
+ """
2343
+ model_embeds = self._resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
2344
+ if new_num_tokens is None and pad_to_multiple_of is None:
2345
+ return model_embeds
2346
+
2347
+ # Update base model and current model config
2348
+ self.config.embedding_size = model_embeds.weight.shape[0]
2349
+ self.model.config.embedding_size = model_embeds.weight.shape[0]
2350
+
2351
+ # Check if the embedding size is less than the vocab size
2352
+ if self.config.embedding_size < self.config.vocab_size:
2353
+ warning_message = (
2354
+ f"Resizing token embeddings to size {self.config.embedding_size}, which is less than the vocab size "
2355
+ f"{self.config.vocab_size} defined in the model configuration. Make sure your tokenizer's vocabulary "
2356
+ "size is less than or equal to the new token embedding size."
2357
+ )
2358
+ log.warning(warning_message)
2359
+
2360
+ # Tie weights again if needed
2361
+ self.tie_weights()
2362
+
2363
+ return model_embeds
2364
+
2365
+
2366
+ # Always register for multi-modal features
2367
+ AutoModelForCausalLM.register(MolmoConfig, MolmoForCausalLM)
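End-to-end usage is not shown in this file, so here is a minimal sketch of how a checkpoint packaged like this one is typically loaded and run; the repo path is a placeholder and the generation options are illustrative, not prescribed by this upload:

# Minimal usage sketch (not part of the checkpoint). "<path-to-this-repo>" is a
# placeholder for wherever these files live; adjust dtype/device as needed.
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig

repo = "<path-to-this-repo>"
processor = AutoProcessor.from_pretrained(repo, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    repo, trust_remote_code=True, torch_dtype=torch.bfloat16, device_map="auto"
)

# Build a single-example batch and move it onto the model's device.
inputs = processor.process(text="Describe this image.", images=Image.open("example.png"))
inputs = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}

output = model.generate_from_batch(
    inputs,
    GenerationConfig(max_new_tokens=128, stop_strings="<|endoftext|>"),
    tokenizer=processor.tokenizer,
)
generated = output[0, inputs["input_ids"].shape[1]:]
print(processor.tokenizer.decode(generated, skip_special_tokens=True))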
preprocessing_molmo.py ADDED
@@ -0,0 +1,192 @@
1
+ """
2
+ Processor class for Molmo.
3
+ """
4
+
5
+ from typing import Optional
6
+
7
+ import PIL
8
+ from PIL import ImageOps
9
+ from PIL.Image import Image
10
+
11
+ try:
12
+ from typing import Unpack
13
+ except ImportError:
14
+ from typing_extensions import Unpack
15
+
16
+ import numpy as np
17
+ import torch
18
+
19
+ from transformers.image_utils import ImageInput
20
+ from transformers.processing_utils import (
21
+ TextKwargs,
22
+ ProcessingKwargs,
23
+ ProcessorMixin,
24
+ )
25
+
26
+ from transformers.tokenization_utils_base import TextInput, PreTokenizedInput
27
+ from transformers.utils import logging
28
+
29
+ from transformers import AutoTokenizer
30
+ from .image_preprocessing_molmo import MolmoImagesKwargs, MolmoImageProcessor
31
+
32
+
33
+ logger = logging.get_logger(__name__)
34
+
35
+
36
+ DEFAULT_IMAGE_PATCH_TOKEN = f"<im_patch>"
37
+ DEFAULT_IM_START_TOKEN = f"<im_start>"
38
+ DEFAULT_IM_END_TOKEN = f"<im_end>"
39
+ DEFAULT_IM_COL_TOKEN = f"<im_col>"
40
+ IMAGE_PROMPT = "<|image|>"
41
+
42
+ EXTRA_TOKENS = (DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_COL_TOKEN, IMAGE_PROMPT)
43
+
44
+
45
+ def get_special_token_ids(tokenizer):
46
+ ids = tokenizer.encode("".join(EXTRA_TOKENS), add_special_tokens=False)
47
+ assert len(ids) == len(EXTRA_TOKENS)
48
+ return {k: i for k, i in zip(EXTRA_TOKENS, ids)}
49
+
50
+
51
+ class MolmoTextKwargs(TextKwargs, total=False):
52
+ style: Optional[str]
53
+ system_prompt: Optional[str]
54
+ message_format: Optional[str]
55
+ always_start_with_space: Optional[bool]
56
+ sequence_length: Optional[int]
57
+
58
+
59
+ class MolmoProcessorKwargs(ProcessingKwargs, total=False):
60
+ text_kwargs: MolmoTextKwargs
61
+ images_kwargs: MolmoImagesKwargs
62
+ _defaults = {
63
+ "images_kwargs": {
64
+ "max_crops": 12,
65
+ "overlap_margins": [4, 4],
66
+ "base_image_input_size": [336, 336],
67
+ "image_token_length_w": 12,
68
+ "image_token_length_h": 12,
69
+ "image_patch_size": 14,
70
+ "image_padding_mask": True,
71
+ },
72
+ "text_kwargs": {
73
+ "style": "long_caption",
74
+ "system_prompt": "none",
75
+ "message_format": "role",
76
+ "always_start_with_space": True,
77
+ "sequence_length": 1536,
78
+ "padding": False,
79
+ },
80
+ }
81
+
82
+
83
+ class MolmoProcessor(ProcessorMixin):
84
+ attributes = ["image_processor", "tokenizer"]
85
+ image_processor_class = "AutoImageProcessor"
86
+ tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
87
+
88
+ def __init__(self, image_processor: MolmoImageProcessor = None, tokenizer : AutoTokenizer = None, **kwargs):
89
+ # self.image_processor = image_processor
90
+ # self.tokenizer = tokenizer
91
+ super().__init__(image_processor, tokenizer)
92
+ self._special_tokens = None
93
+
94
+ @property
95
+ def special_token_ids(self):
96
+ if self._special_tokens is None:
97
+ self._special_tokens = get_special_token_ids(self.tokenizer)
98
+ return self._special_tokens
99
+
100
+ def get_tokens_input(self, prompt, message_format, always_start_with_space):
101
+ if message_format == "none" or message_format is None:
102
+ pass
103
+ elif message_format == "role":
104
+ prompt = "User: " + prompt + " Assistant:"
105
+ else:
106
+ raise NotImplementedError(f"Message format {message_format} not implemented")
107
+
108
+ if always_start_with_space:
109
+ prompt = " " + prompt
110
+
111
+ tokens = self.tokenizer.encode(prompt, add_special_tokens=False)
112
+
113
+ return tokens
114
+
115
+ def process(
116
+ self,
117
+ text: TextInput = None,
118
+ images: ImageInput = None,
119
+ *,
120
+ tokens: Optional[PreTokenizedInput] = None,
121
+ **kwargs: Unpack[MolmoProcessorKwargs],
122
+ ):
123
+ output_kwargs = self._merge_kwargs(
124
+ MolmoProcessorKwargs,
125
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
126
+ **kwargs,
127
+ )
128
+
129
+ if tokens is None:
130
+ tokens = self.get_tokens_input(
131
+ text,
132
+ output_kwargs["text_kwargs"]["message_format"],
133
+ output_kwargs["text_kwargs"]["always_start_with_space"],
134
+ )
135
+
136
+ image_token_id = self.special_token_ids[IMAGE_PROMPT]
137
+
138
+ if images is not None:
139
+ if not isinstance(images, (list, tuple)):
140
+ images = [images]
141
+ image_arrays = []
142
+ for image in images:
143
+ if isinstance(image, Image):
144
+ image = image.convert("RGB")
145
+ # Handle images with EXIF orientation tags, which PIL will ignore by default
146
+ # https://github.com/python-pillow/Pillow/issues/4703
147
+ image = ImageOps.exif_transpose(image)
148
+ image_arrays.append(np.array(image))
149
+ else:
150
+ assert len(image.shape) == 3 and image.shape[-1] == 3
151
+ image_arrays.append(image.astype(np.uint8))
152
+ images = image_arrays
153
+ # For now only support inserting images at the start
154
+ image_idx = [-1]*len(images)
155
+ else:
156
+ image_idx = None
157
+
158
+ sequence_length = output_kwargs["text_kwargs"]["sequence_length"]
159
+
160
+ image_patch_token_id = self.special_token_ids[DEFAULT_IMAGE_PATCH_TOKEN]
161
+ image_col_token_id = self.special_token_ids[DEFAULT_IM_COL_TOKEN]
162
+ image_start_token_id = self.special_token_ids[DEFAULT_IM_START_TOKEN]
163
+ image_end_token_id = self.special_token_ids[DEFAULT_IM_END_TOKEN]
164
+ out = self.image_processor.multimodal_preprocess(
165
+ images=images,
166
+ image_idx=image_idx,
167
+ tokens=np.asarray(tokens).astype(np.int32),
168
+ sequence_length=sequence_length,
169
+ image_patch_token_id=image_patch_token_id,
170
+ image_col_token_id=image_col_token_id,
171
+ image_start_token_id=image_start_token_id,
172
+ image_end_token_id=image_end_token_id,
173
+ **output_kwargs["images_kwargs"]
174
+ )
175
+
176
+ # Prepend BOS
177
+ # Qwen2 and OLMo do not have a BOS and instead use EOS as a generic separator token.
178
+ bos = self.tokenizer.bos_token_id or self.tokenizer.eos_token_id
179
+ decoder_input_tokens = np.pad(out["input_ids"], [[1, 0]], constant_values=bos)
180
+ out["input_ids"] = decoder_input_tokens
181
+ if "image_input_idx" in out:
182
+ # Shift patch mapping up by one since we added BOS
183
+ image_input_idx = out["image_input_idx"]
184
+ out["image_input_idx"] = np.where(image_input_idx < 0, image_input_idx, image_input_idx + 1)
185
+
186
+ for k, v in out.items():
187
+ out[k] = torch.from_numpy(v)
188
+
189
+ return out
190
+
191
+
192
+ MolmoProcessor.register_for_auto_class()
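Because a BOS/EOS token is prepended to input_ids after multimodal preprocessing, every non-negative entry of image_input_idx must shift right by one while padded (-1) entries stay put. A tiny numeric sketch of that shift, with a placeholder token id:

# Tiny numeric sketch of the BOS shift applied to image_input_idx above.
import numpy as np

input_ids = np.array([15, 16, 17], dtype=np.int32)
image_input_idx = np.array([1, 2, -1], dtype=np.int32)   # -1 marks unused slots

bos = 151643  # placeholder id; the real value comes from the tokenizer
input_ids = np.pad(input_ids, [[1, 0]], constant_values=bos)
image_input_idx = np.where(image_input_idx < 0, image_input_idx, image_input_idx + 1)

print(input_ids)        # [151643 15 16 17]
print(image_input_idx)  # [ 2  3 -1]  -> padded entries stay -1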
preprocessor_config.json ADDED
@@ -0,0 +1,32 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoImageProcessor": "image_preprocessing_molmo.MolmoImageProcessor",
4
+ "AutoProcessor": "preprocessing_molmo.MolmoProcessor"
5
+ },
6
+ "base_image_input_size": [
7
+ 336,
8
+ 336
9
+ ],
10
+ "do_normalize": true,
11
+ "image_mean": [
12
+ 0.48145466,
13
+ 0.4578275,
14
+ 0.40821073
15
+ ],
16
+ "image_padding_mask": true,
17
+ "image_patch_size": 14,
18
+ "image_processor_type": "MolmoImageProcessor",
19
+ "image_std": [
20
+ 0.26862954,
21
+ 0.26130258,
22
+ 0.27577711
23
+ ],
24
+ "image_token_length_h": 12,
25
+ "image_token_length_w": 12,
26
+ "max_crops": 12,
27
+ "overlap_margins": [
28
+ 4,
29
+ 4
30
+ ],
31
+ "processor_class": "MolmoProcessor"
32
+ }
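do_normalize with these image_mean/image_std values is the usual CLIP-style per-channel normalization; a minimal sketch of what it does to an RGB array, with the constants copied from this config:

# Minimal sketch of the normalization implied by do_normalize/image_mean/image_std.
import numpy as np

image_mean = np.array([0.48145466, 0.4578275, 0.40821073], dtype=np.float32)
image_std = np.array([0.26862954, 0.26130258, 0.27577711], dtype=np.float32)

pixels = np.random.randint(0, 256, size=(336, 336, 3)).astype(np.float32) / 255.0
normalized = (pixels - image_mean) / image_std
print(normalized.shape, normalized.mean(axis=(0, 1)))   # roughly zero-centered channels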
processor_config.json ADDED
@@ -0,0 +1,6 @@
 
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "preprocessing_molmo.MolmoProcessor"
4
+ },
5
+ "processor_class": "MolmoProcessor"
6
+ }
rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da661aad2e9ab98676885cda2d296e7d5781572d0062fef9c91ad25c971522e1
3
+ size 15920
rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76066b4424ebb894fbf93616ab2e9648b9b421dcd3b26e99900e877a4b1aef69
3
+ size 15984
rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1573cf092a799f5b9d7a1ea62ab9b1b58065859e3ab6d98cc28dc4083afdcfdd
3
+ size 15984
rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cbf4bea5d1ec717842d4dc103e72d1adb2a2b31afc91aefe38bcfcba578f77c6
3
+ size 15984
rng_state_4.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d885754fb7b8ce47bda620803ac75712487e2c508ad1b8100c7f9d38da7c661
3
+ size 15984
rng_state_5.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6862d26c10da6510d3a0336dac2f26b1e85421b284a42237463e13cc78ef3df1
3
+ size 16048
rng_state_6.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c6b9cfcbe810c109da95d448566518364e3ee79c9bb31a904613d5a69c8b367
3
+ size 15920
rng_state_7.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f52b3be52613e7b518e640203ac12f79eb7f2fdfae165af3bb755b5db080c178
3
+ size 15920
sft_args.json ADDED
@@ -0,0 +1,302 @@
1
+ {
2
+ "model_type": "molmo-7b-d",
3
+ "model_id_or_path": "pbarker/ComputerBase-v0.1-M-3epoch",
4
+ "model_revision": "main",
5
+ "full_determinism": false,
6
+ "sft_type": "full",
7
+ "freeze_parameters": [],
8
+ "freeze_vit": false,
9
+ "freeze_parameters_ratio": 0.0,
10
+ "additional_trainable_parameters": [],
11
+ "tuner_backend": "peft",
12
+ "template_type": "molmo",
13
+ "output_dir": "/workspace/output/molmo-7b-d/v1-20250103-233013",
14
+ "add_output_dir_suffix": true,
15
+ "ddp_backend": "nccl",
16
+ "ddp_find_unused_parameters": null,
17
+ "ddp_broadcast_buffers": null,
18
+ "ddp_timeout": 1800,
19
+ "seed": 42,
20
+ "resume_from_checkpoint": null,
21
+ "resume_only_model": false,
22
+ "ignore_data_skip": false,
23
+ "dtype": "bf16",
24
+ "packing": false,
25
+ "train_backend": "transformers",
26
+ "tp": 1,
27
+ "pp": 1,
28
+ "min_lr": null,
29
+ "sequence_parallel": false,
30
+ "model_kwargs": {},
31
+ "loss_name": null,
32
+ "dataset": [
33
+ "/workspace/train.jsonl"
34
+ ],
35
+ "val_dataset": [
36
+ "/workspace/val.jsonl"
37
+ ],
38
+ "dataset_seed": 42,
39
+ "dataset_test_ratio": 0.0,
40
+ "use_loss_scale": false,
41
+ "loss_scale_config_path": "/workspace/miniconda/lib/python3.12/site-packages/swift/llm/agent/default_loss_scale_config.json",
42
+ "system": null,
43
+ "tools_prompt": "react_en",
44
+ "max_length": 4096,
45
+ "truncation_strategy": "delete",
46
+ "check_dataset_strategy": "none",
47
+ "streaming": false,
48
+ "streaming_val_size": 0,
49
+ "streaming_buffer_size": 16384,
50
+ "model_name": [
51
+ null,
52
+ null
53
+ ],
54
+ "model_author": [
55
+ null,
56
+ null
57
+ ],
58
+ "quant_method": null,
59
+ "quantization_bit": 0,
60
+ "hqq_axis": 0,
61
+ "hqq_dynamic_config_path": null,
62
+ "bnb_4bit_comp_dtype": "bf16",
63
+ "bnb_4bit_quant_type": "nf4",
64
+ "bnb_4bit_use_double_quant": true,
65
+ "bnb_4bit_quant_storage": null,
66
+ "rescale_image": -1,
67
+ "target_modules": "^(model.transformer)(?!.*(lm_head|output|emb|wte|shared)).*",
68
+ "target_regex": null,
69
+ "modules_to_save": [],
70
+ "lora_rank": 8,
71
+ "lora_alpha": 32,
72
+ "lora_dropout": 0.05,
73
+ "lora_bias_trainable": "none",
74
+ "lora_dtype": "AUTO",
75
+ "lora_lr_ratio": null,
76
+ "use_rslora": false,
77
+ "use_dora": false,
78
+ "init_lora_weights": "true",
79
+ "fourier_n_frequency": 2000,
80
+ "fourier_scaling": 300.0,
81
+ "rope_scaling": null,
82
+ "boft_block_size": 4,
83
+ "boft_block_num": 0,
84
+ "boft_n_butterfly_factor": 1,
85
+ "boft_dropout": 0.0,
86
+ "vera_rank": 256,
87
+ "vera_projection_prng_key": 0,
88
+ "vera_dropout": 0.0,
89
+ "vera_d_initial": 0.1,
90
+ "adapter_act": "gelu",
91
+ "adapter_length": 128,
92
+ "use_galore": false,
93
+ "galore_target_modules": null,
94
+ "galore_rank": 128,
95
+ "galore_update_proj_gap": 50,
96
+ "galore_scale": 1.0,
97
+ "galore_proj_type": "std",
98
+ "galore_optim_per_parameter": false,
99
+ "galore_with_embedding": false,
100
+ "galore_quantization": false,
101
+ "galore_proj_quant": false,
102
+ "galore_proj_bits": 4,
103
+ "galore_proj_group_size": 256,
104
+ "galore_cos_threshold": 0.4,
105
+ "galore_gamma_proj": 2,
106
+ "galore_queue_size": 5,
107
+ "adalora_target_r": 8,
108
+ "adalora_init_r": 12,
109
+ "adalora_tinit": 0,
110
+ "adalora_tfinal": 0,
111
+ "adalora_deltaT": 1,
112
+ "adalora_beta1": 0.85,
113
+ "adalora_beta2": 0.85,
114
+ "adalora_orth_reg_weight": 0.5,
115
+ "ia3_feedforward_modules": [],
116
+ "llamapro_num_new_blocks": 4,
117
+ "llamapro_num_groups": null,
118
+ "neftune_noise_alpha": null,
119
+ "neftune_backend": "transformers",
120
+ "lisa_activated_layers": 0,
121
+ "lisa_step_interval": 20,
122
+ "reft_layer_key": null,
123
+ "reft_layers": null,
124
+ "reft_rank": 4,
125
+ "reft_intervention_type": "LoreftIntervention",
126
+ "reft_args": null,
127
+ "use_liger": false,
128
+ "gradient_checkpointing": false,
129
+ "vit_use_gc": true,
130
+ "deepspeed": {
131
+ "fp16": {
132
+ "enabled": "auto",
133
+ "loss_scale": 0,
134
+ "loss_scale_window": 1000,
135
+ "initial_scale_power": 16,
136
+ "hysteresis": 2,
137
+ "min_loss_scale": 1
138
+ },
139
+ "bf16": {
140
+ "enabled": "auto"
141
+ },
142
+ "optimizer": {
143
+ "type": "AdamW",
144
+ "params": {
145
+ "lr": "auto",
146
+ "betas": "auto",
147
+ "eps": "auto",
148
+ "weight_decay": "auto"
149
+ }
150
+ },
151
+ "scheduler": {
152
+ "type": "WarmupCosineLR",
153
+ "params": {
154
+ "total_num_steps": "auto",
155
+ "warmup_num_steps": "auto"
156
+ }
157
+ },
158
+ "zero_optimization": {
159
+ "stage": 3,
160
+ "offload_optimizer": {
161
+ "device": "none",
162
+ "pin_memory": true
163
+ },
164
+ "offload_param": {
165
+ "device": "none",
166
+ "pin_memory": true
167
+ },
168
+ "overlap_comm": true,
169
+ "contiguous_gradients": true,
170
+ "sub_group_size": 1000000000.0,
171
+ "reduce_bucket_size": "auto",
172
+ "stage3_prefetch_bucket_size": "auto",
173
+ "stage3_param_persistence_threshold": "auto",
174
+ "stage3_max_live_parameters": 1000000000.0,
175
+ "stage3_max_reuse_distance": 1000000000.0,
176
+ "stage3_gather_16bit_weights_on_model_save": true
177
+ },
178
+ "gradient_accumulation_steps": "auto",
179
+ "gradient_clipping": "auto",
180
+ "steps_per_print": 2000,
181
+ "train_batch_size": "auto",
182
+ "train_micro_batch_size_per_gpu": "auto",
183
+ "wall_clock_breakdown": false
184
+ },
185
+ "batch_size": 1,
186
+ "eval_batch_size": 1,
187
+ "auto_find_batch_size": false,
188
+ "num_train_epochs": 4,
189
+ "max_steps": -1,
190
+ "optim": "adamw_torch",
191
+ "adam_beta1": 0.9,
192
+ "adam_beta2": 0.95,
193
+ "adam_epsilon": 1e-08,
194
+ "learning_rate": 1e-05,
195
+ "weight_decay": 0.1,
196
+ "gradient_accumulation_steps": 2,
197
+ "max_grad_norm": 1,
198
+ "predict_with_generate": false,
199
+ "lr_scheduler_type": "cosine",
200
+ "lr_scheduler_kwargs": {},
201
+ "warmup_ratio": 0.05,
202
+ "warmup_steps": 0,
203
+ "eval_steps": 200,
204
+ "save_steps": 200,
205
+ "save_only_model": false,
206
+ "save_total_limit": 5,
207
+ "logging_steps": 5,
208
+ "acc_steps": 1,
209
+ "dataloader_num_workers": 1,
210
+ "dataloader_pin_memory": true,
211
+ "dataloader_drop_last": false,
212
+ "push_to_hub": false,
213
+ "hub_model_id": null,
214
+ "hub_token": null,
215
+ "hub_private_repo": false,
216
+ "hub_strategy": "every_save",
217
+ "test_oom_error": false,
218
+ "disable_tqdm": false,
219
+ "lazy_tokenize": true,
220
+ "preprocess_num_proc": 1,
221
+ "use_flash_attn": null,
222
+ "ignore_args_error": false,
223
+ "check_model_is_latest": true,
224
+ "logging_dir": "/workspace/output/molmo-7b-d/v1-20250103-233013/runs",
225
+ "report_to": [
226
+ "wandb"
227
+ ],
228
+ "acc_strategy": "token",
229
+ "save_on_each_node": false,
230
+ "evaluation_strategy": "epoch",
231
+ "save_strategy": "epoch",
232
+ "save_safetensors": true,
233
+ "gpu_memory_fraction": null,
234
+ "include_num_input_tokens_seen": false,
235
+ "local_repo_path": null,
236
+ "custom_register_path": null,
237
+ "custom_dataset_info": null,
238
+ "device_map_config": null,
239
+ "device_max_memory": [],
240
+ "max_new_tokens": 2048,
241
+ "do_sample": null,
242
+ "temperature": null,
243
+ "top_k": null,
244
+ "top_p": null,
245
+ "repetition_penalty": null,
246
+ "num_beams": 1,
247
+ "fsdp": "",
248
+ "fsdp_config": null,
249
+ "sequence_parallel_size": 1,
250
+ "model_layer_cls_name": null,
251
+ "metric_warmup_step": 0,
252
+ "fsdp_num": 1,
253
+ "per_device_train_batch_size": null,
254
+ "per_device_eval_batch_size": null,
255
+ "eval_strategy": "epoch",
256
+ "self_cognition_sample": 0,
257
+ "train_dataset_mix_ratio": 0.0,
258
+ "train_dataset_mix_ds": [
259
+ "ms-bench"
260
+ ],
261
+ "train_dataset_sample": -1,
262
+ "val_dataset_sample": null,
263
+ "safe_serialization": null,
264
+ "only_save_model": null,
265
+ "neftune_alpha": null,
266
+ "deepspeed_config_path": null,
267
+ "model_cache_dir": null,
268
+ "lora_dropout_p": null,
269
+ "lora_target_modules": [],
270
+ "lora_target_regex": null,
271
+ "lora_modules_to_save": [],
272
+ "boft_target_modules": [],
273
+ "boft_modules_to_save": [],
274
+ "vera_target_modules": [],
275
+ "vera_modules_to_save": [],
276
+ "ia3_target_modules": [],
277
+ "ia3_modules_to_save": [],
278
+ "custom_train_dataset_path": [],
279
+ "custom_val_dataset_path": [],
280
+ "device_map_config_path": null,
281
+ "push_hub_strategy": null,
282
+ "use_self_cognition": false,
283
+ "is_multimodal": true,
284
+ "is_vision": true,
285
+ "lora_use_embedding": false,
286
+ "lora_use_all": false,
287
+ "lora_m2s_use_embedding": false,
288
+ "lora_m2s_use_ln": false,
289
+ "torch_dtype": "torch.bfloat16",
290
+ "fp16": false,
291
+ "bf16": true,
292
+ "rank": 0,
293
+ "local_rank": 0,
294
+ "world_size": 8,
295
+ "local_world_size": 8,
296
+ "bnb_4bit_compute_dtype": "torch.bfloat16",
297
+ "load_in_4bit": false,
298
+ "load_in_8bit": false,
299
+ "train_sampler_random": true,
300
+ "train_type": "sft",
301
+ "training_args": "Seq2SeqTrainingArguments(output_dir='/workspace/output/molmo-7b-d/v1-20250103-233013', overwrite_output_dir=False, do_train=False, do_eval=True, do_predict=False, eval_strategy=<IntervalStrategy.EPOCH: 'epoch'>, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=1, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=2, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=1e-05, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, max_grad_norm=1, num_train_epochs=4, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs={}, warmup_ratio=0.05, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/workspace/output/molmo-7b-d/v1-20250103-233013/runs', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=True, logging_steps=5, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.EPOCH: 'epoch'>, save_steps=200, save_total_limit=5, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=42, jit_mode_eval=False, use_ipex=False, bf16=True, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=None, local_rank=0, ddp_backend='nccl', tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=200, dataloader_num_workers=1, dataloader_prefetch_factor=None, past_index=-1, run_name='/workspace/output/molmo-7b-d/v1-20250103-233013', disable_tqdm=False, remove_unused_columns=False, label_names=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, fsdp=[], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap=None, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'optimizer': {'type': 'AdamW', 'params': {'lr': 'auto', 'betas': 'auto', 'eps': 'auto', 'weight_decay': 'auto'}}, 'scheduler': {'type': 'WarmupCosineLR', 'params': {'total_num_steps': 'auto', 'warmup_num_steps': 'auto'}}, 'zero_optimization': {'stage': 3, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'offload_param': {'device': 'none', 'pin_memory': True}, 'overlap_comm': True, 'contiguous_gradients': True, 'sub_group_size': 1000000000.0, 'reduce_bucket_size': 'auto', 'stage3_prefetch_bucket_size': 'auto', 'stage3_param_persistence_threshold': 'auto', 'stage3_max_live_parameters': 1000000000.0, 'stage3_max_reuse_distance': 1000000000.0, 'stage3_gather_16bit_weights_on_model_save': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH: 'adamw_torch'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=['wandb'], 
ddp_find_unused_parameters=True, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=True, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=None, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=False, hub_always_push=False, gradient_checkpointing=False, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy=None, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=False, include_num_input_tokens_seen=False, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=False, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=GenerationConfig {\n \"eos_token_id\": 151643,\n \"max_new_tokens\": 2048,\n \"pad_token_id\": 151643\n}\n, acc_strategy='token', loss_name=None, additional_saved_files=[], train_sampler_random=True, metric_warmup_step=0, train_dataset_sample=-1)"
302
+ }
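For reference, the `deepspeed` block above is a ZeRO stage-3 configuration in which every `"auto"` value is resolved by the Hugging Face Trainer from the top-level arguments (learning rate, betas, warmup, batch sizes) at launch time. Below is a minimal sketch of how a config like this is typically wired into `Seq2SeqTrainingArguments`; this is not the exact launcher used for this run, the output path is illustrative, and the dict is abbreviated to the keys shown above.

```python
# Minimal sketch, assuming transformers + deepspeed are installed.
# The dict mirrors the "deepspeed" block in sft_args.json (abbreviated);
# "auto" entries are filled in by the HF Trainer from the arguments below.
from transformers import Seq2SeqTrainingArguments

ds_config = {
    "bf16": {"enabled": "auto"},
    "optimizer": {"type": "AdamW",
                  "params": {"lr": "auto", "betas": "auto",
                             "eps": "auto", "weight_decay": "auto"}},
    "scheduler": {"type": "WarmupCosineLR",
                  "params": {"total_num_steps": "auto", "warmup_num_steps": "auto"}},
    "zero_optimization": {"stage": 3,
                          "overlap_comm": True,
                          "contiguous_gradients": True,
                          "stage3_gather_16bit_weights_on_model_save": True},
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
}

training_args = Seq2SeqTrainingArguments(
    output_dir="./output/molmo-7b-d",   # illustrative path, not the original one
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    learning_rate=1e-5,
    weight_decay=0.1,
    adam_beta1=0.9,
    adam_beta2=0.95,
    num_train_epochs=4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    max_grad_norm=1.0,
    bf16=True,
    deepspeed=ds_config,                 # the ZeRO-3 config shown above
)
```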
special_tokens_map.json ADDED
@@ -0,0 +1,435 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "|<EXTRA_TOKENS_0>|",
4
+ "|<EXTRA_TOKENS_1>|",
5
+ "|<EXTRA_TOKENS_2>|",
6
+ "|<EXTRA_TOKENS_3>|",
7
+ "|<EXTRA_TOKENS_4>|",
8
+ "|<EXTRA_TOKENS_5>|",
9
+ "|<EXTRA_TOKENS_6>|",
10
+ "|<EXTRA_TOKENS_7>|",
11
+ "|<EXTRA_TOKENS_8>|",
12
+ "|<EXTRA_TOKENS_9>|",
13
+ "|<EXTRA_TOKENS_10>|",
14
+ "|<EXTRA_TOKENS_11>|",
15
+ "|<EXTRA_TOKENS_12>|",
16
+ "|<EXTRA_TOKENS_13>|",
17
+ "|<EXTRA_TOKENS_14>|",
18
+ "|<EXTRA_TOKENS_15>|",
19
+ "|<EXTRA_TOKENS_16>|",
20
+ "|<EXTRA_TOKENS_17>|",
21
+ "|<EXTRA_TOKENS_18>|",
22
+ "|<EXTRA_TOKENS_19>|",
23
+ "|<EXTRA_TOKENS_20>|",
24
+ "|<EXTRA_TOKENS_21>|",
25
+ "|<EXTRA_TOKENS_22>|",
26
+ "|<EXTRA_TOKENS_23>|",
27
+ "|<EXTRA_TOKENS_24>|",
28
+ "|<EXTRA_TOKENS_25>|",
29
+ "|<EXTRA_TOKENS_26>|",
30
+ "|<EXTRA_TOKENS_27>|",
31
+ "|<EXTRA_TOKENS_28>|",
32
+ "|<EXTRA_TOKENS_29>|",
33
+ "|<EXTRA_TOKENS_30>|",
34
+ "|<EXTRA_TOKENS_31>|",
35
+ "|<EXTRA_TOKENS_32>|",
36
+ "|<EXTRA_TOKENS_33>|",
37
+ "|<EXTRA_TOKENS_34>|",
38
+ "|<EXTRA_TOKENS_35>|",
39
+ "|<EXTRA_TOKENS_36>|",
40
+ "|<EXTRA_TOKENS_37>|",
41
+ "|<EXTRA_TOKENS_38>|",
42
+ "|<EXTRA_TOKENS_39>|",
43
+ "|<EXTRA_TOKENS_40>|",
44
+ "|<EXTRA_TOKENS_41>|",
45
+ "|<EXTRA_TOKENS_42>|",
46
+ "|<EXTRA_TOKENS_43>|",
47
+ "|<EXTRA_TOKENS_44>|",
48
+ "|<EXTRA_TOKENS_45>|",
49
+ "|<EXTRA_TOKENS_46>|",
50
+ "|<EXTRA_TOKENS_47>|",
51
+ "|<EXTRA_TOKENS_48>|",
52
+ "|<EXTRA_TOKENS_49>|",
53
+ "|<EXTRA_TOKENS_50>|",
54
+ "|<EXTRA_TOKENS_51>|",
55
+ "|<EXTRA_TOKENS_52>|",
56
+ "|<EXTRA_TOKENS_53>|",
57
+ "|<EXTRA_TOKENS_54>|",
58
+ "|<EXTRA_TOKENS_55>|",
59
+ "|<EXTRA_TOKENS_56>|",
60
+ "|<EXTRA_TOKENS_57>|",
61
+ "|<EXTRA_TOKENS_58>|",
62
+ "|<EXTRA_TOKENS_59>|",
63
+ "|<EXTRA_TOKENS_60>|",
64
+ "|<EXTRA_TOKENS_61>|",
65
+ "|<EXTRA_TOKENS_62>|",
66
+ "|<EXTRA_TOKENS_63>|",
67
+ "|<EXTRA_TOKENS_64>|",
68
+ "|<EXTRA_TOKENS_65>|",
69
+ "|<EXTRA_TOKENS_66>|",
70
+ "|<EXTRA_TOKENS_67>|",
71
+ "|<EXTRA_TOKENS_68>|",
72
+ "|<EXTRA_TOKENS_69>|",
73
+ "|<EXTRA_TOKENS_70>|",
74
+ "|<EXTRA_TOKENS_71>|",
75
+ "|<EXTRA_TOKENS_72>|",
76
+ "|<EXTRA_TOKENS_73>|",
77
+ "|<EXTRA_TOKENS_74>|",
78
+ "|<EXTRA_TOKENS_75>|",
79
+ "|<EXTRA_TOKENS_76>|",
80
+ "|<EXTRA_TOKENS_77>|",
81
+ "|<EXTRA_TOKENS_78>|",
82
+ "|<EXTRA_TOKENS_79>|",
83
+ "|<EXTRA_TOKENS_80>|",
84
+ "|<EXTRA_TOKENS_81>|",
85
+ "|<EXTRA_TOKENS_82>|",
86
+ "|<EXTRA_TOKENS_83>|",
87
+ "|<EXTRA_TOKENS_84>|",
88
+ "|<EXTRA_TOKENS_85>|",
89
+ "|<EXTRA_TOKENS_86>|",
90
+ "|<EXTRA_TOKENS_87>|",
91
+ "|<EXTRA_TOKENS_88>|",
92
+ "|<EXTRA_TOKENS_89>|",
93
+ "|<EXTRA_TOKENS_90>|",
94
+ "|<EXTRA_TOKENS_91>|",
95
+ "|<EXTRA_TOKENS_92>|",
96
+ "|<EXTRA_TOKENS_93>|",
97
+ "|<EXTRA_TOKENS_94>|",
98
+ "|<EXTRA_TOKENS_95>|",
99
+ "|<EXTRA_TOKENS_96>|",
100
+ "|<EXTRA_TOKENS_97>|",
101
+ "|<EXTRA_TOKENS_98>|",
102
+ "|<EXTRA_TOKENS_99>|",
103
+ "|<EXTRA_TOKENS_100>|",
104
+ "|<EXTRA_TOKENS_101>|",
105
+ "|<EXTRA_TOKENS_102>|",
106
+ "|<EXTRA_TOKENS_103>|",
107
+ "|<EXTRA_TOKENS_104>|",
108
+ "|<EXTRA_TOKENS_105>|",
109
+ "|<EXTRA_TOKENS_106>|",
110
+ "|<EXTRA_TOKENS_107>|",
111
+ "|<EXTRA_TOKENS_108>|",
112
+ "|<EXTRA_TOKENS_109>|",
113
+ "|<EXTRA_TOKENS_110>|",
114
+ "|<EXTRA_TOKENS_111>|",
115
+ "|<EXTRA_TOKENS_112>|",
116
+ "|<EXTRA_TOKENS_113>|",
117
+ "|<EXTRA_TOKENS_114>|",
118
+ "|<EXTRA_TOKENS_115>|",
119
+ "|<EXTRA_TOKENS_116>|",
120
+ "|<EXTRA_TOKENS_117>|",
121
+ "|<EXTRA_TOKENS_118>|",
122
+ "|<EXTRA_TOKENS_119>|",
123
+ "|<EXTRA_TOKENS_120>|",
124
+ "|<EXTRA_TOKENS_121>|",
125
+ "|<EXTRA_TOKENS_122>|",
126
+ "|<EXTRA_TOKENS_123>|",
127
+ "|<EXTRA_TOKENS_124>|",
128
+ "|<EXTRA_TOKENS_125>|",
129
+ "|<EXTRA_TOKENS_126>|",
130
+ "|<EXTRA_TOKENS_127>|",
131
+ "|<EXTRA_TOKENS_128>|",
132
+ "|<EXTRA_TOKENS_129>|",
133
+ "|<EXTRA_TOKENS_130>|",
134
+ "|<EXTRA_TOKENS_131>|",
135
+ "|<EXTRA_TOKENS_132>|",
136
+ "|<EXTRA_TOKENS_133>|",
137
+ "|<EXTRA_TOKENS_134>|",
138
+ "|<EXTRA_TOKENS_135>|",
139
+ "|<EXTRA_TOKENS_136>|",
140
+ "|<EXTRA_TOKENS_137>|",
141
+ "|<EXTRA_TOKENS_138>|",
142
+ "|<EXTRA_TOKENS_139>|",
143
+ "|<EXTRA_TOKENS_140>|",
144
+ "|<EXTRA_TOKENS_141>|",
145
+ "|<EXTRA_TOKENS_142>|",
146
+ "|<EXTRA_TOKENS_143>|",
147
+ "|<EXTRA_TOKENS_144>|",
148
+ "|<EXTRA_TOKENS_145>|",
149
+ "|<EXTRA_TOKENS_146>|",
150
+ "|<EXTRA_TOKENS_147>|",
151
+ "|<EXTRA_TOKENS_148>|",
152
+ "|<EXTRA_TOKENS_149>|",
153
+ "|<EXTRA_TOKENS_150>|",
154
+ "|<EXTRA_TOKENS_151>|",
155
+ "|<EXTRA_TOKENS_152>|",
156
+ "|<EXTRA_TOKENS_153>|",
157
+ "|<EXTRA_TOKENS_154>|",
158
+ "|<EXTRA_TOKENS_155>|",
159
+ "|<EXTRA_TOKENS_156>|",
160
+ "|<EXTRA_TOKENS_157>|",
161
+ "|<EXTRA_TOKENS_158>|",
162
+ "|<EXTRA_TOKENS_159>|",
163
+ "|<EXTRA_TOKENS_160>|",
164
+ "|<EXTRA_TOKENS_161>|",
165
+ "|<EXTRA_TOKENS_162>|",
166
+ "|<EXTRA_TOKENS_163>|",
167
+ "|<EXTRA_TOKENS_164>|",
168
+ "|<EXTRA_TOKENS_165>|",
169
+ "|<EXTRA_TOKENS_166>|",
170
+ "|<EXTRA_TOKENS_167>|",
171
+ "|<EXTRA_TOKENS_168>|",
172
+ "|<EXTRA_TOKENS_169>|",
173
+ "|<EXTRA_TOKENS_170>|",
174
+ "|<EXTRA_TOKENS_171>|",
175
+ "|<EXTRA_TOKENS_172>|",
176
+ "|<EXTRA_TOKENS_173>|",
177
+ "|<EXTRA_TOKENS_174>|",
178
+ "|<EXTRA_TOKENS_175>|",
179
+ "|<EXTRA_TOKENS_176>|",
180
+ "|<EXTRA_TOKENS_177>|",
181
+ "|<EXTRA_TOKENS_178>|",
182
+ "|<EXTRA_TOKENS_179>|",
183
+ "|<EXTRA_TOKENS_180>|",
184
+ "|<EXTRA_TOKENS_181>|",
185
+ "|<EXTRA_TOKENS_182>|",
186
+ "|<EXTRA_TOKENS_183>|",
187
+ "|<EXTRA_TOKENS_184>|",
188
+ "|<EXTRA_TOKENS_185>|",
189
+ "|<EXTRA_TOKENS_186>|",
190
+ "|<EXTRA_TOKENS_187>|",
191
+ "|<EXTRA_TOKENS_188>|",
192
+ "|<EXTRA_TOKENS_189>|",
193
+ "|<EXTRA_TOKENS_190>|",
194
+ "|<EXTRA_TOKENS_191>|",
195
+ "|<EXTRA_TOKENS_192>|",
196
+ "|<EXTRA_TOKENS_193>|",
197
+ "|<EXTRA_TOKENS_194>|",
198
+ "|<EXTRA_TOKENS_195>|",
199
+ "|<EXTRA_TOKENS_196>|",
200
+ "|<EXTRA_TOKENS_197>|",
201
+ "|<EXTRA_TOKENS_198>|",
202
+ "|<EXTRA_TOKENS_199>|",
203
+ "|<EXTRA_TOKENS_200>|",
204
+ "|<EXTRA_TOKENS_201>|",
205
+ "|<EXTRA_TOKENS_202>|",
206
+ "|<EXTRA_TOKENS_203>|",
207
+ "|<EXTRA_TOKENS_204>|",
208
+ "|<EXTRA_TOKENS_205>|",
209
+ "|<EXTRA_TOKENS_206>|",
210
+ "|<EXTRA_TOKENS_207>|",
211
+ "|<EXTRA_TOKENS_208>|",
212
+ "|<EXTRA_TOKENS_209>|",
213
+ "|<EXTRA_TOKENS_210>|",
214
+ "|<EXTRA_TOKENS_211>|",
215
+ "|<EXTRA_TOKENS_212>|",
216
+ "|<EXTRA_TOKENS_213>|",
217
+ "|<EXTRA_TOKENS_214>|",
218
+ "|<EXTRA_TOKENS_215>|",
219
+ "|<EXTRA_TOKENS_216>|",
220
+ "|<EXTRA_TOKENS_217>|",
221
+ "|<EXTRA_TOKENS_218>|",
222
+ "|<EXTRA_TOKENS_219>|",
223
+ "|<EXTRA_TOKENS_220>|",
224
+ "|<EXTRA_TOKENS_221>|",
225
+ "|<EXTRA_TOKENS_222>|",
226
+ "|<EXTRA_TOKENS_223>|",
227
+ "|<EXTRA_TOKENS_224>|",
228
+ "|<EXTRA_TOKENS_225>|",
229
+ "|<EXTRA_TOKENS_226>|",
230
+ "|<EXTRA_TOKENS_227>|",
231
+ "|<EXTRA_TOKENS_228>|",
232
+ "|<EXTRA_TOKENS_229>|",
233
+ "|<EXTRA_TOKENS_230>|",
234
+ "|<EXTRA_TOKENS_231>|",
235
+ "|<EXTRA_TOKENS_232>|",
236
+ "|<EXTRA_TOKENS_233>|",
237
+ "|<EXTRA_TOKENS_234>|",
238
+ "|<EXTRA_TOKENS_235>|",
239
+ "|<EXTRA_TOKENS_236>|",
240
+ "|<EXTRA_TOKENS_237>|",
241
+ "|<EXTRA_TOKENS_238>|",
242
+ "|<EXTRA_TOKENS_239>|",
243
+ "|<EXTRA_TOKENS_240>|",
244
+ "|<EXTRA_TOKENS_241>|",
245
+ "|<EXTRA_TOKENS_242>|",
246
+ "|<EXTRA_TOKENS_243>|",
247
+ "|<EXTRA_TOKENS_244>|",
248
+ "|<EXTRA_TOKENS_245>|",
249
+ "|<EXTRA_TOKENS_246>|",
250
+ "|<EXTRA_TOKENS_247>|",
251
+ "|<EXTRA_TOKENS_248>|",
252
+ "|<EXTRA_TOKENS_249>|",
253
+ "|<EXTRA_TOKENS_250>|",
254
+ "|<EXTRA_TOKENS_251>|",
255
+ "|<EXTRA_TOKENS_252>|",
256
+ "|<EXTRA_TOKENS_253>|",
257
+ "|<EXTRA_TOKENS_254>|",
258
+ "|<EXTRA_TOKENS_255>|",
259
+ "|<EXTRA_TOKENS_256>|",
260
+ "|<EXTRA_TOKENS_257>|",
261
+ "|<EXTRA_TOKENS_258>|",
262
+ "|<EXTRA_TOKENS_259>|",
263
+ "|<EXTRA_TOKENS_260>|",
264
+ "|<EXTRA_TOKENS_261>|",
265
+ "|<EXTRA_TOKENS_262>|",
266
+ "|<EXTRA_TOKENS_263>|",
267
+ "|<EXTRA_TOKENS_264>|",
268
+ "|<EXTRA_TOKENS_265>|",
269
+ "|<EXTRA_TOKENS_266>|",
270
+ "|<EXTRA_TOKENS_267>|",
271
+ "|<EXTRA_TOKENS_268>|",
272
+ "|<EXTRA_TOKENS_269>|",
273
+ "|<EXTRA_TOKENS_270>|",
274
+ "|<EXTRA_TOKENS_271>|",
275
+ "|<EXTRA_TOKENS_272>|",
276
+ "|<EXTRA_TOKENS_273>|",
277
+ "|<EXTRA_TOKENS_274>|",
278
+ "|<EXTRA_TOKENS_275>|",
279
+ "|<EXTRA_TOKENS_276>|",
280
+ "|<EXTRA_TOKENS_277>|",
281
+ "|<EXTRA_TOKENS_278>|",
282
+ "|<EXTRA_TOKENS_279>|",
283
+ "|<EXTRA_TOKENS_280>|",
284
+ "|<EXTRA_TOKENS_281>|",
285
+ "|<EXTRA_TOKENS_282>|",
286
+ "|<EXTRA_TOKENS_283>|",
287
+ "|<EXTRA_TOKENS_284>|",
288
+ "|<EXTRA_TOKENS_285>|",
289
+ "|<EXTRA_TOKENS_286>|",
290
+ "|<EXTRA_TOKENS_287>|",
291
+ "|<EXTRA_TOKENS_288>|",
292
+ "|<EXTRA_TOKENS_289>|",
293
+ "|<EXTRA_TOKENS_290>|",
294
+ "|<EXTRA_TOKENS_291>|",
295
+ "|<EXTRA_TOKENS_292>|",
296
+ "|<EXTRA_TOKENS_293>|",
297
+ "|<EXTRA_TOKENS_294>|",
298
+ "|<EXTRA_TOKENS_295>|",
299
+ "|<EXTRA_TOKENS_296>|",
300
+ "|<EXTRA_TOKENS_297>|",
301
+ "|<EXTRA_TOKENS_298>|",
302
+ "|<EXTRA_TOKENS_299>|",
303
+ "|<EXTRA_TOKENS_300>|",
304
+ "|<EXTRA_TOKENS_301>|",
305
+ "|<EXTRA_TOKENS_302>|",
306
+ "|<EXTRA_TOKENS_303>|",
307
+ "|<EXTRA_TOKENS_304>|",
308
+ "|<EXTRA_TOKENS_305>|",
309
+ "|<EXTRA_TOKENS_306>|",
310
+ "|<EXTRA_TOKENS_307>|",
311
+ "|<EXTRA_TOKENS_308>|",
312
+ "|<EXTRA_TOKENS_309>|",
313
+ "|<EXTRA_TOKENS_310>|",
314
+ "|<EXTRA_TOKENS_311>|",
315
+ "|<EXTRA_TOKENS_312>|",
316
+ "|<EXTRA_TOKENS_313>|",
317
+ "|<EXTRA_TOKENS_314>|",
318
+ "|<EXTRA_TOKENS_315>|",
319
+ "|<EXTRA_TOKENS_316>|",
320
+ "|<EXTRA_TOKENS_317>|",
321
+ "|<EXTRA_TOKENS_318>|",
322
+ "|<EXTRA_TOKENS_319>|",
323
+ "|<EXTRA_TOKENS_320>|",
324
+ "|<EXTRA_TOKENS_321>|",
325
+ "|<EXTRA_TOKENS_322>|",
326
+ "|<EXTRA_TOKENS_323>|",
327
+ "|<EXTRA_TOKENS_324>|",
328
+ "|<EXTRA_TOKENS_325>|",
329
+ "|<EXTRA_TOKENS_326>|",
330
+ "|<EXTRA_TOKENS_327>|",
331
+ "|<EXTRA_TOKENS_328>|",
332
+ "|<EXTRA_TOKENS_329>|",
333
+ "|<EXTRA_TOKENS_330>|",
334
+ "|<EXTRA_TOKENS_331>|",
335
+ "|<EXTRA_TOKENS_332>|",
336
+ "|<EXTRA_TOKENS_333>|",
337
+ "|<EXTRA_TOKENS_334>|",
338
+ "|<EXTRA_TOKENS_335>|",
339
+ "|<EXTRA_TOKENS_336>|",
340
+ "|<EXTRA_TOKENS_337>|",
341
+ "|<EXTRA_TOKENS_338>|",
342
+ "|<EXTRA_TOKENS_339>|",
343
+ "|<EXTRA_TOKENS_340>|",
344
+ "|<EXTRA_TOKENS_341>|",
345
+ "|<EXTRA_TOKENS_342>|",
346
+ "|<EXTRA_TOKENS_343>|",
347
+ "|<EXTRA_TOKENS_344>|",
348
+ "|<EXTRA_TOKENS_345>|",
349
+ "|<EXTRA_TOKENS_346>|",
350
+ "|<EXTRA_TOKENS_347>|",
351
+ "|<EXTRA_TOKENS_348>|",
352
+ "|<EXTRA_TOKENS_349>|",
353
+ "|<EXTRA_TOKENS_350>|",
354
+ "|<EXTRA_TOKENS_351>|",
355
+ "|<EXTRA_TOKENS_352>|",
356
+ "|<EXTRA_TOKENS_353>|",
357
+ "|<EXTRA_TOKENS_354>|",
358
+ "|<EXTRA_TOKENS_355>|",
359
+ "|<EXTRA_TOKENS_356>|",
360
+ "|<EXTRA_TOKENS_357>|",
361
+ "|<EXTRA_TOKENS_358>|",
362
+ "|<EXTRA_TOKENS_359>|",
363
+ "|<EXTRA_TOKENS_360>|",
364
+ "|<EXTRA_TOKENS_361>|",
365
+ "|<EXTRA_TOKENS_362>|",
366
+ "|<EXTRA_TOKENS_363>|",
367
+ "|<EXTRA_TOKENS_364>|",
368
+ "|<EXTRA_TOKENS_365>|",
369
+ "|<EXTRA_TOKENS_366>|",
370
+ "|<EXTRA_TOKENS_367>|",
371
+ "|<EXTRA_TOKENS_368>|",
372
+ "|<EXTRA_TOKENS_369>|",
373
+ "|<EXTRA_TOKENS_370>|",
374
+ "|<EXTRA_TOKENS_371>|",
375
+ "|<EXTRA_TOKENS_372>|",
376
+ "|<EXTRA_TOKENS_373>|",
377
+ "|<EXTRA_TOKENS_374>|",
378
+ "|<EXTRA_TOKENS_375>|",
379
+ "|<EXTRA_TOKENS_376>|",
380
+ "|<EXTRA_TOKENS_377>|",
381
+ "|<EXTRA_TOKENS_378>|",
382
+ "|<EXTRA_TOKENS_379>|",
383
+ "|<EXTRA_TOKENS_380>|",
384
+ "|<EXTRA_TOKENS_381>|",
385
+ "|<EXTRA_TOKENS_382>|",
386
+ "|<EXTRA_TOKENS_383>|",
387
+ "|<EXTRA_TOKENS_384>|",
388
+ "|<EXTRA_TOKENS_385>|",
389
+ "|<EXTRA_TOKENS_386>|",
390
+ "|<EXTRA_TOKENS_387>|",
391
+ "|<EXTRA_TOKENS_388>|",
392
+ "|<EXTRA_TOKENS_389>|",
393
+ "|<EXTRA_TOKENS_390>|",
394
+ "|<EXTRA_TOKENS_391>|",
395
+ "|<EXTRA_TOKENS_392>|",
396
+ "|<EXTRA_TOKENS_393>|",
397
+ "|<EXTRA_TOKENS_394>|",
398
+ "|<EXTRA_TOKENS_395>|",
399
+ "|<EXTRA_TOKENS_396>|",
400
+ "|<EXTRA_TOKENS_397>|",
401
+ "|<EXTRA_TOKENS_398>|",
402
+ "|<EXTRA_TOKENS_399>|",
403
+ "|<EXTRA_TOKENS_400>|",
404
+ "|<EXTRA_TOKENS_401>|",
405
+ "|<EXTRA_TOKENS_402>|",
406
+ "|<EXTRA_TOKENS_403>|",
407
+ "|<EXTRA_TOKENS_404>|",
408
+ "|<EXTRA_TOKENS_405>|",
409
+ "|<EXTRA_TOKENS_406>|",
410
+ "|<EXTRA_TOKENS_407>|",
411
+ "|<EXTRA_TOKENS_408>|",
412
+ "|<EXTRA_TOKENS_409>|",
413
+ "|<EXTRA_TOKENS_410>|",
414
+ "|<EXTRA_TOKENS_411>|",
415
+ "|<EXTRA_TOKENS_412>|",
416
+ "|<EXTRA_TOKENS_413>|",
417
+ "|<EXTRA_TOKENS_414>|",
418
+ "|<EXTRA_TOKENS_415>|",
419
+ "|<EXTRA_TOKENS_416>|",
420
+ "|<EXTRA_TOKENS_417>|",
421
+ "<im_start>",
422
+ "<im_end>",
423
+ "<im_patch>",
424
+ "<im_col>",
425
+ "<|image|>"
426
+ ],
427
+ "eos_token": "<|endoftext|>",
428
+ "pad_token": {
429
+ "content": "<|endoftext|>",
430
+ "lstrip": false,
431
+ "normalized": false,
432
+ "rstrip": false,
433
+ "single_word": false
434
+ }
435
+ }
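The `additional_special_tokens` above combine 418 `|<EXTRA_TOKENS_n>|` placeholders with the Molmo image markers (`<im_start>`, `<im_end>`, `<im_patch>`, `<im_col>`, `<|image|>`). A hedged sketch of how tokens like these are normally registered on a Hugging Face tokenizer follows; loading from the repo root is an assumption, and in this checkpoint the tokens already exist (ids 151646 and up in tokenizer_config.json), so the call is effectively a no-op.

```python
# Sketch only: registering the extra/image special tokens listed above.
# In this checkpoint they are already in the vocab, so add_special_tokens() returns 0.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(".", trust_remote_code=True)  # repo root assumed

extra = [f"|<EXTRA_TOKENS_{i}>|" for i in range(418)]
image_markers = ["<im_start>", "<im_end>", "<im_patch>", "<im_col>", "<|image|>"]
num_added = tok.add_special_tokens({"additional_special_tokens": extra + image_markers})

print(num_added, tok.convert_tokens_to_ids("<|image|>"))
```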
tokenizer.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6248048a83152ce87663c799492fe7e60c8086f3ae51ce7bd255ccc445746fc0
3
+ size 11501432
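tokenizer.json is tracked with Git LFS, so the diff records only the pointer (spec version, sha256 oid, byte size) rather than the file contents. A small sketch for checking that a locally downloaded copy matches this pointer; the local path is an assumption.

```python
# Sketch: verify a downloaded tokenizer.json against the LFS pointer above.
import hashlib
import os

path = "tokenizer.json"  # assumed local path
expected_oid = "6248048a83152ce87663c799492fe7e60c8086f3ae51ce7bd255ccc445746fc0"
expected_size = 11501432

with open(path, "rb") as f:
    digest = hashlib.sha256(f.read()).hexdigest()

assert os.path.getsize(path) == expected_size, "size mismatch (still an LFS pointer?)"
assert digest == expected_oid, "sha256 mismatch"
print("tokenizer.json matches its LFS pointer")
```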
tokenizer_config.json ADDED
@@ -0,0 +1,3853 @@
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "151643": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "151644": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "151645": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "151646": {
29
+ "content": "|<EXTRA_TOKENS_0>|",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "151647": {
37
+ "content": "|<EXTRA_TOKENS_1>|",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "151648": {
45
+ "content": "|<EXTRA_TOKENS_2>|",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "151649": {
53
+ "content": "|<EXTRA_TOKENS_3>|",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "151650": {
61
+ "content": "|<EXTRA_TOKENS_4>|",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "151651": {
69
+ "content": "|<EXTRA_TOKENS_5>|",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "151652": {
77
+ "content": "|<EXTRA_TOKENS_6>|",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "151653": {
85
+ "content": "|<EXTRA_TOKENS_7>|",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "151654": {
93
+ "content": "|<EXTRA_TOKENS_8>|",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "151655": {
101
+ "content": "|<EXTRA_TOKENS_9>|",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "151656": {
109
+ "content": "|<EXTRA_TOKENS_10>|",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ },
116
+ "151657": {
117
+ "content": "|<EXTRA_TOKENS_11>|",
118
+ "lstrip": false,
119
+ "normalized": false,
120
+ "rstrip": false,
121
+ "single_word": false,
122
+ "special": true
123
+ },
124
+ "151658": {
125
+ "content": "|<EXTRA_TOKENS_12>|",
126
+ "lstrip": false,
127
+ "normalized": false,
128
+ "rstrip": false,
129
+ "single_word": false,
130
+ "special": true
131
+ },
132
+ "151659": {
133
+ "content": "|<EXTRA_TOKENS_13>|",
134
+ "lstrip": false,
135
+ "normalized": false,
136
+ "rstrip": false,
137
+ "single_word": false,
138
+ "special": true
139
+ },
140
+ "151660": {
141
+ "content": "|<EXTRA_TOKENS_14>|",
142
+ "lstrip": false,
143
+ "normalized": false,
144
+ "rstrip": false,
145
+ "single_word": false,
146
+ "special": true
147
+ },
148
+ "151661": {
149
+ "content": "|<EXTRA_TOKENS_15>|",
150
+ "lstrip": false,
151
+ "normalized": false,
152
+ "rstrip": false,
153
+ "single_word": false,
154
+ "special": true
155
+ },
156
+ "151662": {
157
+ "content": "|<EXTRA_TOKENS_16>|",
158
+ "lstrip": false,
159
+ "normalized": false,
160
+ "rstrip": false,
161
+ "single_word": false,
162
+ "special": true
163
+ },
164
+ "151663": {
165
+ "content": "|<EXTRA_TOKENS_17>|",
166
+ "lstrip": false,
167
+ "normalized": false,
168
+ "rstrip": false,
169
+ "single_word": false,
170
+ "special": true
171
+ },
172
+ "151664": {
173
+ "content": "|<EXTRA_TOKENS_18>|",
174
+ "lstrip": false,
175
+ "normalized": false,
176
+ "rstrip": false,
177
+ "single_word": false,
178
+ "special": true
179
+ },
180
+ "151665": {
181
+ "content": "|<EXTRA_TOKENS_19>|",
182
+ "lstrip": false,
183
+ "normalized": false,
184
+ "rstrip": false,
185
+ "single_word": false,
186
+ "special": true
187
+ },
188
+ "151666": {
189
+ "content": "|<EXTRA_TOKENS_20>|",
190
+ "lstrip": false,
191
+ "normalized": false,
192
+ "rstrip": false,
193
+ "single_word": false,
194
+ "special": true
195
+ },
196
+ "151667": {
197
+ "content": "|<EXTRA_TOKENS_21>|",
198
+ "lstrip": false,
199
+ "normalized": false,
200
+ "rstrip": false,
201
+ "single_word": false,
202
+ "special": true
203
+ },
204
+ "151668": {
205
+ "content": "|<EXTRA_TOKENS_22>|",
206
+ "lstrip": false,
207
+ "normalized": false,
208
+ "rstrip": false,
209
+ "single_word": false,
210
+ "special": true
211
+ },
212
+ "151669": {
213
+ "content": "|<EXTRA_TOKENS_23>|",
214
+ "lstrip": false,
215
+ "normalized": false,
216
+ "rstrip": false,
217
+ "single_word": false,
218
+ "special": true
219
+ },
220
+ "151670": {
221
+ "content": "|<EXTRA_TOKENS_24>|",
222
+ "lstrip": false,
223
+ "normalized": false,
224
+ "rstrip": false,
225
+ "single_word": false,
226
+ "special": true
227
+ },
228
+ "151671": {
229
+ "content": "|<EXTRA_TOKENS_25>|",
230
+ "lstrip": false,
231
+ "normalized": false,
232
+ "rstrip": false,
233
+ "single_word": false,
234
+ "special": true
235
+ },
236
+ "151672": {
237
+ "content": "|<EXTRA_TOKENS_26>|",
238
+ "lstrip": false,
239
+ "normalized": false,
240
+ "rstrip": false,
241
+ "single_word": false,
242
+ "special": true
243
+ },
244
+ "151673": {
245
+ "content": "|<EXTRA_TOKENS_27>|",
246
+ "lstrip": false,
247
+ "normalized": false,
248
+ "rstrip": false,
249
+ "single_word": false,
250
+ "special": true
251
+ },
252
+ "151674": {
253
+ "content": "|<EXTRA_TOKENS_28>|",
254
+ "lstrip": false,
255
+ "normalized": false,
256
+ "rstrip": false,
257
+ "single_word": false,
258
+ "special": true
259
+ },
260
+ "151675": {
261
+ "content": "|<EXTRA_TOKENS_29>|",
262
+ "lstrip": false,
263
+ "normalized": false,
264
+ "rstrip": false,
265
+ "single_word": false,
266
+ "special": true
267
+ },
268
+ "151676": {
269
+ "content": "|<EXTRA_TOKENS_30>|",
270
+ "lstrip": false,
271
+ "normalized": false,
272
+ "rstrip": false,
273
+ "single_word": false,
274
+ "special": true
275
+ },
276
+ "151677": {
277
+ "content": "|<EXTRA_TOKENS_31>|",
278
+ "lstrip": false,
279
+ "normalized": false,
280
+ "rstrip": false,
281
+ "single_word": false,
282
+ "special": true
283
+ },
284
+ "151678": {
285
+ "content": "|<EXTRA_TOKENS_32>|",
286
+ "lstrip": false,
287
+ "normalized": false,
288
+ "rstrip": false,
289
+ "single_word": false,
290
+ "special": true
291
+ },
292
+ "151679": {
293
+ "content": "|<EXTRA_TOKENS_33>|",
294
+ "lstrip": false,
295
+ "normalized": false,
296
+ "rstrip": false,
297
+ "single_word": false,
298
+ "special": true
299
+ },
300
+ "151680": {
301
+ "content": "|<EXTRA_TOKENS_34>|",
302
+ "lstrip": false,
303
+ "normalized": false,
304
+ "rstrip": false,
305
+ "single_word": false,
306
+ "special": true
307
+ },
308
+ "151681": {
309
+ "content": "|<EXTRA_TOKENS_35>|",
310
+ "lstrip": false,
311
+ "normalized": false,
312
+ "rstrip": false,
313
+ "single_word": false,
314
+ "special": true
315
+ },
316
+ "151682": {
317
+ "content": "|<EXTRA_TOKENS_36>|",
318
+ "lstrip": false,
319
+ "normalized": false,
320
+ "rstrip": false,
321
+ "single_word": false,
322
+ "special": true
323
+ },
324
+ "151683": {
325
+ "content": "|<EXTRA_TOKENS_37>|",
326
+ "lstrip": false,
327
+ "normalized": false,
328
+ "rstrip": false,
329
+ "single_word": false,
330
+ "special": true
331
+ },
332
+ "151684": {
333
+ "content": "|<EXTRA_TOKENS_38>|",
334
+ "lstrip": false,
335
+ "normalized": false,
336
+ "rstrip": false,
337
+ "single_word": false,
338
+ "special": true
339
+ },
340
+ "151685": {
341
+ "content": "|<EXTRA_TOKENS_39>|",
342
+ "lstrip": false,
343
+ "normalized": false,
344
+ "rstrip": false,
345
+ "single_word": false,
346
+ "special": true
347
+ },
348
+ "151686": {
349
+ "content": "|<EXTRA_TOKENS_40>|",
350
+ "lstrip": false,
351
+ "normalized": false,
352
+ "rstrip": false,
353
+ "single_word": false,
354
+ "special": true
355
+ },
356
+ "151687": {
357
+ "content": "|<EXTRA_TOKENS_41>|",
358
+ "lstrip": false,
359
+ "normalized": false,
360
+ "rstrip": false,
361
+ "single_word": false,
362
+ "special": true
363
+ },
364
+ "151688": {
365
+ "content": "|<EXTRA_TOKENS_42>|",
366
+ "lstrip": false,
367
+ "normalized": false,
368
+ "rstrip": false,
369
+ "single_word": false,
370
+ "special": true
371
+ },
372
+ "151689": {
373
+ "content": "|<EXTRA_TOKENS_43>|",
374
+ "lstrip": false,
375
+ "normalized": false,
376
+ "rstrip": false,
377
+ "single_word": false,
378
+ "special": true
379
+ },
380
+ "151690": {
381
+ "content": "|<EXTRA_TOKENS_44>|",
382
+ "lstrip": false,
383
+ "normalized": false,
384
+ "rstrip": false,
385
+ "single_word": false,
386
+ "special": true
387
+ },
388
+ "151691": {
389
+ "content": "|<EXTRA_TOKENS_45>|",
390
+ "lstrip": false,
391
+ "normalized": false,
392
+ "rstrip": false,
393
+ "single_word": false,
394
+ "special": true
395
+ },
396
+ "151692": {
397
+ "content": "|<EXTRA_TOKENS_46>|",
398
+ "lstrip": false,
399
+ "normalized": false,
400
+ "rstrip": false,
401
+ "single_word": false,
402
+ "special": true
403
+ },
404
+ "151693": {
405
+ "content": "|<EXTRA_TOKENS_47>|",
406
+ "lstrip": false,
407
+ "normalized": false,
408
+ "rstrip": false,
409
+ "single_word": false,
410
+ "special": true
411
+ },
412
+ "151694": {
413
+ "content": "|<EXTRA_TOKENS_48>|",
414
+ "lstrip": false,
415
+ "normalized": false,
416
+ "rstrip": false,
417
+ "single_word": false,
418
+ "special": true
419
+ },
420
+ "151695": {
421
+ "content": "|<EXTRA_TOKENS_49>|",
422
+ "lstrip": false,
423
+ "normalized": false,
424
+ "rstrip": false,
425
+ "single_word": false,
426
+ "special": true
427
+ },
428
+ "151696": {
429
+ "content": "|<EXTRA_TOKENS_50>|",
430
+ "lstrip": false,
431
+ "normalized": false,
432
+ "rstrip": false,
433
+ "single_word": false,
434
+ "special": true
435
+ },
436
+ "151697": {
437
+ "content": "|<EXTRA_TOKENS_51>|",
438
+ "lstrip": false,
439
+ "normalized": false,
440
+ "rstrip": false,
441
+ "single_word": false,
442
+ "special": true
443
+ },
444
+ "151698": {
445
+ "content": "|<EXTRA_TOKENS_52>|",
446
+ "lstrip": false,
447
+ "normalized": false,
448
+ "rstrip": false,
449
+ "single_word": false,
450
+ "special": true
451
+ },
452
+ "151699": {
453
+ "content": "|<EXTRA_TOKENS_53>|",
454
+ "lstrip": false,
455
+ "normalized": false,
456
+ "rstrip": false,
457
+ "single_word": false,
458
+ "special": true
459
+ },
460
+ "151700": {
461
+ "content": "|<EXTRA_TOKENS_54>|",
462
+ "lstrip": false,
463
+ "normalized": false,
464
+ "rstrip": false,
465
+ "single_word": false,
466
+ "special": true
467
+ },
468
+ "151701": {
469
+ "content": "|<EXTRA_TOKENS_55>|",
470
+ "lstrip": false,
471
+ "normalized": false,
472
+ "rstrip": false,
473
+ "single_word": false,
474
+ "special": true
475
+ },
476
+ "151702": {
477
+ "content": "|<EXTRA_TOKENS_56>|",
478
+ "lstrip": false,
479
+ "normalized": false,
480
+ "rstrip": false,
481
+ "single_word": false,
482
+ "special": true
483
+ },
484
+ "151703": {
485
+ "content": "|<EXTRA_TOKENS_57>|",
486
+ "lstrip": false,
487
+ "normalized": false,
488
+ "rstrip": false,
489
+ "single_word": false,
490
+ "special": true
491
+ },
492
+ "151704": {
493
+ "content": "|<EXTRA_TOKENS_58>|",
494
+ "lstrip": false,
495
+ "normalized": false,
496
+ "rstrip": false,
497
+ "single_word": false,
498
+ "special": true
499
+ },
500
+ "151705": {
501
+ "content": "|<EXTRA_TOKENS_59>|",
502
+ "lstrip": false,
503
+ "normalized": false,
504
+ "rstrip": false,
505
+ "single_word": false,
506
+ "special": true
507
+ },
508
+ "151706": {
509
+ "content": "|<EXTRA_TOKENS_60>|",
510
+ "lstrip": false,
511
+ "normalized": false,
512
+ "rstrip": false,
513
+ "single_word": false,
514
+ "special": true
515
+ },
516
+ "151707": {
517
+ "content": "|<EXTRA_TOKENS_61>|",
518
+ "lstrip": false,
519
+ "normalized": false,
520
+ "rstrip": false,
521
+ "single_word": false,
522
+ "special": true
523
+ },
524
+ "151708": {
525
+ "content": "|<EXTRA_TOKENS_62>|",
526
+ "lstrip": false,
527
+ "normalized": false,
528
+ "rstrip": false,
529
+ "single_word": false,
530
+ "special": true
531
+ },
532
+ "151709": {
533
+ "content": "|<EXTRA_TOKENS_63>|",
534
+ "lstrip": false,
535
+ "normalized": false,
536
+ "rstrip": false,
537
+ "single_word": false,
538
+ "special": true
539
+ },
540
+ "151710": {
541
+ "content": "|<EXTRA_TOKENS_64>|",
542
+ "lstrip": false,
543
+ "normalized": false,
544
+ "rstrip": false,
545
+ "single_word": false,
546
+ "special": true
547
+ },
548
+ "151711": {
549
+ "content": "|<EXTRA_TOKENS_65>|",
550
+ "lstrip": false,
551
+ "normalized": false,
552
+ "rstrip": false,
553
+ "single_word": false,
554
+ "special": true
555
+ },
556
+ "151712": {
557
+ "content": "|<EXTRA_TOKENS_66>|",
558
+ "lstrip": false,
559
+ "normalized": false,
560
+ "rstrip": false,
561
+ "single_word": false,
562
+ "special": true
563
+ },
564
+ "151713": {
565
+ "content": "|<EXTRA_TOKENS_67>|",
566
+ "lstrip": false,
567
+ "normalized": false,
568
+ "rstrip": false,
569
+ "single_word": false,
570
+ "special": true
571
+ },
572
+ "151714": {
573
+ "content": "|<EXTRA_TOKENS_68>|",
574
+ "lstrip": false,
575
+ "normalized": false,
576
+ "rstrip": false,
577
+ "single_word": false,
578
+ "special": true
579
+ },
580
+ "151715": {
581
+ "content": "|<EXTRA_TOKENS_69>|",
582
+ "lstrip": false,
583
+ "normalized": false,
584
+ "rstrip": false,
585
+ "single_word": false,
586
+ "special": true
587
+ },
588
+ "151716": {
589
+ "content": "|<EXTRA_TOKENS_70>|",
590
+ "lstrip": false,
591
+ "normalized": false,
592
+ "rstrip": false,
593
+ "single_word": false,
594
+ "special": true
595
+ },
596
+ "151717": {
597
+ "content": "|<EXTRA_TOKENS_71>|",
598
+ "lstrip": false,
599
+ "normalized": false,
600
+ "rstrip": false,
601
+ "single_word": false,
602
+ "special": true
603
+ },
604
+ "151718": {
605
+ "content": "|<EXTRA_TOKENS_72>|",
606
+ "lstrip": false,
607
+ "normalized": false,
608
+ "rstrip": false,
609
+ "single_word": false,
610
+ "special": true
611
+ },
612
+ "151719": {
613
+ "content": "|<EXTRA_TOKENS_73>|",
614
+ "lstrip": false,
615
+ "normalized": false,
616
+ "rstrip": false,
617
+ "single_word": false,
618
+ "special": true
619
+ },
620
+ "151720": {
621
+ "content": "|<EXTRA_TOKENS_74>|",
622
+ "lstrip": false,
623
+ "normalized": false,
624
+ "rstrip": false,
625
+ "single_word": false,
626
+ "special": true
627
+ },
628
+ "151721": {
629
+ "content": "|<EXTRA_TOKENS_75>|",
630
+ "lstrip": false,
631
+ "normalized": false,
632
+ "rstrip": false,
633
+ "single_word": false,
634
+ "special": true
635
+ },
636
+ "151722": {
637
+ "content": "|<EXTRA_TOKENS_76>|",
638
+ "lstrip": false,
639
+ "normalized": false,
640
+ "rstrip": false,
641
+ "single_word": false,
642
+ "special": true
643
+ },
644
+ "151723": {
645
+ "content": "|<EXTRA_TOKENS_77>|",
646
+ "lstrip": false,
647
+ "normalized": false,
648
+ "rstrip": false,
649
+ "single_word": false,
650
+ "special": true
651
+ },
652
+ "151724": {
653
+ "content": "|<EXTRA_TOKENS_78>|",
654
+ "lstrip": false,
655
+ "normalized": false,
656
+ "rstrip": false,
657
+ "single_word": false,
658
+ "special": true
659
+ },
660
+ "151725": {
661
+ "content": "|<EXTRA_TOKENS_79>|",
662
+ "lstrip": false,
663
+ "normalized": false,
664
+ "rstrip": false,
665
+ "single_word": false,
666
+ "special": true
667
+ },
668
+ "151726": {
669
+ "content": "|<EXTRA_TOKENS_80>|",
670
+ "lstrip": false,
671
+ "normalized": false,
672
+ "rstrip": false,
673
+ "single_word": false,
674
+ "special": true
675
+ },
676
+ "151727": {
677
+ "content": "|<EXTRA_TOKENS_81>|",
678
+ "lstrip": false,
679
+ "normalized": false,
680
+ "rstrip": false,
681
+ "single_word": false,
682
+ "special": true
683
+ },
684
+ "151728": {
685
+ "content": "|<EXTRA_TOKENS_82>|",
686
+ "lstrip": false,
687
+ "normalized": false,
688
+ "rstrip": false,
689
+ "single_word": false,
690
+ "special": true
691
+ },
692
+ "151729": {
693
+ "content": "|<EXTRA_TOKENS_83>|",
694
+ "lstrip": false,
695
+ "normalized": false,
696
+ "rstrip": false,
697
+ "single_word": false,
698
+ "special": true
699
+ },
700
+ "151730": {
701
+ "content": "|<EXTRA_TOKENS_84>|",
702
+ "lstrip": false,
703
+ "normalized": false,
704
+ "rstrip": false,
705
+ "single_word": false,
706
+ "special": true
707
+ },
708
+ "151731": {
709
+ "content": "|<EXTRA_TOKENS_85>|",
710
+ "lstrip": false,
711
+ "normalized": false,
712
+ "rstrip": false,
713
+ "single_word": false,
714
+ "special": true
715
+ },
716
+ "151732": {
717
+ "content": "|<EXTRA_TOKENS_86>|",
718
+ "lstrip": false,
719
+ "normalized": false,
720
+ "rstrip": false,
721
+ "single_word": false,
722
+ "special": true
723
+ },
724
+ "151733": {
725
+ "content": "|<EXTRA_TOKENS_87>|",
726
+ "lstrip": false,
727
+ "normalized": false,
728
+ "rstrip": false,
729
+ "single_word": false,
730
+ "special": true
731
+ },
732
+ "151734": {
733
+ "content": "|<EXTRA_TOKENS_88>|",
734
+ "lstrip": false,
735
+ "normalized": false,
736
+ "rstrip": false,
737
+ "single_word": false,
738
+ "special": true
739
+ },
740
+ "151735": {
741
+ "content": "|<EXTRA_TOKENS_89>|",
742
+ "lstrip": false,
743
+ "normalized": false,
744
+ "rstrip": false,
745
+ "single_word": false,
746
+ "special": true
747
+ },
748
+ "151736": {
749
+ "content": "|<EXTRA_TOKENS_90>|",
750
+ "lstrip": false,
751
+ "normalized": false,
752
+ "rstrip": false,
753
+ "single_word": false,
754
+ "special": true
755
+ },
756
+ "151737": {
757
+ "content": "|<EXTRA_TOKENS_91>|",
758
+ "lstrip": false,
759
+ "normalized": false,
760
+ "rstrip": false,
761
+ "single_word": false,
762
+ "special": true
763
+ },
764
+ "151738": {
765
+ "content": "|<EXTRA_TOKENS_92>|",
766
+ "lstrip": false,
767
+ "normalized": false,
768
+ "rstrip": false,
769
+ "single_word": false,
770
+ "special": true
771
+ },
772
+ "151739": {
773
+ "content": "|<EXTRA_TOKENS_93>|",
774
+ "lstrip": false,
775
+ "normalized": false,
776
+ "rstrip": false,
777
+ "single_word": false,
778
+ "special": true
779
+ },
780
+ "151740": {
781
+ "content": "|<EXTRA_TOKENS_94>|",
782
+ "lstrip": false,
783
+ "normalized": false,
784
+ "rstrip": false,
785
+ "single_word": false,
786
+ "special": true
787
+ },
788
+ "151741": {
789
+ "content": "|<EXTRA_TOKENS_95>|",
790
+ "lstrip": false,
791
+ "normalized": false,
792
+ "rstrip": false,
793
+ "single_word": false,
794
+ "special": true
795
+ },
796
+ "151742": {
797
+ "content": "|<EXTRA_TOKENS_96>|",
798
+ "lstrip": false,
799
+ "normalized": false,
800
+ "rstrip": false,
801
+ "single_word": false,
802
+ "special": true
803
+ },
804
+ "151743": {
805
+ "content": "|<EXTRA_TOKENS_97>|",
806
+ "lstrip": false,
807
+ "normalized": false,
808
+ "rstrip": false,
809
+ "single_word": false,
810
+ "special": true
811
+ },
812
+ "151744": {
813
+ "content": "|<EXTRA_TOKENS_98>|",
814
+ "lstrip": false,
815
+ "normalized": false,
816
+ "rstrip": false,
817
+ "single_word": false,
818
+ "special": true
819
+ },
820
+ "151745": {
821
+ "content": "|<EXTRA_TOKENS_99>|",
822
+ "lstrip": false,
823
+ "normalized": false,
824
+ "rstrip": false,
825
+ "single_word": false,
826
+ "special": true
827
+ },
828
+ "151746": {
829
+ "content": "|<EXTRA_TOKENS_100>|",
830
+ "lstrip": false,
831
+ "normalized": false,
832
+ "rstrip": false,
833
+ "single_word": false,
834
+ "special": true
835
+ },
836
+ "151747": {
837
+ "content": "|<EXTRA_TOKENS_101>|",
838
+ "lstrip": false,
839
+ "normalized": false,
840
+ "rstrip": false,
841
+ "single_word": false,
842
+ "special": true
843
+ },
844
+ "151748": {
845
+ "content": "|<EXTRA_TOKENS_102>|",
846
+ "lstrip": false,
847
+ "normalized": false,
848
+ "rstrip": false,
849
+ "single_word": false,
850
+ "special": true
851
+ },
852
+ "151749": {
853
+ "content": "|<EXTRA_TOKENS_103>|",
854
+ "lstrip": false,
855
+ "normalized": false,
856
+ "rstrip": false,
857
+ "single_word": false,
858
+ "special": true
859
+ },
860
+ "151750": {
861
+ "content": "|<EXTRA_TOKENS_104>|",
862
+ "lstrip": false,
863
+ "normalized": false,
864
+ "rstrip": false,
865
+ "single_word": false,
866
+ "special": true
867
+ },
868
+ "151751": {
869
+ "content": "|<EXTRA_TOKENS_105>|",
870
+ "lstrip": false,
871
+ "normalized": false,
872
+ "rstrip": false,
873
+ "single_word": false,
874
+ "special": true
875
+ },
876
+ "151752": {
877
+ "content": "|<EXTRA_TOKENS_106>|",
878
+ "lstrip": false,
879
+ "normalized": false,
880
+ "rstrip": false,
881
+ "single_word": false,
882
+ "special": true
883
+ },
884
+ "151753": {
885
+ "content": "|<EXTRA_TOKENS_107>|",
886
+ "lstrip": false,
887
+ "normalized": false,
888
+ "rstrip": false,
889
+ "single_word": false,
890
+ "special": true
891
+ },
892
+ "151754": {
893
+ "content": "|<EXTRA_TOKENS_108>|",
894
+ "lstrip": false,
895
+ "normalized": false,
896
+ "rstrip": false,
897
+ "single_word": false,
898
+ "special": true
899
+ },
900
+ "151755": {
901
+ "content": "|<EXTRA_TOKENS_109>|",
902
+ "lstrip": false,
903
+ "normalized": false,
904
+ "rstrip": false,
905
+ "single_word": false,
906
+ "special": true
907
+ },
908
+ "151756": {
909
+ "content": "|<EXTRA_TOKENS_110>|",
910
+ "lstrip": false,
911
+ "normalized": false,
912
+ "rstrip": false,
913
+ "single_word": false,
914
+ "special": true
915
+ },
916
+ "151757": {
917
+ "content": "|<EXTRA_TOKENS_111>|",
918
+ "lstrip": false,
919
+ "normalized": false,
920
+ "rstrip": false,
921
+ "single_word": false,
922
+ "special": true
923
+ },
924
+ "151758": {
925
+ "content": "|<EXTRA_TOKENS_112>|",
926
+ "lstrip": false,
927
+ "normalized": false,
928
+ "rstrip": false,
929
+ "single_word": false,
930
+ "special": true
931
+ },
932
+ "151759": {
933
+ "content": "|<EXTRA_TOKENS_113>|",
934
+ "lstrip": false,
935
+ "normalized": false,
936
+ "rstrip": false,
937
+ "single_word": false,
938
+ "special": true
939
+ },
940
+ "151760": {
941
+ "content": "|<EXTRA_TOKENS_114>|",
942
+ "lstrip": false,
943
+ "normalized": false,
944
+ "rstrip": false,
945
+ "single_word": false,
946
+ "special": true
947
+ },
948
+ "151761": {
949
+ "content": "|<EXTRA_TOKENS_115>|",
950
+ "lstrip": false,
951
+ "normalized": false,
952
+ "rstrip": false,
953
+ "single_word": false,
954
+ "special": true
955
+ },
956
+ "151762": {
957
+ "content": "|<EXTRA_TOKENS_116>|",
958
+ "lstrip": false,
959
+ "normalized": false,
960
+ "rstrip": false,
961
+ "single_word": false,
962
+ "special": true
963
+ },
964
+ "151763": {
965
+ "content": "|<EXTRA_TOKENS_117>|",
966
+ "lstrip": false,
967
+ "normalized": false,
968
+ "rstrip": false,
969
+ "single_word": false,
970
+ "special": true
971
+ },
972
+ "151764": {
973
+ "content": "|<EXTRA_TOKENS_118>|",
974
+ "lstrip": false,
975
+ "normalized": false,
976
+ "rstrip": false,
977
+ "single_word": false,
978
+ "special": true
979
+ },
980
+ "151765": {
981
+ "content": "|<EXTRA_TOKENS_119>|",
982
+ "lstrip": false,
983
+ "normalized": false,
984
+ "rstrip": false,
985
+ "single_word": false,
986
+ "special": true
987
+ },
988
+ "151766": {
989
+ "content": "|<EXTRA_TOKENS_120>|",
990
+ "lstrip": false,
991
+ "normalized": false,
992
+ "rstrip": false,
993
+ "single_word": false,
994
+ "special": true
995
+ },
996
+ "151767": {
997
+ "content": "|<EXTRA_TOKENS_121>|",
998
+ "lstrip": false,
999
+ "normalized": false,
1000
+ "rstrip": false,
1001
+ "single_word": false,
1002
+ "special": true
1003
+ },
1004
+ "151768": {
1005
+ "content": "|<EXTRA_TOKENS_122>|",
1006
+ "lstrip": false,
1007
+ "normalized": false,
1008
+ "rstrip": false,
1009
+ "single_word": false,
1010
+ "special": true
1011
+ },
1012
+ "151769": {
1013
+ "content": "|<EXTRA_TOKENS_123>|",
1014
+ "lstrip": false,
1015
+ "normalized": false,
1016
+ "rstrip": false,
1017
+ "single_word": false,
1018
+ "special": true
1019
+ },
1020
+ "151770": {
1021
+ "content": "|<EXTRA_TOKENS_124>|",
1022
+ "lstrip": false,
1023
+ "normalized": false,
1024
+ "rstrip": false,
1025
+ "single_word": false,
1026
+ "special": true
1027
+ },
1028
+ "151771": {
1029
+ "content": "|<EXTRA_TOKENS_125>|",
1030
+ "lstrip": false,
1031
+ "normalized": false,
1032
+ "rstrip": false,
1033
+ "single_word": false,
1034
+ "special": true
1035
+ },
1036
+ "151772": {
1037
+ "content": "|<EXTRA_TOKENS_126>|",
1038
+ "lstrip": false,
1039
+ "normalized": false,
1040
+ "rstrip": false,
1041
+ "single_word": false,
1042
+ "special": true
1043
+ },
1044
+ "151773": {
1045
+ "content": "|<EXTRA_TOKENS_127>|",
1046
+ "lstrip": false,
1047
+ "normalized": false,
1048
+ "rstrip": false,
1049
+ "single_word": false,
1050
+ "special": true
1051
+ },
1052
+ "151774": {
1053
+ "content": "|<EXTRA_TOKENS_128>|",
1054
+ "lstrip": false,
1055
+ "normalized": false,
1056
+ "rstrip": false,
1057
+ "single_word": false,
1058
+ "special": true
1059
+ },
1060
+ "151775": {
1061
+ "content": "|<EXTRA_TOKENS_129>|",
1062
+ "lstrip": false,
1063
+ "normalized": false,
1064
+ "rstrip": false,
1065
+ "single_word": false,
1066
+ "special": true
1067
+ },
1068
+ "151776": {
1069
+ "content": "|<EXTRA_TOKENS_130>|",
1070
+ "lstrip": false,
1071
+ "normalized": false,
1072
+ "rstrip": false,
1073
+ "single_word": false,
1074
+ "special": true
1075
+ },
1076
+ "151777": {
1077
+ "content": "|<EXTRA_TOKENS_131>|",
1078
+ "lstrip": false,
1079
+ "normalized": false,
1080
+ "rstrip": false,
1081
+ "single_word": false,
1082
+ "special": true
1083
+ },
1084
+ "151778": {
1085
+ "content": "|<EXTRA_TOKENS_132>|",
1086
+ "lstrip": false,
1087
+ "normalized": false,
1088
+ "rstrip": false,
1089
+ "single_word": false,
1090
+ "special": true
1091
+ },
1092
+ "151779": {
1093
+ "content": "|<EXTRA_TOKENS_133>|",
1094
+ "lstrip": false,
1095
+ "normalized": false,
1096
+ "rstrip": false,
1097
+ "single_word": false,
1098
+ "special": true
1099
+ },
1100
+ "151780": {
1101
+ "content": "|<EXTRA_TOKENS_134>|",
1102
+ "lstrip": false,
1103
+ "normalized": false,
1104
+ "rstrip": false,
1105
+ "single_word": false,
1106
+ "special": true
1107
+ },
1108
+ "151781": {
1109
+ "content": "|<EXTRA_TOKENS_135>|",
1110
+ "lstrip": false,
1111
+ "normalized": false,
1112
+ "rstrip": false,
1113
+ "single_word": false,
1114
+ "special": true
1115
+ },
1116
+ "151782": {
1117
+ "content": "|<EXTRA_TOKENS_136>|",
1118
+ "lstrip": false,
1119
+ "normalized": false,
1120
+ "rstrip": false,
1121
+ "single_word": false,
1122
+ "special": true
1123
+ },
1124
+ "151783": {
1125
+ "content": "|<EXTRA_TOKENS_137>|",
1126
+ "lstrip": false,
1127
+ "normalized": false,
1128
+ "rstrip": false,
1129
+ "single_word": false,
1130
+ "special": true
1131
+ },
1132
+ "151784": {
1133
+ "content": "|<EXTRA_TOKENS_138>|",
1134
+ "lstrip": false,
1135
+ "normalized": false,
1136
+ "rstrip": false,
1137
+ "single_word": false,
1138
+ "special": true
1139
+ },
1140
+ "151785": {
1141
+ "content": "|<EXTRA_TOKENS_139>|",
1142
+ "lstrip": false,
1143
+ "normalized": false,
1144
+ "rstrip": false,
1145
+ "single_word": false,
1146
+ "special": true
1147
+ },
1148
+ "151786": {
1149
+ "content": "|<EXTRA_TOKENS_140>|",
1150
+ "lstrip": false,
1151
+ "normalized": false,
1152
+ "rstrip": false,
1153
+ "single_word": false,
1154
+ "special": true
1155
+ },
1156
+ "151787": {
1157
+ "content": "|<EXTRA_TOKENS_141>|",
1158
+ "lstrip": false,
1159
+ "normalized": false,
1160
+ "rstrip": false,
1161
+ "single_word": false,
1162
+ "special": true
1163
+ },
1164
+ "151788": {
1165
+ "content": "|<EXTRA_TOKENS_142>|",
1166
+ "lstrip": false,
1167
+ "normalized": false,
1168
+ "rstrip": false,
1169
+ "single_word": false,
1170
+ "special": true
1171
+ },
1172
+ "151789": {
1173
+ "content": "|<EXTRA_TOKENS_143>|",
1174
+ "lstrip": false,
1175
+ "normalized": false,
1176
+ "rstrip": false,
1177
+ "single_word": false,
1178
+ "special": true
1179
+ },
1180
+ "151790": {
1181
+ "content": "|<EXTRA_TOKENS_144>|",
1182
+ "lstrip": false,
1183
+ "normalized": false,
1184
+ "rstrip": false,
1185
+ "single_word": false,
1186
+ "special": true
1187
+ },
1188
+ "151791": {
1189
+ "content": "|<EXTRA_TOKENS_145>|",
1190
+ "lstrip": false,
1191
+ "normalized": false,
1192
+ "rstrip": false,
1193
+ "single_word": false,
1194
+ "special": true
1195
+ },
1196
+ "151792": {
1197
+ "content": "|<EXTRA_TOKENS_146>|",
1198
+ "lstrip": false,
1199
+ "normalized": false,
1200
+ "rstrip": false,
1201
+ "single_word": false,
1202
+ "special": true
1203
+ },
1204
+ "151793": {
1205
+ "content": "|<EXTRA_TOKENS_147>|",
1206
+ "lstrip": false,
1207
+ "normalized": false,
1208
+ "rstrip": false,
1209
+ "single_word": false,
1210
+ "special": true
1211
+ },
1212
+ "151794": {
1213
+ "content": "|<EXTRA_TOKENS_148>|",
1214
+ "lstrip": false,
1215
+ "normalized": false,
1216
+ "rstrip": false,
1217
+ "single_word": false,
1218
+ "special": true
1219
+ },
1220
+ "151795": {
1221
+ "content": "|<EXTRA_TOKENS_149>|",
1222
+ "lstrip": false,
1223
+ "normalized": false,
1224
+ "rstrip": false,
1225
+ "single_word": false,
1226
+ "special": true
1227
+ },
1228
+ "151796": {
1229
+ "content": "|<EXTRA_TOKENS_150>|",
1230
+ "lstrip": false,
1231
+ "normalized": false,
1232
+ "rstrip": false,
1233
+ "single_word": false,
1234
+ "special": true
1235
+ },
1236
+ "151797": {
1237
+ "content": "|<EXTRA_TOKENS_151>|",
1238
+ "lstrip": false,
1239
+ "normalized": false,
1240
+ "rstrip": false,
1241
+ "single_word": false,
1242
+ "special": true
1243
+ },
1244
+ "151798": {
1245
+ "content": "|<EXTRA_TOKENS_152>|",
1246
+ "lstrip": false,
1247
+ "normalized": false,
1248
+ "rstrip": false,
1249
+ "single_word": false,
1250
+ "special": true
1251
+ },
1252
+ "151799": {
1253
+ "content": "|<EXTRA_TOKENS_153>|",
1254
+ "lstrip": false,
1255
+ "normalized": false,
1256
+ "rstrip": false,
1257
+ "single_word": false,
1258
+ "special": true
1259
+ },
1260
+ "151800": {
1261
+ "content": "|<EXTRA_TOKENS_154>|",
1262
+ "lstrip": false,
1263
+ "normalized": false,
1264
+ "rstrip": false,
1265
+ "single_word": false,
1266
+ "special": true
1267
+ },
1268
+ "151801": {
1269
+ "content": "|<EXTRA_TOKENS_155>|",
1270
+ "lstrip": false,
1271
+ "normalized": false,
1272
+ "rstrip": false,
1273
+ "single_word": false,
1274
+ "special": true
1275
+ },
1276
+ "151802": {
1277
+ "content": "|<EXTRA_TOKENS_156>|",
1278
+ "lstrip": false,
1279
+ "normalized": false,
1280
+ "rstrip": false,
1281
+ "single_word": false,
1282
+ "special": true
1283
+ },
1284
+ "151803": {
1285
+ "content": "|<EXTRA_TOKENS_157>|",
1286
+ "lstrip": false,
1287
+ "normalized": false,
1288
+ "rstrip": false,
1289
+ "single_word": false,
1290
+ "special": true
1291
+ },
1292
+ "151804": {
1293
+ "content": "|<EXTRA_TOKENS_158>|",
1294
+ "lstrip": false,
1295
+ "normalized": false,
1296
+ "rstrip": false,
1297
+ "single_word": false,
1298
+ "special": true
1299
+ },
1300
+ "151805": {
1301
+ "content": "|<EXTRA_TOKENS_159>|",
1302
+ "lstrip": false,
1303
+ "normalized": false,
1304
+ "rstrip": false,
1305
+ "single_word": false,
1306
+ "special": true
1307
+ },
1308
+ "151806": {
1309
+ "content": "|<EXTRA_TOKENS_160>|",
1310
+ "lstrip": false,
1311
+ "normalized": false,
1312
+ "rstrip": false,
1313
+ "single_word": false,
1314
+ "special": true
1315
+ },
1316
+ "151807": {
1317
+ "content": "|<EXTRA_TOKENS_161>|",
1318
+ "lstrip": false,
1319
+ "normalized": false,
1320
+ "rstrip": false,
1321
+ "single_word": false,
1322
+ "special": true
1323
+ },
1324
+ "151808": {
1325
+ "content": "|<EXTRA_TOKENS_162>|",
1326
+ "lstrip": false,
1327
+ "normalized": false,
1328
+ "rstrip": false,
1329
+ "single_word": false,
1330
+ "special": true
1331
+ },
1332
+ "151809": {
1333
+ "content": "|<EXTRA_TOKENS_163>|",
1334
+ "lstrip": false,
1335
+ "normalized": false,
1336
+ "rstrip": false,
1337
+ "single_word": false,
1338
+ "special": true
1339
+ },
1340
+ "151810": {
1341
+ "content": "|<EXTRA_TOKENS_164>|",
1342
+ "lstrip": false,
1343
+ "normalized": false,
1344
+ "rstrip": false,
1345
+ "single_word": false,
1346
+ "special": true
1347
+ },
1348
+ "151811": {
1349
+ "content": "|<EXTRA_TOKENS_165>|",
1350
+ "lstrip": false,
1351
+ "normalized": false,
1352
+ "rstrip": false,
1353
+ "single_word": false,
1354
+ "special": true
1355
+ },
1356
+ "151812": {
1357
+ "content": "|<EXTRA_TOKENS_166>|",
1358
+ "lstrip": false,
1359
+ "normalized": false,
1360
+ "rstrip": false,
1361
+ "single_word": false,
1362
+ "special": true
1363
+ },
1364
+ "151813": {
1365
+ "content": "|<EXTRA_TOKENS_167>|",
1366
+ "lstrip": false,
1367
+ "normalized": false,
1368
+ "rstrip": false,
1369
+ "single_word": false,
1370
+ "special": true
1371
+ },
1372
+ "151814": {
1373
+ "content": "|<EXTRA_TOKENS_168>|",
1374
+ "lstrip": false,
1375
+ "normalized": false,
1376
+ "rstrip": false,
1377
+ "single_word": false,
1378
+ "special": true
1379
+ },
1380
+ "151815": {
1381
+ "content": "|<EXTRA_TOKENS_169>|",
1382
+ "lstrip": false,
1383
+ "normalized": false,
1384
+ "rstrip": false,
1385
+ "single_word": false,
1386
+ "special": true
1387
+ },
1388
+ "151816": {
1389
+ "content": "|<EXTRA_TOKENS_170>|",
1390
+ "lstrip": false,
1391
+ "normalized": false,
1392
+ "rstrip": false,
1393
+ "single_word": false,
1394
+ "special": true
1395
+ },
1396
+ "151817": {
1397
+ "content": "|<EXTRA_TOKENS_171>|",
1398
+ "lstrip": false,
1399
+ "normalized": false,
1400
+ "rstrip": false,
1401
+ "single_word": false,
1402
+ "special": true
1403
+ },
1404
+ "151818": {
1405
+ "content": "|<EXTRA_TOKENS_172>|",
1406
+ "lstrip": false,
1407
+ "normalized": false,
1408
+ "rstrip": false,
1409
+ "single_word": false,
1410
+ "special": true
1411
+ },
1412
+ "151819": {
1413
+ "content": "|<EXTRA_TOKENS_173>|",
1414
+ "lstrip": false,
1415
+ "normalized": false,
1416
+ "rstrip": false,
1417
+ "single_word": false,
1418
+ "special": true
1419
+ },
1420
+ "151820": {
1421
+ "content": "|<EXTRA_TOKENS_174>|",
1422
+ "lstrip": false,
1423
+ "normalized": false,
1424
+ "rstrip": false,
1425
+ "single_word": false,
1426
+ "special": true
1427
+ },
1428
+ "151821": {
1429
+ "content": "|<EXTRA_TOKENS_175>|",
1430
+ "lstrip": false,
1431
+ "normalized": false,
1432
+ "rstrip": false,
1433
+ "single_word": false,
1434
+ "special": true
1435
+ },
1436
+ "151822": {
1437
+ "content": "|<EXTRA_TOKENS_176>|",
1438
+ "lstrip": false,
1439
+ "normalized": false,
1440
+ "rstrip": false,
1441
+ "single_word": false,
1442
+ "special": true
1443
+ },
1444
+ "151823": {
1445
+ "content": "|<EXTRA_TOKENS_177>|",
1446
+ "lstrip": false,
1447
+ "normalized": false,
1448
+ "rstrip": false,
1449
+ "single_word": false,
1450
+ "special": true
1451
+ },
1452
+ "151824": {
1453
+ "content": "|<EXTRA_TOKENS_178>|",
1454
+ "lstrip": false,
1455
+ "normalized": false,
1456
+ "rstrip": false,
1457
+ "single_word": false,
1458
+ "special": true
1459
+ },
1460
+ "151825": {
1461
+ "content": "|<EXTRA_TOKENS_179>|",
1462
+ "lstrip": false,
1463
+ "normalized": false,
1464
+ "rstrip": false,
1465
+ "single_word": false,
1466
+ "special": true
1467
+ },
1468
+ "151826": {
1469
+ "content": "|<EXTRA_TOKENS_180>|",
1470
+ "lstrip": false,
1471
+ "normalized": false,
1472
+ "rstrip": false,
1473
+ "single_word": false,
1474
+ "special": true
1475
+ },
1476
+ "151827": {
1477
+ "content": "|<EXTRA_TOKENS_181>|",
1478
+ "lstrip": false,
1479
+ "normalized": false,
1480
+ "rstrip": false,
1481
+ "single_word": false,
1482
+ "special": true
1483
+ },
1484
+ "151828": {
1485
+ "content": "|<EXTRA_TOKENS_182>|",
1486
+ "lstrip": false,
1487
+ "normalized": false,
1488
+ "rstrip": false,
1489
+ "single_word": false,
1490
+ "special": true
1491
+ },
1492
+ "151829": {
1493
+ "content": "|<EXTRA_TOKENS_183>|",
1494
+ "lstrip": false,
1495
+ "normalized": false,
1496
+ "rstrip": false,
1497
+ "single_word": false,
1498
+ "special": true
1499
+ },
1500
+ "151830": {
1501
+ "content": "|<EXTRA_TOKENS_184>|",
1502
+ "lstrip": false,
1503
+ "normalized": false,
1504
+ "rstrip": false,
1505
+ "single_word": false,
1506
+ "special": true
1507
+ },
1508
+ "151831": {
1509
+ "content": "|<EXTRA_TOKENS_185>|",
1510
+ "lstrip": false,
1511
+ "normalized": false,
1512
+ "rstrip": false,
1513
+ "single_word": false,
1514
+ "special": true
1515
+ },
1516
+ "151832": {
1517
+ "content": "|<EXTRA_TOKENS_186>|",
1518
+ "lstrip": false,
1519
+ "normalized": false,
1520
+ "rstrip": false,
1521
+ "single_word": false,
1522
+ "special": true
1523
+ },
1524
+ "151833": {
1525
+ "content": "|<EXTRA_TOKENS_187>|",
1526
+ "lstrip": false,
1527
+ "normalized": false,
1528
+ "rstrip": false,
1529
+ "single_word": false,
1530
+ "special": true
1531
+ },
1532
+ "151834": {
1533
+ "content": "|<EXTRA_TOKENS_188>|",
1534
+ "lstrip": false,
1535
+ "normalized": false,
1536
+ "rstrip": false,
1537
+ "single_word": false,
1538
+ "special": true
1539
+ },
1540
+ "151835": {
1541
+ "content": "|<EXTRA_TOKENS_189>|",
1542
+ "lstrip": false,
1543
+ "normalized": false,
1544
+ "rstrip": false,
1545
+ "single_word": false,
1546
+ "special": true
1547
+ },
1548
+ "151836": {
1549
+ "content": "|<EXTRA_TOKENS_190>|",
1550
+ "lstrip": false,
1551
+ "normalized": false,
1552
+ "rstrip": false,
1553
+ "single_word": false,
1554
+ "special": true
1555
+ },
1556
+ "151837": {
1557
+ "content": "|<EXTRA_TOKENS_191>|",
1558
+ "lstrip": false,
1559
+ "normalized": false,
1560
+ "rstrip": false,
1561
+ "single_word": false,
1562
+ "special": true
1563
+ },
1564
+ "151838": {
1565
+ "content": "|<EXTRA_TOKENS_192>|",
1566
+ "lstrip": false,
1567
+ "normalized": false,
1568
+ "rstrip": false,
1569
+ "single_word": false,
1570
+ "special": true
1571
+ },
1572
+ "151839": {
1573
+ "content": "|<EXTRA_TOKENS_193>|",
1574
+ "lstrip": false,
1575
+ "normalized": false,
1576
+ "rstrip": false,
1577
+ "single_word": false,
1578
+ "special": true
1579
+ },
1580
+ "151840": {
1581
+ "content": "|<EXTRA_TOKENS_194>|",
1582
+ "lstrip": false,
1583
+ "normalized": false,
1584
+ "rstrip": false,
1585
+ "single_word": false,
1586
+ "special": true
1587
+ },
1588
+ "151841": {
1589
+ "content": "|<EXTRA_TOKENS_195>|",
1590
+ "lstrip": false,
1591
+ "normalized": false,
1592
+ "rstrip": false,
1593
+ "single_word": false,
1594
+ "special": true
1595
+ },
1596
+ "151842": {
1597
+ "content": "|<EXTRA_TOKENS_196>|",
1598
+ "lstrip": false,
1599
+ "normalized": false,
1600
+ "rstrip": false,
1601
+ "single_word": false,
1602
+ "special": true
1603
+ },
1604
+ "151843": {
1605
+ "content": "|<EXTRA_TOKENS_197>|",
1606
+ "lstrip": false,
1607
+ "normalized": false,
1608
+ "rstrip": false,
1609
+ "single_word": false,
1610
+ "special": true
1611
+ },
1612
+ "151844": {
1613
+ "content": "|<EXTRA_TOKENS_198>|",
1614
+ "lstrip": false,
1615
+ "normalized": false,
1616
+ "rstrip": false,
1617
+ "single_word": false,
1618
+ "special": true
1619
+ },
1620
+ "151845": {
1621
+ "content": "|<EXTRA_TOKENS_199>|",
1622
+ "lstrip": false,
1623
+ "normalized": false,
1624
+ "rstrip": false,
1625
+ "single_word": false,
1626
+ "special": true
1627
+ },
1628
+ "151846": {
1629
+ "content": "|<EXTRA_TOKENS_200>|",
1630
+ "lstrip": false,
1631
+ "normalized": false,
1632
+ "rstrip": false,
1633
+ "single_word": false,
1634
+ "special": true
1635
+ },
1636
+ "151847": {
1637
+ "content": "|<EXTRA_TOKENS_201>|",
1638
+ "lstrip": false,
1639
+ "normalized": false,
1640
+ "rstrip": false,
1641
+ "single_word": false,
1642
+ "special": true
1643
+ },
1644
+ "151848": {
1645
+ "content": "|<EXTRA_TOKENS_202>|",
1646
+ "lstrip": false,
1647
+ "normalized": false,
1648
+ "rstrip": false,
1649
+ "single_word": false,
1650
+ "special": true
1651
+ },
1652
+ "151849": {
1653
+ "content": "|<EXTRA_TOKENS_203>|",
1654
+ "lstrip": false,
1655
+ "normalized": false,
1656
+ "rstrip": false,
1657
+ "single_word": false,
1658
+ "special": true
1659
+ },
1660
+ "151850": {
1661
+ "content": "|<EXTRA_TOKENS_204>|",
1662
+ "lstrip": false,
1663
+ "normalized": false,
1664
+ "rstrip": false,
1665
+ "single_word": false,
1666
+ "special": true
1667
+ },
1668
+ "151851": {
1669
+ "content": "|<EXTRA_TOKENS_205>|",
1670
+ "lstrip": false,
1671
+ "normalized": false,
1672
+ "rstrip": false,
1673
+ "single_word": false,
1674
+ "special": true
1675
+ },
1676
+ "151852": {
1677
+ "content": "|<EXTRA_TOKENS_206>|",
1678
+ "lstrip": false,
1679
+ "normalized": false,
1680
+ "rstrip": false,
1681
+ "single_word": false,
1682
+ "special": true
1683
+ },
1684
+ "151853": {
1685
+ "content": "|<EXTRA_TOKENS_207>|",
1686
+ "lstrip": false,
1687
+ "normalized": false,
1688
+ "rstrip": false,
1689
+ "single_word": false,
1690
+ "special": true
1691
+ },
1692
+ "151854": {
1693
+ "content": "|<EXTRA_TOKENS_208>|",
1694
+ "lstrip": false,
1695
+ "normalized": false,
1696
+ "rstrip": false,
1697
+ "single_word": false,
1698
+ "special": true
1699
+ },
1700
+ "151855": {
1701
+ "content": "|<EXTRA_TOKENS_209>|",
1702
+ "lstrip": false,
1703
+ "normalized": false,
1704
+ "rstrip": false,
1705
+ "single_word": false,
1706
+ "special": true
1707
+ },
1708
+ "151856": {
1709
+ "content": "|<EXTRA_TOKENS_210>|",
1710
+ "lstrip": false,
1711
+ "normalized": false,
1712
+ "rstrip": false,
1713
+ "single_word": false,
1714
+ "special": true
1715
+ },
1716
+ "151857": {
1717
+ "content": "|<EXTRA_TOKENS_211>|",
1718
+ "lstrip": false,
1719
+ "normalized": false,
1720
+ "rstrip": false,
1721
+ "single_word": false,
1722
+ "special": true
1723
+ },
1724
+ "151858": {
1725
+ "content": "|<EXTRA_TOKENS_212>|",
1726
+ "lstrip": false,
1727
+ "normalized": false,
1728
+ "rstrip": false,
1729
+ "single_word": false,
1730
+ "special": true
1731
+ },
1732
+ "151859": {
1733
+ "content": "|<EXTRA_TOKENS_213>|",
1734
+ "lstrip": false,
1735
+ "normalized": false,
1736
+ "rstrip": false,
1737
+ "single_word": false,
1738
+ "special": true
1739
+ },
1740
+ "151860": {
1741
+ "content": "|<EXTRA_TOKENS_214>|",
1742
+ "lstrip": false,
1743
+ "normalized": false,
1744
+ "rstrip": false,
1745
+ "single_word": false,
1746
+ "special": true
1747
+ },
1748
+ "151861": {
1749
+ "content": "|<EXTRA_TOKENS_215>|",
1750
+ "lstrip": false,
1751
+ "normalized": false,
1752
+ "rstrip": false,
1753
+ "single_word": false,
1754
+ "special": true
1755
+ },
1756
+ "151862": {
1757
+ "content": "|<EXTRA_TOKENS_216>|",
1758
+ "lstrip": false,
1759
+ "normalized": false,
1760
+ "rstrip": false,
1761
+ "single_word": false,
1762
+ "special": true
1763
+ },
1764
+ "151863": {
1765
+ "content": "|<EXTRA_TOKENS_217>|",
1766
+ "lstrip": false,
1767
+ "normalized": false,
1768
+ "rstrip": false,
1769
+ "single_word": false,
1770
+ "special": true
1771
+ },
1772
+ "151864": {
1773
+ "content": "|<EXTRA_TOKENS_218>|",
1774
+ "lstrip": false,
1775
+ "normalized": false,
1776
+ "rstrip": false,
1777
+ "single_word": false,
1778
+ "special": true
1779
+ },
1780
+ "151865": {
1781
+ "content": "|<EXTRA_TOKENS_219>|",
1782
+ "lstrip": false,
1783
+ "normalized": false,
1784
+ "rstrip": false,
1785
+ "single_word": false,
1786
+ "special": true
1787
+ },
1788
+ "151866": {
1789
+ "content": "|<EXTRA_TOKENS_220>|",
1790
+ "lstrip": false,
1791
+ "normalized": false,
1792
+ "rstrip": false,
1793
+ "single_word": false,
1794
+ "special": true
1795
+ },
1796
+ "151867": {
1797
+ "content": "|<EXTRA_TOKENS_221>|",
1798
+ "lstrip": false,
1799
+ "normalized": false,
1800
+ "rstrip": false,
1801
+ "single_word": false,
1802
+ "special": true
1803
+ },
1804
+ "151868": {
1805
+ "content": "|<EXTRA_TOKENS_222>|",
1806
+ "lstrip": false,
1807
+ "normalized": false,
1808
+ "rstrip": false,
1809
+ "single_word": false,
1810
+ "special": true
1811
+ },
1812
+ "151869": {
1813
+ "content": "|<EXTRA_TOKENS_223>|",
1814
+ "lstrip": false,
1815
+ "normalized": false,
1816
+ "rstrip": false,
1817
+ "single_word": false,
1818
+ "special": true
1819
+ },
1820
+ "151870": {
1821
+ "content": "|<EXTRA_TOKENS_224>|",
1822
+ "lstrip": false,
1823
+ "normalized": false,
1824
+ "rstrip": false,
1825
+ "single_word": false,
1826
+ "special": true
1827
+ },
1828
+ "151871": {
1829
+ "content": "|<EXTRA_TOKENS_225>|",
1830
+ "lstrip": false,
1831
+ "normalized": false,
1832
+ "rstrip": false,
1833
+ "single_word": false,
1834
+ "special": true
1835
+ },
1836
+ "151872": {
1837
+ "content": "|<EXTRA_TOKENS_226>|",
1838
+ "lstrip": false,
1839
+ "normalized": false,
1840
+ "rstrip": false,
1841
+ "single_word": false,
1842
+ "special": true
1843
+ },
1844
+ "151873": {
1845
+ "content": "|<EXTRA_TOKENS_227>|",
1846
+ "lstrip": false,
1847
+ "normalized": false,
1848
+ "rstrip": false,
1849
+ "single_word": false,
1850
+ "special": true
1851
+ },
1852
+ "151874": {
1853
+ "content": "|<EXTRA_TOKENS_228>|",
1854
+ "lstrip": false,
1855
+ "normalized": false,
1856
+ "rstrip": false,
1857
+ "single_word": false,
1858
+ "special": true
1859
+ },
1860
+ "151875": {
1861
+ "content": "|<EXTRA_TOKENS_229>|",
1862
+ "lstrip": false,
1863
+ "normalized": false,
1864
+ "rstrip": false,
1865
+ "single_word": false,
1866
+ "special": true
1867
+ },
1868
+ "151876": {
1869
+ "content": "|<EXTRA_TOKENS_230>|",
1870
+ "lstrip": false,
1871
+ "normalized": false,
1872
+ "rstrip": false,
1873
+ "single_word": false,
1874
+ "special": true
1875
+ },
1876
+ "151877": {
1877
+ "content": "|<EXTRA_TOKENS_231>|",
1878
+ "lstrip": false,
1879
+ "normalized": false,
1880
+ "rstrip": false,
1881
+ "single_word": false,
1882
+ "special": true
1883
+ },
1884
+ "151878": {
1885
+ "content": "|<EXTRA_TOKENS_232>|",
1886
+ "lstrip": false,
1887
+ "normalized": false,
1888
+ "rstrip": false,
1889
+ "single_word": false,
1890
+ "special": true
1891
+ },
1892
+ "151879": {
1893
+ "content": "|<EXTRA_TOKENS_233>|",
1894
+ "lstrip": false,
1895
+ "normalized": false,
1896
+ "rstrip": false,
1897
+ "single_word": false,
1898
+ "special": true
1899
+ },
1900
+ "151880": {
1901
+ "content": "|<EXTRA_TOKENS_234>|",
1902
+ "lstrip": false,
1903
+ "normalized": false,
1904
+ "rstrip": false,
1905
+ "single_word": false,
1906
+ "special": true
1907
+ },
1908
+ "151881": {
1909
+ "content": "|<EXTRA_TOKENS_235>|",
1910
+ "lstrip": false,
1911
+ "normalized": false,
1912
+ "rstrip": false,
1913
+ "single_word": false,
1914
+ "special": true
1915
+ },
1916
+ "151882": {
1917
+ "content": "|<EXTRA_TOKENS_236>|",
1918
+ "lstrip": false,
1919
+ "normalized": false,
1920
+ "rstrip": false,
1921
+ "single_word": false,
1922
+ "special": true
1923
+ },
1924
+ "151883": {
1925
+ "content": "|<EXTRA_TOKENS_237>|",
1926
+ "lstrip": false,
1927
+ "normalized": false,
1928
+ "rstrip": false,
1929
+ "single_word": false,
1930
+ "special": true
1931
+ },
1932
+ "151884": {
1933
+ "content": "|<EXTRA_TOKENS_238>|",
1934
+ "lstrip": false,
1935
+ "normalized": false,
1936
+ "rstrip": false,
1937
+ "single_word": false,
1938
+ "special": true
1939
+ },
1940
+ "151885": {
1941
+ "content": "|<EXTRA_TOKENS_239>|",
1942
+ "lstrip": false,
1943
+ "normalized": false,
1944
+ "rstrip": false,
1945
+ "single_word": false,
1946
+ "special": true
1947
+ },
1948
+ "151886": {
1949
+ "content": "|<EXTRA_TOKENS_240>|",
1950
+ "lstrip": false,
1951
+ "normalized": false,
1952
+ "rstrip": false,
1953
+ "single_word": false,
1954
+ "special": true
1955
+ },
1956
+ "151887": {
1957
+ "content": "|<EXTRA_TOKENS_241>|",
1958
+ "lstrip": false,
1959
+ "normalized": false,
1960
+ "rstrip": false,
1961
+ "single_word": false,
1962
+ "special": true
1963
+ },
1964
+ "151888": {
1965
+ "content": "|<EXTRA_TOKENS_242>|",
1966
+ "lstrip": false,
1967
+ "normalized": false,
1968
+ "rstrip": false,
1969
+ "single_word": false,
1970
+ "special": true
1971
+ },
1972
+ "151889": {
1973
+ "content": "|<EXTRA_TOKENS_243>|",
1974
+ "lstrip": false,
1975
+ "normalized": false,
1976
+ "rstrip": false,
1977
+ "single_word": false,
1978
+ "special": true
1979
+ },
1980
+ "151890": {
1981
+ "content": "|<EXTRA_TOKENS_244>|",
1982
+ "lstrip": false,
1983
+ "normalized": false,
1984
+ "rstrip": false,
1985
+ "single_word": false,
1986
+ "special": true
1987
+ },
1988
+ "151891": {
1989
+ "content": "|<EXTRA_TOKENS_245>|",
1990
+ "lstrip": false,
1991
+ "normalized": false,
1992
+ "rstrip": false,
1993
+ "single_word": false,
1994
+ "special": true
1995
+ },
1996
+ "151892": {
1997
+ "content": "|<EXTRA_TOKENS_246>|",
1998
+ "lstrip": false,
1999
+ "normalized": false,
2000
+ "rstrip": false,
2001
+ "single_word": false,
2002
+ "special": true
2003
+ },
2004
+ "151893": {
2005
+ "content": "|<EXTRA_TOKENS_247>|",
2006
+ "lstrip": false,
2007
+ "normalized": false,
2008
+ "rstrip": false,
2009
+ "single_word": false,
2010
+ "special": true
2011
+ },
2012
+ "151894": {
2013
+ "content": "|<EXTRA_TOKENS_248>|",
2014
+ "lstrip": false,
2015
+ "normalized": false,
2016
+ "rstrip": false,
2017
+ "single_word": false,
2018
+ "special": true
2019
+ },
2020
+ "151895": {
2021
+ "content": "|<EXTRA_TOKENS_249>|",
2022
+ "lstrip": false,
2023
+ "normalized": false,
2024
+ "rstrip": false,
2025
+ "single_word": false,
2026
+ "special": true
2027
+ },
2028
+ "151896": {
2029
+ "content": "|<EXTRA_TOKENS_250>|",
2030
+ "lstrip": false,
2031
+ "normalized": false,
2032
+ "rstrip": false,
2033
+ "single_word": false,
2034
+ "special": true
2035
+ },
2036
+ "151897": {
2037
+ "content": "|<EXTRA_TOKENS_251>|",
2038
+ "lstrip": false,
2039
+ "normalized": false,
2040
+ "rstrip": false,
2041
+ "single_word": false,
2042
+ "special": true
2043
+ },
2044
+ "151898": {
2045
+ "content": "|<EXTRA_TOKENS_252>|",
2046
+ "lstrip": false,
2047
+ "normalized": false,
2048
+ "rstrip": false,
2049
+ "single_word": false,
2050
+ "special": true
2051
+ },
2052
+ "151899": {
2053
+ "content": "|<EXTRA_TOKENS_253>|",
2054
+ "lstrip": false,
2055
+ "normalized": false,
2056
+ "rstrip": false,
2057
+ "single_word": false,
2058
+ "special": true
2059
+ },
2060
+ "151900": {
2061
+ "content": "|<EXTRA_TOKENS_254>|",
2062
+ "lstrip": false,
2063
+ "normalized": false,
2064
+ "rstrip": false,
2065
+ "single_word": false,
2066
+ "special": true
2067
+ },
2068
+ "151901": {
2069
+ "content": "|<EXTRA_TOKENS_255>|",
2070
+ "lstrip": false,
2071
+ "normalized": false,
2072
+ "rstrip": false,
2073
+ "single_word": false,
2074
+ "special": true
2075
+ },
2076
+ "151902": {
2077
+ "content": "|<EXTRA_TOKENS_256>|",
2078
+ "lstrip": false,
2079
+ "normalized": false,
2080
+ "rstrip": false,
2081
+ "single_word": false,
2082
+ "special": true
2083
+ },
2084
+ "151903": {
2085
+ "content": "|<EXTRA_TOKENS_257>|",
2086
+ "lstrip": false,
2087
+ "normalized": false,
2088
+ "rstrip": false,
2089
+ "single_word": false,
2090
+ "special": true
2091
+ },
2092
+ "151904": {
2093
+ "content": "|<EXTRA_TOKENS_258>|",
2094
+ "lstrip": false,
2095
+ "normalized": false,
2096
+ "rstrip": false,
2097
+ "single_word": false,
2098
+ "special": true
2099
+ },
2100
+ "151905": {
2101
+ "content": "|<EXTRA_TOKENS_259>|",
2102
+ "lstrip": false,
2103
+ "normalized": false,
2104
+ "rstrip": false,
2105
+ "single_word": false,
2106
+ "special": true
2107
+ },
2108
+ "151906": {
2109
+ "content": "|<EXTRA_TOKENS_260>|",
2110
+ "lstrip": false,
2111
+ "normalized": false,
2112
+ "rstrip": false,
2113
+ "single_word": false,
2114
+ "special": true
2115
+ },
2116
+ "151907": {
2117
+ "content": "|<EXTRA_TOKENS_261>|",
2118
+ "lstrip": false,
2119
+ "normalized": false,
2120
+ "rstrip": false,
2121
+ "single_word": false,
2122
+ "special": true
2123
+ },
2124
+ "151908": {
2125
+ "content": "|<EXTRA_TOKENS_262>|",
2126
+ "lstrip": false,
2127
+ "normalized": false,
2128
+ "rstrip": false,
2129
+ "single_word": false,
2130
+ "special": true
2131
+ },
2132
+ "151909": {
2133
+ "content": "|<EXTRA_TOKENS_263>|",
2134
+ "lstrip": false,
2135
+ "normalized": false,
2136
+ "rstrip": false,
2137
+ "single_word": false,
2138
+ "special": true
2139
+ },
2140
+ "151910": {
2141
+ "content": "|<EXTRA_TOKENS_264>|",
2142
+ "lstrip": false,
2143
+ "normalized": false,
2144
+ "rstrip": false,
2145
+ "single_word": false,
2146
+ "special": true
2147
+ },
2148
+ "151911": {
2149
+ "content": "|<EXTRA_TOKENS_265>|",
2150
+ "lstrip": false,
2151
+ "normalized": false,
2152
+ "rstrip": false,
2153
+ "single_word": false,
2154
+ "special": true
2155
+ },
2156
+ "151912": {
2157
+ "content": "|<EXTRA_TOKENS_266>|",
2158
+ "lstrip": false,
2159
+ "normalized": false,
2160
+ "rstrip": false,
2161
+ "single_word": false,
2162
+ "special": true
2163
+ },
2164
+ "151913": {
2165
+ "content": "|<EXTRA_TOKENS_267>|",
2166
+ "lstrip": false,
2167
+ "normalized": false,
2168
+ "rstrip": false,
2169
+ "single_word": false,
2170
+ "special": true
2171
+ },
2172
+ "151914": {
2173
+ "content": "|<EXTRA_TOKENS_268>|",
2174
+ "lstrip": false,
2175
+ "normalized": false,
2176
+ "rstrip": false,
2177
+ "single_word": false,
2178
+ "special": true
2179
+ },
2180
+ "151915": {
2181
+ "content": "|<EXTRA_TOKENS_269>|",
2182
+ "lstrip": false,
2183
+ "normalized": false,
2184
+ "rstrip": false,
2185
+ "single_word": false,
2186
+ "special": true
2187
+ },
2188
+ "151916": {
2189
+ "content": "|<EXTRA_TOKENS_270>|",
2190
+ "lstrip": false,
2191
+ "normalized": false,
2192
+ "rstrip": false,
2193
+ "single_word": false,
2194
+ "special": true
2195
+ },
2196
+ "151917": {
2197
+ "content": "|<EXTRA_TOKENS_271>|",
2198
+ "lstrip": false,
2199
+ "normalized": false,
2200
+ "rstrip": false,
2201
+ "single_word": false,
2202
+ "special": true
2203
+ },
2204
+ "151918": {
2205
+ "content": "|<EXTRA_TOKENS_272>|",
2206
+ "lstrip": false,
2207
+ "normalized": false,
2208
+ "rstrip": false,
2209
+ "single_word": false,
2210
+ "special": true
2211
+ },
2212
+ "151919": {
2213
+ "content": "|<EXTRA_TOKENS_273>|",
2214
+ "lstrip": false,
2215
+ "normalized": false,
2216
+ "rstrip": false,
2217
+ "single_word": false,
2218
+ "special": true
2219
+ },
2220
+ "151920": {
2221
+ "content": "|<EXTRA_TOKENS_274>|",
2222
+ "lstrip": false,
2223
+ "normalized": false,
2224
+ "rstrip": false,
2225
+ "single_word": false,
2226
+ "special": true
2227
+ },
2228
+ "151921": {
2229
+ "content": "|<EXTRA_TOKENS_275>|",
2230
+ "lstrip": false,
2231
+ "normalized": false,
2232
+ "rstrip": false,
2233
+ "single_word": false,
2234
+ "special": true
2235
+ },
2236
+ "151922": {
2237
+ "content": "|<EXTRA_TOKENS_276>|",
2238
+ "lstrip": false,
2239
+ "normalized": false,
2240
+ "rstrip": false,
2241
+ "single_word": false,
2242
+ "special": true
2243
+ },
2244
+ "151923": {
2245
+ "content": "|<EXTRA_TOKENS_277>|",
2246
+ "lstrip": false,
2247
+ "normalized": false,
2248
+ "rstrip": false,
2249
+ "single_word": false,
2250
+ "special": true
2251
+ },
2252
+ "151924": {
2253
+ "content": "|<EXTRA_TOKENS_278>|",
2254
+ "lstrip": false,
2255
+ "normalized": false,
2256
+ "rstrip": false,
2257
+ "single_word": false,
2258
+ "special": true
2259
+ },
2260
+ "151925": {
2261
+ "content": "|<EXTRA_TOKENS_279>|",
2262
+ "lstrip": false,
2263
+ "normalized": false,
2264
+ "rstrip": false,
2265
+ "single_word": false,
2266
+ "special": true
2267
+ },
2268
+ "151926": {
2269
+ "content": "|<EXTRA_TOKENS_280>|",
2270
+ "lstrip": false,
2271
+ "normalized": false,
2272
+ "rstrip": false,
2273
+ "single_word": false,
2274
+ "special": true
2275
+ },
2276
+ "151927": {
2277
+ "content": "|<EXTRA_TOKENS_281>|",
2278
+ "lstrip": false,
2279
+ "normalized": false,
2280
+ "rstrip": false,
2281
+ "single_word": false,
2282
+ "special": true
2283
+ },
2284
+ "151928": {
2285
+ "content": "|<EXTRA_TOKENS_282>|",
2286
+ "lstrip": false,
2287
+ "normalized": false,
2288
+ "rstrip": false,
2289
+ "single_word": false,
2290
+ "special": true
2291
+ },
2292
+ "151929": {
2293
+ "content": "|<EXTRA_TOKENS_283>|",
2294
+ "lstrip": false,
2295
+ "normalized": false,
2296
+ "rstrip": false,
2297
+ "single_word": false,
2298
+ "special": true
2299
+ },
2300
+ "151930": {
2301
+ "content": "|<EXTRA_TOKENS_284>|",
2302
+ "lstrip": false,
2303
+ "normalized": false,
2304
+ "rstrip": false,
2305
+ "single_word": false,
2306
+ "special": true
2307
+ },
2308
+ "151931": {
2309
+ "content": "|<EXTRA_TOKENS_285>|",
2310
+ "lstrip": false,
2311
+ "normalized": false,
2312
+ "rstrip": false,
2313
+ "single_word": false,
2314
+ "special": true
2315
+ },
2316
+ "151932": {
2317
+ "content": "|<EXTRA_TOKENS_286>|",
2318
+ "lstrip": false,
2319
+ "normalized": false,
2320
+ "rstrip": false,
2321
+ "single_word": false,
2322
+ "special": true
2323
+ },
2324
+ "151933": {
2325
+ "content": "|<EXTRA_TOKENS_287>|",
2326
+ "lstrip": false,
2327
+ "normalized": false,
2328
+ "rstrip": false,
2329
+ "single_word": false,
2330
+ "special": true
2331
+ },
2332
+ "151934": {
2333
+ "content": "|<EXTRA_TOKENS_288>|",
2334
+ "lstrip": false,
2335
+ "normalized": false,
2336
+ "rstrip": false,
2337
+ "single_word": false,
2338
+ "special": true
2339
+ },
2340
+ "151935": {
2341
+ "content": "|<EXTRA_TOKENS_289>|",
2342
+ "lstrip": false,
2343
+ "normalized": false,
2344
+ "rstrip": false,
2345
+ "single_word": false,
2346
+ "special": true
2347
+ },
2348
+ "151936": {
2349
+ "content": "|<EXTRA_TOKENS_290>|",
2350
+ "lstrip": false,
2351
+ "normalized": false,
2352
+ "rstrip": false,
2353
+ "single_word": false,
2354
+ "special": true
2355
+ },
2356
+ "151937": {
2357
+ "content": "|<EXTRA_TOKENS_291>|",
2358
+ "lstrip": false,
2359
+ "normalized": false,
2360
+ "rstrip": false,
2361
+ "single_word": false,
2362
+ "special": true
2363
+ },
2364
+ "151938": {
2365
+ "content": "|<EXTRA_TOKENS_292>|",
2366
+ "lstrip": false,
2367
+ "normalized": false,
2368
+ "rstrip": false,
2369
+ "single_word": false,
2370
+ "special": true
2371
+ },
2372
+ "151939": {
2373
+ "content": "|<EXTRA_TOKENS_293>|",
2374
+ "lstrip": false,
2375
+ "normalized": false,
2376
+ "rstrip": false,
2377
+ "single_word": false,
2378
+ "special": true
2379
+ },
2380
+ "151940": {
2381
+ "content": "|<EXTRA_TOKENS_294>|",
2382
+ "lstrip": false,
2383
+ "normalized": false,
2384
+ "rstrip": false,
2385
+ "single_word": false,
2386
+ "special": true
2387
+ },
2388
+ "151941": {
2389
+ "content": "|<EXTRA_TOKENS_295>|",
2390
+ "lstrip": false,
2391
+ "normalized": false,
2392
+ "rstrip": false,
2393
+ "single_word": false,
2394
+ "special": true
2395
+ },
2396
+ "151942": {
2397
+ "content": "|<EXTRA_TOKENS_296>|",
2398
+ "lstrip": false,
2399
+ "normalized": false,
2400
+ "rstrip": false,
2401
+ "single_word": false,
2402
+ "special": true
2403
+ },
2404
+ "151943": {
2405
+ "content": "|<EXTRA_TOKENS_297>|",
2406
+ "lstrip": false,
2407
+ "normalized": false,
2408
+ "rstrip": false,
2409
+ "single_word": false,
2410
+ "special": true
2411
+ },
2412
+ "151944": {
2413
+ "content": "|<EXTRA_TOKENS_298>|",
2414
+ "lstrip": false,
2415
+ "normalized": false,
2416
+ "rstrip": false,
2417
+ "single_word": false,
2418
+ "special": true
2419
+ },
2420
+ "151945": {
2421
+ "content": "|<EXTRA_TOKENS_299>|",
2422
+ "lstrip": false,
2423
+ "normalized": false,
2424
+ "rstrip": false,
2425
+ "single_word": false,
2426
+ "special": true
2427
+ },
2428
+ "151946": {
2429
+ "content": "|<EXTRA_TOKENS_300>|",
2430
+ "lstrip": false,
2431
+ "normalized": false,
2432
+ "rstrip": false,
2433
+ "single_word": false,
2434
+ "special": true
2435
+ },
2436
+ "151947": {
2437
+ "content": "|<EXTRA_TOKENS_301>|",
2438
+ "lstrip": false,
2439
+ "normalized": false,
2440
+ "rstrip": false,
2441
+ "single_word": false,
2442
+ "special": true
2443
+ },
2444
+ "151948": {
2445
+ "content": "|<EXTRA_TOKENS_302>|",
2446
+ "lstrip": false,
2447
+ "normalized": false,
2448
+ "rstrip": false,
2449
+ "single_word": false,
2450
+ "special": true
2451
+ },
2452
+ "151949": {
2453
+ "content": "|<EXTRA_TOKENS_303>|",
2454
+ "lstrip": false,
2455
+ "normalized": false,
2456
+ "rstrip": false,
2457
+ "single_word": false,
2458
+ "special": true
2459
+ },
2460
+ "151950": {
2461
+ "content": "|<EXTRA_TOKENS_304>|",
2462
+ "lstrip": false,
2463
+ "normalized": false,
2464
+ "rstrip": false,
2465
+ "single_word": false,
2466
+ "special": true
2467
+ },
2468
+ "151951": {
2469
+ "content": "|<EXTRA_TOKENS_305>|",
2470
+ "lstrip": false,
2471
+ "normalized": false,
2472
+ "rstrip": false,
2473
+ "single_word": false,
2474
+ "special": true
2475
+ },
2476
+ "151952": {
2477
+ "content": "|<EXTRA_TOKENS_306>|",
2478
+ "lstrip": false,
2479
+ "normalized": false,
2480
+ "rstrip": false,
2481
+ "single_word": false,
2482
+ "special": true
2483
+ },
2484
+ "151953": {
2485
+ "content": "|<EXTRA_TOKENS_307>|",
2486
+ "lstrip": false,
2487
+ "normalized": false,
2488
+ "rstrip": false,
2489
+ "single_word": false,
2490
+ "special": true
2491
+ },
2492
+ "151954": {
2493
+ "content": "|<EXTRA_TOKENS_308>|",
2494
+ "lstrip": false,
2495
+ "normalized": false,
2496
+ "rstrip": false,
2497
+ "single_word": false,
2498
+ "special": true
2499
+ },
2500
+ "151955": {
2501
+ "content": "|<EXTRA_TOKENS_309>|",
2502
+ "lstrip": false,
2503
+ "normalized": false,
2504
+ "rstrip": false,
2505
+ "single_word": false,
2506
+ "special": true
2507
+ },
2508
+ "151956": {
2509
+ "content": "|<EXTRA_TOKENS_310>|",
2510
+ "lstrip": false,
2511
+ "normalized": false,
2512
+ "rstrip": false,
2513
+ "single_word": false,
2514
+ "special": true
2515
+ },
2516
+ "151957": {
2517
+ "content": "|<EXTRA_TOKENS_311>|",
2518
+ "lstrip": false,
2519
+ "normalized": false,
2520
+ "rstrip": false,
2521
+ "single_word": false,
2522
+ "special": true
2523
+ },
2524
+ "151958": {
2525
+ "content": "|<EXTRA_TOKENS_312>|",
2526
+ "lstrip": false,
2527
+ "normalized": false,
2528
+ "rstrip": false,
2529
+ "single_word": false,
2530
+ "special": true
2531
+ },
2532
+ "151959": {
2533
+ "content": "|<EXTRA_TOKENS_313>|",
2534
+ "lstrip": false,
2535
+ "normalized": false,
2536
+ "rstrip": false,
2537
+ "single_word": false,
2538
+ "special": true
2539
+ },
2540
+ "151960": {
2541
+ "content": "|<EXTRA_TOKENS_314>|",
2542
+ "lstrip": false,
2543
+ "normalized": false,
2544
+ "rstrip": false,
2545
+ "single_word": false,
2546
+ "special": true
2547
+ },
2548
+ "151961": {
2549
+ "content": "|<EXTRA_TOKENS_315>|",
2550
+ "lstrip": false,
2551
+ "normalized": false,
2552
+ "rstrip": false,
2553
+ "single_word": false,
2554
+ "special": true
2555
+ },
2556
+ "151962": {
2557
+ "content": "|<EXTRA_TOKENS_316>|",
2558
+ "lstrip": false,
2559
+ "normalized": false,
2560
+ "rstrip": false,
2561
+ "single_word": false,
2562
+ "special": true
2563
+ },
2564
+ "151963": {
2565
+ "content": "|<EXTRA_TOKENS_317>|",
2566
+ "lstrip": false,
2567
+ "normalized": false,
2568
+ "rstrip": false,
2569
+ "single_word": false,
2570
+ "special": true
2571
+ },
2572
+ "151964": {
2573
+ "content": "|<EXTRA_TOKENS_318>|",
2574
+ "lstrip": false,
2575
+ "normalized": false,
2576
+ "rstrip": false,
2577
+ "single_word": false,
2578
+ "special": true
2579
+ },
2580
+ "151965": {
2581
+ "content": "|<EXTRA_TOKENS_319>|",
2582
+ "lstrip": false,
2583
+ "normalized": false,
2584
+ "rstrip": false,
2585
+ "single_word": false,
2586
+ "special": true
2587
+ },
2588
+ "151966": {
2589
+ "content": "|<EXTRA_TOKENS_320>|",
2590
+ "lstrip": false,
2591
+ "normalized": false,
2592
+ "rstrip": false,
2593
+ "single_word": false,
2594
+ "special": true
2595
+ },
2596
+ "151967": {
2597
+ "content": "|<EXTRA_TOKENS_321>|",
2598
+ "lstrip": false,
2599
+ "normalized": false,
2600
+ "rstrip": false,
2601
+ "single_word": false,
2602
+ "special": true
2603
+ },
2604
+ "151968": {
2605
+ "content": "|<EXTRA_TOKENS_322>|",
2606
+ "lstrip": false,
2607
+ "normalized": false,
2608
+ "rstrip": false,
2609
+ "single_word": false,
2610
+ "special": true
2611
+ },
2612
+ "151969": {
2613
+ "content": "|<EXTRA_TOKENS_323>|",
2614
+ "lstrip": false,
2615
+ "normalized": false,
2616
+ "rstrip": false,
2617
+ "single_word": false,
2618
+ "special": true
2619
+ },
2620
+ "151970": {
2621
+ "content": "|<EXTRA_TOKENS_324>|",
2622
+ "lstrip": false,
2623
+ "normalized": false,
2624
+ "rstrip": false,
2625
+ "single_word": false,
2626
+ "special": true
2627
+ },
2628
+ "151971": {
2629
+ "content": "|<EXTRA_TOKENS_325>|",
2630
+ "lstrip": false,
2631
+ "normalized": false,
2632
+ "rstrip": false,
2633
+ "single_word": false,
2634
+ "special": true
2635
+ },
2636
+ "151972": {
2637
+ "content": "|<EXTRA_TOKENS_326>|",
2638
+ "lstrip": false,
2639
+ "normalized": false,
2640
+ "rstrip": false,
2641
+ "single_word": false,
2642
+ "special": true
2643
+ },
2644
+ "151973": {
2645
+ "content": "|<EXTRA_TOKENS_327>|",
2646
+ "lstrip": false,
2647
+ "normalized": false,
2648
+ "rstrip": false,
2649
+ "single_word": false,
2650
+ "special": true
2651
+ },
2652
+ "151974": {
2653
+ "content": "|<EXTRA_TOKENS_328>|",
2654
+ "lstrip": false,
2655
+ "normalized": false,
2656
+ "rstrip": false,
2657
+ "single_word": false,
2658
+ "special": true
2659
+ },
2660
+ "151975": {
2661
+ "content": "|<EXTRA_TOKENS_329>|",
2662
+ "lstrip": false,
2663
+ "normalized": false,
2664
+ "rstrip": false,
2665
+ "single_word": false,
2666
+ "special": true
2667
+ },
2668
+ "151976": {
2669
+ "content": "|<EXTRA_TOKENS_330>|",
2670
+ "lstrip": false,
2671
+ "normalized": false,
2672
+ "rstrip": false,
2673
+ "single_word": false,
2674
+ "special": true
2675
+ },
2676
+ "151977": {
2677
+ "content": "|<EXTRA_TOKENS_331>|",
2678
+ "lstrip": false,
2679
+ "normalized": false,
2680
+ "rstrip": false,
2681
+ "single_word": false,
2682
+ "special": true
2683
+ },
2684
+ "151978": {
2685
+ "content": "|<EXTRA_TOKENS_332>|",
2686
+ "lstrip": false,
2687
+ "normalized": false,
2688
+ "rstrip": false,
2689
+ "single_word": false,
2690
+ "special": true
2691
+ },
2692
+ "151979": {
2693
+ "content": "|<EXTRA_TOKENS_333>|",
2694
+ "lstrip": false,
2695
+ "normalized": false,
2696
+ "rstrip": false,
2697
+ "single_word": false,
2698
+ "special": true
2699
+ },
2700
+ "151980": {
2701
+ "content": "|<EXTRA_TOKENS_334>|",
2702
+ "lstrip": false,
2703
+ "normalized": false,
2704
+ "rstrip": false,
2705
+ "single_word": false,
2706
+ "special": true
2707
+ },
2708
+ "151981": {
2709
+ "content": "|<EXTRA_TOKENS_335>|",
2710
+ "lstrip": false,
2711
+ "normalized": false,
2712
+ "rstrip": false,
2713
+ "single_word": false,
2714
+ "special": true
2715
+ },
2716
+ "151982": {
2717
+ "content": "|<EXTRA_TOKENS_336>|",
2718
+ "lstrip": false,
2719
+ "normalized": false,
2720
+ "rstrip": false,
2721
+ "single_word": false,
2722
+ "special": true
2723
+ },
2724
+ "151983": {
2725
+ "content": "|<EXTRA_TOKENS_337>|",
2726
+ "lstrip": false,
2727
+ "normalized": false,
2728
+ "rstrip": false,
2729
+ "single_word": false,
2730
+ "special": true
2731
+ },
2732
+ "151984": {
2733
+ "content": "|<EXTRA_TOKENS_338>|",
2734
+ "lstrip": false,
2735
+ "normalized": false,
2736
+ "rstrip": false,
2737
+ "single_word": false,
2738
+ "special": true
2739
+ },
2740
+ "151985": {
2741
+ "content": "|<EXTRA_TOKENS_339>|",
2742
+ "lstrip": false,
2743
+ "normalized": false,
2744
+ "rstrip": false,
2745
+ "single_word": false,
2746
+ "special": true
2747
+ },
2748
+ "151986": {
2749
+ "content": "|<EXTRA_TOKENS_340>|",
2750
+ "lstrip": false,
2751
+ "normalized": false,
2752
+ "rstrip": false,
2753
+ "single_word": false,
2754
+ "special": true
2755
+ },
2756
+ "151987": {
2757
+ "content": "|<EXTRA_TOKENS_341>|",
2758
+ "lstrip": false,
2759
+ "normalized": false,
2760
+ "rstrip": false,
2761
+ "single_word": false,
2762
+ "special": true
2763
+ },
2764
+ "151988": {
2765
+ "content": "|<EXTRA_TOKENS_342>|",
2766
+ "lstrip": false,
2767
+ "normalized": false,
2768
+ "rstrip": false,
2769
+ "single_word": false,
2770
+ "special": true
2771
+ },
2772
+ "151989": {
2773
+ "content": "|<EXTRA_TOKENS_343>|",
2774
+ "lstrip": false,
2775
+ "normalized": false,
2776
+ "rstrip": false,
2777
+ "single_word": false,
2778
+ "special": true
2779
+ },
2780
+ "151990": {
2781
+ "content": "|<EXTRA_TOKENS_344>|",
2782
+ "lstrip": false,
2783
+ "normalized": false,
2784
+ "rstrip": false,
2785
+ "single_word": false,
2786
+ "special": true
2787
+ },
2788
+ "151991": {
2789
+ "content": "|<EXTRA_TOKENS_345>|",
2790
+ "lstrip": false,
2791
+ "normalized": false,
2792
+ "rstrip": false,
2793
+ "single_word": false,
2794
+ "special": true
2795
+ },
2796
+ "151992": {
2797
+ "content": "|<EXTRA_TOKENS_346>|",
2798
+ "lstrip": false,
2799
+ "normalized": false,
2800
+ "rstrip": false,
2801
+ "single_word": false,
2802
+ "special": true
2803
+ },
2804
+ "151993": {
2805
+ "content": "|<EXTRA_TOKENS_347>|",
2806
+ "lstrip": false,
2807
+ "normalized": false,
2808
+ "rstrip": false,
2809
+ "single_word": false,
2810
+ "special": true
2811
+ },
2812
+ "151994": {
2813
+ "content": "|<EXTRA_TOKENS_348>|",
2814
+ "lstrip": false,
2815
+ "normalized": false,
2816
+ "rstrip": false,
2817
+ "single_word": false,
2818
+ "special": true
2819
+ },
2820
+ "151995": {
2821
+ "content": "|<EXTRA_TOKENS_349>|",
2822
+ "lstrip": false,
2823
+ "normalized": false,
2824
+ "rstrip": false,
2825
+ "single_word": false,
2826
+ "special": true
2827
+ },
2828
+ "151996": {
2829
+ "content": "|<EXTRA_TOKENS_350>|",
2830
+ "lstrip": false,
2831
+ "normalized": false,
2832
+ "rstrip": false,
2833
+ "single_word": false,
2834
+ "special": true
2835
+ },
2836
+ "151997": {
2837
+ "content": "|<EXTRA_TOKENS_351>|",
2838
+ "lstrip": false,
2839
+ "normalized": false,
2840
+ "rstrip": false,
2841
+ "single_word": false,
2842
+ "special": true
2843
+ },
2844
+ "151998": {
2845
+ "content": "|<EXTRA_TOKENS_352>|",
2846
+ "lstrip": false,
2847
+ "normalized": false,
2848
+ "rstrip": false,
2849
+ "single_word": false,
2850
+ "special": true
2851
+ },
2852
+ "151999": {
2853
+ "content": "|<EXTRA_TOKENS_353>|",
2854
+ "lstrip": false,
2855
+ "normalized": false,
2856
+ "rstrip": false,
2857
+ "single_word": false,
2858
+ "special": true
2859
+ },
2860
+ "152000": {
2861
+ "content": "|<EXTRA_TOKENS_354>|",
2862
+ "lstrip": false,
2863
+ "normalized": false,
2864
+ "rstrip": false,
2865
+ "single_word": false,
2866
+ "special": true
2867
+ },
2868
+ "152001": {
2869
+ "content": "|<EXTRA_TOKENS_355>|",
2870
+ "lstrip": false,
2871
+ "normalized": false,
2872
+ "rstrip": false,
2873
+ "single_word": false,
2874
+ "special": true
2875
+ },
2876
+ "152002": {
2877
+ "content": "|<EXTRA_TOKENS_356>|",
2878
+ "lstrip": false,
2879
+ "normalized": false,
2880
+ "rstrip": false,
2881
+ "single_word": false,
2882
+ "special": true
2883
+ },
2884
+ "152003": {
2885
+ "content": "|<EXTRA_TOKENS_357>|",
2886
+ "lstrip": false,
2887
+ "normalized": false,
2888
+ "rstrip": false,
2889
+ "single_word": false,
2890
+ "special": true
2891
+ },
2892
+ "152004": {
2893
+ "content": "|<EXTRA_TOKENS_358>|",
2894
+ "lstrip": false,
2895
+ "normalized": false,
2896
+ "rstrip": false,
2897
+ "single_word": false,
2898
+ "special": true
2899
+ },
2900
+ "152005": {
2901
+ "content": "|<EXTRA_TOKENS_359>|",
2902
+ "lstrip": false,
2903
+ "normalized": false,
2904
+ "rstrip": false,
2905
+ "single_word": false,
2906
+ "special": true
2907
+ },
2908
+ "152006": {
2909
+ "content": "|<EXTRA_TOKENS_360>|",
2910
+ "lstrip": false,
2911
+ "normalized": false,
2912
+ "rstrip": false,
2913
+ "single_word": false,
2914
+ "special": true
2915
+ },
2916
+ "152007": {
2917
+ "content": "|<EXTRA_TOKENS_361>|",
2918
+ "lstrip": false,
2919
+ "normalized": false,
2920
+ "rstrip": false,
2921
+ "single_word": false,
2922
+ "special": true
2923
+ },
2924
+ "152008": {
2925
+ "content": "|<EXTRA_TOKENS_362>|",
2926
+ "lstrip": false,
2927
+ "normalized": false,
2928
+ "rstrip": false,
2929
+ "single_word": false,
2930
+ "special": true
2931
+ },
2932
+ "152009": {
2933
+ "content": "|<EXTRA_TOKENS_363>|",
2934
+ "lstrip": false,
2935
+ "normalized": false,
2936
+ "rstrip": false,
2937
+ "single_word": false,
2938
+ "special": true
2939
+ },
2940
+ "152010": {
2941
+ "content": "|<EXTRA_TOKENS_364>|",
2942
+ "lstrip": false,
2943
+ "normalized": false,
2944
+ "rstrip": false,
2945
+ "single_word": false,
2946
+ "special": true
2947
+ },
2948
+ "152011": {
2949
+ "content": "|<EXTRA_TOKENS_365>|",
2950
+ "lstrip": false,
2951
+ "normalized": false,
2952
+ "rstrip": false,
2953
+ "single_word": false,
2954
+ "special": true
2955
+ },
2956
+ "152012": {
2957
+ "content": "|<EXTRA_TOKENS_366>|",
2958
+ "lstrip": false,
2959
+ "normalized": false,
2960
+ "rstrip": false,
2961
+ "single_word": false,
2962
+ "special": true
2963
+ },
2964
+ "152013": {
2965
+ "content": "|<EXTRA_TOKENS_367>|",
2966
+ "lstrip": false,
2967
+ "normalized": false,
2968
+ "rstrip": false,
2969
+ "single_word": false,
2970
+ "special": true
2971
+ },
2972
+ "152014": {
2973
+ "content": "|<EXTRA_TOKENS_368>|",
2974
+ "lstrip": false,
2975
+ "normalized": false,
2976
+ "rstrip": false,
2977
+ "single_word": false,
2978
+ "special": true
2979
+ },
2980
+ "152015": {
2981
+ "content": "|<EXTRA_TOKENS_369>|",
2982
+ "lstrip": false,
2983
+ "normalized": false,
2984
+ "rstrip": false,
2985
+ "single_word": false,
2986
+ "special": true
2987
+ },
2988
+ "152016": {
2989
+ "content": "|<EXTRA_TOKENS_370>|",
2990
+ "lstrip": false,
2991
+ "normalized": false,
2992
+ "rstrip": false,
2993
+ "single_word": false,
2994
+ "special": true
2995
+ },
2996
+ "152017": {
2997
+ "content": "|<EXTRA_TOKENS_371>|",
2998
+ "lstrip": false,
2999
+ "normalized": false,
3000
+ "rstrip": false,
3001
+ "single_word": false,
3002
+ "special": true
3003
+ },
3004
+ "152018": {
3005
+ "content": "|<EXTRA_TOKENS_372>|",
3006
+ "lstrip": false,
3007
+ "normalized": false,
3008
+ "rstrip": false,
3009
+ "single_word": false,
3010
+ "special": true
3011
+ },
3012
+ "152019": {
3013
+ "content": "|<EXTRA_TOKENS_373>|",
3014
+ "lstrip": false,
3015
+ "normalized": false,
3016
+ "rstrip": false,
3017
+ "single_word": false,
3018
+ "special": true
3019
+ },
3020
+ "152020": {
3021
+ "content": "|<EXTRA_TOKENS_374>|",
3022
+ "lstrip": false,
3023
+ "normalized": false,
3024
+ "rstrip": false,
3025
+ "single_word": false,
3026
+ "special": true
3027
+ },
3028
+ "152021": {
3029
+ "content": "|<EXTRA_TOKENS_375>|",
3030
+ "lstrip": false,
3031
+ "normalized": false,
3032
+ "rstrip": false,
3033
+ "single_word": false,
3034
+ "special": true
3035
+ },
3036
+ "152022": {
3037
+ "content": "|<EXTRA_TOKENS_376>|",
3038
+ "lstrip": false,
3039
+ "normalized": false,
3040
+ "rstrip": false,
3041
+ "single_word": false,
3042
+ "special": true
3043
+ },
3044
+ "152023": {
3045
+ "content": "|<EXTRA_TOKENS_377>|",
3046
+ "lstrip": false,
3047
+ "normalized": false,
3048
+ "rstrip": false,
3049
+ "single_word": false,
3050
+ "special": true
3051
+ },
3052
+ "152024": {
3053
+ "content": "|<EXTRA_TOKENS_378>|",
3054
+ "lstrip": false,
3055
+ "normalized": false,
3056
+ "rstrip": false,
3057
+ "single_word": false,
3058
+ "special": true
3059
+ },
3060
+ "152025": {
3061
+ "content": "|<EXTRA_TOKENS_379>|",
3062
+ "lstrip": false,
3063
+ "normalized": false,
3064
+ "rstrip": false,
3065
+ "single_word": false,
3066
+ "special": true
3067
+ },
3068
+ "152026": {
3069
+ "content": "|<EXTRA_TOKENS_380>|",
3070
+ "lstrip": false,
3071
+ "normalized": false,
3072
+ "rstrip": false,
3073
+ "single_word": false,
3074
+ "special": true
3075
+ },
3076
+ "152027": {
3077
+ "content": "|<EXTRA_TOKENS_381>|",
3078
+ "lstrip": false,
3079
+ "normalized": false,
3080
+ "rstrip": false,
3081
+ "single_word": false,
3082
+ "special": true
3083
+ },
3084
+ "152028": {
3085
+ "content": "|<EXTRA_TOKENS_382>|",
3086
+ "lstrip": false,
3087
+ "normalized": false,
3088
+ "rstrip": false,
3089
+ "single_word": false,
3090
+ "special": true
3091
+ },
3092
+ "152029": {
3093
+ "content": "|<EXTRA_TOKENS_383>|",
3094
+ "lstrip": false,
3095
+ "normalized": false,
3096
+ "rstrip": false,
3097
+ "single_word": false,
3098
+ "special": true
3099
+ },
3100
+ "152030": {
3101
+ "content": "|<EXTRA_TOKENS_384>|",
3102
+ "lstrip": false,
3103
+ "normalized": false,
3104
+ "rstrip": false,
3105
+ "single_word": false,
3106
+ "special": true
3107
+ },
3108
+ "152031": {
3109
+ "content": "|<EXTRA_TOKENS_385>|",
3110
+ "lstrip": false,
3111
+ "normalized": false,
3112
+ "rstrip": false,
3113
+ "single_word": false,
3114
+ "special": true
3115
+ },
3116
+ "152032": {
3117
+ "content": "|<EXTRA_TOKENS_386>|",
3118
+ "lstrip": false,
3119
+ "normalized": false,
3120
+ "rstrip": false,
3121
+ "single_word": false,
3122
+ "special": true
3123
+ },
3124
+ "152033": {
3125
+ "content": "|<EXTRA_TOKENS_387>|",
3126
+ "lstrip": false,
3127
+ "normalized": false,
3128
+ "rstrip": false,
3129
+ "single_word": false,
3130
+ "special": true
3131
+ },
3132
+ "152034": {
3133
+ "content": "|<EXTRA_TOKENS_388>|",
3134
+ "lstrip": false,
3135
+ "normalized": false,
3136
+ "rstrip": false,
3137
+ "single_word": false,
3138
+ "special": true
3139
+ },
3140
+ "152035": {
3141
+ "content": "|<EXTRA_TOKENS_389>|",
3142
+ "lstrip": false,
3143
+ "normalized": false,
3144
+ "rstrip": false,
3145
+ "single_word": false,
3146
+ "special": true
3147
+ },
3148
+ "152036": {
3149
+ "content": "|<EXTRA_TOKENS_390>|",
3150
+ "lstrip": false,
3151
+ "normalized": false,
3152
+ "rstrip": false,
3153
+ "single_word": false,
3154
+ "special": true
3155
+ },
3156
+ "152037": {
3157
+ "content": "|<EXTRA_TOKENS_391>|",
3158
+ "lstrip": false,
3159
+ "normalized": false,
3160
+ "rstrip": false,
3161
+ "single_word": false,
3162
+ "special": true
3163
+ },
3164
+ "152038": {
3165
+ "content": "|<EXTRA_TOKENS_392>|",
3166
+ "lstrip": false,
3167
+ "normalized": false,
3168
+ "rstrip": false,
3169
+ "single_word": false,
3170
+ "special": true
3171
+ },
3172
+ "152039": {
3173
+ "content": "|<EXTRA_TOKENS_393>|",
3174
+ "lstrip": false,
3175
+ "normalized": false,
3176
+ "rstrip": false,
3177
+ "single_word": false,
3178
+ "special": true
3179
+ },
3180
+ "152040": {
3181
+ "content": "|<EXTRA_TOKENS_394>|",
3182
+ "lstrip": false,
3183
+ "normalized": false,
3184
+ "rstrip": false,
3185
+ "single_word": false,
3186
+ "special": true
3187
+ },
3188
+ "152041": {
3189
+ "content": "|<EXTRA_TOKENS_395>|",
3190
+ "lstrip": false,
3191
+ "normalized": false,
3192
+ "rstrip": false,
3193
+ "single_word": false,
3194
+ "special": true
3195
+ },
3196
+ "152042": {
3197
+ "content": "|<EXTRA_TOKENS_396>|",
3198
+ "lstrip": false,
3199
+ "normalized": false,
3200
+ "rstrip": false,
3201
+ "single_word": false,
3202
+ "special": true
3203
+ },
3204
+ "152043": {
3205
+ "content": "|<EXTRA_TOKENS_397>|",
3206
+ "lstrip": false,
3207
+ "normalized": false,
3208
+ "rstrip": false,
3209
+ "single_word": false,
3210
+ "special": true
3211
+ },
3212
+ "152044": {
3213
+ "content": "|<EXTRA_TOKENS_398>|",
3214
+ "lstrip": false,
3215
+ "normalized": false,
3216
+ "rstrip": false,
3217
+ "single_word": false,
3218
+ "special": true
3219
+ },
3220
+ "152045": {
3221
+ "content": "|<EXTRA_TOKENS_399>|",
3222
+ "lstrip": false,
3223
+ "normalized": false,
3224
+ "rstrip": false,
3225
+ "single_word": false,
3226
+ "special": true
3227
+ },
3228
+ "152046": {
3229
+ "content": "|<EXTRA_TOKENS_400>|",
3230
+ "lstrip": false,
3231
+ "normalized": false,
3232
+ "rstrip": false,
3233
+ "single_word": false,
3234
+ "special": true
3235
+ },
3236
+ "152047": {
3237
+ "content": "|<EXTRA_TOKENS_401>|",
3238
+ "lstrip": false,
3239
+ "normalized": false,
3240
+ "rstrip": false,
3241
+ "single_word": false,
3242
+ "special": true
3243
+ },
3244
+ "152048": {
3245
+ "content": "|<EXTRA_TOKENS_402>|",
3246
+ "lstrip": false,
3247
+ "normalized": false,
3248
+ "rstrip": false,
3249
+ "single_word": false,
3250
+ "special": true
3251
+ },
3252
+ "152049": {
3253
+ "content": "|<EXTRA_TOKENS_403>|",
3254
+ "lstrip": false,
3255
+ "normalized": false,
3256
+ "rstrip": false,
3257
+ "single_word": false,
3258
+ "special": true
3259
+ },
3260
+ "152050": {
3261
+ "content": "|<EXTRA_TOKENS_404>|",
3262
+ "lstrip": false,
3263
+ "normalized": false,
3264
+ "rstrip": false,
3265
+ "single_word": false,
3266
+ "special": true
3267
+ },
3268
+ "152051": {
3269
+ "content": "|<EXTRA_TOKENS_405>|",
3270
+ "lstrip": false,
3271
+ "normalized": false,
3272
+ "rstrip": false,
3273
+ "single_word": false,
3274
+ "special": true
3275
+ },
3276
+ "152052": {
3277
+ "content": "|<EXTRA_TOKENS_406>|",
3278
+ "lstrip": false,
3279
+ "normalized": false,
3280
+ "rstrip": false,
3281
+ "single_word": false,
3282
+ "special": true
3283
+ },
3284
+ "152053": {
3285
+ "content": "|<EXTRA_TOKENS_407>|",
3286
+ "lstrip": false,
3287
+ "normalized": false,
3288
+ "rstrip": false,
3289
+ "single_word": false,
3290
+ "special": true
3291
+ },
3292
+ "152054": {
3293
+ "content": "|<EXTRA_TOKENS_408>|",
3294
+ "lstrip": false,
3295
+ "normalized": false,
3296
+ "rstrip": false,
3297
+ "single_word": false,
3298
+ "special": true
3299
+ },
3300
+ "152055": {
3301
+ "content": "|<EXTRA_TOKENS_409>|",
3302
+ "lstrip": false,
3303
+ "normalized": false,
3304
+ "rstrip": false,
3305
+ "single_word": false,
3306
+ "special": true
3307
+ },
3308
+ "152056": {
3309
+ "content": "|<EXTRA_TOKENS_410>|",
3310
+ "lstrip": false,
3311
+ "normalized": false,
3312
+ "rstrip": false,
3313
+ "single_word": false,
3314
+ "special": true
3315
+ },
3316
+ "152057": {
3317
+ "content": "|<EXTRA_TOKENS_411>|",
3318
+ "lstrip": false,
3319
+ "normalized": false,
3320
+ "rstrip": false,
3321
+ "single_word": false,
3322
+ "special": true
3323
+ },
3324
+ "152058": {
3325
+ "content": "|<EXTRA_TOKENS_412>|",
3326
+ "lstrip": false,
3327
+ "normalized": false,
3328
+ "rstrip": false,
3329
+ "single_word": false,
3330
+ "special": true
3331
+ },
3332
+ "152059": {
3333
+ "content": "|<EXTRA_TOKENS_413>|",
3334
+ "lstrip": false,
3335
+ "normalized": false,
3336
+ "rstrip": false,
3337
+ "single_word": false,
3338
+ "special": true
3339
+ },
3340
+ "152060": {
3341
+ "content": "|<EXTRA_TOKENS_414>|",
3342
+ "lstrip": false,
3343
+ "normalized": false,
3344
+ "rstrip": false,
3345
+ "single_word": false,
3346
+ "special": true
3347
+ },
3348
+ "152061": {
3349
+ "content": "|<EXTRA_TOKENS_415>|",
3350
+ "lstrip": false,
3351
+ "normalized": false,
3352
+ "rstrip": false,
3353
+ "single_word": false,
3354
+ "special": true
3355
+ },
3356
+ "152062": {
3357
+ "content": "|<EXTRA_TOKENS_416>|",
3358
+ "lstrip": false,
3359
+ "normalized": false,
3360
+ "rstrip": false,
3361
+ "single_word": false,
3362
+ "special": true
3363
+ },
3364
+ "152063": {
3365
+ "content": "|<EXTRA_TOKENS_417>|",
3366
+ "lstrip": false,
3367
+ "normalized": false,
3368
+ "rstrip": false,
3369
+ "single_word": false,
3370
+ "special": true
3371
+ },
3372
+ "152064": {
3373
+ "content": "<im_start>",
3374
+ "lstrip": false,
3375
+ "normalized": false,
3376
+ "rstrip": false,
3377
+ "single_word": false,
3378
+ "special": true
3379
+ },
3380
+ "152065": {
3381
+ "content": "<im_end>",
3382
+ "lstrip": false,
3383
+ "normalized": false,
3384
+ "rstrip": false,
3385
+ "single_word": false,
3386
+ "special": true
3387
+ },
3388
+ "152066": {
3389
+ "content": "<im_patch>",
3390
+ "lstrip": false,
3391
+ "normalized": false,
3392
+ "rstrip": false,
3393
+ "single_word": false,
3394
+ "special": true
3395
+ },
3396
+ "152067": {
3397
+ "content": "<im_col>",
3398
+ "lstrip": false,
3399
+ "normalized": false,
3400
+ "rstrip": false,
3401
+ "single_word": false,
3402
+ "special": true
3403
+ },
3404
+ "152068": {
3405
+ "content": "<|image|>",
3406
+ "lstrip": false,
3407
+ "normalized": false,
3408
+ "rstrip": false,
3409
+ "single_word": false,
3410
+ "special": true
3411
+ }
3412
+ },
3413
+ "additional_special_tokens": [
3414
+ "|<EXTRA_TOKENS_0>|",
3415
+ "|<EXTRA_TOKENS_1>|",
3416
+ "|<EXTRA_TOKENS_2>|",
3417
+ "|<EXTRA_TOKENS_3>|",
3418
+ "|<EXTRA_TOKENS_4>|",
3419
+ "|<EXTRA_TOKENS_5>|",
3420
+ "|<EXTRA_TOKENS_6>|",
3421
+ "|<EXTRA_TOKENS_7>|",
3422
+ "|<EXTRA_TOKENS_8>|",
3423
+ "|<EXTRA_TOKENS_9>|",
3424
+ "|<EXTRA_TOKENS_10>|",
3425
+ "|<EXTRA_TOKENS_11>|",
3426
+ "|<EXTRA_TOKENS_12>|",
3427
+ "|<EXTRA_TOKENS_13>|",
3428
+ "|<EXTRA_TOKENS_14>|",
3429
+ "|<EXTRA_TOKENS_15>|",
3430
+ "|<EXTRA_TOKENS_16>|",
3431
+ "|<EXTRA_TOKENS_17>|",
3432
+ "|<EXTRA_TOKENS_18>|",
3433
+ "|<EXTRA_TOKENS_19>|",
3434
+ "|<EXTRA_TOKENS_20>|",
3435
+ "|<EXTRA_TOKENS_21>|",
3436
+ "|<EXTRA_TOKENS_22>|",
3437
+ "|<EXTRA_TOKENS_23>|",
3438
+ "|<EXTRA_TOKENS_24>|",
3439
+ "|<EXTRA_TOKENS_25>|",
3440
+ "|<EXTRA_TOKENS_26>|",
3441
+ "|<EXTRA_TOKENS_27>|",
3442
+ "|<EXTRA_TOKENS_28>|",
3443
+ "|<EXTRA_TOKENS_29>|",
3444
+ "|<EXTRA_TOKENS_30>|",
3445
+ "|<EXTRA_TOKENS_31>|",
3446
+ "|<EXTRA_TOKENS_32>|",
3447
+ "|<EXTRA_TOKENS_33>|",
3448
+ "|<EXTRA_TOKENS_34>|",
3449
+ "|<EXTRA_TOKENS_35>|",
3450
+ "|<EXTRA_TOKENS_36>|",
3451
+ "|<EXTRA_TOKENS_37>|",
3452
+ "|<EXTRA_TOKENS_38>|",
3453
+ "|<EXTRA_TOKENS_39>|",
3454
+ "|<EXTRA_TOKENS_40>|",
3455
+ "|<EXTRA_TOKENS_41>|",
3456
+ "|<EXTRA_TOKENS_42>|",
3457
+ "|<EXTRA_TOKENS_43>|",
3458
+ "|<EXTRA_TOKENS_44>|",
3459
+ "|<EXTRA_TOKENS_45>|",
3460
+ "|<EXTRA_TOKENS_46>|",
3461
+ "|<EXTRA_TOKENS_47>|",
3462
+ "|<EXTRA_TOKENS_48>|",
3463
+ "|<EXTRA_TOKENS_49>|",
3464
+ "|<EXTRA_TOKENS_50>|",
3465
+ "|<EXTRA_TOKENS_51>|",
3466
+ "|<EXTRA_TOKENS_52>|",
3467
+ "|<EXTRA_TOKENS_53>|",
3468
+ "|<EXTRA_TOKENS_54>|",
3469
+ "|<EXTRA_TOKENS_55>|",
3470
+ "|<EXTRA_TOKENS_56>|",
3471
+ "|<EXTRA_TOKENS_57>|",
3472
+ "|<EXTRA_TOKENS_58>|",
3473
+ "|<EXTRA_TOKENS_59>|",
3474
+ "|<EXTRA_TOKENS_60>|",
3475
+ "|<EXTRA_TOKENS_61>|",
3476
+ "|<EXTRA_TOKENS_62>|",
3477
+ "|<EXTRA_TOKENS_63>|",
3478
+ "|<EXTRA_TOKENS_64>|",
3479
+ "|<EXTRA_TOKENS_65>|",
3480
+ "|<EXTRA_TOKENS_66>|",
3481
+ "|<EXTRA_TOKENS_67>|",
3482
+ "|<EXTRA_TOKENS_68>|",
3483
+ "|<EXTRA_TOKENS_69>|",
3484
+ "|<EXTRA_TOKENS_70>|",
3485
+ "|<EXTRA_TOKENS_71>|",
3486
+ "|<EXTRA_TOKENS_72>|",
3487
+ "|<EXTRA_TOKENS_73>|",
3488
+ "|<EXTRA_TOKENS_74>|",
3489
+ "|<EXTRA_TOKENS_75>|",
3490
+ "|<EXTRA_TOKENS_76>|",
3491
+ "|<EXTRA_TOKENS_77>|",
3492
+ "|<EXTRA_TOKENS_78>|",
3493
+ "|<EXTRA_TOKENS_79>|",
3494
+ "|<EXTRA_TOKENS_80>|",
3495
+ "|<EXTRA_TOKENS_81>|",
3496
+ "|<EXTRA_TOKENS_82>|",
3497
+ "|<EXTRA_TOKENS_83>|",
3498
+ "|<EXTRA_TOKENS_84>|",
3499
+ "|<EXTRA_TOKENS_85>|",
3500
+ "|<EXTRA_TOKENS_86>|",
3501
+ "|<EXTRA_TOKENS_87>|",
3502
+ "|<EXTRA_TOKENS_88>|",
3503
+ "|<EXTRA_TOKENS_89>|",
3504
+ "|<EXTRA_TOKENS_90>|",
3505
+ "|<EXTRA_TOKENS_91>|",
3506
+ "|<EXTRA_TOKENS_92>|",
3507
+ "|<EXTRA_TOKENS_93>|",
3508
+ "|<EXTRA_TOKENS_94>|",
3509
+ "|<EXTRA_TOKENS_95>|",
3510
+ "|<EXTRA_TOKENS_96>|",
3511
+ "|<EXTRA_TOKENS_97>|",
3512
+ "|<EXTRA_TOKENS_98>|",
3513
+ "|<EXTRA_TOKENS_99>|",
3514
+ "|<EXTRA_TOKENS_100>|",
3515
+ "|<EXTRA_TOKENS_101>|",
3516
+ "|<EXTRA_TOKENS_102>|",
3517
+ "|<EXTRA_TOKENS_103>|",
3518
+ "|<EXTRA_TOKENS_104>|",
3519
+ "|<EXTRA_TOKENS_105>|",
3520
+ "|<EXTRA_TOKENS_106>|",
3521
+ "|<EXTRA_TOKENS_107>|",
3522
+ "|<EXTRA_TOKENS_108>|",
3523
+ "|<EXTRA_TOKENS_109>|",
3524
+ "|<EXTRA_TOKENS_110>|",
3525
+ "|<EXTRA_TOKENS_111>|",
3526
+ "|<EXTRA_TOKENS_112>|",
3527
+ "|<EXTRA_TOKENS_113>|",
3528
+ "|<EXTRA_TOKENS_114>|",
3529
+ "|<EXTRA_TOKENS_115>|",
3530
+ "|<EXTRA_TOKENS_116>|",
3531
+ "|<EXTRA_TOKENS_117>|",
3532
+ "|<EXTRA_TOKENS_118>|",
3533
+ "|<EXTRA_TOKENS_119>|",
3534
+ "|<EXTRA_TOKENS_120>|",
3535
+ "|<EXTRA_TOKENS_121>|",
3536
+ "|<EXTRA_TOKENS_122>|",
3537
+ "|<EXTRA_TOKENS_123>|",
3538
+ "|<EXTRA_TOKENS_124>|",
3539
+ "|<EXTRA_TOKENS_125>|",
3540
+ "|<EXTRA_TOKENS_126>|",
3541
+ "|<EXTRA_TOKENS_127>|",
3542
+ "|<EXTRA_TOKENS_128>|",
3543
+ "|<EXTRA_TOKENS_129>|",
3544
+ "|<EXTRA_TOKENS_130>|",
3545
+ "|<EXTRA_TOKENS_131>|",
3546
+ "|<EXTRA_TOKENS_132>|",
3547
+ "|<EXTRA_TOKENS_133>|",
3548
+ "|<EXTRA_TOKENS_134>|",
3549
+ "|<EXTRA_TOKENS_135>|",
3550
+ "|<EXTRA_TOKENS_136>|",
3551
+ "|<EXTRA_TOKENS_137>|",
3552
+ "|<EXTRA_TOKENS_138>|",
3553
+ "|<EXTRA_TOKENS_139>|",
3554
+ "|<EXTRA_TOKENS_140>|",
3555
+ "|<EXTRA_TOKENS_141>|",
3556
+ "|<EXTRA_TOKENS_142>|",
3557
+ "|<EXTRA_TOKENS_143>|",
3558
+ "|<EXTRA_TOKENS_144>|",
3559
+ "|<EXTRA_TOKENS_145>|",
3560
+ "|<EXTRA_TOKENS_146>|",
3561
+ "|<EXTRA_TOKENS_147>|",
3562
+ "|<EXTRA_TOKENS_148>|",
3563
+ "|<EXTRA_TOKENS_149>|",
3564
+ "|<EXTRA_TOKENS_150>|",
3565
+ "|<EXTRA_TOKENS_151>|",
3566
+ "|<EXTRA_TOKENS_152>|",
3567
+ "|<EXTRA_TOKENS_153>|",
3568
+ "|<EXTRA_TOKENS_154>|",
3569
+ "|<EXTRA_TOKENS_155>|",
3570
+ "|<EXTRA_TOKENS_156>|",
3571
+ "|<EXTRA_TOKENS_157>|",
3572
+ "|<EXTRA_TOKENS_158>|",
3573
+ "|<EXTRA_TOKENS_159>|",
3574
+ "|<EXTRA_TOKENS_160>|",
3575
+ "|<EXTRA_TOKENS_161>|",
3576
+ "|<EXTRA_TOKENS_162>|",
3577
+ "|<EXTRA_TOKENS_163>|",
3578
+ "|<EXTRA_TOKENS_164>|",
3579
+ "|<EXTRA_TOKENS_165>|",
3580
+ "|<EXTRA_TOKENS_166>|",
3581
+ "|<EXTRA_TOKENS_167>|",
3582
+ "|<EXTRA_TOKENS_168>|",
3583
+ "|<EXTRA_TOKENS_169>|",
3584
+ "|<EXTRA_TOKENS_170>|",
3585
+ "|<EXTRA_TOKENS_171>|",
3586
+ "|<EXTRA_TOKENS_172>|",
3587
+ "|<EXTRA_TOKENS_173>|",
3588
+ "|<EXTRA_TOKENS_174>|",
3589
+ "|<EXTRA_TOKENS_175>|",
3590
+ "|<EXTRA_TOKENS_176>|",
3591
+ "|<EXTRA_TOKENS_177>|",
3592
+ "|<EXTRA_TOKENS_178>|",
3593
+ "|<EXTRA_TOKENS_179>|",
3594
+ "|<EXTRA_TOKENS_180>|",
3595
+ "|<EXTRA_TOKENS_181>|",
3596
+ "|<EXTRA_TOKENS_182>|",
3597
+ "|<EXTRA_TOKENS_183>|",
3598
+ "|<EXTRA_TOKENS_184>|",
3599
+ "|<EXTRA_TOKENS_185>|",
3600
+ "|<EXTRA_TOKENS_186>|",
3601
+ "|<EXTRA_TOKENS_187>|",
3602
+ "|<EXTRA_TOKENS_188>|",
3603
+ "|<EXTRA_TOKENS_189>|",
3604
+ "|<EXTRA_TOKENS_190>|",
3605
+ "|<EXTRA_TOKENS_191>|",
3606
+ "|<EXTRA_TOKENS_192>|",
3607
+ "|<EXTRA_TOKENS_193>|",
3608
+ "|<EXTRA_TOKENS_194>|",
3609
+ "|<EXTRA_TOKENS_195>|",
3610
+ "|<EXTRA_TOKENS_196>|",
3611
+ "|<EXTRA_TOKENS_197>|",
3612
+ "|<EXTRA_TOKENS_198>|",
3613
+ "|<EXTRA_TOKENS_199>|",
3614
+ "|<EXTRA_TOKENS_200>|",
3615
+ "|<EXTRA_TOKENS_201>|",
3616
+ "|<EXTRA_TOKENS_202>|",
3617
+ "|<EXTRA_TOKENS_203>|",
3618
+ "|<EXTRA_TOKENS_204>|",
3619
+ "|<EXTRA_TOKENS_205>|",
3620
+ "|<EXTRA_TOKENS_206>|",
3621
+ "|<EXTRA_TOKENS_207>|",
3622
+ "|<EXTRA_TOKENS_208>|",
3623
+ "|<EXTRA_TOKENS_209>|",
3624
+ "|<EXTRA_TOKENS_210>|",
3625
+ "|<EXTRA_TOKENS_211>|",
3626
+ "|<EXTRA_TOKENS_212>|",
3627
+ "|<EXTRA_TOKENS_213>|",
3628
+ "|<EXTRA_TOKENS_214>|",
3629
+ "|<EXTRA_TOKENS_215>|",
3630
+ "|<EXTRA_TOKENS_216>|",
3631
+ "|<EXTRA_TOKENS_217>|",
3632
+ "|<EXTRA_TOKENS_218>|",
3633
+ "|<EXTRA_TOKENS_219>|",
3634
+ "|<EXTRA_TOKENS_220>|",
3635
+ "|<EXTRA_TOKENS_221>|",
3636
+ "|<EXTRA_TOKENS_222>|",
3637
+ "|<EXTRA_TOKENS_223>|",
3638
+ "|<EXTRA_TOKENS_224>|",
3639
+ "|<EXTRA_TOKENS_225>|",
3640
+ "|<EXTRA_TOKENS_226>|",
3641
+ "|<EXTRA_TOKENS_227>|",
3642
+ "|<EXTRA_TOKENS_228>|",
3643
+ "|<EXTRA_TOKENS_229>|",
3644
+ "|<EXTRA_TOKENS_230>|",
3645
+ "|<EXTRA_TOKENS_231>|",
3646
+ "|<EXTRA_TOKENS_232>|",
3647
+ "|<EXTRA_TOKENS_233>|",
3648
+ "|<EXTRA_TOKENS_234>|",
3649
+ "|<EXTRA_TOKENS_235>|",
3650
+ "|<EXTRA_TOKENS_236>|",
3651
+ "|<EXTRA_TOKENS_237>|",
3652
+ "|<EXTRA_TOKENS_238>|",
3653
+ "|<EXTRA_TOKENS_239>|",
3654
+ "|<EXTRA_TOKENS_240>|",
3655
+ "|<EXTRA_TOKENS_241>|",
3656
+ "|<EXTRA_TOKENS_242>|",
3657
+ "|<EXTRA_TOKENS_243>|",
3658
+ "|<EXTRA_TOKENS_244>|",
3659
+ "|<EXTRA_TOKENS_245>|",
3660
+ "|<EXTRA_TOKENS_246>|",
3661
+ "|<EXTRA_TOKENS_247>|",
3662
+ "|<EXTRA_TOKENS_248>|",
3663
+ "|<EXTRA_TOKENS_249>|",
3664
+ "|<EXTRA_TOKENS_250>|",
3665
+ "|<EXTRA_TOKENS_251>|",
3666
+ "|<EXTRA_TOKENS_252>|",
3667
+ "|<EXTRA_TOKENS_253>|",
3668
+ "|<EXTRA_TOKENS_254>|",
3669
+ "|<EXTRA_TOKENS_255>|",
3670
+ "|<EXTRA_TOKENS_256>|",
3671
+ "|<EXTRA_TOKENS_257>|",
3672
+ "|<EXTRA_TOKENS_258>|",
3673
+ "|<EXTRA_TOKENS_259>|",
3674
+ "|<EXTRA_TOKENS_260>|",
3675
+ "|<EXTRA_TOKENS_261>|",
3676
+ "|<EXTRA_TOKENS_262>|",
3677
+ "|<EXTRA_TOKENS_263>|",
3678
+ "|<EXTRA_TOKENS_264>|",
3679
+ "|<EXTRA_TOKENS_265>|",
3680
+ "|<EXTRA_TOKENS_266>|",
3681
+ "|<EXTRA_TOKENS_267>|",
3682
+ "|<EXTRA_TOKENS_268>|",
3683
+ "|<EXTRA_TOKENS_269>|",
3684
+ "|<EXTRA_TOKENS_270>|",
3685
+ "|<EXTRA_TOKENS_271>|",
3686
+ "|<EXTRA_TOKENS_272>|",
3687
+ "|<EXTRA_TOKENS_273>|",
3688
+ "|<EXTRA_TOKENS_274>|",
3689
+ "|<EXTRA_TOKENS_275>|",
3690
+ "|<EXTRA_TOKENS_276>|",
3691
+ "|<EXTRA_TOKENS_277>|",
3692
+ "|<EXTRA_TOKENS_278>|",
3693
+ "|<EXTRA_TOKENS_279>|",
3694
+ "|<EXTRA_TOKENS_280>|",
3695
+ "|<EXTRA_TOKENS_281>|",
3696
+ "|<EXTRA_TOKENS_282>|",
3697
+ "|<EXTRA_TOKENS_283>|",
3698
+ "|<EXTRA_TOKENS_284>|",
3699
+ "|<EXTRA_TOKENS_285>|",
3700
+ "|<EXTRA_TOKENS_286>|",
3701
+ "|<EXTRA_TOKENS_287>|",
3702
+ "|<EXTRA_TOKENS_288>|",
3703
+ "|<EXTRA_TOKENS_289>|",
3704
+ "|<EXTRA_TOKENS_290>|",
3705
+ "|<EXTRA_TOKENS_291>|",
3706
+ "|<EXTRA_TOKENS_292>|",
3707
+ "|<EXTRA_TOKENS_293>|",
3708
+ "|<EXTRA_TOKENS_294>|",
3709
+ "|<EXTRA_TOKENS_295>|",
3710
+ "|<EXTRA_TOKENS_296>|",
3711
+ "|<EXTRA_TOKENS_297>|",
3712
+ "|<EXTRA_TOKENS_298>|",
3713
+ "|<EXTRA_TOKENS_299>|",
3714
+ "|<EXTRA_TOKENS_300>|",
3715
+ "|<EXTRA_TOKENS_301>|",
3716
+ "|<EXTRA_TOKENS_302>|",
3717
+ "|<EXTRA_TOKENS_303>|",
3718
+ "|<EXTRA_TOKENS_304>|",
3719
+ "|<EXTRA_TOKENS_305>|",
3720
+ "|<EXTRA_TOKENS_306>|",
3721
+ "|<EXTRA_TOKENS_307>|",
3722
+ "|<EXTRA_TOKENS_308>|",
3723
+ "|<EXTRA_TOKENS_309>|",
3724
+ "|<EXTRA_TOKENS_310>|",
3725
+ "|<EXTRA_TOKENS_311>|",
3726
+ "|<EXTRA_TOKENS_312>|",
3727
+ "|<EXTRA_TOKENS_313>|",
3728
+ "|<EXTRA_TOKENS_314>|",
3729
+ "|<EXTRA_TOKENS_315>|",
3730
+ "|<EXTRA_TOKENS_316>|",
3731
+ "|<EXTRA_TOKENS_317>|",
3732
+ "|<EXTRA_TOKENS_318>|",
3733
+ "|<EXTRA_TOKENS_319>|",
3734
+ "|<EXTRA_TOKENS_320>|",
3735
+ "|<EXTRA_TOKENS_321>|",
3736
+ "|<EXTRA_TOKENS_322>|",
3737
+ "|<EXTRA_TOKENS_323>|",
3738
+ "|<EXTRA_TOKENS_324>|",
3739
+ "|<EXTRA_TOKENS_325>|",
3740
+ "|<EXTRA_TOKENS_326>|",
3741
+ "|<EXTRA_TOKENS_327>|",
3742
+ "|<EXTRA_TOKENS_328>|",
3743
+ "|<EXTRA_TOKENS_329>|",
3744
+ "|<EXTRA_TOKENS_330>|",
3745
+ "|<EXTRA_TOKENS_331>|",
3746
+ "|<EXTRA_TOKENS_332>|",
3747
+ "|<EXTRA_TOKENS_333>|",
3748
+ "|<EXTRA_TOKENS_334>|",
3749
+ "|<EXTRA_TOKENS_335>|",
3750
+ "|<EXTRA_TOKENS_336>|",
3751
+ "|<EXTRA_TOKENS_337>|",
3752
+ "|<EXTRA_TOKENS_338>|",
3753
+ "|<EXTRA_TOKENS_339>|",
3754
+ "|<EXTRA_TOKENS_340>|",
3755
+ "|<EXTRA_TOKENS_341>|",
3756
+ "|<EXTRA_TOKENS_342>|",
3757
+ "|<EXTRA_TOKENS_343>|",
3758
+ "|<EXTRA_TOKENS_344>|",
3759
+ "|<EXTRA_TOKENS_345>|",
3760
+ "|<EXTRA_TOKENS_346>|",
3761
+ "|<EXTRA_TOKENS_347>|",
3762
+ "|<EXTRA_TOKENS_348>|",
3763
+ "|<EXTRA_TOKENS_349>|",
3764
+ "|<EXTRA_TOKENS_350>|",
3765
+ "|<EXTRA_TOKENS_351>|",
3766
+ "|<EXTRA_TOKENS_352>|",
3767
+ "|<EXTRA_TOKENS_353>|",
3768
+ "|<EXTRA_TOKENS_354>|",
3769
+ "|<EXTRA_TOKENS_355>|",
3770
+ "|<EXTRA_TOKENS_356>|",
3771
+ "|<EXTRA_TOKENS_357>|",
3772
+ "|<EXTRA_TOKENS_358>|",
3773
+ "|<EXTRA_TOKENS_359>|",
3774
+ "|<EXTRA_TOKENS_360>|",
3775
+ "|<EXTRA_TOKENS_361>|",
3776
+ "|<EXTRA_TOKENS_362>|",
3777
+ "|<EXTRA_TOKENS_363>|",
3778
+ "|<EXTRA_TOKENS_364>|",
3779
+ "|<EXTRA_TOKENS_365>|",
3780
+ "|<EXTRA_TOKENS_366>|",
3781
+ "|<EXTRA_TOKENS_367>|",
3782
+ "|<EXTRA_TOKENS_368>|",
3783
+ "|<EXTRA_TOKENS_369>|",
3784
+ "|<EXTRA_TOKENS_370>|",
3785
+ "|<EXTRA_TOKENS_371>|",
3786
+ "|<EXTRA_TOKENS_372>|",
3787
+ "|<EXTRA_TOKENS_373>|",
3788
+ "|<EXTRA_TOKENS_374>|",
3789
+ "|<EXTRA_TOKENS_375>|",
3790
+ "|<EXTRA_TOKENS_376>|",
3791
+ "|<EXTRA_TOKENS_377>|",
3792
+ "|<EXTRA_TOKENS_378>|",
3793
+ "|<EXTRA_TOKENS_379>|",
3794
+ "|<EXTRA_TOKENS_380>|",
3795
+ "|<EXTRA_TOKENS_381>|",
3796
+ "|<EXTRA_TOKENS_382>|",
3797
+ "|<EXTRA_TOKENS_383>|",
3798
+ "|<EXTRA_TOKENS_384>|",
3799
+ "|<EXTRA_TOKENS_385>|",
3800
+ "|<EXTRA_TOKENS_386>|",
3801
+ "|<EXTRA_TOKENS_387>|",
3802
+ "|<EXTRA_TOKENS_388>|",
3803
+ "|<EXTRA_TOKENS_389>|",
3804
+ "|<EXTRA_TOKENS_390>|",
3805
+ "|<EXTRA_TOKENS_391>|",
3806
+ "|<EXTRA_TOKENS_392>|",
3807
+ "|<EXTRA_TOKENS_393>|",
3808
+ "|<EXTRA_TOKENS_394>|",
3809
+ "|<EXTRA_TOKENS_395>|",
3810
+ "|<EXTRA_TOKENS_396>|",
3811
+ "|<EXTRA_TOKENS_397>|",
3812
+ "|<EXTRA_TOKENS_398>|",
3813
+ "|<EXTRA_TOKENS_399>|",
3814
+ "|<EXTRA_TOKENS_400>|",
3815
+ "|<EXTRA_TOKENS_401>|",
3816
+ "|<EXTRA_TOKENS_402>|",
3817
+ "|<EXTRA_TOKENS_403>|",
3818
+ "|<EXTRA_TOKENS_404>|",
3819
+ "|<EXTRA_TOKENS_405>|",
3820
+ "|<EXTRA_TOKENS_406>|",
3821
+ "|<EXTRA_TOKENS_407>|",
3822
+ "|<EXTRA_TOKENS_408>|",
3823
+ "|<EXTRA_TOKENS_409>|",
3824
+ "|<EXTRA_TOKENS_410>|",
3825
+ "|<EXTRA_TOKENS_411>|",
3826
+ "|<EXTRA_TOKENS_412>|",
3827
+ "|<EXTRA_TOKENS_413>|",
3828
+ "|<EXTRA_TOKENS_414>|",
3829
+ "|<EXTRA_TOKENS_415>|",
3830
+ "|<EXTRA_TOKENS_416>|",
3831
+ "|<EXTRA_TOKENS_417>|",
3832
+ "<im_start>",
3833
+ "<im_end>",
3834
+ "<im_patch>",
3835
+ "<im_col>",
3836
+ "<|image|>"
3837
+ ],
3838
+ "auto_map": {
3839
+ "AutoProcessor": "preprocessing_molmo.MolmoProcessor"
3840
+ },
3841
+ "bos_token": null,
3842
+ "chat_template": "{% for message in messages -%}\n {%- if (loop.index % 2 == 1 and message['role'] != 'user') or \n (loop.index % 2 == 0 and message['role'].lower() != 'assistant') -%}\n {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}\n {%- endif -%}\n {{ message['role'].capitalize() + ': ' + message['content'] }}\n {%- if not loop.last -%}\n {{ ' ' }}\n {%- endif %}\n {%- endfor -%}\n {%- if add_generation_prompt -%}\n {{ ' Assistant:' }}\n {%- endif %}",
3843
+ "clean_up_tokenization_spaces": false,
3844
+ "eos_token": "<|endoftext|>",
3845
+ "errors": "replace",
3846
+ "extra_special_tokens": {},
3847
+ "model_max_length": 32768,
3848
+ "pad_token": "<|endoftext|>",
3849
+ "processor_class": "MolmoProcessor",
3850
+ "split_special_tokens": false,
3851
+ "tokenizer_class": "Qwen2Tokenizer",
3852
+ "unk_token": null
3853
+ }
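A minimal usage sketch for the processor and chat template configured above, assuming this folder has been pushed to a Hugging Face repo. The repo id below is a placeholder, not part of this commit; trust_remote_code=True is needed because auto_map resolves the custom preprocessing_molmo.MolmoProcessor.

from transformers import AutoProcessor, AutoTokenizer

repo_id = "your-org/molmo-7b-d-sft"  # placeholder; substitute the actual upload target

# The processor class is pulled in through auto_map, so remote code must be trusted.
processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True)

# The chat template renders alternating "User: ..." / "Assistant: ..." turns.
tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
messages = [{"role": "user", "content": "Point to the red mug."}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)  # roughly: "User: Point to the red mug. Assistant:"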
trainer_state.json ADDED
@@ -0,0 +1,890 @@
1
+ {
2
+ "best_metric": 0.21225065,
3
+ "best_model_checkpoint": "/workspace/output/molmo-7b-d/v1-20250103-233013/checkpoint-414",
4
+ "epoch": 3.0,
5
+ "eval_steps": 200,
6
+ "global_step": 414,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "acc": 0.93268561,
13
+ "epoch": 0.007272727272727273,
14
+ "grad_norm": 4.360905168604235,
15
+ "learning_rate": 0.0,
16
+ "loss": 0.15919656,
17
+ "memory(GiB)": 131.1,
18
+ "step": 1,
19
+ "train_speed(iter/s)": 0.051814
20
+ },
21
+ {
22
+ "acc": 0.93592656,
23
+ "epoch": 0.03636363636363636,
24
+ "grad_norm": 5.722024176509264,
25
+ "learning_rate": 4.829949384917788e-06,
26
+ "loss": 0.16946605,
27
+ "memory(GiB)": 131.7,
28
+ "step": 5,
29
+ "train_speed(iter/s)": 0.164947
30
+ },
31
+ {
32
+ "acc": 0.92745094,
33
+ "epoch": 0.07272727272727272,
34
+ "grad_norm": 5.449760109713864,
35
+ "learning_rate": 6.910095361682884e-06,
36
+ "loss": 0.19423571,
37
+ "memory(GiB)": 131.7,
38
+ "step": 10,
39
+ "train_speed(iter/s)": 0.228063
40
+ },
41
+ {
42
+ "acc": 0.9191308,
43
+ "epoch": 0.10909090909090909,
44
+ "grad_norm": 5.591882854054257,
45
+ "learning_rate": 8.126902754116446e-06,
46
+ "loss": 0.2132081,
47
+ "memory(GiB)": 131.7,
48
+ "step": 15,
49
+ "train_speed(iter/s)": 0.262462
50
+ },
51
+ {
52
+ "acc": 0.91665707,
53
+ "epoch": 0.14545454545454545,
54
+ "grad_norm": 8.643333147328232,
55
+ "learning_rate": 8.990241338447979e-06,
56
+ "loss": 0.2489913,
57
+ "memory(GiB)": 132.85,
58
+ "step": 20,
59
+ "train_speed(iter/s)": 0.283686
60
+ },
61
+ {
62
+ "acc": 0.92767944,
63
+ "epoch": 0.18181818181818182,
64
+ "grad_norm": 5.521300358752013,
65
+ "learning_rate": 9.659898769835576e-06,
66
+ "loss": 0.20275159,
67
+ "memory(GiB)": 100.16,
68
+ "step": 25,
69
+ "train_speed(iter/s)": 0.297772
70
+ },
71
+ {
72
+ "acc": 0.91237392,
73
+ "epoch": 0.21818181818181817,
74
+ "grad_norm": 4.691287833576054,
75
+ "learning_rate": 9.999635040777627e-06,
76
+ "loss": 0.24152677,
77
+ "memory(GiB)": 100.16,
78
+ "step": 30,
79
+ "train_speed(iter/s)": 0.306552
80
+ },
81
+ {
82
+ "acc": 0.89811802,
83
+ "epoch": 0.2545454545454545,
84
+ "grad_norm": 2.3267326177072176,
85
+ "learning_rate": 9.995529861428146e-06,
86
+ "loss": 0.2682821,
87
+ "memory(GiB)": 100.16,
88
+ "step": 35,
89
+ "train_speed(iter/s)": 0.312152
90
+ },
91
+ {
92
+ "acc": 0.91184635,
93
+ "epoch": 0.2909090909090909,
94
+ "grad_norm": 3.241573273096398,
95
+ "learning_rate": 9.986867061882612e-06,
96
+ "loss": 0.23578806,
97
+ "memory(GiB)": 100.16,
98
+ "step": 40,
99
+ "train_speed(iter/s)": 0.314978
100
+ },
101
+ {
102
+ "acc": 0.9012291,
103
+ "epoch": 0.32727272727272727,
104
+ "grad_norm": 2.3552152207973713,
105
+ "learning_rate": 9.973654546348053e-06,
106
+ "loss": 0.25761139,
107
+ "memory(GiB)": 100.16,
108
+ "step": 45,
109
+ "train_speed(iter/s)": 0.319208
110
+ },
111
+ {
112
+ "acc": 0.9016325,
113
+ "epoch": 0.36363636363636365,
114
+ "grad_norm": 3.1153181076119703,
115
+ "learning_rate": 9.955904370333514e-06,
116
+ "loss": 0.24715631,
117
+ "memory(GiB)": 100.16,
118
+ "step": 50,
119
+ "train_speed(iter/s)": 0.32296
120
+ },
121
+ {
122
+ "acc": 0.89749699,
123
+ "epoch": 0.4,
124
+ "grad_norm": 2.4498466601081943,
125
+ "learning_rate": 9.933632729650212e-06,
126
+ "loss": 0.25689688,
127
+ "memory(GiB)": 100.16,
128
+ "step": 55,
129
+ "train_speed(iter/s)": 0.325846
130
+ },
131
+ {
132
+ "acc": 0.88724833,
133
+ "epoch": 0.43636363636363634,
134
+ "grad_norm": 4.364723865759911,
135
+ "learning_rate": 9.906859945633999e-06,
136
+ "loss": 0.28743353,
137
+ "memory(GiB)": 100.16,
138
+ "step": 60,
139
+ "train_speed(iter/s)": 0.328247
140
+ },
141
+ {
142
+ "acc": 0.90578156,
143
+ "epoch": 0.4727272727272727,
144
+ "grad_norm": 3.243778418144708,
145
+ "learning_rate": 9.875610446603524e-06,
146
+ "loss": 0.26308877,
147
+ "memory(GiB)": 100.16,
148
+ "step": 65,
149
+ "train_speed(iter/s)": 0.330485
150
+ },
151
+ {
152
+ "acc": 0.89676228,
153
+ "epoch": 0.509090909090909,
154
+ "grad_norm": 3.4165598224968274,
155
+ "learning_rate": 9.83991274557109e-06,
156
+ "loss": 0.26372042,
157
+ "memory(GiB)": 127.96,
158
+ "step": 70,
159
+ "train_speed(iter/s)": 0.332413
160
+ },
161
+ {
162
+ "acc": 0.9054903,
163
+ "epoch": 0.5454545454545454,
164
+ "grad_norm": 3.814636181453338,
165
+ "learning_rate": 9.7997994142265e-06,
166
+ "loss": 0.25466361,
167
+ "memory(GiB)": 127.96,
168
+ "step": 75,
169
+ "train_speed(iter/s)": 0.334379
170
+ },
171
+ {
172
+ "acc": 0.90086946,
173
+ "epoch": 0.5818181818181818,
174
+ "grad_norm": 3.9972259822599243,
175
+ "learning_rate": 9.755307053217622e-06,
176
+ "loss": 0.27588401,
177
+ "memory(GiB)": 127.96,
178
+ "step": 80,
179
+ "train_speed(iter/s)": 0.336004
180
+ },
181
+ {
182
+ "acc": 0.89949837,
183
+ "epoch": 0.6181818181818182,
184
+ "grad_norm": 5.998240972031008,
185
+ "learning_rate": 9.706476258754834e-06,
186
+ "loss": 0.25472341,
187
+ "memory(GiB)": 127.96,
188
+ "step": 85,
189
+ "train_speed(iter/s)": 0.337291
190
+ },
191
+ {
192
+ "acc": 0.88558121,
193
+ "epoch": 0.6545454545454545,
194
+ "grad_norm": 2.7186082929792574,
195
+ "learning_rate": 9.653351585569786e-06,
196
+ "loss": 0.28254557,
197
+ "memory(GiB)": 127.96,
198
+ "step": 90,
199
+ "train_speed(iter/s)": 0.337576
200
+ },
201
+ {
202
+ "acc": 0.90562687,
203
+ "epoch": 0.6909090909090909,
204
+ "grad_norm": 1.6880555029124777,
205
+ "learning_rate": 9.595981506262264e-06,
206
+ "loss": 0.25460241,
207
+ "memory(GiB)": 127.96,
208
+ "step": 95,
209
+ "train_speed(iter/s)": 0.338319
210
+ },
211
+ {
212
+ "acc": 0.90238457,
213
+ "epoch": 0.7272727272727273,
214
+ "grad_norm": 1.824873702466673,
215
+ "learning_rate": 9.534418367072303e-06,
216
+ "loss": 0.25135682,
217
+ "memory(GiB)": 127.96,
218
+ "step": 100,
219
+ "train_speed(iter/s)": 0.33935
220
+ },
221
+ {
222
+ "acc": 0.90719824,
223
+ "epoch": 0.7636363636363637,
224
+ "grad_norm": 3.0523518026276926,
225
+ "learning_rate": 9.468718340117846e-06,
226
+ "loss": 0.23181794,
227
+ "memory(GiB)": 127.96,
228
+ "step": 105,
229
+ "train_speed(iter/s)": 0.340475
230
+ },
231
+ {
232
+ "acc": 0.89296656,
233
+ "epoch": 0.8,
234
+ "grad_norm": 3.6744833597367514,
235
+ "learning_rate": 9.398941372141562e-06,
236
+ "loss": 0.27924564,
237
+ "memory(GiB)": 127.96,
238
+ "step": 110,
239
+ "train_speed(iter/s)": 0.341456
240
+ },
241
+ {
242
+ "acc": 0.89754677,
243
+ "epoch": 0.8363636363636363,
244
+ "grad_norm": 3.250222318126925,
245
+ "learning_rate": 9.325151129813582e-06,
246
+ "loss": 0.26513102,
247
+ "memory(GiB)": 127.96,
248
+ "step": 115,
249
+ "train_speed(iter/s)": 0.342153
250
+ },
251
+ {
252
+ "acc": 0.88903837,
253
+ "epoch": 0.8727272727272727,
254
+ "grad_norm": 2.376728799007849,
255
+ "learning_rate": 9.247414941640045e-06,
256
+ "loss": 0.30169072,
257
+ "memory(GiB)": 133.76,
258
+ "step": 120,
259
+ "train_speed(iter/s)": 0.342998
260
+ },
261
+ {
262
+ "acc": 0.89329395,
263
+ "epoch": 0.9090909090909091,
264
+ "grad_norm": 4.889478322316845,
265
+ "learning_rate": 9.165803736530492e-06,
266
+ "loss": 0.28302565,
267
+ "memory(GiB)": 100.58,
268
+ "step": 125,
269
+ "train_speed(iter/s)": 0.343779
270
+ },
271
+ {
272
+ "acc": 0.89977417,
273
+ "epoch": 0.9454545454545454,
274
+ "grad_norm": 2.0057917841024633,
275
+ "learning_rate": 9.080391979080116e-06,
276
+ "loss": 0.2668047,
277
+ "memory(GiB)": 100.58,
278
+ "step": 130,
279
+ "train_speed(iter/s)": 0.344351
280
+ },
281
+ {
282
+ "acc": 0.90148487,
283
+ "epoch": 0.9818181818181818,
284
+ "grad_norm": 2.470715179920895,
285
+ "learning_rate": 8.991257601625973e-06,
286
+ "loss": 0.25751991,
287
+ "memory(GiB)": 100.58,
288
+ "step": 135,
289
+ "train_speed(iter/s)": 0.345171
290
+ },
291
+ {
292
+ "epoch": 1.0,
293
+ "eval_acc": 0.9078246620237608,
294
+ "eval_loss": 0.2361508309841156,
295
+ "eval_runtime": 10.278,
296
+ "eval_samples_per_second": 11.286,
297
+ "eval_steps_per_second": 1.459,
298
+ "step": 138
299
+ },
300
+ {
301
+ "acc": 0.8134038,
302
+ "epoch": 1.0145454545454546,
303
+ "grad_norm": 1.9385369249323439,
304
+ "learning_rate": 8.917324354080927e-06,
305
+ "loss": 0.254459,
306
+ "memory(GiB)": 100.58,
307
+ "step": 140,
308
+ "train_speed(iter/s)": 0.309598
309
+ },
310
+ {
311
+ "acc": 0.90728855,
312
+ "epoch": 1.050909090909091,
313
+ "grad_norm": 76.54794008048425,
314
+ "learning_rate": 8.82169644486897e-06,
315
+ "loss": 0.23623853,
316
+ "memory(GiB)": 100.58,
317
+ "step": 145,
318
+ "train_speed(iter/s)": 0.311044
319
+ },
320
+ {
321
+ "acc": 0.91997566,
322
+ "epoch": 1.0872727272727274,
323
+ "grad_norm": 1.727673298537959,
324
+ "learning_rate": 8.722581957483633e-06,
325
+ "loss": 0.21817675,
326
+ "memory(GiB)": 100.58,
327
+ "step": 150,
328
+ "train_speed(iter/s)": 0.31275
329
+ },
330
+ {
331
+ "acc": 0.91184559,
332
+ "epoch": 1.1236363636363635,
333
+ "grad_norm": 2.4370845690665974,
334
+ "learning_rate": 8.620071327057833e-06,
335
+ "loss": 0.22411692,
336
+ "memory(GiB)": 100.58,
337
+ "step": 155,
338
+ "train_speed(iter/s)": 0.314364
339
+ },
340
+ {
341
+ "acc": 0.91105995,
342
+ "epoch": 1.16,
343
+ "grad_norm": 4.474578962221848,
344
+ "learning_rate": 8.514258087470745e-06,
345
+ "loss": 0.22455444,
346
+ "memory(GiB)": 100.58,
347
+ "step": 160,
348
+ "train_speed(iter/s)": 0.315941
349
+ },
350
+ {
351
+ "acc": 0.92596989,
352
+ "epoch": 1.1963636363636363,
353
+ "grad_norm": 2.27714865436083,
354
+ "learning_rate": 8.405238786004592e-06,
355
+ "loss": 0.19618599,
356
+ "memory(GiB)": 100.58,
357
+ "step": 165,
358
+ "train_speed(iter/s)": 0.317423
359
+ },
360
+ {
361
+ "acc": 0.91807003,
362
+ "epoch": 1.2327272727272727,
363
+ "grad_norm": 3.476526282944283,
364
+ "learning_rate": 8.293112895251915e-06,
365
+ "loss": 0.21812358,
366
+ "memory(GiB)": 100.58,
367
+ "step": 170,
368
+ "train_speed(iter/s)": 0.318837
369
+ },
370
+ {
371
+ "acc": 0.91757879,
372
+ "epoch": 1.269090909090909,
373
+ "grad_norm": 2.812345046742586,
374
+ "learning_rate": 8.177982722353686e-06,
375
+ "loss": 0.20932765,
376
+ "memory(GiB)": 100.58,
377
+ "step": 175,
378
+ "train_speed(iter/s)": 0.319897
379
+ },
380
+ {
381
+ "acc": 0.9130724,
382
+ "epoch": 1.3054545454545454,
383
+ "grad_norm": 1.909403498812979,
384
+ "learning_rate": 8.059953315651102e-06,
385
+ "loss": 0.22100675,
386
+ "memory(GiB)": 100.58,
387
+ "step": 180,
388
+ "train_speed(iter/s)": 0.320821
389
+ },
390
+ {
391
+ "acc": 0.91083689,
392
+ "epoch": 1.3418181818181818,
393
+ "grad_norm": 3.7534483781265853,
394
+ "learning_rate": 7.93913236883622e-06,
395
+ "loss": 0.22075479,
396
+ "memory(GiB)": 100.58,
397
+ "step": 185,
398
+ "train_speed(iter/s)": 0.321724
399
+ },
400
+ {
401
+ "acc": 0.90749474,
402
+ "epoch": 1.3781818181818182,
403
+ "grad_norm": 3.0657460772043805,
404
+ "learning_rate": 7.815630122688893e-06,
405
+ "loss": 0.22630196,
406
+ "memory(GiB)": 100.58,
407
+ "step": 190,
408
+ "train_speed(iter/s)": 0.3226
409
+ },
410
+ {
411
+ "acc": 0.92584915,
412
+ "epoch": 1.4145454545454546,
413
+ "grad_norm": 5.821099128946982,
414
+ "learning_rate": 7.689559264489661e-06,
415
+ "loss": 0.21087196,
416
+ "memory(GiB)": 100.58,
417
+ "step": 195,
418
+ "train_speed(iter/s)": 0.32333
419
+ },
420
+ {
421
+ "acc": 0.90973835,
422
+ "epoch": 1.450909090909091,
423
+ "grad_norm": 1.830285233435649,
424
+ "learning_rate": 7.5610348252003814e-06,
425
+ "loss": 0.24081864,
426
+ "memory(GiB)": 100.58,
427
+ "step": 200,
428
+ "train_speed(iter/s)": 0.323755
429
+ },
430
+ {
431
+ "acc": 0.91908627,
432
+ "epoch": 1.4872727272727273,
433
+ "grad_norm": 3.46434543645635,
434
+ "learning_rate": 7.43017407450641e-06,
435
+ "loss": 0.21430855,
436
+ "memory(GiB)": 100.58,
437
+ "step": 205,
438
+ "train_speed(iter/s)": 0.324304
439
+ },
440
+ {
441
+ "acc": 0.90855217,
442
+ "epoch": 1.5236363636363637,
443
+ "grad_norm": 1.6445934060533671,
444
+ "learning_rate": 7.2970964138161006e-06,
445
+ "loss": 0.2204694,
446
+ "memory(GiB)": 100.58,
447
+ "step": 210,
448
+ "train_speed(iter/s)": 0.325137
449
+ },
450
+ {
451
+ "acc": 0.9202652,
452
+ "epoch": 1.56,
453
+ "grad_norm": 2.685739587728944,
454
+ "learning_rate": 7.161923267315262e-06,
455
+ "loss": 0.20784543,
456
+ "memory(GiB)": 100.58,
457
+ "step": 215,
458
+ "train_speed(iter/s)": 0.325877
459
+ },
460
+ {
461
+ "acc": 0.92430801,
462
+ "epoch": 1.5963636363636362,
463
+ "grad_norm": 3.4665236755524202,
464
+ "learning_rate": 7.0247779711759566e-06,
465
+ "loss": 0.2091445,
466
+ "memory(GiB)": 100.58,
467
+ "step": 220,
468
+ "train_speed(iter/s)": 0.326598
469
+ },
470
+ {
471
+ "acc": 0.91858587,
472
+ "epoch": 1.6327272727272728,
473
+ "grad_norm": 3.0400419237318674,
474
+ "learning_rate": 6.885785661020759e-06,
475
+ "loss": 0.22234173,
476
+ "memory(GiB)": 100.58,
477
+ "step": 225,
478
+ "train_speed(iter/s)": 0.32754
479
+ },
480
+ {
481
+ "acc": 0.91896229,
482
+ "epoch": 1.669090909090909,
483
+ "grad_norm": 2.50023791606214,
484
+ "learning_rate": 6.7450731577451255e-06,
485
+ "loss": 0.20558548,
486
+ "memory(GiB)": 100.58,
487
+ "step": 230,
488
+ "train_speed(iter/s)": 0.328407
489
+ },
490
+ {
491
+ "acc": 0.92307997,
492
+ "epoch": 1.7054545454545456,
493
+ "grad_norm": 2.789509587118081,
494
+ "learning_rate": 6.602768851802077e-06,
495
+ "loss": 0.21382501,
496
+ "memory(GiB)": 100.58,
497
+ "step": 235,
498
+ "train_speed(iter/s)": 0.329247
499
+ },
500
+ {
501
+ "acc": 0.91400127,
502
+ "epoch": 1.7418181818181817,
503
+ "grad_norm": 2.3889266426439173,
504
+ "learning_rate": 6.45900258605477e-06,
505
+ "loss": 0.21889751,
506
+ "memory(GiB)": 100.58,
507
+ "step": 240,
508
+ "train_speed(iter/s)": 0.330086
509
+ },
510
+ {
511
+ "acc": 0.90683708,
512
+ "epoch": 1.7781818181818183,
513
+ "grad_norm": 3.3107240552086465,
514
+ "learning_rate": 6.313905537303837e-06,
515
+ "loss": 0.21690502,
516
+ "memory(GiB)": 100.58,
517
+ "step": 245,
518
+ "train_speed(iter/s)": 0.330898
519
+ },
520
+ {
521
+ "acc": 0.91603336,
522
+ "epoch": 1.8145454545454545,
523
+ "grad_norm": 2.8852486239120547,
524
+ "learning_rate": 6.167610096597601e-06,
525
+ "loss": 0.2154119,
526
+ "memory(GiB)": 100.58,
527
+ "step": 250,
528
+ "train_speed(iter/s)": 0.331673
529
+ },
530
+ {
531
+ "acc": 0.91818409,
532
+ "epoch": 1.850909090909091,
533
+ "grad_norm": 2.0440810660323585,
534
+ "learning_rate": 6.020249748434384e-06,
535
+ "loss": 0.21951377,
536
+ "memory(GiB)": 100.58,
537
+ "step": 255,
538
+ "train_speed(iter/s)": 0.332356
539
+ },
540
+ {
541
+ "acc": 0.90970173,
542
+ "epoch": 1.8872727272727272,
543
+ "grad_norm": 3.8117037313040574,
544
+ "learning_rate": 5.871958948967106e-06,
545
+ "loss": 0.23594971,
546
+ "memory(GiB)": 100.58,
547
+ "step": 260,
548
+ "train_speed(iter/s)": 0.33293
549
+ },
550
+ {
551
+ "acc": 0.92123165,
552
+ "epoch": 1.9236363636363636,
553
+ "grad_norm": 3.4855685769436375,
554
+ "learning_rate": 5.722873003321322e-06,
555
+ "loss": 0.21117101,
556
+ "memory(GiB)": 100.58,
557
+ "step": 265,
558
+ "train_speed(iter/s)": 0.333662
559
+ },
560
+ {
561
+ "acc": 0.91777382,
562
+ "epoch": 1.96,
563
+ "grad_norm": 2.497000906964384,
564
+ "learning_rate": 5.573127942138622e-06,
565
+ "loss": 0.21624155,
566
+ "memory(GiB)": 100.58,
567
+ "step": 270,
568
+ "train_speed(iter/s)": 0.334225
569
+ },
570
+ {
571
+ "acc": 0.9166666,
572
+ "epoch": 1.9963636363636363,
573
+ "grad_norm": 4.782654736901845,
574
+ "learning_rate": 5.422860397458064e-06,
575
+ "loss": 0.21392875,
576
+ "memory(GiB)": 100.58,
577
+ "step": 275,
578
+ "train_speed(iter/s)": 0.334671
579
+ },
580
+ {
581
+ "epoch": 2.0,
582
+ "eval_acc": 0.9098730028676771,
583
+ "eval_loss": 0.2191523164510727,
584
+ "eval_runtime": 10.1618,
585
+ "eval_samples_per_second": 11.415,
586
+ "eval_steps_per_second": 1.476,
587
+ "step": 276
588
+ },
589
+ {
590
+ "acc": 0.84443541,
591
+ "epoch": 2.0290909090909093,
592
+ "grad_norm": 3.015403395241152,
593
+ "learning_rate": 5.27220747804885e-06,
594
+ "loss": 0.17099829,
595
+ "memory(GiB)": 100.58,
596
+ "step": 280,
597
+ "train_speed(iter/s)": 0.317633
598
+ },
599
+ {
600
+ "acc": 0.93253222,
601
+ "epoch": 2.0654545454545454,
602
+ "grad_norm": 2.167435558475328,
603
+ "learning_rate": 5.121306644308045e-06,
604
+ "loss": 0.18818057,
605
+ "memory(GiB)": 100.58,
606
+ "step": 285,
607
+ "train_speed(iter/s)": 0.3185
608
+ },
609
+ {
610
+ "acc": 0.94647446,
611
+ "epoch": 2.101818181818182,
612
+ "grad_norm": 2.1487311628542898,
613
+ "learning_rate": 4.9702955828374385e-06,
614
+ "loss": 0.15134431,
615
+ "memory(GiB)": 100.58,
616
+ "step": 290,
617
+ "train_speed(iter/s)": 0.319277
618
+ },
619
+ {
620
+ "acc": 0.93036728,
621
+ "epoch": 2.138181818181818,
622
+ "grad_norm": 4.174051904681519,
623
+ "learning_rate": 4.8193120808140185e-06,
624
+ "loss": 0.16832316,
625
+ "memory(GiB)": 100.58,
626
+ "step": 295,
627
+ "train_speed(iter/s)": 0.320077
628
+ },
629
+ {
630
+ "acc": 0.93621769,
631
+ "epoch": 2.174545454545455,
632
+ "grad_norm": 2.3866390406657896,
633
+ "learning_rate": 4.668493900268684e-06,
634
+ "loss": 0.16947901,
635
+ "memory(GiB)": 100.58,
636
+ "step": 300,
637
+ "train_speed(iter/s)": 0.320854
638
+ },
639
+ {
640
+ "acc": 0.93184824,
641
+ "epoch": 2.210909090909091,
642
+ "grad_norm": 2.7745369730901595,
643
+ "learning_rate": 4.517978652387882e-06,
644
+ "loss": 0.16975009,
645
+ "memory(GiB)": 100.58,
646
+ "step": 305,
647
+ "train_speed(iter/s)": 0.321626
648
+ },
649
+ {
650
+ "acc": 0.93711929,
651
+ "epoch": 2.247272727272727,
652
+ "grad_norm": 4.606104787695004,
653
+ "learning_rate": 4.367903671952906e-06,
654
+ "loss": 0.16885712,
655
+ "memory(GiB)": 100.58,
656
+ "step": 310,
657
+ "train_speed(iter/s)": 0.322203
658
+ },
659
+ {
660
+ "acc": 0.93099174,
661
+ "epoch": 2.2836363636363637,
662
+ "grad_norm": 8.944877147631175,
663
+ "learning_rate": 4.218405892031366e-06,
664
+ "loss": 0.17090337,
665
+ "memory(GiB)": 100.58,
666
+ "step": 315,
667
+ "train_speed(iter/s)": 0.322833
668
+ },
669
+ {
670
+ "acc": 0.93137035,
671
+ "epoch": 2.32,
672
+ "grad_norm": 4.336121777570645,
673
+ "learning_rate": 4.069621719035229e-06,
674
+ "loss": 0.1658249,
675
+ "memory(GiB)": 100.58,
676
+ "step": 320,
677
+ "train_speed(iter/s)": 0.323508
678
+ },
679
+ {
680
+ "acc": 0.9393259,
681
+ "epoch": 2.3563636363636364,
682
+ "grad_norm": 6.921537975970479,
683
+ "learning_rate": 3.921686908259354e-06,
684
+ "loss": 0.15576041,
685
+ "memory(GiB)": 100.58,
686
+ "step": 325,
687
+ "train_speed(iter/s)": 0.324182
688
+ },
689
+ {
690
+ "acc": 0.93962708,
691
+ "epoch": 2.3927272727272726,
692
+ "grad_norm": 3.5886891547630877,
693
+ "learning_rate": 3.7747364400141726e-06,
694
+ "loss": 0.16867373,
695
+ "memory(GiB)": 100.58,
696
+ "step": 330,
697
+ "train_speed(iter/s)": 0.324849
698
+ },
699
+ {
700
+ "acc": 0.93609505,
701
+ "epoch": 2.429090909090909,
702
+ "grad_norm": 2.686999433312404,
703
+ "learning_rate": 3.6289043964654526e-06,
704
+ "loss": 0.15810946,
705
+ "memory(GiB)": 100.58,
706
+ "step": 335,
707
+ "train_speed(iter/s)": 0.325493
708
+ },
709
+ {
710
+ "acc": 0.92649899,
711
+ "epoch": 2.4654545454545453,
712
+ "grad_norm": 2.591872854237207,
713
+ "learning_rate": 3.484323839293575e-06,
714
+ "loss": 0.17918372,
715
+ "memory(GiB)": 100.58,
716
+ "step": 340,
717
+ "train_speed(iter/s)": 0.326123
718
+ },
719
+ {
720
+ "acc": 0.93626881,
721
+ "epoch": 2.501818181818182,
722
+ "grad_norm": 2.5738296672570233,
723
+ "learning_rate": 3.341126688283922e-06,
724
+ "loss": 0.16855428,
725
+ "memory(GiB)": 100.58,
726
+ "step": 345,
727
+ "train_speed(iter/s)": 0.326743
728
+ },
729
+ {
730
+ "acc": 0.93825417,
731
+ "epoch": 2.538181818181818,
732
+ "grad_norm": 2.7529925608546466,
733
+ "learning_rate": 3.19944360095919e-06,
734
+ "loss": 0.16165339,
735
+ "memory(GiB)": 100.58,
736
+ "step": 350,
737
+ "train_speed(iter/s)": 0.327363
738
+ },
739
+ {
740
+ "acc": 0.94702225,
741
+ "epoch": 2.5745454545454547,
742
+ "grad_norm": 2.9545927202945315,
743
+ "learning_rate": 3.059403853363393e-06,
744
+ "loss": 0.14523516,
745
+ "memory(GiB)": 100.58,
746
+ "step": 355,
747
+ "train_speed(iter/s)": 0.327926
748
+ },
749
+ {
750
+ "acc": 0.94346981,
751
+ "epoch": 2.610909090909091,
752
+ "grad_norm": 4.047109124196383,
753
+ "learning_rate": 2.9211352221063987e-06,
754
+ "loss": 0.14715908,
755
+ "memory(GiB)": 100.58,
756
+ "step": 360,
757
+ "train_speed(iter/s)": 0.328285
758
+ },
759
+ {
760
+ "acc": 0.94318542,
761
+ "epoch": 2.6472727272727274,
762
+ "grad_norm": 2.3923230638690143,
763
+ "learning_rate": 2.7847638677765936e-06,
764
+ "loss": 0.1494684,
765
+ "memory(GiB)": 100.58,
766
+ "step": 365,
767
+ "train_speed(iter/s)": 0.328722
768
+ },
769
+ {
770
+ "acc": 0.95623245,
771
+ "epoch": 2.6836363636363636,
772
+ "grad_norm": 2.457260493406828,
773
+ "learning_rate": 2.650414219828032e-06,
774
+ "loss": 0.11759402,
775
+ "memory(GiB)": 100.58,
776
+ "step": 370,
777
+ "train_speed(iter/s)": 0.329264
778
+ },
779
+ {
780
+ "acc": 0.94435921,
781
+ "epoch": 2.7199999999999998,
782
+ "grad_norm": 1.5322367904545142,
783
+ "learning_rate": 2.5182088630471517e-06,
784
+ "loss": 0.13577256,
785
+ "memory(GiB)": 100.58,
786
+ "step": 375,
787
+ "train_speed(iter/s)": 0.329788
788
+ },
789
+ {
790
+ "acc": 0.94585953,
791
+ "epoch": 2.7563636363636363,
792
+ "grad_norm": 2.8650025435958666,
793
+ "learning_rate": 2.388268425702614e-06,
794
+ "loss": 0.14076474,
795
+ "memory(GiB)": 100.58,
796
+ "step": 380,
797
+ "train_speed(iter/s)": 0.330302
798
+ },
799
+ {
800
+ "acc": 0.9413455,
801
+ "epoch": 2.792727272727273,
802
+ "grad_norm": 4.510750432829035,
803
+ "learning_rate": 2.2607114694803263e-06,
804
+ "loss": 0.1642381,
805
+ "memory(GiB)": 100.58,
806
+ "step": 385,
807
+ "train_speed(iter/s)": 0.330731
808
+ },
809
+ {
810
+ "acc": 0.93006382,
811
+ "epoch": 2.829090909090909,
812
+ "grad_norm": 2.908591189518448,
813
+ "learning_rate": 2.1356543813040863e-06,
814
+ "loss": 0.17094066,
815
+ "memory(GiB)": 100.58,
816
+ "step": 390,
817
+ "train_speed(iter/s)": 0.331119
818
+ },
819
+ {
820
+ "acc": 0.94227448,
821
+ "epoch": 2.8654545454545453,
822
+ "grad_norm": 2.331626905910975,
823
+ "learning_rate": 2.0132112671405244e-06,
824
+ "loss": 0.14904225,
825
+ "memory(GiB)": 100.58,
826
+ "step": 395,
827
+ "train_speed(iter/s)": 0.331532
828
+ },
829
+ {
830
+ "acc": 0.93090382,
831
+ "epoch": 2.901818181818182,
832
+ "grad_norm": 4.223665768837086,
833
+ "learning_rate": 1.8934938478853108e-06,
834
+ "loss": 0.17768097,
835
+ "memory(GiB)": 100.58,
836
+ "step": 400,
837
+ "train_speed(iter/s)": 0.331963
838
+ },
839
+ {
840
+ "acc": 0.93722563,
841
+ "epoch": 2.9381818181818184,
842
+ "grad_norm": 2.7247775486261734,
843
+ "learning_rate": 1.7766113574255145e-06,
844
+ "loss": 0.15059752,
845
+ "memory(GiB)": 100.58,
846
+ "step": 405,
847
+ "train_speed(iter/s)": 0.332266
848
+ },
849
+ {
850
+ "acc": 0.94374504,
851
+ "epoch": 2.9745454545454546,
852
+ "grad_norm": 2.9951618135706055,
853
+ "learning_rate": 1.6626704429712411e-06,
854
+ "loss": 0.14953468,
855
+ "memory(GiB)": 100.58,
856
+ "step": 410,
857
+ "train_speed(iter/s)": 0.332599
858
+ },
859
+ {
860
+ "epoch": 3.0,
861
+ "eval_acc": 0.9192953707496927,
862
+ "eval_loss": 0.21225064992904663,
863
+ "eval_runtime": 9.5239,
864
+ "eval_samples_per_second": 12.18,
865
+ "eval_steps_per_second": 1.575,
866
+ "step": 414
867
+ }
868
+ ],
869
+ "logging_steps": 5,
870
+ "max_steps": 548,
871
+ "num_input_tokens_seen": 0,
872
+ "num_train_epochs": 4,
873
+ "save_steps": 200,
874
+ "stateful_callbacks": {
875
+ "TrainerControl": {
876
+ "args": {
877
+ "should_epoch_stop": false,
878
+ "should_evaluate": false,
879
+ "should_log": false,
880
+ "should_save": true,
881
+ "should_training_stop": false
882
+ },
883
+ "attributes": {}
884
+ }
885
+ },
886
+ "total_flos": 2.931788793840435e+16,
887
+ "train_batch_size": 1,
888
+ "trial_name": null,
889
+ "trial_params": null
890
+ }
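A short sketch of inspecting the trainer state logged above; it assumes trainer_state.json is read from this checkpoint directory.

import json

with open("trainer_state.json") as f:  # assumed path inside the checkpoint folder
    state = json.load(f)

# Best checkpoint as recorded by the trainer: eval_loss 0.21225065 at checkpoint-414.
print(state["best_metric"], state["best_model_checkpoint"])

# Per-epoch evaluation entries are the log_history items that carry eval_loss.
for e in (x for x in state["log_history"] if "eval_loss" in x):
    print(f'epoch {e["epoch"]}: eval_loss={e["eval_loss"]:.4f}, eval_acc={e["eval_acc"]:.4f}')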
training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff027f5be53ec7518dcbefa171fe5337bc61486082a24c11c0c08315322b5a87
3
+ size 10680
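training_args.bin is stored as a Git LFS pointer to a pickled TrainingArguments object. A hedged sketch of loading it for inspection, assuming the pickle is trusted and torch/transformers are installed:

import torch

# weights_only=False because the file pickles a TrainingArguments instance, not plain tensors.
args = torch.load("training_args.bin", map_location="cpu", weights_only=False)
print(type(args).__name__)
print(args.learning_rate, args.num_train_epochs, args.per_device_train_batch_size)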
vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
zero_to_fp32.py ADDED
@@ -0,0 +1,760 @@
1
+ #!/usr/bin/env python
2
+
3
+ # Copyright (c) Microsoft Corporation.
4
+ # SPDX-License-Identifier: Apache-2.0
5
+
6
+ # DeepSpeed Team
7
+
8
+ # This script extracts fp32 consolidated weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
9
+ # copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
10
+ # the future. Once extracted, the weights don't require DeepSpeed and can be used in any
11
+ # application.
12
+ #
13
+ # example:
14
+ # python zero_to_fp32.py . output_dir/
15
+ # or
16
+ # python zero_to_fp32.py . output_dir/ --safe_serialization
17
+
18
+ import argparse
19
+ import torch
20
+ import glob
21
+ import math
22
+ import os
23
+ import re
24
+ import gc
25
+ import json
26
+ import numpy as np
27
+ from tqdm import tqdm
28
+ from collections import OrderedDict
29
+ from dataclasses import dataclass
30
+
31
+ # while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
32
+ # DeepSpeed data structures it has to be available in the current python environment.
33
+ from deepspeed.utils import logger
34
+ from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
35
+ FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
36
+ FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
37
+
38
+
39
+ @dataclass
40
+ class zero_model_state:
41
+ buffers: dict()
42
+ param_shapes: dict()
43
+ shared_params: list
44
+ ds_version: int
45
+ frozen_param_shapes: dict()
46
+ frozen_param_fragments: dict()
47
+
48
+
49
+ debug = 0
50
+
51
+ # load to cpu
52
+ device = torch.device('cpu')
53
+
54
+
55
+ def atoi(text):
56
+ return int(text) if text.isdigit() else text
57
+
58
+
59
+ def natural_keys(text):
60
+ '''
61
+ alist.sort(key=natural_keys) sorts in human order
62
+ http://nedbatchelder.com/blog/200712/human_sorting.html
63
+ (See Toothy's implementation in the comments)
64
+ '''
65
+ return [atoi(c) for c in re.split(r'(\d+)', text)]
66
+
67
+
68
+ def get_model_state_file(checkpoint_dir, zero_stage):
69
+ if not os.path.isdir(checkpoint_dir):
70
+ raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
71
+
72
+ # there should be only one file
73
+ if zero_stage <= 2:
74
+ file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
75
+ elif zero_stage == 3:
76
+ file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
77
+
78
+ if not os.path.exists(file):
79
+ raise FileNotFoundError(f"can't find model states file at '{file}'")
80
+
81
+ return file
82
+
83
+
84
+ def get_checkpoint_files(checkpoint_dir, glob_pattern):
85
+ # XXX: need to test that this simple glob rule works for multi-node setup too
86
+ ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
87
+
88
+ if len(ckpt_files) == 0:
89
+ raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
90
+
91
+ return ckpt_files
92
+
93
+
94
+ def get_optim_files(checkpoint_dir):
95
+ return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
96
+
97
+
98
+ def get_model_state_files(checkpoint_dir):
99
+ return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
100
+
101
+
102
+ def parse_model_states(files):
103
+ zero_model_states = []
104
+ for file in files:
105
+ state_dict = torch.load(file, map_location=device, weights_only=False)
106
+
107
+ if BUFFER_NAMES not in state_dict:
108
+ raise ValueError(f"{file} is not a model state checkpoint")
109
+ buffer_names = state_dict[BUFFER_NAMES]
110
+ if debug:
111
+ print("Found buffers:", buffer_names)
112
+
113
+ # recover just the buffers while restoring them to fp32 if they were saved in fp16
114
+ buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
115
+ param_shapes = state_dict[PARAM_SHAPES]
116
+
117
+ # collect parameters that are included in param_shapes
118
+ param_names = []
119
+ for s in param_shapes:
120
+ for name in s.keys():
121
+ param_names.append(name)
122
+
123
+ # update with frozen parameters
124
+ frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
125
+ if frozen_param_shapes is not None:
126
+ if debug:
127
+ print(f"Found frozen_param_shapes: {frozen_param_shapes}")
128
+ param_names += list(frozen_param_shapes.keys())
129
+
130
+ # handle shared params
131
+ shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
132
+
133
+ ds_version = state_dict.get(DS_VERSION, None)
134
+
135
+ frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
136
+
137
+ z_model_state = zero_model_state(buffers=buffers,
138
+ param_shapes=param_shapes,
139
+ shared_params=shared_params,
140
+ ds_version=ds_version,
141
+ frozen_param_shapes=frozen_param_shapes,
142
+ frozen_param_fragments=frozen_param_fragments)
143
+ zero_model_states.append(z_model_state)
144
+
145
+ return zero_model_states
146
+
147
+
148
+ def parse_optim_states(files, ds_checkpoint_dir):
149
+ total_files = len(files)
150
+ state_dicts = []
151
+ for f in tqdm(files, desc='Loading checkpoint shards'):
152
+ state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False)
153
+ # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
154
+ # and also handle the case where it was already removed by another helper script
155
+ state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
156
+ state_dicts.append(state_dict)
157
+
158
+ if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
159
+ raise ValueError(f"{files[0]} is not a zero checkpoint")
160
+ zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
161
+ world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
162
+
163
+ # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
164
+ # parameters can be different from data parallelism for non-expert parameters. So we can just
165
+ # use the max of the partition_count to get the dp world_size.
166
+
167
+ if type(world_size) is list:
168
+ world_size = max(world_size)
169
+
170
+ if world_size != total_files:
171
+ raise ValueError(
172
+ f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
173
+ "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
174
+ )
175
+
176
+ # the groups are named differently in each stage
177
+ if zero_stage <= 2:
178
+ fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
179
+ elif zero_stage == 3:
180
+ fp32_groups_key = FP32_FLAT_GROUPS
181
+ else:
182
+ raise ValueError(f"unknown zero stage {zero_stage}")
183
+
184
+ fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
185
+ return zero_stage, world_size, fp32_flat_groups
186
+
187
+
188
+ def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
189
+ """
190
+ Returns fp32 state_dict reconstructed from ds checkpoint
191
+
192
+ Args:
193
+ - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
194
+
195
+ """
196
+ print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
197
+
198
+ optim_files = get_optim_files(ds_checkpoint_dir)
199
+ zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
200
+ print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
201
+
202
+ model_files = get_model_state_files(ds_checkpoint_dir)
203
+
204
+ zero_model_states = parse_model_states(model_files)
205
+ print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
206
+
207
+ if zero_stage <= 2:
208
+ return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
209
+ exclude_frozen_parameters)
210
+ elif zero_stage == 3:
211
+ return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
212
+ exclude_frozen_parameters)
213
+
214
+
215
+ def _zero2_merge_frozen_params(state_dict, zero_model_states):
216
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
217
+ return
218
+
219
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
220
+ frozen_param_fragments = zero_model_states[0].frozen_param_fragments
221
+
222
+ if debug:
223
+ num_elem = sum(s.numel() for s in frozen_param_shapes.values())
224
+ print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
225
+
226
+ wanted_params = len(frozen_param_shapes)
227
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
228
+ avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
229
+ print(f'Frozen params: Have {avail_numel} numels to process.')
230
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
231
+
232
+ total_params = 0
233
+ total_numel = 0
234
+ for name, shape in frozen_param_shapes.items():
235
+ total_params += 1
236
+ unpartitioned_numel = shape.numel()
237
+ total_numel += unpartitioned_numel
238
+
239
+ state_dict[name] = frozen_param_fragments[name]
240
+
241
+ if debug:
242
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
243
+
244
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
245
+
246
+
247
+ def _has_callable(obj, fn):
248
+ attr = getattr(obj, fn, None)
249
+ return callable(attr)
250
+
251
+
252
+ def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
253
+ param_shapes = zero_model_states[0].param_shapes
254
+
255
+ # Reconstruction protocol:
256
+ #
257
+ # XXX: document this
258
+
259
+ if debug:
260
+ for i in range(world_size):
261
+ for j in range(len(fp32_flat_groups[0])):
262
+ print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
263
+
264
+ # XXX: memory usage doubles here (zero2)
265
+ num_param_groups = len(fp32_flat_groups[0])
266
+ merged_single_partition_of_fp32_groups = []
267
+ for i in range(num_param_groups):
268
+ merged_partitions = [sd[i] for sd in fp32_flat_groups]
269
+ full_single_fp32_vector = torch.cat(merged_partitions, 0)
270
+ merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
271
+ avail_numel = sum(
272
+ [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
273
+
274
+ if debug:
275
+ wanted_params = sum([len(shapes) for shapes in param_shapes])
276
+ wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
277
+ # not asserting if there is a mismatch due to possible padding
278
+ print(f"Have {avail_numel} numels to process.")
279
+ print(f"Need {wanted_numel} numels in {wanted_params} params.")
280
+
281
+ # params
282
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
283
+ # out-of-core computing solution
284
+ total_numel = 0
285
+ total_params = 0
286
+ for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
287
+ offset = 0
288
+ avail_numel = full_single_fp32_vector.numel()
289
+ for name, shape in shapes.items():
290
+
291
+ unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
292
+ total_numel += unpartitioned_numel
293
+ total_params += 1
294
+
295
+ if debug:
296
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
297
+ state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
298
+ offset += unpartitioned_numel
299
+
300
+ # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
301
+ # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
302
+ # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
303
+ # live optimizer object, so we are checking that the numbers are within the right range
304
+ align_to = 2 * world_size
305
+
306
+ def zero2_align(x):
307
+ return align_to * math.ceil(x / align_to)
308
+
309
+ if debug:
310
+ print(f"original offset={offset}, avail_numel={avail_numel}")
311
+
312
+ offset = zero2_align(offset)
313
+ avail_numel = zero2_align(avail_numel)
314
+
315
+ if debug:
316
+ print(f"aligned offset={offset}, avail_numel={avail_numel}")
317
+
318
+ # Sanity check
319
+ if offset != avail_numel:
320
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
321
+
322
+ print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
323
+
324
+
325
+ def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
326
+ exclude_frozen_parameters):
327
+ state_dict = OrderedDict()
328
+
329
+ # buffers
330
+ buffers = zero_model_states[0].buffers
331
+ state_dict.update(buffers)
332
+ if debug:
333
+ print(f"added {len(buffers)} buffers")
334
+
335
+ if not exclude_frozen_parameters:
336
+ _zero2_merge_frozen_params(state_dict, zero_model_states)
337
+
338
+ _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
339
+
340
+ # recover shared parameters
341
+ for pair in zero_model_states[0].shared_params:
342
+ if pair[1] in state_dict:
343
+ state_dict[pair[0]] = state_dict[pair[1]]
344
+
345
+ return state_dict
346
+
347
+
348
+ def zero3_partitioned_param_info(unpartitioned_numel, world_size):
349
+ remainder = unpartitioned_numel % world_size
350
+ padding_numel = (world_size - remainder) if remainder else 0
351
+ partitioned_numel = math.ceil(unpartitioned_numel / world_size)
352
+ return partitioned_numel, padding_numel
353
+
354
+
355
+ def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
356
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
357
+ return
358
+
359
+ if debug:
360
+ for i in range(world_size):
361
+ num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
362
+ print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
363
+
364
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
365
+ wanted_params = len(frozen_param_shapes)
366
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
367
+ avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
368
+ print(f'Frozen params: Have {avail_numel} numels to process.')
369
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
370
+
371
+ total_params = 0
372
+ total_numel = 0
373
+ for name, shape in zero_model_states[0].frozen_param_shapes.items():
374
+ total_params += 1
375
+ unpartitioned_numel = shape.numel()
376
+ total_numel += unpartitioned_numel
377
+
378
+ param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
379
+ state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
380
+
381
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
382
+
383
+ if debug:
384
+ print(
385
+ f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
386
+ )
387
+
388
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
389
+
390
+
391
+ class GatheredTensor:
392
+ """
393
+ A pseudo tensor that collects partitioned weights.
394
+ It is more memory efficient when there are multiple groups.
395
+ """
396
+
397
+ def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
398
+ self.flat_groups = flat_groups
399
+ self.flat_groups_offset = flat_groups_offset
400
+ self.offset = offset
401
+ self.partitioned_numel = partitioned_numel
402
+ self.shape = shape
403
+ self.dtype = self.flat_groups[0][0].dtype
404
+
405
+ def contiguous(self):
406
+ """
407
+ Merge partitioned weights from flat_groups into a single tensor.
408
+ """
409
+ end_idx = self.offset + self.partitioned_numel
410
+ world_size = len(self.flat_groups)
411
+ pad_flat_param_chunks = []
412
+
413
+ for rank_i in range(world_size):
414
+ # for each rank, we need to collect weights from related group/groups
415
+ flat_groups_at_rank_i = self.flat_groups[rank_i]
416
+ start_group_id = None
417
+ end_group_id = None
418
+ for group_id in range(len(self.flat_groups_offset)):
419
+ if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]:
420
+ start_group_id = group_id
421
+ if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]:
422
+ end_group_id = group_id
423
+ break
424
+ # collect weights from related group/groups
425
+ for group_id in range(start_group_id, end_group_id + 1):
426
+ flat_tensor = flat_groups_at_rank_i[group_id]
427
+ start_offset = self.offset - self.flat_groups_offset[group_id]
428
+ end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id]
429
+ pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])
430
+
431
+ # collect weights from all ranks
432
+ pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
433
+ param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
434
+ return param
435
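+ # Illustrative usage (an assumption about how callers consume this class):
+ # values placed in the state_dict by _zero3_merge_trainable_params are
+ # GatheredTensor instances, so nothing is materialized until .contiguous()
+ # is called, e.g.
+ #   lazy = state_dict["transformer.wte.weight"]  # hypothetical parameter name
+ #   dense = lazy.contiguous()                    # concatenates the rank shards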
+
436
+
437
+ def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
438
+ param_shapes = zero_model_states[0].param_shapes
439
+ avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size
440
+
441
+ # Reconstruction protocol: For zero3 we need to zip the partitions together at the boundary of each
442
+ # param, re-consolidating each param, while dealing with padding if any
443
+
444
+ # merge list of dicts, preserving order
445
+ param_shapes = {k: v for d in param_shapes for k, v in d.items()}
446
+
447
+ if debug:
448
+ for i in range(world_size):
449
+ print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
450
+
451
+ wanted_params = len(param_shapes)
452
+ wanted_numel = sum(shape.numel() for shape in param_shapes.values())
453
+ # not asserting if there is a mismatch due to possible padding
454
+ avail_numel = fp32_flat_groups[0].numel() * world_size
455
+ print(f"Trainable params: Have {avail_numel} numels to process.")
456
+ print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
457
+
458
+ # params
459
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
460
+ # an out-of-core computing solution
461
+ offset = 0
462
+ total_numel = 0
463
+ total_params = 0
464
+ flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
465
+ for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
466
+ unpartitioned_numel = shape.numel()
467
+ total_numel += unpartitioned_numel
468
+ total_params += 1
469
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
470
+
471
+ if debug:
472
+ print(
473
+ f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
474
+ )
475
+
476
+ # memory efficient tensor
477
+ tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
478
+ state_dict[name] = tensor
479
+ offset += partitioned_numel
480
+
481
+ offset *= world_size
482
+
483
+ # Sanity check
484
+ if offset != avail_numel:
485
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
486
+
487
+ print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
488
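+ # Worked example for the sanity check above (illustrative numbers): two params
+ # of 10 and 6 elements on world_size=4 advance offset by ceil(10/4)=3 and
+ # ceil(6/4)=2, so offset=5 after the loop; offset * world_size = 20 must equal
+ # avail_numel, the padded size of the flat groups summed across ranks.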
+
489
+
490
+ def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
491
+ exclude_frozen_parameters):
492
+ state_dict = OrderedDict()
493
+
494
+ # buffers
495
+ buffers = zero_model_states[0].buffers
496
+ state_dict.update(buffers)
497
+ if debug:
498
+ print(f"added {len(buffers)} buffers")
499
+
500
+ if not exclude_frozen_parameters:
501
+ _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
502
+
503
+ _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
504
+
505
+ # recover shared parameters
506
+ for pair in zero_model_states[0].shared_params:
507
+ if pair[1] in state_dict:
508
+ state_dict[pair[0]] = state_dict[pair[1]]
509
+
510
+ return state_dict
511
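+ # Note on the shared-parameter loop above: shared_params holds (alias, source)
+ # name pairs recorded at save time; pointing the alias at the reconstructed
+ # source tensor keeps tied weights tied in the consolidated state_dict.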
+
512
+
513
+ def to_torch_tensor(state_dict, return_empty_tensor=False):
514
+ """
515
+ Convert state_dict of GatheredTensor to torch tensor
516
+ """
517
+ torch_state_dict = {}
518
+ converted_tensors = {}
519
+ for name, tensor in state_dict.items():
520
+ tensor_id = id(tensor)
521
+ if tensor_id in converted_tensors: # shared tensors
522
+ shared_tensor = torch_state_dict[converted_tensors[tensor_id]]
523
+ torch_state_dict[name] = shared_tensor
524
+ else:
525
+ converted_tensors[tensor_id] = name
526
+ if return_empty_tensor:
527
+ torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype)
528
+ else:
529
+ torch_state_dict[name] = tensor.contiguous()
530
+ return torch_state_dict
531
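+ # Illustrative note: the id()-based dedup above means entries that point at the
+ # same GatheredTensor (e.g. tied weights recovered via shared_params) are
+ # materialized once and reused under both names.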
+
532
+
533
+ def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
534
+ tag=None,
535
+ exclude_frozen_parameters=False,
536
+ lazy_mode=False):
537
+ """
538
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
539
+ ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
540
+ via a model hub.
541
+
542
+ Args:
543
+ - ``checkpoint_dir``: path to the desired checkpoint folder
544
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided, will attempt to load the tag from the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
545
+ - ``exclude_frozen_parameters``: exclude frozen parameters
546
+ - ``lazy_mode``: get the state_dict in lazy mode. It returns a dict of pseudo tensors instead of torch tensors, which is more memory efficient.
547
+ Convert a pseudo tensor to a torch tensor by calling ``.contiguous()`` on it
548
+
549
+ Returns:
550
+ - pytorch ``state_dict``
551
+
552
+ A typical usage might be ::
553
+
554
+ from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
555
+ # do the training and checkpoint saving
556
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
557
+ model = model.cpu() # move to cpu
558
+ model.load_state_dict(state_dict)
559
+ # submit to model hub or save the model to share with others
560
+
561
+ In this example the ``model`` will no longer be usable in the deepspeed context of the same
562
+ application, i.e. you will need to re-initialize the deepspeed engine, since
563
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
564
+
565
+ If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
566
+
567
+ Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
568
+ You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
569
+ the checkpoint. Or you can load state_dict in lazy mode ::
570
+
571
+ from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
572
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
573
+ for name, lazy_tensor in state_dict.items():
574
+ tensor = lazy_tensor.contiguous() # to cpu
575
+ print(name, tensor)
576
+ # del tensor to release memory if it is no longer in use
577
+ """
578
+ if tag is None:
579
+ latest_path = os.path.join(checkpoint_dir, 'latest')
580
+ if os.path.isfile(latest_path):
581
+ with open(latest_path, 'r') as fd:
582
+ tag = fd.read().strip()
583
+ else:
584
+ raise ValueError(f"Unable to find 'latest' file at {latest_path}")
585
+
586
+ ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
587
+
588
+ if not os.path.isdir(ds_checkpoint_dir):
589
+ raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
590
+
591
+ state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
592
+ if lazy_mode:
593
+ return state_dict
594
+ else:
595
+ return to_torch_tensor(state_dict)
596
+
597
+
598
+ def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
599
+ output_dir,
600
+ max_shard_size="5GB",
601
+ safe_serialization=False,
602
+ tag=None,
603
+ exclude_frozen_parameters=False):
604
+ """
605
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
606
+ loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
607
+
608
+ Args:
609
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
610
+ - ``output_dir``: directory for the pytorch fp32 state_dict output files
611
+ - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB
612
+ - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
613
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided, will attempt to load the tag from the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
614
+ - ``exclude_frozen_parameters``: exclude frozen parameters
615
+ """
616
+
617
+ # Dependency pre-check
618
+ if safe_serialization:
619
+ try:
620
+ from safetensors.torch import save_file
621
+ except ImportError:
622
+ print('If you want to use `safe_serialization`, please `pip install safetensors`')
623
+ raise
624
+ if max_shard_size is not None:
625
+ try:
626
+ from huggingface_hub import split_torch_state_dict_into_shards
627
+ except ImportError:
628
+ print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
629
+ raise
630
+
631
+ # Convert zero checkpoint to state_dict
632
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
633
+ tag,
634
+ exclude_frozen_parameters,
635
+ lazy_mode=True)
636
+
637
+ # Shard the model if it is too big.
638
+ weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
639
+ if max_shard_size is not None:
640
+ filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
641
+ # a memory-efficient approach to sharding
642
+ empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
643
+ state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
644
+ filename_pattern=filename_pattern,
645
+ max_shard_size=max_shard_size)
646
+ else:
647
+ from collections import namedtuple
648
+ StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
649
+ state_dict_split = StateDictSplit(is_sharded=False,
650
+ filename_to_tensors={weights_name: list(state_dict.keys())})
651
+
652
+ # Save the model shard by shard
653
+ os.makedirs(output_dir, exist_ok=True)
654
+ filename_to_tensors = state_dict_split.filename_to_tensors.items()
655
+ for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
656
+ shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
657
+ shard_state_dict = to_torch_tensor(shard_state_dict)
658
+ output_path = os.path.join(output_dir, shard_file)
659
+ if safe_serialization:
660
+ save_file(shard_state_dict, output_path, metadata={"format": "pt"})
661
+ else:
662
+ torch.save(shard_state_dict, output_path)
663
+ # release the memory of current shard
664
+ for tensor_name in list(shard_state_dict.keys()):
665
+ del state_dict[tensor_name]
666
+ del shard_state_dict[tensor_name]
667
+ del shard_state_dict
668
+ gc.collect()
669
+
670
+ # Save index if sharded
671
+ if state_dict_split.is_sharded:
672
+ index = {
673
+ "metadata": state_dict_split.metadata,
674
+ "weight_map": state_dict_split.tensor_to_filename,
675
+ }
676
+ save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
677
+ save_index_file = os.path.join(output_dir, save_index_file)
678
+ with open(save_index_file, "w", encoding="utf-8") as f:
679
+ content = json.dumps(index, indent=2, sort_keys=True) + "\n"
680
+ f.write(content)
681
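+ # Minimal usage sketch (paths are placeholders and the import assumes this
+ # script is on the working path, both assumptions for illustration only):
+ #   from zero_to_fp32 import convert_zero_checkpoint_to_fp32_state_dict
+ #   convert_zero_checkpoint_to_fp32_state_dict("path/checkpoint-12",
+ #                                               "path/checkpoint-12-output",
+ #                                               safe_serialization=True)
+ # This writes the consolidated weights into output_dir as one or more shards,
+ # plus an index file when the result is sharded.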
+
682
+
683
+ def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
684
+ """
685
+ 1. Put the provided model on cpu
686
+ 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
687
+ 3. Load it into the provided model
688
+
689
+ Args:
690
+ - ``model``: the model object to update
691
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
692
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided, will attempt to load the tag from the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
693
+
694
+ Returns:
695
+ - ``model``: modified model
696
+
697
+ Make sure you have plenty of CPU memory available before you call this function. If you don't
698
+ have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
699
+ conveniently placed for you in the checkpoint folder.
700
+
701
+ A typical usage might be ::
702
+
703
+ from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
704
+ model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
705
+ # submit to model hub or save the model to share with others
706
+
707
+ Note that once this has been run, the ``model`` will no longer be usable in the deepspeed context
708
+ of the same application, i.e. you will need to re-initialize the deepspeed engine, since
709
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
710
+
711
+ """
712
+ logger.info(f"Extracting fp32 weights")
713
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
714
+
715
+ logger.info(f"Overwriting model with fp32 weights")
716
+ model = model.cpu()
717
+ model.load_state_dict(state_dict, strict=False)
718
+
719
+ return model
720
+
721
+
722
+ if __name__ == "__main__":
723
+ parser = argparse.ArgumentParser()
724
+ parser.add_argument("checkpoint_dir",
725
+ type=str,
726
+ help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
727
+ parser.add_argument("output_dir",
728
+ type=str,
729
+ help="directory to the pytorch fp32 state_dict output files"
730
+ "(e.g. path/checkpoint-12-output/)")
731
+ parser.add_argument(
732
+ "--max_shard_size",
733
+ type=str,
734
+ default="5GB",
735
+ help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size"
736
+ "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`"
737
+ "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances"
738
+ "without CPU OOM issues.")
739
+ parser.add_argument(
740
+ "--safe_serialization",
741
+ default=False,
742
+ action='store_true',
743
+ help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
744
+ parser.add_argument("-t",
745
+ "--tag",
746
+ type=str,
747
+ default=None,
748
+ help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
749
+ parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
750
+ parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
751
+ args = parser.parse_args()
752
+
753
+ debug = args.debug
754
+
755
+ convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
756
+ args.output_dir,
757
+ max_shard_size=args.max_shard_size,
758
+ safe_serialization=args.safe_serialization,
759
+ tag=args.tag,
760
+ exclude_frozen_parameters=args.exclude_frozen_parameters)
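+ # Example invocation (illustrative; paths are placeholders):
+ #   python zero_to_fp32.py path/checkpoint-12 path/checkpoint-12-output \
+ #       --safe_serialization --max_shard_size 5GB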