Upload folder using huggingface_hub

Browse files

Files changed (18) hide show

added_tokens.json +1026 -0
config.json +85 -0
configuration_florence2.py +340 -0
generation_config.json +4 -0
latest +1 -0
merges.txt +0 -0
model.safetensors +3 -0
modeling_florence2.py +0 -0
preprocessor_config.json +39 -0
processing_florence2.py +1090 -0
scheduler.pt +3 -0
special_tokens_map.json +0 -0
tokenizer.json +0 -0
tokenizer_config.json +4 -0
trainer_state.json +0 -0
training_args.bin +3 -0
vocab.json +0 -0
zero_to_fp32.py +592 -0

added_tokens.json ADDED Viewed

	@@ -0,0 +1,1026 @@

+{
+  "</cap>": 51270,
+  "</dcap>": 51274,
+  "</grounding>": 51276,
+  "</ncap>": 51272,
+  "</ocr>": 50268,
+  "</od>": 50266,
+  "</poly>": 51287,
+  "</proposal>": 51285,
+  "</region_cap>": 51281,
+  "</region_to_desciption>": 51283,
+  "</seg>": 51278,
+  "<and>": 51288,
+  "<cap>": 51269,
+  "<dcap>": 51273,
+  "<grounding>": 51275,
+  "<loc_0>": 50269,
+  "<loc_100>": 50369,
+  "<loc_101>": 50370,
+  "<loc_102>": 50371,
+  "<loc_103>": 50372,
+  "<loc_104>": 50373,
+  "<loc_105>": 50374,
+  "<loc_106>": 50375,
+  "<loc_107>": 50376,
+  "<loc_108>": 50377,
+  "<loc_109>": 50378,
+  "<loc_10>": 50279,
+  "<loc_110>": 50379,
+  "<loc_111>": 50380,
+  "<loc_112>": 50381,
+  "<loc_113>": 50382,
+  "<loc_114>": 50383,
+  "<loc_115>": 50384,
+  "<loc_116>": 50385,
+  "<loc_117>": 50386,
+  "<loc_118>": 50387,
+  "<loc_119>": 50388,
+  "<loc_11>": 50280,
+  "<loc_120>": 50389,
+  "<loc_121>": 50390,
+  "<loc_122>": 50391,
+  "<loc_123>": 50392,
+  "<loc_124>": 50393,
+  "<loc_125>": 50394,
+  "<loc_126>": 50395,
+  "<loc_127>": 50396,
+  "<loc_128>": 50397,
+  "<loc_129>": 50398,
+  "<loc_12>": 50281,
+  "<loc_130>": 50399,
+  "<loc_131>": 50400,
+  "<loc_132>": 50401,
+  "<loc_133>": 50402,
+  "<loc_134>": 50403,
+  "<loc_135>": 50404,
+  "<loc_136>": 50405,
+  "<loc_137>": 50406,
+  "<loc_138>": 50407,
+  "<loc_139>": 50408,
+  "<loc_13>": 50282,
+  "<loc_140>": 50409,
+  "<loc_141>": 50410,
+  "<loc_142>": 50411,
+  "<loc_143>": 50412,
+  "<loc_144>": 50413,
+  "<loc_145>": 50414,
+  "<loc_146>": 50415,
+  "<loc_147>": 50416,
+  "<loc_148>": 50417,
+  "<loc_149>": 50418,
+  "<loc_14>": 50283,
+  "<loc_150>": 50419,
+  "<loc_151>": 50420,
+  "<loc_152>": 50421,
+  "<loc_153>": 50422,
+  "<loc_154>": 50423,
+  "<loc_155>": 50424,
+  "<loc_156>": 50425,
+  "<loc_157>": 50426,
+  "<loc_158>": 50427,
+  "<loc_159>": 50428,
+  "<loc_15>": 50284,
+  "<loc_160>": 50429,
+  "<loc_161>": 50430,
+  "<loc_162>": 50431,
+  "<loc_163>": 50432,
+  "<loc_164>": 50433,
+  "<loc_165>": 50434,
+  "<loc_166>": 50435,
+  "<loc_167>": 50436,
+  "<loc_168>": 50437,
+  "<loc_169>": 50438,
+  "<loc_16>": 50285,
+  "<loc_170>": 50439,
+  "<loc_171>": 50440,
+  "<loc_172>": 50441,
+  "<loc_173>": 50442,
+  "<loc_174>": 50443,
+  "<loc_175>": 50444,
+  "<loc_176>": 50445,
+  "<loc_177>": 50446,
+  "<loc_178>": 50447,
+  "<loc_179>": 50448,
+  "<loc_17>": 50286,
+  "<loc_180>": 50449,
+  "<loc_181>": 50450,
+  "<loc_182>": 50451,
+  "<loc_183>": 50452,
+  "<loc_184>": 50453,
+  "<loc_185>": 50454,
+  "<loc_186>": 50455,
+  "<loc_187>": 50456,
+  "<loc_188>": 50457,
+  "<loc_189>": 50458,
+  "<loc_18>": 50287,
+  "<loc_190>": 50459,
+  "<loc_191>": 50460,
+  "<loc_192>": 50461,
+  "<loc_193>": 50462,
+  "<loc_194>": 50463,
+  "<loc_195>": 50464,
+  "<loc_196>": 50465,
+  "<loc_197>": 50466,
+  "<loc_198>": 50467,
+  "<loc_199>": 50468,
+  "<loc_19>": 50288,
+  "<loc_1>": 50270,
+  "<loc_200>": 50469,
+  "<loc_201>": 50470,
+  "<loc_202>": 50471,
+  "<loc_203>": 50472,
+  "<loc_204>": 50473,
+  "<loc_205>": 50474,
+  "<loc_206>": 50475,
+  "<loc_207>": 50476,
+  "<loc_208>": 50477,
+  "<loc_209>": 50478,
+  "<loc_20>": 50289,
+  "<loc_210>": 50479,
+  "<loc_211>": 50480,
+  "<loc_212>": 50481,
+  "<loc_213>": 50482,
+  "<loc_214>": 50483,
+  "<loc_215>": 50484,
+  "<loc_216>": 50485,
+  "<loc_217>": 50486,
+  "<loc_218>": 50487,
+  "<loc_219>": 50488,
+  "<loc_21>": 50290,
+  "<loc_220>": 50489,
+  "<loc_221>": 50490,
+  "<loc_222>": 50491,
+  "<loc_223>": 50492,
+  "<loc_224>": 50493,
+  "<loc_225>": 50494,
+  "<loc_226>": 50495,
+  "<loc_227>": 50496,
+  "<loc_228>": 50497,
+  "<loc_229>": 50498,
+  "<loc_22>": 50291,
+  "<loc_230>": 50499,
+  "<loc_231>": 50500,
+  "<loc_232>": 50501,
+  "<loc_233>": 50502,
+  "<loc_234>": 50503,
+  "<loc_235>": 50504,
+  "<loc_236>": 50505,
+  "<loc_237>": 50506,
+  "<loc_238>": 50507,
+  "<loc_239>": 50508,
+  "<loc_23>": 50292,
+  "<loc_240>": 50509,
+  "<loc_241>": 50510,
+  "<loc_242>": 50511,
+  "<loc_243>": 50512,
+  "<loc_244>": 50513,
+  "<loc_245>": 50514,
+  "<loc_246>": 50515,
+  "<loc_247>": 50516,
+  "<loc_248>": 50517,
+  "<loc_249>": 50518,
+  "<loc_24>": 50293,
+  "<loc_250>": 50519,
+  "<loc_251>": 50520,
+  "<loc_252>": 50521,
+  "<loc_253>": 50522,
+  "<loc_254>": 50523,
+  "<loc_255>": 50524,
+  "<loc_256>": 50525,
+  "<loc_257>": 50526,
+  "<loc_258>": 50527,
+  "<loc_259>": 50528,
+  "<loc_25>": 50294,
+  "<loc_260>": 50529,
+  "<loc_261>": 50530,
+  "<loc_262>": 50531,
+  "<loc_263>": 50532,
+  "<loc_264>": 50533,
+  "<loc_265>": 50534,
+  "<loc_266>": 50535,
+  "<loc_267>": 50536,
+  "<loc_268>": 50537,
+  "<loc_269>": 50538,
+  "<loc_26>": 50295,
+  "<loc_270>": 50539,
+  "<loc_271>": 50540,
+  "<loc_272>": 50541,
+  "<loc_273>": 50542,
+  "<loc_274>": 50543,
+  "<loc_275>": 50544,
+  "<loc_276>": 50545,
+  "<loc_277>": 50546,
+  "<loc_278>": 50547,
+  "<loc_279>": 50548,
+  "<loc_27>": 50296,
+  "<loc_280>": 50549,
+  "<loc_281>": 50550,
+  "<loc_282>": 50551,
+  "<loc_283>": 50552,
+  "<loc_284>": 50553,
+  "<loc_285>": 50554,
+  "<loc_286>": 50555,
+  "<loc_287>": 50556,
+  "<loc_288>": 50557,
+  "<loc_289>": 50558,
+  "<loc_28>": 50297,
+  "<loc_290>": 50559,
+  "<loc_291>": 50560,
+  "<loc_292>": 50561,
+  "<loc_293>": 50562,
+  "<loc_294>": 50563,
+  "<loc_295>": 50564,
+  "<loc_296>": 50565,
+  "<loc_297>": 50566,
+  "<loc_298>": 50567,
+  "<loc_299>": 50568,
+  "<loc_29>": 50298,
+  "<loc_2>": 50271,
+  "<loc_300>": 50569,
+  "<loc_301>": 50570,
+  "<loc_302>": 50571,
+  "<loc_303>": 50572,
+  "<loc_304>": 50573,
+  "<loc_305>": 50574,
+  "<loc_306>": 50575,
+  "<loc_307>": 50576,
+  "<loc_308>": 50577,
+  "<loc_309>": 50578,
+  "<loc_30>": 50299,
+  "<loc_310>": 50579,
+  "<loc_311>": 50580,
+  "<loc_312>": 50581,
+  "<loc_313>": 50582,
+  "<loc_314>": 50583,
+  "<loc_315>": 50584,
+  "<loc_316>": 50585,
+  "<loc_317>": 50586,
+  "<loc_318>": 50587,
+  "<loc_319>": 50588,
+  "<loc_31>": 50300,
+  "<loc_320>": 50589,
+  "<loc_321>": 50590,
+  "<loc_322>": 50591,
+  "<loc_323>": 50592,
+  "<loc_324>": 50593,
+  "<loc_325>": 50594,
+  "<loc_326>": 50595,
+  "<loc_327>": 50596,
+  "<loc_328>": 50597,
+  "<loc_329>": 50598,
+  "<loc_32>": 50301,
+  "<loc_330>": 50599,
+  "<loc_331>": 50600,
+  "<loc_332>": 50601,
+  "<loc_333>": 50602,
+  "<loc_334>": 50603,
+  "<loc_335>": 50604,
+  "<loc_336>": 50605,
+  "<loc_337>": 50606,
+  "<loc_338>": 50607,
+  "<loc_339>": 50608,
+  "<loc_33>": 50302,
+  "<loc_340>": 50609,
+  "<loc_341>": 50610,
+  "<loc_342>": 50611,
+  "<loc_343>": 50612,
+  "<loc_344>": 50613,
+  "<loc_345>": 50614,
+  "<loc_346>": 50615,
+  "<loc_347>": 50616,
+  "<loc_348>": 50617,
+  "<loc_349>": 50618,
+  "<loc_34>": 50303,
+  "<loc_350>": 50619,
+  "<loc_351>": 50620,
+  "<loc_352>": 50621,
+  "<loc_353>": 50622,
+  "<loc_354>": 50623,
+  "<loc_355>": 50624,
+  "<loc_356>": 50625,
+  "<loc_357>": 50626,
+  "<loc_358>": 50627,
+  "<loc_359>": 50628,
+  "<loc_35>": 50304,
+  "<loc_360>": 50629,
+  "<loc_361>": 50630,
+  "<loc_362>": 50631,
+  "<loc_363>": 50632,
+  "<loc_364>": 50633,
+  "<loc_365>": 50634,
+  "<loc_366>": 50635,
+  "<loc_367>": 50636,
+  "<loc_368>": 50637,
+  "<loc_369>": 50638,
+  "<loc_36>": 50305,
+  "<loc_370>": 50639,
+  "<loc_371>": 50640,
+  "<loc_372>": 50641,
+  "<loc_373>": 50642,
+  "<loc_374>": 50643,
+  "<loc_375>": 50644,
+  "<loc_376>": 50645,
+  "<loc_377>": 50646,
+  "<loc_378>": 50647,
+  "<loc_379>": 50648,
+  "<loc_37>": 50306,
+  "<loc_380>": 50649,
+  "<loc_381>": 50650,
+  "<loc_382>": 50651,
+  "<loc_383>": 50652,
+  "<loc_384>": 50653,
+  "<loc_385>": 50654,
+  "<loc_386>": 50655,
+  "<loc_387>": 50656,
+  "<loc_388>": 50657,
+  "<loc_389>": 50658,
+  "<loc_38>": 50307,
+  "<loc_390>": 50659,
+  "<loc_391>": 50660,
+  "<loc_392>": 50661,
+  "<loc_393>": 50662,
+  "<loc_394>": 50663,
+  "<loc_395>": 50664,
+  "<loc_396>": 50665,
+  "<loc_397>": 50666,
+  "<loc_398>": 50667,
+  "<loc_399>": 50668,
+  "<loc_39>": 50308,
+  "<loc_3>": 50272,
+  "<loc_400>": 50669,
+  "<loc_401>": 50670,
+  "<loc_402>": 50671,
+  "<loc_403>": 50672,
+  "<loc_404>": 50673,
+  "<loc_405>": 50674,
+  "<loc_406>": 50675,
+  "<loc_407>": 50676,
+  "<loc_408>": 50677,
+  "<loc_409>": 50678,
+  "<loc_40>": 50309,
+  "<loc_410>": 50679,
+  "<loc_411>": 50680,
+  "<loc_412>": 50681,
+  "<loc_413>": 50682,
+  "<loc_414>": 50683,
+  "<loc_415>": 50684,
+  "<loc_416>": 50685,
+  "<loc_417>": 50686,
+  "<loc_418>": 50687,
+  "<loc_419>": 50688,
+  "<loc_41>": 50310,
+  "<loc_420>": 50689,
+  "<loc_421>": 50690,
+  "<loc_422>": 50691,
+  "<loc_423>": 50692,
+  "<loc_424>": 50693,
+  "<loc_425>": 50694,
+  "<loc_426>": 50695,
+  "<loc_427>": 50696,
+  "<loc_428>": 50697,
+  "<loc_429>": 50698,
+  "<loc_42>": 50311,
+  "<loc_430>": 50699,
+  "<loc_431>": 50700,
+  "<loc_432>": 50701,
+  "<loc_433>": 50702,
+  "<loc_434>": 50703,
+  "<loc_435>": 50704,
+  "<loc_436>": 50705,
+  "<loc_437>": 50706,
+  "<loc_438>": 50707,
+  "<loc_439>": 50708,
+  "<loc_43>": 50312,
+  "<loc_440>": 50709,
+  "<loc_441>": 50710,
+  "<loc_442>": 50711,
+  "<loc_443>": 50712,
+  "<loc_444>": 50713,
+  "<loc_445>": 50714,
+  "<loc_446>": 50715,
+  "<loc_447>": 50716,
+  "<loc_448>": 50717,
+  "<loc_449>": 50718,
+  "<loc_44>": 50313,
+  "<loc_450>": 50719,
+  "<loc_451>": 50720,
+  "<loc_452>": 50721,
+  "<loc_453>": 50722,
+  "<loc_454>": 50723,
+  "<loc_455>": 50724,
+  "<loc_456>": 50725,
+  "<loc_457>": 50726,
+  "<loc_458>": 50727,
+  "<loc_459>": 50728,
+  "<loc_45>": 50314,
+  "<loc_460>": 50729,
+  "<loc_461>": 50730,
+  "<loc_462>": 50731,
+  "<loc_463>": 50732,
+  "<loc_464>": 50733,
+  "<loc_465>": 50734,
+  "<loc_466>": 50735,
+  "<loc_467>": 50736,
+  "<loc_468>": 50737,
+  "<loc_469>": 50738,
+  "<loc_46>": 50315,
+  "<loc_470>": 50739,
+  "<loc_471>": 50740,
+  "<loc_472>": 50741,
+  "<loc_473>": 50742,
+  "<loc_474>": 50743,
+  "<loc_475>": 50744,
+  "<loc_476>": 50745,
+  "<loc_477>": 50746,
+  "<loc_478>": 50747,
+  "<loc_479>": 50748,
+  "<loc_47>": 50316,
+  "<loc_480>": 50749,
+  "<loc_481>": 50750,
+  "<loc_482>": 50751,
+  "<loc_483>": 50752,
+  "<loc_484>": 50753,
+  "<loc_485>": 50754,
+  "<loc_486>": 50755,
+  "<loc_487>": 50756,
+  "<loc_488>": 50757,
+  "<loc_489>": 50758,
+  "<loc_48>": 50317,
+  "<loc_490>": 50759,
+  "<loc_491>": 50760,
+  "<loc_492>": 50761,
+  "<loc_493>": 50762,
+  "<loc_494>": 50763,
+  "<loc_495>": 50764,
+  "<loc_496>": 50765,
+  "<loc_497>": 50766,
+  "<loc_498>": 50767,
+  "<loc_499>": 50768,
+  "<loc_49>": 50318,
+  "<loc_4>": 50273,
+  "<loc_500>": 50769,
+  "<loc_501>": 50770,
+  "<loc_502>": 50771,
+  "<loc_503>": 50772,
+  "<loc_504>": 50773,
+  "<loc_505>": 50774,
+  "<loc_506>": 50775,
+  "<loc_507>": 50776,
+  "<loc_508>": 50777,
+  "<loc_509>": 50778,
+  "<loc_50>": 50319,
+  "<loc_510>": 50779,
+  "<loc_511>": 50780,
+  "<loc_512>": 50781,
+  "<loc_513>": 50782,
+  "<loc_514>": 50783,
+  "<loc_515>": 50784,
+  "<loc_516>": 50785,
+  "<loc_517>": 50786,
+  "<loc_518>": 50787,
+  "<loc_519>": 50788,
+  "<loc_51>": 50320,
+  "<loc_520>": 50789,
+  "<loc_521>": 50790,
+  "<loc_522>": 50791,
+  "<loc_523>": 50792,
+  "<loc_524>": 50793,
+  "<loc_525>": 50794,
+  "<loc_526>": 50795,
+  "<loc_527>": 50796,
+  "<loc_528>": 50797,
+  "<loc_529>": 50798,
+  "<loc_52>": 50321,
+  "<loc_530>": 50799,
+  "<loc_531>": 50800,
+  "<loc_532>": 50801,
+  "<loc_533>": 50802,
+  "<loc_534>": 50803,
+  "<loc_535>": 50804,
+  "<loc_536>": 50805,
+  "<loc_537>": 50806,
+  "<loc_538>": 50807,
+  "<loc_539>": 50808,
+  "<loc_53>": 50322,
+  "<loc_540>": 50809,
+  "<loc_541>": 50810,
+  "<loc_542>": 50811,
+  "<loc_543>": 50812,
+  "<loc_544>": 50813,
+  "<loc_545>": 50814,
+  "<loc_546>": 50815,
+  "<loc_547>": 50816,
+  "<loc_548>": 50817,
+  "<loc_549>": 50818,
+  "<loc_54>": 50323,
+  "<loc_550>": 50819,
+  "<loc_551>": 50820,
+  "<loc_552>": 50821,
+  "<loc_553>": 50822,
+  "<loc_554>": 50823,
+  "<loc_555>": 50824,
+  "<loc_556>": 50825,
+  "<loc_557>": 50826,
+  "<loc_558>": 50827,
+  "<loc_559>": 50828,
+  "<loc_55>": 50324,
+  "<loc_560>": 50829,
+  "<loc_561>": 50830,
+  "<loc_562>": 50831,
+  "<loc_563>": 50832,
+  "<loc_564>": 50833,
+  "<loc_565>": 50834,
+  "<loc_566>": 50835,
+  "<loc_567>": 50836,
+  "<loc_568>": 50837,
+  "<loc_569>": 50838,
+  "<loc_56>": 50325,
+  "<loc_570>": 50839,
+  "<loc_571>": 50840,
+  "<loc_572>": 50841,
+  "<loc_573>": 50842,
+  "<loc_574>": 50843,
+  "<loc_575>": 50844,
+  "<loc_576>": 50845,
+  "<loc_577>": 50846,
+  "<loc_578>": 50847,
+  "<loc_579>": 50848,
+  "<loc_57>": 50326,
+  "<loc_580>": 50849,
+  "<loc_581>": 50850,
+  "<loc_582>": 50851,
+  "<loc_583>": 50852,
+  "<loc_584>": 50853,
+  "<loc_585>": 50854,
+  "<loc_586>": 50855,
+  "<loc_587>": 50856,
+  "<loc_588>": 50857,
+  "<loc_589>": 50858,
+  "<loc_58>": 50327,
+  "<loc_590>": 50859,
+  "<loc_591>": 50860,
+  "<loc_592>": 50861,
+  "<loc_593>": 50862,
+  "<loc_594>": 50863,
+  "<loc_595>": 50864,
+  "<loc_596>": 50865,
+  "<loc_597>": 50866,
+  "<loc_598>": 50867,
+  "<loc_599>": 50868,
+  "<loc_59>": 50328,
+  "<loc_5>": 50274,
+  "<loc_600>": 50869,
+  "<loc_601>": 50870,
+  "<loc_602>": 50871,
+  "<loc_603>": 50872,
+  "<loc_604>": 50873,
+  "<loc_605>": 50874,
+  "<loc_606>": 50875,
+  "<loc_607>": 50876,
+  "<loc_608>": 50877,
+  "<loc_609>": 50878,
+  "<loc_60>": 50329,
+  "<loc_610>": 50879,
+  "<loc_611>": 50880,
+  "<loc_612>": 50881,
+  "<loc_613>": 50882,
+  "<loc_614>": 50883,
+  "<loc_615>": 50884,
+  "<loc_616>": 50885,
+  "<loc_617>": 50886,
+  "<loc_618>": 50887,
+  "<loc_619>": 50888,
+  "<loc_61>": 50330,
+  "<loc_620>": 50889,
+  "<loc_621>": 50890,
+  "<loc_622>": 50891,
+  "<loc_623>": 50892,
+  "<loc_624>": 50893,
+  "<loc_625>": 50894,
+  "<loc_626>": 50895,
+  "<loc_627>": 50896,
+  "<loc_628>": 50897,
+  "<loc_629>": 50898,
+  "<loc_62>": 50331,
+  "<loc_630>": 50899,
+  "<loc_631>": 50900,
+  "<loc_632>": 50901,
+  "<loc_633>": 50902,
+  "<loc_634>": 50903,
+  "<loc_635>": 50904,
+  "<loc_636>": 50905,
+  "<loc_637>": 50906,
+  "<loc_638>": 50907,
+  "<loc_639>": 50908,
+  "<loc_63>": 50332,
+  "<loc_640>": 50909,
+  "<loc_641>": 50910,
+  "<loc_642>": 50911,
+  "<loc_643>": 50912,
+  "<loc_644>": 50913,
+  "<loc_645>": 50914,
+  "<loc_646>": 50915,
+  "<loc_647>": 50916,
+  "<loc_648>": 50917,
+  "<loc_649>": 50918,
+  "<loc_64>": 50333,
+  "<loc_650>": 50919,
+  "<loc_651>": 50920,
+  "<loc_652>": 50921,
+  "<loc_653>": 50922,
+  "<loc_654>": 50923,
+  "<loc_655>": 50924,
+  "<loc_656>": 50925,
+  "<loc_657>": 50926,
+  "<loc_658>": 50927,
+  "<loc_659>": 50928,
+  "<loc_65>": 50334,
+  "<loc_660>": 50929,
+  "<loc_661>": 50930,
+  "<loc_662>": 50931,
+  "<loc_663>": 50932,
+  "<loc_664>": 50933,
+  "<loc_665>": 50934,
+  "<loc_666>": 50935,
+  "<loc_667>": 50936,
+  "<loc_668>": 50937,
+  "<loc_669>": 50938,
+  "<loc_66>": 50335,
+  "<loc_670>": 50939,
+  "<loc_671>": 50940,
+  "<loc_672>": 50941,
+  "<loc_673>": 50942,
+  "<loc_674>": 50943,
+  "<loc_675>": 50944,
+  "<loc_676>": 50945,
+  "<loc_677>": 50946,
+  "<loc_678>": 50947,
+  "<loc_679>": 50948,
+  "<loc_67>": 50336,
+  "<loc_680>": 50949,
+  "<loc_681>": 50950,
+  "<loc_682>": 50951,
+  "<loc_683>": 50952,
+  "<loc_684>": 50953,
+  "<loc_685>": 50954,
+  "<loc_686>": 50955,
+  "<loc_687>": 50956,
+  "<loc_688>": 50957,
+  "<loc_689>": 50958,
+  "<loc_68>": 50337,
+  "<loc_690>": 50959,
+  "<loc_691>": 50960,
+  "<loc_692>": 50961,
+  "<loc_693>": 50962,
+  "<loc_694>": 50963,
+  "<loc_695>": 50964,
+  "<loc_696>": 50965,
+  "<loc_697>": 50966,
+  "<loc_698>": 50967,
+  "<loc_699>": 50968,
+  "<loc_69>": 50338,
+  "<loc_6>": 50275,
+  "<loc_700>": 50969,
+  "<loc_701>": 50970,
+  "<loc_702>": 50971,
+  "<loc_703>": 50972,
+  "<loc_704>": 50973,
+  "<loc_705>": 50974,
+  "<loc_706>": 50975,
+  "<loc_707>": 50976,
+  "<loc_708>": 50977,
+  "<loc_709>": 50978,
+  "<loc_70>": 50339,
+  "<loc_710>": 50979,
+  "<loc_711>": 50980,
+  "<loc_712>": 50981,
+  "<loc_713>": 50982,
+  "<loc_714>": 50983,
+  "<loc_715>": 50984,
+  "<loc_716>": 50985,
+  "<loc_717>": 50986,
+  "<loc_718>": 50987,
+  "<loc_719>": 50988,
+  "<loc_71>": 50340,
+  "<loc_720>": 50989,
+  "<loc_721>": 50990,
+  "<loc_722>": 50991,
+  "<loc_723>": 50992,
+  "<loc_724>": 50993,
+  "<loc_725>": 50994,
+  "<loc_726>": 50995,
+  "<loc_727>": 50996,
+  "<loc_728>": 50997,
+  "<loc_729>": 50998,
+  "<loc_72>": 50341,
+  "<loc_730>": 50999,
+  "<loc_731>": 51000,
+  "<loc_732>": 51001,
+  "<loc_733>": 51002,
+  "<loc_734>": 51003,
+  "<loc_735>": 51004,
+  "<loc_736>": 51005,
+  "<loc_737>": 51006,
+  "<loc_738>": 51007,
+  "<loc_739>": 51008,
+  "<loc_73>": 50342,
+  "<loc_740>": 51009,
+  "<loc_741>": 51010,
+  "<loc_742>": 51011,
+  "<loc_743>": 51012,
+  "<loc_744>": 51013,
+  "<loc_745>": 51014,
+  "<loc_746>": 51015,
+  "<loc_747>": 51016,
+  "<loc_748>": 51017,
+  "<loc_749>": 51018,
+  "<loc_74>": 50343,
+  "<loc_750>": 51019,
+  "<loc_751>": 51020,
+  "<loc_752>": 51021,
+  "<loc_753>": 51022,
+  "<loc_754>": 51023,
+  "<loc_755>": 51024,
+  "<loc_756>": 51025,
+  "<loc_757>": 51026,
+  "<loc_758>": 51027,
+  "<loc_759>": 51028,
+  "<loc_75>": 50344,
+  "<loc_760>": 51029,
+  "<loc_761>": 51030,
+  "<loc_762>": 51031,
+  "<loc_763>": 51032,
+  "<loc_764>": 51033,
+  "<loc_765>": 51034,
+  "<loc_766>": 51035,
+  "<loc_767>": 51036,
+  "<loc_768>": 51037,
+  "<loc_769>": 51038,
+  "<loc_76>": 50345,
+  "<loc_770>": 51039,
+  "<loc_771>": 51040,
+  "<loc_772>": 51041,
+  "<loc_773>": 51042,
+  "<loc_774>": 51043,
+  "<loc_775>": 51044,
+  "<loc_776>": 51045,
+  "<loc_777>": 51046,
+  "<loc_778>": 51047,
+  "<loc_779>": 51048,
+  "<loc_77>": 50346,
+  "<loc_780>": 51049,
+  "<loc_781>": 51050,
+  "<loc_782>": 51051,
+  "<loc_783>": 51052,
+  "<loc_784>": 51053,
+  "<loc_785>": 51054,
+  "<loc_786>": 51055,
+  "<loc_787>": 51056,
+  "<loc_788>": 51057,
+  "<loc_789>": 51058,
+  "<loc_78>": 50347,
+  "<loc_790>": 51059,
+  "<loc_791>": 51060,
+  "<loc_792>": 51061,
+  "<loc_793>": 51062,
+  "<loc_794>": 51063,
+  "<loc_795>": 51064,
+  "<loc_796>": 51065,
+  "<loc_797>": 51066,
+  "<loc_798>": 51067,
+  "<loc_799>": 51068,
+  "<loc_79>": 50348,
+  "<loc_7>": 50276,
+  "<loc_800>": 51069,
+  "<loc_801>": 51070,
+  "<loc_802>": 51071,
+  "<loc_803>": 51072,
+  "<loc_804>": 51073,
+  "<loc_805>": 51074,
+  "<loc_806>": 51075,
+  "<loc_807>": 51076,
+  "<loc_808>": 51077,
+  "<loc_809>": 51078,
+  "<loc_80>": 50349,
+  "<loc_810>": 51079,
+  "<loc_811>": 51080,
+  "<loc_812>": 51081,
+  "<loc_813>": 51082,
+  "<loc_814>": 51083,
+  "<loc_815>": 51084,
+  "<loc_816>": 51085,
+  "<loc_817>": 51086,
+  "<loc_818>": 51087,
+  "<loc_819>": 51088,
+  "<loc_81>": 50350,
+  "<loc_820>": 51089,
+  "<loc_821>": 51090,
+  "<loc_822>": 51091,
+  "<loc_823>": 51092,
+  "<loc_824>": 51093,
+  "<loc_825>": 51094,
+  "<loc_826>": 51095,
+  "<loc_827>": 51096,
+  "<loc_828>": 51097,
+  "<loc_829>": 51098,
+  "<loc_82>": 50351,
+  "<loc_830>": 51099,
+  "<loc_831>": 51100,
+  "<loc_832>": 51101,
+  "<loc_833>": 51102,
+  "<loc_834>": 51103,
+  "<loc_835>": 51104,
+  "<loc_836>": 51105,
+  "<loc_837>": 51106,
+  "<loc_838>": 51107,
+  "<loc_839>": 51108,
+  "<loc_83>": 50352,
+  "<loc_840>": 51109,
+  "<loc_841>": 51110,
+  "<loc_842>": 51111,
+  "<loc_843>": 51112,
+  "<loc_844>": 51113,
+  "<loc_845>": 51114,
+  "<loc_846>": 51115,
+  "<loc_847>": 51116,
+  "<loc_848>": 51117,
+  "<loc_849>": 51118,
+  "<loc_84>": 50353,
+  "<loc_850>": 51119,
+  "<loc_851>": 51120,
+  "<loc_852>": 51121,
+  "<loc_853>": 51122,
+  "<loc_854>": 51123,
+  "<loc_855>": 51124,
+  "<loc_856>": 51125,
+  "<loc_857>": 51126,
+  "<loc_858>": 51127,
+  "<loc_859>": 51128,
+  "<loc_85>": 50354,
+  "<loc_860>": 51129,
+  "<loc_861>": 51130,
+  "<loc_862>": 51131,
+  "<loc_863>": 51132,
+  "<loc_864>": 51133,
+  "<loc_865>": 51134,
+  "<loc_866>": 51135,
+  "<loc_867>": 51136,
+  "<loc_868>": 51137,
+  "<loc_869>": 51138,
+  "<loc_86>": 50355,
+  "<loc_870>": 51139,
+  "<loc_871>": 51140,
+  "<loc_872>": 51141,
+  "<loc_873>": 51142,
+  "<loc_874>": 51143,
+  "<loc_875>": 51144,
+  "<loc_876>": 51145,
+  "<loc_877>": 51146,
+  "<loc_878>": 51147,
+  "<loc_879>": 51148,
+  "<loc_87>": 50356,
+  "<loc_880>": 51149,
+  "<loc_881>": 51150,
+  "<loc_882>": 51151,
+  "<loc_883>": 51152,
+  "<loc_884>": 51153,
+  "<loc_885>": 51154,
+  "<loc_886>": 51155,
+  "<loc_887>": 51156,
+  "<loc_888>": 51157,
+  "<loc_889>": 51158,
+  "<loc_88>": 50357,
+  "<loc_890>": 51159,
+  "<loc_891>": 51160,
+  "<loc_892>": 51161,
+  "<loc_893>": 51162,
+  "<loc_894>": 51163,
+  "<loc_895>": 51164,
+  "<loc_896>": 51165,
+  "<loc_897>": 51166,
+  "<loc_898>": 51167,
+  "<loc_899>": 51168,
+  "<loc_89>": 50358,
+  "<loc_8>": 50277,
+  "<loc_900>": 51169,
+  "<loc_901>": 51170,
+  "<loc_902>": 51171,
+  "<loc_903>": 51172,
+  "<loc_904>": 51173,
+  "<loc_905>": 51174,
+  "<loc_906>": 51175,
+  "<loc_907>": 51176,
+  "<loc_908>": 51177,
+  "<loc_909>": 51178,
+  "<loc_90>": 50359,
+  "<loc_910>": 51179,
+  "<loc_911>": 51180,
+  "<loc_912>": 51181,
+  "<loc_913>": 51182,
+  "<loc_914>": 51183,
+  "<loc_915>": 51184,
+  "<loc_916>": 51185,
+  "<loc_917>": 51186,
+  "<loc_918>": 51187,
+  "<loc_919>": 51188,
+  "<loc_91>": 50360,
+  "<loc_920>": 51189,
+  "<loc_921>": 51190,
+  "<loc_922>": 51191,
+  "<loc_923>": 51192,
+  "<loc_924>": 51193,
+  "<loc_925>": 51194,
+  "<loc_926>": 51195,
+  "<loc_927>": 51196,
+  "<loc_928>": 51197,
+  "<loc_929>": 51198,
+  "<loc_92>": 50361,
+  "<loc_930>": 51199,
+  "<loc_931>": 51200,
+  "<loc_932>": 51201,
+  "<loc_933>": 51202,
+  "<loc_934>": 51203,
+  "<loc_935>": 51204,
+  "<loc_936>": 51205,
+  "<loc_937>": 51206,
+  "<loc_938>": 51207,
+  "<loc_939>": 51208,
+  "<loc_93>": 50362,
+  "<loc_940>": 51209,
+  "<loc_941>": 51210,
+  "<loc_942>": 51211,
+  "<loc_943>": 51212,
+  "<loc_944>": 51213,
+  "<loc_945>": 51214,
+  "<loc_946>": 51215,
+  "<loc_947>": 51216,
+  "<loc_948>": 51217,
+  "<loc_949>": 51218,
+  "<loc_94>": 50363,
+  "<loc_950>": 51219,
+  "<loc_951>": 51220,
+  "<loc_952>": 51221,
+  "<loc_953>": 51222,
+  "<loc_954>": 51223,
+  "<loc_955>": 51224,
+  "<loc_956>": 51225,
+  "<loc_957>": 51226,
+  "<loc_958>": 51227,
+  "<loc_959>": 51228,
+  "<loc_95>": 50364,
+  "<loc_960>": 51229,
+  "<loc_961>": 51230,
+  "<loc_962>": 51231,
+  "<loc_963>": 51232,
+  "<loc_964>": 51233,
+  "<loc_965>": 51234,
+  "<loc_966>": 51235,
+  "<loc_967>": 51236,
+  "<loc_968>": 51237,
+  "<loc_969>": 51238,
+  "<loc_96>": 50365,
+  "<loc_970>": 51239,
+  "<loc_971>": 51240,
+  "<loc_972>": 51241,
+  "<loc_973>": 51242,
+  "<loc_974>": 51243,
+  "<loc_975>": 51244,
+  "<loc_976>": 51245,
+  "<loc_977>": 51246,
+  "<loc_978>": 51247,
+  "<loc_979>": 51248,
+  "<loc_97>": 50366,
+  "<loc_980>": 51249,
+  "<loc_981>": 51250,
+  "<loc_982>": 51251,
+  "<loc_983>": 51252,
+  "<loc_984>": 51253,
+  "<loc_985>": 51254,
+  "<loc_986>": 51255,
+  "<loc_987>": 51256,
+  "<loc_988>": 51257,
+  "<loc_989>": 51258,
+  "<loc_98>": 50367,
+  "<loc_990>": 51259,
+  "<loc_991>": 51260,
+  "<loc_992>": 51261,
+  "<loc_993>": 51262,
+  "<loc_994>": 51263,
+  "<loc_995>": 51264,
+  "<loc_996>": 51265,
+  "<loc_997>": 51266,
+  "<loc_998>": 51267,
+  "<loc_999>": 51268,
+  "<loc_99>": 50368,
+  "<loc_9>": 50278,
+  "<ncap>": 51271,
+  "<ocr>": 50267,
+  "<od>": 50265,
+  "<poly>": 51286,
+  "<proposal>": 51284,
+  "<region_cap>": 51280,
+  "<region_to_desciption>": 51282,
+  "<seg>": 51277,
+  "<sep>": 51279
+}

config.json ADDED Viewed

	@@ -0,0 +1,85 @@

+{
+  "_name_or_path": "florence2",
+  "architectures": [
+    "Florence2ForConditionalGeneration"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_florence2.Florence2Config",
+    "AutoModelForCausalLM": "modeling_florence2.Florence2ForConditionalGeneration"
+  },
+  "bos_token_id": 0,
+  "eos_token_id": 2,
+  "ignore_index": -100,
+  "model_type": "florence2",
+  "pad_token_id": 1,
+  "projection_dim": 1024,
+  "text_config": {
+      "vocab_size": 51289,
+      "activation_dropout": 0.1,
+      "activation_function": "gelu",
+      "add_bias_logits": false,
+      "add_final_layer_norm": false,
+      "attention_dropout": 0.1,
+      "bos_token_id": 0,
+      "classif_dropout": 0.1,
+      "classifier_dropout": 0.0,
+      "d_model": 1024,
+      "decoder_attention_heads": 16,
+      "decoder_ffn_dim": 4096,
+      "decoder_layerdrop": 0.0,
+      "decoder_layers": 12,
+      "decoder_start_token_id": 2,
+      "dropout": 0.1,
+      "early_stopping": true,
+      "encoder_attention_heads": 16,
+      "encoder_ffn_dim": 4096,
+      "encoder_layerdrop": 0.0,
+      "encoder_layers": 12,
+      "eos_token_id": 2,
+      "forced_eos_token_id": 2,
+      "forced_bos_token_id": 0,
+      "gradient_checkpointing": false,
+      "init_std": 0.02,
+      "is_encoder_decoder": true,
+      "label2id": {
+        "LABEL_0": 0,
+        "LABEL_1": 1,
+        "LABEL_2": 2
+      },
+      "max_position_embeddings": 1024,
+      "no_repeat_ngram_size": 3,
+      "normalize_before": false,
+      "num_hidden_layers": 12,
+      "pad_token_id": 1,
+      "scale_embedding": false,
+      "num_beams": 3
+  },
+  "vision_config": {
+    "model_type": "davit",
+    "drop_path_rate": 0.1,
+    "patch_size": [7, 3, 3, 3],
+    "patch_stride": [4, 2, 2, 2],
+    "patch_padding": [3, 1, 1, 1],
+    "patch_prenorm": [false, true, true, true],
+    "enable_checkpoint": false,
+    "dim_embed": [256, 512, 1024, 2048],
+    "num_heads": [8, 16, 32, 64],
+    "num_groups": [8, 16, 32, 64],
+    "depths": [1, 1, 9, 1],
+    "window_size": 12,
+    "projection_dim": 1024,
+    "visual_temporal_embedding": {
+        "type": "COSINE",
+        "max_temporal_embeddings": 100
+    },
+    "image_pos_embed": {
+        "type": "learned_abs_2d",
+        "max_pos_embeddings": 50
+    },
+    "image_feature_source": ["spatial_avg_pool", "temporal_avg_pool"]
+  },
+  "vocab_size": 51289,
+  "torch_dtype": "float32",
+  "transformers_version": "4.41.0.dev0",
+  "is_encoder_decoder": true
+}

configuration_florence2.py ADDED Viewed

	@@ -0,0 +1,340 @@

+# coding=utf-8
+# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import warnings
+""" Florence-2 configuration"""
+from typing import Optional
+from transformers import AutoConfig
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+logger = logging.get_logger(__name__)
+class Florence2VisionConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Florence2VisionModel`]. It is used to instantiate a Florence2VisionModel
+    according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the Florence2VisionModel architecture.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        drop_path_rate (`float`, *optional*, defaults to 0.1):
+            The dropout rate of the drop path layer.
+        patch_size (`List[int]`, *optional*, defaults to [7, 3, 3, 3]):
+            The patch size of the image.
+        patch_stride (`List[int]`, *optional*, defaults to [4, 2, 2, 2]):
+            The patch stride of the image.
+        patch_padding (`List[int]`, *optional*, defaults to [3, 1, 1, 1]):
+            The patch padding of the image.
+        patch_prenorm (`List[bool]`, *optional*, defaults to [false, true, true, true]):
+            Whether to apply layer normalization before the patch embedding layer.
+        enable_checkpoint (`bool`, *optional*, defaults to False):
+            Whether to enable checkpointing.
+        dim_embed (`List[int]`, *optional*, defaults to [256, 512, 1024, 2048]):
+            The dimension of the embedding layer.
+        num_heads (`List[int]`, *optional*, defaults to [8, 16, 32, 64]):
+            The number of attention heads.
+        num_groups (`List[int]`, *optional*, defaults to [8, 16, 32, 64]):
+            The number of groups.
+        depths (`List[int]`, *optional*, defaults to [1, 1, 9, 1]):
+            The depth of the model.
+        window_size (`int`, *optional*, defaults to 12):
+            The window size of the model.
+        projection_dim (`int`, *optional*, defaults to 1024):
+            The dimension of the projection layer.
+        visual_temporal_embedding (`dict`, *optional*):
+            The configuration of the visual temporal embedding.
+        image_pos_embed (`dict`, *optional*):
+            The configuration of the image position embedding.
+        image_feature_source (`List[str]`, *optional*, defaults to ["spatial_avg_pool", "temporal_avg_pool"]):
+            The source of the image feature.
+    Example:
+    ```python
+    >>> from transformers import Florence2VisionConfig, Florence2VisionModel
+    >>> # Initializing a Florence2 Vision style configuration
+    >>> configuration = Florence2VisionConfig()
+    >>> # Initializing a model (with random weights)
+    >>> model = Florence2VisionModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "florence2_vision"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    def __init__(
+        self,
+        drop_path_rate=0.1,
+        patch_size=[7, 3, 3, 3],
+        patch_stride=[4, 2, 2, 2],
+        patch_padding=[3, 1, 1, 1],
+        patch_prenorm=[False, True, True, True],
+        enable_checkpoint=False,
+        dim_embed=[256, 512, 1024, 2048],
+        num_heads=[8, 16, 32, 64],
+        num_groups=[8, 16, 32, 64],
+        depths=[1, 1, 9, 1],
+        window_size=12,
+        projection_dim=1024,
+        visual_temporal_embedding=None,
+        image_pos_embed=None,
+        image_feature_source=["spatial_avg_pool", "temporal_avg_pool"],
+        **kwargs,
+    ):
+        self.drop_path_rate = drop_path_rate
+        self.patch_size = patch_size
+        self.patch_stride = patch_stride
+        self.patch_padding = patch_padding
+        self.patch_prenorm = patch_prenorm
+        self.enable_checkpoint = enable_checkpoint
+        self.dim_embed = dim_embed
+        self.num_heads = num_heads
+        self.num_groups = num_groups
+        self.depths = depths
+        self.window_size = window_size
+        self.projection_dim = projection_dim
+        self.visual_temporal_embedding = visual_temporal_embedding
+        self.image_pos_embed = image_pos_embed
+        self.image_feature_source = image_feature_source
+        super().__init__(**kwargs)
+class Florence2LanguageConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Florence2LanguagePreTrainedModel`]. It is used to instantiate a BART
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the BART
+    [facebook/bart-large](https://huggingface.co/facebook/bart-large) architecture.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 51289):
+            Vocabulary size of the Florence2Language model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`Florence2LanguageModel`].
+        d_model (`int`, *optional*, defaults to 1024):
+            Dimensionality of the layers and the pooler layer.
+        encoder_layers (`int`, *optional*, defaults to 12):
+            Number of encoder layers.
+        decoder_layers (`int`, *optional*, defaults to 12):
+            Number of decoder layers.
+        encoder_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        decoder_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        decoder_ffn_dim (`int`, *optional*, defaults to 4096):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
+        encoder_ffn_dim (`int`, *optional*, defaults to 4096):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
+        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        activation_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for activations inside the fully connected layer.
+        classifier_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for classifier.
+        max_position_embeddings (`int`, *optional*, defaults to 1024):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        init_std (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        encoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
+            for more details.
+        decoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
+            for more details.
+        scale_embedding (`bool`, *optional*, defaults to `False`):
+            Scale embeddings by diving by sqrt(d_model).
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+        num_labels (`int`, *optional*, defaults to 3):
+            The number of labels to use in [`Florence2LanguageForSequenceClassification`].
+        forced_eos_token_id (`int`, *optional*, defaults to 2):
+            The id of the token to force as the last generated token when `max_length` is reached. Usually set to
+            `eos_token_id`.
+    Example:
+    ```python
+    >>> from transformers import Florence2LanguageConfig, Florence2LanguageModel
+    >>> # Initializing a Florence2 Language style configuration
+    >>> configuration = Florence2LanguageConfig()
+    >>> # Initializing a model (with random weights)
+    >>> model = Florence2LangaugeModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "florence2_language"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
+    def __init__(
+        self,
+        vocab_size=51289,
+        max_position_embeddings=1024,
+        encoder_layers=12,
+        encoder_ffn_dim=4096,
+        encoder_attention_heads=16,
+        decoder_layers=12,
+        decoder_ffn_dim=4096,
+        decoder_attention_heads=16,
+        encoder_layerdrop=0.0,
+        decoder_layerdrop=0.0,
+        activation_function="gelu",
+        d_model=1024,
+        dropout=0.1,
+        attention_dropout=0.0,
+        activation_dropout=0.0,
+        init_std=0.02,
+        classifier_dropout=0.0,
+        scale_embedding=False,
+        use_cache=True,
+        num_labels=3,
+        pad_token_id=1,
+        bos_token_id=0,
+        eos_token_id=2,
+        is_encoder_decoder=True,
+        decoder_start_token_id=2,
+        forced_eos_token_id=2,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.d_model = d_model
+        self.encoder_ffn_dim = encoder_ffn_dim
+        self.encoder_layers = encoder_layers
+        self.encoder_attention_heads = encoder_attention_heads
+        self.decoder_ffn_dim = decoder_ffn_dim
+        self.decoder_layers = decoder_layers
+        self.decoder_attention_heads = decoder_attention_heads
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.activation_dropout = activation_dropout
+        self.activation_function = activation_function
+        self.init_std = init_std
+        self.encoder_layerdrop = encoder_layerdrop
+        self.decoder_layerdrop = decoder_layerdrop
+        self.classifier_dropout = classifier_dropout
+        self.use_cache = use_cache
+        self.num_hidden_layers = encoder_layers
+        self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
+        super().__init__(
+            num_labels=num_labels,
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            is_encoder_decoder=is_encoder_decoder,
+            decoder_start_token_id=decoder_start_token_id,
+            forced_eos_token_id=forced_eos_token_id,
+            **kwargs,
+        )
+        # ensure backward compatibility for BART CNN models
+        if self.forced_bos_token_id is None and kwargs.get("force_bos_token_to_be_generated", False):
+            self.forced_bos_token_id = self.bos_token_id
+            warnings.warn(
+                f"Please make sure the config includes `forced_bos_token_id={self.bos_token_id}` in future versions. "
+                "The config can simply be saved and uploaded again to be fixed."
+            )
+class Florence2Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Florence2ForConditionalGeneration`]. It is used to instantiate an
+    Florence-2 model according to the specified arguments, defining the model architecture.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        vision_config (`Florence2VisionConfig`,  *optional*):
+            Custom vision config or dict
+        text_config (`Union[AutoConfig, dict]`, *optional*):
+            The config object of the text backbone.
+        ignore_index (`int`, *optional*, defaults to -100):
+            The ignore index for the loss function.
+        vocab_size (`int`, *optional*, defaults to 51289):
+            Vocabulary size of the Florence2model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`~Florence2ForConditionalGeneration`]
+        projection_dim (`int`, *optional*, defaults to 1024):
+            Dimension of the multimodal projection space.
+    Example:
+    ```python
+    >>> from transformers import Florence2ForConditionalGeneration, Florence2Config, CLIPVisionConfig, BartConfig
+    >>> # Initializing a clip-like vision config
+    >>> vision_config = CLIPVisionConfig()
+    >>> # Initializing a Bart config
+    >>> text_config = BartConfig()
+    >>> # Initializing a Florence-2 configuration
+    >>> configuration = Florence2Config(vision_config, text_config)
+    >>> # Initializing a model from the florence-2 configuration
+    >>> model = Florence2ForConditionalGeneration(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "florence2"
+    is_composition = False
+    def __init__(
+        self,
+        vision_config=None,
+        text_config=None,
+        ignore_index=-100,
+        vocab_size=51289,
+        projection_dim=1024,
+        **kwargs,
+    ):
+        self.ignore_index = ignore_index
+        self.vocab_size = vocab_size
+        self.projection_dim = projection_dim
+        if vision_config is not None:
+            vision_config = PretrainedConfig(**vision_config)
+        self.vision_config = vision_config
+        self.vocab_size = self.vocab_size
+        self.text_config = text_config
+        if text_config is not None:
+            self.text_config = Florence2LanguageConfig(**text_config)
+        super().__init__(**kwargs)

generation_config.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+    "num_beams": 3,
+    "early_stopping": false
+}

latest ADDED Viewed

	@@ -0,0 +1 @@


1	+ global_step4800

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8e35dc300c227bbd259ac3f033799ab7ffc8982c243a0447ef68900f5c2ffcd8
+size 1856101938

modeling_florence2.py ADDED Viewed

The diff for this file is too large to render. See raw diff

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+  "auto_map": {
+    "AutoProcessor": "processing_florence2.Florence2Processor"
+   },
+  "_valid_processor_keys": [
+    "images",
+    "do_resize",
+    "size",
+    "resample",
+    "do_rescale",
+    "rescale_factor",
+    "do_normalize",
+    "image_mean",
+    "image_std",
+    "return_tensors",
+    "data_format",
+    "input_data_format",
+    "do_convert_rgb"
+  ],
+  "do_convert_rgb": null,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "do_center_crop": false,
+  "image_processor_type": "CLIPImageProcessor",
+  "image_seq_length": 577,
+  "image_mean": [0.485, 0.456, 0.406],
+  "image_std":  [0.229, 0.224, 0.225],
+  "processor_class": "Florence2Processor",
+  "resample": 3,
+  "size": {
+    "height": 768,
+    "width":768
+  },
+  "crop_size": {
+    "height": 768,
+    "width": 768
+  }
+}

processing_florence2.py ADDED Viewed

	@@ -0,0 +1,1090 @@

+# coding=utf-8
+# Copyright 2024 Microsoft and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Processor class for Florence-2.
+"""
+import re
+import logging
+from typing import List, Optional, Union
+import numpy as np
+import torch
+from transformers.feature_extraction_utils import BatchFeature
+from transformers.image_utils import ImageInput, is_valid_image
+from transformers.processing_utils import ProcessorMixin
+from transformers.tokenization_utils_base import (
+    PaddingStrategy,
+    PreTokenizedInput,
+    TextInput,
+    TruncationStrategy,
+)
+from transformers.utils import TensorType
+logger = logging.getLogger(__name__)
+# Copied from transformers.models.idefics2.processing_idefics2.is_url
+def is_url(val) -> bool:
+    return isinstance(val, str) and val.startswith("http")
+# Copied from transformers.models.idefics2.processing_idefics2.is_image_or_image_url
+def is_image_or_image_url(elem):
+    return is_url(elem) or is_valid_image(elem)
+def _is_str_or_image(elem):
+    return isinstance(elem, (str)) or is_image_or_image_url(elem)
+class Florence2Processor(ProcessorMixin):
+    r"""
+    Constructs a Florence2 processor which wraps a Florence2 image processor and a Florence2 tokenizer into a single processor.
+    [`Florence2Processor`] offers all the functionalities of [`CLIPImageProcessor`] and [`BartTokenizerFast`]. See the
+    [`~Florence2Processor.__call__`] and [`~Florence2Processor.decode`] for more information.
+    Args:
+        image_processor ([`CLIPImageProcessor`], *optional*):
+            The image processor is a required input.
+        tokenizer ([`BartTokenizerFast`], *optional*):
+            The tokenizer is a required input.
+    """
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "CLIPImageProcessor"
+    tokenizer_class = ("BartTokenizer", "BartTokenizerFast")
+    def __init__(
+        self,
+        image_processor=None,
+        tokenizer=None,
+    ):
+        if image_processor is None:
+            raise ValueError("You need to specify an `image_processor`.")
+        if tokenizer is None:
+            raise ValueError("You need to specify a `tokenizer`.")
+        if not hasattr(image_processor, "image_seq_length"):
+            raise ValueError("Image processor is missing an `image_seq_length` attribute.")
+        self.image_seq_length = image_processor.image_seq_length
+        tokens_to_add = {
+                'additional_special_tokens': \
+                    tokenizer.additional_special_tokens + \
+                    ['<od>', '</od>', '<ocr>', '</ocr>'] + \
+                    [f'<loc_{x}>' for x in range(1000)] + \
+                    ['<cap>', '</cap>', '<ncap>', '</ncap>','<dcap>', '</dcap>', '<grounding>', '</grounding>', '<seg>', '</seg>', '<sep>', '<region_cap>', '</region_cap>', '<region_to_desciption>', '</region_to_desciption>', '<proposal>', '</proposal>', '<poly>', '</poly>', '<and>']
+            }
+        tokenizer.add_special_tokens(tokens_to_add)
+        self.tasks_answer_post_processing_type = {
+            '<OCR>': 'pure_text',
+            '<OCR_WITH_REGION>': 'ocr',
+            '<CAPTION>': 'pure_text',
+            '<DETAILED_CAPTION>': 'pure_text',
+            '<MORE_DETAILED_CAPTION>': 'pure_text',
+            '<OD>': 'description_with_bboxes',
+            '<DENSE_REGION_CAPTION>': 'description_with_bboxes',
+            '<CAPTION_TO_PHRASE_GROUNDING>': "phrase_grounding",
+            '<REFERRING_EXPRESSION_SEGMENTATION>': 'polygons',
+            '<REGION_TO_SEGMENTATION>': 'polygons',
+            '<OPEN_VOCABULARY_DETECTION>': 'description_with_bboxes_or_polygons',
+            '<REGION_TO_CATEGORY>': 'pure_text',
+            '<REGION_TO_DESCRIPTION>': 'pure_text',
+            '<REGION_TO_OCR>': 'pure_text',
+            '<REGION_PROPOSAL>': 'bboxes'
+        }
+        self.task_prompts_without_inputs = {
+            '<OCR>': 'What is the text in the image?',
+            '<OCR_WITH_REGION>': 'What is the text in the image, with regions?',
+            '<CAPTION>': 'What does the image describe?',
+            '<DETAILED_CAPTION>': 'Describe in detail what is shown in the image.',
+            '<MORE_DETAILED_CAPTION>': 'Describe with a paragraph what is shown in the image.',
+            '<OD>': 'Locate the objects with category name in the image.',
+            '<DENSE_REGION_CAPTION>': 'Locate the objects in the image, with their descriptions.',
+            '<REGION_PROPOSAL>': 'Locate the region proposals in the image.'
+        }
+        self.task_prompts_with_input = {
+            '<CAPTION_TO_PHRASE_GROUNDING>': "Locate the phrases in the caption: {input}",
+            '<REFERRING_EXPRESSION_SEGMENTATION>': 'Locate {input} in the image with mask',
+            '<REGION_TO_SEGMENTATION>': 'What is the polygon mask of region {input}',
+            '<OPEN_VOCABULARY_DETECTION>': 'Locate {input} in the image.',
+            '<REGION_TO_CATEGORY>': 'What is the region {input}?',
+            '<REGION_TO_DESCRIPTION>': 'What does the region {input} describe?',
+            '<REGION_TO_OCR>': 'What text is in the region {input}?',
+        }
+        self.post_processor = Florence2PostProcesser(tokenizer=tokenizer)
+        super().__init__(image_processor, tokenizer)
+    def _construct_prompts(self, text):
+        # replace the task tokens with the task prompts if task token is in the text
+        prompts = []
+        for _text in text:
+            # 1. fixed task prompts without additional inputs
+            for task_token, task_prompt in self.task_prompts_without_inputs.items():
+                if task_token in _text:
+                    assert _text == task_token, f"Task token {task_token} should be the only token in the text."
+                    _text = task_prompt
+                    break
+            # 2. task prompts with additional inputs
+            for task_token, task_prompt in self.task_prompts_with_input.items():
+                if task_token in _text:
+                    _text = task_prompt.format(input=_text.replace(task_token, ''))
+                    break
+            prompts.append(_text)
+        return prompts
+    def __call__(
+        self,
+        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
+        images: ImageInput = None,
+        tokenize_newline_separately: bool = True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = None,
+        max_length=None,
+        return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
+        do_resize: bool = None,
+        do_normalize: bool = None,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        data_format: Optional["ChannelDimension"] = "channels_first",  # noqa: F821
+        input_data_format: Optional[
+            Union[str, "ChannelDimension"]  # noqa: F821
+        ] = None,
+        resample: "PILImageResampling" = None,  # noqa: F821
+        size=None,
+        do_convert_rgb: bool = None,
+        do_thumbnail: bool = None,
+        do_align_long_axis: bool = None,
+        do_rescale: bool = None,
+    ) -> BatchFeature:
+        """
+        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
+        and `kwargs` arguments to BartTokenizerFast's [`~BartTokenizerFast.__call__`] if `text` is not `None` to encode
+        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
+        CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
+        of the above two methods for more information.
+        Args:
+            text (`str`, `List[str]`, `List[List[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
+                number of channels, H and W are image height and width.
+            tokenize_newline_separately (`bool`, defaults to `True`):
+                Adds a separately tokenized '\n' at the end of the prompt.
+            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
+                Select a strategy to pad the returned sequences (according to the model's padding side and padding
+                index) among:
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+                  sequence if provided).
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+                  acceptable input length for the model if that argument is not provided.
+                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
+                  lengths).
+            max_length (`int`, *optional*):
+                Maximum length of the returned list and optionally padding length (see above).
+            truncation (`bool`, *optional*):
+                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Acceptable values are:
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
+        Returns:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. If `suffix`
+              is provided, the `input_ids` will also contain the suffix input ids.
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+              `None`).
+            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+            - **labels** -- Labels compatible with training if `suffix` is not None
+        """
+        return_token_type_ids = False
+        if images is None:
+            raise ValueError("`images` are expected as arguments to a `Florence2Processor` instance.")
+        if text is None:
+            logger.warning_once(
+                "You are using Florence-2 without a text prompt."
+            )
+            text = ""
+        if isinstance(text, List) and isinstance(images, List):
+            if len(images) < len(text):
+                raise ValueError(
+                    f"Received {len(images)} images for {len(text)} prompts. Each prompt should be associated with an image."
+                )
+        if _is_str_or_image(text):
+            text = [text]
+        elif isinstance(text, list) and _is_str_or_image(text[0]):
+            pass
+        pixel_values = self.image_processor(
+            images,
+            do_resize=do_resize,
+            size=size,
+            do_normalize=do_normalize,
+            return_tensors=return_tensors,
+            image_mean=image_mean,
+            image_std=image_std,
+            input_data_format=input_data_format,
+            data_format=data_format,
+            resample=resample,
+            do_convert_rgb=do_convert_rgb,
+        )["pixel_values"]
+        if max_length is not None:
+            max_length -= self.image_seq_length  # max_length has to account for the image tokens
+        text = self._construct_prompts(text)
+        inputs = self.tokenizer(
+            text,
+            return_tensors=return_tensors,
+            padding=padding,
+            max_length=max_length,
+            truncation=truncation,
+            return_token_type_ids=return_token_type_ids,
+        )
+        return_data = {**inputs, "pixel_values": pixel_values}
+        if return_token_type_ids:
+            labels = inputs["input_ids"].masked_fill(inputs["token_type_ids"] == 0, -100)
+            return_data.update({"labels": labels})
+        return BatchFeature(data=return_data)
+    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Florence2
+    def batch_decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to BartTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
+        refer to the docstring of this method for more information.
+        """
+        return self.tokenizer.batch_decode(*args, **kwargs)
+    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Florence2
+    def decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to BartTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
+        the docstring of this method for more information.
+        """
+        return self.tokenizer.decode(*args, **kwargs)
+    @property
+    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names with CLIP->Florence2
+    def model_input_names(self):
+        tokenizer_input_names = self.tokenizer.model_input_names
+        image_processor_input_names = self.image_processor.model_input_names
+        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
+    def post_process_generation(self, text, task, image_size):
+        """
+        Post-process the output of the model to each of the task outputs.
+        Args:
+            text (`str`): The text to post-process.
+            task (`str`): The task to post-process the text for.
+            image_size (`Tuple[int, int]`): The size of the image. height x width.
+        """
+        task_answer_post_processing_type = self.tasks_answer_post_processing_type.get(task, 'pure_text')
+        task_answer = self.post_processor(
+            text=text,
+            image_size=image_size,
+            parse_tasks=task_answer_post_processing_type,
+        )[task_answer_post_processing_type]
+        if task_answer_post_processing_type == 'pure_text':
+            final_answer = task_answer
+            # remove the special tokens
+            final_answer = final_answer.replace('<s>', '').replace('</s>', '\n')
+        elif task_answer_post_processing_type in ['od', 'description_with_bboxes', 'bboxes']:
+            od_instances = task_answer
+            bboxes_od = [_od_instance['bbox'] for _od_instance in od_instances]
+            labels_od = [str(_od_instance['cat_name']) for _od_instance in od_instances]
+            final_answer = {'bboxes': bboxes_od, 'labels': labels_od}
+        elif task_answer_post_processing_type in ['ocr']:
+            bboxes = [_od_instance['quad_box'] for _od_instance in task_answer]
+            labels = [str(_od_instance['text']) for _od_instance in task_answer]
+            final_answer = {'quad_boxes': bboxes, 'labels': labels}
+        elif task_answer_post_processing_type in ['phrase_grounding']:
+            bboxes = []
+            labels = []
+            for _grounded_phrase in task_answer:
+                for _bbox in _grounded_phrase['bbox']:
+                    bboxes.append(_bbox)
+                    labels.append(_grounded_phrase['cat_name'])
+            final_answer = {'bboxes': bboxes, 'labels': labels}
+        elif task_answer_post_processing_type in ['description_with_polygons', 'polygons']:
+            labels = []
+            polygons = []
+            for result in task_answer:
+                label = result['cat_name']
+                _polygons = result['polygons']
+                labels.append(label)
+                polygons.append(_polygons)
+            final_answer = {'polygons': polygons, 'labels': labels}
+        elif task_answer_post_processing_type in ['description_with_bboxes_or_polygons']:
+            bboxes = []
+            bboxes_labels = []
+            polygons = []
+            polygons_labels = []
+            for result in task_answer:
+                label = result['cat_name']
+                if 'polygons' in result:
+                    _polygons = result['polygons']
+                    polygons.append(_polygons)
+                    polygons_labels.append(label)
+                else:
+                    _bbox = result['bbox']
+                    bboxes.append(_bbox)
+                    bboxes_labels.append(label)
+            final_answer = {'bboxes': bboxes, 'bboxes_labels': bboxes_labels, 'polygons': polygons, 'polygons_labels': polygons_labels}
+        else:
+            raise ValueError('Unknown task answer post processing type: {}'.format(task_answer_post_processing_type))
+        final_answer = {
+            task: final_answer}
+        return final_answer
+class BoxQuantizer(object):
+    def __init__(self, mode, bins):
+        self.mode = mode
+        self.bins = bins
+    def quantize(self, boxes: torch.Tensor, size):
+        bins_w, bins_h = self.bins  # Quantization bins.
+        size_w, size_h = size       # Original image size.
+        size_per_bin_w = size_w / bins_w
+        size_per_bin_h = size_h / bins_h
+        xmin, ymin, xmax, ymax = boxes.split(1, dim=-1)  # Shape: 4 * [N, 1].
+        if self.mode == 'floor':
+            quantized_xmin = (
+                xmin / size_per_bin_w).floor().clamp(0, bins_w - 1)
+            quantized_ymin = (
+                ymin / size_per_bin_h).floor().clamp(0, bins_h - 1)
+            quantized_xmax = (
+                xmax / size_per_bin_w).floor().clamp(0, bins_w - 1)
+            quantized_ymax = (
+                ymax / size_per_bin_h).floor().clamp(0, bins_h - 1)
+        elif self.mode == 'round':
+            raise NotImplementedError()
+        else:
+            raise ValueError('Incorrect quantization type.')
+        quantized_boxes = torch.cat(
+            (quantized_xmin, quantized_ymin, quantized_xmax, quantized_ymax), dim=-1
+        ).int()
+        return quantized_boxes
+    def dequantize(self, boxes: torch.Tensor, size):
+        bins_w, bins_h = self.bins  # Quantization bins.
+        size_w, size_h = size       # Original image size.
+        size_per_bin_w = size_w / bins_w
+        size_per_bin_h = size_h / bins_h
+        xmin, ymin, xmax, ymax = boxes.split(1, dim=-1)  # Shape: 4 * [N, 1].
+        if self.mode == 'floor':
+            # Add 0.5 to use the center position of the bin as the coordinate.
+            dequantized_xmin = (xmin + 0.5) * size_per_bin_w
+            dequantized_ymin = (ymin + 0.5) * size_per_bin_h
+            dequantized_xmax = (xmax + 0.5) * size_per_bin_w
+            dequantized_ymax = (ymax + 0.5) * size_per_bin_h
+        elif self.mode == 'round':
+            raise NotImplementedError()
+        else:
+            raise ValueError('Incorrect quantization type.')
+        dequantized_boxes = torch.cat(
+            (dequantized_xmin, dequantized_ymin,
+             dequantized_xmax, dequantized_ymax), dim=-1
+        )
+        return dequantized_boxes
+class CoordinatesQuantizer(object):
+    """
+    Quantize coornidates (Nx2)
+    """
+    def __init__(self, mode, bins):
+        self.mode = mode
+        self.bins = bins
+    def quantize(self, coordinates: torch.Tensor, size):
+        bins_w, bins_h = self.bins  # Quantization bins.
+        size_w, size_h = size       # Original image size.
+        size_per_bin_w = size_w / bins_w
+        size_per_bin_h = size_h / bins_h
+        assert coordinates.shape[-1] == 2, 'coordinates should be shape (N, 2)'
+        x, y = coordinates.split(1, dim=-1)  # Shape: 4 * [N, 1].
+        if self.mode == 'floor':
+            quantized_x = (x / size_per_bin_w).floor().clamp(0, bins_w - 1)
+            quantized_y = (y / size_per_bin_h).floor().clamp(0, bins_h - 1)
+        elif self.mode == 'round':
+            raise NotImplementedError()
+        else:
+            raise ValueError('Incorrect quantization type.')
+        quantized_coordinates = torch.cat(
+            (quantized_x, quantized_y), dim=-1
+        ).int()
+        return quantized_coordinates
+    def dequantize(self, coordinates: torch.Tensor, size):
+        bins_w, bins_h = self.bins  # Quantization bins.
+        size_w, size_h = size       # Original image size.
+        size_per_bin_w = size_w / bins_w
+        size_per_bin_h = size_h / bins_h
+        assert coordinates.shape[-1] == 2, 'coordinates should be shape (N, 2)'
+        x, y = coordinates.split(1, dim=-1)  # Shape: 4 * [N, 1].
+        if self.mode == 'floor':
+            # Add 0.5 to use the center position of the bin as the coordinate.
+            dequantized_x = (x + 0.5) * size_per_bin_w
+            dequantized_y = (y + 0.5) * size_per_bin_h
+        elif self.mode == 'round':
+            raise NotImplementedError()
+        else:
+            raise ValueError('Incorrect quantization type.')
+        dequantized_coordinates = torch.cat(
+            (dequantized_x, dequantized_y), dim=-1
+        )
+        return dequantized_coordinates
+class Florence2PostProcesser(object):
+    """
+    Florence-2 post process for converting text prediction to various tasks results.
+    Args:
+        tokenizer: A tokenizer for decoding text to spans.
+    Note:
+        The config is not a constructor argument; it is built internally by
+        `_create_default_config`.
+        sample config:
+            UNIFIED_POST_PROCESS:
+                # common configs
+                NUM_BBOX_HEIGHT_BINS: 1000
+                NUM_BBOX_WIDTH_BINS: 1000
+                COORDINATES_HEIGHT_BINS: 1000
+                COORDINATES_WIDTH_BINS: 1000
+                # task specific configs, override the common configs
+                PARSE_TASKS:
+                    - TASK_NAME: 'video_dense_caption'
+                      PATTERN: 'r<time_(\d+)><time_(\d+)>([a-zA-Z0-9 ]+)'
+                      SCORE_MODE: 'avg_cat_name_scores'
+                      NUM_BINS: 100
+                    - TASK_NAME: 'od'
+                      PATTERN: 'r<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>([a-zA-Z0-9 ]+)'
+                      SCORE_MODE: 'avg_cat_name_scores'
+    Returns:
+        parsed_dict (dict): A dict of parsed results (returned when the instance is called).
+    """
+    def __init__(
+        self,
+        tokenizer=None
+    ):
+        parse_tasks = []
+        parse_task_configs = {}
+        config = self._create_default_config()
+        for task in config['PARSE_TASKS']:
+            parse_tasks.append(task['TASK_NAME'])
+            parse_task_configs[task['TASK_NAME']] = task
+        self.config = config
+        self.parse_tasks = parse_tasks
+        self.parse_tasks_configs = parse_task_configs
+        self.tokenizer =  tokenizer
+        if self.tokenizer is not None:
+            self.all_special_tokens = set(self.tokenizer.all_special_tokens)
+        self.init_quantizers()
+        self.black_list_of_phrase_grounding = self._create_black_list_of_phrase_grounding()
+    def _create_black_list_of_phrase_grounding(self):
+        black_list = {}
+        if 'phrase_grounding' in self.parse_tasks and self.parse_tasks_configs['phrase_grounding']['FILTER_BY_BLACK_LIST']:
+            black_list =  set(
+                ['it', 'I', 'me', 'mine',
+                 'you', 'your', 'yours',
+                 'he', 'him', 'his',
+                 'she', 'her', 'hers',
+                 'they', 'them', 'their', 'theirs',
+                 'one', 'oneself',
+                 'we', 'us', 'our', 'ours',
+                 'you', 'your', 'yours',
+                 'they', 'them', 'their', 'theirs',
+                 'mine', 'yours', 'his', 'hers', 'its',
+                 'ours', 'yours', 'theirs',
+                 'myself', 'yourself', 'himself', 'herself', 'itself',
+                 'ourselves', 'yourselves', 'themselves',
+                 'this', 'that',
+                 'these', 'those',
+                 'who', 'whom', 'whose', 'which', 'what',
+                 'who', 'whom', 'whose', 'which', 'that',
+                 'all', 'another', 'any', 'anybody', 'anyone', 'anything',
+                 'each', 'everybody', 'everyone', 'everything',
+                 'few', 'many', 'nobody', 'none', 'one', 'several',
+                 'some', 'somebody', 'someone', 'something',
+                 'each other', 'one another',
+                 'myself', 'yourself', 'himself', 'herself', 'itself',
+                 'ourselves', 'yourselves', 'themselves',
+                 'the image', 'image', 'images', 'the', 'a', 'an', 'a group',
+                 'other objects', 'lots', 'a set',
+                 ]
+            )
+        return black_list
+    def _create_default_config(self):
+        config = {
+            'NUM_BBOX_HEIGHT_BINS': 1000,
+            'NUM_BBOX_WIDTH_BINS': 1000,
+            'BOX_QUANTIZATION_MODE': 'floor',
+            'COORDINATES_HEIGHT_BINS': 1000,
+            'COORDINATES_WIDTH_BINS': 1000,
+            'COORDINATES_QUANTIZATION_MODE': 'floor',
+            'PARSE_TASKS': [
+                {
+                    'TASK_NAME': 'od',
+                    'PATTERN': r'([a-zA-Z0-9 ]+)<loc_(\\d+)><loc_(\\d+)><loc_(\\d+)><loc_(\\d+)>'
+                },
+                {
+                    'TASK_NAME': 'ocr',
+                    'PATTERN':  r'(.+?)<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>',
+                    'AREA_THRESHOLD': 0.01
+                },
+                {
+                    'TASK_NAME': 'phrase_grounding',
+                    'FILTER_BY_BLACK_LIST': True
+                },
+                {
+                    'TASK_NAME': 'pure_text',
+                },
+                {
+                    'TASK_NAME': 'description_with_bboxes',
+                },
+                {
+                    'TASK_NAME': 'description_with_polygons',
+                },
+                {
+                    'TASK_NAME': 'polygons',
+                },
+                {
+                    'TASK_NAME': 'bboxes',
+                },
+                {
+                    'TASK_NAME': 'description_with_bboxes_or_polygons',
+                }
+            ]
+        }
+        return config
+    def init_quantizers(self):
+        # we have box_quantizer (od, grounding) and coordinates_quantizer (ocr, referring_segmentation)
+        num_bbox_height_bins = self.config.get('NUM_BBOX_HEIGHT_BINS', 1000)
+        num_bbox_width_bins = self.config.get('NUM_BBOX_WIDTH_BINS', 1000)
+        box_quantization_mode = self.config.get('BOX_QUANTIZATION_MODE', 'floor')
+        self.box_quantizer = BoxQuantizer(
+            box_quantization_mode,
+            (num_bbox_width_bins, num_bbox_height_bins),
+        )
+        num_bbox_height_bins = self.config['COORDINATES_HEIGHT_BINS'] if 'COORDINATES_HEIGHT_BINS' in self.config else self.config.get('NUM_BBOX_HEIGHT_BINS', 1000)
+        num_bbox_width_bins = self.config['COORDINATES_WIDTH_BINS'] if 'COORDINATES_WIDTH_BINS' in self.config else self.config.get('NUM_BBOX_WIDTH_BINS', 1000)
+        box_quantization_mode = self.config.get('COORDINATES_QUANTIZATION_MODE') if 'COORDINATES_QUANTIZATION_MODE' in self.config else self.config.get('BOX_QUANTIZATION_MODE', 'floor')
+        self.coordinates_quantizer = CoordinatesQuantizer(
+            box_quantization_mode,
+            (num_bbox_width_bins, num_bbox_height_bins),
+        )
+    def decode_with_spans(self, tokenizer, token_ids):
+        filtered_tokens = tokenizer.convert_ids_to_tokens(
+            token_ids, skip_special_tokens=False)
+        assert len(filtered_tokens) == len(token_ids)
+        # To avoid mixing byte-level and unicode for byte-level BPT
+        # we need to build string separately for added tokens and byte-level tokens
+        # cf. https://github.com/huggingface/transformers/issues/1133
+        sub_texts = []
+        for token in filtered_tokens:
+            if token in self.all_special_tokens:
+                sub_texts.append(token)
+            else:
+                if isinstance(tokenizer, (BartTokenizer, BartTokenizerFast)):
+                    sub_text = tokenizer.convert_tokens_to_string([token])
+                elif isinstance(tokenizer, (T5Tokenizer, T5TokenizerFast)):
+                    # Ref: https://github.com/google/sentencepiece#whitespace-is-treated-as-a-basic-symbol
+                    # Note: Do not strip sub_text as it may have functional whitespace
+                    sub_text = token.replace('▁', ' ')
+                else:
+                    raise ValueError(f'type {type(tokenizer)} not supported')
+                sub_texts.append(sub_text)
+        text = ''
+        spans = []
+        for sub_text in sub_texts:
+            span = (len(text), len(text) + len(sub_text))  # [start index, end index).
+            text += sub_text
+            spans.append(span)
+        # Text format:
+        # 1. T5Tokenizer/T5TokenizerFast:
+        #      "<loc_1><loc_2><loc_3><loc_4> transplanting dog<loc_1><loc_2><loc_3><loc_4> cat</s>"
+        #    Equivalent to t5_tokenizer.decode(input_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False, spaces_between_special_tokens=False)
+        # 2. BartTokenizer (need to double check):
+        #      "<s><loc_1><loc_2><loc_3><loc_4>transplanting dog<loc_1><loc_2><loc_3><loc_4>cat</s>"
+        #    Equivalent to bart_tokenizer.decode(input_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False, spaces_between_special_tokens=False)
+        return text, spans
+    def parse_od_from_text_and_spans(
+        self,
+        text,
+        pattern,
+        image_size,
+        phrase_centric=False
+    ):
+        parsed = list(re.finditer(pattern, text))
+        instances = []
+        for i in range(len(parsed)):
+            # Prepare instance.
+            instance = {}
+            if phrase_centric:
+                bbox_bins = [int(parsed[i].group(j)) for j in range(2, 6)]
+            else:
+                bbox_bins = [int(parsed[i].group(j)) for j in range(1, 5)]
+            instance['bbox'] = self.box_quantizer.dequantize(
+                boxes=torch.tensor(bbox_bins),
+                size=image_size
+            ).tolist()
+            if phrase_centric:
+                instance['cat_name'] = parsed[i].group(1).lower().strip()
+            else:
+                instance['cat_name'] = parsed[i].group(5).lower().strip()
+            instances.append(instance)
+        return instances
+    def parse_ocr_from_text_and_spans(self,
+                                    text,
+                                     pattern,
+                                     image_size,
+                                     area_threshold=-1.0,
+        ):
+        bboxes = []
+        labels = []
+        text = text.replace('<s>', '')
+        # ocr with regions
+        parsed = re.findall(pattern, text)
+        instances = []
+        image_width, image_height = image_size
+        for ocr_line in parsed:
+            ocr_content = ocr_line[0]
+            quad_box = ocr_line[1:]
+            quad_box = [int(i) for i in quad_box]
+            quad_box = self.coordinates_quantizer.dequantize(
+                torch.tensor(np.array(quad_box).reshape(-1, 2)),
+                size=image_size
+            ).reshape(-1).tolist()
+            if area_threshold > 0:
+                x_coords = [i for i in quad_box[0::2]]
+                y_coords = [i for i in quad_box[1::2]]
+                # apply the Shoelace formula
+                area = 0.5 * abs(sum(x_coords[i] * y_coords[i + 1] - x_coords[i + 1] * y_coords[i] for i in range(4 - 1)))
+                if area < (image_width * image_height) * area_threshold:
+                    continue
+            bboxes.append(quad_box)
+            labels.append(ocr_content)
+            instances.append({
+                'quad_box': quad_box,
+                'text': ocr_content,
+            })
+        return instances
+    def parse_phrase_grounding_from_text_and_spans(self, text, pattern, image_size):
+        # ignore <s> </s> and <pad>
+        cur_span = 0
+        if text.startswith('<s>'):
+            cur_span += 3
+        text = text.replace('<s>', '')
+        text = text.replace('</s>', '')
+        text = text.replace('<pad>', '')
+        pattern = r"([^<]+(?:<loc_\d+>){4,})"
+        phrases = re.findall(pattern, text)
+        # pattern should be text pattern and od pattern
+        pattern = r'^\s*(.*?)(?=<od>|</od>|<box>|</box>|<bbox>|</bbox>|<loc_)'
+        box_pattern = r'<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>'
+        instances = []
+        for pharse_text in phrases:
+            phrase_text_strip = pharse_text.replace('<ground>', '', 1)
+            phrase_text_strip = pharse_text.replace('<obj>', '', 1)
+            if phrase_text_strip == '':
+                cur_span += len(pharse_text)
+                continue
+            # Prepare instance.
+            instance = {}
+            # parse phrase, get string
+            phrase = re.search(pattern, phrase_text_strip)
+            if phrase is None:
+                cur_span += len(pharse_text)
+                continue
+            # parse bboxes by box_pattern
+            bboxes_parsed = list(re.finditer(box_pattern, pharse_text))
+            if len(bboxes_parsed) == 0:
+                cur_span += len(pharse_text)
+                continue
+            phrase = phrase.group()
+            # remove leading and trailing spaces
+            phrase = phrase.strip()
+            if phrase in self.black_list_of_phrase_grounding:
+                cur_span += len(pharse_text)
+                continue
+            # a list of list
+            bbox_bins = [[int(_bboxes_parsed.group(j)) for j in range(1, 5)] for _bboxes_parsed in bboxes_parsed]
+            instance['bbox'] = self.box_quantizer.dequantize(
+                boxes=torch.tensor(bbox_bins),
+                size=image_size
+            ).tolist()
+            # exclude non-ascii characters
+            phrase = phrase.encode('ascii',errors='ignore').decode('ascii')
+            instance['cat_name'] = phrase
+            instances.append(instance)
+        return instances
+    def parse_description_with_bboxes_from_text_and_spans(self, text, pattern, image_size, allow_empty_phrase=False):
+        # temporary parse solution, split by '.'
+        # ignore <s> </s> and <pad>
+        text = text.replace('<s>', '')
+        text = text.replace('</s>', '')
+        text = text.replace('<pad>', '')
+        if allow_empty_phrase:
+            pattern = rf"(?:(?:<loc_\d+>){{4,}})"
+        else:
+            pattern = r"([^<]+(?:<loc_\d+>){4,})"
+        phrases = re.findall(pattern, text)
+        # pattern should be text pattern and od pattern
+        pattern = r'^\s*(.*?)(?=<od>|</od>|<box>|</box>|<bbox>|</bbox>|<loc_)'
+        box_pattern = r'<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>'
+        instances = []
+        for pharse_text in phrases:
+            phrase_text_strip = pharse_text.replace('<ground>', '', 1)
+            phrase_text_strip = pharse_text.replace('<obj>', '', 1)
+            if phrase_text_strip == '' and not allow_empty_phrase:
+                continue
+            # parse phrase, get string
+            phrase = re.search(pattern, phrase_text_strip)
+            if phrase is None:
+                continue
+            phrase = phrase.group()
+            # remove leading and trailing spaces
+            phrase = phrase.strip()
+            # parse bboxes by box_pattern
+            bboxes_parsed = list(re.finditer(box_pattern, pharse_text))
+            if len(bboxes_parsed) == 0:
+                continue
+            # a list of list
+            bbox_bins = [[int(_bboxes_parsed.group(j)) for j in range(1, 5)] for _bboxes_parsed in bboxes_parsed]
+            bboxes = self.box_quantizer.dequantize(
+                boxes=torch.tensor(bbox_bins),
+                size=image_size
+            ).tolist()
+            phrase = phrase.encode('ascii',errors='ignore').decode('ascii')
+            for _bboxes in bboxes:
+                # Prepare instance.
+                instance = {}
+                instance['bbox'] = _bboxes
+                # exclude non-ascii characters
+                instance['cat_name'] = phrase
+                instances.append(instance)
+        return instances
+    def parse_description_with_polygons_from_text_and_spans(self, text, pattern, image_size,
+                                                            allow_empty_phrase=False,
+                                                            polygon_sep_token='<sep>',
+                                                            polygon_start_token='<poly>',
+                                                            polygon_end_token='</poly>',
+                                                            with_box_at_start=False,
+                                                            ):
+        """
+        Parse phrases followed by polygon location tokens.
+        Args:
+            text: decoded model output; <s>, </s> and <pad> are removed first.
+            pattern: not read; it is overwritten below before first use.
+            image_size: passed to the quantizers as `size`.
+            allow_empty_phrase: when True, segments need no leading phrase.
+            polygon_sep_token: token separating the polygons of one instance.
+            polygon_start_token: token opening one polygons instance.
+            polygon_end_token: token closing one polygons instance.
+            with_box_at_start: when True, the first four values of the first
+                polygon are taken as a box (or a zero box if it has at most
+                four values).
+        Returns:
+            list of dicts with 'cat_name', 'polygons' (a list of flat
+            [x1, y1, x2, y2, ...] lists) and, when a box was taken, 'bbox'.
+        """
+        # ref_seg format: '<expression><x1><y1><x2><y2><><><sep><><><><>'
+        # ignore <s> </s> and <pad>
+        text = text.replace('<s>', '')
+        text = text.replace('</s>', '')
+        text = text.replace('<pad>', '')
+        if allow_empty_phrase:
+            pattern = rf"(?:(?:<loc_\d+>|{re.escape(polygon_sep_token)}|{re.escape(polygon_start_token)}|{re.escape(polygon_end_token)}){{4,}})"
+        else:
+            # [^<]+: This part matches one or more characters that are not the < symbol.
+            # The ^ inside the square brackets [] is a negation, meaning it matches anything except <.
+            # It is followed by at least four location / separator / start / end tokens.
+            pattern = rf"([^<]+(?:<loc_\d+>|{re.escape(polygon_sep_token)}|{re.escape(polygon_start_token)}|{re.escape(polygon_end_token)}){{4,}})"
+        phrases = re.findall(pattern, text)
+        # Captures the text before the first location / box / polygon token.
+        phrase_string_pattern = r'^\s*(.*?)(?=<od>|</od>|<box>|</box>|<bbox>|</bbox>|<loc_|<poly>)'
+        # One polygon: a run of location tokens ended by the separator or the end of the string.
+        box_pattern =  rf'((?:<loc_\d+>)+)(?:{re.escape(polygon_sep_token)}|$)'
+        # one polygons instance is separated by polygon_start_token and polygon_end_token
+        polygons_instance_pattern = rf'{re.escape(polygon_start_token)}(.*?){re.escape(polygon_end_token)}'
+        instances = []
+        for phrase_text in phrases:
+            # Strip a dangling "loc_N>" fragment at the start of the segment
+            # (a location token whose "<" is not part of the segment).
+            # need to get span if want to include category score
+            phrase_text_strip = re.sub(r'^loc_\d+>', '', phrase_text, count=1)
+            # phrase = phrase.replace('<poly>', '')
+            # phrase = phrase.replace('poly>', '')
+            if phrase_text_strip == '' and not allow_empty_phrase:
+                continue
+            # parse phrase, get string
+            phrase = re.search(phrase_string_pattern, phrase_text_strip)
+            if phrase is None:
+                continue
+            phrase = phrase.group()
+            # remove leading and trailing spaces
+            phrase = phrase.strip()
+            # parse bboxes by box_pattern
+            # split by polygon_start_token and polygon_end_token first using polygons_instance_pattern
+            if polygon_start_token in phrase_text and polygon_end_token in phrase_text:
+                polygons_instances_parsed = list(re.finditer(polygons_instance_pattern, phrase_text))
+            else:
+                # No start/end markers: treat the whole segment as one instance (a plain str).
+                polygons_instances_parsed = [phrase_text]
+            for _polygons_instances_parsed in polygons_instances_parsed:
+                # Prepare instance.
+                instance = {}
+                # polygons_parsed= list(re.finditer(box_pattern, phrase_text))
+                # The entry is either the raw segment (str) or a regex match whose group 1 is the instance body.
+                if isinstance(_polygons_instances_parsed, str):
+                    polygons_parsed= list(re.finditer(box_pattern, _polygons_instances_parsed))
+                else:
+                    polygons_parsed= list(re.finditer(box_pattern, _polygons_instances_parsed.group(1)))
+                if len(polygons_parsed) == 0:
+                    continue
+                # a list of list (polygon)
+                bbox = []
+                polygons = []
+                for _polygon_parsed in polygons_parsed:
+                    # group 1: the whole run of location tokens of this polygon
+                    _polygon = _polygon_parsed.group(1)
+                    # parse into list of int
+                    _polygon = [int(_loc_parsed.group(1)) for _loc_parsed in re.finditer(r'<loc_(\d+)>', _polygon)]
+                    # Only the first polygon of an instance can supply the box.
+                    if with_box_at_start and len(bbox) == 0:
+                        if len(_polygon) > 4:
+                            # the first four values are the box, the rest is the polygon
+                            bbox = _polygon[:4]
+                            _polygon = _polygon[4:]
+                        else:
+                            # no valid bbox prediction: fall back to a zero box
+                            bbox = [0, 0, 0, 0]
+                    # abandon last element if is not paired
+                    if len(_polygon) % 2 == 1:
+                        _polygon = _polygon[:-1]
+                    # reshape into (n, 2)
+                    _polygon = self.coordinates_quantizer.dequantize(
+                        torch.tensor(np.array(_polygon).reshape(-1, 2)),
+                        size=image_size
+                    ).reshape(-1).tolist()
+                    # reshape back
+                    polygons.append(_polygon)
+                instance['cat_name'] = phrase
+                instance['polygons'] = polygons
+                if len(bbox) != 0:
+                    instance['bbox'] = self.box_quantizer.dequantize(
+                        boxes=torch.tensor([bbox]),
+                        size=image_size
+                    ).tolist()[0]
+                instances.append(instance)
+        return instances
+    def __call__(
+        self,
+        text=None,
+        image_size=None,
+        parse_tasks=None,
+    ):
+        """
+        Args:
+            text: model outputs
+            image_size: (width, height)
+            parse_tasks: a list of tasks to parse, if None, parse all tasks.
+        """
+        if parse_tasks is not None:
+            if isinstance(parse_tasks, str):
+                parse_tasks = [parse_tasks]
+            for _parse_task in parse_tasks:
+                assert _parse_task in self.parse_tasks, f'parse task {_parse_task} not supported'
+        # sequence or text should be provided
+        assert text is not None, 'text should be provided'
+        parsed_dict = {
+            'text': text
+        }
+        for task in self.parse_tasks:
+            if parse_tasks is not None and task not in parse_tasks:
+                continue
+            pattern = self.parse_tasks_configs[task].get('PATTERN', None)
+            if task == 'ocr':
+                instances = self.parse_ocr_from_text_and_spans(
+                    text,
+                    pattern=pattern,
+                    image_size=image_size,
+                    area_threshold=self.parse_tasks_configs[task].get('AREA_THRESHOLD', 0.01),
+                )
+                parsed_dict['ocr'] = instances
+            elif task == 'phrase_grounding':
+                instances = self.parse_phrase_grounding_from_text_and_spans(
+                    text,
+                    pattern=pattern,
+                    image_size=image_size,
+                )
+                parsed_dict['phrase_grounding'] = instances
+            elif task == 'pure_text':
+                parsed_dict['pure_text'] = text
+            elif task == 'description_with_bboxes':
+                instances = self.parse_description_with_bboxes_from_text_and_spans(
+                    text,
+                    pattern=pattern,
+                    image_size=image_size,
+                )
+                parsed_dict['description_with_bboxes'] = instances
+            elif task == 'description_with_polygons':
+                instances = self.parse_description_with_polygons_from_text_and_spans(
+                    text,
+                    pattern=pattern,
+                    image_size=image_size,
+                )
+                parsed_dict['description_with_polygons'] = instances
+            elif task == 'polygons':
+                instances = self.parse_description_with_polygons_from_text_and_spans(
+                    text,
+                    pattern=pattern,
+                    image_size=image_size,
+                    allow_empty_phrase=True,
+                )
+                parsed_dict['polygons'] = instances
+            elif task == 'bboxes':
+                instances = self.parse_description_with_bboxes_from_text_and_spans(
+                    text,
+                    pattern=pattern,
+                    image_size=image_size,
+                    allow_empty_phrase=True,
+                )
+                parsed_dict['bboxes'] = instances
+            elif task == 'description_with_bboxes_or_polygons':
+                if '<poly>' in text:
+                    # only support either polygons or bboxes, not both at the same time
+                    instances = self.parse_description_with_polygons_from_text_and_spans(
+                        text,
+                        pattern=pattern,
+                        image_size=image_size,
+                    )
+                else:
+                    instances = self.parse_description_with_bboxes_from_text_and_spans(
+                        text,
+                        pattern=pattern,
+                        image_size=image_size,
+                    )
+                parsed_dict['description_with_bboxes_or_polygons'] = instances
+            else:
+                raise ValueError("task {} is not supported".format(task))
+        return parsed_dict

scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:447b01591a39408ae380370018f8c3db3a654297cbd0682a220c7c4e9f496973
+size 1064

special_tokens_map.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+    "model_max_length": 1024
+}

trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dc12cf2d0e354f63424e2fe939f573cf06daf39c77dd3c40a5df9ab04bd789d0
+size 6776

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

zero_to_fp32.py ADDED Viewed

	@@ -0,0 +1,592 @@

+#!/usr/bin/env python
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+# DeepSpeed Team
+# This script extracts fp32 consolidated weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example: python zero_to_fp32.py . pytorch_model.bin
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+from collections import OrderedDict
+from dataclasses import dataclass
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
+@dataclass
+class zero_model_state:
+    buffers: dict()
+    param_shapes: dict()
+    shared_params: list
+    ds_version: int
+    frozen_param_shapes: dict()
+    frozen_param_fragments: dict()
+# Set to a truthy value to print diagnostics while parsing checkpoints.
+debug = 0
+# load to cpu: used as `map_location` for every torch.load call below
+device = torch.device('cpu')
+def atoi(text):
+    return int(text) if text.isdigit() else text
+def natural_keys(text):
+    '''
+    alist.sort(key=natural_keys) sorts in human order
+    http://nedbatchelder.com/blog/200712/human_sorting.html
+    (See Toothy's implementation in the comments)
+    '''
+    return [atoi(c) for c in re.split(r'(\d+)', text)]
+def get_model_state_file(checkpoint_dir, zero_stage):
+    if not os.path.isdir(checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+    # there should be only one file
+    if zero_stage <= 2:
+        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+    elif zero_stage == 3:
+        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+    if not os.path.exists(file):
+        raise FileNotFoundError(f"can't find model states file at '{file}'")
+    return file
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+    if len(ckpt_files) == 0:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+    return ckpt_files
+def get_optim_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+def get_model_state_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+def parse_model_states(files):
+    zero_model_states = []
+    for file in files:
+        state_dict = torch.load(file, map_location=device)
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+        # collect parameters that are included in param_shapes
+        param_names = []
+        for s in param_shapes:
+            for name in s.keys():
+                param_names.append(name)
+        # update with frozen parameters
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if frozen_param_shapes is not None:
+            if debug:
+                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+            param_names += list(frozen_param_shapes.keys())
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+        ds_version = state_dict.get(DS_VERSION, None)
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+    return zero_model_states
+def parse_optim_states(files, ds_checkpoint_dir):
+    total_files = len(files)
+    state_dicts = []
+    for f in files:
+        state_dict = torch.load(f, map_location=device)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(state_dict)
+    if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+    if type(world_size) is list:
+        world_size = max(world_size)
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+    if zero_stage <= 2:
+        fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+    elif zero_stage == 3:
+        # if there is more than one param group, there will be multiple flattened tensors - one
+        # flattened tensor per group - for simplicity merge them into a single tensor
+        #
+        # XXX: could make the script more memory efficient for when there are multiple groups - it
+        # will require matching the sub-lists of param_shapes for each param group flattened tensor
+        fp32_flat_groups = [
+            torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
+        ]
+    return zero_stage, world_size, fp32_flat_groups
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+    optim_files = get_optim_files(ds_checkpoint_dir)
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+    model_files = get_model_state_files(ds_checkpoint_dir)
+    zero_model_states = parse_model_states(model_files)
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+    if zero_stage <= 2:
+        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states)
+    elif zero_stage == 3:
+        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states)
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        state_dict[name] = frozen_param_fragments[name]
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+def _has_callable(obj, fn):
+    attr = getattr(obj, fn, None)
+    return callable(attr)
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+    # Reconstruction protocol:
+    #
+    # XXX: document this
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        offset = 0
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+        def zero2_align(x):
+            return align_to * math.ceil(x / align_to)
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states):
+    state_dict = OrderedDict()
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+    _zero2_merge_frozen_params(state_dict, zero_model_states)
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+    return state_dict
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    remainder = unpartitioned_numel % world_size
+    padding_numel = (world_size - remainder) if remainder else 0
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+    return partitioned_numel, padding_numel
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+        frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+    total_params = 0
+    total_numel = 0
+    for name, shape in zero_model_states[0].frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = fp32_flat_groups[0].numel() * world_size
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        avail_numel = fp32_flat_groups[0].numel() * world_size
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    for name, shape in param_shapes.items():
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+        # XXX: memory usage doubles here
+        state_dict[name] = torch.cat(
+            tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
+            0).narrow(0, 0, unpartitioned_numel).view(shape)
+        offset += partitioned_numel
+    offset *= world_size
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states):
+    state_dict = OrderedDict()
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+    _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+    return state_dict
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+    Returns:
+        - pytorch ``state_dict``
+    Note: this approach may not work if your application doesn't have sufficient free CPU memory and
+    you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint.
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+    return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir)
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+    """
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+    print(f"Saving fp32 state dict to {output_file}")
+    torch.save(state_dict, output_file)
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+    Returns:
+        - ``model`: modified model
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    """
+    logger.info(f"Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+    logger.info(f"Overwriting model with fp32 weights")
+    model = model.cpu()
+    model.load_state_dict(state_dict, strict=False)
+    return model
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument(
+        "output_file",
+        type=str,
+        help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+    debug = args.debug
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file, tag=args.tag)