jiuhai commited on
Commit
fc6d379
1 Parent(s): 8cedb8a

Upload folder using huggingface_hub

Browse files
added_tokens.json ADDED
@@ -0,0 +1,1026 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</cap>": 51270,
3
+ "</dcap>": 51274,
4
+ "</grounding>": 51276,
5
+ "</ncap>": 51272,
6
+ "</ocr>": 50268,
7
+ "</od>": 50266,
8
+ "</poly>": 51287,
9
+ "</proposal>": 51285,
10
+ "</region_cap>": 51281,
11
+ "</region_to_desciption>": 51283,
12
+ "</seg>": 51278,
13
+ "<and>": 51288,
14
+ "<cap>": 51269,
15
+ "<dcap>": 51273,
16
+ "<grounding>": 51275,
17
+ "<loc_0>": 50269,
18
+ "<loc_100>": 50369,
19
+ "<loc_101>": 50370,
20
+ "<loc_102>": 50371,
21
+ "<loc_103>": 50372,
22
+ "<loc_104>": 50373,
23
+ "<loc_105>": 50374,
24
+ "<loc_106>": 50375,
25
+ "<loc_107>": 50376,
26
+ "<loc_108>": 50377,
27
+ "<loc_109>": 50378,
28
+ "<loc_10>": 50279,
29
+ "<loc_110>": 50379,
30
+ "<loc_111>": 50380,
31
+ "<loc_112>": 50381,
32
+ "<loc_113>": 50382,
33
+ "<loc_114>": 50383,
34
+ "<loc_115>": 50384,
35
+ "<loc_116>": 50385,
36
+ "<loc_117>": 50386,
37
+ "<loc_118>": 50387,
38
+ "<loc_119>": 50388,
39
+ "<loc_11>": 50280,
40
+ "<loc_120>": 50389,
41
+ "<loc_121>": 50390,
42
+ "<loc_122>": 50391,
43
+ "<loc_123>": 50392,
44
+ "<loc_124>": 50393,
45
+ "<loc_125>": 50394,
46
+ "<loc_126>": 50395,
47
+ "<loc_127>": 50396,
48
+ "<loc_128>": 50397,
49
+ "<loc_129>": 50398,
50
+ "<loc_12>": 50281,
51
+ "<loc_130>": 50399,
52
+ "<loc_131>": 50400,
53
+ "<loc_132>": 50401,
54
+ "<loc_133>": 50402,
55
+ "<loc_134>": 50403,
56
+ "<loc_135>": 50404,
57
+ "<loc_136>": 50405,
58
+ "<loc_137>": 50406,
59
+ "<loc_138>": 50407,
60
+ "<loc_139>": 50408,
61
+ "<loc_13>": 50282,
62
+ "<loc_140>": 50409,
63
+ "<loc_141>": 50410,
64
+ "<loc_142>": 50411,
65
+ "<loc_143>": 50412,
66
+ "<loc_144>": 50413,
67
+ "<loc_145>": 50414,
68
+ "<loc_146>": 50415,
69
+ "<loc_147>": 50416,
70
+ "<loc_148>": 50417,
71
+ "<loc_149>": 50418,
72
+ "<loc_14>": 50283,
73
+ "<loc_150>": 50419,
74
+ "<loc_151>": 50420,
75
+ "<loc_152>": 50421,
76
+ "<loc_153>": 50422,
77
+ "<loc_154>": 50423,
78
+ "<loc_155>": 50424,
79
+ "<loc_156>": 50425,
80
+ "<loc_157>": 50426,
81
+ "<loc_158>": 50427,
82
+ "<loc_159>": 50428,
83
+ "<loc_15>": 50284,
84
+ "<loc_160>": 50429,
85
+ "<loc_161>": 50430,
86
+ "<loc_162>": 50431,
87
+ "<loc_163>": 50432,
88
+ "<loc_164>": 50433,
89
+ "<loc_165>": 50434,
90
+ "<loc_166>": 50435,
91
+ "<loc_167>": 50436,
92
+ "<loc_168>": 50437,
93
+ "<loc_169>": 50438,
94
+ "<loc_16>": 50285,
95
+ "<loc_170>": 50439,
96
+ "<loc_171>": 50440,
97
+ "<loc_172>": 50441,
98
+ "<loc_173>": 50442,
99
+ "<loc_174>": 50443,
100
+ "<loc_175>": 50444,
101
+ "<loc_176>": 50445,
102
+ "<loc_177>": 50446,
103
+ "<loc_178>": 50447,
104
+ "<loc_179>": 50448,
105
+ "<loc_17>": 50286,
106
+ "<loc_180>": 50449,
107
+ "<loc_181>": 50450,
108
+ "<loc_182>": 50451,
109
+ "<loc_183>": 50452,
110
+ "<loc_184>": 50453,
111
+ "<loc_185>": 50454,
112
+ "<loc_186>": 50455,
113
+ "<loc_187>": 50456,
114
+ "<loc_188>": 50457,
115
+ "<loc_189>": 50458,
116
+ "<loc_18>": 50287,
117
+ "<loc_190>": 50459,
118
+ "<loc_191>": 50460,
119
+ "<loc_192>": 50461,
120
+ "<loc_193>": 50462,
121
+ "<loc_194>": 50463,
122
+ "<loc_195>": 50464,
123
+ "<loc_196>": 50465,
124
+ "<loc_197>": 50466,
125
+ "<loc_198>": 50467,
126
+ "<loc_199>": 50468,
127
+ "<loc_19>": 50288,
128
+ "<loc_1>": 50270,
129
+ "<loc_200>": 50469,
130
+ "<loc_201>": 50470,
131
+ "<loc_202>": 50471,
132
+ "<loc_203>": 50472,
133
+ "<loc_204>": 50473,
134
+ "<loc_205>": 50474,
135
+ "<loc_206>": 50475,
136
+ "<loc_207>": 50476,
137
+ "<loc_208>": 50477,
138
+ "<loc_209>": 50478,
139
+ "<loc_20>": 50289,
140
+ "<loc_210>": 50479,
141
+ "<loc_211>": 50480,
142
+ "<loc_212>": 50481,
143
+ "<loc_213>": 50482,
144
+ "<loc_214>": 50483,
145
+ "<loc_215>": 50484,
146
+ "<loc_216>": 50485,
147
+ "<loc_217>": 50486,
148
+ "<loc_218>": 50487,
149
+ "<loc_219>": 50488,
150
+ "<loc_21>": 50290,
151
+ "<loc_220>": 50489,
152
+ "<loc_221>": 50490,
153
+ "<loc_222>": 50491,
154
+ "<loc_223>": 50492,
155
+ "<loc_224>": 50493,
156
+ "<loc_225>": 50494,
157
+ "<loc_226>": 50495,
158
+ "<loc_227>": 50496,
159
+ "<loc_228>": 50497,
160
+ "<loc_229>": 50498,
161
+ "<loc_22>": 50291,
162
+ "<loc_230>": 50499,
163
+ "<loc_231>": 50500,
164
+ "<loc_232>": 50501,
165
+ "<loc_233>": 50502,
166
+ "<loc_234>": 50503,
167
+ "<loc_235>": 50504,
168
+ "<loc_236>": 50505,
169
+ "<loc_237>": 50506,
170
+ "<loc_238>": 50507,
171
+ "<loc_239>": 50508,
172
+ "<loc_23>": 50292,
173
+ "<loc_240>": 50509,
174
+ "<loc_241>": 50510,
175
+ "<loc_242>": 50511,
176
+ "<loc_243>": 50512,
177
+ "<loc_244>": 50513,
178
+ "<loc_245>": 50514,
179
+ "<loc_246>": 50515,
180
+ "<loc_247>": 50516,
181
+ "<loc_248>": 50517,
182
+ "<loc_249>": 50518,
183
+ "<loc_24>": 50293,
184
+ "<loc_250>": 50519,
185
+ "<loc_251>": 50520,
186
+ "<loc_252>": 50521,
187
+ "<loc_253>": 50522,
188
+ "<loc_254>": 50523,
189
+ "<loc_255>": 50524,
190
+ "<loc_256>": 50525,
191
+ "<loc_257>": 50526,
192
+ "<loc_258>": 50527,
193
+ "<loc_259>": 50528,
194
+ "<loc_25>": 50294,
195
+ "<loc_260>": 50529,
196
+ "<loc_261>": 50530,
197
+ "<loc_262>": 50531,
198
+ "<loc_263>": 50532,
199
+ "<loc_264>": 50533,
200
+ "<loc_265>": 50534,
201
+ "<loc_266>": 50535,
202
+ "<loc_267>": 50536,
203
+ "<loc_268>": 50537,
204
+ "<loc_269>": 50538,
205
+ "<loc_26>": 50295,
206
+ "<loc_270>": 50539,
207
+ "<loc_271>": 50540,
208
+ "<loc_272>": 50541,
209
+ "<loc_273>": 50542,
210
+ "<loc_274>": 50543,
211
+ "<loc_275>": 50544,
212
+ "<loc_276>": 50545,
213
+ "<loc_277>": 50546,
214
+ "<loc_278>": 50547,
215
+ "<loc_279>": 50548,
216
+ "<loc_27>": 50296,
217
+ "<loc_280>": 50549,
218
+ "<loc_281>": 50550,
219
+ "<loc_282>": 50551,
220
+ "<loc_283>": 50552,
221
+ "<loc_284>": 50553,
222
+ "<loc_285>": 50554,
223
+ "<loc_286>": 50555,
224
+ "<loc_287>": 50556,
225
+ "<loc_288>": 50557,
226
+ "<loc_289>": 50558,
227
+ "<loc_28>": 50297,
228
+ "<loc_290>": 50559,
229
+ "<loc_291>": 50560,
230
+ "<loc_292>": 50561,
231
+ "<loc_293>": 50562,
232
+ "<loc_294>": 50563,
233
+ "<loc_295>": 50564,
234
+ "<loc_296>": 50565,
235
+ "<loc_297>": 50566,
236
+ "<loc_298>": 50567,
237
+ "<loc_299>": 50568,
238
+ "<loc_29>": 50298,
239
+ "<loc_2>": 50271,
240
+ "<loc_300>": 50569,
241
+ "<loc_301>": 50570,
242
+ "<loc_302>": 50571,
243
+ "<loc_303>": 50572,
244
+ "<loc_304>": 50573,
245
+ "<loc_305>": 50574,
246
+ "<loc_306>": 50575,
247
+ "<loc_307>": 50576,
248
+ "<loc_308>": 50577,
249
+ "<loc_309>": 50578,
250
+ "<loc_30>": 50299,
251
+ "<loc_310>": 50579,
252
+ "<loc_311>": 50580,
253
+ "<loc_312>": 50581,
254
+ "<loc_313>": 50582,
255
+ "<loc_314>": 50583,
256
+ "<loc_315>": 50584,
257
+ "<loc_316>": 50585,
258
+ "<loc_317>": 50586,
259
+ "<loc_318>": 50587,
260
+ "<loc_319>": 50588,
261
+ "<loc_31>": 50300,
262
+ "<loc_320>": 50589,
263
+ "<loc_321>": 50590,
264
+ "<loc_322>": 50591,
265
+ "<loc_323>": 50592,
266
+ "<loc_324>": 50593,
267
+ "<loc_325>": 50594,
268
+ "<loc_326>": 50595,
269
+ "<loc_327>": 50596,
270
+ "<loc_328>": 50597,
271
+ "<loc_329>": 50598,
272
+ "<loc_32>": 50301,
273
+ "<loc_330>": 50599,
274
+ "<loc_331>": 50600,
275
+ "<loc_332>": 50601,
276
+ "<loc_333>": 50602,
277
+ "<loc_334>": 50603,
278
+ "<loc_335>": 50604,
279
+ "<loc_336>": 50605,
280
+ "<loc_337>": 50606,
281
+ "<loc_338>": 50607,
282
+ "<loc_339>": 50608,
283
+ "<loc_33>": 50302,
284
+ "<loc_340>": 50609,
285
+ "<loc_341>": 50610,
286
+ "<loc_342>": 50611,
287
+ "<loc_343>": 50612,
288
+ "<loc_344>": 50613,
289
+ "<loc_345>": 50614,
290
+ "<loc_346>": 50615,
291
+ "<loc_347>": 50616,
292
+ "<loc_348>": 50617,
293
+ "<loc_349>": 50618,
294
+ "<loc_34>": 50303,
295
+ "<loc_350>": 50619,
296
+ "<loc_351>": 50620,
297
+ "<loc_352>": 50621,
298
+ "<loc_353>": 50622,
299
+ "<loc_354>": 50623,
300
+ "<loc_355>": 50624,
301
+ "<loc_356>": 50625,
302
+ "<loc_357>": 50626,
303
+ "<loc_358>": 50627,
304
+ "<loc_359>": 50628,
305
+ "<loc_35>": 50304,
306
+ "<loc_360>": 50629,
307
+ "<loc_361>": 50630,
308
+ "<loc_362>": 50631,
309
+ "<loc_363>": 50632,
310
+ "<loc_364>": 50633,
311
+ "<loc_365>": 50634,
312
+ "<loc_366>": 50635,
313
+ "<loc_367>": 50636,
314
+ "<loc_368>": 50637,
315
+ "<loc_369>": 50638,
316
+ "<loc_36>": 50305,
317
+ "<loc_370>": 50639,
318
+ "<loc_371>": 50640,
319
+ "<loc_372>": 50641,
320
+ "<loc_373>": 50642,
321
+ "<loc_374>": 50643,
322
+ "<loc_375>": 50644,
323
+ "<loc_376>": 50645,
324
+ "<loc_377>": 50646,
325
+ "<loc_378>": 50647,
326
+ "<loc_379>": 50648,
327
+ "<loc_37>": 50306,
328
+ "<loc_380>": 50649,
329
+ "<loc_381>": 50650,
330
+ "<loc_382>": 50651,
331
+ "<loc_383>": 50652,
332
+ "<loc_384>": 50653,
333
+ "<loc_385>": 50654,
334
+ "<loc_386>": 50655,
335
+ "<loc_387>": 50656,
336
+ "<loc_388>": 50657,
337
+ "<loc_389>": 50658,
338
+ "<loc_38>": 50307,
339
+ "<loc_390>": 50659,
340
+ "<loc_391>": 50660,
341
+ "<loc_392>": 50661,
342
+ "<loc_393>": 50662,
343
+ "<loc_394>": 50663,
344
+ "<loc_395>": 50664,
345
+ "<loc_396>": 50665,
346
+ "<loc_397>": 50666,
347
+ "<loc_398>": 50667,
348
+ "<loc_399>": 50668,
349
+ "<loc_39>": 50308,
350
+ "<loc_3>": 50272,
351
+ "<loc_400>": 50669,
352
+ "<loc_401>": 50670,
353
+ "<loc_402>": 50671,
354
+ "<loc_403>": 50672,
355
+ "<loc_404>": 50673,
356
+ "<loc_405>": 50674,
357
+ "<loc_406>": 50675,
358
+ "<loc_407>": 50676,
359
+ "<loc_408>": 50677,
360
+ "<loc_409>": 50678,
361
+ "<loc_40>": 50309,
362
+ "<loc_410>": 50679,
363
+ "<loc_411>": 50680,
364
+ "<loc_412>": 50681,
365
+ "<loc_413>": 50682,
366
+ "<loc_414>": 50683,
367
+ "<loc_415>": 50684,
368
+ "<loc_416>": 50685,
369
+ "<loc_417>": 50686,
370
+ "<loc_418>": 50687,
371
+ "<loc_419>": 50688,
372
+ "<loc_41>": 50310,
373
+ "<loc_420>": 50689,
374
+ "<loc_421>": 50690,
375
+ "<loc_422>": 50691,
376
+ "<loc_423>": 50692,
377
+ "<loc_424>": 50693,
378
+ "<loc_425>": 50694,
379
+ "<loc_426>": 50695,
380
+ "<loc_427>": 50696,
381
+ "<loc_428>": 50697,
382
+ "<loc_429>": 50698,
383
+ "<loc_42>": 50311,
384
+ "<loc_430>": 50699,
385
+ "<loc_431>": 50700,
386
+ "<loc_432>": 50701,
387
+ "<loc_433>": 50702,
388
+ "<loc_434>": 50703,
389
+ "<loc_435>": 50704,
390
+ "<loc_436>": 50705,
391
+ "<loc_437>": 50706,
392
+ "<loc_438>": 50707,
393
+ "<loc_439>": 50708,
394
+ "<loc_43>": 50312,
395
+ "<loc_440>": 50709,
396
+ "<loc_441>": 50710,
397
+ "<loc_442>": 50711,
398
+ "<loc_443>": 50712,
399
+ "<loc_444>": 50713,
400
+ "<loc_445>": 50714,
401
+ "<loc_446>": 50715,
402
+ "<loc_447>": 50716,
403
+ "<loc_448>": 50717,
404
+ "<loc_449>": 50718,
405
+ "<loc_44>": 50313,
406
+ "<loc_450>": 50719,
407
+ "<loc_451>": 50720,
408
+ "<loc_452>": 50721,
409
+ "<loc_453>": 50722,
410
+ "<loc_454>": 50723,
411
+ "<loc_455>": 50724,
412
+ "<loc_456>": 50725,
413
+ "<loc_457>": 50726,
414
+ "<loc_458>": 50727,
415
+ "<loc_459>": 50728,
416
+ "<loc_45>": 50314,
417
+ "<loc_460>": 50729,
418
+ "<loc_461>": 50730,
419
+ "<loc_462>": 50731,
420
+ "<loc_463>": 50732,
421
+ "<loc_464>": 50733,
422
+ "<loc_465>": 50734,
423
+ "<loc_466>": 50735,
424
+ "<loc_467>": 50736,
425
+ "<loc_468>": 50737,
426
+ "<loc_469>": 50738,
427
+ "<loc_46>": 50315,
428
+ "<loc_470>": 50739,
429
+ "<loc_471>": 50740,
430
+ "<loc_472>": 50741,
431
+ "<loc_473>": 50742,
432
+ "<loc_474>": 50743,
433
+ "<loc_475>": 50744,
434
+ "<loc_476>": 50745,
435
+ "<loc_477>": 50746,
436
+ "<loc_478>": 50747,
437
+ "<loc_479>": 50748,
438
+ "<loc_47>": 50316,
439
+ "<loc_480>": 50749,
440
+ "<loc_481>": 50750,
441
+ "<loc_482>": 50751,
442
+ "<loc_483>": 50752,
443
+ "<loc_484>": 50753,
444
+ "<loc_485>": 50754,
445
+ "<loc_486>": 50755,
446
+ "<loc_487>": 50756,
447
+ "<loc_488>": 50757,
448
+ "<loc_489>": 50758,
449
+ "<loc_48>": 50317,
450
+ "<loc_490>": 50759,
451
+ "<loc_491>": 50760,
452
+ "<loc_492>": 50761,
453
+ "<loc_493>": 50762,
454
+ "<loc_494>": 50763,
455
+ "<loc_495>": 50764,
456
+ "<loc_496>": 50765,
457
+ "<loc_497>": 50766,
458
+ "<loc_498>": 50767,
459
+ "<loc_499>": 50768,
460
+ "<loc_49>": 50318,
461
+ "<loc_4>": 50273,
462
+ "<loc_500>": 50769,
463
+ "<loc_501>": 50770,
464
+ "<loc_502>": 50771,
465
+ "<loc_503>": 50772,
466
+ "<loc_504>": 50773,
467
+ "<loc_505>": 50774,
468
+ "<loc_506>": 50775,
469
+ "<loc_507>": 50776,
470
+ "<loc_508>": 50777,
471
+ "<loc_509>": 50778,
472
+ "<loc_50>": 50319,
473
+ "<loc_510>": 50779,
474
+ "<loc_511>": 50780,
475
+ "<loc_512>": 50781,
476
+ "<loc_513>": 50782,
477
+ "<loc_514>": 50783,
478
+ "<loc_515>": 50784,
479
+ "<loc_516>": 50785,
480
+ "<loc_517>": 50786,
481
+ "<loc_518>": 50787,
482
+ "<loc_519>": 50788,
483
+ "<loc_51>": 50320,
484
+ "<loc_520>": 50789,
485
+ "<loc_521>": 50790,
486
+ "<loc_522>": 50791,
487
+ "<loc_523>": 50792,
488
+ "<loc_524>": 50793,
489
+ "<loc_525>": 50794,
490
+ "<loc_526>": 50795,
491
+ "<loc_527>": 50796,
492
+ "<loc_528>": 50797,
493
+ "<loc_529>": 50798,
494
+ "<loc_52>": 50321,
495
+ "<loc_530>": 50799,
496
+ "<loc_531>": 50800,
497
+ "<loc_532>": 50801,
498
+ "<loc_533>": 50802,
499
+ "<loc_534>": 50803,
500
+ "<loc_535>": 50804,
501
+ "<loc_536>": 50805,
502
+ "<loc_537>": 50806,
503
+ "<loc_538>": 50807,
504
+ "<loc_539>": 50808,
505
+ "<loc_53>": 50322,
506
+ "<loc_540>": 50809,
507
+ "<loc_541>": 50810,
508
+ "<loc_542>": 50811,
509
+ "<loc_543>": 50812,
510
+ "<loc_544>": 50813,
511
+ "<loc_545>": 50814,
512
+ "<loc_546>": 50815,
513
+ "<loc_547>": 50816,
514
+ "<loc_548>": 50817,
515
+ "<loc_549>": 50818,
516
+ "<loc_54>": 50323,
517
+ "<loc_550>": 50819,
518
+ "<loc_551>": 50820,
519
+ "<loc_552>": 50821,
520
+ "<loc_553>": 50822,
521
+ "<loc_554>": 50823,
522
+ "<loc_555>": 50824,
523
+ "<loc_556>": 50825,
524
+ "<loc_557>": 50826,
525
+ "<loc_558>": 50827,
526
+ "<loc_559>": 50828,
527
+ "<loc_55>": 50324,
528
+ "<loc_560>": 50829,
529
+ "<loc_561>": 50830,
530
+ "<loc_562>": 50831,
531
+ "<loc_563>": 50832,
532
+ "<loc_564>": 50833,
533
+ "<loc_565>": 50834,
534
+ "<loc_566>": 50835,
535
+ "<loc_567>": 50836,
536
+ "<loc_568>": 50837,
537
+ "<loc_569>": 50838,
538
+ "<loc_56>": 50325,
539
+ "<loc_570>": 50839,
540
+ "<loc_571>": 50840,
541
+ "<loc_572>": 50841,
542
+ "<loc_573>": 50842,
543
+ "<loc_574>": 50843,
544
+ "<loc_575>": 50844,
545
+ "<loc_576>": 50845,
546
+ "<loc_577>": 50846,
547
+ "<loc_578>": 50847,
548
+ "<loc_579>": 50848,
549
+ "<loc_57>": 50326,
550
+ "<loc_580>": 50849,
551
+ "<loc_581>": 50850,
552
+ "<loc_582>": 50851,
553
+ "<loc_583>": 50852,
554
+ "<loc_584>": 50853,
555
+ "<loc_585>": 50854,
556
+ "<loc_586>": 50855,
557
+ "<loc_587>": 50856,
558
+ "<loc_588>": 50857,
559
+ "<loc_589>": 50858,
560
+ "<loc_58>": 50327,
561
+ "<loc_590>": 50859,
562
+ "<loc_591>": 50860,
563
+ "<loc_592>": 50861,
564
+ "<loc_593>": 50862,
565
+ "<loc_594>": 50863,
566
+ "<loc_595>": 50864,
567
+ "<loc_596>": 50865,
568
+ "<loc_597>": 50866,
569
+ "<loc_598>": 50867,
570
+ "<loc_599>": 50868,
571
+ "<loc_59>": 50328,
572
+ "<loc_5>": 50274,
573
+ "<loc_600>": 50869,
574
+ "<loc_601>": 50870,
575
+ "<loc_602>": 50871,
576
+ "<loc_603>": 50872,
577
+ "<loc_604>": 50873,
578
+ "<loc_605>": 50874,
579
+ "<loc_606>": 50875,
580
+ "<loc_607>": 50876,
581
+ "<loc_608>": 50877,
582
+ "<loc_609>": 50878,
583
+ "<loc_60>": 50329,
584
+ "<loc_610>": 50879,
585
+ "<loc_611>": 50880,
586
+ "<loc_612>": 50881,
587
+ "<loc_613>": 50882,
588
+ "<loc_614>": 50883,
589
+ "<loc_615>": 50884,
590
+ "<loc_616>": 50885,
591
+ "<loc_617>": 50886,
592
+ "<loc_618>": 50887,
593
+ "<loc_619>": 50888,
594
+ "<loc_61>": 50330,
595
+ "<loc_620>": 50889,
596
+ "<loc_621>": 50890,
597
+ "<loc_622>": 50891,
598
+ "<loc_623>": 50892,
599
+ "<loc_624>": 50893,
600
+ "<loc_625>": 50894,
601
+ "<loc_626>": 50895,
602
+ "<loc_627>": 50896,
603
+ "<loc_628>": 50897,
604
+ "<loc_629>": 50898,
605
+ "<loc_62>": 50331,
606
+ "<loc_630>": 50899,
607
+ "<loc_631>": 50900,
608
+ "<loc_632>": 50901,
609
+ "<loc_633>": 50902,
610
+ "<loc_634>": 50903,
611
+ "<loc_635>": 50904,
612
+ "<loc_636>": 50905,
613
+ "<loc_637>": 50906,
614
+ "<loc_638>": 50907,
615
+ "<loc_639>": 50908,
616
+ "<loc_63>": 50332,
617
+ "<loc_640>": 50909,
618
+ "<loc_641>": 50910,
619
+ "<loc_642>": 50911,
620
+ "<loc_643>": 50912,
621
+ "<loc_644>": 50913,
622
+ "<loc_645>": 50914,
623
+ "<loc_646>": 50915,
624
+ "<loc_647>": 50916,
625
+ "<loc_648>": 50917,
626
+ "<loc_649>": 50918,
627
+ "<loc_64>": 50333,
628
+ "<loc_650>": 50919,
629
+ "<loc_651>": 50920,
630
+ "<loc_652>": 50921,
631
+ "<loc_653>": 50922,
632
+ "<loc_654>": 50923,
633
+ "<loc_655>": 50924,
634
+ "<loc_656>": 50925,
635
+ "<loc_657>": 50926,
636
+ "<loc_658>": 50927,
637
+ "<loc_659>": 50928,
638
+ "<loc_65>": 50334,
639
+ "<loc_660>": 50929,
640
+ "<loc_661>": 50930,
641
+ "<loc_662>": 50931,
642
+ "<loc_663>": 50932,
643
+ "<loc_664>": 50933,
644
+ "<loc_665>": 50934,
645
+ "<loc_666>": 50935,
646
+ "<loc_667>": 50936,
647
+ "<loc_668>": 50937,
648
+ "<loc_669>": 50938,
649
+ "<loc_66>": 50335,
650
+ "<loc_670>": 50939,
651
+ "<loc_671>": 50940,
652
+ "<loc_672>": 50941,
653
+ "<loc_673>": 50942,
654
+ "<loc_674>": 50943,
655
+ "<loc_675>": 50944,
656
+ "<loc_676>": 50945,
657
+ "<loc_677>": 50946,
658
+ "<loc_678>": 50947,
659
+ "<loc_679>": 50948,
660
+ "<loc_67>": 50336,
661
+ "<loc_680>": 50949,
662
+ "<loc_681>": 50950,
663
+ "<loc_682>": 50951,
664
+ "<loc_683>": 50952,
665
+ "<loc_684>": 50953,
666
+ "<loc_685>": 50954,
667
+ "<loc_686>": 50955,
668
+ "<loc_687>": 50956,
669
+ "<loc_688>": 50957,
670
+ "<loc_689>": 50958,
671
+ "<loc_68>": 50337,
672
+ "<loc_690>": 50959,
673
+ "<loc_691>": 50960,
674
+ "<loc_692>": 50961,
675
+ "<loc_693>": 50962,
676
+ "<loc_694>": 50963,
677
+ "<loc_695>": 50964,
678
+ "<loc_696>": 50965,
679
+ "<loc_697>": 50966,
680
+ "<loc_698>": 50967,
681
+ "<loc_699>": 50968,
682
+ "<loc_69>": 50338,
683
+ "<loc_6>": 50275,
684
+ "<loc_700>": 50969,
685
+ "<loc_701>": 50970,
686
+ "<loc_702>": 50971,
687
+ "<loc_703>": 50972,
688
+ "<loc_704>": 50973,
689
+ "<loc_705>": 50974,
690
+ "<loc_706>": 50975,
691
+ "<loc_707>": 50976,
692
+ "<loc_708>": 50977,
693
+ "<loc_709>": 50978,
694
+ "<loc_70>": 50339,
695
+ "<loc_710>": 50979,
696
+ "<loc_711>": 50980,
697
+ "<loc_712>": 50981,
698
+ "<loc_713>": 50982,
699
+ "<loc_714>": 50983,
700
+ "<loc_715>": 50984,
701
+ "<loc_716>": 50985,
702
+ "<loc_717>": 50986,
703
+ "<loc_718>": 50987,
704
+ "<loc_719>": 50988,
705
+ "<loc_71>": 50340,
706
+ "<loc_720>": 50989,
707
+ "<loc_721>": 50990,
708
+ "<loc_722>": 50991,
709
+ "<loc_723>": 50992,
710
+ "<loc_724>": 50993,
711
+ "<loc_725>": 50994,
712
+ "<loc_726>": 50995,
713
+ "<loc_727>": 50996,
714
+ "<loc_728>": 50997,
715
+ "<loc_729>": 50998,
716
+ "<loc_72>": 50341,
717
+ "<loc_730>": 50999,
718
+ "<loc_731>": 51000,
719
+ "<loc_732>": 51001,
720
+ "<loc_733>": 51002,
721
+ "<loc_734>": 51003,
722
+ "<loc_735>": 51004,
723
+ "<loc_736>": 51005,
724
+ "<loc_737>": 51006,
725
+ "<loc_738>": 51007,
726
+ "<loc_739>": 51008,
727
+ "<loc_73>": 50342,
728
+ "<loc_740>": 51009,
729
+ "<loc_741>": 51010,
730
+ "<loc_742>": 51011,
731
+ "<loc_743>": 51012,
732
+ "<loc_744>": 51013,
733
+ "<loc_745>": 51014,
734
+ "<loc_746>": 51015,
735
+ "<loc_747>": 51016,
736
+ "<loc_748>": 51017,
737
+ "<loc_749>": 51018,
738
+ "<loc_74>": 50343,
739
+ "<loc_750>": 51019,
740
+ "<loc_751>": 51020,
741
+ "<loc_752>": 51021,
742
+ "<loc_753>": 51022,
743
+ "<loc_754>": 51023,
744
+ "<loc_755>": 51024,
745
+ "<loc_756>": 51025,
746
+ "<loc_757>": 51026,
747
+ "<loc_758>": 51027,
748
+ "<loc_759>": 51028,
749
+ "<loc_75>": 50344,
750
+ "<loc_760>": 51029,
751
+ "<loc_761>": 51030,
752
+ "<loc_762>": 51031,
753
+ "<loc_763>": 51032,
754
+ "<loc_764>": 51033,
755
+ "<loc_765>": 51034,
756
+ "<loc_766>": 51035,
757
+ "<loc_767>": 51036,
758
+ "<loc_768>": 51037,
759
+ "<loc_769>": 51038,
760
+ "<loc_76>": 50345,
761
+ "<loc_770>": 51039,
762
+ "<loc_771>": 51040,
763
+ "<loc_772>": 51041,
764
+ "<loc_773>": 51042,
765
+ "<loc_774>": 51043,
766
+ "<loc_775>": 51044,
767
+ "<loc_776>": 51045,
768
+ "<loc_777>": 51046,
769
+ "<loc_778>": 51047,
770
+ "<loc_779>": 51048,
771
+ "<loc_77>": 50346,
772
+ "<loc_780>": 51049,
773
+ "<loc_781>": 51050,
774
+ "<loc_782>": 51051,
775
+ "<loc_783>": 51052,
776
+ "<loc_784>": 51053,
777
+ "<loc_785>": 51054,
778
+ "<loc_786>": 51055,
779
+ "<loc_787>": 51056,
780
+ "<loc_788>": 51057,
781
+ "<loc_789>": 51058,
782
+ "<loc_78>": 50347,
783
+ "<loc_790>": 51059,
784
+ "<loc_791>": 51060,
785
+ "<loc_792>": 51061,
786
+ "<loc_793>": 51062,
787
+ "<loc_794>": 51063,
788
+ "<loc_795>": 51064,
789
+ "<loc_796>": 51065,
790
+ "<loc_797>": 51066,
791
+ "<loc_798>": 51067,
792
+ "<loc_799>": 51068,
793
+ "<loc_79>": 50348,
794
+ "<loc_7>": 50276,
795
+ "<loc_800>": 51069,
796
+ "<loc_801>": 51070,
797
+ "<loc_802>": 51071,
798
+ "<loc_803>": 51072,
799
+ "<loc_804>": 51073,
800
+ "<loc_805>": 51074,
801
+ "<loc_806>": 51075,
802
+ "<loc_807>": 51076,
803
+ "<loc_808>": 51077,
804
+ "<loc_809>": 51078,
805
+ "<loc_80>": 50349,
806
+ "<loc_810>": 51079,
807
+ "<loc_811>": 51080,
808
+ "<loc_812>": 51081,
809
+ "<loc_813>": 51082,
810
+ "<loc_814>": 51083,
811
+ "<loc_815>": 51084,
812
+ "<loc_816>": 51085,
813
+ "<loc_817>": 51086,
814
+ "<loc_818>": 51087,
815
+ "<loc_819>": 51088,
816
+ "<loc_81>": 50350,
817
+ "<loc_820>": 51089,
818
+ "<loc_821>": 51090,
819
+ "<loc_822>": 51091,
820
+ "<loc_823>": 51092,
821
+ "<loc_824>": 51093,
822
+ "<loc_825>": 51094,
823
+ "<loc_826>": 51095,
824
+ "<loc_827>": 51096,
825
+ "<loc_828>": 51097,
826
+ "<loc_829>": 51098,
827
+ "<loc_82>": 50351,
828
+ "<loc_830>": 51099,
829
+ "<loc_831>": 51100,
830
+ "<loc_832>": 51101,
831
+ "<loc_833>": 51102,
832
+ "<loc_834>": 51103,
833
+ "<loc_835>": 51104,
834
+ "<loc_836>": 51105,
835
+ "<loc_837>": 51106,
836
+ "<loc_838>": 51107,
837
+ "<loc_839>": 51108,
838
+ "<loc_83>": 50352,
839
+ "<loc_840>": 51109,
840
+ "<loc_841>": 51110,
841
+ "<loc_842>": 51111,
842
+ "<loc_843>": 51112,
843
+ "<loc_844>": 51113,
844
+ "<loc_845>": 51114,
845
+ "<loc_846>": 51115,
846
+ "<loc_847>": 51116,
847
+ "<loc_848>": 51117,
848
+ "<loc_849>": 51118,
849
+ "<loc_84>": 50353,
850
+ "<loc_850>": 51119,
851
+ "<loc_851>": 51120,
852
+ "<loc_852>": 51121,
853
+ "<loc_853>": 51122,
854
+ "<loc_854>": 51123,
855
+ "<loc_855>": 51124,
856
+ "<loc_856>": 51125,
857
+ "<loc_857>": 51126,
858
+ "<loc_858>": 51127,
859
+ "<loc_859>": 51128,
860
+ "<loc_85>": 50354,
861
+ "<loc_860>": 51129,
862
+ "<loc_861>": 51130,
863
+ "<loc_862>": 51131,
864
+ "<loc_863>": 51132,
865
+ "<loc_864>": 51133,
866
+ "<loc_865>": 51134,
867
+ "<loc_866>": 51135,
868
+ "<loc_867>": 51136,
869
+ "<loc_868>": 51137,
870
+ "<loc_869>": 51138,
871
+ "<loc_86>": 50355,
872
+ "<loc_870>": 51139,
873
+ "<loc_871>": 51140,
874
+ "<loc_872>": 51141,
875
+ "<loc_873>": 51142,
876
+ "<loc_874>": 51143,
877
+ "<loc_875>": 51144,
878
+ "<loc_876>": 51145,
879
+ "<loc_877>": 51146,
880
+ "<loc_878>": 51147,
881
+ "<loc_879>": 51148,
882
+ "<loc_87>": 50356,
883
+ "<loc_880>": 51149,
884
+ "<loc_881>": 51150,
885
+ "<loc_882>": 51151,
886
+ "<loc_883>": 51152,
887
+ "<loc_884>": 51153,
888
+ "<loc_885>": 51154,
889
+ "<loc_886>": 51155,
890
+ "<loc_887>": 51156,
891
+ "<loc_888>": 51157,
892
+ "<loc_889>": 51158,
893
+ "<loc_88>": 50357,
894
+ "<loc_890>": 51159,
895
+ "<loc_891>": 51160,
896
+ "<loc_892>": 51161,
897
+ "<loc_893>": 51162,
898
+ "<loc_894>": 51163,
899
+ "<loc_895>": 51164,
900
+ "<loc_896>": 51165,
901
+ "<loc_897>": 51166,
902
+ "<loc_898>": 51167,
903
+ "<loc_899>": 51168,
904
+ "<loc_89>": 50358,
905
+ "<loc_8>": 50277,
906
+ "<loc_900>": 51169,
907
+ "<loc_901>": 51170,
908
+ "<loc_902>": 51171,
909
+ "<loc_903>": 51172,
910
+ "<loc_904>": 51173,
911
+ "<loc_905>": 51174,
912
+ "<loc_906>": 51175,
913
+ "<loc_907>": 51176,
914
+ "<loc_908>": 51177,
915
+ "<loc_909>": 51178,
916
+ "<loc_90>": 50359,
917
+ "<loc_910>": 51179,
918
+ "<loc_911>": 51180,
919
+ "<loc_912>": 51181,
920
+ "<loc_913>": 51182,
921
+ "<loc_914>": 51183,
922
+ "<loc_915>": 51184,
923
+ "<loc_916>": 51185,
924
+ "<loc_917>": 51186,
925
+ "<loc_918>": 51187,
926
+ "<loc_919>": 51188,
927
+ "<loc_91>": 50360,
928
+ "<loc_920>": 51189,
929
+ "<loc_921>": 51190,
930
+ "<loc_922>": 51191,
931
+ "<loc_923>": 51192,
932
+ "<loc_924>": 51193,
933
+ "<loc_925>": 51194,
934
+ "<loc_926>": 51195,
935
+ "<loc_927>": 51196,
936
+ "<loc_928>": 51197,
937
+ "<loc_929>": 51198,
938
+ "<loc_92>": 50361,
939
+ "<loc_930>": 51199,
940
+ "<loc_931>": 51200,
941
+ "<loc_932>": 51201,
942
+ "<loc_933>": 51202,
943
+ "<loc_934>": 51203,
944
+ "<loc_935>": 51204,
945
+ "<loc_936>": 51205,
946
+ "<loc_937>": 51206,
947
+ "<loc_938>": 51207,
948
+ "<loc_939>": 51208,
949
+ "<loc_93>": 50362,
950
+ "<loc_940>": 51209,
951
+ "<loc_941>": 51210,
952
+ "<loc_942>": 51211,
953
+ "<loc_943>": 51212,
954
+ "<loc_944>": 51213,
955
+ "<loc_945>": 51214,
956
+ "<loc_946>": 51215,
957
+ "<loc_947>": 51216,
958
+ "<loc_948>": 51217,
959
+ "<loc_949>": 51218,
960
+ "<loc_94>": 50363,
961
+ "<loc_950>": 51219,
962
+ "<loc_951>": 51220,
963
+ "<loc_952>": 51221,
964
+ "<loc_953>": 51222,
965
+ "<loc_954>": 51223,
966
+ "<loc_955>": 51224,
967
+ "<loc_956>": 51225,
968
+ "<loc_957>": 51226,
969
+ "<loc_958>": 51227,
970
+ "<loc_959>": 51228,
971
+ "<loc_95>": 50364,
972
+ "<loc_960>": 51229,
973
+ "<loc_961>": 51230,
974
+ "<loc_962>": 51231,
975
+ "<loc_963>": 51232,
976
+ "<loc_964>": 51233,
977
+ "<loc_965>": 51234,
978
+ "<loc_966>": 51235,
979
+ "<loc_967>": 51236,
980
+ "<loc_968>": 51237,
981
+ "<loc_969>": 51238,
982
+ "<loc_96>": 50365,
983
+ "<loc_970>": 51239,
984
+ "<loc_971>": 51240,
985
+ "<loc_972>": 51241,
986
+ "<loc_973>": 51242,
987
+ "<loc_974>": 51243,
988
+ "<loc_975>": 51244,
989
+ "<loc_976>": 51245,
990
+ "<loc_977>": 51246,
991
+ "<loc_978>": 51247,
992
+ "<loc_979>": 51248,
993
+ "<loc_97>": 50366,
994
+ "<loc_980>": 51249,
995
+ "<loc_981>": 51250,
996
+ "<loc_982>": 51251,
997
+ "<loc_983>": 51252,
998
+ "<loc_984>": 51253,
999
+ "<loc_985>": 51254,
1000
+ "<loc_986>": 51255,
1001
+ "<loc_987>": 51256,
1002
+ "<loc_988>": 51257,
1003
+ "<loc_989>": 51258,
1004
+ "<loc_98>": 50367,
1005
+ "<loc_990>": 51259,
1006
+ "<loc_991>": 51260,
1007
+ "<loc_992>": 51261,
1008
+ "<loc_993>": 51262,
1009
+ "<loc_994>": 51263,
1010
+ "<loc_995>": 51264,
1011
+ "<loc_996>": 51265,
1012
+ "<loc_997>": 51266,
1013
+ "<loc_998>": 51267,
1014
+ "<loc_999>": 51268,
1015
+ "<loc_99>": 50368,
1016
+ "<loc_9>": 50278,
1017
+ "<ncap>": 51271,
1018
+ "<ocr>": 50267,
1019
+ "<od>": 50265,
1020
+ "<poly>": 51286,
1021
+ "<proposal>": 51284,
1022
+ "<region_cap>": 51280,
1023
+ "<region_to_desciption>": 51282,
1024
+ "<seg>": 51277,
1025
+ "<sep>": 51279
1026
+ }
config.json ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/fsx_0/user/jiuhai/hub/models--microsoft--Florence-2-large-ft/snapshots/c669c6b8bfbd7f0193fcb31f997879045a3612f3",
3
+ "architectures": [
4
+ "Florence2ForConditionalGeneration"
5
+ ],
6
+ "auto_map": {
7
+ "AutoConfig": "configuration_florence2.Florence2Config",
8
+ "AutoModelForCausalLM": "modeling_florence2.Florence2ForConditionalGeneration"
9
+ },
10
+ "bos_token_id": 0,
11
+ "eos_token_id": 2,
12
+ "ignore_index": -100,
13
+ "is_encoder_decoder": true,
14
+ "model_type": "florence2",
15
+ "pad_token_id": 1,
16
+ "projection_dim": 1024,
17
+ "text_config": {
18
+ "_name_or_path": "",
19
+ "activation_dropout": 0.1,
20
+ "activation_function": "gelu",
21
+ "add_bias_logits": false,
22
+ "add_cross_attention": false,
23
+ "add_final_layer_norm": false,
24
+ "architectures": null,
25
+ "attention_dropout": 0.1,
26
+ "bad_words_ids": null,
27
+ "begin_suppress_tokens": null,
28
+ "bos_token_id": 0,
29
+ "chunk_size_feed_forward": 0,
30
+ "classif_dropout": 0.1,
31
+ "classifier_dropout": 0.0,
32
+ "cross_attention_hidden_size": null,
33
+ "d_model": 1024,
34
+ "decoder_attention_heads": 16,
35
+ "decoder_ffn_dim": 4096,
36
+ "decoder_layerdrop": 0.0,
37
+ "decoder_layers": 12,
38
+ "decoder_start_token_id": 2,
39
+ "diversity_penalty": 0.0,
40
+ "do_sample": false,
41
+ "dropout": 0.1,
42
+ "early_stopping": true,
43
+ "encoder_attention_heads": 16,
44
+ "encoder_ffn_dim": 4096,
45
+ "encoder_layerdrop": 0.0,
46
+ "encoder_layers": 12,
47
+ "encoder_no_repeat_ngram_size": 0,
48
+ "eos_token_id": 2,
49
+ "exponential_decay_length_penalty": null,
50
+ "finetuning_task": null,
51
+ "forced_bos_token_id": 0,
52
+ "forced_eos_token_id": 2,
53
+ "gradient_checkpointing": false,
54
+ "id2label": {
55
+ "0": "LABEL_0",
56
+ "1": "LABEL_1",
57
+ "2": "LABEL_2"
58
+ },
59
+ "init_std": 0.02,
60
+ "is_decoder": false,
61
+ "is_encoder_decoder": true,
62
+ "label2id": {
63
+ "LABEL_0": 0,
64
+ "LABEL_1": 1,
65
+ "LABEL_2": 2
66
+ },
67
+ "length_penalty": 1.0,
68
+ "max_length": 20,
69
+ "max_position_embeddings": 1024,
70
+ "min_length": 0,
71
+ "model_type": "florence2_language",
72
+ "no_repeat_ngram_size": 3,
73
+ "normalize_before": false,
74
+ "num_beam_groups": 1,
75
+ "num_beams": 3,
76
+ "num_hidden_layers": 12,
77
+ "num_return_sequences": 1,
78
+ "output_attentions": false,
79
+ "output_hidden_states": false,
80
+ "output_scores": false,
81
+ "pad_token_id": 1,
82
+ "prefix": null,
83
+ "problem_type": null,
84
+ "pruned_heads": {},
85
+ "remove_invalid_values": false,
86
+ "repetition_penalty": 1.0,
87
+ "return_dict": true,
88
+ "return_dict_in_generate": false,
89
+ "scale_embedding": false,
90
+ "sep_token_id": null,
91
+ "suppress_tokens": null,
92
+ "task_specific_params": null,
93
+ "temperature": 1.0,
94
+ "tf_legacy_loss": false,
95
+ "tie_encoder_decoder": false,
96
+ "tie_word_embeddings": true,
97
+ "tokenizer_class": null,
98
+ "top_k": 50,
99
+ "top_p": 1.0,
100
+ "torch_dtype": null,
101
+ "torchscript": false,
102
+ "typical_p": 1.0,
103
+ "use_bfloat16": false,
104
+ "use_cache": true,
105
+ "vocab_size": 51289
106
+ },
107
+ "torch_dtype": "float32",
108
+ "transformers_version": "4.43.1",
109
+ "vision_config": {
110
+ "_name_or_path": "",
111
+ "add_cross_attention": false,
112
+ "architectures": null,
113
+ "bad_words_ids": null,
114
+ "begin_suppress_tokens": null,
115
+ "bos_token_id": null,
116
+ "chunk_size_feed_forward": 0,
117
+ "cross_attention_hidden_size": null,
118
+ "decoder_start_token_id": null,
119
+ "depths": [
120
+ 1,
121
+ 1,
122
+ 9,
123
+ 1
124
+ ],
125
+ "dim_embed": [
126
+ 256,
127
+ 512,
128
+ 1024,
129
+ 2048
130
+ ],
131
+ "diversity_penalty": 0.0,
132
+ "do_sample": false,
133
+ "drop_path_rate": 0.1,
134
+ "early_stopping": false,
135
+ "enable_checkpoint": false,
136
+ "encoder_no_repeat_ngram_size": 0,
137
+ "eos_token_id": null,
138
+ "exponential_decay_length_penalty": null,
139
+ "finetuning_task": null,
140
+ "forced_bos_token_id": null,
141
+ "forced_eos_token_id": null,
142
+ "id2label": {
143
+ "0": "LABEL_0",
144
+ "1": "LABEL_1"
145
+ },
146
+ "image_feature_source": [
147
+ "spatial_avg_pool",
148
+ "temporal_avg_pool"
149
+ ],
150
+ "image_pos_embed": {
151
+ "max_pos_embeddings": 50,
152
+ "type": "learned_abs_2d"
153
+ },
154
+ "is_decoder": false,
155
+ "is_encoder_decoder": false,
156
+ "label2id": {
157
+ "LABEL_0": 0,
158
+ "LABEL_1": 1
159
+ },
160
+ "length_penalty": 1.0,
161
+ "max_length": 20,
162
+ "min_length": 0,
163
+ "model_type": "",
164
+ "no_repeat_ngram_size": 0,
165
+ "num_beam_groups": 1,
166
+ "num_beams": 1,
167
+ "num_groups": [
168
+ 8,
169
+ 16,
170
+ 32,
171
+ 64
172
+ ],
173
+ "num_heads": [
174
+ 8,
175
+ 16,
176
+ 32,
177
+ 64
178
+ ],
179
+ "num_return_sequences": 1,
180
+ "output_attentions": false,
181
+ "output_hidden_states": false,
182
+ "output_scores": false,
183
+ "pad_token_id": null,
184
+ "patch_padding": [
185
+ 3,
186
+ 1,
187
+ 1,
188
+ 1
189
+ ],
190
+ "patch_prenorm": [
191
+ false,
192
+ true,
193
+ true,
194
+ true
195
+ ],
196
+ "patch_size": [
197
+ 7,
198
+ 3,
199
+ 3,
200
+ 3
201
+ ],
202
+ "patch_stride": [
203
+ 4,
204
+ 2,
205
+ 2,
206
+ 2
207
+ ],
208
+ "prefix": null,
209
+ "problem_type": null,
210
+ "projection_dim": 1024,
211
+ "pruned_heads": {},
212
+ "remove_invalid_values": false,
213
+ "repetition_penalty": 1.0,
214
+ "return_dict": true,
215
+ "return_dict_in_generate": false,
216
+ "sep_token_id": null,
217
+ "suppress_tokens": null,
218
+ "task_specific_params": null,
219
+ "temperature": 1.0,
220
+ "tf_legacy_loss": false,
221
+ "tie_encoder_decoder": false,
222
+ "tie_word_embeddings": true,
223
+ "tokenizer_class": null,
224
+ "top_k": 50,
225
+ "top_p": 1.0,
226
+ "torch_dtype": null,
227
+ "torchscript": false,
228
+ "typical_p": 1.0,
229
+ "use_bfloat16": false,
230
+ "visual_temporal_embedding": {
231
+ "max_temporal_embeddings": 100,
232
+ "type": "COSINE"
233
+ },
234
+ "window_size": 12
235
+ },
236
+ "vocab_size": 51289
237
+ }
configuration_florence2.py ADDED
@@ -0,0 +1,340 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import warnings
15
+ """ Florence-2 configuration"""
16
+
17
+ from typing import Optional
18
+
19
+ from transformers import AutoConfig
20
+ from transformers.configuration_utils import PretrainedConfig
21
+ from transformers.utils import logging
22
+
23
+ logger = logging.get_logger(__name__)
24
+
25
class Florence2VisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Florence2VisionModel`]. It is used to instantiate a Florence2VisionModel
    according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the Florence2VisionModel architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Every list-valued argument defaults to `None`, which is replaced by a freshly built list holding the documented
    default, so no two configuration instances share (and accidentally mutate) the same list object.

    Args:
        drop_path_rate (`float`, *optional*, defaults to 0.1):
            The dropout rate of the drop path layer.
        patch_size (`List[int]`, *optional*, defaults to `[7, 3, 3, 3]`):
            The patch size of the image.
        patch_stride (`List[int]`, *optional*, defaults to `[4, 2, 2, 2]`):
            The patch stride of the image.
        patch_padding (`List[int]`, *optional*, defaults to `[3, 1, 1, 1]`):
            The patch padding of the image.
        patch_prenorm (`List[bool]`, *optional*, defaults to `[False, True, True, True]`):
            Whether to apply layer normalization before the patch embedding layer.
        enable_checkpoint (`bool`, *optional*, defaults to `False`):
            Whether to enable checkpointing.
        dim_embed (`List[int]`, *optional*, defaults to `[256, 512, 1024, 2048]`):
            The dimension of the embedding layer.
        num_heads (`List[int]`, *optional*, defaults to `[8, 16, 32, 64]`):
            The number of attention heads.
        num_groups (`List[int]`, *optional*, defaults to `[8, 16, 32, 64]`):
            The number of groups.
        depths (`List[int]`, *optional*, defaults to `[1, 1, 9, 1]`):
            The depth of the model.
        window_size (`int`, *optional*, defaults to 12):
            The window size of the model.
        projection_dim (`int`, *optional*, defaults to 1024):
            The dimension of the projection layer.
        visual_temporal_embedding (`dict`, *optional*):
            The configuration of the visual temporal embedding.
        image_pos_embed (`dict`, *optional*):
            The configuration of the image position embedding.
        image_feature_source (`List[str]`, *optional*, defaults to `["spatial_avg_pool", "temporal_avg_pool"]`):
            The source of the image feature.

    Example:

    ```python
    >>> from transformers import Florence2VisionConfig, Florence2VisionModel

    >>> # Initializing a Florence2 Vision style configuration
    >>> configuration = Florence2VisionConfig()

    >>> # Initializing a model (with random weights)
    >>> model = Florence2VisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "florence2_vision"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        drop_path_rate=0.1,
        patch_size=None,
        patch_stride=None,
        patch_padding=None,
        patch_prenorm=None,
        enable_checkpoint=False,
        dim_embed=None,
        num_heads=None,
        num_groups=None,
        depths=None,
        window_size=12,
        projection_dim=1024,
        visual_temporal_embedding=None,
        image_pos_embed=None,
        image_feature_source=None,
        **kwargs,
    ):
        self.drop_path_rate = drop_path_rate
        # List defaults are materialised per instance (never as shared default arguments).
        self.patch_size = [7, 3, 3, 3] if patch_size is None else patch_size
        self.patch_stride = [4, 2, 2, 2] if patch_stride is None else patch_stride
        self.patch_padding = [3, 1, 1, 1] if patch_padding is None else patch_padding
        self.patch_prenorm = [False, True, True, True] if patch_prenorm is None else patch_prenorm
        self.enable_checkpoint = enable_checkpoint
        self.dim_embed = [256, 512, 1024, 2048] if dim_embed is None else dim_embed
        self.num_heads = [8, 16, 32, 64] if num_heads is None else num_heads
        self.num_groups = [8, 16, 32, 64] if num_groups is None else num_groups
        self.depths = [1, 1, 9, 1] if depths is None else depths
        self.window_size = window_size
        self.projection_dim = projection_dim
        self.visual_temporal_embedding = visual_temporal_embedding
        self.image_pos_embed = image_pos_embed
        self.image_feature_source = (
            ["spatial_avg_pool", "temporal_avg_pool"] if image_feature_source is None else image_feature_source
        )

        super().__init__(**kwargs)
119
+
120
+
121
+
122
class Florence2LanguageConfig(PretrainedConfig):
    r"""
    Configuration container for a [`Florence2LanguagePreTrainedModel`]. It is used to instantiate a BART-style
    encoder-decoder according to the given arguments, which together define the model architecture. Using the
    defaults yields a configuration similar to that of the BART
    [facebook/bart-large](https://huggingface.co/facebook/bart-large) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 51289):
            Vocabulary size of the Florence2Language model. Defines the number of different tokens that can be
            represented by the `inputs_ids` passed when calling [`Florence2LanguageModel`].
        d_model (`int`, *optional*, defaults to 1024):
            Dimensionality of the layers and the pooler layer.
        encoder_layers (`int`, *optional*, defaults to 12):
            Number of encoder layers.
        decoder_layers (`int`, *optional*, defaults to 12):
            Number of decoder layers.
        encoder_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        decoder_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer decoder.
        decoder_ffn_dim (`int`, *optional*, defaults to 4096):
            Dimensionality of the "intermediate" (often named feed-forward) layer in the decoder.
        encoder_ffn_dim (`int`, *optional*, defaults to 4096):
            Dimensionality of the "intermediate" (often named feed-forward) layer in the encoder.
        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"silu"` and `"gelu_new"` are supported.
        dropout (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        activation_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for activations inside the fully connected layer.
        classifier_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the classifier.
        max_position_embeddings (`int`, *optional*, defaults to 1024):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        init_std (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        encoder_layerdrop (`float`, *optional*, defaults to 0.0):
            The LayerDrop probability for the encoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
            for more details.
        decoder_layerdrop (`float`, *optional*, defaults to 0.0):
            The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
            for more details.
        scale_embedding (`bool`, *optional*, defaults to `False`):
            Whether to scale the embeddings; the inline note in `__init__` states the factor is sqrt(d_model).
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
        num_labels (`int`, *optional*, defaults to 3):
            The number of labels to use in [`Florence2LanguageForSequenceClassification`].
        forced_eos_token_id (`int`, *optional*, defaults to 2):
            The id of the token to force as the last generated token when `max_length` is reached. Usually set to
            `eos_token_id`.

    Example:

    ```python
    >>> from transformers import Florence2LanguageConfig, Florence2LanguageModel

    >>> # Initializing a Florence2 Language style configuration
    >>> configuration = Florence2LanguageConfig()

    >>> # Initializing a model (with random weights)
    >>> model = Florence2LanguageModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "florence2_language"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}

    def __init__(
        self,
        vocab_size=51289,
        max_position_embeddings=1024,
        encoder_layers=12,
        encoder_ffn_dim=4096,
        encoder_attention_heads=16,
        decoder_layers=12,
        decoder_ffn_dim=4096,
        decoder_attention_heads=16,
        encoder_layerdrop=0.0,
        decoder_layerdrop=0.0,
        activation_function="gelu",
        d_model=1024,
        dropout=0.1,
        attention_dropout=0.0,
        activation_dropout=0.0,
        init_std=0.02,
        classifier_dropout=0.0,
        scale_embedding=False,
        use_cache=True,
        num_labels=3,
        pad_token_id=1,
        bos_token_id=0,
        eos_token_id=2,
        is_encoder_decoder=True,
        decoder_start_token_id=2,
        forced_eos_token_id=2,
        **kwargs,
    ):
        # Vocabulary / sequence geometry.
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.d_model = d_model

        # Encoder stack.
        self.encoder_layers = encoder_layers
        self.encoder_ffn_dim = encoder_ffn_dim
        self.encoder_attention_heads = encoder_attention_heads
        self.encoder_layerdrop = encoder_layerdrop

        # Decoder stack.
        self.decoder_layers = decoder_layers
        self.decoder_ffn_dim = decoder_ffn_dim
        self.decoder_attention_heads = decoder_attention_heads
        self.decoder_layerdrop = decoder_layerdrop

        # Regularisation, activation and initialisation.
        self.activation_function = activation_function
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.classifier_dropout = classifier_dropout
        self.init_std = init_std

        # Miscellaneous switches.
        self.use_cache = use_cache
        self.num_hidden_layers = encoder_layers
        self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True

        base_kwargs = dict(
            num_labels=num_labels,
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            is_encoder_decoder=is_encoder_decoder,
            decoder_start_token_id=decoder_start_token_id,
            forced_eos_token_id=forced_eos_token_id,
        )
        super().__init__(**base_kwargs, **kwargs)

        # ensure backward compatibility for BART CNN models
        legacy_force_bos = kwargs.get("force_bos_token_to_be_generated", False)
        if self.forced_bos_token_id is None and legacy_force_bos:
            self.forced_bos_token_id = self.bos_token_id
            warnings.warn(
                f"Please make sure the config includes `forced_bos_token_id={self.bos_token_id}` in future versions. "
                "The config can simply be saved and uploaded again to be fixed."
            )
271
+
272
class Florence2Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Florence2ForConditionalGeneration`]. It is used to instantiate a
    Florence-2 model according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vision_config (`Union[PretrainedConfig, dict]`, *optional*):
            Vision backbone configuration. A mapping is expanded into a generic [`PretrainedConfig`]; an existing
            [`PretrainedConfig`] instance (for example a `Florence2VisionConfig`) is stored unchanged.
        text_config (`Union[PretrainedConfig, dict]`, *optional*):
            Text backbone configuration. A mapping is expanded into a `Florence2LanguageConfig`; an existing
            [`PretrainedConfig`] instance is stored unchanged.
        ignore_index (`int`, *optional*, defaults to -100):
            The ignore index for the loss function.
        vocab_size (`int`, *optional*, defaults to 51289):
            Vocabulary size of the Florence2 model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`~Florence2ForConditionalGeneration`]
        projection_dim (`int`, *optional*, defaults to 1024):
            Dimension of the multimodal projection space.

    Example:

    ```python
    >>> # Initializing the sub-configurations
    >>> vision_config = Florence2VisionConfig()
    >>> text_config = Florence2LanguageConfig()

    >>> # Initializing a Florence-2 configuration (dicts are accepted as well)
    >>> configuration = Florence2Config(vision_config=vision_config, text_config=text_config)
    ```"""

    model_type = "florence2"
    is_composition = False

    def __init__(
        self,
        vision_config=None,
        text_config=None,
        ignore_index=-100,
        vocab_size=51289,
        projection_dim=1024,
        **kwargs,
    ):
        self.ignore_index = ignore_index
        self.vocab_size = vocab_size
        self.projection_dim = projection_dim

        # Only mappings are expanded with `**`; a ready-made config object used to
        # crash here ("argument after ** must be a mapping") and is now kept as-is.
        if vision_config is not None and not isinstance(vision_config, PretrainedConfig):
            vision_config = PretrainedConfig(**vision_config)
        self.vision_config = vision_config

        if text_config is not None and not isinstance(text_config, PretrainedConfig):
            text_config = Florence2LanguageConfig(**text_config)
        self.text_config = text_config

        super().__init__(**kwargs)
340
+
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
modeling_florence2.py ADDED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_florence2.Florence2Processor"
4
+ },
5
+ "crop_size": {
6
+ "height": 768,
7
+ "width": 768
8
+ },
9
+ "do_center_crop": false,
10
+ "do_convert_rgb": null,
11
+ "do_normalize": true,
12
+ "do_rescale": true,
13
+ "do_resize": true,
14
+ "image_mean": [
15
+ 0.485,
16
+ 0.456,
17
+ 0.406
18
+ ],
19
+ "image_processor_type": "CLIPImageProcessor",
20
+ "image_seq_length": 577,
21
+ "image_std": [
22
+ 0.229,
23
+ 0.224,
24
+ 0.225
25
+ ],
26
+ "processor_class": "Florence2Processor",
27
+ "resample": 3,
28
+ "rescale_factor": 0.00392156862745098,
29
+ "size": {
30
+ "height": 768,
31
+ "width": 768
32
+ }
33
+ }
processing_florence2.py ADDED
@@ -0,0 +1,1124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 Microsoft and The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """
16
+ Processor class for Florence-2.
17
+ """
18
+
19
+ import re
20
+ import logging
21
+ from typing import List, Optional, Union
22
+ import numpy as np
23
+
24
+ import torch
25
+
26
+ from transformers.feature_extraction_utils import BatchFeature
27
+ from transformers.image_utils import ImageInput, is_valid_image
28
+ from transformers.processing_utils import ProcessorMixin
29
+ from transformers.tokenization_utils_base import (
30
+ PaddingStrategy,
31
+ PreTokenizedInput,
32
+ TextInput,
33
+ TruncationStrategy,
34
+ )
35
+ from transformers.utils import TensorType
36
+
37
+
38
+ logger = logging.getLogger(__name__)
39
+
40
+ # Copied from transformers.models.idefics2.processing_idefics2.is_url
41
def is_url(val) -> bool:
    """Return True when `val` is a string that begins with ``"http"``, False otherwise."""
    if not isinstance(val, str):
        return False
    return val.startswith("http")
43
+
44
+ # Copied from transformers.models.idefics2.processing_idefics2.is_image_or_image_url
45
def is_image_or_image_url(elem):
    """Tell whether `elem` is accepted by `is_url` or, failing that, by `is_valid_image`."""
    if is_url(elem):
        return True
    return is_valid_image(elem)
47
+
48
+
49
def _is_str_or_image(elem):
    """Return True for any string; otherwise defer to `is_image_or_image_url`."""
    if isinstance(elem, str):
        return True
    return is_image_or_image_url(elem)
51
+
52
+
53
+ class Florence2Processor(ProcessorMixin):
54
+ r"""
55
+ Constructs a Florence2 processor which wraps a Florence2 image processor and a Florence2 tokenizer into a single processor.
56
+
57
+ [`Florence2Processor`] offers all the functionalities of [`CLIPImageProcessor`] and [`BartTokenizerFast`]. See the
58
+ [`~Florence2Processor.__call__`] and [`~Florence2Processor.decode`] for more information.
59
+
60
+ Args:
61
+ image_processor ([`CLIPImageProcessor`], *optional*):
62
+ The image processor is a required input.
63
+ tokenizer ([`BartTokenizerFast`], *optional*):
64
+ The tokenizer is a required input.
65
+ """
66
+
67
+ attributes = ["image_processor", "tokenizer"]
68
+ image_processor_class = "CLIPImageProcessor"
69
+ tokenizer_class = ("BartTokenizer", "BartTokenizerFast")
70
+
71
def __init__(
    self,
    image_processor=None,
    tokenizer=None,
):
    """
    Validate the two components, register the Florence-2 special tokens on the tokenizer and
    set up the task lookup tables used by the processor.

    Args:
        image_processor: Image processor; must not be `None` and must expose `image_seq_length`.
        tokenizer: Tokenizer; must not be `None`. It is modified in place through
            `add_special_tokens`.

    Raises:
        ValueError: If either component is `None`, or the image processor has no
            `image_seq_length` attribute.
    """
    for component, complaint in (
        (image_processor, "You need to specify an `image_processor`."),
        (tokenizer, "You need to specify a `tokenizer`."),
    ):
        if component is None:
            raise ValueError(complaint)
    if not hasattr(image_processor, "image_seq_length"):
        raise ValueError("Image processor is missing an `image_seq_length` attribute.")

    self.image_seq_length = image_processor.image_seq_length

    # Special tokens appended after whatever the tokenizer already declares.
    existing_tokens = tokenizer.additional_special_tokens
    detection_and_ocr_tokens = ["<od>", "</od>", "<ocr>", "</ocr>"]
    location_tokens = [f"<loc_{index}>" for index in range(1000)]
    remaining_task_tokens = [
        "<cap>", "</cap>",
        "<ncap>", "</ncap>",
        "<dcap>", "</dcap>",
        "<grounding>", "</grounding>",
        "<seg>", "</seg>",
        "<sep>",
        "<region_cap>", "</region_cap>",
        "<region_to_desciption>", "</region_to_desciption>",
        "<proposal>", "</proposal>",
        "<poly>", "</poly>",
        "<and>",
    ]
    tokenizer.add_special_tokens(
        {
            "additional_special_tokens": (
                existing_tokens + detection_and_ocr_tokens + location_tokens + remaining_task_tokens
            )
        }
    )

    # Task token -> name of the post-processing routine applied to the generated answer.
    self.tasks_answer_post_processing_type = {
        "<OCR>": "pure_text",
        "<OCR_WITH_REGION>": "ocr",
        "<CAPTION>": "pure_text",
        "<DETAILED_CAPTION>": "pure_text",
        "<MORE_DETAILED_CAPTION>": "pure_text",
        "<OD>": "description_with_bboxes",
        "<DENSE_REGION_CAPTION>": "description_with_bboxes",
        "<CAPTION_TO_PHRASE_GROUNDING>": "phrase_grounding",
        "<REFERRING_EXPRESSION_SEGMENTATION>": "polygons",
        "<REGION_TO_SEGMENTATION>": "polygons",
        "<OPEN_VOCABULARY_DETECTION>": "description_with_bboxes_or_polygons",
        "<REGION_TO_CATEGORY>": "pure_text",
        "<REGION_TO_DESCRIPTION>": "pure_text",
        "<REGION_TO_OCR>": "pure_text",
        "<REGION_PROPOSAL>": "bboxes",
    }

    # Task token -> fixed prompt (the token must be the whole input text).
    self.task_prompts_without_inputs = {
        "<OCR>": "What is the text in the image?",
        "<OCR_WITH_REGION>": "What is the text in the image, with regions?",
        "<CAPTION>": "What does the image describe?",
        "<DETAILED_CAPTION>": "Describe in detail what is shown in the image.",
        "<MORE_DETAILED_CAPTION>": "Describe with a paragraph what is shown in the image.",
        "<OD>": "Locate the objects with category name in the image.",
        "<DENSE_REGION_CAPTION>": "Locate the objects in the image, with their descriptions.",
        "<REGION_PROPOSAL>": "Locate the region proposals in the image.",
    }

    # Task token -> prompt template; `{input}` receives the text that accompanied the token.
    self.task_prompts_with_input = {
        "<CAPTION_TO_PHRASE_GROUNDING>": "Locate the phrases in the caption: {input}",
        "<REFERRING_EXPRESSION_SEGMENTATION>": "Locate {input} in the image with mask",
        "<REGION_TO_SEGMENTATION>": "What is the polygon mask of region {input}",
        "<OPEN_VOCABULARY_DETECTION>": "Locate {input} in the image.",
        "<REGION_TO_CATEGORY>": "What is the region {input}?",
        "<REGION_TO_DESCRIPTION>": "What does the region {input} describe?",
        "<REGION_TO_OCR>": "What text is in the region {input}?",
    }

    # Mirror a few image-processor settings on the processor itself.
    self.image_mean = image_processor.image_mean
    self.image_std = image_processor.image_std
    self.crop_size = image_processor.crop_size
    self.post_processor = Florence2PostProcesser(tokenizer=tokenizer)

    super().__init__(image_processor, tokenizer)
139
+
140
def _construct_prompts(self, text):
    """
    Expand task tokens found in each entry of `text` into their natural-language prompts.

    For every entry, the first token of `self.task_prompts_without_inputs` contained in it
    replaces the entry with the fixed prompt (the entry must equal the token exactly).
    Afterwards, the first token of `self.task_prompts_with_input` contained in the entry is
    stripped out and the remainder is substituted for `{input}` in the matching template.
    Entries without any task token pass through unchanged.

    Args:
        text: Iterable of prompt strings.

    Returns:
        A list with one resolved prompt per entry of `text`.

    Raises:
        AssertionError: If an entry contains a fixed-prompt task token plus other text.
    """
    resolved = []
    for entry in text:
        # 1. fixed task prompts without additional inputs
        fixed_prompts = self.task_prompts_without_inputs
        task_token = next((token for token in fixed_prompts if token in entry), None)
        if task_token is not None:
            assert entry == task_token, f"Task token {task_token} should be the only token in the text."
            entry = fixed_prompts[task_token]

        # 2. task prompts with additional inputs
        templates = self.task_prompts_with_input
        task_token = next((token for token in templates if token in entry), None)
        if task_token is not None:
            entry = templates[task_token].format(input=entry.replace(task_token, ''))

        resolved.append(entry)
    return resolved
157
+
158
def __call__(
    self,
    text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
    images: ImageInput = None,
    tokenize_newline_separately: bool = True,
    padding: Union[bool, str, PaddingStrategy] = False,
    truncation: Union[bool, str, TruncationStrategy] = None,
    max_length=None,
    return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
    do_resize: bool = None,
    do_normalize: bool = None,
    image_mean: Optional[Union[float, List[float]]] = None,
    image_std: Optional[Union[float, List[float]]] = None,
    data_format: Optional["ChannelDimension"] = "channels_first",  # noqa: F821
    input_data_format: Optional[
        Union[str, "ChannelDimension"]  # noqa: F821
    ] = None,
    resample: "PILImageResampling" = None,  # noqa: F821
    do_convert_rgb: bool = None,
    do_thumbnail: bool = None,
    do_align_long_axis: bool = None,
    do_rescale: bool = None,
) -> BatchFeature:
    """
    Main method to prepare one or several text prompt(s) and image(s) for the model. The images are passed to
    `self.image_processor` and the prompts (after task-token expansion by `_construct_prompts`) to
    `self.tokenizer`; both results are merged into a single [`BatchFeature`].

    Args:
        text (`str`, `List[str]`, *optional*):
            The prompt or batch of prompts. A single string is wrapped into a one-element list. When `None`,
            a warning is logged and an empty prompt is used.
        images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
            The image or batch of images to be prepared. Required.
        tokenize_newline_separately (`bool`, defaults to `True`):
            Accepted for signature compatibility; not used by this method.
        padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
            Padding strategy forwarded to the tokenizer.
        truncation (`bool`, *optional*):
            Truncation strategy forwarded to the tokenizer.
        max_length (`int`, *optional*):
            Maximum total length. `self.image_seq_length` is subtracted before the value is forwarded to the
            tokenizer so that the image tokens are accounted for.
        return_tensors (`str` or [`~utils.TensorType`], *optional*, defaults to `TensorType.PYTORCH`):
            Tensor framework for the returned values; forwarded to both the image processor and the tokenizer.
        do_resize, do_normalize, image_mean, image_std, data_format, input_data_format, resample, do_convert_rgb:
            Forwarded unchanged to the image processor.
        do_thumbnail, do_align_long_axis, do_rescale:
            Accepted for signature compatibility; not used by this method.

    Returns:
        [`BatchFeature`]: the tokenizer outputs plus:

        - **pixel_values** -- Pixel values produced by the image processor.

    Raises:
        ValueError: If `images` is `None`, if fewer images than prompts are supplied (both given as lists),
            or if image preprocessing fails both normally and with the single-channel fallback.
    """
    if images is None:
        raise ValueError("`images` are expected as arguments to a `Florence2Processor` instance.")
    if text is None:
        logger.warning_once(
            "You are using Florence-2 without a text prompt."
        )
        text = ""

    if isinstance(text, list) and isinstance(images, list):
        if len(images) < len(text):
            raise ValueError(
                f"Received {len(images)} images for {len(text)} prompts. Each prompt should be associated with an image."
            )
    # A lone prompt becomes a batch of one; lists are used as they are.
    if _is_str_or_image(text):
        text = [text]

    image_kwargs = {
        "do_resize": do_resize,
        "do_normalize": do_normalize,
        "return_tensors": return_tensors,
        "image_mean": image_mean,
        "image_std": image_std,
        "input_data_format": input_data_format,
        "data_format": data_format,
        "resample": resample,
        "do_convert_rgb": do_convert_rgb,
    }

    def _single_channel_stat(explicit, stored):
        # Collapse a per-channel statistic to one scalar (its second entry, as the
        # original fallback did). When the caller gave no value, use the one copied
        # from the image processor in `__init__`, if this instance has it.
        value = stored if explicit is None else explicit
        if isinstance(value, (list, tuple)) and len(value) > 1:
            return value[1]
        return value

    try:
        pixel_values = self.image_processor(images, **image_kwargs)["pixel_values"]
    except ValueError as first_error:
        # Retry with scalar normalisation statistics; the original code labels this
        # path "grayscale processing".
        fallback_kwargs = dict(image_kwargs)
        fallback_kwargs["image_mean"] = _single_channel_stat(image_mean, getattr(self, "image_mean", None))
        fallback_kwargs["image_std"] = _single_channel_stat(image_std, getattr(self, "image_std", None))
        try:
            pixel_values = self.image_processor(images, **fallback_kwargs)["pixel_values"]
        except ValueError as fallback_error:
            # Surface the failure instead of continuing without `pixel_values`.
            raise ValueError(
                f"Image preprocessing failed ({first_error}); "
                f"grayscale processing failed: {fallback_error}"
            ) from fallback_error
        logger.info("Grayscale processing succeeded")

    if max_length is not None:
        max_length -= self.image_seq_length  # max_length has to account for the image tokens

    text = self._construct_prompts(text)

    inputs = self.tokenizer(
        text,
        return_tensors=return_tensors,
        padding=padding,
        max_length=max_length,
        truncation=truncation,
        return_token_type_ids=False,
    )

    return BatchFeature(data={**inputs, "pixel_values": pixel_values})
319
+
320
+ # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Florence2
321
def batch_decode(self, *args, **kwargs):
    """
    Decode a batch of token-id sequences with the wrapped tokenizer.

    All positional and keyword arguments are handed to
    `self.tokenizer.batch_decode` untouched, and whatever it returns is
    returned unchanged. See [`~PreTrainedTokenizer.batch_decode`] for the
    accepted arguments.
    """
    tokenizer_batch_decode = self.tokenizer.batch_decode
    return tokenizer_batch_decode(*args, **kwargs)
327
+
328
+ # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Florence2
329
def decode(self, *args, **kwargs):
    """
    Decode a single token-id sequence with the wrapped tokenizer.

    All positional and keyword arguments are handed to
    `self.tokenizer.decode` untouched, and whatever it returns is returned
    unchanged. See [`~PreTrainedTokenizer.decode`] for the accepted
    arguments.
    """
    tokenizer_decode = self.tokenizer.decode
    return tokenizer_decode(*args, **kwargs)
335
+
336
@property
# Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names with CLIP->Florence2
def model_input_names(self):
    """
    Names of the model inputs produced by this processor.

    Returns the tokenizer's input names followed by the image processor's
    input names, with duplicates removed and first-seen order preserved.
    """
    merged_names = self.tokenizer.model_input_names + self.image_processor.model_input_names
    # A dict keeps insertion order, so it de-duplicates without reordering.
    unique_names = {}
    for input_name in merged_names:
        unique_names.setdefault(input_name, None)
    return list(unique_names)
342
+
343
def post_process_generation(self, text, task, image_size):
    """
    Convert generated text into the structured answer for one task.

    The task is mapped to a post-processing type through
    `self.tasks_answer_post_processing_type` (unknown tasks fall back to
    `'pure_text'`), `self.post_processor` is asked to parse only that type,
    and the parsed result is reshaped into a flat answer.

    Args:
        text (`str`): The text to post-process.
        task (`str`): The task to post-process the text for. It is also the
            single key of the returned dict.
        image_size: Forwarded unchanged to `self.post_processor`.
            NOTE(review): the post-processor defined in this file unpacks
            sizes as (width, height) -- confirm callers pass that order.

    Returns:
        `dict`: `{task: answer}`. `answer` is a string for `'pure_text'`
        (with `<s>` / `</s>` removed) and a dict of parallel lists for the
        other types (`bboxes`/`labels`, `quad_boxes`/`labels`,
        `polygons`/`labels`, or the four-list bbox-or-polygon form).

    Raises:
        ValueError: If the resolved post-processing type has no reshaping
            rule here.
    """
    answer_type = self.tasks_answer_post_processing_type.get(task, 'pure_text')
    parsed = self.post_processor(
        text=text,
        image_size=image_size,
        parse_tasks=answer_type,
    )[answer_type]

    if answer_type == 'pure_text':
        # remove the special tokens
        answer = parsed.replace('<s>', '').replace('</s>', '')
    elif answer_type in ('od', 'description_with_bboxes', 'bboxes'):
        answer = {
            'bboxes': [item['bbox'] for item in parsed],
            'labels': [str(item['cat_name']) for item in parsed],
        }
    elif answer_type == 'ocr':
        answer = {
            'quad_boxes': [item['quad_box'] for item in parsed],
            'labels': [str(item['text']) for item in parsed],
        }
    elif answer_type == 'phrase_grounding':
        # A grounded phrase may own several boxes; repeat its label for each.
        box_label_pairs = [
            (box, grounded['cat_name'])
            for grounded in parsed
            for box in grounded['bbox']
        ]
        answer = {
            'bboxes': [box for box, _ in box_label_pairs],
            'labels': [label for _, label in box_label_pairs],
        }
    elif answer_type in ('description_with_polygons', 'polygons'):
        names = []
        shapes = []
        for item in parsed:
            name = item['cat_name']
            shape = item['polygons']
            names.append(name)
            shapes.append(shape)
        answer = {'polygons': shapes, 'labels': names}
    elif answer_type == 'description_with_bboxes_or_polygons':
        boxes, box_names = [], []
        shapes, shape_names = [], []
        for item in parsed:
            name = item['cat_name']
            if 'polygons' in item:
                shapes.append(item['polygons'])
                shape_names.append(name)
            else:
                boxes.append(item['bbox'])
                box_names.append(name)
        answer = {
            'bboxes': boxes,
            'bboxes_labels': box_names,
            'polygons': shapes,
            'polygons_labels': shape_names,
        }
    else:
        raise ValueError('Unknown task answer post processing type: {}'.format(answer_type))

    return {task: answer}
412
+
413
class BoxQuantizer(object):
    """
    Convert boxes between image coordinates and discrete bin indices.

    Boxes are tensors whose last dimension holds (xmin, ymin, xmax, ymax).
    `bins` is a (width_bins, height_bins) pair. Only the `'floor'` mode is
    implemented: `'round'` raises `NotImplementedError` and any other mode
    raises `ValueError`.
    """

    def __init__(self, mode, bins):
        self.mode = mode
        self.bins = bins

    def _ensure_supported_mode(self):
        # Shared guard: 'floor' is the only quantization scheme implemented.
        if self.mode == 'floor':
            return
        if self.mode == 'round':
            raise NotImplementedError()
        raise ValueError('Incorrect quantization type.')

    def quantize(self, boxes: torch.Tensor, size):
        """
        Map boxes from image coordinates to integer bin indices.

        Args:
            boxes: Tensor whose last dimension is (xmin, ymin, xmax, ymax).
            size: (width, height) of the image the boxes refer to.

        Returns:
            Int tensor of the same shape; x indices are clamped to
            [0, width_bins - 1] and y indices to [0, height_bins - 1].
        """
        num_bins_w, num_bins_h = self.bins
        image_w, image_h = size
        step_w = image_w / num_bins_w
        step_h = image_h / num_bins_h
        corners = boxes.split(1, dim=-1)  # four tensors, each [..., 1]
        xmin, ymin, xmax, ymax = corners

        self._ensure_supported_mode()

        steps = (step_w, step_h, step_w, step_h)
        upper_bounds = (num_bins_w - 1, num_bins_h - 1, num_bins_w - 1, num_bins_h - 1)
        quantized = [
            (corner / step).floor().clamp(0, upper)
            for corner, step, upper in zip((xmin, ymin, xmax, ymax), steps, upper_bounds)
        ]
        return torch.cat(quantized, dim=-1).int()

    def dequantize(self, boxes: torch.Tensor, size):
        """
        Map bin indices back to image coordinates.

        Each index is converted to the centre of its bin, i.e.
        `(index + 0.5) * bin_size`.

        Args:
            boxes: Tensor of bin indices whose last dimension is
                (xmin, ymin, xmax, ymax).
            size: (width, height) of the target image.

        Returns:
            Tensor of the same shape holding image coordinates.
        """
        num_bins_w, num_bins_h = self.bins
        image_w, image_h = size
        step_w = image_w / num_bins_w
        step_h = image_h / num_bins_h
        xmin, ymin, xmax, ymax = boxes.split(1, dim=-1)  # four tensors, each [..., 1]

        self._ensure_supported_mode()

        # Add 0.5 so the coordinate is the centre of the bin, not its edge.
        centred = [
            (index + 0.5) * step
            for index, step in zip((xmin, ymin, xmax, ymax), (step_w, step_h, step_w, step_h))
        ]
        return torch.cat(centred, dim=-1)
473
+
474
+
475
class CoordinatesQuantizer(object):
    """
    Convert (x, y) points between image coordinates and discrete bin indices.

    Points are tensors whose last dimension has size 2. `bins` is a
    (width_bins, height_bins) pair. Only the `'floor'` mode is implemented:
    `'round'` raises `NotImplementedError` and any other mode raises
    `ValueError`.
    """

    def __init__(self, mode, bins):
        self.mode = mode
        self.bins = bins

    def _ensure_supported_mode(self):
        # Shared guard: 'floor' is the only quantization scheme implemented.
        if self.mode == 'floor':
            return
        if self.mode == 'round':
            raise NotImplementedError()
        raise ValueError('Incorrect quantization type.')

    def quantize(self, coordinates: torch.Tensor, size):
        """
        Map points from image coordinates to integer bin indices.

        Args:
            coordinates: Tensor whose last dimension is (x, y).
            size: (width, height) of the image the points refer to.

        Returns:
            Int tensor of the same shape; x indices are clamped to
            [0, width_bins - 1] and y indices to [0, height_bins - 1].

        Raises:
            AssertionError: If the last dimension of `coordinates` is not 2.
        """
        num_bins_w, num_bins_h = self.bins
        image_w, image_h = size
        step_w = image_w / num_bins_w
        step_h = image_h / num_bins_h
        assert coordinates.shape[-1] == 2, 'coordinates should be shape (N, 2)'
        xs, ys = coordinates.split(1, dim=-1)  # two tensors, each [..., 1]

        self._ensure_supported_mode()

        bin_x = (xs / step_w).floor().clamp(0, num_bins_w - 1)
        bin_y = (ys / step_h).floor().clamp(0, num_bins_h - 1)
        return torch.cat((bin_x, bin_y), dim=-1).int()

    def dequantize(self, coordinates: torch.Tensor, size):
        """
        Map bin indices back to image coordinates.

        Each index is converted to the centre of its bin, i.e.
        `(index + 0.5) * bin_size`.

        Args:
            coordinates: Tensor of bin indices whose last dimension is (x, y).
            size: (width, height) of the target image.

        Returns:
            Tensor of the same shape holding image coordinates.

        Raises:
            AssertionError: If the last dimension of `coordinates` is not 2.
        """
        num_bins_w, num_bins_h = self.bins
        image_w, image_h = size
        step_w = image_w / num_bins_w
        step_h = image_h / num_bins_h
        assert coordinates.shape[-1] == 2, 'coordinates should be shape (N, 2)'
        xs, ys = coordinates.split(1, dim=-1)  # two tensors, each [..., 1]

        self._ensure_supported_mode()

        # Add 0.5 so the coordinate is the centre of the bin, not its edge.
        centre_x = (xs + 0.5) * step_w
        centre_y = (ys + 0.5) * step_h
        return torch.cat((centre_x, centre_y), dim=-1)
532
+
533
+
534
+ class Florence2PostProcesser(object):
535
+ """
536
+ Florence-2 post process for converting text prediction to various tasks results.
537
+
538
+ Args:
539
+ config: A dict of configs.
540
+ tokenizer: A tokenizer for decoding text to spans.
541
+ sample config:
542
+ UNIFIED_POST_PROCESS:
543
+ # common configs
544
+ NUM_BBOX_HEIGHT_BINS: 1000
545
+ NUM_BBOX_WIDTH_BINS: 1000
546
+ COORDINATES_HEIGHT_BINS: 1000
547
+ COORDINATES_WIDTH_BINS: 1000
548
+ # task specific configs, override the common configs
549
+ PARSE_TASKS:
550
+ - TASK_NAME: 'video_dense_caption'
551
+ PATTERN: 'r<time_(\d+)><time_(\d+)>([a-zA-Z0-9 ]+)'
552
+ SCORE_MODE: 'avg_cat_name_scores'
553
+ NUM_BINS: 100
554
+ - TASK_NAME: 'od'
555
+ PATTERN: 'r<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>([a-zA-Z0-9 ]+)'
556
+ SCORE_MODE: 'avg_cat_name_scores'
557
+
558
+ Returns:
559
+ parsed_dict (dict): A dict of parsed results.
560
+ """
561
def __init__(
    self,
    tokenizer=None
):
    """
    Build the post-processor from the built-in default configuration.

    Stores the default config, the ordered list of task names found under
    its `'PARSE_TASKS'` entry, and a name -> task-config lookup. Then it
    creates the quantizers and the phrase-grounding black list.

    Args:
        tokenizer: Optional tokenizer. When given, its `all_special_tokens`
            are cached as a set in `self.all_special_tokens`; when `None`
            that attribute is not created.
    """
    default_config = self._create_default_config()
    task_entries = default_config['PARSE_TASKS']

    self.config = default_config
    self.parse_tasks = [entry['TASK_NAME'] for entry in task_entries]
    self.parse_tasks_configs = {entry['TASK_NAME']: entry for entry in task_entries}

    self.tokenizer = tokenizer
    if tokenizer is not None:
        self.all_special_tokens = set(tokenizer.all_special_tokens)

    self.init_quantizers()
    self.black_list_of_phrase_grounding = self._create_black_list_of_phrase_grounding()
582
+
583
def _create_black_list_of_phrase_grounding(self):
    """
    Build the collection of phrases that phrase grounding should ignore.

    Returns:
        A set of pronouns, determiners and generic image words when the
        `'phrase_grounding'` task is registered and its
        `'FILTER_BY_BLACK_LIST'` option is truthy; otherwise an empty dict
        (membership tests against it are always false).
    """
    filtering_enabled = (
        'phrase_grounding' in self.parse_tasks
        and self.parse_tasks_configs['phrase_grounding']['FILTER_BY_BLACK_LIST']
    )
    if not filtering_enabled:
        return {}

    personal_and_possessive = (
        'it', 'I', 'me', 'mine',
        'you', 'your', 'yours',
        'he', 'him', 'his',
        'she', 'her', 'hers',
        'they', 'them', 'their', 'theirs',
        'one', 'oneself',
        'we', 'us', 'our', 'ours',
        'its',
    )
    reflexive = (
        'myself', 'yourself', 'himself', 'herself', 'itself',
        'ourselves', 'yourselves', 'themselves',
    )
    demonstrative_and_relative = (
        'this', 'that',
        'these', 'those',
        'who', 'whom', 'whose', 'which', 'what',
    )
    indefinite = (
        'all', 'another', 'any', 'anybody', 'anyone', 'anything',
        'each', 'everybody', 'everyone', 'everything',
        'few', 'many', 'nobody', 'none', 'several',
        'some', 'somebody', 'someone', 'something',
        'each other', 'one another',
    )
    generic_image_words = (
        'the image', 'image', 'images', 'the', 'a', 'an', 'a group',
        'other objects', 'lots', 'a set',
    )

    return set(
        personal_and_possessive
        + reflexive
        + demonstrative_and_relative
        + indefinite
        + generic_image_words
    )
618
+
619
def _create_default_config(self):
    """
    Return the built-in post-processing configuration.

    The dict holds the number of quantization bins and the quantization
    mode for boxes and for coordinates, plus a `'PARSE_TASKS'` list with one
    entry per supported task. Every entry has a `'TASK_NAME'`; some add
    task-specific options (`'PATTERN'`, `'AREA_THRESHOLD'`,
    `'FILTER_BY_BLACK_LIST'`).

    Returns:
        dict: A freshly built configuration (a new object on every call).
    """
    # Shared regex fragment for a single location token such as "<loc_123>".
    loc = r'<loc_(\d+)>'

    config = {
        'NUM_BBOX_HEIGHT_BINS': 1000,
        'NUM_BBOX_WIDTH_BINS': 1000,
        'BOX_QUANTIZATION_MODE': 'floor',
        'COORDINATES_HEIGHT_BINS': 1000,
        'COORDINATES_WIDTH_BINS': 1000,
        'COORDINATES_QUANTIZATION_MODE': 'floor',
        'PARSE_TASKS': [
            {
                'TASK_NAME': 'od',
                # Category name followed by four location tokens. The
                # pattern previously used `\\d` inside a raw string, which
                # asks for a literal backslash and so never matched.
                'PATTERN': r'([a-zA-Z0-9 ]+)' + loc * 4
            },
            {
                'TASK_NAME': 'ocr',
                # Recognized text followed by the eight tokens of a quad box.
                'PATTERN': r'(.+?)' + loc * 8,
                'AREA_THRESHOLD': 0.01
            },
            {
                'TASK_NAME': 'phrase_grounding',
                'FILTER_BY_BLACK_LIST': True
            },
            {
                'TASK_NAME': 'pure_text',
            },
            {
                'TASK_NAME': 'description_with_bboxes',
            },
            {
                'TASK_NAME': 'description_with_polygons',
            },
            {
                'TASK_NAME': 'polygons',
            },
            {
                'TASK_NAME': 'bboxes',
            },
            {
                'TASK_NAME': 'description_with_bboxes_or_polygons',
            }
        ]
    }

    return config
663
+
664
def init_quantizers(self):
    """
    Create `self.box_quantizer` and `self.coordinates_quantizer` from `self.config`.

    The box quantizer (od, grounding) reads the `NUM_BBOX_*_BINS` and
    `BOX_QUANTIZATION_MODE` entries, defaulting to 1000 bins and `'floor'`.
    The coordinates quantizer (ocr, referring segmentation) reads the
    `COORDINATES_*` entries and falls back to the box settings for any
    entry that is absent.
    """
    cfg = self.config

    box_bins_h = cfg.get('NUM_BBOX_HEIGHT_BINS', 1000)
    box_bins_w = cfg.get('NUM_BBOX_WIDTH_BINS', 1000)
    box_mode = cfg.get('BOX_QUANTIZATION_MODE', 'floor')
    self.box_quantizer = BoxQuantizer(box_mode, (box_bins_w, box_bins_h))

    # Coordinate settings win when present; otherwise reuse the box settings.
    coord_bins_h = cfg.get('COORDINATES_HEIGHT_BINS', cfg.get('NUM_BBOX_HEIGHT_BINS', 1000))
    coord_bins_w = cfg.get('COORDINATES_WIDTH_BINS', cfg.get('NUM_BBOX_WIDTH_BINS', 1000))
    coord_mode = cfg.get('COORDINATES_QUANTIZATION_MODE', cfg.get('BOX_QUANTIZATION_MODE', 'floor'))
    self.coordinates_quantizer = CoordinatesQuantizer(coord_mode, (coord_bins_w, coord_bins_h))
681
+
682
def decode_with_spans(self, tokenizer, token_ids):
    """
    Decode token ids to text and record the character span of every token.

    Args:
        tokenizer: Tokenizer used to turn ids into tokens. Tokens that are
            not in `self.all_special_tokens` are only supported for Bart
            and T5 tokenizers (slow or fast).
        token_ids: Sequence of token ids to decode.

    Returns:
        tuple: `(text, spans)`, where `spans[i]` is the half-open
        `(start, end)` character range of token `i` inside `text`.

    Raises:
        ValueError: If a non-special token is met and the tokenizer is of
            an unsupported type.
    """
    tokens = tokenizer.convert_ids_to_tokens(
        token_ids, skip_special_tokens=False)
    assert len(tokens) == len(token_ids)

    # To avoid mixing byte-level and unicode for byte-level BPE, the string
    # is built separately for added tokens and byte-level tokens.
    # cf. https://github.com/huggingface/transformers/issues/1133
    pieces = []
    for token in tokens:
        if token in self.all_special_tokens:
            pieces.append(token)
            continue
        if isinstance(tokenizer, (BartTokenizer, BartTokenizerFast)):
            piece = tokenizer.convert_tokens_to_string([token])
        elif isinstance(tokenizer, (T5Tokenizer, T5TokenizerFast)):
            # Ref: https://github.com/google/sentencepiece#whitespace-is-treated-as-a-basic-symbol
            # Note: do not strip the piece, it may carry functional whitespace.
            piece = token.replace('▁', ' ')
        else:
            raise ValueError(f'type {type(tokenizer)} not supported')
        pieces.append(piece)

    spans = []
    cursor = 0
    for piece in pieces:
        stop = cursor + len(piece)
        spans.append((cursor, stop))  # [start index, end index).
        cursor = stop
    text = ''.join(pieces)

    # Text format:
    # 1. T5Tokenizer/T5TokenizerFast:
    #      "<loc_1><loc_2><loc_3><loc_4> transplanting dog<loc_1><loc_2><loc_3><loc_4> cat</s>"
    # 2. BartTokenizer (need to double check):
    #      "<s><loc_1><loc_2><loc_3><loc_4>transplanting dog<loc_1><loc_2><loc_3><loc_4>cat</s>"
    # Both are meant to equal tokenizer.decode(input_ids, skip_special_tokens=False,
    # clean_up_tokenization_spaces=False, spaces_between_special_tokens=False).
    return text, spans
720
+
721
def parse_od_from_text_and_spans(
    self,
    text,
    pattern,
    image_size,
    phrase_centric=False
):
    """
    Extract object-detection instances from generated text.

    Args:
        text: Generated text to scan.
        pattern: Regex with five groups: a category name and four location
            bins.
        image_size: Passed to `self.box_quantizer.dequantize` as `size`.
        phrase_centric: When true, group 1 is the category name and groups
            2-5 are the box bins; otherwise groups 1-4 are the box bins and
            group 5 is the category name.

    Returns:
        list[dict]: One `{'bbox': ..., 'cat_name': ...}` per regex match, in
        match order. The name is lower-cased and stripped.
    """
    if phrase_centric:
        name_group, first_bin_group = 1, 2
    else:
        name_group, first_bin_group = 5, 1
    bin_groups = range(first_bin_group, first_bin_group + 4)

    detections = []
    for found in list(re.finditer(pattern, text)):
        bins = [int(found.group(index)) for index in bin_groups]
        box = self.box_quantizer.dequantize(
            boxes=torch.tensor(bins),
            size=image_size
        ).tolist()
        detections.append({
            'bbox': box,
            'cat_name': found.group(name_group).lower().strip(),
        })

    return detections
751
+
752
def parse_ocr_from_text_and_spans(self,
                                  text,
                                  pattern,
                                  image_size,
                                  area_threshold=-1.0,
                                  ):
    """
    Extract OCR regions (text plus quadrilateral) from generated text.

    Args:
        text: Generated text. Every `<s>` is removed before matching.
        pattern: Regex whose first group is the recognized text and whose
            remaining groups are the location bins of the quad box
            (x1, y1, x2, y2, ...).
        image_size: (width, height). Used to dequantize the bins and to
            turn `area_threshold` into an absolute area.
        area_threshold: When positive, regions whose polygon area is below
            `width * height * area_threshold` are dropped. Non-positive
            values disable the filter.

    Returns:
        list[dict]: One `{'quad_box': [x1, y1, ...], 'text': str}` per kept
        region, in match order.
    """
    text = text.replace('<s>', '')
    image_width, image_height = image_size

    instances = []
    for ocr_content, *quad_bins in re.findall(pattern, text):
        quad_box = self.coordinates_quantizer.dequantize(
            torch.tensor(np.array([int(value) for value in quad_bins]).reshape(-1, 2)),
            size=image_size
        ).reshape(-1).tolist()

        if area_threshold > 0:
            x_coords = quad_box[0::2]
            y_coords = quad_box[1::2]
            num_points = len(x_coords)

            # Shoelace formula. The sum must include the closing edge from
            # the last vertex back to the first, hence the modulo; without
            # it the area is wrong for any quad not anchored at the origin.
            twice_signed_area = sum(
                x_coords[i] * y_coords[(i + 1) % num_points]
                - x_coords[(i + 1) % num_points] * y_coords[i]
                for i in range(num_points)
            )
            area = 0.5 * abs(twice_signed_area)

            if area < (image_width * image_height) * area_threshold:
                continue

        instances.append({
            'quad_box': quad_box,
            'text': ocr_content,
        })
    return instances
792
+
793
def parse_phrase_grounding_from_text_and_spans(self, text, pattern, image_size):
    """
    Extract grounded phrases and their boxes from generated text.

    `<s>`, `</s>` and `<pad>` are removed, then every run of non-`<`
    characters followed by at least four `<loc_N>` tokens is treated as one
    phrase with one or more boxes (four tokens per box).

    Args:
        text: Generated text.
        pattern: Accepted for interface compatibility; the method always
            uses its own built-in patterns.
        image_size: Passed to `self.box_quantizer.dequantize` as `size`.

    Returns:
        list[dict]: One `{'bbox': [[x1, y1, x2, y2], ...], 'cat_name': str}`
        per kept phrase. Phrases found in
        `self.black_list_of_phrase_grounding` are skipped, and non-ASCII
        characters are removed from the name.
    """
    # ignore <s> </s> and <pad>
    for special_token in ('<s>', '</s>', '<pad>'):
        text = text.replace(special_token, '')

    phrase_with_boxes_pattern = r"([^<]+(?:<loc_\d+>){4,})"
    # Phrase text is whatever precedes the first box-like tag.
    phrase_name_pattern = r'^\s*(.*?)(?=<od>|</od>|<box>|</box>|<bbox>|</bbox>|<loc_)'
    box_pattern = r'<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>'

    instances = []
    for phrase_text in re.findall(phrase_with_boxes_pattern, text):
        # Strip both marker tokens. The second replace used to be applied
        # to the raw text, silently discarding the first one.
        phrase_text_strip = phrase_text.replace('<ground>', '', 1).replace('<obj>', '', 1)
        if phrase_text_strip == '':
            continue

        # parse phrase, get string
        name_match = re.search(phrase_name_pattern, phrase_text_strip)
        if name_match is None:
            continue

        # parse bboxes by box_pattern
        boxes_parsed = list(re.finditer(box_pattern, phrase_text))
        if not boxes_parsed:
            continue

        phrase = name_match.group().strip()
        if phrase in self.black_list_of_phrase_grounding:
            continue

        # a list of [x1, y1, x2, y2] bin lists, one per box of this phrase
        bbox_bins = [[int(box.group(j)) for j in range(1, 5)] for box in boxes_parsed]
        instances.append({
            'bbox': self.box_quantizer.dequantize(
                boxes=torch.tensor(bbox_bins),
                size=image_size
            ).tolist(),
            # exclude non-ascii characters
            'cat_name': phrase.encode('ascii', errors='ignore').decode('ascii'),
        })

    return instances
856
+
857
def parse_description_with_bboxes_from_text_and_spans(self, text, pattern, image_size, allow_empty_phrase=False):
    """
    Extract (description, box) instances from generated text.

    `<s>`, `</s>` and `<pad>` are removed first. Without
    `allow_empty_phrase`, each run of non-`<` characters followed by at
    least four `<loc_N>` tokens is a described group of boxes. With it,
    only the `<loc_N>` runs are matched and the description is empty.

    Args:
        text: Generated text.
        pattern: Accepted for interface compatibility; the method always
            uses its own built-in patterns.
        image_size: Passed to `self.box_quantizer.dequantize` as `size`.
        allow_empty_phrase: Match bare boxes with no preceding description.

    Returns:
        list[dict]: One `{'bbox': [x1, y1, x2, y2], 'cat_name': str}` per
        box (a phrase with several boxes yields several instances).
        Non-ASCII characters are removed from the name.
    """
    # temporary parse solution: ignore <s> </s> and <pad>
    for special_token in ('<s>', '</s>', '<pad>'):
        text = text.replace(special_token, '')

    if allow_empty_phrase:
        group_pattern = r"(?:(?:<loc_\d+>){4,})"
    else:
        group_pattern = r"([^<]+(?:<loc_\d+>){4,})"

    # Description text is whatever precedes the first box-like tag.
    phrase_name_pattern = r'^\s*(.*?)(?=<od>|</od>|<box>|</box>|<bbox>|</bbox>|<loc_)'
    box_pattern = r'<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>'

    instances = []
    for phrase_text in re.findall(group_pattern, text):
        # Strip both marker tokens. The second replace used to be applied
        # to the raw text, silently discarding the first one.
        phrase_text_strip = phrase_text.replace('<ground>', '', 1).replace('<obj>', '', 1)
        if phrase_text_strip == '' and not allow_empty_phrase:
            continue

        # parse phrase, get string
        name_match = re.search(phrase_name_pattern, phrase_text_strip)
        if name_match is None:
            continue
        phrase = name_match.group().strip()

        # parse bboxes by box_pattern
        boxes_parsed = list(re.finditer(box_pattern, phrase_text))
        if not boxes_parsed:
            continue

        # a list of [x1, y1, x2, y2] bin lists
        bbox_bins = [[int(box.group(j)) for j in range(1, 5)] for box in boxes_parsed]
        bboxes = self.box_quantizer.dequantize(
            boxes=torch.tensor(bbox_bins),
            size=image_size
        ).tolist()

        # exclude non-ascii characters
        phrase = phrase.encode('ascii', errors='ignore').decode('ascii')
        instances.extend({'bbox': bbox, 'cat_name': phrase} for bbox in bboxes)

    return instances
915
+
916
def parse_description_with_polygons_from_text_and_spans(self, text, pattern, image_size,
                                                        allow_empty_phrase=False,
                                                        polygon_sep_token='<sep>',
                                                        polygon_start_token='<poly>',
                                                        polygon_end_token='</poly>',
                                                        with_box_at_start=False,
                                                        ):
    """
    Extract (description, polygons) instances from generated text.

    Expected layout (ref_seg format): a description followed by `<loc_N>`
    tokens read as x/y pairs. `polygon_sep_token` separates polygons of one
    instance; `polygon_start_token` / `polygon_end_token` optionally wrap
    each instance. `<s>`, `</s>` and `<pad>` are removed first.

    Args:
        text: Generated text.
        pattern: Accepted for interface compatibility; the method always
            builds its own patterns.
        image_size: Passed to the quantizers' `dequantize` as `size`.
        allow_empty_phrase: Match polygon token runs with no preceding
            description.
        polygon_sep_token: Token separating polygons within one instance.
        polygon_start_token: Token opening one instance.
        polygon_end_token: Token closing one instance.
        with_box_at_start: Treat the first four values of the first polygon
            of an instance as its bounding box.

    Returns:
        list[dict]: One `{'cat_name': str, 'polygons': [[x1, y1, ...], ...]}`
        per instance, plus a `'bbox'` entry when `with_box_at_start` is set.
    """
    # ignore <s> </s> and <pad>
    for special_token in ('<s>', '</s>', '<pad>'):
        text = text.replace(special_token, '')

    sep = re.escape(polygon_sep_token)
    start = re.escape(polygon_start_token)
    end = re.escape(polygon_end_token)

    # At least four consecutive location / separator / start / end tokens.
    token_run = rf"(?:<loc_\d+>|{sep}|{start}|{end}){{4,}}"
    if allow_empty_phrase:
        pattern = rf"(?:{token_run})"
    else:
        # [^<]+ is the description: one or more characters other than '<'.
        pattern = rf"([^<]+{token_run})"

    phrase_string_pattern = r'^\s*(.*?)(?=<od>|</od>|<box>|</box>|<bbox>|</bbox>|<loc_|<poly>)'
    # One polygon: a run of location tokens ended by the separator or end of string.
    box_pattern = rf'((?:<loc_\d+>)+)(?:{sep}|$)'
    # One instance: everything between the start and end tokens.
    polygons_instance_pattern = rf'{start}(.*?){end}'

    instances = []
    for phrase_text in re.findall(pattern, text):
        # Drop a leading partial "loc_N>" left over at the start of the match.
        phrase_text_strip = re.sub(r'^loc_\d+>', '', phrase_text, count=1)
        if phrase_text_strip == '' and not allow_empty_phrase:
            continue

        # parse phrase, get string
        name_match = re.search(phrase_string_pattern, phrase_text_strip)
        if name_match is None:
            continue
        phrase = name_match.group().strip()

        # Split into instances on the start/end tokens when both are
        # present; otherwise the whole phrase is a single instance.
        if polygon_start_token in phrase_text and polygon_end_token in phrase_text:
            instance_texts = [
                found.group(1)
                for found in re.finditer(polygons_instance_pattern, phrase_text)
            ]
        else:
            instance_texts = [phrase_text]

        for instance_text in instance_texts:
            polygons_parsed = list(re.finditer(box_pattern, instance_text))
            if not polygons_parsed:
                continue

            bbox = []
            polygons = []
            for polygon_match in polygons_parsed:
                # group 1 is the whole run of <loc_N> tokens; parse it to ints
                values = [
                    int(loc.group(1))
                    for loc in re.finditer(r'<loc_(\d+)>', polygon_match.group(1))
                ]
                if with_box_at_start and len(bbox) == 0:
                    if len(values) > 4:
                        bbox = values[:4]
                        values = values[4:]
                    else:
                        # no valid bbox prediction
                        bbox = [0, 0, 0, 0]
                # abandon the last element if it is not paired
                if len(values) % 2 == 1:
                    values = values[:-1]

                # reshape into (n, 2), dequantize, then flatten back
                polygon = self.coordinates_quantizer.dequantize(
                    torch.tensor(np.array(values).reshape(-1, 2)),
                    size=image_size
                ).reshape(-1).tolist()
                polygons.append(polygon)

            instance = {'cat_name': phrase, 'polygons': polygons}
            if len(bbox) != 0:
                instance['bbox'] = self.box_quantizer.dequantize(
                    boxes=torch.tensor([bbox]),
                    size=image_size
                ).tolist()[0]

            instances.append(instance)

    return instances
1026
+
1027
def __call__(
    self,
    text=None,
    image_size=None,
    parse_tasks=None,
):
    """
    Parse generated text for one or more post-processing tasks.

    Args:
        text: model outputs
        image_size: (width, height)
        parse_tasks: a task name or a list of task names to parse; if None,
            parse all tasks registered in `self.parse_tasks`.

    Returns:
        dict: `{'text': text, <task>: <parsed result>, ...}` with one entry
        per parsed task, in the order of `self.parse_tasks`.

    Raises:
        AssertionError: If a requested task is not registered or `text` is
            None.
        ValueError: If a registered task has no parser here.
    """
    if parse_tasks is not None:
        if isinstance(parse_tasks, str):
            parse_tasks = [parse_tasks]
        for _parse_task in parse_tasks:
            assert _parse_task in self.parse_tasks, f'parse task {_parse_task} not supported'

    # sequence or text should be provided
    assert text is not None, 'text should be provided'

    parsed_dict = {
        'text': text
    }

    for task in self.parse_tasks:
        if parse_tasks is not None and task not in parse_tasks:
            continue

        task_config = self.parse_tasks_configs[task]
        pattern = task_config.get('PATTERN', None)

        if task == 'od':
            # 'od' is a registered task but previously had no branch, so
            # requesting it (or parsing all tasks) always raised ValueError.
            # The default od PATTERN puts the category name first, which is
            # the phrase-centric group layout.
            instances = self.parse_od_from_text_and_spans(
                text,
                pattern=pattern,
                image_size=image_size,
                phrase_centric=True,
            )
        elif task == 'ocr':
            instances = self.parse_ocr_from_text_and_spans(
                text,
                pattern=pattern,
                image_size=image_size,
                area_threshold=task_config.get('AREA_THRESHOLD', 0.01),
            )
        elif task == 'phrase_grounding':
            instances = self.parse_phrase_grounding_from_text_and_spans(
                text,
                pattern=pattern,
                image_size=image_size,
            )
        elif task == 'pure_text':
            instances = text
        elif task in ('description_with_bboxes', 'bboxes'):
            # 'bboxes' is the same parser without requiring a description.
            instances = self.parse_description_with_bboxes_from_text_and_spans(
                text,
                pattern=pattern,
                image_size=image_size,
                allow_empty_phrase=(task == 'bboxes'),
            )
        elif task in ('description_with_polygons', 'polygons'):
            # 'polygons' is the same parser without requiring a description.
            instances = self.parse_description_with_polygons_from_text_and_spans(
                text,
                pattern=pattern,
                image_size=image_size,
                allow_empty_phrase=(task == 'polygons'),
            )
        elif task == 'description_with_bboxes_or_polygons':
            # only support either polygons or bboxes, not both at the same time
            if '<poly>' in text:
                instances = self.parse_description_with_polygons_from_text_and_spans(
                    text,
                    pattern=pattern,
                    image_size=image_size,
                )
            else:
                instances = self.parse_description_with_bboxes_from_text_and_spans(
                    text,
                    pattern=pattern,
                    image_size=image_size,
                )
        else:
            raise ValueError("task {} is not supported".format(task))

        parsed_dict[task] = instances

    return parsed_dict
processor_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_florence2.Florence2Processor"
4
+ },
5
+ "processor_class": "Florence2Processor"
6
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d63827e77ae378597498dc090baf676bc39771b4ca28a8804c312452cb3f146
3
+ size 1645700830
special_tokens_map.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
vocab.json ADDED
The diff for this file is too large to render. See raw diff