pcuenq HF Staff committed on
Commit 46fbb0d · verified · 1 Parent(s): 0077181

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -1,35 +1,3 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
  *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
1
  *.safetensors filter=lfs diff=lfs merge=lfs -text
2
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
3
+ tokenizer.model filter=lfs diff=lfs merge=lfs -text
.gitkeep ADDED
File without changes
README.md ADDED
@@ -0,0 +1,32 @@
1
+ # PaddleOCR-VL-0.9B
2
+
3
+ Duplicated from https://huggingface.co/PaddlePaddle/PaddleOCR-VL/tree/main/imgs.
4
+
5
+ Example use with transformers:
6
+
7
+ ```py
8
+ from transformers import AutoModelForCausalLM, AutoProcessor
9
+ import torch
10
+
11
+ DEVICE="cuda" if torch.cuda.is_available() else "mps" if torch.mps.is_available() else "cpu"
12
+ model_id = "./PaddleOCR-VL-0.9B"
13
+
14
+ model = AutoModelForCausalLM.from_pretrained(
15
+ model_id, trust_remote_code=True, dtype=torch.bfloat16
16
+ ).to(DEVICE).eval()
17
+ processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
18
+
19
+ from transformers.image_utils import load_image
20
+ image_url = "https://fiverr-res.cloudinary.com/images/t_main1,q_auto,f_auto,q_auto,f_auto/gigs/154456946/original/41556aac80fc43dcb29ce656d786c0a6f9b4073f/do-handwritten-text-image-or-pdf-to-word-means-typing-form.jpg"
21
+ image = load_image(image_url)
22
+
23
+ messages = [{"role": "user", "content": "OCR"}]
24
+ text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
25
+ inputs = processor(text=[text], images=[image], return_tensors="pt").to(DEVICE)
26
+
27
+ generated = model.generate(**inputs, max_new_tokens=200, do_sample=False)
28
+
29
+ resp = processor.batch_decode(generated, skip_special_tokens=True)[0]
30
+ answer = resp.split(text)[-1].strip()
31
+ print(answer)
32
+ ```
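Note: the snippet above recovers the answer by splitting the decoded output on the prompt string, which can fail if the prompt does not round-trip through decoding exactly. A more robust variant (a minimal sketch, not part of the uploaded files) slices off the prompt tokens by position before decoding:

```py
# Hedged alternative to the string split above: drop the prompt tokens by position.
prompt_len = inputs["input_ids"].shape[1]
answer = processor.batch_decode(
    generated[:, prompt_len:], skip_special_tokens=True
)[0].strip()
print(answer)
```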
added_tokens.json ADDED
@@ -0,0 +1,1021 @@
1
+ {
2
+ "<ecel>": 101308,
3
+ "<fcel>": 101309,
4
+ "<lcel>": 101311,
5
+ "<nl>": 101313,
6
+ "<ucel>": 101312,
7
+ "<xcel>": 101310,
8
+ "<|AUDIO_PLACEHOLDER|>": 100296,
9
+ "<|CROP_COL_SEP|>": 101301,
10
+ "<|CROP_ROW_SEP|>": 101302,
11
+ "<|IMAGE_END|>": 101306,
12
+ "<|IMAGE_PLACEHOLDER|>": 100295,
13
+ "<|IMAGE_SEP|>": 101303,
14
+ "<|IMAGE_START|>": 101305,
15
+ "<|LOC_0|>": 100297,
16
+ "<|LOC_1000|>": 101297,
17
+ "<|LOC_100|>": 100397,
18
+ "<|LOC_101|>": 100398,
19
+ "<|LOC_102|>": 100399,
20
+ "<|LOC_103|>": 100400,
21
+ "<|LOC_104|>": 100401,
22
+ "<|LOC_105|>": 100402,
23
+ "<|LOC_106|>": 100403,
24
+ "<|LOC_107|>": 100404,
25
+ "<|LOC_108|>": 100405,
26
+ "<|LOC_109|>": 100406,
27
+ "<|LOC_10|>": 100307,
28
+ "<|LOC_110|>": 100407,
29
+ "<|LOC_111|>": 100408,
30
+ "<|LOC_112|>": 100409,
31
+ "<|LOC_113|>": 100410,
32
+ "<|LOC_114|>": 100411,
33
+ "<|LOC_115|>": 100412,
34
+ "<|LOC_116|>": 100413,
35
+ "<|LOC_117|>": 100414,
36
+ "<|LOC_118|>": 100415,
37
+ "<|LOC_119|>": 100416,
38
+ "<|LOC_11|>": 100308,
39
+ "<|LOC_120|>": 100417,
40
+ "<|LOC_121|>": 100418,
41
+ "<|LOC_122|>": 100419,
42
+ "<|LOC_123|>": 100420,
43
+ "<|LOC_124|>": 100421,
44
+ "<|LOC_125|>": 100422,
45
+ "<|LOC_126|>": 100423,
46
+ "<|LOC_127|>": 100424,
47
+ "<|LOC_128|>": 100425,
48
+ "<|LOC_129|>": 100426,
49
+ "<|LOC_12|>": 100309,
50
+ "<|LOC_130|>": 100427,
51
+ "<|LOC_131|>": 100428,
52
+ "<|LOC_132|>": 100429,
53
+ "<|LOC_133|>": 100430,
54
+ "<|LOC_134|>": 100431,
55
+ "<|LOC_135|>": 100432,
56
+ "<|LOC_136|>": 100433,
57
+ "<|LOC_137|>": 100434,
58
+ "<|LOC_138|>": 100435,
59
+ "<|LOC_139|>": 100436,
60
+ "<|LOC_13|>": 100310,
61
+ "<|LOC_140|>": 100437,
62
+ "<|LOC_141|>": 100438,
63
+ "<|LOC_142|>": 100439,
64
+ "<|LOC_143|>": 100440,
65
+ "<|LOC_144|>": 100441,
66
+ "<|LOC_145|>": 100442,
67
+ "<|LOC_146|>": 100443,
68
+ "<|LOC_147|>": 100444,
69
+ "<|LOC_148|>": 100445,
70
+ "<|LOC_149|>": 100446,
71
+ "<|LOC_14|>": 100311,
72
+ "<|LOC_150|>": 100447,
73
+ "<|LOC_151|>": 100448,
74
+ "<|LOC_152|>": 100449,
75
+ "<|LOC_153|>": 100450,
76
+ "<|LOC_154|>": 100451,
77
+ "<|LOC_155|>": 100452,
78
+ "<|LOC_156|>": 100453,
79
+ "<|LOC_157|>": 100454,
80
+ "<|LOC_158|>": 100455,
81
+ "<|LOC_159|>": 100456,
82
+ "<|LOC_15|>": 100312,
83
+ "<|LOC_160|>": 100457,
84
+ "<|LOC_161|>": 100458,
85
+ "<|LOC_162|>": 100459,
86
+ "<|LOC_163|>": 100460,
87
+ "<|LOC_164|>": 100461,
88
+ "<|LOC_165|>": 100462,
89
+ "<|LOC_166|>": 100463,
90
+ "<|LOC_167|>": 100464,
91
+ "<|LOC_168|>": 100465,
92
+ "<|LOC_169|>": 100466,
93
+ "<|LOC_16|>": 100313,
94
+ "<|LOC_170|>": 100467,
95
+ "<|LOC_171|>": 100468,
96
+ "<|LOC_172|>": 100469,
97
+ "<|LOC_173|>": 100470,
98
+ "<|LOC_174|>": 100471,
99
+ "<|LOC_175|>": 100472,
100
+ "<|LOC_176|>": 100473,
101
+ "<|LOC_177|>": 100474,
102
+ "<|LOC_178|>": 100475,
103
+ "<|LOC_179|>": 100476,
104
+ "<|LOC_17|>": 100314,
105
+ "<|LOC_180|>": 100477,
106
+ "<|LOC_181|>": 100478,
107
+ "<|LOC_182|>": 100479,
108
+ "<|LOC_183|>": 100480,
109
+ "<|LOC_184|>": 100481,
110
+ "<|LOC_185|>": 100482,
111
+ "<|LOC_186|>": 100483,
112
+ "<|LOC_187|>": 100484,
113
+ "<|LOC_188|>": 100485,
114
+ "<|LOC_189|>": 100486,
115
+ "<|LOC_18|>": 100315,
116
+ "<|LOC_190|>": 100487,
117
+ "<|LOC_191|>": 100488,
118
+ "<|LOC_192|>": 100489,
119
+ "<|LOC_193|>": 100490,
120
+ "<|LOC_194|>": 100491,
121
+ "<|LOC_195|>": 100492,
122
+ "<|LOC_196|>": 100493,
123
+ "<|LOC_197|>": 100494,
124
+ "<|LOC_198|>": 100495,
125
+ "<|LOC_199|>": 100496,
126
+ "<|LOC_19|>": 100316,
127
+ "<|LOC_1|>": 100298,
128
+ "<|LOC_200|>": 100497,
129
+ "<|LOC_201|>": 100498,
130
+ "<|LOC_202|>": 100499,
131
+ "<|LOC_203|>": 100500,
132
+ "<|LOC_204|>": 100501,
133
+ "<|LOC_205|>": 100502,
134
+ "<|LOC_206|>": 100503,
135
+ "<|LOC_207|>": 100504,
136
+ "<|LOC_208|>": 100505,
137
+ "<|LOC_209|>": 100506,
138
+ "<|LOC_20|>": 100317,
139
+ "<|LOC_210|>": 100507,
140
+ "<|LOC_211|>": 100508,
141
+ "<|LOC_212|>": 100509,
142
+ "<|LOC_213|>": 100510,
143
+ "<|LOC_214|>": 100511,
144
+ "<|LOC_215|>": 100512,
145
+ "<|LOC_216|>": 100513,
146
+ "<|LOC_217|>": 100514,
147
+ "<|LOC_218|>": 100515,
148
+ "<|LOC_219|>": 100516,
149
+ "<|LOC_21|>": 100318,
150
+ "<|LOC_220|>": 100517,
151
+ "<|LOC_221|>": 100518,
152
+ "<|LOC_222|>": 100519,
153
+ "<|LOC_223|>": 100520,
154
+ "<|LOC_224|>": 100521,
155
+ "<|LOC_225|>": 100522,
156
+ "<|LOC_226|>": 100523,
157
+ "<|LOC_227|>": 100524,
158
+ "<|LOC_228|>": 100525,
159
+ "<|LOC_229|>": 100526,
160
+ "<|LOC_22|>": 100319,
161
+ "<|LOC_230|>": 100527,
162
+ "<|LOC_231|>": 100528,
163
+ "<|LOC_232|>": 100529,
164
+ "<|LOC_233|>": 100530,
165
+ "<|LOC_234|>": 100531,
166
+ "<|LOC_235|>": 100532,
167
+ "<|LOC_236|>": 100533,
168
+ "<|LOC_237|>": 100534,
169
+ "<|LOC_238|>": 100535,
170
+ "<|LOC_239|>": 100536,
171
+ "<|LOC_23|>": 100320,
172
+ "<|LOC_240|>": 100537,
173
+ "<|LOC_241|>": 100538,
174
+ "<|LOC_242|>": 100539,
175
+ "<|LOC_243|>": 100540,
176
+ "<|LOC_244|>": 100541,
177
+ "<|LOC_245|>": 100542,
178
+ "<|LOC_246|>": 100543,
179
+ "<|LOC_247|>": 100544,
180
+ "<|LOC_248|>": 100545,
181
+ "<|LOC_249|>": 100546,
182
+ "<|LOC_24|>": 100321,
183
+ "<|LOC_250|>": 100547,
184
+ "<|LOC_251|>": 100548,
185
+ "<|LOC_252|>": 100549,
186
+ "<|LOC_253|>": 100550,
187
+ "<|LOC_254|>": 100551,
188
+ "<|LOC_255|>": 100552,
189
+ "<|LOC_256|>": 100553,
190
+ "<|LOC_257|>": 100554,
191
+ "<|LOC_258|>": 100555,
192
+ "<|LOC_259|>": 100556,
193
+ "<|LOC_25|>": 100322,
194
+ "<|LOC_260|>": 100557,
195
+ "<|LOC_261|>": 100558,
196
+ "<|LOC_262|>": 100559,
197
+ "<|LOC_263|>": 100560,
198
+ "<|LOC_264|>": 100561,
199
+ "<|LOC_265|>": 100562,
200
+ "<|LOC_266|>": 100563,
201
+ "<|LOC_267|>": 100564,
202
+ "<|LOC_268|>": 100565,
203
+ "<|LOC_269|>": 100566,
204
+ "<|LOC_26|>": 100323,
205
+ "<|LOC_270|>": 100567,
206
+ "<|LOC_271|>": 100568,
207
+ "<|LOC_272|>": 100569,
208
+ "<|LOC_273|>": 100570,
209
+ "<|LOC_274|>": 100571,
210
+ "<|LOC_275|>": 100572,
211
+ "<|LOC_276|>": 100573,
212
+ "<|LOC_277|>": 100574,
213
+ "<|LOC_278|>": 100575,
214
+ "<|LOC_279|>": 100576,
215
+ "<|LOC_27|>": 100324,
216
+ "<|LOC_280|>": 100577,
217
+ "<|LOC_281|>": 100578,
218
+ "<|LOC_282|>": 100579,
219
+ "<|LOC_283|>": 100580,
220
+ "<|LOC_284|>": 100581,
221
+ "<|LOC_285|>": 100582,
222
+ "<|LOC_286|>": 100583,
223
+ "<|LOC_287|>": 100584,
224
+ "<|LOC_288|>": 100585,
225
+ "<|LOC_289|>": 100586,
226
+ "<|LOC_28|>": 100325,
227
+ "<|LOC_290|>": 100587,
228
+ "<|LOC_291|>": 100588,
229
+ "<|LOC_292|>": 100589,
230
+ "<|LOC_293|>": 100590,
231
+ "<|LOC_294|>": 100591,
232
+ "<|LOC_295|>": 100592,
233
+ "<|LOC_296|>": 100593,
234
+ "<|LOC_297|>": 100594,
235
+ "<|LOC_298|>": 100595,
236
+ "<|LOC_299|>": 100596,
237
+ "<|LOC_29|>": 100326,
238
+ "<|LOC_2|>": 100299,
239
+ "<|LOC_300|>": 100597,
240
+ "<|LOC_301|>": 100598,
241
+ "<|LOC_302|>": 100599,
242
+ "<|LOC_303|>": 100600,
243
+ "<|LOC_304|>": 100601,
244
+ "<|LOC_305|>": 100602,
245
+ "<|LOC_306|>": 100603,
246
+ "<|LOC_307|>": 100604,
247
+ "<|LOC_308|>": 100605,
248
+ "<|LOC_309|>": 100606,
249
+ "<|LOC_30|>": 100327,
250
+ "<|LOC_310|>": 100607,
251
+ "<|LOC_311|>": 100608,
252
+ "<|LOC_312|>": 100609,
253
+ "<|LOC_313|>": 100610,
254
+ "<|LOC_314|>": 100611,
255
+ "<|LOC_315|>": 100612,
256
+ "<|LOC_316|>": 100613,
257
+ "<|LOC_317|>": 100614,
258
+ "<|LOC_318|>": 100615,
259
+ "<|LOC_319|>": 100616,
260
+ "<|LOC_31|>": 100328,
261
+ "<|LOC_320|>": 100617,
262
+ "<|LOC_321|>": 100618,
263
+ "<|LOC_322|>": 100619,
264
+ "<|LOC_323|>": 100620,
265
+ "<|LOC_324|>": 100621,
266
+ "<|LOC_325|>": 100622,
267
+ "<|LOC_326|>": 100623,
268
+ "<|LOC_327|>": 100624,
269
+ "<|LOC_328|>": 100625,
270
+ "<|LOC_329|>": 100626,
271
+ "<|LOC_32|>": 100329,
272
+ "<|LOC_330|>": 100627,
273
+ "<|LOC_331|>": 100628,
274
+ "<|LOC_332|>": 100629,
275
+ "<|LOC_333|>": 100630,
276
+ "<|LOC_334|>": 100631,
277
+ "<|LOC_335|>": 100632,
278
+ "<|LOC_336|>": 100633,
279
+ "<|LOC_337|>": 100634,
280
+ "<|LOC_338|>": 100635,
281
+ "<|LOC_339|>": 100636,
282
+ "<|LOC_33|>": 100330,
283
+ "<|LOC_340|>": 100637,
284
+ "<|LOC_341|>": 100638,
285
+ "<|LOC_342|>": 100639,
286
+ "<|LOC_343|>": 100640,
287
+ "<|LOC_344|>": 100641,
288
+ "<|LOC_345|>": 100642,
289
+ "<|LOC_346|>": 100643,
290
+ "<|LOC_347|>": 100644,
291
+ "<|LOC_348|>": 100645,
292
+ "<|LOC_349|>": 100646,
293
+ "<|LOC_34|>": 100331,
294
+ "<|LOC_350|>": 100647,
295
+ "<|LOC_351|>": 100648,
296
+ "<|LOC_352|>": 100649,
297
+ "<|LOC_353|>": 100650,
298
+ "<|LOC_354|>": 100651,
299
+ "<|LOC_355|>": 100652,
300
+ "<|LOC_356|>": 100653,
301
+ "<|LOC_357|>": 100654,
302
+ "<|LOC_358|>": 100655,
303
+ "<|LOC_359|>": 100656,
304
+ "<|LOC_35|>": 100332,
305
+ "<|LOC_360|>": 100657,
306
+ "<|LOC_361|>": 100658,
307
+ "<|LOC_362|>": 100659,
308
+ "<|LOC_363|>": 100660,
309
+ "<|LOC_364|>": 100661,
310
+ "<|LOC_365|>": 100662,
311
+ "<|LOC_366|>": 100663,
312
+ "<|LOC_367|>": 100664,
313
+ "<|LOC_368|>": 100665,
314
+ "<|LOC_369|>": 100666,
315
+ "<|LOC_36|>": 100333,
316
+ "<|LOC_370|>": 100667,
317
+ "<|LOC_371|>": 100668,
318
+ "<|LOC_372|>": 100669,
319
+ "<|LOC_373|>": 100670,
320
+ "<|LOC_374|>": 100671,
321
+ "<|LOC_375|>": 100672,
322
+ "<|LOC_376|>": 100673,
323
+ "<|LOC_377|>": 100674,
324
+ "<|LOC_378|>": 100675,
325
+ "<|LOC_379|>": 100676,
326
+ "<|LOC_37|>": 100334,
327
+ "<|LOC_380|>": 100677,
328
+ "<|LOC_381|>": 100678,
329
+ "<|LOC_382|>": 100679,
330
+ "<|LOC_383|>": 100680,
331
+ "<|LOC_384|>": 100681,
332
+ "<|LOC_385|>": 100682,
333
+ "<|LOC_386|>": 100683,
334
+ "<|LOC_387|>": 100684,
335
+ "<|LOC_388|>": 100685,
336
+ "<|LOC_389|>": 100686,
337
+ "<|LOC_38|>": 100335,
338
+ "<|LOC_390|>": 100687,
339
+ "<|LOC_391|>": 100688,
340
+ "<|LOC_392|>": 100689,
341
+ "<|LOC_393|>": 100690,
342
+ "<|LOC_394|>": 100691,
343
+ "<|LOC_395|>": 100692,
344
+ "<|LOC_396|>": 100693,
345
+ "<|LOC_397|>": 100694,
346
+ "<|LOC_398|>": 100695,
347
+ "<|LOC_399|>": 100696,
348
+ "<|LOC_39|>": 100336,
349
+ "<|LOC_3|>": 100300,
350
+ "<|LOC_400|>": 100697,
351
+ "<|LOC_401|>": 100698,
352
+ "<|LOC_402|>": 100699,
353
+ "<|LOC_403|>": 100700,
354
+ "<|LOC_404|>": 100701,
355
+ "<|LOC_405|>": 100702,
356
+ "<|LOC_406|>": 100703,
357
+ "<|LOC_407|>": 100704,
358
+ "<|LOC_408|>": 100705,
359
+ "<|LOC_409|>": 100706,
360
+ "<|LOC_40|>": 100337,
361
+ "<|LOC_410|>": 100707,
362
+ "<|LOC_411|>": 100708,
363
+ "<|LOC_412|>": 100709,
364
+ "<|LOC_413|>": 100710,
365
+ "<|LOC_414|>": 100711,
366
+ "<|LOC_415|>": 100712,
367
+ "<|LOC_416|>": 100713,
368
+ "<|LOC_417|>": 100714,
369
+ "<|LOC_418|>": 100715,
370
+ "<|LOC_419|>": 100716,
371
+ "<|LOC_41|>": 100338,
372
+ "<|LOC_420|>": 100717,
373
+ "<|LOC_421|>": 100718,
374
+ "<|LOC_422|>": 100719,
375
+ "<|LOC_423|>": 100720,
376
+ "<|LOC_424|>": 100721,
377
+ "<|LOC_425|>": 100722,
378
+ "<|LOC_426|>": 100723,
379
+ "<|LOC_427|>": 100724,
380
+ "<|LOC_428|>": 100725,
381
+ "<|LOC_429|>": 100726,
382
+ "<|LOC_42|>": 100339,
383
+ "<|LOC_430|>": 100727,
384
+ "<|LOC_431|>": 100728,
385
+ "<|LOC_432|>": 100729,
386
+ "<|LOC_433|>": 100730,
387
+ "<|LOC_434|>": 100731,
388
+ "<|LOC_435|>": 100732,
389
+ "<|LOC_436|>": 100733,
390
+ "<|LOC_437|>": 100734,
391
+ "<|LOC_438|>": 100735,
392
+ "<|LOC_439|>": 100736,
393
+ "<|LOC_43|>": 100340,
394
+ "<|LOC_440|>": 100737,
395
+ "<|LOC_441|>": 100738,
396
+ "<|LOC_442|>": 100739,
397
+ "<|LOC_443|>": 100740,
398
+ "<|LOC_444|>": 100741,
399
+ "<|LOC_445|>": 100742,
400
+ "<|LOC_446|>": 100743,
401
+ "<|LOC_447|>": 100744,
402
+ "<|LOC_448|>": 100745,
403
+ "<|LOC_449|>": 100746,
404
+ "<|LOC_44|>": 100341,
405
+ "<|LOC_450|>": 100747,
406
+ "<|LOC_451|>": 100748,
407
+ "<|LOC_452|>": 100749,
408
+ "<|LOC_453|>": 100750,
409
+ "<|LOC_454|>": 100751,
410
+ "<|LOC_455|>": 100752,
411
+ "<|LOC_456|>": 100753,
412
+ "<|LOC_457|>": 100754,
413
+ "<|LOC_458|>": 100755,
414
+ "<|LOC_459|>": 100756,
415
+ "<|LOC_45|>": 100342,
416
+ "<|LOC_460|>": 100757,
417
+ "<|LOC_461|>": 100758,
418
+ "<|LOC_462|>": 100759,
419
+ "<|LOC_463|>": 100760,
420
+ "<|LOC_464|>": 100761,
421
+ "<|LOC_465|>": 100762,
422
+ "<|LOC_466|>": 100763,
423
+ "<|LOC_467|>": 100764,
424
+ "<|LOC_468|>": 100765,
425
+ "<|LOC_469|>": 100766,
426
+ "<|LOC_46|>": 100343,
427
+ "<|LOC_470|>": 100767,
428
+ "<|LOC_471|>": 100768,
429
+ "<|LOC_472|>": 100769,
430
+ "<|LOC_473|>": 100770,
431
+ "<|LOC_474|>": 100771,
432
+ "<|LOC_475|>": 100772,
433
+ "<|LOC_476|>": 100773,
434
+ "<|LOC_477|>": 100774,
435
+ "<|LOC_478|>": 100775,
436
+ "<|LOC_479|>": 100776,
437
+ "<|LOC_47|>": 100344,
438
+ "<|LOC_480|>": 100777,
439
+ "<|LOC_481|>": 100778,
440
+ "<|LOC_482|>": 100779,
441
+ "<|LOC_483|>": 100780,
442
+ "<|LOC_484|>": 100781,
443
+ "<|LOC_485|>": 100782,
444
+ "<|LOC_486|>": 100783,
445
+ "<|LOC_487|>": 100784,
446
+ "<|LOC_488|>": 100785,
447
+ "<|LOC_489|>": 100786,
448
+ "<|LOC_48|>": 100345,
449
+ "<|LOC_490|>": 100787,
450
+ "<|LOC_491|>": 100788,
451
+ "<|LOC_492|>": 100789,
452
+ "<|LOC_493|>": 100790,
453
+ "<|LOC_494|>": 100791,
454
+ "<|LOC_495|>": 100792,
455
+ "<|LOC_496|>": 100793,
456
+ "<|LOC_497|>": 100794,
457
+ "<|LOC_498|>": 100795,
458
+ "<|LOC_499|>": 100796,
459
+ "<|LOC_49|>": 100346,
460
+ "<|LOC_4|>": 100301,
461
+ "<|LOC_500|>": 100797,
462
+ "<|LOC_501|>": 100798,
463
+ "<|LOC_502|>": 100799,
464
+ "<|LOC_503|>": 100800,
465
+ "<|LOC_504|>": 100801,
466
+ "<|LOC_505|>": 100802,
467
+ "<|LOC_506|>": 100803,
468
+ "<|LOC_507|>": 100804,
469
+ "<|LOC_508|>": 100805,
470
+ "<|LOC_509|>": 100806,
471
+ "<|LOC_50|>": 100347,
472
+ "<|LOC_510|>": 100807,
473
+ "<|LOC_511|>": 100808,
474
+ "<|LOC_512|>": 100809,
475
+ "<|LOC_513|>": 100810,
476
+ "<|LOC_514|>": 100811,
477
+ "<|LOC_515|>": 100812,
478
+ "<|LOC_516|>": 100813,
479
+ "<|LOC_517|>": 100814,
480
+ "<|LOC_518|>": 100815,
481
+ "<|LOC_519|>": 100816,
482
+ "<|LOC_51|>": 100348,
483
+ "<|LOC_520|>": 100817,
484
+ "<|LOC_521|>": 100818,
485
+ "<|LOC_522|>": 100819,
486
+ "<|LOC_523|>": 100820,
487
+ "<|LOC_524|>": 100821,
488
+ "<|LOC_525|>": 100822,
489
+ "<|LOC_526|>": 100823,
490
+ "<|LOC_527|>": 100824,
491
+ "<|LOC_528|>": 100825,
492
+ "<|LOC_529|>": 100826,
493
+ "<|LOC_52|>": 100349,
494
+ "<|LOC_530|>": 100827,
495
+ "<|LOC_531|>": 100828,
496
+ "<|LOC_532|>": 100829,
497
+ "<|LOC_533|>": 100830,
498
+ "<|LOC_534|>": 100831,
499
+ "<|LOC_535|>": 100832,
500
+ "<|LOC_536|>": 100833,
501
+ "<|LOC_537|>": 100834,
502
+ "<|LOC_538|>": 100835,
503
+ "<|LOC_539|>": 100836,
504
+ "<|LOC_53|>": 100350,
505
+ "<|LOC_540|>": 100837,
506
+ "<|LOC_541|>": 100838,
507
+ "<|LOC_542|>": 100839,
508
+ "<|LOC_543|>": 100840,
509
+ "<|LOC_544|>": 100841,
510
+ "<|LOC_545|>": 100842,
511
+ "<|LOC_546|>": 100843,
512
+ "<|LOC_547|>": 100844,
513
+ "<|LOC_548|>": 100845,
514
+ "<|LOC_549|>": 100846,
515
+ "<|LOC_54|>": 100351,
516
+ "<|LOC_550|>": 100847,
517
+ "<|LOC_551|>": 100848,
518
+ "<|LOC_552|>": 100849,
519
+ "<|LOC_553|>": 100850,
520
+ "<|LOC_554|>": 100851,
521
+ "<|LOC_555|>": 100852,
522
+ "<|LOC_556|>": 100853,
523
+ "<|LOC_557|>": 100854,
524
+ "<|LOC_558|>": 100855,
525
+ "<|LOC_559|>": 100856,
526
+ "<|LOC_55|>": 100352,
527
+ "<|LOC_560|>": 100857,
528
+ "<|LOC_561|>": 100858,
529
+ "<|LOC_562|>": 100859,
530
+ "<|LOC_563|>": 100860,
531
+ "<|LOC_564|>": 100861,
532
+ "<|LOC_565|>": 100862,
533
+ "<|LOC_566|>": 100863,
534
+ "<|LOC_567|>": 100864,
535
+ "<|LOC_568|>": 100865,
536
+ "<|LOC_569|>": 100866,
537
+ "<|LOC_56|>": 100353,
538
+ "<|LOC_570|>": 100867,
539
+ "<|LOC_571|>": 100868,
540
+ "<|LOC_572|>": 100869,
541
+ "<|LOC_573|>": 100870,
542
+ "<|LOC_574|>": 100871,
543
+ "<|LOC_575|>": 100872,
544
+ "<|LOC_576|>": 100873,
545
+ "<|LOC_577|>": 100874,
546
+ "<|LOC_578|>": 100875,
547
+ "<|LOC_579|>": 100876,
548
+ "<|LOC_57|>": 100354,
549
+ "<|LOC_580|>": 100877,
550
+ "<|LOC_581|>": 100878,
551
+ "<|LOC_582|>": 100879,
552
+ "<|LOC_583|>": 100880,
553
+ "<|LOC_584|>": 100881,
554
+ "<|LOC_585|>": 100882,
555
+ "<|LOC_586|>": 100883,
556
+ "<|LOC_587|>": 100884,
557
+ "<|LOC_588|>": 100885,
558
+ "<|LOC_589|>": 100886,
559
+ "<|LOC_58|>": 100355,
560
+ "<|LOC_590|>": 100887,
561
+ "<|LOC_591|>": 100888,
562
+ "<|LOC_592|>": 100889,
563
+ "<|LOC_593|>": 100890,
564
+ "<|LOC_594|>": 100891,
565
+ "<|LOC_595|>": 100892,
566
+ "<|LOC_596|>": 100893,
567
+ "<|LOC_597|>": 100894,
568
+ "<|LOC_598|>": 100895,
569
+ "<|LOC_599|>": 100896,
570
+ "<|LOC_59|>": 100356,
571
+ "<|LOC_5|>": 100302,
572
+ "<|LOC_600|>": 100897,
573
+ "<|LOC_601|>": 100898,
574
+ "<|LOC_602|>": 100899,
575
+ "<|LOC_603|>": 100900,
576
+ "<|LOC_604|>": 100901,
577
+ "<|LOC_605|>": 100902,
578
+ "<|LOC_606|>": 100903,
579
+ "<|LOC_607|>": 100904,
580
+ "<|LOC_608|>": 100905,
581
+ "<|LOC_609|>": 100906,
582
+ "<|LOC_60|>": 100357,
583
+ "<|LOC_610|>": 100907,
584
+ "<|LOC_611|>": 100908,
585
+ "<|LOC_612|>": 100909,
586
+ "<|LOC_613|>": 100910,
587
+ "<|LOC_614|>": 100911,
588
+ "<|LOC_615|>": 100912,
589
+ "<|LOC_616|>": 100913,
590
+ "<|LOC_617|>": 100914,
591
+ "<|LOC_618|>": 100915,
592
+ "<|LOC_619|>": 100916,
593
+ "<|LOC_61|>": 100358,
594
+ "<|LOC_620|>": 100917,
595
+ "<|LOC_621|>": 100918,
596
+ "<|LOC_622|>": 100919,
597
+ "<|LOC_623|>": 100920,
598
+ "<|LOC_624|>": 100921,
599
+ "<|LOC_625|>": 100922,
600
+ "<|LOC_626|>": 100923,
601
+ "<|LOC_627|>": 100924,
602
+ "<|LOC_628|>": 100925,
603
+ "<|LOC_629|>": 100926,
604
+ "<|LOC_62|>": 100359,
605
+ "<|LOC_630|>": 100927,
606
+ "<|LOC_631|>": 100928,
607
+ "<|LOC_632|>": 100929,
608
+ "<|LOC_633|>": 100930,
609
+ "<|LOC_634|>": 100931,
610
+ "<|LOC_635|>": 100932,
611
+ "<|LOC_636|>": 100933,
612
+ "<|LOC_637|>": 100934,
613
+ "<|LOC_638|>": 100935,
614
+ "<|LOC_639|>": 100936,
615
+ "<|LOC_63|>": 100360,
616
+ "<|LOC_640|>": 100937,
617
+ "<|LOC_641|>": 100938,
618
+ "<|LOC_642|>": 100939,
619
+ "<|LOC_643|>": 100940,
620
+ "<|LOC_644|>": 100941,
621
+ "<|LOC_645|>": 100942,
622
+ "<|LOC_646|>": 100943,
623
+ "<|LOC_647|>": 100944,
624
+ "<|LOC_648|>": 100945,
625
+ "<|LOC_649|>": 100946,
626
+ "<|LOC_64|>": 100361,
627
+ "<|LOC_650|>": 100947,
628
+ "<|LOC_651|>": 100948,
629
+ "<|LOC_652|>": 100949,
630
+ "<|LOC_653|>": 100950,
631
+ "<|LOC_654|>": 100951,
632
+ "<|LOC_655|>": 100952,
633
+ "<|LOC_656|>": 100953,
634
+ "<|LOC_657|>": 100954,
635
+ "<|LOC_658|>": 100955,
636
+ "<|LOC_659|>": 100956,
637
+ "<|LOC_65|>": 100362,
638
+ "<|LOC_660|>": 100957,
639
+ "<|LOC_661|>": 100958,
640
+ "<|LOC_662|>": 100959,
641
+ "<|LOC_663|>": 100960,
642
+ "<|LOC_664|>": 100961,
643
+ "<|LOC_665|>": 100962,
644
+ "<|LOC_666|>": 100963,
645
+ "<|LOC_667|>": 100964,
646
+ "<|LOC_668|>": 100965,
647
+ "<|LOC_669|>": 100966,
648
+ "<|LOC_66|>": 100363,
649
+ "<|LOC_670|>": 100967,
650
+ "<|LOC_671|>": 100968,
651
+ "<|LOC_672|>": 100969,
652
+ "<|LOC_673|>": 100970,
653
+ "<|LOC_674|>": 100971,
654
+ "<|LOC_675|>": 100972,
655
+ "<|LOC_676|>": 100973,
656
+ "<|LOC_677|>": 100974,
657
+ "<|LOC_678|>": 100975,
658
+ "<|LOC_679|>": 100976,
659
+ "<|LOC_67|>": 100364,
660
+ "<|LOC_680|>": 100977,
661
+ "<|LOC_681|>": 100978,
662
+ "<|LOC_682|>": 100979,
663
+ "<|LOC_683|>": 100980,
664
+ "<|LOC_684|>": 100981,
665
+ "<|LOC_685|>": 100982,
666
+ "<|LOC_686|>": 100983,
667
+ "<|LOC_687|>": 100984,
668
+ "<|LOC_688|>": 100985,
669
+ "<|LOC_689|>": 100986,
670
+ "<|LOC_68|>": 100365,
671
+ "<|LOC_690|>": 100987,
672
+ "<|LOC_691|>": 100988,
673
+ "<|LOC_692|>": 100989,
674
+ "<|LOC_693|>": 100990,
675
+ "<|LOC_694|>": 100991,
676
+ "<|LOC_695|>": 100992,
677
+ "<|LOC_696|>": 100993,
678
+ "<|LOC_697|>": 100994,
679
+ "<|LOC_698|>": 100995,
680
+ "<|LOC_699|>": 100996,
681
+ "<|LOC_69|>": 100366,
682
+ "<|LOC_6|>": 100303,
683
+ "<|LOC_700|>": 100997,
684
+ "<|LOC_701|>": 100998,
685
+ "<|LOC_702|>": 100999,
686
+ "<|LOC_703|>": 101000,
687
+ "<|LOC_704|>": 101001,
688
+ "<|LOC_705|>": 101002,
689
+ "<|LOC_706|>": 101003,
690
+ "<|LOC_707|>": 101004,
691
+ "<|LOC_708|>": 101005,
692
+ "<|LOC_709|>": 101006,
693
+ "<|LOC_70|>": 100367,
694
+ "<|LOC_710|>": 101007,
695
+ "<|LOC_711|>": 101008,
696
+ "<|LOC_712|>": 101009,
697
+ "<|LOC_713|>": 101010,
698
+ "<|LOC_714|>": 101011,
699
+ "<|LOC_715|>": 101012,
700
+ "<|LOC_716|>": 101013,
701
+ "<|LOC_717|>": 101014,
702
+ "<|LOC_718|>": 101015,
703
+ "<|LOC_719|>": 101016,
704
+ "<|LOC_71|>": 100368,
705
+ "<|LOC_720|>": 101017,
706
+ "<|LOC_721|>": 101018,
707
+ "<|LOC_722|>": 101019,
708
+ "<|LOC_723|>": 101020,
709
+ "<|LOC_724|>": 101021,
710
+ "<|LOC_725|>": 101022,
711
+ "<|LOC_726|>": 101023,
712
+ "<|LOC_727|>": 101024,
713
+ "<|LOC_728|>": 101025,
714
+ "<|LOC_729|>": 101026,
715
+ "<|LOC_72|>": 100369,
716
+ "<|LOC_730|>": 101027,
717
+ "<|LOC_731|>": 101028,
718
+ "<|LOC_732|>": 101029,
719
+ "<|LOC_733|>": 101030,
720
+ "<|LOC_734|>": 101031,
721
+ "<|LOC_735|>": 101032,
722
+ "<|LOC_736|>": 101033,
723
+ "<|LOC_737|>": 101034,
724
+ "<|LOC_738|>": 101035,
725
+ "<|LOC_739|>": 101036,
726
+ "<|LOC_73|>": 100370,
727
+ "<|LOC_740|>": 101037,
728
+ "<|LOC_741|>": 101038,
729
+ "<|LOC_742|>": 101039,
730
+ "<|LOC_743|>": 101040,
731
+ "<|LOC_744|>": 101041,
732
+ "<|LOC_745|>": 101042,
733
+ "<|LOC_746|>": 101043,
734
+ "<|LOC_747|>": 101044,
735
+ "<|LOC_748|>": 101045,
736
+ "<|LOC_749|>": 101046,
737
+ "<|LOC_74|>": 100371,
738
+ "<|LOC_750|>": 101047,
739
+ "<|LOC_751|>": 101048,
740
+ "<|LOC_752|>": 101049,
741
+ "<|LOC_753|>": 101050,
742
+ "<|LOC_754|>": 101051,
743
+ "<|LOC_755|>": 101052,
744
+ "<|LOC_756|>": 101053,
745
+ "<|LOC_757|>": 101054,
746
+ "<|LOC_758|>": 101055,
747
+ "<|LOC_759|>": 101056,
748
+ "<|LOC_75|>": 100372,
749
+ "<|LOC_760|>": 101057,
750
+ "<|LOC_761|>": 101058,
751
+ "<|LOC_762|>": 101059,
752
+ "<|LOC_763|>": 101060,
753
+ "<|LOC_764|>": 101061,
754
+ "<|LOC_765|>": 101062,
755
+ "<|LOC_766|>": 101063,
756
+ "<|LOC_767|>": 101064,
757
+ "<|LOC_768|>": 101065,
758
+ "<|LOC_769|>": 101066,
759
+ "<|LOC_76|>": 100373,
760
+ "<|LOC_770|>": 101067,
761
+ "<|LOC_771|>": 101068,
762
+ "<|LOC_772|>": 101069,
763
+ "<|LOC_773|>": 101070,
764
+ "<|LOC_774|>": 101071,
765
+ "<|LOC_775|>": 101072,
766
+ "<|LOC_776|>": 101073,
767
+ "<|LOC_777|>": 101074,
768
+ "<|LOC_778|>": 101075,
769
+ "<|LOC_779|>": 101076,
770
+ "<|LOC_77|>": 100374,
771
+ "<|LOC_780|>": 101077,
772
+ "<|LOC_781|>": 101078,
773
+ "<|LOC_782|>": 101079,
774
+ "<|LOC_783|>": 101080,
775
+ "<|LOC_784|>": 101081,
776
+ "<|LOC_785|>": 101082,
777
+ "<|LOC_786|>": 101083,
778
+ "<|LOC_787|>": 101084,
779
+ "<|LOC_788|>": 101085,
780
+ "<|LOC_789|>": 101086,
781
+ "<|LOC_78|>": 100375,
782
+ "<|LOC_790|>": 101087,
783
+ "<|LOC_791|>": 101088,
784
+ "<|LOC_792|>": 101089,
785
+ "<|LOC_793|>": 101090,
786
+ "<|LOC_794|>": 101091,
787
+ "<|LOC_795|>": 101092,
788
+ "<|LOC_796|>": 101093,
789
+ "<|LOC_797|>": 101094,
790
+ "<|LOC_798|>": 101095,
791
+ "<|LOC_799|>": 101096,
792
+ "<|LOC_79|>": 100376,
793
+ "<|LOC_7|>": 100304,
794
+ "<|LOC_800|>": 101097,
795
+ "<|LOC_801|>": 101098,
796
+ "<|LOC_802|>": 101099,
797
+ "<|LOC_803|>": 101100,
798
+ "<|LOC_804|>": 101101,
799
+ "<|LOC_805|>": 101102,
800
+ "<|LOC_806|>": 101103,
801
+ "<|LOC_807|>": 101104,
802
+ "<|LOC_808|>": 101105,
803
+ "<|LOC_809|>": 101106,
804
+ "<|LOC_80|>": 100377,
805
+ "<|LOC_810|>": 101107,
806
+ "<|LOC_811|>": 101108,
807
+ "<|LOC_812|>": 101109,
808
+ "<|LOC_813|>": 101110,
809
+ "<|LOC_814|>": 101111,
810
+ "<|LOC_815|>": 101112,
811
+ "<|LOC_816|>": 101113,
812
+ "<|LOC_817|>": 101114,
813
+ "<|LOC_818|>": 101115,
814
+ "<|LOC_819|>": 101116,
815
+ "<|LOC_81|>": 100378,
816
+ "<|LOC_820|>": 101117,
817
+ "<|LOC_821|>": 101118,
818
+ "<|LOC_822|>": 101119,
819
+ "<|LOC_823|>": 101120,
820
+ "<|LOC_824|>": 101121,
821
+ "<|LOC_825|>": 101122,
822
+ "<|LOC_826|>": 101123,
823
+ "<|LOC_827|>": 101124,
824
+ "<|LOC_828|>": 101125,
825
+ "<|LOC_829|>": 101126,
826
+ "<|LOC_82|>": 100379,
827
+ "<|LOC_830|>": 101127,
828
+ "<|LOC_831|>": 101128,
829
+ "<|LOC_832|>": 101129,
830
+ "<|LOC_833|>": 101130,
831
+ "<|LOC_834|>": 101131,
832
+ "<|LOC_835|>": 101132,
833
+ "<|LOC_836|>": 101133,
834
+ "<|LOC_837|>": 101134,
835
+ "<|LOC_838|>": 101135,
836
+ "<|LOC_839|>": 101136,
837
+ "<|LOC_83|>": 100380,
838
+ "<|LOC_840|>": 101137,
839
+ "<|LOC_841|>": 101138,
840
+ "<|LOC_842|>": 101139,
841
+ "<|LOC_843|>": 101140,
842
+ "<|LOC_844|>": 101141,
843
+ "<|LOC_845|>": 101142,
844
+ "<|LOC_846|>": 101143,
845
+ "<|LOC_847|>": 101144,
846
+ "<|LOC_848|>": 101145,
847
+ "<|LOC_849|>": 101146,
848
+ "<|LOC_84|>": 100381,
849
+ "<|LOC_850|>": 101147,
850
+ "<|LOC_851|>": 101148,
851
+ "<|LOC_852|>": 101149,
852
+ "<|LOC_853|>": 101150,
853
+ "<|LOC_854|>": 101151,
854
+ "<|LOC_855|>": 101152,
855
+ "<|LOC_856|>": 101153,
856
+ "<|LOC_857|>": 101154,
857
+ "<|LOC_858|>": 101155,
858
+ "<|LOC_859|>": 101156,
859
+ "<|LOC_85|>": 100382,
860
+ "<|LOC_860|>": 101157,
861
+ "<|LOC_861|>": 101158,
862
+ "<|LOC_862|>": 101159,
863
+ "<|LOC_863|>": 101160,
864
+ "<|LOC_864|>": 101161,
865
+ "<|LOC_865|>": 101162,
866
+ "<|LOC_866|>": 101163,
867
+ "<|LOC_867|>": 101164,
868
+ "<|LOC_868|>": 101165,
869
+ "<|LOC_869|>": 101166,
870
+ "<|LOC_86|>": 100383,
871
+ "<|LOC_870|>": 101167,
872
+ "<|LOC_871|>": 101168,
873
+ "<|LOC_872|>": 101169,
874
+ "<|LOC_873|>": 101170,
875
+ "<|LOC_874|>": 101171,
876
+ "<|LOC_875|>": 101172,
877
+ "<|LOC_876|>": 101173,
878
+ "<|LOC_877|>": 101174,
879
+ "<|LOC_878|>": 101175,
880
+ "<|LOC_879|>": 101176,
881
+ "<|LOC_87|>": 100384,
882
+ "<|LOC_880|>": 101177,
883
+ "<|LOC_881|>": 101178,
884
+ "<|LOC_882|>": 101179,
885
+ "<|LOC_883|>": 101180,
886
+ "<|LOC_884|>": 101181,
887
+ "<|LOC_885|>": 101182,
888
+ "<|LOC_886|>": 101183,
889
+ "<|LOC_887|>": 101184,
890
+ "<|LOC_888|>": 101185,
891
+ "<|LOC_889|>": 101186,
892
+ "<|LOC_88|>": 100385,
893
+ "<|LOC_890|>": 101187,
894
+ "<|LOC_891|>": 101188,
895
+ "<|LOC_892|>": 101189,
896
+ "<|LOC_893|>": 101190,
897
+ "<|LOC_894|>": 101191,
898
+ "<|LOC_895|>": 101192,
899
+ "<|LOC_896|>": 101193,
900
+ "<|LOC_897|>": 101194,
901
+ "<|LOC_898|>": 101195,
902
+ "<|LOC_899|>": 101196,
903
+ "<|LOC_89|>": 100386,
904
+ "<|LOC_8|>": 100305,
905
+ "<|LOC_900|>": 101197,
906
+ "<|LOC_901|>": 101198,
907
+ "<|LOC_902|>": 101199,
908
+ "<|LOC_903|>": 101200,
909
+ "<|LOC_904|>": 101201,
910
+ "<|LOC_905|>": 101202,
911
+ "<|LOC_906|>": 101203,
912
+ "<|LOC_907|>": 101204,
913
+ "<|LOC_908|>": 101205,
914
+ "<|LOC_909|>": 101206,
915
+ "<|LOC_90|>": 100387,
916
+ "<|LOC_910|>": 101207,
917
+ "<|LOC_911|>": 101208,
918
+ "<|LOC_912|>": 101209,
919
+ "<|LOC_913|>": 101210,
920
+ "<|LOC_914|>": 101211,
921
+ "<|LOC_915|>": 101212,
922
+ "<|LOC_916|>": 101213,
923
+ "<|LOC_917|>": 101214,
924
+ "<|LOC_918|>": 101215,
925
+ "<|LOC_919|>": 101216,
926
+ "<|LOC_91|>": 100388,
927
+ "<|LOC_920|>": 101217,
928
+ "<|LOC_921|>": 101218,
929
+ "<|LOC_922|>": 101219,
930
+ "<|LOC_923|>": 101220,
931
+ "<|LOC_924|>": 101221,
932
+ "<|LOC_925|>": 101222,
933
+ "<|LOC_926|>": 101223,
934
+ "<|LOC_927|>": 101224,
935
+ "<|LOC_928|>": 101225,
936
+ "<|LOC_929|>": 101226,
937
+ "<|LOC_92|>": 100389,
938
+ "<|LOC_930|>": 101227,
939
+ "<|LOC_931|>": 101228,
940
+ "<|LOC_932|>": 101229,
941
+ "<|LOC_933|>": 101230,
942
+ "<|LOC_934|>": 101231,
943
+ "<|LOC_935|>": 101232,
944
+ "<|LOC_936|>": 101233,
945
+ "<|LOC_937|>": 101234,
946
+ "<|LOC_938|>": 101235,
947
+ "<|LOC_939|>": 101236,
948
+ "<|LOC_93|>": 100390,
949
+ "<|LOC_940|>": 101237,
950
+ "<|LOC_941|>": 101238,
951
+ "<|LOC_942|>": 101239,
952
+ "<|LOC_943|>": 101240,
953
+ "<|LOC_944|>": 101241,
954
+ "<|LOC_945|>": 101242,
955
+ "<|LOC_946|>": 101243,
956
+ "<|LOC_947|>": 101244,
957
+ "<|LOC_948|>": 101245,
958
+ "<|LOC_949|>": 101246,
959
+ "<|LOC_94|>": 100391,
960
+ "<|LOC_950|>": 101247,
961
+ "<|LOC_951|>": 101248,
962
+ "<|LOC_952|>": 101249,
963
+ "<|LOC_953|>": 101250,
964
+ "<|LOC_954|>": 101251,
965
+ "<|LOC_955|>": 101252,
966
+ "<|LOC_956|>": 101253,
967
+ "<|LOC_957|>": 101254,
968
+ "<|LOC_958|>": 101255,
969
+ "<|LOC_959|>": 101256,
970
+ "<|LOC_95|>": 100392,
971
+ "<|LOC_960|>": 101257,
972
+ "<|LOC_961|>": 101258,
973
+ "<|LOC_962|>": 101259,
974
+ "<|LOC_963|>": 101260,
975
+ "<|LOC_964|>": 101261,
976
+ "<|LOC_965|>": 101262,
977
+ "<|LOC_966|>": 101263,
978
+ "<|LOC_967|>": 101264,
979
+ "<|LOC_968|>": 101265,
980
+ "<|LOC_969|>": 101266,
981
+ "<|LOC_96|>": 100393,
982
+ "<|LOC_970|>": 101267,
983
+ "<|LOC_971|>": 101268,
984
+ "<|LOC_972|>": 101269,
985
+ "<|LOC_973|>": 101270,
986
+ "<|LOC_974|>": 101271,
987
+ "<|LOC_975|>": 101272,
988
+ "<|LOC_976|>": 101273,
989
+ "<|LOC_977|>": 101274,
990
+ "<|LOC_978|>": 101275,
991
+ "<|LOC_979|>": 101276,
992
+ "<|LOC_97|>": 100394,
993
+ "<|LOC_980|>": 101277,
994
+ "<|LOC_981|>": 101278,
995
+ "<|LOC_982|>": 101279,
996
+ "<|LOC_983|>": 101280,
997
+ "<|LOC_984|>": 101281,
998
+ "<|LOC_985|>": 101282,
999
+ "<|LOC_986|>": 101283,
1000
+ "<|LOC_987|>": 101284,
1001
+ "<|LOC_988|>": 101285,
1002
+ "<|LOC_989|>": 101286,
1003
+ "<|LOC_98|>": 100395,
1004
+ "<|LOC_990|>": 101287,
1005
+ "<|LOC_991|>": 101288,
1006
+ "<|LOC_992|>": 101289,
1007
+ "<|LOC_993|>": 101290,
1008
+ "<|LOC_994|>": 101291,
1009
+ "<|LOC_995|>": 101292,
1010
+ "<|LOC_996|>": 101293,
1011
+ "<|LOC_997|>": 101294,
1012
+ "<|LOC_998|>": 101295,
1013
+ "<|LOC_999|>": 101296,
1014
+ "<|LOC_99|>": 100396,
1015
+ "<|LOC_9|>": 100306,
1016
+ "<|LOC_BEGIN|>": 101298,
1017
+ "<|LOC_END|>": 101299,
1018
+ "<|LOC_SEP|>": 101300,
1019
+ "<|image_pad|>": 101304,
1020
+ "<|video_pad|>": 101307
1021
+ }
chat_template.jinja ADDED
@@ -0,0 +1,22 @@
1
+ {%- if not add_generation_prompt is defined -%}
2
+ {%- set add_generation_prompt = true -%}
3
+ {%- endif -%}
4
+ {%- if not cls_token is defined -%}
5
+ {%- set cls_token = "<|begin_of_sentence|>" -%}
6
+ {%- endif -%}
7
+ {%- if not sep_token is defined -%}
8
+ {%- set sep_token = "<|end_of_sentence|>" -%}
9
+ {%- endif -%}
10
+ {{- cls_token -}}
11
+ {%- for message in messages -%}
12
+ {%- if message["role"] == "user" -%}
13
+ {{- "User: <|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>" + message["content"] + "\n" -}}
14
+ {%- elif message["role"] == "assistant" -%}
15
+ {{- "Assistant: " + message["content"] + sep_token -}}
16
+ {%- elif message["role"] == "system" -%}
17
+ {{- message["content"] -}}
18
+ {%- endif -%}
19
+ {%- endfor -%}
20
+ {%- if add_generation_prompt -%}
21
+ {{- "Assistant: " -}}
22
+ {%- endif -%}
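For reference, a hand-traced rendering of the template above (assuming the default cls/sep tokens and add_generation_prompt=True); this is an illustration derived from the template, not output captured from the repository:

```py
# Hand-traced example of what apply_chat_template produces for one user turn:
messages = [{"role": "user", "content": "OCR"}]
# Rendered prompt string:
# "<|begin_of_sentence|>User: <|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>OCR\nAssistant: "
```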
config.json ADDED
@@ -0,0 +1,75 @@
1
+ {
2
+ "architectures": [
3
+ "PaddleOCRVLForConditionalGeneration"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.0,
6
+ "auto_map": {
7
+ "AutoConfig": "configuration_paddleocr_vl.PaddleOCRVLConfig",
8
+ "AutoModel": "modeling_paddleocr_vl.PaddleOCRVLForConditionalGeneration",
9
+ "AutoModelForCausalLM": "modeling_paddleocr_vl.PaddleOCRVLForConditionalGeneration"
10
+ },
11
+ "compression_ratio": 1.0,
12
+ "head_dim": 128,
13
+ "hidden_act": "silu",
14
+ "hidden_dropout_prob": 0.0,
15
+ "hidden_size": 1024,
16
+ "ignored_index": -100,
17
+ "image_token_id": 100295,
18
+ "intermediate_size": 3072,
19
+ "max_position_embeddings": 131072,
20
+ "max_sequence_length": null,
21
+ "model_type": "paddleocr_vl",
22
+ "num_attention_heads": 16,
23
+ "num_hidden_layers": 18,
24
+ "num_key_value_heads": 2,
25
+ "pad_token_id": 0,
26
+ "rms_norm_eps": 1e-05,
27
+ "rope_scaling": {
28
+ "mrope_section": [
29
+ 16,
30
+ 24,
31
+ 24
32
+ ],
33
+ "rope_type": "default",
34
+ "type": "default"
35
+ },
36
+ "rope_theta": 500000,
37
+ "sliding_window": null,
38
+ "tie_word_embeddings": false,
39
+ "torch_dtype": "bfloat16",
40
+ "transformers_version": "4.55.0",
41
+ "use_bias": false,
42
+ "use_cache": false,
43
+ "use_flash_attention": false,
44
+ "video_token_id": 101307,
45
+ "vision_config": {
46
+ "architectures": [
47
+ "SiglipVisionModel"
48
+ ],
49
+ "attention_dropout": 0.0,
50
+ "auto_map": {
51
+ "AutoConfig": "configuration_paddleocr_vl.PaddleOCRVLConfig",
52
+ "AutoModel": "modeling_paddleocr_vl.SiglipVisionModel"
53
+ },
54
+ "hidden_act": "gelu_pytorch_tanh",
55
+ "hidden_size": 1152,
56
+ "image_size": 384,
57
+ "intermediate_size": 4304,
58
+ "layer_norm_eps": 1e-06,
59
+ "model_type": "paddleocr_vl",
60
+ "num_attention_heads": 16,
61
+ "num_channels": 3,
62
+ "num_hidden_layers": 27,
63
+ "pad_token_id": 0,
64
+ "patch_size": 14,
65
+ "spatial_merge_size": 2,
66
+ "temporal_patch_size": 2,
67
+ "tokens_per_second": 2,
68
+ "torch_dtype": "bfloat16"
69
+ },
70
+ "vision_start_token_id": 101305,
71
+ "vocab_size": 103424,
72
+ "weight_share_add_bias": true,
73
+ "use_3d_rope": true,
74
+ "rope_is_neox_style": true
75
+ }
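Because auto_map in the config above points at the custom configuration/modeling modules shipped in this repository, loading it requires trust_remote_code. A minimal sketch (reusing the local path from the README; the printed values are the ones in the JSON above):

```py
from transformers import AutoConfig

# Loads the custom PaddleOCRVLConfig declared in auto_map (trust_remote_code required).
config = AutoConfig.from_pretrained("./PaddleOCR-VL-0.9B", trust_remote_code=True)
print(config.model_type)                 # "paddleocr_vl"
print(config.hidden_size)                # 1024 (text decoder)
print(config.vision_config.hidden_size)  # 1152 (SigLIP vision encoder)
```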
configuration_paddleocr_vl.py ADDED
@@ -0,0 +1,191 @@
1
+ # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from transformers.configuration_utils import PretrainedConfig
16
+ from transformers.modeling_rope_utils import rope_config_validation
17
+
18
+ class PaddleOCRVisionConfig(PretrainedConfig):
19
+ model_type = "paddleocr_vl"
20
+ base_config_key = "vision_config"
21
+
22
+ def __init__(
23
+ self,
24
+ hidden_size=768,
25
+ intermediate_size=3072,
26
+ num_hidden_layers=12,
27
+ num_attention_heads=12,
28
+ num_channels=3,
29
+ image_size=224,
30
+ patch_size=14,
31
+ hidden_act="gelu_pytorch_tanh",
32
+ layer_norm_eps=1e-6,
33
+ attention_dropout=0.0,
34
+ spatial_merge_size=2,
35
+ temporal_patch_size=2,
36
+ tokens_per_second=2,
37
+ **kwargs,
38
+ ):
39
+ super().__init__(**kwargs)
40
+
41
+ self.hidden_size = hidden_size
42
+ self.intermediate_size = intermediate_size
43
+ self.num_hidden_layers = num_hidden_layers
44
+ self.num_attention_heads = num_attention_heads
45
+ self.num_channels = num_channels
46
+ self.patch_size = patch_size
47
+ self.image_size = image_size
48
+ self.attention_dropout = attention_dropout
49
+ self.layer_norm_eps = layer_norm_eps
50
+ self.hidden_act = hidden_act
51
+ self.spatial_merge_size = spatial_merge_size
52
+ self.temporal_patch_size = temporal_patch_size
53
+ self.tokens_per_second = tokens_per_second
54
+
55
+
56
+
57
+ class PaddleOCRVLConfig(PretrainedConfig):
58
+ """
59
+ Configuration class for PaddleOCR-VL.
60
+
61
+ This class stores the configuration of the PaddleOCR-VL model (an ERNIE-based language model paired with a SigLIP vision encoder), defining the model architecture.
62
+ It inherits from PretrainedConfig and can be used to control model outputs.
63
+ """
64
+
65
+ model_type = "paddleocr_vl"
66
+ keys_to_ignore_at_inference = ["past_key_values"]
67
+ sub_configs = {"vision_config": PaddleOCRVisionConfig}
68
+
69
+ # Default tensor parallel plan for the language-model decoder layers
70
+ base_model_tp_plan = {
71
+ "layers.*.self_attn.q_proj": "colwise",
72
+ "layers.*.self_attn.k_proj": "colwise",
73
+ "layers.*.self_attn.v_proj": "colwise",
74
+ "layers.*.self_attn.o_proj": "rowwise",
75
+ "layers.*.mlp.gate_proj": "colwise",
76
+ "layers.*.mlp.up_proj": "colwise",
77
+ "layers.*.mlp.down_proj": "rowwise",
78
+ }
79
+ base_model_pp_plan = {
80
+ "embed_tokens": (["input_ids"], ["inputs_embeds"]),
81
+ "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
82
+ "norm": (["hidden_states"], ["hidden_states"]),
83
+ }
84
+
85
+ def __init__(
86
+ self,
87
+ vocab_size=32000,
88
+ hidden_size=768,
89
+ intermediate_size=11008,
90
+ max_position_embeddings=32768,
91
+ num_hidden_layers=2,
92
+ num_attention_heads=2,
93
+ image_token_id=101304,
94
+ video_token_id=101305,
95
+ vision_start_token_id=101306,
96
+ rms_norm_eps=1e-6,
97
+ use_cache=False,
98
+ use_flash_attention=False,
99
+ pad_token_id=0,
100
+ bos_token_id=1,
101
+ eos_token_id=2,
102
+ head_dim=128,
103
+ hidden_act="silu",
104
+ use_bias=False,
105
+ rope_theta=10000,
106
+ weight_share_add_bias=True,
107
+ ignored_index=-100,
108
+ attention_probs_dropout_prob=0.0,
109
+ hidden_dropout_prob=0.0,
110
+ compression_ratio: float = 1.0,
111
+ num_key_value_heads=None,
112
+ max_sequence_length=None,
113
+ tie_word_embeddings=False,
114
+ vision_config=None,
115
+ rope_scaling=None,
116
+ **kwargs,
117
+ ):
118
+ """
119
+ Initialize configuration with default or specified parameters.
120
+
121
+ Args:
122
+ vocab_size (int): Size of the vocabulary (number of unique tokens)
123
+ hidden_size (int): Dimensionality of the encoder layers and the pooler layer
124
+ intermediate_size (int): Dimensionality of the "intermediate" (feed-forward) layer
125
+ max_position_embeddings (int): Maximum sequence length the model can handle
126
+ num_hidden_layers (int): Number of hidden layers in the Transformer encoder
127
+ num_attention_heads (int): Number of attention heads for each attention layer
128
+ rms_norm_eps (float): The epsilon used by the RMS normalization layers
129
+ use_cache (bool): Whether to use caching for faster generation (decoding)
130
+ use_flash_attention (bool): Whether to use FlashAttention for optimized attention computation
131
+ pad_token_id (int): Token ID used for padding sequences
132
+ bos_token_id (int): Token ID used for beginning-of-sequence
133
+ eos_token_id (int): Token ID used for end-of-sequence
134
+ use_bias (bool): Whether to use bias terms in linear layers
135
+ rope_theta (float): The base period of the RoPE embeddings
136
+ weight_share_add_bias (bool): Whether to share bias weights in certain layers
137
+ ignored_index (int): Target value that is ignored during loss computation
138
+ attention_probs_dropout_prob (float): Dropout probability for attention weights
139
+ hidden_dropout_prob (float): Dropout probability for hidden layers
140
+ compression_ratio (float): Ratio for KV cache compression (1.0 = no compression)
141
+ num_key_value_heads (int): Number of key/value heads (for Grouped Query Attention)
142
+ max_sequence_length (int): Maximum sequence length for positional embeddings
143
+ **kwargs: Additional keyword arguments passed to parent class
144
+ """
145
+
146
+ # Set default for tied embeddings if not specified.
147
+ super().__init__(
148
+ pad_token_id=pad_token_id,
149
+ bos_token_id=bos_token_id,
150
+ eos_token_id=eos_token_id,
151
+ **kwargs,
152
+ )
153
+ if isinstance(vision_config, dict):
154
+ self.vision_config = self.sub_configs["vision_config"](**vision_config)
155
+ elif vision_config is None:
156
+ self.vision_config = self.sub_configs["vision_config"]()
157
+ self.vocab_size = vocab_size
158
+ self.hidden_size = hidden_size
159
+ self.intermediate_size = intermediate_size
160
+ self.max_position_embeddings = max_position_embeddings
161
+ self.num_hidden_layers = num_hidden_layers
162
+ self.num_attention_heads = num_attention_heads
163
+ self.rms_norm_eps = rms_norm_eps
164
+ self.use_cache = use_cache
165
+ self.use_flash_attention = use_flash_attention
166
+ self.pad_token_id = pad_token_id
167
+ self.bos_token_id = bos_token_id
168
+ self.eos_token_id = eos_token_id
169
+ self.image_token_id = image_token_id
170
+ self.video_token_id = video_token_id
171
+ self.vision_start_token_id = vision_start_token_id
172
+ self.head_dim = head_dim
173
+ self.hidden_act = hidden_act
174
+ self.sliding_window = None
175
+ self.hidden_size = hidden_size
176
+ self.use_bias = use_bias
177
+ self.weight_share_add_bias = weight_share_add_bias
178
+ self.rope_theta = rope_theta
179
+ self.ignored_index = ignored_index
180
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
181
+ self.hidden_dropout_prob = hidden_dropout_prob
182
+ self.compression_ratio = compression_ratio
183
+ self.num_key_value_heads = num_key_value_heads
184
+ self.max_sequence_length = max_sequence_length
185
+ self.rope_scaling = rope_scaling
186
+ if self.rope_scaling is not None and "type" in self.rope_scaling:
187
+ if self.rope_scaling["type"] == "mrope":
188
+ self.rope_scaling["type"] = "default"
189
+ self.rope_scaling["rope_type"] = self.rope_scaling["type"]
190
+ rope_config_validation(self, ignore_keys={"mrope_section"})
191
+ super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
generation_config.json ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "_from_model_config": true,
3
+ "eos_token_id": 2,
4
+ "transformers_version": "4.55.0",
5
+ "use_cache": false
6
+ }
image_processing.py ADDED
@@ -0,0 +1,569 @@
1
+ # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Image processor class for PaddleOCR-VL."""
16
+
17
+ import math
18
+ from typing import Dict, List, Optional, Union
19
+
20
+ import numpy as np
21
+ import torch
22
+ from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
23
+ from torchvision.transforms import functional as TF
24
+ from transformers.image_transforms import (
25
+ convert_to_rgb,
26
+ resize,
27
+ to_channel_dimension_format,
28
+ )
29
+ from transformers.image_utils import (
30
+ OPENAI_CLIP_MEAN,
31
+ OPENAI_CLIP_STD,
32
+ ChannelDimension,
33
+ PILImageResampling,
34
+ get_image_size,
35
+ infer_channel_dimension_format,
36
+ is_scaled_image,
37
+ is_valid_image,
38
+ make_list_of_images,
39
+ to_numpy_array,
40
+ valid_images,
41
+ validate_preprocess_arguments,
42
+ )
43
+ from transformers.utils import TensorType, is_vision_available, logging
44
+
45
+
46
+ logger = logging.get_logger(__name__)
47
+
48
+
49
+ if is_vision_available():
50
+ from PIL import Image
51
+
52
+ ImageInput = Union[
53
+ "PIL.Image.Image",
54
+ np.ndarray,
55
+ "torch.Tensor",
56
+ List["PIL.Image.Image"],
57
+ List[np.ndarray],
58
+ List["torch.Tensor"],
59
+ ] # noqa
60
+
61
+
62
+ VideoInput = Union[
63
+ List["PIL.Image.Image"],
64
+ "np.ndarray",
65
+ "torch.Tensor",
66
+ List["np.ndarray"],
67
+ List["torch.Tensor"],
68
+ List[List["PIL.Image.Image"]],
69
+ List[List["np.ndarrray"]],
70
+ List[List["torch.Tensor"]],
71
+ ] # noqa
72
+
73
+
74
+ def make_batched_images(images) -> List[List[ImageInput]]:
75
+ """
76
+ Accepts images in list or nested list format, and makes a list of images for preprocessing.
77
+
78
+ Args:
79
+ images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
80
+ The input image.
81
+
82
+ Returns:
83
+ list: A list of images.
84
+ """
85
+ if (
86
+ isinstance(images, (list, tuple))
87
+ and isinstance(images[0], (list, tuple))
88
+ and is_valid_image(images[0][0])
89
+ ):
90
+ return [img for img_list in images for img in img_list]
91
+
92
+ elif isinstance(images, (list, tuple)) and is_valid_image(images[0]):
93
+ return images
94
+
95
+ elif is_valid_image(images):
96
+ return [images]
97
+
98
+ raise ValueError(f"Could not make batched images from {images}")
99
+
100
+
101
+ def adjust_size(size, patch_size):
102
+ num_patches = size // patch_size
103
+ if num_patches % 2 != 0: # 如果是奇数,减1
104
+ num_patches -= 1
105
+ return num_patches * patch_size
106
+
107
+
108
+ def make_batched_videos(videos) -> List[VideoInput]:
109
+ if (
110
+ isinstance(videos, (list, tuple))
111
+ and isinstance(videos[0], (list, tuple))
112
+ and is_valid_image(videos[0][0])
113
+ ):
114
+ return videos
115
+
116
+ elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]):
117
+ if isinstance(videos[0], Image.Image):
118
+ return [videos]
119
+ elif len(videos[0].shape) == 4:
120
+ return [list(video) for video in videos]
121
+
122
+ elif is_valid_image(videos) and len(videos.shape) == 4:
123
+ return [list(videos)]
124
+
125
+ raise ValueError(f"Could not make batched video from {videos}")
126
+
127
+
128
+ def smart_resize(
129
+ height: int,
130
+ width: int,
131
+ factor: int = 28,
132
+ min_pixels: int = 28 * 28 * 130,
133
+ max_pixels: int = 28 * 28 * 1280,
134
+ ):
135
+ """Rescales the image so that the following conditions are met:
136
+
137
+ 1. Both dimensions (height and width) are divisible by 'factor'.
138
+
139
+ 2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
140
+
141
+ 3. The aspect ratio of the image is maintained as closely as possible.
142
+
143
+ """
144
+ # if height < factor or width < factor:
145
+ # raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor}")
146
+ # if int(height < factor//4) + int(width < factor//4):
147
+ # raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor//4}")
148
+
149
+ if height < factor:
150
+ print(f"smart_resize: height={height} < factor={factor}, reset height=factor")
151
+ width = round((width * factor) / height)
152
+ height = factor
153
+
154
+ if width < factor:
155
+ print(f"smart_resize: width={width} < factor={factor}, reset width=factor")
156
+ height = round((height * factor) / width)
157
+ width = factor
158
+
159
+ if max(height, width) / min(height, width) > 200:
160
+ raise ValueError(
161
+ f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}"
162
+ )
163
+ h_bar = round(height / factor) * factor
164
+ w_bar = round(width / factor) * factor
165
+ if h_bar * w_bar > max_pixels:
166
+ beta = math.sqrt((height * width) / max_pixels)
167
+ h_bar = math.floor(height / beta / factor) * factor
168
+ w_bar = math.floor(width / beta / factor) * factor
169
+ elif h_bar * w_bar < min_pixels:
170
+ beta = math.sqrt(min_pixels / (height * width))
171
+ h_bar = math.ceil(height * beta / factor) * factor
172
+ w_bar = math.ceil(width * beta / factor) * factor
173
+ return h_bar, w_bar
174
+
175
+
176
+ class SiglipImageProcessor(BaseImageProcessor):
177
+ r"""
178
+ Constructs a Siglip image processor that dynamically resizes images based on the original images.
179
+
180
+ Args:
181
+ do_resize (`bool`, *optional*, defaults to `True`):
182
+ Whether to resize the image's (height, width) dimensions.
183
+ resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
184
+ Resampling filter to use when resizing the image.
185
+ do_rescale (`bool`, *optional*, defaults to `True`):
186
+ Whether to rescale the image by the specified scale `rescale_factor`.
187
+ rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
188
+ Scale factor to use if rescaling the image.
189
+ do_normalize (`bool`, *optional*, defaults to `True`):
190
+ Whether to normalize the image.
191
+ image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
192
+ Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
193
+ image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
194
+ Standard deviation to use if normalizing the image. This is a float or list of floats for each channel in the image.
195
+ do_convert_rgb (`bool`, *optional*, defaults to `True`):
196
+ Whether to convert the image to RGB.
197
+ min_pixels (`int`, *optional*, defaults to `28 * 28 * 130`):
198
+ The min pixels of the image to resize the image.
199
+ max_pixels (`int`, *optional*, defaults to `28 * 28 * 1670`):
200
+ The max pixels of the image to resize the image.
201
+ patch_size (`int`, *optional*, defaults to 14):
202
+ The spacial patch size of the vision encoder.
203
+ temporal_patch_size (`int`, *optional*, defaults to 2):
204
+ The temporal patch size of the vision encoder.
205
+ merge_size (`int`, *optional*, defaults to 2):
206
+ The merge size of the vision encoder to llm encoder.
207
+ """
208
+
209
+ model_input_names = [
210
+ "pixel_values",
211
+ "image_grid_thw",
212
+ "pixel_values_videos",
213
+ "video_grid_thw",
214
+ ]
215
+
216
+ def __init__(
217
+ self,
218
+ do_resize: bool = True,
219
+ resample: PILImageResampling = PILImageResampling.BICUBIC,
220
+ do_rescale: bool = True,
221
+ rescale_factor: Union[int, float] = 1 / 255,
222
+ do_normalize: bool = True,
223
+ image_mean: Optional[Union[float, List[float]]] = None,
224
+ image_std: Optional[Union[float, List[float]]] = None,
225
+ do_convert_rgb: bool = True,
226
+ min_pixels: int = 28 * 28 * 130,
227
+ max_pixels: int = 28 * 28 * 1280,
228
+ patch_size: int = 14,
229
+ temporal_patch_size: int = 1,
230
+ merge_size: int = 2,
231
+ **kwargs,
232
+ ) -> None:
233
+ super().__init__(**kwargs)
234
+ self.do_resize = do_resize
235
+ self.resample = resample
236
+ self.do_rescale = do_rescale
237
+ self.rescale_factor = rescale_factor
238
+ self.do_normalize = do_normalize
239
+ self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
240
+ self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
241
+ self.min_pixels = min_pixels
242
+ self.max_pixels = max_pixels
243
+ self.patch_size = patch_size
244
+ self.temporal_patch_size = temporal_patch_size
245
+ self.merge_size = merge_size
246
+ self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}  # kept for config serialization and argument validation
247
+ self.do_convert_rgb = do_convert_rgb
248
+
249
+ def mvit_rescale(self, image: Image.Image, merge_size: int = 2) -> Image.Image:
250
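+ # NOTE: this helper assumes `self.in_token_limit` and `self.pad_input` are provided
+ # elsewhere (they are not set in `__init__` above), and that `TF`
+ # (torchvision.transforms.functional) and `adjust_size` are available from earlier in this file (not shown in this diff).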
+ try:
251
+ w, h = image.size
252
+ except Exception as exc:
253
+ raise ValueError(f"Expected a PIL image with a `size` attribute, got {type(image)!r}") from exc
254
+ patch_size = self.patch_size
255
+
256
+ if (w // patch_size) * (h // patch_size) > self.in_token_limit:
257
+ scale = math.sqrt(
258
+ self.in_token_limit / ((w // patch_size) * (h // patch_size))
259
+ )
260
+ new_w, new_h = int(w * scale), int(h * scale)
261
+
262
+ image = image.resize((new_w, new_h), Image.Resampling.BICUBIC)
263
+ if self.pad_input:
264
+ new_w, new_h = image.size
265
+ pad_size_h = merge_size * patch_size
266
+ pad_size_w = merge_size * patch_size
267
+
268
+ pad_h = (pad_size_h - new_h % pad_size_h) % pad_size_h
269
+ pad_w = (pad_size_w - new_w % pad_size_w) % pad_size_w
270
+
271
+ image = TF.pad(image, (0, 0, pad_w, pad_h))
272
+ else:
273
+ new_w, new_h = image.size
274
+ new_w = new_w - new_w % patch_size
275
+ new_h = new_h - new_h % patch_size
276
+
277
+ new_w = adjust_size(new_w, patch_size)
278
+ new_h = adjust_size(new_h, patch_size)
279
+
280
+ image = TF.center_crop(image, (new_h, new_w))
281
+
282
+ w, h = image.size
283
+ if w // patch_size >= 512 or h // patch_size >= 512:
284
+ new_h = min(patch_size * 510, h)
285
+ new_w = min(patch_size * 510, w)
286
+ image = TF.center_crop(image, (new_h, new_w))
287
+ # raise ValueError("Exceed pos emb")
288
+ return image
289
+
290
+ def _preprocess(
291
+ self,
292
+ images: Union[ImageInput, VideoInput],
293
+ do_resize: bool = None,
294
+ resample: PILImageResampling = None,
295
+ do_rescale: bool = None,
296
+ rescale_factor: float = None,
297
+ do_normalize: bool = None,
298
+ image_mean: Optional[Union[float, List[float]]] = None,
299
+ image_std: Optional[Union[float, List[float]]] = None,
300
+ do_convert_rgb: bool = None,
301
+ data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
302
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
303
+ ):
304
+ """
305
+ Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.
306
+
307
+ Args:
308
+ images (`ImageInput`):
309
+ Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255. If pixel values range from 0 to 1, set `do_rescale=False`.
310
+ vision_info (`List[Dict]`, *optional*):
311
+ Optional list of dictionaries containing additional information about vision inputs.
312
+ do_resize (`bool`, *optional*, defaults to `self.do_resize`):
313
+ Whether to resize the image.
314
+ resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
315
+ Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
316
+ do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
317
+ Whether to rescale the image.
318
+ rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
319
+ Scale factor to use if rescaling the image.
320
+ do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
321
+ Whether to normalize the image.
322
+ image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
323
+ Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
324
+ image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
325
+ Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
326
+ do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
327
+ Whether to convert the image to RGB.
328
+ data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
329
+ The channel dimension format for the output image. Can be one of:
330
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
331
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
332
+ - Unset: Use the channel dimension format of the input image.
333
+ input_data_format (`ChannelDimension` or `str`, *optional*):
334
+ The channel dimension format for the input image. Can be one of:
335
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
336
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
337
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
338
+ """
339
+ images = make_list_of_images(images)
340
+
341
+ if do_convert_rgb:
342
+ images = [convert_to_rgb(image) for image in images]
343
+
344
+ # All transformations expect numpy arrays.
345
+ images = [to_numpy_array(image) for image in images]
346
+
347
+ if is_scaled_image(images[0]) and do_rescale:
348
+ logger.warning_once(
349
+ "It looks like you are trying to rescale already rescaled images. If the input"
350
+ " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
351
+ )
352
+ if input_data_format is None:
353
+ # We assume that all images have the same channel dimension format.
354
+ input_data_format = infer_channel_dimension_format(images[0])
355
+
356
+ height, width = get_image_size(images[0], channel_dim=input_data_format)
357
+ resized_height, resized_width = height, width
358
+ processed_images = []
359
+
360
+ for image in images:
361
+ if do_resize:
362
+ resized_height, resized_width = smart_resize(
363
+ height,
364
+ width,
365
+ factor=self.patch_size * self.merge_size,
366
+ min_pixels=self.min_pixels,
367
+ max_pixels=self.max_pixels,
368
+ )
369
+ image = resize(
370
+ image,
371
+ size=(resized_height, resized_width),
372
+ resample=resample,
373
+ input_data_format=input_data_format,
374
+ )
375
+
376
+ if do_rescale:
377
+ image = self.rescale(
378
+ image, scale=rescale_factor, input_data_format=input_data_format
379
+ )
380
+
381
+ if do_normalize:
382
+ image = self.normalize(
383
+ image=image,
384
+ mean=image_mean,
385
+ std=image_std,
386
+ input_data_format=input_data_format,
387
+ )
388
+ image = to_channel_dimension_format(
389
+ image, data_format, input_channel_dim=input_data_format
390
+ )
391
+ processed_images.append(image)
392
+
393
+ patches = np.array(processed_images)
394
+ if data_format == ChannelDimension.LAST:
395
+ patches = patches.transpose(0, 3, 1, 2)
396
+ if patches.shape[0] == 1:
397
+ patches = np.tile(patches, (self.temporal_patch_size, 1, 1, 1))
398
+ init_patches = patches
399
+ channel = patches.shape[1]
400
+ grid_t = patches.shape[0] // self.temporal_patch_size
401
+ grid_h, grid_w = (
402
+ resized_height // self.patch_size,
403
+ resized_width // self.patch_size,
404
+ )
405
+ patches = patches.reshape(
406
+ grid_t,
407
+ self.temporal_patch_size,
408
+ channel,
409
+ grid_h,
410
+ self.patch_size,
411
+ grid_w,
412
+ self.patch_size,
413
+ )
414
+ patches = patches.transpose(0, 3, 5, 2, 1, 4, 6)
415
+ assert self.temporal_patch_size == 1
416
+ flatten_patches = patches.reshape(
417
+ grid_t * grid_h * grid_w, channel, self.patch_size, self.patch_size
418
+ )
419
+ return flatten_patches, (grid_t, grid_h, grid_w)
420
+
421
+ def preprocess(
422
+ self,
423
+ images: ImageInput,
424
+ videos: VideoInput = None,
425
+ do_resize: bool = None,
426
+ size: Dict[str, int] = None,
427
+ resample: PILImageResampling = None,
428
+ do_rescale: bool = None,
429
+ rescale_factor: float = None,
430
+ do_normalize: bool = None,
431
+ image_mean: Optional[Union[float, List[float]]] = None,
432
+ image_std: Optional[Union[float, List[float]]] = None,
433
+ do_convert_rgb: bool = None,
434
+ return_tensors: Optional[Union[str, TensorType]] = None,
435
+ data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
436
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
437
+ ):
438
+ """
439
+ Args:
440
+ images (`ImageInput`):
441
+ Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
442
+ passing in images with pixel values between 0 and 1, set `do_rescale=False`.
443
+ videos (`VideoInput`):
444
+ Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If
445
+ passing in videos with pixel values between 0 and 1, set `do_rescale=False`.
446
+ do_resize (`bool`, *optional*, defaults to `self.do_resize`):
447
+ Whether to resize the image.
448
+ size (`Dict[str, int]`, *optional*, defaults to `self.size`):
449
+ Size constraints used when resizing. Here this is a dict with `min_pixels` and `max_pixels` keys that
450
+ bound the total number of pixels while preserving the aspect ratio.
451
+ resample (`int`, *optional*, defaults to `self.resample`):
452
+ Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
453
+ has an effect if `do_resize` is set to `True`.
454
+ do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
455
+ Whether to rescale the image.
456
+ rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
457
+ Rescale factor to rescale the image by if `do_rescale` is set to `True`.
458
+ do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
459
+ Whether to normalize the image.
460
+ image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
461
+ Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
462
+ image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
463
+ Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
464
+ `True`.
465
+ do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
466
+ Whether to convert the image to RGB.
467
+ return_tensors (`str` or `TensorType`, *optional*):
468
+ The type of tensors to return. Can be one of:
469
+ - Unset: Return a list of `np.ndarray`.
470
+ - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
471
+ - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
472
+ - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
473
+ - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
474
+ data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
475
+ The channel dimension format for the output image. Can be one of:
476
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
477
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
478
+ - Unset: Use the channel dimension format of the input image.
479
+ input_data_format (`ChannelDimension` or `str`, *optional*):
480
+ The channel dimension format for the input image. If unset, the channel dimension format is inferred
481
+ from the input image. Can be one of:
482
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
483
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
484
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
485
+
486
+ """
487
+ do_resize = do_resize if do_resize is not None else self.do_resize
488
+ size = size if size is not None else self.size
489
+ resample = resample if resample is not None else self.resample
490
+ do_rescale = do_rescale if do_rescale is not None else self.do_rescale
491
+ rescale_factor = (
492
+ rescale_factor if rescale_factor is not None else self.rescale_factor
493
+ )
494
+ do_normalize = do_normalize if do_normalize is not None else self.do_normalize
495
+ image_mean = image_mean if image_mean is not None else self.image_mean
496
+ image_std = image_std if image_std is not None else self.image_std
497
+ do_convert_rgb = (
498
+ do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
499
+ )
500
+
501
+ if images is not None:
502
+ images = make_batched_images(images)
503
+ if videos is not None:
504
+ videos = make_batched_videos(videos)
505
+
506
+ if images is not None and not valid_images(images):
507
+ raise ValueError(
508
+ "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
509
+ "torch.Tensor, tf.Tensor or jax.ndarray."
510
+ )
511
+
512
+ validate_preprocess_arguments(
513
+ rescale_factor=rescale_factor,
514
+ do_normalize=do_normalize,
515
+ image_mean=image_mean,
516
+ image_std=image_std,
517
+ do_resize=do_resize,
518
+ size=size,
519
+ resample=resample,
520
+ )
521
+
522
+ if images is not None:
523
+ pixel_values, vision_grid_thws = [], []
524
+ for image in images:
525
+ patches, image_grid_thw = self._preprocess(
526
+ image,
527
+ do_resize=do_resize,
528
+ resample=resample,
529
+ do_rescale=do_rescale,
530
+ rescale_factor=rescale_factor,
531
+ do_normalize=do_normalize,
532
+ image_mean=image_mean,
533
+ image_std=image_std,
534
+ data_format=data_format,
535
+ do_convert_rgb=do_convert_rgb,
536
+ input_data_format=input_data_format,
537
+ )
538
+ pixel_values.extend(patches)
539
+ vision_grid_thws.append(image_grid_thw)
540
+ pixel_values = np.array(pixel_values)
541
+ vision_grid_thws = np.array(vision_grid_thws)
542
+ data = {"pixel_values": pixel_values, "image_grid_thw": vision_grid_thws}
543
+
544
+ if videos is not None:
545
+ pixel_values, vision_grid_thws = [], []
546
+ for images in videos:
547
+ patches, video_grid_thw = self._preprocess(
548
+ images,
549
+ do_resize=do_resize,
550
+ resample=resample,
551
+ do_rescale=do_rescale,
552
+ rescale_factor=rescale_factor,
553
+ do_normalize=do_normalize,
554
+ image_mean=image_mean,
555
+ image_std=image_std,
556
+ data_format=data_format,
557
+ do_convert_rgb=do_convert_rgb,
558
+ input_data_format=input_data_format,
559
+ )
560
+ pixel_values.extend(patches)
561
+ vision_grid_thws.append(video_grid_thw)
562
+ pixel_values = np.array(pixel_values)
563
+ vision_grid_thws = np.array(vision_grid_thws)
564
+ data = {
565
+ "pixel_values_videos": pixel_values,
566
+ "video_grid_thw": vision_grid_thws,
567
+ }
568
+
569
+ return BatchFeature(data=data, tensor_type=return_tensors)
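The tail of `smart_resize` shown above clamps the rounded resolution into the `[min_pixels, max_pixels]` budget, and `_preprocess` then derives the patch grid from the resized shape. Below is a minimal standalone sketch of that arithmetic, assuming the class defaults above (`patch_size=14`, `merge_size=2`, hence `factor=28`); the initial rounding step is an assumption, since only the clamping branches are visible in this diff.

```py
import math

def smart_resize_sketch(height, width, factor=28,
                        min_pixels=28 * 28 * 130, max_pixels=28 * 28 * 1280):
    # Round both sides to multiples of `factor` (assumed initial step),
    # then clamp the total pixel count into [min_pixels, max_pixels].
    h_bar = round(height / factor) * factor
    w_bar = round(width / factor) * factor
    if h_bar * w_bar > max_pixels:
        beta = math.sqrt((height * width) / max_pixels)
        h_bar = math.floor(height / beta / factor) * factor
        w_bar = math.floor(width / beta / factor) * factor
    elif h_bar * w_bar < min_pixels:
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = math.ceil(height * beta / factor) * factor
        w_bar = math.ceil(width * beta / factor) * factor
    return h_bar, w_bar

h, w = smart_resize_sketch(1080, 1920)          # e.g. a 1080p page scan
grid_h, grid_w = h // 14, w // 14               # patch grid seen by the vision encoder
num_llm_tokens = (grid_h * grid_w) // (2 * 2)   # merge_size=2 folds 2x2 patches into one LLM token
print(h, w, grid_h, grid_w, num_llm_tokens)
```

The resulting `(grid_t, grid_h, grid_w)` is what `_preprocess` returns alongside the flattened patches, and the token count after the 2×2 merge is what the processor later uses to expand image placeholders.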
inference.yml ADDED
@@ -0,0 +1,2 @@
1
+ Global:
2
+ model_name: PaddleOCR-VL-0.9B
model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3085f1042e184f68f8a412aa0f64f2c4b8562989598bbfba326aaa11fc685de8
3
+ size 1917255968
modeling_paddleocr_vl.py ADDED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json ADDED
@@ -0,0 +1,33 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoImageProcessor": "image_processing.SiglipImageProcessor",
4
+ "AutoProcessor": "processing_paddleocr_vl.PaddleOCRVLProcessor"
5
+ },
6
+ "do_convert_rgb": true,
7
+ "do_normalize": true,
8
+ "do_rescale": true,
9
+ "do_resize": true,
10
+ "image_mean": [
11
+ 0.5,
12
+ 0.5,
13
+ 0.5
14
+ ],
15
+ "image_processor_type": "SiglipImageProcessor",
16
+ "image_std": [
17
+ 0.5,
18
+ 0.5,
19
+ 0.5
20
+ ],
21
+ "max_pixels": 2822400,
22
+ "merge_size": 2,
23
+ "min_pixels": 147384,
24
+ "patch_size": 14,
25
+ "processor_class": "PaddleOCRVLProcessor",
26
+ "resample": 3,
27
+ "rescale_factor": 0.00392156862745098,
28
+ "size": {
29
+ "max_pixels": 2822400,
30
+ "min_pixels": 147384
31
+ },
32
+ "temporal_patch_size": 1
33
+ }
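A hedged sketch of using this configuration, assuming the repository has been downloaded locally to `./PaddleOCR-VL-0.9B`: the `auto_map` entry above routes `AutoImageProcessor` to the custom `SiglipImageProcessor`, so `trust_remote_code=True` is required, and the pixel bounds below come directly from this file.

```py
from transformers import AutoImageProcessor
from PIL import Image

# auto_map -> image_processing.SiglipImageProcessor, hence trust_remote_code=True.
image_processor = AutoImageProcessor.from_pretrained(
    "./PaddleOCR-VL-0.9B", trust_remote_code=True
)
print(image_processor.min_pixels, image_processor.max_pixels)  # 147384 2822400 per this config

image = Image.new("RGB", (1280, 960), color="white")  # placeholder image for illustration
features = image_processor(images=image, return_tensors="pt")
print(features["pixel_values"].shape)  # (num_patches, 3, 14, 14) flattened patches
print(features["image_grid_thw"])      # (t, h, w) patch grid, t=1 for still images
```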
processing_paddleocr_vl.py ADDED
@@ -0,0 +1,293 @@
1
+ # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import List, Union
16
+ import numpy as np
17
+ import torch
18
+ from transformers.feature_extraction_utils import BatchFeature
19
+ from transformers.processing_utils import (
20
+ ProcessingKwargs,
21
+ ProcessorMixin,
22
+ Unpack,
23
+ VideosKwargs,
24
+ )
25
+ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
26
+
27
+
28
+ ImageInput = Union[
29
+ "PIL.Image.Image",
30
+ np.ndarray,
31
+ "torch.Tensor",
32
+ List["PIL.Image.Image"],
33
+ List[np.ndarray],
34
+ List["torch.Tensor"],
35
+ ] # noqa
36
+
37
+
38
+ VideoInput = Union[
39
+ List["PIL.Image.Image"],
40
+ "np.ndarray",
41
+ "torch.Tensor",
42
+ List["np.ndarray"],
43
+ List["torch.Tensor"],
44
+ List[List["PIL.Image.Image"]],
45
+ List[List["np.ndarrray"]],
46
+ List[List["torch.Tensor"]],
47
+ ] # noqa
48
+
49
+
50
+ class PaddleOCRVLVideosProcessorKwargs(VideosKwargs, total=False):
51
+ fps: Union[List[float], float]
52
+
53
+
54
+ class PaddleOCRVLProcessorKwargs(ProcessingKwargs, total=False):
55
+ videos_kwargs: PaddleOCRVLVideosProcessorKwargs
56
+ _defaults = {
57
+ "text_kwargs": {
58
+ "padding": False,
59
+ },
60
+ "videos_kwargs": {"fps": 2.0},
61
+ }
62
+
63
+
64
+ class PaddleOCRVLProcessor(ProcessorMixin):
65
+ r"""
66
+ [`PaddleOCRVLProcessor`] offers all the functionalities of [`SiglipImageProcessor`] and [`Qwen2TokenizerFast`]. See the
67
+ [`~PaddleOCRVLProcessor.__call__`] and [`~PaddleOCRVLProcessor.decode`] for more information.
68
+ Args:
69
+ image_processor ([`SiglipImageProcessor`], *optional*):
70
+ The image processor is a required input.
71
+ tokenizer ([`Qwen2TokenizerFast`], *optional*):
72
+ The tokenizer is a required input.
73
+ chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
74
+ in a chat into a tokenizable string.
75
+ """
76
+
77
+ attributes = ["image_processor", "tokenizer"]
78
+ valid_kwargs = [
79
+ "chat_template",
80
+ "image_std",
81
+ "min_pixels",
82
+ "image_mean",
83
+ "merge_size",
84
+ "image_processor_type",
85
+ "temporal_patch_size",
86
+ "patch_size",
87
+ "max_pixels",
88
+ ]
89
+
90
+ image_processor_class = "AutoImageProcessor"
91
+ tokenizer_class = "AutoTokenizer"
92
+
93
+ def __init__(
94
+ self, image_processor=None, tokenizer=None, chat_template=None, **kwargs
95
+ ):
96
+ self.image_token = (
97
+ "<|IMAGE_PLACEHOLDER|>"
98
+ if not hasattr(tokenizer, "image_token")
99
+ else tokenizer.image_token
100
+ )
101
+ self.video_token = (
102
+ "<|video_pad|>"
103
+ if not hasattr(tokenizer, "video_token")
104
+ else tokenizer.video_token
105
+ )
106
+ super().__init__(image_processor, tokenizer, chat_template=chat_template)
107
+
108
+ def __call__(
109
+ self,
110
+ images: ImageInput = None,
111
+ text: Union[
112
+ TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]
113
+ ] = None,
114
+ videos: VideoInput = None,
115
+ **kwargs: Unpack[PaddleOCRVLProcessorKwargs],
116
+ ) -> BatchFeature:
117
+ """
118
+ Main method to prepare one or several sequence(s) and image(s) for the model. This method forwards the `text`
119
+ and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
120
+ the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwargs` arguments to
121
+ SiglipImageProcessor's [`~SiglipImageProcessor.__call__`] if `vision_infos` is not `None`.
122
+
123
+ Args:
124
+ images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
125
+ The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
126
+ tensor. Both channels-first and channels-last formats are supported.
127
+ text (`str`, `List[str]`, `List[List[str]]`):
128
+ The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
129
+ (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
130
+ `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
131
+ videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
132
+ The video or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
133
+ tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
134
+ return_tensors (`str` or [`~utils.TensorType`], *optional*):
135
+ If set, will return tensors of a particular framework. Acceptable values are:
136
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
137
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
138
+ - `'np'`: Return NumPy `np.ndarray` objects.
139
+ - `'jax'`: Return JAX `jnp.ndarray` objects.
140
+
141
+ Returns:
142
+ [`BatchFeature`]: A [`BatchFeature`] with the following fields:
143
+
144
+ - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
145
+ - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
146
+ `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
147
+ `None`).
148
+ - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
149
+ - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
150
+ - **image_grid_thw** -- List of 3D patch grids (t, h, w) for each image. Returned when `images` is not `None`.
151
+ - **video_grid_thw** -- List of 3D patch grids (t, h, w) for each video. Returned when `videos` is not `None`.
152
+ - **second_per_grid_ts** -- List of seconds per temporal grid step for each video. Returned when `videos` is not `None`.
153
+ """
154
+ output_kwargs = self._merge_kwargs(
155
+ PaddleOCRVLProcessorKwargs,
156
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
157
+ **kwargs,
158
+ )
159
+
160
+ if images is not None:
161
+ image_inputs = self.image_processor(images=images, return_tensors="pt")
163
+ image_grid_thw = image_inputs["image_grid_thw"]
164
+
165
+ else:
166
+ image_inputs = {}
167
+ image_grid_thw = None
168
+
169
+ if videos is not None:
170
+ # TODO: add video processing
171
+ videos_inputs = self.image_processor(
172
+ images=None, videos=videos, **output_kwargs["images_kwargs"]
173
+ )
174
+ video_grid_thw = videos_inputs["video_grid_thw"]
175
+
176
+ fps = output_kwargs["videos_kwargs"].pop("fps", 2.0)
177
+ if isinstance(fps, (int, float)):
178
+ second_per_grid_ts = [
179
+ self.image_processor.temporal_patch_size / fps
180
+ ] * len(video_grid_thw)
181
+ elif hasattr(fps, "__len__") and len(fps) == len(video_grid_thw):
182
+ second_per_grid_ts = [
183
+ self.image_processor.temporal_patch_size / tmp for tmp in fps
184
+ ]
185
+ else:
186
+ raise ValueError(
187
+ f"The length of fps ({len(fps) if hasattr(fps, '__len__') else fps}) must be equal to the length of video_grid_thw ({len(video_grid_thw)}) or fps should be a single number."
188
+ )
189
+ videos_inputs.update(
190
+ {"second_per_grid_ts": torch.tensor(second_per_grid_ts)}
191
+ )
192
+
193
+ else:
194
+ videos_inputs = {}
195
+ video_grid_thw = None
196
+
197
+ if not isinstance(text, list):
198
+ text = [text]
199
+
200
+ if image_grid_thw is not None:
201
+ index = 0
202
+ for i in range(len(text)):
203
+ while self.image_token in text[i]:
204
+ text[i] = text[i].replace(
205
+ self.image_token,
206
+ "<|placeholder|>"
207
+ * (
208
+ image_grid_thw[index].prod()
209
+ // self.image_processor.merge_size
210
+ // self.image_processor.merge_size
211
+ ),
212
+ 1,
213
+ )
214
+ index += 1
215
+ text[i] = text[i].replace("<|placeholder|>", self.image_token)
216
+
217
+ if video_grid_thw is not None:
218
+ index = 0
219
+ for i in range(len(text)):
220
+ while self.video_token in text[i]:
221
+ text[i] = text[i].replace(
222
+ self.video_token,
223
+ "<|placeholder|>"
224
+ * (
225
+ video_grid_thw[index].prod()
226
+ // self.image_processor.merge_size
227
+ // self.image_processor.merge_size
228
+ ),
229
+ 1,
230
+ )
231
+ index += 1
232
+ text[i] = text[i].replace("<|placeholder|>", self.video_token)
233
+
234
+ text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
235
+
236
+ return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs})
237
+
238
+ def batch_decode(self, *args, **kwargs):
239
+ """
240
+ This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
241
+ refer to the docstring of this method for more information.
242
+ """
243
+ return self.tokenizer.batch_decode(*args, **kwargs)
244
+
245
+ def decode(self, *args, **kwargs):
246
+ """
247
+ This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
248
+ the docstring of this method for more information.
249
+ """
250
+ return self.tokenizer.decode(*args, **kwargs)
251
+
252
+ def post_process_image_text_to_text(
253
+ self,
254
+ generated_outputs,
255
+ skip_special_tokens=True,
256
+ clean_up_tokenization_spaces=False,
257
+ **kwargs,
258
+ ):
259
+ """
260
+ Post-process the output of the model to decode the text.
261
+
262
+ Args:
263
+ generated_outputs (`torch.Tensor` or `np.ndarray`):
264
+ The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
265
+ or `(sequence_length,)`.
266
+ skip_special_tokens (`bool`, *optional*, defaults to `True`):
267
+ Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
268
+ clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
269
+ Whether or not to clean up the tokenization spaces. Argument passed to the tokenizer's `batch_decode` method.
270
+ **kwargs:
271
+ Additional arguments to be passed to the tokenizer's `batch_decode` method.
272
+
273
+ Returns:
274
+ `List[str]`: The decoded text.
275
+ """
276
+ return self.tokenizer.batch_decode(
277
+ generated_outputs,
278
+ skip_special_tokens=skip_special_tokens,
279
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
280
+ **kwargs,
281
+ )
282
+
283
+ @property
284
+ def model_input_names(self):
285
+ tokenizer_input_names = self.tokenizer.model_input_names
286
+ image_processor_input_names = self.image_processor.model_input_names
287
+ names_from_processor = list(
288
+ dict.fromkeys(tokenizer_input_names + image_processor_input_names)
289
+ )
290
+ return names_from_processor + ["second_per_grid_ts"]
291
+
292
+
293
+ __all__ = ["PaddleOCRVLProcessor"]
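To make the placeholder expansion in `__call__` above concrete: each `<|IMAGE_PLACEHOLDER|>` in the prompt is replaced with `t * h * w // merge_size**2` image tokens, where `(t, h, w)` comes from `image_grid_thw`. Below is a self-contained sketch of that arithmetic; the grid value is illustrative, not taken from a real image.

```py
import numpy as np

image_token = "<|IMAGE_PLACEHOLDER|>"
merge_size = 2
image_grid_thw = np.array([[1, 52, 94]])  # (t, h, w) as returned by the image processor

prompt = f"{image_token} OCR this page."
num_tokens = int(image_grid_thw[0].prod()) // merge_size // merge_size  # 1*52*94 // 4 = 1222

# Same two-step replacement as in __call__: a temporary placeholder avoids
# re-matching the freshly inserted image tokens while the loop runs.
expanded = prompt.replace(image_token, "<|placeholder|>" * num_tokens, 1)
expanded = expanded.replace("<|placeholder|>", image_token)
print(expanded.count(image_token))  # 1222
```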
processor_config.json ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_paddleocr_vl.PaddleOCRVLProcessor"
4
+ },
5
+ "processor_class": "PaddleOCRVLProcessor"
6
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,58 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|IMAGE_PLACEHOLDER|>",
4
+ "<|image_pad|>",
5
+ "<|IMAGE_START|>",
6
+ "<|IMAGE_END|>",
7
+ "<|video_pad|>"
8
+ ],
9
+ "bos_token": {
10
+ "content": "<s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "cls_token": {
17
+ "content": "<|begin_of_sentence|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "eos_token": {
24
+ "content": "</s>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "mask_token": {
31
+ "content": "<mask:1>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "pad_token": {
38
+ "content": "<unk>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "sep_token": {
45
+ "content": "<|end_of_sentence|>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ },
51
+ "unk_token": {
52
+ "content": "<unk>",
53
+ "lstrip": false,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false
57
+ }
58
+ }
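A small hedged check, assuming the repository is available locally at `./PaddleOCR-VL-0.9B`, that the placeholder declared in `additional_special_tokens` above is kept as a single token by the tokenizer:

```py
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./PaddleOCR-VL-0.9B", trust_remote_code=True)
image_token_id = tokenizer.convert_tokens_to_ids("<|IMAGE_PLACEHOLDER|>")
print(image_token_id)                      # a single id; the exact value depends on tokenizer.json
print(tokenizer.decode([image_token_id]))  # "<|IMAGE_PLACEHOLDER|>"
```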
tokenizer.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f90f04fd8e5eb6dfa380f37d10c87392de8438dccb6768a2486b5a96ee76dba6
3
+ size 11187679
tokenizer.model ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34ef7db83df785924fb83d7b887b6e822a031c56e15cff40aaf9b982988180df
3
+ size 1614363
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff