ChengCui and sayed99 committed
Commit 466ad13 · verified · 1 Parent(s): 882e3c3

docs: Readme Updated for optimized Usage with transformers library (#60)


- docs: Readme Updated for optimized Usage with transformers library (1787ca52c80733e53e8bc59b5b8b3aa9ee7f7018)
- update (4fe79f670d8ec01d412f85db24881a50e732378e)
- update (d7d1f3777c5f5dc95028e0e4bad350d88d214f7d)
- update (e9b397128dde328b68890f838538606f9ab55999)
- merge (f96dabf21b5f97deeb1636adfcfb0d5987de71bf)


Co-authored-by: Sayed Gamal <sayed99@users.noreply.huggingface.co>

Files changed (2):

1. README.md (+76, -5)
2. image_processing.py (+6, -0)
README.md CHANGED

````diff
@@ -73,6 +73,7 @@ PaddleOCR-VL: Boosting Multilingual Document Parsing via a 0.9B Ultra-Compact Vi
 
 ## News
 
+* ```2025.11.07``` 🚀 Enabled `flash-attn` in the `transformers` library to achieve faster inference with PaddleOCR-VL-0.9B.
 * ```2025.11.04``` 🌟 PaddleOCR-VL-0.9B is now officially supported on `vLLM`.
 * ```2025.10.29``` 🤗 The core module of PaddleOCR-VL, PaddleOCR-VL-0.9B, can now be called via the `transformers` library.
 * ```2025.10.16``` 🚀 We release [PaddleOCR-VL](https://github.com/PaddlePaddle/PaddleOCR), a multilingual document parsing solution built on a 0.9B ultra-compact Vision-Language Model with SOTA performance.
@@ -166,9 +167,14 @@ from PIL import Image
 import torch
 from transformers import AutoModelForCausalLM, AutoProcessor
 
+# ---- Settings ----
+model_path = "PaddlePaddle/PaddleOCR-VL"
+image_path = "test.png"
+task = "ocr"  # Options: 'ocr' | 'table' | 'chart' | 'formula'
+# ------------------
+
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
-CHOSEN_TASK = "ocr"  # Options: 'ocr' | 'table' | 'chart' | 'formula'
 PROMPTS = {
     "ocr": "OCR:",
     "table": "Table Recognition:",
@@ -176,8 +182,6 @@ PROMPTS = {
     "chart": "Chart Recognition:",
 }
 
-model_path = "PaddlePaddle/PaddleOCR-VL"
-image_path = "test.png"
 image = Image.open(image_path).convert("RGB")
 
 model = AutoModelForCausalLM.from_pretrained(
@@ -189,7 +193,7 @@ messages = [
     {"role": "user",
      "content": [
         {"type": "image", "image": image},
-        {"type": "text", "text": PROMPTS[CHOSEN_TASK]},
+        {"type": "text", "text": PROMPTS[task]},
     ]
     }
 ]
@@ -198,7 +202,7 @@ inputs = processor.apply_chat_template(
     tokenize=True,
     add_generation_prompt=True,
     return_dict=True,
-    return_tensors="pt"
+    return_tensors="pt"
 ).to(DEVICE)
 
 outputs = model.generate(**inputs, max_new_tokens=1024)
@@ -206,6 +210,73 @@ outputs = processor.batch_decode(outputs, skip_special_tokens=True)[0]
 print(outputs)
 ```
 
+<details>
+<summary>👉 Click to expand: Use flash-attn to boost performance and reduce memory usage</summary>
+
+```shell
+# Ensure flash-attn 2 is installed
+pip install flash-attn --no-build-isolation
+```
+
+```python
+import torch
+from transformers import AutoModelForCausalLM, AutoProcessor
+from PIL import Image
+
+# ---- Settings ----
+model_path = "PaddlePaddle/PaddleOCR-VL"
+image_path = "test.png"
+task = "ocr"  # ← change to "table" | "chart" | "formula"
+# ------------------
+
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+model = AutoModelForCausalLM.from_pretrained(
+    model_path,
+    trust_remote_code=True,
+    torch_dtype=torch.bfloat16,
+    attn_implementation="flash_attention_2",
+).to(DEVICE).eval()
+processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
+
+PROMPTS = {
+    "ocr": "OCR:",
+    "table": "Table Recognition:",
+    "chart": "Chart Recognition:",
+    "formula": "Formula Recognition:",
+}
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "image", "image": Image.open(image_path).convert("RGB")},
+            {"type": "text", "text": PROMPTS[task]},
+        ],
+    }
+]
+
+inputs = processor.apply_chat_template(
+    messages,
+    tokenize=True,
+    add_generation_prompt=True,
+    return_dict=True,
+    return_tensors="pt",
+).to(DEVICE)
+
+with torch.inference_mode():
+    out = model.generate(
+        **inputs,
+        max_new_tokens=1024,
+        do_sample=False,
+        use_cache=True,
+    )
+
+outputs = processor.batch_decode(out, skip_special_tokens=True)[0]
+print(outputs)
+```
+
+</details>
+
 ## Performance
 
 ### Page-Level Document Parsing
````
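
Note: the flash-attn snippet above hard-codes `attn_implementation="flash_attention_2"`, which fails at load time if `flash-attn` is missing or no CUDA device is present. Below is a minimal sketch of a guarded loader that falls back to PyTorch's built-in SDPA attention; the `importlib` probe and the `"sdpa"` fallback are illustrative choices, not part of this commit:

```python
import importlib.util

import torch
from transformers import AutoModelForCausalLM

model_path = "PaddlePaddle/PaddleOCR-VL"

# Use FlashAttention-2 only when the package is importable and a GPU exists;
# otherwise fall back to PyTorch's scaled-dot-product attention ("sdpa").
use_flash = importlib.util.find_spec("flash_attn") is not None and torch.cuda.is_available()
attn_impl = "flash_attention_2" if use_flash else "sdpa"

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation=attn_impl,
).eval()
print(f"attention backend: {attn_impl}")
```

In recent `transformers` versions, `model.config._attn_implementation` also reports which backend was actually selected, which is a quick way to confirm the flash path is active.
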
image_processing.py CHANGED

```diff
@@ -141,12 +141,18 @@ def smart_resize(
     3. The aspect ratio of the image is maintained as closely as possible.
 
     """
+    # if height < factor or width < factor:
+    #     raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor}")
+    # if int(height < factor//4) + int(width < factor//4):
+    #     raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor//4}")
 
     if height < factor:
+        print(f"smart_resize: height={height} < factor={factor}, reset height=factor")
         width = round((width * factor) / height)
         height = factor
 
     if width < factor:
+        print(f"smart_resize: width={width} < factor={factor}, reset width=factor")
         height = round((height * factor) / width)
         width = factor
 
```
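
For reference, the new branches keep tiny inputs usable: a side smaller than `factor` is bumped up to `factor` and the other side is rescaled to roughly preserve the aspect ratio, instead of tripping the old, now commented-out `ValueError`. A standalone sketch of just that logic follows; `factor=28` is an assumed patch-grid multiple (the real `smart_resize` also rounds both sides to multiples of `factor` and clamps total pixels):

```python
def smart_resize_min_side(height: int, width: int, factor: int = 28) -> tuple[int, int]:
    """Sketch of the small-image handling added in this commit: a side below
    `factor` is raised to `factor`, scaling the other side proportionally.
    `factor=28` is an assumption for illustration, not read from the repo."""
    if height < factor:
        print(f"smart_resize: height={height} < factor={factor}, reset height=factor")
        width = round((width * factor) / height)
        height = factor
    if width < factor:
        print(f"smart_resize: width={width} < factor={factor}, reset width=factor")
        height = round((height * factor) / width)
        width = factor
    return height, width


# A 12x300 strip (e.g. a thin text line) is upscaled to 28x700 instead of erroring.
print(smart_resize_min_side(12, 300))  # -> (28, 700)
```
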