docs: Readme Updated for optimized Usage with transformers library (#60)

Browse files

- docs: Readme Updated for optimized Usage with transformers library (1787ca52c80733e53e8bc59b5b8b3aa9ee7f7018)
- update (4fe79f670d8ec01d412f85db24881a50e732378e)
- update (d7d1f3777c5f5dc95028e0e4bad350d88d214f7d)
- update (e9b397128dde328b68890f838538606f9ab55999)
- merge (f96dabf21b5f97deeb1636adfcfb0d5987de71bf)

Co-authored-by: Sayed Gamal <sayed99@users.noreply.huggingface.co>

Files changed (2) hide show

README.md +76 -5
image_processing.py +6 -0

README.md CHANGED Viewed

@@ -73,6 +73,7 @@ PaddleOCR-VL: Boosting Multilingual Document Parsing via a 0.9B Ultra-Compact Vi
 ## News
 * ```2025.11.04``` 🌟 PaddleOCR-VL-0.9B is now officially supported on `vLLM`.
 * ```2025.10.29``` 🤗 Supports calling the core module PaddleOCR-VL-0.9B of PaddleOCR-VL via the `transformers` library.
 * ```2025.10.16``` 🚀 We release [PaddleOCR-VL](https://github.com/PaddlePaddle/PaddleOCR) — multilingual document parsing via a 0.9B Ultra-Compact Vision-Language Model with SOTA performance.
@@ -166,9 +167,14 @@ from PIL import Image
 import torch
 from transformers import AutoModelForCausalLM, AutoProcessor
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-CHOSEN_TASK = "ocr"  # Options: 'ocr' | 'table' | 'chart' | 'formula'
 PROMPTS = {
     "ocr": "OCR:",
     "table": "Table Recognition:",
@@ -176,8 +182,6 @@ PROMPTS = {
     "chart": "Chart Recognition:",
 }
-model_path = "PaddlePaddle/PaddleOCR-VL"
-image_path = "test.png"
 image = Image.open(image_path).convert("RGB")
 model = AutoModelForCausalLM.from_pretrained(
@@ -189,7 +193,7 @@ messages = [
     {"role": "user",
      "content": [
             {"type": "image", "image": image},
-            {"type": "text", "text": PROMPTS[CHOSEN_TASK]},
         ]
     }
 ]
@@ -198,7 +202,7 @@ inputs = processor.apply_chat_template(
     tokenize=True,
     add_generation_prompt=True,
     return_dict=True,
-	return_tensors="pt"
 ).to(DEVICE)
 outputs = model.generate(**inputs, max_new_tokens=1024)
@@ -206,6 +210,73 @@ outputs = processor.batch_decode(outputs, skip_special_tokens=True)[0]
 print(outputs)
 ```
 ## Performance
 ### Page-Level Document Parsing

 ## News
+* ```2025.11.07``` 🚀 Enabled `flash-attn` in the `transformers` library to achieve faster inference with PaddleOCR-VL-0.9B.
 * ```2025.11.04``` 🌟 PaddleOCR-VL-0.9B is now officially supported on `vLLM`.
 * ```2025.10.29``` 🤗 Supports calling the core module PaddleOCR-VL-0.9B of PaddleOCR-VL via the `transformers` library.
 * ```2025.10.16``` 🚀 We release [PaddleOCR-VL](https://github.com/PaddlePaddle/PaddleOCR) — multilingual document parsing via a 0.9B Ultra-Compact Vision-Language Model with SOTA performance.
 import torch
 from transformers import AutoModelForCausalLM, AutoProcessor
+# ---- Settings ----
+model_path = "PaddlePaddle/PaddleOCR-VL"
+image_path = "test.png"
+task = "ocr" # Options: 'ocr' | 'table' | 'chart' | 'formula'
+# ------------------
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 PROMPTS = {
     "ocr": "OCR:",
     "table": "Table Recognition:",
     "chart": "Chart Recognition:",
 }
 image = Image.open(image_path).convert("RGB")
 model = AutoModelForCausalLM.from_pretrained(
     {"role": "user",
      "content": [
             {"type": "image", "image": image},
+            {"type": "text", "text": PROMPTS[task]},
         ]
     }
 ]
     tokenize=True,
     add_generation_prompt=True,
     return_dict=True,
+    return_tensors="pt"
 ).to(DEVICE)
 outputs = model.generate(**inputs, max_new_tokens=1024)
 print(outputs)
 ```
+<details>
+<summary>👉 Click to expand: Use flash-attn to boost performance and reduce memory usage</summary>
+```shell
+# Ensure FlashAttention-2 (the `flash-attn` package) is installed
+pip install flash-attn --no-build-isolation
+```
+```python
+import torch
+from transformers import AutoModelForCausalLM, AutoProcessor
+from PIL import Image
+# ---- Settings ----
+model_path = "PaddlePaddle/PaddleOCR-VL"
+image_path = "test.png"
+task = "ocr" # ← change to "table" | "chart" | "formula"
+# ------------------
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+model = AutoModelForCausalLM.from_pretrained(
+    model_path,
+    trust_remote_code=True,
+    torch_dtype=torch.bfloat16,
+    attn_implementation="flash_attention_2",
+).to(dtype=torch.bfloat16, device=DEVICE).eval()
+processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
+PROMPTS = {
+    "ocr": "OCR:",
+    "table": "Table Recognition:",
+    "chart": "Chart Recognition:",
+    "formula": "Formula Recognition:",
+}
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "image", "image": Image.open(image_path).convert("RGB")},
+            {"type": "text",  "text": PROMPTS[task]}
+        ]
+    }
+]
+inputs = processor.apply_chat_template(
+    messages,
+    tokenize=True,
+    add_generation_prompt=True,
+    return_dict=True,
+    return_tensors="pt"
+).to(DEVICE)
+with torch.inference_mode():
+    out = model.generate(
+        **inputs,
+        max_new_tokens=1024,
+        do_sample=False,
+        use_cache=True
+    )
+outputs = processor.batch_decode(out, skip_special_tokens=True)[0]
+print(outputs)
+```
+</details>
 ## Performance
 ### Page-Level Document Parsing

image_processing.py CHANGED Viewed

@@ -141,12 +141,18 @@ def smart_resize(
     3. The aspect ratio of the image is maintained as closely as possible.
     """
     if height < factor:
         width = round((width * factor) / height)
         height = factor
     if width < factor:
         height = round((height * factor) / width)
         width = factor

     3. The aspect ratio of the image is maintained as closely as possible.
     """
+    # if height < factor or width < factor:
+    #    raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor}")
+    # if int(height < factor//4) + int(width < factor//4):
+    #     raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor//4}")
     if height < factor:
+        print(f"smart_resize: height={height} < factor={factor}, reset height=factor")
         width = round((width * factor) / height)
         height = factor
     if width < factor:
+        print(f"smart_resize: width={width} < factor={factor}, reset width=factor")
         height = round((height * factor) / width)
         width = factor