jzhang533 committed
Commit f073f41 · 1 Parent(s): 1d14db4

Signed-off-by: Zhang Jun <jzhang533@gmail.com>

Files changed (2)
  1. app.py +60 -77
  2. requirements.txt +1 -1
app.py CHANGED
@@ -17,23 +17,23 @@ processor = None
 
 def load_model():
     global model, processor
-    try:
-        print("Loading model...")
-        model = AutoModelForCausalLM.from_pretrained(
-            MODEL_PATH,
-            trust_remote_code=True,
-            torch_dtype=torch.bfloat16,
-            device_map="auto",
-        ).eval()
-
-        processor = AutoProcessor.from_pretrained(
-            MODEL_PATH, trust_remote_code=True, use_fast=True
-        )
+    print("Loading model...")
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_PATH,
+        trust_remote_code=True,
+        torch_dtype=torch.bfloat16,
+        device_map="auto",
+    ).eval()
+
+    processor = AutoProcessor.from_pretrained(
+        MODEL_PATH, trust_remote_code=True, use_fast=True
+    )
+
+    # Set pad_token_id to avoid warning during generation
+    if model.generation_config.pad_token_id is None:
+        model.generation_config.pad_token_id = processor.tokenizer.eos_token_id
 
-        print("Model loaded successfully!")
-    except Exception as e:
-        print(f"Error loading model: {e}")
-        raise e
+    print("Model loaded successfully!")
 
 
 # Load model on startup
@@ -53,70 +53,53 @@ def perform_ocr(image):
     if image is None:
         return "Please upload an image first."
 
-    try:
-        # Ensure model is on GPU
-        if model.device.type == "cpu" and torch.cuda.is_available():
-            print("Moving model to GPU...")
-            model.to("cuda")
-
-        # Convert to PIL Image if needed
-        if not isinstance(image, Image.Image):
-            image = Image.fromarray(image)
-
-        # Ensure RGB format
-        image = image.convert("RGB")
-
-        # Prepare the prompt
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "image", "image": image},
-                    {"type": "text", "text": "OCR:"},
-                ],
-            }
-        ]
-
-        # Process inputs
-        text = processor.apply_chat_template(
-            messages, tokenize=False, add_generation_prompt=True
+    # Ensure model is on GPU
+    if model.device.type == "cpu" and torch.cuda.is_available():
+        print("Moving model to GPU...")
+        model.to("cuda")
+
+    # Convert to PIL Image if needed
+    if not isinstance(image, Image.Image):
+        image = Image.fromarray(image)
+
+    # Ensure RGB format
+    image = image.convert("RGB")
+
+    # Prepare the prompt
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "image": image},
+                {"type": "text", "text": "OCR:"},
+            ],
+        }
+    ]
+
+    # Process inputs
+    text = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+    inputs = processor(text=[text], images=[image], return_tensors="pt")
+    inputs = {
+        k: (v.to(model.device) if isinstance(v, torch.Tensor) else v)
+        for k, v in inputs.items()
+    }
+
+    # Generate text
+    with torch.inference_mode():
+        generated = model.generate(
+            **inputs,
+            max_new_tokens=2048,
+            do_sample=False,
+            use_cache=True,
         )
-        inputs = processor(text=[text], images=[image], return_tensors="pt")
-
-        # Generate text
-        with torch.inference_mode():
-            device = next(model.parameters()).device
-            inputs = inputs.to(device)
-
-            # Extract input_ids and other tensors to avoid keyword argument issues
-            input_ids_tensor = inputs.input_ids if hasattr(inputs, 'input_ids') else inputs.get('input_ids')
-            pixel_values = inputs.pixel_values if hasattr(inputs, 'pixel_values') else inputs.get('pixel_values')
-            attention_mask = inputs.attention_mask if hasattr(inputs, 'attention_mask') else inputs.get('attention_mask')
-
-            generated_ids = model.generate(
-                input_ids=input_ids_tensor,
-                pixel_values=pixel_values,
-                attention_mask=attention_mask,
-                max_new_tokens=2048,
-                do_sample=False,
-                use_cache=True,
-            )
-
-        if "input_ids" in inputs:
-            input_ids = inputs.input_ids
-        else:
-            print("inputs: # fallback", inputs)
-            input_ids = inputs.inputs
-
-        generated_ids_trimmed = [
-            out_ids[len(in_ids):] for in_ids, out_ids in zip(input_ids, generated_ids)
-        ]
-        answer = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
 
-        return answer
+    input_length = inputs["input_ids"].shape[1]
+    generated_tokens = generated[:, input_length:]
+    answer = processor.batch_decode(generated_tokens, skip_special_tokens=True)[0]
 
-    except Exception as e:
-        return f"Error during OCR: {e!s}"
+    return answer
 
 
 # Create Gradio interface
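
For reference, a minimal standalone sketch of the inference path this commit moves to, run outside the Gradio UI. The MODEL_PATH value and the image file below are placeholders, not taken from this repo; the real constants live at the top of app.py.

    # Standalone sketch (not part of the commit): same load + generate + trim
    # flow as the new app.py.  MODEL_PATH and "sample.png" are placeholders.
    import torch
    from PIL import Image
    from transformers import AutoModelForCausalLM, AutoProcessor

    MODEL_PATH = "org/ocr-vlm-checkpoint"  # placeholder; see app.py for the real value

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH, trust_remote_code=True, torch_dtype=torch.bfloat16, device_map="auto"
    ).eval()
    processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True, use_fast=True)

    image = Image.open("sample.png").convert("RGB")  # placeholder input image
    messages = [{"role": "user", "content": [
        {"type": "image", "image": image},
        {"type": "text", "text": "OCR:"},
    ]}]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(text=[text], images=[image], return_tensors="pt")
    # Move every tensor to the model's device; non-tensor entries pass through unchanged.
    inputs = {k: (v.to(model.device) if isinstance(v, torch.Tensor) else v) for k, v in inputs.items()}

    with torch.inference_mode():
        generated = model.generate(**inputs, max_new_tokens=2048, do_sample=False)

    # generate() returns prompt + completion, so slice off the prompt length before decoding.
    new_tokens = generated[:, inputs["input_ids"].shape[1]:]
    print(processor.batch_decode(new_tokens, skip_special_tokens=True)[0])

Passing **inputs straight to generate() forwards whatever keys the processor emits without enumerating them by hand, which is what the removed hasattr juggling was approximating.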
requirements.txt CHANGED
@@ -1,5 +1,5 @@
 torch>=2.0.0
-transformers
+transformers==4.57.1
 accelerate
 pillow>=10.0.0
 einops
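
Since the Space now depends on an exact transformers release, a small startup guard (a sketch, not present in the repo) can make environment drift fail loudly; "4.57.1" here simply mirrors the pin above.

    # Optional sanity check (sketch): fail fast if the installed transformers
    # does not match the version pinned in requirements.txt.
    import transformers

    EXPECTED = "4.57.1"  # keep in sync with requirements.txt
    if transformers.__version__ != EXPECTED:
        raise RuntimeError(
            f"Expected transformers=={EXPECTED}, found {transformers.__version__}"
        )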