Spaces:

iammraat
/

document

Running

App Files Files Community

iammraat committed 11 days ago

Commit

e55fda2

verified ·

1 Parent(s): 593f815

Update app.py

Browse files

Files changed (1) hide show

app.py +85 -45

app.py CHANGED Viewed

@@ -1,65 +1,105 @@
 import gradio as gr
 import cv2
 import numpy as np
-from paddleocr import PPStructureV3 # Explicitly import the class that exists
-# --- INITIALIZATION ---
-# We do NOT pass a custom model path. We let PPStructureV3 download its own default model.
-# This avoids the "ValueError: Unknown argument" crashes.
-# The engine is built once at module level; analyze_layout() reuses this single instance on every call.
-layout_engine = PPStructureV3(
-    use_doc_orientation_classify=True, # Standard V3 argument for orientation
-    enable_mkldnn=False                # CRITICAL: Keeps CPU from crashing
-)
 def analyze_layout(input_image):
-    # Runs layout_engine on the uploaded image and returns a pair:
-    # (image with boxes drawn on it, newline-joined text log).
-    # A missing image, an inference error, or an empty result is reported as a
-    # message in the second slot rather than raised.
     if input_image is None:
         return None, "No image uploaded"
     image_np = np.array(input_image)
-    # Run Inference
-    try:
-        # V3 returns a generator, so we convert to list immediately
-        results = list(layout_engine(image_np))
-    except Exception as e:
-        # Hand back the untouched input together with the error text.
-        return image_np, f"Error running layout analysis: {e}"
     viz_image = image_np.copy()
-    detections_text = []
-    if not results:
-        return viz_image, "No layout detected."
-    # --- VISUALIZATION ---
-    for region in results:
-        # Only dict-shaped results are drawn; anything else is skipped.
-        if not isinstance(region, dict): continue
-        # V3 usually puts the box in 'layout_bbox' or 'bbox'
-        box = region.get('layout_bbox') or region.get('bbox')
-        label = region.get('label', 'unknown')
-        if box is None: continue
-        try:
-            x1, y1, x2, y2 = int(box[0]), int(box[1]), int(box[2]), int(box[3])
-            # Color coding
-            # NOTE(review): the tuples are written in image_np's own channel order.
-            # image_np comes straight from np.array(input_image) with no BGR swap,
-            # so it is presumably RGB -- confirm before trusting the color names.
-            color = (0, 255, 0) # Green in either channel order (Default)
-            if label == 'title': color = (0, 0, 255)    # Red in BGR; blue if image_np is RGB
-            elif label == 'figure': color = (255, 0, 0) # Blue in BGR; red if image_np is RGB
-            elif label == 'table': color = (255, 255, 0)# Cyan in BGR; yellow if image_np is RGB
-            cv2.rectangle(viz_image, (x1, y1), (x2, y2), color, 3)
-            cv2.putText(viz_image, str(label), (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.8, color, 2)
-            detections_text.append(f"Found {label} at {box}")
-        except Exception:
-            # Best-effort: a region that fails to unpack or draw is dropped
-            # silently and never appears in the log.
-            pass
-    return viz_image, "\n".join(detections_text)
-with gr.Blocks(title="PP-DocLayoutV3 Explorer") as demo:
-    gr.Markdown("## 📄 PP-DocLayoutV3 Explorer")
-    gr.Markdown("Auto-downloading the latest V3 weights for structure analysis.")
     with gr.Row():
         with gr.Column():
@@ -68,7 +108,7 @@ with gr.Blocks(title="PP-DocLayoutV3 Explorer") as demo:
         with gr.Column():
             output_img = gr.Image(label="Layout Visualization")
-            output_log = gr.Textbox(label="Detected Regions", lines=10)
     submit_btn.click(fn=analyze_layout, inputs=input_img, outputs=[output_img, output_log])

 import gradio as gr
 import cv2
 import numpy as np
+import onnxruntime as ort
+from huggingface_hub import hf_hub_download
+# --- STEP 1: Download the ONNX Model ---
+# Runs at import time, so the download happens before the UI is built.
+print("Downloading ONNX model...")
+model_path = hf_hub_download(repo_id="alex-dinh/PP-DocLayoutV3-ONNX", filename="model.onnx")
+print(f"Model downloaded to: {model_path}")
+# --- STEP 2: Initialize ONNX Engine ---
+# This loads the model directly through ONNX Runtime without needing Paddle
+session = ort.InferenceSession(model_path)
+# Input/output names are captured in the order the session reports them.
+# analyze_layout() indexes input_names positionally (0 = image, 1 = scale factor).
+# NOTE(review): that ordering is assumed, not checked -- confirm against the model.
+input_names = [i.name for i in session.get_inputs()]
+output_names = [o.name for o in session.get_outputs()]
+# Class-id -> display-name map used when drawing detections.
+# NOTE(review): described as the standard PP-DocLayout map, but it is not verified
+# here against this model's label list; ids missing from it show up as "Unknown".
+LABELS = {1: "Text", 2: "Title", 3: "List", 4: "Table", 5: "Figure"}
+def preprocess_image(image, target_size=(800, 800)):
+    """
+    Prepares the image exactly how the AI expects it (Resize -> Normalize).
+    """
+    h, w = image.shape[:2]
+    # 1. Resize
+    # We do NOT keep aspect ratio for the input blob, but we keep scales to fix boxes later
+    img_resized = cv2.resize(image, target_size)
+    # 2. Normalize (Standard ImageNet mean/std)
+    img_data = img_resized.astype(np.float32) / 255.0
+    mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
+    std = np.array([0.229, 0.224, 0.225], dtype=np.float32)
+    img_data = (img_data - mean) / std
+    # 3. Transpose to (Batch, Channel, Height, Width)
+    img_data = img_data.transpose(2, 0, 1)[None, :, :, :]
+    # Calculate scale factors to map detections back to original image
+    scale_factor = np.array([target_size[0] / h, target_size[1] / w], dtype=np.float32).reshape(1, 2)
+    return img_data, scale_factor
 def analyze_layout(input_image):
     if input_image is None:
         return None, "No image uploaded"
+    # Convert PIL to Numpy/OpenCV
     image_np = np.array(input_image)
+    orig_h, orig_w = image_np.shape[:2]
+    # --- INFERENCE ---
+    input_blob, scale_factor = preprocess_image(image_np)
+    # ONNX Runtime inputs
+    inputs = {
+        input_names[0]: input_blob,           # The image data
+        input_names[1]: scale_factor          # The resize scale
+    }
+    # Run!
+    outputs = session.run(output_names, inputs)
+    # --- POST-PROCESSING ---
+    # Output format is typically [Batch, N, 6] -> [Class, Score, X1, Y1, X2, Y2]
+    detections = outputs[0]
     viz_image = image_np.copy()
+    log = []
+    for det in detections:
+        class_id = int(det[0])
+        score = det[1]
+        bbox = det[2:]
+        if score < 0.5: continue # Filter weak detections
+        # Map labels
+        label_name = LABELS.get(class_id, "Unknown")
+        # Coordinates
+        x1, y1, x2, y2 = map(int, bbox)
+        # Color coding
+        color = (0, 255, 0) # Green
+        if label_name == "Title": color = (0, 0, 255)
+        elif label_name == "Table": color = (255, 255, 0)
+        elif label_name == "Figure": color = (255, 0, 0)
+        # Draw
+        cv2.rectangle(viz_image, (x1, y1), (x2, y2), color, 3)
+        cv2.putText(viz_image, f"{label_name} {score:.2f}", (x1, y1-10),
+                   cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2)
+        log.append(f"Found {label_name} at [{x1}, {y1}, {x2}, {y2}] (Conf: {score:.2f})")
+    return viz_image, "\n".join(log)
+with gr.Blocks(title="ONNX Layout Analysis") as demo:
+    gr.Markdown("## ⚡ Fast V3 Layout Analysis (ONNX)")
+    gr.Markdown("Uses **PP-DocLayoutV3** via ONNX Runtime. No Paddle dependencies.")
     with gr.Row():
         with gr.Column():
         with gr.Column():
             output_img = gr.Image(label="Layout Visualization")
+            output_log = gr.Textbox(label="Detections", lines=10)
     submit_btn.click(fn=analyze_layout, inputs=input_img, outputs=[output_img, output_log])