"""Batch inference utilities for PP-DocLayoutV3 via ONNX Runtime."""
| import numpy as np |
| import cv2 |
| import onnxruntime as ort |
| from pathlib import Path |
|
|
def preprocess_image_doclayout(image, target_input_size=(800, 800)):
    """
    Prepare one BGR image for DocLayoutV3 (default 800x800 input).

    Resizes to the target size, converts BGR -> RGB, scales pixel values
    to [0, 1], applies ImageNet mean/std normalization, and reorders the
    tensor to channels-first.

    Returns
    -------
    tuple
        ``(blob, scale_h, scale_w)`` where ``blob`` is a (3, H, W)
        float32 tensor (no batch dimension) and the scale factors map
        original pixel coordinates to resized ones.
    """
    src_h, src_w = image.shape[:2]
    dst_h, dst_w = target_input_size
    sh = dst_h / src_h
    sw = dst_w / src_w

    rgb = cv2.cvtColor(
        cv2.resize(image, (dst_w, dst_h), interpolation=cv2.INTER_LINEAR),
        cv2.COLOR_BGR2RGB,
    )

    # ImageNet statistics, broadcast over the trailing channel axis.
    imagenet_mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
    imagenet_std = np.array([0.229, 0.224, 0.225], dtype=np.float32)
    normalized = (rgb.astype(np.float32) / 255.0 - imagenet_mean) / imagenet_std

    # HWC -> CHW for the ONNX model.
    return normalized.transpose(2, 0, 1), sh, sw
|
|
|
|
def preprocess_batch(image_paths, target_input_size=(800, 800)):
    """
    Load and preprocess a list of image paths.

    Returns
    -------
    input_blob : (N, 3, H, W) float32
    shape_list : (N, 2) float32 [[H, W], ...] — the resized input shape
    scale_list : (N, 2) float32 [[scale_h, scale_w], ...]
    images     : list of original BGR images (for debug / visualisation)

    Raises
    ------
    FileNotFoundError
        If any path cannot be read by OpenCV.
    """
    blobs = []
    scales = []
    images = []

    for path in image_paths:
        bgr = cv2.imread(str(path))
        if bgr is None:
            raise FileNotFoundError(f"Could not read image: {path}")

        tensor, sh, sw = preprocess_image_doclayout(bgr, target_input_size)
        blobs.append(tensor)
        scales.append((sh, sw))
        images.append(bgr)

    input_blob = np.stack(blobs, axis=0).astype(np.float32)
    # Every image was resized to the same target, so the shape rows repeat.
    shape_arr = np.array([target_input_size] * len(blobs), dtype=np.float32)
    scale_arr = np.array(scales, dtype=np.float32)

    return input_blob, shape_arr, scale_arr, images
|
|
|
|
def run_doclayout_onnx_batch(image_paths, model_path, conf_thresh=0.5):
    """
    Run DocLayoutV3 on a batch of images.

    The model expects three named inputs:
        im_shape     : (N, 2) float32 — resized [H, W] per image
        image        : (N, 3, H, W) float32 — normalized image tensor
        scale_factor : (N, 2) float32 — [scale_h, scale_w] per image

    The first output is a flat (total_dets, 7 or 8) detection array;
    see ``postprocess_batch`` for the column layouts handled.

    Parameters
    ----------
    image_paths : list of str or Path
    model_path : str — path to the .onnx export
    conf_thresh : float — score threshold applied in post-processing

    Returns
    -------
    list of per-image detection arrays (see ``postprocess_batch``).

    Raises
    ------
    ValueError
        If the model's declared input names do not match the feed.
    """
    model = ort.InferenceSession(model_path)
    output_names = [o.name for o in model.get_outputs()]

    input_blob, shape_arr, scale_arr, _images = preprocess_batch(image_paths)

    input_feed = {
        "im_shape": shape_arr,
        "image": input_blob,
        "scale_factor": scale_arr,
    }

    # Sanity-check the hardcoded feed against the model's declared inputs
    # so a name mismatch fails with a clear message instead of a cryptic
    # ONNX Runtime error. (Previously the declared names were fetched but
    # never used.)
    declared = {i.name for i in model.get_inputs()}
    if declared != set(input_feed):
        raise ValueError(
            f"Model input names {sorted(declared)} do not match "
            f"expected {sorted(input_feed)}"
        )

    raw_output = model.run(output_names, input_feed)[0]

    return postprocess_batch(raw_output, len(image_paths), conf_thresh)
|
|
|
|
def postprocess_batch(raw_output, n_images, conf_thresh=0.5):
    """
    Split the model's flat detection output back into per-image results.

    PP-DocLayout ONNX output columns:
        [img_idx, label, score, x0, y0, x1, y1, read_order]  (8 cols)
    or
        [label, score, x0, y0, x1, y1, read_order]           (7 cols — single-image compat)

    Both layouts are handled automatically.

    Parameters
    ----------
    raw_output : array-like, shape (total_dets, 7 or 8)
    n_images : int — number of images in the batch
    conf_thresh : float — keep detections with score strictly above this

    Returns
    -------
    list of np.ndarray
        One (k, 7) array per image with columns
        [label, score, x0, y0, x1, y1, read_order], sorted by read_order.

    Raises
    ------
    ValueError
        If a 7-column output cannot be split evenly across ``n_images``.
    """
    raw_output = np.asarray(raw_output)

    # No detections at all (empty or degenerate output): one empty
    # (0, 7) array per image instead of crashing on shape[1].
    if raw_output.ndim != 2 or raw_output.size == 0:
        return [np.zeros((0, 7), dtype=np.float32) for _ in range(n_images)]

    n_cols = raw_output.shape[1]

    if n_cols == 8:
        # First column carries the originating image index.
        img_idx_col = raw_output[:, 0].astype(int)
        detections = raw_output[:, 1:]
    else:
        # No image-index column: rows are assumed evenly split per image.
        # Guard the split so a mismatch raises a clear error rather than a
        # boolean-mask length IndexError below.
        if len(raw_output) % n_images != 0:
            raise ValueError(
                f"Cannot split {len(raw_output)} detections evenly across "
                f"{n_images} images without an image-index column."
            )
        dets_per_image = len(raw_output) // n_images
        img_idx_col = np.repeat(np.arange(n_images), dets_per_image)
        detections = raw_output

    results = []
    for i in range(n_images):
        boxes = detections[img_idx_col == i]
        boxes = boxes[boxes[:, 1] > conf_thresh]  # column 1 = score
        boxes = boxes[np.argsort(boxes[:, 6])]    # column 6 = read_order
        results.append(boxes)

    return results
|
|
|
|
def print_doclayout_res(boxes, image_label=""):
    """Pretty-print one image's detections as a tab-separated table."""
    title = f"--- {image_label} ---" if image_label else "--- Results ---"
    print(title)
    print("cls_id\tscore\txmin\tymin\txmax\tymax\tread_order")
    row_fmt = "{:.0f}\t\t{:.3f}\t{:.2f}\t{:.2f}\t{:.2f}\t{:.2f}\t{:.0f}"
    for det in boxes:
        print(row_fmt.format(det[0], det[1], det[2], det[3], det[4], det[5], det[6]))
|
|
|
|
if __name__ == '__main__':
    # Point these at a real model export and test images before running.
    MODEL_PATH = "your/path/to/PP-DocLayoutV3.onnx"

    image_paths = [
        "your_test_image_1.png",
        "your_test_image_2.png",
        "your_test_image_3.png",
        "your_test_image_4.png",
    ]

    all_results = run_doclayout_onnx_batch(image_paths, MODEL_PATH, conf_thresh=0.5)

    # One table per image, separated by a blank line.
    for img_path, dets in zip(image_paths, all_results):
        print_doclayout_res(dets, image_label=Path(img_path).name)
        print()
|
|