"""Batch inference utilities for PP-DocLayoutV3 via ONNX Runtime."""
| import numpy as np |
| import cv2 |
| import onnxruntime as ort |
| from pathlib import Path |
|
|
def preprocess_image_doclayout(image, target_input_size=(800, 800)):
    """
    Prepare one BGR image for DocLayoutV3 (default 800x800 input).

    Resizes to the target size, converts BGR -> RGB, scales pixel values
    to [0, 1], applies ImageNet mean/std normalization, and reorders the
    tensor to channels-first.

    Returns
    -------
    tuple
        ``(blob, scale_h, scale_w)`` where ``blob`` is a (3, H, W)
        float32 tensor (no batch dimension) and the scale factors map
        original pixel coordinates to resized ones.
    """
    src_h, src_w = image.shape[:2]
    dst_h, dst_w = target_input_size
    sh = dst_h / src_h
    sw = dst_w / src_w

    rgb = cv2.cvtColor(
        cv2.resize(image, (dst_w, dst_h), interpolation=cv2.INTER_LINEAR),
        cv2.COLOR_BGR2RGB,
    )

    # ImageNet statistics, broadcast over the trailing channel axis.
    imagenet_mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
    imagenet_std = np.array([0.229, 0.224, 0.225], dtype=np.float32)
    normalized = (rgb.astype(np.float32) / 255.0 - imagenet_mean) / imagenet_std

    # HWC -> CHW for the ONNX model.
    return normalized.transpose(2, 0, 1), sh, sw
|
|
|
|
def preprocess_batch(image_paths, target_input_size=(800, 800)):
    """
    Load and preprocess a list of image paths.

    Returns
    -------
    input_blob : (N, 3, H, W) float32
    shape_list : (N, 2) float32 [[H, W], ...] — the resized input shape
    scale_list : (N, 2) float32 [[scale_h, scale_w], ...]
    images     : list of original BGR images (for debug / visualisation)

    Raises
    ------
    FileNotFoundError
        If any path cannot be read by OpenCV.
    """
    blobs = []
    scales = []
    images = []

    for path in image_paths:
        bgr = cv2.imread(str(path))
        if bgr is None:
            raise FileNotFoundError(f"Could not read image: {path}")

        tensor, sh, sw = preprocess_image_doclayout(bgr, target_input_size)
        blobs.append(tensor)
        scales.append((sh, sw))
        images.append(bgr)

    input_blob = np.stack(blobs, axis=0).astype(np.float32)
    # Every image was resized to the same target, so the shape rows repeat.
    shape_arr = np.array([target_input_size] * len(blobs), dtype=np.float32)
    scale_arr = np.array(scales, dtype=np.float32)

    return input_blob, shape_arr, scale_arr, images
|
|
|
|
def run_doclayout_onnx_batch(image_paths, model_path, conf_thresh=0.5):
    """
    Run DocLayoutV3 on a batch of images.

    The model expects three named inputs:
        im_shape     : (N, 2) float32 — resized [H, W] per image
        image        : (N, 3, H, W) float32 — normalized image tensor
        scale_factor : (N, 2) float32 — [scale_h, scale_w] per image

    The first output is a flat (total_dets, 7 or 8) detection array;
    see ``postprocess_batch`` for the column layouts handled.

    Parameters
    ----------
    image_paths : list of str or Path
    model_path : str — path to the .onnx export
    conf_thresh : float — score threshold applied in post-processing

    Returns
    -------
    list of per-image detection arrays (see ``postprocess_batch``).

    Raises
    ------
    ValueError
        If the model's declared input names do not match the feed.
    """
    model = ort.InferenceSession(model_path)
    output_names = [o.name for o in model.get_outputs()]

    input_blob, shape_arr, scale_arr, _images = preprocess_batch(image_paths)

    input_feed = {
        "im_shape": shape_arr,
        "image": input_blob,
        "scale_factor": scale_arr,
    }

    # Sanity-check the hardcoded feed against the model's declared inputs
    # so a name mismatch fails with a clear message instead of a cryptic
    # ONNX Runtime error. (Previously the declared names were fetched but
    # never used.)
    declared = {i.name for i in model.get_inputs()}
    if declared != set(input_feed):
        raise ValueError(
            f"Model input names {sorted(declared)} do not match "
            f"expected {sorted(input_feed)}"
        )

    raw_output = model.run(output_names, input_feed)[0]

    return postprocess_batch(raw_output, len(image_paths), conf_thresh)
|
|
|
|
def postprocess_batch(raw_output, n_images, conf_thresh=0.5):
    """
    Split the model's flat detection output back into per-image results.

    PP-DocLayout ONNX output columns:
        [img_idx, label, score, x0, y0, x1, y1, read_order]  (8 cols)
    or
        [label, score, x0, y0, x1, y1, read_order]           (7 cols — single-image compat)

    Both layouts are handled automatically.

    Parameters
    ----------
    raw_output : array-like, shape (total_dets, 7 or 8)
    n_images : int — number of images in the batch
    conf_thresh : float — keep detections with score strictly above this

    Returns
    -------
    list of np.ndarray
        One (k, 7) array per image with columns
        [label, score, x0, y0, x1, y1, read_order], sorted by read_order.

    Raises
    ------
    ValueError
        If a 7-column output cannot be split evenly across ``n_images``.
    """
    raw_output = np.asarray(raw_output)

    # No detections at all (empty or degenerate output): one empty
    # (0, 7) array per image instead of crashing on shape[1].
    if raw_output.ndim != 2 or raw_output.size == 0:
        return [np.zeros((0, 7), dtype=np.float32) for _ in range(n_images)]

    n_cols = raw_output.shape[1]

    if n_cols == 8:
        # First column carries the originating image index.
        img_idx_col = raw_output[:, 0].astype(int)
        detections = raw_output[:, 1:]
    else:
        # No image-index column: rows are assumed evenly split per image.
        # Guard the split so a mismatch raises a clear error rather than a
        # boolean-mask length IndexError below.
        if len(raw_output) % n_images != 0:
            raise ValueError(
                f"Cannot split {len(raw_output)} detections evenly across "
                f"{n_images} images without an image-index column."
            )
        dets_per_image = len(raw_output) // n_images
        img_idx_col = np.repeat(np.arange(n_images), dets_per_image)
        detections = raw_output

    results = []
    for i in range(n_images):
        boxes = detections[img_idx_col == i]
        boxes = boxes[boxes[:, 1] > conf_thresh]  # column 1 = score
        boxes = boxes[np.argsort(boxes[:, 6])]    # column 6 = read_order
        results.append(boxes)

    return results
|
|
|
|
def print_doclayout_res(boxes, image_label=""):
    """Pretty-print one image's detections as a tab-separated table."""
    title = f"--- {image_label} ---" if image_label else "--- Results ---"
    print(title)
    print("cls_id\tscore\txmin\tymin\txmax\tymax\tread_order")
    row_fmt = "{:.0f}\t\t{:.3f}\t{:.2f}\t{:.2f}\t{:.2f}\t{:.2f}\t{:.0f}"
    for det in boxes:
        print(row_fmt.format(det[0], det[1], det[2], det[3], det[4], det[5], det[6]))
|
|
|
|
if __name__ == '__main__':
    # Point these at a real model export and test images before running.
    MODEL_PATH = "your/path/to/PP-DocLayoutV3.onnx"

    image_paths = [
        "your_test_image_1.png",
        "your_test_image_2.png",
        "your_test_image_3.png",
        "your_test_image_4.png",
    ]

    all_results = run_doclayout_onnx_batch(image_paths, MODEL_PATH, conf_thresh=0.5)

    # One table per image, separated by a blank line.
    for img_path, dets in zip(image_paths, all_results):
        print_doclayout_res(dets, image_label=Path(img_path).name)
        print()
|
|