| """ |
| Full GPU pipeline example for the SBB binarization ONNX model. |
| |
| Shows how to keep the entire image processing chain on the GPU using CuPy, |
| with only the JPEG decode and TIFF save happening on CPU. This is the |
| approach you'd use for a production pipeline where throughput matters. |
| |
| pip install onnxruntime-gpu cupy-cuda12x numpy Pillow |
| python3 example_gpu_pipeline.py input.jpg output.tif |
| |
| On first run, TensorRT builds an optimized engine (~60-90s). This is |
| cached in ./trt_cache/ and reused on subsequent runs. |
| """ |
|
|
| import sys |
| import os |
| import numpy as np |
| import cupy as cp |
| import onnxruntime as ort |
| from PIL import Image |
|
|
# Path to the binarization ONNX model (presumably the SBB model's
# ConvTranspose export — confirm against the model repo).
MODEL = "model_convtranspose.onnx"
# Side length of the square RGB patches fed to the model.
PATCH_SIZE = 448
# Patches per inference call; bounds peak GPU memory during normalization.
BATCH_SIZE = 64
|
|
| |
| |
| |
| |
| |
# GPU lookup table mapping uint8 pixel values 0..255 to normalized
# float32 values in [0, 1].  Indexing this with an integer array
# normalizes whole batches in one GPU kernel.
#
# Built vectorized instead of via a 256-iteration Python loop; the
# divide happens in float64 and is then narrowed to float32, which
# reproduces the original `np.float32(np.float64(i) / 255.0)` values
# bit-for-bit.
_NORM_LUT = cp.asarray(
    (np.arange(256, dtype=np.float64) / 255.0).astype(np.float32)
)
|
|
|
|
def create_session(model_path):
    """Build an ONNX Runtime session that prefers the TensorRT backend.

    On the first run TensorRT compiles the model into an optimized GPU
    engine; the engine is cached under ./trt_cache so later runs start
    in roughly two seconds.  The CUDA provider is listed second as a
    fallback if TensorRT is unavailable.
    """
    cache_dir = "./trt_cache"
    os.makedirs(cache_dir, exist_ok=True)

    trt_options = {
        "device_id": 0,
        "trt_fp16_enable": False,
        "trt_engine_cache_enable": True,
        "trt_engine_cache_path": cache_dir,
        "trt_builder_optimization_level": 3,
    }
    providers = [
        ("TensorrtExecutionProvider", trt_options),
        ("CUDAExecutionProvider", {"device_id": 0}),
    ]
    return ort.InferenceSession(model_path, providers=providers)
|
|
|
|
def extract_patches_gpu(img_gpu, patch_size):
    """Cut the image into a grid of non-overlapping patches on the GPU.

    Edge patches that extend past the image stay zero-padded (the
    destination array is pre-zeroed and only the valid region is
    copied).

    Returns:
        (patches, positions): an (N, patch_size, patch_size, 3) uint8
        CuPy array and the list of (x, y) top-left corners, row-major.
    """
    height, width = img_gpu.shape[:2]

    positions = []
    for top in range(0, height, patch_size):
        for left in range(0, width, patch_size):
            positions.append((left, top))

    patches = cp.zeros((len(positions), patch_size, patch_size, 3), dtype=cp.uint8)
    for idx, (left, top) in enumerate(positions):
        rows = min(patch_size, height - top)
        cols = min(patch_size, width - left)
        patches[idx, :rows, :cols, :] = img_gpu[top:top + rows, left:left + cols, :]

    return patches, positions
|
|
|
|
def infer_patches(session, patches_uint8):
    """Normalize and run inference, one batch at a time.

    Normalizing per-batch (64 patches = 154MB) instead of all at once
    (500+ patches = 2.6GB) avoids GPU memory fragmentation.

    Args:
        session: ONNX Runtime InferenceSession.
        patches_uint8: (N, PATCH_SIZE, PATCH_SIZE, 3) uint8 CuPy array.

    Returns:
        (N, PATCH_SIZE, PATCH_SIZE, out_ch) float32 CuPy array of raw
        model outputs.
    """
    inp = session.get_inputs()[0].name
    out = session.get_outputs()[0].name
    n = patches_uint8.shape[0]

    # With dynamic axes the channel dim can be None *or* a symbolic
    # string (e.g. "C"); the old `shape[3] or 2` fallback let a string
    # through and crashed cp.zeros.  Only trust a concrete positive int.
    last_dim = session.get_outputs()[0].shape[3]
    out_ch = last_dim if isinstance(last_dim, int) and last_dim > 0 else 2

    all_output = cp.zeros((n, PATCH_SIZE, PATCH_SIZE, out_ch), dtype=cp.float32)

    for i in range(0, n, BATCH_SIZE):
        end = min(i + BATCH_SIZE, n)

        # uint8 -> float32 in [0, 1] via the GPU lookup table.
        batch_float = _NORM_LUT[patches_uint8[i:end].astype(cp.int32)]

        # session.run takes/returns NumPy, so each batch round-trips
        # through host memory (GPU -> CPU -> GPU).
        result = session.run([out], {inp: batch_float.get()})[0]

        all_output[i:end] = cp.asarray(result)

    return all_output
|
|
|
|
def postprocess_gpu(output):
    """Extract foreground probability, threshold, binarize -- all on GPU.

    Channel 1 of the model output is treated as the foreground
    probability.  It is quantized to uint8 and thresholded at 128:
    values at or below the threshold map to 255, the rest to 0.
    """
    foreground = output[:, :, :, 1]
    quantized = (foreground * 255.0).astype(cp.uint8)
    is_light = quantized <= 128
    return cp.where(is_light, cp.uint8(255), cp.uint8(0))
|
|
|
|
def reconstruct_gpu(patches, positions, width, height):
    """Reconstruct the full image from patches -- all on GPU.

    Each patch is accumulated into a float buffer along with a per-pixel
    coverage count, then the sum is divided by the coverage so any
    overlapping regions are averaged.  Pixels never covered keep a
    divisor of 1, so the division is always safe.
    """
    accum = cp.zeros((height, width), dtype=cp.float32)
    coverage = cp.zeros((height, width), dtype=cp.float32)

    for idx, (left, top) in enumerate(positions):
        rows = min(PATCH_SIZE, height - top)
        cols = min(PATCH_SIZE, width - left)
        accum[top:top + rows, left:left + cols] += patches[idx, :rows, :cols].astype(cp.float32)
        coverage[top:top + rows, left:left + cols] += 1.0

    averaged = accum / cp.maximum(coverage, 1.0)
    return averaged.astype(cp.uint8)
|
|
|
|
def binarize_image(input_path, output_path, model_path=MODEL):
    """Full pipeline: JPEG in -> binarized TIFF out.

    Data flow:
        CPU: decode JPEG
        CPU -> GPU: upload image (~5ms)
        GPU: extract patches (~7ms)
        GPU -> CPU -> GPU: normalize, infer, collect (~175ms per batch)
        GPU: threshold + binarize (~1ms)
        GPU: reconstruct from patches (~13ms)
        GPU -> CPU: download result (~2ms)
        CPU: save Group4 TIFF
    """
    # Decode on the CPU, then push the whole image to the GPU once.
    img_cpu = np.array(Image.open(input_path).convert("RGB"))
    height, width = img_cpu.shape[:2]
    img_gpu = cp.asarray(img_cpu)

    patches, positions = extract_patches_gpu(img_gpu, PATCH_SIZE)

    session = create_session(model_path)
    raw_output = infer_patches(session, patches)

    binary_patches = postprocess_gpu(raw_output)
    stitched = reconstruct_gpu(binary_patches, positions, width, height)

    # Single download back to host memory for encoding.
    host_result = stitched.get()
    Image.fromarray(host_result, "L").convert("1").save(
        output_path, format="TIFF", compression="group4", dpi=(300, 300)
    )
    print(f"Saved {output_path}")

    # Drop GPU references eagerly so the memory pool can reuse the space.
    del img_gpu, patches, raw_output, binary_patches, stitched
|
|
|
|
if __name__ == "__main__":
    # CLI entry point: require at least an input and an output path.
    args = sys.argv
    if len(args) < 3:
        print(f"Usage: {args[0]} <input.jpg> <output.tif>")
        sys.exit(1)
    binarize_image(args[1], args[2])
|
|