import json

import cv2
import numpy as np
import onnxruntime as ort

MODEL_PATH = "meiki.text.rec.v0.960x32.onnx"
INPUT_IMAGE_PATH = "input.jpg"
CONFIDENCE_THRESHOLD = 0.1   # drop character detections below this score
INPUT_REC_HEIGHT = 32        # fixed model input height
INPUT_REC_WIDTH = 960        # fixed model input width
X_OVERLAP_THRESHOLD = 0.3    # max horizontal overlap ratio before a character is suppressed
EPSILON = 1e-6               # avoids division by zero for degenerate boxes


def preprocess_line_image(image):
    """
    Preprocess a single text-line image for recognition:
    - Resize to height=32, preserving aspect ratio.
    - If the resulting width exceeds 960, scale down to fit.
    - Pad right (and bottom, if the width cap applies) to exactly 960×32 with black (0).

    Returns:
        input_tensor: numpy array of shape [1, 3, 32, 960]
        effective_w: width of the resized (non-padded) content
        orig_w, orig_h: dimensions of the original image
    """
    h, w = image.shape[:2]
    if h == 0 or w == 0:
        raise ValueError("Input image is empty")
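
    # Scale to the fixed model height, preserving aspect ratio.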
    new_h = INPUT_REC_HEIGHT
    new_w = int(round(w * (new_h / h)))
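
    # If the line is wider than the model input, shrink it to fit; this also
    # reduces the height below 32, leaving black padding at the bottom.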
    if new_w > INPUT_REC_WIDTH:
        scale = INPUT_REC_WIDTH / new_w
        new_w = INPUT_REC_WIDTH
        new_h = int(round(new_h * scale))

    resized = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
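
    # Pad right/bottom with black up to the fixed 960x32 canvas.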
    pad_width = INPUT_REC_WIDTH - new_w
    pad_height = INPUT_REC_HEIGHT - new_h
    if len(resized.shape) == 2:
        padded = np.pad(resized, ((0, pad_height), (0, pad_width)), constant_values=0)
        padded = cv2.cvtColor(padded, cv2.COLOR_GRAY2RGB)
    else:
        padded = np.pad(resized, ((0, pad_height), (0, pad_width), (0, 0)), constant_values=0)
        # cv2.imread returns BGR; convert to RGB to match the grayscale branch,
        # assuming the model expects RGB input (as the GRAY2RGB call suggests).
        padded = cv2.cvtColor(padded, cv2.COLOR_BGR2RGB)
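
    # HWC -> CHW, scale to [0, 1], and add a batch dimension.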
    input_tensor = np.transpose(padded, (2, 0, 1)).astype(np.float32) / 255.0
    input_tensor = np.expand_dims(input_tensor, axis=0)

    return input_tensor, new_w, w, h


def postprocess_recognition_output(labels, boxes, scores, orig_w, orig_h, effective_w):
    """
    Convert raw model outputs to character-level results with global bounding boxes.
    Assumes the input image corresponds to a single text line at global coords
    (0, 0) to (orig_w, orig_h).
    """
    candidates = []
    for lbl, box, scr in zip(labels, boxes, scores):
        if scr < CONFIDENCE_THRESHOLD:
            continue
        char = chr(lbl)  # labels are Unicode code points

        rx1, ry1, rx2, ry2 = box

        # Clamp x-coordinates to the non-padded content region.
        rx1 = min(rx1, effective_w)
        rx2 = min(rx2, effective_w)
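
        # Map from the padded 960x32 canvas back to original image coordinates:
        # x uses the non-padded content width, y uses the full input height.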
        cx1 = (rx1 / effective_w) * orig_w
        cx2 = (rx2 / effective_w) * orig_w
        cy1 = (ry1 / INPUT_REC_HEIGHT) * orig_h
        cy2 = (ry2 / INPUT_REC_HEIGHT) * orig_h

        bbox = [int(cx1), int(cy1), int(cx2), int(cy2)]

        candidates.append({
            'char': char,
            'bbox': bbox,
            'conf': float(scr),
            'x_interval': (cx1, cx2),
        })
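
    # Greedy non-maximum suppression along the x-axis: visit candidates in
    # descending confidence order and drop any whose horizontal overlap with
    # an already-accepted character exceeds X_OVERLAP_THRESHOLD of its width.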
    candidates.sort(key=lambda c: c['conf'], reverse=True)
    accepted = []
    for cand in candidates:
        x1_c, x2_c = cand['x_interval']
        width_c = x2_c - x1_c + EPSILON
        keep = True
        for acc in accepted:
            x1_a, x2_a = acc['x_interval']
            overlap = max(0, min(x2_c, x2_a) - max(x1_c, x1_a))
            if overlap / width_c > X_OVERLAP_THRESHOLD:
                keep = False
                break
        if keep:
            accepted.append(cand)
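
    # Restore reading order (left to right) and assemble the line text.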
    accepted.sort(key=lambda c: c['x_interval'][0])
    text = ''.join(c['char'] for c in accepted)
    chars = [{'char': c['char'], 'bbox': c['bbox']} for c in accepted]
    return {'text': text, 'chars': chars}


def main():
    image = cv2.imread(INPUT_IMAGE_PATH)
    if image is None:
        raise FileNotFoundError(f"Input image not found: {INPUT_IMAGE_PATH}")
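
    # Prefer CUDA, falling back to CPU if it is unavailable.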
    session = ort.InferenceSession(
        MODEL_PATH,
        providers=['CUDAExecutionProvider', 'CPUExecutionProvider'],
    )

    input_tensor, effective_w, orig_w, orig_h = preprocess_line_image(image)
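
    # Pass the padded canvas size as the target size so the model returns
    # boxes in 960x32 coordinates, which postprocessing rescales back to
    # the original image.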
    orig_size = np.array([[INPUT_REC_WIDTH, INPUT_REC_HEIGHT]], dtype=np.int64)
    outputs = session.run(None, {
        "images": input_tensor,
        "orig_target_sizes": orig_size,
    })
    labels_batch, boxes_batch, scores_batch = outputs

    result = postprocess_recognition_output(
        labels_batch[0], boxes_batch[0], scores_batch[0], orig_w, orig_h, effective_w
    )

    print("--- JSON result ---")
    print(json.dumps(result, ensure_ascii=False, separators=(',', ':')))
    print("\n--- Text result ---")
    print(result['text'])


if __name__ == "__main__":
    main()