oneocr / _archive /inspect_model33.py
OneOCR Dev
OneOCR - reverse engineering complete, ONNX pipeline 53% match rate
ce847d4
"""Deep analysis of model_33 (LineLayout) to understand its OneOCRFeatureExtract usage."""
import onnx
from onnx import numpy_helper
import numpy as np
from pathlib import Path
# Locate the model_33 file in the extracted-models directory, load it with
# onnx, and print its file name, IR version and opset imports.
models_dir = Path("oneocr_extracted/onnx_models")
# Sort the matches so the chosen file is deterministic if several exist;
# glob() itself yields entries in arbitrary order.
candidates = sorted(models_dir.glob("model_33_*"))
if not candidates:
    # Fail with an actionable message instead of a bare IndexError.
    raise FileNotFoundError(
        f"No file matching 'model_33_*' found in {models_dir.resolve()}"
    )
model_path = candidates[0]
model = onnx.load(str(model_path))
print(f"Model: {model_path.name}")
print(f"IR version: {model.ir_version}")
print(f"Opsets: {[(o.domain, o.version) for o in model.opset_import]}")
# Print the name and shape of every graph input, then of every graph output.
# Each dimension is shown as its fixed size when that is non-zero, otherwise
# as its symbolic parameter name.
for heading, value_infos in (
    ("\nGraph inputs:", model.graph.input),
    ("\nGraph outputs:", model.graph.output),
):
    print(heading)
    for value_info in value_infos:
        dims = value_info.type.tensor_type.shape.dim
        shape = [dim.dim_value or dim.dim_param for dim in dims]
        print(f" {value_info.name}: {shape}")
# List every initializer: string tensors by payload size in bytes, all other
# tensors by shape, dtype and value range.
print(f"\nInitializers ({len(model.graph.initializer)}):")
for init in model.graph.initializer:
    if init.data_type == onnx.TensorProto.STRING:
        # Use the first string_data entry when present, else raw_data.
        raw = bytes(init.string_data[0] if init.string_data else init.raw_data)
        print(f" {init.name}: STRING, {len(raw)} bytes")
        continue
    data = numpy_helper.to_array(init)
    if data.size == 0:
        # min()/max() raise ValueError on zero-size arrays, so report the
        # tensor as empty instead of aborting the whole listing.
        print(f" {init.name}: {data.shape} {data.dtype} [empty]")
        continue
    print(f" {init.name}: {data.shape} {data.dtype} [{data.min():.4f}, {data.max():.4f}]")
# Walk the graph and print:
#   * the first five and last four nodes, for context;
#   * the OneOCRFeatureExtract node together with its INT / FLOAT / INTS
#     attributes;
#   * every node that follows the first OneOCRFeatureExtract node.
node_total = len(model.graph.node)
print(f"\nNodes ({node_total}):")

# Maps an attribute type to the accessor that reads its printable value.
# Attribute types not listed here are skipped.
attr_readers = {
    onnx.AttributeProto.INT: lambda a: a.i,
    onnx.AttributeProto.FLOAT: lambda a: a.f,
    onnx.AttributeProto.INTS: lambda a: list(a.ints),
}

fe_found = False
for i, node in enumerate(model.graph.node):
    is_fe = node.op_type == "OneOCRFeatureExtract"
    near_edge = i < 5 or i > node_total - 5
    if not (is_fe or fe_found or near_edge):
        continue

    domain_str = f" [{node.domain}]" if node.domain else ""
    print(f" [{i}] {node.op_type}{domain_str}: {list(node.input)}{list(node.output)}")

    if not is_fe:
        continue
    # From here on, every subsequent node is printed as well.
    fe_found = True
    for attr in node.attribute:
        reader = attr_readers.get(attr.type)
        if reader is not None:
            print(f" {attr.name} = {reader(attr)}")
# Show the nodes leading up to (and including) the first OneOCRFeatureExtract
# node; the script treats these as the conv backbone.
print("\nConv backbone structure (last 5 nodes before FE):")
# Index of the first OneOCRFeatureExtract node, or None when there is none.
fe_idx = next(
    (
        i
        for i, node in enumerate(model.graph.node)
        if node.op_type == "OneOCRFeatureExtract"
    ),
    None,
)
# Compare against None explicitly: index 0 is a valid position but is falsy,
# so a plain truthiness test would skip a model whose first node is the FE op.
if fe_idx is not None:
    for i in range(max(0, fe_idx - 5), fe_idx + 1):
        node = model.graph.node[i]
        domain_str = f" [{node.domain}]" if node.domain else ""
        print(f" [{i}] {node.op_type}{domain_str}: {list(node.input)}{list(node.output)}")
else:
    print(" (no OneOCRFeatureExtract node found)")
# Analyze the feature/config blob: reinterpret its bytes as big-endian
# float32 values, print summary statistics, and flag values that look like
# dimension markers.
for init in model.graph.initializer:
    if init.name != "feature/config":
        continue
    raw = bytes(init.string_data[0] if init.string_data else init.raw_data)
    print(f"\nfeature/config blob: {len(raw)} bytes")

    # Try big-endian float32. np.frombuffer rejects buffers whose size is not
    # a multiple of the element size, so drop any trailing partial value.
    usable = len(raw) - len(raw) % 4
    if usable != len(raw):
        print(f" Ignoring {len(raw) - usable} trailing byte(s) that do not fill a float32")
    be = np.frombuffer(raw[:usable], dtype='>f4').copy()
    print(f" Big-endian float32: {len(be)} values")
    print(f" Finite: {np.isfinite(be).sum()}")
    in_range = np.sum(np.abs(be[np.isfinite(be)]) < 10)
    # Guard the percentage against an empty blob.
    pct = 100 * in_range / len(be) if len(be) else 0.0
    print(f" In [-10,10]: {in_range} ({pct:.1f}%)")
    print(f" First 20: {be[:20]}")
    print(f" Last 20: {be[-20:]}")

    # Try to find dimension markers: whole-number values strictly between 10
    # and 10000. The test is vectorized so that inf/NaN entries simply compare
    # False; calling int() on an infinite value would raise OverflowError.
    is_dim = (be > 10) & (be < 10000) & (be == np.floor(be))
    for idx in np.flatnonzero(is_dim):
        print(f" Potential dim at [{int(idx)}]: {be[idx]}")