OCR-for-Captcha / app.py
toandev's picture
WIP
aacc8da
import torch
import onnx
import onnxruntime as rt
from torchvision import transforms as T
from pathlib import Path
from PIL import Image
from huggingface_hub import login, hf_hub_download
import os
import gradio as gr
from utils.tokenizer_base import Tokenizer
# Authenticate with the Hugging Face Hub only when a token is configured.
# Calling login() without a token falls back to an interactive prompt, which
# cannot be answered when the app runs unattended.
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    login(hf_token)

# Directory that contains this script.
cwd = Path(__file__).parent.resolve()

# Fetch the ONNX weights from the Hub. When hf_hub_download returns an
# absolute path, os.path.join discards `cwd` and yields that path unchanged.
model_file = os.path.join(cwd, hf_hub_download("toandev/OCR-for-Captcha", "model.onnx"))

# Target size handed to T.Resize in get_transform.
img_size = (32, 128)

# Character set handed to the tokenizer.
# NOTE(review): this is a raw string, so `\"` and `\\` keep their backslashes
# literally and the charset contains three backslash characters. Confirm this
# matches the charset the model was trained with before changing it.
vocab = r"0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
tokenizer = Tokenizer(vocab)
def to_numpy(tensor):
    """Convert a torch tensor to a NumPy array on the CPU.

    Args:
        tensor: The ``torch.Tensor`` to convert. It may or may not require
            gradients.

    Returns:
        The tensor's data as a ``numpy.ndarray``.
    """
    # detach() is valid whether or not the tensor tracks gradients, so a
    # separate requires_grad branch is unnecessary.
    return tensor.detach().cpu().numpy()
def get_transform(img_size):
    """Build the image preprocessing pipeline.

    Args:
        img_size: Target size passed to ``T.Resize``.

    Returns:
        A ``T.Compose`` that resizes with bicubic interpolation, converts the
        image to a tensor, and normalizes it with mean 0.5 and std 0.5.
    """
    pipeline = [
        T.Resize(img_size, T.InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(0.5, 0.5),
    ]
    return T.Compose(pipeline)
def load_model(model_file):
    """Validate an ONNX model file and open an inference session for it.

    Args:
        model_file: Path to the ONNX model file.

    Returns:
        A ``(transform, session)`` tuple: the preprocessing pipeline built
        from the module-level ``img_size`` and the ``rt.InferenceSession``
        created from ``model_file``.
    """
    preprocess = get_transform(img_size)

    # Run the ONNX checker on the loaded model before creating the session.
    onnx.checker.check_model(onnx.load(model_file))

    session = rt.InferenceSession(model_file)
    return preprocess, session
# Build the preprocessing pipeline and the ONNX Runtime session once at import
# time; infer() reuses both for every request.
transform, s = load_model(model_file=model_file)
def infer(img: Image.Image):
    """Recognize the text in a CAPTCHA image.

    Args:
        img: The input PIL image. ``None`` is tolerated, since Gradio can
            pass it when the form is submitted without an image.

    Returns:
        The first decoded prediction from the tokenizer, or an empty string
        when no image was supplied.
    """
    # Guard against a missing image instead of failing on None.convert().
    if img is None:
        return ""

    # Convert to RGB, preprocess, and add a leading batch dimension.
    x = transform(img.convert("RGB")).unsqueeze(0)

    # Feed the batch to the session's first declared input.
    ort_inputs = {s.get_inputs()[0].name: to_numpy(x)}
    logits = s.run(None, ort_inputs)[0]

    # Softmax over the last axis, then let the tokenizer decode the result.
    probs = torch.tensor(logits).softmax(-1)
    preds, _ = tokenizer.decode(probs)
    return preds[0]
# Gradio UI: one PIL image input routed through infer() to a text output.
demo = gr.Interface(
    fn=infer,
    inputs=gr.components.Image(type="pil"),
    outputs=gr.components.Textbox(),
    title="OCR for CAPTCHA",
    description="Solve captchas from images including letters and numbers, success rate is about 80-90%.",
    examples=["1.png", "2.jpg", "3.jpg", "4.png", "5.png"],
)

# Start the Gradio app.
demo.launch()