|
from img2table.ocr import DocTR |
|
from torchvision import transforms |
|
from transformers import AutoModelForObjectDetection |
|
|
|
import torch |
|
|
|
def box_cxcywh_to_xyxy(x): |
|
x_c, y_c, w, h = x.unbind(-1) |
|
b = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), (y_c + 0.5 * h)] |
|
return torch.stack(b, dim=1) |
|
|
|
def rescale_bboxes(out_bbox, size): |
|
width, height = size |
|
boxes = box_cxcywh_to_xyxy(out_bbox) |
|
boxes = boxes * torch.tensor( |
|
[width, height, width, height], dtype=torch.float32 |
|
) |
|
return boxes |
|
|
|
def outputs_to_objects(outputs, img_size, id2label): |
|
m = outputs.logits.softmax(-1).max(-1) |
|
pred_labels = list(m.indices.detach().cpu().numpy())[0] |
|
pred_scores = list(m.values.detach().cpu().numpy())[0] |
|
pred_bboxes = outputs["pred_boxes"].detach().cpu()[0] |
|
pred_bboxes = [ |
|
elem.tolist() for elem in rescale_bboxes(pred_bboxes, img_size) |
|
] |
|
|
|
objects = [] |
|
for label, score, bbox in zip(pred_labels, pred_scores, pred_bboxes): |
|
class_label = id2label[int(label)] |
|
if not class_label == "no object": |
|
objects.append( |
|
{ |
|
"label": class_label, |
|
"score": float(score), |
|
"bbox": [float(elem) for elem in bbox], |
|
} |
|
) |
|
|
|
return objects |
|
|
|
class MaxResize(object): |
|
def __init__(self, max_size=800): |
|
self.max_size = max_size |
|
|
|
def __call__(self, image): |
|
width, height = image.size |
|
current_max_size = max(width, height) |
|
scale = self.max_size / current_max_size |
|
resized_image = image.resize( |
|
(int(round(scale * width)), int(round(scale * height))) |
|
) |
|
|
|
return resized_image |
|
|
|
detection_transform = transforms.Compose( |
|
[ |
|
MaxResize(800), |
|
transforms.ToTensor(), |
|
transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), |
|
] |
|
) |
|
|
|
device = "cuda" if torch.cuda.is_available() else "cpu" |
|
model = AutoModelForObjectDetection.from_pretrained("microsoft/table-transformer-detection", revision="no_timm").to(device) |
|
|
|
ocr = DocTR() |
|
|