|
from optimum.onnxruntime import ORTModelForVision2Seq |
|
from transformers import TrOCRProcessor |
|
import numpy as np |
|
import onnxruntime |
|
import math |
|
import cv2 |
|
import os |
|
|
|
class TextRecognition:
    """Recognise text on line images cut out of a page image.

    A ``TrOCRProcessor`` turns the cropped line images into model input and an
    ``ORTModelForVision2Seq`` model generates the text. Both are loaded in the
    constructor from the paths given by the caller.
    """

    def __init__(self,
                 processor_path,
                 model_path,
                 device='cpu',
                 half_precision=False,
                 line_threshold=120):
        """Store the configuration and load the processor and the model.

        Args:
            processor_path: Location passed to ``TrOCRProcessor.from_pretrained``.
            model_path: Location passed to ``ORTModelForVision2Seq.from_pretrained``.
            device: Device the pixel values are moved to before generation.
            half_precision: Stored on the instance; not read anywhere in this class.
            line_threshold: Maximum number of line images sent to the model
                in a single ``predict_text`` call.
        """
        self.device = device
        self.half_precision = half_precision
        self.line_threshold = line_threshold
        self.processor_path = processor_path
        self.model_path = model_path
        # Loading is best-effort: on failure these attributes are None and the
        # error has already been printed by the init_* helper.
        self.processor = self.init_processor()
        self.recognition_model = self.init_recognition_model()

    def init_processor(self):
        """Load the processor from ``self.processor_path``.

        Returns:
            The loaded processor, or ``None`` (after printing the error) when
            loading fails.
        """
        try:
            return TrOCRProcessor.from_pretrained(self.processor_path)
        except Exception as e:
            print('Failed to initialize processor: %s' % e)
            return None

    def init_recognition_model(self):
        """Load the text recognition model from ``self.model_path``.

        Returns:
            The loaded model, or ``None`` (after printing the error) when
            loading fails.
        """
        sess_options = onnxruntime.SessionOptions()
        sess_options.intra_op_num_threads = 3
        sess_options.inter_op_num_threads = 3
        try:
            # The thread limits above only take effect when the options object
            # is handed to the loader.
            return ORTModelForVision2Seq.from_pretrained(
                self.model_path, session_options=sess_options)
        except Exception as e:
            print('Failed to load the text recognition model: %s' % e)
            return None

    def crop_line(self, image, polygon, height, width):
        """Cut one text line out of ``image`` along ``polygon``.

        Pixels inside the polygon are copied from the image; pixels of the
        bounding rectangle that fall outside the polygon are set to 255.

        Args:
            image: Page image as an array indexed ``[row, column]``.
            polygon: Iterable of points; element 0 and 1 of each point are
                used as x and y.
            height: Height of the mask that is drawn (presumably the image
                height - confirm against the caller).
            width: Width of the mask that is drawn (presumably the image
                width - confirm against the caller).

        Returns:
            The cropped line image on a white background.
        """
        # OpenCV contour functions work on 32-bit integer points.
        poly = np.array([[int(pt[0]), int(pt[1])] for pt in polygon],
                        dtype=np.int32)
        mask = np.zeros([height, width], dtype=np.uint8)
        cv2.drawContours(mask, [poly], -1, (255, 255, 255), -1, cv2.LINE_AA)

        x, y, w, h = cv2.boundingRect(poly)
        # Clamp the rectangle to the mask area: a negative start index would
        # otherwise be interpreted by NumPy as counting from the end.
        top, left = max(y, 0), max(x, 0)
        bottom, right = min(y + h, height), min(x + w, width)

        cropped = image[top:bottom, left:right]
        mask_crop = mask[top:bottom, left:right]
        # Keep only the pixels inside the polygon (everything else becomes 0).
        inside = cv2.bitwise_and(cropped, cropped, mask=mask_crop)

        # White canvas, zeroed where the polygon is, so that adding the masked
        # crop yields the line on a white background without overflow.
        background = np.ones_like(cropped, np.uint8) * 255
        cv2.bitwise_not(background, background, mask=mask_crop)
        return background + inside

    def crop_lines(self, polygons, image, height, width):
        """Return the cropped line image for every polygon, in input order."""
        return [self.crop_line(image, polygon, height, width)
                for polygon in polygons]

    def get_scores(self, lgscores):
        """Convert log scores to plain scores by exponentiation.

        Args:
            lgscores: Iterable of log scores.

        Returns:
            List with ``math.exp`` applied to each element.
        """
        return [math.exp(lgscore) for lgscore in lgscores]

    def predict_text(self, cropped_lines):
        """Predict the text content of a batch of cropped line images.

        Args:
            cropped_lines: Line images accepted by the processor.

        Returns:
            Tuple ``(scores, generated_text)``: one score and one decoded
            string per input image.
        """
        pixel_values = self.processor(cropped_lines, return_tensors="pt").pixel_values
        generated_dict = self.recognition_model.generate(
            pixel_values.to(self.device),
            max_new_tokens=128,
            return_dict_in_generate=True,
            output_scores=True)
        # NOTE(review): 'sequences_scores' is presumably only present when the
        # model generates with beam search - confirm the generation config.
        generated_ids = generated_dict['sequences']
        lgscores = generated_dict['sequences_scores']
        scores = self.get_scores(lgscores.tolist())
        generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True)
        return scores, generated_text

    def get_text_lines(self, cropped_lines):
        """Run text prediction over all line images in bounded batches.

        The images are processed in consecutive batches of at most
        ``self.line_threshold`` items. An empty input returns two empty lists
        without calling the model.

        Args:
            cropped_lines: Sequence of cropped line images.

        Returns:
            Tuple ``(scores, generated_text)`` with one entry per input image,
            in input order.
        """
        scores, generated_text = [], []
        # Use a whole-number step of at least 1: a fractional threshold would
        # leave gaps between batches and a value below 1 cannot make progress.
        batch_size = max(1, int(self.line_threshold))
        for start in range(0, len(cropped_lines), batch_size):
            batch = cropped_lines[start:start + batch_size]
            batch_scores, batch_text = self.predict_text(batch)
            scores += batch_scores
            generated_text += batch_text
        return scores, generated_text

    def get_res_dict(self, polygons, generated_text, height, width, image_name, line_confs, scores):
        """Combine the per-line results of one image into a dictionary.

        Args:
            polygons: Line polygons, indexed like ``generated_text``.
            generated_text: Recognised text of each line.
            height: Stored under ``'height'``.
            width: Stored under ``'width'``.
            image_name: Stored under ``'img_name'``.
            line_confs: Stored per line under ``'conf'``.
            scores: Stored per line under ``'text_conf'``.

        Returns:
            Dict with keys ``'img_name'``, ``'height'``, ``'width'`` and
            ``'text_lines'``; the latter holds one dict per entry of
            ``generated_text``.

        Raises:
            IndexError: If any of the per-line sequences is shorter than
                ``generated_text``.
        """
        line_dicts = [
            {'polygon': polygons[i], 'text': text, 'conf': line_confs[i], 'text_conf': scores[i]}
            for i, text in enumerate(generated_text)
        ]
        return {'img_name': image_name, 'height': height, 'width': width, 'text_lines': line_dicts}

    def process_lines(self, polygons, image, height, width):
        """Crop every polygon from ``image`` and return the recognised texts.

        Only the texts are returned; the scores computed along the way are
        discarded.
        """
        cropped_lines = self.crop_lines(polygons, image, height, width)
        _scores, generated_text = self.get_text_lines(cropped_lines)
        return generated_text
|
|