import pytesseract
from PIL import Image
import numpy as np
from transformers import LayoutLMTokenizer

# On Windows, point pytesseract to the Tesseract executable if it is not on PATH:
# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

tokenizer = LayoutLMTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")

def normalize_box(box, width, height):
    return [
        int(1000 * (box[0] / width)),
        int(1000 * (box[1] / height)),
        int(1000 * (box[2] / width)),
        int(1000 * (box[3] / height)),
    ]
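
# Illustrative sanity check (numbers chosen for this sketch, not taken from the app):
# a word box [50, 100, 200, 300] on a 1000 x 2000 px page maps to
# [50, 50, 200, 150] on LayoutLM's 0-1000 coordinate grid.
assert normalize_box([50, 100, 200, 300], 1000, 2000) == [50, 50, 200, 150]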

def apply_ocr(image):
    # image is a PIL Image; it could also be loaded from disk, e.g.:
    # image = Image.open(example['image_path'])
    width, height = image.size
    example = {}

    # apply OCR to the image and get the results as a DataFrame
    ocr_df = pytesseract.image_to_data(image, output_type='data.frame')
    float_cols = ocr_df.select_dtypes('float').columns
    ocr_df = ocr_df.dropna().reset_index(drop=True)
    ocr_df[float_cols] = ocr_df[float_cols].round(0).astype(int)
    ocr_df = ocr_df.replace(r'^\s*$', np.nan, regex=True)
    ocr_df = ocr_df.dropna().reset_index(drop=True)

    # get the words and actual (unnormalized) bounding boxes
    # words = [word for word in ocr_df.text if str(word) != 'nan']
    words = list(ocr_df.text)
    words = [str(w) for w in words]
    coordinates = ocr_df[['left', 'top', 'width', 'height']]
    actual_boxes = []
    for idx, row in coordinates.iterrows():
        x, y, w, h = tuple(row)  # each row comes in (left, top, width, height) format
        actual_box = [x, y, x + w, y + h]  # turn it into (left, top, right, bottom) to get the actual box
        actual_boxes.append(actual_box)

    # normalize the bounding boxes
    boxes = []
    for box in actual_boxes:
        boxes.append(normalize_box(box, width, height))

    # add words and boxes as extra keys on the example
    assert len(words) == len(boxes)
    example['words'] = words
    example['bbox'] = boxes
    return example
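
# Usage sketch for apply_ocr (the image path and sample output below are hypothetical,
# not taken from the app):
# example = apply_ocr(Image.open("invoice.png").convert("RGB"))
# example['words']  -> e.g. ['Invoice', 'No.', '1234', ...]
# example['bbox']   -> e.g. [[62, 31, 198, 54], ...]  (0-1000 normalized boxes)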

def encode_example(example, max_seq_length=512, pad_token_box=[0, 0, 0, 0]):
    words = example['words']
    normalized_word_boxes = example['bbox']
    assert len(words) == len(normalized_word_boxes)

    # repeat each word's box once per subword token
    token_boxes = []
    for word, box in zip(words, normalized_word_boxes):
        word_tokens = tokenizer.tokenize(word)
        token_boxes.extend([box] * len(word_tokens))

    # truncate token_boxes, leaving room for the special tokens
    special_tokens_count = 2
    if len(token_boxes) > max_seq_length - special_tokens_count:
        token_boxes = token_boxes[: (max_seq_length - special_tokens_count)]

    # add the bounding boxes of the [CLS] and [SEP] tokens
    token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]

    encoding = tokenizer(' '.join(words), padding='max_length', truncation=True)

    # pad token_boxes up to the sequence length
    input_ids = tokenizer(' '.join(words), truncation=True)["input_ids"]
    padding_length = max_seq_length - len(input_ids)
    token_boxes += [pad_token_box] * padding_length
    encoding['bbox'] = token_boxes

    assert len(encoding['input_ids']) == max_seq_length
    assert len(encoding['attention_mask']) == max_seq_length
    assert len(encoding['token_type_ids']) == max_seq_length
    assert len(encoding['bbox']) == max_seq_length
    return encoding
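
# Minimal end-to-end sketch, assuming a local test image "sample_document.png"
# (hypothetical path) and that PyTorch is installed; the Space itself may wire
# these functions into its UI differently.
if __name__ == "__main__":
    import torch

    image = Image.open("sample_document.png").convert("RGB")
    example = apply_ocr(image)
    encoding = encode_example(example)

    # stack into batch-of-1 tensors, ready to feed a LayoutLM model
    input_ids = torch.tensor([encoding['input_ids']])
    attention_mask = torch.tensor([encoding['attention_mask']])
    token_type_ids = torch.tensor([encoding['token_type_ids']])
    bbox = torch.tensor([encoding['bbox']])
    print(input_ids.shape, bbox.shape)  # expected: torch.Size([1, 512]) and torch.Size([1, 512, 4])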