Spaces:

Harsimran19
/

DocumentImageClassifier

Runtime error

App Files Files Community

DocumentImageClassifier / preprocess.py

Harsimran19

Upload 4 files

4331eba over 1 year ago

raw

history blame

3.25 kB

	import os

	import numpy as np
	import pytesseract
	from PIL import Image
	from transformers import LayoutLMTokenizer


	# Hard-coded Windows location of the Tesseract executable. It is only applied
	# when that file actually exists; on every other machine pytesseract keeps its
	# own default command, so OCR still works wherever Tesseract is on the PATH.
	_WINDOWS_TESSERACT_CMD = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
	if os.path.isfile(_WINDOWS_TESSERACT_CMD):
	    pytesseract.pytesseract.tesseract_cmd = _WINDOWS_TESSERACT_CMD

	# Module-level tokenizer shared by the functions below; loaded once at import.
	tokenizer = LayoutLMTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")
	def normalize_box(box, width, height):
	    """Scale a pixel box onto a 0-1000 grid relative to the image size.

	    ``box`` is indexed at positions 0-3; positions 0 and 2 are divided by
	    ``width`` and positions 1 and 3 by ``height``. Each scaled value is
	    truncated with ``int`` and the four results are returned as a list.
	    """
	    # Divisor for each coordinate, in the same order as the box entries.
	    divisors = (width, height, width, height)
	    return [int(1000 * (box[i] / divisors[i])) for i in range(4)]

	def apply_ocr(image):
	    """Run Tesseract OCR on ``image`` and collect its words and normalized boxes.

	    ``image`` must expose ``size`` as ``(width, height)`` and be accepted by
	    ``pytesseract.image_to_data``.

	    Returns a dict with two equal-length lists: ``'words'`` (the recognized
	    text, each converted with ``str``) and ``'bbox'`` (one ``normalize_box``
	    result per word).
	    """
	    img_width, img_height = image.size

	    # OCR result as a DataFrame.
	    frame = pytesseract.image_to_data(image, output_type='data.frame')

	    # The float columns are picked before any row is dropped, then rounded and
	    # cast to int once the rows containing NaN are gone.
	    numeric_cols = frame.select_dtypes('float').columns
	    frame = frame.dropna().reset_index(drop=True)
	    frame[numeric_cols] = frame[numeric_cols].round(0).astype(int)

	    # Empty or whitespace-only cells become NaN so their rows are dropped too.
	    frame = frame.replace(r'^\s*$', np.nan, regex=True)
	    frame = frame.dropna().reset_index(drop=True)

	    words = [str(text) for text in frame.text]

	    # Rows arrive as (left, top, width, height); convert each one to corner
	    # form (left, top, right, bottom) and scale it to the image size.
	    extents = frame[['left', 'top', 'width', 'height']]
	    boxes = []
	    for left, top, box_w, box_h in extents.itertuples(index=False, name=None):
	        corners = [left, top, left + box_w, top + box_h]
	        boxes.append(normalize_box(corners, img_width, img_height))

	    assert len(words) == len(boxes)
	    return {'words': words, 'bbox': boxes}
	def encode_example(example, max_seq_length=512, pad_token_box=None):
	    """Tokenize an OCR example and attach one bounding box to every token.

	    Args:
	        example: dict with ``'words'`` (list of strings) and ``'bbox'`` (one
	            box per word, same length as ``'words'``).
	        max_seq_length: length that every returned sequence is truncated or
	            padded to; it is also passed to the tokenizer as ``max_length``.
	        pad_token_box: box assigned to padding positions. Defaults to
	            ``[0, 0, 0, 0]``.

	    Returns:
	        The tokenizer encoding of the space-joined words, with an added
	        ``'bbox'`` entry. ``'input_ids'``, ``'attention_mask'``,
	        ``'token_type_ids'`` and ``'bbox'`` all have length ``max_seq_length``.

	    Raises:
	        AssertionError: if words and boxes differ in length, or if any of the
	            resulting sequences does not have length ``max_seq_length``.
	    """
	    # Build the default per call so no list is shared between invocations.
	    if pad_token_box is None:
	        pad_token_box = [0, 0, 0, 0]

	    words = example['words']
	    normalized_word_boxes = example['bbox']

	    assert len(words) == len(normalized_word_boxes)

	    # Every sub-word token inherits the box of the word it was split from.
	    token_boxes = []
	    for word, box in zip(words, normalized_word_boxes):
	        token_boxes.extend([box] * len(tokenizer.tokenize(word)))

	    # Truncate, leaving room for the cls and sep tokens.
	    special_tokens_count = 2
	    token_boxes = token_boxes[: max_seq_length - special_tokens_count]

	    # add bounding boxes of cls + sep tokens
	    token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]

	    # max_length must follow max_seq_length; otherwise the tokenizer pads and
	    # truncates to its own default and the lengths below stop matching.
	    encoding = tokenizer(
	        ' '.join(words),
	        padding='max_length',
	        truncation=True,
	        max_length=max_seq_length,
	    )

	    # The attention mask is 1 for real tokens and 0 for padding, so its sum is
	    # the unpadded length; pad the boxes by the remainder.
	    num_real_tokens = sum(encoding['attention_mask'])
	    token_boxes += [pad_token_box] * (max_seq_length - num_real_tokens)
	    encoding['bbox'] = token_boxes

	    assert len(encoding['input_ids']) == max_seq_length
	    assert len(encoding['attention_mask']) == max_seq_length
	    assert len(encoding['token_type_ids']) == max_seq_length
	    assert len(encoding['bbox']) == max_seq_length

	    return encoding