{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Checkbox Detection and Text Extraction Workflow\n", "\n", "![Checkbox Detection and corresponding text extraction](images/checkbox_detection_workflow.png \"Checkbox Detection and Text Extraction Workflow\")" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.\n" ] } ], "source": [ "import easyocr\n", "import cv2\n", "from ultralytics import YOLO \n", "\n", "# Load OCR model into memory\n", "reader = easyocr.Reader(['en']) # this needs to run only once to load the model into memory\n", "\n", "# Define constants\n", "BOX_COLORS = {\n", " \"unchecked\": (242, 48, 48),\n", " \"checked\": (38, 115, 101),\n", " \"block\": (242, 159, 5)\n", "}\n", "BOX_PADDING = 2\n", "\n", "# Load models\n", "DETECTION_MODEL = YOLO(\"models/detector-model.pt\") \n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Helper function to detect checkboxes" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "def detect_checkbox(image_path):\n", " \"\"\"\n", " Output inference image with bounding box\n", " Args:\n", " - image: to check for checkboxes\n", " Return: image with bounding boxes drawn \n", " \"\"\"\n", " image = cv2.imread(image_path)\n", " if image is None:\n", " return image\n", " \n", " # Predict on image\n", " results = DETECTION_MODEL.predict(source=image, conf=0.1, iou=0.8) # Predict on image\n", " boxes = results[0].boxes # Get bounding boxes\n", "\n", " if len(boxes) == 0:\n", " return image\n", " \n", " box_coordinates = []\n", "\n", " print('detection_class_conf', 'start_box', 'end_box')\n", " # Get bounding boxes\n", " for box in boxes:\n", " detection_class_conf = round(box.conf.item(), 2)\n", " detection_class = list(BOX_COLORS)[int(box.cls)]\n", " # Get start and end points of the current box\n", " start_box = (int(box.xyxy[0][0]), int(box.xyxy[0][1]))\n", " end_box = (int(box.xyxy[0][2]), int(box.xyxy[0][3]))\n", " box = image[start_box[1]:end_box[1], start_box[0]: end_box[0], :]\n", " \n", " if detection_class == 'checked':\n", " box_coordinates.append((start_box, end_box))\n", " \n", " # Display the start and end coordinates of bbox\n", " print(detection_class_conf, start_box, end_box)\n", " \n", " # 01. DRAW BOUNDING BOX OF OBJECT\n", " line_thickness = round(0.002 * (image.shape[0] + image.shape[1]) / 2) + 1\n", " image = cv2.rectangle(img=image, \n", " pt1=start_box, \n", " pt2=end_box,\n", " color=BOX_COLORS['checked'], \n", " thickness = line_thickness) # Draw the box with predefined colors\n", " \n", " image = cv2.putText(img=image, org=start_box, text=detection_class, fontFace=0, color=(0,0,0), fontScale=line_thickness/3)\n", "\n", " # 02. DRAW LABEL\n", " text = str(detection_class_conf)\n", " # Get text dimensions to draw wrapping box\n", " font_thickness = max(line_thickness - 1, 1)\n", " (text_w, text_h), _ = cv2.getTextSize(text=text, fontFace=2, fontScale=line_thickness/3, thickness=font_thickness)\n", " # Draw wrapping box for text\n", " image = cv2.rectangle(img=image,\n", " pt1=(start_box[0], start_box[1] - text_h - BOX_PADDING*2),\n", " pt2=(start_box[0] + text_w + BOX_PADDING * 2, start_box[1]),\n", " color=BOX_COLORS['checked'],\n", " thickness=-1)\n", " # Put class name on image\n", " start_text = (start_box[0] + BOX_PADDING, start_box[1] - BOX_PADDING)\n", " image = cv2.putText(img=image, text=text, org=start_text, fontFace=0, color=(255,255,255), fontScale=line_thickness/3, thickness=font_thickness)\n", " \n", " return image, box_coordinates" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Detect checkbox\n", "\n", "Takes around ~ 3-6 seconds for each image" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "0: 640x544 5 uncheckeds, 7 checkeds, 820.2ms\n", "Speed: 3.9ms preprocess, 820.2ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 544)\n", "detection_class_conf start_box end_box\n", "0.83 (520, 1162) (551, 1193)\n", "0.83 (522, 1106) (553, 1137)\n", "0.83 (522, 1052) (552, 1082)\n", "0.79 (522, 998) (552, 1029)\n", "0.64 (190, 1003) (212, 1024)\n", "0.29 (186, 1281) (210, 1303)\n", "0.14 (188, 1336) (207, 1355)\n" ] } ], "source": [ "image_filename = f'images/sample.png'\n", "checkbox_img, checkbox_coordinates = detect_checkbox(image_filename)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Display detected checkbox" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import matplotlib.pyplot as plt\n", "\n", "plt.imshow(checkbox_img)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## EasyOCR to get the coordinates of the detected text\n", "\n", "Extract all of the text from the given image along with their coordinates\n", "\n", "It takes ~12 seconds for each image" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# canvas_size = 1400\n", "# image_filename = f'output/{pdf_filename}_output_{curr_image}.png'\n", "# image_filename = f'output/{pdf_filename}_output_1.png'\n", "\n", "# canvas_size = 1400, mag_ratio = 1.5, -> 1371\n", "# canvas_size = 900, mag_ratio = 1.8 -> 3785\n", "\n", "result = reader.readtext(image_filename, decoder = 'beamsearch',\n", " text_threshold = 0.8, low_text = 0.2, link_threshold = 0.4,\n", " canvas_size = 1500, mag_ratio = 1.5,\n", " slope_ths = 0.1, ycenter_ths = 0.8, height_ths = 0.8,\n", " width_ths = 1.0, y_ths = 0.8, x_ths = 1.0, add_margin = 0.1)\n", "\n", "# for each in result:\n", "# print(each)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Extract the coordinates of the CHECKED checkbox and all the texts detected in the given image" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The bottom right coordinate of all the CHECKED checkboxes\n", "[(552, 998), (212, 1003), (552, 1052), (553, 1106), (551, 1162), (210, 1281), (207, 1336)]\n", "\n", "The bottom left coordinate of all the detected text\n", "(233, 137) you have a mailing address other than the one listed above to which applications should be sentplease provide:\n", "(112, 257) A2. Source of Institutional Control: (click to select from dropdown)\n", "(702, 288) Private (Nonprofit)\n", "(114, 332) A3. Classify your undergraduate institution: (click to select\n", "(788, 336) dropdown)\n", "(730, 367) Coeducational\n", "(115, 415) A4. Academic year calendar: (click to select from dropdown)\n", "(759, 446) Quarter\n", "(113, 494) A4A. Describe if calendar differs by program or other:\n", "(114, 911) As. Degrees offered by your institution (select all that apply).\n", "(220, 999) Certificate\n", "(553, 999) Master's\n", "(220, 1056) Diploma\n", "(553, 1054) Post-Master's certificate\n", "(220, 1113) Associate\n", "(552, 1109) Doctoral degree - research/scholarship\n", "(220, 1167) Terminal\n", "(552, 1162) Doctoral degree - professional practice\n", "(220, 1224) Transfer\n", "(552, 1220) Doctoral degree - other\n", "(220, 1278) Bachelor's\n", "(220, 1333) Post-Bachelor's certificate\n", "(112, 1438) A6. Diversity, Equity, and Inclusion\n", "(113, 1464) Ifyou have a diversity, equity, and inclusion office or department; please provide the URL of the corresponding Web page:\n", "(812, 1499) https LLWWW duedulequity\n", "(542, 1543) END OF SECTION A\n", "(737, 332) from\n" ] } ], "source": [ "# Get the bottom right coordinates of the CHECKED checkbox\n", "checkbox_bottom_right_coord = []\n", "\n", "for each in checkbox_coordinates:\n", " checkbox_bottom_right_coord.append((each[1][0], each[0][1]))\n", "\n", "# Sort based on the coordinates\n", "checkbox_bottom_right_coord = sorted(checkbox_bottom_right_coord, key=lambda point: point[1])\n", "\n", "print(\"The bottom right coordinate of all the CHECKED checkboxes\")\n", "print(checkbox_bottom_right_coord)\n", "\n", "\n", "# Get the bottom left coordinate of all the detected text\n", "print(\"\\nThe bottom left coordinate of all the detected text\")\n", "detected_text = {}\n", "\n", "for index, each in enumerate(result):\n", " x_coord = int(each[0][0][0])\n", " y_coord = int(each[0][0][1])\n", " detected_text[(x_coord, y_coord)] = each[1]\n", "\n", "for k, v in detected_text.items():\n", " print(k, v)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Helper function to calculate Euclidean distance between two coordinates" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "import math\n", "\n", "def euclidean_distance(coord1, coord2):\n", " return math.sqrt((coord1[0] - coord2[0])**2 + (coord1[1] - coord2[1])**2)\n", "\n", "def nearest_coordinate(target_coord, coordinates):\n", " min_distance = float('inf')\n", " nearest_coord = None\n", " \n", " for coord in coordinates:\n", " distance = euclidean_distance(target_coord, coord)\n", " if distance < min_distance:\n", " min_distance = distance\n", " nearest_coord = coord\n", " \n", " \n", " return nearest_coord, euclidean_distance(target_coord, nearest_coord)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Extract the text corresponding to the CHECKED checkboxes based on the Euclidean distance\n", "\n", "" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Master's\n", "Certificate\n", "Post-Master's certificate\n", "Doctoral degree - research/scholarship\n", "Doctoral degree - professional practice\n", "Bachelor's\n", "Post-Bachelor's certificate\n" ] } ], "source": [ "# TODO Text cleanup, lowercase to match with the desired word_list for a particular section\n", "\n", "for each_checkbox_coord in checkbox_bottom_right_coord:\n", " nearest, distance = nearest_coordinate(each_checkbox_coord, list(detected_text.keys()))\n", " if distance <= 15:\n", " print(detected_text[nearest])\n", " \n" ] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.2" } }, "nbformat": 4, "nbformat_minor": 2 }