{ "cells": [ { "cell_type": "code", "execution_count": 25, "id": "536c48a1", "metadata": {}, "outputs": [], "source": [ "# Uncomment to install the Tesseract wrapper if it is missing from the kernel's environment:\n", "# %pip install pytesseract\n", "import pandas as pd\n", "import pytesseract\n", "from PIL import Image\n", "from pytesseract import Output" ] }, { "cell_type": "code", "execution_count": 26, "id": "611fd576-62e6-406a-83ed-6d0a8497e34d", "metadata": {}, "outputs": [], "source": [ "# Uncomment to install a Parquet engine for pd.read_parquet if none is available:\n", "# %pip install pyarrow" ] }, { "cell_type": "code", "execution_count": 27, "id": "f97b4939", "metadata": {}, "outputs": [], "source": [ "# Load the dataset from the Parquet file next to this notebook (path is relative to the working directory).\n", "df = pd.read_parquet('./testing.parquet')" ] }, { "cell_type": "code", "execution_count": 28, "id": "afd89e19-9348-414e-a951-4e36dfa3fb60", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | image | \n", "ocr_annotation_texts | \n", "image_height | \n", "image_width | \n", "
---|---|---|---|---|
0 | \n", "b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIHDR\\x00\\x00\\... | \n", "71 2 84 11 \\n43 7 57 9 PROJECT BRIEF\\n14 11 19... | \n", "1000 | \n", "762 | \n", "
1 | \n", "b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIHDR\\x00\\x00\\... | \n", "3 3 11 10 B&W\\n77 3 87 10 QUALITY\\n15 4 74 9 Q... | \n", "1000 | \n", "762 | \n", "
2 | \n", "b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIHDR\\x00\\x00\\... | \n", "12 11 15 13 TO:\\n24 11 34 13 R. B. SPELL\\n64 1... | \n", "1000 | \n", "754 | \n", "
3 | \n", "b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIHDR\\x00\\x00\\... | \n", "28 6 73 9 SPORTS MARKETING ENTERPRISES DOCUMEN... | \n", "1000 | \n", "795 | \n", "
4 | \n", "b'\\x89PNG\\r\\n\\x1a\\n\\x00\\x00\\x00\\rIHDR\\x00\\x00\\... | \n", "18 8 25 9 S.P. Zolot\\n2 8 5 9 TO:\\n60 8 73 10 ... | \n", "1000 | \n", "754 | \n", "