Update app.py
Browse files
app.py
CHANGED
@@ -33,7 +33,7 @@ from PIL import Image
|
|
33 |
from PyPDF2 import PdfFileReader
|
34 |
from pdf2image import convert_from_bytes
|
35 |
import pdfplumber
|
36 |
-
from line_cor import mark_region
|
37 |
import pdf2image
|
38 |
|
39 |
|
@@ -48,6 +48,42 @@ import pytesseract
|
|
48 |
|
49 |
#pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
|
50 |
from PIL import Image
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
@st.experimental_singleton
|
52 |
def read_pdf(file):
|
53 |
images=pdf2image.convert_from_path(file)
|
|
|
33 |
from PyPDF2 import PdfFileReader
|
34 |
from pdf2image import convert_from_bytes
|
35 |
import pdfplumber
|
36 |
+
#from line_cor import mark_region
|
37 |
import pdf2image
|
38 |
|
39 |
|
|
|
48 |
|
49 |
#pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
|
50 |
from PIL import Image
|
51 |
+
def mark_region(im):
    """Outline candidate text regions on ``im`` and return their boxes.

    The image is converted to grayscale, blurred, adaptively thresholded
    (inverted) and dilated so neighbouring glyphs merge into blobs. The
    external contours of those blobs are then filtered by position and
    area; every accepted contour gets a magenta rectangle drawn on ``im``
    and its box recorded.

    Args:
        im: Image array passed to ``cv2.cvtColor`` with ``COLOR_BGR2GRAY``
            (so presumably a 3-channel BGR array -- confirm with callers).
            It is drawn on in place.

    Returns:
        A tuple ``(image, line_items_coordinates)``. ``image`` is the
        annotated image; ``line_items_coordinates`` is a list of
        ``[(x, y), (2200, y + h)]`` entries, one per rectangle drawn.
        When no contour passes the filters, ``image`` is ``im`` unchanged
        and the list is empty.
    """
    gray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (9, 9), 0)
    thresh = cv2.adaptiveThreshold(
        blur, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 11, 30
    )

    # Dilate to combine adjacent text contours
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (9, 9))
    dilate = cv2.dilate(thresh, kernel, iterations=4)

    # Find contours, highlight text areas, and extract ROIs
    cnts = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    # findContours returns 2 or 3 values depending on the OpenCV version;
    # pick whichever element holds the contour list.
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]

    # Start from the input so the return value is always defined, even when
    # no contour passes the filters below (previously an UnboundLocalError).
    image = im
    line_items_coordinates = []
    for c in cnts:
        x, y, _w, h = cv2.boundingRect(c)
        # Every box is stretched to a fixed right edge at x = 2200.
        # NOTE(review): 600/1000/2400/2000/2200 are hard-coded pixel values,
        # presumably tuned for one page size -- confirm before reusing.
        top_left = (x, y)
        bottom_right = (2200, y + h)

        if y >= 600 and x <= 1000 and cv2.contourArea(c) > 10000:
            image = cv2.rectangle(im, top_left, bottom_right, color=(255, 0, 255), thickness=3)
            line_items_coordinates.append([top_left, bottom_right])

        # Independent of the check above: a contour matching both filters is
        # drawn and recorded twice, as in the original code.
        if y >= 2400 and x <= 2000:
            image = cv2.rectangle(im, top_left, bottom_right, color=(255, 0, 255), thickness=3)
            line_items_coordinates.append([top_left, bottom_right])

    return image, line_items_coordinates
|
83 |
+
|
84 |
+
|
85 |
+
|
86 |
+
|
87 |
@st.experimental_singleton
|
88 |
def read_pdf(file):
|
89 |
images=pdf2image.convert_from_path(file)
|