Soumen committed on
Commit
f4332f9
1 Parent(s): 5aeb295

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -1
app.py CHANGED
@@ -33,7 +33,7 @@ from PIL import Image
33
  from PyPDF2 import PdfFileReader
34
  from pdf2image import convert_from_bytes
35
  import pdfplumber
36
- from line_cor import mark_region
37
  import pdf2image
38
 
39
 
@@ -48,6 +48,42 @@ import pytesseract
48
 
49
  #pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
50
  from PIL import Image
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  @st.experimental_singleton
52
  def read_pdf(file):
53
  images=pdf2image.convert_from_path(file)
 
33
  from PyPDF2 import PdfFileReader
34
  from pdf2image import convert_from_bytes
35
  import pdfplumber
36
+ #from line_cor import mark_region
37
  import pdf2image
38
 
39
 
 
48
 
49
  #pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
50
  from PIL import Image
51
def mark_region(im):
    """Outline detected text regions on ``im`` and collect their box coordinates.

    The image is converted to grayscale, blurred, adaptively thresholded
    (inverted) and dilated so that neighbouring text merges into larger
    blobs. The bounding box of each external contour is then tested against
    fixed pixel thresholds; qualifying regions are outlined on ``im`` and
    recorded.

    Args:
        im: Already-loaded image array. It is converted with
            ``cv2.COLOR_BGR2GRAY``, so it is presumably a BGR image as
            produced by OpenCV -- TODO confirm against callers.

    Returns:
        A tuple ``(image, line_items_coordinates)``. ``image`` is ``im``
        with the rectangles drawn on it, and ``line_items_coordinates`` is a
        list of ``[(x, y), (2200, y + h)]`` corner pairs, one per rectangle
        drawn.

    Note:
        ``im`` is modified in place by the drawing calls.
        The pixel thresholds (600, 1000, 2400, 2000, 10000) and the fixed
        right edge (2200) are hard-coded; presumably tuned for one page
        layout and render resolution -- verify before reusing elsewhere.
    """
    gray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (9, 9), 0)
    thresh = cv2.adaptiveThreshold(
        blur, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 11, 30
    )

    # Dilate to combine adjacent text contours.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (9, 9))
    dilate = cv2.dilate(thresh, kernel, iterations=4)

    # Find contours, highlight text areas, and extract ROIs.
    # findContours returns 2 or 3 values depending on the OpenCV version;
    # select the contour list in either case.
    cnts = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]

    # cv2.rectangle draws on ``im`` in place and returns it, so seeding
    # ``image`` with ``im`` keeps the result identical when something is
    # drawn while avoiding an UnboundLocalError when no contour qualifies.
    image = im
    line_items_coordinates = []
    for c in cnts:
        area = cv2.contourArea(c)
        x, y, _w, h = cv2.boundingRect(c)

        if y >= 600 and x <= 1000 and area > 10000:
            image = cv2.rectangle(
                im, (x, y), (2200, y + h), color=(255, 0, 255), thickness=3
            )
            line_items_coordinates.append([(x, y), (2200, y + h)])

        # Independent second rule with no area filter. A contour matching
        # both rules is drawn and recorded twice, as in the original code.
        if y >= 2400 and x <= 2000:
            image = cv2.rectangle(
                im, (x, y), (2200, y + h), color=(255, 0, 255), thickness=3
            )
            line_items_coordinates.append([(x, y), (2200, y + h)])

    return image, line_items_coordinates
83
+
84
+
85
+
86
+
87
  @st.experimental_singleton
88
  def read_pdf(file):
89
  images=pdf2image.convert_from_path(file)