Soumen committed on
Commit
3e4f1f9
1 Parent(s): f9d9653
Files changed (1) hide show
  1. app.py +19 -4
app.py CHANGED
@@ -32,7 +32,7 @@ import docx2txt
32
  from PIL import Image
33
  from PyPDF2 import PdfFileReader
34
  import pdfplumber
35
-
36
  # NLP Pkgs
37
  from textblob import TextBlob
38
  import spacy
@@ -41,6 +41,7 @@ import requests
41
  import cv2
42
  import numpy as np
43
  import pytesseract
 
44
  #pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
45
  from PIL import Image
46
  def read_pdf(file):
@@ -49,14 +50,17 @@ def read_pdf(file):
49
  all_page_text = ""
50
  for i in range(count):
51
  page = pdfReader.getPage(i)
52
- all_page_text += page.extractText()
 
 
 
53
 
54
  return all_page_text
55
 
56
  #def read_pdf_with_pdfplumber(file):
57
  # with pdfplumber.open(file) as pdf:
58
  # page = pdf.pages[0]
59
- # return page.extract_text()s
60
  st.title("Streamlit NLP APP")
61
  @st.experimental_singleton
62
  def text_analyzer(my_text):
@@ -107,7 +111,18 @@ def main():
107
  img = Image.open(uploaded_photo)
108
  img = img.save("img.png")
109
  img = cv2.imread("img.png")
110
- text = pytesseract.image_to_string(img, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(img)
 
 
 
 
 
 
 
 
 
 
 
111
  st.success(text)
112
  elif camera_photo:
113
  img = Image.open(camera_photo)
 
32
  from PIL import Image
33
  from PyPDF2 import PdfFileReader
34
  import pdfplumber
35
+ from line_cor import mark_region
36
  # NLP Pkgs
37
  from textblob import TextBlob
38
  import spacy
 
41
  import cv2
42
  import numpy as np
43
  import pytesseract
44
+
45
  #pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
46
  from PIL import Image
47
  def read_pdf(file):
 
50
  all_page_text = ""
51
  for i in range(count):
52
  page = pdfReader.getPage(i)
53
+ image_name = "Page_" + str(i) + ".jpg"
54
+ page.save(image_name, "JPEG")
55
+ text = pytesseract.image_to_string(image_name, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(image_name)
56
+ all_page_text += text + " " #page.extractText()
57
 
58
  return all_page_text
59
 
60
  #def read_pdf_with_pdfplumber(file):
61
  # with pdfplumber.open(file) as pdf:
62
  # page = pdf.pages[0]
63
+ # return page.extract_text()
64
  st.title("Streamlit NLP APP")
65
  @st.experimental_singleton
66
  def text_analyzer(my_text):
 
111
  img = Image.open(uploaded_photo)
112
  img = img.save("img.png")
113
  img = cv2.imread("img.png")
114
+ # get co-ordinates to crop the image
115
+ image, lc = mark_region(img)
116
+ c = lc[1]
117
+ # cropping image img = image[y0:y1, x0:x1]
118
+ img = image[c[0][1]:c[1][1], c[0][0]:c[1][0]]
119
+ plt.figure(figsize=(10,10))
120
+ plt.imshow(img)
121
+ # convert the image to black and white for better OCR
122
+ ret,thresh1 = cv2.threshold(img,120,255,cv2.THRESH_BINARY)
123
+ # pytesseract image to string to get results
124
+ text = str(pytesseract.image_to_string(thresh1, config='--psm 6',lang="ben")) if st.checkbox("Bangla") else str(pytesseract.image_to_string(thresh1, config='--psm 6'))
125
+ #text = pytesseract.image_to_string(img, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(img)
126
  st.success(text)
127
  elif camera_photo:
128
  img = Image.open(camera_photo)