BMukhtar committed on
Commit
2d38cb5
1 Parent(s): e4045bf
app.py CHANGED
@@ -1,4 +1,3 @@
1
-
2
  import streamlit as st
3
  from PIL import Image
4
  import os
@@ -10,7 +9,6 @@ from pdf2image import convert_from_bytes
10
  #from st_btn_group import st_btn_group
11
  #from streamlit_option_menu import option_menu
12
  import docx
13
- from docx.shared import Pt
14
  from io import BytesIO
15
  #import streamlit.components.v1 as components
16
  import base64
@@ -118,12 +116,12 @@ col1, col2 = st.columns(2)
118
  # return image, result
119
  import time
120
 
121
- max_page = 100
122
  def recognize_page_image(image):
123
  start = time.time()
124
  result = [[0,"Sample 1"],[1,"Sample 2"]]
125
- result = reader.readtext(np.array(image), paragraph=False)
126
- result = get_paragraph(result)
127
  end = time.time()
128
  return result,(end-start)
129
 
@@ -164,7 +162,7 @@ def process_pdf(uploaded_file):
164
  button_group.write(button_group_html,unsafe_allow_html=True)
165
  #col1.write("</div>",unsafe_allow_html=True)
166
  progress_bar.progress(0.99,text=f'{min(total_pages,max_page)} бет жүктелді')
167
-
168
  def get_paragraph(raw_result, x_ths=1, y_ths=0.5, mode = 'ltr'):
169
  # create basic attributes
170
  box_group = []
@@ -188,18 +186,24 @@ def get_paragraph(raw_result, x_ths=1, y_ths=0.5, mode = 'ltr'):
188
  else:
189
  current_box_group = [box for box in box_group if box[7]==current_group]
190
  mean_height = np.mean([box[5] for box in current_box_group])
191
- min_gx = min([box[1] for box in current_box_group]) - x_ths*mean_height
192
- max_gx = max([box[2] for box in current_box_group]) + x_ths*mean_height
193
- min_gy = min([box[3] for box in current_box_group]) - y_ths*mean_height
194
- max_gy = max([box[4] for box in current_box_group]) + y_ths*mean_height
195
  add_box = False
196
- for box in box_group0:
197
- same_horizontal_level = (min_gx<=box[1]<=max_gx) or (min_gx<=box[2]<=max_gx)
198
- same_vertical_level = (min_gy<=box[3]<=max_gy) or (min_gy<=box[4]<=max_gy)
199
- if same_horizontal_level and same_vertical_level:
200
- box[7] = current_group
201
- add_box = True
202
- break
 
 
 
 
 
 
203
  # cannot add more box, go to next group
204
  if add_box==False:
205
  current_group += 1
@@ -233,6 +237,7 @@ def get_paragraph(raw_result, x_ths=1, y_ths=0.5, mode = 'ltr'):
233
 
234
  return result
235
 
 
236
  if uploaded_file is not None:
237
  if uploaded_file.type == "application/pdf":
238
  placeholder = col2.empty()
@@ -247,9 +252,9 @@ if uploaded_file is not None:
247
  image = Image.open(uploaded_file)
248
  #with open(os.path.join("tempDir",image_file))
249
  col1.image(image)
250
- result = reader.readtext(np.array(image), paragraph=True)
 
251
  result_text = "\n\n".join([item[1] for item in result])
252
  button_group_html = generateButtonGroup(result)
253
  col2.write(button_group_html, unsafe_allow_html=True)
254
- col2.markdown(result_text)
255
-
 
 
1
  import streamlit as st
2
  from PIL import Image
3
  import os
 
9
  #from st_btn_group import st_btn_group
10
  #from streamlit_option_menu import option_menu
11
  import docx
 
12
  from io import BytesIO
13
  #import streamlit.components.v1 as components
14
  import base64
 
116
  # return image, result
117
  import time
118
 
119
+ max_page = 5
120
  def recognize_page_image(image):
121
  start = time.time()
122
  result = [[0,"Sample 1"],[1,"Sample 2"]]
123
+ result = reader.readtext(np.array(image), batch_size=64, paragraph=False, y_ths=0, width_ths = 0)
124
+ result = get_paragraph(result, y_ths=0, x_ths = 0)
125
  end = time.time()
126
  return result,(end-start)
127
 
 
162
  button_group.write(button_group_html,unsafe_allow_html=True)
163
  #col1.write("</div>",unsafe_allow_html=True)
164
  progress_bar.progress(0.99,text=f'{min(total_pages,max_page)} бет жүктелді')
165
+
166
  def get_paragraph(raw_result, x_ths=1, y_ths=0.5, mode = 'ltr'):
167
  # create basic attributes
168
  box_group = []
 
186
  else:
187
  current_box_group = [box for box in box_group if box[7]==current_group]
188
  mean_height = np.mean([box[5] for box in current_box_group])
189
+ # min_gx = min([box[1] for box in current_box_group]) - x_ths*mean_height
190
+ # max_gx = max([box[2] for box in current_box_group]) + x_ths*mean_height
191
+ # min_gy = min([box[3] for box in current_box_group]) - y_ths*mean_height
192
+ # max_gy = max([box[4] for box in current_box_group]) + y_ths*mean_height
193
  add_box = False
194
+
195
+ for box in current_box_group:
196
+ min_gx = box[1] - x_ths*mean_height
197
+ max_gx = box[2] + x_ths*mean_height
198
+ min_gy = box[3] - y_ths*mean_height
199
+ max_gy = box[4] + y_ths*mean_height
200
+ for box in box_group0:
201
+ same_horizontal_level = (min_gx<=box[1]<=max_gx) or (min_gx<=box[2]<=max_gx)
202
+ same_vertical_level = (min_gy<=box[6]<=max_gy)
203
+ if same_horizontal_level and same_vertical_level:
204
+ box[7] = current_group
205
+ add_box = True
206
+ break
207
  # cannot add more box, go to next group
208
  if add_box==False:
209
  current_group += 1
 
237
 
238
  return result
239
 
240
+
241
  if uploaded_file is not None:
242
  if uploaded_file.type == "application/pdf":
243
  placeholder = col2.empty()
 
252
  image = Image.open(uploaded_file)
253
  #with open(os.path.join("tempDir",image_file))
254
  col1.image(image)
255
+ result = reader.readtext(np.array(image), batch_size=64, paragraph=False, y_ths=0, width_ths = 0)
256
+ result = get_paragraph(result, y_ths=0)
257
  result_text = "\n\n".join([item[1] for item in result])
258
  button_group_html = generateButtonGroup(result)
259
  col2.write(button_group_html, unsafe_allow_html=True)
260
+ col2.markdown(result_text)
 
models/__pycache__/best_norm_ED.cpython-310.pyc CHANGED
Binary files a/models/__pycache__/best_norm_ED.cpython-310.pyc and b/models/__pycache__/best_norm_ED.cpython-310.pyc differ
 
models/best_norm_ED.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a93677c37a1bc9a268eb362df2772fbc9a5237b375740254e63063be2cebf6a4
3
  size 15217067
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0a93dd748a84d3998efccee420e3cabdf6b1693d3411374e871bcdb8c078169
3
  size 15217067