muryshev committed on
Commit
eeebb29
·
1 Parent(s): 1554eeb
Files changed (4) hide show
  1. Dockerfile +8 -5
  2. app.py +24 -3
  3. lib/ocr_1.py +236 -0
  4. requirements.txt +5 -1
Dockerfile CHANGED
@@ -8,21 +8,24 @@ ENV APP_HOME /app
8
 
9
  # Install Tesseract and its dependencies
10
  RUN apt-get update && apt-get install --no-install-recommends -y \
11
- tesseract-ocr \
12
- tesseract-ocr-rus poppler-utils && \
13
  rm -rf /var/lib/apt/lists/*
14
 
15
  # Create and set the working directory
16
  RUN mkdir /var/www
17
  RUN mkdir /var/www/tmp
18
- RUN chmod +w /var/www/tmp
 
 
 
 
19
  ENV HOME /var/www
20
  WORKDIR /var/www
21
  COPY . /var/www
22
 
23
  RUN pip install --no-cache-dir -r requirements.txt
24
-
25
  EXPOSE 7860
26
 
27
  # Run the Flask application
28
- CMD flask run --host=0.0.0.0 --port=7860
 
8
 
9
  # Install Tesseract and its dependencies
10
  RUN apt-get update && apt-get install --no-install-recommends -y \
11
+ tesseract-ocr tesseract-ocr-rus poppler-utils python3-opencv && \
 
12
  rm -rf /var/lib/apt/lists/*
13
 
14
  # Create and set the working directory
15
  RUN mkdir /var/www
16
  RUN mkdir /var/www/tmp
17
+ RUN chmod a+w /var/www/tmp
18
+
19
+ RUN groupadd -r flaskuser && useradd -r -g flaskuser flaskuser
20
+
21
+
22
  ENV HOME /var/www
23
  WORKDIR /var/www
24
  COPY . /var/www
25
 
26
  RUN pip install --no-cache-dir -r requirements.txt
27
+ USER flaskuser
28
  EXPOSE 7860
29
 
30
  # Run the Flask application
31
+ CMD flask run --host=0.0.0.0 --port=7860
app.py CHANGED
@@ -4,6 +4,7 @@ from flask import Flask, request, jsonify
4
  import pytesseract
5
  from pdf2image import convert_from_bytes
6
  from flask_cors import CORS
 
7
 
8
  os.environ['TESSDATA_PREFIX'] = '/usr/share/tesseract-ocr/5/tessdata'
9
 
@@ -13,7 +14,7 @@ UPLOAD_FOLDER = './tmp'
13
  app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
14
 
15
  # Endpoint for uploading PDF and extracting text
16
- @app.route('/upload', methods=['POST'])
17
  def upload_file():
18
  # Check if the post request has the file part
19
  if 'file' not in request.files:
@@ -41,14 +42,34 @@ def upload_file():
41
  # text += pytesseract.image_to_string(img, lang='rus')
42
 
43
 
44
- # присрать сюда вызов библиотеки Андрея с temp_path
45
 
46
 
47
  os.remove(temp_path)
48
 
49
- return jsonify({'text': text})
50
  else:
51
  return jsonify({'error': 'File must be a PDF'})
52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  if __name__ == '__main__':
54
  app.run(debug=True)
 
4
  import pytesseract
5
  from pdf2image import convert_from_bytes
6
  from flask_cors import CORS
7
+ from lib import ocr_1
8
 
9
  os.environ['TESSDATA_PREFIX'] = '/usr/share/tesseract-ocr/5/tessdata'
10
 
 
14
  app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
15
 
16
  # Endpoint for uploading PDF and extracting text
17
+ @app.route('/recognize', methods=['POST'])
18
  def upload_file():
19
  # Check if the post request has the file part
20
  if 'file' not in request.files:
 
42
  # text += pytesseract.image_to_string(img, lang='rus')
43
 
44
 
45
+ docs_info = ocr_1.processSingleFile(temp_path)
46
 
47
 
48
  os.remove(temp_path)
49
 
50
+ return jsonify(docs_info)
51
  else:
52
  return jsonify({'error': 'File must be a PDF'})
53
 
54
# Endpoint that splits posted text into two groups of words (example analysis)
@app.route('/analize', methods=['POST'])
def analize():
    """Split the posted text into two word groups and return them as JSON.

    Expects a JSON object with a string field ``text``. The text is split on
    whitespace; the first half of the words goes to group 1 and the rest to
    group 2 (group 2 receives the extra word when the count is odd).

    Returns:
        A JSON list ``[group1, group2]`` where every element of a group is a
        single-key dictionary mapping the group's parameter name to one word.
        If the body is not a JSON object with a string ``text`` field, a JSON
        ``{'error': ...}`` object is returned with HTTP status 400.
    """
    # silent=True yields None instead of raising on a missing/invalid JSON
    # body, so every malformed request ends up in the same 400 branch below.
    payload = request.get_json(silent=True)
    text_data = payload.get('text') if isinstance(payload, dict) else None
    if not isinstance(text_data, str):
        return jsonify({'error': "JSON body must contain a string field 'text'"}), 400

    # Example processing: split the words into two groups.  The split point
    # is derived from the number of words, not from the number of characters
    # in the text, so the groups really are halves of the word list.
    words = text_data.split()
    middle = len(words) // 2

    group1 = [{"название параметра группы 1": word} for word in words[:middle]]
    group2 = [{"название параметра группы 2": word} for word in words[middle:]]

    # Return the JSON response
    return jsonify([group1, group2])
73
+
74
  if __name__ == '__main__':
75
  app.run(debug=True)
lib/ocr_1.py ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from PIL import Image, ImageFilter
2
+ import cv2
3
+ import pytesseract
4
+ from pytesseract import Output
5
+ from os import listdir
6
+ from os.path import isfile, join
7
+ import numpy as np
8
+ import json
9
+ import matplotlib.pyplot as plt
10
+ from pdf2image import convert_from_path
11
+ from matplotlib import pyplot as plt
12
+ import re
13
+
14
+ def processFiles(pdfs, verbose = False) :
15
+ images_per_pdf_2d = [convert_from_path(file) for file in pdfs]
16
+
17
+ images_per_pdf = []
18
+ docfilenames = []
19
+ pagenames = []
20
+ fileindices = []
21
+ for i in range(len(images_per_pdf_2d)) :
22
+ docfilenames.append(pdfs[i][:-4])
23
+ pageindices = []
24
+ for j in range(len(images_per_pdf_2d[i])) :
25
+ images_per_pdf.append(images_per_pdf_2d[i][j])
26
+ pagenames.append(pdfs[i][:-4] + '_page_' + str(j))
27
+ pageindices.append(len(pagenames) - 1)
28
+ # print(i, j, len(pagenames) - 1, pagenames[-1])
29
+
30
+ fileindices.append(pageindices)
31
+
32
+ gray_images_per_pdf_cropped = []
33
+ for i in range(len(images_per_pdf)) :
34
+ image = images_per_pdf[i]
35
+ crop = image.convert("L").crop((
36
+ 750, 150, # left top point
37
+ 1654, 850 # right bottom point
38
+ ))
39
+ gray_images_per_pdf_cropped.append(crop)
40
+
41
+ texts = [pytesseract.image_to_string(image, lang='rus') for image in gray_images_per_pdf_cropped]
42
+ fulltexts = [pytesseract.image_to_string(image, lang='rus') for image in images_per_pdf]
43
+
44
+ cropped_images = gray_images_per_pdf_cropped
45
+ init_size = cropped_images[0].size
46
+ thresh_imgs = [
47
+ image.resize(
48
+ (init_size[0] //4, init_size[1] // 4)
49
+ ).point(
50
+ lambda x: 0 if x < 220 else 255
51
+ ).filter(
52
+ ImageFilter.MedianFilter(5)
53
+ ).filter(
54
+ ImageFilter.MinFilter(15) #15
55
+ ) for i,(name,image) in enumerate(zip(pagenames, cropped_images))
56
+ ]
57
+
58
+ masks = thresh_imgs
59
+ masks_arr = [np.array(img) for img in masks]
60
+ mask_shape = masks_arr[0].shape
61
+
62
+ str_size = 7
63
+ masks = []
64
+ masks_bw = []
65
+ for name, mask in zip(pagenames, masks_arr):
66
+ cleaned_mask = mask.copy()
67
+
68
+ for iter in range(mask_shape[0] // str_size):
69
+ temp_mean = int(cleaned_mask[iter*str_size : iter*str_size + str_size, :].mean())
70
+
71
+ if (temp_mean < 49) or (temp_mean > 160):
72
+ cleaned_mask[iter*str_size : iter*str_size + str_size, :] = 255
73
+
74
+ vertical_threshold = 200
75
+
76
+ for i in range(mask_shape[1] // str_size + 1):
77
+ if (i*str_size + str_size) > mask_shape[1]:
78
+ temp_mean_vertical = int(cleaned_mask[:, i*str_size : mask_shape[1]].mean())
79
+
80
+ if temp_mean_vertical > vertical_threshold:
81
+ cleaned_mask[:, i*str_size : mask_shape[1]] = 255
82
+ else:
83
+ temp_mean_vertical = int(cleaned_mask[:, i*str_size : i*str_size + str_size].mean())
84
+
85
+ if temp_mean_vertical > vertical_threshold:
86
+ cleaned_mask[:, i*str_size : i*str_size + str_size] = 255
87
+
88
+ image = Image.fromarray(cleaned_mask).filter(
89
+ ImageFilter.MedianFilter(13)
90
+ ).filter(
91
+ ImageFilter.MinFilter(25) #15
92
+ )
93
+ masks.append(image)
94
+ masks_bw.append(image.convert('1'))
95
+
96
+ masks_bw_arr = [np.array(img) for img in masks_bw]
97
+
98
+ # check which pages have address box: if there is no address box the mask is empty
99
+
100
+ addressexists = [bool((~mask_bw).sum()) for mask_bw in masks_bw_arr]
101
+
102
+ # this is a list of CB names that may be used in address
103
+
104
+ CBnames = [
105
+ 'цб рф',
106
+ 'центральный банк',
107
+ 'центрального банка',
108
+ 'банк россии',
109
+ 'банка россии',
110
+ ]
111
+
112
+ # check which pages have address box addressed to CB
113
+
114
+ toCB = []
115
+ for i in range(len(addressexists)) :
116
+ iftoCB = False
117
+ for j in range(len(CBnames)) :
118
+ if addressexists[i] and CBnames[j] in texts[i].lower() :
119
+ iftoCB = True
120
+ break
121
+
122
+ toCB.append(iftoCB)
123
+
124
+ # build 3-level list: file -> doc -> page
125
+
126
+ docindices = []
127
+ doctypes = []
128
+ for i in range(len(fileindices)) :
129
+ docs = []
130
+ types = []
131
+ pages = []
132
+ doctype = False
133
+ for j in range(len(fileindices[i])) :
134
+ index = fileindices[i][j]
135
+ ifaddress = addressexists[index]
136
+ iftoCB = toCB[index]
137
+ if ifaddress :
138
+ if len(pages) > 0 :
139
+ docs.append(pages)
140
+ types.append(doctype)
141
+
142
+ pages = []
143
+ doctype = iftoCB
144
+
145
+ pages.append(index)
146
+
147
+ docs.append(pages)
148
+ types.append(doctype)
149
+ docindices.append(docs)
150
+ doctypes.append(types)
151
+
152
+ cropped = cropped_images
153
+ orig_size = cropped[0].size
154
+ masks = [mask.convert('L').resize((orig_size)) for mask in masks]
155
+
156
+ if verbose :
157
+ for i in range(len(masks)) :
158
+ img = np.array(masks[i])
159
+ out = np.array(cropped[i])
160
+
161
+ bw = cv2.inRange(img, 0, 12)
162
+ contours, hierarchy = cv2.findContours(bw, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
163
+
164
+ aaa = cv2.drawContours(out, contours, -1, (0, 255, 0), 5, cv2.LINE_AA, hierarchy, 1)
165
+
166
+ print()
167
+ print(pagenames[i])
168
+ print('Address exists :', addressexists[i])
169
+ print('To CB :', toCB[i])
170
+ # if addressflags[i] :
171
+
172
+ # if toCB[i] :
173
+ # print('text :', texts[i])
174
+ plt.imshow(Image.fromarray(aaa))
175
+ plt.show()
176
+
177
+ # print recognized text with marks: file - > doc # and doc type -> page number and text
178
+
179
+ docs_info = []
180
+ for i in range(len(docindices)) :
181
+ docs = []
182
+ if verbose :
183
+ print('File =', docfilenames[i])
184
+
185
+ for j in range(len(docindices[i])) :
186
+ doc = {}
187
+ doctype = 'Сопроводительное письмо'
188
+ if doctypes[i][j] :
189
+ doctype = 'Обращение'
190
+
191
+ doc['Тип документа'] = doctype
192
+ text = ''
193
+ if verbose :
194
+ print('Doc =', j, 'Type =', doctype)
195
+
196
+ for k in range(len(docindices[i][j])) :
197
+ index = docindices[i][j][k]
198
+ text += fulltexts[index]
199
+ if verbose :
200
+ print('Page =', pagenames[index])
201
+ print(fulltexts[index])
202
+ print('--- end of page ---')
203
+ print()
204
+
205
+ text = re.sub(r'\n +', r'\n', text)
206
+ text = re.sub(r'\n+', r'\n', text)
207
+ doc['Текст документа'] = text
208
+ docs.append(doc)
209
+
210
+ docs_info.append(docs)
211
+
212
+ for i in range(len(docindices)) :
213
+ for j in range(len(docindices[i])) :
214
+ for k in range(len(docindices[i][j])) :
215
+ index = docindices[i][j][k]
216
+ if toCB[index] :
217
+ if verbose :
218
+ print('Page =', pagenames[index])
219
+ print(texts[index].strip())
220
+ print('------------------------')
221
+ print()
222
+
223
+ return docs_info
224
+
225
def processSingleFile(file):
    """Process one PDF path by handing it to processFiles as a one-item list.

    Returns whatever processFiles returns for ``[file]``: a list with a
    single entry (for this file) that holds the file's document
    dictionaries.
    """
    single_file_batch = [file]
    return processFiles(single_file_batch)
227
+
228
+ # docs_info (for one file) =
229
+ # [
230
+ # {
231
+ # 'Field name' : 'Field text',
232
+ # ...
233
+ # },
234
+ # ...
235
+ # ]
236
+ # That is, a list of the documents contained in one file; each document is a dictionary 'Field name' : 'Field text' (currently two fields per document: 'Тип документа' and 'Текст документа'). processFiles returns one such list per input file.
requirements.txt CHANGED
@@ -1,4 +1,8 @@
1
  flask
2
  flask-cors
3
  pytesseract
4
- pdf2image
 
 
 
 
 
1
  flask
2
  flask-cors
3
  pytesseract
4
+ pdf2image
5
+ opencv-python
6
+ matplotlib
7
+ numpy
8
+ pillow