aritheanalyst committed on
Commit
0751294
1 Parent(s): be153c8

Upload app.py

Files changed (1)
  1. app.py +291 -0
app.py ADDED
@@ -0,0 +1,291 @@
+ #**************** IMPORT PACKAGES ********************
+ import flask
+ from flask import render_template, jsonify, Flask, redirect, url_for, request, flash
+ from flask_cors import CORS, cross_origin
+ from werkzeug.utils import secure_filename
+ import numpy as np
+ import pytesseract as pt
+ import pdf2image
+ from fpdf import FPDF
+ import re
+ import nltk
+ from nltk.tokenize import sent_tokenize
+ from nltk.tokenize import word_tokenize
+ import os
+ import pdfkit
+ import yake
+ from transformers import AutoTokenizer, AutoModelForPreTraining, AutoModel, AutoConfig
+ from summarizer import Summarizer, TransformerSummarizer
+ from transformers import pipelines
+ # nltk.download('punkt')  # uncomment on first run; sent_tokenize/word_tokenize need the punkt data
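+ # Note: pytesseract requires the Tesseract OCR binary to be installed on the
+ # system, and pdf2image requires poppler; neither is installed by pip alone.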
+
+ print("Starting up...")
+
+
+ app = flask.Flask(__name__)
+ app.config["DEBUG"] = True
+ UPLOAD_FOLDER = './pdfs'
+
+ ALLOWED_EXTENSIONS = {'txt', 'pdf'}
+ app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
+
+ #***************** FLASK *****************************
+ CORS(app)
+
+
+ def allowed_file(filename):
+     return '.' in filename and \
+         filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
+
+
+
+ #model_name = 'laxya007/gpt2_legal'
+ #model_name = 'facebook/bart-large-cnn'
+ model_name = 'nlpaueb/legal-bert-base-uncased'
+
+
+ # Set up the Hugging Face model and tokenizer
+
+ print("Loading model config...")
+ custom_config = AutoConfig.from_pretrained(model_name)
+ custom_config.output_hidden_states = True
+ print("Loading tokenizer...")
+ custom_tokenizer = AutoTokenizer.from_pretrained(model_name)
+ print("Loading model...")
+ custom_model = AutoModel.from_pretrained(model_name, config=custom_config)  # needed by the Summarizer below
+ bert_legal_model = Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)
+ print('Using model {}\n'.format(model_name))
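+ # Note: Summarizer here is bert-extractive-summarizer; it embeds sentences
+ # using the model's hidden states (hence output_hidden_states=True above) and
+ # selects representative sentences by clustering, i.e. the summary is extractive.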
+
+
+
+ # main index page route
+ @app.route('/')
+ @cross_origin()
+ def index():
+     return render_template('index.html')
+
+ @app.route('/results')
+ @cross_origin()
+ def results():
+     return render_template('results.html')
+
+
+
+ @app.route('/predict', methods=['GET', 'POST'])
+ def uploads():
+     if request.method == 'GET':
+         # Read the input text and sentence count from the query string
+
+         numsent = int(request.args['number'])
+         text = str(request.args['text'])
+         content = text
+
+
+         summary_text = ""
+         for i, paragraph in enumerate(content.split("\n\n")):
+
+             paragraph = paragraph.replace('\n', ' ')
+             paragraph = paragraph.replace('\t', '')
+             paragraph = ' '.join(paragraph.split())
+             # tokenize and keep only alphabetic tokens
+             tokens = word_tokenize(paragraph)
+             tokens = [word for word in tokens if word.isalpha()]
+             # print("\nTokens: {}\n".format(len(tokens)))
+             # skip paragraphs with at most one real word
+             if len(tokens) <= 1:
+                 continue
+             # Perhaps also ignore paragraphs with no sentence?
+             sentences = sent_tokenize(paragraph)
+
+             paragraph = ' '.join(tokens)
+
+             print("\nParagraph:")
+             print(paragraph + "\n")
+             # T5 needs 'summarize:' prepended in order to work:
+             # text = "summarize:" + paragraph
+             text = paragraph
+
+             summary = bert_legal_model(text, min_length=8, ratio=0.05)
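+             # ratio=0.05 asks the summarizer to keep roughly 5% of the
+             # paragraph's sentences; min_length drops very short sentences.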
+             # summary = tokenizer_t5.decode(summary_ids[0], skip_special_tokens=True)
+             summary_text += str(summary) + "\n\n"
+             print("Summary:")
+             print(summary)
+
+         content2 = content.replace('\n', ' ')
+         content2 = content2.replace('\t', '')
+         summary = bert_legal_model(content2, min_length=8, num_sentences=25)
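+         # num_sentences=25 overrides ratio and requests a fixed 25-sentence
+         # summary of the full document.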
+
+
+
+         # write all to file for inspection and storage
+         all_text = "The Summary-- " + str(summary) + "\n\n\n" \
+             + "The Larger Summary-- " + str(summary_text)
+
+
+         # FPDF only supports latin-1, so replace unsupported characters
+         all_text2 = all_text.encode('latin-1', 'replace').decode('latin-1')
+         all_text2 = all_text2.replace('?', '.')
+         all_text2 = all_text2.replace('\n', ' ')
+         all_text2 = all_text2.replace('..', '.')
+         all_text2 = all_text2.replace(',.', ',')
+         all_text2 = all_text2.replace('-- ', '\n\n\n')
+
+         pdf = FPDF()
+
+         # Add a page
+         pdf.add_page()
+
+         pdf.set_font("Times", size=12)
+
+         # the assembled summary text to render
+         f = all_text2
+
+         # insert the text into the pdf
+         pdf.multi_cell(190, 10, txt=f, align='C')
+
+
+         # save the pdf as ./static/legal.pdf
+         pdf.output("./static/legal.pdf")
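+         # results.html is presumably set up to display /static/legal.pdf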
+
+
+         return render_template('results.html')
+     return None
+
+
+
+
+ @app.route('/predictpdf', methods=['GET', 'POST'])
+ def uploads2():
+     if request.method == 'POST':
+         # Get the file from the POST request
+
+         numsent = int(request.args['number'])
+         if 'file' not in request.files:
+             flash('No file part')
+             return redirect(request.url)
+         file = request.files['file']
+         # if the user does not select a file, the browser may
+         # submit an empty part without a filename
+         if file.filename == '':
+             flash('No selected file')
+             return redirect(request.url)
+         if file and allowed_file(file.filename):
+             filename = "legal.pdf"
+             file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))
+
+         f = request.files['file']
+         f.save(secure_filename(f.filename))
+
+
+         path = os.getcwd()
+         folder_name = 'pdfs'
+         path = os.path.join(path, folder_name)
+
+         list_of_files = []
+         for root, dirs, files in os.walk(path):
+             for file in files:
+                 if file.endswith(".pdf"):
+                     # print(os.path.join(root, file))
+                     list_of_files.append(os.path.join(root, file))
+
+         print("\nProcessing {} files...\n".format(len(list_of_files)))
+         total_pages = 0
+
+         for filename in list_of_files:
+             print(filename)
+             file = os.path.splitext(os.path.basename(filename))[0]
+             pages = pdf2image.convert_from_path(pdf_path=filename, dpi=400, size=(1654, 2340))
+             total_pages += len(pages)
+             print("\nProcessing the next {} pages...\n".format(len(pages)))
+
+             # Save all pages as images and convert them to text, except the last page
+             # TODO: extract this into a function
+             content = ""
+             dir_name = 'images/' + file + '/'
+             # create the image folder if it doesn't exist
+             os.makedirs(dir_name, exist_ok=True)
+             for i in range(len(pages) - 1):
+                 pages[i].save(dir_name + str(i) + '.jpg')
+                 # OCR the page image using Google's tesseract
+                 content += pt.image_to_string(pages[i])
+
+             summary_text = ""
+             for i, paragraph in enumerate(content.split("\n\n")):
+
+                 paragraph = paragraph.replace('\n', ' ')
+                 paragraph = paragraph.replace('\t', '')
+                 paragraph = ' '.join(paragraph.split())
+                 # tokenize and keep only alphabetic tokens
+                 tokens = word_tokenize(paragraph)
+                 tokens = [word for word in tokens if word.isalpha()]
+                 # print("\nTokens: {}\n".format(len(tokens)))
+                 # skip paragraphs with at most one real word
+                 if len(tokens) <= 1:
+                     continue
+                 # Perhaps also ignore paragraphs with no sentence?
+                 sentences = sent_tokenize(paragraph)
+
+                 paragraph = ' '.join(tokens)
+
+                 print("\nParagraph:")
+                 print(paragraph + "\n")
+                 # T5 needs 'summarize:' prepended in order to work:
+                 # text = "summarize:" + paragraph
+                 text = paragraph
+
+                 summary = bert_legal_model(text, min_length=8, ratio=0.05)
+                 # summary = tokenizer_t5.decode(summary_ids[0], skip_special_tokens=True)
+                 summary_text += str(summary) + "\n\n"
+                 print("Summary:")
+                 print(summary)
+
+             content2 = content.replace('\n', ' ')
+             content2 = content2.replace('\t', '')
+             summary = bert_legal_model(content2, min_length=8, num_sentences=25)
+
+
+
+             # write all to file for inspection and storage
+             all_text = "The Summary-- " + str(summary) + "\n\n\n" \
+                 + "The Larger Summary-- " + str(summary_text)
+
+
+             # FPDF only supports latin-1, so replace unsupported characters
+             all_text2 = all_text.encode('latin-1', 'replace').decode('latin-1')
+             all_text2 = all_text2.replace('?', '.')
+             all_text2 = all_text2.replace('\n', ' ')
+             all_text2 = all_text2.replace('..', '.')
+             all_text2 = all_text2.replace(',.', ',')
+             all_text2 = all_text2.replace('-- ', '\n\n\n')
+
+             pdf = FPDF()
+
+             # Add a page
+             pdf.add_page()
+
+             pdf.set_font("Times", size=12)
+
+             # the assembled summary text to render
+             f = all_text2
+
+             # insert the text into the pdf
+             pdf.multi_cell(190, 10, txt=f, align='C')
+
+
+             # save the pdf as ./static/legal.pdf
+             pdf.output("./static/legal.pdf")
+
+
+         return render_template('results.html')
+     return None
+
+
+ if __name__ == "__main__":
+     app.run(host='0.0.0.0', port=8000, debug=True)
+
+
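+ # Example usage (a sketch; assumes the server is reachable on localhost:8000):
+ #   curl "http://localhost:8000/predict?number=5&text=Some+legal+text+to+summarize"
+ # The generated summary is saved to ./static/legal.pdf and results.html is rendered.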