legalsummarizer / app.py
#**************** IMPORT PACKAGES ********************
import flask
from flask import render_template, jsonify, Flask, redirect, url_for, request, flash
from flask_cors import CORS, cross_origin
from werkzeug.utils import secure_filename
import numpy as np
import pytesseract as pt
import pdf2image
from fpdf import FPDF
import re
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import os
import pdfkit
import yake
from transformers import AutoTokenizer, AutoModelForPreTraining, AutoModel, AutoConfig
from summarizer import Summarizer, TransformerSummarizer
from transformers import pipelines
# nltk.download('punkt')  # uncomment if the 'punkt' data (needed by sent_tokenize / word_tokenize) is not already installed
print("lets go")
app = flask.Flask(__name__)
app.config["DEBUG"] = True
UPLOAD_FOLDER = './pdfs'
ALLOWED_EXTENSIONS = {'txt', 'pdf'}
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
#***************** FLASK *****************************
CORS(app)
def allowed_file(filename):
    return '.' in filename and \
           filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
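# Illustrative behaviour (not part of the original file): allowed_file('contract.pdf')
# returns True, allowed_file('notes.docx') returns False, since only the extensions
# listed in ALLOWED_EXTENSIONS are accepted.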
#model_name = 'laxya007/gpt2_legal'
#model_name = 'facebook/bart-large-cnn'
model_name = 'nlpaueb/legal-bert-base-uncased'
# Set up the Hugging Face config, tokenizer and model used for summarization
print("lets go")
custom_config = AutoConfig.from_pretrained(model_name)
custom_config.output_hidden_states = True
custom_tokenizer = AutoTokenizer.from_pretrained(model_name)
custom_model = AutoModel.from_pretrained(model_name, config=custom_config)
bert_legal_model = Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)
print('Using model {}\n'.format(model_name))
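
# Note (added for clarity): `Summarizer` comes from the bert-extractive-summarizer
# package. It is an extractive summarizer: it embeds sentences with the model above
# (nlpaueb/legal-bert-base-uncased, with hidden states exposed via the config) and
# selects the most representative sentences, so the output is a subset of the input.
# The calls below control the output size with either `ratio` (roughly the fraction
# of sentences to keep) or `num_sentences` (an absolute count), plus `min_length`
# to skip very short sentence candidates.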
# main index page route
@app.route('/')
@cross_origin()
def index():
    return render_template('index.html')


# results page route (@app.route must be the outermost decorator so that
# flask-cors wraps the view function Flask actually registers)
@app.route('/results')
@cross_origin()
def results():
    return render_template('results.html')

# summarize raw text passed in the query string
@app.route('/predict', methods=['GET', 'POST'])
def uploads():
    if request.method == 'GET':
        # Get the requested summary length and the raw text from the query string
        numsent = int(request.args['number'])
        text = str(request.args['text'])
        content = text

        # summarize paragraph by paragraph
        summary_text = ""
        for i, paragraph in enumerate(content.split("\n\n")):
            paragraph = paragraph.replace('\n', ' ')
            paragraph = paragraph.replace('\t', '')
            paragraph = ' '.join(paragraph.split())

            # tokenize the paragraph and keep only alphabetic words
            tokens = word_tokenize(paragraph)
            tokens = [word for word in tokens if word.isalpha()]
            # skip paragraphs with at most one real word
            if len(tokens) <= 1:
                continue

            sentences = sent_tokenize(paragraph)
            paragraph = ' '.join(tokens)
            print("\nParagraph:")
            print(paragraph + "\n")

            # T5 would need a 'summarize:' prefix; the BERT extractive model does not
            # text = "summarize:" + paragraph
            text = paragraph
            summary = bert_legal_model(text, min_length=8, ratio=0.05)
            summary_text += str(summary) + "\n\n"
            print("Summary:")
            print(summary)

        # summarize the full text in one pass, using the requested number of sentences
        content2 = content.replace('\n', ' ')
        content2 = content2.replace('\t', '')
        summary = bert_legal_model(content2, min_length=8, num_sentences=numsent)

        # collect both summaries for inspection and storage
        all_text = "The Summary-- " + str(summary) + "\n\n\n" \
            + "The Larger Summary-- " + str(summary_text)

        # FPDF's built-in fonts are latin-1 only, so re-encode and tidy the text
        all_text2 = all_text.encode('latin-1', 'replace').decode('latin-1')
        all_text2 = all_text2.replace('?', '.')
        all_text2 = all_text2.replace('\n', ' ')
        all_text2 = all_text2.replace('..', '.')
        all_text2 = all_text2.replace(',.', ',')
        all_text2 = all_text2.replace('-- ', '\n\n\n')

        # write the summaries to a PDF served from the static folder
        pdf = FPDF()
        pdf.add_page()
        pdf.set_font("Times", size=12)
        pdf.multi_cell(190, 10, txt=all_text2, align='C')
        pdf.output("./static/legal.pdf")

        return render_template('results.html')
    return None

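# Illustrative client call for /predict (assumes the app is reachable at
# http://localhost:5000 and that the front-end submits 'number' and 'text'):
#
#   curl -G "http://localhost:5000/predict" \
#        --data-urlencode "number=10" \
#        --data-urlencode "text=This Agreement is made between..."
#
# The summary is written to ./static/legal.pdf and results.html is rendered.
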
# summarize an uploaded PDF: save it, OCR each page, then summarize the extracted text
@app.route('/predictpdf', methods=['GET', 'POST'])
def uploads2():
    if request.method == 'POST':
        # Get the requested summary length and the uploaded file from the POST request
        numsent = int(request.args['number'])
        if 'file' not in request.files:
            flash('No file part')
            return redirect(request.url)
        file = request.files['file']
        # if the user does not select a file, the browser may
        # submit an empty part without a filename
        if file.filename == '':
            flash('No selected file')
            return redirect(request.url)
        if file and allowed_file(file.filename):
            filename = "legal.pdf"
            file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))

        # also keep a copy under its original (sanitized) name in the working directory
        f = request.files['file']
        f.save(secure_filename(f.filename))

        # collect every PDF found under ./pdfs
        path = os.path.join(os.getcwd(), 'pdfs')
        list_of_files = []
        for root, dirs, files in os.walk(path):
            for file in files:
                if file.endswith(".pdf"):
                    list_of_files.append(os.path.join(root, file))

        print("\nProcessing {} files...\n".format(len(list_of_files)))
        total_pages = 0
        for filename in list_of_files:
            print(filename)
            file = os.path.splitext(os.path.basename(filename))[0]
            pages = pdf2image.convert_from_path(pdf_path=filename, dpi=400, size=(1654, 2340))
            total_pages += len(pages)
            print("\nProcessing the next {} pages...\n".format(len(pages)))

            # save every page except the last as an image and OCR it with tesseract
            # TODO: factor this out into a function
            content = ""
            dir_name = 'images/' + file + '/'
            os.makedirs(dir_name, exist_ok=True)
            for i in range(len(pages) - 1):
                pages[i].save(dir_name + str(i) + '.jpg')
                content += pt.image_to_string(pages[i])

            # summarize paragraph by paragraph
            summary_text = ""
            for i, paragraph in enumerate(content.split("\n\n")):
                paragraph = paragraph.replace('\n', ' ')
                paragraph = paragraph.replace('\t', '')
                paragraph = ' '.join(paragraph.split())

                # tokenize the paragraph and keep only alphabetic words
                tokens = word_tokenize(paragraph)
                tokens = [word for word in tokens if word.isalpha()]
                # skip paragraphs with at most one real word
                if len(tokens) <= 1:
                    continue

                sentences = sent_tokenize(paragraph)
                paragraph = ' '.join(tokens)
                print("\nParagraph:")
                print(paragraph + "\n")

                # T5 would need a 'summarize:' prefix; the BERT extractive model does not
                # text = "summarize:" + paragraph
                text = paragraph
                summary = bert_legal_model(text, min_length=8, ratio=0.05)
                summary_text += str(summary) + "\n\n"
                print("Summary:")
                print(summary)

            # summarize the full OCR'd text in one pass, using the requested number of sentences
            content2 = content.replace('\n', ' ')
            content2 = content2.replace('\t', '')
            summary = bert_legal_model(content2, min_length=8, num_sentences=numsent)

            # collect both summaries for inspection and storage
            all_text = "The Summary-- " + str(summary) + "\n\n\n" \
                + "The Larger Summary-- " + str(summary_text)

            # FPDF's built-in fonts are latin-1 only, so re-encode and tidy the text
            all_text2 = all_text.encode('latin-1', 'replace').decode('latin-1')
            all_text2 = all_text2.replace('?', '.')
            all_text2 = all_text2.replace('\n', ' ')
            all_text2 = all_text2.replace('..', '.')
            all_text2 = all_text2.replace(',.', ',')
            all_text2 = all_text2.replace('-- ', '\n\n\n')

            # write the summaries to a PDF served from the static folder
            pdf = FPDF()
            pdf.add_page()
            pdf.set_font("Times", size=12)
            pdf.multi_cell(190, 10, txt=all_text2, align='C')
            pdf.output("./static/legal.pdf")

        return render_template('results.html')
    return None

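# Illustrative client call for /predictpdf (assumes the app is reachable at
# http://localhost:5000; note that 'number' is read from the query string):
#
#   curl -X POST "http://localhost:5000/predictpdf?number=10" \
#        -F "file=@contract.pdf"
#
# The OCR step relies on system binaries that must be installed separately:
# pdf2image needs poppler, and pytesseract needs the tesseract engine.
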
if __name__ == "__main__":
    app.run()