#**************** IMPORT PACKAGES ********************
import flask
from flask import render_template, jsonify, Flask, redirect, url_for, request, flash
from flask_cors import CORS, cross_origin
from werkzeug.utils import secure_filename
import numpy as np
import pytesseract as pt
import pdf2image
from fpdf import FPDF
import re
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import os
import pdfkit
import yake
from transformers import AutoTokenizer, AutoModelForPreTraining, AutoModel, AutoConfig
from summarizer import Summarizer, TransformerSummarizer
from transformers import pipelines
# nltk.download('punkt')  # uncomment on the first run to fetch the NLTK tokenizer data

print("lets go")

app = flask.Flask(__name__)
app.config["DEBUG"] = True

UPLOAD_FOLDER = './pdfs'
ALLOWED_EXTENSIONS = {'txt', 'pdf'}
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER

#***************** FLASK *****************************
CORS(app)

def allowed_file(filename):
    return '.' in filename and \
           filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
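
# Quick illustration of the helper above (example values, not from the original source):
#   allowed_file('contract.pdf')  -> True
#   allowed_file('contract.docx') -> False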

# Summarization model; earlier alternatives are kept for reference.
#model_name = 'laxya007/gpt2_legal'
#model_name = 'facebook/bart-large-cnn'
model_name = 'nlpaueb/legal-bert-base-uncased'

# Set up the Hugging Face config, tokenizer and model for the extractive summarizer.
print("lets go")
custom_config = AutoConfig.from_pretrained(model_name)
custom_config.output_hidden_states = True
custom_tokenizer = AutoTokenizer.from_pretrained(model_name)
custom_model = AutoModel.from_pretrained(model_name, config=custom_config)
bert_legal_model = Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)
print('Using model {}\n'.format(model_name))
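
# A minimal sketch of how the summarizer object is invoked below (behaviour
# assumed from the bert-extractive-summarizer package: calling the Summarizer
# instance on a string returns a string of the selected sentences):
#
#   sample = "Long legal text goes here. It spans many sentences."
#   short_summary = bert_legal_model(sample, min_length=8, ratio=0.05)
#   fixed_length  = bert_legal_model(sample, num_sentences=3)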

# main index page route (path assumed; the decorator was missing in the source)
@app.route('/')
def index():
    return render_template('index.html')

# results page route (path assumed, as above)
@app.route('/results')
def results():
    return render_template('results.html')

# summarize raw text passed via the query string (route path assumed; the
# decorator was missing in the source)
@app.route('/uploads', methods=['GET'])
def uploads():
    if request.method == 'GET':
        # get the request parameters
        numsent = int(request.args['number'])
        text = str(request.args['text'])
        content = text

        summary_text = ""
        for i, paragraph in enumerate(content.split("\n\n")):
            paragraph = paragraph.replace('\n', ' ')
            paragraph = paragraph.replace('\t', '')
            paragraph = ' '.join(paragraph.split())

            # tokenize the paragraph and keep only alphabetic tokens
            tokens = word_tokenize(paragraph)
            tokens = [word for word in tokens if word.isalpha()]
            # print("\nTokens: {}\n".format(len(tokens)))
            # skip paragraphs with one word or fewer
            if len(tokens) <= 1:
                continue
            # Perhaps also ignore paragraphs with no sentence?
            sentences = sent_tokenize(paragraph)
            paragraph = ' '.join(tokens)

            print("\nParagraph:")
            print(paragraph + "\n")

            # T5 needs 'summarize:' prepended in order to work:
            # text = "summarize:" + paragraph
            text = paragraph
            summary = bert_legal_model(text, min_length=8, ratio=0.05)
            # summary = tokenizer_t5.decode(summary_ids[0], skip_special_tokens=True)
            summary_text += str(summary) + "\n\n"
            print("Summary:")
            print(summary)

        # also summarize the whole document in one pass
        content2 = content.replace('\n', ' ')
        content2 = content2.replace('\t', '')
        summary = bert_legal_model(content2, min_length=8, num_sentences=25)

        # assemble both summaries for inspection and storage
        all_text = "The Summary-- " + str(summary) + "\n\n\n" \
            + "The Larger Summary-- " + str(summary_text)

        # FPDF's built-in fonts are latin-1 only, so drop characters it cannot encode
        all_text2 = all_text.encode('latin-1', 'replace').decode('latin-1')
        all_text2 = all_text2.replace('?', '.')
        all_text2 = all_text2.replace('\n', ' ')
        all_text2 = all_text2.replace('..', '.')
        all_text2 = all_text2.replace(',.', ',')
        all_text2 = all_text2.replace('-- ', '\n\n\n')

        # write the summaries into a PDF
        pdf = FPDF()
        pdf.add_page()
        pdf.set_font("Times", size=12)
        pdf.multi_cell(190, 10, txt=all_text2, align='C')
        pdf.output("./static/legal.pdf")

        return render_template('results.html')
    return None

# summarize an uploaded PDF (route path assumed; the decorator was missing in
# the source)
@app.route('/uploads2', methods=['POST'])
def uploads2():
    if request.method == 'POST':
        numsent = int(request.args['number'])

        # get the file from the POST request
        if 'file' not in request.files:
            flash('No file part')
            return redirect(request.url)
        file = request.files['file']
        # if the user does not select a file, the browser may submit an
        # empty part without a filename
        if file.filename == '':
            flash('No selected file')
            return redirect(request.url)
        if file and allowed_file(file.filename):
            filename = "legal.pdf"
            file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))
        f = request.files['file']
        f.save(secure_filename(f.filename))

        # collect every PDF in the ./pdfs folder
        path = os.getcwd()
        folder_name = 'pdfs'
        path = os.path.join(path, folder_name)
        list_of_files = []
        for root, dirs, files in os.walk(path):
            for file in files:
                if file.endswith(".pdf"):
                    # print(os.path.join(root, file))
                    list_of_files.append(os.path.join(root, file))

        print("\nProcessing {} files...\n".format(len(list_of_files)))
        total_pages = 0
        for filename in list_of_files:
            print(filename)
            file = os.path.splitext(os.path.basename(filename))[0]
            pages = pdf2image.convert_from_path(pdf_path=filename, dpi=400, size=(1654, 2340))
            total_pages += len(pages)
            print("\nProcessing the next {} pages...\n".format(len(pages)))

            # save all pages as images and OCR them, except the last page
            # TODO: turn this into a function
            content = ""
            dir_name = 'images/' + file + '/'
            # if the folder doesn't exist, create it
            os.makedirs(dir_name, exist_ok=True)
            for i in range(len(pages) - 1):
                pages[i].save(dir_name + str(i) + '.jpg')
                # OCR the page image using Google's Tesseract
                content += pt.image_to_string(pages[i])

        summary_text = ""
        for i, paragraph in enumerate(content.split("\n\n")):
            paragraph = paragraph.replace('\n', ' ')
            paragraph = paragraph.replace('\t', '')
            paragraph = ' '.join(paragraph.split())

            # tokenize the paragraph and keep only alphabetic tokens
            tokens = word_tokenize(paragraph)
            tokens = [word for word in tokens if word.isalpha()]
            # print("\nTokens: {}\n".format(len(tokens)))
            # skip paragraphs with one word or fewer
            if len(tokens) <= 1:
                continue
            # Perhaps also ignore paragraphs with no sentence?
            sentences = sent_tokenize(paragraph)
            paragraph = ' '.join(tokens)

            print("\nParagraph:")
            print(paragraph + "\n")

            # T5 needs 'summarize:' prepended in order to work:
            # text = "summarize:" + paragraph
            text = paragraph
            summary = bert_legal_model(text, min_length=8, ratio=0.05)
            # summary = tokenizer_t5.decode(summary_ids[0], skip_special_tokens=True)
            summary_text += str(summary) + "\n\n"
            print("Summary:")
            print(summary)

        # also summarize the whole document in one pass
        content2 = content.replace('\n', ' ')
        content2 = content2.replace('\t', '')
        summary = bert_legal_model(content2, min_length=8, num_sentences=25)

        # assemble both summaries for inspection and storage
        all_text = "The Summary-- " + str(summary) + "\n\n\n" \
            + "The Larger Summary-- " + str(summary_text)

        # FPDF's built-in fonts are latin-1 only, so drop characters it cannot encode
        all_text2 = all_text.encode('latin-1', 'replace').decode('latin-1')
        all_text2 = all_text2.replace('?', '.')
        all_text2 = all_text2.replace('\n', ' ')
        all_text2 = all_text2.replace('..', '.')
        all_text2 = all_text2.replace(',.', ',')
        all_text2 = all_text2.replace('-- ', '\n\n\n')

        # write the summaries into a PDF
        pdf = FPDF()
        pdf.add_page()
        pdf.set_font("Times", size=12)
        pdf.multi_cell(190, 10, txt=all_text2, align='C')
        pdf.output("./static/legal.pdf")

        return render_template('results.html')
    return None

if __name__ == "__main__":
    app.run()
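
# Example of trying the app locally (a sketch only: the file name, route paths
# and query parameters match the assumed decorators added above, not anything
# confirmed by the original source):
#   python app.py
#   curl "http://127.0.0.1:5000/uploads?number=5&text=Some+legal+text+to+summarize"
#   curl -F "file=@contract.pdf" "http://127.0.0.1:5000/uploads2?number=5"
# Both endpoints render results.html and write the summary to ./static/legal.pdf.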