# Spaces: arithescientist/legalsummarizer (Running)
# File size: 9,342 Bytes

#**************** IMPORT PACKAGES ********************
import flask
from flask import render_template, jsonify, Flask, redirect, url_for, request, flash
from flask_cors import CORS, cross_origin
from werkzeug.utils import secure_filename
import numpy as np
import pytesseract as pt
import pdf2image
from fpdf import FPDF
import re
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import os
import pdfkit
import yake
from transformers import AutoTokenizer, AutoModelForPreTraining, AutoModel, AutoConfig
from summarizer import Summarizer,TransformerSummarizer
from transformers import pipelines
#nltk.download('punkt')

print("lets go")


#***************** FLASK *****************************
app = flask.Flask(__name__)
# NOTE(review): with DEBUG on, app.run() serves the interactive Werkzeug
# debugger; confirm this is never enabled on a publicly reachable deployment.
app.config["DEBUG"] = True
# flash() stores its messages in the session, and Flask refuses to open a
# session without a secret key. Prefer a key supplied through the environment;
# the random fallback keeps flash() working but changes on every restart and
# differs between worker processes.
app.secret_key = os.environ.get('SECRET_KEY') or os.urandom(24)

# Folder the uploaded PDF is written to before it is processed.
UPLOAD_FOLDER = './pdfs'
# File extensions accepted by allowed_file().
ALLOWED_EXTENSIONS = {'txt', 'pdf'}
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER

# Enable CORS handling for the whole application.
CORS(app)


def allowed_file(filename):
    """Tell whether *filename* carries an extension listed in ALLOWED_EXTENSIONS.

    The extension is whatever follows the last dot, compared
    case-insensitively. A name without any dot is rejected.
    """
    _, dot, extension = filename.rpartition('.')
    if not dot:
        return False
    return extension.lower() in ALLOWED_EXTENSIONS



# Checkpoint used for summarization. Alternatives that were tried before:
#   'laxya007/gpt2_legal'
#   'facebook/bart-large-cnn'
model_name = 'nlpaueb/legal-bert-base-uncased'


# Load the checkpoint's tokenizer, config and weights (huggingface.co setup).

print("lets go")

custom_tokenizer = AutoTokenizer.from_pretrained(model_name)

# The model is loaded with hidden-state output switched on in its config.
custom_config = AutoConfig.from_pretrained(model_name)
custom_config.output_hidden_states = True
custom_model = AutoModel.from_pretrained(model_name, config=custom_config)

# Summarizer wrapping the custom model/tokenizer pair; the routes call it.
bert_legal_model = Summarizer(
    custom_model=custom_model,
    custom_tokenizer=custom_tokenizer,
)
print('Using model {}\n'.format(model_name))



# main index page route
@app.route('/')
@cross_origin()
def index():
    """Serve the landing page rendered from the ``index.html`` template."""
    landing_page = render_template('index.html')
    return landing_page

@app.route('/results')
@cross_origin()
def results():
    """Render the results page from the ``results.html`` template.

    ``@app.route`` must be the outermost decorator: decorators apply
    bottom-up, so with ``@cross_origin()`` on top Flask would register the
    plain function and the CORS wrapper would never be called.
    """
    return render_template('results.html')



# Sentence count of the overall summary when the request gives no usable
# 'number' value (this was the previously hard-coded value).
_DEFAULT_NUM_SENTENCES = 25
# Where the generated summary PDF is written.
_SUMMARY_PDF_PATH = "./static/legal.pdf"
# Every upload is stored under this fixed name inside UPLOAD_FOLDER, so a
# user-supplied filename never reaches the filesystem.
_UPLOADED_PDF_NAME = "legal.pdf"


def _requested_num_sentences():
    """Return the 'number' request value as a positive int.

    The value is looked up in both the query string and the form body. A
    missing, non-integer or non-positive value falls back to
    ``_DEFAULT_NUM_SENTENCES`` instead of failing the request.
    """
    number = request.values.get('number', type=int)
    if number is None or number < 1:
        return _DEFAULT_NUM_SENTENCES
    return number


def _summarize_paragraphs(content):
    """Summarize each blank-line-separated paragraph of *content*.

    Returns the per-paragraph summaries concatenated, each followed by a
    blank line. Paragraphs with at most one alphabetic token are skipped.
    """
    summaries = []
    for paragraph in content.split("\n\n"):
        # Normalize whitespace exactly as before: newlines become spaces,
        # tabs are dropped, runs of whitespace collapse to one space.
        paragraph = paragraph.replace('\n', ' ')
        paragraph = paragraph.replace('\t', '')
        paragraph = ' '.join(paragraph.split())

        # Keep purely alphabetic tokens only (drops numbers and punctuation).
        words = [token for token in word_tokenize(paragraph) if token.isalpha()]
        if len(words) <= 1:
            continue
        paragraph = ' '.join(words)

        print("\nParagraph:")
        print(paragraph + "\n")

        summary = bert_legal_model(paragraph, min_length=8, ratio=0.05)
        summaries.append(str(summary) + "\n\n")
        print("Summary:")
        print(summary)
    return ''.join(summaries)


def _write_summary_pdf(content, num_sentences):
    """Summarize *content* and write the result to ``_SUMMARY_PDF_PATH``.

    The PDF holds an overall summary limited to *num_sentences* sentences
    ("The Summary") followed by the per-paragraph summaries ("The Larger
    Summary").
    """
    paragraph_summaries = _summarize_paragraphs(content)

    flattened = content.replace('\n', ' ').replace('\t', '')
    overall_summary = bert_legal_model(
        flattened, min_length=8, num_sentences=num_sentences)

    all_text = ("The Summary-- " + str(overall_summary) + "\n\n\n"
                + "The Larger Summary-- " + paragraph_summaries)

    # The Latin-1 round trip turns every character that Latin-1 cannot
    # represent into '?'. The replacements below then turn each '?' into '.',
    # flatten newlines, tidy doubled punctuation, and finally use the '-- '
    # marker after each heading as the only line break.
    printable = all_text.encode('latin-1', 'replace').decode('latin-1')
    for old, new in (('?', '.'), ('\n', ' '), ('..', '.'),
                     (',.', ','), ('-- ', '\n\n\n')):
        printable = printable.replace(old, new)

    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Times", size=12)
    pdf.multi_cell(190, 10, txt=printable, align='C')

    # Make sure the target folder exists before saving.
    os.makedirs(os.path.dirname(_SUMMARY_PDF_PATH), exist_ok=True)
    pdf.output(_SUMMARY_PDF_PATH)


def _ocr_pdf(pdf_path):
    """Rasterize the PDF at *pdf_path* and return its OCR'd text.

    Each processed page is also saved as ``images/<pdf name>/<index>.jpg``.
    The last page is deliberately left out, as before, except when the
    document has a single page, which would otherwise yield no text at all.
    """
    pages = pdf2image.convert_from_path(
        pdf_path=pdf_path, dpi=400, size=(1654, 2340))
    print("\nProcessing the next {} pages...\n".format(len(pages)))

    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
    image_dir = os.path.join('images', pdf_name)
    os.makedirs(image_dir, exist_ok=True)

    pages_to_read = pages[:-1] if len(pages) > 1 else pages
    page_texts = []
    for index, page in enumerate(pages_to_read):
        page.save(os.path.join(image_dir, str(index) + '.jpg'))
        # OCR the page image with tesseract.
        page_texts.append(pt.image_to_string(page))
    return ''.join(page_texts)


@app.route('/predict', methods=['GET', 'POST'])
def uploads():
    """Summarize the text sent as the 'text' request value.

    Request values (query string or form body):
        text:   the document text to summarize (required).
        number: sentence count for the overall summary (optional; defaults
                to ``_DEFAULT_NUM_SENTENCES``).

    Writes the summary PDF and renders ``results.html``. Responds with
    HTTP 400 when no text is supplied. Both declared methods are served;
    previously a POST fell through to ``return None``, which Flask rejects.
    """
    text = request.values.get('text', '')
    if not text.strip():
        return "No text supplied", 400

    _write_summary_pdf(text, _requested_num_sentences())
    return render_template('results.html')


@app.route('/predictpdf', methods=['GET', 'POST'])
def uploads2():
    """Summarize an uploaded PDF (multipart field 'file').

    Request values:
        file:   the PDF to summarize (required, POST only).
        number: sentence count for the overall summary (optional; defaults
                to ``_DEFAULT_NUM_SENTENCES``).

    The upload is stored once, under a fixed name inside UPLOAD_FOLDER, and
    only that file is OCR'd and summarized. Invalid uploads flash a message
    and redirect. A GET request is redirected to the index page.
    """
    if request.method != 'POST':
        # There is no upload form at this URL; send the browser back home.
        return redirect(url_for('index'))

    num_sentences = _requested_num_sentences()

    if 'file' not in request.files:
        flash('No file part')
        return redirect(request.url)
    upload = request.files['file']
    # If the user does not select a file, the browser submits an empty part
    # without a filename.
    if upload.filename == '':
        flash('No selected file')
        return redirect(request.url)
    # Only real PDFs can be rasterized, so '.txt' (allowed by allowed_file)
    # is rejected here rather than being saved as legal.pdf and crashing.
    is_pdf = upload.filename.lower().endswith('.pdf')
    if not (allowed_file(upload.filename) and is_pdf):
        flash('Unsupported file type')
        return redirect(request.url)

    os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
    pdf_path = os.path.join(app.config['UPLOAD_FOLDER'], _UPLOADED_PDF_NAME)
    upload.save(pdf_path)

    content = _ocr_pdf(pdf_path)
    if not content.strip():
        flash('No text could be extracted from the file')
        return redirect(request.url)

    _write_summary_pdf(content, num_sentences)
    return render_template('results.html')


if __name__ == "__main__":
    app.run()