# lincolnlegal / app.py
# aritheanalyst's picture
# Update app.py
# 3767d92
#**************** IMPORT PACKAGES ********************
import gradio as gr
import numpy as np
import pytesseract as pt
import pdf2image
import os
import tempfile
from fpdf import FPDF
import re
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import pdfkit
import yake
from zipfile import ZipFile
from gtts import gTTS
from transformers import AutoTokenizer, AutoModelForPreTraining, AutoModel, AutoConfig
from summarizer import Summarizer,TransformerSummarizer
from transformers import pipelines
from pdfminer.high_level import extract_text
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# Fetch the NLTK 'punkt' resource at import time.
nltk.download('punkt')

# Legal-BERT checkpoint name; not referenced by the summarizer set up below
# (it may be used elsewhere in the project).
model_name = 'nlpaueb/legal-bert-base-uncased'

# The tokenizer and the seq2seq model are loaded from the same checkpoint,
# so the checkpoint id is spelled out once.
_SUMMARIZER_CHECKPOINT = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(_SUMMARIZER_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(_SUMMARIZER_CHECKPOINT)
def _summarize(document, min_length=20):
    """Return the model-generated summary of *document* as a string.

    Uses the module-level ``tokenizer`` and ``model``.  The summary is
    generated with 2 beams, at least *min_length* tokens and at most
    ``min_length + 1000`` tokens.
    """
    # truncation=True makes the 1024-token cap explicit instead of relying on
    # the tokenizer's fallback behaviour (and its warning) when only
    # max_length is given.
    inputs = tokenizer(
        [document], max_length=1024, truncation=True, return_tensors="pt"
    )
    summary_ids = model.generate(
        inputs["input_ids"],
        num_beams=2,
        min_length=min_length,
        max_length=min_length + 1000,
    )
    return tokenizer.batch_decode(
        summary_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]


def _latin1_safe(value):
    """Return *value* with characters outside Latin-1 replaced by ``?``.

    FPDF's built-in "Times" font can only encode Latin-1; typographic quotes
    or dashes in a summary would otherwise abort PDF generation.
    """
    return value.encode("latin-1", "replace").decode("latin-1")


def pdf_to_text(text, PDF):
    """Summarize pasted text or an uploaded PDF and export the summary.

    Args:
        text: Text to summarize.  When it is empty, whitespace-only or
            ``None``, the text is extracted from *PDF* instead.
        PDF: Uploaded file; either an object exposing the file path as
            ``.name`` or a plain path string.  Only read when *text* is empty.

    Returns:
        A tuple ``(audio_path, summary, pdf_path)``: the path of the spoken
        summary (``"legal.mp3"``), the summary string itself, and the path of
        the PDF containing the summary (``"legal.pdf"``).  Both files are
        written to the current working directory and overwritten per call.

    Raises:
        ValueError: If neither text nor a PDF is supplied, or if no text can
            be extracted from the PDF.
    """
    if not text or not text.strip():
        if PDF is None:
            raise ValueError(
                "Provide either some text or a PDF file to summarize."
            )
        # Accept both an upload object carrying its path in .name and a bare
        # path string.
        pdf_path = getattr(PDF, "name", PDF)
        text = extract_text(pdf_path)
        if not text or not text.strip():
            # Summarizing an empty document would only yield invented text.
            raise ValueError("No extractable text was found in the PDF.")

    output_text = _summarize(text)

    # Write the summary to a one-page-flow PDF.
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Times", size=12)
    pdf.multi_cell(190, 10, txt=_latin1_safe(output_text), align="C")
    pdf.output("legal.pdf")

    # gTTS produces MP3 data, so the file carries an .mp3 extension rather
    # than the misleading .wav used previously.
    speech = gTTS(text=output_text, lang="en", slow=False)
    speech.save("legal.mp3")

    return "legal.mp3", output_text, "legal.pdf"
# Gradio UI wiring: a text box and a file upload are passed to pdf_to_text,
# whose three return values are rendered as audio, text and a file.
iface = gr.Interface(
    fn=pdf_to_text,
    inputs=["text", "file"],
    outputs=["audio", "text", "file"],
)

if __name__ == "__main__":
    # Start the app only when run as a script; share=True asks Gradio to
    # also create a public share link.
    iface.launch(share=True)