# lincolnlegal / app.py
# arithescientist's picture
# Update app.py
# 3813c2d
# raw
# history blame
# 2.04 kB
#**************** IMPORT PACKAGES ********************
import gradio as gr
import numpy as np
import pytesseract as pt
import pdf2image
import os
import tempfile
from fpdf import FPDF
import re
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import os
import pdfkit
import yake
from zipfile import ZipFile
from gtts import gTTS
from transformers import AutoTokenizer, AutoModelForPreTraining, AutoModel, AutoConfig
from summarizer import Summarizer,TransformerSummarizer
from transformers import pipelines
from pdfminer.high_level import extract_text
# Fetch the NLTK 'punkt' resource when this module is imported.
# NOTE(review): presumably needed by the sent_tokenize/word_tokenize imports
# above; nothing in the visible code calls them -- confirm it is still required.
nltk.download('punkt')
# Summarizer shared by every request. Building it loads the legal-BERT config,
# tokenizer and weights, so it is created lazily once instead of on each call.
_LEGAL_SUMMARIZER = None


def _get_legal_summarizer():
    """Return the shared legal-BERT summarizer, constructing it on first use."""
    global _LEGAL_SUMMARIZER
    if _LEGAL_SUMMARIZER is None:
        model_name = 'nlpaueb/legal-bert-base-uncased'
        # The setup of huggingface.co
        custom_config = AutoConfig.from_pretrained(model_name)
        custom_config.output_hidden_states = True
        custom_tokenizer = AutoTokenizer.from_pretrained(model_name)
        custom_model = AutoModel.from_pretrained(model_name, config=custom_config)
        _LEGAL_SUMMARIZER = Summarizer(
            custom_model=custom_model,
            custom_tokenizer=custom_tokenizer,
        )
    return _LEGAL_SUMMARIZER


def pdf_to_text(file_obj):
    """Summarize an uploaded PDF and export the summary as audio, text and PDF.

    The text of the PDF at ``file_obj.name`` is extracted with pdfminer,
    summarized with the legal-BERT based ``Summarizer``, tidied up, written to
    ``legal.pdf`` via FPDF and spoken into ``legal.wav`` via gTTS.

    Args:
        file_obj: Object whose ``name`` attribute is the path of the PDF to
            read (only ``.name`` is used).

    Returns:
        A 3-tuple ``("legal.wav", summary_text, "legal.pdf")``.

    NOTE(review): both output files use fixed names in the working directory,
    so concurrent requests overwrite each other's results.
    """
    bert_legal_model = _get_legal_summarizer()

    text = extract_text(file_obj.name)
    output_text = bert_legal_model(text, min_length=8, ratio=0.05)

    # Tidy the summary: fix ",." artifacts, flatten newlines, and drop
    # doubled periods.
    output_text = output_text.replace(',.', ',')
    output_text = output_text.replace('\n', ' ')
    output_text = output_text.replace('..', '.')
    # The original `replace(' ', ' ')` was a no-op; collapsing runs of spaces
    # is the presumed intent, done last so spaces introduced by the newline
    # replacement are covered too.
    output_text = re.sub(r' {2,}', ' ', output_text)
    # (A leftover debug assignment that replaced the summary with a
    # placeholder string was removed here so real output is produced.)

    # The built-in "Times" font only covers Latin-1, so characters outside it
    # (e.g. curly quotes) are substituted with '?' for the PDF copy only.
    pdf_text = output_text.encode('latin-1', 'replace').decode('latin-1')

    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Times", size=12)
    # insert the texts in pdf
    pdf.multi_cell(190, 10, txt=pdf_text, align='C')
    # save the pdf with name .pdf
    pdf.output("legal.pdf")

    # NOTE(review): gTTS is documented as producing MP3 data; the ".wav" name
    # is kept so the returned path stays the same -- confirm consumers cope.
    myobj = gTTS(text=output_text, lang='en', slow=False)
    myobj.save("legal.wav")

    return "legal.wav", output_text, "legal.pdf"
# path = folder_name
# return path
#pageObject.extractText()
# Gradio UI: a single uploaded file goes in; the outputs mirror the 3-tuple
# returned by pdf_to_text (audio file path, summary text, PDF file path).
# The output list previously declared only two components for three values.
iface = gr.Interface(
    fn=pdf_to_text,
    inputs="file",
    outputs=["audio", "text", "file"],
)

if __name__ == "__main__":
    # Start the web app only when run as a script; share=True is passed
    # through to Gradio to request a shareable link.
    iface.launch(share=True)