# Hugging Face Space (status when captured: Sleeping)
#**************** IMPORT PACKAGES ********************
import functools
import os
import re
import tempfile
from zipfile import ZipFile

import gradio as gr
import nltk
import numpy as np
import pdf2image
import pdfkit
import pytesseract as pt
import yake
from fpdf import FPDF
from gtts import gTTS
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from pdfminer.high_level import extract_text
from summarizer import Summarizer,TransformerSummarizer
from transformers import AutoTokenizer, AutoModelForPreTraining, AutoModel, AutoConfig
from transformers import pipelines
# Fetch NLTK's "punkt" tokenizer data at import time so the nltk tokenizers
# imported above have their model available.
nltk.download("punkt")
_LEGAL_BERT_MODEL = 'nlpaueb/legal-bert-base-uncased'


@functools.lru_cache(maxsize=1)
def _load_legal_summarizer():
    """Build the Legal-BERT extractive summarizer once and reuse it afterwards."""
    # Loading config, tokenizer and weights is expensive, so it must not be
    # repeated for every request.
    custom_config = AutoConfig.from_pretrained(_LEGAL_BERT_MODEL)
    custom_config.output_hidden_states = True
    custom_tokenizer = AutoTokenizer.from_pretrained(_LEGAL_BERT_MODEL)
    custom_model = AutoModel.from_pretrained(_LEGAL_BERT_MODEL, config=custom_config)
    return Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)


def _tidy_summary(raw_summary):
    """Collapse whitespace runs and repair doubled punctuation in the summary."""
    tidy = " ".join(raw_summary.split())  # newlines and repeated spaces -> one space
    tidy = tidy.replace(',.', ',')
    return re.sub(r'\.{2,}', '.', tidy)


def pdf_to_text(file_obj):
    """Summarize a PDF with Legal-BERT and render the summary as audio and PDF.

    Args:
        file_obj: Object whose ``name`` attribute is the path of the PDF to
            read (presumably the upload object Gradio passes for a "file"
            input).

    Returns:
        A tuple ``(audio_path, summary_text, pdf_path)``: the path of the
        spoken summary, the cleaned summary string, and the path of the PDF
        containing the summary.

    Raises:
        ValueError: If no file was supplied, or if no summary text could be
            produced (for example, a PDF without an extractable text layer).
    """
    if file_obj is None:
        raise ValueError("No PDF file was provided.")

    text = extract_text(file_obj.name)
    summary = _load_legal_summarizer()(text, min_length=8, ratio=0.05)
    output_text = _tidy_summary(summary)
    if not output_text:
        # gTTS refuses empty input; fail here with an actionable message instead.
        raise ValueError("No summary could be generated from the supplied PDF.")

    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Times", size=12)
    # The built-in Times font only covers latin-1; substitute anything else so
    # typographic quotes or dashes in the summary cannot abort PDF generation.
    printable = output_text.encode('latin-1', 'replace').decode('latin-1')
    pdf.multi_cell(190, 10, txt=printable, align='C')
    pdf.output("legal.pdf")

    # gTTS produces MP3 data, so the file gets a matching extension.
    speech = gTTS(text=output_text, lang='en', slow=False)
    speech.save("legal.mp3")

    return "legal.mp3", output_text, "legal.pdf"
# Gradio UI: one uploaded file in; spoken summary, summary text and summary
# PDF out. The outputs list needs one component per value returned by
# pdf_to_text (audio path, text, PDF path), in that order.
iface = gr.Interface(
    fn=pdf_to_text,
    inputs="file",
    outputs=["audio", "text", "file"],
)

if __name__ == "__main__":
    iface.launch(share=True)