#**************** IMPORT PACKAGES ********************
import functools
import os
import re
import tempfile
from zipfile import ZipFile

import gradio as gr
import nltk
import numpy as np
import pdf2image
import pdfkit
import pytesseract as pt
import yake
from fpdf import FPDF
from gtts import gTTS
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from pdfminer.high_level import extract_text
from summarizer import Summarizer, TransformerSummarizer
from transformers import AutoTokenizer, AutoModelForPreTraining, AutoModel, AutoConfig
from transformers import pipelines

nltk.download('punkt')

# Hugging Face checkpoint used to embed sentences for extractive summarisation.
MODEL_NAME = 'nlpaueb/legal-bert-base-uncased'

# Output artefacts are written to fixed names in the working directory.
# gTTS writes MP3 data, so the audio file carries an .mp3 extension.
AUDIO_PATH = "legal.mp3"
PDF_PATH = "legal.pdf"


@functools.lru_cache(maxsize=1)
def _load_summarizer():
    """Build the Legal-BERT summarizer once and reuse it across requests."""
    custom_config = AutoConfig.from_pretrained(MODEL_NAME)
    # The summarizer reads hidden states from the model, so they must be
    # exposed in the model output.
    custom_config.output_hidden_states = True
    custom_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    custom_model = AutoModel.from_pretrained(MODEL_NAME, config=custom_config)
    return Summarizer(custom_model=custom_model, custom_tokenizer=custom_tokenizer)


def _clean_summary(summary):
    """Tidy punctuation and whitespace artefacts left by the summarizer."""
    cleaned = summary.replace(',.', ',')
    # Collapse every whitespace run (including newlines) into a single space.
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    cleaned = cleaned.replace('..', '.')
    return cleaned


def _write_pdf(text, path):
    """Render *text* as a single centred block into a PDF saved at *path*."""
    # The built-in "Times" font only covers Latin-1; substitute anything else
    # so that curly quotes, dashes, etc. do not raise UnicodeEncodeError.
    printable = text.encode('latin-1', 'replace').decode('latin-1')
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Times", size=12)
    pdf.multi_cell(190, 10, txt=printable, align='C')
    pdf.output(path)


def pdf_to_text(file_obj):
    """Summarise an uploaded PDF and produce audio, text and PDF versions.

    Args:
        file_obj: The upload handed over by Gradio. Either an object whose
            ``name`` attribute is the path of the uploaded file, or the path
            itself as a string.

    Returns:
        A tuple ``(audio_path, summary_text, pdf_path)``.

    Raises:
        ValueError: If no file was uploaded, or no text could be extracted
            or summarised from the PDF.
    """
    if file_obj is None:
        raise ValueError("Please upload a PDF file.")

    # Accept both a tempfile-like wrapper (has ``.name``) and a plain path.
    pdf_path = getattr(file_obj, "name", file_obj)

    text = extract_text(pdf_path)
    if not text or not text.strip():
        raise ValueError("No extractable text was found in the uploaded PDF.")

    summarizer = _load_summarizer()
    output_text = _clean_summary(summarizer(text, min_length=8, ratio=0.05))
    if not output_text:
        # gTTS refuses empty input; fail early with a clearer message.
        raise ValueError("The summarizer produced no text for this document.")

    _write_pdf(output_text, PDF_PATH)

    speech = gTTS(text=output_text, lang='en', slow=False)
    speech.save(AUDIO_PATH)

    return AUDIO_PATH, output_text, PDF_PATH


# One output component per value returned by pdf_to_text, in the same order.
iface = gr.Interface(
    fn=pdf_to_text,
    inputs="file",
    outputs=["audio", "text", "file"],
)

if __name__ == "__main__":
    iface.launch(share=True)