#**************** IMPORT PACKAGES ********************
import os
import re
import tempfile
from zipfile import ZipFile

import gradio as gr
import nltk
import numpy as np
import pdf2image
import pdfkit
import pytesseract as pt
import yake
from fpdf import FPDF
from gtts import gTTS
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from pdfminer.high_level import extract_text
from summarizer import Summarizer, TransformerSummarizer
from transformers import pipelines
from transformers import (
    AutoConfig,
    AutoModel,
    AutoModelForPreTraining,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
)

nltk.download('punkt')

# NOTE(review): not referenced anywhere else in this file; the summariser
# below loads facebook/bart-large-cnn instead. Kept for compatibility.
model_name = 'nlpaueb/legal-bert-base-uncased'

# Loaded once at import time so every request reuses the same weights.
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

# Inputs longer than this many tokens are truncated before summarisation.
MAX_INPUT_TOKENS = 1024
# Lower bound (in tokens) requested from the generator for each summary.
MIN_SUMMARY_TOKENS = 20
# The generator's upper bound is the lower bound plus this headroom.
SUMMARY_TOKEN_HEADROOM = 1000

PDF_OUTPUT_PATH = "legal.pdf"
# gTTS writes MP3 data, so the file carries an .mp3 extension.
AUDIO_OUTPUT_PATH = "legal.mp3"


def _resolve_input_text(text, PDF):
    """Return the text to summarise, preferring typed text over the PDF.

    Raises:
        ValueError: if neither usable text nor a PDF was supplied, or if
            no text could be extracted from the PDF.
    """
    if text and text.strip():
        return text
    if PDF is None:
        raise ValueError("Provide either some text or a PDF file to summarise.")
    # Accept both an uploaded-file object exposing ``.name`` and a plain
    # path string.
    pdf_path = getattr(PDF, "name", PDF)
    extracted = extract_text(pdf_path)
    if not extracted or not extracted.strip():
        raise ValueError("No extractable text was found in the supplied PDF.")
    return extracted


def _summarize(text, min_length=MIN_SUMMARY_TOKENS):
    """Return a summary of *text* generated by the module-level model."""
    inputs = tokenizer(
        [text],
        max_length=MAX_INPUT_TOKENS,
        truncation=True,  # make the cut-off explicit instead of implied
        return_tensors="pt",
    )
    summary_ids = model.generate(
        inputs["input_ids"],
        num_beams=2,
        min_length=min_length,
        max_length=min_length + SUMMARY_TOKEN_HEADROOM,
    )
    return tokenizer.batch_decode(
        summary_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]


def _write_pdf(summary, path=PDF_OUTPUT_PATH):
    """Write *summary* to a single-column PDF at *path* and return *path*."""
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Times", size=12)
    # The built-in "Times" font only covers Latin-1; substitute anything
    # outside that range so writing the PDF cannot fail on e.g. curly quotes.
    printable = summary.encode("latin-1", "replace").decode("latin-1")
    pdf.multi_cell(190, 10, txt=printable, align='C')
    pdf.output(path)
    return path


def _write_audio(summary, path=AUDIO_OUTPUT_PATH):
    """Synthesise *summary* as English speech at *path* and return *path*."""
    speech = gTTS(text=summary, lang='en', slow=False)
    speech.save(path)
    return path


def pdf_to_text(text, PDF):
    """Summarise typed text or an uploaded PDF.

    Args:
        text: Text to summarise. When empty or whitespace-only, the text
            is extracted from ``PDF`` instead.
        PDF: Uploaded file object exposing a ``.name`` path (or a path
            string). Only read when ``text`` is empty.

    Returns:
        A tuple ``(audio_path, summary_text, pdf_path)``.

    Raises:
        ValueError: if there is no usable text to summarise.
    """
    source_text = _resolve_input_text(text, PDF)
    output_text = _summarize(source_text)
    pdf_path = _write_pdf(output_text)
    audio_path = _write_audio(output_text)
    return audio_path, output_text, pdf_path


iface = gr.Interface(
    fn=pdf_to_text,
    inputs=["text", "file"],
    outputs=["audio", "text", "file"],
)

if __name__ == "__main__":
    iface.launch(share=True)