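# Spaces: Sleeping Sleeping
# NOTE(review): the line above appears to be page-status residue captured along
# with the source listing, not program text; kept as a comment so the module parses.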
#**************** IMPORT PACKAGES ********************
# Standard library
import os
import re
import tempfile
from zipfile import ZipFile

# Third-party
import gradio as gr
import nltk
import numpy as np
import pdf2image
import pdfkit
import pytesseract as pt
import yake
from fpdf import FPDF
from gtts import gTTS
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from pdfminer.high_level import extract_text
from summarizer import Summarizer, TransformerSummarizer
from transformers import (
    AutoConfig,
    AutoModel,
    AutoModelForPreTraining,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
)
from transformers import pipelines

# Fetch the NLTK 'punkt' resource when the module is imported.
nltk.download('punkt')

# NOTE(review): `model_name` is not referenced anywhere in the visible code; the
# tokenizer and model below are loaded from a different checkpoint. It is kept
# in case code outside this view reads it.
model_name = 'nlpaueb/legal-bert-base-uncased'

# Single source of truth for the summarisation checkpoint, so the tokenizer and
# the model cannot drift apart.
SUMMARIZER_CHECKPOINT = "facebook/bart-large-cnn"

# Loaded once at import time and shared by every request.
tokenizer = AutoTokenizer.from_pretrained(SUMMARIZER_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(SUMMARIZER_CHECKPOINT)
def _resolve_input_text(text, PDF):
    """Return the text to summarise: the typed text, or text extracted from ``PDF``."""
    if text and text.strip():
        return text
    if PDF is None:
        raise ValueError("Provide some text or upload a PDF file to summarise.")
    # Accept a file-like object exposing ``.name`` as well as a plain path
    # string, since the upload may arrive in either form.
    pdf_path = getattr(PDF, "name", PDF)
    extracted = extract_text(pdf_path)
    if not extracted or not extracted.strip():
        raise ValueError("No extractable text was found in the uploaded PDF.")
    return extracted


def _summarize(text, min_length):
    """Summarise ``text`` with the module-level ``tokenizer`` and ``model``."""
    # Truncation is requested explicitly instead of relying on the tokenizer's
    # fallback behaviour when only ``max_length`` is supplied.
    inputs = tokenizer(
        [text], max_length=1024, truncation=True, return_tensors="pt"
    )
    summary_ids = model.generate(
        inputs["input_ids"],
        num_beams=2,
        min_length=min_length,
        max_length=min_length + 1000,
    )
    decoded = tokenizer.batch_decode(
        summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    return decoded[0]


def _latin1_safe(text):
    """Replace characters that cannot be encoded as Latin-1 with ``?``."""
    return text.encode("latin-1", "replace").decode("latin-1")


def pdf_to_text(text, PDF):
    """Summarise typed text or an uploaded PDF and export the summary.

    Args:
        text: Text to summarise. When it is ``None``, empty or only
            whitespace, the text is extracted from ``PDF`` instead.
        PDF: Uploaded PDF, either an object with a ``.name`` path attribute
            or a path string. Only read when ``text`` is blank.

    Returns:
        A tuple ``(audio_path, summary_text, pdf_path)``: the spoken summary
        saved as ``"legal.wav"``, the summary string, and the summary written
        to ``"legal.pdf"``.

    Raises:
        ValueError: If neither usable text nor a PDF is supplied, or the PDF
            yields no extractable text.
    """
    min_length = 20
    source_text = _resolve_input_text(text, PDF)
    output_text = _summarize(source_text, min_length)

    # Write the summary to a one-page-flow PDF. The built-in "Times" font is
    # used, so the text is made Latin-1 safe first; otherwise a summary with
    # characters outside that range can make the PDF write fail.
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Times", size=12)
    pdf.multi_cell(190, 10, txt=_latin1_safe(output_text), align="C")
    pdf.output("legal.pdf")

    # NOTE(review): gTTS is documented to emit MP3 data; the ".wav" file name
    # is kept unchanged because it is part of the returned value - confirm
    # the audio player copes with it.
    speech = gTTS(text=output_text, lang="en", slow=False)
    speech.save("legal.wav")

    return "legal.wav", output_text, "legal.pdf"
# Gradio interface around `pdf_to_text`: a "text" and a "file" input map to its
# two parameters; the "audio", "text" and "file" outputs map to the three
# values it returns, in that order.
iface = gr.Interface(
    fn=pdf_to_text,
    inputs=["text", "file"],
    outputs=["audio", "text", "file"],
)

if __name__ == "__main__":
    # Start the app only when run as a script, not when imported.
    iface.launch()  # Removed 'share=True'