import gradio as gr
import os
import nltk
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from fpdf import FPDF
from gtts import gTTS
from pdfminer.high_level import extract_text
from docx import Document
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas

nltk.download('punkt')

# Load the summarization model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

# Convert DOCX to PDF using reportlab
def docx_to_pdf(docx_file, output_pdf="converted_doc.pdf"):
    doc = Document(docx_file)
    full_text = []
    for para in doc.paragraphs:
        full_text.append(para.text)
    pdf = canvas.Canvas(output_pdf, pagesize=letter)
    pdf.setFont("Helvetica", 12)
    text = pdf.beginText(40, 750)
    for line in full_text:
        text.textLine(line)
    pdf.drawText(text)
    pdf.save()
    return output_pdf

# Summarize an uploaded PDF/DOCX file (or raw text) and return audio, text, and PDF outputs
def pdf_to_text(text, PDF, min_length=20):
    try:
        file_extension = os.path.splitext(PDF.name)[1].lower()
        if file_extension == '.docx':
            # Convert DOCX to PDF first, then extract its text
            pdf_file_path = docx_to_pdf(PDF.name)
            text = extract_text(pdf_file_path)
        elif file_extension == '.pdf' and text == "":
            text = extract_text(PDF.name)

        # Summarize with BART (input is truncated to the model's 1024-token limit)
        inputs = tokenizer([text], max_length=1024, truncation=True, return_tensors="pt")
        min_length = int(min_length)
        summary_ids = model.generate(inputs["input_ids"], num_beams=2, min_length=min_length, max_length=min_length + 1000)
        output_text = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

        # Write the summary to a PDF
        pdf = FPDF()
        pdf.add_page()
        pdf.set_font("Times", size=12)
        pdf.multi_cell(190, 10, txt=output_text, align='C')
        pdf_output_path = "legal.pdf"
        pdf.output(pdf_output_path)

        # Convert the summary to speech; gTTS always produces MP3 data, so use an .mp3 filename
        audio_output_path = "legal.mp3"
        tts = gTTS(text=output_text, lang='en', slow=False)
        tts.save(audio_output_path)

        return audio_output_path, output_text, pdf_output_path
    except Exception as e:
        return None, f"An error occurred: {str(e)}", None

# Preloaded document handler
def process_sample_document(min_length=20):
    sample_document_path = "Marbury v. Madison.pdf"
    with open(sample_document_path, "rb") as f:
        return pdf_to_text("", f, min_length)

# Gradio interface
with gr.Blocks() as iface:
    with gr.Row():
        process_sample_button = gr.Button("Summarize the Pre-Uploaded Marbury v. Madison Case")
    text_input = gr.Textbox(label="Input Text")
    file_input = gr.File(label="Upload PDF or DOCX")
    slider = gr.Slider(minimum=10, maximum=100, step=10, value=20, label="Summary Minimum Length")
    audio_output = gr.Audio(label="Generated Audio")
    summary_output = gr.Textbox(label="Generated Summary")
    pdf_output = gr.File(label="Summary PDF")

    process_sample_button.click(fn=process_sample_document, inputs=slider, outputs=[audio_output, summary_output, pdf_output])
    file_input.change(fn=pdf_to_text, inputs=[text_input, file_input, slider], outputs=[audio_output, summary_output, pdf_output])

if __name__ == "__main__":
    iface.launch()