Ari committed on
Commit
8ccf10b
·
verified ·
1 Parent(s): deb55dd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -47
app.py CHANGED
@@ -1,6 +1,6 @@
1
  import gradio as gr
2
  import os
3
- import re
4
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
5
  from fpdf import FPDF
6
  from gtts import gTTS
@@ -9,40 +9,23 @@ from docx import Document
9
  from reportlab.lib.pagesizes import letter
10
  from reportlab.pdfgen import canvas
11
 
12
- # Use LegalBERT for handling legal documents
13
- tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
14
- model = AutoModelForSeq2SeqLM.from_pretrained("nlpaueb/legal-bert-base-uncased")
15
 
16
- # Function to chunk the text into manageable pieces
17
- def chunk_text(text, max_token_len=512):
18
- sentences = re.split(r'(?<=[.!?]) +', text)
19
- chunks = []
20
- current_chunk = []
21
- current_length = 0
22
-
23
- for sentence in sentences:
24
- tokens = tokenizer.tokenize(sentence)
25
- if current_length + len(tokens) <= max_token_len:
26
- current_chunk.append(sentence)
27
- current_length += len(tokens)
28
- else:
29
- chunks.append(" ".join(current_chunk))
30
- current_chunk = [sentence]
31
- current_length = len(tokens)
32
-
33
- if current_chunk:
34
- chunks.append(" ".join(current_chunk))
35
-
36
- return chunks
37
 
 
38
  def docx_to_pdf(docx_file, output_pdf="converted_doc.pdf"):
39
  doc = Document(docx_file)
40
- full_text = [para.text for para in doc.paragraphs]
 
 
41
 
42
  pdf = canvas.Canvas(output_pdf, pagesize=letter)
43
  pdf.setFont("Helvetica", 12)
44
- text = pdf.beginText(40, 750)
45
 
 
46
  for line in full_text:
47
  text.textLine(line)
48
 
@@ -50,14 +33,8 @@ def docx_to_pdf(docx_file, output_pdf="converted_doc.pdf"):
50
  pdf.save()
51
  return output_pdf
52
 
53
- # Summarize each chunk and then recursively summarize the summaries
54
- def summarize_chunk(chunk, min_length=50, max_length=150):
55
- inputs = tokenizer([chunk], max_length=512, truncation=True, return_tensors="pt")
56
- summary_ids = model.generate(inputs["input_ids"], num_beams=2, min_length=min_length, max_length=max_length)
57
- return tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
58
-
59
- # Main processing function using recursive summarization
60
- def pdf_to_text(text, PDF, min_length=50):
61
  try:
62
  file_extension = os.path.splitext(PDF.name)[1].lower()
63
 
@@ -67,32 +44,30 @@ def pdf_to_text(text, PDF, min_length=50):
67
  elif file_extension == '.pdf' and text == "":
68
  text = extract_text(PDF.name)
69
 
70
- chunks = chunk_text(text)
71
- summarized_chunks = [summarize_chunk(chunk, min_length=min_length) for chunk in chunks]
72
 
73
- # Combine summaries and recursively summarize the combined text
74
- summarized_text = " ".join(summarized_chunks)
75
- final_summary = summarize_chunk(summarized_text, min_length=min_length, max_length=min_length+150)
76
 
77
- # Save summarized text to PDF
78
  pdf = FPDF()
79
  pdf.add_page()
80
  pdf.set_font("Times", size=12)
81
- pdf.multi_cell(190, 10, txt=final_summary, align='C')
82
  pdf_output_path = "legal.pdf"
83
  pdf.output(pdf_output_path)
84
 
85
- # Convert summarized text to audio
86
  audio_output_path = "legal.wav"
87
- tts = gTTS(text=final_summary, lang='en', slow=False)
88
  tts.save(audio_output_path)
89
 
90
- return audio_output_path, final_summary, pdf_output_path
91
 
92
  except Exception as e:
93
  return None, f"An error occurred: {str(e)}", None
94
 
95
- def process_sample_document(min_length=50):
 
96
  sample_document_path = "Marbury v. Madison.pdf"
97
 
98
  with open(sample_document_path, "rb") as f:
@@ -105,7 +80,7 @@ with gr.Blocks() as iface:
105
 
106
  text_input = gr.Textbox(label="Input Text")
107
  file_input = gr.File(label="Upload PDF or DOCX")
108
- slider = gr.Slider(minimum=10, maximum=300, step=10, value=50, label="Summary Minimum Length")
109
 
110
  audio_output = gr.Audio(label="Generated Audio")
111
  summary_output = gr.Textbox(label="Generated Summary")
 
1
  import gradio as gr
2
  import os
3
+ import nltk
4
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
5
  from fpdf import FPDF
6
  from gtts import gTTS
 
9
  from reportlab.lib.pagesizes import letter
10
  from reportlab.pdfgen import canvas
11
 
12
+ nltk.download('punkt')
 
 
13
 
14
+ # Load the models and tokenizers
15
+ tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
16
+ model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
+ # Convert DOCX to PDF using reportlab
19
  def docx_to_pdf(docx_file, output_pdf="converted_doc.pdf"):
20
  doc = Document(docx_file)
21
+ full_text = []
22
+ for para in doc.paragraphs:
23
+ full_text.append(para.text)
24
 
25
  pdf = canvas.Canvas(output_pdf, pagesize=letter)
26
  pdf.setFont("Helvetica", 12)
 
27
 
28
+ text = pdf.beginText(40, 750)
29
  for line in full_text:
30
  text.textLine(line)
31
 
 
33
  pdf.save()
34
  return output_pdf
35
 
36
+ # Process input file (PDF or DOCX)
37
+ def pdf_to_text(text, PDF, min_length=20):
 
 
 
 
 
 
38
  try:
39
  file_extension = os.path.splitext(PDF.name)[1].lower()
40
 
 
44
  elif file_extension == '.pdf' and text == "":
45
  text = extract_text(PDF.name)
46
 
47
+ inputs = tokenizer([text], max_length=1024, truncation=True, return_tensors="pt")
48
+ min_length = int(min_length)
49
 
50
+ summary_ids = model.generate(inputs["input_ids"], num_beams=2, min_length=min_length, max_length=min_length+1000)
51
+ output_text = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
 
52
 
 
53
  pdf = FPDF()
54
  pdf.add_page()
55
  pdf.set_font("Times", size=12)
56
+ pdf.multi_cell(190, 10, txt=output_text, align='C')
57
  pdf_output_path = "legal.pdf"
58
  pdf.output(pdf_output_path)
59
 
 
60
  audio_output_path = "legal.wav"
61
+ tts = gTTS(text=output_text, lang='en', slow=False)
62
  tts.save(audio_output_path)
63
 
64
+ return audio_output_path, output_text, pdf_output_path
65
 
66
  except Exception as e:
67
  return None, f"An error occurred: {str(e)}", None
68
 
69
+ # Preloaded document handler
70
+ def process_sample_document(min_length=20):
71
  sample_document_path = "Marbury v. Madison.pdf"
72
 
73
  with open(sample_document_path, "rb") as f:
 
80
 
81
  text_input = gr.Textbox(label="Input Text")
82
  file_input = gr.File(label="Upload PDF or DOCX")
83
+ slider = gr.Slider(minimum=10, maximum=100, step=10, value=20, label="Summary Minimum Length")
84
 
85
  audio_output = gr.Audio(label="Generated Audio")
86
  summary_output = gr.Textbox(label="Generated Summary")