import gradio as gr
import os
import torch
from transformers import AutoTokenizer, AutoModel
from fpdf import FPDF
from gtts import gTTS
from pdfminer.high_level import extract_text
from docx import Document
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
import spacy
# Load spaCy English model
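# (assumes en_core_web_sm is available, e.g. installed via `python -m spacy download en_core_web_sm`)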
nlp = spacy.load("en_core_web_sm")
# Load the LegalBERT model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
model = AutoModel.from_pretrained("nlpaueb/legal-bert-base-uncased")
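# Note: the model is used purely as an encoder; sentence vectors are built
# below by mean-pooling last_hidden_state, so no task-specific head is needed.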
# Convert DOCX to PDF using ReportLab
def docx_to_pdf(docx_file, output_pdf="converted_doc.pdf"):
    doc = Document(docx_file)
    full_text = [para.text for para in doc.paragraphs]
    pdf = canvas.Canvas(output_pdf, pagesize=letter)
    pdf.setFont("Helvetica", 12)
    text_object = pdf.beginText(40, 750)
    for line in full_text:
        text_object.textLine(line)
    pdf.drawText(text_object)
    pdf.save()
    return output_pdf
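# Note: textLine() does not wrap or paginate long paragraphs; this is acceptable
# here because the generated PDF is only an intermediate artifact that is
# immediately re-extracted to plain text by pdfminer.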
# Extractive summarization using LegalBERT and spaCy
def extractive_summarization(text, num_sentences=5):
    # Split the text into sentences using spaCy
    doc = nlp(text)
    sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]
    if not sentences:
        return ""
    # Handle documents with fewer sentences than requested
    num_sentences = min(num_sentences, len(sentences))
    # Encode all sentences with LegalBERT
    inputs = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    # Sentence embeddings: mean of the token embeddings in the last hidden state
    embeddings = outputs.last_hidden_state.mean(dim=1)
    # Score each sentence by cosine similarity to the mean document embedding
    document_embedding = embeddings.mean(dim=0, keepdim=True)
    similarities = torch.nn.functional.cosine_similarity(embeddings, document_embedding)
    # Select the top-scoring sentences
    top_k = torch.topk(similarities, k=num_sentences)
    selected_indices = top_k.indices.sort().values  # Restore original document order
    summary_sentences = [sentences[int(idx)] for idx in selected_indices]
    # Join the selected sentences into the summary
    summary = ' '.join(summary_sentences)
    return summary
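# Illustrative usage (the path below is a hypothetical example, not part of the app):
#   raw_text = open("sample_brief.txt", encoding="utf-8").read()
#   print(extractive_summarization(raw_text, num_sentences=3))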
# Process input file (PDF or DOCX)
def pdf_to_text(text, PDF, num_sentences=5):
    try:
        if PDF is not None:
            file_extension = os.path.splitext(PDF.name)[1].lower()
            if file_extension == '.docx':
                pdf_file_path = docx_to_pdf(PDF.name)
                text = extract_text(pdf_file_path)
            elif file_extension == '.pdf':
                text = extract_text(PDF.name)
            else:
                return None, "Unsupported file type", None
        elif text != "":
            pass  # Use the text provided in the textbox
        else:
            return None, "Please provide input text or upload a file.", None
        summary = extractive_summarization(text, num_sentences)
        # Generate a PDF of the summary
        pdf = FPDF()
        pdf.add_page()
        pdf.set_font("Times", size=12)
        pdf.multi_cell(190, 10, txt=summary, align='L')
        pdf_output_path = "legal_summary.pdf"
        pdf.output(pdf_output_path)
        # Generate an audio file of the summary (gTTS always writes MP3 data)
        audio_output_path = "legal_summary.mp3"
        tts = gTTS(text=summary, lang='en', slow=False)
        tts.save(audio_output_path)
        return audio_output_path, summary, pdf_output_path
    except Exception as e:
        return None, f"An error occurred: {str(e)}", None
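# pdf_to_text returns (audio_path, summary_text, pdf_path); on failure the audio
# and PDF slots are None and the middle slot carries the error message, matching
# the three Gradio outputs wired up below.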
# Preloaded document handler
def process_sample_document(num_sentences=5):
    sample_document_path = "Marbury v. Madison.pdf"
    with open(sample_document_path, "rb") as f:
        return pdf_to_text("", f, num_sentences)
# Gradio interface
with gr.Blocks() as iface:
    with gr.Row():
        process_sample_button = gr.Button("Summarize Marbury v. Madison Case (Pre-Uploaded)")
    text_input = gr.Textbox(label="Input Text")
    file_input = gr.File(label="Upload PDF or DOCX")
    slider = gr.Slider(minimum=1, maximum=20, step=1, value=5, label="Number of Summary Sentences")
    audio_output = gr.Audio(label="Generated Audio")
    summary_output = gr.Textbox(label="Generated Summary")
    pdf_output = gr.File(label="Summary PDF")

    # Summarize the pre-uploaded sample document
    process_sample_button.click(
        fn=process_sample_document,
        inputs=slider,
        outputs=[audio_output, summary_output, pdf_output]
    )

    # Summarize on text submission or file upload
    def on_submit(text, file, num_sentences):
        return pdf_to_text(text, file, num_sentences)

    text_input.submit(
        fn=on_submit,
        inputs=[text_input, file_input, slider],
        outputs=[audio_output, summary_output, pdf_output]
    )
    file_input.change(
        fn=on_submit,
        inputs=[text_input, file_input, slider],
        outputs=[audio_output, summary_output, pdf_output]
    )
if __name__ == "__main__":
    iface.launch()