arithescientist committed (verified)
Commit: b1f3a7a
Parent(s): 390758e

Update app.py

Files changed (1): app.py (+7 -6)
app.py CHANGED
@@ -1,6 +1,5 @@
 import gradio as gr
 import os
-import nltk
 import torch
 from transformers import AutoTokenizer, AutoModel
 from fpdf import FPDF
@@ -9,9 +8,10 @@ from pdfminer.high_level import extract_text
 from docx import Document
 from reportlab.lib.pagesizes import letter
 from reportlab.pdfgen import canvas
+import spacy

-nltk.download('punkt')
-from nltk.tokenize import sent_tokenize
+# Load spaCy English model
+nlp = spacy.load("en_core_web_sm")

 # Load the LegalBERT model and tokenizer
 tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
@@ -33,10 +33,11 @@ def docx_to_pdf(docx_file, output_pdf="converted_doc.pdf"):
     pdf.save()
     return output_pdf

-# Extractive summarization using LegalBERT
+# Extractive summarization using LegalBERT and spaCy
 def extractive_summarization(text, num_sentences=5):
-    # Tokenize text into sentences
-    sentences = sent_tokenize(text)
+    # Tokenize text into sentences using spaCy
+    doc = nlp(text)
+    sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]
     # Handle case where document has fewer sentences than requested
     num_sentences = min(num_sentences, len(sentences))
     # Encode sentences
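For context, the hunk above cuts off at the "# Encode sentences" comment, so the rest of extractive_summarization is not part of this commit. The sketch below is a hypothetical completion only: the sentence splitting mirrors the committed spaCy code, while the scoring step (masked mean-pooled LegalBERT embeddings ranked by cosine similarity to the document centroid) is an assumed way to finish an extractive summarizer like this, not the repository's actual implementation.

# Sketch (assumption): one possible completion of extractive_summarization,
# combining the committed spaCy sentence splitting with a simple
# centroid-similarity ranking over LegalBERT sentence embeddings.
import spacy
import torch
from transformers import AutoTokenizer, AutoModel

nlp = spacy.load("en_core_web_sm")
tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
model = AutoModel.from_pretrained("nlpaueb/legal-bert-base-uncased")

def extractive_summarization(text, num_sentences=5):
    # Tokenize text into sentences using spaCy (as in the commit)
    doc = nlp(text)
    sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]
    if not sentences:
        return ""
    # Handle case where document has fewer sentences than requested
    num_sentences = min(num_sentences, len(sentences))

    # Hypothetical from here on: encode each sentence with LegalBERT and
    # mean-pool the non-padding token embeddings into one vector per sentence
    with torch.no_grad():
        inputs = tokenizer(sentences, padding=True, truncation=True,
                           max_length=512, return_tensors="pt")
        hidden = model(**inputs).last_hidden_state          # (n, seq, dim)
        mask = inputs["attention_mask"].unsqueeze(-1)       # (n, seq, 1)
        embeddings = (hidden * mask).sum(dim=1) / mask.sum(dim=1)

    # Rank sentences by cosine similarity to the document centroid
    centroid = embeddings.mean(dim=0, keepdim=True)
    scores = torch.nn.functional.cosine_similarity(embeddings, centroid)

    # Keep the top-scoring sentences, restored to their original order
    top_idx = sorted(scores.topk(num_sentences).indices.tolist())
    return " ".join(sentences[i] for i in top_idx)

One practical difference from the removed NLTK code: spacy.load("en_core_web_sm") raises an error unless the model package is already installed (python -m spacy download en_core_web_sm), whereas nltk.download('punkt') fetched its data at runtime, so the spaCy model typically needs to be declared as a dependency of the Space.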