arithescientist committed (verified)
Commit: b1f3a7a
Parent(s): 390758e

Update app.py

Files changed (1): app.py (+7 -6)
app.py CHANGED
@@ -1,6 +1,5 @@
 import gradio as gr
 import os
-import nltk
 import torch
 from transformers import AutoTokenizer, AutoModel
 from fpdf import FPDF
@@ -9,9 +8,10 @@ from pdfminer.high_level import extract_text
 from docx import Document
 from reportlab.lib.pagesizes import letter
 from reportlab.pdfgen import canvas
+import spacy

-nltk.download('punkt')
-from nltk.tokenize import sent_tokenize
+# Load spaCy English model
+nlp = spacy.load("en_core_web_sm")

 # Load the LegalBERT model and tokenizer
 tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
@@ -33,10 +33,11 @@ def docx_to_pdf(docx_file, output_pdf="converted_doc.pdf"):
     pdf.save()
     return output_pdf

-# Extractive summarization using LegalBERT
+# Extractive summarization using LegalBERT and spaCy
 def extractive_summarization(text, num_sentences=5):
-    # Tokenize text into sentences
-    sentences = sent_tokenize(text)
+    # Tokenize text into sentences using spaCy
+    doc = nlp(text)
+    sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]
     # Handle case where document has fewer sentences than requested
     num_sentences = min(num_sentences, len(sentences))
     # Encode sentences
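For context, the hunk above cuts off at the "# Encode sentences" comment, so the rest of extractive_summarization is not part of this commit. The sketch below is a hypothetical completion only: the sentence splitting mirrors the committed spaCy code, while the scoring step (masked mean-pooled LegalBERT embeddings ranked by cosine similarity to the document centroid) is an assumed way to finish an extractive summarizer like this, not the repository's actual implementation.

# Sketch (assumption): one possible completion of extractive_summarization,
# combining the committed spaCy sentence splitting with a simple
# centroid-similarity ranking over LegalBERT sentence embeddings.
import spacy
import torch
from transformers import AutoTokenizer, AutoModel

nlp = spacy.load("en_core_web_sm")
tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
model = AutoModel.from_pretrained("nlpaueb/legal-bert-base-uncased")

def extractive_summarization(text, num_sentences=5):
    # Tokenize text into sentences using spaCy (as in the commit)
    doc = nlp(text)
    sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]
    if not sentences:
        return ""
    # Handle case where document has fewer sentences than requested
    num_sentences = min(num_sentences, len(sentences))

    # Hypothetical from here on: encode each sentence with LegalBERT and
    # mean-pool the non-padding token embeddings into one vector per sentence
    with torch.no_grad():
        inputs = tokenizer(sentences, padding=True, truncation=True,
                           max_length=512, return_tensors="pt")
        hidden = model(**inputs).last_hidden_state          # (n, seq, dim)
        mask = inputs["attention_mask"].unsqueeze(-1)       # (n, seq, 1)
        embeddings = (hidden * mask).sum(dim=1) / mask.sum(dim=1)

    # Rank sentences by cosine similarity to the document centroid
    centroid = embeddings.mean(dim=0, keepdim=True)
    scores = torch.nn.functional.cosine_similarity(embeddings, centroid)

    # Keep the top-scoring sentences, restored to their original order
    top_idx = sorted(scores.topk(num_sentences).indices.tolist())
    return " ".join(sentences[i] for i in top_idx)

One practical difference from the removed NLTK code: spacy.load("en_core_web_sm") raises an error unless the model package is already installed (python -m spacy download en_core_web_sm), whereas nltk.download('punkt') fetched its data at runtime, so the spaCy model typically needs to be declared as a dependency of the Space.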