Spaces:
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,6 +1,5 @@
|
|
1 |
import gradio as gr
|
2 |
import os
|
3 |
-
import nltk
|
4 |
import torch
|
5 |
from transformers import AutoTokenizer, AutoModel
|
6 |
from fpdf import FPDF
|
@@ -9,9 +8,10 @@ from pdfminer.high_level import extract_text
|
|
9 |
from docx import Document
|
10 |
from reportlab.lib.pagesizes import letter
|
11 |
from reportlab.pdfgen import canvas
|
|
|
12 |
|
13 |
-
|
14 |
-
|
15 |
|
16 |
# Load the LegalBERT model and tokenizer
|
17 |
tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
|
@@ -33,10 +33,11 @@ def docx_to_pdf(docx_file, output_pdf="converted_doc.pdf"):
|
|
33 |
pdf.save()
|
34 |
return output_pdf
|
35 |
|
36 |
-
# Extractive summarization using LegalBERT
|
37 |
def extractive_summarization(text, num_sentences=5):
|
38 |
-
# Tokenize text into sentences
|
39 |
-
|
|
|
40 |
# Handle case where document has fewer sentences than requested
|
41 |
num_sentences = min(num_sentences, len(sentences))
|
42 |
# Encode sentences
|
|
|
1 |
import gradio as gr
|
2 |
import os
|
|
|
3 |
import torch
|
4 |
from transformers import AutoTokenizer, AutoModel
|
5 |
from fpdf import FPDF
|
|
|
8 |
from docx import Document
|
9 |
from reportlab.lib.pagesizes import letter
|
10 |
from reportlab.pdfgen import canvas
|
11 |
+
import spacy
|
12 |
|
13 |
+
# Load spaCy English model
|
14 |
+
nlp = spacy.load("en_core_web_sm")
|
15 |
|
16 |
# Load the LegalBERT model and tokenizer
|
17 |
tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
|
|
|
33 |
pdf.save()
|
34 |
return output_pdf
|
35 |
|
36 |
+
# Extractive summarization using LegalBERT and spaCy
|
37 |
def extractive_summarization(text, num_sentences=5):
|
38 |
+
# Tokenize text into sentences using spaCy
|
39 |
+
doc = nlp(text)
|
40 |
+
sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]
|
41 |
# Handle case where document has fewer sentences than requested
|
42 |
num_sentences = min(num_sentences, len(sentences))
|
43 |
# Encode sentences
|