# SummerTime/model/single_doc/lexrank_model.py
from lexrank import STOPWORDS
from lexrank import LexRank as LR
import nltk
from .base_single_doc_model import SingleDocSummModel
class LexRankModel(SingleDocSummModel):
    """Extractive single-document summarizer backed by the `lexrank` package.

    Scores sentences with a graph-based (PageRank-style) salience measure
    and returns the top-ranked sentences of each document as its summary.
    """

    # static variables
    model_name = "LexRank"
    is_extractive = True
    is_neural = False

    def __init__(self, data, summary_length=2, threshold=0.1):
        """Fit LexRank IDF statistics on an unlabelled corpus.

        :param data: iterable of document strings used to fit the model.
        :param summary_length: number of sentences per produced summary.
        :param threshold: minimum salience for a sentence to be selected.
        """
        super().__init__()
        # "punkt" provides the sentence tokenizer used by nltk.sent_tokenize.
        nltk.download("punkt", quiet=True)
        corpus = [nltk.sent_tokenize(example) for example in data]
        self.lxr = LR(corpus, stopwords=STOPWORDS["en"])
        self.summary_length = summary_length
        self.threshold = threshold

    def summarize(self, corpus, queries=None):
        """Summarize each document in `corpus`.

        :param corpus: iterable of document strings.
        :param queries: unused; accepted for interface compatibility with
            query-based summarizers.
        :return: list of summary strings, one per input document.
        """
        self.assert_summ_input_type(corpus, queries)
        documents = [nltk.sent_tokenize(document) for document in corpus]
        summaries = [
            " ".join(
                self.lxr.get_summary(
                    document, summary_size=self.summary_length, threshold=self.threshold
                )
            )
            for document in documents
        ]
        return summaries

    @classmethod
    def show_capability(cls):
        """Print a human-readable description of this model's capabilities."""
        basic_description = cls.generate_basic_description()
        # NOTE: stray trailing backtick after "documents." removed from the
        # printed description (was a typo in the user-facing text).
        more_details = (
            "Works by using a graph-based method to identify the most salient sentences in the document. \n"
            "Strengths: \n - Fast with low memory usage \n - Allows for control of summary length \n "
            "Weaknesses: \n - Not as accurate as neural methods. \n "
            "Initialization arguments: \n "
            "- `corpus`: Unlabelled corpus of documents. \n "
            "- `summary_length`: sentence length of summaries \n "
            "- `threshold`: Level of salience required for sentence to be included in summary."
        )
        print(f"{basic_description} \n {'#'*20} \n {more_details}")