File size: 1,845 Bytes
7e3e85d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
from lexrank import STOPWORDS
from lexrank import LexRank as LR
import nltk

from .base_single_doc_model import SingleDocSummModel


class LexRankModel(SingleDocSummModel):
    """Extractive single-document summarizer built on the `lexrank` package.

    A ``LexRank`` instance is fitted once, at construction time, on a
    sentence-tokenized corpus. ``summarize`` then asks that instance for the
    top-ranked sentences of each input document and joins them with spaces.
    """

    # static variables
    model_name = "LexRank"
    is_extractive = True
    is_neural = False

    def __init__(self, data, summary_length=2, threshold=0.1):
        """Fit the underlying LexRank model on ``data``.

        Args:
            data: Iterable of documents; each one is split into sentences
                with ``nltk.sent_tokenize`` before being handed to LexRank.
            summary_length: Value forwarded as ``summary_size`` to
                ``LexRank.get_summary`` (number of sentences per summary).
            threshold: Value forwarded as ``threshold`` to
                ``LexRank.get_summary``.
        """
        super().__init__()

        # Make sure the "punkt" resource is available before sent_tokenize
        # is called below.
        nltk.download("punkt", quiet=True)
        tokenized_corpus = [nltk.sent_tokenize(example) for example in data]
        self.lxr = LR(tokenized_corpus, stopwords=STOPWORDS["en"])
        self.summary_length = summary_length
        self.threshold = threshold

    def summarize(self, corpus, queries=None):
        """Return one extractive summary string per document in ``corpus``.

        Args:
            corpus: Iterable of documents to summarize; validated by
                ``assert_summ_input_type`` before any work is done.
            queries: Only passed to ``assert_summ_input_type``; it is not
                used when building the summaries.

        Returns:
            A list of strings, in the same order as ``corpus``. Each string
            is the selected sentences joined by single spaces. A document
            that yields no sentences produces an empty string.
        """
        self.assert_summ_input_type(corpus, queries)

        summaries = []
        for document in corpus:
            sentences = nltk.sent_tokenize(document)
            if not sentences:
                # Nothing to rank. Skip LexRank rather than hand it an empty
                # sentence list; the result equals joining an empty summary.
                summaries.append("")
                continue

            selected = self.lxr.get_summary(
                sentences,
                summary_size=self.summary_length,
                threshold=self.threshold,
            )
            summaries.append(" ".join(selected))

        return summaries

    @classmethod
    def show_capability(cls):
        """Print a human-readable description of the model and its arguments."""
        basic_description = cls.generate_basic_description()
        # NOTE(review): the `threshold` wording below is kept as written;
        # confirm it matches how lexrank actually interprets the value.
        more_details = (
            "Works by using a graph-based method to identify the most salient sentences in the document. \n"
            "Strengths: \n - Fast with low memory usage \n - Allows for control of summary length \n "
            "Weaknesses: \n - Not as accurate as neural methods. \n "
            "Initialization arguments: \n "
            "- `data`: Unlabelled corpus of documents. \n "
            "- `summary_length`: sentence length of summaries \n "
            "- `threshold`: Level of salience required for sentence to be included in summary."
        )
        print(f"{basic_description} \n {'#'*20} \n {more_details}")