File size: 2,188 Bytes
cc3c391
8a6bb0b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
from transformers import pipeline

class FillInSummary:
    """Organize summarization and the subsequent fill-in task.

    Holds a summarization pipeline and a named-entity-recognition (NER)
    pipeline, and offers one method for each step.
    """

    def __init__(self):
        """Initialize the class with a summarizer and a NER model."""
        # Refer to https://huggingface.co/docs/transformers/v4.18.0/en/main_classes/pipelines#transformers.SummarizationPipeline
        # for further information about configuration.
        self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
        # Using default model: https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english
        self.ner = pipeline("ner", aggregation_strategy='simple')

    def summarize(self, text: str, *, max_length: int = 400, min_length: int = 100) -> str:
        """Summarize the given text.

        Parameters
        ----------
        text : str
          Text to be summarized. Must not exceed BART's maximal input length.
        max_length : int, optional
          Forwarded to the summarization pipeline as ``max_length``.
          Defaults to 400.
        min_length : int, optional
          Forwarded to the summarization pipeline as ``min_length``.
          Defaults to 100.

        Returns
        -------
        str
          Summary
        """
        # Refer to https://huggingface.co/docs/transformers/main/en/main_classes/configuration#transformers.PretrainedConfig
        # for further configuration of the summarizer call.
        output: list = self.summarizer(
            text,
            max_length=max_length,
            min_length=min_length,
            do_sample=False)
        # The pipeline returns one result dict per input; only one text is passed.
        return output[0]['summary_text']

    def blank_ne_out(self, text: str) -> dict:
        """Blank out named entities.

        Transforms 'X did this.' to {
          'text': '_ did this.',
          'ner': [{
            'end': 1,
            'entity_group': 'ORG',
            'score': 0.73085225,
            'start': 0,
            'word': 'X'
            }]}

        Parameters
        ----------
        text : str
          Summarized text.

        Returns
        -------
        dict
          Contains the blanked-out text (key ``'text'``) and the list of
          recognized named entities as returned by the NER pipeline
          (key ``'ner'``).
        """
        ner_list: list = self.ner(text)
        output_str: str = text
        # Every entity is replaced by exactly as many underscores as it has
        # characters, so the string length never changes and the offsets of
        # the remaining entities stay valid.
        for entity in ner_list:
            start, end = entity['start'], entity['end']
            output_str = output_str[:start] + "_" * (end - start) + output_str[end:]
        return {
            'text': output_str,
            'ner': ner_list
        }