File size: 7,344 Bytes
c337225
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
"""
Data structure classes required and used for multiple levels of granularity in spans.
"""
from data_loader import dl_sa
from mosestokenizer import MosesDetokenizer
# Module-level Moses detokenizer constructed for 'en'; PhraseAnnotation.word_string calls it to join the
#  word strings of a phrase back into a single piece of text.
detokenize = MosesDetokenizer('en')


class PhraseAnnotation:
    """
    A phrase-level span made of one or more consecutive word-level annotations.

    A phrase starts from a single word and grows through ``add``. Its resolved annotation is copied from the
     initial word (and can later be overridden with ``set_alternative_as_resolved_annotation``).
    """

    # Ordered (text to find, replacement) pairs applied to every word string before detokenization.
    # The first pair turns a real newline into U+010A. The remaining pairs map two/three character sequences,
    #  which look like byte-level token renderings of UTF-8 encoded characters, back to the intended character.
    # Escapes are used on purpose: when the search strings were written as literal characters, three of these
    #  pairs had degenerated into no-ops (search text identical to the replacement).
    _TOKEN_TEXT_REPAIRS = (
        ("\n", "\u010a"),
        ("\u00c2\u00a3", "\u00a3"),        # pound sign
        ("\u00e2\u0126\u00a2", "\u2122"),  # trade mark sign
        ("\u00c3\u00bc", "\u00fc"),        # u with diaeresis
        ("\u00c3\u00a9", "\u00e9"),        # e with acute
        ("\u00c3\u0143", "\u00ed"),        # i with acute
    )

    def __init__(self, initial_word):
        """
        :param initial_word: the first word of the phrase; its ``resolved_annotation`` and
         ``ppr_for_ned_candidates`` become the initial values of the phrase.
        """
        self.words = [initial_word]
        self._resolved_annotation = initial_word.resolved_annotation
        self.ppr_for_ned_candidates = initial_word.ppr_for_ned_candidates

    @staticmethod
    def _mean(values):
        """Return the arithmetic mean of ``values``; an empty sequence yields 0.0 instead of dividing by zero."""
        return sum(values) / len(values) if values else 0.0

    @classmethod
    def _repair_token_text(cls, text):
        """Apply every pair of ``_TOKEN_TEXT_REPAIRS`` to ``text``, in order, and return the result."""
        for broken, fixed in cls._TOKEN_TEXT_REPAIRS:
            text = text.replace(broken, fixed)
        return text

    def _tag_table(self):
        """
        Return the id -> tag lookup used by ``__str__``.

        The ``idx2tag`` attribute of the first subword annotation of the first word is used when it exists;
         otherwise the lookup falls back to ``dl_sa.mentions_itos``.
        """
        first_word_annotations = self.words[0].annotations
        table = getattr(first_word_annotations[0], 'idx2tag', None) if first_word_annotations else None
        return dl_sa.mentions_itos if table is None else table

    @property
    def has_valid_bioes_labels(self):
        """True only when every word reports valid BIOES labels and actually carries them."""
        # B = 0, I = 1, O = 2, E = 3, S = 4
        return all(x.has_valid_bioes_labels and x.bioes_labels is not None for x in self.words)

    def add(self, word):
        """Append ``word`` to the phrase and, for annotated phrases, narrow the PPRforNED candidates."""
        self.words.append(word)
        # There are some phrases that are annotated as O but have PPRforNED candidates, those will be ignored!
        if self._resolved_annotation > 0 and self.ppr_for_ned_candidates != word.ppr_for_ned_candidates:
            # keep only the candidates shared with the new word (the original ordering is not preserved)
            self.ppr_for_ned_candidates = list(set(self.ppr_for_ned_candidates) & set(word.ppr_for_ned_candidates))

    def all_possible_annotations(self):
        """
        Rank the candidate ids that are shared by every word of the phrase.

        :return: a list of ``(candidate_id, confidence)`` tuples sorted by descending confidence, where the
         confidence is the average over the words of each word's mean probability for that candidate.
        """
        all_common_ids = set.intersection(*[{c[0] for c in w.candidates} for w in self.words])
        scored = []
        for cid in all_common_ids:
            per_word_means = [self._mean(c[1]) for w in self.words for c in w.candidates if c[0] == cid]
            scored.append((cid, self._mean(per_word_means)))
        return sorted(scored, key=lambda x: x[1], reverse=True)

    def set_alternative_as_resolved_annotation(self, alternative):
        """Override the resolved annotation of the phrase with ``alternative``."""
        self._resolved_annotation = alternative

    @property
    def resolved_annotation(self):
        """The annotation id currently assigned to the phrase."""
        return self._resolved_annotation

    @property
    def subword_annotations(self):
        """All subword annotations of all the words, flattened in reading order."""
        return [x for w in self.words for x in w.annotations]

    @property
    def word_string(self):
        """The detokenized text of the phrase, built from the repaired word strings."""
        return detokenize([self._repair_token_text(x.word_string) for x in self.words])

    @property
    def begin_character(self):
        """First element of the offset pair of the first token of the first word."""
        return self.words[0].token_offsets[0][1][0]

    @property
    def end_character(self):
        """Last element of the offset pair of the last token of the last word."""
        return self.words[-1].token_offsets[-1][1][-1]

    @property
    def average_annotation_confidence(self):
        """Mean of the ``resolved_annotation_confidence`` values of the words."""
        ac = [x.resolved_annotation_confidence for x in self.words]
        return sum(ac) / len(ac)

    def __str__(self):
        return f"{self.word_string} ({self.begin_character}, {self.end_character}) | annotation: " \
               f"{self._tag_table()[self.resolved_annotation]}"


class WordAnnotation:
    """
    A word-level annotation aggregated from the annotations of the subwords that form the word.

    After construction the instance exposes:
     - ``candidates``: ``(candidate_id, per_subword_probabilities)`` tuples, best average probability first,
     - ``resolved_annotation`` and ``resolved_annotation_confidence``: the chosen id and its mean probability,
     - ``has_valid_bioes_labels`` / ``bioes_labels``: the BIOES information copied from the subwords.
    """
    def __init__(self, subword_annotations, token_offsets, ppr_for_ned_candidates=None):
        """
        :param subword_annotations: the subword-level annotations of the word; may be empty (or None), in which
         case the word resolves to annotation 0 with zero confidence.
        :param token_offsets: one entry per token; element 0 of each entry is the token text and element 1 is
         its offset pair.
        :param ppr_for_ned_candidates: optional PPRforNED candidates; defaults to a new empty list.
        """
        if ppr_for_ned_candidates is None:
            ppr_for_ned_candidates = []
        self.annotations = [] if subword_annotations is None else subword_annotations
        self.token_offsets = token_offsets
        self.ppr_for_ned_candidates = ppr_for_ned_candidates
        self.is_valid_annotation = bool(self.annotations)
        # '\u0120' is stripped from every token text before the pieces are glued together
        self.word_string = ''.join(x[0].replace('\u0120', '') for x in token_offsets)
        # even if self.is_valid_annotation is True we could still have the candidates to be empty
        #   since there could be no consensus among the subword predictions.
        if self.is_valid_annotation:
            shared_ids = set.intersection(*[set(y.top_k_i_list) for y in self.annotations])
        else:
            shared_ids = set()
        self.candidates = sorted([(cid, self._get_assigned_probabilities(cid)) for cid in shared_ids],
                                 key=lambda x: self._mean(x[1]), reverse=True)
        self.resolved_annotation = self._resolve_annotation()
        rc = self._get_assigned_probabilities(self.resolved_annotation)
        if not rc:
            # A word without subword annotations has no probabilities at all; the unguarded average used to
            #  raise ZeroDivisionError here. A single 0.0 keeps every later average over this list well-defined.
            rc = [0.0]
        self.resolved_annotation_confidence = self._mean(rc)
        if not self.candidates:
            self.candidates = [(self.resolved_annotation, rc)]
        assert self.resolved_annotation in [x[0] for x in self.candidates]
        self.has_valid_bioes_labels = all(x.has_valid_bioes_label for x in self.annotations)
        self.bioes_labels = None if not self.has_valid_bioes_labels else [x.bioes_label for x in self.annotations]

    @staticmethod
    def _mean(values):
        """Return the arithmetic mean of ``values``; an empty sequence yields 0.0 instead of dividing by zero."""
        return sum(values) / len(values) if values else 0.0

    def _tag_table(self):
        """
        Return the id -> tag lookup used by ``__str__``.

        The ``idx2tag`` attribute of the first subword annotation is used when it exists; otherwise (including
         words without subword annotations) the lookup falls back to ``dl_sa.mentions_itos``.
        """
        table = getattr(self.annotations[0], 'idx2tag', None) if self.annotations else None
        return dl_sa.mentions_itos if table is None else table

    def _resolve_annotation(self):
        """
        Choose the annotation id of the word from its subword predictions.

        :return: 0 when there are no subword annotations; the common prediction when all subwords agree;
         otherwise the best shared candidate, or (without shared candidates) the most frequent prediction,
         falling back to the first subword's prediction when no prediction occurs more than once.
        """
        if not self.is_valid_annotation:
            return 0
        r = [x.item() for x in self.annotations]
        if r.count(r[0]) == len(r):
            return r[0]
        if self.candidates:
            # here we return the annotation with the highest average probability prediction over all the subwords
            return self.candidates[0][0]
        # here we return the annotation which the model has predicted as highest probability for
        #   the majority of the subwords
        most_frequent = max(set(r), key=r.count)
        return r[0] if r.count(most_frequent) == 1 else most_frequent

    def _get_assigned_probabilities(self, cid):
        """
        Collect, for every subword annotation, the probability it assigned to ``cid``.

        :return: a list aligned with ``self.annotations``; subwords whose top-k list does not contain ``cid``
         contribute 0.0.
        """
        assigned_probabilities = [
            next((p for i, p in zip(a.top_k_i_list, a.top_k_p_list) if i == cid), 0.0)
            for a in self.annotations
        ]
        assert len(assigned_probabilities) == len(self.annotations)
        return assigned_probabilities

    def __str__(self):
        idx2tag = self._tag_table()
        ann = idx2tag[self.resolved_annotation]
        cdns = ','.join([f'({idx2tag[x[0]]}: {self._mean(x[1])})' for x in self.candidates])
        return f"{self.word_string} | annotation: {ann} | candidates: [{cdns}]"


class SubwordAnnotation:
    """
    The value of this class will be equal to the value of its "self.top_k_i_list[0]", the rest of the information
     will be carried over for future decision-making and evaluation.

    Instances compare equal to an int, or to another SubwordAnnotation, that has the same top prediction.
     Because ``__eq__`` is defined without ``__hash__``, instances are not hashable.
    """
    def __init__(self, top_k_p_list, top_k_i_list, subword_string):
        """
        :param top_k_p_list: probabilities of the top-k predictions, aligned with ``top_k_i_list``.
        :param top_k_i_list: ids of the top-k predictions; element 0 is the value of this annotation.
        :param subword_string: text of the subword; an empty/None value is stored as "UNDEF_STR".
        """
        self.top_k_p_list = top_k_p_list
        self.top_k_i_list = top_k_i_list
        subword_string = subword_string if subword_string else "UNDEF_STR"
        self.subword_string = subword_string.replace('\u0120', '')
        # BIOES information stays at these defaults until set_bioes_label is called
        self.bioes_label = 2
        self.has_valid_bioes_label = False
        self.bioes_probabilities = None

    def __eq__(self, other):
        if isinstance(other, int):
            return self.top_k_i_list[0] == other
        if isinstance(other, SubwordAnnotation):
            return self.top_k_i_list[0] == other.top_k_i_list[0]
        # Returning NotImplemented (instead of raising) lets Python fall back to its default comparison, so
        #  expressions such as `annotation == None` or `annotation in mixed_list` evaluate instead of crashing.
        return NotImplemented

    def __str__(self):
        entries = '>> <<'.join(
            f'{dl_sa.mentions_itos[i]}: {p:.3f}' for i, p in zip(self.top_k_i_list, self.top_k_p_list))
        return f"({self.subword_string}, <<{entries}>>)"

    def item(self):
        """Return the id of the top prediction."""
        return self.top_k_i_list[0]

    def item_probability(self):
        """Return the probability of the top prediction."""
        return self.top_k_p_list[0]

    def set_bioes_label(self, label: int, probs: list):
        """
        Attach a BIOES label and its probabilities to this subword and mark the label as valid.

        :param label: the BIOES label id.
        :param probs: the five BIOES probabilities.
        """
        # NOTE(review): this range admits label 5 although exactly five probabilities are required below;
        #  confirm whether 5 is a legitimate label before tightening the bound.
        assert 0 <= label <= 5
        assert len(probs) == 5
        self.has_valid_bioes_label = True
        self.bioes_label = label
        self.bioes_probabilities = probs