"""
This file contains the implementation of the candidate manager in charge of loading the candidate sets,
and modifying the phrase annotations using the loaded candidates.
"""
import json
from span_annotation import PhraseAnnotation
from configuration import get_resources_dir
class CandidateManager:
    def __init__(self, mentions_vocab, is_kb_yago=False, is_ppr_for_ned=False, is_context_agnostic=False,
                 is_indexed_for_spans=False):
        self.mentions_vocab = mentions_vocab
        self.candidates = None
        if is_kb_yago:
            print(" * Loading the candidates stored for KB+YAGO ...")
            is_context_agnostic = True
            is_indexed_for_spans = False
            self.load_kb_plus_yago()
        elif is_ppr_for_ned:
            print(" * Loading the {} PPRforNED candidate set ...".format(
                'context agnostic' if is_context_agnostic else 'context aware'))
            self.load_ppr_for_ned_candidates(is_context_agnostic, is_indexed_for_spans)
        else:
            raise ValueError("Either 'is_kb_yago' or 'is_ppr_for_ned' flags must be True!")
        self.is_context_agnostic = is_context_agnostic
        self.is_indexed_for_spans = is_indexed_for_spans
        self.is_kb_yago = is_kb_yago
        self.is_ppr_for_ned = is_ppr_for_ned

    def load_ppr_for_ned_candidates(self, is_context_agnostic, is_indexed_for_spans):
        if is_context_agnostic:
            file_address = "context_agnostic_mentions.json"
        elif is_indexed_for_spans:
            file_address = "context_aware_spans.json"
        else:
            file_address = "context_aware_mentions.json"
        candidates_a = json.load(open(
            get_resources_dir() / "data" / "candidates" / "aida_testa_pprforned" / file_address, "r"))
        candidates_b = json.load(open(
            get_resources_dir() / "data" / "candidates" / "aida_testb_pprforned" / file_address, "r"))
        if is_context_agnostic:
            # Merge the testb candidate lists into the testa ones, skipping duplicate candidate entries.
            for key in candidates_b:
                if key in candidates_a:
                    for elem in candidates_b[key]:
                        if elem not in candidates_a[key]:
                            candidates_a[key].append(elem)
                else:
                    candidates_a[key] = candidates_b[key]
        else:
            candidates_a.update(candidates_b)
        self.candidates = candidates_a

    def load_kb_plus_yago(self):
        self.candidates = json.load(open(
            get_resources_dir() / "data" / "candidates" / "kb_plus_yago_candidates.json", "r"))

    def _fetch_candidates(self, phrase_annotation, sentence=None):
        candidates = []
        if self.is_kb_yago:
            phrase_to_check = phrase_annotation.word_string.lower()
            if phrase_to_check in self.candidates:
                candidates = self.candidates[phrase_to_check]
        elif self.is_ppr_for_ned:
            # TODO lower-cased check of mention surface forms
            # Span keys in the span-indexed candidate files look like "(begin_character, end_character)".
            span_key = f"({phrase_annotation.begin_character}, {phrase_annotation.end_character})"
            if self.is_context_agnostic and phrase_annotation.word_string in self.candidates:
                candidates = self.candidates[phrase_annotation.word_string]
            elif not self.is_context_agnostic and sentence in self.candidates:
                if self.is_indexed_for_spans and span_key in self.candidates[sentence]:
                    candidates = self.candidates[sentence][span_key]
                elif not self.is_indexed_for_spans and phrase_annotation.word_string in self.candidates[sentence]:
                    candidates = self.candidates[sentence][phrase_annotation.word_string]
        return candidates

    def modify_phrase_annotation_using_candidates(self, phrase_annotation: PhraseAnnotation, sentence: str = None):
        """
        Post-processes the :param phrase_annotation: found in :param sentence: to make sure its resolved
        annotation is bound to the pre-loaded {self.candidates} set.
        Candidate look-up for spans is not possible in the context agnostic scenario, so
        {self.is_indexed_for_spans} is only considered when {self.is_context_agnostic} is False.
        """
        if self.candidates is None or phrase_annotation.resolved_annotation == 0:
            return
        candidates = self._fetch_candidates(phrase_annotation, sentence)
        if not candidates:
            phrase_annotation.set_alternative_as_resolved_annotation(0)
            return
        if self.is_kb_yago:
            # KB+YAGO candidates are (entity, prior probability) pairs.
            candidates_ = [self.mentions_vocab[x[0]] for x in candidates if x[0] in self.mentions_vocab]
            prior_probabilities_ = [x[1] for x in candidates if x[0] in self.mentions_vocab]
        else:
            # PPRforNED candidates are plain entity identifiers with no associated prior probability.
            candidates_ = [self.mentions_vocab[x] for x in candidates if x in self.mentions_vocab]
            prior_probabilities_ = [1.0 for x in candidates if x in self.mentions_vocab]
        # TODO use the prior_probabilities_ to adjust the probabilities
        if candidates_:
            all_p_anns = phrase_annotation.all_possible_annotations()
            filtered_p_predictions = sorted(
                [x for x in all_p_anns if x[0] in candidates_], key=lambda y: y[1], reverse=True)
            if filtered_p_predictions:
                phrase_annotation.set_alternative_as_resolved_annotation(filtered_p_predictions[0][0])
            else:
                phrase_annotation.set_alternative_as_resolved_annotation(0)
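

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only, not part of the pipeline): it assumes the PPRforNED
    # candidate JSON files are present under get_resources_dir() / "data" / "candidates" as expected
    # by the loaders above. An empty mentions_vocab is passed just to exercise loading; in the real
    # pipeline this would be the entity-to-id vocabulary.
    manager = CandidateManager(mentions_vocab={}, is_ppr_for_ned=True, is_context_agnostic=True)
    print(" * Loaded candidate sets for {} mention keys.".format(len(manager.candidates)))
    # A typical call on a PhraseAnnotation produced elsewhere in the pipeline would then be:
    #   manager.modify_phrase_annotation_using_candidates(phrase_annotation, sentence=sentence_text)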