import wikipedia class KB(): def __init__(self): self.entities = {} # { entity_title: {...} } self.relations = [] # [ head: entity_title, type: ..., tail: entity_title, # meta: { article_url: { spans: [...] } } ] self.sources = {} # { article_url: {...} } def merge_with_kb(self, kb2): for r in kb2.relations: article_url = list(r["meta"].keys())[0] source_data = kb2.sources[article_url] self.add_relation(r, source_data["article_title"], source_data["article_publish_date"]) def are_relations_equal(self, r1, r2): return all(r1[attr] == r2[attr] for attr in ["head", "type", "tail"]) def exists_relation(self, r1): return any(self.are_relations_equal(r1, r2) for r2 in self.relations) def merge_relations(self, r2): r1 = [r for r in self.relations if self.are_relations_equal(r2, r)][0] # if different article article_url = list(r2["meta"].keys())[0] if article_url not in r1["meta"]: r1["meta"][article_url] = r2["meta"][article_url] # if existing article else: spans_to_add = [span for span in r2["meta"][article_url]["spans"] if span not in r1["meta"][article_url]["spans"]] r1["meta"][article_url]["spans"] += spans_to_add def get_wikipedia_data(self, candidate_entity): try: page = wikipedia.page(candidate_entity, auto_suggest=False) entity_data = { "title": page.title, "url": page.url, "summary": page.summary } return entity_data except: return None def add_entity(self, e): self.entities[e["title"]] = {k:v for k,v in e.items() if k != "title"} def add_relation(self, r, article_title, article_publish_date): # check on wikipedia candidate_entities = [r["head"], r["tail"]] entities = [self.get_wikipedia_data(ent) for ent in candidate_entities] # if one entity does not exist, stop if any(ent is None for ent in entities): return # manage new entities for e in entities: self.add_entity(e) # rename relation entities with their wikipedia titles r["head"] = entities[0]["title"] r["tail"] = entities[1]["title"] # add source if not in kb article_url = list(r["meta"].keys())[0] if article_url not in self.sources: self.sources[article_url] = { "article_title": article_title, "article_publish_date": article_publish_date } # manage new relation if not self.exists_relation(r): self.relations.append(r) else: self.merge_relations(r) def get_textual_representation(self): res = "" res += "### Entities\n" for e in self.entities.items(): # shorten summary e_temp = (e[0], {k:(v[:100] + "..." if k == "summary" else v) for k,v in e[1].items()}) res += f"- {e_temp}\n" res += "\n" res += "### Relations\n" for r in self.relations: res += f"- {r}\n" res += "\n" res += "### Sources\n" for s in self.sources.items(): res += f"- {s}\n" return res