Spaces:
Build error
Build error
# -*- coding: utf-8 -*-
'''
@Author : Jiangjie Chen
@Time : 2020/11/12 21:19
@Contact : jjchen19@fudan.edu.cn
@Description:
'''
import wikipediaapi | |
import nltk | |
from nltk.tokenize import sent_tokenize | |
# sent_tokenize relies on the 'punkt' resource; request it at import time.
nltk.download('punkt')

# Try the top-level import first and fall back to the package-relative one.
# Only ImportError triggers the fallback, so unrelated failures (and
# KeyboardInterrupt/SystemExit) are no longer silently swallowed.
try:
    from entitylinker import ELClient
except ImportError:
    from .entitylinker import ELClient
class DocRetrieval:
    """Retrieve Wikipedia summary sentences for the entities linked in a claim.

    An ``ELClient`` links a claim to page titles; the summary of each titled
    Wikipedia page is then split into sentences with ``sent_tokenize``.
    """

    def __init__(self, link_type):
        """Create the Wikipedia client and the entity-linking client.

        Args:
            link_type: Forwarded unchanged to ``ELClient`` as its first
                argument.
        """
        # NOTE(review): newer wikipedia-api releases appear to expect a user
        # agent as the first argument -- confirm against the installed version.
        self.wiki = wikipediaapi.Wikipedia('en')
        self.er_client = ELClient(link_type, verbose=True)

    def _get_page(self, title: str) -> list:
        """Split the summary of the page named ``title`` into sentences.

        Args:
            title: Title of the Wikipedia page to look up.

        Returns:
            A list of ``(title, index, sentence, 0)`` tuples, where ``index``
            is the position of the sentence within the summary. The trailing
            ``0`` is a constant whose meaning is not defined in this class
            (NOTE(review): presumably a placeholder used downstream -- confirm).
        """
        summary = self.wiki.page(title).summary
        return [
            (title, index, sentence, 0)
            for index, sentence in enumerate(sent_tokenize(summary))
        ]

    def retrieve_docs(self, claim: str) -> list:
        """Collect summary sentences for every entity linked in ``claim``.

        Args:
            claim: Text handed to the entity linker.

        Returns:
            The concatenation, in linker order, of the ``_get_page`` results
            for each linked entity that has a title. Empty when nothing is
            linked or no result carries a title.
        """
        sents = []
        # Each linker result is a 4-tuple; only the title is used here.
        for _text, _label, _kb_id, title in self.er_client.link(claim):
            # Skip results without a usable title (empty string or None)
            # instead of asking Wikipedia for a page that cannot exist.
            if not title:
                continue
            sents.extend(self._get_page(title))
        return sents
if __name__ == '__main__':
    # Manual smoke test: run retrieval on two sample claims that differ only
    # in the capitalisation of the name, and print each result.
    retriever = DocRetrieval('tagme')
    sample_claims = (
        'joe biden won the U.S. president.',
        'Joe Biden won the U.S. president.',
    )
    for sample_claim in sample_claims:
        print(retriever.retrieve_docs(sample_claim))