import logging
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, Optional

import numpy as np
import requests
from tqdm import tqdm
|
|
# Send INFO-and-above records from the root logger to a file in the
# current working directory.
logging.basicConfig(level=logging.INFO, filename='human.log')
logger = logging.getLogger(__name__)

# Endpoint queried once per entity by validate_entity().
WIKIDATA_URL = "https://www.wikidata.org/w/api.php"
# Archive loaded by the script entry point; its keys are the entity
# identifiers to classify.
ENTITY_PATH = '/data/jcherian/wikipedia_entity_map.npz'
|
|
|
|
def get_id(response: Dict) -> Optional[str]:
    """Return the single entity id contained in a ``wbgetentities`` reply.

    Args:
        response: Decoded JSON body of the API reply.

    Returns:
        The only key of ``response["entities"]``, or ``None`` when the
        reply has no ``"entities"`` entry (or it is ``None``).

    Raises:
        AssertionError: If ``"entities"`` does not hold exactly one key.
    """
    entities = response.get("entities")
    if entities is None:
        return None

    wikidata_codes = list(entities)
    # Raised explicitly (instead of via ``assert``) so the check is not
    # stripped when Python runs with -O; the exception type is unchanged.
    if len(wikidata_codes) != 1:
        raise AssertionError(
            f"expected exactly one entity, got {len(wikidata_codes)}"
        )
    return wikidata_codes[0]
|
|
|
|
def is_human(response: Dict, id: str) -> bool:
    """Tell whether entity ``id`` has a P31 claim whose value is ``Q5``.

    Args:
        response: Decoded JSON body of a ``wbgetentities`` reply.
        id: Key of the entity to inspect inside ``response["entities"]``.

    Returns:
        ``True`` if any P31 claim's main snak points at ``Q5``; ``False``
        otherwise, including when the entity has no P31 claims at all.

    Raises:
        KeyError: If the entity or its ``"claims"`` mapping is missing.
    """
    instances = response['entities'][id]['claims'].get('P31', [])
    for inst in instances:
        # A snak without a concrete value ("somevalue"/"novalue") carries
        # no "datavalue"; skip it instead of aborting the whole scan so a
        # later Q5 claim is still found.
        value = inst.get('mainsnak', {}).get('datavalue', {}).get('value')
        if isinstance(value, dict) and value.get('id') == 'Q5':
            return True
    return False
|
|
def validate_entity(k):
    """Query Wikidata for one entity and report whether it is a human.

    Args:
        k: Entity key; only the text after the last ``/`` is used, as the
            ``enwiki`` page title sent to the API.

    Returns:
        A ``(name, human)`` tuple, where ``name`` is the extracted title
        and ``human`` is the result of ``is_human``. ``human`` is ``False``
        when the reply is not JSON, contains no entities, or lacks the
        claim structure ``is_human`` expects.
    """
    name = k.split('/')[-1]
    query = {
        "action": "wbgetentities",
        "sites": "enwiki",
        "titles": name,
        "normalize": "1",
        "languages": "en",
        "format": "json",
        "props": "claims",
    }

    adapter = requests.adapters.HTTPAdapter(max_retries=10)
    with requests.Session() as s:
        s.mount("https://", adapter)
        response = s.get(url=WIKIDATA_URL, params=query)

    try:
        payload = response.json()
    except ValueError:
        # Body was not JSON. Previously execution fell through and crashed
        # in get_id() on the raw Response object; treat the entity as
        # unresolved instead and keep the body for diagnosis.
        logger.warning("Non-JSON response for %s: %s", name, response.text)
        return name, False

    wiki_id = get_id(payload)
    if wiki_id is None:
        return name, False

    try:
        human = is_human(payload, wiki_id)
    except (KeyError, TypeError):
        # Entity record without the expected claim structure.
        return name, False

    logger.info(f"{name}, {human}")
    return name, human
|
|
|
|
if __name__ == "__main__":
    import pickle

    wiki_entities = np.load(ENTITY_PATH)
    entity_names = list(wiki_entities.keys())

    # Results are appended one by one so that whatever was gathered before
    # a failure or Ctrl-C can still be saved. The earlier version only
    # bound `res` after the full run, so its except handler raised
    # NameError and saved nothing.
    res = []
    try:
        with ThreadPoolExecutor(max_workers=5) as executor:
            outcomes = executor.map(validate_entity, entity_names)
            for outcome in tqdm(outcomes, total=len(entity_names)):
                res.append(outcome)
    except (Exception, KeyboardInterrupt):
        # Best effort: record the failure, then fall through to save the
        # partial results and open the interactive shell.
        logger.exception(
            "Validation stopped after %d of %d entities; saving partial results",
            len(res),
            len(entity_names),
        )

    with open('human.pkl', 'wb') as fp:
        pickle.dump(res, fp)

    # Interactive shell for inspecting `res` and `wiki_entities`.
    import IPython
    IPython.embed()
|
|