egerber1
committed on
Commit 8b513d0 • 0 Parent(s):
initial commit
- .gitignore +5 -0
- README.md +81 -0
- downloadKnowledgeBase.sh +5 -0
- requirements.txt +1 -0
- setup.py +48 -0
- spacyEntityLinker/DatabaseConnection.py +191 -0
- spacyEntityLinker/EntityCandidates.py +22 -0
- spacyEntityLinker/EntityClassifier.py +49 -0
- spacyEntityLinker/EntityCollection.py +65 -0
- spacyEntityLinker/EntityElement.py +124 -0
- spacyEntityLinker/EntityLinker.py +30 -0
- spacyEntityLinker/TermCandidate.py +38 -0
- spacyEntityLinker/TermCandidateExtractor.py +52 -0
- spacyEntityLinker/__init__.py +4 -0
- spacyEntityLinker/__main__.py +28 -0
- tests/test_EntityLinker.py +24 -0
- tests/test_TermCandidateExtractor.py +9 -0
.gitignore
ADDED
@@ -0,0 +1,5 @@
data/*
.idea
*.log
.ipynb_checkpoints
data_spacy_entity_linker
README.md
ADDED
@@ -0,0 +1,81 @@
# Spacy Entity Linker

## Introduction

Spacy Entity Linker is a pipeline for spaCy that performs Linked Entity Extraction with Wikidata on
a given Document.
The Entity Linking System operates by matching potential candidates from each sentence
(subject, object, prepositional phrase, compounds, etc.) to aliases
from Wikidata. The package makes it easy to find the category behind each entity (e.g. "banana" is of type "food" OR "Microsoft" is of type "company"). It
is therefore useful for information extraction and labeling tasks.

The package was written before a working Linked Entity Solution existed inside spaCy. In comparison to spaCy's linked entity system, it has the following advantages:
- no extensive training required (string matching is done on a database)
- knowledge base can be dynamically updated without retraining
- entity categories can be easily resolved
- grouping entities by category

It also comes with a number of disadvantages:
- it is slower than the spaCy implementation due to the use of a database for finding entities
- no context sensitivity due to the implementation of the "max-prior method" for entity disambiguation


## Use
```python
import spacy
from spacyEntityLinker import EntityLinker

# initialize Entity Linker
entityLinker = EntityLinker()

# initialize language model
nlp = spacy.load("en_core_web_sm")

# add pipeline component
nlp.add_pipe(entityLinker, last=True, name="entityLinker")

doc = nlp("I watched the Pirates of the Caribbean last silvester")


# returns all entities in the whole document
all_linked_entities = doc._.linkedEntities
# iterates over sentences and prints linked entities
for sent in doc.sents:
    sent._.linkedEntities.pretty_print()

'''
https://www.wikidata.org/wiki/Q194318     194318     Pirates of the Caribbean      Series of fantasy adventure films
https://www.wikidata.org/wiki/Q12525597   12525597   Silvester                     the day celebrated on 31 December (Roman Catholic Church) or 2 January (Eastern Orthodox Churches)

'''
```
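Each entry in `doc._.linkedEntities` is an `EntityElement` backed by the knowledge base. A short sketch of the accessors this commit defines, continuing from the snippet above:

```python
# inspect individual linked entities via the EntityElement accessors
for entity in doc._.linkedEntities:
    print(entity.get_id())           # numeric Wikidata item id (e.g. 194318 for Q194318)
    print(entity.get_label())        # English label from the knowledge base
    print(entity.get_description())  # short description, or "" if none is stored
    print(entity.get_span())         # the spaCy span the entity was linked from

# group all entities of the document by their Wikidata categories
doc._.linkedEntities.print_categories(max_depth=1)
```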

## Example
In the following example we will use SpacyEntityLinker to extract all linked entities from a document.


### Entity Linking Policy
Currently the only method for choosing an entity among different possible matches (e.g. Paris - city vs. Paris - first name) is max-prior. This method achieves around 70% accuracy in predicting
the correct entities behind link descriptions on Wikipedia.
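Under the hood the prior is the page-view count stored for each entity in the knowledge base, so max-prior reduces to a single argmax over the candidates. A minimal sketch of that selection step, mirroring `EntityClassifier._select_max_prior` from this commit:

```python
import numpy as np

def select_max_prior(entities):
    # pick the candidate whose knowledge-base prior (page views) is highest
    priors = [entity.get_prior() for entity in entities]
    return entities[np.argmax(priors)]
```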

## Note
The Entity Linker in its current state is still experimental and should not be used in production.

## Performance
The current implementation supports only SQLite. This is advantageous for development because
it does not require any special setup and configuration. However, for more performance-critical use cases, a different
database with in-memory access (e.g. Redis) should be used. This may be implemented in the future.

## Installation

To install the package run: <code>pip install spacy-entity-linker</code>

Afterwards, the knowledge base (Wikidata) must be downloaded. This can be done by calling

<code>python -m spacyEntityLinker download_knowledge_base</code>

This will download and extract a ~500 MB file that contains a preprocessed version of Wikidata.

## TODO
- [ ] implement Entity Classifier based on sentence embeddings for improved accuracy
- [ ] implement get_picture_urls() on EntityElement
- [ ] retrieve statements for each EntityElement (inlinks + outlinks)
downloadKnowledgeBase.sh
ADDED
@@ -0,0 +1,5 @@
#!/bin/bash

wget "https://wikidatafiles.nyc3.digitaloceanspaces.com/Hosting/Hosting/SpacyEntityLinker/datafiles.tar.gz" -O /tmp/knowledge_base.tar.gz
mkdir -p ./data_spacy_entity_linker && tar -xzf /tmp/knowledge_base.tar.gz --directory ./data_spacy_entity_linker
rm /tmp/knowledge_base.tar.gz
requirements.txt
ADDED
@@ -0,0 +1 @@
spacy>=2.1.9
setup.py
ADDED
@@ -0,0 +1,48 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from spacyEntityLinker import __version__
import os

try:
    from setuptools import setup
except ImportError:
    from distutils.core import setup


def open_file(fname):
    return open(os.path.join(os.path.dirname(__file__), fname))


with open("README.md", "r") as fh:
    long_description = fh.read()

setup(
    name='spacy-entity-linker',
    version=__version__,
    author='Emanuel Gerber',
    author_email='emanuel.j.gerber@gmail.com',
    packages=['spacyEntityLinker'],
    url='https://github.com/egerber/spacy-entity-linker',
    license="MIT",
    classifiers=["Environment :: Console",
                 "Intended Audience :: Developers",
                 "Intended Audience :: Science/Research",
                 "License :: OSI Approved :: MIT License",
                 "Operating System :: POSIX :: Linux",
                 "Programming Language :: Cython",
                 "Programming Language :: Python",
                 "Programming Language :: Python :: 2",
                 "Programming Language :: Python :: 2.7",
                 "Programming Language :: Python :: 3",
                 "Programming Language :: Python :: 3.4"
                 ],
    description='Linked Entity Pipeline for spaCy',
    long_description=long_description,
    long_description_content_type="text/markdown",
    zip_safe=True,
)
spacyEntityLinker/DatabaseConnection.py
ADDED
@@ -0,0 +1,191 @@
import sqlite3
import os

MAX_DEPTH_CHAIN = 10
P_INSTANCE_OF = 31
P_SUBCLASS = 279

MAX_ITEMS_CACHE = 100000

conn = None
entity_cache = {}
chain_cache = {}

DB_DEFAULT_PATH = os.path.abspath('../data_spacy_entity_linker/wikidb_filtered.db')

wikidata_instance = None


def get_wikidata_instance():
    global wikidata_instance

    if wikidata_instance is None:
        wikidata_instance = WikidataQueryController()

    return wikidata_instance


class WikidataQueryController:

    def __init__(self):
        self.conn = None

        self.cache = {
            "entity": {},
            "chain": {},
            "name": {}
        }

        self.init_database_connection()

    def _get_cached_value(self, cache_type, key):
        return self.cache[cache_type][key]

    def _is_cached(self, cache_type, key):
        return key in self.cache[cache_type]

    def _add_to_cache(self, cache_type, key, value):
        if len(self.cache[cache_type]) < MAX_ITEMS_CACHE:
            self.cache[cache_type][key] = value

    def init_database_connection(self, path=DB_DEFAULT_PATH):
        self.conn = sqlite3.connect(path)

    def clear_cache(self):
        self.cache["entity"].clear()
        self.cache["chain"].clear()
        self.cache["name"].clear()

    def get_entities_from_alias(self, alias):
        c = self.conn.cursor()
        if self._is_cached("entity", alias):
            return self._get_cached_value("entity", alias).copy()

        query_alias = """SELECT j.item_id,j.en_label, j.en_description,j.views,j.inlinks,a.en_alias from aliases as a
        LEFT JOIN joined as j ON a.item_id = j.item_id
        WHERE a.en_alias_lowercase = ? and j.item_id NOT NULL"""

        c.execute(query_alias, [alias.lower()])
        fetched_rows = c.fetchall()

        self._add_to_cache("entity", alias, fetched_rows)
        return fetched_rows

    def get_instances_of(self, item_id, properties=[P_INSTANCE_OF, P_SUBCLASS], count=1000):
        query = "SELECT source_item_id from statements where target_item_id={} and edge_property_id IN ({}) LIMIT {}".format(
            item_id, ",".join([str(prop) for prop in properties]), count)

        c = self.conn.cursor()
        c.execute(query)

        res = c.fetchall()

        return [e[0] for e in res]

    def get_entity_name(self, item_id):
        if self._is_cached("name", item_id):
            return self._get_cached_value("name", item_id)

        c = self.conn.cursor()
        query = "SELECT en_label from joined WHERE item_id=?"
        c.execute(query, [item_id])
        res = c.fetchone()

        # cache the label (or a placeholder when no label is stored)
        if res and len(res):
            if res[0] is None:
                self._add_to_cache("name", item_id, 'no label')
            else:
                self._add_to_cache("name", item_id, res[0])
        else:
            self._add_to_cache("name", item_id, '<none>')

        return self._get_cached_value("name", item_id)

    def get_entity(self, item_id):
        c = self.conn.cursor()
        query = "SELECT j.item_id,j.en_label,j.en_description,j.views,j.inlinks from joined as j " \
                "WHERE j.item_id=={}".format(item_id)

        res = c.execute(query)

        return res.fetchone()

    def get_children(self, item_id, limit=100):
        c = self.conn.cursor()
        query = "SELECT j.item_id,j.en_label,j.en_description,j.views,j.inlinks from joined as j " \
                "JOIN statements as s on j.item_id=s.source_item_id " \
                "WHERE s.target_item_id={} and s.edge_property_id IN (279,31) LIMIT {}".format(item_id, limit)

        res = c.execute(query)

        return res.fetchall()

    def get_parents(self, item_id, limit=100):
        c = self.conn.cursor()
        query = "SELECT j.item_id,j.en_label,j.en_description,j.views,j.inlinks from joined as j " \
                "JOIN statements as s on j.item_id=s.target_item_id " \
                "WHERE s.source_item_id={} and s.edge_property_id IN (279,31) LIMIT {}".format(item_id, limit)

        res = c.execute(query)

        return res.fetchall()

    def get_categories(self, item_id, max_depth=10):
        chain = []
        edges = []
        self._append_chain_elements(item_id, 0, chain, edges, max_depth, [P_INSTANCE_OF, P_SUBCLASS])
        return [el[0] for el in chain]

    def get_chain(self, item_id, max_depth=10, property=P_INSTANCE_OF):
        chain = []
        edges = []
        self._append_chain_elements(item_id, 0, chain, edges, max_depth, property)
        return chain

    def get_recursive_edges(self, item_id):
        chain = []
        edges = []
        self._append_chain_elements(item_id, 0, chain, edges)
        return edges

    def _append_chain_elements(self, item_id, level=0, chain=None, edges=None, max_depth=10, property=P_INSTANCE_OF):
        # avoid mutable default arguments shared across calls
        if chain is None:
            chain = []
        if edges is None:
            edges = []

        properties = property
        if type(property) != list:
            properties = [property]

        if self._is_cached("chain", (item_id, max_depth)):
            chain += self._get_cached_value("chain", (item_id, max_depth)).copy()
            return

        # prevent infinite recursion
        if level >= max_depth:
            return

        c = self.conn.cursor()

        query = "SELECT target_item_id,edge_property_id from statements where source_item_id={} and edge_property_id IN ({})".format(
            item_id, ",".join([str(prop) for prop in properties]))

        # set value for current item in order to prevent infinite recursion
        self._add_to_cache("chain", (item_id, max_depth), [])

        for target_item in c.execute(query):

            chain_ids = [el[0] for el in chain]

            if not (target_item[0] in chain_ids):
                chain += [(target_item[0], level + 1)]
                edges.append((item_id, target_item[0], target_item[1]))
                self._append_chain_elements(target_item[0], level=level + 1, chain=chain, edges=edges,
                                            max_depth=max_depth,
                                            property=property)

        self._add_to_cache("chain", (item_id, max_depth), chain)


if __name__ == '__main__':
    queryInstance = WikidataQueryController()

    queryInstance.init_database_connection()
    print(queryInstance.get_categories(13191, max_depth=1))
    print(queryInstance.get_categories(13191, max_depth=1))
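The queries above imply a SQLite schema with three tables: aliases (item_id, en_alias, en_alias_lowercase), joined (item_id, en_label, en_description, views, inlinks), and statements (source_item_id, target_item_id, edge_property_id). A minimal sketch of querying the controller directly, assuming the knowledge base has been downloaded and the relative DB_DEFAULT_PATH resolves from the current working directory:

```python
from spacyEntityLinker.DatabaseConnection import WikidataQueryController

controller = WikidataQueryController()

# each row is (item_id, en_label, en_description, views, inlinks, en_alias)
for row in controller.get_entities_from_alias("Paris"):
    print(row[0], row[1], row[2])

# resolve the English label of a Wikidata item id (as in the module's __main__ block)
print(controller.get_entity_name(13191))
```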
spacyEntityLinker/EntityCandidates.py
ADDED
@@ -0,0 +1,22 @@
class EntityCandidates:

    def __init__(self, entity_elements):
        self.entity_elements = entity_elements

    def __iter__(self):
        for entity in self.entity_elements:
            yield entity

    def __len__(self):
        return len(self.entity_elements)

    def __getitem__(self, item):
        return self.entity_elements[item]

    def pretty_print(self):
        for entity in self.entity_elements:
            entity.pretty_print()

    def __str__(self):
        return str(["entity {}: {} (<{}>)".format(i, entity.get_label(), entity.get_description()) for i, entity in
                    enumerate(self.entity_elements)])
spacyEntityLinker/EntityClassifier.py
ADDED
@@ -0,0 +1,49 @@
from itertools import groupby
import numpy as np


class EntityClassifier:
    def __init__(self):
        pass

    def _get_grouped_by_length(self, entities):
        # group candidate entities by the token length of their span
        sorted_by_len = sorted(entities, key=lambda entity: len(entity.get_span()), reverse=True)

        entities_by_length = {}
        for length, group in groupby(sorted_by_len, lambda entity: len(entity.get_span())):
            entities_by_length[length] = list(group)

        return entities_by_length

    def _filter_max_length(self, entities):
        # keep only the candidates with the longest span
        entities_by_length = self._get_grouped_by_length(entities)
        max_length = max(list(entities_by_length.keys()))

        return entities_by_length[max_length]

    def _select_max_prior(self, entities):
        # pick the candidate with the highest prior (page views)
        priors = [entity.get_prior() for entity in entities]
        return entities[np.argmax(priors)]

    def _get_casing_difference(self, word1, original):
        # count character positions where the matched text and the alias differ
        difference = 0
        for w1, w2 in zip(word1, original):
            if w1 != w2:
                difference += 1

        return difference

    def _filter_most_similar(self, entities):
        # keep the candidates whose alias differs least from the original span text
        similarities = np.array(
            [self._get_casing_difference(entity.get_span().text, entity.get_original_alias()) for entity in entities])

        min_indices = np.where(similarities == similarities.min())[0].tolist()

        return [entities[i] for i in min_indices]

    def __call__(self, entities):
        filtered_by_length = self._filter_max_length(entities)
        filtered_by_casing = self._filter_most_similar(filtered_by_length)

        return self._select_max_prior(filtered_by_casing)
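The classifier is a three-stage cascade: keep the candidates with the longest span, then those whose casing best matches the original alias, then the single one with the highest prior. A sketch of driving it by hand, mirroring what EntityLinker.__call__ does (assumes the knowledge base is downloaded):

```python
import spacy
from spacyEntityLinker.EntityClassifier import EntityClassifier
from spacyEntityLinker.TermCandidateExtractor import TermCandidateExtractor

nlp = spacy.load("en_core_web_sm")
doc = nlp("Paris is the capital of France")

classifier = EntityClassifier()

# for each term candidate, resolve the best entity:
# longest span -> closest casing to the alias -> highest prior
for term_candidate in TermCandidateExtractor(doc):
    entity_candidates = term_candidate.get_entity_candidates()
    if len(entity_candidates) > 0:
        entity = classifier(entity_candidates)
        print(entity.pretty_string(description=True))
```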
spacyEntityLinker/EntityCollection.py
ADDED
@@ -0,0 +1,65 @@
from collections import Counter, defaultdict
from spacyEntityLinker.DatabaseConnection import get_wikidata_instance


class EntityCollection:

    def __init__(self, entities=None):
        # avoid a shared mutable default list across instances
        self.entities = entities if entities is not None else []

    def __iter__(self):
        for entity in self.entities:
            yield entity

    def __getitem__(self, item):
        return self.entities[item]

    def __len__(self):
        return len(self.entities)

    def append(self, entity):
        self.entities.append(entity)

    def get_categories(self, max_depth=1):
        categories = []
        for entity in self.entities:
            categories += entity.get_categories(max_depth)

        return categories

    def print_categories(self, max_depth=1, limit=10):
        wikidataInstance = get_wikidata_instance()

        all_categories = []
        category_to_entities = defaultdict(list)

        for e in self.entities:
            for category in e.get_categories(max_depth):
                category_to_entities[category].append(e)
                all_categories.append(category)

        counter = Counter()
        counter.update(all_categories)

        for category, frequency in counter.most_common(limit):
            print("{} ({}) : {}".format(wikidataInstance.get_entity_name(category), frequency,
                                        ','.join([str(e) for e in category_to_entities[category]])))

    def pretty_print(self):
        for entity in self.entities:
            entity.pretty_print()

    def grouped_by_category(self, max_depth=1):
        counter = Counter()
        counter.update(self.get_categories(max_depth))

        return counter

    def get_distinct_categories(self, max_depth=1):
        return list(set(self.get_categories(max_depth)))

    def most_frequent_categories(self):
        pass

    def get_most_significant_categories(self, priors):
        pass
spacyEntityLinker/EntityElement.py
ADDED
@@ -0,0 +1,124 @@
from spacyEntityLinker.DatabaseConnection import get_wikidata_instance


class EntityElement:
    def __init__(self, row, span):
        self.identifier = row[0]
        self.prior = 0
        self.original_alias = None
        self.in_degree = None
        self.label = None
        self.description = None

        if len(row) > 1:
            self.label = row[1]
        if len(row) > 2:
            self.description = row[2]
        if len(row) > 3 and row[3]:
            self.prior = row[3]
        if len(row) > 4 and row[4]:
            self.in_degree = row[4]
        if len(row) > 5 and row[5]:
            self.original_alias = row[5]

        self.span = span

        self.chain = None
        self.chain_ids = None

        self.wikidata_instance = get_wikidata_instance()

    def get_in_degree(self):
        return self.in_degree

    def get_original_alias(self):
        return self.original_alias

    def is_singleton(self):
        return len(self.get_chain()) == 0

    def get_span(self):
        return self.span

    def get_label(self):
        return self.label

    def get_id(self):
        return self.identifier

    def get_prior(self):
        return self.prior

    def get_chain(self, max_depth=10):
        if self.chain is None:
            self.chain = self.wikidata_instance.get_chain(self.identifier, max_depth=max_depth, property=31)
        return self.chain

    def is_category(self):
        pass

    def is_leaf(self):
        pass

    def get_categories(self, max_depth=10):
        return self.wikidata_instance.get_categories(self.identifier, max_depth=max_depth)

    def get_children(self, limit=10):
        return [EntityElement(row, None) for row in self.wikidata_instance.get_children(self.get_id(), limit)]

    def get_parents(self, limit=10):
        return [EntityElement(row, None) for row in self.wikidata_instance.get_parents(self.get_id(), limit)]

    def get_subclass_hierarchy(self):
        chain = self.wikidata_instance.get_chain(self.identifier, max_depth=5, property=279)
        return [self.wikidata_instance.get_entity_name(el[0]) for el in chain]

    def get_instance_of_hierarchy(self):
        chain = self.wikidata_instance.get_chain(self.identifier, max_depth=5, property=31)
        return [self.wikidata_instance.get_entity_name(el[0]) for el in chain]

    def get_chain_ids(self, max_depth=10):
        if self.chain_ids is None:
            self.chain_ids = set([el[0] for el in self.get_chain(max_depth=max_depth)])

        return self.chain_ids

    def get_description(self):
        if self.description:
            return self.description
        else:
            return ""

    def is_intersecting(self, other_element):
        return len(self.get_chain_ids().intersection(other_element.get_chain_ids())) > 0

    def serialize(self):
        return {
            "id": self.get_id(),
            "label": self.get_label(),
            "span": self.get_span()
        }

    def pretty_print(self):
        print(
            "https://www.wikidata.org/wiki/Q{0:<10} {1:<10} {2:<30} {3:<100}".format(self.get_id(),
                                                                                     self.get_id(),
                                                                                     self.get_label(),
                                                                                     self.get_description()[:100]))

    def pretty_string(self, description=False):
        if description:
            return ','.join([span.text for span in self.span]) + " => {} <{}>".format(self.get_label(),
                                                                                      self.get_description())
        else:
            return ','.join([span.text for span in self.span]) + " => {}".format(self.get_label())

    def save(self, category):
        for span in self.span:
            span.sent._.linked_entities.append(
                {"id": self.identifier, "range": [span.start, span.end + 1], "category": category})

    def __str__(self):
        label = self.get_label()
        if label:
            return label
        else:
            return ""
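Besides labels and descriptions, every EntityElement can walk the Wikidata graph through "instance of" (property 31) and "subclass of" (property 279) statements. A short sketch, assuming `doc` was processed by the pipeline shown in the README:

```python
# take the first linked entity of the document and explore its hierarchy
entity = doc._.linkedEntities[0]

print(entity.get_subclass_hierarchy())     # labels along the "subclass of" (P279) chain
print(entity.get_instance_of_hierarchy())  # labels along the "instance of" (P31) chain

for parent in entity.get_parents(limit=5):
    print(parent.get_label(), parent.get_description())
```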
spacyEntityLinker/EntityLinker.py
ADDED
@@ -0,0 +1,30 @@
from spacyEntityLinker.EntityClassifier import EntityClassifier
from spacyEntityLinker.EntityCollection import EntityCollection
from spacyEntityLinker.TermCandidateExtractor import TermCandidateExtractor
from spacy.tokens import Doc, Span


class EntityLinker:

    def __init__(self):
        Doc.set_extension("linkedEntities", default=EntityCollection(), force=True)
        Span.set_extension("linkedEntities", default=None, force=True)

    def __call__(self, doc):
        tce = TermCandidateExtractor(doc)
        classifier = EntityClassifier()

        for sent in doc.sents:
            sent._.linkedEntities = EntityCollection([])

        entities = []
        for termCandidates in tce:
            entityCandidates = termCandidates.get_entity_candidates()
            if len(entityCandidates) > 0:
                entity = classifier(entityCandidates)
                entity.span.sent._.linkedEntities.append(entity)
                entities.append(entity)

        doc._.linkedEntities = EntityCollection(entities)

        return doc
spacyEntityLinker/TermCandidate.py
ADDED
@@ -0,0 +1,38 @@
from spacyEntityLinker.EntityCandidates import EntityCandidates
from spacyEntityLinker.EntityElement import EntityElement
from spacyEntityLinker.DatabaseConnection import get_wikidata_instance


class TermCandidate:
    def __init__(self, span):
        self.variations = [span]

    def pretty_print(self):
        print("Term Candidates are [{}]".format(self))

    def append(self, span):
        self.variations.append(span)

    def has_plural(self, variation):
        return any([t.tag_ == "NNS" for t in variation])

    def get_singular(self, variation):
        return ' '.join([t.text if t.tag_ != "NNS" else t.lemma_ for t in variation])

    def __str__(self):
        return ', '.join([variation.text for variation in self.variations])

    def get_entity_candidates(self):
        wikidata_instance = get_wikidata_instance()
        entities_by_variation = {}
        for variation in self.variations:
            entities_by_variation[variation] = wikidata_instance.get_entities_from_alias(variation.text)
            if self.has_plural(variation):
                entities_by_variation[variation] += wikidata_instance.get_entities_from_alias(
                    self.get_singular(variation))

        entity_elements = []
        for variation, entities in entities_by_variation.items():
            entity_elements += [EntityElement(entity, variation) for entity in entities]

        return EntityCandidates(entity_elements)
spacyEntityLinker/TermCandidateExtractor.py
ADDED
@@ -0,0 +1,52 @@
from spacyEntityLinker.TermCandidate import TermCandidate


class TermCandidateExtractor:
    def __init__(self, doc):
        self.doc = doc

    def __iter__(self):
        for sent in self.doc.sents:
            for candidate in self._get_candidates_in_sent(sent, self.doc):
                yield candidate

    def _get_candidates_in_sent(self, sent, doc):
        root = list(filter(lambda token: token.dep_ == "ROOT", sent))[0]

        excluded_children = []
        candidates = []

        def get_candidates(node, doc):
            if node.pos_ in ["PROPN", "NOUN"]:
                term_candidates = TermCandidate(doc[node.i:node.i + 1])

                for child in node.children:

                    start_index = min(node.i, child.i)
                    end_index = max(node.i, child.i)

                    if child.dep_ == "compound" or child.dep_ == "amod":
                        subtree_tokens = list(child.subtree)
                        if all([c.dep_ == "compound" for c in subtree_tokens]):
                            start_index = min([c.i for c in subtree_tokens])
                        term_candidates.append(doc[start_index:end_index + 1])

                        if not child.dep_ == "amod":
                            term_candidates.append(doc[start_index:start_index + 1])
                        excluded_children.append(child)

                    if child.dep_ == "prep" and child.text == "of":
                        end_index = max([c.i for c in child.subtree])
                        term_candidates.append(doc[start_index:end_index + 1])

                candidates.append(term_candidates)

            for child in node.children:
                if child in excluded_children:
                    continue
                get_candidates(child, doc)

        get_candidates(root, doc)

        return candidates
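The extractor walks the dependency tree from the sentence root and, for every noun or proper noun, collects span variations: the bare token, compound/amod spans, and "of"-prepositional attachments. A quick way to see the candidates it produces:

```python
import spacy
from spacyEntityLinker.TermCandidateExtractor import TermCandidateExtractor

nlp = spacy.load("en_core_web_sm")
doc = nlp("The pirate captain of the black ship")

# each candidate bundles a head noun with its span variations
for candidate in TermCandidateExtractor(doc):
    candidate.pretty_print()
```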
spacyEntityLinker/__init__.py
ADDED
@@ -0,0 +1,4 @@
from .EntityLinker import EntityLinker

__version__ = '0.0.2'
__all__ = ["EntityLinker"]
spacyEntityLinker/__main__.py
ADDED
@@ -0,0 +1,28 @@
if __name__ == "__main__":
    import sys
    import urllib.request
    import tarfile
    import os

    if len(sys.argv) < 2:
        print("No arguments given")
        sys.exit(1)

    command = sys.argv.pop(1)

    if command == "download_knowledge_base":
        FILE_URL = "https://wikidatafiles.nyc3.digitaloceanspaces.com/Hosting/Hosting/SpacyEntityLinker/datafiles.tar.gz"

        OUTPUT_TAR_FILE = os.path.abspath(
            os.path.dirname(__file__)) + '/../data_spacy_entity_linker/wikidb_filtered.tar.gz'
        OUTPUT_DB_PATH = os.path.abspath(os.path.dirname(__file__)) + '/../data_spacy_entity_linker'
        if not os.path.exists(OUTPUT_DB_PATH):
            os.makedirs(OUTPUT_DB_PATH)
        urllib.request.urlretrieve(FILE_URL, OUTPUT_TAR_FILE)

        tar = tarfile.open(OUTPUT_TAR_FILE)
        tar.extractall(OUTPUT_DB_PATH)
        tar.close()

        os.remove(OUTPUT_TAR_FILE)
tests/test_EntityLinker.py
ADDED
@@ -0,0 +1,24 @@
import unittest
import spacy
from spacyEntityLinker.EntityLinker import EntityLinker


class TestEntityLinker(unittest.TestCase):

    def __init__(self, arg, *args, **kwargs):
        super(TestEntityLinker, self).__init__(arg, *args, **kwargs)
        self.nlp = spacy.load('en_core_web_sm')

    def test_initialization(self):
        entityLinker = EntityLinker()

        self.nlp.add_pipe(entityLinker, last=True, name="entityLinker")

        doc = self.nlp("I watched the Pirates of the Caribbean last silvester. Then I saw a snake. It was great.")

        doc._.linkedEntities.pretty_print()

        for sent in doc.sents:
            sent._.linkedEntities.pretty_print()

        self.nlp.remove_pipe("entityLinker")
tests/test_TermCandidateExtractor.py
ADDED
@@ -0,0 +1,9 @@
import unittest
import spacy
import spacyEntityLinker.TermCandidateExtractor


class TestCandidateExtractor(unittest.TestCase):

    def __init__(self, arg, *args, **kwargs):
        super(TestCandidateExtractor, self).__init__(arg, *args, **kwargs)