Atharva committed on
Commit f74445c • 1 Parent(s): 3d3358f

initial commit

.gitignore ADDED
@@ -0,0 +1,2 @@
+ __pycache__/
+ *.py[cod]
app.py ADDED
@@ -0,0 +1,119 @@
+ import pandas as pd
+ import streamlit as st
+ import streamlit.components.v1 as components
+ from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
+
+ from src import GBRT, wikipedia_search, google_search
+
+
+ TYPE = {
+     'LOC': ' location',
+     'PER': ' person',
+     'ORG': ' organization',
+     'MISC': ''
+ }
+
+ COLOR = {
+     'LOC': '#40E0D0',
+     'PER': '#6495ED',
+     'ORG': '#CCCCFF',
+     'MISC': '#FF7F50'
+ }
+
+ # ---------------------------------------------------------------------------
+ # Loading models
+ # ---------------------------------------------------------------------------
+
+
+ @st.cache(allow_output_mutation=True, show_spinner=True)
+ def load_models():
+     # NER
+     tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
+     bert_ner = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
+     tagger = pipeline("token-classification", model=bert_ner, tokenizer=tokenizer,
+                       device=0, aggregation_strategy="average")
+     # NED
+     model = GBRT()
+     return model, tagger
+
+
+ # ---------------------------------------------------------------------------
+ # Page setup
+ # ---------------------------------------------------------------------------
+ st.set_page_config(layout="wide", page_title='Named Entity Disambiguation')
+ st.write("## Named Entity Disambiguation")
+ col1, col2 = st.columns(2)
+
+
+ # ---------------------------------------------------------------------------
+ # Candidate Generation
+ # ---------------------------------------------------------------------------
+ def get_candidates(mentions_tags):
+     candidates = []
+     cache = {}
+     for mention, tag in mentions_tags:
+         if (mention, tag) in cache:
+             candidates.append((mention, cache[(mention, tag)]))
+         else:
+             res1 = google_search(mention + TYPE[tag])
+             res2 = wikipedia_search(mention, limit=10)
+             cands = list(set(res1 + res2))
+             cache[(mention, tag)] = cands
+             candidates.append((mention, cands))
+     return candidates
+
+
+ # ---------------------------------------------------------------------------
+ # Rendering Setup
+ # ---------------------------------------------------------------------------
+ def display_tag(text, typ, label):
+     if label != 'NIL':
+         label = "https://en.wikipedia.org/wiki/" + label
+     return f"""
+     <a style="margin: 0 5px; padding: 2px 4px; border-radius: 4px; text-decoration:none;
+         background-color:{COLOR[typ]}; color: white; cursor:pointer" href="{label}" target="_blank">
+         <span style="margin-right:3px">{text}</span>
+         <span style="border: 1px solid white; padding: 2px;">{typ}</span>
+     </a>"""
+
+
+ # ---------------------------------------------------------------------------
+ # Full Pipeline
+ # ---------------------------------------------------------------------------
+ def main(text):
+     ner_results = tagger(text)
+     tagged, last_pos = '', 0
+
+     with st.spinner('Generating Candidates'):
+         mentions_cands = get_candidates([(res['word'], res['entity_group']) for res in ner_results])
+
+     with st.spinner('Disambiguating Mentions'):
+         predictions = model.link(mentions_cands, text)
+
+     with st.spinner('Rendering Results'):
+         for i, res in enumerate(ner_results):
+             tag = display_tag(res['word'], res['entity_group'], predictions[i][1])
+             tagged += text[last_pos:res['start']] + tag
+             last_pos = res['end']
+         tagged += text[last_pos:]
+
+     with col2:
+         st.write("### Disambiguated Text")
+         components.html(f'<p style="line-height: 1.8; margin-top:30px; font-family: sans-serif">{tagged}</p>',
+                         scrolling=True, height=500)
+
+         df = pd.DataFrame(data=predictions, columns=['Mention', 'Prediction', 'Confidence'])
+         st.write("**Additional Information**")
+         st.dataframe(df)
+
+
+ if __name__ == '__main__':
+     model, tagger = load_models()
+     with col1:
+         st.write("### Input Text")
+         user_input = st.text_area('Press Ctrl + Enter to update results',
+                                   'George Washington went to Washington.', height=350)
+     if user_input:
+         main(user_input)
+
+
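For reference, a short sketch (not part of this commit) of the tagger output that main() consumes: with aggregation_strategy="average", the transformers token-classification pipeline returns one dict per detected span; get_candidates() only uses the word and entity_group fields, while the rendering loop uses start and end. The values below are illustrative, not real model output.

# Illustrative only: the shape of ner_results that app.py's main() iterates over.
ner_results = [
    {'entity_group': 'PER', 'score': 0.99, 'word': 'George Washington', 'start': 0, 'end': 17},
    {'entity_group': 'LOC', 'score': 0.99, 'word': 'Washington', 'start': 26, 'end': 36},
]
# get_candidates() receives (word, entity_group) pairs and appends TYPE[tag] to the Google query.
mentions_tags = [(res['word'], res['entity_group']) for res in ner_results]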
data/entity_anchors.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c7d51bcbca1f5f4486ef73cc26867ee723d7e62a1d707da24b9e2017657d15fe
+ size 1191130865
data/entity_prior.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:469c177f0e6236d1e3dad0d1efe907dcb4c8004acaf7451a17b18754b5cfbcd7
+ size 525736013
data/model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:675668d7f25be41a7b388061081cf6fe7c04f344bb866561718f57c3a2fbc6a5
+ size 21047161
data/wiki2vec_w10_100d.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:da0a561df04532687acd4f018de60aa5bcdefa57a75bbfd871f7ed7b72f06b76
+ size 3858917918
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ nltk
+ numpy
+ pandas
+ requests
+ streamlit
+ torch
+ transformers
+ wikipedia2vec
src/__init__.py ADDED
@@ -0,0 +1,208 @@
+ # ---------------------------------------------------------------------------
+ # IMPORTS
+ # ---------------------------------------------------------------------------
+ import os
+ import pickle
+
+ import nltk
+ import numpy as np
+ import requests
+ from nltk import edit_distance, pos_tag
+ from nltk.tokenize import word_tokenize
+ from wikipedia2vec import Wikipedia2Vec
+
+ from src.stopwords import STOP_WORDS
+
+ # ---------------------------------------------------------------------------
+ # SETUP AND HELPER FUNCTIONS
+ # ---------------------------------------------------------------------------
+ nltk.download('punkt')  # needed by word_tokenize
+ nltk.download('averaged_perceptron_tagger')
+
+ DATADIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'data')
+ with open(os.path.join(DATADIR, 'entity_anchors.bin'), 'rb') as f:
+     prior_prob = pickle.load(f)
+ with open(os.path.join(DATADIR, 'entity_prior.bin'), 'rb') as f:
+     entity_prior = pickle.load(f)
+
+
+ def get_edit_dist(x, y):
+     return edit_distance(x, y)
+
+
+ def get_entity_prior(entity):
+     try:
+         return entity_prior[entity.replace('_', ' ')]
+     except KeyError:
+         return 0
+
+
+ def get_prior_prob(entity, mention):
+     try:
+         entity = entity.replace('_', ' ')
+         mention = mention.lower()
+         return prior_prob[mention][entity] / sum(prior_prob[mention].values())
+     except (KeyError, ZeroDivisionError):
+         return 0
+
+
+ def get_max_prior_prob(mentions, candidates):
+     max_prob = {i: max([get_prior_prob(i, j) for j in mentions])
+                 for i in candidates}
+     return max_prob
+
+
+ def cosine_similarity(v1, v2):
+     v1v2 = np.linalg.norm(v1) * np.linalg.norm(v2)
+     if v1v2 == 0:
+         return 0
+     else:
+         return np.dot(v2, v1) / v1v2
+
+
+ def wikipedia_search(query, limit=20):
+     service_url = 'https://en.wikipedia.org/w/api.php'
+     params = {
+         'action': 'opensearch',
+         'search': query,
+         'namespace': 0,
+         'limit': limit,
+         'redirects': 'resolve',
+     }
+
+     results = requests.get(service_url, params=params).json()[1]
+     results = [i.replace(' ', '_')
+                for i in results if 'disambiguation' not in i.lower()]
+     return results
+
+
+ def google_search(query, limit=10):
+     service_url = "https://www.googleapis.com/customsearch/v1/siterestrict"
+     params = {
+         'q': query,
+         'num': limit,
+         'start': 0,
+         'key': os.environ.get('APIKEY'),
+         'cx': os.environ.get('CESCX')
+     }
+     res = requests.get(service_url, params=params)
+     try:
+         cands = [i['title'].replace(' - Wikipedia', '') for i in res.json()["items"]]
+         return [i.replace(' ', '_') for i in cands]
+     except (KeyError, ValueError):
+         return []
+
+ # ---------------------------------------------------------------------------
+ # NED SYSTEMS
+ # ---------------------------------------------------------------------------
+
+ ### Base Model ###
+
+
+ class Base:
+     def __init__(self):
+         self.emb = Wikipedia2Vec.load(os.path.join(DATADIR, 'wiki2vec_w10_100d.bin'))
+         self.stop_words = STOP_WORDS
+         self.tokenizer = word_tokenize
+         self.nouns_only = True
+         self.vector_size = self.emb.train_params['dim_size']
+
+     def get_nouns(self, tokens):
+         nouns = []
+         for word, pos in pos_tag(tokens):
+             if pos in ('NN', 'NNP', 'NNS', 'NNPS'):
+                 nouns.extend(word.split(' '))
+         return list(set(nouns))
+
+     def filter(self, tokens):
+         tokens = list(set(tokens))
+         tokens = [w for w in tokens if w.lower() not in self.stop_words]
+         tokens = [w for w in tokens if w.isalnum()]
+         return self.get_nouns(tokens) if self.nouns_only else tokens
+
+     def encode_entity(self, entity):
+         entity = entity.replace('_', ' ')
+         if self.emb.get_entity(entity) is not None:
+             return self.emb.get_entity_vector(entity)
+         else:
+             return np.zeros(self.vector_size)
+
+     def encode_sentence(self, s):
+         words = self.filter(self.tokenizer(s.lower()))
+         emb, n = np.zeros(self.vector_size), 1
+         for w in words:
+             try:
+                 emb += self.emb.get_word_vector(w)
+                 n += 1
+             except KeyError:
+                 pass
+
+         return emb/n
+
+
+ ### Advanced Model ###
+ class GBRT(Base):
+     def __init__(self):
+         super().__init__()
+         with open(os.path.join(DATADIR, 'model.bin'), 'rb') as f:
+             self.model = pickle.load(f)
+
+     def encode_context_entities(self, context_entities):
+         emb, n = np.zeros(self.vector_size), 1
+         for i in context_entities:
+             emb += self.encode_entity(i)
+             n += 1
+         return emb/n
+
+     def link(self, mentions_cands, context):
+         n_features = self.model.n_features_in_
+
+         # Calculate max prior probability of all candidates.
+         mentions = set([i for i, _ in mentions_cands])
+         candidates = set([i for _, j in mentions_cands for i in j])
+         max_prob = get_max_prior_prob(mentions, candidates)
+
+         # Find unambiguous entities
+         unamb_entities = [x for i, j in mentions_cands for x in j if get_prior_prob(x, i) > 0.95]
+         context_ent_emb = self.encode_context_entities(unamb_entities)
+
+         # Make predictions
+         context_emb = self.encode_sentence(context)
+         predictions = []
+         for mention, candidates in mentions_cands:
+             # Generate feature values
+             num_cands = len(candidates)
+             X = []
+             for candidate in candidates:
+                 cand = candidate.replace('_', ' ').lower()
+                 ment = mention.lower()
+                 cand_emb = self.encode_entity(candidate)
+
+                 X.append([
+                     candidate,
+                     get_prior_prob(candidate, mention),
+                     get_entity_prior(candidate),
+                     max_prob[candidate],
+                     num_cands,
+                     get_edit_dist(ment, cand),
+                     int(ment == cand),
+                     int(ment in cand),
+                     int(cand.startswith(ment) or cand.endswith(ment)),
+                     cosine_similarity(cand_emb, context_emb),
+                     cosine_similarity(cand_emb, context_ent_emb)
+                 ])
+
+             # Add rank
+             X.sort(key=lambda x: x[-1] + x[-2], reverse=True)
+             X = [j + [i + 1] for i, j in enumerate(X)]
+
+             # Predict
+             pred, conf = 'NIL', 0
+             for i in X:
+                 c = self.model.predict(np.array([i[1:]]))[0]
+                 if c > conf:
+                     pred = i[0]
+                     conf = c
+             predictions.append([mention, pred, conf])
+
+         return predictions
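For context, a minimal usage sketch of the helpers above (not part of the commit). It assumes the Git LFS files under data/ have been pulled and, for google_search, that the APIKEY and CESCX environment variables are set; without them google_search returns [] and only the Wikipedia candidates remain.

from src import GBRT, google_search, wikipedia_search

text = "George Washington went to Washington."
mention = "George Washington"

# Candidate generation: merge Google CSE and Wikipedia opensearch titles.
candidates = list(set(google_search(mention + " person") + wikipedia_search(mention, limit=10)))

# Disambiguation: link() returns one [mention, predicted_title, confidence] triple per mention.
model = GBRT()  # loads wiki2vec_w10_100d.bin and model.bin from data/
print(model.link([(mention, candidates)], text))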
src/stopwords.py ADDED
@@ -0,0 +1,71 @@
+ # Stop words
+ STOP_WORDS = set("""
+ a about above across after afterwards again against all almost alone along
+ already also although always am among amongst amount an and another any anyhow
+ anyone anything anyway anywhere are around as at
+
+ back be became because become becomes becoming been before beforehand behind
+ being below beside besides between beyond both bottom but by
+
+ call can cannot ca could
+
+ did do does doing done down due during
+
+ each eight either eleven else elsewhere empty enough even ever every
+ everyone everything everywhere except
+
+ few fifteen fifty first five for former formerly forty four from front full
+ further
+
+ get give go
+
+ had has have he hence her here hereafter hereby herein hereupon hers herself
+ him himself his how however hundred
+
+ i if in indeed into is it its itself
+
+ keep
+
+ last latter latterly least less
+
+ just
+
+ made make many may me meanwhile might mine more moreover most mostly move much
+ must my myself
+
+ name namely neither never nevertheless next nine no nobody none noone nor not
+ nothing now nowhere
+
+ of off often on once one only onto or other others otherwise our ours ourselves
+ out over own
+
+ part per perhaps please put
+
+ quite
+
+ rather re really regarding
+
+ same say see seem seemed seeming seems serious several she should show side
+ since six sixty so some somehow someone something sometime sometimes somewhere
+ still such
+
+ take ten than that the their them themselves then thence there thereafter
+ thereby therefore therein thereupon these they third this those though three
+ through throughout thru thus to together too top toward towards twelve twenty
+ two
+
+ under until up unless upon us used using
+
+ various very very via was we well were what whatever when whence whenever where
+ whereafter whereas whereby wherein whereupon wherever whether which while
+ whither who whoever whole whom whose why will with within without would
+
+ yet you your yours yourself yourselves
+ """.split())
+
+ contractions = ["n't", "'d", "'ll", "'m", "'re", "'s", "'ve"]
+ STOP_WORDS.update(contractions)
+
+ for apostrophe in ["‘", "’"]:
+     for stopword in contractions:
+         STOP_WORDS.add(stopword.replace("'", apostrophe))
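A quick check (not part of the commit) of what the loop above adds: the contraction stop words are present with straight and curly apostrophes alike, so filter() drops them regardless of how the tokenizer renders the apostrophe.

from src.stopwords import STOP_WORDS

print("n't" in STOP_WORDS)   # expected: True (added via STOP_WORDS.update(contractions))
print("n’t" in STOP_WORDS)   # expected: True (curly-apostrophe variant added by the loop above)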