Atharva committed on
Commit
5bc0741
1 Parent(s): 5ae066c

pipeline update

Browse files
Files changed (2) hide show
  1. app.py +5 -4
  2. src/__init__.py +11 -14
app.py CHANGED
@@ -2,7 +2,7 @@ import pandas as pd
2
  import streamlit as st
3
  from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
4
 
5
- from src import GBRT, google_search, wikidata_search
6
 
7
  TYPE = {
8
  'LOC': ' location',
@@ -52,11 +52,12 @@ def get_candidates(mentions_tags):
52
  if (mention, tag) in cache.keys():
53
  candidates.append((mention, cache[(mention, tag)]))
54
  else:
55
- res1 = google_search(mention + TYPE[tag], limit=3)
56
- res2 = wikidata_search(mention, limit=3)
57
- cands = list(set(res1 + res2))
58
  cache[(mention, tag)] = cands
59
  candidates.append((mention, cands))
 
60
  return candidates
61
 
62
 
 
2
  import streamlit as st
3
  from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
4
 
5
+ from src import GBRT, wikipedia_search, wikidata_search
6
 
7
  TYPE = {
8
  'LOC': ' location',
 
52
  if (mention, tag) in cache.keys():
53
  candidates.append((mention, cache[(mention, tag)]))
54
  else:
55
+ cands = wikidata_search(mention, limit=3)
56
+ if cands == []:
57
+ cands = wikipedia_search(mention, limit=3)
58
  cache[(mention, tag)] = cands
59
  candidates.append((mention, cands))
60
+ print(mention, cands)
61
  return candidates
62
 
63
 
src/__init__.py CHANGED
@@ -106,22 +106,19 @@ def wikidata_search(query, limit=3):
106
  return [i for i in candidates if is_disamb_page(i) == False]
107
 
108
 
109
- def google_search(query, limit=3):
110
- service_url = "https://www.googleapis.com/customsearch/v1/siterestrict"
111
  params = {
112
- 'q': query,
113
- 'num': limit,
114
- 'start': 0,
115
- 'key': os.environ.get('APIKEY'),
116
- 'cx': os.environ.get('CESCX')
117
  }
118
- res = requests.get(service_url, params=params)
119
- try:
120
- cands = [i['title'].replace(' - Wikipedia', '') for i in res.json()["items"]]
121
- cands = [i for i in cands if is_disamb_page(i) == False]
122
- return [i.replace(' ', '_') for i in cands]
123
- except:
124
- return []
125
 
126
 
127
  def get_entity_extract(entity_title, num_sentences=0):
 
106
  return [i for i in candidates if is_disamb_page(i) == False]
107
 
108
 
109
+ def wikipedia_search(query, limit=3):
110
+ service_url = 'https://en.wikipedia.org/w/api.php'
111
  params = {
112
+ 'action': 'opensearch',
113
+ 'search': query,
114
+ 'namespace': 0,
115
+ 'limit': limit,
116
+ 'redirects': 'resolve',
117
  }
118
+
119
+ results = requests.get(service_url, params=params).json()[1]
120
+ results = [i.replace(' ', '_') for i in results if 'disambiguation' not in i.lower()]
121
+ return [i for i in results if is_disamb_page(i) == False]
 
 
 
122
 
123
 
124
  def get_entity_extract(entity_title, num_sentences=0):