Update ✨Entity Linking Application✨.py
✨Entity Linking Application✨.py  CHANGED  (+27 -52)
@@ -1,4 +1,3 @@
-import pandas as pd
 import json
 import numpy as np
 import re
@@ -15,20 +14,18 @@ from openai import OpenAI
 import sys
 import time
 from bs4 import BeautifulSoup
-from fake_useragent import UserAgent
 import requests
-
-
+import nest_asyncio
+import httpx
+
+
+nest_asyncio.apply()
 
-headers = {
-    "User-Agent": f"{ua.random}"
-}
 
 folder_path = '/home/user/app/qids_folder'
 
 if not os.path.exists(folder_path):
     os.mkdir(folder_path)
-    print(f"folder created at {folder_path}")
 else:
     pass
 
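Note: nest_asyncio.apply() patches asyncio so that asyncio.run() and loop.run_until_complete() can be nested inside an already-running event loop; the asyncio.run(big_main(...)) call further down relies on this. (httpx is also imported here, but its call sites fall outside the hunks shown.) A minimal standalone sketch of the failure mode it works around, not taken from this file:

    import asyncio
    import nest_asyncio

    nest_asyncio.apply()  # patch asyncio to allow nested event-loop use

    async def inner():
        return 42

    async def outer():
        # Without the apply() above, this nested call raises
        # "RuntimeError: asyncio.run() cannot be called from a running event loop".
        return asyncio.run(inner())

    print(asyncio.run(outer()))  # 42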
@@ -54,14 +51,11 @@ async def combination_method(name, session):
     x = itertools_combinations(new_name, 2)
     for i in x:
         new_word = (i[0] + " " + i[1])
-        url = f"https://
-
-
-
-
-        for link in href_links:
-            if link.startswith('https://en.wikipedia.org/wiki/'):
-                data.add(link.split("/")[-1])
+        url = f"https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch={new_word}&srlimit=20&srprop=&srenablerewrites=True&format=json"
+        json_data = await fetch_json(url, session)
+        suggestion = json_data.get('query', {}).get('search', {})
+        for pageid in suggestion:
+            data.add(pageid.get('title', {}))
     return data
 
 async def single_method(name, session):
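Note: the rewritten body above drops the HTML-scraping path (fetching a results page and filtering href links for en.wikipedia.org/wiki/) in favour of the MediaWiki search API, which returns candidate page titles directly as JSON. The loop calls fetch_json, whose definition sits outside the hunks shown; a plausible reconstruction, assuming the aiohttp session passed in above:

    # Hypothetical sketch of the fetch_json helper used above; the real
    # definition lives elsewhere in this file.
    async def fetch_json(url, session):
        async with session.get(url) as response:
            return await response.json()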
@@ -69,30 +63,24 @@ async def single_method(name, session):
     data = set()
     new_name = name.replace("-", " ").replace("/", " ").split()
     for i in new_name:
-        url = f"https://
-
-
-
-
-        for link in href_links:
-            if link.startswith('https://en.wikipedia.org/wiki/'):
-                data.add(link.split("/")[-1])
+        url = f"https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch={i}&srlimit=20&srprop=&srenablerewrites=True&format=json"
+        json_data = await fetch_json(url, session)
+        suggestion = json_data.get('query', {}).get('search', {})
+        for pageid in suggestion:
+            data.add(pageid.get('title', {}))
     return data
 
-async def mains(name, single, combi):
+async def mains(name, deep_search):
     data = set()
     disam_data = set()
     qids = set()
 
     async with aiohttp.ClientSession() as session:
-        url = f"https://
-
-
-
-
-        for link in href_links:
-            if link.startswith('https://en.wikipedia.org/wiki/'):
-                data.add(link.split("/")[-1])
+        url = f"https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch={name}&srlimit=20&srprop=&srenablerewrites=True&format=json"
+        json_data = await fetch_json(url, session)
+        suggestion = json_data.get('query', {}).get('search', {})
+        for pageid in suggestion:
+            data.add(pageid.get('title', {}))
 
         wikipedia_url = f"https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch={name}&srlimit=1&srprop=&srenablerewrites=True&srinfo=suggestion&format=json"
         json_data = await fetch_json(wikipedia_url, session)
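Note: all three retrieval paths (combination_method, single_method, and the inline block in mains) now walk the same response shape: query.search is a list of dicts, one per hit, each carrying a title. A quick synchronous sanity check of the endpoint, using the requests import already in the file (assumes network access; illustration only):

    import requests

    resp = requests.get(
        "https://en.wikipedia.org/w/api.php",
        params={"action": "query", "list": "search", "srsearch": "Obama",
                "srlimit": 20, "srprop": "", "format": "json"},
    )
    # Each hit looks like {"ns": 0, "title": "Barack Obama", "pageid": 534366}.
    for hit in resp.json().get("query", {}).get("search", []):
        print(hit["title"])

One quirk worth flagging: pageid.get('title', {}) falls back to an unhashable dict, so a hit without a title key would make data.add() raise TypeError; pageid.get('title') plus a None check would be a safer default.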
@@ -126,14 +114,14 @@ async def mains(name, single, combi):
         disam_data.add(ids)
 
         # Makes combinations of the name
-        if combi == "Yes":
+        if deep_search == "Yes":
             if len(name.replace("-", " ").split()) >= 3:
                 combination_names = await combination_method(name, session)
                 for i in combination_names:
                     disam_data.add(i)
 
         # Checks every word alone
-        if single == "Yes":
+        if deep_search == "Yes":
             if len(name.replace("-", " ").replace("/", " ").split()) >= 2:
                 singles = await single_method(name, session)
                 for i in singles:
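Note: both expansion strategies are now gated behind the one deep_search flag: pairwise combinations for mentions of three or more words, and per-word lookups for two or more. What the pair expansion in combination_method generates, assuming itertools_combinations aliases itertools.combinations as the call shape suggests:

    from itertools import combinations as itertools_combinations

    new_name = "New York City".replace("-", " ").split()
    for i in itertools_combinations(new_name, 2):
        print(i[0] + " " + i[1])
    # New York
    # New City
    # York City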
@@ -270,18 +258,6 @@ async def main(name):
     with open(f"/home/user/app/info_extraction/{name}.json", "w", encoding="utf-8") as flast:
         json.dump(final_list, flast)
 
-#def check_sentence(sentence):
-    # two_consecutive_uppercase = r"[A-Z]{2}"
-    # uppercase_followed_by_fullstop = r"[A-Z]\."
-
-    # if re.search(two_consecutive_uppercase, sentence):
-        # return True
-
-    # if re.search(uppercase_followed_by_fullstop, sentence):
-        # return True
-
-    # return False
-
 def main_cli():
     st.title("✨ Entity Linking Application ✨")
     st.caption("This web application is part of my master’s dissertation.")
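Note: the block deleted above was already commented out, so this is pure dead-code removal. For the record, un-commented it amounts to the following acronym heuristic (a sketch reconstructed from the deleted comments):

    import re

    def check_sentence(sentence):
        # True if the sentence contains two consecutive uppercase letters...
        if re.search(r"[A-Z]{2}", sentence):
            return True
        # ...or an uppercase letter followed by a full stop (e.g. "J.").
        if re.search(r"[A-Z]\.", sentence):
            return True
        return False

    print(check_sentence("NASA launched."))  # True
    print(check_sentence("hello world"))     # False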
@@ -306,8 +282,7 @@ def main_cli():
 
     input_sentence_user = st.text_input("Enter a sentence:", "", disabled=st.session_state.running)
     input_mention_user = st.text_input("Enter a textural reference (mention) that is inside the sentence:", "", disabled=st.session_state.running)
-
-    combi = st.selectbox("Make combinations of each word? (Useful for difficult mentions)", ['Yes', 'No'], index=1, disabled=st.session_state.running)
+    deep_search = st.selectbox("Perform deep search? (Useful for difficult mentions)", ['Yes', 'No'], index=1, disabled=st.session_state.running)
     disambi = st.selectbox("Run acronym disambiguation? (Enable it if the mention include an acronym or if it is nested)", ['Yes', 'No'], index=0, disabled=st.session_state.running)
 
     if st.button("Run Entity Linking", key="run_button", disabled=st.session_state.running):
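Note: the single deep_search selectbox replaces the old combi control (and the companion flag implied by the old mains(name, single, combi) signature). With index=1 on ['Yes', 'No'], the default is 'No', so deep search remains opt-in.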
@@ -426,18 +401,18 @@ def main_cli():
             list_with_contexts.append(context)
         st.write("✅ Applied Data Normilzation module (1/5)")
         # Candidate Retrieval & Information Gathering
-        async def big_main(mention, single, combi):
+        async def big_main(mention, deep_search):
             mention = mention.split(",")
             with st.spinner("Applying Candidate Retrieval module... (2/5)"):
                 for i in mention:
-                    await mains(i, single, combi)
+                    await mains(i, deep_search)
             st.write("✅ Applied Candidate Retrieval module (2/5)")
             with st.spinner("Applying Information Gathering module... (3/5)"):
                 for i in mention:
                     await main(i)
             st.write("✅ Applied Information Gathering module (3/5)")
 
-        asyncio.run(big_main(name, single, combi))
+        asyncio.run(big_main(name, deep_search))
 
         number = 0
         for i,j,o in zip(list_with_full_names,list_with_contexts,list_with_names_to_show):
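Note: taken together, the change threads one flag end to end: selectbox → big_main → mains. A condensed, runnable sketch of the new call chain, with hypothetical stubs standing in for the app's real coroutines:

    import asyncio
    import nest_asyncio

    nest_asyncio.apply()

    # Stubs standing in for the app's real coroutines (hypothetical):
    async def mains(name, deep_search):
        print(f"retrieve candidates for {name!r} (deep_search={deep_search})")

    async def main(name):
        print(f"gather info for {name!r}")

    async def big_main(mention, deep_search):
        mention = mention.split(",")
        for i in mention:
            await mains(i, deep_search)   # Candidate Retrieval (2/5)
        for i in mention:
            await main(i)                 # Information Gathering (3/5)

    asyncio.run(big_main("Obama,NASA", "No"))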