NikosKprl committed
Commit fa2e30c · verified · 1 Parent(s): 79d44c7

Update ✨Entity Linking Application✨.py

Files changed (1):
  1. ✨Entity Linking Application✨.py +27 -52
✨Entity Linking Application✨.py CHANGED
@@ -1,4 +1,3 @@
-import pandas as pd
 import json
 import numpy as np
 import re
@@ -15,20 +14,18 @@ from openai import OpenAI
 import sys
 import time
 from bs4 import BeautifulSoup
-from fake_useragent import UserAgent
 import requests
-
-ua = UserAgent()
+import nest_asyncio
+import httpx
+
+
+nest_asyncio.apply()
 
-headers = {
-    "User-Agent": f"{ua.random}"
-}
 
 folder_path = '/home/user/app/qids_folder'
 
 if not os.path.exists(folder_path):
     os.mkdir(folder_path)
-    print(f"folder created at {folder_path}")
 else:
     pass
 
@@ -54,14 +51,11 @@ async def combination_method(name, session):
     x = itertools_combinations(new_name, 2)
     for i in x:
         new_word = (i[0] + " " + i[1])
-        url = f"https://www.google.com/search?q={new_word} site:en.wikipedia.org inurl:/wiki/ -inurl:? -inurl:Category: -inurl:File: -inurl:Special: -inurl:Help:&num=5"
-        html = requests.get(url, headers=headers)
-        soup = BeautifulSoup(html.text, "html.parser")
-        elements_with_href = soup.find_all(href=True)
-        href_links = [element['href'] for element in elements_with_href]
-        for link in href_links:
-            if link.startswith('https://en.wikipedia.org/wiki/'):
-                data.add(link.split("/")[-1])
+        url = f"https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch={new_word}&srlimit=20&srprop=&srenablerewrites=True&format=json"
+        json_data = await fetch_json(url, session)
+        suggestion = json_data.get('query', {}).get('search', {})
+        for pageid in suggestion:
+            data.add(pageid.get('title', {}))
     return data
 
 async def single_method(name, session):
@@ -69,30 +63,24 @@ async def single_method(name, session):
     data = set()
     new_name = name.replace("-", " ").replace("/", " ").split()
     for i in new_name:
-        url = f"https://www.google.com/search?q={i} site:en.wikipedia.org inurl:/wiki/ -inurl:? -inurl:Category: -inurl:File: -inurl:Special: -inurl:Help:&num=5"
-        html = requests.get(url, headers=headers)
-        soup = BeautifulSoup(html.text, "html.parser")
-        elements_with_href = soup.find_all(href=True)
-        href_links = [element['href'] for element in elements_with_href]
-        for link in href_links:
-            if link.startswith('https://en.wikipedia.org/wiki/'):
-                data.add(link.split("/")[-1])
+        url = f"https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch={i}&srlimit=20&srprop=&srenablerewrites=True&format=json"
+        json_data = await fetch_json(url, session)
+        suggestion = json_data.get('query', {}).get('search', {})
+        for pageid in suggestion:
+            data.add(pageid.get('title', {}))
     return data
 
-async def mains(name, single, combi):
+async def mains(name, deep_search):
     data = set()
     disam_data = set()
     qids = set()
 
     async with aiohttp.ClientSession() as session:
-        url = f"https://www.google.com/search?q={name} site:en.wikipedia.org inurl:/wiki/ -inurl:? -inurl:Category: -inurl:File: -inurl:Special: -inurl:Help:"
-        html = requests.get(url, headers=headers)
-        soup = BeautifulSoup(html.text, "html.parser")
-        elements_with_href = soup.find_all(href=True)
-        href_links = [element['href'] for element in elements_with_href]
-        for link in href_links:
-            if link.startswith('https://en.wikipedia.org/wiki/'):
-                data.add(link.split("/")[-1])
+        url = f"https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch={name}&srlimit=20&srprop=&srenablerewrites=True&format=json"
+        json_data = await fetch_json(url, session)
+        suggestion = json_data.get('query', {}).get('search', {})
+        for pageid in suggestion:
+            data.add(pageid.get('title', {}))
 
         wikipedia_url = f"https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch={name}&srlimit=1&srprop=&srenablerewrites=True&srinfo=suggestion&format=json"
         json_data = await fetch_json(wikipedia_url, session)
@@ -126,14 +114,14 @@ async def mains(name, single, combi):
             disam_data.add(ids)
 
         # Makes combinations of the name
-        if combi == "Yes":
+        if deep_search == "Yes":
            if len(name.replace("-", " ").split()) >= 3:
                combination_names = await combination_method(name, session)
                for i in combination_names:
                    disam_data.add(i)
 
        # Checks every word alone
-        if single == "Yes":
+        if deep_search == "Yes":
            if len(name.replace("-", " ").replace("/", " ").split()) >= 2:
                singles = await single_method(name, session)
                for i in singles:
@@ -270,18 +258,6 @@ async def main(name):
     with open(f"/home/user/app/info_extraction/{name}.json", "w", encoding="utf-8") as flast:
         json.dump(final_list, flast)
 
-#def check_sentence(sentence):
-#    two_consecutive_uppercase = r"[A-Z]{2}"
-#    uppercase_followed_by_fullstop = r"[A-Z]\."
-
-#    if re.search(two_consecutive_uppercase, sentence):
-#        return True
-
-#    if re.search(uppercase_followed_by_fullstop, sentence):
-#        return True
-
-#    return False
-
 def main_cli():
     st.title("✨ Entity Linking Application ✨")
     st.caption("This web application is part of my master’s dissertation.")
@@ -306,8 +282,7 @@ def main_cli():
 
     input_sentence_user = st.text_input("Enter a sentence:", "", disabled=st.session_state.running)
     input_mention_user = st.text_input("Enter a textural reference (mention) that is inside the sentence:", "", disabled=st.session_state.running)
-    single = st.selectbox("Search each word individually? (Useful for difficult mentions)", ['Yes', 'No'], index=1, disabled=st.session_state.running)
-    combi = st.selectbox("Make combinations of each word? (Useful for difficult mentions)", ['Yes', 'No'], index=1, disabled=st.session_state.running)
+    deep_search = st.selectbox("Perform deep search? (Useful for difficult mentions)", ['Yes', 'No'], index=1, disabled=st.session_state.running)
     disambi = st.selectbox("Run acronym disambiguation? (Enable it if the mention include an acronym or if it is nested)", ['Yes', 'No'], index=0, disabled=st.session_state.running)
 
     if st.button("Run Entity Linking", key="run_button", disabled=st.session_state.running):
@@ -426,18 +401,18 @@ def main_cli():
             list_with_contexts.append(context)
         st.write("✅ Applied Data Normilzation module (1/5)")
         # Candidate Retrieval & Information Gathering
-        async def big_main(mention, single, combi):
+        async def big_main(mention, deep_search):
             mention = mention.split(",")
             with st.spinner("Applying Candidate Retrieval module... (2/5)"):
                 for i in mention:
-                    await mains(i, single, combi)
+                    await mains(i, deep_search)
             st.write("✅ Applied Candidate Retrieval module (2/5)")
             with st.spinner("Applying Information Gathering module... (3/5)"):
                 for i in mention:
                     await main(i)
             st.write("✅ Applied Information Gathering module (3/5)")
 
-        asyncio.run(big_main(name, single, combi))
+        asyncio.run(big_main(name, deep_search))
 
         number = 0
         for i,j,o in zip(list_with_full_names,list_with_contexts,list_with_names_to_show):
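
For reference, the candidate-retrieval pattern this commit switches to (MediaWiki full-text search instead of scraping Google results) can be exercised standalone. A minimal sketch, assuming aiohttp, which the file already uses; the fetch_json helper below is a stand-in for the app's own helper of the same name, and search_titles is a hypothetical wrapper:

import asyncio
import aiohttp

async def fetch_json(url, session):
    # Stand-in for the app's fetch_json helper (assumption).
    async with session.get(url) as response:
        return await response.json()

async def search_titles(term):
    # Same MediaWiki search endpoint and parameters as the new code above.
    url = (
        "https://en.wikipedia.org/w/api.php?action=query&list=search"
        f"&srsearch={term}&srlimit=20&srprop=&srenablerewrites=True&format=json"
    )
    async with aiohttp.ClientSession() as session:
        json_data = await fetch_json(url, session)
    # The API answers {"query": {"search": [{"title": ..., "pageid": ...}, ...]}}.
    return {hit.get("title") for hit in json_data.get("query", {}).get("search", [])}

print(asyncio.run(search_titles("entity linking")))

The added nest_asyncio.apply() patches the event loop so that nested asyncio.run calls do not raise "event loop is already running", which is presumably why it accompanies the asyncio.run(big_main(...)) call inside the Streamlit script.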