noobArtInt committed on
Commit
c28e1a4
1 Parent(s): ff0c832

Main Commit

Files changed (1)
  1. main.py +584 -0
main.py ADDED
@@ -0,0 +1,584 @@
+ import requests
+ import streamlit as st
+ import wikipedia
+ from wikipedia import WikipediaPage
+ import pandas as pd
+ import spacy
+ import unicodedata
+ from nltk.corpus import stopwords
+ import numpy as np
+ import nltk
+ from newspaper import Article
+ nltk.download('stopwords')
+ from string import punctuation
+ import json
+ import time
+ from datetime import datetime, timedelta
+ import urllib
+ from io import BytesIO
+ from PIL import Image, UnidentifiedImageError
+ from SPARQLWrapper import SPARQLWrapper, JSON, N3
+ from fuzzywuzzy import process, fuzz
+ from st_aggrid import GridOptionsBuilder, AgGrid, GridUpdateMode, DataReturnMode
+ # Assumed import: search() and get_random_user_agent() are called in the lookup section below but were
+ # never imported; they appear to come from the googlesearch module of the 'google' package.
+ from googlesearch import search, get_random_user_agent
+
+
+ sparql = SPARQLWrapper('https://dbpedia.org/sparql')
+
+ class ExtractArticleEntities:
+     """Extract article entities from a document using natural language processing (NLP) and fuzzy matching.
+
+     Parameters
+
+     - text: a string containing the text of a news article to be parsed
+
+     Usage:
+         import ExtractArticleEntities
+         instantiate with a text parameter, i.e. entities = ExtractArticleEntities(text)
+         retrieve Who, What, When, Where entities with entities.www_json
+         retrieve non-organised entities with entities.json
+     """
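+
+     # Illustrative usage sketch (the article text is a made-up placeholder):
+     #
+     #     entities = ExtractArticleEntities("Apple unveiled a new iPhone in California on Tuesday.")
+     #     print(entities.www_json)   # entities grouped under who / what / where / when
+     #     print(entities.json)       # flat list of entity records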
+
+     def __init__(self, text):
+         self.text = text  # preprocess text at initialisation
+         self.text = self.preprocessing(self.text)
+         print(self.text)
+         print('_____text_____')
+         self.json = {}
+         # Create empty dataframe to hold entity data for ease of processing
+         self.entity_df = pd.DataFrame(columns=["entity", "description"])
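+         # Note: loading 'en_core_web_lg' below assumes the large English spaCy model is already
+         # installed (e.g. via `python -m spacy download en_core_web_lg`).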
+         # Load the spacy model
+         self.nlp = spacy.load('en_core_web_lg')
+
+         print('___________self.nlp', self.nlp._path)
+         # Parse the text
+         self.entity_df = self.get_who_what_where_when()
+         # Disambiguate entities
+         self.entity_df = self.fuzzy_disambiguation()
+         self.get_related_entity()
+         self.get_popularity()
+         # Create JSON representation of entities
+         self.entity_df = self.entity_df.drop_duplicates(subset=["description"])
+         self.entity_df = self.entity_df.reset_index(drop=True)
+
+         # ungrouped entity returned as json
+         self.json = self.entity_json()
+         # return json with entities grouped into who, what, where, when keys
+         self.www_json = self.get_wwww_json()
+
+
+     # def get_related_entity(self):
+     #     entities = self.entity_df.description
+     #     labels = self.entity_df.entity
+     #     related_entity = []
+     #     for entity, label in zip(entities, labels):
+     #         if label in ('PERSON', 'ORG', 'GPE', 'NORP', 'LOC'):
+     #             related_entity.append(wikipedia.search(entity, 3))
+     #         else:
+     #             related_entity.append([None])
+
+     #     self.entity_df['Wikipedia Entity'] = related_entity
+
+     def get_popularity(self):
+         # names = self.entity_df.description
+         # related_names = self.entity_df['Matched Entity']
+         # for name, related_name in zip(names, related_names):
+         #     if related_name:
+         #         related_name.append(name)
+         #         pytrends.build_payload(related_name, timeframe='now 4-d')
+         #         st.dataframe(pytrends.interest_over_time())
+         #         time.sleep(2)
+         master_df = pd.DataFrame()
+         view_list = []
+         for entity in self.entity_df['Matched Entity']:
+             if entity:
+                 entity_to_look = entity[0]
+                 # print(entity_to_look, '_______')
+                 entity_to_look = entity_to_look.replace(' ', '_')
+                 print(entity_to_look, '_______')
+                 headers = {
+                     'accept': 'application/json',
+                     'User-Agent': 'Foo bar'
+                 }
+
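+                 # Pull the last seven days of daily page views for the matched article from the
+                 # Wikimedia Pageviews REST API (per-article, en.wikipedia.org, all access, all agents).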
+                 now = datetime.now()
+                 now_dt = now.strftime(r'%Y%m%d')
+                 week_back = now - timedelta(days=7)
+                 week_back_dt = week_back.strftime(r'%Y%m%d')
+                 resp = requests.get(f'https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia.org/all-access/all-agents/{entity_to_look}/daily/{week_back_dt}/{now_dt}', headers=headers)
+                 data = resp.json()
+                 # print(data)
+                 df = pd.json_normalize(data['items'])
+                 view_count = sum(df['views'])
+             else:
+                 view_count = 0
+             view_list.append(view_count)
+
+         self.entity_df['Views'] = view_list
+
+         for entity in ('PERSON', 'ORG', 'GPE', 'NORP', 'LOC'):
+             related_entity_view_list = []
+             grouped_df = self.entity_df[self.entity_df['entity'] == entity]
+             grouped_df['Matched count'] = grouped_df['fuzzy_match'].apply(len)
+             grouped_df['Wiki count'] = grouped_df['Matched Entity'].apply(len)
+
+             grouped_df = grouped_df.sort_values(by=['Views', 'Matched count', 'Wiki count'], ascending=False).reset_index(drop=True)
+             if not grouped_df.empty:
+                 # st.dataframe(grouped_df)
+                 master_df = pd.concat([master_df, grouped_df])
+
+         self.sorted_entity_df = master_df
+         if 'Views' in self.sorted_entity_df:
+             self.sorted_entity_df = self.sorted_entity_df.sort_values(by=['Views'], ascending=False).reset_index(drop=True)
+             # st.dataframe(self.sorted_entity_df)
+         # names = grouped_df['description'][:5].values
+         # print(names, type(names))
+         # if names.any():
+         #     # pytrends.build_payload(names, timeframe='now 1-m')
+         #     st.dataframe(pytrends.get_historical_interest(names,
+         #                                                   year_start=2022, month_start=10, day_start=1,
+         #                                                   hour_start=0,
+         #                                                   year_end=2022, month_end=10, day_end=21,
+         #                                                   hour_end=0, cat=0, geo='', gprop='', sleep=0))
+         #     st.dataframe()
+         #     time.sleep(2)
+         # st.dataframe(grouped_df)
+
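+     # For each named entity of type PERSON/ORG/GPE/NORP/LOC, ask Wikipedia for up to ten candidate
+     # page titles and keep the ones that fuzzy-match the entity text with a score of 90 or more.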
+     def get_related_entity(self):
+         names = self.entity_df.description
+         entities = self.entity_df.entity
+         self.related_entity = []
+         match_scores = []
+         for name, entity in zip(names, entities):
+             if entity in ('PERSON', 'ORG', 'GPE', 'NORP', 'LOC'):
+                 related_names = wikipedia.search(name, 10)
+                 self.related_entity.append(related_names)
+                 matches = process.extract(name, related_names)
+                 match_scores.append([match[0] for match in matches if match[1] >= 90])
+             else:
+                 self.related_entity.append([None])
+                 match_scores.append([])
+         # Remove nulls
+
+         self.entity_df['Wikipedia Entity'] = self.related_entity
+         self.entity_df['Matched Entity'] = match_scores
+
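+     # Group near-duplicate mentions of the same entity (fuzzy score above 80, excluding exact matches)
+     # within each entity type; the matches are stored in the 'fuzzy_match' column.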
+     def fuzzy_disambiguation(self):
+         # Load the entity data
+         self.entity_df['fuzzy_match'] = ''
+         # Build candidate lists per entity type
+         person_choices = self.entity_df.loc[self.entity_df['entity'] == 'PERSON']
+         org_choices = self.entity_df.loc[self.entity_df['entity'] == 'ORG']
+         where_choices = self.entity_df.loc[self.entity_df['entity'] == 'GPE']
+         norp_choices = self.entity_df.loc[self.entity_df['entity'] == 'NORP']
+         loc_choices = self.entity_df.loc[self.entity_df['entity'] == 'LOC']
+         date_choices = self.entity_df.loc[self.entity_df['entity'] == 'DATE']
+
+         def fuzzy_match(row, choices):
+             '''This function disambiguates entities by looking for a maximum of three matches with a score of 80
+             or more for each of the entity types. If there is no match, the function returns an empty list.'''
+             match = process.extract(row["description"], choices["description"], limit=3)
+
+             match = [m[0] for m in match if m[1] > 80 and m[1] != 100]
+
+             if len(match) == 0:
+                 match = []
+
+             if match:
+                 self.fuzzy_match_dict[row["description"]] = match
+
+             return match
+
+         # Apply the fuzzy matching function to the entity dataframe
+         self.fuzzy_match_dict = {}
+
+         for i, row in self.entity_df.iterrows():
+             if row['entity'] == 'PERSON':
+                 self.entity_df.at[i, 'fuzzy_match'] = fuzzy_match(row, person_choices)
+             elif row['entity'] == 'ORG':
+                 self.entity_df.at[i, 'fuzzy_match'] = fuzzy_match(row, org_choices)
+             elif row['entity'] == 'GPE':
+                 self.entity_df.at[i, 'fuzzy_match'] = fuzzy_match(row, where_choices)
+             elif row['entity'] == 'NORP':
+                 self.entity_df.at[i, 'fuzzy_match'] = fuzzy_match(row, norp_choices)
+             elif row['entity'] == 'LOC':
+                 self.entity_df.at[i, 'fuzzy_match'] = fuzzy_match(row, loc_choices)
+             elif row['entity'] == 'DATE':
+                 self.entity_df.at[i, 'fuzzy_match'] = fuzzy_match(row, date_choices)
+
+         return self.entity_df
+
+     def preprocessing(self, text):
+         """This function takes a text string, strips out punctuation, replaces common bad-encoding artefacts
+         with plain-text equivalents, normalises the result (using the "NFKD" normalisation algorithm) and then
+         strips any punctuation that remains."""
+
+         # remove punctuation
+         text = text.translate(str.maketrans("", "", punctuation))
+         # normalize the text
+         stop_words = stopwords.words('english')
+
+         # Removing stop words can cause loss of context; instead, stopwords can be utilised for knowledge
+         filtered_words = [word for word in text.split()]  # if word not in stop_words]
+
+         # This is very hacky. Need a better way of handling bad encoding
+         pre_text = " ".join(filtered_words)
+         pre_text = pre_text.replace(' ', ' ')
+         pre_text = pre_text.replace('’', "'")
+         pre_text = pre_text.replace('“', '"')
+         pre_text = pre_text.replace('â€', '"')
+         pre_text = pre_text.replace('‘', "'")
+         pre_text = pre_text.replace('…', '...')
+         pre_text = pre_text.replace('–', '-')
+         pre_text = pre_text.replace("\x9d", '-')
+         # normalize the text
+         pre_text = unicodedata.normalize("NFKD", pre_text)
+         # strip punctuation again as some remains in first pass
+         pre_text = pre_text.translate(str.maketrans("", "", punctuation))
+
+         return pre_text
+
+     def get_who_what_where_when(self):
+         """Get entity information from a document.
+
+         This function will return a DataFrame with the following columns:
+
+         - entity: the spaCy label of the entity (its type)
+         - description: the entity text as it appears in the article
+
+         Usage:
+
+             get_who_what_where_when(text)
+
+         Example:
+
+             > get_who_what_where_when('This is a test')
+
+         Entity types kept:
+         PERSON
+         ORG
+         GPE
+         LOC
+         PRODUCT
+         EVENT
+         LAW
+         LANGUAGE
+         NORP
+         DATE
+         TIME"""
+
+         # list to hold entity data
+         article_entity_list = []
+         # tokenize the text
+         doc = self.nlp(self.text)
+         # iterate over the entities in the document but only keep those which are meaningful
+         desired_entities = ['PERSON', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT', 'LAW', 'LANGUAGE', 'NORP', 'DATE',
+                             'TIME']
+         self.label_dict = {}
+
+         # stop_words = stopwords.words('english')
+         for ent in doc.ents:
+             self.label_dict[ent] = ent.label_
+             if ent.label_ in desired_entities:
+                 # add the entity to the list
+                 entity_dict = {ent.label_: ent.text}
+                 article_entity_list.append(entity_dict)
+
+         # dedupe the entities, but only on an exact match of values, as occasionally an ORG entity is also assigned to PERSON
+         deduplicated_entities = {frozenset(item.values()):
+                                      item for item in article_entity_list}.values()
+         # create a dataframe from the entities
+         for record in deduplicated_entities:
+             record_df = pd.DataFrame(record.items(), columns=["entity", "description"])
+             self.entity_df = pd.concat([self.entity_df, record_df], ignore_index=True)
+
+         print(self.entity_df)
+         print('______________________')
+         return self.entity_df
+
+     def entity_json(self):
+         """Returns a JSON representation of the entities held in the `entity_df` dataframe. The `entity_json`
+         function will return a JSON object with the following fields:
+         - entity: The type of the entity in the text
+         - description: The name of the entity as described in the input text
+         - fuzzy_match: A list of fuzzy matches for the entity. This is useful for disambiguating entities that are similar
+         """
+
+         self.json = json.loads(self.entity_df.to_json(orient='records'))
+         # self.json = json.dumps(self.json, indent=2)
+         return self.json
+
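+     # Shape of the value returned by get_wwww_json() below (illustrative only; the entity values are made up):
+     # [
+     #   {"who":   [{"entity": "PERSON", "description": "Jane Doe", "fuzzy_match": [], ...}]},
+     #   {"where": [{"entity": "GPE", "description": "California", "fuzzy_match": [], ...}]},
+     #   {"when":  [{"entity": "DATE", "description": "Tuesday", "fuzzy_match": [], ...}]},
+     #   {"what":  [{"entity": "EVENT", "description": "the launch event", "fuzzy_match": [], ...}]}
+     # ]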
+     def get_wwww_json(self):
+         """This function returns a JSON representation of the `get_who_what_where_when` output. The `get_wwww_json`
+         function will return a JSON object with the following fields:
+         - entity: The type of the entity in the text
+         - description: The name of the entity as described in the input text
+         - fuzzy_match: A list of fuzzy matches for the entity. This is useful for disambiguating entities that are similar
+         """
+
+         # create a json object from the entity dataframe
+         who_dict = {"who": [ent for ent in self.entity_json() if ent['entity'] in ['ORG', 'PERSON']]}
+         where_dict = {"where": [ent for ent in self.entity_json() if ent['entity'] in ['GPE', 'LOC']]}
+         when_dict = {"when": [ent for ent in self.entity_json() if ent['entity'] in ['DATE', 'TIME']]}
+         what_dict = {
+             "what": [ent for ent in self.entity_json() if ent['entity'] in ['PRODUCT', 'EVENT', 'LAW', 'LANGUAGE',
+                                                                             'NORP']]}
+         article_wwww = [who_dict, where_dict, when_dict, what_dict]
+         self.wwww_json = json.dumps(article_wwww, indent=2)
+
+         return self.wwww_json
+
+
+ news_article = st.text_input('Paste an Article here to be parsed')
+ if 'parsed' not in st.session_state:
+     st.session_state['parsed'] = None
+     st.session_state['article'] = None
+ if news_article:
+     st.write('Your news article is')
+     st.write(news_article)
+
+     if st.button('Get details'):
+         parsed = ExtractArticleEntities(news_article)
+         if parsed:
+             st.session_state['article'] = parsed.sorted_entity_df
+             st.session_state['parsed'] = True
+             st.session_state['json'] = parsed.www_json
+
+ # if not st.session_state['article'].empty:
+
+ def preprocessing(text):
+     """This function takes a text string, strips out punctuation, replaces common bad-encoding artefacts
+     with plain-text equivalents, normalises the result (using the "NFKD" normalisation algorithm) and then
+     strips any punctuation that remains."""
+
+     # remove punctuation
+     if text:
+         text = text.translate(str.maketrans("", "", punctuation))
+         # normalize the text
+         stop_words = stopwords.words('english')
+
+         # Removing stop words can cause loss of context; instead, stopwords can be utilised for knowledge
+         filtered_words = [word for word in text.split()]  # if word not in stop_words]
+
+         # This is very hacky. Need a better way of handling bad encoding
+         pre_text = " ".join(filtered_words)
+         pre_text = pre_text.replace(' ', ' ')
+         pre_text = pre_text.replace('’', "'")
+         pre_text = pre_text.replace('“', '"')
+         pre_text = pre_text.replace('â€', '"')
+         pre_text = pre_text.replace('‘', "'")
+         pre_text = pre_text.replace('…', '...')
+         pre_text = pre_text.replace('–', '-')
+         pre_text = pre_text.replace("\x9d", '-')
+         # normalize the text
+         pre_text = unicodedata.normalize("NFKD", pre_text)
+         # strip punctuation again as some remains in first pass
+         pre_text = pre_text.translate(str.maketrans("", "", punctuation))
+     else:
+         pre_text = None
+     return pre_text
+
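+ # filter_wiki_df() tidies the infobox table scraped with pd.read_html(): it keeps the first two columns,
+ # drops rows where the two cells are identical or missing, and renames the columns to Attribute / Value.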
+ def filter_wiki_df(df):
+     key_list = df.keys()[:2]
+     # df.to_csv('test.csv')
+     df = df[key_list]
+     # if len(df.keys()) == 2:
+     df['Match Check'] = np.where(df[df.keys()[0]] != df[df.keys()[1]], True, False)
+
+     df = df[df['Match Check'] != False]
+     df = df[key_list]
+     df = df.dropna(how='any').reset_index(drop=True)
+     # filtered_term = []
+     # for terms in df[df.keys()[0]]:
+     #     if isinstance(terms, str):
+     #         filtered_term.append(preprocessing(terms))
+     #     else:
+     #         filtered_term.append(None)
+     # df[df.keys()[0]] = filtered_term
+     df.rename(columns={key_list[0]: 'Attribute', key_list[1]: 'Value'}, inplace=True)
+
+     return df
+
+ def get_entity_from_selectbox(related_entity):
+     entity = st.selectbox('Please select the term:', related_entity, key='foo')
+     if entity:
+         summary_entity = wikipedia.summary(entity, 3)
+         return summary_entity
+
+ if st.session_state['parsed']:
+     df = st.session_state['article']
+     # left, right = st.columns(2)
+     # with left:
+     df_to_st = pd.DataFrame()
+
+     df_to_st['Name'] = df['description']
+     df_to_st['Is a type of'] = df['entity']
+     df_to_st['Related to'] = df['Matched Entity']
+     df_to_st['Is a type of'] = df_to_st['Is a type of'].replace({'PERSON': 'Person',
+                                                                  'ORG': 'Organization',
+                                                                  'GPE': 'Political Location',
+                                                                  'NORP': 'Political or Religious Groups',
+                                                                  'LOC': 'Non Political Location'})
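+     # The grid below is configured with pagination, a filter sidebar and checkbox multi-row selection;
+     # the rows the user ticks drive the DBpedia/Wikipedia lookup further down the page.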
+     gb = GridOptionsBuilder.from_dataframe(df_to_st)
+     gb.configure_pagination(paginationAutoPageSize=True)  # Add pagination
+     gb.configure_side_bar()  # Add a sidebar
+     gb.configure_selection('multiple', use_checkbox=True, groupSelectsChildren="Group checkbox select children")  # Enable multi-row selection
+     gridOptions = gb.build()
+
+     st.dataframe(df_to_st)
+     grid_response = AgGrid(
+         df_to_st,
+         gridOptions=gridOptions,
+         data_return_mode='AS_INPUT',
+         update_mode='MODEL_CHANGED',
+         fit_columns_on_grid_load=False,
+         enable_enterprise_modules=True,
+         height=350,
+         width='100%',
+         reload_data=True
+     )
+
+     data = grid_response['data']
+     selected = grid_response['selected_rows']
+     selected_df = pd.DataFrame(selected)
+     if not selected_df.empty:
+         selected_entity = selected_df[['Name', 'Is a type of', 'Related to']]
+         st.dataframe(selected_entity)
+
+     # with right:
+     #     st.json(st.session_state['json'])
+
+     entities_list = df['description']
+     # selected_entity = st.selectbox('Which entity you want to choose?',
+     #                                entities_list)
+     if not selected_df.empty and selected_entity['Name'].any():
+
+         # lookup_url = rf'https://lookup.dbpedia.org/api/search?query={selected_entity}'
+         # r = requests.get(lookup_url)
+
+         selected_row = df.loc[df['description'] == selected_entity['Name'][0]]
+
+         entity_value = selected_row.values
+         # st.write('Entity is a ', entity_value[0][0])
+         label, name, fuzzy, related, related_match, _, _, _ = entity_value[0]
+         not_matched = [word for word in related if word not in related_match]
+         fuzzy = fuzzy[0] if len(fuzzy) > 0 else ''
+         related = related[0] if len(related) > 0 else ''
+         not_matched = not_matched[0] if len(not_matched) > 0 else related
+
+         related_entity_list = [name, fuzzy, not_matched]
+         related_entity = entity_value[0][1:]
+
+         google_query_term = ' '.join(related_entity_list)
+         # search()
+         try:
+             urls = [i for i in search(google_query_term, stop=10, pause=2.0, tld='com', lang='en', tbs='0', user_agent=get_random_user_agent())]
+         except Exception:
+             urls = []
+         # urls = search(google_query_term+' news latest', num_results=10)
+         st.session_state['wiki_summary'] = False
+         all_related_entity = []
+         print(related_entity, ' _____')
+         for el in related_entity[:-2]:
+             if isinstance(el, str):
+                 all_related_entity.append(el)
+             elif isinstance(el, int):
+                 all_related_entity.append(str(el))
+             else:
+                 all_related_entity.extend(el)
+         # [ if type(el) == 'int' all_related_entity.extend(el) else all_related_entity.extend([el]) for el in related_entity]
+         summary_entity = None  # ensure summary_entity is defined even if no DBpedia result is found below
+         for entity in all_related_entity:
+             # print(all_related_entity)
+             # try:
+             if True:
+                 if entity:
+                     print(entity)
+                     entity = entity.replace(' ', '_')
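+                     # DBpedia lookup: fetch the English label, abstract (rdfs:comment) and thumbnail image
+                     # for the resource dbr:<entity> from the public SPARQL endpoint configured at the top of the file.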
+                     query = f'''
+                     SELECT ?name ?comment ?image
+                     WHERE {{ dbr:{entity} rdfs:label ?name.
+                              dbr:{entity} rdfs:comment ?comment.
+                              dbr:{entity} dbo:thumbnail ?image.
+
+                              FILTER (lang(?name) = 'en')
+                              FILTER (lang(?comment) = 'en')
+                     }}'''
+                     sparql.setQuery(query)
+
+                     sparql.setReturnFormat(JSON)
+                     qres = sparql.query().convert()
+                     if qres['results']['bindings']:
+                         result = qres['results']['bindings'][0]
+                         name, comment, image_url = result['name']['value'], result['comment']['value'], result['image']['value']
+                         # urllib.request.urlretrieve(image_url, "img.jpg")
+
+                         # img = Image.open("/Users/anujkarn/NER/img.jpg")
+                         wiki_url = f'https://en.wikipedia.org/wiki/{entity}'
+
+                         st.write(name)
+                         # st.image(img)
+                         st.write(image_url)
+                         # try:
+                         response = requests.get(image_url)
+                         # display(Image.open(BytesIO(response.content)))
+                         try:
+                             related_image = Image.open(BytesIO(response.content))
+                             st.image(related_image)
+                         except UnidentifiedImageError:
+                             st.write('Not able to get image')
+                             pass
+
+                         # except error as e:
+                         #     st.write(f'Image not parsed because of : {e}')
+                         summary_entity = comment
+                         wiki_knowledge_df = pd.read_html(wiki_url)[0]
+                         wiki_knowledge_df = filter_wiki_df(wiki_knowledge_df)
+
+                         # st.write('Showing description for entity:', name)
+                         # if st.button('Want something else?'):
+                         #     summary_entity = get_entity_from_selectbox(all_related_entity)
+                         break
+                         # summary_entity = wikipedia.summary(entity, 3)
+                     else:
+                         print(qres)
+                         print(query)
+                         summary_entity = None
+         if not summary_entity:
+             try:
+                 summary_entity = get_entity_from_selectbox(all_related_entity)
+                 # page = WikipediaPage(entity)
+             except wikipedia.exceptions.DisambiguationError:
+                 st.write('A disambiguation page exists for this term')
+
+         if selected_entity['Name'].any():
+             st.write(f'Summary for {selected_entity["Name"][0]}')
+             st.write(summary_entity)
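+
+ # To try the app locally (with the dependencies above installed): streamlit run main.py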