m. polinsky committed on
Commit
ef00cba
1 Parent(s): f6278d5

Create streamlit_app.py

Files changed (1)
  1. streamlit_app.py +563 -0
streamlit_app.py ADDED
@@ -0,0 +1,563 @@
+ import streamlit as st
+ from transformers import pipeline, AutoModel, AutoTokenizer
+ import time
+ from time import time as t
+ from gazpacho import Soup, get
+ import tokenizers
+ import json
+ import requests
+
+ #############
+ # FUNCTIONS #
+ #############
+
+ ex = []
+
+ # Query the Hugging Face Inference API summarization endpoint.
+ def query(payload):
+     data = json.dumps(payload)
+     response = requests.request("POST", API_URL, headers=headers, data=data)
+     return json.loads(response.content.decode("utf-8"))
+
+ # Query the Hugging Face Inference API NER endpoint.
+ def ner_query(payload):
+     data = json.dumps(payload)
+     response = requests.request("POST", NER_API_URL, headers=headers, data=data)
+     return json.loads(response.content.decode("utf-8"))
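+
+ # Note: both helpers POST a JSON-encoded payload and decode the JSON response;
+ # API_URL, NER_API_URL, and headers are defined in the SETUP section below.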
+
+ # Gets links for the user's chosen topics and tags each one as cnn or npr.
+ # Note: relies on the module-level `clusters` dict built in the PROCESSING section.
+ def get_articles(user_choices, cnn_dict, npr_dict):
+     clustLinks = []
+     heds = {}
+
+     # Get all headlines from each cluster -- add to dict and record the number of
+     # clusters of interest the headline appeared in.
+     for each in user_choices:
+         for beach in clusters[each.lower()]:
+             if beach not in heds:
+                 heds[beach] = 1
+             else:
+                 heds[beach] += 1
+
+     # Convert keys (headlines) to a list, then sort in descending order of prevalence.
+     sorted_heds = list(heds.keys())
+     sorted_heds.sort(key=lambda b: heds[b], reverse=True)
+
+     for each in sorted_heds:
+         try:
+             # Look up the headline in cnn...
+             clustLinks.append(('cnn', cnn_dict[each]))
+         except KeyError:
+             # ...and on KeyError look it up in npr instead.
+             clustLinks.append(('npr', npr_dict[each]))
+     return clustLinks
+
+ # Gets article text from each source via scraping; returns a list of articles.
+ def retrieve(input_reslist):
+     cnn = 'https://lite.cnn.com'
+     npr = 'https://text.npr.org'
+     articles = []
+
+     # Scrapes from npr or cnn. Should modularize this and use a dict as a switch-case.
+     for each in input_reslist:
+         if each[0] == 'npr':
+             container = Soup(get(npr+each[1])).find('div', {'class': "paragraphs-container"}).find('p')
+             articles.append(container)
+         if each[0] == 'cnn':
+             container = Soup(get(cnn+each[1])).find('div', {'class': 'afe4286c'})
+             # Extract all text from paragraph tags, each extracted from container
+             #story = '\n'.join([x.text for x in container.find('p') if x.text != ''])
+             story = container.find('p')
+             articles.append(story[4:])
+         time.sleep(1)  # brief pause between requests
+     return articles
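+
+ # retrieve() expects the (source, link) tuples produced by get_articles(), e.g.
+ # ('cnn', '/path-to-story'), and returns the scraped story content in the same order.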
+
+ # Takes the list of retrieved articles and strips/joins each one into a single piece of text.
+ ## *** Dictionary might shuffle articles?
+ def art_prep(retrieved):
+     a = []
+     for each in retrieved:
+         if type(each) is not list:
+             a.append(each.strip())
+         else:
+             a.append(''.join([art.strip() for art in each]))
+     return a
+
+ # user_choices is the list of user-chosen entities.
+ def seek_and_sum(user_choices, cnn_dict, npr_dict):
+     # If no topics are selected, return nothing.
+     if len(user_choices) == 0:
+         return []
+     digs = []
+     prepped = art_prep(retrieve(get_articles(user_choices, cnn_dict, npr_dict)))
+
+     # digs is the output...the digest.
+     for piece in prepped:
+         digs.append(create_summaries(piece, 'sshleifer/distilbart-cnn-12-6'))
+
+     # Opportunity for further processing here.
+     return digs
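+
+ # Pipeline: get_articles() picks links for the chosen topics, retrieve() scrapes them,
+ # art_prep() normalizes the text, and create_summaries() summarizes each piece.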
+
+ # Splits a long piece into chunks of roughly `limit` words.
+ def chunk_piece(piece, limit):
+     words = len(piece.split(' '))  # rough estimate of words; # words <= number of tokens, generally.
+     perchunk = words//limit
+     base_range = [i*limit for i in range(perchunk+1)]
+     range_list = [i for i in zip(base_range, base_range[1:])]
+     #range_list.append((range_list[-1][1],words)) try leaving off the end (or pad it?)
+     chunked_pieces = [' '.join(piece.split(' ')[i:j]).replace('\n','').replace('.','. ') for i,j in range_list]
+     return chunked_pieces
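+
+ # Note: because the trailing partial chunk is never appended (see the commented-out
+ # line above), any words past the last full `limit`-sized window are dropped.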
+
+ # Summarizes a piece, chunking it first if it is too long for the model.
+ def create_summaries(piece, chkpnt, lim=400):
+     tokenizer = AutoTokenizer.from_pretrained(chkpnt)
+     limit = lim
+     count = -1
+     summary = []
+     words = len(piece.split(' '))
+
+     if words >= limit:
+         # Chunk the piece.
+         #print(f'Chunking....')
+         proceed = False
+         while not proceed:
+             try:
+                 chunked_pieces = chunk_piece(piece, limit)
+                 for chunk in chunked_pieces:
+                     # tokenizer(chunk) returns an encoding; count its input_ids, not its keys.
+                     token_length = len(tokenizer(chunk)['input_ids'])
+
+                     # Perform summarization
+                     if token_length <= 512:
+                         data = query({ "inputs": str(chunk), "parameters": {"do_sample": False} })  # The way I'm passing the chunk could be the problem? In a loop by ref?
+                         summary.append(data[0]['summary_text'])
+                         proceed = True
+                     else:
+                         # Chunk still too long: back off as little as possible and re-chunk.
+                         proceed = False
+                         limit -= 2
+                         summary = []  # empty summary; we're starting again.
+                         break
+             except IndexError:  # Caused when 400 words get tokenized to > 512 tokens. Rare.
+                 proceed = False
+                 # Lower the limit; try to back off as little as possible.
+                 limit -= 2
+                 summary = []  # empty summary; we're starting again.
+         days_summary = ' '.join(summary)  # Concatenate partial summaries
+     else:
+         # Summarize the whole piece at once.
+         #print(f'Summarizing whole piece')
+         proceed = False
+         while not proceed:
+             try:
+                 # Perform summarization
+                 data = query({ "inputs": str(piece), "parameters": {"do_sample": False} })
+                 days_summary = data[0]['summary_text']
+                 proceed = True
+             except IndexError:
+                 proceed = False
+                 piece = piece[:-4]
+                 days_summary = ''
+     return days_summary
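+
+ # Backoff strategy: if a chunk exceeds the 512-token model limit, the word limit is
+ # lowered by 2, the partial summary is discarded, and the piece is re-chunked; for
+ # un-chunked pieces the text is trimmed 4 characters at a time until the call succeeds.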
+
+ # This function creates a nice output from the dictionary the NER pipeline returns.
+ # It works for grouped_entities = True or False.
+ def ner_results(ner_object, indent=False, groups=True, NER_THRESHOLD=0.5):
+     # Empty lists to collect our entities.
+     people, places, orgs, misc = [], [], [], []
+
+     # 'ent' and 'designation' handle the difference between dictionary keys
+     # for aggregation strategy grouped vs ungrouped.
+     ent = 'entity' if not groups else 'entity_group'
+     designation = 'I-' if not groups else ''
+
+     # Define actions -- this is a switch-case dictionary.
+     actions = {designation+'PER': people.append,
+                designation+'LOC': places.append,
+                designation+'ORG': orgs.append,
+                designation+'MISC': misc.append}
+
+     # For each dictionary in the NER result list, if its word doesn't contain a '#'
+     # and its score clears NER_THRESHOLD, append the entity name to the list for its
+     # type (the comprehension is used only for these side effects).
+     readable = [ actions[d[ent]](d['word']) for d in ner_object if '#' not in d['word'] and d['score'] > NER_THRESHOLD ]
+
+     # Create a deduplicated list of all entities (longer than 2 characters) to return.
+     ner_list = [i for i in set(people) if len(i) > 2] + [i for i in set(places) if len(i) > 2] + [i for i in set(orgs) if len(i) > 2] + [i for i in set(misc) if len(i) > 2]
+     return ner_list
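+
+ # ner_results() assumes the Inference API NER output format: a list of dicts with
+ # 'word', 'score', and either 'entity' or 'entity_group' keys, depending on grouping.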
+
+ @st.cache(hash_funcs={tokenizers.Tokenizer: id})
+ def create_ner_dicts(state=True):
+     # Changing `state` makes st.cache run the function again, refreshing the topics.
+     status = state
+
+     url1 = 'https://lite.cnn.com/en'
+     soup_cnn = Soup(get(url1))
+     # Extract each headline and link from the div containing the links.
+     cnn_text = [i.text for i in soup_cnn.find('div', {'class': 'afe4286c'}).find('a')]
+     cnn_links = [i.attrs['href'] for i in soup_cnn.find('div', {'class': 'afe4286c'}).find('a')]
+     cnn = [i for i in cnn_text if 'Analysis:' not in i and 'Opinion:' not in i]
+
+     # Get current links...in the future you'll have to check for overlaps.
+     url2 = 'https://text.npr.org/1001'
+     soup = Soup(get(url2))
+     # Extract each headline and link.
+     npr_text = [i.text for i in soup.find('div', {'class': 'topic-container'}).find('ul').find('a')]
+     npr_links = [i.attrs['href'] for i in soup.find('div', {'class': 'topic-container'}).find('ul').find('a')]
+     npr = [i for i in npr_text if 'Opinion:' not in i]
+
+     # Map each headline to its link.
+     cnn_dict = {k[0]: k[1] for k in zip(cnn_text, cnn_links)}
+     npr_dict = {k[0]: k[1] for k in zip(npr_text, npr_links)}
+
+     # Perform NER on each headline (Opinion/Analysis pieces excluded).
+     cnn_ner = {x: ner_results(ner_query(x)) for x in cnn}
+     npr_ner = {x: ner_results(ner_query(x)) for x in npr}
+
+     return cnn_dict, npr_dict, cnn_ner, npr_ner
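+
+ # Values are returned in the order expected by the module-level unpacking below:
+ # cnn_dict, npr_dict, cnn_ner, npr_ner.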
+
+ ## Builds the topic clusters: maps each recognized entity (lowercased) to the
+ ## headlines that mention it.
+ def get_news_topics(cnn_ner, npr_ner):
+     ## Select only the articles that are clusterable (entities were recognized).
+     cnn_final = {x: cnn_ner[x] for x in cnn_ner.keys() if len(cnn_ner[x]) != 0}
+     npr_final = {y: npr_ner[y] for y in npr_ner.keys() if len(npr_ner[y]) != 0}
+
+     # What's in the news?
+     # Get the entities named in the pool of articles we're drawing from.
+     e_list = []
+     for i in [i for i in cnn_final.values()]:
+         for j in i:
+             e_list.append(j)
+     for k in [k for k in npr_final.values()]:
+         for j in k:
+             e_list.append(j)
+
+     # This is a dictionary keyed by those entities.
+     clusters = {k.lower(): [] for k in e_list}
+
+     ## Perform clustering.
+     for hed in cnn_final.keys():
+         for item in cnn_final[hed]:
+             clusters[item.lower()].append(hed)  # place the headline in the list for each entity it mentions
+     for hed in npr_final.keys():
+         for item in npr_final[hed]:
+             clusters[item.lower()].append(hed)
+
+     return clusters
+
+ ## Clears the cache so create_ner_dicts() above runs again, then builds updated clusters.
+ def update_topics():
+     st.legacy_caching.clear_cache()
+     dicts = [i for i in create_ner_dicts()]
+     # dicts holds (cnn_dict, npr_dict, cnn_ner, npr_ner); cluster on the fresh NER results.
+     clusters = get_news_topics(dicts[2], dicts[3])
+     return clusters, dicts
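+
+ # The "Refresh topics!" button below clears the cache via update_topics() and rebinds
+ # the headline dictionaries, NER results, and clusters to freshly scraped data.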
+
+ #############
+ #   SETUP   #
+ #############
+
+ # Auth for the HF Inference API and URLs to the models we're using --
+ # sshleifer/distilbart-cnn-12-6 for summarization and
+ # dbmdz/bert-large-cased-finetuned-conll03-english for NER.
+ headers = {"Authorization": f"""Bearer {st.secrets["ato"]}"""}
+ API_URL = "https://api-inference.huggingface.co/models/sshleifer/distilbart-cnn-12-6"
+ NER_API_URL = "https://api-inference.huggingface.co/models/dbmdz/bert-large-cased-finetuned-conll03-english"
+
+ ##############
+ # PROCESSING #
+ ##############
+
+ st.write(f"""**Welcome!**\nThis app lets you generate digests of topics currently in the news. Select up to three current news topics and the digest lets you know what the latest news on those topics is!""")  # Can I make this disappear?
+
+ cnn_dict, npr_dict, cnn_ner, npr_ner = create_ner_dicts()
+ clusters = get_news_topics(cnn_ner, npr_ner)
+
+ selections = []
+ choices = [None]
+ for i in list(clusters.keys()):
+     choices.append(i)
+
+ # button to refresh topics
+ if st.button("Refresh topics!"):
+     new_data = update_topics()
+     clusters = new_data[0]
+     cnn_dict, npr_dict, cnn_ner, npr_ner = new_data[1][0], new_data[1][1], new_data[1][2], new_data[1][3]
+
+ # Form used to take 3 menu inputs
+ with st.form(key='columns_in_form'):
+     cols = st.columns(3)
+     for i, col in enumerate(cols):
+         selections.append(col.selectbox(f'Make a Selection', choices, key=i))
+     submitted = st.form_submit_button('Submit')
+     if submitted:
+         selections = [i for i in selections if i is not None]
+         with st.spinner(text="Digesting...please wait, this may take up to 20 seconds..."):
+             digest = seek_and_sum(selections, cnn_dict, npr_dict)
+         if len(digest) == 0:
+             st.write("You didn't select a topic!")
+         else:
+             st.write("Your digest is ready:\n")
+
+             count = 0
+             for each in digest:
+                 count += 1
+                 st.write(each)