# streamlit_app.py manages the whole TopicDig process
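#
# Overview of the flow implemented below:
#   1. initialize() scrapes headlines from NPR and CNN, runs NER over each
#      headline via the Hugging Face Inference API, and groups articles into
#      topic clusters keyed by named entity.
#   2. The Streamlit UI lets the user pick up to three topics.
#   3. A Digestor summarizes the chosen articles and the app displays the
#      digest plus timing, length, and compression stats.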
from typing import List, Set
from collections import namedtuple
import random
import requests
import json
import re 

from datetime import datetime as dt
from codetiming import Timer
import streamlit as st

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from digestor import Digestor
from source import Source
from scrape_sources import NPRLite, CNNText, stub


@st.cache()
def initialize(limit, use_cache=True):
    clusters: dict[str, List[namedtuple]] = dict()
    # Container for the source classes.
    # For now the sources are added explicitly here; a later version
    # should read them in from a config file instead.
    sources: List[Source] = []
    sources.append(NPRLite(
        'npr', 
        'https://text.npr.org/1001', 
        'sshleifer/distilbart-cnn-12-6',
        #'google/pegasus-multi_news',
        'dbmdz/bert-large-cased-finetuned-conll03-english'
        ))
    sources.append(CNNText(
        'cnn',
        'https://lite.cnn.com', 
        'sshleifer/distilbart-cnn-12-6',
        #'google/pegasus-multi_news',
        'dbmdz/bert-large-cased-finetuned-conll03-english'
    ))


    # initialize containers for cluster data namedtuples
    cluster_data: List[namedtuple('article', ['link', 'hed', 'entities', 'source'])]
    article_dict: dict[str, namedtuple]
    
    # For all sources retrieve_cluster_data 
    # returns List[namedtuples] with empty entity lists
    
    cluster_data = []
    article_meta = namedtuple('article_meta',['source', 'count'])
    cluster_meta : List[article_meta] = []
    for data_source in sources:
        if limit is not None:
            # c_data is a list of article namedtuples; c_meta is the number of
            # articles retrieved from this source.
            c_data, c_meta = data_source.retrieve_cluster_data(limit//len(sources))
        else:
            c_data, c_meta = data_source.retrieve_cluster_data()
        cluster_data.append(c_data)
        cluster_meta.append(article_meta(data_source.source_name, c_meta))
        st.session_state[data_source.source_name] = f"Number of articles from source: {c_meta}"

    # Flatten the per-source lists into a single list of article namedtuples.
    cluster_data = [article for source_articles in cluster_data for article in source_articles]
    # NER
    # iterate the list of namedtuples, 
    for tup in cluster_data:
        # pass each hed to the api query method, return the dict 
        # through the ner_results function to the 'entities' list. 
        # Populate stub entities list
        perform_ner(tup, cache=use_cache)
        generate_clusters(clusters, tup)
    st.session_state['num_clusters'] = f"""Total number of clusters: {len(clusters)}"""
    
    # article_dict tracks every article stub by headline.
    # While a cluster is unsummarized, the headline maps to the namedtuple stub;
    # once summarized, it can reference the Digestor instance instead so the
    # summary can be found.
    article_dict = {article.hed: article for article in cluster_data}
    
    
    return article_dict, clusters


# Run NER over one headline and populate the stub's entities list in place.
def perform_ner(tup: namedtuple('article', ['link', 'hed', 'entities', 'source']), cache=True):
    with Timer(name="ner_query_time", logger=None):
        result = ner_results(ner_query(
                {
                    "inputs": tup.hed,
                    # Cache control goes under "options" in the Inference API payload.
                    "options":
                    {
                        "use_cache": cache,
                    },
                }
            ))
    tup.entities.extend(result)


def ner_query(payload):
    data = json.dumps(payload)
    response = requests.request("POST", NER_API_URL, headers=headers, data=data)
    return json.loads(response.content.decode("utf-8"))
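# ner_query returns the parsed Inference API response; for this NER model it is
# roughly a list of dicts such as
#   [{"entity_group": "PER", "word": "Angela Merkel", "score": 0.99, ...}, ...]
# (illustrative values only), which ner_results() below filters and sorts.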



def generate_clusters(
    the_dict: dict,
    tup: namedtuple('article_stub', ['link', 'hed', 'entities', 'source'])
    ) -> None:
    # Mutates the_dict in place; nothing is returned.
    for entity in tup.entities:
        # Start a new cluster if this entity isn't in the dict yet
        if entity not in the_dict:
            the_dict[entity] = []
        # Add this article's stub to the entity's cluster
        the_dict[entity].append(tup)
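# After all stubs are processed, `clusters` maps each entity string to the list
# of article stubs whose headlines mention it, e.g. (illustrative keys only):
#   {"NATO": [stub_a, stub_b], "Senate": [stub_a]}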
    

def ner_results(ner_object, groups=True, NER_THRESHOLD=0.5) -> List[str]:
    # empty lists to collect our entities
    people, places, orgs, misc = [], [], [], []

    # 'ent' and 'designation' handle the difference between dictionary keys 
    # for aggregation strategy grouped vs ungrouped
    ent = 'entity' if not groups else 'entity_group'
    designation = 'I-' if not groups else ''

    # `actions` is a dispatch (switch-case) dictionary.
    # Keys are the entity labels used in the dicts returned by ner_query;
    # values are the bound .append methods of the lists created above, so
    # looking up a label and calling the result files an entity under the
    # right category.
    actions = {designation+'PER': people.append,
               designation+'LOC': places.append,
               designation+'ORG': orgs.append,
               designation+'MISC': misc.append
              }
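    # For example (hypothetical entity), with grouped output
    # actions['PER']('Angela Merkel') is just people.append('Angela Merkel').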

    # For each dict in the NER result list, if the entity string doesn't contain
    # a '#' (a subword fragment) and the confidence exceeds NER_THRESHOLD,
    # add the entity to the list for its type.
    # actions[d[ent]] looks up the right list's append method, and the trailing
    # (d['word']) calls it with the entity text.
    for d in ner_object:
        if '#' not in d['word'] and d['score'] > NER_THRESHOLD:
            actions[d[ent]](d['word'])

    # create list of all entities to return
    ner_list = ([i for i in set(people) if len(i) > 2]
                + [i for i in set(places) if len(i) > 2]
                + [i for i in set(orgs) if len(i) > 2]
                + [i for i in set(misc) if len(i) > 2])

    return ner_list
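# Illustrative only: for a headline such as "Senate passes NATO resolution",
# ner_results would return something like ['Senate', 'NATO'].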
    
def show_length_graph(outdata):
    # Bar chart comparing original and summarized length for each article.
    labels = [i for i in range(outdata['article_count'])]
    original_length = [outdata['summaries'][i]['original_length'] for i in outdata['summaries']]
    summarized_length = [outdata['summaries'][i]['summary_length'] for i in outdata['summaries']]       
    x = np.arange(len(labels))  # the label locations
    width = 0.35  # the width of the bars
     
    fig, ax = plt.subplots(figsize=(14,8))
    rects1 = ax.bar(x - width/2, original_length, width,  label='Original', color='lightgreen',zorder=0)
    rects2 = ax.bar(x + width/2, summarized_length, width, label='Summary', color='lightblue',zorder=0)
   
    rects3 = ax.bar(x - width/2, original_length, width, color='none',edgecolor='black', lw=1.25,zorder=1)
    rects4 = ax.bar(x + width/2, summarized_length, width, color='none',edgecolor='black', lw=1.25,zorder=1)
    
    # Add some text for labels, title and custom x-axis tick labels, etc.
    ax.set_ylabel('Text Length')
    ax.set_xticks(x)
    ax.set_yticks([i for i in range(0,max(original_length),max(summarized_length))])
    ax.set_xticklabels(labels)
    ax.set_xlabel('Source article')
    ax.legend(loc='upper right')
    
    plt.title('Original to Summarized Text Compression (space-separated tokens)')
    #ax.hist(arr, bins=20)
    st.pyplot(fig)
 
def check_for_word_and_word(in_string):
    m = re.search(r'(\w\w+)\sand\s\1', in_string)
    if m is not None:
        return m.group()
    return None
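# Illustrative only: check_for_word_and_word("the Senate and Senate vote")
# returns "Senate and Senate"; it returns None when no immediate
# "<word> and <word>" repetition is found.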

# These could be passed through the command line
# or read from a config file.
# One of these is needed here for NER and one in Digestor for summarization.
NER_API_URL =  "https://api-inference.huggingface.co/models/dbmdz/bert-large-cased-finetuned-conll03-english"
headers = {"Authorization": f"""Bearer {st.secrets['ato']}"""}

LIMIT = 30 # Controls time and number of clusters.
USE_CACHE = True

if not USE_CACHE:
    print("NOT USING CACHE")
if LIMIT is not None:
    print(f"LIMIT: {LIMIT}")

# Digest store: key is cluster, value is Digestor object.
# Note: `digests` and `out_dicts` are not currently referenced elsewhere in this file.
digests = dict()
out_dicts = []
# Retrieve cluster data, create a dict to track each article stub,
# and build topic clusters by performing NER on the headlines.
print("Initializing....")
article_dict, clusters = initialize(LIMIT, USE_CACHE)
# We now have both the per-article dict and the entity-keyed clusters.

# Welcome and explainer
st.title("Welcome to TopicDig")
st.subheader("Automatic news article summarization with transformers!")
st.success(f"You select the topics, we summarize the relevant news and show you a digest, plus some info to help contextualize what the machine did.")
st.write(f"On the left you'll find a list of topics recently gleaned from current news headlines.  TopicDig lets you assemble digests of these stories using transformers!")
st.warning("Enjoy, and remember, these summaries contain a few kinds of issues, from untruths to missing attribution or topic sentences.  For more information on truthfulness in automatic summarization with transformers see https://arxiv.org/abs/2109.07958.")

st.subheader(f"How it works:")
st.write(f"""Select 1 to 3 topics from the drop down menus and click 'submit' to start generating your digest.  \n\n Extra options include refreshing the topics and changing the length of summaries and consequently of the digest.""")
    
# Provides expandable container for refresh and summarization parameters, currently only chunk size
with st.expander("See extra options"):
    st.subheader("Refresh topics: ")
    st.write("You may want to refresh the topic lists if the app loaded several hours ago or you get no summary.")
    # Button to refresh topics.
    # Note: initialize() is wrapped in @st.cache, so calling it again with the
    # same arguments may simply return the cached result rather than re-scraping.
    if st.button("Refresh topics!"):
        article_dict, clusters = initialize(LIMIT, USE_CACHE)
    st.subheader("Select chunk size: ")
    st.write("Smaller chunks means more of the article included in the summary and a longer digest.")
    chunk_size = st.select_slider(label="Chunk size", options=[i for i in range(50,801,50)], value=400)
 
   
    
selections = []
choices = list(clusters.keys())
choices.insert(0,'None')
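# 'None' is the placeholder option shown first in each selectbox;
# it is filtered out when the form is submitted.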

# May be desired in sidebar - april 15 2022
# st.write(f"CNN articles: {st.session_state['cnn']}")
# st.write(f"NPR articles: {st.session_state['npr']}")
# st.write(f"Number of clusters {st.session_state['num_clusters']}")


# Display topics to user currently in sidebar - april 15 2022
st.sidebar.subheader("Topics")  
st.sidebar.write("Here are the current news topics and the number of articles whose headlines featured those topics.") 
show_clusters = {i:len(clusters[i]) for i in clusters.keys()}
cdf = pd.DataFrame(data={"Cluster":list(show_clusters.keys()), "Articles":list(show_clusters.values())}  ).sort_values(by='Articles', ascending=False)
styler = cdf.style.hide_index()
st.sidebar.write(styler.to_html(), unsafe_allow_html=True)


# Get session time
st.session_state['dt'] = dt.now()
# Form used to take 3 menu inputs
with st.form(key='columns_in_form'):
    cols = st.columns(3)
    for i, col in enumerate(cols):
        selections.append(col.selectbox(f'Make a Selection', choices, key=i))
    submitted = st.form_submit_button('Submit')
    if submitted:
        selections = [s for s in selections if s != 'None']
        with st.spinner(text="Creating your digest: this will take a few moments."):
            chosen = []
            
            for topic in selections:  # each selection is a cluster key (an entity string)
                if topic != 'None':
                    for article in clusters[topic]:
                        if article not in chosen:
                            chosen.append(article)  # article is a stub namedtuple.
        
            # Digestor uses 'chosen' to create the digest;
            # 'user_choices' is passed for reference.
            digestor = Digestor(timer=Timer(), cache=USE_CACHE, stubs=chosen, user_choices=selections, token_limit=1024, word_limit=chunk_size)
            # Summarization happens in digest() below rather than automatically on
            # construction, which also makes it easy to turn caching off for testing.
            st.subheader("What you'll see:")
            st.write("First you'll see a list of links appear below.  These are the links to the original articles being summarized for your digest, so you can get the full story if you're interested, or check the summary against the source.")
            st.write("In a few moments, your machine-generated digest will appear below the links, and below that you'll see an approximate word count of your digest and the time in seconds that the whole process took!")
            st.write("You'll also see a graph showing, for each article and summary, the original and summarized lengths.")
            st.error("Remember: This only demos news article summarization.  It is not yet completely reliable, and may distort some facts.  An analysis of factfulness is in progress by the app creator.")
           # st.write("Finally, you will see some possible errors detected in the summaries.  This area of NLP is far from perfection and always developing.  Hopefully this is an interesting step in the path!")
            digestor.digest() # creates summaries and stores them associated with the digest



            # Get displayable digest and digest data
            outdata = digestor.build_digest()
           
        if len(digestor.text) == 0:
            st.write("No text to return...very sorry.  Please hit 'refresh topics' in the options panel!")    
        else:
            st.subheader("Your digest:")
            st.info(digestor.text.replace("$", "\\$"))  # escape '$' so it isn't rendered as markdown/LaTeX
    
            st.subheader("Summarization stats:")
            col1, col2, col3 = st.columns(3)
            col1.metric("Digest Time", f"""{digestor.timer.timers['digest_time']:.2f}""", "seconds")
            col2.metric("Digest Length", str(len(digestor.text.split(" "))), 'space-sep tokens' )
            col3.metric("Article Count", str(outdata['article_count']), "articles" )

            st.subheader("Article Compression:")
            # Summarize the findings for all models
            show_length_graph(outdata)
            
            # Issues section: search for known problems with summaries
            
           # st.header("Things to look for: ")
           # st.subheader("Factfulness:")
           # st.write("Automatically checking the truthfulness of a document isn't a trivial task, and is not implemented here.  Users are encouraged to use their own wider knowledge to look for possible falsehoods.  In the normal news a reader is understood to have a certain amount of understanding to comprehend the news.  This experimental application requires a bit more, but seems promising.")
            #st.subheader("Repetition:")
            #rep_check = check_for_word_and_word(digestor.text)
            #if rep_check is not None:
            #    st.write(f"Following phrases repeat: {rep_check}")
            #    found_index = digestor.text.find(rep_check)
            #    st.write("Sample:")
            #    st.write(f"{text[found_index-40:found_index+40]}")
            #else:
            #    st.write("No repetition detected.")
            #  
            # Same article from different sources
            
            #st.subheader("Text redundancy: ")
          # for each in selections:
          #      if each != 'None':
          #          # check if two source articles share a cluster and not a source.
          #          sources = {}
          #          for i in clusters[each]:
          ###              if i[3].source_name not in sources:
            #            st.write(f"i[3].source_name: {i[3].source_name}")
            #                sources[i[3].source_name] = 0
            #            else:
            #                print("One or more articles on the same topic may have come from different sources.  \n\n This may cause redundancy in the digest, though it can also add further clarity, if the two articles are significantly different.")
             #               break
                    
                    
           # st.write("If more than one source have their own versions of the same topic from the same perspective, the result may be repetitive, or it may add nuance and the two summaries may complement each other.")