File size: 11,750 Bytes
80b5ef0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b37a49f
ef55d0e
80b5ef0
 
8a15221
80b5ef0
 
 
3195fcb
80b5ef0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63dcf6f
80b5ef0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3e11f3e
80b5ef0
 
 
 
 
 
 
 
 
 
 
 
 
63010fb
80b5ef0
 
 
b37a49f
80b5ef0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
03dd514
53247c1
80b5ef0
057b299
53247c1
80b5ef0
 
 
 
 
 
 
 
 
 
 
 
 
a8c2197
80b5ef0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7f061b3
0c5a91f
a8c2197
80b5ef0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0131d43
fc43255
fed8c31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
# digestor.py implements the Digestor, which creates news digests.
# The Digestor manages the creation of article summaries and assembles them into one digest.

import requests, json
from collections import namedtuple
from functools import lru_cache
from typing import List
from dataclasses import dataclass, field
from datetime import datetime as dt
import streamlit as st

from codetiming import Timer
from transformers import AutoTokenizer

from source import Source, Summary
from scrape_sources import stub as stb



@dataclass
class Digestor:
    """Summarizes user-chosen articles and assembles them into one digest.

    Typical use: populate ``stubs`` and ``user_choices``, call ``digest()``
    to fill ``summaries``, then call ``build_digest()`` to set ``text`` and
    obtain a dict of statistics about the run.
    """
    timer: Timer
    cache: bool = True
    text: str = field(default="no_digest")
    stubs: List = field(default_factory=list)
    # For clarity: each stub/summary carries its own entities (cluster_list);
    # user_choices holds the entities the user selected.
    user_choices: List = field(default_factory=list)
    # Summary objects collected by digest(); build_digest() joins their text.
    summaries: List = field(default_factory=list)
    # TODO: possibly keep a list of source names here for easy reference.

    digest_meta: namedtuple(
        "digestMeta",
        [
            'digest_time',
            'number_articles',
            'digest_length',
            'articles_per_cluster'
        ]) = None

    # Summarization params:
    token_limit: int = 1024
    word_limit: int = 400
    # Un-annotated, so this is a plain class attribute, not a dataclass field.
    # `cache` below is the class-level default (True) captured once when the
    # class body runs; the per-instance / per-call value is applied in
    # perform_summarization() and build_digest().
    SUMMARIZATION_PARAMETERS = {
                                "do_sample": False,
                                "use_cache": cache,
                                }

    # Inference parameters
    API_URL = "https://api-inference.huggingface.co/models/sshleifer/distilbart-cnn-12-6"
    headers = {"Authorization": f"""Bearer {st.secrets['ato']}"""}

    # Number of attempts made for each chunk before giving up on it.
    MAX_QUERY_ATTEMPTS = 4
    # Seconds to wait for the inference API before a request is abandoned.
    REQUEST_TIMEOUT = 120

    # NOTE: the full scraped article text is not kept separately; only the
    # summaries built from it are stored on the digestor.

    # Relevance is a matter of how many chosen clusters this article belongs to.
    # Max relevance is the number of unique chosen entities.
    # Allows placing articles that hit more chosen topics higher up, mirroring
    # the "upside down pyramid" journalism convention, i.e. ordering facts by
    # decreasing information content.
    def relevance(self, summary):
        """Return how many user-chosen entities appear in ``summary.cluster_list``."""
        return len(set(self.user_choices) & set(summary.cluster_list))

    def digest(self):
        """Retrieve data for every user-chosen article and build ``self.summaries``.

        Entries of ``self.stubs`` that are not ``stub`` instances are treated
        as already-built summaries and appended unchanged.  For each real stub
        the article is fetched via ``stub.source.retrieve_article``, chunked,
        summarized, and wrapped in a ``Summary`` together with its timings.
        Finally the summaries are sorted by ``relevance`` (highest first).
        """
        # Clear timer data left over from a previous digestion.
        self.timer.timers.clear()
        with Timer(name="digest_time", text="Total digest time: {seconds:.4f} seconds"):
            for stub in self.stubs:
                # Anything that is not a stub is a summary we already have.
                if not isinstance(stub, stb):
                    self.summaries.append(stub)
                    continue

                # Get full article data.
                text, summary_data = stub.source.retrieve_article(stub)
                # Drop problem scrapes.
                if text is None or summary_data is None:
                    print("Null article")   # log this.
                    continue

                with Timer(name=f"{stub.hed}_chunk_time", logger=None):
                    chunk_list = self.chunk_piece(
                        text,
                        self.word_limit,
                        stub.source.source_summarization_checkpoint,
                    )
                # Total summarization time; the individual API queries are
                # timed inside perform_summarization().
                with Timer(name=f"{stub.hed}_summary_time", text="Whole article summarization time: {:.4f} seconds"):
                    summary = self.perform_summarization(
                        stub.hed,
                        chunk_list,
                        self.API_URL,
                        self.headers,
                        cache=self.cache,
                    )

                timers = self.timer.timers
                self.summaries.append(
                    Summary(
                        source=summary_data[0],
                        cluster_list=summary_data[1],
                        link_ext=summary_data[2],
                        hed=summary_data[3],
                        dek=summary_data[4],
                        date=summary_data[5],
                        authors=summary_data[6],
                        original_length=summary_data[7],
                        summary_text=summary,
                        summary_length=len(' '.join(summary).split(' ')),
                        chunk_time=timers[f'{stub.hed}_chunk_time'],
                        query_time=timers[f"{stub.hed}_query_time"],
                        mean_query_time=timers.mean(f'{stub.hed}_query_time'),
                        summary_time=timers[f'{stub.hed}_summary_time'],
                    )
                )

            # When finished, order the summaries by the number of
            # user-selected clusters each article appears in.
            self.summaries.sort(key=self.relevance, reverse=True)

    # Query the HuggingFace Inference engine.
    def query(self, payload, API_URL, headers, timeout=None):
        """Perform one summarization inference API call.

        Args:
            payload: JSON-serializable request body.
            API_URL: Endpoint to POST to.
            headers: HTTP headers for the request.
            timeout: Seconds to wait before the request is abandoned;
                ``None`` uses ``self.REQUEST_TIMEOUT``.

        Returns:
            The decoded JSON response body.
        """
        if timeout is None:
            timeout = self.REQUEST_TIMEOUT
        data = json.dumps(payload)
        # Without a timeout a stalled connection would block indefinitely.
        response = requests.post(API_URL, headers=headers, data=data, timeout=timeout)
        return json.loads(response.content.decode("utf-8"))

    @staticmethod
    def _token_count(tokenizer, text):
        """Return the number of tokens ``tokenizer`` produces for ``text``."""
        # len() of the encoding itself counts its keys, not the tokens,
        # so measure the input_ids sequence.
        return len(tokenizer(text)["input_ids"])

    def chunk_piece(self, piece, limit, tokenizer_checkpoint, include_tail=False):
        """Break an article into chunks that fit the token length limit.

        Args:
            piece: Article text.
            limit: Number of space-separated words per chunk.
            tokenizer_checkpoint: Checkpoint passed to
                ``AutoTokenizer.from_pretrained`` to measure token length.
            include_tail: When True, also emit the final partial chunk.  It is
                always emitted when the article is shorter than ``limit``.

        Returns:
            List of chunk strings, each trimmed to at most ``self.token_limit``
            tokens, with ``' . '`` normalized to ``'. '``.
        """
        # Splitting on single spaces is a rough word estimate; the number of
        # words is generally <= the number of tokens.
        words = piece.split(' ')
        word_count = len(words)

        # Chunk boundaries at multiples of `limit`.  For articles shorter than
        # `limit` this holds only zero.
        boundaries = [i * limit for i in range(word_count // limit + 1)]
        remainder = word_count % limit
        # Most articles end in a small chunk shorter than the limit, which may
        # make summaries less coherent, so it is opt-in.  Skip it when there is
        # no remainder to avoid producing an empty chunk.
        if boundaries == [0] or (include_tail and remainder):
            boundaries.append(boundaries[-1] + remainder)

        tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)
        chunk_list = []
        for start, end in zip(boundaries, boundaries[1:]):
            chunk_words = words[start:end]
            chunk = ' '.join(chunk_words)
            excess = self._token_count(tokenizer, chunk) - self.token_limit
            # If the chunk is too long, back it off by the number of excess
            # tokens (in words) and re-check until it fits.
            while excess > 0 and chunk_words:
                chunk_words = chunk_words[:-excess]
                chunk = ' '.join(chunk_words)
                excess = self._token_count(tokenizer, chunk) - self.token_limit
            chunk_list.append(chunk)
        return [chunk.replace(' . ', '. ') for chunk in chunk_list]

    # Returns list of summarized chunks instead of concatenating them, which
    # would lose info about the process.
    def perform_summarization(self, stubhead, chunklist: List[str], API_URL: str, headers: dict, cache=True) -> List[str]:
        """Summarize each chunk through the inference API.

        Args:
            stubhead: Article headline; used to name the query timer.
            chunklist: Text chunks to summarize.
            API_URL: Inference endpoint.
            headers: HTTP headers for the request.
            cache: Value sent as the ``use_cache`` request parameter.

        Returns:
            The summaries of the chunks that succeeded, in order.  A chunk
            whose every attempt fails is skipped.
        """
        parameters = {**self.SUMMARIZATION_PARAMETERS, "use_cache": cache}
        collection_bin = []
        for chunk in chunklist:
            summarized_chunk = None
            with Timer(name=f"{stubhead}_query_time", logger=None):
                # Each chunk gets its own retry budget so that one bad chunk
                # does not cause all later chunks to be skipped.
                for _ in range(self.MAX_QUERY_ATTEMPTS):
                    try:
                        summarized_chunk = self.query(
                            {
                                "inputs": str(chunk),
                                "parameters": parameters,
                            },
                            API_URL,
                            headers,
                        )[0]['summary_text']
                        break
                    except Exception as e:
                        # Covers network failures as well as responses that
                        # are not a list of summaries.
                        print("Summarization error, repeating...")
                        print(e)
            print(summarized_chunk)
            if summarized_chunk is not None:
                collection_bin.append(summarized_chunk)
        return collection_bin

    @staticmethod
    def _source_record(source):
        """Return the reportable attributes of a source as a plain dict."""
        return {
            'name': source.source_name,
            'source_url': source.source_url,
            # NOTE(review): the stray quote in this key looks like a typo;
            # kept as-is because consumers of the output may rely on it.
            'Summarization" Checkpoint': source.source_summarization_checkpoint,
            'NER Checkpoint': source.source_ner_checkpoint,
        }

    # Order for display, arrange links?
    def build_digest(self) -> dict:
        """Assemble the digest text and return data describing the digest.

        Sets ``self.text`` to the summaries joined by blank lines, in the
        order of ``self.summaries``.

        Returns:
            Dict with keys ``timestamp``, ``article_count``, ``digest_length``,
            ``sum_params`` and ``summaries`` for analysis.
        """
        digest = [' '.join(each.summary_text) for each in self.summaries]
        self.text = '\n\n'.join(digest)
        digest_str = '\n\t'.join(digest)

        # One entry per summary, keyed by its position: field name -> value.
        # The source object is replaced by a dict of its attributes.
        # Assumes each summary exposes namedtuple-style `_fields`.
        summaries = {
            index: {
                name: (value if name != 'source' else self._source_record(summary.source))
                for name, value in zip(summary._fields, summary)
            }
            for index, summary in enumerate(self.summaries)
        }

        return {
            'timestamp': dt.now().strftime("%H:%M:%S"),
            'article_count': len(self.summaries),
            'digest_length': len(digest_str.split(" ")),
            'sum_params': {
                'token_limit': self.token_limit,
                'word_limit': self.word_limit,
                # Report the parameters digest() actually sends.
                'params': {**self.SUMMARIZATION_PARAMETERS, "use_cache": self.cache},
            },
            'summaries': summaries,
        }