m. polinsky committed on
Commit
fed8c31
1 Parent(s): 3195fcb

adding data gathering to digestor.py

Browse files
Files changed (1) hide show
  1. digestor.py +39 -0
digestor.py CHANGED
@@ -210,3 +210,42 @@ class Digestor:
210
  digest.append(' '.join(each.summary_text))
211
 
212
  self.text = '\n\n'.join(digest)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210
  digest.append(' '.join(each.summary_text))
211
 
212
  self.text = '\n\n'.join(digest)
213
+
214
+ # Create dict to write out digest data for analysis
215
+ out_data = {}
216
+ t = dt.now()
217
+ datetime_str = f"""{t.hour:.2f}:{t.minute:.2f}:{t.second:.2f}"""
218
+ choices_str = ', '.join(self.user_choices)
219
+ digest_str = '\n\t'.join(digest)
220
+
221
+
222
+ # This is a long comprehension to store all the fields and values in each summary.
223
+ # integer: {
224
+ # name_of_field:value except for source,
225
+ # which is unhashable so needs explicit handling.
226
+ # }
227
+ summaries = { # k is a summary tuple, i,p = enumerate(k)
228
+ # Each summary is keyed by its integer position in self.summaries
229
+ c: {
230
+ # field name : value unless it's the source
231
+ k._fields[i]:p if k._fields[i]!='source'
232
+ else
233
+ {
234
+ 'name': k.source.source_name,
235
+ 'source_url': k.source.source_url,
236
+ 'Summarization" Checkpoint': k.source.source_summarization_checkpoint,
237
+ 'NER Checkpoint': k.source.source_ner_checkpoint,
238
+ } for i,p in enumerate(k)
239
+ } for c,k in enumerate(self.summaries)}
240
+
241
+ out_data['timestamp'] = datetime_str
242
+ out_data['article_count'] = len(self.summaries)
243
+ out_data['digest_length'] = len(digest_str.split(" "))
244
+ out_data['sum_params'] = {
245
+ 'token_limit':self.token_limit,
246
+ 'word_limit':self.word_limit,
247
+ 'params':self.SUMMARIZATION_PARAMETERS,
248
+ }
249
+ out_data['summaries'] = summaries
250
+
251
+ return out_data