m. polinsky
committed on
Commit
•
fed8c31
1
Parent(s):
3195fcb
adding data gathering to digestor.py
Browse files- digestor.py +39 -0
digestor.py
CHANGED
@@ -210,3 +210,42 @@ class Digestor:
|
|
210 |
digest.append(' '.join(each.summary_text))
|
211 |
|
212 |
self.text = '\n\n'.join(digest)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
210 |
digest.append(' '.join(each.summary_text))
|
211 |
|
212 |
self.text = '\n\n'.join(digest)
|
213 |
+
|
214 |
+
# Create dict to write out digest data for analysis
|
215 |
+
out_data = {}
|
216 |
+
t = dt.now()
|
217 |
+
datetime_str = f"""{t.hour:.2f}:{t.minute:.2f}:{t.second:.2f}"""
|
218 |
+
choices_str = ', '.join(self.user_choices)
|
219 |
+
digest_str = '\n\t'.join(digest)
|
220 |
+
|
221 |
+
|
222 |
+
# This is a long comprehension to store all the fields and values in each summary.
|
223 |
+
# integer: {
|
224 |
+
# name_of_field:value except for source,
|
225 |
+
# which is unhashable so needs explicit handling.
|
226 |
+
# }
|
227 |
+
summaries = { # k is a summary tuple, i,p = enumerate(k)
|
228 |
+
# The key is the summary's integer position in self.summaries (from enumerate)
|
229 |
+
c: {
|
230 |
+
# field name : value, unless it's the source
|
231 |
+
k._fields[i]:p if k._fields[i]!='source'
|
232 |
+
else
|
233 |
+
{
|
234 |
+
'name': k.source.source_name,
|
235 |
+
'source_url': k.source.source_url,
|
236 |
+
'Summarization" Checkpoint': k.source.source_summarization_checkpoint,
|
237 |
+
'NER Checkpoint': k.source.source_ner_checkpoint,
|
238 |
+
} for i,p in enumerate(k)
|
239 |
+
} for c,k in enumerate(self.summaries)}
|
240 |
+
|
241 |
+
out_data['timestamp'] = datetime_str
|
242 |
+
out_data['article_count'] = len(self.summaries)
|
243 |
+
out_data['digest_length'] = len(digest_str.split(" "))
|
244 |
+
out_data['sum_params'] = {
|
245 |
+
'token_limit':self.token_limit,
|
246 |
+
'word_limit':self.word_limit,
|
247 |
+
'params':self.SUMMARIZATION_PARAMETERS,
|
248 |
+
}
|
249 |
+
out_data['summaries'] = summaries
|
250 |
+
|
251 |
+
return out_data
|