m. polinsky committed on
Commit
c6e9812
1 Parent(s): 5cab1a3

Create source.py

Browse files
Files changed (1) hide show
  1. source.py +51 -0
source.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # source.py provides an abstract dataclass for a data source
2
+ from abc import ABC, abstractmethod
3
+ from dataclasses import dataclass
4
+ from collections import namedtuple
5
+ from typing import List, Optional
6
+
7
# Field order is part of the interface: a Summary can be built and unpacked
# positionally, so the names below must keep their original order.
Summary = namedtuple(
    'Summary',
    [
        'source',
        'cluster_list',
        'link_ext',
        'hed',
        'dek',
        'date',
        'authors',
        'original_length',
        'summary_text',
        'summary_length',
        'chunk_time',
        'query_time',
        'mean_query_time',
        'summary_time',
    ],
)

# A plain string rather than an f-string: nothing is interpolated, and a
# literal brace in the text would otherwise be parsed as a placeholder.
# NOTE(review): the previous text described a `cluster_num` entry that is not
# one of the fields above, so it is no longer listed; every real field is.
Summary.__doc__ = """
Summary: a namedtuple for storing Summaries and relevant metadata.

    • source: A Source object for the source of the summarized document.
    • cluster_list: A list of the NER entities detected in this article's
      hed (headline).
    • link_ext: The link extension of the article (on the base url, the
      source's source_url).
    • hed, dek: headline and subheader. These are standard industry terms.
      dek is None if not applicable.
    • date: Date of publication/update listed in article.
    • authors: list of authors, currently a string containing the byline.
    • original_length: Length of the original article.
    • summary_text: List of summarized chunks.
    • summary_length: Length of summary text.
    • chunk_time, query_time, mean_query_time, summary_time: run statistics;
      presumably timings for the chunking, query and summarization steps
      (units are not stated here).
"""
24
+
25
@dataclass
class Source(ABC):
    """Abstract dataclass describing a data source.

    Subclasses must override both ``retrieve_cluster_data`` and
    ``retrieve_article``; a class that leaves either one abstract cannot be
    instantiated. Every field defaults to the empty string.
    """

    source_name: Optional[str] = ""
    source_url: Optional[str] = ""
    # Checkpoint str encourages use of source-appropriate models.
    source_summarization_checkpoint: Optional[str] = ""
    source_ner_checkpoint: Optional[str] = ""

    # The method descriptions live inside the methods so that they are real
    # docstrings (visible through help() and __doc__) rather than string
    # statements floating in the class body.
    #
    # Return values are annotated as tuples because `namedtuple` is a factory
    # function, not a type; every namedtuple instance is a tuple.

    @abstractmethod
    def retrieve_cluster_data(self) -> List[tuple]:
        """Retrieve the data used to create clusters.

        User must implement this source-dependent method.
        This gets called when clustering is performed.

        Returns:
            A list of namedtuple records.
        """

    @abstractmethod
    def retrieve_article(self) -> List[tuple]:
        """Retrieve texts for summarization.

        User must implement this source-dependent method.
        This gets called once topics for digestion have been selected.

        Returns:
            A list of namedtuple records.
        """