aliabd committed on
Commit
7e3e85d
1 Parent(s): d26e36a

full demo working with old graido

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .idea/SummerTime.iml +8 -0
  2. .idea/inspectionProfiles/Project_Default.xml +16 -0
  3. .idea/inspectionProfiles/profiles_settings.xml +6 -0
  4. .idea/modules.xml +8 -0
  5. LICENSE +202 -0
  6. README.md +1 -1
  7. SummerTime.egg-info/PKG-INFO +124 -0
  8. SummerTime.egg-info/SOURCES.txt +46 -0
  9. SummerTime.egg-info/dependency_links.txt +1 -0
  10. SummerTime.egg-info/top_level.txt +4 -0
  11. __init__.py +3 -0
  12. app.py +28 -0
  13. build/scripts-3.9/summertime +3 -0
  14. dataset/__init__.py +36 -0
  15. dataset/dataset_loaders.py +501 -0
  16. dataset/non_huggingface_datasets_builders/arxiv_longsummarization.py +104 -0
  17. dataset/non_huggingface_datasets_builders/qmsum.py +119 -0
  18. dataset/non_huggingface_datasets_builders/scisummnet.py +105 -0
  19. dataset/non_huggingface_datasets_builders/summscreen.py +123 -0
  20. dataset/st_dataset.py +281 -0
  21. dependencies.txt +11 -0
  22. dist/SummerTime-0.1-py3-none-any.whl +0 -0
  23. download.py +3 -0
  24. evaluation/__init__.py +14 -0
  25. evaluation/base_metric.py +27 -0
  26. evaluation/bertscore_metric.py +20 -0
  27. evaluation/bleu_metric.py +20 -0
  28. evaluation/meteor_metric.py +31 -0
  29. evaluation/rouge_metric.py +23 -0
  30. evaluation/rougewe_metric.py +24 -0
  31. evaluation/summeval_metric.py +18 -0
  32. model/__init__.py +34 -0
  33. model/base_model.py +81 -0
  34. model/defaults.py +10 -0
  35. model/dialogue/__init__.py +1 -0
  36. model/dialogue/hmnet/ExampleRawData/meeting_summarization/AMI_proprec/test_ami.json +1 -0
  37. model/dialogue/hmnet/ExampleRawData/meeting_summarization/role_dict_ext.json +1 -0
  38. model/dialogue/hmnet/config/dialogue.conf +98 -0
  39. model/dialogue/hmnet_model.py +483 -0
  40. model/multi_doc/__init__.py +2 -0
  41. model/multi_doc/base_multi_doc_model.py +40 -0
  42. model/multi_doc/multi_doc_joint_model.py +51 -0
  43. model/multi_doc/multi_doc_separate_model.py +49 -0
  44. model/query_based/__init__.py +2 -0
  45. model/query_based/base_query_based_model.py +147 -0
  46. model/query_based/bm25_model.py +45 -0
  47. model/query_based/tf_idf_model.py +46 -0
  48. model/single_doc/__init__.py +5 -0
  49. model/single_doc/bart_model.py +36 -0
  50. model/single_doc/base_single_doc_model.py +36 -0
.idea/SummerTime.iml ADDED
@@ -0,0 +1,8 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <module type="PYTHON_MODULE" version="4">
+   <component name="NewModuleRootManager">
+     <content url="file://$MODULE_DIR$" />
+     <orderEntry type="inheritedJdk" />
+     <orderEntry type="sourceFolder" forTests="false" />
+   </component>
+ </module>
.idea/inspectionProfiles/Project_Default.xml ADDED
@@ -0,0 +1,16 @@
+ <component name="InspectionProjectProfileManager">
+   <profile version="1.0">
+     <option name="myName" value="Project Default" />
+     <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
+       <option name="ignoredPackages">
+         <value>
+           <list size="3">
+             <item index="0" class="java.lang.String" itemvalue="onnxruntime" />
+             <item index="1" class="java.lang.String" itemvalue="onnx_tf" />
+             <item index="2" class="java.lang.String" itemvalue="onnx" />
+           </list>
+         </value>
+       </option>
+     </inspection_tool>
+   </profile>
+ </component>
.idea/inspectionProfiles/profiles_settings.xml ADDED
@@ -0,0 +1,6 @@
+ <component name="InspectionProjectProfileManager">
+   <settings>
+     <option name="USE_PROJECT_PROFILE" value="false" />
+     <version value="1.0" />
+   </settings>
+ </component>
.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="ProjectModuleManager">
+     <modules>
+       <module fileurl="file://$PROJECT_DIR$/.idea/SummerTime.iml" filepath="$PROJECT_DIR$/.idea/SummerTime.iml" />
+     </modules>
+   </component>
+ </project>
LICENSE ADDED
@@ -0,0 +1,202 @@
+ Apache License
+ Version 2.0, January 2004
+ https://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "{}"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright 2021 SummerTime
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ https://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
README.md CHANGED
@@ -1,6 +1,6 @@
  ---
  title: SummerTime
- emoji: 💩
+ emoji: 🔥
  colorFrom: purple
  colorTo: green
  sdk: gradio
SummerTime.egg-info/PKG-INFO ADDED
@@ -0,0 +1,124 @@
+ Metadata-Version: 2.1
+ Name: SummerTime
+ Version: 0.1
+ Summary: A summarization model
+ Home-page: https://github.com/LILYlab
+ Author: Ansong Ni, Murori Mutuma, Zhangir Azerbayev, Yusen Zhang, Tao Yu, Dragomir Radev
+ Author-email: ansong.ni@yale.edu, murorimutuma@gmail.com, zhangir.azerbayev@yale.edu
+ License: UNKNOWN
+ Description: # SummerTime
+
+ A library to help users choose appropriate summarization tools based on their specific tasks or needs. Includes models, evaluation metrics, and datasets.
+
+ ## Installation and setup
+
+ #### Create and activate a new `conda` environment:
+ ```bash
+ conda create -n st python=3.7
+ conda activate st
+ ```
+
+ #### `pip` dependencies for local demo:
+ ```bash
+ pip install -r requirements.txt
+ ```
+
+ ## Quick Start
+ Imports the model module, initializes the default model, and summarizes sample documents.
+ ```python
+ import model as st_model
+
+ model = st_model.summarizer()
+ documents = [
+     """ PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions.
+     The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected
+     by the shutoffs which were expected to last through at least midday tomorrow."""
+ ]
+ model.summarize(documents)
+
+ # ["California's largest electricity provider has turned off power to hundreds of thousands of customers."]
+ ```
+
+ Also see the `demo.ipynb` Jupyter notebook for more examples. To start the demo notebook on localhost:
+ ```bash
+ jupyter notebook demo.ipynb
+ ```
+
+ ## Models
+ Import and initialization:
+ ```python
+ import model as st_model
+
+ default_model = st_model.summarizer()
+ bart_model = st_model.bart_model.BartModel()
+ pegasus_model = st_model.pegasus_model.PegasusModel()
+ lexrank_model = st_model.lexrank_model.LexRankModel()
+ textrank_model = st_model.textrank_model.TextRankModel()
+ ```
+
+ All models can be initialized with the following optional arguments:
+ ```python
+ def __init__(self,
+              trained_domain: str = None,
+              max_input_length: int = None,
+              max_output_length: int = None,
+              ):
+ ```
+
+ All models implement the following methods:
+ ```python
+ def summarize(self,
+               corpus: Union[List[str], List[List[str]]],
+               queries: List[str] = None) -> List[str]:
+
+ def show_capability(cls) -> None:
+
+ def generate_basic_description(cls) -> str:
+ ```
+
+ ## Evaluation
+ Import and initialization:
+ ```python
+ import eval as st_eval
+
+ bert_eval = st_eval.bertscore()
+ bleu_eval = st_eval.bleu_eval()
+ rouge_eval = st_eval.rouge()
+ rougewe_eval = st_eval.rougewe()
+ ```
+
+ All evaluation metrics can be initialized with the following optional arguments:
+ ```python
+ def __init__(self, metric_name):
+ ```
+
+ All evaluation metric objects implement the following methods:
+ ```python
+ def evaluate(self, model, data):
+
+ def get_dict(self, keys):
+ ```
+
+ ## Datasets
+ Import and initialization:
+ ```python
+ import dataset.stdatasets as st_data
+ ```
+
+ ## Contributors
+ This repository is built by the [LILY Lab](https://yale-lily.github.io/) at Yale University, led by Prof. [Dragomir Radev](https://cpsc.yale.edu/people/dragomir-radev). The main contributors are [Ansong Ni](https://niansong1996.github.io), Zhangir Azerbayev, Troy Feng, Murori Mutuma and Yusen Zhang (Penn State). For comments and questions, please open an issue.
+
+ Platform: UNKNOWN
+ Classifier: Programming Language :: Python :: 3
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
+ Description-Content-Type: text/markdown
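The model API documented above can be exercised end to end. The following is a minimal editorial sketch, not part of the commit, that uses only the constructor arguments and methods listed in the PKG-INFO README; the sample document is arbitrary, and the assumption is that `BartModel` accepts the optional keyword arguments from the signature shown above:

```python
import model as st_model

# Optional constructor argument, per the documented __init__ signature
bart = st_model.bart_model.BartModel(max_input_length=1024)

bart.show_capability()  # prints what the model can and cannot do
print(bart.generate_basic_description())

# summarize() takes a list of documents and returns a list of summaries
summaries = bart.summarize(
    ["PG&E stated it scheduled the blackouts in response to forecasts for "
     "high winds amid dry conditions."]
)
print(summaries[0])
```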
SummerTime.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,46 @@
+ README.md
+ setup.py
+ summertime.py
+ SummerTime.egg-info/PKG-INFO
+ SummerTime.egg-info/SOURCES.txt
+ SummerTime.egg-info/dependency_links.txt
+ SummerTime.egg-info/top_level.txt
+ dataset/__init__.py
+ dataset/datasets_demo.py
+ dataset/huggingface_datasets.py
+ dataset/non_huggingface_datasets.py
+ dataset/st_dataset.py
+ evaluation/__init__.py
+ evaluation/base_metric.py
+ evaluation/bertscore_metric.py
+ evaluation/bleu_metric.py
+ evaluation/meteor_metric.py
+ evaluation/rouge_metric.py
+ evaluation/rougewe_metric.py
+ evaluation/summeval_metric.py
+ model/__init__.py
+ model/base_model.py
+ model/defaults.py
+ model/dialogue/__init__.py
+ model/dialogue/hmnet_model.py
+ model/multi_doc/__init__.py
+ model/multi_doc/base_multi_doc_model.py
+ model/multi_doc/multi_doc_joint_model.py
+ model/multi_doc/multi_doc_separate_model.py
+ model/query_based/__init__.py
+ model/query_based/base_query_based_model.py
+ model/query_based/bm25_model.py
+ model/query_based/tf_idf_model.py
+ model/single_doc/__init__.py
+ model/single_doc/bart_model.py
+ model/single_doc/base_single_doc_model.py
+ model/single_doc/lexrank_model.py
+ model/single_doc/longformer_model.py
+ model/single_doc/pegasus_model.py
+ model/single_doc/textrank_model.py
+ tests/__init__.py
+ tests/dataset_test.py
+ tests/demo_test.py
+ tests/evaluation_test.py
+ tests/integration_test.py
+ tests/model_test.py
SummerTime.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
+
SummerTime.egg-info/top_level.txt ADDED
@@ -0,0 +1,4 @@
+ dataset
+ evaluation
+ model
+ tests
__init__.py ADDED
@@ -0,0 +1,3 @@
+ import SummerTime.model
+ import SummerTime.dataset.st_dataset as data
+ import SummerTime.evaluation
app.py ADDED
@@ -0,0 +1,27 @@
+ import os
+ import model as st_model
+ import gradio as gr
+
+
+ model = st_model.summarizer()
+
+
+ def inference(text):
+     documents = [text]
+     return model.summarize(documents)[0]
+
+
+ title = "SummerTime: Text Summarization for Non-Experts"
+ description = "This is a demo of SummerTime: An open-source text summarization toolkit for non-experts. You can read more about the project at the links below. Input your text below (or click one of the examples to load them), and the model will generate a summary for it."
+ article = "<p style='text-align: center'><a target='_blank' href='https://arxiv.org/abs/2108.12738'>SummerTime: Text Summarization Toolkit for Non-experts</a> | <a target='_blank' href='https://github.com/Yale-LILY/SummerTime'>Github Repo</a> | <a target='_blank' href='https://colab.research.google.com/drive/19tPdBgaJ4_QjSiFyoxtpnFGW4OG1gTec?usp=sharing'>Colab Notebook</a></p>"
+
+ gr.Interface(
+     inference,
+     [gr.inputs.Textbox(label="Input", lines=20)],
+     gr.outputs.Textbox(label="Output"),
+     title=title,
+     description=description,
+     article=article,
+     examples=[["""PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions.
+     The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected
+     by the shutoffs which were expected to last through at least midday tomorrow."""],
+     ["""Representative Kevin McCarthy, the House Republican leader, has threatened to retaliate against any company that complies with the congressional committee investigating the Jan. 6 riot, after the panel asked dozens of firms to preserve the phone and social media records of 11 far-right members of Congress who pushed to overturn the results of the 2020 election. Mr. McCarthy’s warning was an escalation of his efforts to thwart a full accounting of the deadly attack at the Capitol carried out by a pro-Trump mob, and his latest attempt to insulate the former president and Republican lawmakers from scrutiny of any ties to the violence. It came after he led the G.O.P. opposition to the creation of an independent bipartisan commission to investigate the riot, and then pulled five Republican congressmen from the select committee that Democrats created on their own, boycotting the proceedings."""],
+     ["""Asked about the report, Google responded in an email that its "advertising technologies help websites and apps fund their content, enable small businesses to grow, and protect users from exploitative privacy practices and bad ad experiences." A lawsuit by 38 U.S. states and territories accuses Google of abusing its market power in an effort to make its search engine as dominant inside cars, TVs and speakers as it is in phones. This was consolidated with the federal lawsuit for purposes of discovery. Texas, backed by other states, filed a separate lawsuit against Google, accusing it of breaking antitrust law in how it runs its online advertising business."""]]).launch(debug=True)
build/scripts-3.9/summertime ADDED
@@ -0,0 +1,3 @@
+ #!python
+
+ print("welcome to Summer Time!")
dataset/__init__.py ADDED
@@ -0,0 +1,36 @@
+ from dataset.dataset_loaders import (
+     CnndmDataset,
+     MultinewsDataset,
+     SamsumDataset,
+     XsumDataset,
+     PubmedqaDataset,
+     MlsumDataset,
+     ScisummnetDataset,
+     SummscreenDataset,
+     QMsumDataset,
+     ArxivDataset,
+ )
+
+
+ SUPPORTED_SUMM_DATASETS = [
+     CnndmDataset,
+     MultinewsDataset,
+     SamsumDataset,
+     XsumDataset,
+     PubmedqaDataset,
+     MlsumDataset,
+     ScisummnetDataset,
+     SummscreenDataset,
+     QMsumDataset,
+     ArxivDataset,
+ ]
+
+
+ def list_all_datasets():
+     all_datasets = []
+     for ds in SUPPORTED_SUMM_DATASETS:
+         dataset_description = ds.generate_basic_description()
+
+         all_datasets.append((ds.dataset_name, dataset_description))
+
+     return all_datasets
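A minimal sketch of how the helper above might be used to browse the supported corpora; only names defined in this file are assumed, plus `generate_basic_description()` being a class-level method on each dataset class (it is called on the class, not an instance, above):

```python
from dataset import list_all_datasets

# Each entry is a (dataset_name, description) tuple, where the description
# comes from the class's generate_basic_description() method.
for name, description in list_all_datasets():
    print(f"{name}: {description}")
```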
dataset/dataset_loaders.py ADDED
@@ -0,0 +1,501 @@
+ from os import path
+ from tqdm import tqdm
+ from typing import List, Generator, Optional, Union
+
+ from datasets import Dataset
+
+ from dataset.st_dataset import SummInstance, SummDataset
+
+
+ # Set directory to load non_huggingface dataset scripts
+ FILE_DIRECTORY_PATH = path.dirname(path.realpath(__file__))
+ BASE_NONHUGGINGFACE_DATASETS_PATH = path.join(
+     FILE_DIRECTORY_PATH, "non_huggingface_datasets_builders"
+ )
+
+
+ # Huggingface Datasets
+
+
+ class CnndmDataset(SummDataset):
+     """
+     The CNN/DM dataset
+     """
+
+     dataset_name = "CNN/DailyMail"
+
+     is_query_based = False
+     is_dialogue_based = False
+     is_multi_document = False
+
+     huggingface_dataset = True
+     huggingface_page = "https://huggingface.co/datasets/cnn_dailymail"
+
+     def __init__(self):
+         super().__init__(
+             dataset_args=(
+                 "cnn_dailymail",
+                 "3.0.0",
+             )
+         )
+
+     def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]:
+         """
+         Overrides the SummDataset '_process_data()' method
+         This method processes the data contained in the dataset
+         and puts each data instance into a SummInstance object
+         :param dataset: a train/validation/test dataset
+         :rtype: a generator yielding SummInstance objects
+         """
+         for instance in tqdm(data):
+             article: str = instance["article"]
+             highlights: str = instance["highlights"]
+             summ_instance = SummInstance(source=article, summary=highlights)
+
+             yield summ_instance
+
+
+ class MultinewsDataset(SummDataset):
+     """
+     The Multi News dataset
+     """
+
+     dataset_name = "Multinews"
+
+     is_query_based = False
+     is_dialogue_based = False
+     is_multi_document = True
+
+     huggingface_dataset = True
+     huggingface_page = "https://huggingface.co/datasets/multi_news"
+
+     def __init__(self):
+         super().__init__(dataset_args=("multi_news",))
+
+     def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]:
+         """
+         Overrides the SummDataset '_process_data()' method
+         This method processes the data contained in the dataset
+         and puts each data instance into a SummInstance object
+         :param dataset: a train/validation/test dataset
+         :rtype: a generator yielding SummInstance objects
+         """
+         for instance in tqdm(data):
+             document: list = [
+                 doc for doc in instance["document"].split("|||||") if doc
+             ]  # removes the empty string generated
+             # since each doc ends with the delimiting token '|||||'
+             # the final doc creates an empty string
+             summary: str = instance["summary"]
+             summ_instance = SummInstance(source=document, summary=summary)
+
+             yield summ_instance
+
+
+ class SamsumDataset(SummDataset):
+     """
+     The SAMsum Dataset
+     """
+
+     dataset_name = "Samsum"
+
+     is_query_based = False
+     is_dialogue_based = True
+     is_multi_document = False
+
+     huggingface_dataset = True
+     huggingface_page = "https://huggingface.co/datasets/samsum"
+
+     def __init__(self):
+         super().__init__(dataset_args=("samsum",))
+
+     def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]:
+         """
+         Overrides the SummDataset '_process_data()' method
+         This method processes the data contained in the dataset
+         and puts each data instance into a SummInstance object
+         :param dataset: a train/validation/test dataset
+         :rtype: a generator yielding SummInstance objects
+         """
+         for instance in tqdm(data):
+             dialogue: List = instance["dialogue"].split(
+                 "\r\n"
+             )  # split each dialogue into a list of strings such as
+             # ["speaker1 : utter..", "speaker2 : utter..."]
+             summary: str = instance["summary"]
+             summ_instance = SummInstance(source=dialogue, summary=summary)
+
+             yield summ_instance
+
+
+ class XsumDataset(SummDataset):
+     """
+     The Xsum Dataset
+     """
+
+     dataset_name = "Xsum"
+
+     huggingface_dataset = True
+     huggingface_page = "https://huggingface.co/datasets/xsum"
+
+     is_query_based = False
+     is_dialogue_based = False
+     is_multi_document = False
+
+     def __init__(self):
+         super().__init__(dataset_args=("xsum",))
+
+     def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]:
+         """
+         Overrides the SummDataset '_process_data()' method
+         This method processes the data contained in the dataset
+         and puts each data instance into a SummInstance object
+         :param dataset: a train/validation/test dataset
+         :rtype: a generator yielding SummInstance objects
+         """
+         for instance in tqdm(data):
+             document: List = instance["document"]
+             summary: str = instance["summary"]
+             summ_instance = SummInstance(source=document, summary=summary)
+
+             yield summ_instance
+
+
+ class PubmedqaDataset(SummDataset):
+     """
+     The Pubmed QA dataset
+     """
+
+     dataset_name = "Pubmedqa"
+
+     is_query_based = True
+     is_dialogue_based = False
+     is_multi_document = False
+
+     huggingface_dataset = True
+     huggingface_page = "https://huggingface.co/datasets/pubmed_qa"
+
+     def __init__(self, seed=None):
+         super().__init__(
+             dataset_args=(
+                 "pubmed_qa",
+                 "pqa_artificial",
+             )
+         )
+
+     def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]:
+         """
+         Overrides the SummDataset '_process_data()' method
+         This method processes the data contained in the dataset
+         and puts each data instance into a SummInstance object
+         :param dataset: a train/validation/test dataset
+         :rtype: a generator yielding SummInstance objects
+         """
+         for instance in tqdm(data):
+             context: str = " ".join(instance["context"]["contexts"])
+             answer: str = instance["long_answer"]
+             query: str = instance["question"]
+             summ_instance = SummInstance(source=context, summary=answer, query=query)
+
+             yield summ_instance
+
+
+ class MlsumDataset(SummDataset):
+     """
+     The MLsum Dataset - A multi-lingual dataset featuring 5 languages
+     Includes 1.5 million news articles and their corresponding summaries
+
+     "de" - German
+     "es" - Spanish
+     "fr" - French
+     "ru" - Russian
+     "tu" - Turkish
+     """
+
+     dataset_name = "MlSum"
+
+     is_query_based = False
+     is_dialogue_based = False
+     is_multi_document = False
+
+     huggingface_dataset = True
+     huggingface_page = "https://huggingface.co/datasets/mlsum"
+     supported_languages = ["de", "es", "fr", "ru", "tu"]
+
+     mlsum_instantiation_guide = """The languages supported for the Mlsum Dataset are:
+         de - German
+         es - Spanish
+         fr - French
+         ru - Russian
+         tu - Turkish
+
+         Examples to instantiate the dataset:
+         1. Dataset with only one language
+            dataset = MlsumDataset({language_token})
+            dataset = MlsumDataset("es")
+            dataset = MlsumDataset("tu")...
+
+         2. Dataset with multiple languages
+            dataset = MlsumDataset({list of language_token})
+            dataset = MlsumDataset(["es", "de"])
+            dataset = MlsumDataset(["es", "de", "tu"])...
+
+         3. Dataset with all supported languages (default)
+            dataset = MlsumDataset("all")
+            dataset = MlsumDataset()
+         """
+
+     def __init__(self, languages: Optional[Union[str, List[str]]] = "all"):
+         super().__init__(dataset_args=(languages,))
+
+     def _load_dataset_safe(self, languages: Optional[Union[str, List[str]]]):
+         """
+         Overrides the parent class method
+         Method loads multiple datasets of different languages provided in :param languages:
+         It then concatenates these datasets into one combined dataset
+         :rtype: datasetDict containing the combined dataset
+         :param languages: Optional, either a string or list of strings specifying the languages
+             to load
+         """
+         print(MlsumDataset.mlsum_instantiation_guide)
+
+         # Choose languages to download articles
+         if languages == "all":
+             selected_languages = MlsumDataset.supported_languages
+         elif isinstance(languages, list):
+             for language in languages:
+                 assert self.is_supported(language)
+             selected_languages = languages
+         else:
+             assert self.is_supported(languages)
+             selected_languages = [languages]
+
+         # Concatenate selected languages into one dataset
+         language_datasets = []
+         for language in selected_languages:
+             dataset = super()._load_dataset_safe(
+                 "mlsum",
+                 language,
+             )
+
+             language_datasets.append(dataset)
+
+         mlsum_dataset = self._concatenate_dataset_dicts(language_datasets)
+
+         return mlsum_dataset
+
+     def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]:
+         """
+         Overrides the SummDataset '_process_data()' method
+         This method processes the data contained in the dataset
+         and puts each data instance into a SummInstance object
+         :param dataset: a train/validation/test dataset
+         :rtype: a generator yielding SummInstance objects
+         """
+         for instance in tqdm(data):
+             article: List = instance["text"]
+             summary: str = instance["summary"]
+             summ_instance = SummInstance(source=article, summary=summary)
+
+             yield summ_instance
+
+     def is_supported(self, language: str):
+         """
+         Checks whether the requested language is supported
+         :param language: string containing the requested language
+         :rtype bool:
+         """
+         if language not in MlsumDataset.supported_languages:
+             print(MlsumDataset.mlsum_instantiation_guide)
+             raise ValueError(
+                 f"The language(s): '{language}' entered is not supported. See above message for usage info"
+             )
+         else:
+             return True
+
+
+ # Non-huggingface datasets
+
+
+ class ScisummnetDataset(SummDataset):
+     """
+     The SciSummNet dataset. As a dataset not included by HuggingFace, we need to manually
+     download it and set basic information for the dataset
+     """
+
+     dataset_name = "ScisummNet"
+
+     version = "1.1.0"
+     description = (
+         "A summary of scientific papers should ideally incorporate the impact of the papers on the "
+         "research community reflected by citations. To facilitate research in citation-aware scientific "
+         "paper summarization (Scisumm), the CL-Scisumm shared task has been organized since 2014 for "
+         "papers in the computational linguistics and NLP domain."
+     )
+
+     is_dialogue_based = False
+     is_multi_document = False
+     is_query_based = False
+
+     huggingface_dataset = False
+     builder_script_path = path.join(
+         BASE_NONHUGGINGFACE_DATASETS_PATH, dataset_name.lower() + ".py"
+     )
+
+     def __init__(self, seed=None):
+         super().__init__()
+
+     def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]:
+         """
+         Overrides the SummDataset '_process_data()' method
+         This method processes the data contained in the dataset
+         and puts each data instance into a SummInstance object
+         :param dataset: a train/validation/test dataset
+         :rtype: a generator yielding SummInstance objects
+         """
+         for instance in tqdm(data):
+             docs: List = [
+                 instance["document_xml"],
+                 instance["citing_sentences_annotated.json"],
+             ]
+             summary: str = instance["summary"]
+             summ_instance = SummInstance(source=docs, summary=summary)
+
+             yield summ_instance
+
+
+ class SummscreenDataset(SummDataset):
+     """
+     The SummScreen dataset. As a dataset not included by HuggingFace, we need to manually
+     download it and set basic information for the dataset
+     """
+
+     dataset_name = "Summscreen"
+
+     version = "1.1.0"
+     is_dialogue_based = True
+     is_multi_document = False
+     is_query_based = False
+
+     huggingface_dataset = False
+     builder_script_path = path.join(
+         BASE_NONHUGGINGFACE_DATASETS_PATH, dataset_name.lower() + ".py"
+     )
+
+     def __init__(self, seed=None):
+         super().__init__()
+
+     def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]:
+         """
+         Overrides the SummDataset '_process_data()' method
+         This method processes the data contained in the dataset
+         and puts each data instance into a SummInstance object
+         :param dataset: a train/validation/test dataset
+         :rtype: a generator yielding SummInstance objects
+         """
+         for instance in tqdm(data):
+             transcript: List = instance[
+                 "transcript"
+             ]  # convert string into a list of string dialogues
+             recap: str = instance["recap"]
+             summ_instance = SummInstance(source=transcript, summary=recap)
+
+             yield summ_instance
+
+
+ class QMsumDataset(SummDataset):
+     """
+     QMSum Dataset
+     """
+
+     dataset_name = "QMsum"
+     description = """
+     QMSum is a new human-annotated benchmark for query-based multi-domain meeting summarization task,
+     which consists of 1,808 query-summary pairs over 232 meetings in multiple domains.
+     """
+
+     is_dialogue_based = True
+     is_multi_document = False
+     is_query_based = True
+
+     huggingface_dataset = False
+     builder_script_path = path.join(
+         BASE_NONHUGGINGFACE_DATASETS_PATH, dataset_name.lower() + ".py"
+     )
+
+     def __init__(self):
+         super().__init__()
+
+     def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]:
+         """
+         Overrides the SummDataset '_process_data()' method
+         This method processes the data contained in the dataset
+         and puts each data instance into a SummInstance object
+         :param dataset: a train/validation/test dataset
+         :rtype: a generator yielding SummInstance objects
+         """
+         for instance in tqdm(data):
+             for query_set in (
+                 instance["general_query_list"] + instance["specific_query_list"]
+             ):
+                 meeting: List = [
+                     utterance["speaker"] + " : " + utterance["content"]
+                     for utterance in instance["meeting_transcripts"]
+                 ]
+                 query: str = query_set["query"]
+                 summary: str = query_set["answer"]
+                 summ_instance = SummInstance(
+                     source=meeting, summary=summary, query=query
+                 )
+
+                 yield summ_instance
+
+
+ class ArxivDataset(SummDataset):
+     """
+     The Arxiv Dataset
+     """
+
+     dataset_name = "Arxiv_longsummarization"
+     description = """
+     A summarization dataset comprised of pairs of scientific papers.
+     The dataset provides a challenging testbed for abstractive summarization.
+     It contains papers and their abstracts.
+     """
+
+     is_dialogue_based = False
+     is_multi_document = False
+     is_query_based = False
+
+     huggingface_dataset = False
+     builder_script_path = path.join(
+         BASE_NONHUGGINGFACE_DATASETS_PATH, dataset_name.lower() + ".py"
+     )
+
+     def __init__(self):
+
+         print(
+             "*****************",
+             "***Attention***",
+             "This dataset is quite large (approx. 5 GB) and will need about 15 GB for the extraction process",
+             "Cancel/interrupt the download if size and time constraints will not be met",
+             "*****************",
+             sep="\n",
+         )
+
+         super().__init__()
+
+     def _process_data(self, data: Dataset) -> Generator[SummInstance, None, None]:
+         """
+         Overrides the SummDataset '_process_data()' method
+         This method processes the data contained in the dataset
+         and puts each data instance into a SummInstance object
+         :param dataset: a train/validation/test dataset
+         :rtype: a generator yielding SummInstance objects
+         """
+         for instance in tqdm(data):
+             article: List = instance["article_text"]
+             abstract: str = " ".join(instance["abstract_text"])
+             summ_instance = SummInstance(source=article, summary=abstract)
+
+             yield summ_instance
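The MLSum loader above documents its own instantiation patterns in `mlsum_instantiation_guide`; a short editorial sketch following that guide (not part of the commit — note that instantiating a dataset class triggers the HuggingFace download via `SummDataset.__init__`):

```python
from dataset.dataset_loaders import MlsumDataset

# Single language (Spanish), per the instantiation guide
mlsum_es = MlsumDataset("es")

# Several languages concatenated into one dataset
mlsum_multi = MlsumDataset(["es", "de"])

# An unsupported language code raises ValueError via is_supported()
try:
    MlsumDataset("en")
except ValueError as err:
    print(err)
```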
dataset/non_huggingface_datasets_builders/arxiv_longsummarization.py ADDED
@@ -0,0 +1,104 @@
+ import os
+ import json
+ import datasets
+
+
+ """Arxiv dataset."""
+
+
+ _CITATION = """
+ @article{Cohan_2018,
+     title={A Discourse-Aware Attention Model for Abstractive Summarization of
+     Long Documents},
+     url={http://dx.doi.org/10.18653/v1/n18-2097},
+     DOI={10.18653/v1/n18-2097},
+     journal={Proceedings of the 2018 Conference of the North American Chapter of
+     the Association for Computational Linguistics: Human Language
+     Technologies, Volume 2 (Short Papers)},
+     publisher={Association for Computational Linguistics},
+     author={Cohan, Arman and Dernoncourt, Franck and Kim, Doo Soon and Bui, Trung and Kim, Seokhwan and Chang, Walter and Goharian, Nazli},
+     year={2018}
+ }
+ """
+
+ _DESCRIPTION = """
+ A summarization dataset comprised of pairs of scientific papers.
+ The dataset provides a challenging testbed for abstractive summarization.
+ It contains papers and their abstracts.
+ """
+
+ _HOMEPAGE = "https://github.com/armancohan/long-summarization"
+
+ _LICENSE = "Apache-2.0 License"
+
+ _URL = "https://archive.org/download/armancohan-long-summarization-paper-code/arxiv-dataset.zip"
+
+
+ class SummertimeArxiv(datasets.GeneratorBasedBuilder):
+     """Arxiv long summarization dataset."""
+
+     VERSION = datasets.Version("1.0.0")
+
+     BUILDER_CONFIGS = [
+         datasets.BuilderConfig(),
+     ]
+
+     def _info(self):
+         features = datasets.Features(
+             {
+                 "article_id": datasets.Value("string"),
+                 "article_text": [datasets.Value("string")],
+                 "abstract_text": [datasets.Value("string")],
+             }
+         )
+         return datasets.DatasetInfo(
+             description=_DESCRIPTION,
+             features=features,
+             supervised_keys=None,
+             homepage=_HOMEPAGE,
+             license=_LICENSE,
+             citation=_CITATION,
+         )
+
+     def _split_generators(self, dl_manager):
+         """Returns SplitGenerators."""
+         my_urls = _URL
+         path = dl_manager.download_and_extract(my_urls)
+         path = os.path.join(path, "arxiv-dataset")
+
+         trainpath = os.path.join(path, "train.txt")
+         valpath = os.path.join(path, "val.txt")
+         testpath = os.path.join(path, "test.txt")
+
+         return [
+             datasets.SplitGenerator(
+                 name=datasets.Split.TRAIN,
+                 # These kwargs will be passed to _generate_examples
+                 gen_kwargs={"filepath": trainpath, "split": "train"},
+             ),
+             datasets.SplitGenerator(
+                 name=datasets.Split.VALIDATION,
+                 # These kwargs will be passed to _generate_examples
+                 gen_kwargs={"filepath": valpath, "split": "val"},
+             ),
+             datasets.SplitGenerator(
+                 name=datasets.Split.TEST,
+                 # These kwargs will be passed to _generate_examples
+                 gen_kwargs={"filepath": testpath, "split": "test"},
+             ),
+         ]
+
+     def _generate_examples(self, filepath, split):
+         """Yields examples."""
+
+         with open(filepath, "r") as f:
+             for line in f:
+
+                 instance = json.loads(line)
+
+                 entry = {}
+                 entry["article_id"] = instance["article_id"]
+                 entry["article_text"] = instance["article_text"]
+                 entry["abstract_text"] = instance["abstract_text"]
+
+                 yield entry["article_id"], entry
dataset/non_huggingface_datasets_builders/qmsum.py ADDED
@@ -0,0 +1,119 @@
+ import os
+ import json
+ import datasets
+
+
+ """QMsum dataset."""
+
+
+ _CITATION = """
+ @inproceedings{zhong2021qmsum,
+     title={{QMS}um: {A} {N}ew {B}enchmark for {Q}uery-based {M}ulti-domain {M}eeting {S}ummarization},
+     author={Zhong, Ming and Yin, Da and Yu, Tao and Zaidi, Ahmad and Mutuma, Mutethia and Jha, Rahul and Hassan Awadallah, Ahmed and Celikyilmaz, Asli and Liu, Yang and Qiu, Xipeng and Radev, Dragomir},
+     booktitle={North American Association for Computational Linguistics (NAACL)},
+     year={2021}
+ }
+ """
+
+ _DESCRIPTION = """
+ QMSum is a new human-annotated benchmark for query-based multi-domain meeting summarization task, \
+ which consists of 1,808 query-summary pairs over 232 meetings in multiple domains.
+ """
+
+ _HOMEPAGE = "https://github.com/Yale-LILY/QMSum"
+
+ _BASE_URL = "https://raw.githubusercontent.com/Yale-LILY/QMSum/main/data/ALL/jsonl"
+ _URLs = {
+     "train": _BASE_URL + "/train.jsonl",
+     "val": _BASE_URL + "/val.jsonl",
+     "test": _BASE_URL + "/test.jsonl",
+ }
+
+
+ class SummertimeQmsum(datasets.GeneratorBasedBuilder):
+     """QMsum dataset."""
+
+     VERSION = datasets.Version("1.0.0")
+
+     BUILDER_CONFIGS = [
+         datasets.BuilderConfig(),
+     ]
+
+     def _info(self):
+         features = datasets.Features(
+             {
+                 "entry_number": datasets.Value("string"),
+                 "meeting_transcripts": [
+                     {
+                         "speaker": datasets.Value("string"),
+                         "content": datasets.Value("string"),
+                     }
+                 ],
+                 "general_query_list": [
+                     {
+                         "query": datasets.Value("string"),
+                         "answer": datasets.Value("string"),
+                     }
+                 ],
+                 "specific_query_list": [
+                     {
+                         "query": datasets.Value("string"),
+                         "answer": datasets.Value("string"),
+                         "relevant_text_span": [[datasets.Value("string")]],
+                     }
+                 ],
+             }
+         )
+         return datasets.DatasetInfo(
+             description=_DESCRIPTION,
+             features=features,
+             supervised_keys=None,
+             homepage=_HOMEPAGE,
+             license=None,
+             citation=_CITATION,
+         )
+
+     def _split_generators(self, dl_manager):
+         """Returns SplitGenerators."""
+         my_urls = _URLs
+         downloaded_files = dl_manager.download_and_extract(my_urls)
+
+         trainpath = downloaded_files["train"]
+         valpath = downloaded_files["val"]
+         testpath = downloaded_files["test"]
+
+         return [
+             datasets.SplitGenerator(
+                 name=datasets.Split.TRAIN,
+                 # These kwargs will be passed to _generate_examples
+                 gen_kwargs={"filepath": trainpath, "split": "train"},
+             ),
+             datasets.SplitGenerator(
+                 name=datasets.Split.VALIDATION,
+                 # These kwargs will be passed to _generate_examples
+                 gen_kwargs={"filepath": valpath, "split": "val"},
+             ),
+             datasets.SplitGenerator(
+                 name=datasets.Split.TEST,
+                 # These kwargs will be passed to _generate_examples
+                 gen_kwargs={"filepath": testpath, "split": "test"},
+             ),
+         ]
+
+     def _generate_examples(self, filepath, split):
+         """Yields examples."""
+
+         extraction_path = os.path.join(filepath)
+
+         with open(extraction_path) as f:
+             for i, line in enumerate(f):
+
+                 instance = json.loads(line)
+
+                 entry = {}
+                 entry["entry_number"] = split + "_" + str(i)
+                 entry["meeting_transcripts"] = instance["meeting_transcripts"]
+                 entry["general_query_list"] = instance["general_query_list"]
+                 entry["specific_query_list"] = instance["specific_query_list"]
+
+                 yield entry["entry_number"], entry
dataset/non_huggingface_datasets_builders/scisummnet.py ADDED
@@ -0,0 +1,105 @@
+ import os
+ import datasets
+
+
+ """Scisummnet dataset."""
+
+
+ _CITATION = """
+ @InProceedings{yasunaga&al.19.scisumm,
+     title = {{ScisummNet}: A Large Annotated Corpus and Content-Impact Models for Scientific Paper Summarization with Citation Networks},
+     author = {Michihiro Yasunaga and Jungo Kasai and Rui Zhang and Alexander Fabbri and Irene Li and Dan Friedman and Dragomir Radev},
+     booktitle = {Proceedings of AAAI 2019},
+     year = {2019}
+ }
+ @InProceedings{yasunaga&al.17,
+     title = {Graph-based Neural Multi-Document Summarization},
+     author = {Yasunaga, Michihiro and Zhang, Rui and Meelu, Kshitijh and Pareek, Ayush and Srinivasan, Krishnan and Radev, Dragomir R.},
+     booktitle = {Proceedings of CoNLL 2017},
+     year = {2017}
+ }
+ """
+
+ _DESCRIPTION = """
+ A summary of scientific papers should ideally incorporate the impact of the papers on the research community
+ reflected by citations. To facilitate research in citation-aware scientific paper summarization (Scisumm),
+ the CL-Scisumm shared task has been organized since 2014 for papers in the computational linguistics and NLP domain.
+ """
+
+ _HOMEPAGE = "https://cs.stanford.edu/~myasu/projects/scisumm_net/"
+
+ _LICENSE = "CC BY-SA 4.0"
+
+ _URLs = "https://cs.stanford.edu/~myasu/projects/scisumm_net/scisummnet_release1.1__20190413.zip"
+
+
+ class SummertimeScisummnet(datasets.GeneratorBasedBuilder):
+     """Scisummnet dataset."""
+
+     VERSION = datasets.Version("1.1.0")
+
+     BUILDER_CONFIGS = [
+         datasets.BuilderConfig(),
+     ]
+
+     def _info(self):
+         features = datasets.Features(
+             {
+                 "entry_number": datasets.Value("string"),
+                 "document_xml": datasets.Value("string"),
+                 "citing_sentences_annotated.json": datasets.Value("string"),
+                 "summary": datasets.Value("string"),
+             }
+         )
+         return datasets.DatasetInfo(
+             description=_DESCRIPTION,
+             features=features,
+             supervised_keys=None,
+             homepage=_HOMEPAGE,
+             license=_LICENSE,
+             citation=_CITATION,
+         )
+
+     def _split_generators(self, dl_manager):
+         """Returns SplitGenerators."""
+         my_urls = _URLs
+         path = dl_manager.download_and_extract(my_urls)
+         trainpath = os.path.join(
+             path, "scisummnet_release1.1__20190413", "top1000_complete"
+         )
+         return [
+             datasets.SplitGenerator(
+                 name=datasets.Split.TRAIN,
+                 # These kwargs will be passed to _generate_examples
+                 gen_kwargs={"extraction_path": trainpath, "split": "train"},
+             )
+         ]
+
+     def _generate_examples(self, extraction_path, split):
+         """Yields examples."""
+
+         for folder in os.listdir(extraction_path):
+
+             entry = {}
+
+             entry["entry_number"] = folder
+
+             doc_xml_path = os.path.join(
+                 extraction_path, folder, "Documents_xml", folder + ".xml"
+             )
+             with open(doc_xml_path, "r", encoding="utf-8") as f:
+                 entry["document_xml"] = f.read()
+
+             cite_annot_path = os.path.join(
+                 extraction_path, folder, "citing_sentences_annotated.json"
+             )
+             with open(cite_annot_path, "r", encoding="utf-8") as f:
+                 entry["citing_sentences_annotated.json"] = f.read()
+
+             summary_path = os.path.join(
+                 extraction_path, folder, "summary", folder + ".gold.txt"
+             )
+             with open(summary_path, "r", encoding="utf-8") as f:
+                 entry["summary"] = f.read()
+
+             yield entry["entry_number"], entry
dataset/non_huggingface_datasets_builders/summscreen.py ADDED
@@ -0,0 +1,123 @@