kmfoda committed on
Commit
0379fdb
0 Parent(s):

Initial upload

This view is limited to 50 files because the commit contains too many changes. See the raw diff.
Files changed (50)
  1. .gitattributes +35 -0
  2. .gitignore +5 -0
  3. LICENSE +201 -0
  4. README.md +12 -0
  5. align.py +364 -0
  6. app.py +323 -0
  7. components.py +567 -0
  8. examples/booksum/booksum_sf_sample.jsonl +3 -0
  9. examples/booksum/booksum_sf_sample_processed.cache/meta.yaml +3 -0
  10. examples/booksum/booksum_sf_sample_processed.cache/mgr/blocks/6074137904/data.feather +3 -0
  11. examples/booksum/booksum_sf_sample_processed.cache/mgr/blocks/6074137904/meta.yaml +3 -0
  12. examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/BertscoreAligner:spacy:document:spacy:summary:long_t5_tglobal_xl_booksum/data.dill +3 -0
  13. examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/BertscoreAligner:spacy:document:spacy:summary:long_t5_tglobal_xl_booksum/meta.yaml +3 -0
  14. examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/BertscoreAligner:spacy:document:spacy:summary:long_t5_tglobal_xl_booksum/state.dill +3 -0
  15. examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/BertscoreAligner:spacy:document:spacy:summary:long_t5_tglobal_xl_booksum_sfsum/data.dill +3 -0
  16. examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/BertscoreAligner:spacy:document:spacy:summary:long_t5_tglobal_xl_booksum_sfsum/meta.yaml +3 -0
  17. examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/BertscoreAligner:spacy:document:spacy:summary:long_t5_tglobal_xl_booksum_sfsum/state.dill +3 -0
  18. examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/BertscoreAligner:spacy:document:spacy:summary:reference/data.dill +3 -0
  19. examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/BertscoreAligner:spacy:document:spacy:summary:reference/meta.yaml +3 -0
  20. examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/BertscoreAligner:spacy:document:spacy:summary:reference/state.dill +3 -0
  21. examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/BertscoreAligner:spacy:summary:reference:spacy:summary:long_t5_tglobal_xl_booksum/data.dill +3 -0
  22. examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/BertscoreAligner:spacy:summary:reference:spacy:summary:long_t5_tglobal_xl_booksum/meta.yaml +3 -0
  23. examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/BertscoreAligner:spacy:summary:reference:spacy:summary:long_t5_tglobal_xl_booksum/state.dill +3 -0
  24. examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/BertscoreAligner:spacy:summary:reference:spacy:summary:long_t5_tglobal_xl_booksum_sfsum/data.dill +3 -0
  25. examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/BertscoreAligner:spacy:summary:reference:spacy:summary:long_t5_tglobal_xl_booksum_sfsum/meta.yaml +3 -0
  26. examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/BertscoreAligner:spacy:summary:reference:spacy:summary:long_t5_tglobal_xl_booksum_sfsum/state.dill +3 -0
  27. examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/NGramAligner:spacy:document:spacy:summary:long_t5_tglobal_xl_booksum/data.dill +3 -0
  28. examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/NGramAligner:spacy:document:spacy:summary:long_t5_tglobal_xl_booksum/meta.yaml +3 -0
  29. examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/NGramAligner:spacy:document:spacy:summary:long_t5_tglobal_xl_booksum/state.dill +3 -0
  30. examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/NGramAligner:spacy:document:spacy:summary:long_t5_tglobal_xl_booksum_sfsum/data.dill +3 -0
  31. examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/NGramAligner:spacy:document:spacy:summary:long_t5_tglobal_xl_booksum_sfsum/meta.yaml +3 -0
  32. examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/NGramAligner:spacy:document:spacy:summary:long_t5_tglobal_xl_booksum_sfsum/state.dill +3 -0
  33. examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/NGramAligner:spacy:document:spacy:summary:reference/data.dill +3 -0
  34. examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/NGramAligner:spacy:document:spacy:summary:reference/meta.yaml +3 -0
  35. examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/NGramAligner:spacy:document:spacy:summary:reference/state.dill +3 -0
  36. examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/NGramAligner:spacy:summary:reference:spacy:summary:long_t5_tglobal_xl_booksum/data.dill +3 -0
  37. examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/NGramAligner:spacy:summary:reference:spacy:summary:long_t5_tglobal_xl_booksum/meta.yaml +3 -0
  38. examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/NGramAligner:spacy:summary:reference:spacy:summary:long_t5_tglobal_xl_booksum/state.dill +3 -0
  39. examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/NGramAligner:spacy:summary:reference:spacy:summary:long_t5_tglobal_xl_booksum_sfsum/data.dill +3 -0
  40. examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/NGramAligner:spacy:summary:reference:spacy:summary:long_t5_tglobal_xl_booksum_sfsum/meta.yaml +3 -0
  41. examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/NGramAligner:spacy:summary:reference:spacy:summary:long_t5_tglobal_xl_booksum_sfsum/state.dill +3 -0
  42. examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/StaticEmbeddingAligner:spacy:document:spacy:summary:long_t5_tglobal_xl_booksum/data.dill +3 -0
  43. examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/StaticEmbeddingAligner:spacy:document:spacy:summary:long_t5_tglobal_xl_booksum/meta.yaml +3 -0
  44. examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/StaticEmbeddingAligner:spacy:document:spacy:summary:long_t5_tglobal_xl_booksum/state.dill +3 -0
  45. examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/StaticEmbeddingAligner:spacy:document:spacy:summary:long_t5_tglobal_xl_booksum_sfsum/data.dill +3 -0
  46. examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/StaticEmbeddingAligner:spacy:document:spacy:summary:long_t5_tglobal_xl_booksum_sfsum/meta.yaml +3 -0
  47. examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/StaticEmbeddingAligner:spacy:document:spacy:summary:long_t5_tglobal_xl_booksum_sfsum/state.dill +3 -0
  48. examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/StaticEmbeddingAligner:spacy:document:spacy:summary:reference/data.dill +3 -0
  49. examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/StaticEmbeddingAligner:spacy:document:spacy:summary:reference/meta.yaml +3 -0
  50. examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/StaticEmbeddingAligner:spacy:document:spacy:summary:reference/state.dill +3 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ examples/** filter=lfs diff=lfs merge=lfs -text
+ *.gif filter=lfs diff=lfs merge=lfs -text
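
(Each rule above is the line format that "git lfs track" writes into .gitattributes. As a sketch, with the *.feather pattern purely as an illustration rather than part of this commit, running

    git lfs track "*.feather"

from the repository root would append the line *.feather filter=lfs diff=lfs merge=lfs -text, routing such files through Git LFS.)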
.gitignore ADDED
@@ -0,0 +1,5 @@
+ .DS_Store
+ _workings
+ env
+ _env
+ __pycache__
LICENSE ADDED
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright 2021 SummVis
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,12 @@
+ ---
+ title: Chapter Summary Summvis
+ emoji: 📈
+ colorFrom: green
+ colorTo: gray
+ sdk: streamlit
+ sdk_version: 1.10.0
+ app_file: app.py
+ pinned: false
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
align.py ADDED
@@ -0,0 +1,364 @@
+ import heapq
+ import itertools
+ from abc import ABC, abstractmethod
+ from collections import defaultdict
+ from operator import itemgetter
+ from typing import List, Dict, Tuple
+ from typing import Sequence
+ from abc import ABC
+
+ import numpy as np
+ import torch
+ from bert_score import BERTScorer
+ from nltk import PorterStemmer
+ from spacy.tokens import Doc, Span
+ from toolz import itertoolz
+ from transformers import AutoTokenizer
+ from transformers.tokenization_utils_base import PaddingStrategy
+
+
+ class EmbeddingModel(ABC):
+     @abstractmethod
+     def embed(
+         self,
+         sents: List[Span]
+     ):
+         pass
+
+
+ class ContextualEmbedding(EmbeddingModel):
+
+     def __init__(self, model, tokenizer_name, max_length, batch_size=32):
+         self.model = model
+         self.tokenizer = SpacyHuggingfaceTokenizer(tokenizer_name, max_length)
+         self._device = model.device
+         self.batch_size = batch_size
+
+     def embed(
+         self,
+         sents: List[Span]
+     ):
+         spacy_embs_list = []
+         for start_idx in range(0, len(sents), self.batch_size):
+             batch = sents[start_idx: start_idx + self.batch_size]
+             encoded_input, special_tokens_masks, token_alignments = self.tokenizer.batch_encode(batch)
+             encoded_input = {k: v.to(self._device) for k, v in encoded_input.items()}
+             with torch.no_grad():
+                 model_output = self.model(**encoded_input)
+             embeddings = model_output[0].cpu()
+             for embs, mask, token_alignment \
+                     in zip(embeddings, special_tokens_masks, token_alignments):
+                 mask = torch.tensor(mask)
+                 embs = embs[mask == 0]  # Filter embeddings at special token positions
+                 spacy_embs = []
+                 for hf_idxs in token_alignment:
+                     if hf_idxs is None:
+                         pooled_embs = torch.zeros_like(embs[0])
+                     else:
+                         pooled_embs = embs[hf_idxs].mean(dim=0)  # Pool embeddings that map to the same spacy token
+                     spacy_embs.append(pooled_embs.numpy())
+                 spacy_embs = np.stack(spacy_embs)
+                 spacy_embs = spacy_embs / np.linalg.norm(spacy_embs, axis=-1, keepdims=True)  # Normalize
+                 spacy_embs_list.append(spacy_embs)
+         for embs, sent in zip(spacy_embs_list, sents):
+             assert len(embs) == len(sent)
+         return spacy_embs_list
+
+
+ class StaticEmbedding(EmbeddingModel):
+
+     def embed(
+         self,
+         sents: List[Span]
+     ):
+         return [
+             np.stack([t.vector / (t.vector_norm or 1) for t in sent])
+             for sent in sents
+         ]
+
+
+ class Aligner(ABC):
+     @abstractmethod
+     def align(
+         self,
+         source: Doc,
+         targets: Sequence[Doc]
+     ) -> List[Dict]:
+         """Compute alignment from summary tokens to doc tokens
+         Args:
+             source: Source spaCy document
+             targets: Target spaCy documents
+         Returns: List of alignments, one for each target document"""
+         pass
+
+
+ class EmbeddingAligner(Aligner):
+
+     def __init__(
+         self,
+         embedding: EmbeddingModel,
+         threshold: float,
+         top_k: int,
+         baseline_val=0
+     ):
+         self.threshold = threshold
+         self.top_k = top_k
+         self.embedding = embedding
+         self.baseline_val = baseline_val
+
+     def align(
+         self,
+         source: Doc,
+         targets: Sequence[Doc]
+     ) -> List[Dict]:
+         """Compute alignment from summary tokens to doc tokens with greatest semantic similarity
+         Args:
+             source: Source spaCy document
+             targets: Target spaCy documents
+         Returns: List of alignments, one for each target document
+         """
+         if len(source) == 0:
+             return [{} for _ in targets]
+         all_sents = list(source.sents) + list(itertools.chain.from_iterable(target.sents for target in targets))
+         chunk_sizes = [_iter_len(source.sents)] + \
+                       [_iter_len(target.sents) for target in targets]
+         all_sents_token_embeddings = self.embedding.embed(all_sents)
+         chunked_sents_token_embeddings = _split(all_sents_token_embeddings, chunk_sizes)
+         source_sent_token_embeddings = chunked_sents_token_embeddings[0]
+         source_token_embeddings = np.concatenate(source_sent_token_embeddings)
+         for token_idx, token in enumerate(source):
+             if token.is_stop or token.is_punct:
+                 source_token_embeddings[token_idx] = 0
+         alignments = []
+         for i, target in enumerate(targets):
+             target_sent_token_embeddings = chunked_sents_token_embeddings[i + 1]
+             target_token_embeddings = np.concatenate(target_sent_token_embeddings)
+             for token_idx, token in enumerate(target):
+                 if token.is_stop or token.is_punct:
+                     target_token_embeddings[token_idx] = 0
+             alignment = defaultdict(list)
+             for score, target_idx, source_idx in self._emb_sim_sparse(
+                 target_token_embeddings,
+                 source_token_embeddings,
+             ):
+                 alignment[target_idx].append((source_idx, score))
+             # TODO: use argpartition to get nlargest
+             for j in list(alignment):
+                 alignment[j] = heapq.nlargest(self.top_k, alignment[j], itemgetter(1))
+             alignments.append(alignment)
+         return alignments
+
+     def _emb_sim_sparse(self, embs_1, embs_2):
+         sim = embs_1 @ embs_2.T
+         sim = (sim - self.baseline_val) / (1 - self.baseline_val)
+         keep = sim > self.threshold
+         keep_idxs_1, keep_idxs_2 = np.where(keep)
+         keep_scores = sim[keep]
+         return list(zip(keep_scores, keep_idxs_1, keep_idxs_2))
+
+
+ class BertscoreAligner(EmbeddingAligner):
+     def __init__(
+         self,
+         threshold,
+         top_k
+     ):
+         scorer = BERTScorer(lang="en", rescale_with_baseline=True)
+         model = scorer._model
+         embedding = ContextualEmbedding(model, "roberta-large", 510)
+         baseline_val = scorer.baseline_vals[2].item()
+
+         super(BertscoreAligner, self).__init__(
+             embedding, threshold, top_k, baseline_val
+         )
+
+
+ class StaticEmbeddingAligner(EmbeddingAligner):
+     def __init__(
+         self,
+         threshold,
+         top_k
+     ):
+         embedding = StaticEmbedding()
+         super(StaticEmbeddingAligner, self).__init__(
+             embedding, threshold, top_k
+         )
+
+
+ class NGramAligner(Aligner):
+
+     def __init__(self):
+         self.stemmer = PorterStemmer()
+
+     def align(
+         self,
+         source: Doc,
+         targets: List[Doc],
+     ) -> List[Dict]:
+
+         alignments = []
+         source_ngram_spans = self._get_ngram_spans(source)
+         for target in targets:
+             target_ngram_spans = self._get_ngram_spans(target)
+             alignments.append(
+                 self._align_ngrams(target_ngram_spans, source_ngram_spans)
+             )
+         return alignments
+
+     def _get_ngram_spans(
+         self,
+         doc: Doc,
+     ):
+         ngrams = []
+         for sent in doc.sents:
+             for n in range(1, len(list(sent))):
+                 tokens = [t for t in sent if not (t.is_stop or t.is_punct)]
+                 ngrams.extend(_ngrams(tokens, n))
+
+         def ngram_key(ngram):
+             return tuple(self.stemmer.stem(token.text).lower() for token in ngram)
+
+         key_to_ngrams = itertoolz.groupby(ngram_key, ngrams)
+         key_to_spans = {}
+         for k, grouped_ngrams in key_to_ngrams.items():
+             key_to_spans[k] = [
+                 (ngram[0].i, ngram[-1].i + 1)
+                 for ngram in grouped_ngrams
+             ]
+         return key_to_spans
+
+     def _align_ngrams(
+         self,
+         ngram_spans_1: Dict[Tuple[str], List[Tuple[int, int]]],
+         ngram_spans_2: Dict[Tuple[str], List[Tuple[int, int]]]
+     ) -> Dict[Tuple[int, int], List[Tuple[int, int]]]:
+         """Align ngram spans between two documents
+         Args:
+             ngram_spans_1: Map from (normalized_token1, normalized_token2, ...) n-gram tuple to a list of token spans
+                 of format (start_pos, end_pos)
+             ngram_spans_2: Same format as above, but for second text
+         Returns: map from each (start, end) span in text 1 to list of aligned (start, end) spans in text 2
+         """
+         if not ngram_spans_1 or not ngram_spans_2:
+             return {}
+         max_span_end_1 = max(span[1] for span in itertools.chain.from_iterable(ngram_spans_1.values()))
+         token_is_available_1 = [True] * max_span_end_1
+         matched_keys = list(set(ngram_spans_1.keys()) & set(ngram_spans_2.keys()))  # Matched normalized ngrams between texts
+         matched_keys.sort(key=len, reverse=True)  # Process n-grams from longest to shortest
+
+         alignment = defaultdict(list)  # Map from each matched span in text 1 to list of aligned spans in text 2
+         for key in matched_keys:
+             spans_1 = ngram_spans_1[key]
+             spans_2 = ngram_spans_2[key]
+             available_spans_1 = [span for span in spans_1 if all(token_is_available_1[slice(*span)])]
+             matched_spans_1 = []
+             if available_spans_1 and spans_2:
+                 # if ngram can be matched to available spans in both sequences
+                 for span in available_spans_1:
+                     # It's possible that these newly matched spans may be overlapping with one another, so
+                     # check that token positions are still available (only one span allowed per token in text 1):
+                     if all(token_is_available_1[slice(*span)]):
+                         matched_spans_1.append(span)
+                         token_is_available_1[slice(*span)] = [False] * (span[1] - span[0])
+             for span1 in matched_spans_1:
+                 alignment[span1] = spans_2
+
+         return alignment
+
+
+ class SpacyHuggingfaceTokenizer:
+     def __init__(
+         self,
+         model_name,
+         max_length
+     ):
+         self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
+         self.max_length = max_length
+
+     def batch_encode(
+         self,
+         sents: List[Span]
+     ):
+         token_alignments = []
+         token_ids_list = []
+
+         # Tokenize each sentence and add special tokens.
+         for sent in sents:
+             hf_tokens, token_alignment = self.tokenize(sent)
+             token_alignments.append(token_alignment)
+             token_ids = self.tokenizer.convert_tokens_to_ids(hf_tokens)
+             encoding = self.tokenizer.prepare_for_model(
+                 token_ids,
+                 add_special_tokens=True,
+                 padding=False,
+             )
+             token_ids_list.append(encoding['input_ids'])
+
+         # Add padding
+         max_length = max(map(len, token_ids_list))
+         attention_mask = []
+         input_ids = []
+         special_tokens_masks = []
+         for token_ids in token_ids_list:
+             encoding = self.tokenizer.prepare_for_model(
+                 token_ids,
+                 padding=PaddingStrategy.MAX_LENGTH,
+                 max_length=max_length,
+                 add_special_tokens=False
+             )
+             input_ids.append(encoding['input_ids'])
+             attention_mask.append(encoding['attention_mask'])
+             special_tokens_masks.append(
+                 self.tokenizer.get_special_tokens_mask(
+                     encoding['input_ids'],
+                     already_has_special_tokens=True
+                 )
+             )
+
+         encoded = {
+             'input_ids': torch.tensor(input_ids),
+             'attention_mask': torch.tensor(attention_mask)
+         }
+         return encoded, special_tokens_masks, token_alignments
+
+     def tokenize(
+         self,
+         sent
+     ):
+         """Convert spacy sentence to huggingface tokens and compute the alignment"""
+         hf_tokens = []
+         token_alignment = []
+         for i, token in enumerate(sent):
+             # "Tokenize" each word individually, so as to track the alignment between spaCy/HF tokens
+             # Prefix all tokens with a space except the first one in the sentence
+             if i == 0:
+                 token_text = token.text
+             else:
+                 token_text = ' ' + token.text
+             start_hf_idx = len(hf_tokens)
+             word_tokens = self.tokenizer.tokenize(token_text)
+             end_hf_idx = len(hf_tokens) + len(word_tokens)
+             if end_hf_idx < self.max_length:
+                 hf_tokens.extend(word_tokens)
+                 hf_idxs = list(range(start_hf_idx, end_hf_idx))
+             else:
+                 hf_idxs = None
+             token_alignment.append(hf_idxs)
+         return hf_tokens, token_alignment
+
+
+ def _split(data, sizes):
+     it = iter(data)
+     return [[next(it) for _ in range(size)] for size in sizes]
+
+
+ def _iter_len(it):
+     return sum(1 for _ in it)
+
+ # TODO: set up batching
+ # To get top K axis and value per row: https://stackoverflow.com/questions/42832711/using-np-argpartition-to-index-values-in-a-multidimensional-array
+
+
+ def _ngrams(tokens, n):
+     for i in range(len(tokens) - n + 1):
+         yield tokens[i:i + n]
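
A minimal usage sketch for the lexical aligner above (not part of this commit; it assumes align.py's dependencies such as spaCy, nltk, and toolz are installed):

    import spacy
    from align import NGramAligner

    nlp = spacy.blank("en")
    nlp.add_pipe("sentencizer")  # the aligners iterate over doc.sents
    document = nlp("The quick brown fox jumped over the lazy dog.")
    summary = nlp("A quick brown fox jumps.")
    # align() returns one dict per target, mapping each (start, end) token span
    # in the summary to the list of matching (start, end) spans in the document
    alignment = NGramAligner().align(document, [summary])[0]
    print(alignment)

Because keys are Porter-stemmed, "jumped" and "jumps" align even though the surface forms differ.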
app.py ADDED
@@ -0,0 +1,323 @@
+ import argparse
+ import json
+ import operator
+ import os
+ import re
+ from pathlib import Path
+
+ import spacy
+ import spacy.lang.en
+ import streamlit as st
+ from meerkat import DataPanel
+ from spacy.tokens import Doc
+
+ from align import NGramAligner, BertscoreAligner, StaticEmbeddingAligner
+ from components import MainView
+ from utils import clean_text
+
+ MIN_SEMANTIC_SIM_THRESHOLD = 0.1
+ MAX_SEMANTIC_SIM_TOP_K = 10
+
+ Doc.set_extension("name", default=None, force=True)
+ Doc.set_extension("column", default=None, force=True)
+
+
+ class Instance:
+     def __init__(self, id_, document, reference, preds, data=None):
+         self.id = id_
+         self.document = document
+         self.reference = reference
+         self.preds = preds
+         self.data = data
+
+
+ @st.cache(allow_output_mutation=True)
+ def load_from_index(filename, index):
+     with open(filename) as f:
+         for i, line in enumerate(f):
+             if i == index:
+                 return json.loads(line.strip())
+
+
+ def _nlp_key(x: spacy.Language):
+     return str(x.path)
+
+
+ @st.cache(allow_output_mutation=True, hash_funcs={spacy.lang.en.English: _nlp_key})
+ def load_dataset(path: str, nlp: spacy.Language):
+     if path.endswith('.jsonl'):
+         return DataPanel.from_jsonl(path)
+     try:
+         return DataPanel.read(path, nlp=nlp)
+     except NotADirectoryError:
+         return DataPanel.from_jsonl(path)
+
+
+ @st.cache(allow_output_mutation=True)
+ def get_nlp():
+     try:
+         nlp = spacy.load("en_core_web_lg")
+     except OSError:
+         nlp = spacy.load("en_core_web_sm")
+         is_lg = False
+     else:
+         is_lg = True
+     nlp.add_pipe('sentencizer', before="parser")
+     return nlp, is_lg
+
+
+ def retrieve(dataset, index, filename=None):
+     if index >= len(dataset):
+         st.error(f"Index {index} exceeds dataset length.")
+
+     eval_dataset = None
+     if filename:
+         # TODO: Handle this through dedicated fields
+         if "cnn_dailymail" in filename:
+             eval_dataset = "cnndm"
+         elif "xsum" in filename:
+             eval_dataset = "xsum"
+
+     data = dataset[index]
+     id_ = data.get('id', '')
+
+     try:
+         document = data['spacy:document']
+     except KeyError:
+         if not is_lg:
+             st.error("The 'en_core_web_lg' model is required unless loading from a cached file. "
+                      "To install: 'python -m spacy download en_core_web_lg'")
+         try:
+             text = data['document']
+         except KeyError:
+             text = data['article']
+         if not text:
+             st.error("Document is blank")
+             return
+         document = nlp(text if args.no_clean else clean_text(text))
+     document._.name = "Document"
+     document._.column = "document"
+
+     try:
+         reference = data['spacy:summary:reference']
+
+     except KeyError:
+         if not is_lg:
+             st.error("The 'en_core_web_lg' model is required unless loading from a cached file. "
+                      "To install: 'python -m spacy download en_core_web_lg'")
+         try:
+             text = data['summary'] if 'summary' in data else data['summary:reference']
+         except KeyError:
+             text = data.get('highlights')
+         if text:
+             reference = nlp(text if args.no_clean else clean_text(text))
+         else:
+             reference = None
+     if reference is not None:
+         reference._.name = "Reference"
+         reference._.column = "summary:reference"
+
+     model_names = set()
+     for k in data:
+         m = re.match('(preprocessed_)?summary:(?P<model>.*)', k)
+         if m:
+             model_name = m.group('model')
+             if model_name != 'reference':
+                 model_names.add(model_name)
+
+     preds = []
+     for model_name in model_names:
+         try:
+             pred = data[f"spacy:summary:{model_name}"]
+         except KeyError:
+             if not is_lg:
+                 st.error("The 'en_core_web_lg' model is required unless loading from a cached file. "
+                          "To install: 'python -m spacy download en_core_web_lg'")
+             text = data[f"summary:{model_name}"]
+             pred = nlp(text if args.no_clean else clean_text(text))
+
+         parts = model_name.split("-")
+         primary_sort = 0
+         if len(parts) == 2:
+             model, train_dataset = parts
+             if train_dataset == eval_dataset:
+                 formatted_model_name = model.upper()
+             else:
+                 formatted_model_name = f"{model.upper()} ({train_dataset.upper()}-trained)"
+                 if train_dataset in ["xsum", "cnndm"]:
+                     primary_sort = 1
+                 else:
+                     primary_sort = 2
+         else:
+             formatted_model_name = model_name.upper()
+         pred._.name = formatted_model_name
+         pred._.column = f"summary:{model_name}"
+         preds.append(
+             ((primary_sort, formatted_model_name), pred)
+         )
+
+     preds = [pred for _, pred in sorted(preds)]
+
+     return Instance(
+         id_=id_,
+         document=document,
+         reference=reference,
+         preds=preds,
+         data=data,
+     )
+
+
+ def filter_alignment(alignment, threshold, top_k):
+     filtered_alignment = {}
+     for k, v in alignment.items():
+         filtered_matches = [(match_idx, score) for match_idx, score in v if score >= threshold]
+         if filtered_matches:
+             filtered_alignment[k] = sorted(filtered_matches, key=operator.itemgetter(1), reverse=True)[:top_k]
+     return filtered_alignment
+
+
+ def select_comparison(example):
+     all_summaries = []
+
+     if example.reference:
+         all_summaries.append(example.reference)
+     if example.preds:
+         all_summaries.extend(example.preds)
+
+     from_documents = [example.document]
+     if example.reference:
+         from_documents.append(example.reference)
+     document_names = [document._.name for document in from_documents]
+     select_document_name = sidebar_placeholder_from.selectbox(
+         label="Comparison FROM:",
+         options=document_names
+     )
+     document_index = document_names.index(select_document_name)
+     selected_document = from_documents[document_index]
+
+     remaining_summaries = [summary for summary in all_summaries if
+                            summary._.name != selected_document._.name]
+     remaining_summary_names = [summary._.name for summary in remaining_summaries]
+
+     selected_summary_names = sidebar_placeholder_to.multiselect(
+         'Comparison TO:',
+         remaining_summary_names,
+         remaining_summary_names
+     )
+     selected_summaries = []
+     for summary_name in selected_summary_names:
+         summary_index = remaining_summary_names.index(summary_name)
+         selected_summaries.append(remaining_summaries[summary_index])
+     return selected_document, selected_summaries
+
+
+ def show_main(example):
+     # Get user input
+
+     semantic_sim_type = st.sidebar.radio(
+         "Semantic similarity type:",
+         ["Contextual embedding", "Static embedding"]
+     )
+     semantic_sim_threshold = st.sidebar.slider(
+         "Semantic similarity threshold:",
+         min_value=MIN_SEMANTIC_SIM_THRESHOLD,
+         max_value=1.0,
+         step=0.1,
+         value=0.2,
+     )
+     semantic_sim_top_k = st.sidebar.slider(
+         "Semantic similarity top-k:",
+         min_value=1,
+         max_value=MAX_SEMANTIC_SIM_TOP_K,
+         step=1,
+         value=10,
+     )
+
+     document, summaries = select_comparison(example)
+     layout = st.sidebar.radio("Layout:", ["Vertical", "Horizontal"]).lower()
+     scroll = True
+     gray_out_stopwords = st.sidebar.checkbox(label="Gray out stopwords", value=True)
+
+     # Gather data
+     try:
+         lexical_alignments = [
+             example.data[f'{NGramAligner.__name__}:spacy:{document._.column}:spacy:{summary._.column}']
+             for summary in summaries
+         ]
+     except KeyError:
+         lexical_alignments = NGramAligner().align(document, summaries)
+
+     if semantic_sim_type == "Static embedding":
+         try:
+             semantic_alignments = [
+                 example.data[f'{StaticEmbeddingAligner.__name__}:spacy:{document._.column}:spacy:{summary._.column}']
+                 for summary in summaries
+             ]
+         except KeyError:
+             semantic_alignments = StaticEmbeddingAligner(
+                 semantic_sim_threshold,
+                 semantic_sim_top_k).align(
+                 document,
+                 summaries
+             )
+     else:
+         try:
+             semantic_alignments = [
+                 example.data[f'{BertscoreAligner.__name__}:spacy:{document._.column}:spacy:{summary._.column}']
+                 for summary in summaries
+             ]
+         except KeyError:
+             semantic_alignments = BertscoreAligner(semantic_sim_threshold,
+                                                    semantic_sim_top_k).align(document,
+                                                                              summaries)
+
+     MainView(
+         document,
+         summaries,
+         semantic_alignments,
+         lexical_alignments,
+         layout,
+         scroll,
+         gray_out_stopwords,
+     ).show(height=720)
+
+
+ if __name__ == "__main__":
+
+     st.set_page_config(layout="wide")
+
+     parser = argparse.ArgumentParser()
+     parser.add_argument('--path', type=str, default='data')
+     parser.add_argument('--no_clean', action='store_true', default=False,
+                         help="Do not clean text (remove extraneous spaces, newlines).")
+     args = parser.parse_args()
+
+     nlp, is_lg = get_nlp()
+
+     path = Path(args.path)
+     path_dir = path.parent
+     all_files = set(map(os.path.basename, path_dir.glob('*')))
+     files = sorted([
+         fname for fname in all_files if not (fname.endswith(".py") or fname.startswith("."))
+     ])
+     if path.is_file():
+         try:
+             file_index = files.index(path.name)
+         except ValueError:
+             raise FileNotFoundError(f"File not found: {path.name}")
+     else:
+         file_index = 0
+     col1, col2 = st.columns((3, 1))
+     filename = col1.selectbox(label="File:", options=files, index=file_index)
+     dataset = load_dataset(str(path_dir / filename), nlp=nlp)
+
+     dataset_size = len(dataset)
+     query = col2.number_input(f"Index (Size: {dataset_size}):", value=0, min_value=0, max_value=dataset_size - 1)
+
+     sidebar_placeholder_from = st.sidebar.empty()
+     sidebar_placeholder_to = st.sidebar.empty()
+
+     if query is not None:
+         example = retrieve(dataset, query, filename)
+         if example:
+             show_main(example)
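
To try the app locally, one option is the following sketch (example.jsonl is a stand-in for an actual data file; the bare "--" passes the flags through Streamlit to the script's argparse):

    streamlit run app.py -- --path example.jsonl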
components.py ADDED
@@ -0,0 +1,567 @@
1
+ from collections import defaultdict
2
+ from itertools import count
3
+ from operator import itemgetter
4
+ from pathlib import Path
5
+ from typing import Dict, Optional
6
+ from typing import List, Tuple, Union
7
+
8
+ import htbuilder
9
+ import streamlit as st
10
+ from htbuilder import span, div, script, style, link, styles, HtmlElement, br
11
+ from htbuilder.units import px
12
+ from spacy.tokens import Doc
13
+
14
+ palette = [
15
+ "#66c2a5",
16
+ "#fc8d62",
17
+ "#8da0cb",
18
+ "#e78ac3",
19
+ "#a6d854",
20
+ "#ffd92f",
21
+ "#e5c494",
22
+ "#b3b3b3",
23
+ ]
24
+ inactive_color = "#BBB"
25
+
26
+
27
+ def local_stylesheet(path):
28
+ with open(path) as f:
29
+ css = f.read()
30
+ return style()(
31
+ css
32
+ )
33
+
34
+
35
+ def remote_stylesheet(url):
36
+ return link(
37
+ href=url
38
+ )
39
+
40
+
41
+ def local_script(path):
42
+ with open(path) as f:
43
+ code = f.read()
44
+ return script()(
45
+ code
46
+ )
47
+
48
+
49
+ def remote_script(url):
50
+ return script(
51
+ src=url
52
+ )
53
+
54
+
55
+ def get_color(sent_idx):
56
+ return palette[sent_idx % len(palette)]
57
+
58
+
59
+ def hex_to_rgb(hex):
60
+ hex = hex.replace("#", '')
61
+ return tuple(int(hex[i:i + 2], 16) for i in (0, 2, 4))
62
+
63
+
64
+ def color_with_opacity(hex_color, opacity):
65
+ rgb = hex_to_rgb(hex_color)
66
+ return f"rgba({rgb[0]},{rgb[1]},{rgb[2]},{opacity:.2f})"
67
+
68
+
69
+ class Component:
70
+
71
+ def show(self, width=None, height=None, scrolling=True, **kwargs):
72
+ out = div(style=styles(
73
+ **kwargs
74
+ ))(self.html())
75
+ html = str(out)
76
+ st.components.v1.html(html, width=width, height=height, scrolling=scrolling)
77
+
78
+ def html(self):
79
+ raise NotImplemented
80
+
81
+
82
+ class MainView(Component):
83
+
84
+ def __init__(
85
+ self,
86
+ document: Doc,
87
+ summaries: List[Doc],
88
+ semantic_alignments: Optional[List[Dict]],
89
+ lexical_alignments: Optional[List[Dict]],
90
+ layout: str,
91
+ scroll: bool,
92
+ gray_out_stopwords: bool
93
+ ):
94
+ self.document = document
95
+ self.summaries = summaries
96
+ self.semantic_alignments = semantic_alignments
97
+ self.lexical_alignments = lexical_alignments
98
+ self.layout = layout
99
+ self.scroll = scroll
100
+ self.gray_out_stopwords = gray_out_stopwords
101
+
102
+ def html(self):
103
+
104
+ # Add document elements
105
+ if self.document._.name == 'Document':
106
+ document_name = 'Source Document'
107
+ else:
108
+ document_name = self.document._.name + ' summary'
109
+ doc_header = div(
110
+ id_="document-header"
111
+ )(
112
+ document_name
113
+ )
114
+ doc_elements = []
115
+
116
+ # Add document content, which comprises multiple elements, one for each summary. Only the elment corresponding to
117
+ # selected summary will be visible.
118
+
119
+ mu = MultiUnderline()
120
+
121
+ for summary_idx, summary in enumerate(self.summaries):
122
+ token_idx_to_sent_idx = {}
123
+ for sent_idx, sent in enumerate(summary.sents):
124
+ for token in sent:
125
+ token_idx_to_sent_idx[token.i] = sent_idx
126
+ is_selected_summary = (summary_idx == 0) # By default, first summary is selected
127
+
128
+ if self.semantic_alignments is not None:
129
+ doc_token_idx_to_matches = defaultdict(list)
130
+ semantic_alignment = self.semantic_alignments[summary_idx]
131
+ for summary_token_idx, matches in semantic_alignment.items():
132
+ for doc_token_idx, sim in matches:
133
+ doc_token_idx_to_matches[doc_token_idx].append((summary_token_idx, sim))
134
+ else:
135
+ doc_token_idx_to_matches = {}
136
+
137
+ token_elements = []
138
+ for doc_token_idx, doc_token in enumerate(self.document):
139
+ if doc_token.is_stop or doc_token.is_punct:
140
+ classes = ["stopword"]
141
+ if self.gray_out_stopwords:
142
+ classes.append("grayed-out")
143
+ el = span(
144
+ _class=" ".join(classes)
145
+ )(
146
+ doc_token.text
147
+ )
148
+
149
+ else:
150
+ matches = doc_token_idx_to_matches.get(doc_token_idx)
151
+ if matches:
152
+ summary_token_idx, sim = max(matches, key=itemgetter(1))
153
+ sent_idx = token_idx_to_sent_idx[summary_token_idx]
154
+ color_primary = get_color(sent_idx)
155
+ highlight_color_primary = color_with_opacity(color_primary, sim)
156
+ props = {
157
+ 'data-highlight-id': str(doc_token_idx),
158
+ 'data-primary-color': highlight_color_primary
159
+ }
160
+ match_classes = []
161
+ for summary_token_idx, sim in matches:
162
+ sent_idx = token_idx_to_sent_idx[summary_token_idx]
163
+ match_classes.append(f"summary-highlight-{summary_idx}-{summary_token_idx}")
164
+ color = color_with_opacity(get_color(sent_idx), sim)
165
+ props[f"data-color-{summary_idx}-{summary_token_idx}"] = color
166
+ props["data-match-classes"] = " ".join(match_classes)
167
+ el = self._highlight(
168
+ doc_token.text,
169
+ highlight_color_primary,
170
+ color_primary,
171
+ match_classes + ["annotation-hidden"],
172
+ **props
173
+ )
174
+ else:
175
+ el = doc_token.text
176
+ token_elements.append(el)
177
+
178
+ spans = []
179
+ if self.lexical_alignments is not None:
180
+ lexical_alignment = self.lexical_alignments[summary_idx]
181
+ for summary_span, doc_spans in lexical_alignment.items():
182
+ summary_span_start, summary_span_end = summary_span
183
+ span_id = f"{summary_idx}-{summary_span_start}-{summary_span_end}"
184
+ sent_idx = token_idx_to_sent_idx[summary_span_start]
185
+ for doc_span_start, doc_span_end in doc_spans:
186
+ spans.append((
187
+ doc_span_start,
188
+ doc_span_end,
189
+ sent_idx,
190
+ get_color(sent_idx),
191
+ span_id
192
+ ))
193
+ token_elements = mu.markup(token_elements, spans)
194
+
195
+ classes = ["main-doc", "bordered"]
196
+ if self.scroll:
197
+ classes.append("scroll")
198
+
199
+ main_doc = div(
200
+ _class=" ".join(classes)
201
+ )(
202
+ token_elements
203
+ ),
204
+
205
+ classes = ["doc"]
206
+ if is_selected_summary:
207
+ classes.append("display")
208
+ else:
209
+ classes.append("nodisplay")
210
+ doc_elements.append(
211
+ div(
212
+ **{
213
+ "class": " ".join(classes),
214
+ "data-index": summary_idx
215
+ }
216
+ )(
217
+ main_doc,
218
+ div(_class="proxy-doc"),
219
+ div(_class="proxy-scroll")
220
+ )
221
+ )
222
+
223
+ summary_title = "Summary"
224
+ summary_header = div(
225
+ id_="summary-header"
226
+ )(
227
+ summary_title,
228
+ div(id="summary-header-gap"),
229
+ )
230
+
231
+ summary_items = []
232
+ for summary_idx, summary in enumerate(self.summaries):
233
+ token_idx_to_sent_idx = {}
234
+ for sent_idx, sent in enumerate(summary.sents):
235
+ for token in sent:
236
+ token_idx_to_sent_idx[token.i] = sent_idx
237
+
238
+ spans = []
239
+ matches_ngram = [False] * len(list(summary))
240
+ if self.lexical_alignments is not None:
241
+ lexical_alignment = self.lexical_alignments[summary_idx]
242
+ for summary_span in lexical_alignment.keys():
243
+ start, end = summary_span
244
+ matches_ngram[slice(start, end)] = [True] * (end - start)
245
+ span_id = f"{summary_idx}-{start}-{end}"
246
+ sent_idx = token_idx_to_sent_idx[start]
247
+ spans.append((
248
+ start,
249
+ end,
250
+ sent_idx,
251
+ get_color(sent_idx),
252
+ span_id
253
+ ))
254
+
255
+ if self.semantic_alignments is not None:
256
+ semantic_alignment = self.semantic_alignments[summary_idx]
257
+ else:
258
+ semantic_alignment = {}
259
+ token_elements = []
260
+ for token_idx, token in enumerate(summary):
261
+ if token.is_stop or token.is_punct:
262
+ classes = ["stopword"]
263
+ if self.gray_out_stopwords:
264
+ classes.append("grayed-out")
265
+ el = span(
266
+ _class=" ".join(classes)
267
+ )(
268
+ token.text
269
+ )
270
+ else:
271
+ classes = []
272
+ if token.ent_iob_ in ('I', 'B'):
273
+ classes.append("entity")
274
+ if matches_ngram[token_idx]:
275
+ classes.append("matches-ngram")
276
+ matches = semantic_alignment.get(token_idx)
277
+ if matches:
278
+ top_match = max(matches, key=itemgetter(1))
279
+ top_sim = max(top_match[1], 0)
280
+ top_doc_token_idx = top_match[0]
281
+ props = {
282
+ "data-highlight-id": f"{summary_idx}-{token_idx}",
283
+ "data-top-doc-highlight-id": str(top_doc_token_idx),
284
+ "data-top-doc-sim": f"{top_sim:.2f}",
285
+ }
286
+ classes.extend([
287
+ "annotation-hidden",
288
+ f"summary-highlight-{summary_idx}-{token_idx}"
289
+ ])
290
+ sent_idx = token_idx_to_sent_idx[token_idx]
291
+ el = self._highlight(
292
+ token.text,
293
+ color_with_opacity(get_color(sent_idx), top_sim),
294
+ color_with_opacity(get_color(sent_idx), 1),
295
+ classes,
296
+ **props
297
+ )
298
+ else:
299
+ if classes:
300
+ el = span(_class=" ".join(classes))(token.text)
301
+ else:
302
+ el = token.text
303
+ token_elements.append(el)
304
+
305
+ token_elements = mu.markup(token_elements, spans)
306
+
307
+ classes = ["summary-item"]
308
+ if summary_idx == 0: # Default is for first summary to be selected
309
+ classes.append("selected")
310
+
311
+ summary_items.append(
312
+ div(
313
+ **{"class": ' '.join(classes), "data-index": summary_idx}
314
+ )(
315
+ div(_class="name")(summary._.name),
316
+ div(_class="content")(token_elements)
317
+ )
318
+ )
+         classes = ["summary-list", "bordered"]
+         if self.scroll:
+             classes.append("scroll")
+         if self.lexical_alignments is not None:
+             classes.append("has-lexical-alignment")
+         if self.semantic_alignments is not None:
+             classes.append("has-semantic-alignment")
+         summary_list = div(
+             _class=" ".join(classes)
+         )(
+             summary_items
+         )
+
+         annotation_key = """
+             <ul class="annotation-key">
+                 <li class="annotation-key-label">Annotations:</li>
+                 <li id="option-lexical" class="option selected">
+                     <span class="annotation-key-ngram">N-Gram overlap</span>
+                 </li>
+                 <li id="option-semantic" class="option selected">
+                     <span class="annotation-key-semantic">Semantic overlap</span>
+                 </li>
+                 <li id="option-novel" class="option selected">
+                     <span class="annotation-key-novel">Novel words</span>
+                 </li>
+                 <li id="option-entity" class="option selected">
+                     <span class="annotation-key-entity">Novel entities</span>
+                 </li>
+             </ul>
+         """
+
+         body = div(
+             annotation_key,
+             div(
+                 _class=f"vis-container {self.layout}-layout"
+             )(
+                 div(
+                     _class="doc-container"
+                 )(
+                     doc_header,
+                     *doc_elements
+                 ),
+                 div(
+                     _class="summary-container"
+                 )(
+                     summary_header,
+                     summary_list
+                 )
+             ),
+         )
+         return [
+             """<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.1/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-+0n0xVW2eSR5OomGNYDnhzAbDsOXxcvSN1TPprVMTNDbiYZCxYbOOl7+AMvyTG2x" crossorigin="anonymous">""",
+             local_stylesheet(Path(__file__).parent / "resources" / "summvis.css"),
+             """<link rel="preconnect" href="https://fonts.gstatic.com">
+             <link href="https://fonts.googleapis.com/css2?family=Roboto:wght@400;500&display=swap" rel="stylesheet">""",
+             body,
+             """<script
+                 src="https://code.jquery.com/jquery-3.5.1.min.js"
+                 integrity="sha256-9/aliU8dGd2tb6OSsuzixeV4y/faTqgFtohetphbbj0="
+                 crossorigin="anonymous"></script>
+             <script src="https://cdn.jsdelivr.net/npm/bootstrap@4.6.0/dist/js/bootstrap.bundle.min.js"
+                 integrity="sha384-Piv4xVNRyMGpqkS2by6br4gNJ7DXjqk09RmUpJ8jgGtD7zP9yug3goQfGII0yAns"
+                 crossorigin="anonymous"></script>""",
+             local_script(Path(__file__).parent / "resources" / "jquery.color-2.1.2.min.js"),
+             local_script(Path(__file__).parent / "resources" / "summvis.js"),
+             """<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.0.1/dist/js/bootstrap.bundle.min.js" integrity="sha384-gtEjrD/SeCtmISkJkNUaaKMoLD0//ElJ19smozuHV6z3Iehds+3Ulb9Bn9Plx0x4" crossorigin="anonymous"></script>"""
+         ]
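The returned list mixes raw HTML strings with htbuilder elements; a caller is expected to stringify each fragment and concatenate them into a page. A sketch of such a caller, assuming a hypothetical `view` instance of this component and `view.html()` as a stand-in name for the method above:

fragments = view.html()  # hypothetical accessor returning the list above
page = "\n".join(str(fragment) for fragment in fragments)
with open("summvis_view.html", "w") as f:
    f.write(page)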
+
+     def _highlight(
+         self,
+         token: Union[str, HtmlElement],
+         background_color,
+         dotted_underline_color,
+         classes: List[str],
+         **props
+     ):
+         return span(
+             _class=" ".join(classes + ["highlight"]),
+             style=styles(
+                 background_color=background_color,
+                 border_bottom=f"4px dotted {dotted_underline_color}",
+             ),
+             **props
+         )(token)
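`get_color` and `color_with_opacity` are defined (or imported) in the portion of components.py above this excerpt. For readers of this hunk in isolation, a plausible minimal stand-in, assuming a fixed per-sentence rgb palette (the names and colors here are illustrative; the real definitions may differ):

PALETTE = ["rgb(88, 171, 209)", "rgb(236, 122, 8)", "rgb(102, 166, 30)"]  # illustrative colors

def get_color(sent_idx):
    # One color per sentence, cycling through the palette
    return PALETTE[sent_idx % len(PALETTE)]

def color_with_opacity(color, opacity):
    # "rgb(r, g, b)" -> "rgba(r, g, b, opacity)"
    r, g, b = color[4:-1].split(", ")
    return f"rgba({r}, {g}, {b}, {opacity:.2f})"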
+
+
+ SPACE = "&ensp;"
+
+
+ class MultiUnderline:
+     def __init__(
+         self,
+         underline_thickness=3,
+         underline_spacing=1
+     ):
+         self.underline_thickness = underline_thickness
+         self.underline_spacing = underline_spacing
+
+     def markup(
+         self,
+         tokens: List[Union[str, HtmlElement]],
+         spans: List[Tuple[int, int, int, str, str]]
+     ):
+         """Style text with multiple layers of colored underlines.
+
+         Args:
+             tokens: list of tokens, each either a string or an html element
+             spans: list of (start_pos, end_pos, rank, color, id) tuples, where:
+                 start_pos: start position of the underline span (inclusive)
+                 end_pos: end position of the underline span (exclusive)
+                 rank: rank for the stacking order of underlines, all else being equal
+                 color: color of the underline
+                 id: id of the underline (encoded as a class label in the resulting html element)
+
+         Returns:
+             List of HTML elements
+         """
+
+         # Map from span start position to the spans beginning there
+         start_to_spans = defaultdict(list)
+         for span in spans:
+             start = span[0]
+             start_to_spans[start].append(span)
+
+         # Map from each underline slot position to its list of active spans
+         slot_to_spans = {}
+
+         # Collection of html elements
+         elements = []
+
+         first_token_in_line = True
+         for pos, token in enumerate(tokens):
+             # Remove spans that are no longer active (i.e. span end <= pos)
+             slot_to_spans = defaultdict(
+                 list,
+                 {
+                     slot: [span for span in spans if span[1] > pos]  # span[1] is the span's end position
+                     for slot, spans in slot_to_spans.items() if spans
+                 }
+             )
+
+             # Add underlines to the space between tokens for any continuing spans
+             if first_token_in_line:
+                 first_token_in_line = False
+             else:
+                 elements.append(self._get_underline_element(SPACE, slot_to_spans))
+
+             # Find a slot for any spans that start at this position
+             new_spans = start_to_spans.pop(pos, None)
+             if new_spans:
+                 # Sort by span length (longest first), then by rank
+                 new_spans.sort(key=lambda span: (-(span[1] - span[0]), span[2]))
+                 for new_span in new_spans:
+                     # Find an existing slot or add a new one
+                     for slot, spans in sorted(slot_to_spans.items(), key=itemgetter(0)):  # Sort by slot index
+                         if spans:
+                             # The first span in a slot strictly contains all other spans in that slot
+                             containing_span = spans[0]
+                             containing_start, containing_end = containing_span[0:2]
+                             containing_color = containing_span[3]
+                             start, end = new_span[0:2]
+                             color = new_span[3]
+                             # If the new span (1) is strictly contained in this span, or (2) exactly
+                             # matches this span and has the same color, add it to this slot
+                             if end <= containing_end and (
+                                 (start > containing_start or end < containing_end) or
+                                 (start == containing_start and end == containing_end and color == containing_color)
+                             ):
+                                 spans.append(new_span)
+                                 break
+                     else:
+                         # No existing slot fits; take the first free slot index
+                         for slot_index in count():
+                             spans = slot_to_spans[slot_index]
+                             if not spans:  # If the slot is free, take it
+                                 spans.append(new_span)
+                                 break
+             if token in ("\n", "\r", "\r\n"):
+                 elements.append(br())
+                 first_token_in_line = True
+             else:
+                 # Add underlines to the token for all active spans
+                 elements.append(self._get_underline_element(token, slot_to_spans))
+         return elements
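One convention worth noting: `end_pos` is exclusive, since a span stays active only while `span[1] > pos`. A quick illustration (this assumes the class above; the token and span values are arbitrary):

from htbuilder import div

mu = MultiUnderline()
elements = mu.markup(["a", "b", "c"], [(0, 2, 0, "blue", "b1")])
# "a" and "b" (positions 0 and 1) are underlined in blue, as is the space
# between them; "c" at position 2 gets no visible underline because the
# span has already expired there.
print(str(div(elements)))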
+
+     def _get_underline_element(self, token, slot_to_spans):
+         if not slot_to_spans:
+             return token
+         max_slot_index = max(slot_to_spans.keys())
+         element = token
+         for slot_index in range(max_slot_index + 1):
+             spans = slot_to_spans[slot_index]
+             if not spans:
+                 color = "rgba(0, 0, 0, 0)"  # Transparent placeholder underline (opacity 0)
+                 props = {}
+             else:
+                 containing_span = spans[0]
+                 color = containing_span[3]
+                 classes = ["underline"]
+                 if token != SPACE:
+                     classes.append("token-underline")
+                 classes.extend([f"span-{span[4]}" for span in spans])  # Encode span ids in the class names
+                 props = {
+                     "class": " ".join(classes),
+                     "data-primary-color": color
+                 }
+             if slot_index == 0:
+                 padding_bottom = 0
+             else:
+                 padding_bottom = self.underline_spacing
+             display = "inline-block"
+             element = htbuilder.span(
+                 style=styles(
+                     display=display,
+                     border_bottom=f"{self.underline_thickness}px solid",
+                     border_color=color,
+                     padding_bottom=px(padding_bottom),
+                 ),
+                 **props
+             )(element)
+
+         # Return the outermost nested span
+         return element
+
+
+ if __name__ == "__main__":
+     from htbuilder import div
+
+     # Test: mix plain-string tokens with a pre-styled html element
+     tokens = [
+         "The",
+         htbuilder.span(style=styles(color="red"))("quick"),
+         "brown",
+         "fox",
+         "jumps"
+     ]
+     spans = [
+         (0, 2, 0, "green", "green1"),
+         (1, 3, 0, "orange", "orange1"),
+         (3, 4, 0, "red", "red1"),
+         (2, 4, 0, "blue", "blue1"),
+         (1, 5, 0, "orange", "orange1"),
+     ]
+
+     mu = MultiUnderline()
+     html = str(div(mu.markup(tokens, spans)))
+     print(html)
examples/booksum/booksum_sf_sample.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:480c9b9d1e401075aa1fc17d9409f899fc01462202b8688da1cdef8a7d7fef94
+ size 102823

examples/booksum/booksum_sf_sample_processed.cache/meta.yaml ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5f78e53c09ab076c9b88a065d732be8e1f99d480293c505d86421dd59f20903e
+ size 2139

examples/booksum/booksum_sf_sample_processed.cache/mgr/blocks/6074137904/data.feather ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dc9991b16ce0d69587ae842d082a8cc6dca7c16bf52b55f53e5d894b6ff12773
+ size 139442

examples/booksum/booksum_sf_sample_processed.cache/mgr/blocks/6074137904/meta.yaml ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8a0664bbc8c1074184efb08415a61f149a4b2385b57df87bef71ec5e511aa48a
+ size 63

examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/BertscoreAligner:spacy:document:spacy:summary:long_t5_tglobal_xl_booksum/data.dill ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:adf75a80c7218afdcd8690c9bcf50d96b312d4b9222dbca0f03e014f265bc48f
+ size 223683

examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/BertscoreAligner:spacy:document:spacy:summary:long_t5_tglobal_xl_booksum/meta.yaml ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9a834ab33bb0405b065b9ca6ae9a57dbdef46609cd3079d68de2a4c45dda60a1
+ size 70

examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/BertscoreAligner:spacy:document:spacy:summary:long_t5_tglobal_xl_booksum/state.dill ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0d4540a4caae3db3fd5922acb905088aa5ee4ae29ec3e06c8e2b397716c7106f
+ size 126

examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/BertscoreAligner:spacy:document:spacy:summary:long_t5_tglobal_xl_booksum_sfsum/data.dill ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f2e5968ac162a2a3c0c875cef5e5df1991598147c99b5899ee291181aecfc88d
+ size 186772

examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/BertscoreAligner:spacy:document:spacy:summary:long_t5_tglobal_xl_booksum_sfsum/meta.yaml ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9a834ab33bb0405b065b9ca6ae9a57dbdef46609cd3079d68de2a4c45dda60a1
+ size 70

examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/BertscoreAligner:spacy:document:spacy:summary:long_t5_tglobal_xl_booksum_sfsum/state.dill ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0d4540a4caae3db3fd5922acb905088aa5ee4ae29ec3e06c8e2b397716c7106f
+ size 126

examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/BertscoreAligner:spacy:document:spacy:summary:reference/data.dill ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9fa245b2f5f44ae0d3a4c6338c124e681a397d85e77bf9a852bbd039ab9c6e2c
+ size 230480

examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/BertscoreAligner:spacy:document:spacy:summary:reference/meta.yaml ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9a834ab33bb0405b065b9ca6ae9a57dbdef46609cd3079d68de2a4c45dda60a1
+ size 70

examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/BertscoreAligner:spacy:document:spacy:summary:reference/state.dill ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0d4540a4caae3db3fd5922acb905088aa5ee4ae29ec3e06c8e2b397716c7106f
+ size 126

examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/BertscoreAligner:spacy:summary:reference:spacy:summary:long_t5_tglobal_xl_booksum/data.dill ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:12017f50df0c5694c1e3d60dc5848c23e54471f661a1f61898fc939f10abbdec
+ size 111010

examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/BertscoreAligner:spacy:summary:reference:spacy:summary:long_t5_tglobal_xl_booksum/meta.yaml ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9a834ab33bb0405b065b9ca6ae9a57dbdef46609cd3079d68de2a4c45dda60a1
+ size 70

examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/BertscoreAligner:spacy:summary:reference:spacy:summary:long_t5_tglobal_xl_booksum/state.dill ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0d4540a4caae3db3fd5922acb905088aa5ee4ae29ec3e06c8e2b397716c7106f
+ size 126

examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/BertscoreAligner:spacy:summary:reference:spacy:summary:long_t5_tglobal_xl_booksum_sfsum/data.dill ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f7be389d66dc4c379035647c6d901c33f83cfbe7a532798ef046c80d07a136ed
+ size 100990

examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/BertscoreAligner:spacy:summary:reference:spacy:summary:long_t5_tglobal_xl_booksum_sfsum/meta.yaml ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9a834ab33bb0405b065b9ca6ae9a57dbdef46609cd3079d68de2a4c45dda60a1
+ size 70

examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/BertscoreAligner:spacy:summary:reference:spacy:summary:long_t5_tglobal_xl_booksum_sfsum/state.dill ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0d4540a4caae3db3fd5922acb905088aa5ee4ae29ec3e06c8e2b397716c7106f
+ size 126

examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/NGramAligner:spacy:document:spacy:summary:long_t5_tglobal_xl_booksum/data.dill ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5489f6e5d6f02ddc656f3ac10a920312cb45a37e0342a1f105b7631a278f31e3
+ size 16355

examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/NGramAligner:spacy:document:spacy:summary:long_t5_tglobal_xl_booksum/meta.yaml ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9a834ab33bb0405b065b9ca6ae9a57dbdef46609cd3079d68de2a4c45dda60a1
+ size 70

examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/NGramAligner:spacy:document:spacy:summary:long_t5_tglobal_xl_booksum/state.dill ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0d4540a4caae3db3fd5922acb905088aa5ee4ae29ec3e06c8e2b397716c7106f
+ size 126

examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/NGramAligner:spacy:document:spacy:summary:long_t5_tglobal_xl_booksum_sfsum/data.dill ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f0f99fac854e2681b08a8a941b4f069f994b9c5842364bde9b9a77b52eec85ba
+ size 16791

examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/NGramAligner:spacy:document:spacy:summary:long_t5_tglobal_xl_booksum_sfsum/meta.yaml ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9a834ab33bb0405b065b9ca6ae9a57dbdef46609cd3079d68de2a4c45dda60a1
+ size 70

examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/NGramAligner:spacy:document:spacy:summary:long_t5_tglobal_xl_booksum_sfsum/state.dill ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0d4540a4caae3db3fd5922acb905088aa5ee4ae29ec3e06c8e2b397716c7106f
+ size 126

examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/NGramAligner:spacy:document:spacy:summary:reference/data.dill ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:68c30a9596aa97a27a12fe057a9e5ab10e0c5f503a0d42589d08c789be3f7fee
+ size 15608

examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/NGramAligner:spacy:document:spacy:summary:reference/meta.yaml ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9a834ab33bb0405b065b9ca6ae9a57dbdef46609cd3079d68de2a4c45dda60a1
+ size 70

examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/NGramAligner:spacy:document:spacy:summary:reference/state.dill ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0d4540a4caae3db3fd5922acb905088aa5ee4ae29ec3e06c8e2b397716c7106f
+ size 126

examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/NGramAligner:spacy:summary:reference:spacy:summary:long_t5_tglobal_xl_booksum/data.dill ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bb100474825bb20478a36f7dcea6b3f718ac5ff864540290644c560a00c8090a
+ size 4634

examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/NGramAligner:spacy:summary:reference:spacy:summary:long_t5_tglobal_xl_booksum/meta.yaml ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9a834ab33bb0405b065b9ca6ae9a57dbdef46609cd3079d68de2a4c45dda60a1
+ size 70

examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/NGramAligner:spacy:summary:reference:spacy:summary:long_t5_tglobal_xl_booksum/state.dill ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0d4540a4caae3db3fd5922acb905088aa5ee4ae29ec3e06c8e2b397716c7106f
+ size 126

examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/NGramAligner:spacy:summary:reference:spacy:summary:long_t5_tglobal_xl_booksum_sfsum/data.dill ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bd6faabe10cb2532d6f6258091ba47d503c51798bc9664c6b873d1b52ae879b4
+ size 4915

examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/NGramAligner:spacy:summary:reference:spacy:summary:long_t5_tglobal_xl_booksum_sfsum/meta.yaml ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9a834ab33bb0405b065b9ca6ae9a57dbdef46609cd3079d68de2a4c45dda60a1
+ size 70

examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/NGramAligner:spacy:summary:reference:spacy:summary:long_t5_tglobal_xl_booksum_sfsum/state.dill ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0d4540a4caae3db3fd5922acb905088aa5ee4ae29ec3e06c8e2b397716c7106f
+ size 126

examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/StaticEmbeddingAligner:spacy:document:spacy:summary:long_t5_tglobal_xl_booksum/data.dill ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:51cc7fde06cea550f5f10749e1c7d534fe23ba6b635785e0c5af2595d6ba47e8
+ size 320091

examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/StaticEmbeddingAligner:spacy:document:spacy:summary:long_t5_tglobal_xl_booksum/meta.yaml ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9a834ab33bb0405b065b9ca6ae9a57dbdef46609cd3079d68de2a4c45dda60a1
+ size 70

examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/StaticEmbeddingAligner:spacy:document:spacy:summary:long_t5_tglobal_xl_booksum/state.dill ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0d4540a4caae3db3fd5922acb905088aa5ee4ae29ec3e06c8e2b397716c7106f
+ size 126

examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/StaticEmbeddingAligner:spacy:document:spacy:summary:long_t5_tglobal_xl_booksum_sfsum/data.dill ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bf7ca905febdfef1cd2f702125c2daa131af6fe6ff360bfc7121c6b595e44349
+ size 294430

examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/StaticEmbeddingAligner:spacy:document:spacy:summary:long_t5_tglobal_xl_booksum_sfsum/meta.yaml ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9a834ab33bb0405b065b9ca6ae9a57dbdef46609cd3079d68de2a4c45dda60a1
+ size 70

examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/StaticEmbeddingAligner:spacy:document:spacy:summary:long_t5_tglobal_xl_booksum_sfsum/state.dill ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0d4540a4caae3db3fd5922acb905088aa5ee4ae29ec3e06c8e2b397716c7106f
+ size 126

examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/StaticEmbeddingAligner:spacy:document:spacy:summary:reference/data.dill ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6f0cfe2f81a5e4fbc5e3bde2a436c4171507caa3a19ee2379458e27a9a033fbb
+ size 341548

examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/StaticEmbeddingAligner:spacy:document:spacy:summary:reference/meta.yaml ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9a834ab33bb0405b065b9ca6ae9a57dbdef46609cd3079d68de2a4c45dda60a1
+ size 70

examples/booksum/booksum_sf_sample_processed.cache/mgr/columns/StaticEmbeddingAligner:spacy:document:spacy:summary:reference/state.dill ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0d4540a4caae3db3fd5922acb905088aa5ee4ae29ec3e06c8e2b397716c7106f
+ size 126