SmartPy committed on
Commit dcfa2ec
1 Parent(s): c1fb6c4

Upload 6 files

Files changed (6)
  1. app.py +280 -0
  2. example.txt +1 -0
  3. requirements.txt +10 -0
  4. summarize.py +144 -0
  5. textrank.py +41 -0
  6. utils.py +121 -0
app.py ADDED
@@ -0,0 +1,280 @@
+ import os
+ import contextlib
+ import logging
+ import random
+ import re
+ import time
+ from pathlib import Path
+
+ import gradio as gr
+ import nltk
+ from cleantext import clean
+
+ from summarize import load_model_and_tokenizer, summarize_via_tokenbatches
+ from utils import load_example_filenames, truncate_word_count, saves_summary
+ from textrank import get_summary
+
+ example_path = "/content/drive/MyDrive/space/"
+ nltk.download("stopwords")  # TODO: find where this requirement originates from
+
+ logging.basicConfig(
+     level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+ )
+
+
+ def proc_submission(
+     input_text: str,
+     model_size: str,
+     num_beams,
+     token_batch_length,
+     length_penalty,
+     repetition_penalty,
+     no_repeat_ngram_size,
+     max_input_length: int = 1024,
+ ):
+
+     settings = {
+         "length_penalty": float(length_penalty),
+         "repetition_penalty": float(repetition_penalty),
+         "no_repeat_ngram_size": int(no_repeat_ngram_size),
+         "encoder_no_repeat_ngram_size": 4,
+         "num_beams": int(num_beams),
+         "min_length": 4,
+         "max_length": int(token_batch_length // 4),
+         "early_stopping": True,
+         "do_sample": False,
+     }
+     st = time.perf_counter()
+     history = {}
+     clean_text = clean(input_text, lower=False)
+     max_input_length = 1024 if "base" in model_size.lower() else max_input_length
+     clean_text = get_summary(clean_text)
+     processed = truncate_word_count(clean_text, max_input_length)
+
+     if processed["was_truncated"]:
+         tr_in = processed["truncated_text"]
+         # create elaborate HTML warning
+         input_wc = re.split(r"\s+", input_text)
+         msg = f"""
+         <div style="background-color: #FFA500; color: white; padding: 20px;">
+         <h3>Warning</h3>
+         <p>Input text was truncated to {max_input_length} words. That's about {100*max_input_length/len(input_wc):.2f}% of the submission.</p>
+         </div>
+         """
+         logging.warning(msg)
+         history["WARNING"] = msg
+     else:
+         tr_in = input_text
+         msg = None
+
+     if len(input_text) < 50:
+         # this is essentially a different case from the above
+         msg = f"""
+         <div style="background-color: #880808; color: white; padding: 20px;">
+         <h3>Warning</h3>
+         <p>Input text is too short to summarize. Detected {len(input_text)} characters.
+         Please load text by selecting an example from the dropdown menu or by pasting text into the text box.</p>
+         </div>
+         """
+         logging.warning(msg)
+         logging.warning("RETURNING EMPTY STRING")
+         history["WARNING"] = msg
+
+         return msg, "", "", None  # four values to match the Gradio outputs list
+
+     _summaries = summarize_via_tokenbatches(
+         tr_in,
+         model,
+         tokenizer,
+         batch_length=token_batch_length,
+         **settings,
+     )
+     sum_text = [f"Section {i}: " + s["summary"][0] for i, s in enumerate(_summaries)]
+     sum_scores = [
+         f" - Section {i}: {round(s['summary_score'],4)}"
+         for i, s in enumerate(_summaries)
+     ]
+
+     sum_text_out = "\n".join(sum_text)
+     history["Summary Scores"] = "<br><br>"
+     scores_out = "\n".join(sum_scores)
+     rt = round((time.perf_counter() - st) / 60, 2)
+     print(f"Runtime: {rt} minutes")
+     html = ""
+     html += f"<p>Runtime: {rt} minutes on CPU</p>"
+     if msg is not None:
+         html += msg
+
+     html += ""
+
+     # save to file
+     saved_file = saves_summary(_summaries)
+
+     return html, sum_text_out, scores_out, saved_file
+
+
+ def load_single_example_text(
+     example_path: str = "/content/example.txt",
+     max_pages=20,
+ ):
+     """
+     load_single_example_text - a helper function for the gradio module to load a single example file
+     Returns:
+         str, the text of the selected example
+     """
+     global name_to_path
+     full_ex_path = name_to_path[example_path]
+     full_ex_path = Path(full_ex_path)
+     if full_ex_path.suffix == ".txt":
+         with open(full_ex_path, "r", encoding="utf-8", errors="ignore") as f:
+             raw_text = f.read()
+         text = clean(raw_text, lower=False)
+     else:
+         logging.error(f"Unknown file type {full_ex_path.suffix}")
+         text = "ERROR - check example path"
+
+     return text
+
+ if __name__ == "__main__":
+     logging.info("Starting app instance")
+     os.environ[
+         "TOKENIZERS_PARALLELISM"
+     ] = "false"  # parallelism on tokenizers is buggy with gradio
+     logging.info("Loading summ models")
+     with contextlib.redirect_stdout(None):
+         model, tokenizer = load_model_and_tokenizer(
+             "SmartPy/bart-large-cnn-finetuned-scientific_summarize"
+         )
+
+     name_to_path = load_example_filenames(example_path)
+     logging.info(f"Loaded {len(name_to_path)} examples")
+     demo = gr.Blocks()
+     _examples = list(name_to_path.keys())
+     with demo:
+
+         gr.Markdown("# Document Summarization with Long-Document Transformers")
+         gr.Markdown(
+             "This is an example use case for fine-tuned long document transformers. The model is trained on Scientific Article summaries (via the Yale Scientific Article Summarization Dataset). The model in this demo is [Bart-large-cnn](https://huggingface.co/facebook/bart-large-cnn)."
+         )
+         with gr.Column():
+
+             gr.Markdown("## Load Inputs & Select Parameters")
+             gr.Markdown(
+                 "Enter text below in the text area. The text will be summarized [using the selected parameters](https://huggingface.co/blog/how-to-generate)."
+             )
+             with gr.Row(variant="compact"):
+                 with gr.Column(scale=0.5, variant="compact"):
+
+                     model_size = gr.Radio(
+                         choices=["bart-large-cnn"],
+                         label="Model Variant",
+                         value="bart-large-cnn",
+                     )
+                     num_beams = gr.Radio(
+                         choices=[2, 3, 4],
+                         label="Beam Search: # of Beams",
+                         value=2,
+                     )
+                 with gr.Column(variant="compact"):
+                     example_name = gr.Dropdown(
+                         _examples,
+                         label="Examples",
+                         value=random.choice(_examples),
+                     )
+
+             with gr.Row():
+                 input_text = gr.Textbox(
+                     lines=4,
+                     label="Input Text (for summarization)",
+                     placeholder="Enter text to summarize; it will be cleaned and truncated on Spaces. Narrative, academic (both papers and lecture transcriptions), and article text work well. Generation may take a while depending on the input length :)",
+                 )
+                 with gr.Column(min_width=100, scale=0.5):
+                     load_examples_button = gr.Button(
+                         "Load Example",
+                     )
+
+         with gr.Column():
+             gr.Markdown("## Generate Summary")
+             gr.Markdown(
+                 "Summarization should take ~1-2 minutes for most settings, but may extend up to 5-10 minutes in some scenarios."
+             )
+             summarize_button = gr.Button(
+                 "Summarize!",
+                 variant="primary",
+             )
+
+             output_text = gr.HTML("<p><em>Output will appear below:</em></p>")
+             gr.Markdown("### Summary Output")
+             summary_text = gr.Textbox(
+                 label="Summary", placeholder="The generated summary will appear here"
+             )
+             gr.Markdown(
+                 "The summary scores can be thought of as representing the quality of the summary. Less-negative numbers (closer to 0) are better:"
+             )
+             summary_scores = gr.Textbox(
+                 label="Summary Scores", placeholder="Summary scores will appear here"
+             )
+
+             text_file = gr.File(
+                 label="Download Summary as Text File",
+                 file_count="single",
+                 type="file",
+                 interactive=False,
+             )
+
+             gr.Markdown("---")
+         with gr.Column():
+             gr.Markdown("### Advanced Settings")
+             with gr.Row(variant="compact"):
+                 length_penalty = gr.Slider(
+                     minimum=0.5,
+                     maximum=1.0,
+                     label="length penalty",
+                     value=0.7,
+                     step=0.05,
+                 )
+                 token_batch_length = gr.Radio(
+                     choices=[512, 768, 1024, 1536],
+                     label="token batch length",
+                     value=1024,
+                 )
+
+             with gr.Row(variant="compact"):
+                 repetition_penalty = gr.Slider(
+                     minimum=1.0,
+                     maximum=5.0,
+                     label="repetition penalty",
+                     value=3.5,
+                     step=0.1,
+                 )
+                 no_repeat_ngram_size = gr.Radio(
+                     choices=[2, 3, 4],
+                     label="no repeat ngram size",
+                     value=3,
+                 )
+         with gr.Column():
+             gr.Markdown("### About the Model")
+             gr.Markdown(
+                 "This model is fine-tuned on the [1000 most cited papers in the ACL Anthology Network (AAN)](http://arxiv.org/pdf/1909.01716.pdf). The goal was to create a model that generalizes well and is useful for summarizing long text in academic and everyday use."
+             )
+             gr.Markdown("---")
+
+         load_examples_button.click(
+             fn=load_single_example_text, inputs=[example_name], outputs=[input_text]
+         )
+
+         summarize_button.click(
+             fn=proc_submission,
+             inputs=[
+                 input_text,
+                 model_size,
+                 num_beams,
+                 token_batch_length,
+                 length_penalty,
+                 repetition_penalty,
+                 no_repeat_ngram_size,
+             ],
+             outputs=[output_text, summary_text, summary_scores, text_file],
+         )
+
+     demo.launch(enable_queue=True, debug=True)
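For reference, the pre-processing chain inside proc_submission (clean, extractive TextRank pre-pass, word-count truncation) can be exercised without Gradio. A minimal sketch, assuming example.txt sits next to the scripts and using the default 1024-word limit:

    from pathlib import Path

    from cleantext import clean
    from textrank import get_summary
    from utils import truncate_word_count

    raw = Path("example.txt").read_text(encoding="utf-8")
    cleaned = clean(raw, lower=False)        # same cleaning call as proc_submission
    reduced = get_summary(cleaned)           # extractive pre-pass, ~1000 words by default
    processed = truncate_word_count(reduced, max_words=1024)
    print(processed["was_truncated"], len(processed["truncated_text"].split()))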
example.txt ADDED
@@ -0,0 +1 @@
+ Human evaluation machine translation ( MT ) weigh many aspect translation , include adequacy , fidelity , fluency translation ( Hovy , 1999 ; White O ’ Connell , 1994 ) . A comprehensive catalog MT evaluation technique rich literature give Reeder ( 2001 ) . For part , various human evaluation approach quite expensive ( Hovy , 1999 ) . Moreover , take week month finish . This big problem developer machine translation system need monitor effect daily change system order weed bad idea good ideas . We believe MT progress stem evaluation logjam fruitful research idea wait release 1So call method bilingual evaluation understudy , BLEU . evaluation bottleneck . Developers would benefit inexpensive automatic evaluation quick , language-independent , correlate highly human evaluation . We propose evaluation method paper . How one measure translation performance ? The close machine translation professional human translation , good . This central idea behind proposal . To judge quality machine translation , one measure closeness one reference human translation accord numerical metric . Thus , MT evaluation system require two ingredients : We fashion closeness metric highly successful word error rate metric use speech recognition community , appropriately modify multiple reference translation allow legitimate difference word choice word order . The main idea use weighted average variable length phrase match reference translations . This view give rise family metric use various weight schemes . We select promising baseline metric family . In Section 2 , describe baseline metric detail . In Section 3 , evaluate performance BLEU . In Section 4 , describe human evaluation experiment . In Section 5 , compare baseline metric performance human evaluations . Typically , many “ perfect ” translation give source sentence . These translation may vary word choice word order even use words . And yet human clearly distinguish good translation bad one . For example , consider two candidate translation forever hear activity guidebook party direct . Although appear subject , differ markedly quality . For comparison , provide three reference human translation sentence . guarantee military force always command Party . Reference 3 : It practical guide army always heed direction party . It clear good translation , Candidate 1 , share many word phrase three reference translations , Candidate 2 . We shortly quantify notion share Section 2 . 1 . But first observe Candidate 1 share & quot ; It guide action & quot ; Reference 1 , & quot ; & quot ; Reference 2 , & quot ; ensures military & quot ; Reference 1 , & quot ; always & quot ; References 2 3 , & quot ; commands & quot ; Reference 1 , finally & quot ; party & quot ; Reference 2 ( ignore capitalization ) . In contrast , Candidate 2 exhibit far matches , extent less . It clear program rank Candidate 1 high Candidate 2 simply compare ngram match candidate translation reference translations . Experiments large collection translation present Section 5 show ranking ability general phenomenon , artifact toy examples . The primary programming task BLEU implementor compare n-grams candidate n-grams reference translation count number matches . These match positionindependent . The matches , good candidate translation . For simplicity , first focus compute unigram matches . The cornerstone metric familiar precision measure . 
To compute precision , one simply count number candidate translation word ( unigrams ) occur reference translation divide total number word candidate translation . Unfortunately , MT system overgenerate “ reasonable ” words , result improbable , high-precision , translation like example 2 . Intuitively problem clear : reference word consider exhaust matching candidate word identified . We formalize intuition modified unigram precision . To compute , one first count maximum number time word occur single reference translation . Next , one clip total count candidate word maximum reference count , 2adds clip count , divide total ( unclipped ) number candidate words . In Example 1 , Candidate 1 achieve modified unigram precision 17/18 ; whereas Candidate 2 achieve modified unigram precision 8/14 . Similarly , modified unigram precision Example 2 2/7 , even though standard unigram precision 7/7 . Modified n-gram precision compute similarly n : candidate n-gram count corresponding maximum reference count collected . The candidate count clip corresponding reference maximum value , summed , divide total number candidate ngrams . In Example 1 , Candidate 1 achieve modified bigram precision 10/17 , whereas low quality Candidate 2 achieve modified bigram precision 1/13 . In Example 2 , ( implausible ) candidate achieve modified bigram precision 0 . This sort modified n-gram precision score capture two aspect translation : adequacy fluency . A translation use word ( 1-grams ) reference tend satisfy adequacy . The longer n-gram match account fluency . 4 2 . 1 . 1 Modified n-gram precision block text How compute modify n-gram precision multi-sentence test set ? Although one typically evaluate MT system corpus entire documents , basic unit evaluation sentence . A source sentence may translate many target sentences , case abuse terminology refer corresponding target sentence “ sentence . ” We first compute n-gram match sentence sentence . Next , add clipped n-gram count candidate sentence divide number candidate n-grams test corpus compute modified precision score , pn , entire test corpus . 4BLEU need match human judgment average test corpus ; score individual sentence often vary human judgments . For example , system produce fluent phrase “ East Asian economy ” penalize heavily longer n-gram precision reference happen read “ economy East Asia . ” The key BLEU ’ success system treat similarly multiple human translator different style used , effect cancel comparison systems . 2 . 1 . 2 Ranking system use modify n-gram precision To verify modify n-gram precision distinguishes good translation bad translations , compute modified precision number output ( good ) human translator standard ( poor ) machine translation system use 4 reference translation 127 source sentences . The average precision result show Figure 1 . The strong signal differentiate human ( high precision ) machine ( low precision ) striking . The difference becomes strong go unigram precision 4-gram precision . It appear single n-gram precision score distinguish good translation bad translation . To useful , however , metric must also reliably distinguish translation differ greatly quality . Furthermore , must distinguish two human translation differ quality . This latter requirement ensure continued validity metric MT approach human translation quality . To end , obtain human translation someone lack native proficiency source ( Chinese ) target language ( English ) . 
For comparison , acquire human translation document native English speaker . We also obtain machine translation three commercial systems . These five “ systems ” — two human three machine — score two reference professional human translations . The average modified n-gram precision result show Figure 2 . Each n-gram statistic imply Phrase ( n -gram ) Length ranking : H2 ( Human-2 ) good H1 ( Human1 ) , big drop quality H1 S3 ( Machine/System-3 ) . S3 appear good S2 turn appear good S1 . Remarkably , rank order assign “ systems ” human judges , discuss later . While seem ample signal single n-gram precision , robust combine signal single number metric . 2 . 1 . 3 Combining modify n-gram precision How combine modified precision various n-gram sizes ? A weight linear average modified precision result encouraging result 5 systems . However , see Figure 2 , modify n-gram precision decay roughly exponentially n : modified unigram precision much large modified bigram precision turn much big modified trigram precision . A reasonable averaging scheme must take exponential decay account ; weighted average logarithm modified precision satisifies requirement . BLEU use average logarithm uniform weights , equivalent use geometric mean modified n-gram precisions . 5 , 6 Experimentally , obtain best correlation monolingual human judgment use maximum n-gram order 4 , although 3-grams 5-grams give comparable results . A candidate translation neither long short , evaluation metric enforce . To extent , n-gram precision already accomplish . N-gram precision penalize spurious word candidate appear reference translations . Additionally , modify precision penalize word occur frequently candidate translation maximum reference count . This reward use word many time warranted penalize use word time occur references . However , modify n-gram precision alone fail enforce proper translation length , illustrate short , absurd example . Because candidate short compare proper length , one expect find inflated precisions : modified unigram precision 2/2 , modified bigram precision 1/1 . Traditionally , precision pair recall overcome length-related problems . However , BLEU consider multiple reference translations , may use different word choice translate source word . Furthermore , good candidate translation use ( recall ) one possible choices , . Indeed , recall choice lead bad translation . Here example . The first candidate recall word references , obviously poor translation second candidate . Thus , naive recall compute set reference word good measure . Admittedly , one could align reference translation discover synonymous word compute recall concept rather words . But , give reference translation vary length differ word order syntax , computation complicated . Candidate translation longer reference already penalize modified n-gram precision measure : need penalize . Consequently , introduce multiplicative brevity penalty factor . With brevity penalty place , high-scoring candidate translation must match reference translation length , word choice , word order . Note neither brevity penalty modified n-gram precision length effect directly consider source length ; instead , consider range reference translation length target language . We wish make brevity penalty 1 . 0 candidate ’ length reference translation ’ length . For example , three reference lengths 12 , 15 , 17 word candidate translation terse 12 words , want brevity penalty 1 . We call close reference sentence length “ best match length . 
” One consideration remains : compute brevity penalty sentence sentence average penalties , length deviation short sentence would punish harshly . Instead , compute brevity penalty entire corpus allow freedom sentence level . We first compute test corpus ’ effective reference length , r , sum best match length candidate sentence corpus . We choose brevity penalty decay exponential r/c , c total length candidate translation corpus . We take geometric mean test corpus ’ modify precision score multiply result exponential brevity penalty factor . Currently , case folding text normalization perform compute precision . We first compute geometric average modified n-gram precisions , pn , use n-grams length N positive weight wn sum one . Next , let c length candidate translation r effective reference corpus length . We compute brevity penalty BP , The ranking behavior immediately apparent log domain , log BLEU = min ( 1 − In baseline , use N = 4 uniform weight wn = 1/N . The BLEU metric range 0 1 . Few translation attain score 1 unless identical reference translation . For reason , even human translator necessarily score 1 . It important note reference translation per sentence , high score . Thus , one must cautious make even “ rough ” comparison evaluation different number reference translations : test corpus 500 sentence ( 40 general news stories ) , human translator score 0 . 3468 four reference score 0 . 2571 two references . Table 1 show BLEU score 5 system two reference test corpus . The MT system S2 S3 close metric . Hence , several question arise : To answer questions , divide test corpus 20 block 25 sentence , compute BLEU metric block individually . We thus 20 sample BLEU metric system . We compute means , variances , pair t-statistics display Table 2 . The t-statistic compare system left neighbor table . For example , = 6 pair S1 S2 . Note number Table 1 BLEU metric aggregate 500 sentences , mean Table 2 average BLEU metric aggregate 25 sentences . As expected , two set result close system differ small finite block size effects . Since paired t-statistic 1 . 7 95 % significant , difference systems ’ score statistically significant . The report variance 25-sentence block serve upper bound variance sizeable test set like 500 sentence corpus . How many reference translation need ? We simulate single-reference test corpus randomly select one 4 reference translation single reference 40 stories . In way , ensure degree stylistic variation . The system maintain rank order multiple references . This outcome suggest may use big test corpus single reference translation , provide translation translator . We two group human judges . The first group , call monolingual group , consist 10 native speaker English . The second group , call bilingual group , consist 10 native speaker Chinese live United States past several years . None human judge professional translator . The human judge 5 standard system Chinese sentence subset extract random 500 sentence test corpus . We pair source sentence 5 translations , total 250 pair Chinese source English translations . We prepare web page translation pair randomly order disperse five translation source sentence . All judge use webpage saw sentence pair order . They rat translation 1 ( bad ) 5 ( good ) . The monolingual group make judgment base translations ’ readability fluency . As must expected , judge liberal others . And sentence easy translate others . To account intrinsic difference judge sentences , compare judge ’ rating sentence across systems . 
We perform four pairwise t-test comparison adjacent system order aggregate average score . Figure 3 show mean difference score two consecutive system 95 % confidence interval mean . We see S2 quite bit well S1 ( mean opinion score difference 0 . 326 5-point scale ) , S3 judge little good ( 0 . 114 ) . Both difference significant 95 % level . 7 The human H1 much good best system , though bit bad human H2 . This surprising give H1 native speaker either Chinese English , whereas H2 native English speaker . Again , difference human translator significant beyond 95 % level . 5 BLEU vs The Human Evaluation Figure 5 show linear regression monolingual group score function BLEU score two reference translation 5 systems . The high correlation coefficient 0 . 99 indicates BLEU track human judgment well . Particularly interesting well BLEU distinguishes S2 S3 quite close . Figure 6 show comparable regression result bilingual group . The correlation coefficient 0 . 96 . We take bad system reference point compare BLEU score human judgment score remain system relative bad system . We take BLEU , monolingual group , bilingual group score 5 system linearly normalize corresponding range ( maximum minimum score across 5 systems ) . The normalized score show Figure 7 . This figure illustrate high correlation BLEU score monolingual group . Of particular interest accuracy BLEU ’ estimate small difference S2 S3 large difference S3 H1 . The figure also highlight relatively large gap MT system human translators . 8 In addition , surmise bilingual group forgive judge H1 relative H2 monolingual group find rather large difference fluency translations . We believe BLEU accelerate MT R & D cycle allow researcher rapidly home effective modeling ideas . Our belief reinforce recent statistical analysis BLEU ’ correlation human judgment translation English four quite different language ( Arabic , Chinese , French , Spanish ) represent 3 different language family ( Papineni et al. , 2002 ) ! BLEU ’ strength correlate highly human judg $ Crossing chasm Chinese-English translation appear significant challenge current state-of-the-art systems . ments average individual sentence judgment error test corpus rather attempt divine exact human judgment every sentence : quantity lead quality . Finally , since MT summarization view natural language generation textual context , believe BLEU could adapt evaluate summarization similar NLG tasks . Acknowledgments This work partially support Defense Advanced Research Projects Agency monitor SPAWAR contract No . N66001-99-2-8916 . The view finding contain material author necessarily reflect position policy Government official endorsement inferred . We gratefully acknowledge comment geometric mean John Makhoul BBN discussion George Doddington NIST . We especially wish thank colleague serve monolingual bilingual judge pool perseverance judge output ChineseEnglish MT systems .
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ clean-text[gpl]
+ gradio
+ natsort
+ nltk
+ torch
+ tqdm
+ transformers
+ accelerate
+ sentence_transformers
+ networkx<2.7
summarize.py ADDED
@@ -0,0 +1,144 @@
+ import logging
+
+ import torch
+ from tqdm.auto import tqdm
+ from transformers import BartForConditionalGeneration, BartTokenizer
+
+
+ def load_model_and_tokenizer(model_name):
+     """
+     load_model_and_tokenizer - a function that loads a model and tokenizer from huggingface
+     Args:
+         model_name (str): the name of the model to load
+     Returns:
+         BartForConditionalGeneration: the model
+         BartTokenizer: the tokenizer
+     """
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     model = BartForConditionalGeneration.from_pretrained(
+         model_name,
+         # low_cpu_mem_usage=True,
+         # use_cache=False,
+     ).to(device)
+     tokenizer = BartTokenizer.from_pretrained(model_name)
+
+     logging.info(f"Loaded model {model_name} to {device}")
+     return model, tokenizer
+
+
+ def summarize_and_score(
+     ids, mask, model, tokenizer, is_general_attention_model=True, **kwargs
+ ):
+     """
+     summarize_and_score - given a batch of ids and a mask, return a summary and a score for the summary
+     Args:
+         ids (torch.Tensor): the batch of input ids
+         mask (torch.Tensor): the attention mask for the batch
+         model: the model to use for summarization
+         tokenizer: the tokenizer to use for summarization
+         is_general_attention_model (bool, optional): whether the model is a general attention model. Defaults to True.
+     Returns:
+         tuple: (summary, score) - the decoded summary text and its beam-search sequence score
+     """
+
+     ids = ids[None, :]
+     mask = mask[None, :]
+
+     input_ids = ids.to("cuda") if torch.cuda.is_available() else ids
+     attention_mask = mask.to("cuda") if torch.cuda.is_available() else mask
+
+     global_attention_mask = torch.zeros_like(attention_mask)
+     # put global attention on <s> token
+     global_attention_mask[:, 0] = 1
+
+     if is_general_attention_model:
+         summary_pred_ids = model.generate(
+             input_ids,
+             attention_mask=attention_mask,
+             output_scores=True,
+             return_dict_in_generate=True,
+             **kwargs,
+         )
+     else:
+         summary_pred_ids = model.generate(
+             input_ids,
+             attention_mask=attention_mask,
+             global_attention_mask=global_attention_mask,
+             output_scores=True,
+             return_dict_in_generate=True,
+             **kwargs,
+         )
+     summary = tokenizer.batch_decode(
+         summary_pred_ids.sequences,
+         skip_special_tokens=True,
+         remove_invalid_values=True,
+     )
+     score = round(summary_pred_ids.sequences_scores.cpu().numpy()[0], 4)
+
+     return summary, score
+
+
+ def summarize_via_tokenbatches(
+     input_text: str,
+     model,
+     tokenizer,
+     batch_length=2048,
+     batch_stride=16,
+     **kwargs,
+ ):
+     """
+     summarize_via_tokenbatches - a function that takes a string and returns a summary
+     Args:
+         input_text (str): the text to summarize
+         model: the model to use for summarization
+         tokenizer: the tokenizer to use for summarization
+         batch_length (int, optional): the length of each batch. Defaults to 2048.
+         batch_stride (int, optional): the stride of each batch. Defaults to 16. The stride is the number of tokens that overlap between batches.
+     Returns:
+         list of dict: one dict per token batch, with keys "input_tokens", "summary", and "summary_score"
+     """
+     # enforce a minimum batch length and log all input parameters
+     if batch_length < 512:
+         batch_length = 512
+         print("WARNING: batch_length was set to 512")
+     print(
+         f"input parameters: {kwargs}, batch_length={batch_length}, batch_stride={batch_stride}"
+     )
+     encoded_input = tokenizer(
+         input_text,
+         padding="max_length",
+         truncation=True,
+         max_length=batch_length,
+         stride=batch_stride,
+         return_overflowing_tokens=True,
+         add_special_tokens=False,
+         return_tensors="pt",
+     )
+
+     in_id_arr, att_arr = encoded_input.input_ids, encoded_input.attention_mask
+     gen_summaries = []
+
+     pbar = tqdm(total=len(in_id_arr))
+
+     for _id, _mask in zip(in_id_arr, att_arr):
+
+         result, score = summarize_and_score(
+             ids=_id,
+             mask=_mask,
+             model=model,
+             tokenizer=tokenizer,
+             **kwargs,
+         )
+         score = round(float(score), 4)
+         _sum = {
+             "input_tokens": _id,
+             "summary": result,
+             "summary_score": score,
+         }
+         gen_summaries.append(_sum)
+         print(f"\t{result[0]}\nScore:\t{score}")
+         pbar.update()
+
+     pbar.close()
+
+     return gen_summaries
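For reference, a minimal sketch of driving summarize.py on its own; the checkpoint name is the one app.py loads, and the generation kwargs shown are illustrative values rather than required settings:

    from summarize import load_model_and_tokenizer, summarize_via_tokenbatches

    model, tokenizer = load_model_and_tokenizer(
        "SmartPy/bart-large-cnn-finetuned-scientific_summarize"
    )
    summaries = summarize_via_tokenbatches(
        "Some long document text ...",  # replace with real input
        model,
        tokenizer,
        batch_length=1024,
        num_beams=2,  # beam search is needed for sequences_scores
        min_length=4,
        max_length=256,
    )
    for s in summaries:
        print(s["summary"][0], s["summary_score"])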
textrank.py ADDED
@@ -0,0 +1,41 @@
+ import numpy as np
+ import pandas as pd
+ import nltk
+ nltk.download('punkt')  # one time execution
+ import re
+ import warnings
+ warnings.filterwarnings('ignore')
+ from sklearn.metrics.pairwise import cosine_similarity
+ import networkx as nx
+ from tqdm import tqdm
+
+ from sentence_transformers import SentenceTransformer
+
+ model = SentenceTransformer('all-mpnet-base-v2')  # auto-selects GPU when available, falls back to CPU
+
+ def get_summary(text, num_words: int = 1000):
+     sentences = nltk.sent_tokenize(text)
+     embeddings = model.encode(sentences, show_progress_bar=False)
+     try:
+         sim_matrix = cosine_similarity(embeddings)
+     except Exception as e:
+         print(e, type(e))
+         print(embeddings.shape)
+         raise  # sim_matrix would be undefined below, so do not continue
+     nx_graph = nx.from_numpy_array(sim_matrix)
+     scores = nx.pagerank(nx_graph)
+
+     ranked_sentences = sorted(((scores[i], s, i) for i, s in enumerate(sentences)), reverse=True)
+     final_sents = []
+     total_length = 0
+     for score, sents, i in ranked_sentences:
+         total_length += len(sents.split())
+         if total_length < num_words:
+             final_sents.append((score, sents, i))
+         else:
+             break
+
+     top_k_sents = sorted(final_sents, key=lambda x: x[2])
+     sents = " ".join([s[1] for s in top_k_sents])
+
+     return sents
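A minimal usage sketch with toy input; num_words caps how many words the extractive summary keeps:

    from textrank import get_summary

    doc = (
        "Transformers process long documents in fixed-size windows. "
        "Extractive methods pick the most central sentences first. "
        "TextRank scores sentences with PageRank over a sentence-similarity graph."
    )
    print(get_summary(doc, num_words=20))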
utils.py ADDED
@@ -0,0 +1,121 @@
+ """
+ utils.py - Utility functions for the project.
+ """
+
+ import re
+ from pathlib import Path
+ from datetime import datetime
+ from natsort import natsorted
+ import subprocess
+
+
+ def get_timestamp() -> str:
+     """
+     get_timestamp - get a timestamp for the current time
+     Returns:
+         str, the timestamp
+     """
+     return datetime.now().strftime("%Y%m%d_%H%M%S")
+
+
+ def truncate_word_count(text, max_words=512):
+     """
+     truncate_word_count - a helper function for the gradio module
+     Parameters
+     ----------
+     text : str, required, the text to be processed
+     max_words : int, optional, the maximum number of words, default=512
+     Returns
+     -------
+     dict, the text and whether it was truncated
+     """
+     # split on whitespace with regex
+     words = re.split(r"\s+", text)
+     processed = {}
+     if len(words) > max_words:
+         processed["was_truncated"] = True
+         processed["truncated_text"] = " ".join(words[:max_words])
+     else:
+         processed["was_truncated"] = False
+         processed["truncated_text"] = text
+     return processed
+
+
+ def load_examples(src, filetypes=(".txt", ".pdf")):
+     """
+     load_examples - a helper function for the gradio module to load examples
+     Returns:
+         list of lists, one [text, model size, beams, batch length, length penalty, repetition penalty, ngram size] entry per example
+     """
+     src = Path(src)
+     src.mkdir(exist_ok=True)
+
+     pdf_url = (
+         "https://www.dropbox.com/s/y92xy7o5qb88yij/all_you_need_is_attention.pdf?dl=1"
+     )
+     subprocess.run(["wget", pdf_url, "-O", src / "all_you_need_is_attention.pdf"])
+     examples = [f for f in src.iterdir() if f.suffix in filetypes]
+     examples = natsorted(examples)
+     # load the examples into a list
+     text_examples = []
+     for example in examples:
+         with open(example, "r", encoding="utf-8", errors="ignore") as f:
+             text = f.read()
+         text_examples.append([text, "base", 2, 1024, 0.7, 3.5, 3])
+
+     return text_examples
+
+
+ def load_example_filenames(example_path):
+     """
+     load_example_filenames - a helper function for the gradio module to load examples
+     Returns:
+         dict, the examples (filename:full path)
+     """
+     example_path = Path(example_path)
+     # load the examples into a list
+     examples = {f.name: f for f in example_path.glob("*.txt")}
+     return examples
+
+
+ def saves_summary(summarize_output, outpath=None, add_signature=True):
+     """
+     saves_summary - save the summary generated from summarize_via_tokenbatches() to a text file
+         _summaries = summarize_via_tokenbatches(
+             text,
+             batch_length=token_batch_length,
+             batch_stride=batch_stride,
+             **settings,
+         )
+     """
+
+     outpath = (
+         Path.cwd() / f"document_summary_{get_timestamp()}.txt"
+         if outpath is None
+         else Path(outpath)
+     )
+     sum_text = [s["summary"][0] for s in summarize_output]
+     sum_scores = [f"\n - {round(s['summary_score'],4)}" for s in summarize_output]
+     scores_text = "\n".join(sum_scores)
+     full_summary = "\n\t".join(sum_text)
+
+     with open(
+         outpath,
+         "w",
+     ) as fo:
+         if add_signature:
+             fo.write(
+                 "Generated with the Document Summarization space :) https://hf.co/spaces/pszemraj/document-summarization\n\n"
+             )
+         fo.writelines(full_summary)
+     with open(
+         outpath,
+         "a",
+     ) as fo:
+
+         fo.write("\n" * 3)
+         fo.write("\n\nSection Scores:\n")
+         fo.writelines(scores_text)
+         fo.write("\n\n---\n")
+
+     return outpath
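A minimal sketch of saves_summary with a hand-built stand-in for summarize_via_tokenbatches output; fake_output and demo_summary.txt are illustrative names, and the dict keys match what summarize.py produces:

    from utils import saves_summary

    fake_output = [
        {"summary": ["First section summary."], "summary_score": -0.1234},
        {"summary": ["Second section summary."], "summary_score": -0.2345},
    ]
    print("wrote:", saves_summary(fake_output, outpath="demo_summary.txt"))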