Blaise-g committed
Commit 09c590a • 1 Parent(s): 10d5b39

Update app.py

Files changed (1)
  1. app.py +40 -79
app.py CHANGED
@@ -17,18 +17,17 @@ logging.basicConfig(
     level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
 )
 
+
 def proc_submission(
     input_text: str,
-    summary_type: str,
-    model_type: str,
+    model_size: str,
     num_beams,
     token_batch_length,
     length_penalty,
-    max_input_length: int = 2000,
+    max_input_length: int = 768,
 ):
     """
     proc_submission - a helper function for the gradio module to process submissions
-
     Args:
         input_text (str): the input text to summarize
         model_size (str): the size of the model to use
@@ -38,93 +37,62 @@ def proc_submission(
         repetition_penalty (float): the repetition penalty to use
         no_repeat_ngram_size (int): the no repeat ngram size to use
         max_input_length (int, optional): the maximum input length to use. Defaults to 768.
-
     Returns:
-        str in HTML format, string of the summary, str of compression rate in %
+        str in HTML format, string of the summary, str of score
     """
 
-    settings_tldr = {
-        "length_penalty": float(length_penalty),#0.6,
-        "repetition_penalty": 3.5,
-        "no_repeat_ngram_size": 3,
-        "encoder_no_repeat_ngram_size": 4,
-        "num_beams": int(num_beams),
-        "min_length": 11,
-        "max_length": 62,
-        "early_stopping": True,
-    }
-    settings_det = {
-        "length_penalty": float(length_penalty),#2.0 if (model_type == "LED") else 0.8,
+    settings = {
+        "length_penalty": float(length_penalty),
         "repetition_penalty": 3.5,
         "no_repeat_ngram_size": 3,
         "encoder_no_repeat_ngram_size": 4,
         "num_beams": int(num_beams),
-        "min_length": 100,
-        "max_length": 512,#int(token_batch_length // 4) if (token_batch_length <500) else 512,
+        "min_length": 4,
+        "max_length": int(token_batch_length // 4),
         "early_stopping": True,
+        "do_sample": False,
     }
     st = time.perf_counter()
     history = {}
     clean_text = clean(input_text, lower=False)
-    #max_input_length = 2048 if model_type == "tldr" else max_input_length
+    max_input_length = 2048 if model_size == "tldr" else max_input_length
    processed = truncate_word_count(clean_text, max_input_length)
 
     if processed["was_truncated"]:
         tr_in = processed["truncated_text"]
-        msg = f"Input text was truncated to {max_input_length} words to fit within computational constraints"
+        msg = f"Input text was truncated to {max_input_length} words to fit within the computational constraints"
         logging.warning(msg)
         history["WARNING"] = msg
     else:
         tr_in = input_text
         msg = None
-
-    #if (summary_type == "TLDR"):
-        #_summaries = summarize_via_tokenbatches(
-            #tr_in,
-            #model_led_tldr if (model_type == "LED") else model_tldr,
-            #tokenizer_led_tldr if (model_type == "LED") else tokenizer_tldr,
-            #batch_length=token_batch_length,
-            #**settings_tldr,
-        #)
-
-    #else:
-        #_summaries = summarize_via_tokenbatches(
-            #tr_in,
-            #model_led_det if (model_type == "LED") else model_det,
-            #tokenizer_led_det if (model_type == "LED") else tokenizer_det,
-            #batch_length=token_batch_length,
-            #**settings_det,
-        #)
-
-    settings = settings_tldr if summary_type == 'tldr' else settings_det
-
+
     _summaries = summarize_via_tokenbatches(
         tr_in,
-        model_tldr if (summary_type == "tldr") else model_det,
-        tokenizer_tldr if (summary_type == "tldr") else tokenizer_det,
+        model_sm if model_size == "tldr" else model,
+        tokenizer_sm if model_size == "tldr" else tokenizer,
         batch_length=token_batch_length,
         **settings,
     )
-
     sum_text = [f"Section {i}: " + s["summary"][0] for i, s in enumerate(_summaries)]
-    compression_rate = [
-        f" - Section {i}: {round(s['compression_rate'],3)}"
+    sum_scores = [
+        f" - Section {i}: {round(s['summary_score'],4)}"
         for i, s in enumerate(_summaries)
     ]
 
     sum_text_out = "\n".join(sum_text)
-    history["compression_rate"] = "<br><br>"
-    rate_out = "\n".join(compression_rate)
+    history["Summary Scores"] = "<br><br>"
+    scores_out = "\n".join(sum_scores)
     rt = round((time.perf_counter() - st) / 60, 2)
     print(f"Runtime: {rt} minutes")
     html = ""
-    html += f"<p>Runtime: {rt} minutes on CPU inference</p>"
+    html += f"<p>Runtime: {rt} minutes on CPU</p>"
     if msg is not None:
         html += f"<h2>WARNING:</h2><hr><b>{msg}</b><br><br>"
 
     html += ""
 
-    return html, sum_text_out, rate_out
+    return html, sum_text_out, scores_out
 
 
 def load_single_example_text(
@@ -172,10 +140,8 @@ def load_uploaded_file(file_obj):
 
 if __name__ == "__main__":
 
-    model_det, tokenizer_det = load_model_and_tokenizer("Blaise-g/longt5_tglobal_large_sumpubmed")
-    model_tldr, tokenizer_tldr = load_model_and_tokenizer("Blaise-g/longt5_tglobal_large_scitldr")
-    #model_led_det, tokenizer_led_det = #load_model_and_tokenizer("Blaise-g/led_pubmed_sumpubmed_1")
-    #model_led_tldr, tokenizer_led_tldr = load_model_and_tokenizer("Blaise-g/led_large_sumpbumed_scitldr")
+    model, tokenizer = load_model_and_tokenizer("Blaise-g/longt5_tglobal_large_sumpubmed")
+    model_sm, tokenizer_sm = load_model_and_tokenizer("Blaise-g/longt5_tglobal_large_scitldr")
 
     name_to_path = load_example_filenames(_here / "examples")
     logging.info(f"Loaded {len(name_to_path)} examples")
@@ -185,20 +151,17 @@ if __name__ == "__main__":
 
         gr.Markdown("# Automatic summarization of biomedical research papers with neural abstractive methods into a long and comprehensive synopsis or extreme TLDR summary version")
         gr.Markdown(
-            "A rather simple demo (developed for my Master Thesis project) using an ad-hoc fine-tuned LongT5 or LED model to summarize long biomedical articles (or any scientific text related to the biomedical domain) into a detailed, explanatory synopsis or extreme TLDR summary."
+            "A rather simple demo developed for my Master Thesis project using ad-hoc fine-tuned abstractive summarization models to summarize long biomedical articles (or any scientific text related to the biomedical domain) into a detailed, explanatory synopsis or extreme TLDR summary."
         )
         with gr.Column():
 
-            gr.Markdown("### Load Text Inputs, Select Model & Summary Type")
+            gr.Markdown("### Select Summary type and text generation parameters then load input text")
            gr.Markdown(
-                "Enter text below in the text area. The text will be summarized [using the selected text generation parameters](https://huggingface.co/blog/how-to-generate). Optionally load an available example below or upload a file."
+                "Enter text below in the text area. The text will be summarized [using the selected parameters](https://huggingface.co/blog/how-to-generate). Optionally load an example below or upload a file."
            )
            with gr.Row():
-                summary_type = gr.Radio(
-                    choices=["tldr", "detailed"], label="Summary Type", value="tldr"
-                )
-                model_type = gr.Radio(
-                    choices=["LongT5", "LED"], label="Model Architecture", value="LongT5"
+                model_size = gr.Radio(
+                    choices=["tldr", "detailed"], label="Summary type", value="detailed"
                 )
                 num_beams = gr.Radio(
                     choices=[2, 3, 4],
@@ -206,7 +169,7 @@ if __name__ == "__main__":
                     value=2,
                 )
             gr.Markdown(
-                "_The LED model is less performant than the LongT5 model, but it's smaller in terms of size and therefore all other parameters being equal allows for a longer input sequence._"
+                "_The base model is less performant than the large model, but is faster and will accept up to 2048 words per input (Large model accepts up to 768)._"
             )
             with gr.Row():
                 length_penalty = gr.inputs.Slider(
@@ -219,9 +182,8 @@ if __name__ == "__main__":
                 token_batch_length = gr.Radio(
                     choices=[512, 768, 1024],
                     label="token batch length",
-                    value=1024,
+                    value=512,
                 )
-
             with gr.Row():
                 example_name = gr.Dropdown(
                     list(name_to_path.keys()),
@@ -233,7 +195,7 @@ if __name__ == "__main__":
             input_text = gr.Textbox(
                 lines=6,
                 label="Input Text (for summarization)",
-                placeholder="Enter any scientific text to be condensed into a long and comprehensive digested format or an extreme TLDR summary version, the text will be preprocessed and truncated if necessary to fit within the computational constraints. The models were trained to handle long scientific papers but generalize reasonably well also to shorter text documents like abstracts with an appropriate. Might take a while to produce long summaries :)",
+                placeholder="Enter text to summarize, the text will be cleaned and truncated on Spaces. Narrative, academic (both papers and lecture transcription), and article text work well. May take a bit to generate depending on the input text :)",
             )
             gr.Markdown("Upload your own file:")
             with gr.Row():
@@ -249,7 +211,7 @@ if __name__ == "__main__":
         with gr.Column():
             gr.Markdown("## Generate Summary")
             gr.Markdown(
-                "Summary generation should take approximately less than 2 minutes for most settings."
+                "Summary generation should take approximately 1-2 minutes for most settings."
            )
             summarize_button = gr.Button(
                 "Summarize!",
@@ -259,24 +221,24 @@
         output_text = gr.HTML("<p><em>Output will appear below:</em></p>")
         gr.Markdown("### Summary Output")
         summary_text = gr.Textbox(
-            label="Summary 📝", placeholder="The generated 📝 will appear here"
+            label="Summary", placeholder="The generated summary will appear here"
        )
         gr.Markdown(
-            "The compression rate indicates the ratio between the machine-generated summary length and the input text (from 0% to 100%). The higher the compression rate the more extreme the summary is."
+            "The summary scores can be thought of as representing the quality of the summary. less-negative numbers (closer to 0) are better:"
        )
-        compression_rate = gr.Textbox(
-            label="Compression rate 🗜", placeholder="The 🗜 will appear here"
+        summary_scores = gr.Textbox(
+            label="Summary Scores", placeholder="Summary scores will appear here"
        )
 
         gr.Markdown("---")
 
         with gr.Column():
-            gr.Markdown("## About the Models")
+            gr.Markdown("## About the Model")
             gr.Markdown(
                 "- [Blaise-g/longt5_tglobal_large_sumpubmed](https://huggingface.co/Blaise-g/longt5_tglobal_large_sumpubmed) is a fine-tuned checkpoint of [Stancld/longt5-tglobal-large-16384-pubmed-3k_steps](https://huggingface.co/Stancld/longt5-tglobal-large-16384-pubmed-3k_steps) on the [SumPubMed dataset](https://aclanthology.org/2021.acl-srw.30/). [Blaise-g/longt5_tglobal_large_scitldr](https://huggingface.co/Blaise-g/longt5_tglobal_large_scitldr) is a fine-tuned checkpoint of [Blaise-g/longt5_tglobal_large_sumpubmed](https://huggingface.co/Blaise-g/longt5_tglobal_large_sumpubmed) on the [Scitldr dataset](https://arxiv.org/abs/2004.15011). The goal was to create two models capable of handling the complex information contained in long biomedical documents and subsequently producing scientific summaries according to one of the two possible levels of conciseness: 1) A long explanatory synopsis that retains the majority of domain-specific language used in the original source text. 2)A one sentence long, TLDR style summary."
             )
             gr.Markdown(
-                "- The two most important parameters-empirically-are the `num_beams` and `token_batch_length`. However, increasing these will also increase the amount of time it takes to generate a summary."
+                "- The two most important text generation parameters are the `num_beams` and `token_batch_length`. However, increasing them will also increase the amount of execution time needed to generate a summary."
             )
             gr.Markdown("---")
 
@@ -292,13 +254,12 @@ if __name__ == "__main__":
             fn=proc_submission,
             inputs=[
                 input_text,
-                summary_type,
-                model_type,
-                length_penalty,
+                model_size,
                 num_beams,
                 token_batch_length,
+                length_penalty,
             ],
-            outputs=[output_text, summary_text, compression_rate],
+            outputs=[output_text, summary_text, summary_scores],
         )
 
     demo.launch(enable_queue=True, share=False)
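
For readers skimming the diff: the consolidated `settings` dict is made up of standard Hugging Face `generate()` keyword arguments, and the new per-section `summary_score` values surfaced in the UI are consistent with beam-search `sequences_scores`, i.e. length-penalized log-likelihoods that are at most 0, which is why the app text says less-negative numbers (closer to 0) are better. A minimal sketch of that mechanism, not code from this commit, with `t5-small` as a placeholder for the app's fine-tuned LongT5 checkpoints:

# Illustrative sketch only; t5-small stands in for the app's actual models.
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

token_batch_length = 512
settings = {
    "length_penalty": 1.0,
    "repetition_penalty": 3.5,
    "no_repeat_ngram_size": 3,
    "encoder_no_repeat_ngram_size": 4,
    "num_beams": 2,
    "min_length": 4,
    "max_length": int(token_batch_length // 4),  # mirrors the commit's setting
    "early_stopping": True,
    "do_sample": False,
}

inputs = tokenizer(
    "summarize: " + "some long biomedical text " * 200,
    truncation=True,
    max_length=token_batch_length,
    return_tensors="pt",
)
# With num_beams > 1, return_dict_in_generate + output_scores exposes
# sequences_scores: the length-penalized log-likelihood of the winning beam.
out = model.generate(
    **inputs, **settings, return_dict_in_generate=True, output_scores=True
)
summary = tokenizer.decode(out.sequences[0], skip_special_tokens=True)
print(summary, round(float(out.sequences_scores[0]), 4))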
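
The diff also calls `summarize_via_tokenbatches`, whose definition lives elsewhere in the repo and is not shown here. A hypothetical reconstruction of the batching idea its name and call signature imply, matching the `s["summary"][0]` / `s["summary_score"]` access pattern in `proc_submission` (assumptions: token ids are split into fixed-size windows, and beam search is used so `sequences_scores` is populated):

import torch

# Hypothetical sketch of the repo's summarize_via_tokenbatches helper, not its
# real definition: tokenize the full input, split the ids into windows of
# batch_length tokens, and summarize each window as its own "Section".
def summarize_via_tokenbatches_sketch(
    text, model, tokenizer, batch_length=512, **generate_kwargs
):
    ids = tokenizer(text, truncation=False, return_tensors="pt").input_ids[0]
    results = []
    for i in range(0, len(ids), batch_length):
        window = ids[i : i + batch_length].unsqueeze(0)  # one token batch
        with torch.no_grad():
            out = model.generate(
                window,
                return_dict_in_generate=True,
                output_scores=True,
                **generate_kwargs,  # assumes num_beams > 1 for sequences_scores
            )
        results.append(
            {
                "summary": [tokenizer.decode(out.sequences[0], skip_special_tokens=True)],
                "summary_score": float(out.sequences_scores[0]),
            }
        )
    return results

Each window is summarized independently, which is why the app labels its outputs "Section 0", "Section 1", and so on.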