pszemraj committed on
Commit
7e0dde7
•
1 Parent(s): 62a2921

📝 💄 improve docs and UI

Browse files

Signed-off-by: peter szemraj <peterszemraj@gmail.com>

Files changed (3) hide show
  1. aggregate.py +6 -3
  2. app.py +61 -39
  3. utils.py +23 -21
aggregate.py CHANGED
@@ -1,5 +1,5 @@
1
  """
2
- aggregate.py is a module for aggregating text from multiple sources, or multiple parts of a single source.
3
  Primary usage is through the BatchAggregator class.
4
 
5
  How it works:
@@ -29,7 +29,8 @@ class BatchAggregator:
29
  Usage:
30
  >>> from aggregate import BatchAggregator
31
  >>> aggregator = BatchAggregator()
32
- >>> aggregator.aggregate(["This is a test", "This is another test"])
 
33
  """
34
 
35
  GENERIC_CONFIG = GenerationConfig(
@@ -187,7 +188,7 @@ class BatchAggregator:
187
  **kwargs,
188
  ) -> str:
189
  f"""
190
- Generate a summary of the specified texts.
191
 
192
  Args:
193
  text_list (list): The texts to summarize.
@@ -211,6 +212,8 @@ class BatchAggregator:
211
  self.logger.info(
212
  f"Input tokens:\t{self.count_tokens(prompt)}. Output tokens:\t{self.count_tokens(result)}"
213
  )
 
 
214
  return result
215
 
216
  def count_tokens(self, text: str) -> int:
 
1
  """
2
+ aggregate.py - module for aggregating text from multiple sources/multiple parts of a single source.
3
  Primary usage is through the BatchAggregator class.
4
 
5
  How it works:
 
29
  Usage:
30
  >>> from aggregate import BatchAggregator
31
  >>> aggregator = BatchAggregator()
32
+ >>> agg = aggregator.infer_aggregate(["This is a test", "This is another test"])
33
+ >>> print(agg)
34
  """
35
 
36
  GENERIC_CONFIG = GenerationConfig(
 
188
  **kwargs,
189
  ) -> str:
190
  f"""
191
+ infer_aggregate - infers a consolidated summary from a list of texts.
192
 
193
  Args:
194
  text_list (list): The texts to summarize.
 
212
  self.logger.info(
213
  f"Input tokens:\t{self.count_tokens(prompt)}. Output tokens:\t{self.count_tokens(result)}"
214
  )
215
+ self.logger.debug(f"Generated text:\n{result}")
216
+
217
  return result
218
 
219
  def count_tokens(self, text: str) -> int:
app.py CHANGED
@@ -2,7 +2,7 @@
2
  app.py - the main module for the gradio app for summarization
3
 
4
  Usage:
5
- python app.py
6
 
7
  Environment Variables:
8
  USE_TORCH (str): whether to use torch (1) or not (0)
@@ -20,7 +20,6 @@ import random
20
  import re
21
  import time
22
  from pathlib import Path
23
- import pprint as pp
24
 
25
  os.environ["USE_TORCH"] = "1"
26
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
@@ -33,18 +32,19 @@ logging.basicConfig(
33
  import gradio as gr
34
  import nltk
35
  import torch
36
- from aggregate import BatchAggregator
37
  from cleantext import clean
38
  from doctr.models import ocr_predictor
 
 
39
  from pdf2text import convert_PDF_to_Text
40
  from summarize import load_model_and_tokenizer, summarize_via_tokenbatches
41
  from utils import (
42
  extract_batches,
43
  load_example_filenames,
 
44
  saves_summary,
45
  textlist2html,
46
  truncate_word_count,
47
- remove_stagnant_files,
48
  )
49
 
50
  _here = Path(__file__).parent
@@ -62,12 +62,13 @@ MODEL_OPTIONS = [
62
  ] # models users can choose from
63
 
64
  SUMMARY_PLACEHOLDER = "<p><em>Output will appear below:</em></p>"
 
65
 
66
- # if duplicating space,, uncomment this line to adjust the max words
67
  # os.environ["APP_MAX_WORDS"] = str(2048) # set the max words to 2048
68
  # os.environ["APP_OCR_MAX_PAGES"] = str(40) # set the max pages to 40
69
 
70
- aggregator = BatchAggregator("MBZUAI/LaMini-Flan-T5-783M")
71
 
72
 
73
  def aggregate_text(
@@ -77,8 +78,8 @@ def aggregate_text(
77
  """
78
  Aggregate the text from the batches.
79
 
80
- NOTE: you should probably include passing the BatchAggregator object as a parameter if using this code
81
- outside of this file.
82
  :param batches_html: The batches to aggregate, in html format
83
  :param text_file: The text file to append the aggregate summary to
84
  :return: The aggregate summary in html format
@@ -104,13 +105,13 @@ def aggregate_text(
104
  content_batches = [batch["content"] for batch in extracted_batches]
105
  full_summary = aggregator.infer_aggregate(content_batches)
106
 
107
- # if a path that exists is provided, save the summary with markdown formatting
108
  if out_path:
109
  out_path = Path(out_path)
110
 
111
  try:
112
  with open(out_path, "a", encoding="utf-8") as f:
113
- f.write("\n\n### Aggregate Summary\n\n")
114
  f.write(
115
  "- This is an instruction-based LLM aggregation of the previous 'summary batches'.\n"
116
  )
@@ -341,9 +342,9 @@ def load_uploaded_file(file_obj, max_pages: int = 20, lower: bool = False) -> st
341
  raw_text = f.read()
342
  text = clean(raw_text, lower=lower)
343
  elif file_path.suffix == ".pdf":
344
- logger.info(f"loading as PDF file {file_path}")
345
  max_pages = int(os.environ.get("APP_MAX_PAGES", max_pages))
346
- logger.info(f"max_pages set to: {max_pages}")
347
  conversion_stats = convert_PDF_to_Text(
348
  file_path,
349
  ocr_model=ocr_model,
@@ -357,13 +358,15 @@ def load_uploaded_file(file_obj, max_pages: int = 20, lower: bool = False) -> st
357
  return text
358
  except Exception as e:
359
  logger.error(f"Trying to load file:\t{file_path},\nerror:\t{e}")
360
- return "Error: Could not read file. Ensure that it is a valid text file with encoding UTF-8 if text, and a PDF if PDF."
361
 
362
 
363
  def parse_args():
 
364
  parser = argparse.ArgumentParser(
365
- description="Document Summarization with Long-Document Transformers",
366
  formatter_class=argparse.ArgumentDefaultsHelpFormatter,
 
367
  )
368
  parser.add_argument(
369
  "--share",
@@ -415,7 +418,7 @@ if __name__ == "__main__":
415
  with demo:
416
  gr.Markdown("# Document Summarization with Long-Document Transformers")
417
  gr.Markdown(
418
- "An example use case for fine-tuned long document transformers. Model(s) are trained on [book summaries](https://huggingface.co/datasets/kmfoda/booksum). Architectures in this demo are [LongT5-base](https://huggingface.co/pszemraj/long-t5-tglobal-base-16384-book-summary) and [Pegasus-X-Large](https://huggingface.co/pszemraj/pegasus-x-large-book-summary)."
419
  )
420
  with gr.Column():
421
  gr.Markdown("## Load Inputs & Select Parameters")
@@ -440,7 +443,7 @@ if __name__ == "__main__":
440
  load_examples_button = gr.Button(
441
  "Load Example in Dropdown",
442
  )
443
- load_file_button = gr.Button("Load an Uploaded File")
444
  with gr.Column(variant="compact"):
445
  example_name = gr.Dropdown(
446
  _examples,
@@ -457,22 +460,23 @@ if __name__ == "__main__":
457
  input_text = gr.Textbox(
458
  lines=4,
459
  max_lines=12,
460
- label="Input Text (for summarization)",
461
  placeholder="Enter text to summarize, the text will be cleaned and truncated on Spaces. Narrative, academic (both papers and lecture transcription), and article text work well. May take a bit to generate depending on the input text :)",
462
  )
463
  gr.Markdown("---")
464
  with gr.Column():
465
  gr.Markdown("## Generate Summary")
466
- gr.Markdown(
467
- "_Summarization should take ~1-2 minutes for most settings, but may extend up to 5-10 minutes in some scenarios._"
468
- )
469
- summarize_button = gr.Button(
470
- "Summarize!",
471
- variant="primary",
472
- ) # TODO: collapse button to be on same line as something else
 
473
  output_text = gr.HTML("<p><em>Output will appear below:</em></p>")
474
  with gr.Column():
475
- gr.Markdown("#### Results & Scores")
476
  with gr.Row():
477
  with gr.Column(variant="compact"):
478
  gr.Markdown(
@@ -486,24 +490,42 @@ if __name__ == "__main__":
486
  )
487
  with gr.Column(variant="compact"):
488
  gr.Markdown(
489
- "Scores represent the summary quality **roughly** as a measure of the model's 'confidence'. less-negative numbers (closer to 0) are better."
490
  )
491
  summary_scores = gr.Textbox(
492
  label="Summary Scores",
493
  placeholder="Summary scores will appear here",
494
  )
495
- with gr.Column():
496
- gr.Markdown("#### **Summary Output**")
497
  summary_text = gr.HTML(
498
- label="Summary", value="<i>Summary will appear here!</i>"
 
499
  )
500
  with gr.Column():
501
- gr.Markdown("##### **Aggregate Summary Batches**")
502
- aggregate_button = gr.Button(
503
- "Aggregate!",
504
- variant="primary",
505
- ) # TODO: collapse button to be on same line as something else
506
- aggregated_summary = gr.HTML(label="Aggregate Summary", value="")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
507
 
508
  gr.Markdown("---")
509
  with gr.Column():
@@ -539,15 +561,15 @@ if __name__ == "__main__":
539
  value=3,
540
  )
541
  with gr.Column():
542
- gr.Markdown("### About")
543
  gr.Markdown(
544
- "- Models are fine-tuned on the [BookSum dataset](https://arxiv.org/abs/2105.08209). The goal was to create a model that generalizes well and is useful for summarizing text in academic and everyday use."
545
  )
546
  gr.Markdown(
547
- "- _Update April 2023:_ Additional models fine-tuned on the [PLOS](https://huggingface.co/datasets/pszemraj/scientific_lay_summarisation-plos-norm) and [ELIFE](https://huggingface.co/datasets/pszemraj/scientific_lay_summarisation-elife-norm) subsets of the [scientific lay summaries](https://arxiv.org/abs/2210.09932) dataset are available (see dropdown at the top)."
548
  )
549
  gr.Markdown(
550
- "Adjust the max input words & max PDF pages for OCR by duplicating this space and [setting the environment variables](https://huggingface.co/docs/hub/spaces-overview#managing-secrets) `APP_MAX_WORDS` and `APP_OCR_MAX_PAGES` to the desired integer values."
551
  )
552
  gr.Markdown("---")
553
 
 
2
  app.py - the main module for the gradio app for summarization
3
 
4
  Usage:
5
+ python app.py --help
6
 
7
  Environment Variables:
8
  USE_TORCH (str): whether to use torch (1) or not (0)
 
20
  import re
21
  import time
22
  from pathlib import Path
 
23
 
24
  os.environ["USE_TORCH"] = "1"
25
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
32
  import gradio as gr
33
  import nltk
34
  import torch
 
35
  from cleantext import clean
36
  from doctr.models import ocr_predictor
37
+
38
+ from aggregate import BatchAggregator
39
  from pdf2text import convert_PDF_to_Text
40
  from summarize import load_model_and_tokenizer, summarize_via_tokenbatches
41
  from utils import (
42
  extract_batches,
43
  load_example_filenames,
44
+ remove_stagnant_files,
45
  saves_summary,
46
  textlist2html,
47
  truncate_word_count,
 
48
  )
49
 
50
  _here = Path(__file__).parent
 
62
  ] # models users can choose from
63
 
64
  SUMMARY_PLACEHOLDER = "<p><em>Output will appear below:</em></p>"
65
+ AGGREGATE_MODEL = "MBZUAI/LaMini-Flan-T5-783M" # model to use for aggregation
66
 
67
+ # if duplicating space: uncomment this line to adjust the max words
68
  # os.environ["APP_MAX_WORDS"] = str(2048) # set the max words to 2048
69
  # os.environ["APP_OCR_MAX_PAGES"] = str(40) # set the max pages to 40
70
 
71
+ aggregator = BatchAggregator(AGGREGATE_MODEL)
72
 
73
 
74
  def aggregate_text(
 
78
  """
79
  Aggregate the text from the batches.
80
 
81
+ NOTE: you should probably include the BatchAggregator object as a fn arg if using this code
82
+
83
  :param batches_html: The batches to aggregate, in html format
84
  :param text_file: The text file to append the aggregate summary to
85
  :return: The aggregate summary in html format
 
105
  content_batches = [batch["content"] for batch in extracted_batches]
106
  full_summary = aggregator.infer_aggregate(content_batches)
107
 
108
+ # if a path that exists is provided, append the summary with markdown formatting
109
  if out_path:
110
  out_path = Path(out_path)
111
 
112
  try:
113
  with open(out_path, "a", encoding="utf-8") as f:
114
+ f.write("\n\n## Aggregate Summary\n\n")
115
  f.write(
116
  "- This is an instruction-based LLM aggregation of the previous 'summary batches'.\n"
117
  )
 
342
  raw_text = f.read()
343
  text = clean(raw_text, lower=lower)
344
  elif file_path.suffix == ".pdf":
345
+ logger.info(f"loading a PDF file: {file_path.name}")
346
  max_pages = int(os.environ.get("APP_MAX_PAGES", max_pages))
347
+ logger.info(f"max_pages is: {max_pages}. Starting conversion...")
348
  conversion_stats = convert_PDF_to_Text(
349
  file_path,
350
  ocr_model=ocr_model,
 
358
  return text
359
  except Exception as e:
360
  logger.error(f"Trying to load file:\t{file_path},\nerror:\t{e}")
361
+ return f"Error: Could not read file {file_path.name}. Make sure it is a PDF, TXT, or MD file."
362
 
363
 
364
  def parse_args():
365
+ """arguments for the command line interface"""
366
  parser = argparse.ArgumentParser(
367
+ description="Document Summarization with Long-Document Transformers Demo",
368
  formatter_class=argparse.ArgumentDefaultsHelpFormatter,
369
+ epilog="Runs a local-only web app to summarize documents. use --share for a public link to share.",
370
  )
371
  parser.add_argument(
372
  "--share",
 
418
  with demo:
419
  gr.Markdown("# Document Summarization with Long-Document Transformers")
420
  gr.Markdown(
421
+ "An example use case for fine-tuned long document transformers. Model(s) are trained on [book summaries](https://hf.co/datasets/kmfoda/booksum). Architectures [in this demo](https://hf.co/spaces/pszemraj/document-summarization) are [LongT5-base](https://hf.co/pszemraj/long-t5-tglobal-base-16384-book-summary) and [Pegasus-X-Large](https://hf.co/pszemraj/pegasus-x-large-book-summary)."
422
  )
423
  with gr.Column():
424
  gr.Markdown("## Load Inputs & Select Parameters")
 
443
  load_examples_button = gr.Button(
444
  "Load Example in Dropdown",
445
  )
446
+ load_file_button = gr.Button("Load & Process File")
447
  with gr.Column(variant="compact"):
448
  example_name = gr.Dropdown(
449
  _examples,
 
460
  input_text = gr.Textbox(
461
  lines=4,
462
  max_lines=12,
463
+ label="Text to Summarize",
464
  placeholder="Enter text to summarize, the text will be cleaned and truncated on Spaces. Narrative, academic (both papers and lecture transcription), and article text work well. May take a bit to generate depending on the input text :)",
465
  )
466
  gr.Markdown("---")
467
  with gr.Column():
468
  gr.Markdown("## Generate Summary")
469
+ with gr.Row():
470
+ summarize_button = gr.Button(
471
+ "Summarize!",
472
+ variant="primary",
473
+ )
474
+ gr.Markdown(
475
+ "_Summarization should take ~1-2 minutes for most settings, but may extend up to 5-10 minutes in some scenarios._"
476
+ )
477
  output_text = gr.HTML("<p><em>Output will appear below:</em></p>")
478
  with gr.Column():
479
+ gr.Markdown("### Results & Scores")
480
  with gr.Row():
481
  with gr.Column(variant="compact"):
482
  gr.Markdown(
 
490
  )
491
  with gr.Column(variant="compact"):
492
  gr.Markdown(
493
+ "Scores **roughly** represent the summary quality as a measure of the model's 'confidence'. less-negative numbers (closer to 0) are better."
494
  )
495
  summary_scores = gr.Textbox(
496
  label="Summary Scores",
497
  placeholder="Summary scores will appear here",
498
  )
499
+ with gr.Column(variant="panel"):
500
+ gr.Markdown("### **Summary Output**")
501
  summary_text = gr.HTML(
502
+ label="Summary",
503
+ value="<center><i>Summary will appear here!</i></center>",
504
  )
505
  with gr.Column():
506
+ gr.Markdown("### **Aggregate Summary Batches**")
507
+ gr.Markdown(
508
+ "_Note: this is an experimental feature. Feedback welcome in the [discussions](https://hf.co/spaces/pszemraj/document-summarization/discussions)!_"
509
+ )
510
+ with gr.Row():
511
+ aggregate_button = gr.Button(
512
+ "Aggregate!",
513
+ variant="primary",
514
+ )
515
+ gr.Markdown(
516
+ f"""Aggregate the above batches into a cohesive summary.
517
+ - a secondary instruct-tuned LM consolidates info from the batches
518
+ - current model: [{AGGREGATE_MODEL}](https://hf.co/{AGGREGATE_MODEL})
519
+ """
520
+ )
521
+ with gr.Column(variant="panel"):
522
+ aggregated_summary = gr.HTML(
523
+ label="Aggregate Summary",
524
+ value="<center><i>Aggregate summary will appear here!</i></center>",
525
+ )
526
+ gr.Markdown(
527
+ "\n\n_Aggregate summary also appended to the bottom of the `.txt` file!_"
528
+ )
529
 
530
  gr.Markdown("---")
531
  with gr.Column():
 
561
  value=3,
562
  )
563
  with gr.Column():
564
+ gr.Markdown("## About")
565
  gr.Markdown(
566
+ "- Models are fine-tuned on the [🅱️ookSum dataset](https://arxiv.org/abs/2105.08209). The goal was to create a model that generalizes well and is useful for summarizing text in academic and everyday use."
567
  )
568
  gr.Markdown(
569
+ "- _Update April 2023:_ Additional models fine-tuned on the [PLOS](https://hf.co/datasets/pszemraj/scientific_lay_summarisation-plos-norm) and [ELIFE](https://hf.co/datasets/pszemraj/scientific_lay_summarisation-elife-norm) subsets of the [scientific lay summaries](https://arxiv.org/abs/2210.09932) dataset are available (see dropdown at the top)."
570
  )
571
  gr.Markdown(
572
+ "Adjust the max input words & max PDF pages for OCR by duplicating this space and [setting the environment variables](https://hf.co/docs/hub/spaces-overview#managing-secrets) `APP_MAX_WORDS` and `APP_OCR_MAX_PAGES` to the desired integer values."
573
  )
574
  gr.Markdown("---")
575
 
utils.py CHANGED
@@ -4,6 +4,7 @@
4
  import logging
5
  import os
6
  import re
 
7
  import subprocess
8
  from collections import defaultdict, deque
9
  from datetime import datetime, timedelta
@@ -111,10 +112,9 @@ def compare_model_size(model_name: str, threshold: int = 500) -> bool:
111
  if not matches:
112
  return None
113
 
114
- # Extract the parameter count and unit from the last match
115
  parameter_count, unit = matches[-1]
116
-
117
- parameter_count = int(parameter_count) # Convert to an integer
118
 
119
  # Convert to the standard form (M for million, G for billion, k for thousand)
120
  if unit == "G" or unit == "b":
@@ -129,7 +129,14 @@ def compare_model_size(model_name: str, threshold: int = 500) -> bool:
129
  return parameter_count > threshold
130
 
131
 
132
- def validate_pytorch2(torch_version: str = None):
 
 
 
 
 
 
 
133
  torch_version = torch.__version__ if torch_version is None else torch_version
134
 
135
  pattern = r"^2\.\d+(\.\d+)*"
@@ -140,8 +147,8 @@ def validate_pytorch2(torch_version: str = None):
140
  def get_timestamp(detailed=False) -> str:
141
  """
142
  get_timestamp - get a timestamp for the current time
143
- Returns:
144
- str, the timestamp
145
  """
146
  return (
147
  datetime.now().strftime("%b%d%Y_%H%M%S%f")
@@ -150,18 +157,13 @@ def get_timestamp(detailed=False) -> str:
150
  )
151
 
152
 
153
- def truncate_word_count(text, max_words=1024):
154
  """
155
- truncate_word_count - a helper function for the gradio module
156
- Parameters
157
- ----------
158
- text : str, required, the text to be processed
159
- max_words : int, optional, the maximum number of words, default=512
160
- Returns
161
- -------
162
- dict, the text and whether it was truncated
163
  """
164
- # split on whitespace with regex
165
  words = re.split(r"\s+", text)
166
  processed = {}
167
  if len(words) > max_words:
@@ -176,8 +178,7 @@ def truncate_word_count(text, max_words=1024):
176
  def load_examples(src, filetypes=[".txt", ".pdf"]):
177
  """
178
  load_examples - a helper function for the gradio module to load examples
179
- Returns:
180
- list of str, the examples
181
  """
182
  src = Path(src)
183
  src.mkdir(exist_ok=True)
@@ -210,7 +211,8 @@ def load_example_filenames(example_path: str or Path):
210
  return examples
211
 
212
 
213
- def textlist2html(text_batches):
 
214
  # Step 1: Generate each summary batch as a string of HTML
215
  formatted_batches = [
216
  f"""
@@ -244,7 +246,7 @@ def textlist2html(text_batches):
244
  return text_html_block
245
 
246
 
247
- def extract_batches(html_string, pattern=None, flags=None) -> list:
248
  """
249
  Extract batches of text from an HTML string.
250
 
@@ -336,7 +338,7 @@ def extract_keywords(
336
 
337
  def saves_summary(
338
  summarize_output, outpath: str or Path = None, add_signature=True, **kwargs
339
- ):
340
  """
341
  saves_summary - save the summary generated from summarize_via_tokenbatches() to a text file
342
 
 
4
  import logging
5
  import os
6
  import re
7
+ import string
8
  import subprocess
9
  from collections import defaultdict, deque
10
  from datetime import datetime, timedelta
 
112
  if not matches:
113
  return None
114
 
115
+ # Extract the parameter count and unit
116
  parameter_count, unit = matches[-1]
117
+ parameter_count = int(parameter_count)
 
118
 
119
  # Convert to the standard form (M for million, G for billion, k for thousand)
120
  if unit == "G" or unit == "b":
 
129
  return parameter_count > threshold
130
 
131
 
132
+ def validate_pytorch2(torch_version: str = None) -> bool:
133
+ """
134
+ validate_pytorch2 - validate that the PyTorch version is 2.0 or greater
135
+
136
+ :param str torch_version: the PyTorch version to validate, defaults to None
137
+ :return: True if the PyTorch version is 2.0 or greater, False otherwise
138
+ """
139
+
140
  torch_version = torch.__version__ if torch_version is None else torch_version
141
 
142
  pattern = r"^2\.\d+(\.\d+)*"
 
147
  def get_timestamp(detailed=False) -> str:
148
  """
149
  get_timestamp - get a timestamp for the current time
150
+ :param bool detailed: whether to include seconds and microseconds, defaults to False
151
+ :return: str, the timestamp
152
  """
153
  return (
154
  datetime.now().strftime("%b%d%Y_%H%M%S%f")
 
157
  )
158
 
159
 
160
+ def truncate_word_count(text: str, max_words=1024) -> dict:
161
  """
162
+ truncate_word_count - truncate a text to a maximum number of words
163
+ :param str text: the text to truncate
164
+ :param int max_words: the maximum number of words to keep, defaults to 1024
165
+ :return: dict, the processed text
 
 
 
 
166
  """
 
167
  words = re.split(r"\s+", text)
168
  processed = {}
169
  if len(words) > max_words:
 
178
  def load_examples(src, filetypes=[".txt", ".pdf"]):
179
  """
180
  load_examples - a helper function for the gradio module to load examples
181
+ :param str src: the path to the examples
 
182
  """
183
  src = Path(src)
184
  src.mkdir(exist_ok=True)
 
211
  return examples
212
 
213
 
214
+ def textlist2html(text_batches: List[str]) -> str:
215
+ """textlist2html - convert a list of text summaries into a single HTML string"""
216
  # Step 1: Generate each summary batch as a string of HTML
217
  formatted_batches = [
218
  f"""
 
246
  return text_html_block
247
 
248
 
249
+ def extract_batches(html_string: str, pattern=None, flags=None) -> list:
250
  """
251
  Extract batches of text from an HTML string.
252
 
 
338
 
339
  def saves_summary(
340
  summarize_output, outpath: str or Path = None, add_signature=True, **kwargs
341
+ ) -> Path:
342
  """
343
  saves_summary - save the summary generated from summarize_via_tokenbatches() to a text file
344