Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
improve docs and UI
Browse files
Signed-off-by: peter szemraj <peterszemraj@gmail.com>
- aggregate.py +6 -3
- app.py +61 -39
- utils.py +23 -21
aggregate.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
"""
|
2 |
-
aggregate.py
|
3 |
Primary usage is through the BatchAggregator class.
|
4 |
|
5 |
How it works:
|
@@ -29,7 +29,8 @@ class BatchAggregator:
|
|
29 |
Usage:
|
30 |
>>> from aggregate import BatchAggregator
|
31 |
>>> aggregator = BatchAggregator()
|
32 |
-
>>> aggregator.
|
|
|
33 |
"""
|
34 |
|
35 |
GENERIC_CONFIG = GenerationConfig(
|
@@ -187,7 +188,7 @@ class BatchAggregator:
|
|
187 |
**kwargs,
|
188 |
) -> str:
|
189 |
f"""
|
190 |
-
|
191 |
|
192 |
Args:
|
193 |
text_list (list): The texts to summarize.
|
@@ -211,6 +212,8 @@ class BatchAggregator:
|
|
211 |
self.logger.info(
|
212 |
f"Input tokens:\t{self.count_tokens(prompt)}. Output tokens:\t{self.count_tokens(result)}"
|
213 |
)
|
|
|
|
|
214 |
return result
|
215 |
|
216 |
def count_tokens(self, text: str) -> int:
|
|
|
1 |
"""
|
2 |
+
aggregate.py - module for aggregating text from multiple sources/multiple parts of a single source.
|
3 |
Primary usage is through the BatchAggregator class.
|
4 |
|
5 |
How it works:
|
|
|
29 |
Usage:
|
30 |
>>> from aggregate import BatchAggregator
|
31 |
>>> aggregator = BatchAggregator()
|
32 |
+
>>> agg = aggregator.infer_aggregate(["This is a test", "This is another test"])
|
33 |
+
>>> print(agg)
|
34 |
"""
|
35 |
|
36 |
GENERIC_CONFIG = GenerationConfig(
|
|
|
188 |
**kwargs,
|
189 |
) -> str:
|
190 |
f"""
|
191 |
+
infer_aggregate - infers a consolidated summary from a list of texts.
|
192 |
|
193 |
Args:
|
194 |
text_list (list): The texts to summarize.
|
|
|
212 |
self.logger.info(
|
213 |
f"Input tokens:\t{self.count_tokens(prompt)}. Output tokens:\t{self.count_tokens(result)}"
|
214 |
)
|
215 |
+
self.logger.debug(f"Generated text:\n{result}")
|
216 |
+
|
217 |
return result
|
218 |
|
219 |
def count_tokens(self, text: str) -> int:
|
app.py
CHANGED
@@ -2,7 +2,7 @@
|
|
2 |
app.py - the main module for the gradio app for summarization
|
3 |
|
4 |
Usage:
|
5 |
-
python app.py
|
6 |
|
7 |
Environment Variables:
|
8 |
USE_TORCH (str): whether to use torch (1) or not (0)
|
@@ -20,7 +20,6 @@ import random
|
|
20 |
import re
|
21 |
import time
|
22 |
from pathlib import Path
|
23 |
-
import pprint as pp
|
24 |
|
25 |
os.environ["USE_TORCH"] = "1"
|
26 |
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
@@ -33,18 +32,19 @@ logging.basicConfig(
|
|
33 |
import gradio as gr
|
34 |
import nltk
|
35 |
import torch
|
36 |
-
from aggregate import BatchAggregator
|
37 |
from cleantext import clean
|
38 |
from doctr.models import ocr_predictor
|
|
|
|
|
39 |
from pdf2text import convert_PDF_to_Text
|
40 |
from summarize import load_model_and_tokenizer, summarize_via_tokenbatches
|
41 |
from utils import (
|
42 |
extract_batches,
|
43 |
load_example_filenames,
|
|
|
44 |
saves_summary,
|
45 |
textlist2html,
|
46 |
truncate_word_count,
|
47 |
-
remove_stagnant_files,
|
48 |
)
|
49 |
|
50 |
_here = Path(__file__).parent
|
@@ -62,12 +62,13 @@ MODEL_OPTIONS = [
|
|
62 |
] # models users can choose from
|
63 |
|
64 |
SUMMARY_PLACEHOLDER = "<p><em>Output will appear below:</em></p>"
|
|
|
65 |
|
66 |
-
# if duplicating space
|
67 |
# os.environ["APP_MAX_WORDS"] = str(2048) # set the max words to 2048
|
68 |
# os.environ["APP_OCR_MAX_PAGES"] = str(40) # set the max pages to 40
|
69 |
|
70 |
-
aggregator = BatchAggregator(
|
71 |
|
72 |
|
73 |
def aggregate_text(
|
@@ -77,8 +78,8 @@ def aggregate_text(
|
|
77 |
"""
|
78 |
Aggregate the text from the batches.
|
79 |
|
80 |
-
NOTE: you should probably include
|
81 |
-
|
82 |
:param batches_html: The batches to aggregate, in html format
|
83 |
:param text_file: The text file to append the aggregate summary to
|
84 |
:return: The aggregate summary in html format
|
@@ -104,13 +105,13 @@ def aggregate_text(
|
|
104 |
content_batches = [batch["content"] for batch in extracted_batches]
|
105 |
full_summary = aggregator.infer_aggregate(content_batches)
|
106 |
|
107 |
-
# if a path that exists is provided,
|
108 |
if out_path:
|
109 |
out_path = Path(out_path)
|
110 |
|
111 |
try:
|
112 |
with open(out_path, "a", encoding="utf-8") as f:
|
113 |
-
f.write("\n\n
|
114 |
f.write(
|
115 |
"- This is an instruction-based LLM aggregation of the previous 'summary batches'.\n"
|
116 |
)
|
@@ -341,9 +342,9 @@ def load_uploaded_file(file_obj, max_pages: int = 20, lower: bool = False) -> st
|
|
341 |
raw_text = f.read()
|
342 |
text = clean(raw_text, lower=lower)
|
343 |
elif file_path.suffix == ".pdf":
|
344 |
-
logger.info(f"loading
|
345 |
max_pages = int(os.environ.get("APP_MAX_PAGES", max_pages))
|
346 |
-
logger.info(f"max_pages
|
347 |
conversion_stats = convert_PDF_to_Text(
|
348 |
file_path,
|
349 |
ocr_model=ocr_model,
|
@@ -357,13 +358,15 @@ def load_uploaded_file(file_obj, max_pages: int = 20, lower: bool = False) -> st
|
|
357 |
return text
|
358 |
except Exception as e:
|
359 |
logger.error(f"Trying to load file:\t{file_path},\nerror:\t{e}")
|
360 |
-
return "Error: Could not read file.
|
361 |
|
362 |
|
363 |
def parse_args():
|
|
|
364 |
parser = argparse.ArgumentParser(
|
365 |
-
description="Document Summarization with Long-Document Transformers",
|
366 |
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
|
|
|
367 |
)
|
368 |
parser.add_argument(
|
369 |
"--share",
|
@@ -415,7 +418,7 @@ if __name__ == "__main__":
|
|
415 |
with demo:
|
416 |
gr.Markdown("# Document Summarization with Long-Document Transformers")
|
417 |
gr.Markdown(
|
418 |
-
"An example use case for fine-tuned long document transformers. Model(s) are trained on [book summaries](https://
|
419 |
)
|
420 |
with gr.Column():
|
421 |
gr.Markdown("## Load Inputs & Select Parameters")
|
@@ -440,7 +443,7 @@ if __name__ == "__main__":
|
|
440 |
load_examples_button = gr.Button(
|
441 |
"Load Example in Dropdown",
|
442 |
)
|
443 |
-
load_file_button = gr.Button("Load
|
444 |
with gr.Column(variant="compact"):
|
445 |
example_name = gr.Dropdown(
|
446 |
_examples,
|
@@ -457,22 +460,23 @@ if __name__ == "__main__":
|
|
457 |
input_text = gr.Textbox(
|
458 |
lines=4,
|
459 |
max_lines=12,
|
460 |
-
label="
|
461 |
placeholder="Enter text to summarize, the text will be cleaned and truncated on Spaces. Narrative, academic (both papers and lecture transcription), and article text work well. May take a bit to generate depending on the input text :)",
|
462 |
)
|
463 |
gr.Markdown("---")
|
464 |
with gr.Column():
|
465 |
gr.Markdown("## Generate Summary")
|
466 |
-
gr.
|
467 |
-
|
468 |
-
|
469 |
-
|
470 |
-
|
471 |
-
|
472 |
-
|
|
|
473 |
output_text = gr.HTML("<p><em>Output will appear below:</em></p>")
|
474 |
with gr.Column():
|
475 |
-
gr.Markdown("
|
476 |
with gr.Row():
|
477 |
with gr.Column(variant="compact"):
|
478 |
gr.Markdown(
|
@@ -486,24 +490,42 @@ if __name__ == "__main__":
|
|
486 |
)
|
487 |
with gr.Column(variant="compact"):
|
488 |
gr.Markdown(
|
489 |
-
"Scores represent the summary quality
|
490 |
)
|
491 |
summary_scores = gr.Textbox(
|
492 |
label="Summary Scores",
|
493 |
placeholder="Summary scores will appear here",
|
494 |
)
|
495 |
-
with gr.Column():
|
496 |
-
gr.Markdown("
|
497 |
summary_text = gr.HTML(
|
498 |
-
label="Summary",
|
|
|
499 |
)
|
500 |
with gr.Column():
|
501 |
-
gr.Markdown("
|
502 |
-
|
503 |
-
"
|
504 |
-
|
505 |
-
)
|
506 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
507 |
|
508 |
gr.Markdown("---")
|
509 |
with gr.Column():
|
@@ -539,15 +561,15 @@ if __name__ == "__main__":
|
|
539 |
value=3,
|
540 |
)
|
541 |
with gr.Column():
|
542 |
-
gr.Markdown("
|
543 |
gr.Markdown(
|
544 |
-
"- Models are fine-tuned on the [
|
545 |
)
|
546 |
gr.Markdown(
|
547 |
-
"- _Update April 2023:_ Additional models fine-tuned on the [PLOS](https://
|
548 |
)
|
549 |
gr.Markdown(
|
550 |
-
"Adjust the max input words & max PDF pages for OCR by duplicating this space and [setting the environment variables](https://
|
551 |
)
|
552 |
gr.Markdown("---")
|
553 |
|
|
|
2 |
app.py - the main module for the gradio app for summarization
|
3 |
|
4 |
Usage:
|
5 |
+
python app.py --help
|
6 |
|
7 |
Environment Variables:
|
8 |
USE_TORCH (str): whether to use torch (1) or not (0)
|
|
|
20 |
import re
|
21 |
import time
|
22 |
from pathlib import Path
|
|
|
23 |
|
24 |
os.environ["USE_TORCH"] = "1"
|
25 |
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
|
|
32 |
import gradio as gr
|
33 |
import nltk
|
34 |
import torch
|
|
|
35 |
from cleantext import clean
|
36 |
from doctr.models import ocr_predictor
|
37 |
+
|
38 |
+
from aggregate import BatchAggregator
|
39 |
from pdf2text import convert_PDF_to_Text
|
40 |
from summarize import load_model_and_tokenizer, summarize_via_tokenbatches
|
41 |
from utils import (
|
42 |
extract_batches,
|
43 |
load_example_filenames,
|
44 |
+
remove_stagnant_files,
|
45 |
saves_summary,
|
46 |
textlist2html,
|
47 |
truncate_word_count,
|
|
|
48 |
)
|
49 |
|
50 |
_here = Path(__file__).parent
|
|
|
62 |
] # models users can choose from
|
63 |
|
64 |
SUMMARY_PLACEHOLDER = "<p><em>Output will appear below:</em></p>"
|
65 |
+
AGGREGATE_MODEL = "MBZUAI/LaMini-Flan-T5-783M" # model to use for aggregation
|
66 |
|
67 |
+
# if duplicating space: uncomment this line to adjust the max words
|
68 |
# os.environ["APP_MAX_WORDS"] = str(2048) # set the max words to 2048
|
69 |
# os.environ["APP_OCR_MAX_PAGES"] = str(40) # set the max pages to 40
|
70 |
|
71 |
+
aggregator = BatchAggregator(AGGREGATE_MODEL)
|
72 |
|
73 |
|
74 |
def aggregate_text(
|
|
|
78 |
"""
|
79 |
Aggregate the text from the batches.
|
80 |
|
81 |
+
NOTE: you should probably include the BatchAggregator object as a fn arg if using this code
|
82 |
+
|
83 |
:param batches_html: The batches to aggregate, in html format
|
84 |
:param text_file: The text file to append the aggregate summary to
|
85 |
:return: The aggregate summary in html format
|
|
|
105 |
content_batches = [batch["content"] for batch in extracted_batches]
|
106 |
full_summary = aggregator.infer_aggregate(content_batches)
|
107 |
|
108 |
+
# if a path that exists is provided, append the summary with markdown formatting
|
109 |
if out_path:
|
110 |
out_path = Path(out_path)
|
111 |
|
112 |
try:
|
113 |
with open(out_path, "a", encoding="utf-8") as f:
|
114 |
+
f.write("\n\n## Aggregate Summary\n\n")
|
115 |
f.write(
|
116 |
"- This is an instruction-based LLM aggregation of the previous 'summary batches'.\n"
|
117 |
)
|
|
|
342 |
raw_text = f.read()
|
343 |
text = clean(raw_text, lower=lower)
|
344 |
elif file_path.suffix == ".pdf":
|
345 |
+
logger.info(f"loading a PDF file: {file_path.name}")
|
346 |
max_pages = int(os.environ.get("APP_MAX_PAGES", max_pages))
|
347 |
+
logger.info(f"max_pages is: {max_pages}. Starting conversion...")
|
348 |
conversion_stats = convert_PDF_to_Text(
|
349 |
file_path,
|
350 |
ocr_model=ocr_model,
|
|
|
358 |
return text
|
359 |
except Exception as e:
|
360 |
logger.error(f"Trying to load file:\t{file_path},\nerror:\t{e}")
|
361 |
+
return f"Error: Could not read file {file_path.name}. Make sure it is a PDF, TXT, or MD file."
|
362 |
|
363 |
|
364 |
def parse_args():
|
365 |
+
"""arguments for the command line interface"""
|
366 |
parser = argparse.ArgumentParser(
|
367 |
+
description="Document Summarization with Long-Document Transformers Demo",
|
368 |
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
|
369 |
+
epilog="Runs a local-only web app to summarize documents. use --share for a public link to share.",
|
370 |
)
|
371 |
parser.add_argument(
|
372 |
"--share",
|
|
|
418 |
with demo:
|
419 |
gr.Markdown("# Document Summarization with Long-Document Transformers")
|
420 |
gr.Markdown(
|
421 |
+
"An example use case for fine-tuned long document transformers. Model(s) are trained on [book summaries](https://hf.co/datasets/kmfoda/booksum). Architectures [in this demo](https://hf.co/spaces/pszemraj/document-summarization) are [LongT5-base](https://hf.co/pszemraj/long-t5-tglobal-base-16384-book-summary) and [Pegasus-X-Large](https://hf.co/pszemraj/pegasus-x-large-book-summary)."
|
422 |
)
|
423 |
with gr.Column():
|
424 |
gr.Markdown("## Load Inputs & Select Parameters")
|
|
|
443 |
load_examples_button = gr.Button(
|
444 |
"Load Example in Dropdown",
|
445 |
)
|
446 |
+
load_file_button = gr.Button("Load & Process File")
|
447 |
with gr.Column(variant="compact"):
|
448 |
example_name = gr.Dropdown(
|
449 |
_examples,
|
|
|
460 |
input_text = gr.Textbox(
|
461 |
lines=4,
|
462 |
max_lines=12,
|
463 |
+
label="Text to Summarize",
|
464 |
placeholder="Enter text to summarize, the text will be cleaned and truncated on Spaces. Narrative, academic (both papers and lecture transcription), and article text work well. May take a bit to generate depending on the input text :)",
|
465 |
)
|
466 |
gr.Markdown("---")
|
467 |
with gr.Column():
|
468 |
gr.Markdown("## Generate Summary")
|
469 |
+
with gr.Row():
|
470 |
+
summarize_button = gr.Button(
|
471 |
+
"Summarize!",
|
472 |
+
variant="primary",
|
473 |
+
)
|
474 |
+
gr.Markdown(
|
475 |
+
"_Summarization should take ~1-2 minutes for most settings, but may extend up to 5-10 minutes in some scenarios._"
|
476 |
+
)
|
477 |
output_text = gr.HTML("<p><em>Output will appear below:</em></p>")
|
478 |
with gr.Column():
|
479 |
+
gr.Markdown("### Results & Scores")
|
480 |
with gr.Row():
|
481 |
with gr.Column(variant="compact"):
|
482 |
gr.Markdown(
|
|
|
490 |
)
|
491 |
with gr.Column(variant="compact"):
|
492 |
gr.Markdown(
|
493 |
+
"Scores **roughly** represent the summary quality as a measure of the model's 'confidence'. less-negative numbers (closer to 0) are better."
|
494 |
)
|
495 |
summary_scores = gr.Textbox(
|
496 |
label="Summary Scores",
|
497 |
placeholder="Summary scores will appear here",
|
498 |
)
|
499 |
+
with gr.Column(variant="panel"):
|
500 |
+
gr.Markdown("### **Summary Output**")
|
501 |
summary_text = gr.HTML(
|
502 |
+
label="Summary",
|
503 |
+
value="<center><i>Summary will appear here!</i></center>",
|
504 |
)
|
505 |
with gr.Column():
|
506 |
+
gr.Markdown("### **Aggregate Summary Batches**")
|
507 |
+
gr.Markdown(
|
508 |
+
"_Note: this is an experimental feature. Feedback welcome in the [discussions](https://hf.co/spaces/pszemraj/document-summarization/discussions)!_"
|
509 |
+
)
|
510 |
+
with gr.Row():
|
511 |
+
aggregate_button = gr.Button(
|
512 |
+
"Aggregate!",
|
513 |
+
variant="primary",
|
514 |
+
)
|
515 |
+
gr.Markdown(
|
516 |
+
f"""Aggregate the above batches into a cohesive summary.
|
517 |
+
- a secondary instruct-tuned LM consolidates info from the batches
|
518 |
+
- current model: [{AGGREGATE_MODEL}](https://hf.co/{AGGREGATE_MODEL})
|
519 |
+
"""
|
520 |
+
)
|
521 |
+
with gr.Column(variant="panel"):
|
522 |
+
aggregated_summary = gr.HTML(
|
523 |
+
label="Aggregate Summary",
|
524 |
+
value="<center><i>Aggregate summary will appear here!</i></center>",
|
525 |
+
)
|
526 |
+
gr.Markdown(
|
527 |
+
"\n\n_Aggregate summary also appended to the bottom of the `.txt` file!_"
|
528 |
+
)
|
529 |
|
530 |
gr.Markdown("---")
|
531 |
with gr.Column():
|
|
|
561 |
value=3,
|
562 |
)
|
563 |
with gr.Column():
|
564 |
+
gr.Markdown("## About")
|
565 |
gr.Markdown(
|
566 |
+
"- Models are fine-tuned on the [🅱️ookSum dataset](https://arxiv.org/abs/2105.08209). The goal was to create a model that generalizes well and is useful for summarizing text in academic and everyday use."
|
567 |
)
|
568 |
gr.Markdown(
|
569 |
+
"- _Update April 2023:_ Additional models fine-tuned on the [PLOS](https://hf.co/datasets/pszemraj/scientific_lay_summarisation-plos-norm) and [ELIFE](https://hf.co/datasets/pszemraj/scientific_lay_summarisation-elife-norm) subsets of the [scientific lay summaries](https://arxiv.org/abs/2210.09932) dataset are available (see dropdown at the top)."
|
570 |
)
|
571 |
gr.Markdown(
|
572 |
+
"Adjust the max input words & max PDF pages for OCR by duplicating this space and [setting the environment variables](https://hf.co/docs/hub/spaces-overview#managing-secrets) `APP_MAX_WORDS` and `APP_OCR_MAX_PAGES` to the desired integer values."
|
573 |
)
|
574 |
gr.Markdown("---")
|
575 |
|
utils.py
CHANGED
@@ -4,6 +4,7 @@
|
|
4 |
import logging
|
5 |
import os
|
6 |
import re
|
|
|
7 |
import subprocess
|
8 |
from collections import defaultdict, deque
|
9 |
from datetime import datetime, timedelta
|
@@ -111,10 +112,9 @@ def compare_model_size(model_name: str, threshold: int = 500) -> bool:
|
|
111 |
if not matches:
|
112 |
return None
|
113 |
|
114 |
-
# Extract the parameter count and unit
|
115 |
parameter_count, unit = matches[-1]
|
116 |
-
|
117 |
-
parameter_count = int(parameter_count) # Convert to an integer
|
118 |
|
119 |
# Convert to the standard form (M for million, G for billion, k for thousand)
|
120 |
if unit == "G" or unit == "b":
|
@@ -129,7 +129,14 @@ def compare_model_size(model_name: str, threshold: int = 500) -> bool:
|
|
129 |
return parameter_count > threshold
|
130 |
|
131 |
|
132 |
-
def validate_pytorch2(torch_version: str = None):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
133 |
torch_version = torch.__version__ if torch_version is None else torch_version
|
134 |
|
135 |
pattern = r"^2\.\d+(\.\d+)*"
|
@@ -140,8 +147,8 @@ def validate_pytorch2(torch_version: str = None):
|
|
140 |
def get_timestamp(detailed=False) -> str:
|
141 |
"""
|
142 |
get_timestamp - get a timestamp for the current time
|
143 |
-
|
144 |
-
|
145 |
"""
|
146 |
return (
|
147 |
datetime.now().strftime("%b%d%Y_%H%M%S%f")
|
@@ -150,18 +157,13 @@ def get_timestamp(detailed=False) -> str:
|
|
150 |
)
|
151 |
|
152 |
|
153 |
-
def truncate_word_count(text, max_words=1024):
|
154 |
"""
|
155 |
-
truncate_word_count - a
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
max_words : int, optional, the maximum number of words, default=512
|
160 |
-
Returns
|
161 |
-
-------
|
162 |
-
dict, the text and whether it was truncated
|
163 |
"""
|
164 |
-
# split on whitespace with regex
|
165 |
words = re.split(r"\s+", text)
|
166 |
processed = {}
|
167 |
if len(words) > max_words:
|
@@ -176,8 +178,7 @@ def truncate_word_count(text, max_words=1024):
|
|
176 |
def load_examples(src, filetypes=[".txt", ".pdf"]):
|
177 |
"""
|
178 |
load_examples - a helper function for the gradio module to load examples
|
179 |
-
|
180 |
-
list of str, the examples
|
181 |
"""
|
182 |
src = Path(src)
|
183 |
src.mkdir(exist_ok=True)
|
@@ -210,7 +211,8 @@ def load_example_filenames(example_path: str or Path):
|
|
210 |
return examples
|
211 |
|
212 |
|
213 |
-
def textlist2html(text_batches):
|
|
|
214 |
# Step 1: Generate each summary batch as a string of HTML
|
215 |
formatted_batches = [
|
216 |
f"""
|
@@ -244,7 +246,7 @@ def textlist2html(text_batches):
|
|
244 |
return text_html_block
|
245 |
|
246 |
|
247 |
-
def extract_batches(html_string, pattern=None, flags=None) -> list:
|
248 |
"""
|
249 |
Extract batches of text from an HTML string.
|
250 |
|
@@ -336,7 +338,7 @@ def extract_keywords(
|
|
336 |
|
337 |
def saves_summary(
|
338 |
summarize_output, outpath: str or Path = None, add_signature=True, **kwargs
|
339 |
-
):
|
340 |
"""
|
341 |
saves_summary - save the summary generated from summarize_via_tokenbatches() to a text file
|
342 |
|
|
|
4 |
import logging
|
5 |
import os
|
6 |
import re
|
7 |
+
import string
|
8 |
import subprocess
|
9 |
from collections import defaultdict, deque
|
10 |
from datetime import datetime, timedelta
|
|
|
112 |
if not matches:
|
113 |
return None
|
114 |
|
115 |
+
# Extract the parameter count and unit
|
116 |
parameter_count, unit = matches[-1]
|
117 |
+
parameter_count = int(parameter_count)
|
|
|
118 |
|
119 |
# Convert to the standard form (M for million, G for billion, k for thousand)
|
120 |
if unit == "G" or unit == "b":
|
|
|
129 |
return parameter_count > threshold
|
130 |
|
131 |
|
132 |
+
def validate_pytorch2(torch_version: str = None) -> bool:
|
133 |
+
"""
|
134 |
+
validate_pytorch2 - validate that the PyTorch version is 2.0 or greater
|
135 |
+
|
136 |
+
:param str torch_version: the PyTorch version to validate, defaults to None
|
137 |
+
:return: True if the PyTorch version is 2.0 or greater, False otherwise
|
138 |
+
"""
|
139 |
+
|
140 |
torch_version = torch.__version__ if torch_version is None else torch_version
|
141 |
|
142 |
pattern = r"^2\.\d+(\.\d+)*"
|
|
|
147 |
def get_timestamp(detailed=False) -> str:
|
148 |
"""
|
149 |
get_timestamp - get a timestamp for the current time
|
150 |
+
:param bool detailed: whether to include seconds and microseconds, defaults to False
|
151 |
+
:return: str, the timestamp
|
152 |
"""
|
153 |
return (
|
154 |
datetime.now().strftime("%b%d%Y_%H%M%S%f")
|
|
|
157 |
)
|
158 |
|
159 |
|
160 |
+
def truncate_word_count(text: str, max_words=1024) -> dict:
|
161 |
"""
|
162 |
+
truncate_word_count - truncate a text to a maximum number of words
|
163 |
+
:param str text: the text to truncate
|
164 |
+
:param int max_words: the maximum number of words to keep, defaults to 1024
|
165 |
+
:return: dict, the processed text
|
|
|
|
|
|
|
|
|
166 |
"""
|
|
|
167 |
words = re.split(r"\s+", text)
|
168 |
processed = {}
|
169 |
if len(words) > max_words:
|
|
|
178 |
def load_examples(src, filetypes=[".txt", ".pdf"]):
|
179 |
"""
|
180 |
load_examples - a helper function for the gradio module to load examples
|
181 |
+
:param str src: the path to the examples
|
|
|
182 |
"""
|
183 |
src = Path(src)
|
184 |
src.mkdir(exist_ok=True)
|
|
|
211 |
return examples
|
212 |
|
213 |
|
214 |
+
def textlist2html(text_batches: List[str]) -> str:
|
215 |
+
"""textlist2html - convert a list of text summaries into a single HTML string"""
|
216 |
# Step 1: Generate each summary batch as a string of HTML
|
217 |
formatted_batches = [
|
218 |
f"""
|
|
|
246 |
return text_html_block
|
247 |
|
248 |
|
249 |
+
def extract_batches(html_string: str, pattern=None, flags=None) -> list:
|
250 |
"""
|
251 |
Extract batches of text from an HTML string.
|
252 |
|
|
|
338 |
|
339 |
def saves_summary(
|
340 |
summarize_output, outpath: str or Path = None, add_signature=True, **kwargs
|
341 |
+
) -> Path:
|
342 |
"""
|
343 |
saves_summary - save the summary generated from summarize_via_tokenbatches() to a text file
|
344 |
|