"""
app.py - the main module for the gradio app
Usage:
python app.py
"""
import contextlib
import gc
import logging
import os
import random
import re
import time
from pathlib import Path
from typing import Union
os.environ["USE_TORCH"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # tokenizers parallelism is buggy with gradio
logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
import gradio as gr
import nltk
import torch
from cleantext import clean
from doctr.models import ocr_predictor
from pdf2text import convert_PDF_to_Text
from summarize import load_model_and_tokenizer, summarize_via_tokenbatches
from utils import load_example_filenames, saves_summary, truncate_word_count
_here = Path(__file__).parent
nltk.download("stopwords", quiet=True)
MODEL_OPTIONS = [
"pszemraj/long-t5-tglobal-base-16384-book-summary",
"pszemraj/long-t5-tglobal-base-sci-simplify",
"pszemraj/long-t5-tglobal-base-sci-simplify-elife",
"pszemraj/long-t5-tglobal-base-16384-booksci-summary-v1",
"pszemraj/pegasus-x-large-book-summary",
] # models users can choose from
def predict(
input_text: str,
model_name: str,
token_batch_length: int = 1024,
empty_cache: bool = True,
**settings,
) -> list:
"""
    predict - helper function that runs summarization with the selected model
:param str input_text: the input text to summarize
:param str model_name: model name to use
:param int token_batch_length: the length of the token batches to use
    :param bool empty_cache: whether to empty the CUDA cache before loading a new model
    :return: list of dicts with keys "summary" and "summary_score"
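
    Example (illustrative sketch; keys match the dicts returned by summarize_via_tokenbatches):
        >>> results = predict("Some very long document ...", MODEL_OPTIONS[0])
        >>> print(results[0]["summary"][0])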
"""
if torch.cuda.is_available() and empty_cache:
torch.cuda.empty_cache()
model, tokenizer = load_model_and_tokenizer(model_name)
summaries = summarize_via_tokenbatches(
input_text,
model,
tokenizer,
batch_length=token_batch_length,
**settings,
)
del model
del tokenizer
gc.collect()
return summaries
def proc_submission(
input_text: str,
model_name: str,
num_beams: int,
token_batch_length: int,
length_penalty: float,
repetition_penalty: float,
no_repeat_ngram_size: int,
max_input_length: int = 4096,
):
"""
proc_submission - a helper function for the gradio module to process submissions
Args:
input_text (str): the input text to summarize
model_name (str): the hf model tag of the model to use
num_beams (int): the number of beams to use
token_batch_length (int): the length of the token batches to use
length_penalty (float): the length penalty to use
repetition_penalty (float): the repetition penalty to use
no_repeat_ngram_size (int): the no repeat ngram size to use
        max_input_length (int, optional): the maximum input length in words. Defaults to 4096.
    Returns:
        tuple: (HTML string with runtime/warnings, summary text, summary scores, path to the saved summary file)
"""
settings = {
"length_penalty": float(length_penalty),
"repetition_penalty": float(repetition_penalty),
"no_repeat_ngram_size": int(no_repeat_ngram_size),
"encoder_no_repeat_ngram_size": 4,
"num_beams": int(num_beams),
"min_length": 4,
"max_length": int(token_batch_length // 4),
"early_stopping": True,
"do_sample": False,
}
st = time.perf_counter()
history = {}
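    # normalize the raw text with clean-text, then cap the word count before summarizing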
clean_text = clean(input_text, lower=False)
processed = truncate_word_count(clean_text, max_words=max_input_length)
if processed["was_truncated"]:
tr_in = processed["truncated_text"]
        # build an HTML warning reporting how much of the input was kept
        input_words = re.split(r"\s+", input_text)
        msg = f"""
        <div style="background-color: #FFA500; color: white; padding: 20px;">
        <h3>Warning</h3>
        <p>Input text was truncated to {max_input_length} words, about {100*max_input_length/len(input_words):.2f}% of the submission.</p>
</div>
"""
logging.warning(msg)
history["WARNING"] = msg
else:
tr_in = input_text
msg = None
    if len(input_text) < 50:
        # separate failure case: the input is too short to summarize
msg = f"""
<div style="background-color: #880808; color: white; padding: 20px;">
<h3>Warning</h3>
<p>Input text is too short to summarize. Detected {len(input_text)} characters.
Please load text by selecting an example from the dropdown menu or by pasting text into the text box.</p>
</div>
"""
logging.warning(msg)
logging.warning("RETURNING EMPTY STRING")
history["WARNING"] = msg
return msg, "", []
_summaries = predict(
input_text=tr_in,
model_name=model_name,
token_batch_length=token_batch_length,
**settings,
)
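    # each result dict holds a list of summary strings plus a "summary_score"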
sum_text = [
f"Batch {i}:\n\t" + s["summary"][0] for i, s in enumerate(_summaries, start=1)
]
sum_scores = [
f" - Batch Summary {i}: {round(s['summary_score'],4)}"
for i, s in enumerate(_summaries)
]
sum_text_out = "\n".join(sum_text)
history["Summary Scores"] = "<br><br>"
scores_out = "\n".join(sum_scores)
rt = round((time.perf_counter() - st) / 60, 2)
logging.info(f"Runtime: {rt} minutes")
html = ""
html += f"<p>Runtime: {rt} minutes with model: {model_name}</p>"
if msg is not None:
html += msg
html += ""
# save to file
saved_file = saves_summary(_summaries)
return html, sum_text_out, scores_out, saved_file
def load_single_example_text(
    example_path: Union[str, Path],
    max_pages: int = 20,
) -> str:
"""
load_single_example_text - loads a single example text file
    :param Union[str, Path] example_path: name of the example to load
:param int max_pages: the maximum number of pages to load from a PDF
:return str: the text of the example
"""
global name_to_path
full_ex_path = name_to_path[example_path]
full_ex_path = Path(full_ex_path)
if full_ex_path.suffix == ".txt":
with open(full_ex_path, "r", encoding="utf-8", errors="ignore") as f:
raw_text = f.read()
text = clean(raw_text, lower=False)
elif full_ex_path.suffix == ".pdf":
logging.info(f"Loading PDF file {full_ex_path}")
conversion_stats = convert_PDF_to_Text(
full_ex_path,
ocr_model=ocr_model,
max_pages=max_pages,
)
text = conversion_stats["converted_text"]
else:
logging.error(f"Unknown file type {full_ex_path.suffix}")
text = "ERROR - check example path"
return text
def load_uploaded_file(file_obj, max_pages: int = 20, lower: bool = False) -> str:
"""
load_uploaded_file - loads a file uploaded by the user
    :param file_obj: Gradio file object (may arrive wrapped in a list)
:param int max_pages: the maximum number of pages to load from a PDF
:param bool lower: whether to lowercase the text
:return str: the text of the file
"""
    # gradio sometimes passes the upload wrapped in a list; unwrap it if so
if isinstance(file_obj, list):
file_obj = file_obj[0]
file_path = Path(file_obj.name)
try:
logging.info(f"Loading file:\t{file_path}")
if file_path.suffix == ".txt":
with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
raw_text = f.read()
text = clean(raw_text, lower=lower)
elif file_path.suffix == ".pdf":
logging.info(f"loading as PDF file {file_path}")
conversion_stats = convert_PDF_to_Text(
file_path,
ocr_model=ocr_model,
max_pages=max_pages,
)
text = conversion_stats["converted_text"]
else:
logging.error(f"Unknown file type {file_path.suffix}")
text = "ERROR - check file - unknown file type"
return text
except Exception as e:
logging.error(f"Trying to load file:\t{file_path},\nerror:\t{e}")
return "Error: Could not read file. Ensure that it is a valid text file with encoding UTF-8 if text, and a PDF if PDF."
if __name__ == "__main__":
logging.info("Starting app instance")
logging.info("Loading OCR model")
with contextlib.redirect_stdout(None):
ocr_model = ocr_predictor(
"db_resnet50",
"crnn_mobilenet_v3_large",
pretrained=True,
assume_straight_pages=True,
)
name_to_path = load_example_filenames(_here / "examples")
logging.info(f"Loaded {len(name_to_path)} examples")
demo = gr.Blocks()
_examples = list(name_to_path.keys())
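    # assemble the UI: input controls, summary outputs, and advanced generation settings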
with demo:
gr.Markdown("# Document Summarization with Long-Document Transformers")
gr.Markdown(
"This is an example use case for fine-tuned long document transformers. The model is trained on book summaries (via the BookSum dataset). The models in this demo are [LongT5-base](https://huggingface.co/pszemraj/long-t5-tglobal-base-16384-book-summary) and [Pegasus-X-Large](https://huggingface.co/pszemraj/pegasus-x-large-book-summary)."
)
with gr.Column():
gr.Markdown("## Load Inputs & Select Parameters")
gr.Markdown(
"Enter text below in the text area. The text will be summarized [using the selected parameters](https://huggingface.co/blog/how-to-generate). Optionally load an example below or upload a file. (`.txt` or `.pdf` - _[link to guide](https://i.imgur.com/c6Cs9ly.png)_)"
)
with gr.Row(variant="compact"):
with gr.Column(scale=0.5, variant="compact"):
model_name = gr.Dropdown(
choices=MODEL_OPTIONS,
value=MODEL_OPTIONS[0],
label="Model Name",
)
num_beams = gr.Radio(
choices=[2, 3, 4],
label="Beam Search: # of Beams",
value=2,
)
load_examples_button = gr.Button(
"Load Example in Dropdown",
)
load_file_button = gr.Button("Load an Uploaded File")
with gr.Column(variant="compact"):
example_name = gr.Dropdown(
_examples,
label="Examples",
value=random.choice(_examples),
)
uploaded_file = gr.File(
label="File Upload",
file_count="single",
type="file",
)
with gr.Row():
input_text = gr.Textbox(
lines=4,
label="Input Text (for summarization)",
placeholder="Enter text to summarize, the text will be cleaned and truncated on Spaces. Narrative, academic (both papers and lecture transcription), and article text work well. May take a bit to generate depending on the input text :)",
)
with gr.Column():
gr.Markdown("## Generate Summary")
gr.Markdown(
"Summarization should take ~1-2 minutes for most settings, but may extend up to 5-10 minutes in some scenarios."
)
summarize_button = gr.Button(
"Summarize!",
variant="primary",
)
output_text = gr.HTML("<p><em>Output will appear below:</em></p>")
gr.Markdown("### Summary Output")
summary_text = gr.Textbox(
label="Summary", placeholder="The generated summary will appear here"
)
gr.Markdown(
"The summary scores can be thought of as representing the quality of the summary. less-negative numbers (closer to 0) are better:"
)
summary_scores = gr.Textbox(
label="Summary Scores", placeholder="Summary scores will appear here"
)
text_file = gr.File(
label="Download as Text File",
file_count="single",
type="file",
interactive=False,
)
gr.Markdown("---")
with gr.Column():
gr.Markdown("### Advanced Settings")
with gr.Row(variant="compact"):
                length_penalty = gr.Slider(
                    minimum=0.5,
                    maximum=1.0,
                    label="length penalty",
                    value=0.7,
                    step=0.05,
                )
token_batch_length = gr.Radio(
choices=[512, 768, 1024, 1536],
label="token batch length",
value=1024,
)
with gr.Row(variant="compact"):
                repetition_penalty = gr.Slider(
                    minimum=1.0,
                    maximum=5.0,
                    label="repetition penalty",
                    value=3.5,
                    step=0.1,
                )
no_repeat_ngram_size = gr.Radio(
choices=[2, 3, 4],
label="no repeat ngram size",
value=3,
)
with gr.Column():
gr.Markdown("### About")
gr.Markdown(
"- Models are fine-tuned on the [BookSum dataset](https://arxiv.org/abs/2105.08209). The goal was to create a model that generalizes well and is useful for summarizing text in academic and everyday use."
)
gr.Markdown(
"- _Update April 2023:_ Additional models fine-tuned on the [PLOS](https://huggingface.co/datasets/pszemraj/scientific_lay_summarisation-plos-norm) and [ELIFE](https://huggingface.co/datasets/pszemraj/scientific_lay_summarisation-elife-norm) subsets of the [scientific lay summaries](https://arxiv.org/abs/2210.09932) dataset are available (see dropdown at the top)."
)
gr.Markdown("---")
load_examples_button.click(
fn=load_single_example_text, inputs=[example_name], outputs=[input_text]
)
load_file_button.click(
fn=load_uploaded_file, inputs=uploaded_file, outputs=[input_text]
)
summarize_button.click(
fn=proc_submission,
inputs=[
input_text,
model_name,
num_beams,
token_batch_length,
length_penalty,
repetition_penalty,
no_repeat_ngram_size,
],
outputs=[output_text, summary_text, summary_scores, text_file],
)
demo.launch(enable_queue=True)