import logging
import time
from pathlib import Path

import gradio as gr
import nltk
import transformers
from cleantext import clean

from summarize import load_model_and_tokenizer, summarize_via_tokenbatches
from utils import load_examples, truncate_word_count

_here = Path(__file__).parent

nltk.download("stopwords")  # TODO: find where this requirement originates from

transformers.logging.set_verbosity_error()  # silence model-loading warnings
logging.basicConfig()


def proc_submission(
    input_text: str,
    model_size: str,
    num_beams,
    token_batch_length,
    length_penalty,
    repetition_penalty,
    no_repeat_ngram_size,
    max_input_length: int = 768,
):
    """
    proc_submission - a helper function for the gradio module
    Parameters
    ----------
    input_text : str, required, the text to be processed
    max_input_length : int, optional, the maximum length of the input text, default=512
    Returns
    -------
    str of HTML, the interactive HTML form for the model
    """

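    # generation settings forwarded to summarize_via_tokenbatches (and, one
    # would assume, on to the model's generate() call); max_length is tied to
    # token_batch_length so each section summary is capped at roughly a
    # quarter of its input batch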
    settings = {
        "length_penalty": length_penalty,
        "repetition_penalty": repetition_penalty,
        "no_repeat_ngram_size": no_repeat_ngram_size,
        "encoder_no_repeat_ngram_size": 4,
        "num_beams": num_beams,
        "min_length": 4,
        "max_length": int(token_batch_length // 4),
        "early_stopping": True,
        "do_sample": False,
    }
    st = time.perf_counter()
    history = {}
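    # clean-text normalizes the raw input (unicode fixes, stray whitespace);
    # lower=False preserves the original casing for the model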
    clean_text = clean(input_text, lower=False)
    max_input_length = 1024 if model_size == "base" else max_input_length
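    # the base checkpoint gets a longer (1024-word) input window, presumably
    # because it is cheaper to run than the large model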
    processed = truncate_word_count(clean_text, max_input_length)
    if processed["was_truncated"]:
        tr_in = processed["truncated_text"]
        msg = f"Input text was truncated to {max_input_length} words (based on whitespace)"
        logging.warning(msg)
        history["WARNING"] = msg
    else:
        tr_in = clean_text  # use the cleaned text on the non-truncated path too

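    # summarize_via_tokenbatches appears to return one dict per token batch,
    # each with a "summary" list of strings and a float "summary_score"
    # (inferred from the indexing below, not from the summarize module itself)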
    _summaries = summarize_via_tokenbatches(
        tr_in,
        model_sm if model_size == "base" else model,
        tokenizer_sm if model_size == "base" else tokenizer,
        batch_length=token_batch_length,
        **settings,
    )
    sum_text = [f"Section {i}: " + s["summary"][0] for i, s in enumerate(_summaries)]
    sum_scores = [
        f"\n - Section {i}: {round(s['summary_score'], 4)}"
        for i, s in enumerate(_summaries)
    ]

    history["Summary Text"] = "<br>".join(sum_text)
    history["Summary Scores"] = "The summary scores can be thought of as representing the quality of the summary. less-negative numbers (closer to 0) are better.<br><br>"
    history["Summary Scores"] += "\n".join(sum_scores)
    html = ""
    rt = round((time.perf_counter() - st) / 60, 2)
    print(f"Runtime: {rt} minutes")
    html += f"<p>Runtime: {rt} minutes on CPU</p>"
    for name, item in history.items():
        html += (
            f"<h2>{name}:</h2><hr><b>{item}</b><br><br>"
            if "summary" not in name.lower()
            else f"<h2>{name}:</h2><hr>{item}<br><br>"
        )

    html += ""

    return html


if __name__ == "__main__":

    model, tokenizer = load_model_and_tokenizer("pszemraj/led-large-book-summary")
    model_sm, tokenizer_sm = load_model_and_tokenizer("pszemraj/led-base-book-summary")
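    # both checkpoints are loaded up front so switching the model-size radio
    # button never triggers a load mid-request, at the cost of startup time
    # and memory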
    title = "Long-Form Summarization: LED & BookSum"
    description = "A simple demo of how to use a fine-tuned LED model to summarize long-form text. [This model](https://huggingface.co/pszemraj/led-large-book-summary) is a fine-tuned version of [allenai/led-large-16384](https://huggingface.co/allenai/led-large-16384) on the [BookSum dataset](https://arxiv.org/abs/2105.08209). The goal was to create a model that generalizes well and is useful for summarizing long text in academic and everyday use. See the [model card](https://huggingface.co/pszemraj/led-large-book-summary) for a notebook with (much faster) GPU inference on Colab."
    gr.Interface(
        proc_submission,
        inputs=[
            gr.inputs.Textbox(
                lines=10,
                label="input text",
                placeholder="Enter text to summarize; it will be cleaned and truncated on Spaces. Narrative, academic (papers and lecture transcriptions), and article text all work well. Generation may take a while depending on the input length :)",
            ),
            gr.inputs.Radio(
                choices=["base", "large"], label="model size", default="large"
            ),
            gr.inputs.Slider(
                minimum=2, maximum=4, label="num_beams", default=2, step=1
            ),
            gr.inputs.Slider(
                minimum=512,
                maximum=1024,
                label="token_batch_length",
                default=512,
                step=256,
            ),
            gr.inputs.Slider(
                minimum=0.5, maximum=1.1, label="length_penalty", default=0.7, step=0.05
            ),
            gr.inputs.Slider(
                minimum=1.0,
                maximum=5.0,
                label="repetition_penalty",
                default=3.5,
                step=0.1,
            ),
            gr.inputs.Slider(
                minimum=2, maximum=4, label="no_repeat_ngram_size", default=3, step=1
            ),
        ],
        outputs="html",
        examples_per_page=2,
        title=title,
        description=description,
        article="The model is available on the Hugging Face Hub as [pszemraj/led-large-book-summary](https://huggingface.co/pszemraj/led-large-book-summary). See the model card for usage details and a tutorial notebook.",
        examples=load_examples(_here / "examples"),
        cache_examples=True,
    ).launch()
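
# Minimal programmatic usage sketch (not executed by the app; the input text
# below is a placeholder). proc_submission reads model/tokenizer and
# model_sm/tokenizer_sm from module scope, so those globals must be set first:
#
#   model, tokenizer = load_model_and_tokenizer("pszemraj/led-large-book-summary")
#   model_sm, tokenizer_sm = load_model_and_tokenizer("pszemraj/led-base-book-summary")
#   html = proc_submission(
#       "some long input text ...",
#       "large",
#       num_beams=2,
#       token_batch_length=512,
#       length_penalty=0.7,
#       repetition_penalty=3.5,
#       no_repeat_ngram_size=3,
#   )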