pszemraj committed
Commit e414859
1 Parent(s): 2d980d5

✨ mwe working aggregation

Signed-off-by: peter szemraj <peterszemraj@gmail.com>

Files changed (2)
  1. aggregate.py +158 -67
  2. app.py +91 -10
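
The commit's "mwe" (minimal working example) is the new BatchAggregator class in aggregate.py, which folds the per-batch summaries produced by summarize.py into one aggregate summary using an instruction-tuned text2text model, plus the gradio wiring for it in app.py. A minimal sketch of the intended call pattern, using the default aggregation model app.py instantiates below (the batch strings are placeholders):

    from aggregate import BatchAggregator

    # same aggregation model that app.py instantiates below
    aggregator = BatchAggregator("MBZUAI/LaMini-Flan-T5-783M")
    summary_batches = [
        "summary of text batch 1 ...",  # placeholder batch summaries
        "summary of text batch 2 ...",
    ]
    aggregate_summary = aggregator.infer_aggregate(summary_batches)
    print(aggregate_summary)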
aggregate.py CHANGED
@@ -1,10 +1,12 @@
-# imports
+import pprint as pp
 import logging
 import time
 
 import torch
 from transformers import GenerationConfig, pipeline
 
+from utils import compare_model_size
+
 # Setting up logging
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
@@ -12,94 +14,182 @@ logging.basicConfig(
 
 
 class BatchAggregator:
+    CONFIGURED_MODELS = [
+        "pszemraj/bart-large-mnli-dolly_hhrlhf-v1"
+    ]  # TODO: Add models here
+    DEFAULT_INSTRUCTION = "Write a comprehensive yet concise summary that pulls together the main points of the following text:"
+    GENERIC_CONFIG = GenerationConfig(
+        num_beams=8,
+        early_stopping=True,
+        do_sample=False,
+        min_new_tokens=32,
+        max_new_tokens=256,
+        repetition_penalty=1.1,
+        length_penalty=1.4,
+        no_repeat_ngram_size=4,
+        encoder_no_repeat_ngram_size=5,
+    )
+
     def __init__(
         self, model_name: str = "pszemraj/bart-large-mnli-dolly_hhrlhf-v1", **kwargs
     ):
+        self.device = None
+        self.is_compiled = False
         self.logger = logging.getLogger(__name__)
+        self.init_model(model_name)
+
+    def init_model(self, model_name: str) -> None:
+        """
+        Initialize the model.
+
+        :param model_name: The name of the model to use.
+        """
+        # Free up memory
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
+        self.logger.info(f"Setting model to {model_name}")
         self.model_name = model_name
-        self.logger.info(f"Initializing aggregator with model {model_name}")
-        self.aggregator = pipeline(
-            "text2text-generation",
-            model_name,
-            device=0 if torch.cuda.is_available() else -1,
-            torch_dtype=torch.float32,
-        )
+        self.aggregator = self._create_pipeline(model_name)
+        self._configure_model()
+        # update the generation config with the specific tokenizer
+        tokenizer_params = {
+            "decoder_start_token_id": 0
+            if "t5" in model_name.lower()
+            else self.aggregator.tokenizer.eos_token_id,
+            "eos_token_id": 1
+            if "t5" in model_name.lower()
+            else self.aggregator.tokenizer.eos_token_id,
+            "pad_token_id": 0
+            if "t5" in model_name.lower()
+            else self.aggregator.tokenizer.pad_token_id,
+        }
+        self.update_generation_config(**tokenizer_params)
+
+    def _create_pipeline(
+        self, model_name: str = "pszemraj/bart-large-mnli-dolly_hhrlhf-v1"
+    ) -> pipeline:
+        """
+        _create_pipeline creates a pipeline for the model.
+
+        :param str model_name: model name to use, default: "pszemraj/bart-large-mnli-dolly_hhrlhf-v1"
+        :return pipeline: the pipeline for the model
 
+        :raises Exception: if the pipeline cannot be created
+        """
+        self.device = 0 if torch.cuda.is_available() else -1
         try:
-            self.aggregator.model = torch.compile(self.aggregator.model)
+            self.logger.info(
+                f"Creating pipeline with model {model_name} on device {self.device}"
+            )
+            return pipeline(
+                "text2text-generation",
+                model_name,
+                device=self.device,
+                torch_dtype=torch.float32,
+            )
         except Exception as e:
-            self.logger.warning(f"Could not compile model with Torch 2.0: {e}")
+            self.logger.error(f"Failed to create pipeline: {e}")
+            raise
+
+    def _configure_model(self):
+        """
+        Configure the model for generation.
+        """
         try:
-            self.aggregator.model.generation_config = GenerationConfig.from_pretrained(
-                self.model_name
-            )
+            self.aggregator.model = torch.compile(self.aggregator.model)
+            self.is_compiled = True
         except Exception as e:
-            self.logger.warning(
-                f"Could not load generation config, using defaults: {e}"
-            )
-            self.aggregator.model.generation_config = GenerationConfig(
-                num_beams=4,
-                early_stopping=True,
-                do_sample=False,
-                min_new_tokens=32,
-                max_new_tokens=192,
-                repetition_penalty=1.1,
-                length_penalty=1.5,
-                no_repeat_ngram_size=4,
-                encoder_no_repeat_ngram_size=5,
-                decoder_start_token_id=0,
-                eos_token_id=1,
-                pad_token_id=0,
-            )
+            self.logger.warning(f"Could not compile model with Torch 2.0: {e}")
+
+        if self.model_name not in self.CONFIGURED_MODELS:
+            self.logger.info("Setting generation config to general defaults")
+            self._set_default_generation_config()
+        else:
+            try:
+                self.logger.info("Loading generation config from hub")
+                self.aggregator.model.generation_config = (
+                    GenerationConfig.from_pretrained(self.model_name)
+                )
+            except Exception as e:
+                self.logger.warning(
+                    f"Could not load generation config, using defaults: {e}"
                )
+                self._set_default_generation_config()
 
-        if "bart" in model_name.lower():
-            self.logger.info("Using BART model, updating generation config")
-            upd = {
-                "num_beams": 8,
-                "repetition_penalty": 1.3,
-                "length_penalty": 1.0,
-                "_from_model_config": False,
-                "max_new_tokens": 256,
-                "min_new_tokens": 32,
-                "no_repeat_ngram_size": 3,
-                "encoder_no_repeat_ngram_size": 6,
-            }
-            self.aggregator.model.generation_config.update(**upd)
-        if self.model_name != "pszemraj/bart-large-mnli-dolly_hhrlhf-v1":
-            self.logger.info("Updating generation config with defaults")
-            self.update_generation_config()
         self.logger.info(self.aggregator.model.generation_config.to_json_string())
 
+    def _set_default_generation_config(self):
+        """
+        Set the default generation configuration for the model.
+        """
+        self.aggregator.model.generation_config = self.GENERIC_CONFIG
+
+        if "bart" in self.model_name.lower():
+            self.logger.info("Using BART model, updating generation config")
+            upd = {
+                "num_beams": 8,
+                "repetition_penalty": 1.3,
+                "length_penalty": 1.0,
+                "_from_model_config": False,
+                "max_new_tokens": 256,
+                "min_new_tokens": 32,
+                "no_repeat_ngram_size": 3,
+                "encoder_no_repeat_ngram_size": 6,
+            }  # TODO: clean up
+            self.aggregator.model.generation_config.update(**upd)
+
+        if (
+            "large" in self.model_name.lower()
+            or "xl" in self.model_name.lower()
+            or compare_model_size(self.model_name, 500)
+        ):
+            upd = {"num_beams": 4}
+            self.update_generation_config(**upd)
+
     def update_generation_config(self, **kwargs):
-        self.logger.info(f"Updating generation config with {kwargs}")
-        default = GenerationConfig(
-            num_beams=4,
-            early_stopping=True,
-            do_sample=False,
-            min_new_tokens=32,
-            max_new_tokens=192,
-            repetition_penalty=1.1,
-            length_penalty=1.5,
-            no_repeat_ngram_size=4,
-            encoder_no_repeat_ngram_size=5,
-            decoder_start_token_id=0,
-            eos_token_id=1,
-            pad_token_id=0,
-        ).to_dict()
-        self.aggregator.model.generation_config.update(**default)
-    def _replace_pipeline(model_name)
+        """
+        Update the generation configuration with the specified parameters.
+
+        Args:
+            **kwargs: The parameters to update in the generation configuration.
+        """
+        self.logger.info(f"Updating generation config with {pp.pformat(kwargs)}")
+
+        self.aggregator.model.generation_config.update(**kwargs)
+
+    def update_loglevel(self, level: str = "INFO"):
+        """
+        Update the log level.
+
+        Args:
+            level (str): The log level to set. Defaults to "INFO".
+        """
+        self.logger.setLevel(level)
+
     def infer_aggregate(
         self,
         text_list: list,
-        instruction: str = "Write a comprehensive yet concise summary in paragraph form that pulls together the main points of the following text:",
+        instruction: str = DEFAULT_INSTRUCTION,
         **kwargs,
-    ):
+    ) -> str:
+        """
+        Generate a summary of the specified texts.
+
+        Args:
+            text_list (list): The texts to summarize.
+            instruction (str): The instruction for the summary. Defaults to DEFAULT_INSTRUCTION.
+            **kwargs: Additional parameters to update in the generation configuration.
+
+        Returns:
+            The generated summary.
+        """
        joined_text = "\n".join(text_list)
         prompt = f"{instruction}\n\n{joined_text}\n"
         if kwargs:
             self.update_generation_config(**kwargs)
         st = time.perf_counter()
-        self.logger.info(f"Running inference on {len(text_list)} texts")
+        self.logger.info(f"inference on {len(text_list)} texts ...")
         result = self.aggregator(
             prompt,
             generation_config=self.aggregator.model.generation_config,
@@ -110,7 +200,8 @@ class BatchAggregator:
         )
         return result
 
-    def count_tokens(self, text: str):
+    def count_tokens(self, text: str) -> int:
+        """count the number of tokens in a text"""
         return (
             len(self.aggregator.tokenizer.encode(text, truncation=False, padding=False))
             if text
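
Per-call overrides flow through infer_aggregate's **kwargs into update_generation_config, which merges them into the model's generation_config. A hedged sketch of that override path (the texts are placeholders; max_new_tokens and num_beams are standard transformers generation parameters):

    agg = BatchAggregator("pszemraj/bart-large-mnli-dolly_hhrlhf-v1")
    agg.update_loglevel("DEBUG")  # more verbose logs while experimenting
    out = agg.infer_aggregate(
        ["batch summary one ...", "batch summary two ..."],  # placeholders
        max_new_tokens=128,  # merged into the generation config for this call
        num_beams=4,
    )
    print(out, agg.count_tokens(out))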
app.py CHANGED
@@ -1,5 +1,5 @@
 """
-app.py - the main module for the gradio app
+app.py - the main module for the gradio app for summarization
 
 Usage:
     python app.py
@@ -19,6 +19,7 @@ import random
 import re
 import time
 from pathlib import Path
+import pprint as pp
 
 os.environ["USE_TORCH"] = "1"
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
@@ -31,16 +32,18 @@ logging.basicConfig(
 import gradio as gr
 import nltk
 import torch
+from aggregate import BatchAggregator
 from cleantext import clean
 from doctr.models import ocr_predictor
-
 from pdf2text import convert_PDF_to_Text
 from summarize import load_model_and_tokenizer, summarize_via_tokenbatches
 from utils import (
+    extract_batches,
     load_example_filenames,
     saves_summary,
     textlist2html,
     truncate_word_count,
+    remove_stagnant_files,
 )
 
 _here = Path(__file__).parent
@@ -57,10 +60,76 @@ MODEL_OPTIONS = [
     "pszemraj/pegasus-x-large-book-summary",
 ]  # models users can choose from
 
+SUMMARY_PLACEHOLDER = "<p><em>Output will appear below:</em></p>"
+
 # if duplicating space, uncomment this line to adjust the max words
 # os.environ["APP_MAX_WORDS"] = str(2048)  # set the max words to 2048
 # os.environ["APP_OCR_MAX_PAGES"] = str(40)  # set the max pages to 40
 
+aggregator = BatchAggregator("MBZUAI/LaMini-Flan-T5-783M")
+
+
+def aggregate_text(
+    summary_text: str,
+    text_file: gr.inputs.File = None,
+):
+    """
+    Aggregate the text from the batches.
+
+    NOTE: if using this code outside of this file, you should probably pass the
+    BatchAggregator object as a parameter.
+    :param summary_text: The batches to aggregate, in html format
+    """
+    if summary_text is None or summary_text == SUMMARY_PLACEHOLDER:
+        logging.error("No text provided. Make sure a summary has been generated first.")
+        return "Error: No text provided. Make sure a summary has been generated first."
+
+    try:
+        extracted_batches = extract_batches(summary_text)
+    except Exception as e:
+        logging.info(summary_text)
+        logging.info(f"the batches html is: {type(summary_text)}")
+        return f"Error: unable to extract batches - check input: {e}"
+    if not extracted_batches:
+        logging.error("unable to extract batches - check input")
+        return "Error: unable to extract batches - check input"
+
+    out_path = None
+    if text_file is not None:
+        out_path = text_file.name  # assuming name attribute stores the file path
+
+    content_batches = [batch["content"] for batch in extracted_batches]
+    full_summary = aggregator.infer_aggregate(content_batches)
+
+    # if a path that exists is provided, save the summary with markdown formatting
+    if out_path:
+        out_path = Path(out_path)
+
+        try:
+            with open(out_path, "a", encoding="utf-8") as f:
+                f.write("\n\n### Aggregate Summary\n\n")
+                f.write(
+                    "- This is an instruction-based LLM aggregation of the previous 'summary batches'.\n"
+                )
+                f.write(f"- Aggregation model: {aggregator.model_name}\n\n")
+                f.write(f"{full_summary}\n\n")
+            logging.info(f"Updated {out_path} with aggregate summary")
+        except Exception as e:
+            logging.error(f"unable to update {out_path} with aggregate summary: {e}")
+
+    full_summary_html = f"""
+    <div style="
+        margin-bottom: 20px;
+        font-size: 18px;
+        line-height: 1.5em;
+        color: #333;
+    ">
+        <h2 style="font-size: 22px; color: #555;">Aggregate Summary:</h2>
+        <p style="white-space: pre-line;">{full_summary}</p>
+    </div>
+    """
+    return full_summary_html
+
 
 def predict(
     input_text: str,
@@ -128,6 +197,7 @@ def proc_submission(
         str in HTML format, string of the summary, str of score
     """
 
+    remove_stagnant_files()  # clean up old files
     settings = {
         "length_penalty": float(length_penalty),
         "repetition_penalty": float(repetition_penalty),
@@ -208,7 +278,6 @@ def proc_submission(
     # save to file
     settings["model_name"] = model_name
     saved_file = saves_summary(summarize_output=_summaries, outpath=None, **settings)
-
     return html, full_summary, scores_out, saved_file
 
 
@@ -361,7 +430,7 @@ if __name__ == "__main__":
                 summarize_button = gr.Button(
                     "Summarize!",
                     variant="primary",
-                )
+                )  # TODO: collapse button to be on same line as something else
                 output_text = gr.HTML("<p><em>Output will appear below:</em></p>")
             with gr.Column():
                 gr.Markdown("#### Results & Scores")
@@ -384,11 +453,19 @@ if __name__ == "__main__":
                     label="Summary Scores",
                     placeholder="Summary scores will appear here",
                 )
+            with gr.Column():
+                gr.Markdown("#### **Summary Output**")
+                summary_text = gr.HTML(
+                    label="Summary", value="<i>Summary will appear here!</i>"
+                )
+            with gr.Column():
+                gr.Markdown("##### **Aggregate Summary Batches**")
+                aggregate_button = gr.Button(
+                    "Aggregate!",
+                    variant="primary",
+                )  # TODO: collapse button to be on same line as something else
+                aggregated_summary = gr.HTML(label="Aggregate Summary", value="")
 
-            gr.Markdown("#### **Summary Output**")
-            summary_text = gr.HTML(
-                label="Summary", value="<i>Summary will appear here!</i>"
-            )
             gr.Markdown("---")
         with gr.Column():
             gr.Markdown("### Advanced Settings")
@@ -456,5 +533,9 @@ if __name__ == "__main__":
         ],
         outputs=[output_text, summary_text, summary_scores, text_file],
     )
-
-    demo.launch(enable_queue=True)
+    aggregate_button.click(
+        fn=aggregate_text,
+        inputs=[summary_text, text_file],
+        outputs=[aggregated_summary],
+    )
+    demo.launch(enable_queue=True, share=True)
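
aggregate_text depends on utils.extract_batches, which is imported above but not part of this commit; from its use here it must parse the batches HTML (built by textlist2html) into dicts carrying a "content" key. A purely hypothetical stand-in, only to show the expected shape:

    import re

    def extract_batches_sketch(html: str) -> list:
        # Hypothetical stand-in for utils.extract_batches (not in this commit):
        # pull the text of each <p>...</p> block out of the batches HTML and
        # return it in the [{"content": ...}, ...] shape aggregate_text expects.
        return [
            {"content": match.strip()}
            for match in re.findall(r"<p[^>]*>(.*?)</p>", html, flags=re.DOTALL)
        ]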