pszemraj committed on
Commit c006617 • 1 Parent(s): 2956200

🔊 improve logging and docs


Signed-off-by: peter szemraj <peterszemraj@gmail.com>

Files changed (1)
  1. summarize.py +20 -20
summarize.py CHANGED
@@ -1,25 +1,22 @@
 import logging

+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(message)s")
+
 import torch
 from tqdm.auto import tqdm
 from transformers import AutoModelForSeq2SeqLM, AutoTokenizer


-def load_model_and_tokenizer(model_name):
+def load_model_and_tokenizer(model_name: str) -> tuple:
     """
-    load_model_and_tokenizer - a function that loads a model and tokenizer from huggingface
+    load_model_and_tokenizer - load a model and tokenizer from a model name/ID on the hub

-    Args:
-        model_name (str): the name of the model to load
-    Returns:
-        AutoModelForSeq2SeqLM: the model
-        AutoTokenizer: the tokenizer
+    :param str model_name: the model name/ID on the hub
+    :return tuple: a tuple containing the model and tokenizer
     """
     device = "cuda" if torch.cuda.is_available() else "cpu"
     model = AutoModelForSeq2SeqLM.from_pretrained(
         model_name,
-        # low_cpu_mem_usage=True,
-        # use_cache=False,
     ).to(device)
     model = model.eval()

@@ -32,7 +29,7 @@ def load_model_and_tokenizer(model_name):

 def summarize_and_score(
     ids, mask, model, tokenizer, is_general_attention_model=True, **kwargs
-):
+) -> tuple:
     """
     summarize_and_score - given a batch of ids and a mask, return a summary and a score for the summary

@@ -42,9 +39,9 @@ def summarize_and_score(
         model (): the model to use for summarization
         tokenizer (): the tokenizer to use for summarization
         is_general_attention_model (bool, optional): whether the model is a general attention model. Defaults to True.
-
+        **kwargs: any additional arguments to pass to the model
     Returns:
-        str: the summary of the batch
+        tuple (str, float): the summary, the score for the summary
     """

     ids = ids[None, :]
@@ -91,25 +88,29 @@ def summarize_via_tokenbatches(
     batch_length=2048,
     batch_stride=16,
     **kwargs,
-):
+) -> list:
     """
-    summarize_via_tokenbatches - a function that takes a string and returns a summary
+    summarize_via_tokenbatches - summarize a long string via batches of tokens

     Args:
         input_text (str): the text to summarize
-        model (): the model to use for summarizationz
+        model (): the model to use for summarization
         tokenizer (): the tokenizer to use for summarization
         batch_length (int, optional): the length of each batch. Defaults to 2048.
         batch_stride (int, optional): the stride of each batch. Defaults to 16. The stride is the number of tokens that overlap between batches.

     Returns:
-        str: the summary
+        list: a list of dictionaries containing the input tokens, the summary, and the summary score
     """
+
+    logger = logging.getLogger(__name__)
     # log all input parameters
     if batch_length < 512:
         batch_length = 512
-        print("WARNING: batch_length was set to 512")
-    print(
+        logger.warning(
+            f"batch_length must be at least 512. Setting batch_length to {batch_length}"
+        )
+    logger.info(
         f"input parameters: {kwargs}, batch_length={batch_length}, batch_stride={batch_stride}"
     )
     encoded_input = tokenizer(
@@ -129,7 +130,6 @@ def summarize_via_tokenbatches(
     pbar = tqdm(total=len(in_id_arr))

     for _id, _mask in zip(in_id_arr, att_arr):
-
         result, score = summarize_and_score(
             ids=_id,
             mask=_mask,
@@ -144,7 +144,7 @@ def summarize_via_tokenbatches(
             "summary_score": score,
         }
         gen_summaries.append(_sum)
-        print(f"\t{result[0]}\nScore:\t{score}")
+        logger.info(f"\t{result[0]}\nScore:\t{score}")
         pbar.update()

     pbar.close()
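For context, here is a minimal usage sketch of the two public functions after this change. The checkpoint name, the input file, and the "summary" key are illustrative assumptions (only "summary_score" is visible in the hunks above); any seq2seq summarization checkpoint on the hub should work the same way:

from summarize import load_model_and_tokenizer, summarize_via_tokenbatches

# placeholder checkpoint -- substitute any seq2seq summarization model on the hub
model, tokenizer = load_model_and_tokenizer(
    "pszemraj/long-t5-tglobal-base-16384-book-summary"
)

with open("report.txt") as f:  # hypothetical long input document
    long_text = f.read()

summaries = summarize_via_tokenbatches(
    long_text,
    model,
    tokenizer,
    batch_length=2048,  # tokens per batch; values under 512 are bumped to 512 with a warning
    batch_stride=16,  # tokens of overlap between consecutive batches
)
for entry in summaries:
    print(entry["summary"], entry["summary_score"])  # key names assumed from the docstring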
 
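The batch_stride overlap described in the docstring is the standard overflowing-tokens pattern from transformers. The actual tokenizer(...) arguments in summarize.py fall outside the hunks shown, so the call below is a sketch of the idea rather than the file's exact code:

# hedged sketch: split input_text into overlapping windows of batch_length tokens,
# with batch_stride tokens shared between consecutive windows
encoded_input = tokenizer(
    input_text,
    padding="max_length",
    truncation=True,
    max_length=batch_length,
    stride=batch_stride,
    return_overflowing_tokens=True,
    add_special_tokens=False,
    return_tensors="pt",
)
in_id_arr, att_arr = encoded_input.input_ids, encoded_input.attention_mask

These are the in_id_arr and att_arr tensors that the loop in the last two hunks iterates over, one window per batch.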
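Because the module now calls logging.basicConfig at import time and routes per-batch output through a logging.getLogger(__name__) logger instead of print, callers can tune verbosity without editing the file. A small sketch, assuming the file is imported as the summarize module:

import logging

import summarize  # the module-level basicConfig(level=logging.INFO, ...) runs on import

# silence the per-batch INFO messages but keep the batch_length warnings
logging.getLogger("summarize").setLevel(logging.WARNING)

logging.basicConfig is a no-op when the root logger already has handlers, so applications that configure logging before importing this module keep their own handlers and format.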