pszemraj committed on
Commit
1d3a103
•
1 Parent(s): b8e1b99

๐Ÿ“ docstrings


Signed-off-by: peter szemraj <peterszemraj@gmail.com>

Files changed (1)
  1. app.py +41 -27
app.py CHANGED
@@ -1,3 +1,9 @@
+"""
+app.py - the main module for the gradio app
+
+Usage:
+    python app.py
+"""
 import contextlib
 import logging
 import os
@@ -19,7 +25,6 @@ import gradio as gr
 import nltk
 import torch
 from cleantext import clean
-from doctr.io import DocumentFile
 from doctr.models import ocr_predictor
 
 from pdf2text import convert_PDF_to_Text
@@ -28,7 +33,7 @@ from utils import load_example_filenames, saves_summary, truncate_word_count
 
 _here = Path(__file__).parent
 
-nltk.download("stopwords")  # TODO=find where this requirement originates from
+nltk.download("stopwords", quiet=True)
 
 
 MODEL_OPTIONS = [
@@ -37,7 +42,7 @@ MODEL_OPTIONS = [
     "pszemraj/long-t5-tglobal-base-sci-simplify-elife",
     "pszemraj/long-t5-tglobal-base-16384-booksci-summary-v1",
     "pszemraj/pegasus-x-large-book-summary",
-]
+]  # models users can choose from
 
 
 def predict(
@@ -46,8 +51,16 @@ def predict(
     token_batch_length: int = 1024,
     empty_cache: bool = True,
     **settings,
-):
-    """helper fn to support multiple models at once"""
+) -> list:
+    """
+    predict - helper fn to support multiple models for summarization at once
+
+    :param str input_text: the input text to summarize
+    :param str model_name: model name to use
+    :param int token_batch_length: the length of the token batches to use
+    :param bool empty_cache: whether to empty the cache before loading a new model
+    :return: list of dicts with keys "summary" and "score"
+    """
     if torch.cuda.is_available() and empty_cache:
         torch.cuda.empty_cache()
 
@@ -143,9 +156,11 @@ def proc_submission(
         token_batch_length=token_batch_length,
         **settings,
     )
-    sum_text = [f"Section {i}:\n\t" + s["summary"][0] for i, s in enumerate(_summaries)]
+    sum_text = [
+        f"Batch {i}:\n\t" + s["summary"][0] for i, s in enumerate(_summaries, start=1)
+    ]
     sum_scores = [
-        f" - Section {i}: {round(s['summary_score'],4)}"
+        f" - Batch Summary {i}: {round(s['summary_score'],4)}"
         for i, s in enumerate(_summaries)
     ]
 
@@ -153,9 +168,9 @@ def proc_submission(
     history["Summary Scores"] = "<br><br>"
    scores_out = "\n".join(sum_scores)
    rt = round((time.perf_counter() - st) / 60, 2)
-    print(f"Runtime: {rt} minutes")
+    logging.info(f"Runtime: {rt} minutes")
    html = ""
-    html += f"<p>Runtime: {rt} minutes on CPU</p>"
+    html += f"<p>Runtime: {rt} minutes with model: {model_name}</p>"
    if msg is not None:
        html += msg
 
@@ -170,11 +185,13 @@ def proc_submission(
 def load_single_example_text(
     example_path: str or Path,
     max_pages=20,
-):
+) -> str:
     """
-    load_single_example - a helper function for the gradio module to load examples
-    Returns:
-        list of str, the examples
+    load_single_example_text - loads a single example text file
+
+    :param str or Path example_path: name of the example to load
+    :param int max_pages: the maximum number of pages to load from a PDF
+    :return str: the text of the example
     """
     global name_to_path
     full_ex_path = name_to_path[example_path]
@@ -198,30 +215,27 @@ def load_single_example_text(
     return text
 
 
-def load_uploaded_file(file_obj, max_pages=20):
+def load_uploaded_file(file_obj, max_pages: int = 20, lower: bool = False) -> str:
     """
-    load_uploaded_file - process an uploaded file
-
-    Args:
-        file_obj (POTENTIALLY list): Gradio file object inside a list
+    load_uploaded_file - loads a file uploaded by the user
 
-    Returns:
-        str, the uploaded file contents
+    :param file_obj (POTENTIALLY list): Gradio file object inside a list
+    :param int max_pages: the maximum number of pages to load from a PDF
+    :param bool lower: whether to lowercase the text
+    :return str: the text of the file
     """
-
-    # file_path = Path(file_obj[0].name)
-
     # check if mysterious file object is a list
     if isinstance(file_obj, list):
         file_obj = file_obj[0]
     file_path = Path(file_obj.name)
     try:
+        logging.info(f"Loading file:\t{file_path}")
         if file_path.suffix == ".txt":
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                raw_text = f.read()
-            text = clean(raw_text, lower=False)
+            text = clean(raw_text, lower=lower)
        elif file_path.suffix == ".pdf":
-            logging.info(f"Loading PDF file {file_path}")
+            logging.info(f"loading as PDF file {file_path}")
            conversion_stats = convert_PDF_to_Text(
                file_path,
                ocr_model=ocr_model,
@@ -230,11 +244,11 @@ def load_uploaded_file(file_obj, max_pages=20):
             text = conversion_stats["converted_text"]
         else:
            logging.error(f"Unknown file type {file_path.suffix}")
-            text = "ERROR - check example path"
+            text = "ERROR - check file - unknown file type"
 
         return text
     except Exception as e:
-        logging.info(f"Trying to load file with path {file_path}, error: {e}")
+        logging.error(f"Trying to load file:\t{file_path},\nerror:\t{e}")
        return "Error: Could not read file. Ensure that it is a valid text file with encoding UTF-8 if text, and a PDF if PDF."
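
For reference, a minimal self-contained sketch (not part of the commit) of how the reworked batch labels in proc_submission render; the dummy _summaries list below only mimics the shape of the result dicts that predict() returns in this app:

# illustrative sketch - dummy data shaped like the output of predict()
_summaries = [
    {"summary": ["First chunk summary."], "summary_score": 0.1234567},
    {"summary": ["Second chunk summary."], "summary_score": 0.7654321},
]

sum_text = [
    f"Batch {i}:\n\t" + s["summary"][0] for i, s in enumerate(_summaries, start=1)
]
sum_scores = [
    f" - Batch Summary {i}: {round(s['summary_score'], 4)}"
    for i, s in enumerate(_summaries)
]

print("\n".join(sum_text))    # Batch 1: ..., Batch 2: ...
print("\n".join(sum_scores))  # score labels still enumerate from 0 in this commit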