pszemraj committed
Commit c1cba4f
Parent: ca983bc

⚰️ 🎨 clean up and rm verbose testing code


Signed-off-by: peter szemraj <peterszemraj@gmail.com>

Files changed (3):
  1. aggregate.py +1 -1
  2. app.py +2 -23
  3. utils.py +1 -1
aggregate.py CHANGED
@@ -7,8 +7,8 @@ How it works:
 2. The language model does it.
 3. Yaay!
 """
-import pprint as pp
 import logging
+import pprint as pp
 import time
 
 import torch
app.py CHANGED
@@ -19,9 +19,9 @@ import contextlib
 import gc
 import logging
 import os
+import pprint as pp
 import random
 import re
-import pprint as pp
 import sys
 import time
 from pathlib import Path
@@ -47,13 +47,12 @@ from summarize import load_model_and_tokenizer, summarize_via_tokenbatches
 from utils import (
     contraction_aware_tokenize,
     extract_batches,
-    extract_keywords,
     load_example_filenames,
     remove_stagnant_files,
+    remove_stopwords,
     saves_summary,
     textlist2html,
     truncate_word_count,
-    remove_stopwords,
 )
 
 _here = Path(__file__).parent
@@ -268,22 +267,6 @@ def proc_submission(
     model_input_text = truncation_validated["processed_text"]
     msg = None
 
-    if predrop_stopwords:
-        # TODO: remove this
-
-        outdir = Path.cwd() / "scratch" / "predrop_stopwords-v4"
-        outdir.mkdir(parents=True, exist_ok=True)
-        keywords_cln = " ".join(extract_keywords(cln_text, kw_max_len=4))
-        keywords_sw_removed = "_".join(extract_keywords(model_input_text, kw_max_len=4))
-        cln_filename = f"{keywords_cln}_{len(cln_text)}.txt"
-        cln_outdir = outdir.parent / "source-text"
-        cln_outdir.mkdir(parents=True, exist_ok=True)
-        with open(cln_outdir / cln_filename, "w", encoding="utf-8") as f:
-            f.write(cln_text)
-        sw_rm_filename = f"{keywords_sw_removed}_{len(model_input_text)}.txt"
-        with open(outdir / sw_rm_filename, "w", encoding="utf-8") as f:
-            f.write(model_input_text)
-        logging.info(f"saved predrop_stopwords file to {outdir / sw_rm_filename}")
     if len(input_text) < 50:
         # this is essentially a different case from the above
         msg = f"""
@@ -326,7 +309,6 @@ def proc_submission(
 
     html += ""
 
-    # save to file
     settings["remove_stopwords"] = predrop_stopwords
     settings["model_name"] = model_name
     saved_file = saves_summary(summarize_output=_summaries, outpath=None, **settings)
@@ -460,9 +442,6 @@ def parse_args():
         choices=["DEBUG", "INFO", "WARNING", "ERROR"],
         help="Set the logging level",
     )
-    # if "--help" in sys.argv or "-h" in sys.argv:
-    #     parser.print_help()
-    #     sys.exit(0)
 
     return parser.parse_args()
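
Net effect in app.py: the scratch-file debugging branch is gone from proc_submission(), so enabling predrop_stopwords no longer writes keyword-named copies of the source text and its stopword-stripped variant to disk, and the imports are now sorted. A minimal sketch of the surviving pre-processing path, assuming remove_stopwords takes the text and returns it with entries from utils.STOPWORDS dropped (its exact signature is not shown in this diff, and prepare_input is a hypothetical wrapper, not a function in app.py):

from utils import remove_stopwords

def prepare_input(cln_text: str, predrop_stopwords: bool) -> str:
    # Hypothetical sketch: optionally strip stopwords before truncation and
    # token batching; unlike the removed debug branch, nothing is written to disk.
    return remove_stopwords(cln_text) if predrop_stopwords else cln_text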
utils.py CHANGED
@@ -19,7 +19,7 @@ logging.basicConfig(
 
 import torch
 from natsort import natsorted
-from nltk.tokenize import word_tokenize, WhitespaceTokenizer, sent_tokenize
+from nltk.tokenize import WhitespaceTokenizer, sent_tokenize, word_tokenize
 from rapidfuzz import fuzz
 
 STOPWORDS = set(
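
For context on the reordered nltk.tokenize import: the tokenizers it pulls in behave differently around contractions, which is plausibly why utils.py also defines contraction_aware_tokenize. A quick illustration of standard NLTK behavior (not code from this repo; word_tokenize requires the punkt models via nltk.download("punkt")):

from nltk.tokenize import WhitespaceTokenizer, word_tokenize

text = "Don't summarize twice."
# Treebank-style word_tokenize splits the contraction into two tokens:
print(word_tokenize(text))  # ['Do', "n't", 'summarize', 'twice', '.']
# WhitespaceTokenizer keeps it intact (punctuation stays attached):
print(WhitespaceTokenizer().tokenize(text))  # ["Don't", 'summarize', 'twice.']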