pszemraj committed
Commit 80098ed
1 Parent(s): 435abb4

✨ option to drop stopwords pre-summ


Signed-off-by: peter szemraj <peterszemraj@gmail.com>

Files changed (2)
  1. app.py +13 -1
  2. utils.py +50 -10
app.py CHANGED
@@ -32,6 +32,7 @@ os.environ["TOKENIZERS_PARALLELISM"] = "false"
 logging.basicConfig(
     level=logging.INFO,
     format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
+    datefmt="%Y-%b-%d %H:%M:%S",
 )
 
 import gradio as gr
@@ -50,6 +51,7 @@ from utils import (
     saves_summary,
     textlist2html,
     truncate_word_count,
+    remove_stopwords,
 )
 
 _here = Path(__file__).parent
@@ -194,6 +196,7 @@ def proc_submission(
     length_penalty: float,
     repetition_penalty: float,
     no_repeat_ngram_size: int,
+    predrop_stopwords: bool,
     max_input_length: int = 6144,
 ):
     """
@@ -230,11 +233,14 @@ def proc_submission(
         "do_sample": False,
     }
     max_input_length = int(os.environ.get("APP_MAX_WORDS", max_input_length))
-    logging.info(f"max_input_length set to: {max_input_length}")
+    logging.info(
+        f"max_input_length set to: {max_input_length}. pre-drop stopwords: {predrop_stopwords}"
+    )
 
     st = time.perf_counter()
     history = {}
     clean_text = clean(input_text, lower=False)
+    clean_text = remove_stopwords(clean_text) if predrop_stopwords else clean_text
     processed = truncate_word_count(clean_text, max_words=max_input_length)
 
     if processed["was_truncated"]:
@@ -296,6 +302,7 @@ def proc_submission(
     html += ""
 
     # save to file
+    settings["remove_stopwords"] = predrop_stopwords
     settings["model_name"] = model_name
     saved_file = saves_summary(summarize_output=_summaries, outpath=None, **settings)
     return html, full_summary, scores_out, saved_file
@@ -607,6 +614,10 @@ if __name__ == "__main__":
            label="no repeat ngram size",
            value=3,
        )
+        predrop_stopwords = gr.Checkbox(
+            label="Drop Stopwords (Pre-Truncation)",
+            value=False,
+        )
        with gr.Column():
            gr.Markdown("## About")
            gr.Markdown(
@@ -638,6 +649,7 @@ if __name__ == "__main__":
            length_penalty,
            repetition_penalty,
            no_repeat_ngram_size,
+            predrop_stopwords,
        ],
        outputs=[output_text, summary_text, summary_scores, text_file],
    )
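
Note on the app.py change: the functional effect of the new flag is that stopwords are stripped before the word-count truncation, so more content words fit under max_input_length. A minimal standalone sketch of that ordering (remove_stopwords is stubbed here; the real helper is the one added to utils.py below, and truncation is simplified to a plain word cut):

def remove_stopwords_stub(text: str, stopwords=frozenset({"the", "a", "an", "of", "and"})) -> str:
    # stand-in for utils.remove_stopwords: keep only non-stopword tokens
    return " ".join(w for w in text.split() if w.lower() not in stopwords)

def preprocess(text: str, predrop_stopwords: bool, max_input_length: int = 6144) -> str:
    # mirrors the order in proc_submission: optional stopword drop, then truncation
    if predrop_stopwords:
        text = remove_stopwords_stub(text)
    words = text.split()
    return " ".join(words[:max_input_length])

print(preprocess("the quick brown fox jumps over the lazy dog", True, max_input_length=5))
# -> "quick brown fox jumps over" (stopwords dropped first, then the word cap applied)
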
utils.py CHANGED
@@ -19,34 +19,74 @@ logging.basicConfig(
 
 import torch
 from natsort import natsorted
-from nltk.tokenize import word_tokenize
+from nltk.tokenize import word_tokenize, WhitespaceTokenizer
 from rapidfuzz import fuzz
 
-# Define stopwords
 STOPWORDS = set(
     "a about above after again against all am an and any are aren't as at be because been before being below between both but by can't cannot could couldn't did didn't do does doesn't doing don't down during each few for from further had hadn't has hasn't have haven't having he he'd he'll he's her here here's hers herself him himself his how how's i i'd i'll i'm i've if in into is isn't it it's its itself let's me more most mustn't my myself no nor not of off on once only or other ought our ours ourselves out over own same shan't she she'd she'll she's should shouldn't so some such than that that's the their theirs them themselves then there there's these they they'd they'll they're they've this those through to too under until up very was wasn't we we'd we'll we're we've were weren't what what's when when's where where's which while who who's whom why why's with won't would wouldn't you you'd you'll you're you've your yours yourself yourselves".split()
 )
 
 
-def remove_stopwords(text: str, stopwords: list = STOPWORDS) -> str:
+def custom_tokenize(text: str) -> List[str]:
+    """custom_tokenize - merges words containing apostrophes as one token."""
+
+    # Tokenize the text using the WhitespaceTokenizer
+    tokenizer = WhitespaceTokenizer()
+    tokens = tokenizer.tokenize(text)
+
+    merged_tokens = []
+    merged_token = ""
+
+    for token in tokens:
+        if re.search(r"\w+'\w+", token):
+            # Token contains an apostrophe, merge with previous token
+            merged_token += token
+        else:
+            # no apostrophe, add previous merged token (if any) and current
+            if merged_token:
+                merged_tokens.append(merged_token)
+                merged_token = ""
+            merged_tokens.append(token)
+
+    # Add the last merged token (if any)
+    if merged_token:
+        merged_tokens.append(merged_token)
+
+    return merged_tokens
+
+
+def remove_stopwords(
+    text: str, stopwords: List[str] = STOPWORDS, use_custom_tokenize: bool = True
+) -> str:
     """
-    remove_stopwords - Remove stopwords from a string.
+    remove_stopwords - Remove stopwords from text.
 
-    :param str text: text to remove stopwords from
-    :param list stopwords: list of stopwords to remove, defaults to STOPWORDS
-    :return, str: text with stopwords removed
+    :param str text: input text
+    :param List[str] stopwords: list of stopwords, defaults to STOPWORDS
+    :param bool use_custom_tokenize: use custom apostrophe tokenizer, defaults to True
+    :return str: text with stopwords removed
     """
-    words = word_tokenize(text)
-    filtered_words = []
+    words = custom_tokenize(text) if use_custom_tokenize else word_tokenize(text)
 
+    filtered_words = []
     for word in words:
-        word = word.strip(string.punctuation)  # remove punctuation
+        # Remove leading and trailing punctuation marks
+        word = word.strip(string.punctuation)
 
         if word.lower() not in stopwords:
             filtered_words.append(word)
 
     filtered_text = " ".join(filtered_words)
 
+    # Replace multiple consecutive whitespaces with a single space
+    filtered_text = re.sub(r"\s+", " ", filtered_text)
+    filtered_text = filtered_text.strip()
+
+    # Restore original whitespaces around punctuation marks
+    filtered_text = re.sub(
+        r"\s*([{}])\s*".format(re.escape(string.punctuation)), r"\1", filtered_text
+    )
+
     return filtered_text
 
 
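
A quick usage check of the new helper (assumes utils.py is importable and its dependencies, e.g. nltk, are installed; the exact output depends on the STOPWORDS list above):

from utils import remove_stopwords

text = "This isn't the summary that we were hoping for, but it's a start."
# with the default whitespace-based custom tokenizer, contractions such as
# "isn't" and "it's" stay whole and therefore match entries in STOPWORDS
print(remove_stopwords(text))
# expected to print roughly: "summary hoping start"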