pszemraj committed
Commit: 9e8f29e
Parent: 80098ed

⚡️ 🐛 fix issue of wrong input text, disambiguate vars


Signed-off-by: peter szemraj <peterszemraj@gmail.com>

Files changed (2)
  1. app.py (+13 -8)
  2. utils.py (+12 -8)
app.py CHANGED

@@ -45,7 +45,9 @@ from aggregate import BatchAggregator
 from pdf2text import convert_PDF_to_Text
 from summarize import load_model_and_tokenizer, summarize_via_tokenbatches
 from utils import (
+    contraction_aware_tokenize,
     extract_batches,
+    extract_keywords,
     load_example_filenames,
     remove_stagnant_files,
     saves_summary,
@@ -241,10 +243,13 @@ def proc_submission(
     history = {}
     clean_text = clean(input_text, lower=False)
     clean_text = remove_stopwords(clean_text) if predrop_stopwords else clean_text
-    processed = truncate_word_count(clean_text, max_words=max_input_length)
+    logging.info(
+        f"pre-truncation word count: {len(contraction_aware_tokenize(clean_text))}"
+    )
+    truncation_validated = truncate_word_count(clean_text, max_words=max_input_length)

-    if processed["was_truncated"]:
-        tr_in = processed["truncated_text"]
+    if truncation_validated["was_truncated"]:
+        model_input_text = truncation_validated["processed_text"]
         # create elaborate HTML warning
         input_wc = re.split(r"\s+", input_text)
         msg = f"""
@@ -256,7 +261,7 @@ def proc_submission(
         logging.warning(msg)
         history["WARNING"] = msg
     else:
-        tr_in = input_text
+        model_input_text = truncation_validated["processed_text"]
         msg = None

     if len(input_text) < 50:
@@ -278,7 +283,7 @@ def proc_submission(
         return msg, "<strong>No summary generated.</strong>", "", []

     _summaries = predict(
-        input_text=tr_in,
+        input_text=model_input_text,
         model_name=model_name,
         token_batch_length=token_batch_length,
         **settings,
@@ -410,14 +415,14 @@ def parse_args():
         "--add_beam_option",
         type=int,
         default=None,
-        help=f"Add a beam search option to the list of beam search options: {pp.pformat(BEAM_OPTIONS, compact=True)}",
+        help=f"Add a beam search option to the demo UI options, default: {pp.pformat(BEAM_OPTIONS, compact=True)}",
     )
     parser.add_argument(
         "-batch",
         "--token_batch_option",
         type=int,
         default=None,
-        help=f"Add a token batch option to the list of token batch options: {pp.pformat(TOKEN_BATCH_OPTIONS, compact=True)}",
+        help=f"Add a token batch size to the demo UI options, default: {pp.pformat(TOKEN_BATCH_OPTIONS, compact=True)}",
     )
     parser.add_argument(
         "-level",
@@ -577,7 +582,7 @@ if __name__ == "__main__":
            value="<center><i>Aggregate summary will appear here!</i></center>",
         )
         gr.Markdown(
-            "\n\n_Aggregate summary also appended to the bottom of the `.txt` file!_"
+            "\n\n_Aggregate summary is also appended to the bottom of the `.txt` file._"
         )

         gr.Markdown("---")
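
The substantive fix above is in `proc_submission`: the old `else` branch set `tr_in = input_text`, so whenever the input fit within `max_input_length` the model received the raw text rather than the cleaned, optionally stopword-dropped `clean_text`. The renames (`processed` to `truncation_validated`, `tr_in` to `model_input_text`) disambiguate the variables, and both branches now read the same `processed_text` key. A minimal sketch of the corrected flow follows; `prepare_model_input` is a hypothetical helper name, and the whitespace-normalize line is a stand-in for the `clean()` call, whose implementation is outside this diff:

    # Sketch of the fixed proc_submission flow, not the full app.
    from utils import remove_stopwords, truncate_word_count

    def prepare_model_input(
        input_text: str, max_input_length: int = 1024, predrop_stopwords: bool = False
    ) -> str:
        clean_text = " ".join(input_text.split())  # stand-in for clean(input_text, lower=False)
        clean_text = remove_stopwords(clean_text) if predrop_stopwords else clean_text
        truncation_validated = truncate_word_count(clean_text, max_words=max_input_length)
        # The fix: read "processed_text" whether or not truncation happened;
        # the old else branch returned the raw input_text by mistake.
        return truncation_validated["processed_text"]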
utils.py CHANGED

@@ -27,8 +27,8 @@ STOPWORDS = set(
 )


-def custom_tokenize(text: str) -> List[str]:
-    """custom_tokenize - merges words containing apostrophes as one token."""
+def contraction_aware_tokenize(text: str) -> List[str]:
+    """contraction_aware_tokenize - merges words containing apostrophes as one token."""

     # Tokenize the text using the WhitespaceTokenizer
     tokenizer = WhitespaceTokenizer()
@@ -56,17 +56,21 @@ def custom_tokenize(text: str) -> List[str]:


 def remove_stopwords(
-    text: str, stopwords: List[str] = STOPWORDS, use_custom_tokenize: bool = True
+    text: str, stopwords: List[str] = STOPWORDS, contraction_tokenize: bool = True
 ) -> str:
     """
     remove_stopwords - Remove stopwords from text.

     :param str text: input text
     :param List[str] stopwords: list of stopwords, defaults to STOPWORDS
-    :param bool use_custom_tokenize: use custom apostrophe tokenizer, defaults to True
+    :param bool contraction_tokenize: use custom apostrophe tokenizer, defaults to True
     :return str: text with stopwords removed
     """
-    words = custom_tokenize(text) if use_custom_tokenize else word_tokenize(text)
+    words = (
+        contraction_aware_tokenize(text)
+        if contraction_tokenize
+        else word_tokenize(text)
+    )

     filtered_words = []
     for word in words:
@@ -204,14 +208,14 @@ def truncate_word_count(text: str, max_words=1024) -> dict:
     :param int max_words: the maximum number of words to keep, defaults to 1024
     :return: dict, the processed text
     """
-    words = re.split(r"\s+", text)
+    words = contraction_aware_tokenize(str(text))
     processed = {}
     if len(words) > max_words:
         processed["was_truncated"] = True
-        processed["truncated_text"] = " ".join(words[:max_words])
+        processed["processed_text"] = " ".join(words[:max_words])
     else:
         processed["was_truncated"] = False
-        processed["truncated_text"] = text
+        processed["processed_text"] = text
     return processed
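
In utils.py, the rename from `custom_tokenize` to `contraction_aware_tokenize` is cosmetic, but the `truncate_word_count` changes are not: word counting now uses the same contraction-aware tokenizer as the rest of the pipeline instead of a bare `re.split(r"\s+", text)`, and the result key is `processed_text` in both branches, which is what lets the caller in app.py read a single key. A short illustration of why the tokenizer choice matters for the count (NLTK's `word_tokenize` requires the `punkt` data and splits contractions apart, while the whitespace-based tokenization shown in the diff keeps them whole); the example string is made up:

    # Illustration only: contraction-aware counts vs. word_tokenize,
    # plus the new truncate_word_count return contract.
    from nltk.tokenize import WhitespaceTokenizer, word_tokenize

    from utils import truncate_word_count

    text = "Don't split what the user didn't write."
    print(WhitespaceTokenizer().tokenize(text))  # contractions whole: ["Don't", "split", ...]
    print(word_tokenize(text))  # contractions split: ["Do", "n't", ...], inflating the count

    result = truncate_word_count(text, max_words=3)
    print(result["was_truncated"])   # True: seven contraction-aware tokens > 3
    print(result["processed_text"])  # the first three tokens, rejoined with spaces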