Peter committed on
Commit
904400a
1 Parent(s): 596d396

change to regex-based split

Browse files
Files changed (1) hide show
  1. app.py +3 -1
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import logging
2
  from pathlib import Path
3
  import os
 
4
  import gradio as gr
5
  import nltk
6
  import torch
@@ -29,7 +30,8 @@ def truncate_word_count(text, max_words=512):
29
  -------
30
  dict, the text and whether it was truncated
31
  """
32
- words = text.split()
 
33
  processed = {}
34
  if len(words) > max_words:
35
  processed["was_truncated"] = True
 
1
  import logging
2
  from pathlib import Path
3
  import os
4
+ import re
5
  import gradio as gr
6
  import nltk
7
  import torch
 
30
  -------
31
  dict, the text and whether it was truncated
32
  """
33
+ # split on whitespace with regex
34
+ words = re.split(r"\s+", text)
35
  processed = {}
36
  if len(words) > max_words:
37
  processed["was_truncated"] = True