boris committed on
Commit
7b58e88
1 Parent(s): 0465605

feat(text): improvements on pre-processing

Browse files
Files changed (1) hide show
  1. dalle_mini/text.py +17 -7
dalle_mini/text.py CHANGED
@@ -95,7 +95,7 @@ def fix_html(t):
95
 
96
 
97
  def replace_punctuation_with_commas(t):
98
- return re.sub("""([()[\].,|:;?!=+~\-])""", ",", t)
99
 
100
 
101
  def simplify_quotes(t):
@@ -114,7 +114,7 @@ def remove_comma_numbers(t):
114
 
115
 
116
  def pre_process_dot_numbers(t):
117
- return re.sub("(\d)\.(\d)", fr"\1{temp_token}dot{temp_token}\2", t)
118
 
119
 
120
  def post_process_dot_numbers(t):
@@ -132,6 +132,14 @@ def post_process_quotes(t):
132
  return re.sub(f"{temp_token}quote{temp_token}", "'", t)
133
 
134
 
 
 
 
 
 
 
 
 
135
  def merge_commas(t):
136
  return re.sub("(\s*,+\s*)+", ", ", t)
137
 
@@ -143,17 +151,17 @@ def add_space_after_commas(t):
143
  def handle_special_chars(t):
144
  "Handle special characters"
145
  # replace "-" with a space when between words without space
146
- t = re.sub("([a-zA-Z])-([a-zA-Z])", r"\1 \2", t)
147
- # always add space around & or % or / or $
148
- return re.sub("([%&\/$])", r" \1 ", t)
149
 
150
 
151
  def expand_hashtags(t, hashtag_processor):
152
  "Remove # and try to split words"
153
- return re.sub("#(\w+)", lambda m: " , " + hashtag_processor(m.group(1)), t)
154
 
155
 
156
- _re_ignore_chars = """[_#\\]"""
157
 
158
 
159
  def ignore_chars(t):
@@ -219,6 +227,7 @@ class TextNormalizer:
219
  # handle dots in numbers and quotes - Part 1
220
  t = pre_process_dot_numbers(t)
221
  t = pre_process_quotes(t)
 
222
  # handle special characters
223
  t = handle_special_chars(t)
224
  # handle hashtags
@@ -232,6 +241,7 @@ class TextNormalizer:
232
  # handle dots in numbers and quotes - Part 2
233
  t = post_process_dot_numbers(t)
234
  t = post_process_quotes(t)
 
235
  # handle repeating characters
236
  t = remove_repeating_chars(t)
237
  # merge quotes
 
95
 
96
 
97
  def replace_punctuation_with_commas(t):
98
+ return re.sub("([()[\].,|:;?!=+~\-\/])", ",", t)
99
 
100
 
101
  def simplify_quotes(t):
 
114
 
115
 
116
  def pre_process_dot_numbers(t):
117
+ return re.sub("(\w)\.(\w)", fr"\1{temp_token}dot{temp_token}\2", t)
118
 
119
 
120
  def post_process_dot_numbers(t):
 
132
  return re.sub(f"{temp_token}quote{temp_token}", "'", t)
133
 
134
 
135
+ def pre_process_dates(t):
136
+ return re.sub("(\d)/(\d)", fr"\1{temp_token}slash{temp_token}\2", t)
137
+
138
+
139
+ def post_process_dates(t):
140
+ return re.sub(f"{temp_token}slash{temp_token}", "/", t)
141
+
142
+
143
  def merge_commas(t):
144
  return re.sub("(\s*,+\s*)+", ", ", t)
145
 
 
151
  def handle_special_chars(t):
152
  "Handle special characters"
153
  # replace "-" with a space when between words without space
154
+ t = re.sub("(\w)-(\w)", r"\1 \2", t)
155
+ # always add space around some characters
156
+ return re.sub("([%&\/$*])", r" \1 ", t)
157
 
158
 
159
  def expand_hashtags(t, hashtag_processor):
160
  "Remove # and try to split words"
161
+ return re.sub("#(\w+)", lambda m: hashtag_processor(m.group(1)), t)
162
 
163
 
164
+ _re_ignore_chars = r"[_#\\]"
165
 
166
 
167
  def ignore_chars(t):
 
227
  # handle dots in numbers and quotes - Part 1
228
  t = pre_process_dot_numbers(t)
229
  t = pre_process_quotes(t)
230
+ t = pre_process_dates(t)
231
  # handle special characters
232
  t = handle_special_chars(t)
233
  # handle hashtags
 
241
  # handle dots in numbers and quotes - Part 2
242
  t = post_process_dot_numbers(t)
243
  t = post_process_quotes(t)
244
+ t = post_process_dates(t)
245
  # handle repeating characters
246
  t = remove_repeating_chars(t)
247
  # merge quotes