boris committed on
Commit
849c5f3
1 Parent(s): a96c347

feat(text): few improvements

Browse files
Files changed (1) hide show
  1. dalle_mini/text.py +27 -30
dalle_mini/text.py CHANGED
@@ -150,7 +150,7 @@ def handle_special_chars(t):
150
 
151
  def expand_hashtags(t, hashtag_processor):
152
  "Remove # and try to split words"
153
- return re.sub("#(\w+)", lambda m: hashtag_processor(m.group(1)), t)
154
 
155
 
156
  _re_ignore_chars = """[_#\/\\%]"""
@@ -197,15 +197,13 @@ class TextNormalizer:
197
  def __init__(self):
198
  self._hashtag_processor = HashtagProcessor()
199
 
200
- def __call__(self, t, clip=False):
201
-
202
  # fix some characters
203
  t = ftfy.fix_text(t)
204
  # fix html
205
  t = fix_html(t)
206
- if not clip:
207
- # decode and simplify text: see unidecode library
208
- t = unidecode(t)
209
  # lower case
210
  t = t.lower()
211
  # replace <PERSON> (for CC12M)
@@ -218,32 +216,31 @@ class TextNormalizer:
218
  t = remove_urls(t)
219
  # remove commas in numbers
220
  t = remove_comma_numbers(t)
221
- if not clip:
222
- # handle dots in numbers and quotes - Part 1
223
- t = pre_process_dot_numbers(t)
224
- t = pre_process_quotes(t)
225
- # handle special characters
226
- t = handle_special_chars(t)
227
- # handle hashtags
228
- t = expand_hashtags(t, self._hashtag_processor)
229
- # ignore useless characters
230
- t = ignore_chars(t)
231
- # simplify quotes
232
- t = simplify_quotes(t)
233
- # all punctuation becomes commas
234
- t = replace_punctuation_with_commas(t)
235
- # handle dots in numbers and quotes - Part 2
236
- t = post_process_dot_numbers(t)
237
- t = post_process_quotes(t)
238
- # handle repeating characters
239
- t = remove_repeating_chars(t)
240
- # merge commas
241
- t = merge_commas(t)
242
- # merge quotes
243
- t = merge_quotes(t)
244
  # remove multiple spaces
245
  t = remove_extra_spaces(t)
246
  # remove first and last comma
247
  t = remove_first_last_commas(t)
248
  # always start with a space
249
- return f" {t}" if not clip else t
 
150
 
151
  def expand_hashtags(t, hashtag_processor):
152
  "Remove # and try to split words"
153
+ return re.sub("#(\w+)", lambda m: " , " + hashtag_processor(m.group(1)), t)
154
 
155
 
156
  _re_ignore_chars = """[_#\/\\%]"""
 
197
  def __init__(self):
198
  self._hashtag_processor = HashtagProcessor()
199
 
200
+ def __call__(self, t):
 
201
  # fix some characters
202
  t = ftfy.fix_text(t)
203
  # fix html
204
  t = fix_html(t)
205
+ # decode and simplify text: see unidecode library
206
+ t = unidecode(t)
 
207
  # lower case
208
  t = t.lower()
209
  # replace <PERSON> (for CC12M)
 
216
  t = remove_urls(t)
217
  # remove commas in numbers
218
  t = remove_comma_numbers(t)
219
+ # handle dots in numbers and quotes - Part 1
220
+ t = pre_process_dot_numbers(t)
221
+ t = pre_process_quotes(t)
222
+ # handle special characters
223
+ t = handle_special_chars(t)
224
+ # handle hashtags
225
+ t = expand_hashtags(t, self._hashtag_processor)
226
+ # ignore useless characters
227
+ t = ignore_chars(t)
228
+ # simplify quotes
229
+ t = simplify_quotes(t)
230
+ # all punctuation becomes commas
231
+ t = replace_punctuation_with_commas(t)
232
+ # handle dots in numbers and quotes - Part 2
233
+ t = post_process_dot_numbers(t)
234
+ t = post_process_quotes(t)
235
+ # handle repeating characters
236
+ t = remove_repeating_chars(t)
237
+ # merge quotes
238
+ t = merge_quotes(t)
239
+ # merge commas
240
+ t = merge_commas(t)
 
241
  # remove multiple spaces
242
  t = remove_extra_spaces(t)
243
  # remove first and last comma
244
  t = remove_first_last_commas(t)
245
  # always start with a space
246
+ return f" {t}"