import re # START_TOKEN = '' # END_TOKEN = '' # PADDING_TOKEN = '' START_TOKEN = '' END_TOKEN = '' PADDING_TOKEN = '' def _make_padding_sequence(seq_length): return ''.join([END_TOKEN] + seq_length * [PADDING_TOKEN]) def cleanup_simple_wikipedia(text, seq_length): pad_seq = _make_padding_sequence(seq_length) text = START_TOKEN + re.sub(r'\n\n', pad_seq + START_TOKEN, text) + pad_seq return text def cleanup_wikipedia(text, seq_length): pad_seq = _make_padding_sequence(seq_length) text = re.sub(r'= = = (.+?) = = =\n', r'\1', text) lines = [line.strip() for line in text.splitlines()] text = START_TOKEN + re.sub(r'\n\n', pad_seq + START_TOKEN, '\n'.join(lines)[1:]) + pad_seq return text def cleanup_qed(text, seq_length): # TODO: this should probably be padded too, but it’s difficult to detect when subtitles start and end # The handling of proper nouns and of parentheses isn’t perfect, but this is still an improvement over the base text punctuation_ex = re.compile(r'([.!?]\s*)') unimportant_chars_ex = re.compile(r'\(.*?\)|[.!?]') lines = [] for line in text.splitlines(): nchars = len(line) if nchars > 0: line_body = unimportant_chars_ex.sub('', line) f_upper = sum(c.isupper() for c in line_body) / len(line_body) if f_upper >= 0.5: # Mostly uppercase characters # Taken from https://stackoverflow.com/a/41662260 split_on_punctuation = punctuation_ex.split(line.replace('l', 'I')) line = ''.join([sentence.capitalize() for sentence in split_on_punctuation]) lines.append(line.strip()) return START_TOKEN + '\n'.join(lines) + END_TOKEN + ''.join(seq_length * [PADDING_TOKEN]) def cleanup_extra_spaces(text): multiple_spaces_ex = re.compile(r'[ \t\u00A0]+') space_before_punctuation_ex = re.compile(r'[ \t\u00A0]([.,;!?])') text = multiple_spaces_ex.sub(' ', text) text = space_before_punctuation_ex.sub(r'\1', text) return text def cleanup_bnc_spoken(text, seq_length): pad_seq = _make_padding_sequence(seq_length) text = cleanup_extra_spaces(text) text = START_TOKEN + re.sub(r'\n\n', pad_seq + START_TOKEN, text) + pad_seq return text def cleanup_aochildes(text, seq_length): text = cleanup_extra_spaces(text) return START_TOKEN + text + _make_padding_sequence(seq_length) def cleanup_cbt(text, seq_length): text = cleanup_extra_spaces(text) space_before_apostroph = re.compile(r"([\w\d])[ \t\u00A0](['’]\w)") #space_before_quote = re.compile(r"[ \t\u00A0](['’])") #space_after_quote = re.compile(r"([`])[ \t\u00A0]") #text = space_before_quote.sub(r'\1', text) #text = space_after_quote.sub(r'\1', text) text = space_before_apostroph.sub(r'\1\2', text) return START_TOKEN + text + _make_padding_sequence(seq_length) def cleanup_children_stories(text, seq_length): # Sometimes one skipped line marks the beginning of a new story, # but sometimes it is present within a same story, which doesn’t # make it very useful for separating independent stories. return START_TOKEN + text + _make_padding_sequence(seq_length) def cleanup_gutenberg(text, seq_length): # Overall, the text is clean, however some entries don’t seem # very useful, e.g. figure captions preceded by a number. # Not sure if we should remove them, because that would also # remove bullet lists which are otherwise consistent with the # surrounding text. # No start or end tokens because the text seems to be cut. return text + ''.join(seq_length * [PADDING_TOKEN]) def cleanup_open_subtitles(text, seq_length): # The text is mostly clean, apart from some subtitle credits # such as "Subtitles by ...". subtitle_credit_ex = re.compile(r'^.*subtitle.*$\n', re.MULTILINE | re.IGNORECASE) text = subtitle_credit_ex.sub('', text) return START_TOKEN + text + _make_padding_sequence(seq_length) def cleanup_switchboard(text, seq_length): # No start or end tokens because the text seems to be cut. return text + ''.join(seq_length * [PADDING_TOKEN])