|
import re |
|
|
|
|
|
|
|
|
|
|
|
START_TOKEN = '' |
|
END_TOKEN = '' |
|
PADDING_TOKEN = '' |
|
|
|
def _make_padding_sequence(seq_length): |
|
return ''.join([END_TOKEN] + seq_length * [PADDING_TOKEN]) |
|
|
|
def cleanup_simple_wikipedia(text, seq_length): |
|
pad_seq = _make_padding_sequence(seq_length) |
|
text = START_TOKEN + re.sub(r'\n\n', pad_seq + START_TOKEN, text) + pad_seq |
|
return text |
|
|
|
def cleanup_wikipedia(text, seq_length): |
|
pad_seq = _make_padding_sequence(seq_length) |
|
text = re.sub(r'= = = (.+?) = = =\n', r'\1', text) |
|
lines = [line.strip() for line in text.splitlines()] |
|
text = START_TOKEN + re.sub(r'\n\n', pad_seq + START_TOKEN, '\n'.join(lines)[1:]) + pad_seq |
|
return text |
|
|
|
def cleanup_qed(text, seq_length): |
|
|
|
|
|
punctuation_ex = re.compile(r'([.!?]\s*)') |
|
unimportant_chars_ex = re.compile(r'\(.*?\)|[.!?]') |
|
lines = [] |
|
for line in text.splitlines(): |
|
nchars = len(line) |
|
if nchars > 0: |
|
line_body = unimportant_chars_ex.sub('', line) |
|
f_upper = sum(c.isupper() for c in line_body) / len(line_body) |
|
if f_upper >= 0.5: |
|
|
|
split_on_punctuation = punctuation_ex.split(line.replace('l', 'I')) |
|
line = ''.join([sentence.capitalize() for sentence in split_on_punctuation]) |
|
lines.append(line.strip()) |
|
return START_TOKEN + '\n'.join(lines) + END_TOKEN + ''.join(seq_length * [PADDING_TOKEN]) |
|
|
|
def cleanup_extra_spaces(text): |
|
multiple_spaces_ex = re.compile(r'[ \t\u00A0]+') |
|
space_before_punctuation_ex = re.compile(r'[ \t\u00A0]([.,;!?])') |
|
text = multiple_spaces_ex.sub(' ', text) |
|
text = space_before_punctuation_ex.sub(r'\1', text) |
|
return text |
|
|
|
def cleanup_bnc_spoken(text, seq_length): |
|
pad_seq = _make_padding_sequence(seq_length) |
|
text = cleanup_extra_spaces(text) |
|
text = START_TOKEN + re.sub(r'\n\n', pad_seq + START_TOKEN, text) + pad_seq |
|
return text |
|
|
|
def cleanup_aochildes(text, seq_length): |
|
text = cleanup_extra_spaces(text) |
|
return START_TOKEN + text + _make_padding_sequence(seq_length) |
|
|
|
def cleanup_cbt(text, seq_length): |
|
text = cleanup_extra_spaces(text) |
|
space_before_apostroph = re.compile(r"([\w\d])[ \t\u00A0](['’]\w)") |
|
|
|
|
|
|
|
|
|
text = space_before_apostroph.sub(r'\1\2', text) |
|
return START_TOKEN + text + _make_padding_sequence(seq_length) |
|
|
|
def cleanup_children_stories(text, seq_length): |
|
|
|
|
|
|
|
return START_TOKEN + text + _make_padding_sequence(seq_length) |
|
|
|
def cleanup_gutenberg(text, seq_length): |
|
|
|
|
|
|
|
|
|
|
|
|
|
return text + ''.join(seq_length * [PADDING_TOKEN]) |
|
|
|
def cleanup_open_subtitles(text, seq_length): |
|
|
|
|
|
subtitle_credit_ex = re.compile(r'^.*subtitle.*$\n', re.MULTILINE | re.IGNORECASE) |
|
text = subtitle_credit_ex.sub('', text) |
|
return START_TOKEN + text + _make_padding_sequence(seq_length) |
|
|
|
def cleanup_switchboard(text, seq_length): |
|
|
|
return text + ''.join(seq_length * [PADDING_TOKEN]) |