Add script used to clean the dataset
Browse files- mrclean.py +95 -0
mrclean.py
ADDED
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import re

# Sequence-delimiter tokens inserted between documents when the corpus
# is concatenated. The commented-out values below are the "real" special
# tokens; they are currently disabled (empty strings), which makes every
# token/padding insertion in this file a no-op on the output text.
# START_TOKEN = '<s>'
# END_TOKEN = '</s>'
# PADDING_TOKEN = '<pad>'

START_TOKEN = ''
END_TOKEN = ''
PADDING_TOKEN = ''
def _make_padding_sequence(seq_length):
    """Return an end-of-text marker followed by `seq_length` padding tokens."""
    tokens = [END_TOKEN]
    tokens.extend([PADDING_TOKEN] * seq_length)
    return ''.join(tokens)
def cleanup_simple_wikipedia(text, seq_length):
    """Tag article boundaries in the Simple Wikipedia dump.

    A blank line separates two articles: close the previous one with
    end+padding tokens and open the next with a start token.
    """
    padding = _make_padding_sequence(seq_length)
    body = text.replace('\n\n', padding + START_TOKEN)
    return START_TOKEN + body + padding
def cleanup_wikipedia(text, seq_length):
    """Strip heading markup and stray whitespace; tag article boundaries."""
    padding = _make_padding_sequence(seq_length)
    # Drop the '= = = Heading = = =' markup but keep the heading text itself.
    text = re.sub(r'= = = (.+?) = = =\n', r'\1', text)
    stripped = '\n'.join(raw_line.strip() for raw_line in text.splitlines())
    # [1:] drops the first character — presumably a leading newline left by
    # the dump's formatting; TODO confirm against the raw data.
    body = stripped[1:].replace('\n\n', padding + START_TOKEN)
    return START_TOKEN + body + padding
def cleanup_qed(text, seq_length):
    """Normalise QED subtitle text, re-casing lines that are mostly uppercase.

    TODO: this should probably be padded too, but it’s difficult to detect
    when subtitles start and end.
    The handling of proper nouns and of parentheses isn’t perfect, but this
    is still an improvement over the base text.
    """
    punctuation_ex = re.compile(r'([.!?]\s*)')
    unimportant_chars_ex = re.compile(r'\(.*?\)|[.!?]')
    lines = []
    for line in text.splitlines():
        if len(line) > 0:
            # Ignore parentheticals and sentence punctuation when measuring case.
            line_body = unimportant_chars_ex.sub('', line)
            # Guard against lines made only of punctuation/parentheticals
            # (e.g. "..." or "(music)"), which previously raised
            # ZeroDivisionError; treat them as not-uppercase.
            f_upper = (sum(c.isupper() for c in line_body) / len(line_body)
                       if line_body else 0.0)
            if f_upper >= 0.5:  # Mostly uppercase characters
                # Re-case sentence by sentence; 'l' in an all-caps line is
                # OCR noise for 'I'.
                # Taken from https://stackoverflow.com/a/41662260
                split_on_punctuation = punctuation_ex.split(line.replace('l', 'I'))
                line = ''.join([sentence.capitalize() for sentence in split_on_punctuation])
        lines.append(line.strip())
    return START_TOKEN + '\n'.join(lines) + END_TOKEN + ''.join(seq_length * [PADDING_TOKEN])
def cleanup_extra_spaces(text):
    """Collapse runs of blanks and drop blanks before punctuation.

    NBSP (\\u00A0) shows up in the raw dumps, so it is treated as a blank too.
    """
    text = re.sub(r'[ \t\u00A0]+', ' ', text)
    return re.sub(r'[ \t\u00A0]([.,;!?])', r'\1', text)
def cleanup_bnc_spoken(text, seq_length):
    """Normalise spacing and tag blank-line boundaries in BNC spoken text."""
    padding = _make_padding_sequence(seq_length)
    spaced = cleanup_extra_spaces(text)
    return START_TOKEN + spaced.replace('\n\n', padding + START_TOKEN) + padding
def cleanup_aochildes(text, seq_length):
    """Tidy spacing in AOChildes transcripts and append trailing padding."""
    return START_TOKEN + cleanup_extra_spaces(text) + _make_padding_sequence(seq_length)
def cleanup_cbt(text, seq_length):
    """Clean Children's Book Test text: spacing and detached apostrophes."""
    spaced = cleanup_extra_spaces(text)
    # Reattach apostrophes the tokeniser split off: "don 't" -> "don't".
    glued = re.sub(r"([\w\d])[ \t\u00A0](['’]\w)", r'\1\2', spaced)
    return START_TOKEN + glued + _make_padding_sequence(seq_length)
def cleanup_children_stories(text, seq_length):
    """Wrap the children's-stories dump in start/end/padding tokens.

    Sometimes one skipped line marks the beginning of a new story, but
    sometimes it is present within a same story, which doesn’t make it
    very useful for separating independent stories — so the text is left
    as-is apart from the surrounding tokens.
    """
    padding = _make_padding_sequence(seq_length)
    return ''.join((START_TOKEN, text, padding))
def cleanup_gutenberg(text, seq_length):
    """Append padding only; the Gutenberg text is already fairly clean.

    Some entries don’t seem very useful (e.g. figure captions preceded by
    a number), but removing them would also remove bullet lists which are
    otherwise consistent with the surrounding text.
    No start or end tokens because the text seems to be cut.
    """
    return text + PADDING_TOKEN * seq_length
def cleanup_open_subtitles(text, seq_length):
    """Drop subtitle-credit lines ("Subtitles by ...") and add delimiters.

    The text is otherwise mostly clean.
    """
    credit_line = re.compile(r'^.*subtitle.*$\n', re.MULTILINE | re.IGNORECASE)
    without_credits = credit_line.sub('', text)
    return START_TOKEN + without_credits + _make_padding_sequence(seq_length)
def cleanup_switchboard(text, seq_length):
    """Append padding only. No start or end tokens because the text seems to be cut."""
    padding = PADDING_TOKEN * seq_length
    return text + padding