Spaces:
Runtime error
Runtime error
File size: 2,081 Bytes
b6e5241 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 |
import re
from nltk import sent_tokenize, word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
# Shared module-level detokenizer instance; detokenize_sent() calls its
# .detokenize() method on each sentence's token list.
detokenizer = TreebankWordDetokenizer()
def handle_dounble_quote(sent):
    """Normalize the spacing around double quotes in ``sent``.

    Double quotes are treated as strictly alternating, starting with an
    opening quote.  An opening quote is emitted as ``' "'`` and any spaces
    that directly follow it are dropped; a closing quote is emitted as
    ``'" '``.  In both cases trailing whitespace already accumulated before
    the quote is removed first.  The result is stripped and runs of spaces
    are collapsed to a single space.

    Args:
        sent: The sentence text to process.

    Returns:
        The sentence with normalized quote spacing.  Unbalanced quotes are
        not reported; the text processed so far is returned as-is.
    """
    out = ''
    opening_next = True    # the next '"' seen is an opening quote
    skip_spaces = False    # True right after an opening quote
    for ch in sent:
        if ch != '"':
            # Swallow the spaces that immediately follow an opening quote.
            if skip_spaces and ch == ' ':
                continue
            skip_spaces = False
            out += ch
            continue
        # A quote: trim what precedes it, then attach it on the proper side.
        trimmed = out.rstrip()
        out = trimmed + (' "' if opening_next else '" ')
        skip_spaces = opening_next
        opening_next = not opening_next
    return re.sub(r'[ ]+', ' ', out.strip())
def postprocess_space(sent):
    """Remove tokenization spaces around punctuation and clitics.

    Applied in order:
      * spaces before ``.``, ``,``, ``!`` and ``?`` are removed;
      * spaces after ``(`` and before ``)`` are removed;
      * a detached ``'s`` and a detached ``'t`` after ``n`` are re-attached
        to the preceding word when followed by a space, one of ``. , ! ?``,
        or the end of the string.

    Args:
        sent: The text to clean up.

    Returns:
        The text with the spacing fixes applied.
    """
    # The four punctuation marks are independent of one another, so a single
    # character-class pass is equivalent to one substitution per mark.
    sent = re.sub(r'[ ]+([.,!?])', r'\1', sent)
    sent = re.sub(r'\([ ]+', '(', sent)
    sent = re.sub(r'[ ]+\)', ')', sent)
    # A lookahead (instead of a consuming group) lets the clitic match at
    # the end of the string and keeps the following character available for
    # an adjacent match.
    sent = re.sub(r" 's(?=[ .,!?]|$)", "'s", sent)
    sent = re.sub(r"n 't(?=[ .,!?]|$)", "n't", sent)
    return sent
def detokenize_sent(sent):
    """Convert a space-tokenized text string back into natural-looking text.

    Steps:
      1. Re-attach a possessive that was split as ``"' s "``.
      2. If the number of stand-alone ``'`` tokens is even, treat them as
         paired quotation marks and replace each with ``"``.
      3. Split into sentences with ``sent_tokenize``, detokenize each with
         the module-level ``detokenizer``, and normalize double-quote
         spacing with ``handle_dounble_quote``.
      4. Join the sentences with single spaces and apply
         ``postprocess_space``.

    Args:
        sent: The tokenized text, with tokens separated by whitespace.

    Returns:
        The detokenized text.
    """
    # Clean raw sent
    sent = re.sub(r'\' s ', '\'s ', sent)
    toks = sent.split()
    # An odd count cannot be paired up, so those tokens are left untouched.
    if toks.count("'") % 2 == 0:
        toks = ['"' if t == "'" else t for t in toks]
    sent = ' '.join(toks)

    final_sents = []
    for _sent in sent_tokenize(sent):
        _sent = detokenizer.detokenize(_sent.split())
        # handle_dounble_quote always returns a string, so its result is
        # used directly; there is no error sentinel to check for.
        final_sents.append(handle_dounble_quote(_sent))
    return postprocess_space(' '.join(final_sents))
|