File size: 2,081 Bytes
b6e5241
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import re
from nltk import sent_tokenize, word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
# Module-level TreebankWordDetokenizer instance, created once at import time
# and reused by detokenize_sent() for every sentence.
detokenizer = TreebankWordDetokenizer()

def handle_dounble_quote(sent):
    """Normalize the spacing around straight double quotes in ``sent``.

    Quotes are taken to alternate: the first ``"`` opens, the next closes,
    and so on. An opening quote ends up with one space before it and none
    after it; a closing quote ends up with none before it and one after it.
    Finally the result is stripped and runs of spaces are collapsed to one.

    No balance check is performed: an unmatched opening quote is simply
    formatted as an opening quote. The return value is always a string.
    """
    out = ''
    opening_next = True   # True when the next quote seen is an opening one
    skip_spaces = False   # True right after an opening quote
    for ch in sent:
        if ch != '"':
            # Swallow the spaces that directly follow an opening quote.
            if skip_spaces and ch == ' ':
                continue
            out += ch
            skip_spaces = False
            continue
        # Quote character: drop whitespace before it, then attach it with
        # the space on the outer side.
        if opening_next:
            out = out.rstrip() + ' "'
        else:
            out = out.rstrip() + '" '
        skip_spaces = opening_next
        opening_next = not opening_next
    return re.sub(r'[ ]+', ' ', out.strip())

def postprocess_space(sent):
    sent = re.sub(r'[ ]+\.', '.', sent)
    sent = re.sub(r'[ ]+,', ',', sent)
    sent = re.sub(r'[ ]+!', '!', sent)
    sent = re.sub(r'[ ]+\?', '?', sent)
    sent = re.sub(r'\([ ]+', '(', sent)
    sent = re.sub(r'[ ]+\)', ')', sent)
    sent = re.sub(r' \'s( |\.|,|!|\?)', r"'s\1", sent)
    sent = re.sub(r'n \'t( |\.|,|!|\?)', r"n't\1", sent)
    return sent

def detokenize_sent(sent):
    """Turn a whitespace-tokenized text back into naturally spaced text.

    Steps, in order:
      1. Rejoin ``' s `` into ``'s ``.
      2. If the number of stand-alone ``'`` tokens is even (including zero),
         replace each of them with ``"``.
      3. Split the text into sentences with ``sent_tokenize``; detokenize
         each one with the module-level ``detokenizer`` and fix the spacing
         around its double quotes with ``handle_dounble_quote``.
      4. Join the sentences with single spaces and run ``postprocess_space``
         on the result.

    Returns the cleaned-up string.
    """
    # Clean raw sent
    sent = re.sub(r'\' s ', '\'s ', sent)
    tokens = sent.split()
    if tokens.count("'") % 2 == 0:
        tokens = [('"' if tok == "'" else tok) for tok in tokens]

    pieces = []
    for piece in sent_tokenize(' '.join(tokens)):
        piece = detokenizer.detokenize(piece.split())
        quoted = handle_dounble_quote(piece)
        if quoted == -1:
            # -1 is treated as an "unbalanced quotes" sentinel; in that case
            # the detokenized sentence is kept without quote normalization.
            # NOTE(review): the helper visible in this file always returns a
            # string, so this branch looks purely defensive; confirm.
            print ('unbalanced double quote')
            print (piece)
        else:
            piece = quoted
        pieces.append(piece)
    return postprocess_space(' '.join(pieces))