File size: 2,176 Bytes
fa01b79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import json
from glob import glob
import re
from nltk import word_tokenize as lib_tokenizer
import string


def preprocess(x, max_length=-1, remove_puncts=False):
    x = nltk_tokenize(x)
    x = x.replace("\n", " ")
    if remove_puncts:
        x = "".join([i for i in x if i not in string.punctuation])
    if max_length > 0:
        x = " ".join(x.split()[:max_length])
    return x


def nltk_tokenize(x):
    return " ".join(word_tokenize(strip_context(x))).strip()


def post_process_answer(x, entity_dict):
    if type(x) is not str:
        return x
    try:
        x = strip_answer_string(x)
    except:
        return "NaN"
    x = "".join([c for c in x if c not in string.punctuation])
    x = " ".join(x.split())
    y = x.lower()
    if len(y) > 1 and y.split()[0].isnumeric() and ("tháng" not in x):
        return y.split()[0]
    if not (x.isnumeric() or "ngày" in x or "tháng" in x or "năm" in x):
        if len(x.split()) <= 2:
            return entity_dict.get(x.lower(), x)
        else:
            return x
    else:
        return y


dict_map = dict({})


def word_tokenize(text):
    global dict_map
    words = text.split()
    words_norm = []
    for w in words:
        if dict_map.get(w, None) is None:
            dict_map[w] = ' '.join(lib_tokenizer(w)).replace('``', '"').replace("''", '"')
        words_norm.append(dict_map[w])
    return words_norm


def strip_answer_string(text):
    text = text.strip()
    while text[-1] in '.,/><;:\'"[]{}+=-_)(*&^!~`':
        if text[0] != '(' and text[-1] == ')' and '(' in text:
            break
        if text[-1] == '"' and text[0] != '"' and text.count('"') > 1:
            break
        text = text[:-1].strip()
    while text[0] in '.,/><;:\'"[]{}+=-_)(*&^!~`':
        if text[0] == '"' and text[-1] != '"' and text.count('"') > 1:
            break
        text = text[1:].strip()
    text = text.strip()
    return text


def strip_context(text):
    text = text.replace('\n', ' ')
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    return text


def check_number(x):
    x = str(x).lower()
    return (x.isnumeric() or "ngày" in x or "tháng" in x or "năm" in x)