|
import json |
|
from glob import glob |
|
import re |
|
import nltk |
|
nltk.download('punkt') |
|
from nltk import word_tokenize as lib_tokenizer |
|
import string |
|
|
|
|
|
def preprocess(x, max_length=-1, remove_puncts=False):
    """Tokenise *x* and optionally strip punctuation and truncate it.

    Args:
        x: Raw input text.
        max_length: When greater than zero, keep only the first
            ``max_length`` whitespace-separated words. Any other value
            disables truncation.
        remove_puncts: When true, drop every character that appears in
            ``string.punctuation``.

    Returns:
        The processed text as a single string.
    """
    text = nltk_tokenize(x).replace("\n", " ")

    if remove_puncts:
        text = "".join(ch for ch in text if ch not in string.punctuation)

    if max_length > 0:
        kept_words = text.split()[:max_length]
        text = " ".join(kept_words)

    return text
|
|
|
|
|
def nltk_tokenize(x):
    """Return *x* as one string of space-separated tokens.

    The text is first cleaned with ``strip_context`` and then tokenised
    with this module's ``word_tokenize``; the pieces are joined with single
    spaces and surrounding whitespace is trimmed.
    """
    cleaned = strip_context(x)
    pieces = word_tokenize(cleaned)
    return " ".join(pieces).strip()
|
|
|
|
|
def post_process_answer(x, entity_dict):
    """Normalise a raw answer string into its final form.

    Args:
        x: The raw answer. Anything that is not a string is returned
            untouched.
        entity_dict: Mapping looked up (via ``.get``) with the lowercased
            answer when the answer is at most two words long.

    Returns:
        * ``x`` unchanged when it is not a string;
        * the string ``"NaN"`` when nothing is left after stripping
          surrounding punctuation;
        * the leading number when the cleaned answer starts with a numeric
          word and does not contain "tháng";
        * the lowercased answer when it is numeric or contains one of
          "ngày", "tháng", "năm";
        * otherwise the ``entity_dict`` entry for short answers (falling
          back to the cleaned answer), or the cleaned answer itself.
    """
    if not isinstance(x, str):
        return x

    try:
        x = strip_answer_string(x)
    except IndexError:
        # strip_answer_string indexes the first/last character, so it raises
        # IndexError when the answer is empty or consists solely of
        # strippable punctuation. Only that case maps to "NaN"; any other
        # error is a real bug and must not be swallowed.
        return "NaN"

    # Drop all remaining punctuation and collapse whitespace runs.
    x = "".join(c for c in x if c not in string.punctuation)
    x = " ".join(x.split())
    y = x.lower()

    # NOTE(review): the keyword checks below look at the original-case
    # string, so capitalised forms such as "Tháng" are not detected, unlike
    # check_number which lowercases first. Confirm whether this is intended.
    words = y.split()
    if len(y) > 1 and words[0].isnumeric() and "tháng" not in x:
        return words[0]

    looks_like_number_or_date = (
        x.isnumeric() or "ngày" in x or "tháng" in x or "năm" in x
    )
    if looks_like_number_or_date:
        return y

    if len(x.split()) <= 2:
        return entity_dict.get(y, x)
    return x
|
|
|
|
|
# Module-level cache used by word_tokenize: maps a raw whitespace-delimited
# word to its tokenised, space-joined form.
dict_map = {}
|
|
|
|
|
def word_tokenize(text):
    """Tokenise *text* word by word, caching each word's result.

    The text is split on whitespace. Each piece is run through
    ``lib_tokenizer``, its tokens are joined with single spaces, and the
    quote tokens `` and '' are replaced by a plain double quote. Results are
    stored in the module-level ``dict_map`` so a repeated word is tokenised
    only once.

    Returns:
        A list with one string per whitespace-delimited word of *text*.
    """
    normalised = []
    for raw_word in text.split():
        cached = dict_map.get(raw_word)
        if cached is None:
            joined = ' '.join(lib_tokenizer(raw_word))
            cached = joined.replace('``', '"').replace("''", '"')
            dict_map[raw_word] = cached
        normalised.append(cached)
    return normalised
|
|
|
|
|
def strip_answer_string(text):
    """Trim surrounding whitespace and punctuation from an answer string.

    Trailing characters are removed first, then leading ones. A trailing
    ``)`` is kept when the text contains ``(`` and does not start with it.
    A double quote at one end is kept when the other end is not a double
    quote and the text contains more than one double quote.

    Raises:
        IndexError: if *text* is empty, or becomes empty while stripping.
    """
    strippable = '.,/><;:\'"[]{}+=-_)(*&^!~`'
    result = text.strip()

    # Pass 1: peel characters off the right-hand side.
    while result[-1] in strippable:
        head, tail = result[0], result[-1]
        closes_inner_paren = tail == ')' and head != '(' and '(' in result
        closes_inner_quote = (
            tail == '"' and head != '"' and result.count('"') > 1
        )
        if closes_inner_paren or closes_inner_quote:
            break
        result = result[:-1].strip()

    # Pass 2: peel characters off the left-hand side.
    while result[0] in strippable:
        opens_inner_quote = (
            result[0] == '"'
            and result[-1] != '"'
            and result.count('"') > 1
        )
        if opens_inner_quote:
            break
        result = result[1:].strip()

    return result.strip()
|
|
|
|
|
def strip_context(text):
    """Collapse all whitespace in *text* into single spaces and trim the ends.

    Newlines are turned into spaces, every run of whitespace is reduced to
    one space, and leading/trailing whitespace is removed.
    """
    single_line = text.replace('\n', ' ')
    collapsed = re.sub(r'\s+', ' ', single_line)
    return collapsed.strip()
|
|
|
|
|
def check_number(x):
    """Tell whether *x* looks like a number or a date expression.

    *x* is converted to a lowercased string. The result is True when that
    string is entirely numeric or contains one of the substrings "ngày",
    "tháng" or "năm"; otherwise False.
    """
    lowered = str(x).lower()
    if lowered.isnumeric():
        return True
    return any(keyword in lowered for keyword in ("ngày", "tháng", "năm"))
|
|