"""Text pre-/post-processing helpers built on top of NLTK tokenization."""

import json
import re
import string
from glob import glob

import nltk
from nltk import word_tokenize as lib_tokenizer

# NOTE: import-time side effect kept from the original module -- fetches the
# 'punkt' tokenizer data that nltk's word_tokenize relies on.
nltk.download('punkt')

# Translation table that deletes every character in string.punctuation.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)

# Characters that strip_answer_string peels off both ends of an answer.
_STRIP_CHARS = '.,/><;:\'"[]{}+=-_)(*&^!~`'

# Per-word cache used by word_tokenize: raw word -> space-joined tokens.
dict_map = {}


def preprocess(x, max_length=-1, remove_puncts=False):
    """Tokenize *x* and optionally drop punctuation and truncate it.

    Args:
        x: Input text.
        max_length: When > 0, keep only the first ``max_length``
            whitespace-separated tokens; otherwise no truncation is applied.
        remove_puncts: When true, delete every character found in
            ``string.punctuation`` (applied after tokenization).

    Returns:
        The processed text as a single string.
    """
    x = nltk_tokenize(x)
    x = x.replace("\n", " ")
    if remove_puncts:
        x = x.translate(_PUNCT_TABLE)
    if max_length > 0:
        x = " ".join(x.split()[:max_length])
    return x


def nltk_tokenize(x):
    """Return *x* whitespace-normalized and tokenized, joined by spaces."""
    return " ".join(word_tokenize(strip_context(x))).strip()


def post_process_answer(x, entity_dict):
    """Normalize a predicted answer string.

    Args:
        x: The raw answer. Non-string values are returned unchanged.
        entity_dict: Mapping from a lower-cased answer to its replacement;
            consulted only for answers of at most two words that are not
            numeric and contain none of "ngày", "tháng", "năm".

    Returns:
        * ``x`` unchanged when it is not a string;
        * ``"NaN"`` when nothing is left after stripping surrounding
          punctuation;
        * the leading token (lower-cased) when the answer is longer than one
          character, starts with a numeric token and does not contain
          "tháng";
        * the lower-cased answer when it is numeric or contains one of
          "ngày", "tháng", "năm";
        * otherwise the ``entity_dict`` replacement (answers of <= 2 words,
          falling back to the answer itself) or the answer itself.
    """
    if not isinstance(x, str):
        return x
    try:
        x = strip_answer_string(x)
    except IndexError:
        # strip_answer_string signals "nothing left" with IndexError.
        return "NaN"

    # Drop remaining punctuation and collapse runs of whitespace.
    x = " ".join(x.translate(_PUNCT_TABLE).split())
    y = x.lower()

    if len(y) > 1 and y.split()[0].isnumeric() and "tháng" not in x:
        return y.split()[0]

    # The keyword check is intentionally done on the original-case text,
    # exactly as before.
    if x.isnumeric() or "ngày" in x or "tháng" in x or "năm" in x:
        return y

    if len(x.split()) <= 2:
        return entity_dict.get(x.lower(), x)
    return x


def word_tokenize(text):
    """Tokenize *text* word by word, caching the result for each word.

    The text is split on whitespace; each word is run through nltk's
    ``word_tokenize`` and its tokens are joined with single spaces, with the
    quote tokens ``\u0060\u0060`` and ``''`` replaced by ``"``. Results are
    memoized in the module-level ``dict_map``.

    Returns:
        A list with one (possibly multi-token) string per input word.
    """
    words_norm = []
    for w in text.split():
        if w not in dict_map:
            tokens = ' '.join(lib_tokenizer(w))
            dict_map[w] = tokens.replace('``', '"').replace("''", '"')
        words_norm.append(dict_map[w])
    return words_norm


def strip_answer_string(text):
    """Strip surrounding whitespace and punctuation from an answer.

    Trailing characters are removed first, then leading ones. A trailing
    ``)`` is kept when the text contains ``(`` but does not start with it,
    and a quote at one end is kept when the other end is not a quote and the
    text contains more than one ``"``.

    Raises:
        IndexError: If nothing is left after stripping (including empty or
            whitespace-only input). This matches the exception the original
            implementation raised implicitly in that situation.
    """
    text = text.strip()

    while text and text[-1] in _STRIP_CHARS:
        if text[0] != '(' and text[-1] == ')' and '(' in text:
            break
        if text[-1] == '"' and text[0] != '"' and text.count('"') > 1:
            break
        text = text[:-1].strip()

    while text and text[0] in _STRIP_CHARS:
        if text[0] == '"' and text[-1] != '"' and text.count('"') > 1:
            break
        text = text[1:].strip()

    if not text:
        raise IndexError("answer string is empty after stripping punctuation")
    return text


def strip_context(text):
    """Replace newlines with spaces, collapse whitespace runs, and trim."""
    text = text.replace('\n', ' ')
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


def check_number(x):
    """Return True if ``str(x)`` is numeric or mentions a date keyword.

    The check is case-insensitive and looks for the substrings "ngày",
    "tháng" and "năm".
    """
    x = str(x).lower()
    return x.isnumeric() or "ngày" in x or "tháng" in x or "năm" in x