foxxy-hm
/

e2eqa-wiki

Question Answering

Inference Endpoints

Model card Files Files and versions Community

e2eqa-wiki / features /text_utils.py

foxxy-hm's picture

Upload 6 files

aa8b9d8 over 1 year ago

2.21 kB

	import json
	from glob import glob
	import re
	import nltk
	nltk.download('punkt')
	from nltk import word_tokenize as lib_tokenizer
	import string


	def preprocess(x, max_length=-1, remove_puncts=False):
	    """Tokenise *x* and optionally strip punctuation and truncate it.

	    The text is passed through ``nltk_tokenize`` and any newline is turned
	    into a space.  With ``remove_puncts`` set, every character found in
	    ``string.punctuation`` is deleted.  With a positive ``max_length`` only
	    the first ``max_length`` whitespace-separated words are kept.
	    """
	    cleaned = nltk_tokenize(x).replace("\n", " ")
	    if remove_puncts:
	        # Deleting via a translation table drops exactly the characters
	        # listed in string.punctuation.
	        cleaned = cleaned.translate(str.maketrans("", "", string.punctuation))
	    if max_length > 0:
	        words = cleaned.split()
	        cleaned = " ".join(words[:max_length])
	    return cleaned


	def nltk_tokenize(x):
	    """Return *x* as a single space-separated string of tokens.

	    Whitespace is normalised with ``strip_context`` before the text is
	    handed to this module's ``word_tokenize``; the resulting pieces are
	    joined with single spaces and the ends are stripped.
	    """
	    normalised = strip_context(x)
	    pieces = word_tokenize(normalised)
	    joined = " ".join(pieces)
	    return joined.strip()


	def post_process_answer(x, entity_dict):
	    """Normalise a raw predicted answer into its final form.

	    Non-string inputs are returned unchanged.  Strings are trimmed with
	    ``strip_answer_string``, stripped of every ``string.punctuation``
	    character and whitespace-collapsed, then:

	    * if the first word is numeric and the answer does not mention
	      "tháng", only that first word is returned;
	    * numeric or date-like answers (containing "ngày", "tháng" or "năm",
	      in any letter case) are returned lower-cased;
	    * other answers of at most two words are looked up (lower-cased) in
	      ``entity_dict``, falling back to the cleaned answer itself;
	    * longer answers are returned as cleaned.

	    Args:
	        x: The predicted answer; anything that is not a ``str`` passes
	            through untouched.
	        entity_dict: Mapping from a lower-cased short answer to the value
	            that should replace it.

	    Returns:
	        The normalised answer, ``"NaN"`` when nothing is left after
	        cleaning, or ``x`` itself when it is not a string.
	    """
	    if not isinstance(x, str):
	        return x
	    try:
	        x = strip_answer_string(x)
	    except IndexError:
	        # strip_answer_string indexes the first/last character, so an empty
	        # or punctuation-only answer ends up here.
	        return "NaN"
	    x = "".join(c for c in x if c not in string.punctuation)
	    x = " ".join(x.split())
	    if not x:
	        # Only punctuation that strip_answer_string does not trim (e.g. "?")
	        # was present; treat it like the empty case above.
	        return "NaN"
	    y = x.lower()
	    # Keyword checks run on the lower-cased text so "Tháng"/"Năm" are
	    # recognised too, matching check_number.
	    first_word = y.split()[0]
	    if len(y) > 1 and first_word.isnumeric() and "tháng" not in y:
	        return first_word
	    if x.isnumeric() or "ngày" in y or "tháng" in y or "năm" in y:
	        return y
	    if len(x.split()) <= 2:
	        return entity_dict.get(y, x)
	    return x


	# Module-level cache filled by word_tokenize: raw word -> tokenised form.
	dict_map = {}


	def word_tokenize(text):
	    """Tokenise every whitespace-separated word of *text*, with caching.

	    Each word is run through ``lib_tokenizer`` once; its tokens are joined
	    with a space and any double-backtick or double-apostrophe sequence is
	    replaced by a plain double quote.  The result is stored in the
	    module-level ``dict_map`` so repeated words are not tokenised again.

	    Returns a list with one normalised string per input word.
	    """
	    normalised = []
	    for word in text.split():
	        cached = dict_map.get(word)
	        if cached is None:
	            pieces = lib_tokenizer(word)
	            cached = ' '.join(pieces).replace('``', '"').replace("''", '"')
	            dict_map[word] = cached
	        normalised.append(cached)
	    return normalised


	def strip_answer_string(text):
	    """Trim surrounding whitespace and punctuation from an answer string.

	    Trailing characters are removed first, then leading ones.  Trimming
	    stops early when the edge character looks like half of a pair whose
	    partner sits inside the text: a closing ")" with a "(" somewhere in a
	    text that does not start with "(", or a double quote on one end only
	    while the text holds more than one double quote.

	    Raises IndexError when *text* is empty or is trimmed down to nothing.
	    """
	    trimmable = '.,/><;:\'"[]{}+=-_)(*&^!~`'
	    result = text.strip()
	    # Pass 1: peel characters off the right-hand side.
	    while result[-1] in trimmable:
	        head, tail = result[0], result[-1]
	        closes_inner_paren = head != '(' and tail == ')' and '(' in result
	        if closes_inner_paren:
	            break
	        closes_inner_quote = (
	            tail == '"' and head != '"' and result.count('"') > 1
	        )
	        if closes_inner_quote:
	            break
	        result = result[:-1].strip()
	    # Pass 2: peel characters off the left-hand side.
	    while result[0] in trimmable:
	        opens_inner_quote = (
	            result[0] == '"' and result[-1] != '"' and result.count('"') > 1
	        )
	        if opens_inner_quote:
	            break
	        result = result[1:].strip()
	    return result.strip()


	def strip_context(text):
	    """Collapse every run of whitespace in *text* to one space and strip the ends."""
	    without_newlines = text.replace('\n', ' ')
	    collapsed = re.sub(r'\s+', ' ', without_newlines)
	    return collapsed.strip()


	def check_number(x):
	    """Tell whether *x* reads as a number or a date expression.

	    ``x`` is converted with ``str`` and lower-cased; the result is True when
	    that text is numeric or contains "ngày", "tháng" or "năm".
	    """
	    lowered = str(x).lower()
	    if lowered.isnumeric():
	        return True
	    return any(marker in lowered for marker in ("ngày", "tháng", "năm"))