sanganaka
/

SanDP

Model card Files Files and versions

SanDP / utils /io_ /reader.py

shivrajanand's picture

Upload folder using huggingface_hub

a7b3936 verified 8 months ago

history blame contribute delete

3.96 kB

	from .instance import NER_DependencyInstance
	from .instance import Sentence
	from .prepare_data import ROOT, END, MAX_CHAR_LENGTH

	class Reader(object):
	def __init__(self, file_path, alphabets):
	self.__source_file = open(file_path, 'r')
	self.alphabets = alphabets

	def close(self):
	self.__source_file.close()

	def getNext(self, lower_case=False, symbolic_root=False, symbolic_end=False):
	line = self.__source_file.readline()
	# skip multiple blank lines.
	while len(line) > 0 and len(line.strip()) == 0:
	line = self.__source_file.readline()
	if len(line) == 0:
	return None

	lines = []
	while len(line.strip()) > 0:
	line = line.strip()
	lines.append(line.split('\t'))
	line = self.__source_file.readline()

	length = len(lines)
	if length == 0:
	return None

	heads = []
	tokens_dict = {}
	ids_dict = {}
	for alphabet_name in self.alphabets.keys():
	tokens_dict[alphabet_name] = []
	ids_dict[alphabet_name] = []
	if symbolic_root:
	for alphabet_name, alphabet in self.alphabets.items():
	if alphabet_name.startswith('char'):
	tokens_dict[alphabet_name].append([ROOT, ])
	ids_dict[alphabet_name].append([alphabet.get_index(ROOT), ])
	else:
	tokens_dict[alphabet_name].append(ROOT)
	ids_dict[alphabet_name].append(alphabet.get_index(ROOT))
	heads.append(0)

	for tokens in lines:
	chars = []
	char_ids = []
	if lower_case:
	tokens[1] = tokens[1].lower()
	for char in tokens[1]:
	chars.append(char)
	char_ids.append(self.alphabets['char_alphabet'].get_index(char))
	if len(chars) > MAX_CHAR_LENGTH:
	chars = chars[:MAX_CHAR_LENGTH]
	char_ids = char_ids[:MAX_CHAR_LENGTH]
	tokens_dict['char_alphabet'].append(chars)
	ids_dict['char_alphabet'].append(char_ids)

	word = tokens[1]
	# print(word+ ' ')
	pos = tokens[2]
	ner = tokens[3]
	head = int(tokens[4])
	arc_tag = tokens[5]
	if len(tokens) > 6:
	auto_label = tokens[6]
	tokens_dict['auto_label_alphabet'].append(auto_label)
	ids_dict['auto_label_alphabet'].append(self.alphabets['auto_label_alphabet'].get_index(auto_label))
	tokens_dict['word_alphabet'].append(word)
	ids_dict['word_alphabet'].append(self.alphabets['word_alphabet'].get_index(word))
	tokens_dict['pos_alphabet'].append(pos)
	ids_dict['pos_alphabet'].append(self.alphabets['pos_alphabet'].get_index(pos))
	tokens_dict['ner_alphabet'].append(ner)
	ids_dict['ner_alphabet'].append(self.alphabets['ner_alphabet'].get_index(ner))
	tokens_dict['arc_alphabet'].append(arc_tag)
	ids_dict['arc_alphabet'].append(self.alphabets['arc_alphabet'].get_index(arc_tag))
	heads.append(head)

	if symbolic_end:
	for alphabet_name, alphabet in self.alphabets.items():
	if alphabet_name.startswith('char'):
	tokens_dict[alphabet_name].append([END, ])
	ids_dict[alphabet_name].append([alphabet.get_index(END), ])
	else:
	tokens_dict[alphabet_name] = [END]
	ids_dict[alphabet_name] = [alphabet.get_index(END)]
	heads.append(0)

	return NER_DependencyInstance(Sentence(tokens_dict['word_alphabet'], ids_dict['word_alphabet'],
	tokens_dict['char_alphabet'], ids_dict['char_alphabet']),
	tokens_dict, ids_dict, heads)