File size: 3,960 Bytes
e8f4897 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 |
from .instance import NER_DependencyInstance
from .instance import Sentence
from .prepare_data import ROOT, END, MAX_CHAR_LENGTH
class Reader(object):
    """Sequential reader for a tab-separated, blank-line-delimited sentence file.

    Each non-blank line describes one token and is split on tabs; a run of
    consecutive non-blank lines forms one sentence. ``getNext`` reads the
    columns at index 1 (word), 2 (POS tag), 3 (NER tag), 4 (integer head),
    5 (arc tag) and, when present, 6 (auto label). Column 0 is not read
    (presumably a token index -- confirm against the data files).

    The reader can be used as a context manager so the underlying file is
    closed even when an exception is raised while reading.
    """

    def __init__(self, file_path, alphabets):
        """Open ``file_path`` for reading and keep a reference to ``alphabets``.

        Args:
            file_path: path of the text file to read.
            alphabets: mapping from alphabet name to an object exposing
                ``get_index(token)``. ``getNext`` uses the keys
                ``'char_alphabet'``, ``'word_alphabet'``, ``'pos_alphabet'``,
                ``'ner_alphabet'``, ``'arc_alphabet'`` and, for rows with more
                than six columns, ``'auto_label_alphabet'``.
        """
        self.__source_file = open(file_path, 'r')
        self.alphabets = alphabets

    def __enter__(self):
        """Return the reader itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the source file; exceptions from the block are not suppressed."""
        self.close()

    def close(self):
        """Close the underlying source file."""
        self.__source_file.close()

    def _append_symbol(self, tokens_dict, ids_dict, symbol):
        """Append one boundary ``symbol`` entry to every alphabet's sequences."""
        for alphabet_name, alphabet in self.alphabets.items():
            symbol_id = alphabet.get_index(symbol)
            if alphabet_name.startswith('char'):
                # Character-level sequences hold one list per token, so the
                # boundary symbol is wrapped in a single-element list.
                tokens_dict[alphabet_name].append([symbol])
                ids_dict[alphabet_name].append([symbol_id])
            else:
                tokens_dict[alphabet_name].append(symbol)
                ids_dict[alphabet_name].append(symbol_id)

    def getNext(self, lower_case=False, symbolic_root=False, symbolic_end=False):
        """Read the next sentence from the file.

        Args:
            lower_case: when true, the word column is lower-cased before it
                is split into characters and looked up.
            symbolic_root: when true, a ``ROOT`` entry (head 0) is placed in
                front of the sentence in every alphabet's sequence.
            symbolic_end: when true, an ``END`` entry (head 0) is appended
                after the sentence in every alphabet's sequence.

        Returns:
            A ``NER_DependencyInstance`` built from the sentence, or ``None``
            when the end of the file has been reached.

        Raises:
            IndexError: if a token line has fewer than six tab-separated
                columns.
            ValueError: if the head column is not an integer.
        """
        line = self.__source_file.readline()
        # Skip any run of blank lines in front of the sentence.
        while line and not line.strip():
            line = self.__source_file.readline()
        # readline() returns '' only at end of file.
        if not line:
            return None

        # Collect token rows up to the next blank line or end of file. The
        # first line is known to be non-blank here, so ``rows`` is never empty.
        rows = []
        while line.strip():
            rows.append(line.strip().split('\t'))
            line = self.__source_file.readline()

        heads = []
        tokens_dict = {name: [] for name in self.alphabets}
        ids_dict = {name: [] for name in self.alphabets}

        if symbolic_root:
            self._append_symbol(tokens_dict, ids_dict, ROOT)
            heads.append(0)

        for tokens in rows:
            word = tokens[1].lower() if lower_case else tokens[1]

            # Every character is looked up before truncating, in case
            # get_index() registers unseen characters in the alphabet.
            chars = list(word)
            char_ids = [self.alphabets['char_alphabet'].get_index(char)
                        for char in chars]
            tokens_dict['char_alphabet'].append(chars[:MAX_CHAR_LENGTH])
            ids_dict['char_alphabet'].append(char_ids[:MAX_CHAR_LENGTH])

            pos = tokens[2]
            ner = tokens[3]
            head = int(tokens[4])
            arc_tag = tokens[5]

            # The auto label is an optional seventh column.
            if len(tokens) > 6:
                auto_label = tokens[6]
                tokens_dict['auto_label_alphabet'].append(auto_label)
                ids_dict['auto_label_alphabet'].append(
                    self.alphabets['auto_label_alphabet'].get_index(auto_label))

            columns = (
                ('word_alphabet', word),
                ('pos_alphabet', pos),
                ('ner_alphabet', ner),
                ('arc_alphabet', arc_tag),
            )
            for alphabet_name, value in columns:
                tokens_dict[alphabet_name].append(value)
                ids_dict[alphabet_name].append(
                    self.alphabets[alphabet_name].get_index(value))
            heads.append(head)

        if symbolic_end:
            # END is appended to the existing sequences. Assigning a fresh
            # one-element list here would throw away the whole sentence for
            # every non-character alphabet.
            self._append_symbol(tokens_dict, ids_dict, END)
            heads.append(0)

        sentence = Sentence(tokens_dict['word_alphabet'], ids_dict['word_alphabet'],
                            tokens_dict['char_alphabet'], ids_dict['char_alphabet'])
        return NER_DependencyInstance(sentence, tokens_dict, ids_dict, heads)