| from .instance import NER_DependencyInstance |
| from .instance import Sentence |
| from .prepare_data import ROOT, END, MAX_CHAR_LENGTH |
|
|
class Reader(object):
    """Sequential reader for tab-separated sentence files.

    Sentences are runs of non-blank lines separated by blank lines. Each
    line is split on tabs and the columns are read by position: 1 = word,
    2 = POS tag, 3 = NER tag, 4 = head index (parsed as ``int``), 5 = arc
    label and, when a seventh column exists, 6 = auto label. Column 0 is
    not used by this reader.

    The reader can also be used as a context manager, which closes the
    underlying file on exit.
    """

    def __init__(self, file_path, alphabets):
        """Open ``file_path`` for reading and keep a reference to ``alphabets``.

        Args:
            file_path: Path of the file to read.
            alphabets: Mapping from alphabet name to an object exposing
                ``get_index(token)``. The keys ``'char_alphabet'``,
                ``'word_alphabet'``, ``'pos_alphabet'``, ``'ner_alphabet'``
                and ``'arc_alphabet'`` are looked up for every token;
                ``'auto_label_alphabet'`` is looked up only for lines with
                more than six columns.
        """
        # NOTE(review): the file is opened with the platform default
        # encoding; confirm whether the corpora require an explicit one.
        self.__source_file = open(file_path, 'r')
        self.alphabets = alphabets

    def __enter__(self):
        """Return the reader itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the underlying file; exceptions are not suppressed."""
        self.close()
        return False

    def close(self):
        """Close the underlying file."""
        self.__source_file.close()

    def _append_symbol(self, symbol, tokens_dict, ids_dict, heads):
        """Append one symbolic token (e.g. ROOT or END) to every alphabet column.

        Alphabets whose name starts with ``'char'`` receive a one-element
        list so they keep their list-of-characters shape; all others
        receive the bare symbol. The symbolic token gets head ``0``.
        """
        for alphabet_name, alphabet in self.alphabets.items():
            symbol_id = alphabet.get_index(symbol)
            if alphabet_name.startswith('char'):
                tokens_dict[alphabet_name].append([symbol])
                ids_dict[alphabet_name].append([symbol_id])
            else:
                tokens_dict[alphabet_name].append(symbol)
                ids_dict[alphabet_name].append(symbol_id)
        heads.append(0)

    def getNext(self, lower_case=False, symbolic_root=False, symbolic_end=False):
        """Read the next sentence from the file.

        Args:
            lower_case: If true, the word column is lower-cased before its
                characters and the word itself are looked up.
            symbolic_root: If true, a ``ROOT`` token (head ``0``) is placed
                before the sentence in every alphabet column.
            symbolic_end: If true, an ``END`` token (head ``0``) is placed
                after the sentence in every alphabet column.

        Returns:
            A ``NER_DependencyInstance`` built from the sentence, or
            ``None`` when the end of the file has been reached.

        Raises:
            IndexError: If a line has fewer than six tab-separated columns.
            ValueError: If the head column is not an integer.
            KeyError: If a required alphabet is missing from ``alphabets``.
        """
        line = self.__source_file.readline()
        # Skip blank separator lines; readline() returns '' only at EOF.
        while line and not line.strip():
            line = self.__source_file.readline()
        if not line:
            return None

        # Collect the rows of one sentence, up to the next blank line or EOF.
        lines = []
        while line.strip():
            lines.append(line.strip().split('\t'))
            line = self.__source_file.readline()
        if not lines:
            return None

        heads = []
        tokens_dict = {name: [] for name in self.alphabets}
        ids_dict = {name: [] for name in self.alphabets}

        if symbolic_root:
            self._append_symbol(ROOT, tokens_dict, ids_dict, heads)

        alphabets = self.alphabets
        char_alphabet = alphabets['char_alphabet']
        for tokens in lines:
            if lower_case:
                tokens[1] = tokens[1].lower()
            word = tokens[1]

            # Every character is looked up before truncating, exactly as
            # before, in case get_index has side effects on the alphabet.
            chars = list(word)
            char_ids = [char_alphabet.get_index(char) for char in chars]
            if len(chars) > MAX_CHAR_LENGTH:
                chars = chars[:MAX_CHAR_LENGTH]
                char_ids = char_ids[:MAX_CHAR_LENGTH]
            tokens_dict['char_alphabet'].append(chars)
            ids_dict['char_alphabet'].append(char_ids)

            pos = tokens[2]
            ner = tokens[3]
            head = int(tokens[4])
            arc_tag = tokens[5]

            # The auto label is an optional seventh column.
            if len(tokens) > 6:
                auto_label = tokens[6]
                tokens_dict['auto_label_alphabet'].append(auto_label)
                ids_dict['auto_label_alphabet'].append(
                    alphabets['auto_label_alphabet'].get_index(auto_label))

            for alphabet_name, value in (('word_alphabet', word),
                                         ('pos_alphabet', pos),
                                         ('ner_alphabet', ner),
                                         ('arc_alphabet', arc_tag)):
                tokens_dict[alphabet_name].append(value)
                ids_dict[alphabet_name].append(
                    alphabets[alphabet_name].get_index(value))
            heads.append(head)

        if symbolic_end:
            # END must be appended, not assigned: rebinding the lists here
            # would discard the sentence collected above and leave the
            # columns out of step with ``heads`` and the character lists.
            self._append_symbol(END, tokens_dict, ids_dict, heads)

        sentence = Sentence(tokens_dict['word_alphabet'], ids_dict['word_alphabet'],
                            tokens_dict['char_alphabet'], ids_dict['char_alphabet'])
        return NER_DependencyInstance(sentence, tokens_dict, ids_dict, heads)