File size: 3,960 Bytes
e8f4897
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
from .instance import NER_DependencyInstance
from .instance import Sentence
from .prepare_data import ROOT, END, MAX_CHAR_LENGTH

class Reader(object):
    """Sequential reader for blank-line-delimited, tab-separated sentence files.

    Each non-blank line describes one token; a run of consecutive non-blank
    lines forms one sentence. The columns read from every line are:

        1: word form, 2: POS tag, 3: NER tag, 4: head index (integer),
        5: arc label, 6: optional automatic label.

    Column 0 is never read (presumably the token index -- verify against the
    data files).

    The reader can also be used as a context manager, which closes the
    underlying file on exit.
    """

    def __init__(self, file_path, alphabets):
        """Open ``file_path`` for reading and remember ``alphabets``.

        Args:
            file_path: path of the text file to read.
            alphabets: mapping from alphabet name to an alphabet object that
                provides ``get_index(token)``. ``getNext`` uses the keys
                'char_alphabet', 'word_alphabet', 'pos_alphabet',
                'ner_alphabet', 'arc_alphabet' and, when a line carries a
                seventh column, 'auto_label_alphabet'.
        """
        self.__source_file = open(file_path, 'r')
        self.alphabets = alphabets

    def __enter__(self):
        """Return the reader itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the source file; exceptions are never suppressed."""
        self.close()
        return False

    def close(self):
        """Close the underlying source file."""
        self.__source_file.close()

    def _read_sentence_rows(self):
        """Return the next sentence as a list of column lists, or None at EOF."""
        line = self.__source_file.readline()
        # Skip any number of blank lines separating sentences.
        while line and not line.strip():
            line = self.__source_file.readline()
        if not line:
            # readline() returns '' only at end of file.
            return None

        rows = []
        while line.strip():
            rows.append(line.strip().split('\t'))
            line = self.__source_file.readline()
        return rows

    def _append_symbol(self, tokens_dict, ids_dict, symbol):
        """Append a sentence-boundary ``symbol`` to every alphabet's lists.

        Alphabets whose name starts with 'char' store one list per token, so
        the symbol is wrapped in a single-element list for them.
        """
        for alphabet_name, alphabet in self.alphabets.items():
            if alphabet_name.startswith('char'):
                tokens_dict[alphabet_name].append([symbol, ])
                ids_dict[alphabet_name].append([alphabet.get_index(symbol), ])
            else:
                tokens_dict[alphabet_name].append(symbol)
                ids_dict[alphabet_name].append(alphabet.get_index(symbol))

    def getNext(self, lower_case=False, symbolic_root=False, symbolic_end=False):
        """Read the next sentence and convert it into an instance.

        Args:
            lower_case: lower-case the word form before it is indexed.
            symbolic_root: prepend the ROOT symbol (with head 0) to the sentence.
            symbolic_end: append the END symbol (with head 0) to the sentence.

        Returns:
            A ``NER_DependencyInstance`` built from the sentence, or ``None``
            once the end of the file has been reached.

        Raises:
            IndexError: if a token line has fewer than six columns.
            ValueError: if the head column is not an integer.
        """
        rows = self._read_sentence_rows()
        if rows is None:
            return None

        heads = []
        tokens_dict = {name: [] for name in self.alphabets.keys()}
        ids_dict = {name: [] for name in self.alphabets.keys()}

        if symbolic_root:
            self._append_symbol(tokens_dict, ids_dict, ROOT)
            heads.append(0)

        for columns in rows:
            word = columns[1].lower() if lower_case else columns[1]

            # Every character is indexed before truncation, as in the original
            # code; get_index may have side effects on the alphabet (not
            # visible from here), so the call order is kept.
            char_alphabet = self.alphabets['char_alphabet']
            chars = list(word)
            char_ids = [char_alphabet.get_index(char) for char in chars]
            tokens_dict['char_alphabet'].append(chars[:MAX_CHAR_LENGTH])
            ids_dict['char_alphabet'].append(char_ids[:MAX_CHAR_LENGTH])

            pos = columns[2]
            ner = columns[3]
            head = int(columns[4])
            arc_tag = columns[5]

            labelled = [
                ('word_alphabet', word),
                ('pos_alphabet', pos),
                ('ner_alphabet', ner),
                ('arc_alphabet', arc_tag),
            ]
            if len(columns) > 6:
                # The optional seventh column is recorded first.
                labelled.insert(0, ('auto_label_alphabet', columns[6]))

            for alphabet_name, value in labelled:
                tokens_dict[alphabet_name].append(value)
                ids_dict[alphabet_name].append(
                    self.alphabets[alphabet_name].get_index(value))
            heads.append(head)

        if symbolic_end:
            # END must be appended, exactly like ROOT is prepended: assigning
            # a fresh [END] list here would discard the sentence's tokens and
            # leave the lists shorter than ``heads``.
            self._append_symbol(tokens_dict, ids_dict, END)
            heads.append(0)

        return NER_DependencyInstance(
            Sentence(tokens_dict['word_alphabet'], ids_dict['word_alphabet'],
                     tokens_dict['char_alphabet'], ids_dict['char_alphabet']),
            tokens_dict, ids_dict, heads)