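"""Export FrameNet annotations (via the NLTK corpus readers) to JSON.

Loads the full-text annotated documents and the exemplar sentences for
FrameNet v1.5 or v1.7 and writes them as ``full.<version>.json`` and
``exe.<version>.json`` in the output directory given on the command line.
"""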
import gzip
import json
import os
import logging
from argparse import ArgumentParser
from itertools import accumulate

import nltk
import numpy as np
from tools.framenet.nltk_framenet import framenet, framenet15
from tqdm import tqdm

from tools.framenet.fn_util import framenet_split, Sentence

logger = logging.getLogger('fn')


def _load_raw(version):
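    """Ensure the requested FrameNet release is available locally and return its NLTK corpus reader."""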
    if version == '1.5':
        nltk.download('framenet_v15')
        return framenet15
    else:  # any value other than '1.5' falls back to v1.7
        nltk.download('framenet_v17')
        return framenet


def one_frame(sentence, ann):
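    """Convert one annotation set into a dict holding the frame label, the
    (inclusive) token span of the target, the lexical unit, and the frame
    elements as children."""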
    frame_info = {'label': ann.frame.name}
    # Map each character-level target span onto inclusive token indices.
    target_list = list()
    for start, end in ann.Target:
        start, end = sentence.span(start, end)
        target_list.extend(range(start, end + 1))
    assert len(target_list) > 0
    frame_info['span'] = [min(target_list), max(target_list)]
    frame_info['lu'] = ann.LU.name
    # ann.FE[0] holds the overt frame-element spans (null instantiations live in ann.FE[1]).
    frame_info['children'] = fes = list()
    for start, end, fe_name in ann.FE[0]:
        start, end = sentence.span(start, end)
        fes.append({'span': [start, end], 'label': fe_name})
    return frame_info


def load_nltk_exemplars(version, exclude_ann_ids=None):
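    """Load FrameNet exemplar sentences, one annotation per sentence,
    skipping any annotation whose ID appears in `exclude_ann_ids`."""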
    exclude_ann_ids = set(exclude_ann_ids or ())  # a set makes the per-annotation membership test O(1)
    fn = _load_raw(version)
    egs = list()
    bar = tqdm()
    skipped = 0
    try:
        for eg in fn.annotations(full_text=False):
            if 'Target' not in eg.keys():
                # Some exemplars come back without a Target; this is a known
                # quirk of the NLTK FrameNet reader, so skip them.
                continue
            if eg.ID in exclude_ann_ids:
                skipped += 1
                continue
            try:
                sentence = Sentence(eg.text)
                egs.append({
                    'tokens': list(map(str, sentence.tokens)), 'annotations': [one_frame(sentence, eg)],
                    'meta': {
                        'fully_annotated': False,
                        'source': f'framenet_v{version}',
                        'with_fe': True,
                        'type': 'exemplar',
                        'ann_ids': [eg.ID],
                    }
                })
                bar.update()
            except Exception:
                # Skip exemplars whose spans cannot be aligned to tokens.
                pass
    except Exception:
        # The NLTK exemplar iterator can raise partway through; keep what we have.
        pass
    bar.close()
    logger.info(f'Loaded {len(egs)} sentences for framenet v{version} from exemplars. (skipped {skipped} sentences)')
    return egs


def load_nltk_fully_annotated(version):
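    """Load the full-text annotated documents, bucketed into the train/dev/test
    splits defined by `framenet_split`."""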
    fn = _load_raw(version)

    splits = list(framenet_split.keys())
    all_containers = {split: [] for split in splits}
    for doc in tqdm(fn.docs()):
        # Documents not assigned to any split default to train.
        container = all_containers['train']
        for sp in splits:
            if doc.filename in framenet_split[sp]:
                container = all_containers[sp]
                break

        for sent in doc.sentence:
            sentence = Sentence(sent.text)
            all_frames = list()
            ann_ids = []
            for ann in sent.annotationSet:
                if ann._type == 'posannotationset':
                    # POS layers carry no frame annotation.
                    continue
                assert ann._type == 'fulltext_annotationset'
                if 'Target' not in ann.keys():
                    logger.warning('Target not found; skipping annotation set.')
                    continue
                if 'ID' in ann:
                    ann_ids.append(ann['ID'])
                frame_info = one_frame(sentence, ann)
                all_frames.append(frame_info)
            eg_dict = {
                'tokens': list(map(str, sentence.tokens)), 'annotations': all_frames,
                'meta': {
                    'source': f'framenet_v{version}',
                    'fully_annotated': True,
                    'with_fe': True,
                    'type': 'full text',
                    'sentence ID': sent.ID,
                    'doc': doc.filename,
                    'ann_ids': ann_ids
                }
            }
            container.append(eg_dict)

    for sp in splits:
        logger.info(f'Load {len(all_containers[sp])} for {sp}.')
    return all_containers


def load_expanded_fn(path):
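    """Load paraphrase-expanded FrameNet from a gzipped JSON-lines file.

    Note: this loader is currently disabled; the ``raise`` below makes the
    rest of the body unreachable.
    """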
    raise NotImplementedError
    with gzip.open(path, 'rb') as compressed:
        lines = compressed.read().decode()
    instances = list()
    lines = lines.split('\n')
    for line in tqdm(lines):
        if len(line) != 0:
            instances.append(json.loads(line))
    logger.info(f'{len(instances)} lines loaded.')

    dataset = list()
    for instance in tqdm(instances, desc='Processing expanded framenet...'):
        for output in instance['outputs']:
            ins_dict = dict()
            ins_dict['meta'] = {
                'source': 'expanded framenet',
                'type': 'paraphrase',
                'exemplar_id': instance['exemplar_id'],
                'annoset_id': instance['annoset_id']
            }
            words = output['output_string']
            text = ' '.join(words)
            length_offsets = [0] + list(accumulate(map(len, words)))
            start_idx, end_idx = output['output_trigger_offset']
            # Convert word indices (start, exclusive end) to character offsets:
            # length_offsets[i] is the combined length of the first i words, and
            # adding the word index counts one separating space per preceding word.
            start_idx = length_offsets[start_idx] + start_idx
            # -2 steps back over the trailing space to the last character (inclusive).
            end_idx = length_offsets[end_idx] + end_idx - 2
            sentence = Sentence(text)
            ins_dict['text'] = sentence.tokens
            ins_dict['pos'] = sentence.pos
            ins_dict['tag'] = sentence.tag
            target_start, target_end = sentence.span(start_idx, end_idx)
            ins_dict['frame'] = [{
                'name': instance['frame_name'],
                'target': list(range(target_start, target_end + 1)),
                'lu': output['output_trigger'],
                'fe': []
            }]
            ins_dict['score'] = {
                'pbr': np.exp(-output['pbr_score']),
                'aligner': output['aligner_score'],
            }
            ins_dict['with_fe'] = False
            ins_dict['fully_annotated'] = False
            dataset.append(ins_dict)
    logger.info(f'{len(dataset)} sentences loaded.')
    return dataset


if __name__ == '__main__':
    logging.basicConfig(level='INFO')
    arg_parser = ArgumentParser(description='Export FrameNet to JSON datasets.')
    arg_parser.add_argument('output', type=str, help='Output directory.')
    arg_parser.add_argument('-v', type=str, default='1.7', help="FrameNet version: '1.5' or '1.7'.")
    cmd_args = arg_parser.parse_args()
    full = load_nltk_fully_annotated(cmd_args.v)
    # Collect annotation-set IDs already covered by the full-text data so the
    # exemplar loader can skip sentences that would otherwise be duplicated.
    full_ann_ids = list()
    for split in ['train', 'dev', 'test']:
        for sent in full[split]:
            full_ann_ids.extend(sent['meta']['ann_ids'])
    exe = load_nltk_exemplars(cmd_args.v, full_ann_ids)
    os.makedirs(cmd_args.output, exist_ok=True)
    version_tag = cmd_args.v.replace('.', '')
    with open(os.path.join(cmd_args.output, f'full.{version_tag}.json'), 'w') as fp:
        json.dump(full, fp)
    with open(os.path.join(cmd_args.output, f'exe.{version_tag}.json'), 'w') as fp:
        json.dump(exe, fp)
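
# Example invocation (the file name here is hypothetical):
#   python load_fn.py data/framenet -v 1.7
# which writes data/framenet/full.17.json and data/framenet/exe.17.json.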