import gzip
import json
import os
import logging
from argparse import ArgumentParser
from itertools import accumulate
import nltk
import numpy as np
from tools.framenet.nltk_framenet import framenet, framenet15
from tqdm import tqdm
from tools.framenet.fn_util import framenet_split, Sentence
logger = logging.getLogger('fn')
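
# Exports FrameNet (via NLTK) to JSON: full-text annotations split into
# train/dev/test, plus exemplar sentences not already covered by the full-text data.
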
def _load_raw(version):
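    """Return the NLTK FrameNet corpus reader for the requested version, downloading the data if necessary."""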
    if version == '1.5':
        nltk.download('framenet_v15')
        return framenet15
    else:
        nltk.download('framenet_v17')
        return framenet

def one_frame(sentence, ann):
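    """Convert a single annotation set into a dict with the frame label, target token span, LU name, and FE children."""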
    frame_info = {'label': ann.frame.name}
    target_list = list()
    for start, end in ann.Target:
        # sentence.span maps the annotation's character offsets to token indices.
        start, end = sentence.span(start, end)
        target_list.extend(list(range(start, end + 1)))
    assert len(target_list) > 0
    frame_info['span'] = [min(target_list), max(target_list)]
    frame_info['lu'] = ann.LU.name
    frame_info['children'] = fes = list()
    for start, end, fe_name in ann.FE[0]:
        start, end = sentence.span(start, end)
        fes.append({'span': [start, end], 'label': fe_name})
    return frame_info

def load_nltk_exemplars(version, exclude_ann_ids=None):
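    """Load exemplar sentences from NLTK FrameNet, skipping any annotation IDs listed in exclude_ann_ids."""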
    # Use a set for fast membership checks when skipping already-covered annotations.
    exclude_ann_ids = set(exclude_ann_ids or [])
    fn = _load_raw(version)
    egs = list()
    bar = tqdm()
    skipped = 0
    try:
        for eg in fn.annotations(full_text=False):
            if 'Target' not in eg.keys():
                # Some exemplars lack a Target due to an NLTK bug; skip them.
                continue
            if eg.ID in exclude_ann_ids:
                skipped += 1
                continue
            try:
                sentence = Sentence(eg.text)
                egs.append({
                    'tokens': list(map(str, sentence.tokens)),
                    'annotations': [one_frame(sentence, eg)],
                    'meta': {
                        'fully_annotated': False,
                        'source': f'framenet_v{version}',
                        'with_fe': True,
                        'type': 'exemplar',
                        'ann_ids': [eg.ID],
                    }
                })
                bar.update()
            except Exception:
                # Skip exemplars that fail tokenization or span alignment.
                continue
    except Exception:
        # The NLTK exemplar iterator can raise partway through; keep what was read so far.
        pass
    bar.close()
    logger.info(f'Loaded {len(egs)} sentences for framenet v{version} from exemplars (skipped {skipped} sentences).')
    return egs

def load_nltk_fully_annotated(version):
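    """Load full-text annotated sentences from NLTK FrameNet, grouped into splits according to framenet_split."""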
    fn = _load_raw(version)
    splits = list(framenet_split.keys())
    all_containers = {split: [] for split in splits}
    for doc in tqdm(fn.docs()):
        # Documents not listed in framenet_split default to the train split.
        container = all_containers['train']
        for sp in splits:
            if doc.filename in framenet_split[sp]:
                container = all_containers[sp]
        for sent in doc.sentence:
            sentence = Sentence(sent.text)
            all_frames = list()
            ann_ids = []
            for ann in sent.annotationSet:
                if ann._type == 'posannotationset':
                    continue
                assert ann._type == 'fulltext_annotationset'
                if 'Target' not in ann.keys():
                    logger.warning('Target not found.')
                    continue
                if 'ID' in ann:
                    ann_ids.append(ann['ID'])
                frame_info = one_frame(sentence, ann)
                all_frames.append(frame_info)
            eg_dict = {
                'tokens': list(map(str, sentence.tokens)),
                'annotations': all_frames,
                'meta': {
                    'source': f'framenet_v{version}',
                    'fully_annotated': True,
                    'with_fe': True,
                    'type': 'full text',
                    'sentence ID': sent.ID,
                    'doc': doc.filename,
                    'ann_ids': ann_ids
                }
            }
            container.append(eg_dict)
    for sp in splits:
        logger.info(f'Loaded {len(all_containers[sp])} sentences for {sp}.')
    return all_containers

def load_expanded_fn(path):
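    """Load paraphrase-expanded FrameNet data from a gzipped JSON-lines file. Currently disabled."""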
    # NOTE: this loader is currently disabled; the code below is kept for reference.
    raise NotImplementedError
    with gzip.open(path, 'rb') as compressed:
        lines = compressed.read().decode()
    instances = list()
    lines = lines.split('\n')
    for line in tqdm(lines):
        if len(line) != 0:
            instances.append(json.loads(line))
    logger.info(f'{len(instances)} lines loaded.')
    dataset = list()
    for instance in tqdm(instances, desc='Processing expanded framenet...'):
        for output in instance['outputs']:
            ins_dict = dict()
            ins_dict['meta'] = {
                'source': 'expanded framenet',
                'type': 'paraphrase',
                'exemplar_id': instance['exemplar_id'],
                'annoset_id': instance['annoset_id']
            }
            words = output['output_string']
            text = ' '.join(words)
            # length_offsets[i] + i is the character offset of words[i] in the space-joined
            # text; the exclusive end word index maps to the index of the trigger's last character.
            length_offsets = [0] + list(accumulate(map(len, words)))
            start_idx, end_idx = output['output_trigger_offset']
            start_idx = length_offsets[start_idx] + start_idx
            end_idx = length_offsets[end_idx] + end_idx - 2
            sentence = Sentence(text)
            ins_dict['text'] = sentence.tokens
            ins_dict['pos'] = sentence.pos
            ins_dict['tag'] = sentence.tag
            token_start, token_end = sentence.span(start_idx, end_idx)
            ins_dict['frame'] = [{
                'name': instance['frame_name'],
                'target': list(range(token_start, token_end + 1)),
                'lu': output['output_trigger'],
                'fe': []
            }]
            ins_dict['score'] = {
                'pbr': np.exp(-output['pbr_score']),
                'aligner': output['aligner_score'],
            }
            ins_dict['with_fe'] = False
            ins_dict['fully_annotated'] = False
            dataset.append(ins_dict)
    logger.info(f'{len(dataset)} sentences loaded.')
    return dataset

if __name__ == '__main__':
    logging.basicConfig(level='INFO')
    arg_parser = ArgumentParser()
    arg_parser.add_argument('output', type=str)
    arg_parser.add_argument('-v', type=str, default='1.7')
    cmd_args = arg_parser.parse_args()
    full = load_nltk_fully_annotated(cmd_args.v)
    full_ann_ids = list()
    for split in ['train', 'dev', 'test']:
        for sent in full[split]:
            full_ann_ids.extend(sent['meta']['ann_ids'])
    exe = load_nltk_exemplars(cmd_args.v, full_ann_ids)
    os.makedirs(cmd_args.output, exist_ok=True)
    with open(os.path.join(cmd_args.output, 'full.' + cmd_args.v.replace('.', '') + '.json'), 'w') as fp:
        json.dump(full, fp)
    with open(os.path.join(cmd_args.output, 'exe.' + cmd_args.v.replace('.', '') + '.json'), 'w') as fp:
        json.dump(exe, fp)