# Author: Gosse Minnema
# Initial commit (05922fb)
import json
import os
import copy
from collections import defaultdict
from argparse import ArgumentParser
from tqdm import tqdm
def extract_sentences(raw_doc):
    """Group token character spans of a UIMA-style JSON document by sentence.

    Args:
        raw_doc: parsed document whose ``_views/_InitialView`` holds
            ``Sentence`` and ``Token`` span lists of ``{'begin': ..., 'end': ...}``
            dicts (``begin`` may be omitted and defaults to 0).

    Returns:
        A tuple ``(sentence_tokens, begin2sentence, end2sentence)`` where
        ``sentence_tokens`` is a list of ``[(sent_start, sent_end),
        token_spans, event_list]`` entries (``event_list`` is left empty here
        and filled by the caller), and ``begin2sentence`` / ``end2sentence``
        map a token's begin/end character offset to
        ``(sentence_index, token_index)``.

    Raises:
        ValueError: if a token is not contained in exactly one sentence.
    """
    view = raw_doc['_views']['_InitialView']
    # One slot per sentence: [(start, end), token_spans, events].
    sentence_tokens = [
        [(sent.get('begin', 0), sent.get('end')), list(), list()]
        for sent in view['Sentence']
    ]
    begin2sentence, end2sentence = dict(), dict()
    for token in view['Token']:
        start, end = token.get('begin', 0), token.get('end')
        added = False
        for sent_idx, (bound, token_spans, _) in enumerate(sentence_tokens):
            # Token must lie fully inside the sentence's half-open span
            # (equivalent to the range-membership test, but explicit).
            if bound[0] <= start < bound[1] and bound[0] <= end - 1 < bound[1]:
                if added:
                    # Overlapping sentences: token matched a second sentence.
                    raise ValueError(
                        f'token ({start}, {end}) matches multiple sentences')
                begin2sentence[start] = (sent_idx, len(token_spans))
                end2sentence[end] = (sent_idx, len(token_spans))
                token_spans.append((start, end))
                added = True
        if not added:
            # Validation must survive `python -O`, so raise instead of assert.
            raise ValueError(f'token ({start}, {end}) is outside all sentences')
    return sentence_tokens, begin2sentence, end2sentence
def read_aida2kairos(mapping_path):
    """Read the tab-separated AIDA-to-KAIROS event-type mapping file.

    Each line holds a KAIROS type, a tab, then space-separated AIDA types;
    commas are decoration and are dropped, and 'x' / '?' placeholders mark
    unmapped entries.

    Args:
        mapping_path: path to the mapping file.

    Returns:
        dict mapping each AIDA type name to its KAIROS type name.
    """
    mapping = dict()
    # Iterate the file lazily under a context manager; the original leaked
    # the handle and read the whole file into memory via readlines().
    with open(mapping_path) as fin:
        for line in fin:
            kairos, aida_list = line.replace('\n', '').replace(',', '').split('\t')
            for aida in aida_list.split():
                # NOTE: substring test kept from the original; besides 'x'
                # and '?' it would also skip a literal 'x?' token.
                if aida in 'x?':
                    continue
                if aida in mapping:
                    print('warning:', aida, 'already in the mapping, repeated.')
                mapping[aida] = kairos
    return mapping
def read_aida(corpus_path, mapping_path):
    """Load AIDA event annotations and convert them to KAIROS-labelled examples.

    Walks ``corpus_path/<event_dir>/<doc>.json`` documents, keeps positive
    event annotations whose character span aligns with token boundaries
    inside a single sentence, and relabels AIDA event names as KAIROS names.

    Args:
        corpus_path: directory with one sub-directory per AIDA event type
            (directory names look like ``<AIDA type>-<suffix>``).
        mapping_path: AIDA-to-KAIROS mapping file (see ``read_aida2kairos``).

    Returns:
        list of ``{'tokens': [...], 'annotation': {'start_idx': ...,
        'end_idx': ..., 'label': ...}}`` dicts, one per kept annotation;
        indices are token positions within the sentence.
    """
    print('reading aida data')
    n_negative, n_span_mismatch, n_diff = 0, 0, 0
    outputs = list()
    mapping = read_aida2kairos(mapping_path)
    for event_fn in tqdm(os.listdir(corpus_path)):
        # Directory name encodes the AIDA event type before the first '-'.
        event_name = event_fn.split('-')[0]
        if event_name not in mapping:
            print('warning:', event_name, 'not in the mapping.')
            continue
        event_name = mapping[event_name]
        for doc_name in os.listdir(os.path.join(corpus_path, event_fn)):
            if not doc_name.endswith('json'):
                continue
            # Close the JSON file deterministically; the original leaked it.
            with open(os.path.join(corpus_path, event_fn, doc_name)) as fin:
                raw_doc = json.load(fin)
            sentences, begin2sentence, end2sentence = extract_sentences(raw_doc)
            for fss_no, fss in raw_doc['_referenced_fss'].items():
                if fss_no == '1':
                    # Entry '1' holds the document text, not an annotation.
                    continue
                begin, end, is_negative = fss['begin'], fss['end'], fss['negative_example']
                if is_negative:
                    n_negative += 1
                    continue
                if begin not in begin2sentence or end not in end2sentence:
                    # Span does not line up with token boundaries.
                    n_span_mismatch += 1
                    continue
                (b_idx_sent, b_idx_token), (e_idx_sent, e_idx_token) = \
                    begin2sentence[begin], end2sentence[end]
                if b_idx_sent != e_idx_sent:
                    # Annotation crosses a sentence boundary; skip it.
                    n_diff += 1
                    continue
                sentences[b_idx_sent][2].append([b_idx_token, e_idx_token])
            text = raw_doc['_referenced_fss']['1']['sofaString']
            for _, token_spans, events in sentences:
                # Materialize the token strings once per sentence.
                token_strings = [text[start:end] for start, end in token_spans]
                # Event entries are *token indices*, not character offsets.
                for start_idx, end_idx in events:
                    outputs.append({
                        # Shallow copy suffices: strings are immutable, and
                        # each annotation must own its own token list.
                        'tokens': list(token_strings),
                        'annotation': {
                            'start_idx': start_idx,
                            'end_idx': end_idx,
                            'label': event_name,
                        }
                    })
    print(f'Loaded {len(outputs)} annotations.')
    print(f'{n_negative} negative annotations are ignored.')
    print(f'{n_span_mismatch} mismatched annotations are ignored.')
    print(f'{n_diff} annotations across sentences are ignored.')
    return outputs
if __name__ == '__main__':
    # CLI: convert an AIDA corpus to KAIROS-labelled annotations in JSON.
    parser = ArgumentParser()
    parser.add_argument('aida', type=str, help='AIDA corpus directory')
    parser.add_argument('aida2kairos', type=str, help='AIDA-to-KAIROS mapping file')
    parser.add_argument('dst', type=str, help='output JSON path')
    args = parser.parse_args()
    aida = read_aida(args.aida, args.aida2kairos)
    # Use a context manager so the output is flushed and closed; the
    # original passed an unclosed file object to json.dump.
    with open(args.dst, 'w') as fout:
        json.dump(aida, fout)