"""Flatten AIDA event-trigger annotations into a JSON list of labelled spans.

Usage: ``python <this script> AIDA_CORPUS_DIR AIDA2KAIROS_TSV DST_JSON``

The corpus directory is expected to hold one sub-directory per AIDA event
type, each containing ``*.json`` documents whose layout (``_views``,
``_referenced_fss``, ``sofaString``) looks like a UIMA JSON CAS dump
(NOTE(review): confirm the exact producer of these files).
"""
import json
import os
import copy
from collections import defaultdict
from argparse import ArgumentParser

try:
    from tqdm import tqdm
except ImportError:
    # The progress bar is purely cosmetic; degrade to plain iteration so the
    # conversion still works where tqdm is not installed.
    def tqdm(iterable, *args, **kwargs):
        """Fallback used when tqdm is unavailable: return *iterable* as is."""
        return iterable


# Tokens in the mapping file that are skipped instead of being mapped.
# The previous check was the substring test ``aida in 'x?'``, which matches
# exactly these strings (plus '', which str.split() never produces).
_PLACEHOLDER_AIDA_NAMES = frozenset({'x', '?', 'x?'})


def extract_sentences(raw_doc):
    """Group the tokens of a document by the sentence that contains them.

    Args:
        raw_doc: Parsed JSON document. ``raw_doc['_views']['_InitialView']``
            must provide ``'Sentence'`` and ``'Token'`` lists of dicts with
            an ``'end'`` offset and an optional ``'begin'`` offset (a missing
            ``'begin'`` is read as 0).

    Returns:
        A tuple ``(sentence_tokens, begin2sentence, end2sentence)``:

        * ``sentence_tokens`` -- one ``[(start, end), tokens, events]`` entry
          per sentence; ``tokens`` is a list of ``(start, end)`` character
          offsets and ``events`` is an empty list for the caller to fill.
        * ``begin2sentence`` -- token start offset ->
          ``(sentence_index, token_index_in_sentence)``.
        * ``end2sentence`` -- token end offset ->
          ``(sentence_index, token_index_in_sentence)``.

    Raises:
        AssertionError: If a token lies in no sentence or in more than one.
    """
    view = raw_doc['_views']['_InitialView']

    sentence_tokens = [
        [(sent.get('begin', 0), sent.get('end')), [], []]
        for sent in view['Sentence']
    ]

    begin2sentence, end2sentence = {}, {}
    for token in view['Token']:
        start, end = token.get('begin', 0), token.get('end')

        owner = None
        for sent_idx, (bound, _, _) in enumerate(sentence_tokens):
            span = range(*bound)
            # The token belongs to the sentence when both its first and its
            # last character fall inside the sentence span.
            if start in span and (end - 1) in span:
                # Raised explicitly (rather than via ``assert``) so the check
                # is not stripped when Python runs with -O.
                if owner is not None:
                    raise AssertionError(
                        f'token ({start}, {end}) lies in more than one sentence'
                    )
                owner = sent_idx
        if owner is None:
            raise AssertionError(
                f'token ({start}, {end}) lies in no sentence'
            )

        token_list = sentence_tokens[owner][1]
        position = len(token_list)
        begin2sentence[start] = (owner, position)
        end2sentence[end] = (owner, position)
        token_list.append((start, end))

    return sentence_tokens, begin2sentence, end2sentence


def read_aida2kairos(mapping_path):
    """Read the AIDA -> KAIROS event-name mapping.

    Each non-blank line has the form ``KAIROS<TAB>AIDA [AIDA ...]``. Commas
    are removed before parsing, so the AIDA names may be separated by commas
    and/or whitespace. Placeholder names (``x``, ``?``, ``x?``) are skipped.
    When an AIDA name occurs more than once a warning is printed and the
    last occurrence wins.

    Args:
        mapping_path: Path of the tab-separated mapping file.

    Returns:
        dict mapping each AIDA name to its KAIROS name.

    Raises:
        ValueError: If a non-blank line does not contain exactly one tab.
    """
    mapping = {}
    with open(mapping_path, encoding='utf-8') as mapping_file:
        for line in mapping_file:
            cleaned = line.replace('\n', '').replace(',', '')
            if not cleaned.strip():
                # Blank lines carry no mapping; unpacking them would fail.
                continue
            kairos, aida_list = cleaned.split('\t')
            for aida in aida_list.split():
                if aida in _PLACEHOLDER_AIDA_NAMES:
                    continue
                if aida in mapping:
                    print('warning:', aida, 'already in the mapping, repeated.')
                mapping[aida] = kairos
    return mapping


def read_aida(corpus_path, mapping_path):
    """Load every positive, single-sentence annotation from an AIDA corpus.

    Every entry of ``corpus_path`` is treated as an event directory: the part
    of its name before the first ``'-'`` is looked up in the mapping read
    from ``mapping_path``; unmapped entries are reported and skipped. Inside
    a mapped directory every file whose name ends with ``json`` is loaded.
    Directory listings are processed in sorted order so that the result is
    reproducible across platforms and file systems.

    An annotation (any ``_referenced_fss`` entry other than ``'1'``, which
    holds the document text) is dropped when

    * it is flagged ``negative_example``,
    * its begin/end offsets do not coincide with token boundaries, or
    * it starts and ends in different sentences.

    Args:
        corpus_path: Directory containing the event sub-directories.
        mapping_path: Path of the AIDA -> KAIROS mapping file.

    Returns:
        list of dicts ``{'tokens': [...], 'annotation': {'start_idx',
        'end_idx', 'label'}}`` where ``tokens`` is the token text of the
        containing sentence, ``start_idx``/``end_idx`` are the indices of the
        first and last annotated token in that sentence, and ``label`` is the
        mapped event name.
    """
    print('reading aida data')
    n_negative, n_span_mismatch, n_diff = 0, 0, 0
    outputs = []
    mapping = read_aida2kairos(mapping_path)

    for event_fn in tqdm(sorted(os.listdir(corpus_path))):
        event_name = event_fn.split('-')[0]
        if event_name not in mapping:
            print('warning:', event_name, 'not in the mapping.')
            continue
        event_name = mapping[event_name]
        event_dir = os.path.join(corpus_path, event_fn)

        for doc_name in sorted(os.listdir(event_dir)):
            if not doc_name.endswith('json'):
                continue
            doc_path = os.path.join(event_dir, doc_name)
            with open(doc_path, encoding='utf-8') as doc_file:
                raw_doc = json.load(doc_file)
            sentences, begin2sentence, end2sentence = extract_sentences(raw_doc)

            for fss_no, fss in raw_doc['_referenced_fss'].items():
                if fss_no == '1':
                    # Entry '1' carries the document text, not an annotation.
                    continue
                # A missing 'begin' is read as 0, exactly as sentence and
                # token offsets are read in extract_sentences; indexing with
                # fss['begin'] would raise KeyError for such an annotation.
                begin, end = fss.get('begin', 0), fss['end']
                if fss['negative_example']:
                    n_negative += 1
                    continue
                if begin not in begin2sentence or end not in end2sentence:
                    n_span_mismatch += 1
                    continue
                b_idx_sent, b_idx_token = begin2sentence[begin]
                e_idx_sent, e_idx_token = end2sentence[end]
                if b_idx_sent != e_idx_sent:
                    n_diff += 1
                    continue
                sentences[b_idx_sent][2].append([b_idx_token, e_idx_token])

            text = raw_doc['_referenced_fss']['1']['sofaString']
            for _, token_spans, events in sentences:
                tokens = [text[start:end] for start, end in token_spans]
                for start, end in events:
                    outputs.append({
                        # Each record gets its own list; the elements are
                        # immutable strings, so a shallow copy is enough.
                        'tokens': list(tokens),
                        'annotation': {
                            'start_idx': start,
                            'end_idx': end,
                            'label': event_name,
                        },
                    })

    print(f'Loaded {len(outputs)} annotations.')
    print(f'{n_negative} negative annotations are ignored.')
    print(f'{n_span_mismatch} mismatched annotations are ignored.')
    print(f'{n_diff} annotations across sentences are ignored.')
    return outputs


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('aida', type=str)
    parser.add_argument('aida2kairos', type=str)
    parser.add_argument('dst', type=str)
    args = parser.parse_args()

    aida = read_aida(args.aida, args.aida2kairos)
    # Context manager guarantees the output is flushed and closed.
    with open(args.dst, 'w', encoding='utf-8') as dst_file:
        json.dump(aida, dst_file)