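"""Convert AIDA event annotations into KAIROS-labeled token spans.

Walks a directory of per-event-type annotation folders, maps each AIDA
event type to its KAIROS counterpart via a tab-separated mapping file,
and writes the mentions to a flat JSON list. Each record looks like:

    {'tokens': [...], 'annotation': {'start_idx': i, 'end_idx': j, 'label': kairos_type}}
"""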
import json
import os
import copy
from argparse import ArgumentParser

from tqdm import tqdm


def extract_sentences(raw_doc):
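    """Collect sentence and token spans from a raw annotation document.

    The document is expected in UIMA-CAS-style JSON (with '_views' and
    '_referenced_fss' entries). Returns the per-sentence structure
    [character span, token spans, event spans] together with two dicts
    that map a token's begin/end character offset to its
    (sentence index, token index) position.
    """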
    sentence_tokens = list()
    for sent_boundary in raw_doc['_views']['_InitialView']['Sentence']:
        start, end = sent_boundary.get('begin', 0), sent_boundary.get('end')
        # Each entry: [character span, token char spans, event token spans].
        sentence_tokens.append([(start, end), list(), list()])
    begin2sentence, end2sentence = dict(), dict()
    for token in raw_doc['_views']['_InitialView']['Token']:
        start, end = token.get('begin', 0), token.get('end')
        added = False
        for sent_idx, (bound, tl, _) in enumerate(sentence_tokens):
            # A token belongs to a sentence iff its character span lies
            # entirely within the sentence boundary.
            if bound[0] <= start and end <= bound[1]:
                assert not added, 'token overlaps multiple sentences'
                begin2sentence[start] = (sent_idx, len(tl))
                end2sentence[end] = (sent_idx, len(tl))
                tl.append((start, end))
                added = True
        assert added, 'token is not covered by any sentence'
    return sentence_tokens, begin2sentence, end2sentence


def read_aida2kairos(mapping_path):
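    """Read the tab-separated AIDA-to-KAIROS event type mapping.

    Each line holds a KAIROS type, a tab, and a space-separated list of
    AIDA types; 'x' and '?' entries mark types without a counterpart.
    """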
    mapping = dict()
    with open(mapping_path) as f:
        for line in f:
            kairos, aida_list = line.rstrip('\n').replace(',', '').split('\t')
            for aida in aida_list.split():
                # 'x' and '?' mark AIDA types with no KAIROS counterpart.
                if aida in ('x', '?'):
                    continue
                if aida in mapping:
                    print('warning:', aida, 'already in the mapping; keeping the last entry.')
                mapping[aida] = kairos
    return mapping


def read_aida(corpus_path, mapping_path):
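    """Convert all AIDA annotations under corpus_path to KAIROS-labeled examples.

    Negative examples, spans that do not align with token boundaries, and
    spans that cross sentence boundaries are counted and skipped.
    """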
    print('reading aida data')
    n_negative, n_span_mismatch, n_diff = 0, 0, 0
    outputs = list()
    mapping = read_aida2kairos(mapping_path)
    for event_fn in tqdm(os.listdir(corpus_path)):
        # The AIDA event type is the directory-name prefix before the first '-'.
        event_name = event_fn.split('-')[0]
        if event_name not in mapping:
            print('warning:', event_name, 'not in the mapping.')
            continue
        event_name = mapping[event_name]

        for doc_name in os.listdir(os.path.join(corpus_path, event_fn)):
            if not doc_name.endswith('.json'):
                continue
            with open(os.path.join(corpus_path, event_fn, doc_name)) as f:
                raw_doc = json.load(f)
            sentences, begin2sentence, end2sentence = extract_sentences(raw_doc)
            for fss_no, fss in raw_doc['_referenced_fss'].items():
                # Entry '1' holds the document text itself, not an annotation.
                if fss_no == '1':
                    continue
                begin, end, is_negative = fss['begin'], fss['end'], fss['negative_example']
                if is_negative:
                    n_negative += 1
                    continue
                # Skip spans whose offsets do not align with token boundaries.
                if begin not in begin2sentence or end not in end2sentence:
                    n_span_mismatch += 1
                    continue
                (b_idx_sent, b_idx_token), (e_idx_sent, e_idx_token) = begin2sentence[begin], end2sentence[end]
                # Skip spans that cross sentence boundaries.
                if b_idx_sent != e_idx_sent:
                    n_diff += 1
                    continue
                sentences[b_idx_sent][2].append([b_idx_token, e_idx_token])

            text = raw_doc['_referenced_fss']['1']['sofaString']

            for _, token_spans, events in sentences:
                tokens = [text[start:end] for start, end in token_spans]
                for start, end in events:
                    outputs.append({
                        # Each record gets its own copy of the token list.
                        'tokens': copy.deepcopy(tokens),
                        'annotation': {
                            'start_idx': start,
                            'end_idx': end,
                            'label': event_name,
                        }
                    })

    print(f'Loaded {len(outputs)} annotations.')
    print(f'{n_negative} negative annotations are ignored.')
    print(f'{n_span_mismatch} mismatched annotations are ignored.')
    print(f'{n_diff} annotations across sentences are ignored.')

    return outputs


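# Usage (the script name is illustrative):
#   python read_aida.py <aida_corpus_dir> <aida2kairos.tsv> <output.json>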
if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('aida', type=str, help='AIDA corpus directory')
    parser.add_argument('aida2kairos', type=str, help='AIDA-to-KAIROS mapping TSV')
    parser.add_argument('dst', type=str, help='output JSON path')
    args = parser.parse_args()

    aida = read_aida(args.aida, args.aida2kairos)

    with open(args.dst, 'w') as f:
        json.dump(aida, f)