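"""Convert AIDA event annotations into KAIROS-labeled token spans.

Walks a directory of per-event-type annotation folders, maps each AIDA
event type to its KAIROS counterpart via a tab-separated mapping file,
and writes the mentions to a flat JSON list. Each record looks like:

    {'tokens': [...], 'annotation': {'start_idx': i, 'end_idx': j, 'label': kairos_type}}
"""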
import json
import os
import copy
from argparse import ArgumentParser

from tqdm import tqdm


def extract_sentences(raw_doc):
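    """Collect sentence and token spans from a raw annotation document.

    The document is expected in UIMA-CAS-style JSON (with '_views' and
    '_referenced_fss' entries). Returns the per-sentence structure
    [character span, token spans, event spans] together with two dicts
    that map a token's begin/end character offset to its
    (sentence index, token index) position.
    """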
    sentence_tokens = list()
    for sent_boundary in raw_doc['_views']['_InitialView']['Sentence']:
        start, end = sent_boundary.get('begin', 0), sent_boundary.get('end')
        # Each entry: [character span, token char spans, event token spans].
        sentence_tokens.append([(start, end), list(), list()])
    begin2sentence, end2sentence = dict(), dict()
    for token in raw_doc['_views']['_InitialView']['Token']:
        start, end = token.get('begin', 0), token.get('end')
        added = False
        for sent_idx, (bound, tl, _) in enumerate(sentence_tokens):
            # A token belongs to a sentence iff its character span lies
            # entirely within the sentence boundary.
            if bound[0] <= start and end <= bound[1]:
                assert not added, 'token overlaps multiple sentences'
                begin2sentence[start] = (sent_idx, len(tl))
                end2sentence[end] = (sent_idx, len(tl))
                tl.append((start, end))
                added = True
        assert added, 'token is not covered by any sentence'
    return sentence_tokens, begin2sentence, end2sentence


def read_aida2kairos(mapping_path):
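    """Read the tab-separated AIDA-to-KAIROS event type mapping.

    Each line holds a KAIROS type, a tab, and a space-separated list of
    AIDA types; 'x' and '?' entries mark types without a counterpart.
    """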
    mapping = dict()
    with open(mapping_path) as f:
        for line in f:
            kairos, aida_list = line.rstrip('\n').replace(',', '').split('\t')
            for aida in aida_list.split():
                # 'x' and '?' mark AIDA types with no KAIROS counterpart.
                if aida in ('x', '?'):
                    continue
                if aida in mapping:
                    print('warning:', aida, 'already in the mapping; keeping the last entry.')
                mapping[aida] = kairos
    return mapping


def read_aida(corpus_path, mapping_path):
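    """Convert all AIDA annotations under corpus_path to KAIROS-labeled examples.

    Negative examples, spans that do not align with token boundaries, and
    spans that cross sentence boundaries are counted and skipped.
    """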
    print('reading aida data')
    n_negative, n_span_mismatch, n_diff = 0, 0, 0
    outputs = list()
    mapping = read_aida2kairos(mapping_path)
    for event_fn in tqdm(os.listdir(corpus_path)):
        # The AIDA event type is the directory-name prefix before the first '-'.
        event_name = event_fn.split('-')[0]
        if event_name not in mapping:
            print('warning:', event_name, 'not in the mapping.')
            continue
        event_name = mapping[event_name]

        for doc_name in os.listdir(os.path.join(corpus_path, event_fn)):
            if not doc_name.endswith('.json'):
                continue
            with open(os.path.join(corpus_path, event_fn, doc_name)) as f:
                raw_doc = json.load(f)
            sentences, begin2sentence, end2sentence = extract_sentences(raw_doc)
            for fss_no, fss in raw_doc['_referenced_fss'].items():
                # Entry '1' holds the document text itself, not an annotation.
                if fss_no == '1':
                    continue
                begin, end, is_negative = fss['begin'], fss['end'], fss['negative_example']
                if is_negative:
                    n_negative += 1
                    continue
                # Skip spans whose offsets do not align with token boundaries.
                if begin not in begin2sentence or end not in end2sentence:
                    n_span_mismatch += 1
                    continue
                (b_idx_sent, b_idx_token), (e_idx_sent, e_idx_token) = begin2sentence[begin], end2sentence[end]
                # Skip spans that cross sentence boundaries.
                if b_idx_sent != e_idx_sent:
                    n_diff += 1
                    continue
                sentences[b_idx_sent][2].append([b_idx_token, e_idx_token])

            text = raw_doc['_referenced_fss']['1']['sofaString']

            for _, token_spans, events in sentences:
                tokens = [text[start:end] for start, end in token_spans]
                for start, end in events:
                    outputs.append({
                        # Each record gets its own copy of the token list.
                        'tokens': copy.deepcopy(tokens),
                        'annotation': {
                            'start_idx': start,
                            'end_idx': end,
                            'label': event_name,
                        }
                    })

    print(f'Loaded {len(outputs)} annotations.')
    print(f'{n_negative} negative annotations are ignored.')
    print(f'{n_span_mismatch} mismatched annotations are ignored.')
    print(f'{n_diff} annotations across sentences are ignored.')

    return outputs


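# Usage (the script name is illustrative):
#   python read_aida.py <aida_corpus_dir> <aida2kairos.tsv> <output.json>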
if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('aida', type=str, help='AIDA corpus directory')
    parser.add_argument('aida2kairos', type=str, help='AIDA-to-KAIROS mapping TSV')
    parser.add_argument('dst', type=str, help='output JSON path')
    args = parser.parse_args()

    aida = read_aida(args.aida, args.aida2kairos)

    with open(args.dst, 'w') as f:
        json.dump(aida, f)