import os.path

import numpy as np
import torch

from .alphabet import Alphabet
from .logger import get_logger
from .reader import Reader

# Special vocabulary symbols - we always put them at the beginning of the vocabulary.
PAD = "_<PAD>_"
ROOT = "_<ROOT>_"
END = "_<END>_"
_START_VOCAB = [PAD, ROOT, END]
MAX_CHAR_LENGTH = 45
NUM_CHAR_PAD = 2
UNK_ID = 0
PAD_ID_WORD = 1
PAD_ID_CHAR = 1
PAD_ID_TAG = 0
NUM_SYMBOLIC_TAGS = 3
_buckets = [10, 15, 20, 25, 30, 35, 40, 50, 60, 70, 80, 90, 100, 140]
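# Example: a 12-token sentence lands in the first bucket whose size exceeds its
# length (here, 15); instances longer than the last bucket are silently skipped
# by read_data below.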

def create_alphabets(alphabet_directory, train_paths, extra_paths=None, max_vocabulary_size=100000,
                     embedd_dict=None, min_occurence=1, lower_case=False):
    """Build word/char/POS/NER/arc alphabets from tab-separated data, or load them
    from `alphabet_directory` if it already exists.

    Rare words (count <= `min_occurence`) are dropped unless they appear in
    `embedd_dict`, in which case they are kept but marked as singletons so that
    training can randomly replace them with UNK.
    """
    # Closure over `extra_paths`, `lower_case`, and `embedd_dict` from the enclosing scope.
    def expand_vocab(vocab_list, char_alphabet, pos_alphabet, ner_alphabet, arc_alphabet):
vocab_set = set(vocab_list)
for data_path in extra_paths:
with open(data_path, 'r') as file:
for line in file:
line = line.strip()
if len(line) == 0:
continue
tokens = line.split('\t')
if lower_case:
tokens[1] = tokens[1].lower()
for char in tokens[1]:
char_alphabet.add(char)
word = tokens[1]
pos = tokens[2]
ner = tokens[3]
arc_tag = tokens[5]
pos_alphabet.add(pos)
ner_alphabet.add(ner)
arc_alphabet.add(arc_tag)
if embedd_dict is not None:
if word not in vocab_set and (word in embedd_dict or word.lower() in embedd_dict):
vocab_set.add(word)
vocab_list.append(word)
else:
if word not in vocab_set:
vocab_set.add(word)
vocab_list.append(word)
return vocab_list, char_alphabet, pos_alphabet, ner_alphabet, arc_alphabet
logger = get_logger("Create Alphabets")
    # Note: `defualt_value` (sic) is the keyword spelling the Alphabet class defines.
    word_alphabet = Alphabet('word', defualt_value=True, singleton=True)
char_alphabet = Alphabet('character', defualt_value=True)
pos_alphabet = Alphabet('pos', defualt_value=True)
ner_alphabet = Alphabet('ner', defualt_value=True)
arc_alphabet = Alphabet('arc', defualt_value=True)
auto_label_alphabet = Alphabet('auto_labeler', defualt_value=True)
if not os.path.isdir(alphabet_directory):
logger.info("Creating Alphabets: %s" % alphabet_directory)
        # Register the symbolic tokens (PAD/ROOT/END) with every non-word alphabet.
        for symbol in _START_VOCAB:
            for alphabet in (char_alphabet, pos_alphabet, ner_alphabet, arc_alphabet, auto_label_alphabet):
                alphabet.add(symbol)
vocab = dict()
if isinstance(train_paths, str):
train_paths = [train_paths]
for train_path in train_paths:
with open(train_path, 'r') as file:
for line in file:
line = line.strip()
if len(line) == 0:
continue
                    tokens = line.split('\t')
                    # Column layout: tokens[1]=word, tokens[2]=POS, tokens[3]=NER, tokens[5]=arc tag.
                    if lower_case:
                        tokens[1] = tokens[1].lower()
                    for char in tokens[1]:
                        char_alphabet.add(char)
                    word = tokens[1]
pos = tokens[2]
ner = tokens[3]
arc_tag = tokens[5]
pos_alphabet.add(pos)
ner_alphabet.add(ner)
arc_alphabet.add(arc_tag)
if word in vocab:
vocab[word] += 1
else:
vocab[word] = 1
        # Collect rare words (count <= min_occurence); they may be replaced by UNK during training.
        singletons = {word for word, count in vocab.items() if count <= min_occurence}
        # If a rare word appears in the pretrained embedding dict, bump its count
        # above the threshold so it survives the frequency filter below.
if embedd_dict is not None:
for word in vocab.keys():
if word in embedd_dict or word.lower() in embedd_dict:
vocab[word] += min_occurence
vocab_list = sorted(vocab, key=vocab.get, reverse=True)
vocab_list = [word for word in vocab_list if vocab[word] > min_occurence]
vocab_list = _START_VOCAB + vocab_list
if extra_paths is not None:
vocab_list, char_alphabet, pos_alphabet, ner_alphabet, arc_alphabet = \
expand_vocab(vocab_list, char_alphabet, pos_alphabet, ner_alphabet, arc_alphabet)
if len(vocab_list) > max_vocabulary_size:
vocab_list = vocab_list[:max_vocabulary_size]
for word in vocab_list:
word_alphabet.add(word)
if word in singletons:
word_alphabet.add_singleton(word_alphabet.get_index(word))
word_alphabet.save(alphabet_directory)
char_alphabet.save(alphabet_directory)
pos_alphabet.save(alphabet_directory)
ner_alphabet.save(alphabet_directory)
arc_alphabet.save(alphabet_directory)
auto_label_alphabet.save(alphabet_directory)
else:
        logger.info('Loading saved alphabets from %s', alphabet_directory)
word_alphabet.load(alphabet_directory)
char_alphabet.load(alphabet_directory)
pos_alphabet.load(alphabet_directory)
ner_alphabet.load(alphabet_directory)
arc_alphabet.load(alphabet_directory)
auto_label_alphabet.load(alphabet_directory)
word_alphabet.close()
char_alphabet.close()
pos_alphabet.close()
ner_alphabet.close()
arc_alphabet.close()
auto_label_alphabet.close()
alphabet_dict = {'word_alphabet': word_alphabet, 'char_alphabet': char_alphabet, 'pos_alphabet': pos_alphabet,
'ner_alphabet': ner_alphabet, 'arc_alphabet': arc_alphabet, 'auto_label_alphabet': auto_label_alphabet}
return alphabet_dict
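
# Illustrative usage sketch (not part of the module's API; the paths below are
# placeholders, and `load_embedding_dict` stands in for whatever embedding
# loader the surrounding project provides):
#
#     alphabets = create_alphabets('data/alphabets/', ['data/train.tsv'],
#                                  extra_paths=['data/dev.tsv', 'data/test.tsv'],
#                                  embedd_dict=load_embedding_dict('glove.txt'),
#                                  min_occurence=1, lower_case=True)
#     word_alphabet = alphabets['word_alphabet']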

def create_alphabets_for_sequence_tagger(alphabet_directory, parser_alphabet_directory, paths):
    """Load the parser's alphabets and extend them with an auto-labeler alphabet built from `paths`."""
    logger = get_logger("Create Alphabets")
    logger.info('Loading saved alphabets from %s', parser_alphabet_directory)
word_alphabet = Alphabet('word', defualt_value=True, singleton=True)
char_alphabet = Alphabet('character', defualt_value=True)
pos_alphabet = Alphabet('pos', defualt_value=True)
ner_alphabet = Alphabet('ner', defualt_value=True)
arc_alphabet = Alphabet('arc', defualt_value=True)
auto_label_alphabet = Alphabet('auto_labeler', defualt_value=True)
word_alphabet.load(parser_alphabet_directory)
char_alphabet.load(parser_alphabet_directory)
pos_alphabet.load(parser_alphabet_directory)
ner_alphabet.load(parser_alphabet_directory)
arc_alphabet.load(parser_alphabet_directory)
    try:
        auto_label_alphabet.load(alphabet_directory)
    except Exception:
        # No saved auto-labeler alphabet yet; build it from the labeled data.
        logger.info('Creating auto labeler alphabet')
auto_label_alphabet.add(PAD)
auto_label_alphabet.add(ROOT)
auto_label_alphabet.add(END)
for path in paths:
with open(path, 'r') as file:
for line in file:
line = line.strip()
if len(line) == 0:
continue
tokens = line.split('\t')
if len(tokens) > 6:
auto_label = tokens[6]
auto_label_alphabet.add(auto_label)
word_alphabet.save(alphabet_directory)
char_alphabet.save(alphabet_directory)
pos_alphabet.save(alphabet_directory)
ner_alphabet.save(alphabet_directory)
arc_alphabet.save(alphabet_directory)
auto_label_alphabet.save(alphabet_directory)
word_alphabet.close()
char_alphabet.close()
pos_alphabet.close()
ner_alphabet.close()
arc_alphabet.close()
auto_label_alphabet.close()
alphabet_dict = {'word_alphabet': word_alphabet, 'char_alphabet': char_alphabet, 'pos_alphabet': pos_alphabet,
'ner_alphabet': ner_alphabet, 'arc_alphabet': arc_alphabet, 'auto_label_alphabet': auto_label_alphabet}
return alphabet_dict
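
# Illustrative usage sketch (placeholder paths): reuse the parser's alphabets
# and grow only the auto-labeler alphabet from newly labeled files:
#
#     tagger_alphabets = create_alphabets_for_sequence_tagger(
#         'data/tagger_alphabets/', 'data/alphabets/', ['data/auto_labeled.tsv'])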

def read_data(source_path, alphabets, max_size=None,
              lower_case=False, symbolic_root=False, symbolic_end=False):
    """Read instances from `source_path` and group them into length buckets.

    Returns (data, max_char_length): per-bucket lists of encoded instances and
    the longest character sequence observed in each bucket.
    """
data = [[] for _ in _buckets]
max_char_length = [0 for _ in _buckets]
    counter = 0
    if not isinstance(source_path, list):
        source_path = [source_path]
    print('Reading data from %s' % ', '.join(source_path))
for path in source_path:
reader = Reader(path, alphabets)
inst = reader.getNext(lower_case=lower_case, symbolic_root=symbolic_root, symbolic_end=symbolic_end)
while inst is not None and (not max_size or counter < max_size):
counter += 1
inst_size = inst.length()
sent = inst.sentence
            # Place the instance in the first bucket large enough to hold it.
            for bucket_id, bucket_size in enumerate(_buckets):
                if inst_size < bucket_size:
data[bucket_id].append([sent.word_ids, sent.char_id_seqs, inst.ids['pos_alphabet'], inst.ids['ner_alphabet'],
inst.heads, inst.ids['arc_alphabet'], inst.ids['auto_label_alphabet']])
                    max_len = max(len(char_seq) for char_seq in sent.char_seqs)
if max_char_length[bucket_id] < max_len:
max_char_length[bucket_id] = max_len
break
inst = reader.getNext(lower_case=lower_case, symbolic_root=symbolic_root, symbolic_end=symbolic_end)
reader.close()
print("Total number of data: %d" % counter)
return data, max_char_length

def read_data_to_variable(source_path, alphabets, device, max_size=None,
                          lower_case=False, symbolic_root=False, symbolic_end=False):
    """Read bucketed data and pack each bucket into padded tensors on `device`."""
data, max_char_length = read_data(source_path, alphabets,
max_size=max_size, lower_case=lower_case,
symbolic_root=symbolic_root, symbolic_end=symbolic_end)
bucket_sizes = [len(data[b]) for b in range(len(_buckets))]
data_variable = []
for bucket_id in range(len(_buckets)):
bucket_size = bucket_sizes[bucket_id]
        if bucket_size <= 0:
            # Placeholder for an empty bucket; the iteration code skips these.
            data_variable.append((1, 1))
            continue
bucket_length = _buckets[bucket_id]
char_length = min(MAX_CHAR_LENGTH, max_char_length[bucket_id] + NUM_CHAR_PAD)
        # Per-bucket padded id arrays: w=word, c=char, p=POS, n=NER, h=head, a=arc, m=auto label.
        wid_inputs = np.empty([bucket_size, bucket_length], dtype=np.int64)
cid_inputs = np.empty([bucket_size, bucket_length, char_length], dtype=np.int64)
pid_inputs = np.empty([bucket_size, bucket_length], dtype=np.int64)
nid_inputs = np.empty([bucket_size, bucket_length], dtype=np.int64)
hid_inputs = np.empty([bucket_size, bucket_length], dtype=np.int64)
aid_inputs = np.empty([bucket_size, bucket_length], dtype=np.int64)
mid_inputs = np.empty([bucket_size, bucket_length], dtype=np.int64)
masks = np.zeros([bucket_size, bucket_length], dtype=np.float32)
single = np.zeros([bucket_size, bucket_length], dtype=np.int64)
lengths = np.empty(bucket_size, dtype=np.int64)
for i, inst in enumerate(data[bucket_id]):
wids, cid_seqs, pids, nids, hids, aids, mids = inst
inst_size = len(wids)
lengths[i] = inst_size
# word ids
wid_inputs[i, :inst_size] = wids
wid_inputs[i, inst_size:] = PAD_ID_WORD
for c, cids in enumerate(cid_seqs):
cid_inputs[i, c, :len(cids)] = cids
cid_inputs[i, c, len(cids):] = PAD_ID_CHAR
cid_inputs[i, inst_size:, :] = PAD_ID_CHAR
# pos ids
pid_inputs[i, :inst_size] = pids
pid_inputs[i, inst_size:] = PAD_ID_TAG
# ner ids
nid_inputs[i, :inst_size] = nids
nid_inputs[i, inst_size:] = PAD_ID_TAG
# arc ids
aid_inputs[i, :inst_size] = aids
aid_inputs[i, inst_size:] = PAD_ID_TAG
# auto_label ids
mid_inputs[i, :inst_size] = mids
mid_inputs[i, inst_size:] = PAD_ID_TAG
# heads
hid_inputs[i, :inst_size] = hids
hid_inputs[i, inst_size:] = PAD_ID_TAG
# masks
masks[i, :inst_size] = 1.0
for j, wid in enumerate(wids):
if alphabets['word_alphabet'].is_singleton(wid):
single[i, j] = 1
        # Wrap the numpy arrays as tensors and move everything to the target device.
        words = torch.LongTensor(wid_inputs).to(device)
        chars = torch.LongTensor(cid_inputs).to(device)
        pos = torch.LongTensor(pid_inputs).to(device)
        ner = torch.LongTensor(nid_inputs).to(device)
        heads = torch.LongTensor(hid_inputs).to(device)
        arc = torch.LongTensor(aid_inputs).to(device)
        auto_label = torch.LongTensor(mid_inputs).to(device)
        masks = torch.FloatTensor(masks).to(device)
        single = torch.LongTensor(single).to(device)
        lengths = torch.LongTensor(lengths).to(device)
data_variable.append((words, chars, pos, ner, heads, arc, auto_label, masks, single, lengths))
return data_variable, bucket_sizes
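
# Illustrative usage sketch (placeholder paths; assumes the `alphabets` dict
# returned by create_alphabets above):
#
#     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#     data_train = read_data_to_variable('data/train.tsv', alphabets, device,
#                                        symbolic_root=True)
#     for batch in iterate_batch(data_train, batch_size=32, device=device,
#                                unk_replace=0.5, shuffle=True):
#         words, chars, pos, ner, heads, arc, auto_label, masks, lengths = batch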

def iterate_batch(data, batch_size, device, unk_replace=0.0, shuffle=False):
    """Yield mini-batches bucket by bucket, optionally shuffling and replacing singletons with UNK."""
data_variable, bucket_sizes = data
bucket_indices = np.arange(len(_buckets))
    if shuffle:
        np.random.shuffle(bucket_indices)
for bucket_id in bucket_indices:
bucket_size = bucket_sizes[bucket_id]
bucket_length = _buckets[bucket_id]
if bucket_size <= 0:
continue
words, chars, pos, ner, heads, arc, auto_label, masks, single, lengths = data_variable[bucket_id]
        if unk_replace:
            # Randomly replace singleton words with UNK (id 0) at rate `unk_replace`.
            ones = torch.ones_like(words)
            noise = torch.empty_like(masks).bernoulli_(unk_replace).long()
            words = words * (ones - single * noise)
        indices = None
        if shuffle:
            indices = torch.randperm(bucket_size).to(device)
for start_idx in range(0, bucket_size, batch_size):
if shuffle:
excerpt = indices[start_idx:start_idx + batch_size]
else:
excerpt = slice(start_idx, start_idx + batch_size)
yield words[excerpt], chars[excerpt], pos[excerpt], ner[excerpt], heads[excerpt], arc[excerpt], auto_label[excerpt], \
masks[excerpt], lengths[excerpt]

def iterate_batch_rand_bucket_choosing(data, batch_size, device, unk_replace=0.0):
    """Yield mini-batches by repeatedly sampling a random non-empty bucket without replacement."""
data_variable, bucket_sizes = data
indices_left = [set(np.arange(bucket_size)) for bucket_size in bucket_sizes]
while sum(bucket_sizes) > 0:
non_empty_buckets = [i for i, bucket_size in enumerate(bucket_sizes) if bucket_size > 0]
bucket_id = np.random.choice(non_empty_buckets)
bucket_size = bucket_sizes[bucket_id]
bucket_length = _buckets[bucket_id]
words, chars, pos, ner, heads, arc, auto_label, masks, single, lengths = data_variable[bucket_id]
min_batch_size = min(bucket_size, batch_size)
indices = torch.LongTensor(np.random.choice(list(indices_left[bucket_id]), min_batch_size, replace=False))
set_indices = set(indices.numpy())
indices_left[bucket_id] = indices_left[bucket_id].difference(set_indices)
indices = indices.to(device)
words = words[indices]
        if unk_replace:
            # Randomly replace singleton words with UNK (id 0) at rate `unk_replace`.
            ones = torch.ones_like(words)
            noise = torch.empty_like(words, dtype=torch.float32).bernoulli_(unk_replace).long()
            words = words * (ones - single[indices] * noise)
bucket_sizes = [len(s) for s in indices_left]
yield words, chars[indices], pos[indices], ner[indices], heads[indices], arc[indices], auto_label[indices], masks[indices], lengths[indices]

def calc_num_batches(data, batch_size):
    """Return the total number of mini-batches that iteration yields over all buckets."""
    _, bucket_sizes = data
    # Ceiling division: a bucket of n instances yields ceil(n / batch_size) batches.
    # (The previous `int(n / batch_size) + 1` overcounted buckets that divide evenly.)
    batches_per_bucket = [(bucket_size + batch_size - 1) // batch_size for bucket_size in bucket_sizes]
    return sum(batches_per_bucket)