import sys
from collections import Counter
import pickle

import numpy

unk_string = '<UNK>'
pad_string = '<PAD>'

def read_tagged_sentences(path, max_sent_len):
    """
    Read a tagged dataset.
    Each line consists of a token and a tag separated by a tab character;
    an empty line marks the end of a sentence.
    """
    sentences, words, tags = [], [], []
    with open(path) as file:
        for line in file:
            line = line.rstrip()
            if line:
                word, tag, *_ = line.split("\t")
                words.append(word)
                tags.append(tag)
            else:
                # empty line marking the end of a sentence
                if 0 < len(words) < max_sent_len:
                    sentences.append((words, tags))
                words, tags = [], []
    # flush the last sentence if the file does not end with an empty line
    if 0 < len(words) < max_sent_len:
        sentences.append((words, tags))
    return sentences
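
# Example (hypothetical file contents): a file containing
# "The\tDET\ndog\tNOUN\n\n" is read as
#   [(['The', 'dog'], ['DET', 'NOUN'])]
# provided that max_sent_len is larger than 2.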

def read_word_embeddings(filename):
    """ Read whitespace-separated word embeddings from a file. """
    word_embeddings = []
    if filename is not None:
        print("reading word embeddings ...", file=sys.stderr)
        with open(filename) as file:
            for line in file:
                word, *vec = line.rstrip().split(' ')
                if word != unk_string:  # skip a pretrained vector for the unknown-word symbol
                    word_embeddings.append((word, numpy.array(vec, dtype=numpy.float32)))
        print("done", file=sys.stderr)
    word_emb_size = len(word_embeddings[0][1]) if word_embeddings else 0
    return word_embeddings, word_emb_size
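
# Example (hypothetical file contents): a line "dog 0.1 0.2 0.3" yields the
# pair ('dog', array([0.1, 0.2, 0.3], dtype=float32)) and word_emb_size 3;
# a vector listed under '<UNK>' would be skipped.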

def make_dict(counter, min_freq=0, add_pad_symbol=False):
    """
    Create a dictionary which maps strings with some minimal frequency to IDs.
    We don't use pack_padded_sequence, so it is OK to assign ID 1 to the
    padding symbol.
    """
    symlist = [unk_string] + ([pad_string] if add_pad_symbol else []) + \
              [elem for elem, freq in counter.most_common() if freq >= min_freq]
    string2ID = {elem: i for i, elem in enumerate(symlist)}
    return string2ID, symlist
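
# Worked example:
#   make_dict(Counter({'a': 3, 'b': 1}), min_freq=2, add_pad_symbol=True)
# returns ({'<UNK>': 0, '<PAD>': 1, 'a': 2}, ['<UNK>', '<PAD>', 'a']);
# 'b' falls below the frequency threshold and is dropped.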

class Data(object):
    """
    Class for reading a tagged training and development corpus or a test corpus.
    """

    IGNORE_INDEX = -100

    def __init__(self, *args):
        # a single argument is the path of a parameter file written by
        # save_parameters(); otherwise the arguments are passed to init_train()
        if len(args) == 1:
            self.init_test(*args)
        else:
            self.init_train(*args)

    ### functions needed during training ###############################################

    def init_train(self, path_train, path_dev, word_trunc_len,
                   min_char_freq, max_sent_len, word_embeddings, ignore_tag):
        self.word_trunc_len = word_trunc_len  # length to which words are truncated or padded

        # read the datasets
        self.train_sentences = read_tagged_sentences(path_train, max_sent_len)
        self.dev_sentences = read_tagged_sentences(path_dev, max_sent_len)

        ### create dictionaries which map characters or tags to IDs
        char_counter = Counter()
        tag_counter = Counter()
        for words, tags in self.train_sentences:
            tag_counter.update(tags)
            for word in words:
                char_counter.update(word)
        self.char2ID, _ = make_dict(char_counter, min_char_freq, add_pad_symbol=True)
        if ignore_tag is not None:
            tag_counter.pop(ignore_tag, None)  # remove this special tag if present
            self.tag2ID, self.ID2tag = make_dict(tag_counter)
            self.tag2ID[ignore_tag] = self.IGNORE_INDEX  # tokens with this tag are not trained on
        else:
            self.tag2ID, self.ID2tag = make_dict(tag_counter)

        ### sizes of the symbol inventories
        self.num_char_types = len(self.char2ID)
        self.num_tag_types = len(self.ID2tag)

        self.word_embeddings, self.word_emb_size = read_word_embeddings(word_embeddings)
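
    # A hypothetical training call (all paths and hyperparameter values made up):
    #   data = Data("train.tsv", "dev.tsv", 20, 1, 100, None, None)
    # truncates words to 20 characters, keeps characters occurring at least
    # once, skips sentences with 100 or more tokens, loads no pretrained
    # embeddings and reserves no ignore tag.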

    def get_charIDs(self, word):
        """
        Map a word to a pair of forward and backward character ID sequences.
        """
        unkID = self.char2ID[unk_string]
        padID = self.char2ID[pad_string]
        charIDs = [self.char2ID.get(c, unkID) for c in word]
        # add enough padding symbols
        fwd_charIDs = [padID] * self.word_trunc_len + charIDs
        bwd_charIDs = [padID] * self.word_trunc_len + charIDs[::-1]
        # truncate to the last word_trunc_len positions
        fwd_charIDs = fwd_charIDs[-self.word_trunc_len:]
        bwd_charIDs = bwd_charIDs[-self.word_trunc_len:]
        return fwd_charIDs, bwd_charIDs
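
    # Worked example (hypothetical IDs): with word_trunc_len = 5 and a word
    # whose character IDs are [7, 8, 9], the method returns
    #   fwd_charIDs = [padID, padID, 7, 8, 9]
    #   bwd_charIDs = [padID, padID, 9, 8, 7]
    # A longer word keeps its last word_trunc_len characters in the forward
    # sequence and its first word_trunc_len characters (reversed) in the
    # backward one.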

    def words2charIDvec(self, words):
        """
        converts words to char-ID vectors
        """
        ### convert words to character ID sequences
        fwd_charID_seqs = []
        bwd_charID_seqs = []
        for word in words:
            fwd_charIDs, bwd_charIDs = self.get_charIDs(word)
            fwd_charID_seqs.append(fwd_charIDs)
            bwd_charID_seqs.append(bwd_charIDs)
        fwd_charID_seqs = numpy.asarray(fwd_charID_seqs, dtype='int32')
        bwd_charID_seqs = numpy.asarray(bwd_charID_seqs, dtype='int32')
        return fwd_charID_seqs, bwd_charID_seqs
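
    # Both returned arrays have shape (number of words, word_trunc_len), i.e.
    # fixed-width character-ID matrices that are easy to batch.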

    def tags2IDs(self, tags):
        """
        takes a list of tags and converts them to IDs using the tag2ID dictionary
        """
        unkID = self.tag2ID[unk_string]
        IDs = [self.tag2ID.get(tag, unkID) for tag in tags]
        return numpy.asarray(IDs, dtype='int32')

    def save_parameters(self, filename):
        """ save parameters to a file """
        all_params = (self.word_trunc_len, self.char2ID, self.ID2tag)
        with open(filename, "wb") as file:
            pickle.dump(all_params, file)

    ### functions needed during tagging ###############################################

    def init_test(self, filename):
        """ load parameters from a file """
        with open(filename, "rb") as file:
            self.word_trunc_len, self.char2ID, self.ID2tag = pickle.load(file)
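
    # Loading mirrors save_parameters() above. With a hypothetical file name,
    #   data = Data("tagger.params")
    # restores word_trunc_len, char2ID and ID2tag, which is all that is needed
    # at tagging time.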

    def sentences(self, filename):
        """ read data to be tagged: one token per line, an empty line after each sentence """
        with open(filename) as f:
            words = []
            for line in f:
                line = line.rstrip()
                if line != '':
                    words.append(line)
                elif len(words) > 0:
                    # empty line indicates the end of a sentence
                    yield words
                    words = []
            if words:
                # flush the last sentence if the file does not end with an empty line
                yield words
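
    # Example (hypothetical file contents): a file containing
    # "The\ndog\nbarks\n\nIt\nsleeps\n" yields ['The', 'dog', 'barks'] and
    # then ['It', 'sleeps'].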

    def single_sentences(self, sentence):
        """ wrap a single pre-tokenized sentence so it can be iterated like sentences() """
        yield sentence

    def IDs2tags(self, IDs):
        """ takes a list of IDs and converts them to tags using the ID2tag dictionary """
        return [self.ID2tag[int(ID)] for ID in IDs]
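
    # Round trip (assuming a Data instance `data` built for training):
    #   data.IDs2tags(data.tags2IDs(['DET', 'NOUN']))
    # returns ['DET', 'NOUN'] for tags seen in training; unseen tags map to
    # '<UNK>'.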