Spaces:

nickil
/

weakly-supervised-parsing

Build error

add initial files

47c0211 over 2 years ago

No virus

1.36 kB

	from collections import defaultdict, Counter
	from nltk.corpus import stopwords


	class RuleBasedHeuristic:
	    """Rule-based helpers that operate on a single sentence and/or a corpus.

	    Attributes:
	        sentence: Value stored as given; not read by any method in this class.
	        corpus: Iterable of sentence strings, or ``None`` when no corpus is
	            available. Read by ``augment_using_most_frequent_starting_token``.
	    """

	    def __init__(self, sentence=None, corpus=None):
	        """Store the optional ``sentence`` and ``corpus`` on the instance."""
	        self.sentence = sentence
	        self.corpus = corpus

	    def add_contiguous_titlecase_words(self, row):
	        """Join words that sit at consecutive positions into phrases.

	        Args:
	            row: Iterable of tuples whose first element is an integer position
	                and whose last element is the word at that position, e.g.
	                ``[(0, "New"), (1, "York"), (4, "Paris")]``. Presumably the
	                caller has already filtered these to title-case words; this
	                method does not inspect casing itself.

	        Returns:
	            A list with one space-joined phrase per run of two or more entries
	            whose positions increase by exactly 1, in order of appearance.
	            Entries that have no adjacent neighbour produce no output.
	        """
	        matches = []
	        run = []  # words of the run of adjacent positions currently being built
	        previous_position = None
	        for entry in row:
	            position, word = entry[0], entry[-1]
	            if run and position - previous_position == 1:
	                run.append(word)
	            else:
	                # The run is broken: emit it only if it spans several words.
	                if len(run) > 1:
	                    matches.append(" ".join(run))
	                run = [word]
	            previous_position = position
	        # Flush the trailing run, which no later entry had a chance to close.
	        if len(run) > 1:
	            matches.append(" ".join(run))
	        return matches

	    def augment_using_most_frequent_starting_token(self, N=1):
	        """Return the ``N`` most frequent first tokens of the corpus sentences.

	        Sentences that are empty or contain only whitespace have no first
	        token and are skipped. A corpus of ``None`` is treated as empty.

	        Args:
	            N: Number of entries to return, passed to ``Counter.most_common``.

	        Returns:
	            A list of ``(token, count)`` tuples, most frequent first.
	        """
	        if self.corpus is None:
	            return []
	        first_tokens = Counter()
	        for sentence in self.corpus:
	            # Only the first token is needed, so stop splitting after it.
	            tokens = sentence.split(maxsplit=1)
	            if tokens:
	                first_tokens[tokens[0]] += 1
	        return first_tokens.most_common(N)

	    def get_top_tokens(self, top_most_common_ptb=None):
	        """Return NLTK's English stopwords, optionally extended from the corpus.

	        Args:
	            top_most_common_ptb: When truthy, the number of most frequent
	                sentence-initial corpus tokens to add to the stopword set.
	                When falsy (``None`` or ``0``), only the stopwords are returned.

	        Returns:
	            A set of token strings.
	        """
	        tokens = set(stopwords.words("english"))
	        if top_most_common_ptb:
	            most_common = self.augment_using_most_frequent_starting_token(
	                N=top_most_common_ptb
	            )
	            tokens.update(token for token, _ in most_common)
	        return tokens