nickil's picture
add initial files
47c0211
raw
history blame
No virus
1.36 kB
from collections import defaultdict, Counter
from nltk.corpus import stopwords
class RuleBasedHeuristic:
    """Rule-based helpers for merging adjacent tokens and building a token set.

    Attributes:
        sentence: Value passed to the constructor; stored as-is and not read
            by any method defined in this class.
        corpus: Iterable of sentence strings used to count sentence-initial
            tokens, or ``None`` when no corpus is available.
    """

    def __init__(self, sentence=None, corpus=None):
        self.sentence = sentence
        self.corpus = corpus

    def add_contiguous_titlecase_words(self, row):
        """Merge tokens at consecutive positions into space-joined phrases.

        Args:
            row: Sequence of indexable items whose first element is an
                integer position and whose last element is the token string,
                e.g. ``[(0, "New"), (1, "York"), (5, "Paris")]``.
                NOTE(review): presumably the caller has already filtered the
                row down to title-case tokens - confirm against the caller.

        Returns:
            A list with one phrase per run of two or more tokens whose
            positions increase by exactly 1. Tokens with no adjacent
            neighbour are not included.
        """
        matches = []
        run = []  # tokens of the contiguous run currently being built
        for prev, curr in zip(row, row[1:]):
            if curr[0] - prev[0] == 1:
                if not run:
                    # A run starts with the left-hand token of its first pair.
                    run.append(prev[-1])
                run.append(curr[-1])
            elif run:
                # Gap in positions: the current run is complete.
                matches.append(" ".join(run))
                run = []
        if run:
            # Flush a run that extends to the end of the row.
            matches.append(" ".join(run))
        return matches

    def augment_using_most_frequent_starting_token(self, N=1):
        """Count the first whitespace-delimited token of every corpus sentence.

        Empty and whitespace-only sentences have no first token and are
        skipped instead of raising ``IndexError``.

        Args:
            N: Number of most frequent tokens to return; passed straight to
                ``Counter.most_common``.

        Returns:
            A list of ``(token, count)`` pairs ordered from most to least
            frequent. The list is empty when ``self.corpus`` is ``None`` or
            contains no non-blank sentence.
        """
        if self.corpus is None:
            return []
        first_tokens = Counter()
        for sentence in self.corpus:
            tokens = sentence.split()
            if tokens:
                first_tokens[tokens[0]] += 1
        return first_tokens.most_common(N)

    def get_top_tokens(self, top_most_common_ptb=None):
        """Return NLTK's English stopwords, optionally extended from the corpus.

        Args:
            top_most_common_ptb: When truthy, the number of most frequent
                sentence-initial corpus tokens to add to the stopword set.
                NOTE(review): "ptb" presumably refers to the Penn Treebank -
                confirm.

        Returns:
            A set of token strings.
        """
        out = set(stopwords.words("english"))
        if top_most_common_ptb:
            frequent = self.augment_using_most_frequent_starting_token(
                N=top_most_common_ptb
            )
            out.update(token for token, _ in frequent)
        return out