from collections import defaultdict, Counter

from nltk.corpus import stopwords  # requires the NLTK "stopwords" corpus (nltk.download("stopwords"))


class RuleBasedHeuristic:
    """Rule-based heuristics over a sentence/corpus: merging contiguous
    title-case token spans and collecting frequent sentence-initial tokens
    alongside English stopwords."""

    def __init__(self, sentence=None, corpus=None):
        self.sentence = sentence
        self.corpus = corpus

    def add_contiguous_titlecase_words(self, row):
        """Merge runs of adjacent title-case tokens into multi-word matches.

        `row` is expected to be a sequence of tuples, sorted by position,
        whose first element is a token position and last element is the
        token text; tokens whose positions differ by 1 belong to one span.
        """
        matches = []
        dd = defaultdict(list)
        count = 0
        # Group adjacent token pairs: consecutive positions share the same
        # group key, while a gap in positions starts a new group.
        for i, j in zip(row, row[1:]):
            if j[0] - i[0] == 1:
                dd[count].append(i[-1] + " " + j[-1])
            else:
                count += 1
        # Stitch each group of overlapping pairs ("A B", "B C", ...) back
        # into a single phrase ("A B C ...").
        for key, value in dd.items():
            if len(value) > 1:
                out = value[0]
                inter = ""
                for item in value[1:]:
                    inter += " " + item.split()[-1]
                matches.append(out + inter)
            else:
                matches.extend(value)
        return matches

    def augment_using_most_frequent_starting_token(self, N=1):
        """Return the N most common sentence-initial tokens in the corpus,
        as (token, count) pairs."""
        first_token = []
        for sentence in self.corpus:
            first_token.append(sentence.split()[0])
        return Counter(first_token).most_common(N)

    def get_top_tokens(self, top_most_common_ptb=None):
        """Return the set of English stopwords, optionally augmented with the
        corpus's most frequent sentence-starting token(s)."""
        out = set(stopwords.words("english"))
        if top_most_common_ptb:
            out.update(
                token
                for token, _ in self.augment_using_most_frequent_starting_token(
                    N=top_most_common_ptb
                )
            )
        return out
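

# --- Illustrative usage sketch (not part of the original module) ---
# Assumes the NLTK "stopwords" corpus has already been downloaded via
# nltk.download("stopwords"), and that `row` items are (position, token)
# pairs produced by some upstream title-case tagger; the sample values
# below are made up for demonstration.
if __name__ == "__main__":
    corpus = [
        "The cat sat on the mat.",
        "The dog barked.",
        "A bird flew by.",
    ]
    heuristic = RuleBasedHeuristic(corpus=corpus)

    # Positions 3-5 are consecutive, so the three title-case words are
    # merged into a single span; position 9 is isolated and dropped.
    row = [(3, "New"), (4, "York"), (5, "City"), (9, "Boston")]
    print(heuristic.add_contiguous_titlecase_words(row))  # ['New York City']

    # "The" opens two of the three sentences, so it is the top starting token.
    print(heuristic.augment_using_most_frequent_starting_token(N=1))  # [('The', 2)]

    # English stopwords plus the most common sentence-initial token(s).
    top = heuristic.get_top_tokens(top_most_common_ptb=1)
    print("The" in top)  # True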