# -*- coding: utf-8 -*-
"""final-maybe

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1UueiutTkBBB9Gb2Brp4mQVUvw7cjXTwo
"""

import nltk

SOS = "<s> "
EOS = "</s>"
UNK = "<UNK>"

"""Add Sentence Tokens:
    
    To identify the beginning and end of the sentence 
    add the StartOfSentence and EndOfSentence tokens.

    The argument 'sentences' takes a list of str and 'n' is the order of the model.
    The function returns the list of generated of sentences.
    
    For bigram models (or greater) both tokens are added otherwise or only one is added.
"""

def add_sentence_tokens(sentences, n):
    sos = SOS * (n-1) if n > 1 else SOS
    return ['{}{} {}'.format(sos, s, EOS) for s in sentences]
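
# Illustrative example (values are for demonstration): for a trigram model,
# add_sentence_tokens(["the cat sat"], 3) -> ['<s> <s> the cat sat </s>']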

"""Replace singletons:
    
    For the tokens appearing only ones in the corpus, replace it with <UNK>
    
    The argument 'tokens' takes input of the tokens comprised in the corpus.
    The function returns list of tokens after replacing each singleton with <UNK>
"""

def replace_singletons(tokens):
    vocab = nltk.FreqDist(tokens)
    return [token if vocab[token] > 1 else UNK for token in tokens]
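
# Illustrative example (values are for demonstration):
# replace_singletons(['the', 'cat', 'sat', 'the', 'dog'])
#   -> ['the', '<UNK>', '<UNK>', 'the', '<UNK>']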

"""Preprocess:
    
    The function takes the argument 'sentences' that takes the list of str of
    preprocess. The argument 'n' is the order of the model.
    Adds the above three tokens to the sentences and tokenize.
    The function returns preprocessed sentences.
"""

def preprocess(sentences, n):
    sentences = add_sentence_tokens(sentences, n)
    tokens = ' '.join(sentences).split(' ')
    tokens = replace_singletons(tokens)
    return tokens
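
# Illustrative example (values are for demonstration): for a bigram model,
# preprocess(["the cat sat", "the dog sat"], 2)
#   -> ['<s>', 'the', '<UNK>', 'sat', '</s>', '<s>', 'the', '<UNK>', 'sat', '</s>']
# ('cat' and 'dog' each occur only once, so they become <UNK>)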


from itertools import product
import math
from pathlib import Path

"""    This function loads training and testing corpus from a directory.
    The argument 'data_dir' contains path of the directory. The directory should contain files: 'train.txt' and 'test.txt'
    Function will return train and test sets as lists of sentences.
"""

def load_data(data_dir):
    train_path = Path(data_dir) / 'train.txt'
    test_path  = Path(data_dir) / 'test.txt'

    with open(train_path, 'r') as f:
        train = [l.strip() for l in f.readlines()]
    with open(test_path, 'r') as f:
        test = [l.strip() for l in f.readlines()]
    return train, test
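
# Illustrative usage (the directory path below is hypothetical):
# train_sentences, test_sentences = load_data('data/')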

"""Trained N-gram model:

    A trained model for the given corpus is constructed by preprocessing the
    corpus and calculating the smoothed probability of each n-gram.
    The arguments are the training data (list of strings), n (integer; order of the model),
    and an integer coefficient used for Laplace smoothing.
    The model also has a method for calculating perplexity.
"""

class LanguageModel(object):
    def __init__(self, train_data, n, laplace=1):
        self.n = n
        self.laplace = laplace
        self.tokens = preprocess(train_data, n)
        self.vocab  = nltk.FreqDist(self.tokens)
        self.model  = self._create_model()
        self.masks  = list(reversed(list(product((0,1), repeat=n))))

    def _smooth(self):
        """
        For each n-gram in the training corpus, its count and the count of its
        first n-1 tokens are combined to compute the Laplace-smoothed probability.
        The function returns the smoothed probability mapped to its n-gram.

        """
        vocab_size = len(self.vocab)

        n_grams = nltk.ngrams(self.tokens, self.n)
        n_vocab = nltk.FreqDist(n_grams)

        m_grams = nltk.ngrams(self.tokens, self.n-1)
        m_vocab = nltk.FreqDist(m_grams)

        def smoothed_count(n_gram, n_count):
            m_gram = n_gram[:-1]
            m_count = m_vocab[m_gram]
            return (n_count + self.laplace) / (m_count + self.laplace * vocab_size)

        return { n_gram: smoothed_count(n_gram, count) for n_gram, count in n_vocab.items() }

    def _create_model(self):
        """
        This function creates a probability distribution over the vocabulary of the training corpus.
        In a unigram model the probabilities are simply the relative frequencies of each token over the whole corpus.
        Otherwise, the probabilities are Laplace-smoothed.
        The function returns a dictionary which maps each n-gram (a tuple of strings) to its probability (float).

        """
        if self.n == 1:
            num_tokens = len(self.tokens)
            return { (unigram,): count / num_tokens for unigram, count in self.vocab.items() }
        else:
            return self._smooth()

    def _convert_oov(self, ngram):
        """
        This function handles words encountered in the test data but not in training
        by converting the given n-gram to one which is known by the model.
        Tokens are progressively replaced with <UNK> until an n-gram known to the model is found.
        The function returns the n-gram with <UNK> tokens in certain positions such that the model
            contains an entry for it.
        """
        mask = lambda ngram, bitmask: tuple((token if flag == 1 else UNK for token, flag in zip(ngram, bitmask)))

        ngram = (ngram,) if type(ngram) is str else ngram
        for possible_known in [mask(ngram, bitmask) for bitmask in self.masks]:
            if possible_known in self.model:
                return possible_known

    def perplexity(self, test_data):
        """
        Calculates the perplexity of the model on the given test sentences and
        returns it as a float.
        
        """
        test_tokens = preprocess(test_data, self.n)
        test_ngrams = nltk.ngrams(test_tokens, self.n)
        N = len(test_tokens)

        known_ngrams  = (self._convert_oov(ngram) for ngram in test_ngrams)
        probabilities = [self.model[ngram] for ngram in known_ngrams]

        return math.exp((-1/N) * sum(map(math.log, probabilities)))

    def _best_candidate(self, prev, i, without=[]):
        """
        Selects the most probable next token on the basis of the previous
        (n-1) tokens.
        The arguments are the previous (n-1) tokens, the index i of the candidate
        to pick at the start of a sentence, and the tokens to exclude from the
        candidate list.
        The function returns the most probable token and its probability.

        """
        blacklist  = [UNK] + without
        candidates = ((ngram[-1],prob) for ngram,prob in self.model.items() if ngram[:-1]==prev)
        candidates = filter(lambda candidate: candidate[0] not in blacklist, candidates)
        candidates = sorted(candidates, key=lambda candidate: candidate[1], reverse=True)
        if len(candidates) == 0:
            return ("</s>", 1)
        else:
            return candidates[0 if prev != () and prev[-1] != "<s>" else i]

# data_path = '/content/drive/Shareddrives/MathProject22/Dataset/data/'
# train, test = load_data(data_path)

# #if __name__ == '__main__':
# model_instance = LanguageModel(train[0:100], 3, 0)
#    # the second argument is the order n of the n-gram model
#    # the third argument is the Laplace smoothing coefficient (0 disables smoothing)

# print(model_instance.perplexity(test))

# prev=('I','love',)
# print(model_instance._best_candidate(prev,1)[0])
# # the argument 1 selects the i-th best candidate (used at the start of a sentence)

# import pickle
# filename = 'without_laplace.sav'
# pickle.dump(model_instance, open(filename, 'wb'))

# len(train)
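
# Minimal self-contained sketch: trains a bigram model on a tiny in-memory toy
# corpus (illustrative only, not the project's dataset) and reports perplexity
# on a small held-out set. All sentences below are assumptions made for
# demonstration purposes.
if __name__ == '__main__':
    toy_train = [
        "the cat sat on the mat",
        "the dog sat on the mat",
        "the cat chased the dog",
        "a strange purple creature appeared",  # adjacent singletons guarantee an <UNK> <UNK> bigram
    ]
    toy_test = [
        "the cat sat on the mat",
        "the dog chased the cat",
    ]

    toy_model = LanguageModel(toy_train, n=2, laplace=1)
    print("Vocabulary size:", len(toy_model.vocab))
    print("Perplexity on toy test set:", toy_model.perplexity(toy_test))

    # Most probable continuation of the context ('the',) under this toy model.
    token, prob = toy_model._best_candidate(('the',), 0)
    print("Best continuation of 'the':", token, prob)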