Shruhrid Banthia committed
Commit
0d083c9
1 Parent(s): 6164485

model.sav and final_maybe used

Files changed (4)
  1. .gitattributes +2 -0
  2. app.py +32 -0
  3. final_maybe.py +196 -0
  4. model.sav +3 -0
.gitattributes CHANGED
@@ -26,3 +26,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zstandard filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.sav filter=lfs diff=lfs merge=lfs -text
+ model.sav filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,32 @@
+ import gradio as gr
+ import gradio.inputs
+ import pandas as pd
+
+ import os  # interacting with input and output directories
+
+ import pickle
+ from final_maybe import LanguageModel  # needed so pickle can resolve the class
+ with open('model.sav', 'rb') as handle:
+     loaded_model = pickle.load(handle)
+ def fn(X_test):
+     # Split the input on spaces into the (n-1)-token context tuple.
+     X_final = tuple(map(str, X_test.split(' ')))
+     model = loaded_model
+     # _best_candidate returns the (token, probability) pair for the next word.
+     result = model._best_candidate(X_final, 0)
+     return result
+
+ description = "Give two words as input and our model will predict the next word"
+ here = gr.Interface(fn=fn,
+                     inputs=gradio.inputs.Textbox(lines=1, placeholder=None, default="", label=None),
+                     outputs='text',
+                     title="Next Word Prediction",
+                     description=description,
+                     theme="default",
+                     allow_flagging="auto",
+                     flagging_dir='flagging records')
+ # here.launch(inline=False, share=True)
+ if __name__ == "__main__":
+     app, local_url, share_url = here.launch(share=True)
+
+
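Note that the app returns whatever `_best_candidate` yields, i.e. a (token, probability) tuple rendered as text. A minimal sketch of exercising `fn` outside Gradio, assuming `model.sav` has been resolved from LFS and sits next to `app.py` (the two-word input and printed output are illustrative):

    from app import fn  # unpickles model.sav at import time

    # The pickled model is a trigram model, so the context is two words.
    print(fn("I love"))  # e.g. ('you', 0.0123) -- (token, probability)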
final_maybe.py ADDED
@@ -0,0 +1,196 @@
+ # -*- coding: utf-8 -*-
+ """final-maybe
+
+ Automatically generated by Colaboratory.
+
+ Original file is located at
+ https://colab.research.google.com/drive/1UueiutTkBBB9Gb2Brp4mQVUvw7cjXTwo
+ """
+
+ import nltk
+
+ SOS = "<s> "
+ EOS = "</s>"
+ UNK = "<UNK>"
+
+ """Add sentence tokens:
+
+ To mark the beginning and end of each sentence, add the
+ start-of-sentence and end-of-sentence tokens.
+
+ The argument 'sentences' takes a list of str and 'n' is the order of the model.
+ The function returns the list of sentences with the tokens added.
+
+ For bigram models (or greater) both tokens are added; otherwise only one is added.
+ """
+
+ def add_sentence_tokens(sentences, n):
+     sos = SOS * (n-1) if n > 1 else SOS
+     return ['{}{} {}'.format(sos, s, EOS) for s in sentences]
+
+ """Replace singletons:
+
+ Each token appearing only once in the corpus is replaced with <UNK>.
+
+ The argument 'tokens' takes the tokens comprising the corpus.
+ The function returns the list of tokens after replacing each singleton with <UNK>.
+ """
+
+ def replace_singletons(tokens):
+     vocab = nltk.FreqDist(tokens)
+     return [token if vocab[token] > 1 else UNK for token in tokens]
+
+ """Preprocess:
+
+ The argument 'sentences' takes the list of str to preprocess.
+ The argument 'n' is the order of the model.
+ Adds the above tokens to the sentences and tokenizes.
+ The function returns the preprocessed sentences.
+ """
+
+ def preprocess(sentences, n):
+     sentences = add_sentence_tokens(sentences, n)
+     tokens = ' '.join(sentences).split(' ')
+     tokens = replace_singletons(tokens)
+     return tokens
+
+ import argparse
+ from itertools import product
+ import math
+ from pathlib import Path
+
+ """This function loads the training and testing corpus from a directory.
+ The argument 'data_dir' is the path of the directory, which should contain the files 'train.txt' and 'test.txt'.
+ The function returns the train and test sets as lists of sentences.
+ """
+
+ def load_data(data_dir):
+     train_path = data_dir + 'train.txt'
+     test_path = data_dir + 'test.txt'
+
+     with open(train_path, 'r') as f:
+         train = [l.strip() for l in f.readlines()]
+     with open(test_path, 'r') as f:
+         test = [l.strip() for l in f.readlines()]
+     return train, test
+
+ """Trained N-gram model:
+
+ A trained model for the given corpus is constructed by preprocessing the
+ corpus and calculating the smoothed probabilities of each n-gram.
+ The arguments are the training data (list of strings), n (integer; order of the model),
+ and an integer coefficient used for Laplace smoothing.
+ The model also has a method for calculating perplexity.
+ """
+
+ class LanguageModel(object):
+     def __init__(self, train_data, n, laplace=1):
+         self.n = n
+         self.laplace = laplace
+         self.tokens = preprocess(train_data, n)
+         self.vocab = nltk.FreqDist(self.tokens)
+         self.model = self._create_model()
+         self.masks = list(reversed(list(product((0, 1), repeat=n))))
+
+     def _smooth(self):
+         """
+         Computes the Laplace-smoothed probability of each n-gram from its count
+         in the training corpus and the count of its first n-1 tokens.
+         Returns each smoothed probability mapped to its n-gram.
+         """
+         vocab_size = len(self.vocab)
+
+         n_grams = nltk.ngrams(self.tokens, self.n)
+         n_vocab = nltk.FreqDist(n_grams)
+
+         m_grams = nltk.ngrams(self.tokens, self.n-1)
+         m_vocab = nltk.FreqDist(m_grams)
+
+         def smoothed_count(n_gram, n_count):
+             m_gram = n_gram[:-1]
+             m_count = m_vocab[m_gram]
+             return (n_count + self.laplace) / (m_count + self.laplace * vocab_size)
+
+         return { n_gram: smoothed_count(n_gram, count) for n_gram, count in n_vocab.items() }
+
+     def _create_model(self):
+         """
+         Creates a probability distribution over the vocabulary of the training corpus.
+         The probabilities in a unigram model are simply relative frequencies of each token over the whole corpus.
+         Otherwise, the probabilities are Laplace-smoothed relative frequencies.
+         Returns a dictionary mapping each n-gram (a tuple of strings) to its probability (a float).
+         """
+         if self.n == 1:
+             num_tokens = len(self.tokens)
+             return { (unigram,): count / num_tokens for unigram, count in self.vocab.items() }
+         else:
+             return self._smooth()
+
+     def _convert_oov(self, ngram):
+         """
+         Handles words encountered in the test set but not in training by converting
+         the given n-gram to one known by the model, trying each <UNK> mask in turn
+         and stopping as soon as the model contains an entry for a candidate.
+         Returns the n-gram with <UNK> tokens in the positions needed for the model
+         to contain an entry for it.
+         """
+         mask = lambda ngram, bitmask: tuple((token if flag == 1 else "<UNK>" for token, flag in zip(ngram, bitmask)))
+
+         ngram = (ngram,) if type(ngram) is str else ngram
+         for possible_known in [mask(ngram, bitmask) for bitmask in self.masks]:
+             if possible_known in self.model:
+                 return possible_known
+
+     def perplexity(self, test_data):
+         """
+         Calculates the perplexity of the model on the given test sentences and
+         returns it as a float.
+         """
+         test_tokens = preprocess(test_data, self.n)
+         test_ngrams = nltk.ngrams(test_tokens, self.n)
+         N = len(test_tokens)
+
+         known_ngrams = (self._convert_oov(ngram) for ngram in test_ngrams)
+         probabilities = [self.model[ngram] for ngram in known_ngrams]
+
+         return math.exp((-1/N) * sum(map(math.log, probabilities)))
+
+     def _best_candidate(self, prev, i, without=[]):
+         """
+         Selects the most probable next token on the basis of the previous
+         (n-1) tokens.
+         Takes the previous (n-1) tokens and a list of tokens to exclude
+         from the candidate list.
+         Returns the most probable token and its probability.
+         """
+         blacklist = ["<UNK>"] + without
+         candidates = ((ngram[-1], prob) for ngram, prob in self.model.items() if ngram[:-1] == prev)
+         candidates = filter(lambda candidate: candidate[0] not in blacklist, candidates)
+         candidates = sorted(candidates, key=lambda candidate: candidate[1], reverse=True)
+         if len(candidates) == 0:
+             return ("</s>", 1)
+         else:
+             return candidates[0 if prev != () and prev[-1] != "<s>" else i]
+
+ # data_path = '/content/drive/Shareddrives/MathProject22/Dataset/data/'
+ # train, test = load_data(data_path)
+
+ # #if __name__ == '__main__':
+ # model_instance = LanguageModel(train[0:100], 3, 0)
+ # # first number is the n of the n-gram
+ # # second number is the Laplace coefficient (0 means no smoothing)
+
+ # print(model_instance.perplexity(test))
+
+ # prev = ('I', 'love',)
+ # print(model_instance._best_candidate(prev, 1)[0])
+ # # the 1 is i: the ith-best candidate is returned
+
+ # import pickle
+ # filename = 'without_laplace.sav'
+ # pickle.dump(model_instance, open(filename, 'wb'))
+
+ # len(train)
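The commented-out driver at the bottom sketches the original training flow against files on Google Drive. A self-contained variant using a toy in-memory corpus in place of the train.txt the commit does not include (corpus contents and printed output are illustrative):

    import pickle
    from final_maybe import LanguageModel

    # Toy corpus standing in for train.txt; each string is one sentence.
    train = [
        "i love natural language processing",
        "i love language models",
        "language models love data",
    ]

    # Trigram model; laplace=1 turns on add-one smoothing in _smooth().
    model = LanguageModel(train, n=3, laplace=1)

    # _best_candidate expects the previous n-1 tokens as a tuple of strings.
    print(model._best_candidate(("i", "love"), 0))  # e.g. ('language', 0.22)

    # Persist the trained model the same way app.py expects to load it.
    with open("model.sav", "wb") as handle:
        pickle.dump(model, handle)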
model.sav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:90914edcae5441b12869a114f2ec6dca3bca84f2bd615f695adc5c24ea63392f
+ size 1083244544
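Because model.sav is stored through Git LFS (roughly 1 GB per the size field above), a clone without git-lfs checks out only this three-line pointer. A minimal sketch for guarding against unpickling an unresolved pointer; the error message is illustrative:

    # Detect whether model.sav is still a Git LFS pointer before unpickling.
    with open("model.sav", "rb") as f:
        head = f.read(64)
    if head.startswith(b"version https://git-lfs.github.com/spec/v1"):
        raise RuntimeError("model.sav is an LFS pointer; run `git lfs pull` first.")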