Spaces:
Runtime error
Runtime error
Shruhrid Banthia
committed on
Commit
•
0d083c9
1
Parent(s):
6164485
model.sav and final_maybe used
Browse files- .gitattributes +2 -0
- app.py +32 -0
- final_maybe.py +196 -0
- model.sav +3 -0
.gitattributes
CHANGED
@@ -26,3 +26,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
26 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
27 |
*.zstandard filter=lfs diff=lfs merge=lfs -text
|
28 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
26 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
27 |
*.zstandard filter=lfs diff=lfs merge=lfs -text
|
28 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.sav filter=lfs diff=lfs merge=lfs -text
|
30 |
+
model.sav filter=lfs diff=lfs merge=lfs -text
|
app.py
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import gradio.inputs
|
3 |
+
import pandas as pd
|
4 |
+
|
5 |
+
import os #interacting with input and output directories
|
6 |
+
|
7 |
+
import pickle
|
8 |
+
from final_maybe import LanguageModel
|
9 |
+
# Load the trained n-gram model once at import time so every request
# does not re-read the (large) pickle from disk.
# NOTE(review): pickle.load is acceptable only because model.sav ships
# with this Space; never unpickle untrusted input.
with open('model.sav', 'rb') as handle:
    loaded_model = pickle.load(handle)

def fn(X_test):
    """Predict the next word for the space-separated prompt ``X_test``.

    The prompt is split on single spaces into a tuple of context tokens
    and passed to ``LanguageModel._best_candidate``, which returns a
    (token, probability) pair for the most likely continuation.
    """
    # str.split already yields strings, so the original
    # tuple(map(str, ...)) conversion was redundant.
    X_final = tuple(X_test.split(' '))
    # i=0 -> take the single best-ranked candidate.
    result = loaded_model._best_candidate(X_final, 0)
    return result
19 |
+
# Text shown under the title in the Gradio UI.
description = "Give two words as input and our model will predict the next word"

# Build the web UI: a one-line textbox feeding fn(), plain-text output.
here = gr.Interface(
    fn=fn,
    inputs=gradio.inputs.Textbox(lines=1, placeholder=None, default="", label=None),
    outputs='text',
    title="Next Word Prediction",
    description=description,
    theme="default",
    allow_flagging="auto",
    flagging_dir='flagging records',
)

if __name__ == "__main__":
    # share=True exposes a temporary public URL in addition to the local one.
    app, local_url, share_url = here.launch(share=True)
final_maybe.py
ADDED
@@ -0,0 +1,196 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
"""final-maybe
|
3 |
+
|
4 |
+
Automatically generated by Colaboratory.
|
5 |
+
|
6 |
+
Original file is located at
|
7 |
+
https://colab.research.google.com/drive/1UueiutTkBBB9Gb2Brp4mQVUvw7cjXTwo
|
8 |
+
"""
|
9 |
+
|
10 |
+
import nltk
|
11 |
+
|
12 |
+
# Sentence-boundary and unknown-word marker tokens.
SOS = "<s> "
EOS = "</s>"
UNK = "<UNK>"

def add_sentence_tokens(sentences, n):
    """Wrap each sentence with start/end-of-sentence markers.

    For a model of order ``n`` the start marker ``<s>`` is repeated
    n-1 times (bigram and above); a unigram model gets a single one.
    ``sentences`` is a list of str; returns the list of padded sentences.
    """
    prefix = SOS if n <= 1 else SOS * (n - 1)
    return [f'{prefix}{sentence} {EOS}' for sentence in sentences]
|
30 |
+
|
31 |
+
"""Replace singletons:
|
32 |
+
|
33 |
+
For the tokens appearing only ones in the corpus, replace it with <UNK>
|
34 |
+
|
35 |
+
The argument 'tokens' takes input of the tokens comprised in the corpus.
|
36 |
+
The function returns list of tokens after replacing each singleton with <UNK>
|
37 |
+
"""
|
38 |
+
|
39 |
+
def replace_singletons(tokens):
|
40 |
+
vocab = nltk.FreqDist(tokens)
|
41 |
+
return [token if vocab[token] > 1 else UNK for token in tokens]
|
42 |
+
|
43 |
+
"""Preprocess:
|
44 |
+
|
45 |
+
The function takes the argument 'sentences' that takes the list of str of
|
46 |
+
preprocess. The argument 'n' is the order of the model.
|
47 |
+
Adds the above three tokens to the sentences and tokenize.
|
48 |
+
The function returns preprocessed sentences.
|
49 |
+
"""
|
50 |
+
|
51 |
+
def preprocess(sentences, n):
|
52 |
+
sentences = add_sentence_tokens(sentences, n)
|
53 |
+
tokens = ' '.join(sentences).split(' ')
|
54 |
+
tokens = replace_singletons(tokens)
|
55 |
+
return tokens
|
56 |
+
|
57 |
+
import argparse
|
58 |
+
from itertools import product
|
59 |
+
import math
|
60 |
+
from pathlib import Path
|
61 |
+
|
62 |
+
""" This function loads training and testing corpus from a directory.
|
63 |
+
The argument 'data_dir' contains path of the directory. The directory should contain files: 'train.txt' and 'test.txt'
|
64 |
+
Function will return train and test sets as lists of sentences.
|
65 |
+
"""
|
66 |
+
|
67 |
+
def load_data(data_dir):
|
68 |
+
train_path = data_dir + 'train.txt'
|
69 |
+
test_path = data_dir + 'test.txt'
|
70 |
+
|
71 |
+
with open(train_path, 'r') as f:
|
72 |
+
train = [l.strip() for l in f.readlines()]
|
73 |
+
with open(test_path, 'r') as f:
|
74 |
+
test = [l.strip() for l in f.readlines()]
|
75 |
+
return train, test
|
76 |
+
|
77 |
+
"""Trained N-gram model:
|
78 |
+
|
79 |
+
A trained model for the given corpus is constructed by preprocessing the
|
80 |
+
corpus and calculating the smoothed probabilities of each n-gram.
|
81 |
+
The arguments contains training data (list of strings), n (integer; order of the model),
|
82 |
+
and an integer used for laplace smoothing.
|
83 |
+
Further, the model has a method for calculating perplexity.
|
84 |
+
"""
|
85 |
+
|
86 |
+
class LanguageModel(object):
|
87 |
+
def __init__(self, train_data, n, laplace=1):
|
88 |
+
self.n = n
|
89 |
+
self.laplace = laplace
|
90 |
+
self.tokens = preprocess(train_data, n)
|
91 |
+
self.vocab = nltk.FreqDist(self.tokens)
|
92 |
+
self.model = self._create_model()
|
93 |
+
self.masks = list(reversed(list(product((0,1), repeat=n))))
|
94 |
+
|
95 |
+
def _smooth(self):
|
96 |
+
"""
|
97 |
+
The n tokens of n-gram in training corpus and first n-1 tokens of each n-gram
|
98 |
+
results in Laplace smoothenedd probability.
|
99 |
+
The function returns the smoothened probability mapped to its n-gram.
|
100 |
+
|
101 |
+
"""
|
102 |
+
vocab_size = len(self.vocab)
|
103 |
+
|
104 |
+
n_grams = nltk.ngrams(self.tokens, self.n)
|
105 |
+
n_vocab = nltk.FreqDist(n_grams)
|
106 |
+
|
107 |
+
m_grams = nltk.ngrams(self.tokens, self.n-1)
|
108 |
+
m_vocab = nltk.FreqDist(m_grams)
|
109 |
+
|
110 |
+
def smoothed_count(n_gram, n_count):
|
111 |
+
m_gram = n_gram[:-1]
|
112 |
+
m_count = m_vocab[m_gram]
|
113 |
+
return (n_count + self.laplace) / (m_count + self.laplace * vocab_size)
|
114 |
+
|
115 |
+
return { n_gram: smoothed_count(n_gram, count) for n_gram, count in n_vocab.items() }
|
116 |
+
|
117 |
+
def _create_model(self):
|
118 |
+
"""
|
119 |
+
This function creates a probability distribution of the vocabulary of training corpus.
|
120 |
+
The probabilities in a unigram model are simply relative frequencies of each token over the whole corpus.
|
121 |
+
Otherwise, the relative frequencies are Laplace-smoothed probabilities.
|
122 |
+
Function returns a dictionary which maps each n-gram, which is in the form of tuple of strings, to its probabilities (float)
|
123 |
+
|
124 |
+
"""
|
125 |
+
if self.n == 1:
|
126 |
+
num_tokens = len(self.tokens)
|
127 |
+
return { (unigram,): count / num_tokens for unigram, count in self.vocab.items() }
|
128 |
+
else:
|
129 |
+
return self._smooth()
|
130 |
+
|
131 |
+
def _convert_oov(self, ngram):
|
132 |
+
"""
|
133 |
+
This function handles the words which are encountered in the test and converts the given n-gram to one which is known by the model.
|
134 |
+
Stop when the model contains an entry for every permutation.
|
135 |
+
The function returns n-gram with <UNK> tokens in certain positions such that the model
|
136 |
+
contains an entry for it.
|
137 |
+
"""
|
138 |
+
mask = lambda ngram, bitmask: tuple((token if flag == 1 else "<UNK>" for token,flag in zip(ngram, bitmask)))
|
139 |
+
|
140 |
+
ngram = (ngram,) if type(ngram) is str else ngram
|
141 |
+
for possible_known in [mask(ngram, bitmask) for bitmask in self.masks]:
|
142 |
+
if possible_known in self.model:
|
143 |
+
return possible_known
|
144 |
+
|
145 |
+
def perplexity(self, test_data):
|
146 |
+
"""
|
147 |
+
Perplexity of the model is calculated using the sentences and returns
|
148 |
+
a float value.
|
149 |
+
|
150 |
+
"""
|
151 |
+
test_tokens = preprocess(test_data, self.n)
|
152 |
+
test_ngrams = nltk.ngrams(test_tokens, self.n)
|
153 |
+
N = len(test_tokens)
|
154 |
+
|
155 |
+
known_ngrams = (self._convert_oov(ngram) for ngram in test_ngrams)
|
156 |
+
probabilities = [self.model[ngram] for ngram in known_ngrams]
|
157 |
+
|
158 |
+
return math.exp((-1/N) * sum(map(math.log, probabilities)))
|
159 |
+
|
160 |
+
def _best_candidate(self, prev, i, without=[]):
|
161 |
+
"""
|
162 |
+
Selects the most probable token depending on the basis of previous
|
163 |
+
(n-1) tokens.
|
164 |
+
The function takes the argument of previous (n-1) tokens, and the tokens to
|
165 |
+
exclude from candidates list.
|
166 |
+
The function returns the most probable token and its probability.
|
167 |
+
|
168 |
+
"""
|
169 |
+
blacklist = ["<UNK>"] + without
|
170 |
+
candidates = ((ngram[-1],prob) for ngram,prob in self.model.items() if ngram[:-1]==prev)
|
171 |
+
candidates = filter(lambda candidate: candidate[0] not in blacklist, candidates)
|
172 |
+
candidates = sorted(candidates, key=lambda candidate: candidate[1], reverse=True)
|
173 |
+
if len(candidates) == 0:
|
174 |
+
return ("</s>", 1)
|
175 |
+
else:
|
176 |
+
return candidates[0 if prev != () and prev[-1] != "<s>" else i]
|
177 |
+
|
178 |
+
# data_path = '/content/drive/Shareddrives/MathProject22/Dataset/data/'
|
179 |
+
# train, test = load_data(data_path)
|
180 |
+
|
181 |
+
# #if __name__ == '__main__':
|
182 |
+
# model_instance= LanguageModel(train[0:100], 3, 0)
|
183 |
+
# # first number is the n of n gram
|
184 |
+
# # second number is the coefficient whether laplace used or not
|
185 |
+
|
186 |
+
# print(model_instance.perplexity(test))
|
187 |
+
|
188 |
+
# prev=('I','love',)
|
189 |
+
# print(model_instance._best_candidate(prev,1)[0])
|
190 |
+
# # `1 is ith best fit as a candidate
|
191 |
+
|
192 |
+
# import pickle
|
193 |
+
# filename = 'without_laplace.sav'
|
194 |
+
# pickle.dump(model_instance, open(filename, 'wb'))
|
195 |
+
|
196 |
+
# len(train)
|
model.sav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:90914edcae5441b12869a114f2ec6dca3bca84f2bd615f695adc5c24ea63392f
|
3 |
+
size 1083244544
|