Spaces:

Shruhrid
/

Next_Word_Prediction

Runtime error

App Files Files Community

Next_Word_Prediction / final_maybe.py

Shruhrid Banthia

added requirements and removed some dependencies.

5671dcd over 2 years ago

raw

history blame contribute delete

No virus

7.15 kB

	# -- coding: utf-8 --
	"""final-maybe

	Automatically generated by Colaboratory.

	Original file is located at
	https://colab.research.google.com/drive/1UueiutTkBBB9Gb2Brp4mQVUvw7cjXTwo
	"""

	import nltk

	SOS = "<s> "
	EOS = "</s>"
	UNK = "<UNK>"

	"""Add Sentence Tokens:

	To identify the beginning and end of the sentence
	add the StartOfSentence and EndOfSentence tokens.

	The argument 'sentences' takes a list of str and 'n' is the order of the model.
	The function returns the list of sentences with the tokens added.

	For bigram or higher-order models, n-1 start tokens are prepended; a unigram model gets a single start token. One end token is always appended.
	"""

	def add_sentence_tokens(sentences, n):
	    """Wrap each sentence in start- and end-of-sentence markers.

	    Args:
	        sentences: Iterable of sentence strings.
	        n: Order of the n-gram model.

	    Returns:
	        A list with one string per input sentence: the start marker
	        repeated, then the sentence, a space, and the end marker.
	    """
	    # A unigram model still gets one start marker; higher orders get n-1
	    # so that the first real word has a full (n-1)-token history.
	    repeats = n - 1 if n > 1 else 1
	    prefix = SOS * repeats
	    return [f"{prefix}{sentence} {EOS}" for sentence in sentences]

	"""Replace singletons:

	Replace every token that appears only once in the corpus with <UNK>.

	The argument 'tokens' takes input of the tokens comprised in the corpus.
	The function returns list of tokens after replacing each singleton with <UNK>
	"""

	def replace_singletons(tokens):
	    """Swap every token that occurs at most once for the unknown marker.

	    Args:
	        tokens: List of token strings making up the corpus.

	    Returns:
	        A new list of the same length in which each token whose corpus
	        frequency is not greater than one has been replaced by ``UNK``.
	    """
	    frequencies = nltk.FreqDist(tokens)
	    return [UNK if frequencies[word] <= 1 else word for word in tokens]

	"""Preprocess:

	The argument 'sentences' is the list of str to preprocess.
	The argument 'n' is the order of the model.
	Adds the start- and end-of-sentence tokens to the sentences, tokenizes them, and replaces singletons with <UNK>.
	The function returns preprocessed sentences.
	"""

	def preprocess(sentences, n):
	    """Turn raw sentences into the flat token list the model is built from.

	    Args:
	        sentences: List of sentence strings.
	        n: Order of the n-gram model.

	    Returns:
	        A flat list of tokens covering all sentences, with sentence
	        markers added and singleton tokens replaced by the unknown marker.
	    """
	    marked = add_sentence_tokens(sentences, n)
	    # Tokenization is a split on single spaces, so consecutive spaces in
	    # the joined text produce empty-string tokens.
	    corpus_text = ' '.join(marked)
	    return replace_singletons(corpus_text.split(' '))


	from itertools import product
	import math
	from pathlib import Path

	""" This function loads training and testing corpus from a directory.
	The argument 'data_dir' contains path of the directory. The directory should contain files: 'train.txt' and 'test.txt'
	Function will return train and test sets as lists of sentences.
	"""

	def load_data(data_dir):
	    """Load the training and testing corpora from a directory.

	    Args:
	        data_dir: Path (str or path-like) of a directory containing
	            'train.txt' and 'test.txt'. A trailing path separator is
	            optional.

	    Returns:
	        A ``(train, test)`` tuple; each element is a list with one
	        whitespace-stripped string per line of the corresponding file.

	    Raises:
	        OSError: If either file cannot be opened.
	    """
	    # Join with Path so a directory given without a trailing separator
	    # does not silently become e.g. "datatrain.txt".
	    base = Path(data_dir)

	    def read_lines(filename):
	        # Explicit encoding keeps the result independent of the locale.
	        with open(base / filename, 'r', encoding='utf-8') as f:
	            return [line.strip() for line in f]

	    train = read_lines('train.txt')
	    test = read_lines('test.txt')
	    return train, test

	"""Trained N-gram model:

	A trained model for the given corpus is constructed by preprocessing the
	corpus and calculating the smoothed probabilities of each n-gram.
	The arguments contains training data (list of strings), n (integer; order of the model),
	and an integer used for laplace smoothing.
	Further, the model has a method for calculating perplexity.
	"""

	class LanguageModel(object):
	    """N-gram language model with optional Laplace smoothing.

	    The training sentences are preprocessed into a flat token list, and
	    every n-gram observed in that list is mapped to a probability.

	    Attributes:
	        n: Order of the model.
	        laplace: Smoothing constant added to the counts.
	        tokens: Preprocessed training tokens.
	        vocab: Frequency distribution of ``tokens``.
	        model: Dict mapping each observed n-gram (tuple of str) to its
	            probability (float).
	        masks: All keep/replace bitmasks of length ``n``, ordered from
	            "keep every token" to "replace every token".
	    """

	    def __init__(self, train_data, n, laplace=1):
	        """Train the model.

	        Args:
	            train_data: List of training sentences (str).
	            n: Order of the model.
	            laplace: Constant used for Laplace smoothing.
	        """
	        self.n = n
	        self.laplace = laplace
	        self.tokens = preprocess(train_data, n)
	        self.vocab = nltk.FreqDist(self.tokens)
	        self.model = self._create_model()
	        # product() yields (0, ..., 0) first; reversing puts the all-ones
	        # mask first so the least-masked known n-gram is found first.
	        self.masks = list(reversed(list(product((0, 1), repeat=n))))

	    def _smooth(self):
	        """Compute the smoothed probability of every observed n-gram.

	        Each n-gram count is divided by the count of its first n-1
	        tokens, with ``laplace`` added to the numerator and
	        ``laplace * vocabulary size`` added to the denominator.

	        Returns:
	            Dict mapping each observed n-gram to its smoothed probability.
	        """
	        vocab_size = len(self.vocab)

	        n_vocab = nltk.FreqDist(nltk.ngrams(self.tokens, self.n))
	        m_vocab = nltk.FreqDist(nltk.ngrams(self.tokens, self.n - 1))

	        def smoothed_count(n_gram, n_count):
	            # The history is the n-gram without its final token.
	            m_count = m_vocab[n_gram[:-1]]
	            return (n_count + self.laplace) / (m_count + self.laplace * vocab_size)

	        return {n_gram: smoothed_count(n_gram, count) for n_gram, count in n_vocab.items()}

	    def _create_model(self):
	        """Build the probability table for the training corpus.

	        For a unigram model the probabilities are the relative
	        frequencies of each token over the whole corpus; otherwise they
	        are the values computed by ``_smooth``.

	        Returns:
	            Dict mapping each n-gram (tuple of str) to its probability.
	        """
	        if self.n == 1:
	            num_tokens = len(self.tokens)
	            return {(unigram,): count / num_tokens for unigram, count in self.vocab.items()}
	        return self._smooth()

	    def _convert_oov(self, ngram):
	        """Map an n-gram onto a variant that the model has an entry for.

	        Tokens are replaced by "<UNK>" following each bitmask in
	        ``self.masks`` in turn; the first masked variant found in the
	        model is returned.

	        Args:
	            ngram: Tuple of tokens, or a single token given as a str.

	        Returns:
	            The first masked variant present in the model, or ``None``
	            when no variant is present.
	        """
	        if isinstance(ngram, str):
	            ngram = (ngram,)
	        for bitmask in self.masks:
	            possible_known = tuple(
	                token if flag == 1 else "<UNK>"
	                for token, flag in zip(ngram, bitmask)
	            )
	            if possible_known in self.model:
	                return possible_known
	        return None

	    def perplexity(self, test_data):
	        """Compute the perplexity of the model on the given sentences.

	        Args:
	            test_data: List of test sentences (str).

	        Returns:
	            The perplexity as a float.

	        Raises:
	            KeyError: If a test n-gram has no known variant in the model.
	        """
	        test_tokens = preprocess(test_data, self.n)
	        N = len(test_tokens)

	        probabilities = []
	        for ngram in nltk.ngrams(test_tokens, self.n):
	            known = self._convert_oov(ngram)
	            if known is None:
	                # Same exception type as the bare dict lookup would raise,
	                # but naming the offending n-gram.
	                raise KeyError(
	                    "no known variant of n-gram {!r} in the model".format(ngram)
	                )
	            probabilities.append(self.model[known])

	        return math.exp((-1 / N) * sum(map(math.log, probabilities)))

	    def _best_candidate(self, prev, i, without=None):
	        """Pick a probable next token given the previous n-1 tokens.

	        Args:
	            prev: Tuple holding the previous n-1 tokens.
	            i: Rank of the candidate to return (0 is the most probable).
	                It is only used when ``prev`` is empty or ends with
	                "<s>"; otherwise the most probable candidate is returned.
	            without: Optional iterable of tokens to exclude from the
	                candidates. "<UNK>" is always excluded.

	        Returns:
	            A ``(token, probability)`` tuple. ``("</s>", 1)`` is returned
	            when there is no candidate, or no candidate at rank ``i``.
	        """
	        blacklist = {"<UNK>"}
	        if without:
	            blacklist.update(without)

	        candidates = [
	            (ngram[-1], prob)
	            for ngram, prob in self.model.items()
	            if ngram[:-1] == prev and ngram[-1] not in blacklist
	        ]
	        candidates.sort(key=lambda candidate: candidate[1], reverse=True)
	        if not candidates:
	            return ("</s>", 1)

	        at_sentence_start = prev == () or prev[-1] == "<s>"
	        rank = i if at_sentence_start else 0
	        # A rank beyond the available candidates ends the sentence instead
	        # of raising IndexError.
	        if not -len(candidates) <= rank < len(candidates):
	            return ("</s>", 1)
	        return candidates[rank]

	# data_path = '/content/drive/Shareddrives/MathProject22/Dataset/data/'
	# train, test = load_data(data_path)

	# #if __name__ == '__main__':
	# model_instance= LanguageModel(train[0:100], 3, 0)
	# # first number is the n of n gram
	# # second number is the coefficient whether laplace used or not

	# print(model_instance.perplexity(test))

	# prev=('I','love',)
	# print(model_instance._best_candidate(prev,1)[0])
	# # `1 is ith best fit as a candidate

	# import pickle
	# filename = 'without_laplace.sav'
	# pickle.dump(model_instance, open(filename, 'wb'))

	# len(train)