bayes-chatbot / main.py
ierhon's picture
Add n-gram update
157c5bd verified
import math
# Character substitutions used by Chatbot.tokenize when `letter_replace` is on:
# each key character is rewritten to its value before the text is split into
# tokens. Presumably this folds look-alike characters (e.g. Latin vs. Cyrillic
# glyphs) into a single form - confirm against the exact code points below.
# NOTE(review): tokenize only consults this table for characters that are also
# present in `letters`, so a key that is missing there (such as "3") is never
# substituted.
similar_letters = {"й": "и", "ё": "е", "e": "е", "t": "т", "i": "l", "o": "о", "k": "к", "3": "з", "a": "а", "x": "х", "c": "с", "m": "м"}
# Characters that tokenize keeps inside a token (note the trailing space, which
# acts as the word separator). Any character not listed here is surrounded by
# spaces and therefore becomes a token of its own.
letters = "qwertyuiopasdfghjklzxcvbnmёйцукенгшщзхъфывапролджэячсмитьбю "
def countwords(x):
    """Tally how many times each item occurs in the iterable *x*.

    Returns a plain dict mapping every distinct item to its number of
    occurrences; keys appear in first-seen order.
    """
    tally = {}
    for item in x:
        # A missing item starts from zero, so one expression covers both cases.
        tally[item] = tally.get(item, 0) + 1
    return tally
def add_dict(a, b):
    """Merge two count dictionaries into a new one, summing shared keys.

    Neither input is modified. The result lists the keys of *a* first (in
    their original order), followed by the keys that occur only in *b*.
    """
    merged = dict(a)
    for key, amount in b.items():
        if key in merged:
            # Shared key: add b's value onto a's.
            merged[key] = merged[key] + amount
        else:
            merged[key] = amount
    return merged
class Chatbot:
    """Retrieval chatbot that ranks known responses by token overlap.

    Training data maps a prompt to a response. For every response the model
    stores the token counts of all prompts that lead to it, the relative
    frequency of each token, and the share of training examples that used the
    response. A query is scored against each response by summing the stored
    relative frequencies of the query's tokens.

    ``self.model`` maps each response to a dict with the keys
    ``"word count"``, ``"probabilities"``, ``"weight count"`` and ``"weight"``.
    """

    def __init__(self, name = None, n: int = 1, letter_replace: bool = True, data: dict = None, frequency_weight: float = 0, div_by_len: bool = False):
        """Create a chatbot and optionally train it right away.

        Args:
            name: Stored on the instance; not used by the methods of this class.
            n: Largest n-gram size produced by ``tokenize`` (1 = single
                tokens only, 2 = single tokens and bigrams, ...).
            letter_replace: Apply the ``similar_letters`` substitutions while
                tokenizing.
            data: Optional ``{prompt: response}`` mapping passed to ``train``.
            frequency_weight: Blend factor for the response frequency; a score
                is multiplied by ``frequency_weight * weight +
                (1 - frequency_weight)``.
            div_by_len: Divide each score by the number of query tokens.
        """
        self.name = name
        self.letter_replace = letter_replace
        self.frequency_weight = frequency_weight
        self.div_by_len = div_by_len
        self.model = {}
        # Number of n-gram orders beyond single tokens.
        self.n = n-1
        if data is not None:
            self.train(data)

    def tokenize(self, text: str, n: int = 1):
        """Split *text* into lowercase tokens plus the configured n-grams.

        Characters listed in ``letters`` stay inside a token (after the
        optional ``similar_letters`` substitution); every other character is
        isolated as a token of its own. The result contains all single tokens
        followed by every contiguous 2-gram, 3-gram, ... up to the size given
        to the constructor, each joined with a single space.

        Args:
            text: Raw input text.
            n: Unused; kept so existing callers that pass it keep working.

        Returns:
            A list of token strings (empty when *text* holds no tokens).
        """
        pieces = []
        for char in text.lower():
            if char in letters:
                if self.letter_replace and char in similar_letters:
                    pieces.append(similar_letters[char])
                else:
                    pieces.append(char)
            else:
                # Pad with spaces so the character is split off by itself.
                pieces.append(" " + char + " ")
        tokens = "".join(pieces).split()
        output = list(tokens)
        # The previous loop sliced with sizes starting at 0, which produced no
        # real n-grams; sizes now run from 2 up to self.n + 1.
        for size in range(2, self.n + 2):
            for start in range(len(tokens) - size + 1):
                output.append(" ".join(tokens[start:start + size]))
        return output

    def train(self, data: dict):
        """Add ``{prompt: response}`` examples and refresh the statistics.

        May be called repeatedly; counts accumulate across calls. Empty
        *data* is accepted and leaves the stored counts untouched.
        """
        for prompt, response in data.items():
            counts = countwords(self.tokenize(prompt))
            if response not in self.model:
                self.model[response] = {"word count": counts, "probabilities": {}, "weight count": 1, "weight": 0}
            else:
                entry = self.model[response]
                entry["word count"] = add_dict(counts, entry["word count"])
                entry["weight count"] += 1
        # Normalise by every example seen so far, not just this batch, so the
        # weights stay correct after incremental training. For a single call
        # on a fresh model this equals len(data).
        total_examples = sum(entry["weight count"] for entry in self.model.values())
        examples_div = 1/total_examples if total_examples else 0.0
        for entry in self.model.values():
            word_count = entry["word count"]
            total_tokens = math.fsum(word_count.values())
            # A response whose prompts produced no tokens has nothing to
            # normalise; skip the division instead of dividing by zero.
            div = 1/total_tokens if total_tokens else 0.0
            entry["probabilities"] = {word: count*div for word, count in word_count.items()}
            entry["weight"] = entry["weight count"] * examples_div

    def get_responses(self, text: str):
        """Score every known response against *text*.

        Returns:
            A list of ``(response, score)`` tuples sorted by descending score.
            Text without any tokens gives every response a score of 0.
        """
        tokens = self.tokenize(text)
        # Guard against empty or whitespace-only input, which has no tokens.
        lentokens_div = 1/len(tokens) if tokens else 0.0
        scores = []
        for choice, entry in self.model.items():
            probabilities = entry["probabilities"]
            score = sum(probabilities.get(token, 0) for token in tokens)
            if self.div_by_len:
                score *= lentokens_div
            score *= self.frequency_weight*entry["weight"] + (1-self.frequency_weight)
            scores.append((choice, score))
        return sorted(scores, key=lambda pair: pair[1], reverse=True)

    def __call__(self, text: str):
        """Return the best-scoring response for *text*.

        Raises:
            IndexError: If the model has not been trained on any data.
        """
        return self.get_responses(text)[0][0]
if __name__ == "__main__":
    # Interactive console demo: train on dataset.json, then answer user input
    # until the bot replies with its farewell message.
    import json

    # Decode explicitly as UTF-8: without `encoding`, open() uses the platform
    # locale, which can corrupt or reject non-ASCII text in the dataset.
    with open("dataset.json", "r", encoding="utf-8") as file:
        data = json.load(file)
    cb = Chatbot(data=data)
    while True:
        message = input("User: ")
        if not message.strip():
            # Blank input carries no tokens to score, so prompt again instead
            # of querying the model.
            continue
        response = cb(message)
        print("Chatbot:", response)
        if response == "Пока":
            break