"""Minimal bag-of-words chatbot.

Training phrases are tokenized into words (and optionally word n-grams); each
answer keeps the relative frequency of the tokens seen in its phrases.  A
query is scored against every answer by summing those frequencies for the
query's tokens, and the best-scoring answer is returned.
"""
import json
import math
from collections import Counter

# Characters that are replaced by a look-alike during tokenization when
# ``letter_replace`` is enabled (mostly Latin letters mapped to Cyrillic ones).
# NOTE(review): "3" is not part of ``letters``, so tokenize() treats it as a
# separator character and this particular entry is never applied - confirm
# whether digits were meant to be normalised.
similar_letters = {"й": "и", "ё": "е", "e": "е", "t": "т", "i": "l", "o": "о", "k": "к", "3": "з", "a": "а", "x": "х", "c": "с", "m": "м"}

# Characters kept inside a token; every other character becomes its own token.
# The trailing space keeps ordinary word separators intact.
letters = "qwertyuiopasdfghjklzxcvbnmёйцукенгшщзхъфывапролджэячсмитьбю "


def countwords(x):
    """Return a dict mapping each item of the iterable *x* to its count."""
    return dict(Counter(x))


def add_dict(a, b):
    """Return a new dict with the values of *a* and *b* summed key by key.

    Keys present in only one of the inputs keep their value unchanged.
    Neither input is modified.  Keys of *a* come first in the result.
    """
    merged = dict(a)
    for key, value in b.items():
        merged[key] = merged[key] + value if key in merged else value
    return merged


class Chatbot:
    """Frequency-based answer selector.

    ``self.model`` maps every answer to a dict with the keys:

    * ``"word count"``    - token -> number of occurrences in its phrases
    * ``"probabilities"`` - token -> share of that token among all tokens
    * ``"weight count"``  - number of training phrases for this answer
    * ``"weight"``        - share of this answer among all training phrases
    """

    def __init__(self, name = None, n: int = 1, letter_replace: bool = True, data: dict = None, frequency_weight: float = 0, div_by_len: bool = False):
        """Create a chatbot and optionally train it right away.

        Args:
            name: Free-form label stored on the instance.
            n: Maximum n-gram length produced by :meth:`tokenize`
                (1 means single words only).
            letter_replace: Replace look-alike characters using
                ``similar_letters`` while tokenizing.
            data: Optional ``{phrase: answer}`` mapping passed to
                :meth:`train`.
            frequency_weight: Blend factor between 0 (ignore how often an
                answer occurs in the training data) and 1 (scale the score
                fully by the answer's ``"weight"``).
            div_by_len: Divide every score by the number of query tokens.
        """
        self.name = name
        self.letter_replace = letter_replace
        self.frequency_weight = frequency_weight
        self.div_by_len = div_by_len
        self.model = {}
        # Stored as the number of *extra* n-gram orders beyond single words.
        self.n = n-1
        if data is not None:
            self.train(data)

    def tokenize(self, text: str, n: int = 1):
        """Split *text* into lowercase tokens, followed by word n-grams.

        Characters listed in ``letters`` form words; any other character is
        emitted as a separate token.  After the single-word tokens come all
        2-grams, then 3-grams, ... up to the length configured in the
        constructor, each joined with a single space.

        Args:
            text: Input string.
            n: Unused; kept so existing callers keep working.  The n-gram
                length is taken from the instance.

        Returns:
            List of tokens (empty for empty or whitespace-only input).
        """
        pieces = []
        for char in text.lower():
            if char in letters:
                if self.letter_replace and char in similar_letters:
                    pieces.append(similar_letters[char])
                else:
                    pieces.append(char)
            else:
                # Surround with spaces so the character becomes its own token.
                pieces.append(" " + char + " ")
        tokens = "".join(pieces).split()

        output = tokens.copy()
        # self.n extra orders -> n-gram sizes 2 .. self.n + 1.
        for size in range(2, self.n + 2):
            for start in range(len(tokens) - size + 1):
                output.append(" ".join(tokens[start:start + size]))
        return output

    def train(self, data: dict):
        """Add ``{phrase: answer}`` pairs to the model and refresh statistics.

        May be called repeatedly; counts accumulate and answer weights are
        normalised over every phrase seen so far.  An empty *data* mapping is
        accepted and leaves the counts untouched.
        """
        for phrase, answer in data.items():
            counts = countwords(self.tokenize(phrase))
            entry = self.model.get(answer)
            if entry is None:
                self.model[answer] = {"word count": counts, "probabilities": {}, "weight count": 1, "weight": 0}
            else:
                entry["word count"] = add_dict(counts, entry["word count"])
                entry["weight count"] += 1

        # Normalise over all samples ever trained, not just this batch, so
        # the weights of all answers keep summing to 1 after incremental
        # training.
        total_samples = sum(entry["weight count"] for entry in self.model.values())
        for entry in self.model.values():
            word_counts = entry["word count"]
            total_tokens = math.fsum(word_counts.values())
            if total_tokens:
                entry["probabilities"] = {
                    word: count / total_tokens
                    for word, count in word_counts.items()
                }
            else:
                # Phrase(s) contained no tokens at all (e.g. empty string).
                entry["probabilities"] = {}
            entry["weight"] = entry["weight count"] / total_samples

    def get_responses(self, text: str):
        """Score every known answer against *text*.

        Returns:
            List of ``(answer, score)`` tuples sorted by descending score.
            Input without any tokens gives every answer a score of 0.
        """
        tokens = self.tokenize(text)
        scores = []
        for choice, entry in self.model.items():
            probabilities = entry["probabilities"]
            score = sum(probabilities.get(token, 0) for token in tokens)
            # Guard against empty input: nothing to divide by, score is 0.
            if self.div_by_len and tokens:
                score /= len(tokens)
            score *= self.frequency_weight*entry["weight"] + (1-self.frequency_weight)
            scores.append((choice, score))
        return sorted(scores, key=lambda pair: pair[1], reverse=True)

    def __call__(self, text: str):
        """Return the best-scoring answer, or ``None`` if nothing was trained."""
        responses = self.get_responses(text)
        return responses[0][0] if responses else None


def main():
    """Run an interactive console chat using ``dataset.json``."""
    # JSON text is UTF-8; do not rely on the platform's default encoding.
    with open("dataset.json", "r", encoding="utf-8") as file:
        data = json.load(file)
    cb = Chatbot(data=data)
    while True:
        message = input("User: ")
        response = cb(message)
        print("Chatbot:", response)
        if response == "Пока":
            break


if __name__ == "__main__":
    main()