# File size: 3,682 Bytes

import math

# Character substitution table applied by Chatbot.tokenize when its
# letter_replace flag is set: each key is replaced by its value.
# NOTE(review): the keys look like Latin letters mapped to visually similar
# Cyrillic ones (presumably to normalise mixed-alphabet typing) -- several
# glyphs are homoglyphs, so verify the actual code points before editing.
# NOTE(review): tokenize only consults this table for characters that are
# already in `letters`; "3" is not in `letters`, so that entry never fires.
similar_letters = {"й": "и", "ё": "е", "e": "е", "t": "т", "i": "l", "o": "о", "k": "к", "3": "з", "a": "а", "x": "х", "c": "с", "m": "м"}
# Characters that Chatbot.tokenize keeps as part of a word (the trailing
# space acts as the word separator). Any other character is surrounded by
# spaces and therefore becomes a token of its own.
letters = "qwertyuiopasdfghjklzxcvbnmёйцукенгшщзхъфывапролджэячсмитьбю "

def countwords(x):
    """Count how often each item occurs in the iterable *x*.

    Returns a plain dict mapping every distinct item to its number of
    occurrences; keys appear in order of first occurrence.
    """
    counts = {}
    for item in x:
        counts[item] = counts.get(item, 0) + 1
    return counts

def add_dict(a, b):
    """Merge two mappings into a new dict, adding values of shared keys.

    Keys of *a* come first (in *a*'s order), followed by the keys that
    exist only in *b*. For a key present in both, the result holds
    ``a[key] + b[key]``. Neither input is modified.
    """
    # Start from a's keys, summing wherever b has the same key.
    merged = {
        key: (a[key] + b[key]) if key in b else a[key]
        for key in a
    }
    # Then append whatever only b knows about.
    merged.update((key, b[key]) for key in b if key not in a)
    return merged

class Chatbot:
    """Pick a canned response for a message using per-response token statistics.

    The model is trained from a ``{text: response}`` mapping. For every
    distinct response it stores, under ``self.model[response]``:

    * ``"word count"``    -- token -> occurrences over all texts of that response
    * ``"probabilities"`` -- token -> share of that response's total token count
    * ``"weight count"``  -- number of training texts mapped to the response
    * ``"weight"``        -- ``"weight count"`` divided by the number of
      training texts seen so far
    """

    def __init__(self, name = None, n: int = 1, letter_replace: bool = True, data: dict = None, frequency_weight: float = 0, div_by_len: bool = False):
        """Create a chatbot and optionally train it right away.

        Args:
            name: Stored on the instance; not used by the class itself.
            n: Largest n-gram size produced by ``tokenize`` (1 = single
                tokens only).
            letter_replace: Whether ``tokenize`` applies the module-level
                ``similar_letters`` substitutions.
            data: Optional ``{text: response}`` mapping passed to ``train``.
            frequency_weight: Blend factor between a response's training
                frequency (``"weight"``) and a constant 1 when scoring.
            div_by_len: Whether scores are divided by the number of tokens
                in the query.
        """
        self.name = name
        self.letter_replace = letter_replace
        self.frequency_weight = frequency_weight
        self.div_by_len = div_by_len
        self.model = {}
        # Stored as the number of n-gram sizes beyond unigrams.
        self.n = n-1
        if data is not None:
            self.train(data)

    def tokenize(self, text: str, n: int = 1):
        """Split *text* into lowercase tokens plus the configured n-grams.

        Characters listed in the module-level ``letters`` stay part of a
        word (after optional ``similar_letters`` substitution); every other
        character becomes a token of its own.

        Args:
            text: Message to tokenize.
            n: Unused; kept for backward compatibility. The n-gram size is
                taken from the constructor's ``n``.

        Returns:
            The single tokens in order, followed by all 2-grams, 3-grams, ...
            up to the configured size, each joined with a single space.
        """
        pieces = []
        for char in text.lower():
            if char in letters:
                if self.letter_replace and char in similar_letters:
                    pieces.append(similar_letters[char])
                else:
                    pieces.append(char)
            else:
                # Pad so the character is split off as a separate token.
                pieces.append(" " + char + " ")
        tokens = "".join(pieces).split()
        output = tokens.copy()
        # self.n == n - 1, so sizes run from 2 up to the requested n.
        # (The previous loop sliced with tokens[:-0] and a window that was
        # one token too short, so it never produced a real n-gram.)
        for size in range(2, self.n + 2):
            for start in range(len(tokens) - size + 1):
                output.append(" ".join(tokens[start:start + size]))
        return output

    def train(self, data: dict):
        """Add the ``{text: response}`` pairs in *data* to the model.

        May be called repeatedly; counts accumulate and the probabilities
        and weights of every response are recomputed afterwards. Empty
        *data* is accepted and leaves the counts untouched.
        """
        for text, response in data.items():
            counts = countwords(self.tokenize(text))
            entry = self.model.get(response)
            if entry is None:
                self.model[response] = {"word count": counts, "probabilities": {}, "weight count": 1, "weight": 0}
            else:
                entry["word count"] = add_dict(counts, entry["word count"])
                entry["weight count"] += 1
        # Normalise by everything seen so far, not just this batch, so the
        # weights stay consistent with the accumulated "weight count" values.
        total_examples = sum(entry["weight count"] for entry in self.model.values())
        for entry in self.model.values():
            word_count = entry["word count"]
            total_tokens = math.fsum(word_count.values())
            if total_tokens:
                entry["probabilities"] = {word: count / total_tokens for word, count in word_count.items()}
            else:
                # A response whose texts produced no tokens has nothing to
                # normalise; avoid dividing by zero.
                entry["probabilities"] = {}
            entry["weight"] = entry["weight count"] / total_examples

    def get_responses(self, text: str):
        """Score every known response against *text*.

        A response's score is the sum of its stored probabilities for the
        query tokens, optionally divided by the number of query tokens, then
        multiplied by ``frequency_weight * weight + (1 - frequency_weight)``.

        Returns:
            A list of ``(response, score)`` tuples, best score first. The
            list is empty when the model has not been trained.
        """
        tokens = self.tokenize(text)
        # Only compute the reciprocal when it is needed and defined; an empty
        # query scores 0 for every response instead of raising.
        length_factor = 1 / len(tokens) if self.div_by_len and tokens else None
        scores = []
        for choice, entry in self.model.items():
            probabilities = entry["probabilities"]
            score = sum(probabilities.get(token, 0) for token in tokens)
            if length_factor is not None:
                score *= length_factor
            score *= self.frequency_weight * entry["weight"] + (1 - self.frequency_weight)
            scores.append((choice, score))
        return sorted(scores, key=lambda pair: pair[1], reverse=True)

    def __call__(self, text: str):
        """Return the best-scoring response for *text*.

        Raises:
            IndexError: If the model contains no responses.
        """
        return self.get_responses(text)[0][0]

if __name__ == "__main__":
    # Interactive demo: train on dataset.json and chat on the console until
    # the bot answers with its farewell response or stdin is closed.
    import json

    # JSON interchange text is UTF-8; do not rely on the platform default
    # encoding, which would garble non-ASCII text on some systems.
    with open("dataset.json", "r", encoding="utf-8") as file:
        data = json.load(file)

    cb = Chatbot(data=data)
    while True:
        try:
            message = input("User: ")
        except EOFError:
            # stdin closed (Ctrl-D / end of piped input): leave quietly.
            break
        if not message.strip():
            # Whitespace-only input yields no tokens to score; prompt again.
            continue
        response = cb(message)
        print("Chatbot:", response)
        if response == "Пока":
            break