ierhon committed on
Commit
31aa98a
1 Parent(s): 630df93

Create main.py

Browse files
Files changed (1) hide show
  1. main.py +90 -0
main.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+
3
# Character folding table used by Chatbot.tokenize when letter replacement is
# enabled: look-alike Latin letters and digits are mapped onto the letter they
# resemble, and "й"/"ё" are folded onto "и"/"е".
# NOTE(review): "i" maps to Latin "l", not to a Cyrillic letter - confirm intent.
similar_letters = {
    "й": "и",
    "ё": "е",
    "e": "е",
    "t": "т",
    "i": "l",
    "o": "о",
    "k": "к",
    "3": "з",
    "a": "а",
    "x": "х",
    "c": "с",
    "m": "м",
}

# Every character the tokenizer treats as a letter: the 26 lowercase Latin
# letters followed by the 33 lowercase Russian letters. Whitespace, digits and
# punctuation are deliberately absent.
letters = "qwertyuiopasdfghjklzxcvbnmёйцукенгшщзхъфывапролджэячсмитьбю"
5
+
6
def countwords(x):
    """Return a dict mapping each item of *x* to its number of occurrences.

    *x* is any iterable of hashable items (the chatbot passes a list of
    tokens). Keys appear in first-seen order; an empty iterable gives ``{}``.
    """
    counts = {}
    for word in x:
        counts[word] = counts.get(word, 0) + 1
    return counts
14
+
15
def add_dict(a, b):
    """Return a new dict holding the key-wise sum of *a* and *b*.

    Keys present in both inputs get ``a[key] + b[key]``; keys present in only
    one input keep that input's value. Keys of *a* come first, followed by the
    keys that only *b* has. Neither input is modified.
    """
    merged = dict(a)
    for key, value in b.items():
        # Only add when the key already exists, so values never need a
        # numeric zero to start from.
        merged[key] = merged[key] + value if key in merged else value
    # Return the merged result; handing back `a` would silently discard
    # everything contributed by `b`.
    return merged
26
+
27
class Chatbot:
    """Pick a canned response by bag-of-words overlap with the training prompts.

    Training data is a ``{prompt: response}`` dict. For every distinct
    response the model stores, under these keys:

    * ``"word count"``    - token -> occurrences over all prompts of that response
    * ``"probabilities"`` - token -> share of that response's tokens
    * ``"weight count"``  - number of prompts mapped to that response
    * ``"weight"``        - ``"weight count"`` divided by the total number of prompts

    Tokenization relies on the module-level ``letters`` and
    ``similar_letters`` tables.
    """

    def __init__(
        self,
        name=None,
        letter_replace: bool = True,
        data: dict = None,
        frequency_weight: float = 0,
        div_by_len: bool = False,
    ):
        """Configure the bot and optionally train it straight away.

        Args:
            name: Stored as-is on the instance; not used by the class itself.
            letter_replace: Fold characters through ``similar_letters`` while
                tokenizing.
            data: Optional ``{prompt: response}`` dict passed to ``train``.
            frequency_weight: Blend factor applied to each score as
                ``frequency_weight * weight + (1 - frequency_weight)``;
                0 ignores how often a response occurred in training.
            div_by_len: Divide each score by the number of input tokens.
        """
        self.name = name
        self.letter_replace = letter_replace
        self.frequency_weight = frequency_weight
        self.div_by_len = div_by_len
        self.model = {}
        if data is not None:
            self.train(data)

    def tokenize(self, text: str):
        """Lower-case *text* and split it into tokens made of known letters.

        Whitespace separates tokens. Characters that are neither whitespace,
        nor in ``letters``, nor (with ``letter_replace``) a key of
        ``similar_letters`` are dropped without splitting the word.
        """
        kept = []
        for char in text.lower():
            if self.letter_replace and char in similar_letters:
                # Checked before `letters` so that look-alikes which are not
                # letters themselves (the digit "3") are folded too.
                kept.append(similar_letters[char])
            elif char in letters:
                kept.append(char)
            elif char.isspace():
                # Keep word boundaries: without this every input collapses
                # into one glued-together token and split() has nothing to do.
                kept.append(" ")
        return "".join(kept).split()

    def train(self, data: dict):
        """Add ``{prompt: response}`` pairs to the model and refresh statistics.

        May be called repeatedly; counts accumulate and weights are always
        normalized over every prompt seen so far. Empty *data* is a no-op.
        """
        for text, response in data.items():
            entry = self.model.get(response)
            if entry is None:
                entry = {"word count": {}, "probabilities": {}, "weight count": 0, "weight": 0}
                self.model[response] = entry
            counts = entry["word count"]
            for token in self.tokenize(text):
                counts[token] = counts.get(token, 0) + 1
            entry["weight count"] += 1

        total_samples = sum(entry["weight count"] for entry in self.model.values())
        if not total_samples:
            # Nothing has ever been trained; avoid dividing by zero below.
            return
        for entry in self.model.values():
            counts = entry["word count"]
            total_tokens = sum(counts.values())
            # A response whose prompts held no usable letters has no tokens;
            # give it an empty distribution instead of dividing by zero.
            entry["probabilities"] = (
                {word: count / total_tokens for word, count in counts.items()}
                if total_tokens
                else {}
            )
            entry["weight"] = entry["weight count"] / total_samples

    def get_responses(self, text: str):
        """Return ``(response, score)`` pairs sorted by descending score.

        A response's score is the sum of its token probabilities over the
        tokens of *text*, optionally divided by the token count
        (``div_by_len``) and scaled by the frequency blend. Input without any
        tokens scores 0 for every response. Ties keep model insertion order.
        """
        tokens = self.tokenize(text)
        scores = []
        for choice, entry in self.model.items():
            probabilities = entry["probabilities"]
            score = sum(probabilities.get(token, 0) for token in tokens)
            if self.div_by_len and tokens:
                score /= len(tokens)
            score *= self.frequency_weight * entry["weight"] + (1 - self.frequency_weight)
            scores.append((choice, score))
        # list.sort is stable, also with reverse=True, so equal scores keep
        # the order in which responses were first trained.
        scores.sort(key=lambda item: item[1], reverse=True)
        return scores

    def __call__(self, text: str):
        """Return the best-scoring response for *text*.

        Raises:
            IndexError: If the model has no trained responses.
        """
        ranked = self.get_responses(text)
        if not ranked:
            raise IndexError("Chatbot has no trained responses to choose from")
        return ranked[0][0]
78
+
79
if __name__ == "__main__":
    import json

    # Interactive demo: train on dataset.json (a {prompt: response} object)
    # and answer console input until the user says goodbye.
    # JSON files are UTF-8 by convention and this dataset presumably holds
    # Cyrillic text, so do not rely on the platform's default encoding.
    with open("dataset.json", "r", encoding="utf-8") as file:
        data = json.load(file)

    cb = Chatbot(data=data)
    while True:
        try:
            message = input("User: ").lower()
        except EOFError:
            # stdin was closed (Ctrl-D or exhausted pipe): stop without a traceback.
            break
        print("Chatbot:", cb(message))
        # "пока" ("bye") ends the session after the bot has replied.
        if "пока" in message:
            break