ierhon committed on
Commit
c358215
1 Parent(s): 251ec78

Update tokenizer

Browse files
Files changed (1) hide show
  1. main.py +3 -1
main.py CHANGED
@@ -1,7 +1,7 @@
1
  import math
2
 
3
  similar_letters = {"й": "и", "ё": "е", "e": "е", "t": "т", "i": "l", "o": "о", "k": "к", "3": "з", "a": "а", "x": "х", "c": "с", "m": "м"}
4
- letters = "qwertyuiopasdfghjklzxcvbnmёйцукенгшщзхъфывапролджэячсмитьбю"
5
 
6
  def countwords(x):
7
  temp = {}
@@ -41,6 +41,8 @@ class Chatbot:
41
  preprocess += similar_letters[x]
42
  else:
43
  preprocess += x
 
 
44
  return preprocess.split()
45
  def train(self, data: dict):
46
  lendata = len(data)
 
1
  import math
2
 
3
  similar_letters = {"й": "и", "ё": "е", "e": "е", "t": "т", "i": "l", "o": "о", "k": "к", "3": "з", "a": "а", "x": "х", "c": "с", "m": "м"}
4
+ letters = "qwertyuiopasdfghjklzxcvbnmёйцукенгшщзхъфывапролджэячсмитьбю "
5
 
6
  def countwords(x):
7
  temp = {}
 
41
  preprocess += similar_letters[x]
42
  else:
43
  preprocess += x
44
+ else:
45
+ preprocess += " "+x+" "
46
  return preprocess.split()
47
  def train(self, data: dict):
48
  lendata = len(data)