ierhon committed on
Commit
157c5bd
1 Parent(s): a995a3b

Add n-gram update

Browse files
Files changed (1) hide show
  1. main.py +10 -4
main.py CHANGED
@@ -25,15 +25,16 @@ def add_dict(a, b):
25
  return temp
26
 
27
  class Chatbot:
28
- def __init__(self, name=None, letter_replace: bool = True, data: dict = None, frequency_weight: float = 0, div_by_len: bool = False):
29
  self.name = name
30
  self.letter_replace = letter_replace
31
  self.frequency_weight = frequency_weight
32
  self.div_by_len = div_by_len
33
  self.model = {}
 
34
  if data is not None:
35
  self.train(data)
36
- def tokenize(self, text: str):
37
  preprocess = ""
38
  for x in text.lower():
39
  if x in letters:
@@ -42,8 +43,13 @@ class Chatbot:
42
  else:
43
  preprocess += x
44
  else:
45
- preprocess += " "+x+" "
46
- return preprocess.split()
 
 
 
 
 
47
  def train(self, data: dict):
48
  lendata = len(data)
49
  lendata_div = 1/lendata
 
25
  return temp
26
 
27
  class Chatbot:
28
+ def __init__(self, name = None, n: int = 1, letter_replace: bool = True, data: dict = None, frequency_weight: float = 0, div_by_len: bool = False):
29
  self.name = name
30
  self.letter_replace = letter_replace
31
  self.frequency_weight = frequency_weight
32
  self.div_by_len = div_by_len
33
  self.model = {}
34
+ self.n = n-1
35
  if data is not None:
36
  self.train(data)
37
+ def tokenize(self, text: str, n: int = 1):
38
  preprocess = ""
39
  for x in text.lower():
40
  if x in letters:
 
43
  else:
44
  preprocess += x
45
  else:
46
+ preprocess += " " + x + " "
47
+ tokens = preprocess.split()
48
+ output = tokens.copy()
49
+ for i in range(self.n):
50
+ for num, word in enumerate(tokens[:-i]):
51
+ output.append(' '.join(tokens[num:num+i]))
52
+ return output
53
  def train(self, data: dict):
54
  lendata = len(data)
55
  lendata_div = 1/lendata