mshetairy committed on
Commit
a769454
1 Parent(s): bb5f61d

Update query_preprocessing

Browse files
Files changed (1) hide show
  1. naive_chatbot/naive_chatbot.py +28 -1
naive_chatbot/naive_chatbot.py CHANGED
@@ -2,6 +2,9 @@
2
  """Naive Chatbot"""
3
  import logging
4
  import pickle
 
 
 
5
  import numpy as np
6
  import tensorflow as tf
7
  from camel_tools.utils.normalize import normalize_unicode
@@ -34,7 +37,20 @@ max_length = 32
34
  oov_tok = '<OOV>' # Out of Vocabulary
35
  training_portion = 1
36
  previous_reply = 'احنا لسه في بداية الكلام'
37
-
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
  def load_pickle_data(filepath):
40
  with open(filepath, 'rb') as pickle_file:
@@ -114,6 +130,17 @@ class NaiveChatbot:
114
  pass
115
 
116
  def preprocess_query(self, query):
 
 
 
 
 
 
 
 
 
 
 
117
  norm = normalize_unicode(query)
118
  # Normalize alef variants to 'ا'
119
  norm = normalize_alef_ar(norm)
 
2
  """Naive Chatbot"""
3
  import logging
4
  import pickle
5
+ import string
6
+ import re
7
+
8
  import numpy as np
9
  import tensorflow as tf
10
  from camel_tools.utils.normalize import normalize_unicode
 
37
  oov_tok = '<OOV>' # Out of Vocabulary
38
  training_portion = 1
39
  previous_reply = 'احنا لسه في بداية الكلام'
40
+ arabic_punctuations = '''«»`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
41
+ english_punctuations = string.punctuation
42
+ punctuations_list = arabic_punctuations + english_punctuations
43
+ arabic_diacritics = re.compile("""
44
+ ّ | # Tashdid
45
+ َ | # Fatha
46
+ ً | # Tanwin Fath
47
+ ُ | # Damma
48
+ ٌ | # Tanwin Damm
49
+ ِ | # Kasra
50
+ ٍ | # Tanwin Kasr
51
+ ْ | # Sukun
52
+ ـ # Tatwil/Kashida
53
+ """, re.VERBOSE)
54
 
55
  def load_pickle_data(filepath):
56
  with open(filepath, 'rb') as pickle_file:
 
130
  pass
131
 
132
  def preprocess_query(self, query):
133
+ text = query.translate(str.maketrans('', '', punctuations_list))
134
+ # remove diacritics
135
+ text = re.sub(arabic_diacritics, '', str(text))
136
+ # remove emoji
137
+ regrex_pattern = re.compile(pattern = "["
138
+ u"\U0001F600-\U0001F64F" # emoticons
139
+ u"\U0001F300-\U0001F5FF" # symbols & pictographs
140
+ u"\U0001F680-\U0001F6FF" # transport & map symbols
141
+ u"\U0001F1E0-\U0001F1FF" # flags (iOS)
142
+ "]+", flags = re.UNICODE)
143
+ query = regrex_pattern.sub(r'',text)
144
  norm = normalize_unicode(query)
145
  # Normalize alef variants to 'ا'
146
  norm = normalize_alef_ar(norm)