Spaces:
Sleeping
Sleeping
Update query_preprocessing
Browse files
naive_chatbot/naive_chatbot.py
CHANGED
@@ -2,6 +2,9 @@
|
|
2 |
"""Naive Chatbot"""
|
3 |
import logging
|
4 |
import pickle
|
|
|
|
|
|
|
5 |
import numpy as np
|
6 |
import tensorflow as tf
|
7 |
from camel_tools.utils.normalize import normalize_unicode
|
@@ -34,7 +37,20 @@ max_length = 32
|
|
34 |
oov_tok = '<OOV>' # Out of Vocabulary
|
35 |
training_portion = 1
|
36 |
previous_reply = 'احنا لسه في بداية الكلام'
|
37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
|
39 |
def load_pickle_data(filepath):
|
40 |
with open(filepath, 'rb') as pickle_file:
|
@@ -114,6 +130,17 @@ class NaiveChatbot:
|
|
114 |
pass
|
115 |
|
116 |
def preprocess_query(self, query):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
117 |
norm = normalize_unicode(query)
|
118 |
# Normalize alef variants to 'ا'
|
119 |
norm = normalize_alef_ar(norm)
|
|
|
2 |
"""Naive Chatbot"""
|
3 |
import logging
|
4 |
import pickle
|
5 |
+
import string
|
6 |
+
import re
|
7 |
+
|
8 |
import numpy as np
|
9 |
import tensorflow as tf
|
10 |
from camel_tools.utils.normalize import normalize_unicode
|
|
|
37 |
oov_tok = '<OOV>' # Out of Vocabulary
|
38 |
training_portion = 1
|
39 |
previous_reply = 'احنا لسه في بداية الكلام'
|
40 |
+
arabic_punctuations = '''«»`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
|
41 |
+
english_punctuations = string.punctuation
|
42 |
+
punctuations_list = arabic_punctuations + english_punctuations
|
43 |
+
arabic_diacritics = re.compile("""
|
44 |
+
ّ | # Tashdid
|
45 |
+
َ | # Fatha
|
46 |
+
ً | # Tanwin Fath
|
47 |
+
ُ | # Damma
|
48 |
+
ٌ | # Tanwin Damm
|
49 |
+
ِ | # Kasra
|
50 |
+
ٍ | # Tanwin Kasr
|
51 |
+
ْ | # Sukun
|
52 |
+
ـ # Tatwil/Kashida
|
53 |
+
""", re.VERBOSE)
|
54 |
|
55 |
def load_pickle_data(filepath):
|
56 |
with open(filepath, 'rb') as pickle_file:
|
|
|
130 |
pass
|
131 |
|
132 |
def preprocess_query(self, query):
|
133 |
+
text = query.translate(str.maketrans('', '', punctuations_list))
|
134 |
+
# remove diacritics
|
135 |
+
text = re.sub(arabic_diacritics, '', str(text))
|
136 |
+
# remove emoji
|
137 |
+
regrex_pattern = re.compile(pattern = "["
|
138 |
+
u"\U0001F600-\U0001F64F" # emoticons
|
139 |
+
u"\U0001F300-\U0001F5FF" # symbols & pictographs
|
140 |
+
u"\U0001F680-\U0001F6FF" # transport & map symbols
|
141 |
+
u"\U0001F1E0-\U0001F1FF" # flags (iOS)
|
142 |
+
"]+", flags = re.UNICODE)
|
143 |
+
query = regrex_pattern.sub(r'',text)
|
144 |
norm = normalize_unicode(query)
|
145 |
# Normalize alef variants to 'ا'
|
146 |
norm = normalize_alef_ar(norm)
|