Spaces:
Sleeping
Sleeping
Update chat.py
Browse files
chat.py
CHANGED
@@ -1,94 +1,94 @@
|
|
1 |
-
|
2 |
-
import streamlit as st
|
3 |
-
import pickle
|
4 |
-
import pandas as pd
|
5 |
-
import re # Regular expressions to use sub function for replacing the useless text from the data
|
6 |
-
import tensorflow as tf
|
7 |
-
import numpy as np
|
8 |
-
import pickle
|
9 |
-
from tensorflow.keras.preprocessing.text import Tokenizer
|
10 |
-
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
def clean_text(text):
|
15 |
-
text = re.sub(r',', '', text)
|
16 |
-
text = re.sub(r'\'', '', text)
|
17 |
-
text = re.sub(r'\"', '', text)
|
18 |
-
text = re.sub(r'\(', '', text)
|
19 |
-
text = re.sub(r'\)', '', text)
|
20 |
-
text = re.sub(r'\n', '', text)
|
21 |
-
text = re.sub(r'β', '', text)
|
22 |
-
text = re.sub(r'β', '', text)
|
23 |
-
text = re.sub(r'β', '', text)
|
24 |
-
text = re.sub(r'\.', '', text)
|
25 |
-
text = re.sub(r';', '', text)
|
26 |
-
text = re.sub(r':', '', text)
|
27 |
-
text = re.sub(r'\-', '', text)
|
28 |
-
return text
|
29 |
-
|
30 |
-
@st.cache_data
|
31 |
-
def loadata():
|
32 |
-
data = pd.read_excel("IT_Knowledge_Base_Final_FR.xlsx")
|
33 |
-
data.drop("questions", axis=1, inplace=True)
|
34 |
-
data = data.to_string()
|
35 |
-
lower_text= data.lower()
|
36 |
-
split_dataset= lower_text.splitlines()
|
37 |
-
final=''
|
38 |
-
for line in split_dataset:
|
39 |
-
line= clean_text(line)
|
40 |
-
final+='\n'+line
|
41 |
-
final_dataset= final.split('\n')
|
42 |
-
return final_dataset
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
max_vocab=100000 #je veux que chaque phrase ait maximum 10k mots
|
47 |
-
tokenizer = Tokenizer(num_words=max_vocab) #arrange la taille max de chaque phrase
|
48 |
-
tokenizer.fit_on_texts(loadata())
|
49 |
-
wor2idx= tokenizer.word_index #tranforme les mots en index
|
50 |
-
input_seq=[] #transforme la sequence de mot en matrice de chiffre
|
51 |
-
for line in loadata():
|
52 |
-
token_list= tokenizer.texts_to_sequences([line])[0]
|
53 |
-
for i in range(1, len(token_list)):
|
54 |
-
n_gram_seq= token_list[:i+1]
|
55 |
-
input_seq.append(n_gram_seq)
|
56 |
-
max_seq_length=max(len(x) for x in input_seq)
|
57 |
-
input_seq= np.array(pad_sequences(input_seq, maxlen=max_seq_length, padding='pre'))
|
58 |
-
|
59 |
-
model = pickle.load(open("
|
60 |
-
|
61 |
-
|
62 |
-
def predict_words(seed, no_words=50, tokenizer=tokenizer,max_seq_length=max_seq_length):
|
63 |
-
for i in range(no_words):
|
64 |
-
token_list = tokenizer.texts_to_sequences([seed])[0]
|
65 |
-
token_list = pad_sequences([token_list], maxlen=max_seq_length-1, padding='pre')
|
66 |
-
predicted = np.argmax(model.predict(token_list), axis=1)
|
67 |
-
|
68 |
-
new_word = ''
|
69 |
-
|
70 |
-
for word, index in tokenizer.word_index.items():
|
71 |
-
if predicted == index:
|
72 |
-
new_word = word
|
73 |
-
break
|
74 |
-
seed += " " + new_word
|
75 |
-
return seed
|
76 |
-
|
77 |
-
|
78 |
-
def chatbot(message):
|
79 |
-
response= predict_words(message)
|
80 |
-
return f"{response}"
|
81 |
-
|
82 |
-
def main():
|
83 |
-
st.title("Chatbot ")
|
84 |
-
|
85 |
-
user_input = st.text_input("vous:", "")
|
86 |
-
|
87 |
-
if st.button("envoyer"):
|
88 |
-
response = chatbot(user_input)
|
89 |
-
st.text_area("Chatbot:", value=response, height=100)
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
main()
|
94 |
# pickled_model.predict(X_test)
|
|
|
1 |
+
|
2 |
+
import streamlit as st
|
3 |
+
import pickle
|
4 |
+
import pandas as pd
|
5 |
+
import re # Regular expressions to use sub function for replacing the useless text from the data
|
6 |
+
import tensorflow as tf
|
7 |
+
import numpy as np
|
8 |
+
import pickle
|
9 |
+
from tensorflow.keras.preprocessing.text import Tokenizer
|
10 |
+
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
11 |
+
|
12 |
+
|
13 |
+
|
14 |
+
def clean_text(text):
    """Strip punctuation and layout characters from *text*.

    Removes commas, single/double quotes, parentheses, newlines, periods,
    semicolons, colons, hyphens and the mojibake character 'β' in a single
    regex pass, instead of one re.sub call per character.

    Args:
        text: Raw string to clean.

    Returns:
        The input string with all listed characters deleted.
    """
    # One character class replaces the original chain of 13 re.sub calls,
    # three of which were byte-identical duplicates removing 'β'
    # (presumably mis-encoded curly quotes/apostrophes — TODO confirm
    # against the source spreadsheet's encoding).
    return re.sub(r'[,\'"()\n;:.β-]', '', text)
|
29 |
+
|
30 |
+
@st.cache_data
def loadata():
    """Load and clean the FR IT knowledge-base corpus.

    Reads the Excel knowledge base, drops the "questions" column,
    lower-cases the stringified frame and cleans each line with
    clean_text().

    Returns:
        list[str]: one cleaned, lower-cased line per row of the
        stringified dataframe. The original version also prepended a
        spurious empty first element (an artifact of '\n'-joining then
        re-splitting); it is dropped here — an empty line produced no
        tokens downstream anyway.
    """
    data = pd.read_excel("IT_Knowledge_Base_Final_FR.xlsx")
    data.drop("questions", axis=1, inplace=True)
    # to_string() flattens the frame into one printable block, one row
    # per line — presumably intentional so each row reads as a sentence.
    lines = data.to_string().lower().splitlines()
    # Comprehension replaces the string += loop followed by split('\n').
    return [clean_text(line) for line in lines]
|
43 |
+
|
44 |
+
|
45 |
+
|
46 |
+
# --- Build the n-gram training sequences and load the trained model ---

max_vocab = 100000  # upper bound on the tokenizer vocabulary size
tokenizer = Tokenizer(num_words=max_vocab)

# Fetch the corpus once instead of calling loadata() twice as before
# (st.cache_data would absorb the second cost, but one call is clearer).
corpus = loadata()
tokenizer.fit_on_texts(corpus)
wor2idx = tokenizer.word_index  # word -> integer index mapping

# Every prefix of every line becomes one training sequence
# (n-gram language-model style: [w1,w2], [w1,w2,w3], ...).
input_seq = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        input_seq.append(token_list[:i + 1])

max_seq_length = max(len(x) for x in input_seq)
input_seq = np.array(pad_sequences(input_seq, maxlen=max_seq_length, padding='pre'))

# Context manager closes the file handle deterministically (the original
# open(...) was never closed).
# NOTE(review): pickle.load executes arbitrary code from the file — fine
# for a bundled model artifact, never for user-supplied data.
with open("modelfinalfinal1.pkl", "rb") as f:
    model = pickle.load(f)
|
60 |
+
|
61 |
+
|
62 |
+
def predict_words(seed, no_words=50, tokenizer=tokenizer, max_seq_length=max_seq_length):
    """Greedily extend *seed* with the model's most likely next words.

    Args:
        seed: Starting text to continue.
        no_words: Maximum number of words to append (default 50).
        tokenizer: Fitted Keras Tokenizer (defaults to the module-level one).
        max_seq_length: Padding length for model inputs (defaults to the
            module-level value computed from the training corpus).

    Returns:
        str: the seed with up to `no_words` predicted words appended.
    """
    for _ in range(no_words):
        token_list = tokenizer.texts_to_sequences([seed])[0]
        token_list = pad_sequences([token_list], maxlen=max_seq_length - 1, padding='pre')
        # argmax over the model's output distribution -> vocabulary index.
        # Cast to int: the original compared a numpy array to an int.
        predicted = int(np.argmax(model.predict(token_list), axis=1)[0])

        # O(1) reverse lookup via index_word instead of scanning the whole
        # word_index dict on every generated word; index 0 is the padding
        # slot and maps to no word.
        new_word = tokenizer.index_word.get(predicted, '')
        if not new_word:
            # No valid word predicted — stop instead of appending endless
            # trailing spaces as the original loop did.
            break
        seed += " " + new_word
    return seed
|
76 |
+
|
77 |
+
|
78 |
+
def chatbot(message):
    """Produce the bot's reply by continuing *message* with the model."""
    return f"{predict_words(message)}"
|
81 |
+
|
82 |
+
def main():
    """Render the Streamlit chat page: a title, one text input, one send button."""
    st.title("Chatbot ")

    message = st.text_input("vous:", "")

    # Only generate a reply when the user explicitly presses the button.
    if st.button("envoyer"):
        reply = chatbot(message)
        st.text_area("Chatbot:", value=reply, height=100)
|
90 |
+
|
91 |
+
|
92 |
+
|
93 |
+
# Run the app only when executed as a script (Streamlit runs the file
# with __name__ == "__main__"), not when imported.
if __name__ == "__main__":
    main()
|