ElieMark committed on
Commit
227ccf3
•
1 Parent(s): 62bbe5d

Update chat.py

Files changed (1)
  1. chat.py +93 -93
chat.py CHANGED
@@ -1,94 +1,94 @@
  import streamlit as st
  import pickle
  import pandas as pd
  import re  # regular expressions: re.sub is used to strip unwanted characters from the data
  import tensorflow as tf
  import numpy as np
  import pickle
  from tensorflow.keras.preprocessing.text import Tokenizer
  from tensorflow.keras.preprocessing.sequence import pad_sequences


  def clean_text(text):
      # remove punctuation and special characters that add no value to the model
      text = re.sub(r',', '', text)
      text = re.sub(r'\'', '', text)
      text = re.sub(r'\"', '', text)
      text = re.sub(r'\(', '', text)
      text = re.sub(r'\)', '', text)
      text = re.sub(r'\n', '', text)
      text = re.sub(r'“', '', text)
      text = re.sub(r'”', '', text)
      text = re.sub(r'’', '', text)
      text = re.sub(r'\.', '', text)
      text = re.sub(r';', '', text)
      text = re.sub(r':', '', text)
      text = re.sub(r'\-', '', text)
      return text


  @st.cache_data
  def loadata():
      # load the knowledge base, lower-case it, clean each line and return the list of lines
      data = pd.read_excel("IT_Knowledge_Base_Final_FR.xlsx")
      data.drop("questions", axis=1, inplace=True)
      data = data.to_string()
      lower_text = data.lower()
      split_dataset = lower_text.splitlines()
      final = ''
      for line in split_dataset:
          line = clean_text(line)
          final += '\n' + line
      final_dataset = final.split('\n')
      return final_dataset


  max_vocab = 100000  # cap the tokenizer vocabulary at 100,000 words
  tokenizer = Tokenizer(num_words=max_vocab)  # tokenizer restricted to the max_vocab most frequent words
  tokenizer.fit_on_texts(loadata())
  wor2idx = tokenizer.word_index  # maps each word to its integer index
  input_seq = []  # n-gram sequences of word indices, padded below into a numeric matrix
  for line in loadata():
      token_list = tokenizer.texts_to_sequences([line])[0]
      for i in range(1, len(token_list)):
          n_gram_seq = token_list[:i + 1]
          input_seq.append(n_gram_seq)
  max_seq_length = max(len(x) for x in input_seq)
  input_seq = np.array(pad_sequences(input_seq, maxlen=max_seq_length, padding='pre'))

- model = pickle.load(open("model2.pkl", "rb"))
+ model = pickle.load(open("modelfinalfinal1.pkl", "rb"))


  def predict_words(seed, no_words=50, tokenizer=tokenizer, max_seq_length=max_seq_length):
      # repeatedly predict the next word and append it to the seed text
      for i in range(no_words):
          token_list = tokenizer.texts_to_sequences([seed])[0]
          token_list = pad_sequences([token_list], maxlen=max_seq_length - 1, padding='pre')
          predicted = np.argmax(model.predict(token_list), axis=1)

          new_word = ''

          for word, index in tokenizer.word_index.items():
              if predicted == index:
                  new_word = word
                  break
          seed += " " + new_word
      return seed


  def chatbot(message):
      response = predict_words(message)
      return f"{response}"


  def main():
      st.title("Chatbot")

      user_input = st.text_input("vous:", "")

      if st.button("envoyer"):
          response = chatbot(user_input)
          st.text_area("Chatbot:", value=response, height=100)


  main()
  # pickled_model.predict(X_test)