import re  # regular expressions, used to strip unwanted characters from the data
import pickle

import numpy as np
import pandas as pd
import streamlit as st
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


def clean_text(text):
    # Remove punctuation, quotes, parentheses, and newlines in a single pass
    return re.sub(r"[,'\"()\n“”’.;:\-]", '', text)


@st.cache_data
def loadata():
    data = pd.read_excel("IT_Knowledge_Base_Final_FR.xlsx")
    data.drop("questions", axis=1, inplace=True)
    data = data.to_string()
    lower_text = data.lower()
    split_dataset = lower_text.splitlines()
    final = ''
    for line in split_dataset:
        line = clean_text(line)
        final += '\n' + line
    final_dataset = final.split('\n')
    return final_dataset


max_vocab = 100000  # keep only the 100k most frequent words
tokenizer = Tokenizer(num_words=max_vocab)  # caps the vocabulary size
dataset = loadata()
tokenizer.fit_on_texts(dataset)
wor2idx = tokenizer.word_index  # maps each word to an integer index

# Turn every line into its n-gram prefixes, expressed as integer sequences
input_seq = []
for line in dataset:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_seq = token_list[:i + 1]
        input_seq.append(n_gram_seq)

max_seq_length = max(len(x) for x in input_seq)
input_seq = np.array(pad_sequences(input_seq, maxlen=max_seq_length, padding='pre'))

with open('modelfinalfinal1.pkl', 'rb') as f:
    model = pickle.load(f)


def predict_words(seed, no_words=50, tokenizer=tokenizer, max_seq_length=max_seq_length):
    # Greedily append the most probable next word, no_words times
    for _ in range(no_words):
        token_list = tokenizer.texts_to_sequences([seed])[0]
        token_list = pad_sequences([token_list], maxlen=max_seq_length - 1, padding='pre')
        predicted = int(np.argmax(model.predict(token_list), axis=-1)[0])
        new_word = tokenizer.index_word.get(predicted, '')
        seed += " " + new_word
    return seed


def chatbot(message):
    return predict_words(message)


def main():
    st.title("Chatbot")
    user_input = st.text_input("You:", "")
    if st.button("Send"):
        response = chatbot(user_input)
        st.text_area("Chatbot:", value=response, height=100)


main()
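
# Usage note (a sketch; the filename app.py is an assumption, not from the source):
#   streamlit run app.py
# The pickled model ('modelfinalfinal1.pkl') and the Excel knowledge base
# ('IT_Knowledge_Base_Final_FR.xlsx') must be in the working directory,
# since both are opened by relative path above.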