"""Train a small text-classification chatbot model and save it to disk.

The dataset file is a JSON object whose keys are input messages and whose
values are class indices (presumably the line number of the matching reply
in the responses file -- confirm against the inference script).  The number
of output classes is the number of lines in the responses file.
"""
import json

import numpy as np
from keras.layers import Embedding, Dense, Dropout, Flatten, PReLU
from keras.models import Sequential
from keras.optimizers import Adam, SGD
from keras.preprocessing.text import Tokenizer
from keras_self_attention import SeqSelfAttention, SeqWeightedAttention

# Expected to provide dataset_file, responses_file, emb_size and inp_len.
from model_settings import *

# JSON is UTF-8 by specification; do not depend on the platform locale.
with open(dataset_file, "r", encoding="utf-8") as f:
    dset = json.load(f)

# TODO: add support to a json-only dataset
# Only the line count is needed here, so stream the file instead of loading
# it, and never let an undecodable byte abort the count.
with open(responses_file, "r", encoding="utf-8", errors="replace") as f:
    dset_size = sum(1 for _ in f)

# The tokenizer splits text into words (lower-casing and stripping
# punctuation by default) and assigns every word an integer id.
texts = list(dset)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

# Word ids start at 1; id 0 is reserved for padding, hence the +1.
vocab_size = len(tokenizer.word_index) + 1

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=emb_size, input_length=inp_len))
# Self-attention lets every token's representation depend on the other
# tokens of the message.
model.add(SeqSelfAttention())
# The layers above produce one vector per token (a 2D array per message);
# Dense layers need a single flat vector.
model.add(Flatten())
# ReLU is cheap to compute: max(0, x), no divisions, roots or powers.
model.add(Dense(1024, activation="relu"))
# Dropout makes the task harder by removing information: during training
# 50% of the activations are zeroed at random, so the model cannot rely on
# features that are only present by accident.
model.add(Dropout(0.5))
model.add(Dense(512, activation="relu"))
model.add(Dense(512, activation="relu"))
model.add(Dense(256, activation="relu"))
# Softmax turns the outputs into a probability distribution over the
# responses (all values in [0, 1], summing to 1); the largest one wins.
model.add(Dense(dset_size, activation="softmax"))

# Input X: one row of token ids per message, truncated or zero-padded on the
# right to exactly inp_len entries.
# Output y: one-hot rows such as 0 0 0 1 0 0 0 0 0, with the 1 at the index
# of the correct response.
X = np.zeros((len(texts), inp_len), dtype=int)
y = np.zeros((len(texts), dset_size))
sequences = tokenizer.texts_to_sequences(texts)
for row, (text, token_ids) in enumerate(zip(texts, sequences)):
    clipped = token_ids[:inp_len]
    X[row, :len(clipped)] = clipped
    y[row, dset[text]] = 1

# Categorical cross-entropy compares the predicted distribution with the
# one-hot target: it is >= 0, has no upper bound, and lower is better.
# Accuracy is the fraction of messages classified correctly, from 0 to 1.
model.compile(optimizer=Adam(), loss="categorical_crossentropy", metrics=["accuracy",])

# epochs: how many passes over the whole dataset are made.
# batch_size: how many messages are processed per gradient update.
# NOTE: `workers=` / `use_multiprocessing=` only apply to generator or
# Sequence inputs in Keras 2 and are not accepted by Keras 3's fit(); they
# do not speed up training on in-memory arrays like these.
model.fit(X, y, epochs=10, batch_size=8)

# Print the architecture; useful for checking the parameter count.
model.summary()

model.save("chatbot.keras")