import numpy as np | |
import json | |
from keras.optimizers import Adam, SGD | |
from keras.models import Sequential | |
from keras.layers import Embedding, Dense, Dropout, Flatten, PReLU | |
from keras.preprocessing.text import Tokenizer | |
from keras_self_attention import SeqSelfAttention, SeqWeightedAttention | |
from model_settings import * | |
# Load the training set: a JSON object whose keys are the input phrases and
# whose values are used further down as indices into a vector of length
# ``dset_size`` (one slot per line of ``responses_file``).
# JSON is UTF-8 by specification, so the encoding is pinned rather than left
# to the platform default, which would mis-decode or reject non-ASCII phrases
# on systems whose locale encoding is not UTF-8.
with open(dataset_file, "r", encoding="utf-8") as f:
    dset = json.load(f)

# The number of lines in the responses file is the number of output classes.
# Only the count matters here, so lines are streamed instead of being loaded
# into a list, and undecodable bytes are tolerated instead of aborting.
# TODO: add support to a json-only dataset
with open(responses_file, "r", encoding="utf-8", errors="replace") as f:
    dset_size = sum(1 for _ in f)

# The tokenizer splits text into words (with its default normalisation such
# as lowercasing) and assigns every distinct word an integer id.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(list(dset.keys()))

# Word ids start at 1; id 0 is never assigned to a word and is used as the
# padding value when the inputs are built, hence the extra slot.
vocab_size = len(tokenizer.word_index) + 1
# Classifier architecture: word ids -> embeddings -> self-attention ->
# flattened features -> fully connected stack -> one score per response.
model = Sequential([
    # Maps each of the inp_len word ids to a trainable emb_size vector.
    Embedding(input_dim=vocab_size, output_dim=emb_size, input_length=inp_len),
    # Self-attention lets every position weigh the other words of the phrase,
    # so the model can focus on the informative parts of the text.
    SeqSelfAttention(),
    # The previous layers emit one vector per word (a 2D array per sample);
    # the dense layers below need a single flat vector.
    Flatten(),
    # ReLU is cheap to evaluate (effectively max(0, x)) and trains quickly;
    # 1024 is simply a convenient power of two.
    Dense(1024, activation="relu"),
    # Randomly zeroes 50% of the activations during training so the network
    # cannot lean on incidental features of the data.
    Dropout(0.5),
    Dense(512, activation="relu"),
    Dense(512, activation="relu"),
    Dense(256, activation="relu"),
    # Softmax turns the final scores into a probability distribution over
    # the dset_size possible responses.
    Dense(dset_size, activation="softmax"),
])
# Build the training arrays: every phrase becomes a fixed-length row of word
# ids (input X) and its response index becomes a one-hot row (target y).
encoded_rows = []
target_rows = []
for phrase, response_idx in dset.items():
    word_ids = tokenizer.texts_to_sequences([phrase])[0]

    # Force the row to exactly inp_len ids: long phrases lose their tail,
    # short ones are right-padded with 0.
    row = list(word_ids)[:inp_len]
    row.extend([0] * (inp_len - len(row)))
    encoded_rows.append(np.array(row))

    # One-hot target, e.g. 0 0 0 1 0 0: only the slot of the expected
    # response is set.
    one_hot = np.zeros(dset_size)
    one_hot[response_idx] = 1
    target_rows.append(one_hot)

# Keras trains on numpy arrays, so the row lists are stacked into 2D arrays.
X = np.array(encoded_rows)
y = np.array(target_rows)
# Training setup: Adam optimizer, categorical cross-entropy as the loss (how
# far the predicted distribution is from the one-hot target; lower is
# better) and accuracy as the reported metric (fraction of phrases whose
# top prediction is the expected response; between 0 and 1, higher is better).
model.compile(
    optimizer=Adam(),
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

# epochs is the number of full passes over the data; batch_size is how many
# phrases are processed together in one optimisation step.
# Original author's hint: add workers=4, use_multiprocessing=True to fit()
# if you don't have a GPU (confirm the installed Keras version accepts them).
model.fit(
    X,
    y,
    epochs=10,
    batch_size=8,
)

# Print the layer overview, which is handy for checking the parameter count.
model.summary()

# Persist the trained model to disk.
model.save("chatbot.keras")