# Commit 76b74a3 — "Improving generalization on small datasets" (ierhon)
import gradio as gr
from todset import todset
import numpy as np
from keras.models import Sequential
from keras.layers import Embedding, Dense, Dropout, Flatten, PReLU
from keras.preprocessing.text import Tokenizer
from keras_self_attention import SeqSelfAttention, SeqWeightedAttention
emb_size = 128  # dimensionality of the token embedding vectors
inp_len = 16  # fixed model input length in tokens (sequences are zero-padded/truncated)
maxshift = 4  # number of right-shifted copies per sample used for data augmentation
def train(data: str, message: str) -> str:
    """Train a small text-classification model on *data* and answer *message*.

    Parameters
    ----------
    data : str
        Newline-separated "question→answer" pairs (the '→' arrow is the
        separator expected by `todset`).
    message : str
        The user query to classify once training finishes.

    Returns
    -------
    str
        The response whose class received the highest softmax probability,
        or a usage hint when the dataset is not in the expected format.

    Note: the model is retrained from scratch on every call.
    """
    # Cheap format check before doing any expensive work.
    if "→" not in data or "\n" not in data:
        return "Dataset should be like:\nquestion→answer\nquestion→answer\netc."
    dset, responses = todset(data)
    resps_len = len(responses)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(list(dset.keys()))
    vocab_size = len(tokenizer.word_index) + 1  # +1: Tokenizer indices start at 1

    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=emb_size, input_length=inp_len))
    model.add(SeqSelfAttention())
    model.add(Flatten())
    model.add(Dense(1024, activation="relu"))
    model.add(Dropout(0.5))  # regularization — helps generalization on small datasets
    model.add(Dense(512, activation="relu"))
    model.add(Dense(512, activation="relu"))
    model.add(Dense(256, activation="relu"))
    model.add(Dense(resps_len, activation="softmax"))

    # Build the training set with shift augmentation: each question is
    # duplicated maxshift times, prefixed with 0..maxshift-1 padding zeros,
    # so the model tolerates small positional shifts of the input.
    X = []
    y = []
    for key in dset:
        # Hoisted out of the shift loop: tokenization does not depend on p.
        tokens = tokenizer.texts_to_sequences([key])[0]
        for p in range(maxshift):
            X.append(np.array(([0] * p + list(tokens) + [0] * inp_len)[:inp_len]))
            output_array = np.zeros(resps_len)
            output_array[dset[key]] = 1  # one-hot target for this question's response
            y.append(output_array)
    X = np.array(X)
    y = np.array(y)

    model.compile(loss="categorical_crossentropy", metrics=["accuracy"])
    model.fit(X, y, epochs=10, batch_size=8, workers=4, use_multiprocessing=True)

    tokens = tokenizer.texts_to_sequences([message])[0]
    prediction = model.predict(np.array([(list(tokens) + [0] * inp_len)[:inp_len]]))[0]
    # BUG FIX: the original returned responses[ind], where ind was merely the
    # *last* index visited by a manual argmax loop — i.e. always the final
    # class — instead of the index of the maximum probability (max_o).
    return responses[int(np.argmax(prediction))]
# Gradio UI: two text boxes (dataset text, user message) -> one text output;
# each submission retrains the model via train() and shows its answer.
iface = gr.Interface(fn=train, inputs=["text", "text"], outputs="text")
iface.launch()