from __future__ import absolute_import, print_function, division, unicode_literals

import test_helper

from nose.plugins.attrib import attr

import json
import numpy as np

from torchmoji.class_avg_finetuning import relabel
from torchmoji.sentence_tokenizer import SentenceTokenizer
from torchmoji.finetuning import (
    calculate_batchsize_maxlen,
    freeze_layers,
    change_trainable,
    finetune,
    load_benchmark
)
from torchmoji.model_def import (
    torchmoji_transfer,
    torchmoji_feature_encoding,
    torchmoji_emojis
)
from torchmoji.global_variables import (
    PRETRAINED_PATH,
    NB_TOKENS,
    VOCAB_PATH,
    ROOT_PATH
)


def test_calculate_batchsize_maxlen():
    """ Batch size and max length are calculated properly.
    """
    texts = ['a b c d',
             'e f g h i']
    batch_size, maxlen = calculate_batchsize_maxlen(texts)
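    # These expectations encode torchmoji's heuristic (as implemented in
    # torchmoji.finetuning): maxlen is the 80th percentile of the token
    # counts rounded up to the nearest 10 (here 4.8 -> 10), and batch_size
    # falls back to 250 whenever maxlen <= 100.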
    assert batch_size == 250
    assert maxlen == 10, maxlen


def test_freeze_layers():
    """ Correct layers are frozen.
    """
    model = torchmoji_transfer(5)
    keyword = 'output_layer'

    model = freeze_layers(model, unfrozen_keyword=keyword)
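    # Only modules whose name contains the keyword should remain trainable;
    # note that all() is vacuously True for modules without parameters.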
    for name, module in model.named_children():
        trainable = keyword.lower() in name.lower()
        assert all(p.requires_grad == trainable for p in module.parameters())


def test_change_trainable():
    """ change_trainable() changes trainability of layers.
    """
    model = torchmoji_transfer(5)
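    # Toggling the embedding layer off and back on should flip
    # requires_grad on every one of its parameters, in both directions.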
    change_trainable(model.embed, False)
    assert not any(p.requires_grad for p in model.embed.parameters())
    change_trainable(model.embed, True)
    assert all(p.requires_grad for p in model.embed.parameters())


def test_torchmoji_transfer_extend_embedding():
    """ Defining torchmoji with extension.
    """
    extend_with = 50
    model = torchmoji_transfer(5, weight_path=PRETRAINED_PATH,
                               extend_embedding=extend_with)
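    # extend_embedding appends extra rows to the pretrained embedding
    # matrix so that dataset-specific tokens can be added on top of the
    # fixed vocabulary.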
    embedding_layer = model.embed
    assert embedding_layer.weight.size()[0] == NB_TOKENS + extend_with


def test_torchmoji_return_attention():
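    """ Attention weights are returned only when return_attention is set.
    """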
    seq_tensor = np.array([[1]])

    model = torchmoji_emojis(weight_path=PRETRAINED_PATH)
    assert len(model(seq_tensor)) == 1
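    # With return_attention=True the output gains a second element
    # holding the attention weights, hence length 2.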
    model = torchmoji_emojis(weight_path=PRETRAINED_PATH, return_attention=True)
    assert len(model(seq_tensor)) == 2


def test_relabel():
    """ relabel() works with multi-class labels.
    """
    nb_classes = 3
    inputs = np.array([
        [True, False, False],
        [False, True, False],
        [True, False, True],
    ])
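    # relabel() collapses a multi-label matrix to the binary membership
    # vector of a single class, i.e. it picks out one column.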
    expected_0 = np.array([True, False, True])
    expected_1 = np.array([False, True, False])
    expected_2 = np.array([False, False, True])

    assert np.array_equal(relabel(inputs, 0, nb_classes), expected_0)
    assert np.array_equal(relabel(inputs, 1, nb_classes), expected_1)
    assert np.array_equal(relabel(inputs, 2, nb_classes), expected_2)


def test_relabel_binary():
    """ relabel() works with binary classification (no changes to labels).
    """
    nb_classes = 2
    inputs = np.array([True, False, False])
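    # With only two classes the label vector is already binary,
    # so relabel() should return it unchanged.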
    assert np.array_equal(relabel(inputs, 0, nb_classes), inputs)


@attr('slow')
def test_finetune_full():
    """ Finetuning using 'full'.
    """
    dataset_path = ROOT_PATH + '/data/SS-Youtube/raw.pickle'
    nb_classes = 2
    min_acc = 0.68

    with open(VOCAB_PATH, 'r') as f:
        vocab = json.load(f)

    data = load_benchmark(dataset_path, vocab, extend_with=10000)
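    # load_benchmark() tokenizes the dataset and, with extend_with set,
    # grows the vocabulary by up to that many dataset-specific tokens;
    # data['added'] reports how many were actually added.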
    print('Loading PyTorch model from {}.'.format(PRETRAINED_PATH))
    model = torchmoji_transfer(nb_classes, PRETRAINED_PATH, extend_embedding=data['added'])
    print(model)
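    # method='full' trains all layers of the model (one of the DeepMoji
    # finetuning schemes), not just the new output layer.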
    model, acc = finetune(model, data['texts'], data['labels'], nb_classes,
                          data['batch_size'], method='full', nb_epochs=1)

    print("Finetune full SS-Youtube 1 epoch acc: {}".format(acc))
    assert acc >= min_acc


@attr('slow')
def test_finetune_last():
    """ Finetuning using 'last'.
    """
    dataset_path = ROOT_PATH + '/data/SS-Youtube/raw.pickle'
    nb_classes = 2
    min_acc = 0.68

    with open(VOCAB_PATH, 'r') as f:
        vocab = json.load(f)

    data = load_benchmark(dataset_path, vocab)
    print('Loading model from {}.'.format(PRETRAINED_PATH))
    model = torchmoji_transfer(nb_classes, PRETRAINED_PATH)
    print(model)
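    # method='last' trains only the final (output) layer and keeps the
    # pretrained layers frozen, so one epoch should already suffice.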
    model, acc = finetune(model, data['texts'], data['labels'], nb_classes,
                          data['batch_size'], method='last', nb_epochs=1)

    print("Finetune last SS-Youtube 1 epoch acc: {}".format(acc))
    assert acc >= min_acc


def test_score_emoji():
    """ Emoji predictions make sense.
    """
    test_sentences = [
        'I love mom\'s cooking',
        'I love how you never reply back..',
        'I love cruising with my homies',
        'I love messing with yo mind!!',
        'I love you and now you\'re just gone..',
        'This is shit',
        'This is the shit'
    ]
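    # Expected top-5 indices (into the pretrained model's 64-emoji output)
    # for each sentence, most probable first.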
    expected = [
        np.array([36, 4, 8, 16, 47]),
        np.array([1, 19, 55, 25, 46]),
        np.array([31, 6, 30, 15, 13]),
        np.array([54, 44, 9, 50, 49]),
        np.array([46, 5, 27, 35, 34]),
        np.array([55, 32, 27, 1, 37]),
        np.array([48, 11, 6, 31, 9])
    ]
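    # Helper: indices of the k largest values, ordered highest first
    # (argpartition selects the top k, argsort then orders them).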
    def top_elements(array, k):
        ind = np.argpartition(array, -k)[-k:]
        return ind[np.argsort(array[ind])][::-1]

    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)

    st = SentenceTokenizer(vocabulary, 30)
    tokens, _, _ = st.tokenize_sentences(test_sentences)

    model = torchmoji_emojis(weight_path=PRETRAINED_PATH)
    prob = model(tokens)

    for i, t_prob in enumerate(list(prob)):
        assert np.array_equal(top_elements(t_prob, 5), expected[i])


def test_encode_texts():
    """ Text encoding is stable.
    """
    TEST_SENTENCES = ['I love mom\'s cooking',
                      'I love how you never reply back..',
                      'I love cruising with my homies',
                      'I love messing with yo mind!!',
                      'I love you and now you\'re just gone..',
                      'This is shit',
                      'This is the shit']

    maxlen = 30

    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)

    st = SentenceTokenizer(vocabulary, maxlen)

    print('Loading model from {}.'.format(PRETRAINED_PATH))
    model = torchmoji_feature_encoding(PRETRAINED_PATH)
    print(model)
    tokenized, _, _ = st.tokenize_sentences(TEST_SENTENCES)
    encoding = model(tokenized)
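    # The encoding is deterministic for fixed weights: compare the first
    # five dimensions of the sentence-averaged feature vector against
    # reference values, rounded to 3 decimals.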
    avg_across_sentences = np.around(np.mean(encoding, axis=0)[:5], 3)
    assert np.allclose(avg_across_sentences, np.array([-0.023, 0.021, -0.037, -0.001, -0.005]))


if __name__ == '__main__':
    test_encode_texts()