| from __future__ import absolute_import, print_function, division, unicode_literals |
|
|
| import test_helper |
|
|
| from nose.plugins.attrib import attr |
| import json |
| import numpy as np |
|
|
| from torchmoji.class_avg_finetuning import relabel |
| from torchmoji.sentence_tokenizer import SentenceTokenizer |
|
|
| from torchmoji.finetuning import ( |
| calculate_batchsize_maxlen, |
| freeze_layers, |
| change_trainable, |
| finetune, |
| load_benchmark |
| ) |
| from torchmoji.model_def import ( |
| torchmoji_transfer, |
| torchmoji_feature_encoding, |
| torchmoji_emojis |
| ) |
| from torchmoji.global_variables import ( |
| PRETRAINED_PATH, |
| NB_TOKENS, |
| VOCAB_PATH, |
| ROOT_PATH |
| ) |
|
|
|
|
def test_calculate_batchsize_maxlen():
    """ calculate_batchsize_maxlen() yields the expected pair for short texts.

    Two short, whitespace-separated sentences are passed in; the test expects
    a batch size of 250 and a maximum length of 10 to come back.
    """
    sentences = ['a b c d', 'e f g h i']

    batch_size, maxlen = calculate_batchsize_maxlen(sentences)

    assert batch_size == 250
    # Report the computed length on failure to ease debugging.
    assert maxlen == 10, maxlen
|
|
|
|
def test_freeze_layers():
    """ freeze_layers() leaves only the keyword-matching layers trainable.

    For every direct child module of the model, all parameters must require
    gradients exactly when the child's name contains the unfrozen keyword
    (compared case-insensitively).
    """
    unfrozen = 'output_layer'
    net = freeze_layers(torchmoji_transfer(5), unfrozen_keyword=unfrozen)

    needle = unfrozen.lower()
    for child_name, child in net.named_children():
        expect_trainable = needle in child_name.lower()
        matches = [param.requires_grad == expect_trainable
                   for param in child.parameters()]
        assert all(matches)
|
|
|
|
def test_change_trainable():
    """ change_trainable() switches requires_grad on a layer's parameters.

    The embedding layer is first frozen (no parameter may require gradients)
    and then unfrozen again (every parameter must require gradients).
    """
    net = torchmoji_transfer(5)

    # Freeze: nothing in the embedding may still ask for gradients.
    change_trainable(net.embed, False)
    frozen_flags = [param.requires_grad for param in net.embed.parameters()]
    assert not any(frozen_flags)

    # Unfreeze: every embedding parameter must ask for gradients again.
    change_trainable(net.embed, True)
    thawed_flags = [param.requires_grad for param in net.embed.parameters()]
    assert all(thawed_flags)
|
|
|
|
def test_torchmoji_transfer_extend_embedding():
    """ torchmoji_transfer() enlarges the embedding when asked to extend it.

    With extend_embedding=50 the first dimension of the embedding weight is
    expected to be NB_TOKENS + 50.
    """
    extra_tokens = 50
    net = torchmoji_transfer(5,
                             weight_path=PRETRAINED_PATH,
                             extend_embedding=extra_tokens)

    embedding_rows = net.embed.weight.size()[0]
    assert embedding_rows == NB_TOKENS + extra_tokens
|
|
|
|
def test_torchmoji_return_attention():
    """ return_attention changes the length of the torchmoji_emojis output.

    For a single one-token input, len() of the model output is expected to be
    1 by default and 2 when the model is built with return_attention=True.
    """
    single_token = np.array([[1]])

    # Default model.
    net = torchmoji_emojis(weight_path=PRETRAINED_PATH)
    default_output = net(single_token)
    assert len(default_output) == 1

    # Same weights, but built with return_attention=True.
    net = torchmoji_emojis(weight_path=PRETRAINED_PATH, return_attention=True)
    attention_output = net(single_token)
    assert len(attention_output) == 2
|
|
|
|
def test_relabel():
    """ relabel() works with multi-class labels.

    For a 3x3 boolean label matrix, the result expected for class index c
    equals column c of that matrix.
    """
    nb_classes = 3
    labels = np.array([
        [True, False, False],
        [False, True, False],
        [True, False, True],
    ])
    # Position in this list is the class index handed to relabel().
    expected_per_class = [
        np.array([True, False, True]),
        np.array([False, True, False]),
        np.array([False, False, True]),
    ]

    for class_idx, expected in enumerate(expected_per_class):
        relabeled = relabel(labels, class_idx, nb_classes)
        assert np.array_equal(relabeled, expected)
|
|
|
|
def test_relabel_binary():
    """ relabel() leaves binary-classification labels unchanged.
    """
    nb_classes = 2
    labels = np.array([True, False, False])

    relabeled = relabel(labels, 0, nb_classes)

    assert np.array_equal(relabeled, labels)
|
|
|
|
@attr('slow')
def test_finetune_full():
    """ Finetuning with method='full' reaches the minimum accuracy.

    Loads the SS-Youtube benchmark (passing extend_with=10000 to
    load_benchmark), builds a transfer model whose embedding is extended by
    data['added'], finetunes for one epoch and requires an accuracy of at
    least 0.68.
    """
    nb_classes = 2
    min_acc = 0.68
    dataset_path = ROOT_PATH + '/data/SS-Youtube/raw.pickle'

    with open(VOCAB_PATH, 'r') as vocab_file:
        vocab = json.load(vocab_file)

    data = load_benchmark(dataset_path, vocab, extend_with=10000)

    print('Loading pyTorch model from {}.'.format(PRETRAINED_PATH))
    model = torchmoji_transfer(nb_classes,
                               PRETRAINED_PATH,
                               extend_embedding=data['added'])
    print(model)

    model, acc = finetune(model,
                          data['texts'],
                          data['labels'],
                          nb_classes,
                          data['batch_size'],
                          method='full',
                          nb_epochs=1)

    print("Finetune full SS-Youtube 1 epoch acc: {}".format(acc))
    assert acc >= min_acc
|
|
|
|
@attr('slow')
def test_finetune_last():
    """ Finetuning with method='last' reaches the minimum accuracy.

    Loads the SS-Youtube benchmark with the stock vocabulary, builds a
    transfer model from the pretrained weights, finetunes for one epoch and
    requires an accuracy of at least 0.68.
    """
    nb_classes = 2
    min_acc = 0.68
    benchmark_file = ROOT_PATH + '/data/SS-Youtube/raw.pickle'

    with open(VOCAB_PATH, 'r') as vocab_file:
        vocab = json.load(vocab_file)

    data = load_benchmark(benchmark_file, vocab)

    print('Loading model from {}.'.format(PRETRAINED_PATH))
    model = torchmoji_transfer(nb_classes, PRETRAINED_PATH)
    print(model)

    model, acc = finetune(model,
                          data['texts'],
                          data['labels'],
                          nb_classes,
                          data['batch_size'],
                          method='last',
                          nb_epochs=1)

    print("Finetune last SS-Youtube 1 epoch acc: {}".format(acc))
    assert acc >= min_acc
|
|
|
|
def test_score_emoji():
    """ Emoji predictions make sense.

    Tokenizes a fixed set of sentences, runs them through the pretrained
    emoji model and checks that the indices of the five highest-scoring
    outputs per sentence (highest first) match the stored expectations.
    """
    sentences = [
        'I love mom\'s cooking',
        'I love how you never reply back..',
        'I love cruising with my homies',
        'I love messing with yo mind!!',
        'I love you and now you\'re just gone..',
        'This is shit',
        'This is the shit'
    ]

    # One row of expected top-5 indices per sentence, in the same order.
    expected = [
        np.array([36, 4, 8, 16, 47]),
        np.array([1, 19, 55, 25, 46]),
        np.array([31, 6, 30, 15, 13]),
        np.array([54, 44, 9, 50, 49]),
        np.array([46, 5, 27, 35, 34]),
        np.array([55, 32, 27, 1, 37]),
        np.array([48, 11, 6, 31, 9])
    ]

    def top_elements(array, k):
        """ Indices of the k largest entries of array, largest first. """
        # argpartition puts the k largest entries (unordered) in the tail.
        top_k = np.argpartition(array, -k)[-k:]
        # Sort those k indices by value ascending, then flip to descending.
        ascending = top_k[np.argsort(array[top_k])]
        return ascending[::-1]

    with open(VOCAB_PATH, 'r') as vocab_file:
        vocabulary = json.load(vocab_file)

    tokenizer = SentenceTokenizer(vocabulary, 30)
    tokens, _, _ = tokenizer.tokenize_sentences(sentences)

    model = torchmoji_emojis(weight_path=PRETRAINED_PATH)
    prob = model(tokens)

    for row, sentence_prob in enumerate(prob):
        predicted = top_elements(sentence_prob, 5)
        assert np.array_equal(predicted, expected[row])
|
|
|
|
def test_encode_texts():
    """ Text encoding is stable.

    Tokenizes a fixed set of sentences, encodes them with the pretrained
    feature-encoding model and compares the first five encoding dimensions,
    averaged across sentences and rounded to three decimals, against stored
    reference values.
    """
    sentences = ['I love mom\'s cooking',
                 'I love how you never reply back..',
                 'I love cruising with my homies',
                 'I love messing with yo mind!!',
                 'I love you and now you\'re just gone..',
                 'This is shit',
                 'This is the shit']

    maxlen = 30

    with open(VOCAB_PATH, 'r') as f:
        vocabulary = json.load(f)

    st = SentenceTokenizer(vocabulary, maxlen)

    print('Loading model from {}.'.format(PRETRAINED_PATH))
    model = torchmoji_feature_encoding(PRETRAINED_PATH)
    print(model)
    tokenized, _, _ = st.tokenize_sentences(sentences)
    encoding = model(tokenized)

    # Average over the sentence axis, keep the first five dimensions.
    avg_across_sentences = np.around(np.mean(encoding, axis=0)[:5], 3)
    expected = np.array([-0.023, 0.021, -0.037, -0.001, -0.005])
    # Show the computed values on failure so a drift is easy to diagnose.
    assert np.allclose(avg_across_sentences, expected), avg_across_sentences
|
|
# NOTE(review): this top-level call executes test_encode_texts() every time
# the module is imported (including during test collection). Presumably a
# leftover debugging call -- confirm whether it should be removed or moved
# behind an `if __name__ == '__main__':` guard.
test_encode_texts()