| from __future__ import absolute_import, print_function, division, unicode_literals |
| import test_helper |
| import json |
|
|
| from torchmoji.sentence_tokenizer import SentenceTokenizer |
|
|
# Toy dataset: ten single-character sentences with matching label dicts.
sentences = list('ABCDEFGHIJ')

dicts = [{'label': i} for i in range(10)]

# Explicit index assignments used by the explicit-split test below.
train_ind = [0, 5, 3, 6, 8]
val_ind = [9, 2, 1]
test_ind = [4, 7]

# Shared vocabulary for building tokenizers in the tests.
# NOTE(review): path is relative to the test working directory — assumes
# tests are run from the tests/ folder.
with open('../model/vocabulary.json', 'r') as f:
    vocab = json.load(f)
|
def test_dataset_split_parameter():
    """Dataset is split in the desired ratios.

    The expected partition sizes are computed with round() so the asserts
    do not compare an int length against a raw float product (e.g.
    ``10 * 0.7`` relies on floating-point representation happening to
    equal 7.0 exactly, which is fragile for other ratios/sizes).
    """
    split_parameter = [0.7, 0.1, 0.2]
    st = SentenceTokenizer(vocab, 30)

    result, result_dicts, _ = st.split_train_val_test(
        sentences, dicts, split_parameter, extend_with=0)
    train, val, test = result
    train_dicts, val_dicts, test_dicts = result_dicts

    # Expected counts: each ratio applied to the full dataset size.
    expected = [round(len(sentences) * p) for p in split_parameter]

    assert [len(train), len(val), len(test)] == expected
    assert [len(train_dicts), len(val_dicts), len(test_dicts)] == expected
|
|
def test_dataset_split_explicit():
    """Dataset is split according to explicitly given index lists.

    Fix: removed the dead no-op statement ``tokenized = tokenized`` and the
    unused ``sentence`` loop variable from the original.
    """
    split_parameter = [train_ind, val_ind, test_ind]
    st = SentenceTokenizer(vocab, 30)
    tokenized, _, _ = st.tokenize_sentences(sentences)

    result, result_dicts, added = st.split_train_val_test(
        sentences, dicts, split_parameter, extend_with=0)
    train, val, test = result
    train_dicts, val_dicts, test_dicts = result_dicts

    # Every sentence must land in exactly the partition its index was
    # assigned to, with its label dict alongside it.
    for i, _ in enumerate(sentences):
        if i in train_ind:
            assert tokenized[i] in train
            assert dicts[i] in train_dicts
        elif i in val_ind:
            assert tokenized[i] in val
            assert dicts[i] in val_dicts
        elif i in test_ind:
            assert tokenized[i] in test
            assert dicts[i] in test_dicts

    # Partition sizes must match the number of indices requested for each.
    assert len(train) == len(train_ind)
    assert len(val) == len(val_ind)
    assert len(test) == len(test_ind)
    assert len(train_dicts) == len(train_ind)
    assert len(val_dicts) == len(val_ind)
    assert len(test_dicts) == len(test_ind)
|
|
def test_id_to_sentence():
    """Round-tripping a sentence through tokenize/to_sentence is lossless
    when every word is in the vocabulary.
    """
    vb = {'CUSTOM_MASK': 0, 'aasdf': 1000, 'basdf': 2000}

    original = 'aasdf basdf basdf basdf'
    tokenizer = SentenceTokenizer(vb, 30)
    encoded, _, _ = tokenizer.tokenize_sentences([original])
    assert tokenizer.to_sentence(encoded[0]) == original
|
|
def test_id_to_sentence_with_unknown():
    """Round-tripping preserves known words; out-of-vocabulary words come
    back as the CUSTOM_UNKNOWN token.
    """
    vb = {'CUSTOM_MASK': 0,
          'CUSTOM_UNKNOWN': 1,
          'aasdf': 1000,
          'basdf': 2000}

    original = 'aasdf basdf ccc'
    expected = 'aasdf basdf CUSTOM_UNKNOWN'
    tokenizer = SentenceTokenizer(vb, 30)
    encoded, _, _ = tokenizer.tokenize_sentences([original])
    assert tokenizer.to_sentence(encoded[0]) == expected
|
|