from __future__ import absolute_import, print_function, division, unicode_literals

import json

import test_helper  # adjusts sys.path so the torchmoji package can be imported

from torchmoji.sentence_tokenizer import SentenceTokenizer

# Small fixture: ten one-character "sentences" with matching label dicts,
# plus explicit index lists for the train/val/test split test.
sentences = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']

dicts = [
    {'label': 0},
    {'label': 1},
    {'label': 2},
    {'label': 3},
    {'label': 4},
    {'label': 5},
    {'label': 6},
    {'label': 7},
    {'label': 8},
    {'label': 9},
]

train_ind = [0, 5, 3, 6, 8]
val_ind = [9, 2, 1]
test_ind = [4, 7]

with open('../model/vocabulary.json', 'r') as f:
    vocab = json.load(f)


def test_dataset_split_parameter():
    """ Dataset is split in the desired ratios
    """
    split_parameter = [0.7, 0.1, 0.2]
    st = SentenceTokenizer(vocab, 30)

    result, result_dicts, _ = st.split_train_val_test(sentences, dicts,
                                                      split_parameter, extend_with=0)
    train = result[0]
    val = result[1]
    test = result[2]

    train_dicts = result_dicts[0]
    val_dicts = result_dicts[1]
    test_dicts = result_dicts[2]

    assert len(train) == len(sentences) * split_parameter[0]
    assert len(val) == len(sentences) * split_parameter[1]
    assert len(test) == len(sentences) * split_parameter[2]

    assert len(train_dicts) == len(dicts) * split_parameter[0]
    assert len(val_dicts) == len(dicts) * split_parameter[1]
    assert len(test_dicts) == len(dicts) * split_parameter[2]


def test_dataset_split_explicit():
    """ Dataset is split according to given indices
    """
    split_parameter = [train_ind, val_ind, test_ind]
    st = SentenceTokenizer(vocab, 30)
    tokenized, _, _ = st.tokenize_sentences(sentences)

    result, result_dicts, added = st.split_train_val_test(sentences, dicts,
                                                          split_parameter, extend_with=0)
    train = result[0]
    val = result[1]
    test = result[2]

    train_dicts = result_dicts[0]
    val_dicts = result_dicts[1]
    test_dicts = result_dicts[2]

    # Every sentence (and its dict) must land in the split its index was assigned to.
    for i, sentence in enumerate(sentences):
        if i in train_ind:
            assert tokenized[i] in train
            assert dicts[i] in train_dicts
        elif i in val_ind:
            assert tokenized[i] in val
            assert dicts[i] in val_dicts
        elif i in test_ind:
            assert tokenized[i] in test
            assert dicts[i] in test_dicts

    assert len(train) == len(train_ind)
    assert len(val) == len(val_ind)
    assert len(test) == len(test_ind)
    assert len(train_dicts) == len(train_ind)
    assert len(val_dicts) == len(val_ind)
    assert len(test_dicts) == len(test_ind)


def test_id_to_sentence():
    """Tokenizing and converting back preserves the input.
    """
    vb = {'CUSTOM_MASK': 0, 'aasdf': 1000, 'basdf': 2000}

    sentence = 'aasdf basdf basdf basdf'
    st = SentenceTokenizer(vb, 30)
    token, _, _ = st.tokenize_sentences([sentence])
    assert st.to_sentence(token[0]) == sentence


def test_id_to_sentence_with_unknown():
    """Tokenizing and converting back preserves the input, except for unknowns.
    """
    vb = {'CUSTOM_MASK': 0, 'CUSTOM_UNKNOWN': 1, 'aasdf': 1000, 'basdf': 2000}

    sentence = 'aasdf basdf ccc'
    expected = 'aasdf basdf CUSTOM_UNKNOWN'
    st = SentenceTokenizer(vb, 30)
    token, _, _ = st.tokenize_sentences([sentence])
    assert st.to_sentence(token[0]) == expected
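

# Minimal convenience sketch (not part of the original suite): allows running this
# file directly with `python <this file>` instead of via pytest, which discovers
# the test_* functions on its own and does not need this block.
if __name__ == '__main__':
    test_dataset_split_parameter()
    test_dataset_split_explicit()
    test_id_to_sentence()
    test_id_to_sentence_with_unknown()
    print('All sentence tokenizer tests passed.')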