from __future__ import absolute_import, print_function, division, unicode_literals
import test_helper  # local test scaffolding; imported for its side effects

import json

from torchmoji.sentence_tokenizer import SentenceTokenizer

sentences = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
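
# One label dict per sentence in `sentences`, matched by index.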
dicts = [
    {'label': 0},
    {'label': 1},
    {'label': 2},
    {'label': 3},
    {'label': 4},
    {'label': 5},
    {'label': 6},
    {'label': 7},
    {'label': 8},
    {'label': 9},
]
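
# Explicit split indices; together they cover all ten entries exactly once.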
train_ind = [0, 5, 3, 6, 8]
val_ind = [9, 2, 1]
test_ind = [4, 7]
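
# The vocabulary is loaded from a relative path, so the tests are assumed to
# run from a directory that sits alongside model/ (e.g. the repo's tests/).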
with open('../model/vocabulary.json', 'r') as f:
    vocab = json.load(f)


def test_dataset_split_parameter():
    """ Dataset is split in the desired ratios.
    """
    split_parameter = [0.7, 0.1, 0.2]
    st = SentenceTokenizer(vocab, 30)

    result, result_dicts, _ = st.split_train_val_test(
        sentences, dicts, split_parameter, extend_with=0)
    train = result[0]
    val = result[1]
    test = result[2]

    train_dicts = result_dicts[0]
    val_dicts = result_dicts[1]
    test_dicts = result_dicts[2]
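
    # With ten sentences, each ratio times len(sentences) is a whole number,
    # so these int-to-float equality checks are exact.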
    assert len(train) == len(sentences) * split_parameter[0]
    assert len(val) == len(sentences) * split_parameter[1]
    assert len(test) == len(sentences) * split_parameter[2]

    assert len(train_dicts) == len(dicts) * split_parameter[0]
    assert len(val_dicts) == len(dicts) * split_parameter[1]
    assert len(test_dicts) == len(dicts) * split_parameter[2]


def test_dataset_split_explicit():
    """ Dataset is split according to the given indices.
    """
    split_parameter = [train_ind, val_ind, test_ind]
    st = SentenceTokenizer(vocab, 30)
    tokenized, _, _ = st.tokenize_sentences(sentences)

    result, result_dicts, added = st.split_train_val_test(
        sentences, dicts, split_parameter, extend_with=0)
    train = result[0]
    val = result[1]
    test = result[2]

    train_dicts = result_dicts[0]
    val_dicts = result_dicts[1]
    test_dicts = result_dicts[2]

    for i, sentence in enumerate(sentences):
        if i in train_ind:
            assert tokenized[i] in train
            assert dicts[i] in train_dicts
        elif i in val_ind:
            assert tokenized[i] in val
            assert dicts[i] in val_dicts
        elif i in test_ind:
            assert tokenized[i] in test
            assert dicts[i] in test_dicts

    assert len(train) == len(train_ind)
    assert len(val) == len(val_ind)
    assert len(test) == len(test_ind)
    assert len(train_dicts) == len(train_ind)
    assert len(val_dicts) == len(val_ind)
    assert len(test_dicts) == len(test_ind)


def test_id_to_sentence():
    """ Tokenizing and converting back preserves the input.
    """
    vb = {'CUSTOM_MASK': 0,
          'aasdf': 1000,
          'basdf': 2000}

    sentence = 'aasdf basdf basdf basdf'
    st = SentenceTokenizer(vb, 30)
    token, _, _ = st.tokenize_sentences([sentence])
    assert st.to_sentence(token[0]) == sentence


def test_id_to_sentence_with_unknown():
    """ Tokenizing and converting back preserves the input, except for unknowns.
    """
    vb = {'CUSTOM_MASK': 0,
          'CUSTOM_UNKNOWN': 1,
          'aasdf': 1000,
          'basdf': 2000}

    # 'ccc' is missing from the vocabulary, so it should come back as the
    # CUSTOM_UNKNOWN token.
    sentence = 'aasdf basdf ccc'
    expected = 'aasdf basdf CUSTOM_UNKNOWN'
    st = SentenceTokenizer(vb, 30)
    token, _, _ = st.tokenize_sentences([sentence])
    assert st.to_sentence(token[0]) == expected
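

# A minimal ad-hoc runner, assuming pytest is available; these tests are
# normally collected by the project's test suite instead.
if __name__ == '__main__':
    import pytest
    pytest.main([__file__])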