from __future__ import absolute_import, print_function, division, unicode_literals
import test_helper  # local test scaffolding; imported for its side effects

import json

from torchmoji.sentence_tokenizer import SentenceTokenizer

sentences = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
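
# One label dict per sentence in `sentences`, matched by index.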
dicts = [
    {'label': 0},
    {'label': 1},
    {'label': 2},
    {'label': 3},
    {'label': 4},
    {'label': 5},
    {'label': 6},
    {'label': 7},
    {'label': 8},
    {'label': 9},
]
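
# Explicit split indices; together they cover all ten entries exactly once.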
train_ind = [0, 5, 3, 6, 8]
val_ind = [9, 2, 1]
test_ind = [4, 7]
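
# The vocabulary is loaded from a relative path, so the tests are assumed to
# run from a directory that sits alongside model/ (e.g. the repo's tests/).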
with open('../model/vocabulary.json', 'r') as f:
    vocab = json.load(f)


def test_dataset_split_parameter():
    """ Dataset is split in the desired ratios.
    """
    split_parameter = [0.7, 0.1, 0.2]
    st = SentenceTokenizer(vocab, 30)

    result, result_dicts, _ = st.split_train_val_test(
        sentences, dicts, split_parameter, extend_with=0)
    train = result[0]
    val = result[1]
    test = result[2]

    train_dicts = result_dicts[0]
    val_dicts = result_dicts[1]
    test_dicts = result_dicts[2]
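
    # With ten sentences, each ratio times len(sentences) is a whole number,
    # so these int-to-float equality checks are exact.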
    assert len(train) == len(sentences) * split_parameter[0]
    assert len(val) == len(sentences) * split_parameter[1]
    assert len(test) == len(sentences) * split_parameter[2]

    assert len(train_dicts) == len(dicts) * split_parameter[0]
    assert len(val_dicts) == len(dicts) * split_parameter[1]
    assert len(test_dicts) == len(dicts) * split_parameter[2]


def test_dataset_split_explicit():
    """ Dataset is split according to the given indices.
    """
    split_parameter = [train_ind, val_ind, test_ind]
    st = SentenceTokenizer(vocab, 30)
    tokenized, _, _ = st.tokenize_sentences(sentences)

    result, result_dicts, added = st.split_train_val_test(
        sentences, dicts, split_parameter, extend_with=0)
    train = result[0]
    val = result[1]
    test = result[2]

    train_dicts = result_dicts[0]
    val_dicts = result_dicts[1]
    test_dicts = result_dicts[2]

    for i, sentence in enumerate(sentences):
        if i in train_ind:
            assert tokenized[i] in train
            assert dicts[i] in train_dicts
        elif i in val_ind:
            assert tokenized[i] in val
            assert dicts[i] in val_dicts
        elif i in test_ind:
            assert tokenized[i] in test
            assert dicts[i] in test_dicts

    assert len(train) == len(train_ind)
    assert len(val) == len(val_ind)
    assert len(test) == len(test_ind)
    assert len(train_dicts) == len(train_ind)
    assert len(val_dicts) == len(val_ind)
    assert len(test_dicts) == len(test_ind)


def test_id_to_sentence():
    """ Tokenizing and converting back preserves the input.
    """
    vb = {'CUSTOM_MASK': 0,
          'aasdf': 1000,
          'basdf': 2000}

    sentence = 'aasdf basdf basdf basdf'
    st = SentenceTokenizer(vb, 30)
    token, _, _ = st.tokenize_sentences([sentence])
    assert st.to_sentence(token[0]) == sentence


def test_id_to_sentence_with_unknown():
    """ Tokenizing and converting back preserves the input, except for unknowns.
    """
    vb = {'CUSTOM_MASK': 0,
          'CUSTOM_UNKNOWN': 1,
          'aasdf': 1000,
          'basdf': 2000}

    # 'ccc' is missing from the vocabulary, so it should come back as the
    # CUSTOM_UNKNOWN token.
    sentence = 'aasdf basdf ccc'
    expected = 'aasdf basdf CUSTOM_UNKNOWN'
    st = SentenceTokenizer(vb, 30)
    token, _, _ = st.tokenize_sentences([sentence])
    assert st.to_sentence(token[0]) == expected
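

# A minimal ad-hoc runner, assuming pytest is available; these tests are
# normally collected by the project's test suite instead.
if __name__ == '__main__':
    import pytest
    pytest.main([__file__])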