# hf-deepmoji/tests/test_tokenizer.py
# -*- coding: utf-8 -*-
""" Tokenization tests.
"""
from __future__ import absolute_import, print_function, division, unicode_literals
import sys
from nose.tools import nottest
from os.path import dirname, abspath
# Make the repository root importable so the tests run without installing
# the torchmoji package.
sys.path.append(dirname(dirname(abspath(__file__))))
from torchmoji.tokenizer import tokenize
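
# Each TESTS_* list below holds (input, expected) pairs: the raw string that
# is fed to tokenize() and the token list it should produce, e.g.
# tokenize('hello (@person)') -> ['hello', '(', '@person', ')'] (taken from
# TESTS_PUNCTUATION below).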
TESTS_NORMAL = [
    ('200K words!', ['200', 'K', 'words', '!']),
]

TESTS_EMOJIS = [
    # \U0001f496 is the sparkling-heart emoji; \u2605 is a black star.
    ('i \U0001f496 you to the moon and back',
     ['i', '\U0001f496', 'you', 'to', 'the', 'moon', 'and', 'back']),
    ("i\U0001f496you to the \u2605's and back",
     ['i', '\U0001f496', 'you', 'to', 'the',
      '\u2605', "'", 's', 'and', 'back']),
    ('~<3~', ['~', '<3', '~']),
    ('<333', ['<333']),
    (':-)', [':-)']),
    ('>:-(', ['>:-(']),
    # Musical notes, stars, and heart symbols used as decorations.
    ('\u266b\u266a\u2605\u2606\u2665\u2764\u2661',
     ['\u266b', '\u266a', '\u2605', '\u2606',
      '\u2665', '\u2764', '\u2661']),
]

TESTS_URLS = [
    ('www.sample.com', ['www.sample.com']),
    ('http://endless.horse', ['http://endless.horse']),
    ('https://github.mit.ed', ['https://github.mit.ed']),
]

TESTS_TWITTER = [
    ('#blacklivesmatter', ['#blacklivesmatter']),
    ('#99_percent.', ['#99_percent', '.']),
    ('the#99%', ['the', '#99', '%']),
    ('@golden_zenith', ['@golden_zenith']),
    ('@99_percent', ['@99_percent']),
    ('latte-express@mit.ed', ['latte-express@mit.ed']),
]

TESTS_PHONE_NUMS = [
    ('518)528-0252', ['518', ')', '528', '-', '0252']),
    ('1200-0221-0234', ['1200', '-', '0221', '-', '0234']),
    ('1200.0221.0234', ['1200', '.', '0221', '.', '0234']),
]

TESTS_DATETIME = [
    ('15:00', ['15', ':', '00']),
    ('2:00pm', ['2', ':', '00', 'pm']),
    ('9/14/16', ['9', '/', '14', '/', '16']),
]

TESTS_CURRENCIES = [
    # \xa3 is the pound sign; \u20ac is the euro sign.
    ('517.933\xa3', ['517', '.', '933', '\xa3']),
    ('$517.87', ['$', '517', '.', '87']),
    ('1201.6598', ['1201', '.', '6598']),
    ('120,6', ['120', ',', '6']),
    ('10,00\u20ac', ['10', ',', '00', '\u20ac']),
    ('1,000', ['1', ',', '000']),
    ('1200pesos', ['1200', 'pesos']),
]

TESTS_NUM_SYM = [
    ('5162f', ['5162', 'f']),
    ('f5162', ['f', '5162']),
    ('1203(', ['1203', '(']),
    ('(1203)', ['(', '1203', ')']),
    ('1200/', ['1200', '/']),
    ('1200+', ['1200', '+']),
    ('1202o-east', ['1202', 'o-east']),
    ('1200r', ['1200', 'r']),
    ('1200-1400', ['1200', '-', '1400']),
    ('120/today', ['120', '/', 'today']),
    ('today/120', ['today', '/', '120']),
    ('120/5', ['120', '/', '5']),
    ("120'/5", ['120', "'", '/', '5']),
    ('120/5pro', ['120', '/', '5', 'pro']),
    ("1200's,)", ['1200', "'", 's', ',', ')']),
    ('120.76.218.207', ['120', '.', '76', '.', '218', '.', '207']),
]

TESTS_PUNCTUATION = [
    ("don''t", ['don', "''", 't']),
    ("don'tcha", ["don'tcha"]),
    ('no?!?!;', ['no', '?', '!', '?', '!', ';']),
    ('no??!!..', ['no', '??', '!!', '..']),
    ('a.m.', ['a.m.']),
    ('.s.u', ['.', 's', '.', 'u']),
    ('!!i..n__', ['!!', 'i', '..', 'n', '__']),
    ('lv(<3)w(3>)u Mr.!', ['lv', '(', '<3', ')', 'w', '(', '3',
                           '>', ')', 'u', 'Mr.', '!']),
    ('-->', ['--', '>']),
    ('->', ['-', '>']),
    ('<-', ['<', '-']),
    ('<--', ['<', '--']),
    ('hello (@person)', ['hello', '(', '@person', ')']),
]

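# Each test_* function below delegates to test_base() with one of the
# TESTS_* lists above, so the test runner reports failures per category.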

def test_normal():
    """ Normal/combined usage.
    """
    test_base(TESTS_NORMAL)


def test_emojis():
    """ Tokenizing emojis/emoticons/decorations.
    """
    test_base(TESTS_EMOJIS)


def test_urls():
    """ Tokenizing URLs.
    """
    test_base(TESTS_URLS)


def test_twitter():
    """ Tokenizing hashtags, mentions and emails.
    """
    test_base(TESTS_TWITTER)


def test_phone_nums():
    """ Tokenizing phone numbers.
    """
    test_base(TESTS_PHONE_NUMS)


def test_datetime():
    """ Tokenizing dates and times.
    """
    test_base(TESTS_DATETIME)


def test_currencies():
    """ Tokenizing currencies.
    """
    test_base(TESTS_CURRENCIES)


def test_num_sym():
    """ Tokenizing combinations of numbers and symbols.
    """
    test_base(TESTS_NUM_SYM)


def test_punctuation():
    """ Tokenizing punctuation and contractions.
    """
    test_base(TESTS_PUNCTUATION)


@nottest
def test_base(tests):
    """ Base function for running tests.
    """
    for (test, expected) in tests:
        actual = tokenize(test)
        assert actual == expected, \
            "Tokenization of '{}' failed, expected: {}, actual: {}" \
            .format(test, expected, actual)
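

# A minimal convenience runner (a sketch added for illustration; the original
# suite is meant to be collected by nose): executing this file directly runs
# every test_* group above and prints one line per passing category.
if __name__ == '__main__':
    for _name in sorted(list(globals())):
        _func = globals()[_name]
        # Skip helpers such as test_base, which @nottest marks with
        # __test__ = False.
        if (_name.startswith('test_') and callable(_func)
                and getattr(_func, '__test__', True)):
            _func()
            print('{}: OK'.format(_name))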