# -*- coding: utf-8 -*-
""" Tokenization tests.
"""
from __future__ import absolute_import, print_function, division, unicode_literals
import sys
# `nottest` marks helper functions so test collectors skip them.
from nose.tools import nottest
from os.path import dirname, abspath
# Make the repository root importable so `torchmoji` resolves when the
# tests are run straight from this directory.
sys.path.append(dirname(dirname(abspath(__file__))))
from torchmoji.tokenizer import tokenize
# (input text, expected token list) pairs: ordinary/combined text.
TESTS_NORMAL = [
    ('200K words!', ['200', 'K', 'words', '!']),
]
# (input text, expected token list) pairs: emojis, emoticons and
# decorative Unicode symbols.
TESTS_EMOJIS = [
    ('i \U0001f496 you to the moon and back',
     ['i', '\U0001f496', 'you', 'to', 'the', 'moon', 'and', 'back']),
    ("i\U0001f496you to the \u2605's and back",
     ['i', '\U0001f496', 'you', 'to', 'the',
      '\u2605', "'", 's', 'and', 'back']),
    ('~<3~', ['~', '<3', '~']),
    ('<333', ['<333']),
    (':-)', [':-)']),
    ('>:-(', ['>:-(']),
    ('\u266b\u266a\u2605\u2606\u2665\u2764\u2661',
     ['\u266b', '\u266a', '\u2605', '\u2606',
      '\u2665', '\u2764', '\u2661']),
]
# (input text, expected token list) pairs: URLs kept as single tokens.
TESTS_URLS = [
    ('www.sample.com', ['www.sample.com']),
    ('http://endless.horse', ['http://endless.horse']),
    ('https://github.mit.ed', ['https://github.mit.ed']),
]
# (input text, expected token list) pairs: hashtags, @-mentions, emails.
TESTS_TWITTER = [
    ('#blacklivesmatter', ['#blacklivesmatter']),
    ('#99_percent.', ['#99_percent', '.']),
    ('the#99%', ['the', '#99', '%']),
    ('@golden_zenith', ['@golden_zenith']),
    ('@99_percent', ['@99_percent']),
    ('latte-express@mit.ed', ['latte-express@mit.ed']),
]
# (input text, expected token list) pairs: phone-number-like digit runs
# are split on their separators.
TESTS_PHONE_NUMS = [
    ('518)528-0252', ['518', ')', '528', '-', '0252']),
    ('1200-0221-0234', ['1200', '-', '0221', '-', '0234']),
    ('1200.0221.0234', ['1200', '.', '0221', '.', '0234']),
]
# (input text, expected token list) pairs: dates and times.
TESTS_DATETIME = [
    ('15:00', ['15', ':', '00']),
    ('2:00pm', ['2', ':', '00', 'pm']),
    ('9/14/16', ['9', '/', '14', '/', '16']),
]
# (input text, expected token list) pairs: currency amounts and symbols.
TESTS_CURRENCIES = [
    ('517.933\xa3', ['517', '.', '933', '\xa3']),
    ('$517.87', ['$', '517', '.', '87']),
    ('1201.6598', ['1201', '.', '6598']),
    ('120,6', ['120', ',', '6']),
    ('10,00\u20ac', ['10', ',', '00', '\u20ac']),
    ('1,000', ['1', ',', '000']),
    ('1200pesos', ['1200', 'pesos']),
]
# (input text, expected token list) pairs: mixed numbers and symbols.
TESTS_NUM_SYM = [
    ('5162f', ['5162', 'f']),
    ('f5162', ['f', '5162']),
    ('1203(', ['1203', '(']),
    ('(1203)', ['(', '1203', ')']),
    ('1200/', ['1200', '/']),
    ('1200+', ['1200', '+']),
    ('1202o-east', ['1202', 'o-east']),
    ('1200r', ['1200', 'r']),
    ('1200-1400', ['1200', '-', '1400']),
    ('120/today', ['120', '/', 'today']),
    ('today/120', ['today', '/', '120']),
    ('120/5', ['120', '/', '5']),
    ("120'/5", ['120', "'", '/', '5']),
    ('120/5pro', ['120', '/', '5', 'pro']),
    ("1200's,)", ['1200', "'", 's', ',', ')']),
    ('120.76.218.207', ['120', '.', '76', '.', '218', '.', '207']),
]
# (input text, expected token list) pairs: punctuation runs and
# contractions (e.g. "don'tcha" stays whole, abbreviations like "a.m."
# and "Mr." stay whole).
TESTS_PUNCTUATION = [
    ("don''t", ['don', "''", 't']),
    ("don'tcha", ["don'tcha"]),
    ('no?!?!;', ['no', '?', '!', '?', '!', ';']),
    ('no??!!..', ['no', '??', '!!', '..']),
    ('a.m.', ['a.m.']),
    ('.s.u', ['.', 's', '.', 'u']),
    ('!!i..n__', ['!!', 'i', '..', 'n', '__']),
    ('lv(<3)w(3>)u Mr.!', ['lv', '(', '<3', ')', 'w', '(', '3',
                           '>', ')', 'u', 'Mr.', '!']),
    ('-->', ['--', '>']),
    ('->', ['-', '>']),
    ('<-', ['<', '-']),
    ('<--', ['<', '--']),
    ('hello (@person)', ['hello', '(', '@person', ')']),
]
def test_normal():
    """Ordinary/combined text tokenizes as expected."""
    test_base(TESTS_NORMAL)
def test_emojis():
    """Emojis, emoticons and decorations tokenize as expected."""
    test_base(TESTS_EMOJIS)
def test_urls():
    """URLs tokenize as expected (kept whole)."""
    test_base(TESTS_URLS)
def test_twitter():
    """Hashtags, mentions and emails tokenize as expected."""
    test_base(TESTS_TWITTER)
def test_phone_nums():
    """Phone numbers tokenize as expected."""
    test_base(TESTS_PHONE_NUMS)
def test_datetime():
    """Dates and times tokenize as expected."""
    test_base(TESTS_DATETIME)
def test_currencies():
    """Currency amounts tokenize as expected."""
    test_base(TESTS_CURRENCIES)
def test_num_sym():
    """Number/symbol combinations tokenize as expected."""
    test_base(TESTS_NUM_SYM)
def test_punctuation():
    """Punctuation and contractions tokenize as expected."""
    test_base(TESTS_PUNCTUATION)
# @nottest (imported at the top of this file, previously unused) keeps
# test collectors from treating this parameterized helper as a test case
# of its own -- its name matches the test_* pattern but it requires a
# `tests` argument, so collecting it directly errors out.
@nottest
def test_base(tests):
    """ Base function for running tests.

    Args:
        tests: iterable of (text, expected_tokens) pairs.

    Raises:
        AssertionError: on the first pair whose tokenization differs,
            with a message showing the input, expected and actual tokens.
    """
    for (test, expected) in tests:
        actual = tokenize(test)
        assert actual == expected, \
            "Tokenization of '{}' failed, expected: {}, actual: {}"\
            .format(test, expected, actual)