# -*- coding: utf-8 -*- """ Tokenization tests. """ from __future__ import absolute_import, print_function, division, unicode_literals import sys from nose.tools import nottest from os.path import dirname, abspath sys.path.append(dirname(dirname(abspath(__file__)))) from torchmoji.tokenizer import tokenize TESTS_NORMAL = [ ('200K words!', ['200', 'K', 'words', '!']), ] TESTS_EMOJIS = [ ('i \U0001f496 you to the moon and back', ['i', '\U0001f496', 'you', 'to', 'the', 'moon', 'and', 'back']), ("i\U0001f496you to the \u2605's and back", ['i', '\U0001f496', 'you', 'to', 'the', '\u2605', "'", 's', 'and', 'back']), ('~<3~', ['~', '<3', '~']), ('<333', ['<333']), (':-)', [':-)']), ('>:-(', ['>:-(']), ('\u266b\u266a\u2605\u2606\u2665\u2764\u2661', ['\u266b', '\u266a', '\u2605', '\u2606', '\u2665', '\u2764', '\u2661']), ] TESTS_URLS = [ ('www.sample.com', ['www.sample.com']), ('http://endless.horse', ['http://endless.horse']), ('https://github.mit.ed', ['https://github.mit.ed']), ] TESTS_TWITTER = [ ('#blacklivesmatter', ['#blacklivesmatter']), ('#99_percent.', ['#99_percent', '.']), ('the#99%', ['the', '#99', '%']), ('@golden_zenith', ['@golden_zenith']), ('@99_percent', ['@99_percent']), ('latte-express@mit.ed', ['latte-express@mit.ed']), ] TESTS_PHONE_NUMS = [ ('518)528-0252', ['518', ')', '528', '-', '0252']), ('1200-0221-0234', ['1200', '-', '0221', '-', '0234']), ('1200.0221.0234', ['1200', '.', '0221', '.', '0234']), ] TESTS_DATETIME = [ ('15:00', ['15', ':', '00']), ('2:00pm', ['2', ':', '00', 'pm']), ('9/14/16', ['9', '/', '14', '/', '16']), ] TESTS_CURRENCIES = [ ('517.933\xa3', ['517', '.', '933', '\xa3']), ('$517.87', ['$', '517', '.', '87']), ('1201.6598', ['1201', '.', '6598']), ('120,6', ['120', ',', '6']), ('10,00\u20ac', ['10', ',', '00', '\u20ac']), ('1,000', ['1', ',', '000']), ('1200pesos', ['1200', 'pesos']), ] TESTS_NUM_SYM = [ ('5162f', ['5162', 'f']), ('f5162', ['f', '5162']), ('1203(', ['1203', '(']), ('(1203)', ['(', '1203', ')']), ('1200/', ['1200', '/']), ('1200+', ['1200', '+']), ('1202o-east', ['1202', 'o-east']), ('1200r', ['1200', 'r']), ('1200-1400', ['1200', '-', '1400']), ('120/today', ['120', '/', 'today']), ('today/120', ['today', '/', '120']), ('120/5', ['120', '/', '5']), ("120'/5", ['120', "'", '/', '5']), ('120/5pro', ['120', '/', '5', 'pro']), ("1200's,)", ['1200', "'", 's', ',', ')']), ('120.76.218.207', ['120', '.', '76', '.', '218', '.', '207']), ] TESTS_PUNCTUATION = [ ("don''t", ['don', "''", 't']), ("don'tcha", ["don'tcha"]), ('no?!?!;', ['no', '?', '!', '?', '!', ';']), ('no??!!..', ['no', '??', '!!', '..']), ('a.m.', ['a.m.']), ('.s.u', ['.', 's', '.', 'u']), ('!!i..n__', ['!!', 'i', '..', 'n', '__']), ('lv(<3)w(3>)u Mr.!', ['lv', '(', '<3', ')', 'w', '(', '3', '>', ')', 'u', 'Mr.', '!']), ('-->', ['--', '>']), ('->', ['-', '>']), ('<-', ['<', '-']), ('<--', ['<', '--']), ('hello (@person)', ['hello', '(', '@person', ')']), ] def test_normal(): """ Normal/combined usage. """ test_base(TESTS_NORMAL) def test_emojis(): """ Tokenizing emojis/emoticons/decorations. """ test_base(TESTS_EMOJIS) def test_urls(): """ Tokenizing URLs. """ test_base(TESTS_URLS) def test_twitter(): """ Tokenizing hashtags, mentions and emails. """ test_base(TESTS_TWITTER) def test_phone_nums(): """ Tokenizing phone numbers. """ test_base(TESTS_PHONE_NUMS) def test_datetime(): """ Tokenizing dates and times. """ test_base(TESTS_DATETIME) def test_currencies(): """ Tokenizing currencies. """ test_base(TESTS_CURRENCIES) def test_num_sym(): """ Tokenizing combinations of numbers and symbols. """ test_base(TESTS_NUM_SYM) def test_punctuation(): """ Tokenizing punctuation and contractions. """ test_base(TESTS_PUNCTUATION) @nottest def test_base(tests): """ Base function for running tests. """ for (test, expected) in tests: actual = tokenize(test) assert actual == expected, \ "Tokenization of \'{}\' failed, expected: {}, actual: {}"\ .format(test, expected, actual)