Spaces:
Running
Running
# -*- coding: utf-8 -*- | |
import sys | |
from os.path import dirname, abspath | |
sys.path.append(dirname(dirname(abspath(__file__)))) | |
from nose.tools import raises | |
from torchmoji.word_generator import WordGenerator | |
# True when running under a Python 2 interpreter. Read from the structured
# sys.version_info tuple rather than by parsing the free-form sys.version string.
IS_PYTHON2 = sys.version_info[0] == 2
@raises(ValueError)
def test_only_unicode_accepted():
    """ Non-Unicode strings raise a ValueError.

    In Python 3 all strings are Unicode, so there is no non-Unicode input to
    feed the generator; the ValueError is raised by hand there so that the
    ``@raises(ValueError)`` expectation is still met.
    """
    if not IS_PYTHON2:
        raise ValueError("You are using python 3 so this test should always pass")

    sentences = [
        u'Hello world',
        u'I am unicode',
        'I am not unicode',  # deliberately not a unicode literal
    ]

    wg = WordGenerator(sentences)
    # Drain the generator; the ValueError is expected to surface somewhere
    # between construction and the end of iteration.
    for _ in wg:
        pass
def test_unicode_sentences_ignored_if_set():
    """ A sentence containing non-ASCII characters yields no words when
    Unicode text is disallowed.
    """
    non_ascii_sentence = u'Dobrý den, jak se máš?'
    generator = WordGenerator([non_ascii_sentence], allow_unicode_text=False)

    words = generator.get_words(non_ascii_sentence)

    assert words == []
def test_check_ascii():
    """ check_ascii accepts a plain ASCII word and rejects non-ASCII text.

    Only exercised on Python 2; on Python 3 (where all strings are Unicode)
    the test returns without asserting anything.
    """
    if IS_PYTHON2:
        generator = WordGenerator([])

        assert generator.check_ascii('ASCII')
        for non_ascii_text in ('ščřžýá', '❤ ☀ ☆ ☂ ☻ ♞ ☯ ☭ ☢'):
            assert not generator.check_ascii(non_ascii_text)
def test_convert_unicode_word():
    """ With Unicode text allowed, convert_unicode_word returns a
    (True, word) pair for a single accented character.
    """
    generator = WordGenerator([], allow_unicode_text=True)
    expected = (True, u'\u010d')

    actual = generator.convert_unicode_word(u'č')

    # On failure the message shows the value that was actually returned.
    assert actual == expected, '{}'.format(actual)
def test_convert_unicode_word_ignores_if_set():
    """ With Unicode text disallowed, convert_unicode_word returns
    (False, '') for an accented character.
    """
    generator = WordGenerator([], allow_unicode_text=False)
    expected = (False, '')

    actual = generator.convert_unicode_word(u'č')

    # On failure the message shows the value that was actually returned.
    assert actual == expected, '{}'.format(actual)
def test_convert_unicode_chars():
    """ With Unicode text allowed, convert_unicode_word returns a word made
    of several accented characters unchanged, paired with True.
    """
    generator = WordGenerator([], allow_unicode_text=True)
    expected = (True, u'\u011b\u0161\u010d\u0159\u017e\xfd\xe1\xed\xe9')

    actual = generator.convert_unicode_word(u'ěščřžýáíé')

    # On failure the message shows the value that was actually returned.
    assert actual == expected, '{}'.format(actual)