beanbox-apis / torchmoji /tokenizer.py
johnpaulbin's picture
First model version
3affa92
# -*- coding: utf-8 -*-
'''
Splits up a Unicode string into a list of tokens.
Recognises:
- Abbreviations
- URLs
- Emails
- #hashtags
- @mentions
- emojis
- emoticons (limited support)
Multiple consecutive symbols are also treated as a single token.
'''
from __future__ import absolute_import, division, print_function, unicode_literals
import re
# Basic building blocks.
RE_NUM = r'[0-9]+'
RE_WORD = r'[a-zA-Z]+'
RE_WHITESPACE = r'\s+'
RE_ANY = r'.'

# Combined words such as 'red-haired' or 'CUSTOM_TOKEN'
RE_COMB = r'[a-zA-Z]+[-_][a-zA-Z]+'

# English-specific patterns
RE_CONTRACTIONS = RE_WORD + r'\'' + RE_WORD

TITLES = [
    r'Mr\.',
    r'Ms\.',
    r'Mrs\.',
    r'Dr\.',
    r'Prof\.',
]
# Case insensitivity is applied through the flags of the final re.compile()
# call at the bottom of this section instead of an inline `(?i)` in front of
# every title. An inline global flag that is not at the very start of the
# expression is rejected by Python 3.11+ ("global flags not at the start of
# the expression"); on older interpreters it silently switched the *whole*
# combined pattern to case-insensitive matching, which is the behaviour the
# compile flags below keep.
RE_TITLES = r'|'.join(TITLES)

# Symbols have to be created as separate patterns in order to match consecutive
# identical symbols.
# NOTE: this is a raw string, so `\xa3` contributes the literal characters
# backslash, 'x', 'a' and '3' rather than a pound sign. Left untouched on
# purpose so existing tokenization does not change.
SYMBOLS = r'()<!?.,/\'\"-_=\\§|´ˇ°[]<>{}~$^&*;:%+\xa3€`'
RE_SYMBOL = r'|'.join([re.escape(s) + r'+' for s in SYMBOLS])

# Hash symbols and at symbols have to be defined separately in order to not
# clash with hashtags and mentions if there are multiple - i.e.
# ##hello -> ['#', '#hello'] instead of ['##', 'hello']
SPECIAL_SYMBOLS = r'|#+(?=#[a-zA-Z0-9_]+)|@+(?=@[a-zA-Z0-9_]+)|#+|@+'
RE_SYMBOL += SPECIAL_SYMBOLS

# Two or more single letters each followed by a period, e.g. 'U.S.'
RE_ABBREVIATIONS = r'\b(?<!\.)(?:[A-Za-z]\.){2,}'

# Twitter-specific patterns
RE_HASHTAG = r'#[a-zA-Z0-9_]+'
RE_MENTION = r'@[a-zA-Z0-9_]+'
RE_URL = r'(?:https?://|www\.)(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
RE_EMAIL = r'\b[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+\b'

# Emoticons and emojis
RE_HEART = r'(?:<+/?3+)+'

EMOTICONS_START = [
    r'>:',
    r':',
    r'=',
    r';',
]

EMOTICONS_MID = [
    r'-',
    r',',
    r'^',
    '\'',
    '\"',
]

EMOTICONS_END = [
    r'D',
    r'd',
    r'p',
    r'P',
    r'v',
    r')',
    r'o',
    r'O',
    r'(',
    r'3',
    r'/',
    r'|',
    '\\',
]

EMOTICONS_EXTRA = [
    r'-_-',
    r'x_x',
    r'^_^',
    r'o.o',
    r'o_o',
    r'(:',
    r'):',
    r');',
    r'(;',
]

# The fixed emoticons come first, followed by every start/mid/end combination
# (optional middle character, one or more repetitions of the end character).
# The alternatives are collected in a list and joined once; the resulting
# pattern string is the same as appending '|<alternative>' one at a time.
_emoticon_alternatives = [re.escape(s) for s in EMOTICONS_EXTRA]
for s in EMOTICONS_START:
    for m in EMOTICONS_MID:
        for e in EMOTICONS_END:
            _emoticon_alternatives.append(
                '{0}{1}?{2}+'.format(re.escape(s), re.escape(m), re.escape(e)))
RE_EMOTICON = r'|'.join(_emoticon_alternatives)

# requires ucs4 in python2.7 or python3+
# RE_EMOJI = r"""[\U0001F300-\U0001F64F\U0001F680-\U0001F6FF\u2600-\u26FF\u2700-\u27BF]"""
# safe for all python
RE_EMOJI = r"""\ud83c[\udf00-\udfff]|\ud83d[\udc00-\ude4f\ude80-\udeff]|[\u2600-\u26FF\u2700-\u27BF]"""

# List of matched token patterns, ordered from most specific to least specific.
# Order matters: the alternation takes the first alternative that matches.
TOKENS = [
    RE_URL,
    RE_EMAIL,
    RE_COMB,
    RE_HASHTAG,
    RE_MENTION,
    RE_HEART,
    RE_EMOTICON,
    RE_CONTRACTIONS,
    RE_TITLES,
    RE_ABBREVIATIONS,
    RE_NUM,
    RE_WORD,
    RE_SYMBOL,
    RE_EMOJI,
    RE_ANY,
]

# List of ignored token patterns
IGNORED = [
    RE_WHITESPACE,
]

# Final pattern: ignored alternatives sit outside the single capture group, so
# they are consumed without producing a token; everything in TOKENS is
# captured. re.IGNORECASE replaces the former inline `(?i)` flags (see
# RE_TITLES above) and applies to the whole pattern.
RE_PATTERN = re.compile(
    r'|'.join(IGNORED) + r'|(' + r'|'.join(TOKENS) + r')',
    re.UNICODE | re.IGNORECASE)
def tokenize(text):
    '''Breaks a string into its tokens using the module-level RE_PATTERN.

    # Arguments:
        text: String to tokenize.

    # Returns:
        List of token strings in the order they occur in `text`. Matches of
        the ignored patterns show up as empty strings in the raw findall
        output and are dropped, as is anything that is empty after stripping.
    '''
    tokens = []
    for candidate in RE_PATTERN.findall(text):
        # findall yields '' whenever the capture group did not participate
        if candidate.strip():
            tokens.append(candidate)
    return tokens