|
|
|
''' |
|
Splits up a Unicode string into a list of tokens. |
|
Recognises: |
|
- Abbreviations |
|
- URLs |
|
- Emails |
|
- #hashtags |
|
- @mentions |
|
- emojis |
|
- emoticons (limited support) |
|
|
|
Multiple consecutive symbols are also treated as a single token. |
|
''' |
|
from __future__ import absolute_import, division, print_function, unicode_literals |
|
|
|
import re |
|
|
|
|
|
# Basic building blocks: digit runs, ASCII-letter runs, whitespace runs, and
# a catch-all single character used as the lowest-priority fallback token.
RE_NUM = r'[0-9]+'

RE_WORD = r'[a-zA-Z]+'

RE_WHITESPACE = r'\s+'

RE_ANY = r'.'


# Hyphen/underscore-combined words, kept as one token (e.g. 'far-fetched').
# NOTE(review): joins exactly two letter runs, so 'a-b-c' tokenizes as
# 'a-b' + '-' + 'c' -- confirm that is the intended behaviour.
RE_COMB = r'[a-zA-Z]+[-_][a-zA-Z]+'


# Apostrophe contractions such as "don't": letters, a literal ', letters.
RE_CONTRACTIONS = RE_WORD + r'\'' + RE_WORD
|
|
|
# Honorific titles ending in a period, matched case-insensitively so the
# trailing '.' stays attached to the title instead of becoming a symbol token.
TITLES = [
    r'Mr\.',
    r'Ms\.',
    r'Mrs\.',
    r'Dr\.',
    r'Prof\.',
]

# Use *scoped* inline flags (?i:...) instead of a bare (?i) per alternative:
# a global inline flag in the middle of a pattern is deprecated since
# Python 3.6 and raises re.error on 3.11+ once RE_TITLES is embedded into
# the combined RE_PATTERN alternation.
RE_TITLES = r'|'.join([r'(?i:' + t + r')' for t in TITLES])
|
|
|
|
|
|
|
# Punctuation and currency characters; each becomes a 'run' alternative
# (e.g. '!!!' is one token).  NOTE: this is a plain (non-raw) string so
# '\xa3' resolves to the pound sign and '\\' to a single backslash -- in the
# original raw string the unresolved '\xa3' escape silently became the three
# separate "symbols" 'x', 'a' and '3' and dropped pound-sign support.
SYMBOLS = '()<!?.,/\'"-_=\\§|´ˇ°[]<>{}~$^&*;:%+\xa3€`'
RE_SYMBOL = r'|'.join([re.escape(s) + r'+' for s in SYMBOLS])

# '#'/'@' runs: a run immediately followed by a hashtag/mention is split off
# first via lookahead, so '##tag' tokenizes as '#' + '#tag'.
SPECIAL_SYMBOLS = r'|#+(?=#[a-zA-Z0-9_]+)|@+(?=@[a-zA-Z0-9_]+)|#+|@+'
RE_SYMBOL += SPECIAL_SYMBOLS

# Dotted abbreviations such as 'U.S.': two or more letter-dot pairs, not
# preceded by a dot (so a preceding sentence period is not swallowed).
RE_ABBREVIATIONS = r'\b(?<!\.)(?:[A-Za-z]\.){2,}'
|
|
|
|
|
# Twitter-style entities: '#' or '@' followed by word characters.
RE_HASHTAG = r'#[a-zA-Z0-9_]+'

RE_MENTION = r'@[a-zA-Z0-9_]+'


# URLs must start with an explicit scheme or 'www.'.
# NOTE(review): '$-_' inside the class is a character *range* (0x24-0x5F)
# that already covers digits, uppercase letters and much punctuation --
# presumably inherited from a common URL regex; confirm it is intentional.
RE_URL = r'(?:https?://|www\.)(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

# Pragmatic email matcher (deliberately not full RFC 5322).
RE_EMAIL = r'\b[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+\b'
|
|
|
|
|
# '<3'-style hearts: one or more '<', an optional '/', one or more '3',
# the whole thing possibly repeated ('<3<3', '</3', ...).
RE_HEART = r'(?:<+/?3+)+'

# Emoticons are generated as start + optional middle + repeated end
# (e.g. ':-)', ';P'), plus a handful of fixed extras such as '^_^'.
EMOTICONS_START = [
    r'>:',
    r':',
    r'=',
    r';',
]
EMOTICONS_MID = [
    r'-',
    r',',
    r'^',
    '\'',
    '\"',
]
EMOTICONS_END = [
    r'D',
    r'd',
    r'p',
    r'P',
    r'v',
    r')',
    r'o',
    r'O',
    r'(',
    r'3',
    r'/',
    r'|',
    '\\',
]
EMOTICONS_EXTRA = [
    r'-_-',
    r'x_x',
    r'^_^',
    r'o.o',
    r'o_o',
    r'(:',
    r'):',
    r');',
    r'(;',
]

# Fixed extras first, then every start/mid/end combination as 'S M? E+'.
RE_EMOTICON = r'|'.join(re.escape(extra) for extra in EMOTICONS_EXTRA)
RE_EMOTICON += ''.join(
    '|{0}{1}?{2}+'.format(re.escape(start), re.escape(mid), re.escape(end))
    for start in EMOTICONS_START
    for mid in EMOTICONS_MID
    for end in EMOTICONS_END
)
|
|
|
|
|
|
|
|
|
# Emoji ranges: U+1F300-U+1F64F (pictographs + emoticons), U+1F680-U+1F6FF
# (transport), and the BMP symbol blocks U+2600-U+27BF.  Written as a
# non-raw string so the \U/\u escapes resolve to real code points at the
# literal level: the original raw surrogate-pair pattern
# (\ud83c[\udf00-\udfff]|...) only worked on narrow (UCS-2) builds and never
# matches anything on Python 3, where strings are sequences of code points.
RE_EMOJI = '[\U0001F300-\U0001F64F\U0001F680-\U0001F6FF\u2600-\u26FF\u2700-\u27BF]'
|
|
|
|
|
# Token alternatives in priority order: the final pattern is one big
# alternation and Python's re tries alternatives left to right, so the more
# specific patterns (URLs, emails, emoticons, ...) must precede the generic
# word/number/symbol patterns, with RE_ANY as the single-character fallback.
TOKENS = [
    RE_URL,
    RE_EMAIL,
    RE_COMB,
    RE_HASHTAG,
    RE_MENTION,
    RE_HEART,
    RE_EMOTICON,
    RE_CONTRACTIONS,
    RE_TITLES,
    RE_ABBREVIATIONS,
    RE_NUM,
    RE_WORD,
    RE_SYMBOL,
    RE_EMOJI,
    RE_ANY
]


# Patterns that are matched but NOT captured: they show up in findall() as
# empty strings and are filtered out by tokenize().
IGNORED = [
    RE_WHITESPACE
]


# Ignored patterns sit outside the single capturing group and real tokens
# inside it, so findall() yields the token text (or '' for ignored matches).
RE_PATTERN = re.compile(r'|'.join(IGNORED) + r'|(' + r'|'.join(TOKENS) + r')',
                        re.UNICODE)
|
|
|
|
|
def tokenize(text):
    '''Splits given input string into a list of tokens.

    # Arguments:
        text: Input string to be tokenized.

    # Returns:
        List of strings (tokens).
    '''
    matches = RE_PATTERN.findall(text)
    # Whitespace is matched outside the capturing group, so it appears in
    # findall() as blank strings; keep only the non-blank captures.
    return [token for token in matches if token.strip()]
|
|