|
|
|
|
|
from __future__ import print_function, division, unicode_literals |
|
import sys |
|
import re |
|
import string |
|
import emoji |
|
from itertools import groupby |
|
|
|
import numpy as np |
|
from torchmoji.tokenizer import RE_MENTION, RE_URL |
|
from torchmoji.global_variables import SPECIAL_TOKENS |
|
|
|
# True when running under a Python 2 interpreter
# (sys.version starts with the major version digit).
IS_PYTHON2 = int(sys.version[0]) == 2

# Portable code-point -> character function: unichr on Py2, chr on Py3.
# (The conditional only evaluates `unichr` on Py2, so no NameError on Py3.)
chr_ = unichr if IS_PYTHON2 else chr

# Pre-compiled patterns for @-mentions and URLs; the raw patterns come
# from torchmoji.tokenizer. Used by detect_special_tokens().
AtMentionRegex = re.compile(RE_MENTION)

urlRegex = re.compile(RE_URL)
|
|
|
|
|
# The 16 Unicode variation selectors U+FE00..U+FE0F. These invisibly modify
# the presentation of the preceding character (e.g. emoji vs. text style)
# and are stripped out by remove_variation_selectors().
VARIATION_SELECTORS = [ '\ufe00',
                        '\ufe01',
                        '\ufe02',
                        '\ufe03',
                        '\ufe04',
                        '\ufe05',
                        '\ufe06',
                        '\ufe07',
                        '\ufe08',
                        '\ufe09',
                        '\ufe0a',
                        '\ufe0b',
                        '\ufe0c',
                        '\ufe0d',
                        '\ufe0e',
                        '\ufe0f']
|
|
|
|
|
# Lazy generator over every Unicode code point.
# NOTE(review): not referenced anywhere in this module -- possibly kept for
# external consumers; confirm before removing.
ALL_CHARS = (chr_(i) for i in range(sys.maxunicode))

# The C0 (0-31) and C1 (127-159) control characters as one string.
CONTROL_CHARS = ''.join(map(chr_, list(range(0,32)) + list(range(127,160))))

# Matches any single control character; used by remove_control_chars().
CONTROL_CHAR_REGEX = re.compile('[%s]' % re.escape(CONTROL_CHARS))
|
|
|
def is_special_token(word):
    """ Return True if `word` is exactly one of the SPECIAL_TOKENS
    placeholders, False otherwise.
    """
    return any(word == token for token in SPECIAL_TOKENS)
|
|
|
def mostly_english(words, english, pct_eng_short=0.5, pct_eng_long=0.6, ignore_special_tokens=True, min_length=2):
    """ Ensure text meets threshold for containing English words.

    # Arguments:
        words: Tokenized words to inspect.
        english: Container of known English words (None disables the check).
        pct_eng_short: Required English ratio for texts of 2-4 counted words.
        pct_eng_long: Required English ratio for texts of 5+ counted words.
        ignore_special_tokens: Skip SPECIAL_TOKENS placeholders when counting.
        min_length: Words shorter than this are not counted.

    # Returns:
        Tuple (passes_threshold, n_words_counted, n_english_found).
    """
    if english is None:
        return True, 0, 0

    n_words = 0
    n_english = 0
    for token in words:
        # Skip short words, pure punctuation, and (optionally) special tokens.
        if (len(token) < min_length
                or punct_word(token)
                or (ignore_special_tokens and is_special_token(token))):
            continue
        n_words += 1
        if token in english:
            n_english += 1

    # Too few real words to judge -- accept unconditionally.
    if n_words < 2:
        return True, n_words, n_english

    threshold = pct_eng_short if n_words < 5 else pct_eng_long
    return n_english >= n_words * threshold, n_words, n_english
|
|
|
def correct_length(words, min_words, max_words, ignore_special_tokens=True):
    """ Check that the number of "real" words (non-punctuation and,
    optionally, non-special-token) lies within [min_words, max_words].
    A None bound means unbounded on that side (max defaults to 99999).
    """
    lower = 0 if min_words is None else min_words
    upper = 99999 if max_words is None else max_words

    n_words = sum(
        1 for token in words
        if not punct_word(token)
        and not (ignore_special_tokens and is_special_token(token)))

    return lower <= n_words <= upper
|
|
|
def punct_word(word, punctuation=string.punctuation):
    """ Return True if every character of `word` is punctuation.

    Note: an empty string vacuously returns True (all() over an empty
    iterable), matching the original behavior.
    """
    # Idiom fix: the original built a throwaway list of booleans via
    # `[True if c in punctuation else False for c in word]`; a generator
    # expression short-circuits and avoids the intermediate list.
    return all(c in punctuation for c in word)
|
|
|
def load_non_english_user_set(path='uids.npz'):
    """ Load the set of known non-English user ids.

    # Arguments:
        path: npz archive holding the ids under the 'data' key.
            Defaults to 'uids.npz' in the working directory, preserving
            the original hard-coded behavior (generalized to a parameter
            so callers can point at a different location).

    # Returns:
        Set of user ids.
    """
    non_english_user_set = set(np.load(path)['data'])
    return non_english_user_set
|
|
|
def non_english_user(userid, non_english_user_set):
    """ Return True if `userid` (coerced to int) is in the set of
    known non-English users.
    """
    return int(userid) in non_english_user_set
|
|
|
def separate_emojis_and_text(text):
    """ Split `text` into its emoji characters and everything else.

    # Returns:
        Tuple (emoji_string, non_emoji_string); each preserves the
        characters' original relative order.
    """
    emoji_part = ''.join(ch for ch in text if ch in emoji.UNICODE_EMOJI)
    text_part = ''.join(ch for ch in text if ch not in emoji.UNICODE_EMOJI)
    return emoji_part, text_part
|
|
|
def extract_emojis(text, wanted_emojis):
    """ Return, in order, every character of `text` (after stripping
    variation selectors) that appears in `wanted_emojis`.
    """
    cleaned = remove_variation_selectors(text)
    found = []
    for ch in cleaned:
        if ch in wanted_emojis:
            found.append(ch)
    return found
|
|
|
def remove_variation_selectors(text):
    """ Remove styling glyph variants (U+FE00..U+FE0F) for Unicode
    characters -- for instance, skin-tone modifiers on emojis.
    """
    # One C-level pass via str.translate instead of chained replace()
    # calls; deleting each selector is equivalent to replacing it with ''.
    deletion_table = {ord(selector): None for selector in VARIATION_SELECTORS}
    return text.translate(deletion_table)
|
|
|
def shorten_word(word):
    """ Shorten groupings of 3+ identical consecutive chars to 2, e.g. '!!!!' --> '!!'

    Non-ASCII words are returned unchanged, as are words too short to
    contain a triple.
    """
    # Only ASCII words are shortened.
    # BUG FIX: the original used word.decode('ascii'), which raises
    # AttributeError for every Python 3 str (str has no .decode), so the
    # function silently never shortened anything on Python 3. encode()
    # performs the intended ASCII check on both Python 2 and Python 3
    # (Py2 byte strings implicitly decode first, raising
    # UnicodeDecodeError for non-ASCII input).
    try:
        word.encode('ascii')
    except (UnicodeDecodeError, UnicodeEncodeError):
        return word

    # A word shorter than 3 characters cannot contain a triple.
    if len(word) < 3:
        return word

    # Find groups of 3+ identical consecutive characters.
    letter_groups = [list(g) for k, g in groupby(word)]
    triple_or_more = [''.join(g) for g in letter_groups if len(g) >= 3]
    if len(triple_or_more) == 0:
        return word

    # Replace each run of 3+ repeats with exactly two of that character.
    short_word = word
    for trip in triple_or_more:
        short_word = short_word.replace(trip, trip[0]*2)

    return short_word
|
|
|
def detect_special_tokens(word):
    """ Map numbers, @-mentions and URLs to their SPECIAL_TOKENS
    placeholders; any other word is returned unchanged.
    """
    try:
        int(word)
    except ValueError:
        # Not a number -- check for mention, then URL.
        if AtMentionRegex.findall(word):
            return SPECIAL_TOKENS[2]
        if urlRegex.findall(word):
            return SPECIAL_TOKENS[3]
        return word
    return SPECIAL_TOKENS[4]
|
|
|
def process_word(word):
    """ Shorten repeated characters, then substitute the word with a
    special token (number / mention / URL) where applicable.
    """
    return detect_special_tokens(shorten_word(word))
|
|
|
def remove_control_chars(text):
    """ Delete every C0/C1 control character from `text`. """
    cleaned = CONTROL_CHAR_REGEX.sub('', text)
    return cleaned
|
|
|
def convert_nonbreaking_space(text):
    """ Replace non-breaking-space artifacts -- the raw U+00C2/U+00A0
    characters and their escaped textual forms -- with a regular space.
    """
    artifacts = ('\\\\xc2', '\\xc2', '\xc2', '\\\\xa0', '\\xa0', '\xa0')
    for artifact in artifacts:
        text = text.replace(artifact, ' ')
    return text
|
|
|
def convert_linebreaks(text):
    """ Replace line-break markers -- raw and escaped \\n / \\r plus the
    HTML <br> tag -- with the space-padded newline special token.
    """
    padded_token = ' ' + SPECIAL_TOKENS[5] + ' '
    for marker in ('\\\\n', '\\n', '\n', '\\\\r', '\\r', '\r', '<br>'):
        text = text.replace(marker, padded_token)
    return text
|
|