# -*- coding: utf-8 -*-
from __future__ import print_function, division, unicode_literals
import sys
import re
import string
import emoji
from itertools import groupby
import numpy as np
from torchmoji.tokenizer import RE_MENTION, RE_URL
from torchmoji.global_variables import SPECIAL_TOKENS
try:
unichr # Python 2
except NameError:
unichr = chr # Python 3
AtMentionRegex = re.compile(RE_MENTION)
urlRegex = re.compile(RE_URL)
# from http://bit.ly/2rdjgjE (UTF-8 encodings and Unicode chars)
VARIATION_SELECTORS = [ '\ufe00',
'\ufe01',
'\ufe02',
'\ufe03',
'\ufe04',
'\ufe05',
'\ufe06',
'\ufe07',
'\ufe08',
'\ufe09',
'\ufe0a',
'\ufe0b',
'\ufe0c',
'\ufe0d',
'\ufe0e',
'\ufe0f']
# from https://stackoverflow.com/questions/92438/stripping-non-printable-characters-from-a-string-in-python
ALL_CHARS = (unichr(i) for i in range(sys.maxunicode))
CONTROL_CHARS = ''.join(map(unichr, list(range(0,32)) + list(range(127,160))))
CONTROL_CHAR_REGEX = re.compile('[%s]' % re.escape(CONTROL_CHARS))
def is_special_token(word):
    return word in SPECIAL_TOKENS
def mostly_english(words, english, pct_eng_short=0.5, pct_eng_long=0.6, ignore_special_tokens=True, min_length=2):
""" Ensure text meets threshold for containing English words """
n_words = 0
n_english = 0
if english is None:
return True, 0, 0
for w in words:
if len(w) < min_length:
continue
if punct_word(w):
continue
if ignore_special_tokens and is_special_token(w):
continue
n_words += 1
if w in english:
n_english += 1
    # very short texts always pass; short texts get the laxer threshold
    if n_words < 2:
        return True, n_words, n_english
    if n_words < 5:
        valid_english = n_english >= n_words * pct_eng_short
    else:
        valid_english = n_english >= n_words * pct_eng_long
return valid_english, n_words, n_english
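# A quick sanity check (illustrative; `eng` is a hypothetical word set):
#   >>> eng = {'this', 'is', 'fine'}
#   >>> mostly_english(['this', 'is', 'fine'], eng)
#   (True, 3, 3)
#   >>> mostly_english(['das', 'ist', 'gut'], eng)
#   (False, 3, 0)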
def correct_length(words, min_words, max_words, ignore_special_tokens=True):
    """ Ensure the number of words (excluding punctuation-only words and,
    optionally, special tokens) is within the min and max limits. """
if min_words is None:
min_words = 0
if max_words is None:
max_words = 99999
n_words = 0
for w in words:
if punct_word(w):
continue
if ignore_special_tokens and is_special_token(w):
continue
n_words += 1
    return min_words <= n_words <= max_words
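# Punctuation-only words do not count toward the length:
#   >>> correct_length(['hi', 'there', '!!'], min_words=2, max_words=30)
#   True
#   >>> correct_length(['hi', '!!'], min_words=2, max_words=30)
#   False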
def punct_word(word, punctuation=string.punctuation):
    return all(c in punctuation for c in word)
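# Note that all() over an empty string is True, so punct_word('') is True.
#   >>> punct_word('?!...')
#   True
#   >>> punct_word("don't")
#   False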
def load_non_english_user_set():
    """ Load the set of user ids flagged as non-English posters.
    Expects 'uids.npz' (with a 'data' array) in the working directory. """
    return set(np.load('uids.npz')['data'])
def non_english_user(userid, non_english_user_set):
    """ Check whether a user id belongs to the non-English user set. """
    return int(userid) in non_english_user_set
# All emoji chars known to the emoji package, shimmed across API versions:
# emoji >= 2.0 exposes EMOJI_DATA; older releases exposed a flat
# UNICODE_EMOJI mapping (assumption: one of the two is present).
try:
    EMOJI_SET = set(emoji.EMOJI_DATA)
except AttributeError:
    EMOJI_SET = set(emoji.UNICODE_EMOJI)
def separate_emojis_and_text(text):
    emoji_chars = []
    non_emoji_chars = []
    for c in text:
        if c in EMOJI_SET:
            emoji_chars.append(c)
        else:
            non_emoji_chars.append(c)
    return ''.join(emoji_chars), ''.join(non_emoji_chars)
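# Example:
#   >>> separate_emojis_and_text(u'nice! 😊👍')
#   ('😊👍', 'nice! ')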
def extract_emojis(text, wanted_emojis):
    """ Return the emojis in `text` that appear in `wanted_emojis`,
    ignoring variation selectors. """
    text = remove_variation_selectors(text)
    return [c for c in text if c in wanted_emojis]
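# Example (`wanted_emojis` is any container of emoji characters to keep):
#   >>> extract_emojis(u'good 😊 bad 😡', wanted_emojis={u'😊'})
#   ['😊']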
def remove_variation_selectors(text):
    """ Remove Unicode variation selectors (U+FE00-U+FE0F), which pick a
    presentation style for the preceding character (e.g. VS-16, U+FE0F,
    requests emoji-style rendering).
    """
for var in VARIATION_SELECTORS:
text = text.replace(var, '')
return text
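# Example: a red heart is often encoded as U+2764 followed by VS-16;
# stripping the selector leaves just the base character.
#   >>> remove_variation_selectors(u'\u2764\ufe0f') == u'\u2764'
#   True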
def shorten_word(word):
""" Shorten groupings of 3+ identical consecutive chars to 2, e.g. '!!!!' --> '!!'
"""
    # only shorten ASCII words; non-ASCII words are returned untouched
    try:
        word.encode('ascii')
    except (UnicodeDecodeError, UnicodeEncodeError):
        return word
# must have at least 3 char to be shortened
if len(word) < 3:
return word
# find groups of 3+ consecutive letters
letter_groups = [list(g) for k, g in groupby(word)]
triple_or_more = [''.join(g) for g in letter_groups if len(g) >= 3]
if len(triple_or_more) == 0:
return word
# replace letters to find the short word
short_word = word
for trip in triple_or_more:
short_word = short_word.replace(trip, trip[0]*2)
return short_word
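# Examples:
#   >>> shorten_word('cooool')
#   'cool'
#   >>> shorten_word('!!!!')
#   '!!'
#   >>> shorten_word(u'sooo😊')  # non-ASCII, returned untouched
#   'sooo😊'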
def detect_special_tokens(word):
    """ Replace numbers, @-mentions and URLs with their special tokens. """
    try:
        # any word that parses as an integer becomes the number token
        int(word)
        word = SPECIAL_TOKENS[4]
    except ValueError:
        if AtMentionRegex.findall(word):
            word = SPECIAL_TOKENS[2]  # @-mention token
        elif urlRegex.findall(word):
            word = SPECIAL_TOKENS[3]  # URL token
    return word
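# Examples (assuming torchmoji's standard SPECIAL_TOKENS ordering, where
# index 4 is 'CUSTOM_NUMBER' and index 2 is 'CUSTOM_AT'):
#   >>> detect_special_tokens('1234')
#   'CUSTOM_NUMBER'
#   >>> detect_special_tokens('@someone')
#   'CUSTOM_AT'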
def process_word(word):
    """ Shorten the word and convert it to a special token if relevant. """
word = shorten_word(word)
word = detect_special_tokens(word)
return word
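# Shortening runs first, so e.g. '111111' is reduced to '11' before being
# mapped to the number token; ordinary words pass through mostly unchanged:
#   >>> process_word('heyyyy')
#   'heyy'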
def remove_control_chars(text):
return CONTROL_CHAR_REGEX.sub('', text)
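# Example: C0/C1 control characters (including '\t' and '\n') are dropped;
# run convert_linebreaks first if line breaks should become tokens.
#   >>> remove_control_chars('a\tb\x00c')
#   'abc'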
def convert_nonbreaking_space(text):
# ugly hack handling non-breaking space no matter how badly it's been encoded in the input
for r in ['\\\\xc2', '\\xc2', '\xc2', '\\\\xa0', '\\xa0', '\xa0']:
text = text.replace(r, ' ')
return text
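# Example: the raw character and its mojibake/escaped spellings all
# collapse to a plain space.
#   >>> convert_nonbreaking_space(u'a\xa0b')
#   'a b'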
def convert_linebreaks(text):
    # ugly hack handling line breaks no matter how badly they've been
    # encoded in the input; spaces around the token ensure proper tokenization
for r in ['\\\\n', '\\n', '\n', '\\\\r', '\\r', '\r', '<br>']:
text = text.replace(r, ' ' + SPECIAL_TOKENS[5] + ' ')
return text
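# Example (assuming SPECIAL_TOKENS[5] is torchmoji's 'CUSTOM_BREAK'):
#   >>> convert_linebreaks('line one\nline two')
#   'line one CUSTOM_BREAK line two'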