# -*- coding: utf-8 -*- from __future__ import print_function, division, unicode_literals import sys import re import string import emoji from itertools import groupby import numpy as np from torchmoji.tokenizer import RE_MENTION, RE_URL from torchmoji.global_variables import SPECIAL_TOKENS IS_PYTHON2 = int(sys.version[0]) == 2 chr_ = unichr if IS_PYTHON2 else chr AtMentionRegex = re.compile(RE_MENTION) urlRegex = re.compile(RE_URL) # from http://bit.ly/2rdjgjE (UTF-8 encodings and Unicode chars) VARIATION_SELECTORS = [ '\ufe00', '\ufe01', '\ufe02', '\ufe03', '\ufe04', '\ufe05', '\ufe06', '\ufe07', '\ufe08', '\ufe09', '\ufe0a', '\ufe0b', '\ufe0c', '\ufe0d', '\ufe0e', '\ufe0f'] # from https://stackoverflow.com/questions/92438/stripping-non-printable-characters-from-a-string-in-python ALL_CHARS = (chr_(i) for i in range(sys.maxunicode)) CONTROL_CHARS = ''.join(map(chr_, list(range(0,32)) + list(range(127,160)))) CONTROL_CHAR_REGEX = re.compile('[%s]' % re.escape(CONTROL_CHARS)) def is_special_token(word): equal = False for spec in SPECIAL_TOKENS: if word == spec: equal = True break return equal def mostly_english(words, english, pct_eng_short=0.5, pct_eng_long=0.6, ignore_special_tokens=True, min_length=2): """ Ensure text meets threshold for containing English words """ n_words = 0 n_english = 0 if english is None: return True, 0, 0 for w in words: if len(w) < min_length: continue if punct_word(w): continue if ignore_special_tokens and is_special_token(w): continue n_words += 1 if w in english: n_english += 1 if n_words < 2: return True, n_words, n_english if n_words < 5: valid_english = n_english >= n_words * pct_eng_short else: valid_english = n_english >= n_words * pct_eng_long return valid_english, n_words, n_english def correct_length(words, min_words, max_words, ignore_special_tokens=True): """ Ensure text meets threshold for containing English words and that it's within the min and max words limits. """ if min_words is None: min_words = 0 if max_words is None: max_words = 99999 n_words = 0 for w in words: if punct_word(w): continue if ignore_special_tokens and is_special_token(w): continue n_words += 1 valid = min_words <= n_words and n_words <= max_words return valid def punct_word(word, punctuation=string.punctuation): return all([True if c in punctuation else False for c in word]) def load_non_english_user_set(): non_english_user_set = set(np.load('uids.npz')['data']) return non_english_user_set def non_english_user(userid, non_english_user_set): neu_found = int(userid) in non_english_user_set return neu_found def separate_emojis_and_text(text): emoji_chars = [] non_emoji_chars = [] for c in text: if c in emoji.UNICODE_EMOJI: emoji_chars.append(c) else: non_emoji_chars.append(c) return ''.join(emoji_chars), ''.join(non_emoji_chars) def extract_emojis(text, wanted_emojis): text = remove_variation_selectors(text) return [c for c in text if c in wanted_emojis] def remove_variation_selectors(text): """ Remove styling glyph variants for Unicode characters. For instance, remove skin color from emojis. """ for var in VARIATION_SELECTORS: text = text.replace(var, '') return text def shorten_word(word): """ Shorten groupings of 3+ identical consecutive chars to 2, e.g. '!!!!' --> '!!' """ # only shorten ASCII words try: word.decode('ascii') except (UnicodeDecodeError, UnicodeEncodeError, AttributeError) as e: return word # must have at least 3 char to be shortened if len(word) < 3: return word # find groups of 3+ consecutive letters letter_groups = [list(g) for k, g in groupby(word)] triple_or_more = [''.join(g) for g in letter_groups if len(g) >= 3] if len(triple_or_more) == 0: return word # replace letters to find the short word short_word = word for trip in triple_or_more: short_word = short_word.replace(trip, trip[0]*2) return short_word def detect_special_tokens(word): try: int(word) word = SPECIAL_TOKENS[4] except ValueError: if AtMentionRegex.findall(word): word = SPECIAL_TOKENS[2] elif urlRegex.findall(word): word = SPECIAL_TOKENS[3] return word def process_word(word): """ Shortening and converting the word to a special token if relevant. """ word = shorten_word(word) word = detect_special_tokens(word) return word def remove_control_chars(text): return CONTROL_CHAR_REGEX.sub('', text) def convert_nonbreaking_space(text): # ugly hack handling non-breaking space no matter how badly it's been encoded in the input for r in ['\\\\xc2', '\\xc2', '\xc2', '\\\\xa0', '\\xa0', '\xa0']: text = text.replace(r, ' ') return text def convert_linebreaks(text): # ugly hack handling non-breaking space no matter how badly it's been encoded in the input # space around to ensure proper tokenization for r in ['\\\\n', '\\n', '\n', '\\\\r', '\\r', '\r', '
']: text = text.replace(r, ' ' + SPECIAL_TOKENS[5] + ' ') return text