|
|
|
|
|
from __future__ import print_function, division, unicode_literals |
|
import sys |
|
import re |
|
import string |
|
import emoji |
|
from itertools import groupby |
|
|
|
import numpy as np |
|
from torchmoji.tokenizer import RE_MENTION, RE_URL |
|
from torchmoji.global_variables import SPECIAL_TOKENS |
|
|
|
# True when running under a Python 2 interpreter
# (sys.version starts with the major version digit).
IS_PYTHON2 = int(sys.version[0]) == 2

# Portable code-point -> character function: unichr on Py2, chr on Py3.
# (The conditional only evaluates `unichr` on Py2, so no NameError on Py3.)
chr_ = unichr if IS_PYTHON2 else chr

# Pre-compiled patterns for @-mentions and URLs; the raw patterns come
# from torchmoji.tokenizer. Used by detect_special_tokens().
AtMentionRegex = re.compile(RE_MENTION)

urlRegex = re.compile(RE_URL)
|
|
|
|
|
# The 16 Unicode variation selectors U+FE00..U+FE0F. These invisibly modify
# the presentation of the preceding character (e.g. emoji vs. text style)
# and are stripped out by remove_variation_selectors().
VARIATION_SELECTORS = [ '\ufe00',
                        '\ufe01',
                        '\ufe02',
                        '\ufe03',
                        '\ufe04',
                        '\ufe05',
                        '\ufe06',
                        '\ufe07',
                        '\ufe08',
                        '\ufe09',
                        '\ufe0a',
                        '\ufe0b',
                        '\ufe0c',
                        '\ufe0d',
                        '\ufe0e',
                        '\ufe0f']
|
|
|
|
|
# Lazy generator over every Unicode code point.
# NOTE(review): not referenced anywhere in this module -- possibly kept for
# external consumers; confirm before removing.
ALL_CHARS = (chr_(i) for i in range(sys.maxunicode))

# The C0 (0-31) and C1 (127-159) control characters as one string.
CONTROL_CHARS = ''.join(map(chr_, list(range(0,32)) + list(range(127,160))))

# Matches any single control character; used by remove_control_chars().
CONTROL_CHAR_REGEX = re.compile('[%s]' % re.escape(CONTROL_CHARS))
|
|
|
def is_special_token(word):
    """ Return True if `word` is exactly one of the SPECIAL_TOKENS
    placeholders, False otherwise.
    """
    return any(word == token for token in SPECIAL_TOKENS)
|
|
|
def mostly_english(words, english, pct_eng_short=0.5, pct_eng_long=0.6, ignore_special_tokens=True, min_length=2):
    """ Ensure text meets threshold for containing English words.

    # Arguments:
        words: Tokenized words to inspect.
        english: Container of known English words (None disables the check).
        pct_eng_short: Required English ratio for texts of 2-4 counted words.
        pct_eng_long: Required English ratio for texts of 5+ counted words.
        ignore_special_tokens: Skip SPECIAL_TOKENS placeholders when counting.
        min_length: Words shorter than this are not counted.

    # Returns:
        Tuple (passes_threshold, n_words_counted, n_english_found).
    """
    if english is None:
        return True, 0, 0

    n_words = 0
    n_english = 0
    for token in words:
        # Skip short words, pure punctuation, and (optionally) special tokens.
        if (len(token) < min_length
                or punct_word(token)
                or (ignore_special_tokens and is_special_token(token))):
            continue
        n_words += 1
        if token in english:
            n_english += 1

    # Too few real words to judge -- accept unconditionally.
    if n_words < 2:
        return True, n_words, n_english

    threshold = pct_eng_short if n_words < 5 else pct_eng_long
    return n_english >= n_words * threshold, n_words, n_english
|
|
|
def correct_length(words, min_words, max_words, ignore_special_tokens=True):
    """ Check that the number of "real" words (non-punctuation and,
    optionally, non-special-token) lies within [min_words, max_words].
    A None bound means unbounded on that side (max defaults to 99999).
    """
    lower = 0 if min_words is None else min_words
    upper = 99999 if max_words is None else max_words

    n_words = sum(
        1 for token in words
        if not punct_word(token)
        and not (ignore_special_tokens and is_special_token(token)))

    return lower <= n_words <= upper
|
|
|
def punct_word(word, punctuation=string.punctuation):
    """ Return True if every character of `word` is punctuation.

    Note: an empty string vacuously returns True (all() over an empty
    iterable), matching the original behavior.
    """
    # Idiom fix: the original built a throwaway list of booleans via
    # `[True if c in punctuation else False for c in word]`; a generator
    # expression short-circuits and avoids the intermediate list.
    return all(c in punctuation for c in word)
|
|
|
def load_non_english_user_set(path='uids.npz'):
    """ Load the set of known non-English user ids.

    # Arguments:
        path: npz archive holding the ids under the 'data' key.
            Defaults to 'uids.npz' in the working directory, preserving
            the original hard-coded behavior (generalized to a parameter
            so callers can point at a different location).

    # Returns:
        Set of user ids.
    """
    non_english_user_set = set(np.load(path)['data'])
    return non_english_user_set
|
|
|
def non_english_user(userid, non_english_user_set):
    """ Return True if `userid` (coerced to int) is in the set of
    known non-English users.
    """
    return int(userid) in non_english_user_set
|
|
|
def separate_emojis_and_text(text):
    """ Split `text` into its emoji characters and everything else.

    # Returns:
        Tuple (emoji_string, non_emoji_string); each preserves the
        characters' original relative order.
    """
    emoji_part = ''.join(ch for ch in text if ch in emoji.UNICODE_EMOJI)
    text_part = ''.join(ch for ch in text if ch not in emoji.UNICODE_EMOJI)
    return emoji_part, text_part
|
|
|
def extract_emojis(text, wanted_emojis):
    """ Return, in order, every character of `text` (after stripping
    variation selectors) that appears in `wanted_emojis`.
    """
    cleaned = remove_variation_selectors(text)
    found = []
    for ch in cleaned:
        if ch in wanted_emojis:
            found.append(ch)
    return found
|
|
|
def remove_variation_selectors(text):
    """ Remove styling glyph variants (U+FE00..U+FE0F) for Unicode
    characters -- for instance, skin-tone modifiers on emojis.
    """
    # One C-level pass via str.translate instead of chained replace()
    # calls; deleting each selector is equivalent to replacing it with ''.
    deletion_table = {ord(selector): None for selector in VARIATION_SELECTORS}
    return text.translate(deletion_table)
|
|
|
def shorten_word(word):
    """ Shorten groupings of 3+ identical consecutive chars to 2, e.g. '!!!!' --> '!!'

    Non-ASCII words are returned unchanged, as are words too short to
    contain a triple.
    """
    # Only ASCII words are shortened.
    # BUG FIX: the original used word.decode('ascii'), which raises
    # AttributeError for every Python 3 str (str has no .decode), so the
    # function silently never shortened anything on Python 3. encode()
    # performs the intended ASCII check on both Python 2 and Python 3
    # (Py2 byte strings implicitly decode first, raising
    # UnicodeDecodeError for non-ASCII input).
    try:
        word.encode('ascii')
    except (UnicodeDecodeError, UnicodeEncodeError):
        return word

    # A word shorter than 3 characters cannot contain a triple.
    if len(word) < 3:
        return word

    # Find groups of 3+ identical consecutive characters.
    letter_groups = [list(g) for k, g in groupby(word)]
    triple_or_more = [''.join(g) for g in letter_groups if len(g) >= 3]
    if len(triple_or_more) == 0:
        return word

    # Replace each run of 3+ repeats with exactly two of that character.
    short_word = word
    for trip in triple_or_more:
        short_word = short_word.replace(trip, trip[0]*2)

    return short_word
|
|
|
def detect_special_tokens(word):
    """ Map numbers, @-mentions and URLs to their SPECIAL_TOKENS
    placeholders; any other word is returned unchanged.
    """
    try:
        int(word)
    except ValueError:
        # Not a number -- check for mention, then URL.
        if AtMentionRegex.findall(word):
            return SPECIAL_TOKENS[2]
        if urlRegex.findall(word):
            return SPECIAL_TOKENS[3]
        return word
    return SPECIAL_TOKENS[4]
|
|
|
def process_word(word):
    """ Shorten repeated characters, then substitute the word with a
    special token (number / mention / URL) where applicable.
    """
    return detect_special_tokens(shorten_word(word))
|
|
|
def remove_control_chars(text):
    """ Delete every C0/C1 control character from `text`. """
    cleaned = CONTROL_CHAR_REGEX.sub('', text)
    return cleaned
|
|
|
def convert_nonbreaking_space(text):
    """ Replace non-breaking-space artifacts -- the raw U+00C2/U+00A0
    characters and their escaped textual forms -- with a regular space.
    """
    artifacts = ('\\\\xc2', '\\xc2', '\xc2', '\\\\xa0', '\\xa0', '\xa0')
    for artifact in artifacts:
        text = text.replace(artifact, ' ')
    return text
|
|
|
def convert_linebreaks(text):
    """ Replace line-break markers -- raw and escaped \\n / \\r plus the
    HTML <br> tag -- with the space-padded newline special token.
    """
    padded_token = ' ' + SPECIAL_TOKENS[5] + ' '
    for marker in ('\\\\n', '\\n', '\n', '\\\\r', '\\r', '\r', '<br>'):
        text = text.replace(marker, padded_token)
    return text
|
|