Spaces:
Running
Running
File size: 12,065 Bytes
b874891 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 |
# -*- coding: utf-8 -*-
''' Extracts lists of words from a given input to be used for later vocabulary
generation or for creating tokenized datasets.
Supports functionality for handling different file types and
filtering/processing of this input.
'''
from __future__ import division, print_function, unicode_literals
import re
import unicodedata
import numpy as np
from text_unidecode import unidecode
from torchmoji.tokenizer import RE_MENTION, tokenize
from torchmoji.filter_utils import (convert_linebreaks,
convert_nonbreaking_space,
correct_length,
extract_emojis,
mostly_english,
non_english_user,
process_word,
punct_word,
remove_control_chars,
remove_variation_selectors,
separate_emojis_and_text)
# Python 2/3 compatibility shim: make the name ``unicode`` refer to the
# text string type on both interpreters.
try:
    unicode  # Python 2
except NameError:
    unicode = str  # Python 3

# Only catch retweets in the beginning of the tweet as those are the
# automatically added ones.
# We do not want to remove tweets like "Omg.. please RT this!!"
# NOTE(review): the pattern is anchored at the start and has no word
# boundary, so any text starting with "rt"/"RT" (in any letter case)
# matches.
RETWEETS_RE = re.compile(r'^[rR][tT]')

# Use fast and less precise regex for removing tweets with URLs
# It doesn't matter too much if a few tweets with URL's make it through
URLS_RE = re.compile(r'https?://|www\.')

# Compiled version of the tokenizer's mention pattern.
MENTION_RE = re.compile(RE_MENTION)

# ASCII punctuation characters that a Unicode character may be replaced with
# when its unidecoded form consists only of these characters.
ALLOWED_CONVERTED_UNICODE_PUNCTUATION = """!"#$'()+,-.:;<=>?@`~"""
class WordGenerator(object):
    ''' Cleanses input and converts it into words. Needs all sentences to be
        in Unicode format. Has subclasses that read sentences differently
        based on file type.

        Takes an iterable (e.g. a generator or an open file) as input.

        Arguments:
            stream: Iterable yielding the raw lines/sentences to process.
            allow_unicode_text: If False, a sentence containing a word that
                is still non-ASCII after punctuation conversion (and emoji
                removal, if enabled) is dropped entirely.
            ignore_emojis: If True, the result of separate_emojis_and_text
                replaces each non-ASCII word before the ASCII check.
            remove_variation_selectors: If True, the sentence is passed
                through remove_variation_selectors before splitting.
            break_replacement: If True, the sentence is passed through
                convert_linebreaks before splitting.

        Iterating over the instance yields ``(words, info)`` tuples for every
        accepted line and keeps counters in ``self.stats``.
    '''

    def __init__(self, stream, allow_unicode_text=False, ignore_emojis=True,
                 remove_variation_selectors=True, break_replacement=True):
        self.stream = stream
        self.allow_unicode_text = allow_unicode_text
        self.remove_variation_selectors = remove_variation_selectors
        self.ignore_emojis = ignore_emojis
        self.break_replacement = break_replacement
        self.reset_stats()

    def get_words(self, sentence):
        """ Tokenizes a sentence into individual words.
            Converts Unicode punctuation into ASCII if that option is set.
            Ignores sentences with Unicode if that option is set.

            Returns an empty list of words if the sentence has Unicode and
            that is not allowed.

            Raises ValueError if the sentence is not a Unicode string.
        """
        if not isinstance(sentence, unicode):
            raise ValueError("All sentences should be Unicode-encoded!")

        sentence = sentence.strip().lower()

        if self.break_replacement:
            sentence = convert_linebreaks(sentence)

        if self.remove_variation_selectors:
            # Inside this method the bare name refers to the module-level
            # function, not to the boolean attribute of the same name.
            sentence = remove_variation_selectors(sentence)

        # Split into words using simple whitespace splitting and convert
        # Unicode. This is done to prevent word splitting issues with
        # twokenize and Unicode
        converted_words = []
        for w in sentence.split():
            accept_sentence, c_w = self.convert_unicode_word(w)
            # Unicode word detected and not allowed: drop the whole sentence
            if not accept_sentence:
                return []
            converted_words.append(c_w)

        sentence = ' '.join(converted_words)
        return [process_word(w) for w in tokenize(sentence)]

    def check_ascii(self, word):
        """ Returns whether a word is ASCII.

            Works for text strings on both Python 2 and Python 3. The
            previous implementation called ``word.decode('ascii')``, which
            does not exist on Python 3 text strings, so every word was
            reported as non-ASCII there.

            Byte strings are checked by decoding them; anything that is not
            a string yields False.
        """
        try:
            if isinstance(word, bytes):
                word.decode('ascii')
            else:
                word.encode('ascii')
        except (UnicodeDecodeError, UnicodeEncodeError, AttributeError):
            return False
        return True

    def convert_unicode_punctuation(self, word):
        """ Replaces each character whose unidecoded, lower-cased form
            consists only of allowed ASCII punctuation by that form. All
            other characters are kept unchanged.
        """
        word_converted_punct = []
        for c in word:
            decoded_c = unidecode(c).lower()
            # An empty result means the character cannot be decoded to
            # anything reasonable, so the original character is kept.
            if decoded_c and punct_word(
                    decoded_c,
                    punctuation=ALLOWED_CONVERTED_UNICODE_PUNCTUATION):
                # All punctuation and therefore fine to include the
                # unidecoded version
                word_converted_punct.append(decoded_c)
            else:
                word_converted_punct.append(c)
        return ''.join(word_converted_punct)

    def convert_unicode_word(self, word):
        """ Converts Unicode words to ASCII using unidecode. If Unicode is not
            allowed (set as a variable during initialization), then only
            punctuation that can be converted to ASCII will be allowed.

            Returns a tuple ``(accept_sentence, converted_word)``. When the
            word is rejected the tuple is ``(False, '')``.
        """
        if self.check_ascii(word):
            return True, word

        # First we ensure that the Unicode is normalized so it's
        # always a single character.
        word = unicodedata.normalize("NFKC", word)

        # Convert Unicode punctuation to ASCII equivalent. We want
        # e.g. "\u203c" (double exclamation mark) to be treated the same
        # as "!!" no matter if we allow other Unicode characters or not.
        word = self.convert_unicode_punctuation(word)

        if self.ignore_emojis:
            _, word = separate_emojis_and_text(word)

        # If conversion of punctuation and removal of emojis took care
        # of all the Unicode or if we allow Unicode then everything is fine
        if self.check_ascii(word) or self.allow_unicode_text:
            return True, word

        # Sometimes we might want to simply ignore Unicode sentences
        # (e.g. for vocabulary creation). This is another way to prevent
        # "pollution" of strange Unicode tokens from low quality datasets
        return False, ''

    def data_preprocess_filtering(self, line, iter_i):
        """ To be overridden with specific preprocessing/filtering behavior
            if desired.

            Returns a boolean of whether the line should be accepted, the
            preprocessed text and a dict with custom information.

            Runs prior to tokenization.
        """
        return True, line, {}

    def data_postprocess_filtering(self, words, iter_i):
        """ To be overridden with specific postprocessing/filtering behavior
            if desired.

            Returns a boolean of whether the line should be accepted, the
            postprocessed words and a dict with custom information.

            Runs after tokenization.
        """
        return True, words, {}

    def extract_valid_sentence_words(self, line):
        """ Line may either be a string or a list of strings depending on how
            the stream is being parsed.
            Domain-specific processing and filtering can be done both prior to
            and after tokenization.
            Custom information about the line can be extracted during the
            processing phases and returned as a dict.

            Returns a tuple ``(valid, words, info)`` and updates the filter
            counters in ``self.stats``.
        """
        info = {}

        pre_valid, pre_line, pre_info = \
            self.data_preprocess_filtering(line, self.stats['total'])
        info.update(pre_info)
        if not pre_valid:
            self.stats['pretokenization_filtered'] += 1
            return False, [], info

        words = self.get_words(pre_line)
        if len(words) == 0:
            # get_words returns nothing when a disallowed Unicode word was
            # found (or when the sentence had no tokens at all).
            self.stats['unicode_filtered'] += 1
            return False, [], info

        post_valid, post_words, post_info = \
            self.data_postprocess_filtering(words, self.stats['total'])
        info.update(post_info)
        if not post_valid:
            self.stats['posttokenization_filtered'] += 1
        return post_valid, post_words, info

    def generate_array_from_input(self):
        """ Consumes the whole stream and returns a list with every item
            produced by iterating over this generator, i.e. a list of
            ``(words, info)`` tuples.
        """
        return [item for item in self]

    def reset_stats(self):
        """ Resets all counters kept in ``self.stats`` to zero. """
        self.stats = {'pretokenization_filtered': 0,
                      'unicode_filtered': 0,
                      'posttokenization_filtered': 0,
                      'total': 0,
                      'valid': 0}

    def __iter__(self):
        """ Yields ``(words, info)`` for every accepted line of the stream.

            Raises ValueError if no stream has been set.
        """
        if self.stream is None:
            raise ValueError("Stream should be set before iterating over it!")

        for line in self.stream:
            valid, words, info = self.extract_valid_sentence_words(line)

            # Words may be filtered away due to unidecode etc.
            # In that case the words should not be passed on.
            if valid and len(words):
                self.stats['valid'] += 1
                yield words, info

            self.stats['total'] += 1
class TweetWordGenerator(WordGenerator):
    ''' Word generator for tweet input where every line of the stream is one
        tab-separated record. Field index 9 is used as the tweet text and
        field index 1 is passed to non_english_user.
        Any file opening/closing should be handled outside of this class.

        Arguments:
            stream: Iterable yielding tab-separated tweet lines.
            wanted_emojis: If not None, tweets for which extract_emojis finds
                none of these emojis are rejected.
            english_words: Passed to mostly_english after tokenization.
            non_english_user_set: If not None, tweets for which
                non_english_user returns a true value are rejected.
            allow_unicode_text: Forwarded to WordGenerator.
            ignore_retweets: Reject tweets matching RETWEETS_RE.
            ignore_url_tweets: Reject tweets matching URLS_RE.
            ignore_mention_tweets: Reject tweets matching MENTION_RE.
    '''

    def __init__(self, stream, wanted_emojis=None, english_words=None,
                 non_english_user_set=None, allow_unicode_text=False,
                 ignore_retweets=True, ignore_url_tweets=True,
                 ignore_mention_tweets=False):
        self.wanted_emojis = wanted_emojis
        self.english_words = english_words
        self.non_english_user_set = non_english_user_set
        self.ignore_retweets = ignore_retweets
        self.ignore_url_tweets = ignore_url_tweets
        self.ignore_mention_tweets = ignore_mention_tweets
        WordGenerator.__init__(self, stream,
                               allow_unicode_text=allow_unicode_text)

    def validated_tweet(self, data):
        ''' A bunch of checks to determine whether the tweet is valid.
            Also returns emojis contained by the tweet.

            ``data`` is the list of fields of one tweet record. Returns a
            tuple ``(valid, emojis)``; ``emojis`` is empty when the tweet is
            rejected or when no wanted emojis were configured.
        '''
        # Ordering of validations is important for speed
        # If it passes all checks, then the tweet is validated for usage

        # Skips incomplete tweets (the text lives at index 9)
        if len(data) <= 9:
            return False, []

        text = data[9]

        if self.ignore_retweets and RETWEETS_RE.search(text):
            return False, []

        if self.ignore_url_tweets and URLS_RE.search(text):
            return False, []

        if self.ignore_mention_tweets and MENTION_RE.search(text):
            return False, []

        if self.wanted_emojis is not None:
            uniq_emojis = np.unique(extract_emojis(text, self.wanted_emojis))
            # len() rather than truthiness: np.unique returns an array
            if len(uniq_emojis) == 0:
                return False, []
        else:
            uniq_emojis = []

        if self.non_english_user_set is not None and \
                non_english_user(data[1], self.non_english_user_set):
            return False, []

        return True, uniq_emojis

    def data_preprocess_filtering(self, line, iter_i):
        """ Splits the tab-separated line, validates the tweet and cleans
            its text.

            Returns ``(valid, text, info)`` where ``info`` holds the emojis
            found under the key 'emojis'. ``text`` is '' for rejected tweets.
        """
        fields = line.strip().split("\t")
        valid, emojis = self.validated_tweet(fields)
        info = {'emojis': emojis}
        if not valid:
            return valid, '', info

        # Drop the literal two-character sequences "\n" and "\r" and undo
        # the HTML escaping of ampersands. The entity has to be spelled out
        # here; replacing '&' with '&' would be a no-op.
        text = fields[9].replace('\\n', '') \
                        .replace('\\r', '') \
                        .replace('&amp;', '&')
        return valid, text, info

    def data_postprocess_filtering(self, words, iter_i):
        """ Accepts the tokenized tweet only if it passes the length check
            and mostly_english reports it as English.

            Returns ``(valid, words, info)``; ``words`` is an empty list for
            rejected tweets, ``info`` always carries the computed counts.
        """
        valid_length = correct_length(words, 1, None)
        valid_english, n_words, n_english = mostly_english(words,
                                                           self.english_words)
        info = {'length': len(words),
                'n_normal_words': n_words,
                'n_english': n_english}
        if valid_length and valid_english:
            return True, words, info
        return False, [], info
|