# -*- coding: utf-8 -*- from __future__ import print_function, division import codecs import csv import numpy as np from emoji import UNICODE_EMOJI def read_english(path="english_words.txt", add_emojis=True): # read english words for filtering (includes emojis as part of set) english = set() with codecs.open(path, "r", "utf-8") as f: for line in f: line = line.strip().lower().replace('\n', '') if len(line): english.add(line) if add_emojis: for e in UNICODE_EMOJI: english.add(e) return english def read_wanted_emojis(path="wanted_emojis.csv"): emojis = [] with open(path, 'rb') as f: reader = csv.reader(f) for line in reader: line = line[0].strip().replace('\n', '') line = line.decode('unicode-escape') emojis.append(line) return emojis def read_non_english_users(path="unwanted_users.npz"): try: neu_set = set(np.load(path)['userids']) except IOError: neu_set = set() return neu_set