Spaces:
Running
Running
File size: 1,073 Bytes
3affa92 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 |
# -*- coding: utf-8 -*-
from __future__ import print_function, division
import codecs
import csv
import numpy as np
from emoji import UNICODE_EMOJI
def read_english(path="english_words.txt", add_emojis=True):
    """Load a vocabulary set from a UTF-8 text file with one entry per line.

    Every line is stripped of surrounding whitespace and lower-cased; lines
    that end up empty are dropped.

    Args:
        path: Location of the word list, decoded as UTF-8.
        add_emojis: When true, every key of ``UNICODE_EMOJI`` is added to the
            returned set as well.

    Returns:
        A ``set`` of the normalised entries (plus the emoji keys if requested).
    """
    with codecs.open(path, "r", "utf-8") as handle:
        normalised = (raw.strip().lower().replace('\n', '') for raw in handle)
        # Consume the generator while the file is still open.
        vocabulary = {word for word in normalised if word}

    if add_emojis:
        # NOTE(review): assumes iterating UNICODE_EMOJI yields the emoji
        # strings themselves -- confirm against the installed emoji version.
        vocabulary.update(UNICODE_EMOJI)

    return vocabulary
def read_wanted_emojis(path="wanted_emojis.csv"):
    """Read escaped emoji codes from the first column of a CSV file.

    The first cell of every row is stripped, has any embedded newline
    removed, and is decoded with the ``unicode-escape`` codec (so a cell
    such as ``\\U0001f602`` becomes the corresponding character). Other
    columns are ignored. Completely blank lines are skipped.

    Works on both Python 2 and Python 3.

    Args:
        path: Location of the CSV file.

    Returns:
        A list of decoded unicode strings, in file order.
    """
    if str is bytes:
        # Python 2: the csv module operates on byte strings.
        handle = open(path, 'rb')
    else:
        # Python 3: csv needs a text file opened with newline=''. latin-1
        # maps each byte to exactly one code point, so the original bytes
        # can be recovered losslessly before unicode-escape decoding.
        handle = open(path, 'r', encoding='latin-1', newline='')

    emojis = []
    with handle as f:
        for row in csv.reader(f):
            if not row:
                # A blank line parses to an empty row; indexing it would
                # raise IndexError.
                continue
            raw = row[0]
            if not isinstance(raw, bytes):
                raw = raw.encode('latin-1')
            raw = raw.strip().replace(b'\n', b'')
            emojis.append(raw.decode('unicode-escape'))
    return emojis
def read_non_english_users(path="unwanted_users.npz"):
    """Load the set of user ids stored under ``'userids'`` in an ``.npz`` file.

    Args:
        path: Location of the NumPy ``.npz`` archive.

    Returns:
        A ``set`` built from the ``'userids'`` array, or an empty set when
        the file cannot be opened (``IOError``).
    """
    try:
        # np.load keeps the archive's file handle open; the context manager
        # closes it once the array has been materialised into a set.
        with np.load(path) as archive:
            return set(archive['userids'])
    except IOError:
        # Missing/unreadable file is treated as "no unwanted users".
        return set()
|