# DoubleHardDebias / utils.py
import string
from tqdm import tqdm
# import pickle
import scipy
import scipy.spatial.distance  # needed for scipy.spatial.distance.cosine in similarity()
import numpy as np
from numpy import linalg as LA
from sklearn.decomposition import PCA
# Experiment 1: WEAT word sets (Caliskan et al., 2017)
# A/B: male/female first names; C/D: career vs. family terms;
# E/F: math vs. arts terms; G/H: science vs. arts terms.
WEAT_words = {
'A':['John', 'Paul', 'Mike', 'Kevin', 'Steve', 'Greg', 'Jeff', 'Bill'],
'B':['Amy', 'Joan', 'Lisa', 'Sarah', 'Diana', 'Kate', 'Ann', 'Donna'],
'C':['executive', 'management', 'professional', 'corporation', 'salary', 'office', 'business', 'career'],
'D':['home', 'parents', 'children', 'family', 'cousins', 'marriage', 'wedding', 'relatives'],
'E':['math', 'algebra', 'geometry', 'calculus', 'equations', 'computation', 'numbers', 'addition'],
'F':['poetry', 'art', 'dance', 'literature', 'novel', 'symphony', 'drama', 'sculpture'],
'G':['science', 'technology', 'physics', 'chemistry', 'einstein', 'nasa', 'experiment', 'astronomy'],
'H':['poetry', 'art', 'shakespeare', 'dance', 'literature', 'novel', 'symphony', 'drama'],
}
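# These sets pair up as WEAT tests (one target pair scored against the name sets A/B).
# A sketch of how they plug into the helpers defined at the end of this module, assuming
# `wv`, `w2i`, `vocab` come from embeddings loaded elsewhere:
#   effect_size(WEAT_words['E'], WEAT_words['F'], WEAT_words['A'], WEAT_words['B'], wv, w2i, vocab)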
def has_punct(w):
    # True if the word contains any punctuation character
    return any(c in string.punctuation for c in w)
def has_digit(w):
    # True if the word contains any digit
    return any(c in '0123456789' for c in w)
def limit_vocab(wv, w2i, vocab, exclude = None):
    # restrict to the 50k most frequent words that are lower-case, reasonably short,
    # and free of digits/punctuation; optionally drop an exclusion list
    vocab_limited = []
for w in tqdm(vocab[:50000]):
if w.lower() != w:
continue
if len(w) >= 20:
continue
if has_digit(w):
continue
if '_' in w:
p = [has_punct(subw) for subw in w.split('_')]
if not any(p):
vocab_limited.append(w)
continue
if has_punct(w):
continue
vocab_limited.append(w)
if exclude:
vocab_limited = list(set(vocab_limited) - set(exclude))
print("size of vocabulary:", len(vocab_limited))
wv_limited = np.zeros((len(vocab_limited), len(wv[0, :])))
for i,w in enumerate(vocab_limited):
wv_limited[i,:] = wv[w2i[w],:]
w2i_limited = {w: i for i, w in enumerate(vocab_limited)}
return vocab_limited, wv_limited, w2i_limited
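# Example usage (a sketch; assumes `wv`, `w2i`, `vocab` were loaded from pretrained
# embeddings elsewhere, and `gender_specific` is an optional list of words to drop):
#   vocab_limited, wv_limited, w2i_limited = limit_vocab(wv, w2i, vocab, exclude=gender_specific)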
def norm_stand(wv):
    # row-normalize: divide each word vector by its L2 norm
    d = np.sum(wv ** 2, 1) ** 0.5
    W_norm = (wv.T / d).T
    return W_norm
def normalize(wv):
# normalize vectors
norms = np.apply_along_axis(LA.norm, 1, wv)
wv = wv / norms[:, np.newaxis]
return wv
def topK(w, wv, w2i, vocab, k=10):
# extract the word vector for word w
idx = w2i[w]
vec = wv[idx, :]
    # cosine similarity of w with every word (assumes rows of wv are unit-normalized)
sim = wv.dot(vec)
# sim = []
# for i in range(len(wv)):
# sim.append(1-scipy.spatial.distance.cosine(wv[i, :], vec))
# sim = np.array(sim)
    # sort similarities in descending order
    sort_sim = (sim.argsort())[::-1]
    # take the top k+1 indices and drop the query word itself
    best = sort_sim[:(k + 1)]
    return [vocab[i] for i in best if i != idx]
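# Example (a sketch; assumes `wv` has been unit-normalized first, e.g. with normalize(wv)):
#   neighbours = topK('doctor', wv, w2i, vocab, k=10)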
def similarity(w1, w2, wv, w2i):
i1 = w2i[w1]
i2 = w2i[w2]
vec1 = wv[i1, :]
vec2 = wv[i2, :]
return 1-scipy.spatial.distance.cosine(vec1, vec2)
def drop(u, v):
    # remove from u its projection onto v
    return u - v * u.dot(v) / v.dot(v)
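# Example: projecting a bias direction out of a word vector (a sketch; `gender_direction`
# is a hypothetical name for the first principal component returned by doPCA below):
#   debiased_vec = drop(wv[w2i['doctor'], :], gender_direction)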
from sklearn import preprocessing
def doPCA(pairs, wv, w2i):
matrix = []
cnt = 0
if type(pairs[0]) is list:
for a, b in pairs:
if not (a in w2i and b in w2i): continue
center = (wv[w2i[a], :] + wv[w2i[b], :])/2
matrix.append(wv[w2i[a], :] - center)
matrix.append(wv[w2i[b], :] - center)
cnt += 1
else:
for a in pairs:
if not (a in w2i): continue
matrix.append(wv[w2i[a], :])
cnt += 1
    embeds = np.array(matrix)
    # center the vectors before PCA
    matrix = embeds - np.mean(embeds, axis=0)
    pca = PCA()
    pca.fit(matrix)
    print('pairs used in PCA: ', cnt)
    return pca
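# Example: estimating a gender direction from definitional pairs (a sketch; the pair list
# below is illustrative, in the spirit of Bolukbasi et al., not the exact list used elsewhere):
#   definitional_pairs = [['woman', 'man'], ['girl', 'boy'], ['she', 'he'], ['her', 'him']]
#   gender_direction = doPCA(definitional_pairs, wv, w2i).components_[0]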
# get tuples of (word, bias, #male-biased neighbors, #female-biased neighbors) for bias-by-neighbors
import operator
def bias_by_neighbors(wv, w2i, vocab, gender_bias_bef, size, neighbours_num = 100):
tuples = []
sorted_g = sorted(gender_bias_bef.items(), key=operator.itemgetter(1))
female = [item[0] for item in sorted_g[:size]]
male = [item[0] for item in sorted_g[-size:]]
# vocab = male + female
selected = female + male if size > 0 else vocab
for w in selected:
top = topK(w, wv, w2i, vocab, k=neighbours_num+5)[:neighbours_num]
m = 0
f = 0
for t in top:
if gender_bias_bef[t] > 0:
m+=1
else:
f+=1
tuples.append((w, gender_bias_bef[w], m, f))
return tuples
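# Example (a sketch; `gender_bias_bef` is assumed to map each word to its bias-by-projection,
# e.g. wv[w2i[w]].dot(gender_direction), computed elsewhere):
#   tuples = bias_by_neighbors(wv, w2i, vocab, gender_bias_bef, size=500)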
def get_tuples_prof(wv, w2i, vocab, words, gender_bias_dict):
    # same as bias_by_neighbors, but for a fixed word list (e.g. professions)
    wv = normalize(wv)
tuples = []
for w in words:
if w not in gender_bias_dict:
continue
top = topK(w, wv, w2i, vocab, k=105)[:100]
m = 0
f = 0
for t in top:
if gender_bias_dict[t] > 0:
m+=1
else:
f+=1
tuples.append((w, gender_bias_dict[w], m, f))
return tuples
# compute correlation between bias-by-projection and bias-by-neighbors
import scipy.stats
def pearson(a,b):
return scipy.stats.pearsonr(a,b)
def compute_corr(tuples, i1, i2):
a = []
b = []
for t in tuples:
a.append(t[i1])
b.append(t[i2])
assert(len(a)==len(b))
print('pearson: ', scipy.stats.pearsonr(a,b))
print('spearman: ', scipy.stats.spearmanr(a, b))
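# Example: correlation between bias-by-projection (tuple index 1) and the count of
# male-biased neighbors (tuple index 2) in the tuples produced above (a sketch):
#   compute_corr(tuples, 1, 2)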
# Auxiliary functions for clustering and visualization
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
def visualize(vectors, y_true, y_pred, ax, title, random_state, num_clusters = 2):
    # project to 2-D with t-SNE; marker encodes the true label, color encodes the predicted cluster
X_embedded = TSNE(n_components=2, random_state=random_state).fit_transform(vectors)
for x,p,y in zip(X_embedded, y_pred, y_true):
if p:
if y:
ax.scatter(x[0], x[1], marker = '.', c = 'c')
else:
ax.scatter(x[0], x[1], marker = 'x', c = 'c')
else:
if y:
ax.scatter(x[0], x[1], marker = '.', c = 'darkviolet')
else:
ax.scatter(x[0], x[1], marker = 'x', c = 'darkviolet')
ax.text(.01, .9, title ,transform=ax.transAxes, fontsize=15)
def extract_vectors(words, wv, w2i):
X = [wv[w2i[x],:] for x in words]
return X
def cluster_and_visualize(words, X, random_state, y_true, num=2):
y_pred = KMeans(n_clusters=num, random_state=random_state).fit_predict(X)
# fig, axs = plt.subplots(figsize=(6, 3))
# visualize(X, y_true, y_pred, axs, 'Original', random_state)
    correct = [1 if item1 == item2 else 0 for (item1, item2) in zip(y_true, y_pred)]
    # cluster-alignment accuracy: KMeans labels are arbitrary, so take the better of the two alignments
    print('precision', max(sum(correct)/float(len(correct)), 1 - sum(correct)/float(len(correct))))
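# Example (a sketch; `male_words` and `female_words` are assumed to be the most biased
# words on each side of the gender direction, selected elsewhere):
#   words = male_words + female_words
#   X = extract_vectors(words, wv, w2i)
#   y_true = [1] * len(male_words) + [0] * len(female_words)
#   cluster_and_visualize(words, X, random_state=1, y_true=y_true)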
from sklearn import svm
def train_and_predict(wv, w2i, vocab, size_train, size_test, males, females):
    X_train = [wv[w2i[w], :] for w in males[:size_train] + females[:size_train]]
    Y_train = [1]*size_train + [0]*size_train
    # take the next size_test words of each gender as the held-out test set
    X_test = [wv[w2i[w], :] for w in males[size_train:size_train+size_test] + females[size_train:size_train+size_test]]
    Y_test = [1]*size_test + [0]*size_test
clf = svm.SVC(gamma='auto')
clf.fit(X_train, Y_train)
preds = clf.predict(X_test)
accuracy = [1 if y==z else 0 for y,z in zip(preds, Y_test)]
    acc = float(sum(accuracy))/len(accuracy)
    print('accuracy:', acc)
return acc
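# Example (a sketch; `males` and `females` are assumed to be lists of gender-biased words
# with at least size_train + size_test entries each):
#   acc = train_and_predict(wv, w2i, vocab, size_train=1000, size_test=500, males=males, females=females)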
# Auxiliary functions for experiments by Caliskan et al.
import random
import itertools
import scipy.special  # scipy.special.comb is used in p_value_sample
def s_word(w, A, B, wv, w2i, vocab, all_s_words):
    # s(w, A, B): w's mean similarity to A minus its mean similarity to B (cached in all_s_words)
    if w in all_s_words:
return all_s_words[w]
mean_a = []
mean_b = []
for a in A:
mean_a.append(similarity(w, a, wv, w2i))
for b in B:
mean_b.append(similarity(w, b, wv, w2i))
mean_a = sum(mean_a)/float(len(mean_a))
mean_b = sum(mean_b)/float(len(mean_b))
all_s_words[w] = mean_a - mean_b
return all_s_words[w]
def s_group(X, Y, A, B, wv, w2i, vocab, all_s_words):
    # WEAT test statistic: sum over X of s(x, A, B) minus sum over Y of s(y, A, B)
    total = 0
for x in X:
total += s_word(x, A, B, wv, w2i, vocab, all_s_words)
for y in Y:
total -= s_word(y, A, B, wv, w2i, vocab, all_s_words)
return total
def p_value_exhust(X, Y, A, B, wv, w2i, vocab):
    # exact permutation-test p-value over all equal-size splits of X and Y combined
    if len(X) > 10:
print('might take too long, use sampled version: p_value')
return
assert(len(X) == len(Y))
all_s_words = {}
s_orig = s_group(X, Y, A, B, wv, w2i, vocab, all_s_words)
union = set(X+Y)
subset_size = int(len(union)/2)
larger = 0
total = 0
for subset in set(itertools.combinations(union, subset_size)):
total += 1
Xi = list(set(subset))
Yi = list(union - set(subset))
if s_group(Xi, Yi, A, B, wv, w2i, vocab, all_s_words) > s_orig:
larger += 1
print('num of samples', total)
return larger/float(total)
def association_diff(t, A, B, wv, w2i):
    # mean similarity of target word t to A minus its mean similarity to B
    mean_a = []
mean_b = []
for a in A:
mean_a.append(similarity(t, a, wv, w2i))
for b in B:
mean_b.append(similarity(t, b, wv, w2i))
mean_a = sum(mean_a)/float(len(mean_a))
mean_b = sum(mean_b)/float(len(mean_b))
return mean_a - mean_b
def effect_size(X, Y, A, B, wv, w2i, vocab):
    # WEAT effect size: Cohen's d of the association differences of X vs. Y with (A, B)
    assert(len(X) == len(Y))
assert(len(A) == len(B))
norm_x = []
norm_y = []
for x in X:
norm_x.append(association_diff(x, A, B, wv, w2i))
for y in Y:
norm_y.append(association_diff(y, A, B, wv, w2i))
std = np.std(norm_x+norm_y, ddof=1)
norm_x = sum(norm_x) / float(len(norm_x))
norm_y = sum(norm_y) / float(len(norm_y))
return (norm_x-norm_y)/std
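# Example: scoring the career/family association (a sketch; assumes `wv`, `w2i`, `vocab`
# come from embeddings loaded elsewhere, with the WEAT_words sets defined at the top):
#   e = effect_size(WEAT_words['C'], WEAT_words['D'], WEAT_words['A'], WEAT_words['B'], wv, w2i, vocab)
#   p = p_value_exhust(WEAT_words['C'], WEAT_words['D'], WEAT_words['A'], WEAT_words['B'], wv, w2i, vocab)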
def p_value_sample(X, Y, A, B, wv, w2i, vocab):
    # sampled permutation-test p-value (for target sets too large for p_value_exhust)
    random.seed(10)
np.random.seed(10)
all_s_words = {}
assert(len(X) == len(Y))
length = len(X)
s_orig = s_group(X, Y, A, B, wv, w2i, vocab, all_s_words)
num_of_samples = min(1000000, int(scipy.special.comb(length*2,length)*100))
print('num of samples', num_of_samples)
larger = 0
    for i in range(num_of_samples):
        permute = np.random.permutation(X + Y)
        Xi = list(permute[:length])
        Yi = list(permute[length:])
        # pass wv, w2i, vocab explicitly (the original call referenced an undefined `space`)
        if s_group(Xi, Yi, A, B, wv, w2i, vocab, all_s_words) > s_orig:
            larger += 1
    return larger/float(num_of_samples)