import string
from tqdm import tqdm
# import pickle
import scipy
import scipy.spatial
import numpy as np
from numpy import linalg as LA
from sklearn.decomposition import PCA


# Experiment 1
WEAT_words = {
    'A': ['John', 'Paul', 'Mike', 'Kevin', 'Steve', 'Greg', 'Jeff', 'Bill'],
    'B': ['Amy', 'Joan', 'Lisa', 'Sarah', 'Diana', 'Kate', 'Ann', 'Donna'],
    'C': ['executive', 'management', 'professional', 'corporation', 'salary', 'office', 'business', 'career'],
    'D': ['home', 'parents', 'children', 'family', 'cousins', 'marriage', 'wedding', 'relatives'],
    'E': ['math', 'algebra', 'geometry', 'calculus', 'equations', 'computation', 'numbers', 'addition'],
    'F': ['poetry', 'art', 'dance', 'literature', 'novel', 'symphony', 'drama', 'sculpture'],
    'G': ['science', 'technology', 'physics', 'chemistry', 'einstein', 'nasa', 'experiment', 'astronomy'],
    'H': ['poetry', 'art', 'shakespeare', 'dance', 'literature', 'novel', 'symphony', 'drama'],
}


def has_punct(w):
    if any([c in string.punctuation for c in w]):
        return True
    return False


def has_digit(w):
    if any([c in '0123456789' for c in w]):
        return True
    return False


def limit_vocab(wv, w2i, vocab, exclude=None):
    # keep lowercase, punctuation-free, digit-free words among the 50,000 most frequent
    vocab_limited = []
    for w in tqdm(vocab[:50000]):
        if w.lower() != w:
            continue
        if len(w) >= 20:
            continue
        if has_digit(w):
            continue
        if '_' in w:
            p = [has_punct(subw) for subw in w.split('_')]
            if not any(p):
                vocab_limited.append(w)
            continue
        if has_punct(w):
            continue
        vocab_limited.append(w)

    if exclude:
        vocab_limited = list(set(vocab_limited) - set(exclude))

    print("size of vocabulary:", len(vocab_limited))

    wv_limited = np.zeros((len(vocab_limited), len(wv[0, :])))
    for i, w in enumerate(vocab_limited):
        wv_limited[i, :] = wv[w2i[w], :]

    w2i_limited = {w: i for i, w in enumerate(vocab_limited)}

    return vocab_limited, wv_limited, w2i_limited


def norm_stand(wv):
    W_norm = np.zeros(wv.shape)
    d = (np.sum(wv ** 2, 1) ** (0.5))
    W_norm = (wv.T / d).T
    return W_norm


def normalize(wv):
    # normalize vectors to unit length
    norms = np.apply_along_axis(LA.norm, 1, wv)
    wv = wv / norms[:, np.newaxis]
    return wv


def topK(w, wv, w2i, vocab, k=10):
    # extract the word vector for word w
    idx = w2i[w]
    vec = wv[idx, :]

    # compute similarity of w with all words in the vocabulary
    sim = wv.dot(vec)
    # sim = []
    # for i in range(len(wv)):
    #     sim.append(1 - scipy.spatial.distance.cosine(wv[i, :], vec))
    # sim = np.array(sim)

    # sort similarities in descending order
    sort_sim = (sim.argsort())[::-1]

    # choose topK, excluding the word itself
    best = sort_sim[:(k + 1)]

    return [vocab[i] for i in best if i != idx]


def similarity(w1, w2, wv, w2i):
    i1 = w2i[w1]
    i2 = w2i[w2]
    vec1 = wv[i1, :]
    vec2 = wv[i2, :]
    return 1 - scipy.spatial.distance.cosine(vec1, vec2)


def drop(u, v):
    # remove from u its projection onto v
    return u - v * u.dot(v) / v.dot(v)


from sklearn.decomposition import PCA
from sklearn import preprocessing


def doPCA(pairs, wv, w2i):
    # PCA over the differences between each pair word and the pair center;
    # if `pairs` is a flat list of words, PCA is done over the centered vectors instead
    matrix = []
    cnt = 0

    if type(pairs[0]) is list:
        for a, b in pairs:
            if not (a in w2i and b in w2i):
                continue
            center = (wv[w2i[a], :] + wv[w2i[b], :]) / 2
            matrix.append(wv[w2i[a], :] - center)
            matrix.append(wv[w2i[b], :] - center)
            cnt += 1
    else:
        for a in pairs:
            if not (a in w2i):
                continue
            matrix.append(wv[w2i[a], :])
            cnt += 1

    embeds = np.array(matrix)
    wv_mean = np.mean(np.array(embeds), axis=0)
    wv_hat = np.zeros(embeds.shape).astype(float)

    for i in range(len(embeds)):
        wv_hat[i, :] = embeds[i, :] - wv_mean

    matrix = wv_hat
    matrix = np.array(matrix)
    pca = PCA()
    pca.fit(matrix)
    print('pairs used in PCA: ', cnt)
    return pca
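
# Usage sketch (illustrative, not part of the original pipeline): given an embedding
# matrix `wv`, an index `w2i` and a word list `vocab` loaded elsewhere, the helpers
# above can be combined to compute a gender direction and a per-word
# bias-by-projection score. The definitional pairs below are an assumption.
def example_bias_by_projection(wv, w2i, vocab):
    vocab, wv, w2i = limit_vocab(wv, w2i, vocab)
    wv = normalize(wv)
    pairs = [['he', 'she'], ['his', 'her'], ['man', 'woman'],
             ['himself', 'herself'], ['son', 'daughter']]
    pca = doPCA(pairs, wv, w2i)
    # the first principal component serves as the gender direction; its sign
    # (which end is "male" vs. "female") is arbitrary and should be checked
    gender_direction = pca.components_[0]
    gender_bias = {w: wv[w2i[w], :].dot(gender_direction) for w in vocab}
    return gender_bias, gender_direction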

# get tuples of biases and counts of masculine/feminine NN for each word (for bias-by-neighbors)
import operator


def bias_by_neighbors(wv, w2i, vocab, gender_bias_bef, size, neighbours_num=100):
    tuples = []

    # take the `size` most female-biased and `size` most male-biased words
    sorted_g = sorted(gender_bias_bef.items(), key=operator.itemgetter(1))
    female = [item[0] for item in sorted_g[:size]]
    male = [item[0] for item in sorted_g[-size:]]
    # vocab = male + female
    selected = female + male if size > 0 else vocab

    for w in selected:
        top = topK(w, wv, w2i, vocab, k=neighbours_num + 5)[:neighbours_num]
        m = 0
        f = 0
        for t in top:
            if gender_bias_bef[t] > 0:
                m += 1
            else:
                f += 1
        tuples.append((w, gender_bias_bef[w], m, f))

    return tuples


def get_tuples_prof(wv, w2i, vocab, words, gender_bias_dict):
    wv = normalize(wv)
    tuples = []

    for w in words:
        if w not in gender_bias_dict:
            continue
        top = topK(w, wv, w2i, vocab, k=105)[:100]
        m = 0
        f = 0
        for t in top:
            if gender_bias_dict[t] > 0:
                m += 1
            else:
                f += 1
        tuples.append((w, gender_bias_dict[w], m, f))

    return tuples


# compute correlation between bias-by-projection and bias-by-neighbors
import scipy.stats


def pearson(a, b):
    return scipy.stats.pearsonr(a, b)


def compute_corr(tuples, i1, i2):
    a = []
    b = []
    for t in tuples:
        a.append(t[i1])
        b.append(t[i2])
    assert (len(a) == len(b))
    print('pearson: ', scipy.stats.pearsonr(a, b))
    print('spearman: ', scipy.stats.spearmanr(a, b))


# Auxiliary functions
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE


def visualize(vectors, y_true, y_pred, ax, title, random_state, num_clusters=2):
    # perform TSNE
    X_embedded = TSNE(n_components=2, random_state=random_state).fit_transform(vectors)

    # color by predicted cluster, marker by true label
    for x, p, y in zip(X_embedded, y_pred, y_true):
        if p:
            if y:
                ax.scatter(x[0], x[1], marker='.', c='c')
            else:
                ax.scatter(x[0], x[1], marker='x', c='c')
        else:
            if y:
                ax.scatter(x[0], x[1], marker='.', c='darkviolet')
            else:
                ax.scatter(x[0], x[1], marker='x', c='darkviolet')

    ax.text(.01, .9, title, transform=ax.transAxes, fontsize=15)


def extract_vectors(words, wv, w2i):
    X = [wv[w2i[x], :] for x in words]
    return X


def cluster_and_visualize(words, X, random_state, y_true, num=2):
    y_pred = KMeans(n_clusters=num, random_state=random_state).fit_predict(X)
    # fig, axs = plt.subplots(figsize=(6, 3))
    # visualize(X, y_true, y_pred, axs, 'Original', random_state)
    correct = [1 if item1 == item2 else 0 for (item1, item2) in zip(y_true, y_pred)]
    print('precision', max(sum(correct) / float(len(correct)), 1 - sum(correct) / float(len(correct))))


import scipy.stats
from sklearn import svm


def train_and_predict(wv, w2i, vocab, size_train, size_test, males, females):
    # train an SVM to separate male- from female-biased words and report test accuracy
    X_train = [wv[w2i[w], :] for w in males[:size_train] + females[:size_train]]
    Y_train = [1] * size_train + [0] * size_train
    X_test = [wv[w2i[w], :] for w in males[size_train:] + females[size_train:]]
    Y_test = [1] * size_test + [0] * size_test

    clf = svm.SVC(gamma='auto')
    clf.fit(X_train, Y_train)
    preds = clf.predict(X_test)
    accuracy = [1 if y == z else 0 for y, z in zip(preds, Y_test)]
    acc = float(sum(accuracy)) / len(accuracy)
    print('accuracy:', float(sum(accuracy)) / len(accuracy))
    return acc
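
# Usage sketch (illustrative): with a bias-by-projection dictionary such as the one
# returned by example_bias_by_projection above, the neighbor-based experiments can be
# run as follows. The size values (500/1000 words, 500 train / 500 test) are
# assumptions, not values fixed by this file.
def example_neighbor_experiments(wv, w2i, vocab, gender_bias):
    wv = normalize(wv)

    # bias-by-neighbors for the 500 most female- and 500 most male-biased words,
    # then correlation between bias-by-projection (index 1) and #male neighbors (index 2)
    tuples = bias_by_neighbors(wv, w2i, vocab, gender_bias, size=500)
    compute_corr(tuples, 1, 2)

    # k-means clustering and SVM classification of the most biased words
    sorted_g = sorted(gender_bias.items(), key=operator.itemgetter(1))
    females = [w for w, _ in sorted_g[:1000]]
    males = [w for w, _ in sorted_g[-1000:]]
    y_true = [0] * len(females) + [1] * len(males)
    X = extract_vectors(females + males, wv, w2i)
    cluster_and_visualize(females + males, X, random_state=1, y_true=y_true)
    train_and_predict(wv, w2i, vocab, size_train=500, size_test=500,
                      males=males, females=females)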

# Auxiliary functions for experiments by Caliskan et al.
import random
import scipy
import scipy.special
# import scipy.misc as misc  # unused; scipy.misc has been removed from recent SciPy
import itertools


def s_word(w, A, B, wv, w2i, vocab, all_s_words):
    # association of word w with attribute set A vs. attribute set B (cached)
    if w in all_s_words:
        return all_s_words[w]

    mean_a = []
    mean_b = []

    for a in A:
        mean_a.append(similarity(w, a, wv, w2i))
    for b in B:
        mean_b.append(similarity(w, b, wv, w2i))

    mean_a = sum(mean_a) / float(len(mean_a))
    mean_b = sum(mean_b) / float(len(mean_b))

    all_s_words[w] = mean_a - mean_b
    return all_s_words[w]


def s_group(X, Y, A, B, wv, w2i, vocab, all_s_words):
    total = 0
    for x in X:
        total += s_word(x, A, B, wv, w2i, vocab, all_s_words)
    for y in Y:
        total -= s_word(y, A, B, wv, w2i, vocab, all_s_words)
    return total


def p_value_exhust(X, Y, A, B, wv, w2i, vocab):
    # exact permutation test over all equal-size splits of X union Y
    if len(X) > 10:
        print('might take too long, use sampled version: p_value')
        return

    assert (len(X) == len(Y))

    all_s_words = {}
    s_orig = s_group(X, Y, A, B, wv, w2i, vocab, all_s_words)

    union = set(X + Y)
    subset_size = int(len(union) / 2)

    larger = 0
    total = 0
    for subset in set(itertools.combinations(union, subset_size)):
        total += 1
        Xi = list(set(subset))
        Yi = list(union - set(subset))
        if s_group(Xi, Yi, A, B, wv, w2i, vocab, all_s_words) > s_orig:
            larger += 1
    print('num of samples', total)
    return larger / float(total)


def association_diff(t, A, B, wv, w2i):
    mean_a = []
    mean_b = []

    for a in A:
        mean_a.append(similarity(t, a, wv, w2i))
    for b in B:
        mean_b.append(similarity(t, b, wv, w2i))

    mean_a = sum(mean_a) / float(len(mean_a))
    mean_b = sum(mean_b) / float(len(mean_b))

    return mean_a - mean_b


def effect_size(X, Y, A, B, wv, w2i, vocab):
    assert (len(X) == len(Y))
    assert (len(A) == len(B))

    norm_x = []
    norm_y = []

    for x in X:
        norm_x.append(association_diff(x, A, B, wv, w2i))
    for y in Y:
        norm_y.append(association_diff(y, A, B, wv, w2i))

    std = np.std(norm_x + norm_y, ddof=1)
    norm_x = sum(norm_x) / float(len(norm_x))
    norm_y = sum(norm_y) / float(len(norm_y))

    return (norm_x - norm_y) / std


def p_value_sample(X, Y, A, B, wv, w2i, vocab):
    # sampled permutation test for larger word sets
    random.seed(10)
    np.random.seed(10)
    all_s_words = {}

    assert (len(X) == len(Y))
    length = len(X)

    s_orig = s_group(X, Y, A, B, wv, w2i, vocab, all_s_words)

    num_of_samples = min(1000000, int(scipy.special.comb(length * 2, length) * 100))
    print('num of samples', num_of_samples)

    larger = 0
    for i in range(num_of_samples):
        permute = np.random.permutation(X + Y)
        Xi = list(permute[:length])
        Yi = list(permute[length:])
        if s_group(Xi, Yi, A, B, wv, w2i, vocab, all_s_words) > s_orig:
            larger += 1

    return larger / float(num_of_samples)
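
# Usage sketch (illustrative): a WEAT-style test on the career/family sets defined in
# WEAT_words at the top of this file. Targets are the male/female names, attributes
# the career/family words. Note that the capitalized names are filtered out by
# limit_vocab, so this assumes the full (or lowercased) vocabulary is used.
def example_weat(wv, w2i, vocab):
    wv = normalize(wv)
    X, Y = WEAT_words['A'], WEAT_words['B']   # targets: male / female names
    A, B = WEAT_words['C'], WEAT_words['D']   # attributes: career / family words
    print('effect size:', effect_size(X, Y, A, B, wv, w2i, vocab))
    print('p-value:', p_value_exhust(X, Y, A, B, wv, w2i, vocab))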