# DoubleHardDebias/utils.py: utility functions and word lists for the Double-Hard Debias experiments
import string
import random  # needed for random.seed() in p_value_sample below
from tqdm import tqdm
import pickle
import scipy
import scipy.spatial.distance  # needed for scipy.spatial.distance.cosine in similarity() below
import numpy as np
from numpy import linalg as LA
from sklearn.decomposition import PCA
# Experiment 1: WEAT word sets (A/B: male/female first names, C/D: career/family terms, E/F: math/arts, G/H: science/arts)
WEAT_words = {
'A':['John', 'Paul', 'Mike', 'Kevin', 'Steve', 'Greg', 'Jeff', 'Bill'],
'B':['Amy', 'Joan', 'Lisa', 'Sarah', 'Diana', 'Kate', 'Ann', 'Donna'],
'C':['executive', 'management', 'professional', 'corporation', 'salary', 'office', 'business', 'career'],
'D':['home', 'parents', 'children', 'family', 'cousins', 'marriage', 'wedding', 'relatives'],
'E':['math', 'algebra', 'geometry', 'calculus', 'equations', 'computation', 'numbers', 'addition'],
'F':['poetry', 'art', 'dance', 'literature', 'novel', 'symphony', 'drama', 'sculpture'],
'G':['science', 'technology', 'physics', 'chemistry', 'einstein', 'nasa', 'experiment', 'astronomy'],
'H':['poetry', 'art', 'shakespeare', 'dance', 'literature', 'novel', 'symphony', 'drama'],
}
def has_punct(w):
if any([c in string.punctuation for c in w]):
return True
return False
def has_digit(w):
if any([c in '0123456789' for c in w]):
return True
return False
def limit_vocab(wv, w2i, vocab, exclude = None):
vocab_limited = []
for w in tqdm(vocab[:50000]):
if w.lower() != w:
continue
if len(w) >= 20:
continue
if has_digit(w):
continue
if '_' in w:
p = [has_punct(subw) for subw in w.split('_')]
if not any(p):
vocab_limited.append(w)
continue
if has_punct(w):
continue
vocab_limited.append(w)
if exclude:
vocab_limited = list(set(vocab_limited) - set(exclude))
print("size of vocabulary:", len(vocab_limited))
wv_limited = np.zeros((len(vocab_limited), len(wv[0, :])))
for i,w in enumerate(vocab_limited):
wv_limited[i,:] = wv[w2i[w],:]
w2i_limited = {w: i for i, w in enumerate(vocab_limited)}
return vocab_limited, wv_limited, w2i_limited
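# Example (illustrative sketch, not part of the original pipeline): assuming `wv`, `w2i`
# and `vocab` were loaded from a pretrained embedding file, restrict them to frequent,
# clean, lowercase words before running the debiasing experiments:
#   vocab_limit, wv_limit, w2i_limit = limit_vocab(wv, w2i, vocab, exclude=gender_specific_words)
# `gender_specific_words` is a hypothetical list of explicitly gendered words
# (e.g. 'he', 'she') that should be kept out of the limited vocabulary.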
def norm_stand(wv):
W_norm = np.zeros(wv.shape)
d = (np.sum(wv ** 2, 1) ** (0.5))
W_norm = (wv.T / d).T
return W_norm
def normalize(wv):
# normalize vectors
norms = np.apply_along_axis(LA.norm, 1, wv)
wv = wv / norms[:, np.newaxis]
return wv
def topK(w, wv, w2i, vocab, k=10):
# extract the word vector for word w
idx = w2i[w]
vec = wv[idx, :]
# compute similarity of w with all words in the vocabulary
sim = wv.dot(vec)
# sim = []
# for i in range(len(wv)):
# sim.append(1-scipy.spatial.distance.cosine(wv[i, :], vec))
# sim = np.array(sim)
# sort similarities by descending order
sort_sim = (sim.argsort())[::-1]
# choose topK
best = sort_sim[:(k+1)]
return [vocab[i] for i in best if i!=idx]
def similarity(w1, w2, wv, w2i):
i1 = w2i[w1]
i2 = w2i[w2]
vec1 = wv[i1, :]
vec2 = wv[i2, :]
return 1-scipy.spatial.distance.cosine(vec1, vec2)
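# Example (hedged sketch): with the limited, normalized embedding from above, these two
# helpers can be called directly; the word choices are purely illustrative:
#   wv_n = normalize(wv_limit)
#   topK('doctor', wv_n, w2i_limit, vocab_limit, k=10)   # 10 nearest neighbours of 'doctor'
#   similarity('doctor', 'nurse', wv_n, w2i_limit)       # cosine similarity of the pair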
def drop(u, v):
return u - v * u.dot(v) / v.dot(v)
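# drop(u, v) subtracts from u its projection onto v. A minimal sketch, assuming a
# `gender_direction` vector obtained elsewhere (e.g. from doPCA below):
#   debiased = drop(wv_n[w2i_limit['doctor'], :], gender_direction)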
from sklearn import preprocessing
def doPCA(pairs, wv, w2i):
matrix = []
cnt = 0
if type(pairs[0]) is list:
for a, b in pairs:
if not (a in w2i and b in w2i): continue
center = (wv[w2i[a], :] + wv[w2i[b], :])/2
matrix.append(wv[w2i[a], :] - center)
matrix.append(wv[w2i[b], :] - center)
cnt += 1
else:
for a in pairs:
if not (a in w2i): continue
matrix.append(wv[w2i[a], :])
cnt += 1
    # center the (difference) vectors before fitting PCA
    embeds = np.array(matrix)
    matrix = embeds - np.mean(embeds, axis=0)
pca = PCA()
pca.fit(matrix)
print('pairs used in PCA: ', cnt)
return pca
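# Example (sketch with assumed inputs): the gender direction is typically taken as the
# first principal component of the differences within definitional pairs; the pair list
# below is only illustrative, the experiments load their own definitional pairs:
#   definitional_pairs = [['he', 'she'], ['man', 'woman'], ['king', 'queen']]
#   pca = doPCA(definitional_pairs, wv_n, w2i_limit)
#   gender_direction = pca.components_[0]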
# get tuples of (bias, counts of masculine/feminine nearest neighbours) for each word (for bias-by-neighbors)
import operator
def bias_by_neighbors(wv, w2i, vocab, gender_bias_bef, size, neighbours_num = 100):
tuples = []
sorted_g = sorted(gender_bias_bef.items(), key=operator.itemgetter(1))
female = [item[0] for item in sorted_g[:size]]
male = [item[0] for item in sorted_g[-size:]]
# vocab = male + female
selected = female + male if size > 0 else vocab
for w in selected:
top = topK(w, wv, w2i, vocab, k=neighbours_num+5)[:neighbours_num]
m = 0
f = 0
for t in top:
if gender_bias_bef[t] > 0:
m+=1
else:
f+=1
tuples.append((w, gender_bias_bef[w], m, f))
return tuples
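# Example (hedged): `gender_bias_bef` is assumed to map each word to its bias-by-projection,
# e.g. {w: wv_n[w2i_limit[w], :].dot(gender_direction) for w in vocab_limit}. Then
#   tuples = bias_by_neighbors(wv_n, w2i_limit, vocab_limit, gender_bias_bef, size=500)
# returns (word, bias_by_projection, #male_neighbours, #female_neighbours) for the 500
# most female-biased and 500 most male-biased words.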
def get_tuples_prof(wv, w2i, vocab, words, gender_bias_dict):
wv = normalize(wv)
tuples = []
for w in words:
if w not in gender_bias_dict:
continue
top = topK(w, wv, w2i, vocab, k=105)[:100]
m = 0
f = 0
for t in top:
if gender_bias_dict[t] > 0:
m+=1
else:
f+=1
tuples.append((w, gender_bias_dict[w], m, f))
return tuples
# compute correlation between bias-by-projection and bias-by-neighbors
import scipy.stats
def pearson(a,b):
return scipy.stats.pearsonr(a,b)
def compute_corr(tuples, i1, i2):
a = []
b = []
for t in tuples:
a.append(t[i1])
b.append(t[i2])
assert(len(a)==len(b))
print('pearson: ', scipy.stats.pearsonr(a,b))
print('spearman: ', scipy.stats.spearmanr(a, b))
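# Example (sketch): correlate bias-by-projection (index 1 of each tuple) with the number
# of male nearest neighbours (index 2), using the tuples built by bias_by_neighbors above:
#   compute_corr(tuples, 1, 2)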
# Auxiliary functions
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
def visualize(vectors, y_true, y_pred, ax, title, random_state, num_clusters = 2):
# perform TSNE
X_embedded = TSNE(n_components=2, random_state=random_state).fit_transform(vectors)
for x,p,y in zip(X_embedded, y_pred, y_true):
if p:
if y:
ax.scatter(x[0], x[1], marker = '.', c = 'c')
else:
ax.scatter(x[0], x[1], marker = 'x', c = 'c')
else:
if y:
ax.scatter(x[0], x[1], marker = '.', c = 'darkviolet')
else:
ax.scatter(x[0], x[1], marker = 'x', c = 'darkviolet')
    ax.text(.01, .9, title, transform=ax.transAxes, fontsize=15)
def extract_vectors(words, wv, w2i):
X = [wv[w2i[x],:] for x in words]
return X
def cluster_and_visualize(words, X, random_state, y_true, num=2):
y_pred = KMeans(n_clusters=num, random_state=random_state).fit_predict(X)
# fig, axs = plt.subplots(figsize=(6, 3))
# visualize(X, y_true, y_pred, axs, 'Original', random_state)
correct = [1 if item1 == item2 else 0 for (item1,item2) in zip(y_true, y_pred) ]
print('precision', max(sum(correct)/float(len(correct)), 1 - sum(correct)/float(len(correct))))
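# Example (hedged sketch): cluster the embeddings of the most biased words and check how
# well the clusters align with the original bias labels; `male_words` and `female_words`
# are assumed lists of the most male-/female-biased words:
#   words = male_words + female_words
#   X = extract_vectors(words, wv_n, w2i_limit)
#   y_true = [1] * len(male_words) + [0] * len(female_words)
#   cluster_and_visualize(words, X, 1, y_true)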
import scipy.stats
from sklearn import svm
def train_and_predict(wv, w2i, vocab, size_train, size_test, males, females):
X_train = [wv[w2i[w],:] for w in males[:size_train]+females[:size_train]]
Y_train = [1]*size_train + [0]*size_train
X_test = [wv[w2i[w],:] for w in males[size_train:]+females[size_train:]]
Y_test = [1]*size_test + [0]*size_test
clf = svm.SVC(gamma='auto')
clf.fit(X_train, Y_train)
preds = clf.predict(X_test)
accuracy = [1 if y==z else 0 for y,z in zip(preds, Y_test)]
acc = float(sum(accuracy))/len(accuracy)
print('accuracy:', float(sum(accuracy))/len(accuracy))
return acc
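# Example (hedged): fit an SVM on the first `size_train` words of each list and report
# accuracy on the remaining `size_test`; `males`/`females` are assumed to be lists of
# size_train + size_test words each, ranked by bias:
#   acc = train_and_predict(wv_n, w2i_limit, vocab_limit, 500, 500, males, females)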
# Auxiliary functions for experiments by Caliskan et al.
import scipy
import scipy.special  # scipy.special.comb is used in p_value_sample below
import itertools
def s_word(w, A, B, wv, w2i, vocab, all_s_words):
if w in all_s_words:
return all_s_words[w]
mean_a = []
mean_b = []
for a in A:
mean_a.append(similarity(w, a, wv, w2i))
for b in B:
mean_b.append(similarity(w, b, wv, w2i))
mean_a = sum(mean_a)/float(len(mean_a))
mean_b = sum(mean_b)/float(len(mean_b))
all_s_words[w] = mean_a - mean_b
return all_s_words[w]
def s_group(X, Y, A, B, wv, w2i, vocab, all_s_words):
total = 0
for x in X:
total += s_word(x, A, B, wv, w2i, vocab, all_s_words)
for y in Y:
total -= s_word(y, A, B, wv, w2i, vocab, all_s_words)
return total
def p_value_exhust(X, Y, A, B, wv, w2i, vocab):
if len(X) > 10:
print('might take too long, use sampled version: p_value')
return
assert(len(X) == len(Y))
all_s_words = {}
s_orig = s_group(X, Y, A, B, wv, w2i, vocab, all_s_words)
union = set(X+Y)
subset_size = int(len(union)/2)
larger = 0
total = 0
for subset in set(itertools.combinations(union, subset_size)):
total += 1
Xi = list(set(subset))
Yi = list(union - set(subset))
if s_group(Xi, Yi, A, B, wv, w2i, vocab, all_s_words) > s_orig:
larger += 1
print('num of samples', total)
return larger/float(total)
def association_diff(t, A, B, wv, w2i):
mean_a = []
mean_b = []
for a in A:
mean_a.append(similarity(t, a, wv, w2i))
for b in B:
mean_b.append(similarity(t, b, wv, w2i))
mean_a = sum(mean_a)/float(len(mean_a))
mean_b = sum(mean_b)/float(len(mean_b))
return mean_a - mean_b
def effect_size(X, Y, A, B, wv, w2i, vocab):
assert(len(X) == len(Y))
assert(len(A) == len(B))
norm_x = []
norm_y = []
for x in X:
norm_x.append(association_diff(x, A, B, wv, w2i))
for y in Y:
norm_y.append(association_diff(y, A, B, wv, w2i))
std = np.std(norm_x+norm_y, ddof=1)
norm_x = sum(norm_x) / float(len(norm_x))
norm_y = sum(norm_y) / float(len(norm_y))
return (norm_x-norm_y)/std
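# Example (sketch) using the WEAT_words defined at the top of this file: male vs. female
# names as target sets and career vs. family terms as attribute sets:
#   X, Y = WEAT_words['A'], WEAT_words['B']
#   A, B = WEAT_words['C'], WEAT_words['D']
#   effect_size(X, Y, A, B, wv_n, w2i_limit, vocab_limit)
#   p_value_exhust(X, Y, A, B, wv_n, w2i_limit, vocab_limit)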
def p_value_sample(X, Y, A, B, wv, w2i, vocab):
random.seed(10)
np.random.seed(10)
all_s_words = {}
assert(len(X) == len(Y))
length = len(X)
s_orig = s_group(X, Y, A, B, wv, w2i, vocab, all_s_words)
num_of_samples = min(1000000, int(scipy.special.comb(length*2,length)*100))
print('num of samples', num_of_samples)
larger = 0
for i in range(num_of_samples):
permute = np.random.permutation(X+Y)
Xi = permute[:length]
Yi = permute[length:]
        if s_group(Xi, Yi, A, B, wv, w2i, vocab, all_s_words) > s_orig:
larger += 1
return larger/float(num_of_samples)
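# p_value_sample is the sampled counterpart of p_value_exhust, intended for target sets
# that are too large to enumerate exhaustively. Illustrative call (same signature), where
# `X_large` and `Y_large` are hypothetical larger target sets:
#   p_value_sample(X_large, Y_large, WEAT_words['C'], WEAT_words['D'], wv_n, w2i_limit, vocab_limit)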