# DoubleHardDebias / utils.py
import string
from tqdm import tqdm
# import pickle
import scipy
import scipy.spatial.distance  # needed for scipy.spatial.distance.cosine in similarity()
import numpy as np
from numpy import linalg as LA
from sklearn.decomposition import PCA
# Experiment 1: WEAT word sets (Caliskan et al., 2017)
# A/B: male/female first names; C/D: career vs. family terms;
# E/F: math vs. arts terms; G/H: science vs. arts terms.
WEAT_words = {
'A':['John', 'Paul', 'Mike', 'Kevin', 'Steve', 'Greg', 'Jeff', 'Bill'],
'B':['Amy', 'Joan', 'Lisa', 'Sarah', 'Diana', 'Kate', 'Ann', 'Donna'],
'C':['executive', 'management', 'professional', 'corporation', 'salary', 'office', 'business', 'career'],
'D':['home', 'parents', 'children', 'family', 'cousins', 'marriage', 'wedding', 'relatives'],
'E':['math', 'algebra', 'geometry', 'calculus', 'equations', 'computation', 'numbers', 'addition'],
'F':['poetry', 'art', 'dance', 'literature', 'novel', 'symphony', 'drama', 'sculpture'],
'G':['science', 'technology', 'physics', 'chemistry', 'einstein', 'nasa', 'experiment', 'astronomy'],
'H':['poetry', 'art', 'shakespeare', 'dance', 'literature', 'novel', 'symphony', 'drama'],
}
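# These sets pair up as WEAT tests (one target pair scored against the name sets A/B).
# A sketch of how they plug into the helpers defined at the end of this module, assuming
# `wv`, `w2i`, `vocab` come from embeddings loaded elsewhere:
#   effect_size(WEAT_words['E'], WEAT_words['F'], WEAT_words['A'], WEAT_words['B'], wv, w2i, vocab)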
def has_punct(w):
    # True if the word contains any punctuation character
    return any(c in string.punctuation for c in w)
def has_digit(w):
    # True if the word contains any digit
    return any(c in '0123456789' for c in w)
def limit_vocab(wv, w2i, vocab, exclude = None):
    # restrict to the 50k most frequent words that are lower-case, reasonably short,
    # and free of digits/punctuation; optionally drop an exclusion list
    vocab_limited = []
for w in tqdm(vocab[:50000]):
if w.lower() != w:
continue
if len(w) >= 20:
continue
if has_digit(w):
continue
if '_' in w:
p = [has_punct(subw) for subw in w.split('_')]
if not any(p):
vocab_limited.append(w)
continue
if has_punct(w):
continue
vocab_limited.append(w)
if exclude:
vocab_limited = list(set(vocab_limited) - set(exclude))
print("size of vocabulary:", len(vocab_limited))
wv_limited = np.zeros((len(vocab_limited), len(wv[0, :])))
for i,w in enumerate(vocab_limited):
wv_limited[i,:] = wv[w2i[w],:]
w2i_limited = {w: i for i, w in enumerate(vocab_limited)}
return vocab_limited, wv_limited, w2i_limited
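# Example usage (a sketch; assumes `wv`, `w2i`, `vocab` were loaded from pretrained
# embeddings elsewhere, and `gender_specific` is an optional list of words to drop):
#   vocab_limited, wv_limited, w2i_limited = limit_vocab(wv, w2i, vocab, exclude=gender_specific)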
def norm_stand(wv):
    # row-normalize: divide each word vector by its L2 norm
    d = np.sum(wv ** 2, 1) ** 0.5
    W_norm = (wv.T / d).T
    return W_norm
def normalize(wv):
# normalize vectors
norms = np.apply_along_axis(LA.norm, 1, wv)
wv = wv / norms[:, np.newaxis]
return wv
def topK(w, wv, w2i, vocab, k=10):
# extract the word vector for word w
idx = w2i[w]
vec = wv[idx, :]
    # cosine similarity of w with every word (assumes rows of wv are unit-normalized)
sim = wv.dot(vec)
# sim = []
# for i in range(len(wv)):
# sim.append(1-scipy.spatial.distance.cosine(wv[i, :], vec))
# sim = np.array(sim)
    # sort similarities in descending order
    sort_sim = (sim.argsort())[::-1]
    # take the top k+1 indices and drop the query word itself
    best = sort_sim[:(k + 1)]
    return [vocab[i] for i in best if i != idx]
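# Example (a sketch; assumes `wv` has been unit-normalized first, e.g. with normalize(wv)):
#   neighbours = topK('doctor', wv, w2i, vocab, k=10)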
def similarity(w1, w2, wv, w2i):
i1 = w2i[w1]
i2 = w2i[w2]
vec1 = wv[i1, :]
vec2 = wv[i2, :]
return 1-scipy.spatial.distance.cosine(vec1, vec2)
def drop(u, v):
    # remove from u its projection onto v
    return u - v * u.dot(v) / v.dot(v)
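# Example: projecting a bias direction out of a word vector (a sketch; `gender_direction`
# is a hypothetical name for the first principal component returned by doPCA below):
#   debiased_vec = drop(wv[w2i['doctor'], :], gender_direction)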
from sklearn import preprocessing
def doPCA(pairs, wv, w2i):
matrix = []
cnt = 0
if type(pairs[0]) is list:
for a, b in pairs:
if not (a in w2i and b in w2i): continue
center = (wv[w2i[a], :] + wv[w2i[b], :])/2
matrix.append(wv[w2i[a], :] - center)
matrix.append(wv[w2i[b], :] - center)
cnt += 1
else:
for a in pairs:
if not (a in w2i): continue
matrix.append(wv[w2i[a], :])
cnt += 1
    embeds = np.array(matrix)
    # center the vectors before PCA
    matrix = embeds - np.mean(embeds, axis=0)
    pca = PCA()
    pca.fit(matrix)
    print('pairs used in PCA: ', cnt)
    return pca
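# Example: estimating a gender direction from definitional pairs (a sketch; the pair list
# below is illustrative, in the spirit of Bolukbasi et al., not the exact list used elsewhere):
#   definitional_pairs = [['woman', 'man'], ['girl', 'boy'], ['she', 'he'], ['her', 'him']]
#   gender_direction = doPCA(definitional_pairs, wv, w2i).components_[0]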
# get tuples of (word, bias, #male-biased neighbors, #female-biased neighbors) for bias-by-neighbors
import operator
def bias_by_neighbors(wv, w2i, vocab, gender_bias_bef, size, neighbours_num = 100):
tuples = []
sorted_g = sorted(gender_bias_bef.items(), key=operator.itemgetter(1))
female = [item[0] for item in sorted_g[:size]]
male = [item[0] for item in sorted_g[-size:]]
# vocab = male + female
selected = female + male if size > 0 else vocab
for w in selected:
top = topK(w, wv, w2i, vocab, k=neighbours_num+5)[:neighbours_num]
m = 0
f = 0
for t in top:
if gender_bias_bef[t] > 0:
m+=1
else:
f+=1
tuples.append((w, gender_bias_bef[w], m, f))
return tuples
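# Example (a sketch; `gender_bias_bef` is assumed to map each word to its bias-by-projection,
# e.g. wv[w2i[w]].dot(gender_direction), computed elsewhere):
#   tuples = bias_by_neighbors(wv, w2i, vocab, gender_bias_bef, size=500)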
def get_tuples_prof(wv, w2i, vocab, words, gender_bias_dict):
    # same as bias_by_neighbors, but for a fixed word list (e.g. professions)
    wv = normalize(wv)
tuples = []
for w in words:
if w not in gender_bias_dict:
continue
top = topK(w, wv, w2i, vocab, k=105)[:100]
m = 0
f = 0
for t in top:
if gender_bias_dict[t] > 0:
m+=1
else:
f+=1
tuples.append((w, gender_bias_dict[w], m, f))
return tuples
# compute correlation between bias-by-projection and bias-by-neighbors
import scipy.stats
def pearson(a,b):
return scipy.stats.pearsonr(a,b)
def compute_corr(tuples, i1, i2):
a = []
b = []
for t in tuples:
a.append(t[i1])
b.append(t[i2])
assert(len(a)==len(b))
print('pearson: ', scipy.stats.pearsonr(a,b))
print('spearman: ', scipy.stats.spearmanr(a, b))
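# Example: correlation between bias-by-projection (tuple index 1) and the count of
# male-biased neighbors (tuple index 2) in the tuples produced above (a sketch):
#   compute_corr(tuples, 1, 2)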
# Auxiliary functions for clustering and visualization
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
def visualize(vectors, y_true, y_pred, ax, title, random_state, num_clusters = 2):
    # project to 2-D with t-SNE; marker encodes the true label, color encodes the predicted cluster
X_embedded = TSNE(n_components=2, random_state=random_state).fit_transform(vectors)
for x,p,y in zip(X_embedded, y_pred, y_true):
if p:
if y:
ax.scatter(x[0], x[1], marker = '.', c = 'c')
else:
ax.scatter(x[0], x[1], marker = 'x', c = 'c')
else:
if y:
ax.scatter(x[0], x[1], marker = '.', c = 'darkviolet')
else:
ax.scatter(x[0], x[1], marker = 'x', c = 'darkviolet')
ax.text(.01, .9, title ,transform=ax.transAxes, fontsize=15)
def extract_vectors(words, wv, w2i):
X = [wv[w2i[x],:] for x in words]
return X
def cluster_and_visualize(words, X, random_state, y_true, num=2):
y_pred = KMeans(n_clusters=num, random_state=random_state).fit_predict(X)
# fig, axs = plt.subplots(figsize=(6, 3))
# visualize(X, y_true, y_pred, axs, 'Original', random_state)
    correct = [1 if item1 == item2 else 0 for (item1, item2) in zip(y_true, y_pred)]
    # cluster-alignment accuracy: KMeans labels are arbitrary, so take the better of the two alignments
    print('precision', max(sum(correct)/float(len(correct)), 1 - sum(correct)/float(len(correct))))
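# Example (a sketch; `male_words` and `female_words` are assumed to be the most biased
# words on each side of the gender direction, selected elsewhere):
#   words = male_words + female_words
#   X = extract_vectors(words, wv, w2i)
#   y_true = [1] * len(male_words) + [0] * len(female_words)
#   cluster_and_visualize(words, X, random_state=1, y_true=y_true)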
from sklearn import svm
def train_and_predict(wv, w2i, vocab, size_train, size_test, males, females):
    X_train = [wv[w2i[w], :] for w in males[:size_train] + females[:size_train]]
    Y_train = [1]*size_train + [0]*size_train
    # take the next size_test words of each gender as the held-out test set
    X_test = [wv[w2i[w], :] for w in males[size_train:size_train+size_test] + females[size_train:size_train+size_test]]
    Y_test = [1]*size_test + [0]*size_test
clf = svm.SVC(gamma='auto')
clf.fit(X_train, Y_train)
preds = clf.predict(X_test)
accuracy = [1 if y==z else 0 for y,z in zip(preds, Y_test)]
    acc = float(sum(accuracy))/len(accuracy)
    print('accuracy:', acc)
return acc
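# Example (a sketch; `males` and `females` are assumed to be lists of gender-biased words
# with at least size_train + size_test entries each):
#   acc = train_and_predict(wv, w2i, vocab, size_train=1000, size_test=500, males=males, females=females)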
# Auxiliary functions for experiments by Caliskan et al.
import random
import itertools
import scipy.special  # scipy.special.comb is used in p_value_sample
def s_word(w, A, B, wv, w2i, vocab, all_s_words):
    # s(w, A, B): w's mean similarity to A minus its mean similarity to B (cached in all_s_words)
    if w in all_s_words:
return all_s_words[w]
mean_a = []
mean_b = []
for a in A:
mean_a.append(similarity(w, a, wv, w2i))
for b in B:
mean_b.append(similarity(w, b, wv, w2i))
mean_a = sum(mean_a)/float(len(mean_a))
mean_b = sum(mean_b)/float(len(mean_b))
all_s_words[w] = mean_a - mean_b
return all_s_words[w]
def s_group(X, Y, A, B, wv, w2i, vocab, all_s_words):
    # WEAT test statistic: sum over X of s(x, A, B) minus sum over Y of s(y, A, B)
    total = 0
for x in X:
total += s_word(x, A, B, wv, w2i, vocab, all_s_words)
for y in Y:
total -= s_word(y, A, B, wv, w2i, vocab, all_s_words)
return total
def p_value_exhust(X, Y, A, B, wv, w2i, vocab):
    # exact permutation-test p-value over all equal-size splits of X and Y combined
    if len(X) > 10:
print('might take too long, use sampled version: p_value')
return
assert(len(X) == len(Y))
all_s_words = {}
s_orig = s_group(X, Y, A, B, wv, w2i, vocab, all_s_words)
union = set(X+Y)
subset_size = int(len(union)/2)
larger = 0
total = 0
for subset in set(itertools.combinations(union, subset_size)):
total += 1
Xi = list(set(subset))
Yi = list(union - set(subset))
if s_group(Xi, Yi, A, B, wv, w2i, vocab, all_s_words) > s_orig:
larger += 1
print('num of samples', total)
return larger/float(total)
def association_diff(t, A, B, wv, w2i):
    # mean similarity of target word t to A minus its mean similarity to B
    mean_a = []
mean_b = []
for a in A:
mean_a.append(similarity(t, a, wv, w2i))
for b in B:
mean_b.append(similarity(t, b, wv, w2i))
mean_a = sum(mean_a)/float(len(mean_a))
mean_b = sum(mean_b)/float(len(mean_b))
return mean_a - mean_b
def effect_size(X, Y, A, B, wv, w2i, vocab):
    # WEAT effect size: Cohen's d of the association differences of X vs. Y with (A, B)
    assert(len(X) == len(Y))
assert(len(A) == len(B))
norm_x = []
norm_y = []
for x in X:
norm_x.append(association_diff(x, A, B, wv, w2i))
for y in Y:
norm_y.append(association_diff(y, A, B, wv, w2i))
std = np.std(norm_x+norm_y, ddof=1)
norm_x = sum(norm_x) / float(len(norm_x))
norm_y = sum(norm_y) / float(len(norm_y))
return (norm_x-norm_y)/std
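# Example: scoring the career/family association (a sketch; assumes `wv`, `w2i`, `vocab`
# come from embeddings loaded elsewhere, with the WEAT_words sets defined at the top):
#   e = effect_size(WEAT_words['C'], WEAT_words['D'], WEAT_words['A'], WEAT_words['B'], wv, w2i, vocab)
#   p = p_value_exhust(WEAT_words['C'], WEAT_words['D'], WEAT_words['A'], WEAT_words['B'], wv, w2i, vocab)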
def p_value_sample(X, Y, A, B, wv, w2i, vocab):
    # sampled permutation-test p-value (for target sets too large for p_value_exhust)
    random.seed(10)
np.random.seed(10)
all_s_words = {}
assert(len(X) == len(Y))
length = len(X)
s_orig = s_group(X, Y, A, B, wv, w2i, vocab, all_s_words)
num_of_samples = min(1000000, int(scipy.special.comb(length*2,length)*100))
print('num of samples', num_of_samples)
larger = 0
    for i in range(num_of_samples):
        permute = np.random.permutation(X + Y)
        Xi = list(permute[:length])
        Yi = list(permute[length:])
        # pass wv, w2i, vocab explicitly (the original call referenced an undefined `space`)
        if s_group(Xi, Yi, A, B, wv, w2i, vocab, all_s_words) > s_orig:
            larger += 1
    return larger/float(num_of_samples)