Baseline human labels for ours vs. other methods, with 3-per-row voting.

In [1]:
import csv
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from collections import defaultdict

# Length of every per-method score/count array below, and the modulus that
# decode() applies to the two method fields of an order code.
# NOTE(review): presumably the number of methods compared per file - confirm.
MAX_FILES=2

In [2]:
def get_data(filename):
    """Read a CSV file and split it into its header row and its data rows.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A ``(headers, data)`` tuple. ``headers`` is the first row as a list
        of strings and ``data`` is a list of all remaining rows (each a list
        of strings). An empty file yields ``([], [])``.
    """
    # The context manager closes the file even if parsing raises;
    # newline='' lets the csv module handle line endings inside quoted fields.
    with open(filename, newline='') as csvfile:
        reader = csv.reader(csvfile)
        # Default to an empty header so an empty file does not leave
        # `headers` unbound.
        headers = next(reader, [])
        data = list(reader)
    return headers, data

# Get stats

Run the cells below in order to:
* print the on-topic and fluency statistics as comma-separated lines, ready to copy/paste
* save the per-topic percentages for plotting

## topics

In [3]:
# for topics
def decode(st):
    """Parse an underscore-separated string of integers into ``(ii, j1, j2)``.

    Every field is converted to ``int``. The first value is returned
    unchanged; the second and third are reduced modulo ``MAX_FILES``.
    """
    numbers = list(map(int, st.split('_')))
    # Version 2
    ii = numbers[0]
    j1 = np.mod(numbers[1], MAX_FILES)
    j2 = np.mod(numbers[2], MAX_FILES)
    return ii, j1, j2

# p-value of two binomial distributions
# one sided tail
def two_samp(x1, x2, n1, n2):
    """Return the one-tailed p-value of a pooled two-proportion z-test.

    Args:
        x1: Number of successes in the first sample.
        x2: Number of successes in the second sample.
        n1: Size of the first sample.
        n2: Size of the second sample.

    Returns:
        The standard-normal survival function evaluated at ``|z|``, where
        ``z`` is the difference of the two sample proportions divided by
        the standard error computed from the pooled proportion.
    """
    rate_1, rate_2 = x1 / n1, x2 / n2
    pooled_rate = (x1 + x2) / (n1 + n2)
    pooled_variance = pooled_rate * (1 - pooled_rate) * (1 / n1 + 1 / n2)
    z_score = (rate_1 - rate_2) / np.sqrt(pooled_variance)
    # Taking |z| makes the result independent of argument order.
    return stats.norm.sf(np.abs(z_score))

def print_info_t(scores, counts, single_pvalue=True):
    """Print per-method totals, on-topic counts, fractions and p-values.

    Args:
        scores: On-topic counts indexed by method. Must support elementwise
            division by ``counts`` (callers in this file pass NumPy integer
            arrays of length ``MAX_FILES``).
        counts: Total counts indexed by method, same shape as ``scores``.
        single_pvalue: When true and ``MAX_FILES == 2``, the single pairwise
            p-value is appended to the first printed line. Otherwise the
            full p-value matrix is printed after the per-method lines.

    Returns:
        None. All results are written to stdout as comma-separated lines.
    """
    pvalues = np.zeros((MAX_FILES, MAX_FILES))
    for i in range(MAX_FILES):
        for j in range(i, MAX_FILES):
            # two_samp() gives the same value for (i, j) and (j, i), so one
            # call fills both halves of the matrix.
            pvalue = two_samp(scores[i], scores[j], counts[i], counts[j])
            pvalues[i, j] = pvalue
            pvalues[j, i] = pvalue
    # Fractions in [0, 1], despite the "percentages" label in the header.
    percs = scores / counts

    inline_pvalue = single_pvalue and MAX_FILES == 2

    print('total counts, on topic counts, percentages:')
    for i in range(MAX_FILES):
        if i == 0 and inline_pvalue:
            print('{},{},{},{}'.format(counts[i], scores[i], percs[i], pvalues[0][1]))
        else:
            print('{},{},{}'.format(counts[i], scores[i], percs[i]))

    if not inline_pvalue:
        for row in pvalues:
            # Emit every column so the matrix is complete for any MAX_FILES.
            print(','.join('{}'.format(value) for value in row))

def get_counts_indices(data, order_index, label_indices):
    """Tally on-topic labels, treating every individual label as one sample.

    Args:
        data: Rows of string fields (as returned by ``get_data``).
        order_index: Column holding the encoded order string for ``decode``.
        label_indices: Columns holding the labels ('a', 'b', 'both' or
            'neither', case-insensitive).

    Returns:
        ``(scores, counts)`` integer arrays of length ``MAX_FILES``. The
        same totals are also printed through ``print_info_t``.
    """
    known_labels = ['a', 'b', 'both', 'neither']
    scores = np.zeros(MAX_FILES, dtype=int)
    counts = np.zeros(MAX_FILES, dtype=int)
    skipped = 0
    for row in data:
        order = row[order_index]
        for label_index in label_indices:
            label = row[label_index].lower()
            # A label is only usable when both it and the order code exist.
            if not (len(order) > 0 and len(label) > 0):
                skipped += 1
                continue
            a_cat, b_cat = decode(order)[1:]
            if label in ('a', 'both'):
                scores[a_cat] += 1
            if label in ('b', 'both'):
                scores[b_cat] += 1
            counts[a_cat] += 1
            counts[b_cat] += 1
            # Unknown labels are reported but still counted above.
            if label not in known_labels:
                print('******invalid label: {}'.format(label))
    print('skipped {}'.format(skipped))
    print_info_t(scores, counts)
    return scores, counts

# vote by row. each row contributes to one count (and 0 or 1 score based on majority vote)
def get_counts_vote_row(data, order_index, label_indices):
    """Tally on-topic results with one majority vote per row.

    Each row with a non-empty order code and a label in every column of
    ``label_indices`` adds one to the count of both decoded methods. A
    method's score increases by one when a strict majority of the row's
    labels favour it ('a' or 'both' for the first method, 'b' or 'both'
    for the second). With three label columns this matches the previous
    hard-coded ``== 3`` / ``// 2`` rule exactly.

    Args:
        data: Rows of string fields (as returned by ``get_data``).
        order_index: Column holding the encoded order string for ``decode``.
        label_indices: Columns holding the labels of one row.

    Returns:
        ``(scores, counts)`` integer arrays of length ``MAX_FILES``. The
        same totals are also printed through ``print_info_t``.
    """
    scores = np.zeros(MAX_FILES, dtype=int)
    counts = np.zeros(MAX_FILES, dtype=int)
    skipped = 0
    votes_expected = len(label_indices)
    for rownum, row in enumerate(data):
        order = row[order_index]
        if len(order) == 0:
            skipped += 1
            continue

        a_cat, b_cat = decode(order)[1:]
        row_score_a, row_score_b, row_counts = 0, 0, 0
        for label_index in label_indices:
            label = row[label_index].lower()
            if len(label) == 0:
                print('empty label for nonempty prompt', rownum)
                continue
            if label == 'a' or label == 'both':
                row_score_a += 1
            if label == 'b' or label == 'both':
                row_score_b += 1
            row_counts += 1
            if label not in ['a', 'b', 'both', 'neither']:
                print('******invalid label: {}'.format(label))

        # update big points: only fully labelled rows get a vote
        if row_counts > 0 and row_counts == votes_expected:
            # Strict majority: more than half of the labels.
            scores[a_cat] += int(2 * row_score_a > votes_expected)
            scores[b_cat] += int(2 * row_score_b > votes_expected)
            counts[a_cat] += 1
            counts[b_cat] += 1
        else:
            print('incomplete row...')
    print('skipped {}'.format(skipped))
    print_info_t(scores, counts)
    return scores, counts

## fluency

In [4]:
def print_info_f_lists(scorelist, single_pvalue=True):
    """Print summary statistics and t-test p-values for per-method scores.

    Args:
        scorelist: Sequence of score lists, indexed by method; the first
            ``MAX_FILES`` entries are used.
        single_pvalue: When true and ``scorelist`` has exactly two entries,
            the single pairwise p-value is appended to the first printed
            line. Otherwise the p-value matrix is printed afterwards.

    Returns:
        None. Prints 'skipping; no data' and returns early if any of the
        first ``MAX_FILES`` lists is empty.
    """
    if any(len(scorelist[k]) == 0 for k in range(MAX_FILES)):
        print('skipping; no data')
        return

    pvalues = np.zeros((MAX_FILES, MAX_FILES))
    for first in range(MAX_FILES):
        for second in range(first, MAX_FILES):
            result = stats.ttest_ind(scorelist[first], scorelist[second])
            # Mirror the value so the matrix is symmetric.
            pvalues[first, second] = pvalues[second, first] = result.pvalue

    inline_pvalue = single_pvalue and len(scorelist) == 2

    print('mean, stdev, min, max, counts:')
    for idx in range(MAX_FILES):
        values = scorelist[idx]
        fields = [np.mean(values), np.std(values),
                  np.min(values), np.max(values), len(values)]
        if idx == 0 and inline_pvalue:
            fields.append(pvalues[0][1])
        print(','.join('{}'.format(field) for field in fields))

    if inline_pvalue:
        return
    print('p-values')
    for row in pvalues:
        print('{},{}'.format(row[0],row[1]))

def get_fluencies_indices(data, order_index, label_indices):
    """Collect the integer fluency scores of each method.

    Args:
        data: Rows of string fields (as returned by ``get_data``).
        order_index: Column holding the encoded order string for ``decode``.
            Rows where it is empty are ignored.
        label_indices: Iterable of column-index tuples. Position ``i`` in
            each tuple holds the score for the ``i``-th method decoded from
            the row's order code.

    Returns:
        A list of ``MAX_FILES`` lists of ints, indexed by method. Summary
        statistics are also printed through ``print_info_f_lists``.
    """
    # One bucket per method, so this stays in step with MAX_FILES.
    scorelist = [[] for _ in range(MAX_FILES)]
    skipped = 0
    for row in data:
        order = row[order_index]
        if len(order) == 0:
            continue
        # The order code is fixed per row, so decode it once rather than
        # once per label tuple.
        cats = decode(order)[1:]
        for label_ind_pair in label_indices:
            for i, ind in enumerate(label_ind_pair):
                label = row[ind]
                if len(label) > 0:
                    scorelist[cats[i]].append(int(label))
                else:
                    skipped += 1
    print('skipped {}'.format(skipped))
    print_info_f_lists(scorelist)
    return scorelist

## Run on all files

In [7]:
# aggregated human labeled everything
# Each file name below is appended to dirname by plain string concatenation,
# so dirname must keep its trailing slash.
dirname = 'ctrl_wd_openai_csvs/'
# comment out any of the below if you don't want to include them in "all"
# The order of this list is also the row order of percs_ordered in the
# driver cell.
file_info = [
 'ctrl_legal.csv',
 'ctrl_politics.csv',
 'ctrl_religion.csv',
 'ctrl_science.csv',
 'ctrl_technologies.csv',
 'ctrl_positive.csv',
 'ctrl_negative.csv',
 'openai_positive.csv',
 'greedy_legal.csv',
 'greedy_military.csv',
 'greedy_politics.csv',
 'greedy_religion.csv',
 'greedy_science.csv',
 'greedy_space.csv',
 'greedy_technologies.csv',
 'greedy_positive.csv',
 'greedy_negative.csv',
]

In [9]:
# hardcoded indices
category_index = -1 # index of encoded seed and methods
# Columns passed as label_indices to get_counts_vote_row (one label each).
topic_indices = [2, 6, 10]
# Column pairs passed to get_fluencies_indices; position 0 / 1 in each pair
# is credited to the first / second method decoded from the row's order code.
fluency_indices = [(3,4), (7,8), (11,12)]

# Running totals across all files, indexed by method.
all_scores = np.zeros(MAX_FILES, dtype=int)
all_counts = np.zeros(MAX_FILES, dtype=int)
percs_ordered = np.zeros((len(file_info), MAX_FILES)) # percents saved in same order as file names
for i, fname in enumerate(file_info):
 filename = dirname + fname
 headers, data = get_data(filename)
 print(fname)
 # Prints this file's stats and returns its per-method score/count arrays.
 scores, counts = get_counts_vote_row(data, category_index, topic_indices)
 all_scores += scores
 all_counts += counts
 # Stored on a 0-100 scale, unlike the 0-1 fractions print_info_t prints.
 percs_ordered[i] = 100 * scores / counts
 print()
print('all:')
print_info_t(all_scores, all_counts)
print('\n------------\n')

# uber labeled fluencies
# Second pass over the same files: every CSV is read again, this time for the
# fluency columns.
all_fluencies = [[], []]
for fname in file_info:
 filename = dirname + fname
 headers, data = get_data(filename)
 print(fname)
 new_scores = get_fluencies_indices(data, category_index, fluency_indices)
 for i in range(len(all_fluencies)):
 all_fluencies[i].extend(new_scores[i])
 print()
print('all:')
print_info_f_lists(all_fluencies)
print('total counts')

for x in all_fluencies:
 print(len(x))
 
# Second name bound to the same list object as all_fluencies (not a copy).
all_scores_hist = all_fluencies

ctrl_legal.csv
skipped 0
total counts, on topic counts, percentages:
20,7,0.35,0.24507648020791256
20,5,0.25

ctrl_politics.csv
skipped 0
total counts, on topic counts, percentages:
20,7,0.35,0.16864350736717681
20,10,0.5

ctrl_religion.csv
skipped 0
total counts, on topic counts, percentages:
20,12,0.6,0.000782701129001274
20,20,1.0

ctrl_science.csv
skipped 0
total counts, on topic counts, percentages:
20,15,0.75,0.012580379600204389
20,8,0.4

ctrl_technologies.csv
skipped 0
total counts, on topic counts, percentages:
20,15,0.75,0.005502076588434386
20,7,0.35

ctrl_positive.csv
skipped 0
total counts, on topic counts, percentages:
15,13,0.8666666666666667,0.312103057383203
15,12,0.8

ctrl_negative.csv
skipped 0
total counts, on topic counts, percentages:
15,8,0.5333333333333333,0.12785217497142026
15,11,0.7333333333333333

openai_positive.csv
skipped 0
total counts, on topic counts, percentages:
45,38,0.8444444444444444,7.502148606340828e-12
45,6,0.13333333333333333

greedy_legal.csv

 
