llm_uncertainty / winogender_sentences.py
emilylearning's picture
update explanations / comments
a828a08
raw
history blame
4.01 kB
######################################################################
##
## This script is a lightly modified version of that provided in winogender-schemas
## https://github.com/rudinger/winogender-schemas
##
######################################################################
import csv
import os
from pathlib import Path
from collections import OrderedDict
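# This script instantiates every template in winogender_schema/templates.tsv
# (a directory located next to this file) and writes the resulting sentences,
# with each pronoun slot replaced by the token MASK, to
# winogender_schema/all_sentences.tsv.
# Each template yields four sentences: one with the template's own participant
# and one each with "someone", "man" and "woman" as the participant.
# Call get_sentences() with no arguments; it also returns the sentences as an
# ordered dict keyed by sentence id.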
def load_templates(path):
    """Read the template TSV file at ``path``.

    The first line is treated as a header and skipped. Each remaining
    non-blank line is stripped, split on tabs, and its first four columns are
    taken as occupation, other participant, answer and sentence.

    Args:
        path: Location of the tab-separated template file.

    Returns:
        A list of ``(occupation, other_participant, answer, sentence)``
        tuples of strings, in file order. Empty if the file has no data rows.

    Raises:
        IndexError: If a non-blank row has fewer than four columns.
    """
    templates = []
    # The context manager guarantees the handle is closed, even when a
    # malformed row raises part-way through the file.
    with open(path, 'r') as fp:
        # Skip the header; the default keeps an empty file from raising
        # StopIteration.
        next(fp, None)
        for line in fp:
            fields = line.strip().split('\t')
            if fields == ['']:
                # Blank line (e.g. trailing newline at end of file).
                continue
            occupation, other_participant, answer, sentence = (
                fields[0], fields[1], fields[2], fields[3])
            templates.append((occupation, other_participant, answer, sentence))
    return templates
def generate(occupation, other_participant, sentence, second_ref="", context=None):
    """Instantiate one template sentence and mask its pronoun slots.

    The sentence is split on single spaces, so placeholders are only
    recognised when they stand alone as a whole token.

    Args:
        occupation: Text substituted for the ``$OCCUPATION`` token.
        other_participant: Text substituted for ``$PARTICIPANT`` when
            ``second_ref`` is empty.
        sentence: Template containing ``$OCCUPATION`` and ``$PARTICIPANT``
            tokens and, optionally, ``$NOM_PRONOUN`` / ``$POSS_PRONOUN`` /
            ``$ACC_PRONOUN`` tokens.
        second_ref: Overrides the participant. ``""`` uses
            ``other_participant``; ``"someone"`` drops the token before
            ``$PARTICIPANT`` (expected to be "the") and inserts
            "someone"/"Someone"; any other value is inserted verbatim.
        context: Accepted for interface compatibility; not used.

    Returns:
        The instantiated sentence with every pronoun placeholder replaced by
        the literal token ``MASK``.

    Raises:
        ValueError: If ``$OCCUPATION`` or ``$PARTICIPANT`` is not a token of
            ``sentence``.
    """
    toks = sentence.split(" ")
    occ_index = toks.index("$OCCUPATION")
    part_index = toks.index("$PARTICIPANT")
    toks[occ_index] = occupation

    if not second_ref:
        # Use the instantiated participant, e.g. "client", "patient", ...
        toks[part_index] = other_participant
    elif second_ref != 'someone':
        toks[part_index] = second_ref
    else:
        # Bleached NP "someone": remove the token that precedes
        # $PARTICIPANT (i.e. "the"). Only do so when such a token exists;
        # with part_index == 0 a slice ending at part_index - 1 would wrap
        # around to -1 and duplicate the sentence.
        if part_index > 0:
            del toks[part_index - 1]
            part_index -= 1
        # Capitalise when the participant now opens the sentence.
        toks[part_index] = "Someone" if part_index == 0 else "someone"

    pronoun_slots = {"$NOM_PRONOUN", "$POSS_PRONOUN", "$ACC_PRONOUN"}
    return " ".join("MASK" if tok in pronoun_slots else tok for tok in toks)
# %%
def get_sentences():
    """Build all masked sentences, write them to disk, and return them.

    Templates are read from ``winogender_schema/templates.tsv`` in the
    directory of this file (the directory is created if missing). For each
    template four sentences are generated: one with the template's own
    participant and one each with "someone", "man" and "woman". All of them
    are written, after a ``sentid``/``sentence`` header row, to
    ``winogender_schema/all_sentences.tsv`` as tab-separated rows.

    Returns:
        An ``OrderedDict`` mapping ``"{occupation}_{participant}_{answer}"``
        ids to sentences, in generation order.
    """
    schema_dir = os.path.join(os.path.dirname(__file__), "winogender_schema")
    Path(schema_dir).mkdir(parents=True, exist_ok=True)

    templates = load_templates(os.path.join(schema_dir, "templates.tsv"))
    output_path = os.path.join(schema_dir, "all_sentences.tsv")

    with open(output_path, 'w', newline='') as out_file:
        writer = csv.writer(out_file, delimiter='\t')
        writer.writerow(['sentid', 'sentence'])
        sentence_dict = OrderedDict()
        for occupation, other_participant, answer, sentence in templates:
            # (second_ref passed to generate, label used in the sentence id)
            variants = (
                ("", other_participant),
                ("someone", "someone"),
                ("man", "man"),
                ("woman", "woman"),
            )
            # Generate all four variants before writing any of them, so a
            # failing template never leaves a partial group in the file.
            rows = []
            for second_ref, label in variants:
                variant_sentence = generate(
                    occupation, other_participant, sentence,
                    second_ref=second_ref)
                rows.append([f"{occupation}_{label}_{answer}", variant_sentence])
            for sentid, variant_sentence in rows:
                sentence_dict[sentid] = variant_sentence
            writer.writerows(rows)
    return sentence_dict