|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import csv |
|
import os |
|
from pathlib import Path |
|
from collections import OrderedDict |
|
|
|
|
|
|
|
|
|
|
|
def load_templates(path):
    """Read sentence templates from a tab-separated file.

    The first line of the file is treated as a header and skipped. Every
    following non-blank line is stripped of surrounding whitespace, split on
    tab characters, and its first four fields are kept.

    Args:
        path: location of the TSV file to read.

    Returns:
        A list of ``(occupation, other_participant, answer, sentence)``
        tuples of strings, in file order. An empty file yields ``[]``.

    Raises:
        IndexError: if a non-blank data line has fewer than four fields.
    """
    templates = []
    # The context manager guarantees the handle is closed, even when a
    # malformed row raises part-way through the file.
    with open(path, 'r') as fp:
        # Skip the header row; the default keeps an empty file from raising
        # StopIteration.
        next(fp, None)
        for line in fp:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline at end of file) carry
                # no template.
                continue
            fields = stripped.split('\t')
            templates.append((fields[0], fields[1], fields[2], fields[3]))
    return templates
|
|
|
def generate(occupation, other_participant, sentence, second_ref="", context=None):
    """Fill in a template sentence and mask its pronoun placeholders.

    The sentence is split on single spaces, so a placeholder is only
    recognised when it stands alone as a whole token. Only the first
    ``$OCCUPATION`` and the first ``$PARTICIPANT`` token are substituted.

    Args:
        occupation: text that replaces the ``$OCCUPATION`` token.
        other_participant: text that replaces the ``$PARTICIPANT`` token when
            ``second_ref`` is empty.
        sentence: template containing ``$OCCUPATION`` and ``$PARTICIPANT``
            tokens, and optionally ``$NOM_PRONOUN``, ``$POSS_PRONOUN`` or
            ``$ACC_PRONOUN`` tokens.
        second_ref: if empty, ``other_participant`` is used. If
            ``'someone'``, the token preceding ``$PARTICIPANT`` is dropped
            and the participant becomes "someone" ("Someone" when it ends up
            first in the sentence). Any other value replaces the participant
            verbatim.
        context: not used by this function; kept so existing callers that
            pass it keep working.

    Returns:
        The filled-in sentence with every pronoun placeholder token replaced
        by the literal string ``"MASK"``.

    Raises:
        ValueError: if ``sentence`` has no ``$OCCUPATION`` or no
            ``$PARTICIPANT`` token.
    """
    toks = sentence.split(" ")
    occ_index = toks.index("$OCCUPATION")
    part_index = toks.index("$PARTICIPANT")
    toks[occ_index] = occupation

    if not second_ref:
        # Use the concrete participant supplied with the template.
        toks[part_index] = other_participant
    elif second_ref != 'someone':
        toks[part_index] = second_ref
    else:
        # "someone" takes no determiner, so drop the token in front of the
        # participant. Guard the sentence-initial case: with index 0 there is
        # nothing to drop, and slicing with -1 would duplicate the sentence.
        if part_index > 0:
            del toks[part_index - 1]
            part_index -= 1
        toks[part_index] = "Someone" if part_index == 0 else "someone"

    pronoun_placeholders = {"$NOM_PRONOUN", "$POSS_PRONOUN", "$ACC_PRONOUN"}
    return " ".join(
        "MASK" if tok in pronoun_placeholders else tok for tok in toks
    )
|
|
|
|
|
|
|
def get_sentences():
    """Build every masked sentence variant and write them to a TSV file.

    Templates are loaded from ``winogender_schema/templates.tsv`` next to
    this script (the directory is created if it is missing). For each
    template four sentences are generated: one with the template's own
    participant and one each with the participant replaced by "someone",
    "man" and "woman". All of them are written to
    ``winogender_schema/all_sentences.tsv`` under a ``sentid``/``sentence``
    header row.

    Returns:
        An ``OrderedDict`` mapping each sentence id to its masked sentence,
        in the order the rows were written.
    """
    schema_dir = os.path.join(os.path.dirname(__file__), "winogender_schema")
    Path(schema_dir).mkdir(parents=True, exist_ok=True)

    templates = load_templates(os.path.join(schema_dir, "templates.tsv"))

    # Stand-ins for the participant, in the order their rows are emitted
    # after the row that uses the template's own participant.
    substitute_refs = ('someone', 'man', 'woman')

    out_path = os.path.join(schema_dir, "all_sentences.tsv")
    with open(out_path, 'w', newline='') as out_file:
        writer = csv.writer(out_file, delimiter='\t')
        writer.writerow(['sentid', 'sentence'])
        sentence_dict = OrderedDict()

        for occupation, other_participant, answer, sentence in templates:
            # Row for the template's own participant comes first.
            rows = [[
                f"{occupation}_{other_participant}_{answer}",
                generate(occupation, other_participant, sentence),
            ]]
            for ref in substitute_refs:
                rows.append([
                    f"{occupation}_{ref}_{answer}",
                    generate(occupation, other_participant, sentence,
                             second_ref=ref),
                ])

            for sentid, text in rows:
                sentence_dict[sentid] = text
            writer.writerows(rows)

    return sentence_dict
|
|
|
|
|
|