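"""Build lexical dictionaries of vehicle and person terms from Open Dutch WordNet.

For each target concept, collect a lexical entry's synset siblings and all of its
hyponyms, merge in a few manually added terms, optionally filter the result against
an ignore list, and write the dictionaries to
`output/crashes/predict_bechdel/lexical_dicts.json`.
"""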
import json
import os
import sys

# Make the vendored OpenDutchWordnet library importable
sys.path.append("./libs")
from OpenDutchWordnet import Wn_grid_parser
def find_all_le_hyponyms(instance, le_id):
    """Return the IDs of all lexical entries in `le_id`'s synset and in all of
    its hyponym synsets (recursively)."""
    print(f"Starting from `{le_id}`...")
    le_el = instance.les_find_le(le_id)
    le_ss = le_el.get_synset_id()

    # Lexical entries that share the starting entry's synset ("siblings")
    siblings = {le.get_id() for le in instance.les_all_les_of_one_synset(le_ss)}
    print(f"Siblings: {siblings}")

    synset_el = instance.synsets_find_synset(le_ss)
    print(f"Top-level synset: `{le_ss}`...")
    hyponyms = find_all_synset_hyponyms(instance, synset_el)
    return siblings.union(hyponyms)
def find_all_synset_hyponyms(instance, synset_el):
    """Recursively collect the lexical-entry IDs of all hyponym synsets of `synset_el`."""
    print(f"Finding hyponyms of synset with gloss: `{synset_el.get_glosses()[:1]}`...")
    hypo_les = set()
    for rel in synset_el.get_relations("has_hyponym"):
        hypo_ss = rel.get_target()
        print(hypo_ss)
        ss_les = {le.get_id() for le in instance.les_all_les_of_one_synset(hypo_ss)}
        for found_le in ss_les:
            print(f"\tfound LE: {found_le}")
        # Recurse to pick up hyponyms-of-hyponyms
        ss_les.update(find_all_synset_hyponyms(instance, instance.synsets_find_synset(hypo_ss)))
        hypo_les.update(ss_les)
    return hypo_les
def find_siblings_and_hyperonym(instance, le_id):
    """Debugging helper: print `le_id`'s synset siblings, then the glosses and
    lexical entries of its first hyperonym synset."""
    le_el = instance.les_find_le(le_id)
    le_ss = le_el.get_synset_id()
    siblings = {le.get_id() for le in instance.les_all_les_of_one_synset(le_ss)}
    print(siblings)

    synset_el = instance.synsets_find_synset(le_ss)
    hyper = synset_el.get_relations("has_hyperonym")[0]
    hyper_ss = instance.synsets_find_synset(hyper.get_target())
    print(hyper_ss.get_glosses())
    print({le.get_id() for le in instance.les_all_les_of_one_synset(hyper.get_target())})
def main():
    instance = Wn_grid_parser(Wn_grid_parser.odwn)
    # find_all_le_hyponyms(instance, "slachtoffer-n-4")

    # For each subcategory, collect WordNet siblings and hyponyms; "extra"
    # holds manually added terms not pulled from WordNet
    dicts = {
        "vehicles": {
            "WN:cars": sorted(find_all_le_hyponyms(instance, "automobiel-n-1")),
            "WN:motorbikes": sorted(find_all_le_hyponyms(instance, "motorfiets-n-1")),
            "WN:bikes": sorted(find_all_le_hyponyms(instance, "fiets-n-1")),
            "WN:buses": sorted(find_all_le_hyponyms(instance, "autobus-n-1")),
            "extra": sorted(["scootmobiel", "e-bike"]),
        },
        "persons": {
            "WN:driver": sorted(find_all_le_hyponyms(instance, "bestuurder-n-2")),
            "WN:cyclist": sorted(find_all_le_hyponyms(instance, "fietser-n-1")),
            "WN:walker": sorted(find_all_le_hyponyms(instance, "loper-n-4")),
            "WN:pedestrian": sorted(find_all_le_hyponyms(instance, "voetganger-n-1")),
            "WN:victim": sorted(find_all_le_hyponyms(instance, "slachtoffer-n-4")),
            "extra": sorted(
                ["man", "vrouw", "jongen", "meisje", "persoon", "bejaarde", "maaltijdbezorger"]
            ),
        },
    }
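    # The optional ignore file mirrors the structure of `dicts`: categories map
    # to subcategories, each listing words to drop, e.g. (hypothetical values):
    #   {"persons": {"WN:driver": ["machinist"]}}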
    ignore_file = "output/crashes/predict_bechdel/lexical_dicts_ignore.json"
    if os.path.isfile(ignore_file):
        with open(ignore_file, encoding="utf-8") as f_ign:
            ignore = json.load(f_ign)
        # Drop every word that appears in the matching ignore list
        cleaned_dicts = {}
        for category in dicts:
            cleaned_dicts[category] = {}
            for subcat, words in dicts[category].items():
                ignore_subcat = set(ignore.get(category, {}).get(subcat, []))
                cleaned_dicts[category][subcat] = [w for w in words if w not in ignore_subcat]
    else:
        cleaned_dicts = dicts

    out_file = "output/crashes/predict_bechdel/lexical_dicts.json"
    os.makedirs(os.path.dirname(out_file), exist_ok=True)  # ensure the output directory exists
    with open(out_file, "w", encoding="utf-8") as f_out:
        json.dump(cleaned_dicts, f_out, indent=4)
if __name__ == "__main__":
    main()