Gosse Minnema
Add sociofillmore code, load dataset via private dataset repo
b11ac48
raw
history blame
No virus
13.1 kB
import random
import json
import os
# Fixed seed so that repeated runs draw the same sequence of random choices.
random.seed(2021)

# Target number of distinct sentences to generate.
NUM_SENTENCES = 100_000
# Generation stops early after this many consecutive duplicate sentences.
NUM_FAILS = 25

# Sentence categories, keyed by how many parties the template has slots for.
SENT_TYPES = ("0_PTY", "1_PTY", "2_PTY")
SENT_1_PTY_TYPES = ("VICTIM", "OUTCOME", "DRIVE")
SENT_ACTIVE_TYPES = ("ACTIVE", "NON_ACTIVE")

# Sentence templates; every "[[NAME]]" placeholder is filled by make_sentence().
SENTS_0_PTY_OUTCOME = (
    "[[OUTCOME]] [[CIRCUMSTANCE]] [[PLACE]]",
    "[[OUTCOME]] [[CIRCUMSTANCE]] [[TIME]]",
    "[[OUTCOME]] [[CIRCUMSTANCE]]",
)
SENTS_1_PTY_VICTIM = (
    "[[SUBJECT]] [[VERB_V2]] [[PLACE]]",
    "[[SUBJECT]] [[TIME]] [[VERB_V2]]",
    "[[SUBJECT]] [[VERB_V2]]",
)
SENTS_1_PTY_OUTCOME = (
    "[[SUBJECT]] [[OUTCOME]] [[PLACE]] [[CIRCUMSTANCE]]",
    "[[SUBJECT]] [[OUTCOME]] [[CIRCUMSTANCE]]",
)
SENTS_1_PTY_DRIVE = (
    "[[SUBJECT]] [[VP_DRIVE]] [[PLACE]]",
    "[[SUBJECT]] [[VP_DRIVE]]",
)
SENTS_2_PTYS = (
    "[[SUBJECT]] [[VERB_V2]] [[VERB_P]] [[OTHER]] [[VERB_REST]] [[PLACE]]",
    "[[SUBJECT]] [[VERB_V2]] [[TIME]] [[VERB_P]] [[OTHER]] [[VERB_REST]]",
    "[[SUBJECT]] [[VERB_V2]] [[VERB_P]] [[OTHER]] [[VERB_REST]]",
)

# Fillers for the [[PLACE]] and [[TIME]] slots.
PLACES = ("op stationsplein", "in stadscentrum", "op kruispunt Westerhaven", "op A27", "op A10", "in Lelystad",
          "in Assen", "in Amsterdam", "bij Renkum", "in Schilderswijk", "bij knooppunt Lunetten", "op zuidelijke ringweg",
          "in de buurt van de Erasmusbrug", "op schoolplein Stedelijk Gymnasium", "bij afrit Rotterdam-Noord", "op Kanaleneiland")
TIMES = ("tijdens avondspits", "vrijdagavond",
         "dinsdagochtend", "donderdagnacht", "rond middaguur")

# Fillers for the [[CIRCUMSTANCE]] slot. The *_AGT variants name a perpetrator
# ("dader" / "bestuurder"), which main() counts as an extra human party.
CIRCUMSTANCES = ("na ongeluk", "na aanrijding", "na botsing", "na crash")
# NOTE(review): ", dader ervandoor" is listed twice, which doubles its sampling
# weight -- confirm this is intentional.
CIRCUMSTANCES_AGT = (", dader ervandoor", ", dader ervandoor", ", dader rijdt door", ", bestuurder rijdt door")

# Fillers for the [[OUTCOME]] slot (0-party and 1-party sentences).
OUTCOME_0_TYPES = ("TRAFFIC", "HUMAN")
OUTCOMES_0_TRAFFIC = ("verkeersopstopping", "file", "veel vertraging")
OUTCOMES_0_HUMAN = ("dode", "zwaargewonde", "gewonde", "drie gewonden")
OUTCOMES_1 = ("dood", "overleden", "zwaargewond", "lichtgewond", "ongedeerd")

# Party type tags. The last entry was previously misspelt "VERHICLE"; it now
# matches the "VEHICLE" tag used elsewhere in this file.
SUBJECT_TYPES = ("WEAK_PTY", "DRIVER", "VEHICLE")

# Fillers for the [[VP_DRIVE]] slot of 1-party "drive" sentences.
VPS_DRIVE_ACTIVE = ("rijdt tegen boom", "veroorzaakt ongeluk")
VPS_DRIVE_NON_ACTIVE = ("verongelukt", "gecrasht", "uit de bocht gevlogen", "raakt gewond", "raakt gewond door klap")

# Verbs for 1-party "victim" sentences (fill [[VERB_V2]] directly).
EVENT_VERBS_1_VICTIM = ("aangereden", "geschept", "raakt gewond", "raakt gewond door klap")

# Verbs for 2-party sentences, encoded as "VERB_V2|VERB_P|VERB_REST".
# A "_" marks an empty slot; make_sentence() strips it from the output.
EVENT_VERBS_2_ACTIVE_ANY = ("raakt|_|_", "botst|op|_", "botst|tegen|_")
EVENT_VERBS_2_ACTIVE_DRIVE = ("rijdt|_|aan", "rijdt|_|dood", "schept|_|_")
EVENT_VERBS_2_NON_ACTIVE_DRIVER = (
    "aangereden|door|_", "geschept|door|_")
EVENT_VERBS_2_NON_ACTIVE_VEHICLE = (
    "aangereden|door|_", "geschept|door|_", "komt|onder|_")
EVENT_VERBS_2_NON_ACTIVE_ANY = (
    "geraakt|door|_",)

# Noun phrases per party type. "[[PERSON]]" is expanded by
# generate_weak_pty() / generate_driver().
WEAK_PTY_NPS = ("fietser", "skateboarder", "wielrenner", "rolschaatser", "jogger", "voetganger", "motorrijder",
                "fietskoerier", "[[PERSON]] op fiets", "[[PERSON]] op e-bike")
ANY_PERSON_NPS = ("vrouw", "man", "meisje", "jongen",
                  "bejaarde vrouw", "bejaarde man", "Duitser", "toerist")
CYCLIST_PERSON_NPS = ("postbode", "maaltijdbezorger", "politieagent")
DRIVER_NPS = ("automobilist", "automobiliste", "bestuurder", "dronken automobilist", "dronken bestuurder", "motorrijder",
              "minderjarige bestuurder", "trucker", "taxichauffeur", "[[PERSON]] in auto", "dronken [[PERSON]] in auto")
VEHICLE_NPS = ("auto", "personenauto", "vrachtwagen", "tractor", "auto met caravan", "scooter", "motor",
               "tram", "stadsbus", "lijn 10", "touringcar", "camper", "vorkheftruck")
def generate_weak_pty():
    """Return a random noun phrase for a vulnerable road user.

    A phrase is drawn from ``WEAK_PTY_NPS``. When it contains the
    ``[[PERSON]]`` placeholder, a person noun drawn from
    ``ANY_PERSON_NPS + CYCLIST_PERSON_NPS`` is substituted for it.
    """
    phrase = random.choice(WEAK_PTY_NPS)
    if "[[PERSON]]" not in phrase:
        return phrase
    rider = random.choice(ANY_PERSON_NPS + CYCLIST_PERSON_NPS)
    return phrase.replace("[[PERSON]]", rider)
def generate_driver():
    """Return a random noun phrase for a driver.

    A phrase is drawn from ``DRIVER_NPS``. When it contains the
    ``[[PERSON]]`` placeholder, a person noun drawn from ``ANY_PERSON_NPS``
    is substituted for it.
    """
    phrase = random.choice(DRIVER_NPS)
    if "[[PERSON]]" not in phrase:
        return phrase
    occupant = random.choice(ANY_PERSON_NPS)
    return phrase.replace("[[PERSON]]", occupant)
def make_sentence(template, fields):
    """Fill a sentence template and tidy up the result.

    Args:
        template: String containing ``[[NAME]]`` placeholders.
        fields: Mapping from placeholder name (without brackets) to the text
            that replaces it. Entries with no matching placeholder are ignored.

    Returns:
        The filled-in sentence with ``_`` markers removed, whitespace runs
        collapsed to single spaces, surrounding whitespace stripped and the
        first character upper-cased. An empty result is returned as ``""``.
    """
    sentence = template
    for field, value in fields.items():
        sentence = sentence.replace(f"[[{field}]]", value)
    # "_" marks an empty slot; dropping it leaves two adjacent spaces, so the
    # whitespace is re-joined afterwards. This must happen after the loop:
    # several placeholder names contain underscores themselves.
    sentence = " ".join(sentence.replace("_", "").split())
    if not sentence:
        # Nothing to capitalise; avoids an IndexError on sentence[0].
        return sentence
    return sentence[0].upper() + sentence[1:]
def _count_party(label, human):
    """Record one more mentioned party in ``label`` (and one more human if ``human``)."""
    label["party_mentioned"] += 1
    if human:
        label["party_human"] += 1


def _pick_circumstance(fields):
    """Fill ``fields["CIRCUMSTANCE"]``; return True if the phrase names a perpetrator."""
    if random.random() < 0.5:
        fields["CIRCUMSTANCE"] = random.choice(CIRCUMSTANCES)
        return False
    fields["CIRCUMSTANCE"] = random.choice(CIRCUMSTANCES_AGT)
    return True


def _add_party(fields, label, slot, party_type):
    """Fill ``fields[slot]`` with a party of ``party_type`` and update ``label``.

    ``"WEAK_PTY"`` and ``"DRIVER"`` count as human parties; any other value is
    treated as a vehicle, which is counted as mentioned but not human.
    """
    if party_type == "WEAK_PTY":
        fields[slot] = generate_weak_pty()
        _count_party(label, human=True)
    elif party_type == "DRIVER":
        fields[slot] = generate_driver()
        _count_party(label, human=True)
    else:  # vehicle
        fields[slot] = random.choice(VEHICLE_NPS)
        _count_party(label, human=False)


def _set_verb(fields, verbs):
    """Pick a "V2|P|REST" verb entry, store its three parts in ``fields``, return the entry."""
    verb = random.choice(verbs)
    fields["VERB_V2"], fields["VERB_P"], fields["VERB_REST"] = verb.split("|")
    return verb


def _generate_example():
    """Generate one random ``(sentence, label)`` pair.

    ``label`` is a dict with the number of parties mentioned
    (``party_mentioned``), how many of them are human (``party_human``) and
    whether the sentence uses an active construction (``active``).
    """
    fields = {"TIME": random.choice(TIMES), "PLACE": random.choice(PLACES)}
    label = {"party_mentioned": 0, "party_human": 0, "active": False}
    # True when the chosen circumstance phrase names a perpetrator. It is only
    # counted once the template is known, because not every template has a
    # [[CIRCUMSTANCE]] slot.
    names_perpetrator = False

    sent_type = random.choice(SENT_TYPES)
    if sent_type == "0_PTY":
        names_perpetrator = _pick_circumstance(fields)
        if random.choice(OUTCOME_0_TYPES) == "TRAFFIC":
            fields["OUTCOME"] = random.choice(OUTCOMES_0_TRAFFIC)
        else:
            # Human outcomes count as a mentioned human party.
            fields["OUTCOME"] = random.choice(OUTCOMES_0_HUMAN)
            _count_party(label, human=True)
        templates = SENTS_0_PTY_OUTCOME
    elif sent_type == "1_PTY":
        names_perpetrator = _pick_circumstance(fields)
        sent_subtype = random.choice(SENT_1_PTY_TYPES)
        if sent_subtype == "VICTIM":
            _add_party(fields, label, "SUBJECT", "WEAK_PTY")
            fields["VERB_V2"] = random.choice(EVENT_VERBS_1_VICTIM)
            templates = SENTS_1_PTY_VICTIM
        elif sent_subtype == "OUTCOME":
            subject_type = random.choice(["WEAK_PTY", "DRIVER"])
            fields["OUTCOME"] = random.choice(OUTCOMES_1)
            _add_party(fields, label, "SUBJECT", subject_type)
            templates = SENTS_1_PTY_OUTCOME
        else:  # drive
            subject_type = random.choice(["DRIVER", "VEHICLE"])
            if random.choice(SENT_ACTIVE_TYPES) == "ACTIVE":
                fields["VP_DRIVE"] = random.choice(VPS_DRIVE_ACTIVE)
                label["active"] = True
            else:
                fields["VP_DRIVE"] = random.choice(VPS_DRIVE_NON_ACTIVE)
            _add_party(fields, label, "SUBJECT", subject_type)
            templates = SENTS_1_PTY_DRIVE
    else:  # 2 pty
        if random.choice(SENT_ACTIVE_TYPES) == "ACTIVE":
            subject_type = random.choice(["WEAK_PTY", "DRIVER", "VEHICLE"])
            label["active"] = True
            if subject_type == "WEAK_PTY":
                _set_verb(fields, EVENT_VERBS_2_ACTIVE_ANY)
                _add_party(fields, label, "SUBJECT", subject_type)
                other_type = random.choice(["WEAK_PTY", "VEHICLE"])
            elif subject_type == "DRIVER":
                _add_party(fields, label, "SUBJECT", subject_type)
                if random.random() < 0.5:
                    _set_verb(fields, EVENT_VERBS_2_ACTIVE_ANY)
                    other_type = random.choice(["WEAK_PTY", "VEHICLE"])
                else:
                    _set_verb(fields, EVENT_VERBS_2_ACTIVE_DRIVE)
                    other_type = "WEAK_PTY"
            else:  # vehicle
                verb = _set_verb(
                    fields, EVENT_VERBS_2_ACTIVE_ANY + EVENT_VERBS_2_ACTIVE_DRIVE)
                _add_party(fields, label, "SUBJECT", subject_type)
                # Previously no object type was chosen here, so the value left
                # over from an earlier iteration was used (or the name was
                # unbound). The same pairing as for drivers is applied:
                # driving verbs take a vulnerable road user as object.
                if verb in EVENT_VERBS_2_ACTIVE_DRIVE:
                    other_type = "WEAK_PTY"
                else:
                    other_type = random.choice(["WEAK_PTY", "VEHICLE"])
            _add_party(fields, label, "OTHER", other_type)
        else:  # non-active
            other_type = random.choice(["WEAK_PTY", "DRIVER", "VEHICLE"])
            _add_party(fields, label, "OTHER", other_type)
            # A vulnerable road user as agent always gets the generic verbs;
            # drivers and vehicles get them half of the time.
            if other_type == "WEAK_PTY" or random.random() < 0.5:
                _set_verb(fields, EVENT_VERBS_2_NON_ACTIVE_ANY)
                subject_type = random.choice(["WEAK_PTY", "DRIVER", "VEHICLE"])
            else:
                if other_type == "DRIVER":
                    _set_verb(fields, EVENT_VERBS_2_NON_ACTIVE_DRIVER)
                else:  # vehicle
                    _set_verb(fields, EVENT_VERBS_2_NON_ACTIVE_VEHICLE)
                # The agent-specific verbs only take a vulnerable road user
                # as subject.
                subject_type = "WEAK_PTY"
            _add_party(fields, label, "SUBJECT", subject_type)
        templates = SENTS_2_PTYS

    template = random.choice(templates)
    # Count the perpetrator only if the circumstance phrase actually ends up
    # in the sentence; some 1-party templates have no [[CIRCUMSTANCE]] slot.
    if names_perpetrator and "[[CIRCUMSTANCE]]" in template:
        _count_party(label, human=True)
    return make_sentence(template, fields), label


def main():
    """Generate labelled template sentences and write them to a JSONL file.

    Sentences are generated until ``NUM_SENTENCES`` distinct ones exist or
    ``NUM_FAILS`` duplicates have been produced in a row. Each output line is
    a JSON object with the keys ``sentence`` and ``label``.
    """
    output_path = "output/crashes/generate_templates/sentences.jsonl"

    sentences = {}
    dup_fails = 0
    while len(sentences) < NUM_SENTENCES and dup_fails < NUM_FAILS:
        sentence, label = _generate_example()
        if sentence in sentences:
            dup_fails += 1
        else:
            sentences[sentence] = label
            dup_fails = 0

    # open() does not create missing parent directories.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f_out:
        for sentence, label in sentences.items():
            # Text mode already translates "\n" to the platform line ending;
            # writing os.linesep here would produce "\r\r\n" on Windows.
            f_out.write(json.dumps({"sentence": sentence, "label": label}) + "\n")
        # NOTE(review): assumed to be written once after all records (a
        # trailing blank line), not once per record -- confirm against the
        # expected file format.
        f_out.write("\n")


# Run the generator only when executed as a script, not when imported.
if __name__ == "__main__":
    main()