|
import random |
|
import json |
|
import os |
|
|
|
# Seed the module-level generator so repeated runs draw the same
# pseudo-random sequence.
random.seed(2021)

# Upper bound on the number of distinct sentences main() collects.
NUM_SENTENCES = 100_000

# main() stops early after this many consecutive draws that only reproduce
# sentences it has already collected.
NUM_FAILS = 25
|
|
|
|
|
# Top-level sentence kinds main() chooses between.
SENT_TYPES = ("0_PTY", "1_PTY", "2_PTY")

# Sub-kinds of a "1_PTY" sentence.
SENT_1_PTY_TYPES = ("VICTIM", "OUTCOME", "DRIVE")

# Whether the sentence is generated with label["active"] set to True.
SENT_ACTIVE_TYPES = ("ACTIVE", "NON_ACTIVE")
|
|
|
# Sentence templates. Each ``[[NAME]]`` placeholder is substituted by
# make_sentence() with the value stored under ``NAME`` in its ``fields``
# mapping.
SENTS_0_PTY_OUTCOME = (
    "[[OUTCOME]] [[CIRCUMSTANCE]] [[PLACE]]",
    "[[OUTCOME]] [[CIRCUMSTANCE]] [[TIME]]",
    "[[OUTCOME]] [[CIRCUMSTANCE]]",
)

# NOTE: these templates have no [[CIRCUMSTANCE]] slot.
SENTS_1_PTY_VICTIM = (
    "[[SUBJECT]] [[VERB_V2]] [[PLACE]]",
    "[[SUBJECT]] [[TIME]] [[VERB_V2]]",
    "[[SUBJECT]] [[VERB_V2]]",
)

SENTS_1_PTY_OUTCOME = (
    "[[SUBJECT]] [[OUTCOME]] [[PLACE]] [[CIRCUMSTANCE]]",
    "[[SUBJECT]] [[OUTCOME]] [[CIRCUMSTANCE]]",
)

# NOTE: these templates have no [[CIRCUMSTANCE]] slot either.
SENTS_1_PTY_DRIVE = (
    "[[SUBJECT]] [[VP_DRIVE]] [[PLACE]]",
    "[[SUBJECT]] [[VP_DRIVE]]",
)

SENTS_2_PTYS = (
    "[[SUBJECT]] [[VERB_V2]] [[VERB_P]] [[OTHER]] [[VERB_REST]] [[PLACE]]",
    "[[SUBJECT]] [[VERB_V2]] [[TIME]] [[VERB_P]] [[OTHER]] [[VERB_REST]]",
    "[[SUBJECT]] [[VERB_V2]] [[VERB_P]] [[OTHER]] [[VERB_REST]]",
)
|
|
|
# Fillers for the [[PLACE]] slot.
PLACES = (
    "op stationsplein",
    "in stadscentrum",
    "op kruispunt Westerhaven",
    "op A27",
    "op A10",
    "in Lelystad",
    "in Assen",
    "in Amsterdam",
    "bij Renkum",
    "in Schilderswijk",
    "bij knooppunt Lunetten",
    "op zuidelijke ringweg",
    "in de buurt van de Erasmusbrug",
    "op schoolplein Stedelijk Gymnasium",
    "bij afrit Rotterdam-Noord",
    "op Kanaleneiland",
)

# Fillers for the [[TIME]] slot.
TIMES = (
    "tijdens avondspits",
    "vrijdagavond",
    "dinsdagochtend",
    "donderdagnacht",
    "rond middaguur",
)

# Fillers for the [[CIRCUMSTANCE]] slot that do not add a party to the label.
CIRCUMSTANCES = ("na ongeluk", "na aanrijding", "na botsing", "na crash")

# Fillers for the [[CIRCUMSTANCE]] slot for which main() counts one extra
# (human) party in the label.
# NOTE(review): ", dader ervandoor" is listed twice, which doubles its
# probability under random.choice -- kept as-is; confirm whether intentional.
CIRCUMSTANCES_AGT = (
    ", dader ervandoor",
    ", dader ervandoor",
    ", dader rijdt door",
    ", bestuurder rijdt door",
)
|
|
|
# Kinds of [[OUTCOME]] filler used in "0_PTY" sentences; main() counts an
# extra human party only for the "HUMAN" kind.
OUTCOME_0_TYPES = ("TRAFFIC", "HUMAN")

OUTCOMES_0_TRAFFIC = ("verkeersopstopping", "file", "veel vertraging")

OUTCOMES_0_HUMAN = ("dode", "zwaargewonde", "gewonde", "drie gewonden")

# [[OUTCOME]] fillers used in "1_PTY" / "OUTCOME" sentences, where they follow
# an explicit [[SUBJECT]].
OUTCOMES_1 = ("dood", "overleden", "zwaargewond", "lichtgewond", "ongedeerd")
|
|
|
# Kinds of party that can fill a [[SUBJECT]] / [[OTHER]] slot.
# The last entry used to be misspelled "VERHICLE"; it now matches the
# "VEHICLE" spelling used for the other-party choices in main().
SUBJECT_TYPES = ("WEAK_PTY", "DRIVER", "VEHICLE")
|
|
|
# Complete verb phrases for the [[VP_DRIVE]] slot of "1_PTY" / "DRIVE"
# sentences. main() sets label["active"] only for the ACTIVE variants.
VPS_DRIVE_ACTIVE = ("rijdt tegen boom", "veroorzaakt ongeluk")

VPS_DRIVE_NON_ACTIVE = (
    "verongelukt",
    "gecrasht",
    "uit de bocht gevlogen",
    "raakt gewond",
    "raakt gewond door klap",
)

# Fillers for [[VERB_V2]] in "1_PTY" / "VICTIM" sentences.
EVENT_VERBS_1_VICTIM = (
    "aangereden",
    "geschept",
    "raakt gewond",
    "raakt gewond door klap",
)

# Two-party verbs are encoded as "V2|P|REST": main() splits each entry on "|"
# into the [[VERB_V2]], [[VERB_P]] and [[VERB_REST]] slots. A part written as
# "_" is an empty slot; make_sentence() strips the underscore.
EVENT_VERBS_2_ACTIVE_ANY = ("raakt|_|_", "botst|op|_", "botst|tegen|_")

EVENT_VERBS_2_ACTIVE_DRIVE = ("rijdt|_|aan", "rijdt|_|dood", "schept|_|_")

EVENT_VERBS_2_NON_ACTIVE_DRIVER = (
    "aangereden|door|_",
    "geschept|door|_",
)

EVENT_VERBS_2_NON_ACTIVE_VEHICLE = (
    "aangereden|door|_",
    "geschept|door|_",
    "komt|onder|_",
)

# Single-element tuple: the trailing comma is required.
EVENT_VERBS_2_NON_ACTIVE_ANY = (
    "geraakt|door|_",
)
|
|
|
|
|
|
|
# Noun phrases for "weak party" participants. Entries containing
# "[[PERSON]]" are completed by generate_weak_pty().
WEAK_PTY_NPS = (
    "fietser",
    "skateboarder",
    "wielrenner",
    "rolschaatser",
    "jogger",
    "voetganger",
    "motorrijder",
    "fietskoerier",
    "[[PERSON]] op fiets",
    "[[PERSON]] op e-bike",
)

# Person nouns usable for any "[[PERSON]]" slot.
ANY_PERSON_NPS = (
    "vrouw",
    "man",
    "meisje",
    "jongen",
    "bejaarde vrouw",
    "bejaarde man",
    "Duitser",
    "toerist",
)

# Extra person nouns that generate_weak_pty() offers in addition to
# ANY_PERSON_NPS; generate_driver() does not use them.
CYCLIST_PERSON_NPS = ("postbode", "maaltijdbezorger", "politieagent")

# Noun phrases for drivers. Entries containing "[[PERSON]]" are completed by
# generate_driver(). Note that "motorrijder" also appears in WEAK_PTY_NPS.
DRIVER_NPS = (
    "automobilist",
    "automobiliste",
    "bestuurder",
    "dronken automobilist",
    "dronken bestuurder",
    "motorrijder",
    "minderjarige bestuurder",
    "trucker",
    "taxichauffeur",
    "[[PERSON]] in auto",
    "dronken [[PERSON]] in auto",
)

# Noun phrases for vehicles; main() counts these as a mentioned party but not
# as a human one.
VEHICLE_NPS = (
    "auto",
    "personenauto",
    "vrachtwagen",
    "tractor",
    "auto met caravan",
    "scooter",
    "motor",
    "tram",
    "stadsbus",
    "lijn 10",
    "touringcar",
    "camper",
    "vorkheftruck",
)
|
|
|
|
|
def generate_weak_pty():
    """Return a random "weak party" noun phrase.

    A phrase is drawn from ``WEAK_PTY_NPS``. If it contains a ``[[PERSON]]``
    placeholder, the placeholder is replaced by a person noun drawn from
    ``ANY_PERSON_NPS`` plus ``CYCLIST_PERSON_NPS``.

    Returns:
        The completed noun phrase as a string.
    """
    noun_phrase = random.choice(WEAK_PTY_NPS)
    if "[[PERSON]]" not in noun_phrase:
        return noun_phrase
    # Only draw a person when it is needed, so phrases without a placeholder
    # consume exactly one random draw.
    person = random.choice(ANY_PERSON_NPS + CYCLIST_PERSON_NPS)
    return noun_phrase.replace("[[PERSON]]", person)
|
|
|
|
|
def generate_driver():
    """Return a random driver noun phrase.

    A phrase is drawn from ``DRIVER_NPS``. If it contains a ``[[PERSON]]``
    placeholder, the placeholder is replaced by a person noun drawn from
    ``ANY_PERSON_NPS``.

    Returns:
        The completed noun phrase as a string.
    """
    noun_phrase = random.choice(DRIVER_NPS)
    if "[[PERSON]]" not in noun_phrase:
        return noun_phrase
    # Only draw a person when it is needed, so phrases without a placeholder
    # consume exactly one random draw.
    person = random.choice(ANY_PERSON_NPS)
    return noun_phrase.replace("[[PERSON]]", person)
|
|
|
|
|
def make_sentence(template, fields):
    """Fill a sentence template and normalise the result.

    Args:
        template: String containing ``[[NAME]]`` placeholders.
        fields: Mapping from placeholder name to replacement text. Keys
            without a matching placeholder are ignored.

    Returns:
        The filled-in sentence with every ``_`` removed, runs of whitespace
        collapsed to a single space, surrounding whitespace stripped and the
        first character upper-cased. An empty string is returned when nothing
        is left after normalisation.
    """
    sentence = template
    for field, value in fields.items():
        sentence = sentence.replace(f"[[{field}]]", value)
    # "_" stands for an empty slot. Removing it leaves two or more adjacent
    # spaces behind, so split on whitespace and re-join to collapse every gap
    # (this also strips the ends).
    sentence = " ".join(sentence.replace("_", "").split())
    # Slicing instead of indexing keeps an empty sentence from raising.
    return sentence[:1].upper() + sentence[1:]
|
|
|
|
|
def _count_party(label, *, human):
    """Add one mentioned party to *label*, and one human party if *human*."""
    label["party_mentioned"] += 1
    if human:
        label["party_human"] += 1


def _pick_circumstance():
    """Return ``(text, implies_agent)`` for the [[CIRCUMSTANCE]] slot."""
    if random.random() < 0.5:
        return random.choice(CIRCUMSTANCES), False
    return random.choice(CIRCUMSTANCES_AGT), True


def _pick_party(party_type, label):
    """Draw a noun phrase for *party_type* and count it in *label*.

    "WEAK_PTY" and "DRIVER" are counted as human parties; any other value is
    treated as a vehicle, which is counted as mentioned but not human.
    """
    if party_type == "WEAK_PTY":
        _count_party(label, human=True)
        return generate_weak_pty()
    if party_type == "DRIVER":
        _count_party(label, human=True)
        return generate_driver()
    _count_party(label, human=False)
    return random.choice(VEHICLE_NPS)


def _fill_0_pty(fields, label):
    """Fill *fields*/*label* for a sentence without explicit subject; return its template."""
    fields["CIRCUMSTANCE"], implies_agent = _pick_circumstance()
    if implies_agent:
        # Every 0_PTY template contains [[CIRCUMSTANCE]], so the agent is
        # always visible in the sentence.
        _count_party(label, human=True)

    if random.choice(OUTCOME_0_TYPES) == "TRAFFIC":
        fields["OUTCOME"] = random.choice(OUTCOMES_0_TRAFFIC)
    else:
        fields["OUTCOME"] = random.choice(OUTCOMES_0_HUMAN)
        _count_party(label, human=True)
    return random.choice(SENTS_0_PTY_OUTCOME)


def _fill_1_pty(fields, label):
    """Fill *fields*/*label* for a sentence with one explicit party; return its template."""
    fields["CIRCUMSTANCE"], implies_agent = _pick_circumstance()

    subtype = random.choice(SENT_1_PTY_TYPES)
    if subtype == "VICTIM":
        fields["SUBJECT"] = _pick_party("WEAK_PTY", label)
        fields["VERB_V2"] = random.choice(EVENT_VERBS_1_VICTIM)
        template = random.choice(SENTS_1_PTY_VICTIM)
    elif subtype == "OUTCOME":
        subject_type = random.choice(["WEAK_PTY", "DRIVER"])
        fields["OUTCOME"] = random.choice(OUTCOMES_1)
        fields["SUBJECT"] = _pick_party(subject_type, label)
        template = random.choice(SENTS_1_PTY_OUTCOME)
    else:
        subject_type = random.choice(["DRIVER", "VEHICLE"])
        if random.choice(SENT_ACTIVE_TYPES) == "ACTIVE":
            fields["VP_DRIVE"] = random.choice(VPS_DRIVE_ACTIVE)
            label["active"] = True
        else:
            fields["VP_DRIVE"] = random.choice(VPS_DRIVE_NON_ACTIVE)
        fields["SUBJECT"] = _pick_party(subject_type, label)
        template = random.choice(SENTS_1_PTY_DRIVE)

    # Count the implied agent only when the circumstance actually appears in
    # the sentence. Previously it was counted unconditionally, so sentences
    # built from templates without a [[CIRCUMSTANCE]] slot were labelled with
    # a party that is not in the text.
    if implies_agent and "[[CIRCUMSTANCE]]" in template:
        _count_party(label, human=True)
    return template


def _fill_2_pty(fields, label):
    """Fill *fields*/*label* for a sentence with subject and other party; return its template."""
    if random.choice(SENT_ACTIVE_TYPES) == "ACTIVE":
        subject_type = random.choice(["WEAK_PTY", "DRIVER", "VEHICLE"])
        label["active"] = True

        if subject_type == "WEAK_PTY":
            verb = random.choice(EVENT_VERBS_2_ACTIVE_ANY)
            fields["SUBJECT"] = _pick_party("WEAK_PTY", label)
            other_type = random.choice(["WEAK_PTY", "VEHICLE"])
        elif subject_type == "DRIVER":
            fields["SUBJECT"] = _pick_party("DRIVER", label)
            if random.random() < 0.5:
                verb = random.choice(EVENT_VERBS_2_ACTIVE_ANY)
                other_type = random.choice(["WEAK_PTY", "VEHICLE"])
            else:
                verb = random.choice(EVENT_VERBS_2_ACTIVE_DRIVE)
                other_type = "WEAK_PTY"
        else:
            verb = random.choice(
                EVENT_VERBS_2_ACTIVE_ANY + EVENT_VERBS_2_ACTIVE_DRIVE)
            fields["SUBJECT"] = _pick_party("VEHICLE", label)
            # This branch used to leave other_type unassigned, which raised
            # UnboundLocalError or silently reused the value of an earlier
            # iteration.
            # NOTE(review): the rule below mirrors the DRIVER branch (driving
            # verbs take a weak party as object) -- confirm this is intended.
            if verb in EVENT_VERBS_2_ACTIVE_DRIVE:
                other_type = "WEAK_PTY"
            else:
                other_type = random.choice(["WEAK_PTY", "VEHICLE"])

        fields["OTHER"] = _pick_party(other_type, label)
    else:
        other_type = random.choice(["WEAK_PTY", "DRIVER", "VEHICLE"])
        fields["OTHER"] = _pick_party(other_type, label)

        # Verbs that are specific to the kind of the other party; a weak
        # party has none and always uses the generic list.
        if other_type == "DRIVER":
            specific_verbs = EVENT_VERBS_2_NON_ACTIVE_DRIVER
        elif other_type == "VEHICLE":
            specific_verbs = EVENT_VERBS_2_NON_ACTIVE_VEHICLE
        else:
            specific_verbs = None

        if specific_verbs is None or random.random() < 0.5:
            verb = random.choice(EVENT_VERBS_2_NON_ACTIVE_ANY)
            subject_type = random.choice(["WEAK_PTY", "DRIVER", "VEHICLE"])
        else:
            verb = random.choice(specific_verbs)
            subject_type = "WEAK_PTY"

        fields["SUBJECT"] = _pick_party(subject_type, label)

    # Verbs are stored as "V2|P|REST".
    fields["VERB_V2"], fields["VERB_P"], fields["VERB_REST"] = verb.split("|")
    return random.choice(SENTS_2_PTYS)


def _generate_example():
    """Generate one random ``(sentence, label)`` pair."""
    fields = {
        "TIME": random.choice(TIMES),
        "PLACE": random.choice(PLACES),
    }
    label = {"party_mentioned": 0, "party_human": 0, "active": False}

    sent_type = random.choice(SENT_TYPES)
    if sent_type == "0_PTY":
        template = _fill_0_pty(fields, label)
    elif sent_type == "1_PTY":
        template = _fill_1_pty(fields, label)
    else:
        template = _fill_2_pty(fields, label)
    return make_sentence(template, fields), label


def main():
    """Generate labelled template sentences and write them as JSON lines.

    Sentences are drawn until ``NUM_SENTENCES`` distinct ones have been
    collected or ``NUM_FAILS`` consecutive draws produced only duplicates.
    For a duplicate sentence the label of its first occurrence is kept.

    Each distinct sentence is written as one JSON object with the keys
    ``"sentence"`` and ``"label"`` to
    ``output/crashes/generate_templates/sentences.jsonl``; the directory is
    created when missing. The file ends with one additional empty line.
    """
    sentences = {}

    dup_fails = 0
    while len(sentences) < NUM_SENTENCES and dup_fails < NUM_FAILS:
        sentence, label = _generate_example()
        if sentence in sentences:
            dup_fails += 1
        else:
            sentences[sentence] = label
            dup_fails = 0

    out_path = "output/crashes/generate_templates/sentences.jsonl"
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f_out:
        # The file is opened in text mode, so "\n" is the correct terminator:
        # writing os.linesep would be translated a second time on Windows.
        for sentence, label in sentences.items():
            f_out.write(
                json.dumps({"sentence": sentence, "label": label}) + "\n")
        # NOTE(review): SOURCE lost its indentation, so it is unclear whether
        # this extra newline was inside the loop; a single trailing empty
        # line is assumed here.
        f_out.write("\n")
|
|
|
|
|
# Run the generator only when the file is executed as a script.
if __name__ == "__main__":
    main()
|
|