import re from dataclasses import dataclass from string import punctuation import pandas as pd all_punctuation = punctuation + "‘’·—»" # keep in dollar signs all_punctuation = all_punctuation.replace("$", "") # "regex separator" # captures the following: 1+ spaces OR 1+ non-word characters (ex: "/", "-"), OR 1 word boundary # apply the this variable using an `fr` string in the regex substituion (ex: `fr"\bw{sep}force\b"`) sep = "(?: +|\W+|\b)" @dataclass class RegexRemoval: description: str regex_str: str # usually raw string: r"your string" def __post_init__(self): self.regex = re.compile(self.regex_str, re.IGNORECASE) @dataclass class RegexSubstitution: description: str regex_str: str # usually raw string: r"your string" replacement: str priority: int = 10 # higher values → run later (eg: 1 runs before 20) def __post_init__(self): self.regex = re.compile(self.regex_str, re.IGNORECASE) removals = [ RegexRemoval("OBSCIS", r"(OBSCIS)"), RegexRemoval( "MO Suffix", r"\b\w\s\w\s\w\w?\s\w\s\d{2}(?: |\W)\d{2}(?: |\W)\d{4}", ), RegexRemoval( "Statute Prefix", r"\S{1,2}\s\d\S{0,3}\.\d\S{0,3}\.\d\S{0,3}(?:\.\d?\S{0,3}?)?" ), ] substitutions = [ # LESS THAN / GREATER THAN terms ========= RegexSubstitution("Less Than", fr"\b(?:<|lt)\b", " less than "), RegexSubstitution("Less Than 2", fr"\blt(?=\d+)\b", "less than "), RegexSubstitution("Less Than 3", fr"\<", " less than "), RegexSubstitution("Greater Than", fr"\b(?:>|gt|\>)\b", " greater than "), RegexSubstitution("Greater Than 2", fr"\bgt(?=\d+)\b", "greater than "), RegexSubstitution("Greater Than 3", fr"\>", " greater than "), # WITH terms =========== RegexSubstitution("With Out", fr"\bw{sep}(?:o|out)\b", "without"), RegexSubstitution("With Out 2", fr"\bwo\b", "without"), RegexSubstitution("Within", fr"\bw{sep}(?:i|in)\b", "within", priority=5), RegexSubstitution( "With Intent", fr"\bw{sep}\s?in?t?e?n?t?\b", "with intent", ), RegexSubstitution( "with a", fr"\bw{sep}a\b", "with a", ), RegexSubstitution( "with health", fr"\bw{sep}health\b", "with health", ), RegexSubstitution( "with own", fr"\bw{sep}own\b", "with own", ), RegexSubstitution( "with report", fr"\bw{sep}report\b", "with report", ), RegexSubstitution( "with license", fr"\bw{sep}license\b", "with license", ), RegexSubstitution( "with murder", fr"\bw{sep}murder\b", "with murder", ), RegexSubstitution( "with injury", fr"\bw{sep}(?:injury|inj|injry)\b", "with injury", ), RegexSubstitution( "with turned", fr"\bw{sep}turned\b", "with turned", ), RegexSubstitution( "with altered", fr"\bw{sep}alt\b", "with altered", ), RegexSubstitution( "with deadly", fr"\bw{sep}deadly\b", "with deadly", ), RegexSubstitution( "with dangerous weapon", fr"\b(?:with|w){sep}(?:dangerous|d){sep}(?:weapon|wpn|weapn|weap)\b", "with dangerous weapon", priority=5, ), RegexSubstitution( "with child", fr"\b(?:with|w){sep}(?:child|chi|chld)\b", "with child", ), RegexSubstitution( "with minor", fr"\bw{sep}minor\b", "with minor", ), RegexSubstitution( "with kidnapping", fr"\bw{sep}kidnapping\b", "with kidnapping", ), RegexSubstitution( "with agency", fr"\bw{sep}agency\b", "with agency", ), RegexSubstitution( "with firearm", fr"\bw{sep}firearm\b", "with firearm", ), RegexSubstitution( "with weapon", fr"\bw{sep}(?:weapon|wpn|weapn|weap)\b", "with weapon", ), RegexSubstitution( "with knife", fr"\bw{sep}knife\b", "with knife", ), RegexSubstitution( "with force", fr"\bw{sep}force\b", "with force", ), RegexSubstitution( "with extenuating circumstances", fr"\bw{sep}ext{sep}circumstances\b", "with extenuating circumstances", ), RegexSubstitution( "with prior", fr"\bw{sep}prior\b", "with prior", ), RegexSubstitution( "with previous", fr"\bw{sep}previous\b", "with previous", ), RegexSubstitution( "with domestic violence", fr"\bw{sep}dv\b", "with domestic violence", ), RegexSubstitution( "with suspended", fr"\bw{sep}suspended\b", "with suspended", ), RegexSubstitution( # doublecheck this "vehicle with", fr"\bvehicle{sep}w{sep}", "vehicle with", ), RegexSubstitution( # TODO: is this "possession with" or "possession weapon"? see concealed weapon as example "possession with", fr"\b(?:possession|possess|poss){sep}w{sep}", "possession with", ), RegexSubstitution( "possession with intent", fr"\bp{sep}with{sep}intent", "possession with intent", priority=30, ), RegexSubstitution( "neglect with", fr"\bneglect{sep}w{sep}", "neglect with", ), RegexSubstitution( "cooperate with", fr"\bcooperate{sep}w{sep}", "cooperate with", ), RegexSubstitution( "interfere with", fr"\b(?:inter|interfere){sep}w{sep}", "interfere with", ), RegexSubstitution( # TODO consolidate tamper/tampering? "tamper with", fr"\btamper{sep}w{sep}", "tamper with", ), RegexSubstitution( "tampering with", fr"\btampering{sep}w{sep}", "tampering with", ), RegexSubstitution( "assault with", fr"\bassault{sep}w{sep}", "assault with", ), # FIREARM TERMS RegexSubstitution( "firearm with altered identification numbers", fr"\bfirearm{sep}(?:with|w){sep}alter\b", "firearm with altered identification numbers", ), RegexSubstitution( "firearm", fr"\bf{sep}a\b", "firearm", ), RegexSubstitution( "intimidation", fr"\b(?:intim|intimid)\b", "intimidation", ), # DOMESTIC VIOLENCE TERMS / PROTECTION / RESTRAINING ORDERS RegexSubstitution( "protective order", fr"\b(?:protective|protection|prot){sep}(?:order|ord|or)\b", "protective order", ), RegexSubstitution( "domestic violence protective order", r"\bdvpo\b", "domestic violence protective order", ), RegexSubstitution("domestic", r"\bdom\b", "domestic", priority=20), RegexSubstitution( "domestic violence", r"\bdv\b", "domestic violence", ), RegexSubstitution( "domestic violence 2", fr"\bd{sep}v\b", "domestic violence", ), RegexSubstitution( "witness testimony", fr"\bwit{sep}tes\b", "witness testimony", ), # CONVICTION TERMS == RegexSubstitution( "misdemeanor conviction", fr"\b(?:misdemeanor|misd){sep}(?:convic|conv)\b", "misdemeanor conviction", ), RegexSubstitution( "prior conviction", fr"\b(?:prior|pr|pri){sep}(?:convic|conv)\b", "prior conviction", ), # ==== GENERAL TERMS ===== RegexSubstitution( # NOTE: added a negative lookbehind for 'mentally' so we won't override 'mentally ill' cases "illegal", fr"\b(?