rota-app / cleaning_utils.py
akgodwin's picture
add app files
2be8263
raw
history blame
66.5 kB
import re
from dataclasses import dataclass
from string import punctuation
import pandas as pd
# Punctuation character set: ASCII punctuation plus a few typographic marks
# (curly single quotes, middle dot, em dash, right guillemet).
all_punctuation = punctuation + "‘’·—»"
# Keep dollar signs: "$" is dropped from the punctuation set.
all_punctuation = all_punctuation.replace("$", "")

# "regex separator"
# Non-capturing group that matches one of: 1+ spaces, 1+ non-word characters
# (ex: "/", "-"), or a single word boundary.
# Apply this variable using an `fr` string in the regex substitution
# (ex: `fr"\bw{sep}force\b"`).
# NOTE: this must be a raw string. In a plain string literal "\b" is the
# backspace character (\x08), not the regex word-boundary assertion, and "\W"
# is an invalid string escape.
sep = r"(?: +|\W+|\b)"
@dataclass
class RegexRemoval:
    """A labelled regex pattern, compiled case-insensitively on creation.

    The compiled pattern is stored on ``self.regex``. It is assigned in
    ``__post_init__`` rather than declared as a dataclass field, so it is not
    part of the generated ``__init__``, ``__repr__`` or ``__eq__``.
    """

    description: str  # label for the pattern
    regex_str: str  # pattern source; usually a raw string: r"your string"

    def __post_init__(self):
        # Compile once at construction time so the pattern object is reusable.
        pattern_source = self.regex_str
        self.regex = re.compile(pattern_source, flags=re.IGNORECASE)
@dataclass
class RegexSubstitution:
    """A labelled regex pattern paired with its replacement text.

    The pattern is compiled case-insensitively on creation and stored on
    ``self.regex``. It is assigned in ``__post_init__`` rather than declared
    as a dataclass field, so it is not part of the generated ``__init__``,
    ``__repr__`` or ``__eq__``.
    """

    description: str  # label for the substitution
    regex_str: str  # pattern source; usually a raw string: r"your string"
    replacement: str  # text associated with the pattern as its replacement
    # Ordering hint: higher values run later (e.g. 1 runs before 20).
    # The ordering itself is applied outside this class.
    priority: int = 10

    def __post_init__(self):
        # Compile once at construction time so the pattern object is reusable.
        pattern_source = self.regex_str
        self.regex = re.compile(pattern_source, flags=re.IGNORECASE)
# Patterns wrapped as RegexRemoval entries (each compiled case-insensitively).
# Presumably the caller strips their matches from the text — confirm at the
# call site, which is not visible here.
removals = [
    # The literal token "OBSCIS" (captured in a group).
    RegexRemoval(description="OBSCIS", regex_str=r"(OBSCIS)"),
    # Whitespace-separated one/two-character word tokens followed by a
    # `dd<sep>dd<sep>dddd` digit group, where <sep> is a space or one
    # non-word character.
    RegexRemoval(
        description="MO Suffix",
        regex_str=r"\b\w\s\w\s\w\w?\s\w\s\d{2}(?: |\W)\d{2}(?: |\W)\d{4}",
    ),
    # A 1-2 character token, whitespace, then a dotted numeric reference with
    # three dot-separated parts and an optional fourth.
    RegexRemoval(
        description="Statute Prefix",
        regex_str=r"\S{1,2}\s\d\S{0,3}\.\d\S{0,3}\.\d\S{0,3}(?:\.\d?\S{0,3}?)?",
    ),
]
substitutions = [
# LESS THAN / GREATER THAN terms =========
RegexSubstitution("Less Than", fr"\b(?:<|lt)\b", " less than "),
RegexSubstitution("Less Than 2", fr"\blt(?=\d+)\b", "less than "),
RegexSubstitution("Less Than 3", fr"\<", " less than "),
RegexSubstitution("Greater Than", fr"\b(?:&GT;|gt|\>)\b", " greater than "),
RegexSubstitution("Greater Than 2", fr"\bgt(?=\d+)\b", "greater than "),
RegexSubstitution("Greater Than 3", fr"\>", " greater than "),
# WITH terms ===========
RegexSubstitution("With Out", fr"\bw{sep}(?:o|out)\b", "without"),
RegexSubstitution("With Out 2", fr"\bwo\b", "without"),
RegexSubstitution("Within", fr"\bw{sep}(?:i|in)\b", "within", priority=5),
RegexSubstitution(
"With Intent",
fr"\bw{sep}\s?in?t?e?n?t?\b",
"with intent",
),
RegexSubstitution(
"with a",
fr"\bw{sep}a\b",
"with a",
),
RegexSubstitution(
"with health",
fr"\bw{sep}health\b",
"with health",
),
RegexSubstitution(
"with own",
fr"\bw{sep}own\b",
"with own",
),
RegexSubstitution(
"with report",
fr"\bw{sep}report\b",
"with report",
),
RegexSubstitution(
"with license",
fr"\bw{sep}license\b",
"with license",
),
RegexSubstitution(
"with murder",
fr"\bw{sep}murder\b",
"with murder",
),
RegexSubstitution(
"with injury",
fr"\bw{sep}(?:injury|inj|injry)\b",
"with injury",
),
RegexSubstitution(
"with turned",
fr"\bw{sep}turned\b",
"with turned",
),
RegexSubstitution(
"with altered",
fr"\bw{sep}alt\b",
"with altered",
),
RegexSubstitution(
"with deadly",
fr"\bw{sep}deadly\b",
"with deadly",
),
RegexSubstitution(
"with dangerous weapon",
fr"\b(?:with|w){sep}(?:dangerous|d){sep}(?:weapon|wpn|weapn|weap)\b",
"with dangerous weapon",
priority=5,
),
RegexSubstitution(
"with child",
fr"\b(?:with|w){sep}(?:child|chi|chld)\b",
"with child",
),
RegexSubstitution(
"with minor",
fr"\bw{sep}minor\b",
"with minor",
),
RegexSubstitution(
"with kidnapping",
fr"\bw{sep}kidnapping\b",
"with kidnapping",
),
RegexSubstitution(
"with agency",
fr"\bw{sep}agency\b",
"with agency",
),
RegexSubstitution(
"with firearm",
fr"\bw{sep}firearm\b",
"with firearm",
),
RegexSubstitution(
"with weapon",
fr"\bw{sep}(?:weapon|wpn|weapn|weap)\b",
"with weapon",
),
RegexSubstitution(
"with knife",
fr"\bw{sep}knife\b",
"with knife",
),
RegexSubstitution(
"with force",
fr"\bw{sep}force\b",
"with force",
),
RegexSubstitution(
"with extenuating circumstances",
fr"\bw{sep}ext{sep}circumstances\b",
"with extenuating circumstances",
),
RegexSubstitution(
"with prior",
fr"\bw{sep}prior\b",
"with prior",
),
RegexSubstitution(
"with previous",
fr"\bw{sep}previous\b",
"with previous",
),
RegexSubstitution(
"with domestic violence",
fr"\bw{sep}dv\b",
"with domestic violence",
),
RegexSubstitution(
"with suspended",
fr"\bw{sep}suspended\b",
"with suspended",
),
RegexSubstitution( # doublecheck this
"vehicle with",
fr"\bvehicle{sep}w{sep}",
"vehicle with",
),
RegexSubstitution( # TODO: is this "possession with" or "possession weapon"? see concealed weapon as example
"possession with",
fr"\b(?:possession|possess|poss){sep}w{sep}",
"possession with",
),
RegexSubstitution(
"possession with intent",
fr"\bp{sep}with{sep}intent",
"possession with intent",
priority=30,
),
RegexSubstitution(
"neglect with",
fr"\bneglect{sep}w{sep}",
"neglect with",
),
RegexSubstitution(
"cooperate with",
fr"\bcooperate{sep}w{sep}",
"cooperate with",
),
RegexSubstitution(
"interfere with",
fr"\b(?:inter|interfere){sep}w{sep}",
"interfere with",
),
RegexSubstitution( # TODO consolidate tamper/tampering?
"tamper with",
fr"\btamper{sep}w{sep}",
"tamper with",
),
RegexSubstitution(
"tampering with",
fr"\btampering{sep}w{sep}",
"tampering with",
),
RegexSubstitution(
"assault with",
fr"\bassault{sep}w{sep}",
"assault with",
),
# FIREARM TERMS
RegexSubstitution(
"firearm with altered identification numbers",
fr"\bfirearm{sep}(?:with|w){sep}alter\b",
"firearm with altered identification numbers",
),
RegexSubstitution(
"firearm",
fr"\bf{sep}a\b",
"firearm",
),
RegexSubstitution(
"intimidation",
fr"\b(?:intim|intimid)\b",
"intimidation",
),
# DOMESTIC VIOLENCE TERMS / PROTECTION / RESTRAINING ORDERS
RegexSubstitution(
"protective order",
fr"\b(?:protective|protection|prot){sep}(?:order|ord|or)\b",
"protective order",
),
RegexSubstitution(
"domestic violence protective order",
r"\bdvpo\b",
"domestic violence protective order",
),
RegexSubstitution("domestic", r"\bdom\b", "domestic", priority=20),
RegexSubstitution(
"domestic violence",
r"\bdv\b",
"domestic violence",
),
RegexSubstitution(
"domestic violence 2",
fr"\bd{sep}v\b",
"domestic violence",
),
RegexSubstitution(
"witness testimony",
fr"\bwit{sep}tes\b",
"witness testimony",
),
# CONVICTION TERMS ==
RegexSubstitution(
"misdemeanor conviction",
fr"\b(?:misdemeanor|misd){sep}(?:convic|conv)\b",
"misdemeanor conviction",
),
RegexSubstitution(
"prior conviction",
fr"\b(?:prior|pr|pri){sep}(?:convic|conv)\b",
"prior conviction",
),
# ==== GENERAL TERMS =====
RegexSubstitution( # NOTE: added a negative lookbehind for 'mentally' so we won't override 'mentally ill' cases
"illegal",
fr"\b(?<!mentally )(?:ill|illeg|illgl)\b",
"illegal",
),
RegexSubstitution("commercial fish", fr"\bcomm{sep}fish\b", "commercial fish"),
RegexSubstitution("vessel", fr"\bvess\b", "vessel"),
RegexSubstitution(
"traffic control device",
fr"\btraff{sep}control{sep}dev\b",
"traffic control device",
),
RegexSubstitution("non-culpable", fr"\bnonculp\b", "non-culpable"),
RegexSubstitution("prohibited", fr"\bprohib\b", "prohibited"),
RegexSubstitution("nuisance", fr"\bnuis\b", "nuisance"),
RegexSubstitution("obstruction", fr"\bobstr\b", "obstruction"),
RegexSubstitution("pedestrian", fr"\bped\b", "pedestrian"),
RegexSubstitution("conduct", fr"\bcond\b", "conduct", priority=20),
RegexSubstitution(
"subsequent",
fr"\bsubsq\b",
"subsequent",
),
RegexSubstitution(
"disturbing the peace",
fr"\bdist{sep}peace\b",
"disturbing the peace",
),
RegexSubstitution(
"offender accountability act",
fr"\boaa\b",
"offender accountability act",
),
RegexSubstitution(
"against",
fr"\b(?:agnst|agin)\b",
"against",
),
RegexSubstitution(
"child",
fr"\b(?:chil|chld)\b",
"child",
),
RegexSubstitution(
"school",
fr"\bschl\b",
"school",
),
RegexSubstitution(
"multiple",
fr"\bmult\b",
"multiple",
),
RegexSubstitution(
"assailant",
fr"\bassail\b",
"assailant",
),
RegexSubstitution(
"public disturbance",
fr"\b(?:public|pub|publ){sep}(?:disturbance|disturb|dist)\b",
"public disturbance",
),
RegexSubstitution(
"interfere",
fr"\b(?:interf|interfer)\b",
"interfere",
),
RegexSubstitution( # TODO should we leave obstructing/obstruction separate terms or lump into obstruct?
"obstructing",
fr"\bob\b",
"obstructing",
),
RegexSubstitution(
"law enforcement officer",
fr"\bleo\b",
"law enforcement officer",
),
RegexSubstitution(
"officer",
fr"\b(?:offcr|ofcr)\b",
"officer",
),
RegexSubstitution(
"minor",
fr"\b(?:min|minr|mnr)\b",
"minor",
),
RegexSubstitution(
"distance within 300 feet of park",
fr"\bdist{sep}300{sep}park\b",
"distance within 300 feet of park",
priority=5,
),
RegexSubstitution(
"distance within 300",
fr"{sep}dist{sep}w{sep}i{sep}300\b",
"distance within 300",
priority=5,
),
RegexSubstitution(
"major",
fr"\bmajr\b",
"major",
),
RegexSubstitution(
"willful",
fr"\b(?:wilfl|wlfl)\b",
"willful",
),
RegexSubstitution(
"issue worthless checks",
fr"\b(?:issue|iss){sep}(?:worthless|wrthlss|wrtls){sep}(?:checks|cks)\b",
"worthless",
),
RegexSubstitution(
"issue multiple worthless checks",
fr"\b(?:issue|iss){sep}(?:multiple|mltpl){sep}(?:worthless|wrthlss|wrtls){sep}(?:checks|cks)\b",
"worthless",
),
RegexSubstitution(
"unauthorized",
fr"\b(?:unauth|unau|unauthd)\b",
"unauthorized",
),
RegexSubstitution(
"child support",
fr"\b(?:child|chld|chi){sep}(?:support|supp|sup)\b",
"child support",
),
RegexSubstitution(
"unlawful",
r"\b(?:unlawfully|unlaw|unlawfl|unlawf|unlwfl|unl)\b",
"unlawful",
),
RegexSubstitution(
"Possession",
r"\b(?:possess|poss?)\b",
"possession",
),
RegexSubstitution(
"Abetting",
r"\b(?:abett|abetted)\b",
"Abetting",
),
RegexSubstitution("emergency", r"\b(?:emerg|emer)\b", "emergency", priority=20),
RegexSubstitution(
"Attempted",
r"\b(?:att|atmpt)\b",
"attempted",
),
RegexSubstitution( # NOTE: added negative look ahead so we don't remap "at risk" to "attempted risk"
"Attempted 2",
r"\bat(?! risk)\b",
"attempted",
),
RegexSubstitution(
"Battery",
r"\bbatt\b",
"battery",
),
RegexSubstitution(
"Violation of Probation",
r"\bvop\b",
"violation of probation",
),
RegexSubstitution( # NOTE: removed 'con' because shows up in some DV-related text, may not be a one-size fits all regex / 'consp' to conspiracy or conspire?
"Conspiracy",
r"\b(?:consp|conspi|conspira|conspirc|consprc|consprcy|cnsprcy|conspr)\b",
"conspiracy",
),
RegexSubstitution(
"Property",
r"\bprop\b",
"property",
),
RegexSubstitution(
"public disturbance",
fr"\b(?:public|pub|publ){sep}(?:disturbance|dist)\b",
"public disturbance",
),
RegexSubstitution(
"Criminal",
r"\bcrim\b",
"criminal",
),
RegexSubstitution(
"License",
r"\blic\b",
"license",
),
RegexSubstitution(
"Credit Card",
r"\bcc\b",
"credit card",
),
RegexSubstitution(
"Credit Card 2",
r"\bcred{sep}crd\b",
"credit card",
),
RegexSubstitution(
"exchange",
r"\bexch\b",
"exchange",
),
RegexSubstitution(
"electric power",
fr"\belec{sep}pwr\b",
"electric power",
),
RegexSubstitution(
"commit false", fr"\bcom?{sep}false\b", "commit false", priority=5
),
# VEHICLE terms ===========
RegexSubstitution(
"Vehicle",
r"\b(?:veh|vehi)\b",
"vehicle",
),
RegexSubstitution(
"Vehicles",
r"\bvehs\b",
"vehicles",
),
RegexSubstitution(
"commercial motor vehicle",
r"\bcmv\b",
"commercial motor vehicle",
),
RegexSubstitution(
"motor vehicle",
fr"\b(?:mtr|mot){sep}(?:vehicle|veh)\b",
"motor vehicle",
),
RegexSubstitution(
"motor vehicle 2",
fr"\bm{sep}v\b",
"motor vehicle",
),
RegexSubstitution(
"motor vehicle 3",
fr"\b(?:mtv|mv)\b",
"motor vehicle",
),
RegexSubstitution("odometer", fr"\bodom\b", "odometer"),
RegexSubstitution(
"red light",
fr"\bred{sep}light\b",
"red light",
),
RegexSubstitution(
"vehicle sound system",
fr"\bveh{sep}snd{sep}sys\b",
"vehicle sound system",
priority=20,
),
# =====
RegexSubstitution(
"Assault",
r"\bass?lt\b",
"assault",
),
RegexSubstitution(
"Assault 2",
r"\bass\b",
"assault",
),
RegexSubstitution(
"Mentally",
r"\bment\b",
"mentally",
),
RegexSubstitution(
"mentally ill",
r"\bmnt{sep}ill\b",
"mentally ill",
),
RegexSubstitution(
"Unknown",
r"\bunk\b",
"unknown",
),
RegexSubstitution(
"cohabitation",
r"\b(?:coh|cohbt)\b",
"cohabitation",
),
RegexSubstitution(
"Statement",
r"\bstmt\b",
"statement",
),
RegexSubstitution(
"Degree",
r"\bdegr?e?\b",
"degree",
),
RegexSubstitution(
"Felony",
r"\b(?:fe|fel|felo|felny|fl|flny)\b",
"felony",
),
RegexSubstitution(
"misdemeanor",
r"\bmisd\b",
"misdemeanor",
),
# AGE
RegexSubstitution(
"years of age",
r"\byoa\b",
"years of age",
),
RegexSubstitution(
"year",
r"\byr\b",
"year",
),
RegexSubstitution(
"year 2",
r"(?!\d+)yr\b",
" year",
),
RegexSubstitution(
"elderly",
r"\beldrly\b",
"elderly",
),
RegexSubstitution(
"under",
r"\b(?:und|undr)\b",
"under",
),
# AGE / FEMALE
RegexSubstitution(
"female",
fr"\bfem\b",
"female",
),
RegexSubstitution(
"age female",
fr"\bage{sep}f\b",
"age female",
),
RegexSubstitution(
"old female",
fr"\bold{sep}f\b",
"old female",
),
RegexSubstitution(
"older female",
fr"\bolder{sep}f\b",
"older female",
),
RegexSubstitution(
"13 female",
fr"\b13{sep}f\b",
"13 female",
),
RegexSubstitution(
"15 female",
fr"\b15{sep}f\b",
"15 female",
),
RegexSubstitution(
"17 female",
fr"\b17{sep}f\b",
"17 female",
),
# AGE / MALE
RegexSubstitution(
"age male",
fr"\bage{sep}m\b",
"age male",
),
RegexSubstitution(
"old male",
fr"\bold{sep}m\b",
"old male",
),
RegexSubstitution(
"older male",
fr"\bolder{sep}m\b",
"older male",
),
RegexSubstitution(
"13 male",
fr"\b13{sep}m\b",
"13 male",
),
RegexSubstitution(
"15 male",
fr"\b15{sep}m\b",
"15 male",
),
RegexSubstitution(
"17 male",
fr"\b17{sep}m\b",
"17 male",
),
# ======
RegexSubstitution(
"Robbery",
r"\brobb\b",
"robbery",
),
RegexSubstitution(
"Attempted Robbery",
fr"\battempted{sep}(?:rob|robb)\b",
"attempted robbery",
),
RegexSubstitution(
"Detainer Robbery",
fr"\bdetainer{sep}(?:rob|robb)\b",
"detainer robbery",
),
RegexSubstitution(
"Aggravated",
r"\b(?:agg|aggrav|aggr|aggravted)\b",
"aggravated",
),
RegexSubstitution(
"Forced",
r"\bfrc\b",
"forced",
),
RegexSubstitution(
"Danger",
r"\bdng\b",
"danger",
),
RegexSubstitution(
"Abetting",
r"\babet\b",
"abetting",
),
RegexSubstitution(
"Acquaintance",
r"\b(?:acquant|acq|acquaint|acquain)\b",
"acquaintance",
),
RegexSubstitution(
"Breaking and Entering",
r"\bB ?& ?E\b",
"breaking and entering",
),
RegexSubstitution("Building", r"\bbldg\b", "building"),
RegexSubstitution(
"Adult",
r"\badlt\b",
"adult",
),
RegexSubstitution(
"Deliver",
r"\bdel\b",
"deliver",
),
RegexSubstitution(
"Family",
r"\bfam\b",
"family",
),
RegexSubstitution(
"Burglary",
r"\bburg\b",
"burglary",
),
RegexSubstitution(
"Murder",
r"\bmur\b",
"murder",
),
RegexSubstitution(
"conspiracy to commit",
fr"\bconsp{sep}comm\b",
"conspiracy to commit",
),
RegexSubstitution(
"Representation",
r"\brep\b",
"representation",
),
RegexSubstitution(
"Previous",
r"\bprev\b",
"previous",
),
RegexSubstitution( # TODO revisit this - 'com' can also be 'commit'
"Common",
r"\bcom\b",
"common",
),
RegexSubstitution(
"of a",
r"\bofa\b",
"of a",
),
RegexSubstitution( # TODO revisit this - 'viol' relates to 'violation' too
"violent",
r"\bviol\b",
"violent",
),
RegexSubstitution(
"perform",
r"\bperf\b",
"perform",
),
RegexSubstitution(
"household",
r"\b(?:hh|hsehld|hhld)\b",
"household",
),
RegexSubstitution(
"Other",
r"\both\b",
"other",
),
# WEAPON TERMS =========
RegexSubstitution(
"Weapon", r"\b(?:wea|wpn|weapn|weap|weapo)\b", "weapon", priority=20
),
RegexSubstitution(
"Weapons", r"\b(?:wea|wpn|weapn|weap|weapo)s\b", "weapons", priority=20
),
RegexSubstitution("dangerous weapon", r"\b(?:dwpn|dw)\b", "dangerous weapon"),
RegexSubstitution(
"dangerous weapon 2", fr"\bd{sep}(?:w|wpn)\b", "dangerous weapon"
),
RegexSubstitution(
"concealed weapon", fr"\bconcealed{sep}(?:w|wpn)\b", "concealed weapon"
),
# HARM terms =======
RegexSubstitution(
"Bodily Harm",
fr"\b(?:bod{sep}ha?rm|bh)\b",
"bodily harm",
),
RegexSubstitution(
"physical",
fr"\bphy\b",
"physical",
),
RegexSubstitution(
"harmful",
fr"\bharmfl\b",
"harmful",
),
RegexSubstitution(
"Great Bodily",
fr"\b(?:gr|grt){sep}bodily\b",
"great bodily",
),
RegexSubstitution(
"Great Bodily Injury",
fr"\bgbi\b",
"great bodily injury",
),
RegexSubstitution(
"Substantial Bodily Harm",
r"\bsbh\b",
"substantial bodily harm",
),
RegexSubstitution(
"injury",
r"\b(?:injry|inj)\b",
"injury",
),
RegexSubstitution(
"inflict",
r"\binflt\b",
"inflict",
),
RegexSubstitution(
"Great Bodily Harm",
fr"\bgr{sep}bod{sep}harm\b",
"great bodily harm",
),
RegexSubstitution(
"Great Bodily Harm 2",
fr"\bgbh\b",
"great bodily harm",
),
# ====
RegexSubstitution( # TODO: revisit PERS can be person too
"Personal",
r"\bpers\b",
"personal",
),
RegexSubstitution(
"persons",
r"\bprsns\b",
"persons",
),
RegexSubstitution(
"person",
r"\b(?:prsn|per|perso)\b",
"person",
),
RegexSubstitution("election day", fr"\belec{sep}day\b", "election day"),
RegexSubstitution(
"temporary",
r"\btemp\b",
"temporary",
),
RegexSubstitution(
"improper",
r"\bimprop\b",
"improper",
),
RegexSubstitution(
"false",
r"\bfls\b",
"false",
),
RegexSubstitution(
"responsibility",
r"\bresp\b",
"responsibility",
),
RegexSubstitution(
"advertise",
r"\bad\b",
"advertise",
),
RegexSubstitution(
"imprisonment",
r"\b(?:imprison|impris|imprsn)\b",
"imprisonment",
),
RegexSubstitution(
"prohibited",
r"\bproh\b",
"prohibited",
),
RegexSubstitution(
"under influence",
fr"\bunder{sep}(?:infl|influ)\b",
"under influence",
priority=5,
),
RegexSubstitution(
"stolen",
r"\bstln\b",
"stolen",
),
RegexSubstitution(
"years",
r"\byrs\b",
"years",
),
RegexSubstitution(
"intent",
r"\bint\b",
"intent",
),
RegexSubstitution(
"passage",
r"\bpassg\b",
"passage",
),
RegexSubstitution(
"withdraw",
r"\bwit\b",
"withdraw",
),
RegexSubstitution(
"manufacturing or delivering",
r"\bman\Wdel\b",
"manufacturing delivering",
),
RegexSubstitution( # Revisit this
"minimum mandatory",
r"\bmin\Wman\b",
"minimum mandatory",
),
RegexSubstitution(
"stranger",
r"\bstr(?:ngr)?\b",
"stranger",
),
RegexSubstitution(
"personal use",
r"\bpers use\b",
"personal use",
),
RegexSubstitution(
"force",
r"\bfo?rc\b",
"force",
),
RegexSubstitution(
"operate",
r"\b(?:oper|op|opr)\b",
"operate",
),
RegexSubstitution(
"occupied",
r"\bocc\b",
"occupied",
),
RegexSubstitution(
"health care facility",
r"\bhealth{sep}care{sep}fac\b",
"health care facility",
priority=5,
),
RegexSubstitution(
"residence",
r"\bres\b",
"residence",
),
RegexSubstitution(
"terrorism threats",
fr"\bterr{sep}(?:thre|thrts)\b",
"terrorism threats",
),
RegexSubstitution(
"false report",
fr"\bfals{sep}rprt\b",
"false report",
),
RegexSubstitution(
"government",
r"\bgovt\b",
"government",
),
RegexSubstitution(
"advocating",
r"\badvoc\b",
"advocating",
),
RegexSubstitution(
"government property",
r"\bgov{sep}property\b",
"government property",
),
RegexSubstitution(
"general assembly",
r"\bgen{sep}assembly\b",
"general assembly",
),
RegexSubstitution( # NOTE: added negative lookahead because was seeing "by off" when updating statutory rape terms & "by offense" is not correct
"offense",
fr"\b(?<!by )(?:offense|offen|off|offe)\b",
"offense",
),
RegexSubstitution(
"information",
fr"\b(?:info|infor)\b",
"information",
),
# LEWD charge cat
RegexSubstitution(
"pornography",
fr"\b(?:porn|porno)\b",
"pornography",
),
RegexSubstitution(
"compelling",
fr"\bcompel\b",
"compelling",
),
RegexSubstitution(
"prostitution",
fr"\bprostit\b",
"prostitution",
),
RegexSubstitution(
"computer",
fr"\bcomputr\b",
"computer",
),
RegexSubstitution(
"incapable",
fr"\bincap\b",
"incapable",
),
RegexSubstitution(
"juvenile",
fr"\b(?:juv|juven)\b",
"juvenile",
),
RegexSubstitution(
"involving",
fr"\b(?:involv|invlv)\b",
"involving",
),
RegexSubstitution(
"equipment",
fr"\bequip\b",
"equipment",
),
RegexSubstitution(
"hazardous",
fr"\bhaz\b",
"hazardous",
),
RegexSubstitution( # NOTE: assault and battery unless A,B is followed by C
"assault and battery",
fr"\b(?:a\&b|a{sep}b|a \& b|ab)(?!c)\b",
"assault and battery",
),
RegexSubstitution( # NOTE: assault and battery unless A,B is followed by C
"assault and battery 2",
fr"\b(?:a\&b|a{sep}b|a \& b|ab)(?!\Wc)\b",
"assault and battery",
),
RegexSubstitution( # NOTE: assault and battery unless A,B is followed by C
"assault and battery 2",
fr"\b(?:a\&b|a{sep}b|a \& b|ab)(?! c)\b",
"assault and battery",
),
RegexSubstitution(
"promote distribution",
fr"\bpromote{sep}distrb\b",
"promote distribution",
),
RegexSubstitution(
"child molestation first degree",
fr"\b(?:child|chld|ch){sep}(?:molestation|molest|mol){sep}1\b",
"child molestation first degree",
),
RegexSubstitution(
"child molestation second degree",
fr"\b(?:child|chld|ch){sep}(?:molestation|molest|mol){sep}2\b",
"child molestation second degree",
),
RegexSubstitution(
"child molestation third degree",
fr"\b(?:child|chld|ch){sep}(?:molestation|molest|mol){sep}3\b",
"child molestation third degree",
),
RegexSubstitution(
"child molestation",
fr"\b(?:child|chld|ch){sep}(?:molestation|molest|mol)\b",
"child molestation",
priority=5,
),
RegexSubstitution(
"molestation",
fr"\b(?:molestation|molest|mol)\b",
"molestation",
),
RegexSubstitution(
"indecent conduct exposure",
fr"\bind{sep}cond{sep}expos\b",
"indecent conduct exposure",
),
RegexSubstitution(
"indecent",
fr"\bindec\b",
"indecent",
),
RegexSubstitution(
"indecent liberties",
fr"\bind{sep}lib\b",
"indecent liberties",
),
RegexSubstitution(
"moving",
fr"\bmov\b",
"moving",
),
RegexSubstitution(
"depiction",
fr"\bdptn\b",
"depiction",
),
RegexSubstitution(
"child luring",
fr"\bchil{sep}lrng\b",
"child luring",
),
RegexSubstitution(
"dissemination",
fr"\b(?:dissm|dissem)\b",
"dissemination",
),
RegexSubstitution(
"possession of depictions of minor engaged in sexually explicit conduct",
fr"\bposs{sep}(?:depict|dep){sep}(?:minor|min){sep}eng{sep}sex{sep}(?:exp|expct){sep}conduct\b",
"possession of depictions of minor engaged in sexually explicit conduct",
priority=3,
),
RegexSubstitution(
"dealing of depictions of minor engaged in sexually explicit conduct",
fr"\bdeal{sep}(?:depict|dep){sep}(?:minor|min){sep}eng{sep}sex{sep}(?:exp|expct){sep}conduct\b",
"dealing of depictions of minor engaged in sexually explicit conduct",
priority=3,
),
RegexSubstitution(
"viewing of depictions of minor engaged in sexually explicit conduct",
fr"\bview{sep}(?:depict|dep){sep}(?:minor|min){sep}eng{sep}sex{sep}(?:exp|expct){sep}conduct\b",
"viewing of depictions of minor engaged in sexually explicit conduct",
priority=3,
),
RegexSubstitution(
"online sexual corruption of a child",
fr"\bonline{sep}sex{sep}corrupt{sep}child\b",
"online sexual corruption of a child",
),
RegexSubstitution(
"lewd or lascivious act",
fr"\b(?:L\&L|L{sep}L)\b",
"lewd or lascivious act",
),
RegexSubstitution(
"exposure",
r"\bexpos\b",
"exposure",
),
# SEXUAL OFFENSES =====
RegexSubstitution(
"Criminal Sexual Conduct",
r"\bcsc\b",
"criminal sexual conduct",
),
RegexSubstitution(
"sexual",
r"\bsexl\b",
"sexual",
),
RegexSubstitution(
"explicit",
r"\bexplct\b",
"explicit",
),
RegexSubstitution(
"sexual offense",
fr"\b(?:sexual|sex){sep}(?:offense|offen|off)\b",
"sexual offense",
),
RegexSubstitution(
"sexual offenses",
fr"\b(?:sexual|sex){sep}(?:offense|offen|off)s\b",
"sexual offenses",
),
RegexSubstitution(
"sexual assault",
fr"\b(?:sexual|sex){sep}(?:assault|assult|assualt|ass|asst)\b",
"sexual assault",
),
RegexSubstitution(
"sexual contact",
fr"\b(?:sexual|sex){sep}(?:contact)\b",
"sexual contact",
),
RegexSubstitution(
"sexual act",
fr"\b(?:sexual|sex){sep}(?:act|acts)\b",
"sexual act",
),
RegexSubstitution(
"sexual act 2",
fr"\bsxact\b",
"sexual act",
),
RegexSubstitution(
"sexual abuse",
fr"\b(?:sexual|sex){sep}(?:abuse|ab)\b",
"sexual abuse",
),
RegexSubstitution(
"commit sex abuse",
fr"\bcomm{sep}sex{sep}abuse\b",
"commit sex abuse",
),
RegexSubstitution(
"commit sex act",
fr"\bcomm{sep}sex{sep}act\b",
"commit sex act",
),
RegexSubstitution(
"commit sex abuse minor",
fr"\bcommsexabuseminor\b",
"commit sex abuse minor",
priority=20,
),
RegexSubstitution(
"sexual battery",
fr"\b(?:sexual|sex){sep}(?:battery|batt|bat)\b",
"sexual battery",
),
RegexSubstitution( # TODO: should these actually map to "sexual misconduct"?
"sexual conduct",
fr"\b(?:sexual|sex){sep}(?:conduct|cndct|cond|con)\b",
"sexual conduct",
),
RegexSubstitution(
"sexual penetration",
fr"\b(?:sexual|sex){sep}(?:penetration|pen)\b",
"sexual penetration",
),
RegexSubstitution( # TODO: Revisit - hard to tell if exp/expl maps to "exploitation" or "explicit"
"sexual exploitation",
fr"\b(?:sexual|sex){sep}(?:exploitation|exploit)\b",
"sexual exploitation",
),
RegexSubstitution(
"sexual performance",
fr"\b(?:sexual|sex){sep}(?:performance|perform)\b",
"sexual performance",
),
RegexSubstitution(
"sexual imposition",
fr"\b(?:sexual|sex){sep}(?:imposition|imp)\b",
"sexual imposition",
),
RegexSubstitution(
"sex with",
fr"\bsex{sep}w\b",
"sex with",
),
RegexSubstitution( # TODO: Revisit - hard to tell if offen/off maps to "offender" or "offense"
"sex offender",
fr"\b(?:sexual|sex){sep}(?:offender|offend|offndr|ofndr)\b",
"sex offender",
),
RegexSubstitution(
"sexual predator",
fr"\b(?:sexual|sex){sep}(?:predator|pred)\b",
"sexual predator",
),
RegexSubstitution(
"voluntary sexual relations",
fr"\bvol{sep}sex{sep}rel\b",
"voluntary sexual relations",
),
RegexSubstitution(
"sex related",
fr"\bsex{sep}(?:reltd|rel)\b",
"sex related",
),
RegexSubstitution(
"sex related 2",
fr"\bsexreltd\b",
"sex related",
),
RegexSubstitution(
"statutory rape",
fr"\bstat{sep}rape\b",
"statutory rape",
),
RegexSubstitution(
"rape first degree",
fr"\brape{sep}(?:1|1st|i)\b",
"rape first degree",
),
RegexSubstitution(
"rape second degree",
fr"\brape{sep}(?:2|2nd|ii)\b",
"rape second degree",
),
RegexSubstitution(
"rape third degree",
fr"\brape{sep}(?:3|3rd|iii)\b",
"rape third degree",
),
RegexSubstitution(
"sodomy first degree",
fr"\bsodomy{sep}(?:1|1st|i)\b",
"sodomy first degree",
),
RegexSubstitution(
"sodomy second degree",
fr"\bsodomy{sep}(?:2|2nd|ii)\b",
"sodomy second degree",
),
RegexSubstitution(
"sodomy third degree",
fr"\bsodomy{sep}(?:3|3rd|iii)\b",
"sodomy third degree",
),
RegexSubstitution(
"incest first degree",
fr"\bincest{sep}(?:1|1st|i)\b",
"incest first degree",
),
RegexSubstitution(
"incest second degree",
fr"\bincest{sep}(?:2|2nd|ii)\b",
"incest second degree",
),
RegexSubstitution(
"sex first degree",
fr"\bsex{sep}(?:1|1st|i)\b",
"sex first degree",
),
RegexSubstitution(
"sex second degree",
fr"\bsex{sep}(?:2|2nd|ii)\b",
"sex second degree",
),
RegexSubstitution(
"criminal sexual conduct first degree",
fr"\bcsc{sep}(?:1|1st|i)\b",
"criminal sexual conduct first degree",
priority=5,
),
RegexSubstitution(
"criminal sexual conduct second degree",
fr"\bcsc{sep}(?:2|2nd|ii)\b",
"criminal sexual conduct second degree",
priority=5,
),
RegexSubstitution(
"criminal sexual conduct third degree",
fr"\bcsc{sep}(?:3|3rd|ii)\b",
"criminal sexual conduct third degree",
priority=5,
),
RegexSubstitution(
"criminal sexual conduct fourth degree",
fr"\bcsc{sep}(?:4|4th|iv)\b",
"criminal sexual conduct fourth degree",
priority=5,
),
RegexSubstitution(
"sodomy",
r"\bsod\b",
"sodomy",
),
RegexSubstitution(
"engage sexual act",
fr"\benga{sep}sex{sep}act\b",
"engage sexual act",
),
RegexSubstitution(
"engage sexual act 2",
fr"\beng{sep}sex\b",
"engage sexual act",
),
RegexSubstitution("no force", fr"\bno{sep}frc\b", "no force", priority=5),
RegexSubstitution(
"force or coercion",
fr"\bfrc{sep}or{sep}coercn\b",
"force or coercion",
priority=5,
),
RegexSubstitution(
"coercion",
fr"\b(?:coer|coercn)\b",
"coercion",
),
RegexSubstitution(
"position of authority",
fr"\bpos{sep}auth\b",
"position of authority",
priority=4,
),
RegexSubstitution(
"position of authority 2",
fr"\bpos{sep}of{sep}auth\b",
"position of authority",
priority=4,
),
RegexSubstitution(
"person in authority",
fr"\bper{sep}aut\b",
"person in authority",
priority=4,
),
RegexSubstitution(
"other family",
fr"\b(?:othr|oth|other){sep}(?:family|fam)\b",
"other family",
priority=4,
),
RegexSubstitution(
"immoral",
fr"\b(?:immoral|imoral|imm|imor)\b",
"immoral",
priority=4,
),
RegexSubstitution(
"purpose",
fr"\bpurp\b",
"purpose",
priority=4,
),
RegexSubstitution(
"communication with minor for immoral purpose",
fr"\b(?:communication|comm|com){sep}(?:with|w){sep}(?:minor|min){sep}(?:immoral|imoral|imm|imor)\b",
"communication with minor for immoral purpose",
priority=4,
),
RegexSubstitution(
"communication with minor for immoral purpose 2",
fr"\bcomm{sep}minor{sep}imm\b",
"communication with minor for immoral purpose",
priority=4,
),
RegexSubstitution(
"communication with minor",
fr"\bcom{sep}w{sep}minor\b",
"communication with minor",
priority=4,
),
# EMBEZZLEMENT ===
RegexSubstitution(
"Embezzlement",
r"\b(?:embezzle|embezz|embez|embzzlmnt|embz)\b",
"embezzlement",
),
RegexSubstitution(
"real estate",
fr"\breal{sep}estat\b",
"real estate",
),
RegexSubstitution(
"chattel",
r"\bchatl\b",
"chattel",
),
RegexSubstitution(
"received",
r"\b(?:receiv|rcvd)\b",
"received",
),
RegexSubstitution(
"mortgagor",
r"\bmortgr\b",
"mortgagor",
),
RegexSubstitution(
"agreement",
r"\bagrmnt\b",
"agreement",
),
RegexSubstitution(
"public",
fr"\b(?:pub|publ|pblc)\b",
"public",
),
RegexSubstitution(
"behavior",
r"\bbehav\b",
"behavior",
),
RegexSubstitution(
"private",
r"\bpriv\b",
"private",
),
RegexSubstitution(
"corporation",
fr"\bcorp\b",
"corporation",
),
RegexSubstitution(
"purchase",
fr"\bpurc\b",
"purchase",
),
RegexSubstitution( # NOTE: pol may also be police - saw pol dog for example (police dog)
"political",
fr"\b(?:pol|polit|politcl)\b",
"political",
),
RegexSubstitution("police dog", fr"\bpol{sep}dog\b", "police dog", priority=5),
RegexSubstitution(
"payroll",
fr"\bpayrll\b",
"payroll",
),
RegexSubstitution(
"law enforcement",
fr"\blaw{sep}enf\b",
"law enforcement",
),
RegexSubstitution(
"incident",
fr"\bincdnt\b",
"incident",
),
RegexSubstitution(
"report",
fr"\brept\b",
"report",
),
RegexSubstitution(
"transfer",
fr"\btrnsf\b",
"transfer",
),
RegexSubstitution(
"capital assets",
fr"\bcptl{sep}asts\b",
"capital assets",
),
RegexSubstitution(
"clerk of court",
fr"\bclrk{sep}of{sep}crt\b",
"clerk of court",
),
RegexSubstitution(
"insufficient",
fr"\binsuf\b",
"insufficient",
),
RegexSubstitution(
"corporate officer", fr"\bcorp{sep}officer\b", "corporate officer", priority=5
),
RegexSubstitution(
"institution",
fr"\b(?:instit|inst)\b",
"institution",
),
RegexSubstitution(
"organization",
fr"\borg\b",
"organization",
),
RegexSubstitution(
"animals",
fr"\banmls\b",
"animals",
),
RegexSubstitution(
"animal",
fr"\banml\b",
"animal",
),
RegexSubstitution(
"software",
fr"\bsoftwr\b",
"software",
),
RegexSubstitution(
"transit or service bus",
fr"\btrans{sep}serv{sep}bus\b",
"transit or service bus",
),
RegexSubstitution(
"insurance agent",
fr"\binsur{sep}agent\b",
"insurance agent",
),
RegexSubstitution(
"official",
fr"\b(?:offic|offl|offcl|officl)\b",
"official",
),
RegexSubstitution( # TODO: is 'misapp' ... misappropriation or misapplication?
"misappropriation",
fr"\b(?:misappro|misapp)\b",
"misappropriation",
),
RegexSubstitution(
"misapplication",
fr"\bmisapl\b",
"misappropriation",
),
RegexSubstitution(
"fiduciary",
fr"\bfiduc\b",
"fiduciary",
),
RegexSubstitution(
"financial",
fr"\bfinan\b",
"financial",
),
RegexSubstitution(
"funds",
fr"\bfnds\b",
"funds",
),
# FELONY - UNSPECIFIED terms
RegexSubstitution(
"rendering assistance",
fr"\brend{sep}assist\b",
"rendering assistance",
priority=5,
),
RegexSubstitution(
"criminal assistance",
fr"\b(?:crim|criminal){sep}assist\b",
"criminal assistance",
priority=4,
),
RegexSubstitution(
"consummate",
fr"\b(?:consu|consummat)\b",
"consummate",
priority=4,
),
RegexSubstitution(
"deliver",
fr"\bdelive\b",
"deliver",
priority=4,
),
RegexSubstitution(
"to commit",
fr"\bto{sep}comm\b",
"to commit",
priority=4,
),
RegexSubstitution(
"violation of",
fr"\b(?:viol?|vio){sep}of\b",
"violation of",
priority=4,
),
RegexSubstitution(
"violation of civil",
fr"\bvol?{sep}civil\b",
"violation of civil",
priority=4,
),
RegexSubstitution("rendering", fr"\brend\b", "rendering"),
RegexSubstitution(
"assistance first degree",
fr"\bassistance{sep}1\b",
"assistance first degree",
priority=30,
),
RegexSubstitution(
"assistance second degree",
fr"\bassistance{sep}2\b",
"assistance second degree",
priority=30,
),
RegexSubstitution(
"assistance third degree",
fr"\bassistance{sep}3\b",
"assistance third degree",
priority=30,
),
RegexSubstitution(
"class",
fr"\bclas\b",
"class",
),
RegexSubstitution(
"accessory",
fr"\b(?:accessry|accsry)\b",
"accessory",
),
RegexSubstitution(
"dependency",
fr"\bdepndncy\b",
"dependency",
),
RegexSubstitution(
"unspecified",
fr"\bunspfd\b",
"unspecified",
),
RegexSubstitution(
"responsibility",
fr"\brespon?\b",
"responsibility",
),
RegexSubstitution(
"classification",
fr"\bclassif\b",
"classification",
),
RegexSubstitution(
"vice president",
fr"\bvp\b",
"vice president",
priority=30,
),
# BRIBERY terms
RegexSubstitution(
"personal",
fr"\bpersona\b",
"personal",
),
RegexSubstitution(
"assistance",
fr"\basst\b",
"assistance",
),
RegexSubstitution(
"service",
fr"\bserv\b",
"service",
),
RegexSubstitution(
"facilitation",
fr"\b(?:facil|fac)\b",
"facilitation",
),
RegexSubstitution(
"smuggling",
fr"\bsmug\b",
"smuggling",
),
RegexSubstitution(
"health",
fr"\bhlth\b",
"health",
),
RegexSubstitution( # NOTE: 'off' tends to be 'offense' hence the priority on this one
"official position", fr"\boff{sep}position\b", "official position", priority=5
),
RegexSubstitution(
"participants",
fr"\bparticipnts\b",
"participants",
),
RegexSubstitution(
"contestant",
fr"\bcntst\b",
"contestant",
),
RegexSubstitution(
"accept",
fr"\baccpt\b",
"accept",
),
RegexSubstitution(
"campaign contribution",
fr"\bcamp{sep}cont\b",
"campaign contribution",
),
RegexSubstitution(
"influence",
fr"\b(?:inflnce|influenc)\b",
"influence",
),
RegexSubstitution(
"compensation",
fr"\bcompens\b",
"compensation",
),
RegexSubstitution(
"treatment",
fr"\btreatm\b",
"treatment",
),
RegexSubstitution(
"commercial bribe",
fr"\b(?:comm|comm\'l){sep}bribe\b",
"commercial bribe",
),
RegexSubstitution(
"false testimony",
fr"\bfalse{sep}test\b",
"false testimony",
),
RegexSubstitution(
"miscellaneous",
fr"\bmisc\b",
"miscellaneous",
),
RegexSubstitution(
"impersonating",
fr"\bimpers\b",
"impersonating",
),
RegexSubstitution(
"receiving",
fr"\brecv\b",
"receiving",
),
RegexSubstitution(
"interfere with official process",
fr"\binterfere{sep}w{sep}offc{sep}proc\b",
"interfere with official process",
priority=5,
),
RegexSubstitution("public record", fr"\b(?:public|pub){sep}rec\b", "public record"),
RegexSubstitution(
"public servant",
fr"\b(?:public|pub){sep}(?:servant|srv|srvnt)\b",
"public servant",
),
RegexSubstitution( # NOTE: 'wit' also maps to 'withdraw', hence priority here
"witness juror",
fr"\b(?:witness|wit){sep}(?:juror|jur)\b",
"witness juror",
priority=5,
),
RegexSubstitution(
"umpire referee", fr"\b(?:umpire|ump){sep}(?:referee|ref)\b", "umpire referee"
),
# FAMILY RELATED OFFENSES
RegexSubstitution(
"custody interference",
fr"\bcust{sep}inter\b",
"custody interference",
),
RegexSubstitution(
"custody interference second degree",
fr"\bcust{sep}inter{sep}2\b",
"custody interference second degree",
priority=5,
),
RegexSubstitution(
"abandonment",
fr"\babandonmnt\b",
"abandonment",
),
RegexSubstitution(
"unattended",
fr"\bunatt\b",
"unattended",
),
RegexSubstitution(
"endanger",
fr"\b(?:endngr|endgr|endang)\b",
"endanger",
),
RegexSubstitution(
"welfare",
fr"\b(?:wlfre|wlfr)\b",
"welfare",
),
RegexSubstitution(
"endanger welfare",
fr"\b(?:endngr|endgr|endang){sep}(?:wlfre|wlfr|wel)\b",
"endanger welfare",
),
RegexSubstitution(
"neglect",
fr"\bneglct\b",
"neglect",
),
RegexSubstitution(
"contribute",
fr"\bcontrib\b",
"contribute",
),
RegexSubstitution(
"delinquincy",
fr"\b(?:dlnqncy|delinq)\b",
"delinquincy",
),
RegexSubstitution(
"service",
fr"\bsrvc\b",
"service",
),
RegexSubstitution(
"misrepresentation",
fr"\bmisrep\b",
"misrepresentation",
),
RegexSubstitution(
"disabled",
fr"\bdisabld\b",
"disabled",
),
# ===
RegexSubstitution(
"system of records exempt",
fr"\bsor{sep}exempt\b",
"system of records exempt",
),
RegexSubstitution(
"type",
r"\btyp\b",
"type",
),
RegexSubstitution(
"misconduct",
r"\b(?:miscond|miscon)\b",
"misconduct",
),
RegexSubstitution(
"mischief",
r"\bmisch\b",
"mischief",
),
RegexSubstitution(
"probation revocation",
fr"\bprob{sep}(?:rev|revo)\b",
"probation revocation",
),
RegexSubstitution(
"management",
r"\bmgmt\b",
"management",
),
RegexSubstitution(
"subsistence",
r"\bsubsist\b",
"subsistence",
),
RegexSubstitution(
"penalty group",
r"\bpg\b",
"penalty group",
),
RegexSubstitution(
"community custody",
r"\bcomm custody\b",
"community custody",
),
RegexSubstitution(
"contempt",
r"\bcntmpt\b",
"contempt",
),
RegexSubstitution(
"counterfeit",
r"\b(?:cntft|cntrft|cntrfeit|cnterft|contrft|contrfit)\b",
"counterfeit",
),
RegexSubstitution(
"counts",
r"\b(?:cts|cnts)\b",
"counts",
),
RegexSubstitution(
"victim",
r"\b(?:vict|vctm|vic)\b",
"victim",
),
# NUMBER TERMS ===========
RegexSubstitution("first", r"\b1st\b", "first", priority=20),
RegexSubstitution(
"first degree", fr"\b(?:first|1|1st){sep}(?:dgr|dg|de|d)\b", "first degree"
),
RegexSubstitution("first degree 2", fr"\b1dg\b", "first degree"),
RegexSubstitution(
"circumstances in the first degree",
fr"\bcircumstances{sep}1\b",
"circumstances in the first degree",
),
RegexSubstitution("second", r"\b2nd\b", "second", priority=20),
RegexSubstitution(
"second degree", fr"\b(?:second|2|2nd){sep}(?:dgr|dg|de|d)\b", "second degree"
),
RegexSubstitution(
"circumstances in the second degree",
fr"\bcircumstances{sep}2\b",
"circumstances in the second degree",
),
RegexSubstitution("third", r"\b3rd\b", "third", priority=20),
RegexSubstitution(
"third degree", fr"\b(?:third|3|3rd){sep}(?:dgr|dg|de|d)\b", "third degree"
),
RegexSubstitution(
"circumstances in the third degree",
fr"\bcircumstances{sep}3\b",
"circumstances in the third degree",
),
RegexSubstitution("fourth", r"\b4th\b", "fourth", priority=20),
RegexSubstitution("fifth", r"\b5th\b", "fifth", priority=20),
RegexSubstitution("sixth", r"\b6th\b", "sixth", priority=20),
RegexSubstitution("seventh", r"\b7th\b", "seventh", priority=20),
RegexSubstitution("eighth", r"\b8th\b", "eighth", priority=20),
RegexSubstitution("ninth", r"\b9th\b", "ninth", priority=20),
RegexSubstitution("tenth", r"\b10th\b", "tenth", priority=20),
# SCHEDULE terms ===========
# observed "l" for use of "i" across schedule terms
RegexSubstitution(
"Schedule", r"\b(?:sc?he?d?|sch|sched|schd)\b", "schedule", priority=9
),
RegexSubstitution(
"schedule one",
fr"\bschedule{sep}(?:i|1|l)\b",
"schedule one",
),
RegexSubstitution(
"schedule two",
fr"\bschedule{sep}(?:ii|2|ll)\b",
"schedule two",
),
RegexSubstitution(
"schedule three",
fr"\bschedule{sep}(?:iii|3|lll)\b",
"schedule three",
),
RegexSubstitution(
"schedule four",
fr"\bschedule{sep}(?:iv|4|lv)\b",
"schedule four",
),
RegexSubstitution(
"schedule five",
fr"\bschedule{sep}(?:v|5)\b",
"schedule five",
),
RegexSubstitution(
"schedule six",
fr"\bschedule{sep}(?:vi|6|vl)\b",
"schedule six",
),
# DRIVING TERMS ===========
RegexSubstitution(
"driving",
r"\bdrvg\b",
"driving",
),
RegexSubstitution(
"driving 2",
fr"\bdriv{sep}g\b",
"driving",
),
RegexSubstitution(
"failure to yield",
fr"\bfty\b",
"failure to yield",
),
RegexSubstitution(
"permit",
fr"\bperm\b",
"permit",
),
RegexSubstitution(
"registration",
fr"\b(?:regis|registra)\b",
"registration",
),
RegexSubstitution(
"driving under the influence",
r"\bdui\b",
"driving under the influence",
),
RegexSubstitution(
"driving while impaired",
r"\bdwi\b",
"driving while impaired",
),
RegexSubstitution(
"driving while license suspended",
r"\bdwls\b",
"driving while license suspended",
),
RegexSubstitution(
"driving while license revoked",
r"\bdwlr\b",
"driving while license revoked",
),
RegexSubstitution(
"revoked",
r"\brevkd\b",
"revoked",
),
RegexSubstitution(
"reckless endangerment",
fr"\breckles{sep}endanger\b",
"reckless endangerment",
),
RegexSubstitution(
"highway",
fr"\bhi{sep}way\b",
"highway",
),
RegexSubstitution(
"reckless driving",
fr"\brek{sep}dr?\b",
"reckless driving",
),
# ========
RegexSubstitution(
"retail theft",
fr"\bretail{sep}thft\b",
"retail theft",
),
RegexSubstitution(
"impregnate girl",
fr"\b(?:impregnate|impreg){sep}(?:girl|grl)\b",
"impregnate girl",
),
RegexSubstitution(
"worker compensation",
fr"\bwrkr{sep}cmp\b",
"worker compensation",
),
RegexSubstitution(
"disregard",
fr"\bdisreg\b",
"disregard",
),
RegexSubstitution(
"electrical appliance",
fr"\belct{sep}appl\b",
"electrical appliance",
),
RegexSubstitution(
"serial number",
fr"\b(?:serial|ser){sep}(?:number|nmbr|num|nu|no)\b",
"serial number",
),
# DISTRIBUTION / FURNISH / TRAFFICK TERMS =======
RegexSubstitution( # TODO: revisit traff/traf', more likely to be traffick/ing but could be traffic (cars)
"traffick",
r"\b(?:tfk|traff|traf)\b",
"traffick",
),
RegexSubstitution( # TODO: revisit adding 'dist', more likely to be distribution but could be disturbance
"distribution",
r"\b(?:distr|distrib)\b",
"distribution",
),
RegexSubstitution(
"attempted distribution",
fr"\b(?:at|att|attempted){sep}dist\b",
"attempted distribution",
priority=5,
),
RegexSubstitution(
"illegal distribution",
fr"\billgl{sep}dist\b",
"intent distribution",
priority=5,
),
RegexSubstitution(
"buy distribute",
fr"\bbuy{sep}dist\b",
"buy distribute",
),
RegexSubstitution(
"intent distribute",
fr"\b(?:intent|int){sep}dist\b",
"intent distribute",
priority=5,
),
RegexSubstitution(
"intent to distribute",
fr"\b(?:intent|int){sep}to{sep}dist\b",
"intent to distribute",
priority=5,
),
RegexSubstitution(
"distribution possession",
fr"\bdist{sep}(?:possession|possess|poss)\b",
"distribution possession",
priority=5,
),
RegexSubstitution(
"unauthorized distribution",
fr"\b(?:unauthorized|unauth|unau|unauthd){sep}dist\b",
"unauthorized distribution",
priority=5,
),
RegexSubstitution(
"possession distribution",
fr"\b(?:possession|possess|poss){sep}dist\b",
"possession distribution",
priority=5,
),
RegexSubstitution(
"unlaw distribution",
fr"\b(?:unlawful|unlaw){sep}dist\b",
"unlawful distribution",
priority=5,
),
RegexSubstitution(
"distribution controlled",
fr"\bdist{sep}(?:controlled|cntrld|cntrl|contrlld)\b",
"distribution controlled",
priority=5,
),
RegexSubstitution(
"distribute schedule",
fr"\bdist{sep}(?:schedule|sch|sched)\b",
"distribute schedule",
priority=5,
),
RegexSubstitution(
"furnish",
r"\b(?:furnishing|furn)\b",
"furnish",
),
RegexSubstitution( # TODO: revisit adding 'man', more likely to be manufacture/ing but could have other meaning
"manufacturing",
r"\b(?:manuf|manu|mfg|manf|manfac)\b",
"manufacturing",
),
RegexSubstitution(
"manufacturing distribution sell",
fr"\b(?:manuf|manu|man|mfg|manf|manfac){sep}dist{sep}sell\b",
"manufacturing distribution sell",
priority=5,
),
RegexSubstitution(
"record sell rent distribute",
fr"\brecord{sep}sell{sep}rent{sep}dist\b",
"record sell rent distribute",
priority=5,
),
RegexSubstitution(
"sell distribute",
fr"\bsell{sep}dist\b",
"sell distribute",
priority=5,
),
RegexSubstitution(
"sale distribute",
fr"\bsale{sep}dist\b",
"sale distribute",
priority=5,
),
RegexSubstitution(
"offer agree to distribute",
fr"\boffer{sep}agree{sep}to{sep}dist\b",
"offer agree distribute",
priority=5,
),
RegexSubstitution(
"arrange to distribute",
fr"\barrange{sep}to{sep}dist\b",
"arrange to distribute",
priority=5,
),
RegexSubstitution(
"arrange to distribute 2",
fr"\barrange{sep}dist\b",
"arrange to distribute",
priority=5,
),
RegexSubstitution(
"controlled substance distribution",
fr"\bcontr{sep}sub{sep}dist\b",
"controlled substance distribution",
priority=5,
),
RegexSubstitution(
"manufacturing deliver distribution",
fr"\b(?:manuf|manu|man|mfg|manf){sep}del{sep}dist\b",
"manufacturing deliver distribution",
priority=5,
),
RegexSubstitution(
"possession distribution manufacturing",
fr"\bposs{sep}dist{sep}manuf\b",
"possession distribution manufacturing",
priority=5,
),
RegexSubstitution(
"with intent to distribute",
fr"\bwitd\b",
"with intent to distribute",
priority=5,
),
RegexSubstitution(
"possession with intent to distribute",
fr"\bposs{sep}(?:with|w){sep}(?:intent|int|i){sep}dist\b",
"possession with intent to distribute",
priority=5,
),
RegexSubstitution(
"manufacturing distribution possession",
fr"\b(?:manuf|manu|man|mfg|manf){sep}dist{sep}(?:p|poss|pos)\b",
"manufacturing distribution possession",
priority=5,
),
RegexSubstitution(
"manufacturing distribution",
fr"\b(?:manuf|manu|man|mfg|manf){sep}dist\b",
"manufacturing distribution",
priority=5,
),
RegexSubstitution(
"distribution obscene material",
fr"\bdist{sep}(?:obscene|obs|obsc){sep}(?:material|mat|mtrl)\b",
"distribution obscene material",
priority=5,
),
RegexSubstitution(
"harmful material",
fr"\b(?:harmful|hrmf){sep}(?:material|mat|mtrl)\b",
"harmful material",
priority=5,
),
RegexSubstitution(
"obscene material distribution",
fr"\b(?:obscene|obs|obsc){sep}(?:material|mat|mtrl){sep}dist\b",
"obscene material distribution",
priority=5,
),
RegexSubstitution(
"material",
fr"\b(?:matrl|mat|mtrl)\b",
"material",
priority=5,
),
RegexSubstitution(
"distribution child porn",
fr"\bdist{sep}child{sep}porn\b",
"distribution child porn",
priority=5,
),
RegexSubstitution(
"distribution controlled substances",
fr"\bdist{sep}cds\b",
"distribution controlled substances",
priority=5,
),
RegexSubstitution(
"controlled substances distribution ",
fr"\bcds{sep}dist\b",
"controlled substances distribution ",
priority=5,
),
RegexSubstitution(
"distribution narcotics",
fr"\bdist{sep}narc\b",
"distribution narcotics",
priority=5,
),
RegexSubstitution(
"deliver or distribution",
fr"\bdel{sep}or{sep}dist\b",
"deliver or distribution",
priority=5,
),
RegexSubstitution(
"criminal distribution",
fr"\bcriminal{sep}dist\b",
"criminal distribution",
priority=5,
),
RegexSubstitution(
"purchase",
r"\bpur\b",
"purchase",
),
# DRUG TERMS ===========
RegexSubstitution(
"marijuana",
r"\b(?:marij|marihuana|mari|marijuan|marijua|mariju|mj)\b",
"marijuana",
),
RegexSubstitution(
"hydrocodone",
r"\bhydroc\b",
"hydrocodone",
),
RegexSubstitution(
"cocaine",
r"\b(?:cocain|coca|cocai|cocne)\b",
"cocaine",
),
RegexSubstitution(
"crack or cocaine",
r"\bcoc\b",
"crack or cocaine",
),
RegexSubstitution(
"rohypnol",
r"\brohypnl\b",
"rohypnol",
),
RegexSubstitution(
"heroine",
r"\bher\b",
"heroine",
),
RegexSubstitution(
"heroine",
r"\bher\b",
"heroine",
),
RegexSubstitution(
"ecstasy",
r"\bmdma\b",
"ecstasy",
),
RegexSubstitution(
"methamphetamine",
r"\b(?:meth|metham|methamphet|methamph)\b",
"methamphetamine",
),
RegexSubstitution(
"paraphernalia",
r"\b(?:para|paraph|paraphenalia|parap)\b",
"paraphernalia",
),
RegexSubstitution(
"grams",
r"\b(?:gr|gms|grms)\b",
"grams",
),
RegexSubstitution(
"gram",
r"\bgm\b",
"gram",
),
RegexSubstitution(
"kilograms",
r"\bkg\b",
"kilograms",
),
RegexSubstitution(
"pounds",
r"\blb\b",
"pounds",
),
RegexSubstitution(
"ounces",
r"\boz\b",
"ounces",
),
# ALCOHOL / LIQUOR terms ===========
RegexSubstitution(
"alcoholic beverage", r"\balc\Wbev\b", "alcoholic beverage", priority=5
),
RegexSubstitution(
"beverage",
r"\bbev\b",
"beverage",
),
RegexSubstitution(
"blood alcohol concentration",
r"\bbac\b",
"blood alcohol concentration",
),
RegexSubstitution(
"alcohol",
r"\b(?:alc|alch|alchol|alcohl|alco|alcoh|alcoho)\b",
"alcohol",
),
RegexSubstitution(
"over legal",
fr"\b(?:over|ov){sep}(?:legal|leg)\b",
"over legal",
),
RegexSubstitution(
"supply",
fr"\bsupp\b",
"supply",
),
RegexSubstitution(
"liquor",
fr"\bliq\b",
"liquor",
),
RegexSubstitution(
"distill",
r"\bdstl\b",
"distill",
),
RegexSubstitution(
"minor in possession",
fr"\bmip\b",
"minor in possession",
),
RegexSubstitution(
"premises",
fr"\bprem\b",
"premises",
),
RegexSubstitution(
"consume",
fr"\bcnsum\b",
"consume",
),
RegexSubstitution(
"intoxication",
fr"\bintox\b",
"intoxication",
),
RegexSubstitution(
"available",
fr"\bavail\b",
"available",
),
RegexSubstitution(
"unlicensed",
fr"\bunlic\b",
"unlicensed",
),
RegexSubstitution(
"large amount",
fr"\blg{sep}amt\b",
"large amount",
),
RegexSubstitution(
"small amount",
fr"\bsm{sep}amt\b",
"small amount",
),
RegexSubstitution(
"required",
fr"\breq\b",
"required",
),
RegexSubstitution(
"violate prohibition",
fr"\bvio{sep}prohibition\b",
"violate prohibition",
),
RegexSubstitution(
"enticement",
fr"\bentcmnt\b",
"enticement",
),
# SUBSTANCE TERMS ========
RegexSubstitution(
"Substance",
r"\b(?:sub|subs|substanc|substan|substnces|subtance|substa|substnc|sunstance|subst)\b",
"substance",
20,
),
RegexSubstitution("controlled", r"\b(?:cntrld|cntrl|contrlld)\b", "controlled", 20),
RegexSubstitution(
"controlled dangerous substances",
r"\bcds\b",
"controlled dangerous substances",
),
RegexSubstitution(
"solicitation of controlled substances",
fr"\bsol{sep}cds\b",
"solicitation of controlled substances",
priority=4,
),
RegexSubstitution(
"solicitation",
fr"\b(?:solct|sol|solicit|solic)\b",
"solicitation",
),
RegexSubstitution(
"solicitation of narcotics",
fr"\bsol{sep}narc\b",
"solicitation of narcotics",
priority=4,
),
RegexSubstitution(
"Controlled Substance",
fr"\bcont?r?{sep}?subs?t?(?:\b|stance\b)",
"controlled substance",
),
RegexSubstitution(
"Controlled Substance 2",
r"\bc\W?s\b",
"controlled substance",
),
RegexSubstitution(
"unlawful possession of a controlled substance",
r"\bupcs\b",
"unlawful possession of a controlled substance",
),
]
def prep_text(text):
    """Normalise raw text before the regex substitutions are applied.

    In order: strip the comma out of ``digit,digit`` pairs (``"1,000"`` ->
    ``"1000"``), drop the apostrophe in ``'s`` (``"victim's"`` ->
    ``"victims"``), then turn every hyphen and forward slash into a space.
    Returns the transformed string.
    """
    # thousands separators: "1,000" -> "1000"
    no_digit_commas = re.sub(r"(\d+?),(\d+?)", r"\1\2", text)
    # TODO: double check this `'s` regex
    no_possessive_quote = re.sub(r"\b(\S+?)'(s)", r"\1\2", no_digit_commas)
    # hyphens and forward slashes act as word separators
    return no_possessive_quote.replace("-", " ").replace("/", " ")
def cleaner(text):
    """Return a cleaned, lower-cased version of ``text``.

    A null input (as judged by ``pd.isnull``) yields ``""``. Otherwise the
    text goes through ``prep_text``, every entry of ``substitutions`` in
    ascending ``priority`` order, every entry of ``removals`` (each match
    becomes a space), punctuation stripping, whitespace collapsing and
    finally lower-casing.
    """
    if pd.isnull(text):
        return ""

    cleaned = prep_text(text)

    # Lower priority values run first; sorted() is stable, so entries that
    # share a priority keep their order from the list.
    for rule in sorted(substitutions, key=lambda entry: entry.priority):
        cleaned = rule.regex.sub(rule.replacement, cleaned)

    # Blank out unwanted terms
    for unwanted in removals:
        cleaned = unwanted.regex.sub(" ", cleaned)

    # Every remaining punctuation character becomes a single space
    punctuation_to_space = str.maketrans(dict.fromkeys(all_punctuation, " "))
    cleaned = cleaned.translate(punctuation_to_space)

    # Collapse runs of whitespace, trim the ends, then lower-case
    return " ".join(cleaned.split()).lower()