akgodwin committed on
Commit
3febea9
1 Parent(s): d337f30

reroute to streamlit app

Browse files
Files changed (8) hide show
  1. .vscode/settings.json +0 -3
  2. ABOUT.md +3 -1
  3. README.md +1 -1
  4. app.py +3 -101
  5. cleaning_utils.py +0 -2652
  6. download.py +0 -28
  7. onnx_model_utils.py +0 -195
  8. requirements.txt +1 -11
.vscode/settings.json DELETED
@@ -1,3 +0,0 @@
1
- {
2
- "python.defaultInterpreterPath": "/Users/akgodwin/.pyenv/versions/3.9.11/envs/hf-rota-app/bin/python",
3
- }
 
 
 
ABOUT.md CHANGED
@@ -15,4 +15,6 @@ The model was trained on [publicly available data](https://web.archive.org/web/2
15
 
16
  For more information on the model, please see the [model repo](https://huggingface.co/rti-international/rota).
17
 
18
- This model and application were developed by the [RTI International Center for Data Science](https://www.rti.org/centers/rti-center-data-science).
 
 
15
 
16
  For more information on the model, please see the [model repo](https://huggingface.co/rti-international/rota).
17
 
18
+ This model and application were developed by the [RTI International Center for Data Science](https://www.rti.org/centers/rti-center-data-science).
19
+
20
+ ### ℹ️ Use
README.md CHANGED
@@ -27,4 +27,4 @@ The model was trained on [publicly available data](https://web.archive.org/web/2
27
 
28
  For more information on the model, please see the [model repo](https://huggingface.co/rti-international/rota).
29
 
30
- This model and application were developed by the [RTI International Center for Data Science](https://www.rti.org/centers/rti-center-data-science).
27
 
28
  For more information on the model, please see the [model repo](https://huggingface.co/rti-international/rota).
29
 
30
+ This model and application were developed by the [RTI International Center for Data Science and AI](https://www.rti.org/centers/rti-center-data-science).
app.py CHANGED
@@ -1,111 +1,13 @@
1
- from functools import partial
2
  from pathlib import Path
3
 
4
- from pandas import DataFrame, read_csv, read_excel
5
  import streamlit as st
6
- from more_itertools import ichunked
7
- from stqdm import stqdm
8
 
9
- from onnx_model_utils import predict, predict_bulk, max_pred_bulk, RELEASE_TAG
10
- from download import download_link
11
 
12
  PRED_BATCH_SIZE = 4
13
-
14
-
15
  st.set_page_config(page_title="ROTA", initial_sidebar_state="collapsed")
16
 
17
- st.markdown(Path("ABOUT.md").read_text())
18
-
19
- st.markdown("## ✏️ Single Coder Demo")
20
- input_text = st.text_input(
21
- "Input Offense",
22
- value="FRAUDULENT USE OF A CREDIT CARD OR DEBT CARD >= $25,000",
23
- )
24
-
25
- predictions = predict(input_text)
26
-
27
- st.markdown("Predictions")
28
- labels = ["Charge Category"]
29
- st.dataframe(
30
- DataFrame(predictions[0])
31
- .assign(
32
- confidence=lambda d: d["score"].apply(lambda d: round(d * 100, 0)).astype(int)
33
- )
34
- .drop("score", axis="columns")
35
- )
36
-
37
- st.markdown("---")
38
- st.markdown("## 📑 Bulk Coder")
39
- st.warning(
40
- "⚠️ *Note:* Your input data will be deduplicated"
41
- " on the selected column to reduce computation requirements."
42
- " You will need to re-join the results on your offense text column."
43
- )
44
- st.markdown("1️⃣ **Upload File**")
45
- uploaded_file = st.file_uploader("Bulk Upload", type=["xlsx", "csv"])
46
-
47
- file_readers = {"csv": read_csv, "xlsx": partial(read_excel, engine="openpyxl")}
48
 
49
- if uploaded_file is not None:
50
- for filetype, reader in file_readers.items():
51
- if uploaded_file.name.endswith(filetype):
52
- df = reader(uploaded_file)
53
- file_name = uploaded_file.name
54
- del uploaded_file
55
- st.write("2️⃣ **Select Column of Offense Descriptions**")
56
- string_columns = list(df.select_dtypes("object").columns)
57
- longest_column = max(
58
- [(df[c].str.len().mean(), c) for c in string_columns], key=lambda x: x[0]
59
- )[1]
60
-
61
- selected_column = st.selectbox(
62
- "Select Column",
63
- options=list(string_columns),
64
- index=string_columns.index(longest_column),
65
- )
66
- original_length = len(df)
67
- df_unique = df.drop_duplicates(subset=[selected_column]).copy()
68
- del df
69
- st.markdown(
70
- f"Uploaded Data Sample `(Deduplicated. N Rows = {len(df_unique)}, Original N = {original_length})`"
71
- )
72
- st.dataframe(df_unique.head(20))
73
- st.write(f"3️⃣ **Predict Using Column: `{selected_column}`**")
74
-
75
- column = df_unique[selected_column].copy()
76
- del df_unique
77
- if st.button(f"Compute Predictions"):
78
- input_texts = (value for _, value in column.items())
79
-
80
- n_batches = (len(column) // PRED_BATCH_SIZE) + 1
81
-
82
- bulk_preds = []
83
- for batch in stqdm(
84
- ichunked(input_texts, PRED_BATCH_SIZE),
85
- total=n_batches,
86
- desc="Bulk Predict Progress",
87
- ):
88
- batch_preds = predict_bulk(batch)
89
- bulk_preds.extend(batch_preds)
90
-
91
- pred_df = column.to_frame()
92
- max_preds = max_pred_bulk(bulk_preds)
93
- pred_df["charge_category_pred"] = [p["label"] for p in max_preds]
94
- pred_df["charge_category_pred_confidence"] = [
95
- int(round(p["score"] * 100, 0)) for p in max_preds
96
- ]
97
- del column
98
- del bulk_preds
99
- del max_preds
100
-
101
- # # TODO: Add all scores
102
-
103
- st.write("**Sample Output**")
104
- st.dataframe(pred_df.head(100))
105
 
106
- tmp_download_link = download_link(
107
- pred_df,
108
- f"{file_name}-ncrp-predictions.csv",
109
- "⬇️ Download as CSV",
110
- )
111
- st.markdown(tmp_download_link, unsafe_allow_html=True)
 
1
  from pathlib import Path
2
 
 
3
  import streamlit as st
 
 
4
 
 
 
5
 
6
  PRED_BATCH_SIZE = 4
 
 
7
  st.set_page_config(page_title="ROTA", initial_sidebar_state="collapsed")
8
 
9
+ st.markdown(":zap: The ROTA app is available for use at https://rti-rota.streamlit.app/ :zap:")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
+ st.markdown(Path("ABOUT.md").read_text())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
+ st.markdown(":zap: To use the ROTA app, go to https://rti-rota.streamlit.app/ :zap:")
 
 
 
 
 
cleaning_utils.py DELETED
@@ -1,2652 +0,0 @@
1
- import re
2
- from dataclasses import dataclass
3
- from string import punctuation
4
- import pandas as pd
5
-
6
- all_punctuation = punctuation + "‘’·—»"
7
- # keep in dollar signs
8
- all_punctuation = all_punctuation.replace("$", "")
9
-
10
-
11
- # "regex separator"
12
- # captures the following: 1+ spaces OR 1+ non-word characters (ex: "/", "-"), OR 1 word boundary
13
- # apply this variable using an `fr` string in the regex substitution (ex: `fr"\bw{sep}force\b"`)
14
- sep = "(?: +|\W+|\b)"
15
-
16
-
17
- @dataclass
18
- class RegexRemoval:
19
- description: str
20
- regex_str: str # usually raw string: r"your string"
21
-
22
- def __post_init__(self):
23
- self.regex = re.compile(self.regex_str, re.IGNORECASE)
24
-
25
-
26
- @dataclass
27
- class RegexSubstitution:
28
- description: str
29
- regex_str: str # usually raw string: r"your string"
30
- replacement: str
31
- priority: int = 10 # higher values → run later (eg: 1 runs before 20)
32
-
33
- def __post_init__(self):
34
- self.regex = re.compile(self.regex_str, re.IGNORECASE)
35
-
36
-
37
- removals = [
38
- RegexRemoval("OBSCIS", r"(OBSCIS)"),
39
- RegexRemoval(
40
- "MO Suffix",
41
- r"\b\w\s\w\s\w\w?\s\w\s\d{2}(?: |\W)\d{2}(?: |\W)\d{4}",
42
- ),
43
- RegexRemoval(
44
- "Statute Prefix", r"\S{1,2}\s\d\S{0,3}\.\d\S{0,3}\.\d\S{0,3}(?:\.\d?\S{0,3}?)?"
45
- ),
46
- ]
47
-
48
- substitutions = [
49
- # LESS THAN / GREATER THAN terms =========
50
- RegexSubstitution("Less Than", fr"\b(?:<|lt)\b", " less than "),
51
- RegexSubstitution("Less Than 2", fr"\blt(?=\d+)\b", "less than "),
52
- RegexSubstitution("Less Than 3", fr"\<", " less than "),
53
- RegexSubstitution("Greater Than", fr"\b(?:&GT;|gt|\>)\b", " greater than "),
54
- RegexSubstitution("Greater Than 2", fr"\bgt(?=\d+)\b", "greater than "),
55
- RegexSubstitution("Greater Than 3", fr"\>", " greater than "),
56
- # WITH terms ===========
57
- RegexSubstitution("With Out", fr"\bw{sep}(?:o|out)\b", "without"),
58
- RegexSubstitution("With Out 2", fr"\bwo\b", "without"),
59
- RegexSubstitution("Within", fr"\bw{sep}(?:i|in)\b", "within", priority=5),
60
- RegexSubstitution(
61
- "With Intent",
62
- fr"\bw{sep}\s?in?t?e?n?t?\b",
63
- "with intent",
64
- ),
65
- RegexSubstitution(
66
- "with a",
67
- fr"\bw{sep}a\b",
68
- "with a",
69
- ),
70
- RegexSubstitution(
71
- "with health",
72
- fr"\bw{sep}health\b",
73
- "with health",
74
- ),
75
- RegexSubstitution(
76
- "with own",
77
- fr"\bw{sep}own\b",
78
- "with own",
79
- ),
80
- RegexSubstitution(
81
- "with report",
82
- fr"\bw{sep}report\b",
83
- "with report",
84
- ),
85
- RegexSubstitution(
86
- "with license",
87
- fr"\bw{sep}license\b",
88
- "with license",
89
- ),
90
- RegexSubstitution(
91
- "with murder",
92
- fr"\bw{sep}murder\b",
93
- "with murder",
94
- ),
95
- RegexSubstitution(
96
- "with injury",
97
- fr"\bw{sep}(?:injury|inj|injry)\b",
98
- "with injury",
99
- ),
100
- RegexSubstitution(
101
- "with turned",
102
- fr"\bw{sep}turned\b",
103
- "with turned",
104
- ),
105
- RegexSubstitution(
106
- "with altered",
107
- fr"\bw{sep}alt\b",
108
- "with altered",
109
- ),
110
- RegexSubstitution(
111
- "with deadly",
112
- fr"\bw{sep}deadly\b",
113
- "with deadly",
114
- ),
115
- RegexSubstitution(
116
- "with dangerous weapon",
117
- fr"\b(?:with|w){sep}(?:dangerous|d){sep}(?:weapon|wpn|weapn|weap)\b",
118
- "with dangerous weapon",
119
- priority=5,
120
- ),
121
- RegexSubstitution(
122
- "with child",
123
- fr"\b(?:with|w){sep}(?:child|chi|chld)\b",
124
- "with child",
125
- ),
126
- RegexSubstitution(
127
- "with minor",
128
- fr"\bw{sep}minor\b",
129
- "with minor",
130
- ),
131
- RegexSubstitution(
132
- "with kidnapping",
133
- fr"\bw{sep}kidnapping\b",
134
- "with kidnapping",
135
- ),
136
- RegexSubstitution(
137
- "with agency",
138
- fr"\bw{sep}agency\b",
139
- "with agency",
140
- ),
141
- RegexSubstitution(
142
- "with firearm",
143
- fr"\bw{sep}firearm\b",
144
- "with firearm",
145
- ),
146
- RegexSubstitution(
147
- "with weapon",
148
- fr"\bw{sep}(?:weapon|wpn|weapn|weap)\b",
149
- "with weapon",
150
- ),
151
- RegexSubstitution(
152
- "with knife",
153
- fr"\bw{sep}knife\b",
154
- "with knife",
155
- ),
156
- RegexSubstitution(
157
- "with force",
158
- fr"\bw{sep}force\b",
159
- "with force",
160
- ),
161
- RegexSubstitution(
162
- "with extenuating circumstances",
163
- fr"\bw{sep}ext{sep}circumstances\b",
164
- "with extenuating circumstances",
165
- ),
166
- RegexSubstitution(
167
- "with prior",
168
- fr"\bw{sep}prior\b",
169
- "with prior",
170
- ),
171
- RegexSubstitution(
172
- "with previous",
173
- fr"\bw{sep}previous\b",
174
- "with previous",
175
- ),
176
- RegexSubstitution(
177
- "with domestic violence",
178
- fr"\bw{sep}dv\b",
179
- "with domestic violence",
180
- ),
181
- RegexSubstitution(
182
- "with suspended",
183
- fr"\bw{sep}suspended\b",
184
- "with suspended",
185
- ),
186
- RegexSubstitution( # doublecheck this
187
- "vehicle with",
188
- fr"\bvehicle{sep}w{sep}",
189
- "vehicle with",
190
- ),
191
- RegexSubstitution( # TODO: is this "possession with" or "possession weapon"? see concealed weapon as example
192
- "possession with",
193
- fr"\b(?:possession|possess|poss){sep}w{sep}",
194
- "possession with",
195
- ),
196
- RegexSubstitution(
197
- "possession with intent",
198
- fr"\bp{sep}with{sep}intent",
199
- "possession with intent",
200
- priority=30,
201
- ),
202
- RegexSubstitution(
203
- "neglect with",
204
- fr"\bneglect{sep}w{sep}",
205
- "neglect with",
206
- ),
207
- RegexSubstitution(
208
- "cooperate with",
209
- fr"\bcooperate{sep}w{sep}",
210
- "cooperate with",
211
- ),
212
- RegexSubstitution(
213
- "interfere with",
214
- fr"\b(?:inter|interfere){sep}w{sep}",
215
- "interfere with",
216
- ),
217
- RegexSubstitution( # TODO consolidate tamper/tampering?
218
- "tamper with",
219
- fr"\btamper{sep}w{sep}",
220
- "tamper with",
221
- ),
222
- RegexSubstitution(
223
- "tampering with",
224
- fr"\btampering{sep}w{sep}",
225
- "tampering with",
226
- ),
227
- RegexSubstitution(
228
- "assault with",
229
- fr"\bassault{sep}w{sep}",
230
- "assault with",
231
- ),
232
- # FIREARM TERMS
233
- RegexSubstitution(
234
- "firearm with altered identification numbers",
235
- fr"\bfirearm{sep}(?:with|w){sep}alter\b",
236
- "firearm with altered identification numbers",
237
- ),
238
- RegexSubstitution(
239
- "firearm",
240
- fr"\bf{sep}a\b",
241
- "firearm",
242
- ),
243
- RegexSubstitution(
244
- "intimidation",
245
- fr"\b(?:intim|intimid)\b",
246
- "intimidation",
247
- ),
248
- # DOMESTIC VIOLENCE TERMS / PROTECTION / RESTRAINING ORDERS
249
- RegexSubstitution(
250
- "protective order",
251
- fr"\b(?:protective|protection|prot){sep}(?:order|ord|or)\b",
252
- "protective order",
253
- ),
254
- RegexSubstitution(
255
- "domestic violence protective order",
256
- r"\bdvpo\b",
257
- "domestic violence protective order",
258
- ),
259
- RegexSubstitution("domestic", r"\bdom\b", "domestic", priority=20),
260
- RegexSubstitution(
261
- "domestic violence",
262
- r"\bdv\b",
263
- "domestic violence",
264
- ),
265
- RegexSubstitution(
266
- "domestic violence 2",
267
- fr"\bd{sep}v\b",
268
- "domestic violence",
269
- ),
270
- RegexSubstitution(
271
- "witness testimony",
272
- fr"\bwit{sep}tes\b",
273
- "witness testimony",
274
- ),
275
- # CONVICTION TERMS ==
276
- RegexSubstitution(
277
- "misdemeanor conviction",
278
- fr"\b(?:misdemeanor|misd){sep}(?:convic|conv)\b",
279
- "misdemeanor conviction",
280
- ),
281
- RegexSubstitution(
282
- "prior conviction",
283
- fr"\b(?:prior|pr|pri){sep}(?:convic|conv)\b",
284
- "prior conviction",
285
- ),
286
- # ==== GENERAL TERMS =====
287
- RegexSubstitution( # NOTE: added a negative lookbehind for 'mentally' so we won't override 'mentally ill' cases
288
- "illegal",
289
- fr"\b(?<!mentally )(?:ill|illeg|illgl)\b",
290
- "illegal",
291
- ),
292
- RegexSubstitution("commercial fish", fr"\bcomm{sep}fish\b", "commercial fish"),
293
- RegexSubstitution("vessel", fr"\bvess\b", "vessel"),
294
- RegexSubstitution(
295
- "traffic control device",
296
- fr"\btraff{sep}control{sep}dev\b",
297
- "traffic control device",
298
- ),
299
- RegexSubstitution("non-culpable", fr"\bnonculp\b", "non-culpable"),
300
- RegexSubstitution("prohibited", fr"\bprohib\b", "prohibited"),
301
- RegexSubstitution("nuisance", fr"\bnuis\b", "nuisance"),
302
- RegexSubstitution("obstruction", fr"\bobstr\b", "obstruction"),
303
- RegexSubstitution("pedestrian", fr"\bped\b", "pedestrian"),
304
- RegexSubstitution("conduct", fr"\bcond\b", "conduct", priority=20),
305
- RegexSubstitution(
306
- "subsequent",
307
- fr"\bsubsq\b",
308
- "subsequent",
309
- ),
310
- RegexSubstitution(
311
- "disturbing the peace",
312
- fr"\bdist{sep}peace\b",
313
- "disturbing the peace",
314
- ),
315
- RegexSubstitution(
316
- "offender accountability act",
317
- fr"\boaa\b",
318
- "offender accountability act",
319
- ),
320
- RegexSubstitution(
321
- "against",
322
- fr"\b(?:agnst|agin)\b",
323
- "against",
324
- ),
325
- RegexSubstitution(
326
- "child",
327
- fr"\b(?:chil|chld)\b",
328
- "child",
329
- ),
330
- RegexSubstitution(
331
- "school",
332
- fr"\bschl\b",
333
- "school",
334
- ),
335
- RegexSubstitution(
336
- "multiple",
337
- fr"\bmult\b",
338
- "multiple",
339
- ),
340
- RegexSubstitution(
341
- "assailant",
342
- fr"\bassail\b",
343
- "assailant",
344
- ),
345
- RegexSubstitution(
346
- "public disturbance",
347
- fr"\b(?:public|pub|publ){sep}(?:disturbance|disturb|dist)\b",
348
- "public disturbance",
349
- ),
350
- RegexSubstitution(
351
- "interfere",
352
- fr"\b(?:interf|interfer)\b",
353
- "interfere",
354
- ),
355
- RegexSubstitution( # TODO should we leave obstructing/obstruction separate terms or lump into obstruct?
356
- "obstructing",
357
- fr"\bob\b",
358
- "obstructing",
359
- ),
360
- RegexSubstitution(
361
- "law enforcement officer",
362
- fr"\bleo\b",
363
- "law enforcement officer",
364
- ),
365
- RegexSubstitution(
366
- "officer",
367
- fr"\b(?:offcr|ofcr)\b",
368
- "officer",
369
- ),
370
- RegexSubstitution(
371
- "minor",
372
- fr"\b(?:min|minr|mnr)\b",
373
- "minor",
374
- ),
375
- RegexSubstitution(
376
- "distance within 300 feet of park",
377
- fr"\bdist{sep}300{sep}park\b",
378
- "distance within 300 feet of park",
379
- priority=5,
380
- ),
381
- RegexSubstitution(
382
- "distance within 300",
383
- fr"{sep}dist{sep}w{sep}i{sep}300\b",
384
- "distance within 300",
385
- priority=5,
386
- ),
387
- RegexSubstitution(
388
- "major",
389
- fr"\bmajr\b",
390
- "major",
391
- ),
392
- RegexSubstitution(
393
- "willful",
394
- fr"\b(?:wilfl|wlfl)\b",
395
- "willful",
396
- ),
397
- RegexSubstitution(
398
- "issue worthless checks",
399
- fr"\b(?:issue|iss){sep}(?:worthless|wrthlss|wrtls){sep}(?:checks|cks)\b",
400
- "worthless",
401
- ),
402
- RegexSubstitution(
403
- "issue multiple worthless checks",
404
- fr"\b(?:issue|iss){sep}(?:multiple|mltpl){sep}(?:worthless|wrthlss|wrtls){sep}(?:checks|cks)\b",
405
- "worthless",
406
- ),
407
- RegexSubstitution(
408
- "unauthorized",
409
- fr"\b(?:unauth|unau|unauthd)\b",
410
- "unauthorized",
411
- ),
412
- RegexSubstitution(
413
- "child support",
414
- fr"\b(?:child|chld|chi){sep}(?:support|supp|sup)\b",
415
- "child support",
416
- ),
417
- RegexSubstitution(
418
- "unlawful",
419
- r"\b(?:unlawfully|unlaw|unlawfl|unlawf|unlwfl|unl)\b",
420
- "unlawful",
421
- ),
422
- RegexSubstitution(
423
- "Possession",
424
- r"\b(?:possess|poss?)\b",
425
- "possession",
426
- ),
427
- RegexSubstitution(
428
- "Abetting",
429
- r"\b(?:abett|abetted)\b",
430
- "Abetting",
431
- ),
432
- RegexSubstitution("emergency", r"\b(?:emerg|emer)\b", "emergency", priority=20),
433
- RegexSubstitution(
434
- "Attempted",
435
- r"\b(?:att|atmpt)\b",
436
- "attempted",
437
- ),
438
- RegexSubstitution( # NOTE: added negative look ahead so we don't remap "at risk" to "attempted risk"
439
- "Attempted 2",
440
- r"\bat(?! risk)\b",
441
- "attempted",
442
- ),
443
- RegexSubstitution(
444
- "Battery",
445
- r"\bbatt\b",
446
- "battery",
447
- ),
448
- RegexSubstitution(
449
- "Violation of Probation",
450
- r"\bvop\b",
451
- "violation of probation",
452
- ),
453
- RegexSubstitution( # NOTE: removed 'con' because shows up in some DV-related text, may not be a one-size fits all regex / 'consp' to conspiracy or conspire?
454
- "Conspiracy",
455
- r"\b(?:consp|conspi|conspira|conspirc|consprc|consprcy|cnsprcy|conspr)\b",
456
- "conspiracy",
457
- ),
458
- RegexSubstitution(
459
- "Property",
460
- r"\bprop\b",
461
- "property",
462
- ),
463
- RegexSubstitution(
464
- "public disturbance",
465
- fr"\b(?:public|pub|publ){sep}(?:disturbance|dist)\b",
466
- "public disturbance",
467
- ),
468
- RegexSubstitution(
469
- "Criminal",
470
- r"\bcrim\b",
471
- "criminal",
472
- ),
473
- RegexSubstitution(
474
- "License",
475
- r"\blic\b",
476
- "license",
477
- ),
478
- RegexSubstitution(
479
- "Credit Card",
480
- r"\bcc\b",
481
- "credit card",
482
- ),
483
- RegexSubstitution(
484
- "Credit Card 2",
485
- r"\bcred{sep}crd\b",
486
- "credit card",
487
- ),
488
- RegexSubstitution(
489
- "exchange",
490
- r"\bexch\b",
491
- "exchange",
492
- ),
493
- RegexSubstitution(
494
- "electric power",
495
- fr"\belec{sep}pwr\b",
496
- "electric power",
497
- ),
498
- RegexSubstitution(
499
- "commit false", fr"\bcom?{sep}false\b", "commit false", priority=5
500
- ),
501
- # VEHICLE terms ===========
502
- RegexSubstitution(
503
- "Vehicle",
504
- r"\b(?:veh|vehi)\b",
505
- "vehicle",
506
- ),
507
- RegexSubstitution(
508
- "Vehicles",
509
- r"\bvehs\b",
510
- "vehicles",
511
- ),
512
- RegexSubstitution(
513
- "commercial motor vehicle",
514
- r"\bcmv\b",
515
- "commercial motor vehicle",
516
- ),
517
- RegexSubstitution(
518
- "motor vehicle",
519
- fr"\b(?:mtr|mot){sep}(?:vehicle|veh)\b",
520
- "motor vehicle",
521
- ),
522
- RegexSubstitution(
523
- "motor vehicle 2",
524
- fr"\bm{sep}v\b",
525
- "motor vehicle",
526
- ),
527
- RegexSubstitution(
528
- "motor vehicle 3",
529
- fr"\b(?:mtv|mv)\b",
530
- "motor vehicle",
531
- ),
532
- RegexSubstitution("odometer", fr"\bodom\b", "odometer"),
533
- RegexSubstitution(
534
- "red light",
535
- fr"\bred{sep}light\b",
536
- "red light",
537
- ),
538
- RegexSubstitution(
539
- "vehicle sound system",
540
- fr"\bveh{sep}snd{sep}sys\b",
541
- "vehicle sound system",
542
- priority=20,
543
- ),
544
- # =====
545
- RegexSubstitution(
546
- "Assault",
547
- r"\bass?lt\b",
548
- "assault",
549
- ),
550
- RegexSubstitution(
551
- "Assault 2",
552
- r"\bass\b",
553
- "assault",
554
- ),
555
- RegexSubstitution(
556
- "Mentally",
557
- r"\bment\b",
558
- "mentally",
559
- ),
560
- RegexSubstitution(
561
- "mentally ill",
562
- r"\bmnt{sep}ill\b",
563
- "mentally ill",
564
- ),
565
- RegexSubstitution(
566
- "Unknown",
567
- r"\bunk\b",
568
- "unknown",
569
- ),
570
- RegexSubstitution(
571
- "cohabitation",
572
- r"\b(?:coh|cohbt)\b",
573
- "cohabitation",
574
- ),
575
- RegexSubstitution(
576
- "Statement",
577
- r"\bstmt\b",
578
- "statement",
579
- ),
580
- RegexSubstitution(
581
- "Degree",
582
- r"\bdegr?e?\b",
583
- "degree",
584
- ),
585
- RegexSubstitution(
586
- "Felony",
587
- r"\b(?:fe|fel|felo|felny|fl|flny)\b",
588
- "felony",
589
- ),
590
- RegexSubstitution(
591
- "misdemeanor",
592
- r"\bmisd\b",
593
- "misdemeanor",
594
- ),
595
- # AGE
596
- RegexSubstitution(
597
- "years of age",
598
- r"\byoa\b",
599
- "years of age",
600
- ),
601
- RegexSubstitution(
602
- "year",
603
- r"\byr\b",
604
- "year",
605
- ),
606
- RegexSubstitution(
607
- "year 2",
608
- r"(?!\d+)yr\b",
609
- " year",
610
- ),
611
- RegexSubstitution(
612
- "elderly",
613
- r"\beldrly\b",
614
- "elderly",
615
- ),
616
- RegexSubstitution(
617
- "under",
618
- r"\b(?:und|undr)\b",
619
- "under",
620
- ),
621
- # AGE / FEMALE
622
- RegexSubstitution(
623
- "female",
624
- fr"\bfem\b",
625
- "female",
626
- ),
627
- RegexSubstitution(
628
- "age female",
629
- fr"\bage{sep}f\b",
630
- "age female",
631
- ),
632
- RegexSubstitution(
633
- "old female",
634
- fr"\bold{sep}f\b",
635
- "old female",
636
- ),
637
- RegexSubstitution(
638
- "older female",
639
- fr"\bolder{sep}f\b",
640
- "older female",
641
- ),
642
- RegexSubstitution(
643
- "13 female",
644
- fr"\b13{sep}f\b",
645
- "13 female",
646
- ),
647
- RegexSubstitution(
648
- "15 female",
649
- fr"\b15{sep}f\b",
650
- "15 female",
651
- ),
652
- RegexSubstitution(
653
- "17 female",
654
- fr"\b17{sep}f\b",
655
- "17 female",
656
- ),
657
- # AGE / MALE
658
- RegexSubstitution(
659
- "age male",
660
- fr"\bage{sep}m\b",
661
- "age male",
662
- ),
663
- RegexSubstitution(
664
- "old male",
665
- fr"\bold{sep}m\b",
666
- "old male",
667
- ),
668
- RegexSubstitution(
669
- "older male",
670
- fr"\bolder{sep}m\b",
671
- "older male",
672
- ),
673
- RegexSubstitution(
674
- "13 male",
675
- fr"\b13{sep}m\b",
676
- "13 male",
677
- ),
678
- RegexSubstitution(
679
- "15 male",
680
- fr"\b15{sep}m\b",
681
- "15 male",
682
- ),
683
- RegexSubstitution(
684
- "17 male",
685
- fr"\b17{sep}m\b",
686
- "17 male",
687
- ),
688
- # ======
689
- RegexSubstitution(
690
- "Robbery",
691
- r"\brobb\b",
692
- "robbery",
693
- ),
694
- RegexSubstitution(
695
- "Attempted Robbery",
696
- fr"\battempted{sep}(?:rob|robb)\b",
697
- "attempted robbery",
698
- ),
699
- RegexSubstitution(
700
- "Detainer Robbery",
701
- fr"\bdetainer{sep}(?:rob|robb)\b",
702
- "detainer robbery",
703
- ),
704
- RegexSubstitution(
705
- "Aggravated",
706
- r"\b(?:agg|aggrav|aggr|aggravted)\b",
707
- "aggravated",
708
- ),
709
- RegexSubstitution(
710
- "Forced",
711
- r"\bfrc\b",
712
- "forced",
713
- ),
714
- RegexSubstitution(
715
- "Danger",
716
- r"\bdng\b",
717
- "danger",
718
- ),
719
- RegexSubstitution(
720
- "Abetting",
721
- r"\babet\b",
722
- "abetting",
723
- ),
724
- RegexSubstitution(
725
- "Acquaintance",
726
- r"\b(?:acquant|acq|acquaint|acquain)\b",
727
- "acquaintance",
728
- ),
729
- RegexSubstitution(
730
- "Breaking and Entering",
731
- r"\bB ?& ?E\b",
732
- "breaking and entering",
733
- ),
734
- RegexSubstitution("Building", r"\bbldg\b", "building"),
735
- RegexSubstitution(
736
- "Adult",
737
- r"\badlt\b",
738
- "adult",
739
- ),
740
- RegexSubstitution(
741
- "Deliver",
742
- r"\bdel\b",
743
- "deliver",
744
- ),
745
- RegexSubstitution(
746
- "Family",
747
- r"\bfam\b",
748
- "family",
749
- ),
750
- RegexSubstitution(
751
- "Burglary",
752
- r"\bburg\b",
753
- "burglary",
754
- ),
755
- RegexSubstitution(
756
- "Murder",
757
- r"\bmur\b",
758
- "murder",
759
- ),
760
- RegexSubstitution(
761
- "conspiracy to commit",
762
- fr"\bconsp{sep}comm\b",
763
- "conspiracy to commit",
764
- ),
765
- RegexSubstitution(
766
- "Representation",
767
- r"\brep\b",
768
- "representation",
769
- ),
770
- RegexSubstitution(
771
- "Previous",
772
- r"\bprev\b",
773
- "previous",
774
- ),
775
- RegexSubstitution( # TODO revisit this - 'com' can also be 'commit'
776
- "Common",
777
- r"\bcom\b",
778
- "common",
779
- ),
780
- RegexSubstitution(
781
- "of a",
782
- r"\bofa\b",
783
- "of a",
784
- ),
785
- RegexSubstitution( # TODO revisit this - 'viol' relates to 'violation' too
786
- "violent",
787
- r"\bviol\b",
788
- "violent",
789
- ),
790
- RegexSubstitution(
791
- "perform",
792
- r"\bperf\b",
793
- "perform",
794
- ),
795
- RegexSubstitution(
796
- "household",
797
- r"\b(?:hh|hsehld|hhld)\b",
798
- "household",
799
- ),
800
- RegexSubstitution(
801
- "Other",
802
- r"\both\b",
803
- "other",
804
- ),
805
- # WEAPON TERMS =========
806
- RegexSubstitution(
807
- "Weapon", r"\b(?:wea|wpn|weapn|weap|weapo)\b", "weapon", priority=20
808
- ),
809
- RegexSubstitution(
810
- "Weapons", r"\b(?:wea|wpn|weapn|weap|weapo)s\b", "weapons", priority=20
811
- ),
812
- RegexSubstitution("dangerous weapon", r"\b(?:dwpn|dw)\b", "dangerous weapon"),
813
- RegexSubstitution(
814
- "dangerous weapon 2", fr"\bd{sep}(?:w|wpn)\b", "dangerous weapon"
815
- ),
816
- RegexSubstitution(
817
- "concealed weapon", fr"\bconcealed{sep}(?:w|wpn)\b", "concealed weapon"
818
- ),
819
- # HARM terms =======
820
- RegexSubstitution(
821
- "Bodily Harm",
822
- fr"\b(?:bod{sep}ha?rm|bh)\b",
823
- "bodily harm",
824
- ),
825
- RegexSubstitution(
826
- "physical",
827
- fr"\bphy\b",
828
- "physical",
829
- ),
830
- RegexSubstitution(
831
- "harmful",
832
- fr"\bharmfl\b",
833
- "harmful",
834
- ),
835
- RegexSubstitution(
836
- "Great Bodily",
837
- fr"\b(?:gr|grt){sep}bodily\b",
838
- "great bodily",
839
- ),
840
- RegexSubstitution(
841
- "Great Bodily Injury",
842
- fr"\bgbi\b",
843
- "great bodily injury",
844
- ),
845
- RegexSubstitution(
846
- "Substantial Bodily Harm",
847
- r"\bsbh\b",
848
- "substantial bodily harm",
849
- ),
850
- RegexSubstitution(
851
- "injury",
852
- r"\b(?:injry|inj)\b",
853
- "injury",
854
- ),
855
- RegexSubstitution(
856
- "inflict",
857
- r"\binflt\b",
858
- "inflict",
859
- ),
860
- RegexSubstitution(
861
- "Great Bodily Harm",
862
- fr"\bgr{sep}bod{sep}harm\b",
863
- "great bodily harm",
864
- ),
865
- RegexSubstitution(
866
- "Great Bodily Harm 2",
867
- fr"\bgbh\b",
868
- "great bodily harm",
869
- ),
870
- # ====
871
- RegexSubstitution( # TODO: revisit PERS can be person too
872
- "Personal",
873
- r"\bpers\b",
874
- "personal",
875
- ),
876
- RegexSubstitution(
877
- "persons",
878
- r"\bprsns\b",
879
- "persons",
880
- ),
881
- RegexSubstitution(
882
- "person",
883
- r"\b(?:prsn|per|perso)\b",
884
- "person",
885
- ),
886
- RegexSubstitution("election day", fr"\belec{sep}day\b", "election day"),
887
- RegexSubstitution(
888
- "temporary",
889
- r"\btemp\b",
890
- "temporary",
891
- ),
892
- RegexSubstitution(
893
- "improper",
894
- r"\bimprop\b",
895
- "improper",
896
- ),
897
- RegexSubstitution(
898
- "false",
899
- r"\bfls\b",
900
- "false",
901
- ),
902
- RegexSubstitution(
903
- "responsibility",
904
- r"\bresp\b",
905
- "responsibility",
906
- ),
907
- RegexSubstitution(
908
- "advertise",
909
- r"\bad\b",
910
- "advertise",
911
- ),
912
- RegexSubstitution(
913
- "imprisonment",
914
- r"\b(?:imprison|impris|imprsn)\b",
915
- "imprisonment",
916
- ),
917
- RegexSubstitution(
918
- "prohibited",
919
- r"\bproh\b",
920
- "prohibited",
921
- ),
922
- RegexSubstitution(
923
- "under influence",
924
- fr"\bunder{sep}(?:infl|influ)\b",
925
- "under influence",
926
- priority=5,
927
- ),
928
- RegexSubstitution(
929
- "stolen",
930
- r"\bstln\b",
931
- "stolen",
932
- ),
933
- RegexSubstitution(
934
- "years",
935
- r"\byrs\b",
936
- "years",
937
- ),
938
- RegexSubstitution(
939
- "intent",
940
- r"\bint\b",
941
- "intent",
942
- ),
943
- RegexSubstitution(
944
- "passage",
945
- r"\bpassg\b",
946
- "passage",
947
- ),
948
- RegexSubstitution(
949
- "withdraw",
950
- r"\bwit\b",
951
- "withdraw",
952
- ),
953
- RegexSubstitution(
954
- "manufacturing or delivering",
955
- r"\bman\Wdel\b",
956
- "manufacturing delivering",
957
- ),
958
- RegexSubstitution( # Revisit this
959
- "minimum mandatory",
960
- r"\bmin\Wman\b",
961
- "minimum mandatory",
962
- ),
963
- RegexSubstitution(
964
- "stranger",
965
- r"\bstr(?:ngr)?\b",
966
- "stranger",
967
- ),
968
- RegexSubstitution(
969
- "personal use",
970
- r"\bpers use\b",
971
- "personal use",
972
- ),
973
- RegexSubstitution(
974
- "force",
975
- r"\bfo?rc\b",
976
- "force",
977
- ),
978
- RegexSubstitution(
979
- "operate",
980
- r"\b(?:oper|op|opr)\b",
981
- "operate",
982
- ),
983
- RegexSubstitution(
984
- "occupied",
985
- r"\bocc\b",
986
- "occupied",
987
- ),
988
- RegexSubstitution(
989
- "health care facility",
990
- r"\bhealth{sep}care{sep}fac\b",
991
- "health care facility",
992
- priority=5,
993
- ),
994
- RegexSubstitution(
995
- "residence",
996
- r"\bres\b",
997
- "residence",
998
- ),
999
- RegexSubstitution(
1000
- "terrorism threats",
1001
- fr"\bterr{sep}(?:thre|thrts)\b",
1002
- "terrorism threats",
1003
- ),
1004
- RegexSubstitution(
1005
- "false report",
1006
- fr"\bfals{sep}rprt\b",
1007
- "false report",
1008
- ),
1009
- RegexSubstitution(
1010
- "government",
1011
- r"\bgovt\b",
1012
- "government",
1013
- ),
1014
- RegexSubstitution(
1015
- "advocating",
1016
- r"\badvoc\b",
1017
- "advocating",
1018
- ),
1019
- RegexSubstitution(
1020
- "government property",
1021
- r"\bgov{sep}property\b",
1022
- "government property",
1023
- ),
1024
- RegexSubstitution(
1025
- "general assembly",
1026
- r"\bgen{sep}assembly\b",
1027
- "general assembly",
1028
- ),
1029
- RegexSubstitution( # NOTE: added a negative lookbehind because we were seeing "by off" when updating statutory rape terms & "by offense" is not correct
1030
- "offense",
1031
- fr"\b(?<!by )(?:offense|offen|off|offe)\b",
1032
- "offense",
1033
- ),
1034
- RegexSubstitution(
1035
- "information",
1036
- fr"\b(?:info|infor)\b",
1037
- "information",
1038
- ),
1039
- # LEWD charge cat
1040
- RegexSubstitution(
1041
- "pornography",
1042
- fr"\b(?:porn|porno)\b",
1043
- "pornography",
1044
- ),
1045
- RegexSubstitution(
1046
- "compelling",
1047
- fr"\bcompel\b",
1048
- "compelling",
1049
- ),
1050
- RegexSubstitution(
1051
- "prostitution",
1052
- fr"\bprostit\b",
1053
- "prostitution",
1054
- ),
1055
- RegexSubstitution(
1056
- "computer",
1057
- fr"\bcomputr\b",
1058
- "computer",
1059
- ),
1060
- RegexSubstitution(
1061
- "incapable",
1062
- fr"\bincap\b",
1063
- "incapable",
1064
- ),
1065
- RegexSubstitution(
1066
- "juvenile",
1067
- fr"\b(?:juv|juven)\b",
1068
- "juvenile",
1069
- ),
1070
- RegexSubstitution(
1071
- "involving",
1072
- fr"\b(?:involv|invlv)\b",
1073
- "involving",
1074
- ),
1075
- RegexSubstitution(
1076
- "equipment",
1077
- fr"\bequip\b",
1078
- "equipment",
1079
- ),
1080
- RegexSubstitution(
1081
- "hazardous",
1082
- fr"\bhaz\b",
1083
- "hazardous",
1084
- ),
1085
- RegexSubstitution( # NOTE: assault and battery unless A,B is followed by C
1086
- "assault and battery",
1087
- fr"\b(?:a\&b|a{sep}b|a \& b|ab)(?!c)\b",
1088
- "assault and battery",
1089
- ),
1090
- RegexSubstitution( # NOTE: assault and battery unless A,B is followed by C
1091
- "assault and battery 2",
1092
- fr"\b(?:a\&b|a{sep}b|a \& b|ab)(?!\Wc)\b",
1093
- "assault and battery",
1094
- ),
1095
- RegexSubstitution( # NOTE: assault and battery unless A,B is followed by C
1096
- "assault and battery 2",
1097
- fr"\b(?:a\&b|a{sep}b|a \& b|ab)(?! c)\b",
1098
- "assault and battery",
1099
- ),
1100
- RegexSubstitution(
1101
- "promote distribution",
1102
- fr"\bpromote{sep}distrb\b",
1103
- "promote distribution",
1104
- ),
1105
- RegexSubstitution(
1106
- "child molestation first degree",
1107
- fr"\b(?:child|chld|ch){sep}(?:molestation|molest|mol){sep}1\b",
1108
- "child molestation first degree",
1109
- ),
1110
- RegexSubstitution(
1111
- "child molestation second degree",
1112
- fr"\b(?:child|chld|ch){sep}(?:molestation|molest|mol){sep}2\b",
1113
- "child molestation second degree",
1114
- ),
1115
- RegexSubstitution(
1116
- "child molestation third degree",
1117
- fr"\b(?:child|chld|ch){sep}(?:molestation|molest|mol){sep}3\b",
1118
- "child molestation third degree",
1119
- ),
1120
- RegexSubstitution(
1121
- "child molestation",
1122
- fr"\b(?:child|chld|ch){sep}(?:molestation|molest|mol)\b",
1123
- "child molestation",
1124
- priority=5,
1125
- ),
1126
- RegexSubstitution(
1127
- "molestation",
1128
- fr"\b(?:molestation|molest|mol)\b",
1129
- "molestation",
1130
- ),
1131
- RegexSubstitution(
1132
- "indecent conduct exposure",
1133
- fr"\bind{sep}cond{sep}expos\b",
1134
- "indecent conduct exposure",
1135
- ),
1136
- RegexSubstitution(
1137
- "indecent",
1138
- fr"\bindec\b",
1139
- "indecent",
1140
- ),
1141
- RegexSubstitution(
1142
- "indecent liberties",
1143
- fr"\bind{sep}lib\b",
1144
- "indecent liberties",
1145
- ),
1146
- RegexSubstitution(
1147
- "moving",
1148
- fr"\bmov\b",
1149
- "moving",
1150
- ),
1151
- RegexSubstitution(
1152
- "depiction",
1153
- fr"\bdptn\b",
1154
- "depiction",
1155
- ),
1156
- RegexSubstitution(
1157
- "child luring",
1158
- fr"\bchil{sep}lrng\b",
1159
- "child luring",
1160
- ),
1161
- RegexSubstitution(
1162
- "dissemination",
1163
- fr"\b(?:dissm|dissem)\b",
1164
- "dissemination",
1165
- ),
1166
- RegexSubstitution(
1167
- "possession of depictions of minor engaged in sexually explicit conduct",
1168
- fr"\bposs{sep}(?:depict|dep){sep}(?:minor|min){sep}eng{sep}sex{sep}(?:exp|expct){sep}conduct\b",
1169
- "possession of depictions of minor engaged in sexually explicit conduct",
1170
- priority=3,
1171
- ),
1172
- RegexSubstitution(
1173
- "dealing of depictions of minor engaged in sexually explicit conduct",
1174
- fr"\bdeal{sep}(?:depict|dep){sep}(?:minor|min){sep}eng{sep}sex{sep}(?:exp|expct){sep}conduct\b",
1175
- "dealing of depictions of minor engaged in sexually explicit conduct",
1176
- priority=3,
1177
- ),
1178
- RegexSubstitution(
1179
- "viewing of depictions of minor engaged in sexually explicit conduct",
1180
- fr"\bview{sep}(?:depict|dep){sep}(?:minor|min){sep}eng{sep}sex{sep}(?:exp|expct){sep}conduct\b",
1181
- "viewing of depictions of minor engaged in sexually explicit conduct",
1182
- priority=3,
1183
- ),
1184
- RegexSubstitution(
1185
- "online sexual corruption of a child",
1186
- fr"\bonline{sep}sex{sep}corrupt{sep}child\b",
1187
- "online sexual corruption of a child",
1188
- ),
1189
- RegexSubstitution(
1190
- "lewd or lascivious act",
1191
- fr"\b(?:L\&L|L{sep}L)\b",
1192
- "lewd or lascivious act",
1193
- ),
1194
- RegexSubstitution(
1195
- "exposure",
1196
- r"\bexpos\b",
1197
- "exposure",
1198
- ),
1199
- # SEXUAL OFFENSES =====
1200
- RegexSubstitution(
1201
- "Criminal Sexual Conduct",
1202
- r"\bcsc\b",
1203
- "criminal sexual conduct",
1204
- ),
1205
- RegexSubstitution(
1206
- "sexual",
1207
- r"\bsexl\b",
1208
- "sexual",
1209
- ),
1210
- RegexSubstitution(
1211
- "explicit",
1212
- r"\bexplct\b",
1213
- "explicit",
1214
- ),
1215
- RegexSubstitution(
1216
- "sexual offense",
1217
- fr"\b(?:sexual|sex){sep}(?:offense|offen|off)\b",
1218
- "sexual offense",
1219
- ),
1220
- RegexSubstitution(
1221
- "sexual offenses",
1222
- fr"\b(?:sexual|sex){sep}(?:offense|offen|off)s\b",
1223
- "sexual offenses",
1224
- ),
1225
- RegexSubstitution(
1226
- "sexual assault",
1227
- fr"\b(?:sexual|sex){sep}(?:assault|assult|assualt|ass|asst)\b",
1228
- "sexual assault",
1229
- ),
1230
- RegexSubstitution(
1231
- "sexual contact",
1232
- fr"\b(?:sexual|sex){sep}(?:contact)\b",
1233
- "sexual contact",
1234
- ),
1235
- RegexSubstitution(
1236
- "sexual act",
1237
- fr"\b(?:sexual|sex){sep}(?:act|acts)\b",
1238
- "sexual act",
1239
- ),
1240
- RegexSubstitution(
1241
- "sexual act 2",
1242
- fr"\bsxact\b",
1243
- "sexual act",
1244
- ),
1245
- RegexSubstitution(
1246
- "sexual abuse",
1247
- fr"\b(?:sexual|sex){sep}(?:abuse|ab)\b",
1248
- "sexual abuse",
1249
- ),
1250
- RegexSubstitution(
1251
- "commit sex abuse",
1252
- fr"\bcomm{sep}sex{sep}abuse\b",
1253
- "commit sex abuse",
1254
- ),
1255
- RegexSubstitution(
1256
- "commit sex act",
1257
- fr"\bcomm{sep}sex{sep}act\b",
1258
- "commit sex act",
1259
- ),
1260
- RegexSubstitution(
1261
- "commit sex abuse minor",
1262
- fr"\bcommsexabuseminor\b",
1263
- "commit sex abuse minor",
1264
- priority=20,
1265
- ),
1266
- RegexSubstitution(
1267
- "sexual battery",
1268
- fr"\b(?:sexual|sex){sep}(?:battery|batt|bat)\b",
1269
- "sexual battery",
1270
- ),
1271
- RegexSubstitution( # TODO: should these actually map to "sexual misconduct"?
1272
- "sexual conduct",
1273
- fr"\b(?:sexual|sex){sep}(?:conduct|cndct|cond|con)\b",
1274
- "sexual conduct",
1275
- ),
1276
- RegexSubstitution(
1277
- "sexual penetration",
1278
- fr"\b(?:sexual|sex){sep}(?:penetration|pen)\b",
1279
- "sexual penetration",
1280
- ),
1281
- RegexSubstitution( # TODO: Revisit - hard to tell if exp/expl maps to "exploitation" or "explicit"
1282
- "sexual exploitation",
1283
- fr"\b(?:sexual|sex){sep}(?:exploitation|exploit)\b",
1284
- "sexual exploitation",
1285
- ),
1286
- RegexSubstitution(
1287
- "sexual performance",
1288
- fr"\b(?:sexual|sex){sep}(?:performance|perform)\b",
1289
- "sexual performance",
1290
- ),
1291
- RegexSubstitution(
1292
- "sexual imposition",
1293
- fr"\b(?:sexual|sex){sep}(?:imposition|imp)\b",
1294
- "sexual imposition",
1295
- ),
1296
- RegexSubstitution(
1297
- "sex with",
1298
- fr"\bsex{sep}w\b",
1299
- "sex with",
1300
- ),
1301
- RegexSubstitution( # TODO: Revisit - hard to tell if offen/off maps to "offender" or "offense"
1302
- "sex offender",
1303
- fr"\b(?:sexual|sex){sep}(?:offender|offend|offndr|ofndr)\b",
1304
- "sex offender",
1305
- ),
1306
- RegexSubstitution(
1307
- "sexual predator",
1308
- fr"\b(?:sexual|sex){sep}(?:predator|pred)\b",
1309
- "sexual predator",
1310
- ),
1311
- RegexSubstitution(
1312
- "voluntary sexual relations",
1313
- fr"\bvol{sep}sex{sep}rel\b",
1314
- "voluntary sexual relations",
1315
- ),
1316
- RegexSubstitution(
1317
- "sex related",
1318
- fr"\bsex{sep}(?:reltd|rel)\b",
1319
- "sex related",
1320
- ),
1321
- RegexSubstitution(
1322
- "sex related 2",
1323
- fr"\bsexreltd\b",
1324
- "sex related",
1325
- ),
1326
- RegexSubstitution(
1327
- "statutory rape",
1328
- fr"\bstat{sep}rape\b",
1329
- "statutory rape",
1330
- ),
1331
- RegexSubstitution(
1332
- "rape first degree",
1333
- fr"\brape{sep}(?:1|1st|i)\b",
1334
- "rape first degree",
1335
- ),
1336
- RegexSubstitution(
1337
- "rape second degree",
1338
- fr"\brape{sep}(?:2|2nd|ii)\b",
1339
- "rape second degree",
1340
- ),
1341
- RegexSubstitution(
1342
- "rape third degree",
1343
- fr"\brape{sep}(?:3|3rd|iii)\b",
1344
- "rape third degree",
1345
- ),
1346
- RegexSubstitution(
1347
- "sodomy first degree",
1348
- fr"\bsodomy{sep}(?:1|1st|i)\b",
1349
- "sodomy first degree",
1350
- ),
1351
- RegexSubstitution(
1352
- "sodomy second degree",
1353
- fr"\bsodomy{sep}(?:2|2nd|ii)\b",
1354
- "sodomy second degree",
1355
- ),
1356
- RegexSubstitution(
1357
- "sodomy third degree",
1358
- fr"\bsodomy{sep}(?:3|3rd|iii)\b",
1359
- "sodomy third degree",
1360
- ),
1361
- RegexSubstitution(
1362
- "incest first degree",
1363
- fr"\bincest{sep}(?:1|1st|i)\b",
1364
- "incest first degree",
1365
- ),
1366
- RegexSubstitution(
1367
- "incest second degree",
1368
- fr"\bincest{sep}(?:2|2nd|ii)\b",
1369
- "incest second degree",
1370
- ),
1371
- RegexSubstitution(
1372
- "sex first degree",
1373
- fr"\bsex{sep}(?:1|1st|i)\b",
1374
- "sex first degree",
1375
- ),
1376
- RegexSubstitution(
1377
- "sex second degree",
1378
- fr"\bsex{sep}(?:2|2nd|ii)\b",
1379
- "sex second degree",
1380
- ),
1381
- RegexSubstitution(
1382
- "criminal sexual conduct first degree",
1383
- fr"\bcsc{sep}(?:1|1st|i)\b",
1384
- "criminal sexual conduct first degree",
1385
- priority=5,
1386
- ),
1387
- RegexSubstitution(
1388
- "criminal sexual conduct second degree",
1389
- fr"\bcsc{sep}(?:2|2nd|ii)\b",
1390
- "criminal sexual conduct second degree",
1391
- priority=5,
1392
- ),
1393
- RegexSubstitution(
1394
- "criminal sexual conduct third degree",
1395
- fr"\bcsc{sep}(?:3|3rd|ii)\b",
1396
- "criminal sexual conduct third degree",
1397
- priority=5,
1398
- ),
1399
- RegexSubstitution(
1400
- "criminal sexual conduct fourth degree",
1401
- fr"\bcsc{sep}(?:4|4th|iv)\b",
1402
- "criminal sexual conduct fourth degree",
1403
- priority=5,
1404
- ),
1405
- RegexSubstitution(
1406
- "sodomy",
1407
- r"\bsod\b",
1408
- "sodomy",
1409
- ),
1410
- RegexSubstitution(
1411
- "engage sexual act",
1412
- fr"\benga{sep}sex{sep}act\b",
1413
- "engage sexual act",
1414
- ),
1415
- RegexSubstitution(
1416
- "engage sexual act 2",
1417
- fr"\beng{sep}sex\b",
1418
- "engage sexual act",
1419
- ),
1420
- RegexSubstitution("no force", fr"\bno{sep}frc\b", "no force", priority=5),
1421
- RegexSubstitution(
1422
- "force or coercion",
1423
- fr"\bfrc{sep}or{sep}coercn\b",
1424
- "force or coercion",
1425
- priority=5,
1426
- ),
1427
- RegexSubstitution(
1428
- "coercion",
1429
- fr"\b(?:coer|coercn)\b",
1430
- "coercion",
1431
- ),
1432
- RegexSubstitution(
1433
- "position of authority",
1434
- fr"\bpos{sep}auth\b",
1435
- "position of authority",
1436
- priority=4,
1437
- ),
1438
- RegexSubstitution(
1439
- "position of authority 2",
1440
- fr"\bpos{sep}of{sep}auth\b",
1441
- "position of authority",
1442
- priority=4,
1443
- ),
1444
- RegexSubstitution(
1445
- "person in authority",
1446
- fr"\bper{sep}aut\b",
1447
- "person in authority",
1448
- priority=4,
1449
- ),
1450
- RegexSubstitution(
1451
- "other family",
1452
- fr"\b(?:othr|oth|other){sep}(?:family|fam)\b",
1453
- "other family",
1454
- priority=4,
1455
- ),
1456
- RegexSubstitution(
1457
- "immoral",
1458
- fr"\b(?:immoral|imoral|imm|imor)\b",
1459
- "immoral",
1460
- priority=4,
1461
- ),
1462
- RegexSubstitution(
1463
- "purpose",
1464
- fr"\bpurp\b",
1465
- "purpose",
1466
- priority=4,
1467
- ),
1468
- RegexSubstitution(
1469
- "communication with minor for immoral purpose",
1470
- fr"\b(?:communication|comm|com){sep}(?:with|w){sep}(?:minor|min){sep}(?:immoral|imoral|imm|imor)\b",
1471
- "communication with minor for immoral purpose",
1472
- priority=4,
1473
- ),
1474
- RegexSubstitution(
1475
- "communication with minor for immoral purpose 2",
1476
- fr"\bcomm{sep}minor{sep}imm\b",
1477
- "communication with minor for immoral purpose",
1478
- priority=4,
1479
- ),
1480
- RegexSubstitution(
1481
- "communication with minor",
1482
- fr"\bcom{sep}w{sep}minor\b",
1483
- "communication with minor",
1484
- priority=4,
1485
- ),
1486
- # EMBEZZLEMENT ===
1487
- RegexSubstitution(
1488
- "Embezzlement",
1489
- r"\b(?:embezzle|embezz|embez|embzzlmnt|embz)\b",
1490
- "embezzlement",
1491
- ),
1492
- RegexSubstitution(
1493
- "real estate",
1494
- fr"\breal{sep}estat\b",
1495
- "real estate",
1496
- ),
1497
- RegexSubstitution(
1498
- "chattel",
1499
- r"\bchatl\b",
1500
- "chattel",
1501
- ),
1502
- RegexSubstitution(
1503
- "received",
1504
- r"\b(?:receiv|rcvd)\b",
1505
- "received",
1506
- ),
1507
- RegexSubstitution(
1508
- "mortgagor",
1509
- r"\bmortgr\b",
1510
- "mortgagor",
1511
- ),
1512
- RegexSubstitution(
1513
- "agreement",
1514
- r"\bagrmnt\b",
1515
- "agreement",
1516
- ),
1517
- RegexSubstitution(
1518
- "public",
1519
- fr"\b(?:pub|publ|pblc)\b",
1520
- "public",
1521
- ),
1522
- RegexSubstitution(
1523
- "behavior",
1524
- r"\bbehav\b",
1525
- "behavior",
1526
- ),
1527
- RegexSubstitution(
1528
- "private",
1529
- r"\bpriv\b",
1530
- "private",
1531
- ),
1532
- RegexSubstitution(
1533
- "corporation",
1534
- fr"\bcorp\b",
1535
- "corporation",
1536
- ),
1537
- RegexSubstitution(
1538
- "purchase",
1539
- fr"\bpurc\b",
1540
- "purchase",
1541
- ),
1542
- RegexSubstitution( # NOTE: pol may also be police - saw pol dog for example (police dog)
1543
- "political",
1544
- fr"\b(?:pol|polit|politcl)\b",
1545
- "political",
1546
- ),
1547
- RegexSubstitution("police dog", fr"\bpol{sep}dog\b", "police dog", priority=5),
1548
- RegexSubstitution(
1549
- "payroll",
1550
- fr"\bpayrll\b",
1551
- "payroll",
1552
- ),
1553
- RegexSubstitution(
1554
- "law enforcement",
1555
- fr"\blaw{sep}enf\b",
1556
- "law enforcement",
1557
- ),
1558
- RegexSubstitution(
1559
- "incident",
1560
- fr"\bincdnt\b",
1561
- "incident",
1562
- ),
1563
- RegexSubstitution(
1564
- "report",
1565
- fr"\brept\b",
1566
- "report",
1567
- ),
1568
- RegexSubstitution(
1569
- "transfer",
1570
- fr"\btrnsf\b",
1571
- "transfer",
1572
- ),
1573
- RegexSubstitution(
1574
- "capital assets",
1575
- fr"\bcptl{sep}asts\b",
1576
- "capital assets",
1577
- ),
1578
- RegexSubstitution(
1579
- "clerk of court",
1580
- fr"\bclrk{sep}of{sep}crt\b",
1581
- "clerk of court",
1582
- ),
1583
- RegexSubstitution(
1584
- "insufficient",
1585
- fr"\binsuf\b",
1586
- "insufficient",
1587
- ),
1588
- RegexSubstitution(
1589
- "corporate officer", fr"\bcorp{sep}officer\b", "corporate officer", priority=5
1590
- ),
1591
- RegexSubstitution(
1592
- "institution",
1593
- fr"\b(?:instit|inst)\b",
1594
- "institution",
1595
- ),
1596
- RegexSubstitution(
1597
- "organization",
1598
- fr"\borg\b",
1599
- "organization",
1600
- ),
1601
- RegexSubstitution(
1602
- "animals",
1603
- fr"\banmls\b",
1604
- "animals",
1605
- ),
1606
- RegexSubstitution(
1607
- "animal",
1608
- fr"\banml\b",
1609
- "animal",
1610
- ),
1611
- RegexSubstitution(
1612
- "software",
1613
- fr"\bsoftwr\b",
1614
- "software",
1615
- ),
1616
- RegexSubstitution(
1617
- "transit or service bus",
1618
- fr"\btrans{sep}serv{sep}bus\b",
1619
- "transit or service bus",
1620
- ),
1621
- RegexSubstitution(
1622
- "insurance agent",
1623
- fr"\binsur{sep}agent\b",
1624
- "insurance agent",
1625
- ),
1626
- RegexSubstitution(
1627
- "official",
1628
- fr"\b(?:offic|offl|offcl|officl)\b",
1629
- "official",
1630
- ),
1631
- RegexSubstitution( # TODO: is 'misapp' ... misappropriation or misapplication?
1632
- "misappropriation",
1633
- fr"\b(?:misappro|misapp)\b",
1634
- "misappropriation",
1635
- ),
1636
- RegexSubstitution(
1637
- "misapplication",
1638
- fr"\bmisapl\b",
1639
- "misappropriation",
1640
- ),
1641
- RegexSubstitution(
1642
- "fiduciary",
1643
- fr"\bfiduc\b",
1644
- "fiduciary",
1645
- ),
1646
- RegexSubstitution(
1647
- "financial",
1648
- fr"\bfinan\b",
1649
- "financial",
1650
- ),
1651
- RegexSubstitution(
1652
- "funds",
1653
- fr"\bfnds\b",
1654
- "funds",
1655
- ),
1656
- # FELONY - UNSPECIFIED terms
1657
- RegexSubstitution(
1658
- "rendering assistance",
1659
- fr"\brend{sep}assist\b",
1660
- "rendering assistance",
1661
- priority=5,
1662
- ),
1663
- RegexSubstitution(
1664
- "criminal assistance",
1665
- fr"\b(?:crim|criminal){sep}assist\b",
1666
- "criminal assistance",
1667
- priority=4,
1668
- ),
1669
- RegexSubstitution(
1670
- "consummate",
1671
- fr"\b(?:consu|consummat)\b",
1672
- "consummate",
1673
- priority=4,
1674
- ),
1675
- RegexSubstitution(
1676
- "deliver",
1677
- fr"\bdelive\b",
1678
- "deliver",
1679
- priority=4,
1680
- ),
1681
- RegexSubstitution(
1682
- "to commit",
1683
- fr"\bto{sep}comm\b",
1684
- "to commit",
1685
- priority=4,
1686
- ),
1687
- RegexSubstitution(
1688
- "violation of",
1689
- fr"\b(?:viol?|vio){sep}of\b",
1690
- "violation of",
1691
- priority=4,
1692
- ),
1693
- RegexSubstitution(
1694
- "violation of civil",
1695
- fr"\bvol?{sep}civil\b",
1696
- "violation of civil",
1697
- priority=4,
1698
- ),
1699
- RegexSubstitution("rendering", fr"\brend\b", "rendering"),
1700
- RegexSubstitution(
1701
- "assistance first degree",
1702
- fr"\bassistance{sep}1\b",
1703
- "assistance first degree",
1704
- priority=30,
1705
- ),
1706
- RegexSubstitution(
1707
- "assistance second degree",
1708
- fr"\bassistance{sep}2\b",
1709
- "assistance second degree",
1710
- priority=30,
1711
- ),
1712
- RegexSubstitution(
1713
- "assistance third degree",
1714
- fr"\bassistance{sep}3\b",
1715
- "assistance third degree",
1716
- priority=30,
1717
- ),
1718
- RegexSubstitution(
1719
- "class",
1720
- fr"\bclas\b",
1721
- "class",
1722
- ),
1723
- RegexSubstitution(
1724
- "accessory",
1725
- fr"\b(?:accessry|accsry)\b",
1726
- "accessory",
1727
- ),
1728
- RegexSubstitution(
1729
- "dependency",
1730
- fr"\bdepndncy\b",
1731
- "dependency",
1732
- ),
1733
- RegexSubstitution(
1734
- "unspecified",
1735
- fr"\bunspfd\b",
1736
- "unspecified",
1737
- ),
1738
- RegexSubstitution(
1739
- "responsibility",
1740
- fr"\brespon?\b",
1741
- "responsibility",
1742
- ),
1743
- RegexSubstitution(
1744
- "classification",
1745
- fr"\bclassif\b",
1746
- "classification",
1747
- ),
1748
- RegexSubstitution(
1749
- "vice president",
1750
- fr"\bvp\b",
1751
- "vice president",
1752
- priority=30,
1753
- ),
1754
- # BRIBERY terms
1755
- RegexSubstitution(
1756
- "personal",
1757
- fr"\bpersona\b",
1758
- "personal",
1759
- ),
1760
- RegexSubstitution(
1761
- "assistance",
1762
- fr"\basst\b",
1763
- "assistance",
1764
- ),
1765
- RegexSubstitution(
1766
- "service",
1767
- fr"\bserv\b",
1768
- "service",
1769
- ),
1770
- RegexSubstitution(
1771
- "facilitation",
1772
- fr"\b(?:facil|fac)\b",
1773
- "facilitation",
1774
- ),
1775
- RegexSubstitution(
1776
- "smuggling",
1777
- fr"\bsmug\b",
1778
- "smuggling",
1779
- ),
1780
- RegexSubstitution(
1781
- "health",
1782
- fr"\bhlth\b",
1783
- "health",
1784
- ),
1785
- RegexSubstitution( # NOTE: 'off' tends to be 'offense' hence the priority on this one
1786
- "official position", fr"\boff{sep}position\b", "official position", priority=5
1787
- ),
1788
- RegexSubstitution(
1789
- "participants",
1790
- fr"\bparticipnts\b",
1791
- "participants",
1792
- ),
1793
- RegexSubstitution(
1794
- "contestant",
1795
- fr"\bcntst\b",
1796
- "contestant",
1797
- ),
1798
- RegexSubstitution(
1799
- "accept",
1800
- fr"\baccpt\b",
1801
- "accept",
1802
- ),
1803
- RegexSubstitution(
1804
- "campaign contribution",
1805
- fr"\bcamp{sep}cont\b",
1806
- "campaign contribution",
1807
- ),
1808
- RegexSubstitution(
1809
- "influence",
1810
- fr"\b(?:inflnce|influenc)\b",
1811
- "influence",
1812
- ),
1813
- RegexSubstitution(
1814
- "compensation",
1815
- fr"\bcompens\b",
1816
- "compensation",
1817
- ),
1818
- RegexSubstitution(
1819
- "treatment",
1820
- fr"\btreatm\b",
1821
- "treatment",
1822
- ),
1823
- RegexSubstitution(
1824
- "commercial bribe",
1825
- fr"\b(?:comm|comm\'l){sep}bribe\b",
1826
- "commercial bribe",
1827
- ),
1828
- RegexSubstitution(
1829
- "false testimony",
1830
- fr"\bfalse{sep}test\b",
1831
- "false testimony",
1832
- ),
1833
- RegexSubstitution(
1834
- "miscellaneous",
1835
- fr"\bmisc\b",
1836
- "miscellaneous",
1837
- ),
1838
- RegexSubstitution(
1839
- "impersonating",
1840
- fr"\bimpers\b",
1841
- "impersonating",
1842
- ),
1843
- RegexSubstitution(
1844
- "receiving",
1845
- fr"\brecv\b",
1846
- "receiving",
1847
- ),
1848
- RegexSubstitution(
1849
- "interfere with official process",
1850
- fr"\binterfere{sep}w{sep}offc{sep}proc\b",
1851
- "interfere with official process",
1852
- priority=5,
1853
- ),
1854
- RegexSubstitution("public record", fr"\b(?:public|pub){sep}rec\b", "public record"),
1855
- RegexSubstitution(
1856
- "public servant",
1857
- fr"\b(?:public|pub){sep}(?:servant|srv|srvnt)\b",
1858
- "public servant",
1859
- ),
1860
- RegexSubstitution( # NOTE: 'wit' also maps to 'withdraw', hence priority here
1861
- "witness juror",
1862
- fr"\b(?:witness|wit){sep}(?:juror|jur)\b",
1863
- "witness juror",
1864
- priority=5,
1865
- ),
1866
- RegexSubstitution(
1867
- "umpire referee", fr"\b(?:umpire|ump){sep}(?:referee|ref)\b", "umpire referee"
1868
- ),
1869
- # FAMILY RELATED OFFENSES
1870
- RegexSubstitution(
1871
- "custody interference",
1872
- fr"\bcust{sep}inter\b",
1873
- "custody interference",
1874
- ),
1875
- RegexSubstitution(
1876
- "custody interference second degree",
1877
- fr"\bcust{sep}inter{sep}2\b",
1878
- "custody interference second degree",
1879
- priority=5,
1880
- ),
1881
- RegexSubstitution(
1882
- "abandonment",
1883
- fr"\babandonmnt\b",
1884
- "abandonment",
1885
- ),
1886
- RegexSubstitution(
1887
- "unattended",
1888
- fr"\bunatt\b",
1889
- "unattended",
1890
- ),
1891
- RegexSubstitution(
1892
- "endanger",
1893
- fr"\b(?:endngr|endgr|endang)\b",
1894
- "endanger",
1895
- ),
1896
- RegexSubstitution(
1897
- "welfare",
1898
- fr"\b(?:wlfre|wlfr)\b",
1899
- "welfare",
1900
- ),
1901
- RegexSubstitution(
1902
- "endanger welfare",
1903
- fr"\b(?:endngr|endgr|endang){sep}(?:wlfre|wlfr|wel)\b",
1904
- "endanger welfare",
1905
- ),
1906
- RegexSubstitution(
1907
- "neglect",
1908
- fr"\bneglct\b",
1909
- "neglect",
1910
- ),
1911
- RegexSubstitution(
1912
- "contribute",
1913
- fr"\bcontrib\b",
1914
- "contribute",
1915
- ),
1916
- RegexSubstitution(
1917
- "delinquincy",
1918
- fr"\b(?:dlnqncy|delinq)\b",
1919
- "delinquincy",
1920
- ),
1921
- RegexSubstitution(
1922
- "service",
1923
- fr"\bsrvc\b",
1924
- "service",
1925
- ),
1926
- RegexSubstitution(
1927
- "misrepresentation",
1928
- fr"\bmisrep\b",
1929
- "misrepresentation",
1930
- ),
1931
- RegexSubstitution(
1932
- "disabled",
1933
- fr"\bdisabld\b",
1934
- "disabled",
1935
- ),
1936
- # ===
1937
- RegexSubstitution(
1938
- "system of records exempt",
1939
- fr"\bsor{sep}exempt\b",
1940
- "system of records exempt",
1941
- ),
1942
- RegexSubstitution(
1943
- "type",
1944
- r"\btyp\b",
1945
- "type",
1946
- ),
1947
- RegexSubstitution(
1948
- "misconduct",
1949
- r"\b(?:miscond|miscon)\b",
1950
- "misconduct",
1951
- ),
1952
- RegexSubstitution(
1953
- "mischief",
1954
- r"\bmisch\b",
1955
- "mischief",
1956
- ),
1957
- RegexSubstitution(
1958
- "probation revocation",
1959
- fr"\bprob{sep}(?:rev|revo)\b",
1960
- "probation revocation",
1961
- ),
1962
- RegexSubstitution(
1963
- "management",
1964
- r"\bmgmt\b",
1965
- "management",
1966
- ),
1967
- RegexSubstitution(
1968
- "subsistence",
1969
- r"\bsubsist\b",
1970
- "subsistence",
1971
- ),
1972
- RegexSubstitution(
1973
- "penalty group",
1974
- r"\bpg\b",
1975
- "penalty group",
1976
- ),
1977
- RegexSubstitution(
1978
- "community custody",
1979
- r"\bcomm custody\b",
1980
- "community custody",
1981
- ),
1982
- RegexSubstitution(
1983
- "contempt",
1984
- r"\bcntmpt\b",
1985
- "contempt",
1986
- ),
1987
- RegexSubstitution(
1988
- "counterfeit",
1989
- r"\b(?:cntft|cntrft|cntrfeit|cnterft|contrft|contrfit)\b",
1990
- "counterfeit",
1991
- ),
1992
- RegexSubstitution(
1993
- "counts",
1994
- r"\b(?:cts|cnts)\b",
1995
- "counts",
1996
- ),
1997
- RegexSubstitution(
1998
- "victim",
1999
- r"\b(?:vict|vctm|vic)\b",
2000
- "victim",
2001
- ),
2002
- # NUMBER TERMS ===========
2003
- RegexSubstitution("first", r"\b1st\b", "first", priority=20),
2004
- RegexSubstitution(
2005
- "first degree", fr"\b(?:first|1|1st){sep}(?:dgr|dg|de|d)\b", "first degree"
2006
- ),
2007
- RegexSubstitution("first degree 2", fr"\b1dg\b", "first degree"),
2008
- RegexSubstitution(
2009
- "circumstances in the first degree",
2010
- fr"\bcircumstances{sep}1\b",
2011
- "circumstances in the first degree",
2012
- ),
2013
- RegexSubstitution("second", r"\b2nd\b", "second", priority=20),
2014
- RegexSubstitution(
2015
- "second degree", fr"\b(?:second|2|2nd){sep}(?:dgr|dg|de|d)\b", "second degree"
2016
- ),
2017
- RegexSubstitution(
2018
- "circumstances in the second degree",
2019
- fr"\bcircumstances{sep}2\b",
2020
- "circumstances in the second degree",
2021
- ),
2022
- RegexSubstitution("third", r"\b3rd\b", "third", priority=20),
2023
- RegexSubstitution(
2024
- "third degree", fr"\b(?:third|3|3rd){sep}(?:dgr|dg|de|d)\b", "third degree"
2025
- ),
2026
- RegexSubstitution(
2027
- "circumstances in the third degree",
2028
- fr"\bcircumstances{sep}3\b",
2029
- "circumstances in the third degree",
2030
- ),
2031
- RegexSubstitution("fourth", r"\b4th\b", "fourth", priority=20),
2032
- RegexSubstitution("fifth", r"\b5th\b", "fifth", priority=20),
2033
- RegexSubstitution("sixth", r"\b6th\b", "sixth", priority=20),
2034
- RegexSubstitution("seventh", r"\b7th\b", "seventh", priority=20),
2035
- RegexSubstitution("eighth", r"\b8th\b", "eighth", priority=20),
2036
- RegexSubstitution("ninth", r"\b9th\b", "ninth", priority=20),
2037
- RegexSubstitution("tenth", r"\b10th\b", "tenth", priority=20),
2038
- # SCHEDULE terms ===========
2039
- # observed "l" for use of "i" across schedule terms
2040
- RegexSubstitution(
2041
- "Schedule", r"\b(?:sc?he?d?|sch|sched|schd)\b", "schedule", priority=9
2042
- ),
2043
- RegexSubstitution(
2044
- "schedule one",
2045
- fr"\bschedule{sep}(?:i|1|l)\b",
2046
- "schedule one",
2047
- ),
2048
- RegexSubstitution(
2049
- "schedule two",
2050
- fr"\bschedule{sep}(?:ii|2|ll)\b",
2051
- "schedule two",
2052
- ),
2053
- RegexSubstitution(
2054
- "schedule three",
2055
- fr"\bschedule{sep}(?:iii|3|lll)\b",
2056
- "schedule three",
2057
- ),
2058
- RegexSubstitution(
2059
- "schedule four",
2060
- fr"\bschedule{sep}(?:iv|4|lv)\b",
2061
- "schedule four",
2062
- ),
2063
- RegexSubstitution(
2064
- "schedule five",
2065
- fr"\bschedule{sep}(?:v|5)\b",
2066
- "schedule five",
2067
- ),
2068
- RegexSubstitution(
2069
- "schedule six",
2070
- fr"\bschedule{sep}(?:vi|6|vl)\b",
2071
- "schedule six",
2072
- ),
2073
- # DRIVING TERMS ===========
2074
- RegexSubstitution(
2075
- "driving",
2076
- r"\bdrvg\b",
2077
- "driving",
2078
- ),
2079
- RegexSubstitution(
2080
- "driving 2",
2081
- fr"\bdriv{sep}g\b",
2082
- "driving",
2083
- ),
2084
- RegexSubstitution(
2085
- "failure to yield",
2086
- fr"\bfty\b",
2087
- "failure to yield",
2088
- ),
2089
- RegexSubstitution(
2090
- "permit",
2091
- fr"\bperm\b",
2092
- "permit",
2093
- ),
2094
- RegexSubstitution(
2095
- "registration",
2096
- fr"\b(?:regis|registra)\b",
2097
- "registration",
2098
- ),
2099
- RegexSubstitution(
2100
- "driving under the influence",
2101
- r"\bdui\b",
2102
- "driving under the influence",
2103
- ),
2104
- RegexSubstitution(
2105
- "driving while impaired",
2106
- r"\bdwi\b",
2107
- "driving while impaired",
2108
- ),
2109
- RegexSubstitution(
2110
- "driving while license suspended",
2111
- r"\bdwls\b",
2112
- "driving while license suspended",
2113
- ),
2114
- RegexSubstitution(
2115
- "driving while license revoked",
2116
- r"\bdwlr\b",
2117
- "driving while license revoked",
2118
- ),
2119
- RegexSubstitution(
2120
- "revoked",
2121
- r"\brevkd\b",
2122
- "revoked",
2123
- ),
2124
- RegexSubstitution(
2125
- "reckless endangerment",
2126
- fr"\breckles{sep}endanger\b",
2127
- "reckless endangerment",
2128
- ),
2129
- RegexSubstitution(
2130
- "highway",
2131
- fr"\bhi{sep}way\b",
2132
- "highway",
2133
- ),
2134
- RegexSubstitution(
2135
- "reckless driving",
2136
- fr"\brek{sep}dr?\b",
2137
- "reckless driving",
2138
- ),
2139
- # ========
2140
- RegexSubstitution(
2141
- "retail theft",
2142
- fr"\bretail{sep}thft\b",
2143
- "retail theft",
2144
- ),
2145
- RegexSubstitution(
2146
- "impregnate girl",
2147
- fr"\b(?:impregnate|impreg){sep}(?:girl|grl)\b",
2148
- "impregnate girl",
2149
- ),
2150
- RegexSubstitution(
2151
- "worker compensation",
2152
- fr"\bwrkr{sep}cmp\b",
2153
- "worker compensation",
2154
- ),
2155
- RegexSubstitution(
2156
- "disregard",
2157
- fr"\bdisreg\b",
2158
- "disregard",
2159
- ),
2160
- RegexSubstitution(
2161
- "electrical appliance",
2162
- fr"\belct{sep}appl\b",
2163
- "electrical appliance",
2164
- ),
2165
- RegexSubstitution(
2166
- "serial number",
2167
- fr"\b(?:serial|ser){sep}(?:number|nmbr|num|nu|no)\b",
2168
- "serial number",
2169
- ),
2170
- # DISTRIBUTION / FURNISH / TRAFFICK TERMS =======
2171
- RegexSubstitution( # TODO: revisit traff/traf', more likely to be traffick/ing but could be traffic (cars)
2172
- "traffick",
2173
- r"\b(?:tfk|traff|traf)\b",
2174
- "traffick",
2175
- ),
2176
- RegexSubstitution( # TODO: revisit adding 'dist', more likely to be distribution but could be disturbance
2177
- "distribution",
2178
- r"\b(?:distr|distrib)\b",
2179
- "distribution",
2180
- ),
2181
- RegexSubstitution(
2182
- "attempted distribution",
2183
- fr"\b(?:at|att|attempted){sep}dist\b",
2184
- "attempted distribution",
2185
- priority=5,
2186
- ),
2187
- RegexSubstitution(
2188
- "illegal distribution",
2189
- fr"\billgl{sep}dist\b",
2190
- "intent distribution",
2191
- priority=5,
2192
- ),
2193
- RegexSubstitution(
2194
- "buy distribute",
2195
- fr"\bbuy{sep}dist\b",
2196
- "buy distribute",
2197
- ),
2198
- RegexSubstitution(
2199
- "intent distribute",
2200
- fr"\b(?:intent|int){sep}dist\b",
2201
- "intent distribute",
2202
- priority=5,
2203
- ),
2204
- RegexSubstitution(
2205
- "intent to distribute",
2206
- fr"\b(?:intent|int){sep}to{sep}dist\b",
2207
- "intent to distribute",
2208
- priority=5,
2209
- ),
2210
- RegexSubstitution(
2211
- "distribution possession",
2212
- fr"\bdist{sep}(?:possession|possess|poss)\b",
2213
- "distribution possession",
2214
- priority=5,
2215
- ),
2216
- RegexSubstitution(
2217
- "unauthorized distribution",
2218
- fr"\b(?:unauthorized|unauth|unau|unauthd){sep}dist\b",
2219
- "unauthorized distribution",
2220
- priority=5,
2221
- ),
2222
- RegexSubstitution(
2223
- "possession distribution",
2224
- fr"\b(?:possession|possess|poss){sep}dist\b",
2225
- "possession distribution",
2226
- priority=5,
2227
- ),
2228
- RegexSubstitution(
2229
- "unlaw distribution",
2230
- fr"\b(?:unlawful|unlaw){sep}dist\b",
2231
- "unlawful distribution",
2232
- priority=5,
2233
- ),
2234
- RegexSubstitution(
2235
- "distribution controlled",
2236
- fr"\bdist{sep}(?:controlled|cntrld|cntrl|contrlld)\b",
2237
- "distribution controlled",
2238
- priority=5,
2239
- ),
2240
- RegexSubstitution(
2241
- "distribute schedule",
2242
- fr"\bdist{sep}(?:schedule|sch|sched)\b",
2243
- "distribute schedule",
2244
- priority=5,
2245
- ),
2246
- RegexSubstitution(
2247
- "furnish",
2248
- r"\b(?:furnishing|furn)\b",
2249
- "furnish",
2250
- ),
2251
- RegexSubstitution( # TODO: revisit adding 'man', more likely to be manufacture/ing but could have other meaning
2252
- "manufacturing",
2253
- r"\b(?:manuf|manu|mfg|manf|manfac)\b",
2254
- "manufacturing",
2255
- ),
2256
- RegexSubstitution(
2257
- "manufacturing distribution sell",
2258
- fr"\b(?:manuf|manu|man|mfg|manf|manfac){sep}dist{sep}sell\b",
2259
- "manufacturing distribution sell",
2260
- priority=5,
2261
- ),
2262
- RegexSubstitution(
2263
- "record sell rent distribute",
2264
- fr"\brecord{sep}sell{sep}rent{sep}dist\b",
2265
- "record sell rent distribute",
2266
- priority=5,
2267
- ),
2268
- RegexSubstitution(
2269
- "sell distribute",
2270
- fr"\bsell{sep}dist\b",
2271
- "sell distribute",
2272
- priority=5,
2273
- ),
2274
- RegexSubstitution(
2275
- "sale distribute",
2276
- fr"\bsale{sep}dist\b",
2277
- "sale distribute",
2278
- priority=5,
2279
- ),
2280
- RegexSubstitution(
2281
- "offer agree to distribute",
2282
- fr"\boffer{sep}agree{sep}to{sep}dist\b",
2283
- "offer agree distribute",
2284
- priority=5,
2285
- ),
2286
- RegexSubstitution(
2287
- "arrange to distribute",
2288
- fr"\barrange{sep}to{sep}dist\b",
2289
- "arrange to distribute",
2290
- priority=5,
2291
- ),
2292
- RegexSubstitution(
2293
- "arrange to distribute 2",
2294
- fr"\barrange{sep}dist\b",
2295
- "arrange to distribute",
2296
- priority=5,
2297
- ),
2298
- RegexSubstitution(
2299
- "controlled substance distribution",
2300
- fr"\bcontr{sep}sub{sep}dist\b",
2301
- "controlled substance distribution",
2302
- priority=5,
2303
- ),
2304
- RegexSubstitution(
2305
- "manufacturing deliver distribution",
2306
- fr"\b(?:manuf|manu|man|mfg|manf){sep}del{sep}dist\b",
2307
- "manufacturing deliver distribution",
2308
- priority=5,
2309
- ),
2310
- RegexSubstitution(
2311
- "possession distribution manufacturing",
2312
- fr"\bposs{sep}dist{sep}manuf\b",
2313
- "possession distribution manufacturing",
2314
- priority=5,
2315
- ),
2316
- RegexSubstitution(
2317
- "with intent to distribute",
2318
- fr"\bwitd\b",
2319
- "with intent to distribute",
2320
- priority=5,
2321
- ),
2322
- RegexSubstitution(
2323
- "possession with intent to distribute",
2324
- fr"\bposs{sep}(?:with|w){sep}(?:intent|int|i){sep}dist\b",
2325
- "possession with intent to distribute",
2326
- priority=5,
2327
- ),
2328
- RegexSubstitution(
2329
- "manufacturing distribution possession",
2330
- fr"\b(?:manuf|manu|man|mfg|manf){sep}dist{sep}(?:p|poss|pos)\b",
2331
- "manufacturing distribution possession",
2332
- priority=5,
2333
- ),
2334
- RegexSubstitution(
2335
- "manufacturing distribution",
2336
- fr"\b(?:manuf|manu|man|mfg|manf){sep}dist\b",
2337
- "manufacturing distribution",
2338
- priority=5,
2339
- ),
2340
- RegexSubstitution(
2341
- "distribution obscene material",
2342
- fr"\bdist{sep}(?:obscene|obs|obsc){sep}(?:material|mat|mtrl)\b",
2343
- "distribution obscene material",
2344
- priority=5,
2345
- ),
2346
- RegexSubstitution(
2347
- "harmful material",
2348
- fr"\b(?:harmful|hrmf){sep}(?:material|mat|mtrl)\b",
2349
- "harmful material",
2350
- priority=5,
2351
- ),
2352
- RegexSubstitution(
2353
- "obscene material distribution",
2354
- fr"\b(?:obscene|obs|obsc){sep}(?:material|mat|mtrl){sep}dist\b",
2355
- "obscene material distribution",
2356
- priority=5,
2357
- ),
2358
- RegexSubstitution(
2359
- "material",
2360
- fr"\b(?:matrl|mat|mtrl)\b",
2361
- "material",
2362
- priority=5,
2363
- ),
2364
- RegexSubstitution(
2365
- "distribution child porn",
2366
- fr"\bdist{sep}child{sep}porn\b",
2367
- "distribution child porn",
2368
- priority=5,
2369
- ),
2370
- RegexSubstitution(
2371
- "distribution controlled substances",
2372
- fr"\bdist{sep}cds\b",
2373
- "distribution controlled substances",
2374
- priority=5,
2375
- ),
2376
- RegexSubstitution(
2377
- "controlled substances distribution ",
2378
- fr"\bcds{sep}dist\b",
2379
- "controlled substances distribution ",
2380
- priority=5,
2381
- ),
2382
- RegexSubstitution(
2383
- "distribution narcotics",
2384
- fr"\bdist{sep}narc\b",
2385
- "distribution narcotics",
2386
- priority=5,
2387
- ),
2388
- RegexSubstitution(
2389
- "deliver or distribution",
2390
- fr"\bdel{sep}or{sep}dist\b",
2391
- "deliver or distribution",
2392
- priority=5,
2393
- ),
2394
- RegexSubstitution(
2395
- "criminal distribution",
2396
- fr"\bcriminal{sep}dist\b",
2397
- "criminal distribution",
2398
- priority=5,
2399
- ),
2400
- RegexSubstitution(
2401
- "purchase",
2402
- r"\bpur\b",
2403
- "purchase",
2404
- ),
2405
- # DRUG TERMS ===========
2406
- RegexSubstitution(
2407
- "marijuana",
2408
- r"\b(?:marij|marihuana|mari|marijuan|marijua|mariju|mj)\b",
2409
- "marijuana",
2410
- ),
2411
- RegexSubstitution(
2412
- "hydrocodone",
2413
- r"\bhydroc\b",
2414
- "hydrocodone",
2415
- ),
2416
- RegexSubstitution(
2417
- "cocaine",
2418
- r"\b(?:cocain|coca|cocai|cocne)\b",
2419
- "cocaine",
2420
- ),
2421
- RegexSubstitution(
2422
- "crack or cocaine",
2423
- r"\bcoc\b",
2424
- "crack or cocaine",
2425
- ),
2426
- RegexSubstitution(
2427
- "rohypnol",
2428
- r"\brohypnl\b",
2429
- "rohypnol",
2430
- ),
2431
- RegexSubstitution(
2432
- "heroine",
2433
- r"\bher\b",
2434
- "heroine",
2435
- ),
2436
- RegexSubstitution(
2437
- "heroine",
2438
- r"\bher\b",
2439
- "heroine",
2440
- ),
2441
- RegexSubstitution(
2442
- "ecstasy",
2443
- r"\bmdma\b",
2444
- "ecstasy",
2445
- ),
2446
- RegexSubstitution(
2447
- "methamphetamine",
2448
- r"\b(?:meth|metham|methamphet|methamph)\b",
2449
- "methamphetamine",
2450
- ),
2451
- RegexSubstitution(
2452
- "paraphernalia",
2453
- r"\b(?:para|paraph|paraphenalia|parap)\b",
2454
- "paraphernalia",
2455
- ),
2456
- RegexSubstitution(
2457
- "grams",
2458
- r"\b(?:gr|gms|grms)\b",
2459
- "grams",
2460
- ),
2461
- RegexSubstitution(
2462
- "gram",
2463
- r"\bgm\b",
2464
- "gram",
2465
- ),
2466
- RegexSubstitution(
2467
- "kilograms",
2468
- r"\bkg\b",
2469
- "kilograms",
2470
- ),
2471
- RegexSubstitution(
2472
- "pounds",
2473
- r"\blb\b",
2474
- "pounds",
2475
- ),
2476
- RegexSubstitution(
2477
- "ounces",
2478
- r"\boz\b",
2479
- "ounces",
2480
- ),
2481
- # ALCOHOL / LIQUOR terms ===========
2482
- RegexSubstitution(
2483
- "alcoholic beverage", r"\balc\Wbev\b", "alcoholic beverage", priority=5
2484
- ),
2485
- RegexSubstitution(
2486
- "beverage",
2487
- r"\bbev\b",
2488
- "beverage",
2489
- ),
2490
- RegexSubstitution(
2491
- "blood alcohol concentration",
2492
- r"\bbac\b",
2493
- "blood alcohol concentration",
2494
- ),
2495
- RegexSubstitution(
2496
- "alcohol",
2497
- r"\b(?:alc|alch|alchol|alcohl|alco|alcoh|alcoho)\b",
2498
- "alcohol",
2499
- ),
2500
- RegexSubstitution(
2501
- "over legal",
2502
- fr"\b(?:over|ov){sep}(?:legal|leg)\b",
2503
- "over legal",
2504
- ),
2505
- RegexSubstitution(
2506
- "supply",
2507
- fr"\bsupp\b",
2508
- "supply",
2509
- ),
2510
- RegexSubstitution(
2511
- "liquor",
2512
- fr"\bliq\b",
2513
- "liquor",
2514
- ),
2515
- RegexSubstitution(
2516
- "distill",
2517
- r"\bdstl\b",
2518
- "distill",
2519
- ),
2520
- RegexSubstitution(
2521
- "minor in possession",
2522
- fr"\bmip\b",
2523
- "minor in possession",
2524
- ),
2525
- RegexSubstitution(
2526
- "premises",
2527
- fr"\bprem\b",
2528
- "premises",
2529
- ),
2530
- RegexSubstitution(
2531
- "consume",
2532
- fr"\bcnsum\b",
2533
- "consume",
2534
- ),
2535
- RegexSubstitution(
2536
- "intoxication",
2537
- fr"\bintox\b",
2538
- "intoxication",
2539
- ),
2540
- RegexSubstitution(
2541
- "available",
2542
- fr"\bavail\b",
2543
- "available",
2544
- ),
2545
- RegexSubstitution(
2546
- "unlicensed",
2547
- fr"\bunlic\b",
2548
- "unlicensed",
2549
- ),
2550
- RegexSubstitution(
2551
- "large amount",
2552
- fr"\blg{sep}amt\b",
2553
- "large amount",
2554
- ),
2555
- RegexSubstitution(
2556
- "small amount",
2557
- fr"\bsm{sep}amt\b",
2558
- "small amount",
2559
- ),
2560
- RegexSubstitution(
2561
- "required",
2562
- fr"\breq\b",
2563
- "required",
2564
- ),
2565
- RegexSubstitution(
2566
- "violate prohibition",
2567
- fr"\bvio{sep}prohibition\b",
2568
- "violate prohibition",
2569
- ),
2570
- RegexSubstitution(
2571
- "enticement",
2572
- fr"\bentcmnt\b",
2573
- "enticement",
2574
- ),
2575
- # SUBSTANCE TERMS ========
2576
- RegexSubstitution(
2577
- "Substance",
2578
- r"\b(?:sub|subs|substanc|substan|substnces|subtance|substa|substnc|sunstance|subst)\b",
2579
- "substance",
2580
- 20,
2581
- ),
2582
- RegexSubstitution("controlled", r"\b(?:cntrld|cntrl|contrlld)\b", "controlled", 20),
2583
- RegexSubstitution(
2584
- "controlled dangerous substances",
2585
- r"\bcds\b",
2586
- "controlled dangerous substances",
2587
- ),
2588
- RegexSubstitution(
2589
- "solicitation of controlled substances",
2590
- fr"\bsol{sep}cds\b",
2591
- "solicitation of controlled substances",
2592
- priority=4,
2593
- ),
2594
- RegexSubstitution(
2595
- "solicitation",
2596
- fr"\b(?:solct|sol|solicit|solic)\b",
2597
- "solicitation",
2598
- ),
2599
- RegexSubstitution(
2600
- "solicitation of narcotics",
2601
- fr"\bsol{sep}narc\b",
2602
- "solicitation of narcotics",
2603
- priority=4,
2604
- ),
2605
- RegexSubstitution(
2606
- "Controlled Substance",
2607
- fr"\bcont?r?{sep}?subs?t?(?:\b|stance\b)",
2608
- "controlled substance",
2609
- ),
2610
- RegexSubstitution(
2611
- "Controlled Substance 2",
2612
- r"\bc\W?s\b",
2613
- "controlled substance",
2614
- ),
2615
- RegexSubstitution(
2616
- "unlawful possession of a controlled substance",
2617
- r"\bupcs\b",
2618
- "unlawful possession of a controlled substance",
2619
- ),
2620
- ]
2621
-
2622
-
2623
- def prep_text(text):
2624
- # Remove Commas from Numbers
2625
- text = re.sub(r"(\d+?),(\d+?)", r"\1\2", text)
2626
- # TODO: double check this `'s` regex
2627
- text = re.sub(r"\b(\S+?)'(s)", r"\1\2", text)
2628
- # replace hyphens with spaces
2629
- text = re.sub("-", " ", text)
2630
- # replace forward-slashes with spaces
2631
- text = re.sub("/", " ", text)
2632
- return text
2633
-
2634
-
2635
- def cleaner(text):
2636
- if pd.isnull(text):
2637
- return ""
2638
- # Prepare text for regex substitions
2639
- text = prep_text(text)
2640
- # Do all substitutions (Case insensitive on raw text)
2641
- substitutions_sorted = sorted(substitutions, key=lambda s: s.priority)
2642
- for substitution in substitutions_sorted:
2643
- text = re.sub(substitution.regex, substitution.replacement, text)
2644
- # Remove any terms we don't want
2645
- for removal in removals:
2646
- text = re.sub(removal.regex, " ", text)
2647
- # Then remove remaining punctuation
2648
- for punct in all_punctuation:
2649
- text = text.replace(punct, " ")
2650
- text = " ".join(text.split()) # removes extra spaces: " " → " "
2651
- text = text.lower()
2652
- return text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
download.py DELETED
@@ -1,28 +0,0 @@
1
- # https://discuss.streamlit.io/t/heres-a-download-function-that-works-for-dataframes-and-txt/4052
2
-
3
- import base64
4
-
5
- import streamlit as st
6
- import pandas as pd
7
-
8
-
9
- def download_link(object_to_download, download_filename, download_link_text):
10
- """
11
- Generates a link to download the given object_to_download.
12
-
13
- object_to_download (str, pd.DataFrame): The object to be downloaded.
14
- download_filename (str): filename and extension of file. e.g. mydata.csv, some_txt_output.txt
15
- download_link_text (str): Text to display for download link.
16
-
17
- Examples:
18
- download_link(YOUR_DF, 'YOUR_DF.csv', 'Click here to download data!')
19
- download_link(YOUR_STRING, 'YOUR_STRING.txt', 'Click here to download your text!')
20
-
21
- """
22
- if isinstance(object_to_download, pd.DataFrame):
23
- object_to_download = object_to_download.to_csv(index=False)
24
-
25
- # some strings <-> bytes conversions necessary here
26
- b64 = base64.b64encode(object_to_download.encode()).decode()
27
-
28
- return f'<a href="data:file/txt;base64,{b64}" download="{download_filename}">{download_link_text}</a>'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
onnx_model_utils.py DELETED
@@ -1,195 +0,0 @@
1
- import os
2
-
3
- from psutil import cpu_count
4
-
5
- # Constants from the performance optimization available in onnxruntime
6
- # It needs to be done before importing onnxruntime
7
- os.environ["OMP_NUM_THREADS"] = str(cpu_count(logical=True))
8
- os.environ["OMP_WAIT_POLICY"] = "ACTIVE"
9
- os.environ["TOKENIZERS_PARALLELISM"] = "true"
10
-
11
- import json
12
- import os
13
- from pathlib import Path
14
- from typing import Any, Dict, List
15
- import gzip
16
- import shutil
17
-
18
- from numpy import ndarray
19
- import requests
20
- import streamlit as st
21
- from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions
22
- from scipy.special import softmax
23
- from transformers import AutoTokenizer
24
- from transformers.file_utils import http_get
25
-
26
- from cleaning_utils import cleaner
27
-
28
- RELEASE_TAG = "2021.05.18.15"
29
- OUTPUT_PATH = Path("onnx/rota-quantized.onnx")
30
- ONNX_RELEASE = (
31
- "https://github.com/RTIInternational/"
32
- "rota/"
33
- "releases/download/"
34
- f"{RELEASE_TAG}/"
35
- "rota-quantized.onnx.gz"
36
- )
37
-
38
-
39
- @st.cache
40
- def cleaner_cache(text):
41
- return cleaner(text)
42
-
43
-
44
- def get_label_config(model_name, config_path: Path = Path("config.json")):
45
- if config_path.exists():
46
- config_json = json.loads(config_path.read_text())
47
- labels = {int(k): v for k, v in config_json["id2label"].items()}
48
- else:
49
- config_url = f"https://huggingface.co/{model_name}/raw/main/config.json"
50
- config_json = requests.get(config_url).json()
51
- config_path.write_text(json.dumps(config_json))
52
- labels = {int(k): v for k, v in config_json["id2label"].items()}
53
- return labels
54
-
55
-
56
- class ONNXCPUClassificationPipeline:
57
- def __init__(self, tokenizer, model_path):
58
- self.tokenizer = tokenizer
59
- self.model = create_cpu_model(model_path)
60
- self.labels = get_label_config(
61
- tokenizer.name_or_path, config_path=Path("onnx/config.json")
62
- )
63
-
64
- def __call__(self, texts: List[str]) -> List[List[Dict[str, Any]]]:
65
- # Inputs are provided through numpy array
66
- model_inputs = self.tokenizer(texts, return_tensors="pt", padding=True)
67
- inputs_onnx = {k: v.cpu().detach().numpy() for k, v in model_inputs.items()}
68
-
69
- # Run the model (None = get all the outputs)
70
- output = self.model.run(0, inputs_onnx)
71
- probs = softmax(output[0], axis=1)
72
- predictions = self._format_predictions(probs, self.labels)
73
- return predictions
74
-
75
- def _format_predictions(
76
- self, softmax_array: ndarray, labels: List[str]
77
- ) -> List[List[Dict[str, Any]]]:
78
- """Format predictions from ONNX similar to the
79
- huggingface transformers classification pipeline
80
-
81
- Args:
82
- softmax_array (np.ndarray): array of shape (n_preds, n_labels)
83
-
84
- Returns:
85
- List[List[Dict[str, Any]]]: Output of predictions, where each row is a list of
86
- Dict with keys "label" and "score"
87
- """
88
- predictions = [
89
- [
90
- {"label": labels[column], "score": float(softmax_array[row][column])}
91
- for column in range(softmax_array.shape[1])
92
- ]
93
- for row in range(softmax_array.shape[0])
94
- ]
95
- return predictions
96
-
97
-
98
- def create_cpu_model(model_path: str) -> InferenceSession:
99
- # Few properties that might have an impact on performances (provided by MS)
100
- options = SessionOptions()
101
- options.intra_op_num_threads = 1
102
- options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
103
-
104
- # Load the model as a graph and prepare the CPU backend
105
- session = InferenceSession(model_path, options, providers=["CPUExecutionProvider"])
106
- session.disable_fallback()
107
-
108
- return session
109
-
110
-
111
- def download_model():
112
- OUTPUT_PATH.parent.mkdir(exist_ok=True)
113
- with open(f"{OUTPUT_PATH}.gz", "wb") as f:
114
- http_get(
115
- ONNX_RELEASE,
116
- f,
117
- )
118
-
119
- with gzip.open(f"{OUTPUT_PATH}.gz", "rb") as f_in:
120
- with open(f"{OUTPUT_PATH}", "wb") as f_out:
121
- shutil.copyfileobj(f_in, f_out)
122
-
123
-
124
- def load_model():
125
- if not OUTPUT_PATH.exists():
126
- download_model()
127
- tokenizer = AutoTokenizer.from_pretrained("rti-international/rota")
128
- pipeline = ONNXCPUClassificationPipeline(tokenizer, str(OUTPUT_PATH))
129
- return pipeline
130
-
131
-
132
- pipeline = load_model()
133
-
134
-
135
- def predict(text: str, sort=True) -> List[List[Dict[str, Any]]]:
136
- """Generate a single prediction on an input text
137
-
138
- Args:
139
- text (str): The input text to generate a prediction for (post-clean)
140
- sort (bool, optional): Whether to sort the predicted labels by score. Defaults to True.
141
-
142
- Returns:
143
- List[List[Dict[str, Any]]]: A list with a single element containing predicted label scores.
144
- """
145
- clean = cleaner_cache(text)
146
- preds = pipeline([clean])
147
- if sort:
148
- sorted_preds = [
149
- sorted(p, key=lambda d: d["score"], reverse=True) for p in preds
150
- ]
151
- return sorted_preds
152
- else:
153
- return preds
154
-
155
-
156
- def predict_bulk(texts: List[str]) -> List[List[Dict[str, Any]]]:
157
- """Generate predictions on a list of strings.
158
-
159
- Args:
160
- texts (List[str]): Input texts to generate predictions (post-cleaning)
161
-
162
- Returns:
163
- List[List[Dict[str, Any]]]: Predicted label scores for each input text
164
- """
165
- cleaned = [cleaner_cache(text) for text in texts]
166
- preds = pipeline(cleaned)
167
- del cleaned
168
- return preds
169
-
170
-
171
- def _max_pred(prediction_scores: List[Dict[str, Any]]) -> Dict[str, Any]:
172
- """Utility function to find the maximum predicted label
173
- for a single prediction
174
-
175
- Args:
176
- prediction_scores (List[Dict[str, Any]]): A list of predictions with keys
177
- 'label' and 'score'
178
-
179
- Returns:
180
- Dict[str, Any]: The 'label' and 'score' dict with the highest score value
181
- """
182
- return max(prediction_scores, key=lambda d: d["score"])
183
-
184
-
185
- def max_pred_bulk(preds: List[List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
186
- """Generates a "column" of label predictions by finding the max
187
- prediction score per element
188
-
189
- Args:
190
- preds (List[List[Dict[str, Any]]]): A list of predictions
191
-
192
- Returns:
193
- List[Dict[str, Any]: A list of 'label' and 'score' dict with the highest score value
194
- """
195
- return [_max_pred(pred) for pred in preds]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -1,11 +1 @@
1
- openpyxl==3.0.6
2
- pandas==1.2.0
3
- transformers[torch]==4.6.0
4
- # New Requirements
5
- streamlit==0.82.0
6
- more-itertools==8.7.0
7
- stqdm==0.0.3
8
- onnx==1.9.0
9
- onnxruntime==1.7.0
10
- psutil==5.8.0
11
- scipy==1.6.2
1
+ streamlit==1.21.0