Peter B committed on
Commit
5fae609
1 Parent(s): 0ea804a

initial commit (old model)

Browse files
.github/workflows/onnx-release.yml ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Add ONNX Files as Release
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+
8
+ jobs:
9
+ build:
10
+ name: Model Build
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - uses: actions/checkout@v2
14
+ with:
15
+ lfs: "true"
16
+ fetch-depth: 2
17
+ - name: Check if Model Files have changed
18
+ id: changed-files-specific
19
+ uses: tj-actions/changed-files@v5.1
20
+ with:
21
+ files: |
22
+ config.json
23
+ merges.txt
24
+ model_args.json
25
+ pytorch_model.bin
26
+ special_tokens_map.json
27
+ tokenizer_config.json
28
+ training_args.bin
29
+ vocab.json
30
+ - name: Get current date for release tag
31
+ id: date
32
+ run: echo "::set-output name=date::$(date -u +'%Y.%m.%d.%H')"
33
+ - uses: actions/setup-python@v2
34
+ if: steps.changed-files-specific.outputs.any_changed == 'true'
35
+ with:
36
+ python-version: "3.7"
37
+ - uses: actions/cache@v2
38
+ if: steps.changed-files-specific.outputs.any_changed == 'true'
39
+ with:
40
+ path: ${{ env.pythonLocation }}
41
+ key: ${{ env.pythonLocation }}-${{ hashFiles('onnx-convert-requirements.txt') }}
42
+ - name: Install dependencies
43
+ if: steps.changed-files-specific.outputs.any_changed == 'true'
44
+ run: |
45
+ python -m pip install --upgrade pip
46
+ pip install -r onnx-convert-requirements.txt
47
+ - name: Convert Model
48
+ if: steps.changed-files-specific.outputs.any_changed == 'true'
49
+ run: python utils/convert_onnx.py convert_model . onnx/${{ github.event.repository.name }}.onnx
50
+ - name: GZIP ONNX Files for Release
51
+ if: steps.changed-files-specific.outputs.any_changed == 'true'
52
+ run: |
53
+ gzip onnx/${{ github.event.repository.name }}.onnx
54
+ gzip onnx/${{ github.event.repository.name }}-quantized.onnx
55
+ - name: Release Repo + ONNX under date tag
56
+ uses: "marvinpinto/action-automatic-releases@latest"
57
+ if: steps.changed-files-specific.outputs.any_changed == 'true'
58
+ with:
59
+ repo_token: "${{ secrets.GITHUB_TOKEN }}"
60
+ automatic_release_tag: "${{ steps.date.outputs.date }}"
61
+ prerelease: false
62
+ draft: true
63
+ files: |
64
+ ${{ github.event.repository.name }}.tar.gz
65
+ onnx/${{ github.event.repository.name }}.onnx.gz
66
+ onnx/${{ github.event.repository.name }}-quantized.onnx.gz
.gitignore ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
98
+ __pypackages__/
99
+
100
+ # Celery stuff
101
+ celerybeat-schedule
102
+ celerybeat.pid
103
+
104
+ # SageMath parsed files
105
+ *.sage.py
106
+
107
+ # Environments
108
+ .env
109
+ .venv
110
+ env/
111
+ venv/
112
+ ENV/
113
+ env.bak/
114
+ venv.bak/
115
+
116
+ # Spyder project settings
117
+ .spyderproject
118
+ .spyproject
119
+
120
+ # Rope project settings
121
+ .ropeproject
122
+
123
+ # mkdocs documentation
124
+ /site
125
+
126
+ # mypy
127
+ .mypy_cache/
128
+ .dmypy.json
129
+ dmypy.json
130
+
131
+ # Pyre type checker
132
+ .pyre/
133
+
134
+ # pytype static type analyzer
135
+ .pytype/
136
+
137
+ # Cython debug symbols
138
+ cython_debug/
139
+
140
+ .vscode/
141
+
142
+ .DS_Store
143
+
144
+ onnx/
LICENSE ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ Apache License
3
+ Version 2.0, January 2004
4
+ http://www.apache.org/licenses/
5
+
6
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7
+
8
+ 1. Definitions.
9
+
10
+ "License" shall mean the terms and conditions for use, reproduction,
11
+ and distribution as defined by Sections 1 through 9 of this document.
12
+
13
+ "Licensor" shall mean the copyright owner or entity authorized by
14
+ the copyright owner that is granting the License.
15
+
16
+ "Legal Entity" shall mean the union of the acting entity and all
17
+ other entities that control, are controlled by, or are under common
18
+ control with that entity. For the purposes of this definition,
19
+ "control" means (i) the power, direct or indirect, to cause the
20
+ direction or management of such entity, whether by contract or
21
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
22
+ outstanding shares, or (iii) beneficial ownership of such entity.
23
+
24
+ "You" (or "Your") shall mean an individual or Legal Entity
25
+ exercising permissions granted by this License.
26
+
27
+ "Source" form shall mean the preferred form for making modifications,
28
+ including but not limited to software source code, documentation
29
+ source, and configuration files.
30
+
31
+ "Object" form shall mean any form resulting from mechanical
32
+ transformation or translation of a Source form, including but
33
+ not limited to compiled object code, generated documentation,
34
+ and conversions to other media types.
35
+
36
+ "Work" shall mean the work of authorship, whether in Source or
37
+ Object form, made available under the License, as indicated by a
38
+ copyright notice that is included in or attached to the work
39
+ (an example is provided in the Appendix below).
40
+
41
+ "Derivative Works" shall mean any work, whether in Source or Object
42
+ form, that is based on (or derived from) the Work and for which the
43
+ editorial revisions, annotations, elaborations, or other modifications
44
+ represent, as a whole, an original work of authorship. For the purposes
45
+ of this License, Derivative Works shall not include works that remain
46
+ separable from, or merely link (or bind by name) to the interfaces of,
47
+ the Work and Derivative Works thereof.
48
+
49
+ "Contribution" shall mean any work of authorship, including
50
+ the original version of the Work and any modifications or additions
51
+ to that Work or Derivative Works thereof, that is intentionally
52
+ submitted to Licensor for inclusion in the Work by the copyright owner
53
+ or by an individual or Legal Entity authorized to submit on behalf of
54
+ the copyright owner. For the purposes of this definition, "submitted"
55
+ means any form of electronic, verbal, or written communication sent
56
+ to the Licensor or its representatives, including but not limited to
57
+ communication on electronic mailing lists, source code control systems,
58
+ and issue tracking systems that are managed by, or on behalf of, the
59
+ Licensor for the purpose of discussing and improving the Work, but
60
+ excluding communication that is conspicuously marked or otherwise
61
+ designated in writing by the copyright owner as "Not a Contribution."
62
+
63
+ "Contributor" shall mean Licensor and any individual or Legal Entity
64
+ on behalf of whom a Contribution has been received by Licensor and
65
+ subsequently incorporated within the Work.
66
+
67
+ 2. Grant of Copyright License. Subject to the terms and conditions of
68
+ this License, each Contributor hereby grants to You a perpetual,
69
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70
+ copyright license to reproduce, prepare Derivative Works of,
71
+ publicly display, publicly perform, sublicense, and distribute the
72
+ Work and such Derivative Works in Source or Object form.
73
+
74
+ 3. Grant of Patent License. Subject to the terms and conditions of
75
+ this License, each Contributor hereby grants to You a perpetual,
76
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77
+ (except as stated in this section) patent license to make, have made,
78
+ use, offer to sell, sell, import, and otherwise transfer the Work,
79
+ where such license applies only to those patent claims licensable
80
+ by such Contributor that are necessarily infringed by their
81
+ Contribution(s) alone or by combination of their Contribution(s)
82
+ with the Work to which such Contribution(s) was submitted. If You
83
+ institute patent litigation against any entity (including a
84
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
85
+ or a Contribution incorporated within the Work constitutes direct
86
+ or contributory patent infringement, then any patent licenses
87
+ granted to You under this License for that Work shall terminate
88
+ as of the date such litigation is filed.
89
+
90
+ 4. Redistribution. You may reproduce and distribute copies of the
91
+ Work or Derivative Works thereof in any medium, with or without
92
+ modifications, and in Source or Object form, provided that You
93
+ meet the following conditions:
94
+
95
+ (a) You must give any other recipients of the Work or
96
+ Derivative Works a copy of this License; and
97
+
98
+ (b) You must cause any modified files to carry prominent notices
99
+ stating that You changed the files; and
100
+
101
+ (c) You must retain, in the Source form of any Derivative Works
102
+ that You distribute, all copyright, patent, trademark, and
103
+ attribution notices from the Source form of the Work,
104
+ excluding those notices that do not pertain to any part of
105
+ the Derivative Works; and
106
+
107
+ (d) If the Work includes a "NOTICE" text file as part of its
108
+ distribution, then any Derivative Works that You distribute must
109
+ include a readable copy of the attribution notices contained
110
+ within such NOTICE file, excluding those notices that do not
111
+ pertain to any part of the Derivative Works, in at least one
112
+ of the following places: within a NOTICE text file distributed
113
+ as part of the Derivative Works; within the Source form or
114
+ documentation, if provided along with the Derivative Works; or,
115
+ within a display generated by the Derivative Works, if and
116
+ wherever such third-party notices normally appear. The contents
117
+ of the NOTICE file are for informational purposes only and
118
+ do not modify the License. You may add Your own attribution
119
+ notices within Derivative Works that You distribute, alongside
120
+ or as an addendum to the NOTICE text from the Work, provided
121
+ that such additional attribution notices cannot be construed
122
+ as modifying the License.
123
+
124
+ You may add Your own copyright statement to Your modifications and
125
+ may provide additional or different license terms and conditions
126
+ for use, reproduction, or distribution of Your modifications, or
127
+ for any such Derivative Works as a whole, provided Your use,
128
+ reproduction, and distribution of the Work otherwise complies with
129
+ the conditions stated in this License.
130
+
131
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
132
+ any Contribution intentionally submitted for inclusion in the Work
133
+ by You to the Licensor shall be under the terms and conditions of
134
+ this License, without any additional terms or conditions.
135
+ Notwithstanding the above, nothing herein shall supersede or modify
136
+ the terms of any separate license agreement you may have executed
137
+ with Licensor regarding such Contributions.
138
+
139
+ 6. Trademarks. This License does not grant permission to use the trade
140
+ names, trademarks, service marks, or product names of the Licensor,
141
+ except as required for reasonable and customary use in describing the
142
+ origin of the Work and reproducing the content of the NOTICE file.
143
+
144
+ 7. Disclaimer of Warranty. Unless required by applicable law or
145
+ agreed to in writing, Licensor provides the Work (and each
146
+ Contributor provides its Contributions) on an "AS IS" BASIS,
147
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148
+ implied, including, without limitation, any warranties or conditions
149
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150
+ PARTICULAR PURPOSE. You are solely responsible for determining the
151
+ appropriateness of using or redistributing the Work and assume any
152
+ risks associated with Your exercise of permissions under this License.
153
+
154
+ 8. Limitation of Liability. In no event and under no legal theory,
155
+ whether in tort (including negligence), contract, or otherwise,
156
+ unless required by applicable law (such as deliberate and grossly
157
+ negligent acts) or agreed to in writing, shall any Contributor be
158
+ liable to You for damages, including any direct, indirect, special,
159
+ incidental, or consequential damages of any character arising as a
160
+ result of this License or out of the use or inability to use the
161
+ Work (including but not limited to damages for loss of goodwill,
162
+ work stoppage, computer failure or malfunction, or any and all
163
+ other commercial damages or losses), even if such Contributor
164
+ has been advised of the possibility of such damages.
165
+
166
+ 9. Accepting Warranty or Additional Liability. While redistributing
167
+ the Work or Derivative Works thereof, You may choose to offer,
168
+ and charge a fee for, acceptance of support, warranty, indemnity,
169
+ or other liability obligations and/or rights consistent with this
170
+ License. However, in accepting such obligations, You may act only
171
+ on Your own behalf and on Your sole responsibility, not on behalf
172
+ of any other Contributor, and only if You agree to indemnify,
173
+ defend, and hold each Contributor harmless for any liability
174
+ incurred by, or claims asserted against, such Contributor by reason
175
+ of your accepting any such warranty or additional liability.
176
+
177
+ END OF TERMS AND CONDITIONS
178
+
179
+ APPENDIX: How to apply the Apache License to your work.
180
+
181
+ To apply the Apache License to your work, attach the following
182
+ boilerplate notice, with the fields enclosed by brackets "[]"
183
+ replaced with your own identifying information. (Don't include
184
+ the brackets!) The text should be enclosed in the appropriate
185
+ comment syntax for the file format. We also recommend that a
186
+ file or class name and description of purpose be included on the
187
+ same "printed page" as the copyright notice for easier
188
+ identification within third-party archives.
189
+
190
+ Copyright 2021 RTI International
191
+
192
+ Licensed under the Apache License, Version 2.0 (the "License");
193
+ you may not use this file except in compliance with the License.
194
+ You may obtain a copy of the License at
195
+
196
+ http://www.apache.org/licenses/LICENSE-2.0
197
+
198
+ Unless required by applicable law or agreed to in writing, software
199
+ distributed under the License is distributed on an "AS IS" BASIS,
200
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201
+ See the License for the specific language governing permissions and
202
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - en
4
+ widget:
5
+ - text: theft 3
6
+ - text: forgery
7
+ - text: unlawful possession short-barreled shotgun
8
+ - text: criminal trespass 2nd degree
9
+ - text: eluding a police vehicle
10
+ - text: upcs synthetic narcotic
11
+ ---
12
+
13
+ # ROTA
14
+ ## Rapid Offense Text Autocoder
15
+
16
+ [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.4739146.svg)](https://doi.org/10.5281/zenodo.4739146)
17
+
18
+ Criminal justice research can often require conversion of free-text offense descriptions into overall charge categories to aid in state-wise comparisons. For example, the free-text offense of “eluding a police vehicle” would be coded to a charge category of “Obstruction - Law Enforcement”. Since free-text offense descriptions aren't standardized and often need to be categorized in large volumes, categorizing them can be a manual and time-intensive process for researchers. We present a machine learning model for converting offense text into offense codes.
19
+
20
+ Currently, this model predicts the *Charge Category* of a given offense text. A *charge category* is one of the headings for offense codes in the [2009 NCRP Codebook: Appendix F](https://www.icpsr.umich.edu/web/NACJD/studies/30799/datadocumentation#).
21
+
22
+ The model was trained on [publicly available data](https://web.archive.org/web/20201021001250/https://www.icpsr.umich.edu/web/pages/NACJD/guides/ncrp.html) from a crosswalk containing offenses from all 50 states combined with three additional hand-labeled offense text datasets.
23
+
24
+ <details>
25
+ <summary>Charge Category Example</summary>
26
+ <img src="https://i.ibb.co/xLsrzmV/charge-category-example.png" width="500">
27
+ </details>
28
+
29
+ ### Data Preprocessing
30
+
31
+ The input text is standardized through a series of preprocessing steps. The text is first passed through a sequence of 500+ case-insensitive regular expressions that identify common misspellings and abbreviations and expand the text into fuller, more correct English. Some data-specific prefixes and suffixes are then removed from the text -- e.g., some states included a statute as a part of the text. Finally, punctuation (excluding dollar signs) is removed from the input, multiple spaces between words are collapsed into one, and the text is lowercased.
32
+
33
+ ## Cross-Validation Performance
34
+
35
+ This model was evaluated using 3-fold cross validation. Except where noted, numbers presented below are the mean value across the 3 folds.
36
+
37
+ The model in this repository is trained on all available data. Because of this, you can typically expect production performance to be (unknowably) better than the numbers presented below.
38
+
39
+ ### Overall Metrics
40
+
41
+ | Metric | Value |
42
+ | -------- | ----- |
43
+ | Accuracy | 0.934 |
44
+ | MCC | 0.931 |
45
+
46
+
47
+
48
+ | Metric | precision | recall | f1-score |
49
+ | --------- | --------- | ------ | -------- |
50
+ | macro avg | 0.811 | 0.788 | 0.795 |
51
+
52
+
53
+ *Note*: These values are averaged across folds, so *macro avg* is the mean, over the 3 folds, of each fold's macro average across all categories.
54
+
55
+ ### Per-Category Metrics
56
+
57
+ | Category | precision | recall | f1-score | support |
58
+ | ------------------------------------------------------ | --------- | ------ | -------- | ------- |
59
+ | AGGRAVATED ASSAULT | 0.95 | 0.957 | 0.954 | 4085 |
60
+ | ARMED ROBBERY | 0.958 | 0.957 | 0.957 | 1021 |
61
+ | ARSON | 0.951 | 0.953 | 0.952 | 344 |
62
+ | ASSAULTING PUBLIC OFFICER | 0.921 | 0.905 | 0.913 | 588 |
63
+ | AUTO THEFT | 0.962 | 0.963 | 0.963 | 1660 |
64
+ | BLACKMAIL/EXTORTION/INTIMIDATION | 0.868 | 0.878 | 0.873 | 627 |
65
+ | BRIBERY AND CONFLICT OF INTEREST | 0.772 | 0.821 | 0.795 | 216 |
66
+ | BURGLARY | 0.982 | 0.98 | 0.981 | 2214 |
67
+ | CHILD ABUSE | 0.793 | 0.776 | 0.784 | 139 |
68
+ | COCAINE OR CRACK VIOLATION OFFENSE UNSPECIFIED | 0.868 | 0.865 | 0.866 | 47 |
69
+ | COMMERCIALIZED VICE | 0.811 | 0.777 | 0.793 | 666 |
70
+ | CONTEMPT OF COURT | 0.983 | 0.987 | 0.985 | 2952 |
71
+ | CONTRIBUTING TO DELINQUENCY OF A MINOR | 0.454 | 0.379 | 0.396 | 50 |
72
+ | CONTROLLED SUBSTANCE - OFFENSE UNSPECIFIED | 0.839 | 0.78 | 0.808 | 280 |
73
+ | COUNTERFEITING (FEDERAL ONLY) | 0 | 0 | 0 | 2 |
74
+ | DESTRUCTION OF PROPERTY | 0.967 | 0.97 | 0.969 | 2560 |
75
+ | DRIVING UNDER INFLUENCE - DRUGS | 0.641 | 0.616 | 0.625 | 34 |
76
+ | DRIVING UNDER THE INFLUENCE | 0.944 | 0.95 | 0.947 | 2195 |
77
+ | DRIVING WHILE INTOXICATED | 0.99 | 0.978 | 0.984 | 2391 |
78
+ | DRUG OFFENSES - VIOLATION/DRUG UNSPECIFIED | 0.905 | 0.911 | 0.908 | 3100 |
79
+ | DRUNKENNESS/VAGRANCY/DISORDERLY CONDUCT | 0.853 | 0.862 | 0.857 | 380 |
80
+ | EMBEZZLEMENT | 0.86 | 0.762 | 0.808 | 100 |
81
+ | EMBEZZLEMENT (FEDERAL ONLY) | 0 | 0 | 0 | 1 |
82
+ | ESCAPE FROM CUSTODY | 0.989 | 0.99 | 0.99 | 4035 |
83
+ | FAMILY RELATED OFFENSES | 0.744 | 0.768 | 0.756 | 442 |
84
+ | FELONY - UNSPECIFIED | 0.665 | 0.762 | 0.709 | 122 |
85
+ | FLIGHT TO AVOID PROSECUTION | 0.43 | 0.434 | 0.431 | 38 |
86
+ | FORCIBLE SODOMY | 0.773 | 0.837 | 0.802 | 76 |
87
+ | FORGERY (FEDERAL ONLY) | 0 | 0 | 0 | 2 |
88
+ | FORGERY/FRAUD | 0.909 | 0.928 | 0.918 | 4687 |
89
+ | FRAUD (FEDERAL ONLY) | 0 | 0 | 0 | 2 |
90
+ | GRAND LARCENY - THEFT OVER $200 | 0.959 | 0.972 | 0.966 | 2412 |
91
+ | HABITUAL OFFENDER | 0.748 | 0.656 | 0.695 | 53 |
92
+ | HEROIN VIOLATION - OFFENSE UNSPECIFIED | 0.877 | 0.777 | 0.82 | 24 |
93
+ | HIT AND RUN DRIVING | 0.927 | 0.933 | 0.93 | 303 |
94
+ | HIT/RUN DRIVING - PROPERTY DAMAGE | 0.929 | 0.924 | 0.926 | 362 |
95
+ | IMMIGRATION VIOLATIONS | 0.778 | 0.616 | 0.681 | 19 |
96
+ | INVASION OF PRIVACY | 0.925 | 0.923 | 0.924 | 1235 |
97
+ | JUVENILE OFFENSES | 0.9 | 0.871 | 0.883 | 144 |
98
+ | KIDNAPPING | 0.926 | 0.929 | 0.927 | 553 |
99
+ | LARCENY/THEFT - VALUE UNKNOWN | 0.953 | 0.946 | 0.95 | 3175 |
100
+ | LEWD ACT WITH CHILDREN | 0.786 | 0.846 | 0.814 | 596 |
101
+ | LIQUOR LAW VIOLATIONS | 0.731 | 0.762 | 0.746 | 214 |
102
+ | MANSLAUGHTER - NON-VEHICULAR | 0.661 | 0.803 | 0.725 | 139 |
103
+ | MANSLAUGHTER - VEHICULAR | 0.763 | 0.854 | 0.803 | 117 |
104
+ | MARIJUANA/HASHISH VIOLATION - OFFENSE UNSPECIFIED | 0.778 | 0.675 | 0.718 | 62 |
105
+ | MISDEMEANOR UNSPECIFIED | 0.616 | 0.256 | 0.357 | 57 |
106
+ | MORALS/DECENCY - OFFENSE | 0.759 | 0.763 | 0.761 | 412 |
107
+ | MURDER | 0.965 | 0.922 | 0.943 | 621 |
108
+ | OBSTRUCTION - LAW ENFORCEMENT | 0.945 | 0.947 | 0.946 | 4220 |
109
+ | OFFENSES AGAINST COURTS, LEGISLATURES, AND COMMISSIONS | 0.882 | 0.897 | 0.889 | 1965 |
110
+ | PAROLE VIOLATION | 0.968 | 0.949 | 0.958 | 946 |
111
+ | PETTY LARCENY - THEFT UNDER $200 | 0.987 | 0.768 | 0.864 | 139 |
112
+ | POSSESSION/USE - COCAINE OR CRACK | 0.9 | 0.928 | 0.913 | 68 |
113
+ | POSSESSION/USE - DRUG UNSPECIFIED | 0.618 | 0.561 | 0.586 | 189 |
114
+ | POSSESSION/USE - HEROIN | 0.917 | 0.839 | 0.876 | 25 |
115
+ | POSSESSION/USE - MARIJUANA/HASHISH | 0.975 | 0.973 | 0.974 | 556 |
116
+ | POSSESSION/USE - OTHER CONTROLLED SUBSTANCES | 0.976 | 0.965 | 0.97 | 3271 |
117
+ | PROBATION VIOLATION | 0.958 | 0.956 | 0.957 | 1158 |
118
+ | PROPERTY OFFENSES - OTHER | 0.892 | 0.863 | 0.878 | 446 |
119
+ | PUBLIC ORDER OFFENSES - OTHER | 0.706 | 0.717 | 0.711 | 1871 |
120
+ | RACKETEERING/EXTORTION (FEDERAL ONLY) | 0 | 0 | 0 | 2 |
121
+ | RAPE - FORCE | 0.841 | 0.871 | 0.856 | 641 |
122
+ | RAPE - STATUTORY - NO FORCE | 0.714 | 0.551 | 0.619 | 140 |
123
+ | REGULATORY OFFENSES (FEDERAL ONLY) | 0.8 | 0.558 | 0.657 | 70 |
124
+ | RIOTING | 0.785 | 0.605 | 0.68 | 119 |
125
+ | SEXUAL ASSAULT - OTHER | 0.829 | 0.839 | 0.834 | 971 |
126
+ | SIMPLE ASSAULT | 0.977 | 0.967 | 0.972 | 4577 |
127
+ | STOLEN PROPERTY - RECEIVING | 0.953 | 0.955 | 0.954 | 1193 |
128
+ | STOLEN PROPERTY - TRAFFICKING | 0.899 | 0.875 | 0.887 | 491 |
129
+ | TAX LAW (FEDERAL ONLY) | 0.474 | 0.177 | 0.256 | 30 |
130
+ | TRAFFIC OFFENSES - MINOR | 0.976 | 0.975 | 0.975 | 8699 |
131
+ | TRAFFICKING - COCAINE OR CRACK | 0.893 | 0.944 | 0.918 | 185 |
132
+ | TRAFFICKING - DRUG UNSPECIFIED | 0.729 | 0.783 | 0.755 | 516 |
133
+ | TRAFFICKING - HEROIN | 0.874 | 0.902 | 0.887 | 54 |
134
+ | TRAFFICKING - OTHER CONTROLLED SUBSTANCES | 0.963 | 0.953 | 0.958 | 2832 |
135
+ | TRAFFICKING MARIJUANA/HASHISH | 0.919 | 0.934 | 0.926 | 255 |
136
+ | TRESPASSING | 0.974 | 0.982 | 0.978 | 1916 |
137
+ | UNARMED ROBBERY | 0.941 | 0.935 | 0.938 | 377 |
138
+ | UNAUTHORIZED USE OF VEHICLE | 0.929 | 0.911 | 0.92 | 304 |
139
+ | UNSPECIFIED HOMICIDE | 0.641 | 0.591 | 0.614 | 60 |
140
+ | VIOLENT OFFENSES - OTHER | 0.82 | 0.817 | 0.818 | 606 |
141
+ | VOLUNTARY/NONNEGLIGENT MANSLAUGHTER | 0.641 | 0.559 | 0.596 | 54 |
142
+ | WEAPON OFFENSE | 0.944 | 0.947 | 0.945 | 2466 |
143
+
144
+ *Note: `support` is the average number of observations per fold that the model was evaluated on, so the total number of observations per class is roughly 3x `support`.*
145
+
146
+ ### Using Confidence Scores
147
+
148
+ If we interpret the classification probability as a confidence score, we can use it to filter out predictions that the model isn't as confident about. We applied this process in 3-fold cross validation. The numbers presented below indicate how much of the prediction data is retained given a confidence score cutoff of `p`. We also present the overall accuracy and MCC metrics as if the model were evaluated only on this subset of confident predictions.
149
+
150
+ | | cutoff | percent retained | mcc | acc |
151
+ | --- | ------ | ---------------- | ----- | ----- |
152
+ | 0 | 0.85 | 0.952 | 0.959 | 0.961 |
153
+ | 1 | 0.90 | 0.944 | 0.963 | 0.965 |
154
+ | 2 | 0.95 | 0.928 | 0.969 | 0.971 |
155
+ | 3 | 0.975 | 0.912 | 0.975 | 0.976 |
156
+ | 4 | 0.99 | 0.885 | 0.982 | 0.983 |
157
+ | 5 | 0.999 | 0.737 | 0.996 | 0.996 |
code_map.json ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "AGGRAVATED ASSAULT": 0,
3
+ "ARMED ROBBERY": 1,
4
+ "ARSON": 2,
5
+ "ASSAULTING PUBLIC OFFICER": 3,
6
+ "AUTO THEFT": 4,
7
+ "BLACKMAIL/EXTORTION/INTIMIDATION": 5,
8
+ "BRIBERY AND CONFLICT OF INTEREST": 6,
9
+ "BURGLARY": 7,
10
+ "CHILD ABUSE": 8,
11
+ "COCAINE OR CRACK VIOLATION OFFENSE UNSPECIFIED": 9,
12
+ "COMMERCIALIZED VICE": 10,
13
+ "CONTEMPT OF COURT": 11,
14
+ "CONTRIBUTING TO DELINQUENCY OF A MINOR": 12,
15
+ "CONTROLLED SUBSTANCE - OFFENSE UNSPECIFIED": 13,
16
+ "COUNTERFEITING (FEDERAL ONLY)": 14,
17
+ "DESTRUCTION OF PROPERTY": 15,
18
+ "DRIVING UNDER INFLUENCE - DRUGS": 16,
19
+ "DRIVING UNDER THE INFLUENCE": 17,
20
+ "DRIVING WHILE INTOXICATED": 18,
21
+ "DRUG OFFENSES - VIOLATION/DRUG UNSPECIFIED": 19,
22
+ "DRUNKENNESS/VAGRANCY/DISORDERLY CONDUCT": 20,
23
+ "EMBEZZLEMENT": 21,
24
+ "EMBEZZLEMENT (FEDERAL ONLY)": 22,
25
+ "ESCAPE FROM CUSTODY": 23,
26
+ "FAMILY RELATED OFFENSES": 24,
27
+ "FELONY - UNSPECIFIED": 25,
28
+ "FLIGHT TO AVOID PROSECUTION": 26,
29
+ "FORCIBLE SODOMY": 27,
30
+ "FORGERY (FEDERAL ONLY)": 28,
31
+ "FORGERY/FRAUD": 29,
32
+ "FRAUD (FEDERAL ONLY)": 30,
33
+ "GRAND LARCENY - THEFT OVER $200": 31,
34
+ "HABITUAL OFFENDER": 32,
35
+ "HEROIN VIOLATION - OFFENSE UNSPECIFIED": 33,
36
+ "HIT AND RUN DRIVING": 34,
37
+ "HIT/RUN DRIVING - PROPERTY DAMAGE": 35,
38
+ "IMMIGRATION VIOLATIONS": 36,
39
+ "INVASION OF PRIVACY": 37,
40
+ "JUVENILE OFFENSES": 38,
41
+ "KIDNAPPING": 39,
42
+ "LARCENY/THEFT - VALUE UNKNOWN": 40,
43
+ "LEWD ACT WITH CHILDREN": 41,
44
+ "LIQUOR LAW VIOLATIONS": 42,
45
+ "MANSLAUGHTER - NON-VEHICULAR": 43,
46
+ "MANSLAUGHTER - VEHICULAR": 44,
47
+ "MARIJUANA/HASHISH VIOLATION - OFFENSE UNSPECIFIED": 45,
48
+ "MISDEMEANOR UNSPECIFIED": 46,
49
+ "MORALS/DECENCY - OFFENSE": 47,
50
+ "MURDER": 48,
51
+ "OBSTRUCTION - LAW ENFORCEMENT": 49,
52
+ "OFFENSES AGAINST COURTS, LEGISLATURES, AND COMMISSIONS": 50,
53
+ "PAROLE VIOLATION": 51,
54
+ "PETTY LARCENY - THEFT UNDER $200": 52,
55
+ "POSSESSION/USE - COCAINE OR CRACK": 53,
56
+ "POSSESSION/USE - DRUG UNSPECIFIED": 54,
57
+ "POSSESSION/USE - HEROIN": 55,
58
+ "POSSESSION/USE - MARIJUANA/HASHISH": 56,
59
+ "POSSESSION/USE - OTHER CONTROLLED SUBSTANCES": 57,
60
+ "PROBATION VIOLATION": 58,
61
+ "PROPERTY OFFENSES - OTHER": 59,
62
+ "PUBLIC ORDER OFFENSES - OTHER": 60,
63
+ "RACKETEERING/EXTORTION (FEDERAL ONLY)": 61,
64
+ "RAPE - FORCE": 62,
65
+ "RAPE - STATUTORY - NO FORCE": 63,
66
+ "REGULATORY OFFENSES (FEDERAL ONLY)": 64,
67
+ "RIOTING": 65,
68
+ "SEXUAL ASSAULT - OTHER": 66,
69
+ "SIMPLE ASSAULT": 67,
70
+ "STOLEN PROPERTY - RECEIVING": 68,
71
+ "STOLEN PROPERTY - TRAFFICKING": 69,
72
+ "TAX LAW (FEDERAL ONLY)": 70,
73
+ "TRAFFIC OFFENSES - MINOR": 71,
74
+ "TRAFFICKING - COCAINE OR CRACK": 72,
75
+ "TRAFFICKING - DRUG UNSPECIFIED": 73,
76
+ "TRAFFICKING - HEROIN": 74,
77
+ "TRAFFICKING - OTHER CONTROLLED SUBSTANCES": 75,
78
+ "TRAFFICKING MARIJUANA/HASHISH": 76,
79
+ "TRESPASSING": 77,
80
+ "UNARMED ROBBERY": 78,
81
+ "UNAUTHORIZED USE OF VEHICLE": 79,
82
+ "UNSPECIFIED HOMICIDE": 80,
83
+ "VIOLENT OFFENSES - OTHER": 81,
84
+ "VOLUNTARY/NONNEGLIGENT MANSLAUGHTER": 82,
85
+ "WEAPON OFFENSE": 83
86
+ }
config.backup.json ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "./local/distilroberta-base",
3
+ "architectures": [
4
+ "RobertaForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "eos_token_id": 2,
9
+ "gradient_checkpointing": false,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 768,
13
+ "id2label": {
14
+ "0": "LABEL_0",
15
+ "1": "LABEL_1",
16
+ "2": "LABEL_2",
17
+ "3": "LABEL_3",
18
+ "4": "LABEL_4",
19
+ "5": "LABEL_5",
20
+ "6": "LABEL_6",
21
+ "7": "LABEL_7",
22
+ "8": "LABEL_8",
23
+ "9": "LABEL_9",
24
+ "10": "LABEL_10",
25
+ "11": "LABEL_11",
26
+ "12": "LABEL_12",
27
+ "13": "LABEL_13",
28
+ "14": "LABEL_14",
29
+ "15": "LABEL_15",
30
+ "16": "LABEL_16",
31
+ "17": "LABEL_17",
32
+ "18": "LABEL_18",
33
+ "19": "LABEL_19",
34
+ "20": "LABEL_20",
35
+ "21": "LABEL_21",
36
+ "22": "LABEL_22",
37
+ "23": "LABEL_23",
38
+ "24": "LABEL_24",
39
+ "25": "LABEL_25",
40
+ "26": "LABEL_26",
41
+ "27": "LABEL_27",
42
+ "28": "LABEL_28",
43
+ "29": "LABEL_29",
44
+ "30": "LABEL_30",
45
+ "31": "LABEL_31",
46
+ "32": "LABEL_32",
47
+ "33": "LABEL_33",
48
+ "34": "LABEL_34",
49
+ "35": "LABEL_35",
50
+ "36": "LABEL_36",
51
+ "37": "LABEL_37",
52
+ "38": "LABEL_38",
53
+ "39": "LABEL_39",
54
+ "40": "LABEL_40",
55
+ "41": "LABEL_41",
56
+ "42": "LABEL_42",
57
+ "43": "LABEL_43",
58
+ "44": "LABEL_44",
59
+ "45": "LABEL_45",
60
+ "46": "LABEL_46",
61
+ "47": "LABEL_47",
62
+ "48": "LABEL_48",
63
+ "49": "LABEL_49",
64
+ "50": "LABEL_50",
65
+ "51": "LABEL_51",
66
+ "52": "LABEL_52",
67
+ "53": "LABEL_53",
68
+ "54": "LABEL_54",
69
+ "55": "LABEL_55",
70
+ "56": "LABEL_56",
71
+ "57": "LABEL_57",
72
+ "58": "LABEL_58",
73
+ "59": "LABEL_59",
74
+ "60": "LABEL_60",
75
+ "61": "LABEL_61",
76
+ "62": "LABEL_62",
77
+ "63": "LABEL_63",
78
+ "64": "LABEL_64",
79
+ "65": "LABEL_65",
80
+ "66": "LABEL_66",
81
+ "67": "LABEL_67",
82
+ "68": "LABEL_68",
83
+ "69": "LABEL_69",
84
+ "70": "LABEL_70",
85
+ "71": "LABEL_71",
86
+ "72": "LABEL_72",
87
+ "73": "LABEL_73",
88
+ "74": "LABEL_74",
89
+ "75": "LABEL_75",
90
+ "76": "LABEL_76",
91
+ "77": "LABEL_77",
92
+ "78": "LABEL_78",
93
+ "79": "LABEL_79",
94
+ "80": "LABEL_80",
95
+ "81": "LABEL_81",
96
+ "82": "LABEL_82",
97
+ "83": "LABEL_83"
98
+ },
99
+ "initializer_range": 0.02,
100
+ "intermediate_size": 3072,
101
+ "label2id": {
102
+ "LABEL_0": 0,
103
+ "LABEL_1": 1,
104
+ "LABEL_10": 10,
105
+ "LABEL_11": 11,
106
+ "LABEL_12": 12,
107
+ "LABEL_13": 13,
108
+ "LABEL_14": 14,
109
+ "LABEL_15": 15,
110
+ "LABEL_16": 16,
111
+ "LABEL_17": 17,
112
+ "LABEL_18": 18,
113
+ "LABEL_19": 19,
114
+ "LABEL_2": 2,
115
+ "LABEL_20": 20,
116
+ "LABEL_21": 21,
117
+ "LABEL_22": 22,
118
+ "LABEL_23": 23,
119
+ "LABEL_24": 24,
120
+ "LABEL_25": 25,
121
+ "LABEL_26": 26,
122
+ "LABEL_27": 27,
123
+ "LABEL_28": 28,
124
+ "LABEL_29": 29,
125
+ "LABEL_3": 3,
126
+ "LABEL_30": 30,
127
+ "LABEL_31": 31,
128
+ "LABEL_32": 32,
129
+ "LABEL_33": 33,
130
+ "LABEL_34": 34,
131
+ "LABEL_35": 35,
132
+ "LABEL_36": 36,
133
+ "LABEL_37": 37,
134
+ "LABEL_38": 38,
135
+ "LABEL_39": 39,
136
+ "LABEL_4": 4,
137
+ "LABEL_40": 40,
138
+ "LABEL_41": 41,
139
+ "LABEL_42": 42,
140
+ "LABEL_43": 43,
141
+ "LABEL_44": 44,
142
+ "LABEL_45": 45,
143
+ "LABEL_46": 46,
144
+ "LABEL_47": 47,
145
+ "LABEL_48": 48,
146
+ "LABEL_49": 49,
147
+ "LABEL_5": 5,
148
+ "LABEL_50": 50,
149
+ "LABEL_51": 51,
150
+ "LABEL_52": 52,
151
+ "LABEL_53": 53,
152
+ "LABEL_54": 54,
153
+ "LABEL_55": 55,
154
+ "LABEL_56": 56,
155
+ "LABEL_57": 57,
156
+ "LABEL_58": 58,
157
+ "LABEL_59": 59,
158
+ "LABEL_6": 6,
159
+ "LABEL_60": 60,
160
+ "LABEL_61": 61,
161
+ "LABEL_62": 62,
162
+ "LABEL_63": 63,
163
+ "LABEL_64": 64,
164
+ "LABEL_65": 65,
165
+ "LABEL_66": 66,
166
+ "LABEL_67": 67,
167
+ "LABEL_68": 68,
168
+ "LABEL_69": 69,
169
+ "LABEL_7": 7,
170
+ "LABEL_70": 70,
171
+ "LABEL_71": 71,
172
+ "LABEL_72": 72,
173
+ "LABEL_73": 73,
174
+ "LABEL_74": 74,
175
+ "LABEL_75": 75,
176
+ "LABEL_76": 76,
177
+ "LABEL_77": 77,
178
+ "LABEL_78": 78,
179
+ "LABEL_79": 79,
180
+ "LABEL_8": 8,
181
+ "LABEL_80": 80,
182
+ "LABEL_81": 81,
183
+ "LABEL_82": 82,
184
+ "LABEL_83": 83,
185
+ "LABEL_9": 9
186
+ },
187
+ "layer_norm_eps": 1e-05,
188
+ "max_position_embeddings": 514,
189
+ "model_type": "roberta",
190
+ "num_attention_heads": 12,
191
+ "num_hidden_layers": 6,
192
+ "pad_token_id": 1,
193
+ "position_embedding_type": "absolute",
194
+ "transformers_version": "4.5.1",
195
+ "type_vocab_size": 1,
196
+ "use_cache": true,
197
+ "vocab_size": 50265
198
+ }
config.json ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "./local/distilroberta-base",
3
+ "architectures": [
4
+ "RobertaForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "eos_token_id": 2,
9
+ "gradient_checkpointing": false,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 768,
13
+ "id2label": {
14
+ "0": "AGGRAVATED ASSAULT",
15
+ "1": "ARMED ROBBERY",
16
+ "2": "ARSON",
17
+ "3": "ASSAULTING PUBLIC OFFICER",
18
+ "4": "AUTO THEFT",
19
+ "5": "BLACKMAIL/EXTORTION/INTIMIDATION",
20
+ "6": "BRIBERY AND CONFLICT OF INTEREST",
21
+ "7": "BURGLARY",
22
+ "8": "CHILD ABUSE",
23
+ "9": "COCAINE OR CRACK VIOLATION OFFENSE UNSPECIFIED",
24
+ "10": "COMMERCIALIZED VICE",
25
+ "11": "CONTEMPT OF COURT",
26
+ "12": "CONTRIBUTING TO DELINQUENCY OF A MINOR",
27
+ "13": "CONTROLLED SUBSTANCE - OFFENSE UNSPECIFIED",
28
+ "14": "COUNTERFEITING (FEDERAL ONLY)",
29
+ "15": "DESTRUCTION OF PROPERTY",
30
+ "16": "DRIVING UNDER INFLUENCE - DRUGS",
31
+ "17": "DRIVING UNDER THE INFLUENCE",
32
+ "18": "DRIVING WHILE INTOXICATED",
33
+ "19": "DRUG OFFENSES - VIOLATION/DRUG UNSPECIFIED",
34
+ "20": "DRUNKENNESS/VAGRANCY/DISORDERLY CONDUCT",
35
+ "21": "EMBEZZLEMENT",
36
+ "22": "EMBEZZLEMENT (FEDERAL ONLY)",
37
+ "23": "ESCAPE FROM CUSTODY",
38
+ "24": "FAMILY RELATED OFFENSES",
39
+ "25": "FELONY - UNSPECIFIED",
40
+ "26": "FLIGHT TO AVOID PROSECUTION",
41
+ "27": "FORCIBLE SODOMY",
42
+ "28": "FORGERY (FEDERAL ONLY)",
43
+ "29": "FORGERY/FRAUD",
44
+ "30": "FRAUD (FEDERAL ONLY)",
45
+ "31": "GRAND LARCENY - THEFT OVER $200",
46
+ "32": "HABITUAL OFFENDER",
47
+ "33": "HEROIN VIOLATION - OFFENSE UNSPECIFIED",
48
+ "34": "HIT AND RUN DRIVING",
49
+ "35": "HIT/RUN DRIVING - PROPERTY DAMAGE",
50
+ "36": "IMMIGRATION VIOLATIONS",
51
+ "37": "INVASION OF PRIVACY",
52
+ "38": "JUVENILE OFFENSES",
53
+ "39": "KIDNAPPING",
54
+ "40": "LARCENY/THEFT - VALUE UNKNOWN",
55
+ "41": "LEWD ACT WITH CHILDREN",
56
+ "42": "LIQUOR LAW VIOLATIONS",
57
+ "43": "MANSLAUGHTER - NON-VEHICULAR",
58
+ "44": "MANSLAUGHTER - VEHICULAR",
59
+ "45": "MARIJUANA/HASHISH VIOLATION - OFFENSE UNSPECIFIED",
60
+ "46": "MISDEMEANOR UNSPECIFIED",
61
+ "47": "MORALS/DECENCY - OFFENSE",
62
+ "48": "MURDER",
63
+ "49": "OBSTRUCTION - LAW ENFORCEMENT",
64
+ "50": "OFFENSES AGAINST COURTS, LEGISLATURES, AND COMMISSIONS",
65
+ "51": "PAROLE VIOLATION",
66
+ "52": "PETTY LARCENY - THEFT UNDER $200",
67
+ "53": "POSSESSION/USE - COCAINE OR CRACK",
68
+ "54": "POSSESSION/USE - DRUG UNSPECIFIED",
69
+ "55": "POSSESSION/USE - HEROIN",
70
+ "56": "POSSESSION/USE - MARIJUANA/HASHISH",
71
+ "57": "POSSESSION/USE - OTHER CONTROLLED SUBSTANCES",
72
+ "58": "PROBATION VIOLATION",
73
+ "59": "PROPERTY OFFENSES - OTHER",
74
+ "60": "PUBLIC ORDER OFFENSES - OTHER",
75
+ "61": "RACKETEERING/EXTORTION (FEDERAL ONLY)",
76
+ "62": "RAPE - FORCE",
77
+ "63": "RAPE - STATUTORY - NO FORCE",
78
+ "64": "REGULATORY OFFENSES (FEDERAL ONLY)",
79
+ "65": "RIOTING",
80
+ "66": "SEXUAL ASSAULT - OTHER",
81
+ "67": "SIMPLE ASSAULT",
82
+ "68": "STOLEN PROPERTY - RECEIVING",
83
+ "69": "STOLEN PROPERTY - TRAFFICKING",
84
+ "70": "TAX LAW (FEDERAL ONLY)",
85
+ "71": "TRAFFIC OFFENSES - MINOR",
86
+ "72": "TRAFFICKING - COCAINE OR CRACK",
87
+ "73": "TRAFFICKING - DRUG UNSPECIFIED",
88
+ "74": "TRAFFICKING - HEROIN",
89
+ "75": "TRAFFICKING - OTHER CONTROLLED SUBSTANCES",
90
+ "76": "TRAFFICKING MARIJUANA/HASHISH",
91
+ "77": "TRESPASSING",
92
+ "78": "UNARMED ROBBERY",
93
+ "79": "UNAUTHORIZED USE OF VEHICLE",
94
+ "80": "UNSPECIFIED HOMICIDE",
95
+ "81": "VIOLENT OFFENSES - OTHER",
96
+ "82": "VOLUNTARY/NONNEGLIGENT MANSLAUGHTER",
97
+ "83": "WEAPON OFFENSE"
98
+ },
99
+ "initializer_range": 0.02,
100
+ "intermediate_size": 3072,
101
+ "label2id": {
102
+ "AGGRAVATED ASSAULT": 0,
103
+ "ARMED ROBBERY": 1,
104
+ "ARSON": 2,
105
+ "ASSAULTING PUBLIC OFFICER": 3,
106
+ "AUTO THEFT": 4,
107
+ "BLACKMAIL/EXTORTION/INTIMIDATION": 5,
108
+ "BRIBERY AND CONFLICT OF INTEREST": 6,
109
+ "BURGLARY": 7,
110
+ "CHILD ABUSE": 8,
111
+ "COCAINE OR CRACK VIOLATION OFFENSE UNSPECIFIED": 9,
112
+ "COMMERCIALIZED VICE": 10,
113
+ "CONTEMPT OF COURT": 11,
114
+ "CONTRIBUTING TO DELINQUENCY OF A MINOR": 12,
115
+ "CONTROLLED SUBSTANCE - OFFENSE UNSPECIFIED": 13,
116
+ "COUNTERFEITING (FEDERAL ONLY)": 14,
117
+ "DESTRUCTION OF PROPERTY": 15,
118
+ "DRIVING UNDER INFLUENCE - DRUGS": 16,
119
+ "DRIVING UNDER THE INFLUENCE": 17,
120
+ "DRIVING WHILE INTOXICATED": 18,
121
+ "DRUG OFFENSES - VIOLATION/DRUG UNSPECIFIED": 19,
122
+ "DRUNKENNESS/VAGRANCY/DISORDERLY CONDUCT": 20,
123
+ "EMBEZZLEMENT": 21,
124
+ "EMBEZZLEMENT (FEDERAL ONLY)": 22,
125
+ "ESCAPE FROM CUSTODY": 23,
126
+ "FAMILY RELATED OFFENSES": 24,
127
+ "FELONY - UNSPECIFIED": 25,
128
+ "FLIGHT TO AVOID PROSECUTION": 26,
129
+ "FORCIBLE SODOMY": 27,
130
+ "FORGERY (FEDERAL ONLY)": 28,
131
+ "FORGERY/FRAUD": 29,
132
+ "FRAUD (FEDERAL ONLY)": 30,
133
+ "GRAND LARCENY - THEFT OVER $200": 31,
134
+ "HABITUAL OFFENDER": 32,
135
+ "HEROIN VIOLATION - OFFENSE UNSPECIFIED": 33,
136
+ "HIT AND RUN DRIVING": 34,
137
+ "HIT/RUN DRIVING - PROPERTY DAMAGE": 35,
138
+ "IMMIGRATION VIOLATIONS": 36,
139
+ "INVASION OF PRIVACY": 37,
140
+ "JUVENILE OFFENSES": 38,
141
+ "KIDNAPPING": 39,
142
+ "LARCENY/THEFT - VALUE UNKNOWN": 40,
143
+ "LEWD ACT WITH CHILDREN": 41,
144
+ "LIQUOR LAW VIOLATIONS": 42,
145
+ "MANSLAUGHTER - NON-VEHICULAR": 43,
146
+ "MANSLAUGHTER - VEHICULAR": 44,
147
+ "MARIJUANA/HASHISH VIOLATION - OFFENSE UNSPECIFIED": 45,
148
+ "MISDEMEANOR UNSPECIFIED": 46,
149
+ "MORALS/DECENCY - OFFENSE": 47,
150
+ "MURDER": 48,
151
+ "OBSTRUCTION - LAW ENFORCEMENT": 49,
152
+ "OFFENSES AGAINST COURTS, LEGISLATURES, AND COMMISSIONS": 50,
153
+ "PAROLE VIOLATION": 51,
154
+ "PETTY LARCENY - THEFT UNDER $200": 52,
155
+ "POSSESSION/USE - COCAINE OR CRACK": 53,
156
+ "POSSESSION/USE - DRUG UNSPECIFIED": 54,
157
+ "POSSESSION/USE - HEROIN": 55,
158
+ "POSSESSION/USE - MARIJUANA/HASHISH": 56,
159
+ "POSSESSION/USE - OTHER CONTROLLED SUBSTANCES": 57,
160
+ "PROBATION VIOLATION": 58,
161
+ "PROPERTY OFFENSES - OTHER": 59,
162
+ "PUBLIC ORDER OFFENSES - OTHER": 60,
163
+ "RACKETEERING/EXTORTION (FEDERAL ONLY)": 61,
164
+ "RAPE - FORCE": 62,
165
+ "RAPE - STATUTORY - NO FORCE": 63,
166
+ "REGULATORY OFFENSES (FEDERAL ONLY)": 64,
167
+ "RIOTING": 65,
168
+ "SEXUAL ASSAULT - OTHER": 66,
169
+ "SIMPLE ASSAULT": 67,
170
+ "STOLEN PROPERTY - RECEIVING": 68,
171
+ "STOLEN PROPERTY - TRAFFICKING": 69,
172
+ "TAX LAW (FEDERAL ONLY)": 70,
173
+ "TRAFFIC OFFENSES - MINOR": 71,
174
+ "TRAFFICKING - COCAINE OR CRACK": 72,
175
+ "TRAFFICKING - DRUG UNSPECIFIED": 73,
176
+ "TRAFFICKING - HEROIN": 74,
177
+ "TRAFFICKING - OTHER CONTROLLED SUBSTANCES": 75,
178
+ "TRAFFICKING MARIJUANA/HASHISH": 76,
179
+ "TRESPASSING": 77,
180
+ "UNARMED ROBBERY": 78,
181
+ "UNAUTHORIZED USE OF VEHICLE": 79,
182
+ "UNSPECIFIED HOMICIDE": 80,
183
+ "VIOLENT OFFENSES - OTHER": 81,
184
+ "VOLUNTARY/NONNEGLIGENT MANSLAUGHTER": 82,
185
+ "WEAPON OFFENSE": 83
186
+ },
187
+ "layer_norm_eps": 1e-05,
188
+ "max_position_embeddings": 514,
189
+ "model_type": "roberta",
190
+ "num_attention_heads": 12,
191
+ "num_hidden_layers": 6,
192
+ "pad_token_id": 1,
193
+ "position_embedding_type": "absolute",
194
+ "transformers_version": "4.5.1",
195
+ "type_vocab_size": 1,
196
+ "use_cache": true,
197
+ "vocab_size": 50265
198
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model_args.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"adafactor_beta1": null, "adafactor_clip_threshold": 1.0, "adafactor_decay_rate": -0.8, "adafactor_eps": [1e-30, 0.001], "adafactor_relative_step": true, "adafactor_scale_parameter": true, "adafactor_warmup_init": true, "adam_epsilon": 1e-08, "best_model_dir": "outputs/best_model", "cache_dir": "cache_dir/", "config": {}, "cosine_schedule_num_cycles": 0.5, "custom_layer_parameters": [], "custom_parameter_groups": [], "dataloader_num_workers": 0, "do_lower_case": false, "dynamic_quantize": false, "early_stopping_consider_epochs": false, "early_stopping_delta": 0, "early_stopping_metric": "eval_loss", "early_stopping_metric_minimize": true, "early_stopping_patience": 3, "encoding": null, "eval_batch_size": 8, "evaluate_during_training": false, "evaluate_during_training_silent": true, "evaluate_during_training_steps": 2000, "evaluate_during_training_verbose": false, "evaluate_each_epoch": true, "fp16": true, "gradient_accumulation_steps": 1, "learning_rate": 4e-05, "local_rank": -1, "logging_steps": 50, "manual_seed": null, "max_grad_norm": 1.0, "max_seq_length": 128, "model_name": "./local/distilroberta-base", "model_type": "roberta", "multiprocessing_chunksize": -1, "n_gpu": 1, "no_cache": false, "no_save": false, "not_saved_args": [], "num_train_epochs": 3, "optimizer": "AdamW", "output_dir": "outputs/", "overwrite_output_dir": true, "polynomial_decay_schedule_lr_end": 1e-07, "polynomial_decay_schedule_power": 1.0, "process_count": 10, "quantized_model": false, "reprocess_input_data": true, "save_best_model": true, "save_eval_checkpoints": true, "save_model_every_epoch": false, "save_optimizer_and_scheduler": true, "save_steps": -1, "scheduler": "linear_schedule_with_warmup", "silent": false, "skip_special_tokens": true, "tensorboard_dir": null, "thread_count": null, "tokenizer_name": "./local/distilroberta-base", "tokenizer_type": null, "train_batch_size": 8, "train_custom_parameters_only": false, "use_cached_eval_features": false, "use_early_stopping": false, 
"use_hf_datasets": false, "use_multiprocessing": false, "use_multiprocessing_for_evaluation": true, "wandb_kwargs": {}, "wandb_project": null, "warmup_ratio": 0.06, "warmup_steps": 5870, "weight_decay": 0.0, "model_class": "ClassificationModel", "labels_list": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83], "labels_map": {}, "lazy_delimiter": "\t", "lazy_labels_column": 1, "lazy_loading": false, "lazy_loading_start_line": 1, "lazy_text_a_column": null, "lazy_text_b_column": null, "lazy_text_column": 0, "onnx": false, "regression": false, "sliding_window": false, "special_tokens_list": [], "stride": 0.8, "tie_value": 1}
onnx-convert-requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ fire==0.4.0
2
+ transformers==4.5.1
3
+ onnx==1.9.0
4
+ onnxruntime==1.7.0
5
+ torch==1.7.1
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c863f2c55a71623041440ab4366ed794ef4f6d874805adf05bee940c126b2b10
3
+ size 331143181
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "sep_token": "</s>", "pad_token": "<pad>", "cls_token": "<s>", "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": false}}
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": {"content": "<unk>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "bos_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "eos_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "add_prefix_space": false, "errors": "replace", "sep_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "cls_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "pad_token": {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "./local/distilroberta-base", "do_lower_case": false}
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5678323e8528e40ef5bcfef0a58bd4f141085c2cd8147a35c5f26fb2e9bd5f30
3
+ size 3311
utils/convert_onnx.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fire
2
+ from pathlib import Path
3
+ from typing import Optional
4
+
5
+ from transformers.convert_graph_to_onnx import convert, quantize
6
+
7
+
8
def convert_model(model: str, path: Optional[str] = None):
    """Export a saved model to ONNX, then quantize the exported file.

    Args:
        model: Location of the model to export; it is resolved to an
            absolute path before being passed to ``convert``.
        path: Target ``.onnx`` file. When omitted or empty, the target is
            ``onnx/<name of the current working directory>.onnx``.
    """
    if path:
        onnx_file = Path(path)
    else:
        # Name the export after the directory the script is run from.
        cwd_name = Path(".").resolve().name
        onnx_file = Path("onnx") / f"{cwd_name}.onnx"

    model_location = str(Path(model).resolve())
    convert(
        framework="pt",
        model=model_location,
        output=onnx_file,
        opset=11,
        pipeline_name="sentiment-analysis",  # needed for classification tasks
    )
    quantize(onnx_file)


# Command-line entry point: Fire exposes this module's callables as
# sub-commands, e.g. ``python convert_onnx.py convert_model <model> <path>``.
if __name__ == "__main__":
    fire.Fire()
24
+
utils/update_labels.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from pathlib import Path
3
+ from copy import copy
4
+
5
# Replace the placeholder label maps in config.json with the real label names
# taken from code_map.json, then write config.json back in place.
config = json.loads(Path("config.json").read_text())
code_map = json.loads(Path("code_map.json").read_text())  # label : ID

# id2label is keyed by the stringified ID and maps back to the label text.
code_map_inv_str = {str(v): str(k) for k, v in code_map.items()}  # id : label

# If two labels share an ID, the inversion above keeps only one of them and
# id2label would disagree with label2id; fail loudly rather than write an
# inconsistent config.
if len(code_map_inv_str) != len(code_map):
    raise ValueError(
        "code_map.json maps more than one label to the same ID; "
        "cannot build a one-to-one id2label mapping"
    )

config["id2label"] = code_map_inv_str
config["label2id"] = code_map

Path("config.json").write_text(json.dumps(config, indent=4))
vocab.json ADDED
The diff for this file is too large to render. See raw diff