Spaces:
Runtime error
Runtime error
AmitGarage
committed on
Commit
•
7cb24dc
1
Parent(s):
1ec71e5
Upload 2 files
Browse files- project.yml +173 -0
- requirements.txt +9 -0
project.yml
ADDED
@@ -0,0 +1,173 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
title: "Detecting entities in Medical Records with PyTorch"
|
2 |
+
description: "This project uses the [i2b2 (n2c2) 2011 Challenge Dataset](https://portal.dbmi.hms.harvard.edu/projects/n2c2-nlp/) to bootstrap a PyTorch NER model to detect entities in Medical Records. It also demonstrates how to anonymize medical records for annotators in [Prodigy](https://prodi.gy)."
|
3 |
+
# Variables can be referenced across the project.yml using ${vars.var_name}
|
4 |
+
vars:
|
5 |
+
beth_train_tarfile: i2b2_Beth_Train_Release.tar.gz
|
6 |
+
partners_train_tarfile: i2b2_Partners_Train_Release.tar.gz
|
7 |
+
test_zipfile: Task_1C.zip
|
8 |
+
spacy_config: "spacy_config.cfg"
|
9 |
+
config: "config.cfg"
|
10 |
+
config_trf: "config_trf.cfg"
|
11 |
+
config_trf_resume: "config_trf_resume.cfg"
|
12 |
+
config_trf_test: "config_trf_test.cfg"
|
13 |
+
name: "ner_pytorch_medical"
|
14 |
+
version: "0.0.0"
|
15 |
+
train: "train"
|
16 |
+
dev: "dev"
|
17 |
+
test: "test"
|
18 |
+
prodigy:
|
19 |
+
dataset: "pytorch_ner_medical_correct_anonymous"
|
20 |
+
source: "assets/mock_notes.jsonl"
|
21 |
+
labels: "person,problem,pronoun,test,treatment"
|
22 |
+
azure:
|
23 |
+
text_analytics_key: "YOUR_API_KEY"
|
24 |
+
text_analytics_base_url: "https://westus2.api.cognitive.microsoft.com/"
|
25 |
+
|
26 |
+
|
27 |
+
# These are the directories that the project needs. The project CLI will make
|
28 |
+
# sure that they always exist.
|
29 |
+
directories: ["assets", "training", "configs", "scripts", "corpus", "packages"]
|
30 |
+
|
31 |
+
# Assets that should be downloaded or available in the directory. You can replace
|
32 |
+
# this with your own input data.
|
33 |
+
assets:
|
34 |
+
- dest: "assets/n2c2_2011/${vars.beth_train_tarfile}"
|
35 |
+
description: "Tarfile containing original challenge data from the Beth training data split"
|
36 |
+
- dest: "assets/n2c2_2011/${vars.partners_train_tarfile}"
|
37 |
+
description: "Tarfile containing original challenge data from the Partners training data split"
|
38 |
+
- dest: "assets/n2c2_2011/${vars.test_zipfile}"
|
39 |
+
description: "Zipfile containing original challenge test data"
|
40 |
+
- dest: "assets/mock_notes.jsonl"
|
41 |
+
description: "JSONL file with raw mock notes to annotate in prodigy"
|
42 |
+
|
43 |
+
# Workflows are sequences of commands (see below) executed in order. You can
|
44 |
+
# run them via "spacy project run [workflow]". If a commands's inputs/outputs
|
45 |
+
# haven't changed, it won't be re-run.
|
46 |
+
workflows:
|
47 |
+
all:
|
48 |
+
- preprocess
|
49 |
+
- train
|
50 |
+
- evaluate
|
51 |
+
|
52 |
+
# Project commands, specified in a style similar to CI config files (e.g. Azure
|
53 |
+
# pipelines). The name is the command name that lets you trigger the command
|
54 |
+
# via "spacy project run [command] [path]". The help message is optional and
|
55 |
+
# shown when executing "spacy project run [optional command] [path] --help".
|
56 |
+
commands:
|
57 |
+
- name: "preprocess"
|
58 |
+
help: "Convert the data to spaCy's binary format"
|
59 |
+
script:
|
60 |
+
- "python scripts/preprocess.py assets/n2c2_2011 corpus"
|
61 |
+
deps:
|
62 |
+
- "assets/n2c2_2011/${vars.beth_train_tarfile}"
|
63 |
+
- "assets/n2c2_2011/${vars.partners_train_tarfile}"
|
64 |
+
- "assets/n2c2_2011/${vars.test_zipfile}"
|
65 |
+
- "scripts/preprocess.py"
|
66 |
+
outputs:
|
67 |
+
- "corpus/${vars.train}.spacy"
|
68 |
+
- "corpus/${vars.dev}.spacy"
|
69 |
+
- "corpus/${vars.test}.spacy"
|
70 |
+
|
71 |
+
- name: "train"
|
72 |
+
help: "Train a custom PyTorch named entity recognition model"
|
73 |
+
script:
|
74 |
+
- "python -m spacy train configs/${vars.config} --output training/ --paths.train corpus/${vars.train}.spacy --paths.dev corpus/${vars.dev}.spacy --code scripts/custom_functions.py --gpu-id 0"
|
75 |
+
deps:
|
76 |
+
- "corpus/${vars.train}.spacy"
|
77 |
+
- "corpus/${vars.dev}.spacy"
|
78 |
+
outputs:
|
79 |
+
- "training/model-best"
|
80 |
+
|
81 |
+
- name: "train-trf"
|
82 |
+
help: "Train a custom PyTorch named entity recognition model with transformer"
|
83 |
+
script:
|
84 |
+
- "python -m spacy train configs/${vars.config_trf} --output training_trf/ --paths.train corpus/${vars.train}.spacy --paths.dev corpus/${vars.dev}.spacy --code scripts/custom_functions.py --gpu-id 0"
|
85 |
+
deps:
|
86 |
+
- "corpus/${vars.train}.spacy"
|
87 |
+
- "corpus/${vars.dev}.spacy"
|
88 |
+
outputs:
|
89 |
+
- "training_trf/model-best"
|
90 |
+
|
91 |
+
- name: "train-trf-test"
|
92 |
+
help: "Train a custom PyTorch named entity recognition model with transformer"
|
93 |
+
script:
|
94 |
+
- "python -m spacy train configs/${vars.config_trf_test} --output training_trf_test/ --paths.train corpus/${vars.train}.spacy --paths.dev corpus/${vars.dev}.spacy --code scripts/custom_functions.py --gpu-id 0"
|
95 |
+
deps:
|
96 |
+
- "corpus/${vars.train}.spacy"
|
97 |
+
- "corpus/${vars.dev}.spacy"
|
98 |
+
outputs:
|
99 |
+
- "training_trf_test/model-best"
|
100 |
+
|
101 |
+
- name: "train-trf-resume"
|
102 |
+
help: "Train a custom PyTorch named entity recognition model with transformer"
|
103 |
+
script:
|
104 |
+
- "python -m spacy train configs/${vars.config_trf_resume} --output training_trf/ --paths.train corpus/${vars.train}.spacy --paths.dev corpus/${vars.dev}.spacy --code scripts/custom_functions.py --gpu-id 0"
|
105 |
+
deps:
|
106 |
+
- "corpus/${vars.train}.spacy"
|
107 |
+
- "corpus/${vars.dev}.spacy"
|
108 |
+
outputs:
|
109 |
+
- "training_trf/model-best"
|
110 |
+
|
111 |
+
- name: "evaluate"
|
112 |
+
help: "Evaluate the custom PyTorch model and export metrics"
|
113 |
+
script:
|
114 |
+
- "python -m spacy evaluate training/model-best corpus/${vars.test}.spacy --output training/metrics.json --code scripts/custom_functions.py"
|
115 |
+
deps:
|
116 |
+
- "corpus/${vars.test}.spacy"
|
117 |
+
- "training/model-best"
|
118 |
+
outputs:
|
119 |
+
- "training/metrics.json"
|
120 |
+
|
121 |
+
- name: "evaluate-trf"
|
122 |
+
help: "Evaluate the custom PyTorch model and export metrics"
|
123 |
+
script:
|
124 |
+
- "python -m spacy evaluate training_trf/model-best corpus/${vars.test}.spacy --output training_trf/metrics.json --code scripts/custom_functions.py --gpu-id 0"
|
125 |
+
deps:
|
126 |
+
- "corpus/${vars.test}.spacy"
|
127 |
+
- "training_trf/model-best"
|
128 |
+
outputs:
|
129 |
+
- "training_trf/metrics.json"
|
130 |
+
|
131 |
+
- name: "evaluate-trf-test"
|
132 |
+
help: "Evaluate the custom PyTorch model and export metrics"
|
133 |
+
script:
|
134 |
+
- "python -m spacy evaluate training_trf_test/model-best corpus/${vars.test}.spacy --output training_trf_test/metrics.json --code scripts/custom_functions.py --gpu-id 0"
|
135 |
+
deps:
|
136 |
+
- "corpus/${vars.test}.spacy"
|
137 |
+
- "training_trf_test/model-best"
|
138 |
+
outputs:
|
139 |
+
- "training_trf_test/metrics.json"
|
140 |
+
|
141 |
+
- name: package
|
142 |
+
help: "Package the trained model so it can be installed"
|
143 |
+
script:
|
144 |
+
- "python -m spacy package training/model-best packages --name ${vars.name} --version ${vars.version} --force --code scripts/custom_functions.py"
|
145 |
+
deps:
|
146 |
+
- "training/model-best"
|
147 |
+
outputs_no_cache:
|
148 |
+
- "packages/en_${vars.name}-${vars.version}/dist/en_${vars.name}-${vars.version}.tar.gz"
|
149 |
+
|
150 |
+
- name: package-trf
|
151 |
+
help: "Package the trained model so it can be installed"
|
152 |
+
script:
|
153 |
+
- "python -m spacy package training_trf/model-best packages --name ${vars.name} --version ${vars.version} --force --code scripts/torch_ner_model.py,scripts/torch_ner_pipe.py"
|
154 |
+
deps:
|
155 |
+
- "training_trf/model-best"
|
156 |
+
outputs_no_cache:
|
157 |
+
- "packages/en_${vars.name}-${vars.version}/dist/en_${vars.name}-${vars.version}.tar.gz"
|
158 |
+
|
159 |
+
- name: visualize-model
|
160 |
+
help: Visualize the model's output interactively using Streamlit
|
161 |
+
script:
|
162 |
+
- "streamlit run scripts/visualize_model.py training_trf/model-best \"The patient had surgery.\""
|
163 |
+
deps:
|
164 |
+
- "scripts/visualize_model.py"
|
165 |
+
- "training/model-best"
|
166 |
+
|
167 |
+
- name: annotate
|
168 |
+
help: Run the custom prodigy recipe to anonymize data for the annotator and update the PyTorch NER model
|
169 |
+
script:
|
170 |
+
- "prodigy ner.correct.anonymous ${vars.prodigy.dataset} training/model-best ${vars.prodigy.source} --text-analytics-key ${vars.azure.text_analytics_key} --text-analytics-base-url ${vars.azure.text_analytics_base_url} --label ${vars.prodigy.labels} --update -F scripts/prodigy/recipes.py"
|
171 |
+
deps:
|
172 |
+
- "scripts/prodigy/recipes.py"
|
173 |
+
- "training/model-best"
|
requirements.txt
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
spacy-streamlit>=1.0.0a0
|
2 |
+
spacy-transformers
|
3 |
+
torch==1.8.1
|
4 |
+
streamlit
|
5 |
+
presidio-analyzer==2.2.1
|
6 |
+
presidio-anonymizer==2.2.1
|
7 |
+
spacy==3.0.5
|
8 |
+
spacy[transformers]==3.0.5
|
9 |
+
cupy-cuda11x
|