AmitGarage committed on
Commit
7cb24dc
1 Parent(s): 1ec71e5

Upload 2 files

Browse files
Files changed (2) hide show
  1. project.yml +173 -0
  2. requirements.txt +9 -0
project.yml ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ title: "Detecting entities in Medical Records with PyTorch"
2
+ description: "This project uses the [i2b2 (n2c2) 2011 Challenge Dataset](https://portal.dbmi.hms.harvard.edu/projects/n2c2-nlp/) to bootstrap a PyTorch NER model to detect entities in Medical Records. It also demonstrates how to anonymize medical records for annotators in [Prodigy](https://prodi.gy)."
3
+ # Variables can be referenced across the project.yml using ${vars.var_name}
4
+ vars:
5
+ beth_train_tarfile: i2b2_Beth_Train_Release.tar.gz
6
+ partners_train_tarfile: i2b2_Partners_Train_Release.tar.gz
7
+ test_zipfile: Task_1C.zip
8
+ spacy_config: "spacy_config.cfg"
9
+ config: "config.cfg"
10
+ config_trf: "config_trf.cfg"
11
+ config_trf_resume: "config_trf_resume.cfg"
12
+ config_trf_test: "config_trf_test.cfg"
13
+ name: "ner_pytorch_medical"
14
+ version: "0.0.0"
15
+ train: "train"
16
+ dev: "dev"
17
+ test: "test"
18
+ prodigy:
19
+ dataset: "pytorch_ner_medical_correct_anonymous"
20
+ source: "assets/mock_notes.jsonl"
21
+ labels: "person,problem,pronoun,test,treatment"
22
+ azure:
23
+ text_analytics_key: "YOUR_API_KEY"
24
+ text_analytics_base_url: "https://westus2.api.cognitive.microsoft.com/"
25
+
26
+
27
+ # These are the directories that the project needs. The project CLI will make
28
+ # sure that they always exist.
29
+ directories: ["assets", "training", "configs", "scripts", "corpus", "packages"]
30
+
31
+ # Assets that should be downloaded or available in the directory. You can replace
32
+ # this with your own input data.
33
+ assets:
34
+ - dest: "assets/n2c2_2011/${vars.beth_train_tarfile}"
35
+ description: "Tarfile containing original challenge data from the Beth training data split"
36
+ - dest: "assets/n2c2_2011/${vars.partners_train_tarfile}"
37
+ description: "Tarfile containing original challenge data from the Partners training data split"
38
+ - dest: "assets/n2c2_2011/${vars.test_zipfile}"
39
+ description: "Zipfile containing original challenge test data"
40
+ - dest: "assets/mock_notes.jsonl"
41
+ description: "JSONL file with raw mock notes to annotate in prodigy"
42
+
43
+ # Workflows are sequences of commands (see below) executed in order. You can
44
 + # run them via "spacy project run [workflow]". If a command's inputs/outputs
45
+ # haven't changed, it won't be re-run.
46
+ workflows:
47
+ all:
48
+ - preprocess
49
+ - train
50
+ - evaluate
51
+
52
+ # Project commands, specified in a style similar to CI config files (e.g. Azure
53
+ # pipelines). The name is the command name that lets you trigger the command
54
+ # via "spacy project run [command] [path]". The help message is optional and
55
+ # shown when executing "spacy project run [optional command] [path] --help".
56
+ commands:
57
+ - name: "preprocess"
58
+ help: "Convert the data to spaCy's binary format"
59
+ script:
60
+ - "python scripts/preprocess.py assets/n2c2_2011 corpus"
61
+ deps:
62
+ - "assets/n2c2_2011/${vars.beth_train_tarfile}"
63
+ - "assets/n2c2_2011/${vars.partners_train_tarfile}"
64
+ - "assets/n2c2_2011/${vars.test_zipfile}"
65
+ - "scripts/preprocess.py"
66
+ outputs:
67
+ - "corpus/${vars.train}.spacy"
68
+ - "corpus/${vars.dev}.spacy"
69
+ - "corpus/${vars.test}.spacy"
70
+
71
+ - name: "train"
72
+ help: "Train a custom PyTorch named entity recognition model"
73
+ script:
74
+ - "python -m spacy train configs/${vars.config} --output training/ --paths.train corpus/${vars.train}.spacy --paths.dev corpus/${vars.dev}.spacy --code scripts/custom_functions.py --gpu-id 0"
75
+ deps:
76
+ - "corpus/${vars.train}.spacy"
77
+ - "corpus/${vars.dev}.spacy"
78
+ outputs:
79
+ - "training/model-best"
80
+
81
+ - name: "train-trf"
82
+ help: "Train a custom PyTorch named entity recognition model with transformer"
83
+ script:
84
+ - "python -m spacy train configs/${vars.config_trf} --output training_trf/ --paths.train corpus/${vars.train}.spacy --paths.dev corpus/${vars.dev}.spacy --code scripts/custom_functions.py --gpu-id 0"
85
+ deps:
86
+ - "corpus/${vars.train}.spacy"
87
+ - "corpus/${vars.dev}.spacy"
88
+ outputs:
89
+ - "training_trf/model-best"
90
+
91
+ - name: "train-trf-test"
92
 + help: "Train a custom PyTorch named entity recognition model with transformer (test configuration)"
93
+ script:
94
+ - "python -m spacy train configs/${vars.config_trf_test} --output training_trf_test/ --paths.train corpus/${vars.train}.spacy --paths.dev corpus/${vars.dev}.spacy --code scripts/custom_functions.py --gpu-id 0"
95
+ deps:
96
+ - "corpus/${vars.train}.spacy"
97
+ - "corpus/${vars.dev}.spacy"
98
+ outputs:
99
 + - "training_trf_test/model-best"
100
+
101
+ - name: "train-trf-resume"
102
 + help: "Resume training a custom PyTorch named entity recognition model with transformer"
103
+ script:
104
+ - "python -m spacy train configs/${vars.config_trf_resume} --output training_trf/ --paths.train corpus/${vars.train}.spacy --paths.dev corpus/${vars.dev}.spacy --code scripts/custom_functions.py --gpu-id 0"
105
+ deps:
106
+ - "corpus/${vars.train}.spacy"
107
+ - "corpus/${vars.dev}.spacy"
108
+ outputs:
109
+ - "training_trf/model-best"
110
+
111
+ - name: "evaluate"
112
+ help: "Evaluate the custom PyTorch model and export metrics"
113
+ script:
114
+ - "python -m spacy evaluate training/model-best corpus/${vars.test}.spacy --output training/metrics.json --code scripts/custom_functions.py"
115
+ deps:
116
+ - "corpus/${vars.test}.spacy"
117
+ - "training/model-best"
118
+ outputs:
119
+ - "training/metrics.json"
120
+
121
+ - name: "evaluate-trf"
122
+ help: "Evaluate the custom PyTorch model and export metrics"
123
+ script:
124
+ - "python -m spacy evaluate training_trf/model-best corpus/${vars.test}.spacy --output training_trf/metrics.json --code scripts/custom_functions.py --gpu-id 0"
125
+ deps:
126
+ - "corpus/${vars.test}.spacy"
127
+ - "training_trf/model-best"
128
+ outputs:
129
+ - "training_trf/metrics.json"
130
+
131
+ - name: "evaluate-trf-test"
132
+ help: "Evaluate the custom PyTorch model and export metrics"
133
+ script:
134
+ - "python -m spacy evaluate training_trf_test/model-best corpus/${vars.test}.spacy --output training_trf_test/metrics.json --code scripts/custom_functions.py --gpu-id 0"
135
+ deps:
136
+ - "corpus/${vars.test}.spacy"
137
+ - "training_trf_test/model-best"
138
+ outputs:
139
+ - "training_trf_test/metrics.json"
140
+
141
+ - name: package
142
+ help: "Package the trained model so it can be installed"
143
+ script:
144
+ - "python -m spacy package training/model-best packages --name ${vars.name} --version ${vars.version} --force --code scripts/custom_functions.py"
145
+ deps:
146
+ - "training/model-best"
147
+ outputs_no_cache:
148
+ - "packages/en_${vars.name}-${vars.version}/dist/en_${vars.name}-${vars.version}.tar.gz"
149
+
150
+ - name: package-trf
151
+ help: "Package the trained model so it can be installed"
152
+ script:
153
+ - "python -m spacy package training_trf/model-best packages --name ${vars.name} --version ${vars.version} --force --code scripts/torch_ner_model.py,scripts/torch_ner_pipe.py"
154
+ deps:
155
+ - "training_trf/model-best"
156
+ outputs_no_cache:
157
+ - "packages/en_${vars.name}-${vars.version}/dist/en_${vars.name}-${vars.version}.tar.gz"
158
+
159
+ - name: visualize-model
160
+ help: Visualize the model's output interactively using Streamlit
161
+ script:
162
+ - "streamlit run scripts/visualize_model.py training_trf/model-best \"The patient had surgery.\""
163
+ deps:
164
+ - "scripts/visualize_model.py"
165
 + - "training_trf/model-best"
166
+
167
+ - name: annotate
168
+ help: Run the custom prodigy recipe to anonymize data for the annotator and update the PyTorch NER model
169
+ script:
170
 + - "prodigy ner.correct.anonymous ${vars.prodigy.dataset} training/model-best ${vars.prodigy.source} --text-analytics-key ${vars.azure.text_analytics_key} --text-analytics-base-url ${vars.azure.text_analytics_base_url} --label ${vars.prodigy.labels} --update -F scripts/prodigy/recipes.py"
171
+ deps:
172
+ - "scripts/prodigy/recipes.py"
173
+ - "training/model-best"
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ spacy-streamlit>=1.0.0a0
2
+ spacy-transformers
3
+ torch==1.8.1
4
+ streamlit
5
+ presidio-analyzer==2.2.1
6
+ presidio-anonymizer==2.2.1
7
+ spacy==3.0.5
8
 + spacy[transformers]==3.0.5
9
+ cupy-cuda11x