File size: 7,706 Bytes
7cb24dc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
# Project metadata: a short title plus a longer Markdown description.
title: 'Detecting entities in Medical Records with PyTorch'
# Folded scalar: the line breaks below collapse to single spaces, so the parsed
# value is one long line.
description: >-
  This project uses the
  [i2b2 (n2c2) 2011 Challenge Dataset](https://portal.dbmi.hms.harvard.edu/projects/n2c2-nlp/)
  to bootstrap a PyTorch NER model to detect entities in Medical Records.
  It also demonstrates how to anonymize medical records for annotators in
  [Prodigy](https://prodi.gy).
# Values defined here can be referenced anywhere in this file as
# ${vars.var_name} (nested keys as ${vars.group.key}).
vars:
  # File names of the challenge archives expected under assets/n2c2_2011/.
  beth_train_tarfile: 'i2b2_Beth_Train_Release.tar.gz'
  partners_train_tarfile: 'i2b2_Partners_Train_Release.tar.gz'
  test_zipfile: 'Task_1C.zip'
  # Training config file names; the train commands look for them in configs/.
  # NOTE(review): spacy_config is not referenced by any command in this file.
  spacy_config: 'spacy_config.cfg'
  config: 'config.cfg'
  config_trf: 'config_trf.cfg'
  config_trf_resume: 'config_trf_resume.cfg'
  config_trf_test: 'config_trf_test.cfg'
  # Name and version handed to "spacy package".
  name: 'ner_pytorch_medical'
  version: '0.0.0'
  # Base names of the corpus/<split>.spacy files.
  train: 'train'
  dev: 'dev'
  test: 'test'
  # Settings for the custom Prodigy recipe run by the "annotate" command.
  prodigy:
    dataset: 'pytorch_ner_medical_correct_anonymous'
    source: 'assets/mock_notes.jsonl'
    labels: 'person,problem,pronoun,test,treatment'
  # Azure Text Analytics settings forwarded to the Prodigy recipe.
  azure:
    # Placeholder only: supply a real key locally and keep it out of version
    # control.
    text_analytics_key: 'YOUR_API_KEY'
    text_analytics_base_url: 'https://westus2.api.cognitive.microsoft.com/'


# Directories the project relies on. The project CLI creates any that are
# missing, so they always exist.
directories:
  - 'assets'
  - 'training'
  - 'configs'
  - 'scripts'
  - 'corpus'
  - 'packages'

# Input data that should be downloaded into, or already present in, the
# project directory. Swap these entries for your own data as needed.
# None of the entries below declares a download URL.
assets:
  - dest: 'assets/n2c2_2011/${vars.beth_train_tarfile}'
    description: 'Tarfile containing original challenge data from the Beth training data split'
  - dest: 'assets/n2c2_2011/${vars.partners_train_tarfile}'
    description: 'Tarfile containing original challenge data from the Partners training data split'
  - dest: 'assets/n2c2_2011/${vars.test_zipfile}'
    description: 'Zipfile containing original challenge test data'
  - dest: 'assets/mock_notes.jsonl'
    description: 'JSONL file with raw mock notes to annotate in prodigy'

# A workflow is an ordered sequence of the commands defined below. Run one via
# "spacy project run [workflow]". A command whose inputs/outputs haven't
# changed is not re-run.
workflows:
  # Chains the plain preprocess/train/evaluate commands (not the -trf variants).
  all:
    - 'preprocess'
    - 'train'
    - 'evaluate'

# Project commands, specified in a style similar to CI config files (e.g. Azure
# pipelines). The name is the command name that lets you trigger the command
# via "spacy project run [command] [path]". The help message is optional and
# shown when executing "spacy project run [optional command] [path] --help".
commands:
  # Runs scripts/preprocess.py with assets/n2c2_2011 and corpus as arguments.
  # Declares the three challenge archives as inputs and the train/dev/test
  # .spacy files as outputs.
  - name: 'preprocess'
    help: "Convert the data to spaCy's binary format"
    script:
      - 'python scripts/preprocess.py assets/n2c2_2011 corpus'
    deps:
      - 'assets/n2c2_2011/${vars.beth_train_tarfile}'
      - 'assets/n2c2_2011/${vars.partners_train_tarfile}'
      - 'assets/n2c2_2011/${vars.test_zipfile}'
      - 'scripts/preprocess.py'
    outputs:
      - 'corpus/${vars.train}.spacy'
      - 'corpus/${vars.dev}.spacy'
      - 'corpus/${vars.test}.spacy'

  # Trains with configs/${vars.config} on GPU 0 and writes to training/.
  # The script is a folded scalar: it parses to a single command line.
  - name: 'train'
    help: 'Train a custom PyTorch named entity recognition model'
    script:
      - >-
        python -m spacy train configs/${vars.config}
        --output training/
        --paths.train corpus/${vars.train}.spacy
        --paths.dev corpus/${vars.dev}.spacy
        --code scripts/custom_functions.py
        --gpu-id 0
    deps:
      - 'corpus/${vars.train}.spacy'
      - 'corpus/${vars.dev}.spacy'
    outputs:
      - 'training/model-best'
    
  # Trains with configs/${vars.config_trf} on GPU 0 and writes to
  # training_trf/. The script is a folded scalar: it parses to a single
  # command line.
  - name: 'train-trf'
    help: 'Train a custom PyTorch named entity recognition model with transformer'
    script:
      - >-
        python -m spacy train configs/${vars.config_trf}
        --output training_trf/
        --paths.train corpus/${vars.train}.spacy
        --paths.dev corpus/${vars.dev}.spacy
        --code scripts/custom_functions.py
        --gpu-id 0
    deps:
      - 'corpus/${vars.train}.spacy'
      - 'corpus/${vars.dev}.spacy'
    outputs:
      - 'training_trf/model-best'

  # Trains with configs/${vars.config_trf_test} on GPU 0 and writes to
  # training_trf_test/. The script is a folded scalar: it parses to a single
  # command line.
  - name: 'train-trf-test'
    help: 'Train a custom PyTorch named entity recognition model with transformer'
    script:
      - >-
        python -m spacy train configs/${vars.config_trf_test}
        --output training_trf_test/
        --paths.train corpus/${vars.train}.spacy
        --paths.dev corpus/${vars.dev}.spacy
        --code scripts/custom_functions.py
        --gpu-id 0
    deps:
      - 'corpus/${vars.train}.spacy'
      - 'corpus/${vars.dev}.spacy'
    outputs:
      # Must match the --output directory above; "evaluate-trf-test" depends on
      # this exact path.
      - 'training_trf_test/model-best'
      
  # Trains with configs/${vars.config_trf_resume} on GPU 0. It writes to
  # training_trf/, the same output directory as "train-trf". The script is a
  # folded scalar: it parses to a single command line.
  - name: 'train-trf-resume'
    help: 'Train a custom PyTorch named entity recognition model with transformer'
    script:
      - >-
        python -m spacy train configs/${vars.config_trf_resume}
        --output training_trf/
        --paths.train corpus/${vars.train}.spacy
        --paths.dev corpus/${vars.dev}.spacy
        --code scripts/custom_functions.py
        --gpu-id 0
    deps:
      - 'corpus/${vars.train}.spacy'
      - 'corpus/${vars.dev}.spacy'
    outputs:
      - 'training_trf/model-best'

  # Scores training/model-best on the test corpus and writes the metrics to
  # training/metrics.json. Unlike the -trf evaluate commands, no --gpu-id is
  # passed here.
  - name: 'evaluate'
    help: 'Evaluate the custom PyTorch model and export metrics'
    script:
      - >-
        python -m spacy evaluate training/model-best
        corpus/${vars.test}.spacy
        --output training/metrics.json
        --code scripts/custom_functions.py
    deps:
      - 'corpus/${vars.test}.spacy'
      - 'training/model-best'
    outputs:
      - 'training/metrics.json'
  
  # Scores training_trf/model-best on the test corpus (GPU 0) and writes the
  # metrics to training_trf/metrics.json.
  - name: 'evaluate-trf'
    help: 'Evaluate the custom PyTorch model and export metrics'
    script:
      - >-
        python -m spacy evaluate training_trf/model-best
        corpus/${vars.test}.spacy
        --output training_trf/metrics.json
        --code scripts/custom_functions.py
        --gpu-id 0
    deps:
      - 'corpus/${vars.test}.spacy'
      - 'training_trf/model-best'
    outputs:
      - 'training_trf/metrics.json'
 
  # Scores training_trf_test/model-best on the test corpus (GPU 0) and writes
  # the metrics to training_trf_test/metrics.json.
  - name: 'evaluate-trf-test'
    help: 'Evaluate the custom PyTorch model and export metrics'
    script:
      - >-
        python -m spacy evaluate training_trf_test/model-best
        corpus/${vars.test}.spacy
        --output training_trf_test/metrics.json
        --code scripts/custom_functions.py
        --gpu-id 0
    deps:
      - 'corpus/${vars.test}.spacy'
      - 'training_trf_test/model-best'
    outputs:
      - 'training_trf_test/metrics.json'

  # Packages training/model-best into packages/ under the name and version
  # taken from vars, bundling scripts/custom_functions.py via --code.
  - name: 'package'
    help: 'Package the trained model so it can be installed'
    script:
      - >-
        python -m spacy package training/model-best packages
        --name ${vars.name}
        --version ${vars.version}
        --force
        --code scripts/custom_functions.py
    deps:
      - 'training/model-best'
    outputs_no_cache:
      - 'packages/en_${vars.name}-${vars.version}/dist/en_${vars.name}-${vars.version}.tar.gz'

  # Packages training_trf/model-best into packages/ under the name and version
  # taken from vars.
  # NOTE(review): the declared archive path is identical to the one declared by
  # the "package" command, so with --force one build presumably replaces the
  # other - confirm this is intended.
  # NOTE(review): --code lists torch_ner_model.py and torch_ner_pipe.py, whereas
  # every other command passes scripts/custom_functions.py - confirm.
  - name: 'package-trf'
    help: 'Package the trained model so it can be installed'
    script:
      - >-
        python -m spacy package training_trf/model-best packages
        --name ${vars.name}
        --version ${vars.version}
        --force
        --code scripts/torch_ner_model.py,scripts/torch_ner_pipe.py
    deps:
      - 'training_trf/model-best'
    outputs_no_cache:
      - 'packages/en_${vars.name}-${vars.version}/dist/en_${vars.name}-${vars.version}.tar.gz'

  # Launches the Streamlit visualizer script on training_trf/model-best with a
  # sample sentence as the default text.
  - name: 'visualize-model'
    help: "Visualize the model's output interactively using Streamlit"
    script:
      - 'streamlit run scripts/visualize_model.py training_trf/model-best "The patient had surgery."'
    deps:
      - 'scripts/visualize_model.py'
      # Must name the model the script actually loads (training_trf/, not
      # training/), so the dependency check covers the right directory.
      - 'training_trf/model-best'

  # Runs the custom "ner.correct.anonymous" Prodigy recipe from
  # scripts/prodigy/recipes.py against training/model-best. Dataset, source
  # file, labels and the Azure Text Analytics settings all come from vars.
  # The script is a folded scalar: it parses to a single command line.
  - name: 'annotate'
    help: 'Run the custom prodigy recipe to anonymize data for the annotator and update the PyTorch NER model'
    script:
      # --label needs the "${...}" form; without the "$" the text is not
      # interpolated and the recipe receives the literal placeholder.
      - >-
        prodigy ner.correct.anonymous ${vars.prodigy.dataset}
        training/model-best
        ${vars.prodigy.source}
        --text-analytics-key ${vars.azure.text_analytics_key}
        --text-analytics-base-url ${vars.azure.text_analytics_base_url}
        --label ${vars.prodigy.labels}
        --update
        -F scripts/prodigy/recipes.py
    deps:
      - 'scripts/prodigy/recipes.py'
      - 'training/model-best'