---
title: "Detecting entities in Medical Records with PyTorch"
description: "This project uses the [i2b2 (n2c2) 2011 Challenge Dataset](https://portal.dbmi.hms.harvard.edu/projects/n2c2-nlp/) to bootstrap a PyTorch NER model to detect entities in Medical Records. It also demonstrates how to anonymize medical records for annotators in [Prodigy](https://prodi.gy)."

# Variables can be referenced across the project.yml using ${vars.var_name}
vars:
  # Raw challenge-data archives (expected under assets/n2c2_2011/; see `assets`)
  beth_train_tarfile: i2b2_Beth_Train_Release.tar.gz
  partners_train_tarfile: i2b2_Partners_Train_Release.tar.gz
  test_zipfile: Task_1C.zip
  # Training configs for the different pipeline variants (in configs/)
  spacy_config: "spacy_config.cfg"
  config: "config.cfg"
  config_trf: "config_trf.cfg"
  config_trf_resume: "config_trf_resume.cfg"
  config_trf_test: "config_trf_test.cfg"
  # Packaged-model name and version, used by the `package*` commands
  name: "ner_pytorch_medical"
  version: "0.0.0"
  # Corpus split basenames — preprocess emits corpus/<split>.spacy
  train: "train"
  dev: "dev"
  test: "test"
  # Settings for the `annotate` command (custom Prodigy recipe)
  prodigy:
    dataset: "pytorch_ner_medical_correct_anonymous"
    source: "assets/mock_notes.jsonl"
    labels: "person,problem,pronoun,test,treatment"
  # Azure Text Analytics credentials used by the anonymizing Prodigy recipe
  azure:
    # NOTE(review): placeholder value — supply a real key at run time; avoid
    # committing real secrets to version control
    text_analytics_key: "YOUR_API_KEY"
    text_analytics_base_url: "https://westus2.api.cognitive.microsoft.com/"
# These are the directories that the project needs. The project CLI will make
# sure that they always exist.
directories: ["assets", "training", "configs", "scripts", "corpus", "packages"]

# Assets that should be downloaded or available in the directory. You can replace
# this with your own input data.
# NOTE(review): no `url` is given for the n2c2 archives — presumably the
# i2b2/n2c2 data is gated behind a data use agreement and must be downloaded
# manually into assets/n2c2_2011/ — TODO confirm against the project README.
assets:
  - dest: "assets/n2c2_2011/${vars.beth_train_tarfile}"
    description: "Tarfile containing original challenge data from the Beth training data split"
  - dest: "assets/n2c2_2011/${vars.partners_train_tarfile}"
    description: "Tarfile containing original challenge data from the Partners training data split"
  - dest: "assets/n2c2_2011/${vars.test_zipfile}"
    description: "Zipfile containing original challenge test data"
  - dest: "assets/mock_notes.jsonl"
    description: "JSONL file with raw mock notes to annotate in prodigy"
# Workflows are sequences of commands (see below) executed in order. You can
# run them via "spacy project run [workflow]". If a command's inputs/outputs
# haven't changed, it won't be re-run.
# NOTE: "all" covers only the non-transformer pipeline; the *-trf commands
# below are run individually.
workflows:
  all:
    - preprocess
    - train
    - evaluate
# Project commands, specified in a style similar to CI config files (e.g. Azure
# pipelines). The name is the command name that lets you trigger the command
# via "spacy project run [command] [path]". The help message is optional and
# shown when executing "spacy project run [optional command] [path] --help".
commands:
  # Unpack the n2c2 archives and convert them into train/dev/test .spacy corpora
  - name: "preprocess"
    help: "Convert the data to spaCy's binary format"
    script:
      - "python scripts/preprocess.py assets/n2c2_2011 corpus"
    deps:
      - "assets/n2c2_2011/${vars.beth_train_tarfile}"
      - "assets/n2c2_2011/${vars.partners_train_tarfile}"
      - "assets/n2c2_2011/${vars.test_zipfile}"
      - "scripts/preprocess.py"
    outputs:
      - "corpus/${vars.train}.spacy"
      - "corpus/${vars.dev}.spacy"
      - "corpus/${vars.test}.spacy"
  # Train the base pipeline; custom PyTorch components are registered via --code
  - name: "train"
    help: "Train a custom PyTorch named entity recognition model"
    script:
      - "python -m spacy train configs/${vars.config} --output training/ --paths.train corpus/${vars.train}.spacy --paths.dev corpus/${vars.dev}.spacy --code scripts/custom_functions.py --gpu-id 0"
    deps:
      - "corpus/${vars.train}.spacy"
      - "corpus/${vars.dev}.spacy"
    outputs:
      - "training/model-best"
  # Same as "train" but with the transformer config, writing to training_trf/
  - name: "train-trf"
    help: "Train a custom PyTorch named entity recognition model with transformer"
    script:
      - "python -m spacy train configs/${vars.config_trf} --output training_trf/ --paths.train corpus/${vars.train}.spacy --paths.dev corpus/${vars.dev}.spacy --code scripts/custom_functions.py --gpu-id 0"
    deps:
      - "corpus/${vars.train}.spacy"
      - "corpus/${vars.dev}.spacy"
    outputs:
      - "training_trf/model-best"
- name: "train-trf-test"
help: "Train a custom PyTorch named entity recognition model with transformer"
script:
- "python -m spacy train configs/${vars.config_trf_test} --output training_trf_test/ --paths.train corpus/${vars.train}.spacy --paths.dev corpus/${vars.dev}.spacy --code scripts/custom_functions.py --gpu-id 0"
deps:
- "corpus/${vars.train}.spacy"
- "corpus/${vars.dev}.spacy"
outputs:
- "training_trf/model-best"
  # Resume transformer training from an existing checkpoint in training_trf/
  - name: "train-trf-resume"
    help: "Train a custom PyTorch named entity recognition model with transformer"
    script:
      - "python -m spacy train configs/${vars.config_trf_resume} --output training_trf/ --paths.train corpus/${vars.train}.spacy --paths.dev corpus/${vars.dev}.spacy --code scripts/custom_functions.py --gpu-id 0"
    deps:
      - "corpus/${vars.train}.spacy"
      - "corpus/${vars.dev}.spacy"
    outputs:
      - "training_trf/model-best"
  # Evaluate the base model on the held-out test split (no --gpu-id: runs on CPU)
  - name: "evaluate"
    help: "Evaluate the custom PyTorch model and export metrics"
    script:
      - "python -m spacy evaluate training/model-best corpus/${vars.test}.spacy --output training/metrics.json --code scripts/custom_functions.py"
    deps:
      - "corpus/${vars.test}.spacy"
      - "training/model-best"
    outputs:
      - "training/metrics.json"
  # Evaluate the transformer model on the held-out test split (GPU)
  - name: "evaluate-trf"
    help: "Evaluate the custom PyTorch model and export metrics"
    script:
      - "python -m spacy evaluate training_trf/model-best corpus/${vars.test}.spacy --output training_trf/metrics.json --code scripts/custom_functions.py --gpu-id 0"
    deps:
      - "corpus/${vars.test}.spacy"
      - "training_trf/model-best"
    outputs:
      - "training_trf/metrics.json"
  # Evaluate the test-config transformer model (GPU)
  - name: "evaluate-trf-test"
    help: "Evaluate the custom PyTorch model and export metrics"
    script:
      - "python -m spacy evaluate training_trf_test/model-best corpus/${vars.test}.spacy --output training_trf_test/metrics.json --code scripts/custom_functions.py --gpu-id 0"
    deps:
      - "corpus/${vars.test}.spacy"
      - "training_trf_test/model-best"
    outputs:
      - "training_trf_test/metrics.json"
  # Build an installable package from the base model
  - name: package
    help: "Package the trained model so it can be installed"
    script:
      - "python -m spacy package training/model-best packages --name ${vars.name} --version ${vars.version} --force --code scripts/custom_functions.py"
    deps:
      - "training/model-best"
    outputs_no_cache:
      - "packages/en_${vars.name}-${vars.version}/dist/en_${vars.name}-${vars.version}.tar.gz"
  # Build an installable package from the transformer model
  - name: package-trf
    help: "Package the trained model so it can be installed"
    script:
      - "python -m spacy package training_trf/model-best packages --name ${vars.name} --version ${vars.version} --force --code scripts/torch_ner_model.py,scripts/torch_ner_pipe.py"
    deps:
      - "training_trf/model-best"
    outputs_no_cache:
      # NOTE(review): uses the same ${vars.name}/${vars.version} path as
      # "package" — building both will overwrite the same archive; verify
      # this is intended
      - "packages/en_${vars.name}-${vars.version}/dist/en_${vars.name}-${vars.version}.tar.gz"
- name: visualize-model
help: Visualize the model's output interactively using Streamlit
script:
- "streamlit run scripts/visualize_model.py training_trf/model-best \"The patient had surgery.\""
deps:
- "scripts/visualize_model.py"
- "training/model-best"
- name: annotate
help: Run the custom prodigy recipe to anonymize data for the annotator and update the PyTorch NER model
script:
- "prodigy ner.correct.anonymous ${vars.prodigy.dataset} training/model-best ${vars.prodigy.source} --text-analytics-key ${vars.azure.text_analytics_key} --text-analytics-base-url ${vars.azure.text_analytics_base_url} --label {vars.prodigy.labels} --update -F scripts/prodigy/recipes.py"
deps:
- "scripts/prodigy/recipes.py"
- "training/model-best"