Spaces:
Runtime error
Runtime error
# Human-readable project title and a description (the description uses
# Markdown link syntax).
title: 'Detecting entities in Medical Records with PyTorch'
description: >-
  This project uses the
  [i2b2 (n2c2) 2011 Challenge Dataset](https://portal.dbmi.hms.harvard.edu/projects/n2c2-nlp/)
  to bootstrap a PyTorch NER model to detect entities in Medical Records.
  It also demonstrates how to anonymize medical records for annotators in
  [Prodigy](https://prodi.gy).
# Shared values. Reference them anywhere in this file as ${vars.var_name};
# nested entries are addressed with dots, e.g. ${vars.prodigy.dataset}.
vars:
  # Archive file names of the challenge data, expected under assets/n2c2_2011/.
  beth_train_tarfile: 'i2b2_Beth_Train_Release.tar.gz'
  partners_train_tarfile: 'i2b2_Partners_Train_Release.tar.gz'
  test_zipfile: 'Task_1C.zip'
  # Config file names. The train commands look up config, config_trf,
  # config_trf_resume and config_trf_test under configs/.
  spacy_config: 'spacy_config.cfg'
  config: 'config.cfg'
  config_trf: 'config_trf.cfg'
  config_trf_resume: 'config_trf_resume.cfg'
  config_trf_test: 'config_trf_test.cfg'
  # Name and version passed to "spacy package".
  name: 'ner_pytorch_medical'
  version: '0.0.0'
  # Base names of the corpus splits (corpus/<split>.spacy).
  train: 'train'
  dev: 'dev'
  test: 'test'
  # Arguments for the "annotate" command's Prodigy recipe.
  prodigy:
    dataset: 'pytorch_ner_medical_correct_anonymous'
    source: 'assets/mock_notes.jsonl'
    labels: 'person,problem,pronoun,test,treatment'
  # Credentials and endpoint forwarded to the "annotate" recipe.
  # The key below is a placeholder; supply your own and avoid committing it.
  azure:
    text_analytics_key: 'YOUR_API_KEY'
    text_analytics_base_url: 'https://westus2.api.cognitive.microsoft.com/'
# Directories the project relies on. The project CLI makes sure each of them
# exists.
directories:
- 'assets'
- 'training'
- 'configs'
- 'scripts'
- 'corpus'
- 'packages'
# Files that must be downloaded or already present at the given destination.
# Swap these for your own input data if needed.
assets:
- dest: 'assets/n2c2_2011/${vars.beth_train_tarfile}'
  description: 'Tarfile containing original challenge data from the Beth training data split'
- dest: 'assets/n2c2_2011/${vars.partners_train_tarfile}'
  description: 'Tarfile containing original challenge data from the Partners training data split'
- dest: 'assets/n2c2_2011/${vars.test_zipfile}'
  description: 'Zipfile containing original challenge test data'
- dest: 'assets/mock_notes.jsonl'
  description: 'JSONL file with raw mock notes to annotate in prodigy'
# A workflow is an ordered list of commands (defined under "commands").
# Run one with "spacy project run [workflow]". A command whose inputs/outputs
# haven't changed is not re-run.
workflows:
  all:
  - 'preprocess'
  - 'train'
  - 'evaluate'
# Project commands, written in a style similar to CI config files (e.g. Azure
# pipelines). "name" is what you pass to "spacy project run [command] [path]".
# "help" is optional and is printed by
# "spacy project run [optional command] [path] --help".
commands:
# Reads the challenge archives from assets/n2c2_2011 and writes the three
# corpus splits into corpus/.
- name: 'preprocess'
  help: "Convert the data to spaCy's binary format"
  script:
  - 'python scripts/preprocess.py assets/n2c2_2011 corpus'
  deps:
  - 'assets/n2c2_2011/${vars.beth_train_tarfile}'
  - 'assets/n2c2_2011/${vars.partners_train_tarfile}'
  - 'assets/n2c2_2011/${vars.test_zipfile}'
  - 'scripts/preprocess.py'
  outputs:
  - 'corpus/${vars.train}.spacy'
  - 'corpus/${vars.dev}.spacy'
  - 'corpus/${vars.test}.spacy'
# Trains with configs/${vars.config} on GPU 0 and writes to training/.
# The folded scalar below is joined into a single command line.
- name: 'train'
  help: 'Train a custom PyTorch named entity recognition model'
  script:
  - >-
    python -m spacy train configs/${vars.config}
    --output training/
    --paths.train corpus/${vars.train}.spacy
    --paths.dev corpus/${vars.dev}.spacy
    --code scripts/custom_functions.py
    --gpu-id 0
  deps:
  - 'corpus/${vars.train}.spacy'
  - 'corpus/${vars.dev}.spacy'
  outputs:
  - 'training/model-best'
# Trains with configs/${vars.config_trf} on GPU 0 and writes to training_trf/.
# The folded scalar below is joined into a single command line.
- name: 'train-trf'
  help: 'Train a custom PyTorch named entity recognition model with transformer'
  script:
  - >-
    python -m spacy train configs/${vars.config_trf}
    --output training_trf/
    --paths.train corpus/${vars.train}.spacy
    --paths.dev corpus/${vars.dev}.spacy
    --code scripts/custom_functions.py
    --gpu-id 0
  deps:
  - 'corpus/${vars.train}.spacy'
  - 'corpus/${vars.dev}.spacy'
  outputs:
  - 'training_trf/model-best'
# Trains with configs/${vars.config_trf_test} on GPU 0 and writes to
# training_trf_test/. The declared output must live in that same directory:
# the "evaluate-trf-test" command depends on training_trf_test/model-best.
# The folded scalar below is joined into a single command line.
- name: 'train-trf-test'
  help: 'Train a custom PyTorch named entity recognition model with transformer'
  script:
  - >-
    python -m spacy train configs/${vars.config_trf_test}
    --output training_trf_test/
    --paths.train corpus/${vars.train}.spacy
    --paths.dev corpus/${vars.dev}.spacy
    --code scripts/custom_functions.py
    --gpu-id 0
  deps:
  - 'corpus/${vars.train}.spacy'
  - 'corpus/${vars.dev}.spacy'
  outputs:
  - 'training_trf_test/model-best'
# Trains with configs/${vars.config_trf_resume} on GPU 0. It writes to
# training_trf/, the same output directory as "train-trf".
# NOTE(review): presumably the resume config points at an earlier
# training_trf checkpoint; confirm in configs/config_trf_resume.cfg.
# The folded scalar below is joined into a single command line.
- name: 'train-trf-resume'
  help: 'Train a custom PyTorch named entity recognition model with transformer'
  script:
  - >-
    python -m spacy train configs/${vars.config_trf_resume}
    --output training_trf/
    --paths.train corpus/${vars.train}.spacy
    --paths.dev corpus/${vars.dev}.spacy
    --code scripts/custom_functions.py
    --gpu-id 0
  deps:
  - 'corpus/${vars.train}.spacy'
  - 'corpus/${vars.dev}.spacy'
  outputs:
  - 'training_trf/model-best'
# Scores training/model-best on the test split and writes the metrics to
# training/metrics.json.
# The folded scalar below is joined into a single command line.
- name: 'evaluate'
  help: 'Evaluate the custom PyTorch model and export metrics'
  script:
  - >-
    python -m spacy evaluate training/model-best corpus/${vars.test}.spacy
    --output training/metrics.json
    --code scripts/custom_functions.py
  deps:
  - 'corpus/${vars.test}.spacy'
  - 'training/model-best'
  outputs:
  - 'training/metrics.json'
# Scores training_trf/model-best on the test split (GPU 0) and writes the
# metrics to training_trf/metrics.json.
# The folded scalar below is joined into a single command line.
- name: 'evaluate-trf'
  help: 'Evaluate the custom PyTorch model and export metrics'
  script:
  - >-
    python -m spacy evaluate training_trf/model-best corpus/${vars.test}.spacy
    --output training_trf/metrics.json
    --code scripts/custom_functions.py
    --gpu-id 0
  deps:
  - 'corpus/${vars.test}.spacy'
  - 'training_trf/model-best'
  outputs:
  - 'training_trf/metrics.json'
# Scores training_trf_test/model-best on the test split (GPU 0) and writes the
# metrics to training_trf_test/metrics.json.
# The folded scalar below is joined into a single command line.
- name: 'evaluate-trf-test'
  help: 'Evaluate the custom PyTorch model and export metrics'
  script:
  - >-
    python -m spacy evaluate training_trf_test/model-best corpus/${vars.test}.spacy
    --output training_trf_test/metrics.json
    --code scripts/custom_functions.py
    --gpu-id 0
  deps:
  - 'corpus/${vars.test}.spacy'
  - 'training_trf_test/model-best'
  outputs:
  - 'training_trf_test/metrics.json'
# Packages training/model-best into packages/ using ${vars.name} and
# ${vars.version}; --force is passed to "spacy package".
# The folded scalar below is joined into a single command line.
- name: 'package'
  help: 'Package the trained model so it can be installed'
  script:
  - >-
    python -m spacy package training/model-best packages
    --name ${vars.name}
    --version ${vars.version}
    --force
    --code scripts/custom_functions.py
  deps:
  - 'training/model-best'
  outputs_no_cache:
  - 'packages/en_${vars.name}-${vars.version}/dist/en_${vars.name}-${vars.version}.tar.gz'
# Packages training_trf/model-best into packages/ using ${vars.name} and
# ${vars.version}; --force is passed to "spacy package".
# NOTE(review): this command declares the same output path as "package", and
# passes different --code files than the train/evaluate commands
# (scripts/custom_functions.py); confirm both are intended.
# The folded scalar below is joined into a single command line.
- name: 'package-trf'
  help: 'Package the trained model so it can be installed'
  script:
  - >-
    python -m spacy package training_trf/model-best packages
    --name ${vars.name}
    --version ${vars.version}
    --force
    --code scripts/torch_ner_model.py,scripts/torch_ner_pipe.py
  deps:
  - 'training_trf/model-best'
  outputs_no_cache:
  - 'packages/en_${vars.name}-${vars.version}/dist/en_${vars.name}-${vars.version}.tar.gz'
# Launches the Streamlit visualizer on training_trf/model-best with a sample
# sentence. The dependency list names the same model directory the script
# loads, so the declared inputs match what the command actually reads.
- name: 'visualize-model'
  help: "Visualize the model's output interactively using Streamlit"
  script:
  - 'streamlit run scripts/visualize_model.py training_trf/model-best "The patient had surgery."'
  deps:
  - 'scripts/visualize_model.py'
  - 'training_trf/model-best'
# Runs the custom recipe from scripts/prodigy/recipes.py against
# training/model-best, passing the dataset, source, labels and Azure Text
# Analytics settings from "vars".
# Every placeholder must use the ${vars...} form; a bare {vars...} is not
# interpolated and would be passed to the recipe literally.
# The folded scalar below is joined into a single command line.
- name: 'annotate'
  help: 'Run the custom prodigy recipe to anonymize data for the annotator and update the PyTorch NER model'
  script:
  - >-
    prodigy ner.correct.anonymous ${vars.prodigy.dataset}
    training/model-best ${vars.prodigy.source}
    --text-analytics-key ${vars.azure.text_analytics_key}
    --text-analytics-base-url ${vars.azure.text_analytics_base_url}
    --label ${vars.prodigy.labels}
    --update
    -F scripts/prodigy/recipes.py
  deps:
  - 'scripts/prodigy/recipes.py'
  - 'training/model-best'