mhomilius committed on
Commit
030c644
1 Parent(s): 173dd53

first model version

Browse files
README.md ADDED
@@ -0,0 +1,4 @@
 
 
 
 
1
+ # BERT-based deidentification model
2
+
3
+ This repo contains model weights trained on the I2B2 dataset.
4
+ See [OBI EHR deidentification](https://github.com/obi-ds/ehr_deidentification) for more details and instructions on how to get started.
config.json ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "emilyalsentzer/Bio_ClinicalBERT",
3
+ "architectures": [
4
+ "BertForTokenClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "finetuning_task": "ner",
8
+ "gradient_checkpointing": false,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "id2label": {
13
+ "0": "B-AGE",
14
+ "1": "B-DATE",
15
+ "2": "B-EMAIL",
16
+ "3": "B-HOSP",
17
+ "4": "B-ID",
18
+ "5": "B-LOC",
19
+ "6": "B-OTHERPHI",
20
+ "7": "B-PATIENT",
21
+ "8": "B-PATORG",
22
+ "9": "B-PHONE",
23
+ "10": "B-STAFF",
24
+ "11": "I-AGE",
25
+ "12": "I-DATE",
26
+ "13": "I-EMAIL",
27
+ "14": "I-HOSP",
28
+ "15": "I-ID",
29
+ "16": "I-LOC",
30
+ "17": "I-OTHERPHI",
31
+ "18": "I-PATIENT",
32
+ "19": "I-PATORG",
33
+ "20": "I-PHONE",
34
+ "21": "I-STAFF",
35
+ "22": "L-AGE",
36
+ "23": "L-DATE",
37
+ "24": "L-EMAIL",
38
+ "25": "L-HOSP",
39
+ "26": "L-ID",
40
+ "27": "L-LOC",
41
+ "28": "L-OTHERPHI",
42
+ "29": "L-PATIENT",
43
+ "30": "L-PATORG",
44
+ "31": "L-PHONE",
45
+ "32": "L-STAFF",
46
+ "33": "O",
47
+ "34": "U-AGE",
48
+ "35": "U-DATE",
49
+ "36": "U-EMAIL",
50
+ "37": "U-HOSP",
51
+ "38": "U-ID",
52
+ "39": "U-LOC",
53
+ "40": "U-OTHERPHI",
54
+ "41": "U-PATIENT",
55
+ "42": "U-PATORG",
56
+ "43": "U-PHONE",
57
+ "44": "U-STAFF"
58
+ },
59
+ "initializer_range": 0.02,
60
+ "intermediate_size": 3072,
61
+ "label2id": {
62
+ "B-AGE": 0,
63
+ "B-DATE": 1,
64
+ "B-EMAIL": 2,
65
+ "B-HOSP": 3,
66
+ "B-ID": 4,
67
+ "B-LOC": 5,
68
+ "B-OTHERPHI": 6,
69
+ "B-PATIENT": 7,
70
+ "B-PATORG": 8,
71
+ "B-PHONE": 9,
72
+ "B-STAFF": 10,
73
+ "I-AGE": 11,
74
+ "I-DATE": 12,
75
+ "I-EMAIL": 13,
76
+ "I-HOSP": 14,
77
+ "I-ID": 15,
78
+ "I-LOC": 16,
79
+ "I-OTHERPHI": 17,
80
+ "I-PATIENT": 18,
81
+ "I-PATORG": 19,
82
+ "I-PHONE": 20,
83
+ "I-STAFF": 21,
84
+ "L-AGE": 22,
85
+ "L-DATE": 23,
86
+ "L-EMAIL": 24,
87
+ "L-HOSP": 25,
88
+ "L-ID": 26,
89
+ "L-LOC": 27,
90
+ "L-OTHERPHI": 28,
91
+ "L-PATIENT": 29,
92
+ "L-PATORG": 30,
93
+ "L-PHONE": 31,
94
+ "L-STAFF": 32,
95
+ "O": 33,
96
+ "U-AGE": 34,
97
+ "U-DATE": 35,
98
+ "U-EMAIL": 36,
99
+ "U-HOSP": 37,
100
+ "U-ID": 38,
101
+ "U-LOC": 39,
102
+ "U-OTHERPHI": 40,
103
+ "U-PATIENT": 41,
104
+ "U-PATORG": 42,
105
+ "U-PHONE": 43,
106
+ "U-STAFF": 44
107
+ },
108
+ "layer_norm_eps": 1e-12,
109
+ "max_position_embeddings": 512,
110
+ "model_type": "bert",
111
+ "num_attention_heads": 12,
112
+ "num_hidden_layers": 12,
113
+ "pad_token_id": 0,
114
+ "position_embedding_type": "absolute",
115
+ "transformers_version": "4.6.1",
116
+ "type_vocab_size": 2,
117
+ "use_cache": true,
118
+ "vocab_size": 28996
119
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:842d9f0d394c71528bb73f87748c580a9a3ea82973a18e0871fd579e3eb21c6b
3
+ size 431100529
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
1
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
1
+ {"do_lower_case": false, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "special_tokens_map_file": null, "name_or_path": "emilyalsentzer/Bio_ClinicalBERT", "do_basic_tokenize": true, "never_split": null}
vocab.txt ADDED
The diff for this file is too large to render. See raw diff