karl2990 committed on
Commit
f2e3a05
1 Parent(s): c245c17

Update spaCy pipeline

Browse files
Files changed (10) hide show
  1. README.md +22 -23
  2. config.cfg +44 -50
  3. en_med12_trf-any-py3-none-any.whl +2 -2
  4. meta.json +65 -71
  5. ner/model +0 -0
  6. ner/moves +1 -1
  7. textcat/cfg +2 -2
  8. textcat/model +1 -1
  9. transformer/model +1 -1
  10. vocab/strings.json +0 -0
README.md CHANGED
@@ -14,25 +14,25 @@ model-index:
14
  metrics:
15
  - name: NER Precision
16
  type: precision
17
- value: 0.8630460449
18
  - name: NER Recall
19
  type: recall
20
- value: 0.8640661939
21
  - name: NER F Score
22
  type: f_score
23
- value: 0.8635558181
24
  ---
25
  | Feature | Description |
26
  | --- | --- |
27
  | **Name** | `en_med12_trf` |
28
  | **Version** | `1` |
29
  | **spaCy** | `>=3.4.1,<3.5.0` |
30
- | **Default Pipeline** | `tok2vec`, `transformer`, `ner`, `textcat` |
31
- | **Components** | `tok2vec`, `transformer`, `ner`, `textcat` |
32
  | **Vectors** | 0 keys, 0 unique vectors (0 dimensions) |
33
  | **Sources** | n/a |
34
- | **License** | APACHE LICENSE, VERSION 2.0 |
35
- | **Author** | Karl Renius |
36
 
37
  ### Label Scheme
38
 
@@ -43,7 +43,7 @@ model-index:
43
  | Component | Labels |
44
  | --- | --- |
45
  | **`ner`** | `Denominator_Unit`, `Denominator_Value`, `Dose_Form`, `Medication_Name`, `NDC`, `Numerator_Unit`, `Numerator_Value`, `Product_Package_Type`, `Product_Package_Type_Value`, `Quantity_Factor_Unit`, `Quantity_Factor_Unit_Value`, `Quantity_Factor_Value` |
46
- | **`textcat`** | `MEDICATION`, `OTHER` |
47
 
48
  </details>
49
 
@@ -51,19 +51,18 @@ model-index:
51
 
52
  | Type | Score |
53
  | --- | --- |
54
- | `ENTS_F` | 86.36 |
55
- | `ENTS_P` | 86.30 |
56
- | `ENTS_R` | 86.41 |
57
- | `CATS_SCORE` | 96.85 |
58
- | `CATS_MICRO_P` | 93.61 |
59
- | `CATS_MICRO_R` | 99.64 |
60
- | `CATS_MICRO_F` | 96.53 |
61
- | `CATS_MACRO_P` | 94.24 |
62
- | `CATS_MACRO_R` | 99.61 |
63
- | `CATS_MACRO_F` | 96.85 |
64
- | `CATS_MACRO_AUC` | 99.68 |
65
  | `CATS_MACRO_AUC_PER_TYPE` | 0.00 |
66
- | `TOK2VEC_LOSS` | 0.00 |
67
- | `TRANSFORMER_LOSS` | 131016.45 |
68
- | `NER_LOSS` | 28078.22 |
69
- | `TEXTCAT_LOSS` | 1261.44 |
 
14
  metrics:
15
  - name: NER Precision
16
  type: precision
17
+ value: 0.8398220245
18
  - name: NER Recall
19
  type: recall
20
+ value: 0.8445190157
21
  - name: NER F Score
22
  type: f_score
23
+ value: 0.842163971
24
  ---
25
  | Feature | Description |
26
  | --- | --- |
27
  | **Name** | `en_med12_trf` |
28
  | **Version** | `1` |
29
  | **spaCy** | `>=3.4.1,<3.5.0` |
30
+ | **Default Pipeline** | `transformer`, `ner`, `textcat` |
31
+ | **Components** | `transformer`, `ner`, `textcat` |
32
  | **Vectors** | 0 keys, 0 unique vectors (0 dimensions) |
33
  | **Sources** | n/a |
34
+ | **License** | n/a |
35
+ | **Author** | [n/a]() |
36
 
37
  ### Label Scheme
38
 
 
43
  | Component | Labels |
44
  | --- | --- |
45
  | **`ner`** | `Denominator_Unit`, `Denominator_Value`, `Dose_Form`, `Medication_Name`, `NDC`, `Numerator_Unit`, `Numerator_Value`, `Product_Package_Type`, `Product_Package_Type_Value`, `Quantity_Factor_Unit`, `Quantity_Factor_Unit_Value`, `Quantity_Factor_Value` |
46
+ | **`textcat`** | `OTHER`, `MEDICATION` |
47
 
48
  </details>
49
 
 
51
 
52
  | Type | Score |
53
  | --- | --- |
54
+ | `ENTS_F` | 84.22 |
55
+ | `ENTS_P` | 83.98 |
56
+ | `ENTS_R` | 84.45 |
57
+ | `CATS_SCORE` | 93.88 |
58
+ | `CATS_MICRO_P` | 89.78 |
59
+ | `CATS_MICRO_R` | 97.98 |
60
+ | `CATS_MICRO_F` | 93.70 |
61
+ | `CATS_MACRO_P` | 90.29 |
62
+ | `CATS_MACRO_R` | 97.93 |
63
+ | `CATS_MACRO_F` | 93.88 |
64
+ | `CATS_MACRO_AUC` | 98.53 |
65
  | `CATS_MACRO_AUC_PER_TYPE` | 0.00 |
66
+ | `TRANSFORMER_LOSS` | 152780.09 |
67
+ | `NER_LOSS` | 69513.43 |
68
+ | `TEXTCAT_LOSS` | 1868.30 |
 
config.cfg CHANGED
@@ -1,17 +1,17 @@
1
  [paths]
2
- train = null
3
- dev = null
4
  vectors = null
5
  init_tok2vec = null
6
 
7
  [system]
8
- gpu_allocator = null
9
- seed = 0
10
 
11
  [nlp]
12
  lang = "en"
13
- pipeline = ["tok2vec","transformer","ner","textcat"]
14
- batch_size = 128
15
  disabled = []
16
  before_creation = null
17
  after_creation = null
@@ -64,26 +64,6 @@ grad_factor = 1.0
64
  pooling = {"@layers":"reduce_mean.v1"}
65
  upstream = "*"
66
 
67
- [components.tok2vec]
68
- factory = "tok2vec"
69
-
70
- [components.tok2vec.model]
71
- @architectures = "spacy.Tok2Vec.v2"
72
-
73
- [components.tok2vec.model.embed]
74
- @architectures = "spacy.MultiHashEmbed.v2"
75
- width = ${components.tok2vec.model.encode.width}
76
- attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
77
- rows = [5000,2500,2500,2500]
78
- include_static_vectors = false
79
-
80
- [components.tok2vec.model.encode]
81
- @architectures = "spacy.MaxoutWindowEncoder.v2"
82
- width = 96
83
- depth = 4
84
- window_size = 1
85
- maxout_pieces = 3
86
-
87
  [components.transformer]
88
  factory = "transformer"
89
  max_batch_items = 4096
@@ -107,27 +87,22 @@ use_fast = true
107
  [components.transformer.model.transformer_config]
108
 
109
  [corpora]
110
- @readers = "prodigy.MergedCorpus.v1"
111
- eval_split = 0.2
112
- sample_size = 1.0
113
- textcat_multilabel = null
114
- parser = null
115
- tagger = null
116
- senter = null
117
- spancat = null
118
-
119
- [corpora.ner]
120
- @readers = "prodigy.NERCorpus.v1"
121
- datasets = ["real_world_meds"]
122
- eval_datasets = []
123
- default_fill = "outside"
124
- incorrect_key = "incorrect_spans"
125
-
126
- [corpora.textcat]
127
- @readers = "prodigy.TextCatCorpus.v1"
128
- datasets = ["db-labeled"]
129
- eval_datasets = []
130
- exclusive = true
131
 
132
  [training]
133
  accumulate_gradient = 3
@@ -135,7 +110,7 @@ dev_corpus = "corpora.dev"
135
  train_corpus = "corpora.train"
136
  seed = ${system.seed}
137
  gpu_allocator = ${system.gpu_allocator}
138
- dropout = 0.1
139
  patience = 1600
140
  max_epochs = 0
141
  max_steps = 20000
@@ -152,8 +127,13 @@ buffer = 256
152
  get_length = null
153
 
154
  [training.logger]
155
- @loggers = "prodigy.ConsoleLogger.v1"
156
- progress_bar = false
 
 
 
 
 
157
 
158
  [training.optimizer]
159
  @optimizers = "Adam.v1"
@@ -200,4 +180,18 @@ after_init = null
200
 
201
  [initialize.components]
202
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
  [initialize.tokenizer]
 
1
  [paths]
2
+ train = "100_percent.spacy"
3
+ dev = "rw_test.spacy"
4
  vectors = null
5
  init_tok2vec = null
6
 
7
  [system]
8
+ gpu_allocator = "pytorch"
9
+ seed = 42
10
 
11
  [nlp]
12
  lang = "en"
13
+ pipeline = ["transformer","ner","textcat"]
14
+ batch_size = 256
15
  disabled = []
16
  before_creation = null
17
  after_creation = null
 
64
  pooling = {"@layers":"reduce_mean.v1"}
65
  upstream = "*"
66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  [components.transformer]
68
  factory = "transformer"
69
  max_batch_items = 4096
 
87
  [components.transformer.model.transformer_config]
88
 
89
  [corpora]
90
+
91
+ [corpora.dev]
92
+ @readers = "spacy.Corpus.v1"
93
+ path = ${paths.dev}
94
+ max_length = 0
95
+ gold_preproc = false
96
+ limit = 0
97
+ augmenter = null
98
+
99
+ [corpora.train]
100
+ @readers = "spacy.Corpus.v1"
101
+ path = ${paths.train}
102
+ max_length = 0
103
+ gold_preproc = false
104
+ limit = 0
105
+ augmenter = null
 
 
 
 
 
106
 
107
  [training]
108
  accumulate_gradient = 3
 
110
  train_corpus = "corpora.train"
111
  seed = ${system.seed}
112
  gpu_allocator = ${system.gpu_allocator}
113
+ dropout = 0.05
114
  patience = 1600
115
  max_epochs = 0
116
  max_steps = 20000
 
127
  get_length = null
128
 
129
  [training.logger]
130
+ @loggers = "spacy.WandbLogger.v3"
131
+ project_name = "med12"
132
+ remove_config_values = ["paths.train","paths.dev","corpora.train.path","corpora.dev.path"]
133
+ log_dataset_dir = "./corpus"
134
+ model_log_interval = 1000
135
+ entity = null
136
+ run_name = null
137
 
138
  [training.optimizer]
139
  @optimizers = "Adam.v1"
 
180
 
181
  [initialize.components]
182
 
183
+ [initialize.components.ner]
184
+
185
+ [initialize.components.ner.labels]
186
+ @readers = "spacy.read_labels.v1"
187
+ path = "C:\\Users\\karl2\\Documents\\medication_parsing\\rxnorm_model_development\\models\\roberta_weak_rw\\labels\\ner.json"
188
+ require = false
189
+
190
+ [initialize.components.textcat]
191
+
192
+ [initialize.components.textcat.labels]
193
+ @readers = "spacy.read_labels.v1"
194
+ path = "C:\\Users\\karl2\\Documents\\medication_parsing\\rxnorm_model_development\\models\\roberta_weak_rw\\labels\\textcat.json"
195
+ require = false
196
+
197
  [initialize.tokenizer]
en_med12_trf-any-py3-none-any.whl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cbdc0c18c39ecfea3ef8dfe9194a8d9f0fcd6ec471d170d914fe682c05e0ed2a
3
- size 460233712
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e0d7e75743984013d22e7c555d824fd77b38f46e6f851bd031695e14a933a34
3
+ size 454351979
meta.json CHANGED
@@ -16,9 +16,6 @@
16
  "name":null
17
  },
18
  "labels":{
19
- "tok2vec":[
20
-
21
- ],
22
  "transformer":[
23
 
24
  ],
@@ -37,18 +34,16 @@
37
  "Quantity_Factor_Value"
38
  ],
39
  "textcat":[
40
- "MEDICATION",
41
- "OTHER"
42
  ]
43
  },
44
  "pipeline":[
45
- "tok2vec",
46
  "transformer",
47
  "ner",
48
  "textcat"
49
  ],
50
  "components":[
51
- "tok2vec",
52
  "transformer",
53
  "ner",
54
  "textcat"
@@ -57,92 +52,91 @@
57
 
58
  ],
59
  "performance":{
60
- "ents_f":0.8635558181,
61
- "ents_p":0.8630460449,
62
- "ents_r":0.8640661939,
63
  "ents_per_type":{
64
  "Medication_Name":{
65
- "p":0.8534482759,
66
- "r":0.8497854077,
67
- "f":0.8516129032
68
- },
69
- "Numerator_Value":{
70
- "p":0.9416666667,
71
- "r":0.9826086957,
72
- "f":0.9617021277
73
- },
74
- "Numerator_Unit":{
75
- "p":0.9230769231,
76
- "r":0.9391304348,
77
- "f":0.9310344828
78
  },
79
  "Dose_Form":{
80
- "p":0.8113207547,
81
- "r":0.8322580645,
82
- "f":0.821656051
83
  },
84
  "Quantity_Factor_Value":{
85
- "p":0.8064516129,
86
- "r":0.7575757576,
87
- "f":0.78125
 
 
 
 
 
88
  },
89
  "Quantity_Factor_Unit":{
90
- "p":0.8928571429,
91
- "r":0.8333333333,
92
- "f":0.8620689655
93
  },
94
  "Product_Package_Type":{
95
- "p":0.8717948718,
96
- "r":0.7727272727,
97
- "f":0.8192771084
98
  },
99
- "Product_Package_Type_Value":{
100
- "p":0.8571428571,
101
- "r":0.8571428571,
102
- "f":0.8571428571
103
- },
104
- "Quantity_Factor_Unit_Value":{
105
- "p":0.9090909091,
106
- "r":0.7547169811,
107
- "f":0.824742268
108
  },
109
- "Denominator_Value":{
110
- "p":0.4545454545,
111
- "r":1.0,
112
- "f":0.625
113
  },
114
  "Denominator_Unit":{
115
- "p":0.6470588235,
116
- "r":0.9166666667,
117
- "f":0.7586206897
 
 
 
 
 
 
 
 
 
 
118
  }
119
  },
120
- "cats_score":0.9684618658,
121
  "cats_score_desc":"macro F",
122
- "cats_micro_p":0.9360913862,
123
- "cats_micro_r":0.9963851462,
124
- "cats_micro_f":0.9652976759,
125
- "cats_macro_p":0.9423963976,
126
- "cats_macro_r":0.9961496466,
127
- "cats_macro_f":0.9684618658,
128
- "cats_macro_auc":0.9968290074,
129
  "cats_f_per_type":{
130
- "MEDICATION":{
131
- "p":0.9273913043,
132
- "r":0.996728972,
133
- "f":0.9608108108
134
- },
135
  "OTHER":{
136
- "p":0.9574014909,
137
- "r":0.9955703212,
138
- "f":0.9761129207
 
 
 
 
 
139
  }
140
  },
141
  "cats_macro_auc_per_type":0.0,
142
- "tok2vec_loss":0.0,
143
- "transformer_loss":1310.1644586238,
144
- "ner_loss":280.7821546286,
145
- "textcat_loss":12.6144110119
146
  },
147
  "requirements":[
148
  "spacy-transformers>=1.1.7,<1.2.0"
 
16
  "name":null
17
  },
18
  "labels":{
 
 
 
19
  "transformer":[
20
 
21
  ],
 
34
  "Quantity_Factor_Value"
35
  ],
36
  "textcat":[
37
+ "OTHER",
38
+ "MEDICATION"
39
  ]
40
  },
41
  "pipeline":[
 
42
  "transformer",
43
  "ner",
44
  "textcat"
45
  ],
46
  "components":[
 
47
  "transformer",
48
  "ner",
49
  "textcat"
 
52
 
53
  ],
54
  "performance":{
55
+ "ents_f":0.842163971,
56
+ "ents_p":0.8398220245,
57
+ "ents_r":0.8445190157,
58
  "ents_per_type":{
59
  "Medication_Name":{
60
+ "p":0.8181818182,
61
+ "r":0.8247011952,
62
+ "f":0.8214285714
 
 
 
 
 
 
 
 
 
 
63
  },
64
  "Dose_Form":{
65
+ "p":0.7718120805,
66
+ "r":0.7467532468,
67
+ "f":0.7590759076
68
  },
69
  "Quantity_Factor_Value":{
70
+ "p":0.8421052632,
71
+ "r":0.7111111111,
72
+ "f":0.7710843373
73
+ },
74
+ "Quantity_Factor_Unit_Value":{
75
+ "p":0.8,
76
+ "r":0.8181818182,
77
+ "f":0.808988764
78
  },
79
  "Quantity_Factor_Unit":{
80
+ "p":0.9166666667,
81
+ "r":0.8148148148,
82
+ "f":0.862745098
83
  },
84
  "Product_Package_Type":{
85
+ "p":0.8055555556,
86
+ "r":0.6744186047,
87
+ "f":0.7341772152
88
  },
89
+ "Numerator_Value":{
90
+ "p":0.9047619048,
91
+ "r":0.9851851852,
92
+ "f":0.9432624113
 
 
 
 
 
93
  },
94
+ "Numerator_Unit":{
95
+ "p":0.928057554,
96
+ "r":0.9772727273,
97
+ "f":0.9520295203
98
  },
99
  "Denominator_Unit":{
100
+ "p":0.7222222222,
101
+ "r":0.8666666667,
102
+ "f":0.7878787879
103
+ },
104
+ "Product_Package_Type_Value":{
105
+ "p":0.75,
106
+ "r":0.9375,
107
+ "f":0.8333333333
108
+ },
109
+ "Denominator_Value":{
110
+ "p":0.3333333333,
111
+ "r":0.4,
112
+ "f":0.3636363636
113
  }
114
  },
115
+ "cats_score":0.93877706,
116
  "cats_score_desc":"macro F",
117
+ "cats_micro_p":0.8978046934,
118
+ "cats_micro_r":0.9797604296,
119
+ "cats_micro_f":0.9369938771,
120
+ "cats_macro_p":0.9029397761,
121
+ "cats_macro_r":0.9792967771,
122
+ "cats_macro_f":0.93877706,
123
+ "cats_macro_auc":0.9853349088,
124
  "cats_f_per_type":{
 
 
 
 
 
125
  "OTHER":{
126
+ "p":0.9478632479,
127
+ "r":0.9711033275,
128
+ "f":0.9593425606
129
+ },
130
+ "MEDICATION":{
131
+ "p":0.8580163043,
132
+ "r":0.9874902267,
133
+ "f":0.9182115594
134
  }
135
  },
136
  "cats_macro_auc_per_type":0.0,
137
+ "transformer_loss":1527.8009473217,
138
+ "ner_loss":695.1342882158,
139
+ "textcat_loss":18.6830486141
 
140
  },
141
  "requirements":[
142
  "spacy-transformers>=1.1.7,<1.2.0"
ner/model CHANGED
Binary files a/ner/model and b/ner/model differ
 
ner/moves CHANGED
@@ -1 +1 @@
1
- ��moves��{"0":{},"1":{"Dose_Form":68280,"Medication_Name":66363,"Numerator_Value":36632,"Numerator_Unit":36387,"NDC":29380,"Denominator_Unit":15956,"Quantity_Factor_Unit":10336,"Product_Package_Type":10272,"Quantity_Factor_Value":10061,"Product_Package_Type_Value":9617,"Quantity_Factor_Unit_Value":3388,"Denominator_Value":653},"2":{"Dose_Form":68280,"Medication_Name":66363,"Numerator_Value":36632,"Numerator_Unit":36387,"NDC":29380,"Denominator_Unit":15956,"Quantity_Factor_Unit":10336,"Product_Package_Type":10272,"Quantity_Factor_Value":10061,"Product_Package_Type_Value":9617,"Quantity_Factor_Unit_Value":3388,"Denominator_Value":653},"3":{"Dose_Form":68280,"Medication_Name":66363,"Numerator_Value":36632,"Numerator_Unit":36387,"NDC":29380,"Denominator_Unit":15956,"Quantity_Factor_Unit":10336,"Product_Package_Type":10272,"Quantity_Factor_Value":10061,"Product_Package_Type_Value":9617,"Quantity_Factor_Unit_Value":3388,"Denominator_Value":653},"4":{"Dose_Form":68280,"Medication_Name":66363,"Numerator_Value":36632,"Numerator_Unit":36387,"NDC":29380,"Denominator_Unit":15956,"Quantity_Factor_Unit":10336,"Product_Package_Type":10272,"Quantity_Factor_Value":10061,"Product_Package_Type_Value":9617,"Quantity_Factor_Unit_Value":3388,"Denominator_Value":653,"":1},"5":{"":1}}�cfg��neg_key�
 
1
+ ��moves��{"0":{},"1":{"Dose_Form":68500,"Medication_Name":66539,"Numerator_Value":36769,"Numerator_Unit":36524,"NDC":30140,"Denominator_Unit":16207,"Product_Package_Type":10669,"Quantity_Factor_Unit":10581,"Quantity_Factor_Value":10160,"Product_Package_Type_Value":9959,"Quantity_Factor_Unit_Value":3526,"Denominator_Value":685},"2":{"Dose_Form":68500,"Medication_Name":66539,"Numerator_Value":36769,"Numerator_Unit":36524,"NDC":30140,"Denominator_Unit":16207,"Product_Package_Type":10669,"Quantity_Factor_Unit":10581,"Quantity_Factor_Value":10160,"Product_Package_Type_Value":9959,"Quantity_Factor_Unit_Value":3526,"Denominator_Value":685},"3":{"Dose_Form":68500,"Medication_Name":66539,"Numerator_Value":36769,"Numerator_Unit":36524,"NDC":30140,"Denominator_Unit":16207,"Product_Package_Type":10669,"Quantity_Factor_Unit":10581,"Quantity_Factor_Value":10160,"Product_Package_Type_Value":9959,"Quantity_Factor_Unit_Value":3526,"Denominator_Value":685},"4":{"Dose_Form":68500,"Medication_Name":66539,"Numerator_Value":36769,"Numerator_Unit":36524,"NDC":30140,"Denominator_Unit":16207,"Product_Package_Type":10669,"Quantity_Factor_Unit":10581,"Quantity_Factor_Value":10160,"Product_Package_Type_Value":9959,"Quantity_Factor_Unit_Value":3526,"Denominator_Value":685,"":1},"5":{"":1}}�cfg��neg_key�
textcat/cfg CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "labels":[
3
- "MEDICATION",
4
- "OTHER"
5
  ],
6
  "threshold":0.5,
7
  "positive_label":null
 
1
  {
2
  "labels":[
3
+ "OTHER",
4
+ "MEDICATION"
5
  ],
6
  "threshold":0.5,
7
  "positive_label":null
textcat/model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4f483561606ab536a2921d5c0d71ed741f06ecaaa8408e2f02a8af50a7058ce7
3
  size 9202498
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3412ff82cfd7317a5373da70b8f0ac507e6f41b0862c15fabdd8124c2934ec1c
3
  size 9202498
transformer/model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d37cbcb7a5e658fa6a3e8f4e5a4c9d04ae623faf1a9033df2c6c40db5cef26fe
3
  size 502030732
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:54558ded63daa3c035d9e37f5c483cbea285bb518f2601c73e05f5908a6016ac
3
  size 502030732
vocab/strings.json CHANGED
The diff for this file is too large to render. See raw diff