karl2990 committed on
Commit
ed7247d
1 Parent(s): 4c0267d

Update spaCy pipeline

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  en_med12_trf-any-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
34
  textcat/model filter=lfs diff=lfs merge=lfs -text
35
  transformer/model filter=lfs diff=lfs merge=lfs -text
 
 
33
  en_med12_trf-any-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
34
  textcat/model filter=lfs diff=lfs merge=lfs -text
35
  transformer/model filter=lfs diff=lfs merge=lfs -text
36
+ tok2vec/model filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -14,36 +14,36 @@ model-index:
14
  metrics:
15
  - name: NER Precision
16
  type: precision
17
- value: 0.9987402508
18
  - name: NER Recall
19
  type: recall
20
- value: 0.9991845056
21
  - name: NER F Score
22
  type: f_score
23
- value: 0.9989623288
24
  ---
25
  | Feature | Description |
26
  | --- | --- |
27
  | **Name** | `en_med12_trf` |
28
- | **Version** | `0.0.0` |
29
  | **spaCy** | `>=3.4.1,<3.5.0` |
30
- | **Default Pipeline** | `transformer`, `ner`, `textcat` |
31
- | **Components** | `transformer`, `ner`, `textcat` |
32
  | **Vectors** | 0 keys, 0 unique vectors (0 dimensions) |
33
  | **Sources** | n/a |
34
- | **License** | Apache License, Version 2.0 |
35
- | **Author** | Karl Renius |
36
 
37
  ### Label Scheme
38
 
39
  <details>
40
 
41
- <summary>View label scheme (15 labels for 2 components)</summary>
42
 
43
  | Component | Labels |
44
  | --- | --- |
45
  | **`ner`** | `Denominator_Unit`, `Denominator_Value`, `Dose_Form`, `Medication_Name`, `NDC`, `Numerator_Unit`, `Numerator_Value`, `Product_Package_Type`, `Product_Package_Type_Value`, `Quantity_Factor_Unit`, `Quantity_Factor_Unit_Value`, `Quantity_Factor_Value` |
46
- | **`textcat`** | `MEDICATION`, `OTHER`, `DEVICE` |
47
 
48
  </details>
49
 
@@ -51,18 +51,19 @@ model-index:
51
 
52
  | Type | Score |
53
  | --- | --- |
54
- | `ENTS_F` | 99.90 |
55
- | `ENTS_P` | 99.87 |
56
- | `ENTS_R` | 99.92 |
57
- | `CATS_SCORE` | 100.00 |
58
- | `CATS_MICRO_P` | 100.00 |
59
- | `CATS_MICRO_R` | 100.00 |
60
- | `CATS_MICRO_F` | 100.00 |
61
- | `CATS_MACRO_P` | 100.00 |
62
- | `CATS_MACRO_R` | 100.00 |
63
- | `CATS_MACRO_F` | 100.00 |
64
- | `CATS_MACRO_AUC` | 100.00 |
65
  | `CATS_MACRO_AUC_PER_TYPE` | 0.00 |
66
- | `TRANSFORMER_LOSS` | 19686.79 |
67
- | `NER_LOSS` | 513489.20 |
68
- | `TEXTCAT_LOSS` | 2.08 |
 
 
14
  metrics:
15
  - name: NER Precision
16
  type: precision
17
+ value: 0.8630460449
18
  - name: NER Recall
19
  type: recall
20
+ value: 0.8640661939
21
  - name: NER F Score
22
  type: f_score
23
+ value: 0.8635558181
24
  ---
25
  | Feature | Description |
26
  | --- | --- |
27
  | **Name** | `en_med12_trf` |
28
+ | **Version** | `1` |
29
  | **spaCy** | `>=3.4.1,<3.5.0` |
30
+ | **Default Pipeline** | `tok2vec`, `transformer`, `ner`, `textcat` |
31
+ | **Components** | `tok2vec`, `transformer`, `ner`, `textcat` |
32
  | **Vectors** | 0 keys, 0 unique vectors (0 dimensions) |
33
  | **Sources** | n/a |
34
+ | **License** | n/a |
35
+ | **Author** | [n/a]() |
36
 
37
  ### Label Scheme
38
 
39
  <details>
40
 
41
+ <summary>View label scheme (14 labels for 2 components)</summary>
42
 
43
  | Component | Labels |
44
  | --- | --- |
45
  | **`ner`** | `Denominator_Unit`, `Denominator_Value`, `Dose_Form`, `Medication_Name`, `NDC`, `Numerator_Unit`, `Numerator_Value`, `Product_Package_Type`, `Product_Package_Type_Value`, `Quantity_Factor_Unit`, `Quantity_Factor_Unit_Value`, `Quantity_Factor_Value` |
46
+ | **`textcat`** | `MEDICATION`, `OTHER` |
47
 
48
  </details>
49
 
 
51
 
52
  | Type | Score |
53
  | --- | --- |
54
+ | `ENTS_F` | 86.36 |
55
+ | `ENTS_P` | 86.30 |
56
+ | `ENTS_R` | 86.41 |
57
+ | `CATS_SCORE` | 96.85 |
58
+ | `CATS_MICRO_P` | 93.61 |
59
+ | `CATS_MICRO_R` | 99.64 |
60
+ | `CATS_MICRO_F` | 96.53 |
61
+ | `CATS_MACRO_P` | 94.24 |
62
+ | `CATS_MACRO_R` | 99.61 |
63
+ | `CATS_MACRO_F` | 96.85 |
64
+ | `CATS_MACRO_AUC` | 99.68 |
65
  | `CATS_MACRO_AUC_PER_TYPE` | 0.00 |
66
+ | `TOK2VEC_LOSS` | 0.00 |
67
+ | `TRANSFORMER_LOSS` | 131016.45 |
68
+ | `NER_LOSS` | 28078.22 |
69
+ | `TEXTCAT_LOSS` | 1261.44 |
config.cfg CHANGED
@@ -1,16 +1,16 @@
1
  [paths]
2
- train = "ner_cat_train.spacy"
3
- dev = "ner_cat_test.spacy"
4
  vectors = null
5
  init_tok2vec = null
6
 
7
  [system]
8
- gpu_allocator = "pytorch"
9
  seed = 0
10
 
11
  [nlp]
12
  lang = "en"
13
- pipeline = ["transformer","ner","textcat"]
14
  batch_size = 128
15
  disabled = []
16
  before_creation = null
@@ -64,6 +64,26 @@ grad_factor = 1.0
64
  pooling = {"@layers":"reduce_mean.v1"}
65
  upstream = "*"
66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  [components.transformer]
68
  factory = "transformer"
69
  max_batch_items = 4096
@@ -87,22 +107,27 @@ use_fast = true
87
  [components.transformer.model.transformer_config]
88
 
89
  [corpora]
90
-
91
- [corpora.dev]
92
- @readers = "spacy.Corpus.v1"
93
- path = ${paths.dev}
94
- max_length = 0
95
- gold_preproc = false
96
- limit = 0
97
- augmenter = null
98
-
99
- [corpora.train]
100
- @readers = "spacy.Corpus.v1"
101
- path = ${paths.train}
102
- max_length = 0
103
- gold_preproc = false
104
- limit = 0
105
- augmenter = null
 
 
 
 
 
106
 
107
  [training]
108
  accumulate_gradient = 3
@@ -127,7 +152,7 @@ buffer = 256
127
  get_length = null
128
 
129
  [training.logger]
130
- @loggers = "spacy.ConsoleLogger.v1"
131
  progress_bar = false
132
 
133
  [training.optimizer]
 
1
  [paths]
2
+ train = null
3
+ dev = null
4
  vectors = null
5
  init_tok2vec = null
6
 
7
  [system]
8
+ gpu_allocator = null
9
  seed = 0
10
 
11
  [nlp]
12
  lang = "en"
13
+ pipeline = ["tok2vec","transformer","ner","textcat"]
14
  batch_size = 128
15
  disabled = []
16
  before_creation = null
 
64
  pooling = {"@layers":"reduce_mean.v1"}
65
  upstream = "*"
66
 
67
+ [components.tok2vec]
68
+ factory = "tok2vec"
69
+
70
+ [components.tok2vec.model]
71
+ @architectures = "spacy.Tok2Vec.v2"
72
+
73
+ [components.tok2vec.model.embed]
74
+ @architectures = "spacy.MultiHashEmbed.v2"
75
+ width = ${components.tok2vec.model.encode.width}
76
+ attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
77
+ rows = [5000,2500,2500,2500]
78
+ include_static_vectors = false
79
+
80
+ [components.tok2vec.model.encode]
81
+ @architectures = "spacy.MaxoutWindowEncoder.v2"
82
+ width = 96
83
+ depth = 4
84
+ window_size = 1
85
+ maxout_pieces = 3
86
+
87
  [components.transformer]
88
  factory = "transformer"
89
  max_batch_items = 4096
 
107
  [components.transformer.model.transformer_config]
108
 
109
  [corpora]
110
+ @readers = "prodigy.MergedCorpus.v1"
111
+ eval_split = 0.2
112
+ sample_size = 1.0
113
+ textcat_multilabel = null
114
+ parser = null
115
+ tagger = null
116
+ senter = null
117
+ spancat = null
118
+
119
+ [corpora.ner]
120
+ @readers = "prodigy.NERCorpus.v1"
121
+ datasets = ["real_world_meds"]
122
+ eval_datasets = []
123
+ default_fill = "outside"
124
+ incorrect_key = "incorrect_spans"
125
+
126
+ [corpora.textcat]
127
+ @readers = "prodigy.TextCatCorpus.v1"
128
+ datasets = ["db-labeled"]
129
+ eval_datasets = []
130
+ exclusive = true
131
 
132
  [training]
133
  accumulate_gradient = 3
 
152
  get_length = null
153
 
154
  [training.logger]
155
+ @loggers = "prodigy.ConsoleLogger.v1"
156
  progress_bar = false
157
 
158
  [training.optimizer]
en_med12_trf-any-py3-none-any.whl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:912d4cbccd78fb209542ccfe36bbfae676e05fc585099b62cc0146375877ce82
3
- size 453535042
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cbdc0c18c39ecfea3ef8dfe9194a8d9f0fcd6ec471d170d914fe682c05e0ed2a
3
+ size 460233712
meta.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "lang":"en",
3
  "name":"med12_trf",
4
- "version":"0.0.0",
5
  "description":"",
6
  "author":"",
7
  "email":"",
@@ -16,6 +16,9 @@
16
  "name":null
17
  },
18
  "labels":{
 
 
 
19
  "transformer":[
20
 
21
  ],
@@ -35,16 +38,17 @@
35
  ],
36
  "textcat":[
37
  "MEDICATION",
38
- "OTHER",
39
- "DEVICE"
40
  ]
41
  },
42
  "pipeline":[
 
43
  "transformer",
44
  "ner",
45
  "textcat"
46
  ],
47
  "components":[
 
48
  "transformer",
49
  "ner",
50
  "textcat"
@@ -53,101 +57,92 @@
53
 
54
  ],
55
  "performance":{
56
- "ents_f":0.9989623288,
57
- "ents_p":0.9987402508,
58
- "ents_r":0.9991845056,
59
  "ents_per_type":{
60
- "Quantity_Factor_Value":{
61
- "p":0.9984326019,
62
- "r":0.9992156863,
63
- "f":0.9988239906
64
- },
65
- "Quantity_Factor_Unit":{
66
- "p":0.9843575419,
67
- "r":0.9951054217,
68
- "f":0.9897023029
69
- },
70
  "Medication_Name":{
71
- "p":0.9995313964,
72
- "r":0.999344078,
73
- "f":0.9994377284
74
  },
75
  "Numerator_Value":{
76
- "p":0.9995644125,
77
- "r":0.9996732738,
78
- "f":0.9996188402
79
  },
80
  "Numerator_Unit":{
81
- "p":0.999780894,
82
- "r":0.9996713769,
83
- "f":0.9997261324
84
- },
85
- "Denominator_Unit":{
86
- "p":0.9982694685,
87
- "r":0.9995049505,
88
- "f":0.9988868275
89
  },
90
  "Dose_Form":{
91
- "p":0.9995135595,
92
- "r":0.9984207969,
93
- "f":0.9989668794
94
  },
95
- "Product_Package_Type_Value":{
96
- "p":1.0,
97
- "r":1.0,
98
- "f":1.0
99
  },
100
- "Product_Package_Type":{
101
- "p":1.0,
102
- "r":1.0,
103
- "f":1.0
104
  },
105
- "NDC":{
106
- "p":1.0,
107
- "r":1.0,
108
- "f":1.0
109
  },
110
- "Denominator_Value":{
111
- "p":1.0,
112
- "r":0.9948979592,
113
- "f":0.9974424552
114
  },
115
  "Quantity_Factor_Unit_Value":{
116
- "p":1.0,
 
 
 
 
 
117
  "r":1.0,
118
- "f":1.0
 
 
 
 
 
119
  }
120
  },
121
- "cats_score":1.0,
122
  "cats_score_desc":"macro F",
123
- "cats_micro_p":1.0,
124
- "cats_micro_r":1.0,
125
- "cats_micro_f":1.0,
126
- "cats_macro_p":1.0,
127
- "cats_macro_r":1.0,
128
- "cats_macro_f":1.0,
129
- "cats_macro_auc":1.0,
130
  "cats_f_per_type":{
131
  "MEDICATION":{
132
- "p":1.0,
133
- "r":1.0,
134
- "f":1.0
135
  },
136
  "OTHER":{
137
- "p":1.0,
138
- "r":1.0,
139
- "f":1.0
140
- },
141
- "DEVICE":{
142
- "p":1.0,
143
- "r":1.0,
144
- "f":1.0
145
  }
146
  },
147
  "cats_macro_auc_per_type":0.0,
148
- "transformer_loss":196.8679366748,
149
- "ner_loss":5134.8920414948,
150
- "textcat_loss":0.0208005876
 
151
  },
152
  "requirements":[
153
  "spacy-transformers>=1.1.7,<1.2.0"
 
1
  {
2
  "lang":"en",
3
  "name":"med12_trf",
4
+ "version":"1",
5
  "description":"",
6
  "author":"",
7
  "email":"",
 
16
  "name":null
17
  },
18
  "labels":{
19
+ "tok2vec":[
20
+
21
+ ],
22
  "transformer":[
23
 
24
  ],
 
38
  ],
39
  "textcat":[
40
  "MEDICATION",
41
+ "OTHER"
 
42
  ]
43
  },
44
  "pipeline":[
45
+ "tok2vec",
46
  "transformer",
47
  "ner",
48
  "textcat"
49
  ],
50
  "components":[
51
+ "tok2vec",
52
  "transformer",
53
  "ner",
54
  "textcat"
 
57
 
58
  ],
59
  "performance":{
60
+ "ents_f":0.8635558181,
61
+ "ents_p":0.8630460449,
62
+ "ents_r":0.8640661939,
63
  "ents_per_type":{
 
 
 
 
 
 
 
 
 
 
64
  "Medication_Name":{
65
+ "p":0.8534482759,
66
+ "r":0.8497854077,
67
+ "f":0.8516129032
68
  },
69
  "Numerator_Value":{
70
+ "p":0.9416666667,
71
+ "r":0.9826086957,
72
+ "f":0.9617021277
73
  },
74
  "Numerator_Unit":{
75
+ "p":0.9230769231,
76
+ "r":0.9391304348,
77
+ "f":0.9310344828
 
 
 
 
 
78
  },
79
  "Dose_Form":{
80
+ "p":0.8113207547,
81
+ "r":0.8322580645,
82
+ "f":0.821656051
83
  },
84
+ "Quantity_Factor_Value":{
85
+ "p":0.8064516129,
86
+ "r":0.7575757576,
87
+ "f":0.78125
88
  },
89
+ "Quantity_Factor_Unit":{
90
+ "p":0.8928571429,
91
+ "r":0.8333333333,
92
+ "f":0.8620689655
93
  },
94
+ "Product_Package_Type":{
95
+ "p":0.8717948718,
96
+ "r":0.7727272727,
97
+ "f":0.8192771084
98
  },
99
+ "Product_Package_Type_Value":{
100
+ "p":0.8571428571,
101
+ "r":0.8571428571,
102
+ "f":0.8571428571
103
  },
104
  "Quantity_Factor_Unit_Value":{
105
+ "p":0.9090909091,
106
+ "r":0.7547169811,
107
+ "f":0.824742268
108
+ },
109
+ "Denominator_Value":{
110
+ "p":0.4545454545,
111
  "r":1.0,
112
+ "f":0.625
113
+ },
114
+ "Denominator_Unit":{
115
+ "p":0.6470588235,
116
+ "r":0.9166666667,
117
+ "f":0.7586206897
118
  }
119
  },
120
+ "cats_score":0.9684618658,
121
  "cats_score_desc":"macro F",
122
+ "cats_micro_p":0.9360913862,
123
+ "cats_micro_r":0.9963851462,
124
+ "cats_micro_f":0.9652976759,
125
+ "cats_macro_p":0.9423963976,
126
+ "cats_macro_r":0.9961496466,
127
+ "cats_macro_f":0.9684618658,
128
+ "cats_macro_auc":0.9968290074,
129
  "cats_f_per_type":{
130
  "MEDICATION":{
131
+ "p":0.9273913043,
132
+ "r":0.996728972,
133
+ "f":0.9608108108
134
  },
135
  "OTHER":{
136
+ "p":0.9574014909,
137
+ "r":0.9955703212,
138
+ "f":0.9761129207
 
 
 
 
 
139
  }
140
  },
141
  "cats_macro_auc_per_type":0.0,
142
+ "tok2vec_loss":0.0,
143
+ "transformer_loss":1310.1644586238,
144
+ "ner_loss":280.7821546286,
145
+ "textcat_loss":12.6144110119
146
  },
147
  "requirements":[
148
  "spacy-transformers>=1.1.7,<1.2.0"
ner/model CHANGED
Binary files a/ner/model and b/ner/model differ
 
ner/moves CHANGED
@@ -1 +1 @@
1
- ��moves��{"0":{},"1":{"Dose_Form":68852,"Medication_Name":66662,"Numerator_Value":36834,"Numerator_Unit":36579,"NDC":29835,"Denominator_Unit":16075,"Product_Package_Type":10496,"Quantity_Factor_Unit":10472,"Quantity_Factor_Value":10195,"Product_Package_Type_Value":9752,"Quantity_Factor_Unit_Value":3419,"Denominator_Value":627},"2":{"Dose_Form":68852,"Medication_Name":66662,"Numerator_Value":36834,"Numerator_Unit":36579,"NDC":29835,"Denominator_Unit":16075,"Product_Package_Type":10496,"Quantity_Factor_Unit":10472,"Quantity_Factor_Value":10195,"Product_Package_Type_Value":9752,"Quantity_Factor_Unit_Value":3419,"Denominator_Value":627},"3":{"Dose_Form":68852,"Medication_Name":66662,"Numerator_Value":36834,"Numerator_Unit":36579,"NDC":29835,"Denominator_Unit":16075,"Product_Package_Type":10496,"Quantity_Factor_Unit":10472,"Quantity_Factor_Value":10195,"Product_Package_Type_Value":9752,"Quantity_Factor_Unit_Value":3419,"Denominator_Value":627},"4":{"Dose_Form":68852,"Medication_Name":66662,"Numerator_Value":36834,"Numerator_Unit":36579,"NDC":29835,"Denominator_Unit":16075,"Product_Package_Type":10496,"Quantity_Factor_Unit":10472,"Quantity_Factor_Value":10195,"Product_Package_Type_Value":9752,"Quantity_Factor_Unit_Value":3419,"Denominator_Value":627,"":1},"5":{"":1}}�cfg��neg_key�
 
1
+ ��moves��{"0":{},"1":{"Dose_Form":68280,"Medication_Name":66363,"Numerator_Value":36632,"Numerator_Unit":36387,"NDC":29380,"Denominator_Unit":15956,"Quantity_Factor_Unit":10336,"Product_Package_Type":10272,"Quantity_Factor_Value":10061,"Product_Package_Type_Value":9617,"Quantity_Factor_Unit_Value":3388,"Denominator_Value":653},"2":{"Dose_Form":68280,"Medication_Name":66363,"Numerator_Value":36632,"Numerator_Unit":36387,"NDC":29380,"Denominator_Unit":15956,"Quantity_Factor_Unit":10336,"Product_Package_Type":10272,"Quantity_Factor_Value":10061,"Product_Package_Type_Value":9617,"Quantity_Factor_Unit_Value":3388,"Denominator_Value":653},"3":{"Dose_Form":68280,"Medication_Name":66363,"Numerator_Value":36632,"Numerator_Unit":36387,"NDC":29380,"Denominator_Unit":15956,"Quantity_Factor_Unit":10336,"Product_Package_Type":10272,"Quantity_Factor_Value":10061,"Product_Package_Type_Value":9617,"Quantity_Factor_Unit_Value":3388,"Denominator_Value":653},"4":{"Dose_Form":68280,"Medication_Name":66363,"Numerator_Value":36632,"Numerator_Unit":36387,"NDC":29380,"Denominator_Unit":15956,"Quantity_Factor_Unit":10336,"Product_Package_Type":10272,"Quantity_Factor_Value":10061,"Product_Package_Type_Value":9617,"Quantity_Factor_Unit_Value":3388,"Denominator_Value":653,"":1},"5":{"":1}}�cfg��neg_key�
textcat/cfg CHANGED
@@ -1,8 +1,7 @@
1
  {
2
  "labels":[
3
  "MEDICATION",
4
- "OTHER",
5
- "DEVICE"
6
  ],
7
  "threshold":0.5,
8
  "positive_label":null
 
1
  {
2
  "labels":[
3
  "MEDICATION",
4
+ "OTHER"
 
5
  ],
6
  "threshold":0.5,
7
  "positive_label":null
textcat/model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4b2ab8a3a55cf5dacbc07a392060c6dce33219b5f74ec8208e0348e2b7ef5e82
3
- size 10254174
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f483561606ab536a2921d5c0d71ed741f06ecaaa8408e2f02a8af50a7058ce7
3
+ size 9202498
tok2vec/cfg ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+
3
+ }
tok2vec/model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7defdfa5b6acb8521c4f92e0f0ce07cd91cc0d095fabf39eb07405344a330664
3
+ size 6585091
transformer/model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ae16e5a7ea2cd26be7df40459ddb514a7ac88c52e0a67ed668e0b3052d696b9e
3
- size 502030647
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d37cbcb7a5e658fa6a3e8f4e5a4c9d04ae623faf1a9033df2c6c40db5cef26fe
3
+ size 502030732
vocab/strings.json CHANGED
The diff for this file is too large to render. See raw diff