kormilitzin committed on
Commit
b343bff
1 Parent(s): 52dc7ac

Update spaCy pipeline

Browse files
.gitattributes CHANGED
@@ -30,3 +30,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
30
  *strings.json filter=lfs diff=lfs merge=lfs -text
31
  vectors filter=lfs diff=lfs merge=lfs -text
32
  model filter=lfs diff=lfs merge=lfs -text
 
 
30
  *strings.json filter=lfs diff=lfs merge=lfs -text
31
  vectors filter=lfs diff=lfs merge=lfs -text
32
  model filter=lfs diff=lfs merge=lfs -text
33
+ vocab/key2row filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -14,25 +14,25 @@ model-index:
14
  metrics:
15
  - name: NER Precision
16
  type: precision
17
- value: 0.8778156997
18
  - name: NER Recall
19
  type: recall
20
- value: 0.8840918466
21
  - name: NER F Score
22
  type: f_score
23
- value: 0.8809425949
24
  ---
25
  | Feature | Description |
26
  | --- | --- |
27
  | **Name** | `en_core_med7_lg` |
28
- | **Version** | `3.1.3.1` |
29
- | **spaCy** | `>=3.1.4,<3.2.0` |
30
  | **Default Pipeline** | `tok2vec`, `ner` |
31
  | **Components** | `tok2vec`, `ner` |
32
- | **Vectors** | 684830 keys, 684830 unique vectors (300 dimensions) |
33
  | **Sources** | n/a |
34
  | **License** | `MIT` |
35
- | **Author** | [Andrey Kormilitzin](kormilitzin.com) |
36
 
37
  ### Label Scheme
38
 
@@ -50,22 +50,8 @@ model-index:
50
 
51
  | Type | Score |
52
  | --- | --- |
53
- | `ENTS_F` | 88.09 |
54
- | `ENTS_P` | 87.78 |
55
- | `ENTS_R` | 88.41 |
56
- | `TOK2VEC_LOSS` | 115648.09 |
57
- | `NER_LOSS` | 279069.77 |
58
-
59
- ### BibTeX entry and citation info
60
-
61
- ```bibtex
62
- @article{kormilitzin2021med7,
63
- title={Med7: A transferable clinical natural language processing model for electronic health records},
64
- author={Kormilitzin, Andrey and Vaci, Nemanja and Liu, Qiang and Nevado-Holgado, Alejo},
65
- journal={Artificial Intelligence in Medicine},
66
- volume={118},
67
- pages={102086},
68
- year={2021},
69
- publisher={Elsevier}
70
- }
71
- ```
 
14
  metrics:
15
  - name: NER Precision
16
  type: precision
17
+ value: 0.8649613325
18
  - name: NER Recall
19
  type: recall
20
+ value: 0.8892966361
21
  - name: NER F Score
22
  type: f_score
23
+ value: 0.876960193
24
  ---
25
  | Feature | Description |
26
  | --- | --- |
27
  | **Name** | `en_core_med7_lg` |
28
+ | **Version** | `3.4.2.1` |
29
+ | **spaCy** | `>=3.4.2,<3.5.0` |
30
  | **Default Pipeline** | `tok2vec`, `ner` |
31
  | **Components** | `tok2vec`, `ner` |
32
+ | **Vectors** | 514157 keys, 514157 unique vectors (300 dimensions) |
33
  | **Sources** | n/a |
34
  | **License** | `MIT` |
35
+ | **Author** | [Andrey Kormilitzin](https://www.kormilitzin.com/) |
36
 
37
  ### Label Scheme
38
 
 
50
 
51
  | Type | Score |
52
  | --- | --- |
53
+ | `ENTS_F` | 87.70 |
54
+ | `ENTS_P` | 86.50 |
55
+ | `ENTS_R` | 88.93 |
56
+ | `TOK2VEC_LOSS` | 226109.53 |
57
+ | `NER_LOSS` | 302222.55 |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
config.cfg CHANGED
@@ -1,8 +1,9 @@
1
  [paths]
2
- train = "./data/spacy_format/train_443.spacy"
3
- dev = "./data/spacy_format/dev_443.spacy"
4
- vectors = null
5
- init_tok2vec = null
 
6
 
7
  [system]
8
  gpu_allocator = null
@@ -24,13 +25,14 @@ tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
24
  factory = "ner"
25
  incorrect_spans_key = null
26
  moves = null
 
27
  update_with_oracle_cut_size = 100
28
 
29
  [components.ner.model]
30
  @architectures = "spacy.TransitionBasedParser.v2"
31
  state_type = "ner"
32
  extra_state_tokens = false
33
- hidden_width = 64
34
  maxout_pieces = 2
35
  use_upper = true
36
  nO = null
@@ -49,8 +51,8 @@ factory = "tok2vec"
49
  [components.tok2vec.model.embed]
50
  @architectures = "spacy.MultiHashEmbed.v2"
51
  width = ${components.tok2vec.model.encode.width}
52
- attrs = ["ORTH","SHAPE"]
53
- rows = [5000,2500]
54
  include_static_vectors = true
55
 
56
  [components.tok2vec.model.encode]
@@ -85,7 +87,7 @@ seed = ${system.seed}
85
  gpu_allocator = ${system.gpu_allocator}
86
  dropout = 0.1
87
  accumulate_gradient = 1
88
- patience = 1600
89
  max_epochs = 0
90
  max_steps = 20000
91
  eval_frequency = 200
@@ -108,7 +110,7 @@ t = 0.0
108
 
109
  [training.logger]
110
  @loggers = "spacy.ConsoleLogger.v1"
111
- progress_bar = false
112
 
113
  [training.optimizer]
114
  @optimizers = "Adam.v1"
@@ -130,13 +132,17 @@ ents_per_type = null
130
  [pretraining]
131
 
132
  [initialize]
133
- vectors = "en_core_web_lg"
134
  init_tok2vec = ${paths.init_tok2vec}
135
  vocab_data = null
136
- lookups = null
137
  before_init = null
138
  after_init = null
139
 
140
  [initialize.components]
141
 
 
 
 
 
 
142
  [initialize.tokenizer]
 
1
  [paths]
2
+ train = "/mnt/sdf/andrey/projects/med7_v3/data/spacy_format/train_med7_v34.spacy"
3
+ dev = "/mnt/sdf/andrey/projects/med7_v3/data/spacy_format/dev_med7_v34.spacy"
4
+ raw_text = "/mnt/sdf/andrey/projects/med7_v3/data/pretrain_mimic.jsonl"
5
+ vectors = "en_core_web_lg"
6
+ init_tok2vec = "/mnt/sdf/andrey/projects/med7_v3/output_pretrain_lg/model169.bin"
7
 
8
  [system]
9
  gpu_allocator = null
 
25
  factory = "ner"
26
  incorrect_spans_key = null
27
  moves = null
28
+ scorer = {"@scorers":"spacy.ner_scorer.v1"}
29
  update_with_oracle_cut_size = 100
30
 
31
  [components.ner.model]
32
  @architectures = "spacy.TransitionBasedParser.v2"
33
  state_type = "ner"
34
  extra_state_tokens = false
35
+ hidden_width = 128
36
  maxout_pieces = 2
37
  use_upper = true
38
  nO = null
 
51
  [components.tok2vec.model.embed]
52
  @architectures = "spacy.MultiHashEmbed.v2"
53
  width = ${components.tok2vec.model.encode.width}
54
+ attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
55
+ rows = [5000,1000,2500,2500]
56
  include_static_vectors = true
57
 
58
  [components.tok2vec.model.encode]
 
87
  gpu_allocator = ${system.gpu_allocator}
88
  dropout = 0.1
89
  accumulate_gradient = 1
90
+ patience = 3600
91
  max_epochs = 0
92
  max_steps = 20000
93
  eval_frequency = 200
 
110
 
111
  [training.logger]
112
  @loggers = "spacy.ConsoleLogger.v1"
113
+ progress_bar = true
114
 
115
  [training.optimizer]
116
  @optimizers = "Adam.v1"
 
132
  [pretraining]
133
 
134
  [initialize]
135
+ vectors = ${paths.vectors}
136
  init_tok2vec = ${paths.init_tok2vec}
137
  vocab_data = null
 
138
  before_init = null
139
  after_init = null
140
 
141
  [initialize.components]
142
 
143
+ [initialize.lookups]
144
+ @misc = "spacy.LookupsDataLoader.v1"
145
+ lang = ${nlp.lang}
146
+ tables = ["lexeme_norm"]
147
+
148
  [initialize.tokenizer]
en_core_med7_lg-any-py3-none-any.whl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a0b0795c8891465e185790eb325fe8cdb571cabb9f7b5f588b8038765581d452
3
- size 790703817
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c8ce371b7ee8b3c42cce7c05031ab165e6e88cc3ed136797eb3c492af96001f
3
+ size 607352778
meta.json CHANGED
@@ -1,18 +1,18 @@
1
  {
2
  "lang":"en",
3
  "name":"core_med7_lg",
4
- "version":"3.1.3.1",
5
  "description":"",
6
  "author":"Andrey Kormilitzin",
7
  "email":"kormilitzin@gmail.com",
8
- "url":"kormilitzin.com",
9
  "license":"MIT",
10
- "spacy_version":">=3.1.4,<3.2.0",
11
- "spacy_git_version":"8bda39f08",
12
  "vectors":{
13
  "width":300,
14
- "vectors":684830,
15
- "keys":684830,
16
  "name":"en_vectors"
17
  },
18
  "labels":{
@@ -41,48 +41,48 @@
41
 
42
  ],
43
  "performance":{
44
- "ents_f":0.8809425949,
45
- "ents_p":0.8778156997,
46
- "ents_r":0.8840918466,
47
  "ents_per_type":{
48
  "DRUG":{
49
- "p":0.8897203623,
50
- "r":0.8855350843,
51
- "f":0.8876227898
52
  },
53
- "DOSAGE":{
54
- "p":0.8563134978,
55
- "r":0.8885542169,
56
- "f":0.8721359941
57
  },
58
  "STRENGTH":{
59
- "p":0.9299065421,
60
- "r":0.9307764266,
61
- "f":0.930341281
62
  },
63
- "DURATION":{
64
- "p":0.6741573034,
65
- "r":0.625,
66
- "f":0.6486486486
67
  },
68
  "FORM":{
69
- "p":0.9318637275,
70
- "r":0.8635097493,
71
- "f":0.8963855422
72
  },
73
- "FREQUENCY":{
74
- "p":0.7181544634,
75
- "r":0.8211009174,
76
- "f":0.7661851257
77
  },
78
- "ROUTE":{
79
- "p":0.9331919406,
80
- "r":0.9322033898,
81
- "f":0.9326974033
82
  }
83
  },
84
- "tok2vec_loss":1156.4809109417,
85
- "ner_loss":2790.6977062746
86
  },
87
  "requirements":[
88
 
 
1
  {
2
  "lang":"en",
3
  "name":"core_med7_lg",
4
+ "version":"3.4.2.1",
5
  "description":"",
6
  "author":"Andrey Kormilitzin",
7
  "email":"kormilitzin@gmail.com",
8
+ "url":"https://www.kormilitzin.com/",
9
  "license":"MIT",
10
+ "spacy_version":">=3.4.2,<3.5.0",
11
+ "spacy_git_version":"Unknown",
12
  "vectors":{
13
  "width":300,
14
+ "vectors":514157,
15
+ "keys":514157,
16
  "name":"en_vectors"
17
  },
18
  "labels":{
 
41
 
42
  ],
43
  "performance":{
44
+ "ents_f":0.876960193,
45
+ "ents_p":0.8649613325,
46
+ "ents_r":0.8892966361,
47
  "ents_per_type":{
48
  "DRUG":{
49
+ "p":0.8638497653,
50
+ "r":0.8761904762,
51
+ "f":0.8699763593
52
  },
53
+ "ROUTE":{
54
+ "p":0.9427083333,
55
+ "r":0.9427083333,
56
+ "f":0.9427083333
57
  },
58
  "STRENGTH":{
59
+ "p":0.8814229249,
60
+ "r":0.9214876033,
61
+ "f":0.901010101
62
  },
63
+ "FREQUENCY":{
64
+ "p":0.7222222222,
65
+ "r":0.8210526316,
66
+ "f":0.7684729064
67
  },
68
  "FORM":{
69
+ "p":0.9400921659,
70
+ "r":0.9400921659,
71
+ "f":0.9400921659
72
  },
73
+ "DOSAGE":{
74
+ "p":0.8671328671,
75
+ "r":0.8671328671,
76
+ "f":0.8671328671
77
  },
78
+ "DURATION":{
79
+ "p":0.6666666667,
80
+ "r":0.6666666667,
81
+ "f":0.6666666667
82
  }
83
  },
84
+ "tok2vec_loss":2261.0953059313,
85
+ "ner_loss":3022.2254596124
86
  },
87
  "requirements":[
88
 
ner/model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:877cccaee4e82318e0afaaa1cbce7117e2bec697f870ab0ca75d272a85d4fc91
3
- size 174716
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7822d7f88fd809e295fefb36952f61c0bd54de74030670c3b2f9908f651fb51
3
+ size 545160
ner/moves CHANGED
@@ -1 +1 @@
1
- ��moves��{"0":{},"1":{"DRUG":25068,"STRENGTH":19845,"DOSAGE":16674,"FREQUENCY":15332,"FORM":12981,"ROUTE":8181,"DURATION":1940},"2":{"DRUG":25068,"STRENGTH":19845,"DOSAGE":16674,"FREQUENCY":15332,"FORM":12981,"ROUTE":8181,"DURATION":1940},"3":{"DRUG":25068,"STRENGTH":19845,"DOSAGE":16674,"FREQUENCY":15332,"FORM":12981,"ROUTE":8181,"DURATION":1940},"4":{"DRUG":25068,"STRENGTH":19845,"DOSAGE":16674,"FREQUENCY":15332,"FORM":12981,"ROUTE":8181,"DURATION":1940,"":1},"5":{"":1}}�cfg��neg_key�
 
1
+ ��moves��{"0":{},"1":{"DRUG":27417,"STRENGTH":21625,"DOSAGE":18350,"FREQUENCY":16642,"FORM":14267,"ROUTE":8996,"DURATION":2140},"2":{"DRUG":27417,"STRENGTH":21625,"DOSAGE":18350,"FREQUENCY":16642,"FORM":14267,"ROUTE":8996,"DURATION":2140},"3":{"DRUG":27417,"STRENGTH":21625,"DOSAGE":18350,"FREQUENCY":16642,"FORM":14267,"ROUTE":8996,"DURATION":2140},"4":{"DRUG":27417,"STRENGTH":21625,"DOSAGE":18350,"FREQUENCY":16642,"FORM":14267,"ROUTE":8996,"DURATION":2140,"":1},"5":{"":1}}�cfg��neg_key�
tok2vec/model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8903ce60de43343f02405a3f1e1415ff16e18ab60a0bf4c058020863ceefdeac
3
- size 29276330
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:451d2b22dc4129d515e2e2246c54a11f605a8a18ba4a426dae5d7c40d253c707
3
+ size 34434008
tokenizer CHANGED
The diff for this file is too large to render. See raw diff
 
vocab/key2row CHANGED
Binary files a/vocab/key2row and b/vocab/key2row differ
 
vocab/lookups.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:76be8b528d0075f7aae98d6fa57a6d3c83ae480a8469e668d7b0af968995ac71
3
- size 1
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ddd140ecac6a8c4592e9146d6e30074569ffaed97ee51edc9587dc510f8934c
3
+ size 69982
vocab/strings.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fef6c821b0367aa50f02bbcc32e74a95fe09f65317f9714ed236751e4f8ce894
3
- size 9894693
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b1693baa4ec8d99e20fa93b38ecafa6b4f49f244472d455f35e7605f20345e0
3
+ size 10856674
vocab/vectors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d90b9122eef03666021c0592972138a2d70f785920cdc86588b369ec327074a7
3
- size 821796128
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:234dcf234bfdf01775ae6182715d55eaacfcde8555b189f25440b56d3c39fd5d
3
+ size 616988528
vocab/vectors.cfg ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "mode":"default"
3
+ }