init

Browse files

Files changed (15) hide show

README.md +35 -0
config.cfg +145 -0
es_cantemist_ner_trf-3.4.0-py3-none-any.whl +3 -0
meta.json +54 -0
ner/cfg +13 -0
ner/model +3 -0
ner/moves +1 -0
tokenizer +3 -0
transformer/cfg +3 -0
transformer/model +3 -0
vocab/key2row +1 -0
vocab/lookups.bin +3 -0
vocab/strings.json +0 -0
vocab/vectors +0 -0
vocab/vectors.cfg +3 -0

README.md ADDED Viewed

	@@ -0,0 +1,35 @@

+Basic spaCy BioNER pipeline built on the RoBERTa-based model [bsc-bio-ehr-es](https://huggingface.co/PlanTL-GOB-ES/bsc-bio-ehr-es) and trained on the CANTEMIST dataset, which is annotated with tumor morphology entities. For further information, check the [official website](https://temu.bsc.es/cantemist/). Visit our [GitHub repository](https://github.com/PlanTL-GOB-ES/lm-biomedical-clinical-es). This work was funded by the Spanish State Secretariat for Digitalization and Artificial Intelligence (SEDIA) within the framework of the Plan-TL.
+| Feature | Description |
+| --- | --- |
+| **Name** | `es_cantemist_ner_trf` |
+| **Version** | `3.4.0` |
+| **spaCy** | `>=3.4.0,<3.5.0` |
+| **Default Pipeline** | `transformer`, `ner` |
+| **Components** | `transformer`, `ner` |
+| **Vectors** | 0 keys, 0 unique vectors (0 dimensions) |
+| **Sources** | n/a |
+| **License** | [Apache License, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0) |
+| **Author** | [The Text Mining Unit from Barcelona Supercomputing Center.](https://huggingface.co/PlanTL-GOB-ES) |
+### Label Scheme
+<details>
+<summary>View label scheme (1 label for 1 component)</summary>
+| Component | Labels |
+| --- | --- |
+| **`ner`** | `MORFOLOGIA_NEOPLASIA` |
+</details>
+### Accuracy
+| Type | Score |
+| --- | --- |
+| `ENTS_F` | 84.52 |
+| `ENTS_P` | 84.88 |
+| `ENTS_R` | 84.16 |
+| `TRANSFORMER_LOSS` | 256.47 |
+| `NER_LOSS` | 96.23 |

config.cfg ADDED Viewed

	@@ -0,0 +1,145 @@

+[paths]
+train = "corpus/train.spacy"
+dev = "corpus/dev.spacy"
+vectors = null
+init_tok2vec = null
+[system]
+gpu_allocator = "pytorch"
+seed = 0
+[nlp]
+lang = "es"
+pipeline = ["transformer","ner"]
+batch_size = 128
+disabled = []
+before_creation = null
+after_creation = null
+after_pipeline_creation = null
+tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
+[components]
+[components.ner]
+factory = "ner"
+incorrect_spans_key = null
+moves = null
+scorer = {"@scorers":"spacy.ner_scorer.v1"}
+update_with_oracle_cut_size = 100
+[components.ner.model]
+@architectures = "spacy.TransitionBasedParser.v2"
+state_type = "ner"
+extra_state_tokens = false
+hidden_width = 64
+maxout_pieces = 2
+use_upper = false
+nO = null
+[components.ner.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+pooling = {"@layers":"reduce_mean.v1"}
+upstream = "*"
+[components.transformer]
+factory = "transformer"
+max_batch_items = 4096
+set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"}
+[components.transformer.model]
+@architectures = "spacy-transformers.TransformerModel.v3"
+name = "PlanTL-GOB-ES/bsc-bio-ehr-es"
+mixed_precision = false
+[components.transformer.model.get_spans]
+@span_getters = "spacy-transformers.strided_spans.v1"
+window = 128
+stride = 96
+[components.transformer.model.grad_scaler_config]
+[components.transformer.model.tokenizer_config]
+use_fast = true
+[components.transformer.model.transformer_config]
+[corpora]
+[corpora.dev]
+@readers = "spacy.Corpus.v1"
+path = ${paths.dev}
+max_length = 0
+gold_preproc = false
+limit = 0
+augmenter = null
+[corpora.train]
+@readers = "spacy.Corpus.v1"
+path = ${paths.train}
+max_length = 0
+gold_preproc = false
+limit = 0
+augmenter = null
+[training]
+accumulate_gradient = 3
+dev_corpus = "corpora.dev"
+train_corpus = "corpora.train"
+seed = ${system.seed}
+gpu_allocator = ${system.gpu_allocator}
+dropout = 0.1
+patience = 1600
+max_epochs = 0
+max_steps = 20000
+eval_frequency = 200
+frozen_components = []
+annotating_components = []
+before_to_disk = null
+[training.batcher]
+@batchers = "spacy.batch_by_padded.v1"
+discard_oversize = true
+size = 2000
+buffer = 256
+get_length = null
+[training.logger]
+@loggers = "spacy.ConsoleLogger.v1"
+progress_bar = false
+[training.optimizer]
+@optimizers = "Adam.v1"
+beta1 = 0.9
+beta2 = 0.999
+L2_is_weight_decay = true
+L2 = 0.01
+grad_clip = 1.0
+use_averages = false
+eps = 0.00000001
+[training.optimizer.learn_rate]
+@schedules = "warmup_linear.v1"
+warmup_steps = 250
+total_steps = 20000
+initial_rate = 0.00005
+[training.score_weights]
+ents_f = 1.0
+ents_p = 0.0
+ents_r = 0.0
+ents_per_type = null
+[pretraining]
+[initialize]
+vectors = ${paths.vectors}
+init_tok2vec = ${paths.init_tok2vec}
+vocab_data = null
+lookups = null
+before_init = null
+after_init = null
+[initialize.components]
+[initialize.tokenizer]

es_cantemist_ner_trf-3.4.0-py3-none-any.whl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a89c8acad991e8592f109e76f974abcec7c341b48bc7a6ef5d3407fb7bfe2ab0
+size 441486346

meta.json ADDED Viewed

	@@ -0,0 +1,54 @@

+{
+  "lang":"es",
+  "name":"cantemist_ner_trf",
+  "version":"3.4.0",
+  "description":"Basic spaCy BioNER pipeline, with a RoBERTa-based model [bsc-bio-ehr-es](https://huggingface.co/PlanTL-GOB-ES/bsc-bio-ehr-es) and a dataset, CANTEMIST, annotated with tumor morphology entities. For further information, check the [official website](https://temu.bsc.es/cantemist/). Visit our [GitHub repository](https://github.com/PlanTL-GOB-ES/lm-biomedical-clinical-es). This work was funded by the Spanish State Secretariat for Digitalization and Artificial Intelligence (SEDIA) within the framework of the Plan-TL.",
+  "author":"The Text Mining Unit from Barcelona Supercomputing Center.",
+  "email":"plantl-gob-es@bsc.es",
+  "url":"https://huggingface.co/PlanTL-GOB-ES",
+  "license":"[Apache License, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0)",
+  "spacy_version":">=3.4.0,<3.5.0",
+  "spacy_git_version":"Unknown",
+  "vectors":{
+    "width":0,
+    "vectors":0,
+    "keys":0,
+    "name":null
+  },
+  "labels":{
+    "transformer":[
+    ],
+    "ner":[
+      "MORFOLOGIA_NEOPLASIA"
+    ]
+  },
+  "pipeline":[
+    "transformer",
+    "ner"
+  ],
+  "components":[
+    "transformer",
+    "ner"
+  ],
+  "disabled":[
+  ],
+  "performance":{
+    "ents_f":0.8451798075,
+    "ents_p":0.8487622923,
+    "ents_r":0.8416274378,
+    "ents_per_type":{
+      "MORFOLOGIA_NEOPLASIA":{
+        "p":0.8487622923,
+        "r":0.8416274378,
+        "f":0.8451798075
+      }
+    },
+    "transformer_loss":256.4677646511,
+    "ner_loss":96.2283862055
+  },
+  "requirements":[
+    "spacy-transformers>=1.1.8,<1.2.0"
+  ]
+}

ner/cfg ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "moves":null,
+  "update_with_oracle_cut_size":100,
+  "multitasks":[
+  ],
+  "min_action_freq":1,
+  "learn_tokens":false,
+  "beam_width":1,
+  "beam_density":0.0,
+  "beam_update_prob":0.0,
+  "incorrect_spans_key":null
+}

ner/model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3dff10568a451ec1d3ef8a9e4ba695ab798f9a557e8fbf7a8d90e085f7717e37
+size 207145

ner/moves ADDED Viewed

	@@ -0,0 +1 @@


1	+ ��moves٤{"0":{},"1":{"MORFOLOGIA_NEOPLASIA":14889},"2":{"MORFOLOGIA_NEOPLASIA":14889},"3":{"MORFOLOGIA_NEOPLASIA":14889},"4":{"MORFOLOGIA_NEOPLASIA":14889,"":1},"5":{"":1}}�cfg��neg_key�

tokenizer ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c453330d14b61214d60b17c4c5a47c4acf3027c9b32c452dd3b26f66c5b28169
+size 36836

transformer/cfg ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+  "max_batch_items":4096
+}

transformer/model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e45adc31db0ea71505cf438a94ec2f6ce0c328dc2935771d46f3928a3bfedc7c
+size 502280552

vocab/key2row ADDED Viewed

	@@ -0,0 +1 @@


1	+ �

vocab/lookups.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:76be8b528d0075f7aae98d6fa57a6d3c83ae480a8469e668d7b0af968995ac71
+size 1

vocab/strings.json ADDED Viewed

The diff for this file is too large to render. See raw diff

vocab/vectors ADDED Viewed

Binary file (128 Bytes). View file

vocab/vectors.cfg ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+  "mode":"default"
+}