Tanor committed on
Commit 286a29c
1 Parent(s): 39671d9

Update spaCy pipeline

.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ sr_ner_tesla_bcx-any-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
+ transformer/model filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,59 @@
+ ---
+ tags:
+ - spacy
+ - token-classification
+ language:
+ - sr
+ license: cc-by-sa-3.0
+ model-index:
+ - name: sr_ner_tesla_bcx
+   results:
+   - task:
+       name: NER
+       type: token-classification
+     metrics:
+     - name: NER Precision
+       type: precision
+       value: 0.957813548
+     - name: NER Recall
+       type: recall
+       value: 0.966107428
+     - name: NER F Score
+       type: f_score
+       value: 0.9619426108
+ ---
+ sr_ner_tesla_bcx is a spaCy pipeline fine-tuned for Named Entity Recognition in Serbian texts. It uses a transformer layer based on XLM-R-BERTić and recognizes seven entity categories: PERS (persons), ROLE (professions), DEMO (demonyms), ORG (organizations), LOC (locations), WORK (artworks), and EVENT (events). The full label scheme is listed in the table below. Development of this model was supported by the Science Fund of the Republic of Serbia, grant #7276, project 'Text Embeddings - Serbian Language Applications - TESLA'.
+
+ | Feature | Description |
+ | --- | --- |
+ | **Name** | `sr_ner_tesla_bcx` |
+ | **Version** | `1.0.0` |
+ | **spaCy** | `>=3.7.2,<3.8.0` |
+ | **Default Pipeline** | `transformer`, `ner` |
+ | **Components** | `transformer`, `ner` |
+ | **Vectors** | 0 keys, 0 unique vectors (0 dimensions) |
+ | **Sources** | n/a |
+ | **License** | `CC BY-SA 3.0` |
+ | **Author** | [Milica Ikonić Nešić, Saša Petalinkar, Mihailo Škorić, Ranka Stanković](https://tesla.rgf.bg.ac.rs/) |
+
+ ### Label Scheme
+
+ <details>
+
+ <summary>View label scheme (7 labels for 1 component)</summary>
+
+ | Component | Labels |
+ | --- | --- |
+ | **`ner`** | `DEMO`, `EVENT`, `LOC`, `ORG`, `PERS`, `ROLE`, `WORK` |
+
+ </details>
+
+ ### Accuracy
+
+ | Type | Score |
+ | --- | --- |
+ | `ENTS_F` | 96.19 |
+ | `ENTS_P` | 95.78 |
+ | `ENTS_R` | 96.61 |
+ | `TRANSFORMER_LOSS` | 31553.16 |
+ | `NER_LOSS` | 78169.32 |
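The card above lists metrics but no usage snippet. A minimal sketch, assuming the wheel shipped in this repository has been installed (`pip install sr_ner_tesla_bcx-any-py3-none-any.whl`); the sample sentence is a hypothetical input, not from the model's documentation:

```python
import spacy

# Assumes the wheel from this repo is installed, so the pipeline
# can be loaded by its package name.
nlp = spacy.load("sr_ner_tesla_bcx")

# Hypothetical input: "Nikola Tesla was born in Smiljan."
doc = nlp("Никола Тесла је рођен у Смиљану.")
for ent in doc.ents:
    print(ent.text, ent.label_)
```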
compare.py ADDED
@@ -0,0 +1,45 @@
+ import spacy
+ from spacy.tokens import DocBin
+ import json
+
+ def compare_ner_pipelines(binary_file_path, pipeline_names, output_file_path):
+     # Load spaCy models based on provided pipeline names
+     nlp_pipelines = [spacy.load(name) for name in pipeline_names]
+
+     # Load documents from a binary file
+     doc_bin = DocBin().from_disk(binary_file_path)
+     docs = list(doc_bin.get_docs(nlp_pipelines[0].vocab))  # assuming all models share the same vocab
+
+     # Function to extract entities with their positions
+     def extract_entities(doc):
+         return {(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in doc.ents}
+
+     # Compare entities in each document across all pipelines
+     all_entities_comparison = []
+     for doc in docs:
+         # Run every pipeline on the raw text of the document
+         entities_per_pipeline = [extract_entities(nlp(doc.text)) for nlp in nlp_pipelines]
+
+         # Find common and unique entities
+         common_entities = set.intersection(*entities_per_pipeline)
+         unique_entities = [ents - common_entities for ents in entities_per_pipeline]
+
+         # Append results for each document
+         all_entities_comparison.append({
+             "document_text": doc.text,
+             "common_entities": list(common_entities),
+             "unique_entities_per_pipeline": {i: list(ents) for i, ents in enumerate(unique_entities)},
+         })
+
+     # Save the results to a file
+     with open(output_file_path, 'w', encoding="utf-16") as f:
+         json.dump(all_entities_comparison, f, indent=4, ensure_ascii=False)
+
+     print(f"Comparison results saved to {output_file_path}")
+
+ # Example usage
+ def main():
+     base_path = r"E:\ICIST-2024-models\spacy-tr\spacy-tr"
+     compare_ner_pipelines("SRP19101_1.spacy", [base_path + r"\output20\model-best", base_path + r"\output17\model-best"], "compare_results_nk.json")
+
+ if __name__ == "__main__":
+     main()
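The core of `compare_ner_pipelines` is plain set algebra over `(text, start_char, end_char, label)` tuples. A self-contained illustration with two hypothetical pipeline outputs:

```python
# Hypothetical outputs of extract_entities() for two pipelines that
# agree on a PERS span but disagree on the label of a second span.
ents_a = {("Nikola Tesla", 0, 12, "PERS"), ("Smiljan", 25, 32, "LOC")}
ents_b = {("Nikola Tesla", 0, 12, "PERS"), ("Smiljan", 25, 32, "ORG")}

common_entities = set.intersection(ents_a, ents_b)
unique_entities = [ents - common_entities for ents in (ents_a, ents_b)]

print(common_entities)  # the PERS span both pipelines agree on
print(unique_entities)  # each pipeline's side of the LOC/ORG disagreement
```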
config.cfg ADDED
@@ -0,0 +1,147 @@
+ [paths]
+ train = "./train.spacy"
+ dev = "./dev.spacy"
+ vectors = null
+ init_tok2vec = null
+
+ [system]
+ gpu_allocator = "pytorch"
+ seed = 0
+
+ [nlp]
+ lang = "sr"
+ pipeline = ["transformer","ner"]
+ batch_size = 128
+ disabled = []
+ before_creation = null
+ after_creation = null
+ after_pipeline_creation = null
+ tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
+ vectors = {"@vectors":"spacy.Vectors.v1"}
+
+ [components]
+
+ [components.ner]
+ factory = "ner"
+ incorrect_spans_key = null
+ moves = null
+ scorer = {"@scorers":"spacy.ner_scorer.v1"}
+ update_with_oracle_cut_size = 100
+
+ [components.ner.model]
+ @architectures = "spacy.TransitionBasedParser.v2"
+ state_type = "ner"
+ extra_state_tokens = false
+ hidden_width = 64
+ maxout_pieces = 2
+ use_upper = false
+ nO = null
+
+ [components.ner.model.tok2vec]
+ @architectures = "spacy-transformers.TransformerListener.v1"
+ grad_factor = 1.0
+ pooling = {"@layers":"reduce_mean.v1"}
+ upstream = "*"
+
+ [components.transformer]
+ factory = "transformer"
+ max_batch_items = 4096
+ set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"}
+
+ [components.transformer.model]
+ @architectures = "spacy-transformers.TransformerModel.v3"
+ name = "classla/xlm-r-bertic"
+ mixed_precision = false
+
+ [components.transformer.model.get_spans]
+ @span_getters = "spacy-transformers.strided_spans.v1"
+ window = 128
+ stride = 96
+
+ [components.transformer.model.grad_scaler_config]
+
+ [components.transformer.model.tokenizer_config]
+ use_fast = true
+
+ [components.transformer.model.transformer_config]
+
+ [corpora]
+
+ [corpora.dev]
+ @readers = "spacy.Corpus.v1"
+ path = ${paths.dev}
+ max_length = 0
+ gold_preproc = false
+ limit = 0
+ augmenter = null
+
+ [corpora.train]
+ @readers = "spacy.Corpus.v1"
+ path = ${paths.train}
+ max_length = 0
+ gold_preproc = false
+ limit = 0
+ augmenter = null
+
+ [training]
+ accumulate_gradient = 3
+ dev_corpus = "corpora.dev"
+ train_corpus = "corpora.train"
+ seed = ${system.seed}
+ gpu_allocator = ${system.gpu_allocator}
+ dropout = 0.1
+ patience = 1600
+ max_epochs = 0
+ max_steps = 20000
+ eval_frequency = 200
+ frozen_components = []
+ annotating_components = []
+ before_to_disk = null
+ before_update = null
+
+ [training.batcher]
+ @batchers = "spacy.batch_by_padded.v1"
+ discard_oversize = true
+ size = 2000
+ buffer = 256
+ get_length = null
+
+ [training.logger]
+ @loggers = "spacy.ConsoleLogger.v1"
+ progress_bar = false
+
+ [training.optimizer]
+ @optimizers = "Adam.v1"
+ beta1 = 0.9
+ beta2 = 0.999
+ L2_is_weight_decay = true
+ L2 = 0.01
+ grad_clip = 1.0
+ use_averages = false
+ eps = 0.00000001
+
+ [training.optimizer.learn_rate]
+ @schedules = "warmup_linear.v1"
+ warmup_steps = 250
+ total_steps = 20000
+ initial_rate = 0.00005
+
+ [training.score_weights]
+ ents_f = 1.0
+ ents_p = 0.0
+ ents_r = 0.0
+ ents_per_type = null
+
+ [pretraining]
+
+ [initialize]
+ vectors = ${paths.vectors}
+ init_tok2vec = ${paths.init_tok2vec}
+ vocab_data = null
+ lookups = null
+ before_init = null
+ after_init = null
+
+ [initialize.components]
+
+ [initialize.tokenizer]
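In the config above, `strided_spans.v1` with `window = 128` and `stride = 96` splits each document into overlapping 128-token spans whose starts are 96 tokens apart, so consecutive spans overlap by 32 tokens. A sketch of that windowing idea (not spaCy's exact implementation):

```python
def strided_spans(n_tokens, window=128, stride=96):
    # Yield (start, end) token offsets; consecutive spans overlap by
    # window - stride tokens, so no token sits only at a span edge.
    spans = []
    start = 0
    while start < n_tokens:
        spans.append((start, min(start + window, n_tokens)))
        if start + window >= n_tokens:
            break
        start += stride
    return spans

print(strided_spans(200))  # [(0, 128), (96, 200)]
```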
meta.json ADDED
@@ -0,0 +1,90 @@
+ {
+   "lang":"sr",
+   "name":"ner_tesla_bcx",
+   "version":"1.0.0",
+   "description":"sr_ner_tesla_bcx is a spaCy model meticulously fine-tuned for Named Entity Recognition in Serbian language texts. This advanced model incorporates a transformer layer based on XLM-R-BERTi\u0107, enhancing its analytical capabilities. It is proficient in identifying 7 distinct categories of entities: PERS (persons), ROLE (professions), DEMO (demonyms), ORG (organizations), LOC (locations), WORK (artworks), and EVENT (events). Detailed information about these categories is available in the accompanying table. The development of this model has been made possible through the support of the Science Fund of the Republic of Serbia, under grant #7276, for the project 'Text Embeddings - Serbian Language Applications - TESLA'.",
+   "author":"Milica Ikoni\u0107 Ne\u0161i\u0107, Sa\u0161a Petalinkar, Mihailo \u0160kori\u0107, Ranka Stankovi\u0107",
+   "email":"",
+   "url":"https://tesla.rgf.bg.ac.rs/",
+   "license":"CC BY-SA 3.0",
+   "spacy_version":">=3.7.2,<3.8.0",
+   "spacy_git_version":"a89eae928",
+   "vectors":{
+     "width":0,
+     "vectors":0,
+     "keys":0,
+     "name":null
+   },
+   "labels":{
+     "transformer":[],
+     "ner":[
+       "DEMO",
+       "EVENT",
+       "LOC",
+       "ORG",
+       "PERS",
+       "ROLE",
+       "WORK"
+     ]
+   },
+   "pipeline":[
+     "transformer",
+     "ner"
+   ],
+   "components":[
+     "transformer",
+     "ner"
+   ],
+   "disabled":[],
+   "performance":{
+     "ents_f":0.9619426108,
+     "ents_p":0.957813548,
+     "ents_r":0.966107428,
+     "ents_per_type":{
+       "ROLE":{
+         "p":0.8736710444,
+         "r":0.9172685489,
+         "f":0.8949391416
+       },
+       "PERS":{
+         "p":0.9903689806,
+         "r":0.9903689806,
+         "f":0.9903689806
+       },
+       "LOC":{
+         "p":0.9556978233,
+         "r":0.983658408,
+         "f":0.9694765554
+       },
+       "DEMO":{
+         "p":0.915407855,
+         "r":0.9543307087,
+         "f":0.934464148
+       },
+       "ORG":{
+         "p":0.8515742129,
+         "r":0.7759562842,
+         "f":0.8120085776
+       },
+       "WORK":{
+         "p":0.76,
+         "r":0.5352112676,
+         "f":0.6280991736
+       },
+       "EVENT":{
+         "p":0.6875,
+         "r":0.6875,
+         "f":0.6875
+       }
+     },
+     "transformer_loss":315.5316300207,
+     "ner_loss":781.6932183095
+   },
+   "requirements":[
+     "spacy-transformers>=1.3.4,<1.4.0"
+   ]
+ }
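The scores in `meta.json` can be sanity-checked: each `f` value is the harmonic mean of its `p` and `r`, assuming the standard F1 definition. A small check against the overall and ROLE figures above:

```python
def f_score(p, r):
    # F1 is the harmonic mean of precision and recall.
    return 2 * p * r / (p + r)

# Reproduce the stored F values from the stored p and r.
overall_f = f_score(0.957813548, 0.966107428)   # ents_p, ents_r
role_f = f_score(0.8736710444, 0.9172685489)    # ROLE p, r

assert abs(overall_f - 0.9619426108) < 1e-6
assert abs(role_f - 0.8949391416) < 1e-6
```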
ner/cfg ADDED
@@ -0,0 +1,13 @@
+ {
+   "moves":null,
+   "update_with_oracle_cut_size":100,
+   "multitasks":[],
+   "min_action_freq":1,
+   "learn_tokens":false,
+   "beam_width":1,
+   "beam_density":0.0,
+   "beam_update_prob":0.0,
+   "incorrect_spans_key":null
+ }
ner/model ADDED
Binary file (310 kB).
 
ner/moves ADDED
@@ -0,0 +1 @@
+ ��moves��{"0":{},"1":{"PERS":66081,"LOC":35152,"ROLE":14259,"ORG":10504,"DEMO":5087,"WORK":973,"EVENT":546},"2":{"PERS":66081,"LOC":35152,"ROLE":14259,"ORG":10504,"DEMO":5087,"WORK":973,"EVENT":546},"3":{"PERS":66081,"LOC":35152,"ROLE":14259,"ORG":10504,"DEMO":5087,"WORK":973,"EVENT":546},"4":{"PERS":66081,"LOC":35152,"ROLE":14259,"ORG":10504,"DEMO":5087,"WORK":973,"EVENT":546,"":1},"5":{"":1}}�cfg��neg_key�
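`ner/moves` is mostly binary (msgpack framing), but its printable payload is a JSON table mapping each transition-action index to per-label frequency counts gathered from the training data. A small sketch reading one action's table, with the counts copied from the payload above:

```python
import json

# Per-label counts for transition action "1", copied from ner/moves.
action_counts = json.loads(
    '{"PERS":66081,"LOC":35152,"ROLE":14259,"ORG":10504,'
    '"DEMO":5087,"WORK":973,"EVENT":546}'
)

print(max(action_counts, key=action_counts.get))  # PERS is the most frequent label
print(sum(action_counts.values()))                # 132602 counted mentions
```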
sr_ner_tesla_bcx-any-py3-none-any.whl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1700bc3ddcbdc55cffecc1bb305e2827caf2245d0509e3889c8140b020959357
+ size 2085012613
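The three lines above are a Git LFS pointer file: the real wheel lives in LFS storage, and the repo tracks only its oid and byte size. A minimal parser sketch over that pointer text:

```python
# A Git LFS pointer is a short text file of "key value" lines.
pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:1700bc3ddcbdc55cffecc1bb305e2827caf2245d0509e3889c8140b020959357
size 2085012613"""

# Split each line on the first space into a key/value pair.
fields = dict(line.split(" ", 1) for line in pointer.splitlines())
print(round(int(fields["size"]) / 1e9, 2))  # 2.09 -- the wheel is about 2 GB
```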
tokenizer ADDED
Binary file (32.6 kB).
 
transformer/cfg ADDED
@@ -0,0 +1,3 @@
+ {
+   "max_batch_items":4096
+ }
transformer/model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:90c7ca694d8c4ca3b6500046f95a60ec8a72f7a704f5750d5e9df8fa4966ca6f
+ size 2261874822
vocab/key2row ADDED
@@ -0,0 +1 @@
+
vocab/lookups.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:76be8b528d0075f7aae98d6fa57a6d3c83ae480a8469e668d7b0af968995ac71
+ size 1
vocab/strings.json ADDED
The diff for this file is too large to render.
 
vocab/vectors ADDED
Binary file (128 Bytes).
 
vocab/vectors.cfg ADDED
@@ -0,0 +1,3 @@
+ {
+   "mode":"default"
+ }