Tanor committed on
Commit
7589a4d
1 Parent(s): cb645ca

Update spaCy pipeline

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ sr_pner_tesla_bcx-any-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
37
+ transformer/model filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - spacy
4
+ - token-classification
5
+ language:
6
+ - sr
7
+ license: cc-by-sa-3.0
8
+ model-index:
9
+ - name: sr_pner_tesla_bcx
10
+ results:
11
+ - task:
12
+ name: NER
13
+ type: token-classification
14
+ metrics:
15
+ - name: NER Precision
16
+ type: precision
17
+ value: 0.9567611119
18
+ - name: NER Recall
19
+ type: recall
20
+ value: 0.9640102828
21
+ - name: NER F Score
22
+ type: f_score
23
+ value: 0.9603720178
24
+ - task:
25
+ name: TAG
26
+ type: token-classification
27
+ metrics:
28
+ - name: TAG (XPOS) Accuracy
29
+ type: accuracy
30
+ value: 0.9853440156
31
+ ---
32
+ sr_pner_tesla_bcx is a spaCy model meticulously fine-tuned for Part-of-Speech Tagging and Named Entity Recognition in Serbian language texts. This advanced model incorporates a transformer layer based on XLM-R-BERTić, enhancing its analytical capabilities. It is proficient in identifying 7 distinct categories of entities: PERS (persons), ROLE (professions), DEMO (demonyms), ORG (organizations), LOC (locations), WORK (artworks), and EVENT (events). Detailed information about these categories is available in the accompanying table. The development of this model has been made possible through the support of the Science Fund of the Republic of Serbia, under grant #7276, for the project 'Text Embeddings - Serbian Language Applications - TESLA'.
33
+
34
+ | Feature | Description |
35
+ | --- | --- |
36
+ | **Name** | `sr_pner_tesla_bcx` |
37
+ | **Version** | `1.0.0` |
38
+ | **spaCy** | `>=3.7.2,<3.8.0` |
39
+ | **Default Pipeline** | `transformer`, `tagger`, `ner` |
40
+ | **Components** | `transformer`, `tagger`, `ner` |
41
+ | **Vectors** | 0 keys, 0 unique vectors (0 dimensions) |
42
+ | **Sources** | n/a |
43
+ | **License** | `CC BY-SA 3.0` |
44
+ | **Author** | [Milica Ikonić Nešić, Saša Petalinkar, Mihailo Škorić, Ranka Stanković](https://tesla.rgf.bg.ac.rs/) |
45
+
46
+ ### Label Scheme
47
+
48
+ <details>
49
+
50
+ <summary>View label scheme (23 labels for 2 components)</summary>
51
+
52
+ | Component | Labels |
53
+ | --- | --- |
54
+ | **`tagger`** | `ADJ`, `ADP`, `ADV`, `AUX`, `CCONJ`, `DET`, `INTJ`, `NOUN`, `NUM`, `PART`, `PRON`, `PROPN`, `PUNCT`, `SCONJ`, `VERB`, `X` |
55
+ | **`ner`** | `DEMO`, `EVENT`, `LOC`, `ORG`, `PERS`, `ROLE`, `WORK` |
56
+
57
+ </details>
58
+
59
+ ### Accuracy
60
+
61
+ | Type | Score |
62
+ | --- | --- |
63
+ | `TAG_ACC` | 98.53 |
64
+ | `ENTS_F` | 96.04 |
65
+ | `ENTS_P` | 95.68 |
66
+ | `ENTS_R` | 96.40 |
67
+ | `TRANSFORMER_LOSS` | 388.70 |
68
+ | `TAGGER_LOSS` | 562.69 |
69
+ | `NER_LOSS` | 332.70 |
config.cfg ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [paths]
2
+ train = "./train.spacy"
3
+ dev = "./dev.spacy"
4
+ vectors = null
5
+ init_tok2vec = null
6
+
7
+ [system]
8
+ gpu_allocator = "pytorch"
9
+ seed = 0
10
+
11
+ [nlp]
12
+ lang = "sr"
13
+ pipeline = ["transformer","tagger","ner"]
14
+ batch_size = 128
15
+ disabled = []
16
+ before_creation = null
17
+ after_creation = null
18
+ after_pipeline_creation = null
19
+ tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
20
+ vectors = {"@vectors":"spacy.Vectors.v1"}
21
+
22
+ [components]
23
+
24
+ [components.ner]
25
+ factory = "ner"
26
+ incorrect_spans_key = null
27
+ moves = null
28
+ scorer = {"@scorers":"spacy.ner_scorer.v1"}
29
+ update_with_oracle_cut_size = 100
30
+
31
+ [components.ner.model]
32
+ @architectures = "spacy.TransitionBasedParser.v2"
33
+ state_type = "ner"
34
+ extra_state_tokens = false
35
+ hidden_width = 64
36
+ maxout_pieces = 2
37
+ use_upper = false
38
+ nO = null
39
+
40
+ [components.ner.model.tok2vec]
41
+ @architectures = "spacy-transformers.TransformerListener.v1"
42
+ grad_factor = 1.0
43
+ pooling = {"@layers":"reduce_mean.v1"}
44
+ upstream = "*"
45
+
46
+ [components.tagger]
47
+ factory = "tagger"
48
+ label_smoothing = 0.0
49
+ neg_prefix = "!"
50
+ overwrite = false
51
+ scorer = {"@scorers":"spacy.tagger_scorer.v1"}
52
+
53
+ [components.tagger.model]
54
+ @architectures = "spacy.Tagger.v2"
55
+ nO = null
56
+ normalize = false
57
+
58
+ [components.tagger.model.tok2vec]
59
+ @architectures = "spacy-transformers.TransformerListener.v1"
60
+ grad_factor = 1.0
61
+ pooling = {"@layers":"reduce_mean.v1"}
62
+ upstream = "*"
63
+
64
+ [components.transformer]
65
+ factory = "transformer"
66
+ max_batch_items = 4096
67
+ set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"}
68
+
69
+ [components.transformer.model]
70
+ @architectures = "spacy-transformers.TransformerModel.v3"
71
+ name = "classla/xlm-r-bertic"
72
+ mixed_precision = false
73
+
74
+ [components.transformer.model.get_spans]
75
+ @span_getters = "spacy-transformers.strided_spans.v1"
76
+ window = 128
77
+ stride = 96
78
+
79
+ [components.transformer.model.grad_scaler_config]
80
+
81
+ [components.transformer.model.tokenizer_config]
82
+ use_fast = true
83
+
84
+ [components.transformer.model.transformer_config]
85
+
86
+ [corpora]
87
+
88
+ [corpora.dev]
89
+ @readers = "spacy.Corpus.v1"
90
+ path = ${paths.dev}
91
+ max_length = 0
92
+ gold_preproc = false
93
+ limit = 0
94
+ augmenter = null
95
+
96
+ [corpora.train]
97
+ @readers = "spacy.Corpus.v1"
98
+ path = ${paths.train}
99
+ max_length = 0
100
+ gold_preproc = false
101
+ limit = 0
102
+ augmenter = null
103
+
104
+ [training]
105
+ accumulate_gradient = 3
106
+ dev_corpus = "corpora.dev"
107
+ train_corpus = "corpora.train"
108
+ annotating_components = ["tagger"]
109
+ seed = ${system.seed}
110
+ gpu_allocator = ${system.gpu_allocator}
111
+ dropout = 0.1
112
+ patience = 1600
113
+ max_epochs = 0
114
+ max_steps = 20000
115
+ eval_frequency = 200
116
+ frozen_components = []
117
+ before_to_disk = null
118
+ before_update = null
119
+
120
+ [training.batcher]
121
+ @batchers = "spacy.batch_by_padded.v1"
122
+ discard_oversize = true
123
+ size = 2000
124
+ buffer = 256
125
+ get_length = null
126
+
127
+ [training.logger]
128
+ @loggers = "spacy.ConsoleLogger.v1"
129
+ progress_bar = false
130
+
131
+ [training.optimizer]
132
+ @optimizers = "Adam.v1"
133
+ beta1 = 0.9
134
+ beta2 = 0.999
135
+ L2_is_weight_decay = true
136
+ L2 = 0.01
137
+ grad_clip = 1.0
138
+ use_averages = false
139
+ eps = 0.00000001
140
+
141
+ [training.optimizer.learn_rate]
142
+ @schedules = "warmup_linear.v1"
143
+ warmup_steps = 250
144
+ total_steps = 20000
145
+ initial_rate = 0.00005
146
+
147
+ [training.score_weights]
148
+ tag_acc = 0.5
149
+ ents_f = 0.5
150
+ ents_p = 0.0
151
+ ents_r = 0.0
152
+ ents_per_type = null
153
+
154
+ [pretraining]
155
+
156
+ [initialize]
157
+ vectors = ${paths.vectors}
158
+ init_tok2vec = ${paths.init_tok2vec}
159
+ vocab_data = null
160
+ lookups = null
161
+ before_init = null
162
+ after_init = null
163
+
164
+ [initialize.components]
165
+
166
+ [initialize.tokenizer]
meta.json ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "lang":"sr",
3
+ "name":"pner_tesla_bcx",
4
+ "version":"1.0.0",
5
+ "description":"sr_pner_tesla_bcx is a spaCy model meticulously fine-tuned for Part-of-Speech Tagging and Named Entity Recognition in Serbian language texts. This advanced model incorporates a transformer layer based on XLM-R-BERTi\u0107, enhancing its analytical capabilities. It is proficient in identifying 7 distinct categories of entities: PERS (persons), ROLE (professions), DEMO (demonyms), ORG (organizations), LOC (locations), WORK (artworks), and EVENT (events). Detailed information about these categories is available in the accompanying table. The development of this model has been made possible through the support of the Science Fund of the Republic of Serbia, under grant #7276, for the project 'Text Embeddings - Serbian Language Applications - TESLA'.",
6
+ "author":"Milica Ikoni\u0107 Ne\u0161i\u0107, Sa\u0161a Petalinkar, Mihailo \u0160kori\u0107, Ranka Stankovi\u0107",
7
+ "email":"",
8
+ "url":"https://tesla.rgf.bg.ac.rs/",
9
+ "license":"CC BY-SA 3.0",
10
+ "spacy_version":">=3.7.2,<3.8.0",
11
+ "spacy_git_version":"a89eae928",
12
+ "vectors":{
13
+ "width":0,
14
+ "vectors":0,
15
+ "keys":0,
16
+ "name":null
17
+ },
18
+ "labels":{
19
+ "transformer":[
20
+
21
+ ],
22
+ "tagger":[
23
+ "ADJ",
24
+ "ADP",
25
+ "ADV",
26
+ "AUX",
27
+ "CCONJ",
28
+ "DET",
29
+ "INTJ",
30
+ "NOUN",
31
+ "NUM",
32
+ "PART",
33
+ "PRON",
34
+ "PROPN",
35
+ "PUNCT",
36
+ "SCONJ",
37
+ "VERB",
38
+ "X"
39
+ ],
40
+ "ner":[
41
+ "DEMO",
42
+ "EVENT",
43
+ "LOC",
44
+ "ORG",
45
+ "PERS",
46
+ "ROLE",
47
+ "WORK"
48
+ ]
49
+ },
50
+ "pipeline":[
51
+ "transformer",
52
+ "tagger",
53
+ "ner"
54
+ ],
55
+ "components":[
56
+ "transformer",
57
+ "tagger",
58
+ "ner"
59
+ ],
60
+ "disabled":[
61
+
62
+ ],
63
+ "performance":{
64
+ "tag_acc":0.9853440156,
65
+ "ents_f":0.9603720178,
66
+ "ents_p":0.9567611119,
67
+ "ents_r":0.9640102828,
68
+ "ents_per_type":{
69
+ "ROLE":{
70
+ "p":0.8674171357,
71
+ "r":0.9107025607,
72
+ "f":0.8885329917
73
+ },
74
+ "PERS":{
75
+ "p":0.9881560903,
76
+ "r":0.991369606,
77
+ "f":0.9897602398
78
+ },
79
+ "LOC":{
80
+ "p":0.9631551635,
81
+ "r":0.9783869267,
82
+ "f":0.9707112971
83
+ },
84
+ "DEMO":{
85
+ "p":0.9299065421,
86
+ "r":0.9401574803,
87
+ "f":0.9350039154
88
+ },
89
+ "ORG":{
90
+ "p":0.8374816984,
91
+ "r":0.781420765,
92
+ "f":0.8084805654
93
+ },
94
+ "WORK":{
95
+ "p":0.6315789474,
96
+ "r":0.5070422535,
97
+ "f":0.5625
98
+ },
99
+ "EVENT":{
100
+ "p":0.5263157895,
101
+ "r":0.625,
102
+ "f":0.5714285714
103
+ }
104
+ },
105
+ "transformer_loss":388.6984258415,
106
+ "tagger_loss":562.6862918995,
107
+ "ner_loss":332.6980463833
108
+ },
109
+ "requirements":[
110
+ "spacy-transformers>=1.3.4,<1.4.0"
111
+ ]
112
+ }
ner/cfg ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "moves":null,
3
+ "update_with_oracle_cut_size":100,
4
+ "multitasks":[
5
+
6
+ ],
7
+ "min_action_freq":1,
8
+ "learn_tokens":false,
9
+ "beam_width":1,
10
+ "beam_density":0.0,
11
+ "beam_update_prob":0.0,
12
+ "incorrect_spans_key":null
13
+ }
ner/model ADDED
Binary file (310 kB). View file
 
ner/moves ADDED
@@ -0,0 +1 @@
 
 
1
+ ��moves��{"0":{},"1":{"PERS":66081,"LOC":35152,"ROLE":14259,"ORG":10504,"DEMO":5087,"WORK":973,"EVENT":546},"2":{"PERS":66081,"LOC":35152,"ROLE":14259,"ORG":10504,"DEMO":5087,"WORK":973,"EVENT":546},"3":{"PERS":66081,"LOC":35152,"ROLE":14259,"ORG":10504,"DEMO":5087,"WORK":973,"EVENT":546},"4":{"PERS":66081,"LOC":35152,"ROLE":14259,"ORG":10504,"DEMO":5087,"WORK":973,"EVENT":546,"":1},"5":{"":1}}�cfg��neg_key�
sr_pner_tesla_bcx-any-py3-none-any.whl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2b34f4299321f7bf2e8cc8cc2504ed555c1dfc325f62696a1412969cba164ffc
3
+ size 2085091101
tagger/cfg ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "label_smoothing":0.0,
3
+ "labels":[
4
+ "ADJ",
5
+ "ADP",
6
+ "ADV",
7
+ "AUX",
8
+ "CCONJ",
9
+ "DET",
10
+ "INTJ",
11
+ "NOUN",
12
+ "NUM",
13
+ "PART",
14
+ "PRON",
15
+ "PROPN",
16
+ "PUNCT",
17
+ "SCONJ",
18
+ "VERB",
19
+ "X"
20
+ ],
21
+ "neg_prefix":"!",
22
+ "overwrite":false
23
+ }
tagger/model ADDED
Binary file (66.2 kB). View file
 
tokenizer ADDED
Binary file (32.6 kB). View file
 
transformer/cfg ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "max_batch_items":4096
3
+ }
transformer/model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:95dbbac4ce9565e560c73f53296ce9073f6ed8bdcf30b58ee2dee4acf0b7a76b
3
+ size 2261874822
vocab/key2row ADDED
@@ -0,0 +1 @@
 
 
1
+
vocab/lookups.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76be8b528d0075f7aae98d6fa57a6d3c83ae480a8469e668d7b0af968995ac71
3
+ size 1
vocab/strings.json ADDED
The diff for this file is too large to render. See raw diff
 
vocab/vectors ADDED
Binary file (128 Bytes). View file
 
vocab/vectors.cfg ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "mode":"default"
3
+ }