a-menu committed on
Commit
9b349e2
1 Parent(s): a17c761

Update spaCy pipeline

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ fr_arches_ner_trf-any-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
37
+ transformer/model filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - spacy
4
+ - token-classification
5
+ language:
6
+ - fr
7
+ license: cc-by-nc-2.0
8
+ model-index:
9
+ - name: fr_arches_ner_trf
10
+ results:
11
+ - task:
12
+ name: NER
13
+ type: token-classification
14
+ metrics:
15
+ - name: NER Precision
16
+ type: precision
17
+ value: 0.8033279872
18
+ - name: NER Recall
19
+ type: recall
20
+ value: 0.8439342881
21
+ - name: NER F Score
22
+ type: f_score
23
+ value: 0.8231306491
24
+ ---
25
+ French model trained to recognize named entities from archaeological reports.
26
+
27
+ | Feature | Description |
28
+ | --- | --- |
29
+ | **Name** | `fr_arches_ner_trf` |
30
+ | **Version** | `0.0.0` |
31
+ | **spaCy** | `>=3.6.1,<3.7.0` |
32
+ | **Default Pipeline** | `transformer`, `ner`, `entity_punctuation_removal` |
33
+ | **Components** | `transformer`, `ner`, `entity_punctuation_removal` |
34
+ | **Vectors** | 0 keys, 0 unique vectors (0 dimensions) |
35
+ | **Sources** | n/a |
36
+ | **License** | `cc-by-nc 2.0` |
37
+ | **Author** | n/a |
38
+
39
+ ### Label Scheme
40
+
41
+ <details>
42
+
43
+ <summary>View label scheme (15 labels for 1 components)</summary>
44
+
45
+ | Component | Labels |
46
+ | --- | --- |
47
+ | **`ner`** | `CHRONOLOGIE`, `DECOR`, `EDIFICE`, `ESPECE`, `GPE`, `ID`, `LIEUDIT_SITE`, `LOC`, `MATERIAU`, `MOBILIER`, `ORG`, `PERSONNE`, `PEUPLE_CULTURE`, `STRUCTURE`, `TECHNIQUE_STYLE` |
48
+
49
+ </details>
50
+
51
+ ### Accuracy
52
+
53
+ | Type | Score |
54
+ | --- | --- |
55
+ | `ENTS_F` | 82.31 |
56
+ | `ENTS_P` | 80.33 |
57
+ | `ENTS_R` | 84.39 |
58
+ | `TRANSFORMER_LOSS` | 2189.24 |
59
+ | `NER_LOSS` | 517.79 |
config.cfg ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [paths]
2
+ train = "drive/MyDrive/ARCHES/TAL/donnees/ner/corpus_artificiel/train_dev_test_spacy/train.spacy"
3
+ dev = "drive/MyDrive/ARCHES/TAL/donnees/ner/corpus_artificiel/train_dev_test_spacy/dev.spacy"
4
+ vectors = null
5
+ init_tok2vec = null
6
+
7
+ [system]
8
+ gpu_allocator = "pytorch"
9
+ seed = 0
10
+
11
+ [nlp]
12
+ lang = "fr"
13
+ pipeline = ["transformer","ner","entity_punctuation_removal"]
14
+ batch_size = 128
15
+ disabled = []
16
+ before_creation = null
17
+ after_creation = null
18
+ after_pipeline_creation = null
19
+ tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
20
+
21
+ [components]
22
+
23
+ [components.entity_punctuation_removal]
24
+ factory = "entity_punctuation_removal"
25
+
26
+ [components.ner]
27
+ factory = "ner"
28
+ incorrect_spans_key = null
29
+ moves = null
30
+ scorer = {"@scorers":"spacy.ner_scorer.v1"}
31
+ update_with_oracle_cut_size = 100
32
+
33
+ [components.ner.model]
34
+ @architectures = "spacy.TransitionBasedParser.v2"
35
+ state_type = "ner"
36
+ extra_state_tokens = false
37
+ hidden_width = 64
38
+ maxout_pieces = 2
39
+ use_upper = false
40
+ nO = null
41
+
42
+ [components.ner.model.tok2vec]
43
+ @architectures = "spacy-transformers.TransformerListener.v1"
44
+ grad_factor = 1.0
45
+ pooling = {"@layers":"reduce_mean.v1"}
46
+ upstream = "*"
47
+
48
+ [components.transformer]
49
+ factory = "transformer"
50
+ max_batch_items = 4096
51
+ set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"}
52
+
53
+ [components.transformer.model]
54
+ @architectures = "spacy-transformers.TransformerModel.v3"
55
+ name = "camembert-base"
56
+ mixed_precision = false
57
+
58
+ [components.transformer.model.get_spans]
59
+ @span_getters = "spacy-transformers.strided_spans.v1"
60
+ window = 128
61
+ stride = 96
62
+
63
+ [components.transformer.model.grad_scaler_config]
64
+
65
+ [components.transformer.model.tokenizer_config]
66
+ use_fast = true
67
+
68
+ [components.transformer.model.transformer_config]
69
+
70
+ [corpora]
71
+
72
+ [corpora.dev]
73
+ @readers = "spacy.Corpus.v1"
74
+ path = ${paths.dev}
75
+ max_length = 0
76
+ gold_preproc = false
77
+ limit = 0
78
+ augmenter = null
79
+
80
+ [corpora.train]
81
+ @readers = "spacy.Corpus.v1"
82
+ path = ${paths.train}
83
+ max_length = 0
84
+ gold_preproc = false
85
+ limit = 0
86
+ augmenter = null
87
+
88
+ [training]
89
+ accumulate_gradient = 3
90
+ dev_corpus = "corpora.dev"
91
+ train_corpus = "corpora.train"
92
+ seed = ${system.seed}
93
+ gpu_allocator = ${system.gpu_allocator}
94
+ dropout = 0.1
95
+ patience = 1600
96
+ max_epochs = 0
97
+ max_steps = 20000
98
+ eval_frequency = 200
99
+ frozen_components = []
100
+ annotating_components = []
101
+ before_to_disk = null
102
+ before_update = null
103
+
104
+ [training.batcher]
105
+ @batchers = "spacy.batch_by_padded.v1"
106
+ discard_oversize = true
107
+ size = 2000
108
+ buffer = 256
109
+ get_length = null
110
+
111
+ [training.logger]
112
+ @loggers = "spacy.ConsoleLogger.v1"
113
+ progress_bar = false
114
+
115
+ [training.optimizer]
116
+ @optimizers = "Adam.v1"
117
+ beta1 = 0.9
118
+ beta2 = 0.999
119
+ L2_is_weight_decay = true
120
+ L2 = 0.01
121
+ grad_clip = 1.0
122
+ use_averages = false
123
+ eps = 0.00000001
124
+
125
+ [training.optimizer.learn_rate]
126
+ @schedules = "warmup_linear.v1"
127
+ warmup_steps = 250
128
+ total_steps = 20000
129
+ initial_rate = 0.00005
130
+
131
+ [training.score_weights]
132
+ ents_f = 1.0
133
+ ents_p = 0.0
134
+ ents_r = 0.0
135
+ ents_per_type = null
136
+
137
+ [pretraining]
138
+
139
+ [initialize]
140
+ vectors = ${paths.vectors}
141
+ init_tok2vec = ${paths.init_tok2vec}
142
+ vocab_data = null
143
+ lookups = null
144
+ before_init = null
145
+ after_init = null
146
+
147
+ [initialize.components]
148
+
149
+ [initialize.tokenizer]
entity_punctuation_removal.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spacy
2
+ from spacy.language import Language
3
+ import regex
4
+
5
+
6
+ @Language.component("entity_punctuation_removal")
7
+
8
+
9
+ def entity_punctuation_removal(doc):
10
+ # Copy the doc's entities into a mutable list
11
+ ents = list(doc.ents)
12
+
13
+ i = 0
14
+ while i < len(ents):
15
+ current_ent = ents[i]
16
+
17
+ # The entity is caught if:
18
+ # - It consists of a single punctuation character
19
+ # - And its IOB tag is B (beginning of an entity)
20
+ # - And it is not followed by an entity tagged IOB I (entity continuation) OR it is not followed by any token. NOTE(review): the conditions below only test the position in `ents`, not the next token's IOB tag - confirm intent
21
+ # A caught entity is removed from the doc
22
+ if i + 1 < len(ents) and regex.match(r'^\p{P}$', current_ent.text) and current_ent.root.ent_iob_ == "B" :
23
+ ents.pop(i)
24
+ elif i == len(ents) - 1 and regex.match(r'^\p{P}$', current_ent.text) and current_ent.root.ent_iob_ == "B" :
25
+ ents.pop(i)
26
+ else:
27
+ i += 1
28
+
29
+ # Write the filtered entities back to the doc
30
+ doc.ents = tuple(ents)
31
+
32
+ return doc
33
+
34
+ Language.component("entity_punctuation_removal", func=entity_punctuation_removal)  # NOTE(review): the function is already registered by the decorator above - confirm this second registration is needed
fr_arches_ner_trf-any-py3-none-any.whl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:952cfb3ea4a3a2822b19e14ced55ed54fa9a88c4642252c2872e726d19649de6
3
+ size 404718336
meta.json ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "lang":"fr",
3
+ "name":"arches_ner_trf",
4
+ "version":"0.0.0",
5
+ "description":"French model trained to recognize named entities from archaeological reports.",
6
+ "author":"",
7
+ "email":"",
8
+ "url":"",
9
+ "license":"cc-by-nc 2.0",
10
+ "spacy_version":">=3.6.1,<3.7.0",
11
+ "spacy_git_version":"458bc5f45",
12
+ "vectors":{
13
+ "width":0,
14
+ "vectors":0,
15
+ "keys":0,
16
+ "name":null
17
+ },
18
+ "labels":{
19
+ "transformer":[
20
+
21
+ ],
22
+ "ner":[
23
+ "CHRONOLOGIE",
24
+ "DECOR",
25
+ "EDIFICE",
26
+ "ESPECE",
27
+ "GPE",
28
+ "ID",
29
+ "LIEUDIT_SITE",
30
+ "LOC",
31
+ "MATERIAU",
32
+ "MOBILIER",
33
+ "ORG",
34
+ "PERSONNE",
35
+ "PEUPLE_CULTURE",
36
+ "STRUCTURE",
37
+ "TECHNIQUE_STYLE"
38
+ ]
39
+ },
40
+ "pipeline":[
41
+ "transformer",
42
+ "ner",
43
+ "entity_punctuation_removal"
44
+ ],
45
+ "components":[
46
+ "transformer",
47
+ "ner",
48
+ "entity_punctuation_removal"
49
+ ],
50
+ "disabled":[
51
+
52
+ ],
53
+ "performance":{
54
+ "ents_f":0.8231306491,
55
+ "ents_p":0.8033279872,
56
+ "ents_r":0.8439342881,
57
+ "ents_per_type":{
58
+ "ORG":{
59
+ "p":0.6967213115,
60
+ "r":0.8095238095,
61
+ "f":0.7488986784
62
+ },
63
+ "GPE":{
64
+ "p":0.869047619,
65
+ "r":0.8902439024,
66
+ "f":0.8795180723
67
+ },
68
+ "CHRONOLOGIE":{
69
+ "p":0.8232044199,
70
+ "r":0.8696498054,
71
+ "f":0.8457899716
72
+ },
73
+ "LOC":{
74
+ "p":0.8062015504,
75
+ "r":0.776119403,
76
+ "f":0.7908745247
77
+ },
78
+ "LIEUDIT_SITE":{
79
+ "p":0.7112299465,
80
+ "r":0.6487804878,
81
+ "f":0.6785714286
82
+ },
83
+ "PEUPLE_CULTURE":{
84
+ "p":0.2708333333,
85
+ "r":0.8666666667,
86
+ "f":0.4126984127
87
+ },
88
+ "MATERIAU":{
89
+ "p":0.8187702265,
90
+ "r":0.8006329114,
91
+ "f":0.8096
92
+ },
93
+ "TECHNIQUE_STYLE":{
94
+ "p":0.785467128,
95
+ "r":0.7442622951,
96
+ "f":0.7643097643
97
+ },
98
+ "MOBILIER":{
99
+ "p":0.8251308901,
100
+ "r":0.8745837958,
101
+ "f":0.849137931
102
+ },
103
+ "STRUCTURE":{
104
+ "p":0.7746031746,
105
+ "r":0.8201680672,
106
+ "f":0.7967346939
107
+ },
108
+ "ID":{
109
+ "p":0.8271155596,
110
+ "r":0.9044776119,
111
+ "f":0.8640684411
112
+ },
113
+ "ESPECE":{
114
+ "p":0.96,
115
+ "r":0.8571428571,
116
+ "f":0.9056603774
117
+ },
118
+ "PERSONNE":{
119
+ "p":0.875,
120
+ "r":0.8936170213,
121
+ "f":0.8842105263
122
+ },
123
+ "DECOR":{
124
+ "p":0.5223880597,
125
+ "r":0.6730769231,
126
+ "f":0.5882352941
127
+ },
128
+ "EDIFICE":{
129
+ "p":0.4761904762,
130
+ "r":0.4545454545,
131
+ "f":0.4651162791
132
+ }
133
+ },
134
+ "transformer_loss":2189.2397952173,
135
+ "ner_loss":517.7935899099
136
+ },
137
+ "requirements":[
138
+ "spacy-transformers>=1.3.4,<1.4.0"
139
+ ]
140
+ }
ner/cfg ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "moves":null,
3
+ "update_with_oracle_cut_size":100,
4
+ "multitasks":[
5
+
6
+ ],
7
+ "min_action_freq":1,
8
+ "learn_tokens":false,
9
+ "beam_width":1,
10
+ "beam_density":0.0,
11
+ "beam_update_prob":0.0,
12
+ "incorrect_spans_key":null
13
+ }
ner/model ADDED
Binary file (295 kB). View file
 
ner/moves ADDED
@@ -0,0 +1 @@
 
 
1
+ ��moves��{"0":{},"1":{"MOBILIER":24870,"STRUCTURE":23884,"CHRONOLOGIE":21745,"ID":20721,"MATERIAU":11489,"GPE":4094,"TECHNIQUE_STYLE":3857,"LIEUDIT_SITE":3672,"PERSONNE":2710,"DECOR":2038,"LOC":1686,"ORG":1596,"ESPECE":1310,"EDIFICE":1216,"PEUPLE_CULTURE":757},"2":{"MOBILIER":24870,"STRUCTURE":23884,"CHRONOLOGIE":21745,"ID":20721,"MATERIAU":11489,"GPE":4094,"TECHNIQUE_STYLE":3857,"LIEUDIT_SITE":3672,"PERSONNE":2710,"DECOR":2038,"LOC":1686,"ORG":1596,"ESPECE":1310,"EDIFICE":1216,"PEUPLE_CULTURE":757},"3":{"MOBILIER":24870,"STRUCTURE":23884,"CHRONOLOGIE":21745,"ID":20721,"MATERIAU":11489,"GPE":4094,"TECHNIQUE_STYLE":3857,"LIEUDIT_SITE":3672,"PERSONNE":2710,"DECOR":2038,"LOC":1686,"ORG":1596,"ESPECE":1310,"EDIFICE":1216,"PEUPLE_CULTURE":757},"4":{"MOBILIER":24870,"STRUCTURE":23884,"CHRONOLOGIE":21745,"ID":20721,"MATERIAU":11489,"GPE":4094,"TECHNIQUE_STYLE":3857,"LIEUDIT_SITE":3672,"PERSONNE":2710,"DECOR":2038,"LOC":1686,"ORG":1596,"ESPECE":1310,"EDIFICE":1216,"PEUPLE_CULTURE":757,"":1},"5":{"":1}}�cfg��neg_key�
repartition.txt ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Corpus train:
2
+
3
+ ARA_paleo_F125492_01.conll
4
+ CIF_med_F148315_BD.conll
5
+ BFC_moderne_F114238.conll
6
+ CIF_moderne_C106097.conll
7
+ CIF_proto_F004820.conll
8
+ CIF_neo_C102845.conll
9
+ CIF_contempo_F115347.conll
10
+ BFC_neo_F200449601_BD.conll
11
+ BFC_paleo_CB09001802_01.conll
12
+ CIF_antique_C001185_BD.conll
13
+ BFC_proto_F101604_01.conll
14
+ BFC_med_F106159.conll
15
+ ARA_neo_F102890_01.conll
16
+ ARA_antique_F120714.conll
17
+ ARA_med_F108171.conll
18
+ ARA_contempo_F110732.conll
19
+
20
+
21
+ Corpus dev:
22
+
23
+ CIF_paleo_F110854.conll
24
+ ARA_proto_F128240.conll
25
+ BFC_contempo_F123602.conll
26
+
27
+
28
+ Corpus test:
29
+
30
+ ARA_moderne_F114296.conll
31
+ BFC_antique_CB09003002.conll
resultats_post_ponct.json ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "token_acc":1.0,
3
+ "token_p":1.0,
4
+ "token_r":1.0,
5
+ "token_f":1.0,
6
+ "ents_p":0.8308413301,
7
+ "ents_r":0.8494182067,
8
+ "ents_f":0.8400270758,
9
+ "ents_per_type":{
10
+ "PERSONNE":{
11
+ "p":0.9523809524,
12
+ "r":0.8602150538,
13
+ "f":0.9039548023
14
+ },
15
+ "GPE":{
16
+ "p":0.8152173913,
17
+ "r":0.7352941176,
18
+ "f":0.7731958763
19
+ },
20
+ "LIEUDIT_SITE":{
21
+ "p":0.5666666667,
22
+ "r":0.5151515152,
23
+ "f":0.5396825397
24
+ },
25
+ "CHRONOLOGIE":{
26
+ "p":0.8973684211,
27
+ "r":0.9093333333,
28
+ "f":0.9033112583
29
+ },
30
+ "ORG":{
31
+ "p":0.7586206897,
32
+ "r":0.7586206897,
33
+ "f":0.7586206897
34
+ },
35
+ "STRUCTURE":{
36
+ "p":0.8469945355,
37
+ "r":0.8423913043,
38
+ "f":0.8446866485
39
+ },
40
+ "LOC":{
41
+ "p":0.7058823529,
42
+ "r":0.7058823529,
43
+ "f":0.7058823529
44
+ },
45
+ "EDIFICE":{
46
+ "p":0.5238095238,
47
+ "r":0.9166666667,
48
+ "f":0.6666666667
49
+ },
50
+ "PEUPLE_CULTURE":{
51
+ "p":0.0,
52
+ "r":0.0,
53
+ "f":0.0
54
+ },
55
+ "ID":{
56
+ "p":0.8692240628,
57
+ "r":0.8760984183,
58
+ "f":0.8726477024
59
+ },
60
+ "MOBILIER":{
61
+ "p":0.8184663537,
62
+ "r":0.8573770492,
63
+ "f":0.837469976
64
+ },
65
+ "MATERIAU":{
66
+ "p":0.7838765009,
67
+ "r":0.8416206262,
68
+ "f":0.811722913
69
+ },
70
+ "TECHNIQUE_STYLE":{
71
+ "p":0.6829268293,
72
+ "r":0.7777777778,
73
+ "f":0.7272727273
74
+ },
75
+ "DECOR":{
76
+ "p":0.3947368421,
77
+ "r":0.5769230769,
78
+ "f":0.46875
79
+ },
80
+ "ESPECE":{
81
+ "p":0.8888888889,
82
+ "r":0.8888888889,
83
+ "f":0.8888888889
84
+ }
85
+ },
86
+ "speed":4940.2096129626
87
+ }
tokenizer ADDED
The diff for this file is too large to render. See raw diff
 
transformer/cfg ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "max_batch_items":4096
3
+ }
transformer/model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff62ae3f6360aef680dc111d6e64f7c55be57461ced8eeb9d22e0edaf3567909
3
+ size 445800276
vocab/key2row ADDED
@@ -0,0 +1 @@
 
 
1
+
vocab/lookups.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76be8b528d0075f7aae98d6fa57a6d3c83ae480a8469e668d7b0af968995ac71
3
+ size 1
vocab/strings.json ADDED
The diff for this file is too large to render. See raw diff
 
vocab/vectors ADDED
Binary file (128 Bytes). View file
 
vocab/vectors.cfg ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "mode":"default"
3
+ }