iproskurina committed on
Commit
8196617
1 Parent(s): d9e9450

Update spaCy pipeline

Browse files
custom_factories.py CHANGED
@@ -7,9 +7,10 @@ from spacy.util import get_model_meta
7
  model_path = Path(__file__).parent
8
  meta = get_model_meta(model_path)
9
  data_dir = f"{meta['lang']}_{meta['name']}-{meta['version']}"
10
- components_path = model_path / data_dir
11
 
12
  @Language.component("errors")
13
  def errors(doc):
14
- nlp_vocabulary = spacy.load(model_path)
 
15
  return nlp_vocabulary(doc)
 
7
  model_path = Path(__file__).parent
8
  meta = get_model_meta(model_path)
9
  data_dir = f"{meta['lang']}_{meta['name']}-{meta['version']}"
10
+ components_path = model_path / data_dir / "training"
11
 
12
  @Language.component("errors")
13
  def errors(doc):
14
+ nlp_vocabulary = spacy.load(components_path)
15
+ print("Loaded component")
16
  return nlp_vocabulary(doc)
en_grammar_checker-any-py3-none-any.whl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:81a6bbab4bd7f39da127d1290f4ca6c194acf88cb8687ef8e5e7eb198162d9ca
3
- size 27085
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:483f6941367174edb71452d6c4f82fdf61aae1b35c3755a91e8cf4d58688cbcf
3
+ size 406916683
meta.json CHANGED
@@ -8,20 +8,15 @@
8
  "url":"",
9
  "license":"CC BY-SA 3.0",
10
  "spacy_version":">=3.4.4,<3.5.0",
11
- "parent_package":"spacy",
12
- "requirements":[
13
- "spacy-transformers>=1.0.0,<1.1.0"
14
- ],
15
- "sources":[
16
- {
17
- "license":"MIT"
18
- }
19
- ],
20
  "vectors":{
21
  "width":0,
22
  "vectors":0,
23
  "keys":0,
24
  "name":null
 
 
 
25
  },
26
  "pipeline":[
27
  "errors"
@@ -29,9 +24,18 @@
29
  "components":[
30
  "errors"
31
  ],
32
- "labels":{
33
 
34
- },
 
 
 
 
 
 
 
 
 
35
  "performance":{
36
  "spans_errors_p":0.7937892339,
37
  "spans_errors_r":0.4476503759,
@@ -94,9 +98,5 @@
94
  }
95
  }
96
  },
97
- "speed":2779.5295317788,
98
- "spacy_git_version":"61dfdd9fb",
99
- "disabled":[
100
-
101
- ]
102
  }
 
8
  "url":"",
9
  "license":"CC BY-SA 3.0",
10
  "spacy_version":">=3.4.4,<3.5.0",
11
+ "spacy_git_version":"61dfdd9fb",
 
 
 
 
 
 
 
 
12
  "vectors":{
13
  "width":0,
14
  "vectors":0,
15
  "keys":0,
16
  "name":null
17
+ },
18
+ "labels":{
19
+
20
  },
21
  "pipeline":[
22
  "errors"
 
24
  "components":[
25
  "errors"
26
  ],
27
+ "disabled":[
28
 
29
+ ],
30
+ "parent_package":"spacy",
31
+ "requirements":[
32
+ "spacy-transformers>=1.0.0,<1.1.0"
33
+ ],
34
+ "sources":[
35
+ {
36
+ "license":"MIT"
37
+ }
38
+ ],
39
  "performance":{
40
  "spans_errors_p":0.7937892339,
41
  "spans_errors_r":0.4476503759,
 
98
  }
99
  }
100
  },
101
+ "speed":2779.5295317788
 
 
 
 
102
  }
training/config.cfg ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [paths]
2
+ train = "./realec/train.spacy"
3
+ dev = "./realec/dev.spacy"
4
+ vectors = null
5
+ init_tok2vec = null
6
+
7
+ [system]
8
+ gpu_allocator = "pytorch"
9
+ seed = 0
10
+
11
+ [nlp]
12
+ lang = "en"
13
+ pipeline = ["transformer","spancat"]
14
+ batch_size = 16
15
+ disabled = []
16
+ before_creation = null
17
+ after_creation = null
18
+ after_pipeline_creation = null
19
+ tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
20
+
21
+ [components]
22
+
23
+ [components.spancat]
24
+ factory = "spancat"
25
+ max_positive = null
26
+ scorer = {"@scorers":"spacy.spancat_scorer.v1"}
27
+ spans_key = "errors"
28
+ threshold = 0.5
29
+
30
+ [components.spancat.model]
31
+ @architectures = "spacy.SpanCategorizer.v1"
32
+
33
+ [components.spancat.model.reducer]
34
+ @layers = "spacy.mean_max_reducer.v1"
35
+ hidden_size = 128
36
+
37
+ [components.spancat.model.scorer]
38
+ @layers = "spacy.LinearLogistic.v1"
39
+ nO = null
40
+ nI = null
41
+
42
+ [components.spancat.model.tok2vec]
43
+ @architectures = "spacy-transformers.TransformerListener.v1"
44
+ grad_factor = 1.0
45
+ pooling = {"@layers":"reduce_mean.v1"}
46
+ upstream = "*"
47
+
48
+ [components.spancat.suggester]
49
+ @misc = "spacy.ngram_suggester.v1"
50
+ sizes = [1,2,3]
51
+
52
+ [components.transformer]
53
+ factory = "transformer"
54
+ max_batch_items = 4096
55
+ set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"}
56
+
57
+ [components.transformer.model]
58
+ @architectures = "spacy-transformers.TransformerModel.v3"
59
+ name = "bert-base-cased"
60
+ mixed_precision = false
61
+
62
+ [components.transformer.model.get_spans]
63
+ @span_getters = "spacy-transformers.strided_spans.v1"
64
+ window = 128
65
+ stride = 96
66
+
67
+ [components.transformer.model.grad_scaler_config]
68
+
69
+ [components.transformer.model.tokenizer_config]
70
+ use_fast = true
71
+
72
+ [components.transformer.model.transformer_config]
73
+
74
+ [corpora]
75
+
76
+ [corpora.dev]
77
+ @readers = "spacy.Corpus.v1"
78
+ path = "./realec/dev.spacy"
79
+ max_length = 0
80
+ gold_preproc = false
81
+ limit = 0
82
+ augmenter = null
83
+
84
+ [corpora.train]
85
+ @readers = "spacy.Corpus.v1"
86
+ path = "./realec/train.spacy"
87
+ max_length = 0
88
+ gold_preproc = false
89
+ limit = 0
90
+ augmenter = null
91
+
92
+ [training]
93
+ accumulate_gradient = 3
94
+ dev_corpus = "corpora.dev"
95
+ train_corpus = "corpora.train"
96
+ seed = 0
97
+ gpu_allocator = "pytorch"
98
+ dropout = 0.1
99
+ patience = 1600
100
+ max_epochs = 0
101
+ max_steps = 20000
102
+ eval_frequency = 200
103
+ frozen_components = []
104
+ annotating_components = []
105
+ before_to_disk = null
106
+
107
+ [training.batcher]
108
+ @batchers = "spacy.batch_by_padded.v1"
109
+ discard_oversize = true
110
+ size = 2000
111
+ buffer = 256
112
+ get_length = null
113
+
114
+ [training.logger]
115
+ @loggers = "spacy.WandbLogger.v3"
116
+ project_name = "my-awesome-project"
117
+ remove_config_values = ["paths.train","paths.dev","corpora.train.path","corpora.dev.path"]
118
+ log_dataset_dir = null
119
+ entity = null
120
+ run_name = "grammar-checker"
121
+ model_log_interval = null
122
+
123
+ [training.optimizer]
124
+ @optimizers = "Adam.v1"
125
+ beta1 = 0.9
126
+ beta2 = 0.999
127
+ L2_is_weight_decay = true
128
+ L2 = 0.01
129
+ grad_clip = 1.0
130
+ use_averages = false
131
+ eps = 0.00000001
132
+
133
+ [training.optimizer.learn_rate]
134
+ @schedules = "warmup_linear.v1"
135
+ warmup_steps = 250
136
+ total_steps = 20000
137
+ initial_rate = 0.00005
138
+
139
+ [training.score_weights]
140
+ spans_sc_f = 0.5
141
+ spans_sc_p = 0.0
142
+ spans_sc_r = 0.0
143
+ spans_Agreement_errors_f = 0.06
144
+ spans_Articles_f = 0.03
145
+ spans_Capitalisation_f = 0.05
146
+ spans_Formational_affixes_f = 0.1
147
+ spans_Noun_number_f = 0.04
148
+ spans_Numerals_f = 0.06
149
+ spans_Prepositions_f = 0.05
150
+ spans_Punctuation_f = 0.03
151
+ spans_Spelling_f = 0.02
152
+ spans_Tense_choice_f = 0.03
153
+ spans_lex_item_choice_f = 0.03
154
+
155
+ [pretraining]
156
+
157
+ [initialize]
158
+ vectors = null
159
+ init_tok2vec = null
160
+ vocab_data = null
161
+ lookups = null
162
+ before_init = null
163
+ after_init = null
164
+
165
+ [initialize.components]
166
+
167
+ [initialize.tokenizer]
training/meta.json ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "lang":"en",
3
+ "name":"grammar_checker",
4
+ "version":"1.0.1",
5
+ "description":"Essay Grammar Checker",
6
+ "author":"Irina Proskurina",
7
+ "email":"",
8
+ "url":"",
9
+ "license":"CC BY-SA 3.0",
10
+ "spacy_version":">=3.4.4,<3.5.0",
11
+ "parent_package":"spacy",
12
+ "requirements":[
13
+ "spacy-transformers>=1.0.0,<1.1.0"
14
+ ],
15
+ "sources":[
16
+ {
17
+ "license":"MIT"
18
+ }
19
+ ],
20
+ "vectors":{
21
+ "width":0,
22
+ "vectors":0,
23
+ "keys":0,
24
+ "name":null
25
+ },
26
+ "pipeline":
27
+ "errors",
28
+ "components":"errors",
29
+ "labels":{
30
+
31
+ },
32
+ "performance":{
33
+ "spans_errors_p":0.7937892339,
34
+ "spans_errors_r":0.4476503759,
35
+ "spans_errors_f":0.5724644939,
36
+ "spans_errors_per_type":{
37
+ "Numerals":{
38
+ "p":0.7313328681,
39
+ "r":0.577092511,
40
+ "f":0.6451215759
41
+ },
42
+ "lex_item_choice":{
43
+ "p":0.7750791975,
44
+ "r":0.1950571353,
45
+ "f":0.3116772824
46
+ },
47
+ "Articles":{
48
+ "p":0.785046729,
49
+ "r":0.4552258065,
50
+ "f":0.5762822607
51
+ },
52
+ "Punctuation":{
53
+ "p":0.6955835962,
54
+ "r":0.2376077586,
55
+ "f":0.3542168675
56
+ },
57
+ "Prepositions":{
58
+ "p":0.8163471241,
59
+ "r":0.3766294227,
60
+ "f":0.5154507805
61
+ },
62
+ "Formational_affixes":{
63
+ "p":0.7269700333,
64
+ "r":0.6031307551,
65
+ "f":0.6592853548
66
+ },
67
+ "Agreement_errors":{
68
+ "p":0.7909018356,
69
+ "r":0.5164147994,
70
+ "f":0.6248423707
71
+ },
72
+ "Capitalisation":{
73
+ "p":0.8034148593,
74
+ "r":0.7899274047,
75
+ "f":0.7966140471
76
+ },
77
+ "Noun_number":{
78
+ "p":0.8251445087,
79
+ "r":0.5558079169,
80
+ "f":0.6642109345
81
+ },
82
+ "Tense_choice":{
83
+ "p":0.7827648115,
84
+ "r":0.5369458128,
85
+ "f":0.6369612856
86
+ },
87
+ "Spelling":{
88
+ "p":0.886746988,
89
+ "r":0.4357608052,
90
+ "f":0.5843588726
91
+ }
92
+ }
93
+ },
94
+ "speed":2779.5295317788,
95
+ "spacy_git_version":"61dfdd9fb",
96
+ "disabled":[
97
+
98
+ ]
99
+ }
training/spancat/cfg ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "labels":[
3
+ "Numerals",
4
+ "lex_item_choice",
5
+ "Articles",
6
+ "Punctuation",
7
+ "Prepositions",
8
+ "Formational_affixes",
9
+ "Agreement_errors",
10
+ "Capitalisation",
11
+ "Noun_number",
12
+ "Tense_choice",
13
+ "Spelling"
14
+ ],
15
+ "spans_key":"errors",
16
+ "threshold":0.5,
17
+ "max_positive":null
18
+ }
training/spancat/model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0644468a3433a24fac0c4eff33e9a6626a1eabf81407b933c942d1d83f69e84d
3
+ size 4728651
training/tokenizer ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24ad4ddf9a27837484c7fe2ae8a454167cfc9604ef2e7740a2e74a39cc2c1bc3
3
+ size 76990
training/transformer/cfg ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "max_batch_items":4096
3
+ }
training/transformer/model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:295f882fecfc711a4d7a52b89365526b8fcf308a8c092f91da73a9bba8b80629
3
+ size 434208063
training/vocab/key2row ADDED
@@ -0,0 +1 @@
 
 
1
+
training/vocab/lookups.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76be8b528d0075f7aae98d6fa57a6d3c83ae480a8469e668d7b0af968995ac71
3
+ size 1
training/vocab/strings.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a64eed47943544dafc1dabe9d0f6c26399acf4d59c10d9cdd1c31ba85168f82
3
+ size 176610
training/vocab/vectors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:14772b683e726436d5948ad3fff2b43d036ef2ebbe3458aafed6004e05a40706
3
+ size 128
training/vocab/vectors.cfg ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "mode":"default"
3
+ }