iproskurina committed on
Commit
ce365be
1 Parent(s): 8196617

Update spaCy pipeline

Browse files
README.md CHANGED
@@ -8,6 +8,8 @@ model-index:
8
  - name: en_grammar_checker
9
  results: []
10
  ---
 
 
11
  | Feature | Description |
12
  | --- | --- |
13
  | **Name** | `en_grammar_checker` |
@@ -17,5 +19,13 @@ model-index:
17
  | **Components** | `errors` |
18
  | **Vectors** | 0 keys, 0 unique vectors (0 dimensions) |
19
  | **Sources** | n/a |
20
- | **License** | n/a |
21
- | **Author** | [n/a]() |
 
 
 
 
 
 
 
 
 
8
  - name: en_grammar_checker
9
  results: []
10
  ---
11
+ Essay Grammar Checker
12
+
13
  | Feature | Description |
14
  | --- | --- |
15
  | **Name** | `en_grammar_checker` |
 
19
  | **Components** | `errors` |
20
  | **Vectors** | 0 keys, 0 unique vectors (0 dimensions) |
21
  | **Sources** | n/a |
22
+ | **License** | `CC BY-SA 3.0` |
23
+ | **Author** | [Irina Proskurina]() |
24
+
25
+ ### Accuracy
26
+
27
+ | Type | Score |
28
+ | --- | --- |
29
+ | `SPANS_ERRORS_P` | 79.38 |
30
+ | `SPANS_ERRORS_R` | 44.77 |
31
+ | `SPANS_ERRORS_F` | 57.25 |
config.cfg CHANGED
@@ -1,77 +1,124 @@
1
  [paths]
2
- train = null
3
- dev = null
4
  vectors = null
5
  init_tok2vec = null
6
 
7
  [system]
 
8
  seed = 0
9
- gpu_allocator = null
10
 
11
  [nlp]
12
  lang = "en"
13
- pipeline = ["errors"]
 
14
  disabled = []
15
  before_creation = null
16
  after_creation = null
17
  after_pipeline_creation = null
18
- batch_size = 1000
19
  tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
20
 
21
  [components]
22
 
23
- [components.errors]
24
- factory = "errors"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
  [corpora]
27
 
28
  [corpora.dev]
29
  @readers = "spacy.Corpus.v1"
30
- path = ${paths.dev}
31
- gold_preproc = false
32
  max_length = 0
 
33
  limit = 0
34
  augmenter = null
35
 
36
  [corpora.train]
37
  @readers = "spacy.Corpus.v1"
38
- path = ${paths.train}
39
- gold_preproc = false
40
  max_length = 0
 
41
  limit = 0
42
  augmenter = null
43
 
44
  [training]
45
- seed = ${system.seed}
46
- gpu_allocator = ${system.gpu_allocator}
 
 
 
47
  dropout = 0.1
48
- accumulate_gradient = 1
49
  patience = 1600
50
  max_epochs = 0
51
  max_steps = 20000
52
  eval_frequency = 200
53
  frozen_components = []
54
  annotating_components = []
55
- dev_corpus = "corpora.dev"
56
- train_corpus = "corpora.train"
57
  before_to_disk = null
58
 
59
  [training.batcher]
60
- @batchers = "spacy.batch_by_words.v1"
61
- discard_oversize = false
62
- tolerance = 0.2
 
63
  get_length = null
64
 
65
- [training.batcher.size]
66
- @schedules = "compounding.v1"
67
- start = 100
68
- stop = 1000
69
- compound = 1.001
70
- t = 0.0
71
-
72
  [training.logger]
73
- @loggers = "spacy.ConsoleLogger.v1"
74
- progress_bar = false
 
 
 
 
 
75
 
76
  [training.optimizer]
77
  @optimizers = "Adam.v1"
@@ -82,15 +129,34 @@ L2 = 0.01
82
  grad_clip = 1.0
83
  use_averages = false
84
  eps = 0.00000001
85
- learn_rate = 0.001
 
 
 
 
 
86
 
87
  [training.score_weights]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
 
89
  [pretraining]
90
 
91
  [initialize]
92
- vectors = ${paths.vectors}
93
- init_tok2vec = ${paths.init_tok2vec}
94
  vocab_data = null
95
  lookups = null
96
  before_init = null
 
1
  [paths]
2
+ train = "./realec/train.spacy"
3
+ dev = "./realec/dev.spacy"
4
  vectors = null
5
  init_tok2vec = null
6
 
7
  [system]
8
+ gpu_allocator = "pytorch"
9
  seed = 0
 
10
 
11
  [nlp]
12
  lang = "en"
13
+ pipeline = ["transformer","spancat"]
14
+ batch_size = 16
15
  disabled = []
16
  before_creation = null
17
  after_creation = null
18
  after_pipeline_creation = null
 
19
  tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
20
 
21
  [components]
22
 
23
+ [components.spancat]
24
+ factory = "spancat"
25
+ max_positive = null
26
+ scorer = {"@scorers":"spacy.spancat_scorer.v1"}
27
+ spans_key = "errors"
28
+ threshold = 0.5
29
+
30
+ [components.spancat.model]
31
+ @architectures = "spacy.SpanCategorizer.v1"
32
+
33
+ [components.spancat.model.reducer]
34
+ @layers = "spacy.mean_max_reducer.v1"
35
+ hidden_size = 128
36
+
37
+ [components.spancat.model.scorer]
38
+ @layers = "spacy.LinearLogistic.v1"
39
+ nO = null
40
+ nI = null
41
+
42
+ [components.spancat.model.tok2vec]
43
+ @architectures = "spacy-transformers.TransformerListener.v1"
44
+ grad_factor = 1.0
45
+ pooling = {"@layers":"reduce_mean.v1"}
46
+ upstream = "*"
47
+
48
+ [components.spancat.suggester]
49
+ @misc = "spacy.ngram_suggester.v1"
50
+ sizes = [1,2,3]
51
+
52
+ [components.transformer]
53
+ factory = "transformer"
54
+ max_batch_items = 4096
55
+ set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"}
56
+
57
+ [components.transformer.model]
58
+ @architectures = "spacy-transformers.TransformerModel.v3"
59
+ name = "bert-base-cased"
60
+ mixed_precision = false
61
+
62
+ [components.transformer.model.get_spans]
63
+ @span_getters = "spacy-transformers.strided_spans.v1"
64
+ window = 128
65
+ stride = 96
66
+
67
+ [components.transformer.model.grad_scaler_config]
68
+
69
+ [components.transformer.model.tokenizer_config]
70
+ use_fast = true
71
+
72
+ [components.transformer.model.transformer_config]
73
 
74
  [corpora]
75
 
76
  [corpora.dev]
77
  @readers = "spacy.Corpus.v1"
78
+ path = "./realec/dev.spacy"
 
79
  max_length = 0
80
+ gold_preproc = false
81
  limit = 0
82
  augmenter = null
83
 
84
  [corpora.train]
85
  @readers = "spacy.Corpus.v1"
86
+ path = "./realec/train.spacy"
 
87
  max_length = 0
88
+ gold_preproc = false
89
  limit = 0
90
  augmenter = null
91
 
92
  [training]
93
+ accumulate_gradient = 3
94
+ dev_corpus = "corpora.dev"
95
+ train_corpus = "corpora.train"
96
+ seed = 0
97
+ gpu_allocator = "pytorch"
98
  dropout = 0.1
 
99
  patience = 1600
100
  max_epochs = 0
101
  max_steps = 20000
102
  eval_frequency = 200
103
  frozen_components = []
104
  annotating_components = []
 
 
105
  before_to_disk = null
106
 
107
  [training.batcher]
108
+ @batchers = "spacy.batch_by_padded.v1"
109
+ discard_oversize = true
110
+ size = 2000
111
+ buffer = 256
112
  get_length = null
113
 
 
 
 
 
 
 
 
114
  [training.logger]
115
+ @loggers = "spacy.WandbLogger.v3"
116
+ project_name = "my-awesome-project"
117
+ remove_config_values = ["paths.train","paths.dev","corpora.train.path","corpora.dev.path"]
118
+ log_dataset_dir = null
119
+ entity = null
120
+ run_name = "grammar-checker"
121
+ model_log_interval = null
122
 
123
  [training.optimizer]
124
  @optimizers = "Adam.v1"
 
129
  grad_clip = 1.0
130
  use_averages = false
131
  eps = 0.00000001
132
+
133
+ [training.optimizer.learn_rate]
134
+ @schedules = "warmup_linear.v1"
135
+ warmup_steps = 250
136
+ total_steps = 20000
137
+ initial_rate = 0.00005
138
 
139
  [training.score_weights]
140
+ spans_sc_f = 0.5
141
+ spans_sc_p = 0.0
142
+ spans_sc_r = 0.0
143
+ spans_Agreement_errors_f = 0.06
144
+ spans_Articles_f = 0.03
145
+ spans_Capitalisation_f = 0.05
146
+ spans_Formational_affixes_f = 0.1
147
+ spans_Noun_number_f = 0.04
148
+ spans_Numerals_f = 0.06
149
+ spans_Prepositions_f = 0.05
150
+ spans_Punctuation_f = 0.03
151
+ spans_Spelling_f = 0.02
152
+ spans_Tense_choice_f = 0.03
153
+ spans_lex_item_choice_f = 0.03
154
 
155
  [pretraining]
156
 
157
  [initialize]
158
+ vectors = null
159
+ init_tok2vec = null
160
  vocab_data = null
161
  lookups = null
162
  before_init = null
en_grammar_checker-any-py3-none-any.whl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:483f6941367174edb71452d6c4f82fdf61aae1b35c3755a91e8cf4d58688cbcf
3
- size 406916683
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bce34b4aa2e04c41a3fada9ada855e4745a8ca7e8858e63b672f569c2255fdc6
3
+ size 406893070
meta.json CHANGED
@@ -8,15 +8,20 @@
8
  "url":"",
9
  "license":"CC BY-SA 3.0",
10
  "spacy_version":">=3.4.4,<3.5.0",
11
- "spacy_git_version":"61dfdd9fb",
 
 
 
 
 
 
 
 
12
  "vectors":{
13
  "width":0,
14
  "vectors":0,
15
  "keys":0,
16
  "name":null
17
- },
18
- "labels":{
19
-
20
  },
21
  "pipeline":[
22
  "errors"
@@ -24,18 +29,9 @@
24
  "components":[
25
  "errors"
26
  ],
27
- "disabled":[
28
 
29
- ],
30
- "parent_package":"spacy",
31
- "requirements":[
32
- "spacy-transformers>=1.0.0,<1.1.0"
33
- ],
34
- "sources":[
35
- {
36
- "license":"MIT"
37
- }
38
- ],
39
  "performance":{
40
  "spans_errors_p":0.7937892339,
41
  "spans_errors_r":0.4476503759,
@@ -98,5 +94,9 @@
98
  }
99
  }
100
  },
101
- "speed":2779.5295317788
 
 
 
 
102
  }
 
8
  "url":"",
9
  "license":"CC BY-SA 3.0",
10
  "spacy_version":">=3.4.4,<3.5.0",
11
+ "parent_package":"spacy",
12
+ "requirements":[
13
+ "spacy-transformers>=1.0.0,<1.1.0"
14
+ ],
15
+ "sources":[
16
+ {
17
+ "license":"MIT"
18
+ }
19
+ ],
20
  "vectors":{
21
  "width":0,
22
  "vectors":0,
23
  "keys":0,
24
  "name":null
 
 
 
25
  },
26
  "pipeline":[
27
  "errors"
 
29
  "components":[
30
  "errors"
31
  ],
32
+ "labels":{
33
 
34
+ },
 
 
 
 
 
 
 
 
 
35
  "performance":{
36
  "spans_errors_p":0.7937892339,
37
  "spans_errors_r":0.4476503759,
 
94
  }
95
  }
96
  },
97
+ "speed":2779.5295317788,
98
+ "spacy_git_version":"61dfdd9fb",
99
+ "disabled":[
100
+
101
+ ]
102
  }
spancat/cfg ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "labels":[
3
+ "Numerals",
4
+ "lex_item_choice",
5
+ "Articles",
6
+ "Punctuation",
7
+ "Prepositions",
8
+ "Formational_affixes",
9
+ "Agreement_errors",
10
+ "Capitalisation",
11
+ "Noun_number",
12
+ "Tense_choice",
13
+ "Spelling"
14
+ ],
15
+ "spans_key":"errors",
16
+ "threshold":0.5,
17
+ "max_positive":null
18
+ }
spancat/model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0644468a3433a24fac0c4eff33e9a6626a1eabf81407b933c942d1d83f69e84d
3
+ size 4728651
transformer/cfg ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "max_batch_items":4096
3
+ }
transformer/model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:295f882fecfc711a4d7a52b89365526b8fcf308a8c092f91da73a9bba8b80629
3
+ size 434208063
vocab/strings.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b0341677b1e682df9d40e4e944d83860fbff48c547e2251da6885f2bc6a3fa29
3
- size 12938
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a64eed47943544dafc1dabe9d0f6c26399acf4d59c10d9cdd1c31ba85168f82
3
+ size 176610