Zlovoblachko committed on
Commit
20875ec
1 Parent(s): cce67dc

Update spaCy pipeline

Browse files
.gitattributes CHANGED
@@ -35,3 +35,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
35
  en_pipeline-any-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
36
  textcat/model filter=lfs diff=lfs merge=lfs -text
37
  transformer/model filter=lfs diff=lfs merge=lfs -text
 
 
35
  en_pipeline-any-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
36
  textcat/model filter=lfs diff=lfs merge=lfs -text
37
  transformer/model filter=lfs diff=lfs merge=lfs -text
38
+ spancat/model filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,7 +1,6 @@
1
  ---
2
  tags:
3
  - spacy
4
- - text-classification
5
  language:
6
  - en
7
  model-index:
@@ -12,9 +11,9 @@ model-index:
12
  | --- | --- |
13
  | **Name** | `en_pipeline` |
14
  | **Version** | `0.0.0` |
15
- | **spaCy** | `>=3.5.1,<3.6.0` |
16
- | **Default Pipeline** | `transformer`, `textcat` |
17
- | **Components** | `transformer`, `textcat` |
18
  | **Vectors** | 0 keys, 0 unique vectors (0 dimensions) |
19
  | **Sources** | n/a |
20
  | **License** | n/a |
@@ -24,11 +23,11 @@ model-index:
24
 
25
  <details>
26
 
27
- <summary>View label scheme (2 labels for 1 components)</summary>
28
 
29
  | Component | Labels |
30
  | --- | --- |
31
- | **`textcat`** | `POS`, `NEG` |
32
 
33
  </details>
34
 
@@ -36,13 +35,8 @@ model-index:
36
 
37
  | Type | Score |
38
  | --- | --- |
39
- | `CATS_SCORE` | 91.77 |
40
- | `CATS_MICRO_P` | 91.78 |
41
- | `CATS_MICRO_R` | 91.78 |
42
- | `CATS_MICRO_F` | 91.78 |
43
- | `CATS_MACRO_P` | 91.90 |
44
- | `CATS_MACRO_R` | 91.74 |
45
- | `CATS_MACRO_F` | 91.77 |
46
- | `CATS_MACRO_AUC` | 97.20 |
47
- | `TRANSFORMER_LOSS` | 2920.28 |
48
- | `TEXTCAT_LOSS` | 307.66 |
 
1
  ---
2
  tags:
3
  - spacy
 
4
  language:
5
  - en
6
  model-index:
 
11
  | --- | --- |
12
  | **Name** | `en_pipeline` |
13
  | **Version** | `0.0.0` |
14
+ | **spaCy** | `>=3.4.4,<3.5.0` |
15
+ | **Default Pipeline** | `transformer`, `spancat` |
16
+ | **Components** | `transformer`, `spancat` |
17
  | **Vectors** | 0 keys, 0 unique vectors (0 dimensions) |
18
  | **Sources** | n/a |
19
  | **License** | n/a |
 
23
 
24
  <details>
25
 
26
+ <summary>View label scheme (1 labels for 1 components)</summary>
27
 
28
  | Component | Labels |
29
  | --- | --- |
30
+ | **`spancat`** | `Collocation calque` |
31
 
32
  </details>
33
 
 
35
 
36
  | Type | Score |
37
  | --- | --- |
38
+ | `SPANS_SC_F` | 78.65 |
39
+ | `SPANS_SC_P` | 79.55 |
40
+ | `SPANS_SC_R` | 77.78 |
41
+ | `TRANSFORMER_LOSS` | 7535.29 |
42
+ | `SPANCAT_LOSS` | 148493.75 |
 
 
 
 
 
config.cfg CHANGED
@@ -1,6 +1,6 @@
1
  [paths]
2
- train = "./train.spacy"
3
- dev = "./test.spacy"
4
  vectors = null
5
  init_tok2vec = null
6
 
@@ -10,8 +10,8 @@ seed = 0
10
 
11
  [nlp]
12
  lang = "en"
13
- pipeline = ["transformer","textcat"]
14
- batch_size = 128
15
  disabled = []
16
  before_creation = null
17
  after_creation = null
@@ -20,28 +20,35 @@ tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
20
 
21
  [components]
22
 
23
- [components.textcat]
24
- factory = "textcat"
25
- scorer = {"@scorers":"spacy.textcat_scorer.v2"}
26
- threshold = 0.0
 
 
27
 
28
- [components.textcat.model]
29
- @architectures = "spacy.TextCatEnsemble.v2"
30
- nO = null
 
 
 
31
 
32
- [components.textcat.model.linear_model]
33
- @architectures = "spacy.TextCatBOW.v2"
34
- exclusive_classes = true
35
- ngram_size = 1
36
- no_output_layer = false
37
  nO = null
 
38
 
39
- [components.textcat.model.tok2vec]
40
  @architectures = "spacy-transformers.TransformerListener.v1"
41
  grad_factor = 1.0
42
  pooling = {"@layers":"reduce_mean.v1"}
43
  upstream = "*"
44
 
 
 
 
 
45
  [components.transformer]
46
  factory = "transformer"
47
  max_batch_items = 4096
@@ -49,7 +56,7 @@ set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotati
49
 
50
  [components.transformer.model]
51
  @architectures = "spacy-transformers.TransformerModel.v3"
52
- name = "bert-base-uncased"
53
  mixed_precision = false
54
 
55
  [components.transformer.model.get_spans]
@@ -68,7 +75,7 @@ use_fast = true
68
 
69
  [corpora.dev]
70
  @readers = "spacy.Corpus.v1"
71
- path = ${paths.dev}
72
  max_length = 0
73
  gold_preproc = false
74
  limit = 0
@@ -76,7 +83,7 @@ augmenter = null
76
 
77
  [corpora.train]
78
  @readers = "spacy.Corpus.v1"
79
- path = ${paths.train}
80
  max_length = 0
81
  gold_preproc = false
82
  limit = 0
@@ -86,8 +93,8 @@ augmenter = null
86
  accumulate_gradient = 3
87
  dev_corpus = "corpora.dev"
88
  train_corpus = "corpora.train"
89
- seed = ${system.seed}
90
- gpu_allocator = ${system.gpu_allocator}
91
  dropout = 0.1
92
  patience = 1600
93
  max_epochs = 0
@@ -96,7 +103,6 @@ eval_frequency = 200
96
  frozen_components = []
97
  annotating_components = []
98
  before_to_disk = null
99
- before_update = null
100
 
101
  [training.batcher]
102
  @batchers = "spacy.batch_by_padded.v1"
@@ -106,8 +112,13 @@ buffer = 256
106
  get_length = null
107
 
108
  [training.logger]
109
- @loggers = "spacy.ConsoleLogger.v1"
110
- progress_bar = true
 
 
 
 
 
111
 
112
  [training.optimizer]
113
  @optimizers = "Adam.v1"
@@ -126,22 +137,15 @@ total_steps = 20000
126
  initial_rate = 0.00005
127
 
128
  [training.score_weights]
129
- cats_score = 1.0
130
- cats_score_desc = null
131
- cats_micro_p = null
132
- cats_micro_r = null
133
- cats_micro_f = null
134
- cats_macro_p = null
135
- cats_macro_r = null
136
- cats_macro_f = null
137
- cats_macro_auc = null
138
- cats_f_per_type = null
139
 
140
  [pretraining]
141
 
142
  [initialize]
143
- vectors = ${paths.vectors}
144
- init_tok2vec = ${paths.init_tok2vec}
145
  vocab_data = null
146
  lookups = null
147
  before_init = null
 
1
  [paths]
2
+ train = "./realec/train.spacy"
3
+ dev = "./realec/dev.spacy"
4
  vectors = null
5
  init_tok2vec = null
6
 
 
10
 
11
  [nlp]
12
  lang = "en"
13
+ pipeline = ["transformer","spancat"]
14
+ batch_size = 16
15
  disabled = []
16
  before_creation = null
17
  after_creation = null
 
20
 
21
  [components]
22
 
23
+ [components.spancat]
24
+ factory = "spancat"
25
+ max_positive = null
26
+ scorer = {"@scorers":"spacy.spancat_scorer.v1"}
27
+ spans_key = "sc"
28
+ threshold = 0.5
29
 
30
+ [components.spancat.model]
31
+ @architectures = "spacy.SpanCategorizer.v1"
32
+
33
+ [components.spancat.model.reducer]
34
+ @layers = "spacy.mean_max_reducer.v1"
35
+ hidden_size = 128
36
 
37
+ [components.spancat.model.scorer]
38
+ @layers = "spacy.LinearLogistic.v1"
 
 
 
39
  nO = null
40
+ nI = null
41
 
42
+ [components.spancat.model.tok2vec]
43
  @architectures = "spacy-transformers.TransformerListener.v1"
44
  grad_factor = 1.0
45
  pooling = {"@layers":"reduce_mean.v1"}
46
  upstream = "*"
47
 
48
+ [components.spancat.suggester]
49
+ @misc = "spacy.ngram_suggester.v1"
50
+ sizes = [1,2,3]
51
+
52
  [components.transformer]
53
  factory = "transformer"
54
  max_batch_items = 4096
 
56
 
57
  [components.transformer.model]
58
  @architectures = "spacy-transformers.TransformerModel.v3"
59
+ name = "roberta-base"
60
  mixed_precision = false
61
 
62
  [components.transformer.model.get_spans]
 
75
 
76
  [corpora.dev]
77
  @readers = "spacy.Corpus.v1"
78
+ path = "./dev_new.spacy"
79
  max_length = 0
80
  gold_preproc = false
81
  limit = 0
 
83
 
84
  [corpora.train]
85
  @readers = "spacy.Corpus.v1"
86
+ path = "./train_new.spacy"
87
  max_length = 0
88
  gold_preproc = false
89
  limit = 0
 
93
  accumulate_gradient = 3
94
  dev_corpus = "corpora.dev"
95
  train_corpus = "corpora.train"
96
+ seed = 0
97
+ gpu_allocator = "pytorch"
98
  dropout = 0.1
99
  patience = 1600
100
  max_epochs = 0
 
103
  frozen_components = []
104
  annotating_components = []
105
  before_to_disk = null
 
106
 
107
  [training.batcher]
108
  @batchers = "spacy.batch_by_padded.v1"
 
112
  get_length = null
113
 
114
  [training.logger]
115
+ @loggers = "spacy.WandbLogger.v3"
116
+ project_name = "my-awesome-project"
117
+ remove_config_values = ["paths.train","paths.dev","corpora.train.path","corpora.dev.path"]
118
+ log_dataset_dir = null
119
+ entity = null
120
+ run_name = "grammar-checker"
121
+ model_log_interval = null
122
 
123
  [training.optimizer]
124
  @optimizers = "Adam.v1"
 
137
  initial_rate = 0.00005
138
 
139
  [training.score_weights]
140
+ spans_sc_f = 1.0
141
+ spans_sc_p = 0.0
142
+ spans_sc_r = 0.0
 
 
 
 
 
 
 
143
 
144
  [pretraining]
145
 
146
  [initialize]
147
+ vectors = null
148
+ init_tok2vec = null
149
  vocab_data = null
150
  lookups = null
151
  before_init = null
en_pipeline-any-py3-none-any.whl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e35e7f065ebdc23d16a1ef3496befde1f383d45a057dd2ae32ab1d64bf778e02
3
- size 412661622
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3c18fb55ea1df716bcfba73e5dc56088a12cf5d913f27f4efce4969a85c07a4
3
+ size 429989617
meta.json CHANGED
@@ -7,7 +7,7 @@
7
  "email":"",
8
  "url":"",
9
  "license":"",
10
- "spacy_version":">=3.5.1,<3.6.0",
11
  "spacy_git_version":"Unknown",
12
  "vectors":{
13
  "width":0,
@@ -19,48 +19,29 @@
19
  "transformer":[
20
 
21
  ],
22
- "textcat":[
23
- "POS",
24
- "NEG"
25
  ]
26
  },
27
  "pipeline":[
28
  "transformer",
29
- "textcat"
30
  ],
31
  "components":[
32
  "transformer",
33
- "textcat"
34
  ],
35
  "disabled":[
36
 
37
  ],
38
  "performance":{
39
- "cats_score":0.9176602658,
40
- "cats_score_desc":"macro F",
41
- "cats_micro_p":0.9177777778,
42
- "cats_micro_r":0.9177777778,
43
- "cats_micro_f":0.9177777778,
44
- "cats_macro_p":0.9189851078,
45
- "cats_macro_r":0.917437174,
46
- "cats_macro_f":0.9176602658,
47
- "cats_macro_auc":0.9720394737,
48
- "cats_f_per_type":{
49
- "POS":{
50
- "p":0.9383886256,
51
- "r":0.8918918919,
52
- "f":0.9145496536
53
- },
54
- "NEG":{
55
- "p":0.89958159,
56
- "r":0.9429824561,
57
- "f":0.9207708779
58
- }
59
- },
60
- "transformer_loss":29.2027841461,
61
- "textcat_loss":3.076644832
62
  },
63
  "requirements":[
64
- "spacy-transformers>=1.2.2,<1.3.0"
65
  ]
66
  }
 
7
  "email":"",
8
  "url":"",
9
  "license":"",
10
+ "spacy_version":">=3.4.4,<3.5.0",
11
  "spacy_git_version":"Unknown",
12
  "vectors":{
13
  "width":0,
 
19
  "transformer":[
20
 
21
  ],
22
+ "spancat":[
23
+ "Collocation calque"
 
24
  ]
25
  },
26
  "pipeline":[
27
  "transformer",
28
+ "spancat"
29
  ],
30
  "components":[
31
  "transformer",
32
+ "spancat"
33
  ],
34
  "disabled":[
35
 
36
  ],
37
  "performance":{
38
+ "spans_sc_f":0.7865168539,
39
+ "spans_sc_p":0.7954545455,
40
+ "spans_sc_r":0.7777777778,
41
+ "transformer_loss":75.3529010877,
42
+ "spancat_loss":1484.9374902405
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  },
44
  "requirements":[
45
+ "spacy-transformers>=1.2.1,<1.3.0"
46
  ]
47
  }
spancat/cfg ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "labels":[
3
+ "Collocation calque"
4
+ ],
5
+ "spans_key":"sc",
6
+ "threshold":0.5,
7
+ "max_positive":null
8
+ }
spancat/model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f87221448a074d1c96d0e147e20962654614d0c20d79f1573af97256e0ca67b3
3
+ size 4723491
tokenizer CHANGED
The diff for this file is too large to render. See raw diff
 
transformer/model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6d0701461020db1519d06f02a0fabb24b27c4021afdbf1471b7200399f0b4055
3
- size 438956545
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af10117af5973ed22ffdadc9022168ca709b348568937a25abdf65d7c84649aa
3
+ size 502030680
vocab/strings.json CHANGED
The diff for this file is too large to render. See raw diff