egumasa committed on
Commit
4aad32c
1 Parent(s): ebbe6f5

Update spaCy pipeline

Browse files
README.md CHANGED
@@ -54,13 +54,13 @@ model-index:
54
  metrics:
55
  - name: Sentences F-Score
56
  type: f_score
57
- value: 0.9144831558
58
  ---
59
  | Feature | Description |
60
  | --- | --- |
61
  | **Name** | `en_engagement_spl_RoBERTa_base_attention` |
62
- | **Version** | `0.0.2` |
63
- | **spaCy** | `>=3.4.4,<3.5.0` |
64
  | **Default Pipeline** | `transformer`, `parser`, `tagger`, `ner`, `attribute_ruler`, `lemmatizer`, `trainable_transformer`, `spancat` |
65
  | **Components** | `transformer`, `parser`, `tagger`, `ner`, `attribute_ruler`, `lemmatizer`, `trainable_transformer`, `spancat` |
66
  | **Vectors** | 0 keys, 0 unique vectors (0 dimensions) |
@@ -90,16 +90,16 @@ model-index:
90
  | `DEP_UAS` | 0.00 |
91
  | `DEP_LAS` | 0.00 |
92
  | `DEP_LAS_PER_TYPE` | 0.00 |
93
- | `SENTS_P` | 89.82 |
94
- | `SENTS_R` | 93.14 |
95
- | `SENTS_F` | 91.45 |
96
  | `TAG_ACC` | 0.00 |
97
  | `ENTS_F` | 0.00 |
98
  | `ENTS_P` | 0.00 |
99
  | `ENTS_R` | 0.00 |
100
  | `LEMMA_ACC` | 0.00 |
101
- | `SPANS_SC_F` | 76.99 |
102
- | `SPANS_SC_P` | 77.84 |
103
- | `SPANS_SC_R` | 76.17 |
104
- | `TRAINABLE_TRANSFORMER_LOSS` | 482.53 |
105
- | `SPANCAT_LOSS` | 68571.83 |
 
54
  metrics:
55
  - name: Sentences F-Score
56
  type: f_score
57
+ value: 0.9469411424
58
  ---
59
  | Feature | Description |
60
  | --- | --- |
61
  | **Name** | `en_engagement_spl_RoBERTa_base_attention` |
62
+ | **Version** | `0.0.1` |
63
+ | **spaCy** | `>=3.6.0,<3.7.0` |
64
  | **Default Pipeline** | `transformer`, `parser`, `tagger`, `ner`, `attribute_ruler`, `lemmatizer`, `trainable_transformer`, `spancat` |
65
  | **Components** | `transformer`, `parser`, `tagger`, `ner`, `attribute_ruler`, `lemmatizer`, `trainable_transformer`, `spancat` |
66
  | **Vectors** | 0 keys, 0 unique vectors (0 dimensions) |
 
90
  | `DEP_UAS` | 0.00 |
91
  | `DEP_LAS` | 0.00 |
92
  | `DEP_LAS_PER_TYPE` | 0.00 |
93
+ | `SENTS_P` | 93.64 |
94
+ | `SENTS_R` | 95.78 |
95
+ | `SENTS_F` | 94.69 |
96
  | `TAG_ACC` | 0.00 |
97
  | `ENTS_F` | 0.00 |
98
  | `ENTS_P` | 0.00 |
99
  | `ENTS_R` | 0.00 |
100
  | `LEMMA_ACC` | 0.00 |
101
+ | `SPANS_SC_F` | 77.65 |
102
+ | `SPANS_SC_P` | 78.19 |
103
+ | `SPANS_SC_R` | 77.12 |
104
+ | `TRAINABLE_TRANSFORMER_LOSS` | 5917.79 |
105
+ | `SPANCAT_LOSS` | 76188.74 |
attribute_ruler/patterns CHANGED
Binary files a/attribute_ruler/patterns and b/attribute_ruler/patterns differ
 
config.cfg CHANGED
@@ -85,16 +85,11 @@ spans_key = ${vars.spans_key}
85
  threshold = 0.5
86
 
87
  [components.spancat.model]
88
- @architectures = "Attention_SpanCategorizer.v4"
89
- LSTMdepth = 1
90
- LSTMdropout = 0.0
91
- LSTMhidden = 200
92
 
93
  [components.spancat.model.reducer]
94
- @layers = "mean_max_reducer.v1.5"
95
  hidden_size = 128
96
- dropout = 0.0
97
- depth = 1
98
 
99
  [components.spancat.model.scorer]
100
  @layers = "spacy.LinearLogistic.v1"
@@ -113,6 +108,7 @@ sizes = [1,2,3,4,5,6,7,8,9,10,11,12]
113
 
114
  [components.tagger]
115
  factory = "tagger"
 
116
  neg_prefix = "!"
117
  overwrite = false
118
  scorer = {"@scorers":"spacy.tagger_scorer.v1"}
@@ -151,8 +147,8 @@ max_batch_items = 4096
151
  set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"}
152
 
153
  [components.transformer.model]
154
- @architectures = "spacy-transformers.TransformerModel.v3"
155
  name = "roberta-base"
 
156
  mixed_precision = false
157
 
158
  [components.transformer.model.get_spans]
@@ -199,6 +195,7 @@ eval_frequency = 200
199
  frozen_components = ["transformer","parser","tagger","ner","attribute_ruler","lemmatizer"]
200
  annotating_components = ["parser"]
201
  before_to_disk = null
 
202
 
203
  [training.batcher]
204
  @batchers = "spacy.batch_by_words.v1"
 
85
  threshold = 0.5
86
 
87
  [components.spancat.model]
88
+ @architectures = "Attention_SpanCategorizer.v3"
 
 
 
89
 
90
  [components.spancat.model.reducer]
91
+ @layers = "spacy.mean_max_reducer.v1"
92
  hidden_size = 128
 
 
93
 
94
  [components.spancat.model.scorer]
95
  @layers = "spacy.LinearLogistic.v1"
 
108
 
109
  [components.tagger]
110
  factory = "tagger"
111
+ label_smoothing = 0.0
112
  neg_prefix = "!"
113
  overwrite = false
114
  scorer = {"@scorers":"spacy.tagger_scorer.v1"}
 
147
  set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"}
148
 
149
  [components.transformer.model]
 
150
  name = "roberta-base"
151
+ @architectures = "spacy-transformers.TransformerModel.v3"
152
  mixed_precision = false
153
 
154
  [components.transformer.model.get_spans]
 
195
  frozen_components = ["transformer","parser","tagger","ner","attribute_ruler","lemmatizer"]
196
  annotating_components = ["parser"]
197
  before_to_disk = null
198
+ before_update = null
199
 
200
  [training.batcher]
201
  @batchers = "spacy.batch_by_words.v1"
custom_functions.py CHANGED
@@ -368,9 +368,7 @@ def build_spancat_LSTM_model(
368
  tok2vec: Model[List[Doc], List[Floats2d]],
369
  reducer: Model[Ragged, Floats2d],
370
  scorer: Model[Floats2d, Floats2d],
371
- LSTMdepth: int = 1,
372
- LSTMdropout: float = 0.0,
373
- LSTMhidden: int = 200) -> Model[Tuple[List[Doc], Ragged], Floats2d]:
374
  """Build a span categorizer model, given a token-to-vector model, a
375
  reducer model to map the sequence of vectors for each span down to a single
376
  vector, and a scorer model to map the vectors to probabilities.
@@ -385,18 +383,17 @@ def build_spancat_LSTM_model(
385
  0,
386
  chain(
387
  tok2vec,
388
- PyTorchLSTM(nI=768,
389
- nO=LSTMhidden,
390
- bi=True,
391
- depth=LSTMdepth,
392
- dropout=LSTMdropout),
393
  cast(Model[List[Floats2d], Ragged], list2ragged()))))
394
 
 
 
 
395
 
 
396
  model = chain(
397
  embedding,
 
398
  extract_spans(),
399
- ParametricAttention(nO = width),
400
  reducer,
401
  scorer,
402
  )
 
368
  tok2vec: Model[List[Doc], List[Floats2d]],
369
  reducer: Model[Ragged, Floats2d],
370
  scorer: Model[Floats2d, Floats2d],
371
+ ) -> Model[Tuple[List[Doc], Ragged], Floats2d]:
 
 
372
  """Build a span categorizer model, given a token-to-vector model, a
373
  reducer model to map the sequence of vectors for each span down to a single
374
  vector, and a scorer model to map the vectors to probabilities.
 
383
  0,
384
  chain(
385
  tok2vec,
 
 
 
 
 
386
  cast(Model[List[Floats2d], Ragged], list2ragged()))))
387
 
388
+ attention_layer = chain(
389
+ ParametricAttention(nO = width),
390
+ list2ragged())
391
 
392
+
393
  model = chain(
394
  embedding,
395
+ attention_layer,
396
  extract_spans(),
 
397
  reducer,
398
  scorer,
399
  )
en_engagement_spl_RoBERTa_base_attention-any-py3-none-any.whl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d645450810f3d7ad0f9567642780fce0ede7d726fcbc5d62f0d48b7f39e80295
3
- size 903169587
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2b2dd5c15ced10fb287fd7b894d189522d585bdfe8bd743c2d26ffff2df02ae
3
+ size 903886799
meta.json CHANGED
@@ -1,14 +1,14 @@
1
  {
2
  "lang":"en",
3
  "name":"engagement_spl_RoBERTa_base_attention",
4
- "version":"0.0.2",
5
  "description":"",
6
  "author":"",
7
  "email":"",
8
  "url":"",
9
  "license":"",
10
- "spacy_version":">=3.4.4,<3.5.0",
11
- "spacy_git_version":"77833bfef",
12
  "vectors":{
13
  "width":0,
14
  "vectors":0,
@@ -186,9 +186,9 @@
186
  "dep_uas":0.0,
187
  "dep_las":0.0,
188
  "dep_las_per_type":0.0,
189
- "sents_p":0.8981900452,
190
- "sents_r":0.9313782991,
191
- "sents_f":0.9144831558,
192
  "tag_acc":0.0,
193
  "ents_f":0.0,
194
  "ents_p":0.0,
@@ -204,17 +204,17 @@
204
  "r":0.0,
205
  "f":0.0
206
  },
207
- "COUNTER":{
208
  "p":0.0,
209
  "r":0.0,
210
  "f":0.0
211
  },
212
- "PROCLAIM":{
213
  "p":0.0,
214
  "r":0.0,
215
  "f":0.0
216
  },
217
- "PERSON":{
218
  "p":0.0,
219
  "r":0.0,
220
  "f":0.0
@@ -224,6 +224,11 @@
224
  "r":0.0,
225
  "f":0.0
226
  },
 
 
 
 
 
227
  "ORG":{
228
  "p":0.0,
229
  "r":0.0,
@@ -269,27 +274,22 @@
269
  "r":0.0,
270
  "f":0.0
271
  },
272
- "LAW":{
273
- "p":0.0,
274
- "r":0.0,
275
- "f":0.0
276
- },
277
- "NORP":{
278
  "p":0.0,
279
  "r":0.0,
280
  "f":0.0
281
  },
282
- "GPE":{
283
  "p":0.0,
284
  "r":0.0,
285
  "f":0.0
286
  },
287
- "QUANTITY":{
288
  "p":0.0,
289
  "r":0.0,
290
  "f":0.0
291
  },
292
- "WORK_OF_ART":{
293
  "p":0.0,
294
  "r":0.0,
295
  "f":0.0
@@ -316,14 +316,14 @@
316
  }
317
  },
318
  "lemma_acc":0.0,
319
- "spans_sc_f":0.7699386503,
320
- "spans_sc_p":0.7783750258,
321
- "spans_sc_r":0.7616831883,
322
- "trainable_transformer_loss":4.8252773336,
323
- "spancat_loss":685.7182957505
324
  },
325
  "requirements":[
326
- "spacy-transformers>=1.1.8,<1.2.0",
327
- "spacy-experimental>=0.6.1,<0.7.0"
328
  ]
329
  }
 
1
  {
2
  "lang":"en",
3
  "name":"engagement_spl_RoBERTa_base_attention",
4
+ "version":"0.0.1",
5
  "description":"",
6
  "author":"",
7
  "email":"",
8
  "url":"",
9
  "license":"",
10
+ "spacy_version":">=3.6.0,<3.7.0",
11
+ "spacy_git_version":"6fc153a26",
12
  "vectors":{
13
  "width":0,
14
  "vectors":0,
 
186
  "dep_uas":0.0,
187
  "dep_las":0.0,
188
  "dep_las_per_type":0.0,
189
+ "sents_p":0.936353211,
190
+ "sents_r":0.957771261,
191
+ "sents_f":0.9469411424,
192
  "tag_acc":0.0,
193
  "ents_f":0.0,
194
  "ents_p":0.0,
 
204
  "r":0.0,
205
  "f":0.0
206
  },
207
+ "PROCLAIM":{
208
  "p":0.0,
209
  "r":0.0,
210
  "f":0.0
211
  },
212
+ "COUNTER":{
213
  "p":0.0,
214
  "r":0.0,
215
  "f":0.0
216
  },
217
+ "NORP":{
218
  "p":0.0,
219
  "r":0.0,
220
  "f":0.0
 
224
  "r":0.0,
225
  "f":0.0
226
  },
227
+ "PERSON":{
228
+ "p":0.0,
229
+ "r":0.0,
230
+ "f":0.0
231
+ },
232
  "ORG":{
233
  "p":0.0,
234
  "r":0.0,
 
274
  "r":0.0,
275
  "f":0.0
276
  },
277
+ "GPE":{
 
 
 
 
 
278
  "p":0.0,
279
  "r":0.0,
280
  "f":0.0
281
  },
282
+ "QUANTITY":{
283
  "p":0.0,
284
  "r":0.0,
285
  "f":0.0
286
  },
287
+ "WORK_OF_ART":{
288
  "p":0.0,
289
  "r":0.0,
290
  "f":0.0
291
  },
292
+ "LAW":{
293
  "p":0.0,
294
  "r":0.0,
295
  "f":0.0
 
316
  }
317
  },
318
  "lemma_acc":0.0,
319
+ "spans_sc_f":0.7765328988,
320
+ "spans_sc_p":0.7819487179,
321
+ "spans_sc_r":0.7711915841,
322
+ "trainable_transformer_loss":59.1778603495,
323
+ "spancat_loss":761.8874162047
324
  },
325
  "requirements":[
326
+ "spacy-transformers>=1.2.5,<1.3.0",
327
+ "spacy-experimental>=0.6.4,<0.7.0"
328
  ]
329
  }
ner/model CHANGED
Binary files a/ner/model and b/ner/model differ
 
parser/model CHANGED
Binary files a/parser/model and b/parser/model differ
 
spancat/cfg CHANGED
@@ -13,5 +13,7 @@
13
  ],
14
  "spans_key":"sc",
15
  "threshold":0.5,
16
- "max_positive":null
 
 
17
  }
 
13
  ],
14
  "spans_key":"sc",
15
  "threshold":0.5,
16
+ "max_positive":null,
17
+ "negative_weight":null,
18
+ "allow_overlap":true
19
  }
spancat/model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a619bfcf65a09d7ae3530f875da94ff971cb9cc9541c75a46ba323935c6e99e5
3
- size 4025694
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57f0eae0e1a723c68852e60899998d0ab54d3c5dd3a9605b47c87568f5e1c46f
3
+ size 4731308
tagger/cfg CHANGED
@@ -1,4 +1,5 @@
1
  {
 
2
  "labels":[
3
  "$",
4
  "''",
 
1
  {
2
+ "label_smoothing":0.0,
3
  "labels":[
4
  "$",
5
  "''",
tagger/model CHANGED
Binary files a/tagger/model and b/tagger/model differ
 
tokenizer CHANGED
The diff for this file is too large to render. See raw diff
 
trainable_transformer/model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0563af895d09415d3a4bb71890938adf4e29777c45ff9d3ebf484677d74f8101
3
- size 502027816
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe7414c3a1d546d9d1e5e579db20f3b41d324d07e083d16fff4de6861996948b
3
+ size 502028268
transformer/model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5cb2327f9fbf15d08de679e5cfd9b1ae1fc4c565d3f3438d8e627d85a3c577a3
3
- size 502027925
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb9fc57108532fd3a96a6c638e5a4c55c19990e900eac50266b189f1c23da3bf
3
+ size 502028329
vocab/strings.json CHANGED
@@ -3384,6 +3384,7 @@
3384
  "3500",
3385
  "35087.38",
3386
  "350Ms",
 
3387
  "350ms",
3388
  "351",
3389
  "351.2",
@@ -68891,7 +68892,6 @@
68891
  "pennzoil",
68892
  "pens",
68893
  "pensacola",
68894
- "pensee",
68895
  "pensees",
68896
  "pension",
68897
  "pensions",
@@ -86547,7 +86547,6 @@
86547
  "wolves",
86548
  "womack",
86549
  "woman",
86550
- "womanize",
86551
  "womanizing",
86552
  "womanly",
86553
  "womans",
 
3384
  "3500",
3385
  "35087.38",
3386
  "350Ms",
3387
+ "350m",
3388
  "350ms",
3389
  "351",
3390
  "351.2",
 
68892
  "pennzoil",
68893
  "pens",
68894
  "pensacola",
 
68895
  "pensees",
68896
  "pension",
68897
  "pensions",
 
86547
  "wolves",
86548
  "womack",
86549
  "woman",
 
86550
  "womanizing",
86551
  "womanly",
86552
  "womans",