egumasa committed
Commit ebbe6f5
1 Parent(s): 10d4d4f

Update spaCy pipeline
README.md CHANGED
@@ -54,13 +54,13 @@ model-index:
   metrics:
   - name: Sentences F-Score
     type: f_score
-    value: 0.9469411424
+    value: 0.9144831558
 ---
 | Feature | Description |
 | --- | --- |
 | **Name** | `en_engagement_spl_RoBERTa_base_attention` |
-| **Version** | `0.0.1` |
-| **spaCy** | `>=3.6.0,<3.7.0` |
+| **Version** | `0.0.2` |
+| **spaCy** | `>=3.4.4,<3.5.0` |
 | **Default Pipeline** | `transformer`, `parser`, `tagger`, `ner`, `attribute_ruler`, `lemmatizer`, `trainable_transformer`, `spancat` |
 | **Components** | `transformer`, `parser`, `tagger`, `ner`, `attribute_ruler`, `lemmatizer`, `trainable_transformer`, `spancat` |
 | **Vectors** | 0 keys, 0 unique vectors (0 dimensions) |
@@ -90,16 +90,16 @@ model-index:
 | `DEP_UAS` | 0.00 |
 | `DEP_LAS` | 0.00 |
 | `DEP_LAS_PER_TYPE` | 0.00 |
-| `SENTS_P` | 93.64 |
-| `SENTS_R` | 95.78 |
-| `SENTS_F` | 94.69 |
+| `SENTS_P` | 89.82 |
+| `SENTS_R` | 93.14 |
+| `SENTS_F` | 91.45 |
 | `TAG_ACC` | 0.00 |
 | `ENTS_F` | 0.00 |
 | `ENTS_P` | 0.00 |
 | `ENTS_R` | 0.00 |
 | `LEMMA_ACC` | 0.00 |
-| `SPANS_SC_F` | 77.65 |
-| `SPANS_SC_P` | 78.19 |
-| `SPANS_SC_R` | 77.12 |
-| `TRAINABLE_TRANSFORMER_LOSS` | 5917.79 |
-| `SPANCAT_LOSS` | 76188.74 |
+| `SPANS_SC_F` | 76.99 |
+| `SPANS_SC_P` | 77.84 |
+| `SPANS_SC_R` | 76.17 |
+| `TRAINABLE_TRANSFORMER_LOSS` | 482.53 |
+| `SPANCAT_LOSS` | 68571.83 |
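
For reference, a minimal usage sketch (not part of this commit): it assumes the wheel shipped in this repo has been pip-installed into an environment matching the spaCy pin above, and that engagement spans land under the `sc` spans key configured in `spancat/cfg`. The example sentence is illustrative only.

```python
import spacy

# Assumes: pip install en_engagement_spl_RoBERTa_base_attention-any-py3-none-any.whl
nlp = spacy.load("en_engagement_spl_RoBERTa_base_attention")
doc = nlp("The results may suggest that this approach is effective.")

# Engagement spans are stored under the "sc" spans key (see spancat/cfg).
for span in doc.spans["sc"]:
    print(span.label_, span.text, sep="\t")
```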
attribute_ruler/patterns CHANGED
Binary files a/attribute_ruler/patterns and b/attribute_ruler/patterns differ
 
config.cfg CHANGED
@@ -85,11 +85,16 @@ spans_key = ${vars.spans_key}
 threshold = 0.5
 
 [components.spancat.model]
-@architectures = "Attention_SpanCategorizer.v3"
+@architectures = "Attention_SpanCategorizer.v4"
+LSTMdepth = 1
+LSTMdropout = 0.0
+LSTMhidden = 200
 
 [components.spancat.model.reducer]
-@layers = "spacy.mean_max_reducer.v1"
+@layers = "mean_max_reducer.v1.5"
 hidden_size = 128
+dropout = 0.0
+depth = 1
 
 [components.spancat.model.scorer]
 @layers = "spacy.LinearLogistic.v1"
@@ -108,7 +113,6 @@ sizes = [1,2,3,4,5,6,7,8,9,10,11,12]
 
 [components.tagger]
 factory = "tagger"
-label_smoothing = 0.0
 neg_prefix = "!"
 overwrite = false
 scorer = {"@scorers":"spacy.tagger_scorer.v1"}
@@ -147,8 +151,8 @@ max_batch_items = 4096
 set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"}
 
 [components.transformer.model]
-name = "roberta-base"
 @architectures = "spacy-transformers.TransformerModel.v3"
+name = "roberta-base"
 mixed_precision = false
 
 [components.transformer.model.get_spans]
@@ -195,7 +199,6 @@ eval_frequency = 200
 frozen_components = ["transformer","parser","tagger","ner","attribute_ruler","lemmatizer"]
 annotating_components = ["parser"]
 before_to_disk = null
-before_update = null
 
 [training.batcher]
 @batchers = "spacy.batch_by_words.v1"
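
The new reducer entry `mean_max_reducer.v1.5` (with `hidden_size`, `dropout`, and `depth` settings) is a custom registered layer whose source is not part of this diff. A plausible sketch, assuming it extends spaCy's built-in `spacy.mean_max_reducer.v1` with configurable dropout and a stack of `depth` Maxout blocks:

```python
from spacy import registry
from thinc.api import (Maxout, Model, chain, clone, concatenate,
                       reduce_first, reduce_last, reduce_max, reduce_mean)
from thinc.types import Floats2d, Ragged


@registry.layers("mean_max_reducer.v1.5")  # name taken from config.cfg
def build_mean_max_reducer_v15(
    hidden_size: int, dropout: float = 0.0, depth: int = 1
) -> Model[Ragged, Floats2d]:
    """Pool each span's token vectors (first/last/mean/max), then map the
    concatenated vector through `depth` Maxout blocks with dropout."""
    return chain(
        concatenate(reduce_last(), reduce_first(), reduce_mean(), reduce_max()),
        clone(Maxout(nO=hidden_size, normalize=True, dropout=dropout), depth),
    )
```

With `dropout = 0.0` and `depth = 1`, as set in this config, the sketch reduces to the stock v1 reducer, which would make the new parameters backwards-compatible knobs rather than a behavioral change.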
custom_functions.py CHANGED
@@ -368,7 +368,9 @@ def build_spancat_LSTM_model(
     tok2vec: Model[List[Doc], List[Floats2d]],
     reducer: Model[Ragged, Floats2d],
     scorer: Model[Floats2d, Floats2d],
-) -> Model[Tuple[List[Doc], Ragged], Floats2d]:
+    LSTMdepth: int = 1,
+    LSTMdropout: float = 0.0,
+    LSTMhidden: int = 200) -> Model[Tuple[List[Doc], Ragged], Floats2d]:
     """Build a span categorizer model, given a token-to-vector model, a
     reducer model to map the sequence of vectors for each span down to a single
     vector, and a scorer model to map the vectors to probabilities.
@@ -383,17 +385,18 @@ def build_spancat_LSTM_model(
             0,
             chain(
                 tok2vec,
+                PyTorchLSTM(nI=768,
+                            nO=LSTMhidden,
+                            bi=True,
+                            depth=LSTMdepth,
+                            dropout=LSTMdropout),
                 cast(Model[List[Floats2d], Ragged], list2ragged()))))
 
-    attention_layer = chain(
-        ParametricAttention(nO = width),
-        list2ragged())
 
-
     model = chain(
         embedding,
-        attention_layer,
         extract_spans(),
+        ParametricAttention(nO = width),
         reducer,
         scorer,
     )
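
Pieced together, the v4 architecture inserts a bidirectional `PyTorchLSTM` into the embedding chain and moves `ParametricAttention` after `extract_spans`, so attention weights are computed within each candidate span rather than over the whole document. A self-contained sketch of the resulting function, assembled from the fragments above; the registry decorator name and the derivation of `width` (twice `LSTMhidden` for a bidirectional LSTM) are assumptions not shown in this diff:

```python
from typing import List, Tuple, cast

from spacy import registry
from spacy.ml.extract_spans import extract_spans
from spacy.tokens import Doc
from thinc.api import (Model, ParametricAttention, PyTorchLSTM, chain,
                       list2ragged, with_getitem)
from thinc.types import Floats2d, Ragged


@registry.architectures("Attention_SpanCategorizer.v4")  # assumed decorator
def build_spancat_LSTM_model(
    tok2vec: Model[List[Doc], List[Floats2d]],
    reducer: Model[Ragged, Floats2d],
    scorer: Model[Floats2d, Floats2d],
    LSTMdepth: int = 1,
    LSTMdropout: float = 0.0,
    LSTMhidden: int = 200,
) -> Model[Tuple[List[Doc], Ragged], Floats2d]:
    # Assumption: the bi-LSTM doubles the hidden size, so the attention
    # layer sees vectors of width 2 * LSTMhidden.
    width = 2 * LSTMhidden
    embedding = with_getitem(
        0,
        chain(
            tok2vec,
            # roberta-base outputs 768-d vectors; re-encode with a bi-LSTM.
            PyTorchLSTM(nI=768, nO=LSTMhidden, bi=True,
                        depth=LSTMdepth, dropout=LSTMdropout),
            cast(Model[List[Floats2d], Ragged], list2ragged()),
        ),
    )
    return chain(
        embedding,
        extract_spans(),                # one Ragged block per candidate span
        ParametricAttention(nO=width),  # weight tokens within each span
        reducer,                        # pool each span to a single vector
        scorer,                         # span vector -> label probabilities
    )
```

Note that v3 applied `ParametricAttention` before `extract_spans()`, inside the document-level embedding; v4's placement makes the attention distribution span-local, which is consistent with the renamed architecture and the retrained `spancat/model` weights below.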
en_engagement_spl_RoBERTa_base_attention-any-py3-none-any.whl CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a2b2dd5c15ced10fb287fd7b894d189522d585bdfe8bd743c2d26ffff2df02ae
-size 903886799
+oid sha256:d645450810f3d7ad0f9567642780fce0ede7d726fcbc5d62f0d48b7f39e80295
+size 903169587
meta.json CHANGED
@@ -1,14 +1,14 @@
 {
   "lang":"en",
   "name":"engagement_spl_RoBERTa_base_attention",
-  "version":"0.0.1",
+  "version":"0.0.2",
   "description":"",
   "author":"",
   "email":"",
   "url":"",
   "license":"",
-  "spacy_version":">=3.6.0,<3.7.0",
-  "spacy_git_version":"6fc153a26",
+  "spacy_version":">=3.4.4,<3.5.0",
+  "spacy_git_version":"77833bfef",
   "vectors":{
     "width":0,
     "vectors":0,
@@ -186,9 +186,9 @@
   "dep_uas":0.0,
   "dep_las":0.0,
   "dep_las_per_type":0.0,
-  "sents_p":0.936353211,
-  "sents_r":0.957771261,
-  "sents_f":0.9469411424,
+  "sents_p":0.8981900452,
+  "sents_r":0.9313782991,
+  "sents_f":0.9144831558,
   "tag_acc":0.0,
   "ents_f":0.0,
   "ents_p":0.0,
@@ -204,27 +204,22 @@
     "r":0.0,
     "f":0.0
   },
-  "PROCLAIM":{
-    "p":0.0,
-    "r":0.0,
-    "f":0.0
-  },
   "COUNTER":{
     "p":0.0,
     "r":0.0,
     "f":0.0
   },
-  "NORP":{
+  "PROCLAIM":{
     "p":0.0,
     "r":0.0,
     "f":0.0
   },
-  "FAC":{
+  "PERSON":{
     "p":0.0,
     "r":0.0,
     "f":0.0
   },
-  "PERSON":{
+  "FAC":{
     "p":0.0,
     "r":0.0,
     "f":0.0
@@ -274,22 +269,27 @@
     "r":0.0,
     "f":0.0
   },
-  "GPE":{
+  "LAW":{
     "p":0.0,
     "r":0.0,
     "f":0.0
   },
-  "QUANTITY":{
+  "NORP":{
     "p":0.0,
     "r":0.0,
     "f":0.0
   },
-  "WORK_OF_ART":{
+  "GPE":{
     "p":0.0,
     "r":0.0,
     "f":0.0
   },
-  "LAW":{
+  "QUANTITY":{
+    "p":0.0,
+    "r":0.0,
+    "f":0.0
+  },
+  "WORK_OF_ART":{
     "p":0.0,
     "r":0.0,
     "f":0.0
@@ -316,14 +316,14 @@
     }
   },
   "lemma_acc":0.0,
-  "spans_sc_f":0.7765328988,
-  "spans_sc_p":0.7819487179,
-  "spans_sc_r":0.7711915841,
-  "trainable_transformer_loss":59.1778603495,
-  "spancat_loss":761.8874162047
+  "spans_sc_f":0.7699386503,
+  "spans_sc_p":0.7783750258,
+  "spans_sc_r":0.7616831883,
+  "trainable_transformer_loss":4.8252773336,
+  "spancat_loss":685.7182957505
   },
   "requirements":[
-  "spacy-transformers>=1.2.5,<1.3.0",
-  "spacy-experimental>=0.6.4,<0.7.0"
+  "spacy-transformers>=1.1.8,<1.2.0",
+  "spacy-experimental>=0.6.1,<0.7.0"
   ]
 }
ner/model CHANGED
Binary files a/ner/model and b/ner/model differ
 
parser/model CHANGED
Binary files a/parser/model and b/parser/model differ
 
spancat/cfg CHANGED
@@ -13,7 +13,5 @@
   ],
   "spans_key":"sc",
   "threshold":0.5,
-  "max_positive":null,
-  "negative_weight":null,
-  "allow_overlap":true
+  "max_positive":null
 }
spancat/model CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:57f0eae0e1a723c68852e60899998d0ab54d3c5dd3a9605b47c87568f5e1c46f
-size 4731308
+oid sha256:a619bfcf65a09d7ae3530f875da94ff971cb9cc9541c75a46ba323935c6e99e5
+size 4025694
tagger/cfg CHANGED
@@ -1,5 +1,4 @@
 {
-  "label_smoothing":0.0,
   "labels":[
     "$",
     "''",
tagger/model CHANGED
Binary files a/tagger/model and b/tagger/model differ
 
tokenizer CHANGED
The diff for this file is too large to render.
 
trainable_transformer/model CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fe7414c3a1d546d9d1e5e579db20f3b41d324d07e083d16fff4de6861996948b
-size 502028268
+oid sha256:0563af895d09415d3a4bb71890938adf4e29777c45ff9d3ebf484677d74f8101
+size 502027816
transformer/model CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:bb9fc57108532fd3a96a6c638e5a4c55c19990e900eac50266b189f1c23da3bf
-size 502028329
+oid sha256:5cb2327f9fbf15d08de679e5cfd9b1ae1fc4c565d3f3438d8e627d85a3c577a3
+size 502027925
vocab/strings.json CHANGED
@@ -3384,7 +3384,6 @@
   "3500",
   "35087.38",
   "350Ms",
-  "350m",
   "350ms",
   "351",
   "351.2",
@@ -68892,6 +68891,7 @@
   "pennzoil",
   "pens",
   "pensacola",
+  "pensee",
   "pensees",
   "pension",
   "pensions",
@@ -86547,6 +86547,7 @@
   "wolves",
   "womack",
   "woman",
+  "womanize",
   "womanizing",
   "womanly",
   "womans",