ytsaig committed on
Commit
76162d8
1 Parent(s): 47a1d4b

Create readability pipeline.

Browse files
.ipynb_checkpoints/config-checkpoint.cfg ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [paths]
2
+ train = null
3
+ dev = null
4
+ vectors = null
5
+ init_tok2vec = null
6
+
7
+ [system]
8
+ seed = 0
9
+ gpu_allocator = null
10
+
11
+ [nlp]
12
+ lang = "en"
13
+ pipeline = ["tok2vec","tagger","parser","attribute_ruler","readability"]
14
+ disabled = []
15
+ before_creation = null
16
+ after_creation = null
17
+ after_pipeline_creation = null
18
+ batch_size = 1000
19
+ tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
20
+
21
+ [components]
22
+
23
+ [components.attribute_ruler]
24
+ factory = "attribute_ruler"
25
+ scorer = {"@scorers":"spacy.attribute_ruler_scorer.v1"}
26
+ validate = false
27
+
28
+ [components.parser]
29
+ factory = "parser"
30
+ learn_tokens = false
31
+ min_action_freq = 30
32
+ moves = null
33
+ scorer = {"@scorers":"spacy.parser_scorer.v1"}
34
+ update_with_oracle_cut_size = 100
35
+
36
+ [components.parser.model]
37
+ @architectures = "spacy.TransitionBasedParser.v2"
38
+ state_type = "parser"
39
+ extra_state_tokens = false
40
+ hidden_width = 64
41
+ maxout_pieces = 2
42
+ use_upper = true
43
+ nO = null
44
+
45
+ [components.parser.model.tok2vec]
46
+ @architectures = "spacy.Tok2VecListener.v1"
47
+ width = 96
48
+ upstream = "tok2vec"
49
+
50
+ [components.readability]
51
+ factory = "readability"
52
+
53
+ [components.tagger]
54
+ factory = "tagger"
55
+ neg_prefix = "!"
56
+ overwrite = false
57
+ scorer = {"@scorers":"spacy.tagger_scorer.v1"}
58
+
59
+ [components.tagger.model]
60
+ @architectures = "spacy.Tagger.v1"
61
+ nO = null
62
+
63
+ [components.tagger.model.tok2vec]
64
+ @architectures = "spacy.Tok2VecListener.v1"
65
+ width = 96
66
+ upstream = "tok2vec"
67
+
68
+ [components.tok2vec]
69
+ factory = "tok2vec"
70
+
71
+ [components.tok2vec.model]
72
+ @architectures = "spacy.Tok2Vec.v2"
73
+
74
+ [components.tok2vec.model.embed]
75
+ @architectures = "spacy.MultiHashEmbed.v2"
76
+ width = 96
77
+ attrs = ["NORM","PREFIX","SUFFIX","SHAPE","SPACY"]
78
+ rows = [5000,2500,2500,2500,100]
79
+ include_static_vectors = false
80
+
81
+ [components.tok2vec.model.encode]
82
+ @architectures = "spacy.MaxoutWindowEncoder.v2"
83
+ width = 96
84
+ depth = 4
85
+ window_size = 1
86
+ maxout_pieces = 3
87
+
88
+ [corpora]
89
+
90
+ [corpora.dev]
91
+ @readers = "spacy.Corpus.v1"
92
+ path = ${paths.dev}
93
+ gold_preproc = false
94
+ max_length = 0
95
+ limit = 0
96
+ augmenter = null
97
+
98
+ [corpora.train]
99
+ @readers = "spacy.Corpus.v1"
100
+ path = ${paths.train}
101
+ gold_preproc = false
102
+ max_length = 0
103
+ limit = 0
104
+ augmenter = null
105
+
106
+ [training]
107
+ seed = ${system.seed}
108
+ gpu_allocator = ${system.gpu_allocator}
109
+ dropout = 0.1
110
+ accumulate_gradient = 1
111
+ patience = 1600
112
+ max_epochs = 0
113
+ max_steps = 20000
114
+ eval_frequency = 200
115
+ frozen_components = []
116
+ annotating_components = []
117
+ dev_corpus = "corpora.dev"
118
+ train_corpus = "corpora.train"
119
+ before_to_disk = null
120
+
121
+ [training.batcher]
122
+ @batchers = "spacy.batch_by_words.v1"
123
+ discard_oversize = false
124
+ tolerance = 0.2
125
+ get_length = null
126
+
127
+ [training.batcher.size]
128
+ @schedules = "compounding.v1"
129
+ start = 100
130
+ stop = 1000
131
+ compound = 1.001
132
+ t = 0.0
133
+
134
+ [training.logger]
135
+ @loggers = "spacy.ConsoleLogger.v1"
136
+ progress_bar = false
137
+
138
+ [training.optimizer]
139
+ @optimizers = "Adam.v1"
140
+ beta1 = 0.9
141
+ beta2 = 0.999
142
+ L2_is_weight_decay = true
143
+ L2 = 0.01
144
+ grad_clip = 1.0
145
+ use_averages = false
146
+ eps = 0.00000001
147
+ learn_rate = 0.001
148
+
149
+ [training.score_weights]
150
+ tag_acc = 0.5
151
+ dep_uas = 0.25
152
+ dep_las = 0.25
153
+ dep_las_per_type = null
154
+ sents_p = null
155
+ sents_r = null
156
+ sents_f = 0.0
157
+
158
+ [pretraining]
159
+
160
+ [initialize]
161
+ vectors = ${paths.vectors}
162
+ init_tok2vec = ${paths.init_tok2vec}
163
+ vocab_data = null
164
+ lookups = null
165
+ before_init = null
166
+ after_init = null
167
+
168
+ [initialize.components]
169
+
170
+ [initialize.tokenizer]
README.md CHANGED
@@ -1,5 +1,4 @@
1
  ---
2
- license: other
3
  tags:
4
  - spacy
5
  - text-classification
@@ -12,7 +11,7 @@ A Spacy pipeline for generating readability scores
12
  | --- | --- |
13
  | **Name** | `en_readability` |
14
  | **Version** | `0.1` |
15
- | **spaCy** | `>=3.2.1,<3.3.0` |
16
  | **Default Pipeline** | `tok2vec`, `tagger`, `parser`, `attribute_ruler`, `readability` |
17
  | **Components** | `tok2vec`, `tagger`, `parser`, `attribute_ruler`, `readability` |
18
  | **Vectors** | 0 keys, 0 unique vectors (0 dimensions) |
@@ -24,11 +23,11 @@ A Spacy pipeline for generating readability scores
24
 
25
  <details>
26
 
27
- <summary>View label scheme (94 labels for 2 components)</summary>
28
 
29
  | Component | Labels |
30
  | --- | --- |
31
- | **`tagger`** | `$`, `''`, `,`, `-LRB-`, `-RRB-`, `.`, `:`, `ADD`, `AFX`, `CC`, `CD`, `DT`, `EX`, `FW`, `HYPH`, `IN`, `JJ`, `JJR`, `JJS`, `LS`, `MD`, `NFP`, `NN`, `NNP`, `NNPS`, `NNS`, `PDT`, `POS`, `PRP`, `PRP$`, `RB`, `RBR`, `RBS`, `RP`, `SYM`, `TO`, `UH`, `VB`, `VBD`, `VBG`, `VBN`, `VBP`, `VBZ`, `WDT`, `WP`, `WP$`, `WRB`, `XX`, ```` |
32
  | **`parser`** | `ROOT`, `acl`, `acomp`, `advcl`, `advmod`, `agent`, `amod`, `appos`, `attr`, `aux`, `auxpass`, `case`, `cc`, `ccomp`, `compound`, `conj`, `csubj`, `csubjpass`, `dative`, `dep`, `det`, `dobj`, `expl`, `intj`, `mark`, `meta`, `neg`, `nmod`, `npadvmod`, `nsubj`, `nsubjpass`, `nummod`, `oprd`, `parataxis`, `pcomp`, `pobj`, `poss`, `preconj`, `predet`, `prep`, `prt`, `punct`, `quantmod`, `relcl`, `xcomp` |
33
 
34
  </details>
1
  ---
 
2
  tags:
3
  - spacy
4
  - text-classification
11
  | --- | --- |
12
  | **Name** | `en_readability` |
13
  | **Version** | `0.1` |
14
+ | **spaCy** | `>=3.4.0,<3.5.0` |
15
  | **Default Pipeline** | `tok2vec`, `tagger`, `parser`, `attribute_ruler`, `readability` |
16
  | **Components** | `tok2vec`, `tagger`, `parser`, `attribute_ruler`, `readability` |
17
  | **Vectors** | 0 keys, 0 unique vectors (0 dimensions) |
23
 
24
  <details>
25
 
26
+ <summary>View label scheme (95 labels for 2 components)</summary>
27
 
28
  | Component | Labels |
29
  | --- | --- |
30
+ | **`tagger`** | `$`, `''`, `,`, `-LRB-`, `-RRB-`, `.`, `:`, `ADD`, `AFX`, `CC`, `CD`, `DT`, `EX`, `FW`, `HYPH`, `IN`, `JJ`, `JJR`, `JJS`, `LS`, `MD`, `NFP`, `NN`, `NNP`, `NNPS`, `NNS`, `PDT`, `POS`, `PRP`, `PRP$`, `RB`, `RBR`, `RBS`, `RP`, `SYM`, `TO`, `UH`, `VB`, `VBD`, `VBG`, `VBN`, `VBP`, `VBZ`, `WDT`, `WP`, `WP$`, `WRB`, `XX`, `_SP`, ```` |
31
  | **`parser`** | `ROOT`, `acl`, `acomp`, `advcl`, `advmod`, `agent`, `amod`, `appos`, `attr`, `aux`, `auxpass`, `case`, `cc`, `ccomp`, `compound`, `conj`, `csubj`, `csubjpass`, `dative`, `dep`, `det`, `dobj`, `expl`, `intj`, `mark`, `meta`, `neg`, `nmod`, `npadvmod`, `nsubj`, `nsubjpass`, `nummod`, `oprd`, `parataxis`, `pcomp`, `pobj`, `poss`, `preconj`, `predet`, `prep`, `prt`, `punct`, `quantmod`, `relcl`, `xcomp` |
32
 
33
  </details>
attribute_ruler/patterns CHANGED
Binary files a/attribute_ruler/patterns and b/attribute_ruler/patterns differ
config.cfg CHANGED
@@ -57,8 +57,9 @@ overwrite = false
57
  scorer = {"@scorers":"spacy.tagger_scorer.v1"}
58
 
59
  [components.tagger.model]
60
- @architectures = "spacy.Tagger.v1"
61
  nO = null
 
62
 
63
  [components.tagger.model.tok2vec]
64
  @architectures = "spacy.Tok2VecListener.v1"
@@ -75,7 +76,7 @@ factory = "tok2vec"
75
  @architectures = "spacy.MultiHashEmbed.v2"
76
  width = 96
77
  attrs = ["NORM","PREFIX","SUFFIX","SHAPE","SPACY"]
78
- rows = [5000,2500,2500,2500,100]
79
  include_static_vectors = false
80
 
81
  [components.tok2vec.model.encode]
57
  scorer = {"@scorers":"spacy.tagger_scorer.v1"}
58
 
59
  [components.tagger.model]
60
+ @architectures = "spacy.Tagger.v2"
61
  nO = null
62
+ normalize = false
63
 
64
  [components.tagger.model.tok2vec]
65
  @architectures = "spacy.Tok2VecListener.v1"
76
  @architectures = "spacy.MultiHashEmbed.v2"
77
  width = 96
78
  attrs = ["NORM","PREFIX","SUFFIX","SHAPE","SPACY"]
79
+ rows = [5000,1000,2500,2500,50]
80
  include_static_vectors = false
81
 
82
  [components.tok2vec.model.encode]
en_readability-any-py3-none-any.whl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1e565d770eb3af162c4bde937b46a628fc98d5094340b3543583ed4de1eb256d
3
- size 6873891
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:53adcc14fe186b430af7cda67efa72e3bf21f519ff28ce2eb9bf091cc27cbc16
3
+ size 6324285
meta.json CHANGED
@@ -7,8 +7,8 @@
7
  "email":"",
8
  "url":"www.valurank.com",
9
  "license":"",
10
- "spacy_version":">=3.2.1,<3.3.0",
11
- "spacy_git_version":"800737b41",
12
  "vectors":{
13
  "width":0,
14
  "vectors":0,
@@ -68,6 +68,7 @@
68
  "WP$",
69
  "WRB",
70
  "XX",
 
71
  "``"
72
  ],
73
  "parser":[
7
  "email":"",
8
  "url":"www.valurank.com",
9
  "license":"",
10
+ "spacy_version":">=3.4.0,<3.5.0",
11
+ "spacy_git_version":"d583626a8",
12
  "vectors":{
13
  "width":0,
14
  "vectors":0,
68
  "WP$",
69
  "WRB",
70
  "XX",
71
+ "_SP",
72
  "``"
73
  ],
74
  "parser":[
parser/model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b59392116d91c4703c8f57d775d471c14a788104c0491b0364ee00c1ed3dc906
3
  size 319909
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e80971fd38f1f20f11dabe644a485c6ef0846064256c7b2e929148a8b3ce6b97
3
  size 319909
parser/moves CHANGED
@@ -1,2 +1 @@
1
- ��moves�
2
- {"0":{"":995932},"1":{"":989662},"2":{"det":172430,"nsubj":165679,"compound":116803,"amod":106128,"aux":87078,"punct":65505,"advmod":62711,"poss":36427,"mark":27913,"nummod":22583,"auxpass":15597,"prep":13989,"nsubjpass":13867,"neg":12358,"cc":10694,"nmod":9572,"advcl":9063,"npadvmod":8135,"quantmod":7071,"intj":6557,"ccomp":5899,"dobj":3427,"expl":3360,"dep":3191,"predet":1945,"parataxis":1826,"csubj":1431,"preconj":620,"pobj||prep":615,"attr":578,"meta":448,"advmod||conj":367,"dobj||xcomp":352,"acomp":284,"nsubj||ccomp":224,"dative":206,"advmod||xcomp":149,"dobj||ccomp":70,"csubjpass":64,"dobj||conj":62,"prep||conj":51,"acl":48,"prep||nsubj":41,"prep||dobj":36,"xcomp":34,"advmod||ccomp":32,"oprd":31},"3":{"punct":183437,"pobj":182256,"prep":173845,"dobj":89650,"conj":59689,"cc":51858,"ccomp":30404,"advmod":22820,"xcomp":21045,"relcl":20968,"advcl":19833,"attr":17739,"acomp":16824,"appos":14963,"case":13361,"acl":12091,"pcomp":10345,"npadvmod":9702,"prt":8179,"agent":3884,"dative":3867,"nsubj":3465,"intj":2898,"neg":2871,"amod":2843,"nummod":2510,"oprd":2304,"dep":1518,"parataxis":1261,"quantmod":317,"nmod":296,"acl||dobj":202,"prep||dobj":190,"prep||nsubj":162,"acl||nsubj":159,"appos||nsubj":145,"relcl||dobj":134,"relcl||nsubj":111,"aux":103,"expl":96,"meta":93,"appos||dobj":86,"preconj":71,"csubj":65,"prep||nsubjpass":55,"prep||advmod":54,"prep||acomp":53,"det":51,"nsubjpass":45,"acl||nsubjpass":42,"relcl||pobj":41,"mark":40,"auxpass":39,"prep||pobj":36,"relcl||nsubjpass":32,"appos||nsubjpass":31},"4":{"ROOT":110979}}�cfg��neg_key�
1
+ ��moves� {"0":{"":994332},"1":{"":999432},"2":{"det":172595,"nsubj":165748,"compound":116623,"amod":105184,"aux":86667,"punct":65478,"advmod":62763,"poss":36443,"mark":27941,"nummod":22598,"auxpass":15594,"prep":14001,"nsubjpass":13856,"neg":12357,"cc":10739,"nmod":9562,"advcl":9062,"npadvmod":8168,"quantmod":7101,"intj":6464,"ccomp":5896,"dobj":3427,"expl":3360,"dep":2871,"predet":1944,"parataxis":1837,"csubj":1428,"preconj":621,"pobj||prep":616,"attr":578,"meta":376,"advmod||conj":368,"dobj||xcomp":352,"acomp":284,"nsubj||ccomp":224,"dative":206,"advmod||xcomp":149,"dobj||ccomp":70,"csubjpass":64,"dobj||conj":62,"prep||conj":51,"acl":48,"prep||nsubj":41,"prep||dobj":36,"xcomp":34,"advmod||ccomp":32,"oprd":31},"3":{"punct":183790,"pobj":182191,"prep":174008,"dobj":89615,"conj":59687,"cc":51930,"ccomp":30385,"advmod":22861,"xcomp":21021,"relcl":20969,"advcl":19828,"attr":17741,"acomp":16922,"appos":15265,"case":13388,"acl":12085,"pcomp":10324,"dep":10116,"npadvmod":9796,"prt":8179,"agent":3903,"dative":3866,"nsubj":3470,"neg":2906,"amod":2839,"intj":2819,"nummod":2732,"oprd":2301,"parataxis":1261,"quantmod":319,"nmod":294,"acl||dobj":200,"prep||dobj":190,"prep||nsubj":162,"acl||nsubj":159,"appos||nsubj":145,"relcl||dobj":134,"relcl||nsubj":111,"aux":103,"expl":96,"meta":92,"appos||dobj":86,"preconj":71,"csubj":65,"prep||nsubjpass":55,"prep||advmod":54,"prep||acomp":53,"det":51,"nsubjpass":45,"relcl||pobj":42,"acl||nsubjpass":42,"mark":40,"auxpass":39,"prep||pobj":36,"relcl||nsubjpass":32,"appos||nsubjpass":31},"4":{"ROOT":111664}}�cfg��neg_key�
 
tagger/cfg CHANGED
@@ -48,6 +48,7 @@
48
  "WP$",
49
  "WRB",
50
  "XX",
 
51
  "``"
52
  ],
53
  "neg_prefix":"!",
48
  "WP$",
49
  "WRB",
50
  "XX",
51
+ "_SP",
52
  "``"
53
  ],
54
  "neg_prefix":"!",
tagger/model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ba998456472a6669d24e643e1924f24daeb15da1b5eb985abae8c3e787162364
3
- size 19389
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d62054e74f89be08b720157a45ddf3a5a5a9e8c51f191cdea364e390c0032d7e
3
+ size 19829
tok2vec/model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8f529c77a3dd14a7d13e59bf848a8606f40a5d45928e324b8e3a1ecca98d14ce
3
- size 6734429
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6967e88ec7b0680d94a75500c46fe19a1b1e01ef5f608a58826077e45af5010d
3
+ size 6139229
tokenizer CHANGED
The diff for this file is too large to render. See raw diff
vocab/strings.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3d478b0a3e9b0a537949cb4949b3690251e99851da80eb192a01c7cc9496db90
3
- size 1089573
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b2696502155e027d7e26609065b911a03ee6c5004b150fa989e2d03a3ca4338
3
+ size 1104000