hiroshi-matsuda-rit committed
Commit 776c495
1 Parent(s): 7e3cc92

Update spaCy pipeline

.gitattributes CHANGED
@@ -14,3 +14,7 @@
  *.pb filter=lfs diff=lfs merge=lfs -text
  *.pt filter=lfs diff=lfs merge=lfs -text
  *.pth filter=lfs diff=lfs merge=lfs -text
+ *.whl filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *strings.json filter=lfs diff=lfs merge=lfs -text
+ vectors filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,101 @@
+ ---
+ tags:
+ - spacy
+ - token-classification
+ language:
+ - ja
+ license: CC-BY-SA-4.0
+ model-index:
+ - name: ja_gsd_bert_wwm_unidic_lite
+   results:
+   - tasks:
+       name: NER
+       type: token-classification
+     metrics:
+     - name: Precision
+       type: precision
+       value: 0.8496143959
+     - name: Recall
+       type: recall
+       value: 0.8314465409
+     - name: F Score
+       type: f_score
+       value: 0.840432295
+   - tasks:
+       name: POS
+       type: token-classification
+     metrics:
+     - name: Accuracy
+       type: accuracy
+       value: 0.0
+   - tasks:
+       name: SENTER
+       type: token-classification
+     metrics:
+     - name: Precision
+       type: precision
+       value: 0.9201520913
+     - name: Recall
+       type: recall
+       value: 0.9546351085
+     - name: F Score
+       type: f_score
+       value: 0.9370764763
+   - tasks:
+       name: UNLABELED_DEPENDENCIES
+       type: token-classification
+     metrics:
+     - name: Accuracy
+       type: accuracy
+       value: 0.9367795389
+   - tasks:
+       name: LABELED_DEPENDENCIES
+       type: token-classification
+     metrics:
+     - name: Accuracy
+       type: accuracy
+       value: 0.9367795389
+ ---
+ Japanese transformer pipeline (bert-base). Components: transformer, parser, ner.
+
+ | Feature | Description |
+ | --- | --- |
+ | **Name** | `ja_gsd_bert_wwm_unidic_lite` |
+ | **Version** | `3.1.0` |
+ | **spaCy** | `>=3.1.0,<3.2.0` |
+ | **Default Pipeline** | `transformer`, `parser`, `ner` |
+ | **Components** | `transformer`, `parser`, `ner` |
+ | **Vectors** | 0 keys, 0 unique vectors (0 dimensions) |
+ | **Sources** | [UD_Japanese-GSD](https://github.com/UniversalDependencies/UD_Japanese-GSD)<br />[UD_Japanese-GSD r2.8+NE](https://github.com/megagonlabs/UD_Japanese-GSD/releases/tag/r2.8-NE)<br />[SudachiDict_core](https://github.com/WorksApplications/SudachiDict)<br />[cl-tohoku/bert-base-japanese-whole-word-masking](https://huggingface.co/cl-tohoku/bert-base-japanese-whole-word-masking)<br />[unidic_lite](https://github.com/polm/unidic-lite) |
+ | **License** | `CC BY-SA 4.0` |
+ | **Author** | [Megagon Labs Tokyo.](https://github.com/megagonlabs/UD_japanese_GSD) |
+
+ ### Label Scheme
+
+ <details>
+
+ <summary>View label scheme (45 labels for 2 components)</summary>
+
+ | Component | Labels |
+ | --- | --- |
+ | **`parser`** | `ROOT`, `acl`, `advcl`, `advmod`, `amod`, `aux`, `case`, `cc`, `ccomp`, `compound`, `cop`, `csubj`, `dep`, `det`, `dislocated`, `fixed`, `mark`, `nmod`, `nsubj`, `nummod`, `obj`, `obl`, `punct` |
+ | **`ner`** | `CARDINAL`, `DATE`, `EVENT`, `FAC`, `GPE`, `LANGUAGE`, `LAW`, `LOC`, `MONEY`, `MOVEMENT`, `NORP`, `ORDINAL`, `ORG`, `PERCENT`, `PERSON`, `PET_NAME`, `PHONE`, `PRODUCT`, `QUANTITY`, `TIME`, `TITLE_AFFIX`, `WORK_OF_ART` |
+
+ </details>
+
+ ### Accuracy
+
+ | Type | Score |
+ | --- | --- |
+ | `DEP_UAS` | 93.68 |
+ | `DEP_LAS` | 92.61 |
+ | `SENTS_P` | 92.02 |
+ | `SENTS_R` | 95.46 |
+ | `SENTS_F` | 93.71 |
+ | `ENTS_F` | 84.04 |
+ | `ENTS_P` | 84.96 |
+ | `ENTS_R` | 83.14 |
+ | `TAG_ACC` | 0.00 |
+ | `TRANSFORMER_LOSS` | 28861.67 |
+ | `PARSER_LOSS` | 1306248.63 |
+ | `NER_LOSS` | 13993.36 |
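
The card above documents a packaged spaCy pipeline, so it can be loaded by name once installed. A minimal usage sketch, assuming the wheel shipped in this commit has been installed into an environment that satisfies the `spacy>=3.1.0,<3.2.0` and `spacy-transformers` requirements (the sample sentence is illustrative only):

```python
# Minimal usage sketch; assumes:
#   pip install ./ja_gsd_bert_wwm_unidic_lite-any-py3-none-any.whl
import spacy

nlp = spacy.load("ja_gsd_bert_wwm_unidic_lite")  # transformer, parser, ner

doc = nlp("銀座でランチをご一緒しましょう。")  # illustrative input
for token in doc:
    print(token.text, token.dep_, token.head.text)  # dependency labels from the parser
for ent in doc.ents:
    print(ent.text, ent.label_)  # entity labels such as GPE, DATE, PERSON
```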
config.cfg ADDED
@@ -0,0 +1,177 @@
+ [paths]
+ train = "corpus/ja_gsd-ud-train.ne.spacy"
+ dev = "corpus/ja_gsd-ud-dev.ne.spacy"
+ vectors = null
+ init_tok2vec = null
+
+ [system]
+ gpu_allocator = "pytorch"
+ seed = 0
+
+ [nlp]
+ lang = "ja"
+ pipeline = ["transformer","parser","ner"]
+ batch_size = 128
+ disabled = []
+ before_creation = null
+ after_creation = null
+ after_pipeline_creation = null
+
+ [nlp.tokenizer]
+ @tokenizers = "spacy.ja.JapaneseTokenizer"
+ split_mode = "A"
+
+ [components]
+
+ [components.ner]
+ factory = "ner"
+ incorrect_spans_key = null
+ moves = null
+ update_with_oracle_cut_size = 100
+
+ [components.ner.model]
+ @architectures = "spacy.TransitionBasedParser.v2"
+ state_type = "ner"
+ extra_state_tokens = false
+ hidden_width = 64
+ maxout_pieces = 2
+ use_upper = false
+ nO = null
+
+ [components.ner.model.tok2vec]
+ @architectures = "spacy-transformers.TransformerListener.v1"
+ grad_factor = 1.0
+ pooling = {"@layers":"reduce_mean.v1"}
+ upstream = "*"
+
+ [components.parser]
+ factory = "parser"
+ learn_tokens = false
+ min_action_freq = 30
+ moves = null
+ update_with_oracle_cut_size = 100
+
+ [components.parser.model]
+ @architectures = "spacy.TransitionBasedParser.v2"
+ state_type = "parser"
+ extra_state_tokens = false
+ hidden_width = 128
+ maxout_pieces = 3
+ use_upper = false
+ nO = null
+
+ [components.parser.model.tok2vec]
+ @architectures = "spacy-transformers.TransformerListener.v1"
+ grad_factor = 1.0
+ pooling = {"@layers":"reduce_mean.v1"}
+ upstream = "*"
+
+ [components.transformer]
+ factory = "transformer"
+ max_batch_items = 4096
+ set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"}
+
+ [components.transformer.model]
+ @architectures = "spacy-transformers.TransformerModel.v1"
+ name = "cl-tohoku/bert-base-japanese-whole-word-masking"
+
+ [components.transformer.model.get_spans]
+ @span_getters = "spacy-transformers.strided_spans.v1"
+ window = 128
+ stride = 96
+
+ [components.transformer.model.tokenizer_config]
+ use_fast = false
+
+ [components.transformer.model.tokenizer_config.mecab_kwargs]
+ mecab_dic = "unidic_lite"
+
+ [corpora]
+
+ [corpora.dev]
+ @readers = "spacy.Corpus.v1"
+ path = ${paths.dev}
+ max_length = 0
+ gold_preproc = false
+ limit = 0
+ augmenter = null
+
+ [corpora.train]
+ @readers = "spacy.Corpus.v1"
+ path = ${paths.train}
+ max_length = 500
+ gold_preproc = false
+ limit = 0
+ augmenter = null
+
+ [training]
+ accumulate_gradient = 3
+ dev_corpus = "corpora.dev"
+ train_corpus = "corpora.train"
+ seed = ${system.seed}
+ gpu_allocator = ${system.gpu_allocator}
+ dropout = 0.1
+ patience = 0
+ max_epochs = 0
+ max_steps = 20000
+ eval_frequency = 200
+ frozen_components = []
+ before_to_disk = null
+ annotating_components = []
+
+ [training.batcher]
+ @batchers = "spacy.batch_by_padded.v1"
+ discard_oversize = true
+ size = 2000
+ buffer = 256
+ get_length = null
+
+ [training.logger]
+ @loggers = "spacy.WandbLogger.v2"
+ project_name = "ja_spacy_bert_wwm_unidic_lite"
+ remove_config_values = ["paths.train","paths.dev","corpora.train.path","corpora.dev.path"]
+ log_dataset_dir = "./corpus"
+ model_log_interval = 200
+
+ [training.optimizer]
+ @optimizers = "Adam.v1"
+ beta1 = 0.9
+ beta2 = 0.999
+ L2_is_weight_decay = true
+ L2 = 0.01
+ grad_clip = 1.0
+ use_averages = false
+ eps = 0.00000001
+
+ [training.optimizer.learn_rate]
+ @schedules = "warmup_linear.v1"
+ warmup_steps = 250
+ total_steps = 20000
+ initial_rate = 0.00005
+
+ [training.score_weights]
+ dep_uas = 0.17
+ dep_las = 0.17
+ dep_las_per_type = null
+ sents_p = null
+ sents_r = null
+ sents_f = 0.0
+ ents_f = 0.33
+ ents_p = 0.0
+ ents_r = 0.0
+ ents_per_type = null
+ tag_acc = 0.33
+
+ [pretraining]
+
+ [initialize]
+ vectors = null
+ init_tok2vec = ${paths.init_tok2vec}
+ vocab_data = null
+ lookups = null
+ before_init = null
+ after_init = null
+
+ [initialize.components]
+
+ [initialize.tokenizer]
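
The file above is a standard spaCy v3 training config, so it can be inspected programmatically before retraining. A small sketch, assuming a matching spaCy install and that the corpora referenced under `[paths]` exist locally:

```python
# Sketch: inspect the committed training config.
from spacy import util

config = util.load_config("config.cfg", interpolate=False)
print(config["nlp"]["pipeline"])  # ['transformer', 'parser', 'ner']
print(config["components"]["transformer"]["model"]["name"])  # cl-tohoku BERT base

# Retraining is normally run from the CLI, e.g.:
#   python -m spacy train config.cfg --output training/ --gpu-id 0
```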
ja_gsd_bert_wwm_unidic_lite-any-py3-none-any.whl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ee49380fb404214178a630941a52c9577cf26e74721a04fc9c54bf8abb7e6a66
+ size 412051652
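
The wheel itself is stored via Git LFS. Once downloaded and pip-installed, the pipeline can also be loaded through the package's own `load()` helper, the conventional entry point for spaCy model packages (a sketch, not specific to this repo beyond the package name):

```python
# Sketch: load via the installed package instead of spacy.load().
# Assumes: pip install ./ja_gsd_bert_wwm_unidic_lite-any-py3-none-any.whl
import ja_gsd_bert_wwm_unidic_lite

nlp = ja_gsd_bert_wwm_unidic_lite.load()
print(nlp.pipe_names)  # ['transformer', 'parser', 'ner']
```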
meta.json ADDED
@@ -0,0 +1,346 @@
+ {
+   "lang":"ja",
+   "name":"gsd_bert_wwm_unidic_lite",
+   "version":"3.1.0",
+   "description":"Japanese transformer pipeline (bert-base). Components: transformer, parser, ner.",
+   "author":"Megagon Labs Tokyo.",
+   "email":"ginza@megagon.ai",
+   "url":"https://github.com/megagonlabs/UD_japanese_GSD",
+   "license":"CC BY-SA 4.0",
+   "spacy_version":">=3.1.0,<3.2.0",
+   "spacy_git_version":"530b5d72f",
+   "vectors":{
+     "width":0,
+     "vectors":0,
+     "keys":0,
+     "name":null
+   },
+   "labels":{
+     "transformer":[
+
+     ],
+     "parser":[
+       "ROOT",
+       "acl",
+       "advcl",
+       "advmod",
+       "amod",
+       "aux",
+       "case",
+       "cc",
+       "ccomp",
+       "compound",
+       "cop",
+       "csubj",
+       "dep",
+       "det",
+       "dislocated",
+       "fixed",
+       "mark",
+       "nmod",
+       "nsubj",
+       "nummod",
+       "obj",
+       "obl",
+       "punct"
+     ],
+     "ner":[
+       "CARDINAL",
+       "DATE",
+       "EVENT",
+       "FAC",
+       "GPE",
+       "LANGUAGE",
+       "LAW",
+       "LOC",
+       "MONEY",
+       "MOVEMENT",
+       "NORP",
+       "ORDINAL",
+       "ORG",
+       "PERCENT",
+       "PERSON",
+       "PET_NAME",
+       "PHONE",
+       "PRODUCT",
+       "QUANTITY",
+       "TIME",
+       "TITLE_AFFIX",
+       "WORK_OF_ART"
+     ]
+   },
+   "pipeline":[
+     "transformer",
+     "parser",
+     "ner"
+   ],
+   "components":[
+     "transformer",
+     "parser",
+     "ner"
+   ],
+   "disabled":[
+
+   ],
+   "performance":{
+     "dep_uas":0.9367795389,
+     "dep_las":0.926075995,
+     "dep_las_per_type":{
+       "cc":{
+         "p":0.8863636364,
+         "r":0.8125,
+         "f":0.847826087
+       },
+       "compound":{
+         "p":0.9503214494,
+         "r":0.916572717,
+         "f":0.9331420373
+       },
+       "obl":{
+         "p":0.8710493047,
+         "r":0.8601747815,
+         "f":0.8655778894
+       },
+       "case":{
+         "p":0.9870030581,
+         "r":0.9810030395,
+         "f":0.9839939024
+       },
+       "dislocated":{
+         "p":0.6,
+         "r":0.6923076923,
+         "f":0.6428571429
+       },
+       "nsubj":{
+         "p":0.8875739645,
+         "r":0.8637236084,
+         "f":0.8754863813
+       },
+       "nmod":{
+         "p":0.9405063291,
+         "r":0.869005848,
+         "f":0.903343465
+       },
+       "root":{
+         "p":0.9119850187,
+         "r":0.9605522682,
+         "f":0.9356388088
+       },
+       "aux":{
+         "p":0.9766573296,
+         "r":0.9712163417,
+         "f":0.9739292365
+       },
+       "advcl":{
+         "p":0.7825112108,
+         "r":0.7842696629,
+         "f":0.7833894501
+       },
+       "mark":{
+         "p":0.979757085,
+         "r":0.968,
+         "f":0.9738430584
+       },
+       "fixed":{
+         "p":0.9694793537,
+         "r":0.9818181818,
+         "f":0.9756097561
+       },
+       "acl":{
+         "p":0.8984198646,
+         "r":0.8747252747,
+         "f":0.8864142539
+       },
+       "obj":{
+         "p":0.9541284404,
+         "r":0.9425981873,
+         "f":0.9483282675
+       },
+       "nummod":{
+         "p":0.9934640523,
+         "r":0.899408284,
+         "f":0.9440993789
+       },
+       "advmod":{
+         "p":0.7647058824,
+         "r":0.7428571429,
+         "f":0.7536231884
+       },
+       "amod":{
+         "p":0.935483871,
+         "r":0.7837837838,
+         "f":0.8529411765
+       },
+       "cop":{
+         "p":0.9759036145,
+         "r":0.9418604651,
+         "f":0.9585798817
+       },
+       "ccomp":{
+         "p":0.9523809524,
+         "r":0.9090909091,
+         "f":0.9302325581
+       },
+       "det":{
+         "p":1.0,
+         "r":0.9811320755,
+         "f":0.9904761905
+       },
+       "csubj":{
+         "p":0.7142857143,
+         "r":0.8333333333,
+         "f":0.7692307692
+       },
+       "dep":{
+         "p":0.4,
+         "r":0.2857142857,
+         "f":0.3333333333
+       }
+     },
+     "sents_p":0.9201520913,
+     "sents_r":0.9546351085,
+     "sents_f":0.9370764763,
+     "ents_f":0.840432295,
+     "ents_p":0.8496143959,
+     "ents_r":0.8314465409,
+     "ents_per_type":{
+       "DATE":{
+         "p":0.9814814815,
+         "r":0.9724770642,
+         "f":0.9769585253
+       },
+       "ORG":{
+         "p":0.78125,
+         "r":0.7299270073,
+         "f":0.7547169811
+       },
+       "TITLE_AFFIX":{
+         "p":0.8518518519,
+         "r":0.7666666667,
+         "f":0.8070175439
+       },
+       "PERSON":{
+         "p":0.9333333333,
+         "r":0.9064748201,
+         "f":0.9197080292
+       },
+       "GPE":{
+         "p":0.780952381,
+         "r":0.8723404255,
+         "f":0.824120603
+       },
+       "PRODUCT":{
+         "p":0.6285714286,
+         "r":0.5238095238,
+         "f":0.5714285714
+       },
+       "TIME":{
+         "p":0.6666666667,
+         "r":1.0,
+         "f":0.8
+       },
+       "QUANTITY":{
+         "p":0.8648648649,
+         "r":0.9696969697,
+         "f":0.9142857143
+       },
+       "NORP":{
+         "p":0.8846153846,
+         "r":0.71875,
+         "f":0.7931034483
+       },
+       "ORDINAL":{
+         "p":0.7391304348,
+         "r":0.7727272727,
+         "f":0.7555555556
+       },
+       "WORK_OF_ART":{
+         "p":0.6666666667,
+         "r":0.7058823529,
+         "f":0.6857142857
+       },
+       "PERCENT":{
+         "p":1.0,
+         "r":0.5714285714,
+         "f":0.7272727273
+       },
+       "CARDINAL":{
+         "p":0.0,
+         "r":0.0,
+         "f":0.0
+       },
+       "EVENT":{
+         "p":0.9565217391,
+         "r":0.8461538462,
+         "f":0.8979591837
+       },
+       "FAC":{
+         "p":0.8055555556,
+         "r":0.7837837838,
+         "f":0.7945205479
+       },
+       "LOC":{
+         "p":0.8888888889,
+         "r":0.8,
+         "f":0.8421052632
+       },
+       "MOVEMENT":{
+         "p":0.6,
+         "r":0.6,
+         "f":0.6
+       },
+       "LAW":{
+         "p":1.0,
+         "r":1.0,
+         "f":1.0
+       },
+       "MONEY":{
+         "p":1.0,
+         "r":1.0,
+         "f":1.0
+       },
+       "LANGUAGE":{
+         "p":1.0,
+         "r":1.0,
+         "f":1.0
+       }
+     },
+     "tag_acc":0.0,
+     "transformer_loss":288.6167381342,
+     "parser_loss":13062.4862750822,
+     "ner_loss":139.9335659693
+   },
+   "sources":[
+     {
+       "name":"UD_Japanese-GSD",
+       "url":"https://github.com/UniversalDependencies/UD_Japanese-GSD",
+       "license":"CC BY-SA 4.0"
+     },
+     {
+       "name":"UD_Japanese-GSD r2.8+NE",
+       "url":"https://github.com/megagonlabs/UD_Japanese-GSD/releases/tag/r2.8-NE",
+       "license":"CC BY-SA 4.0"
+     },
+     {
+       "name":"SudachiDict_core",
+       "url":"https://github.com/WorksApplications/SudachiDict",
+       "license":"Apache License 2.0"
+     },
+     {
+       "name":"cl-tohoku/bert-base-japanese-whole-word-masking",
+       "url":"https://huggingface.co/cl-tohoku/bert-base-japanese-whole-word-masking",
+       "license":"CC BY-SA 3.0"
+     },
+     {
+       "name":"unidic_lite",
+       "url":"https://github.com/polm/unidic-lite",
+       "license":"individually defined open software license"
+     }
+   ],
+   "parent_package":"spacy",
+   "requirements":[
+     "sudachipy>=0.5.2",
+     "sudachidict_core>=20210608",
+     "spacy-transformers>=1.0.2,<1.1.0"
+   ]
+ }
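
meta.json is the machine-readable counterpart of the model card: the same scores, sources, and requirements. A small sketch for reading it with the standard library (assumes the repository is checked out locally):

```python
# Sketch: pull headline scores and requirements out of meta.json.
import json

with open("meta.json", encoding="utf-8") as f:
    meta = json.load(f)

perf = meta["performance"]
print(round(perf["dep_uas"] * 100, 2))  # 93.68, matching DEP_UAS in the README table
print(round(perf["ents_f"] * 100, 2))   # 84.04, matching ENTS_F
print(meta["requirements"])             # sudachipy / sudachidict_core / spacy-transformers pins
```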
ner/cfg ADDED
@@ -0,0 +1,13 @@
+ {
+   "moves":null,
+   "update_with_oracle_cut_size":100,
+   "multitasks":[
+
+   ],
+   "min_action_freq":1,
+   "learn_tokens":false,
+   "beam_width":1,
+   "beam_density":0.0,
+   "beam_update_prob":0.0,
+   "incorrect_spans_key":null
+ }
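
These are the settings the packaged `ner` component carries at runtime; they can be read back from a loaded pipeline. A sketch, assuming the pipeline from this repo is installed:

```python
# Sketch: inspect the ner component of the installed pipeline.
import spacy

nlp = spacy.load("ja_gsd_bert_wwm_unidic_lite")
ner = nlp.get_pipe("ner")
print(ner.labels)                  # ('CARDINAL', 'DATE', ..., 'WORK_OF_ART')
print(nlp.get_pipe_config("ner"))  # factory settings mirroring this cfg file
```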
ner/model ADDED
Binary file (339 kB).
ner/moves ADDED
@@ -0,0 +1 @@
+ ��moves��{"0":{},"1":{"DATE":4200,"ORG":3487,"PERSON":3042,"QUANTITY":2519,"GPE":1953,"PRODUCT":1328,"FAC":1243,"ORDINAL":1114,"WORK_OF_ART":1053,"EVENT":869,"NORP":734,"LOC":563,"MONEY":400,"TITLE_AFFIX":344,"TIME":300,"PERCENT":274,"MOVEMENT":148,"LAW":94,"LANGUAGE":82,"CARDINAL":27,"PET_NAME":20,"PHONE":4},"2":{"DATE":4200,"ORG":3487,"PERSON":3042,"QUANTITY":2519,"GPE":1953,"PRODUCT":1328,"FAC":1243,"ORDINAL":1114,"WORK_OF_ART":1053,"EVENT":869,"NORP":734,"LOC":563,"MONEY":400,"TITLE_AFFIX":344,"TIME":300,"PERCENT":274,"MOVEMENT":148,"LAW":94,"LANGUAGE":82,"CARDINAL":27,"PET_NAME":20,"PHONE":4},"3":{"DATE":4200,"ORG":3487,"PERSON":3042,"QUANTITY":2519,"GPE":1953,"PRODUCT":1328,"FAC":1243,"ORDINAL":1114,"WORK_OF_ART":1053,"EVENT":869,"NORP":734,"LOC":563,"MONEY":400,"TITLE_AFFIX":344,"TIME":300,"PERCENT":274,"MOVEMENT":148,"LAW":94,"LANGUAGE":82,"CARDINAL":27,"PET_NAME":20,"PHONE":4},"4":{"DATE":4200,"ORG":3487,"PERSON":3042,"QUANTITY":2519,"GPE":1953,"PRODUCT":1328,"FAC":1243,"ORDINAL":1114,"WORK_OF_ART":1053,"EVENT":869,"NORP":734,"LOC":563,"MONEY":400,"TITLE_AFFIX":344,"TIME":300,"PERCENT":274,"MOVEMENT":148,"LAW":94,"LANGUAGE":82,"CARDINAL":27,"PET_NAME":20,"PHONE":4,"":1},"5":{"":1}}�cfg��neg_key�
parser/cfg ADDED
@@ -0,0 +1,13 @@
+ {
+   "moves":null,
+   "update_with_oracle_cut_size":100,
+   "multitasks":[
+
+   ],
+   "min_action_freq":30,
+   "learn_tokens":false,
+   "beam_width":1,
+   "beam_density":0.0,
+   "beam_update_prob":0.0,
+   "incorrect_spans_key":null
+ }
parser/model ADDED
Binary file (754 kB).
parser/moves ADDED
@@ -0,0 +1 @@
+ ��moves�~{"0":{"":75051},"1":{"":81581},"2":{"compound":22178,"nmod":11296,"obl":10522,"nsubj":6649,"acl":6185,"advcl":5956,"obj":4364,"nummod":2247,"advmod":1841,"punct":1169,"det":822,"cc":699,"amod":357,"ccomp":335,"dislocated":233,"csubj":139,"dep":0},"3":{"case":35390,"punct":15051,"aux":14506,"fixed":7377,"mark":6390,"cop":2079,"compound":542,"advcl":148,"dep":56},"4":{"ROOT":6810}}�cfg��neg_key�
tokenizer/cfg ADDED
@@ -0,0 +1,3 @@
+ {
+   "split_mode":"A"
+ }
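
`split_mode` is the SudachiPy setting used by spaCy's Japanese tokenizer; `"A"` selects the shortest analysis units. A sketch of configuring the same mode on a blank Japanese pipeline, following the pattern in spaCy's Japanese language docs (the sample word is illustrative):

```python
# Sketch: blank Japanese pipeline with the same Sudachi split mode ("A").
from spacy.lang.ja import Japanese

nlp = Japanese.from_config({"nlp": {"tokenizer": {"split_mode": "A"}}})
print([t.text for t in nlp("選挙管理委員会")])  # mode A yields the shortest units
```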
transformer/cfg ADDED
@@ -0,0 +1,3 @@
+ {
+   "max_batch_items":4096
+ }
transformer/model/config.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "_name_or_path": "cl-tohoku/bert-base-japanese-whole-word-masking",
+   "architectures": [
+     "BertForMaskedLM"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "position_embedding_type": "absolute",
+   "tokenizer_class": "BertJapaneseTokenizer",
+   "transformers_version": "4.6.1",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "vocab_size": 32000
+ }
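
This is the stock configuration of `cl-tohoku/bert-base-japanese-whole-word-masking` (BERT base: 12 layers, hidden size 768, 32k WordPiece vocab) with `BertJapaneseTokenizer`. Inside spaCy it is driven by spacy-transformers, but the same encoder can be loaded standalone. A sketch, assuming `transformers` plus the MeCab tokenizer dependencies (fugashi and a MeCab dictionary; this commit's tokenizer_config selects `unidic_lite`) are installed:

```python
# Sketch: load the same encoder directly with Hugging Face transformers.
from transformers import AutoModel, AutoTokenizer

name = "cl-tohoku/bert-base-japanese-whole-word-masking"
tokenizer = AutoTokenizer.from_pretrained(name)  # BertJapaneseTokenizer (MeCab + WordPiece)
model = AutoModel.from_pretrained(name)          # BERT base encoder

inputs = tokenizer("日本語のテキストを符号化します。", return_tensors="pt")
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # (1, sequence_length, 768)
```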
transformer/model/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:599a195b86b9d080cfca6788742da743f78ff0cde0bea93e28a53f929b7d8ec4
+ size 442555895
transformer/model/special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
transformer/model/tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "do_lower_case": false, "do_word_tokenize": true, "do_subword_tokenize": true, "word_tokenizer_type": "mecab", "subword_tokenizer_type": "wordpiece", "never_split": null, "mecab_kwargs": {"mecab_dic": "unidic_lite"}, "model_max_length": 512, "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "cl-tohoku/bert-base-japanese-whole-word-masking"}
transformer/model/vocab.txt ADDED
The diff for this file is too large to render.
vocab/key2row ADDED
@@ -0,0 +1 @@
+
vocab/lookups.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:76be8b528d0075f7aae98d6fa57a6d3c83ae480a8469e668d7b0af968995ac71
+ size 1
vocab/strings.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e6de34ff1760a0cca2e3da83ff530767a3711d9328fa14c612c211f27fb06b89
+ size 614020
vocab/vectors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:14772b683e726436d5948ad3fff2b43d036ef2ebbe3458aafed6004e05a40706
+ size 128