kajyuuen committed on
Commit a7345ac (0 parents)

Initial commit

.gitattributes ADDED
@@ -0,0 +1,34 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,202 @@
1
+
2
+ Apache License
3
+ Version 2.0, January 2004
4
+ http://www.apache.org/licenses/
5
+
6
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7
+
8
+ 1. Definitions.
9
+
10
+ "License" shall mean the terms and conditions for use, reproduction,
11
+ and distribution as defined by Sections 1 through 9 of this document.
12
+
13
+ "Licensor" shall mean the copyright owner or entity authorized by
14
+ the copyright owner that is granting the License.
15
+
16
+ "Legal Entity" shall mean the union of the acting entity and all
17
+ other entities that control, are controlled by, or are under common
18
+ control with that entity. For the purposes of this definition,
19
+ "control" means (i) the power, direct or indirect, to cause the
20
+ direction or management of such entity, whether by contract or
21
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
22
+ outstanding shares, or (iii) beneficial ownership of such entity.
23
+
24
+ "You" (or "Your") shall mean an individual or Legal Entity
25
+ exercising permissions granted by this License.
26
+
27
+ "Source" form shall mean the preferred form for making modifications,
28
+ including but not limited to software source code, documentation
29
+ source, and configuration files.
30
+
31
+ "Object" form shall mean any form resulting from mechanical
32
+ transformation or translation of a Source form, including but
33
+ not limited to compiled object code, generated documentation,
34
+ and conversions to other media types.
35
+
36
+ "Work" shall mean the work of authorship, whether in Source or
37
+ Object form, made available under the License, as indicated by a
38
+ copyright notice that is included in or attached to the work
39
+ (an example is provided in the Appendix below).
40
+
41
+ "Derivative Works" shall mean any work, whether in Source or Object
42
+ form, that is based on (or derived from) the Work and for which the
43
+ editorial revisions, annotations, elaborations, or other modifications
44
+ represent, as a whole, an original work of authorship. For the purposes
45
+ of this License, Derivative Works shall not include works that remain
46
+ separable from, or merely link (or bind by name) to the interfaces of,
47
+ the Work and Derivative Works thereof.
48
+
49
+ "Contribution" shall mean any work of authorship, including
50
+ the original version of the Work and any modifications or additions
51
+ to that Work or Derivative Works thereof, that is intentionally
52
+ submitted to Licensor for inclusion in the Work by the copyright owner
53
+ or by an individual or Legal Entity authorized to submit on behalf of
54
+ the copyright owner. For the purposes of this definition, "submitted"
55
+ means any form of electronic, verbal, or written communication sent
56
+ to the Licensor or its representatives, including but not limited to
57
+ communication on electronic mailing lists, source code control systems,
58
+ and issue tracking systems that are managed by, or on behalf of, the
59
+ Licensor for the purpose of discussing and improving the Work, but
60
+ excluding communication that is conspicuously marked or otherwise
61
+ designated in writing by the copyright owner as "Not a Contribution."
62
+
63
+ "Contributor" shall mean Licensor and any individual or Legal Entity
64
+ on behalf of whom a Contribution has been received by Licensor and
65
+ subsequently incorporated within the Work.
66
+
67
+ 2. Grant of Copyright License. Subject to the terms and conditions of
68
+ this License, each Contributor hereby grants to You a perpetual,
69
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70
+ copyright license to reproduce, prepare Derivative Works of,
71
+ publicly display, publicly perform, sublicense, and distribute the
72
+ Work and such Derivative Works in Source or Object form.
73
+
74
+ 3. Grant of Patent License. Subject to the terms and conditions of
75
+ this License, each Contributor hereby grants to You a perpetual,
76
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77
+ (except as stated in this section) patent license to make, have made,
78
+ use, offer to sell, sell, import, and otherwise transfer the Work,
79
+ where such license applies only to those patent claims licensable
80
+ by such Contributor that are necessarily infringed by their
81
+ Contribution(s) alone or by combination of their Contribution(s)
82
+ with the Work to which such Contribution(s) was submitted. If You
83
+ institute patent litigation against any entity (including a
84
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
85
+ or a Contribution incorporated within the Work constitutes direct
86
+ or contributory patent infringement, then any patent licenses
87
+ granted to You under this License for that Work shall terminate
88
+ as of the date such litigation is filed.
89
+
90
+ 4. Redistribution. You may reproduce and distribute copies of the
91
+ Work or Derivative Works thereof in any medium, with or without
92
+ modifications, and in Source or Object form, provided that You
93
+ meet the following conditions:
94
+
95
+ (a) You must give any other recipients of the Work or
96
+ Derivative Works a copy of this License; and
97
+
98
+ (b) You must cause any modified files to carry prominent notices
99
+ stating that You changed the files; and
100
+
101
+ (c) You must retain, in the Source form of any Derivative Works
102
+ that You distribute, all copyright, patent, trademark, and
103
+ attribution notices from the Source form of the Work,
104
+ excluding those notices that do not pertain to any part of
105
+ the Derivative Works; and
106
+
107
+ (d) If the Work includes a "NOTICE" text file as part of its
108
+ distribution, then any Derivative Works that You distribute must
109
+ include a readable copy of the attribution notices contained
110
+ within such NOTICE file, excluding those notices that do not
111
+ pertain to any part of the Derivative Works, in at least one
112
+ of the following places: within a NOTICE text file distributed
113
+ as part of the Derivative Works; within the Source form or
114
+ documentation, if provided along with the Derivative Works; or,
115
+ within a display generated by the Derivative Works, if and
116
+ wherever such third-party notices normally appear. The contents
117
+ of the NOTICE file are for informational purposes only and
118
+ do not modify the License. You may add Your own attribution
119
+ notices within Derivative Works that You distribute, alongside
120
+ or as an addendum to the NOTICE text from the Work, provided
121
+ that such additional attribution notices cannot be construed
122
+ as modifying the License.
123
+
124
+ You may add Your own copyright statement to Your modifications and
125
+ may provide additional or different license terms and conditions
126
+ for use, reproduction, or distribution of Your modifications, or
127
+ for any such Derivative Works as a whole, provided Your use,
128
+ reproduction, and distribution of the Work otherwise complies with
129
+ the conditions stated in this License.
130
+
131
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
132
+ any Contribution intentionally submitted for inclusion in the Work
133
+ by You to the Licensor shall be under the terms and conditions of
134
+ this License, without any additional terms or conditions.
135
+ Notwithstanding the above, nothing herein shall supersede or modify
136
+ the terms of any separate license agreement you may have executed
137
+ with Licensor regarding such Contributions.
138
+
139
+ 6. Trademarks. This License does not grant permission to use the trade
140
+ names, trademarks, service marks, or product names of the Licensor,
141
+ except as required for reasonable and customary use in describing the
142
+ origin of the Work and reproducing the content of the NOTICE file.
143
+
144
+ 7. Disclaimer of Warranty. Unless required by applicable law or
145
+ agreed to in writing, Licensor provides the Work (and each
146
+ Contributor provides its Contributions) on an "AS IS" BASIS,
147
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148
+ implied, including, without limitation, any warranties or conditions
149
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150
+ PARTICULAR PURPOSE. You are solely responsible for determining the
151
+ appropriateness of using or redistributing the Work and assume any
152
+ risks associated with Your exercise of permissions under this License.
153
+
154
+ 8. Limitation of Liability. In no event and under no legal theory,
155
+ whether in tort (including negligence), contract, or otherwise,
156
+ unless required by applicable law (such as deliberate and grossly
157
+ negligent acts) or agreed to in writing, shall any Contributor be
158
+ liable to You for damages, including any direct, indirect, special,
159
+ incidental, or consequential damages of any character arising as a
160
+ result of this License or out of the use or inability to use the
161
+ Work (including but not limited to damages for loss of goodwill,
162
+ work stoppage, computer failure or malfunction, or any and all
163
+ other commercial damages or losses), even if such Contributor
164
+ has been advised of the possibility of such damages.
165
+
166
+ 9. Accepting Warranty or Additional Liability. While redistributing
167
+ the Work or Derivative Works thereof, You may choose to offer,
168
+ and charge a fee for, acceptance of support, warranty, indemnity,
169
+ or other liability obligations and/or rights consistent with this
170
+ License. However, in accepting such obligations, You may act only
171
+ on Your own behalf and on Your sole responsibility, not on behalf
172
+ of any other Contributor, and only if You agree to indemnify,
173
+ defend, and hold each Contributor harmless for any liability
174
+ incurred by, or claims asserted against, such Contributor by reason
175
+ of your accepting any such warranty or additional liability.
176
+
177
+ END OF TERMS AND CONDITIONS
178
+
179
+ APPENDIX: How to apply the Apache License to your work.
180
+
181
+ To apply the Apache License to your work, attach the following
182
+ boilerplate notice, with the fields enclosed by brackets "[]"
183
+ replaced with your own identifying information. (Don't include
184
+ the brackets!) The text should be enclosed in the appropriate
185
+ comment syntax for the file format. We also recommend that a
186
+ file or class name and description of purpose be included on the
187
+ same "printed page" as the copyright notice for easier
188
+ identification within third-party archives.
189
+
190
+ Copyright 2021 LINE Corporation
191
+
192
+ Licensed under the Apache License, Version 2.0 (the "License");
193
+ you may not use this file except in compliance with the License.
194
+ You may obtain a copy of the License at
195
+
196
+ http://www.apache.org/licenses/LICENSE-2.0
197
+
198
+ Unless required by applicable law or agreed to in writing, software
199
+ distributed under the License is distributed on an "AS IS" BASIS,
200
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201
+ See the License for the specific language governing permissions and
202
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,64 @@
1
+ ---
2
+ language: ja
3
+ license: apache-2.0
4
+ mask_token: "[MASK]"
5
+ widget:
6
+ - text: "LINE株式会社で[MASK]の研究・開発をしている。"
7
+ ---
8
+
9
+ # LINE DistilBERT Japanese
10
+
11
+ This is a DistilBERT model pre-trained on 131 GB of Japanese web text.
12
+ The teacher model is a BERT-base model built in-house at LINE.
13
+ The model was trained by [LINE Corporation](https://linecorp.com/).
14
+
15
+ ## For Japanese
16
+
17
+ The Japanese version of this README is available at https://github.com/line/LINE-DistilBERT-Japanese/README_ja.md.
18
+
19
+ ## How to use
20
+
21
+ ```python
22
+ from transformers import AutoTokenizer, AutoModel
23
+ tokenizer = AutoTokenizer.from_pretrained("line-corporation/line-distilbert-base-japanese", trust_remote_code=True)
24
+ model = AutoModel.from_pretrained("line-corporation/line-distilbert-base-japanese")
25
+
26
+ sentence = "LINE株式会社で[MASK]の研究・開発をしている。"
27
+ print(model(**tokenizer(sentence, return_tensors="pt")))
28
+ ```
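+
+ The snippet above returns the model's hidden states. For masked-token prediction specifically, the following is a minimal sketch (it assumes the tokenizer dependencies `fugashi`, `unidic-lite` and `sentencepiece` are installed; variable names are illustrative):
+
+ ```python
+ import torch
+ from transformers import AutoTokenizer, AutoModelForMaskedLM
+
+ tokenizer = AutoTokenizer.from_pretrained("line-corporation/line-distilbert-base-japanese", trust_remote_code=True)
+ model = AutoModelForMaskedLM.from_pretrained("line-corporation/line-distilbert-base-japanese")
+
+ inputs = tokenizer("LINE株式会社で[MASK]の研究・開発をしている。", return_tensors="pt")
+ with torch.no_grad():
+     logits = model(**inputs).logits
+
+ # Locate the [MASK] position and show the five highest-scoring replacement tokens.
+ mask_positions = (inputs["input_ids"] == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
+ top5 = logits[0, mask_positions[0]].topk(5).indices
+ print(tokenizer.convert_ids_to_tokens(top5.tolist()))
+ ```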
29
+
30
+ ## Model architecture
31
+
32
+ The model architecture is the DistilBERT base model: 6 layers, 768-dimensional hidden states, 12 attention heads, and 66M parameters.
33
+
34
+ ## Evaluation
35
+
36
+ The evaluation by [JGLUE](https://github.com/yahoojapan/JGLUE) is as follows:
37
+
38
+ | model name | #Params | Marc_ja | JNLI | JSTS | JSQuAD | JCommonSenseQA |
39
+ |------------------------|:-------:|:-------:|:----:|:----------------:|:---------:|:--------------:|
40
+ | | | acc | acc | Pearson/Spearman | EM/F1 | acc |
41
+ | LINE-DistilBERT | 68M | 95.6 | 88.9 | 89.2/85.1 | 87.3/93.3 | 76.1 |
42
+ | Laboro-DistilBERT | 68M | 94.7 | 82.0 | 87.4/82.7 | 70.2/87.3 | 73.2 |
43
+ | BandaiNamco-DistilBERT | 68M | 94.6 | 81.6 | 86.8/82.1 | 80.0/88.0 | 66.5 |
44
+
45
+ ## Tokenization
46
+
47
+ The texts are first tokenized by MeCab with the Unidic dictionary and then split into subwords by the SentencePiece algorithm. The vocabulary size is 32768.
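+
+ As a quick way to see this two-stage pipeline (MeCab word segmentation followed by SentencePiece subword splitting) in action, the intermediate pieces can be inspected directly; this is a small sketch that reuses the `tokenizer` from the usage example above:
+
+ ```python
+ # Subword pieces produced after MeCab (Unidic) segmentation and SentencePiece splitting.
+ print(tokenizer.tokenize("LINE株式会社で自然言語処理の研究・開発をしている。"))
+ # Size of the underlying vocabulary (32768, matching vocab_size in config.json).
+ print(tokenizer.vocab_size)
+ ```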
48
+
49
+ ## Licenses
50
+
51
+ The pretrained models are distributed under the terms of the [Apache License, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0).
52
+
53
+ ## To cite this work
54
+
55
+ We haven't published any paper on this work. Please cite [this GitHub repository](http://github.com/line/LINE-DistilBERT-Japanese):
56
+
57
+ ```
58
+ @misc{LINE-DistilBERT-Japanese,
59
+ title = {LINE DistilBERT Japanese},
60
+ author = {Koga, Kobayashi and Li, Shengzhe and Nakamachi, Akifumi and Sato, Toshinori},
61
+ year = {2023},
62
+ howpublished = {\url{http://github.com/line/LINE-DistilBERT-Japanese}}
63
+ }
64
+ ```
config.json ADDED
@@ -0,0 +1,23 @@
1
+ {
2
+ "activation": "gelu",
3
+ "architectures": [
4
+ "DistilBertForMaskedLM"
5
+ ],
6
+ "attention_dropout": 0.1,
7
+ "dim": 768,
8
+ "dropout": 0.1,
9
+ "hidden_dim": 3072,
10
+ "initializer_range": 0.02,
11
+ "max_position_embeddings": 512,
12
+ "model_type": "distilbert",
13
+ "n_heads": 12,
14
+ "n_layers": 6,
15
+ "output_hidden_states": true,
16
+ "pad_token_id": 0,
17
+ "qa_dropout": 0.1,
18
+ "seq_classif_dropout": 0.2,
19
+ "sinusoidal_pos_embds": true,
20
+ "tie_weights_": true,
21
+ "transformers_version": "4.18.0",
22
+ "vocab_size": 32768
23
+ }
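
The hyperparameters above can be read back through the standard `transformers` configuration API; a small sketch follows (the printed values simply echo the fields defined in this file):

```python
from transformers import AutoConfig

config = AutoConfig.from_pretrained("line-corporation/line-distilbert-base-japanese")
# 6 layers, 768-dim hidden states, 12 heads, 32768-token vocabulary, as listed above.
print(config.n_layers, config.dim, config.n_heads, config.vocab_size)
```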
distilbert_japanese_tokenizer.py ADDED
@@ -0,0 +1,1012 @@
1
+ # coding=utf-8
2
+
3
+ # Copyright 2023 LINE Corporation.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ # Almost copied from [transformers.BertJapaneseTokenizer](https://github.com/huggingface/transformers/blob/v4.26.1/src/transformers/models/bert_japanese/tokenization_bert_japanese.py#)
18
+ # This code is distributed under the Apache License 2.0.
19
+
20
+ """Tokenization classes."""
21
+
22
+
23
+ import collections
24
+ import copy
25
+ import os
26
+ import unicodedata
27
+ from typing import Any, Dict, List, Optional, Tuple
28
+
29
+ from transformers.tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
30
+ from transformers.utils import is_sentencepiece_available, logging
31
+
32
+
33
+ if is_sentencepiece_available():
34
+ import sentencepiece as spm
35
+ else:
36
+ spm = None
37
+
38
+ logger = logging.get_logger(__name__)
39
+
40
+ VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "spm_file": "spiece.model"}
41
+
42
+ SPIECE_UNDERLINE = "▁"
43
+
44
+ PRETRAINED_VOCAB_FILES_MAP = {
45
+ "vocab_file": {
46
+ "cl-tohoku/bert-base-japanese": "https://huggingface.co/cl-tohoku/bert-base-japanese/resolve/main/vocab.txt",
47
+ "cl-tohoku/bert-base-japanese-whole-word-masking": (
48
+ "https://huggingface.co/cl-tohoku/bert-base-japanese-whole-word-masking/resolve/main/vocab.txt"
49
+ ),
50
+ "cl-tohoku/bert-base-japanese-char": (
51
+ "https://huggingface.co/cl-tohoku/bert-base-japanese-char/resolve/main/vocab.txt"
52
+ ),
53
+ "cl-tohoku/bert-base-japanese-char-whole-word-masking": (
54
+ "https://huggingface.co/cl-tohoku/bert-base-japanese-char-whole-word-masking/resolve/main/vocab.txt"
55
+ ),
56
+ }
57
+ }
58
+
59
+ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
60
+ "cl-tohoku/bert-base-japanese": 512,
61
+ "cl-tohoku/bert-base-japanese-whole-word-masking": 512,
62
+ "cl-tohoku/bert-base-japanese-char": 512,
63
+ "cl-tohoku/bert-base-japanese-char-whole-word-masking": 512,
64
+ }
65
+
66
+ PRETRAINED_INIT_CONFIGURATION = {
67
+ "cl-tohoku/bert-base-japanese": {
68
+ "do_lower_case": False,
69
+ "word_tokenizer_type": "mecab",
70
+ "subword_tokenizer_type": "wordpiece",
71
+ },
72
+ "cl-tohoku/bert-base-japanese-whole-word-masking": {
73
+ "do_lower_case": False,
74
+ "word_tokenizer_type": "mecab",
75
+ "subword_tokenizer_type": "wordpiece",
76
+ },
77
+ "cl-tohoku/bert-base-japanese-char": {
78
+ "do_lower_case": False,
79
+ "word_tokenizer_type": "mecab",
80
+ "subword_tokenizer_type": "character",
81
+ },
82
+ "cl-tohoku/bert-base-japanese-char-whole-word-masking": {
83
+ "do_lower_case": False,
84
+ "word_tokenizer_type": "mecab",
85
+ "subword_tokenizer_type": "character",
86
+ },
87
+ }
88
+
89
+
90
+ # Copied from transformers.models.bert.tokenization_bert.load_vocab
91
+ def load_vocab(vocab_file):
92
+ """Loads a vocabulary file into a dictionary."""
93
+ vocab = collections.OrderedDict()
94
+ with open(vocab_file, "r", encoding="utf-8") as reader:
95
+ tokens = reader.readlines()
96
+ for index, token in enumerate(tokens):
97
+ token = token.rstrip("\n")
98
+ vocab[token] = index
99
+ return vocab
100
+
101
+
102
+ # Copied from transformers.models.bert.tokenization_bert.whitespace_tokenize
103
+ def whitespace_tokenize(text):
104
+ """Runs basic whitespace cleaning and splitting on a piece of text."""
105
+ text = text.strip()
106
+ if not text:
107
+ return []
108
+ tokens = text.split()
109
+ return tokens
110
+
111
+
112
+ class DistilBertJapaneseTokenizer(PreTrainedTokenizer):
113
+ r"""
114
+ Construct a BERT tokenizer for Japanese text.
115
+
116
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer
117
+ to this superclass for more information regarding those methods.
118
+
119
+ Args:
120
+ vocab_file (`str`):
121
+ Path to a one-wordpiece-per-line vocabulary file.
122
+ spm_file (`str`, *optional*):
123
+ Path to [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm or .model
124
+ extension) that contains the vocabulary.
125
+ do_lower_case (`bool`, *optional*, defaults to `False`):
126
+ Whether to lowercase the input. Only has an effect when `do_word_tokenize=True`.
127
+ do_word_tokenize (`bool`, *optional*, defaults to `True`):
128
+ Whether to do word tokenization.
129
+ do_subword_tokenize (`bool`, *optional*, defaults to `True`):
130
+ Whether to do subword tokenization.
131
+ word_tokenizer_type (`str`, *optional*, defaults to `"basic"`):
132
+ Type of word tokenizer. Choose from ["basic", "mecab", "sudachi", "jumanpp"].
133
+ subword_tokenizer_type (`str`, *optional*, defaults to `"wordpiece"`):
134
+ Type of subword tokenizer. Choose from ["wordpiece", "character", "sentencepiece",].
135
+ mecab_kwargs (`dict`, *optional*):
136
+ Dictionary passed to the `MecabTokenizer` constructor.
137
+ sudachi_kwargs (`dict`, *optional*):
138
+ Dictionary passed to the `SudachiTokenizer` constructor.
139
+ jumanpp_kwargs (`dict`, *optional*):
140
+ Dictionary passed to the `JumanppTokenizer` constructor.
141
+ """
142
+
143
+ vocab_files_names = VOCAB_FILES_NAMES
144
+ pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
145
+ pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
146
+ max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
147
+ model_input_names = ["input_ids", "attention_mask"]
148
+
149
+ def __init__(
150
+ self,
151
+ vocab_file,
152
+ spm_file=None,
153
+ do_lower_case=False,
154
+ do_word_tokenize=True,
155
+ do_subword_tokenize=True,
156
+ word_tokenizer_type="basic",
157
+ subword_tokenizer_type="wordpiece",
158
+ never_split=None,
159
+ unk_token="[UNK]",
160
+ sep_token="[SEP]",
161
+ pad_token="[PAD]",
162
+ cls_token="[CLS]",
163
+ mask_token="[MASK]",
164
+ mecab_kwargs=None,
165
+ sudachi_kwargs=None,
166
+ jumanpp_kwargs=None,
167
+ **kwargs
168
+ ):
169
+ super().__init__(
170
+ spm_file=spm_file,
171
+ unk_token=unk_token,
172
+ sep_token=sep_token,
173
+ pad_token=pad_token,
174
+ cls_token=cls_token,
175
+ mask_token=mask_token,
176
+ do_lower_case=do_lower_case,
177
+ do_word_tokenize=do_word_tokenize,
178
+ do_subword_tokenize=do_subword_tokenize,
179
+ word_tokenizer_type=word_tokenizer_type,
180
+ subword_tokenizer_type=subword_tokenizer_type,
181
+ never_split=never_split,
182
+ mecab_kwargs=mecab_kwargs,
183
+ sudachi_kwargs=sudachi_kwargs,
184
+ jumanpp_kwargs=jumanpp_kwargs,
185
+ **kwargs,
186
+ )
187
+
188
+ if subword_tokenizer_type == "sentencepiece":
189
+ if not os.path.isfile(spm_file):
190
+ raise ValueError(
191
+ f"Can't find a vocabulary file at path '{spm_file}'. To load the vocabulary from a Google"
192
+ " pretrained model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
193
+ )
194
+ self.spm_file = spm_file
195
+ else:
196
+ if not os.path.isfile(vocab_file):
197
+ raise ValueError(
198
+ f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google"
199
+ " pretrained model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
200
+ )
201
+ self.vocab = load_vocab(vocab_file)
202
+ self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
203
+
204
+ self.do_word_tokenize = do_word_tokenize
205
+ self.word_tokenizer_type = word_tokenizer_type
206
+ self.lower_case = do_lower_case
207
+ self.never_split = never_split
208
+ self.mecab_kwargs = copy.deepcopy(mecab_kwargs)
209
+ self.sudachi_kwargs = copy.deepcopy(sudachi_kwargs)
210
+ self.jumanpp_kwargs = copy.deepcopy(jumanpp_kwargs)
211
+ if do_word_tokenize:
212
+ if word_tokenizer_type == "basic":
213
+ self.word_tokenizer = BasicTokenizer(
214
+ do_lower_case=do_lower_case, never_split=never_split, tokenize_chinese_chars=False
215
+ )
216
+ elif word_tokenizer_type == "mecab":
217
+ self.word_tokenizer = MecabTokenizer(
218
+ do_lower_case=do_lower_case, never_split=never_split, **(mecab_kwargs or {})
219
+ )
220
+ elif word_tokenizer_type == "sudachi":
221
+ self.word_tokenizer = SudachiTokenizer(
222
+ do_lower_case=do_lower_case, never_split=never_split, **(sudachi_kwargs or {})
223
+ )
224
+ elif word_tokenizer_type == "jumanpp":
225
+ self.word_tokenizer = JumanppTokenizer(
226
+ do_lower_case=do_lower_case, never_split=never_split, **(jumanpp_kwargs or {})
227
+ )
228
+ else:
229
+ raise ValueError(f"Invalid word_tokenizer_type '{word_tokenizer_type}' is specified.")
230
+
231
+ self.do_subword_tokenize = do_subword_tokenize
232
+ self.subword_tokenizer_type = subword_tokenizer_type
233
+ if do_subword_tokenize:
234
+ if subword_tokenizer_type == "wordpiece":
235
+ self.subword_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
236
+ elif subword_tokenizer_type == "character":
237
+ self.subword_tokenizer = CharacterTokenizer(vocab=self.vocab, unk_token=self.unk_token)
238
+ elif subword_tokenizer_type == "sentencepiece":
239
+ self.subword_tokenizer = SentencepieceTokenizer(vocab=self.spm_file, unk_token=self.unk_token)
240
+ else:
241
+ raise ValueError(f"Invalid subword_tokenizer_type '{subword_tokenizer_type}' is specified.")
242
+
243
+ @property
244
+ def do_lower_case(self):
245
+ return self.lower_case
246
+
247
+ def __getstate__(self):
248
+ state = dict(self.__dict__)
249
+ if self.word_tokenizer_type in ["mecab", "sudachi", "jumanpp"]:
250
+ del state["word_tokenizer"]
251
+ return state
252
+
253
+ def __setstate__(self, state):
254
+ self.__dict__ = state
255
+ if self.word_tokenizer_type == "mecab":
256
+ self.word_tokenizer = MecabTokenizer(
257
+ do_lower_case=self.do_lower_case, never_split=self.never_split, **(self.mecab_kwargs or {})
258
+ )
259
+ elif self.word_tokenizer_type == "sudachi":
260
+ self.word_tokenizer = SudachiTokenizer(
261
+ do_lower_case=self.do_lower_case, never_split=self.never_split, **(self.sudachi_kwargs or {})
262
+ )
263
+ elif self.word_tokenizer_type == "jumanpp":
264
+ self.word_tokenizer = JumanppTokenizer(
265
+ do_lower_case=self.do_lower_case, never_split=self.never_split, **(self.jumanpp_kwargs or {})
266
+ )
267
+
268
+ def _tokenize(self, text):
269
+ if self.do_word_tokenize:
270
+ tokens = self.word_tokenizer.tokenize(text, never_split=self.all_special_tokens)
271
+ else:
272
+ tokens = [text]
273
+
274
+ if self.do_subword_tokenize:
275
+ split_tokens = [sub_token for token in tokens for sub_token in self.subword_tokenizer.tokenize(token)]
276
+ else:
277
+ split_tokens = tokens
278
+
279
+ return split_tokens
280
+
281
+ @property
282
+ def vocab_size(self):
283
+ if self.subword_tokenizer_type == "sentencepiece":
284
+ return len(self.subword_tokenizer.sp_model)
285
+ return len(self.vocab)
286
+
287
+ def get_vocab(self):
288
+ if self.subword_tokenizer_type == "sentencepiece":
289
+ vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
290
+ vocab.update(self.added_tokens_encoder)
291
+ return vocab
292
+ return dict(self.vocab, **self.added_tokens_encoder)
293
+
294
+ def _convert_token_to_id(self, token):
295
+ """Converts a token (str) in an id using the vocab."""
296
+ if self.subword_tokenizer_type == "sentencepiece":
297
+ return self.subword_tokenizer.sp_model.PieceToId(token)
298
+ return self.vocab.get(token, self.vocab.get(self.unk_token))
299
+
300
+ def _convert_id_to_token(self, index):
301
+ """Converts an index (integer) in a token (str) using the vocab."""
302
+ if self.subword_tokenizer_type == "sentencepiece":
303
+ return self.subword_tokenizer.sp_model.IdToPiece(index)
304
+ return self.ids_to_tokens.get(index, self.unk_token)
305
+
306
+ def convert_tokens_to_string(self, tokens):
307
+ """Converts a sequence of tokens (string) in a single string."""
308
+ if self.subword_tokenizer_type == "sentencepiece":
309
+ return self.subword_tokenizer.sp_model.decode(tokens)
310
+ out_string = " ".join(tokens).replace(" ##", "").strip()
311
+ return out_string
312
+
313
+ # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.build_inputs_with_special_tokens
314
+ def build_inputs_with_special_tokens(
315
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
316
+ ) -> List[int]:
317
+ """
318
+ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
319
+ adding special tokens. A BERT sequence has the following format:
320
+
321
+ - single sequence: `[CLS] X [SEP]`
322
+ - pair of sequences: `[CLS] A [SEP] B [SEP]`
323
+
324
+ Args:
325
+ token_ids_0 (`List[int]`):
326
+ List of IDs to which the special tokens will be added.
327
+ token_ids_1 (`List[int]`, *optional*):
328
+ Optional second list of IDs for sequence pairs.
329
+
330
+ Returns:
331
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
332
+ """
333
+ if token_ids_1 is None:
334
+ return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
335
+ cls = [self.cls_token_id]
336
+ sep = [self.sep_token_id]
337
+ return cls + token_ids_0 + sep + token_ids_1 + sep
338
+
339
+ # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.get_special_tokens_mask
340
+ def get_special_tokens_mask(
341
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
342
+ ) -> List[int]:
343
+ """
344
+ Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
345
+ special tokens using the tokenizer `prepare_for_model` method.
346
+
347
+ Args:
348
+ token_ids_0 (`List[int]`):
349
+ List of IDs.
350
+ token_ids_1 (`List[int]`, *optional*):
351
+ Optional second list of IDs for sequence pairs.
352
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
353
+ Whether or not the token list is already formatted with special tokens for the model.
354
+
355
+ Returns:
356
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
357
+ """
358
+
359
+ if already_has_special_tokens:
360
+ return super().get_special_tokens_mask(
361
+ token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
362
+ )
363
+
364
+ if token_ids_1 is not None:
365
+ return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
366
+ return [1] + ([0] * len(token_ids_0)) + [1]
367
+
368
+ # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.create_token_type_ids_from_sequences
369
+ def create_token_type_ids_from_sequences(
370
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
371
+ ) -> List[int]:
372
+ """
373
+ Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
374
+ pair mask has the following format:
375
+
376
+ ```
377
+ 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
378
+ | first sequence | second sequence |
379
+ ```
380
+
381
+ If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
382
+
383
+ Args:
384
+ token_ids_0 (`List[int]`):
385
+ List of IDs.
386
+ token_ids_1 (`List[int]`, *optional*):
387
+ Optional second list of IDs for sequence pairs.
388
+
389
+ Returns:
390
+ `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
391
+ """
392
+ sep = [self.sep_token_id]
393
+ cls = [self.cls_token_id]
394
+ if token_ids_1 is None:
395
+ return len(cls + token_ids_0 + sep) * [0]
396
+ return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
397
+
398
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
399
+ if os.path.isdir(save_directory):
400
+ if self.subword_tokenizer_type == "sentencepiece":
401
+ vocab_file = os.path.join(
402
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["spm_file"]
403
+ )
404
+ else:
405
+ vocab_file = os.path.join(
406
+ save_directory,
407
+ (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"],
408
+ )
409
+ else:
410
+ vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
411
+
412
+ if self.subword_tokenizer_type == "sentencepiece":
413
+ with open(vocab_file, "wb") as writer:
414
+ content_spiece_model = self.subword_tokenizer.sp_model.serialized_model_proto()
415
+ writer.write(content_spiece_model)
416
+ else:
417
+ with open(vocab_file, "w", encoding="utf-8") as writer:
418
+ index = 0
419
+ for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
420
+ if index != token_index:
421
+ logger.warning(
422
+ f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
423
+ " Please check that the vocabulary is not corrupted!"
424
+ )
425
+ index = token_index
426
+ writer.write(token + "\n")
427
+ index += 1
428
+ return (vocab_file,)
429
+
430
+
431
+ class MecabTokenizer:
432
+ """Runs basic tokenization with MeCab morphological parser."""
433
+
434
+ def __init__(
435
+ self,
436
+ do_lower_case=False,
437
+ never_split=None,
438
+ normalize_text=True,
439
+ mecab_dic: Optional[str] = "ipadic",
440
+ mecab_option: Optional[str] = None,
441
+ ):
442
+ """
443
+ Constructs a MecabTokenizer.
444
+
445
+ Args:
446
+ **do_lower_case**: (*optional*) boolean (default False)
447
+ Whether to lowercase the input.
448
+ **never_split**: (*optional*) list of str
449
+ Kept for backward compatibility purposes. Now implemented directly at the base class level (see
450
+ [`PreTrainedTokenizer.tokenize`]) List of tokens not to split.
451
+ **normalize_text**: (*optional*) boolean (default True)
452
+ Whether to apply unicode normalization to text before tokenization.
453
+ **mecab_dic**: (*optional*) string (default "ipadic")
454
+ Name of dictionary to be used for MeCab initialization. If you are using a system-installed dictionary,
455
+ set this option to `None` and modify *mecab_option*.
456
+ **mecab_option**: (*optional*) string
457
+ String passed to MeCab constructor.
458
+ """
459
+ self.do_lower_case = do_lower_case
460
+ self.never_split = never_split if never_split is not None else []
461
+ self.normalize_text = normalize_text
462
+
463
+ try:
464
+ import fugashi
465
+ except ModuleNotFoundError as error:
466
+ raise error.__class__(
467
+ "You need to install fugashi to use MecabTokenizer. "
468
+ "See https://pypi.org/project/fugashi/ for installation."
469
+ )
470
+
471
+ mecab_option = mecab_option or ""
472
+
473
+ if mecab_dic is not None:
474
+ if mecab_dic == "ipadic":
475
+ try:
476
+ import ipadic
477
+ except ModuleNotFoundError as error:
478
+ raise error.__class__(
479
+ "The ipadic dictionary is not installed. "
480
+ "See https://github.com/polm/ipadic-py for installation."
481
+ )
482
+
483
+ dic_dir = ipadic.DICDIR
484
+
485
+ elif mecab_dic == "unidic_lite":
486
+ try:
487
+ import unidic_lite
488
+ except ModuleNotFoundError as error:
489
+ raise error.__class__(
490
+ "The unidic_lite dictionary is not installed. "
491
+ "See https://github.com/polm/unidic-lite for installation."
492
+ )
493
+
494
+ dic_dir = unidic_lite.DICDIR
495
+
496
+ elif mecab_dic == "unidic":
497
+ try:
498
+ import unidic
499
+ except ModuleNotFoundError as error:
500
+ raise error.__class__(
501
+ "The unidic dictionary is not installed. "
502
+ "See https://github.com/polm/unidic-py for installation."
503
+ )
504
+
505
+ dic_dir = unidic.DICDIR
506
+ if not os.path.isdir(dic_dir):
507
+ raise RuntimeError(
508
+ "The unidic dictionary itself is not found. "
509
+ "See https://github.com/polm/unidic-py for installation."
510
+ )
511
+
512
+ else:
513
+ raise ValueError("Invalid mecab_dic is specified.")
514
+
515
+ mecabrc = os.path.join(dic_dir, "mecabrc")
516
+ mecab_option = f'-d "{dic_dir}" -r "{mecabrc}" ' + mecab_option
517
+
518
+ self.mecab = fugashi.GenericTagger(mecab_option)
519
+
520
+ def tokenize(self, text, never_split=None, **kwargs):
521
+ """Tokenizes a piece of text."""
522
+ if self.normalize_text:
523
+ text = unicodedata.normalize("NFKC", text)
524
+
525
+ never_split = self.never_split + (never_split if never_split is not None else [])
526
+ tokens = []
527
+
528
+ for word in self.mecab(text):
529
+ token = word.surface
530
+
531
+ if self.do_lower_case and token not in never_split:
532
+ token = token.lower()
533
+
534
+ tokens.append(token)
535
+
536
+ return tokens
537
+
538
+
539
+ class SudachiTokenizer:
540
+ """Runs basic tokenization with Sudachi morphological parser."""
541
+
542
+ def __init__(
543
+ self,
544
+ do_lower_case=False,
545
+ never_split=None,
546
+ normalize_text=True,
547
+ trim_whitespace=False,
548
+ sudachi_split_mode="A",
549
+ sudachi_config_path=None,
550
+ sudachi_resource_dir=None,
551
+ sudachi_dict_type="core",
552
+ ):
553
+ """
554
+ Constructs a SudachiTokenizer.
555
+
556
+ Args:
557
+ **do_lower_case**: (*optional*) boolean (default False)
558
+ Whether to lowercase the input.
559
+ **never_split**: (*optional*) list of str
560
+ Kept for backward compatibility purposes. Now implemented directly at the base class level (see
561
+ [`PreTrainedTokenizer.tokenize`]) List of tokens not to split.
562
+ **normalize_text**: (*optional*) boolean (default True)
563
+ Whether to apply unicode normalization to text before tokenization.
564
+ **trim_whitespace**: (*optional*) boolean (default False)
565
+ Whether to trim all whitespace, tab, newline from tokens.
566
+ **sudachi_split_mode**: (*optional*) string
567
+ Split mode of sudachi, choose from "A", "B", "C".
568
+ **sudachi_config_path**: (*optional*) string
569
+ **sudachi_resource_dir**: (*optional*) string
570
+ **sudachi_dict_type**: (*optional*) string
571
+ dict type of sudachi, choose from "small", "core", "full".
572
+ """
573
+
574
+ self.do_lower_case = do_lower_case
575
+ self.never_split = never_split if never_split is not None else []
576
+ self.normalize_text = normalize_text
577
+ self.trim_whitespace = trim_whitespace
578
+
579
+ try:
580
+ from sudachipy import dictionary, tokenizer
581
+ except ImportError:
582
+ raise ImportError(
583
+ "You need to install sudachipy to use SudachiTokenizer. "
584
+ "See https://github.com/WorksApplications/SudachiPy for installation."
585
+ )
586
+
587
+ if sudachi_split_mode == "A":
588
+ self.split_mode = tokenizer.Tokenizer.SplitMode.A
589
+ elif sudachi_split_mode == "B":
590
+ self.split_mode = tokenizer.Tokenizer.SplitMode.B
591
+ elif sudachi_split_mode == "C":
592
+ self.split_mode = tokenizer.Tokenizer.SplitMode.C
593
+ else:
594
+ raise ValueError("Invalid sudachi_split_mode is specified.")
595
+
596
+ self.sudachi = dictionary.Dictionary(
597
+ config_path=sudachi_config_path, resource_dir=sudachi_resource_dir, dict=sudachi_dict_type
598
+ ).create(self.split_mode)
599
+
600
+ def tokenize(self, text, never_split=None, **kwargs):
601
+ """Tokenizes a piece of text."""
602
+ if self.normalize_text:
603
+ text = unicodedata.normalize("NFKC", text)
604
+
605
+ never_split = self.never_split + (never_split if never_split is not None else [])
606
+ tokens = []
607
+
608
+ for word in self.sudachi.tokenize(text):
609
+ token = word.surface()
610
+
611
+ if self.do_lower_case and token not in never_split:
612
+ token = token.lower()
613
+
614
+ if self.trim_whitespace:
615
+ if token.strip() == "":
616
+ continue
617
+ else:
618
+ token = token.strip()
619
+
620
+ tokens.append(token)
621
+
622
+ return tokens
623
+
624
+
625
+ class JumanppTokenizer:
626
+ """Runs basic tokenization with jumanpp morphological parser."""
627
+
628
+ def __init__(
629
+ self,
630
+ do_lower_case=False,
631
+ never_split=None,
632
+ normalize_text=True,
633
+ trim_whitespace=False,
634
+ ):
635
+ """
636
+ Constructs a JumanppTokenizer.
637
+
638
+ Args:
639
+ **do_lower_case**: (*optional*) boolean (default False)
640
+ Whether to lowercase the input.
641
+ **never_split**: (*optional*) list of str
642
+ Kept for backward compatibility purposes. Now implemented directly at the base class level (see
643
+ [`PreTrainedTokenizer.tokenize`]) List of tokens not to split.
644
+ **normalize_text**: (*optional*) boolean (default True)
645
+ Whether to apply unicode normalization to text before tokenization.
646
+ **trim_whitespace**: (*optional*) boolean (default False)
647
+ Whether to trim all whitespace, tab, newline from tokens.
648
+ """
649
+
650
+ self.do_lower_case = do_lower_case
651
+ self.never_split = never_split if never_split is not None else []
652
+ self.normalize_text = normalize_text
653
+ self.trim_whitespace = trim_whitespace
654
+
655
+ try:
656
+ import rhoknp
657
+ except ImportError:
658
+ raise ImportError(
659
+ "You need to install rhoknp to use JumanppTokenizer. "
660
+ "See https://github.com/ku-nlp/rhoknp for installation."
661
+ )
662
+
663
+ self.juman = rhoknp.Jumanpp()
664
+
665
+ def tokenize(self, text, never_split=None, **kwargs):
666
+ """Tokenizes a piece of text."""
667
+ if self.normalize_text:
668
+ text = unicodedata.normalize("NFKC", text)
669
+
670
+ text = text.strip()
671
+
672
+ never_split = self.never_split + (never_split if never_split is not None else [])
673
+ tokens = []
674
+
675
+ for mrph in self.juman.apply_to_sentence(text).morphemes:
676
+ token = mrph.text
677
+
678
+ if self.do_lower_case and token not in never_split:
679
+ token = token.lower()
680
+
681
+ if self.trim_whitespace:
682
+ if token.strip() == "":
683
+ continue
684
+ else:
685
+ token = token.strip()
686
+
687
+ tokens.append(token)
688
+
689
+ return tokens
690
+
691
+
692
+ class CharacterTokenizer:
693
+ """Runs Character tokenization."""
694
+
695
+ def __init__(self, vocab, unk_token, normalize_text=True):
696
+ """
697
+ Constructs a CharacterTokenizer.
698
+
699
+ Args:
700
+ **vocab**:
701
+ Vocabulary object.
702
+ **unk_token**: str
703
+ A special symbol for out-of-vocabulary token.
704
+ **normalize_text**: (`optional`) boolean (default True)
705
+ Whether to apply unicode normalization to text before tokenization.
706
+ """
707
+ self.vocab = vocab
708
+ self.unk_token = unk_token
709
+ self.normalize_text = normalize_text
710
+
711
+ def tokenize(self, text):
712
+ """
713
+ Tokenizes a piece of text into characters.
714
+
715
+ For example, `input = "apple"` will return as output `["a", "p", "p", "l", "e"]`.
716
+
717
+ Args:
718
+ text: A single token or whitespace separated tokens.
719
+ This should have already been passed through *BasicTokenizer*.
720
+
721
+ Returns:
722
+ A list of characters.
723
+ """
724
+ if self.normalize_text:
725
+ text = unicodedata.normalize("NFKC", text)
726
+
727
+ output_tokens = []
728
+ for char in text:
729
+ if char not in self.vocab:
730
+ output_tokens.append(self.unk_token)
731
+ continue
732
+
733
+ output_tokens.append(char)
734
+
735
+ return output_tokens
736
+
737
+
738
+ # Copied from transformers.models.bert.tokenization_bert.BasicTokenizer
739
+ class BasicTokenizer(object):
740
+ """
741
+ Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
742
+
743
+ Args:
744
+ do_lower_case (`bool`, *optional*, defaults to `True`):
745
+ Whether or not to lowercase the input when tokenizing.
746
+ never_split (`Iterable`, *optional*):
747
+ Collection of tokens which will never be split during tokenization. Only has an effect when
748
+ `do_basic_tokenize=True`
749
+ tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
750
+ Whether or not to tokenize Chinese characters.
751
+
752
+ This should likely be deactivated for Japanese (see this
753
+ [issue](https://github.com/huggingface/transformers/issues/328)).
754
+ strip_accents (`bool`, *optional*):
755
+ Whether or not to strip all accents. If this option is not specified, then it will be determined by the
756
+ value for `lowercase` (as in the original BERT).
757
+ """
758
+
759
+ def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
760
+ if never_split is None:
761
+ never_split = []
762
+ self.do_lower_case = do_lower_case
763
+ self.never_split = set(never_split)
764
+ self.tokenize_chinese_chars = tokenize_chinese_chars
765
+ self.strip_accents = strip_accents
766
+
767
+ def tokenize(self, text, never_split=None):
768
+ """
769
+ Basic tokenization of a piece of text. Split on whitespace only; for sub-word tokenization, see
770
+ WordPieceTokenizer.
771
+
772
+ Args:
773
+ never_split (`List[str]`, *optional*)
774
+ Kept for backward compatibility purposes. Now implemented directly at the base class level (see
775
+ [`PreTrainedTokenizer.tokenize`]) List of tokens not to split.
776
+ """
777
+ # union() returns a new set by concatenating the two sets.
778
+ never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
779
+ text = self._clean_text(text)
780
+
781
+ # This was added on November 1st, 2018 for the multilingual and Chinese
782
+ # models. This is also applied to the English models now, but it doesn't
783
+ # matter since the English models were not trained on any Chinese data
784
+ # and generally don't have any Chinese data in them (there are Chinese
785
+ # characters in the vocabulary because Wikipedia does have some Chinese
786
+ # words in the English Wikipedia.).
787
+ if self.tokenize_chinese_chars:
788
+ text = self._tokenize_chinese_chars(text)
789
+ orig_tokens = whitespace_tokenize(text)
790
+ split_tokens = []
791
+ for token in orig_tokens:
792
+ if token not in never_split:
793
+ if self.do_lower_case:
794
+ token = token.lower()
795
+ if self.strip_accents is not False:
796
+ token = self._run_strip_accents(token)
797
+ elif self.strip_accents:
798
+ token = self._run_strip_accents(token)
799
+ split_tokens.extend(self._run_split_on_punc(token, never_split))
800
+
801
+ output_tokens = whitespace_tokenize(" ".join(split_tokens))
802
+ return output_tokens
803
+
804
+ def _run_strip_accents(self, text):
805
+ """Strips accents from a piece of text."""
806
+ text = unicodedata.normalize("NFD", text)
807
+ output = []
808
+ for char in text:
809
+ cat = unicodedata.category(char)
810
+ if cat == "Mn":
811
+ continue
812
+ output.append(char)
813
+ return "".join(output)
814
+
815
+ def _run_split_on_punc(self, text, never_split=None):
816
+ """Splits punctuation on a piece of text."""
817
+ if never_split is not None and text in never_split:
818
+ return [text]
819
+ chars = list(text)
820
+ i = 0
821
+ start_new_word = True
822
+ output = []
823
+ while i < len(chars):
824
+ char = chars[i]
825
+ if _is_punctuation(char):
826
+ output.append([char])
827
+ start_new_word = True
828
+ else:
829
+ if start_new_word:
830
+ output.append([])
831
+ start_new_word = False
832
+ output[-1].append(char)
833
+ i += 1
834
+
835
+ return ["".join(x) for x in output]
836
+
837
+ def _tokenize_chinese_chars(self, text):
838
+ """Adds whitespace around any CJK character."""
839
+ output = []
840
+ for char in text:
841
+ cp = ord(char)
842
+ if self._is_chinese_char(cp):
843
+ output.append(" ")
844
+ output.append(char)
845
+ output.append(" ")
846
+ else:
847
+ output.append(char)
848
+ return "".join(output)
849
+
850
+ def _is_chinese_char(self, cp):
851
+ """Checks whether CP is the codepoint of a CJK character."""
852
+ # This defines a "chinese character" as anything in the CJK Unicode block:
853
+ # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
854
+ #
855
+ # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
856
+ # despite its name. The modern Korean Hangul alphabet is a different block,
857
+ # as is Japanese Hiragana and Katakana. Those alphabets are used to write
858
+ # space-separated words, so they are not treated specially and handled
859
+ # like all of the other languages.
860
+ if (
861
+ (cp >= 0x4E00 and cp <= 0x9FFF)
862
+ or (cp >= 0x3400 and cp <= 0x4DBF) #
863
+ or (cp >= 0x20000 and cp <= 0x2A6DF) #
864
+ or (cp >= 0x2A700 and cp <= 0x2B73F) #
865
+ or (cp >= 0x2B740 and cp <= 0x2B81F) #
866
+ or (cp >= 0x2B820 and cp <= 0x2CEAF) #
867
+ or (cp >= 0xF900 and cp <= 0xFAFF)
868
+ or (cp >= 0x2F800 and cp <= 0x2FA1F) #
869
+ ): #
870
+ return True
871
+
872
+ return False
873
+
874
+ def _clean_text(self, text):
875
+ """Performs invalid character removal and whitespace cleanup on text."""
876
+ output = []
877
+ for char in text:
878
+ cp = ord(char)
879
+ if cp == 0 or cp == 0xFFFD or _is_control(char):
880
+ continue
881
+ if _is_whitespace(char):
882
+ output.append(" ")
883
+ else:
884
+ output.append(char)
885
+ return "".join(output)
886
+
887
+
888
+ # Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer
889
+ class WordpieceTokenizer(object):
890
+ """Runs WordPiece tokenization."""
891
+
892
+ def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
893
+ self.vocab = vocab
894
+ self.unk_token = unk_token
895
+ self.max_input_chars_per_word = max_input_chars_per_word
896
+
897
+ def tokenize(self, text):
898
+ """
899
+ Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
900
+ tokenization using the given vocabulary.
901
+
902
+ For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
903
+
904
+ Args:
905
+ text: A single token or whitespace separated tokens. This should have
906
+ already been passed through *BasicTokenizer*.
907
+
908
+ Returns:
909
+ A list of wordpiece tokens.
910
+ """
911
+
912
+ output_tokens = []
913
+ for token in whitespace_tokenize(text):
914
+ chars = list(token)
915
+ if len(chars) > self.max_input_chars_per_word:
916
+ output_tokens.append(self.unk_token)
917
+ continue
918
+
919
+ is_bad = False
920
+ start = 0
921
+ sub_tokens = []
922
+ while start < len(chars):
923
+ end = len(chars)
924
+ cur_substr = None
925
+ while start < end:
926
+ substr = "".join(chars[start:end])
927
+ if start > 0:
928
+ substr = "##" + substr
929
+ if substr in self.vocab:
930
+ cur_substr = substr
931
+ break
932
+ end -= 1
933
+ if cur_substr is None:
934
+ is_bad = True
935
+ break
936
+ sub_tokens.append(cur_substr)
937
+ start = end
938
+
939
+ if is_bad:
940
+ output_tokens.append(self.unk_token)
941
+ else:
942
+ output_tokens.extend(sub_tokens)
943
+ return output_tokens
944
+
945
+
946
+ class SentencepieceTokenizer(object):
947
+ """
948
+ Runs sentencepiece tokenization. Based on transformers.models.albert.tokenization_albert.AlbertTokenizer.
949
+ """
950
+
951
+ def __init__(
952
+ self,
953
+ vocab,
954
+ unk_token,
955
+ do_lower_case=False,
956
+ remove_space=True,
957
+ keep_accents=True,
958
+ sp_model_kwargs: Optional[Dict[str, Any]] = None,
959
+ ):
960
+ self.vocab = vocab
961
+ self.unk_token = unk_token
962
+ self.do_lower_case = do_lower_case
963
+ self.remove_space = remove_space
964
+ self.keep_accents = keep_accents
965
+
966
+ self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
967
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
968
+ self.sp_model.Load(self.vocab)
969
+
970
+ def preprocess_text(self, inputs):
971
+ if self.remove_space:
972
+ outputs = " ".join(inputs.strip().split())
973
+ else:
974
+ outputs = inputs
975
+ outputs = outputs.replace("``", '"').replace("''", '"')
976
+
977
+ if not self.keep_accents:
978
+ outputs = unicodedata.normalize("NFKD", outputs)
979
+ outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
980
+ if self.do_lower_case:
981
+ outputs = outputs.lower()
982
+
983
+ return outputs
984
+
985
+ def tokenize(self, text):
986
+ """
987
+ Tokenizes text by sentencepiece. Based on [SentencePiece](https://github.com/google/sentencepiece).
988
+ Tokenization needs the given vocabulary.
989
+
990
+ Args:
991
+ text: A string to be tokenized.
992
+
993
+ Returns:
994
+ A list of sentencepiece tokens.
995
+ """
996
+ text = self.preprocess_text(text)
997
+ pieces = self.sp_model.encode(text, out_type=str)
998
+ new_pieces = []
999
+ for piece in pieces:
1000
+ if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit():
1001
+ cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, ""))
1002
+ if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
1003
+ if len(cur_pieces[0]) == 1:
1004
+ cur_pieces = cur_pieces[1:]
1005
+ else:
1006
+ cur_pieces[0] = cur_pieces[0][1:]
1007
+ cur_pieces.append(piece[-1])
1008
+ new_pieces.extend(cur_pieces)
1009
+ else:
1010
+ new_pieces.append(piece)
1011
+
1012
+ return new_pieces
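
As an illustration of how the classes defined in this file fit together (this example is not part of the file itself), a minimal sketch instantiates `DistilBertJapaneseTokenizer` directly from local files. It assumes the repository has been downloaded so that `spiece.model` is present, that `fugashi`, `unidic-lite` and `sentencepiece` are installed, and that a `transformers` version close to the one this code targets (4.18 to 4.26) is used:

```python
from distilbert_japanese_tokenizer import DistilBertJapaneseTokenizer

# MeCab (unidic_lite) word segmentation + SentencePiece subwords,
# mirroring the settings in tokenizer_config.json.
tokenizer = DistilBertJapaneseTokenizer(
    vocab_file=None,  # unused when subword_tokenizer_type="sentencepiece"
    spm_file="spiece.model",
    word_tokenizer_type="mecab",
    subword_tokenizer_type="sentencepiece",
    mecab_kwargs={"mecab_dic": "unidic_lite"},
)
print(tokenizer.tokenize("LINE株式会社で自然言語処理の研究・開発をしている。"))
```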
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d8b678bd9ce14363d5e6e3dc53c7dd7c76f8ad11394ccffd200f15e9f6819eb
3
+ size 274894944
spiece.model ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bcfafc8c0662d9c8f39621a64c74260f2ad120310c8dd24886de2dddaf599b4e
3
+ size 439391
tokenizer_config.json ADDED
@@ -0,0 +1,32 @@
1
+ {
2
+ "do_lower_case":true,
3
+ "remove_space":true,
4
+ "keep_accents":true,
5
+ "bos_token": "[CLS]",
6
+ "eos_token": "[SEP]",
7
+ "unk_token": "<unk>",
8
+ "sep_token": "[SEP]",
9
+ "pad_token": "<pad>",
10
+ "cls_token": "[CLS]",
11
+ "mask_token":{
12
+ "content":"[MASK]",
13
+ "single_word":false,
14
+ "lstrip":true,
15
+ "rstrip":false,
16
+ "normalized":false,
17
+ "__type":"AddedToken"
18
+ },
19
+ "tokenize_chinese_chars":false,
20
+ "tokenizer_class": "BertJapaneseTokenizer",
21
+ "word_tokenizer_type": "mecab",
22
+ "subword_tokenizer_type": "sentencepiece",
23
+ "mecab_kwargs": {
24
+ "mecab_dic": "unidic_lite"
25
+ },
26
+ "auto_map": {
27
+ "AutoTokenizer": [
28
+ "distilbert_japanese_tokenizer.DistilBertJapaneseTokenizer",
29
+ null
30
+ ]
31
+ }
32
+ }
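
The `auto_map` entry above is what lets `AutoTokenizer` resolve to the custom class shipped with this repository; a short sketch of the intended loading path (it requires `trust_remote_code=True` so that `distilbert_japanese_tokenizer.py` is allowed to run):

```python
from transformers import AutoTokenizer

# Follows the "auto_map" entry and loads DistilBertJapaneseTokenizer
# from distilbert_japanese_tokenizer.py in this repository.
tokenizer = AutoTokenizer.from_pretrained(
    "line-corporation/line-distilbert-base-japanese", trust_remote_code=True
)
print(type(tokenizer).__name__)  # expected: DistilBertJapaneseTokenizer
```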