fix
- README.md +7 -0
- sentencepiece_ja.py +1 -7
README.md
CHANGED
@@ -31,3 +31,10 @@ https://huggingface.co/datasets/izumi-lab/wikinews-ja-20230728
 https://huggingface.co/datasets/izumi-lab/wikinews-en-20230728
 https://huggingface.co/datasets/if001/aozorabunko-clean-sin
 
+
+
+## settings
+```
+all_special_ids: [1, 2, 3, 0, 4]
+all_special_tokens: ['<BOS>', '<EOS>', '<UNK>', '<PAD>', '<MASK>']
+```
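As a quick sanity check, the values in the README's new "settings" section can be read back from the tokenizer itself. A minimal sketch, assuming the `SentencePieceJA` class from this repo and a hypothetical tokenizer file name (`tokenizer.json`):

```python
# Hypothetical check of the special-token settings listed in the README.
# "tokenizer.json" is an assumed file name; point it at the actual model file.
from sentencepiece_ja import SentencePieceJA

tok = SentencePieceJA(model_path="tokenizer.json")
print(tok.all_special_ids)     # expected per the README: [1, 2, 3, 0, 4]
print(tok.all_special_tokens)  # expected per the README: ['<BOS>', '<EOS>', '<UNK>', '<PAD>', '<MASK>']
```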
sentencepiece_ja.py
CHANGED
@@ -14,13 +14,7 @@ class SentencePieceJA(PreTrainedTokenizer):
                  **kwargs):
         from tokenizers import Tokenizer
         self._tokenizer = Tokenizer.from_file(model_path)
-        super().__init__(
-            pad_token=pad,
-            bos_token=bos,
-            eos_token=eos,
-            unk_token=unk,
-            mask_token=mask,
-            **kwargs)
+        super().__init__(**kwargs)
         self.add_special_tokens({
             'pad_token': pad,
             'bos_token': bos,
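For context, a minimal sketch of what the constructor looks like after this change: the special-token kwargs are no longer forwarded to `super().__init__`, which now only receives `**kwargs`, and the tokens are registered afterwards via `add_special_tokens`. Parameter names, defaults, and the tail of the dict (the diff excerpt cuts off after `'bos_token'`) are assumptions, not taken verbatim from the file:

```python
from transformers import PreTrainedTokenizer


class SentencePieceJA(PreTrainedTokenizer):
    def __init__(self, model_path,
                 pad='<PAD>', bos='<BOS>', eos='<EOS>',
                 unk='<UNK>', mask='<MASK>',  # assumed defaults, not shown in the diff
                 **kwargs):
        from tokenizers import Tokenizer
        self._tokenizer = Tokenizer.from_file(model_path)
        # special-token kwargs are no longer passed to the base-class constructor
        super().__init__(**kwargs)
        # they are registered explicitly after initialization instead
        self.add_special_tokens({
            'pad_token': pad,
            'bos_token': bos,
            'eos_token': eos,   # assumed continuation; the visible diff ends above
            'unk_token': unk,
            'mask_token': mask,
        })
```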