fix
- README.md +24 -0
- special_tokens_map.json +7 -1
- tokenizer_config.json +6 -0
README.md
CHANGED
@@ -7,3 +7,27 @@ tags:
 - tokenizer
 - sentencepiece
 ---
+
+SentencePiece unigram trained on Japanese
+https://github.com/huggingface/tokenizers
+
+## sample
+
+```
+from transformers import AutoTokenizer
+tokenizer = AutoTokenizer.from_pretrained("if001/sentencepiece_ja", trust_remote_code=True)
+print(tokenizer("hello world"))
+
+>> {'input_ids': [158, 8418, 1427, 15930, 866, 13782, 44, 15034, 1719, 16655, 8, 115, 5, 280, 17635, 94, 818, 2748, 1168, 1114], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
+
+print(tokenizer.tokenize('それは九月初旬のある蒸し暑い晩のことであった。私は、D坂の大通りの中程にある'))
+>> ['それは', '九月', '初', '旬', 'のある', '蒸', 'し', '暑い', '晩', 'のことであった', '。', '私は', '、', 'D', '坂の', '大', '通り', 'の中', '程', 'にある']
+
+```
+
+
+## datasets
+https://huggingface.co/datasets/izumi-lab/wikinews-ja-20230728
+https://huggingface.co/datasets/izumi-lab/wikinews-en-20230728
+https://huggingface.co/datasets/if001/aozorabunko-clean-sin
+
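The README change above only says that this is a SentencePiece-style unigram tokenizer trained on Japanese with the huggingface/tokenizers library, and lists the corpora used. As a rough illustration of what such a training run can look like, here is a minimal sketch; the vocabulary size, the `text` column name, and training on only the wikinews-ja corpus are assumptions, not the actual script behind this repo.

```
# Minimal sketch of unigram training with the `tokenizers` library.
# Assumptions (not from this repo): vocab_size=16000, a "text" column,
# and a single corpus instead of all three listed above.
from datasets import load_dataset
from tokenizers import Tokenizer, models, trainers

corpus = load_dataset("izumi-lab/wikinews-ja-20230728", split="train")

def batch_iterator(batch_size=1000):
    # stream raw text to the trainer in batches
    for i in range(0, len(corpus), batch_size):
        yield corpus[i : i + batch_size]["text"]

tokenizer = Tokenizer(models.Unigram())
trainer = trainers.UnigramTrainer(
    vocab_size=16000,
    special_tokens=["<PAD>", "<BOS>", "<EOS>", "<UNK>", "<MASK>"],
    unk_token="<UNK>",
)
tokenizer.train_from_iterator(batch_iterator(), trainer=trainer)
tokenizer.save("tokenizer.json")
```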
special_tokens_map.json
CHANGED
@@ -1 +1,7 @@
-{
+{
+  "bos_token": "<BOS>",
+  "eos_token": "<EOS>",
+  "mask_token": "<MASK>",
+  "pad_token": "<PAD>",
+  "unk_token": "<UNK>"
+}
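The map above declares the five special tokens. A quick sanity check (a usage sketch, not part of this commit) is to load the tokenizer through transformers and read the corresponding attributes:

```
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("if001/sentencepiece_ja", trust_remote_code=True)

# these attributes mirror special_tokens_map.json
print(tokenizer.bos_token, tokenizer.eos_token)                         # <BOS> <EOS>
print(tokenizer.pad_token, tokenizer.unk_token, tokenizer.mask_token)   # <PAD> <UNK> <MASK>
```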
tokenizer_config.json
CHANGED
@@ -1,4 +1,10 @@
 {
+  "bos_token": "<BOS>",
+  "eos_token": "<EOS>",
+  "mask_token": "<MASK>",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<PAD>",
+  "unk_token": "<UNK>",
   "clean_up_tokenization_spaces": true,
   "tokenizer_class": "SentencePieceJA",
   "auto_map": {
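In the config above, `model_max_length` is the transformers "effectively unlimited" sentinel, and `tokenizer_class` / `auto_map` route loading through the custom `SentencePieceJA` class shipped with the repo, which is why `trust_remote_code=True` is needed. A small sketch, assuming the repo id from the README (the example text and length are arbitrary):

```
from transformers import AutoTokenizer

# auto_map points at the custom SentencePieceJA class bundled with the repo,
# so loading requires trust_remote_code=True
tokenizer = AutoTokenizer.from_pretrained("if001/sentencepiece_ja", trust_remote_code=True)

# model_max_length is the "no practical limit" sentinel from the config,
# so pass an explicit max_length when truncation is wanted
print(tokenizer.model_max_length)
enc = tokenizer("それは九月初旬のある蒸し暑い晩のことであった。", truncation=True, max_length=16)
print(len(enc["input_ids"]))  # at most 16
```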