fix tokenizer
- .gitignore +1 -0
- README.md +2 -2
- tokenizer_config.json +1 -1
.gitignore
ADDED
@@ -0,0 +1 @@
+.DS_Store
README.md
CHANGED
@@ -7,12 +7,12 @@ This repository provides a medium-sized Japanese GPT-2 model trained on [Japanes
 
 # Use the model
 
-*NOTE:* Use `T5Tokenizer` to initiate the tokenizer
+*NOTE:* Use `T5Tokenizer` to initiate the tokenizer with argument `extra_ids=0`.
 
 ~~~~
 from transformers import T5Tokenizer, AutoModelForCausalLM
 
-tokenizer = T5Tokenizer.from_pretrained("rinna/japanese-gpt2-medium")
+tokenizer = T5Tokenizer.from_pretrained("rinna/japanese-gpt2-medium", extra_ids=0)
 
 model = AutoModelForCausalLM.from_pretrained("rinna/japanese-gpt2-medium")
 ~~~~
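For context: `T5Tokenizer` defaults to `extra_ids=100`, which appends sentinel tokens (`<extra_id_0>` through `<extra_id_99>`) to the vocabulary; these have no rows in this GPT-2 checkpoint's embedding matrix, and `extra_ids=0` suppresses them. A minimal sanity-check sketch, assuming `transformers` and `sentencepiece` are installed:

~~~~
# Sanity-check sketch (assumes transformers + sentencepiece are installed).
# With extra_ids=0, T5Tokenizer adds no <extra_id_*> sentinel tokens, so the
# tokenizer length is expected to line up with the model's embedding matrix.
from transformers import T5Tokenizer, AutoModelForCausalLM

tokenizer = T5Tokenizer.from_pretrained("rinna/japanese-gpt2-medium", extra_ids=0)
model = AutoModelForCausalLM.from_pretrained("rinna/japanese-gpt2-medium")

print(len(tokenizer))                               # tokenizer vocabulary size
print(model.get_input_embeddings().num_embeddings)  # embedding rows; expected to match
~~~~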
tokenizer_config.json
CHANGED
@@ -1 +1 @@
-{"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "[PAD]", "additional_special_tokens": [], "bos_token": "<s>", "cls_token": "[CLS]", "sep_token": "[SEP]", "mask_token": "[MASK]", "do_lower_case": true}
+{"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "[PAD]", "additional_special_tokens": [], "bos_token": "<s>", "cls_token": "[CLS]", "sep_token": "[SEP]", "mask_token": "[MASK]", "do_lower_case": true, "extra_ids": 0}
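Because `from_pretrained` merges the keys of `tokenizer_config.json` into the tokenizer's `__init__` kwargs, the config change above should make `extra_ids=0` the default for anyone loading this repo, even without the explicit keyword argument. A small illustrative check (exact behavior may depend on the installed `transformers` version):

~~~~
# After this commit, extra_ids=0 is read from tokenizer_config.json, so the
# explicit keyword argument should no longer be strictly required.
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("rinna/japanese-gpt2-medium")
# No <extra_id_*> sentinels should appear among the special tokens now.
print(tokenizer.additional_special_tokens)  # expected: []
~~~~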