update readme
README.md
CHANGED
@@ -55,7 +55,7 @@ The tokenizer was trained with the [SentencePiece](https://github.com/google/sen
 ```
 import sentencepiece as spm
 config = {
-    "input": "./dataset/oscar_bn
+    "input": "./dataset/oscar_bn.txt,./dataset/wikipedia_bn.txt",
     "input_format": "text",
     "model_type": "unigram",
     "vocab_size": 32000,
@@ -86,7 +86,7 @@ config = {
     "eos_piece": "[SEP]",
     "train_extremely_large_corpus": true,
     "split_by_whitespace": true,
-    "model_prefix": "./
+    "model_prefix": "./spiece",
     "input_sentence_size": 4000000,
     "user_defined_symbols": "(,),\",-,.,–,£"
 }
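For context, a minimal sketch of how a config dict like the one in the README is typically consumed. This is not part of the commit; it assumes a recent `sentencepiece` release where `SentencePieceTrainer.train` accepts keyword arguments, and the paths and `spiece` prefix are taken from the README snippet above.

```python
# Sketch: pass the training options from the README's config dict to the
# SentencePiece trainer, then load the resulting model for a quick check.
import sentencepiece as spm

config = {
    "input": "./dataset/oscar_bn.txt,./dataset/wikipedia_bn.txt",  # comma-separated corpora
    "input_format": "text",
    "model_type": "unigram",
    "vocab_size": 32000,
    "model_prefix": "./spiece",        # writes ./spiece.model and ./spiece.vocab
    "input_sentence_size": 4000000,    # subset of other options shown in the README
}

# SentencePieceTrainer.train accepts the options as keyword arguments.
spm.SentencePieceTrainer.train(**config)

# Load the trained model and tokenize a sample sentence (illustrative only).
sp = spm.SentencePieceProcessor(model_file="./spiece.model")
print(sp.encode("example text", out_type=str))
```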