update readme
README.md
CHANGED
@@ -55,7 +55,7 @@ The tokenizer was trained with the [SentencePiece](https://github.com/google/sen
 ```
 import sentencepiece as spm
 config = {
-    "input": "./dataset/oscar_bn
+    "input": "./dataset/oscar_bn.txt,./dataset/wikipedia_bn.txt",
     "input_format": "text",
     "model_type": "unigram",
     "vocab_size": 32000,
@@ -86,7 +86,7 @@ config = {
     "eos_piece": "[SEP]",
     "train_extremely_large_corpus": true,
     "split_by_whitespace": true,
-    "model_prefix": "./
+    "model_prefix": "./spiece",
     "input_sentence_size": 4000000,
     "user_defined_symbols": "(,),\",-,.,–,£"
 }
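For context, a minimal sketch of how a config dict like the one in the README is typically consumed. This is not part of the commit; it assumes a recent `sentencepiece` release where `SentencePieceTrainer.train` accepts keyword arguments, and the paths and `spiece` prefix are taken from the README snippet above.

```python
# Sketch: pass the training options from the README's config dict to the
# SentencePiece trainer, then load the resulting model for a quick check.
import sentencepiece as spm

config = {
    "input": "./dataset/oscar_bn.txt,./dataset/wikipedia_bn.txt",  # comma-separated corpora
    "input_format": "text",
    "model_type": "unigram",
    "vocab_size": 32000,
    "model_prefix": "./spiece",        # writes ./spiece.model and ./spiece.vocab
    "input_sentence_size": 4000000,    # subset of other options shown in the README
}

# SentencePieceTrainer.train accepts the options as keyword arguments.
spm.SentencePieceTrainer.train(**config)

# Load the trained model and tokenize a sample sentence (illustrative only).
sp = spm.SentencePieceProcessor(model_file="./spiece.model")
print(sp.encode("example text", out_type=str))
```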