SaulLu committed on
Commit b97ef8e
1 Parent(s): 7a12445

update readme

Files changed (1)
  1. README.md +2 -2
README.md CHANGED
@@ -55,7 +55,7 @@ The tokenizer was trained with the [SentencePiece](https://github.com/google/sen
 ```
 import sentencepiece as spm
 config = {
-"input": "./dataset/oscar_bn/oscar_bn.txt,./dataset/wikipedia_bn/wikipedia_bn.txt",
+"input": "./dataset/oscar_bn.txt,./dataset/wikipedia_bn.txt",
 "input_format": "text",
 "model_type": "unigram",
 "vocab_size": 32000,
@@ -86,7 +86,7 @@ config = {
 "eos_piece": "[SEP]",
 "train_extremely_large_corpus": true,
 "split_by_whitespace": true,
-"model_prefix": "./tokenizer_bn/data/oscar_wiki_bn_spm_unigram_4000000_2021_04_21_17_06_50/spiece",
+"model_prefix": "./spiece",
 "input_sentence_size": 4000000,
 "user_defined_symbols": "(,),\",-,.,–,£"
 }
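
For reference, the diff only shows the `config` entries around the two changed lines; the options between the hunks are not visible here. Below is a minimal sketch of how such a config is typically passed to the SentencePiece trainer, assuming the updated input files exist at the listed paths and using Python booleans in place of the JSON-style `true`:

```
# Sketch of running the training with the options visible in this diff.
# Assumptions: the updated input files exist locally, and any options
# hidden between the two hunks of the diff are omitted here.
import sentencepiece as spm

config = {
    "input": "./dataset/oscar_bn.txt,./dataset/wikipedia_bn.txt",
    "input_format": "text",
    "model_type": "unigram",
    "vocab_size": 32000,
    "eos_piece": "[SEP]",
    "train_extremely_large_corpus": True,  # Python bool instead of JSON true
    "split_by_whitespace": True,
    "model_prefix": "./spiece",
    "input_sentence_size": 4000000,
    "user_defined_symbols": "(,),\",-,.,–,£",
}

# The Python wrapper accepts the trainer options as keyword arguments.
spm.SentencePieceTrainer.train(**config)
```

With `model_prefix` set to `./spiece`, training writes `spiece.model` and `spiece.vocab` into the current working directory, matching the simplified path introduced in this commit.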