import sentencepiece as spm

# Training configuration
input_file = "output.txt"            # training corpus, one sentence per line
model_prefix = "my_tokenizer"        # writes my_tokenizer.model and my_tokenizer.vocab
vocab_size = 18816
model_type = "word"                  # word-level model; expects whitespace-delimited input
input_sentence_size = 1000000        # sample at most this many sentences for training
shuffle_input_sentence = True        # shuffle the corpus before sampling

# Special token pieces, pinned to fixed ids (0-3) in the trainer call below
pad_token = '<pad>'
bos_token = '<start>'
eos_token = '<end>'
unk_token = '<unk>'

# Train the tokenizer; keyword arguments map directly to SentencePiece trainer flags
spm.SentencePieceTrainer.train(
    input=input_file,
    model_prefix=model_prefix,
    vocab_size=vocab_size,
    model_type=model_type,
    input_sentence_size=input_sentence_size,
    shuffle_input_sentence=shuffle_input_sentence,
    max_sentence_length=40000,
    pad_id=0, pad_piece=pad_token,
    unk_id=1, unk_piece=unk_token,
    bos_id=2, bos_piece=bos_token,
    eos_id=3, eos_piece=eos_token,
)

# Load the trained model for encoding/decoding
tokenizer = spm.SentencePieceProcessor(model_file=f"{model_prefix}.model")
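
# Minimal usage sketch (assumes the training step above has finished and produced
# my_tokenizer.model; the sample sentence below is illustrative only):
sample = "this is a test sentence"
ids = tokenizer.encode(sample, out_type=int)     # token ids
pieces = tokenizer.encode(sample, out_type=str)  # token pieces
print(pieces)
print(tokenizer.decode(ids))                     # round-trip back to text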