ul2-small-nl16-finnish / train_sentencepiece.py
aapot
Add UL2 code
8fdc728
raw
history blame
513 Bytes
import sentencepiece as spm
# Train a SentencePiece tokenizer on the sentence-per-line training corpus.
# The trainer writes its output files using the "spiece" prefix.
spm.SentencePieceTrainer.train(
    input="/researchdisk/training_dataset_sentences/train.txt",
    model_prefix="spiece",
    vocab_size=32000,
    # 1.0 asks the trainer to cover every character seen in the corpus.
    character_coverage=1.0,
    # Special-token ids: pad=0, eos=1, unk=2. Per the SentencePiece
    # training options, an id of -1 disables that token (here: BOS).
    pad_id=0,
    unk_id=2,
    eos_id=1,
    bos_id=-1,
    # Reserved as whole, never-split pieces. NOTE(review): presumably the
    # UL2 mode tokens -- confirm against the model's preprocessing code.
    user_defined_symbols=["[NLU]", "[NLG]", "[S2S]"],
    train_extremely_large_corpus=True,
    num_threads=96,
    # Cap training at 50M sentences; shuffling makes the trainer sample
    # them from the whole input rather than take the first ones.
    input_sentence_size=50000000,
    shuffle_input_sentence=True,
)