#!/usr/bin/env bash
# Launcher: train a GPT-2 tokenizer on the deduplicated Persian (fa)
# split of the OSCAR corpus via src/train_tokenizer.py.
#
# Fix: stripped the trailing " | |" markdown-table residue that made every
# line a shell syntax error (a bare `|` pipes into an empty command).
set -euo pipefail

# Force a UTF-8, locale-independent environment for reproducible text handling.
export LC_ALL=C.UTF-8
export LANG=C.UTF-8

# Where the trained tokenizer artifacts are written.
export OUTPUT_DIR=/home/saied/code/gpt2-medium-persian
# Hugging Face datasets identifier + config name.
export DATASET_NAME=oscar
export DATASET_CONFIG_NAME=unshuffled_deduplicated_fa
export VOCAB_SIZE=50000
export MIN_FREQUENCY=2
# Comma-separated special-token list (effective value is identical to the
# original's concatenated quoted fragments). [U1]..[U20] look like reserved
# placeholder tokens — NOTE(review): confirm against train_tokenizer.py.
export SPECIAL_TOKENS='<s>,<pad>,</s>,<unk>,<mask>,<|endoftext|>,<|startoftext|>,<sep>,<cls>,<nl>,<tab>,<zwnj>,[U1],[U2],[U3],[U4],[U5],[U6],[U7],[U8],[U9],[U10],[U11],[U12],[U13],[U14],[U15],[U16],[U17],[U18],[U19],[U20]'

# All expansions quoted (numeric ones too) so empty/edited values can never
# word-split the argument list.
python src/train_tokenizer.py \
  --output_dir="$OUTPUT_DIR" \
  --dataset_name="$DATASET_NAME" \
  --dataset_config_name="$DATASET_CONFIG_NAME" \
  --vocab_size="$VOCAB_SIZE" \
  --min_frequency="$MIN_FREQUENCY" \
  --special_tokens="$SPECIAL_TOKENS"