#!/bin/bash
# Train a tokenizer for gpt2-medium-persian on the Persian OSCAR corpus
# (unshuffled_deduplicated_fa) by driving src/train_tokenizer.py.
# Required layout: run from the repo root so src/train_tokenizer.py resolves.
set -euo pipefail

# Force a UTF-8 C locale so byte-wise tool behavior is reproducible and
# Persian text is decoded consistently.
export LC_ALL=C.UTF-8
export LANG=C.UTF-8

export OUTPUT_DIR=/home/saied/code/gpt2-medium-persian
export DATASET_NAME=oscar
export DATASET_CONFIG_NAME=unshuffled_deduplicated_fa
export VOCAB_SIZE=50000
export MIN_FREQUENCY=2
# NOTE(review): the repeated '' entries expand to empty items in this
# comma-separated list (the value starts ",,,,,<|endoftext|>,...").
# They look like angle-bracket special tokens (<s>, <pad>, <unk>, ...)
# that were stripped by an HTML-unaware copy/paste — confirm the intended
# token list against what train_tokenizer.py expects before relying on it.
# Value is preserved exactly as found.
export SPECIAL_TOKENS='','','','','','<|endoftext|>','<|startoftext|>','','','','','','[U1]','[U2]','[U3]','[U4]','[U5]','[U6]','[U7]','[U8]','[U9]','[U10]','[U11]','[U12]','[U13]','[U14]','[U15]','[U16]','[U17]','[U18]','[U19]','[U20]'

# All expansions quoted (SC2086); numeric values quoted too for consistency.
python src/train_tokenizer.py \
  --output_dir="$OUTPUT_DIR" \
  --dataset_name="$DATASET_NAME" \
  --dataset_config_name="$DATASET_CONFIG_NAME" \
  --vocab_size="$VOCAB_SIZE" \
  --min_frequency="$MIN_FREQUENCY" \
  --special_tokens="$SPECIAL_TOKENS"