#!/usr/bin/env bash
#
# Train a tokenizer in one of two stages by invoking `python -m train_tokenizer`.
#
# Usage:
#   <script> <dataset_name> <vocab_size> <corpus_dir> <stage> \
#            [phase1_tokenizer_dir] [num_inherit_merges]
#
# Arguments:
#   dataset_name          Prefix of the output directory name.
#   vocab_size            Positive integer, forwarded as --vocab_size.
#   corpus_dir            Forwarded as --corpus_dir in stage 1. Required by the
#                         argument check in both stages, but not used in stage 2.
#   stage                 1 or 2.
#   phase1_tokenizer_dir  Stage 2 only: existing directory that contains
#                         merges.txt and meta.json.
#   num_inherit_merges    Stage 2 only: number of leading lines of
#                         <phase1_tokenizer_dir>/merges.txt copied into the
#                         stage 2 directory. Missing or non-numeric values
#                         fall back to 0 with a warning.
#
# Output directories (relative to the current working directory):
#   stage 1: tokenizer_json/<dataset_name>_vocab<size>_stage1
#   stage 2: tokenizer_json/<dataset_name>_vocab<size>_from<merges>_stage2
#   <size> and <merges> are abbreviated with K/M/G suffixes (integer division).
#
# Exit status: 1 on invalid arguments or failed preparation steps; otherwise
# the exit status of the python command.

set -euo pipefail

# Print a message to stderr and terminate the script with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

#######################################
# Abbreviate a non-negative integer for use in directory names.
# Arguments: $1 - string of decimal digits (leading zeros allowed)
# Outputs:   32000 -> 32K, 2500000 -> 2M, 3000000000 -> 3G, 999 -> 999
#            (integer division, so the remainder is dropped)
#######################################
format_count() {
  # Force base 10: a bare leading zero would be parsed as octal by $(( )).
  local -r count=$(( 10#$1 ))
  if (( count >= 1000000000 )); then
    printf '%sG\n' "$(( count / 1000000000 ))"
  elif (( count >= 1000000 )); then
    printf '%sM\n' "$(( count / 1000000 ))"
  elif (( count >= 1000 )); then
    printf '%sK\n' "$(( count / 1000 ))"
  else
    printf '%s\n' "$count"
  fi
}

main() {
  local dataset_name=${1:-}
  local vocab_size=${2:-}
  local corpus_dir=${3:-}
  local stage=${4:-}
  local phase1_tokenizer_dir=${5:-}
  local num_inherit_merges=${6:-}

  # The first four arguments are mandatory for both stages.
  if [[ -z "$dataset_name" || -z "$vocab_size" || -z "$corpus_dir" || -z "$stage" ]]; then
    printf '%s\n' "Error: Missing required arguments." >&2
    die "Usage: $0 <dataset_name> <vocab_size> <corpus_dir> <stage> <phase1_tokenizer_dir> <num_inherit_merges>"
  fi

  # Digits only, and strictly greater than zero.
  if ! [[ "$vocab_size" =~ ^[0-9]+$ ]] || (( 10#$vocab_size <= 0 )); then
    die "Error: Vocab size must be a positive integer."
  fi
  # Strip leading zeros so later arithmetic never sees an octal literal.
  vocab_size=$(( 10#$vocab_size ))

  # After this check the stage is guaranteed to be exactly 1 or 2.
  case "$stage" in
    1 | 2) ;;
    *) die "Error: Invalid stage. Please specify either 1 or 2." ;;
  esac

  local vocab_size_str
  vocab_size_str=$(format_count "$vocab_size")

  if [[ "$stage" == 1 ]]; then
    local tokenizer_dir="tokenizer_json/${dataset_name}_vocab${vocab_size_str}_stage${stage}"
    echo "Phase 1 tokenizer training: $tokenizer_dir"
    python -m train_tokenizer \
      --output_dir "$tokenizer_dir" \
      --corpus_dir "$corpus_dir" \
      --vocab_size "$vocab_size" \
      --do_whitespace_pretokenization true
    return
  fi

  # ---- Stage 2 ----

  # Best-effort default: an absent or malformed merge count becomes 0.
  if ! [[ "$num_inherit_merges" =~ ^[0-9]+$ ]]; then
    printf '%s\n' "Warning: num_inherit_merges is invalid or missing. Defaulting to 0." >&2
    num_inherit_merges=0
  fi
  num_inherit_merges=$(( 10#$num_inherit_merges ))

  if [[ ! -d "$phase1_tokenizer_dir" ]]; then
    die "Error: Phase 1 tokenizer directory '$phase1_tokenizer_dir' not found!"
  fi
  # Both files are copied below; check them before creating anything so a
  # missing file cannot leave a half-initialised stage 2 directory behind.
  local required
  for required in merges.txt meta.json; do
    [[ -f "$phase1_tokenizer_dir/$required" ]] \
      || die "Error: '$phase1_tokenizer_dir/$required' not found!"
  done

  local num_inherit_merges_str
  num_inherit_merges_str=$(format_count "$num_inherit_merges")

  local phase2_tokenizer_dir="tokenizer_json/${dataset_name}_vocab${vocab_size_str}_from${num_inherit_merges_str}_stage${stage}"
  echo "Phase 2 tokenizer training: $phase2_tokenizer_dir"

  # Seed the stage 2 directory with the first N lines of the stage 1 merges
  # file and a copy of its meta.json.
  mkdir -p -- "$phase2_tokenizer_dir" \
    || die "Error: could not create '$phase2_tokenizer_dir'."
  head -n "$num_inherit_merges" -- "$phase1_tokenizer_dir/merges.txt" \
    > "$phase2_tokenizer_dir/merges.txt" \
    || die "Error: could not write '$phase2_tokenizer_dir/merges.txt'."
  cp -- "$phase1_tokenizer_dir/meta.json" "$phase2_tokenizer_dir/meta.json" \
    || die "Error: could not copy meta.json into '$phase2_tokenizer_dir'."

  # NOTE(review): no --corpus_dir is passed in stage 2; presumably
  # train_tokenizer locates its input via the copied meta.json - confirm.
  python -m train_tokenizer \
    --output_dir "$phase2_tokenizer_dir" \
    --vocab_size "$vocab_size" \
    --do_whitespace_pretokenization false
}

main "$@"