File size: 785 Bytes
31bf2aa
 
 
 
 
c36ebf7
31bf2aa
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
#!/bin/bash
# Launcher for tokenizer training on the OSCAR Persian corpus
# (unshuffled_deduplicated_fa), writing output to a gpt2-medium-persian
# directory. All configuration is passed via exported env vars below and
# forwarded as CLI flags to src/train_tokenizer.py.
#
# NOTE(review): assumes it is run from the repository root so that the
# relative path src/train_tokenizer.py resolves — confirm against CI/docs.

# Fail fast: abort on any command failure, unset variable, or pipeline error.
set -euo pipefail

# Force a C.UTF-8 locale so text processing is byte-deterministic and
# independent of the host machine's locale settings.
export LC_ALL=C.UTF-8
export LANG=C.UTF-8

export OUTPUT_DIR=/home/saied/code/gpt2-medium-persian
export DATASET_NAME=oscar
export DATASET_CONFIG_NAME=unshuffled_deduplicated_fa
export VOCAB_SIZE=50000
export MIN_FREQUENCY=2
# The shell strips the single quotes, so this exports one comma-joined string:
#   <s>,<pad>,</s>,...,[U20]
# It includes GPT-2-style control tokens, <zwnj> (Persian zero-width
# non-joiner), and [U1]..[U20] reserved/unused token slots.
export SPECIAL_TOKENS='<s>','<pad>','</s>','<unk>','<mask>','<|endoftext|>','<|startoftext|>','<sep>','<cls>','<nl>','<tab>','<zwnj>','[U1]','[U2]','[U3]','[U4]','[U5]','[U6]','[U7]','[U8]','[U9]','[U10]','[U11]','[U12]','[U13]','[U14]','[U15]','[U16]','[U17]','[U18]','[U19]','[U20]'

# All expansions are quoted (SC2086) — values are single tokens today, but
# quoting keeps the command line correct if any value ever gains spaces.
python src/train_tokenizer.py \
    --output_dir="$OUTPUT_DIR" \
    --dataset_name="$DATASET_NAME" \
    --dataset_config_name="$DATASET_CONFIG_NAME" \
    --vocab_size="$VOCAB_SIZE" \
    --min_frequency="$MIN_FREQUENCY" \
    --special_tokens="$SPECIAL_TOKENS"