#!/usr/bin/env bash
# Load directory variables from the local config file.
. cfg_.config
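# cfg_.config is expected to define BASE_DIR and DATASET_DIR, which are used
# below but never set in this script. A hypothetical example (paths are
# placeholders, not from the original repo):
#   BASE_DIR=/path/to/portuguese-roberta-base
#   DATASET_DIR=/path/to/datasets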
#MODEL_DIR=/home/diiogofernands/hub/portuguese-roberta-base/portuguese-roberta-base-config
#MODEL_OUT=/home/diiogofernands/hub/portuguese-roberta-base/model_pretrained
#DATA=/home/diiogofernands/extracted/brwac
MODEL_DIR=$BASE_DIR/portuguese-roberta-base-config
MODEL_OUT=$BASE_DIR/models-output/training-id-roberta-base-brwac-oscar-merged
DATA=$DATASET_DIR/merged_brwac-all_oscarpt
DEVICES=8 # TPU chips
PER_DEVICE_BATCH=256
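# Effective global batch size: 8 devices * 256 sequences = 2048 sequences per step.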
TOTAL_BATCH=$((DEVICES * PER_DEVICE_BATCH))
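# Count the examples in the saved dataset with an inline Python call
# (requires the Hugging Face `datasets` library on the host).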
DATASET_SIZE=$(python -c "
from datasets import load_from_disk
dataset = load_from_disk('${DATA}')
print(dataset.num_rows)")
TOTAL_STEPS=100000
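# steps_per_epoch = DATASET_SIZE // TOTAL_BATCH; EPOCHS is chosen so that
# EPOCHS * steps_per_epoch is approximately TOTAL_STEPS (100k update steps).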
EPOCHS=$(python -c "print($TOTAL_STEPS/(${DATASET_SIZE} // ${TOTAL_BATCH}))")
NUM_TRAINING_STEPS=$(python -c "print(round((${DATASET_SIZE} // ${TOTAL_BATCH}) * ${EPOCHS}))")
echo "MAX_STEPS = ${NUM_TRAINING_STEPS}"
#--model_config_name $MODEL_DIR
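# The Adam settings below (beta2=0.98, epsilon=1e-6) follow common RoBERTa
# pretraining practice; the peak learning rate and warmup length are this
# script's own choices for the 2048-sequence global batch.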
python ./src/run_mlm_flax_stream.py \
--output_dir ${MODEL_OUT} \
--model_type roberta \
--config_name $MODEL_DIR \
--tokenizer_name ${MODEL_DIR} \
--model_name_or_path $BASE_DIR/roberta-base-config \
--dataset_name brwac_oscar_pt \
--dataset_path $DATA \
--max_seq_length 128 \
--pad_to_max_length \
--per_device_train_batch_size $PER_DEVICE_BATCH \
--per_device_eval_batch_size $PER_DEVICE_BATCH \
--weight_decay 0.01 \
--warmup_steps 24000 \
--overwrite_output_dir \
--adam_beta1 0.9 \
--adam_beta2 0.98 \
--adam_epsilon 1e-6 \
--learning_rate 6e-4 \
--num_train_steps $NUM_TRAINING_STEPS \
--num_eval_samples 5000 \
--save_step 1000 \
--logging_steps 500 \
--eval_steps 1000 \
--dtype bfloat16
#--push_to_hub