#!/usr/bin/env bash
#
# Launch Flax streaming MLM pre-training for a Portuguese RoBERTa-base
# model on the merged BrWaC + OSCAR-pt dataset.
#
# Requires: cfg_.config defining BASE_DIR and DATASET_DIR (see below).
# NOTE: the shebang must be the very first line of the file — a leading
# blank line makes the kernel fall back to the default interpreter.

set -euo pipefail

# Pull in BASE_DIR / DATASET_DIR. Explicit ./ path so we source the file
# next to the script instead of whatever a PATH lookup finds.
# shellcheck source=cfg_.config
. ./cfg_.config
|
|
|
|
|
|
|
|
|
|
|
# Paths derived from the sourced config.
readonly MODEL_DIR="$BASE_DIR/portuguese-roberta-base-config"
readonly MODEL_OUT="$BASE_DIR/models-output/training-id-roberta-base-brwac-oscar-merged"
readonly DATA="$DATASET_DIR/merged_brwac-all_oscarpt"

# Parallelism / batch geometry.
readonly DEVICES=8
readonly PER_DEVICE_BATCH=256
# Shell arithmetic instead of the legacy `expr` external command.
readonly TOTAL_BATCH=$((DEVICES * PER_DEVICE_BATCH))

# Number of examples in the on-disk dataset; needed to convert the fixed
# step budget into epochs. Assignment kept separate from `readonly` so a
# failing python invocation is not masked (and aborts under `set -e`).
DATASET_SIZE=$(python -c "
from datasets import load_from_disk
dataset = load_from_disk('${DATA}')
print(dataset.num_rows)")
readonly DATASET_SIZE

# Fixed optimization budget. EPOCHS = TOTAL_STEPS / steps-per-epoch, then
# NUM_TRAINING_STEPS rounds that back to a whole number of steps
# (approximately TOTAL_STEPS, modulo float rounding).
readonly TOTAL_STEPS=100000
EPOCHS=$(python -c "print(${TOTAL_STEPS} / (${DATASET_SIZE} // ${TOTAL_BATCH}))")
readonly EPOCHS
NUM_TRAINING_STEPS=$(python -c "print(round((${DATASET_SIZE} // ${TOTAL_BATCH}) * ${EPOCHS}))")
readonly NUM_TRAINING_STEPS

echo "MAX_STEPS = ${NUM_TRAINING_STEPS}"
|
|
|
|
|
# Kick off the Flax streaming MLM trainer. All expansions are quoted so
# paths containing spaces survive word-splitting. The final argument has
# no trailing backslash: a dangling continuation at end-of-command is
# fragile (it silently joins whatever line follows).
python ./src/run_mlm_flax_stream.py \
  --output_dir "${MODEL_OUT}" \
  --model_type roberta \
  --config_name "${MODEL_DIR}" \
  --tokenizer_name "${MODEL_DIR}" \
  --model_name_or_path "${BASE_DIR}/roberta-base-config" \
  --dataset_name brwac_oscar_pt \
  --dataset_path "${DATA}" \
  --max_seq_length 128 \
  --pad_to_max_length \
  --per_device_train_batch_size "${PER_DEVICE_BATCH}" \
  --per_device_eval_batch_size "${PER_DEVICE_BATCH}" \
  --weight_decay 0.01 \
  --warmup_steps 24000 \
  --overwrite_output_dir \
  --adam_beta1 0.9 \
  --adam_beta2 0.98 \
  --adam_epsilon 1e-6 \
  --learning_rate 6e-4 \
  --num_train_steps "${NUM_TRAINING_STEPS}" \
  --num_eval_samples 5000 \
  --save_step 1000 \
  --logging_steps 500 \
  --eval_steps 1000 \
  --dtype bfloat16