#!/usr/bin/bash

# Load directory variables (BASE_DIR, DATASET_DIR).
. cfg_.config

MODEL_DIR=$BASE_DIR/portuguese-roberta-base-config
MODEL_OUT=$BASE_DIR/models-output/training-id-roberta-base-brwac-oscar-merged
DATA=$DATASET_DIR/merged_brwac-all_oscarpt

DEVICES=8  # TPU chips
PER_DEVICE_BATCH=256
TOTAL_BATCH=$((DEVICES * PER_DEVICE_BATCH))

# Number of training examples in the merged BrWaC + OSCAR-pt dataset.
# Assumes ${DATA} was saved as a single Dataset, so num_rows is an int
# (for a DatasetDict, num_rows would be a dict and the arithmetic below fails).
DATASET_SIZE=$(python -c "
from datasets import load_from_disk
dataset = load_from_disk('${DATA}')
print(dataset.num_rows)
")

# Derive the epoch count from the step budget, then convert back to a whole
# number of steps, so NUM_TRAINING_STEPS lands on (roughly) TOTAL_STEPS.
# (A worked numeric example is at the bottom of this file.)
TOTAL_STEPS=100000
EPOCHS=$(python -c "print(${TOTAL_STEPS} / (${DATASET_SIZE} // ${TOTAL_BATCH}))")
NUM_TRAINING_STEPS=$(python -c "print(round((${DATASET_SIZE} // ${TOTAL_BATCH}) * ${EPOCHS}))")
echo "MAX_STEPS = ${NUM_TRAINING_STEPS}"

# --model_config_name $MODEL_DIR
python ./src/run_mlm_flax_stream.py \
    --output_dir "${MODEL_OUT}" \
    --model_type roberta \
    --config_name "${MODEL_DIR}" \
    --tokenizer_name "${MODEL_DIR}" \
    --model_name_or_path "${BASE_DIR}/roberta-base-config" \
    --dataset_name brwac_oscar_pt \
    --dataset_path "${DATA}" \
    --max_seq_length 128 \
    --pad_to_max_length \
    --per_device_train_batch_size ${PER_DEVICE_BATCH} \
    --per_device_eval_batch_size ${PER_DEVICE_BATCH} \
    --weight_decay 0.01 \
    --warmup_steps 24000 \
    --overwrite_output_dir \
    --adam_beta1 0.9 \
    --adam_beta2 0.98 \
    --adam_epsilon 1e-6 \
    --learning_rate 6e-4 \
    --num_train_steps ${NUM_TRAINING_STEPS} \
    --num_eval_samples 5000 \
    --save_steps 1000 \
    --logging_steps 500 \
    --eval_steps 1000 \
    --dtype bfloat16
    # --push_to_hub
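
# Worked example of the step arithmetic above (illustrative numbers only, not
# measured from the merged dataset): with DEVICES=8 and PER_DEVICE_BATCH=256,
# TOTAL_BATCH = 8 * 256 = 2048. For a hypothetical DATASET_SIZE of 20,480,000
# rows, steps per epoch = 20480000 // 2048 = 10000, so
# EPOCHS = 100000 / 10000 = 10.0 and NUM_TRAINING_STEPS = round(10000 * 10.0)
# = 100000, i.e. the round-trip reproduces TOTAL_STEPS up to the rounding
# introduced by the integer division.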