|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Work out which node of the SLURM allocation hosts the rendezvous and look up
# its IP address.
# Reads:   SLURM_JOB_NODELIST
# Sets:    nodes, nodes_array, head_node, head_node_ip
# Exports: LOGLEVEL

# Expand the compact SLURM node list into individual hostnames. Word splitting
# of the command output is intentional here (one hostname per word).
# shellcheck disable=SC2207
nodes=( $(scontrol show hostnames "$SLURM_JOB_NODELIST") )

# Copy every entry; a bare $nodes would expand to the first element only.
nodes_array=( "${nodes[@]}" )

head_node=${nodes_array[0]:-}
if [[ -z "$head_node" ]]; then
  echo "error: no hostnames resolved from SLURM_JOB_NODELIST='${SLURM_JOB_NODELIST:-}'" >&2
  exit 1
fi

# Ask the head node itself for its address by running a single task on it.
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)

# hostname --ip-address may print several space-separated addresses; keep the
# first so the value stays a single word when used as host:port later.
# NOTE(review): if the first address is not the one the other nodes can reach,
# select the interface explicitly instead.
head_node_ip=${head_node_ip%% *}
if [[ -z "$head_node_ip" ]]; then
  echo "error: could not determine the IP address of head node '$head_node'" >&2
  exit 1
fi

echo "Node IP: $head_node_ip"

# Exported so that every process started below inherits it (presumably read by
# the torch distributed launcher to set its log verbosity).
export LOGLEVEL=INFO
|
|
|
|
|
|
|
# Prepare the Python environment for the job.
# /home/.bashrc is sourced first -- presumably it initialises conda for this
# shell (TODO confirm). A failure to source it is not treated as fatal here;
# the activation check below catches the case where conda is unusable.
source /home/.bashrc

# Stop if the environment cannot be activated, rather than launching the
# multi-node job with whatever Python happens to be on PATH.
if ! conda activate smi-ted-env; then
  echo "error: failed to activate conda environment 'smi-ted-env'" >&2
  exit 1
fi
|
|
|
|
|
# Launch distributed training: srun starts torchrun, and each torchrun instance
# joins the c10d rendezvous hosted at $head_node_ip on port 29500.
#
# Arguments are held in arrays instead of a chain of line continuations: a
# continuation chain cannot carry comments, and a trailing backslash on its last
# line would silently swallow whatever line follows it.

# Options consumed by torchrun itself.
launcher_args=(
  # NOTE(review): presumably must equal the number of nodes in the SLURM
  # allocation -- confirm against the job's #SBATCH settings.
  --nnodes 6
  --nproc_per_node 4
  # $RANDOM is expanded once in this shell, before srun runs, so every launched
  # torchrun receives the same rendezvous id.
  --rdzv_id "$RANDOM"
  --rdzv_backend c10d
  --rdzv_endpoint "${head_node_ip}:29500"
)

# Options passed through to train_model_ED.py.
training_args=(
  --device cuda
  --n_batch 288
  --n_layer 12
  --n_head 12
  --n_embd 768
  --max_len 202
  --d_dropout 0.2
  --lr_start 3e-5
  --lr_multiplier 4
  --lr_decoder 3e-5
  --n_workers 12
  --max_epochs 51
  --gpu -1
  --num_nodes 1
  --num_feats 32
  --root_dir .
  --checkpoint_every 10000
  --grad_acc 1
  --train_load 'pubchem'
  --smi_ted_version 'v1'
  --data_root './pubchem/pubchem_rd-canonical_smiles.smi'
  --save_checkpoint_path './light_checkpoints'
  # Empty string is passed on purpose as an explicit (empty) argument value.
  --load_checkpoint_path ''
  --rotate
  --debug
  --model_arch 'BERT__both_rotate'
)

srun torchrun "${launcher_args[@]}" train_model_ED.py "${training_args[@]}"