diff --git "a/train_job_output.txt" "b/train_job_output.txt" new file mode 100644--- /dev/null +++ "b/train_job_output.txt" @@ -0,0 +1,493 @@ +slurm submission log: 2024-05-07 15:08:36.461968 +created following sbatch script: + +############################### + +#!/bin/bash + +#SBATCH --account=nlp +#SBATCH --cpus-per-task=16 +#SBATCH --gres=gpu:2 +#SBATCH --job-name=tthrush-job-3330079 +#SBATCH --mem=400G +#SBATCH --nodelist=sphinx2 +#SBATCH --open-mode=append +#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms/pythia-70m_sciq/train_job_output.txt +#SBATCH --partition=sphinx +#SBATCH --time=14-0 + +# activate your desired anaconda environment +. /nlp/scr/tthrush/miniconda3/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection + +# cd to working directory +cd . + +# launch commands +srun --unbuffered run_as_child_processes 'torchrun --master_port 29505 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/training_data/sciq --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms/pythia-70m_sciq --output_hub_id pythia-70m_sciq --model_id EleutherAI/pythia-70m --num_train_epochs 1 --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2' + +############################### + +submission to slurm complete! + + +############################### +slurm submission output + +Submitted batch job 7543213 + + + +############################### + +############################### +start time: 2024-05-07 19:45:41.371508 +machine: sphinx2 +conda env: pretraining-coreset-selection +############################### +running following processes + + torchrun --master_port 29505 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/training_data/sciq --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms/pythia-70m_sciq --output_hub_id pythia-70m_sciq --model_id EleutherAI/pythia-70m --num_train_epochs 1 --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 + + +############################### +command outputs: + + +[2024-05-07 19:45:47,827] torch.distributed.run: [WARNING] +[2024-05-07 19:45:47,827] torch.distributed.run: [WARNING] ***************************************** +[2024-05-07 19:45:47,827] torch.distributed.run: [WARNING] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
+[2024-05-07 19:45:47,827] torch.distributed.run: [WARNING] *****************************************
+05/07/2024 19:45:51 - INFO - __main__ - Script parameters ScriptArguments(dataset_id='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/training_data/sciq', output_dir='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms/pythia-70m_sciq', output_hub_id='pythia-70m_sciq', hf_hub_token=True, model_id='EleutherAI/pythia-70m', per_device_train_batch_size=256, num_train_epochs=1, learning_rate=0.001, gradient_accumulation_steps=2, from_scratch=True, warmup_ratio=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, weight_decay=0.01, lr_scheduler_type='cosine', local_rank=0, resume_from_checkpoint=False, deepspeed=None, peft=False)
+05/07/2024 19:45:51 - INFO - __main__ - Script parameters ScriptArguments(dataset_id='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/training_data/sciq', output_dir='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms/pythia-70m_sciq', output_hub_id='pythia-70m_sciq', hf_hub_token=True, model_id='EleutherAI/pythia-70m', per_device_train_batch_size=256, num_train_epochs=1, learning_rate=0.001, gradient_accumulation_steps=2, from_scratch=True, warmup_ratio=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, weight_decay=0.01, lr_scheduler_type='cosine', local_rank=0, resume_from_checkpoint=False, deepspeed=None, peft=False)
+  0%|          | 0/10682 [00:00<?, ?it/s]