diff --git "a/train_job_output.txt" "b/train_job_output.txt" --- "a/train_job_output.txt" +++ "b/train_job_output.txt" @@ -1,4 +1,4 @@ -slurm submission log: 2024-05-20 23:23:25.400328 +slurm submission log: 2024-05-22 17:07:24.872646 created following sbatch script: ############################### @@ -7,13 +7,419 @@ created following sbatch script: #SBATCH --account=nlp #SBATCH --cpus-per-task=16 -#SBATCH --dependency=afterok:7637741 +#SBATCH --dependency=afterok:7642739 #SBATCH --gres=gpu:2 -#SBATCH --job-name=tthrush-job-390299 +#SBATCH --job-name=tthrush-job-2024328 #SBATCH --mem=100G #SBATCH --nodelist=sphinx2 #SBATCH --open-mode=append -#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_clipped_scaled/llms/pythia-70m_sciq_1/train_job_output.txt +#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/llms/pythia-70m_sciq_1/train_job_output.txt +#SBATCH --partition=sphinx +#SBATCH --time=14-0 + +# activate your desired anaconda environment +. /nlp/scr/tthrush/miniconda3/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection + +# cd to working directory +cd . + +# launch commands +srun --unbuffered run_as_child_processes 'torchrun --master_port 29524 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/data/sciq --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/llms/pythia-70m_sciq_1 --output_hub_id pythia-70m_sciq --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 1' + +############################### + +submission to slurm complete! + + +############################### +slurm submission output + +Submitted batch job 7642740 + + + +############################### + +############################### +start time: 2024-05-22 17:09:56.300014 +machine: sphinx2 +conda env: pretraining-coreset-selection +############################### +running following processes + + torchrun --master_port 29524 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/data/sciq --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/llms/pythia-70m_sciq_1 --output_hub_id pythia-70m_sciq --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 1 + + +############################### +command outputs: + + +[2024-05-22 17:09:58,221] torch.distributed.run: [WARNING] +[2024-05-22 17:09:58,221] torch.distributed.run: [WARNING] ***************************************** +[2024-05-22 17:09:58,221] torch.distributed.run: [WARNING] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +[2024-05-22 17:09:58,221] torch.distributed.run: [WARNING] ***************************************** +05/22/2024 17:10:15 - INFO - __main__ - Script parameters ScriptArguments(seed=1, dataset_id='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/data/sciq', output_dir='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/llms/pythia-70m_sciq_1', output_hub_id='pythia-70m_sciq', hf_hub_token=True, model_id='EleutherAI/pythia-70m', per_device_train_batch_size=256, num_train_epochs=1.0, learning_rate=0.001, gradient_accumulation_steps=2, from_scratch=True, warmup_ratio=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, weight_decay=0.01, lr_scheduler_type='cosine', local_rank=0, resume_from_checkpoint=False, deepspeed=None, peft=False) +Traceback (most recent call last): + File "/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_llm.py", line 204, in + train_model() + File "/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_llm.py", line 164, in train_model + train_dataset = load_from_disk(script_args.dataset_id) + File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/datasets/load.py", line 2638, in load_from_disk + raise FileNotFoundError( +FileNotFoundError: Directory /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/data/sciq is neither a `Dataset` directory nor a `DatasetDict` directory. +[2024-05-22 17:10:18,250] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 1904395 closing signal SIGTERM +[2024-05-22 17:10:18,314] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 1 (pid: 1904396) of binary: /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/bin/python +Traceback (most recent call last): + File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/bin/torchrun", line 8, in + sys.exit(main()) + File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper + return f(*args, **kwargs) + File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/torch/distributed/run.py", line 812, in main + run(args) + File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/torch/distributed/run.py", line 803, in run + elastic_launch( + File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 135, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 268, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +train_llm.py FAILED +------------------------------------------------------------ +Failures: + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2024-05-22_17:10:18 + host : sphinx2.stanford.edu + rank : 1 (local_rank: 1) + exitcode : 1 (pid: 1904396) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +============================================================ +############################### +end time: 2024-05-22 17:10:26.333875 +elapsed time: 0:00:30.033861 +slurm submission log: 2024-05-22 17:23:51.305056 +created following sbatch script: + +############################### + +#!/bin/bash + +#SBATCH --account=nlp +#SBATCH --cpus-per-task=16 +#SBATCH --dependency=afterok:7642779 +#SBATCH --gres=gpu:2 +#SBATCH --job-name=tthrush-job-3697265 +#SBATCH --mem=100G +#SBATCH --nodelist=sphinx2 +#SBATCH --open-mode=append +#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/llms/pythia-70m_sciq_1/train_job_output.txt +#SBATCH --partition=sphinx +#SBATCH --time=14-0 + +# activate your desired anaconda environment +. /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection + +# cd to working directory +cd . + +# launch commands +srun --unbuffered run_as_child_processes 'torchrun --master_port 29524 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/data/sciq --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/llms/pythia-70m_sciq_1 --output_hub_id pythia-70m_sciq --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 1' + +############################### + +submission to slurm complete! + + +############################### +slurm submission output + +Submitted batch job 7642780 + + + +############################### + +/var/lib/slurm/slurmd/job7642780/slurm_script: line 16: /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh: No such file or directory + +CommandNotFoundError: Your shell has not been properly configured to use 'conda activate'. +To initialize your shell, run + + $ conda init + +Currently supported shells are: + - bash + - fish + - tcsh + - xonsh + - zsh + - powershell + +See 'conda init --help' for more information and options. + +IMPORTANT: You may need to close and restart your shell after running 'conda init'. + + +############################### +start time: 2024-05-22 17:25:17.800550 +machine: sphinx2 +conda env: pretraining-coreset-selection +############################### +running following processes + + torchrun --master_port 29524 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/data/sciq --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/llms/pythia-70m_sciq_1 --output_hub_id pythia-70m_sciq --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 1 + + +############################### +command outputs: + + +[2024-05-22 17:25:19,448] torch.distributed.run: [WARNING] +[2024-05-22 17:25:19,448] torch.distributed.run: [WARNING] ***************************************** +[2024-05-22 17:25:19,448] torch.distributed.run: [WARNING] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +[2024-05-22 17:25:19,448] torch.distributed.run: [WARNING] ***************************************** +05/22/2024 17:25:24 - INFO - __main__ - Script parameters ScriptArguments(seed=1, dataset_id='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/data/sciq', output_dir='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/llms/pythia-70m_sciq_1', output_hub_id='pythia-70m_sciq', hf_hub_token=True, model_id='EleutherAI/pythia-70m', per_device_train_batch_size=256, num_train_epochs=1.0, learning_rate=0.001, gradient_accumulation_steps=2, from_scratch=True, warmup_ratio=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, weight_decay=0.01, lr_scheduler_type='cosine', local_rank=0, resume_from_checkpoint=False, deepspeed=None, peft=False) +Traceback (most recent call last): + File "/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_llm.py", line 204, in + train_model() + File "/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_llm.py", line 164, in train_model + train_dataset = load_from_disk(script_args.dataset_id) + File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/datasets/load.py", line 2638, in load_from_disk + raise FileNotFoundError( +FileNotFoundError: Directory /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/data/sciq is neither a `Dataset` directory nor a `DatasetDict` directory. +05/22/2024 17:25:29 - INFO - __main__ - Script parameters ScriptArguments(seed=1, dataset_id='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/data/sciq', output_dir='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/llms/pythia-70m_sciq_1', output_hub_id='pythia-70m_sciq', hf_hub_token=True, model_id='EleutherAI/pythia-70m', per_device_train_batch_size=256, num_train_epochs=1.0, learning_rate=0.001, gradient_accumulation_steps=2, from_scratch=True, warmup_ratio=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, weight_decay=0.01, lr_scheduler_type='cosine', local_rank=0, resume_from_checkpoint=False, deepspeed=None, peft=False) +[2024-05-22 17:25:29,465] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 1908605 closing signal SIGTERM +[2024-05-22 17:25:29,879] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 1908604) of binary: /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/bin/python +Traceback (most recent call last): + File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/bin/torchrun", line 8, in + sys.exit(main()) + File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper + return f(*args, **kwargs) + File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/torch/distributed/run.py", line 812, in main + run(args) + File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/torch/distributed/run.py", line 803, in run + elastic_launch( + File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 135, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 268, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +train_llm.py FAILED +------------------------------------------------------------ +Failures: + +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2024-05-22_17:25:29 + host : sphinx2.stanford.edu + rank : 0 (local_rank: 0) + exitcode : 1 (pid: 1908604) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +============================================================ +############################### +end time: 2024-05-22 17:25:37.829790 +elapsed time: 0:00:20.029240 +slurm submission log: 2024-05-22 17:29:15.690545 +created following sbatch script: + +############################### + +#!/bin/bash + +#SBATCH --account=nlp +#SBATCH --cpus-per-task=16 +#SBATCH --dependency=afterok:7642804 +#SBATCH --gres=gpu:2 +#SBATCH --job-name=tthrush-job-957751 +#SBATCH --mem=100G +#SBATCH --nodelist=sphinx2 +#SBATCH --open-mode=append +#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/llms/pythia-70m_sciq_1/train_job_output.txt +#SBATCH --partition=sphinx +#SBATCH --time=14-0 + +# activate your desired anaconda environment +. /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection + +# cd to working directory +cd . + +# launch commands +srun --unbuffered run_as_child_processes 'torchrun --master_port 29524 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/data/sciq --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/llms/pythia-70m_sciq_1 --output_hub_id pythia-70m_sciq --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 1' + +############################### + +submission to slurm complete! + + +############################### +slurm submission output + +Submitted batch job 7642805 + + + +############################### + +/var/lib/slurm/slurmd/job7642805/slurm_script: line 16: /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh: No such file or directory + +CommandNotFoundError: Your shell has not been properly configured to use 'conda activate'. +To initialize your shell, run + + $ conda init + +Currently supported shells are: + - bash + - fish + - tcsh + - xonsh + - zsh + - powershell + +See 'conda init --help' for more information and options. + +IMPORTANT: You may need to close and restart your shell after running 'conda init'. + + +############################### +start time: 2024-05-22 17:31:31.464665 +machine: sphinx2 +conda env: pretraining-coreset-selection +############################### +running following processes + + torchrun --master_port 29524 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/data/sciq --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/llms/pythia-70m_sciq_1 --output_hub_id pythia-70m_sciq --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 1 + + +############################### +command outputs: + + +[2024-05-22 17:31:33,359] torch.distributed.run: [WARNING] +[2024-05-22 17:31:33,359] torch.distributed.run: [WARNING] ***************************************** +[2024-05-22 17:31:33,359] torch.distributed.run: [WARNING] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +[2024-05-22 17:31:33,359] torch.distributed.run: [WARNING] ***************************************** +05/22/2024 17:31:38 - INFO - __main__ - Script parameters ScriptArguments(seed=1, dataset_id='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/data/sciq', output_dir='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/llms/pythia-70m_sciq_1', output_hub_id='pythia-70m_sciq', hf_hub_token=True, model_id='EleutherAI/pythia-70m', per_device_train_batch_size=256, num_train_epochs=1.0, learning_rate=0.001, gradient_accumulation_steps=2, from_scratch=True, warmup_ratio=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, weight_decay=0.01, lr_scheduler_type='cosine', local_rank=0, resume_from_checkpoint=False, deepspeed=None, peft=False) +05/22/2024 17:31:38 - INFO - __main__ - Script parameters ScriptArguments(seed=1, dataset_id='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/data/sciq', output_dir='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/llms/pythia-70m_sciq_1', output_hub_id='pythia-70m_sciq', hf_hub_token=True, model_id='EleutherAI/pythia-70m', per_device_train_batch_size=256, num_train_epochs=1.0, learning_rate=0.001, gradient_accumulation_steps=2, from_scratch=True, warmup_ratio=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, weight_decay=0.01, lr_scheduler_type='cosine', local_rank=0, resume_from_checkpoint=False, deepspeed=None, peft=False) +Traceback (most recent call last): + File "/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_llm.py", line 204, in + train_model() + File "/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_llm.py", line 164, in train_model + train_dataset = load_from_disk(script_args.dataset_id) + File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/datasets/load.py", line 2638, in load_from_disk + raise FileNotFoundError( +FileNotFoundError: Directory /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/data/sciq is neither a `Dataset` directory nor a `DatasetDict` directory. +Traceback (most recent call last): + File "/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_llm.py", line 204, in + train_model() + File "/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_llm.py", line 164, in train_model + train_dataset = load_from_disk(script_args.dataset_id) + File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/datasets/load.py", line 2638, in load_from_disk + raise FileNotFoundError( +FileNotFoundError: Directory /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/data/sciq is neither a `Dataset` directory nor a `DatasetDict` directory. +[2024-05-22 17:31:43,376] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 1910118) of binary: /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/bin/python +Traceback (most recent call last): + File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/bin/torchrun", line 8, in + sys.exit(main()) + File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper + return f(*args, **kwargs) + File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/torch/distributed/run.py", line 812, in main + run(args) + File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/torch/distributed/run.py", line 803, in run + elastic_launch( + File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 135, in __call__ + return launch_agent(self._config, self._entrypoint, list(args)) + File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 268, in launch_agent + raise ChildFailedError( +torch.distributed.elastic.multiprocessing.errors.ChildFailedError: +============================================================ +train_llm.py FAILED +------------------------------------------------------------ +Failures: +[1]: + time : 2024-05-22_17:31:43 + host : sphinx2.stanford.edu + rank : 1 (local_rank: 1) + exitcode : 1 (pid: 1910119) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +------------------------------------------------------------ +Root Cause (first observed failure): +[0]: + time : 2024-05-22_17:31:43 + host : sphinx2.stanford.edu + rank : 0 (local_rank: 0) + exitcode : 1 (pid: 1910118) + error_file: + traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html +============================================================ +############################### +end time: 2024-05-22 17:31:51.493076 +elapsed time: 0:00:20.028411 +slurm submission log: 2024-05-22 17:41:39.178541 +created following sbatch script: + +############################### + +#!/bin/bash + +#SBATCH --account=nlp +#SBATCH --cpus-per-task=16 +#SBATCH --dependency=afterok:7642833 +#SBATCH --gres=gpu:2 +#SBATCH --job-name=tthrush-job-300489 +#SBATCH --mem=100G +#SBATCH --nodelist=sphinx2 +#SBATCH --open-mode=append +#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/llms/pythia-70m_sciq_1/train_job_output.txt +#SBATCH --partition=sphinx +#SBATCH --time=14-0 + +# activate your desired anaconda environment +. /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection + +# cd to working directory +cd . + +# launch commands +srun --unbuffered run_as_child_processes 'torchrun --master_port 29524 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/data/sciq --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/llms/pythia-70m_sciq_1 --output_hub_id pythia-70m_sciq --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 1' + +############################### + +submission to slurm complete! + + +############################### +slurm submission output + +Submitted batch job 7642834 + + + +############################### + +slurm submission log: 2024-05-22 19:52:22.772491 +created following sbatch script: + +############################### + +#!/bin/bash + +#SBATCH --account=nlp +#SBATCH --cpus-per-task=16 +#SBATCH --dependency=afterok:7643056 +#SBATCH --gres=gpu:2 +#SBATCH --job-name=tthrush-job-3981807 +#SBATCH --mem=100G +#SBATCH --nodelist=sphinx2 +#SBATCH --open-mode=append +#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/llms/pythia-70m_sciq_1/train_job_output.txt #SBATCH --partition=sphinx #SBATCH --time=14-0 @@ -24,7 +430,7 @@ created following sbatch script: cd . # launch commands -srun --unbuffered run_as_child_processes 'torchrun --master_port 29524 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_clipped_scaled/data/sciq --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_clipped_scaled/llms/pythia-70m_sciq_1 --output_hub_id pythia-70m_sciq --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 1' +srun --unbuffered run_as_child_processes 'torchrun --master_port 29524 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/data/sciq --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/llms/pythia-70m_sciq_1 --output_hub_id pythia-70m_sciq --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 1' ############################### @@ -34,13 +440,13 @@ submission to slurm complete! ############################### slurm submission output -Submitted batch job 7637742 +Submitted batch job 7643057 ############################### -/var/lib/slurm/slurmd/job7637742/slurm_script: line 16: /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh: No such file or directory +/var/lib/slurm/slurmd/job7643057/slurm_script: line 16: /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh: No such file or directory CommandNotFoundError: Your shell has not been properly configured to use 'conda activate'. To initialize your shell, run @@ -61,455 +467,454 @@ IMPORTANT: You may need to close and restart your shell after running 'conda ini ############################### -start time: 2024-05-21 07:32:04.674137 +start time: 2024-05-23 04:03:11.850965 machine: sphinx2 conda env: pretraining-coreset-selection ############################### running following processes - torchrun --master_port 29524 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_clipped_scaled/data/sciq --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_clipped_scaled/llms/pythia-70m_sciq_1 --output_hub_id pythia-70m_sciq --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 1 + torchrun --master_port 29524 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/data/sciq --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained/llms/pythia-70m_sciq_1 --output_hub_id pythia-70m_sciq --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 1 ############################### command outputs: -[2024-05-21 07:32:08,656] torch.distributed.run: [WARNING] -[2024-05-21 07:32:08,656] torch.distributed.run: [WARNING] ***************************************** -[2024-05-21 07:32:08,656] torch.distributed.run: [WARNING] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -[2024-05-21 07:32:08,656] torch.distributed.run: [WARNING] ***************************************** -05/21/2024 07:32:18 - INFO - __main__ - Script parameters ScriptArguments(seed=1, dataset_id='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_clipped_scaled/data/sciq', output_dir='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_clipped_scaled/llms/pythia-70m_sciq_1', output_hub_id='pythia-70m_sciq', hf_hub_token=True, model_id='EleutherAI/pythia-70m', per_device_train_batch_size=256, num_train_epochs=1.0, learning_rate=0.001, gradient_accumulation_steps=2, from_scratch=True, warmup_ratio=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, weight_decay=0.01, lr_scheduler_type='cosine', local_rank=0, resume_from_checkpoint=False, deepspeed=None, peft=False) -05/21/2024 07:32:18 - INFO - __main__ - Script parameters ScriptArguments(seed=1, dataset_id='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_clipped_scaled/data/sciq', output_dir='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_clipped_scaled/llms/pythia-70m_sciq_1', output_hub_id='pythia-70m_sciq', hf_hub_token=True, model_id='EleutherAI/pythia-70m', per_device_train_batch_size=256, num_train_epochs=1.0, learning_rate=0.001, gradient_accumulation_steps=2, from_scratch=True, warmup_ratio=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, weight_decay=0.01, lr_scheduler_type='cosine', local_rank=0, resume_from_checkpoint=False, deepspeed=None, peft=False) - 0%| | 0/10719 [00:00