diff --git "a/train_job_output.txt" "b/train_job_output.txt" --- "a/train_job_output.txt" +++ "b/train_job_output.txt" @@ -1,4 +1,4 @@ -slurm submission log: 2024-05-24 11:42:10.609607 +slurm submission log: 2024-05-24 23:54:02.551434 created following sbatch script: ############################### @@ -7,13 +7,13 @@ created following sbatch script: #SBATCH --account=nlp #SBATCH --cpus-per-task=16 -#SBATCH --dependency=afterok:7648448 +#SBATCH --dependency=afterok:7649440 #SBATCH --gres=gpu:2 -#SBATCH --job-name=tthrush-job-245568 +#SBATCH --job-name=tthrush-job-2884917 #SBATCH --mem=100G #SBATCH --nodelist=sphinx2 #SBATCH --open-mode=append -#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_initial_init_min_threshold/llms/pythia-70m_sciq_1/train_job_output.txt +#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_diff/llms/pythia-70m_sciq_1/train_job_output.txt #SBATCH --partition=sphinx #SBATCH --time=14-0 @@ -24,7 +24,7 @@ created following sbatch script: cd . # launch commands -srun --unbuffered run_as_child_processes 'torchrun --master_port 29524 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_initial_init_min_threshold/data/sciq --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_initial_init_min_threshold/llms/pythia-70m_sciq_1 --output_hub_id pythia-70m_sciq --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 14' +srun --unbuffered run_as_child_processes 'torchrun --master_port 29524 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_diff/data/sciq --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_diff/llms/pythia-70m_sciq_1 --output_hub_id pythia-70m_sciq --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 14' ############################### @@ -34,13 +34,13 @@ submission to slurm complete! ############################### slurm submission output -Submitted batch job 7648449 +Submitted batch job 7649441 ############################### -/var/lib/slurm/slurmd/job7648449/slurm_script: line 16: /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh: No such file or directory +/var/lib/slurm/slurmd/job7649441/slurm_script: line 16: /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh: No such file or directory CommandNotFoundError: Your shell has not been properly configured to use 'conda activate'. To initialize your shell, run @@ -61,606 +61,59 @@ IMPORTANT: You may need to close and restart your shell after running 'conda ini ############################### -start time: 2024-05-24 11:43:07.650666 +start time: 2024-05-25 04:55:04.852621 machine: sphinx2 conda env: pretraining-coreset-selection ############################### running following processes - torchrun --master_port 29524 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_initial_init_min_threshold/data/sciq --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_initial_init_min_threshold/llms/pythia-70m_sciq_1 --output_hub_id pythia-70m_sciq --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 14 + torchrun --master_port 29524 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_diff/data/sciq --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_big_diff/llms/pythia-70m_sciq_1 --output_hub_id pythia-70m_sciq --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 14 ############################### command outputs: -[2024-05-24 11:43:09,861] torch.distributed.run: [WARNING] -[2024-05-24 11:43:09,861] torch.distributed.run: [WARNING] ***************************************** -[2024-05-24 11:43:09,861] torch.distributed.run: [WARNING] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -[2024-05-24 11:43:09,861] torch.distributed.run: [WARNING] ***************************************** -05/24/2024 11:43:15 - INFO - __main__ - Script parameters ScriptArguments(seed=1, dataset_id='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_initial_init_min_threshold/data/sciq', output_dir='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_initial_init_min_threshold/llms/pythia-70m_sciq_1', output_hub_id='pythia-70m_sciq', hf_hub_token=True, model_id='EleutherAI/pythia-70m', per_device_train_batch_size=256, num_train_epochs=14.0, learning_rate=0.001, gradient_accumulation_steps=2, from_scratch=True, warmup_ratio=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, weight_decay=0.01, lr_scheduler_type='cosine', local_rank=0, resume_from_checkpoint=False, deepspeed=None, peft=False) -05/24/2024 11:43:15 - INFO - __main__ - Script parameters ScriptArguments(seed=1, dataset_id='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_initial_init_min_threshold/data/sciq', output_dir='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_initial_init_min_threshold/llms/pythia-70m_sciq_1', output_hub_id='pythia-70m_sciq', hf_hub_token=True, model_id='EleutherAI/pythia-70m', per_device_train_batch_size=256, num_train_epochs=14.0, learning_rate=0.001, gradient_accumulation_steps=2, from_scratch=True, warmup_ratio=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, weight_decay=0.01, lr_scheduler_type='cosine', local_rank=0, resume_from_checkpoint=False, deepspeed=None, peft=False) -Traceback (most recent call last): - File "/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_llm.py", line 204, in - train_model() - File "/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_llm.py", line 164, in train_model - train_dataset = load_from_disk(script_args.dataset_id) - File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/datasets/load.py", line 2638, in load_from_disk - raise FileNotFoundError( -FileNotFoundError: Directory /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_initial_init_min_threshold/data/sciq is neither a `Dataset` directory nor a `DatasetDict` directory. -Traceback (most recent call last): - File "/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_llm.py", line 204, in - train_model() - File "/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_llm.py", line 164, in train_model - train_dataset = load_from_disk(script_args.dataset_id) - File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/datasets/load.py", line 2638, in load_from_disk - raise FileNotFoundError( -FileNotFoundError: Directory /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_initial_init_min_threshold/data/sciq is neither a `Dataset` directory nor a `DatasetDict` directory. -[2024-05-24 11:43:19,880] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 2635363) of binary: /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/bin/python -Traceback (most recent call last): - File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/bin/torchrun", line 8, in - sys.exit(main()) - File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/torch/distributed/run.py", line 812, in main - run(args) - File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/torch/distributed/run.py", line 803, in run - elastic_launch( - File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 135, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 268, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -train_llm.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-05-24_11:43:19 - host : sphinx2.stanford.edu - rank : 1 (local_rank: 1) - exitcode : 1 (pid: 2635364) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-05-24_11:43:19 - host : sphinx2.stanford.edu - rank : 0 (local_rank: 0) - exitcode : 1 (pid: 2635363) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -############################### -end time: 2024-05-24 11:43:27.673614 -elapsed time: 0:00:20.022948 -slurm submission log: 2024-05-24 11:46:16.259550 -created following sbatch script: - -############################### - -#!/bin/bash - -#SBATCH --account=nlp -#SBATCH --cpus-per-task=16 -#SBATCH --dependency=afterok:7648480 -#SBATCH --gres=gpu:2 -#SBATCH --job-name=tthrush-job-4918505 -#SBATCH --mem=100G -#SBATCH --nodelist=sphinx2 -#SBATCH --open-mode=append -#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_initial_init_min_threshold/llms/pythia-70m_sciq_1/train_job_output.txt -#SBATCH --partition=sphinx -#SBATCH --time=14-0 - -# activate your desired anaconda environment -. /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection - -# cd to working directory -cd . - -# launch commands -srun --unbuffered run_as_child_processes 'torchrun --master_port 29524 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_initial_init_min_threshold/data/sciq --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_initial_init_min_threshold/llms/pythia-70m_sciq_1 --output_hub_id pythia-70m_sciq --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 14' - -############################### - -submission to slurm complete! - - -############################### -slurm submission output - -Submitted batch job 7648481 - - - -############################### - -/var/lib/slurm/slurmd/job7648481/slurm_script: line 16: /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh: No such file or directory - -CommandNotFoundError: Your shell has not been properly configured to use 'conda activate'. -To initialize your shell, run - - $ conda init - -Currently supported shells are: - - bash - - fish - - tcsh - - xonsh - - zsh - - powershell - -See 'conda init --help' for more information and options. - -IMPORTANT: You may need to close and restart your shell after running 'conda init'. - - -############################### -start time: 2024-05-24 13:43:42.025549 -machine: sphinx2 -conda env: pretraining-coreset-selection -############################### -running following processes - - torchrun --master_port 29524 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_initial_init_min_threshold/data/sciq --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_initial_init_min_threshold/llms/pythia-70m_sciq_1 --output_hub_id pythia-70m_sciq --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 14 - - -############################### -command outputs: - - -[2024-05-24 13:43:45,190] torch.distributed.run: [WARNING] -[2024-05-24 13:43:45,190] torch.distributed.run: [WARNING] ***************************************** -[2024-05-24 13:43:45,190] torch.distributed.run: [WARNING] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -[2024-05-24 13:43:45,190] torch.distributed.run: [WARNING] ***************************************** -05/24/2024 13:43:52 - INFO - __main__ - Script parameters ScriptArguments(seed=1, dataset_id='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_initial_init_min_threshold/data/sciq', output_dir='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_initial_init_min_threshold/llms/pythia-70m_sciq_1', output_hub_id='pythia-70m_sciq', hf_hub_token=True, model_id='EleutherAI/pythia-70m', per_device_train_batch_size=256, num_train_epochs=14.0, learning_rate=0.001, gradient_accumulation_steps=2, from_scratch=True, warmup_ratio=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, weight_decay=0.01, lr_scheduler_type='cosine', local_rank=0, resume_from_checkpoint=False, deepspeed=None, peft=False) -05/24/2024 13:43:52 - INFO - __main__ - Script parameters ScriptArguments(seed=1, dataset_id='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_initial_init_min_threshold/data/sciq', output_dir='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_initial_init_min_threshold/llms/pythia-70m_sciq_1', output_hub_id='pythia-70m_sciq', hf_hub_token=True, model_id='EleutherAI/pythia-70m', per_device_train_batch_size=256, num_train_epochs=14.0, learning_rate=0.001, gradient_accumulation_steps=2, from_scratch=True, warmup_ratio=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, weight_decay=0.01, lr_scheduler_type='cosine', local_rank=0, resume_from_checkpoint=False, deepspeed=None, peft=False) - 0%| | 0/11074 [00:00