diff --git "a/train_job_output.txt" "b/train_job_output.txt" --- "a/train_job_output.txt" +++ "b/train_job_output.txt" @@ -1,4 +1,4 @@ -slurm submission log: 2024-05-08 15:15:19.565748 +slurm submission log: 2024-05-19 09:14:41.566584 created following sbatch script: ############################### @@ -7,24 +7,23 @@ created following sbatch script: #SBATCH --account=nlp #SBATCH --cpus-per-task=16 -#SBATCH --dependency=afterok:7590695 -#SBATCH --gres=gpu:2 -#SBATCH --job-name=tthrush-job-1731429 -#SBATCH --mem=400G +#SBATCH --gres=gpu:1 +#SBATCH --job-name=tthrush-job-2782455 +#SBATCH --mem=100G #SBATCH --nodelist=sphinx2 #SBATCH --open-mode=append -#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms/pythia-14m_piqa/train_job_output.txt +#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/14m_llm_seeds_more_data/pythia-14m_piqa_1/train_job_output.txt #SBATCH --partition=sphinx #SBATCH --time=14-0 # activate your desired anaconda environment -. /nlp/scr/tthrush/miniconda3/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection +. /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection # cd to working directory cd . # launch commands -srun --unbuffered run_as_child_processes 'torchrun --master_port 29506 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/training_data/piqa --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms/pythia-14m_piqa --output_hub_id pythia-14m_piqa --model_id /pythia-14m --num_train_epochs 1 --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2' +srun --unbuffered run_as_child_processes 'torchrun --master_port 29517 --nproc_per_node=1 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_data_more_data/piqa --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/14m_llm_seeds_more_data/pythia-14m_piqa_1 --output_hub_id pythia-14m_piqa --model_id EleutherAI/pythia-14m --learning_rate 6e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 1 --per_device_train_batch_size 1024 --seed 1 --num_train_epochs 0.2' ############################### @@ -34,122 +33,136 @@ submission to slurm complete! ############################### slurm submission output -Submitted batch job 7590696 +Submitted batch job 7631089 ############################### +slurm submission log: 2024-05-19 09:16:20.758126 +created following sbatch script: + ############################### -start time: 2024-05-08 16:43:55.476578 -machine: sphinx2 -conda env: pretraining-coreset-selection + +#!/bin/bash + +#SBATCH --account=nlp +#SBATCH --cpus-per-task=16 +#SBATCH --gres=gpu:1 +#SBATCH --job-name=tthrush-job-914626 +#SBATCH --mem=100G +#SBATCH --nodelist=sphinx2 +#SBATCH --open-mode=append +#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/14m_llm_seeds_more_data/pythia-14m_piqa_1/train_job_output.txt +#SBATCH --partition=sphinx +#SBATCH --time=14-0 + +# activate your desired anaconda environment +. /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection + +# cd to working directory +cd . + +# launch commands +srun --unbuffered run_as_child_processes 'torchrun --master_port 29517 --nproc_per_node=1 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_data_more_data/piqa --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/14m_llm_seeds_more_data/pythia-14m_piqa_1 --output_hub_id pythia-14m_piqa --model_id EleutherAI/pythia-14m --learning_rate 6e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 1 --per_device_train_batch_size 1024 --seed 1 --num_train_epochs 0.2' + ############################### -running following processes - torchrun --master_port 29506 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/training_data/piqa --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms/pythia-14m_piqa --output_hub_id pythia-14m_piqa --model_id /pythia-14m --num_train_epochs 1 --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 +submission to slurm complete! ############################### -command outputs: +slurm submission output + +Submitted batch job 7631150 + + + +############################### + +slurm submission log: 2024-05-19 09:25:08.150536 +created following sbatch script: + +############################### + +#!/bin/bash + +#SBATCH --account=nlp +#SBATCH --cpus-per-task=16 +#SBATCH --gres=gpu:1 +#SBATCH --job-name=tthrush-job-758301 +#SBATCH --mem=100G +#SBATCH --nodelist=sphinx2 +#SBATCH --open-mode=append +#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/14m_llm_seeds_more_data/pythia-14m_piqa_1/train_job_output.txt +#SBATCH --partition=sphinx +#SBATCH --time=14-0 + +# activate your desired anaconda environment +. /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection + +# cd to working directory +cd . + +# launch commands +srun --unbuffered run_as_child_processes 'torchrun --master_port 29517 --nproc_per_node=1 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_data_more_data/piqa --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/14m_llm_seeds_more_data/pythia-14m_piqa_1 --output_hub_id pythia-14m_piqa --model_id EleutherAI/pythia-14m --learning_rate 6e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 1 --per_device_train_batch_size 768 --seed 1 --num_train_epochs 0.2' + +############################### + +submission to slurm complete! + + +############################### +slurm submission output + +Submitted batch job 7631221 + + + +############################### + +slurm submission log: 2024-05-19 09:27:21.164411 +created following sbatch script: + +############################### +#!/bin/bash + +#SBATCH --account=nlp +#SBATCH --cpus-per-task=16 +#SBATCH --gres=gpu:1 +#SBATCH --job-name=tthrush-job-4168844 +#SBATCH --mem=100G +#SBATCH --nodelist=sphinx2 +#SBATCH --open-mode=append +#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/14m_llm_seeds_more_data/pythia-14m_piqa_1/train_job_output.txt +#SBATCH --partition=sphinx +#SBATCH --time=14-0 + +# activate your desired anaconda environment +. /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection + +# cd to working directory +cd . + +# launch commands +srun --unbuffered run_as_child_processes 'torchrun --master_port 29517 --nproc_per_node=1 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_data_more_data/piqa --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/14m_llm_seeds_more_data/pythia-14m_piqa_1 --output_hub_id pythia-14m_piqa --model_id EleutherAI/pythia-14m --learning_rate 6e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 1 --per_device_train_batch_size 640 --seed 1 --num_train_epochs 0.2' + +############################### -[2024-05-08 16:43:59,360] torch.distributed.run: [WARNING] -[2024-05-08 16:43:59,360] torch.distributed.run: [WARNING] ***************************************** -[2024-05-08 16:43:59,360] torch.distributed.run: [WARNING] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -[2024-05-08 16:43:59,360] torch.distributed.run: [WARNING] ***************************************** -05/08/2024 16:44:05 - INFO - __main__ - Script parameters ScriptArguments(dataset_id='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/training_data/piqa', output_dir='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms/pythia-14m_piqa', output_hub_id='pythia-14m_piqa', hf_hub_token=True, model_id='/pythia-14m', per_device_train_batch_size=256, num_train_epochs=1, learning_rate=0.001, gradient_accumulation_steps=2, from_scratch=True, warmup_ratio=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, weight_decay=0.01, lr_scheduler_type='cosine', local_rank=0, resume_from_checkpoint=False, deepspeed=None, peft=False) -05/08/2024 16:44:05 - INFO - __main__ - Script parameters ScriptArguments(dataset_id='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/training_data/piqa', output_dir='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms/pythia-14m_piqa', output_hub_id='pythia-14m_piqa', hf_hub_token=True, model_id='/pythia-14m', per_device_train_batch_size=256, num_train_epochs=1, learning_rate=0.001, gradient_accumulation_steps=2, from_scratch=True, warmup_ratio=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, weight_decay=0.01, lr_scheduler_type='cosine', local_rank=0, resume_from_checkpoint=False, deepspeed=None, peft=False) -Traceback (most recent call last): - File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/transformers/utils/hub.py", line 398, in cached_file - resolved_file = hf_hub_download( - File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py", line 110, in _inner_fn - validate_repo_id(arg_value) - File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py", line 164, in validate_repo_id - raise HFValidationError( -huggingface_hub.utils._validators.HFValidationError: Repo id must use alphanumeric chars or '-', '_', '.', '--' and '..' are forbidden, '-' and '.' cannot start or end the name, max length is 96: '/pythia-14m'. - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_llm.py", line 202, in - train_model() - File "/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_llm.py", line 166, in train_model - config = AutoConfig.from_pretrained(script_args.model_id) - File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/transformers/models/auto/configuration_auto.py", line 1138, in from_pretrained - config_dict, unused_kwargs = PretrainedConfig.get_config_dict(pretrained_model_name_or_path, **kwargs) - File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/transformers/configuration_utils.py", line 631, in get_config_dict - config_dict, kwargs = cls._get_config_dict(pretrained_model_name_or_path, **kwargs) - File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/transformers/configuration_utils.py", line 686, in _get_config_dict - resolved_config_file = cached_file( - File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/transformers/utils/hub.py", line 462, in cached_file -Traceback (most recent call last): - File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/transformers/utils/hub.py", line 398, in cached_file -raise EnvironmentError( -OSError: Incorrect path_or_model_id: '/pythia-14m'. Please provide either the path to a local folder or the repo_id of a model on the Hub. - resolved_file = hf_hub_download( - File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py", line 110, in _inner_fn - validate_repo_id(arg_value) - File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py", line 164, in validate_repo_id - raise HFValidationError( -huggingface_hub.utils._validators.HFValidationError: Repo id must use alphanumeric chars or '-', '_', '.', '--' and '..' are forbidden, '-' and '.' cannot start or end the name, max length is 96: '/pythia-14m'. - -The above exception was the direct cause of the following exception: - -Traceback (most recent call last): - File "/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_llm.py", line 202, in - train_model() - File "/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_llm.py", line 166, in train_model - config = AutoConfig.from_pretrained(script_args.model_id) - File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/transformers/models/auto/configuration_auto.py", line 1138, in from_pretrained - config_dict, unused_kwargs = PretrainedConfig.get_config_dict(pretrained_model_name_or_path, **kwargs) - File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/transformers/configuration_utils.py", line 631, in get_config_dict - config_dict, kwargs = cls._get_config_dict(pretrained_model_name_or_path, **kwargs) - File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/transformers/configuration_utils.py", line 686, in _get_config_dict - resolved_config_file = cached_file( - File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/transformers/utils/hub.py", line 462, in cached_file - raise EnvironmentError( -OSError: Incorrect path_or_model_id: '/pythia-14m'. Please provide either the path to a local folder or the repo_id of a model on the Hub. -[2024-05-08 16:45:49,480] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 2312304) of binary: /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/bin/python -Traceback (most recent call last): - File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/bin/torchrun", line 8, in - sys.exit(main()) - File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper - return f(*args, **kwargs) - File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/torch/distributed/run.py", line 812, in main - run(args) - File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/torch/distributed/run.py", line 803, in run - elastic_launch( - File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 135, in __call__ - return launch_agent(self._config, self._entrypoint, list(args)) - File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 268, in launch_agent - raise ChildFailedError( -torch.distributed.elastic.multiprocessing.errors.ChildFailedError: -============================================================ -train_llm.py FAILED ------------------------------------------------------------- -Failures: -[1]: - time : 2024-05-08_16:45:49 - host : sphinx2.stanford.edu - rank : 1 (local_rank: 1) - exitcode : 1 (pid: 2312305) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ------------------------------------------------------------- -Root Cause (first observed failure): -[0]: - time : 2024-05-08_16:45:49 - host : sphinx2.stanford.edu - rank : 0 (local_rank: 0) - exitcode : 1 (pid: 2312304) - error_file: - traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html -============================================================ -############################### -end time: 2024-05-08 16:45:55.593886 -elapsed time: 0:02:00.117308 -slurm submission log: 2024-05-09 07:34:40.871568 +submission to slurm complete! + + +############################### +slurm submission output + +Submitted batch job 7631284 + + + +############################### + +slurm submission log: 2024-05-19 09:28:17.764747 created following sbatch script: ############################### @@ -158,23 +171,23 @@ created following sbatch script: #SBATCH --account=nlp #SBATCH --cpus-per-task=16 -#SBATCH --gres=gpu:2 -#SBATCH --job-name=tthrush-job-4443020 -#SBATCH --mem=400G +#SBATCH --gres=gpu:1 +#SBATCH --job-name=tthrush-job-3776623 +#SBATCH --mem=100G #SBATCH --nodelist=sphinx2 #SBATCH --open-mode=append -#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms/pythia-14m_piqa/train_job_output.txt +#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/14m_llm_seeds_more_data/pythia-14m_piqa_1/train_job_output.txt #SBATCH --partition=sphinx #SBATCH --time=14-0 # activate your desired anaconda environment -. /nlp/scr/tthrush/miniconda3/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection +. /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection # cd to working directory cd . # launch commands -srun --unbuffered run_as_child_processes 'torchrun --master_port 29506 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/training_data/piqa --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms/pythia-14m_piqa --output_hub_id pythia-14m_piqa --model_id EleutherAI/pythia-14m --num_train_epochs 1 --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2' +srun --unbuffered run_as_child_processes 'torchrun --master_port 29517 --nproc_per_node=1 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_data_more_data/piqa --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/14m_llm_seeds_more_data/pythia-14m_piqa_1 --output_hub_id pythia-14m_piqa --model_id EleutherAI/pythia-14m --learning_rate 6e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 1 --per_device_train_batch_size 512 --seed 1 --num_train_epochs 0.2' ############################### @@ -184,461 +197,525 @@ submission to slurm complete! ############################### slurm submission output -Submitted batch job 7591656 +Submitted batch job 7631348 + + +############################### +slurm submission log: 2024-05-19 09:29:20.389708 +created following sbatch script: ############################### +#!/bin/bash + +#SBATCH --account=nlp +#SBATCH --cpus-per-task=16 +#SBATCH --gres=gpu:1 +#SBATCH --job-name=tthrush-job-4689387 +#SBATCH --mem=100G +#SBATCH --nodelist=sphinx2 +#SBATCH --open-mode=append +#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/14m_llm_seeds_more_data/pythia-14m_piqa_1/train_job_output.txt +#SBATCH --partition=sphinx +#SBATCH --time=14-0 + +# activate your desired anaconda environment +. /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection + +# cd to working directory +cd . + +# launch commands +srun --unbuffered run_as_child_processes 'torchrun --master_port 29517 --nproc_per_node=1 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_data_more_data/piqa --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/14m_llm_seeds_more_data/pythia-14m_piqa_1 --output_hub_id pythia-14m_piqa --model_id EleutherAI/pythia-14m --learning_rate 6e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 1 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 0.2' + +############################### + +submission to slurm complete! + + +############################### +slurm submission output + +Submitted batch job 7631409 + + + +############################### + +slurm submission log: 2024-05-19 09:30:33.987595 +created following sbatch script: + +############################### + +#!/bin/bash + +#SBATCH --account=nlp +#SBATCH --cpus-per-task=16 +#SBATCH --gres=gpu:1 +#SBATCH --job-name=tthrush-job-1409388 +#SBATCH --mem=100G +#SBATCH --nodelist=sphinx2 +#SBATCH --open-mode=append +#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/14m_llm_seeds_more_data/pythia-14m_piqa_1/train_job_output.txt +#SBATCH --partition=sphinx +#SBATCH --time=14-0 + +# activate your desired anaconda environment +. /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection + +# cd to working directory +cd . + +# launch commands +srun --unbuffered run_as_child_processes 'torchrun --master_port 29517 --nproc_per_node=1 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_data_more_data/piqa --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/14m_llm_seeds_more_data/pythia-14m_piqa_1 --output_hub_id pythia-14m_piqa --model_id EleutherAI/pythia-14m --learning_rate 6e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 1 --per_device_train_batch_size 340 --seed 1 --num_train_epochs 0.2' + +############################### + +submission to slurm complete! + + +############################### +slurm submission output + +Submitted batch job 7631469 + + + +############################### + +slurm submission log: 2024-05-19 09:31:39.572545 +created following sbatch script: + +############################### + +#!/bin/bash + +#SBATCH --account=nlp +#SBATCH --cpus-per-task=16 +#SBATCH --gres=gpu:1 +#SBATCH --job-name=tthrush-job-1319359 +#SBATCH --mem=100G +#SBATCH --nodelist=sphinx2 +#SBATCH --open-mode=append +#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/14m_llm_seeds_more_data/pythia-14m_piqa_1/train_job_output.txt +#SBATCH --partition=sphinx +#SBATCH --time=14-0 + +# activate your desired anaconda environment +. /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection + +# cd to working directory +cd . + +# launch commands +srun --unbuffered run_as_child_processes 'torchrun --master_port 29517 --nproc_per_node=1 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_data_more_data/piqa --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/14m_llm_seeds_more_data/pythia-14m_piqa_1 --output_hub_id pythia-14m_piqa --model_id EleutherAI/pythia-14m --learning_rate 6e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 1 --per_device_train_batch_size 320 --seed 1 --num_train_epochs 0.2' + +############################### + +submission to slurm complete! + + +############################### +slurm submission output + +Submitted batch job 7631529 + + + +############################### + +slurm submission log: 2024-05-19 09:34:33.193118 +created following sbatch script: + +############################### + +#!/bin/bash + +#SBATCH --account=nlp +#SBATCH --cpus-per-task=16 +#SBATCH --gres=gpu:1 +#SBATCH --job-name=tthrush-job-742694 +#SBATCH --mem=100G +#SBATCH --nodelist=sphinx2 +#SBATCH --open-mode=append +#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/14m_llm_seeds_more_data/pythia-14m_piqa_1/train_job_output.txt +#SBATCH --partition=sphinx +#SBATCH --time=14-0 + +# activate your desired anaconda environment +. /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection + +# cd to working directory +cd . + +# launch commands +srun --unbuffered run_as_child_processes 'torchrun --master_port 29517 --nproc_per_node=1 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_data_more_data/piqa --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/14m_llm_seeds_more_data/pythia-14m_piqa_1 --output_hub_id pythia-14m_piqa --model_id EleutherAI/pythia-14m --learning_rate 6e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 1 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 0.2' + +############################### + +submission to slurm complete! + + +############################### +slurm submission output + +Submitted batch job 7631593 + + + +############################### + +slurm submission log: 2024-05-19 09:45:20.459710 +created following sbatch script: + +############################### + +#!/bin/bash + +#SBATCH --account=nlp +#SBATCH --cpus-per-task=16 +#SBATCH --gres=gpu:1 +#SBATCH --job-name=tthrush-job-3186286 +#SBATCH --mem=100G +#SBATCH --nodelist=sphinx2 +#SBATCH --open-mode=append +#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/14m_llm_seeds_more_data/pythia-14m_piqa_1/train_job_output.txt +#SBATCH --partition=sphinx +#SBATCH --time=14-0 + +# activate your desired anaconda environment +. /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection + +# cd to working directory +cd . + +# launch commands +srun --unbuffered run_as_child_processes 'torchrun --master_port 29517 --nproc_per_node=1 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_data_more_data/piqa --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/14m_llm_seeds_more_data/pythia-14m_piqa_1 --output_hub_id pythia-14m_piqa --model_id EleutherAI/pythia-14m --learning_rate 6e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 1 --per_device_train_batch_size 320 --seed 1 --num_train_epochs 0.2' + +############################### + +submission to slurm complete! + + +############################### +slurm submission output + +Submitted batch job 7631663 + + + +############################### + +/var/lib/slurm/slurmd/job7631663/slurm_script: line 15: /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh: No such file or directory + +CommandNotFoundError: Your shell has not been properly configured to use 'conda activate'. +To initialize your shell, run + + $ conda init + +Currently supported shells are: + - bash + - fish + - tcsh + - xonsh + - zsh + - powershell + +See 'conda init --help' for more information and options. + +IMPORTANT: You may need to close and restart your shell after running 'conda init'. + + ############################### -start time: 2024-05-09 13:07:36.318275 +start time: 2024-05-19 10:14:28.271980 machine: sphinx2 conda env: pretraining-coreset-selection ############################### running following processes - torchrun --master_port 29506 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/training_data/piqa --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms/pythia-14m_piqa --output_hub_id pythia-14m_piqa --model_id EleutherAI/pythia-14m --num_train_epochs 1 --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 + torchrun --master_port 29517 --nproc_per_node=1 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_data_more_data/piqa --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/14m_llm_seeds_more_data/pythia-14m_piqa_1 --output_hub_id pythia-14m_piqa --model_id EleutherAI/pythia-14m --learning_rate 6e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 1 --per_device_train_batch_size 320 --seed 1 --num_train_epochs 0.2 ############################### command outputs: -[2024-05-09 13:07:41,155] torch.distributed.run: [WARNING] -[2024-05-09 13:07:41,155] torch.distributed.run: [WARNING] ***************************************** -[2024-05-09 13:07:41,155] torch.distributed.run: [WARNING] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. -[2024-05-09 13:07:41,155] torch.distributed.run: [WARNING] ***************************************** -05/09/2024 13:07:50 - INFO - __main__ - Script parameters ScriptArguments(dataset_id='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/training_data/piqa', output_dir='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms/pythia-14m_piqa', output_hub_id='pythia-14m_piqa', hf_hub_token=True, model_id='EleutherAI/pythia-14m', per_device_train_batch_size=256, num_train_epochs=1, learning_rate=0.001, gradient_accumulation_steps=2, from_scratch=True, warmup_ratio=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, weight_decay=0.01, lr_scheduler_type='cosine', local_rank=0, resume_from_checkpoint=False, deepspeed=None, peft=False) -05/09/2024 13:07:50 - INFO - __main__ - Script parameters ScriptArguments(dataset_id='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/training_data/piqa', output_dir='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms/pythia-14m_piqa', output_hub_id='pythia-14m_piqa', hf_hub_token=True, model_id='EleutherAI/pythia-14m', per_device_train_batch_size=256, num_train_epochs=1, learning_rate=0.001, gradient_accumulation_steps=2, from_scratch=True, warmup_ratio=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, weight_decay=0.01, lr_scheduler_type='cosine', local_rank=0, resume_from_checkpoint=False, deepspeed=None, peft=False) - 0%| | 0/10682 [00:00