diff --git "a/train_job_output.txt" "b/train_job_output.txt" --- "a/train_job_output.txt" +++ "b/train_job_output.txt" @@ -1,4 +1,4 @@ -slurm submission log: 2024-05-19 16:02:34.779470 +slurm submission log: 2024-05-19 16:02:35.308376 created following sbatch script: ############################### @@ -9,11 +9,11 @@ created following sbatch script: #SBATCH --cpus-per-task=16 #SBATCH --dependency=afterok:7632874 #SBATCH --gres=gpu:1 -#SBATCH --job-name=tthrush-job-1789646 +#SBATCH --job-name=tthrush-job-780288 #SBATCH --mem=100G #SBATCH --nodelist=sphinx2 #SBATCH --open-mode=append -#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_more_data_test_normalized/pythia-14m_piqa_4/train_job_output.txt +#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_more_data_test_normalized/pythia-14m_piqa_5/train_job_output.txt #SBATCH --partition=sphinx #SBATCH --time=14-0 @@ -24,7 +24,7 @@ created following sbatch script: cd . # launch commands -srun --unbuffered run_as_child_processes 'torchrun --master_port 29520 --nproc_per_node=1 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_data_more_data_test_normalized/piqa --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_more_data_test_normalized/pythia-14m_piqa_4 --output_hub_id pythia-14m_piqa --model_id EleutherAI/pythia-14m --learning_rate 6e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 1 --per_device_train_batch_size 320 --seed 4 --num_train_epochs 0.2' +srun --unbuffered run_as_child_processes 'torchrun --master_port 29521 --nproc_per_node=1 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_data_more_data_test_normalized/piqa --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_more_data_test_normalized/pythia-14m_piqa_5 --output_hub_id pythia-14m_piqa --model_id EleutherAI/pythia-14m --learning_rate 6e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 1 --per_device_train_batch_size 320 --seed 5 --num_train_epochs 0.2' ############################### @@ -34,13 +34,13 @@ submission to slurm complete! ############################### slurm submission output -Submitted batch job 7632890 +Submitted batch job 7632893 ############################### -/var/lib/slurm/slurmd/job7632890/slurm_script: line 16: /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh: No such file or directory +/var/lib/slurm/slurmd/job7632893/slurm_script: line 16: /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh: No such file or directory CommandNotFoundError: Your shell has not been properly configured to use 'conda activate'. To initialize your shell, run @@ -61,293 +61,293 @@ IMPORTANT: You may need to close and restart your shell after running 'conda ini ############################### -start time: 2024-05-19 22:48:06.176918 +start time: 2024-05-19 23:05:29.816537 machine: sphinx2 conda env: pretraining-coreset-selection ############################### running following processes - torchrun --master_port 29520 --nproc_per_node=1 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_data_more_data_test_normalized/piqa --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_more_data_test_normalized/pythia-14m_piqa_4 --output_hub_id pythia-14m_piqa --model_id EleutherAI/pythia-14m --learning_rate 6e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 1 --per_device_train_batch_size 320 --seed 4 --num_train_epochs 0.2 + torchrun --master_port 29521 --nproc_per_node=1 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_data_more_data_test_normalized/piqa --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_more_data_test_normalized/pythia-14m_piqa_5 --output_hub_id pythia-14m_piqa --model_id EleutherAI/pythia-14m --learning_rate 6e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 1 --per_device_train_batch_size 320 --seed 5 --num_train_epochs 0.2 ############################### command outputs: -05/19/2024 22:48:42 - INFO - __main__ - Script parameters ScriptArguments(seed=4, dataset_id='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_data_more_data_test_normalized/piqa', output_dir='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_more_data_test_normalized/pythia-14m_piqa_4', output_hub_id='pythia-14m_piqa', hf_hub_token=True, model_id='EleutherAI/pythia-14m', per_device_train_batch_size=320, num_train_epochs=0.2, learning_rate=0.006, gradient_accumulation_steps=1, from_scratch=True, warmup_ratio=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, weight_decay=0.01, lr_scheduler_type='cosine', local_rank=0, resume_from_checkpoint=False, deepspeed=None, peft=False) +05/19/2024 23:05:39 - INFO - __main__ - Script parameters ScriptArguments(seed=5, dataset_id='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_data_more_data_test_normalized/piqa', output_dir='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_more_data_test_normalized/pythia-14m_piqa_5', output_hub_id='pythia-14m_piqa', hf_hub_token=True, model_id='EleutherAI/pythia-14m', per_device_train_batch_size=320, num_train_epochs=0.2, learning_rate=0.006, gradient_accumulation_steps=1, from_scratch=True, warmup_ratio=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, weight_decay=0.01, lr_scheduler_type='cosine', local_rank=0, resume_from_checkpoint=False, deepspeed=None, peft=False) 0%| | 0/6844 [00:00