slurm submission log: 2024-05-11 17:54:08.850609
created following sbatch script:

###############

#!/bin/bash

#SBATCH --account=nlp
#SBATCH --cpus-per-task=16
#SBATCH --dependency=afterok:
#SBATCH --gres=gpu:1
#SBATCH --job-name=tthrush-job-2902311
#SBATCH --mem=60G
#SBATCH --nodelist=sphinx2
#SBATCH --open-mode=append
#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_sciq/eval_job_output.txt
#SBATCH --partition=sphinx
#SBATCH --time=14-0

# activate your desired anaconda environment
. /nlp/scr/tthrush/miniconda3/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection

# cd to working directory
cd .

# launch commands
srun --unbuffered run_as_child_processes 'lm_eval --model hf --model_args pretrained=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_sciq,revision=main,dtype=float16,trust_remote_code=True --tasks xnli_en,xnli_fr,sciq,piqa,lambada,arc_easy --device cuda --output_path /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_sciq/perf'

###############

submission to slurm complete!

###############
slurm submission output

sbatch: error: Batch job submission failed: Job dependency problem

###############
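
note: the rejection above follows from the --dependency=afterok: directive, which names no prerequisite job ID, so sbatch cannot resolve the dependency and reports "Job dependency problem". A minimal sketch of the intended form (the placeholder below is illustrative; the real ID appears in the resubmission that follows):

#SBATCH --dependency=afterok:<prerequisite_job_id>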

slurm submission log: 2024-05-11 17:55:07.106159
created following sbatch script:

###############

#!/bin/bash

#SBATCH --account=nlp
#SBATCH --cpus-per-task=16
#SBATCH --dependency=afterok:7598873
#SBATCH --gres=gpu:1
#SBATCH --job-name=tthrush-job-552824
#SBATCH --mem=60G
#SBATCH --nodelist=sphinx2
#SBATCH --open-mode=append
#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_sciq/eval_job_output.txt
#SBATCH --partition=sphinx
#SBATCH --time=14-0

# activate your desired anaconda environment
. /nlp/scr/tthrush/miniconda3/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection

# cd to working directory
cd .

# launch commands
srun --unbuffered run_as_child_processes 'lm_eval --model hf --model_args pretrained=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_sciq,revision=main,dtype=float16,trust_remote_code=True --tasks xnli_en,xnli_fr,sciq,piqa,lambada,arc_easy --device cuda --output_path /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_sciq/perf'

###############

submission to slurm complete!

###############
slurm submission output

Submitted batch job 7598874

###############

###############
start time: 2024-05-11 17:58:20.674188
machine: sphinx2
conda env: pretraining-coreset-selection
###############
running following processes

lm_eval --model hf --model_args pretrained=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_sciq,revision=main,dtype=float16,trust_remote_code=True --tasks xnli_en,xnli_fr,sciq,piqa,lambada,arc_easy --device cuda --output_path /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_sciq/perf

###############
command outputs:

2024-05-11:17:58:30,428 INFO [utils.py:145] Note: detected 255 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
2024-05-11:17:58:30,429 INFO [utils.py:148] Note: NumExpr detected 255 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2024-05-11:17:58:30,429 INFO [utils.py:160] NumExpr defaulting to 8 threads.
2024-05-11:17:58:31,929 INFO [config.py:58] PyTorch version 2.2.2 available.
2024-05-11:17:58:45,100 INFO [__main__.py:156] Verbosity set to INFO
2024-05-11:17:59:06,539 WARNING [__init__.py:194] Some tasks could not be loaded due to missing dependencies. Run with `--verbosity DEBUG` for full details.
srun: Job step aborted: Waiting up to 32 seconds for job step to finish.
slurmstepd: error: *** JOB 7598874 ON sphinx2 CANCELLED AT 2024-05-11T17:59:25 ***
slurmstepd: error: *** STEP 7598874.0 ON sphinx2 CANCELLED AT 2024-05-11T17:59:25 ***
Received SIGTERM, job terminating, terminating 1 processes...
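
note: the CANCELLED / SIGTERM messages above indicate that job 7598874 was terminated from outside the job step rather than by an lm_eval failure; the evaluation had only reached task loading when the job was killed, and the log continues with a fresh submission chained to a new upstream job. One way to confirm how the job ended, assuming sacct accounting is available on this cluster:

sacct -j 7598874 --format=JobID,JobName,State,ExitCode,Elapsed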

slurm submission log: 2024-05-11 18:01:39.856307
created following sbatch script:

###############

#!/bin/bash

#SBATCH --account=nlp
#SBATCH --cpus-per-task=16
#SBATCH --dependency=afterok:7598912
#SBATCH --gres=gpu:1
#SBATCH --job-name=tthrush-job-2986377
#SBATCH --mem=60G
#SBATCH --nodelist=sphinx2
#SBATCH --open-mode=append
#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_sciq/eval_job_output.txt
#SBATCH --partition=sphinx
#SBATCH --time=14-0

# activate your desired anaconda environment
. /nlp/scr/tthrush/miniconda3/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection

# cd to working directory
cd .

# launch commands
srun --unbuffered run_as_child_processes 'lm_eval --model hf --model_args pretrained=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_sciq,revision=main,dtype=float16,trust_remote_code=True --tasks xnli_en,xnli_fr,sciq,piqa,lambada,arc_easy --device cuda --output_path /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/llms_5/pythia-70m_sciq/perf'

###############

submission to slurm complete!

###############
slurm submission output

Submitted batch job 7598913

###############