3outeille HF staff commited on
Commit
31f7d06
1 Parent(s): b90eceb

Upload llama-1B/16_GPUS/dp-1_tp-4_pp-4_mbz-32

Browse files
llama-1B/16_GPUS/dp-1_tp-4_pp-4_mbz-32/bench.slurm ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ #SBATCH --job-name=bench_cluster
4
+ #SBATCH --time=00:59:00
5
+ #SBATCH --partition=hopper-prod
6
+ #SBATCH --nodes=2
7
+ #SBATCH --gres=gpu:8
8
+ #SBATCH --qos=high
9
+ #SBATCH --ntasks-per-node=1
10
+ #SBATCH --cpus-per-task=96
11
+ #SBATCH --exclusive
12
+ #SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-1_tp-4_pp-4_mbz-32/log.out
13
+ #SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-1_tp-4_pp-4_mbz-32/log.out
14
+
15
+ # Function to update status based on squeue output
16
+ update_status() {
17
+ job_id=$1
18
+ status_file=$2
19
+ # For unknown reasons, it doenst update status for pending. It only works for running
20
+ while true; do
21
+ job_status=$(squeue --job $job_id --noheader --format=%T)
22
+ echo "Job status: $job_status"
23
+ if [ -z "$job_status" ]; then
24
+ # Job has finished or is not found
25
+ break
26
+ elif [ "$job_status" = "RUNNING" ]; then
27
+ printf "running" > $status_file
28
+ break
29
+ fi
30
+ sleep 10
31
+ done
32
+ }
33
+
34
+ # Misc initializations.
35
+ echo "========================"
36
+ echo "START TIME: $(date)"
37
+ source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh
38
+ conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster
39
+ echo python3 version = $(python3 --version)
40
+ echo "========================"
41
+
42
+ # Slurm stuff
43
+ export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
44
+ export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
45
+ export MASTER_PORT=$((1024 + RANDOM % 64511))
46
+
47
+ export TMPDIR=/scratch
48
+ export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache"
49
+ export CUBLAS_WORKSPACE_CONFIG=":4096:8"
50
+ export CUDA_DEVICE_MAX_CONNECTIONS="1"
51
+
52
+ huggingface-cli login --token $HUGGINGFACE_TOKEN
53
+
54
+
55
+ NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron"
56
+ CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-1_tp-4_pp-4_mbz-32/config.yaml"
57
+
58
+ LAUNCHER="torchrun \
59
+ --nproc_per_node 8 \
60
+ --nnodes 2 \
61
+ --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \
62
+ --rdzv_backend c10d \
63
+ --max_restarts 0 \
64
+ --tee 3 \
65
+ --node_rank ${SLURM_PROCID}"
66
+
67
+ # Checkout the bench_cluster branch
68
+ cd $NANOTRON_REPO
69
+ git checkout bench_cluster
70
+ cd ..
71
+ # Get the current job ID
72
+ job_id=${SLURM_JOB_ID}
73
+
74
+ # Update status to "pending" or "running" in the background
75
+ update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-1_tp-4_pp-4_mbz-32/status.txt &
76
+
77
+ # Run the main command
78
+ srun -u $LAUNCHER $CMD
79
+ exit_status=$?
80
+
81
+ # Update status based on the exit status of `srun`
82
+ if [ $exit_status -eq 0 ]; then
83
+ printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-1_tp-4_pp-4_mbz-32/status.txt
84
+ else
85
+ if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-1_tp-4_pp-4_mbz-32/log.out; then
86
+ printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-1_tp-4_pp-4_mbz-32/status.txt
87
+ elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-1_tp-4_pp-4_mbz-32/log.out; then
88
+ printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-1_tp-4_pp-4_mbz-32/status.txt
89
+ elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-1_tp-4_pp-4_mbz-32/log.out; then
90
+ printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-1_tp-4_pp-4_mbz-32/status.txt
91
+ else
92
+ printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-1_tp-4_pp-4_mbz-32/status.txt
93
+ fi
94
+ fi
95
+
96
+ # Run the report script if the job completed successfully
97
+ if [ $exit_status -eq 0 ]; then
98
+ python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-1_tp-4_pp-4_mbz-32 --is_logs
99
+ python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-1_tp-4_pp-4_mbz-32 --is_profiler
100
+ fi
101
+
102
+
103
+ # Push to hub the folder using huggingface_cli
104
+ huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-1_tp-4_pp-4_mbz-32 llama-1B/16_GPUS/dp-1_tp-4_pp-4_mbz-32 --commit-message "Upload llama-1B/16_GPUS/dp-1_tp-4_pp-4_mbz-32"
105
+
106
+ # Verify the upload
107
+ if [ $? -eq 0 ]; then
108
+ echo "Uploading to Huggingface Hub successful"
109
+ else
110
+ echo "Failed to upload to Huggingface Hub"
111
+ fi
llama-1B/16_GPUS/dp-1_tp-4_pp-4_mbz-32/config.yaml ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ general:
2
+ project: bench_cluster
3
+ seed: 42
4
+ model:
5
+ ddp_bucket_cap_mb: 25
6
+ dtype: bfloat16
7
+ init_method:
8
+ std: 0.025
9
+ make_vocab_size_divisible_by: 1
10
+ model_config:
11
+ bos_token_id: 1
12
+ eos_token_id: 2
13
+ hidden_act: silu
14
+ hidden_size: 2048
15
+ initializer_range: 0.02
16
+ intermediate_size: 4096
17
+ is_llama_config: true
18
+ max_position_embeddings: 4096
19
+ num_attention_heads: 32
20
+ num_hidden_layers: 24
21
+ num_key_value_heads: 32
22
+ pad_token_id: null
23
+ pretraining_tp: 1
24
+ rms_norm_eps: 1.0e-05
25
+ rope_scaling: null
26
+ rope_theta: 10000.0
27
+ tie_word_embeddings: true
28
+ use_cache: true
29
+ vocab_size: 50257
30
+ optimizer:
31
+ accumulate_grad_in_fp32: true
32
+ clip_grad: 1.0
33
+ learning_rate_scheduler:
34
+ learning_rate: 0.0001
35
+ lr_decay_style: linear
36
+ lr_warmup_style: linear
37
+ lr_warmup_steps: 1
38
+ min_decay_lr: 1.0e-05
39
+ optimizer_factory:
40
+ adam_beta1: 0.9
41
+ adam_beta2: 0.95
42
+ adam_eps: 1.0e-08
43
+ name: adamW
44
+ torch_adam_is_fused: true
45
+ weight_decay: 0.01
46
+ zero_stage: 1
47
+ parallelism:
48
+ dp: 1
49
+ expert_parallel_size: 1
50
+ pp: 4
51
+ pp_engine: 1f1b
52
+ tp: 4
53
+ tp_linear_async_communication: false
54
+ tp_mode: REDUCE_SCATTER
55
+ profiler:
56
+ profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-1_tp-4_pp-4_mbz-32
57
+ tokenizer:
58
+ tokenizer_max_length: null
59
+ tokenizer_name_or_path: openai-community/gpt2
60
+ tokenizer_revision: null
61
+ data_stages:
62
+ - name: Training Stage
63
+ start_training_step: 1
64
+ data:
65
+ dataset:
66
+ dataset_overwrite_cache: false
67
+ dataset_processing_num_proc_per_process: 64
68
+ hf_dataset_config_name: null
69
+ hf_dataset_or_datasets: roneneldan/TinyStories
70
+ hf_dataset_splits: train
71
+ text_column_name: text
72
+ num_loading_workers: 32
73
+ seed: 42
74
+ lighteval: null
75
+ tokens:
76
+ train_steps: 20
77
+ val_check_interval: -1
78
+ batch_accumulation_per_replica: 32
79
+ limit_test_batches: 0
80
+ limit_val_batches: 0
81
+ micro_batch_size: 32
82
+ sequence_length: 4096
83
+ logging:
84
+ iteration_step_info_interval: 1
85
+ log_level: info
86
+ log_level_replica: info
87
+ checkpoints:
88
+ checkpoint_interval: 100000
89
+ checkpoints_path: /dev/null
90
+ resume_checkpoint_path: null
llama-1B/16_GPUS/dp-1_tp-4_pp-4_mbz-32/log.out ADDED
@@ -0,0 +1,800 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ========================
2
+ START TIME: Tue Jul 2 14:14:57 UTC 2024
3
+ python3 version = Python 3.10.14
4
+ ========================
5
+ The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
6
+ Token is valid (permission: write).
7
+ Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token
8
+ Login successful
9
+ Already on 'bench_cluster'
10
+ M examples/config_tiny_llama.py
11
+ M examples/config_tiny_llama.yaml
12
+ M examples/train_tiny_llama.sh
13
+ M src/nanotron/models/llama.py
14
+ M src/nanotron/trainer.py
15
+ Your branch is up to date with 'origin/bench_cluster'.
16
+ Job status: RUNNING
17
+ W0702 14:15:05.914000 140251158693696 torch/distributed/run.py:757]
18
+ W0702 14:15:05.914000 140251158693696 torch/distributed/run.py:757] *****************************************
19
+ W0702 14:15:05.914000 140251158693696 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
20
+ W0702 14:15:05.914000 140251158693696 torch/distributed/run.py:757] *****************************************
21
+ W0702 14:15:06.663000 140497670219584 torch/distributed/run.py:757]
22
+ W0702 14:15:06.663000 140497670219584 torch/distributed/run.py:757] *****************************************
23
+ W0702 14:15:06.663000 140497670219584 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
24
+ W0702 14:15:06.663000 140497670219584 torch/distributed/run.py:757] *****************************************
25
+ [default0]:07/02/2024 14:15:31 [WARNING|DP=0|PP=0|TP=0|ip-26-0-170-31]: [Vocab Size Padding] Padded vocab (size: 50257) with 3 dummy tokens (new size: 50260)
26
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: Config:
27
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: Config(general=GeneralArgs(project='bench_cluster',
28
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: run='%date_%jobid',
29
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: seed=42,
30
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: step=None,
31
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: consumed_train_samples=None,
32
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: benchmark_csv_path=None,
33
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: ignore_sanity_checks=True),
34
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: parallelism=ParallelismArgs(dp=1,
35
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: pp=4,
36
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: tp=4,
37
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: pp_engine=<nanotron.parallel.pipeline_parallel.engine.OneForwardOneBackwardPipelineEngine object at 0x7f459fdb06a0>,
38
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: tp_mode=<TensorParallelLinearMode.REDUCE_SCATTER: 2>,
39
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: tp_linear_async_communication=False,
40
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: expert_parallel_size=1),
41
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1,
42
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: eos_token_id=2,
43
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: hidden_act='silu',
44
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: hidden_size=2048,
45
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: initializer_range=0.02,
46
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: intermediate_size=4096,
47
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: is_llama_config=True,
48
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: max_position_embeddings=4096,
49
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: num_attention_heads=32,
50
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: num_hidden_layers=24,
51
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: num_key_value_heads=32,
52
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: pad_token_id=None,
53
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: pretraining_tp=1,
54
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: rms_norm_eps=1e-05,
55
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: rope_scaling=None,
56
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: rope_theta=10000.0,
57
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: tie_word_embeddings=True,
58
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: use_cache=True,
59
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: vocab_size=50260),
60
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: init_method=RandomInit(std=0.025),
61
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: dtype=torch.bfloat16,
62
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: make_vocab_size_divisible_by=1,
63
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: ddp_bucket_cap_mb=25),
64
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2',
65
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: tokenizer_revision=None,
66
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: tokenizer_max_length=None),
67
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'),
68
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: checkpoint_interval=100000,
69
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: save_initial_state=False,
70
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: resume_checkpoint_path=None,
71
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: checkpoints_path_is_shared_file_system=False),
72
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: logging=LoggingArgs(log_level='info',
73
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: log_level_replica='info',
74
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: iteration_step_info_interval=1),
75
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: tokens=TokensArgs(sequence_length=4096,
76
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: train_steps=20,
77
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: micro_batch_size=32,
78
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: batch_accumulation_per_replica=32,
79
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: val_check_interval=-1,
80
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: limit_val_batches=0,
81
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: limit_test_batches=0),
82
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08,
83
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: adam_beta1=0.9,
84
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: adam_beta2=0.95,
85
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: torch_adam_is_fused=True,
86
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: name='adamW'),
87
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: zero_stage=1,
88
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: weight_decay=0.01,
89
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: clip_grad=1.0,
90
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: accumulate_grad_in_fp32=True,
91
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001,
92
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: lr_warmup_steps=1,
93
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: lr_warmup_style='linear',
94
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: lr_decay_style='linear',
95
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: lr_decay_steps=19,
96
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: lr_decay_starting_step=None,
97
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: min_decay_lr=1e-05)),
98
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: data_stages=[DatasetStageArgs(name='Training Stage',
99
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: start_training_step=1,
100
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories',
101
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: hf_dataset_splits='train',
102
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: hf_dataset_config_name=None,
103
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: dataset_processing_num_proc_per_process=64,
104
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: dataset_overwrite_cache=False,
105
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: text_column_name='text'),
106
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: seed=42,
107
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: num_loading_workers=32))],
108
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/16_GPUS/dp-1_tp-4_pp-4_mbz-32')),
109
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: lighteval=None)
110
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: Model Config:
111
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: LlamaConfig(bos_token_id=1,
112
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: eos_token_id=2,
113
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: hidden_act='silu',
114
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: hidden_size=2048,
115
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: initializer_range=0.02,
116
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: intermediate_size=4096,
117
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: is_llama_config=True,
118
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: max_position_embeddings=4096,
119
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: num_attention_heads=32,
120
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: num_hidden_layers=24,
121
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: num_key_value_heads=32,
122
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: pad_token_id=None,
123
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: pretraining_tp=1,
124
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: rms_norm_eps=1e-05,
125
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: rope_scaling=None,
126
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: rope_theta=10000.0,
127
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: tie_word_embeddings=True,
128
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: use_cache=True,
129
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: vocab_size=50260)
130
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: Building model..
131
+ [default0]:07/02/2024 14:15:31 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: Setting PP block ranks...
132
+ [default6]:07/02/2024 14:15:47 [INFO|DP=0|PP=3|TP=2|ip-26-0-171-56]: Local number of parameters: 67.7M (129.12MiB)
133
+ [default3]:07/02/2024 14:15:47 [INFO|DP=0|PP=0|TP=3|ip-26-0-170-31]: Local number of parameters: 99.2M (189.14MiB)
134
+ [default3]:07/02/2024 14:15:47 [INFO|DP=0|PP=0|TP=3|ip-26-0-170-31]: [After model building] Memory usage: 197.07MiB. Peak allocated: 199.10MiB Peak reserved: 200.00MiB
135
+ [default3]:07/02/2024 14:15:47 [INFO|DP=0|PP=0|TP=3|ip-26-0-170-31]: No checkpoint path provided.
136
+ [default2]:07/02/2024 14:15:47 [INFO|DP=0|PP=0|TP=2|ip-26-0-170-31]: Local number of parameters: 99.2M (189.14MiB)
137
+ [default2]:07/02/2024 14:15:47 [INFO|DP=0|PP=0|TP=2|ip-26-0-170-31]: [After model building] Memory usage: 197.07MiB. Peak allocated: 199.10MiB Peak reserved: 200.00MiB
138
+ [default2]:07/02/2024 14:15:47 [INFO|DP=0|PP=0|TP=2|ip-26-0-170-31]: No checkpoint path provided.
139
+ [default7]:07/02/2024 14:15:47 [INFO|DP=0|PP=1|TP=3|ip-26-0-170-31]: Local number of parameters: 73.4M (140.05MiB)
140
+ [default7]:07/02/2024 14:15:47 [INFO|DP=0|PP=1|TP=3|ip-26-0-170-31]: [After model building] Memory usage: 147.07MiB. Peak allocated: 149.10MiB Peak reserved: 150.00MiB
141
+ [default7]:07/02/2024 14:15:47 [INFO|DP=0|PP=1|TP=3|ip-26-0-170-31]: No checkpoint path provided.
142
+ [default4]:07/02/2024 14:15:47 [INFO|DP=0|PP=1|TP=0|ip-26-0-170-31]: Local number of parameters: 73.4M (140.05MiB)
143
+ [default4]:07/02/2024 14:15:47 [INFO|DP=0|PP=1|TP=0|ip-26-0-170-31]: [After model building] Memory usage: 147.07MiB. Peak allocated: 149.10MiB Peak reserved: 150.00MiB
144
+ [default4]:07/02/2024 14:15:47 [INFO|DP=0|PP=1|TP=0|ip-26-0-170-31]: No checkpoint path provided.
145
+ [default1]:07/02/2024 14:15:47 [INFO|DP=0|PP=0|TP=1|ip-26-0-170-31]: Local number of parameters: 99.2M (189.14MiB)
146
+ [default1]:07/02/2024 14:15:47 [INFO|DP=0|PP=0|TP=1|ip-26-0-170-31]: [After model building] Memory usage: 197.07MiB. Peak allocated: 199.10MiB Peak reserved: 200.00MiB
147
+ [default1]:07/02/2024 14:15:47 [INFO|DP=0|PP=0|TP=1|ip-26-0-170-31]: No checkpoint path provided.
148
+ [default5]:07/02/2024 14:15:47 [INFO|DP=0|PP=1|TP=1|ip-26-0-170-31]: Local number of parameters: 73.4M (140.05MiB)
149
+ [default5]:07/02/2024 14:15:47 [INFO|DP=0|PP=1|TP=1|ip-26-0-170-31]: [After model building] Memory usage: 147.07MiB. Peak allocated: 149.10MiB Peak reserved: 150.00MiB
150
+ [default5]:07/02/2024 14:15:47 [INFO|DP=0|PP=1|TP=1|ip-26-0-170-31]: No checkpoint path provided.
151
+ [default6]:07/02/2024 14:15:47 [INFO|DP=0|PP=1|TP=2|ip-26-0-170-31]: Local number of parameters: 73.4M (140.05MiB)
152
+ [default6]:07/02/2024 14:15:47 [INFO|DP=0|PP=1|TP=2|ip-26-0-170-31]: [After model building] Memory usage: 147.07MiB. Peak allocated: 149.10MiB Peak reserved: 150.00MiB
153
+ [default6]:07/02/2024 14:15:47 [INFO|DP=0|PP=1|TP=2|ip-26-0-170-31]: No checkpoint path provided.
154
+ [default0]:07/02/2024 14:15:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: Total number of parameters: 1.21G (2313.42MiB)
155
+ [default0]:07/02/2024 14:15:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: Local number of parameters: 99.2M (189.14MiB)
156
+ [default0]:07/02/2024 14:15:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: [After model building] Memory usage: 197.07MiB. Peak allocated: 199.10MiB Peak reserved: 200.00MiB
157
+ [default0]:07/02/2024 14:15:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: No checkpoint path provided.
158
+ [default0]:07/02/2024 14:15:47 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: Parametrizing model parameters using StandardParametrizator
159
+ [default0]:07/02/2024 14:15:47 [INFO|DP=0|PP=2|TP=0|ip-26-0-171-56]: Local number of parameters: 62.9M (120.05MiB)
160
+ [default0]:07/02/2024 14:15:47 [INFO|DP=0|PP=2|TP=0|ip-26-0-171-56]: [After model building] Memory usage: 126.06MiB. Peak allocated: 128.09MiB Peak reserved: 130.00MiB
161
+ [default0]:07/02/2024 14:15:47 [INFO|DP=0|PP=2|TP=0|ip-26-0-171-56]: No checkpoint path provided.
162
+ [default1]:07/02/2024 14:15:47 [INFO|DP=0|PP=2|TP=1|ip-26-0-171-56]: Local number of parameters: 62.9M (120.05MiB)
163
+ [default1]:07/02/2024 14:15:47 [INFO|DP=0|PP=2|TP=1|ip-26-0-171-56]: [After model building] Memory usage: 126.06MiB. Peak allocated: 128.09MiB Peak reserved: 130.00MiB
164
+ [default1]:07/02/2024 14:15:47 [INFO|DP=0|PP=2|TP=1|ip-26-0-171-56]: No checkpoint path provided.
165
+ [default7]:07/02/2024 14:15:47 [INFO|DP=0|PP=3|TP=3|ip-26-0-171-56]: Local number of parameters: 67.7M (129.12MiB)
166
+ [default7]:07/02/2024 14:15:47 [INFO|DP=0|PP=3|TP=3|ip-26-0-171-56]: [After model building] Memory usage: 134.05MiB. Peak allocated: 136.08MiB Peak reserved: 138.00MiB
167
+ [default7]:07/02/2024 14:15:47 [INFO|DP=0|PP=3|TP=3|ip-26-0-171-56]: No checkpoint path provided.
168
+ [default2]:07/02/2024 14:15:47 [INFO|DP=0|PP=2|TP=2|ip-26-0-171-56]: Local number of parameters: 62.9M (120.05MiB)
169
+ [default2]:07/02/2024 14:15:47 [INFO|DP=0|PP=2|TP=2|ip-26-0-171-56]: [After model building] Memory usage: 126.06MiB. Peak allocated: 128.09MiB Peak reserved: 130.00MiB
170
+ [default2]:07/02/2024 14:15:47 [INFO|DP=0|PP=2|TP=2|ip-26-0-171-56]: No checkpoint path provided.
171
+ [default3]:07/02/2024 14:15:47 [INFO|DP=0|PP=2|TP=3|ip-26-0-171-56]: Local number of parameters: 62.9M (120.05MiB)
172
+ [default5]:07/02/2024 14:15:47 [INFO|DP=0|PP=3|TP=1|ip-26-0-171-56]: Local number of parameters: 67.7M (129.12MiB)
173
+ [default5]:07/02/2024 14:15:47 [INFO|DP=0|PP=3|TP=1|ip-26-0-171-56]: [After model building] Memory usage: 134.05MiB. Peak allocated: 136.08MiB Peak reserved: 138.00MiB
174
+ [default3]:07/02/2024 14:15:47 [INFO|DP=0|PP=2|TP=3|ip-26-0-171-56]: [After model building] Memory usage: 126.06MiB. Peak allocated: 128.09MiB Peak reserved: 130.00MiB
175
+ [default4]:07/02/2024 14:15:47 [INFO|DP=0|PP=3|TP=0|ip-26-0-171-56]: Local number of parameters: 67.7M (129.12MiB)
176
+ [default4]:07/02/2024 14:15:47 [INFO|DP=0|PP=3|TP=0|ip-26-0-171-56]: [After model building] Memory usage: 134.05MiB. Peak allocated: 136.08MiB Peak reserved: 138.00MiB
177
+ [default4]:07/02/2024 14:15:47 [INFO|DP=0|PP=3|TP=0|ip-26-0-171-56]: No checkpoint path provided.
178
+ [default3]:07/02/2024 14:15:47 [INFO|DP=0|PP=2|TP=3|ip-26-0-171-56]: No checkpoint path provided.
179
+ [default5]:07/02/2024 14:15:47 [INFO|DP=0|PP=3|TP=1|ip-26-0-171-56]: No checkpoint path provided.
180
+ [default6]:07/02/2024 14:15:47 [INFO|DP=0|PP=3|TP=2|ip-26-0-171-56]: [After model building] Memory usage: 134.05MiB. Peak allocated: 136.08MiB Peak reserved: 138.00MiB
181
+ [default6]:07/02/2024 14:15:47 [INFO|DP=0|PP=3|TP=2|ip-26-0-171-56]: No checkpoint path provided.
182
+ [default0]:07/02/2024 14:15:49 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: [Optimizer Building] Using LearningRateForSP as learning rate
183
+ [default0]:07/02/2024 14:15:49 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: [ZeRO sharding] Size of optimizer params per rank:
184
+ [default0]:07/02/2024 14:15:49 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: [ZeRO sharding] DP Rank 0 has 99.2M out of 99.2M (100.00%) params' optimizer states
185
+ [default0]:07/02/2024 14:15:50 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples
186
+ [default0]:07/02/2024 14:15:50 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: Using `datasets` library
187
+ [default0]:07/02/2024 14:15:50 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4')
188
+ [default0]:07/02/2024 14:15:50 [WARNING|DP=0|PP=0|TP=0|ip-26-0-170-31]: Repo card metadata block was not found. Setting CardData to empty.
189
+ [default0]:Repo card metadata block was not found. Setting CardData to empty.
190
+ [default0]:07/02/2024 14:15:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: [Training Plan] There are 1 training stages
191
+ [default0]:07/02/2024 14:15:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: [Stage Training Stage] start from step 1
192
+ [default0]:07/02/2024 14:15:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]:
193
+ [default0]:07/02/2024 14:15:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: [Start training] datetime: 2024-07-02 14:15:52.769469 | mbs: 32 | grad_accum: 32 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0
194
+ [default0]:07/02/2024 14:15:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps
195
+ [default0]:07/02/2024 14:15:52 [INFO|DP=0|PP=0|TP=0|ip-26-0-170-31]: Memory usage: 953.61MiB. Peak allocated 953.61MiB. Peak reserved: 960.00MiB
196
+ [default5]:07/02/2024 14:15:52 [WARNING|DP=0|PP=3|TP=1|ip-26-0-171-56]: Repo card metadata block was not found. Setting CardData to empty.
197
+ [default7]:Repo card metadata block was not found. Setting CardData to empty.
198
+ [default3]:07/02/2024 14:15:52 [WARNING|DP=0|PP=2|TP=3|ip-26-0-171-56]: Repo card metadata block was not found. Setting CardData to empty.
199
+ [default7]:07/02/2024 14:15:52 [WARNING|DP=0|PP=3|TP=3|ip-26-0-171-56]: Repo card metadata block was not found. Setting CardData to empty.
200
+ [default2]:07/02/2024 14:15:52 [WARNING|DP=0|PP=2|TP=2|ip-26-0-171-56]: Repo card metadata block was not found. Setting CardData to empty.
201
+ [default5]:Repo card metadata block was not found. Setting CardData to empty.
202
+ [default2]:Repo card metadata block was not found. Setting CardData to empty.
203
+ [default3]:Repo card metadata block was not found. Setting CardData to empty.
204
+ [default3]:07/02/2024 14:15:52 [WARNING|DP=0|PP=0|TP=3|ip-26-0-170-31]: Repo card metadata block was not found. Setting CardData to empty.
205
+ [default2]:Repo card metadata block was not found. Setting CardData to empty.
206
+ [default2]:07/02/2024 14:15:52 [WARNING|DP=0|PP=0|TP=2|ip-26-0-170-31]: Repo card metadata block was not found. Setting CardData to empty.
207
+ [default7]:07/02/2024 14:15:52 [WARNING|DP=0|PP=1|TP=3|ip-26-0-170-31]: Repo card metadata block was not found. Setting CardData to empty.
208
+ [default1]:Repo card metadata block was not found. Setting CardData to empty.
209
+ [default7]:Repo card metadata block was not found. Setting CardData to empty.
210
+ [default5]:Repo card metadata block was not found. Setting CardData to empty.
211
+ [default4]:Repo card metadata block was not found. Setting CardData to empty.
212
+ [default4]:07/02/2024 14:15:52 [WARNING|DP=0|PP=1|TP=0|ip-26-0-170-31]: Repo card metadata block was not found. Setting CardData to empty.
213
+ [default1]:07/02/2024 14:15:52 [WARNING|DP=0|PP=0|TP=1|ip-26-0-170-31]: Repo card metadata block was not found. Setting CardData to empty.
214
+ [default3]:Repo card metadata block was not found. Setting CardData to empty.
215
+ [default5]:07/02/2024 14:15:52 [WARNING|DP=0|PP=1|TP=1|ip-26-0-170-31]: Repo card metadata block was not found. Setting CardData to empty.
216
+ [default6]:Repo card metadata block was not found. Setting CardData to empty.
217
+ [default6]:07/02/2024 14:15:52 [WARNING|DP=0|PP=1|TP=2|ip-26-0-170-31]: Repo card metadata block was not found. Setting CardData to empty.
218
+ [default0]:Repo card metadata block was not found. Setting CardData to empty.
219
+ [default0]:07/02/2024 14:15:52 [WARNING|DP=0|PP=2|TP=0|ip-26-0-171-56]: Repo card metadata block was not found. Setting CardData to empty.
220
+ [default1]:07/02/2024 14:15:53 [WARNING|DP=0|PP=2|TP=1|ip-26-0-171-56]: Repo card metadata block was not found. Setting CardData to empty.
221
+ [default6]:Repo card metadata block was not found. Setting CardData to empty.
222
+ [default6]:07/02/2024 14:15:53 [WARNING|DP=0|PP=3|TP=2|ip-26-0-171-56]: Repo card metadata block was not found. Setting CardData to empty.
223
+ [default1]:Repo card metadata block was not found. Setting CardData to empty.
224
+ [default4]:Repo card metadata block was not found. Setting CardData to empty.
225
+ [default4]:07/02/2024 14:15:53 [WARNING|DP=0|PP=3|TP=0|ip-26-0-171-56]: Repo card metadata block was not found. Setting CardData to empty.
226
+ [default2]:[rank2]: Traceback (most recent call last):
227
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
228
+ [default2]:[rank2]: trainer.train(dataloader)
229
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
230
+ [default2]:[rank2]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
231
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
232
+ [default2]:[rank2]: outputs = self.pipeline_engine.train_batch_iter(
233
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
234
+ [default2]:[rank2]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
235
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
236
+ [default2]:[rank2]: output = model(**micro_batch)
237
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
238
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
239
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
240
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
241
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
242
+ [default2]:[rank2]: sharded_logits = self.model(
243
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
244
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
245
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
246
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
247
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
248
+ [default2]:[rank2]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
249
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
250
+ [default2]:[rank2]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
251
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
252
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
253
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
254
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
255
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
256
+ [default2]:[rank2]: output = self.pp_block(**new_kwargs)
257
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
258
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
259
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
260
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
261
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
262
+ [default2]:[rank2]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
263
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
264
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
265
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
266
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
267
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
268
+ [default2]:[rank2]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
269
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
270
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
271
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
272
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
273
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 128, in forward
274
+ [default2]:[rank2]: return self.act(gate_states) * up_states
275
+ [default2]:[rank2]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 MiB. GPU  has a total capacity of 79.33 GiB of which 205.94 MiB is free. Including non-PyTorch memory, this process has 79.11 GiB memory in use. Of the allocated memory 69.81 GiB is allocated by PyTorch, and 141.60 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
276
+ [default0]:[rank0]: Traceback (most recent call last):
277
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
278
+ [default0]:[rank0]: trainer.train(dataloader)
279
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
280
+ [default0]:[rank0]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
281
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
282
+ [default0]:[rank0]: outputs = self.pipeline_engine.train_batch_iter(
283
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
284
+ [default0]:[rank0]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
285
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
286
+ [default0]:[rank0]: output = model(**micro_batch)
287
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
288
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
289
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
290
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
291
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
292
+ [default0]:[rank0]: sharded_logits = self.model(
293
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
294
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
295
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
296
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
297
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
298
+ [default0]:[rank0]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
299
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
300
+ [default0]:[rank0]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
301
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
302
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
303
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
304
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
305
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
306
+ [default0]:[rank0]: output = self.pp_block(**new_kwargs)
307
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
308
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
309
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
310
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
311
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
312
+ [default0]:[rank0]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
313
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
314
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
315
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
316
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
317
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
318
+ [default0]:[rank0]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
319
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
320
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
321
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
322
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
323
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward
324
+ [default0]:[rank0]: return row_linear(
325
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear
326
+ [default0]:[rank0]: out = F.linear(input, weight, bias)
327
+ [default0]:[rank0]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 512.00 MiB. GPU
328
+ [default1]:[rank1]: Traceback (most recent call last):
329
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
330
+ [default1]:[rank1]: trainer.train(dataloader)
331
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
332
+ [default1]:[rank1]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
333
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
334
+ [default1]:[rank1]: outputs = self.pipeline_engine.train_batch_iter(
335
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
336
+ [default1]:[rank1]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
337
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
338
+ [default1]:[rank1]: output = model(**micro_batch)
339
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
340
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
341
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
342
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
343
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
344
+ [default1]:[rank1]: sharded_logits = self.model(
345
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
346
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
347
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
348
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
349
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
350
+ [default1]:[rank1]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
351
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
352
+ [default1]:[rank1]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
353
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
354
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
355
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
356
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
357
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
358
+ [default1]:[rank1]: output = self.pp_block(**new_kwargs)
359
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
360
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
361
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
362
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
363
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
364
+ [default1]:[rank1]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
365
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
366
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
367
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
368
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
369
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
370
+ [default1]:[rank1]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
371
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
372
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
373
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
374
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
375
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward
376
+ [default1]:[rank1]: return row_linear(
377
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear
378
+ [default1]:[rank1]: out = F.linear(input, weight, bias)
379
+ [default1]:[rank1]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 512.00 MiB. GPU  has a total capacity of 79.33 GiB of which 35.94 MiB is free. Including non-PyTorch memory, this process has 79.28 GiB memory in use. Of the allocated memory 70.06 GiB is allocated by PyTorch, and 141.60 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
380
+ [default1]:Exception in thread Thread-2 (_pin_memory_loop):
381
+ [default1]:Traceback (most recent call last):
382
+ [default1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
383
+ [default1]: self.run()
384
+ [default1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/threading.py", line 953, in run
385
+ [default1]: self._target(*self._args, **self._kwargs)
386
+ [default1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/utils/data/_utils/pin_memory.py", line 54, in _pin_memory_loop
387
+ [default1]: do_one_step()
388
+ [default1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/utils/data/_utils/pin_memory.py", line 31, in do_one_step
389
+ [default1]: r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
390
+ [default1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/multiprocessing/queues.py", line 122, in get
391
+ [default1]: return _ForkingPickler.loads(res)
392
+ [default1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/multiprocessing/reductions.py", line 495, in rebuild_storage_fd
393
+ [default1]: fd = df.detach()
394
+ [default1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/multiprocessing/resource_sharer.py", line 57, in detach
395
+ [default1]: with _resource_sharer.get_connection(self._id) as conn:
396
+ [default1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/multiprocessing/resource_sharer.py", line 86, in get_connection
397
+ [default1]: c = Client(address, authkey=process.current_process().authkey)
398
+ [default1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/multiprocessing/connection.py", line 508, in Client
399
+ [default1]: answer_challenge(c, authkey)
400
+ [default1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/multiprocessing/connection.py", line 752, in answer_challenge
401
+ [default1]: message = connection.recv_bytes(256) # reject large message
402
+ [default1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/multiprocessing/connection.py", line 216, in recv_bytes
403
+ [default1]: buf = self._recv_bytes(maxlength)
404
+ [default1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/multiprocessing/connection.py", line 414, in _recv_bytes
405
+ [default1]: buf = self._recv(4)
406
+ [default1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/multiprocessing/connection.py", line 379, in _recv
407
+ [default1]: chunk = read(handle, remaining)
408
+ [default1]:ConnectionResetError: [Errno 104] Connection reset by peer
409
+ [default3]:[rank3]: Traceback (most recent call last):
410
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
411
+ [default3]:[rank3]: trainer.train(dataloader)
412
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
413
+ [default3]:[rank3]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
414
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
415
+ [default3]:[rank3]: outputs = self.pipeline_engine.train_batch_iter(
416
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
417
+ [default3]:[rank3]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
418
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
419
+ [default3]:[rank3]: output = model(**micro_batch)
420
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
421
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
422
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
423
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
424
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
425
+ [default3]:[rank3]: sharded_logits = self.model(
426
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
427
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
428
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
429
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
430
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
431
+ [default3]:[rank3]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
432
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
433
+ [default3]:[rank3]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
434
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
435
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
436
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
437
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
438
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
439
+ [default3]:[rank3]: output = self.pp_block(**new_kwargs)
440
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
441
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
442
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
443
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
444
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
445
+ [default3]:[rank3]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
446
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
447
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
448
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
449
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
450
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 172, in forward
451
+ [default3]:[rank3]: hidden_states = self.down_proj(self.split_silu_mul(merged_states))
452
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
453
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
454
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
455
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
456
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 159, in forward
457
+ [default3]:[rank3]: return row_linear(
458
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 474, in row_linear
459
+ [default3]:[rank3]: out = F.linear(input, weight, bias)
460
+ [default3]:[rank3]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 512.00 MiB. GPU  has a total capacity of 79.33 GiB of which 335.94 MiB is free. Including non-PyTorch memory, this process has 78.99 GiB memory in use. Of the allocated memory 70.06 GiB is allocated by PyTorch, and 141.60 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
461
+ [default0]:Exception in thread Thread-2 (_pin_memory_loop):
462
+ [default0]:Traceback (most recent call last):
463
+ [default0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
464
+ [default0]: self.run()
465
+ [default0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/threading.py", line 953, in run
466
+ [default0]: self._target(*self._args, **self._kwargs)
467
+ [default0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/utils/data/_utils/pin_memory.py", line 54, in _pin_memory_loop
468
+ [default0]: do_one_step()
469
+ [default0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/utils/data/_utils/pin_memory.py", line 31, in do_one_step
470
+ [default0]: r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
471
+ [default0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/multiprocessing/queues.py", line 122, in get
472
+ [default0]: return _ForkingPickler.loads(res)
473
+ [default0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/multiprocessing/reductions.py", line 495, in rebuild_storage_fd
474
+ [default0]: fd = df.detach()
475
+ [default0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/multiprocessing/resource_sharer.py", line 57, in detach
476
+ [default0]: with _resource_sharer.get_connection(self._id) as conn:
477
+ [default0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/multiprocessing/resource_sharer.py", line 86, in get_connection
478
+ [default0]: c = Client(address, authkey=process.current_process().authkey)
479
+ [default0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/multiprocessing/connection.py", line 508, in Client
480
+ [default0]: answer_challenge(c, authkey)
481
+ [default0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/multiprocessing/connection.py", line 757, in answer_challenge
482
+ [default0]: response = connection.recv_bytes(256) # reject large message
483
+ [default0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/multiprocessing/connection.py", line 216, in recv_bytes
484
+ [default0]: buf = self._recv_bytes(maxlength)
485
+ [default0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/multiprocessing/connection.py", line 414, in _recv_bytes
486
+ [default0]: buf = self._recv(4)
487
+ [default0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/multiprocessing/connection.py", line 379, in _recv
488
+ [default0]: chunk = read(handle, remaining)
489
+ [default0]:ConnectionResetError: [Errno 104] Connection reset by peer
490
+ W0702 14:16:13.047000 140497670219584 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2695551 closing signal SIGTERM
491
+ W0702 14:16:13.048000 140497670219584 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2695552 closing signal SIGTERM
492
+ W0702 14:16:13.049000 140497670219584 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2695553 closing signal SIGTERM
493
+ W0702 14:16:13.049000 140497670219584 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2695554 closing signal SIGTERM
494
+ [default4]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
495
+ [default4]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
496
+ [default5]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
497
+ [default5]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
498
+ [default6]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
499
+ [default6]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
500
+ [default7]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
501
+ [default7]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
502
+ E0702 14:16:15.183000 140497670219584 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 2695547) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
503
+ Traceback (most recent call last):
504
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in <module>
505
+ sys.exit(main())
506
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
507
+ return f(*args, **kwargs)
508
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main
509
+ run(args)
510
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run
511
+ elastic_launch(
512
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
513
+ return launch_agent(self._config, self._entrypoint, list(args))
514
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent
515
+ raise ChildFailedError(
516
+ torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
517
+ ============================================================
518
+ /fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED
519
+ ------------------------------------------------------------
520
+ Failures:
521
+ [1]:
522
+ time : 2024-07-02_14:16:13
523
+ host : ip-26-0-170-31.ec2.internal
524
+ rank : 1 (local_rank: 1)
525
+ exitcode : 1 (pid: 2695548)
526
+ error_file: <N/A>
527
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
528
+ [2]:
529
+ time : 2024-07-02_14:16:13
530
+ host : ip-26-0-170-31.ec2.internal
531
+ rank : 2 (local_rank: 2)
532
+ exitcode : 1 (pid: 2695549)
533
+ error_file: <N/A>
534
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
535
+ [3]:
536
+ time : 2024-07-02_14:16:13
537
+ host : ip-26-0-170-31.ec2.internal
538
+ rank : 3 (local_rank: 3)
539
+ exitcode : 1 (pid: 2695550)
540
+ error_file: <N/A>
541
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
542
+ ------------------------------------------------------------
543
+ Root Cause (first observed failure):
544
+ [0]:
545
+ time : 2024-07-02_14:16:13
546
+ host : ip-26-0-170-31.ec2.internal
547
+ rank : 0 (local_rank: 0)
548
+ exitcode : 1 (pid: 2695547)
549
+ error_file: <N/A>
550
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
551
+ ============================================================
552
+ srun: error: ip-26-0-170-31: task 0: Exited with exit code 1
553
+ [default0]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
554
+ [default0]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
555
+ [default2]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
556
+ [default2]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
557
+ [default3]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
558
+ [default3]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
559
+ [default1]:/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/autograd/graph.py:744: UserWarning: c10d::allreduce_: an autograd kernel was not registered to the Autograd key(s) but we are trying to backprop through it. This may lead to silently incorrect behavior. This behavior is deprecated and will be removed in a future version of PyTorch. If your operator is differentiable, please ensure you have registered an autograd kernel to the correct Autograd key (e.g. DispatchKey::Autograd, DispatchKey::CompositeImplicitAutograd). If your operator is not differentiable, or to squash this warning and use the previous behavior, please register torch::CppFunction::makeFallthrough() to DispatchKey::Autograd. (Triggered internally at ../torch/csrc/autograd/autograd_not_implemented_fallback.cpp:63.)
560
+ [default1]: return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
561
+ W0702 14:16:17.053000 140245491873536 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1252] The node 'ip-26-0-171-56.ec2.internal_2954948_0' has failed to send a keep-alive heartbeat to the rendezvous 'none' due to an error of type RendezvousConnectionError.
562
+ [default1]:[rank9]: Traceback (most recent call last):
563
+ [default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
564
+ [default2]:[rank10]: Traceback (most recent call last):
565
+ [default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
566
+ [default3]:[rank11]: Traceback (most recent call last):
567
+ [default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
568
+ [default3]:[rank11]: trainer.train(dataloader)
569
+ [default1]:[rank9]: trainer.train(dataloader)
570
+ [default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
571
+ [default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
572
+ [default2]:[rank10]: trainer.train(dataloader)
573
+ [default1]:[rank9]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
574
+ [default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
575
+ [default3]:[rank11]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
576
+ [default2]:[rank10]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
577
+ [default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
578
+ [default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
579
+ [default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
580
+ [default1]:[rank9]: outputs = self.pipeline_engine.train_batch_iter(
581
+ [default2]:[rank10]: outputs = self.pipeline_engine.train_batch_iter(
582
+ [default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
583
+ [default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
584
+ [default1]:[rank9]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
585
+ [default2]:[rank10]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
586
+ [default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
587
+ [default3]:[rank11]: outputs = self.pipeline_engine.train_batch_iter(
588
+ [default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
589
+ [default3]:[rank11]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
590
+ [default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
591
+ [default3]:[rank11]: output = model(**micro_batch)
592
+ [default2]:[rank10]: output = model(**micro_batch)
593
+ [default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
594
+ [default2]:[rank10]: return self._call_impl(*args, **kwargs)
595
+ [default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
596
+ [default2]:[rank10]: return forward_call(*args, **kwargs)
597
+ [default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
598
+ [default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
599
+ [default2]:[rank10]: sharded_logits = self.model(
600
+ [default3]:[rank11]: return self._call_impl(*args, **kwargs)
601
+ [default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
602
+ [default2]:[rank10]: return self._call_impl(*args, **kwargs)
603
+ [default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
604
+ [default1]:[rank9]: output = model(**micro_batch)
605
+ [default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
606
+ [default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
607
+ [default2]:[rank10]: return forward_call(*args, **kwargs)
608
+ [default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
609
+ [default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
610
+ [default1]:[rank9]: return self._call_impl(*args, **kwargs)
611
+ [default2]:[rank10]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
612
+ [default3]:[rank11]: return forward_call(*args, **kwargs)
613
+ [default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
614
+ [default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
615
+ [default2]:[rank10]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
616
+ [default1]:[rank9]: return forward_call(*args, **kwargs)
617
+ [default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
618
+ [default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
619
+ [default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
620
+ [default3]:[rank11]: sharded_logits = self.model(
621
+ [default2]:[rank10]: return self._call_impl(*args, **kwargs)
622
+ [default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
623
+ [default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
624
+ [default1]:[rank9]: sharded_logits = self.model(
625
+ [default2]:[rank10]: return forward_call(*args, **kwargs)
626
+ [default3]:[rank11]: return self._call_impl(*args, **kwargs)
627
+ [default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
628
+ [default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
629
+ [default1]:[rank9]: return self._call_impl(*args, **kwargs)
630
+ [default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
631
+ [default1]:[rank9]: return forward_call(*args, **kwargs)
632
+ [default2]:[rank10]: new_kwargs[name] = recv_from_pipeline_state_buffer(
633
+ [default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
634
+ [default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
635
+ [default2]:[rank10]: pipeline_state.run_communication()
636
+ [default1]:[rank9]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
637
+ [default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 160, in run_communication
638
+ [default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
639
+ [default2]:[rank10]: send_grad()
640
+ [default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
641
+ [default1]:[rank9]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
642
+ [default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 41, in __call__
643
+ [default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
644
+ [default3]:[rank11]: return forward_call(*args, **kwargs)
645
+ [default1]:[rank9]: return self._call_impl(*args, **kwargs)
646
+ [default2]:[rank10]: self.p2p.send_tensors([self.grad], to_rank=self.to_rank)
647
+ [default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
648
+ [default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors
649
+ [default2]:[rank10]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag)
650
+ [default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
651
+ [default1]:[rank9]: return forward_call(*args, **kwargs)
652
+ [default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors
653
+ [default3]:[rank11]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
654
+ [default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
655
+ [default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
656
+ [default1]:[rank9]: new_kwargs[name] = recv_from_pipeline_state_buffer(
657
+ [default3]:[rank11]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
658
+ [default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
659
+ [default2]:[rank10]: self._send_meta(tensor, to_rank=to_rank, tag=tag)
660
+ [default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
661
+ [default1]:[rank9]: pipeline_state.run_communication()
662
+ [default3]:[rank11]: return self._call_impl(*args, **kwargs)
663
+ [default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 160, in run_communication
664
+ [default2]:[rank10]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta
665
+ [default1]:[rank9]: send_grad()
666
+ [default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
667
+ [default2]:[rank10]: dist.send(
668
+ [default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 41, in __call__
669
+ [default1]:[rank9]: self.p2p.send_tensors([self.grad], to_rank=self.to_rank)
670
+ [default3]:[rank11]: return forward_call(*args, **kwargs)
671
+ [default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
672
+ [default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors
673
+ [default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
674
+ [default1]:[rank9]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag)
675
+ [default3]:[rank11]: new_kwargs[name] = recv_from_pipeline_state_buffer(
676
+ [default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors
677
+ [default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
678
+ [default1]:[rank9]: self._send_meta(tensor, to_rank=to_rank, tag=tag)
679
+ [default3]:[rank11]: pipeline_state.run_communication()
680
+ [default2]:[rank10]: return func(*args, **kwargs)
681
+ [default1]:[rank9]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta
682
+ [default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 160, in run_communication
683
+ [default3]:[rank11]: send_grad()
684
+ [default1]:[rank9]: dist.send(
685
+ [default2]:[rank10]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send
686
+ [default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 41, in __call__
687
+ [default2]:[rank10]: group.send([tensor], group_dst_rank, tag).wait()
688
+ [default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
689
+ [default2]:[rank10]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
690
+ [default1]:[rank9]: return func(*args, **kwargs)
691
+ [default3]:[rank11]: self.p2p.send_tensors([self.grad], to_rank=self.to_rank)
692
+ [default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors
693
+ [default3]:[rank11]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag)
694
+ [default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors
695
+ [default1]:[rank9]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send
696
+ [default3]:[rank11]: self._send_meta(tensor, to_rank=to_rank, tag=tag)
697
+ [default1]:[rank9]: group.send([tensor], group_dst_rank, tag).wait()
698
+ [default3]:[rank11]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta
699
+ [default1]:[rank9]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
700
+ [default3]:[rank11]: dist.send(
701
+ [default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
702
+ [default3]:[rank11]: return func(*args, **kwargs)
703
+ [default3]:[rank11]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send
704
+ [default3]:[rank11]: group.send([tensor], group_dst_rank, tag).wait()
705
+ [default3]:[rank11]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
706
+ [default0]:[rank8]: Traceback (most recent call last):
707
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
708
+ [default0]:[rank8]: trainer.train(dataloader)
709
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
710
+ [default0]:[rank8]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
711
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
712
+ [default0]:[rank8]: outputs = self.pipeline_engine.train_batch_iter(
713
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
714
+ [default0]:[rank8]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
715
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
716
+ [default0]:[rank8]: output = model(**micro_batch)
717
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
718
+ [default0]:[rank8]: return self._call_impl(*args, **kwargs)
719
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
720
+ [default0]:[rank8]: return forward_call(*args, **kwargs)
721
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
722
+ [default0]:[rank8]: sharded_logits = self.model(
723
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
724
+ [default0]:[rank8]: return self._call_impl(*args, **kwargs)
725
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
726
+ [default0]:[rank8]: return forward_call(*args, **kwargs)
727
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
728
+ [default0]:[rank8]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
729
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
730
+ [default0]:[rank8]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
731
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
732
+ [default0]:[rank8]: return self._call_impl(*args, **kwargs)
733
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
734
+ [default0]:[rank8]: return forward_call(*args, **kwargs)
735
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 126, in forward
736
+ [default0]:[rank8]: new_kwargs[name] = recv_from_pipeline_state_buffer(
737
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/functional.py", line 117, in recv_from_pipeline_state_buffer
738
+ [default0]:[rank8]: pipeline_state.run_communication()
739
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 160, in run_communication
740
+ [default0]:[rank8]: send_grad()
741
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/state.py", line 41, in __call__
742
+ [default0]:[rank8]: self.p2p.send_tensors([self.grad], to_rank=self.to_rank)
743
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 348, in send_tensors
744
+ [default0]:[rank8]: futures = self.isend_tensors(tensors=tensors, to_rank=to_rank, tag=tag)
745
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 295, in isend_tensors
746
+ [default0]:[rank8]: self._send_meta(tensor, to_rank=to_rank, tag=tag)
747
+ [default0]:[rank8]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/p2p.py", line 221, in _send_meta
748
+ [default0]:[rank8]: dist.send(
749
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 75, in wrapper
750
+ [default0]:[rank8]: return func(*args, **kwargs)
751
+ [default0]:[rank8]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1886, in send
752
+ [default0]:[rank8]: group.send([tensor], group_dst_rank, tag).wait()
753
+ [default0]:[rank8]: torch.distributed.DistBackendError: NCCL communicator was aborted on rank 1.
754
+ W0702 14:16:18.050000 140251158693696 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2955028 closing signal SIGTERM
755
+ W0702 14:16:18.050000 140251158693696 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2955029 closing signal SIGTERM
756
+ W0702 14:16:18.053000 140251158693696 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2955030 closing signal SIGTERM
757
+ W0702 14:16:18.054000 140251158693696 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2955031 closing signal SIGTERM
758
+ W0702 14:16:18.054000 140251158693696 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2955032 closing signal SIGTERM
759
+ W0702 14:16:18.056000 140251158693696 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2955033 closing signal SIGTERM
760
+ W0702 14:16:18.064000 140251158693696 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2955034 closing signal SIGTERM
761
+ W0702 14:16:18.071000 140251158693696 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 2955035 closing signal SIGTERM
762
+ W0702 14:16:21.689000 140251158693696 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-56.ec2.internal_2954948_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError.
763
+ W0702 14:16:21.700000 140251158693696 torch/distributed/elastic/rendezvous/dynamic_rendezvous.py:1203] The node 'ip-26-0-171-56.ec2.internal_2954948_0' has failed to shutdown the rendezvous 'none' due to an error of type RendezvousConnectionError.
764
+ Traceback (most recent call last):
765
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store
766
+ return getattr(self._store, store_op)(*args, **kwargs)
767
+ torch.distributed.DistNetworkError: Broken pipe
768
+
769
+ The above exception was the direct cause of the following exception:
770
+
771
+ Traceback (most recent call last):
772
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in <module>
773
+ sys.exit(main())
774
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
775
+ return f(*args, **kwargs)
776
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main
777
+ run(args)
778
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run
779
+ elastic_launch(
780
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
781
+ return launch_agent(self._config, self._entrypoint, list(args))
782
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 254, in launch_agent
783
+ result = agent.run()
784
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper
785
+ result = f(*args, **kwargs)
786
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run
787
+ result = self._invoke_run(role)
788
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 908, in _invoke_run
789
+ num_nodes_waiting = rdzv_handler.num_nodes_waiting()
790
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1174, in num_nodes_waiting
791
+ self._state_holder.sync()
792
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 419, in sync
793
+ get_response = self._backend.get_state()
794
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state
795
+ base64_state: bytes = self._call_store("get", self._key)
796
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store
797
+ raise RendezvousConnectionError(
798
+ torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details.
799
+ srun: error: ip-26-0-171-56: task 1: Exited with exit code 1
800
+ Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details.
llama-1B/16_GPUS/dp-1_tp-4_pp-4_mbz-32/status.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ oom
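
The traceback above shows the run failing with a CUDA out-of-memory error in the row-parallel linear (F.linear in nanotron's tensor-parallel code) on the first node; the ConnectionResetError messages in the DataLoader pin-memory threads, the "NCCL communicator was aborted" errors on ranks 8-11, and the RendezvousConnectionError on the second node are consistent with the other processes shutting down after that failure, and status.txt records "oom" accordingly. The allocator message itself suggests trying PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True. A minimal sketch of applying that on a rerun, together with a smaller micro batch size, is given below; the YAML field name and the value 16 are illustrative assumptions, not taken from this upload.

# Sketch only; not part of the uploaded benchmark scripts.
# 1) Allocator hint quoted from the OOM message above: allow expandable
#    segments to reduce fragmentation before relaunching the same job.
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
# 2) Alternatively, shrink per-GPU activation memory by lowering the micro
#    batch size in the training YAML before resubmitting, e.g.
#    (assumed nanotron field name):
#      tokens:
#        micro_batch_size: 16   # this configuration (mbz-32) used 32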