3outeille HF staff committed on
Commit
a921e7d
1 Parent(s): ed0e41b

Upload llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-1024

Browse files
llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-1024/bench.slurm ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ #SBATCH --job-name=bench_cluster
4
+ #SBATCH --time=02:00:00
5
+ #SBATCH --partition=hopper-prod
6
+ #SBATCH --nodes=1
7
+ #SBATCH --gres=gpu:8
8
+ #SBATCH --qos=normal
9
+ #SBATCH --ntasks-per-node=1
10
+ #SBATCH --cpus-per-task=96
11
+ #SBATCH --exclusive
12
+ #SBATCH --output=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-1024/log.out
13
+ #SBATCH --error=/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-1024/log.out
14
+
15
+ # Function to update status based on squeue output
16
+ update_status() {
17
+ job_id=$1
18
+ status_file=$2
19
+ # For unknown reasons, it doenst update status for pending. It only works for running
20
+ while true; do
21
+ job_status=$(squeue --job $job_id --noheader --format=%T)
22
+ echo "Job status: $job_status"
23
+ if [ -z "$job_status" ]; then
24
+ # Job has finished or is not found
25
+ break
26
+ elif [ "$job_status" = "RUNNING" ]; then
27
+ printf "running" > $status_file
28
+ break
29
+ fi
30
+ sleep 10
31
+ done
32
+ }
33
+
34
+ # Misc initializations.
35
+ echo "========================"
36
+ echo "START TIME: $(date)"
37
+ source /fsx/ferdinandmom/miniforge3/etc/profile.d/conda.sh
38
+ conda activate /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster
39
+ echo python3 version = $(python3 --version)
40
+ echo "========================"
41
+
42
+ # Slurm stuff
43
+ export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
44
+ export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
45
+ export MASTER_PORT=$((1024 + RANDOM % 64511))
46
+
47
+ export TMPDIR=/scratch
48
+ export HF_DATASETS_CACHE="/admin/home/ferdinand_mom/.cache"
49
+ export CUBLAS_WORKSPACE_CONFIG=":4096:8"
50
+ export CUDA_DEVICE_MAX_CONNECTIONS="1"
51
+
52
+ huggingface-cli login --token $HUGGINGFACE_TOKEN
53
+
54
+
55
+ NANOTRON_REPO="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron"
56
+ CMD="$NANOTRON_REPO/run_train.py --config-file /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-1024/config.yaml"
57
+
58
+ LAUNCHER="torchrun \
59
+ --nproc_per_node 8 \
60
+ --nnodes 1 \
61
+ --rdzv_endpoint ${MASTER_ADDR}:${MASTER_PORT} \
62
+ --rdzv_backend c10d \
63
+ --max_restarts 0 \
64
+ --tee 3 \
65
+ --node_rank ${SLURM_PROCID}"
66
+
67
+ # Checkout the bench_cluster branch
68
+ cd $NANOTRON_REPO
69
+ git checkout bench_cluster
70
+ cd ..
71
+ # Get the current job ID
72
+ job_id=${SLURM_JOB_ID}
73
+
74
+ # Update status to "pending" or "running" in the background
75
+ update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-1024/status.txt &
76
+
77
+ # Run the main command
78
+ srun -u $LAUNCHER $CMD
79
+ exit_status=$?
80
+
81
+ # Update status based on the exit status of `srun`
82
+ if [ $exit_status -eq 0 ]; then
83
+ printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-1024/status.txt
84
+ else
85
+ if grep -q "OutOfMemoryError" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-1024/log.out; then
86
+ printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-1024/status.txt
87
+ elif grep -q " CUDA error: an illegal memory access" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-1024/log.out; then
88
+ printf "oom" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-1024/status.txt
89
+ elif grep -q "Timeout at NCCL" /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-1024/log.out; then
90
+ printf "timeout" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-1024/status.txt
91
+ else
92
+ printf "fail" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-1024/status.txt
93
+ fi
94
+ fi
95
+
96
+ # Run the report script if the job completed successfully
97
+ if [ $exit_status -eq 0 ]; then
98
+ python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-1024 --is_logs
99
+ python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-1024 --is_profiler
100
+ fi
101
+
102
+
103
+ # Push to hub the folder using huggingface_cli
104
+ huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-1024 llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-1024 --commit-message "Upload llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-1024"
105
+
106
+ # Verify the upload
107
+ if [ $? -eq 0 ]; then
108
+ echo "Uploading to Huggingface Hub successful"
109
+ else
110
+ echo "Failed to upload to Huggingface Hub"
111
+ fi
llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-1024/config.yaml ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ general:
2
+ project: bench_cluster
3
+ seed: 42
4
+ model:
5
+ ddp_bucket_cap_mb: 25
6
+ dtype: bfloat16
7
+ init_method:
8
+ std: 0.025
9
+ make_vocab_size_divisible_by: 1
10
+ model_config:
11
+ bos_token_id: 1
12
+ eos_token_id: 2
13
+ hidden_act: silu
14
+ hidden_size: 2048
15
+ initializer_range: 0.02
16
+ intermediate_size: 4096
17
+ is_llama_config: true
18
+ max_position_embeddings: 4096
19
+ num_attention_heads: 32
20
+ num_hidden_layers: 24
21
+ num_key_value_heads: 32
22
+ pad_token_id: null
23
+ pretraining_tp: 1
24
+ rms_norm_eps: 1.0e-05
25
+ rope_scaling: null
26
+ rope_theta: 10000.0
27
+ tie_word_embeddings: true
28
+ use_cache: true
29
+ vocab_size: 50257
30
+ optimizer:
31
+ accumulate_grad_in_fp32: true
32
+ clip_grad: 1.0
33
+ learning_rate_scheduler:
34
+ learning_rate: 0.0001
35
+ lr_decay_style: linear
36
+ lr_warmup_style: linear
37
+ lr_warmup_steps: 1
38
+ min_decay_lr: 1.0e-05
39
+ optimizer_factory:
40
+ adam_beta1: 0.9
41
+ adam_beta2: 0.95
42
+ adam_eps: 1.0e-08
43
+ name: adamW
44
+ torch_adam_is_fused: true
45
+ weight_decay: 0.01
46
+ zero_stage: 1
47
+ parallelism:
48
+ dp: 1
49
+ expert_parallel_size: 1
50
+ pp: 1
51
+ pp_engine: 1f1b
52
+ tp: 8
53
+ tp_linear_async_communication: false
54
+ tp_mode: REDUCE_SCATTER
55
+ profiler:
56
+ profiler_export_path: /fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-1024
57
+ tokenizer:
58
+ tokenizer_max_length: null
59
+ tokenizer_name_or_path: openai-community/gpt2
60
+ tokenizer_revision: null
61
+ data_stages:
62
+ - name: Training Stage
63
+ start_training_step: 1
64
+ data:
65
+ dataset:
66
+ dataset_overwrite_cache: false
67
+ dataset_processing_num_proc_per_process: 64
68
+ hf_dataset_config_name: null
69
+ hf_dataset_or_datasets: roneneldan/TinyStories
70
+ hf_dataset_splits: train
71
+ text_column_name: text
72
+ num_loading_workers: 0
73
+ seed: 42
74
+ lighteval: null
75
+ tokens:
76
+ train_steps: 20
77
+ val_check_interval: -1
78
+ batch_accumulation_per_replica: 1
79
+ limit_test_batches: 0
80
+ limit_val_batches: 0
81
+ micro_batch_size: 1024
82
+ sequence_length: 4096
83
+ logging:
84
+ iteration_step_info_interval: 1
85
+ log_level: info
86
+ log_level_replica: info
87
+ checkpoints:
88
+ checkpoint_interval: 100000
89
+ checkpoints_path: /dev/null
90
+ resume_checkpoint_path: null
llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-1024/log.out ADDED
@@ -0,0 +1,647 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ========================
2
+ START TIME: Wed Jul 3 21:43:03 UTC 2024
3
+ python3 version = Python 3.10.14
4
+ ========================
5
+ The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
6
+ Token is valid (permission: write).
7
+ Your token has been saved to /admin/home/ferdinand_mom/.cache/huggingface/token
8
+ Login successful
9
+ Already on 'bench_cluster'
10
+ M examples/config_tiny_llama.py
11
+ M examples/config_tiny_llama.yaml
12
+ M examples/train_tiny_llama.sh
13
+ M src/nanotron/models/llama.py
14
+ M src/nanotron/trainer.py
15
+ Your branch is up to date with 'origin/bench_cluster'.
16
+ Job status: RUNNING
17
+ W0703 21:43:07.092000 140123394443072 torch/distributed/run.py:757]
18
+ W0703 21:43:07.092000 140123394443072 torch/distributed/run.py:757] *****************************************
19
+ W0703 21:43:07.092000 140123394443072 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
20
+ W0703 21:43:07.092000 140123394443072 torch/distributed/run.py:757] *****************************************
21
+ [default0]:07/03/2024 21:43:26 [WARNING|DP=0|PP=0|TP=0|ip-26-0-163-220]: [Vocab Size Padding] Padded vocab (size: 50257) with 7 dummy tokens (new size: 50264)
22
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: Config:
23
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: Config(general=GeneralArgs(project='bench_cluster',
24
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: run='%date_%jobid',
25
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: seed=42,
26
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: step=None,
27
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: consumed_train_samples=None,
28
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: benchmark_csv_path=None,
29
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: ignore_sanity_checks=True),
30
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: parallelism=ParallelismArgs(dp=1,
31
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: pp=1,
32
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: tp=8,
33
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: pp_engine=<nanotron.parallel.pipeline_parallel.engine.OneForwardOneBackwardPipelineEngine object at 0x7ff563268940>,
34
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: tp_mode=<TensorParallelLinearMode.REDUCE_SCATTER: 2>,
35
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: tp_linear_async_communication=False,
36
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: expert_parallel_size=1),
37
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: model=ModelArgs(model_config=LlamaConfig(bos_token_id=1,
38
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: eos_token_id=2,
39
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: hidden_act='silu',
40
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: hidden_size=2048,
41
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: initializer_range=0.02,
42
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: intermediate_size=4096,
43
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: is_llama_config=True,
44
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: max_position_embeddings=4096,
45
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: num_attention_heads=32,
46
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: num_hidden_layers=24,
47
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: num_key_value_heads=32,
48
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: pad_token_id=None,
49
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: pretraining_tp=1,
50
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: rms_norm_eps=1e-05,
51
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: rope_scaling=None,
52
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: rope_theta=10000.0,
53
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: tie_word_embeddings=True,
54
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: use_cache=True,
55
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: vocab_size=50264),
56
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: init_method=RandomInit(std=0.025),
57
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: dtype=torch.bfloat16,
58
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: make_vocab_size_divisible_by=1,
59
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: ddp_bucket_cap_mb=25),
60
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: tokenizer=TokenizerArgs(tokenizer_name_or_path='openai-community/gpt2',
61
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: tokenizer_revision=None,
62
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: tokenizer_max_length=None),
63
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: checkpoints=CheckpointsArgs(checkpoints_path=Path('/dev/null'),
64
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: checkpoint_interval=100000,
65
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: save_initial_state=False,
66
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: resume_checkpoint_path=None,
67
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: checkpoints_path_is_shared_file_system=False),
68
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: logging=LoggingArgs(log_level='info',
69
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: log_level_replica='info',
70
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: iteration_step_info_interval=1),
71
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: tokens=TokensArgs(sequence_length=4096,
72
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: train_steps=20,
73
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: micro_batch_size=1024,
74
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: batch_accumulation_per_replica=1,
75
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: val_check_interval=-1,
76
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: limit_val_batches=0,
77
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: limit_test_batches=0),
78
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: optimizer=OptimizerArgs(optimizer_factory=AdamWOptimizerArgs(adam_eps=1e-08,
79
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: adam_beta1=0.9,
80
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: adam_beta2=0.95,
81
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: torch_adam_is_fused=True,
82
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: name='adamW'),
83
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: zero_stage=1,
84
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: weight_decay=0.01,
85
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: clip_grad=1.0,
86
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: accumulate_grad_in_fp32=True,
87
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: learning_rate_scheduler=LRSchedulerArgs(learning_rate=0.0001,
88
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: lr_warmup_steps=1,
89
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: lr_warmup_style='linear',
90
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: lr_decay_style='linear',
91
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: lr_decay_steps=19,
92
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: lr_decay_starting_step=None,
93
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: min_decay_lr=1e-05)),
94
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: data_stages=[DatasetStageArgs(name='Training Stage',
95
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: start_training_step=1,
96
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: data=DataArgs(dataset=PretrainDatasetsArgs(hf_dataset_or_datasets='roneneldan/TinyStories',
97
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: hf_dataset_splits='train',
98
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: hf_dataset_config_name=None,
99
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: dataset_processing_num_proc_per_process=64,
100
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: dataset_overwrite_cache=False,
101
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: text_column_name='text'),
102
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: seed=42,
103
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: num_loading_workers=0))],
104
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: profiler=ProfilerArgs(profiler_export_path=Path('/fsx/ferdinandmom/ferdinand-hf/bench_cluster/results/llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-1024')),
105
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: lighteval=None)
106
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: Model Config:
107
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: LlamaConfig(bos_token_id=1,
108
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: eos_token_id=2,
109
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: hidden_act='silu',
110
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: hidden_size=2048,
111
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: initializer_range=0.02,
112
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: intermediate_size=4096,
113
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: is_llama_config=True,
114
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: max_position_embeddings=4096,
115
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: num_attention_heads=32,
116
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: num_hidden_layers=24,
117
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: num_key_value_heads=32,
118
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: pad_token_id=None,
119
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: pretraining_tp=1,
120
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: rms_norm_eps=1e-05,
121
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: rope_scaling=None,
122
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: rope_theta=10000.0,
123
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: tie_word_embeddings=True,
124
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: use_cache=True,
125
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: vocab_size=50264)
126
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: Building model..
127
+ [default0]:07/03/2024 21:43:26 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: Setting PP block ranks...
128
+ [default4]:07/03/2024 21:43:41 [INFO|DP=0|PP=0|TP=4|ip-26-0-163-220]: Local number of parameters: 139M (264.73MiB)
129
+ [default5]:07/03/2024 21:43:41 [INFO|DP=0|PP=0|TP=5|ip-26-0-163-220]: Local number of parameters: 139M (264.73MiB)
130
+ [default4]:07/03/2024 21:43:41 [INFO|DP=0|PP=0|TP=4|ip-26-0-163-220]: [After model building] Memory usage: 290.76MiB. Peak allocated: 317.33MiB Peak reserved: 324.00MiB
131
+ [default4]:07/03/2024 21:43:41 [INFO|DP=0|PP=0|TP=4|ip-26-0-163-220]: No checkpoint path provided.
132
+ [default5]:07/03/2024 21:43:41 [INFO|DP=0|PP=0|TP=5|ip-26-0-163-220]: [After model building] Memory usage: 290.76MiB. Peak allocated: 317.33MiB Peak reserved: 324.00MiB
133
+ [default5]:07/03/2024 21:43:41 [INFO|DP=0|PP=0|TP=5|ip-26-0-163-220]: No checkpoint path provided.
134
+ [default3]:07/03/2024 21:43:41 [INFO|DP=0|PP=0|TP=3|ip-26-0-163-220]: Local number of parameters: 139M (264.73MiB)
135
+ [default3]:07/03/2024 21:43:41 [INFO|DP=0|PP=0|TP=3|ip-26-0-163-220]: [After model building] Memory usage: 290.76MiB. Peak allocated: 317.33MiB Peak reserved: 324.00MiB
136
+ [default3]:07/03/2024 21:43:41 [INFO|DP=0|PP=0|TP=3|ip-26-0-163-220]: No checkpoint path provided.
137
+ [default2]:07/03/2024 21:43:41 [INFO|DP=0|PP=0|TP=2|ip-26-0-163-220]: Local number of parameters: 139M (264.73MiB)
138
+ [default2]:07/03/2024 21:43:41 [INFO|DP=0|PP=0|TP=2|ip-26-0-163-220]: [After model building] Memory usage: 290.76MiB. Peak allocated: 317.33MiB Peak reserved: 324.00MiB
139
+ [default2]:07/03/2024 21:43:41 [INFO|DP=0|PP=0|TP=2|ip-26-0-163-220]: No checkpoint path provided.
140
+ [default0]:07/03/2024 21:43:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: Total number of parameters: 1.11G (2117.88MiB)
141
+ [default0]:07/03/2024 21:43:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: Local number of parameters: 139M (264.73MiB)
142
+ [default0]:07/03/2024 21:43:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: [After model building] Memory usage: 290.76MiB. Peak allocated: 317.33MiB Peak reserved: 324.00MiB
143
+ [default0]:07/03/2024 21:43:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: No checkpoint path provided.
144
+ [default0]:07/03/2024 21:43:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: Parametrizing model parameters using StandardParametrizator
145
+ [default0]:07/03/2024 21:43:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: [Optimizer Building] Using LearningRateForSP as learning rate
146
+ [default0]:07/03/2024 21:43:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: [ZeRO sharding] Size of optimizer params per rank:
147
+ [default0]:07/03/2024 21:43:41 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: [ZeRO sharding] DP Rank 0 has 139M out of 139M (100.00%) params' optimizer states
148
+ [default6]:07/03/2024 21:43:41 [INFO|DP=0|PP=0|TP=6|ip-26-0-163-220]: Local number of parameters: 139M (264.73MiB)
149
+ [default6]:07/03/2024 21:43:41 [INFO|DP=0|PP=0|TP=6|ip-26-0-163-220]: [After model building] Memory usage: 290.76MiB. Peak allocated: 317.33MiB Peak reserved: 324.00MiB
150
+ [default6]:07/03/2024 21:43:41 [INFO|DP=0|PP=0|TP=6|ip-26-0-163-220]: No checkpoint path provided.
151
+ [default1]:07/03/2024 21:43:41 [INFO|DP=0|PP=0|TP=1|ip-26-0-163-220]: Local number of parameters: 139M (264.73MiB)
152
+ [default1]:07/03/2024 21:43:41 [INFO|DP=0|PP=0|TP=1|ip-26-0-163-220]: [After model building] Memory usage: 290.76MiB. Peak allocated: 317.33MiB Peak reserved: 324.00MiB
153
+ [default1]:07/03/2024 21:43:41 [INFO|DP=0|PP=0|TP=1|ip-26-0-163-220]: No checkpoint path provided.
154
+ [default7]:07/03/2024 21:43:41 [INFO|DP=0|PP=0|TP=7|ip-26-0-163-220]: Local number of parameters: 139M (264.73MiB)
155
+ [default7]:07/03/2024 21:43:41 [INFO|DP=0|PP=0|TP=7|ip-26-0-163-220]: [After model building] Memory usage: 290.76MiB. Peak allocated: 317.33MiB Peak reserved: 324.00MiB
156
+ [default7]:07/03/2024 21:43:41 [INFO|DP=0|PP=0|TP=7|ip-26-0-163-220]: No checkpoint path provided.
157
+ [default0]:07/03/2024 21:43:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: [Training Plan] Stage Training Stage has 19 remaining training steps and has consumed 0 samples
158
+ [default0]:07/03/2024 21:43:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: Using `datasets` library
159
+ [default0]:07/03/2024 21:43:42 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: Loading tokenizer from openai-community/gpt2 and transformers/hf_hub versions ('4.41.2', '0.23.4')
160
+ [default0]:07/03/2024 21:43:43 [WARNING|DP=0|PP=0|TP=0|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty.
161
+ [default0]:Repo card metadata block was not found. Setting CardData to empty.
162
+ [default0]:07/03/2024 21:43:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: [Training Plan] There are 1 training stages
163
+ [default0]:07/03/2024 21:43:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: [Stage Training Stage] start from step 1
164
+ [default0]:07/03/2024 21:43:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]:
165
+ [default0]:07/03/2024 21:43:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: [Start training] datetime: 2024-07-03 21:43:44.038819 | mbs: 1024 | grad_accum: 1 | global_batch_size: 1024 | sequence_length: 4096 | train_steps: 20 | start_iteration_step: 0 | consumed_train_samples: 0
166
+ [default0]:07/03/2024 21:43:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: Resuming training from stage Training Stage, it has trained for 0 samples and has 19 remaining train steps
167
+ [default0]:07/03/2024 21:43:44 [INFO|DP=0|PP=0|TP=0|ip-26-0-163-220]: Memory usage: 1350.75MiB. Peak allocated 1350.76MiB. Peak reserved: 1384.00MiB
168
+ [default4]:07/03/2024 21:43:44 [WARNING|DP=0|PP=0|TP=4|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty.
169
+ [default5]:07/03/2024 21:43:44 [WARNING|DP=0|PP=0|TP=5|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty.
170
+ [default7]:Repo card metadata block was not found. Setting CardData to empty.
171
+ [default3]:07/03/2024 21:43:44 [WARNING|DP=0|PP=0|TP=3|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty.
172
+ [default2]:07/03/2024 21:43:44 [WARNING|DP=0|PP=0|TP=2|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty.
173
+ [default6]:Repo card metadata block was not found. Setting CardData to empty.
174
+ [default6]:07/03/2024 21:43:44 [WARNING|DP=0|PP=0|TP=6|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty.
175
+ [default1]:07/03/2024 21:43:44 [WARNING|DP=0|PP=0|TP=1|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty.
176
+ [default7]:07/03/2024 21:43:44 [WARNING|DP=0|PP=0|TP=7|ip-26-0-163-220]: Repo card metadata block was not found. Setting CardData to empty.
177
+ [default1]:Repo card metadata block was not found. Setting CardData to empty.
178
+ [default2]:Repo card metadata block was not found. Setting CardData to empty.
179
+ [default3]:Repo card metadata block was not found. Setting CardData to empty.
180
+ [default4]:Repo card metadata block was not found. Setting CardData to empty.
181
+ [default5]:Repo card metadata block was not found. Setting CardData to empty.
182
+ [default7]:[rank7]: Traceback (most recent call last):
183
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
184
+ [default7]:[rank7]: trainer.train(dataloader)
185
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
186
+ [default7]:[rank7]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
187
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
188
+ [default7]:[rank7]: outputs = self.pipeline_engine.train_batch_iter(
189
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
190
+ [default7]:[rank7]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
191
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
192
+ [default7]:[rank7]: output = model(**micro_batch)
193
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
194
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
195
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
196
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
197
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
198
+ [default7]:[rank7]: sharded_logits = self.model(
199
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
200
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
201
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
202
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
203
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
204
+ [default7]:[rank7]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
205
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
206
+ [default7]:[rank7]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
207
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
208
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
209
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
210
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
211
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
212
+ [default7]:[rank7]: output = self.pp_block(**new_kwargs)
213
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
214
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
215
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
216
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
217
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
218
+ [default7]:[rank7]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
219
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
220
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
221
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
222
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
223
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 171, in forward
224
+ [default7]:[rank7]: merged_states = self.gate_up_proj(hidden_states)
225
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
226
+ [default7]:[rank7]: return self._call_impl(*args, **kwargs)
227
+ [default7]:[rank7]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
228
+ [default7]:[rank7]: return forward_call(*args, **kwargs)
229
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 87, in forward
230
+ [default7]:[rank7]: return column_linear(
231
+ [default7]:[rank7]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 359, in column_linear
232
+ [default7]:[rank7]: return F.linear(input, weight, bias)
233
+ [default7]:[rank7]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 6.81 GiB is free. Including non-PyTorch memory, this process has 72.50 GiB memory in use. Of the allocated memory 55.52 GiB is allocated by PyTorch, and 5.95 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
234
+ [default0]:[rank0]: Traceback (most recent call last):
235
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
236
+ [default6]:[rank6]: Traceback (most recent call last):
237
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
238
+ [default6]:[rank6]: trainer.train(dataloader)
239
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
240
+ [default6]:[rank6]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
241
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
242
+ [default6]:[rank6]: outputs = self.pipeline_engine.train_batch_iter(
243
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
244
+ [default6]:[rank6]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
245
+ [default0]:[rank0]: trainer.train(dataloader)
246
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
247
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
248
+ [default0]:[rank0]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
249
+ [default6]:[rank6]: output = model(**micro_batch)
250
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
251
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
252
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
253
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
254
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
255
+ [default0]:[rank0]: outputs = self.pipeline_engine.train_batch_iter(
256
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
257
+ [default0]:[rank0]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
258
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
259
+ [default0]:[rank0]: output = model(**micro_batch)
260
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
261
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
262
+ [default6]:[rank6]: sharded_logits = self.model(
263
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
264
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
265
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
266
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
267
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
268
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
269
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
270
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
271
+ [default0]:[rank0]: sharded_logits = self.model(
272
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
273
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
274
+ [default6]:[rank6]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
275
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
276
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
277
+ [default6]:[rank6]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
278
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
279
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
280
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
281
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
282
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
283
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
284
+ [default6]:[rank6]: output = self.pp_block(**new_kwargs)
285
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
286
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
287
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
288
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
289
+ [default0]:[rank0]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
290
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
291
+ [default0]:[rank0]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
292
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
293
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
294
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
295
+ [default6]:[rank6]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
296
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
297
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
298
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
299
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
300
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
301
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
302
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
303
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 171, in forward
304
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
305
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
306
+ [default0]:[rank0]: output = self.pp_block(**new_kwargs)
307
+ [default6]:[rank6]: merged_states = self.gate_up_proj(hidden_states)
308
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
309
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
310
+ [default6]:[rank6]: return self._call_impl(*args, **kwargs)
311
+ [default6]:[rank6]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
312
+ [default6]:[rank6]: return forward_call(*args, **kwargs)
313
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 87, in forward
314
+ [default6]:[rank6]: return column_linear(
315
+ [default6]:[rank6]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 359, in column_linear
316
+ [default6]:[rank6]: return F.linear(input, weight, bias)
317
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
318
+ [default6]:[rank6]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 6.11 GiB is free. Including non-PyTorch memory, this process has 73.21 GiB memory in use. Of the allocated memory 55.52 GiB is allocated by PyTorch, and 5.95 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
319
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
320
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
321
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
322
+ [default0]:[rank0]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
323
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
324
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
325
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
326
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
327
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 171, in forward
328
+ [default0]:[rank0]: merged_states = self.gate_up_proj(hidden_states)
329
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
330
+ [default0]:[rank0]: return self._call_impl(*args, **kwargs)
331
+ [default0]:[rank0]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
332
+ [default0]:[rank0]: return forward_call(*args, **kwargs)
333
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 87, in forward
334
+ [default0]:[rank0]: return column_linear(
335
+ [default0]:[rank0]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 359, in column_linear
336
+ [default0]:[rank0]: return F.linear(input, weight, bias)
337
+ [default0]:[rank0]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU
338
+ [default4]:[rank4]: Traceback (most recent call last):
339
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
340
+ [default4]:[rank4]: trainer.train(dataloader)
341
+ [default3]:[rank3]: Traceback (most recent call last):
342
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
343
+ [default3]:[rank3]: trainer.train(dataloader)
344
+ [default2]:[rank2]: Traceback (most recent call last):
345
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
346
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
347
+ [default1]:[rank1]: Traceback (most recent call last):
348
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
349
+ [default2]:[rank2]: trainer.train(dataloader)
350
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
351
+ [default4]:[rank4]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
352
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
353
+ [default1]:[rank1]: trainer.train(dataloader)
354
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
355
+ [default2]:[rank2]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
356
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
357
+ [default1]:[rank1]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
358
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
359
+ [default2]:[rank2]: outputs = self.pipeline_engine.train_batch_iter(
360
+ [default4]:[rank4]: outputs = self.pipeline_engine.train_batch_iter(
361
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
362
+ [default3]:[rank3]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
363
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
364
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
365
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
366
+ [default1]:[rank1]: outputs = self.pipeline_engine.train_batch_iter(
367
+ [default4]:[rank4]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
368
+ [default2]:[rank2]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
369
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
370
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
371
+ [default4]:[rank4]: output = model(**micro_batch)
372
+ [default2]:[rank2]: output = model(**micro_batch)
373
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
374
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
375
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
376
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
377
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
378
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
379
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
380
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
381
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
382
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
383
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
384
+ [default3]:[rank3]: outputs = self.pipeline_engine.train_batch_iter(
385
+ [default4]:[rank4]: sharded_logits = self.model(
386
+ [default1]:[rank1]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
387
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
388
+ [default2]:[rank2]: sharded_logits = self.model(
389
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
390
+ [default3]:[rank3]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
391
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
392
+ [default1]:[rank1]: output = model(**micro_batch)
393
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
394
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
395
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
396
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
397
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
398
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
399
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
400
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
401
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
402
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
403
+ [default3]:[rank3]: output = model(**micro_batch)
404
+ [default1]:[rank1]: sharded_logits = self.model(
405
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
406
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
407
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
408
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
409
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
410
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
411
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
412
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
413
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
414
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
415
+ [default4]:[rank4]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
416
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
417
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
418
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
419
+ [default4]:[rank4]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
420
+ [default2]:[rank2]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
421
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
422
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
423
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
424
+ [default2]:[rank2]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
425
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
426
+ [default3]:[rank3]: sharded_logits = self.model(
427
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
428
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
429
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
430
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
431
+ [default1]:[rank1]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
432
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
433
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
434
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
435
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
436
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
437
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
438
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
439
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
440
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
441
+ [default1]:[rank1]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
442
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
443
+ [default4]:[rank4]: output = self.pp_block(**new_kwargs)
444
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
445
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
446
+ [default2]:[rank2]: output = self.pp_block(**new_kwargs)
447
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
448
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
449
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
450
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
451
+ [default5]:[rank5]: Traceback (most recent call last):
452
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
453
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
454
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
455
+ [default3]:[rank3]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
456
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
457
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py", line 237, in <module>
458
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
459
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
460
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
461
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
462
+ [default5]:[rank5]: trainer.train(dataloader)
463
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 429, in train
464
+ [default2]:[rank2]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
465
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
466
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
467
+ [default5]:[rank5]: outputs, loss_avg = self.training_step(dataloader=self.current_dataloader)
468
+ [default3]:[rank3]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
469
+ [default1]:[rank1]: output = self.pp_block(**new_kwargs)
470
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
471
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
472
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
473
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
474
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/trainer.py", line 462, in training_step
475
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
476
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
477
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
478
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
479
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
480
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
481
+ [default5]:[rank5]: outputs = self.pipeline_engine.train_batch_iter(
482
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 278, in train_batch_iter
483
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
484
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
485
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
486
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
487
+ [default5]:[rank5]: output = self.forward(context=context, state=state, micro_batch=micro_batch, model=model)
488
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 171, in forward
489
+ [default2]:[rank2]: merged_states = self.gate_up_proj(hidden_states)
490
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
491
+ [default3]:[rank3]: output = self.pp_block(**new_kwargs)
492
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
493
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/engine.py", line 44, in forward
494
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
495
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
496
+ [default4]:[rank4]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
497
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
498
+ [default5]:[rank5]: output = model(**micro_batch)
499
+ [default2]:[rank2]: return self._call_impl(*args, **kwargs)
500
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
501
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
502
+ [default2]:[rank2]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
503
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
504
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
505
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
506
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
507
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
508
+ [default2]:[rank2]: return forward_call(*args, **kwargs)
509
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
510
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
511
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
512
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
513
+ [default1]:[rank1]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
514
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 87, in forward
515
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 171, in forward
516
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
517
+ [default2]:[rank2]: return column_linear(
518
+ [default4]:[rank4]: merged_states = self.gate_up_proj(hidden_states)
519
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 891, in forward
520
+ [default5]:[rank5]: sharded_logits = self.model(
521
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
522
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
523
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
524
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
525
+ [default3]:[rank3]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
526
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
527
+ [default2]:[rank2]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 359, in column_linear
528
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
529
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
530
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
531
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
532
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
533
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 171, in forward
534
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
535
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 764, in forward
536
+ [default4]:[rank4]: return self._call_impl(*args, **kwargs)
537
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
538
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 171, in forward
539
+ [default3]:[rank3]: merged_states = self.gate_up_proj(hidden_states)
540
+ [default4]:[rank4]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
541
+ [default4]:[rank4]: return forward_call(*args, **kwargs)
542
+ [default2]:[rank2]: return F.linear(input, weight, bias)
543
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 87, in forward
544
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
545
+ [default5]:[rank5]: return self.forward_with_hidden_states(input_ids=input_ids, input_mask=input_mask)[0]
546
+ [default4]:[rank4]: return column_linear(
547
+ [default4]:[rank4]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 359, in column_linear
548
+ [default1]:[rank1]: merged_states = self.gate_up_proj(hidden_states)
549
+ [default3]:[rank3]: return self._call_impl(*args, **kwargs)
550
+ [default4]:[rank4]: return F.linear(input, weight, bias)
551
+ [default2]:[rank2]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 6.11 GiB is free. Including non-PyTorch memory, this process has 73.21 GiB memory in use. Of the allocated memory 55.52 GiB is allocated by PyTorch, and 5.95 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
552
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 780, in forward_with_hidden_states
553
+ [default5]:[rank5]: hidden_encoder_states = encoder_block(**hidden_encoder_states)
554
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
555
+ [default1]:[rank1]: return self._call_impl(*args, **kwargs)
556
+ [default1]:[rank1]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
557
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
558
+ [default3]:[rank3]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
559
+ [default1]:[rank1]: return forward_call(*args, **kwargs)
560
+ [default4]:[rank4]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 6.11 GiB is free. Including non-PyTorch memory, this process has 73.21 GiB memory in use. Of the allocated memory 55.52 GiB is allocated by PyTorch, and 5.95 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
561
+ [default3]:[rank3]: return forward_call(*args, **kwargs)
562
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
563
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
564
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 87, in forward
565
+ [default3]:[rank3]: return column_linear(
566
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 87, in forward
567
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
568
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/pipeline_parallel/block.py", line 151, in forward
569
+ [default3]:[rank3]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 359, in column_linear
570
+ [default1]:[rank1]: return column_linear(
571
+ [default1]:[rank1]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 359, in column_linear
572
+ [default5]:[rank5]: output = self.pp_block(**new_kwargs)
573
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
574
+ [default3]:[rank3]: return F.linear(input, weight, bias)
575
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
576
+ [default1]:[rank1]: return F.linear(input, weight, bias)
577
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
578
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
579
+ [default3]:[rank3]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 6.11 GiB is free. Including non-PyTorch memory, this process has 73.21 GiB memory in use. Of the allocated memory 55.52 GiB is allocated by PyTorch, and 5.95 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
580
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 637, in forward
581
+ [default1]:[rank1]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 6.11 GiB is free. Including non-PyTorch memory, this process has 73.21 GiB memory in use. Of the allocated memory 55.52 GiB is allocated by PyTorch, and 5.95 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
582
+ [default5]:[rank5]: hidden_states = self.mlp(hidden_states=hidden_states)["hidden_states"]
583
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
584
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
585
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
586
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
587
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/models/llama.py", line 171, in forward
588
+ [default5]:[rank5]: merged_states = self.gate_up_proj(hidden_states)
589
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
590
+ [default5]:[rank5]: return self._call_impl(*args, **kwargs)
591
+ [default5]:[rank5]: File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
592
+ [default5]:[rank5]: return forward_call(*args, **kwargs)
593
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 87, in forward
594
+ [default5]:[rank5]: return column_linear(
595
+ [default5]:[rank5]: File "/fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/src/nanotron/parallel/tensor_parallel/functional.py", line 359, in column_linear
596
+ [default5]:[rank5]: return F.linear(input, weight, bias)
597
+ [default5]:[rank5]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU  has a total capacity of 79.33 GiB of which 6.11 GiB is free. Including non-PyTorch memory, this process has 73.21 GiB memory in use. Of the allocated memory 55.52 GiB is allocated by PyTorch, and 5.95 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
598
+ W0703 21:43:57.232000 140123394443072 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 393829 closing signal SIGTERM
599
+ W0703 21:43:57.232000 140123394443072 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 393830 closing signal SIGTERM
600
+ W0703 21:43:57.233000 140123394443072 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 393831 closing signal SIGTERM
601
+ W0703 21:43:57.233000 140123394443072 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 393832 closing signal SIGTERM
602
+ W0703 21:43:57.233000 140123394443072 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 393833 closing signal SIGTERM
603
+ E0703 21:43:58.351000 140123394443072 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 393826) of binary: /fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/python3.10
604
+ Traceback (most recent call last):
605
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/bin/torchrun", line 8, in <module>
606
+ sys.exit(main())
607
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
608
+ return f(*args, **kwargs)
609
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main
610
+ run(args)
611
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run
612
+ elastic_launch(
613
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
614
+ return launch_agent(self._config, self._entrypoint, list(args))
615
+ File "/fsx/ferdinandmom/miniforge3/envs/env-bench-cluster/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent
616
+ raise ChildFailedError(
617
+ torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
618
+ ============================================================
619
+ /fsx/ferdinandmom/ferdinand-hf/bench_cluster/nanotron/run_train.py FAILED
620
+ ------------------------------------------------------------
621
+ Failures:
622
+ [1]:
623
+ time : 2024-07-03_21:43:57
624
+ host : ip-26-0-163-220.ec2.internal
625
+ rank : 1 (local_rank: 1)
626
+ exitcode : 1 (pid: 393827)
627
+ error_file: <N/A>
628
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
629
+ [2]:
630
+ time : 2024-07-03_21:43:57
631
+ host : ip-26-0-163-220.ec2.internal
632
+ rank : 2 (local_rank: 2)
633
+ exitcode : 1 (pid: 393828)
634
+ error_file: <N/A>
635
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
636
+ ------------------------------------------------------------
637
+ Root Cause (first observed failure):
638
+ [0]:
639
+ time : 2024-07-03_21:43:57
640
+ host : ip-26-0-163-220.ec2.internal
641
+ rank : 0 (local_rank: 0)
642
+ exitcode : 1 (pid: 393826)
643
+ error_file: <N/A>
644
+ traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
645
+ ============================================================
646
+ srun: error: ip-26-0-163-220: task 0: Exited with exit code 1
647
+ Consider using `hf_transfer` for faster uploads. This solution comes with some limitations. See https://huggingface.co/docs/huggingface_hub/hf_transfer for more details.
llama-1B/8_GPUS/dp-1_tp-8_pp-1_mbz-1024/status.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ oom