3outeille HF staff committed on
Commit
7d31b0e
1 Parent(s): 86c9c46

Upload llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16

Browse files
llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16/bench.slurm CHANGED
@@ -31,6 +31,12 @@ update_status() {
31
  done
32
  }
33
 
 
 
 
 
 
 
34
  # Misc initializations.
35
  echo "========================"
36
  echo "START TIME: $(date)"
@@ -75,9 +81,21 @@ job_id=${SLURM_JOB_ID}
75
  update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16/status.txt &
76
 
77
  # Run the main command
78
- srun -u $LAUNCHER $CMD
 
 
 
 
 
79
  exit_status=$?
80
 
 
 
 
 
 
 
 
81
  # Update status based on the exit status of `srun`
82
  if [ $exit_status -eq 0 ]; then
83
  printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16/status.txt
@@ -99,7 +117,6 @@ if [ $exit_status -eq 0 ]; then
99
  python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16 --is_profiler
100
  fi
101
 
102
-
103
  # Push to hub the folder using huggingface_cli
104
  huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16 llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16 --commit-message "Upload llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16"
105
 
 
31
  done
32
  }
33
 
34
+ dump_stack_trace() {
35
+ local pid=$1
36
+ local output_file=$2
37
+ py-spy dump --pid $pid > $output_file
38
+ }
39
+
40
  # Misc initializations.
41
  echo "========================"
42
  echo "START TIME: $(date)"
 
81
  update_status $job_id /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16/status.txt &
82
 
83
  # Run the main command
84
+ srun -u $LAUNCHER $CMD &
85
+
86
+ main_pid=$!
87
+
88
+ # Wait for the main process to finish
89
+ wait $main_pid
90
  exit_status=$?
91
 
92
+ # If the exit status is non-zero, dump the stack trace
93
+ if [ $exit_status -ne 0 ]; then
94
+ dump_file="/fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16/crash_dump_${SLURM_JOB_ID}_${SLURM_PROCID}.txt"
95
+ echo "Job crashed. Dumping stack trace to $dump_file"
96
+ dump_stack_trace $main_pid $dump_file
97
+ fi
98
+
99
  # Update status based on the exit status of `srun`
100
  if [ $exit_status -eq 0 ]; then
101
  printf "completed" > /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16/status.txt
 
117
  python /fsx/ferdinandmom/ferdinand-hf/bench_cluster/main.py report --inp_dir /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16 --is_profiler
118
  fi
119
 
 
120
  # Push to hub the folder using huggingface_cli
121
  huggingface-cli upload nanotron/bench_cluster /fsx/ferdinandmom/ferdinand-hf/bench_cluster/tmp/bench_cluster/llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16 llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16 --commit-message "Upload llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16"
122
 
llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16/crash_dump_7398594_0.txt ADDED
File without changes
llama-1B/64_GPUS/dp-1_tp-1_pp-64_mbz-16/log.out CHANGED
The diff for this file is too large to render. See raw diff