ferdinand.mom commited on
Commit
b79f60a
1 Parent(s): 10ca648

add metrics + profiler.csv for 16 GPUS

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. llama-1B/16_GPUS/16_GPUS_summary_results.csv +119 -0
  2. llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-1/log_metrics.csv +21 -0
  3. llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-1/profiler.csv +2 -0
  4. llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-2/log_metrics.csv +21 -0
  5. llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-2/profiler.csv +2 -0
  6. llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-4/log_metrics.csv +21 -0
  7. llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-4/profiler.csv +2 -0
  8. llama-1B/16_GPUS/dp-1_tp-1_pp-16_mbz-1/log_metrics.csv +21 -0
  9. llama-1B/16_GPUS/dp-1_tp-1_pp-16_mbz-1/profiler.csv +1 -1
  10. llama-1B/16_GPUS/dp-1_tp-1_pp-16_mbz-2/log_metrics.csv +21 -0
  11. llama-1B/16_GPUS/dp-1_tp-1_pp-16_mbz-2/profiler.csv +2 -0
  12. llama-1B/16_GPUS/dp-1_tp-1_pp-16_mbz-4/log_metrics.csv +21 -0
  13. llama-1B/16_GPUS/dp-1_tp-1_pp-16_mbz-4/profiler.csv +2 -0
  14. llama-1B/16_GPUS/dp-1_tp-2_pp-8_mbz-2/profiler.csv +1 -1
  15. llama-1B/16_GPUS/dp-1_tp-2_pp-8_mbz-4/log_metrics.csv +21 -0
  16. llama-1B/16_GPUS/dp-1_tp-2_pp-8_mbz-4/profiler.csv +2 -0
  17. llama-1B/16_GPUS/dp-1_tp-2_pp-8_mbz-8/profiler.csv +1 -1
  18. llama-1B/16_GPUS/dp-1_tp-4_pp-4_mbz-16/log_metrics.csv +21 -0
  19. llama-1B/16_GPUS/dp-1_tp-4_pp-4_mbz-16/profiler.csv +2 -0
  20. llama-1B/16_GPUS/dp-1_tp-4_pp-4_mbz-4/profiler.csv +1 -1
  21. llama-1B/16_GPUS/dp-1_tp-4_pp-4_mbz-8/log_metrics.csv +21 -0
  22. llama-1B/16_GPUS/dp-1_tp-4_pp-4_mbz-8/profiler.csv +2 -0
  23. llama-1B/16_GPUS/dp-1_tp-8_pp-2_mbz-16/log_metrics.csv +21 -0
  24. llama-1B/16_GPUS/dp-1_tp-8_pp-2_mbz-16/profiler.csv +2 -0
  25. llama-1B/16_GPUS/dp-1_tp-8_pp-2_mbz-32/log_metrics.csv +21 -0
  26. llama-1B/16_GPUS/dp-1_tp-8_pp-2_mbz-32/profiler.csv +2 -0
  27. llama-1B/16_GPUS/dp-1_tp-8_pp-2_mbz-8/profiler.csv +1 -1
  28. llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-2/log_metrics.csv +21 -0
  29. llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-2/profiler.csv +2 -0
  30. llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-4/profiler.csv +1 -1
  31. llama-1B/16_GPUS/dp-2_tp-2_pp-4_mbz-2/log_metrics.csv +21 -0
  32. llama-1B/16_GPUS/dp-2_tp-2_pp-4_mbz-2/profiler.csv +2 -0
  33. llama-1B/16_GPUS/dp-2_tp-2_pp-4_mbz-4/log_metrics.csv +21 -0
  34. llama-1B/16_GPUS/dp-2_tp-2_pp-4_mbz-4/profiler.csv +2 -0
  35. llama-1B/16_GPUS/dp-2_tp-2_pp-4_mbz-8/log_metrics.csv +21 -0
  36. llama-1B/16_GPUS/dp-2_tp-2_pp-4_mbz-8/profiler.csv +2 -0
  37. llama-1B/16_GPUS/dp-2_tp-4_pp-2_mbz-16/log_metrics.csv +21 -0
  38. llama-1B/16_GPUS/dp-2_tp-4_pp-2_mbz-16/profiler.csv +2 -0
  39. llama-1B/16_GPUS/dp-2_tp-4_pp-2_mbz-4/log_metrics.csv +21 -0
  40. llama-1B/16_GPUS/dp-2_tp-4_pp-2_mbz-4/profiler.csv +2 -0
  41. llama-1B/16_GPUS/dp-2_tp-4_pp-2_mbz-8/log_metrics.csv +21 -0
  42. llama-1B/16_GPUS/dp-2_tp-4_pp-2_mbz-8/profiler.csv +2 -0
  43. llama-1B/16_GPUS/dp-2_tp-8_pp-1_mbz-16/log_metrics.csv +21 -0
  44. llama-1B/16_GPUS/dp-2_tp-8_pp-1_mbz-16/profiler.csv +2 -0
  45. llama-1B/16_GPUS/dp-2_tp-8_pp-1_mbz-32/log_metrics.csv +21 -0
  46. llama-1B/16_GPUS/dp-2_tp-8_pp-1_mbz-32/profiler.csv +2 -0
  47. llama-1B/16_GPUS/dp-4_tp-1_pp-4_mbz-1/log_metrics.csv +21 -0
  48. llama-1B/16_GPUS/dp-4_tp-1_pp-4_mbz-1/profiler.csv +2 -0
  49. llama-1B/16_GPUS/dp-4_tp-1_pp-4_mbz-2/profiler.csv +1 -1
  50. llama-1B/16_GPUS/dp-4_tp-1_pp-4_mbz-4/log_metrics.csv +21 -0
llama-1B/16_GPUS/16_GPUS_summary_results.csv ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model,run_name,status,nnodes,dp,tp,pp,batch_accumulation_per_replica,micro_batch_size,tok/s/gpu,mfu,forward,backward
2
+ llama-1B,dp-1_tp-8_pp-2_mbz-1,,2,1,8,2,1024,1,-1,-1,,
3
+ llama-1B,dp-1_tp-8_pp-2_mbz-2,,2,1,8,2,512,2,-1,-1,,
4
+ llama-1B,dp-1_tp-8_pp-2_mbz-4,,2,1,8,2,256,4,-1,-1,,
5
+ llama-1B,dp-1_tp-8_pp-2_mbz-8,,2,1,8,2,128,8,-1,-1,,
6
+ llama-1B,dp-1_tp-8_pp-2_mbz-16,,2,1,8,2,64,16,-1,-1,,
7
+ llama-1B,dp-1_tp-8_pp-2_mbz-32,,2,1,8,2,32,32,-1,-1,,
8
+ llama-1B,dp-1_tp-8_pp-2_mbz-64,,2,1,8,2,16,64,-1,-1,,
9
+ llama-1B,dp-1_tp-8_pp-2_mbz-128,,2,1,8,2,8,128,-1,-1,,
10
+ llama-1B,dp-1_tp-8_pp-2_mbz-256,,2,1,8,2,4,256,-1,-1,,
11
+ llama-1B,dp-1_tp-8_pp-2_mbz-512,,2,1,8,2,2,512,-1,-1,,
12
+ llama-1B,dp-1_tp-8_pp-2_mbz-1024,,2,1,8,2,1,1024,-1,-1,,
13
+ llama-1B,dp-1_tp-1_pp-16_mbz-1,,2,1,1,16,1024,1,-1,-1,,
14
+ llama-1B,dp-1_tp-1_pp-16_mbz-2,,2,1,1,16,512,2,-1,-1,,
15
+ llama-1B,dp-1_tp-1_pp-16_mbz-4,,2,1,1,16,256,4,-1,-1,,
16
+ llama-1B,dp-1_tp-1_pp-16_mbz-8,,2,1,1,16,128,8,-1,-1,,
17
+ llama-1B,dp-1_tp-1_pp-16_mbz-16,,2,1,1,16,64,16,-1,-1,,
18
+ llama-1B,dp-1_tp-1_pp-16_mbz-32,,2,1,1,16,32,32,-1,-1,,
19
+ llama-1B,dp-1_tp-1_pp-16_mbz-64,,2,1,1,16,16,64,-1,-1,,
20
+ llama-1B,dp-16_tp-1_pp-1_mbz-1,,2,16,1,1,64,1,-1,-1,,
21
+ llama-1B,dp-16_tp-1_pp-1_mbz-2,,2,16,1,1,32,2,-1,-1,,
22
+ llama-1B,dp-16_tp-1_pp-1_mbz-4,,2,16,1,1,16,4,-1,-1,,
23
+ llama-1B,dp-16_tp-1_pp-1_mbz-8,,2,16,1,1,8,8,-1,-1,,
24
+ llama-1B,dp-16_tp-1_pp-1_mbz-16,,2,16,1,1,4,16,-1,-1,,
25
+ llama-1B,dp-16_tp-1_pp-1_mbz-32,,2,16,1,1,2,32,-1,-1,,
26
+ llama-1B,dp-16_tp-1_pp-1_mbz-64,,2,16,1,1,1,64,-1,-1,,
27
+ llama-1B,dp-1_tp-4_pp-4_mbz-1,,2,1,4,4,1024,1,-1,-1,,
28
+ llama-1B,dp-1_tp-4_pp-4_mbz-2,,2,1,4,4,512,2,-1,-1,,
29
+ llama-1B,dp-1_tp-4_pp-4_mbz-4,,2,1,4,4,256,4,-1,-1,,
30
+ llama-1B,dp-1_tp-4_pp-4_mbz-8,,2,1,4,4,128,8,-1,-1,,
31
+ llama-1B,dp-1_tp-4_pp-4_mbz-16,,2,1,4,4,64,16,-1,-1,,
32
+ llama-1B,dp-1_tp-4_pp-4_mbz-32,,2,1,4,4,32,32,-1,-1,,
33
+ llama-1B,dp-1_tp-4_pp-4_mbz-64,,2,1,4,4,16,64,-1,-1,,
34
+ llama-1B,dp-1_tp-4_pp-4_mbz-128,,2,1,4,4,8,128,-1,-1,,
35
+ llama-1B,dp-1_tp-4_pp-4_mbz-256,,2,1,4,4,4,256,-1,-1,,
36
+ llama-1B,dp-8_tp-2_pp-1_mbz-1,,2,8,2,1,128,1,-1,-1,,
37
+ llama-1B,dp-8_tp-2_pp-1_mbz-2,,2,8,2,1,64,2,-1,-1,,
38
+ llama-1B,dp-8_tp-2_pp-1_mbz-4,,2,8,2,1,32,4,-1,-1,,
39
+ llama-1B,dp-8_tp-2_pp-1_mbz-8,,2,8,2,1,16,8,-1,-1,,
40
+ llama-1B,dp-8_tp-2_pp-1_mbz-16,,2,8,2,1,8,16,-1,-1,,
41
+ llama-1B,dp-8_tp-2_pp-1_mbz-32,,2,8,2,1,4,32,-1,-1,,
42
+ llama-1B,dp-8_tp-2_pp-1_mbz-64,,2,8,2,1,2,64,-1,-1,,
43
+ llama-1B,dp-8_tp-2_pp-1_mbz-128,,2,8,2,1,1,128,-1,-1,,
44
+ llama-1B,dp-4_tp-1_pp-4_mbz-1,,2,4,1,4,256,1,-1,-1,,
45
+ llama-1B,dp-4_tp-1_pp-4_mbz-2,,2,4,1,4,128,2,-1,-1,,
46
+ llama-1B,dp-4_tp-1_pp-4_mbz-4,,2,4,1,4,64,4,-1,-1,,
47
+ llama-1B,dp-4_tp-1_pp-4_mbz-8,,2,4,1,4,32,8,-1,-1,,
48
+ llama-1B,dp-4_tp-1_pp-4_mbz-16,,2,4,1,4,16,16,-1,-1,,
49
+ llama-1B,dp-4_tp-1_pp-4_mbz-32,,2,4,1,4,8,32,-1,-1,,
50
+ llama-1B,dp-4_tp-1_pp-4_mbz-64,,2,4,1,4,4,64,-1,-1,,
51
+ llama-1B,dp-8_tp-1_pp-2_mbz-1,,2,8,1,2,128,1,-1,-1,,
52
+ llama-1B,dp-8_tp-1_pp-2_mbz-2,,2,8,1,2,64,2,-1,-1,,
53
+ llama-1B,dp-8_tp-1_pp-2_mbz-4,,2,8,1,2,32,4,-1,-1,,
54
+ llama-1B,dp-8_tp-1_pp-2_mbz-8,,2,8,1,2,16,8,-1,-1,,
55
+ llama-1B,dp-8_tp-1_pp-2_mbz-16,,2,8,1,2,8,16,-1,-1,,
56
+ llama-1B,dp-8_tp-1_pp-2_mbz-32,,2,8,1,2,4,32,-1,-1,,
57
+ llama-1B,dp-8_tp-1_pp-2_mbz-64,,2,8,1,2,2,64,-1,-1,,
58
+ llama-1B,dp-8_tp-1_pp-2_mbz-128,,2,8,1,2,1,128,-1,-1,,
59
+ llama-1B,dp-4_tp-4_pp-1_mbz-1,,2,4,4,1,256,1,-1,-1,,
60
+ llama-1B,dp-4_tp-4_pp-1_mbz-2,,2,4,4,1,128,2,-1,-1,,
61
+ llama-1B,dp-4_tp-4_pp-1_mbz-4,,2,4,4,1,64,4,-1,-1,,
62
+ llama-1B,dp-4_tp-4_pp-1_mbz-8,,2,4,4,1,32,8,-1,-1,,
63
+ llama-1B,dp-4_tp-4_pp-1_mbz-16,,2,4,4,1,16,16,-1,-1,,
64
+ llama-1B,dp-4_tp-4_pp-1_mbz-32,,2,4,4,1,8,32,-1,-1,,
65
+ llama-1B,dp-4_tp-4_pp-1_mbz-64,,2,4,4,1,4,64,-1,-1,,
66
+ llama-1B,dp-4_tp-4_pp-1_mbz-128,,2,4,4,1,2,128,-1,-1,,
67
+ llama-1B,dp-4_tp-4_pp-1_mbz-256,,2,4,4,1,1,256,-1,-1,,
68
+ llama-1B,dp-2_tp-2_pp-4_mbz-1,,2,2,2,4,512,1,-1,-1,,
69
+ llama-1B,dp-2_tp-2_pp-4_mbz-2,,2,2,2,4,256,2,-1,-1,,
70
+ llama-1B,dp-2_tp-2_pp-4_mbz-4,,2,2,2,4,128,4,-1,-1,,
71
+ llama-1B,dp-2_tp-2_pp-4_mbz-8,,2,2,2,4,64,8,-1,-1,,
72
+ llama-1B,dp-2_tp-2_pp-4_mbz-16,,2,2,2,4,32,16,-1,-1,,
73
+ llama-1B,dp-2_tp-2_pp-4_mbz-32,,2,2,2,4,16,32,-1,-1,,
74
+ llama-1B,dp-2_tp-2_pp-4_mbz-64,,2,2,2,4,8,64,-1,-1,,
75
+ llama-1B,dp-2_tp-2_pp-4_mbz-128,,2,2,2,4,4,128,-1,-1,,
76
+ llama-1B,dp-2_tp-8_pp-1_mbz-1,,2,2,8,1,512,1,-1,-1,,
77
+ llama-1B,dp-2_tp-8_pp-1_mbz-2,,2,2,8,1,256,2,-1,-1,,
78
+ llama-1B,dp-2_tp-8_pp-1_mbz-4,,2,2,8,1,128,4,-1,-1,,
79
+ llama-1B,dp-2_tp-8_pp-1_mbz-8,,2,2,8,1,64,8,-1,-1,,
80
+ llama-1B,dp-2_tp-8_pp-1_mbz-16,,2,2,8,1,32,16,-1,-1,,
81
+ llama-1B,dp-2_tp-8_pp-1_mbz-32,,2,2,8,1,16,32,-1,-1,,
82
+ llama-1B,dp-2_tp-8_pp-1_mbz-64,,2,2,8,1,8,64,-1,-1,,
83
+ llama-1B,dp-2_tp-8_pp-1_mbz-128,,2,2,8,1,4,128,-1,-1,,
84
+ llama-1B,dp-2_tp-8_pp-1_mbz-256,,2,2,8,1,2,256,-1,-1,,
85
+ llama-1B,dp-2_tp-8_pp-1_mbz-512,,2,2,8,1,1,512,-1,-1,,
86
+ llama-1B,dp-1_tp-2_pp-8_mbz-1,,2,1,2,8,1024,1,-1,-1,,
87
+ llama-1B,dp-1_tp-2_pp-8_mbz-2,,2,1,2,8,512,2,-1,-1,,
88
+ llama-1B,dp-1_tp-2_pp-8_mbz-4,,2,1,2,8,256,4,-1,-1,,
89
+ llama-1B,dp-1_tp-2_pp-8_mbz-8,,2,1,2,8,128,8,-1,-1,,
90
+ llama-1B,dp-1_tp-2_pp-8_mbz-16,,2,1,2,8,64,16,-1,-1,,
91
+ llama-1B,dp-1_tp-2_pp-8_mbz-32,,2,1,2,8,32,32,-1,-1,,
92
+ llama-1B,dp-1_tp-2_pp-8_mbz-64,,2,1,2,8,16,64,-1,-1,,
93
+ llama-1B,dp-1_tp-2_pp-8_mbz-128,,2,1,2,8,8,128,-1,-1,,
94
+ llama-1B,dp-2_tp-1_pp-8_mbz-1,,2,2,1,8,512,1,-1,-1,,
95
+ llama-1B,dp-2_tp-1_pp-8_mbz-2,,2,2,1,8,256,2,-1,-1,,
96
+ llama-1B,dp-2_tp-1_pp-8_mbz-4,,2,2,1,8,128,4,-1,-1,,
97
+ llama-1B,dp-2_tp-1_pp-8_mbz-8,,2,2,1,8,64,8,-1,-1,,
98
+ llama-1B,dp-2_tp-1_pp-8_mbz-16,,2,2,1,8,32,16,-1,-1,,
99
+ llama-1B,dp-2_tp-1_pp-8_mbz-32,,2,2,1,8,16,32,-1,-1,,
100
+ llama-1B,dp-2_tp-1_pp-8_mbz-64,,2,2,1,8,8,64,-1,-1,,
101
+ llama-1B,dp-2_tp-4_pp-2_mbz-1,,2,2,4,2,512,1,-1,-1,,
102
+ llama-1B,dp-2_tp-4_pp-2_mbz-2,,2,2,4,2,256,2,-1,-1,,
103
+ llama-1B,dp-2_tp-4_pp-2_mbz-4,,2,2,4,2,128,4,-1,-1,,
104
+ llama-1B,dp-2_tp-4_pp-2_mbz-8,,2,2,4,2,64,8,-1,-1,,
105
+ llama-1B,dp-2_tp-4_pp-2_mbz-16,,2,2,4,2,32,16,-1,-1,,
106
+ llama-1B,dp-2_tp-4_pp-2_mbz-32,,2,2,4,2,16,32,-1,-1,,
107
+ llama-1B,dp-2_tp-4_pp-2_mbz-64,,2,2,4,2,8,64,-1,-1,,
108
+ llama-1B,dp-2_tp-4_pp-2_mbz-128,,2,2,4,2,4,128,-1,-1,,
109
+ llama-1B,dp-2_tp-4_pp-2_mbz-256,,2,2,4,2,2,256,-1,-1,,
110
+ llama-1B,dp-2_tp-4_pp-2_mbz-512,,2,2,4,2,1,512,-1,-1,,
111
+ llama-1B,dp-4_tp-2_pp-2_mbz-1,,2,4,2,2,256,1,-1,-1,,
112
+ llama-1B,dp-4_tp-2_pp-2_mbz-2,,2,4,2,2,128,2,-1,-1,,
113
+ llama-1B,dp-4_tp-2_pp-2_mbz-4,,2,4,2,2,64,4,-1,-1,,
114
+ llama-1B,dp-4_tp-2_pp-2_mbz-8,,2,4,2,2,32,8,-1,-1,,
115
+ llama-1B,dp-4_tp-2_pp-2_mbz-16,,2,4,2,2,16,16,-1,-1,,
116
+ llama-1B,dp-4_tp-2_pp-2_mbz-32,,2,4,2,2,8,32,-1,-1,,
117
+ llama-1B,dp-4_tp-2_pp-2_mbz-64,,2,4,2,2,4,64,-1,-1,,
118
+ llama-1B,dp-4_tp-2_pp-2_mbz-128,,2,4,2,2,2,128,-1,-1,,
119
+ llama-1B,dp-4_tp-2_pp-2_mbz-256,,2,4,2,2,1,256,-1,-1,,
llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-1/log_metrics.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ iteration,consumed_tokens,elapsed_time_per_iteration_ms,tokens_per_sec,tokens_per_sec_per_gpu,global_batch_size,lm_loss,lr,model_tflops_per_gpu,hardware_tflops_per_gpu,grad_norm,memory_usage_MiB,peak_allocated_MiB,peak_reserved_MiB
2
+ 1,4190000.0000000005,21800.0,192000.0,12000.0,1020.0,11.3,0.0001,109.0,109.0,33.1,7256.42,15785.37,18632.0
3
+ 2,8390000.0,10800.0,387000.0,24200.0,1020.0,11.3,9.53e-05,220.0,220.0,33.3,7256.42,15785.37,18632.0
4
+ 3,12600000.0,11000.0,382000.0,23900.0,1020.0,16.0,9.05e-05,216.0,216.0,249.0,7256.42,15785.37,18632.0
5
+ 4,16800000.0,13300.0,316000.0,19700.0,1020.0,15.1,8.58e-05,179.0,179.0,41.6,7256.39,11621.72,18632.0
6
+ 5,21000000.0,13300.0,314000.0,19600.0,1020.0,10.8,8.11e-05,178.0,178.0,26.0,7256.39,15785.37,18632.0
7
+ 6,25200000.0,13200.0,319000.0,19900.0,1020.0,10.8,7.63e-05,181.0,181.0,18.9,7256.39,15785.37,18632.0
8
+ 7,29400000.0,10800.0,388000.0,24200.0,1020.0,10.2,7.16e-05,220.0,220.0,7.97,7256.39,15785.37,18632.0
9
+ 8,33600000.0,10800.0,388000.0,24200.0,1020.0,9.16,6.68e-05,220.0,220.0,6.46,7256.39,15785.37,18632.0
10
+ 9,37700000.0,11000.0,383000.0,23900.0,1020.0,11.2,6.21e-05,217.0,217.0,59.7,7256.39,15785.37,18632.0
11
+ 10,41900000.0,10800.0,387000.0,24200.0,1020.0,9.59,5.74e-05,219.0,219.0,44.0,7256.39,15785.37,18632.0
12
+ 11,46100000.0,10900.0,386000.0,24100.0,1020.0,8.08,5.26e-05,219.0,219.0,8.41,7256.39,15785.37,18632.0
13
+ 12,50300000.0,10900.0,384000.0,24000.0,1020.0,7.86,4.79e-05,218.0,218.0,5.09,7256.39,15785.37,18632.0
14
+ 13,54500000.0,11000.0,382000.0,23900.0,1020.0,7.7,4.32e-05,217.0,217.0,4.71,7256.39,15785.37,18632.0
15
+ 14,58700000.0,11000.0,381000.0,23800.0,1020.0,7.56,3.84e-05,216.0,216.0,5.14,7256.39,15785.37,18632.0
16
+ 15,62900000.0,11000.0,381000.0,23800.0,1020.0,7.4,3.37e-05,216.0,216.0,5.16,7256.39,15785.37,18632.0
17
+ 16,67099999.99999999,10900.0,386000.0,24100.0,1020.0,7.29,2.89e-05,219.0,219.0,5.26,7256.39,15785.37,18632.0
18
+ 17,71300000.0,11000.0,380000.0,23700.0,1020.0,7.22,2.42e-05,215.0,215.0,5.18,7256.39,15785.37,18632.0
19
+ 18,75500000.0,11100.0,378000.0,23600.0,1020.0,7.15,1.95e-05,214.0,214.0,5.04,7256.39,15785.37,18632.0
20
+ 19,79700000.0,11000.0,382000.0,23900.0,1020.0,7.08,1.47e-05,217.0,217.0,3.85,7256.39,15785.37,18632.0
21
+ 20,83900000.0,10800.0,388000.0,24200.0,1020.0,7.03,1e-05,220.0,220.0,2.9,,,
llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-1/profiler.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ forward,backward
2
+ 0ms 959μs,1ms 7μs
llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-2/log_metrics.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ iteration,consumed_tokens,elapsed_time_per_iteration_ms,tokens_per_sec,tokens_per_sec_per_gpu,global_batch_size,lm_loss,lr,model_tflops_per_gpu,hardware_tflops_per_gpu,grad_norm,memory_usage_MiB,peak_allocated_MiB,peak_reserved_MiB
2
+ 1,4190000.0000000005,18900.0,222000.0,13900.0,1020.0,11.3,0.0001,126.0,126.0,33.1,7252.46,25075.44,27568.0
3
+ 2,8390000.0,9350.0,449000.0,28000.0,1020.0,11.3,9.53e-05,254.0,254.0,33.3,7252.46,25075.44,27568.0
4
+ 3,12600000.0,9080.0,462000.0,28900.0,1020.0,16.0,9.05e-05,262.0,262.0,249.0,7252.46,25075.44,27568.0
5
+ 4,16800000.0,9640.0,435000.0,27200.0,1020.0,15.1,8.58e-05,247.0,247.0,41.6,7252.45,11617.76,27568.0
6
+ 5,21000000.0,9500.0,441000.0,27600.0,1020.0,10.8,8.11e-05,250.0,250.0,25.9,7252.45,25075.44,27568.0
7
+ 6,25200000.0,9730.0,431000.0,26900.0,1020.0,10.8,7.63e-05,244.0,244.0,18.9,7252.45,25075.44,27568.0
8
+ 7,29400000.0,8900.0,471000.0,29500.0,1020.0,10.2,7.16e-05,267.0,267.0,7.97,7252.45,25075.44,27568.0
9
+ 8,33600000.0,8860.0,473000.0,29600.0,1020.0,9.15,6.68e-05,268.0,268.0,6.46,7252.45,25075.44,27568.0
10
+ 9,37700000.0,9080.0,462000.0,28900.0,1020.0,11.2,6.21e-05,262.0,262.0,59.7,7252.45,25075.44,27568.0
11
+ 10,41900000.0,9130.0,459000.0,28700.0,1020.0,9.6,5.74e-05,260.0,260.0,44.2,7252.45,25075.44,27568.0
12
+ 11,46100000.0,9390.0,447000.0,27900.0,1020.0,8.08,5.26e-05,253.0,253.0,8.69,7252.45,25075.44,27568.0
13
+ 12,50300000.0,8910.0,471000.0,29400.0,1020.0,7.86,4.79e-05,267.0,267.0,5.1,7252.45,25075.44,27568.0
14
+ 13,54500000.0,9060.0,463000.0,28900.0,1020.0,7.7,4.32e-05,263.0,263.0,4.73,7252.45,25075.44,27568.0
15
+ 14,58700000.0,9030.0,464000.0,29000.0,1020.0,7.56,3.84e-05,263.0,263.0,5.09,7252.45,25075.44,27568.0
16
+ 15,62900000.0,9030.0,464000.0,29000.0,1020.0,7.4,3.37e-05,263.0,263.0,5.16,7252.45,25075.44,27568.0
17
+ 16,67099999.99999999,9270.0,453000.0,28300.0,1020.0,7.3,2.89e-05,257.0,257.0,5.15,7252.45,25075.44,27568.0
18
+ 17,71300000.0,9240.0,454000.0,28400.0,1020.0,7.22,2.42e-05,258.0,258.0,5.14,7252.45,25075.44,27568.0
19
+ 18,75500000.0,9120.0,460000.0,28800.0,1020.0,7.15,1.95e-05,261.0,261.0,5.04,7252.45,25075.44,27568.0
20
+ 19,79700000.0,9110.0,460000.0,28800.0,1020.0,7.08,1.47e-05,261.0,261.0,3.86,7252.45,25075.44,27568.0
21
+ 20,83900000.0,8990.0,467000.0,29200.0,1020.0,7.03,1e-05,265.0,265.0,2.94,,,
llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-2/profiler.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ forward,backward
2
+ 0ms 881μs,1ms 507μs
llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-4/log_metrics.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ iteration,consumed_tokens,elapsed_time_per_iteration_ms,tokens_per_sec,tokens_per_sec_per_gpu,global_batch_size,lm_loss,lr,model_tflops_per_gpu,hardware_tflops_per_gpu,grad_norm,memory_usage_MiB,peak_allocated_MiB,peak_reserved_MiB
2
+ 1,4190000.0000000005,20500.0,205000.0,12800.0,1020.0,11.3,0.0001,116.0,116.0,33.1,7252.59,42900.39,44042.0
3
+ 2,8390000.0,8800.0,476000.0,29800.0,1020.0,11.3,9.53e-05,270.0,270.0,33.3,7252.59,42900.39,44042.0
4
+ 3,12600000.0,8770.0,478000.0,29900.0,1020.0,16.0,9.05e-05,271.0,271.0,249.0,7252.59,42900.39,44042.0
5
+ 4,16800000.0,8870.0,473000.0,29600.0,1020.0,15.1,8.58e-05,268.0,268.0,41.6,7252.58,11617.89,44042.0
6
+ 5,21000000.0,8870.0,473000.0,29600.0,1020.0,10.8,8.11e-05,268.0,268.0,26.0,7252.58,42900.39,44042.0
7
+ 6,25200000.0,8820.0,475000.0,29700.0,1020.0,10.8,7.63e-05,270.0,270.0,18.9,7252.58,42900.39,44042.0
8
+ 7,29400000.0,8500.0,494000.0,30800.0,1020.0,10.2,7.16e-05,280.0,280.0,7.97,7252.58,42900.39,44042.0
9
+ 8,33600000.0,8580.0,489000.0,30500.0,1020.0,9.15,6.68e-05,277.0,277.0,6.46,7252.58,42900.39,44042.0
10
+ 9,37700000.0,8650.0,485000.0,30300.0,1020.0,11.2,6.21e-05,275.0,275.0,59.8,7252.58,42900.39,44042.0
11
+ 10,41900000.0,8790.0,477000.0,29800.0,1020.0,9.6,5.74e-05,271.0,271.0,44.2,7252.58,42900.39,44042.0
12
+ 11,46100000.0,8600.0,488000.0,30500.0,1020.0,8.08,5.26e-05,277.0,277.0,8.6,7252.58,42900.39,44042.0
13
+ 12,50300000.0,8660.0,484000.0,30300.0,1020.0,7.86,4.79e-05,275.0,275.0,5.09,7252.58,42900.39,44042.0
14
+ 13,54500000.0,8640.0,485000.0,30300.0,1020.0,7.7,4.32e-05,275.0,275.0,4.73,7252.58,42900.39,44042.0
15
+ 14,58700000.0,8820.0,476000.0,29700.0,1020.0,7.56,3.84e-05,270.0,270.0,5.1,7252.58,42900.39,44042.0
16
+ 15,62900000.0,8680.0,483000.0,30200.0,1020.0,7.4,3.37e-05,274.0,274.0,5.17,7252.58,42900.39,44042.0
17
+ 16,67099999.99999999,8660.0,485000.0,30300.0,1020.0,7.3,2.89e-05,275.0,275.0,5.17,7252.58,42900.39,44042.0
18
+ 17,71300000.0,8730.0,480000.0,30000.0,1020.0,7.22,2.42e-05,272.0,272.0,5.13,7252.58,42900.39,44042.0
19
+ 18,75500000.0,8630.0,486000.0,30400.0,1020.0,7.15,1.95e-05,276.0,276.0,5.04,7252.58,42900.39,44042.0
20
+ 19,79700000.0,8710.0,481000.0,30100.0,1020.0,7.08,1.47e-05,273.0,273.0,3.87,7252.58,42900.39,44042.0
21
+ 20,83900000.0,8790.0,477000.0,29800.0,1020.0,7.03,1e-05,271.0,271.0,2.93,,,
llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-4/profiler.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ forward,backward
2
+ 0ms 971μs,1ms 187μs
llama-1B/16_GPUS/dp-1_tp-1_pp-16_mbz-1/log_metrics.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ iteration,consumed_tokens,elapsed_time_per_iteration_ms,tokens_per_sec,tokens_per_sec_per_gpu,global_batch_size,lm_loss,lr,model_tflops_per_gpu,hardware_tflops_per_gpu,grad_norm,memory_usage_MiB,peak_allocated_MiB,peak_reserved_MiB
2
+ 1,4190000.0000000005,105000.0,40000.0,2500.0,1020.0,11.1,0.0001,22.7,22.7,25.6,3274.04,3274.05,13040.0
3
+ 2,8390000.0,50800.0,82500.0,5160.0,1020.0,11.1,9.53e-05,46.8,46.8,25.9,3274.04,12603.56,13042.0
4
+ 3,12600000.0,50900.0,82300.0,5150.0,1020.0,9.9,9.05e-05,46.7,46.7,40.4,3274.04,3274.05,13042.0
5
+ 4,16800000.0,47800.0,87700.0,5480.0,1020.0,11.9,8.58e-05,49.7,49.7,61.2,3274.04,12603.56,13042.0
6
+ 5,21000000.0,46600.0,90100.0,5630.0,1020.0,9.05,8.11e-05,51.1,51.1,8.31,,,
7
+ 6,25200000.0,51500.0,81500.0,5090.0,1020.0,8.86,7.63e-05,46.2,46.2,6.63,3274.04,12603.56,13042.0
8
+ 7,29400000.0,534000.0,7850.0,491.0,1020.0,8.37,7.16e-05,4.45,4.45,4.93,3274.04,12603.56,13042.0
9
+ 8,33600000.0,53800.0,78000.0,4880.0,1020.0,7.97,6.68e-05,44.2,44.2,3.13,3274.04,12603.56,13042.0
10
+ 9,37700000.0,47500.0,88400.0,5520.0,1020.0,7.83,6.21e-05,50.1,50.1,9.04,3274.04,12603.56,13042.0
11
+ 10,41900000.0,48000.0,87300.0,5460.0,1020.0,7.62,5.74e-05,49.5,49.5,5.09,3274.04,12603.56,13042.0
12
+ 11,46100000.0,47900.0,87600.0,5470.0,1020.0,7.47,5.26e-05,49.7,49.7,4.06,,,
13
+ 12,50300000.0,45700.0,91700.0,5730.0,1020.0,7.34,4.79e-05,52.0,52.0,3.12,3274.04,12603.56,13042.0
14
+ 13,54500000.0,48000.0,87400.0,5460.0,1020.0,7.23,4.32e-05,49.6,49.6,2.73,3274.04,12603.56,13042.0
15
+ 14,58700000.0,44700.0,93800.0,5860.0,1020.0,7.14,3.84e-05,53.2,53.2,2.33,3274.04,12603.56,13042.0
16
+ 15,62900000.0,46200.0,90800.0,5680.0,1020.0,7.06,3.37e-05,51.5,51.5,2.48,3274.04,12603.56,13042.0
17
+ 16,67099999.99999999,47000.0,89200.0,5580.0,1020.0,6.98,2.89e-05,50.6,50.6,2.66,3274.04,12603.56,13042.0
18
+ 17,71300000.0,46000.0,91100.0,5690.0,1020.0,6.9,2.42e-05,51.7,51.7,1.89,,,
19
+ 18,75500000.0,43500.0,96400.0,6030.0,1020.0,6.84,1.95e-05,54.7,54.7,1.61,3274.04,12603.56,13042.0
20
+ 19,79700000.0,47500.0,88200.0,5510.0,1020.0,6.8,1.47e-05,50.0,50.0,1.85,,,
21
+ 20,83900000.0,44700.0,93900.0,5870.0,1020.0,6.76,1e-05,53.2,53.2,1.81,,,
llama-1B/16_GPUS/dp-1_tp-1_pp-16_mbz-1/profiler.csv CHANGED
@@ -1,2 +1,2 @@
1
  forward,backward
2
- 0ms 619μs,1ms 421μs
 
1
  forward,backward
2
+ 0ms 944μs,1ms 86μs
llama-1B/16_GPUS/dp-1_tp-1_pp-16_mbz-2/log_metrics.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ iteration,consumed_tokens,elapsed_time_per_iteration_ms,tokens_per_sec,tokens_per_sec_per_gpu,global_batch_size,lm_loss,lr,model_tflops_per_gpu,hardware_tflops_per_gpu,grad_norm,memory_usage_MiB,peak_allocated_MiB,peak_reserved_MiB
2
+ 1,4190000.0000000005,90800.0,46200.0,2890.0,1020.0,11.1,0.0001,26.2,26.2,25.6,3274.08,3274.08,23444.0
3
+ 2,8390000.0,31700.0,132000.0,8270.0,1020.0,11.1,9.53e-05,75.1,75.1,25.9,3274.08,22941.07,23444.0
4
+ 3,12600000.0,31900.0,131000.0,8210.0,1020.0,9.9,9.05e-05,74.5,74.5,40.4,3274.08,22941.07,23444.0
5
+ 4,16800000.0,30400.0,138000.0,8630.0,1020.0,11.9,8.58e-05,78.3,78.3,61.2,3274.08,3274.08,23444.0
6
+ 5,21000000.0,30100.0,139000.0,8700.0,1020.0,9.05,8.11e-05,78.9,78.9,8.32,3274.08,22941.07,23444.0
7
+ 6,25200000.0,32900.0,128000.0,7970.0,1020.0,8.86,7.63e-05,72.3,72.3,6.61,3274.08,22941.07,23444.0
8
+ 7,29400000.0,272000.0,15400.0,965.0,1020.0,8.37,7.16e-05,8.75,8.75,4.93,3274.08,22941.07,23444.0
9
+ 8,33600000.0,32000.0,131000.0,8189.999999999999,1020.0,7.97,6.68e-05,74.3,74.3,3.12,3274.08,22941.07,23444.0
10
+ 9,37700000.0,32600.0,129000.0,8039.999999999999,1020.0,7.83,6.21e-05,73.0,73.0,9.04,3274.08,22941.07,23444.0
11
+ 10,41900000.0,31100.0,135000.0,8430.0,1020.0,7.62,5.74e-05,76.5,76.5,5.08,,,
12
+ 11,46100000.0,31100.0,135000.0,8440.0,1020.0,7.47,5.26e-05,76.5,76.5,4.05,3274.08,22941.07,23444.0
13
+ 12,50300000.0,32400.0,130000.0,8090.0,1020.0,7.34,4.79e-05,73.4,73.4,3.13,3274.08,22941.07,23444.0
14
+ 13,54500000.0,32000.0,131000.0,8180.0,1020.0,7.23,4.32e-05,74.2,74.2,2.74,,,
15
+ 14,58700000.0,32100.0,131000.0,8160.0,1020.0,7.14,3.84e-05,74.0,74.0,2.32,3274.08,22941.07,23444.0
16
+ 15,62900000.0,30000.0,140000.0,8740.0,1020.0,7.06,3.37e-05,79.3,79.3,2.47,3274.08,22941.07,23444.0
17
+ 16,67099999.99999999,34000.0,123000.0,7700.0,1020.0,6.98,2.89e-05,69.9,69.9,2.66,,,
18
+ 17,71300000.0,32900.0,128000.0,7970.0,1020.0,6.9,2.42e-05,72.4,72.4,1.88,3274.08,22941.07,23444.0
19
+ 18,75500000.0,31500.0,133000.0,8340.0,1020.0,6.84,1.95e-05,75.6,75.6,1.61,3274.08,22941.07,23444.0
20
+ 19,79700000.0,31900.0,132000.0,8230.0,1020.0,6.8,1.47e-05,74.7,74.7,1.83,3274.08,22941.07,23444.0
21
+ 20,83900000.0,31600.0,133000.0,8290.0,1020.0,6.77,1e-05,75.2,75.2,1.82,,,
llama-1B/16_GPUS/dp-1_tp-1_pp-16_mbz-2/profiler.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ forward,backward
2
+ 0ms 956μs,1ms 174μs
llama-1B/16_GPUS/dp-1_tp-1_pp-16_mbz-4/log_metrics.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ iteration,consumed_tokens,elapsed_time_per_iteration_ms,tokens_per_sec,tokens_per_sec_per_gpu,global_batch_size,lm_loss,lr,model_tflops_per_gpu,hardware_tflops_per_gpu,grad_norm,memory_usage_MiB,peak_allocated_MiB,peak_reserved_MiB
2
+ 1,4190000.0000000005,81400.0,51500.0,3220.0,1020.0,11.1,0.0001,29.2,29.2,25.6,3274.15,3274.15,43028.0
3
+ 2,8390000.0,30800.0,136000.0,8500.0,1020.0,11.1,9.53e-05,77.1,77.1,25.9,3274.15,3274.15,43028.0
4
+ 3,12600000.0,28800.0,145000.0,9090.0,1020.0,9.9,9.05e-05,82.5,82.5,40.4,3274.15,3274.15,43028.0
5
+ 4,16800000.0,30000.0,140000.0,8750.0,1020.0,11.9,8.58e-05,79.4,79.4,61.2,3274.15,42592.08,43028.0
6
+ 5,21000000.0,31800.0,132000.0,8250.0,1020.0,9.05,8.11e-05,74.9,74.9,8.31,,,
7
+ 6,25200000.0,30600.0,137000.0,8550.0,1020.0,8.85,7.63e-05,77.6,77.6,6.61,3274.15,42592.08,43028.0
8
+ 7,29400000.0,146000.0,28700.0,1790.0,1020.0,8.37,7.16e-05,16.3,16.3,4.93,,,
9
+ 8,33600000.0,29800.0,141000.0,8810.0,1020.0,7.97,6.68e-05,79.9,79.9,3.12,3274.15,42592.08,43028.0
10
+ 9,37700000.0,31500.0,133000.0,8320.0,1020.0,7.83,6.21e-05,75.5,75.5,9.04,,,
11
+ 10,41900000.0,31400.0,134000.0,8350.0,1020.0,7.62,5.74e-05,75.8,75.8,5.09,3274.15,42592.08,43028.0
12
+ 11,46100000.0,30300.0,138000.0,8650.0,1020.0,7.47,5.26e-05,78.5,78.5,4.06,3274.15,42592.08,43028.0
13
+ 12,50300000.0,32000.0,131000.0,8180.0,1020.0,7.34,4.79e-05,74.2,74.2,3.13,,,
14
+ 13,54500000.0,32400.0,129000.0,8080.0,1020.0,7.23,4.32e-05,73.4,73.4,2.73,3274.15,42592.08,43028.0
15
+ 14,58700000.0,30900.0,136000.0,8480.0,1020.0,7.14,3.84e-05,77.0,77.0,2.33,3274.15,42592.08,43028.0
16
+ 15,62900000.0,30600.0,137000.0,8560.0,1020.0,7.06,3.37e-05,77.7,77.7,2.47,3274.15,42592.08,43028.0
17
+ 16,67099999.99999999,32100.0,131000.0,8170.0,1020.0,6.98,2.89e-05,74.1,74.1,2.69,3274.15,42592.08,43028.0
18
+ 17,71300000.0,31300.0,134000.0,8370.0,1020.0,6.9,2.42e-05,75.9,75.9,1.91,3274.15,42592.08,43028.0
19
+ 18,75500000.0,29900.0,140000.0,8770.0,1020.0,6.84,1.95e-05,79.5,79.5,1.62,3274.15,42592.08,43028.0
20
+ 19,79700000.0,31700.0,132000.0,8270.0,1020.0,6.8,1.47e-05,75.0,75.0,1.85,3274.15,42592.08,43028.0
21
+ 20,83900000.0,30800.0,136000.0,8500.0,1020.0,6.77,1e-05,77.1,77.1,1.82,,,
llama-1B/16_GPUS/dp-1_tp-1_pp-16_mbz-4/profiler.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ forward,backward
2
+ 0ms 584μs,3ms 721μs
llama-1B/16_GPUS/dp-1_tp-2_pp-8_mbz-2/profiler.csv CHANGED
@@ -1,2 +1,2 @@
1
  forward,backward
2
- 0ms 918μs,1ms 958μs
 
1
  forward,backward
2
+ 0ms 942μs,1ms 137μs
llama-1B/16_GPUS/dp-1_tp-2_pp-8_mbz-4/log_metrics.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ iteration,consumed_tokens,elapsed_time_per_iteration_ms,tokens_per_sec,tokens_per_sec_per_gpu,global_batch_size,lm_loss,lr,model_tflops_per_gpu,hardware_tflops_per_gpu,grad_norm,memory_usage_MiB,peak_allocated_MiB,peak_reserved_MiB
2
+ 1,4190000.0000000005,109000.0,38400.0,2400.0,1020.0,11.2,0.0001,21.8,21.8,17.8,2393.89,24101.07,24548.0
3
+ 2,8390000.0,65600.0,63900.0,3990.0,1020.0,11.2,9.53e-05,36.2,36.2,17.8,2393.89,24101.07,24548.0
4
+ 3,12600000.0,62400.0,67200.0,4200.0,1020.0,9.62,9.05e-05,38.1,38.1,21.7,2393.89,2393.9,24548.0
5
+ 4,16800000.0,63900.0,65600.0,4100.0,1020.0,10.5,8.58e-05,37.2,37.2,45.6,,,
6
+ 5,21000000.0,65900.0,63700.0,3980.0,1020.0,9.43,8.11e-05,36.1,36.1,11.3,2393.89,24101.07,24548.0
7
+ 6,25200000.0,66600.0,63000.0,3940.0,1020.0,9.37,7.63e-05,35.7,35.7,7.69,2393.89,24101.07,24548.0
8
+ 7,29400000.0,301000.0,14000.0,872.0,1020.0,8.96,7.16e-05,7.91,7.91,5.69,2393.89,24101.07,24548.0
9
+ 8,33600000.0,63200.0,66400.0,4150.0,1020.0,8.47,6.68e-05,37.6,37.6,5.25,2393.89,24101.07,24548.0
10
+ 9,37700000.0,65200.0,64300.0,4019.9999999999995,1020.0,8.01,6.21e-05,36.5,36.5,4.65,2393.89,24101.07,24548.0
11
+ 10,41900000.0,62100.0,67600.0,4220.0,1020.0,7.75,5.74e-05,38.3,38.3,3.85,2393.89,24101.07,24548.0
12
+ 11,46100000.0,61400.0,68400.0,4270.0,1020.0,7.62,5.26e-05,38.8,38.8,4.98,2393.89,24101.07,24548.0
13
+ 12,50300000.0,63000.0,66600.0,4160.0,1020.0,7.46,4.79e-05,37.7,37.7,3.45,,,
14
+ 13,54500000.0,64200.0,65400.00000000001,4090.0,1020.0,7.34,4.32e-05,37.1,37.1,3.45,2393.89,24101.07,24548.0
15
+ 14,58700000.0,63700.0,65800.0,4110.0,1020.0,7.22,3.84e-05,37.3,37.3,3.22,2393.89,24101.07,24548.0
16
+ 15,62900000.0,64500.0,65000.0,4059.9999999999995,1020.0,7.1,3.37e-05,36.8,36.8,2.88,2393.89,24101.07,24548.0
17
+ 16,67099999.99999999,64300.0,65200.0,4080.0,1020.0,7.01,2.89e-05,37.0,37.0,2.57,2393.89,24101.07,24548.0
18
+ 17,71300000.0,63900.0,65700.0,4100.0,1020.0,6.94,2.42e-05,37.2,37.2,2.47,2393.89,24101.07,24548.0
19
+ 18,75500000.0,63700.0,65900.0,4120.0,1020.0,6.88,1.95e-05,37.4,37.4,2.49,2393.89,24101.07,24548.0
20
+ 19,79700000.0,63600.0,65900.0,4120.0,1020.0,6.83,1.47e-05,37.4,37.4,2.42,,,
21
+ 20,83900000.0,64900.00000000001,64599.99999999999,4040.0,1020.0,6.78,1e-05,36.7,36.7,2.25,,,
llama-1B/16_GPUS/dp-1_tp-2_pp-8_mbz-4/profiler.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ forward,backward
2
+ 0ms 971μs,1ms 239μs
llama-1B/16_GPUS/dp-1_tp-2_pp-8_mbz-8/profiler.csv CHANGED
@@ -1,2 +1,2 @@
1
  forward,backward
2
- 0ms 980μs,7ms 495μs
 
1
  forward,backward
2
+ 0ms 943μs,1ms 229μs
llama-1B/16_GPUS/dp-1_tp-4_pp-4_mbz-16/log_metrics.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ iteration,consumed_tokens,elapsed_time_per_iteration_ms,tokens_per_sec,tokens_per_sec_per_gpu,global_batch_size,lm_loss,lr,model_tflops_per_gpu,hardware_tflops_per_gpu,grad_norm,memory_usage_MiB,peak_allocated_MiB,peak_reserved_MiB
2
+ 1,4190000.0000000005,48500.0,86500.0,5400.0,1020.0,11.2,0.0001,49.0,49.0,10.9,1778.24,47092.1,47498.0
3
+ 2,8390000.0,19800.0,212000.0,13200.0,1020.0,11.2,9.53e-05,120.0,120.0,11.0,1778.24,1778.27,47498.0
4
+ 3,12600000.0,18600.0,226000.0,14100.0,1020.0,9.83,9.05e-05,128.0,128.0,44.4,1778.24,1778.27,47498.0
5
+ 4,16800000.0,17800.0,235000.0,14700.0,1020.0,12.1,8.58e-05,133.0,133.0,24.8,,,
6
+ 5,21000000.0,17100.0,245000.0,15300.0,1020.0,10.1,8.11e-05,139.0,139.0,11.4,1778.24,47092.1,47498.0
7
+ 6,25200000.0,17100.0,245000.0,15300.0,1020.0,9.39,7.63e-05,139.0,139.0,7.05,1778.24,47092.1,47498.0
8
+ 7,29400000.0,113000.0,37000.0,2310.0,1020.0,8.7,7.16e-05,21.0,21.0,5.44,1778.24,47092.1,47498.0
9
+ 8,33600000.0,17200.0,243000.0,15200.0,1020.0,8.77,6.68e-05,138.0,138.0,18.3,,,
10
+ 9,37700000.0,17200.0,244000.0,15300.0,1020.0,8.11,6.21e-05,139.0,139.0,4.97,1778.24,47092.1,47498.0
11
+ 10,41900000.0,16900.0,249000.0,15500.0,1020.0,7.96,5.74e-05,141.0,141.0,4.62,1778.24,47092.1,47498.0
12
+ 11,46100000.0,16000.0,262000.0,16400.0,1020.0,7.84,5.26e-05,149.0,149.0,4.93,1778.24,47092.1,47498.0
13
+ 12,50300000.0,18100.0,232000.0,14500.0,1020.0,7.64,4.79e-05,132.0,132.0,4.08,1778.24,47092.1,47498.0
14
+ 13,54500000.0,17500.0,240000.0,15000.0,1020.0,7.48,4.32e-05,136.0,136.0,3.28,1778.24,47092.1,47498.0
15
+ 14,58700000.0,18200.0,230000.0,14400.0,1020.0,7.4,3.84e-05,131.0,131.0,3.52,1778.24,47092.1,47498.0
16
+ 15,62900000.0,17000.0,246000.0,15400.0,1020.0,7.29,3.37e-05,140.0,140.0,3.13,,,
17
+ 16,67099999.99999999,17600.0,239000.0,14900.0,1020.0,7.18,2.89e-05,135.0,135.0,3.12,1778.24,47092.1,47498.0
18
+ 17,71300000.0,17700.0,237000.0,14800.0,1020.0,7.09,2.42e-05,134.0,134.0,3.22,,,
19
+ 18,75500000.0,17800.0,236000.0,14700.0,1020.0,7.02,1.95e-05,134.0,134.0,3.19,1778.24,47092.1,47498.0
20
+ 19,79700000.0,18400.0,227000.0,14200.0,1020.0,6.97,1.47e-05,129.0,129.0,3.06,1778.24,47092.1,47498.0
21
+ 20,83900000.0,17600.0,239000.0,14900.0,1020.0,6.92,1e-05,135.0,135.0,2.88,,,
llama-1B/16_GPUS/dp-1_tp-4_pp-4_mbz-16/profiler.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ forward,backward
2
+ 0ms 888μs,1ms 570μs
llama-1B/16_GPUS/dp-1_tp-4_pp-4_mbz-4/profiler.csv CHANGED
@@ -1,2 +1,2 @@
1
  forward,backward
2
- 1ms 17μs,0ms 416μs
 
1
  forward,backward
2
+ 0ms 980μs,1ms 139μs
llama-1B/16_GPUS/dp-1_tp-4_pp-4_mbz-8/log_metrics.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ iteration,consumed_tokens,elapsed_time_per_iteration_ms,tokens_per_sec,tokens_per_sec_per_gpu,global_batch_size,lm_loss,lr,model_tflops_per_gpu,hardware_tflops_per_gpu,grad_norm,memory_usage_MiB,peak_allocated_MiB,peak_reserved_MiB
2
+ 1,4190000.0000000005,50800.0,82600.0,5160.0,1020.0,11.2,0.0001,46.9,46.9,10.9,1777.96,24436.91,24816.0
3
+ 2,8390000.0,15800.0,265000.0,16600.0,1020.0,11.2,9.53e-05,150.0,150.0,11.0,1777.96,24436.91,24816.0
4
+ 3,12600000.0,17300.0,242000.0,15100.0,1020.0,9.83,9.05e-05,137.0,137.0,44.3,1777.96,24436.91,24816.0
5
+ 4,16800000.0,16700.0,252000.0,15700.0,1020.0,12.1,8.58e-05,143.0,143.0,24.8,1777.96,24436.91,24816.0
6
+ 5,21000000.0,15400.0,272000.0,17000.0,1020.0,10.1,8.11e-05,154.0,154.0,11.4,,,
7
+ 6,25200000.0,15900.0,264000.0,16500.0,1020.0,9.39,7.63e-05,150.0,150.0,7.05,1777.96,24436.91,24816.0
8
+ 7,29400000.0,204000.0,20600.0,1290.0,1020.0,8.69,7.16e-05,11.7,11.7,5.43,1777.96,24436.91,24816.0
9
+ 8,33600000.0,16300.0,257000.0,16000.0,1020.0,8.77,6.68e-05,146.0,146.0,18.4,1777.96,24436.91,24816.0
10
+ 9,37700000.0,17600.0,238000.0,14900.0,1020.0,8.11,6.21e-05,135.0,135.0,4.96,,,
11
+ 10,41900000.0,16600.0,253000.0,15800.0,1020.0,7.96,5.74e-05,144.0,144.0,4.62,1777.96,24436.91,24816.0
12
+ 11,46100000.0,16300.0,257000.0,16100.000000000002,1020.0,7.84,5.26e-05,146.0,146.0,4.93,1777.96,24436.91,24816.0
13
+ 12,50300000.0,16000.0,262000.0,16400.0,1020.0,7.64,4.79e-05,148.0,148.0,4.08,,,
14
+ 13,54500000.0,16100.000000000002,261000.0,16300.0,1020.0,7.48,4.32e-05,148.0,148.0,3.28,1777.96,24436.91,24816.0
15
+ 14,58700000.0,16500.0,254000.0,15900.0,1020.0,7.4,3.84e-05,144.0,144.0,3.52,1777.96,24436.91,24816.0
16
+ 15,62900000.0,16100.000000000002,261000.0,16300.0,1020.0,7.29,3.37e-05,148.0,148.0,3.13,,,
17
+ 16,67099999.99999999,16000.0,263000.0,16400.0,1020.0,7.18,2.89e-05,149.0,149.0,3.11,1777.96,24436.91,24816.0
18
+ 17,71300000.0,15300.0,275000.0,17200.0,1020.0,7.09,2.42e-05,156.0,156.0,3.22,1777.96,24436.91,24816.0
19
+ 18,75500000.0,15800.0,266000.0,16600.0,1020.0,7.02,1.95e-05,151.0,151.0,3.19,1777.96,24436.91,24816.0
20
+ 19,79700000.0,15600.0,268000.0,16800.0,1020.0,6.97,1.47e-05,152.0,152.0,3.06,,,
21
+ 20,83900000.0,15300.0,274000.0,17100.0,1020.0,6.92,1e-05,156.0,156.0,2.89,,,
llama-1B/16_GPUS/dp-1_tp-4_pp-4_mbz-8/profiler.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ forward,backward
2
+ 0ms 804μs,1ms 957μs
llama-1B/16_GPUS/dp-1_tp-8_pp-2_mbz-16/log_metrics.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ iteration,consumed_tokens,elapsed_time_per_iteration_ms,tokens_per_sec,tokens_per_sec_per_gpu,global_batch_size,lm_loss,lr,model_tflops_per_gpu,hardware_tflops_per_gpu,grad_norm,memory_usage_MiB,peak_allocated_MiB,peak_reserved_MiB
2
+ 1,4190000.0000000005,36400.0,115000.0,7200.0,1020.0,11.2,0.0001,65.3,65.3,12.1,1572.71,31525.69,31874.0
3
+ 2,8390000.0,18000.0,233000.0,14500.0,1020.0,11.2,9.53e-05,132.0,132.0,12.2,1572.71,31525.69,31874.0
4
+ 3,12600000.0,18100.0,232000.0,14500.0,1020.0,10.0,9.05e-05,132.0,132.0,51.6,1572.71,31525.69,31874.0
5
+ 4,16800000.0,17400.0,241000.0,15000.0,1020.0,11.7,8.58e-05,136.0,136.0,18.2,1572.71,1572.75,31874.0
6
+ 5,21000000.0,17500.0,239000.0,14900.0,1020.0,10.4,8.11e-05,136.0,136.0,16.0,1572.71,31525.69,31874.0
7
+ 6,25200000.0,17500.0,240000.0,15000.0,1020.0,9.9,7.63e-05,136.0,136.0,9.07,1572.71,31525.69,31874.0
8
+ 7,29400000.0,216000.0,19400.0,1210.0,1020.0,9.37,7.16e-05,11.0,11.0,6.23,1572.71,31525.69,31874.0
9
+ 8,33600000.0,17400.0,241000.0,15100.0,1020.0,8.89,6.68e-05,137.0,137.0,5.76,1572.71,31525.69,31874.0
10
+ 9,37700000.0,18300.0,229000.0,14300.0,1020.0,8.8,6.21e-05,130.0,130.0,11.2,,,
11
+ 10,41900000.0,17300.0,243000.0,15200.0,1020.0,8.33,5.74e-05,138.0,138.0,5.72,1572.71,31525.69,31874.0
12
+ 11,46100000.0,17200.0,243000.0,15200.0,1020.0,8.06,5.26e-05,138.0,138.0,4.91,1572.71,31525.69,31874.0
13
+ 12,50300000.0,17000.0,247000.0,15400.0,1020.0,7.9,4.79e-05,140.0,140.0,4.86,1572.71,31525.69,31874.0
14
+ 13,54500000.0,17300.0,242000.0,15100.0,1020.0,7.75,4.32e-05,137.0,137.0,4.69,,,
15
+ 14,58700000.0,17300.0,243000.0,15200.0,1020.0,7.62,3.84e-05,138.0,138.0,4.69,1572.71,31525.69,31874.0
16
+ 15,62900000.0,17300.0,242000.0,15100.0,1020.0,7.48,3.37e-05,137.0,137.0,4.49,1572.71,31525.69,31874.0
17
+ 16,67099999.99999999,17000.0,247000.0,15400.0,1020.0,7.34,2.89e-05,140.0,140.0,3.99,1572.71,31525.69,31874.0
18
+ 17,71300000.0,17100.0,245000.0,15300.0,1020.0,7.23,2.42e-05,139.0,139.0,3.54,1572.71,31525.69,31874.0
19
+ 18,75500000.0,17300.0,242000.0,15100.0,1020.0,7.16,1.95e-05,137.0,137.0,3.28,1572.71,31525.69,31874.0
20
+ 19,79700000.0,17300.0,242000.0,15100.0,1020.0,7.09,1.47e-05,137.0,137.0,3.2,1572.71,31525.69,31874.0
21
+ 20,83900000.0,17500.0,240000.0,15000.0,1020.0,7.03,1e-05,136.0,136.0,3.1,,,
llama-1B/16_GPUS/dp-1_tp-8_pp-2_mbz-16/profiler.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ forward,backward
2
+ 0ms 972μs,1ms 205μs
llama-1B/16_GPUS/dp-1_tp-8_pp-2_mbz-32/log_metrics.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ iteration,consumed_tokens,elapsed_time_per_iteration_ms,tokens_per_sec,tokens_per_sec_per_gpu,global_batch_size,lm_loss,lr,model_tflops_per_gpu,hardware_tflops_per_gpu,grad_norm,memory_usage_MiB,peak_allocated_MiB,peak_reserved_MiB
2
+ 1,4190000.0000000005,35300.0,119000.0,7430.0,1020.0,11.2,0.0001,67.4,67.4,12.1,1573.27,1573.31,62200.0
3
+ 2,8390000.0,16800.0,250000.0,15600.0,1020.0,11.2,9.53e-05,142.0,142.0,12.2,1573.27,1573.31,62200.0
4
+ 3,12600000.0,17000.0,246000.0,15400.0,1020.0,10.0,9.05e-05,140.0,140.0,51.6,1573.27,1573.31,62200.0
5
+ 4,16800000.0,16800.0,249000.0,15600.0,1020.0,11.7,8.58e-05,141.0,141.0,18.3,1573.27,61477.19,62200.0
6
+ 5,21000000.0,16800.0,250000.0,15600.0,1020.0,10.4,8.11e-05,142.0,142.0,16.0,,,
7
+ 6,25200000.0,16400.0,255000.0,16000.0,1020.0,9.9,7.63e-05,145.0,145.0,9.07,1573.27,61477.19,62200.0
8
+ 7,29400000.0,117000.0,36000.0,2250.0,1020.0,9.37,7.16e-05,20.4,20.4,6.23,1573.27,61477.19,62200.0
9
+ 8,33600000.0,16600.0,253000.0,15800.0,1020.0,8.89,6.68e-05,144.0,144.0,5.76,1573.27,61477.19,62200.0
10
+ 9,37700000.0,17300.0,243000.0,15200.0,1020.0,8.8,6.21e-05,138.0,138.0,11.2,1573.27,61477.19,62200.0
11
+ 10,41900000.0,16500.0,255000.0,15900.0,1020.0,8.33,5.74e-05,144.0,144.0,5.72,1573.27,61477.19,62200.0
12
+ 11,46100000.0,16700.0,252000.0,15700.0,1020.0,8.06,5.26e-05,143.0,143.0,4.91,1573.27,61477.19,62200.0
13
+ 12,50300000.0,16900.0,249000.0,15500.0,1020.0,7.9,4.79e-05,141.0,141.0,4.86,1573.27,61477.19,62200.0
14
+ 13,54500000.0,17000.0,247000.0,15400.0,1020.0,7.75,4.32e-05,140.0,140.0,4.69,1573.27,61477.19,62200.0
15
+ 14,58700000.0,16500.0,254000.0,15900.0,1020.0,7.62,3.84e-05,144.0,144.0,4.69,1573.27,61477.19,62200.0
16
+ 15,62900000.0,16700.0,251000.0,15700.0,1020.0,7.48,3.37e-05,143.0,143.0,4.49,,,
17
+ 16,67099999.99999999,16900.0,248000.0,15500.0,1020.0,7.34,2.89e-05,141.0,141.0,3.99,1573.27,61477.19,62200.0
18
+ 17,71300000.0,17500.0,240000.0,15000.0,1020.0,7.23,2.42e-05,136.0,136.0,3.54,1573.27,61477.19,62200.0
19
+ 18,75500000.0,17100.0,245000.0,15300.0,1020.0,7.16,1.95e-05,139.0,139.0,3.28,1573.27,61477.19,62200.0
20
+ 19,79700000.0,16900.0,247000.0,15500.0,1020.0,7.09,1.47e-05,140.0,140.0,3.2,,,
21
+ 20,83900000.0,16600.0,252000.0,15700.0,1020.0,7.03,1e-05,143.0,143.0,3.1,,,
llama-1B/16_GPUS/dp-1_tp-8_pp-2_mbz-32/profiler.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ forward,backward
2
+ 0ms 974μs,0ms 988μs
llama-1B/16_GPUS/dp-1_tp-8_pp-2_mbz-8/profiler.csv CHANGED
@@ -1,2 +1,2 @@
1
  forward,backward
2
- 1ms 74μs,0ms 533μs
 
1
  forward,backward
2
+ 0ms 949μs,1ms 197μs
llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-2/log_metrics.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ iteration,consumed_tokens,elapsed_time_per_iteration_ms,tokens_per_sec,tokens_per_sec_per_gpu,global_batch_size,lm_loss,lr,model_tflops_per_gpu,hardware_tflops_per_gpu,grad_norm,memory_usage_MiB,peak_allocated_MiB,peak_reserved_MiB
2
+ 1,4190000.0000000005,63000.0,66600.0,4160.0,1020.0,11.1,0.0001,37.8,37.8,24.9,3168.18,4459.05,23598.0
3
+ 2,8390000.0,32100.0,131000.0,8170.0,1020.0,11.1,9.53e-05,74.1,74.1,25.1,3168.18,4459.05,23598.0
4
+ 3,12600000.0,29300.0,143000.0,8950.0,1020.0,9.49,9.05e-05,81.2,81.2,21.5,3168.18,4459.05,23598.0
5
+ 4,16800000.0,28900.0,145000.0,9070.0,1020.0,9.36,8.58e-05,82.3,82.3,21.4,3168.18,22834.67,23598.0
6
+ 5,21000000.0,31800.0,132000.0,8240.0,1020.0,9.01,8.11e-05,74.8,74.8,12.7,,,
7
+ 6,25200000.0,30800.0,136000.0,8520.0,1020.0,10.3,7.63e-05,77.3,77.3,47.1,3168.18,22834.67,23598.0
8
+ 7,29400000.0,241000.0,17400.0,1090.0,1020.0,8.68,7.16e-05,9.88,9.88,5.58,3168.18,22834.67,23598.0
9
+ 8,33600000.0,31200.0,135000.0,8410.0,1020.0,8.32,6.68e-05,76.3,76.3,4.77,3168.18,22834.67,23598.0
10
+ 9,37700000.0,31900.0,131000.0,8210.0,1020.0,7.95,6.21e-05,74.5,74.5,3.31,3168.18,22834.67,23598.0
11
+ 10,41900000.0,30600.0,137000.0,8550.0,1020.0,7.69,5.74e-05,77.6,77.6,4.31,3168.18,22834.67,23598.0
12
+ 11,46100000.0,32100.0,131000.0,8170.0,1020.0,7.45,5.26e-05,74.2,74.2,2.5,,,
13
+ 12,50300000.0,31700.0,132000.0,8270.0,1020.0,7.37,4.79e-05,75.0,75.0,5.02,3168.18,22834.67,23598.0
14
+ 13,54500000.0,32200.000000000004,130000.0,8150.0,1020.0,7.31,4.32e-05,73.9,73.9,6.06,3168.18,22834.67,23598.0
15
+ 14,58700000.0,31900.0,132000.0,8220.0,1020.0,7.19,3.84e-05,74.6,74.6,5.3,,,
16
+ 15,62900000.0,34100.0,123000.0,7690.0,1020.0,7.06,3.37e-05,69.7,69.7,2.73,3168.18,22834.67,23598.0
17
+ 16,67099999.99999999,32500.0,129000.0,8060.000000000001,1020.0,6.97,2.89e-05,73.2,73.2,1.99,3168.18,22834.67,23598.0
18
+ 17,71300000.0,30800.0,136000.0,8510.0,1020.0,6.91,2.42e-05,77.2,77.2,2.04,3168.18,22834.67,23598.0
19
+ 18,75500000.0,32900.0,128000.0,7970.0,1020.0,6.86,1.95e-05,72.4,72.4,2.0,3168.18,22834.67,23598.0
20
+ 19,79700000.0,30100.0,139000.0,8700.0,1020.0,6.81,1.47e-05,78.9,78.9,2.01,,,
21
+ 20,83900000.0,31100.0,135000.0,8420.0,1020.0,6.77,1e-05,76.4,76.4,1.94,,,
llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-2/profiler.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ forward,backward
2
+ 0ms 715μs,2ms 528μs
llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-4/profiler.csv CHANGED
@@ -1,2 +1,2 @@
1
  forward,backward
2
- 0ms 732μs,3ms 706μs
 
1
  forward,backward
2
+ 0ms 976μs,1ms 206μs
llama-1B/16_GPUS/dp-2_tp-2_pp-4_mbz-2/log_metrics.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ iteration,consumed_tokens,elapsed_time_per_iteration_ms,tokens_per_sec,tokens_per_sec_per_gpu,global_batch_size,lm_loss,lr,model_tflops_per_gpu,hardware_tflops_per_gpu,grad_norm,memory_usage_MiB,peak_allocated_MiB,peak_reserved_MiB
2
+ 1,4190000.0000000005,58800.0,71400.0,4460.0,1020.0,11.2,0.0001,40.5,40.5,14.8,2343.89,3292.11,12328.0
3
+ 2,8390000.0,35300.0,119000.0,7440.0,1020.0,11.2,9.53e-05,67.5,67.5,14.9,2343.89,3292.11,12328.0
4
+ 3,12600000.0,36400.0,115000.0,7210.0,1020.0,9.53,9.05e-05,65.4,65.4,35.8,2343.89,3292.11,12328.0
5
+ 4,16800000.0,37900.0,111000.0,6910.0,1020.0,12.3,8.58e-05,62.7,62.7,37.4,,,
6
+ 5,21000000.0,35100.0,119000.0,7460.0,1020.0,9.94,8.11e-05,67.7,67.7,14.1,2343.89,11855.1,12328.0
7
+ 6,25200000.0,34300.0,122000.0,7650.0,1020.0,9.44,7.63e-05,69.4,69.4,8.14,2343.89,11855.1,12328.0
8
+ 7,29400000.0,430000.0,9760.0,610.0,1020.0,8.73,7.16e-05,5.54,5.54,6.04,2343.89,11855.1,12328.0
9
+ 8,33600000.0,35100.0,119000.0,7460.0,1020.0,9.17,6.68e-05,67.7,67.7,27.9,,,
10
+ 9,37700000.0,33900.0,124000.0,7730.0,1020.0,8.33,6.21e-05,70.2,70.2,9.38,2343.89,11855.1,12328.0
11
+ 10,41900000.0,32900.0,128000.0,7970.0,1020.0,8.02,5.74e-05,72.3,72.3,5.25,,,
12
+ 11,46100000.0,35800.0,117000.0,7320.0,1020.0,7.85,5.26e-05,66.4,66.4,4.81,2343.89,11855.1,12328.0
13
+ 12,50300000.0,37200.0,113000.0,7050.0,1020.0,7.68,4.79e-05,64.0,64.0,4.49,,,
14
+ 13,54500000.0,34900.0,120000.0,7520.0,1020.0,7.53,4.32e-05,68.2,68.2,4.16,2343.89,11855.1,12328.0
15
+ 14,58700000.0,34500.0,122000.0,7600.0,1020.0,7.4,3.84e-05,68.9,68.9,4.08,2343.89,11855.1,12328.0
16
+ 15,62900000.0,35200.0,119000.0,7440.0,1020.0,7.26,3.37e-05,67.5,67.5,3.25,,,
17
+ 16,67099999.99999999,35300.0,119000.0,7420.0,1020.0,7.17,2.89e-05,67.3,67.3,2.43,2343.89,11855.1,12328.0
18
+ 17,71300000.0,37200.0,113000.0,7040.0,1020.0,7.1,2.42e-05,63.9,63.9,2.88,2343.89,11855.1,12328.0
19
+ 18,75500000.0,34500.0,121000.0,7590.0,1020.0,7.03,1.95e-05,68.9,68.9,2.75,2343.89,11855.1,12328.0
20
+ 19,79700000.0,36300.0,116000.0,7230.0,1020.0,6.96,1.47e-05,65.6,65.6,2.64,2343.89,11855.1,12328.0
21
+ 20,83900000.0,34800.0,120000.0,7530.0,1020.0,6.91,1e-05,68.3,68.3,2.47,,,
llama-1B/16_GPUS/dp-2_tp-2_pp-4_mbz-2/profiler.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ forward,backward
2
+ 0ms 959μs,1ms 86μs
llama-1B/16_GPUS/dp-2_tp-2_pp-4_mbz-4/log_metrics.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ iteration,consumed_tokens,elapsed_time_per_iteration_ms,tokens_per_sec,tokens_per_sec_per_gpu,global_batch_size,lm_loss,lr,model_tflops_per_gpu,hardware_tflops_per_gpu,grad_norm,memory_usage_MiB,peak_allocated_MiB,peak_reserved_MiB
2
+ 1,4190000.0000000005,58200.0,72100.0,4500.0,1020.0,11.2,0.0001,40.9,40.9,14.8,2343.96,3291.29,21926.0
3
+ 2,8390000.0,31600.0,133000.0,8290.0,1020.0,11.2,9.53e-05,75.2,75.2,14.9,2343.96,21358.33,21926.0
4
+ 3,12600000.0,32400.0,129000.0,8090.0,1020.0,9.53,9.05e-05,73.4,73.4,35.7,2343.96,3291.29,21926.0
5
+ 4,16800000.0,30400.0,138000.0,8630.0,1020.0,12.3,8.58e-05,78.3,78.3,37.4,2343.96,21358.33,21926.0
6
+ 5,21000000.0,33200.0,126000.0,7890.0,1020.0,9.94,8.11e-05,71.6,71.6,14.1,,,
7
+ 6,25200000.0,32100.0,130000.0,8150.0,1020.0,9.43,7.63e-05,74.0,74.0,8.15,2343.96,21358.33,21926.0
8
+ 7,29400000.0,224000.0,18700.0,1170.0,1020.0,8.73,7.16e-05,10.6,10.6,6.04,2343.96,21358.33,21926.0
9
+ 8,33600000.0,34400.0,122000.0,7620.0,1020.0,9.16,6.68e-05,69.1,69.1,27.8,2343.96,21358.33,21926.0
10
+ 9,37700000.0,33300.0,126000.0,7880.0,1020.0,8.32,6.21e-05,71.5,71.5,9.29,2343.96,21358.33,21926.0
11
+ 10,41900000.0,32600.0,129000.0,8029.999999999999,1020.0,8.02,5.74e-05,72.9,72.9,5.24,2343.96,21358.33,21926.0
12
+ 11,46100000.0,33500.0,125000.0,7830.0,1020.0,7.85,5.26e-05,71.1,71.1,4.81,2343.96,21358.33,21926.0
13
+ 12,50300000.0,33200.0,126000.0,7900.0,1020.0,7.68,4.79e-05,71.7,71.7,4.49,2343.96,21358.33,21926.0
14
+ 13,54500000.0,31800.0,132000.0,8250.0,1020.0,7.53,4.32e-05,74.9,74.9,4.15,2343.96,21358.33,21926.0
15
+ 14,58700000.0,32000.0,131000.0,8200.0,1020.0,7.4,3.84e-05,74.4,74.4,4.07,2343.96,21358.33,21926.0
16
+ 15,62900000.0,32299.999999999996,130000.0,8119.999999999999,1020.0,7.26,3.37e-05,73.7,73.7,3.24,2343.96,21358.33,21926.0
17
+ 16,67099999.99999999,33600.0,125000.0,7810.0,1020.0,7.17,2.89e-05,70.8,70.8,2.43,2343.96,21358.33,21926.0
18
+ 17,71300000.0,31500.0,133000.0,8320.0,1020.0,7.1,2.42e-05,75.5,75.5,2.88,2343.96,21358.33,21926.0
19
+ 18,75500000.0,33200.0,126000.0,7890.0,1020.0,7.03,1.95e-05,71.6,71.6,2.75,2343.96,21358.33,21926.0
20
+ 19,79700000.0,34100.0,123000.0,7700.0,1020.0,6.96,1.47e-05,69.8,69.8,2.64,,,
21
+ 20,83900000.0,31700.0,132000.0,8260.0,1020.0,6.91,1e-05,75.0,75.0,2.47,,,
llama-1B/16_GPUS/dp-2_tp-2_pp-4_mbz-4/profiler.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ forward,backward
2
+ 0ms 915μs,1ms 415μs
llama-1B/16_GPUS/dp-2_tp-2_pp-4_mbz-8/log_metrics.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ iteration,consumed_tokens,elapsed_time_per_iteration_ms,tokens_per_sec,tokens_per_sec_per_gpu,global_batch_size,lm_loss,lr,model_tflops_per_gpu,hardware_tflops_per_gpu,grad_norm,memory_usage_MiB,peak_allocated_MiB,peak_reserved_MiB
2
+ 1,4190000.0000000005,47000.0,89300.0,5580.0,1020.0,11.2,0.0001,50.6,50.6,14.8,2344.1,3289.64,41214.0
3
+ 2,8390000.0,24200.0,174000.0,10800.0,1020.0,11.2,9.53e-05,98.4,98.4,14.9,2344.1,3289.64,41214.0
4
+ 3,12600000.0,28600.0,147000.0,9180.0,1020.0,9.53,9.05e-05,83.3,83.3,35.8,2344.1,40364.8,41214.0
5
+ 4,16800000.0,27800.0,151000.0,9440.0,1020.0,12.3,8.58e-05,85.7,85.7,37.4,2344.1,3289.64,41214.0
6
+ 5,21000000.0,25200.0,166000.0,10400.0,1020.0,9.94,8.11e-05,94.2,94.2,14.1,2344.1,40364.8,41214.0
7
+ 6,25200000.0,26300.0,159000.0,9950.0,1020.0,9.44,7.63e-05,90.3,90.3,8.13,2344.1,40364.8,41214.0
8
+ 7,29400000.0,121000.0,34700.0,2170.0,1020.0,8.73,7.16e-05,19.7,19.7,6.04,2344.1,40364.8,41214.0
9
+ 8,33600000.0,26000.0,161000.0,10100.0,1020.0,9.17,6.68e-05,91.4,91.4,28.0,2344.1,40364.8,41214.0
10
+ 9,37700000.0,27100.0,155000.0,9660.0,1020.0,8.33,6.21e-05,87.6,87.6,9.42,2344.1,40364.8,41214.0
11
+ 10,41900000.0,26600.0,157000.0,9840.0,1020.0,8.02,5.74e-05,89.3,89.3,5.24,2344.1,40364.8,41214.0
12
+ 11,46100000.0,26500.0,158000.0,9880.0,1020.0,7.85,5.26e-05,89.6,89.6,4.81,,,
13
+ 12,50300000.0,27900.0,151000.0,9410.0,1020.0,7.68,4.79e-05,85.4,85.4,4.49,2344.1,40364.8,41214.0
14
+ 13,54500000.0,25900.0,162000.0,10100.0,1020.0,7.53,4.32e-05,91.7,91.7,4.15,2344.1,40364.8,41214.0
15
+ 14,58700000.0,25700.0,163000.0,10200.0,1020.0,7.4,3.84e-05,92.6,92.6,4.07,,,
16
+ 15,62900000.0,25800.0,163000.0,10200.0,1020.0,7.26,3.37e-05,92.3,92.3,3.25,2344.1,40364.8,41214.0
17
+ 16,67099999.99999999,24100.0,174000.0,10900.0,1020.0,7.17,2.89e-05,98.6,98.6,2.44,2344.1,40364.8,41214.0
18
+ 17,71300000.0,25600.0,164000.0,10200.0,1020.0,7.1,2.42e-05,92.9,92.9,2.88,2344.1,40364.8,41214.0
19
+ 18,75500000.0,25400.0,165000.0,10300.0,1020.0,7.03,1.95e-05,93.5,93.5,2.75,2344.1,40364.8,41214.0
20
+ 19,79700000.0,24300.0,173000.0,10800.0,1020.0,6.96,1.47e-05,97.9,97.9,2.65,,,
21
+ 20,83900000.0,28300.0,148000.0,9270.0,1020.0,6.91,1e-05,84.1,84.1,2.47,,,
llama-1B/16_GPUS/dp-2_tp-2_pp-4_mbz-8/profiler.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ forward,backward
2
+ 0ms 961μs,1ms 120μs
llama-1B/16_GPUS/dp-2_tp-4_pp-2_mbz-16/log_metrics.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ iteration,consumed_tokens,elapsed_time_per_iteration_ms,tokens_per_sec,tokens_per_sec_per_gpu,global_batch_size,lm_loss,lr,model_tflops_per_gpu,hardware_tflops_per_gpu,grad_norm,memory_usage_MiB,peak_allocated_MiB,peak_reserved_MiB
2
+ 1,4190000.0000000005,27500.0,152000.0,9520.0,1020.0,11.1,0.0001,86.4,86.4,15.0,2064.33,2888.12,47696.0
3
+ 2,8390000.0,13900.0,302000.0,18900.0,1020.0,11.1,9.53e-05,171.0,171.0,15.1,2064.33,2888.12,47952.0
4
+ 3,12600000.0,12900.0,325000.0,20300.0,1020.0,11.4,9.05e-05,184.0,184.0,106.0,2064.33,47377.06,47952.0
5
+ 4,16800000.0,13300.0,315000.0,19700.0,1020.0,11.7,8.58e-05,179.0,179.0,24.5,2064.33,47377.06,47952.0
6
+ 5,21000000.0,13000.0,324000.0,20200.0,1020.0,10.0,8.11e-05,184.0,184.0,11.0,,,
7
+ 6,25200000.0,12500.0,335000.0,20900.0,1020.0,9.46,7.63e-05,190.0,190.0,7.2,2064.33,47377.06,47952.0
8
+ 7,29400000.0,108000.0,38700.0,2420.0,1020.0,8.87,7.16e-05,21.9,21.9,5.99,2064.33,47377.06,47952.0
9
+ 8,33600000.0,13500.0,311000.0,19400.0,1020.0,8.44,6.68e-05,176.0,176.0,5.47,,,
10
+ 9,37700000.0,12400.0,337000.0,21100.0,1020.0,8.17,6.21e-05,191.0,191.0,6.22,2064.33,47377.06,47952.0
11
+ 10,41900000.0,12700.0,331000.0,20700.0,1020.0,7.87,5.74e-05,188.0,188.0,4.35,2064.33,47377.06,47952.0
12
+ 11,46100000.0,12900.0,326000.0,20400.0,1020.0,7.74,5.26e-05,185.0,185.0,4.47,2064.33,47377.06,47952.0
13
+ 12,50300000.0,12700.0,331000.0,20700.0,1020.0,7.6,4.79e-05,188.0,188.0,4.41,2064.33,47377.06,47952.0
14
+ 13,54500000.0,12800.0,328000.0,20500.0,1020.0,7.41,4.32e-05,186.0,186.0,3.72,2064.33,47377.06,47952.0
15
+ 14,58700000.0,13700.0,306000.0,19100.0,1020.0,7.27,3.84e-05,173.0,173.0,3.19,,,
16
+ 15,62900000.0,13300.0,316000.0,19800.0,1020.0,7.17,3.37e-05,179.0,179.0,3.0,2064.33,47377.06,47952.0
17
+ 16,67099999.99999999,13400.0,312000.0,19500.0,1020.0,7.07,2.89e-05,177.0,177.0,3.0,2064.33,47377.06,47952.0
18
+ 17,71300000.0,14800.0,283000.0,17700.0,1020.0,6.96,2.42e-05,160.0,160.0,2.81,2064.33,47377.06,47952.0
19
+ 18,75500000.0,13300.0,314000.0,19700.0,1020.0,6.88,1.95e-05,178.0,178.0,3.0,2064.33,47377.06,47952.0
20
+ 19,79700000.0,13600.0,308000.0,19200.0,1020.0,6.82,1.47e-05,174.0,174.0,3.08,,,
21
+ 20,83900000.0,14100.0,298000.0,18600.0,1020.0,6.77,1e-05,169.0,169.0,2.98,,,
llama-1B/16_GPUS/dp-2_tp-4_pp-2_mbz-16/profiler.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ forward,backward
2
+ 0ms 963μs,1ms 10μs
llama-1B/16_GPUS/dp-2_tp-4_pp-2_mbz-4/log_metrics.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ iteration,consumed_tokens,elapsed_time_per_iteration_ms,tokens_per_sec,tokens_per_sec_per_gpu,global_batch_size,lm_loss,lr,model_tflops_per_gpu,hardware_tflops_per_gpu,grad_norm,memory_usage_MiB,peak_allocated_MiB,peak_reserved_MiB
2
+ 1,4190000.0000000005,30700.0,136000.0,8530.0,1020.0,11.1,0.0001,77.4,77.4,15.0,2063.91,13395.12,13626.0
3
+ 2,8390000.0,15700.0,268000.0,16700.0,1020.0,11.1,9.53e-05,152.0,152.0,15.1,2063.91,2887.7,13626.0
4
+ 3,12600000.0,15900.0,263000.0,16400.0,1020.0,11.4,9.05e-05,149.0,149.0,106.0,2063.91,2887.7,13626.0
5
+ 4,16800000.0,19200.0,218000.0,13600.0,1020.0,11.7,8.58e-05,124.0,124.0,24.5,2063.91,13395.12,13626.0
6
+ 5,21000000.0,20000.0,210000.0,13100.0,1020.0,10.0,8.11e-05,119.0,119.0,11.0,,,
7
+ 6,25200000.0,19300.0,217000.0,13600.0,1020.0,9.46,7.63e-05,123.0,123.0,7.2,2063.91,13395.12,13626.0
8
+ 7,29400000.0,413000.0,10200.0,635.0,1020.0,8.87,7.16e-05,5.76,5.76,5.99,2063.91,13395.12,13626.0
9
+ 8,33600000.0,16300.0,257000.0,16000.0,1020.0,8.43,6.68e-05,145.0,145.0,5.47,,,
10
+ 9,37700000.0,15400.0,273000.0,17000.0,1020.0,8.17,6.21e-05,155.0,155.0,6.19,2063.91,13395.12,13626.0
11
+ 10,41900000.0,16400.0,256000.0,16000.0,1020.0,7.86,5.74e-05,145.0,145.0,4.35,2063.91,13395.12,13626.0
12
+ 11,46100000.0,15600.0,268000.0,16800.0,1020.0,7.74,5.26e-05,152.0,152.0,4.48,2063.91,13395.12,13626.0
13
+ 12,50300000.0,15700.0,267000.0,16700.0,1020.0,7.6,4.79e-05,152.0,152.0,4.41,2063.91,13395.12,13626.0
14
+ 13,54500000.0,15400.0,273000.0,17100.0,1020.0,7.41,4.32e-05,155.0,155.0,3.72,2063.91,13395.12,13626.0
15
+ 14,58700000.0,15300.0,274000.0,17100.0,1020.0,7.27,3.84e-05,155.0,155.0,3.19,2063.91,13395.12,13626.0
16
+ 15,62900000.0,15100.0,278000.0,17300.0,1020.0,7.17,3.37e-05,157.0,157.0,3.0,2063.91,13395.12,13626.0
17
+ 16,67099999.99999999,16200.0,260000.0,16200.0,1020.0,7.07,2.89e-05,147.0,147.0,3.0,2063.91,13395.12,13626.0
18
+ 17,71300000.0,15300.0,273000.0,17100.0,1020.0,6.96,2.42e-05,155.0,155.0,2.81,2063.91,13395.12,13626.0
19
+ 18,75500000.0,15200.0,275000.0,17200.0,1020.0,6.88,1.95e-05,156.0,156.0,3.0,2063.91,13395.12,13626.0
20
+ 19,79700000.0,15600.0,269000.0,16800.0,1020.0,6.82,1.47e-05,152.0,152.0,3.08,,,
21
+ 20,83900000.0,16200.0,260000.0,16200.0,1020.0,6.77,1e-05,147.0,147.0,2.98,,,
llama-1B/16_GPUS/dp-2_tp-4_pp-2_mbz-4/profiler.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ forward,backward
2
+ 0ms 967μs,1ms 107μs
llama-1B/16_GPUS/dp-2_tp-4_pp-2_mbz-8/log_metrics.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ iteration,consumed_tokens,elapsed_time_per_iteration_ms,tokens_per_sec,tokens_per_sec_per_gpu,global_batch_size,lm_loss,lr,model_tflops_per_gpu,hardware_tflops_per_gpu,grad_norm,memory_usage_MiB,peak_allocated_MiB,peak_reserved_MiB
2
+ 1,4190000.0000000005,29300.0,143000.0,8950.0,1020.0,11.1,0.0001,81.2,81.2,15.0,2064.05,2887.84,24986.0
3
+ 2,8390000.0,15600.0,269000.0,16800.0,1020.0,11.1,9.53e-05,153.0,153.0,15.1,2064.05,2887.84,25114.0
4
+ 3,12600000.0,14200.0,294000.0,18400.0,1020.0,11.4,9.05e-05,167.0,167.0,106.0,2064.05,2887.84,25114.0
5
+ 4,16800000.0,16000.0,262000.0,16400.0,1020.0,11.7,8.58e-05,149.0,149.0,24.5,,,
6
+ 5,21000000.0,15600.0,269000.0,16800.0,1020.0,10.0,8.11e-05,153.0,153.0,11.0,2064.05,24722.43,25114.0
7
+ 6,25200000.0,14500.0,290000.0,18100.0,1020.0,9.46,7.63e-05,164.0,164.0,7.21,2064.05,24722.43,25114.0
8
+ 7,29400000.0,221000.0,19000.0,1190.0,1020.0,8.87,7.16e-05,10.8,10.8,5.99,,,
9
+ 8,33600000.0,14600.0,287000.0,17900.0,1020.0,8.44,6.68e-05,163.0,163.0,5.45,2064.05,24722.43,25114.0
10
+ 9,37700000.0,16000.0,262000.0,16400.0,1020.0,8.18,6.21e-05,149.0,149.0,6.29,2064.05,24722.43,25114.0
11
+ 10,41900000.0,14500.0,289000.0,18100.0,1020.0,7.87,5.74e-05,164.0,164.0,4.35,2064.05,24722.43,25114.0
12
+ 11,46100000.0,14700.0,286000.0,17900.0,1020.0,7.74,5.26e-05,162.0,162.0,4.47,2064.05,24722.43,25114.0
13
+ 12,50300000.0,14200.0,295000.0,18500.0,1020.0,7.6,4.79e-05,167.0,167.0,4.41,,,
14
+ 13,54500000.0,14600.0,288000.0,18000.0,1020.0,7.42,4.32e-05,163.0,163.0,3.72,2064.05,24722.43,25114.0
15
+ 14,58700000.0,14900.0,282000.0,17700.0,1020.0,7.27,3.84e-05,160.0,160.0,3.2,2064.05,24722.43,25114.0
16
+ 15,62900000.0,14500.0,289000.0,18100.0,1020.0,7.17,3.37e-05,164.0,164.0,2.99,2064.05,24722.43,25114.0
17
+ 16,67099999.99999999,14000.0,299000.0,18700.0,1020.0,7.07,2.89e-05,169.0,169.0,3.0,2064.05,24722.43,25114.0
18
+ 17,71300000.0,14900.0,282000.0,17600.0,1020.0,6.96,2.42e-05,160.0,160.0,2.81,,,
19
+ 18,75500000.0,14500.0,289000.0,18100.0,1020.0,6.88,1.95e-05,164.0,164.0,3.0,2064.05,24722.43,25114.0
20
+ 19,79700000.0,15200.0,275000.0,17200.0,1020.0,6.82,1.47e-05,156.0,156.0,3.08,,,
21
+ 20,83900000.0,15100.0,278000.0,17400.0,1020.0,6.77,1e-05,158.0,158.0,2.99,,,
llama-1B/16_GPUS/dp-2_tp-4_pp-2_mbz-8/profiler.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ forward,backward
2
+ 0ms 967μs,1ms 105μs
llama-1B/16_GPUS/dp-2_tp-8_pp-1_mbz-16/log_metrics.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ iteration,consumed_tokens,elapsed_time_per_iteration_ms,tokens_per_sec,tokens_per_sec_per_gpu,global_batch_size,lm_loss,lr,model_tflops_per_gpu,hardware_tflops_per_gpu,grad_norm,memory_usage_MiB,peak_allocated_MiB,peak_reserved_MiB
2
+ 1,4190000.0000000005,25900.0,162000.0,10100.0,1020.0,11.5,0.0001,92.0,92.0,15.7,1696.19,30498.63,30840.0
3
+ 2,8390000.0,13400.0,312000.0,19500.0,1020.0,11.5,9.53e-05,177.0,177.0,16.0,1696.19,30498.63,30840.0
4
+ 3,12600000.0,13600.0,308000.0,19200.0,1020.0,12.8,9.05e-05,174.0,174.0,137.0,1696.19,30498.63,30840.0
5
+ 4,16800000.0,13600.0,308000.0,19300.0,1020.0,12.2,8.58e-05,175.0,175.0,22.4,1696.17,2358.02,30840.0
6
+ 5,21000000.0,13400.0,314000.0,19600.0,1020.0,12.4,8.11e-05,178.0,178.0,42.9,1696.17,30498.63,30840.0
7
+ 6,25200000.0,13400.0,312000.0,19500.0,1020.0,11.1,7.63e-05,177.0,177.0,24.7,1696.17,30498.63,30840.0
8
+ 7,29400000.0,13600.0,309000.0,19300.0,1020.0,10.2,7.16e-05,175.0,175.0,12.2,1696.17,30498.63,30840.0
9
+ 8,33600000.0,13400.0,313000.0,19600.0,1020.0,9.8,6.68e-05,178.0,178.0,7.31,1696.17,30498.63,30840.0
10
+ 9,37700000.0,13400.0,314000.0,19600.0,1020.0,9.32,6.21e-05,178.0,178.0,6.66,1696.17,30498.63,30840.0
11
+ 10,41900000.0,13600.0,309000.0,19300.0,1020.0,9.22,5.74e-05,175.0,175.0,16.2,1696.17,30498.63,30840.0
12
+ 11,46100000.0,13600.0,308000.0,19300.0,1020.0,8.63,5.26e-05,175.0,175.0,7.93,1696.17,30498.63,30840.0
13
+ 12,50300000.0,13700.0,307000.0,19200.0,1020.0,8.27,4.79e-05,174.0,174.0,5.43,1696.17,30498.63,30840.0
14
+ 13,54500000.0,13300.0,315000.0,19700.0,1020.0,8.1,4.32e-05,179.0,179.0,5.53,1696.17,30498.63,30840.0
15
+ 14,58700000.0,13600.0,309000.0,19300.0,1020.0,7.93,3.84e-05,175.0,175.0,5.77,1696.17,30498.63,30840.0
16
+ 15,62900000.0,13600.0,309000.0,19300.0,1020.0,7.72,3.37e-05,175.0,175.0,5.17,1696.17,30498.63,30840.0
17
+ 16,67099999.99999999,13400.0,313000.0,19600.0,1020.0,7.56,2.89e-05,178.0,178.0,4.92,1696.17,30498.63,30840.0
18
+ 17,71300000.0,13800.0,304000.0,19000.0,1020.0,7.45,2.42e-05,172.0,172.0,4.93,1696.17,30498.63,30840.0
19
+ 18,75500000.0,13500.0,310000.0,19400.0,1020.0,7.35,1.95e-05,176.0,176.0,4.04,1696.17,30498.63,30840.0
20
+ 19,79700000.0,13500.0,311000.0,19400.0,1020.0,7.29,1.47e-05,176.0,176.0,4.11,1696.17,30498.63,30840.0
21
+ 20,83900000.0,13500.0,312000.0,19500.0,1020.0,7.23,1e-05,177.0,177.0,3.95,,,
llama-1B/16_GPUS/dp-2_tp-8_pp-1_mbz-16/profiler.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ forward,backward
2
+ 0ms 971μs,0ms 973μs
llama-1B/16_GPUS/dp-2_tp-8_pp-1_mbz-32/log_metrics.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ iteration,consumed_tokens,elapsed_time_per_iteration_ms,tokens_per_sec,tokens_per_sec_per_gpu,global_batch_size,lm_loss,lr,model_tflops_per_gpu,hardware_tflops_per_gpu,grad_norm,memory_usage_MiB,peak_allocated_MiB,peak_reserved_MiB
2
+ 1,4190000.0000000005,20500.0,205000.0,12800.0,1020.0,11.5,0.0001,116.0,116.0,15.7,1697.3,59303.14,60566.0
3
+ 2,8390000.0,12900.0,325000.0,20300.0,1020.0,11.5,9.53e-05,184.0,184.0,16.0,1697.3,59303.14,60566.0
4
+ 3,12600000.0,12700.0,329000.0,20600.0,1020.0,12.8,9.05e-05,187.0,187.0,137.0,1697.3,59303.14,60566.0
5
+ 4,16800000.0,13000.0,322000.0,20100.0,1020.0,12.2,8.58e-05,183.0,183.0,22.4,1697.29,2359.13,60566.0
6
+ 5,21000000.0,13000.0,322000.0,20100.0,1020.0,12.4,8.11e-05,182.0,182.0,42.8,1697.29,59303.14,60566.0
7
+ 6,25200000.0,12900.0,325000.0,20300.0,1020.0,11.1,7.63e-05,185.0,185.0,24.8,1697.29,59303.14,60566.0
8
+ 7,29400000.0,13100.0,320000.0,20000.0,1020.0,10.2,7.16e-05,182.0,182.0,12.1,1697.29,59303.14,60566.0
9
+ 8,33600000.0,12700.0,329000.0,20600.0,1020.0,9.8,6.68e-05,187.0,187.0,7.31,1697.29,59303.14,60566.0
10
+ 9,37700000.0,12800.0,328000.0,20500.0,1020.0,9.32,6.21e-05,186.0,186.0,6.66,1697.29,59303.14,60566.0
11
+ 10,41900000.0,12900.0,324000.0,20300.0,1020.0,9.22,5.74e-05,184.0,184.0,16.3,1697.29,59303.14,60566.0
12
+ 11,46100000.0,12900.0,325000.0,20300.0,1020.0,8.63,5.26e-05,184.0,184.0,7.95,1697.29,59303.14,60566.0
13
+ 12,50300000.0,12800.0,329000.0,20500.0,1020.0,8.27,4.79e-05,186.0,186.0,5.43,1697.29,59303.14,60566.0
14
+ 13,54500000.0,12800.0,327000.0,20400.0,1020.0,8.1,4.32e-05,185.0,185.0,5.53,1697.29,59303.14,60566.0
15
+ 14,58700000.0,12800.0,328000.0,20500.0,1020.0,7.93,3.84e-05,186.0,186.0,5.77,1697.29,59303.14,60566.0
16
+ 15,62900000.0,12800.0,328000.0,20500.0,1020.0,7.72,3.37e-05,186.0,186.0,5.17,1697.29,59303.14,60566.0
17
+ 16,67099999.99999999,13000.0,323000.0,20200.0,1020.0,7.56,2.89e-05,183.0,183.0,4.93,1697.29,59303.14,60566.0
18
+ 17,71300000.0,12800.0,329000.0,20500.0,1020.0,7.45,2.42e-05,186.0,186.0,4.93,1697.29,59303.14,60566.0
19
+ 18,75500000.0,12800.0,327000.0,20500.0,1020.0,7.35,1.95e-05,186.0,186.0,4.02,1697.29,59303.14,60566.0
20
+ 19,79700000.0,12800.0,328000.0,20500.0,1020.0,7.29,1.47e-05,186.0,186.0,4.11,1697.29,59303.14,60566.0
21
+ 20,83900000.0,12700.0,329000.0,20600.0,1020.0,7.23,1e-05,187.0,187.0,3.96,,,
llama-1B/16_GPUS/dp-2_tp-8_pp-1_mbz-32/profiler.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ forward,backward
2
+ 0ms 958μs,1ms 170μs
llama-1B/16_GPUS/dp-4_tp-1_pp-4_mbz-1/log_metrics.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ iteration,consumed_tokens,elapsed_time_per_iteration_ms,tokens_per_sec,tokens_per_sec_per_gpu,global_batch_size,lm_loss,lr,model_tflops_per_gpu,hardware_tflops_per_gpu,grad_norm,memory_usage_MiB,peak_allocated_MiB,peak_reserved_MiB
2
+ 1,4190000.0000000005,34000.0,123000.0,7720.0,1020.0,11.1,0.0001,70.0,70.0,25.1,3478.24,11653.25,12904.0
3
+ 2,8390000.0,19000.0,221000.0,13800.0,1020.0,11.1,9.53e-05,125.0,125.0,25.2,3478.24,5184.61,12904.0
4
+ 3,12600000.0,19100.0,219000.0,13700.0,1020.0,11.4,9.05e-05,124.0,124.0,217.0,3478.24,5184.61,12904.0
5
+ 4,16800000.0,22400.0,187000.0,11700.0,1020.0,13.8,8.58e-05,106.0,106.0,22.5,,,
6
+ 5,21000000.0,22200.0,189000.0,11800.0,1020.0,9.98,8.11e-05,107.0,107.0,16.5,3478.24,11653.25,12904.0
7
+ 6,25200000.0,21500.0,195000.0,12200.0,1020.0,10.9,7.63e-05,111.0,111.0,93.8,3478.24,11653.25,12904.0
8
+ 7,29400000.0,389000.0,10800.0,674.0,1020.0,9.16,7.16e-05,6.11,6.11,19.7,3478.24,11653.25,12904.0
9
+ 8,33600000.0,22500.0,187000.0,11700.0,1020.0,8.83,6.68e-05,106.0,106.0,6.08,3478.24,11653.25,12904.0
10
+ 9,37700000.0,22600.0,186000.0,11600.0,1020.0,8.47,6.21e-05,105.0,105.0,5.23,3478.24,11653.25,12904.0
11
+ 10,41900000.0,19200.0,218000.0,13600.0,1020.0,8.17,5.74e-05,124.0,124.0,7.72,3478.24,11653.25,12904.0
12
+ 11,46100000.0,20000.0,210000.0,13100.0,1020.0,7.93,5.26e-05,119.0,119.0,5.54,,,
13
+ 12,50300000.0,19600.0,214000.0,13300.0,1020.0,7.75,4.79e-05,121.0,121.0,4.65,3478.24,11653.25,12904.0
14
+ 13,54500000.0,18900.0,222000.0,13900.0,1020.0,7.58,4.32e-05,126.0,126.0,2.89,3478.24,11653.25,12904.0
15
+ 14,58700000.0,19100.0,219000.0,13700.0,1020.0,7.5,3.84e-05,124.0,124.0,4.19,3478.24,11653.25,12904.0
16
+ 15,62900000.0,19200.0,218000.0,13600.0,1020.0,7.4,3.37e-05,124.0,124.0,3.86,3478.24,11653.25,12904.0
17
+ 16,67099999.99999999,19500.0,215000.0,13500.0,1020.0,7.29,2.89e-05,122.0,122.0,3.07,3478.24,11653.25,12904.0
18
+ 17,71300000.0,19600.0,214000.0,13400.0,1020.0,7.19,2.42e-05,122.0,122.0,2.39,3478.24,11653.25,12904.0
19
+ 18,75500000.0,21100.0,199000.0,12400.0,1020.0,7.13,1.95e-05,113.0,113.0,2.21,3478.24,11653.25,12904.0
20
+ 19,79700000.0,19600.0,214000.0,13400.0,1020.0,7.08,1.47e-05,121.0,121.0,2.64,3478.24,11653.25,12904.0
21
+ 20,83900000.0,17900.0,234000.0,14600.0,1020.0,7.03,1e-05,133.0,133.0,2.29,,,
llama-1B/16_GPUS/dp-4_tp-1_pp-4_mbz-1/profiler.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ forward,backward
2
+ 0ms 947μs,1ms 132μs
llama-1B/16_GPUS/dp-4_tp-1_pp-4_mbz-2/profiler.csv CHANGED
@@ -1,2 +1,2 @@
1
  forward,backward
2
- 0ms 890μs,1ms 193μs
 
1
  forward,backward
2
+ 0ms 947μs,1ms 197μs
llama-1B/16_GPUS/dp-4_tp-1_pp-4_mbz-4/log_metrics.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ iteration,consumed_tokens,elapsed_time_per_iteration_ms,tokens_per_sec,tokens_per_sec_per_gpu,global_batch_size,lm_loss,lr,model_tflops_per_gpu,hardware_tflops_per_gpu,grad_norm,memory_usage_MiB,peak_allocated_MiB,peak_reserved_MiB
2
+ 1,4190000.0000000005,38800.0,108000.0,6750.0,1020.0,11.1,0.0001,61.3,61.3,25.1,3478.34,37922.28,38320.0
3
+ 2,8390000.0,19100.0,219000.0,13700.0,1020.0,11.1,9.53e-05,124.0,124.0,25.2,3478.34,37922.28,38320.0
4
+ 3,12600000.0,19000.0,221000.0,13800.0,1020.0,11.4,9.05e-05,125.0,125.0,217.0,3478.34,37922.28,38320.0
5
+ 4,16800000.0,19800.0,212000.0,13300.0,1020.0,13.8,8.58e-05,120.0,120.0,22.5,3478.34,5180.18,38320.0
6
+ 5,21000000.0,18200.0,231000.0,14400.0,1020.0,9.98,8.11e-05,131.0,131.0,16.4,3478.34,37922.28,38320.0
7
+ 6,25200000.0,18600.0,226000.0,14100.0,1020.0,10.9,7.63e-05,128.0,128.0,93.8,3478.34,37922.28,38320.0
8
+ 7,29400000.0,105000.0,40000.0,2500.0,1020.0,9.16,7.16e-05,22.7,22.7,19.8,,,
9
+ 8,33600000.0,18200.0,230000.0,14400.0,1020.0,8.83,6.68e-05,131.0,131.0,6.08,3478.34,37922.28,38320.0
10
+ 9,37700000.0,17900.0,235000.0,14700.0,1020.0,8.47,6.21e-05,133.0,133.0,5.23,3478.34,37922.28,38320.0
11
+ 10,41900000.0,17200.0,244000.0,15200.0,1020.0,8.17,5.74e-05,138.0,138.0,7.71,3478.34,37922.28,38320.0
12
+ 11,46100000.0,19400.0,216000.0,13500.0,1020.0,7.93,5.26e-05,123.0,123.0,5.53,3478.34,37922.28,38320.0
13
+ 12,50300000.0,18800.0,223000.0,14000.0,1020.0,7.75,4.79e-05,127.0,127.0,4.64,,,
14
+ 13,54500000.0,18700.0,224000.0,14000.0,1020.0,7.58,4.32e-05,127.0,127.0,2.9,3478.34,37922.28,38320.0
15
+ 14,58700000.0,16200.0,258000.0,16200.0,1020.0,7.5,3.84e-05,147.0,147.0,4.18,3478.34,37922.28,38320.0
16
+ 15,62900000.0,18100.0,232000.0,14500.0,1020.0,7.4,3.37e-05,131.0,131.0,3.86,3478.34,37922.28,38320.0
17
+ 16,67099999.99999999,17700.0,237000.0,14800.0,1020.0,7.29,2.89e-05,134.0,134.0,3.06,3478.34,37922.28,38320.0
18
+ 17,71300000.0,18300.0,229000.0,14300.0,1020.0,7.19,2.42e-05,130.0,130.0,2.39,,,
19
+ 18,75500000.0,20300.0,206000.0,12900.0,1020.0,7.13,1.95e-05,117.0,117.0,2.2,3478.34,37922.28,38320.0
20
+ 19,79700000.0,17800.0,236000.0,14800.0,1020.0,7.08,1.47e-05,134.0,134.0,2.64,,,
21
+ 20,83900000.0,17200.0,244000.0,15200.0,1020.0,7.03,1e-05,138.0,138.0,2.3,,,