ferdinand.mom committed
Commit: b79f60a
1 Parent(s): 10ca648
add metrics + profiler.csv for 16 GPUS
This view is limited to 50 files because it contains too many changes. See raw diff.
- llama-1B/16_GPUS/16_GPUS_summary_results.csv +119 -0
- llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-1/log_metrics.csv +21 -0
- llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-1/profiler.csv +2 -0
- llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-2/log_metrics.csv +21 -0
- llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-2/profiler.csv +2 -0
- llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-4/log_metrics.csv +21 -0
- llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-4/profiler.csv +2 -0
- llama-1B/16_GPUS/dp-1_tp-1_pp-16_mbz-1/log_metrics.csv +21 -0
- llama-1B/16_GPUS/dp-1_tp-1_pp-16_mbz-1/profiler.csv +1 -1
- llama-1B/16_GPUS/dp-1_tp-1_pp-16_mbz-2/log_metrics.csv +21 -0
- llama-1B/16_GPUS/dp-1_tp-1_pp-16_mbz-2/profiler.csv +2 -0
- llama-1B/16_GPUS/dp-1_tp-1_pp-16_mbz-4/log_metrics.csv +21 -0
- llama-1B/16_GPUS/dp-1_tp-1_pp-16_mbz-4/profiler.csv +2 -0
- llama-1B/16_GPUS/dp-1_tp-2_pp-8_mbz-2/profiler.csv +1 -1
- llama-1B/16_GPUS/dp-1_tp-2_pp-8_mbz-4/log_metrics.csv +21 -0
- llama-1B/16_GPUS/dp-1_tp-2_pp-8_mbz-4/profiler.csv +2 -0
- llama-1B/16_GPUS/dp-1_tp-2_pp-8_mbz-8/profiler.csv +1 -1
- llama-1B/16_GPUS/dp-1_tp-4_pp-4_mbz-16/log_metrics.csv +21 -0
- llama-1B/16_GPUS/dp-1_tp-4_pp-4_mbz-16/profiler.csv +2 -0
- llama-1B/16_GPUS/dp-1_tp-4_pp-4_mbz-4/profiler.csv +1 -1
- llama-1B/16_GPUS/dp-1_tp-4_pp-4_mbz-8/log_metrics.csv +21 -0
- llama-1B/16_GPUS/dp-1_tp-4_pp-4_mbz-8/profiler.csv +2 -0
- llama-1B/16_GPUS/dp-1_tp-8_pp-2_mbz-16/log_metrics.csv +21 -0
- llama-1B/16_GPUS/dp-1_tp-8_pp-2_mbz-16/profiler.csv +2 -0
- llama-1B/16_GPUS/dp-1_tp-8_pp-2_mbz-32/log_metrics.csv +21 -0
- llama-1B/16_GPUS/dp-1_tp-8_pp-2_mbz-32/profiler.csv +2 -0
- llama-1B/16_GPUS/dp-1_tp-8_pp-2_mbz-8/profiler.csv +1 -1
- llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-2/log_metrics.csv +21 -0
- llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-2/profiler.csv +2 -0
- llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-4/profiler.csv +1 -1
- llama-1B/16_GPUS/dp-2_tp-2_pp-4_mbz-2/log_metrics.csv +21 -0
- llama-1B/16_GPUS/dp-2_tp-2_pp-4_mbz-2/profiler.csv +2 -0
- llama-1B/16_GPUS/dp-2_tp-2_pp-4_mbz-4/log_metrics.csv +21 -0
- llama-1B/16_GPUS/dp-2_tp-2_pp-4_mbz-4/profiler.csv +2 -0
- llama-1B/16_GPUS/dp-2_tp-2_pp-4_mbz-8/log_metrics.csv +21 -0
- llama-1B/16_GPUS/dp-2_tp-2_pp-4_mbz-8/profiler.csv +2 -0
- llama-1B/16_GPUS/dp-2_tp-4_pp-2_mbz-16/log_metrics.csv +21 -0
- llama-1B/16_GPUS/dp-2_tp-4_pp-2_mbz-16/profiler.csv +2 -0
- llama-1B/16_GPUS/dp-2_tp-4_pp-2_mbz-4/log_metrics.csv +21 -0
- llama-1B/16_GPUS/dp-2_tp-4_pp-2_mbz-4/profiler.csv +2 -0
- llama-1B/16_GPUS/dp-2_tp-4_pp-2_mbz-8/log_metrics.csv +21 -0
- llama-1B/16_GPUS/dp-2_tp-4_pp-2_mbz-8/profiler.csv +2 -0
- llama-1B/16_GPUS/dp-2_tp-8_pp-1_mbz-16/log_metrics.csv +21 -0
- llama-1B/16_GPUS/dp-2_tp-8_pp-1_mbz-16/profiler.csv +2 -0
- llama-1B/16_GPUS/dp-2_tp-8_pp-1_mbz-32/log_metrics.csv +21 -0
- llama-1B/16_GPUS/dp-2_tp-8_pp-1_mbz-32/profiler.csv +2 -0
- llama-1B/16_GPUS/dp-4_tp-1_pp-4_mbz-1/log_metrics.csv +21 -0
- llama-1B/16_GPUS/dp-4_tp-1_pp-4_mbz-1/profiler.csv +2 -0
- llama-1B/16_GPUS/dp-4_tp-1_pp-4_mbz-2/profiler.csv +1 -1
- llama-1B/16_GPUS/dp-4_tp-1_pp-4_mbz-4/log_metrics.csv +21 -0
llama-1B/16_GPUS/16_GPUS_summary_results.csv
ADDED
@@ -0,0 +1,119 @@
+model,run_name,status,nnodes,dp,tp,pp,batch_accumulation_per_replica,micro_batch_size,tok/s/gpu,mfu,forward,backward
+llama-1B,dp-1_tp-8_pp-2_mbz-1,,2,1,8,2,1024,1,-1,-1,,
+llama-1B,dp-1_tp-8_pp-2_mbz-2,,2,1,8,2,512,2,-1,-1,,
+llama-1B,dp-1_tp-8_pp-2_mbz-4,,2,1,8,2,256,4,-1,-1,,
+llama-1B,dp-1_tp-8_pp-2_mbz-8,,2,1,8,2,128,8,-1,-1,,
+llama-1B,dp-1_tp-8_pp-2_mbz-16,,2,1,8,2,64,16,-1,-1,,
+llama-1B,dp-1_tp-8_pp-2_mbz-32,,2,1,8,2,32,32,-1,-1,,
+llama-1B,dp-1_tp-8_pp-2_mbz-64,,2,1,8,2,16,64,-1,-1,,
+llama-1B,dp-1_tp-8_pp-2_mbz-128,,2,1,8,2,8,128,-1,-1,,
+llama-1B,dp-1_tp-8_pp-2_mbz-256,,2,1,8,2,4,256,-1,-1,,
+llama-1B,dp-1_tp-8_pp-2_mbz-512,,2,1,8,2,2,512,-1,-1,,
+llama-1B,dp-1_tp-8_pp-2_mbz-1024,,2,1,8,2,1,1024,-1,-1,,
+llama-1B,dp-1_tp-1_pp-16_mbz-1,,2,1,1,16,1024,1,-1,-1,,
+llama-1B,dp-1_tp-1_pp-16_mbz-2,,2,1,1,16,512,2,-1,-1,,
+llama-1B,dp-1_tp-1_pp-16_mbz-4,,2,1,1,16,256,4,-1,-1,,
+llama-1B,dp-1_tp-1_pp-16_mbz-8,,2,1,1,16,128,8,-1,-1,,
+llama-1B,dp-1_tp-1_pp-16_mbz-16,,2,1,1,16,64,16,-1,-1,,
+llama-1B,dp-1_tp-1_pp-16_mbz-32,,2,1,1,16,32,32,-1,-1,,
+llama-1B,dp-1_tp-1_pp-16_mbz-64,,2,1,1,16,16,64,-1,-1,,
+llama-1B,dp-16_tp-1_pp-1_mbz-1,,2,16,1,1,64,1,-1,-1,,
+llama-1B,dp-16_tp-1_pp-1_mbz-2,,2,16,1,1,32,2,-1,-1,,
+llama-1B,dp-16_tp-1_pp-1_mbz-4,,2,16,1,1,16,4,-1,-1,,
+llama-1B,dp-16_tp-1_pp-1_mbz-8,,2,16,1,1,8,8,-1,-1,,
+llama-1B,dp-16_tp-1_pp-1_mbz-16,,2,16,1,1,4,16,-1,-1,,
+llama-1B,dp-16_tp-1_pp-1_mbz-32,,2,16,1,1,2,32,-1,-1,,
+llama-1B,dp-16_tp-1_pp-1_mbz-64,,2,16,1,1,1,64,-1,-1,,
+llama-1B,dp-1_tp-4_pp-4_mbz-1,,2,1,4,4,1024,1,-1,-1,,
+llama-1B,dp-1_tp-4_pp-4_mbz-2,,2,1,4,4,512,2,-1,-1,,
+llama-1B,dp-1_tp-4_pp-4_mbz-4,,2,1,4,4,256,4,-1,-1,,
+llama-1B,dp-1_tp-4_pp-4_mbz-8,,2,1,4,4,128,8,-1,-1,,
+llama-1B,dp-1_tp-4_pp-4_mbz-16,,2,1,4,4,64,16,-1,-1,,
+llama-1B,dp-1_tp-4_pp-4_mbz-32,,2,1,4,4,32,32,-1,-1,,
+llama-1B,dp-1_tp-4_pp-4_mbz-64,,2,1,4,4,16,64,-1,-1,,
+llama-1B,dp-1_tp-4_pp-4_mbz-128,,2,1,4,4,8,128,-1,-1,,
+llama-1B,dp-1_tp-4_pp-4_mbz-256,,2,1,4,4,4,256,-1,-1,,
+llama-1B,dp-8_tp-2_pp-1_mbz-1,,2,8,2,1,128,1,-1,-1,,
+llama-1B,dp-8_tp-2_pp-1_mbz-2,,2,8,2,1,64,2,-1,-1,,
+llama-1B,dp-8_tp-2_pp-1_mbz-4,,2,8,2,1,32,4,-1,-1,,
+llama-1B,dp-8_tp-2_pp-1_mbz-8,,2,8,2,1,16,8,-1,-1,,
+llama-1B,dp-8_tp-2_pp-1_mbz-16,,2,8,2,1,8,16,-1,-1,,
+llama-1B,dp-8_tp-2_pp-1_mbz-32,,2,8,2,1,4,32,-1,-1,,
+llama-1B,dp-8_tp-2_pp-1_mbz-64,,2,8,2,1,2,64,-1,-1,,
+llama-1B,dp-8_tp-2_pp-1_mbz-128,,2,8,2,1,1,128,-1,-1,,
+llama-1B,dp-4_tp-1_pp-4_mbz-1,,2,4,1,4,256,1,-1,-1,,
+llama-1B,dp-4_tp-1_pp-4_mbz-2,,2,4,1,4,128,2,-1,-1,,
+llama-1B,dp-4_tp-1_pp-4_mbz-4,,2,4,1,4,64,4,-1,-1,,
+llama-1B,dp-4_tp-1_pp-4_mbz-8,,2,4,1,4,32,8,-1,-1,,
+llama-1B,dp-4_tp-1_pp-4_mbz-16,,2,4,1,4,16,16,-1,-1,,
+llama-1B,dp-4_tp-1_pp-4_mbz-32,,2,4,1,4,8,32,-1,-1,,
+llama-1B,dp-4_tp-1_pp-4_mbz-64,,2,4,1,4,4,64,-1,-1,,
+llama-1B,dp-8_tp-1_pp-2_mbz-1,,2,8,1,2,128,1,-1,-1,,
+llama-1B,dp-8_tp-1_pp-2_mbz-2,,2,8,1,2,64,2,-1,-1,,
+llama-1B,dp-8_tp-1_pp-2_mbz-4,,2,8,1,2,32,4,-1,-1,,
+llama-1B,dp-8_tp-1_pp-2_mbz-8,,2,8,1,2,16,8,-1,-1,,
+llama-1B,dp-8_tp-1_pp-2_mbz-16,,2,8,1,2,8,16,-1,-1,,
+llama-1B,dp-8_tp-1_pp-2_mbz-32,,2,8,1,2,4,32,-1,-1,,
+llama-1B,dp-8_tp-1_pp-2_mbz-64,,2,8,1,2,2,64,-1,-1,,
+llama-1B,dp-8_tp-1_pp-2_mbz-128,,2,8,1,2,1,128,-1,-1,,
+llama-1B,dp-4_tp-4_pp-1_mbz-1,,2,4,4,1,256,1,-1,-1,,
+llama-1B,dp-4_tp-4_pp-1_mbz-2,,2,4,4,1,128,2,-1,-1,,
+llama-1B,dp-4_tp-4_pp-1_mbz-4,,2,4,4,1,64,4,-1,-1,,
+llama-1B,dp-4_tp-4_pp-1_mbz-8,,2,4,4,1,32,8,-1,-1,,
+llama-1B,dp-4_tp-4_pp-1_mbz-16,,2,4,4,1,16,16,-1,-1,,
+llama-1B,dp-4_tp-4_pp-1_mbz-32,,2,4,4,1,8,32,-1,-1,,
+llama-1B,dp-4_tp-4_pp-1_mbz-64,,2,4,4,1,4,64,-1,-1,,
+llama-1B,dp-4_tp-4_pp-1_mbz-128,,2,4,4,1,2,128,-1,-1,,
+llama-1B,dp-4_tp-4_pp-1_mbz-256,,2,4,4,1,1,256,-1,-1,,
+llama-1B,dp-2_tp-2_pp-4_mbz-1,,2,2,2,4,512,1,-1,-1,,
+llama-1B,dp-2_tp-2_pp-4_mbz-2,,2,2,2,4,256,2,-1,-1,,
+llama-1B,dp-2_tp-2_pp-4_mbz-4,,2,2,2,4,128,4,-1,-1,,
+llama-1B,dp-2_tp-2_pp-4_mbz-8,,2,2,2,4,64,8,-1,-1,,
+llama-1B,dp-2_tp-2_pp-4_mbz-16,,2,2,2,4,32,16,-1,-1,,
+llama-1B,dp-2_tp-2_pp-4_mbz-32,,2,2,2,4,16,32,-1,-1,,
+llama-1B,dp-2_tp-2_pp-4_mbz-64,,2,2,2,4,8,64,-1,-1,,
+llama-1B,dp-2_tp-2_pp-4_mbz-128,,2,2,2,4,4,128,-1,-1,,
+llama-1B,dp-2_tp-8_pp-1_mbz-1,,2,2,8,1,512,1,-1,-1,,
+llama-1B,dp-2_tp-8_pp-1_mbz-2,,2,2,8,1,256,2,-1,-1,,
+llama-1B,dp-2_tp-8_pp-1_mbz-4,,2,2,8,1,128,4,-1,-1,,
+llama-1B,dp-2_tp-8_pp-1_mbz-8,,2,2,8,1,64,8,-1,-1,,
+llama-1B,dp-2_tp-8_pp-1_mbz-16,,2,2,8,1,32,16,-1,-1,,
+llama-1B,dp-2_tp-8_pp-1_mbz-32,,2,2,8,1,16,32,-1,-1,,
+llama-1B,dp-2_tp-8_pp-1_mbz-64,,2,2,8,1,8,64,-1,-1,,
+llama-1B,dp-2_tp-8_pp-1_mbz-128,,2,2,8,1,4,128,-1,-1,,
+llama-1B,dp-2_tp-8_pp-1_mbz-256,,2,2,8,1,2,256,-1,-1,,
+llama-1B,dp-2_tp-8_pp-1_mbz-512,,2,2,8,1,1,512,-1,-1,,
+llama-1B,dp-1_tp-2_pp-8_mbz-1,,2,1,2,8,1024,1,-1,-1,,
+llama-1B,dp-1_tp-2_pp-8_mbz-2,,2,1,2,8,512,2,-1,-1,,
+llama-1B,dp-1_tp-2_pp-8_mbz-4,,2,1,2,8,256,4,-1,-1,,
+llama-1B,dp-1_tp-2_pp-8_mbz-8,,2,1,2,8,128,8,-1,-1,,
+llama-1B,dp-1_tp-2_pp-8_mbz-16,,2,1,2,8,64,16,-1,-1,,
+llama-1B,dp-1_tp-2_pp-8_mbz-32,,2,1,2,8,32,32,-1,-1,,
+llama-1B,dp-1_tp-2_pp-8_mbz-64,,2,1,2,8,16,64,-1,-1,,
+llama-1B,dp-1_tp-2_pp-8_mbz-128,,2,1,2,8,8,128,-1,-1,,
+llama-1B,dp-2_tp-1_pp-8_mbz-1,,2,2,1,8,512,1,-1,-1,,
+llama-1B,dp-2_tp-1_pp-8_mbz-2,,2,2,1,8,256,2,-1,-1,,
+llama-1B,dp-2_tp-1_pp-8_mbz-4,,2,2,1,8,128,4,-1,-1,,
+llama-1B,dp-2_tp-1_pp-8_mbz-8,,2,2,1,8,64,8,-1,-1,,
+llama-1B,dp-2_tp-1_pp-8_mbz-16,,2,2,1,8,32,16,-1,-1,,
+llama-1B,dp-2_tp-1_pp-8_mbz-32,,2,2,1,8,16,32,-1,-1,,
+llama-1B,dp-2_tp-1_pp-8_mbz-64,,2,2,1,8,8,64,-1,-1,,
+llama-1B,dp-2_tp-4_pp-2_mbz-1,,2,2,4,2,512,1,-1,-1,,
+llama-1B,dp-2_tp-4_pp-2_mbz-2,,2,2,4,2,256,2,-1,-1,,
+llama-1B,dp-2_tp-4_pp-2_mbz-4,,2,2,4,2,128,4,-1,-1,,
+llama-1B,dp-2_tp-4_pp-2_mbz-8,,2,2,4,2,64,8,-1,-1,,
+llama-1B,dp-2_tp-4_pp-2_mbz-16,,2,2,4,2,32,16,-1,-1,,
+llama-1B,dp-2_tp-4_pp-2_mbz-32,,2,2,4,2,16,32,-1,-1,,
+llama-1B,dp-2_tp-4_pp-2_mbz-64,,2,2,4,2,8,64,-1,-1,,
+llama-1B,dp-2_tp-4_pp-2_mbz-128,,2,2,4,2,4,128,-1,-1,,
+llama-1B,dp-2_tp-4_pp-2_mbz-256,,2,2,4,2,2,256,-1,-1,,
+llama-1B,dp-2_tp-4_pp-2_mbz-512,,2,2,4,2,1,512,-1,-1,,
+llama-1B,dp-4_tp-2_pp-2_mbz-1,,2,4,2,2,256,1,-1,-1,,
+llama-1B,dp-4_tp-2_pp-2_mbz-2,,2,4,2,2,128,2,-1,-1,,
+llama-1B,dp-4_tp-2_pp-2_mbz-4,,2,4,2,2,64,4,-1,-1,,
+llama-1B,dp-4_tp-2_pp-2_mbz-8,,2,4,2,2,32,8,-1,-1,,
+llama-1B,dp-4_tp-2_pp-2_mbz-16,,2,4,2,2,16,16,-1,-1,,
+llama-1B,dp-4_tp-2_pp-2_mbz-32,,2,4,2,2,8,32,-1,-1,,
+llama-1B,dp-4_tp-2_pp-2_mbz-64,,2,4,2,2,4,64,-1,-1,,
+llama-1B,dp-4_tp-2_pp-2_mbz-128,,2,4,2,2,2,128,-1,-1,,
+llama-1B,dp-4_tp-2_pp-2_mbz-256,,2,4,2,2,1,256,-1,-1,,
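As a quick orientation to the schema above, here is a minimal sketch (assuming pandas is installed and the repository is checked out locally; the column names are exactly those in the header row) of how the summary could be loaded and completed configurations ranked by per-GPU throughput. The -1 values appear to act as placeholders for runs whose metrics have not been recorded yet.

import pandas as pd

# Load the per-configuration summary written for the 16-GPU sweep.
df = pd.read_csv("llama-1B/16_GPUS/16_GPUS_summary_results.csv")

# tok/s/gpu and mfu are -1 until a run's metrics are filled in, so keep only completed rows.
completed = df[df["tok/s/gpu"] > 0]
cols = ["run_name", "dp", "tp", "pp", "micro_batch_size", "tok/s/gpu", "mfu"]
print(completed.sort_values("tok/s/gpu", ascending=False)[cols].head(10))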
llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-1/log_metrics.csv
ADDED
@@ -0,0 +1,21 @@
+iteration,consumed_tokens,elapsed_time_per_iteration_ms,tokens_per_sec,tokens_per_sec_per_gpu,global_batch_size,lm_loss,lr,model_tflops_per_gpu,hardware_tflops_per_gpu,grad_norm,memory_usage_MiB,peak_allocated_MiB,peak_reserved_MiB
+1,4190000.0000000005,21800.0,192000.0,12000.0,1020.0,11.3,0.0001,109.0,109.0,33.1,7256.42,15785.37,18632.0
+2,8390000.0,10800.0,387000.0,24200.0,1020.0,11.3,9.53e-05,220.0,220.0,33.3,7256.42,15785.37,18632.0
+3,12600000.0,11000.0,382000.0,23900.0,1020.0,16.0,9.05e-05,216.0,216.0,249.0,7256.42,15785.37,18632.0
+4,16800000.0,13300.0,316000.0,19700.0,1020.0,15.1,8.58e-05,179.0,179.0,41.6,7256.39,11621.72,18632.0
+5,21000000.0,13300.0,314000.0,19600.0,1020.0,10.8,8.11e-05,178.0,178.0,26.0,7256.39,15785.37,18632.0
+6,25200000.0,13200.0,319000.0,19900.0,1020.0,10.8,7.63e-05,181.0,181.0,18.9,7256.39,15785.37,18632.0
+7,29400000.0,10800.0,388000.0,24200.0,1020.0,10.2,7.16e-05,220.0,220.0,7.97,7256.39,15785.37,18632.0
+8,33600000.0,10800.0,388000.0,24200.0,1020.0,9.16,6.68e-05,220.0,220.0,6.46,7256.39,15785.37,18632.0
+9,37700000.0,11000.0,383000.0,23900.0,1020.0,11.2,6.21e-05,217.0,217.0,59.7,7256.39,15785.37,18632.0
+10,41900000.0,10800.0,387000.0,24200.0,1020.0,9.59,5.74e-05,219.0,219.0,44.0,7256.39,15785.37,18632.0
+11,46100000.0,10900.0,386000.0,24100.0,1020.0,8.08,5.26e-05,219.0,219.0,8.41,7256.39,15785.37,18632.0
+12,50300000.0,10900.0,384000.0,24000.0,1020.0,7.86,4.79e-05,218.0,218.0,5.09,7256.39,15785.37,18632.0
+13,54500000.0,11000.0,382000.0,23900.0,1020.0,7.7,4.32e-05,217.0,217.0,4.71,7256.39,15785.37,18632.0
+14,58700000.0,11000.0,381000.0,23800.0,1020.0,7.56,3.84e-05,216.0,216.0,5.14,7256.39,15785.37,18632.0
+15,62900000.0,11000.0,381000.0,23800.0,1020.0,7.4,3.37e-05,216.0,216.0,5.16,7256.39,15785.37,18632.0
+16,67099999.99999999,10900.0,386000.0,24100.0,1020.0,7.29,2.89e-05,219.0,219.0,5.26,7256.39,15785.37,18632.0
+17,71300000.0,11000.0,380000.0,23700.0,1020.0,7.22,2.42e-05,215.0,215.0,5.18,7256.39,15785.37,18632.0
+18,75500000.0,11100.0,378000.0,23600.0,1020.0,7.15,1.95e-05,214.0,214.0,5.04,7256.39,15785.37,18632.0
+19,79700000.0,11000.0,382000.0,23900.0,1020.0,7.08,1.47e-05,217.0,217.0,3.85,7256.39,15785.37,18632.0
+20,83900000.0,10800.0,388000.0,24200.0,1020.0,7.03,1e-05,220.0,220.0,2.9,,,
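A small, hedged sketch (pandas assumed; the path and column names are taken from the file above) of how one might summarise a single run's log_metrics.csv, skipping the first iteration since it includes warm-up overhead:

import pandas as pd

m = pd.read_csv("llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-1/log_metrics.csv")
# Iteration 1 is much slower than the rest (warm-up), so average from iteration 2 onwards.
steady = m[m["iteration"] > 1]
print(steady["tokens_per_sec_per_gpu"].mean())
print(steady["model_tflops_per_gpu"].mean())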
llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-1/profiler.csv
ADDED
@@ -0,0 +1,2 @@
+forward,backward
+0ms 959μs,1ms 7μs
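The profiler.csv files store the forward and backward timings as strings such as "0ms 959μs". A minimal parsing sketch (standard library only; the helper name is illustrative) that converts such a value to microseconds:

import re

def to_microseconds(value: str) -> int:
    # Parse a duration like "1ms 7μs" into an integer number of microseconds.
    ms = re.search(r"(\d+)ms", value)
    us = re.search(r"(\d+)μs", value)
    return (int(ms.group(1)) if ms else 0) * 1000 + (int(us.group(1)) if us else 0)

print(to_microseconds("0ms 959μs"))  # 959
print(to_microseconds("1ms 7μs"))    # 1007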
llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-2/log_metrics.csv
ADDED
@@ -0,0 +1,21 @@
+iteration,consumed_tokens,elapsed_time_per_iteration_ms,tokens_per_sec,tokens_per_sec_per_gpu,global_batch_size,lm_loss,lr,model_tflops_per_gpu,hardware_tflops_per_gpu,grad_norm,memory_usage_MiB,peak_allocated_MiB,peak_reserved_MiB
+1,4190000.0000000005,18900.0,222000.0,13900.0,1020.0,11.3,0.0001,126.0,126.0,33.1,7252.46,25075.44,27568.0
+2,8390000.0,9350.0,449000.0,28000.0,1020.0,11.3,9.53e-05,254.0,254.0,33.3,7252.46,25075.44,27568.0
+3,12600000.0,9080.0,462000.0,28900.0,1020.0,16.0,9.05e-05,262.0,262.0,249.0,7252.46,25075.44,27568.0
+4,16800000.0,9640.0,435000.0,27200.0,1020.0,15.1,8.58e-05,247.0,247.0,41.6,7252.45,11617.76,27568.0
+5,21000000.0,9500.0,441000.0,27600.0,1020.0,10.8,8.11e-05,250.0,250.0,25.9,7252.45,25075.44,27568.0
+6,25200000.0,9730.0,431000.0,26900.0,1020.0,10.8,7.63e-05,244.0,244.0,18.9,7252.45,25075.44,27568.0
+7,29400000.0,8900.0,471000.0,29500.0,1020.0,10.2,7.16e-05,267.0,267.0,7.97,7252.45,25075.44,27568.0
+8,33600000.0,8860.0,473000.0,29600.0,1020.0,9.15,6.68e-05,268.0,268.0,6.46,7252.45,25075.44,27568.0
+9,37700000.0,9080.0,462000.0,28900.0,1020.0,11.2,6.21e-05,262.0,262.0,59.7,7252.45,25075.44,27568.0
+10,41900000.0,9130.0,459000.0,28700.0,1020.0,9.6,5.74e-05,260.0,260.0,44.2,7252.45,25075.44,27568.0
+11,46100000.0,9390.0,447000.0,27900.0,1020.0,8.08,5.26e-05,253.0,253.0,8.69,7252.45,25075.44,27568.0
+12,50300000.0,8910.0,471000.0,29400.0,1020.0,7.86,4.79e-05,267.0,267.0,5.1,7252.45,25075.44,27568.0
+13,54500000.0,9060.0,463000.0,28900.0,1020.0,7.7,4.32e-05,263.0,263.0,4.73,7252.45,25075.44,27568.0
+14,58700000.0,9030.0,464000.0,29000.0,1020.0,7.56,3.84e-05,263.0,263.0,5.09,7252.45,25075.44,27568.0
+15,62900000.0,9030.0,464000.0,29000.0,1020.0,7.4,3.37e-05,263.0,263.0,5.16,7252.45,25075.44,27568.0
+16,67099999.99999999,9270.0,453000.0,28300.0,1020.0,7.3,2.89e-05,257.0,257.0,5.15,7252.45,25075.44,27568.0
+17,71300000.0,9240.0,454000.0,28400.0,1020.0,7.22,2.42e-05,258.0,258.0,5.14,7252.45,25075.44,27568.0
+18,75500000.0,9120.0,460000.0,28800.0,1020.0,7.15,1.95e-05,261.0,261.0,5.04,7252.45,25075.44,27568.0
+19,79700000.0,9110.0,460000.0,28800.0,1020.0,7.08,1.47e-05,261.0,261.0,3.86,7252.45,25075.44,27568.0
+20,83900000.0,8990.0,467000.0,29200.0,1020.0,7.03,1e-05,265.0,265.0,2.94,,,
llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-2/profiler.csv
ADDED
@@ -0,0 +1,2 @@
+forward,backward
+0ms 881μs,1ms 507μs
llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-4/log_metrics.csv
ADDED
@@ -0,0 +1,21 @@
+iteration,consumed_tokens,elapsed_time_per_iteration_ms,tokens_per_sec,tokens_per_sec_per_gpu,global_batch_size,lm_loss,lr,model_tflops_per_gpu,hardware_tflops_per_gpu,grad_norm,memory_usage_MiB,peak_allocated_MiB,peak_reserved_MiB
+1,4190000.0000000005,20500.0,205000.0,12800.0,1020.0,11.3,0.0001,116.0,116.0,33.1,7252.59,42900.39,44042.0
+2,8390000.0,8800.0,476000.0,29800.0,1020.0,11.3,9.53e-05,270.0,270.0,33.3,7252.59,42900.39,44042.0
+3,12600000.0,8770.0,478000.0,29900.0,1020.0,16.0,9.05e-05,271.0,271.0,249.0,7252.59,42900.39,44042.0
+4,16800000.0,8870.0,473000.0,29600.0,1020.0,15.1,8.58e-05,268.0,268.0,41.6,7252.58,11617.89,44042.0
+5,21000000.0,8870.0,473000.0,29600.0,1020.0,10.8,8.11e-05,268.0,268.0,26.0,7252.58,42900.39,44042.0
+6,25200000.0,8820.0,475000.0,29700.0,1020.0,10.8,7.63e-05,270.0,270.0,18.9,7252.58,42900.39,44042.0
+7,29400000.0,8500.0,494000.0,30800.0,1020.0,10.2,7.16e-05,280.0,280.0,7.97,7252.58,42900.39,44042.0
+8,33600000.0,8580.0,489000.0,30500.0,1020.0,9.15,6.68e-05,277.0,277.0,6.46,7252.58,42900.39,44042.0
+9,37700000.0,8650.0,485000.0,30300.0,1020.0,11.2,6.21e-05,275.0,275.0,59.8,7252.58,42900.39,44042.0
+10,41900000.0,8790.0,477000.0,29800.0,1020.0,9.6,5.74e-05,271.0,271.0,44.2,7252.58,42900.39,44042.0
+11,46100000.0,8600.0,488000.0,30500.0,1020.0,8.08,5.26e-05,277.0,277.0,8.6,7252.58,42900.39,44042.0
+12,50300000.0,8660.0,484000.0,30300.0,1020.0,7.86,4.79e-05,275.0,275.0,5.09,7252.58,42900.39,44042.0
+13,54500000.0,8640.0,485000.0,30300.0,1020.0,7.7,4.32e-05,275.0,275.0,4.73,7252.58,42900.39,44042.0
+14,58700000.0,8820.0,476000.0,29700.0,1020.0,7.56,3.84e-05,270.0,270.0,5.1,7252.58,42900.39,44042.0
+15,62900000.0,8680.0,483000.0,30200.0,1020.0,7.4,3.37e-05,274.0,274.0,5.17,7252.58,42900.39,44042.0
+16,67099999.99999999,8660.0,485000.0,30300.0,1020.0,7.3,2.89e-05,275.0,275.0,5.17,7252.58,42900.39,44042.0
+17,71300000.0,8730.0,480000.0,30000.0,1020.0,7.22,2.42e-05,272.0,272.0,5.13,7252.58,42900.39,44042.0
+18,75500000.0,8630.0,486000.0,30400.0,1020.0,7.15,1.95e-05,276.0,276.0,5.04,7252.58,42900.39,44042.0
+19,79700000.0,8710.0,481000.0,30100.0,1020.0,7.08,1.47e-05,273.0,273.0,3.87,7252.58,42900.39,44042.0
+20,83900000.0,8790.0,477000.0,29800.0,1020.0,7.03,1e-05,271.0,271.0,2.93,,,
llama-1B/16_GPUS/dp-16_tp-1_pp-1_mbz-4/profiler.csv
ADDED
@@ -0,0 +1,2 @@
+forward,backward
+0ms 971μs,1ms 187μs
llama-1B/16_GPUS/dp-1_tp-1_pp-16_mbz-1/log_metrics.csv
ADDED
@@ -0,0 +1,21 @@
+iteration,consumed_tokens,elapsed_time_per_iteration_ms,tokens_per_sec,tokens_per_sec_per_gpu,global_batch_size,lm_loss,lr,model_tflops_per_gpu,hardware_tflops_per_gpu,grad_norm,memory_usage_MiB,peak_allocated_MiB,peak_reserved_MiB
+1,4190000.0000000005,105000.0,40000.0,2500.0,1020.0,11.1,0.0001,22.7,22.7,25.6,3274.04,3274.05,13040.0
+2,8390000.0,50800.0,82500.0,5160.0,1020.0,11.1,9.53e-05,46.8,46.8,25.9,3274.04,12603.56,13042.0
+3,12600000.0,50900.0,82300.0,5150.0,1020.0,9.9,9.05e-05,46.7,46.7,40.4,3274.04,3274.05,13042.0
+4,16800000.0,47800.0,87700.0,5480.0,1020.0,11.9,8.58e-05,49.7,49.7,61.2,3274.04,12603.56,13042.0
+5,21000000.0,46600.0,90100.0,5630.0,1020.0,9.05,8.11e-05,51.1,51.1,8.31,,,
+6,25200000.0,51500.0,81500.0,5090.0,1020.0,8.86,7.63e-05,46.2,46.2,6.63,3274.04,12603.56,13042.0
+7,29400000.0,534000.0,7850.0,491.0,1020.0,8.37,7.16e-05,4.45,4.45,4.93,3274.04,12603.56,13042.0
+8,33600000.0,53800.0,78000.0,4880.0,1020.0,7.97,6.68e-05,44.2,44.2,3.13,3274.04,12603.56,13042.0
+9,37700000.0,47500.0,88400.0,5520.0,1020.0,7.83,6.21e-05,50.1,50.1,9.04,3274.04,12603.56,13042.0
+10,41900000.0,48000.0,87300.0,5460.0,1020.0,7.62,5.74e-05,49.5,49.5,5.09,3274.04,12603.56,13042.0
+11,46100000.0,47900.0,87600.0,5470.0,1020.0,7.47,5.26e-05,49.7,49.7,4.06,,,
+12,50300000.0,45700.0,91700.0,5730.0,1020.0,7.34,4.79e-05,52.0,52.0,3.12,3274.04,12603.56,13042.0
+13,54500000.0,48000.0,87400.0,5460.0,1020.0,7.23,4.32e-05,49.6,49.6,2.73,3274.04,12603.56,13042.0
+14,58700000.0,44700.0,93800.0,5860.0,1020.0,7.14,3.84e-05,53.2,53.2,2.33,3274.04,12603.56,13042.0
+15,62900000.0,46200.0,90800.0,5680.0,1020.0,7.06,3.37e-05,51.5,51.5,2.48,3274.04,12603.56,13042.0
+16,67099999.99999999,47000.0,89200.0,5580.0,1020.0,6.98,2.89e-05,50.6,50.6,2.66,3274.04,12603.56,13042.0
+17,71300000.0,46000.0,91100.0,5690.0,1020.0,6.9,2.42e-05,51.7,51.7,1.89,,,
+18,75500000.0,43500.0,96400.0,6030.0,1020.0,6.84,1.95e-05,54.7,54.7,1.61,3274.04,12603.56,13042.0
+19,79700000.0,47500.0,88200.0,5510.0,1020.0,6.8,1.47e-05,50.0,50.0,1.85,,,
+20,83900000.0,44700.0,93900.0,5870.0,1020.0,6.76,1e-05,53.2,53.2,1.81,,,
llama-1B/16_GPUS/dp-1_tp-1_pp-16_mbz-1/profiler.csv
CHANGED
@@ -1,2 +1,2 @@
 forward,backward
-0ms
+0ms 944μs,1ms 86μs
llama-1B/16_GPUS/dp-1_tp-1_pp-16_mbz-2/log_metrics.csv
ADDED
@@ -0,0 +1,21 @@
+iteration,consumed_tokens,elapsed_time_per_iteration_ms,tokens_per_sec,tokens_per_sec_per_gpu,global_batch_size,lm_loss,lr,model_tflops_per_gpu,hardware_tflops_per_gpu,grad_norm,memory_usage_MiB,peak_allocated_MiB,peak_reserved_MiB
+1,4190000.0000000005,90800.0,46200.0,2890.0,1020.0,11.1,0.0001,26.2,26.2,25.6,3274.08,3274.08,23444.0
+2,8390000.0,31700.0,132000.0,8270.0,1020.0,11.1,9.53e-05,75.1,75.1,25.9,3274.08,22941.07,23444.0
+3,12600000.0,31900.0,131000.0,8210.0,1020.0,9.9,9.05e-05,74.5,74.5,40.4,3274.08,22941.07,23444.0
+4,16800000.0,30400.0,138000.0,8630.0,1020.0,11.9,8.58e-05,78.3,78.3,61.2,3274.08,3274.08,23444.0
+5,21000000.0,30100.0,139000.0,8700.0,1020.0,9.05,8.11e-05,78.9,78.9,8.32,3274.08,22941.07,23444.0
+6,25200000.0,32900.0,128000.0,7970.0,1020.0,8.86,7.63e-05,72.3,72.3,6.61,3274.08,22941.07,23444.0
+7,29400000.0,272000.0,15400.0,965.0,1020.0,8.37,7.16e-05,8.75,8.75,4.93,3274.08,22941.07,23444.0
+8,33600000.0,32000.0,131000.0,8189.999999999999,1020.0,7.97,6.68e-05,74.3,74.3,3.12,3274.08,22941.07,23444.0
+9,37700000.0,32600.0,129000.0,8039.999999999999,1020.0,7.83,6.21e-05,73.0,73.0,9.04,3274.08,22941.07,23444.0
+10,41900000.0,31100.0,135000.0,8430.0,1020.0,7.62,5.74e-05,76.5,76.5,5.08,,,
+11,46100000.0,31100.0,135000.0,8440.0,1020.0,7.47,5.26e-05,76.5,76.5,4.05,3274.08,22941.07,23444.0
+12,50300000.0,32400.0,130000.0,8090.0,1020.0,7.34,4.79e-05,73.4,73.4,3.13,3274.08,22941.07,23444.0
+13,54500000.0,32000.0,131000.0,8180.0,1020.0,7.23,4.32e-05,74.2,74.2,2.74,,,
+14,58700000.0,32100.0,131000.0,8160.0,1020.0,7.14,3.84e-05,74.0,74.0,2.32,3274.08,22941.07,23444.0
+15,62900000.0,30000.0,140000.0,8740.0,1020.0,7.06,3.37e-05,79.3,79.3,2.47,3274.08,22941.07,23444.0
+16,67099999.99999999,34000.0,123000.0,7700.0,1020.0,6.98,2.89e-05,69.9,69.9,2.66,,,
+17,71300000.0,32900.0,128000.0,7970.0,1020.0,6.9,2.42e-05,72.4,72.4,1.88,3274.08,22941.07,23444.0
+18,75500000.0,31500.0,133000.0,8340.0,1020.0,6.84,1.95e-05,75.6,75.6,1.61,3274.08,22941.07,23444.0
+19,79700000.0,31900.0,132000.0,8230.0,1020.0,6.8,1.47e-05,74.7,74.7,1.83,3274.08,22941.07,23444.0
+20,83900000.0,31600.0,133000.0,8290.0,1020.0,6.77,1e-05,75.2,75.2,1.82,,,
llama-1B/16_GPUS/dp-1_tp-1_pp-16_mbz-2/profiler.csv
ADDED
@@ -0,0 +1,2 @@
+forward,backward
+0ms 956μs,1ms 174μs
llama-1B/16_GPUS/dp-1_tp-1_pp-16_mbz-4/log_metrics.csv
ADDED
@@ -0,0 +1,21 @@
+iteration,consumed_tokens,elapsed_time_per_iteration_ms,tokens_per_sec,tokens_per_sec_per_gpu,global_batch_size,lm_loss,lr,model_tflops_per_gpu,hardware_tflops_per_gpu,grad_norm,memory_usage_MiB,peak_allocated_MiB,peak_reserved_MiB
+1,4190000.0000000005,81400.0,51500.0,3220.0,1020.0,11.1,0.0001,29.2,29.2,25.6,3274.15,3274.15,43028.0
+2,8390000.0,30800.0,136000.0,8500.0,1020.0,11.1,9.53e-05,77.1,77.1,25.9,3274.15,3274.15,43028.0
+3,12600000.0,28800.0,145000.0,9090.0,1020.0,9.9,9.05e-05,82.5,82.5,40.4,3274.15,3274.15,43028.0
+4,16800000.0,30000.0,140000.0,8750.0,1020.0,11.9,8.58e-05,79.4,79.4,61.2,3274.15,42592.08,43028.0
+5,21000000.0,31800.0,132000.0,8250.0,1020.0,9.05,8.11e-05,74.9,74.9,8.31,,,
+6,25200000.0,30600.0,137000.0,8550.0,1020.0,8.85,7.63e-05,77.6,77.6,6.61,3274.15,42592.08,43028.0
+7,29400000.0,146000.0,28700.0,1790.0,1020.0,8.37,7.16e-05,16.3,16.3,4.93,,,
+8,33600000.0,29800.0,141000.0,8810.0,1020.0,7.97,6.68e-05,79.9,79.9,3.12,3274.15,42592.08,43028.0
+9,37700000.0,31500.0,133000.0,8320.0,1020.0,7.83,6.21e-05,75.5,75.5,9.04,,,
+10,41900000.0,31400.0,134000.0,8350.0,1020.0,7.62,5.74e-05,75.8,75.8,5.09,3274.15,42592.08,43028.0
+11,46100000.0,30300.0,138000.0,8650.0,1020.0,7.47,5.26e-05,78.5,78.5,4.06,3274.15,42592.08,43028.0
+12,50300000.0,32000.0,131000.0,8180.0,1020.0,7.34,4.79e-05,74.2,74.2,3.13,,,
+13,54500000.0,32400.0,129000.0,8080.0,1020.0,7.23,4.32e-05,73.4,73.4,2.73,3274.15,42592.08,43028.0
+14,58700000.0,30900.0,136000.0,8480.0,1020.0,7.14,3.84e-05,77.0,77.0,2.33,3274.15,42592.08,43028.0
+15,62900000.0,30600.0,137000.0,8560.0,1020.0,7.06,3.37e-05,77.7,77.7,2.47,3274.15,42592.08,43028.0
+16,67099999.99999999,32100.0,131000.0,8170.0,1020.0,6.98,2.89e-05,74.1,74.1,2.69,3274.15,42592.08,43028.0
+17,71300000.0,31300.0,134000.0,8370.0,1020.0,6.9,2.42e-05,75.9,75.9,1.91,3274.15,42592.08,43028.0
+18,75500000.0,29900.0,140000.0,8770.0,1020.0,6.84,1.95e-05,79.5,79.5,1.62,3274.15,42592.08,43028.0
+19,79700000.0,31700.0,132000.0,8270.0,1020.0,6.8,1.47e-05,75.0,75.0,1.85,3274.15,42592.08,43028.0
+20,83900000.0,30800.0,136000.0,8500.0,1020.0,6.77,1e-05,77.1,77.1,1.82,,,
llama-1B/16_GPUS/dp-1_tp-1_pp-16_mbz-4/profiler.csv
ADDED
@@ -0,0 +1,2 @@
+forward,backward
+0ms 584μs,3ms 721μs
llama-1B/16_GPUS/dp-1_tp-2_pp-8_mbz-2/profiler.csv
CHANGED
@@ -1,2 +1,2 @@
 forward,backward
-0ms
+0ms 942μs,1ms 137μs
llama-1B/16_GPUS/dp-1_tp-2_pp-8_mbz-4/log_metrics.csv
ADDED
@@ -0,0 +1,21 @@
+iteration,consumed_tokens,elapsed_time_per_iteration_ms,tokens_per_sec,tokens_per_sec_per_gpu,global_batch_size,lm_loss,lr,model_tflops_per_gpu,hardware_tflops_per_gpu,grad_norm,memory_usage_MiB,peak_allocated_MiB,peak_reserved_MiB
+1,4190000.0000000005,109000.0,38400.0,2400.0,1020.0,11.2,0.0001,21.8,21.8,17.8,2393.89,24101.07,24548.0
+2,8390000.0,65600.0,63900.0,3990.0,1020.0,11.2,9.53e-05,36.2,36.2,17.8,2393.89,24101.07,24548.0
+3,12600000.0,62400.0,67200.0,4200.0,1020.0,9.62,9.05e-05,38.1,38.1,21.7,2393.89,2393.9,24548.0
+4,16800000.0,63900.0,65600.0,4100.0,1020.0,10.5,8.58e-05,37.2,37.2,45.6,,,
+5,21000000.0,65900.0,63700.0,3980.0,1020.0,9.43,8.11e-05,36.1,36.1,11.3,2393.89,24101.07,24548.0
+6,25200000.0,66600.0,63000.0,3940.0,1020.0,9.37,7.63e-05,35.7,35.7,7.69,2393.89,24101.07,24548.0
+7,29400000.0,301000.0,14000.0,872.0,1020.0,8.96,7.16e-05,7.91,7.91,5.69,2393.89,24101.07,24548.0
+8,33600000.0,63200.0,66400.0,4150.0,1020.0,8.47,6.68e-05,37.6,37.6,5.25,2393.89,24101.07,24548.0
+9,37700000.0,65200.0,64300.0,4019.9999999999995,1020.0,8.01,6.21e-05,36.5,36.5,4.65,2393.89,24101.07,24548.0
+10,41900000.0,62100.0,67600.0,4220.0,1020.0,7.75,5.74e-05,38.3,38.3,3.85,2393.89,24101.07,24548.0
+11,46100000.0,61400.0,68400.0,4270.0,1020.0,7.62,5.26e-05,38.8,38.8,4.98,2393.89,24101.07,24548.0
+12,50300000.0,63000.0,66600.0,4160.0,1020.0,7.46,4.79e-05,37.7,37.7,3.45,,,
+13,54500000.0,64200.0,65400.00000000001,4090.0,1020.0,7.34,4.32e-05,37.1,37.1,3.45,2393.89,24101.07,24548.0
+14,58700000.0,63700.0,65800.0,4110.0,1020.0,7.22,3.84e-05,37.3,37.3,3.22,2393.89,24101.07,24548.0
+15,62900000.0,64500.0,65000.0,4059.9999999999995,1020.0,7.1,3.37e-05,36.8,36.8,2.88,2393.89,24101.07,24548.0
+16,67099999.99999999,64300.0,65200.0,4080.0,1020.0,7.01,2.89e-05,37.0,37.0,2.57,2393.89,24101.07,24548.0
+17,71300000.0,63900.0,65700.0,4100.0,1020.0,6.94,2.42e-05,37.2,37.2,2.47,2393.89,24101.07,24548.0
+18,75500000.0,63700.0,65900.0,4120.0,1020.0,6.88,1.95e-05,37.4,37.4,2.49,2393.89,24101.07,24548.0
+19,79700000.0,63600.0,65900.0,4120.0,1020.0,6.83,1.47e-05,37.4,37.4,2.42,,,
+20,83900000.0,64900.00000000001,64599.99999999999,4040.0,1020.0,6.78,1e-05,36.7,36.7,2.25,,,
llama-1B/16_GPUS/dp-1_tp-2_pp-8_mbz-4/profiler.csv
ADDED
@@ -0,0 +1,2 @@
+forward,backward
+0ms 971μs,1ms 239μs
llama-1B/16_GPUS/dp-1_tp-2_pp-8_mbz-8/profiler.csv
CHANGED
@@ -1,2 +1,2 @@
 forward,backward
-0ms
+0ms 943μs,1ms 229μs
llama-1B/16_GPUS/dp-1_tp-4_pp-4_mbz-16/log_metrics.csv
ADDED
@@ -0,0 +1,21 @@
+iteration,consumed_tokens,elapsed_time_per_iteration_ms,tokens_per_sec,tokens_per_sec_per_gpu,global_batch_size,lm_loss,lr,model_tflops_per_gpu,hardware_tflops_per_gpu,grad_norm,memory_usage_MiB,peak_allocated_MiB,peak_reserved_MiB
+1,4190000.0000000005,48500.0,86500.0,5400.0,1020.0,11.2,0.0001,49.0,49.0,10.9,1778.24,47092.1,47498.0
+2,8390000.0,19800.0,212000.0,13200.0,1020.0,11.2,9.53e-05,120.0,120.0,11.0,1778.24,1778.27,47498.0
+3,12600000.0,18600.0,226000.0,14100.0,1020.0,9.83,9.05e-05,128.0,128.0,44.4,1778.24,1778.27,47498.0
+4,16800000.0,17800.0,235000.0,14700.0,1020.0,12.1,8.58e-05,133.0,133.0,24.8,,,
+5,21000000.0,17100.0,245000.0,15300.0,1020.0,10.1,8.11e-05,139.0,139.0,11.4,1778.24,47092.1,47498.0
+6,25200000.0,17100.0,245000.0,15300.0,1020.0,9.39,7.63e-05,139.0,139.0,7.05,1778.24,47092.1,47498.0
+7,29400000.0,113000.0,37000.0,2310.0,1020.0,8.7,7.16e-05,21.0,21.0,5.44,1778.24,47092.1,47498.0
+8,33600000.0,17200.0,243000.0,15200.0,1020.0,8.77,6.68e-05,138.0,138.0,18.3,,,
+9,37700000.0,17200.0,244000.0,15300.0,1020.0,8.11,6.21e-05,139.0,139.0,4.97,1778.24,47092.1,47498.0
+10,41900000.0,16900.0,249000.0,15500.0,1020.0,7.96,5.74e-05,141.0,141.0,4.62,1778.24,47092.1,47498.0
+11,46100000.0,16000.0,262000.0,16400.0,1020.0,7.84,5.26e-05,149.0,149.0,4.93,1778.24,47092.1,47498.0
+12,50300000.0,18100.0,232000.0,14500.0,1020.0,7.64,4.79e-05,132.0,132.0,4.08,1778.24,47092.1,47498.0
+13,54500000.0,17500.0,240000.0,15000.0,1020.0,7.48,4.32e-05,136.0,136.0,3.28,1778.24,47092.1,47498.0
+14,58700000.0,18200.0,230000.0,14400.0,1020.0,7.4,3.84e-05,131.0,131.0,3.52,1778.24,47092.1,47498.0
+15,62900000.0,17000.0,246000.0,15400.0,1020.0,7.29,3.37e-05,140.0,140.0,3.13,,,
+16,67099999.99999999,17600.0,239000.0,14900.0,1020.0,7.18,2.89e-05,135.0,135.0,3.12,1778.24,47092.1,47498.0
+17,71300000.0,17700.0,237000.0,14800.0,1020.0,7.09,2.42e-05,134.0,134.0,3.22,,,
+18,75500000.0,17800.0,236000.0,14700.0,1020.0,7.02,1.95e-05,134.0,134.0,3.19,1778.24,47092.1,47498.0
+19,79700000.0,18400.0,227000.0,14200.0,1020.0,6.97,1.47e-05,129.0,129.0,3.06,1778.24,47092.1,47498.0
+20,83900000.0,17600.0,239000.0,14900.0,1020.0,6.92,1e-05,135.0,135.0,2.88,,,
llama-1B/16_GPUS/dp-1_tp-4_pp-4_mbz-16/profiler.csv
ADDED
@@ -0,0 +1,2 @@
+forward,backward
+0ms 888μs,1ms 570μs
llama-1B/16_GPUS/dp-1_tp-4_pp-4_mbz-4/profiler.csv
CHANGED
@@ -1,2 +1,2 @@
 forward,backward
-
+0ms 980μs,1ms 139μs
llama-1B/16_GPUS/dp-1_tp-4_pp-4_mbz-8/log_metrics.csv
ADDED
@@ -0,0 +1,21 @@
+iteration,consumed_tokens,elapsed_time_per_iteration_ms,tokens_per_sec,tokens_per_sec_per_gpu,global_batch_size,lm_loss,lr,model_tflops_per_gpu,hardware_tflops_per_gpu,grad_norm,memory_usage_MiB,peak_allocated_MiB,peak_reserved_MiB
+1,4190000.0000000005,50800.0,82600.0,5160.0,1020.0,11.2,0.0001,46.9,46.9,10.9,1777.96,24436.91,24816.0
+2,8390000.0,15800.0,265000.0,16600.0,1020.0,11.2,9.53e-05,150.0,150.0,11.0,1777.96,24436.91,24816.0
+3,12600000.0,17300.0,242000.0,15100.0,1020.0,9.83,9.05e-05,137.0,137.0,44.3,1777.96,24436.91,24816.0
+4,16800000.0,16700.0,252000.0,15700.0,1020.0,12.1,8.58e-05,143.0,143.0,24.8,1777.96,24436.91,24816.0
+5,21000000.0,15400.0,272000.0,17000.0,1020.0,10.1,8.11e-05,154.0,154.0,11.4,,,
+6,25200000.0,15900.0,264000.0,16500.0,1020.0,9.39,7.63e-05,150.0,150.0,7.05,1777.96,24436.91,24816.0
+7,29400000.0,204000.0,20600.0,1290.0,1020.0,8.69,7.16e-05,11.7,11.7,5.43,1777.96,24436.91,24816.0
+8,33600000.0,16300.0,257000.0,16000.0,1020.0,8.77,6.68e-05,146.0,146.0,18.4,1777.96,24436.91,24816.0
+9,37700000.0,17600.0,238000.0,14900.0,1020.0,8.11,6.21e-05,135.0,135.0,4.96,,,
+10,41900000.0,16600.0,253000.0,15800.0,1020.0,7.96,5.74e-05,144.0,144.0,4.62,1777.96,24436.91,24816.0
+11,46100000.0,16300.0,257000.0,16100.000000000002,1020.0,7.84,5.26e-05,146.0,146.0,4.93,1777.96,24436.91,24816.0
+12,50300000.0,16000.0,262000.0,16400.0,1020.0,7.64,4.79e-05,148.0,148.0,4.08,,,
+13,54500000.0,16100.000000000002,261000.0,16300.0,1020.0,7.48,4.32e-05,148.0,148.0,3.28,1777.96,24436.91,24816.0
+14,58700000.0,16500.0,254000.0,15900.0,1020.0,7.4,3.84e-05,144.0,144.0,3.52,1777.96,24436.91,24816.0
+15,62900000.0,16100.000000000002,261000.0,16300.0,1020.0,7.29,3.37e-05,148.0,148.0,3.13,,,
+16,67099999.99999999,16000.0,263000.0,16400.0,1020.0,7.18,2.89e-05,149.0,149.0,3.11,1777.96,24436.91,24816.0
+17,71300000.0,15300.0,275000.0,17200.0,1020.0,7.09,2.42e-05,156.0,156.0,3.22,1777.96,24436.91,24816.0
+18,75500000.0,15800.0,266000.0,16600.0,1020.0,7.02,1.95e-05,151.0,151.0,3.19,1777.96,24436.91,24816.0
+19,79700000.0,15600.0,268000.0,16800.0,1020.0,6.97,1.47e-05,152.0,152.0,3.06,,,
+20,83900000.0,15300.0,274000.0,17100.0,1020.0,6.92,1e-05,156.0,156.0,2.89,,,
llama-1B/16_GPUS/dp-1_tp-4_pp-4_mbz-8/profiler.csv
ADDED
@@ -0,0 +1,2 @@
+forward,backward
+0ms 804μs,1ms 957μs
llama-1B/16_GPUS/dp-1_tp-8_pp-2_mbz-16/log_metrics.csv
ADDED
@@ -0,0 +1,21 @@
+iteration,consumed_tokens,elapsed_time_per_iteration_ms,tokens_per_sec,tokens_per_sec_per_gpu,global_batch_size,lm_loss,lr,model_tflops_per_gpu,hardware_tflops_per_gpu,grad_norm,memory_usage_MiB,peak_allocated_MiB,peak_reserved_MiB
+1,4190000.0000000005,36400.0,115000.0,7200.0,1020.0,11.2,0.0001,65.3,65.3,12.1,1572.71,31525.69,31874.0
+2,8390000.0,18000.0,233000.0,14500.0,1020.0,11.2,9.53e-05,132.0,132.0,12.2,1572.71,31525.69,31874.0
+3,12600000.0,18100.0,232000.0,14500.0,1020.0,10.0,9.05e-05,132.0,132.0,51.6,1572.71,31525.69,31874.0
+4,16800000.0,17400.0,241000.0,15000.0,1020.0,11.7,8.58e-05,136.0,136.0,18.2,1572.71,1572.75,31874.0
+5,21000000.0,17500.0,239000.0,14900.0,1020.0,10.4,8.11e-05,136.0,136.0,16.0,1572.71,31525.69,31874.0
+6,25200000.0,17500.0,240000.0,15000.0,1020.0,9.9,7.63e-05,136.0,136.0,9.07,1572.71,31525.69,31874.0
+7,29400000.0,216000.0,19400.0,1210.0,1020.0,9.37,7.16e-05,11.0,11.0,6.23,1572.71,31525.69,31874.0
+8,33600000.0,17400.0,241000.0,15100.0,1020.0,8.89,6.68e-05,137.0,137.0,5.76,1572.71,31525.69,31874.0
+9,37700000.0,18300.0,229000.0,14300.0,1020.0,8.8,6.21e-05,130.0,130.0,11.2,,,
+10,41900000.0,17300.0,243000.0,15200.0,1020.0,8.33,5.74e-05,138.0,138.0,5.72,1572.71,31525.69,31874.0
+11,46100000.0,17200.0,243000.0,15200.0,1020.0,8.06,5.26e-05,138.0,138.0,4.91,1572.71,31525.69,31874.0
+12,50300000.0,17000.0,247000.0,15400.0,1020.0,7.9,4.79e-05,140.0,140.0,4.86,1572.71,31525.69,31874.0
+13,54500000.0,17300.0,242000.0,15100.0,1020.0,7.75,4.32e-05,137.0,137.0,4.69,,,
+14,58700000.0,17300.0,243000.0,15200.0,1020.0,7.62,3.84e-05,138.0,138.0,4.69,1572.71,31525.69,31874.0
+15,62900000.0,17300.0,242000.0,15100.0,1020.0,7.48,3.37e-05,137.0,137.0,4.49,1572.71,31525.69,31874.0
+16,67099999.99999999,17000.0,247000.0,15400.0,1020.0,7.34,2.89e-05,140.0,140.0,3.99,1572.71,31525.69,31874.0
+17,71300000.0,17100.0,245000.0,15300.0,1020.0,7.23,2.42e-05,139.0,139.0,3.54,1572.71,31525.69,31874.0
+18,75500000.0,17300.0,242000.0,15100.0,1020.0,7.16,1.95e-05,137.0,137.0,3.28,1572.71,31525.69,31874.0
+19,79700000.0,17300.0,242000.0,15100.0,1020.0,7.09,1.47e-05,137.0,137.0,3.2,1572.71,31525.69,31874.0
+20,83900000.0,17500.0,240000.0,15000.0,1020.0,7.03,1e-05,136.0,136.0,3.1,,,
llama-1B/16_GPUS/dp-1_tp-8_pp-2_mbz-16/profiler.csv
ADDED
@@ -0,0 +1,2 @@
+forward,backward
+0ms 972μs,1ms 205μs
llama-1B/16_GPUS/dp-1_tp-8_pp-2_mbz-32/log_metrics.csv
ADDED
@@ -0,0 +1,21 @@
+iteration,consumed_tokens,elapsed_time_per_iteration_ms,tokens_per_sec,tokens_per_sec_per_gpu,global_batch_size,lm_loss,lr,model_tflops_per_gpu,hardware_tflops_per_gpu,grad_norm,memory_usage_MiB,peak_allocated_MiB,peak_reserved_MiB
+1,4190000.0000000005,35300.0,119000.0,7430.0,1020.0,11.2,0.0001,67.4,67.4,12.1,1573.27,1573.31,62200.0
+2,8390000.0,16800.0,250000.0,15600.0,1020.0,11.2,9.53e-05,142.0,142.0,12.2,1573.27,1573.31,62200.0
+3,12600000.0,17000.0,246000.0,15400.0,1020.0,10.0,9.05e-05,140.0,140.0,51.6,1573.27,1573.31,62200.0
+4,16800000.0,16800.0,249000.0,15600.0,1020.0,11.7,8.58e-05,141.0,141.0,18.3,1573.27,61477.19,62200.0
+5,21000000.0,16800.0,250000.0,15600.0,1020.0,10.4,8.11e-05,142.0,142.0,16.0,,,
+6,25200000.0,16400.0,255000.0,16000.0,1020.0,9.9,7.63e-05,145.0,145.0,9.07,1573.27,61477.19,62200.0
+7,29400000.0,117000.0,36000.0,2250.0,1020.0,9.37,7.16e-05,20.4,20.4,6.23,1573.27,61477.19,62200.0
+8,33600000.0,16600.0,253000.0,15800.0,1020.0,8.89,6.68e-05,144.0,144.0,5.76,1573.27,61477.19,62200.0
+9,37700000.0,17300.0,243000.0,15200.0,1020.0,8.8,6.21e-05,138.0,138.0,11.2,1573.27,61477.19,62200.0
+10,41900000.0,16500.0,255000.0,15900.0,1020.0,8.33,5.74e-05,144.0,144.0,5.72,1573.27,61477.19,62200.0
+11,46100000.0,16700.0,252000.0,15700.0,1020.0,8.06,5.26e-05,143.0,143.0,4.91,1573.27,61477.19,62200.0
+12,50300000.0,16900.0,249000.0,15500.0,1020.0,7.9,4.79e-05,141.0,141.0,4.86,1573.27,61477.19,62200.0
+13,54500000.0,17000.0,247000.0,15400.0,1020.0,7.75,4.32e-05,140.0,140.0,4.69,1573.27,61477.19,62200.0
+14,58700000.0,16500.0,254000.0,15900.0,1020.0,7.62,3.84e-05,144.0,144.0,4.69,1573.27,61477.19,62200.0
+15,62900000.0,16700.0,251000.0,15700.0,1020.0,7.48,3.37e-05,143.0,143.0,4.49,,,
+16,67099999.99999999,16900.0,248000.0,15500.0,1020.0,7.34,2.89e-05,141.0,141.0,3.99,1573.27,61477.19,62200.0
+17,71300000.0,17500.0,240000.0,15000.0,1020.0,7.23,2.42e-05,136.0,136.0,3.54,1573.27,61477.19,62200.0
+18,75500000.0,17100.0,245000.0,15300.0,1020.0,7.16,1.95e-05,139.0,139.0,3.28,1573.27,61477.19,62200.0
+19,79700000.0,16900.0,247000.0,15500.0,1020.0,7.09,1.47e-05,140.0,140.0,3.2,,,
+20,83900000.0,16600.0,252000.0,15700.0,1020.0,7.03,1e-05,143.0,143.0,3.1,,,
llama-1B/16_GPUS/dp-1_tp-8_pp-2_mbz-32/profiler.csv
ADDED
@@ -0,0 +1,2 @@
+forward,backward
+0ms 974μs,0ms 988μs
llama-1B/16_GPUS/dp-1_tp-8_pp-2_mbz-8/profiler.csv
CHANGED
@@ -1,2 +1,2 @@
 forward,backward
-
+0ms 949μs,1ms 197μs
llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-2/log_metrics.csv
ADDED
@@ -0,0 +1,21 @@
+iteration,consumed_tokens,elapsed_time_per_iteration_ms,tokens_per_sec,tokens_per_sec_per_gpu,global_batch_size,lm_loss,lr,model_tflops_per_gpu,hardware_tflops_per_gpu,grad_norm,memory_usage_MiB,peak_allocated_MiB,peak_reserved_MiB
+1,4190000.0000000005,63000.0,66600.0,4160.0,1020.0,11.1,0.0001,37.8,37.8,24.9,3168.18,4459.05,23598.0
+2,8390000.0,32100.0,131000.0,8170.0,1020.0,11.1,9.53e-05,74.1,74.1,25.1,3168.18,4459.05,23598.0
+3,12600000.0,29300.0,143000.0,8950.0,1020.0,9.49,9.05e-05,81.2,81.2,21.5,3168.18,4459.05,23598.0
+4,16800000.0,28900.0,145000.0,9070.0,1020.0,9.36,8.58e-05,82.3,82.3,21.4,3168.18,22834.67,23598.0
+5,21000000.0,31800.0,132000.0,8240.0,1020.0,9.01,8.11e-05,74.8,74.8,12.7,,,
+6,25200000.0,30800.0,136000.0,8520.0,1020.0,10.3,7.63e-05,77.3,77.3,47.1,3168.18,22834.67,23598.0
+7,29400000.0,241000.0,17400.0,1090.0,1020.0,8.68,7.16e-05,9.88,9.88,5.58,3168.18,22834.67,23598.0
+8,33600000.0,31200.0,135000.0,8410.0,1020.0,8.32,6.68e-05,76.3,76.3,4.77,3168.18,22834.67,23598.0
+9,37700000.0,31900.0,131000.0,8210.0,1020.0,7.95,6.21e-05,74.5,74.5,3.31,3168.18,22834.67,23598.0
+10,41900000.0,30600.0,137000.0,8550.0,1020.0,7.69,5.74e-05,77.6,77.6,4.31,3168.18,22834.67,23598.0
+11,46100000.0,32100.0,131000.0,8170.0,1020.0,7.45,5.26e-05,74.2,74.2,2.5,,,
+12,50300000.0,31700.0,132000.0,8270.0,1020.0,7.37,4.79e-05,75.0,75.0,5.02,3168.18,22834.67,23598.0
+13,54500000.0,32200.000000000004,130000.0,8150.0,1020.0,7.31,4.32e-05,73.9,73.9,6.06,3168.18,22834.67,23598.0
+14,58700000.0,31900.0,132000.0,8220.0,1020.0,7.19,3.84e-05,74.6,74.6,5.3,,,
+15,62900000.0,34100.0,123000.0,7690.0,1020.0,7.06,3.37e-05,69.7,69.7,2.73,3168.18,22834.67,23598.0
+16,67099999.99999999,32500.0,129000.0,8060.000000000001,1020.0,6.97,2.89e-05,73.2,73.2,1.99,3168.18,22834.67,23598.0
+17,71300000.0,30800.0,136000.0,8510.0,1020.0,6.91,2.42e-05,77.2,77.2,2.04,3168.18,22834.67,23598.0
+18,75500000.0,32900.0,128000.0,7970.0,1020.0,6.86,1.95e-05,72.4,72.4,2.0,3168.18,22834.67,23598.0
+19,79700000.0,30100.0,139000.0,8700.0,1020.0,6.81,1.47e-05,78.9,78.9,2.01,,,
+20,83900000.0,31100.0,135000.0,8420.0,1020.0,6.77,1e-05,76.4,76.4,1.94,,,
llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-2/profiler.csv
ADDED
@@ -0,0 +1,2 @@
+forward,backward
+0ms 715μs,2ms 528μs
llama-1B/16_GPUS/dp-2_tp-1_pp-8_mbz-4/profiler.csv
CHANGED
@@ -1,2 +1,2 @@
 forward,backward
-0ms
+0ms 976μs,1ms 206μs
llama-1B/16_GPUS/dp-2_tp-2_pp-4_mbz-2/log_metrics.csv
ADDED
@@ -0,0 +1,21 @@
+iteration,consumed_tokens,elapsed_time_per_iteration_ms,tokens_per_sec,tokens_per_sec_per_gpu,global_batch_size,lm_loss,lr,model_tflops_per_gpu,hardware_tflops_per_gpu,grad_norm,memory_usage_MiB,peak_allocated_MiB,peak_reserved_MiB
+1,4190000.0000000005,58800.0,71400.0,4460.0,1020.0,11.2,0.0001,40.5,40.5,14.8,2343.89,3292.11,12328.0
+2,8390000.0,35300.0,119000.0,7440.0,1020.0,11.2,9.53e-05,67.5,67.5,14.9,2343.89,3292.11,12328.0
+3,12600000.0,36400.0,115000.0,7210.0,1020.0,9.53,9.05e-05,65.4,65.4,35.8,2343.89,3292.11,12328.0
+4,16800000.0,37900.0,111000.0,6910.0,1020.0,12.3,8.58e-05,62.7,62.7,37.4,,,
+5,21000000.0,35100.0,119000.0,7460.0,1020.0,9.94,8.11e-05,67.7,67.7,14.1,2343.89,11855.1,12328.0
+6,25200000.0,34300.0,122000.0,7650.0,1020.0,9.44,7.63e-05,69.4,69.4,8.14,2343.89,11855.1,12328.0
+7,29400000.0,430000.0,9760.0,610.0,1020.0,8.73,7.16e-05,5.54,5.54,6.04,2343.89,11855.1,12328.0
+8,33600000.0,35100.0,119000.0,7460.0,1020.0,9.17,6.68e-05,67.7,67.7,27.9,,,
+9,37700000.0,33900.0,124000.0,7730.0,1020.0,8.33,6.21e-05,70.2,70.2,9.38,2343.89,11855.1,12328.0
+10,41900000.0,32900.0,128000.0,7970.0,1020.0,8.02,5.74e-05,72.3,72.3,5.25,,,
+11,46100000.0,35800.0,117000.0,7320.0,1020.0,7.85,5.26e-05,66.4,66.4,4.81,2343.89,11855.1,12328.0
+12,50300000.0,37200.0,113000.0,7050.0,1020.0,7.68,4.79e-05,64.0,64.0,4.49,,,
+13,54500000.0,34900.0,120000.0,7520.0,1020.0,7.53,4.32e-05,68.2,68.2,4.16,2343.89,11855.1,12328.0
+14,58700000.0,34500.0,122000.0,7600.0,1020.0,7.4,3.84e-05,68.9,68.9,4.08,2343.89,11855.1,12328.0
+15,62900000.0,35200.0,119000.0,7440.0,1020.0,7.26,3.37e-05,67.5,67.5,3.25,,,
+16,67099999.99999999,35300.0,119000.0,7420.0,1020.0,7.17,2.89e-05,67.3,67.3,2.43,2343.89,11855.1,12328.0
+17,71300000.0,37200.0,113000.0,7040.0,1020.0,7.1,2.42e-05,63.9,63.9,2.88,2343.89,11855.1,12328.0
+18,75500000.0,34500.0,121000.0,7590.0,1020.0,7.03,1.95e-05,68.9,68.9,2.75,2343.89,11855.1,12328.0
+19,79700000.0,36300.0,116000.0,7230.0,1020.0,6.96,1.47e-05,65.6,65.6,2.64,2343.89,11855.1,12328.0
+20,83900000.0,34800.0,120000.0,7530.0,1020.0,6.91,1e-05,68.3,68.3,2.47,,,
llama-1B/16_GPUS/dp-2_tp-2_pp-4_mbz-2/profiler.csv
ADDED
@@ -0,0 +1,2 @@
+forward,backward
+0ms 959μs,1ms 86μs
llama-1B/16_GPUS/dp-2_tp-2_pp-4_mbz-4/log_metrics.csv
ADDED
@@ -0,0 +1,21 @@
+iteration,consumed_tokens,elapsed_time_per_iteration_ms,tokens_per_sec,tokens_per_sec_per_gpu,global_batch_size,lm_loss,lr,model_tflops_per_gpu,hardware_tflops_per_gpu,grad_norm,memory_usage_MiB,peak_allocated_MiB,peak_reserved_MiB
+1,4190000.0000000005,58200.0,72100.0,4500.0,1020.0,11.2,0.0001,40.9,40.9,14.8,2343.96,3291.29,21926.0
+2,8390000.0,31600.0,133000.0,8290.0,1020.0,11.2,9.53e-05,75.2,75.2,14.9,2343.96,21358.33,21926.0
+3,12600000.0,32400.0,129000.0,8090.0,1020.0,9.53,9.05e-05,73.4,73.4,35.7,2343.96,3291.29,21926.0
+4,16800000.0,30400.0,138000.0,8630.0,1020.0,12.3,8.58e-05,78.3,78.3,37.4,2343.96,21358.33,21926.0
+5,21000000.0,33200.0,126000.0,7890.0,1020.0,9.94,8.11e-05,71.6,71.6,14.1,,,
+6,25200000.0,32100.0,130000.0,8150.0,1020.0,9.43,7.63e-05,74.0,74.0,8.15,2343.96,21358.33,21926.0
+7,29400000.0,224000.0,18700.0,1170.0,1020.0,8.73,7.16e-05,10.6,10.6,6.04,2343.96,21358.33,21926.0
+8,33600000.0,34400.0,122000.0,7620.0,1020.0,9.16,6.68e-05,69.1,69.1,27.8,2343.96,21358.33,21926.0
+9,37700000.0,33300.0,126000.0,7880.0,1020.0,8.32,6.21e-05,71.5,71.5,9.29,2343.96,21358.33,21926.0
+10,41900000.0,32600.0,129000.0,8029.999999999999,1020.0,8.02,5.74e-05,72.9,72.9,5.24,2343.96,21358.33,21926.0
+11,46100000.0,33500.0,125000.0,7830.0,1020.0,7.85,5.26e-05,71.1,71.1,4.81,2343.96,21358.33,21926.0
+12,50300000.0,33200.0,126000.0,7900.0,1020.0,7.68,4.79e-05,71.7,71.7,4.49,2343.96,21358.33,21926.0
+13,54500000.0,31800.0,132000.0,8250.0,1020.0,7.53,4.32e-05,74.9,74.9,4.15,2343.96,21358.33,21926.0
+14,58700000.0,32000.0,131000.0,8200.0,1020.0,7.4,3.84e-05,74.4,74.4,4.07,2343.96,21358.33,21926.0
+15,62900000.0,32299.999999999996,130000.0,8119.999999999999,1020.0,7.26,3.37e-05,73.7,73.7,3.24,2343.96,21358.33,21926.0
+16,67099999.99999999,33600.0,125000.0,7810.0,1020.0,7.17,2.89e-05,70.8,70.8,2.43,2343.96,21358.33,21926.0
+17,71300000.0,31500.0,133000.0,8320.0,1020.0,7.1,2.42e-05,75.5,75.5,2.88,2343.96,21358.33,21926.0
+18,75500000.0,33200.0,126000.0,7890.0,1020.0,7.03,1.95e-05,71.6,71.6,2.75,2343.96,21358.33,21926.0
+19,79700000.0,34100.0,123000.0,7700.0,1020.0,6.96,1.47e-05,69.8,69.8,2.64,,,
+20,83900000.0,31700.0,132000.0,8260.0,1020.0,6.91,1e-05,75.0,75.0,2.47,,,
llama-1B/16_GPUS/dp-2_tp-2_pp-4_mbz-4/profiler.csv
ADDED
@@ -0,0 +1,2 @@
+forward,backward
+0ms 915μs,1ms 415μs
llama-1B/16_GPUS/dp-2_tp-2_pp-4_mbz-8/log_metrics.csv
ADDED
@@ -0,0 +1,21 @@
+iteration,consumed_tokens,elapsed_time_per_iteration_ms,tokens_per_sec,tokens_per_sec_per_gpu,global_batch_size,lm_loss,lr,model_tflops_per_gpu,hardware_tflops_per_gpu,grad_norm,memory_usage_MiB,peak_allocated_MiB,peak_reserved_MiB
+1,4190000.0000000005,47000.0,89300.0,5580.0,1020.0,11.2,0.0001,50.6,50.6,14.8,2344.1,3289.64,41214.0
+2,8390000.0,24200.0,174000.0,10800.0,1020.0,11.2,9.53e-05,98.4,98.4,14.9,2344.1,3289.64,41214.0
+3,12600000.0,28600.0,147000.0,9180.0,1020.0,9.53,9.05e-05,83.3,83.3,35.8,2344.1,40364.8,41214.0
+4,16800000.0,27800.0,151000.0,9440.0,1020.0,12.3,8.58e-05,85.7,85.7,37.4,2344.1,3289.64,41214.0
+5,21000000.0,25200.0,166000.0,10400.0,1020.0,9.94,8.11e-05,94.2,94.2,14.1,2344.1,40364.8,41214.0
+6,25200000.0,26300.0,159000.0,9950.0,1020.0,9.44,7.63e-05,90.3,90.3,8.13,2344.1,40364.8,41214.0
+7,29400000.0,121000.0,34700.0,2170.0,1020.0,8.73,7.16e-05,19.7,19.7,6.04,2344.1,40364.8,41214.0
+8,33600000.0,26000.0,161000.0,10100.0,1020.0,9.17,6.68e-05,91.4,91.4,28.0,2344.1,40364.8,41214.0
+9,37700000.0,27100.0,155000.0,9660.0,1020.0,8.33,6.21e-05,87.6,87.6,9.42,2344.1,40364.8,41214.0
+10,41900000.0,26600.0,157000.0,9840.0,1020.0,8.02,5.74e-05,89.3,89.3,5.24,2344.1,40364.8,41214.0
+11,46100000.0,26500.0,158000.0,9880.0,1020.0,7.85,5.26e-05,89.6,89.6,4.81,,,
+12,50300000.0,27900.0,151000.0,9410.0,1020.0,7.68,4.79e-05,85.4,85.4,4.49,2344.1,40364.8,41214.0
+13,54500000.0,25900.0,162000.0,10100.0,1020.0,7.53,4.32e-05,91.7,91.7,4.15,2344.1,40364.8,41214.0
+14,58700000.0,25700.0,163000.0,10200.0,1020.0,7.4,3.84e-05,92.6,92.6,4.07,,,
+15,62900000.0,25800.0,163000.0,10200.0,1020.0,7.26,3.37e-05,92.3,92.3,3.25,2344.1,40364.8,41214.0
+16,67099999.99999999,24100.0,174000.0,10900.0,1020.0,7.17,2.89e-05,98.6,98.6,2.44,2344.1,40364.8,41214.0
+17,71300000.0,25600.0,164000.0,10200.0,1020.0,7.1,2.42e-05,92.9,92.9,2.88,2344.1,40364.8,41214.0
+18,75500000.0,25400.0,165000.0,10300.0,1020.0,7.03,1.95e-05,93.5,93.5,2.75,2344.1,40364.8,41214.0
+19,79700000.0,24300.0,173000.0,10800.0,1020.0,6.96,1.47e-05,97.9,97.9,2.65,,,
+20,83900000.0,28300.0,148000.0,9270.0,1020.0,6.91,1e-05,84.1,84.1,2.47,,,
llama-1B/16_GPUS/dp-2_tp-2_pp-4_mbz-8/profiler.csv
ADDED
@@ -0,0 +1,2 @@
+forward,backward
+0ms 961μs,1ms 120μs
llama-1B/16_GPUS/dp-2_tp-4_pp-2_mbz-16/log_metrics.csv
ADDED
@@ -0,0 +1,21 @@
+iteration,consumed_tokens,elapsed_time_per_iteration_ms,tokens_per_sec,tokens_per_sec_per_gpu,global_batch_size,lm_loss,lr,model_tflops_per_gpu,hardware_tflops_per_gpu,grad_norm,memory_usage_MiB,peak_allocated_MiB,peak_reserved_MiB
+1,4190000.0000000005,27500.0,152000.0,9520.0,1020.0,11.1,0.0001,86.4,86.4,15.0,2064.33,2888.12,47696.0
+2,8390000.0,13900.0,302000.0,18900.0,1020.0,11.1,9.53e-05,171.0,171.0,15.1,2064.33,2888.12,47952.0
+3,12600000.0,12900.0,325000.0,20300.0,1020.0,11.4,9.05e-05,184.0,184.0,106.0,2064.33,47377.06,47952.0
+4,16800000.0,13300.0,315000.0,19700.0,1020.0,11.7,8.58e-05,179.0,179.0,24.5,2064.33,47377.06,47952.0
+5,21000000.0,13000.0,324000.0,20200.0,1020.0,10.0,8.11e-05,184.0,184.0,11.0,,,
+6,25200000.0,12500.0,335000.0,20900.0,1020.0,9.46,7.63e-05,190.0,190.0,7.2,2064.33,47377.06,47952.0
+7,29400000.0,108000.0,38700.0,2420.0,1020.0,8.87,7.16e-05,21.9,21.9,5.99,2064.33,47377.06,47952.0
+8,33600000.0,13500.0,311000.0,19400.0,1020.0,8.44,6.68e-05,176.0,176.0,5.47,,,
+9,37700000.0,12400.0,337000.0,21100.0,1020.0,8.17,6.21e-05,191.0,191.0,6.22,2064.33,47377.06,47952.0
+10,41900000.0,12700.0,331000.0,20700.0,1020.0,7.87,5.74e-05,188.0,188.0,4.35,2064.33,47377.06,47952.0
+11,46100000.0,12900.0,326000.0,20400.0,1020.0,7.74,5.26e-05,185.0,185.0,4.47,2064.33,47377.06,47952.0
+12,50300000.0,12700.0,331000.0,20700.0,1020.0,7.6,4.79e-05,188.0,188.0,4.41,2064.33,47377.06,47952.0
+13,54500000.0,12800.0,328000.0,20500.0,1020.0,7.41,4.32e-05,186.0,186.0,3.72,2064.33,47377.06,47952.0
+14,58700000.0,13700.0,306000.0,19100.0,1020.0,7.27,3.84e-05,173.0,173.0,3.19,,,
+15,62900000.0,13300.0,316000.0,19800.0,1020.0,7.17,3.37e-05,179.0,179.0,3.0,2064.33,47377.06,47952.0
+16,67099999.99999999,13400.0,312000.0,19500.0,1020.0,7.07,2.89e-05,177.0,177.0,3.0,2064.33,47377.06,47952.0
+17,71300000.0,14800.0,283000.0,17700.0,1020.0,6.96,2.42e-05,160.0,160.0,2.81,2064.33,47377.06,47952.0
+18,75500000.0,13300.0,314000.0,19700.0,1020.0,6.88,1.95e-05,178.0,178.0,3.0,2064.33,47377.06,47952.0
+19,79700000.0,13600.0,308000.0,19200.0,1020.0,6.82,1.47e-05,174.0,174.0,3.08,,,
+20,83900000.0,14100.0,298000.0,18600.0,1020.0,6.77,1e-05,169.0,169.0,2.98,,,
llama-1B/16_GPUS/dp-2_tp-4_pp-2_mbz-16/profiler.csv
ADDED
@@ -0,0 +1,2 @@
+forward,backward
+0ms 963μs,1ms 10μs
llama-1B/16_GPUS/dp-2_tp-4_pp-2_mbz-4/log_metrics.csv
ADDED
@@ -0,0 +1,21 @@
+iteration,consumed_tokens,elapsed_time_per_iteration_ms,tokens_per_sec,tokens_per_sec_per_gpu,global_batch_size,lm_loss,lr,model_tflops_per_gpu,hardware_tflops_per_gpu,grad_norm,memory_usage_MiB,peak_allocated_MiB,peak_reserved_MiB
+1,4190000.0000000005,30700.0,136000.0,8530.0,1020.0,11.1,0.0001,77.4,77.4,15.0,2063.91,13395.12,13626.0
+2,8390000.0,15700.0,268000.0,16700.0,1020.0,11.1,9.53e-05,152.0,152.0,15.1,2063.91,2887.7,13626.0
+3,12600000.0,15900.0,263000.0,16400.0,1020.0,11.4,9.05e-05,149.0,149.0,106.0,2063.91,2887.7,13626.0
+4,16800000.0,19200.0,218000.0,13600.0,1020.0,11.7,8.58e-05,124.0,124.0,24.5,2063.91,13395.12,13626.0
+5,21000000.0,20000.0,210000.0,13100.0,1020.0,10.0,8.11e-05,119.0,119.0,11.0,,,
+6,25200000.0,19300.0,217000.0,13600.0,1020.0,9.46,7.63e-05,123.0,123.0,7.2,2063.91,13395.12,13626.0
+7,29400000.0,413000.0,10200.0,635.0,1020.0,8.87,7.16e-05,5.76,5.76,5.99,2063.91,13395.12,13626.0
+8,33600000.0,16300.0,257000.0,16000.0,1020.0,8.43,6.68e-05,145.0,145.0,5.47,,,
+9,37700000.0,15400.0,273000.0,17000.0,1020.0,8.17,6.21e-05,155.0,155.0,6.19,2063.91,13395.12,13626.0
+10,41900000.0,16400.0,256000.0,16000.0,1020.0,7.86,5.74e-05,145.0,145.0,4.35,2063.91,13395.12,13626.0
+11,46100000.0,15600.0,268000.0,16800.0,1020.0,7.74,5.26e-05,152.0,152.0,4.48,2063.91,13395.12,13626.0
+12,50300000.0,15700.0,267000.0,16700.0,1020.0,7.6,4.79e-05,152.0,152.0,4.41,2063.91,13395.12,13626.0
+13,54500000.0,15400.0,273000.0,17100.0,1020.0,7.41,4.32e-05,155.0,155.0,3.72,2063.91,13395.12,13626.0
+14,58700000.0,15300.0,274000.0,17100.0,1020.0,7.27,3.84e-05,155.0,155.0,3.19,2063.91,13395.12,13626.0
+15,62900000.0,15100.0,278000.0,17300.0,1020.0,7.17,3.37e-05,157.0,157.0,3.0,2063.91,13395.12,13626.0
+16,67099999.99999999,16200.0,260000.0,16200.0,1020.0,7.07,2.89e-05,147.0,147.0,3.0,2063.91,13395.12,13626.0
+17,71300000.0,15300.0,273000.0,17100.0,1020.0,6.96,2.42e-05,155.0,155.0,2.81,2063.91,13395.12,13626.0
+18,75500000.0,15200.0,275000.0,17200.0,1020.0,6.88,1.95e-05,156.0,156.0,3.0,2063.91,13395.12,13626.0
+19,79700000.0,15600.0,269000.0,16800.0,1020.0,6.82,1.47e-05,152.0,152.0,3.08,,,
+20,83900000.0,16200.0,260000.0,16200.0,1020.0,6.77,1e-05,147.0,147.0,2.98,,,
llama-1B/16_GPUS/dp-2_tp-4_pp-2_mbz-4/profiler.csv
ADDED
@@ -0,0 +1,2 @@
+forward,backward
+0ms 967μs,1ms 107μs
llama-1B/16_GPUS/dp-2_tp-4_pp-2_mbz-8/log_metrics.csv
ADDED
@@ -0,0 +1,21 @@
+iteration,consumed_tokens,elapsed_time_per_iteration_ms,tokens_per_sec,tokens_per_sec_per_gpu,global_batch_size,lm_loss,lr,model_tflops_per_gpu,hardware_tflops_per_gpu,grad_norm,memory_usage_MiB,peak_allocated_MiB,peak_reserved_MiB
+1,4190000.0000000005,29300.0,143000.0,8950.0,1020.0,11.1,0.0001,81.2,81.2,15.0,2064.05,2887.84,24986.0
+2,8390000.0,15600.0,269000.0,16800.0,1020.0,11.1,9.53e-05,153.0,153.0,15.1,2064.05,2887.84,25114.0
+3,12600000.0,14200.0,294000.0,18400.0,1020.0,11.4,9.05e-05,167.0,167.0,106.0,2064.05,2887.84,25114.0
+4,16800000.0,16000.0,262000.0,16400.0,1020.0,11.7,8.58e-05,149.0,149.0,24.5,,,
+5,21000000.0,15600.0,269000.0,16800.0,1020.0,10.0,8.11e-05,153.0,153.0,11.0,2064.05,24722.43,25114.0
+6,25200000.0,14500.0,290000.0,18100.0,1020.0,9.46,7.63e-05,164.0,164.0,7.21,2064.05,24722.43,25114.0
+7,29400000.0,221000.0,19000.0,1190.0,1020.0,8.87,7.16e-05,10.8,10.8,5.99,,,
+8,33600000.0,14600.0,287000.0,17900.0,1020.0,8.44,6.68e-05,163.0,163.0,5.45,2064.05,24722.43,25114.0
+9,37700000.0,16000.0,262000.0,16400.0,1020.0,8.18,6.21e-05,149.0,149.0,6.29,2064.05,24722.43,25114.0
+10,41900000.0,14500.0,289000.0,18100.0,1020.0,7.87,5.74e-05,164.0,164.0,4.35,2064.05,24722.43,25114.0
+11,46100000.0,14700.0,286000.0,17900.0,1020.0,7.74,5.26e-05,162.0,162.0,4.47,2064.05,24722.43,25114.0
+12,50300000.0,14200.0,295000.0,18500.0,1020.0,7.6,4.79e-05,167.0,167.0,4.41,,,
+13,54500000.0,14600.0,288000.0,18000.0,1020.0,7.42,4.32e-05,163.0,163.0,3.72,2064.05,24722.43,25114.0
+14,58700000.0,14900.0,282000.0,17700.0,1020.0,7.27,3.84e-05,160.0,160.0,3.2,2064.05,24722.43,25114.0
+15,62900000.0,14500.0,289000.0,18100.0,1020.0,7.17,3.37e-05,164.0,164.0,2.99,2064.05,24722.43,25114.0
+16,67099999.99999999,14000.0,299000.0,18700.0,1020.0,7.07,2.89e-05,169.0,169.0,3.0,2064.05,24722.43,25114.0
+17,71300000.0,14900.0,282000.0,17600.0,1020.0,6.96,2.42e-05,160.0,160.0,2.81,,,
+18,75500000.0,14500.0,289000.0,18100.0,1020.0,6.88,1.95e-05,164.0,164.0,3.0,2064.05,24722.43,25114.0
+19,79700000.0,15200.0,275000.0,17200.0,1020.0,6.82,1.47e-05,156.0,156.0,3.08,,,
+20,83900000.0,15100.0,278000.0,17400.0,1020.0,6.77,1e-05,158.0,158.0,2.99,,,
llama-1B/16_GPUS/dp-2_tp-4_pp-2_mbz-8/profiler.csv
ADDED
@@ -0,0 +1,2 @@
+forward,backward
+0ms 967μs,1ms 105μs
llama-1B/16_GPUS/dp-2_tp-8_pp-1_mbz-16/log_metrics.csv
ADDED
@@ -0,0 +1,21 @@
+iteration,consumed_tokens,elapsed_time_per_iteration_ms,tokens_per_sec,tokens_per_sec_per_gpu,global_batch_size,lm_loss,lr,model_tflops_per_gpu,hardware_tflops_per_gpu,grad_norm,memory_usage_MiB,peak_allocated_MiB,peak_reserved_MiB
+1,4190000.0000000005,25900.0,162000.0,10100.0,1020.0,11.5,0.0001,92.0,92.0,15.7,1696.19,30498.63,30840.0
+2,8390000.0,13400.0,312000.0,19500.0,1020.0,11.5,9.53e-05,177.0,177.0,16.0,1696.19,30498.63,30840.0
+3,12600000.0,13600.0,308000.0,19200.0,1020.0,12.8,9.05e-05,174.0,174.0,137.0,1696.19,30498.63,30840.0
+4,16800000.0,13600.0,308000.0,19300.0,1020.0,12.2,8.58e-05,175.0,175.0,22.4,1696.17,2358.02,30840.0
+5,21000000.0,13400.0,314000.0,19600.0,1020.0,12.4,8.11e-05,178.0,178.0,42.9,1696.17,30498.63,30840.0
+6,25200000.0,13400.0,312000.0,19500.0,1020.0,11.1,7.63e-05,177.0,177.0,24.7,1696.17,30498.63,30840.0
+7,29400000.0,13600.0,309000.0,19300.0,1020.0,10.2,7.16e-05,175.0,175.0,12.2,1696.17,30498.63,30840.0
+8,33600000.0,13400.0,313000.0,19600.0,1020.0,9.8,6.68e-05,178.0,178.0,7.31,1696.17,30498.63,30840.0
+9,37700000.0,13400.0,314000.0,19600.0,1020.0,9.32,6.21e-05,178.0,178.0,6.66,1696.17,30498.63,30840.0
+10,41900000.0,13600.0,309000.0,19300.0,1020.0,9.22,5.74e-05,175.0,175.0,16.2,1696.17,30498.63,30840.0
+11,46100000.0,13600.0,308000.0,19300.0,1020.0,8.63,5.26e-05,175.0,175.0,7.93,1696.17,30498.63,30840.0
+12,50300000.0,13700.0,307000.0,19200.0,1020.0,8.27,4.79e-05,174.0,174.0,5.43,1696.17,30498.63,30840.0
+13,54500000.0,13300.0,315000.0,19700.0,1020.0,8.1,4.32e-05,179.0,179.0,5.53,1696.17,30498.63,30840.0
+14,58700000.0,13600.0,309000.0,19300.0,1020.0,7.93,3.84e-05,175.0,175.0,5.77,1696.17,30498.63,30840.0
+15,62900000.0,13600.0,309000.0,19300.0,1020.0,7.72,3.37e-05,175.0,175.0,5.17,1696.17,30498.63,30840.0
+16,67099999.99999999,13400.0,313000.0,19600.0,1020.0,7.56,2.89e-05,178.0,178.0,4.92,1696.17,30498.63,30840.0
+17,71300000.0,13800.0,304000.0,19000.0,1020.0,7.45,2.42e-05,172.0,172.0,4.93,1696.17,30498.63,30840.0
+18,75500000.0,13500.0,310000.0,19400.0,1020.0,7.35,1.95e-05,176.0,176.0,4.04,1696.17,30498.63,30840.0
+19,79700000.0,13500.0,311000.0,19400.0,1020.0,7.29,1.47e-05,176.0,176.0,4.11,1696.17,30498.63,30840.0
+20,83900000.0,13500.0,312000.0,19500.0,1020.0,7.23,1e-05,177.0,177.0,3.95,,,
llama-1B/16_GPUS/dp-2_tp-8_pp-1_mbz-16/profiler.csv
ADDED
@@ -0,0 +1,2 @@
+forward,backward
+0ms 971μs,0ms 973μs
llama-1B/16_GPUS/dp-2_tp-8_pp-1_mbz-32/log_metrics.csv
ADDED
@@ -0,0 +1,21 @@
+iteration,consumed_tokens,elapsed_time_per_iteration_ms,tokens_per_sec,tokens_per_sec_per_gpu,global_batch_size,lm_loss,lr,model_tflops_per_gpu,hardware_tflops_per_gpu,grad_norm,memory_usage_MiB,peak_allocated_MiB,peak_reserved_MiB
+1,4190000.0000000005,20500.0,205000.0,12800.0,1020.0,11.5,0.0001,116.0,116.0,15.7,1697.3,59303.14,60566.0
+2,8390000.0,12900.0,325000.0,20300.0,1020.0,11.5,9.53e-05,184.0,184.0,16.0,1697.3,59303.14,60566.0
+3,12600000.0,12700.0,329000.0,20600.0,1020.0,12.8,9.05e-05,187.0,187.0,137.0,1697.3,59303.14,60566.0
+4,16800000.0,13000.0,322000.0,20100.0,1020.0,12.2,8.58e-05,183.0,183.0,22.4,1697.29,2359.13,60566.0
+5,21000000.0,13000.0,322000.0,20100.0,1020.0,12.4,8.11e-05,182.0,182.0,42.8,1697.29,59303.14,60566.0
+6,25200000.0,12900.0,325000.0,20300.0,1020.0,11.1,7.63e-05,185.0,185.0,24.8,1697.29,59303.14,60566.0
+7,29400000.0,13100.0,320000.0,20000.0,1020.0,10.2,7.16e-05,182.0,182.0,12.1,1697.29,59303.14,60566.0
+8,33600000.0,12700.0,329000.0,20600.0,1020.0,9.8,6.68e-05,187.0,187.0,7.31,1697.29,59303.14,60566.0
+9,37700000.0,12800.0,328000.0,20500.0,1020.0,9.32,6.21e-05,186.0,186.0,6.66,1697.29,59303.14,60566.0
+10,41900000.0,12900.0,324000.0,20300.0,1020.0,9.22,5.74e-05,184.0,184.0,16.3,1697.29,59303.14,60566.0
+11,46100000.0,12900.0,325000.0,20300.0,1020.0,8.63,5.26e-05,184.0,184.0,7.95,1697.29,59303.14,60566.0
+12,50300000.0,12800.0,329000.0,20500.0,1020.0,8.27,4.79e-05,186.0,186.0,5.43,1697.29,59303.14,60566.0
+13,54500000.0,12800.0,327000.0,20400.0,1020.0,8.1,4.32e-05,185.0,185.0,5.53,1697.29,59303.14,60566.0
+14,58700000.0,12800.0,328000.0,20500.0,1020.0,7.93,3.84e-05,186.0,186.0,5.77,1697.29,59303.14,60566.0
+15,62900000.0,12800.0,328000.0,20500.0,1020.0,7.72,3.37e-05,186.0,186.0,5.17,1697.29,59303.14,60566.0
+16,67099999.99999999,13000.0,323000.0,20200.0,1020.0,7.56,2.89e-05,183.0,183.0,4.93,1697.29,59303.14,60566.0
+17,71300000.0,12800.0,329000.0,20500.0,1020.0,7.45,2.42e-05,186.0,186.0,4.93,1697.29,59303.14,60566.0
+18,75500000.0,12800.0,327000.0,20500.0,1020.0,7.35,1.95e-05,186.0,186.0,4.02,1697.29,59303.14,60566.0
+19,79700000.0,12800.0,328000.0,20500.0,1020.0,7.29,1.47e-05,186.0,186.0,4.11,1697.29,59303.14,60566.0
+20,83900000.0,12700.0,329000.0,20600.0,1020.0,7.23,1e-05,187.0,187.0,3.96,,,
llama-1B/16_GPUS/dp-2_tp-8_pp-1_mbz-32/profiler.csv
ADDED
@@ -0,0 +1,2 @@
+forward,backward
+0ms 958μs,1ms 170μs
llama-1B/16_GPUS/dp-4_tp-1_pp-4_mbz-1/log_metrics.csv
ADDED
@@ -0,0 +1,21 @@
+iteration,consumed_tokens,elapsed_time_per_iteration_ms,tokens_per_sec,tokens_per_sec_per_gpu,global_batch_size,lm_loss,lr,model_tflops_per_gpu,hardware_tflops_per_gpu,grad_norm,memory_usage_MiB,peak_allocated_MiB,peak_reserved_MiB
+1,4190000.0000000005,34000.0,123000.0,7720.0,1020.0,11.1,0.0001,70.0,70.0,25.1,3478.24,11653.25,12904.0
+2,8390000.0,19000.0,221000.0,13800.0,1020.0,11.1,9.53e-05,125.0,125.0,25.2,3478.24,5184.61,12904.0
+3,12600000.0,19100.0,219000.0,13700.0,1020.0,11.4,9.05e-05,124.0,124.0,217.0,3478.24,5184.61,12904.0
+4,16800000.0,22400.0,187000.0,11700.0,1020.0,13.8,8.58e-05,106.0,106.0,22.5,,,
+5,21000000.0,22200.0,189000.0,11800.0,1020.0,9.98,8.11e-05,107.0,107.0,16.5,3478.24,11653.25,12904.0
+6,25200000.0,21500.0,195000.0,12200.0,1020.0,10.9,7.63e-05,111.0,111.0,93.8,3478.24,11653.25,12904.0
+7,29400000.0,389000.0,10800.0,674.0,1020.0,9.16,7.16e-05,6.11,6.11,19.7,3478.24,11653.25,12904.0
+8,33600000.0,22500.0,187000.0,11700.0,1020.0,8.83,6.68e-05,106.0,106.0,6.08,3478.24,11653.25,12904.0
+9,37700000.0,22600.0,186000.0,11600.0,1020.0,8.47,6.21e-05,105.0,105.0,5.23,3478.24,11653.25,12904.0
+10,41900000.0,19200.0,218000.0,13600.0,1020.0,8.17,5.74e-05,124.0,124.0,7.72,3478.24,11653.25,12904.0
+11,46100000.0,20000.0,210000.0,13100.0,1020.0,7.93,5.26e-05,119.0,119.0,5.54,,,
+12,50300000.0,19600.0,214000.0,13300.0,1020.0,7.75,4.79e-05,121.0,121.0,4.65,3478.24,11653.25,12904.0
+13,54500000.0,18900.0,222000.0,13900.0,1020.0,7.58,4.32e-05,126.0,126.0,2.89,3478.24,11653.25,12904.0
+14,58700000.0,19100.0,219000.0,13700.0,1020.0,7.5,3.84e-05,124.0,124.0,4.19,3478.24,11653.25,12904.0
+15,62900000.0,19200.0,218000.0,13600.0,1020.0,7.4,3.37e-05,124.0,124.0,3.86,3478.24,11653.25,12904.0
+16,67099999.99999999,19500.0,215000.0,13500.0,1020.0,7.29,2.89e-05,122.0,122.0,3.07,3478.24,11653.25,12904.0
+17,71300000.0,19600.0,214000.0,13400.0,1020.0,7.19,2.42e-05,122.0,122.0,2.39,3478.24,11653.25,12904.0
+18,75500000.0,21100.0,199000.0,12400.0,1020.0,7.13,1.95e-05,113.0,113.0,2.21,3478.24,11653.25,12904.0
+19,79700000.0,19600.0,214000.0,13400.0,1020.0,7.08,1.47e-05,121.0,121.0,2.64,3478.24,11653.25,12904.0
+20,83900000.0,17900.0,234000.0,14600.0,1020.0,7.03,1e-05,133.0,133.0,2.29,,,
llama-1B/16_GPUS/dp-4_tp-1_pp-4_mbz-1/profiler.csv
ADDED
@@ -0,0 +1,2 @@
+forward,backward
+0ms 947μs,1ms 132μs
llama-1B/16_GPUS/dp-4_tp-1_pp-4_mbz-2/profiler.csv
CHANGED
@@ -1,2 +1,2 @@
forward,backward
-0ms
+0ms 947μs,1ms 197μs
llama-1B/16_GPUS/dp-4_tp-1_pp-4_mbz-4/log_metrics.csv
ADDED
@@ -0,0 +1,21 @@
+iteration,consumed_tokens,elapsed_time_per_iteration_ms,tokens_per_sec,tokens_per_sec_per_gpu,global_batch_size,lm_loss,lr,model_tflops_per_gpu,hardware_tflops_per_gpu,grad_norm,memory_usage_MiB,peak_allocated_MiB,peak_reserved_MiB
+1,4190000.0000000005,38800.0,108000.0,6750.0,1020.0,11.1,0.0001,61.3,61.3,25.1,3478.34,37922.28,38320.0
+2,8390000.0,19100.0,219000.0,13700.0,1020.0,11.1,9.53e-05,124.0,124.0,25.2,3478.34,37922.28,38320.0
+3,12600000.0,19000.0,221000.0,13800.0,1020.0,11.4,9.05e-05,125.0,125.0,217.0,3478.34,37922.28,38320.0
+4,16800000.0,19800.0,212000.0,13300.0,1020.0,13.8,8.58e-05,120.0,120.0,22.5,3478.34,5180.18,38320.0
+5,21000000.0,18200.0,231000.0,14400.0,1020.0,9.98,8.11e-05,131.0,131.0,16.4,3478.34,37922.28,38320.0
+6,25200000.0,18600.0,226000.0,14100.0,1020.0,10.9,7.63e-05,128.0,128.0,93.8,3478.34,37922.28,38320.0
+7,29400000.0,105000.0,40000.0,2500.0,1020.0,9.16,7.16e-05,22.7,22.7,19.8,,,
+8,33600000.0,18200.0,230000.0,14400.0,1020.0,8.83,6.68e-05,131.0,131.0,6.08,3478.34,37922.28,38320.0
+9,37700000.0,17900.0,235000.0,14700.0,1020.0,8.47,6.21e-05,133.0,133.0,5.23,3478.34,37922.28,38320.0
+10,41900000.0,17200.0,244000.0,15200.0,1020.0,8.17,5.74e-05,138.0,138.0,7.71,3478.34,37922.28,38320.0
+11,46100000.0,19400.0,216000.0,13500.0,1020.0,7.93,5.26e-05,123.0,123.0,5.53,3478.34,37922.28,38320.0
+12,50300000.0,18800.0,223000.0,14000.0,1020.0,7.75,4.79e-05,127.0,127.0,4.64,,,
+13,54500000.0,18700.0,224000.0,14000.0,1020.0,7.58,4.32e-05,127.0,127.0,2.9,3478.34,37922.28,38320.0
+14,58700000.0,16200.0,258000.0,16200.0,1020.0,7.5,3.84e-05,147.0,147.0,4.18,3478.34,37922.28,38320.0
+15,62900000.0,18100.0,232000.0,14500.0,1020.0,7.4,3.37e-05,131.0,131.0,3.86,3478.34,37922.28,38320.0
+16,67099999.99999999,17700.0,237000.0,14800.0,1020.0,7.29,2.89e-05,134.0,134.0,3.06,3478.34,37922.28,38320.0
+17,71300000.0,18300.0,229000.0,14300.0,1020.0,7.19,2.42e-05,130.0,130.0,2.39,,,
+18,75500000.0,20300.0,206000.0,12900.0,1020.0,7.13,1.95e-05,117.0,117.0,2.2,3478.34,37922.28,38320.0
+19,79700000.0,17800.0,236000.0,14800.0,1020.0,7.08,1.47e-05,134.0,134.0,2.64,,,
+20,83900000.0,17200.0,244000.0,15200.0,1020.0,7.03,1e-05,138.0,138.0,2.3,,,
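All log_metrics.csv files in this commit share the header shown in the hunks above (iteration, consumed_tokens, elapsed_time_per_iteration_ms, tokens_per_sec, tokens_per_sec_per_gpu, global_batch_size, lm_loss, lr, model_tflops_per_gpu, hardware_tflops_per_gpu, grad_norm, memory_usage_MiB, peak_allocated_MiB, peak_reserved_MiB), one file per dp/tp/pp/mbz configuration. A minimal sketch, assuming pandas and a local checkout of this repo, of how one might compare steady-state throughput across configurations (the glob pattern, column choices, and warm-up cutoff are illustrative assumptions, not part of the commit):

```python
# Sketch: rank the 16-GPU parallelism configs added in this commit by mean tokens/sec/GPU.
# Assumes a local checkout of the dataset repo and pandas installed; columns follow the
# log_metrics.csv header shown in the hunks above.
from pathlib import Path
import pandas as pd

rows = []
for metrics_file in Path("llama-1B/16_GPUS").glob("dp-*_tp-*_pp-*_mbz-*/log_metrics.csv"):
    df = pd.read_csv(metrics_file)
    steady = df[df["iteration"] > 1]  # drop the first (warm-up) iteration before averaging
    rows.append({
        "config": metrics_file.parent.name,  # e.g. "dp-2_tp-4_pp-2_mbz-8"
        "tokens_per_sec_per_gpu": steady["tokens_per_sec_per_gpu"].mean(),
        "model_tflops_per_gpu": steady["model_tflops_per_gpu"].mean(),
        "peak_allocated_MiB": df["peak_allocated_MiB"].max(),  # NaN rows are skipped by max()
    })

summary = pd.DataFrame(rows).sort_values("tokens_per_sec_per_gpu", ascending=False)
print(summary.to_string(index=False))
```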