<!DOCTYPE html>
<html>

<head>
    <script src="distill.bundle.js" type="module" fetchpriority="high" blocking></script>
    <script src="main.bundle.js" type="module" fetchpriority="low" defer></script>
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <meta charset="utf8">
    <base target="_blank">
    <title>The Ultra-Scale Playbook: Training LLMs on GPU Clusters</title>
    <link rel="stylesheet" href="style.css">
</head>

<body>
    <d-front-matter>
        <script id='distill-front-matter' type="text/json">{
    "title": "The Ultra-Scale Playbook: Training LLMs on GPU Clusters",
    "description": "This blog covers everything about scaling LLMs in 2025.",
    "published": "Sept 28, 2024",
    "affiliation": {"name": "HuggingFace"},
    "authors": [
      {
        "author":"Leandro Werra",
        "authorURL":"https://huggingface.co/lvwerra"
      },
      {
        "author":"Thomas Wolf",
        "authorURL":"https://huggingface.co/thomwolf"
      }
    ],
    "katex": {
      "delimiters": [
        {"left": "$$", "right": "$$", "display": false}
      ]
    }
  }
    </script>
    </d-front-matter>
    <d-title>
        <h1 class="l-page" style="text-align: center;">The Ultra-Scale Playbook: Training LLMs on GPU Clusters</h1>
        <div id="title-plot" class="main-plot-container l-screen">
            <figure>
                <img src="assets/images/banner.png" alt="FineWeb">
            </figure>
            <!-- <div id="clusters-plot">
            <img src="assets/images/clusters.png" alt="Clusters">
        </div> -->
        </div>
    </d-title>
    <d-byline></d-byline>
    <d-article>
        <d-contents>
        </d-contents>
        
        <p>Fueled by the scaling laws<d-cite bibtex-key="kaplan2020scalinglaws"></d-cite><d-cite bibtex-key="hoffmann2022chinchilla"></d-cite>, the trend of training ever larger language models on vaster amounts of data has been driving progress in AI for the past couple of years. Initially, the development of the largest models happened exclusively behind the closed doors of a handful of research labs, but it recently opened up with the release of models such as Llama 3.1 405B<d-cite bibtex-key="grattafiori2024llama3herdmodels"></d-cite> and DeepSeek R1<d-cite bibtex-key="deepseekai2024deepseekv3technicalreport"></d-cite>. While these models have <a href="https://huggingface.co/meta-llama">openly shared</a> <a href="https://huggingface.co/deepseek-ai">weights</a> and their training recipes are described in <a href="https://ai.meta.com/research/publications/the-llama-3-herd-of-models/">technical</a> <a href="https://github.com/deepseek-ai/DeepSeek-R1/blob/main/DeepSeek_R1.pdf">reports</a>, the challenging engineering involved in training at the necessary infrastructure scale is still hidden between the lines of a handful of papers and complex training frameworks. This <s>long blog post</s> open-source book is here to open this black box!</p>

        <aside>Reading time: 7 days. For the best reading experience, we recommend not using a mobile phone.</aside>

        <p>In this book we invite you to follow us into the wonderful world of scaling the training of Large Language Models to tens, hundreds, and thousands of GPUs. It assumes you know the basics of LLM architecture and training but are new to distributed training. This writing can be seen as the second part of a trilogy, following our first blog post on processing data for pre-training, the so-called “<a href="https://huggingface.co/spaces/HuggingFaceFW/blogpost-fineweb-v1">FineWeb blog post</a>”. Having read both blog posts, you should have almost all the core knowledge needed to deeply understand how LLMs are built nowadays, missing just a few final spices like data mixing and architecture choices to complete the recipe (stay tuned…).</p>

        <p>Pre-training LLMs from scratch now requires amounts of compute which in almost every case exceed the capacity of a single GPU or machine. The clusters used to train these models range from hundreds to thousands of nodes, each usually equipped with 4 to 8 GPUs. To make the best use of such expensive hardware and to train in a reasonable time, a range of distributed training methods have been developed with the goal of ensuring that GPUs are highly utilized at all times. Efficiently scaling LLM training is also no longer confined to pretraining, as fine-tuning larger models on more domain-specific data is becoming the standard practice to achieve the best results.</p>

        <aside>We are extremely thankful to the whole <a href="https://distill.pub/">distill.pub</a> team for creating
            the template on which we based this blog post.</aside>
        
        <p>In this post we’ll cover these scaling methods exhaustively while keeping a single story-line to understand where each technique comes from. We’ll cover data, tensor, pipeline and context parallelism as well as ZeRO and kernel fusion. The post is built on the following <strong>three foundations</strong>:</p>

        <p><strong>Quick intros on theory and concepts:</strong> before diving into code and experiments, we want to understand how each method works at a high level and what its advantages and limits are. You’ll learn which parts of a language model eat away at your memory and when during training this happens. You’ll learn how we can solve memory constraints by parallelizing the models and increase throughput by scaling up the number of GPUs. As a result you'll understand how the following widget, which computes the memory breakdown of a transformer model, works: </p>

        <div id="graph"></div>
        <div id="controls">
            <div class="cell column-1">
            <label for="a">Attention Heads (a):</label>
            <input type="range" id="a" name="a" min="1" max="128" value="8">
            <input type="number" id="a_input" value="8" min="1" max="128">
            </div>
            <div class="cell column-2">
            <label for="mixed">Mixed Precision:</label>
            <input type="checkbox" id="mixed" name="mixed" checked>
            <span></span> <!-- Empty span to maintain grid alignment -->
            </div>
            <div class="cell column-1">
            <label for="b">Micro Batch Size (b):</label>
            <input type="range" id="b" name="b" min="1" max="53248" value="32">
            <input type="number" id="b_input" value="32" min="1" max="53248">
            </div>
            <div class="cell column-2">
            <label for="seq_parallel">Sequence Parallelism:</label>
            <input type="checkbox" id="seq_parallel" name="seq_parallel">
            <span></span> <!-- Empty span to maintain grid alignment -->
            </div>
            <div class="cell column-1">
            <label for="h">Hidden Dimension (h):</label>
            <input type="range" id="h" name="h" min="1" max="16384" value="512">
            <input type="number" id="h_input" value="512" min="128" max="16384">
            </div>
            <div class="cell column-2">
            <label for="recomputation">Recomputation:</label>
            <select id="recomputation" name="recomputation">
                <option value="none">None</option>
                <option value="selective">Selective</option>
                <option value="full">Full</option>
            </select>
            <span></span> <!-- Empty span to maintain grid alignment -->
            </div>
            <div class="cell column-1">
            <label for="h_ff">Feedforward Dimension (h_ff):</label>
            <input type="range" id="h_ff" name="h_ff" min="1" max="65536" value="2048">
            <input type="number" id="h_ff_input" value="2048" min="512" max="65536">
            </div>
            <div class="cell column-2">
            <label for="zero">Zero:</label>
            <select id="zero" name="zero">
                <option value="0">0</option>
                <option value="1">1</option>
                <option value="2">2</option>
                <option value="3">3</option>
            </select>
            <span></span> <!-- Empty span to maintain grid alignment -->
            </div>
            <div class="cell column-1">
            <label for="L">Number of Layers (L):</label>
            <input type="range" id="L" name="L" min="1" max="126" value="12">
            <input type="number" id="L_input" value="12" min="1" max="126">
            </div>
            <div class="cell column-2">
            <label for="ff_activation">FF Activation:</label>
            <select id="ff_activation" name="ff_activation">
                <option value="relu">ReLU</option>
                <option value="gelu">GELU</option>
                <option value="swiglu">SwiGLU</option>
            </select>
            <span></span> <!-- Empty span to maintain grid alignment -->
            </div>
            <div class="cell column-1">
            <label for="s">Sequence Length (s):</label>
            <input type="range" id="s" name="s" min="1" max="128000" value="128">
            <input type="number" id="s_input" value="128" min="64" max="128000">
            </div>
            <div class="cell column-2">
            <label for="presets">Presets:</label>
            <select id="presets" name="presets">
                <option value="Llama 3 Tiny">Llama 3 Tiny</option>
                <option value="Llama 3 8B">Llama 3 8B</option>
                <option value="Llama 3 70B">Llama 3 70B</option>
                <option value="Llama 3 405B">Llama 3 405B</option>
            </select>
            <span></span> <!-- Empty span to maintain grid alignment -->
            </div>
            <div class="cell column-1">
            <label for="v">Vocabulary Size (v):</label>
            <input type="range" id="v" name="v" min="1000" max="100000" value="30522">
            <input type="number" id="v_input" value="30522" min="1000" max="100000">
            </div>
            <div class="cell column-2">
            <label for="tp">Tensor Parallelism (t):</label>
            <input type="range" id="tp" name="tp" min="1" max="16" value="8">
            <input type="number" id="tp_input" value="8" min="1" max="16">
            </div>
            <div class="cell column-1">
            <label for="k">Optimizer Parameters (k):</label>
            <input type="range" id="k" name="k" min="1" max="16" value="8">
            <input type="number" id="k_input" value="8" min="1" max="16">
            </div>
            <div class="cell column-2">
            <label for="dp">Data Parallelism (d):</label>
            <input type="range" id="dp" name="dp" min="1" max="256" value="1">
            <input type="number" id="dp_input" value="1" min="1" max="256">
            </div>
        </div>

        <p>While this widget gives a theoretical breakdown, the following tool can be used to predict the actual memory usage:</p>
        
        <p><img alt="image.png" src="assets/images/placeholder.png"/></p>

        <p><strong>Clear code implementations:</strong> theory is one thing, but we discover all kinds of edge cases and important details when we implement something. That’s why we link to implementation references where possible. Depending on the case, we’ll use two code references: the <a href="https://github.com/huggingface/picotron">picotron</a> repository is built for education, so it usually implements each concept in a single, self-contained short file. On the other hand, to look at production-ready code, we’ll refer to the <a href="https://github.com/huggingface/nanotron">nanotron</a> implementation, which is a production training codebase used at Hugging Face.</p>

        <p><img alt="Picotron implements each key concept in a self-contained way, such that the method can be studied separately and in isolation." src="assets/images/placeholder.png" /></p>

        <p><strong>Real training efficiency benchmarks:</strong> Finally, how to <em>actually</em> scale your LLM training depends on your infrastructure, such as the kind of chips, interconnect etc., so we can’t give a single unified recipe. What we will give, though, is a way to benchmark several setups, and it is what we have done on our cluster! We ran over 4100 distributed experiments with up to 512 GPUs to scan many possible distributed training layouts and model sizes. TODO: link to dataset too </p>

        <p><img alt="An overview of the over 4000 experiments across all Llama architectures where each data point corresponds to an experiment launch." src="assets/images/placeholder.png" /></p>

        <p>As you can see, there’s a lot of ground to be covered. Before getting into the trenches of distributed training, let’s take a quick high-level look at what we’ll cover in this post.</p>

        <h2>TL;DR</h2>

        <p>This book is very extensive, so we decided to start with a very general overview of how you can think about distributed training. At a high level, the key challenge in scaling LLM training is to make a training step (forward/backward/optimizer step) with a large batch size as fast as possible.</p>
        <p>When scaling up models and input batches, we quickly end up in situations where either our target batch size won't fit in memory, and/or the model itself is too large to fit in a single GPU's memory.</p>
        <p>To solve this scaling issue we’ll need to carefully evaluate different parallelization strategies and find the optimal balance between three main factors:</p>
        <ol>
        <li><strong>Memory Usage</strong><ul>
        <li>Hard limitation - if a training step doesn't fit in memory, training cannot proceed</li>
        <li>Sometimes we can increase compute (e.g. recomputation) or increase communication (e.g. ZeRO) to reduce memory usage</li>
        </ul>
        </li>
        <li><strong>Compute Efficiency</strong><ul>
        <li>Memory transfer can also decrease compute efficiency.</li>
        <li>We want our hardware to spend most time computing, so we need to reduce time spent on data transfers or unoptimized kernels.</li>
        <li>GPUs need sufficient workload (large enough matrices/batch sizes) to maintain high utilization (compute-bound) otherwise they become memory-bound (limited by memory bandwidth).</li>
        </ul>
        </li>
        <li><strong>Communication overhead</strong><ul>
        <li>Two main types. For GPUs: intra-node (NVLink TODO: bandwidth) and inter-node (network TODO: bandwidth)</li>
        <li>Two main attributes: base latency and peak bandwidth. Base latency is a constant overhead that makes us want to do as few communications as possible, while peak bandwidth controls how fast we can move data between GPUs</li>
        <li>We prioritize using the fastest communication channels (like NVLink) for operations that occur frequently and/or block computation (e.g. tensor parallelism)</li>
        <li>We want to minimize communication overhead as it keeps GPUs idle, so we try to overlap communication with compute as much as possible</li>
        </ul>
        </li>
        </ol>
        <p>But let’s not get too far ahead of ourselves and scale progressively. To guide you along the journey and as a practical reference, we summarized the key concepts in a cheatsheet:</p>
        <p>[TODO: ADD CHEATSHEET]</p>
        <p>Now that we’ve nailed a few key concepts and terms, let’s get started by revisiting the basic training steps of an LLM!</p>
                
        <h2>First Steps: Training on one GPU</h2>
        
        <p>Let’s start by quickly reviewing the very basics of model training before we start to scale to many GPUs. When a model is trained on a single GPU, the training typically consists of three steps: </p>

        <ol>
            <li>a forward pass which passes inputs through the model to yield its outputs,</li>
            <li>a backward pass to compute the gradients, and</li>
            <li>an optimization step using the gradients to update the parameters</li>
        </ol>

        <p>It looks generally like this: </p>
        <p><img alt="image.png" src="assets/images/placeholder.png" /></p>

        <aside>As we’ll see later, these steps may be repeated or intertwined but for now we’ll start simple.</aside>

        <p>In this figure, the boxes on the top line can be seen as successive layers inside a model (same for the last line). The red boxes are the associated gradients for each of these layers, computed during the backward pass.</p>

        <p>The batch size (<d-math>bs</d-math>) is one of the important hyper-parameters for model training and affects both model convergence and throughput.</p>

        <p>If the batch size is too small, gradients will tend to be noisy and the model may not be able to converge to its best performance; on the other hand, noisy gradients can be useful in early training to navigate quickly through the training landscape. A batch size that is too large, in contrast, will make less use of each training token, slowing convergence and potentially wasting compute. You can find a nice discussion of this topic in OpenAI’s paper on large batch training<d-cite bibtex-key="mccandlish2018largebatchtraining"></d-cite> or Section 4.2 of the MiniMax-01 <a href="https://filecdn.minimax.chat/_Arxiv_MiniMax_01_Report.pdf">technical report</a>.</p>

        <aside>For instance, during DeepSeek-V3/R1 training “the batch size is gradually increased from 3072 to 15360 in the training of the first 469B tokens, and then keeps 15360 in the remaining training”.</aside>

        <p>Batch size also affects the time it takes to train on a given text dataset: a small batch size will require more optimizer steps to train on the same number of samples. Optimizer steps are costly (in compute time), so the total time to train will increase compared to using a larger batch size. That being said, note that the batch size can often be varied fairly widely around the optimal batch size without a major impact on model performance, i.e. the sensitivity of final model performance to the exact batch size value is usually rather low near the optimum.</p>

        <p>In the LLM pretraining community, batch sizes are commonly reported in terms of tokens rather than number of samples (<d-math>bst</d-math> = batch size in tokens), which makes training numbers generally independent of the exact input sequence length used during training.</p>

        <p>In the simplest case, training on a single machine, the <d-math>bs</d-math> (in samples) and <d-math>bst</d-math> can be computed from the model input sequence length (seq) as follows:</p>

        <aside><p>From here onward we’ll show the formulas for the batch size in terms of samples but you can always get its token-unit counterpart by multiplying it with the sequence length.
        </aside>

        <d-math block>
        bst = bs * seq
        </d-math>

        <p>A sweet spot for recent LLM training is typically on the order of 4-60 million tokens per batch. However, a typical issue when scaling our model training to these large batch sizes is running out of memory, i.e. our GPU doesn’t have enough memory.</p>

        <aside>Note: Llama 1 was trained with a batch size of ~4M tokens for 1.4 trillion tokens while DeepSeek was trained with a batch size of ~60M tokens for 14 trillion tokens.
        </aside>

        <p><strong>It’s time to tackle our first scaling problem: what if our model starts exploding GPU memory before we’ve reached our target batch size (maybe in some cases even when using the lowest possible batch size, <code>bs=1</code>)?</strong></p>

        <p>Let’s start by quickly understanding what led to our out-of-memory issue in the first place. This will help us gain some useful intuitions for later.</p>

        <h3>Memory usage in Transformers</h3>

        <p>When training a neural network model, we store several items in memory:</p>

        <ul>
            <li>Model weights</li>
            <li>Activations needed to compute the gradients</li>
            <li>Model gradients</li>
            <li>Optimizer states</li>
        </ul>
        
        <aside >You would think that you could compute the memory requirements for a model exactly, but there are a few additional memory occupants that make it hard to be exact:
        <ul>
            <li>CUDA Kernels typically require 1-2 GB of GPU memory, which you can quickly verify by running <code>import torch; torch.ones((1, 1)).to("cuda")</code> and then checking the GPU memory with <code>nvidia-smi</code>.</li>
            <li>Some residual memory usage from buffers, intermediate results and some memory that can’t be used due to fragmentation</li>
        </ul>
        We’ll neglect these last two contributors as they are typically small and constant factors.</aside>

        <p>These items are stored as tensors which come in different <em>shapes</em> and <em>precisions</em>. The <em>shapes</em> are determined by hyper-parameters such as batch size, sequence length, model hidden dimensions, attention heads, vocabulary size, and potential model sharding as we’ll see later. <em>Precision</em> refers to formats like FP32, BF16, or FP8, which respectively require 4, 2, or 1 byte to store each single value in the tensor.</p>

        <p>So how can we quickly determine memory usage from these variables? One simple way is to just measure it empirically.</p>

        <h4>Memory profiling a training step</h4>

        <p>Using this snippet [TODO: link to appendix A5], we can understand how memory is allocated throughout training. We can see that memory utilization is not static but varies a lot both over the course of training and within a single training step:</p>
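        <p>A minimal sketch of such a profiling snippet, using PyTorch’s (semi-private) CUDA memory snapshot API and assuming a Hugging Face-style model that returns a <code>loss</code>, could look like this:</p>

        <d-code block language="python">
            import torch

            def profile_training_step(model, optimizer, batch, n_steps=3):
                # Record every CUDA allocation/free so we can inspect them later.
                torch.cuda.memory._record_memory_history(max_entries=100_000)
                for _ in range(n_steps):
                    loss = model(**batch).loss     # forward: activations pile up
                    loss.backward()                # backward: gradients build up, activations are freed
                    optimizer.step()               # optimizer states appear after the first step
                    optimizer.zero_grad(set_to_none=True)
                # Dump a snapshot viewable at https://pytorch.org/memory_viz
                torch.cuda.memory._dump_snapshot("memory_snapshot.pickle")
                torch.cuda.memory._record_memory_history(enabled=None)</d-code>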

        <p><img alt="llama-1b-memory.png" src="assets/images/placeholder.png" /></p>

        <p>Clearly the first step looks very different from the subsequent ones, but let’s first have a look at the general anatomy of a step: first the activations increase quickly as we do the forward pass, then during the backward pass the gradients build up and as the backward pass propagates, the stored activations used to compute the gradients are progressively cleared. Finally, we perform the optimization step during which we need all the gradients and then update the optimizer states before we start the next forward pass. </p>

        <p>Why does the first step look different? The activations increase quickly and then plateau for a while. In this first step the torch caching allocator does a lot of preparation, setting up memory allocations so that the subsequent steps don’t require searching for free memory blocks (see <a href="https://zdevito.github.io/2022/08/04/cuda-caching-allocator.html">Zach’s blog</a>). After the first step we also see the optimizer states appearing, which generally offset the memory usage for further training steps.</p>

        <aside>Ever noticed how sometimes the training succeeds in the first step but then OOMs during the following training steps? This can be explained by the build-up of the optimizer state after the first step.
        </aside>

        <p>Now that we’ve had a first view of memory, let’s see how scaling up training is often a question of maximizing compute efficiency while keeping the memory requirements of these various items (activations, parameters, gradients, optimizer states) within the memory constraints of the GPUs.</p>

        <h4>Weights/grads/optimizer states memory</h4>

        <p>We can actually pretty easily estimate the memory needed for the model’s weights, gradients and optimizer states.</p>

        <p>For a simple transformer LLM the number of parameters is given by the <a href="https://michaelwornow.net/2024/01/18/counting-params-in-transformer">following formula</a>:</p>

        <d-math block>
            N = h * v + L * (12 * h^2 + 13 * h) + 2*h
        </d-math>

        <aside>We excluded the positional embedding count as rotary embeddings are not learned.</aside>

        <p>In that equation, <d-math>h</d-math> is the hidden dimension, <d-math>v</d-math> the vocabulary size, and <d-math>L</d-math> the number of layers in the model. Note that looking at the equation we can see that the term that will dominate at large hidden dimensions is the <d-math>h^2</d-math> term since it’s the only one growing quadratically as we scale the parameters.</p>

        <p>Memory requirements for the parameters and gradients are simply the number of parameters multiplied by the number of bytes per parameter. In good old-fashioned full precision (FP32) training, both parameters and gradients require 4 bytes, while the optimizer, if we use Adam, requires the momentum and variance to be stored, which adds another 4 bytes each per parameter. In summary:</p>

        <d-math block>
            \begin{aligned}
            & m_{params} = 4 * N \\
            & m_{grad} = 4 * N \\
            & m_{opt} = (4+4) * N
            \end{aligned}
        </d-math>
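
        <p>To make these formulas concrete, here is a small helper that plugs in the numbers; the Llama-3-8B-like hyper-parameters are just an illustrative assumption, and the result roughly matches the 7B row of the table below:</p>

        <d-code block language="python">
            def n_params(h, v, L):
                # N = h*v + L*(12*h^2 + 13*h) + 2*h  (no learned positional embeddings)
                return h * v + L * (12 * h**2 + 13 * h) + 2 * h

            def fp32_training_memory_gb(N):
                # params (4B) + grads (4B) + Adam momentum and variance (4B + 4B)
                return (4 + 4 + 4 + 4) * N / 1e9

            N = n_params(h=4096, v=128256, L=32)   # illustrative config
            print(f"{N / 1e9:.1f}B params, ~{fp32_training_memory_gb(N):.0f} GB for weights/grads/optimizer")</d-code>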

        <p>Now let’s have a look at how things change if we train with mixed precision<d-cite bibtex-key="micikevicius2018mixedprecisiontraining"></d-cite>. The default for mixed precision training nowadays is BF16, which requires 2 bytes per parameter and per gradient, as well as an additional copy of the model weights and gradients in FP32, thus 12 bytes per parameter in total. In addition to the parameters and gradients, we need to store the optimizer states: for the Adam optimizer, these are the momentum and the variance, usually stored in FP32 for numerical stability, each using 4 bytes. </p>

        <aside>See some more details below when we cover the ZeRO methods.</aside>

        <p>Here’s the summary:</p>
        
        <d-math block>
            \begin{aligned}
                & m_{params} = 2 * N \\ 
                & m_{grad} = 2 * N \\ 
                & m_{params\_fp32} = 4 * N  \\ 
                & m_{opt} = (4+4) * N
            \end{aligned}
        </d-math>

        <aside>Some libraries store gradients in FP32, which would require an additional $m_{grad\_fp32} = 4 * N$ of memory. This is done for example in nanotron, because <code>bf16</code> is lossy for smaller values and we always prioritize stability. See <a href="https://github.com/microsoft/DeepSpeed/issues/1773">this DeepSpeed issue</a> for more information.
        </aside>

        <p>Interestingly, mixed precision itself doesn’t save overall memory, as it just distributes the memory differently across the three components, and it in fact adds another 4 bytes over full precision training if we accumulate gradients in FP32. It’s still advantageous though: running the forward/backward passes in half precision allows us to (1) use optimized lower-precision operations on the GPU, which are faster, and (2) reduce the activation memory requirements during the forward pass.</p>

        <p>Let’s get a sense of how much general memory we need for a model (full and mixed precision giving the same overall value):</p>

        <table>
        <thead>
            <tr>
            <th><strong>Model parameters</strong></th>
            <th><strong>FP32 or BF16 w/o FP32 grad acc</strong></th>
            <th><strong>BF16 w/ FP32 grad acc</strong></th>
            </tr>
        </thead>
        <tbody>
            <tr>
            <td>1B</td>
            <td>16 GB</td>
            <td>20 GB</td>
            </tr>
            <tr>
            <td>7B</td>
            <td>112 GB</td>
            <td>140 GB</td>
            </tr>
            <tr>
            <td>70B</td>
            <td>1120 GB</td>
            <td>1400 GB</td>
            </tr>
            <tr>
            <td>405B</td>
            <td>6480 GB</td>
            <td>8100 GB</td>
            </tr>
        </tbody>
        </table>

        <aside><p>Using FP8 training instead of BF16 would further decrease the memory usage but it is less stable and a very active research topic (see <a href="https://x.com/xariusrke/status/1826669126955278401">this tweet</a>) and we’ll cover it in more detail later.
        </aside>

        <p>As we can see, as soon as we reach <strong>7B</strong> (!), the weight and optimizer requirements already start to add up significantly and exceed the size of a typical GPU’s memory, e.g. 80GB for an H100 GPU.</p>
        
        <p>But for now, let’s stick with models which still fit in a single GPU and take a look at the other big contributor to our memory budget: the activation memory.</p>

        <h4>Activations memory</h4>
        
        <p>Activation memory is a bit more complex to compute than the weights, gradients and optimizer states, in part because it depends on the inputs of the model. If you’re unsure why we even need to store activations for the backward pass, <a href="https://www.determined.ai/blog/act-mem-2">this reference</a> is a good quick refresher. After a careful inspection of how the backward pass is computed, we can estimate the total memory required for the activations in mixed precision and arrive at the following equation:</p>

        <d-math block>
            m_{act} = L * seq * bs * h * \left(34 + \frac{5 * n_{heads} * seq}{h}\right)
        </d-math>

        <p>Here <d-math>L</d-math> is the number of layers, <d-math>seq</d-math> the sequence length, <d-math>bs</d-math> the batch size in samples, <d-math>h</d-math> the hidden dimension of the model and <d-math>n_{heads}</d-math> the number of heads.</p>

        <p>For the exact derivation of the numbers, you can follow the original NVIDIA paper on recomputation <d-cite bibtex-key="korthikanti2022recomputation"></d-cite>; it essentially requires you to do some accounting of all the sizes of intermediate activations between each operation in a transformer layer.</p>

        <p>An interesting observation here is that the memory is not static for a given model but scales linearly with both the sequence length and the batch size. This means the activation memory is the part which will blow up when we increase our batch size or train with longer sequences. We can use this equation to look at how memory usage changes for various sequence lengths, for example for Llama models (<code>bs=1</code>):</p>
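        <p>As a quick sanity check, the formula above is easy to evaluate directly; the configuration values below are illustrative assumptions for an 8B-scale model:</p>

        <d-code block language="python">
            def activation_memory_gb(L, seq, bs, h, n_heads):
                # m_act = L * seq * bs * h * (34 + 5 * n_heads * seq / h), estimate in bytes
                return L * seq * bs * h * (34 + 5 * n_heads * seq / h) / 1e9

            # Illustrative 8B-like config: L=32, h=4096, n_heads=32, bs=1
            for seq in (2_048, 8_192, 32_768):
                print(seq, f"{activation_memory_gb(L=32, seq=seq, bs=1, h=4096, n_heads=32):.1f} GB")</d-code>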

        <p><img alt="llama-memory-bars-no-recomp.png" src="/assets/images/placeholder.png" /></p>

        <p>This graph tells a striking story: for short sequences (or similar for small batch-sizes), activations are almost negligible, but starting at around 2-4k tokens they come to take a significant amount of memory while parameter, gradient and optimizer states usage (that we’ll discuss later) stays roughly independent of the sequence length and batch size.</p>

        <p><strong>For large input tokens (a.k.a large batch-sizes/sequences), activations become by far the largest memory burden.</strong> </p>

        <p>Is there a way to tame this “activation explosion”? Good question, reader!</p>

        <p>It’s time to explain our first technique – called <strong><em>activation recomputation</em></strong> – which will help us cap the activation memory footprint. It is an essential tool in today’s large model training toolbox.</p>

        <h3>Activation recomputation</h3>
        
        <p>The general idea behind <strong><em>activation recomputation</em></strong> – also called <em>gradient checkpointing</em> or <em>rematerialization</em> – is to discard some activations during the forward pass to save memory and spend some extra compute to recompute them on the fly during the backward pass. Without recomputation, we store every hidden state between two learnable operations (e.g. FF, LayerNorm etc.), so that we can use them during the backward pass to compute gradients. When we use recomputation, we typically only store activations at a few key points along the model architecture, discard the rest of the activations and recompute them on the fly during the backward pass from the nearest saved activations, basically performing a sub-part of the forward pass again to trade off memory for compute. It generally looks like this:</p>

        <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>

        <p>There are several strategies to select key activations to store:</p>

        <ul>
            <li><strong>Full</strong>: We checkpoint activations at the transition point between each layer of the Transformer model. This is usually called the <code>full</code> strategy since it requires a forward pass through each layer essentially adding a full forward pass during the backward pass. This strategy saves the most memory but is the most expensive one in terms of compute. It generally increases the compute cost and time by up to 30-40% which is very noticeable.</li>
            <li><strong>Selective</strong>: In general we can do better than full. The authors of the recomputation paper<d-cite bibtex-key="korthikanti2022recomputation"></d-cite> did a detailed analysis studying which activations grow the largest and have the cheapest recomputation cost in terms of FLOPs. Turns out that the attention computations fall in that category, and thus we can usually discard them and focus on checkpointing the expensive feedforward computations. For a GPT-3 (175B) model this means <strong>70% activation memory reduction at a 2.7% compute cost</strong>.</li>
        </ul>

        <aside>In recent models like DeepSeek V3, selective checkpointing is performed, storing an even smaller set of attention activations – using so-called “Multi-Head Latent Attention” (MLA) – to optimize activation memory usage.</aside>

        <p>Let’s see how drastically recomputation strategies can reduce the memory footprint in practice, and how selective recomputation strikes a nice balance between memory saving and recomputation cost:</p>

        <p><img alt="llama-8b-memory-bars--recomp.png" src="/assets/images/placeholder.png" /></p>

        <aside>When you’re measuring how efficiently your training setup uses the accelerator’s available compute, you may want to take recomputation into account when counting the total floating point operations (FLOPs) of a training step and compare it to the theoretical maximum FLOPS (operations per second) of your GPU/TPU/accelerator to estimate utilization. Counting the recomputed operations gives a value called “hardware FLOPs”, which is the real number of operations performed on the accelerator. Dividing this number by the duration of one training step and by the maximum accelerator FLOPS yields the <em>Hardware FLOPS Utilization (HFU)</em>. </aside>
        
        <aside>However, when comparing various accelerators, what really matters at the end of the day is the start-to-end time needed to train the same model on the same dataset, i.e. if an accelerator allows you to skip recomputation and thus perform fewer operations per second while training faster, it should be rewarded. Thus, an alternative is to compute what is called the <em>Model FLOPS Utilization (MFU)</em>, which, in contrast to HFU, only accounts for the operations required to compute the forward and backward passes, and not recomputation, i.e. it is specific to the model, not the training implementation.</aside>

        <p>Most training frameworks these days use FlashAttention (which we’ll cover a bit later), which natively integrates activation recomputation in its optimization strategy by recomputing attention scores and matrices in the backward pass instead of storing them. Thus most people using Flash Attention are already making use of selective recomputation.</p>

        <p><strong>As you’ve now understood, activation recomputation increases the number of FLOPs slightly due to recomputation, while it significantly reduces memory access overhead.</strong> </p>

        <p>This trade-off is particularly advantageous on hardware with small high-speed memory, like GPUs, as accessing memory is typically slower than performing computations. Despite the additional operations involved, the overall effect is thus often faster computation as well, in addition to the much lower memory footprint.</p>

        <p>Now that we’ve learned about recomputation, we can tame the activations memory usage as we saw in the above graphs!</p>

        <p>However, activation memory still bears a linear dependence on the batch size, and all our profiles in the barplots above were using <code>bs=1</code>, so as we move to larger batch sizes it might become an issue again. Do not despair as we have a second tool in our box - <strong><em>gradient accumulation</em></strong> to the rescue!</p>

        <h3>Gradient accumulation</h3>

        <p>Now that we’ve used activation recomputation to fit our model with a small batch size on a single GPU, we still need to reach our target batch size, let’s say 1M tokens (see our earlier discussion on optimal batch size). Gradient accumulation is a very straightforward method to avoid memory explosion when doing this.</p>

        <p>With <em>gradient accumulation</em> we split our batch into micro-batches, do forward and backward passes repeatedly on each micro-batch, compute the gradients, and, as the name suggests, sum the gradients for each micro-batch before doing a final optimizer step. In practice, we perform the optimization step not on the sum but on the average of the gradients, so the result is independent of the number of gradient accumulation steps.</p>

        <p>Let’s call the batch size for each forward pass the <code>micro batch size</code> (mbs). We’ll refer to the overall batch size between each optimizer step as the <code>global batch size</code> (gbs). If we do one optimizer step for each 8 forward/backward passes, the <code>global batch size</code> will be 8 times the <code>micro batch size</code>.</p>

        <p>What we now call <code>global batch size</code> thus corresponds to what we’ve called up to now just <code>batch size</code> for simplicity (we now make our terms more precise to avoid ambiguity).</p>

        <p>With gradient accumulation the global batch size can be simply computed as follows:</p>

        <d-math block>
            bs = gbs = mbs \times grad\_acc 
        </d-math>

        <p>Gradient accumulation allows us to effectively increase our batch size up to infinity (and beyond!) while the memory footprint stays constant. Gradient accumulation is also compatible with activation recomputation for further memory reduction. One drawback however, is that gradient accumulation requires multiple consecutive forward/backward passes per optimization step thereby increasing the compute overhead and slowing down training. No free lunch! </p>
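
        <p>As a minimal sketch in plain PyTorch (the model, optimizer, dataloader and the Hugging Face-style <code>.loss</code> output are assumptions), gradient accumulation looks like this:</p>

        <d-code block language="python">
            grad_acc_steps = 8  # gbs = mbs * grad_acc_steps

            for step, batch in enumerate(dataloader):
                loss = model(**batch).loss
                # Scale the loss so the accumulated gradient is the *average*
                # over micro-batches rather than the sum.
                (loss / grad_acc_steps).backward()

                if (step + 1) % grad_acc_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad(set_to_none=True)</d-code>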

        <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>

        <p><strong>Gradient accumulation allows us to reduce activation memory, which grows linearly with batch size, by computing only partial micro-batches at a time. There is a small overhead caused by the additional forward and backward passes.</strong></p>

        <aside>Using gradient accumulation means we need to keep buffers where we accumulate gradients, and these persist throughout a training step. Without gradient accumulation, gradients are computed during the backward pass while the activation memory is being freed, which means a lower peak memory.</aside>

        <p>But if you’ve carefully followed, you probably noticed that the forward/backward passes for each micro-batch can actually be run in parallel. Forward/backward passes are independent from each other, with independent input samples being the only difference. Seems like it’s time to start extending our training to more than one GPU! </p>

        <p>Let’s get a larger workstation 🖥️  with a couple of GPUs and start investigating our first scaling technique called <em><strong>data parallelism</strong> which is just a parallel version of gradient accumulation</em>.</p>

        <p><strong>TODO: add profiling here or not?</strong></p>

        <h2>Data Parallelism</h2>
        
        <p>The idea behind data parallelism (DP) is to replicate the model on several GPUs (we call the replicas “model instances”) and run forward and backward passes on different micro-batches of data in parallel on each GPU, hence the name data parallelism. </p>

        <p>Using a different micro batch for each GPU means we’ll have different gradients in each GPU, so to keep the model instances in sync across different GPUs, the gradients from the model instances are averaged using an operation called “all-reduce”, which happens during the backward pass, before the optimizer step.</p>

        <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>

        <p>This involves our first “distributed communication” primitive: <em><strong>all-reduce</strong></em>, which handles the synchronization and communication between GPU instances and nodes.</p>

        <aside>If you are not familiar with distributed communications patterns like broadcast, gather or all-reduce we put together a small crash course in the Appendix [TODO Link].</aside>

        <p>TODO: embed naive DP: <a href="https://github.com/huggingface/picotron/blob/0035cce0e04afd6192763b11efe50010d8ad0f71/picotron/data_parallel/data_parallel.py#L10-L60">https://github.com/huggingface/picotron/blob/0035cce0e04afd6192763b11efe50010d8ad0f71/picotron/data_parallel/data_parallel.py#L10-L60</a></p>

        <p>TODO: embed bucket DP: <a href="https://github.com/huggingface/picotron/blob/0035cce0e04afd6192763b11efe50010d8ad0f71/picotron/data_parallel/data_parallel.py#L62-L171">https://github.com/huggingface/picotron/blob/0035cce0e04afd6192763b11efe50010d8ad0f71/picotron/data_parallel/data_parallel.py#L62-L171</a></p>

        <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>

        <p>A naive DP implementation would just wait for the backward pass to finish so that we have all the gradients, then trigger an all-reduce over all DP ranks to sync these gradients. But such a sequential pattern of computation followed by communication is <strong>A BIG NO!</strong> because we don’t want our GPUs to stay idle while communication is happening.</p>
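        <p>In code, the core of this naive approach is just an all-reduce of every gradient once the backward pass has finished; here is a bare-bones sketch with <code>torch.distributed</code> (not the actual picotron implementation linked above):</p>

        <d-code block language="python">
            import torch.distributed as dist

            def all_reduce_gradients(model):
                # Naive DP: wait for the full backward pass, then average every
                # gradient across all data-parallel ranks before optimizer.step().
                world_size = dist.get_world_size()
                for param in model.parameters():
                    if param.grad is not None:
                        dist.all_reduce(param.grad, op=dist.ReduceOp.SUM)
                        param.grad /= world_size</d-code>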

        <p>Instead, we should try to overlap communication and computation so that they happen at the same time as much as possible.</p>

        <p>Let’s see three optimizations that are done in practice for this! </p>

        <h4><strong>First optimization:</strong> Overlap gradient synchronization with backward pass</h4>

        <p>The main drawback of the naive DDP approach we’ve just described is that after the backward pass (<em>computation</em>), we have to wait for gradient synchronization (<em>communication</em>) before updating the parameters. Could we overlap this communication with our computation? The answer is yes!</p>

        <p>As shown in the figure above, the gradients (red boxes) for a layer can be gathered and summed even before the gradients from earlier layers (red boxes to the left) have been computed. For example, as soon as the backward pass of the last layer is complete (last box on the right), those gradients can already be gathered and summed while the backward computations continue for earlier layers, moving toward the left.</p>

        <p>This can be achieved in pytorch by attaching an <em>all-reduce hook function</em> to each parameter. An all-reduce operation is triggered as soon as the gradient for that parameter is ready, while gradients for other parameters are still being computed. This approach overlaps most of the all-reduce operations with gradient calculations, thereby improving efficiency. Here's a simple function to attach a hook:</p>

        <d-code block language="python">
            def register_backward_hook(self, hook):
                """
                Registers a backward hook for all parameters of the model that 
                require gradients.
                """
                for p in self.module.parameters():
                    if p.requires_grad is True:
                        p.register_post_accumulate_grad_hook(hook)</d-code>
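
        <p>For illustration, the hook itself can launch an asynchronous all-reduce as soon as a gradient is ready; the sketch below is an assumption of what such a hook could look like, not the exact picotron/nanotron code:</p>

        <d-code block language="python">
            import torch
            import torch.distributed as dist

            handles = []  # async work handles, waited on before optimizer.step()

            def allreduce_hook(param: torch.Tensor):
                # Called as soon as this parameter's gradient is accumulated:
                # launch a non-blocking all-reduce so the communication overlaps
                # with the backward computation of the earlier layers.
                handles.append(dist.all_reduce(param.grad, op=dist.ReduceOp.SUM, async_op=True))

            def finalize_gradients(model):
                # Wait for all pending all-reduces, then average the gradients.
                for handle in handles:
                    handle.wait()
                handles.clear()
                for p in model.parameters():
                    if p.grad is not None:
                        p.grad /= dist.get_world_size()</d-code>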

        <p><img alt="image.png" src="/assets/images/placeholder.png"/></p>

        <p>Overlapping computation and communication reduces the time spent waiting for gradient synchronization across the entire model. Gradient synchronization can occur (at least partially) in parallel with the backward pass, significantly speeding up data parallelism.  </p>

        <p>This is our first example of “<em>overlapping computation and communication</em>”, which we will discuss several times in this blog post and which is an essential technique for maximizing scaling efficiency. Let's have a look at how we can further improve the DP efficiency!</p>


        <h4><strong>Second optimization:</strong> Bucketing gradients</h4>
        
        <p>We can even go further with optimizing DP. For a given number of parameters to synchronize, GPU operations like collective communications are often more efficient when performing few calls on large tensors rather than many calls on smaller tensors. Therefore, instead of performing independent all-reduce for each gradient, we can group gradients into buckets and launch a single all-reduce for all the gradients within the same bucket. Think of it like packing items into boxes before shipping—it's more efficient to send a few big boxes than many small ones. By performing a single all-reduce operation for each bucket, we can significantly reduce communication overhead and speed up the communication operation.</p>

        <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>

        <h4><strong>Third optimization: </strong>Interplay with gradient accumulation</h4>

        <p>As we’ve seen before, gradient accumulation works by performing multiple forward and backward passes before updating the parameters with <code>optimizer.step()</code>. When combining gradient accumulation with data parallelism, we should be careful when we want to synchronize gradients.</p>

        <p>In a naive version, an all-reduce operation is automatically triggered after each backward pass during the accumulation, which is sub-optimal as a single reduce after the final step would have the same effect while reducing overhead.</p>

        <p>In PyTorch, this is typically solved by wrapping the backward passes which don’t need reduction in the <a href="https://github.com/pytorch/pytorch/blob/5ea67778619c31b13644914deef709199052ee55/torch/nn/parallel/distributed.py#L1408-L1435"><code>model.no_sync()</code></a> context manager, which disables gradient synchronization.</p>
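        <p>Combined with a <code>DistributedDataParallel</code>-wrapped model, a gradient accumulation loop then typically looks like the following sketch (only the last micro-batch triggers the all-reduce; model, dataloader, optimizer and <code>grad_acc_steps</code> are assumed to exist):</p>

        <d-code block language="python">
            from contextlib import nullcontext

            for step, batch in enumerate(dataloader):
                is_last_micro_batch = (step + 1) % grad_acc_steps == 0
                # Skip the gradient all-reduce on all but the last micro-batch.
                sync_context = nullcontext() if is_last_micro_batch else model.no_sync()
                with sync_context:
                    loss = model(**batch).loss
                    (loss / grad_acc_steps).backward()
                if is_last_micro_batch:
                    optimizer.step()
                    optimizer.zero_grad(set_to_none=True)</d-code>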

        <aside>When performing communication operations, tensors must be contiguous in memory. To avoid redundant memory copies during communication, ensure that tensors that will be communicated are stored contiguously in memory. Sometimes we need to allocate additional contiguous buffers of the size of the activations or model parameters specifically for communication, which contributes to the peak memory usage during training.</aside>

        <h3>Revisit global batch size</h3>
        <p>Let’s update our batch size equation with our newly learned Data Parallelism and Gradient Accumulation parameters:</p>

        <d-math block>
            bs = gbs = mbs \times grad\_acc \times dp
        </d-math>
        <p>Where <d-math>grad\_acc</d-math> is the number of gradient accumulation steps and <d-math>dp</d-math> is the number of parallel instances used for data parallelism.</p>

        <p>Given a targeted global batch size, we can thus trade gradient accumulation steps for data-parallel processes to speed up training. In practice, people tend to maximize the number of data-parallel instances (DP) over gradient accumulation as much as possible, since data parallelism is inherently parallel, unlike the sequential nature of gradient accumulation. Gradient accumulation is then added on top of data parallelism to achieve the target global batch size when data parallelism alone is not sufficient, i.e. when you run out of GPUs.</p>

        <aside>A good resource for further reading on Data Parallelism is <a href="https://siboehm.com/articles/22/data-parallel-training">https://siboehm.com/articles/22/data-parallel-training</a>.
        </aside>

        <p>Being able to distribute the training over different samples gives us a first dimension of parallelization, thus making this 1D parallelism (we’ll progressively cover 4 more dimensions).</p>

        <h3>Our journey up to now</h3>
        <p>Let’s quickly summarize what we’ve seen up to now and how to setup our first 1D parallel training with a draft recipe for an optimal data-parallel setup:</p>

        <ol>
            <li>We should first determine the best (global) batch size in tokens (<code>GBST</code>) either by consulting literature or running experiments measuring model convergence.</li>
            <li>We then select a sequence length for training, again by either consulting literature or running experiments. Generally, 2-8k tokens work reliably well for the evaluations we have today (we won’t dive into training recipes here, but teams usually increase the sequence length at the end of training, adding some longer-context data samples to the mix, to reach the longer context sizes of today).</li>
            <li>We now know the batch size (gbs). We can find the maximum local batch size (mbs) on a single GPU by increasing the local batch size until we run out of memory.</li>
            <li>Finally, we determine the number of available GPUs for our target DP. Dividing the GBS by DP times MBS gives us the remaining number of gradient accumulation steps needed for the desired GBS. </li>
        </ol>

        <aside>For instance DeepSeek and Llama models are trained with a 4k tokens sequence length during the main pretraining phase.</aside>

        <aside>The reason 2-8k work well for pretraining is that documents that are longer are very rare on the web. See this <a href="https://www.harmdevries.com/post/context-length/">Harm’s blogpost</a> for a detailed analysis.
        </aside>

        <p>If the gradient accumulation ratio is lower than one, i.e. we have too many GPUs a.k.a GPU-rich 🤑 (!), we can either choose to not use all our GPUs, explore a larger global batch size or test if a lower MBS will speed up training. In the latter case we’ll end up prioritizing throughput over individual GPU compute efficiency, using a smaller MBS than possible in order to speed up training.</p>

        <p>Time to take a concrete example: let’s say we want to train a recent model with a GBS of 4M tokens and a sequence length of 4k. This means our batch size will be 1024 samples (we pick powers of two). We observe that a single GPU can only fit MBS=2 in memory, and we have 128 GPUs available for training. This means that with 4 gradient accumulation steps we’ll reach our goal of 1024 samples, or 4M tokens, per training step. Now what if we suddenly have 512 GPUs available? We can achieve the same GBS, and thus identical training, by keeping MBS=2 and setting the number of gradient accumulation steps to 1, which gives us faster training!</p>
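        <p>To make this arithmetic concrete, here is a tiny sketch in plain Python (the helper name is ours, not from any framework) that derives the number of gradient accumulation steps from the target global batch size and checks the numbers of the example above:</p>

        <d-code block language="python">
def grad_acc_steps(gbs_tokens: int, seq_len: int, mbs: int, dp: int) -> int:
    """Gradient accumulation steps needed to reach the target global batch size."""
    gbs_samples = gbs_tokens // seq_len              # global batch size in samples
    assert gbs_samples % (mbs * dp) == 0, "GBS must be divisible by mbs * dp"
    return gbs_samples // (mbs * dp)

# GBS = 4M tokens, sequence length = 4k, mbs = 2 (the example above)
print(grad_acc_steps(4 * 1024**2, 4096, mbs=2, dp=128))  # -> 4 accumulation steps
print(grad_acc_steps(4 * 1024**2, 4096, mbs=2, dp=512))  # -> 1 accumulation step
        </d-code>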

        <aside>Bear in mind that at the 512-GPU scale, depending on the network used, the communication operations will start to be bound by <em>ring latency</em> (the time required for a signal to propagate once around the ring), which means we can no longer fully overlap the DP communications. This will decrease our compute efficiency and hit our throughput. In this case we should start exploring other dimensions to parallelize on.
        </aside>

        <p>While data parallelism cleverly overlaps the all-reduce gradient synchronization with backward computation to save time, this benefit starts to break down at large scales. As we add more and more GPUs (hundreds or thousands), the overhead of coordinating between them grows significantly. The end result? We get less and less efficient returns from each additional GPU we add to the system:</p>

        <p><img alt="image.png" src="/assets/images/placeholder.png"/></p>

        <p>As expected, we can also see that the memory usage per GPU is not affected by adding more DP ranks for training.</p>

        <p><strong>We’ve explored data parallelism, our first (simple) strategy to scale training across more GPUs. It works like gradient accumulation but parallelizes the forward and backward passes on micro batches, thus increasing throughput!</strong></p>
        
        <p>The keen reader has probably already noted, however, that this assumes we can fit at least one input sample’s forward pass (mbs=1) into our GPU memory. This is not always the case! As we can see, larger models don’t fit on a single GPU, even with activation recomputation enabled:</p>

        <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>

        <aside>Tip: you can quickly eyeball the minimum memory required for your model’s parameters by multiplying the parameter count by 2 bytes (bf16), e.g. 70B parameters → 140 GB (≈130 GiB)</aside>

        <p>Do we have other options for these larger models? Thankfully, we do have some solutions. They involve either moving some of these tensors to the CPU, or splitting the weights, gradients, and optimizer states across GPU devices!</p>

        <p>There are two main approaches to splitting: parallelism (tensor, context, or pipeline parallelism) and sharding (DeepSpeed ZeRO or PyTorch FSDP). Both approaches are somewhat orthogonal and can actually be combined! The sharding paradigm is closely related to DP, so we’ll have a look at it first by investigating the ZeRO method!</p>


        <h3>ZeRO (<strong>Ze</strong>ro <strong>R</strong>edundancy <strong>O</strong>ptimizer)</h3>
        
        <h4>Memory usage revisited</h4>
        
        <h4>ZeRO-1: Partitioning Optimizer States</h4>
        
        <h4>ZeRO-2: Adding <strong>Gradient Partitioning</strong></h4>
        
        <h4>ZeRO-3: Adding <strong>Parameter Partitioning</strong></h4>
        

        <h2>Tensor Parallelism</h2>

        <p>So we have sharded the model’s parameters, gradients and optimizer states with ZeRO, but we hit a limit once activation memory overtakes our memory budget. Welcome Tensor Parallelism (TP), a method which shards weights, gradients, and optimizer states as well as activations, without the need to gather them all prior to the computation. Seems like a dream! Let’s first have a look at how Tensor Parallelism works with simple matrix multiplications.</p>

        <p>Tensor Parallelism leverages the mathematical properties of matrix multiplication <d-math>A \times B</d-math>. To understand how it works, let's examine two fundamental equations that make this parallelization possible:</p>

        <d-math block>
            \begin{aligned}
            &\text{1.} \quad A\cdot B = A \cdot \begin{bmatrix} B_1 & B_2 & \cdots \end{bmatrix} = \begin{bmatrix} AB_1 & AB_2 & \cdots \end{bmatrix} \\
            &\text{2.} \quad A\cdot B =\begin{bmatrix} A_1 & A_2 & \cdots \end{bmatrix} \begin{bmatrix} B_1 \\ B_2 \\ \vdots \end{bmatrix} = \sum_{i=1}^n A_i B_i
            \end{aligned}
        </d-math>

        <p>This means that we can compute the matrix product either by 1) multiplying <d-math>A</d-math> with each column of <d-math>B</d-math> individually, or 2) splitting <d-math>A</d-math> into columns and <d-math>B</d-math> into rows, multiplying each pair, and summing the results. In a neural network, the matrix multiplication is more often represented in the following format: <d-math>X \times W</d-math>, where:</p>

        <ul>
            <li>X represents the input or activation values</li>
            <li>W represents the weight of the <code>nn.Linear</code></li>
        </ul>

        <p>In practice a small example of the operation looks like this:</p>

        <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>

        <p>Let’s see how we can parallelize this operation! In tensor parallelism, tensors will be split into N shards along a particular dimension and distributed across N GPUs. The weight matrix can be split either along its columns or along its rows, leading to column and row parallelism respectively. As we’ll see in the following, choosing row or column sharding requires different communication primitives.</p>

        <p>Our first option is to use column-wise sharding (also called <strong><em>column-linear</em></strong>): We'll copy the complete input matrices to each worker, requiring an operation called <strong><em>broadcast</em></strong>, and split the weight matrix into columns. The inputs are then multiplied with the partial weight matrices, and the results are finally combined using an <strong><em>all-gather</em></strong> operation.</p>

        <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>

        <p>The second option is called row-wise sharding (also called <strong><em>row-linear</em></strong>): As the attentive reader might guess, row-linear means that we split the weight matrix into chunks of rows. However, this also requires us to split the inputs, which needs a <strong><em>scatter</em></strong> operation rather than a broadcast as used in column-linear sharding. The results on each worker are already in the right shape but need to be summed for the final result, thus requiring an all-reduce operation in this scenario.</p>

        <p>We see here our fourth distributed primitive: <strong><em>scatter</em></strong>!</p>

        <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>

        <h3>Tensor Parallelism in a Transformer Block</h3>
        
        <p>To come up with a strategy to follow, let’s move from a toy example to a real model building block. A Transformer model is made of two main building blocks: Feedforward layers (MLP) and Multi-Head Attention (MHA). We can apply tensor parallelism to both.</p>

        <p>The Feedforward part can be parallelized by having a “Column Linear” followed by a “Row Linear”, which amounts to a broadcast to copy the input and an all-reduce in the forward pass. Note that the broadcast isn’t needed in actual training, where we can make sure inputs are already synced across TP ranks.</p>

        <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>

        <p>Now that we’ve found an efficient scheme for the Feedforward part of the transformer, let’s take a look at the multi-head attention block (MHA).</p>

        <p>We can generally follow a similar approach where Q, K, and V matrices are split in a column-parallel fashion, and the output projection is split along the row dimension. With multi-head attention, the column-parallel approach has a very natural interpretation: each worker computes the attention for an individual head or a subset of heads. The same approach works as well for <a href="https://arxiv.org/abs/1911.02150"><strong><em>multi-query</em></strong> (MQA)</a> or <a href="https://arxiv.org/abs/2305.13245"><strong><em>grouped query attention</em></strong> (GQA)</a>, where keys and values are shared between queries.</p>

        <p>It's also worth noting that the tensor parallelism degree should not exceed the number of Q/K/V heads, because we need intact heads on each TP rank. And in case we’re using GQA, the TP degree should not exceed the number of K/V heads, otherwise additional communication is required to keep them in sync. For instance, LLaMA-3 8B has 8 key/value heads, so the tensor parallelism degree should be at most 8; if TP=16 for example, we would need to duplicate each K/V head and make sure the copies stay in sync.</p>

        <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
        
        <p>Finally note that there is a tradeoff in terms of communication, as we’ve added several distributed communication primitives directly in the computation path of our model. Unlike ZeRO, where we could prefetch, it can be harder to make these communications fully overlap with computation.</p>

        <p><img alt="Forward pass in Tensor Parallelism" src="/assets/images/placeholder.png" /></p>

        <p>Looking at the timeline of operations in tensor-parallel MLP (same applies for Attention), we can better understand the tradeoffs involved. In the forward of each decoder layer, we hit a synchronization point with the AllReduce operation that cannot be overlapped with computation. This <em>exposed communication</em> overhead is necessary to combine partial results across tensor-parallel ranks before the final LayerNorm can be applied. </p>

        <p>Tensor parallelism does help reduce activation memory for the matrix multiplications since the intermediate activations are sharded across GPUs. However, we still need to gather the full activations for operations like LayerNorm, which means we're not getting the full memory benefits we could. Additionally, it introduces significant communication requirements that heavily depend on the network infrastructure. The inability to hide this particular AllReduce behind computation means it directly adds to the critical path of forward propagation.</p>

        <p><img alt="Impact of Tensor Parallelism on model performance and batch size capacity: while increasing TP leads to reduced per-GPU throughput (left), it enables processing of larger batch sizes (right), illustrating the trade-off between computational efficiency and memory availability in distributed training." src="/assets/images/placeholder.png" /></p>

        <p>Impact of Tensor Parallelism on model performance and batch size capacity: while increasing TP leads to reduced per-GPU throughput (left), it enables processing of larger batch sizes (right), illustrating the trade-off between computational efficiency and memory availability in distributed training.</p>

        <p>In practice, the communication overhead of tensor parallelism becomes particularly noticeable as we scale beyond 8 GPUs. While tensor parallelism within a single node can leverage fast NVLink interconnects, going across nodes requires slower network connections. As shown in the throughput plot above, we observe significant drops when moving from TP=8 to TP=16, and an even steeper decline from TP=16 to TP=32. This illustrates how communication costs can dominate at higher degrees of parallelism.</p>

        <p>However, tensor parallelism provides important benefits for memory usage by distributing model parameters, gradients, optimizer states and activations (to some extent) across GPUs. Let's examine this effect on a 70B parameter model:</p>

        <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>

        <p>As we can see, increasing tensor parallelism reduces the memory needed for model parameters, gradients and optimizer states on each GPU. While tensor parallelism does help reduce activation memory in attention and feedforward layers by sharding the matrix multiplications across GPUs, we don't get the full memory benefits we could. This is because operations like layer normalization and dropout still require gathering the full activations on each GPU, partially negating the memory savings. We can do better by finding ways to parallelize these remaining operations as well.</p>

        <aside>One interesting note about layer normalization in tensor parallel training - since each TP rank sees the same activations after the all-gather, the layer norm weights don't actually need an all-reduce to sync their gradients after the backward pass. They naturally stay in sync across ranks. However, for dropout operations, we must make sure to sync the random seed across TP ranks to maintain deterministic behavior.
        </aside>

        <p>This raises an interesting question - could we extend tensor parallelism to these remaining operations as well? Indeed, it's possible to parallelize layer norm, dropout and other operations too, which we'll explore next.</p>

        <h3>Sequence Parallelism</h3>
        
        <p>In regions where we apply tensor parallelism (TP), like attention and feedforward layers, each GPU only needs to operate on a portion of the hidden dimension since the weights are sharded. However, operations like layer norm or dropout (which is not used much anymore in LLMs) require access to the full hidden dimension to compute correctly.</p>

        <p>Rather than gathering the full hidden dimension on each GPU (which would defeat the memory benefits of TP), we can instead shard these operations along the sequence length dimension. This approach is called <strong>sequence parallelism (SP)</strong>.</p>

        <aside>Note that the term Sequence Parallelism is a bit overloaded: the Sequence Parallelism in this section is tightly coupled to Tensor Parallelism and applies to the dropout and layer norm operations. However, when we move to longer sequences the attention computation will become a bottleneck, which calls for techniques such as Ring-Attention, which are sometimes also called <em>Sequence Parallelism</em> but we’ll refer to them as <em>Context Parallelism</em> to differentiate the two approaches. So each time you see sequence parallelism, remember that it is used together with tensor parallelism (in contrast to context parallelism, which can be used independently).</aside>

        <p>Sequence parallelism (SP) involves splitting the activations and computations for the parts of the model not handled by tensor parallelism (TP) such as Dropout and LayerNorm, but along the input sequence dimension rather than across hidden dimension. This is needed because these operations require access to the full hidden dimension to compute correctly. For example, LayerNorm needs the full hidden dimension to compute mean and variance:</p>
        
        <d-math block>
            \text{LayerNorm}(x) = \gamma \cdot \frac{x - \mu}{\sqrt{\sigma^2 + \epsilon}} + \beta
        </d-math>

        <p>where <d-math>\mu = \text{mean}(x)</d-math> and <d-math>\sigma^2 = \text{var}(x)</d-math> are computed across hidden dimension <d-math>h</d-math>.</p>

        <p>So even though these operations are computationally cheap, they still require significant activation memory since they need the complete hidden dimension. SP allows us to shard this <strong>memory</strong> burden across GPUs by splitting along the sequence dimension instead.</p>
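        <p>The reason this works is that LayerNorm's statistics are computed over the hidden dimension only, so sequence chunks can be normalized independently. A tiny self-contained PyTorch check (toy sizes, no distributed setup needed) makes this explicit:</p>

        <d-code block language="python">
import torch

b, s, h, tp = 2, 8, 16, 4
x = torch.randn(b, s, h)
ln = torch.nn.LayerNorm(h)

# LayerNorm normalizes over the hidden dimension only, so applying it to sequence
# chunks of shape (b, s/tp, h) and concatenating gives the same result as applying
# it to the full (b, s, h) tensor: this is what lets SP shard along the sequence.
full = ln(x)
chunked = torch.cat([ln(c) for c in x.chunk(tp, dim=1)], dim=1)
print(torch.allclose(full, chunked))  # True
        </d-code>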

        <p>In practice we’ll go from the left diagram to the right:</p>

        <p><img alt=" in forward: f = no-op ; f* = all-reduce ; g = all-gather ; g* = reduce-scatter
            in backward: f = all-reduce ; f* = no-op ; g = reduce-scatter ; g* = all-gather
           SP region needs full hidden_dim" src="/assets/images/placeholder.png" /></p>
        
        <p>in forward: f = no-op ; f* = all-reduce ; g = all-gather ; g* = reduce-scatter<br>in backward: f = all-reduce ; f* = no-op ; g = reduce-scatter ; g* = all-gather<br>SP region needs full hidden_dim</p>

        <p>The diagram shows how we transition between tensor-parallel and sequence-parallel regions using different collective operations (labeled "f" and "g"). The key challenge is managing these transitions efficiently while keeping memory usage low and maintaining correctness.</p>

        <p>In the forward pass:</p>
        <ul>
            <li>"f" is a no-op (no operation) because activations are already duplicated across ranks</li>
            <li>"f*" is an all-reduce to synchronize activations and ensure correctness</li>
        </ul>
        <p>In the backward pass:</p>
        <ul>
            <li>"f*" is a no-op because gradients are already duplicated across ranks</li>
            <li>"f" is an all-reduce to synchronize gradients</li>
        </ul>

        <p>These operations "f" and "f*" are called <em>conjugate</em> pairs because they complement each other - when one is a no-op in forward, the other is an all-reduce in backward, and vice versa.</p>
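        <p>To make this idea concrete, here is a minimal sketch of "f" written as a <code>torch.autograd.Function</code>, in the spirit of Megatron-LM's copy-to-tensor-parallel-region (the class name is ours and the TP process group is assumed to be set up already):</p>

        <d-code block language="python">
import torch
import torch.distributed as dist

class CopyToTPRegion(torch.autograd.Function):
    """Sketch of 'f': no-op in the forward pass, all-reduce of gradients in backward.
    Its conjugate 'f*' does the opposite: all-reduce in forward, no-op in backward."""

    @staticmethod
    def forward(ctx, x, tp_group):
        ctx.tp_group = tp_group
        return x  # activations are already replicated across TP ranks

    @staticmethod
    def backward(ctx, grad_output):
        grad = grad_output.clone()
        # Each rank backpropagates through its own weight shard, so the gradients
        # w.r.t. the (replicated) input must be summed across TP ranks.
        dist.all_reduce(grad, op=dist.ReduceOp.SUM, group=ctx.tp_group)
        return grad, None  # no gradient for the process group argument
        </d-code>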
            
        <p>For sequence parallelism (SP), we use different operations labeled "g" and "g*". Specifically, we avoid using all-reduce in the SP region since that would require gathering the full activations and increase our peak memory usage, defeating the purpose of SP.</p>
        
        <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>

        <p>So what is actually happening here? As a famous LLM would say, let’s take it step-by-step (we’ll turn the "g"/"g*" transitions into a small code sketch right after the walkthrough):</p>

        <p><strong>Initial LayerNorm (SP Region)</strong></p>
        <ul>
            <li>Input tensors X1* and X2* (b,s/2,h) enter LayerNorm, already split across the sequence dimension</li>
            <li>Each GPU computes LayerNorm independently on its sequence chunk, giving Y1* and Y2*</li>
        </ul>
        <p><strong>First Transition (SP → TP)</strong></p>
        <ul>
            <li>"g" operation (all-gather) combines Y1<em> and Y2</em> back to full sequence length</li>
            <li> Restores Y (b,s,h) since column linear layer needs full hidden dimension h</li>
        </ul>
        <p><strong>First Linear Layer (TP Region)</strong></p>
        <ul>
            <li>A1 is a column-linear layer, so it splits Y along the hidden dimension</li>
            <li>GeLU is applied independently on each GPU</li>
            <li>Z1* is (b,s,h/2)</li>
        </ul>
        <p><strong>Second Linear Layer (TP Region)</strong></p>
        <ul>
            <li>B1 is a row-linear layer, so it restores the hidden dimension</li>
            <li>W1 is (b,s,h)</li>
        </ul>
        <p><strong>Final Transition (TP → SP)</strong></p>
        <ul>
            <li>"g*" operation (reduce-scatter) which reduces for previous row-linear correctness while scattering along sequence dimension</li>
            <li>W1* is (b,s/2,h)</li>
        </ul>
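        <p>Putting the "g" and "g*" transitions into code, here is a minimal forward-only sketch built on <code>torch.distributed</code> collectives (an initialized TP process group is assumed; real implementations such as the picotron code linked below wrap these in autograd functions so that the backward pass uses the conjugate collective):</p>

        <d-code block language="python">
import torch
import torch.distributed as dist

def sp_to_tp(x_sp, tp_group):
    """'g': all-gather the sequence shards (b, s/tp, h) into the full (b, s, h)
    activations needed at the entrance of the column-linear layer."""
    x_sp = x_sp.contiguous()
    tp = dist.get_world_size(tp_group)
    shards = [torch.empty_like(x_sp) for _ in range(tp)]
    dist.all_gather(shards, x_sp, group=tp_group)
    return torch.cat(shards, dim=1)                  # back to (b, s, h)

def tp_to_sp(y_partial, tp_group):
    """'g*': reduce-scatter the partial row-linear outputs (b, s, h), summing across
    ranks for correctness while scattering along the sequence dimension."""
    tp = dist.get_world_size(tp_group)
    chunks = [c.contiguous() for c in y_partial.chunk(tp, dim=1)]  # tp x (b, s/tp, h)
    out = torch.empty_like(chunks[0])
    dist.reduce_scatter(out, chunks, op=dist.ReduceOp.SUM, group=tp_group)
    return out                                       # (b, s/tp, h), summed over ranks
        </d-code>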

        <p>A key advantage of sequence parallelism is that it reduces the maximum activation size we need to store. In tensor parallelism alone, we had to store activations of shape (b,s,h) at various points. However, with sequence parallelism, the maximum activation size is reduced to <d-math>\frac{b \cdot s \cdot h}{tp}</d-math> since we always either split along the sequence or hidden dimensions.</p>

        <p>It’s a bit difficult to keep track of all the parts that are sharded differently in TP and TP/SP - believe us, we find it hard to map as well, so we made this small table to summarize how the activation (aka <code>hidden_states</code>) shapes change along the hidden dimension h and the sequence dimension s during a forward pass:</p>

        <table>
            <thead>
              <tr>
                <th>Region</th>
                <th>TP only</th>
                <th>TP with SP</th>
              </tr>
            </thead>
            <tbody>
              <tr>
                <td>Enter TP (Column Linear)</td>
                <td>h: sharded (weight_out is sharded)<br>s: full</td>
                <td>h: sharded (weight_out is sharded)<br>s: <strong>all-gather</strong> to full</td>
              </tr>
              <tr>
                <td>TP Region</td>
                <td>h: sharded<br>s: full</td>
                <td>h: sharded<br>s: full</td>
              </tr>
              <tr>
                <td>Exit TP (Row Linear)</td>
                <td>h: full (weight_out is full + <strong>all-reduce</strong> for correctness)<br>s: full</td>
                <td>h: full (weight_out is full + <strong>reduce-scatter</strong> for correctness)<br>s: <strong>reduce-scatter</strong> to sharded</td>
              </tr>
              <tr>
                <td>SP Region</td>
                <td>h: full<br>s: full</td>
                <td>h: full<br>s: sharded</td>
              </tr>
            </tbody>
           </table>
        
        <p>And for the embedding layer:</p>
        
        <table>
            <thead>
              <tr>
                <th>Region</th>
                <th>Vanilla TP</th>
                <th>TP with SP</th>
              </tr>
            </thead>
            <tbody>
              <tr>
                <td>Embedding Layer (Row Linear sharded on vocab)</td>
                <td>h: full (weight_out is full + <strong>all-reduce</strong> for correctness)<br>s: unchanged</td>
                <td>h: full (weight_out is full + <strong>reduce-scatter</strong> for correctness)<br>s: <strong>reduce-scatter</strong> to sharded</td>
              </tr>
            </tbody>
           </table>
        
        <p>You can find an example implementation of both column- and row-linear TP in picotron:

        <a href="https://github.com/huggingface/picotron/blob/main/picotron/tensor_parallel/tensor_parallel.py">https://github.com/huggingface/picotron/blob/main/picotron/tensor_parallel/tensor_parallel.py</a> </p>

        <p>By using sequence parallelism, we can achieve even greater activation memory savings, allowing us to push our batch size and sequence length further than what would be possible with tensor parallelism alone. Let's see what that means for our previous 70B model example:</p>

        <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>

        <p>Does that mean that SP incurs more communication than TP? Well, yes and no. In the forward pass of vanilla TP we had two all-reduces per transformer block, and in SP we have two all-gathers and two reduce-scatters per transformer block. So SP does twice the number of communication operations as TP. But since an all-reduce operation can be broken down into an all-gather followed by a reduce-scatter (see [TODO: Appendix link]), they’re actually equivalent in terms of communication volume. The same reasoning applies to the backward pass, as we just use the conjugate of each operation (no-op ↔ all-reduce and all-gather ↔ reduce-scatter).</p>
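        <p>For the curious, here is a minimal sketch of this equivalence using the tensor variants of the <code>torch.distributed</code> collectives (available in recent PyTorch versions; an initialized process group is assumed, and the tensor must be contiguous with a first dimension divisible by the group size):</p>

        <d-code block language="python">
import torch
import torch.distributed as dist

def all_reduce_via_rs_ag(x, group):
    """Sketch: an all-reduce is equivalent to a reduce-scatter followed by an all-gather."""
    world_size = dist.get_world_size(group)
    # Reduce-scatter: each rank ends up with the sum of one shard of x...
    shard = x.new_empty(x.shape[0] // world_size, *x.shape[1:])
    dist.reduce_scatter_tensor(shard, x, op=dist.ReduceOp.SUM, group=group)
    # ...all-gather: every rank then collects all reduced shards, which is exactly
    # what dist.all_reduce(x) would have produced in place.
    out = torch.empty_like(x)
    dist.all_gather_into_tensor(out, shard, group=group)
    return out
        </d-code>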

        <p>If you’ve been paying close attention, you’ll notice that we’re talking about 4 comms ops in each layer (2 for Attention and 2 for MLP). This is what the MLP profiling looks like when using Tensor + Sequence Parallelism:</p>

        <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>

        <p>Besides the fact that TP requires communications in each layer, it also can’t easily be overlapped with compute, which makes throughput heavily dependent on the communication bandwidth. This is why TP is usually done only within a node (TP≤8).</p>


        <aside>Overlapping communication with computation for TP is an active area of research, with recent work like Domino <d-cite bibtex-key="wang2024domino"></d-cite> exploring novel techniques to maximize this overlap. For example, Megatron-LM/Nanotron implement a partial overlapping of all-gather with FC1 computation, and we expect to see more innovations in this space as the field continues to evolve.</aside>

        <p>As you might expect, this communication overhead becomes increasingly problematic as we scale up tensor parallelism. To illustrate this, let’s check throughput as we scale TP with SP for a 3B model:</p>

        <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
        <p>Impact of combined Tensor and Sequence Parallelism (TP/SP) on a 3B model’s performance and memory utilization with 4096 seqlen: when scaling both TP and SP together, there's a trade-off between computational efficiency (left) and memory capacity (right). While higher parallelism degrees reduce per-GPU throughput, they enable processing of significantly larger batch sizes by reducing the activation memory.</p>

        <p>Let’s summarize our observations:</p>

        <ul>
            <li>for both methods we notice the biggest performance drop when we move from TP=8 to TP=16, because that’s when we move from communicating only within a single node (NVLink) to communicating between nodes (EFA)</li>
            <li>the memory savings in activations when using TP with SP helps us fit far bigger batches than TP alone</li>
        </ul>

        <p><strong>We have seen how TP helps us shard activations across several GPUs by splitting the attention and feedforward operations along the hidden dimension and how SP is a natural complement for the remaining operations by splitting along the sequence dimension.</strong></p>

        <p>However, there are two limits to TP and SP: 1) if we scale the sequence length the activation memory will still blow up in the TP region and 2) if the model is too big to fit with TP=8 then we will see a massive slow-down due to the inter-node connectivity.</p>

        <aside>Since LayerNorms in the SP region operate on different portions of the sequence, their gradients will differ across TP ranks. To ensure the weights stay synchronized, we need to allreduce their gradients during the backward pass, similar to how DP ensures weights stay in sync. This is a small communication overhead since LayerNorm has relatively few parameters.</aside>

        <p>We can tackle problem 1) with Context parallelism and problem 2) with Pipeline parallelism. Let’s first have a look at Context parallelism!</p>

        <h2>Context Parallelism</h2>
        
        <h3>Introducing Context Parallelism</h3>
        
        <h3>Discovering Ring Attention</h3>
        
        <h3>Zig-Zag Ring Attention – A Balanced Compute Implementation</h3>
        
        <h2>Pipeline Parallelism</h2>
        
        <h3>Splitting layers on various nodes - All forward, all backward</h3>
        
        <h3>One-forward-one-backward and LLama 3.1 schemes</h3>
        
        <h3>Interleaving stages</h3>
        
        <h3>Zero Bubble and DualPipe</h3>
        
        <h2>Expert parallelism</h2>
        
        <h2>5D parallelism in a nutshell</h2>
        
        <h2>How to Find the Best Training Configuration</h2>
        
        <h2>Diving in the GPUs – fusing, threading, mixing</h2>
        
        <h4>A primer on GPU</h4>
        
        <h3>How to improve performance with Kernels ?</h3>
        
        <h4>Memory Coalescing</h4>
        
        <h4>Tiling</h4>
        
        <h4>Thread Coarsening</h4>
        
        <h4>Minimizing Control Divergence</h4>
        
        <h3>Flash Attention 1-3</h3>
        
        <h3>Fused Kernels</h3>
        
        <h3>Mixed Precision Training</h3>
        
        <h4>FP16 and BF16 training</h4>
        
        <h4>FP8 pretraining</h4>
        
        <h2>Conclusion</h2>
        
        <h3>What you learned</h3>
        
        <h3>What we learned</h3>
        
        <h3>What’s next?</h3>
        
        <h2>References</h2>
        
        <h3>Landmark LLM Scaling Papers</h3>
        
        <h3>Training Frameworks</h3>
        
        <h3>Debugging</h3>
        
        <h3>Distribution Techniques</h3>
        
        <h3>CUDA Kernels</h3>
        
        <h3>Hardware</h3>
        
        <h3>Others</h3>
        
        <h2>Appendix</h2>

    </d-article>

    <d-appendix>
        <d-bibliography src="bibliography.bib"></d-bibliography>
        <style>
            d-appendix .citation {
                font-size: 11px;
                line-height: 15px;
                border-left: 1px solid rgba(0, 0, 0, 0.1);
                padding-left: 18px;
                border: 1px solid rgba(0, 0, 0, 0.1);
                background: rgba(0, 0, 0, 0.02);
                padding: 10px 18px;
                border-radius: 3px;
                color: rgba(150, 150, 150, 1);
                overflow: hidden;
                margin-top: -12px;
                white-space: pre-wrap;
                word-wrap: break-word;
            }
        </style>

        <h3 id="citation">Citation</h3>
        <p>For attribution in academic contexts, please cite this work as</p>
        <pre
            class="citation short">XXX, et al., "The Ultra-Scale Playbook: Training LLMs on GPU Clusterse", 2025.</pre>
        <p>BibTeX citation</p>
        <pre class="citation long">@misc{TODO,
      title={The Ultra-Scale Playbook: Training LLMs on GPU Clusters},
      author={TODO},
      year={2025},
}</pre>
    </d-appendix>

    <script>
        const article = document.querySelector('d-article');
        const toc = document.querySelector('d-contents');
        if (toc) {
            const headings = article.querySelectorAll('h2, h3, h4');
            let ToC = `<nav role="navigation" class="l-text figcaption"><h3>Table of contents</h3>`;
            let prevLevel = 0;

            for (const el of headings) {
                // should element be included in TOC?
                const isInTitle = el.parentElement.tagName == 'D-TITLE';
                const isException = el.getAttribute('no-toc');
                if (isInTitle || isException) continue;
                el.setAttribute('id', el.textContent.toLowerCase().replaceAll(" ", "_"))
                const link = '<a target="_self" href="' + '#' + el.getAttribute('id') + '">' + el.textContent + '</a>';

                const level = el.tagName === 'H2' ? 0 : (el.tagName === 'H3' ? 1 : 2);
                while (prevLevel < level) {
                    ToC += '<ul>'
                    prevLevel++;
                }
                while (prevLevel > level) {
                    ToC += '</ul>'
                    prevLevel--;
                }
                if (level === 0)
                    ToC += '<div>' + link + '</div>';
                else
                    ToC += '<li>' + link + '</li>';
            }

            while (prevLevel > 0) {
                ToC += '</ul>'
                prevLevel--;
            }
            ToC += '</nav>';
            toc.innerHTML = ToC;
            toc.setAttribute('prerendered', 'true');
            const toc_links = document.querySelectorAll('d-contents > nav a');

            window.addEventListener('scroll', (_event) => {
                if (typeof (headings) != 'undefined' && headings != null && typeof (toc_links) != 'undefined' && toc_links != null) {
                    // Then iterate forwards, on the first match highlight it and break
                    find_active: {
                        for (let i = headings.length - 1; i >= 0; i--) {
                            if (headings[i].getBoundingClientRect().top - 50 <= 0) {
                                if (!toc_links[i].classList.contains("active")) {
                                    toc_links.forEach((link, _index) => {
                                        link.classList.remove("active");
                                    });
                                    toc_links[i].classList.add('active');
                                }
                                break find_active;
                            }
                        }
                        toc_links.forEach((link, _index) => {
                            link.classList.remove("active");
                        });
                    }
                }
            });
        }
    </script>

</body>

</html>