lvwerra HF staff commited on
Commit
d0f79ee
·
1 Parent(s): c36597e

fix figures mostly

Browse files
assets/data/fragments/banner.html ADDED
The diff for this file is too large to render. See raw diff
 
assets/data/fragments/benchmarks_interactive.html ADDED
The diff for this file is too large to render. See raw diff
 
assets/data/fragments/memory-profile.html ADDED
The diff for this file is too large to render. See raw diff
 
assets/data/fragments/memory-recomputation.html ADDED
@@ -0,0 +1 @@
 
 
1
+ <div> <div id="3a29a7b6-5a6e-4e51-aabe-515dafefbd63" class="plotly-graph-div" style="height:500px; width:850px;"></div> <script type="text/javascript"> window.PLOTLYENV=window.PLOTLYENV || {}; if (document.getElementById("3a29a7b6-5a6e-4e51-aabe-515dafefbd63")) { Plotly.newPlot( "3a29a7b6-5a6e-4e51-aabe-515dafefbd63", [{"hovertemplate":"Seq len=%{x}\u003cbr\u003eMem=%{y:.1f}GB\u003cbr\u003e%{data.name}\u003cextra\u003e\u003c\u002fextra\u003e","marker":{"color":"rgb(78, 165, 183)"},"name":"parameters","showlegend":true,"visible":true,"x":["1024","2048","4096","8192"],"y":{"dtype":"f8","bdata":"AAAAAEjnD0AAAAAASPcPQAAAAACkCxBAAAAAAKQrEEA="},"type":"bar"},{"hovertemplate":"Seq len=%{x}\u003cbr\u003eMem=%{y:.1f}GB\u003cbr\u003e%{data.name}\u003cextra\u003e\u003c\u002fextra\u003e","marker":{"color":"rgb(227, 138, 66)"},"name":"gradients","showlegend":true,"visible":true,"x":["1024","2048","4096","8192"],"y":{"dtype":"f8","bdata":"AAAAAEjnD0AAAAAASPcPQAAAAACkCxBAAAAAAKQrEEA="},"type":"bar"},{"hovertemplate":"Seq len=%{x}\u003cbr\u003eMem=%{y:.1f}GB\u003cbr\u003e%{data.name}\u003cextra\u003e\u003c\u002fextra\u003e","marker":{"color":"rgb(232, 137, 171)"},"name":"optimizer","showlegend":true,"visible":true,"x":["1024","2048","4096","8192"],"y":{"dtype":"f8","bdata":"AAAAAEjnH0AAAAAASPcfQAAAAACkCyBAAAAAAKQrIEA="},"type":"bar"},{"hovertemplate":"Seq len=%{x}\u003cbr\u003eMem=%{y:.1f}GB\u003cbr\u003e%{data.name}\u003cextra\u003e\u003c\u002fextra\u003e","marker":{"color":"rgb(206, 192, 250)"},"name":"activations","showlegend":true,"visible":true,"x":["1024","2048","4096","8192"],"y":{"dtype":"f8","bdata":"AAAAAACEDEAAAAAAAEIoQAAAAAAAIUZAAAAAAIAQZUA="},"type":"bar"},{"hovertemplate":"Seq len=%{x}\u003cbr\u003eMem=%{y:.1f}GB\u003cbr\u003e%{data.name}\u003cextra\u003e\u003c\u002fextra\u003e","marker":{"color":"rgb(78, 165, 183)"},"name":"parameters","showlegend":true,"visible":false,"x":["1024","2048","4096","8192"],"y":{"dtype":"f8","bdata":"AAAAAKWXKkAAAAAApZ0qQAAAAAClqSpAAAAAAKXBKkA="},"type":"bar"},{"hovertemplate":"Seq len=%{x}\u003cbr\u003eMem=%{y:.1f}GB\u003cbr\u003e%{data.name}\u003cextra\u003e\u003c\u002fextra\u003e","marker":{"color":"rgb(227, 138, 66)"},"name":"gradients","showlegend":true,"visible":false,"x":["1024","2048","4096","8192"],"y":{"dtype":"f8","bdata":"AAAAAKWXKkAAAAAApZ0qQAAAAAClqSpAAAAAAKXBKkA="},"type":"bar"},{"hovertemplate":"Seq len=%{x}\u003cbr\u003eMem=%{y:.1f}GB\u003cbr\u003e%{data.name}\u003cextra\u003e\u003c\u002fextra\u003e","marker":{"color":"rgb(232, 137, 171)"},"name":"optimizer","showlegend":true,"visible":false,"x":["1024","2048","4096","8192"],"y":{"dtype":"f8","bdata":"AAAAAKWXOkAAAAAApZ06QAAAAAClqTpAAAAAAKXBOkA="},"type":"bar"},{"hovertemplate":"Seq len=%{x}\u003cbr\u003eMem=%{y:.1f}GB\u003cbr\u003e%{data.name}\u003cextra\u003e\u003c\u002fextra\u003e","marker":{"color":"rgb(206, 192, 250)"},"name":"activations","showlegend":true,"visible":false,"x":["1024","2048","4096","8192"],"y":{"dtype":"f8","bdata":"AAAAAABLGEAAAAAAgLUyQAAAAACA1U9AAAAAAMAKbUA="},"type":"bar"},{"hovertemplate":"Seq len=%{x}\u003cbr\u003eMem=%{y:.1f}GB\u003cbr\u003e%{data.name}\u003cextra\u003e\u003c\u002fextra\u003e","marker":{"color":"rgb(78, 165, 183)"},"name":"parameters","showlegend":true,"visible":false,"x":["1024","2048","4096","8192"],"y":{"dtype":"f8","bdata":"AAAAAKL6OUAAAAAAov45QAAAAACiBjpAAAAAAKIWOkA="},"type":"bar"},{"hovertemplate":"Seq len=%{x}\u003cbr\u003eMem=%{y:.1f}GB\u003cbr\u003e%{data.name}\u003cextra\u003e\u003c\u002fextra\u003e","marker":{"color":"rgb(227, 138, 66)"},"name":"gradients","showlegend":true,"visible":false,"x":["1024","2048","4096","8192"],"y":{"dtype":"f8","bdata":"AAAAAKL6OUAAAAAAov45QAAAAACiBjpAAAAAAKIWOkA="},"type":"bar"},{"hovertemplate":"Seq len=%{x}\u003cbr\u003eMem=%{y:.1f}GB\u003cbr\u003e%{data.name}\u003cextra\u003e\u003c\u002fextra\u003e","marker":{"color":"rgb(232, 137, 171)"},"name":"optimizer","showlegend":true,"visible":false,"x":["1024","2048","4096","8192"],"y":{"dtype":"f8","bdata":"AAAAAKL6SUAAAAAAov5JQAAAAACiBkpAAAAAAKIWSkA="},"type":"bar"},{"hovertemplate":"Seq len=%{x}\u003cbr\u003eMem=%{y:.1f}GB\u003cbr\u003e%{data.name}\u003cextra\u003e\u003c\u002fextra\u003e","marker":{"color":"rgb(206, 192, 250)"},"name":"activations","showlegend":true,"visible":false,"x":["1024","2048","4096","8192"],"y":{"dtype":"f8","bdata":"AAAAAACCIkAAAAAAAII8QAAAAAAAQVhAAAAAAIAgdkA="},"type":"bar"},{"hovertemplate":"Seq len=%{x}\u003cbr\u003eMem=%{y:.1f}GB\u003cbr\u003e%{data.name}\u003cextra\u003e\u003c\u002fextra\u003e","marker":{"color":"rgb(78, 165, 183)"},"name":"parameters","showlegend":true,"visible":false,"x":["1024","2048","4096","8192"],"y":{"dtype":"f8","bdata":"AAAAgER\u002fbkAAAACARIBuQAAAAIBEgm5AAAAAgESGbkA="},"type":"bar"},{"hovertemplate":"Seq len=%{x}\u003cbr\u003eMem=%{y:.1f}GB\u003cbr\u003e%{data.name}\u003cextra\u003e\u003c\u002fextra\u003e","marker":{"color":"rgb(227, 138, 66)"},"name":"gradients","showlegend":true,"visible":false,"x":["1024","2048","4096","8192"],"y":{"dtype":"f8","bdata":"AAAAgER\u002fbkAAAACARIBuQAAAAIBEgm5AAAAAgESGbkA="},"type":"bar"},{"hovertemplate":"Seq len=%{x}\u003cbr\u003eMem=%{y:.1f}GB\u003cbr\u003e%{data.name}\u003cextra\u003e\u003c\u002fextra\u003e","marker":{"color":"rgb(232, 137, 171)"},"name":"optimizer","showlegend":true,"visible":false,"x":["1024","2048","4096","8192"],"y":{"dtype":"f8","bdata":"AAAAgER\u002ffkAAAACARIB+QAAAAIBEgn5AAAAAgESGfkA="},"type":"bar"},{"hovertemplate":"Seq len=%{x}\u003cbr\u003eMem=%{y:.1f}GB\u003cbr\u003e%{data.name}\u003cextra\u003e\u003c\u002fextra\u003e","marker":{"color":"rgb(206, 192, 250)"},"name":"activations","showlegend":true,"visible":false,"x":["1024","2048","4096","8192"],"y":{"dtype":"f8","bdata":"AAAAAAAhR0AAAAAAgNBhQAAAAACAUH5AAAAAAECom0A="},"type":"bar"},{"hovertemplate":"Seq len=%{x}\u003cbr\u003eMem=%{y:.1f}GB\u003cbr\u003e%{data.name}\u003cextra\u003e\u003c\u002fextra\u003e","marker":{"color":"rgb(78, 165, 183)"},"name":"parameters","showlegend":true,"visible":false,"x":["1024","2048","4096","8192"],"y":{"dtype":"f8","bdata":"AAAAgPa\u002fl0AAAACANsCXQAAAAIC2wJdAAAAAgLbBl0A="},"type":"bar"},{"hovertemplate":"Seq len=%{x}\u003cbr\u003eMem=%{y:.1f}GB\u003cbr\u003e%{data.name}\u003cextra\u003e\u003c\u002fextra\u003e","marker":{"color":"rgb(227, 138, 66)"},"name":"gradients","showlegend":true,"visible":false,"x":["1024","2048","4096","8192"],"y":{"dtype":"f8","bdata":"AAAAgPa\u002fl0AAAACANsCXQAAAAIC2wJdAAAAAgLbBl0A="},"type":"bar"},{"hovertemplate":"Seq len=%{x}\u003cbr\u003eMem=%{y:.1f}GB\u003cbr\u003e%{data.name}\u003cextra\u003e\u003c\u002fextra\u003e","marker":{"color":"rgb(232, 137, 171)"},"name":"optimizer","showlegend":true,"visible":false,"x":["1024","2048","4096","8192"],"y":{"dtype":"f8","bdata":"AAAAgPa\u002fp0AAAACANsCnQAAAAIC2wKdAAAAAgLbBp0A="},"type":"bar"},{"hovertemplate":"Seq len=%{x}\u003cbr\u003eMem=%{y:.1f}GB\u003cbr\u003e%{data.name}\u003cextra\u003e\u003c\u002fextra\u003e","marker":{"color":"rgb(206, 192, 250)"},"name":"activations","showlegend":true,"visible":false,"x":["1024","2048","4096","8192"],"y":{"dtype":"f8","bdata":"AAAAAIA2YkAAAAAAgA58QAAAAABA35dAAAAAAKDHtUA="},"type":"bar"}], {"template":{"data":{"barpolar":[{"marker":{"line":{"color":"white","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"barpolar"}],"bar":[{"error_x":{"color":"#2a3f5f"},"error_y":{"color":"#2a3f5f"},"marker":{"line":{"color":"white","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"bar"}],"carpet":[{"aaxis":{"endlinecolor":"#2a3f5f","gridcolor":"#C8D4E3","linecolor":"#C8D4E3","minorgridcolor":"#C8D4E3","startlinecolor":"#2a3f5f"},"baxis":{"endlinecolor":"#2a3f5f","gridcolor":"#C8D4E3","linecolor":"#C8D4E3","minorgridcolor":"#C8D4E3","startlinecolor":"#2a3f5f"},"type":"carpet"}],"choropleth":[{"colorbar":{"outlinewidth":0,"ticks":""},"type":"choropleth"}],"contourcarpet":[{"colorbar":{"outlinewidth":0,"ticks":""},"type":"contourcarpet"}],"contour":[{"colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"type":"contour"}],"heatmap":[{"colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"type":"heatmap"}],"histogram2dcontour":[{"colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"type":"histogram2dcontour"}],"histogram2d":[{"colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"type":"histogram2d"}],"histogram":[{"marker":{"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"histogram"}],"mesh3d":[{"colorbar":{"outlinewidth":0,"ticks":""},"type":"mesh3d"}],"parcoords":[{"line":{"colorbar":{"outlinewidth":0,"ticks":""}},"type":"parcoords"}],"pie":[{"automargin":true,"type":"pie"}],"scatter3d":[{"line":{"colorbar":{"outlinewidth":0,"ticks":""}},"marker":{"colorbar":{"outlinewidth":0,"ticks":""}},"type":"scatter3d"}],"scattercarpet":[{"marker":{"colorbar":{"outlinewidth":0,"ticks":""}},"type":"scattercarpet"}],"scattergeo":[{"marker":{"colorbar":{"outlinewidth":0,"ticks":""}},"type":"scattergeo"}],"scattergl":[{"marker":{"colorbar":{"outlinewidth":0,"ticks":""}},"type":"scattergl"}],"scattermapbox":[{"marker":{"colorbar":{"outlinewidth":0,"ticks":""}},"type":"scattermapbox"}],"scattermap":[{"marker":{"colorbar":{"outlinewidth":0,"ticks":""}},"type":"scattermap"}],"scatterpolargl":[{"marker":{"colorbar":{"outlinewidth":0,"ticks":""}},"type":"scatterpolargl"}],"scatterpolar":[{"marker":{"colorbar":{"outlinewidth":0,"ticks":""}},"type":"scatterpolar"}],"scatter":[{"fillpattern":{"fillmode":"overlay","size":10,"solidity":0.2},"type":"scatter"}],"scatterternary":[{"marker":{"colorbar":{"outlinewidth":0,"ticks":""}},"type":"scatterternary"}],"surface":[{"colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"type":"surface"}],"table":[{"cells":{"fill":{"color":"#EBF0F8"},"line":{"color":"white"}},"header":{"fill":{"color":"#C8D4E3"},"line":{"color":"white"}},"type":"table"}]},"layout":{"annotationdefaults":{"arrowcolor":"#2a3f5f","arrowhead":0,"arrowwidth":1},"autotypenumbers":"strict","coloraxis":{"colorbar":{"outlinewidth":0,"ticks":""}},"colorscale":{"diverging":[[0,"#8e0152"],[0.1,"#c51b7d"],[0.2,"#de77ae"],[0.3,"#f1b6da"],[0.4,"#fde0ef"],[0.5,"#f7f7f7"],[0.6,"#e6f5d0"],[0.7,"#b8e186"],[0.8,"#7fbc41"],[0.9,"#4d9221"],[1,"#276419"]],"sequential":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"sequentialminus":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]},"colorway":["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],"font":{"color":"#2a3f5f"},"geo":{"bgcolor":"white","lakecolor":"white","landcolor":"white","showlakes":true,"showland":true,"subunitcolor":"#C8D4E3"},"hoverlabel":{"align":"left"},"hovermode":"closest","mapbox":{"style":"light"},"paper_bgcolor":"white","plot_bgcolor":"white","polar":{"angularaxis":{"gridcolor":"#EBF0F8","linecolor":"#EBF0F8","ticks":""},"bgcolor":"white","radialaxis":{"gridcolor":"#EBF0F8","linecolor":"#EBF0F8","ticks":""}},"scene":{"xaxis":{"backgroundcolor":"white","gridcolor":"#DFE8F3","gridwidth":2,"linecolor":"#EBF0F8","showbackground":true,"ticks":"","zerolinecolor":"#EBF0F8"},"yaxis":{"backgroundcolor":"white","gridcolor":"#DFE8F3","gridwidth":2,"linecolor":"#EBF0F8","showbackground":true,"ticks":"","zerolinecolor":"#EBF0F8"},"zaxis":{"backgroundcolor":"white","gridcolor":"#DFE8F3","gridwidth":2,"linecolor":"#EBF0F8","showbackground":true,"ticks":"","zerolinecolor":"#EBF0F8"}},"shapedefaults":{"line":{"color":"#2a3f5f"}},"ternary":{"aaxis":{"gridcolor":"#DFE8F3","linecolor":"#A2B1C6","ticks":""},"baxis":{"gridcolor":"#DFE8F3","linecolor":"#A2B1C6","ticks":""},"bgcolor":"white","caxis":{"gridcolor":"#DFE8F3","linecolor":"#A2B1C6","ticks":""}},"title":{"x":0.05},"xaxis":{"automargin":true,"gridcolor":"#EBF0F8","linecolor":"#EBF0F8","ticks":"","title":{"standoff":15},"zerolinecolor":"#EBF0F8","zerolinewidth":2},"yaxis":{"automargin":true,"gridcolor":"#EBF0F8","linecolor":"#EBF0F8","ticks":"","title":{"standoff":15},"zerolinecolor":"#EBF0F8","zerolinewidth":2}}},"title":{"text":"Memory Usage with Recomputation","x":0.5,"y":0.95},"margin":{"r":80},"legend":{"y":0.95,"x":1.02,"xanchor":"left","yanchor":"top"},"updatemenus":[{"active":0,"buttons":[{"args":[{"visible":[true,true,true,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false]},{"yaxis.range":[0,203.1547058105469]}],"label":"1B","method":"update"},{"args":[{"visible":[false,false,false,false,true,true,true,true,false,false,false,false,false,false,false,false,false,false,false,false]},{"yaxis.range":[0,314.4336639404297]}],"label":"3B","method":"update"},{"args":[{"visible":[false,false,false,false,false,false,false,false,true,true,true,true,false,false,false,false,false,false,false,false]},{"yaxis.range":[0,504.2233764648438]}],"label":"8B","method":"update"},{"args":[{"visible":[false,false,false,false,false,false,false,false,false,false,false,false,true,true,true,true,false,false,false,false]},{"yaxis.range":[0,3021.530541992188]}],"label":"70B","method":"update"},{"args":[{"visible":[false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,true,true,true]},{"yaxis.range":[0,12823.0716796875]}],"label":"405B","method":"update"}],"direction":"down","showactive":true,"x":1.035,"xanchor":"left","y":0.6,"yanchor":"top"},{"active":0,"buttons":[{"args":[{"y":[[3.9879302978515625,3.9957427978515625,4.0113677978515625,4.0426177978515625],[3.9879302978515625,3.9957427978515625,4.0113677978515625,4.0426177978515625],[7.975860595703125,7.991485595703125,8.022735595703125,8.085235595703125],[3.564453125,12.12890625,44.2578125,168.515625],[13.296180725097656,13.307899475097656,13.331336975097656,13.378211975097656],[13.296180725097656,13.307899475097656,13.331336975097656,13.378211975097656],[26.592361450195312,26.615798950195312,26.662673950195312,26.756423950195312],[6.0732421875,18.708984375,63.66796875,232.3359375],[25.979034423828125,25.994659423828125,26.025909423828125,26.088409423828125],[25.979034423828125,25.994659423828125,26.025909423828125,26.088409423828125],[51.95806884765625,51.98931884765625,52.05181884765625,52.17681884765625],[9.25390625,28.5078125,97.015625,354.03125],[243.97711181640625,244.00836181640625,244.07086181640625,244.19586181640625],[243.97711181640625,244.00836181640625,244.07086181640625,244.19586181640625],[487.9542236328125,488.0167236328125,488.1417236328125,488.3917236328125],[46.2578125,142.515625,485.03125,1770.0625],[1519.99072265625,1520.05322265625,1520.17822265625,1520.42822265625],[1519.99072265625,1520.05322265625,1520.17822265625,1520.42822265625],[3039.9814453125,3040.1064453125,3040.3564453125,3040.8564453125],[145.703125,448.90625,1527.8125,5575.625]]}],"label":"None","method":"restyle"},{"args":[{"y":[[3.9879302978515625,3.9957427978515625,4.0113677978515625,4.0426177978515625],[3.9879302978515625,3.9957427978515625,4.0113677978515625,4.0426177978515625],[7.975860595703125,7.991485595703125,8.022735595703125,8.085235595703125],[1.064453125,2.12890625,4.2578125,8.515625],[13.296180725097656,13.307899475097656,13.331336975097656,13.378211975097656],[13.296180725097656,13.307899475097656,13.331336975097656,13.378211975097656],[26.592361450195312,26.615798950195312,26.662673950195312,26.756423950195312],[2.7919921875,5.583984375,11.16796875,22.3359375],[25.979034423828125,25.994659423828125,26.025909423828125,26.088409423828125],[25.979034423828125,25.994659423828125,26.025909423828125,26.088409423828125],[51.95806884765625,51.98931884765625,52.05181884765625,52.17681884765625],[4.25390625,8.5078125,17.015625,34.03125],[243.97711181640625,244.00836181640625,244.07086181640625,244.19586181640625],[243.97711181640625,244.00836181640625,244.07086181640625,244.19586181640625],[487.9542236328125,488.0167236328125,488.1417236328125,488.3917236328125],[21.2578125,42.515625,85.03125,170.0625],[1519.99072265625,1520.05322265625,1520.17822265625,1520.42822265625],[1519.99072265625,1520.05322265625,1520.17822265625,1520.42822265625],[3039.9814453125,3040.1064453125,3040.3564453125,3040.8564453125],[66.953125,133.90625,267.8125,535.625]]}],"label":"selective","method":"restyle"},{"args":[{"y":[[3.9879302978515625,3.9957427978515625,4.0113677978515625,4.0426177978515625],[3.9879302978515625,3.9957427978515625,4.0113677978515625,4.0426177978515625],[7.975860595703125,7.991485595703125,8.022735595703125,8.085235595703125],[0.064453125,0.12890625,0.2578125,0.515625],[13.296180725097656,13.307899475097656,13.331336975097656,13.378211975097656],[13.296180725097656,13.307899475097656,13.331336975097656,13.378211975097656],[26.592361450195312,26.615798950195312,26.662673950195312,26.756423950195312],[0.1669921875,0.333984375,0.66796875,1.3359375],[25.979034423828125,25.994659423828125,26.025909423828125,26.088409423828125],[25.979034423828125,25.994659423828125,26.025909423828125,26.088409423828125],[51.95806884765625,51.98931884765625,52.05181884765625,52.17681884765625],[0.25390625,0.5078125,1.015625,2.03125],[243.97711181640625,244.00836181640625,244.07086181640625,244.19586181640625],[243.97711181640625,244.00836181640625,244.07086181640625,244.19586181640625],[487.9542236328125,488.0167236328125,488.1417236328125,488.3917236328125],[1.2578125,2.515625,5.03125,10.0625],[1519.99072265625,1520.05322265625,1520.17822265625,1520.42822265625],[1519.99072265625,1520.05322265625,1520.17822265625,1520.42822265625],[3039.9814453125,3040.1064453125,3040.3564453125,3040.8564453125],[3.953125,7.90625,15.8125,31.625]]}],"label":"full","method":"restyle"}],"direction":"down","showactive":true,"x":1.035,"xanchor":"left","y":0.4,"yanchor":"top"}],"barmode":"stack","xaxis":{"title":{"text":"Sequence Length"}},"yaxis":{"title":{"text":"Memory (GB)"},"range":[0,203.1547058105469]},"width":850,"height":500,"annotations":[{"showarrow":false,"text":"Model Size:","x":1.035,"xanchor":"left","xref":"paper","y":0.6,"yanchor":"bottom","yref":"paper"},{"showarrow":false,"text":"Recomputation:","x":1.035,"xanchor":"left","xref":"paper","y":0.4,"yanchor":"bottom","yref":"paper"}]}, {"responsive": true, "scrollZoom": false} ) }; </script> </div>
assets/images/5d_full.svg ADDED
dist/assets/data/fragments/banner.html ADDED
The diff for this file is too large to render. See raw diff
 
dist/assets/data/fragments/benchmarks_interactive.html ADDED
The diff for this file is too large to render. See raw diff
 
dist/assets/data/fragments/memory-profile.html ADDED
The diff for this file is too large to render. See raw diff
 
dist/assets/data/fragments/memory-recomputation.html ADDED
@@ -0,0 +1 @@
 
 
1
+ <div> <div id="3a29a7b6-5a6e-4e51-aabe-515dafefbd63" class="plotly-graph-div" style="height:500px; width:850px;"></div> <script type="text/javascript"> window.PLOTLYENV=window.PLOTLYENV || {}; if (document.getElementById("3a29a7b6-5a6e-4e51-aabe-515dafefbd63")) { Plotly.newPlot( "3a29a7b6-5a6e-4e51-aabe-515dafefbd63", [{"hovertemplate":"Seq len=%{x}\u003cbr\u003eMem=%{y:.1f}GB\u003cbr\u003e%{data.name}\u003cextra\u003e\u003c\u002fextra\u003e","marker":{"color":"rgb(78, 165, 183)"},"name":"parameters","showlegend":true,"visible":true,"x":["1024","2048","4096","8192"],"y":{"dtype":"f8","bdata":"AAAAAEjnD0AAAAAASPcPQAAAAACkCxBAAAAAAKQrEEA="},"type":"bar"},{"hovertemplate":"Seq len=%{x}\u003cbr\u003eMem=%{y:.1f}GB\u003cbr\u003e%{data.name}\u003cextra\u003e\u003c\u002fextra\u003e","marker":{"color":"rgb(227, 138, 66)"},"name":"gradients","showlegend":true,"visible":true,"x":["1024","2048","4096","8192"],"y":{"dtype":"f8","bdata":"AAAAAEjnD0AAAAAASPcPQAAAAACkCxBAAAAAAKQrEEA="},"type":"bar"},{"hovertemplate":"Seq len=%{x}\u003cbr\u003eMem=%{y:.1f}GB\u003cbr\u003e%{data.name}\u003cextra\u003e\u003c\u002fextra\u003e","marker":{"color":"rgb(232, 137, 171)"},"name":"optimizer","showlegend":true,"visible":true,"x":["1024","2048","4096","8192"],"y":{"dtype":"f8","bdata":"AAAAAEjnH0AAAAAASPcfQAAAAACkCyBAAAAAAKQrIEA="},"type":"bar"},{"hovertemplate":"Seq len=%{x}\u003cbr\u003eMem=%{y:.1f}GB\u003cbr\u003e%{data.name}\u003cextra\u003e\u003c\u002fextra\u003e","marker":{"color":"rgb(206, 192, 250)"},"name":"activations","showlegend":true,"visible":true,"x":["1024","2048","4096","8192"],"y":{"dtype":"f8","bdata":"AAAAAACEDEAAAAAAAEIoQAAAAAAAIUZAAAAAAIAQZUA="},"type":"bar"},{"hovertemplate":"Seq len=%{x}\u003cbr\u003eMem=%{y:.1f}GB\u003cbr\u003e%{data.name}\u003cextra\u003e\u003c\u002fextra\u003e","marker":{"color":"rgb(78, 165, 183)"},"name":"parameters","showlegend":true,"visible":false,"x":["1024","2048","4096","8192"],"y":{"dtype":"f8","bdata":"AAAAAKWXKkAAAAAApZ0qQAAAAAClqSpAAAAAAKXBKkA="},"type":"bar"},{"hovertemplate":"Seq len=%{x}\u003cbr\u003eMem=%{y:.1f}GB\u003cbr\u003e%{data.name}\u003cextra\u003e\u003c\u002fextra\u003e","marker":{"color":"rgb(227, 138, 66)"},"name":"gradients","showlegend":true,"visible":false,"x":["1024","2048","4096","8192"],"y":{"dtype":"f8","bdata":"AAAAAKWXKkAAAAAApZ0qQAAAAAClqSpAAAAAAKXBKkA="},"type":"bar"},{"hovertemplate":"Seq len=%{x}\u003cbr\u003eMem=%{y:.1f}GB\u003cbr\u003e%{data.name}\u003cextra\u003e\u003c\u002fextra\u003e","marker":{"color":"rgb(232, 137, 171)"},"name":"optimizer","showlegend":true,"visible":false,"x":["1024","2048","4096","8192"],"y":{"dtype":"f8","bdata":"AAAAAKWXOkAAAAAApZ06QAAAAAClqTpAAAAAAKXBOkA="},"type":"bar"},{"hovertemplate":"Seq len=%{x}\u003cbr\u003eMem=%{y:.1f}GB\u003cbr\u003e%{data.name}\u003cextra\u003e\u003c\u002fextra\u003e","marker":{"color":"rgb(206, 192, 250)"},"name":"activations","showlegend":true,"visible":false,"x":["1024","2048","4096","8192"],"y":{"dtype":"f8","bdata":"AAAAAABLGEAAAAAAgLUyQAAAAACA1U9AAAAAAMAKbUA="},"type":"bar"},{"hovertemplate":"Seq len=%{x}\u003cbr\u003eMem=%{y:.1f}GB\u003cbr\u003e%{data.name}\u003cextra\u003e\u003c\u002fextra\u003e","marker":{"color":"rgb(78, 165, 183)"},"name":"parameters","showlegend":true,"visible":false,"x":["1024","2048","4096","8192"],"y":{"dtype":"f8","bdata":"AAAAAKL6OUAAAAAAov45QAAAAACiBjpAAAAAAKIWOkA="},"type":"bar"},{"hovertemplate":"Seq len=%{x}\u003cbr\u003eMem=%{y:.1f}GB\u003cbr\u003e%{data.name}\u003cextra\u003e\u003c\u002fextra\u003e","marker":{"color":"rgb(227, 138, 66)"},"name":"gradients","showlegend":true,"visible":false,"x":["1024","2048","4096","8192"],"y":{"dtype":"f8","bdata":"AAAAAKL6OUAAAAAAov45QAAAAACiBjpAAAAAAKIWOkA="},"type":"bar"},{"hovertemplate":"Seq len=%{x}\u003cbr\u003eMem=%{y:.1f}GB\u003cbr\u003e%{data.name}\u003cextra\u003e\u003c\u002fextra\u003e","marker":{"color":"rgb(232, 137, 171)"},"name":"optimizer","showlegend":true,"visible":false,"x":["1024","2048","4096","8192"],"y":{"dtype":"f8","bdata":"AAAAAKL6SUAAAAAAov5JQAAAAACiBkpAAAAAAKIWSkA="},"type":"bar"},{"hovertemplate":"Seq len=%{x}\u003cbr\u003eMem=%{y:.1f}GB\u003cbr\u003e%{data.name}\u003cextra\u003e\u003c\u002fextra\u003e","marker":{"color":"rgb(206, 192, 250)"},"name":"activations","showlegend":true,"visible":false,"x":["1024","2048","4096","8192"],"y":{"dtype":"f8","bdata":"AAAAAACCIkAAAAAAAII8QAAAAAAAQVhAAAAAAIAgdkA="},"type":"bar"},{"hovertemplate":"Seq len=%{x}\u003cbr\u003eMem=%{y:.1f}GB\u003cbr\u003e%{data.name}\u003cextra\u003e\u003c\u002fextra\u003e","marker":{"color":"rgb(78, 165, 183)"},"name":"parameters","showlegend":true,"visible":false,"x":["1024","2048","4096","8192"],"y":{"dtype":"f8","bdata":"AAAAgER\u002fbkAAAACARIBuQAAAAIBEgm5AAAAAgESGbkA="},"type":"bar"},{"hovertemplate":"Seq len=%{x}\u003cbr\u003eMem=%{y:.1f}GB\u003cbr\u003e%{data.name}\u003cextra\u003e\u003c\u002fextra\u003e","marker":{"color":"rgb(227, 138, 66)"},"name":"gradients","showlegend":true,"visible":false,"x":["1024","2048","4096","8192"],"y":{"dtype":"f8","bdata":"AAAAgER\u002fbkAAAACARIBuQAAAAIBEgm5AAAAAgESGbkA="},"type":"bar"},{"hovertemplate":"Seq len=%{x}\u003cbr\u003eMem=%{y:.1f}GB\u003cbr\u003e%{data.name}\u003cextra\u003e\u003c\u002fextra\u003e","marker":{"color":"rgb(232, 137, 171)"},"name":"optimizer","showlegend":true,"visible":false,"x":["1024","2048","4096","8192"],"y":{"dtype":"f8","bdata":"AAAAgER\u002ffkAAAACARIB+QAAAAIBEgn5AAAAAgESGfkA="},"type":"bar"},{"hovertemplate":"Seq len=%{x}\u003cbr\u003eMem=%{y:.1f}GB\u003cbr\u003e%{data.name}\u003cextra\u003e\u003c\u002fextra\u003e","marker":{"color":"rgb(206, 192, 250)"},"name":"activations","showlegend":true,"visible":false,"x":["1024","2048","4096","8192"],"y":{"dtype":"f8","bdata":"AAAAAAAhR0AAAAAAgNBhQAAAAACAUH5AAAAAAECom0A="},"type":"bar"},{"hovertemplate":"Seq len=%{x}\u003cbr\u003eMem=%{y:.1f}GB\u003cbr\u003e%{data.name}\u003cextra\u003e\u003c\u002fextra\u003e","marker":{"color":"rgb(78, 165, 183)"},"name":"parameters","showlegend":true,"visible":false,"x":["1024","2048","4096","8192"],"y":{"dtype":"f8","bdata":"AAAAgPa\u002fl0AAAACANsCXQAAAAIC2wJdAAAAAgLbBl0A="},"type":"bar"},{"hovertemplate":"Seq len=%{x}\u003cbr\u003eMem=%{y:.1f}GB\u003cbr\u003e%{data.name}\u003cextra\u003e\u003c\u002fextra\u003e","marker":{"color":"rgb(227, 138, 66)"},"name":"gradients","showlegend":true,"visible":false,"x":["1024","2048","4096","8192"],"y":{"dtype":"f8","bdata":"AAAAgPa\u002fl0AAAACANsCXQAAAAIC2wJdAAAAAgLbBl0A="},"type":"bar"},{"hovertemplate":"Seq len=%{x}\u003cbr\u003eMem=%{y:.1f}GB\u003cbr\u003e%{data.name}\u003cextra\u003e\u003c\u002fextra\u003e","marker":{"color":"rgb(232, 137, 171)"},"name":"optimizer","showlegend":true,"visible":false,"x":["1024","2048","4096","8192"],"y":{"dtype":"f8","bdata":"AAAAgPa\u002fp0AAAACANsCnQAAAAIC2wKdAAAAAgLbBp0A="},"type":"bar"},{"hovertemplate":"Seq len=%{x}\u003cbr\u003eMem=%{y:.1f}GB\u003cbr\u003e%{data.name}\u003cextra\u003e\u003c\u002fextra\u003e","marker":{"color":"rgb(206, 192, 250)"},"name":"activations","showlegend":true,"visible":false,"x":["1024","2048","4096","8192"],"y":{"dtype":"f8","bdata":"AAAAAIA2YkAAAAAAgA58QAAAAABA35dAAAAAAKDHtUA="},"type":"bar"}], {"template":{"data":{"barpolar":[{"marker":{"line":{"color":"white","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"barpolar"}],"bar":[{"error_x":{"color":"#2a3f5f"},"error_y":{"color":"#2a3f5f"},"marker":{"line":{"color":"white","width":0.5},"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"bar"}],"carpet":[{"aaxis":{"endlinecolor":"#2a3f5f","gridcolor":"#C8D4E3","linecolor":"#C8D4E3","minorgridcolor":"#C8D4E3","startlinecolor":"#2a3f5f"},"baxis":{"endlinecolor":"#2a3f5f","gridcolor":"#C8D4E3","linecolor":"#C8D4E3","minorgridcolor":"#C8D4E3","startlinecolor":"#2a3f5f"},"type":"carpet"}],"choropleth":[{"colorbar":{"outlinewidth":0,"ticks":""},"type":"choropleth"}],"contourcarpet":[{"colorbar":{"outlinewidth":0,"ticks":""},"type":"contourcarpet"}],"contour":[{"colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"type":"contour"}],"heatmap":[{"colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"type":"heatmap"}],"histogram2dcontour":[{"colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"type":"histogram2dcontour"}],"histogram2d":[{"colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"type":"histogram2d"}],"histogram":[{"marker":{"pattern":{"fillmode":"overlay","size":10,"solidity":0.2}},"type":"histogram"}],"mesh3d":[{"colorbar":{"outlinewidth":0,"ticks":""},"type":"mesh3d"}],"parcoords":[{"line":{"colorbar":{"outlinewidth":0,"ticks":""}},"type":"parcoords"}],"pie":[{"automargin":true,"type":"pie"}],"scatter3d":[{"line":{"colorbar":{"outlinewidth":0,"ticks":""}},"marker":{"colorbar":{"outlinewidth":0,"ticks":""}},"type":"scatter3d"}],"scattercarpet":[{"marker":{"colorbar":{"outlinewidth":0,"ticks":""}},"type":"scattercarpet"}],"scattergeo":[{"marker":{"colorbar":{"outlinewidth":0,"ticks":""}},"type":"scattergeo"}],"scattergl":[{"marker":{"colorbar":{"outlinewidth":0,"ticks":""}},"type":"scattergl"}],"scattermapbox":[{"marker":{"colorbar":{"outlinewidth":0,"ticks":""}},"type":"scattermapbox"}],"scattermap":[{"marker":{"colorbar":{"outlinewidth":0,"ticks":""}},"type":"scattermap"}],"scatterpolargl":[{"marker":{"colorbar":{"outlinewidth":0,"ticks":""}},"type":"scatterpolargl"}],"scatterpolar":[{"marker":{"colorbar":{"outlinewidth":0,"ticks":""}},"type":"scatterpolar"}],"scatter":[{"fillpattern":{"fillmode":"overlay","size":10,"solidity":0.2},"type":"scatter"}],"scatterternary":[{"marker":{"colorbar":{"outlinewidth":0,"ticks":""}},"type":"scatterternary"}],"surface":[{"colorbar":{"outlinewidth":0,"ticks":""},"colorscale":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"type":"surface"}],"table":[{"cells":{"fill":{"color":"#EBF0F8"},"line":{"color":"white"}},"header":{"fill":{"color":"#C8D4E3"},"line":{"color":"white"}},"type":"table"}]},"layout":{"annotationdefaults":{"arrowcolor":"#2a3f5f","arrowhead":0,"arrowwidth":1},"autotypenumbers":"strict","coloraxis":{"colorbar":{"outlinewidth":0,"ticks":""}},"colorscale":{"diverging":[[0,"#8e0152"],[0.1,"#c51b7d"],[0.2,"#de77ae"],[0.3,"#f1b6da"],[0.4,"#fde0ef"],[0.5,"#f7f7f7"],[0.6,"#e6f5d0"],[0.7,"#b8e186"],[0.8,"#7fbc41"],[0.9,"#4d9221"],[1,"#276419"]],"sequential":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]],"sequentialminus":[[0.0,"#0d0887"],[0.1111111111111111,"#46039f"],[0.2222222222222222,"#7201a8"],[0.3333333333333333,"#9c179e"],[0.4444444444444444,"#bd3786"],[0.5555555555555556,"#d8576b"],[0.6666666666666666,"#ed7953"],[0.7777777777777778,"#fb9f3a"],[0.8888888888888888,"#fdca26"],[1.0,"#f0f921"]]},"colorway":["#636efa","#EF553B","#00cc96","#ab63fa","#FFA15A","#19d3f3","#FF6692","#B6E880","#FF97FF","#FECB52"],"font":{"color":"#2a3f5f"},"geo":{"bgcolor":"white","lakecolor":"white","landcolor":"white","showlakes":true,"showland":true,"subunitcolor":"#C8D4E3"},"hoverlabel":{"align":"left"},"hovermode":"closest","mapbox":{"style":"light"},"paper_bgcolor":"white","plot_bgcolor":"white","polar":{"angularaxis":{"gridcolor":"#EBF0F8","linecolor":"#EBF0F8","ticks":""},"bgcolor":"white","radialaxis":{"gridcolor":"#EBF0F8","linecolor":"#EBF0F8","ticks":""}},"scene":{"xaxis":{"backgroundcolor":"white","gridcolor":"#DFE8F3","gridwidth":2,"linecolor":"#EBF0F8","showbackground":true,"ticks":"","zerolinecolor":"#EBF0F8"},"yaxis":{"backgroundcolor":"white","gridcolor":"#DFE8F3","gridwidth":2,"linecolor":"#EBF0F8","showbackground":true,"ticks":"","zerolinecolor":"#EBF0F8"},"zaxis":{"backgroundcolor":"white","gridcolor":"#DFE8F3","gridwidth":2,"linecolor":"#EBF0F8","showbackground":true,"ticks":"","zerolinecolor":"#EBF0F8"}},"shapedefaults":{"line":{"color":"#2a3f5f"}},"ternary":{"aaxis":{"gridcolor":"#DFE8F3","linecolor":"#A2B1C6","ticks":""},"baxis":{"gridcolor":"#DFE8F3","linecolor":"#A2B1C6","ticks":""},"bgcolor":"white","caxis":{"gridcolor":"#DFE8F3","linecolor":"#A2B1C6","ticks":""}},"title":{"x":0.05},"xaxis":{"automargin":true,"gridcolor":"#EBF0F8","linecolor":"#EBF0F8","ticks":"","title":{"standoff":15},"zerolinecolor":"#EBF0F8","zerolinewidth":2},"yaxis":{"automargin":true,"gridcolor":"#EBF0F8","linecolor":"#EBF0F8","ticks":"","title":{"standoff":15},"zerolinecolor":"#EBF0F8","zerolinewidth":2}}},"title":{"text":"Memory Usage with Recomputation","x":0.5,"y":0.95},"margin":{"r":80},"legend":{"y":0.95,"x":1.02,"xanchor":"left","yanchor":"top"},"updatemenus":[{"active":0,"buttons":[{"args":[{"visible":[true,true,true,true,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false]},{"yaxis.range":[0,203.1547058105469]}],"label":"1B","method":"update"},{"args":[{"visible":[false,false,false,false,true,true,true,true,false,false,false,false,false,false,false,false,false,false,false,false]},{"yaxis.range":[0,314.4336639404297]}],"label":"3B","method":"update"},{"args":[{"visible":[false,false,false,false,false,false,false,false,true,true,true,true,false,false,false,false,false,false,false,false]},{"yaxis.range":[0,504.2233764648438]}],"label":"8B","method":"update"},{"args":[{"visible":[false,false,false,false,false,false,false,false,false,false,false,false,true,true,true,true,false,false,false,false]},{"yaxis.range":[0,3021.530541992188]}],"label":"70B","method":"update"},{"args":[{"visible":[false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,false,true,true,true,true]},{"yaxis.range":[0,12823.0716796875]}],"label":"405B","method":"update"}],"direction":"down","showactive":true,"x":1.035,"xanchor":"left","y":0.6,"yanchor":"top"},{"active":0,"buttons":[{"args":[{"y":[[3.9879302978515625,3.9957427978515625,4.0113677978515625,4.0426177978515625],[3.9879302978515625,3.9957427978515625,4.0113677978515625,4.0426177978515625],[7.975860595703125,7.991485595703125,8.022735595703125,8.085235595703125],[3.564453125,12.12890625,44.2578125,168.515625],[13.296180725097656,13.307899475097656,13.331336975097656,13.378211975097656],[13.296180725097656,13.307899475097656,13.331336975097656,13.378211975097656],[26.592361450195312,26.615798950195312,26.662673950195312,26.756423950195312],[6.0732421875,18.708984375,63.66796875,232.3359375],[25.979034423828125,25.994659423828125,26.025909423828125,26.088409423828125],[25.979034423828125,25.994659423828125,26.025909423828125,26.088409423828125],[51.95806884765625,51.98931884765625,52.05181884765625,52.17681884765625],[9.25390625,28.5078125,97.015625,354.03125],[243.97711181640625,244.00836181640625,244.07086181640625,244.19586181640625],[243.97711181640625,244.00836181640625,244.07086181640625,244.19586181640625],[487.9542236328125,488.0167236328125,488.1417236328125,488.3917236328125],[46.2578125,142.515625,485.03125,1770.0625],[1519.99072265625,1520.05322265625,1520.17822265625,1520.42822265625],[1519.99072265625,1520.05322265625,1520.17822265625,1520.42822265625],[3039.9814453125,3040.1064453125,3040.3564453125,3040.8564453125],[145.703125,448.90625,1527.8125,5575.625]]}],"label":"None","method":"restyle"},{"args":[{"y":[[3.9879302978515625,3.9957427978515625,4.0113677978515625,4.0426177978515625],[3.9879302978515625,3.9957427978515625,4.0113677978515625,4.0426177978515625],[7.975860595703125,7.991485595703125,8.022735595703125,8.085235595703125],[1.064453125,2.12890625,4.2578125,8.515625],[13.296180725097656,13.307899475097656,13.331336975097656,13.378211975097656],[13.296180725097656,13.307899475097656,13.331336975097656,13.378211975097656],[26.592361450195312,26.615798950195312,26.662673950195312,26.756423950195312],[2.7919921875,5.583984375,11.16796875,22.3359375],[25.979034423828125,25.994659423828125,26.025909423828125,26.088409423828125],[25.979034423828125,25.994659423828125,26.025909423828125,26.088409423828125],[51.95806884765625,51.98931884765625,52.05181884765625,52.17681884765625],[4.25390625,8.5078125,17.015625,34.03125],[243.97711181640625,244.00836181640625,244.07086181640625,244.19586181640625],[243.97711181640625,244.00836181640625,244.07086181640625,244.19586181640625],[487.9542236328125,488.0167236328125,488.1417236328125,488.3917236328125],[21.2578125,42.515625,85.03125,170.0625],[1519.99072265625,1520.05322265625,1520.17822265625,1520.42822265625],[1519.99072265625,1520.05322265625,1520.17822265625,1520.42822265625],[3039.9814453125,3040.1064453125,3040.3564453125,3040.8564453125],[66.953125,133.90625,267.8125,535.625]]}],"label":"selective","method":"restyle"},{"args":[{"y":[[3.9879302978515625,3.9957427978515625,4.0113677978515625,4.0426177978515625],[3.9879302978515625,3.9957427978515625,4.0113677978515625,4.0426177978515625],[7.975860595703125,7.991485595703125,8.022735595703125,8.085235595703125],[0.064453125,0.12890625,0.2578125,0.515625],[13.296180725097656,13.307899475097656,13.331336975097656,13.378211975097656],[13.296180725097656,13.307899475097656,13.331336975097656,13.378211975097656],[26.592361450195312,26.615798950195312,26.662673950195312,26.756423950195312],[0.1669921875,0.333984375,0.66796875,1.3359375],[25.979034423828125,25.994659423828125,26.025909423828125,26.088409423828125],[25.979034423828125,25.994659423828125,26.025909423828125,26.088409423828125],[51.95806884765625,51.98931884765625,52.05181884765625,52.17681884765625],[0.25390625,0.5078125,1.015625,2.03125],[243.97711181640625,244.00836181640625,244.07086181640625,244.19586181640625],[243.97711181640625,244.00836181640625,244.07086181640625,244.19586181640625],[487.9542236328125,488.0167236328125,488.1417236328125,488.3917236328125],[1.2578125,2.515625,5.03125,10.0625],[1519.99072265625,1520.05322265625,1520.17822265625,1520.42822265625],[1519.99072265625,1520.05322265625,1520.17822265625,1520.42822265625],[3039.9814453125,3040.1064453125,3040.3564453125,3040.8564453125],[3.953125,7.90625,15.8125,31.625]]}],"label":"full","method":"restyle"}],"direction":"down","showactive":true,"x":1.035,"xanchor":"left","y":0.4,"yanchor":"top"}],"barmode":"stack","xaxis":{"title":{"text":"Sequence Length"}},"yaxis":{"title":{"text":"Memory (GB)"},"range":[0,203.1547058105469]},"width":850,"height":500,"annotations":[{"showarrow":false,"text":"Model Size:","x":1.035,"xanchor":"left","xref":"paper","y":0.6,"yanchor":"bottom","yref":"paper"},{"showarrow":false,"text":"Recomputation:","x":1.035,"xanchor":"left","xref":"paper","y":0.4,"yanchor":"bottom","yref":"paper"}]}, {"responsive": true, "scrollZoom": false} ) }; </script> </div>
dist/assets/images/5d_full.svg ADDED
dist/index.html CHANGED
@@ -1639,12 +1639,14 @@
1639
 
1640
  <p>On the other hand, ZeRO-1 and ZeRO-2, which focus on optimizer states and gradients, can be easily combined with Pipeline Parallelism and are complementary to it. Combining them don't raise any particular new challenge. For instance, the training of DeepSeek-v3 used PP combined with ZeRO-1!</p>
1641
 
1642
- <p><strong>Tensor Parallelism</strong> (with Sequence Parallelism) is naturally complementary and interoperable with both Pipeline Parallelism and ZeRO-3, because it relies on the distributive property of matrix multiplication that allows weights and activations to be sharded and computed independently before being combined. In practice TP has two important limitations we've discussed in the previous sections: First, since its communication operations are part of the critical path of computation, it's difficult to scale well beyond a certain point at which communication overhead begins to dominate. Second, unlike ZeRO and PP which are model-agnostic, TP requires careful handling of activation sharding - sometimes along the hidden dimension (in the TP region) and sometimes along the sequence dimension (in the SP region) - making it more cumbersome to implement correctly and requiring model-specific knowledge to ensure proper sharding patterns throughout.</p>
1643
-
1644
- <div class="l-page"><img alt="TP & SP diagram" src="/assets/images/5D_nutshell_tp_sp.svg" /></div>
1645
- <!-- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p> -->
1646
 
 
 
1647
 
 
 
 
1648
  <p>When combining parallelism strategies, TP will typically be kept for high-speed intra-node communications while ZeRO-3 or PP can use parallelism groups spanning lower speed inter-node communications, since their communication patterns are more amenable to scaling. The main consideration is organizing the GPU groups efficiently for each parallelism dimension to maximize throughput and minimize communication overhead, while being mindful of TP's scaling limitations.</p>
1649
 
1650
 
@@ -1652,7 +1654,7 @@
1652
 
1653
  <p><strong>Context Parallelism (CP)</strong> specifically targets the challenge of training with very long sequences by sharding activations along the sequence dimension across GPUs. While most operations like MLPs and LayerNorm can process these sharded sequences independently, attention layers require communication since each token needs access to keys/values from the full sequence. This is handled efficiently through ring attention patterns that overlap computation and communication. CP is particularly valuable when scaling to extreme sequence lengths (128k+ tokens) where even with full activation recomputation the memory requirements for attention would be prohibitive on a single GPU.</p>
1654
 
1655
- <div class="l-page"><img alt="CP diagram" src="/assets/images/5d_nutshell_cp.svg" /></div>
1656
 
1657
  <!-- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p> -->
1658
 
@@ -1667,7 +1669,7 @@
1667
  <li>Expert Parallelism primarly affects the MoE layers (which replace standard MLP blocks), leaving attention and other components unchanged</li>
1668
  </ul>
1669
 
1670
- <div class="l-page"><img alt="EP diagram" src="/assets/images/5d_nutshell_ep.svg" /></div>
1671
 
1672
  <div class="note-box">
1673
  <p class="note-box-title">📝 Note</p>
@@ -1711,11 +1713,11 @@
1711
 
1712
  <p>Which leads us to this beautiful diagram to summarize all what we’ve seen:</p>
1713
 
1714
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
1715
 
1716
  <p>And to have an idea of the memory benefits of each parallelism:</p>
1717
 
1718
- <div class="l-page"><img alt="5Dparallelism_8Bmemoryusage.svg" src="/assets/images/5Dparallelism_8Bmemoryusage.svg" /></div>
1719
 
1720
  <h2>How to Find the Best Training Configuration</h2>
1721
 
@@ -2564,6 +2566,16 @@
2564
  <p>Training language models across compute clusters with DiLoCo.</p>
2565
  </div>
2566
 
 
 
 
 
 
 
 
 
 
 
2567
  <h3>Debugging</h3>
2568
 
2569
  <div>
@@ -2727,6 +2739,10 @@
2727
  <p>Easy explanation of Flash Attention</p>
2728
  </div>
2729
 
 
 
 
 
2730
 
2731
  <h2>Appendix</h2>
2732
 
 
1639
 
1640
  <p>On the other hand, ZeRO-1 and ZeRO-2, which focus on optimizer states and gradients, can be easily combined with Pipeline Parallelism and are complementary to it. Combining them don't raise any particular new challenge. For instance, the training of DeepSeek-v3 used PP combined with ZeRO-1!</p>
1641
 
1642
+ <p><strong>Tensor Parallelism</strong> (with Sequence Parallelism) is naturally complementary and interoperable with both Pipeline Parallelism and ZeRO-3, because it relies on the distributive property of matrix multiplication that allows weights and activations to be sharded and computed independently before being combined.</p>
 
 
 
1643
 
1644
+ <img alt="TP & SP diagram" src="/assets/images/5D_nutshell_tp_sp.svg" style="width: 1000px; max-width: none;" />
1645
+ <!-- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p> -->
1646
 
1647
+
1648
+ <p>In practice TP has two important limitations we've discussed in the previous sections: First, since its communication operations are part of the critical path of computation, it's difficult to scale well beyond a certain point at which communication overhead begins to dominate. Second, unlike ZeRO and PP which are model-agnostic, TP requires careful handling of activation sharding - sometimes along the hidden dimension (in the TP region) and sometimes along the sequence dimension (in the SP region) - making it more cumbersome to implement correctly and requiring model-specific knowledge to ensure proper sharding patterns throughout.</p>
1649
+
1650
  <p>When combining parallelism strategies, TP will typically be kept for high-speed intra-node communications while ZeRO-3 or PP can use parallelism groups spanning lower speed inter-node communications, since their communication patterns are more amenable to scaling. The main consideration is organizing the GPU groups efficiently for each parallelism dimension to maximize throughput and minimize communication overhead, while being mindful of TP's scaling limitations.</p>
1651
 
1652
 
 
1654
 
1655
  <p><strong>Context Parallelism (CP)</strong> specifically targets the challenge of training with very long sequences by sharding activations along the sequence dimension across GPUs. While most operations like MLPs and LayerNorm can process these sharded sequences independently, attention layers require communication since each token needs access to keys/values from the full sequence. This is handled efficiently through ring attention patterns that overlap computation and communication. CP is particularly valuable when scaling to extreme sequence lengths (128k+ tokens) where even with full activation recomputation the memory requirements for attention would be prohibitive on a single GPU.</p>
1656
 
1657
+ <img alt="CP diagram" src="/assets/images/5d_nutshell_cp.svg" style="width: 1000px; max-width: none;" />
1658
 
1659
  <!-- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p> -->
1660
 
 
1669
  <li>Expert Parallelism primarly affects the MoE layers (which replace standard MLP blocks), leaving attention and other components unchanged</li>
1670
  </ul>
1671
 
1672
+ <img alt="EP diagram" src="/assets/images/5d_nutshell_ep.svg" style="width: 1000px; max-width: none;" />
1673
 
1674
  <div class="note-box">
1675
  <p class="note-box-title">📝 Note</p>
 
1713
 
1714
  <p>Which leads us to this beautiful diagram to summarize all what we’ve seen:</p>
1715
 
1716
+ <p><img alt="image.png" src="/assets/images/5d_full.svg" style="width: 1000px; max-width: none;"/></p>
1717
 
1718
  <p>And to have an idea of the memory benefits of each parallelism:</p>
1719
 
1720
+ <img alt="5Dparallelism_8Bmemoryusage.svg" src="/assets/images/5Dparallelism_8Bmemoryusage.svg" style="width: 1000px; max-width: none;"/>
1721
 
1722
  <h2>How to Find the Best Training Configuration</h2>
1723
 
 
2566
  <p>Training language models across compute clusters with DiLoCo.</p>
2567
  </div>
2568
 
2569
+ <div>
2570
+ <a href="https://github.com/kakaobrain/torchgpipe"><strong>torchgpipe</strong></a>
2571
+ <p>A GPipe implementation in PyTorch.</p>
2572
+ </div>
2573
+
2574
+ <div>
2575
+ <a href="https://github.com/EleutherAI/oslo"><strong>OSLO</strong></a>
2576
+ <p>OSLO: Open Source for Large-scale Optimization.</p>
2577
+ </div>
2578
+
2579
  <h3>Debugging</h3>
2580
 
2581
  <div>
 
2739
  <p>Easy explanation of Flash Attention</p>
2740
  </div>
2741
 
2742
+ <div>
2743
+ <a href="https://github.com/tunib-ai/large-scale-lm-tutorials"><strong>TunibAI's 3D parallelism tutorial</strong></a>
2744
+ <p>Large-scale language modeling tutorials with PyTorch.</p>
2745
+ </div>
2746
 
2747
  <h2>Appendix</h2>
2748
 
src/index.html CHANGED
@@ -1641,7 +1641,7 @@
1641
 
1642
  <p><strong>Tensor Parallelism</strong> (with Sequence Parallelism) is naturally complementary and interoperable with both Pipeline Parallelism and ZeRO-3, because it relies on the distributive property of matrix multiplication that allows weights and activations to be sharded and computed independently before being combined.</p>
1643
 
1644
- <div class="l-page"><img alt="TP & SP diagram" src="/assets/images/5D_nutshell_tp_sp.svg" /></div>
1645
  <!-- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p> -->
1646
 
1647
 
@@ -1654,7 +1654,7 @@
1654
 
1655
  <p><strong>Context Parallelism (CP)</strong> specifically targets the challenge of training with very long sequences by sharding activations along the sequence dimension across GPUs. While most operations like MLPs and LayerNorm can process these sharded sequences independently, attention layers require communication since each token needs access to keys/values from the full sequence. This is handled efficiently through ring attention patterns that overlap computation and communication. CP is particularly valuable when scaling to extreme sequence lengths (128k+ tokens) where even with full activation recomputation the memory requirements for attention would be prohibitive on a single GPU.</p>
1656
 
1657
- <div class="l-page"><img alt="CP diagram" src="/assets/images/5d_nutshell_cp.svg" /></div>
1658
 
1659
  <!-- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p> -->
1660
 
@@ -1669,7 +1669,7 @@
1669
  <li>Expert Parallelism primarly affects the MoE layers (which replace standard MLP blocks), leaving attention and other components unchanged</li>
1670
  </ul>
1671
 
1672
- <div class="l-page"><img alt="EP diagram" src="/assets/images/5d_nutshell_ep.svg" /></div>
1673
 
1674
  <div class="note-box">
1675
  <p class="note-box-title">📝 Note</p>
@@ -1713,11 +1713,11 @@
1713
 
1714
  <p>Which leads us to this beautiful diagram to summarize all what we’ve seen:</p>
1715
 
1716
- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p>
1717
 
1718
  <p>And to have an idea of the memory benefits of each parallelism:</p>
1719
 
1720
- <div class="l-page"><img alt="5Dparallelism_8Bmemoryusage.svg" src="/assets/images/5Dparallelism_8Bmemoryusage.svg" /></div>
1721
 
1722
  <h2>How to Find the Best Training Configuration</h2>
1723
 
@@ -2566,6 +2566,16 @@
2566
  <p>Training language models across compute clusters with DiLoCo.</p>
2567
  </div>
2568
 
 
 
 
 
 
 
 
 
 
 
2569
  <h3>Debugging</h3>
2570
 
2571
  <div>
@@ -2729,6 +2739,10 @@
2729
  <p>Easy explanation of Flash Attention</p>
2730
  </div>
2731
 
 
 
 
 
2732
 
2733
  <h2>Appendix</h2>
2734
 
 
1641
 
1642
  <p><strong>Tensor Parallelism</strong> (with Sequence Parallelism) is naturally complementary and interoperable with both Pipeline Parallelism and ZeRO-3, because it relies on the distributive property of matrix multiplication that allows weights and activations to be sharded and computed independently before being combined.</p>
1643
 
1644
+ <img alt="TP & SP diagram" src="/assets/images/5D_nutshell_tp_sp.svg" style="width: 1000px; max-width: none;" />
1645
  <!-- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p> -->
1646
 
1647
 
 
1654
 
1655
  <p><strong>Context Parallelism (CP)</strong> specifically targets the challenge of training with very long sequences by sharding activations along the sequence dimension across GPUs. While most operations like MLPs and LayerNorm can process these sharded sequences independently, attention layers require communication since each token needs access to keys/values from the full sequence. This is handled efficiently through ring attention patterns that overlap computation and communication. CP is particularly valuable when scaling to extreme sequence lengths (128k+ tokens) where even with full activation recomputation the memory requirements for attention would be prohibitive on a single GPU.</p>
1656
 
1657
+ <img alt="CP diagram" src="/assets/images/5d_nutshell_cp.svg" style="width: 1000px; max-width: none;" />
1658
 
1659
  <!-- <p><img alt="image.png" src="/assets/images/placeholder.png" /></p> -->
1660
 
 
1669
  <li>Expert Parallelism primarly affects the MoE layers (which replace standard MLP blocks), leaving attention and other components unchanged</li>
1670
  </ul>
1671
 
1672
+ <img alt="EP diagram" src="/assets/images/5d_nutshell_ep.svg" style="width: 1000px; max-width: none;" />
1673
 
1674
  <div class="note-box">
1675
  <p class="note-box-title">📝 Note</p>
 
1713
 
1714
  <p>Which leads us to this beautiful diagram to summarize all what we’ve seen:</p>
1715
 
1716
+ <p><img alt="image.png" src="/assets/images/5d_full.svg" style="width: 1000px; max-width: none;"/></p>
1717
 
1718
  <p>And to have an idea of the memory benefits of each parallelism:</p>
1719
 
1720
+ <img alt="5Dparallelism_8Bmemoryusage.svg" src="/assets/images/5Dparallelism_8Bmemoryusage.svg" style="width: 1000px; max-width: none;"/>
1721
 
1722
  <h2>How to Find the Best Training Configuration</h2>
1723
 
 
2566
  <p>Training language models across compute clusters with DiLoCo.</p>
2567
  </div>
2568
 
2569
+ <div>
2570
+ <a href="https://github.com/kakaobrain/torchgpipe"><strong>torchgpipe</strong></a>
2571
+ <p>A GPipe implementation in PyTorch.</p>
2572
+ </div>
2573
+
2574
+ <div>
2575
+ <a href="https://github.com/EleutherAI/oslo"><strong>OSLO</strong></a>
2576
+ <p>OSLO: Open Source for Large-scale Optimization.</p>
2577
+ </div>
2578
+
2579
  <h3>Debugging</h3>
2580
 
2581
  <div>
 
2739
  <p>Easy explanation of Flash Attention</p>
2740
  </div>
2741
 
2742
+ <div>
2743
+ <a href="https://github.com/tunib-ai/large-scale-lm-tutorials"><strong>TunibAI's 3D parallelism tutorial</strong></a>
2744
+ <p>Large-scale language modeling tutorials with PyTorch.</p>
2745
+ </div>
2746
 
2747
  <h2>Appendix</h2>
2748