justheuristic committed
Commit 4ee0173
1 Parent(s): 5c906aa

update "how to train at 100mbps"

Files changed (3):
  1. app.py +44 -8
  2. st_helpers.py +2 -7
  3. static/content_style.css +4 -1
app.py CHANGED
@@ -19,11 +19,11 @@ make_header()
content_text(f"""
There was a time when you could comfortably train state-of-the-art vision and language models at home on your workstation.
The first convolutional neural net to beat ImageNet
- (<a target="_blank" href="https://proceedings.neurips.cc/paper/2012/file/c399862d3b9d6b76c8436e924a68c45b-Paper.pdf">AlexNet</a>)
+ ({cite("AlexNet", "https://proceedings.neurips.cc/paper/2012/file/c399862d3b9d6b76c8436e924a68c45b-Paper.pdf")})
was trained for 5-6 days on two gamer-grade GPUs. In contrast, today's TOP-1 ImageNet model
- (<a target="_blank" href="https://arxiv.org/abs/2106.04803">CoAtNet</a>)
+ ({cite("CoAtNet", "https://arxiv.org/abs/2106.04803")})
takes 20,000 TPU-v3 days. And things are even worse in the NLP world: training
- <a target="_blank" href="https://arxiv.org/abs/2005.14165">GPT&#8209;3</a> on a top-tier server
+ {cite("GPT&#8209;3", "https://arxiv.org/abs/2005.14165")} on a top-tier server
with 8x A100 would take decades.""")

content_text(f"""
@@ -34,12 +34,49 @@ All it takes is for a bunch of us to come together. In fact, we're doing it righ
draw_current_progress()

content_text(f"""
- We're training a model similar to <a target="_blank" href="https://openai.com/blog/dall-e/">OpenAI DALL-E</a>,
+ We're training a model similar to {cite("OpenAI DALL-E", "https://openai.com/blog/dall-e/")},
that is, a transformer "language model" that generates images from text description.
- It is trained on <a target="_blank" href=https://laion.ai/laion-400-open-dataset/>LAION-400M</a>,
+ It is trained on {cite("LAION-400M", "https://laion.ai/laion-400-open-dataset/")},
the world's largest openly available image-text-pair dataset with 400 million samples. Our model is based on
- the <a target="_blank" href=https://github.com/lucidrains/DALLE-pytorch>dalle&#8209;pytorch</a> implementation
- by <a target="_blank" href="https://github.com/lucidrains">Phil Wang</a> with several tweaks for memory-efficient training.""")
+ the {cite("dalle&#8209;pytorch", "https://github.com/lucidrains/DALLE-pytorch")} implementation
+ by {cite("Phil Wang", "https://github.com/lucidrains")} with a few tweaks to make it communication-efficient.
+ """, vspace_after=8)
+
+
+ with st.expander("How to train efficiently over the internet?"):
+     content_text(f"""
+     Modern distributed training algorithms are designed for HPC networks with 10-100 gigabit per second bandwidth.
+     In turn, a typical Internet connection runs at 10-100 megabits per second: that’s three orders of magnitude slower.
+     To make distributed training over the Internet efficient, you need to win back these three orders of magnitude.
+     """)
+     content_text(f"""
+     This may seem daunting at first, but in reality, DL researchers have already made all the necessary pieces for solving this puzzle:
+     <table style="border: 0px;"><tbody style="border: 0px;">
+     <tr><td> Speed-up (AllReduce)<br> </td> <td>Existing technique</td></tr>
+     <tr><td class=centered><strong>4-16x</strong></td><td>
+     <strong>Large-batch training:</strong> {cite("You et al. (2019)", "https://arxiv.org/abs/1904.00962")} proposed a way for training neural networks efficiently with larger batches, and hence, fewer communication rounds.
+     </td></tr>
+     <tr><td class=centered><strong>4-64x</strong></td><td>
+     <strong>Gradient Compression:</strong> from simple {cite("8-bit quantization", "https://arxiv.org/abs/1511.04561")}
+     to advanced techniques such as {cite("Deep Gradient Compression", "https://arxiv.org/abs/1712.01887")},
+     {cite("PowerSGD", "https://arxiv.org/abs/1905.13727")}, {cite("1-bit Adam", "https://arxiv.org/abs/2102.02888")},
+     and many others. As a rule of thumb, you can safely reduce communication by 16-64x. More extreme compression is often
+     possible, but it may affect stability or final quality.
+     </td></tr>
+     <tr><td class=centered><strong>4-24x</strong></td><td>
+     <strong>Parameter sharing:</strong> reusing parameters between model layers results in a model with fewer parameters,
+     and hence, fewer gradients to communicate. {cite("Lan et al. (2019)", "https://arxiv.org/abs/1909.11942")} and
+     {cite("Xue et al. (2021)", "https://arxiv.org/pdf/2107.11817.pdf")} propose efficient parameter sharing techniques
+     for NLP and vision.
+     </td></tr>
+     <tr><td class=centered><strong>1.5-2x</strong></td><td>
+     <strong>Overlapping computation with communication:</strong> running network communication in background while
+     computing the next portion of gradients. This is a {cite("long-standing trick from HPC", "https://ur.booksc.eu/book/1624068/2d0506")}
+     that was recently adapted for DL training. {cite("Ren et al. (2021)", "https://arxiv.org/abs/2101.06840")} show that
+     updating parameters in background while computing the next batch of gradients does not reduce convergence.
+     </td></tr>
+     </tbody></table>
+     """)


content_title("How do I join?")
@@ -71,5 +108,4 @@ content_text("<b> TODO </b> General Story That Weaves Together Three Tabs Below

make_tabs()

- content_text("<b> TODO UPDATE")
make_footer()
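
Editor's note on the four rows of the expander table above: each maps to a concrete mechanism, sketched below as small illustrative PyTorch toys. None of this code is from this repository; model sizes, names, and the placement of the gradient exchange are assumptions made for illustration only.

For the "Large-batch training" row, the communication saving comes from synchronizing once per large batch rather than once per micro-batch. The sketch below shows plain gradient accumulation, the mechanism that builds a large batch locally; it does not implement the LAMB optimizer of You et al. (2019), only the fewer-synchronizations part.

```python
import torch
import torch.nn as nn

# Toy stand-ins: in a real run the model is a large transformer and the
# synchronization is a gradient exchange with other peers over the internet.
model = nn.Linear(32, 2)
opt = torch.optim.SGD(model.parameters(), lr=0.1)
accumulation_steps = 16  # 16 micro-batches -> one synchronization round

for step in range(64):
    x, y = torch.randn(8, 32), torch.randint(0, 2, (8,))
    loss = nn.functional.cross_entropy(model(x), y)
    (loss / accumulation_steps).backward()  # gradients keep accumulating in p.grad

    if (step + 1) % accumulation_steps == 0:
        # A single averaging of gradients across peers would go here:
        # one network round per 16 micro-batches instead of one per micro-batch.
        opt.step()
        opt.zero_grad()
```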
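The "Gradient Compression" row cites 8-bit quantization as the simplest option. Here is a hypothetical sketch of uniform 8-bit quantization of a single gradient tensor, just to show where the ~4x wire saving comes from; production schemes such as PowerSGD or 1-bit Adam are considerably more involved and typically add error feedback.

```python
import torch

def quantize_uint8(grad: torch.Tensor):
    """Uniform 8-bit quantization: the payload is ~4x smaller than float32."""
    lo, hi = grad.min(), grad.max()
    scale = (hi - lo).clamp_min(1e-12) / 255.0
    codes = ((grad - lo) / scale).round().to(torch.uint8)
    return codes, lo, scale  # codes travel over the network; lo/scale are tiny

def dequantize_uint8(codes, lo, scale):
    return codes.to(torch.float32) * scale + lo

grad = torch.randn(1_000_000)           # a fake gradient tensor
codes, lo, scale = quantize_uint8(grad)  # what would actually be sent
restored = dequantize_uint8(codes, lo, scale)
print("max round-trip error:", (restored - grad).abs().max().item())
```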
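The "Parameter sharing" row follows the ALBERT line of work (Lan et al., 2019): if several layers reuse one set of weights, there are proportionally fewer parameters, and hence fewer gradients to ship around. A minimal sketch with illustrative sizes, not this project's actual architecture:

```python
import torch
import torch.nn as nn

class SharedLayerEncoder(nn.Module):
    """ALBERT-style encoder: one layer's weights reused at every depth."""
    def __init__(self, dim: int = 256, depth: int = 12):
        super().__init__()
        self.layer = nn.TransformerEncoderLayer(d_model=dim, nhead=4, batch_first=True)
        self.depth = depth

    def forward(self, x):
        for _ in range(self.depth):
            x = self.layer(x)  # the same parameters are applied at every "layer"
        return x

model = SharedLayerEncoder()
n_params = sum(p.numel() for p in model.parameters())
print(f"{n_params:,} parameters behave like a 12-layer model -> ~12x fewer gradients to send")
```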
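For the "Overlapping computation with communication" row: the classic trick is to start sending a layer's gradient as soon as backprop has produced it, while gradients for the remaining layers are still being computed. The toy below uses gradient hooks and a background thread (`exchange` is a stand-in for an asynchronous all-reduce); real systems, including the bucketed all-reduce in data-parallel frameworks and the delayed parameter updates studied by Ren et al. (2021), are more elaborate.

```python
import time
from concurrent.futures import ThreadPoolExecutor

import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(256, 256), nn.ReLU(), nn.Linear(256, 10))
opt = torch.optim.SGD(model.parameters(), lr=0.1)
pool = ThreadPoolExecutor(max_workers=1)
in_flight = []

def exchange(grad):
    time.sleep(0.01)  # pretend this is a network transfer of one gradient tensor
    return grad

def send_when_ready(param):
    # The hook fires during backward, as soon as this parameter's gradient exists,
    # so its transfer overlaps with computing the remaining layers' gradients.
    param.register_hook(lambda grad: in_flight.append(pool.submit(exchange, grad.clone())))

for p in model.parameters():
    send_when_ready(p)

x, y = torch.randn(32, 256), torch.randint(0, 10, (32,))
loss = nn.functional.cross_entropy(model(x), y)
loss.backward()              # communication is launched layer by layer, mid-backward
for fut in in_flight:
    fut.result()             # block only after all transfers have been started
opt.step()
```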
st_helpers.py CHANGED
@@ -50,10 +50,5 @@ def content_text(text: str, vspace_before: int = 0, vspace_after: int = 0):
        f'{text}</div><center>',
        unsafe_allow_html=True)

-
- CITATIONS = {}
-
-
- def cite(tag):
-     CITATIONS.setdefault(tag, len(CITATIONS) + 1)
-     return f"&nbsp;[{CITATIONS[tag]}]"
+ def cite(tag, link):
+     return f"""<a target="_blank" rel="noopener noreferrer" href="{link}">{tag}</a>"""
static/content_style.css CHANGED
@@ -1,11 +1,14 @@
.faded {
    margin: 0 auto;
    background: var(--window-color);
-     box-shadow: 0 0 5px 5px var(--window-color);
+     box-shadow: 0 0 1px 1px var(--window-color);
    font-family: cursive;
    font-family: "Gill Sans", sans-serif;
    display: inline-block
}
+ .centered {
+     text-align: center;
+ }
.padded {
    width: 100%;
    max-width: 800px;