muellerzr HF staff committed on
Commit
06a60a3
•
1 Parent(s): b91e31d
Makefile ADDED
@@ -0,0 +1,19 @@
+ .PHONY: quality style test docs
+
+ # Check that source code meets quality standards
+
+ extra_quality_checks:
+ 	doc-builder style src --max_len 119
+
+ # This target runs checks on all files
+ quality:
+ 	black --check src
+ 	isort --check-only src
+ 	flake8 src
+ 	doc-builder style src --max_len 119 --check_only
+
+ # Format source code automatically and check if there are any problems left that need manual fixing
+ style:
+ 	black src
+ 	isort src
+ 	doc-builder style src --max_len 119
code_samples/accelerate DELETED
@@ -1,17 +0,0 @@
- <pre>
- from accelerate import Accelerator
- accelerator = Accelerator()
- train_dataloader, model, optimizer, scheduler = accelerator.prepare(
-     dataloader, model, optimizer, scheduler
- )
-
- model.train()
- for batch in train_dataloader:
-     optimizer.zero_grad()
-     inputs, targets = batch
-     outputs = model(inputs)
-     loss = loss_function(outputs, targets)
-     accelerator.backward(loss)
-     optimizer.step()
-     scheduler.step()
- </pre>
code_samples/basic DELETED
@@ -1,31 +0,0 @@
- ##
- <pre>
- +from accelerate import Accelerator
- +accelerator = Accelerator()
- +dataloader, model, optimizer, scheduler = accelerator.prepare(
- +    dataloader, model, optimizer, scheduler
- +)
-
- for batch in dataloader:
-     optimizer.zero_grad()
-     inputs, targets = batch
- -   inputs = inputs.to(device)
- -   targets = targets.to(device)
-     outputs = model(inputs)
-     loss = loss_function(outputs, targets)
- -   loss.backward()
- +   accelerator.backward(loss)
-     optimizer.step()
-     scheduler.step()</pre>
- ##
- Everything in `accelerate` revolves around the `Accelerator` class. To use it, first create an object.
- Then call `.prepare`, passing in the PyTorch objects you would normally train with. This will
- return the same objects, but placed on the correct device and distributed if needed. You can then
- train as normal, except that instead of calling `loss.backward()` you call `accelerator.backward(loss)`.
- Also note that you no longer need to call `model.to(device)` or `inputs.to(device)`, as this
- is done automatically by `accelerator.prepare()`.
-
- ##
- To learn more, check out the related documentation:
- - <a href="https://huggingface.co/docs/accelerate/basic_tutorials/migration" target="_blank">Migrating to 🤗 Accelerate</a>
- - <a href="https://huggingface.co/docs/accelerate/package_reference/accelerator" target="_blank">The Accelerator</a>
code_samples/calculating_metrics DELETED
@@ -1,51 +0,0 @@
- ##
- <pre>
- import evaluate
- +from accelerate import Accelerator
- +accelerator = Accelerator()
- +train_dataloader, eval_dataloader, model, optimizer, scheduler = (
- +    accelerator.prepare(
- +        train_dataloader, eval_dataloader,
- +        model, optimizer, scheduler
- +    )
- +)
- metric = evaluate.load("accuracy")
- for batch in train_dataloader:
-     optimizer.zero_grad()
-     inputs, targets = batch
- -   inputs = inputs.to(device)
- -   targets = targets.to(device)
-     outputs = model(inputs)
-     loss = loss_function(outputs, targets)
-     accelerator.backward(loss)
-     optimizer.step()
-     scheduler.step()
-
- model.eval()
- for batch in eval_dataloader:
-     inputs, targets = batch
- -   inputs = inputs.to(device)
- -   targets = targets.to(device)
-     with torch.no_grad():
-         outputs = model(inputs)
-         predictions = outputs.argmax(dim=-1)
- +   predictions, references = accelerator.gather_for_metrics(
- +       (predictions, references)
- +   )
-     metric.add_batch(
-         predictions=predictions,
-         references=references
-     )
- print(metric.compute())</pre>
-
- ##
- When calculating metrics on a validation set, you can use the `Accelerator.gather_for_metrics`
- method to gather the predictions and references from all devices and then calculate the metric on the gathered values.
- This will also *automatically* drop any padded values that were added to the gathered tensors to ensure
- they all have the same length, so the metric is calculated on the correct values.
- ##
- To learn more, check out the related documentation:
-
- - <a href="https://huggingface.co/docs/accelerate/en/quicktour#distributed-evaluation" target="_blank">Quicktour - Calculating metrics</a>
- - <a href="https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.gather_for_metrics" target="_blank">API reference</a>
- - <a href="https://github.com/huggingface/accelerate/blob/main/examples/by_feature/multi_process_metrics.py" target="_blank">Example script</a>
code_samples/checkpointing DELETED
@@ -1,29 +0,0 @@
- ##
- <pre>
- from accelerate import Accelerator
- accelerator = Accelerator()
- dataloader, model, optimizer, scheduler = accelerator.prepare(
-     dataloader, model, optimizer, scheduler
- )
-
- for batch in dataloader:
-     optimizer.zero_grad()
-     inputs, targets = batch
-     outputs = model(inputs)
-     loss = loss_function(outputs, targets)
-     accelerator.backward(loss)
-     optimizer.step()
-     scheduler.step()
- +accelerator.save_state("checkpoint_dir")
- +accelerator.load_state("checkpoint_dir")</pre>
- ##
- To save or load a checkpoint, `Accelerator` provides the `save_state` and `load_state` methods.
- These methods save or load the state of the model, optimizer, and scheduler, as well as the random states and
- any custom registered objects, from the main process on each device to a passed-in folder.
- **This API is designed to save and resume training states only from within the same Python script or training setup.**
- ##
- To learn more, check out the related documentation:
- - <a href="https://huggingface.co/docs/accelerate/usage_guides/checkpoint" target="_blank">Saving and loading training states</a>
- - <a href="https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state" target="_blank">`save_state` API reference</a>
- - <a href="https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.load_state" target="_blank">`load_state` API reference</a>
- - <a href="https://github.com/huggingface/accelerate/blob/main/examples/by_feature/checkpointing.py" target="_blank">Example script</a>
code_samples/experiment_tracking DELETED
@@ -1,32 +0,0 @@
- ##
- <pre>
- from accelerate import Accelerator
- -accelerator = Accelerator()
- +accelerator = Accelerator(log_with="wandb")
- train_dataloader, model, optimizer, scheduler = accelerator.prepare(
-     dataloader, model, optimizer, scheduler
- )
- +accelerator.init_trackers()
- model.train()
- for batch in train_dataloader:
-     optimizer.zero_grad()
-     inputs, targets = batch
-     outputs = model(inputs)
-     loss = loss_function(outputs, targets)
- +   accelerator.log({"loss": loss})
-     accelerator.backward(loss)
-     optimizer.step()
-     scheduler.step()
- +accelerator.end_training()
- </pre>
- ##
- To use experiment trackers with `accelerate`, simply pass the desired tracker to the `log_with` parameter
- when building the `Accelerator` object. Then initialize the tracker(s) by running `Accelerator.init_trackers()`,
- passing in any configuration they may need. Afterwards, call `Accelerator.log` to log a particular value to your tracker.
- At the end of training, call `accelerator.end_training()` to automatically run any finalization functions a tracking library
- may need.
- ##
- To learn more, check out the related documentation:
- - <a href="https://huggingface.co/docs/accelerate/usage_guides/tracking" target="_blank">Using experiment trackers</a>
- - <a href="https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.log" target="_blank">Accelerator API Reference</a>
- - <a href="https://huggingface.co/docs/accelerate/package_reference/tracking" target="_blank">Tracking API Reference</a>
code_samples/gradient_accumulation DELETED
@@ -1,33 +0,0 @@
- ##
- <pre>
- from accelerate import Accelerator
- accelerator = Accelerator(
- +   gradient_accumulation_steps=2,
- )
- dataloader, model, optimizer, scheduler = accelerator.prepare(
-     dataloader, model, optimizer, scheduler
- )
-
- for batch in dataloader:
- +   with accelerator.accumulate(model):
-         optimizer.zero_grad()
-         inputs, targets = batch
-         outputs = model(inputs)
-         loss = loss_function(outputs, targets)
-         accelerator.backward(loss)
-         optimizer.step()
-         scheduler.step()</pre>
-
- ##
- When performing gradient accumulation in a distributed setup, there are many opportunities for efficiency mistakes
- to occur. `Accelerator` provides a context manager that takes care of the details for you and ensures that the
- model trains correctly. Simply wrap the training loop in the `Accelerator.accumulate` context manager,
- passing in the model you are training, and the gradients will accumulate and synchronize
- automatically when needed during training.
-
- ##
- To learn more, check out the related documentation:
- - <a href="https://huggingface.co/docs/accelerate/usage_guides/gradient_accumulation" target="_blank">Performing gradient accumulation</a>
- - <a href="https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.accumulate" target="_blank">API reference</a>
- - <a href="https://github.com/huggingface/accelerate/blob/main/examples/by_feature/gradient_accumulation.py" target="_blank">Example script</a>
- - <a href="https://github.com/huggingface/accelerate/blob/main/examples/by_feature/automatic_gradient_accumulation.py" target="_blank">Performing automatic gradient accumulation example script</a>
code_samples/initial DELETED
@@ -1,11 +0,0 @@
- <pre>
- for batch in dataloader:
-     optimizer.zero_grad()
-     inputs, targets = batch
-     inputs = inputs.to(device)
-     targets = targets.to(device)
-     outputs = model(inputs)
-     loss = loss_function(outputs, targets)
-     loss.backward()
-     optimizer.step()
-     scheduler.step()</pre>
code_samples/initial_with_metrics DELETED
@@ -1,27 +0,0 @@
- <pre>
- import evaluate
- metric = evaluate.load("accuracy")
- for batch in train_dataloader:
-     optimizer.zero_grad()
-     inputs, targets = batch
-     inputs = inputs.to(device)
-     targets = targets.to(device)
-     outputs = model(inputs)
-     loss = loss_function(outputs, targets)
-     loss.backward()
-     optimizer.step()
-     scheduler.step()
-
- model.eval()
- for batch in eval_dataloader:
-     inputs, targets = batch
-     inputs = inputs.to(device)
-     targets = targets.to(device)
-     with torch.no_grad():
-         outputs = model(inputs)
-         predictions = outputs.argmax(dim=-1)
-     metric.add_batch(
-         predictions=predictions,
-         references=references
-     )
- print(metric.compute())</pre>
code_samples/large_scale_training/aws_sagemaker DELETED
@@ -1,77 +0,0 @@
- ##
- Run `accelerate config` and answer the questionnaire accordingly.
- Below is an example yaml for running code remotely on AWS SageMaker. Replace the `xxxxx` placeholders with
- values appropriate for your setup.
-
- <pre>
- base_job_name: accelerate-sagemaker-1
- compute_environment: AMAZON_SAGEMAKER
- distributed_type: 'NO'
- dynamo_backend: 'NO'
- ec2_instance_type: ml.p3.2xlarge
- gpu_ids: all
- iam_role_name: xxxxx
- mixed_precision: 'no'
- num_machines: 1
- profile: xxxxx
- py_version: py38
- pytorch_version: 1.10.2
- region: us-east-1
- transformers_version: 4.17.0
- use_cpu: false
- </pre>
- ##
- <pre>
- from accelerate import Accelerator
-
- def parse_args():
-     parser = argparse.ArgumentParser(description="sample task")
-
-     parser.add_argument(
-         "--pad_to_max_length",
- -       action="store_true",
- +       type=bool,
- +       default=False,
-         help="If passed, pad all samples to `max_length`. Otherwise, dynamic padding is used.",
-     )
-
- ...
-
- + def main():
-     accelerator = Accelerator()
-
-     model, optimizer, training_dataloader, scheduler = accelerator.prepare(
-         model, optimizer, training_dataloader, scheduler
-     )
-
-     for batch in training_dataloader:
-         optimizer.zero_grad()
-         inputs, targets = batch
-         outputs = model(inputs)
-         loss = loss_function(outputs, targets)
-         accelerator.backward(loss)
-         optimizer.step()
-         scheduler.step()
-
- -   torch.save('/opt/ml/model')
- +   accelerator.save('/opt/ml/model')
-
- + if __name__ == "__main__":
- +     main()
- </pre>
- Launching a script using the default accelerate config file looks like the following:
- ```
- accelerate launch {script_name.py} {--arg1} {--arg2} ...
- ```
- ##
- SageMaker doesn't support argparse actions. If you want to use, for example, boolean hyperparameters, you need to specify the type as `bool` in your script and provide an explicit `True` or `False` value for the hyperparameter. An example is shown above for the `pad_to_max_length` argument. Another important point is to save all output artifacts to `/opt/ml/model` or to use `os.environ["SM_MODEL_DIR"]` as your save directory. After training, artifacts in this directory are uploaded to S3, as shown in the code snippet above.
-
- You can provide a custom Docker image, input channels pointing to S3 data locations, and SageMaker metrics logging
- as part of the advanced features. Please refer to <a href="https://github.com/huggingface/notebooks/tree/main/sagemaker/22_accelerate_sagemaker_examples" target="_blank">Examples showcasing AWS SageMaker integration of 🤗 Accelerate</a>
-
- ##
- To learn more, check out the related documentation:
- - <a href="https://huggingface.co/docs/accelerate/usage_guides/sagemaker" target="_blank">How to use 🤗 Accelerate with SageMaker</a>
- - <a href="https://github.com/huggingface/notebooks/tree/main/sagemaker/22_accelerate_sagemaker_examples" target="_blank">Examples showcasing AWS SageMaker integration of 🤗 Accelerate</a>
- - <a href="https://huggingface.co/docs/accelerate/main/en/package_reference/cli" target="_blank">The Accelerate CLI</a>
code_samples/training_configuration/aws_sagemaker ADDED
@@ -0,0 +1,51 @@
+ ##
+ <pre>
+ +base_job_name: accelerate-sagemaker-1
+ +compute_environment: AMAZON_SAGEMAKER
+ distributed_type: 'NO'
+ dynamo_backend: 'NO'
+ +ec2_instance_type: ml.p3.2xlarge
+ +gpu_ids: all
+ +iam_role_name: MY_IAM_ROLE_NAME
+ mixed_precision: 'no'
+ +num_machines: 1
+ +profile: MY_PROFILE_NAME
+ +py_version: py38
+ +pytorch_version: 1.10.2
+ +region: us-east-1
+ +transformers_version: 4.17.0
+ use_cpu: false
+ </pre>
+ ##
+ <pre>
+ def parse_args():
+     parser = argparse.ArgumentParser(
+         description="sample task"
+     )
+
+     parser.add_argument(
+         "--some_bool_arg",
+ -       action="store_true",
+ +       type=bool,
+ +       default=False,
+     )
+ </pre>
+ ##
+ If the YAML was generated through the `accelerate config` command:
+ ```
+ accelerate launch {script_name.py} {--arg1} {--arg2} ...
+ ```
+
+ If the YAML is saved to a `~/config.yaml` file:
+ ```
+ accelerate launch --config_file ~/config.yaml {script_name.py} {--arg1} {--arg2} ...
+ ```
+ ##
+ SageMaker does not support argparse actions. As a result, if a script parameter would normally be a boolean, you need to specify the type as `bool` in the script and provide an explicit `True` or `False` value.
+
+ Also, when using SageMaker, all output artifacts should be saved to `/opt/ml/model` or to `os.environ["SM_MODEL_DIR"]`. After training, artifacts in this directory are uploaded to S3.
+ ##
+ To learn more, check out the related documentation:
+ - <a href="https://huggingface.co/docs/accelerate/usage_guides/sagemaker" target="_blank">How to use 🤗 Accelerate with SageMaker</a>
+ - <a href="https://github.com/huggingface/notebooks/tree/main/sagemaker/22_accelerate_sagemaker_examples" target="_blank">Examples showcasing AWS SageMaker integration of 🤗 Accelerate</a>
+ - <a href="https://huggingface.co/docs/accelerate/main/en/package_reference/cli" target="_blank">The Command Line</a>
code_samples/{large_scale_training → training_configuration}/deepspeed RENAMED
@@ -1,17 +1,16 @@
  ##
- Run `accelerate config` and answer the questionnaire accordingly.
- Below is an example yaml for mixed-precision training using DeepSpeed ZeRO Stage-3 with CPU offloading on 8 GPUs.
  <pre>
  compute_environment: LOCAL_MACHINE
- deepspeed_config:
-   gradient_accumulation_steps: 1
-   gradient_clipping: 1.0
-   offload_optimizer_device: cpu
-   offload_param_device: cpu
-   zero3_init_flag: true
-   zero3_save_16bit_model: true
-   zero_stage: 3
- distributed_type: DEEPSPEED
  downcast_bf16: 'no'
  dynamo_backend: 'NO'
  fsdp_config: {}
@@ -19,61 +18,52 @@ machine_rank: 0
  main_training_function: main
  megatron_lm_config: {}
  mixed_precision: fp16
- num_machines: 1
- num_processes: 8
  rdzv_backend: static
  same_network: true
  use_cpu: false
  </pre>
  ##
  <pre>
- from accelerate import Accelerator

- + def main():
      accelerator = Accelerator()

      model, optimizer, training_dataloader, scheduler = accelerator.prepare(
          model, optimizer, training_dataloader, scheduler
      )

- for batch in training_dataloader:
-     optimizer.zero_grad()
-     inputs, targets = batch
-     outputs = model(inputs)
-     loss = loss_function(outputs, targets)
-     accelerator.backward(loss)
-     optimizer.step()
-     scheduler.step()
-
- ...
-
  generated_tokens = accelerator.unwrap_model(model).generate(
-     batch["input_ids"],
-     attention_mask=batch["attention_mask"],
-     **gen_kwargs,
- +   synced_gpus=True  # required for ZeRO Stage 3
- )
  ...

  accelerator.unwrap_model(model).save_pretrained(
      args.output_dir,
      is_main_process=accelerator.is_main_process,
      save_function=accelerator.save,
- +   state_dict=accelerator.get_state_dict(model),  # required for ZeRO Stage 3
- )
-
  ...
-
- + if __name__ == "__main__":
- +     main()
  </pre>
-
- Launching a script using the default accelerate config file looks like the following:
  ```
  accelerate launch {script_name.py} {--arg1} {--arg2} ...
  ```

- Alternatively, you can use `accelerate launch` with the right config params for multi-gpu training as shown below:
  ```
  accelerate launch \
    --use_deepspeed \
@@ -90,12 +80,12 @@ accelerate launch \
  ```

  ##
- For core DeepSpeed features supported via the accelerate config file, no changes are required for ZeRO Stages 1 and 2. For ZeRO Stage-3, transformers' `generate` function requires `synced_gpus=True` and `save_pretrained` requires the `state_dict` param, due to the fact that model parameters are sharded across the GPUs.
-
- For advanced users who want granular control via a DeepSpeed config file, this is supported: you can pass its location when running the `accelerate config` command. You can also set most of the fields in the DeepSpeed config file to `auto`, and they are filled in automatically via the arguments of the `accelerate launch` command and the `accelerator.prepare` call, making life simple for users. Please refer to the docs on the <a href="https://huggingface.co/docs/accelerate/usage_guides/deepspeed#deepspeed-config-file" target="_blank">DeepSpeed Config File</a>

  ##
  To learn more, check out the related documentation:
  - <a href="https://huggingface.co/docs/accelerate/usage_guides/deepspeed" target="_blank">How to use DeepSpeed</a>
  - <a href="https://huggingface.co/blog/accelerate-deepspeed" target="_blank">Accelerate Large Model Training using DeepSpeed</a>
  - <a href="https://huggingface.co/docs/accelerate/package_reference/deepspeed" target="_blank">DeepSpeed Utilities</a>

  ##
+ Below is an example yaml for mixed-precision training using DeepSpeed ZeRO Stage-3 with CPU offloading on 8 GPUs.
  <pre>
  compute_environment: LOCAL_MACHINE
+ +deepspeed_config:
+ +  gradient_accumulation_steps: 1
+ +  gradient_clipping: 1.0
+ +  offload_optimizer_device: cpu
+ +  offload_param_device: cpu
+ +  zero3_init_flag: true
+ +  zero3_save_16bit_model: true
+ +  zero_stage: 3
+ +distributed_type: DEEPSPEED
  downcast_bf16: 'no'
  dynamo_backend: 'NO'
  fsdp_config: {}
  main_training_function: main
  megatron_lm_config: {}
  mixed_precision: fp16
+ +num_machines: 1
+ +num_processes: 8
  rdzv_backend: static
  same_network: true
  use_cpu: false
  </pre>
  ##
+ Assume that `model` is created utilizing the `transformers` library.
  <pre>
+ from accelerate import Accelerator

+ def main():
      accelerator = Accelerator()

      model, optimizer, training_dataloader, scheduler = accelerator.prepare(
          model, optimizer, training_dataloader, scheduler
      )

  generated_tokens = accelerator.unwrap_model(model).generate(
+     batch["input_ids"],
+     attention_mask=batch["attention_mask"],
+     **gen_kwargs,
+ +   synced_gpus=True
+ )
  ...

  accelerator.unwrap_model(model).save_pretrained(
      args.output_dir,
      is_main_process=accelerator.is_main_process,
      save_function=accelerator.save,
+ +   state_dict=accelerator.get_state_dict(model)
+ )
  ...
  </pre>
+ ##
+ If the YAML was generated through the `accelerate config` command:
  ```
  accelerate launch {script_name.py} {--arg1} {--arg2} ...
  ```

+ If the YAML is saved to a `~/config.yaml` file:
+ ```
+ accelerate launch --config_file ~/config.yaml {script_name.py} {--arg1} {--arg2} ...
+ ```
+
+ Or you can use `accelerate launch` with the right configuration parameters and no `config.yaml` file:
  ```
  accelerate launch \
    --use_deepspeed \
  ```

  ##
+ For core DeepSpeed features (ZeRO stages 1 and 2), Accelerate requires no code changes. For ZeRO Stage-3, `transformers`' `generate` function requires `synced_gpus=True` and `save_pretrained` requires the `state_dict` param, due to the fact that model parameters are sharded across the GPUs.

+ You can also set most of the fields in the DeepSpeed config file to `auto`, and they will be filled in automatically when performing `accelerate launch`.
  ##
  To learn more, check out the related documentation:
  - <a href="https://huggingface.co/docs/accelerate/usage_guides/deepspeed" target="_blank">How to use DeepSpeed</a>
+ - <a href="https://huggingface.co/docs/accelerate/usage_guides/deepspeed#deepspeed-config-file" target="_blank">DeepSpeed Config File</a>
  - <a href="https://huggingface.co/blog/accelerate-deepspeed" target="_blank">Accelerate Large Model Training using DeepSpeed</a>
  - <a href="https://huggingface.co/docs/accelerate/package_reference/deepspeed" target="_blank">DeepSpeed Utilities</a>
code_samples/{large_scale_training → training_configuration}/megatron-lm RENAMED
@@ -1,23 +1,22 @@
  ##
- Run `accelerate config` and answer the questionnaire accordingly.
- Below is an example yaml for BF16 mixed-precision training using Megatron-LM with DPxTPxPP=2x2x2 degrees on 8 GPUs (DP: Data Parallelism, PP: Pipeline Parallelism, TP: Tensor Parallelism). It also uses Sequence Parallelism and selective activation checkpointing, along with a sharded optimizer.
  <pre>
  compute_environment: LOCAL_MACHINE
  deepspeed_config: {}
- distributed_type: MEGATRON_LM
  downcast_bf16: 'no'
  dynamo_backend: 'NO'
  fsdp_config: {}
  machine_rank: 0
  main_training_function: main
- megatron_lm_config:
-   megatron_lm_gradient_clipping: 1.0
-   megatron_lm_num_micro_batches: 2
-   megatron_lm_pp_degree: 2
-   megatron_lm_recompute_activations: true
-   megatron_lm_sequence_parallelism: true
-   megatron_lm_tp_degree: 2
-   megatron_lm_use_distributed_optimizer: true
  mixed_precision: bf16
  num_machines: 1
  num_processes: 8
@@ -27,67 +26,52 @@ use_cpu: false
  </pre>
  ##
  <pre>
- from accelerate import Accelerator

- + def main():
-     accelerator = Accelerator()
-
-     ...
-
- -   lr_scheduler = get_scheduler(
- -       name=args.lr_scheduler_type,
- +   lr_scheduler = accelerate.utils.MegatronLMDummyScheduler(
-         optimizer=optimizer,
-         num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps,
-         num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
-     )
-
-     model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
-         model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
-     )
-
-     total_batch_size = (
- -       args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
- +       accelerator.state.megatron_lm_plugin.global_batch_size
-     )
-
-     for batch in training_dataloader:
-         optimizer.zero_grad()
-         inputs, targets = batch
-         outputs = model(inputs)
-         loss = loss_function(outputs, targets)
-         accelerator.backward(loss)
-         optimizer.step()
-         scheduler.step()
-
-     ...
-
-     # in eval loop
-     for step, batch in enumerate(eval_dataloader):
-         with torch.no_grad():
-             outputs = model(**batch)
-             loss = outputs.loss
- -           losses.append(accelerator.gather_for_metrics(loss.repeat(args.per_device_eval_batch_size)))
- +           losses.append(loss)  # For Megatron-LM, the losses are already averaged across the data parallel group
- -       losses = torch.cat(losses)
- +       losses = torch.tensor(losses)
-     eval_loss = torch.mean(losses)
-     perplexity = math.exp(eval_loss)
-     logger.info(f"epoch {epoch}: perplexity: {perplexity} eval_loss: {eval_loss}")
-
- +   accelerator.save_state(output_dir)
-
- + if __name__ == "__main__":
- +     main()
  </pre>
-
- Launching a script using the default accelerate config file looks like the following:
  ```
  accelerate launch {script_name.py} {--arg1} {--arg2} ...
  ```

- Alternatively, you can use `accelerate launch` with the right config params for multi-gpu training as shown below:
  ```
  accelerate launch \
    --use_megatron_lm \
@@ -109,9 +93,14 @@ For Megatron-LM, the supported models Transformers GPT2, Megatron-BERT and T5 mo
  3. Losses are already averaged across the data parallel group
  4. Save the model using `accelerator.save_state` instead of transformers' `save_pretrained`

- These changes have been highlighted in the code snippet above.

- The Megatron-LM integration supports many advanced features, such as the ability to leverage a custom train step, Megatron-LM indexed datasets, checkpoint reshaping and interoperability utilities, the `megatron_generate` function for text generation using Tensor and Pipeline Parallelism, and support for ROPE/ALiBi positional embeddings and Multi-Query Attention. However, these require more changes owing to the complexity; they are worth it for getting the highest performance.

  ##
  To learn more, check out the related documentation:

  ##
+ Below is an example yaml for BF16 mixed-precision training using Megatron-LM with 2x Data Parallelism, 2x Pipeline Parallelism, and 2x Tensor Parallelism on 8 GPUs. It also uses Sequence Parallelism, selective activation checkpointing, and a sharded optimizer.
  <pre>
  compute_environment: LOCAL_MACHINE
  deepspeed_config: {}
+ +distributed_type: MEGATRON_LM
  downcast_bf16: 'no'
  dynamo_backend: 'NO'
  fsdp_config: {}
  machine_rank: 0
  main_training_function: main
+ +megatron_lm_config:
+ +  megatron_lm_gradient_clipping: 1.0
+ +  megatron_lm_num_micro_batches: 2
+ +  megatron_lm_pp_degree: 2
+ +  megatron_lm_recompute_activations: true
+ +  megatron_lm_sequence_parallelism: true
+ +  megatron_lm_tp_degree: 2
+ +  megatron_lm_use_distributed_optimizer: true
  mixed_precision: bf16
  num_machines: 1
  num_processes: 8
  </pre>
  ##
  <pre>
+ from accelerate import Accelerator
+ +from accelerate.utils import MegatronLMDummyScheduler

+ accelerator = Accelerator()
+
+ ...
+
+ -lr_scheduler = get_scheduler(
+ -    name=args.lr_scheduler_type,
+ -    ...
+ -)
+ +lr_scheduler = MegatronLMDummyScheduler(
+ +    optimizer=optimizer,
+ +    num_warmup_steps=...,
+ +    num_training_steps=...,
+ +)
+ model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
+     model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
+ )
+
+ total_batch_size = (
+ -    args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+ +    accelerator.state.megatron_lm_plugin.global_batch_size
+ )
+ # in evaluation loop
+ for step, batch in enumerate(eval_dataloader):
+     with torch.no_grad():
+         outputs = model(**batch)
+         loss = outputs.loss
+ -       losses.append(accelerator.gather_for_metrics(loss.repeat(args.per_device_eval_batch_size)))
+ +       losses.append(loss)  # For Megatron-LM, the losses are already averaged across the data parallel group
+ -losses = torch.cat(losses)
+ +losses = torch.tensor(losses)
  </pre>
+ ##
+ If the YAML was generated through the `accelerate config` command:
  ```
  accelerate launch {script_name.py} {--arg1} {--arg2} ...
  ```

+ If the YAML is saved to a `~/config.yaml` file:
+ ```
+ accelerate launch --config_file ~/config.yaml {script_name.py} {--arg1} {--arg2} ...
+ ```
+
+ Or you can use `accelerate launch` with the right configuration parameters and no `config.yaml` file:
  ```
  accelerate launch \
    --use_megatron_lm \
  ```

  3. Losses are already averaged across the data parallel group
  4. Save the model using `accelerator.save_state` instead of transformers' `save_pretrained`

+ The Accelerate Megatron-LM integration supports many advanced features, such as:
+ - Leveraging custom training steps
+ - Using Megatron-LM indexed datasets
+ - Checkpoint reshaping and interoperability utilities
+ - Using `megatron_generate` for text generation using Tensor and Pipeline Parallelism
+ - Support for ROPE/ALiBi positional embeddings and Multi-Query Attention

+ However, each of these requires more changes to your source code than what is presented here.

  ##
  To learn more, check out the related documentation:
code_samples/{large_scale_training → training_configuration}/multi_gpu RENAMED
@@ -1,60 +1,44 @@
  ##
- Run `accelerate config` and answer the questionnaire accordingly.
- Below is an example yaml for using multi-gpu training with 4 GPUs.
  <pre>
- compute_environment: LOCAL_MACHINE
  deepspeed_config: {}
- distributed_type: MULTI_GPU
  downcast_bf16: 'no'
  dynamo_backend: 'NO'
  fsdp_config: {}
- gpu_ids: all
- machine_rank: 0
  main_training_function: main
  megatron_lm_config: {}
  mixed_precision: 'no'
- num_machines: 1
- num_processes: 4
- rdzv_backend: static
- same_network: true
  use_cpu: false</pre>
  ##
- <pre>
- from accelerate import Accelerator
-
- + def main():
-     accelerator = Accelerator()
-
-     model, optimizer, training_dataloader, scheduler = accelerator.prepare(
-         model, optimizer, training_dataloader, scheduler
-     )
-
-     for batch in training_dataloader:
-         optimizer.zero_grad()
-         inputs, targets = batch
-         outputs = model(inputs)
-         loss = loss_function(outputs, targets)
-         accelerator.backward(loss)
-         optimizer.step()
-         scheduler.step()
-
- + if __name__ == "__main__":
- +     main()
- </pre>
-
- Launching a script using the default accelerate config file looks like the following:
  ```
  accelerate launch {script_name.py} {--arg1} {--arg2} ...
  ```

- Alternatively, you can use `accelerate launch` with the right config params for multi-gpu training as shown below:
  ```
  accelerate launch --multi_gpu --num_processes=4 {script_name.py} {--arg1} {--arg2} ...
  ```

  ##
- Using this feature involves no changes to the code apart from the ones mentioned in the tab `Simplify your code and improve efficieny`.
  ##
  To learn more, check out the related documentation:
  - <a href="https://huggingface.co/docs/accelerate/main/en/basic_tutorials/launch" target="_blank">Launching distributed code</a>
- - <a href="https://huggingface.co/docs/accelerate/main/en/package_reference/cli" target="_blank">The Accelerate CLI</a>

  ##
  <pre>
  compute_environment: LOCAL_MACHINE
  deepspeed_config: {}
+ +distributed_type: MULTI_GPU
  downcast_bf16: 'no'
  dynamo_backend: 'NO'
  fsdp_config: {}
+ +gpu_ids: all
+ +machine_rank: 0
  main_training_function: main
  megatron_lm_config: {}
  mixed_precision: 'no'
+ +num_machines: 1
+ +num_processes: 4
+ +rdzv_backend: static
+ +same_network: true
  use_cpu: false</pre>
  ##
+ None
+ ##
+ If the YAML was generated through the `accelerate config` command:
  ```
  accelerate launch {script_name.py} {--arg1} {--arg2} ...
  ```

+ If the YAML is saved to a `~/config.yaml` file:
+ ```
+ accelerate launch --config_file ~/config.yaml {script_name.py} {--arg1} {--arg2} ...
+ ```
+
+ Or you can use `accelerate launch` with the right configuration parameters and no `config.yaml` file:
  ```
  accelerate launch --multi_gpu --num_processes=4 {script_name.py} {--arg1} {--arg2} ...
  ```

  ##
+ Launching on multi-GPU instances requires a different launch command than just `python myscript.py`. Accelerate wraps the proper launching script to delegate to and call, reading from the passed parameters how to set up the configuration. It is a passthrough to the `torchrun` command.
+
+ **Remember that you can always use the `accelerate launch` functionality, even if the code in your script does not use the `Accelerator`.**
  ##
  To learn more, check out the related documentation:
  - <a href="https://huggingface.co/docs/accelerate/main/en/basic_tutorials/launch" target="_blank">Launching distributed code</a>
+ - <a href="https://huggingface.co/docs/accelerate/main/en/package_reference/cli" target="_blank">The Command Line</a>
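
As a hedged illustration of the last point, even a plain script with no `Accelerator` can be started with `accelerate launch`, since the launcher sets up the same process environment `torchrun` would (the environment variable defaults shown are assumptions for a non-distributed run):

```python
# Sketch: a plain script that can be started with
#   accelerate launch --multi_gpu --num_processes=4 script.py
# RANK/WORLD_SIZE are set by the launcher in distributed runs; the
# fallbacks (0 and 1) apply when run as plain `python script.py`.
import os


def main():
    rank = int(os.environ.get("RANK", 0))
    world_size = int(os.environ.get("WORLD_SIZE", 1))
    print(f"process {rank} of {world_size}")


if __name__ == "__main__":
    main()
```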
code_samples/{large_scale_training → training_configuration}/multi_node_multi_gpu RENAMED
@@ -1,89 +1,78 @@
  ##
- Run `accelerate config` and answer the questionnaire accordingly.
- Below is an example yaml for using multi-gpu training with 2 nodes/machines.

- On Node/Machine 1:
  <pre>
- compute_environment: LOCAL_MACHINE
  deepspeed_config: {}
- distributed_type: MULTI_GPU
  downcast_bf16: 'no'
  dynamo_backend: 'NO'
  fsdp_config: {}
  gpu_ids: all
- machine_rank: 0
- main_process_ip: 192.168.20.1
- main_process_port: 8080
  main_training_function: main
  megatron_lm_config: {}
  mixed_precision: 'no'
- num_machines: 2
- num_processes: 8
- rdzv_backend: static
- same_network: true
  use_cpu: false
  </pre>

- On Node/Machine 2:
  <pre>
- compute_environment: LOCAL_MACHINE
  deepspeed_config: {}
- distributed_type: MULTI_GPU
  downcast_bf16: 'no'
  dynamo_backend: 'NO'
  fsdp_config: {}
  gpu_ids: all
  -machine_rank: 0
  +machine_rank: 1
- main_process_ip: 192.168.20.1
- main_process_port: 8080
  main_training_function: main
  megatron_lm_config: {}
  mixed_precision: 'no'
- num_machines: 2
- num_processes: 8
- rdzv_backend: static
- same_network: true
  use_cpu: false
  </pre>
  ##
- <pre>
- from accelerate import Accelerator
-
- + def main():
-     accelerator = Accelerator()
-
-     model, optimizer, training_dataloader, scheduler = accelerator.prepare(
-         model, optimizer, training_dataloader, scheduler
-     )
-
-     for batch in training_dataloader:
-         optimizer.zero_grad()
-         inputs, targets = batch
-         outputs = model(inputs)
-         loss = loss_function(outputs, targets)
-         accelerator.backward(loss)
-         optimizer.step()
-         scheduler.step()
-
- + if __name__ == "__main__":
- +     main()
- </pre>

- Launching a script using the default accelerate config file looks like the following:
  ```
  accelerate launch {script_name.py} {--arg1} {--arg2} ...
  ```

- Alternatively, you can use `accelerate launch` with the right config params for multi-gpu training as shown below. Replace `{node_number}` with the appropriate number.
  ```
  accelerate launch --multi_gpu --num_machines=2 --num_processes=8 --main_process_ip="192.168.20.1" --main_process_port=8080
  --machine_rank={node_number} {script_name.py} {--arg1} {--arg2} ...
  ```

  ##
- Using this feature involves no changes to the code apart from the ones mentioned in the tab `Simplify your code and improve efficieny`.
  ##
  To learn more, check out the related documentation:
  - <a href="https://huggingface.co/docs/accelerate/main/en/basic_tutorials/launch" target="_blank">Launching distributed code</a>
- - <a href="https://huggingface.co/docs/accelerate/main/en/package_reference/cli" target="_blank">The Accelerate CLI</a>

  ##
+ Below are example yamls for multi-gpu training across two machines (nodes), using 8 GPUs in total:

+ On machine 1 (host):
  <pre>
+ compute_environment: LOCAL_MACHINE
  deepspeed_config: {}
+ +distributed_type: MULTI_GPU
  downcast_bf16: 'no'
  dynamo_backend: 'NO'
  fsdp_config: {}
  gpu_ids: all
+ +machine_rank: 0
+ +main_process_ip: 192.168.20.1
+ +main_process_port: 8080
  main_training_function: main
  megatron_lm_config: {}
  mixed_precision: 'no'
+ +num_machines: 2
+ +num_processes: 8
+ +rdzv_backend: static
+ +same_network: true
  use_cpu: false
  </pre>

+ On machine 2:
  <pre>
+ compute_environment: LOCAL_MACHINE
  deepspeed_config: {}
+ +distributed_type: MULTI_GPU
  downcast_bf16: 'no'
  dynamo_backend: 'NO'
  fsdp_config: {}
  gpu_ids: all
  -machine_rank: 0
  +machine_rank: 1
+ +main_process_ip: 192.168.20.1
+ +main_process_port: 8080
  main_training_function: main
  megatron_lm_config: {}
  mixed_precision: 'no'
+ +num_machines: 2
+ +num_processes: 8
+ +rdzv_backend: static
+ +same_network: true
  use_cpu: false
  </pre>
  ##
+ None
+ ##
+ To launch a script, run one of the following variations on each machine:

+ If the YAML was generated through the `accelerate config` command:
  ```
  accelerate launch {script_name.py} {--arg1} {--arg2} ...
  ```

+ If the YAML is saved to a `~/config.yaml` file:
+ ```
+ accelerate launch --config_file ~/config.yaml {script_name.py} {--arg1} {--arg2} ...
+ ```
+
+ Or you can use `accelerate launch` with the right configuration parameters and no `config.yaml` file.
+ Replace `{node_number}` with the appropriate machine number (0 for the host, 1+ otherwise):
  ```
  accelerate launch --multi_gpu --num_machines=2 --num_processes=8 --main_process_ip="192.168.20.1" --main_process_port=8080
  --machine_rank={node_number} {script_name.py} {--arg1} {--arg2} ...
  ```

  ##
+ When utilizing multiple machines (nodes) for training, the config file needs to know how each machine can communicate (the IP address and port of the host), how many *total* GPUs there are, and whether the current machine is the host or a client.
+
+ **Remember that you can always use the `accelerate launch` functionality, even if the code in your script does not use the `Accelerator`.**
  ##
  To learn more, check out the related documentation:
  - <a href="https://huggingface.co/docs/accelerate/main/en/basic_tutorials/launch" target="_blank">Launching distributed code</a>
+ - <a href="https://huggingface.co/docs/accelerate/main/en/package_reference/cli" target="_blank">The Command Line</a>
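
For reference, a small sketch of how a script can inspect its place in such a multi-node run; the attributes used are standard `Accelerator` properties, and the counts assume the two-machine, 8-process config above:

```python
# Sketch: inspecting process identity in a multi-node run. With the config
# above, num_processes is 8 and exactly one process (rank 0 on the host)
# is the main process.
from accelerate import Accelerator

accelerator = Accelerator()
print(
    f"process {accelerator.process_index} / {accelerator.num_processes}, "
    f"local process {accelerator.local_process_index}, "
    f"main process: {accelerator.is_main_process}"
)
if accelerator.is_local_main_process:
    print("first process on this machine (a good place for downloads, etc.)")
```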
code_samples/{large_scale_training → training_configuration}/pytorch_fsdp RENAMED
@@ -1,63 +1,55 @@
  ##
- Run `accelerate config` and answer the questionnaire accordingly.
- Below is an example yaml for BF16 mixed-precision training using PyTorch FSDP with CPU offloading on 8 GPUs.
  <pre>
- compute_environment: LOCAL_MACHINE
  deepspeed_config: {}
- distributed_type: FSDP
  downcast_bf16: 'no'
  dynamo_backend: 'NO'
- fsdp_config:
-   fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
-   fsdp_backward_prefetch_policy: BACKWARD_PRE
-   fsdp_offload_params: true
-   fsdp_sharding_strategy: 1
-   fsdp_state_dict_type: FULL_STATE_DICT
-   fsdp_transformer_layer_cls_to_wrap: T5Block
  machine_rank: 0
  main_training_function: main
  megatron_lm_config: {}
  mixed_precision: bf16
  num_machines: 1
- num_processes: 8
  rdzv_backend: static
  same_network: true
  use_cpu: false
  </pre>
  ##
  <pre>
- from accelerate import Accelerator
-
- + def main():
-     accelerator = Accelerator()
-
-     model = accelerator.prepare(model)
-
-     optimizer, training_dataloader, scheduler = accelerator.prepare(
-         optimizer, training_dataloader, scheduler
-     )
-
-     for batch in training_dataloader:
-         optimizer.zero_grad()
-         inputs, targets = batch
-         outputs = model(inputs)
-         loss = loss_function(outputs, targets)
-         accelerator.backward(loss)
-         optimizer.step()
-         scheduler.step()
-
-     ...
-
- + if __name__ == "__main__":
- +     main()
  </pre>
-
- Launching a script using the default accelerate config file looks like the following:
  ```
  accelerate launch {script_name.py} {--arg1} {--arg2} ...
  ```

- Alternatively, you can use `accelerate launch` with the right config params for multi-gpu training as shown below:
  ```
  accelerate launch \
    --use_fsdp \
@@ -71,10 +63,11 @@ accelerate launch \
  ```

  ##
- For PyTorch FSDP, you need to prepare the model first before preparing the optimizer, since FSDP will shard parameters in-place and this will break any previously initialized optimizers. The same is outlined in the above code snippet. For transformer models, please use the `TRANSFORMER_BASED_WRAP` auto wrap policy as shown in the config above.

  ##
  To learn more, check out the related documentation:
- - <a href="https://huggingface.co/docs/accelerate/usage_guides/fsdp" target="_blank">How to use FSDP</a>
  - <a href="https://huggingface.co/blog/pytorch-fsdp" target="_blank">Accelerate Large Model Training using PyTorch Fully Sharded Data Parallel</a>

  ##
+ Below is an example yaml for BF16 mixed-precision training using PyTorch Fully Sharded Data Parallelism (FSDP) with CPU offloading on 8 GPUs.
  <pre>
+ compute_environment: LOCAL_MACHINE
  deepspeed_config: {}
+ +distributed_type: FSDP
  downcast_bf16: 'no'
  dynamo_backend: 'NO'
+ +fsdp_config:
+ +  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+ +  fsdp_backward_prefetch_policy: BACKWARD_PRE
+ +  fsdp_offload_params: true
+ +  fsdp_sharding_strategy: 1
+ +  fsdp_state_dict_type: FULL_STATE_DICT
+ +  fsdp_transformer_layer_cls_to_wrap: T5Block
  machine_rank: 0
  main_training_function: main
  megatron_lm_config: {}
  mixed_precision: bf16
  num_machines: 1
+ +num_processes: 8
  rdzv_backend: static
  same_network: true
  use_cpu: false
  </pre>
  ##
  <pre>
+ from accelerate import Accelerator

+ accelerator = Accelerator()
+ -model, optimizer, dataloader, scheduler = accelerator.prepare(
+ -    model, optimizer, dataloader, scheduler
+ -)
+ +model = accelerator.prepare(model)
+ +# Optimizer can be any PyTorch optimizer class
+ +optimizer = torch.optim.AdamW(params=model.parameters(), lr=lr)
+ +optimizer, dataloader, scheduler = accelerator.prepare(
+ +    optimizer, dataloader, scheduler
+ +)
  </pre>
+ ##
+ If the YAML was generated through the `accelerate config` command:
  ```
  accelerate launch {script_name.py} {--arg1} {--arg2} ...
  ```

+ If the YAML is saved to a `~/config.yaml` file:
+ ```
+ accelerate launch --config_file ~/config.yaml {script_name.py} {--arg1} {--arg2} ...
+ ```
+
+ Or you can use `accelerate launch` with the right configuration parameters and no `config.yaml` file:
  ```
  accelerate launch \
    --use_fsdp \
  ```

  ##
+ For PyTorch FSDP, you need to prepare the model first, **before** preparing the optimizer, since FSDP will shard parameters in-place and this will break any previously initialized optimizers.

+ For transformer models, please use the `TRANSFORMER_BASED_WRAP` auto wrap policy as shown in the config above.

  ##
  To learn more, check out the related documentation:
+ - <a href="https://huggingface.co/docs/accelerate/usage_guides/fsdp" target="_blank">How to use Fully Sharded Data Parallelism</a>
  - <a href="https://huggingface.co/blog/pytorch-fsdp" target="_blank">Accelerate Large Model Training using PyTorch Fully Sharded Data Parallel</a>
setup.cfg ADDED
@@ -0,0 +1,19 @@
+ [isort]
+ default_section = FIRSTPARTY
+ ensure_newline_before_comments = True
+ force_grid_wrap = 0
+ include_trailing_comma = True
+ known_first_party = accelerate
+ known_third_party =
+     numpy
+     torch
+     torch_xla
+
+ line_length = 119
+ lines_after_imports = 2
+ multi_line_output = 3
+ use_parentheses = True
+
+ [flake8]
+ ignore = E203, E722, E501, E741, W503, W605
+ max-line-length = 119
src/app.py CHANGED
@@ -1,10 +1,35 @@
  import gradio as gr
- from markup import highlight, get_text
  from template import get_templates

  templates = get_templates()


  def change(inp, textbox):
      """Based on an `inp`, render and highlight the appropriate code sample.
@@ -20,65 +45,80 @@ def change(inp, textbox):
      if textbox == "base":
          code, explanation, docs = get_text(inp, textbox)
          if inp == "Basic":
-             return (highlight(code), "## Accelerate Code (Base Integration)", explanation, docs)
          elif inp == "Calculating Metrics":
              return (highlight(code), f"## Accelerate Code ({inp})", explanation, docs)
          else:
              return (highlight(code), f"## Accelerate Code ({inp})", explanation, docs)
-     elif textbox == "large_scale_training":
-         config, code, explanation, docs = get_text(inp, textbox)
-         return (highlight(config), highlight(code), f"## Accelerate Code ({inp})", explanation, docs)


- default = change("Basic", "base")


  def base_features(textbox):
-     # textbox.value = "base"
      inp = gr.Radio(
-         ["Basic", "Calculating Metrics", "Checkpointing", "Experiment Tracking", "Gradient Accumulation"],
          label="Select a feature you would like to integrate",
          value="Basic",
      )
-     with gr.Row():
-         with gr.Column():
-             feature = gr.Markdown("## Accelerate Code")
-             out = gr.Markdown(default[0])
-     with gr.Row():
-         with gr.Column():
-             gr.Markdown("## Explanation")
-             explanation = gr.Markdown(default[2])
-     with gr.Row():
-         with gr.Column():
-             gr.Markdown("## Documentation Links")
-             docs = gr.Markdown(default[3])
-     inp.change(fn=change, inputs=[inp, textbox], outputs=[out, feature, explanation, docs])


- def large_scale_training(textbox):
-     # textbox.value = "large_scale_training"
      inp = gr.Radio(
-         ["Multi GPU", "Multi Node Multi GPU", "AWS SageMaker", "DeepSpeed", "PyTorch FSDP", "Megatron-LM"],
-         label="Select a feature you would like to integrate",
-         value="Basic",
      )
-     with gr.Row():
-         with gr.Column():
-             feature = gr.Markdown("## Accelerate Config")
-             config = gr.Markdown("")
-     with gr.Row():
-         with gr.Column():
-             feature = gr.Markdown("## Accelerate Code")
-             out = gr.Markdown("")
-     with gr.Row():
-         with gr.Column():
-             gr.Markdown("## Explanation")
-             explanation = gr.Markdown("")
-     with gr.Row():
-         with gr.Column():
-             gr.Markdown("## Documentation Links")
-             docs = gr.Markdown("")
-     inp.change(fn=change, inputs=[inp, textbox], outputs=[config, out, feature, explanation, docs])


  # def big_model_inference():
@@ -126,16 +166,18 @@ def large_scale_training(textbox):
  with gr.Blocks() as demo:

      with gr.Tabs():
-         with gr.TabItem("Simplify your code and improve efficieny"):
              textbox = gr.Textbox(label="tab_name", visible=False, value="base")
              base_features(textbox)
-         with gr.TabItem("Large Scale Training"):
-             textbox = gr.Textbox(label="tab_name", visible=False, value="large_scale_training")
-             large_scale_training(textbox)
          with gr.TabItem("Big Model Inference"):
              # big_model_inference()
              pass
-         with gr.TabItem("Notebook Launcher Intergation"):
              # notebook_launcher()
              pass

+ from contextlib import contextmanager
+
  import gradio as gr
+ from markup import get_text, highlight
  from template import get_templates

+
  templates = get_templates()


+ def fill_tab(title, explanation):
+     """
+     Fill the tab with the appropriate title and explanation.
+     """
+     return gr.Markdown(title), gr.Markdown(explanation)
+
+
+ @contextmanager
+ def new_section():
+     """
+     A context manager to create a new section in the interface. Equivalent of:
+     ```python
+     with gr.Row():
+         with gr.Column():
+             ...
+     ```
+     """
+     with gr.Row():
+         with gr.Column():
+             yield
+
+
  def change(inp, textbox):
      """Based on an `inp`, render and highlight the appropriate code sample.

      if textbox == "base":
          code, explanation, docs = get_text(inp, textbox)
          if inp == "Basic":
+             return (
+                 highlight(code),
+                 "## Accelerate Code (Base Integration)",
+                 explanation,
+                 docs,
+             )
          elif inp == "Calculating Metrics":
              return (highlight(code), f"## Accelerate Code ({inp})", explanation, docs)
          else:
              return (highlight(code), f"## Accelerate Code ({inp})", explanation, docs)
+     elif textbox == "training_configuration":
+         yaml, changes, command, explanation, docs = get_text(inp, textbox)
+         return (highlight(yaml), highlight(changes), command, explanation, docs)
+     else:
+         raise ValueError(f"Invalid tab name: {textbox}")


+ default_base = change("Basic", "base")
+ default_training_config = change("Multi GPU", "training_configuration")


  def base_features(textbox):
      inp = gr.Radio(
+         [
+             "Basic",
+             "Calculating Metrics",
+             "Checkpointing",
+             "Experiment Tracking",
+             "Gradient Accumulation",
+         ],
          label="Select a feature you would like to integrate",
          value="Basic",
      )
+     with new_section():
+         feature, out = fill_tab("## Accelerate Code", default_base[0])
+     with new_section():
+         _, explanation = fill_tab("## Explanation", default_base[2])
+     with new_section():
+         _, docs = fill_tab("## Documentation Links", default_base[3])
+     inp.change(
+         fn=change, inputs=[inp, textbox], outputs=[out, feature, explanation, docs]
+     )


+ def training_config(textbox):
      inp = gr.Radio(
+         [
+             "AWS SageMaker",
+             "DeepSpeed",
+             "Megatron-LM",
+             "Multi GPU",
+             "Multi Node Multi GPU",
+             "PyTorch FSDP",
+         ],
+         label="Select a distributed YAML configuration you would like to view.",
+         value="Multi GPU",
+     )
+     with new_section():
+         _, yaml = fill_tab("## Example YAML Configuration", default_training_config[0])
+     with new_section():
+         _, changes = fill_tab(
+             "## Changes to Training Script", default_training_config[1]
+         )
+     with new_section():
+         _, command = fill_tab("## Command to Run Training", default_training_config[2])
+     with new_section():
+         _, explanation = fill_tab("## Explanation", default_training_config[3])
+     with new_section():
+         _, docs = fill_tab("## Documentation Links", default_training_config[4])
+     inp.change(
+         fn=change,
+         inputs=[inp, textbox],
+         outputs=[yaml, changes, command, explanation, docs],
      )


  # def big_model_inference():

  with gr.Blocks() as demo:

      with gr.Tabs():
+         with gr.TabItem("Basic Training Integration"):
              textbox = gr.Textbox(label="tab_name", visible=False, value="base")
              base_features(textbox)
+         with gr.TabItem("Launch Configuration"):
+             textbox = gr.Textbox(
+                 label="tab_name", visible=False, value="training_configuration"
+             )
+             training_config(textbox)
          with gr.TabItem("Big Model Inference"):
              # big_model_inference()
              pass
+         with gr.TabItem("Launching from Notebooks"):
              # notebook_launcher()
              pass
src/markup.py CHANGED
@@ -14,6 +14,7 @@
  from template import get_filename

  _remove_color = "rgb(103,6,12)"
  _addition_color = "rgb(6,103,12)"

  from template import get_filename

+
  _remove_color = "rgb(103,6,12)"
  _addition_color = "rgb(6,103,12)"
src/template.py CHANGED
@@ -13,6 +13,7 @@
  # limitations under the License.
  import os

  TEMPLATES = ["initial", "initial_with_metrics", "accelerate"]

@@ -27,4 +28,6 @@ def get_templates() -> dict:
      """
      Returns a dictionary of template type to code content
      """
-     return {template: open(get_filename("base", template)).read() for template in TEMPLATES}

  # limitations under the License.
  import os

+
  TEMPLATES = ["initial", "initial_with_metrics", "accelerate"]

      """
      Returns a dictionary of template type to code content
      """
+     return {
+         template: open(get_filename("base", template)).read() for template in TEMPLATES
+     }