# https://huggingface.co/docs/transformers/v4.32.1/en/main_classes/trainer#using-accelerate-launcher-with-trainer
compute_environment: LOCAL_MACHINE
deepspeed_config:
  deepspeed_multinode_launcher: standard
  gradient_accumulation_steps: 1
  offload_optimizer_device: none
  offload_param_device: none
  zero3_init_flag: false
  zero_stage: 2
  gradient_clipping: 1.0
distributed_type: DEEPSPEED
downcast_bf16: 'no'
gpu_ids: all
machine_rank: 0  # rank of this node; set to 0 on the main node and increment on each additional node
main_process_ip: localhost  # the machines on Azure are inter-connected, so you can configure this directly according to `~/.ssh/config`
main_process_port: 11451  # change this as you like
main_training_function: main
mixed_precision: fp16
num_machines: 1  # total number of nodes; set the same value on all nodes
num_processes: 1  # total number of GPU processes across all nodes; set the same value on all nodes
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
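# Launch sketch (assumptions: this file is saved as `ds_zero2.yaml` and the training
# entry point is `train.py`; substitute your actual config path and script):
#
#   accelerate launch --config_file ds_zero2.yaml train.py
#
# For multi-node runs, copy this file to every node, adjust `machine_rank`,
# `main_process_ip`, `num_machines`, and `num_processes` as noted above,
# then run the same command on each node.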