# https://huggingface.co/docs/transformers/v4.32.1/en/main_classes/trainer#using-accelerate-launcher-with-trainer
compute_environment: LOCAL_MACHINE
deepspeed_config:
  deepspeed_multinode_launcher: standard
  gradient_accumulation_steps: 1
  offload_optimizer_device: none
  offload_param_device: none
  zero3_init_flag: false
  zero_stage: 2
  gradient_clipping: 1.0
distributed_type: DEEPSPEED
downcast_bf16: 'no'
gpu_ids: all
machine_rank: 0 # change this for each node
main_process_ip: localhost # the machines on Azure are inter-connected, so you can configure this directly according to `~/.ssh/config`
main_process_port: 11451 # change this as you like
main_training_function: main
mixed_precision: fp16
num_machines: 1 # change this on all nodes: total number of nodes
num_processes: 1 # change this on all nodes: total number of GPU processes across all nodes
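# example (assuming 2 nodes with 8 GPUs each, names and counts are illustrative):
# set num_machines: 2 and num_processes: 16 on both nodes, with machine_rank: 0 and 1 respectively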
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
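# Usage sketch, assuming this file is saved as `ds_zero2.yaml` and the training
# entry point is `train.py` (both placeholder names): run the same command on
# every node, changing only `machine_rank` as noted above:
#   accelerate launch --config_file ds_zero2.yaml train.py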