import os
import sys
import tempfile

import torch

from .state import AcceleratorState
from .utils import PrecisionType, PrepareForLaunch, is_mps_available, patch_environment


def notebook_launcher(function, args=(), num_processes=None, mixed_precision="no", use_port="29500"):
    """
    Launches a training function, using several processes if it's possible in the current environment (TPU with
    multiple cores for instance).

    <Tip warning={true}>

    To use this function, absolutely zero calls to a CUDA device must be made in the notebook session before calling
    it. If any have been made, you will need to restart the notebook and make sure no cells use any CUDA capability.

    </Tip>

    Args:
        function (`Callable`):
            The training function to execute. If it accepts arguments, the first argument should be the index of the
            process run.
        args (`Tuple`):
            Tuple of arguments to pass to the function (it will receive `*args`).
        num_processes (`int`, *optional*):
            The number of processes to use for training. Will default to 8 in Colab/Kaggle if a TPU is available, to
            the number of GPUs available otherwise.
        mixed_precision (`str`, *optional*, defaults to `"no"`):
            If `fp16` or `bf16`, will use mixed precision training on multi-GPU.
        use_port (`str`, *optional*, defaults to `"29500"`):
            The port to use to communicate between processes when launching a multi-GPU training.

    Example:

    ```python
    # Assume this is defined in a Jupyter Notebook on an instance with two GPUs
    from accelerate import notebook_launcher


    def train(*args):
        # Your training function here
        ...


    notebook_launcher(train, args=(arg1, arg2), num_processes=2, mixed_precision="fp16")
    ```
    """
    in_colab = False
    in_kaggle = False
    if any(key.startswith("KAGGLE") for key in os.environ.keys()):
        in_kaggle = True
    elif "IPython" in sys.modules:
        in_colab = "google.colab" in str(sys.modules["IPython"].get_ipython())

    try:
        mixed_precision = PrecisionType(mixed_precision.lower())
    except ValueError:
        raise ValueError(
            f"Unknown mixed_precision mode: {mixed_precision}. Choose between {PrecisionType.list()}."
        )
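
    # In Colab or a Kaggle kernel with a TPU attached, spawn one training process per TPU core.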
    if (in_colab or in_kaggle) and (os.environ.get("TPU_NAME", None) is not None):
        import torch_xla.distributed.xla_multiprocessing as xmp

        if len(AcceleratorState._shared_state) > 0:
            raise ValueError(
                "To train on TPU in Colab or Kaggle Kernel, the `Accelerator` should only be initialized inside "
                "your training function. Restart your notebook and make sure no cells initialize an "
                "`Accelerator`."
            )
        if num_processes is None:
            num_processes = 8

        launcher = PrepareForLaunch(function, distributed_type="TPU")
        print(f"Launching training on {num_processes} TPU cores.")
        xmp.spawn(launcher, args=args, nprocs=num_processes, start_method="fork")
    elif in_colab:
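        # Colab without a TPU: no distributed launch is needed, run the function directly on the
        # single GPU or the CPU.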
        if torch.cuda.is_available():
            print("Launching training on one GPU.")
        else:
            print("Launching training on one CPU.")
        function(*args)
    else:
        if num_processes is None:
            raise ValueError(
                "You have to specify the number of GPUs you would like to use, add `num_processes=...` to your call."
            )

        if num_processes > 1:
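            # Multi-GPU: fork `num_processes` subprocesses (typically one per GPU) with torch.multiprocessing.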
            from torch.multiprocessing import start_processes
            from torch.multiprocessing.spawn import ProcessRaisedException

            if len(AcceleratorState._shared_state) > 0:
                raise ValueError(
                    "To launch a multi-GPU training from your notebook, the `Accelerator` should only be initialized "
                    "inside your training function. Restart your notebook and make sure no cells initialize an "
                    "`Accelerator`."
                )

            if torch.cuda.is_initialized():
                raise ValueError(
                    "To launch a multi-GPU training from your notebook, you need to avoid running any instruction "
                    "using `torch.cuda` in any cell. Restart your notebook and make sure no cells use any CUDA "
                    "function."
                )
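
            # torch.distributed expects the world size, master address and master port to be set as
            # environment variables, so patch them in before forking the worker processes.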
            with patch_environment(
                world_size=num_processes, master_addr="127.0.0.1", master_port=use_port, mixed_precision=mixed_precision
            ):
                launcher = PrepareForLaunch(function, distributed_type="MULTI_GPU")
                print(f"Launching training on {num_processes} GPUs.")
                try:
                    start_processes(launcher, args=args, nprocs=num_processes, start_method="fork")
                except ProcessRaisedException as e:
                    if "Cannot re-initialize CUDA in forked subprocess" in e.args[0]:
                        raise RuntimeError(
                            "CUDA has been initialized before the `notebook_launcher` could create a forked subprocess. "
                            "This likely stems from an outside import causing issues once the `notebook_launcher()` is "
                            "called. Please review your imports and test them when running the `notebook_launcher()` to "
                            "identify which one is problematic."
                        ) from e
                    else:
                        raise

        else:
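            # Single process: run the function directly on MPS, a single GPU, or the CPU.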
            if is_mps_available():
                os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
                print("Launching training on MPS.")
            elif torch.cuda.is_available():
                print("Launching training on one GPU.")
            else:
                print("Launching training on CPU.")
            function(*args)


def debug_launcher(function, args=(), num_processes=2):
""" |
|
Launches a training function using several processes on CPU for debugging purposes. |
|
|
|
<Tip warning={true}> |
|
|
|
This function is provided for internal testing and debugging, but it's not intended for real trainings. It will |
|
only use the CPU. |
|
|
|
</Tip> |
|
|
|
Args: |
|
function (`Callable`): |
|
The training function to execute. |
|
args (`Tuple`): |
|
Tuple of arguments to pass to the function (it will receive `*args`). |
|
num_processes (`int`, *optional*, defaults to 2): |
|
The number of processes to use for training. |
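
    Example (a minimal usage sketch; `train` stands in for your own training function):

    ```python
    from accelerate import debug_launcher


    def train(*args):
        # Your training function here
        ...


    debug_launcher(train, args=(), num_processes=2)
    ```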
    """
    from torch.multiprocessing import start_processes

    with tempfile.NamedTemporaryFile() as tmp_file:
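        # The temporary file's name is passed through `ACCELERATE_DEBUG_RDV_FILE` so the CPU
        # processes have a shared file to rendezvous on.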
        with patch_environment(
            world_size=num_processes,
            master_addr="127.0.0.1",
            master_port="29500",
            accelerate_mixed_precision="no",
            accelerate_debug_rdv_file=tmp_file.name,
            accelerate_use_cpu="yes",
        ):
            launcher = PrepareForLaunch(function, debug=True)
            start_processes(launcher, args=args, nprocs=num_processes, start_method="fork")