Spaces:
Running
on
A10G
Running
on
A10G
File size: 2,044 Bytes
5325fcc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
"""
Utility functions for SLURM configuration and cluster settings.
"""
from enum import Enum
import os
import socket
import typing as tp
import omegaconf
class ClusterType(Enum):
AWS = "aws"
FAIR = "fair"
RSC = "rsc"
LOCAL_DARWIN = "darwin"
DEFAULT = "default" # used for any other cluster.
def _guess_cluster_type() -> ClusterType:
uname = os.uname()
fqdn = socket.getfqdn()
if uname.sysname == "Linux" and (uname.release.endswith("-aws") or ".ec2" in fqdn):
return ClusterType.AWS
if fqdn.endswith(".fair"):
return ClusterType.FAIR
if fqdn.endswith(".facebook.com"):
return ClusterType.RSC
if uname.sysname == "Darwin":
return ClusterType.LOCAL_DARWIN
return ClusterType.DEFAULT
def get_cluster_type(
cluster_type: tp.Optional[ClusterType] = None,
) -> tp.Optional[ClusterType]:
if cluster_type is None:
return _guess_cluster_type()
return cluster_type
def get_slurm_parameters(
cfg: omegaconf.DictConfig, cluster_type: tp.Optional[ClusterType] = None
) -> omegaconf.DictConfig:
"""Update SLURM parameters in configuration based on cluster type.
If the cluster type is not specify, it infers it automatically.
"""
from ..environment import AudioCraftEnvironment
cluster_type = get_cluster_type(cluster_type)
# apply cluster-specific adjustments
if cluster_type == ClusterType.AWS:
cfg["mem_per_gpu"] = None
cfg["constraint"] = None
cfg["setup"] = []
elif cluster_type == ClusterType.RSC:
cfg["mem_per_gpu"] = None
cfg["setup"] = []
cfg["constraint"] = None
cfg["partition"] = "learn"
slurm_exclude = AudioCraftEnvironment.get_slurm_exclude()
if slurm_exclude is not None:
cfg["exclude"] = slurm_exclude
return cfg
|