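"""Public re-exports for `accelerate.utils`.

Every name imported here becomes available directly as `accelerate.utils.<name>`.
"""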
from ..parallelism_config import ParallelismConfig
from .ao import convert_model_to_fp8_ao, filter_first_and_last_linear_layers, has_ao_layers
from .constants import (
    MITA_PROFILING_AVAILABLE_PYTORCH_VERSION,
    MODEL_NAME,
    OPTIMIZER_NAME,
    PROFILE_PATTERN_NAME,
    RNG_STATE_NAME,
    SAFE_MODEL_NAME,
    SAFE_WEIGHTS_INDEX_NAME,
    SAFE_WEIGHTS_NAME,
    SAFE_WEIGHTS_PATTERN_NAME,
    SAMPLER_NAME,
    SCALER_NAME,
    SCHEDULER_NAME,
    TORCH_DISTRIBUTED_OPERATION_TYPES,
    TORCH_LAUNCH_PARAMS,
    WEIGHTS_INDEX_NAME,
    WEIGHTS_NAME,
    WEIGHTS_PATTERN_NAME,
    XPU_PROFILING_AVAILABLE_PYTORCH_VERSION,
)
from .dataclasses import (
    AORecipeKwargs,
    AutocastKwargs,
    BnbQuantizationConfig,
    ComputeEnvironment,
    CustomDtype,
    DataLoaderConfiguration,
    DDPCommunicationHookType,
    DeepSpeedPlugin,
    DeepSpeedSequenceParallelConfig,
    DistributedDataParallelKwargs,
    DistributedType,
    DynamoBackend,
    FP8RecipeKwargs,
    FullyShardedDataParallelPlugin,
    GradientAccumulationPlugin,
    GradScalerKwargs,
    InitProcessGroupKwargs,
    KwargsHandler,
    LoggerType,
    MegatronLMPlugin,
    MSAMPRecipeKwargs,
    PrecisionType,
    ProfileKwargs,
    ProjectConfiguration,
    RNGType,
    SageMakerDistributedType,
    TensorInformation,
    TERecipeKwargs,
    TorchContextParallelConfig,
    TorchDynamoPlugin,
    TorchTensorParallelConfig,
    TorchTensorParallelPlugin,
    add_model_config_to_megatron_parser,
)
from .environment import (
    are_libraries_initialized,
    check_cuda_fp8_capability,
    check_cuda_p2p_ib_support,
    clear_environment,
    convert_dict_to_env_variables,
    get_cpu_distributed_information,
    get_current_device_type,
    get_gpu_info,
    get_int_from_env,
    parse_choice_from_env,
    parse_flag_from_env,
    patch_environment,
    purge_accelerate_environment,
    set_numa_affinity,
    str_to_bool,
)
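# Note: `patch_environment` and `clear_environment` are context managers; for
# example, `with patch_environment(CUDA_VISIBLE_DEVICES="0"): ...` temporarily
# overrides the variable and restores the previous value on exit.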
from .imports import (
    deepspeed_required,
    get_ccl_version,
    is_4bit_bnb_available,
    is_8bit_bnb_available,
    is_aim_available,
    is_bf16_available,
    is_bitsandbytes_multi_backend_available,
    is_bnb_available,
    is_boto3_available,
    is_ccl_available,
    is_clearml_available,
    is_comet_ml_available,
    is_cuda_available,
    is_datasets_available,
    is_deepspeed_available,
    is_dvclive_available,
    is_fp8_available,
    is_fp16_available,
    is_habana_gaudi1,
    is_hpu_available,
    is_import_timer_available,
    is_ipex_available,
    is_lomo_available,
    is_matplotlib_available,
    is_megatron_lm_available,
    is_mlflow_available,
    is_mlu_available,
    is_mps_available,
    is_msamp_available,
    is_musa_available,
    is_npu_available,
    is_pandas_available,
    is_peft_available,
    is_pippy_available,
    is_pynvml_available,
    is_pytest_available,
    is_rich_available,
    is_sagemaker_available,
    is_schedulefree_available,
    is_sdaa_available,
    is_swanlab_available,
    is_tensorboard_available,
    is_timm_available,
    is_torch_xla_available,
    is_torchao_available,
    is_torchdata_available,
    is_torchdata_stateful_dataloader_available,
    is_torchvision_available,
    is_trackio_available,
    is_transformer_engine_available,
    is_transformer_engine_mxfp8_available,
    is_transformers_available,
    is_triton_available,
    is_wandb_available,
    is_weights_only_available,
    is_xccl_available,
    is_xpu_available,
    torchao_required,
)
from .modeling import (
    align_module_device,
    calculate_maximum_sizes,
    check_device_map,
    check_tied_parameters_in_config,
    check_tied_parameters_on_same_device,
    compute_module_sizes,
    convert_file_size_to_int,
    dtype_byte_size,
    find_tied_parameters,
    get_balanced_memory,
    get_grad_scaler,
    get_max_layer_size,
    get_max_memory,
    get_mixed_precision_context_manager,
    has_offloaded_params,
    id_tensor_storage,
    infer_auto_device_map,
    is_peft_model,
    load_checkpoint_in_model,
    load_offloaded_weights,
    load_state_dict,
    named_module_tensors,
    retie_parameters,
    set_module_tensor_to_device,
)
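# Sketch of the typical big-model loading flow built from these helpers
# (`checkpoint_path` is a placeholder):
#     max_memory = get_balanced_memory(model)
#     device_map = infer_auto_device_map(model, max_memory=max_memory)
#     load_checkpoint_in_model(model, checkpoint_path, device_map=device_map)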
from .offload import (
    OffloadedWeightsLoader,
    PrefixedDataset,
    extract_submodules_state_dict,
    load_offloaded_weight,
    offload_state_dict,
    offload_weight,
    save_offload_index,
)
from .operations import (
    CannotPadNestedTensorWarning,
    GatheredParameters,
    broadcast,
    broadcast_object_list,
    concatenate,
    convert_outputs_to_fp32,
    convert_to_fp32,
    copy_tensor_to_devices,
    find_batch_size,
    find_device,
    gather,
    gather_object,
    get_data_structure,
    honor_type,
    ignorant_find_batch_size,
    initialize_tensors,
    is_namedtuple,
    is_tensor_information,
    is_torch_tensor,
    listify,
    pad_across_processes,
    pad_input_tensors,
    recursively_apply,
    reduce,
    send_to_device,
    slice_tensors,
)
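# The collective helpers above (`gather`, `reduce`, `pad_across_processes`, ...)
# accept arbitrarily nested lists/tuples/dicts of tensors by way of
# `recursively_apply`.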
from .versions import compare_versions, is_torch_version
|
|
|
|
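# DeepSpeed (and, further down, Megatron-LM) re-exports are guarded so that
# importing `accelerate.utils` works without those optional packages installed.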
if is_deepspeed_available():
    from .deepspeed import (
        DeepSpeedEngineWrapper,
        DeepSpeedOptimizerWrapper,
        DeepSpeedSchedulerWrapper,
        DummyOptim,
        DummyScheduler,
        HfDeepSpeedConfig,
        get_active_deepspeed_plugin,
        map_pytorch_optim_to_deepspeed,
    )
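# Sketch of the usual pattern: when the DeepSpeed config file already defines
# the optimizer/scheduler, the `Dummy*` placeholders stand in for real torch
# objects on the Python side, e.g.
#     optimizer = DummyOptim(model.parameters())
#     scheduler = DummyScheduler(optimizer)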
|
|
from .bnb import has_4bit_bnb_layers, load_and_quantize_model
from .fsdp_utils import (
    disable_fsdp_ram_efficient_loading,
    enable_fsdp_ram_efficient_loading,
    ensure_weights_retied,
    fsdp2_apply_ac,
    fsdp2_canonicalize_names,
    fsdp2_load_full_state_dict,
    fsdp2_prepare_model,
    fsdp2_switch_optimizer_parameters,
    get_fsdp2_grad_scaler,
    load_fsdp_model,
    load_fsdp_optimizer,
    merge_fsdp_weights,
    save_fsdp_model,
    save_fsdp_optimizer,
)
from .launch import (
    PrepareForLaunch,
    _filter_args,
    prepare_deepspeed_cmd_env,
    prepare_multi_gpu_env,
    prepare_sagemager_args_inputs,
    prepare_simple_launcher_cmd_env,
    prepare_tpu,
)
|
|
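# These Megatron-LM helpers are import-safe without Megatron-LM installed; the
# names in the guarded block below only exist when the package is present.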
from .megatron_lm import (
    AbstractTrainStep,
    BertTrainStep,
    GPTTrainStep,
    MegatronLMDummyDataLoader,
    MegatronLMDummyScheduler,
    T5TrainStep,
    avg_losses_across_data_parallel_group,
)
|
|
|
|
if is_megatron_lm_available():
    from .megatron_lm import (
        MegatronEngine,
        MegatronLMOptimizerWrapper,
        MegatronLMSchedulerWrapper,
        gather_across_data_parallel_groups,
    )
    from .megatron_lm import initialize as megatron_lm_initialize
    from .megatron_lm import prepare_data_loader as megatron_lm_prepare_data_loader
    from .megatron_lm import prepare_model_optimizer_scheduler as megatron_lm_prepare_model_optimizer_scheduler
    from .megatron_lm import prepare_optimizer as megatron_lm_prepare_optimizer
    from .megatron_lm import prepare_scheduler as megatron_lm_prepare_scheduler
from .memory import find_executable_batch_size, release_memory
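# `find_executable_batch_size` is commonly used as a decorator that retries the
# wrapped function with a smaller batch size after an out-of-memory error, e.g.
#     @find_executable_batch_size(starting_batch_size=128)
#     def train(batch_size):
#         ...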
from .other import (
    check_os_kernel,
    clean_state_dict_for_safetensors,
    compile_regions,
    compile_regions_deepspeed,
    convert_bytes,
    extract_model_from_parallel,
    get_module_children_bottom_up,
    get_pretty_name,
    has_compiled_regions,
    is_compiled_module,
    is_port_in_use,
    load,
    merge_dicts,
    model_has_dtensor,
    recursive_getattr,
    save,
    wait_for_everyone,
    write_basic_config,
)
from .random import set_seed, synchronize_rng_state, synchronize_rng_states
from .torch_xla import install_xla
from .tqdm import tqdm
from .transformer_engine import (
    apply_fp8_autowrap,
    contextual_fp8_autocast,
    convert_model,
    has_transformer_engine_layers,
)
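# `convert_model` swaps supported modules (e.g. `nn.Linear`) for their
# Transformer Engine counterparts ahead of FP8 training, and
# `has_transformer_engine_layers` checks whether a model was already converted.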
|
|