Enable torch.autocast with ZeRO #6993

Open: wants to merge 67 commits into master from tohtana/support_autocast

Commits (67, changes from all commits shown below)

a4fbc3a
Use ds-specific module id to avoid conflicts (#6847)
tjruwase Jan 31, 2025
9d48dc6
add autocast support and ds_config item
tohtana Jan 31, 2025
8957009
prepare ipg buckets for multiple dtypes
tohtana Feb 1, 2025
c390c76
switch communication data type
tohtana Feb 2, 2025
458797d
add gradscaler
tohtana Feb 3, 2025
2984415
Update A6000 workflows to use newer docker container - 24.09 vs 24.03…
loadams Jan 31, 2025
6b6a600
fix import and formatting
tohtana Feb 3, 2025
2817c02
convert comm type for z3
tohtana Feb 4, 2025
790c43a
Allow NVIDIA Blackwell (#6991)
fabiendupont Feb 4, 2025
fcdeda3
Update GH org references (#6998)
tjruwase Feb 5, 2025
b476d07
Update CNAME
loadams Feb 5, 2025
72f9687
Update CNAME
loadams Feb 5, 2025
a2425da
[XPU] max1100 workflow update for docker and softwares (#7003)
Liangliang-Ma Feb 5, 2025
a896693
autotp training(fix dco) (#7004)
inkcherry Feb 5, 2025
1c1d43a
cast dtype for allgather
tohtana Feb 28, 2025
a265cad
import triton files when triton is supported and installed (#6989)
oelayan7 Feb 6, 2025
359c85d
Update A6000 tests transformers version (#7016)
loadams Feb 8, 2025
735fc2c
Fix ds-chat CI regression (#7015)
tjruwase Feb 10, 2025
1e7888c
[Ulysses tutorial] typos (#7024)
stas00 Feb 11, 2025
7d15f26
fix hostname -I for macOS #6497 (#6990)
fitzjalen Feb 12, 2025
ef5a2a4
Update workflows to cuda 12.4 (#7000)
loadams Feb 12, 2025
f1aea5d
[ROCm] Enable fp_quantizer on ROCm (#7027)
rraminen Feb 13, 2025
8152824
add gds chinese blog (#7034)
GuanhuaWang Feb 13, 2025
c898ac5
Add chinese blog for deepspeed windows, and fix format (#7035)
hwchen2017 Feb 14, 2025
e3ea926
AIO on ROCM (#7023)
jomayeri Feb 14, 2025
e946615
Control trace cache warnings (#7039)
tjruwase Feb 18, 2025
38e9bf3
Update CUDA compute capability to support Blackwell (#7047)
hwchen2017 Feb 18, 2025
acc6a1e
Update setup.py handling of ROCm cupy (#7051)
loadams Feb 19, 2025
bda1430
nv-ds-chat breaks with latest transformers (#7052)
loadams Feb 19, 2025
c184b16
Rename aio_thread_count to intra_op_parallelism (#7056)
tjruwase Feb 19, 2025
a2b8219
add autoTP training zero2 tests (#7049)
inkcherry Feb 19, 2025
9f50cde
Fix, bf16 optimizer remove dup loop (#7054)
wukong1992 Feb 20, 2025
71bd64e
Update version.txt after 0.16.4 release (#7063)
loadams Feb 20, 2025
5c9fd4b
fix an outdated doc wrt CUDA_VISIBLE_DEVICES (#7058)
stas00 Feb 20, 2025
20a46b7
Tecorigin sdaa accelerator (#6903)
siqi654321 Feb 20, 2025
5f68587
Handle special case of libuv for Windows (#7064)
loadams Feb 20, 2025
3f5cd1a
Update README with info on newest accelerator (#7065)
loadams Feb 21, 2025
b80d2d4
Bug Fix for offload_states API (#7050)
U-rara Feb 21, 2025
877c30e
Fix TOCTOU issues, switch to fstat (#7067)
loadams Feb 24, 2025
59fe7f6
config torch to avoid graph breaks caused by logger (#6999)
ShellyNR Feb 24, 2025
060aa5a
Fix meta load tensor imcompatible issue (#7073)
Yejing-Lai Feb 24, 2025
817b31d
Replace calls to `python setup.py sdist` with `python -m build --sdis…
loadams Feb 24, 2025
f99605b
Revert "Handle special case of libuv for Windows (#7064)" (#7076)
loadams Feb 25, 2025
57805b2
Add DeepseekV3 AutoTP. (#7045)
Yejing-Lai Feb 26, 2025
697050e
Improve inference tutorial docs (#7083)
loadams Feb 26, 2025
83c9461
Pin transformers version on tests that use latest. (#7085)
loadams Feb 27, 2025
c6bf7fb
Update README.md with ICS '23 MoE paper link (#7087)
siddharth9820 Feb 27, 2025
2c36865
Update parallelism for nv-torch-latest/nightly tests due to more GPUs…
loadams Feb 27, 2025
f2b89ec
Remove workflows for very old torch versions (#7090)
loadams Feb 28, 2025
965cb2b
Merge branch 'master' into tohtana/support_autocast
tohtana Feb 28, 2025
981e8e2
clear reduce buffer
tohtana Mar 1, 2025
37f77ae
add config to set lower precision modules
tohtana Mar 1, 2025
3083d94
fix to use comm dtype in config when autocast is disabled
tohtana Mar 3, 2025
d688b75
Merge branch 'master' into tohtana/support_autocast
tohtana Mar 3, 2025
aa60eb3
add tests
tohtana Mar 4, 2025
9529830
sort dtypes
tohtana Mar 5, 2025
c8056a8
Merge branch 'master' into tohtana/support_autocast
tohtana Mar 5, 2025
c56339c
fix for cases where param and param.ds_tensor have different dtypes
tohtana Mar 6, 2025
1d6ed6e
Merge branch 'master' into tohtana/support_autocast
loadams Mar 7, 2025
aa10e11
fix moe tests
tohtana Mar 8, 2025
26e62e4
fix tests for opt state offloading
tohtana Mar 8, 2025
a74fa1e
fix var name
tohtana Mar 8, 2025
2016995
Merge branch 'master' into tohtana/support_autocast
tohtana Mar 10, 2025
9f5b8c0
Merge branch 'master' into tohtana/support_autocast
tohtana Mar 12, 2025
7973b88
fix arg order
tohtana Mar 12, 2025
15c436d
Merge branch 'master' into tohtana/support_autocast
tjruwase Mar 17, 2025
713d56f
Merge branch 'master' into tohtana/support_autocast
tohtana Mar 17, 2025
20 changes: 19 additions & 1 deletion deepspeed/runtime/base_optimizer.py
@@ -8,7 +8,8 @@

from deepspeed.utils import logger
from deepspeed.utils.tensor_fragment import map_to_flat_opt_states
-from deepspeed.runtime.utils import bwc_tensor_model_parallel_rank
+from deepspeed.runtime.utils import bwc_tensor_model_parallel_rank, see_memory_usage
+from deepspeed.runtime.torch_autocast import get_autocast_dtype, is_autocast_initialized


class DeepSpeedOptimizer(object):
@@ -61,3 +62,20 @@ def load_hp_checkpoint_state_from_checkpoint_dir(self, lp_groups_name: str, chec
            if key == 'params':
                continue
            param_group[key] = value

    def report_ipg_memory_usage(self, tag, param_elems, dtype=None):
        dtypes = self.ipg_buckets.keys() if dtype is None else [dtype]

        for dt in dtypes:
            bucket = self.ipg_buckets[dt]
            elem_count = bucket.elements + param_elems
            percent_of_bucket_size = (100.0 * elem_count) // self.reduce_bucket_size
            see_memory_usage(
                f"{tag}: elems in_bucket {dt} {bucket.elements} param {param_elems} max_percent {percent_of_bucket_size}"
            )

    def get_param_comm_dtype(self, param):
        if is_autocast_initialized():
            return get_autocast_dtype(param)
        else:
            return self.communication_data_type
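
A minimal sketch of the dtype-selection rule that get_param_comm_dtype implements (resolve_comm_dtype below is an illustrative stand-in, not part of this PR): once autocast parameters have been initialized, each parameter communicates in its tagged autocast_dtype, falling back to the parameter's own dtype; otherwise the optimizer-wide communication dtype is used.

import torch

def resolve_comm_dtype(param, autocast_initialized, comm_dtype=torch.float32):
    # Mirrors get_param_comm_dtype: per-parameter dtype under autocast,
    # optimizer-wide communication dtype otherwise.
    if autocast_initialized:
        return getattr(param, "autocast_dtype", param.dtype)
    return comm_dtype

p = torch.nn.Parameter(torch.zeros(4))  # fp32, as torch autocast requires
p.autocast_dtype = torch.bfloat16  # tag set by init_autocast_params for safe modules
assert resolve_comm_dtype(p, autocast_initialized=True) == torch.bfloat16
assert resolve_comm_dtype(p, autocast_initialized=False) == torch.float32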
32 changes: 32 additions & 0 deletions deepspeed/runtime/config.py
@@ -157,6 +157,35 @@ def get_amp_params(param_dict):
        return False


def get_torch_autocast_enabled(param_dict):
    if TORCH_AUTOCAST in param_dict.keys():
        return get_scalar_param(param_dict[TORCH_AUTOCAST], TORCH_AUTOCAST_ENABLED, TORCH_AUTOCAST_ENABLED_DEFAULT)
    else:
        return False


def get_torch_autocast_dtype(param_dict):
    if TORCH_AUTOCAST in param_dict:
        if TORCH_AUTOCAST_DTYPE in param_dict[TORCH_AUTOCAST]:
            try:
                return DtypeEnum(param_dict[TORCH_AUTOCAST][TORCH_AUTOCAST_DTYPE]).value
            except KeyError:
                raise ValueError(
                    f"Invalid dtype for torch autocast: {param_dict[TORCH_AUTOCAST][TORCH_AUTOCAST_DTYPE]}")
    return None


def get_lower_precision_safe_modules(param_dict):
    if TORCH_AUTOCAST in param_dict:
        if TORCH_AUTOCAST_LOWER_PRECISION_SAFE_MODULES in param_dict[TORCH_AUTOCAST]:
            module_names_with_package = param_dict[TORCH_AUTOCAST][TORCH_AUTOCAST_LOWER_PRECISION_SAFE_MODULES]
            if not all(isinstance(module_name, str) for module_name in module_names_with_package):
                raise ValueError(
                    f"Invalid module names for torch autocast: {module_names_with_package}. Expected list of strings.")
            return module_names_with_package
    return None


def get_fp16_enabled(param_dict):
    if FP16 in param_dict.keys():
        return get_scalar_param(param_dict[FP16], FP16_ENABLED, FP16_ENABLED_DEFAULT)
@@ -836,6 +865,9 @@ def _initialize_params(self, param_dict):
        self.fp16_master_weights_and_gradients = get_fp16_master_weights_and_grads_enabled(param_dict)
        self.amp_enabled = get_amp_enabled(param_dict)
        self.amp_params = get_amp_params(param_dict)
        self.torch_autocast_enabled = get_torch_autocast_enabled(param_dict)
        self.torch_autocast_dtype = get_torch_autocast_dtype(param_dict)
        self.torch_autocast_lower_precision_safe_modules = get_lower_precision_safe_modules(param_dict)
        self.loss_scale = get_loss_scale(param_dict)
        self.initial_dynamic_scale = get_initial_dynamic_scale(param_dict)
        self.dynamic_loss_scale_args = get_dynamic_loss_scale_args(param_dict)
17 changes: 17 additions & 0 deletions deepspeed/runtime/constants.py
@@ -202,6 +202,23 @@
AMP_ENABLED = "enabled"
AMP_ENABLED_DEFAULT = False

#########################################
# Torch AMP support
#########################################
TORCH_AUTOCAST_FORMAT = '''
PyTorch autocast config should be of the format:
"torch_autocast": {
"enabled": true,
"dtype": "bfloat16",
}
'''

Review comment (Collaborator): add "..." here as in other sections as it's incomplete.

Review comment (Collaborator): or update it with the rest of flags?
TORCH_AUTOCAST = "torch_autocast"

TORCH_AUTOCAST_ENABLED = "enabled"
TORCH_AUTOCAST_ENABLED_DEFAULT = False
TORCH_AUTOCAST_DTYPE = "dtype"
TORCH_AUTOCAST_LOWER_PRECISION_SAFE_MODULES = "lower_precision_safe_modules"

#########################################
# Gradient clipping
#########################################
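
Tying the constants above together, and as the review comments request, a sketch of a complete "torch_autocast" config section with all three keys (values are illustrative; when "lower_precision_safe_modules" is omitted, the engine falls back to the built-in safe-module list):

ds_config = {
    "torch_autocast": {
        "enabled": True,
        "dtype": "bfloat16",
        "lower_precision_safe_modules": ["torch.nn.Linear", "torch.nn.Conv2d"]
    }
}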
21 changes: 19 additions & 2 deletions deepspeed/runtime/engine.py
@@ -19,7 +19,7 @@
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
from contextlib import contextmanager

-from typing import Callable, Dict, Union, Iterable, Container
+from typing import Callable, Dict, Union, Iterable, Container, List

import deepspeed

@@ -91,6 +91,7 @@

from deepspeed.runtime.checkpoint_engine.torch_checkpoint_engine import TorchCheckpointEngine
from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
from deepspeed.runtime.torch_autocast import init_autocast_params, get_default_autocast_lower_precision_modules

from .pipe.module import PipelineModule
from .utils import get_ma_status
@@ -316,6 +317,9 @@ def __init__(self,
        if not isinstance(model_parameters, list):
            model_parameters = list(model_parameters)

        if self.torch_autocast_enabled():
            init_autocast_params(self, self.torch_autocast_dtype(), self.torch_autocast_lower_precision_safe_modules())

        if has_optimizer:
            self._configure_optimizer(optimizer, model_parameters)
            self._configure_lr_scheduler()
@@ -923,6 +927,16 @@ def amp_enabled(self):
    def amp_params(self):
        return self._config.amp_params

    def torch_autocast_enabled(self) -> bool:
        return self._config.torch_autocast_enabled

    def torch_autocast_dtype(self) -> torch.dtype:
        return self._config.torch_autocast_dtype

    def torch_autocast_lower_precision_safe_modules(self) -> List[str]:
        module_names = self._config.torch_autocast_lower_precision_safe_modules
        return get_default_autocast_lower_precision_modules() if module_names is None else module_names

    def fp16_auto_cast(self):
        return self._config.fp16_auto_cast

@@ -2027,7 +2041,10 @@ def forward(self, *inputs, **kwargs):
        if self.autotuning_profile_model_info():
            ma = get_ma_status()

-        loss = self.module(*inputs, **kwargs)
+        with torch.autocast(device_type=get_accelerator().device_name(),
+                            dtype=self.torch_autocast_dtype(),
+                            enabled=self.torch_autocast_enabled()):
+            loss = self.module(*inputs, **kwargs)

        if self.autotuning_profile_model_info():
            activation_mem = get_ma_status() - ma
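
Putting the engine changes together, a hedged end-to-end sketch of how a user would drive this path once the PR's "torch_autocast" config key is available (model, sizes, and optimizer settings are illustrative; run under the deepspeed launcher so distributed init succeeds). Because engine.forward now wraps the module call in torch.autocast, user code needs no explicit autocast context:

import torch
import deepspeed

model = torch.nn.Sequential(torch.nn.Linear(16, 16), torch.nn.ReLU(), torch.nn.Linear(16, 1))
ds_config = {
    "train_batch_size": 1,
    "zero_optimization": {"stage": 3},
    "optimizer": {"type": "Adam", "params": {"lr": 1e-3}},
    "torch_autocast": {"enabled": True, "dtype": "bfloat16"},
}
# Parameters stay fp32, as _validate_auto_cast_settings requires.
engine, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=ds_config)

x = torch.randn(1, 16).to(engine.device)
loss = engine(x).sum()  # forward runs under torch.autocast automatically
engine.backward(loss)
engine.step()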
81 changes: 81 additions & 0 deletions deepspeed/runtime/torch_autocast.py
@@ -0,0 +1,81 @@
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

from typing import Iterable, Set, List, Union
import importlib

import torch

LOWER_PRECISION_SAFE_MODULES = [
    torch.nn.Linear,
    torch.nn.Conv1d,
    torch.nn.Conv2d,
    torch.nn.Conv3d,
]

TORCH_AUTOCAST_INITIALIZED = False


def _validate_auto_cast_settings(engine):

    assert not engine.fp16_enabled(), "Cannot enable both torch autocast and fp16"
    assert not engine.bfloat16_enabled(), "Cannot enable both torch autocast and bfloat16"
    assert not engine.zero_quantized_weights(), "Cannot enable both torch autocast and zero quantized weights"

    assert all(p.dtype == torch.float32
               for p in engine.parameters()), "All parameters must be float32 for torch autocast"
    assert engine.communication_data_type == torch.float32, "Communication data type must be float32 for torch autocast"


def init_autocast_params(engine, dtype: torch.dtype,
                         torch_autocast_lower_precision_safe_modules: Union[None, List[str]]) -> None:

    _validate_auto_cast_settings(engine)
    model = engine.module

    if torch_autocast_lower_precision_safe_modules is None:
        lower_precision_safe_module_classes = LOWER_PRECISION_SAFE_MODULES
    else:
        lower_precision_safe_module_classes = []
        for module_name in torch_autocast_lower_precision_safe_modules:
            try:
                package_name, class_name = module_name.rsplit('.', 1)
                module = importlib.import_module(package_name)
                class_ = getattr(module, class_name)
                lower_precision_safe_module_classes.append(class_)
            except Exception as e:
                raise ValueError(f"Failed to import lower precision safe module {module_name}: {e}")

    for module in model.modules():
        if module.__class__ in lower_precision_safe_module_classes:
            for p in module.parameters(recurse=False):
                p.autocast_dtype = dtype

    global TORCH_AUTOCAST_INITIALIZED
    TORCH_AUTOCAST_INITIALIZED = True


def is_autocast_initialized() -> bool:
    return TORCH_AUTOCAST_INITIALIZED


def get_default_autocast_lower_precision_modules() -> List[str]:
    return [f"{cls.__module__}.{cls.__name__}" for cls in LOWER_PRECISION_SAFE_MODULES]


def get_autocast_dtype(param: torch.nn.Parameter) -> torch.dtype:
    return param.autocast_dtype if hasattr(param, "autocast_dtype") else param.dtype


def has_autocast_dtype(param: torch.nn.Parameter) -> bool:
    return hasattr(param, "autocast_dtype")


def get_all_autocast_dtypes(params: Iterable) -> Set[torch.dtype]:
    return {get_autocast_dtype(p) for p in params}


def sort_dtypes(dtypes: List[torch.dtype]) -> List[torch.dtype]:
    return sorted(dtypes, key=str)
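
The dotted-name resolution inside init_autocast_params can be exercised on its own. A minimal sketch of the same rsplit / import_module / getattr sequence (resolve_class is illustrative, not part of the module above):

import importlib
import torch

def resolve_class(module_name: str) -> type:
    # "torch.nn.Linear" -> package "torch.nn", class name "Linear"
    package_name, class_name = module_name.rsplit('.', 1)
    module = importlib.import_module(package_name)
    return getattr(module, class_name)

assert resolve_class("torch.nn.Linear") is torch.nn.Linear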
5 changes: 2 additions & 3 deletions deepspeed/runtime/zero/offload_states.py
@@ -69,6 +69,5 @@ def get_state_devices(model, state: OffloadStateTypeEnum) -> Set[torch.device]:
        return set(safe_get_local_optimizer_state(p, "exp_avg").device for p in model.parameters()) | \
               set(safe_get_local_optimizer_state(p, "exp_avg_sq").device for p in model.parameters())
    elif state == OffloadStateTypeEnum.contiguous_grad_buffer:
-        if model.optimizer._DeepSpeedZeroOptimizer_Stage3__ipg_bucket_flat_buffer == None:
-            return {}
-        return {model.optimizer._DeepSpeedZeroOptimizer_Stage3__ipg_bucket_flat_buffer.device}
+        return set(bucket.buffer.device for bucket in model.optimizer.ipg_buckets.values()
+                   if bucket.buffer is not None)
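
For context, a hedged sketch of the per-dtype bucket layout this new query assumes (IPGBucket here is a hypothetical stand-in for the PR's bucket type; the real optimizer keys ipg_buckets by gradient dtype, with bucket.elements and bucket.buffer as used in the diffs above, and allocates each flat buffer lazily):

from dataclasses import dataclass
from typing import Optional
import torch

@dataclass
class IPGBucket:  # hypothetical stand-in, not the PR's class
    buffer: Optional[torch.Tensor] = None
    elements: int = 0

ipg_buckets = {
    torch.bfloat16: IPGBucket(buffer=torch.empty(8, dtype=torch.bfloat16)),
    torch.float32: IPGBucket(),  # flat buffer not yet allocated
}
devices = {b.buffer.device for b in ipg_buckets.values() if b.buffer is not None}
print(devices)  # e.g. {device(type='cpu')}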