From 22e8a4b2ffb4910b7d4bfb82e8b926f8d5a58d1b Mon Sep 17 00:00:00 2001
From: Xiang Song <xiangsx@amazon.com>
Date: Wed, 12 Feb 2025 23:52:38 -0800
Subject: [PATCH 01/10] Add debug mode for gconstruct

---
 .../graphstorm/gconstruct/construct_graph.py  |  9 ++++-
 python/graphstorm/gconstruct/transform.py     | 35 +++++++++++++++----
 python/graphstorm/gconstruct/utils.py         | 22 ++++++++++++
 3 files changed, 59 insertions(+), 7 deletions(-)

diff --git a/python/graphstorm/gconstruct/construct_graph.py b/python/graphstorm/gconstruct/construct_graph.py
index 62bfbe7799..24f977a9b2 100644
--- a/python/graphstorm/gconstruct/construct_graph.py
+++ b/python/graphstorm/gconstruct/construct_graph.py
@@ -44,7 +44,9 @@
 from .id_map import NoopMap, IdMap, map_node_ids
 from .utils import (multiprocessing_data_read,
                     update_two_phase_feat_ops, ExtMemArrayMerger,
-                    partition_graph, ExtMemArrayWrapper)
+                    partition_graph,
+                    ExtMemArrayWrapper,
+                    set_debug_mode)
 from .utils import (get_hard_edge_negs_feats,
                     shuffle_hard_nids)
 
@@ -744,6 +746,9 @@ def process_graph(args):
     """
     check_graph_name(args.graph_name)
     logging.basicConfig(level=get_log_level(args.logging_level))
+    if args.debug:
+        set_debug_mode()
+
     with open(args.conf_file, 'r', encoding="utf8") as json_file:
         process_confs = json.load(json_file)
 
@@ -919,6 +924,8 @@ def process_graph(args):
                            help="Whether or not to remap node IDs.")
     argparser.add_argument("--add-reverse-edges", action='store_true',
                            help="Add reverse edges.")
+    argparser.add_argument("--debug", action='store_true',
+                           help="Set gconstruct into debug mode.")
     argparser.add_argument("--output-format", type=str, nargs='+', default=["DistDGL"],
                            help="The output format of the constructed graph."
                                 "It can be a single output format, for example "
diff --git a/python/graphstorm/gconstruct/transform.py b/python/graphstorm/gconstruct/transform.py
index 59586b7b6a..ae3cda7dbe 100644
--- a/python/graphstorm/gconstruct/transform.py
+++ b/python/graphstorm/gconstruct/transform.py
@@ -33,7 +33,11 @@
 from transformers import AutoModel, AutoConfig
 
 from .file_io import read_index
-from .utils import ExtMemArrayWrapper, ExtFeatureWrapper, generate_hash
+from .utils import (ExtMemArrayWrapper,
+                    ExtFeatureWrapper,
+                    generate_hash,
+                    is_debug_mode,
+                    validate_numerical_feats)
 
 LABEL_STATS_FIELD = "training_label_stats"
 LABEL_STATS_FREQUENCY_COUNT = "frequency_cnt"
@@ -392,6 +396,10 @@ def call(self, feats):
                 or np.issubdtype(feats.dtype, np.floating), \
                 f"The feature {self.feat_name} has to be integers or floats."
 
+        if is_debug_mode():
+            assert validate_numerical_feats(feats), \
+                f"There are NaN, Inf or missing value in the {self.feat_name} feature."
+
         encoding = np.zeros((len(feats), self.bucket_cnt), dtype=np.int8)
         max_val = max(self.bucket_range)
         min_val = min(self.bucket_range)
@@ -572,6 +580,10 @@ def pre_process(self, feats):
             # It will load all data into main memory.
             feats = feats.to_numpy()
 
+        if is_debug_mode():
+            assert validate_numerical_feats(feats), \
+                f"There are NaN, Inf or missing value in the {self.feat_name} feature."
+
         if feats.dtype not in [np.float64, np.float32, np.float16, np.int64, \
                               np.int32, np.int16, np.int8]:
             logging.warning("The feature %s has to be floating points or integers,"
@@ -735,6 +747,10 @@ def pre_process(self, feats) -> Dict[str, Optional[np.ndarray]]:
             except: # pylint: disable=bare-except
                 raise ValueError(f"The feature {self.feat_name} has to be integers or floats.")
 
+        if is_debug_mode():
+            assert validate_numerical_feats(feats), \
+                f"There are NaN, Inf or missing value in the {self.feat_name} feature."
+
         # make summation a 1D array
         summation = np.sum(feats, axis=0, keepdims=True).reshape((-1,))
         return {self.feat_name: summation}
@@ -836,10 +852,8 @@ def call(self, feats):
         assert isinstance(feats, (np.ndarray, ExtMemArrayWrapper)), \
                 f"The feature {self.feat_name} has to be NumPy array."
 
-        if np.issubdtype(feats.dtype, np.integer) \
-            or np.issubdtype(feats.dtype, np.floating): \
-            return {self.feat_name: feats}
-        else:
+        if not (np.issubdtype(feats.dtype, np.integer) \
+            or np.issubdtype(feats.dtype, np.floating)):
             logging.warning("The feature %s has to be floating points or integers,"
                             "but get %s. Try to cast it into float32",
                             self.feat_name, feats.dtype)
@@ -850,7 +864,11 @@ def call(self, feats):
             except: # pylint: disable=bare-except
                 raise ValueError(f"The feature {self.feat_name} has to be integers or floats.")
 
-            return {self.feat_name: feats}
+        if is_debug_mode():
+            assert validate_numerical_feats(feats), \
+                f"There are NaN, Inf or missing value in the {self.feat_name} feature."
+
+        return {self.feat_name: feats}
 
     def after_merge_transform(self, feats):
         # The feats can be a numpy array or a numpy memmaped object
@@ -1076,6 +1094,11 @@ def call(self, feats):
         assert np.issubdtype(feats.dtype, np.integer) \
                 or np.issubdtype(feats.dtype, np.floating), \
                 f"The feature {self.feat_name} has to be integers or floats."
+
+        if is_debug_mode():
+            assert validate_numerical_feats(feats), \
+                f"There are NaN, Inf or missing value in the {self.feat_name} feature."
+
         if self.truncate_dim is not None:
             if isinstance(feats, np.ndarray):
                 feats = feats[:, :self.truncate_dim]
diff --git a/python/graphstorm/gconstruct/utils.py b/python/graphstorm/gconstruct/utils.py
index fe2d3d0e77..9a12282c79 100644
--- a/python/graphstorm/gconstruct/utils.py
+++ b/python/graphstorm/gconstruct/utils.py
@@ -41,6 +41,28 @@
 SHARED_MEMORY_CROSS_PROCESS_STORAGE = "shared_memory"
 PICKLE_CROSS_PROCESS_STORAGE = "pickle"
 EXT_MEMORY_STORAGE = "ext_memory"
+IS_DEBUG_MODE = False
+
+def is_debug_mode():
+    """ Check whether global IS_DEBUG_MODE is true.
+    """
+    return IS_DEBUG_MODE
+
+def set_debug_mode():
+    """ Set gconstruct in debug mode.
+    """
+    global IS_DEBUG_MODE
+    IS_DEBUG_MODE = True
+
+def validate_numerical_feats(feats):
+    """ Validate the numerical features
+
+    Returns
+    -------
+    bool: Whether the values are all valid
+    """
+    return (not np.isnan(feats).any()) and \
+        (not np.isinf(feats).any())
 
 def _is_numeric(arr):
     """ Check if the input array has the numeric data type.

From 55435c49130bb2c6e296245f6d212d08a3d792ba Mon Sep 17 00:00:00 2001
From: Xiang Song <xiangsx@amazon.com>
Date: Wed, 12 Feb 2025 23:55:21 -0800
Subject: [PATCH 02/10] update

---
 python/graphstorm/gconstruct/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/graphstorm/gconstruct/utils.py b/python/graphstorm/gconstruct/utils.py
index 9a12282c79..6ef1aad650 100644
--- a/python/graphstorm/gconstruct/utils.py
+++ b/python/graphstorm/gconstruct/utils.py
@@ -59,7 +59,7 @@ def validate_numerical_feats(feats):
 
     Returns
     -------
-    bool: Whether the values are all valid
+    bool: Whether the values of the input feature are all valid
     """
     return (not np.isnan(feats).any()) and \
         (not np.isinf(feats).any())

From 4b72ff1daa619ea5cf9460c86604ca47a3dbe6f5 Mon Sep 17 00:00:00 2001
From: Xiang Song <xiangsx@amazon.com>
Date: Thu, 13 Feb 2025 10:49:49 -0800
Subject: [PATCH 03/10] Update

---
 python/graphstorm/gconstruct/construct_graph.py | 10 +++++-----
 python/graphstorm/gconstruct/transform.py       | 12 ++++++------
 python/graphstorm/gconstruct/utils.py           | 14 +++++++-------
 3 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/python/graphstorm/gconstruct/construct_graph.py b/python/graphstorm/gconstruct/construct_graph.py
index 24f977a9b2..662dd90450 100644
--- a/python/graphstorm/gconstruct/construct_graph.py
+++ b/python/graphstorm/gconstruct/construct_graph.py
@@ -46,7 +46,7 @@
                     update_two_phase_feat_ops, ExtMemArrayMerger,
                     partition_graph,
                     ExtMemArrayWrapper,
-                    set_debug_mode)
+                    set_validate_features)
 from .utils import (get_hard_edge_negs_feats,
                     shuffle_hard_nids)
 
@@ -746,8 +746,8 @@ def process_graph(args):
     """
     check_graph_name(args.graph_name)
     logging.basicConfig(level=get_log_level(args.logging_level))
-    if args.debug:
-        set_debug_mode()
+    if args.validate_features:
+        set_validate_features()
 
     with open(args.conf_file, 'r', encoding="utf8") as json_file:
         process_confs = json.load(json_file)
@@ -924,8 +924,8 @@ def process_graph(args):
                            help="Whether or not to remap node IDs.")
     argparser.add_argument("--add-reverse-edges", action='store_true',
                            help="Add reverse edges.")
-    argparser.add_argument("--debug", action='store_true',
-                           help="Set gconstruct into debug mode.")
+    argparser.add_argument("--validate-features", action='store_false',
+                           help="Turn off features validation.")
     argparser.add_argument("--output-format", type=str, nargs='+', default=["DistDGL"],
                            help="The output format of the constructed graph."
                                 "It can be a single output format, for example "
diff --git a/python/graphstorm/gconstruct/transform.py b/python/graphstorm/gconstruct/transform.py
index ae3cda7dbe..a4586cd013 100644
--- a/python/graphstorm/gconstruct/transform.py
+++ b/python/graphstorm/gconstruct/transform.py
@@ -36,7 +36,7 @@
 from .utils import (ExtMemArrayWrapper,
                     ExtFeatureWrapper,
                     generate_hash,
-                    is_debug_mode,
+                    validate_features,
                     validate_numerical_feats)
 
 LABEL_STATS_FIELD = "training_label_stats"
@@ -396,7 +396,7 @@ def call(self, feats):
                 or np.issubdtype(feats.dtype, np.floating), \
                 f"The feature {self.feat_name} has to be integers or floats."
 
-        if is_debug_mode():
+        if validate_features():
             assert validate_numerical_feats(feats), \
                 f"There are NaN, Inf or missing value in the {self.feat_name} feature."
 
@@ -580,7 +580,7 @@ def pre_process(self, feats):
             # It will load all data into main memory.
             feats = feats.to_numpy()
 
-        if is_debug_mode():
+        if validate_features():
             assert validate_numerical_feats(feats), \
                 f"There are NaN, Inf or missing value in the {self.feat_name} feature."
 
@@ -747,7 +747,7 @@ def pre_process(self, feats) -> Dict[str, Optional[np.ndarray]]:
             except: # pylint: disable=bare-except
                 raise ValueError(f"The feature {self.feat_name} has to be integers or floats.")
 
-        if is_debug_mode():
+        if validate_features():
             assert validate_numerical_feats(feats), \
                 f"There are NaN, Inf or missing value in the {self.feat_name} feature."
 
@@ -864,7 +864,7 @@ def call(self, feats):
             except: # pylint: disable=bare-except
                 raise ValueError(f"The feature {self.feat_name} has to be integers or floats.")
 
-        if is_debug_mode():
+        if validate_features():
             assert validate_numerical_feats(feats), \
                 f"There are NaN, Inf or missing value in the {self.feat_name} feature."
 
@@ -1095,7 +1095,7 @@ def call(self, feats):
                 or np.issubdtype(feats.dtype, np.floating), \
                 f"The feature {self.feat_name} has to be integers or floats."
 
-        if is_debug_mode():
+        if validate_features():
             assert validate_numerical_feats(feats), \
                 f"There are NaN, Inf or missing value in the {self.feat_name} feature."
 
diff --git a/python/graphstorm/gconstruct/utils.py b/python/graphstorm/gconstruct/utils.py
index 6ef1aad650..bcb5eb91f8 100644
--- a/python/graphstorm/gconstruct/utils.py
+++ b/python/graphstorm/gconstruct/utils.py
@@ -41,18 +41,18 @@
 SHARED_MEMORY_CROSS_PROCESS_STORAGE = "shared_memory"
 PICKLE_CROSS_PROCESS_STORAGE = "pickle"
 EXT_MEMORY_STORAGE = "ext_memory"
-IS_DEBUG_MODE = False
+VALIDATE_FEATRE= False
 
-def is_debug_mode():
-    """ Check whether global IS_DEBUG_MODE is true.
+def validate_features():
+    """ Check whether gconstruct needs to validate the input features
     """
-    return IS_DEBUG_MODE
+    return VALIDATE_FEATRE
 
-def set_debug_mode():
+def set_validate_features():
     """ Set gconstruct in debug mode.
     """
-    global IS_DEBUG_MODE
-    IS_DEBUG_MODE = True
+    global VALIDATE_FEATRE
+    VALIDATE_FEATRE = True
 
 def validate_numerical_feats(feats):
     """ Validate the numerical features

From 72cc9e8a504e2b5261d2bc5bdabc4eb5ccaed96e Mon Sep 17 00:00:00 2001
From: Xiang Song <xiangsx@amazon.com>
Date: Thu, 13 Feb 2025 10:57:32 -0800
Subject: [PATCH 04/10] Update

---
 python/graphstorm/gconstruct/construct_graph.py | 12 ++++++++----
 python/graphstorm/gconstruct/utils.py           |  6 +++---
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/python/graphstorm/gconstruct/construct_graph.py b/python/graphstorm/gconstruct/construct_graph.py
index 662dd90450..177c7b7143 100644
--- a/python/graphstorm/gconstruct/construct_graph.py
+++ b/python/graphstorm/gconstruct/construct_graph.py
@@ -46,7 +46,7 @@
                     update_two_phase_feat_ops, ExtMemArrayMerger,
                     partition_graph,
                     ExtMemArrayWrapper,
-                    set_validate_features)
+                    stop_validate_features)
 from .utils import (get_hard_edge_negs_feats,
                     shuffle_hard_nids)
 
@@ -746,8 +746,12 @@ def process_graph(args):
     """
     check_graph_name(args.graph_name)
     logging.basicConfig(level=get_log_level(args.logging_level))
-    if args.validate_features:
-        set_validate_features()
+    if args.no_feature_validate:
+        logging.warning("Turn off input feature validation."
+                        "This will speedup data processing, "
+                        "but won't check whether there are "
+                        "invalid values from the input.")
+        stop_validate_features()
 
     with open(args.conf_file, 'r', encoding="utf8") as json_file:
         process_confs = json.load(json_file)
@@ -924,7 +928,7 @@ def process_graph(args):
                            help="Whether or not to remap node IDs.")
     argparser.add_argument("--add-reverse-edges", action='store_true',
                            help="Add reverse edges.")
-    argparser.add_argument("--validate-features", action='store_false',
+    argparser.add_argument("--no-feature-validate", action='store_true',
                            help="Turn off features validation.")
     argparser.add_argument("--output-format", type=str, nargs='+', default=["DistDGL"],
                            help="The output format of the constructed graph."
diff --git a/python/graphstorm/gconstruct/utils.py b/python/graphstorm/gconstruct/utils.py
index bcb5eb91f8..3ab52b6e89 100644
--- a/python/graphstorm/gconstruct/utils.py
+++ b/python/graphstorm/gconstruct/utils.py
@@ -41,18 +41,18 @@
 SHARED_MEMORY_CROSS_PROCESS_STORAGE = "shared_memory"
 PICKLE_CROSS_PROCESS_STORAGE = "pickle"
 EXT_MEMORY_STORAGE = "ext_memory"
-VALIDATE_FEATRE= False
+VALIDATE_FEATRE= True
 
 def validate_features():
     """ Check whether gconstruct needs to validate the input features
     """
     return VALIDATE_FEATRE
 
-def set_validate_features():
+def stop_validate_features():
     """ Set gconstruct in debug mode.
     """
     global VALIDATE_FEATRE
-    VALIDATE_FEATRE = True
+    VALIDATE_FEATRE = False
 
 def validate_numerical_feats(feats):
     """ Validate the numerical features

From 049b16321fbd18e3d96ceb707416c19f78564652 Mon Sep 17 00:00:00 2001
From: Xiang Song <xiangsx@amazon.com>
Date: Thu, 13 Feb 2025 11:58:48 -0800
Subject: [PATCH 05/10] Add unit test

---
 python/graphstorm/gconstruct/transform.py     | 112 ++++++++----------
 tests/unit-tests/gconstruct/test_transform.py |  95 +++++++++++++--
 2 files changed, 134 insertions(+), 73 deletions(-)

diff --git a/python/graphstorm/gconstruct/transform.py b/python/graphstorm/gconstruct/transform.py
index a4586cd013..d23fef9446 100644
--- a/python/graphstorm/gconstruct/transform.py
+++ b/python/graphstorm/gconstruct/transform.py
@@ -284,6 +284,32 @@ def as_out_dtype(self, feats):
         else:
             return feats.astype(self.out_dtype)
 
+    def feat2numerical(self, feats):
+        """ Cast a feature into a numerical numpy array if it is not.
+        If it is already a numerical numpy array, do nothing.
+
+        Parameters
+        ----------
+        feats: np.array
+            Input feature.
+
+        Return
+        ------
+        np.array: the cast feature.
+        """
+        if not (np.issubdtype(feats.dtype, np.integer) \
+            or np.issubdtype(feats.dtype, np.floating)):
+            logging.warning("The feature %s has to be floating points or integers,"
+                            "but get %s. Try to cast it into float32",
+                            self.feat_name, feats.dtype)
+            try:
+                # if input dtype is not float or integer, we need to cast the data
+                # into float32
+                feats = feats.astype(np.float32)
+            except: # pylint: disable=bare-except
+                raise ValueError(f"The feature {self.feat_name} has to be integers or floats.")
+        return feats
+
 class GlobalProcessFeatTransform(FeatTransform):
     """ The base class for transformations that can only be done using a single process.
 
@@ -570,6 +596,16 @@ def pre_process(self, feats):
             f"Feature {self.feat_name} of NumericalMinMaxTransform " \
             "must be numpy array or ExtMemArray"
 
+        if validate_features():
+            if isinstance(feats, ExtMemArrayWrapper):
+                # TODO(xiangsx): This is not memory efficient.
+                # It will load all data into main memory.
+                feats = feats.to_numpy()
+            feats = self.feat2numerical(feats)
+
+            assert validate_numerical_feats(feats), \
+                f"There are NaN, Inf or missing value in the {self.feat_name} feature."
+
         # The max and min of $val = (val-min) / (max-min)$ is pre-defined
         # in the transform_conf, return max_val and min_val directly
         if self._max_val is not None and self._min_val is not None:
@@ -580,21 +616,7 @@ def pre_process(self, feats):
             # It will load all data into main memory.
             feats = feats.to_numpy()
 
-        if validate_features():
-            assert validate_numerical_feats(feats), \
-                f"There are NaN, Inf or missing value in the {self.feat_name} feature."
-
-        if feats.dtype not in [np.float64, np.float32, np.float16, np.int64, \
-                              np.int32, np.int16, np.int8]:
-            logging.warning("The feature %s has to be floating points or integers,"
-                            "but get %s. Try to cast it into float32",
-                            self.feat_name, feats.dtype)
-            try:
-                # if input dtype is not float or integer, we need to cast the data
-                # into float32
-                feats = feats.astype(np.float32)
-            except: # pylint: disable=bare-except
-                raise ValueError(f"The feature {self.feat_name} has to be integers or floats.")
+        feats = self.feat2numerical(feats)
         assert len(feats.shape) <= 2, \
             "Only support 1D fp feature or 2D fp feature, " \
             f"but get {len(feats.shape)}D feature for {self.feat_name}"
@@ -663,15 +685,7 @@ def call(self, feats):
             # It will load all data into main memory.
             feats = feats.to_numpy()
 
-        if feats.dtype not in [np.float64, np.float32, np.float16, np.int64, \
-                              np.int32, np.int16, np.int8]:
-            try:
-                # if input dtype is not float or integer, we need to cast the data
-                # into float32
-                feats = feats.astype(np.float32)
-            except: # pylint: disable=bare-except
-                raise ValueError(f"The feature {self.feat_name} has to be integers or floats.")
-
+        feats = self.feat2numerical(feats)
         feats = (feats - self._min_val) / (self._max_val - self._min_val)
         feats[feats > 1] = 1 # any value > self._max_val is set to self._max_val
         feats[feats < 0] = 0 # any value < self._min_val is set to self._min_val
@@ -725,6 +739,16 @@ def pre_process(self, feats) -> Dict[str, Optional[np.ndarray]]:
             f"Feature {self.feat_name} of NumericalMinMaxTransform " \
             f"must be numpy array or ExtMemArray, got {type(feats)}"
 
+        if validate_features():
+            if isinstance(feats, ExtMemArrayWrapper):
+                # TODO(xiangsx): This is not memory efficient.
+                # It will load all data into main memory.
+                feats = feats.to_numpy()
+
+            feats = self.feat2numerical(feats)
+            assert validate_numerical_feats(feats), \
+                f"There are NaN, Inf or missing value in the {self.feat_name} feature."
+
         # If sum has already been set.
         # Skip the pre-process step
         if self._summation is not None:
@@ -735,22 +759,7 @@ def pre_process(self, feats) -> Dict[str, Optional[np.ndarray]]:
             # It will load all data into main memory.
             feats = feats.to_numpy()
 
-        if feats.dtype not in [np.float64, np.float32, np.float16, np.int64, \
-                              np.int32, np.int16, np.int8]:
-            logging.warning("The feature %s has to be floating points or integers,"
-                            "but get %s. Try to cast it into float32",
-                            self.feat_name, feats.dtype)
-            try:
-                # if input dtype is not float or integer, we need to cast the data
-                # into float32
-                feats = feats.astype(np.float32)
-            except: # pylint: disable=bare-except
-                raise ValueError(f"The feature {self.feat_name} has to be integers or floats.")
-
-        if validate_features():
-            assert validate_numerical_feats(feats), \
-                f"There are NaN, Inf or missing value in the {self.feat_name} feature."
-
+        feats = self.feat2numerical(feats)
         # make summation a 1D array
         summation = np.sum(feats, axis=0, keepdims=True).reshape((-1,))
         return {self.feat_name: summation}
@@ -798,15 +807,7 @@ def call(self, feats) -> Dict[str, np.ndarray]:
             # It will load all data into main memory.
             feats = feats.to_numpy()
 
-        if feats.dtype not in [np.float64, np.float32, np.float16, np.int64, \
-                              np.int32, np.int16, np.int8]:
-            try:
-                # if input dtype is not float or integer, we need to cast the data
-                # into float32
-                feats = feats.astype(np.float32)
-            except TypeError:
-                raise ValueError(f"The feature {self.feat_name} has to be integers or floats.")
-
+        feats = self.feat2numerical(feats)
         feats = feats / self._summation
 
         return {self.feat_name: feats}
@@ -852,18 +853,7 @@ def call(self, feats):
         assert isinstance(feats, (np.ndarray, ExtMemArrayWrapper)), \
                 f"The feature {self.feat_name} has to be NumPy array."
 
-        if not (np.issubdtype(feats.dtype, np.integer) \
-            or np.issubdtype(feats.dtype, np.floating)):
-            logging.warning("The feature %s has to be floating points or integers,"
-                            "but get %s. Try to cast it into float32",
-                            self.feat_name, feats.dtype)
-            try:
-                # if input dtype is not float or integer, we need to cast the data
-                # into float32
-                feats = feats.astype(np.float32)
-            except: # pylint: disable=bare-except
-                raise ValueError(f"The feature {self.feat_name} has to be integers or floats.")
-
+        feats = self.feat2numerical(feats)
         if validate_features():
             assert validate_numerical_feats(feats), \
                 f"There are NaN, Inf or missing value in the {self.feat_name} feature."
diff --git a/tests/unit-tests/gconstruct/test_transform.py b/tests/unit-tests/gconstruct/test_transform.py
index 9f934fdc25..6bba7451d2 100644
--- a/tests/unit-tests/gconstruct/test_transform.py
+++ b/tests/unit-tests/gconstruct/test_transform.py
@@ -52,24 +52,56 @@ def test_fp_min_max_bound(input_dtype):
     assert len(max_val.shape) == 1
     assert len(min_val.shape) == 1
 
+    # test invalid inputs
+    feats[0] = np.nan
+    with assert_raises(AssertionError):
+        _ = transform.pre_process(feats)
+
+    feats[0] = np.inf
+    with assert_raises(AssertionError):
+        _ = transform.pre_process(feats)
+
+    # without predefined bound.
     feats = np.random.randn(100).astype(input_dtype)
     res_dtype = np.float32 if input_dtype == np.cfloat else input_dtype
     fifo = np.finfo(res_dtype)
-    feats[0] = fifo.max
-    feats[1] = -fifo.max
+    max_v = np.max(feats) + 10
+    min_v= np.min(feats) - 10
+    feats[0] = max_v
+    feats[1] = min_v
     transform = NumericalMinMaxTransform("test", "test",
                                          out_dtype=input_dtype)
     max_val, min_val = transform.pre_process(feats)["test"]
     assert len(max_val.shape) == 1
     assert len(min_val.shape) == 1
-    assert_equal(max_val[0], np.finfo(res_dtype).max)
-    assert_equal(min_val[0], -np.finfo(res_dtype).max)
+    assert_almost_equal(max_val[0], max_v, decimal=2)
+    assert_almost_equal(min_val[0], min_v, decimal=2)
+
+    # has predefined bound.
+    feats = np.random.randn(100).astype(input_dtype)
+    res_dtype = np.float32 if input_dtype == np.cfloat else input_dtype
+    fifo = np.finfo(res_dtype)
+    max_v = np.max(feats) + 10
+    min_v= np.min(feats) - 10
+    feats[0] = max_v
+    feats[1] = min_v
+    transform = NumericalMinMaxTransform("test", "test",
+                                         max_bound=5,
+                                         min_bound=-5,
+                                         out_dtype=input_dtype)
+    max_val, min_val = transform.pre_process(feats)["test"]
+    assert len(max_val.shape) == 1
+    assert len(min_val.shape) == 1
+    assert_almost_equal(max_val[0], 5, decimal=2)
+    assert_almost_equal(min_val[0], -5, decimal=2)
 
     if input_dtype == np.float16:
         feats = np.random.randn(100).astype(input_dtype)
         fifo = np.finfo(np.float32)
-        feats[0] = fifo.max
-        feats[1] = -fifo.max
+        max_v = np.max(feats) + 10
+        min_v= np.min(feats) - 10
+        feats[0] = max_v
+        feats[1] = min_v
         transform = NumericalMinMaxTransform("test", "test",
                                              out_dtype=input_dtype)
         max_val, min_val = transform.pre_process(feats)["test"]
@@ -77,13 +109,15 @@ def test_fp_min_max_bound(input_dtype):
         assert len(min_val.shape) == 1
         assert max_val[0].dtype == np.float16
         assert min_val[0].dtype == np.float16
-        assert_equal(max_val[0], np.finfo(np.float16).max)
-        assert_equal(min_val[0], -np.finfo(np.float16).max)
+        assert_almost_equal(max_val[0], max_v, decimal=2)
+        assert_almost_equal(min_val[0], min_v, decimal=2)
 
         feats = np.random.randn(100).astype(input_dtype)
         fifo = np.finfo(np.float32)
-        feats[0] = fifo.max
-        feats[1] = -fifo.max
+        max_v = np.max(feats) + 10
+        min_v= np.min(feats) - 10
+        feats[0] = max_v
+        feats[1] = min_v
         transform = NumericalMinMaxTransform("test", "test",
                                              max_bound=fifo.max,
                                              min_bound=-fifo.max,
@@ -95,8 +129,8 @@ def test_fp_min_max_bound(input_dtype):
         assert len(min_val.shape) == 1
         assert max_val[0].dtype == np.float16
         assert min_val[0].dtype == np.float16
-        assert_equal(max_val[0], np.finfo(np.float16).max)
-        assert_equal(min_val[0], -np.finfo(np.float16).max)
+        assert_almost_equal(max_val[0], max_v, decimal=2)
+        assert_almost_equal(min_val[0], min_v, decimal=2)
 
 
 @pytest.mark.parametrize("input_dtype", [np.cfloat, np.float32])
@@ -532,6 +566,15 @@ def test_noop_transform(out_dtype):
     else:
         assert norm_feats["test"].dtype == np.float32
 
+    # invalid input
+    feats[0] = np.nan
+    with assert_raises(AssertionError):
+        _ = transform(feats)
+
+    feats[0] = np.inf
+    with assert_raises(AssertionError):
+        _ = transform(feats)
+
 def test_noop_truncate():
     transform = Noop("test", "test", truncate_dim=16)
     feats = np.random.randn(100, 32).astype(np.float32)
@@ -608,6 +651,15 @@ def rank_gauss(feat):
         assert trans_feat.dtype != np.float16
         assert_almost_equal(feat, trans_feat, decimal=4)
 
+    # inpuu is invalid
+    feat_0[0] = np.nan
+    with assert_raises(AssertionError):
+        _ = transform(feat_0)
+
+    feat_0[0] = np.inf
+    with assert_raises(AssertionError):
+        _ = transform(feat_0)
+
 def test_custom_node_label_processor():
     train_idx = np.arange(0, 10)
     val_idx = np.arange(10, 15)
@@ -1009,6 +1061,15 @@ def test_bucket_transform(out_dtype):
     feats_tar = np.array([[1, 1], [1, 1], [1, 1], [1, 1]], dtype=out_dtype)
     assert_equal(bucket_feats['test'], feats_tar)
 
+    # invalid input
+    feats = np.array([1, 10, 20, np.nan])
+    with assert_raises(AssertionError):
+        bucket_feats = transform(feats)
+
+    feats = np.array([1, 10, 20, np.inf])
+    with assert_raises(AssertionError):
+        bucket_feats = transform(feats)
+
 @pytest.mark.parametrize("id_dtype", [str, np.int64])
 def test_hard_edge_dst_negative_transform(id_dtype):
     hard_neg_trasnform = HardEdgeDstNegativeTransform("hard_neg", "hard_neg")
@@ -1218,6 +1279,15 @@ def test_standard_pre_process(input_dtype):
     assert_almost_equal(conf["sum"],
                         info[0]+info[1])
 
+    # input is invalid
+    feats0[0] = np.nan
+    with assert_raises(AssertionError):
+        _ = transform.pre_process(feats0)
+
+    feats0[0] = np.inf
+    with assert_raises(AssertionError):
+        _ = transform.pre_process(feats0)
+
     # array sum is zero
     info = [np.array([-1]), np.array([1])]
     transform = NumericalStandardTransform("test", "test")
@@ -1230,6 +1300,7 @@ def test_standard_pre_process(input_dtype):
     with assert_raises(AssertionError):
         transform.update_info(info)
 
+
 @pytest.mark.parametrize("input_dtype", [np.cfloat, np.float32])
 def test_standard_transform(input_dtype):
     np.random.seed(0)

From 7bb0c348000512df1d2091cab2402a1deb174fed Mon Sep 17 00:00:00 2001
From: Xiang Song <xiangsx@amazon.com>
Date: Thu, 13 Feb 2025 12:07:33 -0800
Subject: [PATCH 06/10] Update

---
 docs/source/cli/graph-construction/single-machine-gconstruct.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/source/cli/graph-construction/single-machine-gconstruct.rst b/docs/source/cli/graph-construction/single-machine-gconstruct.rst
index 22f35e691f..39f70be5a7 100644
--- a/docs/source/cli/graph-construction/single-machine-gconstruct.rst
+++ b/docs/source/cli/graph-construction/single-machine-gconstruct.rst
@@ -36,6 +36,7 @@ Full argument list of the ``gconstruct.construct_graph`` command
 * **-\-graph-name**: (**Required**) the name assigned for the graph. The graph name must adhere to the `Python identifier naming rules<https://docs.python.org/3/reference/lexical_analysis.html#identifiers>`_ with the exception that hyphens (``-``) are permitted and the name can start with numbers.
 * **-\-remap-node-id**: boolean value to decide whether to rename node IDs or not. Adding this argument will set it to be true, otherwise false.
 * **-\-add-reverse-edges**: boolean value to decide whether to add reverse edges for the given graph. Adding this argument sets it to true; otherwise, it defaults to false. It is **strongly** suggested to include this argument for graph construction, as some nodes in the original data may not have in-degrees, and thus cannot update their presentations by aggregating messages from their neighbors. Adding this arugment helps prevent this issue.
+* **-\-no-feature-validate**: Turn off the feature validation. By default, GraphStorm will check whether the input features include ``NaN`` or ``Inf``. By turning this validation process off, you can speedup feature transformation.
 * **-\-output-format**: the format of constructed graph, options are ``DGL``,  ``DistDGL``.  Default is ``DistDGL``. It also accepts multiple graph formats at the same time separated by an space, for example ``--output-format "DGL DistDGL"``. The output format is explained in the :ref:`Output <gcon-output-format>` section above.
 * **-\-num-parts**: an integer value that specifies the number of graph partitions to produce. This is only valid if the output format is ``DistDGL``.
 * **-\-part-method**: the partition method to use during partitioning. We support 'metis' or 'random'.

From c16718c0a6a1e73ae7843f4b1b017f3a470872c1 Mon Sep 17 00:00:00 2001
From: Xiang Song <xiangsx@amazon.com>
Date: Thu, 13 Feb 2025 12:17:26 -0800
Subject: [PATCH 07/10] Update

---
 python/graphstorm/gconstruct/transform.py     |  4 +--
 .../gconstruct/test_gconstruct_utils.py       | 34 ++++++++++++++++++-
 2 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/python/graphstorm/gconstruct/transform.py b/python/graphstorm/gconstruct/transform.py
index d23fef9446..827c1c3280 100644
--- a/python/graphstorm/gconstruct/transform.py
+++ b/python/graphstorm/gconstruct/transform.py
@@ -418,10 +418,8 @@ def call(self, feats):
                 f"within numerical value."
         if isinstance(feats, ExtMemArrayWrapper):
             feats = feats.to_numpy()
-        assert np.issubdtype(feats.dtype, np.integer) \
-                or np.issubdtype(feats.dtype, np.floating), \
-                f"The feature {self.feat_name} has to be integers or floats."
 
+        feats = self.feat2numerical(feats)
         if validate_features():
             assert validate_numerical_feats(feats), \
                 f"There are NaN, Inf or missing value in the {self.feat_name} feature."
diff --git a/tests/unit-tests/gconstruct/test_gconstruct_utils.py b/tests/unit-tests/gconstruct/test_gconstruct_utils.py
index 770445b5dd..b92fc36459 100644
--- a/tests/unit-tests/gconstruct/test_gconstruct_utils.py
+++ b/tests/unit-tests/gconstruct/test_gconstruct_utils.py
@@ -34,7 +34,10 @@
 from graphstorm.gconstruct.utils import (save_maps,
                                          load_maps,
                                          get_hard_edge_negs_feats,
-                                         shuffle_hard_nids)
+                                         shuffle_hard_nids,
+                                         validate_features,
+                                         stop_validate_features,
+                                         validate_numerical_feats)
 from graphstorm.gconstruct.file_io import (write_data_hdf5,
                                            read_data_hdf5,
                                            get_in_files,
@@ -518,7 +521,36 @@ def test_single_directory_expansion():
         expected_files = sorted([os.path.join(temp_dir, f) for f in test_files])
         assert sorted(result) == expected_files
 
+def test_validate_features():
+    assert validate_features()
+    stop_validate_features()
+    assert validate_features() is False
+
+def test_validate_numerical_feats():
+    array = np.array([1,2,3])
+    assert validate_numerical_feats(array) is True
+
+    array = np.array([1,2,np.inf])
+    assert validate_numerical_feats(array) is False
+
+    array = np.array([1,2,np.nan])
+    assert validate_numerical_feats(array) is False
+
+    array = np.array([[1,2,np.nan],
+                      [1,2,np.inf]])
+    assert validate_numerical_feats(array) is False
+
+    array = np.array([[1,2,3],
+                      [1,2,np.inf]])
+    assert validate_numerical_feats(array) is False
+
+    array = np.array([[1,2,3],
+                      [1,2,np.nan]])
+    assert validate_numerical_feats(array) is False
+
 if __name__ == '__main__':
+    test_validate_numerical_feats()
+    test_validate_features()
     test_shuffle_hard_nids()
     test_save_load_maps()
     test_get_hard_edge_negs_feats()

From 861bf07347019cdcda8f6fcb665367dc4984c870 Mon Sep 17 00:00:00 2001
From: Xiang Song <xiangsx@amazon.com>
Date: Thu, 13 Feb 2025 12:20:28 -0800
Subject: [PATCH 08/10] Update

---
 python/graphstorm/gconstruct/transform.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/graphstorm/gconstruct/transform.py b/python/graphstorm/gconstruct/transform.py
index 827c1c3280..d55bfaa69c 100644
--- a/python/graphstorm/gconstruct/transform.py
+++ b/python/graphstorm/gconstruct/transform.py
@@ -297,6 +297,8 @@ def feat2numerical(self, feats):
         ------
         np.array: the cast feature.
         """
+        # Only allow integers and floating points
+        # Do not allow booleans.
         if not (np.issubdtype(feats.dtype, np.integer) \
             or np.issubdtype(feats.dtype, np.floating)):
             logging.warning("The feature %s has to be floating points or integers,"

From b48535d9c9f64faa6b807aa9edc7e4dc112edfd9 Mon Sep 17 00:00:00 2001
From: Xiang Song <xiangsx@amazon.com>
Date: Thu, 13 Feb 2025 14:47:15 -0800
Subject: [PATCH 09/10] update

---
 python/graphstorm/gconstruct/transform.py     | 1 -
 tests/unit-tests/gconstruct/test_transform.py | 4 ++--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/python/graphstorm/gconstruct/transform.py b/python/graphstorm/gconstruct/transform.py
index d55bfaa69c..600b9e9b61 100644
--- a/python/graphstorm/gconstruct/transform.py
+++ b/python/graphstorm/gconstruct/transform.py
@@ -602,7 +602,6 @@ def pre_process(self, feats):
                 # It will load all data into main memory.
                 feats = feats.to_numpy()
             feats = self.feat2numerical(feats)
-
             assert validate_numerical_feats(feats), \
                 f"There are NaN, Inf or missing value in the {self.feat_name} feature."
 
diff --git a/tests/unit-tests/gconstruct/test_transform.py b/tests/unit-tests/gconstruct/test_transform.py
index 6bba7451d2..e7f179a1f1 100644
--- a/tests/unit-tests/gconstruct/test_transform.py
+++ b/tests/unit-tests/gconstruct/test_transform.py
@@ -55,11 +55,11 @@ def test_fp_min_max_bound(input_dtype):
     # test invalid inputs
     feats[0] = np.nan
     with assert_raises(AssertionError):
-        _ = transform.pre_process(feats)
+        _ = transform.pre_process(feats.astype(input_dtype))
 
     feats[0] = np.inf
     with assert_raises(AssertionError):
-        _ = transform.pre_process(feats)
+        _ = transform.pre_process(feats.astype(input_dtype))
 
     # without predefined bound.
     feats = np.random.randn(100).astype(input_dtype)

From 9027d7b52d543b55c2290b0733eee4cc0461ef7a Mon Sep 17 00:00:00 2001
From: Xiang Song <xiangsx@amazon.com>
Date: Thu, 13 Feb 2025 15:18:18 -0800
Subject: [PATCH 10/10] Fix unitest

---
 tests/unit-tests/gconstruct/test_gconstruct_utils.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/unit-tests/gconstruct/test_gconstruct_utils.py b/tests/unit-tests/gconstruct/test_gconstruct_utils.py
index b92fc36459..771103ebe1 100644
--- a/tests/unit-tests/gconstruct/test_gconstruct_utils.py
+++ b/tests/unit-tests/gconstruct/test_gconstruct_utils.py
@@ -24,6 +24,7 @@
 import pyarrow.parquet as pq
 import pyarrow as pa
 import torch as th
+import graphstorm
 
 from numpy.testing import assert_almost_equal
 
@@ -525,6 +526,8 @@ def test_validate_features():
     assert validate_features()
     stop_validate_features()
     assert validate_features() is False
+    # set VALIDATE_FEATRE back
+    graphstorm.gconstruct.utils.VALIDATE_FEATRE = True
 
 def test_validate_numerical_feats():
     array = np.array([1,2,3])