From 22e8a4b2ffb4910b7d4bfb82e8b926f8d5a58d1b Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Wed, 12 Feb 2025 23:52:38 -0800 Subject: [PATCH 01/10] Add debug mode for gconstruct --- .../graphstorm/gconstruct/construct_graph.py | 9 ++++- python/graphstorm/gconstruct/transform.py | 35 +++++++++++++++---- python/graphstorm/gconstruct/utils.py | 22 ++++++++++++ 3 files changed, 59 insertions(+), 7 deletions(-) diff --git a/python/graphstorm/gconstruct/construct_graph.py b/python/graphstorm/gconstruct/construct_graph.py index 62bfbe7799..24f977a9b2 100644 --- a/python/graphstorm/gconstruct/construct_graph.py +++ b/python/graphstorm/gconstruct/construct_graph.py @@ -44,7 +44,9 @@ from .id_map import NoopMap, IdMap, map_node_ids from .utils import (multiprocessing_data_read, update_two_phase_feat_ops, ExtMemArrayMerger, - partition_graph, ExtMemArrayWrapper) + partition_graph, + ExtMemArrayWrapper, + set_debug_mode) from .utils import (get_hard_edge_negs_feats, shuffle_hard_nids) @@ -744,6 +746,9 @@ def process_graph(args): """ check_graph_name(args.graph_name) logging.basicConfig(level=get_log_level(args.logging_level)) + if args.debug: + set_debug_mode() + with open(args.conf_file, 'r', encoding="utf8") as json_file: process_confs = json.load(json_file) @@ -919,6 +924,8 @@ def process_graph(args): help="Whether or not to remap node IDs.") argparser.add_argument("--add-reverse-edges", action='store_true', help="Add reverse edges.") + argparser.add_argument("--debug", action='store_true', + help="Set gconstruct into debug mode.") argparser.add_argument("--output-format", type=str, nargs='+', default=["DistDGL"], help="The output format of the constructed graph." "It can be a single output format, for example " diff --git a/python/graphstorm/gconstruct/transform.py b/python/graphstorm/gconstruct/transform.py index 59586b7b6a..ae3cda7dbe 100644 --- a/python/graphstorm/gconstruct/transform.py +++ b/python/graphstorm/gconstruct/transform.py @@ -33,7 +33,11 @@ from transformers import AutoModel, AutoConfig from .file_io import read_index -from .utils import ExtMemArrayWrapper, ExtFeatureWrapper, generate_hash +from .utils import (ExtMemArrayWrapper, + ExtFeatureWrapper, + generate_hash, + is_debug_mode, + validate_numerical_feats) LABEL_STATS_FIELD = "training_label_stats" LABEL_STATS_FREQUENCY_COUNT = "frequency_cnt" @@ -392,6 +396,10 @@ def call(self, feats): or np.issubdtype(feats.dtype, np.floating), \ f"The feature {self.feat_name} has to be integers or floats." + if is_debug_mode(): + assert validate_numerical_feats(feats), \ + f"There are NaN, Inf or missing value in the {self.feat_name} feature." + encoding = np.zeros((len(feats), self.bucket_cnt), dtype=np.int8) max_val = max(self.bucket_range) min_val = min(self.bucket_range) @@ -572,6 +580,10 @@ def pre_process(self, feats): # It will load all data into main memory. feats = feats.to_numpy() + if is_debug_mode(): + assert validate_numerical_feats(feats), \ + f"There are NaN, Inf or missing value in the {self.feat_name} feature." + if feats.dtype not in [np.float64, np.float32, np.float16, np.int64, \ np.int32, np.int16, np.int8]: logging.warning("The feature %s has to be floating points or integers," @@ -735,6 +747,10 @@ def pre_process(self, feats) -> Dict[str, Optional[np.ndarray]]: except: # pylint: disable=bare-except raise ValueError(f"The feature {self.feat_name} has to be integers or floats.") + if is_debug_mode(): + assert validate_numerical_feats(feats), \ + f"There are NaN, Inf or missing value in the {self.feat_name} feature." + # make summation a 1D array summation = np.sum(feats, axis=0, keepdims=True).reshape((-1,)) return {self.feat_name: summation} @@ -836,10 +852,8 @@ def call(self, feats): assert isinstance(feats, (np.ndarray, ExtMemArrayWrapper)), \ f"The feature {self.feat_name} has to be NumPy array." - if np.issubdtype(feats.dtype, np.integer) \ - or np.issubdtype(feats.dtype, np.floating): \ - return {self.feat_name: feats} - else: + if not (np.issubdtype(feats.dtype, np.integer) \ + or np.issubdtype(feats.dtype, np.floating)): logging.warning("The feature %s has to be floating points or integers," "but get %s. Try to cast it into float32", self.feat_name, feats.dtype) @@ -850,7 +864,11 @@ def call(self, feats): except: # pylint: disable=bare-except raise ValueError(f"The feature {self.feat_name} has to be integers or floats.") - return {self.feat_name: feats} + if is_debug_mode(): + assert validate_numerical_feats(feats), \ + f"There are NaN, Inf or missing value in the {self.feat_name} feature." + + return {self.feat_name: feats} def after_merge_transform(self, feats): # The feats can be a numpy array or a numpy memmaped object @@ -1076,6 +1094,11 @@ def call(self, feats): assert np.issubdtype(feats.dtype, np.integer) \ or np.issubdtype(feats.dtype, np.floating), \ f"The feature {self.feat_name} has to be integers or floats." + + if is_debug_mode(): + assert validate_numerical_feats(feats), \ + f"There are NaN, Inf or missing value in the {self.feat_name} feature." + if self.truncate_dim is not None: if isinstance(feats, np.ndarray): feats = feats[:, :self.truncate_dim] diff --git a/python/graphstorm/gconstruct/utils.py b/python/graphstorm/gconstruct/utils.py index fe2d3d0e77..9a12282c79 100644 --- a/python/graphstorm/gconstruct/utils.py +++ b/python/graphstorm/gconstruct/utils.py @@ -41,6 +41,28 @@ SHARED_MEMORY_CROSS_PROCESS_STORAGE = "shared_memory" PICKLE_CROSS_PROCESS_STORAGE = "pickle" EXT_MEMORY_STORAGE = "ext_memory" +IS_DEBUG_MODE = False + +def is_debug_mode(): + """ Check whether global IS_DEBUG_MODE is true. + """ + return IS_DEBUG_MODE + +def set_debug_mode(): + """ Set gconstruct in debug mode. + """ + global IS_DEBUG_MODE + IS_DEBUG_MODE = True + +def validate_numerical_feats(feats): + """ Validate the numerical features + + Returns + ------- + bool: Whether the values are all valid + """ + return (not np.isnan(feats).any()) and \ + (not np.isinf(feats).any()) def _is_numeric(arr): """ Check if the input array has the numeric data type. From 55435c49130bb2c6e296245f6d212d08a3d792ba Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Wed, 12 Feb 2025 23:55:21 -0800 Subject: [PATCH 02/10] update --- python/graphstorm/gconstruct/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/graphstorm/gconstruct/utils.py b/python/graphstorm/gconstruct/utils.py index 9a12282c79..6ef1aad650 100644 --- a/python/graphstorm/gconstruct/utils.py +++ b/python/graphstorm/gconstruct/utils.py @@ -59,7 +59,7 @@ def validate_numerical_feats(feats): Returns ------- - bool: Whether the values are all valid + bool: Whether the values of the input feature are all valid """ return (not np.isnan(feats).any()) and \ (not np.isinf(feats).any()) From 4b72ff1daa619ea5cf9460c86604ca47a3dbe6f5 Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Thu, 13 Feb 2025 10:49:49 -0800 Subject: [PATCH 03/10] Update --- python/graphstorm/gconstruct/construct_graph.py | 10 +++++----- python/graphstorm/gconstruct/transform.py | 12 ++++++------ python/graphstorm/gconstruct/utils.py | 14 +++++++------- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/python/graphstorm/gconstruct/construct_graph.py b/python/graphstorm/gconstruct/construct_graph.py index 24f977a9b2..662dd90450 100644 --- a/python/graphstorm/gconstruct/construct_graph.py +++ b/python/graphstorm/gconstruct/construct_graph.py @@ -46,7 +46,7 @@ update_two_phase_feat_ops, ExtMemArrayMerger, partition_graph, ExtMemArrayWrapper, - set_debug_mode) + set_validate_features) from .utils import (get_hard_edge_negs_feats, shuffle_hard_nids) @@ -746,8 +746,8 @@ def process_graph(args): """ check_graph_name(args.graph_name) logging.basicConfig(level=get_log_level(args.logging_level)) - if args.debug: - set_debug_mode() + if args.validate_features: + set_validate_features() with open(args.conf_file, 'r', encoding="utf8") as json_file: process_confs = json.load(json_file) @@ -924,8 +924,8 @@ def process_graph(args): help="Whether or not to remap node IDs.") argparser.add_argument("--add-reverse-edges", action='store_true', help="Add reverse edges.") - argparser.add_argument("--debug", action='store_true', - help="Set gconstruct into debug mode.") + argparser.add_argument("--validate-features", action='store_false', + help="Turn off features validation.") argparser.add_argument("--output-format", type=str, nargs='+', default=["DistDGL"], help="The output format of the constructed graph." "It can be a single output format, for example " diff --git a/python/graphstorm/gconstruct/transform.py b/python/graphstorm/gconstruct/transform.py index ae3cda7dbe..a4586cd013 100644 --- a/python/graphstorm/gconstruct/transform.py +++ b/python/graphstorm/gconstruct/transform.py @@ -36,7 +36,7 @@ from .utils import (ExtMemArrayWrapper, ExtFeatureWrapper, generate_hash, - is_debug_mode, + validate_features, validate_numerical_feats) LABEL_STATS_FIELD = "training_label_stats" @@ -396,7 +396,7 @@ def call(self, feats): or np.issubdtype(feats.dtype, np.floating), \ f"The feature {self.feat_name} has to be integers or floats." - if is_debug_mode(): + if validate_features(): assert validate_numerical_feats(feats), \ f"There are NaN, Inf or missing value in the {self.feat_name} feature." @@ -580,7 +580,7 @@ def pre_process(self, feats): # It will load all data into main memory. feats = feats.to_numpy() - if is_debug_mode(): + if validate_features(): assert validate_numerical_feats(feats), \ f"There are NaN, Inf or missing value in the {self.feat_name} feature." @@ -747,7 +747,7 @@ def pre_process(self, feats) -> Dict[str, Optional[np.ndarray]]: except: # pylint: disable=bare-except raise ValueError(f"The feature {self.feat_name} has to be integers or floats.") - if is_debug_mode(): + if validate_features(): assert validate_numerical_feats(feats), \ f"There are NaN, Inf or missing value in the {self.feat_name} feature." @@ -864,7 +864,7 @@ def call(self, feats): except: # pylint: disable=bare-except raise ValueError(f"The feature {self.feat_name} has to be integers or floats.") - if is_debug_mode(): + if validate_features(): assert validate_numerical_feats(feats), \ f"There are NaN, Inf or missing value in the {self.feat_name} feature." @@ -1095,7 +1095,7 @@ def call(self, feats): or np.issubdtype(feats.dtype, np.floating), \ f"The feature {self.feat_name} has to be integers or floats." - if is_debug_mode(): + if validate_features(): assert validate_numerical_feats(feats), \ f"There are NaN, Inf or missing value in the {self.feat_name} feature." diff --git a/python/graphstorm/gconstruct/utils.py b/python/graphstorm/gconstruct/utils.py index 6ef1aad650..bcb5eb91f8 100644 --- a/python/graphstorm/gconstruct/utils.py +++ b/python/graphstorm/gconstruct/utils.py @@ -41,18 +41,18 @@ SHARED_MEMORY_CROSS_PROCESS_STORAGE = "shared_memory" PICKLE_CROSS_PROCESS_STORAGE = "pickle" EXT_MEMORY_STORAGE = "ext_memory" -IS_DEBUG_MODE = False +VALIDATE_FEATRE= False -def is_debug_mode(): - """ Check whether global IS_DEBUG_MODE is true. +def validate_features(): + """ Check whether gconstruct needs to validate the input features """ - return IS_DEBUG_MODE + return VALIDATE_FEATRE -def set_debug_mode(): +def set_validate_features(): """ Set gconstruct in debug mode. """ - global IS_DEBUG_MODE - IS_DEBUG_MODE = True + global VALIDATE_FEATRE + VALIDATE_FEATRE = True def validate_numerical_feats(feats): """ Validate the numerical features From 72cc9e8a504e2b5261d2bc5bdabc4eb5ccaed96e Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Thu, 13 Feb 2025 10:57:32 -0800 Subject: [PATCH 04/10] Update --- python/graphstorm/gconstruct/construct_graph.py | 12 ++++++++---- python/graphstorm/gconstruct/utils.py | 6 +++--- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/python/graphstorm/gconstruct/construct_graph.py b/python/graphstorm/gconstruct/construct_graph.py index 662dd90450..177c7b7143 100644 --- a/python/graphstorm/gconstruct/construct_graph.py +++ b/python/graphstorm/gconstruct/construct_graph.py @@ -46,7 +46,7 @@ update_two_phase_feat_ops, ExtMemArrayMerger, partition_graph, ExtMemArrayWrapper, - set_validate_features) + stop_validate_features) from .utils import (get_hard_edge_negs_feats, shuffle_hard_nids) @@ -746,8 +746,12 @@ def process_graph(args): """ check_graph_name(args.graph_name) logging.basicConfig(level=get_log_level(args.logging_level)) - if args.validate_features: - set_validate_features() + if args.no_feature_validate: + logging.warning("Turn off input feature validation." + "This will speedup data processing, " + "but won't check whether there are " + "invalid values from the input.") + stop_validate_features() with open(args.conf_file, 'r', encoding="utf8") as json_file: process_confs = json.load(json_file) @@ -924,7 +928,7 @@ def process_graph(args): help="Whether or not to remap node IDs.") argparser.add_argument("--add-reverse-edges", action='store_true', help="Add reverse edges.") - argparser.add_argument("--validate-features", action='store_false', + argparser.add_argument("--no-feature-validate", action='store_true', help="Turn off features validation.") argparser.add_argument("--output-format", type=str, nargs='+', default=["DistDGL"], help="The output format of the constructed graph." diff --git a/python/graphstorm/gconstruct/utils.py b/python/graphstorm/gconstruct/utils.py index bcb5eb91f8..3ab52b6e89 100644 --- a/python/graphstorm/gconstruct/utils.py +++ b/python/graphstorm/gconstruct/utils.py @@ -41,18 +41,18 @@ SHARED_MEMORY_CROSS_PROCESS_STORAGE = "shared_memory" PICKLE_CROSS_PROCESS_STORAGE = "pickle" EXT_MEMORY_STORAGE = "ext_memory" -VALIDATE_FEATRE= False +VALIDATE_FEATRE= True def validate_features(): """ Check whether gconstruct needs to validate the input features """ return VALIDATE_FEATRE -def set_validate_features(): +def stop_validate_features(): """ Set gconstruct in debug mode. """ global VALIDATE_FEATRE - VALIDATE_FEATRE = True + VALIDATE_FEATRE = False def validate_numerical_feats(feats): """ Validate the numerical features From 049b16321fbd18e3d96ceb707416c19f78564652 Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Thu, 13 Feb 2025 11:58:48 -0800 Subject: [PATCH 05/10] Add unit test --- python/graphstorm/gconstruct/transform.py | 112 ++++++++---------- tests/unit-tests/gconstruct/test_transform.py | 95 +++++++++++++-- 2 files changed, 134 insertions(+), 73 deletions(-) diff --git a/python/graphstorm/gconstruct/transform.py b/python/graphstorm/gconstruct/transform.py index a4586cd013..d23fef9446 100644 --- a/python/graphstorm/gconstruct/transform.py +++ b/python/graphstorm/gconstruct/transform.py @@ -284,6 +284,32 @@ def as_out_dtype(self, feats): else: return feats.astype(self.out_dtype) + def feat2numerical(self, feats): + """ Cast a feature into a numerical numpy array if it is not. + If it is already a numerical numpy array, do nothing. + + Parameters + ---------- + feats: np.array + Input feature. + + Return + ------ + np.array: the cast feature. + """ + if not (np.issubdtype(feats.dtype, np.integer) \ + or np.issubdtype(feats.dtype, np.floating)): + logging.warning("The feature %s has to be floating points or integers," + "but get %s. Try to cast it into float32", + self.feat_name, feats.dtype) + try: + # if input dtype is not float or integer, we need to cast the data + # into float32 + feats = feats.astype(np.float32) + except: # pylint: disable=bare-except + raise ValueError(f"The feature {self.feat_name} has to be integers or floats.") + return feats + class GlobalProcessFeatTransform(FeatTransform): """ The base class for transformations that can only be done using a single process. @@ -570,6 +596,16 @@ def pre_process(self, feats): f"Feature {self.feat_name} of NumericalMinMaxTransform " \ "must be numpy array or ExtMemArray" + if validate_features(): + if isinstance(feats, ExtMemArrayWrapper): + # TODO(xiangsx): This is not memory efficient. + # It will load all data into main memory. + feats = feats.to_numpy() + feats = self.feat2numerical(feats) + + assert validate_numerical_feats(feats), \ + f"There are NaN, Inf or missing value in the {self.feat_name} feature." + # The max and min of $val = (val-min) / (max-min)$ is pre-defined # in the transform_conf, return max_val and min_val directly if self._max_val is not None and self._min_val is not None: @@ -580,21 +616,7 @@ def pre_process(self, feats): # It will load all data into main memory. feats = feats.to_numpy() - if validate_features(): - assert validate_numerical_feats(feats), \ - f"There are NaN, Inf or missing value in the {self.feat_name} feature." - - if feats.dtype not in [np.float64, np.float32, np.float16, np.int64, \ - np.int32, np.int16, np.int8]: - logging.warning("The feature %s has to be floating points or integers," - "but get %s. Try to cast it into float32", - self.feat_name, feats.dtype) - try: - # if input dtype is not float or integer, we need to cast the data - # into float32 - feats = feats.astype(np.float32) - except: # pylint: disable=bare-except - raise ValueError(f"The feature {self.feat_name} has to be integers or floats.") + feats = self.feat2numerical(feats) assert len(feats.shape) <= 2, \ "Only support 1D fp feature or 2D fp feature, " \ f"but get {len(feats.shape)}D feature for {self.feat_name}" @@ -663,15 +685,7 @@ def call(self, feats): # It will load all data into main memory. feats = feats.to_numpy() - if feats.dtype not in [np.float64, np.float32, np.float16, np.int64, \ - np.int32, np.int16, np.int8]: - try: - # if input dtype is not float or integer, we need to cast the data - # into float32 - feats = feats.astype(np.float32) - except: # pylint: disable=bare-except - raise ValueError(f"The feature {self.feat_name} has to be integers or floats.") - + feats = self.feat2numerical(feats) feats = (feats - self._min_val) / (self._max_val - self._min_val) feats[feats > 1] = 1 # any value > self._max_val is set to self._max_val feats[feats < 0] = 0 # any value < self._min_val is set to self._min_val @@ -725,6 +739,16 @@ def pre_process(self, feats) -> Dict[str, Optional[np.ndarray]]: f"Feature {self.feat_name} of NumericalMinMaxTransform " \ f"must be numpy array or ExtMemArray, got {type(feats)}" + if validate_features(): + if isinstance(feats, ExtMemArrayWrapper): + # TODO(xiangsx): This is not memory efficient. + # It will load all data into main memory. + feats = feats.to_numpy() + + feats = self.feat2numerical(feats) + assert validate_numerical_feats(feats), \ + f"There are NaN, Inf or missing value in the {self.feat_name} feature." + # If sum has already been set. # Skip the pre-process step if self._summation is not None: @@ -735,22 +759,7 @@ def pre_process(self, feats) -> Dict[str, Optional[np.ndarray]]: # It will load all data into main memory. feats = feats.to_numpy() - if feats.dtype not in [np.float64, np.float32, np.float16, np.int64, \ - np.int32, np.int16, np.int8]: - logging.warning("The feature %s has to be floating points or integers," - "but get %s. Try to cast it into float32", - self.feat_name, feats.dtype) - try: - # if input dtype is not float or integer, we need to cast the data - # into float32 - feats = feats.astype(np.float32) - except: # pylint: disable=bare-except - raise ValueError(f"The feature {self.feat_name} has to be integers or floats.") - - if validate_features(): - assert validate_numerical_feats(feats), \ - f"There are NaN, Inf or missing value in the {self.feat_name} feature." - + feats = self.feat2numerical(feats) # make summation a 1D array summation = np.sum(feats, axis=0, keepdims=True).reshape((-1,)) return {self.feat_name: summation} @@ -798,15 +807,7 @@ def call(self, feats) -> Dict[str, np.ndarray]: # It will load all data into main memory. feats = feats.to_numpy() - if feats.dtype not in [np.float64, np.float32, np.float16, np.int64, \ - np.int32, np.int16, np.int8]: - try: - # if input dtype is not float or integer, we need to cast the data - # into float32 - feats = feats.astype(np.float32) - except TypeError: - raise ValueError(f"The feature {self.feat_name} has to be integers or floats.") - + feats = self.feat2numerical(feats) feats = feats / self._summation return {self.feat_name: feats} @@ -852,18 +853,7 @@ def call(self, feats): assert isinstance(feats, (np.ndarray, ExtMemArrayWrapper)), \ f"The feature {self.feat_name} has to be NumPy array." - if not (np.issubdtype(feats.dtype, np.integer) \ - or np.issubdtype(feats.dtype, np.floating)): - logging.warning("The feature %s has to be floating points or integers," - "but get %s. Try to cast it into float32", - self.feat_name, feats.dtype) - try: - # if input dtype is not float or integer, we need to cast the data - # into float32 - feats = feats.astype(np.float32) - except: # pylint: disable=bare-except - raise ValueError(f"The feature {self.feat_name} has to be integers or floats.") - + feats = self.feat2numerical(feats) if validate_features(): assert validate_numerical_feats(feats), \ f"There are NaN, Inf or missing value in the {self.feat_name} feature." diff --git a/tests/unit-tests/gconstruct/test_transform.py b/tests/unit-tests/gconstruct/test_transform.py index 9f934fdc25..6bba7451d2 100644 --- a/tests/unit-tests/gconstruct/test_transform.py +++ b/tests/unit-tests/gconstruct/test_transform.py @@ -52,24 +52,56 @@ def test_fp_min_max_bound(input_dtype): assert len(max_val.shape) == 1 assert len(min_val.shape) == 1 + # test invalid inputs + feats[0] = np.nan + with assert_raises(AssertionError): + _ = transform.pre_process(feats) + + feats[0] = np.inf + with assert_raises(AssertionError): + _ = transform.pre_process(feats) + + # without predefined bound. feats = np.random.randn(100).astype(input_dtype) res_dtype = np.float32 if input_dtype == np.cfloat else input_dtype fifo = np.finfo(res_dtype) - feats[0] = fifo.max - feats[1] = -fifo.max + max_v = np.max(feats) + 10 + min_v= np.min(feats) - 10 + feats[0] = max_v + feats[1] = min_v transform = NumericalMinMaxTransform("test", "test", out_dtype=input_dtype) max_val, min_val = transform.pre_process(feats)["test"] assert len(max_val.shape) == 1 assert len(min_val.shape) == 1 - assert_equal(max_val[0], np.finfo(res_dtype).max) - assert_equal(min_val[0], -np.finfo(res_dtype).max) + assert_almost_equal(max_val[0], max_v, decimal=2) + assert_almost_equal(min_val[0], min_v, decimal=2) + + # has predefined bound. + feats = np.random.randn(100).astype(input_dtype) + res_dtype = np.float32 if input_dtype == np.cfloat else input_dtype + fifo = np.finfo(res_dtype) + max_v = np.max(feats) + 10 + min_v= np.min(feats) - 10 + feats[0] = max_v + feats[1] = min_v + transform = NumericalMinMaxTransform("test", "test", + max_bound=5, + min_bound=-5, + out_dtype=input_dtype) + max_val, min_val = transform.pre_process(feats)["test"] + assert len(max_val.shape) == 1 + assert len(min_val.shape) == 1 + assert_almost_equal(max_val[0], 5, decimal=2) + assert_almost_equal(min_val[0], -5, decimal=2) if input_dtype == np.float16: feats = np.random.randn(100).astype(input_dtype) fifo = np.finfo(np.float32) - feats[0] = fifo.max - feats[1] = -fifo.max + max_v = np.max(feats) + 10 + min_v= np.min(feats) - 10 + feats[0] = max_v + feats[1] = min_v transform = NumericalMinMaxTransform("test", "test", out_dtype=input_dtype) max_val, min_val = transform.pre_process(feats)["test"] @@ -77,13 +109,15 @@ def test_fp_min_max_bound(input_dtype): assert len(min_val.shape) == 1 assert max_val[0].dtype == np.float16 assert min_val[0].dtype == np.float16 - assert_equal(max_val[0], np.finfo(np.float16).max) - assert_equal(min_val[0], -np.finfo(np.float16).max) + assert_almost_equal(max_val[0], max_v, decimal=2) + assert_almost_equal(min_val[0], min_v, decimal=2) feats = np.random.randn(100).astype(input_dtype) fifo = np.finfo(np.float32) - feats[0] = fifo.max - feats[1] = -fifo.max + max_v = np.max(feats) + 10 + min_v= np.min(feats) - 10 + feats[0] = max_v + feats[1] = min_v transform = NumericalMinMaxTransform("test", "test", max_bound=fifo.max, min_bound=-fifo.max, @@ -95,8 +129,8 @@ def test_fp_min_max_bound(input_dtype): assert len(min_val.shape) == 1 assert max_val[0].dtype == np.float16 assert min_val[0].dtype == np.float16 - assert_equal(max_val[0], np.finfo(np.float16).max) - assert_equal(min_val[0], -np.finfo(np.float16).max) + assert_almost_equal(max_val[0], max_v, decimal=2) + assert_almost_equal(min_val[0], min_v, decimal=2) @pytest.mark.parametrize("input_dtype", [np.cfloat, np.float32]) @@ -532,6 +566,15 @@ def test_noop_transform(out_dtype): else: assert norm_feats["test"].dtype == np.float32 + # invalid input + feats[0] = np.nan + with assert_raises(AssertionError): + _ = transform(feats) + + feats[0] = np.inf + with assert_raises(AssertionError): + _ = transform(feats) + def test_noop_truncate(): transform = Noop("test", "test", truncate_dim=16) feats = np.random.randn(100, 32).astype(np.float32) @@ -608,6 +651,15 @@ def rank_gauss(feat): assert trans_feat.dtype != np.float16 assert_almost_equal(feat, trans_feat, decimal=4) + # inpuu is invalid + feat_0[0] = np.nan + with assert_raises(AssertionError): + _ = transform(feat_0) + + feat_0[0] = np.inf + with assert_raises(AssertionError): + _ = transform(feat_0) + def test_custom_node_label_processor(): train_idx = np.arange(0, 10) val_idx = np.arange(10, 15) @@ -1009,6 +1061,15 @@ def test_bucket_transform(out_dtype): feats_tar = np.array([[1, 1], [1, 1], [1, 1], [1, 1]], dtype=out_dtype) assert_equal(bucket_feats['test'], feats_tar) + # invalid input + feats = np.array([1, 10, 20, np.nan]) + with assert_raises(AssertionError): + bucket_feats = transform(feats) + + feats = np.array([1, 10, 20, np.inf]) + with assert_raises(AssertionError): + bucket_feats = transform(feats) + @pytest.mark.parametrize("id_dtype", [str, np.int64]) def test_hard_edge_dst_negative_transform(id_dtype): hard_neg_trasnform = HardEdgeDstNegativeTransform("hard_neg", "hard_neg") @@ -1218,6 +1279,15 @@ def test_standard_pre_process(input_dtype): assert_almost_equal(conf["sum"], info[0]+info[1]) + # input is invalid + feats0[0] = np.nan + with assert_raises(AssertionError): + _ = transform.pre_process(feats0) + + feats0[0] = np.inf + with assert_raises(AssertionError): + _ = transform.pre_process(feats0) + # array sum is zero info = [np.array([-1]), np.array([1])] transform = NumericalStandardTransform("test", "test") @@ -1230,6 +1300,7 @@ def test_standard_pre_process(input_dtype): with assert_raises(AssertionError): transform.update_info(info) + @pytest.mark.parametrize("input_dtype", [np.cfloat, np.float32]) def test_standard_transform(input_dtype): np.random.seed(0) From 7bb0c348000512df1d2091cab2402a1deb174fed Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Thu, 13 Feb 2025 12:07:33 -0800 Subject: [PATCH 06/10] Update --- docs/source/cli/graph-construction/single-machine-gconstruct.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/cli/graph-construction/single-machine-gconstruct.rst b/docs/source/cli/graph-construction/single-machine-gconstruct.rst index 22f35e691f..39f70be5a7 100644 --- a/docs/source/cli/graph-construction/single-machine-gconstruct.rst +++ b/docs/source/cli/graph-construction/single-machine-gconstruct.rst @@ -36,6 +36,7 @@ Full argument list of the ``gconstruct.construct_graph`` command * **-\-graph-name**: (**Required**) the name assigned for the graph. The graph name must adhere to the `Python identifier naming rules`_ with the exception that hyphens (``-``) are permitted and the name can start with numbers. * **-\-remap-node-id**: boolean value to decide whether to rename node IDs or not. Adding this argument will set it to be true, otherwise false. * **-\-add-reverse-edges**: boolean value to decide whether to add reverse edges for the given graph. Adding this argument sets it to true; otherwise, it defaults to false. It is **strongly** suggested to include this argument for graph construction, as some nodes in the original data may not have in-degrees, and thus cannot update their presentations by aggregating messages from their neighbors. Adding this arugment helps prevent this issue. +* **-\-no-feature-validate**: Turn off the feature validation. By default, GraphStorm will check whether the input features include ``NaN`` or ``Inf``. By turning this validation process off, you can speedup feature transformation. * **-\-output-format**: the format of constructed graph, options are ``DGL``, ``DistDGL``. Default is ``DistDGL``. It also accepts multiple graph formats at the same time separated by an space, for example ``--output-format "DGL DistDGL"``. The output format is explained in the :ref:`Output ` section above. * **-\-num-parts**: an integer value that specifies the number of graph partitions to produce. This is only valid if the output format is ``DistDGL``. * **-\-part-method**: the partition method to use during partitioning. We support 'metis' or 'random'. From c16718c0a6a1e73ae7843f4b1b017f3a470872c1 Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Thu, 13 Feb 2025 12:17:26 -0800 Subject: [PATCH 07/10] Update --- python/graphstorm/gconstruct/transform.py | 4 +-- .../gconstruct/test_gconstruct_utils.py | 34 ++++++++++++++++++- 2 files changed, 34 insertions(+), 4 deletions(-) diff --git a/python/graphstorm/gconstruct/transform.py b/python/graphstorm/gconstruct/transform.py index d23fef9446..827c1c3280 100644 --- a/python/graphstorm/gconstruct/transform.py +++ b/python/graphstorm/gconstruct/transform.py @@ -418,10 +418,8 @@ def call(self, feats): f"within numerical value." if isinstance(feats, ExtMemArrayWrapper): feats = feats.to_numpy() - assert np.issubdtype(feats.dtype, np.integer) \ - or np.issubdtype(feats.dtype, np.floating), \ - f"The feature {self.feat_name} has to be integers or floats." + feats = self.feat2numerical(feats) if validate_features(): assert validate_numerical_feats(feats), \ f"There are NaN, Inf or missing value in the {self.feat_name} feature." diff --git a/tests/unit-tests/gconstruct/test_gconstruct_utils.py b/tests/unit-tests/gconstruct/test_gconstruct_utils.py index 770445b5dd..b92fc36459 100644 --- a/tests/unit-tests/gconstruct/test_gconstruct_utils.py +++ b/tests/unit-tests/gconstruct/test_gconstruct_utils.py @@ -34,7 +34,10 @@ from graphstorm.gconstruct.utils import (save_maps, load_maps, get_hard_edge_negs_feats, - shuffle_hard_nids) + shuffle_hard_nids, + validate_features, + stop_validate_features, + validate_numerical_feats) from graphstorm.gconstruct.file_io import (write_data_hdf5, read_data_hdf5, get_in_files, @@ -518,7 +521,36 @@ def test_single_directory_expansion(): expected_files = sorted([os.path.join(temp_dir, f) for f in test_files]) assert sorted(result) == expected_files +def test_validate_features(): + assert validate_features() + stop_validate_features() + assert validate_features() is False + +def test_validate_numerical_feats(): + array = np.array([1,2,3]) + assert validate_numerical_feats(array) is True + + array = np.array([1,2,np.inf]) + assert validate_numerical_feats(array) is False + + array = np.array([1,2,np.nan]) + assert validate_numerical_feats(array) is False + + array = np.array([[1,2,np.nan], + [1,2,np.inf]]) + assert validate_numerical_feats(array) is False + + array = np.array([[1,2,3], + [1,2,np.inf]]) + assert validate_numerical_feats(array) is False + + array = np.array([[1,2,3], + [1,2,np.nan]]) + assert validate_numerical_feats(array) is False + if __name__ == '__main__': + test_validate_numerical_feats() + test_validate_features() test_shuffle_hard_nids() test_save_load_maps() test_get_hard_edge_negs_feats() From 861bf07347019cdcda8f6fcb665367dc4984c870 Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Thu, 13 Feb 2025 12:20:28 -0800 Subject: [PATCH 08/10] Update --- python/graphstorm/gconstruct/transform.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/graphstorm/gconstruct/transform.py b/python/graphstorm/gconstruct/transform.py index 827c1c3280..d55bfaa69c 100644 --- a/python/graphstorm/gconstruct/transform.py +++ b/python/graphstorm/gconstruct/transform.py @@ -297,6 +297,8 @@ def feat2numerical(self, feats): ------ np.array: the cast feature. """ + # Only allow integers and floating points + # Do not allow booleans. if not (np.issubdtype(feats.dtype, np.integer) \ or np.issubdtype(feats.dtype, np.floating)): logging.warning("The feature %s has to be floating points or integers," From b48535d9c9f64faa6b807aa9edc7e4dc112edfd9 Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Thu, 13 Feb 2025 14:47:15 -0800 Subject: [PATCH 09/10] update --- python/graphstorm/gconstruct/transform.py | 1 - tests/unit-tests/gconstruct/test_transform.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/python/graphstorm/gconstruct/transform.py b/python/graphstorm/gconstruct/transform.py index d55bfaa69c..600b9e9b61 100644 --- a/python/graphstorm/gconstruct/transform.py +++ b/python/graphstorm/gconstruct/transform.py @@ -602,7 +602,6 @@ def pre_process(self, feats): # It will load all data into main memory. feats = feats.to_numpy() feats = self.feat2numerical(feats) - assert validate_numerical_feats(feats), \ f"There are NaN, Inf or missing value in the {self.feat_name} feature." diff --git a/tests/unit-tests/gconstruct/test_transform.py b/tests/unit-tests/gconstruct/test_transform.py index 6bba7451d2..e7f179a1f1 100644 --- a/tests/unit-tests/gconstruct/test_transform.py +++ b/tests/unit-tests/gconstruct/test_transform.py @@ -55,11 +55,11 @@ def test_fp_min_max_bound(input_dtype): # test invalid inputs feats[0] = np.nan with assert_raises(AssertionError): - _ = transform.pre_process(feats) + _ = transform.pre_process(feats.astype(input_dtype)) feats[0] = np.inf with assert_raises(AssertionError): - _ = transform.pre_process(feats) + _ = transform.pre_process(feats.astype(input_dtype)) # without predefined bound. feats = np.random.randn(100).astype(input_dtype) From 9027d7b52d543b55c2290b0733eee4cc0461ef7a Mon Sep 17 00:00:00 2001 From: Xiang Song Date: Thu, 13 Feb 2025 15:18:18 -0800 Subject: [PATCH 10/10] Fix unitest --- tests/unit-tests/gconstruct/test_gconstruct_utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/unit-tests/gconstruct/test_gconstruct_utils.py b/tests/unit-tests/gconstruct/test_gconstruct_utils.py index b92fc36459..771103ebe1 100644 --- a/tests/unit-tests/gconstruct/test_gconstruct_utils.py +++ b/tests/unit-tests/gconstruct/test_gconstruct_utils.py @@ -24,6 +24,7 @@ import pyarrow.parquet as pq import pyarrow as pa import torch as th +import graphstorm from numpy.testing import assert_almost_equal @@ -525,6 +526,8 @@ def test_validate_features(): assert validate_features() stop_validate_features() assert validate_features() is False + # set VALIDATE_FEATRE back + graphstorm.gconstruct.utils.VALIDATE_FEATRE = True def test_validate_numerical_feats(): array = np.array([1,2,3])