From 63024e7bbc59895b5b8bebc3c6429750236eb03e Mon Sep 17 00:00:00 2001 From: Shaun <124687868+shaunyogeshwaran@users.noreply.github.com> Date: Wed, 3 Jan 2024 22:53:37 +0530 Subject: [PATCH 1/6] ms-examples-docs --- .../bin/custom/python/gen_modelselection.py | 164 ++++++++++++++++++ 1 file changed, 164 insertions(+) diff --git a/h2o-bindings/bin/custom/python/gen_modelselection.py b/h2o-bindings/bin/custom/python/gen_modelselection.py index 9fea32cb5002..d1b430b79bbc 100644 --- a/h2o-bindings/bin/custom/python/gen_modelselection.py +++ b/h2o-bindings/bin/custom/python/gen_modelselection.py @@ -225,3 +225,167 @@ def get_best_model_predictors(self): mode=maxr, the model returned is no longer guaranteed to have the best R2 value. """ ) + +examples = dict( + build_glm_model=""" +>>> import h2o +>>> from h2o.estimators import H2OModelSelectionEstimator +>>> h2o.init() +>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") +>>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] +>>> response = "GLEASON" +>>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, seed=12345, mode="maxr", build_glm_model=False) +>>> maxrModel.train(x=predictors, y=response, training_frame=prostate) +>>> results = maxrModel.result() +>>> print(results) +>>> coeff = maxrModel.coef() +>>> print(coeff) +>>> coeff3 = maxrModel.coef(3) +>>> print(coeff3) +>>> coeff_norm = maxrModel.coef_norm() +>>> print(coeff_norm) +>>> coeff_norm3 = maxrModel.coef_norm(3) +>>> print(coeff_norm3) +>>> maxrModel.get_predictors_added_per_step() +>>> bwModel = H2OModelSelectionEstimator(max_predictor_number=3, seed=12345, mode="backward", build_glm_model=False) +>>> bwModel.train(x=predictors, y=response, training_frame=prostate) +""", + influence=""" +>>> import h2o +>>> from h2o.estimators import H2OModelSelectionEstimator +>>> h2o.init() +>>> prostate = 
h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") +>>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] +>>> response = "GLEASON" +>>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, seed=12345, mode="maxr", influence="dfbetas") +>>> maxrModel.train(x=predictors, y=response, training_frame=prostate) +>>> results = maxrModel.result() +>>> print(results) +>>> coeff = maxrModel.coef() +>>> print(coeff) +>>> coeff3 = maxrModel.coef(3) +>>> print(coeff3) +>>> coeff_norm = maxrModel.coef_norm() +>>> print(coeff_norm) +>>> coeff_norm3 = maxrModel.coef_norm(3) +>>> print(coeff_norm3) +>>> maxrModel.get_predictors_added_per_step() +>>> bwModel = H2OModelSelectionEstimator(max_predictor_number=3, seed=12345, mode="backward", influence="dfbetas") +>>> bwModel.train(x=predictors, y=response, training_frame=prostate) +""", + multinode_mode=""" +>>> import h2o +>>> from h2o.estimators import H2OModelSelectionEstimator +>>> h2o.init() +>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") +>>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] +>>> response = "GLEASON" +>>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, seed=12345, mode="maxr", multinode_mode=False) +>>> maxrModel.train(x=predictors, y=response, training_frame=prostate) +>>> results = maxrModel.result() +>>> print(results) +>>> coeff = maxrModel.coef() +>>> print(coeff) +>>> coeff3 = maxrModel.coef(3) +>>> print(coeff3) +>>> coeff_norm = maxrModel.coef_norm() +>>> print(coeff_norm) +>>> coeff_norm3 = maxrModel.coef_norm(3) +>>> print(coeff_norm3) +>>> maxrModel.get_predictors_added_per_step() +>>> bwModel = H2OModelSelectionEstimator(max_predictor_number=3, seed=12345, mode="backward", multinode_mode=False) +>>> bwModel.train(x=predictors, y=response, training_frame=prostate) +""", + nparallelism=""" +>>> import h2o +>>> from h2o.estimators 
import H2OModelSelectionEstimator +>>> h2o.init() +>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") +>>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] +>>> response = "GLEASON" +>>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, seed=12345, mode="maxr", nparallelism=0) +>>> maxrModel.train(x=predictors, y=response, training_frame=prostate) +>>> results = maxrModel.result() +>>> print(results) +>>> coeff = maxrModel.coef() +>>> print(coeff) +>>> coeff3 = maxrModel.coef(3) +>>> print(coeff3) +>>> coeff_norm = maxrModel.coef_norm() +>>> print(coeff_norm) +>>> coeff_norm3 = maxrModel.coef_norm(3) +>>> print(coeff_norm3) +>>> maxrModel.get_predictors_added_per_step() +>>> bwModel = H2OModelSelectionEstimator(max_predictor_number=3, seed=12345, mode="backward", nparallelism=0) +>>> bwModel.train(x=predictors, y=response, training_frame=prostate) +""", + p_values_threshold=""" +>>> import h2o +>>> from h2o.estimators import H2OModelSelectionEstimator +>>> h2o.init() +>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") +>>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] +>>> response = "GLEASON" +>>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, seed=12345, mode="maxr", p_values_threshold=0.0) +>>> maxrModel.train(x=predictors, y=response, training_frame=prostate) +>>> results = maxrModel.result() +>>> print(results) +>>> coeff = maxrModel.coef() +>>> print(coeff) +>>> coeff3 = maxrModel.coef(3) +>>> print(coeff3) +>>> coeff_norm = maxrModel.coef_norm() +>>> print(coeff_norm) +>>> coeff_norm3 = maxrModel.coef_norm(3) +>>> print(coeff_norm3) +>>> maxrModel.get_predictors_added_per_step() +>>> bwModel = H2OModelSelectionEstimator(max_predictor_number=3, seed=12345, mode="backward", p_values_threshold=0.0) +>>> bwModel.train(x=predictors, y=response, training_frame=prostate) 
+""", + custom_metric_func=""" +>>> import h2o +>>> from h2o.estimators import H2OModelSelectionEstimator +>>> h2o.init() +>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") +>>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] +>>> response = "GLEASON" +>>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, seed=12345, mode="maxr", early_stopping=False) +>>> maxrModel.train(x=predictors, y=response, training_frame=prostate) +>>> results = maxrModel.result() +>>> print(results) +>>> coeff = maxrModel.coef() +>>> print(coeff) +>>> coeff3 = maxrModel.coef(3) +>>> print(coeff3) +>>> coeff_norm = maxrModel.coef_norm() +>>> print(coeff_norm) +>>> coeff_norm3 = maxrModel.coef_norm(3) +>>> print(coeff_norm3) +>>> maxrModel.get_predictors_added_per_step() +>>> bwModel = H2OModelSelectionEstimator(max_predictor_number=3, seed=12345, mode="backward", early_stopping=False) +>>> bwModel.train(x=predictors, y=response, training_frame=prostate) +""", + obj_reg=""" +>>> import h2o +>>> from h2o.estimators import H2OModelSelectionEstimator +>>> h2o.init() +>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") +>>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] +>>> response = "GLEASON" +>>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, seed=12345, mode="maxr", obj_reg=-1.0) +>>> maxrModel.train(x=predictors, y=response, training_frame=prostate) +>>> results = maxrModel.result() +>>> print(results) +>>> coeff = maxrModel.coef() +>>> print(coeff) +>>> coeff3 = maxrModel.coef(3) +>>> print(coeff3) +>>> coeff_norm = maxrModel.coef_norm() +>>> print(coeff_norm) +>>> coeff_norm3 = maxrModel.coef_norm(3) +>>> print(coeff_norm3) +>>> maxrModel.get_predictors_added_per_step() +>>> bwModel = H2OModelSelectionEstimator(max_predictor_number=3, seed=12345, mode="backward", obj_reg=-1.0) +>>> 
bwModel.train(x=predictors, y=response, training_frame=prostate) +""", +) \ No newline at end of file From 9f1e7f5fd84571290c0d9c816fc5f8b45af65272 Mon Sep 17 00:00:00 2001 From: Hannah Tillman Date: Fri, 12 Jan 2024 13:41:20 -0600 Subject: [PATCH 2/6] ht/built gradle, trimmed examples, added coef/coef_norm/mode + fly-by result fix --- .../bin/custom/python/gen_modelselection.py | 110 ++++++------- h2o-py/h2o/estimators/model_selection.py | 151 ++++++++++++++++++ 2 files changed, 197 insertions(+), 64 deletions(-) diff --git a/h2o-bindings/bin/custom/python/gen_modelselection.py b/h2o-bindings/bin/custom/python/gen_modelselection.py index d1b430b79bbc..23c21ff0c2fe 100644 --- a/h2o-bindings/bin/custom/python/gen_modelselection.py +++ b/h2o-bindings/bin/custom/python/gen_modelselection.py @@ -42,6 +42,21 @@ def coef_norm(self, predictor_size=None): :param predictor_size: predictor subset size, will only return model coefficients of that subset size. :return: list of Python Dicts of coefficients for all models built with different predictor numbers + + :examples: + + >>> import h2o + >>> from h2o.estimators import H2OModelSelectionEstimator + >>> h2o.init() + >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") + >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] + >>> response = "GLEASON" + >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, seed=12345, mode="maxr") + >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) + >>> results = maxrModel.result() + >>> print(results) + >>> coeff_norm = maxrModel.coef_norm() + >>> print(coeff_norm) """ model_ids = self._model_json["output"]["best_model_ids"] if not(self.actual_params["build_glm_model"]) and self.actual_params["mode"]=="maxrsweep": @@ -95,6 +110,21 @@ def coef(self, predictor_size=None): :param predictor_size: predictor subset size, will only return model coefficients of that subset size. 
:return: list of Python Dicts of coefficients for all models built with different predictor numbers + + :examples: + + >>> import h2o + >>> from h2o.estimators import H2OModelSelectionEstimator + >>> h2o.init() + >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") + >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] + >>> response = "GLEASON" + >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, seed=12345, mode="maxr") + >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) + >>> results = maxrModel.result() + >>> print(results) + >>> coeff = maxrModel.coef() + >>> print(coeff) """ if not self.actual_params["build_glm_model"] and self.actual_params["mode"]=="maxrsweep": coef_names = self._model_json["output"]["coefficient_names"] @@ -148,6 +178,7 @@ def coef(self, predictor_size=None): def result(self): """ Get result frame that contains information about the model building process like for modelselection and anovaglm. + :return: the H2OFrame that contains information about the model building process like for modelselection and anovaglm. 
""" return H2OFrame._expr(expr=ExprNode("result", ASTId(self.key)))._frame(fill_cache=True) @@ -240,15 +271,6 @@ def get_best_model_predictors(self): >>> print(results) >>> coeff = maxrModel.coef() >>> print(coeff) ->>> coeff3 = maxrModel.coef(3) ->>> print(coeff3) ->>> coeff_norm = maxrModel.coef_norm() ->>> print(coeff_norm) ->>> coeff_norm3 = maxrModel.coef_norm(3) ->>> print(coeff_norm3) ->>> maxrModel.get_predictors_added_per_step() ->>> bwModel = H2OModelSelectionEstimator(max_predictor_number=3, seed=12345, mode="backward", build_glm_model=False) ->>> bwModel.train(x=predictors, y=response, training_frame=prostate) """, influence=""" >>> import h2o @@ -263,15 +285,6 @@ def get_best_model_predictors(self): >>> print(results) >>> coeff = maxrModel.coef() >>> print(coeff) ->>> coeff3 = maxrModel.coef(3) ->>> print(coeff3) ->>> coeff_norm = maxrModel.coef_norm() ->>> print(coeff_norm) ->>> coeff_norm3 = maxrModel.coef_norm(3) ->>> print(coeff_norm3) ->>> maxrModel.get_predictors_added_per_step() ->>> bwModel = H2OModelSelectionEstimator(max_predictor_number=3, seed=12345, mode="backward", influence="dfbetas") ->>> bwModel.train(x=predictors, y=response, training_frame=prostate) """, multinode_mode=""" >>> import h2o @@ -286,15 +299,6 @@ def get_best_model_predictors(self): >>> print(results) >>> coeff = maxrModel.coef() >>> print(coeff) ->>> coeff3 = maxrModel.coef(3) ->>> print(coeff3) ->>> coeff_norm = maxrModel.coef_norm() ->>> print(coeff_norm) ->>> coeff_norm3 = maxrModel.coef_norm(3) ->>> print(coeff_norm3) ->>> maxrModel.get_predictors_added_per_step() ->>> bwModel = H2OModelSelectionEstimator(max_predictor_number=3, seed=12345, mode="backward", multinode_mode=False) ->>> bwModel.train(x=predictors, y=response, training_frame=prostate) """, nparallelism=""" >>> import h2o @@ -309,15 +313,6 @@ def get_best_model_predictors(self): >>> print(results) >>> coeff = maxrModel.coef() >>> print(coeff) ->>> coeff3 = maxrModel.coef(3) ->>> print(coeff3) ->>> 
coeff_norm = maxrModel.coef_norm() ->>> print(coeff_norm) ->>> coeff_norm3 = maxrModel.coef_norm(3) ->>> print(coeff_norm3) ->>> maxrModel.get_predictors_added_per_step() ->>> bwModel = H2OModelSelectionEstimator(max_predictor_number=3, seed=12345, mode="backward", nparallelism=0) ->>> bwModel.train(x=predictors, y=response, training_frame=prostate) """, p_values_threshold=""" >>> import h2o @@ -332,15 +327,6 @@ def get_best_model_predictors(self): >>> print(results) >>> coeff = maxrModel.coef() >>> print(coeff) ->>> coeff3 = maxrModel.coef(3) ->>> print(coeff3) ->>> coeff_norm = maxrModel.coef_norm() ->>> print(coeff_norm) ->>> coeff_norm3 = maxrModel.coef_norm(3) ->>> print(coeff_norm3) ->>> maxrModel.get_predictors_added_per_step() ->>> bwModel = H2OModelSelectionEstimator(max_predictor_number=3, seed=12345, mode="backward", p_values_threshold=0.0) ->>> bwModel.train(x=predictors, y=response, training_frame=prostate) """, custom_metric_func=""" >>> import h2o @@ -355,15 +341,6 @@ def get_best_model_predictors(self): >>> print(results) >>> coeff = maxrModel.coef() >>> print(coeff) ->>> coeff3 = maxrModel.coef(3) ->>> print(coeff3) ->>> coeff_norm = maxrModel.coef_norm() ->>> print(coeff_norm) ->>> coeff_norm3 = maxrModel.coef_norm(3) ->>> print(coeff_norm3) ->>> maxrModel.get_predictors_added_per_step() ->>> bwModel = H2OModelSelectionEstimator(max_predictor_number=3, seed=12345, mode="backward", early_stopping=False) ->>> bwModel.train(x=predictors, y=response, training_frame=prostate) """, obj_reg=""" >>> import h2o @@ -378,14 +355,19 @@ def get_best_model_predictors(self): >>> print(results) >>> coeff = maxrModel.coef() >>> print(coeff) ->>> coeff3 = maxrModel.coef(3) ->>> print(coeff3) ->>> coeff_norm = maxrModel.coef_norm() ->>> print(coeff_norm) ->>> coeff_norm3 = maxrModel.coef_norm(3) ->>> print(coeff_norm3) ->>> maxrModel.get_predictors_added_per_step() ->>> bwModel = H2OModelSelectionEstimator(max_predictor_number=3, seed=12345, mode="backward", 
obj_reg=-1.0) ->>> bwModel.train(x=predictors, y=response, training_frame=prostate) """, -) \ No newline at end of file + mode=""" +>>> import h2o +>>> from h2o.estimators import H2OModelSelectionEstimator +>>> h2o.init() +>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") +>>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] +>>> response = "GLEASON" +>>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, seed=12345, mode="maxr") +>>> maxrModel.train(x=predictors, y=response, training_frame=prostate) +>>> results = maxrModel.result() +>>> print(results) +>>> coeff = maxrModel.coef() +>>> print(coeff) +""" +) diff --git a/h2o-py/h2o/estimators/model_selection.py b/h2o-py/h2o/estimators/model_selection.py index 564a8963e3e2..a9fc43360b99 100644 --- a/h2o-py/h2o/estimators/model_selection.py +++ b/h2o-py/h2o/estimators/model_selection.py @@ -998,6 +998,21 @@ def obj_reg(self): Likelihood divider in objective value computation, default (of -1.0) will set it to 1/nobs Type: ``float``, defaults to ``-1.0``. + + :examples: + + >>> import h2o + >>> from h2o.estimators import H2OModelSelectionEstimator + >>> h2o.init() + >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") + >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] + >>> response = "GLEASON" + >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, seed=12345, mode="maxr", obj_reg=-1.0) + >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) + >>> results = maxrModel.result() + >>> print(results) + >>> coeff = maxrModel.coef() + >>> print(coeff) """ return self._parms.get("obj_reg") @@ -1130,6 +1145,21 @@ def custom_metric_func(self): Reference to custom evaluation function, format: `language:keyName=funcName` Type: ``str``. 
+ + :examples: + + >>> import h2o + >>> from h2o.estimators import H2OModelSelectionEstimator + >>> h2o.init() + >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") + >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] + >>> response = "GLEASON" + >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, seed=12345, mode="maxr", early_stopping=False) + >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) + >>> results = maxrModel.result() + >>> print(results) + >>> coeff = maxrModel.coef() + >>> print(coeff) """ return self._parms.get("custom_metric_func") @@ -1144,6 +1174,21 @@ def nparallelism(self): number of models to build in parallel. Defaults to 0.0 which is adaptive to the system capability Type: ``int``, defaults to ``0``. + + :examples: + + >>> import h2o + >>> from h2o.estimators import H2OModelSelectionEstimator + >>> h2o.init() + >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") + >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] + >>> response = "GLEASON" + >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, seed=12345, mode="maxr", nparallelism=0) + >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) + >>> results = maxrModel.result() + >>> print(results) + >>> coeff = maxrModel.coef() + >>> print(coeff) """ return self._parms.get("nparallelism") @@ -1190,6 +1235,21 @@ def mode(self): than 'maxr', 'backward' for backward selection. Type: ``Literal["allsubsets", "maxr", "maxrsweep", "backward"]``, defaults to ``"maxr"``. 
+ + :examples: + + >>> import h2o + >>> from h2o.estimators import H2OModelSelectionEstimator + >>> h2o.init() + >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") + >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] + >>> response = "GLEASON" + >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, seed=12345, mode="maxr") + >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) + >>> results = maxrModel.result() + >>> print(results) + >>> coeff = maxrModel.coef() + >>> print(coeff) """ return self._parms.get("mode") @@ -1207,6 +1267,21 @@ def build_glm_model(self): themselves. Defaults to false. Type: ``bool``, defaults to ``False``. + + :examples: + + >>> import h2o + >>> from h2o.estimators import H2OModelSelectionEstimator + >>> h2o.init() + >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") + >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] + >>> response = "GLEASON" + >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, seed=12345, mode="maxr", build_glm_model=False) + >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) + >>> results = maxrModel.result() + >>> print(results) + >>> coeff = maxrModel.coef() + >>> print(coeff) """ return self._parms.get("build_glm_model") @@ -1222,6 +1297,21 @@ def p_values_threshold(self): below this threshold Type: ``float``, defaults to ``0.0``. 
+ + :examples: + + >>> import h2o + >>> from h2o.estimators import H2OModelSelectionEstimator + >>> h2o.init() + >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") + >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] + >>> response = "GLEASON" + >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, seed=12345, mode="maxr", p_values_threshold=0.0) + >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) + >>> results = maxrModel.result() + >>> print(results) + >>> coeff = maxrModel.coef() + >>> print(coeff) """ return self._parms.get("p_values_threshold") @@ -1236,6 +1326,21 @@ def influence(self): If set to dfbetas will calculate the difference in beta when a datarow is included and excluded in the dataset. Type: ``Literal["dfbetas"]``. + + :examples: + + >>> import h2o + >>> from h2o.estimators import H2OModelSelectionEstimator + >>> h2o.init() + >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") + >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] + >>> response = "GLEASON" + >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, seed=12345, mode="maxr", influence="dfbetas") + >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) + >>> results = maxrModel.result() + >>> print(results) + >>> coeff = maxrModel.coef() + >>> print(coeff) """ return self._parms.get("influence") @@ -1251,6 +1356,21 @@ def multinode_mode(self): Defaults to false. Type: ``bool``, defaults to ``False``. 
+ + :examples: + + >>> import h2o + >>> from h2o.estimators import H2OModelSelectionEstimator + >>> h2o.init() + >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") + >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] + >>> response = "GLEASON" + >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, seed=12345, mode="maxr", multinode_mode=False) + >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) + >>> results = maxrModel.result() + >>> print(results) + >>> coeff = maxrModel.coef() + >>> print(coeff) """ return self._parms.get("multinode_mode") @@ -1303,6 +1423,21 @@ def coef_norm(self, predictor_size=None): :param predictor_size: predictor subset size, will only return model coefficients of that subset size. :return: list of Python Dicts of coefficients for all models built with different predictor numbers + + :examples: + + >>> import h2o + >>> from h2o.estimators import H2OModelSelectionEstimator + >>> h2o.init() + >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") + >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] + >>> response = "GLEASON" + >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, seed=12345, mode="maxr") + >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) + >>> results = maxrModel.result() + >>> print(results) + >>> coeff_norm = maxrModel.coef_norm() + >>> print(coeff_norm) """ model_ids = self._model_json["output"]["best_model_ids"] if not(self.actual_params["build_glm_model"]) and self.actual_params["mode"]=="maxrsweep": @@ -1356,6 +1491,21 @@ def coef(self, predictor_size=None): :param predictor_size: predictor subset size, will only return model coefficients of that subset size. 
:return: list of Python Dicts of coefficients for all models built with different predictor numbers + + :examples: + + >>> import h2o + >>> from h2o.estimators import H2OModelSelectionEstimator + >>> h2o.init() + >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") + >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] + >>> response = "GLEASON" + >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, seed=12345, mode="maxr") + >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) + >>> results = maxrModel.result() + >>> print(results) + >>> coeff = maxrModel.coef() + >>> print(coeff) """ if not self.actual_params["build_glm_model"] and self.actual_params["mode"]=="maxrsweep": coef_names = self._model_json["output"]["coefficient_names"] @@ -1409,6 +1559,7 @@ def coef(self, predictor_size=None): def result(self): """ Get result frame that contains information about the model building process like for modelselection and anovaglm. + :return: the H2OFrame that contains information about the model building process like for modelselection and anovaglm. 
""" return H2OFrame._expr(expr=ExprNode("result", ASTId(self.key)))._frame(fill_cache=True) From 195bc7155a2d11a6edf660df997592e748a6c73c Mon Sep 17 00:00:00 2001 From: Hannah Tillman Date: Thu, 18 Jan 2024 11:37:20 -0600 Subject: [PATCH 3/6] ht/spacing update --- .../bin/custom/python/gen_modelselection.py | 47 +++++++++++++++---- h2o-py/h2o/estimators/model_selection.py | 47 +++++++++++++++---- 2 files changed, 74 insertions(+), 20 deletions(-) diff --git a/h2o-bindings/bin/custom/python/gen_modelselection.py b/h2o-bindings/bin/custom/python/gen_modelselection.py index 23c21ff0c2fe..3a65dbaf7eda 100644 --- a/h2o-bindings/bin/custom/python/gen_modelselection.py +++ b/h2o-bindings/bin/custom/python/gen_modelselection.py @@ -51,7 +51,9 @@ def coef_norm(self, predictor_size=None): >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] >>> response = "GLEASON" - >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, seed=12345, mode="maxr") + >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, + ... seed=12345, + ... mode="maxr") >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) >>> results = maxrModel.result() >>> print(results) @@ -119,7 +121,9 @@ def coef(self, predictor_size=None): >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] >>> response = "GLEASON" - >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, seed=12345, mode="maxr") + >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, + ... seed=12345, + ... 
mode="maxr") >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) >>> results = maxrModel.result() >>> print(results) @@ -265,7 +269,10 @@ def get_best_model_predictors(self): >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] >>> response = "GLEASON" ->>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, seed=12345, mode="maxr", build_glm_model=False) +>>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, +... seed=12345, +... mode="maxr", +... build_glm_model=False) >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) >>> results = maxrModel.result() >>> print(results) @@ -279,7 +286,10 @@ def get_best_model_predictors(self): >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] >>> response = "GLEASON" ->>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, seed=12345, mode="maxr", influence="dfbetas") +>>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, +... seed=12345, +... mode="maxr", +... influence="dfbetas") >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) >>> results = maxrModel.result() >>> print(results) @@ -293,7 +303,10 @@ def get_best_model_predictors(self): >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] >>> response = "GLEASON" ->>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, seed=12345, mode="maxr", multinode_mode=False) +>>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, +... seed=12345, +... mode="maxr", +... 
multinode_mode=False) >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) >>> results = maxrModel.result() >>> print(results) @@ -307,7 +320,10 @@ def get_best_model_predictors(self): >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] >>> response = "GLEASON" ->>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, seed=12345, mode="maxr", nparallelism=0) +>>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, +... seed=12345, +... mode="maxr", +... nparallelism=0) >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) >>> results = maxrModel.result() >>> print(results) @@ -321,7 +337,10 @@ def get_best_model_predictors(self): >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] >>> response = "GLEASON" ->>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, seed=12345, mode="maxr", p_values_threshold=0.0) +>>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, +... seed=12345, +... mode="maxr", +... p_values_threshold=0.0) >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) >>> results = maxrModel.result() >>> print(results) @@ -335,7 +354,10 @@ def get_best_model_predictors(self): >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] >>> response = "GLEASON" ->>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, seed=12345, mode="maxr", early_stopping=False) +>>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, +... seed=12345, +... mode="maxr", +... 
early_stopping=False) >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) >>> results = maxrModel.result() >>> print(results) @@ -349,7 +371,10 @@ def get_best_model_predictors(self): >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] >>> response = "GLEASON" ->>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, seed=12345, mode="maxr", obj_reg=-1.0) +>>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, +... seed=12345, +... mode="maxr", +... obj_reg=-1.0) >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) >>> results = maxrModel.result() >>> print(results) @@ -363,7 +388,9 @@ def get_best_model_predictors(self): >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] >>> response = "GLEASON" ->>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, seed=12345, mode="maxr") +>>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, +... seed=12345, +... 
mode="maxr") >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) >>> results = maxrModel.result() >>> print(results) diff --git a/h2o-py/h2o/estimators/model_selection.py b/h2o-py/h2o/estimators/model_selection.py index a9fc43360b99..9cec6edd680b 100644 --- a/h2o-py/h2o/estimators/model_selection.py +++ b/h2o-py/h2o/estimators/model_selection.py @@ -1007,7 +1007,10 @@ def obj_reg(self): >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] >>> response = "GLEASON" - >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, seed=12345, mode="maxr", obj_reg=-1.0) + >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, + ... seed=12345, + ... mode="maxr", + ... obj_reg=-1.0) >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) >>> results = maxrModel.result() >>> print(results) @@ -1154,7 +1157,10 @@ def custom_metric_func(self): >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] >>> response = "GLEASON" - >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, seed=12345, mode="maxr", early_stopping=False) + >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, + ... seed=12345, + ... mode="maxr", + ... 
early_stopping=False) >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) >>> results = maxrModel.result() >>> print(results) @@ -1183,7 +1189,10 @@ def nparallelism(self): >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] >>> response = "GLEASON" - >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, seed=12345, mode="maxr", nparallelism=0) + >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, + ... seed=12345, + ... mode="maxr", + ... nparallelism=0) >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) >>> results = maxrModel.result() >>> print(results) @@ -1244,7 +1253,9 @@ def mode(self): >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] >>> response = "GLEASON" - >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, seed=12345, mode="maxr") + >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, + ... seed=12345, + ... mode="maxr") >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) >>> results = maxrModel.result() >>> print(results) @@ -1276,7 +1287,10 @@ def build_glm_model(self): >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] >>> response = "GLEASON" - >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, seed=12345, mode="maxr", build_glm_model=False) + >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, + ... seed=12345, + ... mode="maxr", + ... 
build_glm_model=False) >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) >>> results = maxrModel.result() >>> print(results) @@ -1306,7 +1320,10 @@ def p_values_threshold(self): >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] >>> response = "GLEASON" - >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, seed=12345, mode="maxr", p_values_threshold=0.0) + >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, + ... seed=12345, + ... mode="maxr", + ... p_values_threshold=0.0) >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) >>> results = maxrModel.result() >>> print(results) @@ -1335,7 +1352,10 @@ def influence(self): >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] >>> response = "GLEASON" - >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, seed=12345, mode="maxr", influence="dfbetas") + >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, + ... seed=12345, + ... mode="maxr", + ... influence="dfbetas") >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) >>> results = maxrModel.result() >>> print(results) @@ -1365,7 +1385,10 @@ def multinode_mode(self): >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] >>> response = "GLEASON" - >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, seed=12345, mode="maxr", multinode_mode=False) + >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, + ... seed=12345, + ... mode="maxr", + ... 
multinode_mode=False) >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) >>> results = maxrModel.result() >>> print(results) @@ -1432,7 +1455,9 @@ def coef_norm(self, predictor_size=None): >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] >>> response = "GLEASON" - >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, seed=12345, mode="maxr") + >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, + ... seed=12345, + ... mode="maxr") >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) >>> results = maxrModel.result() >>> print(results) @@ -1500,7 +1525,9 @@ def coef(self, predictor_size=None): >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] >>> response = "GLEASON" - >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, seed=12345, mode="maxr") + >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, + ... seed=12345, + ... 
mode="maxr") >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) >>> results = maxrModel.result() >>> print(results) From 353f1c3d3f25e5231dddfa660f6bb58d9febd66b Mon Sep 17 00:00:00 2001 From: Hannah Tillman Date: Mon, 22 Jan 2024 15:33:19 -0600 Subject: [PATCH 4/6] ht/requested updates - removed obj_reg, custom_metric_func, & multinode_mode - updated coef, coef_norm, build_glm_model, influence, p_values_threshold, & mode --- .../bin/custom/python/gen_modelselection.py | 107 +++++------------ h2o-py/h2o/estimators/model_selection.py | 110 +++++------------- 2 files changed, 52 insertions(+), 165 deletions(-) diff --git a/h2o-bindings/bin/custom/python/gen_modelselection.py b/h2o-bindings/bin/custom/python/gen_modelselection.py index 3a65dbaf7eda..c0746ec803f0 100644 --- a/h2o-bindings/bin/custom/python/gen_modelselection.py +++ b/h2o-bindings/bin/custom/python/gen_modelselection.py @@ -51,14 +51,14 @@ def coef_norm(self, predictor_size=None): >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] >>> response = "GLEASON" - >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, + >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=5, ... seed=12345, ... mode="maxr") >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) - >>> results = maxrModel.result() - >>> print(results) >>> coeff_norm = maxrModel.coef_norm() >>> print(coeff_norm) + >>> coeff_norm_3 = maxrModel.coef_norm(predictor_size=3) # print coefficient norm with 3 predictors + >>> print(coeff_norm_3) """ model_ids = self._model_json["output"]["best_model_ids"] if not(self.actual_params["build_glm_model"]) and self.actual_params["mode"]=="maxrsweep": @@ -125,10 +125,10 @@ def coef(self, predictor_size=None): ... seed=12345, ... 
mode="maxr") >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) - >>> results = maxrModel.result() - >>> print(results) >>> coeff = maxrModel.coef() >>> print(coeff) + >>> coeff_3 = maxrModel.coef(predictor_size=3) + >>> print(coeff_3) """ if not self.actual_params["build_glm_model"] and self.actual_params["mode"]=="maxrsweep": coef_names = self._model_json["output"]["coefficient_names"] @@ -269,15 +269,17 @@ def get_best_model_predictors(self): >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] >>> response = "GLEASON" ->>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, +>>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=5, ... seed=12345, -... mode="maxr", -... build_glm_model=False) +... mode="maxrsweep", +... build_glm_model=True) >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) ->>> results = maxrModel.result() ->>> print(results) ->>> coeff = maxrModel.coef() ->>> print(coeff) +>>> result = maxrModel.result() +>>> # get the GLM model with the best performance for a fixed predictor size: +>>> one_model = h2o.get_model(result["model_id"][ind,0]) +>>> predict = one_model.predict(prostate) +>>> # print a version of the predict frame: +>>> print(predict) """, influence=""" >>> import h2o @@ -286,32 +288,13 @@ def get_best_model_predictors(self): >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] >>> response = "GLEASON" ->>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, +>>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=5, ... seed=12345, ... mode="maxr", ... 
influence="dfbetas") >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) ->>> results = maxrModel.result() ->>> print(results) ->>> coeff = maxrModel.coef() ->>> print(coeff) -""", - multinode_mode=""" ->>> import h2o ->>> from h2o.estimators import H2OModelSelectionEstimator ->>> h2o.init() ->>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") ->>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] ->>> response = "GLEASON" ->>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, -... seed=12345, -... mode="maxr", -... multinode_mode=False) ->>> maxrModel.train(x=predictors, y=response, training_frame=prostate) ->>> results = maxrModel.result() ->>> print(results) ->>> coeff = maxrModel.coef() ->>> print(coeff) +>>> glm_rid = maxrModel.get_regression_influence_diagnostics() +>>> print(glm_rid) """, nparallelism=""" >>> import h2o @@ -335,51 +318,15 @@ def get_best_model_predictors(self): >>> from h2o.estimators import H2OModelSelectionEstimator >>> h2o.init() >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") ->>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] +>>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] >>> response = "GLEASON" ->>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, -... seed=12345, -... mode="maxr", -... 
p_values_threshold=0.0) ->>> maxrModel.train(x=predictors, y=response, training_frame=prostate) ->>> results = maxrModel.result() ->>> print(results) ->>> coeff = maxrModel.coef() ->>> print(coeff) -""", - custom_metric_func=""" ->>> import h2o ->>> from h2o.estimators import H2OModelSelectionEstimator ->>> h2o.init() ->>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") ->>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] ->>> response = "GLEASON" ->>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, -... seed=12345, -... mode="maxr", -... early_stopping=False) ->>> maxrModel.train(x=predictors, y=response, training_frame=prostate) ->>> results = maxrModel.result() ->>> print(results) ->>> coeff = maxrModel.coef() ->>> print(coeff) -""", - obj_reg=""" ->>> import h2o ->>> from h2o.estimators import H2OModelSelectionEstimator ->>> h2o.init() ->>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") ->>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] ->>> response = "GLEASON" ->>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, -... seed=12345, -... mode="maxr", -... obj_reg=-1.0) ->>> maxrModel.train(x=predictors, y=response, training_frame=prostate) ->>> results = maxrModel.result() ->>> print(results) ->>> coeff = maxrModel.coef() ->>> print(coeff) +>>> backwardModel = H2OModelSelectionEstimator(min_predictor_number=2, +... seed=12345, +... mode="backward", +... 
p_values_threshold=0.001) +>>> backwardModel.train(x=predictors, y=response, training_frame=prostate) +>>> result = backwardModel.result() +>>> print(result) """, mode=""" >>> import h2o @@ -388,13 +335,11 @@ def get_best_model_predictors(self): >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] >>> response = "GLEASON" ->>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, +>>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=5, ... seed=12345, ... mode="maxr") >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) >>> results = maxrModel.result() >>> print(results) ->>> coeff = maxrModel.coef() ->>> print(coeff) """ ) diff --git a/h2o-py/h2o/estimators/model_selection.py b/h2o-py/h2o/estimators/model_selection.py index 9cec6edd680b..42f49d82fbbb 100644 --- a/h2o-py/h2o/estimators/model_selection.py +++ b/h2o-py/h2o/estimators/model_selection.py @@ -998,24 +998,6 @@ def obj_reg(self): Likelihood divider in objective value computation, default (of -1.0) will set it to 1/nobs Type: ``float``, defaults to ``-1.0``. - - :examples: - - >>> import h2o - >>> from h2o.estimators import H2OModelSelectionEstimator - >>> h2o.init() - >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") - >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] - >>> response = "GLEASON" - >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, - ... seed=12345, - ... mode="maxr", - ... 
obj_reg=-1.0) - >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) - >>> results = maxrModel.result() - >>> print(results) - >>> coeff = maxrModel.coef() - >>> print(coeff) """ return self._parms.get("obj_reg") @@ -1148,24 +1130,6 @@ def custom_metric_func(self): Reference to custom evaluation function, format: `language:keyName=funcName` Type: ``str``. - - :examples: - - >>> import h2o - >>> from h2o.estimators import H2OModelSelectionEstimator - >>> h2o.init() - >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") - >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] - >>> response = "GLEASON" - >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, - ... seed=12345, - ... mode="maxr", - ... early_stopping=False) - >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) - >>> results = maxrModel.result() - >>> print(results) - >>> coeff = maxrModel.coef() - >>> print(coeff) """ return self._parms.get("custom_metric_func") @@ -1253,14 +1217,12 @@ def mode(self): >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] >>> response = "GLEASON" - >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, + >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=5, ... seed=12345, ... 
mode="maxr") >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) >>> results = maxrModel.result() >>> print(results) - >>> coeff = maxrModel.coef() - >>> print(coeff) """ return self._parms.get("mode") @@ -1287,15 +1249,17 @@ def build_glm_model(self): >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] >>> response = "GLEASON" - >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, + >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=5, ... seed=12345, - ... mode="maxr", - ... build_glm_model=False) + ... mode="maxrsweep", + ... build_glm_model=True) >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) - >>> results = maxrModel.result() - >>> print(results) - >>> coeff = maxrModel.coef() - >>> print(coeff) + >>> result = maxrModel.result() + >>> # get the GLM model with the best performance for a fixed predictor size: + >>> one_model = h2o.get_model(result["model_id"][ind,0]) + >>> predict = one_model.predict(prostate) + >>> # print a version of the predict frame: + >>> print(predict) """ return self._parms.get("build_glm_model") @@ -1318,17 +1282,15 @@ def p_values_threshold(self): >>> from h2o.estimators import H2OModelSelectionEstimator >>> h2o.init() >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") - >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] + >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] >>> response = "GLEASON" - >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, - ... seed=12345, - ... mode="maxr", - ... 
p_values_threshold=0.0) - >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) - >>> results = maxrModel.result() - >>> print(results) - >>> coeff = maxrModel.coef() - >>> print(coeff) + >>> backwardModel = H2OModelSelectionEstimator(min_predictor_number=2, + ... seed=12345, + ... mode="backward", + ... p_values_threshold=0.001) + >>> backwardModel.train(x=predictors, y=response, training_frame=prostate) + >>> result = backwardModel.result() + >>> print(result) """ return self._parms.get("p_values_threshold") @@ -1352,15 +1314,13 @@ def influence(self): >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] >>> response = "GLEASON" - >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, + >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=5, ... seed=12345, ... mode="maxr", ... influence="dfbetas") >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) - >>> results = maxrModel.result() - >>> print(results) - >>> coeff = maxrModel.coef() - >>> print(coeff) + >>> glm_rid = maxrModel.get_regression_influence_diagnostics() + >>> print(glm_rid) """ return self._parms.get("influence") @@ -1376,24 +1336,6 @@ def multinode_mode(self): Defaults to false. Type: ``bool``, defaults to ``False``. - - :examples: - - >>> import h2o - >>> from h2o.estimators import H2OModelSelectionEstimator - >>> h2o.init() - >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") - >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] - >>> response = "GLEASON" - >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, - ... seed=12345, - ... mode="maxr", - ... 
multinode_mode=False) - >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) - >>> results = maxrModel.result() - >>> print(results) - >>> coeff = maxrModel.coef() - >>> print(coeff) """ return self._parms.get("multinode_mode") @@ -1455,14 +1397,14 @@ def coef_norm(self, predictor_size=None): >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] >>> response = "GLEASON" - >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, + >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=5, ... seed=12345, ... mode="maxr") >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) - >>> results = maxrModel.result() - >>> print(results) >>> coeff_norm = maxrModel.coef_norm() >>> print(coeff_norm) + >>> coeff_norm_3 = maxrModel.coef_norm(predictor_size=3) # print coefficient norm with 3 predictors + >>> print(coeff_norm_3) """ model_ids = self._model_json["output"]["best_model_ids"] if not(self.actual_params["build_glm_model"]) and self.actual_params["mode"]=="maxrsweep": @@ -1529,10 +1471,10 @@ def coef(self, predictor_size=None): ... seed=12345, ... 
mode="maxr") >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) - >>> results = maxrModel.result() - >>> print(results) >>> coeff = maxrModel.coef() >>> print(coeff) + >>> coeff_3 = maxrModel.coef(predictor_size=3) + >>> print(coeff_3) """ if not self.actual_params["build_glm_model"] and self.actual_params["mode"]=="maxrsweep": coef_names = self._model_json["output"]["coefficient_names"] From df77e3141f852f882510e14940a5c6efb32e4ce6 Mon Sep 17 00:00:00 2001 From: Hannah Tillman Date: Thu, 25 Jan 2024 13:28:10 -0600 Subject: [PATCH 5/6] ht/removed nparallelism & mpn update --- .../bin/custom/python/gen_modelselection.py | 19 +----------------- h2o-py/h2o/estimators/model_selection.py | 20 +------------------ 2 files changed, 2 insertions(+), 37 deletions(-) diff --git a/h2o-bindings/bin/custom/python/gen_modelselection.py b/h2o-bindings/bin/custom/python/gen_modelselection.py index c0746ec803f0..e1c1d4ebd3e3 100644 --- a/h2o-bindings/bin/custom/python/gen_modelselection.py +++ b/h2o-bindings/bin/custom/python/gen_modelselection.py @@ -121,7 +121,7 @@ def coef(self, predictor_size=None): >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] >>> response = "GLEASON" - >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, + >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=5, ... seed=12345, ... 
mode="maxr") >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) @@ -295,23 +295,6 @@ def get_best_model_predictors(self): >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) >>> glm_rid = maxrModel.get_regression_influence_diagnostics() >>> print(glm_rid) -""", - nparallelism=""" ->>> import h2o ->>> from h2o.estimators import H2OModelSelectionEstimator ->>> h2o.init() ->>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") ->>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] ->>> response = "GLEASON" ->>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, -... seed=12345, -... mode="maxr", -... nparallelism=0) ->>> maxrModel.train(x=predictors, y=response, training_frame=prostate) ->>> results = maxrModel.result() ->>> print(results) ->>> coeff = maxrModel.coef() ->>> print(coeff) """, p_values_threshold=""" >>> import h2o diff --git a/h2o-py/h2o/estimators/model_selection.py b/h2o-py/h2o/estimators/model_selection.py index 42f49d82fbbb..e7bf93de0227 100644 --- a/h2o-py/h2o/estimators/model_selection.py +++ b/h2o-py/h2o/estimators/model_selection.py @@ -1144,24 +1144,6 @@ def nparallelism(self): number of models to build in parallel. Defaults to 0.0 which is adaptive to the system capability Type: ``int``, defaults to ``0``. - - :examples: - - >>> import h2o - >>> from h2o.estimators import H2OModelSelectionEstimator - >>> h2o.init() - >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") - >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] - >>> response = "GLEASON" - >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, - ... seed=12345, - ... mode="maxr", - ... 
nparallelism=0) - >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) - >>> results = maxrModel.result() - >>> print(results) - >>> coeff = maxrModel.coef() - >>> print(coeff) """ return self._parms.get("nparallelism") @@ -1467,7 +1449,7 @@ def coef(self, predictor_size=None): >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] >>> response = "GLEASON" - >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=7, + >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=5, ... seed=12345, ... mode="maxr") >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) From 6a76d36b23eb9e6b52198353230c6be12eb6915f Mon Sep 17 00:00:00 2001 From: Hannah Tillman Date: Wed, 31 Jan 2024 11:36:13 -0600 Subject: [PATCH 6/6] ht/fix --- h2o-bindings/bin/custom/python/gen_modelselection.py | 2 +- h2o-py/h2o/estimators/model_selection.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/h2o-bindings/bin/custom/python/gen_modelselection.py b/h2o-bindings/bin/custom/python/gen_modelselection.py index e1c1d4ebd3e3..ee727d44e403 100644 --- a/h2o-bindings/bin/custom/python/gen_modelselection.py +++ b/h2o-bindings/bin/custom/python/gen_modelselection.py @@ -276,7 +276,7 @@ def get_best_model_predictors(self): >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) >>> result = maxrModel.result() >>> # get the GLM model with the best performance for a fixed predictor size: ->>> one_model = h2o.get_model(result["model_id"][ind,0]) +>>> one_model = h2o.get_model(result["model_id"][1, 0]) >>> predict = one_model.predict(prostate) >>> # print a version of the predict frame: >>> print(predict) diff --git a/h2o-py/h2o/estimators/model_selection.py b/h2o-py/h2o/estimators/model_selection.py index e7bf93de0227..8632b1a12fb9 100644 --- a/h2o-py/h2o/estimators/model_selection.py +++ 
b/h2o-py/h2o/estimators/model_selection.py @@ -1238,7 +1238,7 @@ def build_glm_model(self): >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) >>> result = maxrModel.result() >>> # get the GLM model with the best performance for a fixed predictor size: - >>> one_model = h2o.get_model(result["model_id"][ind,0]) + >>> one_model = h2o.get_model(result["model_id"][1, 0]) >>> predict = one_model.predict(prostate) >>> # print a version of the predict frame: >>> print(predict)