diff --git a/h2o-bindings/bin/custom/python/gen_modelselection.py b/h2o-bindings/bin/custom/python/gen_modelselection.py index 9fea32cb5002..ee727d44e403 100644 --- a/h2o-bindings/bin/custom/python/gen_modelselection.py +++ b/h2o-bindings/bin/custom/python/gen_modelselection.py @@ -42,6 +42,23 @@ def coef_norm(self, predictor_size=None): :param predictor_size: predictor subset size, will only return model coefficients of that subset size. :return: list of Python Dicts of coefficients for all models built with different predictor numbers + + :examples: + + >>> import h2o + >>> from h2o.estimators import H2OModelSelectionEstimator + >>> h2o.init() + >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") + >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] + >>> response = "GLEASON" + >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=5, + ... seed=12345, + ... mode="maxr") + >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) + >>> coeff_norm = maxrModel.coef_norm() + >>> print(coeff_norm) + >>> coeff_norm_3 = maxrModel.coef_norm(predictor_size=3) # print coefficient norm with 3 predictors + >>> print(coeff_norm_3) """ model_ids = self._model_json["output"]["best_model_ids"] if not(self.actual_params["build_glm_model"]) and self.actual_params["mode"]=="maxrsweep": @@ -95,6 +112,23 @@ def coef(self, predictor_size=None): :param predictor_size: predictor subset size, will only return model coefficients of that subset size. 
:return: list of Python Dicts of coefficients for all models built with different predictor numbers + + :examples: + + >>> import h2o + >>> from h2o.estimators import H2OModelSelectionEstimator + >>> h2o.init() + >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") + >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] + >>> response = "GLEASON" + >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=5, + ... seed=12345, + ... mode="maxr") + >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) + >>> coeff = maxrModel.coef() + >>> print(coeff) + >>> coeff_3 = maxrModel.coef(predictor_size=3) + >>> print(coeff_3) """ if not self.actual_params["build_glm_model"] and self.actual_params["mode"]=="maxrsweep": coef_names = self._model_json["output"]["coefficient_names"] @@ -148,6 +182,7 @@ def coef(self, predictor_size=None): def result(self): """ Get result frame that contains information about the model building process like for modelselection and anovaglm. + :return: the H2OFrame that contains information about the model building process like for modelselection and anovaglm. """ return H2OFrame._expr(expr=ExprNode("result", ASTId(self.key)))._frame(fill_cache=True) @@ -225,3 +260,69 @@ def get_best_model_predictors(self): mode=maxr, the model returned is no longer guaranteed to have the best R2 value. """ ) + +examples = dict( + build_glm_model=""" +>>> import h2o +>>> from h2o.estimators import H2OModelSelectionEstimator +>>> h2o.init() +>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") +>>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] +>>> response = "GLEASON" +>>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=5, +... seed=12345, +... mode="maxrsweep", +... 
build_glm_model=True) +>>> maxrModel.train(x=predictors, y=response, training_frame=prostate) +>>> result = maxrModel.result() +>>> # get the GLM model with the best performance for a fixed predictor size: +>>> one_model = h2o.get_model(result["model_id"][1, 0]) +>>> predict = one_model.predict(prostate) +>>> # print a version of the predict frame: +>>> print(predict) +""", + influence=""" +>>> import h2o +>>> from h2o.estimators import H2OModelSelectionEstimator +>>> h2o.init() +>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") +>>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] +>>> response = "GLEASON" +>>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=5, +... seed=12345, +... mode="maxr", +... influence="dfbetas") +>>> maxrModel.train(x=predictors, y=response, training_frame=prostate) +>>> glm_rid = maxrModel.get_regression_influence_diagnostics() +>>> print(glm_rid) +""", + p_values_threshold=""" +>>> import h2o +>>> from h2o.estimators import H2OModelSelectionEstimator +>>> h2o.init() +>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") +>>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] +>>> response = "GLEASON" +>>> backwardModel = H2OModelSelectionEstimator(min_predictor_number=2, +... seed=12345, +... mode="backward", +... p_values_threshold=0.001) +>>> backwardModel.train(x=predictors, y=response, training_frame=prostate) +>>> result = backwardModel.result() +>>> print(result) +""", + mode=""" +>>> import h2o +>>> from h2o.estimators import H2OModelSelectionEstimator +>>> h2o.init() +>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") +>>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] +>>> response = "GLEASON" +>>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=5, +...
seed=12345, +... mode="maxr") +>>> maxrModel.train(x=predictors, y=response, training_frame=prostate) +>>> results = maxrModel.result() +>>> print(results) +""" +) diff --git a/h2o-py/h2o/estimators/model_selection.py b/h2o-py/h2o/estimators/model_selection.py index 564a8963e3e2..8632b1a12fb9 100644 --- a/h2o-py/h2o/estimators/model_selection.py +++ b/h2o-py/h2o/estimators/model_selection.py @@ -1190,6 +1190,21 @@ def mode(self): than 'maxr', 'backward' for backward selection. Type: ``Literal["allsubsets", "maxr", "maxrsweep", "backward"]``, defaults to ``"maxr"``. + + :examples: + + >>> import h2o + >>> from h2o.estimators import H2OModelSelectionEstimator + >>> h2o.init() + >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") + >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] + >>> response = "GLEASON" + >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=5, + ... seed=12345, + ... mode="maxr") + >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) + >>> results = maxrModel.result() + >>> print(results) """ return self._parms.get("mode") @@ -1207,6 +1222,26 @@ def build_glm_model(self): themselves. Defaults to false. Type: ``bool``, defaults to ``False``. + + :examples: + + >>> import h2o + >>> from h2o.estimators import H2OModelSelectionEstimator + >>> h2o.init() + >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") + >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] + >>> response = "GLEASON" + >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=5, + ... seed=12345, + ... mode="maxrsweep", + ... 
build_glm_model=True) + >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) + >>> result = maxrModel.result() + >>> # get the GLM model with the best performance for a fixed predictor size: + >>> one_model = h2o.get_model(result["model_id"][1, 0]) + >>> predict = one_model.predict(prostate) + >>> # print a version of the predict frame: + >>> print(predict) """ return self._parms.get("build_glm_model") @@ -1222,6 +1257,22 @@ def p_values_threshold(self): below this threshold Type: ``float``, defaults to ``0.0``. + + :examples: + + >>> import h2o + >>> from h2o.estimators import H2OModelSelectionEstimator + >>> h2o.init() + >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") + >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] + >>> response = "GLEASON" + >>> backwardModel = H2OModelSelectionEstimator(min_predictor_number=2, + ... seed=12345, + ... mode="backward", + ... p_values_threshold=0.001) + >>> backwardModel.train(x=predictors, y=response, training_frame=prostate) + >>> result = backwardModel.result() + >>> print(result) """ return self._parms.get("p_values_threshold") @@ -1236,6 +1287,22 @@ def influence(self): If set to dfbetas will calculate the difference in beta when a datarow is included and excluded in the dataset. Type: ``Literal["dfbetas"]``. + + :examples: + + >>> import h2o + >>> from h2o.estimators import H2OModelSelectionEstimator + >>> h2o.init() + >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") + >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] + >>> response = "GLEASON" + >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=5, + ... seed=12345, + ... mode="maxr", + ...
influence="dfbetas") + >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) + >>> glm_rid = maxrModel.get_regression_influence_diagnostics() + >>> print(glm_rid) """ return self._parms.get("influence") @@ -1303,6 +1370,23 @@ def coef_norm(self, predictor_size=None): :param predictor_size: predictor subset size, will only return model coefficients of that subset size. :return: list of Python Dicts of coefficients for all models built with different predictor numbers + + :examples: + + >>> import h2o + >>> from h2o.estimators import H2OModelSelectionEstimator + >>> h2o.init() + >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") + >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] + >>> response = "GLEASON" + >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=5, + ... seed=12345, + ... mode="maxr") + >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) + >>> coeff_norm = maxrModel.coef_norm() + >>> print(coeff_norm) + >>> coeff_norm_3 = maxrModel.coef_norm(predictor_size=3) # print coefficient norm with 3 predictors + >>> print(coeff_norm_3) """ model_ids = self._model_json["output"]["best_model_ids"] if not(self.actual_params["build_glm_model"]) and self.actual_params["mode"]=="maxrsweep": @@ -1356,6 +1440,23 @@ def coef(self, predictor_size=None): :param predictor_size: predictor subset size, will only return model coefficients of that subset size. 
:return: list of Python Dicts of coefficients for all models built with different predictor numbers + + :examples: + + >>> import h2o + >>> from h2o.estimators import H2OModelSelectionEstimator + >>> h2o.init() + >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv") + >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"] + >>> response = "GLEASON" + >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=5, + ... seed=12345, + ... mode="maxr") + >>> maxrModel.train(x=predictors, y=response, training_frame=prostate) + >>> coeff = maxrModel.coef() + >>> print(coeff) + >>> coeff_3 = maxrModel.coef(predictor_size=3) + >>> print(coeff_3) """ if not self.actual_params["build_glm_model"] and self.actual_params["mode"]=="maxrsweep": coef_names = self._model_json["output"]["coefficient_names"] @@ -1409,6 +1510,7 @@ def coef(self, predictor_size=None): def result(self): """ Get result frame that contains information about the model building process like for modelselection and anovaglm. + :return: the H2OFrame that contains information about the model building process like for modelselection and anovaglm. """ return H2OFrame._expr(expr=ExprNode("result", ASTId(self.key)))._frame(fill_cache=True)