Skip to content

Commit

Permalink
Merge pull request #16001 from h2oai/sy/-MS
Browse files Browse the repository at this point in the history
GH-15915: Craft MS Examples
  • Loading branch information
shaunyogeshwaran authored Feb 1, 2024
2 parents 1639968 + 6a76d36 commit 9d523ef
Show file tree
Hide file tree
Showing 2 changed files with 203 additions and 0 deletions.
101 changes: 101 additions & 0 deletions h2o-bindings/bin/custom/python/gen_modelselection.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,23 @@ def coef_norm(self, predictor_size=None):
:param predictor_size: predictor subset size, will only return model coefficients of that subset size.
:return: list of Python Dicts of coefficients for all models built with different predictor numbers
:examples:
>>> import h2o
>>> from h2o.estimators import H2OModelSelectionEstimator
>>> h2o.init()
>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv")
>>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
>>> response = "GLEASON"
>>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=5,
... seed=12345,
... mode="maxr")
>>> maxrModel.train(x=predictors, y=response, training_frame=prostate)
>>> coeff_norm = maxrModel.coef_norm()
>>> print(coeff_norm)
>>> coeff_norm_3 = maxrModel.coef_norm(predictor_size=3) # print coefficient norm with 3 predictors
>>> print(coeff_norm_3)
"""
model_ids = self._model_json["output"]["best_model_ids"]
if not(self.actual_params["build_glm_model"]) and self.actual_params["mode"]=="maxrsweep":
Expand Down Expand Up @@ -95,6 +112,23 @@ def coef(self, predictor_size=None):
:param predictor_size: predictor subset size, will only return model coefficients of that subset size.
:return: list of Python Dicts of coefficients for all models built with different predictor numbers
:examples:
>>> import h2o
>>> from h2o.estimators import H2OModelSelectionEstimator
>>> h2o.init()
>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv")
>>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
>>> response = "GLEASON"
>>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=5,
... seed=12345,
... mode="maxr")
>>> maxrModel.train(x=predictors, y=response, training_frame=prostate)
>>> coeff = maxrModel.coef()
>>> print(coeff)
>>> coeff_3 = maxrModel.coef(predictor_size=3)
>>> print(coeff_3)
"""
if not self.actual_params["build_glm_model"] and self.actual_params["mode"]=="maxrsweep":
coef_names = self._model_json["output"]["coefficient_names"]
Expand Down Expand Up @@ -148,6 +182,7 @@ def coef(self, predictor_size=None):
def result(self):
    """
    Retrieve the result frame describing the model building process (as used for modelselection and anovaglm).

    :return: the H2OFrame that contains information about the model building process like for modelselection and anovaglm.
    """
    # Build the "result" expression over this model's key, then evaluate it into a cached frame.
    result_expr = ExprNode("result", ASTId(self.key))
    lazy_frame = H2OFrame._expr(expr=result_expr)
    return lazy_frame._frame(fill_cache=True)
Expand Down Expand Up @@ -225,3 +260,69 @@ def get_best_model_predictors(self):
mode=maxr, the model returned is no longer guaranteed to have the best R2 value.
"""
)

# Doctest-style usage snippets keyed by estimator parameter name.
# NOTE(review): presumably the bindings generator splices each entry into the
# matching property docstring of the generated estimator -- confirm against the
# generator before renaming keys.
# Every snippet must be valid Python once the ">>> " / "... " prompts are
# stripped, since users copy-paste them from the rendered documentation.
examples = dict(
    build_glm_model="""
>>> import h2o
>>> from h2o.estimators import H2OModelSelectionEstimator
>>> h2o.init()
>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv")
>>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
>>> response = "GLEASON"
>>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=5,
... seed=12345,
... mode="maxrsweep",
... build_glm_model=True)
>>> maxrModel.train(x=predictors, y=response, training_frame=prostate)
>>> result = maxrModel.result()
>>> # get the GLM model with the best performance for a fixed predictor size:
>>> one_model = h2o.get_model(result["model_id"][1, 0])
>>> predict = one_model.predict(prostate)
>>> # print a version of the predict frame:
>>> print(predict)
""",
    influence="""
>>> import h2o
>>> from h2o.estimators import H2OModelSelectionEstimator
>>> h2o.init()
>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv")
>>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
>>> response = "GLEASON"
>>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=5,
... seed=12345,
... mode="maxr",
... influence="dfbetas")
>>> maxrModel.train(x=predictors, y=response, training_frame=prostate)
>>> glm_rid = maxrModel.get_regression_influence_diagnostics()
>>> print(glm_rid)
""",
    p_values_threshold="""
>>> import h2o
>>> from h2o.estimators import H2OModelSelectionEstimator
>>> h2o.init()
>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv")
>>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
>>> response = "GLEASON"
>>> backwardModel = H2OModelSelectionEstimator(min_predictor_number=2,
... seed=12345,
... mode="backward",
... p_values_threshold=0.001)
>>> backwardModel.train(x=predictors, y=response, training_frame=prostate)
>>> result = backwardModel.result()
>>> print(result)
""",
    mode="""
>>> import h2o
>>> from h2o.estimators import H2OModelSelectionEstimator
>>> h2o.init()
>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv")
>>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
>>> response = "GLEASON"
>>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=5,
... seed=12345,
... mode="maxr")
>>> maxrModel.train(x=predictors, y=response, training_frame=prostate)
>>> results = maxrModel.result()
>>> print(results)
"""
)
102 changes: 102 additions & 0 deletions h2o-py/h2o/estimators/model_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -1190,6 +1190,21 @@ def mode(self):
than 'maxr', 'backward' for backward selection.
Type: ``Literal["allsubsets", "maxr", "maxrsweep", "backward"]``, defaults to ``"maxr"``.
:examples:
>>> import h2o
>>> from h2o.estimators import H2OModelSelectionEstimator
>>> h2o.init()
>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv")
>>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
>>> response = "GLEASON"
>>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=5,
... seed=12345,
... mode="maxr")
>>> maxrModel.train(x=predictors, y=response, training_frame=prostate)
>>> results = maxrModel.result()
>>> print(results)
"""
return self._parms.get("mode")

Expand All @@ -1207,6 +1222,26 @@ def build_glm_model(self):
themselves. Defaults to false.
Type: ``bool``, defaults to ``False``.
:examples:
>>> import h2o
>>> from h2o.estimators import H2OModelSelectionEstimator
>>> h2o.init()
>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv")
>>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
>>> response = "GLEASON"
>>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=5,
... seed=12345,
... mode="maxrsweep",
... build_glm_model=True)
>>> maxrModel.train(x=predictors, y=response, training_frame=prostate)
>>> result = maxrModel.result()
>>> # get the GLM model with the best performance for a fixed predictor size:
>>> one_model = h2o.get_model(result["model_id"][1, 0])
>>> predict = one_model.predict(prostate)
>>> # print a version of the predict frame:
>>> print(predict)
"""
return self._parms.get("build_glm_model")

Expand All @@ -1222,6 +1257,22 @@ def p_values_threshold(self):
below this threshold
Type: ``float``, defaults to ``0.0``.
:examples:
>>> import h2o
>>> from h2o.estimators import H2OModelSelectionEstimator
>>> h2o.init()
>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv")
>>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
>>> response = "GLEASON"
>>> backwardModel = H2OModelSelectionEstimator(min_predictor_number=2,
... seed=12345,
... mode="backward",
... p_values_threshold=0.001)
>>> backwardModel.train(x=predictors, y=response, training_frame=prostate)
>>> result = backwardModel.result()
>>> print(result)
"""
return self._parms.get("p_values_threshold")

Expand All @@ -1236,6 +1287,22 @@ def influence(self):
If set to dfbetas will calculate the difference in beta when a datarow is included and excluded in the dataset.
Type: ``Literal["dfbetas"]``.
:examples:
>>> import h2o
>>> from h2o.estimators import H2OModelSelectionEstimator
>>> h2o.init()
>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv")
>>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
>>> response = "GLEASON"
>>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=5,
... seed=12345,
... mode="maxr",
... influence="dfbetas")
>>> maxrModel.train(x=predictors, y=response, training_frame=prostate)
>>> glm_rid = maxrModel.get_regression_influence_diagnostics()
>>> print(glm_rid)
"""
return self._parms.get("influence")

Expand Down Expand Up @@ -1303,6 +1370,23 @@ def coef_norm(self, predictor_size=None):
:param predictor_size: predictor subset size, will only return model coefficients of that subset size.
:return: list of Python Dicts of coefficients for all models built with different predictor numbers
:examples:
>>> import h2o
>>> from h2o.estimators import H2OModelSelectionEstimator
>>> h2o.init()
>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv")
>>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
>>> response = "GLEASON"
>>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=5,
... seed=12345,
... mode="maxr")
>>> maxrModel.train(x=predictors, y=response, training_frame=prostate)
>>> coeff_norm = maxrModel.coef_norm()
>>> print(coeff_norm)
>>> coeff_norm_3 = maxrModel.coef_norm(predictor_size=3) # print coefficient norm with 3 predictors
>>> print(coeff_norm_3)
"""
model_ids = self._model_json["output"]["best_model_ids"]
if not(self.actual_params["build_glm_model"]) and self.actual_params["mode"]=="maxrsweep":
Expand Down Expand Up @@ -1356,6 +1440,23 @@ def coef(self, predictor_size=None):
:param predictor_size: predictor subset size, will only return model coefficients of that subset size.
:return: list of Python Dicts of coefficients for all models built with different predictor numbers
:examples:
>>> import h2o
>>> from h2o.estimators import H2OModelSelectionEstimator
>>> h2o.init()
>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv")
>>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
>>> response = "GLEASON"
>>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=5,
... seed=12345,
... mode="maxr")
>>> maxrModel.train(x=predictors, y=response, training_frame=prostate)
>>> coeff = maxrModel.coef()
>>> print(coeff)
>>> coeff_3 = maxrModel.coef(predictor_size=3)
>>> print(coeff_3)
"""
if not self.actual_params["build_glm_model"] and self.actual_params["mode"]=="maxrsweep":
coef_names = self._model_json["output"]["coefficient_names"]
Expand Down Expand Up @@ -1409,6 +1510,7 @@ def coef(self, predictor_size=None):
def result(self):
    """
    Retrieve the result frame describing the model building process (as used for modelselection and anovaglm).

    :return: the H2OFrame that contains information about the model building process like for modelselection and anovaglm.
    """
    # Build the "result" expression over this model's key, then evaluate it into a cached frame.
    result_expr = ExprNode("result", ASTId(self.key))
    lazy_frame = H2OFrame._expr(expr=result_expr)
    return lazy_frame._frame(fill_cache=True)
Expand Down

0 comments on commit 9d523ef

Please sign in to comment.