Merge pull request #16001 from h2oai/sy/-MS

GH-15915: Craft MS Examples
h2oai · Feb 1, 2024 · 9d523ef · 9d523ef
2 parents 1639968 + 6a76d36
commit 9d523ef
Show file tree

Hide file tree

Showing 2 changed files with 203 additions and 0 deletions.
diff --git a/h2o-bindings/bin/custom/python/gen_modelselection.py b/h2o-bindings/bin/custom/python/gen_modelselection.py
@@ -42,6 +42,23 @@ def coef_norm(self, predictor_size=None):
     
         :param predictor_size: predictor subset size, will only return model coefficients of that subset size.
         :return: list of Python Dicts of coefficients for all models built with different predictor numbers
+
+        :examples:
+        
+        >>> import h2o
+        >>> from h2o.estimators import H2OModelSelectionEstimator
+        >>> h2o.init()
+        >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv")
+        >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
+        >>> response = "GLEASON"
+        >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=5,
+        ...                                        seed=12345,
+        ...                                        mode="maxr")
+        >>> maxrModel.train(x=predictors, y=response, training_frame=prostate)
+        >>> coeff_norm = maxrModel.coef_norm()
+        >>> print(coeff_norm)
+        >>> coeff_norm_3 = maxrModel.coef_norm(predictor_size=3) # normalized coefficients of the model built with 3 predictors
+        >>> print(coeff_norm_3)
         """
         model_ids = self._model_json["output"]["best_model_ids"]
         if not(self.actual_params["build_glm_model"]) and self.actual_params["mode"]=="maxrsweep":
@@ -95,6 +112,23 @@ def coef(self, predictor_size=None):
         
         :param predictor_size: predictor subset size, will only return model coefficients of that subset size.
         :return: list of Python Dicts of coefficients for all models built with different predictor numbers
+
+        :examples:
+        
+        >>> import h2o
+        >>> from h2o.estimators import H2OModelSelectionEstimator
+        >>> h2o.init()
+        >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv")
+        >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
+        >>> response = "GLEASON"
+        >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=5,
+        ...                                        seed=12345,
+        ...                                        mode="maxr")
+        >>> maxrModel.train(x=predictors, y=response, training_frame=prostate)
+        >>> coeff = maxrModel.coef()
+        >>> print(coeff)
+        >>> coeff_3 = maxrModel.coef(predictor_size=3)
+        >>> print(coeff_3)
         """
         if not self.actual_params["build_glm_model"] and self.actual_params["mode"]=="maxrsweep":
             coef_names = self._model_json["output"]["coefficient_names"]
@@ -148,6 +182,7 @@ def coef(self, predictor_size=None):
     def result(self):
         """
         Get result frame that contains information about the model building process like for modelselection and anovaglm.
+
         :return: the H2OFrame that contains information about the model building process like for modelselection and anovaglm.
         """
         return H2OFrame._expr(expr=ExprNode("result", ASTId(self.key)))._frame(fill_cache=True)
@@ -225,3 +260,69 @@ def get_best_model_predictors(self):
 mode=maxr, the model returned is no longer guaranteed to have the best R2 value.
 """
 )
+
+examples = dict(
+    build_glm_model="""
+>>> import h2o
+>>> from h2o.estimators import H2OModelSelectionEstimator
+>>> h2o.init()
+>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv")
+>>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
+>>> response = "GLEASON"
+>>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=5,
+...                                        seed=12345,
+...                                        mode="maxrsweep",
+...                                        build_glm_model=True)
+>>> maxrModel.train(x=predictors, y=response, training_frame=prostate)
+>>> result = maxrModel.result()
+>>> # get the GLM model with the best performance for a fixed predictor size:
+>>> one_model = h2o.get_model(result["model_id"][1, 0])
+>>> predict = one_model.predict(prostate)
+>>> # print a version of the predict frame:
+>>> print(predict)
+""",
+    influence="""
+>>> import h2o
+>>> from h2o.estimators import H2OModelSelectionEstimator
+>>> h2o.init()
+>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv")
+>>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
+>>> response = "GLEASON"
+>>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=5,
+...                                        seed=12345,
+...                                        mode="maxr",
+...                                        influence="dfbetas")
+>>> maxrModel.train(x=predictors, y=response, training_frame=prostate)
+>>> glm_rid = maxrModel.get_regression_influence_diagnostics()
+>>> print(glm_rid)
+""",
+    p_values_threshold="""
+>>> import h2o
+>>> from h2o.estimators import H2OModelSelectionEstimator
+>>> h2o.init()
+>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv")
+>>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
+>>> response = "GLEASON"
+>>> backwardModel = H2OModelSelectionEstimator(min_predictor_number=2,
+...                                            seed=12345,
+...                                            mode="backward",
+...                                            p_values_threshold=0.001)
+>>> backwardModel.train(x=predictors, y=response, training_frame=prostate)
+>>> result = backwardModel.result()
+>>> print(result)
+""",
+    mode="""
+>>> import h2o
+>>> from h2o.estimators import H2OModelSelectionEstimator
+>>> h2o.init()
+>>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv")
+>>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
+>>> response = "GLEASON"
+>>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=5,
+...                                        seed=12345,
+...                                        mode="maxr")
+>>> maxrModel.train(x=predictors, y=response, training_frame=prostate)
+>>> results = maxrModel.result()
+>>> print(results)
+"""
+)
diff --git a/h2o-py/h2o/estimators/model_selection.py b/h2o-py/h2o/estimators/model_selection.py
@@ -1190,6 +1190,21 @@ def mode(self):
         than 'maxr', 'backward' for backward selection.
 
         Type: ``Literal["allsubsets", "maxr", "maxrsweep", "backward"]``, defaults to ``"maxr"``.
+
+        :examples:
+
+        >>> import h2o
+        >>> from h2o.estimators import H2OModelSelectionEstimator
+        >>> h2o.init()
+        >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv")
+        >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
+        >>> response = "GLEASON"
+        >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=5,
+        ...                                        seed=12345,
+        ...                                        mode="maxr")
+        >>> maxrModel.train(x=predictors, y=response, training_frame=prostate)
+        >>> results = maxrModel.result()
+        >>> print(results)
         """
         return self._parms.get("mode")
 
@@ -1207,6 +1222,26 @@ def build_glm_model(self):
         themselves.  Defaults to false.
 
         Type: ``bool``, defaults to ``False``.
+
+        :examples:
+
+        >>> import h2o
+        >>> from h2o.estimators import H2OModelSelectionEstimator
+        >>> h2o.init()
+        >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv")
+        >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
+        >>> response = "GLEASON"
+        >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=5,
+        ...                                        seed=12345,
+        ...                                        mode="maxrsweep",
+        ...                                        build_glm_model=True)
+        >>> maxrModel.train(x=predictors, y=response, training_frame=prostate)
+        >>> result = maxrModel.result()
+        >>> # get the GLM model with the best performance for a fixed predictor size:
+        >>> one_model = h2o.get_model(result["model_id"][1, 0])
+        >>> predict = one_model.predict(prostate)
+        >>> # print a version of the predict frame:
+        >>> print(predict)
         """
         return self._parms.get("build_glm_model")
 
@@ -1222,6 +1257,22 @@ def p_values_threshold(self):
         below this threshold
 
         Type: ``float``, defaults to ``0.0``.
+
+        :examples:
+
+        >>> import h2o
+        >>> from h2o.estimators import H2OModelSelectionEstimator
+        >>> h2o.init()
+        >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv")
+        >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
+        >>> response = "GLEASON"
+        >>> backwardModel = H2OModelSelectionEstimator(min_predictor_number=2,
+        ...                                            seed=12345,
+        ...                                            mode="backward",
+        ...                                            p_values_threshold=0.001)
+        >>> backwardModel.train(x=predictors, y=response, training_frame=prostate)
+        >>> result = backwardModel.result()
+        >>> print(result)
         """
         return self._parms.get("p_values_threshold")
 
@@ -1236,6 +1287,22 @@ def influence(self):
         If set to dfbetas will calculate the difference in beta when a datarow is included and excluded in the dataset.
 
         Type: ``Literal["dfbetas"]``.
+
+        :examples:
+
+        >>> import h2o
+        >>> from h2o.estimators import H2OModelSelectionEstimator
+        >>> h2o.init()
+        >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv")
+        >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
+        >>> response = "GLEASON"
+        >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=5,
+        ...                                        seed=12345,
+        ...                                        mode="maxr",
+        ...                                        influence="dfbetas")
+        >>> maxrModel.train(x=predictors, y=response, training_frame=prostate)
+        >>> glm_rid = maxrModel.get_regression_influence_diagnostics()
+        >>> print(glm_rid)
         """
         return self._parms.get("influence")
 
@@ -1303,6 +1370,23 @@ def coef_norm(self, predictor_size=None):
 
         :param predictor_size: predictor subset size, will only return model coefficients of that subset size.
         :return: list of Python Dicts of coefficients for all models built with different predictor numbers
+
+        :examples:
+
+        >>> import h2o
+        >>> from h2o.estimators import H2OModelSelectionEstimator
+        >>> h2o.init()
+        >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv")
+        >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
+        >>> response = "GLEASON"
+        >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=5,
+        ...                                        seed=12345,
+        ...                                        mode="maxr")
+        >>> maxrModel.train(x=predictors, y=response, training_frame=prostate)
+        >>> coeff_norm = maxrModel.coef_norm()
+        >>> print(coeff_norm)
+        >>> coeff_norm_3 = maxrModel.coef_norm(predictor_size=3) # normalized coefficients of the model built with 3 predictors
+        >>> print(coeff_norm_3)
         """
         model_ids = self._model_json["output"]["best_model_ids"]
         if not(self.actual_params["build_glm_model"]) and self.actual_params["mode"]=="maxrsweep":
@@ -1356,6 +1440,23 @@ def coef(self, predictor_size=None):
 
         :param predictor_size: predictor subset size, will only return model coefficients of that subset size.
         :return: list of Python Dicts of coefficients for all models built with different predictor numbers
+
+        :examples:
+
+        >>> import h2o
+        >>> from h2o.estimators import H2OModelSelectionEstimator
+        >>> h2o.init()
+        >>> prostate = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/logreg/prostate.csv")
+        >>> predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
+        >>> response = "GLEASON"
+        >>> maxrModel = H2OModelSelectionEstimator(max_predictor_number=5,
+        ...                                        seed=12345,
+        ...                                        mode="maxr")
+        >>> maxrModel.train(x=predictors, y=response, training_frame=prostate)
+        >>> coeff = maxrModel.coef()
+        >>> print(coeff)
+        >>> coeff_3 = maxrModel.coef(predictor_size=3)
+        >>> print(coeff_3)
         """
         if not self.actual_params["build_glm_model"] and self.actual_params["mode"]=="maxrsweep":
             coef_names = self._model_json["output"]["coefficient_names"]
@@ -1409,6 +1510,7 @@ def coef(self, predictor_size=None):
     def result(self):
         """
         Get result frame that contains information about the model building process like for modelselection and anovaglm.
+
         :return: the H2OFrame that contains information about the model building process like for modelselection and anovaglm.
         """
         return H2OFrame._expr(expr=ExprNode("result", ASTId(self.key)))._frame(fill_cache=True)