Merge remote-tracking branch 'origin/rel-3.44.0'

h2oai · Mar 1, 2024 · 70f43c9 · 70f43c9
2 parents bb5d0ee + a064dfc
commit 70f43c9
Show file tree

Hide file tree

Showing 13 changed files with 490 additions and 8 deletions.
diff --git a/h2o-bindings/bin/custom/python/gen_anovaglm.py b/h2o-bindings/bin/custom/python/gen_anovaglm.py
@@ -13,6 +13,7 @@ def Lambda(self, value):
     def result(self):
         """
         Get result frame that contains information about the model building process like for modelselection and anovaglm.
+
         :return: the H2OFrame that contains information about the model building process like for modelselection and anovaglm.
         """
         return H2OFrame._expr(expr=ExprNode("result", ASTId(self.key)))._frame(fill_cache=True)
@@ -55,3 +56,48 @@ def result(self):
 high p-values while those with more contributions will have low p-values. 
 """
 )
+examples = dict(
+    highest_interaction_term="""
+>>> import h2o
+>>> h2o.init()
+>>> from h2o.estimators import H2OANOVAGLMEstimator 
+>>> train = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate_complete.csv.zip")
+>>> x = ['AGE', 'VOL', 'DCAPS']
+>>> y = 'CAPSULE'
+>>> anova_model = H2OANOVAGLMEstimator(family='binomial',
+...                                    lambda_=0,
+...                                    missing_values_handling="skip",
+...                                    highest_interaction_term=2)
+>>> anova_model.train(x=x, y=y, training_frame=train)
+>>> anova_model.summary()
+""",
+    link="""
+>>> import h2o
+>>> h2o.init()
+>>> from h2o.estimators import H2OANOVAGLMEstimator 
+>>> train = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate_complete.csv.zip")
+>>> x = ['AGE', 'VOL', 'DCAPS']
+>>> y = 'CAPSULE'
+>>> anova_model = H2OANOVAGLMEstimator(family='binomial',
+...                                    lambda_=0,
+...                                    missing_values_handling="skip",
+...                                    link="family_default")
+>>> anova_model.train(x=x, y=y, training_frame=train)
+>>> anova_model.summary()
+""",
+    save_transformed_framekeys="""
+>>> import h2o
+>>> h2o.init()
+>>> from h2o.estimators import H2OANOVAGLMEstimator 
+>>> train = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/prostate/prostate_complete.csv.zip")
+>>> x = ['AGE', 'VOL', 'DCAPS']
+>>> y = 'CAPSULE'
+>>> anova_model = H2OANOVAGLMEstimator(family='binomial',
+...                                    lambda_=0,
+...                                    missing_values_handling="skip",
+...                                    save_transformed_framekeys=True)  
+>>> anova_model.train(x=x, y=y, training_frame=train)
+>>> transformFrame = h2o.get_frame(anova_model._model_json['output']['transformed_columns_key']['name'])
+>>> print(transformFrame)
+"""
+)
diff --git a/h2o-bindings/bin/custom/python/gen_gam.py b/h2o-bindings/bin/custom/python/gen_gam.py
@@ -40,7 +40,7 @@ def scoring_history(self):
 
     def get_knot_locations(self, gam_column=None):
         """
-        Retrieve gam columns knot locations if store_knot_location parameter is enabled.  If a gam column name is 
+        Retrieve gam columns knot locations if store_knot_locations parameter is enabled.  If a gam column name is 
         specified, the knot locations corresponding to that gam column are returned.  Otherwise, all knot locations are
         returned for all gam columns.  The order of the gam columns are specified in gam_knot_column_names of the 
         model output.
@@ -61,8 +61,8 @@ def get_knot_locations(self, gam_column=None):
 
     def get_gam_knot_column_names(self):
         """
-        Retrieve gam column names corresponding to the knot locations that will be returned if store_knot_location
-        parameter is enabled.  
+        Retrieve gam column names corresponding to the knot locations that will be returned if store_knot_locations
+        parameter is enabled.
      
         :return: gam column names whose knot locations are stored in the knot_locations.
         """
@@ -117,3 +117,172 @@ def get_gam_knot_column_names(self):
 MSE, AUC (for logistic regression), degrees of freedom, and confusion matrices.
 """
 )
+examples = dict(
+    bs="""
+>>> import h2o
+>>> from h2o.estimators.gam import H2OGeneralizedAdditiveEstimator
+>>> h2o.init()
+>>> h2o_data = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/multinomial_10_classes_10_cols_10000_Rows_train.csv")
+>>> h2o_data["C11"] = h2o_data["C11"].asfactor()
+>>> y = "C11"
+>>> x = ["C9","C10"]
+>>> h2o_model = H2OGeneralizedAdditiveEstimator(family='multinomial',
+...                                             gam_columns=["C6","C7","C8"],
+...                                             bs=[0,1,3])
+>>> h2o_model.train(x=x, y=y, training_frame=h2o_data)
+>>> h2o_model.coef() # note the spline type in the names of gam column coefficients
+""",
+    gam_columns="""
+>>> import h2o
+>>> from h2o.estimators.gam import H2OGeneralizedAdditiveEstimator
+>>> h2o.init()
+>>> h2o_data = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/multinomial_10_classes_10_cols_10000_Rows_train.csv")
+>>> h2o_data["C11"] = h2o_data["C11"].asfactor()
+>>> y = "C11"
+>>> x = ["C9","C10"]
+>>> h2o_model = H2OGeneralizedAdditiveEstimator(family='multinomial',
+...                                             gam_columns=["C6","C7","C8"])
+>>> h2o_model.train(x=x, y=y, training_frame=h2o_data)
+>>> h2o_model.coef()
+""",
+    get_gam_knot_column_names="""
+>>> import h2o
+>>> from h2o.estimators.gam import H2OGeneralizedAdditiveEstimator
+>>> h2o.init()
+>>> h2o_data = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/multinomial_10_classes_10_cols_10000_Rows_train.csv")
+>>> h2o_data["C11"] = h2o_data["C11"].asfactor()
+>>> y = "C11"
+>>> x = ["C9","C10"]
+>>> h2o_model = H2OGeneralizedAdditiveEstimator(family='multinomial',
+...                                             store_knot_locations=True,
+...                                             gam_columns=["C6","C7","C8"])
+>>> h2o_model.train(x=x, y=y, training_frame=h2o_data)
+>>> h2o_model.get_gam_knot_column_names()
+""",
+    get_knot_locations="""
+>>> import h2o
+>>> from h2o.estimators.gam import H2OGeneralizedAdditiveEstimator
+>>> h2o.init()
+>>> h2o_data = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/multinomial_10_classes_10_cols_10000_Rows_train.csv")
+>>> h2o_data["C11"] = h2o_data["C11"].asfactor()
+>>> y = "C11"
+>>> x = ["C9","C10"]
+>>> h2o_model = H2OGeneralizedAdditiveEstimator(family='multinomial',
+...                                             store_knot_locations=True,
+...                                             gam_columns=["C6","C7","C8"])
+>>> h2o_model.train(x=x, y=y, training_frame=h2o_data)
+>>> h2o_model.get_knot_locations()
+""",
+    keep_gam_cols="""
+>>> import h2o
+>>> from h2o.estimators.gam import H2OGeneralizedAdditiveEstimator
+>>> h2o.init()
+>>> h2o_data = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/multinomial_10_classes_10_cols_10000_Rows_train.csv")
+>>> h2o_data["C11"] = h2o_data["C11"].asfactor()
+>>> train, test = h2o_data.split_frame(ratios = [.8])
+>>> y = "C11"
+>>> x = ["C9","C10"]
+>>> h2o_model = H2OGeneralizedAdditiveEstimator(family='multinomial',
+...                                             keep_gam_cols=True,
+...                                             gam_columns=["C6","C7","C8"])
+>>> h2o_model.train(x=x, y=y, training_frame=h2o_data)
+>>> h2o.get_frame(h2o_model._model_json["output"] ["gam_transformed_center_key"])
+""",
+    knot_ids="""
+>>> import h2o
+>>> from h2o.estimators.gam import H2OGeneralizedAdditiveEstimator
+>>> h2o.init()
+>>> knots1 = [-1.99905699, -0.98143075, 0.02599159, 1.00770987, 1.99942290]
+>>> frameKnots1 = h2o.H2OFrame(python_obj=knots1)
+>>> knots2 = [-1.999821861, -1.005257990, -0.006716042, 1.002197392, 1.999073589]
+>>> frameKnots2 = h2o.H2OFrame(python_obj=knots2)
+>>> knots3 = [-1.999675688, -0.979893796, 0.007573327, 1.011437347, 1.999611676]
+>>> frameKnots3 = h2o.H2OFrame(python_obj=knots3)
+>>> h2o_data = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/multinomial_10_classes_10_cols_10000_Rows_train.csv")
+>>> h2o_data["C11"] = h2o_data["C11"].asfactor()
+>>> train, test = h2o_data.split_frame(ratios = [.8])
+>>> y = "C11"
+>>> x = ["C9","C10"]
+>>> h2o_model = H2OGeneralizedAdditiveEstimator(family='multinomial',
+...                                             gam_columns=["C6","C7","C8"],
+...                                             store_knot_locations=True,
+...                                             knot_ids=[frameKnots1.key, frameKnots2.key, frameKnots3.key])
+>>> h2o_model.train(x=x, y=y, training_frame=h2o_data)
+>>> h2o_model.get_knot_locations()
+""",
+    num_knots="""
+>>> import h2o
+>>> from h2o.estimators.gam import H2OGeneralizedAdditiveEstimator
+>>> h2o.init()
+>>> h2o_data = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/multinomial_10_classes_10_cols_10000_Rows_train.csv")
+>>> h2o_data["C11"] = h2o_data["C11"].asfactor()
+>>> train, test = h2o_data.split_frame(ratios = [.8])
+>>> y = "C11"
+>>> x = ["C9","C10"]
+>>> h2o_model = H2OGeneralizedAdditiveEstimator(family='multinomial',
+...                                             store_knot_locations=True,
+...                                             gam_columns=["C6","C7","C8"],
+...                                             num_knots=[3,4,5])
+>>> h2o_model.train(x=x, y=y, training_frame=h2o_data)
+>>> h2o_model.get_knot_locations()
+""",
+    scale_tp_penalty_mat="""
+>>> import h2o
+>>> from h2o.estimators.gam import H2OGeneralizedAdditiveEstimator
+>>> h2o.init()
+>>> h2o_data = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/multinomial_10_classes_10_cols_10000_Rows_train.csv")
+>>> h2o_data["C11"] = h2o_data["C11"].asfactor()
+>>> y = "C11"
+>>> x = ["C9","C10"]
+>>> h2o_model = H2OGeneralizedAdditiveEstimator(family='multinomial',
+...                                             scale_tp_penalty_mat=True,
+...                                             gam_columns=["C6","C7","C8"],
+...                                             bs=[1,1,1])
+>>> h2o_model.train(x=x, y=y, training_frame=h2o_data)
+>>> h2o_model.coef()
+""",
+    splines_non_negative="""
+>>> import h2o
+>>> from h2o.estimators.gam import H2OGeneralizedAdditiveEstimator
+>>> h2o.init()
+>>> h2o_data = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/binomial_20_cols_10KRows.csv")
+>>> y = "C21"
+>>> x = ["C19","C20"]
+>>> numKnots = [5,5,5]
+>>> h2o_model = H2OGeneralizedAdditiveEstimator(family='gaussian',
+...                                             gam_columns=["C16","C17","C18"],
+...                                             bs=[2,2,2],
+...                                             splines_non_negative=[True, True, True])
+>>> h2o_model.train(x=x, y=y, training_frame=h2o_data)
+>>> h2o_model.coef()
+""",
+    spline_orders="""
+>>> import h2o
+>>> from h2o.estimators.gam import H2OGeneralizedAdditiveEstimator
+>>> h2o.init()
+>>> h2o_data = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/binomial_20_cols_10KRows.csv")
+>>> y = "C21"
+>>> x = ["C19","C20"]
+>>> numKnots = [5,5,5]
+>>> h2o_model = H2OGeneralizedAdditiveEstimator(family='gaussian',
+...                                             gam_columns=["C16","C17","C18"],
+...                                             bs=[2,2,2],
+...                                             spline_orders=[3,4,5])
+>>> h2o_model.train(x=x, y=y, training_frame=h2o_data)
+>>> h2o_model.coef()
+""",
+    standardize_tp_gam_cols="""
+>>> import h2o
+>>> from h2o.estimators.gam import H2OGeneralizedAdditiveEstimator
+>>> h2o.init()
+>>> h2o_data = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/glm_test/binomial_20_cols_10KRows.csv")
+>>> y = "C21"
+>>> x = ["C19","C20"]
+>>> h2o_model = H2OGeneralizedAdditiveEstimator(family='gaussian',
+...                                             gam_columns=["C16","C17","C18"],
+...                                             bs=[1,1,1],
+...                                             standardize_tp_gam_cols=True)
+>>> h2o_model.train(x=x, y=y, training_frame=h2o_data)
+>>> h2o_model.coef()
+""",
+)
diff --git a/h2o-core/src/main/java/water/H2O.java b/h2o-core/src/main/java/water/H2O.java
@@ -2410,6 +2410,11 @@ public static void main( String[] args ) {
 
     // Validate arguments
     validateArguments();
+
+    // Raise user warnings
+    if (H2O.ARGS.web_ip == null) {
+      Log.warn("SECURITY_WARNING: web_ip is not specified. H2O Rest API is listening on all available interfaces.");
+    }
 
     Log.info("X-h2o-cluster-id: " + H2O.CLUSTER_ID);
     Log.info("User name: '" + H2O.ARGS.user_name + "'");

diff --git a/h2o-core/src/main/java/water/H2OStarter.java b/h2o-core/src/main/java/water/H2OStarter.java
@@ -39,7 +39,10 @@ public static void start(String[] args, String relativeResourcePath, boolean fin
     if (!H2O.ARGS.disable_web) {
       Log.info("");
       String message = H2O.ARGS.disable_flow ? "Connect to H2O from your R/Python client: " : "Open H2O Flow in your web browser: ";
-      Log.info(message + H2O.getURL(NetworkInit.h2oHttpView.getScheme()));
+      message += H2O.ARGS.web_ip == null ?
+              H2O.getURL(NetworkInit.h2oHttpView.getScheme()) :
+              H2O.getURL(NetworkInit.h2oHttpView.getScheme(), H2O.ARGS.web_ip, H2O.API_PORT, H2O.ARGS.context_path);
+      Log.info(message);
       Log.info("");
     }
   }

diff --git a/h2o-logging/impl-log4j2/src/main/java/water/util/LoggerBackend.java b/h2o-logging/impl-log4j2/src/main/java/water/util/LoggerBackend.java
@@ -83,6 +83,11 @@ public void reconfigureLog4J() {
                 .addAttribute("target", "SYSTEM_OUT")
                 .add(layoutComponentBuilder));
 
+        builder.add(builder.newAppender("stderr", "Console")
+                .addAttribute("target", "SYSTEM_ERR")
+                .add(builder.newFilter("ThresholdFilter", Filter.Result.ACCEPT, Filter.Result.DENY).addAttribute("level", Level.ERROR))
+                .add(layoutComponentBuilder));
+
         builder.add(newRollingFileAppenderComponent(builder, "R1", "1MB", _getLogFilePath.apply("trace"), pattern, Level.TRACE));
         builder.add(newRollingFileAppenderComponent(builder, "R2", _maxLogFileSize, _getLogFilePath.apply("debug"), pattern, Level.DEBUG));
         builder.add(newRollingFileAppenderComponent(builder, "R3", _maxLogFileSize, _getLogFilePath.apply("info"), pattern, Level.INFO));
@@ -92,6 +97,7 @@ public void reconfigureLog4J() {
         builder.add(newRollingFileAppenderComponent(builder, "HTTPD", "1MB", _getLogFilePath.apply("httpd"), "%d{ISO8601} " + patternTail, Level.TRACE));
 
         AppenderRefComponentBuilder consoleAppenderRef = builder.newAppenderRef("Console");
+        AppenderRefComponentBuilder stderrAppenderRef = builder.newAppenderRef("stderr");
 
         // configure loggers:
         List<AppenderRefComponentBuilder> appenderReferences = new ArrayList();
@@ -102,11 +108,12 @@ public void reconfigureLog4J() {
         appenderReferences.add(builder.newAppenderRef("R5"));
         appenderReferences.add(builder.newAppenderRef("R6"));
         appenderReferences.add(consoleAppenderRef);
+        appenderReferences.add(stderrAppenderRef);
 
         builder.add(newLoggerComponent(builder, "hex", appenderReferences));
         builder.add(newLoggerComponent(builder, "water", appenderReferences));
         builder.add(newLoggerComponent(builder, "ai.h2o", appenderReferences));
-        builder.add(builder.newRootLogger(String.valueOf(L4J_LVLS[_level])).add(consoleAppenderRef));
+        builder.add(builder.newRootLogger(String.valueOf(L4J_LVLS[_level])).add(consoleAppenderRef).add(stderrAppenderRef));
 
         // Turn down the logging for some class hierarchies.
         builder.add(newLoggerComponent(builder, "org.apache.http", appenderReferences, "WARN"));

diff --git a/h2o-logging/impl-log4j2/src/main/resources/log4j2.xml b/h2o-logging/impl-log4j2/src/main/resources/log4j2.xml
@@ -4,10 +4,15 @@
         <Console name="Console" target="SYSTEM_OUT">
             <PatternLayout pattern="%d{HH:mm:ss.SSS} [%t] %-5level %logger{36} - %msg%n"/>
         </Console>
+        <Console name="stderr" target="SYSTEM_ERR"> <!-- 2 -->
+            <PatternLayout pattern="%d{HH:mm:ss.SSS} [%t] %-5level %logger{36} - %msg%n"/>
+            <ThresholdFilter level="ERROR" onMatch="ACCEPT" onMismatch="DENY"/> <!-- 3 -->
+        </Console>
     </Appenders>
     <Loggers>
         <Root level="info">
             <AppenderRef ref="Console"/>
+            <AppenderRef ref="stderr"/>
         </Root>
     </Loggers>
 </Configuration>
diff --git a/h2o-py/h2o/backend/server.py b/h2o-py/h2o/backend/server.py
@@ -372,6 +372,16 @@ def _launch_server(self, port, baseport, mmax, mmin, ea, nthreads, jvm_custom_ar
                 raise H2OServerError("Server wasn't able to start in %f seconds." % elapsed_time)
             time.sleep(0.2)
 
+        security_warning_message = ""
+        if os.stat(self._stdout).st_size > 0:
+            stdout_file = open(self._stdout, encoding='utf-8')
+            for line in stdout_file:
+                if re.search("SECURITY_WARNING", line):
+                    security_warning_message += line + "\n"
+            stdout_file.close()
+        if security_warning_message:
+            warn("\nServer process startup raise a security warning:\n" + str(security_warning_message))
+
     @staticmethod
     def _check_java(java, verbose):
         jver_bytes = subprocess.check_output([java, "-version"], stderr=subprocess.STDOUT)