Merge remote-tracking branch 'origin/master' into fix-jetty-ssl-init

h2oai · Jan 17, 2024 · 9f45118 · 9f45118
2 parents 90c12d1 + 622045f
commit 9f45118
Show file tree

Hide file tree

Showing 105 changed files with 2,707 additions and 780 deletions.
diff --git a/Changes.md b/Changes.md
@@ -2,6 +2,34 @@
 
 ## H2O
 
+### 3.44.0.3 - 12/20/2023
+
+Download at: <a href='http://h2o-release.s3.amazonaws.com/h2o/rel-3.44.0/3/index.html'>http://h2o-release.s3.amazonaws.com/h2o/rel-3.44.0/3/index.html</a>
+
+#### Bug Fix
+- [[#15958]](https://github.com/h2oai/h2o-3/issues/15958) - Fixed maximum likelihood dispersion estimation for GLM tweedie family producing the wrong result for a specific dataset.
+- [[#15936]](https://github.com/h2oai/h2o-3/issues/15936) - Added data frame transformations using polars since datatable cannot be installed on Python 3.10+. 
+- [[#15894]](https://github.com/h2oai/h2o-3/issues/15894) - Ensured that the functions that are supposed to be exported in the R package are exported.
+- [[#15891]](https://github.com/h2oai/h2o-3/issues/15891) - Corrected sign in AIC calculation to fix problem with tweedie dispersion parameter estimation, AIC, and loglikelihood.
+- [[#15887]](https://github.com/h2oai/h2o-3/issues/15887) - Allowed Python H2OFrame constructor to accept an existing H2OFrame.
+- [[#6725]](https://github.com/h2oai/h2o-3/issues/6725) - Fixed LoggerFactory slf4j related regression. 
+
+#### Improvement
+- [[#15937]](https://github.com/h2oai/h2o-3/issues/15937) - Exposed `gainslift_bins` parameter for Deep Learning, GAM, GLM, and Stacked Ensemble algorithms.
+- [[#15916]](https://github.com/h2oai/h2o-3/issues/15916) - Sped up computation of Friedman-Popescu’s H statistic.
+
+#### New Feature
+- [[#15927]](https://github.com/h2oai/h2o-3/issues/15927) - Added anomaly score metric to be used as a `sort_by` metric when sorting grid model performances for Isolation Forest with grid search.
+- [[#15780]](https://github.com/h2oai/h2o-3/issues/15780) - Added `weak_learner_params` parameter for AdaBoost.
+- [[#15779]](https://github.com/h2oai/h2o-3/issues/15779) - Added `weak_learner="deep_learning"` option for AdaBoost.
+- [[#7118]](https://github.com/h2oai/h2o-3/issues/7118) - Implemented scoring and scoring history for Extended Isolation Forest by adding `score_each_iteration` and `score_tree_interval`. 
+
+#### Docs
+- [[#15817]](https://github.com/h2oai/h2o-3/issues/15817) - Improved default threshold API and documentation for binomial classification.
+
+#### Security
+- [[#15754]](https://github.com/h2oai/h2o-3/issues/15754) - Addressed CVE-2022-21230 by replacing nanohttpd.
+
 ### 3.44.0.2 - 11/8/2023
 
 Download at: <a href='http://h2o-release.s3.amazonaws.com/h2o/rel-3.44.0/2/index.html'>http://h2o-release.s3.amazonaws.com/h2o/rel-3.44.0/2/index.html</a>

diff --git a/h2o-algos/src/main/java/hex/ensemble/Metalearner.java b/h2o-algos/src/main/java/hex/ensemble/Metalearner.java
@@ -118,6 +118,7 @@ protected void setCommonParams(P parms) {
     parms._offset_column = _model._parms._offset_column;
     parms._main_model_time_budget_factor = _model._parms._main_model_time_budget_factor;
     parms._custom_metric_func = _model._parms._custom_metric_func;
+    parms._gainslift_bins = _model._parms._gainslift_bins;
   }
 
   protected void setCrossValidationParams(P parms) {

diff --git a/h2o-algos/src/main/java/hex/gam/MetricBuilderGAM.java b/h2o-algos/src/main/java/hex/gam/MetricBuilderGAM.java
@@ -156,6 +156,7 @@ public ModelMetrics makeModelMetrics(Model m, Frame f, Frame adaptedFrame, Frame
         Vec weights = f.vec(gamM._parms._weights_column);
         if (resp != null && fractionalbinomial != _glmf._family) {
           gl = new GainsLift(preds.lastVec(), resp, weights);
+          gl._groups = m._parms._gainslift_bins;
           gl.exec(gamM._output._job);
         }
       }

diff --git a/h2o-algos/src/main/java/hex/glm/DispersionUtils.java b/h2o-algos/src/main/java/hex/glm/DispersionUtils.java
@@ -4,6 +4,7 @@
 import water.Job;
 import water.Key;
 import water.MRTask;
+import water.Scope;
 import water.fvec.Chunk;
 import water.fvec.Frame;
 import water.fvec.Vec;
@@ -12,6 +13,7 @@
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
+import java.util.stream.Collectors;
 
 import static org.apache.commons.math3.special.Gamma.*;
 
@@ -68,14 +70,121 @@ public static double estimateGammaMLSE(GLMTask.ComputeGammaMLSETsk mlCT, double
         return seOld;
     }
 
+    private static double getTweedieLogLikelihood(GLMModel.GLMParameters parms, DataInfo dinfo, double phi, Vec mu) { // log-likelihood reported by TweedieEstimator for dispersion phi, given predictions mu
+        final double llh = new TweedieEstimator(
+                parms._tweedie_variance_power, // variance power p comes from the model parameters
+                phi, // dispersion value at which the likelihood is evaluated
+                false, // NOTE(review): the meaning of these four boolean flags is defined by TweedieEstimator's constructor, which is not visible here - confirm
+                false,
+                false,
+                false)
+                .compute(mu,
+                        dinfo._adaptedFrame.vec(parms._response_column), // response column of the adapted frame
+                        parms._weights_column == null
+                                ? dinfo._adaptedFrame.makeCompatible(new Frame(Vec.makeOne(dinfo._adaptedFrame.numRows())))[0] // no weights column: presumably a vector of ones aligned with the adapted frame; NOTE(review): this temporary Vec is neither removed nor Scope-tracked here - confirm it is cleaned up elsewhere
+                                : dinfo._adaptedFrame.vec(parms._weights_column))
+                ._loglikelihood; // only the log-likelihood field of the computed result is used
+        Log.debug("Tweedie LogLikelihood(p=" + parms._tweedie_variance_power + ", phi=" + phi + ") = " + llh);
+        return llh;
+    }
+
+
+    private static double goldenRatioDispersionSearch(GLMModel.GLMParameters parms, DataInfo dinfo, Vec mu,
+                                                      List<Double> logLikelihoods, List<Double> phis, Job job) { // returns a dispersion (phi) estimate: the midpoint of the final search interval
+        // Sort the probed phis in ascending order and reorder their log-likelihoods to match (logLikelihoods.get(i) belongs to phis.get(i)).
+        List<Double> sortedPhis = phis.stream().sorted().collect(Collectors.toList());
+        List<Double> sortedLLHs = new ArrayList<>();
+        for (int i = 0; i < sortedPhis.size(); i++) {
+            double phi = sortedPhis.get(i);
+            int index = phis.indexOf(phi); // first occurrence; duplicated phi values all map to the same log-likelihood entry
+            sortedLLHs.add(logLikelihoods.get(index));
+        }
+
+        // Scan neighbouring phis for the first drop in log-likelihood, i.e. did we already find a region containing the maximum?
+        boolean increasing = true;
+        double lowerBound = 1e-16; // default lower end of the search interval
+        double upperBound = sortedPhis.get(0);
+        for (int i = 1; i < sortedPhis.size(); i++) {
+            upperBound = sortedPhis.get(i);
+            if (sortedLLHs.get(i - 1) > sortedLLHs.get(i)) { // likelihood decreased between phi[i-1] and phi[i]
+                increasing = false;
+                if (i > 2)
+                    lowerBound = sortedPhis.get(i - 2); // the point two positions below the drop becomes the lower bound
+                else { // otherwise keep the default lower bound, evaluate the likelihood there and prepend it to the sorted lists
+                    sortedPhis.add(0, lowerBound);
+                    sortedLLHs.add(0, getTweedieLogLikelihood(parms, dinfo, lowerBound, mu));
+                }
+                break;
+            }
+        }
+        int counter = sortedPhis.size();
+        int iterationsLeft = parms._max_iterations_dispersion - 10 * counter; // NOTE(review): presumably discounts the iterations already spent between the sanity checks (one check per 10 iterations) - confirm
+        while (increasing && iterationsLeft > counter && !job.stop_requested()) { // no drop seen yet: keep doubling the upper bound until the likelihood decreases
+            counter++;
+            upperBound *= 2;
+            sortedPhis.add(upperBound);
+            double newLLH = getTweedieLogLikelihood(parms, dinfo, upperBound, mu);
+            Log.debug("Tweedie looking for the region containing the max. likelihood; upper bound = " + upperBound + "; llh = " + newLLH);
+            sortedLLHs.add(newLLH);
+            if (sortedLLHs.get(counter - 2) > sortedLLHs.get(counter - 1)) { // the newest point is worse than the previous one
+                if (counter > 3)
+                    lowerBound = sortedPhis.get(counter - 3);
+                Log.debug("Tweedie found the region containing the max. likelihood; phi lower bound = " + lowerBound + "; phi upper bound = " + upperBound);
+                break;
+            }
+        }
+
+        // Golden-section search: the maximum is now expected to lie between lowerBound and upperBound.
+        double d = (upperBound - lowerBound) * 0.618; // 0.618 ~ 1/golden ratio, so d = (hiPhi - lowPhi)/golden ratio
+        double lowPhi = lowerBound;
+        double hiPhi = upperBound;
+
+        double midLoPhi = sortedPhis.get(counter - 2); // reuse an already evaluated point as the lower interior point
+        double midLoLLH = sortedLLHs.get(counter - 2);
+        if (midLoPhi > upperBound) { // that point lies above the interval: place the interior point by the golden ratio and evaluate it
+            midLoPhi = hiPhi - d;
+            midLoLLH = getTweedieLogLikelihood(parms, dinfo, midLoPhi, mu);
+        }
+        double midHiPhi = lowPhi + d;
+        double midHiLLH = getTweedieLogLikelihood(parms, dinfo, midHiPhi, mu);
+        for (; counter < iterationsLeft; counter++) { // counter continues from the bracketing phase, so both phases share one iteration budget
+            Log.info("Tweedie golden-section search[iter=" + counter + ", phis=(" + lowPhi + ", " + midLoPhi +
+                    ", " + midHiPhi + ", " + hiPhi + "), likelihoods=(" +
+                    "..., " + midLoLLH + ", " + midHiLLH + ", ...)]");
+            if (job.stop_requested()) { // job cancelled: return the midpoint of the current interval
+                return (hiPhi + lowPhi) / 2;
+            }
+            if (midHiLLH > midLoLLH) { // higher likelihood at the upper interior point: discard [lowPhi, midLoPhi)
+                lowPhi = midLoPhi;
+            } else { // otherwise discard (midHiPhi, hiPhi]
+                hiPhi = midHiPhi;
+            }
+            d = (hiPhi - lowPhi) * 0.618;  // (hiPhi - lowPhi)/golden ratio
+            if (hiPhi - lowPhi < parms._dispersion_epsilon) { // interval narrower than the tolerance: return its midpoint
+                return (hiPhi + lowPhi) / 2;
+            }
+            midLoPhi = hiPhi - d;
+            midHiPhi = lowPhi + d;
+            midLoLLH = getTweedieLogLikelihood(parms, dinfo, midLoPhi, mu); // both interior points are re-evaluated on every iteration
+            midHiLLH = getTweedieLogLikelihood(parms, dinfo, midHiPhi, mu);
+        }
+        return (hiPhi + lowPhi) / 2; // iteration budget exhausted: return the midpoint of the current interval
+    }
+
+
     /**
      * This method estimates the tweedie dispersion parameter.  It will use Newton's update if the new update will 
      * increase the loglikelihood.  Otherwise, the dispersion will be updated as 
      *                        dispersionNew = dispersionCurr + learningRate * update.
      * In addition, line search is used to increase the magnitude of the update when the update magnitude is too small
      * (< 1e-3).  
      * 
-     * For details, please see seciton IV.I, IV.II, and IV.III in document here: 
+     * Every 10th iteration, and whenever the iterations appear to have converged, it checks that the optimization
+     * has not diverged. If it looks like it diverged, it switches to a different likelihood estimation that should
+     * be more accurate (combination of Series and Fourier inversion method) but provides no gradients. For this
+     * reason it then uses a Golden-section search, which doesn't require gradients and has linear convergence.
+     * 
+     * For details, please see sections IV.I, IV.II, and IV.III in document here: 
      */
     public static double estimateTweedieDispersionOnly(GLMModel.GLMParameters parms, GLMModel model, Job job,
                                                               double[] beta, DataInfo dinfo) {
@@ -84,27 +193,51 @@ public static double estimateTweedieDispersionOnly(GLMModel.GLMParameters parms,
         long timeLeft = parms._max_runtime_secs > 0 ? (long) (parms._max_runtime_secs * 1000 - modelBuiltTime)
                 : Long.MAX_VALUE;
         TweedieMLDispersionOnly tDispersion = new TweedieMLDispersionOnly(parms.train(), parms, model, beta, dinfo);
+        DispersionTask.GenPrediction gPred = new DispersionTask.GenPrediction(beta, model, dinfo).doAll(
+                1, Vec.T_NUM, dinfo._adaptedFrame);
+        Vec mu = Scope.track(gPred.outputFrame(Key.make(), new String[]{"prediction"}, null)).vec(0);
+
         double dispersionCurr = tDispersion._dispersionParameter;   // initial value of dispersion parameter
         double dispersionNew;
         double update;
         double logLLCurr, logLLNext;
         List<Double> loglikelihoodList = new ArrayList<>();
         List<Double> llChangeList = new ArrayList<>();
         List<Double> dispersionList = new ArrayList<>();
-
+        double bestLogLikelihoodFromSanityCheck = getTweedieLogLikelihood(parms, dinfo,dispersionCurr,mu);
+        List<Double> logLikelihoodSanityChecks = new ArrayList<>();
+        List<Double> dispersionsSanityChecks = new ArrayList<>();
+        logLikelihoodSanityChecks.add(bestLogLikelihoodFromSanityCheck);
+        dispersionsSanityChecks.add(dispersionCurr);
         for (int index = 0; index < parms._max_iterations_dispersion; index++) {
             tDispersion.updateDispersionP(dispersionCurr);
             DispersionTask.ComputeMaxSumSeriesTsk computeTask = new DispersionTask.ComputeMaxSumSeriesTsk(tDispersion,
                     parms, true);
             computeTask.doAll(tDispersion._infoFrame);
             logLLCurr = computeTask._logLL / computeTask._nobsLL;
-
             // record loglikelihood values
             loglikelihoodList.add(logLLCurr);
             dispersionList.add(dispersionCurr);
             if (loglikelihoodList.size() > 1) {
                 llChangeList.add(loglikelihoodList.get(index) - loglikelihoodList.get(index - 1));
-                if ((Math.abs(llChangeList.get(llChangeList.size() - 1)) < parms._dispersion_epsilon)) {
+                boolean converged = (Math.abs(llChangeList.get(llChangeList.size() - 1)) < parms._dispersion_epsilon);
+                if (index % 10 == 0 || converged) { // do a sanity check once in a while and if we think we converged
+                    double newLogLikelihood = getTweedieLogLikelihood(parms, dinfo, dispersionCurr, mu);
+                    logLikelihoodSanityChecks.add(newLogLikelihood);
+                    dispersionsSanityChecks.add(dispersionCurr);
+                    if (newLogLikelihood < bestLogLikelihoodFromSanityCheck) {
+                        // we are getting worse.
+                        Log.info("Tweedie sanity check FAIL. Trying Golden-section search instead of Newton's method.");
+                        tDispersion.cleanUp();
+                        final double dispersion = goldenRatioDispersionSearch(parms, dinfo, mu, logLikelihoodSanityChecks, dispersionsSanityChecks, job);
+                        Log.info("Tweedie dispersion estimate = "+dispersion);
+                        return dispersion;
+                    }
+                    bestLogLikelihoodFromSanityCheck = Math.max(bestLogLikelihoodFromSanityCheck, newLogLikelihood);
+                    Log.debug("Tweedie sanity check OK");
+                }
+
+                if (converged) {
                     tDispersion.cleanUp(); // early stop if loglikelihood has'n changed by > parms._dispersion_epsilon
                     Log.info("last dispersion "+dispersionCurr);
                     return dispersionList.get(loglikelihoodList.indexOf(Collections.max(loglikelihoodList)));
@@ -118,6 +251,8 @@ public static double estimateTweedieDispersionOnly(GLMModel.GLMParameters parms,
                     return Double.NaN;
                 }
             }
+
+
             // get new update to dispersion
             update = computeTask._dLogLL / computeTask._d2LogLL;
             if (Math.abs(update) < 1e-3) { // line search for speedup and increase magnitude of change

diff --git a/h2o-algos/src/main/java/hex/glm/GLM.java b/h2o-algos/src/main/java/hex/glm/GLM.java
@@ -2272,7 +2272,7 @@ private void fitIRLSMML(Solver s) {
             if (!_parms._fix_tweedie_variance_power) {
               if (!_parms._fix_dispersion_parameter) {
                 converged = updateTweediePandPhi(iterCnt, _state.expandBeta(betaCnd), weights, response) && converged;
-                Log.info("GLM Tweedie p and phi estimation: iteration = " + iterCnt + "; p = " + _parms._tweedie_variance_power + "; phi = " + _parms._init_dispersion_parameter);
+                Log.info("GLM Tweedie p and phi estimation: iteration = " + iterCnt + "; p = " + _parms._tweedie_variance_power + "; phi = " + _parms._dispersion_estimated);
               } else {
                 converged = updateTweedieVariancePower(iterCnt, _state.expandBeta(betaCnd), weights, response) && converged;
                 Log.info("GLM Tweedie variance power estimation: iteration = " + iterCnt + "; p = " + _parms._tweedie_variance_power);
@@ -2455,7 +2455,7 @@ private boolean updateTweedieVariancePower(int iterCnt, double[] betaCnd, Vec we
 
     private boolean updateTweediePandPhi(int iterCnt, double[] betaCnd, Vec weights, Vec response) {
       final double originalP = _parms._tweedie_variance_power;
-      final double originalPhi = _parms._init_dispersion_parameter;
+      final double originalPhi = _parms._dispersion_estimated;
       final double contractRatio = 0.5;
       final double pMin = 1 + 1e-10;
       final double pZeroMax = 2 - 1e-10;
@@ -3025,7 +3025,7 @@ else if (gaussian.equals(_parms._family) && Link.identity.equals(_parms._link))
 
       // Make sure if we set dispersion for Tweedie p and phi estimation even without calculating p values
       if (tweedie.equals(_parms._family) && !_parms._fix_dispersion_parameter && !_parms._fix_tweedie_variance_power) {
-        _model.setDispersion(_parms._init_dispersion_parameter, true);
+        _model.setDispersion(_parms._dispersion_estimated, true);
       }
       if (_parms._compute_p_values) { // compute p-values, standard error, estimate dispersion parameters...
         double se = _parms._init_dispersion_parameter;

diff --git a/h2o-algos/src/main/java/hex/glm/GLMMetricBuilder.java b/h2o-algos/src/main/java/hex/glm/GLMMetricBuilder.java
@@ -189,7 +189,7 @@ public final long resDOF() {
 
   protected void computeAIC(GLMModel gm) {
     if (gm._parms._calc_like && gm._finalScoring) { // uses likelihood which is calculated for the final scoring
-      _aic = -2 * _log_likelihood + 2 * Arrays.stream(gm.beta()).filter(b -> b != 0).count();
+      _aic = 2 * _log_likelihood + 2 * Arrays.stream(gm.beta()).filter(b -> b != 0).count();
     } else { // original calculation for the model build
       _aic = 0;
       switch (_glmf._family) {
@@ -246,6 +246,7 @@ protected void computeAIC(GLMModel gm) {
           Vec weights = f.vec(m._parms._weights_column);
           if (resp != null && Family.fractionalbinomial != _glmf._family) { // don't calculate for frac binomial
             gl = new GainsLift(preds.lastVec(), resp, weights);
+            gl._groups = m._parms._gainslift_bins;
             gl.exec(m._output._job);
           }
         }

diff --git a/h2o-algos/src/main/java/hex/glm/GLMModel.java b/h2o-algos/src/main/java/hex/glm/GLMModel.java
@@ -823,7 +823,7 @@ public final double likelihood(double yr, double ym){
                         yr*Math.log(_theta)+yr*Math.log(1+_theta*ym)):
                 ((yr==0 && ym>0)?(_invTheta*Math.log(1+_theta*ym)):0)); // with everything
       }  else if (Family.tweedie.equals(_family) && DispersionMethod.ml.equals(_dispersion_parameter_method) && !_fix_tweedie_variance_power) {
-        return -TweedieEstimator.logLikelihood(yr, ym, _tweedie_variance_power, _init_dispersion_parameter);
+        return -TweedieEstimator.logLikelihood(yr, ym, _tweedie_variance_power, _dispersion_estimated);
       }  else
         return .5 * deviance(yr,ym);
     }
@@ -867,7 +867,7 @@ else if (prediction > 1) // check what are possible values?
           return w * invPhiEst * log(w * yr * invPhiEst / prediction) - w * yr * invPhiEst / prediction 
                   - log(yr) - Gamma.logGamma(w * invPhiEst);
         case tweedie:
-          return -TweedieEstimator.logLikelihood(yr, ym[0], _tweedie_variance_power, _init_dispersion_parameter);
+          return -TweedieEstimator.logLikelihood(yr, ym[0], _tweedie_variance_power, _dispersion_estimated);
         case multinomial:
           // if probability is not given, then it is 1.0 if the prediction equals the real y and 0 otherwise
           double predictedProbabilityOfActualClass = ym.length > 1 ? ym[(int) yr + 1] : (prediction == yr ? 1.0 : 0.0);
@@ -1032,6 +1032,7 @@ public DistributionFamily getDistributionFamily() {
     public void updateTweedieParams(double tweedieVariancePower, double tweedieLinkPower, double dispersion){ // stores the given Tweedie variance power, link power and dispersion on this parameter object
       _tweedie_variance_power = tweedieVariancePower;
       _tweedie_link_power = tweedieLinkPower;
+      _dispersion_estimated = dispersion; // the same dispersion value is written to the estimated-dispersion field ...
       _init_dispersion_parameter = dispersion; // ... and to the initial-dispersion field
     }
   } // GLMParameters

diff --git a/h2o-algos/src/main/java/hex/schemas/DeepLearningV3.java b/h2o-algos/src/main/java/hex/schemas/DeepLearningV3.java
@@ -103,6 +103,7 @@ public static final class DeepLearningParametersV3 extends ModelParametersSchema
         "export_checkpoints_dir", 
         "auc_type", 
         "custom_metric_func",
+        "gainslift_bins",
     };
 
 

diff --git a/h2o-algos/src/main/java/hex/schemas/GAMV3.java b/h2o-algos/src/main/java/hex/schemas/GAMV3.java
@@ -83,7 +83,8 @@ public static final class GAMParametersV3 extends ModelParametersSchemaV3<GAMMod
             "scale", // array, smoothing parameter for GAM,
             "keep_gam_cols",
             "store_knot_locations",
-            "auc_type"
+            "auc_type",
+            "gainslift_bins",
     };
 
     @API(help = "Seed for pseudo random number generator (if applicable)", gridable = true)

diff --git a/h2o-algos/src/main/java/hex/schemas/GLMV3.java b/h2o-algos/src/main/java/hex/schemas/GLMV3.java
@@ -94,7 +94,8 @@ public static final class GLMParametersV3 extends ModelParametersSchemaV3<GLMPar
             "generate_variable_inflation_factors",
             "fix_tweedie_variance_power",
             "dispersion_learning_rate",
-            "influence"
+            "influence",
+            "gainslift_bins",
     };
 
     @API(help = "Seed for pseudo random number generator (if applicable)", gridable = true)

diff --git a/h2o-algos/src/main/java/hex/schemas/StackedEnsembleV99.java b/h2o-algos/src/main/java/hex/schemas/StackedEnsembleV99.java
@@ -56,7 +56,8 @@ public static final class StackedEnsembleParametersV99 extends ModelParametersSc
       "score_training_samples",
       "keep_levelone_frame",
       "export_checkpoints_dir", 
-      "auc_type"
+      "auc_type", 
+      "gainslift_bins",
     };
 
     public static class AlgorithmValuesProvider extends EnumValuesProvider<Algorithm> {