Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/master' into fix-jetty-ssl-init
Browse files Browse the repository at this point in the history
  • Loading branch information
krasinski committed Jan 17, 2024
2 parents 90c12d1 + 622045f commit 9f45118
Show file tree
Hide file tree
Showing 105 changed files with 2,707 additions and 780 deletions.
28 changes: 28 additions & 0 deletions Changes.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,34 @@

## H2O

### 3.44.0.3 - 12/20/2023

Download at: <a href='http://h2o-release.s3.amazonaws.com/h2o/rel-3.44.0/3/index.html'>http://h2o-release.s3.amazonaws.com/h2o/rel-3.44.0/3/index.html</a>

#### Bug Fix
- [[#15958]](https://github.com/h2oai/h2o-3/issues/15958) - Fixed maximum likelihood dispersion estimation for GLM tweedie family producing the wrong result for a specific dataset.
- [[#15936]](https://github.com/h2oai/h2o-3/issues/15936) - Added data frame transformations using polars since datatable cannot be installed on Python 3.10+.
- [[#15894]](https://github.com/h2oai/h2o-3/issues/15894) - Ensured that the functions that are supposed to be exported in the R package are exported.
- [[#15891]](https://github.com/h2oai/h2o-3/issues/15891) - Corrected sign in AIC calculation to fix problem with tweedie dispersion parameter estimation, AIC, and loglikelihood.
- [[#15887]](https://github.com/h2oai/h2o-3/issues/15887) - Allowed Python H2OFrame constructor to accept an existing H2OFrame.
- [[#6725]](https://github.com/h2oai/h2o-3/issues/6725) - Fixed LoggerFactory slf4j related regression.

#### Improvement
- [[#15937]](https://github.com/h2oai/h2o-3/issues/15937) - Exposed `gainslift_bins` parameter for Deep Learning, GAM, GLM, and Stacked Ensemble algorithms.
- [[#15916]](https://github.com/h2oai/h2o-3/issues/15916) - Sped up computation of Friedman-Popescu’s H statistic.

#### New Feature
- [[#15927]](https://github.com/h2oai/h2o-3/issues/15927) - Added anomaly score metric to be used as a `sort_by` metric when sorting grid model performances for Isolation Forest with grid search.
- [[#15780]](https://github.com/h2oai/h2o-3/issues/15780) - Added `weak_learner_params` parameter for AdaBoost.
- [[#15779]](https://github.com/h2oai/h2o-3/issues/15779) - Added `weak_learner="deep_learning"` option for AdaBoost.
- [[#7118]](https://github.com/h2oai/h2o-3/issues/7118) - Implemented scoring and scoring history for Extended Isolation Forest by adding `score_each_iteration` and `score_tree_interval`.

#### Docs
- [[#15817]](https://github.com/h2oai/h2o-3/issues/15817) - Improved default threshold API and documentation for binomial classification.

#### Security
- [[#15754]](https://github.com/h2oai/h2o-3/issues/15754) - Addressed CVE-2022-21230 by replacing nanohttpd.

### 3.44.0.2 - 11/8/2023

Download at: <a href='http://h2o-release.s3.amazonaws.com/h2o/rel-3.44.0/2/index.html'>http://h2o-release.s3.amazonaws.com/h2o/rel-3.44.0/2/index.html</a>
Expand Down
1 change: 1 addition & 0 deletions h2o-algos/src/main/java/hex/ensemble/Metalearner.java
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ protected void setCommonParams(P parms) {
parms._offset_column = _model._parms._offset_column;
parms._main_model_time_budget_factor = _model._parms._main_model_time_budget_factor;
parms._custom_metric_func = _model._parms._custom_metric_func;
parms._gainslift_bins = _model._parms._gainslift_bins;
}

protected void setCrossValidationParams(P parms) {
Expand Down
1 change: 1 addition & 0 deletions h2o-algos/src/main/java/hex/gam/MetricBuilderGAM.java
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ public ModelMetrics makeModelMetrics(Model m, Frame f, Frame adaptedFrame, Frame
Vec weights = f.vec(gamM._parms._weights_column);
if (resp != null && fractionalbinomial != _glmf._family) {
gl = new GainsLift(preds.lastVec(), resp, weights);
gl._groups = m._parms._gainslift_bins;
gl.exec(gamM._output._job);
}
}
Expand Down
143 changes: 139 additions & 4 deletions h2o-algos/src/main/java/hex/glm/DispersionUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import water.Job;
import water.Key;
import water.MRTask;
import water.Scope;
import water.fvec.Chunk;
import water.fvec.Frame;
import water.fvec.Vec;
Expand All @@ -12,6 +13,7 @@
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;

import static org.apache.commons.math3.special.Gamma.*;

Expand Down Expand Up @@ -68,14 +70,121 @@ public static double estimateGammaMLSE(GLMTask.ComputeGammaMLSETsk mlCT, double
return seOld;
}

/**
 * Evaluates the Tweedie log-likelihood for a given dispersion value.
 *
 * Builds a {@code TweedieEstimator} from {@code parms._tweedie_variance_power} and {@code phi}, runs it on
 * the predictions, the response column and the weights, and returns the resulting {@code _loglikelihood}.
 * The value is also written to the debug log.
 *
 * @param parms GLM parameters; supplies the variance power and the response / weights column names
 * @param dinfo data info whose {@code _adaptedFrame} holds the response and (optionally) weights columns
 * @param phi   dispersion value at which the likelihood is evaluated
 * @param mu    predictions passed as the first argument to {@code TweedieEstimator.compute}
 * @return the {@code _loglikelihood} reported by the estimator
 */
private static double getTweedieLogLikelihood(GLMModel.GLMParameters parms, DataInfo dinfo, double phi, Vec mu) {
    final TweedieEstimator estimator =
            new TweedieEstimator(parms._tweedie_variance_power, phi, false, false, false, false);

    final Vec response = dinfo._adaptedFrame.vec(parms._response_column);

    final Vec weights;
    if (parms._weights_column == null) {
        // No weights column configured: substitute a vector from Vec.makeOne (presumably constant 1.0 --
        // TODO confirm), made compatible with the adapted frame.
        final Frame unitWeights = new Frame(Vec.makeOne(dinfo._adaptedFrame.numRows()));
        weights = dinfo._adaptedFrame.makeCompatible(unitWeights)[0];
    } else {
        weights = dinfo._adaptedFrame.vec(parms._weights_column);
    }

    final double logLikelihood = estimator.compute(mu, response, weights)._loglikelihood;
    Log.debug("Tweedie LogLikelihood(p=" + parms._tweedie_variance_power + ", phi=" + phi + ") = " + logLikelihood);
    return logLikelihood;
}


/**
 * Gradient-free search for the dispersion (phi) that maximizes the value returned by
 * getTweedieLogLikelihood.
 *
 * Steps performed:
 *  1. Sort the already evaluated phis and line up their log-likelihoods in the same order.
 *  2. Look for a bracket [lowerBound, upperBound] around the maximum: scan the sorted points for the first
 *     drop in log-likelihood; if none is found, keep doubling upperBound (evaluating the likelihood each
 *     time) until a drop is seen, the iteration budget is used up, or the job is asked to stop.
 *  3. Shrink the bracket using two interior points placed with the 0.618 factor until it is narrower than
 *     parms._dispersion_epsilon, the iteration budget is used up, or the job is asked to stop.
 *
 * NOTE(review): the code reads sortedPhis.get(0) and sortedPhis.get(counter - 2), so it looks like the
 * caller is expected to pass at least two evaluated points -- confirm against the caller.
 *
 * @param parms          GLM parameters; _max_iterations_dispersion and _dispersion_epsilon are read here
 * @param dinfo          forwarded to getTweedieLogLikelihood
 * @param mu             forwarded to getTweedieLogLikelihood
 * @param logLikelihoods previously evaluated log-likelihoods; entry i belongs to phis.get(i)
 * @param phis           previously evaluated dispersion values (any order)
 * @param job            polled with stop_requested() to allow early termination
 * @return the midpoint of the final [lowPhi, hiPhi] bracket
 */
private static double goldenRatioDispersionSearch(GLMModel.GLMParameters parms, DataInfo dinfo, Vec mu,
List<Double> logLikelihoods, List<Double> phis, Job job) {
// make monotonic: sort phis ascending and reorder the log-likelihoods to match
List<Double> sortedPhis = phis.stream().sorted().collect(Collectors.toList());
List<Double> sortedLLHs = new ArrayList<>();
for (int i = 0; i < sortedPhis.size(); i++) {
double phi = sortedPhis.get(i);
// indexOf returns the first occurrence, so repeated phi values all pick up the first recorded likelihood
int index = phis.indexOf(phi);
sortedLLHs.add(logLikelihoods.get(index));
}

// did we already find a region where there is the maximum?
// "increasing" stays true while no drop in log-likelihood has been observed between neighbouring phis
boolean increasing = true;
// default lower end of the bracket, used when no suitable smaller evaluated phi is picked below
double lowerBound = 1e-16;
double upperBound = sortedPhis.get(0);
for (int i = 1; i < sortedPhis.size(); i++) {
upperBound = sortedPhis.get(i);
if (sortedLLHs.get(i - 1) > sortedLLHs.get(i)) {
// first drop found at position i: the bracket ends here
increasing = false;
if (i > 2)
lowerBound = sortedPhis.get(i - 2);
else {
// otherwise prepend the default lower bound and evaluate its likelihood so both lists stay aligned
sortedPhis.add(0, lowerBound);
sortedLLHs.add(0, getTweedieLogLikelihood(parms, dinfo, lowerBound, mu));
}
break;
}
}
// counter = number of points evaluated so far; it is also used as the iteration counter below
int counter = sortedPhis.size();
// iteration budget shared by the bracketing loop and the narrowing loop
int iterationsLeft = parms._max_iterations_dispersion - 10 * counter;
while (increasing && iterationsLeft > counter && !job.stop_requested()) { // not yet
// no drop seen yet: double the upper bound and evaluate the likelihood there
counter++;
upperBound *= 2;
sortedPhis.add(upperBound);
double newLLH = getTweedieLogLikelihood(parms, dinfo, upperBound, mu);
Log.debug("Tweedie looking for the region containing the max. likelihood; upper bound = " + upperBound + "; llh = " + newLLH);
sortedLLHs.add(newLLH);
// compare the two most recently appended points
if (sortedLLHs.get(counter - 2) > sortedLLHs.get(counter - 1)) {
if (counter > 3)
lowerBound = sortedPhis.get(counter - 3);
Log.debug("Tweedie found the region containing the max. likelihood; phi lower bound = " + lowerBound + "; phi upper bound = " + upperBound);
break;
}
}

// now we should have the maximum between lowerBound and upperBound
double d = (upperBound - lowerBound) * 0.618; // (hiPhi - lowPhi)/golden ratio
double lowPhi = lowerBound;
double hiPhi = upperBound;

// start the lower interior point from the second-to-last entry of the sorted list (already evaluated)
double midLoPhi = sortedPhis.get(counter - 2);
double midLoLLH = sortedLLHs.get(counter - 2);
if (midLoPhi > upperBound) {
// that entry lies above the bracket: place the point with the 0.618 rule instead and evaluate it
midLoPhi = hiPhi - d;
midLoLLH = getTweedieLogLikelihood(parms, dinfo, midLoPhi, mu);
}
double midHiPhi = lowPhi + d;
double midHiLLH = getTweedieLogLikelihood(parms, dinfo, midHiPhi, mu);
// narrowing loop; continues counting from the number of points evaluated so far
for (; counter < iterationsLeft; counter++) {
Log.info("Tweedie golden-section search[iter=" + counter + ", phis=(" + lowPhi + ", " + midLoPhi +
", " + midHiPhi + ", " + hiPhi + "), likelihoods=(" +
"..., " + midLoLLH + ", " + midHiLLH + ", ...)]");
if (job.stop_requested()) {
return (hiPhi + lowPhi) / 2;
}
// drop the side of the bracket next to the interior point with the lower likelihood
if (midHiLLH > midLoLLH) {
lowPhi = midLoPhi;
} else {
hiPhi = midHiPhi;
}
d = (hiPhi - lowPhi) * 0.618; // (hiPhi - lowPhi)/golden ratio
// bracket is narrow enough: report its midpoint
if (hiPhi - lowPhi < parms._dispersion_epsilon) {
return (hiPhi + lowPhi) / 2;
}
// both interior points are re-placed and re-evaluated on every iteration
midLoPhi = hiPhi - d;
midHiPhi = lowPhi + d;
midLoLLH = getTweedieLogLikelihood(parms, dinfo, midLoPhi, mu);
midHiLLH = getTweedieLogLikelihood(parms, dinfo, midHiPhi, mu);
}
// iteration budget exhausted: report the midpoint of the current bracket
return (hiPhi + lowPhi) / 2;
}


/**
* This method estimates the tweedie dispersion parameter. It will use Newton's update if the new update will
* increase the loglikelihood. Otherwise, the dispersion will be updated as
* dispersionNew = dispersionCurr + learningRate * update.
* In addition, line search is used to increase the magnitude of the update when the update magnitude is too small
* (< 1e-3).
*
* For details, please see seciton IV.I, IV.II, and IV.III in document here:
* Every 10th iteration it checks if the optimization doesn't diverge. If it looks like it diverged, it uses a
* different likelihood estimation that should be more accurate (combination of Series and Fourier inversion method)
* but without gradients. For this reason it will use a Golden section search which doesn't require gradients and
* has a linear convergence.
*
* For details, please see sections IV.I, IV.II, and IV.III in document here:
*/
public static double estimateTweedieDispersionOnly(GLMModel.GLMParameters parms, GLMModel model, Job job,
double[] beta, DataInfo dinfo) {
Expand All @@ -84,27 +193,51 @@ public static double estimateTweedieDispersionOnly(GLMModel.GLMParameters parms,
long timeLeft = parms._max_runtime_secs > 0 ? (long) (parms._max_runtime_secs * 1000 - modelBuiltTime)
: Long.MAX_VALUE;
TweedieMLDispersionOnly tDispersion = new TweedieMLDispersionOnly(parms.train(), parms, model, beta, dinfo);
DispersionTask.GenPrediction gPred = new DispersionTask.GenPrediction(beta, model, dinfo).doAll(
1, Vec.T_NUM, dinfo._adaptedFrame);
Vec mu = Scope.track(gPred.outputFrame(Key.make(), new String[]{"prediction"}, null)).vec(0);

double dispersionCurr = tDispersion._dispersionParameter; // initial value of dispersion parameter
double dispersionNew;
double update;
double logLLCurr, logLLNext;
List<Double> loglikelihoodList = new ArrayList<>();
List<Double> llChangeList = new ArrayList<>();
List<Double> dispersionList = new ArrayList<>();

double bestLogLikelihoodFromSanityCheck = getTweedieLogLikelihood(parms, dinfo,dispersionCurr,mu);
List<Double> logLikelihoodSanityChecks = new ArrayList<>();
List<Double> dispersionsSanityChecks = new ArrayList<>();
logLikelihoodSanityChecks.add(bestLogLikelihoodFromSanityCheck);
dispersionsSanityChecks.add(dispersionCurr);
for (int index = 0; index < parms._max_iterations_dispersion; index++) {
tDispersion.updateDispersionP(dispersionCurr);
DispersionTask.ComputeMaxSumSeriesTsk computeTask = new DispersionTask.ComputeMaxSumSeriesTsk(tDispersion,
parms, true);
computeTask.doAll(tDispersion._infoFrame);
logLLCurr = computeTask._logLL / computeTask._nobsLL;

// record loglikelihood values
loglikelihoodList.add(logLLCurr);
dispersionList.add(dispersionCurr);
if (loglikelihoodList.size() > 1) {
llChangeList.add(loglikelihoodList.get(index) - loglikelihoodList.get(index - 1));
if ((Math.abs(llChangeList.get(llChangeList.size() - 1)) < parms._dispersion_epsilon)) {
boolean converged = (Math.abs(llChangeList.get(llChangeList.size() - 1)) < parms._dispersion_epsilon);
if (index % 10 == 0 || converged) { // do a sanity check once in a while and if we think we converged
double newLogLikelihood = getTweedieLogLikelihood(parms, dinfo, dispersionCurr, mu);
logLikelihoodSanityChecks.add(newLogLikelihood);
dispersionsSanityChecks.add(dispersionCurr);
if (newLogLikelihood < bestLogLikelihoodFromSanityCheck) {
// we are getting worse.
Log.info("Tweedie sanity check FAIL. Trying Golden-section search instead of Newton's method.");
tDispersion.cleanUp();
final double dispersion = goldenRatioDispersionSearch(parms, dinfo, mu, logLikelihoodSanityChecks, dispersionsSanityChecks, job);
Log.info("Tweedie dispersion estimate = "+dispersion);
return dispersion;
}
bestLogLikelihoodFromSanityCheck = Math.max(bestLogLikelihoodFromSanityCheck, newLogLikelihood);
Log.debug("Tweedie sanity check OK");
}

if (converged) {
tDispersion.cleanUp(); // early stop if loglikelihood has'n changed by > parms._dispersion_epsilon
Log.info("last dispersion "+dispersionCurr);
return dispersionList.get(loglikelihoodList.indexOf(Collections.max(loglikelihoodList)));
Expand All @@ -118,6 +251,8 @@ public static double estimateTweedieDispersionOnly(GLMModel.GLMParameters parms,
return Double.NaN;
}
}


// get new update to dispersion
update = computeTask._dLogLL / computeTask._d2LogLL;
if (Math.abs(update) < 1e-3) { // line search for speedup and increase magnitude of change
Expand Down
6 changes: 3 additions & 3 deletions h2o-algos/src/main/java/hex/glm/GLM.java
Original file line number Diff line number Diff line change
Expand Up @@ -2272,7 +2272,7 @@ private void fitIRLSMML(Solver s) {
if (!_parms._fix_tweedie_variance_power) {
if (!_parms._fix_dispersion_parameter) {
converged = updateTweediePandPhi(iterCnt, _state.expandBeta(betaCnd), weights, response) && converged;
Log.info("GLM Tweedie p and phi estimation: iteration = " + iterCnt + "; p = " + _parms._tweedie_variance_power + "; phi = " + _parms._init_dispersion_parameter);
Log.info("GLM Tweedie p and phi estimation: iteration = " + iterCnt + "; p = " + _parms._tweedie_variance_power + "; phi = " + _parms._dispersion_estimated);
} else {
converged = updateTweedieVariancePower(iterCnt, _state.expandBeta(betaCnd), weights, response) && converged;
Log.info("GLM Tweedie variance power estimation: iteration = " + iterCnt + "; p = " + _parms._tweedie_variance_power);
Expand Down Expand Up @@ -2455,7 +2455,7 @@ private boolean updateTweedieVariancePower(int iterCnt, double[] betaCnd, Vec we

private boolean updateTweediePandPhi(int iterCnt, double[] betaCnd, Vec weights, Vec response) {
final double originalP = _parms._tweedie_variance_power;
final double originalPhi = _parms._init_dispersion_parameter;
final double originalPhi = _parms._dispersion_estimated;
final double contractRatio = 0.5;
final double pMin = 1 + 1e-10;
final double pZeroMax = 2 - 1e-10;
Expand Down Expand Up @@ -3025,7 +3025,7 @@ else if (gaussian.equals(_parms._family) && Link.identity.equals(_parms._link))

// Make sure if we set dispersion for Tweedie p and phi estimation even without calculating p values
if (tweedie.equals(_parms._family) && !_parms._fix_dispersion_parameter && !_parms._fix_tweedie_variance_power) {
_model.setDispersion(_parms._init_dispersion_parameter, true);
_model.setDispersion(_parms._dispersion_estimated, true);
}
if (_parms._compute_p_values) { // compute p-values, standard error, estimate dispersion parameters...
double se = _parms._init_dispersion_parameter;
Expand Down
3 changes: 2 additions & 1 deletion h2o-algos/src/main/java/hex/glm/GLMMetricBuilder.java
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ public final long resDOF() {

protected void computeAIC(GLMModel gm) {
if (gm._parms._calc_like && gm._finalScoring) { // uses likelihood which is calculated for the final scoring
_aic = -2 * _log_likelihood + 2 * Arrays.stream(gm.beta()).filter(b -> b != 0).count();
_aic = 2 * _log_likelihood + 2 * Arrays.stream(gm.beta()).filter(b -> b != 0).count();
} else { // original calculation for the model build
_aic = 0;
switch (_glmf._family) {
Expand Down Expand Up @@ -246,6 +246,7 @@ protected void computeAIC(GLMModel gm) {
Vec weights = f.vec(m._parms._weights_column);
if (resp != null && Family.fractionalbinomial != _glmf._family) { // don't calculate for frac binomial
gl = new GainsLift(preds.lastVec(), resp, weights);
gl._groups = m._parms._gainslift_bins;
gl.exec(m._output._job);
}
}
Expand Down
5 changes: 3 additions & 2 deletions h2o-algos/src/main/java/hex/glm/GLMModel.java
Original file line number Diff line number Diff line change
Expand Up @@ -823,7 +823,7 @@ public final double likelihood(double yr, double ym){
yr*Math.log(_theta)+yr*Math.log(1+_theta*ym)):
((yr==0 && ym>0)?(_invTheta*Math.log(1+_theta*ym)):0)); // with everything
} else if (Family.tweedie.equals(_family) && DispersionMethod.ml.equals(_dispersion_parameter_method) && !_fix_tweedie_variance_power) {
return -TweedieEstimator.logLikelihood(yr, ym, _tweedie_variance_power, _init_dispersion_parameter);
return -TweedieEstimator.logLikelihood(yr, ym, _tweedie_variance_power, _dispersion_estimated);
} else
return .5 * deviance(yr,ym);
}
Expand Down Expand Up @@ -867,7 +867,7 @@ else if (prediction > 1) // check what are possible values?
return w * invPhiEst * log(w * yr * invPhiEst / prediction) - w * yr * invPhiEst / prediction
- log(yr) - Gamma.logGamma(w * invPhiEst);
case tweedie:
return -TweedieEstimator.logLikelihood(yr, ym[0], _tweedie_variance_power, _init_dispersion_parameter);
return -TweedieEstimator.logLikelihood(yr, ym[0], _tweedie_variance_power, _dispersion_estimated);
case multinomial:
// if probability is not given, then it is 1.0 if prediction equals to the real y and 0 othervice
double predictedProbabilityOfActualClass = ym.length > 1 ? ym[(int) yr + 1] : (prediction == yr ? 1.0 : 0.0);
Expand Down Expand Up @@ -1032,6 +1032,7 @@ public DistributionFamily getDistributionFamily() {
/**
 * Stores the supplied Tweedie settings on this parameters object.
 *
 * The dispersion value is written to both {@code _dispersion_estimated} and
 * {@code _init_dispersion_parameter}.
 *
 * @param tweedieVariancePower value assigned to {@code _tweedie_variance_power}
 * @param tweedieLinkPower     value assigned to {@code _tweedie_link_power}
 * @param dispersion           value assigned to both dispersion fields
 */
public void updateTweedieParams(double tweedieVariancePower, double tweedieLinkPower, double dispersion){
    // dispersion is kept in two places: the initial value and the estimated value
    this._init_dispersion_parameter = dispersion;
    this._dispersion_estimated = dispersion;

    this._tweedie_link_power = tweedieLinkPower;
    this._tweedie_variance_power = tweedieVariancePower;
}
} // GLMParameters
Expand Down
1 change: 1 addition & 0 deletions h2o-algos/src/main/java/hex/schemas/DeepLearningV3.java
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ public static final class DeepLearningParametersV3 extends ModelParametersSchema
"export_checkpoints_dir",
"auc_type",
"custom_metric_func",
"gainslift_bins",
};


Expand Down
3 changes: 2 additions & 1 deletion h2o-algos/src/main/java/hex/schemas/GAMV3.java
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,8 @@ public static final class GAMParametersV3 extends ModelParametersSchemaV3<GAMMod
"scale", // array, smoothing parameter for GAM,
"keep_gam_cols",
"store_knot_locations",
"auc_type"
"auc_type",
"gainslift_bins",
};

@API(help = "Seed for pseudo random number generator (if applicable)", gridable = true)
Expand Down
3 changes: 2 additions & 1 deletion h2o-algos/src/main/java/hex/schemas/GLMV3.java
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,8 @@ public static final class GLMParametersV3 extends ModelParametersSchemaV3<GLMPar
"generate_variable_inflation_factors",
"fix_tweedie_variance_power",
"dispersion_learning_rate",
"influence"
"influence",
"gainslift_bins",
};

@API(help = "Seed for pseudo random number generator (if applicable)", gridable = true)
Expand Down
3 changes: 2 additions & 1 deletion h2o-algos/src/main/java/hex/schemas/StackedEnsembleV99.java
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,8 @@ public static final class StackedEnsembleParametersV99 extends ModelParametersSc
"score_training_samples",
"keep_levelone_frame",
"export_checkpoints_dir",
"auc_type"
"auc_type",
"gainslift_bins",
};

public static class AlgorithmValuesProvider extends EnumValuesProvider<Algorithm> {
Expand Down
Loading

0 comments on commit 9f45118

Please sign in to comment.