From a6bd451606c16c6d5686ccf1d2f96933eda0688f Mon Sep 17 00:00:00 2001
From: syzonyuliia <92266161+syzonyuliia@users.noreply.github.com>
Date: Mon, 12 Feb 2024 02:56:02 +0100
Subject: [PATCH] GH-15809: fixes loglikelihood and aic for glm generic model (#16025)

* GH-15809: implement AIC and Loglikelihood calculation for GenericModel
* GH-15809: add AIC and Loglikelihood to ModelMetricsBinomial
* GH-15809: implement AIC and Loglikelihood calculations for ModelMetricsBinomial
* GH-15809: add AIC and Loglikelihood to output metrics
* GH-15809: update test to check AIC and Loglikelihood calculation for loaded model
* GH-15809: correct betas source
* GH-15809: implement AIC and loglikelihood calculation for multinomial generic glm
* GH-15809: minor aic retrieval fix
* GH-15809: enable loglikelihood and AIC calculation for multinomial family
* GH-15809: remove prints
* GH-15809: refactor
* GH-15809: add new parameter to the constructor, and add new constructor
* GH-15809: add dispersion_estimated parameter to GLM mojo
* GH-15809: update and fix tests
* GH-15809: fix metrics exposure in python
* GH-15809: fix parameters
* GH-15809: add null check
* GH-15809: fix tests
* GH-15809: fix R tests
* GH-15809: fix reading new parameter in MOJO load
* GH-15809: fix writing new parameter in MOJO load
* GH-15809: fix value
* GH-15809: fix comments
* GH-15809: fix printing metrics
* GH-15809: remove commented code
* GH-15809: assign NaN instead of 0 as placeholder value for Loglikelihood
* GH-15809: default dispersion estimation set to 1
* GH-15809: clean test
* GH-15809: fix aic check in test
* GH-15809: additionally fix aic check in test
* GH-15809: additionally fix aic check in test
* GH-15809: fit test - add default parameters
* Fixed test discrepancies.
* only return AIC and loglikelihood for glm models
* fixed AIC problem when model is not glm
* Incorporate Tomas F review.
* replace m != null && m.getClass().toString().contains(generic) with score4Generic --------- Co-authored-by: syzonyuliia Co-authored-by: wendycwong --- .../main/java/hex/generic/GenericModel.java | 52 +++++++++++++++++++ h2o-algos/src/main/java/hex/glm/GLMModel.java | 7 ++- .../src/main/java/hex/glm/GLMMojoWriter.java | 2 + h2o-core/src/main/java/hex/Model.java | 6 ++- .../main/java/hex/ModelMetricsBinomial.java | 34 ++++++++++-- .../java/hex/ModelMetricsBinomialGLM.java | 4 +- .../java/hex/ModelMetricsMultinomial.java | 35 ++++++++++++- .../ModelMetricsMultinomialGLMGeneric.java | 10 ++++ .../main/java/hex/ModelMetricsRegression.java | 32 +++++++++++- .../java/hex/ModelMetricsRegressionGLM.java | 7 +-- .../ModelMetricsBinomialGenericV3.java | 2 + .../api/schemas3/ModelMetricsBinomialV3.java | 8 +++ .../ModelMetricsMultinomialGenericV3.java | 3 ++ .../schemas3/ModelMetricsMultinomialV3.java | 9 ++++ .../ModelMetricsRegressionGenericV3.java | 9 ++++ .../schemas3/ModelMetricsRegressionV3.java | 9 ++++ .../genmodel/algos/glm/GlmMojoModelBase.java | 12 ++++- .../hex/genmodel/algos/glm/GlmMojoReader.java | 1 + h2o-py/h2o/model/metrics_base.py | 9 +++- .../pyunit_generic_model_mojo_glm.py | 40 ++++++++++++-- .../pyunit_pubdev_6413_cv_sd_fix.py | 2 + .../testdir_misc/pyunit_metric_json_check.py | 20 +++++-- .../generic/runit_generic_model_mojo_drf.R | 8 +-- .../generic/runit_generic_model_mojo_gbm.R | 8 +-- .../generic/runit_generic_model_mojo_glm.R | 10 ++-- .../runit_generic_model_mojo_xgboost.R | 8 +-- 26 files changed, 301 insertions(+), 46 deletions(-) diff --git a/h2o-algos/src/main/java/hex/generic/GenericModel.java b/h2o-algos/src/main/java/hex/generic/GenericModel.java index 24adcb746e60..e8220a7e75c6 100644 --- a/h2o-algos/src/main/java/hex/generic/GenericModel.java +++ b/h2o-algos/src/main/java/hex/generic/GenericModel.java @@ -2,12 +2,14 @@ import hex.*; import hex.genmodel.*; +import hex.genmodel.algos.glm.GlmMojoModelBase; import hex.genmodel.algos.kmeans.KMeansMojoModel; import hex.genmodel.descriptor.ModelDescriptor; import hex.genmodel.descriptor.ModelDescriptorBuilder; import hex.genmodel.easy.EasyPredictModelWrapper; import hex.genmodel.easy.RowData; import hex.genmodel.easy.exception.PredictException; +import hex.glm.GLMModel; import hex.tree.isofor.ModelMetricsAnomaly; import water.*; import water.fvec.*; @@ -42,6 +44,7 @@ public class GenericModel extends Model _genModelSource; + private GLMModel.GLMParameters _glmParameters; /** * Full constructor @@ -56,6 +59,26 @@ public GenericModel(Key selfKey, GenericModelParameters parms, Gen if (mojoModel._modelAttributes != null && mojoModel._modelAttributes.getModelParameters() != null) { _parms._modelParameters = GenericModelParameters.convertParameters(mojoModel._modelAttributes.getModelParameters()); } + _glmParameters = null; + if(_algoName.toLowerCase().contains("glm")) { + GlmMojoModelBase glmModel = (GlmMojoModelBase) mojoModel; + // create GLM parameters instance + _glmParameters = new GLMModel.GLMParameters( + GLMModel.GLMParameters.Family.valueOf(getParamByName("family").toString()), + GLMModel.GLMParameters.Link.valueOf(getParamByName("link").toString()), + Arrays.stream(getParamByName("lambda").toString().trim().replaceAll("\\[", "") + .replaceAll("\\]", "").split(",\\s*")) + .mapToDouble(Double::parseDouble).toArray(), + Arrays.stream(getParamByName("alpha").toString().trim().replaceAll("\\[", "") + .replaceAll("\\]", "").split(",\\s*")) + .mapToDouble(Double::parseDouble).toArray(), + 
Double.parseDouble(getParamByName("tweedie_variance_power").toString()), + Double.parseDouble(getParamByName("tweedie_link_power").toString()), + null, + Double.parseDouble(getParamByName("theta").toString()), + glmModel.getDispersionEstimated() + ); + } } public GenericModel(Key selfKey, GenericModelParameters parms, GenericModelOutput output, @@ -133,6 +156,35 @@ protected PredictScoreResult predictScoreImpl(Frame fr, Frame adaptFrm, String d return super.predictScoreImpl(fr, adaptFrm, destination_key, j, computeMetrics, customMetricFunc); } + private Iced getParamByName(String name) { + return Arrays.stream(this._parms._modelParameters) + .filter(p -> Objects.equals(p.name, name)).findAny().get().actual_value; + } + + @Override + public double aic(double likelihood) { + // calculate negative loglikelihood specifically for GLM + if (!_algoName.equals("glm")) { + return Double.NaN; + } else { + long betasCount = Arrays.stream(((GlmMojoModelBase) this.genModel()).getBeta()).filter(b -> b != 0).count(); + return -2 * likelihood + 2 * betasCount; + } + } + + @Override + public double likelihood(double w, double y, double[] f) { + // calculate negative loglikelihood specifically for GLM + if(!_algoName.equals("glm")) { + return Double.NaN; + } else if (w == 0) { + return 0; + } else { + // time-consuming calculation for the final scoring for GLM model + return _glmParameters.likelihood(w, y, f); + } + } + PredictScoreResult predictScoreMojoImpl(Frame fr, String destination_key, Job j, boolean computeMetrics) { GenModel model = genModel(); String[] names = model.getOutputNames(); diff --git a/h2o-algos/src/main/java/hex/glm/GLMModel.java b/h2o-algos/src/main/java/hex/glm/GLMModel.java index 361759e91008..f4b71c1a1809 100755 --- a/h2o-algos/src/main/java/hex/glm/GLMModel.java +++ b/h2o-algos/src/main/java/hex/glm/GLMModel.java @@ -727,6 +727,11 @@ public GLMParameters(Family f, Link l, double [] lambda, double [] alpha, double public GLMParameters(Family f, Link l, double [] lambda, double [] alpha, double twVar, double twLnk, String[] interactions, double theta){ + this(f,l,lambda,alpha,twVar,twLnk,interactions, theta, Double.NaN); + } + + public GLMParameters(Family f, Link l, double [] lambda, double [] alpha, double twVar, double twLnk, + String[] interactions, double theta, double dispersion_estimated){ this._lambda = lambda; this._alpha = alpha; this._tweedie_variance_power = twVar; @@ -736,7 +741,7 @@ public GLMParameters(Family f, Link l, double [] lambda, double [] alpha, double _link = l; this._theta=theta; this._invTheta = 1.0/theta; - this._dispersion_estimated = _init_dispersion_parameter; + this._dispersion_estimated = Double.isNaN(dispersion_estimated) ? _init_dispersion_parameter : dispersion_estimated; } public final double variance(double mu){ diff --git a/h2o-algos/src/main/java/hex/glm/GLMMojoWriter.java b/h2o-algos/src/main/java/hex/glm/GLMMojoWriter.java index 788e4949a96e..f80e5b8a991e 100644 --- a/h2o-algos/src/main/java/hex/glm/GLMMojoWriter.java +++ b/h2o-algos/src/main/java/hex/glm/GLMMojoWriter.java @@ -39,6 +39,8 @@ protected void writeModelData() throws IOException { if (GLMModel.GLMParameters.Family.tweedie.equals(model._parms._family)) writekv("tweedie_link_power", model._parms._tweedie_link_power); + + writekv("dispersion_estimated", (model._parms._compute_p_values ? 
model._parms._dispersion_estimated : 1.0)); } } diff --git a/h2o-core/src/main/java/hex/Model.java b/h2o-core/src/main/java/hex/Model.java index 1887e9f2948f..33db02a7ebce 100755 --- a/h2o-core/src/main/java/hex/Model.java +++ b/h2o-core/src/main/java/hex/Model.java @@ -1381,7 +1381,11 @@ public double deviance(double w, double y, double f) { } public double likelihood(double w, double y, double[] f) { - return 0.0; // place holder. This function is overridden in GLM. + return Double.NaN; // placeholder. This function is overridden in GLM and GenericModel. + } + + public double aic(double likelihood) { + return Double.NaN; // placeholder. This function is overridden in GenericModel. } public ScoringInfo[] scoring_history() { return scoringInfo; } diff --git a/h2o-core/src/main/java/hex/ModelMetricsBinomial.java b/h2o-core/src/main/java/hex/ModelMetricsBinomial.java index 04a40a3f5ace..029a68388ef5 100755 --- a/h2o-core/src/main/java/hex/ModelMetricsBinomial.java +++ b/h2o-core/src/main/java/hex/ModelMetricsBinomial.java @@ -10,7 +10,6 @@ import water.fvec.Frame; import water.fvec.Vec; import water.util.ArrayUtils; -import water.util.Log; import water.util.MathUtils; import java.util.Arrays; @@ -19,19 +18,30 @@ public class ModelMetricsBinomial extends ModelMetricsSupervised { public final AUC2 _auc; public final double _logloss; + public final double _loglikelihood; + public final double _aic; public double _mean_per_class_error; public final GainsLift _gainsLift; public ModelMetricsBinomial(Model model, Frame frame, long nobs, double mse, String[] domain, - double sigma, AUC2 auc, double logloss, GainsLift gainsLift, + double sigma, AUC2 auc, double logloss, double loglikelihood, double aic, GainsLift gainsLift, CustomMetric customMetric) { super(model, frame, nobs, mse, domain, sigma, customMetric); _auc = auc; _logloss = logloss; + _loglikelihood = loglikelihood; + _aic = aic; _gainsLift = gainsLift; _mean_per_class_error = cm() == null ? Double.NaN : cm().mean_per_class_error(); } + public ModelMetricsBinomial(Model model, Frame frame, long nobs, double mse, String[] domain, + double sigma, AUC2 auc, double logloss, GainsLift gainsLift, + CustomMetric customMetric) { + this(model, frame, nobs, mse, domain, sigma, auc, logloss, Double.NaN, Double.NaN, + gainsLift, customMetric); + } + public static ModelMetricsBinomial getFromDKV(Model model, Frame frame) { ModelMetrics mm = ModelMetrics.getFromDKV(model, frame); if( !(mm instanceof ModelMetricsBinomial) ) @@ -49,6 +59,8 @@ public String toString() { sb.append(" pr_auc: " + (float)_auc.pr_auc() + "\n"); } sb.append(" logloss: " + (float)_logloss + "\n"); + sb.append(" loglikelihood: " + (float)_loglikelihood + "\n"); + sb.append(" AIC: " + (float)_aic + "\n"); sb.append(" mean_per_class_error: " + (float)_mean_per_class_error + "\n"); sb.append(" default threshold: " + (_auc == null ? 
0.5 : (float)_auc.defaultThreshold()) + "\n"); if (cm() != null) sb.append(" CM: " + cm().toASCII()); @@ -57,6 +69,8 @@ public String toString() { } public double logloss() { return _logloss; } + public double loglikelihood() { return _loglikelihood; } + public double aic() { return _aic; } public double mean_per_class_error() { return _mean_per_class_error; } @Override public AUC2 auc_obj() { return _auc; } @Override public ConfusionMatrix cm() { @@ -161,6 +175,7 @@ private static class BinomialMetrics extends MRTask { public static class MetricBuilderBinomial> extends MetricBuilderSupervised { protected double _logloss; + protected double _loglikelihood; protected AUC2.AUCBuilder _auc; public MetricBuilderBinomial( String[] domain ) { super(2,domain); _auc = new AUC2.AUCBuilder(AUC2.NBINS); } @@ -177,6 +192,7 @@ public static class MetricBuilderBinomial> ex if(w == 0 || Double.isNaN(w)) return ds; int iact = (int)yact[0]; boolean quasibinomial = (m!=null && m._parms._distribution == DistributionFamily.quasibinomial); + boolean score4Generic = m != null && m.getClass().toString().contains("Generic"); if (quasibinomial) { if (yact[0] != 0) iact = _domain[0].equals(String.valueOf((int) yact[0])) ? 0 : 1; // actual response index needed for confusion matrix, AUC, etc. @@ -197,6 +213,11 @@ public static class MetricBuilderBinomial> ex // Compute log loss _logloss += w * MathUtils.logloss(err); } + + if(score4Generic) { // only perform for generic model, will increase run time for training if performs + _loglikelihood += m.likelihood(w, yact[0], ds); + } + _count++; _wcount += w; assert !Double.isNaN(_sumsqe); @@ -207,6 +228,7 @@ public static class MetricBuilderBinomial> ex @Override public void reduce( T mb ) { super.reduce(mb); // sumseq, count _logloss += mb._logloss; + _loglikelihood += mb._loglikelihood; _auc.reduce(mb._auc); } @@ -256,6 +278,8 @@ private ModelMetrics makeModelMetrics(final Model m, final Frame f, final Frame private ModelMetrics makeModelMetrics(Model m, Frame f, GainsLift gl) { double mse = Double.NaN; + double loglikelihood = Double.NaN; + double aic = Double.NaN; double logloss = Double.NaN; double sigma = Double.NaN; final AUC2 auc; @@ -263,11 +287,15 @@ private ModelMetrics makeModelMetrics(Model m, Frame f, GainsLift gl) { sigma = weightedSigma(); mse = _sumsqe / _wcount; logloss = _logloss / _wcount; + if(m != null && m.getClass().toString().contains("Generic")) { + loglikelihood = -1 * _loglikelihood ; // get likelihood from negative loglikelihood + aic = m.aic(loglikelihood); + } auc = new AUC2(_auc); } else { auc = new AUC2(); } - ModelMetricsBinomial mm = new ModelMetricsBinomial(m, f, _count, mse, _domain, sigma, auc, logloss, gl, _customMetric); + ModelMetricsBinomial mm = new ModelMetricsBinomial(m, f, _count, mse, _domain, sigma, auc, logloss, loglikelihood, aic, gl, _customMetric); if (m!=null) m.addModelMetrics(mm); return mm; } diff --git a/h2o-core/src/main/java/hex/ModelMetricsBinomialGLM.java b/h2o-core/src/main/java/hex/ModelMetricsBinomialGLM.java index 2fbcaa13c0b6..501b54f04dcf 100644 --- a/h2o-core/src/main/java/hex/ModelMetricsBinomialGLM.java +++ b/h2o-core/src/main/java/hex/ModelMetricsBinomialGLM.java @@ -14,7 +14,7 @@ public ModelMetricsBinomialGLM(Model model, Frame frame, long nobs, double mse, double sigma, AUC2 auc, double logloss, double resDev, double nullDev, double aic, long nDof, long rDof, GainsLift gainsLift, CustomMetric customMetric, double loglikelihood) { - super(model, frame, nobs, mse, domain, sigma, auc, logloss, 
gainsLift, customMetric); + super(model, frame, nobs, mse, domain, sigma, auc, logloss, loglikelihood, aic, gainsLift, customMetric); _resDev = resDev; _nullDev = nullDev; _AIC = aic; @@ -70,7 +70,7 @@ public ModelMetricsMultinomialGLM(Model model, Frame frame, long nobs, double ms double sigma, ConfusionMatrix cm, float [] hr, double logloss, double resDev, double nullDev, double aic, long nDof, long rDof, MultinomialAUC auc, CustomMetric customMetric, double loglikelihood) { - super(model, frame, nobs, mse, domain, sigma, cm, hr, logloss, auc, customMetric); + super(model, frame, nobs, mse, domain, sigma, cm, hr, logloss, loglikelihood, aic, auc, customMetric); _resDev = resDev; _nullDev = nullDev; _AIC = aic; diff --git a/h2o-core/src/main/java/hex/ModelMetricsMultinomial.java b/h2o-core/src/main/java/hex/ModelMetricsMultinomial.java index 7c47205d291f..a15e33b2d6ea 100755 --- a/h2o-core/src/main/java/hex/ModelMetricsMultinomial.java +++ b/h2o-core/src/main/java/hex/ModelMetricsMultinomial.java @@ -18,23 +18,38 @@ public class ModelMetricsMultinomial extends ModelMetricsSupervised { public final float[] _hit_ratios; // Hit ratios public final ConfusionMatrix _cm; public final double _logloss; + public final double _loglikelihood; + public final double _aic; public double _mean_per_class_error; public MultinomialAUC _auc; - public ModelMetricsMultinomial(Model model, Frame frame, long nobs, double mse, String[] domain, double sigma, ConfusionMatrix cm, float[] hr, double logloss, MultinomialAUC auc, CustomMetric customMetric) { + public ModelMetricsMultinomial(Model model, Frame frame, long nobs, double mse, String[] domain, double sigma, + ConfusionMatrix cm, float[] hr, double logloss, double loglikelihood, double aic, + MultinomialAUC auc, CustomMetric customMetric) { super(model, frame, nobs, mse, domain, sigma, customMetric); _cm = cm; _hit_ratios = hr; _logloss = logloss; + _loglikelihood = loglikelihood; + _aic = aic; _mean_per_class_error = cm==null || cm.tooLarge() ? 
Double.NaN : cm.mean_per_class_error(); _auc = auc; } + public ModelMetricsMultinomial(Model model, Frame frame, long nobs, double mse, String[] domain, double sigma, + ConfusionMatrix cm, float[] hr, double logloss, MultinomialAUC auc, + CustomMetric customMetric) { + this(model, frame, nobs, mse, domain, sigma, cm, hr, logloss, Double.NaN, Double.NaN, auc, customMetric); + + } + @Override public String toString() { StringBuilder sb = new StringBuilder(); sb.append(super.toString()); sb.append(" logloss: " + (float)_logloss + "\n"); + sb.append(" loglikelihood: " + (float)_loglikelihood + "\n"); + sb.append(" AIC: " + (float)_aic + "\n"); sb.append(" mean_per_class_error: " + (float)_mean_per_class_error + "\n"); sb.append(" hit ratios: " + Arrays.toString(_hit_ratios) + "\n"); sb.append(" AUC: "+auc()+ "\n"); @@ -59,6 +74,8 @@ public String toString() { } public double logloss() { return _logloss; } + public double loglikelihood() { return _loglikelihood; } + public double aic() { return _aic; } public double mean_per_class_error() { return _mean_per_class_error; } @Override public ConfusionMatrix cm() { return _cm; } @Override public float[] hr() { return _hit_ratios; } @@ -235,6 +252,7 @@ public static class MetricBuilderMultinomial Distribution _dist; double _abserror; double _rmslerror; + protected double _loglikelihood; public MetricBuilderRegression() { super(1,null); //this will make _work = new float[2]; } @@ -128,6 +143,7 @@ public MetricBuilderRegression(Distribution dist) { // ds[0] has the prediction and ds[1,..,N] is ignored @Override public double[] perRow(double ds[], float[] yact, Model m) {return perRow(ds, yact, 1, 0, m);} @Override public double[] perRow(double ds[], float[] yact, double w, double o, Model m) { + boolean score4Generic = m != null && m.getClass().toString().contains("Generic"); if( Float.isNaN(yact[0]) ) return ds; // No errors if actual is missing if(ArrayUtils.hasNaNs(ds)) return ds; // No errors if prediction has missing values (can happen for GLM) if(w == 0 || Double.isNaN(w)) return ds; @@ -147,6 +163,10 @@ public MetricBuilderRegression(Distribution dist) { _sumdeviance += _dist.deviance(w, yact[0], ds[0]); } } + + if(score4Generic) { // only perform for generic model, will increase run time for training if performs + _loglikelihood += m.likelihood(w, yact[0], ds); + } _count++; _wcount += w; @@ -160,6 +180,7 @@ public MetricBuilderRegression(Distribution dist) { _sumdeviance += mb._sumdeviance; _abserror += mb._abserror; _rmslerror += mb._rmslerror; + _loglikelihood += mb._loglikelihood; } // Having computed a MetricBuilder, this method fills in a ModelMetrics @@ -173,6 +194,8 @@ ModelMetricsRegression computeModelMetrics(Model m, Frame f, Frame adaptedFrame, double mse = _sumsqe / _wcount; double mae = _abserror/_wcount; //Mean Absolute Error double rmsle = Math.sqrt(_rmslerror/_wcount); //Root Mean Squared Log Error + double loglikelihood = Double.NaN; + double aic = Double.NaN; if (adaptedFrame ==null) adaptedFrame = f; double meanResDeviance = 0; if (m != null && m.isDistributionHuber()){ @@ -195,7 +218,12 @@ ModelMetricsRegression computeModelMetrics(Model m, Frame f, Frame adaptedFrame, } else { meanResDeviance = Double.NaN; } - ModelMetricsRegression mm = new ModelMetricsRegression(m, f, _count, mse, weightedSigma(), mae, rmsle, meanResDeviance, _customMetric); + if(m != null && m.getClass().toString().contains("Generic")) { + loglikelihood = -1 * _loglikelihood ; // get likelihood from negative loglikelihood + aic = m.aic(loglikelihood); 
+ } + ModelMetricsRegression mm = new ModelMetricsRegression(m, f, _count, mse, weightedSigma(), mae, rmsle, + meanResDeviance, _customMetric, loglikelihood, aic); return mm; } } diff --git a/h2o-core/src/main/java/hex/ModelMetricsRegressionGLM.java b/h2o-core/src/main/java/hex/ModelMetricsRegressionGLM.java index de12cb321c32..39401e6c7fc8 100644 --- a/h2o-core/src/main/java/hex/ModelMetricsRegressionGLM.java +++ b/h2o-core/src/main/java/hex/ModelMetricsRegressionGLM.java @@ -10,20 +10,17 @@ public class ModelMetricsRegressionGLM extends ModelMetricsRegression implements public final long _residualDegressOfFreedom; public final double _resDev; public final double _nullDev; - public final double _AIC; - public final double _loglikelihood; + public ModelMetricsRegressionGLM(Model model, Frame frame, long nobs, double mse, double sigma, double mae, double rmsle, double resDev, double meanResDev, double nullDev, double aic, long nDof, long rDof, CustomMetric customMetric, double loglikelihood) { - super(model, frame, nobs, mse, sigma, mae, rmsle, meanResDev, customMetric); + super(model, frame, nobs, mse, sigma, mae, rmsle, meanResDev, customMetric, loglikelihood, aic); _resDev = resDev; _nullDev = nullDev; - _AIC = aic; _nullDegressOfFreedom = nDof; _residualDegressOfFreedom = rDof; - _loglikelihood = loglikelihood; } @Override diff --git a/h2o-core/src/main/java/water/api/schemas3/ModelMetricsBinomialGenericV3.java b/h2o-core/src/main/java/water/api/schemas3/ModelMetricsBinomialGenericV3.java index 5e04418baaba..f774bf82a5e5 100644 --- a/h2o-core/src/main/java/water/api/schemas3/ModelMetricsBinomialGenericV3.java +++ b/h2o-core/src/main/java/water/api/schemas3/ModelMetricsBinomialGenericV3.java @@ -10,6 +10,8 @@ public S fillFromImpl(ModelMetricsBinomialGeneric modelMetrics) { super.fillFromImpl(modelMetrics); r2 = modelMetrics.r2(); logloss = modelMetrics._logloss; + loglikelihood = modelMetrics._loglikelihood; + AIC = modelMetrics._aic; if (modelMetrics != null && modelMetrics._confusion_matrix != null) { final ConfusionMatrixV3 convertedConfusionMatrix = new ConfusionMatrixV3(); diff --git a/h2o-core/src/main/java/water/api/schemas3/ModelMetricsBinomialV3.java b/h2o-core/src/main/java/water/api/schemas3/ModelMetricsBinomialV3.java index c1e14a58b14e..0f6deb6fad6a 100644 --- a/h2o-core/src/main/java/water/api/schemas3/ModelMetricsBinomialV3.java +++ b/h2o-core/src/main/java/water/api/schemas3/ModelMetricsBinomialV3.java @@ -18,6 +18,12 @@ public class ModelMetricsBinomialV3= 1.1; } diff --git a/h2o-genmodel/src/main/java/hex/genmodel/algos/glm/GlmMojoReader.java b/h2o-genmodel/src/main/java/hex/genmodel/algos/glm/GlmMojoReader.java index f2f27748eacc..5dca5620ad85 100644 --- a/h2o-genmodel/src/main/java/hex/genmodel/algos/glm/GlmMojoReader.java +++ b/h2o-genmodel/src/main/java/hex/genmodel/algos/glm/GlmMojoReader.java @@ -29,6 +29,7 @@ protected void readModelData() throws IOException { _model._beta = readkv("beta"); _model._family = readkv("family"); + _model._dispersion_estimated = readkv("dispersion_estimated", 1.0); if (_model instanceof GlmMojoModel) { GlmMojoModel m = (GlmMojoModel) _model; diff --git a/h2o-py/h2o/model/metrics_base.py b/h2o-py/h2o/model/metrics_base.py index 63d5e2d240a2..eccf94c105ff 100644 --- a/h2o-py/h2o/model/metrics_base.py +++ b/h2o-py/h2o/model/metrics_base.py @@ -4,6 +4,7 @@ :copyright: (c) 2016 H2O.ai :license: Apache License Version 2.0 (see LICENSE for details) """ +import math from collections import OrderedDict from h2o.display import H2ODisplay, 
display, repr_def, format_to_html, format_to_multiline @@ -151,7 +152,13 @@ def _str_items(self, verbosity=None): "Null deviance: {}".format(self.null_deviance()), "Residual deviance: {}".format(self.residual_deviance()), ]) - if is_type(self.aic(), numeric): items.append("AIC: {}".format(self.aic())) + + if m_is_glm: + if is_type(self.aic(), numeric) and not math.isnan(self.aic()) and self.aic() != 0: + items.append("AIC: {}".format(self.aic())) + if is_type(self.loglikelihood(), numeric) and not math.isnan(self.loglikelihood()) and self.loglikelihood() != 0: + items.append("Loglikelihood: {}".format(self.loglikelihood())) + items.extend(self._str_items_custom()) return items diff --git a/h2o-py/tests/testdir_generic_model/pyunit_generic_model_mojo_glm.py b/h2o-py/tests/testdir_generic_model/pyunit_generic_model_mojo_glm.py index 258ffa556f1d..f67ed2737b16 100644 --- a/h2o-py/tests/testdir_generic_model/pyunit_generic_model_mojo_glm.py +++ b/h2o-py/tests/testdir_generic_model/pyunit_generic_model_mojo_glm.py @@ -1,3 +1,5 @@ +import math + import tempfile import os import sys @@ -14,10 +16,13 @@ def test(x, y, output_test, strip_part, algo_name, generic_algo_name, family): # GLM airlines = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/airlines_train.csv")) - glm = H2OGeneralizedLinearEstimator(nfolds = 2, family = family, max_iterations=2) # alpha = 1, lambda_ = 1, bad values, use default - glm.train(x = x, y = y, training_frame=airlines, validation_frame=airlines, ) + glm = H2OGeneralizedLinearEstimator(nfolds=2, family=family, max_iterations=2, + compute_p_values=(family == "gaussian"), + remove_collinear_columns=(family == "gaussian"), seed=12345) # alpha = 1, lambda_ = 1, bad values, use default + glm.train(x=x, y=y, training_frame=airlines, validation_frame=airlines,) with H2OTableDisplay.pandas_rendering_enabled(False), capture_output() as (original_output, _): glm.show() + print("*************** GLM model metrics") print(original_output.getvalue()) original_model_filename = tempfile.mkdtemp() original_model_filename = glm.download_mojo(original_model_filename) @@ -26,11 +31,15 @@ def test(x, y, output_test, strip_part, algo_name, generic_algo_name, family): assert generic_mojo_model_from_file is not None with H2OTableDisplay.pandas_rendering_enabled(False), capture_output() as (generic_output, _): generic_mojo_model_from_file.show() + print("*************** GLM generic model metrics") print(generic_output.getvalue()) compare_params(glm, generic_mojo_model_from_file) output_test(original_output.getvalue(), generic_output.getvalue(), strip_part, algo_name, generic_algo_name) - predictions = generic_mojo_model_from_file.predict(airlines) + + airlines_metrics_dataset = h2o.import_file(path=pyunit_utils.locate("smalldata/testng/airlines_train.csv")) + predictions = generic_mojo_model_from_file.predict(airlines_metrics_dataset) + metrics = generic_mojo_model_from_file.model_performance(airlines_metrics_dataset) assert predictions is not None assert predictions.nrows == 24421 assert generic_mojo_model_from_file._model_json["output"]["model_summary"] is not None @@ -38,31 +47,52 @@ def test(x, y, output_test, strip_part, algo_name, generic_algo_name, family): assert generic_mojo_model_from_file._model_json["output"]["variable_importances"] is not None assert len(generic_mojo_model_from_file._model_json["output"]["variable_importances"]._cell_values) > 0 + print(generic_mojo_model_from_file._model_json["output"]["training_metrics"]) generic_mojo_filename = 
tempfile.mkdtemp("zip", "genericMojo"); generic_mojo_filename = generic_mojo_model_from_file.download_mojo(path=generic_mojo_filename) assert os.path.getsize(generic_mojo_filename) == os.path.getsize(original_model_filename) + if family != 'ordinal': # loglikelihood calculation not available for ordinal family yet + glm_calc_like = H2OGeneralizedLinearEstimator(nfolds=2, family=family, max_iterations=2, calc_like=True, + compute_p_values=(family == "gaussian"), + remove_collinear_columns=(family == "gaussian"), seed=12345) + glm_calc_like.train(x=x, y=y, training_frame=airlines_metrics_dataset, validation_frame=airlines_metrics_dataset) + + print("glm training metrics:") + print(glm._model_json["output"]["training_metrics"]) + print("glm calc like training metrics:") + print(glm_calc_like._model_json["output"]["training_metrics"]) + print("metrics:") + print(metrics) + + assert math.isclose(glm_calc_like._model_json["output"]["training_metrics"]._metric_json["AIC"], + metrics._metric_json["AIC"], rel_tol=1e-6), "The numbers are not close enough." + assert math.isclose(-glm_calc_like._model_json["output"]["training_metrics"]._metric_json["loglikelihood"], + metrics._metric_json["loglikelihood"], rel_tol=1e-6), "The numbers are not close enough." + def mojo_model_test_binomial(): test(["Origin", "Dest"], "IsDepDelayed", compare_output, 'GLM Model: summary', 'ModelMetricsBinomialGLM: glm', 'ModelMetricsBinomialGLMGeneric: generic', 'binomial') + print("completed binomial tests.") def mojo_model_test_regression(): test(["Origin", "Dest"], "Distance", compare_output, 'GLM Model: summary', 'ModelMetricsRegressionGLM: glm', 'ModelMetricsRegressionGLMGeneric: generic', 'gaussian') - + print("completed Gaussian tests.") def mojo_model_test_multinomial(): test(["Origin", "Distance"], "Dest", compare_output, 'GLM Model: summary', 'ModelMetricsMultinomialGLM: glm', 'ModelMetricsMultinomialGLMGeneric: generic', 'multinomial') + print("completed Multinomial tests.") def mojo_model_test_ordinal(): test(["Origin", "Distance", "IsDepDelayed"], "fDayOfWeek", compare_output, 'GLM Model: summary', 'ModelMetricsOrdinalGLM: glm', 'ModelMetricsOrdinalGLMGeneric: generic', 'ordinal') - + pyunit_utils.run_tests([ mojo_model_test_binomial, diff --git a/h2o-py/tests/testdir_jira/pyunit_pubdev_6413_cv_sd_fix.py b/h2o-py/tests/testdir_jira/pyunit_pubdev_6413_cv_sd_fix.py index 247604c1a235..8912db6cbe91 100644 --- a/h2o-py/tests/testdir_jira/pyunit_pubdev_6413_cv_sd_fix.py +++ b/h2o-py/tests/testdir_jira/pyunit_pubdev_6413_cv_sd_fix.py @@ -50,6 +50,8 @@ def assertMeanSDCalculation(meanCol, stdCol, cvVals, tol=1e-6): xsum += temp xsumSquare += temp*temp xmean = xsum/nfolds + if math.isnan(xmean) and math.isnan(float(meanCol[itemIndex])): + continue assert abs(xmean-float(meanCol[itemIndex])) < tol, "Expected mean: {0}, Actual mean: {1}".format(xmean, float(meanCol[itemIndex])) xstd = math.sqrt((xsumSquare-nfolds*xmean*xmean)*oneOverNm1) assert abs(xstd-float(stdCol[itemIndex])) < tol, "Expected SD: {0}, Actual SD: {1}".format(xstd, float(stdCol[itemIndex])) diff --git a/h2o-py/tests/testdir_misc/pyunit_metric_json_check.py b/h2o-py/tests/testdir_misc/pyunit_metric_json_check.py index c28dc8148e9f..5564a60c7f96 100644 --- a/h2o-py/tests/testdir_misc/pyunit_metric_json_check.py +++ b/h2o-py/tests/testdir_misc/pyunit_metric_json_check.py @@ -36,7 +36,9 @@ def metric_json_check(): u'nobs', u'mean_residual_deviance', u'custom_metric_name', - u'custom_metric_value'] + u'custom_metric_value', + u'loglikelihood', + u'AIC'] 
reg_metric_diff = list(set(reg_metric_json_keys_have) - set(reg_metric_json_keys_desired)) assert not reg_metric_diff, "There's a difference between the current ({0}) and the desired ({1}) regression " \ "metric json. The difference is {2}".format(reg_metric_json_keys_have, @@ -72,7 +74,9 @@ def metric_json_check(): u'residual_deviance', u'mean_residual_deviance', u'custom_metric_name', - u'custom_metric_value'] + u'custom_metric_value', + u'loglikelihood', + u'AIC'] reg_metric_diff = list(set(reg_metric_json_keys_have) - set(reg_metric_json_keys_desired)) assert not reg_metric_diff, "There's a difference between the current ({0}) and the desired ({1}) glm-regression " \ "metric json. The difference is {2}".format(reg_metric_json_keys_have, @@ -111,7 +115,9 @@ def metric_json_check(): u'domain', u'custom_metric_name', u'custom_metric_value', - u'pr_auc'] + u'pr_auc', + u'loglikelihood', + u'AIC'] bin_metric_diff = list(set(bin_metric_json_keys_have) - set(bin_metric_json_keys_desired)) assert not bin_metric_diff, "There's a difference between the current ({0}) and the desired ({1}) binomial " \ "metric json. The difference is {2}".format(bin_metric_json_keys_have, @@ -154,7 +160,9 @@ def metric_json_check(): u'domain', u'custom_metric_name', u'custom_metric_value', - u'pr_auc'] + u'pr_auc', + u'loglikelihood', + u'AIC'] bin_metric_diff = list(set(bin_metric_json_keys_have) - set(bin_metric_json_keys_desired)) assert not bin_metric_diff, "There's a difference between the current ({0}) and the desired ({1}) glm-binomial " \ "metric json. The difference is {2}".format(bin_metric_json_keys_have, @@ -194,7 +202,9 @@ def metric_json_check(): u'duration_in_ms', u'frame_checksum', u'custom_metric_name', - u'custom_metric_value'] + u'custom_metric_value', + u'loglikelihood', + u'AIC'] mul_metric_diff = list(set(mul_metric_json_keys_have) - set(mul_metric_json_keys_desired)) assert not mul_metric_diff, "There's a difference between the current ({0}) and the desired ({1}) multinomial " \ "metric json. 
The difference is {2}".format(mul_metric_json_keys_have, diff --git a/h2o-r/tests/testdir_algos/generic/runit_generic_model_mojo_drf.R b/h2o-r/tests/testdir_algos/generic/runit_generic_model_mojo_drf.R index 39c74e602288..6472fac40c5b 100644 --- a/h2o-r/tests/testdir_algos/generic/runit_generic_model_mojo_drf.R +++ b/h2o-r/tests/testdir_algos/generic/runit_generic_model_mojo_drf.R @@ -21,8 +21,8 @@ test.model.generic.drf <- function() { original_output <- capture.output(print(original_model)) generic_output <- capture.output(print(generic_model)) compare_output(original_output, generic_output, - c("Extract .+ frame","H2OBinomialModel: drf", "Model ID", "H2OBinomialMetrics: drf"), - c("H2OBinomialModel: generic", "Model ID", "H2OBinomialMetrics: generic")) + c("Extract .+ frame","H2OBinomialModel: drf", "Model ID", "H2OBinomialMetrics: drf", "AIC"), + c("H2OBinomialModel: generic", "Model ID", "H2OBinomialMetrics: generic", "AIC", "loglikelihood")) generic_model_preds <- h2o.predict(generic_model, data) expect_equal(length(generic_model_preds), 3) @@ -45,8 +45,8 @@ test.model.generic.drf <- function() { original_output <- capture.output(print(original_model)) generic_output <- capture.output(print(generic_model)) compare_output(original_output, generic_output, - c("Extract .+ frame", "H2OMultinomialModel: drf", "Model ID", "H2OMultinomialMetrics: drf"), - c("H2OMultinomialModel: generic", "Model ID", "H2OMultinomialMetrics: generic")) + c("Extract .+ frame", "H2OMultinomialModel: drf", "Model ID", "H2OMultinomialMetrics: drf", "AIC"), + c("H2OMultinomialModel: generic", "Model ID", "H2OMultinomialMetrics: generic", "AIC")) # Regression cols <- c("Origin", "Dest") diff --git a/h2o-r/tests/testdir_algos/generic/runit_generic_model_mojo_gbm.R b/h2o-r/tests/testdir_algos/generic/runit_generic_model_mojo_gbm.R index a625fb15feb7..15155063634c 100644 --- a/h2o-r/tests/testdir_algos/generic/runit_generic_model_mojo_gbm.R +++ b/h2o-r/tests/testdir_algos/generic/runit_generic_model_mojo_gbm.R @@ -16,8 +16,8 @@ test.model.generic.gbm <- function() { original_output <- capture.output(print(original_model)) generic_output <- capture.output(print(generic_model)) compare_output(original_output, generic_output, - c("Extract .+ frame","H2OBinomialModel: gbm", "Model ID", "H2OBinomialMetrics: gbm"), - c("H2OBinomialModel: generic", "Model ID", "H2OBinomialMetrics: generic")) + c("Extract .+ frame","H2OBinomialModel: gbm", "Model ID", "H2OBinomialMetrics: gbm", "AIC"), + c("H2OBinomialModel: generic", "Model ID", "H2OBinomialMetrics: generic", "AIC")) generic_model_preds <- h2o.predict(generic_model, data) expect_equal(length(generic_model_preds), 3) @@ -54,8 +54,8 @@ test.model.generic.gbm <- function() { original_output <- capture.output(print(original_model)) generic_output <- capture.output(print(generic_model)) compare_output(original_output, generic_output, - c("Extract .+ frame", "H2OMultinomialModel: gbm", "Model ID", "H2OMultinomialMetrics: gbm"), - c("H2OMultinomialModel: generic", "Model ID", "H2OMultinomialMetrics: generic")) + c("Extract .+ frame", "H2OMultinomialModel: gbm", "Model ID", "H2OMultinomialMetrics: gbm", "AIC"), + c("H2OMultinomialModel: generic", "Model ID", "H2OMultinomialMetrics: generic", "AIC")) } diff --git a/h2o-r/tests/testdir_algos/generic/runit_generic_model_mojo_glm.R b/h2o-r/tests/testdir_algos/generic/runit_generic_model_mojo_glm.R index 5f3c7f317474..b8441b5971f1 100644 --- a/h2o-r/tests/testdir_algos/generic/runit_generic_model_mojo_glm.R +++ 
b/h2o-r/tests/testdir_algos/generic/runit_generic_model_mojo_glm.R @@ -46,9 +46,8 @@ test.model.generic.glm <- function() { original_output <- capture.output(print(original_model)) generic_output <- capture.output(print(generic_model)) compare_output(original_output, generic_output, - c("Extract .+ frame","H2OBinomialModel: glm", "Model ID", "H2OBinomialMetrics: glm"), - c("H2OBinomialModel: generic", "Model ID", "H2OBinomialMetrics: generic")) - + c("Extract .+ frame","H2OBinomialModel: glm", "Model ID", "H2OBinomialMetrics: glm", "AIC"), + c("H2OBinomialModel: generic", "Model ID", "H2OBinomialMetrics: generic", "AIC")) # Multinomial cols <- c("Origin", "Distance") @@ -64,9 +63,8 @@ test.model.generic.glm <- function() { original_output <- capture.output(print(original_model)) generic_output <- capture.output(print(generic_model)) compare_output(original_output, generic_output, - c("Extract .+ frame","H2OMultinomialModel: glm", "Model ID", "H2OMultinomialMetrics: glm"), - c("H2OMultinomialModel: generic", "Model ID", "H2OMultinomialMetrics: generic")) - + c("Extract .+ frame","H2OMultinomialModel: glm", "Model ID", "H2OMultinomialMetrics: glm", "AIC"), + c("H2OMultinomialModel: generic", "Model ID", "H2OMultinomialMetrics: generic", "AIC")) # Ordinal cols <- c("Origin", "Distance") diff --git a/h2o-r/tests/testdir_algos/generic/runit_generic_model_mojo_xgboost.R b/h2o-r/tests/testdir_algos/generic/runit_generic_model_mojo_xgboost.R index 761540cd8d92..62571ba7a0c5 100644 --- a/h2o-r/tests/testdir_algos/generic/runit_generic_model_mojo_xgboost.R +++ b/h2o-r/tests/testdir_algos/generic/runit_generic_model_mojo_xgboost.R @@ -16,8 +16,8 @@ test.model.generic.gbm <- function() { original_output <- capture.output(print(original_model)) generic_output <- capture.output(print(generic_model)) compare_output(original_output, generic_output, - c("Extract .+ frame","H2OBinomialModel: xgboost", "Model ID", "H2OBinomialMetrics: xgboost"), - c("H2OBinomialModel: generic", "Model ID", "H2OBinomialMetrics: generic")) + c("Extract .+ frame","H2OBinomialModel: xgboost", "Model ID", "H2OBinomialMetrics: xgboost", "AIC"), + c("H2OBinomialModel: generic", "Model ID", "H2OBinomialMetrics: generic", "AIC")) generic_model_preds <- h2o.predict(generic_model, data) expect_equal(length(generic_model_preds), 3) @@ -54,8 +54,8 @@ test.model.generic.gbm <- function() { original_output <- capture.output(print(original_model)) generic_output <- capture.output(print(generic_model)) compare_output(original_output, generic_output, - c("Extract .+ frame", "H2OMultinomialModel: xgboost", "Model ID", "H2OMultinomialMetrics: xgboost"), - c("H2OMultinomialModel: generic", "Model ID", "H2OMultinomialMetrics: generic")) + c("Extract .+ frame", "H2OMultinomialModel: xgboost", "Model ID", "H2OMultinomialMetrics: xgboost", "AIC"), + c("H2OMultinomialModel: generic", "Model ID", "H2OMultinomialMetrics: generic", "AIC")) }
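
For reference, a minimal standalone sketch (not part of the patch) of the AIC relation implemented by the new GenericModel.aic(double) override: AIC = -2 * logLikelihood + 2 * k, where k is the number of non-zero GLM coefficients and the log-likelihood passed in is obtained by negating the per-row negative log-likelihood accumulated in the metric builders. The class name and numeric values below are hypothetical.

// Editorial sketch under the assumptions above; not code from the patch.
import java.util.Arrays;

public class AicSketch {

    // Mirrors the patch's formula: count non-zero betas, then AIC = -2 * LL + 2 * k.
    static double aic(double logLikelihood, double[] beta) {
        long k = Arrays.stream(beta).filter(b -> b != 0).count();
        return -2 * logLikelihood + 2 * k;
    }

    public static void main(String[] args) {
        double[] beta = {0.3, 0.0, -1.2, 0.7};        // hypothetical coefficients; k = 3
        double negLogLikelihoodSum = 1234.5;          // per-row negative log-likelihoods summed by the metric builder
        double logLikelihood = -negLogLikelihoodSum;  // the builder negates the sum before computing AIC
        System.out.println(aic(logLikelihood, beta)); // prints 2475.0
    }
}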