diff --git a/h2o-algos/src/main/java/hex/glm/ConstrainedGLMUtils.java b/h2o-algos/src/main/java/hex/glm/ConstrainedGLMUtils.java
index 2b04def930f5..a1d150524c00 100644
--- a/h2o-algos/src/main/java/hex/glm/ConstrainedGLMUtils.java
+++ b/h2o-algos/src/main/java/hex/glm/ConstrainedGLMUtils.java
@@ -628,14 +628,18 @@ public static double calHBetaMagSquare(double[] beta, LinearConstraints[] constr
    *
    * If the stopping conditions are met, it will return true, else it will return false.
    */
-  public static boolean constraintsStop(GLM.GLMGradientInfo gradientInfo, ComputationState state,
-                                        LinearConstraints[] equalConst, LinearConstraints[] lessThanConst,
-                                        List coefNames) {
+  public static boolean constraintsStop(GLM.GLMGradientInfo gradientInfo, ComputationState state) {
     state._csGLMState._gradientMagSquare = innerProduct(gradientInfo._gradient, gradientInfo._gradient);
     if (state._csGLMState._constraintMagSquare <= ComputationState.EPS_CS && state._csGLMState._gradientMagSquare <= ComputationState.EPS_CS_SQUARE)
       return true;
     return false;
   }
+
+  public static boolean activeConstraints(LinearConstraints[] equalityC, LinearConstraints[] lessThanEqualToC) {
+    if (equalityC != null)
+      return true;
+    return Arrays.stream(lessThanEqualToC).filter(x -> x._active).count() > 0;
+  }
 
   /***
    * This method calls getGradient to calculate the gradient, likelhood and objective function values. In addition,
diff --git a/h2o-algos/src/main/java/hex/glm/GLM.java b/h2o-algos/src/main/java/hex/glm/GLM.java
index 4e6a9da78381..3860eb7f3369 100644
--- a/h2o-algos/src/main/java/hex/glm/GLM.java
+++ b/h2o-algos/src/main/java/hex/glm/GLM.java
@@ -2344,8 +2344,6 @@ private void fitIRLSMCS(Solver s) {
       GLMGradientInfo gradientInfo = calGradient(betaCnd, _state, ginfo, lambdaEqual, lambdaLessThan,
               equalityConstraints, lessThanEqualToConstraints);
       boolean predictorSizeChange;
-      boolean done;
-      boolean kktAchieved;
       boolean applyBetaConstraints = _parms._separate_linear_beta && _betaConstraintsOn;
       try {
         while (true) {
@@ -2392,22 +2390,33 @@ private void fitIRLSMCS(Solver s) {
           updateConstraintParameters(_state, lambdaEqual, lambdaLessThan, equalityConstraints, lessThanEqualToConstraints);
           // check for stopping conditions
-          done = !progress(betaCnd, gradientInfo);
-          kktAchieved = constraintsStop(gradientInfo, _state, equalityConstraints, lessThanEqualToConstraints,
-                  coeffNames);
-          if (kktAchieved || done) {
-            _model._betaCndCheckpoint = betaCnd;
-            if (kktAchieved)
-              Log.info("KKT Conditions achieved after" + iterCnt +" iterations ");
-            if (!kktAchieved && done)
-              Log.info("KKT Conditions not achieved but no further progress made of time out after "+iterCnt+" iterations");
+          if (checkIterationDone(betaCnd, gradientInfo, equalityConstraints, lessThanEqualToConstraints, iterCnt))
             return;
-          }
         }
+      }
     } catch (NonSPDMatrixException e) {
       Log.warn(LogMsg("Got Non SPD matrix, stopped."));
     }
   }
 
+  public boolean checkIterationDone(double[] betaCnd, GLMGradientInfo gradientInfo, LinearConstraints[] equalityConstraints,
+                                    LinearConstraints[] lessThanEqualToConstraints, int iterCnt) {
+    // check for stopping conditions
+    boolean done = !progress(betaCnd, gradientInfo); // no significant change in coefficients, or time-out, or max_iterations reached
+    boolean activeConstraintsPresent = activeConstraints(equalityConstraints, lessThanEqualToConstraints);
+    boolean kktAchieved = activeConstraintsPresent ? constraintsStop(gradientInfo, _state) : false;
+    boolean kktDoneWithActiveC = activeConstraintsPresent && kktAchieved;
+    if (kktDoneWithActiveC || done) {
+      _model._betaCndCheckpoint = betaCnd;
+      if (kktDoneWithActiveC)
+        Log.info("KKT Conditions achieved after " + iterCnt + " iterations");
+      if (!kktAchieved && done && activeConstraintsPresent)
+        Log.info("KKT Conditions not achieved but no further progress made due to time out or no changes" +
+                " to coefficients after " + iterCnt + " iterations");
+      return true;
+    }
+    return false;
+  }
+
   public List changeCoeffBetainfo(String[] coefNames) {
     _betaInfo = new BetaInfo(fractionalbinomial.equals(_parms._family) ? 2 : (multinomial.equals(_parms._family) ||
             ordinal.equals(_parms._family)) ? nclasses() : 1, coefNames.length);
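The equalityConstraints and lessThanEqualToConstraints arrays consumed above are built from the linear_constraints frame that the Python API passes to GLM; the tests below construct that frame row by row. A minimal sketch of the frame layout, assuming (as the tests suggest, but the PR does not spell out) that rows sharing a constraint_numbers value are combined into one constraint of the form sum(value * coefficient) + constant <= 0:

    import h2o
    # assumes an H2O cluster is already up (h2o.init())
    # one "LessThanEqual" constraint over two coefficients, assumed to mean:
    # 0.5*beta_C19 - 0.8*beta_C20 - 10 <= 0
    rows = [["C19", 0.5, "LessThanEqual", 0],
            ["C20", -0.8, "LessThanEqual", 0],
            ["constant", -10, "LessThanEqual", 0]]
    constraints = h2o.H2OFrame(rows)
    constraints.set_names(["names", "values", "types", "constraint_numbers"])
    # the frame can now be supplied via linear_constraints= when constructing the estimator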
diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_constraints_binomial.py b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_constraints_binomial.py
new file mode 100644
index 000000000000..eb728b23cc22
--- /dev/null
+++ b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_constraints_binomial.py
@@ -0,0 +1,87 @@
+import h2o
+from h2o.estimators.glm import H2OGeneralizedLinearEstimator
+from tests import pyunit_utils
+
+# In this test, we set up inactive constraints; the coefficients with and without the constraints should be similar.
+def test_constraints_binomial():
+    train = h2o.import_file(path=pyunit_utils.locate("smalldata/glm_test/binomial_20_cols_10KRows.csv"))
+    for ind in range(10):
+        train[ind] = train[ind].asfactor()
+    train["C21"] = train["C21"].asfactor()
+    response = "C21"
+    predictors = list(range(0, 20))
+
+    loose_init_const = []  # these constraints are satisfied by the default coefficient initialization
+
+    h2o_glm = H2OGeneralizedLinearEstimator(family="binomial", compute_p_values=True, remove_collinear_columns=True,
+                                            lambda_=0.0, solver="irlsm", seed=12345, standardize=True)
+    h2o_glm.train(x=predictors, y=response, training_frame=train)
+    print(h2o_glm.coef())
+
+    name = "C19"
+    values = 0.5
+    types = "LessThanEqual"
+    contraint_numbers = 0
+    loose_init_const.append([name, values, types, contraint_numbers])
+
+    name = "C20"
+    values = -0.8
+    types = "LessThanEqual"
+    contraint_numbers = 0
+    loose_init_const.append([name, values, types, contraint_numbers])
+
+    name = "constant"
+    values = -10  # 490
+    types = "LessThanEqual"
+    contraint_numbers = 0
+    loose_init_const.append([name, values, types, contraint_numbers])
+
+    name = "C12"
+    values = 2
+    types = "LessThanEqual"
+    contraint_numbers = 1
+    loose_init_const.append([name, values, types, contraint_numbers])
+
+    name = "C13"
+    values = -3
+    types = "LessThanEqual"
+    contraint_numbers = 1
+    loose_init_const.append([name, values, types, contraint_numbers])
+
+    name = "constant"
+    values = -25  # 80
+    types = "LessThanEqual"
+    contraint_numbers = 1
+    loose_init_const.append([name, values, types, contraint_numbers])
+
+    linear_constraints2 = h2o.H2OFrame(loose_init_const)
+    linear_constraints2.set_names(["names", "values", "types", "constraint_numbers"])
+    # GLM model with default coef init values
+    h2o_glm_no_init = H2OGeneralizedLinearEstimator(family="binomial", compute_p_values=True, remove_collinear_columns=True,
+                                                    lambda_=0.0, solver="irlsm", linear_constraints=linear_constraints2, seed=12345)
+    h2o_glm_no_init.train(x=predictors, y=response, training_frame=train)
+    # GLM model with GLM coefficients built without constraints
+    startCoef = [-11.777796439159427, 11.145918459656793, -2.240633964081059, -9.670607471364166, -0.04431844452487864,
+                 1.2000980684830767, -12.867841086167251, 4.8420193776134255, -10.14685080419689, 10.702942010340731,
+                 0.07976027968540254, 0.44773830396822184, 0.09737389144010582, 14.973662194333558, 15.70206320835652,
+                 0.1556959973853935, -6.6746924507580205, 6.942172294878739, 3.7876501542686576, 2.9159771977964435,
+                 14.612327184401229, 7.192082757108297, 1.119764323405339, 7.153590379475222, -6.823123247481835,
+                 -9.77308069726764, -7.931619499230349, 6.91629118878521, 5.867602968167905, 23.314287012503986,
+                 5.617830967874223, 0.8813224259105378, 20.820182196763465, 4.232709126684858, -3.76117280137443,
+                 -10.4467656317731, -8.919929474793474, -12.750260480508844, -21.187033417068225, 0.9223288064815389,
+                 -16.59577334951634, -2.2764714329170515, -17.577171729916557, -18.51039273019755, -6.294489874768877,
+                 -16.24717884256297, 1.8769336927775133, -3.3038587822340184, -16.90849120430868, 4.833742270371561,
+                 1.5702458956118868, -12.40449842919434, -6.102490765991213, -11.496005185120135, -7.071687013766151,
+                 -3.850864374953074, -8.957320649739758, -7.468575396622829, 0.5634858217928193, -6.818408867710658,
+                 -1.0014411907757605, -4.983361240945651, -9.280641693445801, 9.266595896543917, 7.3179414322586345,
+                 -5.057799375650119, -11.176969277606156, 35.87638467667283]
+    h2o_glm_init = H2OGeneralizedLinearEstimator(family="binomial", compute_p_values=True, remove_collinear_columns=True,
+                                                 lambda_=0.0, solver="irlsm", linear_constraints=linear_constraints2,
+                                                 startval=startCoef, seed=12345)
+    h2o_glm_init.train(x=predictors, y=response, training_frame=train)
+    print("Done")
+
+if __name__ == "__main__":
+    pyunit_utils.standalone_test(test_constraints_binomial)
+else:
+    test_constraints_binomial()
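The comment at the top of this new test says the coefficients with and without the inactive constraints should be similar, but the file only prints "Done" after training. A sketch of how that claim could be checked (not part of the PR; the 1e-3 tolerance is an assumption, not a value taken from the test):

    # compare the unconstrained fit with the constrained fit (the constraints are inactive by design)
    unconstrained = h2o_glm.coef()          # model trained without linear_constraints
    constrained = h2o_glm_no_init.coef()    # model trained with the loose constraints
    max_diff = max(abs(unconstrained[k] - constrained[k]) for k in unconstrained)
    print("largest coefficient difference:", max_diff)
    assert max_diff < 1e-3, "constrained and unconstrained coefficients diverge"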
diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_constraints_on_collinear_cols1.py b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_constraints_on_collinear_cols1.py
index fbff1b1f0540..39c5d10eb317 100644
--- a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_constraints_on_collinear_cols1.py
+++ b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_constraints_on_collinear_cols1.py
@@ -3,7 +3,8 @@ from tests import pyunit_utils
 
 # The purpose of this test to make sure that constrainted GLM works with collinear column removal. In this case,
-# the collinear columns are added to the front of the frame.
+# the collinear columns are added to the front of the frame. There are two collinear columns and they should be
+# removed.
 def test_constraints_collinear_columns():
     # first two columns are enums, the last 4 are real columns
     h2o_data = pyunit_utils.genTrainFrame(10000, 6, enumCols=2, enumFactors=2, responseLevel=2, miscfrac=0,
                                           randseed=12345)
@@ -57,14 +58,17 @@ def test_constraints_collinear_columns():
 
     linear_constraints2 = h2o.H2OFrame(lc2)
     linear_constraints2.set_names(["names", "values", "types", "constraint_numbers"])
-    xNoCons = ["C1", "C2", "corr1", "corr2", "C20", "C4", "C5", "C6"]
-    # h2o_glm1 = H2OGeneralizedLinearEstimator(family="binomial", compute_p_values=True, remove_collinear_columns=True,
-    #                                          lambda_=0.0, solver="irlsm")
-    # h2o_glm1.train(x=x, y=y, training_frame=train_data)
-    # GLM with constraints
+
     h2o_glm = H2OGeneralizedLinearEstimator(family="binomial", compute_p_values=True, remove_collinear_columns=True,
-                                            lambda_=0.0, solver="irlsm", linear_constraints=linear_constraints2)
+                                            lambda_=0.0, solver="irlsm", linear_constraints=linear_constraints2,
+                                            seed=1234)
     h2o_glm.train(x=x, y=y, training_frame=train_data )
+    # there should be two zero coefficients
+    coefs = h2o_glm.coef().values()
+    numZero = [x for x in coefs if x == 0]
+    assert len(numZero) == 2, "Number of zero coefficients should be 2 but is not."
+
+
 
 if __name__ == "__main__":
     pyunit_utils.standalone_test(test_constraints_collinear_columns)
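The new assertion only counts how many coefficients were zeroed out; to see which predictors the collinear-column removal actually dropped, a sketch like the following could be added (corr1/corr2 are the collinear columns this test creates, but the test does not assert which names end up zeroed):

    # list the coefficient names zeroed by remove_collinear_columns
    zeroed = [name for name, value in h2o_glm.coef().items() if value == 0]
    print("zeroed coefficients:", zeroed)  # expected to correspond to the two collinear columns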
diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_constraints_on_collinear_cols2.py b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_constraints_on_collinear_cols2.py
index aed8b123a1ff..e8356ba3f0fe 100644
--- a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_constraints_on_collinear_cols2.py
+++ b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_constraints_on_collinear_cols2.py
@@ -37,38 +37,17 @@ def test_constraints_collinear_columns():
     contraint_numbers = 0
     lc2.append([name, values, types, contraint_numbers])
 
-    name = "corr1"
-    values = 1
-    types = "LessThanEqual"
-    contraint_numbers = 1
-    lc2.append([name, values, types, contraint_numbers])
-
-    name = "C6"
-    values = 1
-    types = "LessThanEqual"
-    contraint_numbers = 1
-    lc2.append([name, values, types, contraint_numbers])
-
-    name = "constant"
-    values = -2
-    types = "LessThanEqual"
-    contraint_numbers = 1
-    lc2.append([name, values, types, contraint_numbers])
-
     linear_constraints2 = h2o.H2OFrame(lc2)
     linear_constraints2.set_names(["names", "values", "types", "constraint_numbers"])
-    # build glm model with constraints on redundant columns. Expected an error
-    try:
-        h2o_glm = H2OGeneralizedLinearEstimator(family="binomial", compute_p_values=True, remove_collinear_columns=True,
-                                                lambda_=0.0, solver="irlsm", linear_constraints=linear_constraints2)
-        h2o_glm.train(x=x, y=y, training_frame=train_data )
-        assert False, "Should have thrown an exception!"
-    except Exception as ex:
-        print(ex)
-        temp = str(ex)
-        assert "included on collinear columns that are going to be removed. Please remove any constraints" in temp, \
-            "Wrong exception was received."
-        print("constraint test passed!")
+
+    h2o_glm = H2OGeneralizedLinearEstimator(family="binomial", compute_p_values=True, remove_collinear_columns=True,
+                                            lambda_=0.0, solver="irlsm", linear_constraints=linear_constraints2,
+                                            seed=1234)
+    h2o_glm.train(x=x, y=y, training_frame=train_data )
+    # there should be two zero coefficients
+    coefs = h2o_glm.coef().values()
+    numZero = [x for x in coefs if x == 0]
+    assert len(numZero) == 2, "Number of zero coefficients should be 2 but is not."
 
 
 if __name__ == "__main__":
diff --git a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_constraints_on_collinear_cols3.py b/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_constraints_on_collinear_cols3.py
deleted file mode 100644
index ec1d758b31fb..000000000000
--- a/h2o-py/tests/testdir_algos/glm/pyunit_GH_6722_constraints_on_collinear_cols3.py
+++ /dev/null
@@ -1,82 +0,0 @@
-import h2o
-from h2o.estimators.glm import H2OGeneralizedLinearEstimator
-from tests import pyunit_utils
-
-# The purpose of this test to make sure that constrainted GLM works with collinear column removal. In this case,
-# the collinear columns are added to the front of the frame.
-def test_constraints_collinear_columns():
-    # first two columns are enums, the last 4 are real columns
-    h2o_data = h2o.import_file(pyunit_utils.locate("smalldata/glm_test/gaussian_4col_10KRows_train.csv"))
-    # create extra collinear columns
-    num1 = h2o_data[0]*0.2-0.5*h2o_data[2]
-    num2 = -0.8*h2o_data[1]+0.1*h2o_data[3]
-    h2o_collinear = num1.cbind(num2)
-    h2o_collinear.set_names(["corr1", "corr2"])
-    train_data = h2o_collinear.cbind(h2o_data)
-
-    y = "C5"
-    x = train_data.names
-    x.remove(y)
-    lc2 = []
-
-    h2o_glm = H2OGeneralizedLinearEstimator(family="Gaussian", compute_p_values=True, remove_collinear_columns=True,
-                                            lambda_=0.0, solver="irlsm")
-    h2o_glm.train(x=x, y=y, training_frame=train_data )
-    print(h2o_glm.coef())
-
-    name = "corr1"
-    values = 1
-    types = "LessThanEqual"
-    contraint_numbers = 0
-    lc2.append([name, values, types, contraint_numbers])
-
-    name = "corr2"
-    values = 1
-    types = "LessThanEqual"
-    contraint_numbers = 0
-    lc2.append([name, values, types, contraint_numbers])
-
-    name = "constant"
-    values = 1 # 490
-    types = "LessThanEqual"
-    contraint_numbers = 0
-    lc2.append([name, values, types, contraint_numbers])
-
-    name = "C1"
-    values = 1
-    types = "LessThanEqual"
-    contraint_numbers = 1
-    lc2.append([name, values, types, contraint_numbers])
-
-    name = "C2"
-    values = 0.5
-    types = "LessThanEqual"
-    contraint_numbers = 1
-    lc2.append([name, values, types, contraint_numbers])
-
-    name = "constant"
-    values = 1 # 80
-    types = "LessThanEqual"
-    contraint_numbers = 1
-    lc2.append([name, values, types, contraint_numbers])
-
-    linear_constraints2 = h2o.H2OFrame(lc2)
-    linear_constraints2.set_names(["names", "values", "types", "constraint_numbers"])
-    # build glm model with constraints on redundant columns. Expected an error
-    try:
-        h2o_glm = H2OGeneralizedLinearEstimator(family="gaussian", compute_p_values=True, remove_collinear_columns=True,
-                                                lambda_=0.0, solver="irlsm", linear_constraints=linear_constraints2)
-        h2o_glm.train(x=x, y=y, training_frame=train_data )
-        assert False, "Should have thrown an exception!"
-    except Exception as ex:
-        print(ex)
-        temp = str(ex)
-        assert "included on collinear columns that are going to be removed. Please remove any constraints" in temp, \
-            "Wrong exception was received."
-        print("constraint test passed!")
-
-
-if __name__ == "__main__":
-    pyunit_utils.standalone_test(test_constraints_collinear_columns)
-else:
-    test_constraints_collinear_columns()