completed python test on collinear columns.
wendycwong committed Jan 29, 2024
1 parent 1cbee81 commit bf9c7e9
Showing 6 changed files with 135 additions and 134 deletions.
10 changes: 7 additions & 3 deletions h2o-algos/src/main/java/hex/glm/ConstrainedGLMUtils.java
@@ -628,14 +628,18 @@ public static double calHBetaMagSquare(double[] beta, LinearConstraints[] constr
*
* If the stopping conditions are met, it will return true, else it will return false.
*/
public static boolean constraintsStop(GLM.GLMGradientInfo gradientInfo, ComputationState state,
LinearConstraints[] equalConst, LinearConstraints[] lessThanConst,
List<String> coefNames) {
public static boolean constraintsStop(GLM.GLMGradientInfo gradientInfo, ComputationState state) {
state._csGLMState._gradientMagSquare = innerProduct(gradientInfo._gradient, gradientInfo._gradient);
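// Descriptive note (added): stop only when both the constraint magnitude and the squared gradient magnitude fall below their epsilon thresholds.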
if (state._csGLMState._constraintMagSquare <= ComputationState.EPS_CS && state._csGLMState._gradientMagSquare <= ComputationState.EPS_CS_SQUARE)
return true;
return false;
}

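/***
 * Descriptive comment (added): returns true if there are active constraints, i.e. any equality
 * constraints are present, or at least one less-than-or-equal-to constraint is currently marked active.
 */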
public static boolean activeConstraints(LinearConstraints[] equalityC, LinearConstraints[] lessThanEqualToC) {
if (equalityC != null)
return true;
return Arrays.stream(lessThanEqualToC).filter(x -> x._active).count() > 0;
}

/***
* This method calls getGradient to calculate the gradient, likelihood and objective function values. In addition,
33 changes: 21 additions & 12 deletions h2o-algos/src/main/java/hex/glm/GLM.java
@@ -2344,8 +2344,6 @@ private void fitIRLSMCS(Solver s) {
GLMGradientInfo gradientInfo = calGradient(betaCnd, _state, ginfo, lambdaEqual, lambdaLessThan,
equalityConstraints, lessThanEqualToConstraints);
boolean predictorSizeChange;
boolean done;
boolean kktAchieved;
boolean applyBetaConstraints = _parms._separate_linear_beta && _betaConstraintsOn;
try {
while (true) {
@@ -2392,22 +2390,33 @@ private void fitIRLSMCS(Solver s) {
updateConstraintParameters(_state, lambdaEqual, lambdaLessThan, equalityConstraints, lessThanEqualToConstraints);

// check for stopping conditions
done = !progress(betaCnd, gradientInfo);
kktAchieved = constraintsStop(gradientInfo, _state, equalityConstraints, lessThanEqualToConstraints,
coeffNames);
if (kktAchieved || done) {
_model._betaCndCheckpoint = betaCnd;
if (kktAchieved)
Log.info("KKT Conditions achieved after" + iterCnt +" iterations ");
if (!kktAchieved && done)
Log.info("KKT Conditions not achieved but no further progress made of time out after "+iterCnt+" iterations");
if (checkIterationDone(betaCnd, gradientInfo, equalityConstraints, lessThanEqualToConstraints, iterCnt))
return;
} }
}
} catch (NonSPDMatrixException e) {
Log.warn(LogMsg("Got Non SPD matrix, stopped."));
}
}

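/***
 * Descriptive comment (added): checks the stopping conditions after one constrained-GLM iteration.
 * Fitting stops when no further progress can be made (coefficients unchanged, time-out, or
 * max_iterations reached), or when the KKT conditions are satisfied while active constraints are
 * present. On stopping, the current coefficients are saved to _model._betaCndCheckpoint, the
 * outcome is logged, and true is returned.
 */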
public boolean checkIterationDone(double[] betaCnd, GLMGradientInfo gradientInfo, LinearConstraints[] equalityConstraints,
LinearConstraints[] lessThanEqualToConstraints, int iterCnt) {
// check for stopping conditions
boolean done = !progress(betaCnd, gradientInfo); // no significant change in coeff, or time-out or max_iteration reached
boolean activeConstraintsPresent = activeConstraints(equalityConstraints, lessThanEqualToConstraints);
boolean kktAchieved = activeConstraintsPresent ? constraintsStop(gradientInfo, _state) : false;
boolean kktDoneWithActiveC = activeConstraintsPresent && kktAchieved;
if (kktDoneWithActiveC || done) {
_model._betaCndCheckpoint = betaCnd;
if (kktDoneWithActiveC)
Log.info("KKT Conditions achieved after" + iterCnt +" iterations ");
if (!kktAchieved && done && activeConstraintsPresent)
Log.info("KKT Conditions not achieved but no further progress made due to time out or no changes" +
" to coefficients after "+iterCnt+" iterations");
return true;
}
return false;
}

public List<String> changeCoeffBetainfo(String[] coefNames) {
_betaInfo = new BetaInfo(fractionalbinomial.equals(_parms._family) ? 2 :
(multinomial.equals(_parms._family) || ordinal.equals(_parms._family)) ? nclasses() : 1, coefNames.length);
@@ -0,0 +1,87 @@
import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from tests import pyunit_utils

# In this test, I set up inactive constraints; the coefficients with and without the constraints should be similar.
def test_constraints_binomial():
train = h2o.import_file(path=pyunit_utils.locate("smalldata/glm_test/binomial_20_cols_10KRows.csv"))
for ind in range(10):
train[ind] = train[ind].asfactor()
train["C21"] = train["C21"].asfactor()
response = "C21"
predictors = list(range(0,20))

loose_init_const = [] # these constraints are satisfied by the default coefficient initialization

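# Reference GLM without constraints; its coefficients are printed so they can be compared against the constrained runs below.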
h2o_glm = H2OGeneralizedLinearEstimator(family="binomial", compute_p_values=True, remove_collinear_columns=True,
lambda_=0.0, solver="irlsm", seed=12345, standardize=True)
h2o_glm.train(x=predictors, y=response, training_frame=train)
print(h2o_glm.coef())

name = "C19"
values = 0.5
types = "LessThanEqual"
contraint_numbers = 0
loose_init_const.append([name, values, types, contraint_numbers])

name = "C20"
values = -0.8
types = "LessThanEqual"
contraint_numbers = 0
loose_init_const.append([name, values, types, contraint_numbers])

name = "constant"
values = -10 # 490
types = "LessThanEqual"
contraint_numbers = 0
loose_init_const.append([name, values, types, contraint_numbers])

name = "C12"
values = 2
types = "LessThanEqual"
contraint_numbers = 1
loose_init_const.append([name, values, types, contraint_numbers])

name = "C13"
values = -3
types = "LessThanEqual"
contraint_numbers = 1
loose_init_const.append([name, values, types, contraint_numbers])

name = "constant"
values = -25 # 80
types = "LessThanEqual"
contraint_numbers = 1
loose_init_const.append([name, values, types, contraint_numbers])

linear_constraints2 = h2o.H2OFrame(loose_init_const)
linear_constraints2.set_names(["names", "values", "types", "constraint_numbers"])
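# Note (added, assuming H2O's linear-constraints convention: each row contributes value * coefficient,
# "constant" is a standalone term, and a LessThanEqual constraint's rows sum to <= 0), the two
# constraints above read roughly: 0.5*C19 - 0.8*C20 - 10 <= 0 and 2*C12 - 3*C13 - 25 <= 0.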
# GLM model with default coef init values
h2o_glm_no_init = H2OGeneralizedLinearEstimator(family="binomial", compute_p_values=True, remove_collinear_columns=True,
lambda_=0.0, solver="irlsm", linear_constraints=linear_constraints2, seed=12345)
h2o_glm_no_init.train(x=predictors, y=response, training_frame=train)
# GLM model with GLM coefficients built without constraints
startCoef = [-11.777796439159427, 11.145918459656793, -2.240633964081059, -9.670607471364166, -0.04431844452487864,
1.2000980684830767, -12.867841086167251, 4.8420193776134255, -10.14685080419689, 10.702942010340731,
0.07976027968540254, 0.44773830396822184, 0.09737389144010582, 14.973662194333558, 15.70206320835652,
0.1556959973853935, -6.6746924507580205, 6.942172294878739, 3.7876501542686576, 2.9159771977964435,
14.612327184401229, 7.192082757108297, 1.119764323405339, 7.153590379475222, -6.823123247481835,
-9.77308069726764, -7.931619499230349, 6.91629118878521, 5.867602968167905, 23.314287012503986,
5.617830967874223, 0.8813224259105378, 20.820182196763465, 4.232709126684858, -3.76117280137443,
-10.4467656317731, -8.919929474793474, -12.750260480508844, -21.187033417068225, 0.9223288064815389,
-16.59577334951634, -2.2764714329170515, -17.577171729916557, -18.51039273019755, -6.294489874768877,
-16.24717884256297, 1.8769336927775133, -3.3038587822340184, -16.90849120430868, 4.833742270371561,
1.5702458956118868, -12.40449842919434, -6.102490765991213, -11.496005185120135, -7.071687013766151,
-3.850864374953074, -8.957320649739758, -7.468575396622829, 0.5634858217928193, -6.818408867710658,
-1.0014411907757605, -4.983361240945651, -9.280641693445801, 9.266595896543917, 7.3179414322586345,
-5.057799375650119, -11.176969277606156, 35.87638467667283]
h2o_glm_init = H2OGeneralizedLinearEstimator(family="binomial", compute_p_values=True, remove_collinear_columns=True,
lambda_=0.0, solver="irlsm", linear_constraints=linear_constraints2,
startval=startCoef, seed=12345)
h2o_glm_init.train(x=predictors, y=response, training_frame=train)
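# Sketch of the intended check (not part of the original test; the tolerance is an assumption):
# since the constraints above are inactive, the constrained coefficients should stay close to the
# unconstrained ones, e.g.
# unconstrained = h2o_glm.coef()
# constrained = h2o_glm_no_init.coef()
# for cname in unconstrained.keys():
#     assert abs(unconstrained[cname] - constrained[cname]) < 1e-1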
print("Done")

if __name__ == "__main__":
pyunit_utils.standalone_test(test_constraints_binomial)
else:
test_constraints_binomial()
@@ -3,7 +3,8 @@
from tests import pyunit_utils

# The purpose of this test is to make sure that constrained GLM works with collinear column removal. In this case,
# the collinear columns are added to the front of the frame.
# the collinear columns are added to the front of the frame. There are two collinear columns and they should be
# removed.
def test_constraints_collinear_columns():
# first two columns are enums, the last 4 are real columns
h2o_data = pyunit_utils.genTrainFrame(10000, 6, enumCols=2, enumFactors=2, responseLevel=2, miscfrac=0, randseed=12345)
@@ -57,14 +58,17 @@ def test_constraints_collinear_columns():

linear_constraints2 = h2o.H2OFrame(lc2)
linear_constraints2.set_names(["names", "values", "types", "constraint_numbers"])
xNoCons = ["C1", "C2", "corr1", "corr2", "C20", "C4", "C5", "C6"]
# h2o_glm1 = H2OGeneralizedLinearEstimator(family="binomial", compute_p_values=True, remove_collinear_columns=True,
# lambda_=0.0, solver="irlsm")
# h2o_glm1.train(x=x, y=y, training_frame=train_data)
# GLM with constraints

h2o_glm = H2OGeneralizedLinearEstimator(family="binomial", compute_p_values=True, remove_collinear_columns=True,
lambda_=0.0, solver="irlsm", linear_constraints=linear_constraints2)
lambda_=0.0, solver="irlsm", linear_constraints=linear_constraints2,
seed = 1234)
h2o_glm.train(x=x, y=y, training_frame=train_data )
# there should be two zero coefficients, one for each removed collinear column
coefs = h2o_glm.coef().values()
numZero = [x for x in coefs if x == 0]
assert len(numZero) == 2, "Number of zero coefficients should be 2 but is not."



if __name__ == "__main__":
pyunit_utils.standalone_test(test_constraints_collinear_columns)
@@ -37,38 +37,17 @@ def test_constraints_collinear_columns():
contraint_numbers = 0
lc2.append([name, values, types, contraint_numbers])

name = "corr1"
values = 1
types = "LessThanEqual"
contraint_numbers = 1
lc2.append([name, values, types, contraint_numbers])

name = "C6"
values = 1
types = "LessThanEqual"
contraint_numbers = 1
lc2.append([name, values, types, contraint_numbers])

name = "constant"
values = -2
types = "LessThanEqual"
contraint_numbers = 1
lc2.append([name, values, types, contraint_numbers])

linear_constraints2 = h2o.H2OFrame(lc2)
linear_constraints2.set_names(["names", "values", "types", "constraint_numbers"])
# build glm model with constraints on redundant columns. Expected an error
try:
h2o_glm = H2OGeneralizedLinearEstimator(family="binomial", compute_p_values=True, remove_collinear_columns=True,
lambda_=0.0, solver="irlsm", linear_constraints=linear_constraints2)
h2o_glm.train(x=x, y=y, training_frame=train_data )
assert False, "Should have thrown an exception!"
except Exception as ex:
print(ex)
temp = str(ex)
assert "included on collinear columns that are going to be removed. Please remove any constraints" in temp, \
"Wrong exception was received."
print("constraint test passed!")

h2o_glm = H2OGeneralizedLinearEstimator(family="binomial", compute_p_values=True, remove_collinear_columns=True,
lambda_=0.0, solver="irlsm", linear_constraints=linear_constraints2,
seed = 1234)
h2o_glm.train(x=x, y=y, training_frame=train_data )
# there should be two zero coefficients, one for each removed collinear column
coefs = h2o_glm.coef().values()
numZero = [x for x in coefs if x == 0]
assert len(numZero) == 2, "Number of zero coefficients should be 2 but is not."


if __name__ == "__main__":

This file was deleted.
