Skip to content

Commit

Permalink
Improve Uplift tests (#16032)
Browse files Browse the repository at this point in the history
  • Loading branch information
maurever authored Jan 31, 2024
1 parent e69e2ce commit aedac69
Show file tree
Hide file tree
Showing 6 changed files with 77 additions and 96 deletions.
13 changes: 8 additions & 5 deletions h2o-algos/src/test/java/hex/tree/uplift/UpliftDRFTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ public void testBasicTrain() {
p._treatment_column = "treatment";
p._response_column = "conversion";
p._seed = 0xDECAF;
p._ntrees = 3;

UpliftDRF udrf = new UpliftDRF(p);
UpliftDRFModel model = udrf.trainModel().get();
Expand All @@ -62,6 +63,7 @@ public void testBasicTrainAndScore() {
p._response_column = "conversion";
p._seed = 0xDECAF;
p._nbins = 10;
p._ntrees = 3;

UpliftDRF udrf = new UpliftDRF(p);
UpliftDRFModel model = udrf.trainModel().get();
Expand Down Expand Up @@ -227,7 +229,7 @@ public void testBasicTrainSupportEarlyStoppingAUUC() {
try {
Scope.enter();
Frame train = generateFrame();
int ntrees = 100;
int ntrees = 42;
UpliftDRFModel.UpliftDRFParameters p = new UpliftDRFModel.UpliftDRFParameters();
p._train = train._key;
p._treatment_column = "treatment";
Expand All @@ -252,7 +254,7 @@ public void testBasicTrainSupportEarlyStoppingATE() {
try {
Scope.enter();
Frame train = generateFrame();
int ntrees = 100;
int ntrees = 42;
UpliftDRFModel.UpliftDRFParameters p = new UpliftDRFModel.UpliftDRFParameters();
p._train = train._key;
p._treatment_column = "treatment";
Expand All @@ -277,7 +279,7 @@ public void testBasicTrainSupportEarlyStoppingATT() {
try {
Scope.enter();
Frame train = generateFrame();
int ntrees = 100;
int ntrees = 42;
UpliftDRFModel.UpliftDRFParameters p = new UpliftDRFModel.UpliftDRFParameters();
p._train = train._key;
p._treatment_column = "treatment";
Expand All @@ -302,7 +304,7 @@ public void testBasicTrainSupportEarlyStoppingATC() {
try {
Scope.enter();
Frame train = generateFrame();
int ntrees = 100;
int ntrees = 42;
UpliftDRFModel.UpliftDRFParameters p = new UpliftDRFModel.UpliftDRFParameters();
p._train = train._key;
p._treatment_column = "treatment";
Expand All @@ -327,7 +329,7 @@ public void testBasicTrainSupportEarlyStoppingQini() {
try {
Scope.enter();
Frame train = generateFrame();
int ntrees = 100;
int ntrees = 42;
UpliftDRFModel.UpliftDRFParameters p = new UpliftDRFModel.UpliftDRFParameters();
p._train = train._key;
p._treatment_column = "treatment";
Expand Down Expand Up @@ -363,6 +365,7 @@ public void testMaxDepthZero() {
p._ntrees = 100;
p._max_depth = 0;
p._score_each_iteration = true;
p._ntrees = 3;

UpliftDRF udrf = new UpliftDRF(p);
UpliftDRFModel model = udrf.trainModel().get();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ def uplift_random_forest_early_stopping():
treatment_column=treatment_column,
min_rows=min_rows,
seed=seed,
sample_rate=sample_rate
sample_rate=sample_rate,
score_each_iteration=True
)

uplift_model.train(y=response_column, x=x_names, training_frame=train_h2o, validation_frame=valid_h2o)
Expand All @@ -47,7 +48,8 @@ def uplift_random_forest_early_stopping():
seed=seed,
sample_rate=sample_rate,
stopping_metric="AUUC",
stopping_rounds=2
stopping_rounds=2,
score_each_iteration=True
)

uplift_model_es.train(y=response_column, x=x_names, training_frame=train_h2o, validation_frame=valid_h2o)
Expand Down
35 changes: 13 additions & 22 deletions h2o-r/tests/testdir_algos/uplift/runit_uplift_rf_early_stopping.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
setwd(normalizePath(dirname(R.utils::commandArgs(asValues = TRUE)$"f")))
source("../../../scripts/h2o-r-test-setup.R")
library(uplift)


test.uplift <- function() {
Expand All @@ -10,45 +9,36 @@ test.uplift <- function() {
sample_rate <- 0.8
seed <- 42
set.seed(seed)
x <- c("X1", "X2", "X3", "X4", "X5", "X6")
y <- "y"
treatment_col <- "treat"
x <- c("feature_1", "feature_2", "feature_3", "feature_4", "feature_5", "feature_6")
y <- "outcome"
treatment_col <- "treatment"

# Test data preparation for each implementation
train <- sim_pte(n = 2000, p = 6, rho = 0, sigma = sqrt(2), beta.den = 4)
train$treat <- ifelse(train$treat == 1, 1, 0)
test <- sim_pte(n = 1000, p = 6, rho = 0, sigma = sqrt(2), beta.den = 4)
test$treat <- ifelse(test$treat == 1, 1, 0)

trainh2o <- train
trainh2o$treat <- as.factor(train$treat)
trainh2o$y <- as.factor(train$y)
trainh2o <- as.h2o(trainh2o)

testh2o <- test
testh2o$treat <- as.factor(test$treat)
testh2o$y <- as.factor(test$y)
testh2o <- as.h2o(testh2o)
train <- h2o.importFile(path=locate("smalldata/uplift/upliftml_train.csv"),
col.types=list(by.col.name=c(treatment_col, y), types=c("factor", "factor")))
test <- h2o.importFile(path=locate("smalldata/uplift/upliftml_test.csv"),
col.types=list(by.col.name=c(treatment_col, y), types=c("factor", "factor")))

model <- h2o.upliftRandomForest(
x = x,
y = y,
training_frame = trainh2o,
validation_frame = testh2o,
training_frame = train,
validation_frame = test,
treatment_column = treatment_col,
ntrees = ntrees,
max_depth = max_depth,
min_rows = min_rows,
sample_rate = sample_rate,
score_each_iteration=TRUE,
seed = seed)

print(model)

model_es <- h2o.upliftRandomForest(
x = x,
y = y,
training_frame = trainh2o,
validation_frame = testh2o,
training_frame = train,
validation_frame = test,
treatment_column = treatment_col,
ntrees = ntrees,
max_depth = max_depth,
Expand All @@ -57,6 +47,7 @@ test.uplift <- function() {
seed = seed,
stopping_rounds=2,
stopping_metric="AUUC",
score_each_iteration=TRUE,
stopping_tolerance=0.1)

print(model_es)
Expand Down
17 changes: 6 additions & 11 deletions h2o-r/tests/testdir_algos/uplift/runit_uplift_rf_grid.R
Original file line number Diff line number Diff line change
@@ -1,19 +1,14 @@
setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source("../../../scripts/h2o-r-test-setup.R")
library(uplift)

check.uplift.grid <- function() {
data <- sim_pte(n = 2000, p = 6, rho = 0, sigma = sqrt(2), beta.den = 4)
print(summary(data))

data$treat <- ifelse(data$treat == 1, 1, 0)
data$treat <- as.factor(data$treat)
data$y <- as.factor(data$y)
data <- as.h2o(data)
x <- c("feature_1", "feature_2", "feature_3", "feature_4", "feature_5", "feature_6")
y <- "outcome"
treat <- "treatment"

x <- c("X1", "X2", "X3", "X4", "X5", "X6")
y <- "y"
treat <- "treat"
data <- h2o.importFile(path=locate("smalldata/uplift/upliftml_train.csv"),
col.types=list(by.col.name=c(treat, y), types=c("factor", "factor")))
print(summary(data))

pretty.list <- function(ll) {
str <- lapply(ll, function(x) { paste0("(", paste(x, collapse = ","), ")", sep = "") })
Expand Down
38 changes: 16 additions & 22 deletions h2o-r/tests/testdir_algos/uplift/runit_uplift_rf_mojo.R
Original file line number Diff line number Diff line change
@@ -1,37 +1,31 @@
setwd(normalizePath(dirname(R.utils::commandArgs(asValues = TRUE)$"f")))
source("../../../scripts/h2o-r-test-setup.R")
library(uplift)


test.uplift <- function() {
ntrees <- 6
ntrees <- 3
mtries <- 6
seed <- 42
uplift_metric <- "KL"
set.seed(seed)

# Test data preparation for each implementation
train <- sim_pte(n = 2000, p = 6, rho = 0, sigma = sqrt(2), beta.den = 4)
train$treat <- ifelse(train$treat == 1, 1, 0)
test <- sim_pte(n = 1000, p = 6, rho = 0, sigma = sqrt(2), beta.den = 4)
test$treat <- ifelse(test$treat == 1, 1, 0)
x <- c("feature_1", "feature_2", "feature_3", "feature_4", "feature_5", "feature_6")
y <- "outcome"
treatment_col <- "treatment"

trainh2o <- train
trainh2o$treat <- as.factor(train$treat)
trainh2o$y <- as.factor(train$y)
trainh2o <- as.h2o(trainh2o)

testh2o <- test
testh2o$treat <- as.factor(test$treat)
testh2o$y <- as.factor(test$y)
testh2o <- as.h2o(testh2o)
# Test data preparation for each implementation
train <- h2o.importFile(path=locate("smalldata/uplift/upliftml_train.csv"),
col.types=list(by.col.name=c(treatment_col, y), types=c("factor", "factor")))
test <- h2o.importFile(path=locate("smalldata/uplift/upliftml_test.csv"),
col.types=list(by.col.name=c(treatment_col, y), types=c("factor", "factor")))

model <- h2o.upliftRandomForest(
x = c("X1", "X2", "X3", "X4", "X5", "X6"),
y = "y",
training_frame = trainh2o,
validation_frame = testh2o,
treatment_column = "treat",
x = x,
y = y,
training_frame = train,
validation_frame = test,
treatment_column = treatment_col,
uplift_metric = uplift_metric,
auuc_type = "qini",
distribution = "bernoulli",
Expand All @@ -43,7 +37,7 @@ test.uplift <- function() {
seed = seed)

print(model)
pred.uplift <- h2o.predict(model, testh2o)
pred.uplift <- h2o.predict(model, test)
pred.uplift.df <- as.data.frame(pred.uplift)

tmpdir <- tempdir()
Expand All @@ -52,7 +46,7 @@ test.uplift <- function() {

model.mojo <- h2o.import_mojo(modelpath)
print(model.mojo)
pred.mojo <- h2o.predict(model.mojo, testh2o)
pred.mojo <- h2o.predict(model.mojo, test)
pred.mojo.df <- as.data.frame(pred.mojo)

expect_equal(pred.mojo.df[1,1], pred.uplift.df[1,1])
Expand Down
64 changes: 30 additions & 34 deletions h2o-r/tests/testdir_algos/uplift/runit_uplift_smoke.R
Original file line number Diff line number Diff line change
@@ -1,51 +1,46 @@
setwd(normalizePath(dirname(R.utils::commandArgs(asValues = TRUE)$"f")))
source("../../../scripts/h2o-r-test-setup.R")
library(uplift)


test.uplift <- function() {
ntrees <- 10
ntrees <- 3
mtries <- 6
seed <- 42
uplift_metrics <- c("KL", "ChiSquared", "Euclidean")
set.seed(seed)

# Test data preparation for each implementation
train <- sim_pte(n = 2000, p = 6, rho = 0, sigma = sqrt(2), beta.den = 4)
train$treat <- ifelse(train$treat == 1, 1, 0)
test <- sim_pte(n = 1000, p = 6, rho = 0, sigma = sqrt(2), beta.den = 4)
test$treat <- ifelse(test$treat == 1, 1, 0)
x <- c("feature_1", "feature_2", "feature_3", "feature_4", "feature_5", "feature_6")
y <- "outcome"
treatment_col <- "treatment"

trainh2o <- train
trainh2o$treat <- as.factor(train$treat)
trainh2o$y <- as.factor(train$y)
trainh2o <- as.h2o(trainh2o)

testh2o <- test
testh2o$treat <- as.factor(test$treat)
testh2o$y <- as.factor(test$y)
testh2o <- as.h2o(testh2o)
# Test data preparation for each implementation
train <- h2o.importFile(path=locate("smalldata/uplift/upliftml_train.csv"),
col.types=list(by.col.name=c(treatment_col, y), types=c("factor", "factor")))
test <- h2o.importFile(path=locate("smalldata/uplift/upliftml_test.csv"),
col.types=list(by.col.name=c(treatment_col, y), types=c("factor", "factor")))

expected_values_auuc_qini <- c(66.108996, 85.583648, 60.837472)
expected_values_auuc_lift <- c(0.212531, 0.260563, 0.204788)
expected_values_auuc_gain <- c(128.642298, 162.020112, 122.031586)
# expected values on the training data - KL, ChiSquared, Euclidean
expected_values_auuc_qini <- c(173.291123, 207.336457, 189.869900)
expected_values_auuc_lift <- c(0.162149, 0.162407, 0.170787)
expected_values_auuc_gain <- c(344.481112, 408.390572, 375.539891)

expected_values_aecu_qini <- c(82.12082, 101.594370, 76.857630)
expected_values_aecu_lift <- c(0.2285426, 0.276573, 0.220808)
expected_values_aecu_gain <- c(160.666, 194.041557, 154.071902)
expected_values_aecu_qini <- c(57.154241, 79.030343, 72.546412)
expected_values_aecu_lift <- c(0.115824, 0.111228, 0.123988)
expected_values_aecu_gain <- c(113.365474, 153.057822, 142.062874)

expected_values_auuc_norm_qini <- c(2.065906, 2.674489, 1.901171)
expected_values_auuc_norm_gain <- c(2.010036, 2.531564, 1.906744)
expected_values_auuc_norm_lift <- c(0.212531, 0.260563, 0.204788)
expected_values_auuc_norm_qini <- c(0.804835, 0.962955, 0.881833)
expected_values_auuc_norm_lift <- c(0.162149, 0.162407, 0.170787)
expected_values_auuc_norm_gain <- c(0.803964, 0.953119, 0.876450)

for (i in 1:length(uplift_metrics)) {
print(paste("Train h2o uplift model", uplift_metrics[i]))
model <- h2o.upliftRandomForest(
x = c("X1", "X2", "X3", "X4", "X5", "X6"),
y = "y",
training_frame = trainh2o,
validation_frame = testh2o,
treatment_column = "treat",
x = x,
y = y,
training_frame = train,
validation_frame = test,
treatment_column = treatment_col,
uplift_metric = uplift_metrics[i],
auuc_type = "qini",
distribution = "bernoulli",
Expand Down Expand Up @@ -110,17 +105,18 @@ test.uplift <- function() {
tol <- 1e-4
expect_equal(auuc, auuc_qini, tolerance=tol)
expect_equal(auuc, expected_values_auuc_qini[i], tolerance=tol)
expect_equal(auuc_gain, expected_values_auuc_gain[i], tolerance=tol)
expect_equal(auuc_lift, expected_values_auuc_lift[i], tolerance=tol)
expect_equal(auuc_gain, expected_values_auuc_gain[i], tolerance=tol)

expect_equal(qini, aecu_qini, tolerance=tol)
expect_equal(aecu_qini, expected_values_aecu_qini[i], tolerance=tol)
expect_equal(aecu_gain, expected_values_aecu_gain[i], tolerance=tol)
expect_equal(aecu_qini, expected_values_aecu_qini[i], tolerance=tol)
expect_equal(aecu_lift, expected_values_aecu_lift[i], tolerance=tol)

expect_equal(aecu_gain, expected_values_aecu_gain[i], tolerance=tol)

expect_equal(auuc_norm, auuc_qini_norm, tolerance=tol)
expect_equal(auuc_norm, expected_values_auuc_norm_qini[i], tolerance=tol)
expect_equal(auuc_gain_norm, expected_values_auuc_norm_gain[i], tolerance=tol)
expect_equal(auuc_lift_norm, expected_values_auuc_norm_lift[i], tolerance=tol)
expect_equal(auuc_gain_norm, expected_values_auuc_norm_gain[i], tolerance=tol)

model_ate <- h2o.ate(model, train=TRUE, valid=TRUE)
print(model_ate)
Expand Down

0 comments on commit aedac69

Please sign in to comment.