Added Santander
# Scripts for Santander Customer Satisfaction Competition

## Algorithms used
1. XGBoost
2. Nnet
3. RandomForest
4. Support Vector Machines
5. Blended and Stacked models

## Short description of scripts
* clean_process.R: This script removes constant and duplicate columns. Then, some of the features are deskewed.
* boost_fulldat.R: XGBoost is trained on the full data using 5-fold cross-validation (CV). The determined parameters are then used to fit on all the training data and predictions on the test are performed. This results in a high score.
* nnet_fulldat.R: Training of neural network predictors. Neural networks (with one hidden layer) do not outperform XGBoost.
* rf_fulldat.R: RandomForest predictors are trained. A stratified data set is necessary for Random Forests (i.e. a data set in which the observations corresponding to each response class are balanced).
* svm_fulldat.R: Support Vector Machine (SVM) predictors are trained. SVMs also need stratified data sets. They do not perform very well.

### Stacking
Model stacking is applied based on the models trained on the *full data*. In the *stacking* folder, we build the stacking model, and attempt to train the level 2 model. Then, in *stacking.tst0*, we stack up all the models using the full train data and then use the level2 model to predict on test data. We also considered the possibility of using simple logistic regression as the level 2 model, which indeed gave good results.

In the stacked models, we did not incorporate SVM, since its performance was poor.
100 changes: 100 additions & 0 deletions Santander/boost_fulldat.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
# Santander data (xgboost CV full data)
# vfull_dat
# Author: Burak H

# Packages whose functions are called directly below
library(dplyr)    # select()
library(caret)    # createDataPartition(), trainControl(), train()
library(xgboost)  # xgb.DMatrix(), xgboost()
library(pROC)     # roc() -- NOTE(review): assumed to be pROC::roc; confirm

# Read data
train0 <- read.csv("train.csv")
test0 <- read.csv("test.csv")

# Get the cleaned and processed data.
# clean_process.R evaluates to a function returning
# list(TARGET values, cleaned train, cleaned test, test IDs).
fun.clean <- dget("clean_process.R")
cleaned <- fun.clean(train0, test0)
y <- cleaned[[1]]
train0 <- cleaned[[2]]
test0 <- cleaned[[3]]
id.test <- cleaned[[4]]  # test-set IDs, needed for the submission file

train0$TARGET <- y # Put TARGET back

# Create training and validation sets for cross-validation
inTrain <- createDataPartition(y = train0$TARGET, p = 0.7, list = FALSE)
training <- train0[inTrain, ]
validation <- train0[-inTrain, ]

# Fit XGBOOST with Caret

# # Old grid
# xgb_grid <- expand.grid(eta = 2^seq(-6,-4), colsample_bytree = c(0.4, 0.6, 0.8), nrounds = 2^(8:9), max_depth = 5,
# min_child_weight = c(0,1,2), gamma = c(0,2,4,6))

# Initial grid
# xgb_grid <- expand.grid(eta = c(0.001,0.01,0.1,1,10),
# colsample_bytree = c(0.4, 0.6, 0.8, 1.0),
# nrounds = 2^(8:9),
# max_depth = 5,
# min_child_weight = c(0,1,2),
# gamma = c(0.001,0.01,1,10))

# Fitting nrounds = 512, max_depth = 5, eta = 0.1, gamma = 10, colsample_bytree = 0.6,
# min_child_weight = 1 on full training set: auc = 0.8411

# Try more targeted grid
xgb_grid <- expand.grid(eta = 2^seq(-6, -4), colsample_bytree = c(0.4, 0.6, 0.8),
                        nrounds = c(350, 400, 450, 500, 550), max_depth = 5,
                        min_child_weight = c(0, 1, 2), gamma = c(2, 4, 6, 8, 10))

# 5-fold CV, scored by ROC (twoClassSummary needs class probabilities)
ctrl <- trainControl(method = "cv", number = 5, classProbs = TRUE,
                     summaryFunction = twoClassSummary, verboseIter = TRUE)

# Class labels are recoded to "y"/"n" because classProbs = TRUE needs
# labels that are valid R variable names.
mod.xgb <- train(x = select(training, -TARGET),
                 y = ifelse(training$TARGET == "1", "y", "n"),
                 method = "xgbTree", trControl = ctrl, metric = "ROC",
                 tuneGrid = xgb_grid, nthread = 8)

# 5-fold CV result was:
# Fitting nrounds = 400, max_depth = 5, eta = 0.0312, gamma = 4, colsample_bytree = 0.4, min_child_weight = 1
# auc = 0.8414
# Note: 10-fold does not make a big difference.

# Save model$finalModel, "mod-xgb")

# Predictions on the held-out validation split
validation$pred <- predict(mod.xgb$finalModel,
                           newdata = as.matrix(select(validation, -TARGET)),
                           outputmargin = FALSE)

roc.curve <- roc(validation$TARGET, validation$pred)

# Confusion Matrix
#cm <- confusionMatrix(ifelse(validation$pred < 0.5, 1, 0), validation$TARGET) # 0 is positive class

# ---- FINAL FIT ---- #
# Now, let us fit a boosting model on the whole train0 set and predict test0

dtrain <- xgb.DMatrix(data = as.matrix(select(train0, -TARGET)),
                      label = train0$TARGET)

# Hyper-parameters taken from the CV result noted above
params <- list(booster = "gbtree", objective = "binary:logistic",
               max_depth = 5, eta = 2^(-5), colsample_bytree = 0.4,
               subsample = 1, gamma = 4, min_child_weight = 1)

# subsample is already set in params, so it is not repeated here
mod.xgb <- xgboost(params = params, data = dtrain,
                   nrounds = 400, nthread = 8,
                   print.every.n = 10)

# Prediction for test0
pred <- predict(mod.xgb, newdata = as.matrix(test0))
final <- data.frame(ID = id.test, TARGET = pred)
write.csv(final, "boost_deskewed_42116.csv", row.names = FALSE)
61 changes: 61 additions & 0 deletions Santander/clean_process.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Santander data clean/process
# Clean and Process the data
# Author: Burak H
require(dplyr); require(e1071)

# Clean and pre-process the Santander train/test data frames.
#
# Steps:
#   1. split off TARGET from the training frame,
#   2. drop features that are constant in either frame,
#   3. drop columns that duplicate an earlier column of the training frame,
#   4. split off the ID columns,
#   5. log-transform ("deskew") features when that reduces skewness enough.
#
# Arguments:
#   train0  training data frame containing ID, TARGET and the features
#   test0   test data frame containing ID and the same features
#
# Returns an unnamed list:
#   [[1]] TARGET values, [[2]] processed training features,
#   [[3]] processed test features, [[4]] test-set IDs.
clean_process <- function(train0, test0) {
  # Save TARGET values and remove
  y <- train0$TARGET
  train0$TARGET <- NULL

  # Find constant features and remove them from analysis
  for (i in names(train0)) {
    if (length(unique(train0[[i]])) == 1 || length(unique(test0[[i]])) == 1) {
      train0[[i]] <- NULL
      test0[[i]] <- NULL
    }
  }

  # Find duplicate columns and remove them. duplicated() on the column list
  # flags every column identical to an earlier one; a logical mask keeps all
  # columns when there are no duplicates (negative indexing with an empty
  # vector would instead drop every column).
  keep <- !duplicated(as.list(train0))
  train0 <- train0[, keep, drop = FALSE]
  # Select by name so the test frame ends up with exactly the same columns
  test0 <- test0[, names(train0), drop = FALSE]

  # Save IDs and Remove them
  id.train <- train0$ID
  id.test <- test0$ID
  train0$ID <- NULL
  test0$ID <- NULL

  # Deskewing function.
  # Binds the two frames so the transform uses the same shift for both,
  # replaces a column x by log(1 - min(x) + x) when
  # |skew(x)| > threshold * |skew(transformed x)|, then splits the frames
  # apart again. The returned frames carry a helper column `isep`.
  dsk <- function(df.tr, df.ts, threshold = 1.00) {
    df <- rbind(df.tr, df.ts) # Bind the two frames
    for (j in seq_along(df)) {
      c.min <- min(df[[j]])
      t <- log(1 - c.min + df[[j]])
      # isTRUE() skips the column when a skewness is NA/NaN
      if (isTRUE(abs(skewness(df[[j]])) > threshold * abs(skewness(t)))) {
        df[[j]] <- t
      }
    }
    # Separate the data frames back into original
    isep <- c(rep(1, nrow(df.tr)), rep(-1, nrow(df.ts)))
    df <- mutate(df, isep = factor(isep))
    # Return train and test separately
    list(filter(df, isep == "1"), filter(df, isep == "-1"))
  }

  retfn <- dsk(train0, test0, threshold = 0.5)
  train0 <- retfn[[1]]
  test0 <- retfn[[2]]
  train0$isep <- NULL
  test0$isep <- NULL

  # Return the processed data and the TARGET values
  list(y, train0, test0, id.test)
}

98 changes: 98 additions & 0 deletions Santander/nnet_fulldat.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
# Santander data: nnet on full data
# Author: Burak H
library(dplyr); library(caret); library(nnet); library(e1071)
library(pROC)  # roc() -- NOTE(review): assumed to be pROC::roc; confirm

# Read data
train0 <- read.csv("train.csv")
test0 <- read.csv("test.csv")

# Get the cleaned and processed data.
# clean_process.R evaluates to a function returning
# list(TARGET values, cleaned train, cleaned test, test IDs).
fun.clean <- dget("clean_process.R")
cleaned <- fun.clean(train0, test0)
y <- cleaned[[1]]
train0 <- cleaned[[2]]
test0 <- cleaned[[3]]
id.test <- cleaned[[4]]  # test-set IDs (not used further in this script)

train0$TARGET <- y # Put TARGET back

# Create training and validation sets for cross-validation
inTrain <- createDataPartition(y = train0$TARGET, p = 0.7, list = FALSE)
training <- train0[inTrain, ]
testing <- train0[-inTrain, ]

# -------------------------#
# ------ Train NNET ------ #
# ------------------------ #

# Let us determine the decay and hidden_layer_size by cross-validation using caret
input_layer_size <- ncol(training) - 1 # Initial layer (-1 for label column)
output_layer_size <- 2 # Number of classes
max_hidden_layer_size <- 15 # Maximum value of hidden layer size

# Maximum number of weights (passed to nnet as MaxNWts so the largest
# network in the grid is allowed)
N.weights <- (input_layer_size + 1) * max_hidden_layer_size +
  (max_hidden_layer_size + 1) * output_layer_size

# # --- CAUTION: This takes days!!! --- #
# # Grid
# nnet.grid <- expand.grid(decay = c(0, 1, 2, 4), size = c(5, 10, 50))
# # Train control
# ctrl <- trainControl(method = "cv", number = 5, classProbs = TRUE, summaryFunction = twoClassSummary,
# verboseIter = TRUE)
# # Model training
# mod.nnet <- train(x = select(training, -TARGET), y = ifelse(training$TARGET == "1", "y", "n"),
# method = "nnet", trControl = ctrl, metric = "ROC", tuneGrid = nnet.grid,
# MaxNWts = N.weights , maxit = 1000)
# # After training, save the model
# save(mod.nnet, file = "nnet_final.Rdata")
# # ---END CAUTION ---- #

# Since we have learned that size = 5 and decay = 2 works for the reduced data, let's fix size = c(4, 5)
# and try to tune decay in the full data

# Grid
#nnet.grid <- expand.grid(decay = c(1, 2, 4), size = c(4,5))
nnet.grid <- expand.grid(decay = c(2, 4), size = c(5, 10, 15))

# Train control: 5-fold CV scored by ROC
ctrl <- trainControl(method = "cv", number = 5, classProbs = TRUE,
                     summaryFunction = twoClassSummary, verboseIter = TRUE)

# Model training. Class labels are recoded to "y"/"n" because
# classProbs = TRUE needs labels that are valid R variable names.
mod.nnet <- train(x = select(training, -TARGET),
                  y = ifelse(training$TARGET == "1", "y", "n"),
                  method = "nnet", trControl = ctrl, metric = "ROC",
                  tuneGrid = nnet.grid,
                  MaxNWts = N.weights, maxit = 1000)
# Size 5 decay 2 is bestTune, try smaller size too

# --------- #

# Neural network structure
input_layer_size <- ncol(training) - 1 # Initial layer (-1 for label column)
hidden_layer_size <- 5 # hidden layer size, from bestTune
output_layer_size <- 2 # Number of classes
dec <- 2.0 # From best tune

# # Number of weights
# N.weights = (input_layer_size+1)*hidden_layer_size + (hidden_layer_size+1)*output_layer_size
# # Need to determine hidden_layer_size and decay by CV
# mod.nnet <- nnet(TARGET ~., data = mutate(training, TARGET = factor(TARGET)), softmax = FALSE,
# size = hidden_layer_size, MaxNWts = N.weights , maxit = 1000, decay = 1.0)

# Predict on the held-out split with the tuned network
pred.nnet <- predict(mod.nnet$finalModel, newdata = testing)
#pred.class <- as.numeric(ifelse(pred.nnet0 < 0.5, 0, 1))
#cm.nnet0 <- confusionMatrix(validation.nn$TARGET, pred.class)

ROC <- roc(response = testing$TARGET, predictor = as.numeric(pred.nnet))
65 changes: 65 additions & 0 deletions Santander/rf_fulldat.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# Santander data (Random Forest CV full data)
# vfull_dat
# Author: Burak H

# Packages whose functions are called directly below
library(dplyr)  # filter(), select()
library(caret)  # createDataPartition(), createFolds(), trainControl(), train()

# Read data
train0 <- read.csv("train.csv")
test0 <- read.csv("test.csv")

# Get the cleaned and processed data.
# clean_process.R evaluates to a function returning
# list(TARGET values, cleaned train, cleaned test, test IDs).
fun.clean <- dget("clean_process.R")
cleaned <- fun.clean(train0, test0)
y <- cleaned[[1]]
train0 <- cleaned[[2]]
test0 <- cleaned[[3]]
id.test <- cleaned[[4]]  # test-set IDs (not used further in this script)

train0$TARGET <- y # Put TARGET back

# Create training and validation sets for cross-validation
inTrain <- createDataPartition(y = train0$TARGET, p = 0.7, list = FALSE)
training <- train0[inTrain, ]
validation <- train0[-inTrain, ]

# For RF, we pretty much need a balanced set for training
nmin <- sum(training$TARGET == "1")
nmax <- sum(training$TARGET == "0")
nfolds <- floor(nmax / nmin) # Number of data separations

# Folds to use: split the indices of the 0-class rows into nfolds groups,
# so each group has roughly as many rows as there are 1-class rows
folds <- createFolds(1:nmax, k = nfolds)

# All 0's and all 1's data
temp.0 <- filter(training, TARGET == "0")
temp.1 <- filter(training, TARGET == "1")

# Balanced data used for predictions: all 1's plus the first group of 0's
tr.bal <- rbind(temp.1, temp.0[folds[[1]], ])

# Training with Caret
rf_grid <- expand.grid(mtry = c(10, 20, 30, 40, 50, 60, 100))

# 10-fold CV scored by ROC (twoClassSummary needs class probabilities)
ctrl <- trainControl(method = "cv", number = 10, classProbs = TRUE,
                     summaryFunction = twoClassSummary, verboseIter = TRUE)

# Class labels are recoded to "y"/"n" because classProbs = TRUE needs
# labels that are valid R variable names.
mod.rf <- train(x = select(tr.bal, -TARGET),
                y = ifelse(tr.bal$TARGET == "1", "y", "n"),
                method = "rf", trControl = ctrl, metric = "ROC",
                tuneGrid = rf_grid, ntree = 1000)

# mtry = 40 in the first round of RF
# Second round of CV
# Training with Caret
rf_grid <- expand.grid(mtry = c(32, 34, 36, 38, 40, 42, 44, 46, 48))

ctrl <- trainControl(method = "cv", number = 10, classProbs = TRUE,
                     summaryFunction = twoClassSummary, verboseIter = TRUE)
mod.rf <- train(x = select(tr.bal, -TARGET),
                y = ifelse(tr.bal$TARGET == "1", "y", "n"),
                method = "rf", trControl = ctrl, metric = "ROC",
                tuneGrid = rf_grid, ntree = 1000)

# In this one mtry = 38 is found..

