Added Santander
bhimmetoglu committed May 4, 2016
1 parent 36bbcde commit 1ad6c96
Showing 13 changed files with 304,620 additions and 0 deletions.
20 changes: 20 additions & 0 deletions Santander/README.md
@@ -0,0 +1,20 @@
# Scripts for Santander Customer Satisfaction Competition

## Algorithms used
1. XGBoost
2. Nnet
3. RandomForest
4. Support Vector Machines
5. Blended and Stacked models

## Short description of scripts
* clean_process.R: This script removes constant and duplicate columns. Then, some of the features are deskewed.
* boost_fulldat.R: XGBoost is trained on the full data using 5-fold cross-validation (CV). The tuned parameters are then used to fit a model on all of the training data, and predictions are made on the test set. This results in a high score (cross-validated AUC of roughly 0.84).
* nnet_fulldat.R: Trains neural network predictors. Neural networks (with one hidden layer) do not outperform XGBoost.
* rf_fulldat.R: Trains RandomForest predictors. Random Forests require a stratified data set (i.e. one in which the observations corresponding to each response class are balanced).
* svm_fulldat.R: Trains Support Vector Machine (SVM) predictors. SVMs also need stratified data sets. They do not perform very well.

### Stacking
Model stacking is applied to the models trained on the *full data*. In the *stacking* folder, we build the stacking model and attempt to train the level-2 model. Then, in *stacking.tst0*, we stack all the models using the full training data and use the level-2 model to predict on the test data. We also considered using simple logistic regression as the level-2 model, which indeed gave good results.

In the stacked models, we did not incorporate SVM, since its performance was poor.
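
As a rough sketch of the level-2 step (the actual scripts live in the *stacking* folders and are not shown here), suppose the out-of-fold predictions of the level-1 models are collected as columns of a data frame `meta_train` (together with `TARGET`), and the corresponding test-set predictions in `meta_test`. The column names `xgb`, `nnet` and `rf` below are hypothetical, for illustration only:

```r
# Level-2 model: simple logistic regression on the level-1 predictions
level2 <- glm(TARGET ~ xgb + nnet + rf, data = meta_train, family = binomial)

# Final stacked prediction on the test set
pred.stacked <- predict(level2, newdata = meta_test, type = "response")
```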
100 changes: 100 additions & 0 deletions Santander/boost_fulldat.R
@@ -0,0 +1,100 @@
# Santander data (xgboost CV full data)
# vfull_dat
# Author: Burak H
#
require(dplyr)
require(caret)
require(xgboost)
require(e1071)
setwd("~/Works/Rworkspace/Santender/")

# Read data
train0 <- read.csv("train.csv")
test0 <- read.csv("test.csv")

# Get the cleaned and processed data
setwd("~/Works/Rworkspace/Santender/final/")
fun.clean <- dget("clean_process.R")
cleaned <- fun.clean(train0,test0)
y <- cleaned[[1]]; train0 <- cleaned[[2]]; test0 <- cleaned[[3]]
test0.id <- cleaned[[4]]
rm(cleaned)

train0$TARGET <- y # Put TARGET back

# Create training and validation sets for cross-validation
set.seed(101)
inTrain <- createDataPartition(y = train0$TARGET, p = 0.7, list = FALSE)
training <- train0[inTrain, ]; validation <- train0[-inTrain, ]

#--------------------------#
# Fit XGBOOST with Caret
#--------------------------#

# # Old grid
# xgb_grid <- expand.grid(eta = 2^seq(-6,-4), colsample_bytree = c(0.4, 0.6, 0.8), nrounds = 2^(8:9), max_depth = 5,
# min_child_weight = c(0,1,2), gamma = c(0,2,4,6))

# Initial grid
# xgb_grid <- expand.grid(eta = c(0.001,0.01,0.1,1,10),
# colsample_bytree = c(0.4, 0.6, 0.8, 1.0),
# nrounds = 2^(8:9),
# max_depth = 5,
# min_child_weight = c(0,1,2),
# gamma = c(0.001,0.01,1,10))

# Fitting nrounds = 512, max_depth = 5, eta = 0.1, gamma = 10, colsample_bytree = 0.6,
# min_child_weight = 1 on full training set: auc = 0.8411

# Try more targeted grid
xgb_grid <- expand.grid(eta = 2^seq(-6,-4), colsample_bytree = c(0.4, 0.6, 0.8),
nrounds = c(350, 400, 450, 500, 550), max_depth = 5,
min_child_weight = c(0,1,2), gamma = c(2,4,6,8,10))

ctrl <- trainControl(method = "cv", number = 5, classProbs = TRUE, summaryFunction = twoClassSummary,
verboseIter = TRUE)

mod.xgb <- train(x = select(training, -TARGET), y = ifelse(training$TARGET == "1", "y", "n"),
method = "xgbTree", trControl = ctrl, metric = "ROC", tuneGrid = xgb_grid,
nthread = 8)


# 5-fold CV result was:
# Fitting nrounds = 400, max_depth = 5, eta = 0.0312, gamma = 4, colsample_bytree = 0.4, min_child_weight = 1
# auc = 0.8414
# Note: 10-fold does not make a big difference.
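# The tuned parameters and CV performance can also be read directly from the caret object:
# mod.xgb$bestTune            # best parameter combination
# max(mod.xgb$results$ROC)    # best cross-validated AUC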

# Save model
#xgb.save(mod.xgb$finalModel, "mod-xgb")

# Predictions
validation$pred <- predict(mod.xgb$finalModel, newdata = as.matrix(select(validation, -TARGET)), outputmargin = FALSE)

# ROC
library(pROC)
roc.curve <- roc(validation$TARGET, validation$pred)
plot(roc.curve)
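# Numeric AUC on the validation split (pROC::auc works directly on the roc object)
auc(roc.curve)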

# Confusion Matrix
#cm <- confusionMatrix(ifelse(validation$pred < 0.5, 1, 0), validation$TARGET) # 0 is positive class

# ---- FINAL FIT ---- #
# Now, let us fit a boosting model on the whole train0 set and predict test0

dtrain <- xgb.DMatrix(data = as.matrix(select(train0, -TARGET)),
label = train0$TARGET)

params <- list(booster="gbtree", objective = "binary:logistic",
max_depth = 5, eta = 2^(-5), colsample_bytree = 0.4,
subsample = 1, gamma = 4, min_child_weight = 1)

mod.xgb <- xgboost(params = params, data = dtrain,
nrounds = 400, nthread = 8, subsample = 1,
print.every.n = 10 )

#--------------------------#
# Prediction for test0
#--------------------------#
pred <- predict(mod.xgb, newdata = as.matrix(test0))
final <- data.frame(ID = test0.id, TARGET = pred)
write.csv(final,"boost_deskewed_42116.csv", row.names = FALSE)
61 changes: 61 additions & 0 deletions Santander/clean_process.R
@@ -0,0 +1,61 @@
# Santander data clean/process
# Clean and Process the data
# Author: Burak H
#
require(dplyr); require(e1071)

clean_process <- function(train0,test0){
# Save TARGET values and remove
y <- train0$TARGET; train0$TARGET <- NULL

# Find constant features and remove them from analysis
for (i in names(train0)){
if (length(unique(train0[[i]])) == 1 | length(unique(test0[[i]])) == 1){
train0[[i]] <- NULL
test0[[i]] <- NULL
}
}

# Find duplicate columns and remove them
dpl.cl <- data.frame(i = integer(), j = integer())

for ( i in 2:(ncol(train0)-1) ){
for ( j in (i+1):ncol(train0) ){
if ( identical(train0[,i], train0[,j]) == TRUE ){
dpl.cl <- rbind(dpl.cl, data.frame(i=i,j=j))
}
}
}
train0 <- train0[,-dpl.cl$j]
test0 <- test0[, -dpl.cl$j]

# Save IDs and Remove them
train.id <- train0$ID; test.id <- test0$ID
train0$ID <- NULL; test0$ID <- NULL

# Deskewing function
dsk <- function(df.tr, df.ts, threshold = 1.00){
#
df <- rbind(df.tr, df.ts) # Bind the two frames
#
for (j in 1:ncol(df)){
c.min <- min(df[[j]]); #c.max <- max(df[[j]])
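# Shift so the smallest value maps to 1, then log-transform; the transform is kept only if it reduces skewness (relative to the threshold)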
t <- log(1 - c.min + df[[j]])
if ( abs(skewness(df[[j]])) > threshold * abs(skewness(t))){
df[[j]] <- t
}
}
# Separate the data frames back into original
isep = c(rep(1, nrow(df.tr)),rep(-1, nrow(df.ts)))
df <- mutate(df, isep = factor(isep))
list(filter(df, isep == "1"), filter(df, isep == "-1")) # Return train and test separately
}

retfn <- dsk(train0,test0,threshold = 0.5)
train0 <- retfn[[1]]; test0 <- retfn[[2]]
train0$isep <- NULL; test0$isep <- NULL

# Return the processed data and the TARGET values
list(y, train0, test0, test.id)
}

98 changes: 98 additions & 0 deletions Santander/nnet_fulldat.R
@@ -0,0 +1,98 @@
# Santander data: nnet on full data
# Author: Burak H
#
library(dplyr); library(caret); library(nnet); library(e1071)
library(Matrix)
setwd("~/Works/Rworkspace/Santender/")
#setwd("~/Coursera/projects/santender/")

# Read data
train0 <- read.csv("train.csv")
test0 <- read.csv("test.csv")

# Get the cleaned and processed data
setwd("~/Works/Rworkspace/Santender/final/")
fun.clean <- dget("clean_process.R")
cleaned <- fun.clean(train0,test0)
y <- cleaned[[1]]; train0 <- cleaned[[2]]; test0 <- cleaned[[3]]
test0.id <- cleaned[[4]]
rm(cleaned)

train0$TARGET <- y # Put TARGET back

# Create training and validation sets for cross-validation
set.seed(101)
inTrain <- createDataPartition(y = train0$TARGET, p = 0.7, list = FALSE)
training <- train0[inTrain, ]; testing <- train0[-inTrain, ]

# -------------------------#
# ------ Train NNET ------ #
# ------------------------ #

# Let us determine the decay and hidden_layer_size by cross-validation using caret
input_layer_size = ncol(training)-1 # Initial layer (-1 for label column)
output_layer_size = 2 # Number of classes
max_hidden_layer_size = 15 # Maximum value of hidden layer size

# Maximum number of weights
N.weights = (input_layer_size+1)*max_hidden_layer_size + (max_hidden_layer_size+1)*output_layer_size

# # --- CAUTION: This takes days!!! --- #
#
# # Grid
# nnet.grid <- expand.grid(decay = c(0, 1, 2, 4), size = c(5, 10, 50))
#
# # Train control
# ctrl <- trainControl(method = "cv", number = 5, classProbs = TRUE, summaryFunction = twoClassSummary,
# verboseIter = TRUE)
#
# # Model training
# mod.nnet <- train(x = select(training, -TARGET), y = ifelse(training$TARGET == "1", "y", "n"),
# method = "nnet", trControl = ctrl, metric = "ROC", tuneGrid = nnet.grid,
# MaxNWts = N.weights , maxit = 1000)
#
# # After training, save the model
# save(mod.nnet, file = "nnet_final.Rdata")
#
# # ---END CAUTION ---- #

# Since we have learned that size = 5 and decay = 2 works for the reduced data, let's fix size = c(4, 5)
# and try to tune decay in the full data

# Grid
#nnet.grid <- expand.grid(decay = c(1, 2, 4), size = c(4,5))
nnet.grid <- expand.grid(decay = c(2,4), size = c(5, 10, 15))

# Train control
ctrl <- trainControl(method = "cv", number = 5, classProbs = TRUE, summaryFunction = twoClassSummary,
verboseIter = TRUE)

# Model training
mod.nnet <- train(x = select(training, -TARGET), y = ifelse(training$TARGET == "1", "y", "n"),
method = "nnet", trControl = ctrl, metric = "ROC", tuneGrid = nnet.grid,
MaxNWts = N.weights , maxit = 1000)
# Size 5 decay 2 is bestTune, try smaller size too

# --------- #

# Neural network structure
input_layer_size = ncol(training)-1 # Initial layer (-1 for label column)
hidden_layer_size = 5 # hidden layer size, from bestTune
output_layer_size = 2 # Number of classes
dec = 2.0 # From best tune

# # Number of weights
# N.weights = (input_layer_size+1)*hidden_layer_size + (hidden_layer_size+1)*output_layer_size
#
# # Need to determine hidden_layer_size and decay by CV
# mod.nnet <- nnet(TARGET ~., data = mutate(training, TARGET = factor(TARGET)), softmax = FALSE,
# size = hidden_layer_size, MaxNWts = N.weights , maxit = 1000, decay = 1.0)

pred.nnet <- predict(mod.nnet$finalModel, newdata = select(testing, -TARGET)) # drop the label column, as in the other scripts
#pred.class <- as.numeric(ifelse(pred.nnet0 < 0.5, 0, 1))
#cm.nnet0 <- confusionMatrix(validation.nn$TARGET, pred.class)

# ROC
library(pROC)
ROC <- roc(response = testing$TARGET, predictor = as.numeric(pred.nnet))
plot(ROC)
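# Numeric AUC, for comparison with the XGBoost model
auc(ROC)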
65 changes: 65 additions & 0 deletions Santander/rf_fulldat.R
@@ -0,0 +1,65 @@
# Santander data (Random Forest CV full data)
# vfull_dat
# Author: Burak H
#
require(dplyr)
require(caret)
require(randomForest)
require(e1071)
setwd("~/Works/Rworkspace/Santender/")

# Read data
train0 <- read.csv("train.csv")
test0 <- read.csv("test.csv")

# Get the cleaned and processed data
setwd("~/Works/Rworkspace/Santender/final/")
fun.clean <- dget("clean_process.R")
cleaned <- fun.clean(train0,test0)
y <- cleaned[[1]]; train0 <- cleaned[[2]]; test0 <- cleaned[[3]]
test0.id <- cleaned[[4]]
rm(cleaned)

train0$TARGET <- y # Put TARGET back

# Create training and validation sets for cross-validation
set.seed(101)
inTrain <- createDataPartition(y = train0$TARGET, p = 0.7, list = FALSE)
training <- train0[inTrain, ]; validation <- train0[-inTrain, ]

# For RF, we pretty much need a balanced set for training
nmin <- sum(training$TARGET == "1")
nmax <- sum(training$TARGET == "0")
nfolds <- floor(nmax/nmin) # Number of data separations

# Folds to use
folds <- createFolds(1:nmax, k = nfolds)

# All 0's and all 1's data
temp.0 <- filter(training, TARGET == "0")
temp.1 <- filter(training, TARGET == "1")

# Balanced data used for predictions
tr.bal <- rbind(temp.1, temp.0[folds[[1]], ])

# Training with Caret
rf_grid <- expand.grid(mtry = c(10, 20, 30, 40, 50, 60, 100))

ctrl <- trainControl(method = "cv", number = 10, classProbs = TRUE, summaryFunction = twoClassSummary,
verboseIter = TRUE)

mod.rf <- train(x = select(tr.bal, -TARGET), y = ifelse(tr.bal$TARGET == "1", "y", "n"),
method = "rf", trControl = ctrl, metric = "ROC", tuneGrid = rf_grid, ntree = 1000)

# mtry = 40 in the first round of RF
# Second round of CV
# Training with Caret
rf_grid <- expand.grid(mtry = c(32, 34, 36, 38, 40, 42, 44, 46, 48))

ctrl <- trainControl(method = "cv", number = 10, classProbs = TRUE, summaryFunction = twoClassSummary,
verboseIter = TRUE)
set.seed(101)
mod.rf <- train(x = select(tr.bal, -TARGET), y = ifelse(tr.bal$TARGET == "1", "y", "n"),
method = "rf", trControl = ctrl, metric = "ROC", tuneGrid = rf_grid, ntree = 1000)

# In this round of CV, mtry = 38 is selected.
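
# --- Possible final fit (sketch only) ---
# Refit on the balanced set with the mtry found above and check AUC on the held-out validation set
mod.rf.final <- randomForest(x = select(tr.bal, -TARGET), y = factor(tr.bal$TARGET),
                             mtry = 38, ntree = 1000)
library(pROC)
rf.prob <- predict(mod.rf.final, newdata = select(validation, -TARGET), type = "prob")[, "1"]
plot(roc(validation$TARGET, rf.prob))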