forked from bhimmetoglu/kaggle_101
Commit 1ad6c96 (1 parent: 36bbcde): 13 changed files with 304,620 additions and 0 deletions.

README.md
@@ -0,0 +1,20 @@
# Scripts for Santander Customer Satisfaction Competition

## Algorithms used
1. XGBoost
2. Nnet
3. RandomForest
4. Support Vector Machines
5. Blended and Stacked models

## Short description of scripts
* clean_process.R: This script removes constant and duplicate columns. Then, some of the features are deskewed.
* boost_fulldat.R: XGBoost is trained on the full data using 5-fold cross-validation (CV). The selected parameters are then used to fit on all the training data and to predict on the test set. This results in a high score (CV AUC of about 0.84).
* nnet_fulldat.R: Training of neural network predictors. Neural networks (with one hidden layer) do not outperform XGBoost.
* rf_fulldat.R: RandomForest predictors are trained. A stratified data set is necessary for Random Forests (i.e. a data set where the observations corresponding to each response class are balanced).
* svm_fulldat.R: Support Vector Machine (SVM) predictors are trained. SVMs also need stratified data sets. They do not perform very well.

### Stacking
Model stacking is applied to the models trained on the *full data*. In the *stacking* folder, we build the stacking model and attempt to train the level-2 model. Then, in *stacking.tst0*, we stack all the models using the full training data and use the level-2 model to predict on the test data. We also tried simple logistic regression as the level-2 model, which indeed gave good results. A minimal sketch of this idea is shown below.

In the stacked models, we did not incorporate SVM, since its performance was poor.
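
For illustration, here is a minimal, hypothetical sketch of the logistic-regression level-2 idea. The data frames `train.meta`/`test.meta` and the columns `xgb`, `nnet`, `rf` (out-of-fold level-1 predictions) are assumed names used only for this sketch; the actual implementation lives in the *stacking* folders.

```r
# Level-1 out-of-fold predictions (assumed layout):
#   train.meta: columns xgb, nnet, rf (predicted probabilities) and TARGET (0/1)
#   test.meta : columns xgb, nnet, rf for the test set

# Level-2 model: plain logistic regression on the level-1 predictions
lvl2 <- glm(TARGET ~ xgb + nnet + rf, data = train.meta, family = binomial)

# Stacked probability predictions on the test set
stacked.pred <- predict(lvl2, newdata = test.meta, type = "response")
```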

boost_fulldat.R
@@ -0,0 +1,100 @@
# Santander data (xgboost CV full data)
# vfull_dat
# Author: Burak H
#
require(dplyr)
require(caret)
require(xgboost)
require(e1071)
setwd("~/Works/Rworkspace/Santender/")

# Read data
train0 <- read.csv("train.csv")
test0 <- read.csv("test.csv")

# Get the cleaned and processed data
setwd("~/Works/Rworkspace/Santender/final/")
fun.clean <- dget("clean_process.R")
cleaned <- fun.clean(train0,test0)
y <- cleaned[[1]]; train0 <- cleaned[[2]]; test0 <- cleaned[[3]]
test0.id <- cleaned[[4]]
rm(cleaned)

train0$TARGET <- y # Put TARGET back

# Create training and validation sets for cross-validation
set.seed(101)
inTrain <- createDataPartition(y = train0$TARGET, p = 0.7, list = FALSE)
training <- train0[inTrain, ]; validation <- train0[-inTrain, ]

#--------------------------#
# Fit XGBOOST with Caret
#--------------------------#

# # Old grid
# xgb_grid <- expand.grid(eta = 2^seq(-6,-4), colsample_bytree = c(0.4, 0.6, 0.8), nrounds = 2^(8:9), max_depth = 5,
#                         min_child_weight = c(0,1,2), gamma = c(0,2,4,6))

# Initial grid
# xgb_grid <- expand.grid(eta = c(0.001,0.01,0.1,1,10),
#                         colsample_bytree = c(0.4, 0.6, 0.8, 1.0),
#                         nrounds = 2^(8:9),
#                         max_depth = 5,
#                         min_child_weight = c(0,1,2),
#                         gamma = c(0.001,0.01,1,10))

# Fitting nrounds = 512, max_depth = 5, eta = 0.1, gamma = 10, colsample_bytree = 0.6,
# min_child_weight = 1 on full training set: auc = 0.8411

# Try more targeted grid
xgb_grid <- expand.grid(eta = 2^seq(-6,-4), colsample_bytree = c(0.4, 0.6, 0.8),
                        nrounds = c(350, 400, 450, 500, 550), max_depth = 5,
                        min_child_weight = c(0,1,2), gamma = c(2,4,6,8,10))

ctrl <- trainControl(method = "cv", number = 5, classProbs = TRUE, summaryFunction = twoClassSummary,
                     verboseIter = TRUE)

mod.xgb <- train(x = select(training, -TARGET), y = ifelse(training$TARGET == "1", "y", "n"),
                 method = "xgbTree", trControl = ctrl, metric = "ROC", tuneGrid = xgb_grid,
                 nthread = 8)

# 5-fold CV result was:
# Fitting nrounds = 400, max_depth = 5, eta = 0.0312, gamma = 4, colsample_bytree = 0.4, min_child_weight = 1
# auc = 0.8414
# Note: 10-fold does not make a big difference.

# Save model
#xgb.save(mod.xgb$finalModel, "mod-xgb")

# Predictions
validation$pred <- predict(mod.xgb$finalModel, newdata = as.matrix(select(validation, -TARGET)), outputmargin = FALSE)

# ROC
library(pROC)
roc.curve <- roc(validation$TARGET, validation$pred)
plot(roc.curve)
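
# Quick check (sketch): the numeric AUC on the validation split via pROC::auc;
# this should be close to the ~0.84 CV estimate reported above
auc(roc.curve)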

# Confusion Matrix
#cm <- confusionMatrix(ifelse(validation$pred < 0.5, 1, 0), validation$TARGET) # 0 is positive class

# ---- FINAL FIT ---- #
# Now, let us fit a boosting model on the whole train0 set and predict test0

dtrain <- xgb.DMatrix(data = as.matrix(select(train0, -TARGET)),
                      label = train0$TARGET)

params <- list(booster="gbtree", objective = "binary:logistic",
               max_depth = 5, eta = 2^(-5), colsample_bytree = 0.4,
               subsample = 1, gamma = 4, min_child_weight = 1)

mod.xgb <- xgboost(params = params, data = dtrain,
                   nrounds = 400, nthread = 8, subsample = 1,
                   print.every.n = 10)
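
# A rough, optional sketch of how the same cross-validation could be run directly
# with xgb.cv on the dtrain/params defined above (argument names follow the xgboost
# R API of this era); kept commented out since caret::train already did the 5-fold CV:
# cv.res <- xgb.cv(params = params, data = dtrain, nrounds = 400,
#                  nfold = 5, metrics = "auc", nthread = 8)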

#--------------------------#
# Prediction for test0
#--------------------------#
pred <- predict(mod.xgb, newdata = as.matrix(test0))
final <- data.frame(ID = test0.id, TARGET = pred)
write.csv(final, "boost_deskewed_42116.csv", row.names = FALSE)

clean_process.R
@@ -0,0 +1,61 @@
# Santander data clean/process
# Clean and Process the data
# Author: Burak H
#
require(dplyr); require(e1071)

clean_process <- function(train0, test0){
  # Save TARGET values and remove
  y <- train0$TARGET; train0$TARGET <- NULL

  # Find constant features and remove them from analysis
  for (i in names(train0)){
    if (length(unique(train0[[i]])) == 1 | length(unique(test0[[i]])) == 1){
      train0[[i]] <- NULL
      test0[[i]] <- NULL
    }
  }

  # Find duplicate columns and remove them
  dpl.cl <- data.frame(i = integer(), j = integer())

  for ( i in 2:(ncol(train0)-1) ){
    for ( j in (i+1):ncol(train0) ){
      if ( identical(train0[,i], train0[,j]) == TRUE ){
        dpl.cl <- rbind(dpl.cl, data.frame(i = i, j = j))
      }
    }
  }
  train0 <- train0[, -dpl.cl$j]
  test0 <- test0[, -dpl.cl$j]

  # Save IDs and remove them
  train.id <- train0$ID; test.id <- test0$ID
  train0$ID <- NULL; test0$ID <- NULL

  # Deskewing function: apply log(1 - min + x) to a column only when it reduces skewness
  dsk <- function(df.tr, df.ts, threshold = 1.00){
    #
    df <- rbind(df.tr, df.ts) # Bind the two frames
    #
    for (j in 1:ncol(df)){
      c.min <- min(df[[j]]) #c.max <- max(df[[j]])
      t <- log(1 - c.min + df[[j]])
      if ( abs(skewness(df[[j]])) > threshold * abs(skewness(t)) ){
        df[[j]] <- t
      }
    }
    # Separate the data frames back into original
    isep = c(rep(1, nrow(df.tr)), rep(-1, nrow(df.ts)))
    df <- mutate(df, isep = factor(isep))
    list(filter(df, isep == "1"), filter(df, isep == "-1")) # Return train and test separately
  }

  retfn <- dsk(train0, test0, threshold = 0.5)
  train0 <- retfn[[1]]; test0 <- retfn[[2]]
  train0$isep <- NULL; test0$isep <- NULL

  # Return the processed data and the TARGET values
  list(y, train0, test0, test.id)
}
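
# Usage sketch (this mirrors how the other scripts in this commit load it, assuming
# train0/test0 are the raw data frames read from train.csv/test.csv):
# fun.clean <- dget("clean_process.R")
# cleaned <- fun.clean(train0, test0)
# y <- cleaned[[1]]; train0 <- cleaned[[2]]; test0 <- cleaned[[3]]; test0.id <- cleaned[[4]]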

nnet_fulldat.R
@@ -0,0 +1,98 @@
# Santander data: nnet on full data
# Author: Burak H
#
library(dplyr); library(caret); library(nnet); library(e1071)
library(Matrix)
setwd("~/Works/Rworkspace/Santender/")
#setwd("~/Coursera/projects/santender/")

# Read data
train0 <- read.csv("train.csv")
test0 <- read.csv("test.csv")

# Get the cleaned and processed data
setwd("~/Works/Rworkspace/Santender/final/")
fun.clean <- dget("clean_process.R")
cleaned <- fun.clean(train0,test0)
y <- cleaned[[1]]; train0 <- cleaned[[2]]; test0 <- cleaned[[3]]
test0.id <- cleaned[[4]]
rm(cleaned)

train0$TARGET <- y # Put TARGET back

# Create training and validation sets for cross-validation
set.seed(101)
inTrain <- createDataPartition(y = train0$TARGET, p = 0.7, list = FALSE)
training <- train0[inTrain, ]; testing <- train0[-inTrain, ]

# -------------------------#
# ------ Train NNET ------ #
# ------------------------ #

# Let us determine the decay and hidden_layer_size by cross-validation using caret
input_layer_size = ncol(training)-1   # Input layer (-1 for label column)
output_layer_size = 2                 # Number of classes
max_hidden_layer_size = 15            # Maximum value of hidden layer size

# Maximum number of weights
N.weights = (input_layer_size+1)*max_hidden_layer_size + (max_hidden_layer_size+1)*output_layer_size
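
# i.e. (inputs + 1 bias) weights into each hidden unit, plus (hidden + 1 bias) weights
# into each output unit; this is only an upper bound passed to MaxNWts below so that
# nnet does not refuse to allocate the larger networks in the tuning grid.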

# # --- CAUTION: This takes days!!! --- #
#
# # Grid
# nnet.grid <- expand.grid(decay = c(0, 1, 2, 4), size = c(5, 10, 50))
#
# # Train control
# ctrl <- trainControl(method = "cv", number = 5, classProbs = TRUE, summaryFunction = twoClassSummary,
#                      verboseIter = TRUE)
#
# # Model training
# mod.nnet <- train(x = select(training, -TARGET), y = ifelse(training$TARGET == "1", "y", "n"),
#                   method = "nnet", trControl = ctrl, metric = "ROC", tuneGrid = nnet.grid,
#                   MaxNWts = N.weights , maxit = 1000)
#
# # After training, save the model
# save(mod.nnet, file = "nnet_final.Rdata")
#
# # ---END CAUTION ---- #

# Since we have learned that size = 5 and decay = 2 works for the reduced data, let's fix size = c(4, 5)
# and try to tune decay in the full data

# Grid
#nnet.grid <- expand.grid(decay = c(1, 2, 4), size = c(4,5))
nnet.grid <- expand.grid(decay = c(2,4), size = c(5, 10, 15))

# Train control
ctrl <- trainControl(method = "cv", number = 5, classProbs = TRUE, summaryFunction = twoClassSummary,
                     verboseIter = TRUE)

# Model training
mod.nnet <- train(x = select(training, -TARGET), y = ifelse(training$TARGET == "1", "y", "n"),
                  method = "nnet", trControl = ctrl, metric = "ROC", tuneGrid = nnet.grid,
                  MaxNWts = N.weights , maxit = 1000)
# Size 5, decay 2 is bestTune; try smaller size too

# --------- #

# Neural network structure
input_layer_size = ncol(training)-1  # Input layer (-1 for label column)
hidden_layer_size = 5                # Hidden layer size, from bestTune
output_layer_size = 2                # Number of classes
dec = 2.0                            # From bestTune

# # Number of weights
# N.weights = (input_layer_size+1)*hidden_layer_size + (hidden_layer_size+1)*output_layer_size
#
# # Need to determine hidden_layer_size and decay by CV
# mod.nnet <- nnet(TARGET ~., data = mutate(training, TARGET = factor(TARGET)), softmax = FALSE,
#                  size = hidden_layer_size, MaxNWts = N.weights , maxit = 1000, decay = 1.0)

# Predict class probabilities on the held-out split (drop TARGET from the predictors)
pred.nnet <- predict(mod.nnet, newdata = select(testing, -TARGET), type = "prob")[, "y"]
#pred.class <- as.numeric(ifelse(pred.nnet < 0.5, 0, 1))
#cm.nnet <- confusionMatrix(testing$TARGET, pred.class)

# ROC
library(pROC)
ROC <- roc(response = testing$TARGET, predictor = as.numeric(pred.nnet))
plot(ROC)
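
# Numeric AUC for the validation split (pROC::auc), for comparison with XGBoost
auc(ROC)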

rf_fulldat.R
@@ -0,0 +1,65 @@
# Santander data (Random Forest CV full data)
# vfull_dat
# Author: Burak H
#
require(dplyr)
require(caret)
require(randomForest)
require(e1071)
setwd("~/Works/Rworkspace/Santender/")

# Read data
train0 <- read.csv("train.csv")
test0 <- read.csv("test.csv")

# Get the cleaned and processed data
setwd("~/Works/Rworkspace/Santender/final/")
fun.clean <- dget("clean_process.R")
cleaned <- fun.clean(train0,test0)
y <- cleaned[[1]]; train0 <- cleaned[[2]]; test0 <- cleaned[[3]]
test0.id <- cleaned[[4]]
rm(cleaned)

train0$TARGET <- y # Put TARGET back

# Create training and validation sets for cross-validation
set.seed(101)
inTrain <- createDataPartition(y = train0$TARGET, p = 0.7, list = FALSE)
training <- train0[inTrain, ]; validation <- train0[-inTrain, ]

# For RF, we pretty much need a balanced set for training
nmin <- sum(training$TARGET == "1")
nmax <- sum(training$TARGET == "0")
nfolds <- floor(nmax/nmin) # Number of data separations

# Folds to use
folds <- createFolds(1:nmax, k = nfolds)

# All 0's and all 1's data
temp.0 <- filter(training, TARGET == "0")
temp.1 <- filter(training, TARGET == "1")

# Balanced data used for predictions
tr.bal <- rbind(temp.1, temp.0[folds[[1]], ])
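
# tr.bal now holds every TARGET == "1" row plus one fold (roughly 1/nfolds) of the
# TARGET == "0" rows, giving an approximately 50/50 class mix; only folds[[1]]
# is used in this script.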

# Training with Caret
rf_grid <- expand.grid(mtry = c(10, 20, 30, 40, 50, 60, 100))

ctrl <- trainControl(method = "cv", number = 10, classProbs = TRUE, summaryFunction = twoClassSummary,
                     verboseIter = TRUE)

mod.rf <- train(x = select(tr.bal, -TARGET), y = ifelse(tr.bal$TARGET == "1", "y", "n"),
                method = "rf", trControl = ctrl, metric = "ROC", tuneGrid = rf_grid, ntree = 1000)

# mtry = 40 was selected in the first round of RF tuning
# Second round of CV with a finer grid around it
# Training with Caret
rf_grid <- expand.grid(mtry = c(32, 34, 36, 38, 40, 42, 44, 46, 48))

ctrl <- trainControl(method = "cv", number = 10, classProbs = TRUE, summaryFunction = twoClassSummary,
                     verboseIter = TRUE)
set.seed(101)
mod.rf <- train(x = select(tr.bal, -TARGET), y = ifelse(tr.bal$TARGET == "1", "y", "n"),
                method = "rf", trControl = ctrl, metric = "ROC", tuneGrid = rf_grid, ntree = 1000)

# In this round, mtry = 38 was selected.