forked from bhimmetoglu/kaggle_101
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
385fe36
commit b5fe01c
Showing
4 changed files
with
272 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
Data can be downloaded from [Kaggle](https://www.kaggle.com/c/predicting-red-hat-business-value) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
# Burak Himmetoglu | ||
# | ||
# Exercise: Sparse Model Matrices | ||
# | ||
require(Matrix) | ||
dd <- data.frame(a = gl(3,4), b = gl(4,1,12)) | ||
str(dd) | ||
mm <- model.matrix(~.-1, dd) # no bias term | ||
dd$c = gl(3,2) | ||
dd$c[sample(12,4)] <- NA # Add NA's | ||
mm <- model.matrix(~.-1, dd) # This removes rows with NA | ||
|
||
# Same thing for sparse.model.matrix | ||
mm.sparse <- sparse.model.matrix(~.-1, dd) | ||
|
||
# Suppose we add a numeric value | ||
dd$d = rnorm(12) | ||
mm <- model.matrix(~.-1, dd) # Works just fine | ||
mm <- sparse.model.matrix(~.-1, dd) # Works just fine too | ||
|
||
# Suppose we have a character column, that we want to use as factor eventually | ||
types <- c("type_1", "type_2", "type_3") | ||
dd$e <- rep(types,4); str(dd) | ||
|
||
# Notice that there is an issue with _ | ||
#colnames(dd) <- c("col_1", "col_2", "col_3", "col_4", "col_5) | ||
colnames(dd) <- c("c1", "c2", "c3", "c4", "c5") | ||
|
||
mm <- model.matrix(~.-1, dd) # Does the conversion automatically | ||
mm.sparse <- sparse.model.matrix(~.-1, dd) # Same.. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
# Burak Himmetoglu | ||
# begin: 08-04-2016 | ||
# | ||
# RedHat: Simple exploration | ||
# Inspired from an exploratory analysis from Kaggle | ||
# | ||
# Libraries | ||
options(stringsAsFactors = FALSE) | ||
require(dplyr) | ||
require(tidyr) | ||
require(ggplot2) | ||
|
||
# Read train/test data for actvities | ||
setwd("~/Works/Rworkspace/RedHat") | ||
# | ||
train <- read.csv("./data/act_train.csv") | ||
test <- read.csv("./data/act_test.csv") | ||
|
||
# People data | ||
people <- read.csv("./data/people.csv") | ||
|
||
# Total number of outcomes | ||
train %>% group_by(outcome) %>% summarize(number = n()) | ||
|
||
# Total number of outcomes per activity category | ||
train %>% group_by(activity_category,outcome) %>% summarise(number = n()) | ||
# Easier to see on plot | ||
gg0 <- ggplot(train,aes(x = outcome, fill = activity_category)) + geom_bar(width = 0.6, position = "fill") | ||
gg0 | ||
|
||
# Let us look at char_5 | ||
cat("Unique entries in char_5:", unique(train$char_5), "\n") | ||
gg1 <- ggplot(train, aes(x = outcome, fill = char_5)) + geom_bar(width = 0.6,position = "fill") | ||
gg1 # Majority is empty | ||
|
||
# Another way | ||
gg2 <- train %>% count(char_5) %>% ggplot(aes (x = reorder(char_5,n), y = n)) + geom_bar(stat = "identity", fill = "light blue") + | ||
coord_flip() + ggtitle("Count of char_5") | ||
gg2 | ||
|
||
# Char_10 seem to have many factor variables | ||
gg3 <- train %>% count(char_10, sort = TRUE) %>% filter(n > 8000) %>% | ||
ggplot(aes (x = reorder(char_10,n), y = n)) + geom_bar(stat = "identity") + | ||
coord_flip() + ggtitle("Count of char_10") | ||
gg3 | ||
|
||
# Cumulative sum | ||
count_char_10 <- train %>% count(char_10, sort = TRUE) | ||
plot(cumsum(count_char_10$n[1:1000])/sum(count_char_10$n), | ||
type = "b", pch = ".", main = "Cumulative Percent by types of char_10", ylab = "cumulative percent") | ||
|
||
# Popular types in char_10 (first 15 in count_char_10) | ||
popular <- count_char_10$char_10[1:15] | ||
gg4 <- train %>% filter(char_10 %in% popular) %>% ggplot(aes(x = outcome, fill = char_10)) + | ||
geom_bar(width = 0.6, position = "fill") + ggtitle("Outcome by char_10") | ||
gg4 | ||
# Another way | ||
gg5 <- train %>% filter(char_10 %in% popular) %>% ggplot(aes(x = char_10, fill = char_10)) + geom_bar() + coord_flip() + | ||
facet_grid(~ outcome) + ggtitle("Outcome by char_10") | ||
gg5 | ||
|
||
# People | ||
gg6 <- train %>% count(people_id, sort = TRUE) %>% ggplot(aes(x = n)) + geom_histogram() | ||
gg6 # A small number of people with a large number of activities | ||
|
||
people_count <- train %>% count(people_id, sort = TRUE) | ||
c(median(people_count$n), mean(people_count$n)) # Very different mean and median also reflects the above fact | ||
|
||
# Let is look at first 10 people with most activities | ||
people_count_10 <- people_count$people_id[1:10] | ||
gg7 <- train %>% filter(people_id %in% people_count_10) %>% ggplot(aes(x = outcome, fill = people_id)) + | ||
geom_bar(width = 0.6, position = "fill") + ggtitle("Outcome by top10 people") | ||
gg7 | ||
# A different way to look | ||
gg8 <- train %>% filter(people_id %in% people_count_10) %>% ggplot(aes(x = people_id)) + | ||
geom_bar() + facet_grid(~ outcome) + coord_flip() + ggtitle("Outcome by top10 people") | ||
gg8 | ||
|
||
# Proportion of positive outcomes by people_id | ||
gg9 <- train %>% filter(people_id %in% people_count$people_id[1:10000]) %>% group_by(people_id) %>% | ||
summarize(count = n(), outcome_pos = sum(outcome)) %>% mutate(frac_pos = outcome_pos/count) %>% | ||
ggplot(aes(x = frac_pos)) + geom_histogram() + | ||
ggtitle("Fraction of Positive Outcomes by person") | ||
gg9 # Most people have entirely positive or negative outcomes regardless of their actions. | ||
|
||
# No intersection between people_id's in train and test!!! | ||
intersect(test$people_id, train$people_id) | ||
|
||
##### Merging with people ##### | ||
train <- merge(train, people, by = "people_id", all.x = TRUE) | ||
# Notice that char_1 -- char_10 of activities are different from char_1 -- char_38 of people | ||
|
||
# Group count | ||
train %>% count(group_1, sort = TRUE) -> group_count | ||
|
||
# Char_38 (Only numeric value) | ||
gg10 <- train %>% ggplot(aes(x = char_38)) + geom_histogram() + ggtitle("Distribution of char_38") | ||
gg10 | ||
|
||
# By outcome | ||
gg11 <- train %>% ggplot(aes(x = char_38, fill = as.factor(outcome))) + geom_histogram() + facet_wrap(~ outcome) | ||
gg11 # Small values of char_38 is related to negative outcomes | ||
|
||
# Another view | ||
gg12 <- train %>% | ||
ggplot(aes(x = char_38, fill = as.factor(outcome))) + | ||
geom_histogram(position = "fill") + | ||
scale_fill_brewer(palette = "Set1") + | ||
ggtitle("Proportion of outcomes by values of char_38") | ||
gg12 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,131 @@ | ||
# Burak Himmetoglu | ||
# begin: 08-04-2016 | ||
# | ||
# RedHat: Simple data generation for analysis | ||
# | ||
# Debug flag | ||
debugFlag = TRUE | ||
|
||
# Libraries | ||
options(stringsAsFactors = FALSE) | ||
require(data.table) | ||
|
||
# Read train/test data for actvities | ||
setwd("~/Works/Rworkspace/RedHat") | ||
train <- fread("./data/act_train.csv") | ||
test <- fread("./data/act_test.csv") | ||
|
||
# Combine train and test into one data | ||
train[,is.train := 1]; test[,c("is.train","outcome"):= list(0,NA)] | ||
activity <- rbind(train,test) | ||
rm(train,test); gc() | ||
|
||
# People data (change TRUE & FALSE to 1 and 0) | ||
people <- fread("./data/people.csv") | ||
cols <- paste0("char_", 10:37) | ||
people[, (cols) := lapply(.SD,as.integer) ,.SDcols = cols] | ||
|
||
# Rename the columns to prevent confusion between activity chars and people chars | ||
# activity | ||
cols <- colnames(activity)[5:14] | ||
paste_activity <- function(c) { paste0("act:", c) } | ||
cols <- sapply(cols, paste_activity) | ||
colnames(activity)[5:14] <- cols | ||
|
||
# people | ||
cols <- colnames(people)[c(2,4,6:41)] | ||
paste_people <- function(c){ paste0("ppl:", c) } | ||
cols <- sapply(cols, paste_people) | ||
colnames(people)[c(2,4,6:41)] <- cols | ||
|
||
# Clean | ||
rm(cols,paste_people, paste_activity); gc() | ||
|
||
##### Merge activity and people ##### | ||
# Drop date from each | ||
activity[, date := NULL]; people[, date := NULL] | ||
|
||
# Merge | ||
act_ppl <- merge(activity,people,by="people_id", all.x = TRUE) | ||
rm(activity,people); gc() | ||
|
||
# Drop people_id, since we don't need it anymore | ||
act_ppl[, people_id := NULL] | ||
|
||
# Check for empty entries | ||
if (debugFlag){ | ||
# A function for testing empty entries | ||
is_empty <- function(c){ c == ""} | ||
|
||
# Number of empty entries in act:char_1 to act:char_10 | ||
act.cols <- colnames(act_ppl)[3:12] | ||
tmp <- act_ppl[, lapply(.SD, is_empty), .SDcols = act.cols] | ||
n_empty.act <- tmp[, lapply(.SD, sum), .SDcols = act.cols] | ||
rm(tmp); gc() | ||
|
||
# Number of empty entries in ppl:char_1 to ppl:char_9 | ||
ppl.cols <- colnames(act_ppl)[15:53] | ||
tmp <- act_ppl[, lapply(.SD, is_empty), .SDcols = ppl.cols] | ||
n_empty.ppl <- tmp[, lapply(.SD, sum), .SDcols = ppl.cols] # n_empty.ppl is all zeros!! | ||
rm(tmp); gc() | ||
} # There are empty entries in act:char's but no empty entries in ppl:char's | ||
|
||
# Maybe, one can replace empty levels with type_0 | ||
act.cols <- colnames(act_ppl)[3:12] | ||
replace_empty <- function(c){ c[c == ""] = "type 0"; c } | ||
act_ppl[, (act.cols) := lapply(.SD, replace_empty), .SDcols = act.cols] | ||
|
||
## Now, we can create a model matrix | ||
# Split back to train and test | ||
X <- act_ppl[, -(c("is.train", "outcome","activity_id")), with = FALSE] | ||
id.train <- act_ppl$is.train == 1 | ||
y.train <- act_ppl$outcome[id.train] | ||
act_id <- act_ppl$activity_id | ||
|
||
require(Matrix) | ||
|
||
# Rename colums, get rid of "_" and "_" (sparse.model.matrix does not like such column names) | ||
repUnderScore <- function(c){ c <- gsub("_","",c); c <- gsub(":","",c); c } | ||
colnames(X) <- repUnderScore(colnames(X)) | ||
|
||
# Create the sparse.model.matrix | ||
Xmm <- sparse.model.matrix(~.-1, X) # Warnings are OK, just conversion from char to factor | ||
warnings() | ||
|
||
# Test XGBoost | ||
require(xgboost) | ||
dtrain <- xgb.DMatrix(Xmm[id.train,],label = y.train) | ||
param <- list(booster="gblinear", | ||
objective="binary:logistic", | ||
eval_metric="auc", | ||
eta=0.03) | ||
# max_depth = 5, | ||
# subsample = 0.7, | ||
# colsample_bytree = 0.3, | ||
# min_child_weight = , | ||
# gamma = 1, | ||
# alpha = 0.0) | ||
|
||
set.seed(1011) | ||
watchlist <- list(train=dtrain) | ||
mod.xgb_cv <- xgb.cv(params=param, | ||
data=dtrain, | ||
nrounds=500, | ||
watchlist=watchlist, | ||
nfold=5, | ||
early.stop.round=3) | ||
|
||
# Predict on test set | ||
dtest = xgb.DMatrix(Xmm[!id.train,]) | ||
mod.xgb <- xgboost(params=param, | ||
data=dtrain, | ||
nrounds=173, | ||
watchlist=watchlist, | ||
early.stop.round=3) | ||
|
||
pred.test <- predict(mod.xgb, newdata = dtest) | ||
act_id.test <- act_id[!id.train] | ||
|
||
# Final and submission | ||
final <- data.frame(activity_id = act_id.test, outcome = pred.test) | ||
write.csv(final,"xgb_trial.csv", row.names = FALSE) |