diff --git a/RedHat/Readme.md b/RedHat/Readme.md new file mode 100644 index 0000000..24d63e6 --- /dev/null +++ b/RedHat/Readme.md @@ -0,0 +1 @@ +Data can be downloaded from [Kaggle](https://www.kaggle.com/c/predicting-red-hat-business-value) diff --git a/RedHat/exercise_sparseModelMatrix.R b/RedHat/exercise_sparseModelMatrix.R new file mode 100644 index 0000000..ed1341a --- /dev/null +++ b/RedHat/exercise_sparseModelMatrix.R @@ -0,0 +1,30 @@ +# Burak Himmetoglu +# +# Exercise: Sparse Model Matrices +# +require(Matrix) +dd <- data.frame(a = gl(3,4), b = gl(4,1,12)) +str(dd) +mm <- model.matrix(~.-1, dd) # no bias term +dd$c = gl(3,2) +dd$c[sample(12,4)] <- NA # Add NA's +mm <- model.matrix(~.-1, dd) # This removes rows with NA + +# Same thing for sparse.model.matrix +mm.sparse <- sparse.model.matrix(~.-1, dd) + +# Suppose we add a numeric value +dd$d = rnorm(12) +mm <- model.matrix(~.-1, dd) # Works just fine +mm <- sparse.model.matrix(~.-1, dd) # Works just fine too + +# Suppose we have a character column, that we want to use as factor eventually +types <- c("type_1", "type_2", "type_3") +dd$e <- rep(types,4); str(dd) + +# Notice that there is an issue with _ +#colnames(dd) <- c("col_1", "col_2", "col_3", "col_4", "col_5) +colnames(dd) <- c("c1", "c2", "c3", "c4", "c5") + +mm <- model.matrix(~.-1, dd) # Does the conversion automatically +mm.sparse <- sparse.model.matrix(~.-1, dd) # Same.. diff --git a/RedHat/explore_v.1.0.R b/RedHat/explore_v.1.0.R new file mode 100644 index 0000000..b2a8108 --- /dev/null +++ b/RedHat/explore_v.1.0.R @@ -0,0 +1,110 @@ +# Burak Himmetoglu +# begin: 08-04-2016 +# +# RedHat: Simple exploration +# Inspired from an exploratory analysis from Kaggle +# +# Libraries +options(stringsAsFactors = FALSE) +require(dplyr) +require(tidyr) +require(ggplot2) + +# Read train/test data for actvities +setwd("~/Works/Rworkspace/RedHat") +# +train <- read.csv("./data/act_train.csv") +test <- read.csv("./data/act_test.csv") + +# People data +people <- read.csv("./data/people.csv") + +# Total number of outcomes +train %>% group_by(outcome) %>% summarize(number = n()) + +# Total number of outcomes per activity category +train %>% group_by(activity_category,outcome) %>% summarise(number = n()) +# Easier to see on plot +gg0 <- ggplot(train,aes(x = outcome, fill = activity_category)) + geom_bar(width = 0.6, position = "fill") +gg0 + +# Let us look at char_5 +cat("Unique entries in char_5:", unique(train$char_5), "\n") +gg1 <- ggplot(train, aes(x = outcome, fill = char_5)) + geom_bar(width = 0.6,position = "fill") +gg1 # Majority is empty + +# Another way +gg2 <- train %>% count(char_5) %>% ggplot(aes (x = reorder(char_5,n), y = n)) + geom_bar(stat = "identity", fill = "light blue") + + coord_flip() + ggtitle("Count of char_5") +gg2 + +# Char_10 seem to have many factor variables +gg3 <- train %>% count(char_10, sort = TRUE) %>% filter(n > 8000) %>% + ggplot(aes (x = reorder(char_10,n), y = n)) + geom_bar(stat = "identity") + + coord_flip() + ggtitle("Count of char_10") +gg3 + +# Cumulative sum +count_char_10 <- train %>% count(char_10, sort = TRUE) +plot(cumsum(count_char_10$n[1:1000])/sum(count_char_10$n), + type = "b", pch = ".", main = "Cumulative Percent by types of char_10", ylab = "cumulative percent") + +# Popular types in char_10 (first 15 in count_char_10) +popular <- count_char_10$char_10[1:15] +gg4 <- train %>% filter(char_10 %in% popular) %>% ggplot(aes(x = outcome, fill = char_10)) + + geom_bar(width = 0.6, position = "fill") + ggtitle("Outcome by char_10") +gg4 +# Another way +gg5 <- train %>% filter(char_10 %in% popular) %>% ggplot(aes(x = char_10, fill = char_10)) + geom_bar() + coord_flip() + + facet_grid(~ outcome) + ggtitle("Outcome by char_10") +gg5 + +# People +gg6 <- train %>% count(people_id, sort = TRUE) %>% ggplot(aes(x = n)) + geom_histogram() +gg6 # A small number of people with a large number of activities + +people_count <- train %>% count(people_id, sort = TRUE) +c(median(people_count$n), mean(people_count$n)) # Very different mean and median also reflects the above fact + +# Let is look at first 10 people with most activities +people_count_10 <- people_count$people_id[1:10] +gg7 <- train %>% filter(people_id %in% people_count_10) %>% ggplot(aes(x = outcome, fill = people_id)) + + geom_bar(width = 0.6, position = "fill") + ggtitle("Outcome by top10 people") +gg7 +# A different way to look +gg8 <- train %>% filter(people_id %in% people_count_10) %>% ggplot(aes(x = people_id)) + + geom_bar() + facet_grid(~ outcome) + coord_flip() + ggtitle("Outcome by top10 people") +gg8 + +# Proportion of positive outcomes by people_id +gg9 <- train %>% filter(people_id %in% people_count$people_id[1:10000]) %>% group_by(people_id) %>% + summarize(count = n(), outcome_pos = sum(outcome)) %>% mutate(frac_pos = outcome_pos/count) %>% + ggplot(aes(x = frac_pos)) + geom_histogram() + + ggtitle("Fraction of Positive Outcomes by person") +gg9 # Most people have entirely positive or negative outcomes regardless of their actions. + +# No intersection between people_id's in train and test!!! +intersect(test$people_id, train$people_id) + +##### Merging with people ##### +train <- merge(train, people, by = "people_id", all.x = TRUE) +# Notice that char_1 -- char_10 of activities are different from char_1 -- char_38 of people + +# Group count +train %>% count(group_1, sort = TRUE) -> group_count + +# Char_38 (Only numeric value) +gg10 <- train %>% ggplot(aes(x = char_38)) + geom_histogram() + ggtitle("Distribution of char_38") +gg10 + +# By outcome +gg11 <- train %>% ggplot(aes(x = char_38, fill = as.factor(outcome))) + geom_histogram() + facet_wrap(~ outcome) +gg11 # Small values of char_38 is related to negative outcomes + +# Another view +gg12 <- train %>% + ggplot(aes(x = char_38, fill = as.factor(outcome))) + + geom_histogram(position = "fill") + + scale_fill_brewer(palette = "Set1") + + ggtitle("Proportion of outcomes by values of char_38") +gg12 diff --git a/RedHat/explore_v.2.0.R b/RedHat/explore_v.2.0.R new file mode 100644 index 0000000..f326498 --- /dev/null +++ b/RedHat/explore_v.2.0.R @@ -0,0 +1,131 @@ +# Burak Himmetoglu +# begin: 08-04-2016 +# +# RedHat: Simple data generation for analysis +# +# Debug flag +debugFlag = TRUE + +# Libraries +options(stringsAsFactors = FALSE) +require(data.table) + +# Read train/test data for actvities +setwd("~/Works/Rworkspace/RedHat") +train <- fread("./data/act_train.csv") +test <- fread("./data/act_test.csv") + +# Combine train and test into one data +train[,is.train := 1]; test[,c("is.train","outcome"):= list(0,NA)] +activity <- rbind(train,test) +rm(train,test); gc() + +# People data (change TRUE & FALSE to 1 and 0) +people <- fread("./data/people.csv") +cols <- paste0("char_", 10:37) +people[, (cols) := lapply(.SD,as.integer) ,.SDcols = cols] + +# Rename the columns to prevent confusion between activity chars and people chars +# activity +cols <- colnames(activity)[5:14] +paste_activity <- function(c) { paste0("act:", c) } +cols <- sapply(cols, paste_activity) +colnames(activity)[5:14] <- cols + +# people +cols <- colnames(people)[c(2,4,6:41)] +paste_people <- function(c){ paste0("ppl:", c) } +cols <- sapply(cols, paste_people) +colnames(people)[c(2,4,6:41)] <- cols + +# Clean +rm(cols,paste_people, paste_activity); gc() + +##### Merge activity and people ##### +# Drop date from each +activity[, date := NULL]; people[, date := NULL] + +# Merge +act_ppl <- merge(activity,people,by="people_id", all.x = TRUE) +rm(activity,people); gc() + +# Drop people_id, since we don't need it anymore +act_ppl[, people_id := NULL] + +# Check for empty entries +if (debugFlag){ + # A function for testing empty entries + is_empty <- function(c){ c == ""} + + # Number of empty entries in act:char_1 to act:char_10 + act.cols <- colnames(act_ppl)[3:12] + tmp <- act_ppl[, lapply(.SD, is_empty), .SDcols = act.cols] + n_empty.act <- tmp[, lapply(.SD, sum), .SDcols = act.cols] + rm(tmp); gc() + + # Number of empty entries in ppl:char_1 to ppl:char_9 + ppl.cols <- colnames(act_ppl)[15:53] + tmp <- act_ppl[, lapply(.SD, is_empty), .SDcols = ppl.cols] + n_empty.ppl <- tmp[, lapply(.SD, sum), .SDcols = ppl.cols] # n_empty.ppl is all zeros!! + rm(tmp); gc() +} # There are empty entries in act:char's but no empty entries in ppl:char's + +# Maybe, one can replace empty levels with type_0 +act.cols <- colnames(act_ppl)[3:12] +replace_empty <- function(c){ c[c == ""] = "type 0"; c } +act_ppl[, (act.cols) := lapply(.SD, replace_empty), .SDcols = act.cols] + +## Now, we can create a model matrix +# Split back to train and test +X <- act_ppl[, -(c("is.train", "outcome","activity_id")), with = FALSE] +id.train <- act_ppl$is.train == 1 +y.train <- act_ppl$outcome[id.train] +act_id <- act_ppl$activity_id + +require(Matrix) + +# Rename colums, get rid of "_" and "_" (sparse.model.matrix does not like such column names) +repUnderScore <- function(c){ c <- gsub("_","",c); c <- gsub(":","",c); c } +colnames(X) <- repUnderScore(colnames(X)) + +# Create the sparse.model.matrix +Xmm <- sparse.model.matrix(~.-1, X) # Warnings are OK, just conversion from char to factor +warnings() + +# Test XGBoost +require(xgboost) +dtrain <- xgb.DMatrix(Xmm[id.train,],label = y.train) +param <- list(booster="gblinear", + objective="binary:logistic", + eval_metric="auc", + eta=0.03) + # max_depth = 5, + # subsample = 0.7, + # colsample_bytree = 0.3, + # min_child_weight = , + # gamma = 1, + # alpha = 0.0) + +set.seed(1011) +watchlist <- list(train=dtrain) +mod.xgb_cv <- xgb.cv(params=param, + data=dtrain, + nrounds=500, + watchlist=watchlist, + nfold=5, + early.stop.round=3) + +# Predict on test set +dtest = xgb.DMatrix(Xmm[!id.train,]) +mod.xgb <- xgboost(params=param, + data=dtrain, + nrounds=173, + watchlist=watchlist, + early.stop.round=3) + +pred.test <- predict(mod.xgb, newdata = dtest) +act_id.test <- act_id[!id.train] + +# Final and submission +final <- data.frame(activity_id = act_id.test, outcome = pred.test) +write.csv(final,"xgb_trial.csv", row.names = FALSE)