Added RedHat

joshkyh · Oct 5, 2016 · b5fe01c · b5fe01c
1 parent 385fe36
commit b5fe01c
Show file tree

Hide file tree

Showing 4 changed files with 272 additions and 0 deletions.
diff --git a/RedHat/Readme.md b/RedHat/Readme.md
@@ -0,0 +1 @@
+Data can be downloaded from [Kaggle](https://www.kaggle.com/c/predicting-red-hat-business-value)
diff --git a/RedHat/exercise_sparseModelMatrix.R b/RedHat/exercise_sparseModelMatrix.R
@@ -0,0 +1,30 @@
+# Burak Himmetoglu
+# 
+# Exercise: Sparse Model Matrices
+#
+require(Matrix)
+dd <- data.frame(a = gl(3,4), b = gl(4,1,12))
+str(dd)
+mm <- model.matrix(~.-1, dd) # no bias term
+dd$c = gl(3,2)
+dd$c[sample(12,4)] <- NA # Add NA's
+mm <- model.matrix(~.-1, dd) # This removes rows with NA
+
+# Same thing for sparse.model.matrix
+mm.sparse <- sparse.model.matrix(~.-1, dd)
+
+# Suppose we add a numeric value
+dd$d = rnorm(12)
+mm <- model.matrix(~.-1, dd) # Works just fine
+mm <- sparse.model.matrix(~.-1, dd) # Works just fine too
+
+# Suppose we have a character column, that we want to use as factor eventually
+types <- c("type_1", "type_2", "type_3")
+dd$e <- rep(types,4); str(dd)
+
+# Notice that there is an issue with _
+#colnames(dd) <- c("col_1", "col_2", "col_3", "col_4", "col_5)
+colnames(dd) <- c("c1", "c2", "c3", "c4", "c5")
+
+mm <- model.matrix(~.-1, dd) # Does the conversion automatically
+mm.sparse <- sparse.model.matrix(~.-1, dd) # Same..
diff --git a/RedHat/explore_v.1.0.R b/RedHat/explore_v.1.0.R
@@ -0,0 +1,110 @@
+# Burak Himmetoglu
+# begin: 08-04-2016
+#
+# RedHat: Simple exploration
+# Inspired from an exploratory analysis from Kaggle
+#
+# Libraries
+options(stringsAsFactors = FALSE)
+require(dplyr)
+require(tidyr)
+require(ggplot2)
+
+# Read train/test data for actvities
+setwd("~/Works/Rworkspace/RedHat")
+#
+train <- read.csv("./data/act_train.csv")
+test <- read.csv("./data/act_test.csv")
+
+# People data
+people <- read.csv("./data/people.csv")
+
+# Total number of outcomes
+train %>% group_by(outcome) %>% summarize(number = n())
+
+# Total number of outcomes per activity category
+train %>% group_by(activity_category,outcome) %>% summarise(number = n())
+# Easier to see on plot
+gg0 <- ggplot(train,aes(x = outcome, fill = activity_category)) + geom_bar(width = 0.6, position = "fill")
+gg0
+
+# Let us look at char_5
+cat("Unique entries in char_5:", unique(train$char_5), "\n")
+gg1 <- ggplot(train, aes(x = outcome, fill = char_5)) + geom_bar(width = 0.6,position = "fill")
+gg1 # Majority is empty
+
+# Another way
+gg2 <- train %>% count(char_5) %>% ggplot(aes (x = reorder(char_5,n), y = n)) + geom_bar(stat = "identity", fill = "light blue") + 
+  coord_flip() + ggtitle("Count of char_5")
+gg2
+
+# Char_10 seem to have many factor variables
+gg3 <- train %>% count(char_10, sort = TRUE) %>% filter(n > 8000) %>%
+  ggplot(aes (x = reorder(char_10,n), y = n)) + geom_bar(stat = "identity") + 
+  coord_flip() + ggtitle("Count of char_10")
+gg3
+
+# Cumulative sum
+count_char_10 <- train %>% count(char_10, sort = TRUE)
+plot(cumsum(count_char_10$n[1:1000])/sum(count_char_10$n),
+     type = "b", pch = ".", main = "Cumulative Percent by types of char_10", ylab = "cumulative percent")
+
+# Popular types in char_10 (first 15 in count_char_10)
+popular <- count_char_10$char_10[1:15]
+gg4 <- train %>% filter(char_10 %in% popular) %>% ggplot(aes(x = outcome, fill = char_10)) + 
+  geom_bar(width = 0.6, position = "fill") + ggtitle("Outcome by char_10")
+gg4
+# Another way
+gg5 <- train %>% filter(char_10 %in% popular) %>% ggplot(aes(x = char_10, fill = char_10)) + geom_bar() + coord_flip() +
+  facet_grid(~ outcome) + ggtitle("Outcome by char_10")
+gg5
+
+# People
+gg6 <- train %>% count(people_id, sort = TRUE) %>% ggplot(aes(x = n)) + geom_histogram()
+gg6 # A small number of people with a large number of activities 
+
+people_count <- train %>% count(people_id, sort = TRUE)
+c(median(people_count$n), mean(people_count$n)) # Very different mean and median also reflects the above fact
+
+# Let is look at first 10 people with most activities
+people_count_10 <- people_count$people_id[1:10]
+gg7 <- train %>% filter(people_id %in% people_count_10) %>% ggplot(aes(x = outcome, fill = people_id)) + 
+  geom_bar(width = 0.6, position = "fill") + ggtitle("Outcome by top10 people")
+gg7
+# A different way to look
+gg8 <- train %>% filter(people_id %in% people_count_10) %>% ggplot(aes(x = people_id)) +
+  geom_bar() + facet_grid(~ outcome) + coord_flip() + ggtitle("Outcome by top10 people")
+gg8
+
+# Proportion of positive outcomes by people_id
+gg9 <- train %>% filter(people_id %in% people_count$people_id[1:10000]) %>% group_by(people_id) %>%
+  summarize(count = n(), outcome_pos = sum(outcome)) %>% mutate(frac_pos = outcome_pos/count) %>% 
+  ggplot(aes(x = frac_pos)) + geom_histogram() +
+  ggtitle("Fraction of Positive Outcomes by person")
+gg9 # Most people have entirely positive or negative outcomes regardless of their actions.
+
+# No intersection between people_id's in train and test!!!
+intersect(test$people_id, train$people_id)
+
+##### Merging with people #####
+train <- merge(train, people, by = "people_id", all.x = TRUE)
+# Notice that char_1 -- char_10 of activities are different from char_1 -- char_38 of people 
+
+# Group count
+train %>% count(group_1, sort = TRUE) -> group_count
+
+# Char_38 (Only numeric value)
+gg10 <- train %>% ggplot(aes(x = char_38)) + geom_histogram() + ggtitle("Distribution of char_38")
+gg10
+
+# By outcome
+gg11 <- train %>% ggplot(aes(x = char_38, fill = as.factor(outcome))) + geom_histogram() + facet_wrap(~ outcome)
+gg11 # Small values of char_38 is related to negative outcomes
+
+# Another view
+gg12 <- train %>% 
+  ggplot(aes(x = char_38, fill = as.factor(outcome))) +
+  geom_histogram(position = "fill") +
+  scale_fill_brewer(palette = "Set1") +
+  ggtitle("Proportion of outcomes by values of char_38")
+gg12
diff --git a/RedHat/explore_v.2.0.R b/RedHat/explore_v.2.0.R
@@ -0,0 +1,131 @@
+# Burak Himmetoglu
+# begin: 08-04-2016
+#
+# RedHat: Simple data generation for analysis
+#
+# Debug flag
+debugFlag = TRUE 
+
+# Libraries
+options(stringsAsFactors = FALSE)
+require(data.table)
+
+# Read train/test data for actvities
+setwd("~/Works/Rworkspace/RedHat")
+train <- fread("./data/act_train.csv")
+test <- fread("./data/act_test.csv")
+
+# Combine train and test into one data
+train[,is.train := 1]; test[,c("is.train","outcome"):= list(0,NA)]
+activity <- rbind(train,test)
+rm(train,test); gc()
+
+# People data (change TRUE & FALSE to 1 and 0)
+people <- fread("./data/people.csv")
+cols <- paste0("char_", 10:37)
+people[, (cols) := lapply(.SD,as.integer) ,.SDcols = cols]
+
+# Rename the columns to prevent confusion between activity chars and people chars
+# activity
+cols <- colnames(activity)[5:14]
+paste_activity <- function(c) { paste0("act:", c) }
+cols <- sapply(cols, paste_activity)
+colnames(activity)[5:14] <- cols
+
+# people
+cols <- colnames(people)[c(2,4,6:41)]
+paste_people <- function(c){ paste0("ppl:", c) }
+cols <- sapply(cols, paste_people)
+colnames(people)[c(2,4,6:41)] <- cols
+
+# Clean
+rm(cols,paste_people, paste_activity); gc()
+
+##### Merge activity and people #####
+# Drop date from each
+activity[, date := NULL]; people[, date := NULL]
+
+# Merge
+act_ppl <- merge(activity,people,by="people_id", all.x = TRUE)
+rm(activity,people); gc()
+
+# Drop people_id, since we don't need it anymore
+act_ppl[, people_id := NULL]
+
+# Check for empty entries
+if (debugFlag){
+  # A function for testing empty entries
+  is_empty <- function(c){ c == ""}
+
+  # Number of empty entries in act:char_1 to act:char_10
+  act.cols <- colnames(act_ppl)[3:12]
+  tmp <- act_ppl[, lapply(.SD, is_empty), .SDcols = act.cols]
+  n_empty.act <- tmp[, lapply(.SD, sum), .SDcols = act.cols]
+  rm(tmp); gc()
+
+  # Number of empty entries in ppl:char_1 to ppl:char_9
+  ppl.cols <- colnames(act_ppl)[15:53]
+  tmp <- act_ppl[, lapply(.SD, is_empty), .SDcols = ppl.cols]
+  n_empty.ppl <- tmp[, lapply(.SD, sum), .SDcols = ppl.cols] # n_empty.ppl is all zeros!!
+  rm(tmp); gc()
+} # There are empty entries in act:char's but no empty entries in ppl:char's
+
+# Maybe, one can replace empty levels with type_0
+act.cols <- colnames(act_ppl)[3:12]
+replace_empty <- function(c){ c[c == ""] = "type 0"; c }
+act_ppl[, (act.cols) := lapply(.SD, replace_empty), .SDcols = act.cols]
+
+## Now, we can create a model matrix
+# Split back to train and test
+X <- act_ppl[, -(c("is.train", "outcome","activity_id")), with = FALSE]
+id.train <- act_ppl$is.train == 1
+y.train <- act_ppl$outcome[id.train]
+act_id <- act_ppl$activity_id
+
+require(Matrix)
+
+# Rename colums, get rid of "_" and "_" (sparse.model.matrix does not like such column names)
+repUnderScore <- function(c){ c <- gsub("_","",c); c <- gsub(":","",c); c }
+colnames(X) <- repUnderScore(colnames(X))
+
+# Create the sparse.model.matrix
+Xmm <- sparse.model.matrix(~.-1, X) # Warnings are OK, just conversion from char to factor
+warnings()
+
+# Test XGBoost
+require(xgboost)
+dtrain <- xgb.DMatrix(Xmm[id.train,],label = y.train)
+param <- list(booster="gblinear",
+              objective="binary:logistic",
+              eval_metric="auc",
+              eta=0.03)
+              # max_depth = 5,
+              # subsample = 0.7,
+              # colsample_bytree = 0.3,
+              # min_child_weight = ,
+              # gamma = 1,
+              # alpha = 0.0)
+
+set.seed(1011)
+watchlist <- list(train=dtrain)
+mod.xgb_cv <- xgb.cv(params=param,
+                 data=dtrain,
+                 nrounds=500,
+                 watchlist=watchlist,
+                 nfold=5,
+                 early.stop.round=3)
+
+# Predict on test set
+dtest = xgb.DMatrix(Xmm[!id.train,])
+mod.xgb <- xgboost(params=param,
+                  data=dtrain,
+                  nrounds=173,
+                  watchlist=watchlist,
+                  early.stop.round=3)
+
+pred.test <- predict(mod.xgb, newdata = dtest)
+act_id.test <- act_id[!id.train]
+
+# Final and submission
+final <- data.frame(activity_id = act_id.test, outcome = pred.test)
+write.csv(final,"xgb_trial.csv", row.names = FALSE)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Data can be downloaded from [Kaggle](https://www.kaggle.com/c/predicting-red-hat-business-value)