Skip to content

Commit

Permalink
Added RedHat
Browse files Browse the repository at this point in the history
  • Loading branch information
bhimmetoglu committed Oct 5, 2016
1 parent 385fe36 commit b5fe01c
Show file tree
Hide file tree
Showing 4 changed files with 272 additions and 0 deletions.
1 change: 1 addition & 0 deletions RedHat/Readme.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Data can be downloaded from [Kaggle](https://www.kaggle.com/c/predicting-red-hat-business-value)
30 changes: 30 additions & 0 deletions RedHat/exercise_sparseModelMatrix.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Burak Himmetoglu
#
# Exercise: Sparse Model Matrices
#
require(Matrix)
dd <- data.frame(a = gl(3,4), b = gl(4,1,12))
str(dd)
mm <- model.matrix(~.-1, dd) # no bias term
dd$c = gl(3,2)
dd$c[sample(12,4)] <- NA # Add NA's
mm <- model.matrix(~.-1, dd) # This removes rows with NA

# Same thing for sparse.model.matrix
mm.sparse <- sparse.model.matrix(~.-1, dd)

# Suppose we add a numeric value
dd$d = rnorm(12)
mm <- model.matrix(~.-1, dd) # Works just fine
mm <- sparse.model.matrix(~.-1, dd) # Works just fine too

# Suppose we have a character column, that we want to use as factor eventually
types <- c("type_1", "type_2", "type_3")
dd$e <- rep(types,4); str(dd)

# Notice that there is an issue with _
#colnames(dd) <- c("col_1", "col_2", "col_3", "col_4", "col_5)
colnames(dd) <- c("c1", "c2", "c3", "c4", "c5")

mm <- model.matrix(~.-1, dd) # Does the conversion automatically
mm.sparse <- sparse.model.matrix(~.-1, dd) # Same..
110 changes: 110 additions & 0 deletions RedHat/explore_v.1.0.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
# Burak Himmetoglu
# begin: 08-04-2016
#
# RedHat: Simple exploration
# Inspired from an exploratory analysis from Kaggle
#
# Libraries
options(stringsAsFactors = FALSE)
require(dplyr)
require(tidyr)
require(ggplot2)

# Read train/test data for actvities
setwd("~/Works/Rworkspace/RedHat")
#
train <- read.csv("./data/act_train.csv")
test <- read.csv("./data/act_test.csv")

# People data
people <- read.csv("./data/people.csv")

# Total number of outcomes
train %>% group_by(outcome) %>% summarize(number = n())

# Total number of outcomes per activity category
train %>% group_by(activity_category,outcome) %>% summarise(number = n())
# Easier to see on plot
gg0 <- ggplot(train,aes(x = outcome, fill = activity_category)) + geom_bar(width = 0.6, position = "fill")
gg0

# Let us look at char_5
cat("Unique entries in char_5:", unique(train$char_5), "\n")
gg1 <- ggplot(train, aes(x = outcome, fill = char_5)) + geom_bar(width = 0.6,position = "fill")
gg1 # Majority is empty

# Another way
gg2 <- train %>% count(char_5) %>% ggplot(aes (x = reorder(char_5,n), y = n)) + geom_bar(stat = "identity", fill = "light blue") +
coord_flip() + ggtitle("Count of char_5")
gg2

# Char_10 seem to have many factor variables
gg3 <- train %>% count(char_10, sort = TRUE) %>% filter(n > 8000) %>%
ggplot(aes (x = reorder(char_10,n), y = n)) + geom_bar(stat = "identity") +
coord_flip() + ggtitle("Count of char_10")
gg3

# Cumulative sum
count_char_10 <- train %>% count(char_10, sort = TRUE)
plot(cumsum(count_char_10$n[1:1000])/sum(count_char_10$n),
type = "b", pch = ".", main = "Cumulative Percent by types of char_10", ylab = "cumulative percent")

# Popular types in char_10 (first 15 in count_char_10)
popular <- count_char_10$char_10[1:15]
gg4 <- train %>% filter(char_10 %in% popular) %>% ggplot(aes(x = outcome, fill = char_10)) +
geom_bar(width = 0.6, position = "fill") + ggtitle("Outcome by char_10")
gg4
# Another way
gg5 <- train %>% filter(char_10 %in% popular) %>% ggplot(aes(x = char_10, fill = char_10)) + geom_bar() + coord_flip() +
facet_grid(~ outcome) + ggtitle("Outcome by char_10")
gg5

# People
gg6 <- train %>% count(people_id, sort = TRUE) %>% ggplot(aes(x = n)) + geom_histogram()
gg6 # A small number of people with a large number of activities

people_count <- train %>% count(people_id, sort = TRUE)
c(median(people_count$n), mean(people_count$n)) # Very different mean and median also reflects the above fact

# Let is look at first 10 people with most activities
people_count_10 <- people_count$people_id[1:10]
gg7 <- train %>% filter(people_id %in% people_count_10) %>% ggplot(aes(x = outcome, fill = people_id)) +
geom_bar(width = 0.6, position = "fill") + ggtitle("Outcome by top10 people")
gg7
# A different way to look
gg8 <- train %>% filter(people_id %in% people_count_10) %>% ggplot(aes(x = people_id)) +
geom_bar() + facet_grid(~ outcome) + coord_flip() + ggtitle("Outcome by top10 people")
gg8

# Proportion of positive outcomes by people_id
gg9 <- train %>% filter(people_id %in% people_count$people_id[1:10000]) %>% group_by(people_id) %>%
summarize(count = n(), outcome_pos = sum(outcome)) %>% mutate(frac_pos = outcome_pos/count) %>%
ggplot(aes(x = frac_pos)) + geom_histogram() +
ggtitle("Fraction of Positive Outcomes by person")
gg9 # Most people have entirely positive or negative outcomes regardless of their actions.

# No intersection between people_id's in train and test!!!
intersect(test$people_id, train$people_id)

##### Merging with people #####
train <- merge(train, people, by = "people_id", all.x = TRUE)
# Notice that char_1 -- char_10 of activities are different from char_1 -- char_38 of people

# Group count
train %>% count(group_1, sort = TRUE) -> group_count

# Char_38 (Only numeric value)
gg10 <- train %>% ggplot(aes(x = char_38)) + geom_histogram() + ggtitle("Distribution of char_38")
gg10

# By outcome
gg11 <- train %>% ggplot(aes(x = char_38, fill = as.factor(outcome))) + geom_histogram() + facet_wrap(~ outcome)
gg11 # Small values of char_38 is related to negative outcomes

# Another view
gg12 <- train %>%
ggplot(aes(x = char_38, fill = as.factor(outcome))) +
geom_histogram(position = "fill") +
scale_fill_brewer(palette = "Set1") +
ggtitle("Proportion of outcomes by values of char_38")
gg12
131 changes: 131 additions & 0 deletions RedHat/explore_v.2.0.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
# Burak Himmetoglu
# begin: 08-04-2016
#
# RedHat: Simple data generation for analysis
#
# Debug flag
debugFlag = TRUE

# Libraries
options(stringsAsFactors = FALSE)
require(data.table)

# Read train/test data for actvities
setwd("~/Works/Rworkspace/RedHat")
train <- fread("./data/act_train.csv")
test <- fread("./data/act_test.csv")

# Combine train and test into one data
train[,is.train := 1]; test[,c("is.train","outcome"):= list(0,NA)]
activity <- rbind(train,test)
rm(train,test); gc()

# People data (change TRUE & FALSE to 1 and 0)
people <- fread("./data/people.csv")
cols <- paste0("char_", 10:37)
people[, (cols) := lapply(.SD,as.integer) ,.SDcols = cols]

# Rename the columns to prevent confusion between activity chars and people chars
# activity
cols <- colnames(activity)[5:14]
paste_activity <- function(c) { paste0("act:", c) }
cols <- sapply(cols, paste_activity)
colnames(activity)[5:14] <- cols

# people
cols <- colnames(people)[c(2,4,6:41)]
paste_people <- function(c){ paste0("ppl:", c) }
cols <- sapply(cols, paste_people)
colnames(people)[c(2,4,6:41)] <- cols

# Clean
rm(cols,paste_people, paste_activity); gc()

##### Merge activity and people #####
# Drop date from each
activity[, date := NULL]; people[, date := NULL]

# Merge
act_ppl <- merge(activity,people,by="people_id", all.x = TRUE)
rm(activity,people); gc()

# Drop people_id, since we don't need it anymore
act_ppl[, people_id := NULL]

# Check for empty entries
if (debugFlag){
# A function for testing empty entries
is_empty <- function(c){ c == ""}

# Number of empty entries in act:char_1 to act:char_10
act.cols <- colnames(act_ppl)[3:12]
tmp <- act_ppl[, lapply(.SD, is_empty), .SDcols = act.cols]
n_empty.act <- tmp[, lapply(.SD, sum), .SDcols = act.cols]
rm(tmp); gc()

# Number of empty entries in ppl:char_1 to ppl:char_9
ppl.cols <- colnames(act_ppl)[15:53]
tmp <- act_ppl[, lapply(.SD, is_empty), .SDcols = ppl.cols]
n_empty.ppl <- tmp[, lapply(.SD, sum), .SDcols = ppl.cols] # n_empty.ppl is all zeros!!
rm(tmp); gc()
} # There are empty entries in act:char's but no empty entries in ppl:char's

# Maybe, one can replace empty levels with type_0
act.cols <- colnames(act_ppl)[3:12]
replace_empty <- function(c){ c[c == ""] = "type 0"; c }
act_ppl[, (act.cols) := lapply(.SD, replace_empty), .SDcols = act.cols]

## Now, we can create a model matrix
# Split back to train and test
X <- act_ppl[, -(c("is.train", "outcome","activity_id")), with = FALSE]
id.train <- act_ppl$is.train == 1
y.train <- act_ppl$outcome[id.train]
act_id <- act_ppl$activity_id

require(Matrix)

# Rename colums, get rid of "_" and "_" (sparse.model.matrix does not like such column names)
repUnderScore <- function(c){ c <- gsub("_","",c); c <- gsub(":","",c); c }
colnames(X) <- repUnderScore(colnames(X))

# Create the sparse.model.matrix
Xmm <- sparse.model.matrix(~.-1, X) # Warnings are OK, just conversion from char to factor
warnings()

# Test XGBoost
require(xgboost)
dtrain <- xgb.DMatrix(Xmm[id.train,],label = y.train)
param <- list(booster="gblinear",
objective="binary:logistic",
eval_metric="auc",
eta=0.03)
# max_depth = 5,
# subsample = 0.7,
# colsample_bytree = 0.3,
# min_child_weight = ,
# gamma = 1,
# alpha = 0.0)

set.seed(1011)
watchlist <- list(train=dtrain)
mod.xgb_cv <- xgb.cv(params=param,
data=dtrain,
nrounds=500,
watchlist=watchlist,
nfold=5,
early.stop.round=3)

# Predict on test set
dtest = xgb.DMatrix(Xmm[!id.train,])
mod.xgb <- xgboost(params=param,
data=dtrain,
nrounds=173,
watchlist=watchlist,
early.stop.round=3)

pred.test <- predict(mod.xgb, newdata = dtest)
act_id.test <- act_id[!id.train]

# Final and submission
final <- data.frame(activity_id = act_id.test, outcome = pred.test)
write.csv(final,"xgb_trial.csv", row.names = FALSE)

0 comments on commit b5fe01c

Please sign in to comment.