Skip to content

Commit

Permalink
added files
Browse files Browse the repository at this point in the history
  • Loading branch information
bhimmetoglu committed Mar 3, 2016
1 parent 0483eb6 commit cddd659
Show file tree
Hide file tree
Showing 3 changed files with 490 additions and 0 deletions.
66 changes: 66 additions & 0 deletions titanic/analysis-rf.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# Titanic Survival Analysis (Random Forest version)
# Function:
# Read/clean train (clean.R)
#
# Burak H.
library(caret); library(dplyr)

# Go to the project directory so the relative paths below resolve.
# NOTE(review): setwd() ties the script to one machine's directory layout;
# kept because every path below is relative to it.
setwd("~/Coursera/projects/titanic")

# Fix the RNG seed so the data partition, the resampling folds and the
# forest itself are reproducible from run to run.
set.seed(2016)

# Read the raw training and testing sets
train0 <- read.csv("./data/train.csv")
test0 <- read.csv("./data/test.csv")

# Load the cleaning function stored in clean.R and apply it to both sets.
# It returns a list: element 1 is the cleaned train set, element 2 the
# cleaned test set.
fun.clean <- dget("clean.R")
cleaned <- fun.clean(train0, test0)
tr0 <- cleaned[[1]]; ts0 <- cleaned[[2]]

# Further split the labelled tr0 data into a 70% training part and a 30%
# hold-out part, so accuracy can be checked on rows the model has not seen.
in.tr0 <- createDataPartition(tr0$Survived, p = 0.7, list = FALSE)
training <- tr0[in.tr0, ]
testing <- tr0[-in.tr0, ]

# Train a random forest, tuned by 10-fold cross-validation repeated 10 times.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
ctrl <- trainControl(allowParallel = TRUE, method = "repeatedcv",
                     number = 10, repeats = 10)
modFit <- train(Survived ~ Sex + Age + Pclass + SibSp + Parch + Embarked + Fare + Title,
                method = "rf", importance = TRUE, data = training, ntree = 500,
                trControl = ctrl)

# Predictions on the training part (in-sample fit)
pred.tr <- predict(modFit, newdata = training)

# Predictions on the hold-out part (out-of-sample fit)
pred0 <- predict(modFit, newdata = testing)

# Confusion matrices: hold-out first, then in-sample
cm.test <- confusionMatrix(pred0, testing$Survived); cm.test$table
cm.train <- confusionMatrix(pred.tr, training$Survived); cm.train$table

# Now predict on the actual (unlabelled) testing set ts0
pred.ts0 <- predict(modFit, newdata = ts0)

# Select Passenger ID and Survived to write into the final table
final <- data.frame(Survived = pred.ts0, PassengerId = ts0$PassengerId)
write.csv(final, file = "predictions_rf.csv", row.names = FALSE, quote = FALSE)

##### Boosting ####
# Fit a gradient boosting model (gbm) on the same `training` split,
# tuned with bootstrap resampling.
ctrl <- trainControl(method = "boot")
mod.gbm <- train(Survived ~ Sex + Age + Pclass + SibSp + Parch + Embarked + Fare,
                 method = "gbm", data = training,
                 trControl = ctrl, verbose = FALSE)

# Predictions on the training part (in-sample fit)
pred.tr.gbm <- predict(mod.gbm, newdata = training)

# Predictions on the hold-out part (out-of-sample fit)
pred.ts.gbm <- predict(mod.gbm, newdata = testing)

# Confusion matrices for the boosted model. These must be built from the
# gbm predictions; using the random-forest predictions here would merely
# repeat the earlier random-forest results.
cm.test.gbm <- confusionMatrix(pred.ts.gbm, testing$Survived); cm.test.gbm$table
cm.train.gbm <- confusionMatrix(pred.tr.gbm, training$Survived); cm.train.gbm$table

# Predict on the actual (unlabelled) testing set ts0 with the boosted model
pred.ts0.gbm <- predict(mod.gbm, newdata = ts0)

# Select Passenger ID and Survived to write into the final table.
# Written to its own file so the random-forest predictions are not overwritten.
final.gbm <- data.frame(Survived = pred.ts0.gbm, PassengerId = ts0$PassengerId)
write.csv(final.gbm, file = "predictions_gbm.csv", row.names = FALSE, quote = FALSE)
5 changes: 5 additions & 0 deletions titanic/clean.R
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ clean.tr <- function (train, test){
titles[i] <- str_trim(strsplit(temp, split = ",")[[1]][2], "left")
}
# New data frame with a column = title

# Change to factors
ts2 <- mutate(test, Title = as.factor(titles))

# Replace NAs with the median age (this comes from train)
Expand All @@ -89,6 +91,9 @@ clean.tr <- function (train, test){
ts2 <- mutate(ts2, binaryCabin = 1)
ts2[ts2$Cabin == "", ]$binaryCabin = 0
ts2 <- mutate(ts2, binaryCabin = factor(binaryCabin))

# There is one Title == Dona, replace it with Don (assume equivalence)
levels(ts2$Title) <- c("Col", "Don", "Dr", "Master", "Miss", "Mr", "Mrs", "Ms", "Rev")

# Return the data.frames tr2 and ts2
list(tr2,ts2)
Expand Down
Loading

0 comments on commit cddd659

Please sign in to comment.