added files

joshkyh · Mar 3, 2016 · cddd659 · cddd659
1 parent 0483eb6
commit cddd659
Show file tree

Hide file tree

Showing 3 changed files with 490 additions and 0 deletions.
diff --git a/titanic/analysis-rf.R b/titanic/analysis-rf.R
@@ -0,0 +1,66 @@
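+# Titanic Survival Analysis (Random Forest version)
+# Function:
+#     Read the train/test CSV files and clean them with the function stored
+#     in clean.R, fit a random forest and a boosted (gbm) model with caret,
+#     and write test-set predictions to CSV.
+#
+# Burak H.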
+library(caret); library(dplyr)
+
+# Switch to the project directory so the relative paths below resolve
+setwd("~/Coursera/projects/titanic")
+
+# Load the raw training and testing CSV files
+train0 <- read.csv(file = "./data/train.csv")
+test0 <- read.csv(file = "./data/test.csv")
+
+# clean.R holds the cleaning function; dget() evaluates that file and hands
+# the function back. Calling it returns a two-element list:
+# element 1 is the cleaned training data, element 2 the cleaned test data.
+fun.clean <- dget("clean.R")
+cleaned <- fun.clean(train0, test0)
+tr0 <- cleaned[[1]]
+ts0 <- cleaned[[2]]
+
+# Further split the tr0 data into training and testing sets by Caret.
+# createDataPartition() draws the rows at random, so the RNG seed is fixed
+# first; otherwise the 70/30 split, and every result computed from it, changes
+# on each run. (The seed value itself is arbitrary.)
+set.seed(1234)
+in.tr0 <- createDataPartition(tr0$Survived, p = 0.7, list = FALSE)
+training <- tr0[in.tr0, ]
+testing <- tr0[-in.tr0, ]
+
+# Train using random forests (500 trees), resampled with 10-fold
+# cross-validation repeated 10 times. TRUE is spelled out because T is an
+# ordinary variable that can be reassigned.
+ctrl <- trainControl(allowParallel = TRUE, method = "repeatedcv",
+                     number = 10, repeats = 10)
+modFit <- train(Survived ~ Sex + Age + Pclass + SibSp + Parch + Embarked + Fare + Title,
+                method = "rf", importance = TRUE, data = training, ntree = 500,
+                trControl = ctrl)
+
+# Random forest predictions: in-sample (training rows) and hold-out
+# (testing rows)
+pred.tr <- predict(modFit, newdata = training)
+pred0 <- predict(modFit, newdata = testing)
+
+# Confusion matrices, hold-out first and then in-sample; each table is
+# printed right after it is built
+cm.test <- confusionMatrix(data = pred0, reference = testing$Survived)
+cm.test$table
+cm.train <- confusionMatrix(data = pred.tr, reference = training$Survived)
+cm.train$table
+
+# Score the actual testing set (ts0)
+pred.ts0 <- predict(modFit, newdata = ts0)
+
+# Pair each predicted Survived value with its Passenger ID and write the
+# resulting table to CSV
+final <- data.frame(Survived = pred.ts0, PassengerId = ts0$PassengerId)
+write.csv(final, file = "predictions_rf.csv", row.names = FALSE, quote = FALSE)
+
+##### Boosting ####
+# Fit a gradient boosting model (gbm) with bootstrap resampling. Title is not
+# part of this formula.
+ctrl <- trainControl(method = "boot")
+mod.gbm <- train(Survived ~ Sex + Age + Pclass + SibSp + Parch + Embarked + Fare,
+                 method = "gbm", data = training,
+                 trControl = ctrl, verbose = FALSE)
+
+# Predictions on the training set
+pred.tr.gbm <- predict(mod.gbm, newdata = training)
+
+# Predict on the testing set
+pred.ts.gbm <- predict(mod.gbm, newdata = testing)
+
+# Confusion matrices for the gbm predictions. These used to be built from the
+# random forest predictions (pred0, pred.tr), so the boosted model was never
+# evaluated. Separate names leave the random forest results in cm.test and
+# cm.train untouched.
+cm.test.gbm <- confusionMatrix(pred.ts.gbm, testing$Survived); cm.test.gbm$table
+cm.train.gbm <- confusionMatrix(pred.tr.gbm, training$Survived); cm.train.gbm$table
+
+# Now predict on the actual testing set (ts0) with the boosted model
+pred.ts0.gbm <- predict(mod.gbm, newdata = ts0)
+
+# Select Passenger ID and Survived to write into the final table. The gbm
+# predictions get their own file so predictions_rf.csv is not overwritten.
+final.gbm <- data.frame(Survived = pred.ts0.gbm, PassengerId = ts0$PassengerId)
+write.csv(final.gbm, file = "predictions_gbm.csv", row.names = FALSE, quote = FALSE)
diff --git a/titanic/clean.R b/titanic/clean.R
@@ -69,6 +69,8 @@ clean.tr <- function (train, test){
     titles[i] <- str_trim(strsplit(temp, split = ",")[[1]][2], "left")
   }
   # New data frame with a column = title
+
+  # Change to factors
   ts2 <- mutate(test, Title = as.factor(titles))
 
   #Replce NA's with median age (this comes from train)
@@ -89,6 +91,9 @@ clean.tr <- function (train, test){
   ts2 <- mutate(ts2, binaryCabin = 1)
   ts2[ts2$Cabin == "", ]$binaryCabin = 0
   ts2 <- mutate(ts2, binaryCabin = factor(binaryCabin))
+
+  # There is one Title == Dona, replace it with Don (assume equivalence)
+  levels(ts2$Title) <- c("Col", "Don", "Dr", "Master", "Miss", "Mr", "Mrs", "Ms", "Rev")
 
   # Return the data.frames tr2 and ts2
   list(tr2,ts2)