From cddd6593fa2e3bc6c7ac573512361400e5283311 Mon Sep 17 00:00:00 2001
From: bhimmetoglu
Date: Thu, 3 Mar 2016 14:55:46 -0800
Subject: [PATCH] added files

---
 titanic/analysis-rf.R      |  74 +++++++
 titanic/clean.R            |   5 +
 titanic/predictions_rf.csv | 419 +++++++++++++++++++++++++++++++++++++
 3 files changed, 498 insertions(+)
 create mode 100644 titanic/analysis-rf.R
 create mode 100644 titanic/predictions_rf.csv

diff --git a/titanic/analysis-rf.R b/titanic/analysis-rf.R
new file mode 100644
index 0000000..62b60e0
--- /dev/null
+++ b/titanic/analysis-rf.R
@@ -0,0 +1,74 @@
+# Titanic Survival Analysis (Random Forest version)
+# Function:
+#   Read/clean train and test (clean.R), fit a random forest and a boosted
+#   model (gbm) with caret, and write a prediction file for each model.
+#
+# Burak H.
+library(caret); library(dplyr)
+
+# Go to working directory
+setwd("~/Coursera/projects/titanic")
+
+# Fix the RNG seed so the random train/test split below can be reproduced
+set.seed(1234)
+
+# Read the training and testing sets
+train0 <- read.csv("./data/train.csv")
+test0 <- read.csv("./data/test.csv")
+
+# Get the cleaned data
+fun.clean <- dget("clean.R")
+cleaned <- fun.clean(train0,test0)
+tr0 <- cleaned[[1]]; ts0 <- cleaned[[2]]
+
+# Further split the tr0 data into training and testing sets by Caret
+in.tr0 <- createDataPartition(tr0$Survived, p=0.7, list = FALSE)
+training <- tr0[in.tr0, ]
+testing <- tr0[-in.tr0, ]
+
+# Train using random forests
+ctrl <- trainControl(allowParallel = TRUE, method = "repeatedcv", number = 10, repeats = 10)
+modFit <- train(Survived ~ Sex + Age + Pclass + SibSp + Parch + Embarked + Fare + Title,
+                method = "rf", importance = TRUE, data = training, ntree = 500,
+                trControl = ctrl)
+
+# Predictions on the training set
+pred.tr <- predict(modFit, newdata = training)
+
+# Predict on the testing set
+pred0 <- predict(modFit, newdata = testing)
+
+# Confusion Matrices
+cm.test <- confusionMatrix(pred0, testing$Survived); cm.test$table
+cm.train <- confusionMatrix(pred.tr, training$Survived); cm.train$table
+
+# Now predict on the actual testing set (ts0)
+pred.ts0 <- predict(modFit, newdata = ts0)
+
+# Select Passenger ID and Survived to write into final table
+final <- data.frame(Survived = pred.ts0, PassengerId = ts0$PassengerId)
+write.csv(final, file = "predictions_rf.csv", row.names = FALSE, quote = FALSE)
+
+##### Boosting ####
+ctrl <- trainControl(method="boot")
+mod.gbm <- train(Survived ~ Sex + Age + Pclass + SibSp + Parch + Embarked + Fare,
+                 method = "gbm", data = training,
+                 trControl = ctrl, verbose = FALSE)
+
+# Predictions on the training set
+pred.tr.gbm <- predict(mod.gbm, newdata = training)
+
+# Predict on the testing set
+pred.ts.gbm <- predict(mod.gbm, newdata = testing)
+
+# Confusion Matrices (built from the gbm predictions, not the random forest ones)
+cm.test.gbm <- confusionMatrix(pred.ts.gbm, testing$Survived); cm.test.gbm$table
+cm.train.gbm <- confusionMatrix(pred.tr.gbm, training$Survived); cm.train.gbm$table
+
+# Now predict on the actual testing set (ts0) with the boosted model
+pred.ts0.gbm <- predict(mod.gbm, newdata = ts0)
+
+# Select Passenger ID and Survived to write into final table; use a separate
+# file so the random forest predictions written above are not overwritten
+final.gbm <- data.frame(Survived = pred.ts0.gbm, PassengerId = ts0$PassengerId)
+write.csv(final.gbm, file = "predictions_gbm.csv", row.names = FALSE, quote = FALSE)
diff --git a/titanic/clean.R b/titanic/clean.R
index d4892d8..56d5612 100644
--- a/titanic/clean.R
+++ b/titanic/clean.R
@@ -69,6 +69,8 @@ clean.tr <- function (train, test){
     titles[i] <- str_trim(strsplit(temp, split = ",")[[1]][2], "left")
   }
   # New data frame with a column = title
+
+  # Change to factors
   ts2 <- mutate(test, Title = as.factor(titles))
 
   #Replce NA's with median age (this comes from train)
@@ -89,6 +91,9 @@ clean.tr <- function (train, test){
   ts2 <- mutate(ts2, binaryCabin = 1)
   ts2[ts2$Cabin == "", ]$binaryCabin = 0
   ts2 <- mutate(ts2, binaryCabin = factor(binaryCabin))
+
+  # There is one Title == Dona, replace it with Don (assume equivalence)
+  levels(ts2$Title) <- c("Col", "Don", "Dr", "Master", "Miss", "Mr", "Mrs", "Ms", "Rev")
 
   # Return the data.frames tr2 and ts2
   list(tr2,ts2)
diff --git a/titanic/predictions_rf.csv b/titanic/predictions_rf.csv
new file mode 100644
index 0000000..c854698
--- /dev/null
+++ b/titanic/predictions_rf.csv
@@ -0,0 +1,419 @@
+Survived,PassengerId
+0,892
+0,893
+0,894
+0,895
+0,896
+0,897
+0,898
+0,899
+1,900
+0,901 +0,902 +0,903 +1,904 +0,905 +1,906 +1,907 +0,908 +0,909 +0,910 +1,911 +1,912 +0,913 +1,914 +0,915 +1,916 +0,917 +1,918 +0,919 +0,920 +0,921 +0,922 +0,923 +0,924 +0,925 +1,926 +0,927 +0,928 +0,929 +0,930 +0,931 +0,932 +0,933 +0,934 +1,935 +1,936 +0,937 +0,938 +0,939 +1,940 +1,941 +0,942 +0,943 +1,944 +1,945 +0,946 +0,947 +0,948 +0,949 +0,950 +1,951 +0,952 +0,953 +0,954 +1,955 +1,956 +1,957 +1,958 +0,959 +0,960 +1,961 +1,962 +0,963 +0,964 +0,965 +1,966 +0,967 +0,968 +1,969 +0,970 +1,971 +1,972 +0,973 +0,974 +0,975 +0,976 +0,977 +1,978 +0,979 +1,980 +1,981 +0,982 +0,983 +1,984 +0,985 +0,986 +0,987 +1,988 +0,989 +0,990 +0,991 +1,992 +0,993 +0,994 +0,995 +0,996 +0,997 +0,998 +0,999 +0,1000 +0,1001 +0,1002 +1,1003 +1,1004 +1,1005 +1,1006 +0,1007 +0,1008 +1,1009 +0,1010 +1,1011 +1,1012 +0,1013 +1,1014 +0,1015 +0,1016 +0,1017 +0,1018 +1,1019 +0,1020 +0,1021 +0,1022 +0,1023 +0,1024 +0,1025 +0,1026 +0,1027 +0,1028 +0,1029 +0,1030 +0,1031 +0,1032 +1,1033 +0,1034 +0,1035 +0,1036 +0,1037 +0,1038 +0,1039 +0,1040 +0,1041 +1,1042 +0,1043 +0,1044 +0,1045 +0,1046 +0,1047 +1,1048 +0,1049 +0,1050 +1,1051 +1,1052 +1,1053 +1,1054 +0,1055 +0,1056 +0,1057 +0,1058 +0,1059 +1,1060 +0,1061 +0,1062 +0,1063 +0,1064 +0,1065 +0,1066 +1,1067 +1,1068 +1,1069 +1,1070 +1,1071 +0,1072 +0,1073 +1,1074 +0,1075 +1,1076 +0,1077 +1,1078 +0,1079 +0,1080 +0,1081 +0,1082 +0,1083 +0,1084 +0,1085 +1,1086 +0,1087 +1,1088 +0,1089 +0,1090 +0,1091 +1,1092 +1,1093 +1,1094 +1,1095 +0,1096 +0,1097 +1,1098 +0,1099 +1,1100 +0,1101 +0,1102 +0,1103 +0,1104 +1,1105 +0,1106 +0,1107 +1,1108 +0,1109 +1,1110 +0,1111 +1,1112 +0,1113 +1,1114 +0,1115 +1,1116 +0,1117 +0,1118 +1,1119 +0,1120 +0,1121 +0,1122 +1,1123 +0,1124 +0,1125 +1,1126 +0,1127 +1,1128 +0,1129 +1,1130 +1,1131 +1,1132 +1,1133 +0,1134 +0,1135 +1,1136 +0,1137 +1,1138 +0,1139 +1,1140 +0,1141 +1,1142 +0,1143 +1,1144 +0,1145 +0,1146 +0,1147 +0,1148 +0,1149 +1,1150 +0,1151 +0,1152 +0,1153 +1,1154 +1,1155 +0,1156 +0,1157 +0,1158 +0,1159 +0,1160 +0,1161 +0,1162 
+0,1163 +1,1164 +1,1165 +0,1166 +1,1167 +0,1168 +0,1169 +0,1170 +0,1171 +0,1172 +1,1173 +1,1174 +0,1175 +1,1176 +0,1177 +0,1178 +0,1179 +0,1180 +0,1181 +0,1182 +0,1183 +0,1184 +0,1185 +0,1186 +0,1187 +1,1188 +0,1189 +0,1190 +0,1191 +0,1192 +0,1193 +0,1194 +0,1195 +1,1196 +1,1197 +1,1198 +0,1199 +0,1200 +0,1201 +0,1202 +0,1203 +0,1204 +1,1205 +1,1206 +1,1207 +0,1208 +0,1209 +0,1210 +0,1211 +0,1212 +0,1213 +0,1214 +0,1215 +1,1216 +0,1217 +1,1218 +0,1219 +0,1220 +0,1221 +1,1222 +0,1223 +0,1224 +1,1225 +0,1226 +0,1227 +0,1228 +0,1229 +0,1230 +0,1231 +0,1232 +0,1233 +0,1234 +1,1235 +1,1236 +0,1237 +0,1238 +1,1239 +0,1240 +1,1241 +1,1242 +0,1243 +0,1244 +0,1245 +1,1246 +0,1247 +1,1248 +0,1249 +0,1250 +0,1251 +0,1252 +1,1253 +1,1254 +0,1255 +1,1256 +0,1257 +0,1258 +0,1259 +1,1260 +0,1261 +0,1262 +1,1263 +0,1264 +0,1265 +1,1266 +1,1267 +0,1268 +0,1269 +0,1270 +0,1271 +0,1272 +0,1273 +0,1274 +0,1275 +0,1276 +1,1277 +0,1278 +0,1279 +0,1280 +0,1281 +0,1282 +1,1283 +0,1284 +0,1285 +0,1286 +1,1287 +0,1288 +1,1289 +0,1290 +0,1291 +1,1292 +0,1293 +1,1294 +0,1295 +0,1296 +0,1297 +0,1298 +0,1299 +1,1300 +1,1301 +1,1302 +1,1303 +0,1304 +0,1305 +1,1306 +0,1307 +0,1308 +1,1309