This repository was archived by the owner on May 1, 2018. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_analysis.R
84 lines (65 loc) · 3.68 KB
/
run_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# run_analysis.R
# This script calculates the mean of certain measurements in the UCI Human
# Activity Recognition dataset, grouped by activity and volunteer (subject).
# The selected measurements are the ones identified as the mean, std and
# meanFreq values.
# The parts of the script which correspond to the tasks numbered 1 through 5 in
# the project instructions are identified by --X--, where X is a number. In
# some cases more than one line or section of code is associated with a task.
# See CodeBook.md for details of the output.
### USER OPTIONS
# Set datFolder to the root location of the dataset. The script reads
# features.txt and activity_labels.txt directly from this folder, and the
# measurement files from its test/ and train/ subfolders.
#datFolder <- "dat"
datFolder <- "UCI HAR Dataset"
#datFolder <- "."
# Set niceSet to TRUE if you want the niceSet output (see CodeBook.md).
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned, whereas TRUE and FALSE are reserved words.
niceSet <- FALSE
library(data.table)
# Infix helper: a %.% b glues its two operands together with no separator.
# It is used to build the file names handed to read.table.
"%.%" <- function(a, b) {
  paste0(a, b)
}
### PREPARATION FOR NAMING COLUMNS AND ROWS
# The measurement names are the second column of features.txt
features <- read.table(paste0(datFolder, "/features.txt"))[[2]]
# Strip "()", "-" and "Mean" from every name, then lower-case what is left.
# All names with capital M in "Mean" are angle measurements, not means, so
# dropping "Mean" keeps them out of the "mean|std" search below.
features <- tolower(gsub("\\()|-|Mean", "", features))            # --4--
# Indices of the measurements that we care about (int vector)
colsOfInterest <- grep("mean|std", features)                      # --2--
# Activity names in the order they are listed in activity_labels.txt,
# lower-cased and with the first underscore removed (char vector)   --3--
filename <- paste0(datFolder, "/activity_labels.txt")
actLabels <- read.table(filename)[[2]]
actLabels <- sub("_", "", tolower(as.character(actLabels)))
### MANIPULATION OF TEST DATA
# Activity codes of the test rows as integers; they index actLabels below
testAct <- read.table(paste0(datFolder, "/test/Y_test.txt"))[[1]]
testAct <- as.integer(testAct)
# Read every measurement as numeric, then keep only the columns that we
# care about                                                        --2--
filename <- paste0(datFolder, "/test/X_test.txt")
testDF <- read.table(filename, colClasses = "numeric")
testDF <- testDF[, colsOfInterest]
# Assemble the columns left to right: activity name first, subject number
# second, then the selected measurements
testDF <- cbind(
  actLab = actLabels[testAct],
  read.table(paste0(datFolder, "/test/subject_test.txt")),
  testDF
)
### MANIPULATION OF TRAINING DATA
# Activity codes of the training rows as integers; they index actLabels below
trainAct <- read.table(paste0(datFolder, "/train/Y_train.txt"))[[1]]
trainAct <- as.integer(trainAct)
# Read every measurement as numeric, then keep only the columns that we
# care about                                                        --2--
filename <- paste0(datFolder, "/train/X_train.txt")
trainDF <- read.table(filename, colClasses = "numeric")
trainDF <- trainDF[, colsOfInterest]
# Assemble the columns left to right: activity name first, subject number
# second, then the selected measurements
trainDF <- cbind(
  actLab = actLabels[trainAct],
  read.table(paste0(datFolder, "/train/subject_train.txt")),
  trainDF
)
### CREATION OF OUTPUT
XDF <- rbind(testDF, trainDF)                                     # --1--
# the merged mean and std measurements from the input files (data frame)
names(XDF) <- c("activity", "subject", features[colsOfInterest]) # --4--
# cbind()/data.frame() only turn strings into factors by default before
# R 4.0.0, so the activity column may arrive here as plain character.
# Coerce it explicitly so that its levels exist on every R version; on a
# column that is already a factor this changes nothing.
XDF$activity <- as.factor(XDF$activity)
# One level per subject/activity combination; used as index in tapply --5--
tvec <- interaction(XDF[2:1])
# Activity of each combination, in the level order of tvec. Working on the
# integer codes and naming levels and labels explicitly keeps every label
# attached to its own code even when an activity has no rows.
activity <- tapply(as.integer(XDF$activity), tvec, unique)
activity <- factor(activity,
                   levels = seq_len(nlevels(XDF$activity)),
                   labels = levels(XDF$activity))
# tapply wrapper that returns a plain vector instead of an array, so that
# the lapply result below can be passed straight to data.frame
vtapply <- function(...) {
  as.vector(tapply(...))
}
# Mean of subject and of every measurement within each combination; one row
# per level of tvec
tidySet <- data.frame(activity, lapply(XDF[2:ncol(XDF)], vtapply, tvec, mean))
rownames(tidySet) <- NULL
if (niceSet) {  # Can be set to TRUE at the top of the script
  tlist <- list(XDF$subject, XDF$activity)
  # The flag is deliberately overwritten with the result:
  # mean of each measurement by subject and activity (list of matrices)
  niceSet <- lapply(XDF[3:ncol(XDF)], tapply, tlist, mean)
}
write.table(tidySet, file = "TidyHumanActivity.txt", row.names = FALSE)
### CLEANUP
# Leave only XDF, niceSet and tidySet
# tlist is only created when the niceSet output was requested, so removing
# the full list unconditionally warns "object 'tlist' not found" on a default
# run. Restricting the list to the names that currently exist keeps the
# cleanup silent in both cases.
rm(list = intersect(
  c("features", "colsOfInterest", "actLabels", "testDF", "testAct",
    "trainDF", "trainAct", "activity", "tvec", "tlist", "vtapply",
    "datFolder", "filename", "%.%"),
  ls()
))