##
## title: "Algorithms from scratch using Gradient Descent to predict average GPU Run Time & classify its run type"
## author: "Sarthak Mohapatra"
## date: "1/29/2020"
##

options(scipen = 999)

##
## Load the required packages.
##

pacman::p_load(data.table, forecast, leaps, tidyverse, caret, corrplot, glmnet, mlbench, ggplot2, gplots, pivottabler, MASS,
               e1071, fpp2, gains, pROC, knitr, gplots, FNN, RColorBrewer, viridis, cowplot, ggpubr, gridExtra, rlist, d3heatmap)


##
## Read the dataset from the working directory.
##

setwd('D:/Second Semester - MSBA - UTD/Applied Machine Learning/Assignment 1/sgemm_product_dataset')
gpu.df <- read.csv("sgemm_product.csv")
head(gpu.df)

##
## Give columns 15 to 18 (the last four columns as loaded) the names Run1..Run4.
##

names(gpu.df)[15:18] <- c("Run1", "Run2", "Run3", "Run4")
head(gpu.df)

##
## New feature Average: the mean of Run1 through Run4.
##

gpu.df$Average <- with(gpu.df, (Run1 + Run2 + Run3 + Run4) / 4)
head(gpu.df)

##
## Data partitioning: shuffle the rows with a fixed seed and drop the four
## individual run columns, leaving 14 features plus Average (column 15).
##

set.seed(16)

rows <- sample(nrow(gpu.df))
gpu <- gpu.df[rows, -(15:18)]

##
## The first 70% of the shuffled rows form the training set, the rest the test set.
##

split <- round(nrow(gpu) * 0.7)
gpu.train.df <- gpu[1:split, ]
gpu.test.df <- gpu[(split + 1):nrow(gpu), ]

##
## Confirm the size of the split.
##

round(nrow(gpu.train.df) / nrow(gpu), digits = 3)
head(gpu.train.df)
head(gpu.test.df)

##
## Normalising the dataset. The centring/scaling parameters are estimated on
## the training rows only and then applied to the training set, the test set
## and the full shuffled dataset.
##

gpu_train_norm <- gpu.train.df
gpu_test_norm <- gpu.test.df
gpu_norm_df <- gpu

norm.values <- preProcess(gpu.train.df[, 1:15], method = c("center", "scale"))
gpu_train_norm[, 1:15] <- predict(norm.values, gpu.train.df[, 1:15])
gpu_test_norm[, 1:15] <- predict(norm.values, gpu.test.df[, 1:15])
gpu_norm_df[, 1:15] <- predict(norm.values, gpu[, 1:15])
new.gpu.norm.df <- predict(norm.values, gpu)

##
## Feature (X) and target (Y) matrices. X gets a leading column of ones named
## "Intercept" so the intercept is estimated together with the 14 slopes.
##

x_gpu_train <- cbind(Intercept = 1, as.matrix(gpu_train_norm[1:14]))
y_gpu_train <- as.matrix(gpu_train_norm["Average"])

x_gpu_test <- cbind(Intercept = 1, as.matrix(gpu_test_norm[1:14]))
y_gpu_test <- as.matrix(gpu_test_norm["Average"])

head(x_gpu_train)
head(y_gpu_train)
head(x_gpu_test)
length(y_gpu_train)
length(y_gpu_test)


##
## Converting the problem statement to a binary class problem.
##

##
## A record whose (normalised) average run time is less than or equal to the
## median gets class 0 (low run type); a record above the median gets class 1
## (high run type). The median is taken over the full normalised dataset.
##

median.input <- median(gpu_norm_df$Average)
median.input

x.train.gpu.logit <- x_gpu_train
head(y_gpu_train)
y.train.gpu.logit <- ifelse(y_gpu_train <= median.input, 0, 1)
head(y.train.gpu.logit)


x.test.gpu.logit <- x_gpu_test
head(y_gpu_test)
y.test.gpu.logit <- ifelse(y_gpu_test <= median.input, 0, 1)
head(y.test.gpu.logit)


##
## The code below implements the Gradient Descent method. Based on the experimentation performed, the best alpha selected for demonstration here
## is alpha = 0.0001 with the threshold thold = 0.000001.
## NOTE: the invocation further down currently sets alpha <- 0.00001 and thold = 0.0000000001, which differ from the values quoted here.
##

##
## Here we define the Gradient Descent algorithm. It keeps track of the cost, the beta coefficients, the predicted target values and the error.
##

##
## Batch gradient descent for logistic regression.
##
## Arguments:
##   x        - design matrix (one row per observation, one column per coefficient).
##   y        - 0/1 target values, one per row of x.
##   alpha    - learning rate.
##   m        - number of observations used to scale the cost and the gradient.
##   beta     - numeric vector of starting coefficients, one per column of x.
##              Backward compatibility: if a non-numeric value is supplied, the
##              variable `beta_value` visible from this function's defining
##              environment is used instead, which is what the previous
##              implementation always did.
##   thold    - convergence threshold; iteration stops once the cost improves
##              by less than thold between two consecutive iterations.
##   max_iter - maximum number of iterations (default 10000).
##
## Returns an unnamed list:
##   [[1]] numeric vector with the cost at every iteration performed,
##   [[2]] matrix with one row of coefficients per iteration performed,
##   [[3]] predicted probabilities computed in the last iteration,
##   [[4]] prediction errors (yhat - y) computed in the last iteration.
##
## Nothing is written to the global environment.
##
gradient_descent <- function(x, y, alpha, m, beta, thold, max_iter = 10000L)
{
  x <- as.matrix(x)
  n_coef <- ncol(x)

  ## Honour the `beta` argument; fall back to `beta_value` only for legacy calls.
  start <- if (is.numeric(beta)) beta else beta_value
  if (length(start) != n_coef) {
    stop("the starting coefficients must have one entry per column of x", call. = FALSE)
  }
  coefs <- matrix(as.numeric(start), ncol = 1)

  ## Pre-allocate one slot per iteration; trimmed to the iterations actually run.
  cost_iter <- numeric(max_iter)
  beta_iter <- matrix(0, nrow = max_iter, ncol = n_coef)
  yhat <- NULL
  error <- NULL
  n_done <- 0L

  for (i in seq_len(max_iter)) {

    yhat <- 1 / (1 + exp(-(x %*% coefs)))                                     ## Predicted probabilities.
    error <- yhat - y                                                         ## Prediction error.

    cost_iter[i] <- -1 * (1/m) * sum(y * log(yhat) + (1 - y) * log(1 - yhat)) ## Cross-entropy cost.

    coefs <- coefs - (alpha * (1/m) * (t(x) %*% error))                       ## Gradient step.
    beta_iter[i, ] <- coefs                                                   ## Coefficients after this step.
    n_done <- i

    ## isTRUE() keeps a NaN/NA cost from raising an error in the condition.
    if (i > 1L && isTRUE((cost_iter[i - 1L] - cost_iter[i]) < thold)) {
      message("Threshold reached")
      break
    }
  }

  kept <- seq_len(n_done)
  list(cost_iter[kept], beta_iter[kept, , drop = FALSE], yhat, error)
}


##
## Prediction function for the test (validation) dataset: computes the predicted probabilities, the errors and the cost.
##

##
## Score a fitted logistic-regression model on a hold-out set.
##
## Arguments:
##   beta_conv_iter - numeric vector of fitted coefficients, one per column of x_gpu_test.
##   x_gpu_test     - design matrix of the hold-out set.
##   y_gpu_test     - 0/1 target values of the hold-out set.
##
## Returns an unnamed list:
##   [[1]] predicted probabilities,
##   [[2]] errors (prediction - target),
##   [[3]] mean cross-entropy cost, i.e. -(1/n) * sum(y*log(p) + (1-y)*log(1-p)).
##
linear_test_predict <- function(beta_conv_iter, x_gpu_test, y_gpu_test)
{
  ## Use the matrix that was passed in rather than a global test matrix.
  yhat_test <- 1 / (1 + exp(-(as.matrix(x_gpu_test) %*% beta_conv_iter)))
  error_test <- yhat_test - y_gpu_test

  n_obs <- length(y_gpu_test)
  cost_test <- -1 * (1/n_obs) * sum(y_gpu_test * log(yhat_test) + (1 - y_gpu_test) * log(1 - yhat_test))

  list(yhat_test, error_test, cost_test)
}


##
## Let's define the main function. It runs Gradient Descent on the training data, starting from the supplied initial values of
## beta-i (slopes) and beta-0 (y intercept), and then evaluates the final coefficients on the test data.
##

##
## Train the logistic-regression model with gradient descent and score it on
## the test set.
##
## Arguments:
##   alpha      - learning rate handed to gradient_descent().
##   m          - number of training observations.
##   beta_value - numeric vector of starting coefficients.
##   thold      - convergence threshold handed to gradient_descent().
##
## Reads the matrices x.train.gpu.logit, y.train.gpu.logit, x.test.gpu.logit
## and y.test.gpu.logit from the enclosing (global) environment.
##
## Returns an unnamed list:
##   [[1]] training cost per iteration,
##   [[2]] cost on the test set,
##   [[3]] number of iterations recorded,
##   [[4]] coefficients of the last recorded iteration,
##   [[5]] predicted probabilities for the test set.
##
main_function <- function(alpha, m, beta_value, thold) {

  ## Hand over the starting coefficients that were actually supplied; a bare
  ## `beta` here would resolve to the base R beta() function.
  final <- gradient_descent(x.train.gpu.logit, y.train.gpu.logit, alpha, m, beta_value, thold)

  cost_return_train <- final[[1]]
  beta_return_train <- final[[2]]

  ## The last recorded iteration holds the final coefficients.
  conv_iter <- length(cost_return_train)
  beta_conv_iter <- beta_return_train[conv_iter, ]

  ## Evaluate against the binary test labels, matching the binary training target.
  final_test <- linear_test_predict(beta_conv_iter, x.test.gpu.logit, y.test.gpu.logit)

  yhat_return_test <- final_test[[1]]
  cost_return_test <- final_test[[3]]

  list(cost_return_train, cost_return_test, conv_iter, beta_conv_iter, yhat_return_test)
}


##
## Invoking the main function to apply the Gradient Descent algorithm.
##

## Hyper-parameters and starting point for this run.
thold <- 0.0000000001
alpha <- 0.00001
m <- nrow(gpu.train.df)
beta_value <- rep(0, 15)   ## one starting coefficient for the intercept and each of the 14 features

cost_return <- main_function(alpha, m, beta_value, thold)
cost_return_train <- cost_return[[1]]
cost_return_test <- cost_return[[2]]
conv_iter <- cost_return[[3]]
yhat_test <- cost_return[[5]]   ## element 5 holds the test predictions; element 4 is the coefficient vector

## NOTE(review): these variable names mention alpha 0.0001 although the run
## above uses the value stored in `alpha`; the names are kept unchanged.
cost_train_0.0001_al <- cost_return_train
cost_train_min_0.0001_al <- cost_return_train[conv_iter]
cost_test_0.0001_al <- cost_return_test



##
## Plotting various performance validation curves
##

## Build the labels from the values actually used so the plot cannot drift
## away from the run it describes.
alpha_label <- format(alpha, scientific = FALSE)
thold_label <- format(thold, scientific = FALSE)
cost_curve <- as.numeric(unlist(cost_train_0.0001_al))

plot(seq_along(cost_curve), cost_curve,
     main = paste0("Cost function convergence at alpha ", alpha_label, "."),
     xlab = "No. of Iterations", ylab = "Cost Function value",
     col = "red", type = "l", xlim = c(0, 10000), ylim = c(0.68, 0.7),
     sub = paste0("Convergence Threshold value - ", thold_label))
legend("topright", c(paste0("alpha=", alpha_label)), cex = 0.7, bty = "n", fill = c("red"))
| 265 | + |
| 266 | + |
| 267 | + |
| 268 | + |
| 269 | + |
| 270 | + |
| 271 | + |
| 272 | + |
| 273 | + |
0 commit comments