Added TalkingData

joshkyh · Oct 5, 2016 · 385fe36 · 385fe36
1 parent 1ad6c96
commit 385fe36
Show file tree

Hide file tree

Showing 3 changed files with 351 additions and 0 deletions.
diff --git a/TalkingData/Readme.md b/TalkingData/Readme.md
@@ -0,0 +1 @@
+Data can be downloaded from [Kaggle](https://www.kaggle.com/c/talkingdata-mobile-user-demographics)
diff --git a/TalkingData/brandModelAppid.R b/TalkingData/brandModelAppid.R
@@ -0,0 +1,199 @@
+# Burak Himmetoglu
+# begin: 08-03-2016
+#
+# Talking Data, blahblah
+#
+# Libraries
+options(stringsAsFactors=F)
+require(data.table)
+
+# Setwd
+setwd("~/Works/Rworkspace/TalkingData/by_data_table/")
+
+# Get data
+train <- fread("../data/gender_age_train.csv", colClasses=c("character","character","integer","character"))
+test <- fread("../data/gender_age_test.csv",colClasses=c("character"))
+
+# Add NA's to group, age and gender for label test
+test[,c("gender","age","group"):= NA] 
+
+# Combine test & train
+full_data <- rbind(train,test)
+rm(test,train); gc() # Remove unnecessary data. Test can be reconstructed by is.na
+
+# Get phone brands and models
+brand_model <- fread("../data/phone_brand_device_model.csv",colClasses=c("character","character","character"))
+
+# Unique entries in brand_model
+brand_model <- unique(brand_model,by=NULL) # Unique entries
+
+# Combine phone_brand and device_model
+brand_model <- brand_model[,brand_model := paste(phone_brand,device_model,sep="-")]
+brand_model <- brand_model[, brand_model := as.factor(brand_model)]
+levs <- levels(brand_model[,brand_model]); n.levs <- length(levs)
+
+# Change the levels of brand_model to numeric values
+levels(brand_model$brand_model) <- 0:(n.levs-1) 
+
+# Remove uunnecessary columns
+brand_model <- brand_model[,c("phone_brand", "device_model") := NULL]
+
+# Merge full_data and brand_model
+brand_model_group <- merge(full_data, brand_model, by = "device_id", all.x = TRUE)
+rm(full_data,brand_model); gc() # Remove data that is not needed
+
+# Duplicated device_id's
+dupl <- duplicated(brand_model_group$device_id); dupl_id <- brand_model_group[dupl,device_id]
+dupl_data <- brand_model_group[device_id %in% dupl_id,]
+
+# For the duplicated id's, choose the one with the brand that has more phones in the data
+ind.rm <- c()
+for (ind in 1:(dim(dupl_data)[1]/2)){
+  i <- (ind-1)*2 + 1
+  i1 <- brand_model_group[brand_model == dupl_data$brand_model[i],]
+  i2 <- brand_model_group[brand_model == dupl_data$brand_model[i+1],]
+  dim1 <- length(i1[,brand_model]); dim2 <- length(i2[,brand_model])
+  iz <- ifelse(dim1 > dim2,2,1) # Choose the brand_model with smaller phones to remove
+  l.rm <- brand_model_group$device_id == dupl_id[ind] & brand_model_group$brand_model == dupl_data[iz+(ind-1)*2,brand_model]
+  ind.rm <- c(ind.rm, which(l.rm == TRUE))
+}
+rm(i1,i2,iz,dim1,dim2,dupl_data,dupl,dupl_id); gc() # Remove unused data
+
+# Remove duplicated data
+brand_model_group <- brand_model_group[-ind.rm,]
+
+# Read apps data
+events <- fread("../data/events.csv", colClasses=c("character","character","character","numeric","numeric"))
+event_app <- fread("../data/app_events.csv",colClasses=rep("character",4))
+
+# Labels
+#app_labels <- fread("../data/app_labels.csv", colClasses = c("character","integer"))
+#label_categories <- fread("../data/label_categories.csv")
+
+# Get unique events for device_id - event_id combination, and subset with (device_id,event_id)
+events <- unique(events[,.(device_id,event_id)],by=NULL)
+
+# Join app_labels and event_app (is this useful?)
+#N_app_label <- app_labels[,.N,by="app_id"] # How many of different labels does an app belong to ?
+
+# For each event_id, list unique app_id's s separated by commas
+event_apps <- event_app[,.(apps=paste(unique(app_id),collapse=",")),by="event_id"]
+
+# Combine this with events
+device_event_apps <- merge(events,event_apps,by="event_id")
+rm(events,event_app,event_apps);gc()
+
+# For each app by device_id get unique apps (Notice that now we group_by device_id not event_id!)
+f_split_paste <- function(z){paste(unique(unlist(strsplit(z,","))),collapse=",")}
+device_apps <- device_event_apps[,.(apps=f_split_paste(apps)),by="device_id"]
+
+rm(device_event_apps,f_split_paste);gc()
+
+# Spread the data (device_id, app_id in many rows)
+tmp <- strsplit(device_apps$apps,",") # Temporary list for storage apps (i.e. their IDs). Contains 60822 lists
+
+# device_id and app_is spread 
+device_apps <- data.table(device_id=rep(device_apps$device_id,times=sapply(tmp,length)),app_id=unlist(tmp))
+rm(tmp)
+
+# Combine the brand_model_group data and the device_apps data
+d1 <- brand_model_group[,list(device_id,brand_model)]
+brand_model_group$brand_model <- NULL
+d2 <- device_apps
+rm(device_apps)
+
+# Initiate factor variables
+d1[,brand_model:=paste0("brand_model:",brand_model)]
+d2[,app_id:=paste0("app_id:",app_id)]
+names(d1) <- names(d2) <- c("device_id","feature_name") # Just rename
+
+# Combine
+dd <- rbind(d1,d2)
+rm(d1,d2);gc()
+
+# Find unique device_id's and spread the features
+require(Matrix)
+ii <- unique(dd$device_id)
+jj <- unique(dd$feature_name)
+
+# Locations of unique entries in dd
+id_i <- match(dd$device_id,ii) # max(id_i) = length(ii)
+id_j <- match(dd$feature_name,jj) # max(id_j) = length(jj)
+
+# Collect the locations
+id_ij <- cbind(id_i,id_j)
+
+# Initiate matrix for storage (dimensions are unique entries in device_id and feature_names)
+M <- Matrix(0,nrow=length(ii),ncol=length(jj),
+            dimnames=list(ii,jj),sparse=T)
+
+# At locations where elements are, put 1: Factor variables
+M[id_ij] <- 1 
+# Notice that dim(id_ij) > dim(M). However, indices in id_ij are repeated ( max(id_i) = length(ii) & max(id_j) = length(jj) )
+# meaning that no offshoot of matrix M indices can occur
+
+rm(ii,jj,id_i,id_j,id_ij,dd);gc()
+# M has device_id's in rows and phone_brands in cols
+
+# Some more wrangling
+x <- M[rownames(M) %in% brand_model_group$device_id,] # Subset rows of M so that it matches brand_model$device_id
+id <- brand_model_group$device_id   #[match(label1$device_id,rownames(x))]
+y <- brand_model_group$group  #[match(label1$device_id,rownames(x))]
+rm(M,brand_model_group)
+
+# Training data
+x_train <- x[!is.na(y),] # Y has NAs for test set
+
+# Remove some features
+tmp_cnt_train <- colSums(x_train)
+
+# Subset features that are not completely zero (tmp_cnt_train>0)
+x <- x[,tmp_cnt_train>0]
+rm(x_train,tmp_cnt_train)
+
+### XGBoost test ####
+
+# Train model
+require(xgboost)
+
+# Label names
+group_name <- na.omit(unique(y))
+
+# Train and test data
+idx_train <- which(!is.na(y))
+idx_test <- which(is.na(y))
+train_data <- x[idx_train,]
+test_data <- x[idx_test,]
+
+# Assing labels (numeric)
+train_label <- match(y[idx_train],group_name)-1
+test_label <- match(y[idx_test],group_name)-1
+
+# XGBoost style matrices
+dtrain <- xgb.DMatrix(train_data,label=train_label,missing=NA)
+dtest <- xgb.DMatrix(test_data,label=test_label,missing=NA)
+
+param <- list(booster="gbtree",
+              objective="multi:softprob",
+              num_class=12,
+              eval_metric="mlogloss",
+              eta=0.1,
+              max_depth = 3,
+              subsample = 0.7,
+              colsample_bytree = 0.7)
+
+# Train
+set.seed(101)
+watchlist <- list(train=dtrain)
+fit_xgb <- xgb.train(params=param,
+                     data=dtrain,
+                     nrounds=300,
+                     watchlist=watchlist)
+# Predict
+pred <- predict(fit_xgb,dtest) # Contains prepdictions of 1112071 observations. It is a vector of dim 12*112071
+# Has to be rolled back to a matrix (12,112071)  
+pred_detail <- t(matrix(pred,nrow=length(group_name)))
+
+res_submit <- cbind(id=id[idx_test],as.data.frame(pred_detail))
+colnames(res_submit) <- c("device_id",group_name)
+write.csv(res_submit,file="submit_xgb_test2.csv",row.names=F,quote=F)
diff --git a/TalkingData/explore_v0.1.R b/TalkingData/explore_v0.1.R
@@ -0,0 +1,151 @@
+# Burak Himmetoglu
+# begin: 07-25-2016
+#
+# Talking Data, exploration and simple predictions
+# Computes conditional probabilities P(group|brand) for test and train
+# The saved data can be used as features in a later modeling study
+#
+# Libraries
+library(dplyr)
+library(tidyr)
+
+# Global define
+DEBUG_FLAG = FALSE
+
+# Load data
+setwd("~/Works/Rworkspace/TalkingData/")
+train <- read.table("./data/gender_age_train.csv", stringsAsFactors = FALSE, header = TRUE, sep = ",",
+                    colClasses = c("character", "character", "integer", "character"))  # Training data
+
+train <- train %>% mutate(group = as.factor(group))
+
+# Probability of a random device belonging to age-gender groups
+prob_group <- train %>% group_by(group) %>% summarize(tot_by_group = n()) %>%
+              mutate(prob_group = tot_by_group / sum(tot_by_group))
+
+# Phone brand data
+phone_brand_device_model <- read.table("./data/phone_brand_device_model.csv", stringsAsFactors = FALSE, header = TRUE,
+                                       sep = ",", colClasses = c("character","character","character"))
+
+# Get distinct values
+# This does not remove duplicated device_id's
+phone_brand_device_model <- phone_brand_device_model %>% distinct() 
+
+## Most popular brands by each age/gender group ##
+## We only need train and phone_brand_device_model for this ##
+
+# Connect train and phone_brand_device_model
+joint_train_brand <- left_join(train,phone_brand_device_model, by="device_id") %>% mutate(group = as.factor(group)) %>% mutate(phone_brand = as.factor(phone_brand))
+
+# There is one duplicated device_id in joint_train_brand
+dupl <- duplicated(joint_train_brand$device_id); dupl_id <- joint_train_brand[dupl,1]
+dupl_train <- filter(joint_train_brand, device_id == dupl_id)
+
+# For the duplicated id's, choose the one with the brand that has more phones in the data
+dim1 <- filter(phone_brand_device_model, phone_brand == dupl_train$phone_brand[1]) %>% select(phone_brand) %>% dim()
+dim2 <- filter(phone_brand_device_model, phone_brand == dupl_train$phone_brand[2]) %>% select(phone_brand) %>% dim()
+iz <- ifelse(dim1[1] > dim2[1], 2, 1) # dim2[1] is larger
+
+# Remove from train the entry corressponding to dupl_train[1,]
+ind.rm <- joint_train_brand$device_id %in% dupl_id & joint_train_brand$phone_brand == dupl_train[iz,5]
+ind.rm <- which(ind.rm == TRUE)
+joint_train_brand <- joint_train_brand[-ind.rm, ]
+
+# Group_by: group and phone_brand
+n_train_brand <- ungroup(joint_train_brand) %>% group_by(group,phone_brand) %>% summarise(tot_by_group_brand = n())
+
+# Total number of phones (all brands)
+tot_phones <- sum(n_train_brand$tot_by_group_brand) # Should be equal to dim(train)[1]
+if (DEBUG_FLAG){ if (tot_phones != dim(train)[1]) { print("Wrong dimensions!") } }
+
+# Find probability of each brand
+prob_brand <- ungroup(n_train_brand) %>% group_by(phone_brand) %>% summarise(tot_brand = sum(tot_by_group_brand)) %>% # Tot phones per brand
+              mutate(prob_brand = tot_brand/tot_phones) # Prob of each brand
+
+# Find conditional probability P(group|brand) by Bayes' theorem:
+## Join with prob_group & prob_brand then compute conditional probability:
+## P(group|brand) = P(brand|group)*P(group)/P(brand)
+## ## P(brand|group) = tot_by_group_brand / tot_by_group
+## ## P(group) = prob_group & P(brand) = prob_brand
+n_train_brand <- ungroup(n_train_brand) %>% left_join(prob_group, by = "group") %>%
+                     left_join(prob_brand, by = "phone_brand") %>%
+                     mutate(cond_prob = (tot_by_group_brand/tot_by_group) * prob_group / prob_brand)
+
+# Just select the relevant columns
+cond_group_brand <- select(n_train_brand,c(group,phone_brand,cond_prob))
+
+# Spread the data: group becomes columns
+P_brand_grp <- spread(cond_group_brand,group,cond_prob, fill = 0.0) # fill = 0.0 better than below
+
+if (DEBUG_FLAG){
+  # The probabilities must sum to 1 
+  which(round(rowSums(P_brand_grp[,-1]),4) != 1)
+}
+
+# Join with train and save for later
+joint_train_prob <- joint_train_brand %>% left_join(P_brand_grp, by = "phone_brand") %>%
+                    select(-c(gender,age,group, device_model,phone_brand))
+save(joint_train_prob, file = "train_P.group_brand.RData")
+
+## Given the conditional probability, we can compute for each given device_id, what probability each group has
+
+# Read the test data
+test <- read.table("./data/gender_age_test.csv", stringsAsFactors = FALSE, header = TRUE, sep = ",",
+                   colClasses = c("character")) 
+
+# Join with brand
+joint_test_brand <- left_join(test,phone_brand_device_model, by = "device_id") %>% mutate(phone_brand = as.factor(phone_brand))
+joint_test_brand$device_model <- NULL # device model not included in analysis
+joint_test_brand <- distinct(joint_test_brand) # Keep disticnt rows
+
+# Check for duplicated device_id's
+dupl <- duplicated(joint_test_brand$device_id); dupl_id <- joint_test_brand[dupl,1]
+dupl_test <- filter(joint_test_brand, device_id %in% dupl_id)
+
+# For the duplicated id's, choose the one with the brand that has more phones in the data
+ind.rm <- c()
+for (ind in 1:(dim(dupl_test)[1]/2)){
+  i <- (ind-1)*2 + 1
+  dim1 <- filter(phone_brand_device_model, phone_brand == dupl_test$phone_brand[i]) %>% select(phone_brand) %>% dim()
+  dim2 <- filter(phone_brand_device_model, phone_brand == dupl_test$phone_brand[i+1]) %>% select(phone_brand) %>% dim()
+  iz <- ifelse(dim1[1] > dim2[1], 2, 1) # Choose the brand with smaller phones to remove
+  l.rm <- joint_test_brand$device_id == dupl_id[ind] & joint_test_brand$phone_brand == dupl_test[iz+(ind-1)*2,2]
+  ind.rm <- c(ind.rm, which(l.rm == TRUE))
+}
+
+# Finish by removing the repeated elements
+joint_test_brand <- joint_test_brand[-ind.rm, ]
+
+# Fill the NA's with 0: 
+fill_na <- function(column){
+  column[is.na(column)] <- 0
+  column
+}
+# Join with P_brand_grp to get probabilities
+joint_test_prob <- left_join(joint_test_brand,P_brand_grp, by = "phone_brand") %>%
+                   mutate_each(funs(fill_na)) 
+joint_test_prob$phone_brand <- NULL
+
+# There are device_id's with all zero probs, since their brands are not in training set. Put these a value from p_group
+l.0 <- which(rowSums(joint_test_prob[,-1]) == 0)
+for (ind in 1:length(l.0)){
+  for (i in 2:13){
+    joint_test_prob[l.0[ind], i] <- prob_group$prob_group[i-1]
+  }
+}
+# Save for later
+save(joint_test_prob, file = "test_P.group_brand.RData")
+
+##  Benchmark with prob_group ##
+joint_prob_bench <- matrix(0, nrow=112071, ncol=12)
+for (ind in 1:12){
+  joint_prob_bench[,ind] <- prob_group$prob_group[ind]
+}
+grp.names <- as.character(prob_group$group)
+joint_prob_bench <- cbind(test, joint_prob_bench)
+colnames(joint_prob_bench) <- c("device_id",grp.names)
+
+# Write a submission file
+write.csv(joint_test_prob, file='submission_phone-brand.csv', row.names=FALSE, quote = FALSE)
+write.csv(joint_prob_bench, file='submission_group-bench.csv', row.names=FALSE, quote = FALSE)
+
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Data can be downloaded from [Kaggle](https://www.kaggle.com/c/talkingdata-mobile-user-demographics)