Concurrent model building in H2O using Parallel

Here is the full code snippet showing how to build models of any kind concurrently using the H2O backend and R's parallel package:

 > library(h2o)
 > h2o.init(nthreads = -1)

 ## Upload the prostate data set that ships with the h2o package; to keep the example small, only the first 300 rows are used

 > prostate_csv = system.file("extdata", "prostate.csv", package = "h2o")
 > prostate.hex = h2o.uploadFile(path = prostate_csv, destination_frame = "prostate.hex")
 > prostate.hex = prostate.hex[1:300, ]

 ## Add three indicator columns: Fold_k is 1 on the k-th block of 100 rows and 0 on the remaining 200 rows

 > ones = rep(1, times = 100)
 > zeroes = rep(0, times = 100)
 > prostate.hex$Fold_1 = as.h2o(data.frame(Fold_1 = c(ones, zeroes, zeroes)))
 > prostate.hex$Fold_2 = as.h2o(data.frame(Fold_2 = c(zeroes, ones, zeroes)))
 > prostate.hex$Fold_3 = as.h2o(data.frame(Fold_3 = c(zeroes, zeroes, ones)))
 
 ## Case 1: Use weights in GLM that will essentially run multiple GLM models on the same frame (so no data replication)
 ## Each model uses one Fold_i column of prostate.hex as its weights column. The model ids carry a "glm_weights_" prefix so that
 ## models built elsewhere under a plain "Fold_i" id cannot replace them on the H2O cluster.

 > start = Sys.time()
 > glm_weights = lapply(1:3, function(i) {
 h2o.glm(x = c(3:9), y = 2, training_frame = prostate.hex, weights_column = paste0("Fold_", i), model_id = paste0("glm_weights_Fold_", i))
 })
 > end = Sys.time()
 > weightsTime = end - start
 > weightsTime

 ## Case 2: Subset H2OFrame and try to run GLM in a for loop
 ## The three 100-row subsets are kept in a list, so prostate[[i]] is the i-th subset. The result list is allocated up front
 ## instead of being grown inside the loop, and the model ids carry a "glm_subset_" prefix so that models built elsewhere
 ## under a plain "Fold_i" id cannot replace them on the H2O cluster.

 > prostate_1 = prostate.hex[1:100, ]
 > prostate_2 = prostate.hex[101:200, ]
 > prostate_3 = prostate.hex[201:300, ]
 > prostate = list(prostate_1, prostate_2, prostate_3)
 > glm_subset = vector("list", length(prostate))
 > start = Sys.time()
 > for(i in seq_along(prostate)) {
 glm_subset[[i]] = h2o.glm(x = c(3:9), y = 2, training_frame = prostate[[i]], model_id = paste0("glm_subset_Fold_", i))
 }
 > end = Sys.time()
 > subsetTime = end - start
 > subsetTime

 ## Case 3: Use the package parallel to send all the GLM function calls over to H2O and H2O will handle how to run the multiple calls optimally
 ## mclapply() defaults to mc.cores = 2, so mc.cores = 3 is passed to launch all three calls at the same time.
 ## mclapply() relies on forking; on Windows it only supports mc.cores = 1.
 ## The model ids carry a "glm_parallel_" prefix so that these models do not replace models stored under a plain "Fold_i" id.

 > library(parallel)
 > start = Sys.time()
 > glm_parallel = mclapply(1:3, function(i) {
 h2o.glm(x = c(3:9), y = 2, training_frame = prostate[[i]], model_id = paste0("glm_parallel_Fold_", i))
 }, mc.cores = 3)
 > end = Sys.time()
 > parallelTimes = end - start
 > parallelTimes

 ### Sanity check: print the residual deviance of every model, one vector per approach, to confirm that the three approaches agree

 > unlist(lapply(glm_parallel, h2o.residual_deviance))
 > unlist(lapply(glm_weights, h2o.residual_deviance))
 > unlist(lapply(glm_subset, h2o.residual_deviance))
 
 ## Compare the model build time
 ## `end - start` picks its own units (seconds, minutes, ...), so every timing is converted to seconds before the values are combined.
 ## The table is printed at the end so the comparison is actually shown.

 > elapsed_secs = c(as.numeric(weightsTime, units = "secs"), as.numeric(subsetTime, units = "secs"), as.numeric(parallelTimes, units = "secs"))
 > comparison_table = data.frame(Time_Elapsed = elapsed_secs, row.names = c("Case_1", "Case_2", "Case_3"))
 > comparison_table
Advertisements

One thought on “Concurrent model building in H2O using Parallel”

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s