Here is the full code snippet that shows how to build multiple models concurrently using the H2O backend and R's parallel package:

> library(h2o)
> h2o.init(nthreads = -1)
## To keep things simple, use only the first 300 rows
## Upload the prostate data set that ships with the h2o package and keep rows 1-300.
> prostate_csv = system.file("extdata", "prostate.csv", package = "h2o")
> prostate.hex = h2o.uploadFile(path = prostate_csv, destination_frame = "prostate.hex")
> prostate.hex = prostate.hex[1:300, ]
## Add three 0/1 indicator columns; Fold_k is 1 on the k-th block of 100 consecutive rows.
> prostate.hex$Fold_1 = as.h2o(data.frame(Fold_1 = rep(c(1, 0, 0), each = 100)))
> prostate.hex$Fold_2 = as.h2o(data.frame(Fold_2 = rep(c(0, 1, 0), each = 100)))
> prostate.hex$Fold_3 = as.h2o(data.frame(Fold_3 = rep(c(0, 0, 1), each = 100)))
## Case 1: Use a weights column in GLM, which effectively runs multiple GLM models on the same frame (so no data replication)
## Train one GLM per fold on the full 300-row frame, using the fold's 0/1 column as weights.
> start = Sys.time()
## lapply returns the list of models directly instead of growing it with c() inside a loop.
## The case-specific model_id prefix keeps these models from sharing IDs with models
## trained elsewhere in this script (models with the same ID replace each other in the cluster).
> glm_weights = lapply(1:3, function(i) {
h2o.glm(x = c(3:9), y = 2, training_frame = prostate.hex,
        weights_column = paste0("Fold_", i),
        model_id = paste0("glm_weights_Fold_", i))
})
> end = Sys.time()
> weightsTime = end - start
> weightsTime
## Case 2: Subset the H2OFrame and run GLM on each subset in a for loop
## Split the frame into three 100-row H2OFrames and train one GLM on each, sequentially.
> prostate_1 = prostate.hex[1:100, ]
> prostate_2 = prostate.hex[101:200, ]
> prostate_3 = prostate.hex[201:300, ]
## list() states the intent (a list of frames, indexed with [[i]]) instead of relying on c().
> prostate = list(prostate_1, prostate_2, prostate_3)
> start = Sys.time()
## lapply returns the list of models directly instead of growing it with c() inside a loop.
## The case-specific model_id prefix keeps these models from sharing IDs with models
## trained elsewhere in this script (models with the same ID replace each other in the cluster).
> glm_subset = lapply(1:3, function(i) {
h2o.glm(x = c(3:9), y = 2, training_frame = prostate[[i]],
        model_id = paste0("glm_subset_Fold_", i))
})
> end = Sys.time()
> subsetTime = end - start
> subsetTime
## Case 3: Use the parallel package to send all the GLM function calls over to H2O, and let H2O handle how to run the multiple calls optimally
> library(parallel)
> start = Sys.time()
## mclapply issues the three h2o.glm calls through the parallel package, one per subset frame.
## The function body simply returns the model (no throwaway assignment), and the case-specific
## model_id prefix keeps these models from sharing IDs with models trained elsewhere in this
## script (models with the same ID replace each other in the cluster).
> glm_parallel = mclapply(1:3, function(i) {
h2o.glm(x = c(3:9), y = 2, training_frame = prostate[[i]],
        model_id = paste0("glm_parallel_Fold_", i))
})
> end = Sys.time()
> parallelTimes = end - start
> parallelTimes
### Quick check to make sure all three approaches return the same residual deviances
## One residual deviance per model, printed as one vector per case.
> unlist(lapply(glm_parallel, h2o.residual_deviance))
> unlist(lapply(glm_weights, h2o.residual_deviance))
> unlist(lapply(glm_subset, h2o.residual_deviance))
## Compare the model build times
## Convert each elapsed time to seconds explicitly: `end - start` picks its units
## automatically, so the three difftime values are not guaranteed to share a unit.
> elapsed_secs = c(as.numeric(weightsTime, units = "secs"),
                 as.numeric(subsetTime, units = "secs"),
                 as.numeric(parallelTimes, units = "secs"))
## Time_Elapsed is in seconds; print the table so the comparison is actually shown.
> comparison_table = data.frame(Time_Elapsed = elapsed_secs, row.names = c("Case_1", "Case_2", "Case_3"))
> comparison_table

### Like this:

Like Loading...