Spark with H2O using rsparkling and sparklyr in R

You must have installed:

  • sparklyr
  • rsparkling


Here is the working script:

library(sparklyr)
options(rsparkling.sparklingwater.version = "2.1.6")
Sys.setenv(SPARK_HOME = "/Users/avkashchauhan/tools/spark-2.1.0-bin-hadoop2.6")
library(rsparkling)
spark_disconnect(sc) # only needed if an earlier connection is still open
sc <- spark_connect(master = "local", version = "2.1.0")

Testing the Spark context:

sc
$master
[1] "local[8]"

$method
[1] "shell"

$app_name
[1] "sparklyr"

$config
$config$sparklyr.cores.local
[1] 8

$config$spark.sql.shuffle.partitions.local
[1] 8

$config$spark.env.SPARK_LOCAL_IP.local
[1] "127.0.0.1"

$config$sparklyr.csv.embedded
[1] "^1.*"

$config$`sparklyr.shell.driver-class-path`
[1] ""

attr(,"config")
[1] "default"
attr(,"file")
[1] "/Library/Frameworks/R.framework/Versions/3.4/Resources/library/sparklyr/conf/config-template.yml"

$spark_home
[1] "/Volumes/OSxexT/tools/spark-2.1.0-bin-hadoop2.6"

$backend
A connection with
description "->localhost:53374"
class "sockconn"
mode "wb"
text "binary"
opened "opened"
can read "yes"
can write "yes"

$monitor
A connection with
description "->localhost:8880"
class "sockconn"
mode "rb"
text "binary"
opened "opened"
can read "yes"
can write "yes"

$output_file
[1] "/var/folders/x7/331tvwcd6p17jj9zdmhnkpyc0000gn/T//RtmpIIVL8I/file6ba7b454325_spark.log"

$spark_context
<jobj[5]>
class org.apache.spark.SparkContext
org.apache.spark.SparkContext@159ba51c

$java_context
<jobj[6]>
class org.apache.spark.api.java.JavaSparkContext
org.apache.spark.api.java.JavaSparkContext@6b114a2d

$hive_context
<jobj[9]>
class org.apache.spark.sql.SparkSession
org.apache.spark.sql.SparkSession@2cd7fdf8

attr(,"class")
[1] "spark_connection" "spark_shell_connection" "DBIConnection"
Now create the H2O context. The first call below fails because of a stray argument (st is not defined); the second call is the correct one:

> h2o_context(sc, st)
Error in is.H2OFrame(x) : object 'st' not found
> h2o_context(sc, strict_version_check = FALSE)
<jobj[15]>
class org.apache.spark.h2o.H2OContext

Sparkling Water Context:
* H2O name: sparkling-water-avkashchauhan_1672148412
* cluster size: 1
* list of used nodes:
(executorId, host, port)
------------------------
(driver,127.0.0.1,54321)
------------------------

Open H2O Flow in browser: http://127.0.0.1:54321 (CMD + click in Mac OSX)

You can use the following command to launch H2O Flow:

h2o_flow(sc, strict_version_check = FALSE)
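
Once the H2O context is up, you can also move data between Spark and H2O for modeling. Below is a minimal sketch, assuming the sc connection from above (the mtcars copy and the frame names are just illustrative):

# Copy a small R data frame into Spark (illustrative)
mtcars_tbl <- copy_to(sc, mtcars, "mtcars", overwrite = TRUE)

# Convert the Spark DataFrame into an H2O frame for use with h2o functions
mtcars_hf <- as_h2o_frame(sc, mtcars_tbl, strict_version_check = FALSE)
mtcars_hf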

That's it, enjoy!!


Building a GBM model in R and exporting POJO and MOJO models

Get the dataset:

Training:

http://h2o-training.s3.amazonaws.com/pums2013/adult_2013_train.csv.gz

Test:

http://h2o-training.s3.amazonaws.com/pums2013/adult_2013_test.csv.gz
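
If you prefer to fetch the datasets directly from R, here is a minimal sketch (the destination file names are illustrative):

# Download the training and test sets into the working directory
download.file("http://h2o-training.s3.amazonaws.com/pums2013/adult_2013_train.csv.gz",
              "adult_2013_train.csv.gz")
download.file("http://h2o-training.s3.amazonaws.com/pums2013/adult_2013_test.csv.gz",
              "adult_2013_test.csv.gz")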

Here is the script to build the GBM model and export both POJO and MOJO models:

library(h2o)
h2o.init()

# Importing Dataset
trainfile <- file.path("/Users/avkashchauhan/learn/adult_2013_train.csv.gz")
adult_2013_train <- h2o.importFile(trainfile, destination_frame = "adult_2013_train")
testfile <- file.path("/Users/avkashchauhan/learn/adult_2013_test.csv.gz")
adult_2013_test <- h2o.importFile(testfile, destination_frame = "adult_2013_test")

# Display Dataset
adult_2013_train
adult_2013_test

# Feature Engineering
actual_log_wagp <- h2o.assign(adult_2013_test[, "LOG_WAGP"], key = "actual_log_wagp")

for (j in c("COW", "SCHL", "MAR", "INDP", "RELP", "RAC1P", "SEX", "POBP")) {
  adult_2013_train[[j]] <- as.factor(adult_2013_train[[j]])
  adult_2013_test[[j]] <- as.factor(adult_2013_test[[j]])
}
predset <- c("RELP", "SCHL", "COW", "MAR", "INDP", "RAC1P", "SEX", "POBP", "AGEP", "WKHP", "LOG_CAPGAIN", "LOG_CAPLOSS")

# Building GBM Model:
log_wagp_gbm_grid <- h2o.gbm(x = predset,
                             y = "LOG_WAGP",
                             training_frame = adult_2013_train,
                             model_id = "GBMModel",
                             distribution = "gaussian",
                             max_depth = 5,
                             ntrees = 110,
                             validation_frame = adult_2013_test)

log_wagp_gbm_grid

# Prediction 
h2o.predict(log_wagp_gbm_grid, adult_2013_test)

# Download POJO Model:
h2o.download_pojo(log_wagp_gbm_grid, "/Users/avkashchauhan/learn", get_genmodel_jar = TRUE)

# Download MOJO model:
h2o.download_mojo(log_wagp_gbm_grid, "/Users/avkashchauhan/learn", get_genmodel_jar = TRUE)

You will see GBM_model.java (the POJO model) and GBM_model.zip (the MOJO model), along with h2o-genmodel.jar, at the location where you saved these models.
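
To sanity-check the exported MOJO outside of H2O, you can score a CSV on the command line with the downloaded h2o-genmodel.jar. A minimal sketch, assuming you run it from the save location and that adult_2013_test.csv is an uncompressed copy of the test set (file names are illustrative):

$ java -cp h2o-genmodel.jar hex.genmodel.tools.PredictCsv --mojo GBM_model.zip --input adult_2013_test.csv --output predictions.csv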

That's it, enjoy!


Saving H2O models from the R/Python API in a Hadoop environment

When you are using H2O in a clustered environment, i.e. Hadoop, the machine where h2o.saveModel() tries to write the model can be different from the machine where your R client runs, and that is why you see the error "No such file or directory". If you just give a path, i.e. /tmp, and visit the machine where the H2O connection was initiated from R, you will see the model stored there.
Here is a good example to understand it better:
Step [1]: Start the Hadoop driver in an EC2 environment as below:
[ec2-user@ip-10-0-104-179 ~]$ hadoop jar h2o-3.10.4.8-hdp2.6/h2odriver.jar -nodes 2 -mapperXmx 2g -output /usr/ec2-user/005
....
....
....
Open H2O Flow in your web browser: http://10.0.65.248:54323  <=== H2O is started.
Note: Above you can see that the hadoop command was run on IP address 10.0.104.179, however the node where the H2O server runs is shown as 10.0.65.248.
Step [2]: Connect the R client to H2O:
> h2o.init(ip = "10.0.65.248", port = 54323, strict_version_check = FALSE)
Note: I used the IP address shown above to connect to the existing H2O cluster. However, the machine where I am running the R client is different; its IP address is 34.208.200.16.
Step [3]: Save the H2O model:
h2o.saveModel(my.glm, path = "/tmp", force = TRUE)
So when I save the model, it is stored on the 10.0.65.248 machine, even though the R client was running at 34.208.200.16.
[ec2-user@ip-10-0-65-248 ~]$ ll /tmp/GLM*
-rw-r--r-- 1 yarn hadoop 90391 Jun 2 20:02 /tmp/GLM_model_R_1496447892009_1
So you need to make sure you have access to a folder on the machine where the H2O service is running, or you can save the model to HDFS, similar to the example below:
h2o.saveModel(my.glm, path = "hdfs://ip-10-0-104-179.us-west-2.compute.internal/user/achauhan", force = TRUE)
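
Later you can load a saved model back into H2O with h2o.loadModel(), passing the same path. A minimal sketch (the path below matches the listing above):

# Load the saved GLM model back into the H2O cluster
my.glm.loaded <- h2o.loadModel("/tmp/GLM_model_R_1496447892009_1")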

That's it, enjoy!!

Using RESTful API to get POJO and MOJO models in H2O


CURL API for listing all models:

http://<hostname>:<port>/3/Models/

CURL API for listing a specific model:

http://<hostname>:<port>/3/Models/model_name

CURL API for downloading a specific POJO model (note the /3/Models.java endpoint, which returns the Java source):

http://<hostname>:<port>/3/Models.java/model_name

CURL API for downloading a specific MOJO model:

http://<hostname>:<port>/3/Models/model_name/mojo

Here is an example:

curl -X GET "http://localhost:54323/3/Models"
curl -X GET "http://localhost:54323/3/Models/deeplearning_model" > NAME_IT

curl -X GET "http://localhost:54323/3/Models.java/deeplearning_model" > dl_model.java
curl -X GET "http://localhost:54323/3/Models/glm_model/mojo" > myglm_mojo.zip
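
If you also need the h2o-genmodel.jar used for POJO/MOJO scoring, a running H2O server serves it over REST as well. A minimal sketch, assuming the same host and port as above:

curl -X GET "http://localhost:54323/3/h2o-genmodel.jar" > h2o-genmodel.jar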

That's it, enjoy!!

Creating Partial Dependency Plot (PDP) in H2O

Starting from H2O 3.10.0.8, H2O added the partial dependence plot, which has a Java backend to do the multi-scoring of the dataset with the model. This makes creating a PDP much faster.

To get a PDP in H2O, you need the model and the original dataset used to generate the model. Here are a few ways to create a PDP:

If you want to generate predictions on a single column:

response = h2o.predict(model, data.pdp[, column_name])

To generate predictions on the original dataset:

response = h2o.predict(model, data.pdp)

If you want to build the PDP directly from the model and dataset without using the PDP API, you can use the following code:
model = prostate.gbm
column_name = "AGE"
data.pdp = data.hex
# Use the 5%-100% quantiles of the column as the grid of values to scan
bins = unique(h2o.quantile(data.hex[, column_name], probs = seq(0.05, 1, 0.05)))
mean_responses = c()

for (bin in bins) {
  # Fix the column of interest to the current value
  data.pdp[, column_name] = bin
  # Score the model and average the predictions
  # (if the model uses more predictors than this column,
  #  predict on the full frame instead: h2o.predict(model, data.pdp))
  response = h2o.predict(model, data.pdp[, column_name])
  mean_response = mean(response[, ncol(response)])
  mean_responses = c(mean_responses, mean_response)
}

pdp_manual = data.frame(AGE = bins, mean_response = mean_responses)
plot(pdp_manual, type = "l")
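
For comparison, the built-in PDP API can produce the same kind of plot in a single call. A minimal sketch, assuming the same prostate.gbm model and data.hex frame as above:

# Built-in partial dependence plot for the AGE column
pdp <- h2o.partialPlot(object = prostate.gbm, data = data.hex, cols = "AGE")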
That's it, enjoy!!

Grid Search for Naive Bayes in R using H2O

Here is an R code sample showing how to perform a grid search for the Naive Bayes algorithm using the H2O machine learning platform:

# H2O
library(h2o)
library(ggplot2)
library(data.table)

# Initialize the cluster with all available threads and 2 GB of memory
h2o.init(nthreads = -1, max_mem_size = "2g")

# Required variables (training and testing are pre-existing R data frames)
train.h2o <- as.h2o(training)
test.h2o <- as.h2o(testing)
names(train.h2o)
str(train.h2o)

# Response column index and predictor column indices
y <- 4
x <- c(5:16)
 
# Specify the list of hyperparameters
hyper_params <- list(
  laplace = c(0, 0.5, 1, 2, 3)
)

# Candidate thresholds (defined here but not used in the grid below)
threshold <- c(0.001, 0.00001, 0.0000001)
 
# Perform the grid search
grid_id <- "dl_grid"
model_bayes_grid <- h2o.grid(
  algorithm = "naivebayes", # name of the algorithm
  grid_id = grid_id,
  training_frame = train.h2o,
  validation_frame = test.h2o,
  x = x,
  y = y,
  hyper_params = hyper_params
)
 
# Find the best model and evaluate its performance
stopping_metric <- 'accuracy'
sorted_models <- h2o.getGrid(
  grid_id = grid_id,
  sort_by = stopping_metric,
  decreasing = TRUE
)

best_model <- h2o.getModel(sorted_models@model_ids[[1]])
best_model

h2o.confusionMatrix(best_model, valid = TRUE, metrics = 'accuracy')
 

auc <- h2o.auc(best_model, valid = TRUE)
fpr <- h2o.fpr( h2o.performance(best_model, valid = TRUE) )[['fpr']]
tpr <- h2o.tpr( h2o.performance(best_model, valid = TRUE) )[['tpr']]
ggplot( data.table(fpr = fpr, tpr = tpr), aes(fpr, tpr) ) +
 geom_line() + theme_bw()+ggtitle( sprintf('AUC: %f', auc) )
 

# To obtain the regularization parameter (laplace), do the following:
best_model@parameters
best_model@parameters$laplace

That's it, enjoy!!

Tips for building the H2O and Deep Water source code

Get the source code:
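
A minimal sketch, assuming you build from the public GitHub repositories (clone whichever you need):

$ git clone https://github.com/h2oai/h2o-3.git
$ git clone https://github.com/h2oai/deepwater.git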

Building the source code without tests:

Build the source code without tests (for both the H2O-3 and Deep Water sources):

$ ./gradlew build -x test

H2O tests use various small and large data files, which you can download separately depending on the disk space available on your machine. If you decide to download the large datasets, they will take a good amount of disk space.

To download all the large test data files:

$ ./gradlew syncBigdataLaptop

To download all the small test data files:

$ ./gradlew syncSmalldata

Using with IntelliJ:

Pull the source code, import it as a project, and use Gradle as the build system. Once the project is loaded, if you just want to do a test run, select the following:

h2o-app > src > main > java > water > H2OApp


That's it, enjoy!!