Spark with H2O using rsparkling and sparklyr in R

You must have installed:

  • sparklyr
  • rsparkling

 

Here is the working script:

# Connect sparklyr to a local Spark 2.1.0 with Sparkling Water 2.1.6.
# The Sparkling Water version option is set before rsparkling is attached.
library(sparklyr)
options(rsparkling.sparklingwater.version = "2.1.6")
Sys.setenv(SPARK_HOME = "/Users/avkashchauhan/tools/spark-2.1.0-bin-hadoop2.6")
library(rsparkling)

# Close a connection left over from an earlier run. In a fresh R session `sc`
# does not exist yet, so an unguarded spark_disconnect(sc) would fail.
if (exists("sc") && inherits(sc, "spark_connection")) {
  spark_disconnect(sc)
}
sc <- spark_connect(master = "local", version = "2.1.0")

Testing the spark context:

sc
$master
[1] “local[8]”

$method
[1] “shell”

$app_name
[1] “sparklyr”

$config
$config$sparklyr.cores.local
[1] 8

$config$spark.sql.shuffle.partitions.local
[1] 8

$config$spark.env.SPARK_LOCAL_IP.local
[1] “127.0.0.1”

$config$sparklyr.csv.embedded
[1] “^1.*”

$config$`sparklyr.shell.driver-class-path`
[1] “”

attr(,”config”)
[1] “default”
attr(,”file”)
[1] “/Library/Frameworks/R.framework/Versions/3.4/Resources/library/sparklyr/conf/config-template.yml”

$spark_home
[1] “/Volumes/OSxexT/tools/spark-2.1.0-bin-hadoop2.6”

$backend
A connection with
description “->localhost:53374”
class “sockconn”
mode “wb”
text “binary”
opened “opened”
can read “yes”
can write “yes”

$monitor
A connection with
description “->localhost:8880”
class “sockconn”
mode “rb”
text “binary”
opened “opened”
can read “yes”
can write “yes”

$output_file
[1] “/var/folders/x7/331tvwcd6p17jj9zdmhnkpyc0000gn/T//RtmpIIVL8I/file6ba7b454325_spark.log”

$spark_context
<jobj[5]>
class org.apache.spark.SparkContext
org.apache.spark.SparkContext@159ba51c

$java_context
<jobj[6]>
class org.apache.spark.api.java.JavaSparkContext
org.apache.spark.api.java.JavaSparkContext@6b114a2d

$hive_context
<jobj[9]>
class org.apache.spark.sql.SparkSession
org.apache.spark.sql.SparkSession@2cd7fdf8

attr(,”class”)
[1] “spark_connection” “spark_shell_connection” “DBIConnection”
> h2o_context(sc, st)
Error in is.H2OFrame(x) : object ‘st’ not found
> h2o_context(sc, strict_version_check = FALSE)
<jobj[15]>
class org.apache.spark.h2o.H2OContext

Sparkling Water Context:
* H2O name: sparkling-water-avkashchauhan_1672148412
* cluster size: 1
* list of used nodes:
(executorId, host, port)
————————
(driver,127.0.0.1,54321)
————————

Open H2O Flow in browser: http://127.0.0.1:54321 (CMD + click in Mac OSX)

You can use the following command to launch H2O FLOW:

h2o_flow(sc, strict_version_check = FALSE)

Thats it, enjoy!!

 

Building GBM model in R and exporting POJO and MOJO model

Get the dataset:

Training:

http://h2o-training.s3.amazonaws.com/pums2013/adult_2013_train.csv.gz

Test:

http://h2o-training.s3.amazonaws.com/pums2013/adult_2013_test.csv.gz

Here is the script to build GBM grid model and export MOJO model:

# Train a GBM regression model on the adult_2013 data and export it both as a
# POJO (Java source) and as a MOJO (zip archive).
library(h2o)
h2o.init()

# Folder that holds the downloaded datasets and receives the exported models.
data_dir <- "/Users/avkashchauhan/learn"

# Importing Dataset
trainfile <- file.path(data_dir, "adult_2013_train.csv.gz")
adult_2013_train <- h2o.importFile(trainfile, destination_frame = "adult_2013_train")
testfile <- file.path(data_dir, "adult_2013_test.csv.gz")
adult_2013_test <- h2o.importFile(testfile, destination_frame = "adult_2013_test")

# Display Dataset
adult_2013_train
adult_2013_test

# Feature Engineering
# Keep a copy of the response column of the test set under its own key.
actual_log_wagp <- h2o.assign(adult_2013_test[, "LOG_WAGP"], key = "actual_log_wagp")

# Convert these columns to factors in both frames so that train and test
# agree on their types.
for (j in c("COW", "SCHL", "MAR", "INDP", "RELP", "RAC1P", "SEX", "POBP")) {
  adult_2013_train[[j]] <- as.factor(adult_2013_train[[j]])
  adult_2013_test[[j]] <- as.factor(adult_2013_test[[j]])
}
predset <- c(
  "RELP", "SCHL", "COW", "MAR", "INDP", "RAC1P", "SEX", "POBP",
  "AGEP", "WKHP", "LOG_CAPGAIN", "LOG_CAPLOSS"
)

# Building GBM Model:
# Note: despite the variable name this is a single GBM model, not a grid.
log_wagp_gbm_grid <- h2o.gbm(
  x = predset,
  y = "LOG_WAGP",
  training_frame = adult_2013_train,
  model_id = "GBMModel",
  distribution = "gaussian",
  max_depth = 5,
  ntrees = 110,
  validation_frame = adult_2013_test
)

log_wagp_gbm_grid

# Prediction
h2o.predict(log_wagp_gbm_grid, adult_2013_test)

# Download POJO Model:
# In the R API h2o.download_pojo() takes `get_jar`; `get_genmodel_jar` is an
# argument of h2o.download_mojo() only and is rejected as unused here.
h2o.download_pojo(log_wagp_gbm_grid, path = data_dir, get_jar = TRUE)

# Download MOJO model:
h2o.download_mojo(log_wagp_gbm_grid, path = data_dir, get_genmodel_jar = TRUE)

You will see GBMModel.java (the POJO model) and GBMModel.zip (the MOJO model), both named after the model_id, in the folder where you saved these models.

Thats it, enjoy!

 

Saving H2O models from R/Python API in Hadoop Environment

When you are using H2O in a clustered environment (e.g. Hadoop), the machine where h2o.saveModel() tries to write the model can be different from the machine running your client, and that is why you see the error “No such file or directory”. If you just give a path such as /tmp and then log in to the machine running the H2O node that your R session connected to, you will see the model stored there.
Here is a good example to understand it better:
Step [1] Starting Hadoop driver in EC2 environment as below:
[ec2-user@ip-10-0-104-179 ~]$ hadoop jar h2o-3.10.4.8-hdp2.6/h2odriver.jar -nodes 2 -mapperXmx 2g -output /usr/ec2-user/005
....
....
....
Open H2O Flow in your web browser: http://10.0.65.248:54323  <=== H2O is started.
Note: Above you can see that the hadoop command was run on IP address 10.0.104.179, whereas the node where the H2O server started is shown as 10.0.65.248.
Step [2] Connect R client with H2O
> h2o.init(ip = "10.0.65.248", port = 54323, strict_version_check = FALSE)
Note: I have used the IP address shown above to connect to the existing H2O cluster. However, the machine where I am running the R client is a different one; its IP address is 34.208.200.16.
Step [3]: Saving H2O model:
h2o.saveModel(my.glm, path = "/tmp", force = TRUE)
So when I save the model, it is saved on the 10.0.65.248 machine even though the R client was running on 34.208.200.16.
[ec2-user@ip-10-0-65-248 ~]$ ll /tmp/GLM*
-rw-r--r-- 1 yarn hadoop 90391 Jun 2 20:02 /tmp/GLM_model_R_1496447892009_1
So you need to make sure you have access to a folder on the machine where the H2O service is running, or you can save the model to HDFS, similar to the following:
h2o.saveModel(my.glm, path = "hdfs://ip-10-0-104-179.us-west-2.compute.internal/user/achauhan", force = TRUE)

Thats it, enjoy!!

Using H2O models into Java for scoring or prediction

This sample generates a GBM model with the H2O R library and then consumes the model in Java for prediction.

Here is R Script to generate sample model using H2O

# Train a GBM classifier on the built-in iris data and download it as a MOJO
# together with h2o-genmodel.jar.
library(h2o)
h2o.init()

# Target folder for the MOJO zip and the jar. It is passed explicitly to
# h2o.download_mojo() instead of changing the session's working directory.
output_dir <- "/tmp/resources/"

df <- iris
h2o_df <- as.h2o(df)
y <- "Species"
x <- c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")
model <- h2o.gbm(y = y, x = x, training_frame = h2o_df)
model
h2o.download_mojo(model, path = output_dir, get_genmodel_jar = TRUE)

Here is the Java code to use Model for prediction:

import hex.genmodel.easy.RowData;
import hex.genmodel.easy.EasyPredictModelWrapper;
import hex.genmodel.easy.prediction.*;
import hex.genmodel.MojoModel;

public class main {
    static void printIt(String message, MultinomialModelPrediction p) {
        System.out.println("");
        System.out.println(message);
        for (int i = 0; i < p.classProbabilities.length; i++) {
            if (i > 0) {
                System.out.print(",");
            }
            System.out.print(p.classProbabilities[i]);
        }
        System.out.println("");
    }
    public static void main(String[] args) throws Exception {
        EasyPredictModelWrapper model_orig = new EasyPredictModelWrapper(MojoModel.load("unzipped_orig"));
        {
            RowData row = new RowData();
            row.put("Sepal.Length", "1");
            row.put("Sepal.Width", "1");
            row.put("Petal.Length", "1");
            row.put("Petal.Width", "1");
            MultinomialModelPrediction p = model_orig.predictMultinomial(row);
            printIt("All 1s, orig", p);
        }
        {
            RowData row = new RowData();
            MultinomialModelPrediction p = model_orig.predictMultinomial(row);
            printIt("All NAs, orig", p);
        }
        {
            RowData row = new RowData();
            row.put("Sepal.Length", "1");
            row.put("sepwid", "1");
            row.put("Petal.Length", "1");
            row.put("Petal.Width", "1");

            MultinomialModelPrediction p = model_orig.predictMultinomial(row);
            printIt("Sepal width NA, orig", p);
        }
        // -------------------
        EasyPredictModelWrapper model_modified = new EasyPredictModelWrapper(MojoModel.load("unzipped_modified"));
        {
            RowData row = new RowData();
            row.put("Sepal.Length", "1");
            row.put("sepwid", "1");
            row.put("Petal.Length", "1");
            row.put("Petal.Width", "1");
            MultinomialModelPrediction p = model_modified.predictMultinomial(row);
            printIt("All 1s (with sepwid instead of Sepal.Width), modified", p);
        }
        {
            RowData row = new RowData();
            MultinomialModelPrediction p = model_modified.predictMultinomial(row);
            printIt("All NAs, modified", p);
        }
        {
            RowData row = new RowData();
            row.put("Sepal.Length", "1");
            row.put("Sepal.Width", "1");
            row.put("Petal.Length", "1");
            row.put("Petal.Width", "1");
            MultinomialModelPrediction p = model_modified.predictMultinomial(row);
            printIt("Sepal width NA (with Sepal.Width instead of sepwid), modified", p);
        }
    }
}

After the MOJO is downloaded you can see the model.ini as below:

[info]
h2o_version = 3.10.4.8
mojo_version = 1.20
license = Apache License Version 2.0
algo = gbm
algorithm = Gradient Boosting Machine
endianness = LITTLE_ENDIAN
category = Multinomial
uuid = 7712689150025610456
supervised = true
n_features = 4
n_classes = 3
n_columns = 5
n_domains = 1
balance_classes = false
default_threshold = 0.5
prior_class_distrib = [0.3333333333333333, 0.3333333333333333, 0.3333333333333333]
model_class_distrib = [0.3333333333333333, 0.3333333333333333, 0.3333333333333333]
timestamp = 2017-05-23T08:19:42.961-07:00
n_trees = 50
n_trees_per_class = 3
distribution = multinomial
init_f = 0.0
offset_column = null

[columns]
Sepal.Length
Sepal.Width
Petal.Length
Petal.Width
Species

[domains]
4: 3 d000.txt

If you decide to modify model.ini by renaming a column (e.g. Sepal.Width to sepwid), you can do it as below:

[info]
h2o_version = 3.10.4.8
mojo_version = 1.20
license = Apache License Version 2.0
algo = gbm
algorithm = Gradient Boosting Machine
endianness = LITTLE_ENDIAN
category = Multinomial
uuid = 7712689150025610456
supervised = true
n_features = 4
n_classes = 3
n_columns = 5
n_domains = 1
balance_classes = false
default_threshold = 0.5
prior_class_distrib = [0.3333333333333333, 0.3333333333333333, 0.3333333333333333]
model_class_distrib = [0.3333333333333333, 0.3333333333333333, 0.3333333333333333]
timestamp = 2017-05-23T08:19:42.961-07:00
n_trees = 50
n_trees_per_class = 3
distribution = multinomial
init_f = 0.0
offset_column = null

[columns]
Sepal.Length
sepwid
Petal.Length
Petal.Width
Species

[domains]
4: 3 d000.txt

Now we can run the Java commands to test the code as below:

$ java -cp .:h2o-genmodel.jar main

All 1s, orig
0.7998234476072545,0.15127335891610785,0.04890319347663747

All NAs, orig
0.009344361534466918,0.9813250958541073,0.009330542611425827

Sepal width NA, orig
0.7704658301004306,0.19829292017147707,0.03124124972809238

All 1s (with sepwid instead of Sepal.Width), modified
0.7998234476072545,0.15127335891610785,0.04890319347663747

All NAs, modified
0.009344361534466918,0.9813250958541073,0.009330542611425827

Sepal width NA (with Sepal.Width instead of sepwid), modified
0.7704658301004306,0.19829292017147707,0.03124124972809238
 Thats it, enjoy!!

Using RESTful API to get POJO and MOJO models in H2O

 

CURL API for Listing Models:

http://<hostname>:<port>/3/Models/

CURL API for listing a specific model (for its POJO source use /3/Models.java/model_name):

http://<hostname>:<port>/3/Models/model_name

List Specific MOJO Model:

http://<hostname>:<port>/3/Models/glm_model/mojo

Here is an example:

# List all models known to the H2O instance.
curl -X GET "http://localhost:54323/3/Models"
# Save the details of one model to a file of your choice.
# ">" is used so that a re-run overwrites the file instead of appending to it.
curl -X GET "http://localhost:54323/3/Models/deeplearning_model" > NAME_IT

# Download the POJO (Java source); this is served from /3/Models.java/<model>,
# while /3/Models/<model> returns the model details rather than Java code.
curl -X GET "http://localhost:54323/3/Models.java/deeplearning_model" > dl_model.java
# Download the MOJO archive.
curl -X GET "http://localhost:54323/3/Models/glm_model/mojo" > myglm_mojo.zip

Thats it, enjoy!!

Starter script for rsparkling (H2O on Spark with R)

The rsparkling R package is an extension package for sparklyr that creates an R front-end for the Sparkling Water Spark package from H2O. This provides an interface to H2O’s high-performance, distributed machine learning algorithms on Spark, using R. Visit the GitHub project: https://github.com/h2oai/rsparkling

You must have the following package installed in your R environment:

You must have the latest Sparkling Water package downloaded and unzipped locally:

I am using the following packages in my environment:

  • Spark 2.1
  • Sparkling Water 2.1.8
  • sparklyr 0.4.4
  • rsparkling 0.2.0

Now here is the rsparkling script to create the cluster locally:

# Start a local Spark connection with Sparkling Water, open H2O Flow, and
# disconnect again.

# Location of the Sparkling Water assembly jar and of the Spark installation.
options(rsparkling.sparklingwater.location = "/tmp/sparkling-water-assembly_2.11-2.1.8-all.jar")
Sys.setenv(SPARK_HOME = "/usr/hdp/current/spark2-client/")

library(sparklyr)
library(rsparkling)

# Executor settings passed to spark_connect().
spark_conf <- spark_config()
spark_conf[["spark.executor.cores"]] <- 4
spark_conf[["spark.executor.memory"]] <- "4G"

sc <- spark_connect(master = "local", config = spark_conf, version = "2.1.0")
print(sc)

# Create the H2O context on the connection, then open the Flow UI.
h2o_context(sc, strict_version_check = FALSE)
h2o_flow(sc, strict_version_check = FALSE)

spark_disconnect(sc)

Now here is the rsparkling script to create Spark cluster with Yarn:

# Start a Spark connection on YARN (client mode) with Sparkling Water, open
# H2O Flow, and disconnect again.

# Location of the Sparkling Water assembly jar and of the Spark installation.
options(rsparkling.sparklingwater.location = "/tmp/sparkling-water-assembly_2.11-2.1.8-all.jar")
Sys.setenv(SPARK_HOME = "/usr/hdp/current/spark2-client/")
library(sparklyr)
library(rsparkling)

# Executor settings passed to spark_connect(): two executors, each configured
# with 4 cores and 4G of memory.
config <- spark_config()
config$spark.executor.cores <- 4
config$spark.executor.memory <- "4G"
config$spark.executor.instances <- 2
sc <- spark_connect(master = "yarn-client", config = config, version = "2.1.0")
print(sc)

# Create the H2O context on the connection, then open the Flow UI.
h2o_context(sc, strict_version_check = FALSE)
h2o_flow(sc, strict_version_check = FALSE)
spark_disconnect(sc)

Thats it, Enjoy!!

Installing R on Redhat 7 (EC2 RHEL 7)

Check your machine version:

$ cat /etc/redhat-release
Red Hat Enterprise Linux Server release 7.3 (Maipo)

Now let's update the RPM repo details:

$ sudo su -c 'rpm -Uvh http://mirror.sfo12.us.leaseweb.net/epel/7/x86_64/e/epel-release-7-9.noarch.rpm'
$ sudo yum update

Make sure all dependencies are installed individually:

$ wget http://mirror.centos.org/centos/7/os/x86_64/Packages/blas-devel-3.4.2-5.el7.x86_64.rpm
$ sudo yum localinstall blas-devel-3.4.2-5.el7.x86_64.rpm

$ wget http://mirror.centos.org/centos/7/os/x86_64/Packages/blas-3.4.2-5.el7.x86_64.rpm
$ sudo yum localinstall blas-3.4.2-5.el7.x86_64.rpm

$ wget http://mirror.centos.org/centos/7/os/x86_64/Packages/lapack-devel-3.4.2-5.el7.x86_64.rpm
$ sudo yum localinstall lapack-devel-3.4.2-5.el7.x86_64.rpm

$ wget http://mirror.centos.org/centos/7/os/x86_64/Packages/texinfo-tex-5.1-4.el7.x86_64.rpm
$ sudo yum install texinfo-tex-5.1-4.el7.x86_64.rpm

$ wget http://mirror.centos.org/centos/7/os/x86_64/Packages/texlive-epsf-svn21461.2.7.4-38.el7.noarch.rpm
$ sudo yum install texlive-epsf-svn21461.2.7.4-38.el7.noarch.rpm

Finally install R now:

$ sudo yum install R

Thats it.