Here is example Scala code that shows how to use the new Sparkling Water package to set up one-hot encoding and stratified sampling:
// Sparkling Water example (intended for spark-shell, where `sc` is already defined):
// one-hot encode a categorical column, then split the frame into train/test
// subsets with a stratified split on the response column.
import org.apache.spark.h2o._
import water.Key
import water.etl.prims.advmath.AdvMath.StratifiedSplit
import water.etl.prims.mungers.Mungers.OneHotEncoder
import water.etl.prims.operators.Operators.Eq
import water.fvec.Frame
import water.fvec.H2OFrame
import java.net.URI

// Use the following lines if you decided to set up an external Sparkling Water cluster:
// val conf = new H2OConf(sc).setExternalClusterMode().useManualClusterStart().setCloudName("test")
// val hc = H2OContext.getOrCreate(sc, conf)

// OR use the following for the basic configuration.
val hc = H2OContext.getOrCreate(sc)

// Load the airlines data set; adjust this path to wherever the file lives on your machine.
val fr = new H2OFrame(new java.io.File("/Users/avkashchauhan/src/github.com/h2oai/h2o-3/smalldata/airlines/AirlinesTest.csv.zip"))

// One-hot encode the "Origin" column and combine the pivoted result with the original frame.
val frOH = OneHotEncoder(fr, "Origin")
fr.add(frOH)

// Build a single-column frame labelling each row "train" or "test", stratified on "IsDepDelayed".
// NOTE(review): 0.2 is presumably the test fraction and 123 the random seed -- confirm against
// the StratifiedSplit signature.
val trainTestCol = StratifiedSplit(fr, "IsDepDelayed", 0.2, 123)

// Select the rows of `fr` according to the true/false values of the one-column boolean frame `idx`.
// NOTE(review): a null column selector is assumed to keep every column -- confirm in the
// Frame.deepSlice javadoc.
val idx = Eq(trainTestCol, "train")
val train = fr.deepSlice(idx, null)

// Same selection for the rows labelled "test".
val idx2 = Eq(trainTestCol, "test")
val test = fr.deepSlice(idx2, null)

// Print the first 10 rows of the training subset.
println(train.toString(0L, 10))
That's it, enjoy!!