Unification of date and time data with Joda-Time in Spark

Here is a code snippet that first parses various kinds of date and time formats and then unifies them so that they can be processed by a data munging process.

  import org.apache.spark.sql.functions._
  import org.joda.time._
  import org.joda.time.format._
  import org.apache.spark.sql.expressions.Window

    val getHour = udf((dt:String) =>
      dt match {
        case null => None
        case s => {
          val fmt:DateTimeFormatter = DateTimeFormat.forPattern("MM/dd/yyyy hh:mm:ss aa")
          Some(fmt.parseDateTime(s).getHourOfDay)
        }
    })

    val getDT = udf((dt:String) =>
      dt match {
        case null => None
        case s => {
          val fmt:DateTimeFormatter = DateTimeFormat.forPattern("MM/dd/yyyy hh:mm:ss aa")
          Some(fmt.parseDateTime(s).getMillis / 1000.0  )
        }
      })

    // UDF for day of week
    val getDayOfWeek = udf((dt:String) => {
      dt match {
        case null => None
        case s => {
          val fmt:DateTimeFormatter = DateTimeFormat.forPattern("MM/dd/yyyy")
          Some(fmt.parseDateTime(s.split(" ")(0)).getDayOfWeek)
        }
      }
    })

    val getDate = udf((dt:String) => {
      dt match {
        case null => None
        case s => {
          Some(s.split(" ")(0))
        }
      }
    })

    val getDiffKey = udf((diff:Double) => {
      val threshold = 5 // 15 minutes   5 // 75%-tile seconds
      if (diff > threshold) {
        1  // tag as 2nd diff
      } else {
        0 // 1st diff
      }
    })

  val rawDF = sqlContext.read
      .format("com.databricks.spark.csv")
      .option("header", "true")
      .load("hdfs://mr-0xc5.0xdata.loc:8020/user/file.csv")

    var df = rawDF.withColumn("hourOfDay", getHour(rawDF.col("datetime")))
      df = df.withColumn("timestamp", getDT(df.col("datetime")))
      df = df.withColumn("dayOfWeek", getDayOfWeek(df.col("datetime")))
      df = df.withColumn("date", getDate(df.col("datetime")))
Advertisements

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s