import org.apache.spark.sql.{SparkSession, DataFrame}
import org.apache.spark.sql.functions._

object SparkChargePoints {

  val input = "data/input/electric-chargepoints-2017.csv"
  val output = "data/output/chargepoints-2017-analysis"

  val spark = SparkSession.builder
    .master("local[*]")
    .appName("SparkChargePoints")
    .getOrCreate()

  import spark.implicits._

  def extract(): DataFrame = {
    // Read the CSV from the input path with its header row and inferred column types
    spark.read.option("header", "true").option("inferSchema", "true").csv(input)
  }

  def transform(df: DataFrame): DataFrame = {
    df.withColumn("PluginDuration", col("PluginDuration").cast("double"))
      .groupBy("CPID")
      .agg(max("PluginDuration").alias("max_duration"), avg("PluginDuration").alias("avg_duration"))
  }

  def load(df: DataFrame): Unit = {
    // Writing a DataFrame returns Unit, so the signature is Unit rather than DataFrame
    df.write.mode("overwrite").parquet(output)
  }

  def main(args: Array[String]): Unit = {
    load(transform(extract()))
  }
}

Could anyone complete this code? The requirement is to find, for each unique CPID, the duration of the longest plugin and the average plugin duration, where the duration should be calculated from the start date, start time, end date and end time.
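A minimal sketch of a transform that derives the duration from the raw date and time columns rather than PluginDuration, assuming the file exposes StartDate, StartTime, EndDate and EndTime and that the combined values parse with Spark's default yyyy-MM-dd HH:mm:ss timestamp format (both are assumptions about the CSV, and the name transformFromTimestamps is just illustrative):

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._

def transformFromTimestamps(df: DataFrame): DataFrame = {
  df
    // Combine the date and time columns and parse them as timestamps
    .withColumn("start_ts", to_timestamp(concat_ws(" ", col("StartDate"), col("StartTime"))))
    .withColumn("end_ts", to_timestamp(concat_ws(" ", col("EndDate"), col("EndTime"))))
    // Duration in hours = difference of the two timestamps in seconds / 3600
    .withColumn("duration_hours",
      (unix_timestamp(col("end_ts")) - unix_timestamp(col("start_ts"))) / 3600.0)
    .groupBy("CPID")
    .agg(
      round(max("duration_hours"), 2).alias("max_duration"),
      round(avg("duration_hours"), 2).alias("avg_duration")
    )
}

Swapping this in for transform in main, i.e. load(transformFromTimestamps(extract())), keeps the rest of the pipeline unchanged; if the dates or times use a different format, pass an explicit pattern to to_timestamp.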
