Analysis in Scala.
Read the JSON files produced by JsonSink directly into Spark DataFrames.
import org.apache.spark.sql.functions._
// Root of the JsonSink output tree on DBFS.
val sparkPath = "dbfs:/perfgazer/jsonsink/"

// Loads one report type across every cluster/date/application partition.
// basePath keeps clusterName, date and applicationId as columns.
def readReports(reportKind: String) =
  spark.read
    .option("basePath", sparkPath)
    .json(sparkPath + s"clusterName=*/date=*/applicationId=*/$reportKind-reports-*.json")

val dfJobsReports   = readReports("job")
val dfStagesReports = readReports("stage")
val dfTasksReports  = readReports("task")
// Reconcile the three report levels: flatten each job report to one row
// per stage, then attach the matching stage- and task-level reports.
val dfTasks = {
  // One row per (job, stage); the exploded element becomes "stageId".
  val jobsPerStage = dfJobsReports
    .withColumn("stageId", explode(col("stages")))
    .drop("stages")
  // Equi-join keys shared by all three report types.
  val joinKeys = Seq("date", "applicationId", "stageId")
  jobsPerStage
    .join(dfStagesReports, joinKeys)
    .join(dfTasksReports, joinKeys)
}
display(dfTasks)