需求: 将数据表格中的单列拆分成多行
解决方法: 对 DataFrame 使用 explode。explode 可将 array 类型的列拆分成多行;udf 可通过自定义函数定制数据的处理逻辑,最后生成 array 类型的列。
代码示例:
import org.apache.spark.sql.functions.{udf, array, explode, col}
/**
 * One element of the array produced by the usage-splitting UDF.
 *
 * @param date  date of the entry, kept as a string
 * @param usage usage value attributed to that date
 */
case class Result(date: String, usage: Double)
/**
 * Spark UDF that turns one (datediff, startdate, usage) triple into an array of
 * `Result` entries, suitable for a later `explode`.
 *
 * Lambda arguments:
 *  - `datediff`:  day count used both as the loop bound and as the divisor
 *  - `startdate`: start date as a string, parsed with the enclosing scope's `format`
 *  - `usage`:     total usage to distribute
 *
 * When `datediff` equals 32, one `Result` is produced for each day offset in
 * `0 to datediff`, starting at `startdate`, each carrying `usage / datediff`.
 * Otherwise a single `Result(startdate, usage)` is returned.
 *
 * NOTE(review): `format` and `DateTime` are defined outside this block; presumably
 * a `java.text.DateFormat` and Joda-Time's `DateTime` - confirm.
 * NOTE(review): the literal 32 looks specific to the sample data, and the inclusive
 * range yields `datediff + 1` entries while each share is `usage / datediff` -
 * verify both against the intended semantics of `datediff`.
 */
def splitUsage = udf { (datediff: Integer, startdate: String, usage: Integer) =>
  if (datediff == 32) {
    val date = new DateTime(format.parse(startdate))
    // Offset by the loop index so every element gets its own day; a constant
    // offset would stamp all elements with the same date.
    (for (i <- 0 to datediff)
      yield Result(format.format(date.plusDays(i).toDate()),
        usage.toDouble / datediff.toDouble)).toArray
  } else {
    Array(Result(startdate, usage.toDouble))
  }
}
// Attach the per-day array produced by splitUsage as a new column.
// (String literals use plain ASCII double quotes; typographic quotes do not compile.)
val df2 = df.withColumn("dayusage", splitUsage($"datediff", $"startdate", $"usage"))
// Turn each array element into its own row, keeping all existing columns.
val df3 = df2.select($"*", explode($"dayusage"))
// Pull the struct fields of the exploded column (read back here as "col") up into
// top-level "date" and "usage" columns.
val result = df3.select($"Id", $"startdate", $"enddate", $"datediff", $"did",
  col("col")("date").alias("date"), col("col")("usage").alias("usage"))