
Commit ef8b15f

[#553] Implement vacuuming and compaction on delta parallel writer
# New features and improvements

Parallel write with delta now takes 4 more (optional) configuration keys:

* compactFrequency: Int
* compactNumFile: Int
* vacuumFrequency: Int
* retentionHours: Int

The behaviour is the following:

When performing the write operation of the micro-batch, every $compactFrequency micro-batches (i.e. if the batch id % $compactFrequency == 0), the whole table is rewritten, repartitioning it into $compactNumFile partitions (i.e. output files).

When performing the write operation of the micro-batch, every $vacuumFrequency micro-batches (i.e. if the batch id % $vacuumFrequency == 0), the table is vacuumed, passing $retentionHours as the retentionHours parameter of the [vacuum function](https://docs.delta.io/latest/delta-utility.html#-delta-vacuum).

$compactFrequency and $compactNumFile must be either both set or both unset, otherwise a configuration error will be thrown. Not setting them simply disables the feature (i.e. no compaction is performed).

$vacuumFrequency and $retentionHours must be either both set or both unset, otherwise a configuration error will be thrown. Not setting them simply disables the feature (i.e. no vacuum is performed).

# Breaking changes

None.

# Migration

None.

# Bug fixes

None.

# How this feature was tested

Existing unit tests.

# Related issue

Closes #553
1 parent ddd5625 commit ef8b15f
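
For illustration, a minimal sketch (hypothetical values) of enabling both features through the new ContinuousUpdate writer details; the field names match the case class introduced in WriterDetails.scala below:

```scala
// Hypothetical values: compact every 10 micro-batches into 8 output files,
// vacuum every 20 micro-batches with a 168-hour (7-day) retention.
// Leaving a pair of options as None disables the corresponding feature.
val details = ContinuousUpdate(
  keys               = List("id"),
  orderingExpression = "updated_at",
  compactFrequency   = Some(10),
  compactNumFile     = Some(8),
  retentionHours     = Some(168),
  vacuumFrequency    = Some(20)
)
```

With these values, compaction runs on batches 0, 10, 20, ... (batchId % 10 == 0) and vacuum runs on batches 0, 20, 40, ... (batchId % 20 == 0).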

File tree

12 files changed, +122 -48 lines


plugin-parallel-write-spark/src/main/scala/it/agilelab/bigdata/wasp/consumers/spark/plugins/parallel/ParallelWriteWriters.scala

Lines changed: 1 addition & 1 deletion
@@ -46,7 +46,7 @@ class ParallelWriteSparkStructuredStreamingWriter(
       )
     logger.info(s"Writing microbatch with id: $batchId")
     try
-      writer.write(writeExecutionPlan, batch, correlationId)
+      writer.write(writeExecutionPlan, batch, correlationId, batchId)
     catch {
       case e: Exception =>
         logger.error("Failed writing a microbatch", e)

plugin-parallel-write-spark/src/main/scala/it/agilelab/bigdata/wasp/consumers/spark/plugins/parallel/model/ParallelWriteModelParser.scala

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@ import spray.json._
 object ParallelWriteModelParser {
   implicit lazy val parallelWriteFormat: RootJsonFormat[ParallelWrite] = jsonFormat1((saveMode: String) => ParallelWrite.apply(saveMode))
   implicit lazy val catalogCoordinatesFormat: RootJsonFormat[CatalogCoordinates] = jsonFormat5(CatalogCoordinates.apply)
-  implicit lazy val continuousUpdateFormat: RootJsonFormat[ContinuousUpdate] = jsonFormat2((keys: List[String], orderingExpression: String) => ContinuousUpdate.apply(keys, orderingExpression))
+  implicit lazy val continuousUpdateFormat: RootJsonFormat[ContinuousUpdate] = jsonFormat6(ContinuousUpdate)

   implicit lazy val writerDetailsFormat: RootJsonFormat[WriterDetails] = new RootJsonFormat[WriterDetails] {
     override def read(json: JsValue): WriterDetails =
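
Because the format is now derived with jsonFormat6, the accepted JSON keys follow the ContinuousUpdate field names. A minimal sketch of parsing such a payload, assuming hypothetical values and omitting the surrounding writerDetails envelope:

```scala
import spray.json._
import it.agilelab.bigdata.wasp.consumers.spark.plugins.parallel.model.ContinuousUpdate
import it.agilelab.bigdata.wasp.consumers.spark.plugins.parallel.model.ParallelWriteModelParser._

// Hypothetical payload: keys are spelled exactly as the case class fields.
val payload =
  """{
    |  "keys": ["id"],
    |  "orderingExpression": "updated_at",
    |  "compactFrequency": 10,
    |  "compactNumFile": 8,
    |  "retentionHours": 168,
    |  "vacuumFrequency": 20
    |}""".stripMargin

val parsed: ContinuousUpdate = payload.parseJson.convertTo[ContinuousUpdate]
```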

plugin-parallel-write-spark/src/main/scala/it/agilelab/bigdata/wasp/consumers/spark/plugins/parallel/model/WriterDetails.scala

Lines changed: 16 additions & 10 deletions
@@ -2,20 +2,26 @@ package it.agilelab.bigdata.wasp.consumers.spark.plugins.parallel.model

 sealed trait WriterDetails
 object WriterDetails {
-  val parallelWrite = "parallelWrite"
+  val parallelWrite    = "parallelWrite"
   val continuousUpdate = "continuousUpdate"
 }

 /**
-  * Details needeed by parallel writer
-  * @param saveMode spark save mode
-  */
+ * Details needeed by parallel writer
+ * @param saveMode spark save mode
+ */
 case class ParallelWrite(saveMode: String) extends WriterDetails

 /**
-  * Details needed by continuous update writer
-  * @param keys delta table unique keys column list
-  * @param orderingExpression monotonically increasing select expression to choose upsert candidate
-  */
-case class ContinuousUpdate(keys: List[String], orderingExpression: String) extends WriterDetails
-
+ * Details needed by continuous update writer
+ * @param keys delta table unique keys column list
+ * @param orderingExpression monotonically increasing select expression to choose upsert candidate
+ */
+case class ContinuousUpdate(
+  keys: List[String],
+  orderingExpression: String,
+  compactFrequency: Option[Int] = None,
+  compactNumFile: Option[Int] = None,
+  retentionHours: Option[Int] = None,
+  vacuumFrequency: Option[Int] = None
+) extends WriterDetails

plugin-parallel-write-spark/src/main/scala/it/agilelab/bigdata/wasp/consumers/spark/plugins/parallel/writers/ColdParallelWriter.scala

Lines changed: 4 additions & 3 deletions
@@ -14,19 +14,20 @@ trait ColdParallelWriter extends ParallelWriter {
   final override def write(
     writeExecutionPlan: WriteExecutionPlanResponseBody,
     df: DataFrame,
-    correlationId: CorrelationId
+    correlationId: CorrelationId,
+    batchId: Long
   ): Unit = {
     val s3path: URI = HadoopS3Utils.useS3aScheme(
       new URI(writeExecutionPlan.writeUri.getOrElse(
         throw new RuntimeException("Entity responded without a writeUri field for a COLD case write"))))
     val spark = df.sparkSession
     credentialsConfigurator.configureCredentials(writeExecutionPlan, spark.sparkContext.hadoopConfiguration)
     val partitioningColumns: Seq[String] = catalogService.getPartitioningColumns(spark, entityDetails)
-    performColdWrite(df, s3path, partitioningColumns)
+    performColdWrite(df, s3path, partitioningColumns, batchId)
     recoverPartitions(spark, partitioningColumns)
   }

-  protected def performColdWrite(df: DataFrame, path: URI, partitioningColumns: Seq[String]): Unit
+  protected def performColdWrite(df: DataFrame, path: URI, partitioningColumns: Seq[String], batchId: Long): Unit

   private def recoverPartitions(sparkSession: SparkSession, partitions: Seq[String]): Unit =
     if (partitions.nonEmpty)

plugin-parallel-write-spark/src/main/scala/it/agilelab/bigdata/wasp/consumers/spark/plugins/parallel/writers/ContinuousUpdateWriter.scala

Lines changed: 40 additions & 9 deletions
@@ -13,25 +13,56 @@ import java.net.URI
 class SchemaException(message: String) extends Exception(message)

 /**
-  * Writer for continuous update.
-  * @param writerDetails Informations about unique keys, ordering expression and fields to drop
-  * @param entityDetails
-  */
+ * Writer for continuous update.
+ * @param writerDetails Informations about unique keys, ordering expression and fields to drop
+ * @param entityDetails
+ */
 case class ContinuousUpdateWriter(
-    writerDetails: ContinuousUpdate,
-    entityAPI: ParallelWriteEntity,
-    entityDetails: CatalogCoordinates,
-    catalogService: DataCatalogService
+  writerDetails: ContinuousUpdate,
+  entityAPI: ParallelWriteEntity,
+  entityDetails: CatalogCoordinates,
+  catalogService: DataCatalogService
 ) extends DeltaParallelWriterTrait {

-  override def performDeltaWrite(df: DataFrame, s3path: URI, partitioningColumns: Seq[String]): Unit = {
+  override def performDeltaWrite(df: DataFrame, s3path: URI, partitioningColumns: Seq[String], batchId: Long): Unit = {
     // schema evolution not supported yet, property not necessary at the moment
     // ss.sql("SET spark.databricks.delta.schema.autoMerge.enabled = true")
     val spark: SparkSession = df.sparkSession
     val orderedDF = applyOrderingLogic(df, writerDetails.keys, writerDetails.orderingExpression)
     val enforcedDf = enforceSchema(orderedDF)
     val condition = writerDetails.keys.map(x => s"table.$x = table2.$x").mkString(" AND ")
     val deltaTable = getDeltaTable(s3path, spark, partitioningColumns)
+    (writerDetails.compactFrequency, writerDetails.compactNumFile) match {
+      case (None, None) =>
+      case (Some(compactFrequency), Some(compactNumFile)) =>
+        if (batchId % compactFrequency == 0) {
+          logger.info(s"Compacting table at ${s3path} with partitions $compactNumFile files")
+          deltaTable.toDF
+            .repartition(compactNumFile)
+            .write
+            .option("dataChange", "false")
+            .format("delta")
+            .mode("overwrite")
+            .partitionBy(partitioningColumns: _*)
+            .save(s3path.toString)
+        }
+      case other =>
+        throw new IllegalArgumentException(
+          s"Both compactFrequency and compactNumFile must be null or have a value, but ${other} was provided"
+        )
+    }
+    (writerDetails.retentionHours, writerDetails.vacuumFrequency) match {
+      case (None, None) =>
+      case (Some(retentionHours), Some(vacuumFrequency)) =>
+        if (batchId % vacuumFrequency == 0) {
+          logger.info(s"Vacuuming table ${s3path} with retention ${retentionHours} hours")
+          deltaTable.vacuum(retentionHours.toDouble)
+        }
+      case other =>
+        throw new IllegalArgumentException(
+          s"Both retentionHours and vacuumFrequency must be null or have a value, but ${other} was provided"
+        )
+    }
     deltaTable
       .as("table")
       .merge(enforcedDf.as("table2"), condition)
plugin-parallel-write-spark/src/main/scala/it/agilelab/bigdata/wasp/consumers/spark/plugins/parallel/writers/DeltaParallelWriter.scala

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@ case class DeltaParallelWriter(
   override val catalogService: DataCatalogService
 ) extends DeltaParallelWriterTrait {

-  override def performDeltaWrite(df: DataFrame, path: URI, partitioningColumns: Seq[String]): Unit =
+  override def performDeltaWrite(df: DataFrame, path: URI, partitioningColumns: Seq[String], batchId: Long): Unit =
     enforceSchema(df).write
       .mode(parallelWriteDetails.saveMode)
       .format("delta")

plugin-parallel-write-spark/src/main/scala/it/agilelab/bigdata/wasp/consumers/spark/plugins/parallel/writers/DeltaParallelWriterTrait.scala

Lines changed: 3 additions & 3 deletions
@@ -8,12 +8,12 @@ import java.net.URI

 trait DeltaParallelWriterTrait extends ColdParallelWriter {

-  override final def performColdWrite(df: DataFrame, path: URI, partitioningColumns: Seq[String]): Unit = {
-    performDeltaWrite(df, path, partitioningColumns)
+  override final def performColdWrite(df: DataFrame, path: URI, partitioningColumns: Seq[String], batchId: Long): Unit = {
+    performDeltaWrite(df, path, partitioningColumns, batchId)
     reconciliateManifest(getDeltaTable(path, df.sparkSession, partitioningColumns))
   }

-  protected def performDeltaWrite(df: DataFrame, path: URI, partitioningColumns: Seq[String]): Unit
+  protected def performDeltaWrite(df: DataFrame, path: URI, partitioningColumns: Seq[String], batchId: Long): Unit

   private def reconciliateManifest(deltaTable: DeltaTable): Unit =
     deltaTable.generate("symlink_format_manifest")

plugin-parallel-write-spark/src/main/scala/it/agilelab/bigdata/wasp/consumers/spark/plugins/parallel/writers/HotParallelWriter.scala

Lines changed: 2 additions & 1 deletion
@@ -23,7 +23,8 @@ case class HotParallelWriter(
   override def write(
     writeExecutionPlan: WriteExecutionPlanResponseBody,
     df: DataFrame,
-    correlationId: CorrelationId
+    correlationId: CorrelationId,
+    batchId: Long
   ): Unit = {
     logger.info(s"Writing to entity ${entityDetails.name}")
     df.select(to_json(struct(df.columns.map(col): _*))).foreachPartition { it: Iterator[Row] =>

plugin-parallel-write-spark/src/main/scala/it/agilelab/bigdata/wasp/consumers/spark/plugins/parallel/writers/ParallelWriter.scala

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@ trait ParallelWriter extends Logging{
    * @param writeExecutionPlan execution plan obtained from entity
    * @param df data to write
    */
-  def write(writeExecutionPlan: WriteExecutionPlanResponseBody, df: DataFrame, correlationId: CorrelationId): Unit
+  def write(writeExecutionPlan: WriteExecutionPlanResponseBody, df: DataFrame, correlationId: CorrelationId, batchId: Long): Unit

   def rollback(correlationId: CorrelationId): Unit =
     entityAPI.postDataComplete(DataCompleteRequestBody(false), correlationId)

plugin-parallel-write-spark/src/main/scala/it/agilelab/bigdata/wasp/consumers/spark/plugins/parallel/writers/ParquetParallelWriter.scala

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@ case class ParquetParallelWriter(
   catalogService: DataCatalogService
 ) extends ColdParallelWriter {

-  override protected def performColdWrite(df: DataFrame, s3path: URI, partitioningColumns: Seq[String]): Unit =
+  override protected def performColdWrite(df: DataFrame, s3path: URI, partitioningColumns: Seq[String], batchId: Long): Unit =
     enforceSchema(df).write
       .mode(parallelWriteDetails.saveMode)
       .format("parquet")

0 commit comments
