
Commit 2d07216

maltevelin and zedtang authored
[Spark] Add support for sorting within partitions when Z-ordering (#4006)
#### Which Delta project/connector is this regarding?

- [x] Spark
- [ ] Standalone
- [ ] Flink
- [ ] Kernel
- [ ] Other (fill in here)

## Description

Resolves #4000 by introducing a new configuration property, `spark.databricks.io.skipping.mdc.sortWithinFiles`, which sorts records within each range partition by their Z-order or Hilbert curve value, so that rows end up clustered at the row-group level within Parquet files. This improves data skipping at the Parquet level. Benchmarks included in the issue demonstrate speedups of approximately 8× and 11× on two different datasets; please refer to the issue for more details.

## How was this patch tested?

Added test cases in `MultiDimClusteringSuite.scala` for Hilbert and Z-order curves.

## Does this PR introduce _any_ user-facing changes?

Yes. This PR introduces a new configuration property, `spark.databricks.io.skipping.mdc.sortWithinFiles`. The property defaults to `false`, so existing users remain unaffected unless they opt in by setting it to `true`.

**Previous behavior:** Z-ordering did not sort data within partitions.

**New behavior:** When the property is enabled, `sortWithinPartitions` is applied after `repartitionByRange` in `MultiDimClustering.scala`.

---------

Signed-off-by: Malte Velin <[email protected]>
Co-authored-by: Jiaheng Tang <[email protected]>
1 parent 00a6829 commit 2d07216
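For context, a minimal usage sketch of the opt-in flag described above, assuming an active `SparkSession` named `spark` with Delta enabled; the table name (`events`) and Z-order columns are illustrative assumptions, not taken from this PR:

```scala
// Hypothetical usage sketch: opt in for the session, then Z-order as usual.
// The flag defaults to false, so existing OPTIMIZE ... ZORDER BY jobs keep
// their current behavior unless it is explicitly enabled.
spark.conf.set("spark.databricks.io.skipping.mdc.sortWithinFiles", "true")
spark.sql("OPTIMIZE events ZORDER BY (c1, c2)")
```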

File tree

3 files changed: +98, -0 lines


spark/src/main/scala/org/apache/spark/sql/delta/skipping/MultiDimClustering.scala (+5)

```diff
@@ -74,6 +74,7 @@ trait SpaceFillingCurveClustering extends MultiDimClustering {
     val conf = df.sparkSession.sessionState.conf
     val numRanges = conf.getConf(DeltaSQLConf.MDC_NUM_RANGE_IDS)
     val addNoise = conf.getConf(DeltaSQLConf.MDC_ADD_NOISE)
+    val sortWithinFiles = conf.getConf(DeltaSQLConf.MDC_SORT_WITHIN_FILES)
 
     val cols = colNames.map(df(_))
     val mdcCol = getClusteringExpression(cols, numRanges)
@@ -90,6 +91,10 @@ trait SpaceFillingCurveClustering extends MultiDimClustering {
         .repartitionByRange(approxNumPartitions, col(repartitionKeyColName))
     }
 
+    if (sortWithinFiles) {
+      repartitionedDf = repartitionedDf.sortWithinPartitions(repartitionKeyColName)
+    }
+
     repartitionedDf.drop(repartitionKeyColName)
   }
 }
```
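The change above hinges on pairing `repartitionByRange` with `sortWithinPartitions` on the same key. A self-contained sketch of that pattern under assumed names (`rangeId` stands in for the PR's internal repartition key column; this is not the production code path):

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col

val spark = SparkSession.builder().master("local[*]").getOrCreate()
import spark.implicits._

// 1000 rows with a small key space, so the ranges are easy to inspect.
val df = (1 to 1000).map(i => (i % 7, i)).toDF("rangeId", "value")

val clustered = df
  // Rows with nearby keys land in the same partition...
  .repartitionByRange(4, col("rangeId"))
  // ...and are additionally ordered inside each partition, which is what
  // keeps Parquet row groups within a file clustered by curve value.
  .sortWithinPartitions("rangeId")
```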

spark/src/main/scala/org/apache/spark/sql/delta/sources/DeltaSQLConf.scala (+9)

```diff
@@ -1517,6 +1517,15 @@ trait DeltaSQLConfBase {
       .booleanConf
       .createWithDefault(true)
 
+  val MDC_SORT_WITHIN_FILES =
+    SQLConf.buildConf("spark.databricks.io.skipping.mdc.sortWithinFiles")
+      .internal()
+      .doc("If enabled, sort within files by the specified MDC curve. " +
+        "This might improve row-group skipping and data compression, at " +
+        "the cost of additional overhead for sorting.")
+      .booleanConf
+      .createWithDefault(false)
+
   val DELTA_OPTIMIZE_ZORDER_COL_STAT_CHECK =
     buildConf("optimize.zorder.checkStatsCollection.enabled")
       .internal()
```
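Since the flag is registered through `SQLConf.buildConf` under its full key, it behaves like any other session conf. A small illustrative sketch, assuming a `SparkSession` with Delta on the classpath (not part of the PR):

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.delta.sources.DeltaSQLConf

// Assumes Delta is on the classpath; the conf defaults to "false".
val spark = SparkSession.builder().master("local[*]").getOrCreate()

val key = DeltaSQLConf.MDC_SORT_WITHIN_FILES.key
spark.conf.set(key, "true")            // opt in for this session only
assert(spark.conf.get(key) == "true")  // read back through the same key
```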

spark/src/test/scala/org/apache/spark/sql/delta/skipping/MultiDimClusteringSuite.scala (+84)

```diff
@@ -149,6 +149,90 @@ class MultiDimClusteringSuite extends QueryTest
     }
   }
 
+  test("ensure records in each partition are sorted according to Z-order values") {
+    withSQLConf(
+        MDC_SORT_WITHIN_FILES.key -> "true",
+        MDC_ADD_NOISE.key -> "false") {
+      val data = Seq(
+        // "c1" -> "c2", // (rangeId_c1, rangeId_c2) -> ZOrder (decimal Z-order)
+        "a" -> 20, "a" -> 20,            // (0, 1) -> 0x01 (1)
+        "b" -> 20,                       // (1, 1) -> 0x03 (3)
+        "c" -> 30,                       // (2, 2) -> 0x0C (12)
+        "d" -> 70,                       // (3, 3) -> 0x0F (15)
+        "e" -> 90, "e" -> 90, "e" -> 90, // (4, 4) -> 0x30 (48)
+        "f" -> 200,                      // (5, 5) -> 0x33 (51)
+        "g" -> 10,                       // (6, 0) -> 0x28 (40)
+        "h" -> 20)                       // (7, 1) -> 0x2B (43)
+
+      // Randomize the data. Use seed for deterministic input.
+      val inputDf = new Random(seed = 101).shuffle(data)
+        .toDF("c1", "c2")
+
+      // Cluster the data, range partition into one partition, and sort.
+      val outputDf = MultiDimClustering.cluster(
+        inputDf,
+        approxNumPartitions = 1,
+        colNames = Seq("c1", "c2"),
+        curve = "zorder")
+
+      // Check that dataframe is sorted.
+      checkAnswer(
+        outputDf,
+        Seq(
+          "a" -> 20, "a" -> 20,
+          "b" -> 20,
+          "c" -> 30,
+          "d" -> 70,
+          "g" -> 10,
+          "h" -> 20,
+          "e" -> 90, "e" -> 90, "e" -> 90,
+          "f" -> 200
+        ).toDF("c1", "c2").collect())
+    }
+  }
+
+  test("ensure records in each partition are sorted according to Hilbert curve values") {
+    withSQLConf(
+        MDC_SORT_WITHIN_FILES.key -> "true",
+        MDC_ADD_NOISE.key -> "false") {
+      val data = Seq(
+        // "c1" -> "c2", // (rangeId_c1, rangeId_c2) -> Decimal Hilbert index
+        "a" -> 20, "a" -> 20,            // (0, 1) -> 3
+        "b" -> 20,                       // (1, 1) -> 2
+        "c" -> 30,                       // (2, 2) -> 8
+        "d" -> 70,                       // (3, 3) -> 10
+        "e" -> 90, "e" -> 90, "e" -> 90, // (4, 4) -> 32
+        "f" -> 200,                      // (5, 5) -> 34
+        "g" -> 10,                       // (6, 0) -> 20
+        "h" -> 20)                       // (7, 1) -> 22
+
+      // Randomize the data. Use seed for deterministic input.
+      val inputDf = new Random(seed = 101).shuffle(data)
+        .toDF("c1", "c2")
+
+      // Cluster the data, range partition into one partition, and sort.
+      val outputDf = MultiDimClustering.cluster(
+        inputDf,
+        approxNumPartitions = 1,
+        colNames = Seq("c1", "c2"),
+        curve = "hilbert")
+
+      // Check that dataframe is sorted.
+      checkAnswer(
+        outputDf,
+        Seq(
+          "b" -> 20,
+          "a" -> 20, "a" -> 20,
+          "c" -> 30,
+          "d" -> 70,
+          "g" -> 10,
+          "h" -> 20,
+          "e" -> 90, "e" -> 90, "e" -> 90,
+          "f" -> 200
+        ).toDF("c1", "c2").collect())
+    }
+  }
+
   test("noise is helpful in skew handling") {
     Seq("zorder", "hilbert").foreach { curve =>
       Seq("true", "false").foreach { addNoise =>
```
