
Commit 5d5866b

MaxGekk authored and cloud-fan committed
[SPARK-31672][SQL] Fix loading of timestamps before 1582-10-15 from dictionary encoded Parquet columns
### What changes were proposed in this pull request?
Modified the `decodeDictionaryIds()` method of `VectorizedColumnReader` to handle `TimestampType` specially when the passed parameter `rebaseDateTime` is true. In that case, decoded milliseconds/microseconds are rebased from the hybrid calendar to the Proleptic Gregorian calendar using `RebaseDateTime.rebaseJulianToGregorianMicros()`.

### Why are the changes needed?
This fixes a bug in loading timestamps before the cutover day from dictionary encoded columns in Parquet files. The code below forces dictionary encoding:
```scala
scala> spark.conf.set("spark.sql.legacy.parquet.rebaseDateTimeInWrite.enabled", true)
scala> spark.conf.set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MICROS")
scala> Seq.tabulate(8)(_ => "1001-01-01 01:02:03.123").toDF("tsS")
  .select($"tsS".cast("timestamp").as("ts")).repartition(1)
  .write
  .option("parquet.enable.dictionary", true)
  .parquet(path)
```
Load the timestamps back:
```scala
scala> spark.read.parquet(path).show(false)
+-----------------------+
|ts                     |
+-----------------------+
|1001-01-07 00:32:20.123|
...
|1001-01-07 00:32:20.123|
+-----------------------+
```
The expected values **must be 1001-01-01 01:02:03.123**, not 1001-01-07 00:32:20.123.

### Does this PR introduce _any_ user-facing change?
Yes. After the changes:
```scala
scala> spark.read.parquet(path).show(false)
+-----------------------+
|ts                     |
+-----------------------+
|1001-01-01 01:02:03.123|
...
|1001-01-01 01:02:03.123|
+-----------------------+
```

### How was this patch tested?
Modified the test `SPARK-31159: rebasing timestamps in write` in `ParquetIOSuite` to check reading of dictionary encoded timestamps.

Closes apache#28489 from MaxGekk/fix-ts-rebase-parquet-dict-enc.

Authored-by: Max Gekk <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
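Editor's note: for readers unfamiliar with the two calendars involved, the sketch below (not part of this commit; plain JDK APIs only) shows why a rebase is needed at all: the same wall-clock date 1001-01-01 lands on different epoch days in the hybrid Julian/Gregorian calendar (used by Spark 2.4 and the legacy Parquet write path) and in the Proleptic Gregorian calendar (used internally by Spark 3.0).
```scala
import java.time.LocalDate
import java.util.{Calendar, GregorianCalendar, TimeZone}

// Proleptic Gregorian calendar (java.time): what Spark 3.0 uses internally.
val prolepticDays = LocalDate.of(1001, 1, 1).toEpochDay

// Hybrid Julian/Gregorian calendar (java.util.GregorianCalendar): Julian rules
// apply before the 1582-10-15 cutover, matching the legacy write semantics.
val cal = new GregorianCalendar(TimeZone.getTimeZone("UTC"))
cal.clear()
cal.set(1001, Calendar.JANUARY, 1)
val hybridDays = Math.floorDiv(cal.getTimeInMillis, 86400000L)

// The hybrid calendar's 1001-01-01 falls 6 days later on the epoch timeline,
// which is the 1001-01-01 vs 1001-01-07 shift seen in the example above
// (the extra minutes there come from historical time zone offsets).
println(s"proleptic=$prolepticDays hybrid=$hybridDays shift=${hybridDays - prolepticDays} days")
```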
1 parent 9f768fa commit 5d5866b

File tree

2 files changed: +64 −32 lines


sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java

+27-4
@@ -159,7 +159,11 @@ private boolean isLazyDecodingSupported(PrimitiveType.PrimitiveTypeName typeName
         isSupported = originalType != OriginalType.DATE || !rebaseDateTime;
         break;
       case INT64:
-        isSupported = originalType != OriginalType.TIMESTAMP_MILLIS;
+        if (originalType == OriginalType.TIMESTAMP_MICROS) {
+          isSupported = !rebaseDateTime;
+        } else {
+          isSupported = originalType != OriginalType.TIMESTAMP_MILLIS;
+        }
         break;
       case FLOAT:
       case DOUBLE:
@@ -313,17 +317,36 @@ private void decodeDictionaryIds(
       case INT64:
         if (column.dataType() == DataTypes.LongType ||
             DecimalType.is64BitDecimalType(column.dataType()) ||
-            originalType == OriginalType.TIMESTAMP_MICROS) {
+            (originalType == OriginalType.TIMESTAMP_MICROS && !rebaseDateTime)) {
           for (int i = rowId; i < rowId + num; ++i) {
             if (!column.isNullAt(i)) {
               column.putLong(i, dictionary.decodeToLong(dictionaryIds.getDictId(i)));
             }
           }
         } else if (originalType == OriginalType.TIMESTAMP_MILLIS) {
+          if (rebaseDateTime) {
+            for (int i = rowId; i < rowId + num; ++i) {
+              if (!column.isNullAt(i)) {
+                long julianMillis = dictionary.decodeToLong(dictionaryIds.getDictId(i));
+                long julianMicros = DateTimeUtils.millisToMicros(julianMillis);
+                long gregorianMicros = RebaseDateTime.rebaseJulianToGregorianMicros(julianMicros);
+                column.putLong(i, gregorianMicros);
+              }
+            }
+          } else {
+            for (int i = rowId; i < rowId + num; ++i) {
+              if (!column.isNullAt(i)) {
+                long gregorianMillis = dictionary.decodeToLong(dictionaryIds.getDictId(i));
+                column.putLong(i, DateTimeUtils.millisToMicros(gregorianMillis));
+              }
+            }
+          }
+        } else if (originalType == OriginalType.TIMESTAMP_MICROS) {
           for (int i = rowId; i < rowId + num; ++i) {
             if (!column.isNullAt(i)) {
-              column.putLong(i,
-                DateTimeUtils.millisToMicros(dictionary.decodeToLong(dictionaryIds.getDictId(i))));
+              long julianMicros = dictionary.decodeToLong(dictionaryIds.getDictId(i));
+              long gregorianMicros = RebaseDateTime.rebaseJulianToGregorianMicros(julianMicros);
+              column.putLong(i, gregorianMicros);
             }
           }
         } else {
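Editor's note: as a rough illustration of the new read path, the sketch below (Scala, not code from this commit) mirrors what the Java branches above now do to each dictionary-decoded `TIMESTAMP_MILLIS` value when `rebaseDateTime` is true, using the same internal `org.apache.spark.sql.catalyst.util` helpers the reader calls; it assumes those internals are on the classpath.
```scala
import org.apache.spark.sql.catalyst.util.{DateTimeUtils, RebaseDateTime}

// Sketch only: handles one dictionary entry. `julianMillis` stands in for
// dictionary.decodeToLong(dictionaryIds.getDictId(i)) in the reader.
def rebaseDictionaryValue(julianMillis: Long): Long = {
  // Convert the legacy millisecond value to microseconds first...
  val julianMicros = DateTimeUtils.millisToMicros(julianMillis)
  // ...then shift it from the hybrid Julian/Gregorian calendar to the
  // Proleptic Gregorian calendar before it is stored in the column vector.
  RebaseDateTime.rebaseJulianToGregorianMicros(julianMicros)
}
```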

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala

+37-28
@@ -937,37 +937,46 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSparkSession
   }
 
   test("SPARK-31159: rebasing timestamps in write") {
-    Seq(
-      ("TIMESTAMP_MILLIS", "1001-01-01 01:02:03.123", "1001-01-07 01:09:05.123"),
-      ("TIMESTAMP_MICROS", "1001-01-01 01:02:03.123456", "1001-01-07 01:09:05.123456"),
-      ("INT96", "1001-01-01 01:02:03.123456", "1001-01-01 01:02:03.123456")
-    ).foreach { case (outType, tsStr, nonRebased) =>
-      withClue(s"output type $outType") {
-        withSQLConf(SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE.key -> outType) {
-          withTempPath { dir =>
-            val path = dir.getAbsolutePath
-            withSQLConf(SQLConf.LEGACY_PARQUET_REBASE_DATETIME_IN_WRITE.key -> "true") {
-              Seq(tsStr).toDF("tsS")
-                .select($"tsS".cast("timestamp").as("ts"))
-                .write
-                .parquet(path)
-            }
+    val N = 8
+    Seq(false, true).foreach { dictionaryEncoding =>
+      Seq(
+        ("TIMESTAMP_MILLIS", "1001-01-01 01:02:03.123", "1001-01-07 01:09:05.123"),
+        ("TIMESTAMP_MICROS", "1001-01-01 01:02:03.123456", "1001-01-07 01:09:05.123456"),
+        ("INT96", "1001-01-01 01:02:03.123456", "1001-01-01 01:02:03.123456")
+      ).foreach { case (outType, tsStr, nonRebased) =>
+        withClue(s"output type $outType") {
+          withSQLConf(SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE.key -> outType) {
+            withTempPath { dir =>
+              val path = dir.getAbsolutePath
+              withSQLConf(SQLConf.LEGACY_PARQUET_REBASE_DATETIME_IN_WRITE.key -> "true") {
+                Seq.tabulate(N)(_ => tsStr).toDF("tsS")
+                  .select($"tsS".cast("timestamp").as("ts"))
+                  .repartition(1)
+                  .write
+                  .option("parquet.enable.dictionary", dictionaryEncoding)
+                  .parquet(path)
+              }
 
-            Seq(false, true).foreach { vectorized =>
-              withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> vectorized.toString) {
-                // The file metadata indicates if it needs rebase or not, so we can always get the
-                // correct result regardless of the "rebaseInRead" config.
-                Seq(true, false).foreach { rebase =>
-                  withSQLConf(
-                    SQLConf.LEGACY_PARQUET_REBASE_DATETIME_IN_READ.key -> rebase.toString) {
-                    checkAnswer(spark.read.parquet(path), Row(Timestamp.valueOf(tsStr)))
+              Seq(false, true).foreach { vectorized =>
+                withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> vectorized.toString) {
+                  // The file metadata indicates if it needs rebase or not, so we can always get the
+                  // correct result regardless of the "rebaseInRead" config.
+                  Seq(true, false).foreach { rebase =>
+                    withSQLConf(
+                      SQLConf.LEGACY_PARQUET_REBASE_DATETIME_IN_READ.key -> rebase.toString) {
+                      checkAnswer(
+                        spark.read.parquet(path),
+                        Seq.tabulate(N)(_ => Row(Timestamp.valueOf(tsStr))))
+                    }
                   }
-                }
 
-                // Force to not rebase to prove the written datetime values are rebased
-                // and we will get wrong result if we don't rebase while reading.
-                withSQLConf("spark.test.forceNoRebase" -> "true") {
-                  checkAnswer(spark.read.parquet(path), Row(Timestamp.valueOf(nonRebased)))
+                  // Force to not rebase to prove the written datetime values are rebased
+                  // and we will get wrong result if we don't rebase while reading.
+                  withSQLConf("spark.test.forceNoRebase" -> "true") {
+                    checkAnswer(
+                      spark.read.parquet(path),
+                      Seq.tabulate(N)(_ => Row(Timestamp.valueOf(nonRebased))))
+                  }
                 }
               }
             }
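Editor's note: outside the test harness, the user-facing effect can be checked with a condensed version of the reproducer from the description. This is a spark-shell sketch: the output path is hypothetical, and the legacy conf names are those in use at the time of this commit (they were renamed in later releases).
```scala
import java.sql.Timestamp
import spark.implicits._

val path = "/tmp/ts-dict-parquet"  // hypothetical output location

spark.conf.set("spark.sql.legacy.parquet.rebaseDateTimeInWrite.enabled", true)
spark.conf.set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MICROS")

// Repeating one value and writing a single file makes Parquet dictionary-encode the column.
Seq.tabulate(8)(_ => "1001-01-01 01:02:03.123456").toDF("tsS")
  .select($"tsS".cast("timestamp").as("ts"))
  .repartition(1)
  .write
  .option("parquet.enable.dictionary", true)
  .mode("overwrite")
  .parquet(path)

// With the fix, both the vectorized and the parquet-mr readers return the original timestamps.
Seq(true, false).foreach { vectorized =>
  spark.conf.set("spark.sql.parquet.enableVectorizedReader", vectorized)
  assert(spark.read.parquet(path).collect().forall(
    _.getTimestamp(0) == Timestamp.valueOf("1001-01-01 01:02:03.123456")))
}
```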
