
Commit bd1c935

[Spark] Fix Utils.isTesting calls to use Delta implementation (#4074)
#### Which Delta project/connector is this regarding?

- [X] Spark
- [ ] Standalone
- [ ] Flink
- [ ] Kernel
- [ ] Other (fill in here)

## Description

Some classes were using `org.apache.spark.util.Utils` instead of `org.apache.spark.sql.delta.util.Utils`. The Spark implementation keys off the SPARK_TESTING environment variable, while the Delta implementation keys off DELTA_TESTING.

Spark: https://github.com/apache/spark/blob/51fb84a54982719209c19136b1d72d2ef44726ee/core/src/main/scala/org/apache/spark/util/Utils.scala#L1878
Delta: https://github.com/delta-io/delta/blob/221d95cd69fdb9ff3f69cdd842c5c13ed47fd687/spark/src/main/scala/org/apache/spark/sql/delta/util/Utils.scala#L58

As a result, the unit tests currently exercise the non-test code path, because the build only sets DELTA_TESTING:
https://github.com/delta-io/delta/blob/221d95cd69fdb9ff3f69cdd842c5c13ed47fd687/build.sbt#L466
https://github.com/delta-io/delta/blob/221d95cd69fdb9ff3f69cdd842c5c13ed47fd687/run-tests.py#L93

## How was this patch tested?

Unit tests.

## Does this PR introduce _any_ user-facing changes?

No.

Signed-off-by: Felipe Pessoto <[email protected]>
Co-authored-by: Venki Korukanti <[email protected]>
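For context, here is a minimal sketch of how the two `isTesting` checks differ, simplified from the sources linked above; the object and method names below are illustrative stand-ins, and the exact property keys in the real classes may differ slightly:

```scala
// Sketch only: simplified stand-ins for the two real utilities.
object IsTestingSketch {
  // org.apache.spark.util.Utils.isTesting (Spark): keyed on SPARK_TESTING /
  // the spark.testing system property, which Delta's build does not set.
  def sparkIsTesting: Boolean =
    sys.env.contains("SPARK_TESTING") || sys.props.contains("spark.testing")

  // org.apache.spark.sql.delta.util.Utils.isTesting (Delta): keyed on the
  // DELTA_TESTING environment variable that build.sbt and run-tests.py do set.
  def deltaIsTesting: Boolean = sys.env.contains("DELTA_TESTING")
}
```

Because only DELTA_TESTING is exported by Delta's build, a call to the Spark variant returns false under Delta's unit tests, which is why the call sites in the diff below are switched to the aliased `DeltaUtils`.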
1 parent 961d39a commit bd1c935

11 files changed (+33 −27 lines)

spark/src/main/scala/org/apache/spark/sql/delta/Checkpoints.scala (+3 −2)

@@ -34,6 +34,7 @@ import org.apache.spark.sql.delta.storage.LogStore
 import org.apache.spark.sql.delta.util.{DeltaFileOperations, DeltaLogGroupingIterator, FileNames}
 import org.apache.spark.sql.delta.util.FileNames._
 import org.apache.spark.sql.delta.util.JsonUtils
+import org.apache.spark.sql.delta.util.{Utils => DeltaUtils}
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}
 import org.apache.hadoop.mapred.{JobConf, TaskAttemptContextImpl, TaskAttemptID}
@@ -280,7 +281,7 @@ trait Checkpoints extends DeltaLogging {
 data = Map("exception" -> e.getMessage(), "stackTrace" -> e.getStackTrace())
 )
 logWarning(log"Error when writing checkpoint-related files", e)
-val throwError = Utils.isTesting ||
+val throwError = DeltaUtils.isTesting ||
 spark.sessionState.conf.getConf(DeltaSQLConf.DELTA_CHECKPOINT_THROW_EXCEPTION_WHEN_FAILED)
 if (throwError) throw e
 }
@@ -1081,7 +1082,7 @@ object Checkpoints
 // overrides the final path even if it already exists. So we use exists here to handle that
 // case.
 // TODO: Remove isTesting and fs.exists check after fixing LocalFS
-if (Utils.isTesting && fs.exists(finalPath)) {
+if (DeltaUtils.isTesting && fs.exists(finalPath)) {
 false
 } else {
 fs.rename(tempPath, finalPath)

spark/src/main/scala/org/apache/spark/sql/delta/Checksum.scala (+3 −2)

@@ -35,6 +35,7 @@ import org.apache.spark.sql.delta.stats.DeletedRecordCountsHistogram
 import org.apache.spark.sql.delta.stats.FileSizeHistogram
 import org.apache.spark.sql.delta.storage.LogStore
 import org.apache.spark.sql.delta.util.{FileNames, JsonUtils}
+import org.apache.spark.sql.delta.util.{Utils => DeltaUtils}
 import com.fasterxml.jackson.databind.annotation.JsonDeserialize
 import org.apache.hadoop.fs.FileStatus
 import org.apache.hadoop.fs.Path
@@ -440,7 +441,7 @@ trait RecordChecksum extends DeltaLogging {
 deltaLog,
 opType = "delta.allFilesInCrc.checksumMismatch.aggregated",
 data = eventData)
-if (Utils.isTesting) {
+if (DeltaUtils.isTesting) {
 throw new IllegalStateException("Incrementally Computed State failed checksum check" +
 s" for commit $attemptVersion [$eventData]")
 }
@@ -825,7 +826,7 @@ trait ValidateChecksum extends DeltaLogging { self: Snapshot =>
 this.deltaLog,
 opType = "delta.allFilesInCrc.checksumMismatch.differentAllFiles",
 data = eventData)
-if (Utils.isTesting) throw new IllegalStateException(message)
+if (DeltaUtils.isTesting) throw new IllegalStateException(message)
 false
 }
 /**

spark/src/main/scala/org/apache/spark/sql/delta/DeltaLog.scala (+3 −1)

@@ -38,6 +38,7 @@ import org.apache.spark.sql.delta.schema.{SchemaMergingUtils, SchemaUtils}
 import org.apache.spark.sql.delta.sources._
 import org.apache.spark.sql.delta.storage.LogStoreProvider
 import org.apache.spark.sql.delta.util.FileNames
+import org.apache.spark.sql.delta.util.{Utils => DeltaUtils}
 import com.google.common.cache.{Cache, CacheBuilder, RemovalNotification}
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}
@@ -152,7 +153,8 @@ class DeltaLog private(

 private[delta] def shouldVerifyIncrementalCommit: Boolean = {
 spark.conf.get(DeltaSQLConf.INCREMENTAL_COMMIT_VERIFY) ||
-(Utils.isTesting && spark.conf.get(DeltaSQLConf.INCREMENTAL_COMMIT_FORCE_VERIFY_IN_TESTS))
+(DeltaUtils.isTesting
+  && spark.conf.get(DeltaSQLConf.INCREMENTAL_COMMIT_FORCE_VERIFY_IN_TESTS))
 }

 /** The unique identifier for this table. */

spark/src/main/scala/org/apache/spark/sql/delta/SnapshotManagement.scala (+7 −6)

@@ -35,6 +35,7 @@ import org.apache.spark.sql.delta.sources.DeltaSQLConf
 import org.apache.spark.sql.delta.util.FileNames._
 import org.apache.spark.sql.delta.util.JsonUtils
 import org.apache.spark.sql.delta.util.threads.DeltaThreadPool
+import org.apache.spark.sql.delta.util.{Utils => DeltaUtils}
 import com.fasterxml.jackson.annotation.JsonIgnore
 import io.delta.storage.commit.{Commit, GetCommitsResponse}
 import org.apache.hadoop.fs.{BlockLocation, FileStatus, LocatedFileStatus, Path}
@@ -44,7 +45,7 @@ import org.apache.spark.internal.MDC
 import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.catalyst.TableIdentifier
 import org.apache.spark.sql.catalyst.catalog.CatalogTable
-import org.apache.spark.util.{ThreadUtils, Utils}
+import org.apache.spark.util.ThreadUtils

 /**
  * Wraps the most recently updated snapshot along with the timestamp the update was started.
@@ -270,7 +271,7 @@ trait SnapshotManagement { self: DeltaLog =>
 deltaLog = this,
 opType = CoordinatedCommitsUsageLogs.FS_COMMIT_COORDINATOR_LISTING_UNEXPECTED_GAPS,
 data = eventData)
-if (Utils.isTesting) {
+if (DeltaUtils.isTesting) {
 throw new IllegalStateException(
 s"Delta table at $dataPath unexpectedly still requires additional file-system listing " +
 s"after an additional file-system listing was already performed to reconcile the gap " +
@@ -646,7 +647,7 @@ trait SnapshotManagement { self: DeltaLog =>
 deltaLog = this,
 opType = "delta.getLogSegmentForVersion.compactedDeltaValidationFailed",
 data = eventData)
-if (Utils.isTesting) {
+if (DeltaUtils.isTesting) {
 assert(false, s"Validation around Compacted deltas failed while creating Snapshot. " +
 s"[${JsonUtils.toJson(eventData)}]")
 }
@@ -1071,7 +1072,7 @@ trait SnapshotManagement { self: DeltaLog =>
 catalogTableOpt)
 }
 } catch {
-case NonFatal(e) if !Utils.isTesting =>
+case NonFatal(e) if !DeltaUtils.isTesting =>
 // Failed to schedule the future -- fail in testing, but just log it in prod.
 recordDeltaEvent(this, "delta.snapshot.asyncUpdateFailed", data = Map("exception" -> e))
 }
@@ -1200,7 +1201,7 @@ trait SnapshotManagement { self: DeltaLog =>
 /** Installs the given `newSnapshot` as the `currentSnapshot` */
 protected def installSnapshot(newSnapshot: Snapshot, updateTimestamp: Long): Snapshot = {
 if (!snapshotLock.isHeldByCurrentThread) {
-if (Utils.isTesting) {
+if (DeltaUtils.isTesting) {
 throw new RuntimeException("DeltaLog snapshot replaced without taking lock")
 }
 recordDeltaEvent(this, "delta.update.unsafeReplace")
@@ -1292,7 +1293,7 @@ trait SnapshotManagement { self: DeltaLog =>
 // NOTE: Validation is a no-op with incremental commit disabled.
 newSnapshot.validateChecksum(Map("context" -> checksumContext))
 } catch {
-case _: IllegalStateException if !Utils.isTesting => false
+case _: IllegalStateException if !DeltaUtils.isTesting => false
 }

 if (!crcIsValid) {

spark/src/main/scala/org/apache/spark/sql/delta/catalog/DeltaCatalog.scala (+2 −1)

@@ -39,6 +39,7 @@ import org.apache.spark.sql.delta.sources.{DeltaDataSource, DeltaSourceUtils, De
 import org.apache.spark.sql.delta.stats.StatisticsCollection
 import org.apache.spark.sql.delta.tablefeatures.DropFeature
 import org.apache.spark.sql.delta.util.PartitionUtils
+import org.apache.spark.sql.delta.util.{Utils => DeltaUtils}
 import org.apache.spark.sql.util.ScalaExtensions._
 import org.apache.hadoop.fs.Path

@@ -155,7 +156,7 @@ class DeltaCatalog extends DelegatingCatalogExtension
 // Note: Spark generates the table location for managed tables in
 // `DeltaCatalog#delegate#createTable`, so `isManagedLocation` should never be true if
 // Unity Catalog is not involved. For safety we also check `isUnityCatalog` here.
-val respectManagedLoc = isUnityCatalog || org.apache.spark.util.Utils.isTesting
+val respectManagedLoc = isUnityCatalog || DeltaUtils.isTesting
 val tableType = if (location.isEmpty || (isManagedLocation && respectManagedLoc)) {
 CatalogTableType.MANAGED
 } else {

spark/src/main/scala/org/apache/spark/sql/delta/commands/VacuumCommand.scala (+1 −1)

@@ -768,7 +768,7 @@ trait VacuumCommandImpl extends DeltaCommand {
 protected def setCommitClock(deltaLog: DeltaLog, version: Long) = {
 // This is done to make sure that the commit timestamp reflects the one provided by the clock
 // object.
-if (Utils.isTesting) {
+if (DeltaUtils.isTesting) {
 val fs = deltaLog.logPath.getFileSystem(deltaLog.newDeltaHadoopConf())
 val filePath = DeltaCommitFileProvider(deltaLog.update()).deltaFile(version)
 if (fs.exists(filePath)) {

spark/src/main/scala/org/apache/spark/sql/delta/files/DeltaFileFormatWriter.scala (+4 −3)

@@ -20,6 +20,7 @@ import java.util.{Date, UUID}

 import org.apache.spark.sql.delta.DeltaOptions
 import org.apache.spark.sql.delta.logging.DeltaLogKeys
+import org.apache.spark.sql.delta.util.{Utils => DeltaUtils}
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.{FileAlreadyExistsException, Path}
 import org.apache.hadoop.mapreduce._
@@ -188,7 +189,7 @@ object DeltaFileFormatWriter extends LoggingShims {
 // 1) When the planned write config is disabled.
 // 2) When the concurrent writers are enabled (in this case the required ordering of a
 // V1 write command will be empty).
-if (Utils.isTesting) outputOrderingMatched = orderingMatched
+if (DeltaUtils.isTesting) outputOrderingMatched = orderingMatched

 if (writeFilesOpt.isDefined) {
 // build `WriteFilesSpec` for `WriteFiles`
@@ -248,7 +249,7 @@ object DeltaFileFormatWriter extends LoggingShims {
 }

 // In testing, this is the only way to get hold of the actually executed plan written to file
-if (Utils.isTesting) executedPlan = Some(planToExecute)
+if (DeltaUtils.isTesting) executedPlan = Some(planToExecute)

 val rdd = planToExecute.execute()

@@ -331,7 +332,7 @@ object DeltaFileFormatWriter extends LoggingShims {
 val description = writeFilesSpec.description

 // In testing, this is the only way to get hold of the actually executed plan written to file
-if (Utils.isTesting) executedPlan = Some(planForWrites)
+if (DeltaUtils.isTesting) executedPlan = Some(planForWrites)

 writeAndCommit(job, description, committer) {
 val rdd = planForWrites.executeWrite(writeFilesSpec)

spark/src/main/scala/org/apache/spark/sql/delta/metering/DeltaLogging.scala (+2 −2)

@@ -33,13 +33,13 @@ import org.apache.spark.sql.delta.actions.Metadata
 import org.apache.spark.sql.delta.logging.DeltaLogKeys
 import org.apache.spark.sql.delta.util.DeltaProgressReporter
 import org.apache.spark.sql.delta.util.JsonUtils
+import org.apache.spark.sql.delta.util.{Utils => DeltaUtils}
 import org.apache.spark.sql.util.ScalaExtensions._

 import org.apache.hadoop.fs.Path

 import org.apache.spark.SparkThrowable
 import org.apache.spark.internal.{LoggingShims, MDC, MessageWithContext}
-import org.apache.spark.util.Utils

 /**
  * Convenience wrappers for logging that include delta specific options and
@@ -153,7 +153,7 @@ trait DeltaLogging
 data: AnyRef = null,
 path: Option[Path] = None)
 : Unit = {
-if (Utils.isTesting) {
+if (DeltaUtils.isTesting) {
 assert(check, msg)
 } else if (!check) {
 recordDeltaEvent(

spark/src/main/scala/org/apache/spark/sql/delta/schema/SchemaUtils.scala (+2 −2)

@@ -23,6 +23,7 @@ import scala.util.control.NonFatal
 import org.apache.spark.sql.delta.{DeltaAnalysisException, DeltaColumnMappingMode, DeltaErrors, DeltaLog, GeneratedColumn, NoMapping, TypeWidening, TypeWideningMode}
 import org.apache.spark.sql.delta.{RowCommitVersion, RowId}
 import org.apache.spark.sql.delta.ClassicColumnConversions._
+import org.apache.spark.sql.delta.util.{Utils => DeltaUtils}
 import org.apache.spark.sql.delta.actions.Protocol
 import org.apache.spark.sql.delta.commands.cdc.CDCReader
 import org.apache.spark.sql.delta.logging.DeltaLogKeys
@@ -43,7 +44,6 @@ import org.apache.spark.sql.execution.streaming.IncrementalExecution
 import org.apache.spark.sql.functions.{col, struct}
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types._
-import org.apache.spark.util.Utils

 object SchemaUtils extends DeltaLogging {
 // We use case insensitive resolution while writing into Delta
@@ -290,7 +290,7 @@ def normalizeColumnNamesInDataType(
 // The integral types can be cast to each other later on.
 sourceDataType
 case _ =>
-if (Utils.isTesting) {
+if (DeltaUtils.isTesting) {
 assert(sourceDataType == tableDataType,
 s"Types without nesting should match but $sourceDataType != $tableDataType")
 } else if (sourceDataType != tableDataType) {

spark/src/main/scala/org/apache/spark/sql/delta/skipping/clustering/ZCube.scala (+2 −3)

@@ -20,18 +20,17 @@ import scala.collection.mutable.ArrayBuffer

 import org.apache.spark.sql.delta.actions.AddFile
 import org.apache.spark.sql.delta.commands.optimize.AddFileWithNumRecords
+import org.apache.spark.sql.delta.util.{Utils => DeltaUtils}
 import org.apache.spark.sql.delta.zorder.ZCubeInfo
 import org.apache.spark.sql.delta.zorder.ZCubeInfo.{getForFile => getZCubeInfo}

-import org.apache.spark.util.Utils
-
 /**
  * Collection of files that were produced by the same job in a run of the clustering command.
  */
 case class ZCube(files: Seq[AddFile]) {
 require(files.nonEmpty)

-if (Utils.isTesting) {
+if (DeltaUtils.isTesting) {
 assert(files.forall(getZCubeInfo(_) == Some(zCubeInfo)))
 }

spark/src/main/scala/org/apache/spark/sql/delta/sources/DeltaSQLConf.scala (+4 −4)

@@ -24,7 +24,7 @@ import org.apache.spark.network.util.ByteUnit
 import org.apache.spark.sql.catalyst.FileSourceOptions
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.storage.StorageLevel
-import org.apache.spark.util.Utils
+import org.apache.spark.sql.delta.util.{Utils => DeltaUtils}

 /**
  * [[SQLConf]] entries for Delta features.
@@ -2423,21 +2423,21 @@ trait DeltaSQLConfBase {
 .internal()
 .doc("If true, post-commit hooks will by default throw an exception when they fail.")
 .booleanConf
-.createWithDefault(Utils.isTesting)
+.createWithDefault(DeltaUtils.isTesting)

 val TEST_FILE_NAME_PREFIX =
 buildStaticConf("testOnly.dataFileNamePrefix")
 .internal()
 .doc("[TEST_ONLY]: The prefix to use for the names of all Parquet data files.")
 .stringConf
-.createWithDefault(if (Utils.isTesting) "test%file%prefix-" else "")
+.createWithDefault(if (DeltaUtils.isTesting) "test%file%prefix-" else "")

 val TEST_DV_NAME_PREFIX =
 buildStaticConf("testOnly.dvFileNamePrefix")
 .internal()
 .doc("[TEST_ONLY]: The prefix to use for the names of all Deletion Vector files.")
 .stringConf
-.createWithDefault(if (Utils.isTesting) "test%dv%prefix-" else "")
+.createWithDefault(if (DeltaUtils.isTesting) "test%dv%prefix-" else "")

 ///////////
 // UTC TIMESTAMP PARTITION VALUES
