From f569a328ca893d7c1dfc65b258061a83c879e7fe Mon Sep 17 00:00:00 2001
From: Sami Dalouche
Date: Fri, 28 Jul 2023 10:28:02 -0600
Subject: [PATCH] Convert codebase to scala 2.13

---
 pom.xml                                            |  6 +--
 .../com/amazon/deequ/VerificationResult.scala      | 10 ++++-
 .../com/amazon/deequ/analyzers/Analyzer.scala      |  2 +-
 .../deequ/analyzers/QuantileNonSample.scala        |  2 +-
 .../deequ/analyzers/StateProvider.scala            |  5 ++-
 .../analyzers/runners/AnalyzerContext.scala        | 10 ++++-
 .../OnlineNormalStrategy.scala                     |  4 +-
 .../seasonal/HoltWinters.scala                     |  2 +-
 .../deequ/profiles/ColumnProfiler.scala            |  2 +-
 .../repository/AnalysisResultSerde.scala           | 16 ++++----
 ...tricsRepositoryMultipleResultsLoader.scala      |  4 +-
 .../fs/FileSystemMetricsRepository.scala           |  3 +-
 .../memory/InMemoryMetricsRepository.scala         |  5 ++-
 .../ConstraintSuggestionRunner.scala               |  2 +-
 .../amazon/deequ/KLL/KLLDistanceTest.scala         |  6 +--
 .../amazon/deequ/VerificationResultTest.scala      | 18 +++++----
 .../deequ/analyzers/AnalyzerTests.scala            | 32 ++++++++--------
 .../runners/AnalysisRunnerTests.scala              |  5 ++-
 .../runners/AnalyzerContextTest.scala              | 37 ++++++++++---------
 .../repository/AnalysisResultSerdeTest.scala       | 19 +++++-----
 ...sRepositoryMultipleResultsLoaderTest.scala      | 26 ++++++-------
 21 files changed, 120 insertions(+), 96 deletions(-)

diff --git a/pom.xml b/pom.xml
index 39545d252..b6b98133e 100644
--- a/pom.xml
+++ b/pom.xml
@@ -13,8 +13,8 @@
     1.8
     UTF-8
 
-    2.12
-    ${scala.major.version}.10
+    2.13
+    ${scala.major.version}.11
     ${scala.major.version}
 
     4.8.1
@@ -103,7 +103,7 @@
         org.scalanlp
         breeze_${scala.major.version}
-        0.13.2
+        1.2

diff --git a/src/main/scala/com/amazon/deequ/VerificationResult.scala b/src/main/scala/com/amazon/deequ/VerificationResult.scala
index 6390db821..add26138c 100644
--- a/src/main/scala/com/amazon/deequ/VerificationResult.scala
+++ b/src/main/scala/com/amazon/deequ/VerificationResult.scala
@@ -186,6 +186,14 @@ object VerificationResult {
     }
   }
 
-  private[this] case class SimpleCheckResultOutput(checkDescription: String, checkLevel: String,
+  // remove private as it breaks something with Scala 2.13:
+  // java.lang.RuntimeException: Error while encoding: java.util.concurrent.ExecutionException:
+  // org.codehaus.commons.compiler.CompileException:
+  // File 'generated.java', Line 105, Column 25: failed to compile:
+  // org.codehaus.commons.compiler.CompileException: File 'generated.java',
+  // Line 105, Column 25: No applicable constructor/method found for zero actual parameters; candidates are:
+  // "public java.lang.String com.amazon.deequ.VerificationResult$SimpleCheckResultOutput.constraintStatus()"
+  // private[this]
+  case class SimpleCheckResultOutput(checkDescription: String, checkLevel: String,
     checkStatus: String, constraint: String, constraintStatus: String, constraintMessage: String)
 }

diff --git a/src/main/scala/com/amazon/deequ/analyzers/Analyzer.scala b/src/main/scala/com/amazon/deequ/analyzers/Analyzer.scala
index b3e44c4c7..cb3e9b93a 100644
--- a/src/main/scala/com/amazon/deequ/analyzers/Analyzer.scala
+++ b/src/main/scala/com/amazon/deequ/analyzers/Analyzer.scala
@@ -453,7 +453,7 @@ private[deequ] object Analyzers {
     if (nullInResult) {
       None
     } else {
-      Option(func(Unit))
+      Option(func(()))
     }
   }

diff --git a/src/main/scala/com/amazon/deequ/analyzers/QuantileNonSample.scala b/src/main/scala/com/amazon/deequ/analyzers/QuantileNonSample.scala
index a39b2d672..fd8942c79 100644
--- a/src/main/scala/com/amazon/deequ/analyzers/QuantileNonSample.scala
+++ b/src/main/scala/com/amazon/deequ/analyzers/QuantileNonSample.scala
@@ -52,7 +52,7 @@ class QuantileNonSample[T](
     this.shrinkingFactor = shrinkingFactor
     compactors = ArrayBuffer.fill(data.length)(new NonSampleCompactor[T])
     for (i <- data.indices) {
-      compactors(i).buffer = data(i).to[ArrayBuffer]
+      compactors(i).buffer = ArrayBuffer.from(data(i)) // data(i).to[ArrayBuffer]
     }
     curNumOfCompactors = data.length
     compactorActualSize = getCompactorItemsCount

diff --git a/src/main/scala/com/amazon/deequ/analyzers/StateProvider.scala b/src/main/scala/com/amazon/deequ/analyzers/StateProvider.scala
index 65edb9424..dd696c56a 100644
--- a/src/main/scala/com/amazon/deequ/analyzers/StateProvider.scala
+++ b/src/main/scala/com/amazon/deequ/analyzers/StateProvider.scala
@@ -24,7 +24,7 @@ import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.catalyst.expressions.aggregate.{ApproximatePercentile, DeequHyperLogLogPlusPlusUtils}
 import org.apache.spark.sql.SaveMode
 
-import scala.collection.JavaConversions._
+import scala.jdk.CollectionConverters._
 import scala.util.hashing.MurmurHash3
 
 private object StateInformation {
@@ -58,7 +58,8 @@ case class InMemoryStateProvider() extends StateLoader with StatePersister {
 
   override def toString: String = {
     val buffer = new StringBuilder()
-    statesByAnalyzer.foreach { case (analyzer, state) =>
+
+    statesByAnalyzer.asScala.foreach { case (analyzer, state) =>
       buffer.append(analyzer.toString)
       buffer.append(" => ")
       buffer.append(state.toString)

diff --git a/src/main/scala/com/amazon/deequ/analyzers/runners/AnalyzerContext.scala b/src/main/scala/com/amazon/deequ/analyzers/runners/AnalyzerContext.scala
index e66bf5f0d..a280d9f38 100644
--- a/src/main/scala/com/amazon/deequ/analyzers/runners/AnalyzerContext.scala
+++ b/src/main/scala/com/amazon/deequ/analyzers/runners/AnalyzerContext.scala
@@ -124,13 +124,19 @@ object AnalyzerContext {
     }
   }
 
-  private[this] case class SimpleMetricOutput(
+  // when using private, leads to
+  // No applicable constructor/method
+  // private[this]
+  case class SimpleMetricOutput(
     entity: String,
     instance: String,
     name: String,
    value: Double)
 
-  private[this] object SimpleMetricOutput {
+  // when using private, leads to
+  // No applicable constructor/method
+  // private[this]
+  object SimpleMetricOutput {
 
     def apply(doubleMetric: DoubleMetric): SimpleMetricOutput = {
       SimpleMetricOutput(

diff --git a/src/main/scala/com/amazon/deequ/anomalydetection/OnlineNormalStrategy.scala b/src/main/scala/com/amazon/deequ/anomalydetection/OnlineNormalStrategy.scala
index 8bf8b634c..0e884fa51 100644
--- a/src/main/scala/com/amazon/deequ/anomalydetection/OnlineNormalStrategy.scala
+++ b/src/main/scala/com/amazon/deequ/anomalydetection/OnlineNormalStrategy.scala
@@ -116,12 +116,12 @@ case class OnlineNormalStrategy(
         ret += OnlineCalculationResult(currentMean, stdDev, isAnomaly = true)
       }
     }
-    ret
+    ret.toSeq
   }
 
   /**
    * Search for anomalies in a series of data points.
    *
    * @param dataSeries The data contained in a Vector of Doubles
    * @param searchInterval The indices between which anomalies should be detected. [a, b).
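A quick aside on the idioms introduced above, since they recur throughout the patch: Scala 2.13 removes the implicit `scala.collection.JavaConversions` in favor of the explicit `.asScala`/`.asJava` decorators from `scala.jdk.CollectionConverters`, the type-parameterized `coll.to[ArrayBuffer]` becomes the factory-based `ArrayBuffer.from(coll)`, and `func(())` passes the unit value `()` directly instead of relying on the compiler to adapt the `Unit` companion object. A minimal, self-contained sketch of the first two patterns (names here are hypothetical, not taken from deequ):

    import java.util.concurrent.ConcurrentHashMap
    import scala.collection.mutable.ArrayBuffer
    import scala.jdk.CollectionConverters._

    object CollectionConvertersSketch {
      def main(args: Array[String]): Unit = {
        val statesByAnalyzer = new ConcurrentHashMap[String, Double]()
        statesByAnalyzer.put("Size", 4.0)
        statesByAnalyzer.put("Completeness", 1.0)

        // 2.12 let the Java map be used as a Scala map implicitly;
        // 2.13 requires requesting the Scala view explicitly:
        statesByAnalyzer.asScala.foreach { case (analyzer, state) =>
          println(s"$analyzer => $state")
        }

        // 2.13 factory-based conversion, replacing data.to[ArrayBuffer]:
        val buffer = ArrayBuffer.from(Seq(1, 2, 3))
        println(buffer)
      }
    }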
diff --git a/src/main/scala/com/amazon/deequ/anomalydetection/seasonal/HoltWinters.scala b/src/main/scala/com/amazon/deequ/anomalydetection/seasonal/HoltWinters.scala
index 0ee0ac25f..dc19ab275 100644
--- a/src/main/scala/com/amazon/deequ/anomalydetection/seasonal/HoltWinters.scala
+++ b/src/main/scala/com/amazon/deequ/anomalydetection/seasonal/HoltWinters.scala
@@ -132,7 +132,7 @@ class HoltWinters(
     }
 
     val forecasted = Y.drop(series.size)
-    ModelResults(forecasted, level, trend, seasonality, residuals)
+    ModelResults(forecasted.toSeq, level.toSeq, trend.toSeq, seasonality.toSeq, residuals.toSeq)
   }
 
   private def modelSelectionFor(

diff --git a/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala b/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala
index 57c8c3019..8c834db79 100644
--- a/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala
+++ b/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala
@@ -347,7 +347,7 @@ object ColumnProfiler {
             Histogram(histogram.column).equals(histogram)
           case _ => false
         }
-      analyzerContextExistingValues = AnalyzerContext(relevantEntries)
+      analyzerContextExistingValues = AnalyzerContext(relevantEntries.toMap)
     }
   }
 }

diff --git a/src/main/scala/com/amazon/deequ/repository/AnalysisResultSerde.scala b/src/main/scala/com/amazon/deequ/repository/AnalysisResultSerde.scala
index 4a2ca1058..35c087084 100644
--- a/src/main/scala/com/amazon/deequ/repository/AnalysisResultSerde.scala
+++ b/src/main/scala/com/amazon/deequ/repository/AnalysisResultSerde.scala
@@ -27,14 +27,13 @@ import com.google.gson._
 import com.google.gson.reflect.TypeToken
 
 import scala.collection._
-import scala.collection.JavaConverters._
 import java.util.{ArrayList => JArrayList, HashMap => JHashMap, List => JList, Map => JMap}
 import JsonSerializationConstants._
 import com.amazon.deequ.analyzers.Histogram.{AggregateFunction, Count => HistogramCount, Sum => HistogramSum}
 import org.apache.spark.sql.Column
 import org.apache.spark.sql.functions.expr
 
-import scala.collection.JavaConversions._
+import scala.jdk.CollectionConverters._
 
 private[repository] object JsonSerializationConstants {
@@ -70,6 +69,7 @@ private[deequ] object SimpleResultSerde {
       .asInstanceOf[JArrayList[JMap[String, String]]]
       .asScala
       .map(map => immutable.Map(map.asScala.toList: _*))
+      .toSeq
   }
 }
@@ -422,22 +422,22 @@ private[deequ] object AnalyzerDeserializer
         getOptionalWhereParam(json))
 
       case "CountDistinct" =>
-        CountDistinct(getColumnsAsSeq(context, json))
+        CountDistinct(getColumnsAsSeq(context, json).toSeq)
 
       case "Distinctness" =>
-        Distinctness(getColumnsAsSeq(context, json))
+        Distinctness(getColumnsAsSeq(context, json).toSeq)
 
       case "Entropy" =>
         Entropy(json.get(COLUMN_FIELD).getAsString)
 
       case "MutualInformation" =>
-        MutualInformation(getColumnsAsSeq(context, json))
+        MutualInformation(getColumnsAsSeq(context, json).toSeq)
 
       case "UniqueValueRatio" =>
-        UniqueValueRatio(getColumnsAsSeq(context, json))
+        UniqueValueRatio(getColumnsAsSeq(context, json).toSeq)
 
       case "Uniqueness" =>
-        Uniqueness(getColumnsAsSeq(context, json))
+        Uniqueness(getColumnsAsSeq(context, json).toSeq)
 
       case "Histogram" =>
         Histogram(
@@ -598,7 +598,7 @@ private[deequ] object MetricDeserializer extends JsonDeserializer[Metric[_]] {
       val instance = jsonObject.get("instance").getAsString
       if (jsonObject.has("value")) {
         val entries = jsonObject.get("value").getAsJsonObject
-        val values = entries.entrySet().map { entry =>
+        val values = entries.entrySet().asScala.map { entry =>
           entry.getKey -> entry.getValue.getAsDouble
         }
         .toMap

diff --git a/src/main/scala/com/amazon/deequ/repository/MetricsRepositoryMultipleResultsLoader.scala b/src/main/scala/com/amazon/deequ/repository/MetricsRepositoryMultipleResultsLoader.scala
index 3b9abc5b3..8351fe0b1 100644
--- a/src/main/scala/com/amazon/deequ/repository/MetricsRepositoryMultipleResultsLoader.scala
+++ b/src/main/scala/com/amazon/deequ/repository/MetricsRepositoryMultipleResultsLoader.scala
@@ -99,8 +99,8 @@ private[repository] object MetricsRepositoryMultipleResultsLoader {
 
   def jsonUnion(jsonOne: String, jsonTwo: String): String = {
 
-    val objectOne: Seq[Map[String, Any]] = SimpleResultSerde.deserialize(jsonOne)
-    val objectTwo: Seq[Map[String, Any]] = SimpleResultSerde.deserialize(jsonTwo)
+    val objectOne: Seq[Map[String, Any]] = SimpleResultSerde.deserialize(jsonOne).toSeq
+    val objectTwo: Seq[Map[String, Any]] = SimpleResultSerde.deserialize(jsonTwo).toSeq
 
     val columnsTotal = objectOne.headOption.getOrElse(Map.empty).keySet ++
       objectTwo.headOption.getOrElse(Map.empty).keySet

diff --git a/src/main/scala/com/amazon/deequ/repository/fs/FileSystemMetricsRepository.scala b/src/main/scala/com/amazon/deequ/repository/fs/FileSystemMetricsRepository.scala
index 7b1101017..6668c90d1 100644
--- a/src/main/scala/com/amazon/deequ/repository/fs/FileSystemMetricsRepository.scala
+++ b/src/main/scala/com/amazon/deequ/repository/fs/FileSystemMetricsRepository.scala
@@ -153,10 +153,11 @@ class FileSystemMetricsRepositoryMultipleResultsLoader(
           .metricMap
           .filterKeys(analyzer => forAnalyzers.isEmpty || forAnalyzers.get.contains(analyzer))
 
-        val requestedAnalyzerContext = AnalyzerContext(requestedMetrics)
+        val requestedAnalyzerContext = AnalyzerContext(requestedMetrics.toMap)
 
         AnalysisResult(analysisResult.resultKey, requestedAnalyzerContext)
       }
+      .toSeq
   }
 }

diff --git a/src/main/scala/com/amazon/deequ/repository/memory/InMemoryMetricsRepository.scala b/src/main/scala/com/amazon/deequ/repository/memory/InMemoryMetricsRepository.scala
index 61ad1e1ee..b5888ac08 100644
--- a/src/main/scala/com/amazon/deequ/repository/memory/InMemoryMetricsRepository.scala
+++ b/src/main/scala/com/amazon/deequ/repository/memory/InMemoryMetricsRepository.scala
@@ -21,7 +21,7 @@ import com.amazon.deequ.metrics.Metric
 import com.amazon.deequ.repository._
 import com.amazon.deequ.analyzers.runners.AnalyzerContext
 
-import scala.collection.JavaConversions._
+import scala.jdk.CollectionConverters._
 import java.util.concurrent.ConcurrentHashMap
 
 /** A simple Repository implementation backed by a concurrent hash map */
@@ -118,6 +118,7 @@ class LimitedInMemoryMetricsRepositoryMultipleResultsLoader(
   /** Get the AnalysisResult */
   def get(): Seq[AnalysisResult] = {
     resultsRepository
+      .asScala
       .filterKeys(key => after.isEmpty || after.get <= key.dataSetDate)
       .filterKeys(key => before.isEmpty || key.dataSetDate <= before.get)
       .filterKeys(key => tagValues.isEmpty || tagValues.get.toSet.subsetOf(key.tags.toSet))
@@ -129,7 +130,7 @@ class LimitedInMemoryMetricsRepositoryMultipleResultsLoader(
           .metricMap
           .filterKeys(analyzer => forAnalyzers.isEmpty || forAnalyzers.get.contains(analyzer))
 
-        AnalysisResult(analysisResult.resultKey, AnalyzerContext(requestedMetrics))
+        AnalysisResult(analysisResult.resultKey, AnalyzerContext(requestedMetrics.toMap))
       }
       .toSeq
   }
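The recurring `.toMap` calls in the repository loaders above trace back to another 2.13 change: `Map.filterKeys` is deprecated, and the view-based replacement returns a lazy `MapView` rather than a strict `Map`. A small sketch of the pattern, with invented metric names:

    import scala.collection.MapView

    object FilterKeysSketch {
      def main(args: Array[String]): Unit = {
        val metricMap = Map("Size" -> 4.0, "Completeness" -> 1.0)

        // On 2.13 this is a lazy view over the original map, not a Map:
        val requested: MapView[String, Double] =
          metricMap.view.filterKeys(name => name == "Size")

        // Materialize before passing it where a strict Map is expected,
        // e.g. a constructor parameter or field typed as Map:
        val strict: Map[String, Double] = requested.toMap
        println(strict) // Map(Size -> 4.0)
      }
    }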
diff --git a/src/main/scala/com/amazon/deequ/suggestions/ConstraintSuggestionRunner.scala b/src/main/scala/com/amazon/deequ/suggestions/ConstraintSuggestionRunner.scala
index de915956d..c230872ec 100644
--- a/src/main/scala/com/amazon/deequ/suggestions/ConstraintSuggestionRunner.scala
+++ b/src/main/scala/com/amazon/deequ/suggestions/ConstraintSuggestionRunner.scala
@@ -142,7 +142,7 @@ class ConstraintSuggestionRunner {
         groupedSuggestionsWithColumnNames.map { case (_, suggestion) => suggestion }
       }
     ConstraintSuggestionResult(columnProfiles.profiles, columnProfiles.numRecords,
-      columnsWithSuggestions, verificationResult)
+      columnsWithSuggestions.toMap, verificationResult)
   }
 
   private[this] def splitTrainTestSets(

diff --git a/src/test/scala/com/amazon/deequ/KLL/KLLDistanceTest.scala b/src/test/scala/com/amazon/deequ/KLL/KLLDistanceTest.scala
index 20017fa71..d60bd0b76 100644
--- a/src/test/scala/com/amazon/deequ/KLL/KLLDistanceTest.scala
+++ b/src/test/scala/com/amazon/deequ/KLL/KLLDistanceTest.scala
@@ -88,7 +88,7 @@ class KLLDistanceTest extends WordSpec with SparkContextSpec
     val sample2 = scala.collection.mutable.Map(
       "a" -> 22L, "b" -> 20L, "c" -> 25L, "d" -> 12L, "e" -> 13L, "f" -> 15L)
     val distance = Distance.categoricalDistance(sample1, sample2, method = LInfinityMethod(alpha = Some(0.003)))
-    assert(distance == 0.2726338046550349)
+    assert(distance == 0.27263380465503484)
   }
 
   "Categorial distance should compute correct linf_robust with different alpha value .1" in {
@@ -137,7 +137,7 @@ class KLLDistanceTest extends WordSpec with SparkContextSpec
       "a" -> 100L, "b" -> 22L, "c" -> 25L, "d" -> 5L, "e" -> 13L, "f" -> 2L)
     val distance = Distance.categoricalDistance(
       sample1, sample2, correctForLowNumberOfSamples = true, method = ChisquareMethod())
-    assert(distance == 8.789790456457125)
+    assert(distance == 8.789790456457123)
   }
 
   "Categorical distance should compute correct chisquare distance (low samples) with regrouping (yates)" in {
@@ -170,7 +170,7 @@ class KLLDistanceTest extends WordSpec with SparkContextSpec
       "a" -> 100L, "b" -> 4L, "c" -> 3L, "d" -> 27L, "e" -> 20L, "f" -> 20L, "g" -> 20L, "h" -> 20L)
     val distance = Distance.categoricalDistance(
       sample, baseline, correctForLowNumberOfSamples = true, method = ChisquareMethod())
-    assert(distance == 6.827423492761593)
+    assert(distance == 6.827423492761592)
   }
 
   "Categorical distance should compute correct chisquare distance (low samples) " +

diff --git a/src/test/scala/com/amazon/deequ/VerificationResultTest.scala b/src/test/scala/com/amazon/deequ/VerificationResultTest.scala
index 93aa73201..36079fe11 100644
--- a/src/test/scala/com/amazon/deequ/VerificationResultTest.scala
+++ b/src/test/scala/com/amazon/deequ/VerificationResultTest.scala
@@ -79,12 +79,13 @@ class VerificationResultTest extends WordSpec with Matchers with SparkContextSpe
     val successMetricsResultsJson = VerificationResult.successMetricsAsJson(results)
 
     val expectedJson =
-      """[{"entity":"Column","instance":"item","name":"Distinctness","value":1.0},
+      """[
+        |{"entity":"Multicolumn","instance":"att1,att2","name":"Uniqueness","value":0.25},
+        |{"entity":"Column","instance":"item","name":"Distinctness","value":1.0},
         |{"entity": "Column", "instance":"att2","name":"Completeness","value":1.0},
-        |{"entity":"Column","instance":"att1","name":"Completeness","value":1.0},
-        |{"entity":"Multicolumn","instance":"att1,att2",
-        |"name":"Uniqueness","value":0.25},
-        |{"entity":"Dataset","instance":"*","name":"Size","value":4.0}]"""
+        |{"entity":"Dataset","instance":"*","name":"Size","value":4.0},
+        |{"entity":"Column","instance":"att1","name":"Completeness","value":1.0}
+        |]"""
       .stripMargin.replaceAll("\n", "")
 
     assertSameResultsJson(successMetricsResultsJson, expectedJson)
@@ -102,9 +103,10 @@ class VerificationResultTest extends WordSpec with Matchers with SparkContextSpe
       VerificationResult.successMetricsAsJson(results, metricsForAnalyzers)
 
     val expectedJson =
-      """[{"entity":"Column","instance":"att1","name":"Completeness","value":1.0},
-        |{"entity":"Multicolumn","instance":"att1,att2",
-        |"name":"Uniqueness","value":0.25}]"""
+      """[
+        |{"entity":"Multicolumn","instance":"att1,att2","name":"Uniqueness","value":0.25},
+        |{"entity":"Column","instance":"att1","name":"Completeness","value":1.0}
+        |]"""
       .stripMargin.replaceAll("\n", "")
 
     assertSameResultsJson(successMetricsResultsJson, expectedJson)
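The reshuffled `expectedJson` literals here (and in the analyzer and repository tests further down) are not behavior changes: the metrics live in unordered sets and maps, 2.13 changed the iteration order of small immutable hash collections, and the serialized arrays simply come out in a different order. The assertions already compare order-insensitively via `assertSameResultsJson`. A rough sketch of that style of comparison (a hypothetical helper, not the project's own, which would parse with a JSON library rather than a regex):

    object OrderInsensitiveJsonSketch {
      // Crude order-insensitive comparison of a flat JSON array:
      // split the array into its objects and compare them as a Set.
      def sameResults(left: String, right: String): Boolean = {
        def objects(json: String): Set[String] =
          json.trim.stripPrefix("[").stripSuffix("]")
            .split("(?<=\\})\\s*,\\s*(?=\\{)").toSet
        objects(left) == objects(right)
      }

      def main(args: Array[String]): Unit = {
        val on212 = """[{"name":"Size","value":4.0},{"name":"Completeness","value":1.0}]"""
        val on213 = """[{"name":"Completeness","value":1.0},{"name":"Size","value":4.0}]"""
        println(sameResults(on212, on213)) // true
      }
    }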
diff --git a/src/test/scala/com/amazon/deequ/analyzers/AnalyzerTests.scala b/src/test/scala/com/amazon/deequ/analyzers/AnalyzerTests.scala
index 03787b886..b7b2dcb7c 100644
--- a/src/test/scala/com/amazon/deequ/analyzers/AnalyzerTests.scala
+++ b/src/test/scala/com/amazon/deequ/analyzers/AnalyzerTests.scala
@@ -17,6 +17,7 @@
 package com.amazon.deequ
 package analyzers
 
+import com.amazon.deequ
 import com.amazon.deequ.analyzers.runners.NoSuchColumnException
 import com.amazon.deequ.metrics.Distribution
 import com.amazon.deequ.metrics.DistributionValue
@@ -349,6 +350,7 @@ class AnalyzerTests extends AnyWordSpec with Matchers with SparkContextSpec with
       val dataTypes = DataTypeInstances.values.map { _.toString }
 
       val zeros = dataTypes
+        .unsorted
         .diff { nonZeroValuesWithStringKeys.map { case (distKey, _) => distKey }.toSet }
         .map(dataType => dataType -> DistributionValue(0, 0.0))
         .toSeq
@@ -361,33 +363,33 @@ class AnalyzerTests extends AnyWordSpec with Matchers with SparkContextSpec with
 
     "fail for non-atomic columns" in withSparkSession { sparkSession =>
       val df = getDfWithNestedColumn(sparkSession)
 
-      assert(DataType("source").calculate(df).value.isFailure)
+      assert(deequ.analyzers.DataType("source").calculate(df).value.isFailure)
     }
 
     "fall back to String in case no known data type matched" in withSparkSession { sparkSession =>
       val df = getDfFull(sparkSession)
-      DataType("att1").calculate(df).value shouldBe
+      deequ.analyzers.DataType("att1").calculate(df).value shouldBe
         Success(distributionFrom(DataTypeInstances.String -> DistributionValue(4, 1.0)))
     }
 
     "detect integral type correctly" in withSparkSession { sparkSession =>
       val df = getDfWithNumericValues(sparkSession)
       val expectedResult = distributionFrom(DataTypeInstances.Integral -> DistributionValue(6, 1.0))
-      DataType("att1").calculate(df).value shouldBe Success(expectedResult)
+      deequ.analyzers.DataType("att1").calculate(df).value shouldBe Success(expectedResult)
     }
 
     "detect integral type correctly for negative numbers" in withSparkSession { sparkSession =>
       val df = getDfWithNegativeNumbers(sparkSession)
       val expectedResult = distributionFrom(DataTypeInstances.Integral -> DistributionValue(4, 1.0))
-      DataType("att1").calculate(df).value shouldBe Success(expectedResult)
+      deequ.analyzers.DataType("att1").calculate(df).value shouldBe Success(expectedResult)
     }
 
     "detect fractional type correctly for negative numbers" in withSparkSession { sparkSession =>
       val df = getDfWithNegativeNumbers(sparkSession)
       val expectedResult = distributionFrom(DataTypeInstances.Fractional -> DistributionValue(4, 1.0))
-      DataType("att2").calculate(df).value shouldBe Success(expectedResult)
+      deequ.analyzers.DataType("att2").calculate(df).value shouldBe Success(expectedResult)
     }
 
@@ -396,14 +398,14 @@ class AnalyzerTests extends AnyWordSpec with Matchers with SparkContextSpec with
         .withColumn("att1_float", col("att1").cast(FloatType))
       val expectedResult =
         distributionFrom(DataTypeInstances.Fractional -> DistributionValue(6, 1.0))
-      DataType("att1_float").calculate(df).value shouldBe Success(expectedResult)
+      deequ.analyzers.DataType("att1_float").calculate(df).value shouldBe Success(expectedResult)
     }
 
     "detect integral type in string column" in withSparkSession { sparkSession =>
       val df = getDfWithNumericValues(sparkSession)
         .withColumn("att1_str", col("att1").cast(StringType))
       val expectedResult = distributionFrom(DataTypeInstances.Integral -> DistributionValue(6, 1.0))
-      DataType("att1_str").calculate(df).value shouldBe Success(expectedResult)
+      deequ.analyzers.DataType("att1_str").calculate(df).value shouldBe Success(expectedResult)
     }
 
     "detect fractional type in string column" in withSparkSession { sparkSession =>
@@ -412,19 +414,19 @@ class AnalyzerTests extends AnyWordSpec with Matchers with SparkContextSpec with
       val expectedResult =
         distributionFrom(DataTypeInstances.Fractional -> DistributionValue(6, 1.0))
-      DataType("att1_str").calculate(df).value shouldBe Success(expectedResult)
+      deequ.analyzers.DataType("att1_str").calculate(df).value shouldBe Success(expectedResult)
     }
 
     "fall back to string in case the string column didn't match " +
       " any known other data type" in withSparkSession { sparkSession =>
       val df = getDfFull(sparkSession)
       val expectedResult = distributionFrom(DataTypeInstances.String -> DistributionValue(4, 1.0))
-      DataType("att1").calculate(df).value shouldBe Success(expectedResult)
+      deequ.analyzers.DataType("att1").calculate(df).value shouldBe Success(expectedResult)
     }
 
     "detect fractional for mixed fractional and integral" in withSparkSession { sparkSession =>
       val df = getDfFractionalIntegralTypes(sparkSession)
-      DataType("att1").calculate(df).value shouldBe Success(
+      deequ.analyzers.DataType("att1").calculate(df).value shouldBe Success(
         distributionFrom(
           DataTypeInstances.Fractional -> DistributionValue(1, 0.5),
           DataTypeInstances.Integral -> DistributionValue(1, 0.5)
@@ -434,7 +436,7 @@ class AnalyzerTests extends AnyWordSpec with Matchers with SparkContextSpec with
 
     "fall back to string for mixed fractional and string" in withSparkSession { sparkSession =>
       val df = getDfFractionalStringTypes(sparkSession)
-      DataType("att1").calculate(df).value shouldBe Success(
+      deequ.analyzers.DataType("att1").calculate(df).value shouldBe Success(
         distributionFrom(
           DataTypeInstances.Fractional -> DistributionValue(1, 0.5),
           DataTypeInstances.String -> DistributionValue(1, 0.5)
@@ -444,7 +446,7 @@ class AnalyzerTests extends AnyWordSpec with Matchers with SparkContextSpec with
 
     "fall back to string for mixed integral and string" in withSparkSession { sparkSession =>
       val df = getDfIntegralStringTypes(sparkSession)
-      DataType("att1").calculate(df).value shouldBe Success(
+      deequ.analyzers.DataType("att1").calculate(df).value shouldBe Success(
         distributionFrom(
           DataTypeInstances.Integral -> DistributionValue(1, 0.5),
           DataTypeInstances.String -> DistributionValue(1, 0.5)
@@ -454,7 +456,7 @@ class AnalyzerTests extends AnyWordSpec with Matchers with SparkContextSpec with
 
     "integral for numeric and null" in withSparkSession { sparkSession =>
       val df = getDfWithUniqueColumns(sparkSession)
-      DataType("uniqueWithNulls").calculate(df).value shouldBe Success(
+      deequ.analyzers.DataType("uniqueWithNulls").calculate(df).value shouldBe Success(
         distributionFrom(
           DataTypeInstances.Unknown -> DistributionValue(1, 1.0/6.0),
           DataTypeInstances.Integral -> DistributionValue(5, 5.0/6.0)
@@ -471,7 +473,7 @@ class AnalyzerTests extends AnyWordSpec with Matchers with SparkContextSpec with
       val expectedResult = distributionFrom(DataTypeInstances.Boolean -> DistributionValue(2, 1.0))
 
-      DataType("att1").calculate(df).value shouldBe Success(expectedResult)
+      deequ.analyzers.DataType("att1").calculate(df).value shouldBe Success(expectedResult)
     }
 
     "fall back to string for boolean and null" in withSparkSession { sparkSession =>
@@ -483,7 +485,7 @@ class AnalyzerTests extends AnyWordSpec with Matchers with SparkContextSpec with
         ("4", "2.0")
       ).toDF("item", "att1")
 
-      DataType("att1").calculate(df).value shouldBe Success(
+      deequ.analyzers.DataType("att1").calculate(df).value shouldBe Success(
         distributionFrom(
           DataTypeInstances.Fractional -> DistributionValue(1, 0.25),
           DataTypeInstances.Unknown -> DistributionValue(1, 0.25),

diff --git a/src/test/scala/com/amazon/deequ/analyzers/runners/AnalysisRunnerTests.scala b/src/test/scala/com/amazon/deequ/analyzers/runners/AnalysisRunnerTests.scala
index 4ffc9eeb9..3e02c1071 100644
--- a/src/test/scala/com/amazon/deequ/analyzers/runners/AnalysisRunnerTests.scala
+++ b/src/test/scala/com/amazon/deequ/analyzers/runners/AnalysisRunnerTests.scala
@@ -184,12 +184,13 @@ class AnalysisRunnerTests extends AnyWordSpec
       Uniqueness("att1", Some("att3 = 0")) :: Nil
 
     val (separateResults, numSeparateJobs) = sparkMonitor.withMonitoringSession { stat =>
-      val results = analyzers.map { _.calculate(df) }.toSet
+      val results: Seq[com.amazon.deequ.metrics.DoubleMetric] =
+        analyzers.map { _.calculate(df) }.toSeq.sortBy(_.name)
       (results, stat.jobCount)
     }
 
     val (runnerResults, numCombinedJobs) = sparkMonitor.withMonitoringSession { stat =>
-      val results = AnalysisRunner.onData(df).addAnalyzers(analyzers).run().allMetrics.toSet
+      val results = AnalysisRunner.onData(df).addAnalyzers(analyzers).run().allMetrics.toSeq.sortBy(_.name)
       (results, stat.jobCount)
     }
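The `AnalysisRunnerTests` change above swaps unordered `Set` comparison for sequences sorted on a stable key, which keeps the assertion order-independent while producing deterministic, readable diffs on failure. The general pattern, sketched with a stand-in type rather than deequ's `DoubleMetric`:

    final case class FakeMetric(name: String, instance: String, value: Double)

    object SortedMetricsSketch {
      def main(args: Array[String]): Unit = {
        val separateResults = Seq(
          FakeMetric("Uniqueness", "att1", 0.25),
          FakeMetric("Completeness", "att1", 1.0))

        val runnerResults = Seq(
          FakeMetric("Completeness", "att1", 1.0),
          FakeMetric("Uniqueness", "att1", 0.25))

        // Sorting by a stable key makes the comparison independent of
        // whatever iteration order the source collections happen to have.
        assert(separateResults.sortBy(_.name) == runnerResults.sortBy(_.name))
        println("results match")
      }
    }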
|{"entity":"Column","instance":"att1","name":"Histogram.bins","value":2.0}, + |{"entity":"Column","instance":"att1","name":"Histogram.abs.a","value":3.0}, + |{"entity":"Column","instance":"att1","name":"Histogram.ratio.a","value":0.75}, + |{"entity":"Column","instance":"att1","name":"Histogram.abs.b","value":1.0}, + |{"entity":"Column","instance":"att1","name":"Histogram.ratio.b","value":0.25}, + |{"entity":"Dataset","instance":"*","name":"Size","value":4.0}, + |{"entity":"Column","instance":"att1","name":"Completeness","value":1.0} + |]""" + .stripMargin.replaceAll("\n", "") assertSameJson(successMetricsResultsJson, expectedJson) } @@ -110,10 +110,11 @@ class AnalyzerContextTest extends AnyWordSpec val successMetricsResultsJson = AnalyzerContext.successMetricsAsJson(results, metricsForAnalyzers) - val expectedJson = - """[{"entity":"Column","instance":"att1","name":"Completeness","value":1.0}, - |{"entity":"Multicolumn","instance":"att1,att2", - |"name":"Uniqueness","value":0.25}]""" + val expectedJson = + """[ + |{"entity":"Multicolumn","instance":"att1,att2","name":"Uniqueness","value":0.25}, + |{"entity":"Column","instance":"att1","name":"Completeness","value":1.0} + |]""" .stripMargin.replaceAll("\n", "") assertSameJson(successMetricsResultsJson, expectedJson) diff --git a/src/test/scala/com/amazon/deequ/repository/AnalysisResultSerdeTest.scala b/src/test/scala/com/amazon/deequ/repository/AnalysisResultSerdeTest.scala index 7083a5c1d..250916aef 100644 --- a/src/test/scala/com/amazon/deequ/repository/AnalysisResultSerdeTest.scala +++ b/src/test/scala/com/amazon/deequ/repository/AnalysisResultSerdeTest.scala @@ -333,21 +333,22 @@ class SimpleResultSerdeTest extends WordSpec with Matchers with SparkContextSpec |{"dataset_date":1507975810,"entity":"Column","region":"EU", |"instance":"att2","name":"Completeness","value":1.0}, |{"dataset_date":1507975810,"entity":"Column","region":"EU", - |"instance":"att1","name":"Completeness","value":1.0}, - |{"dataset_date":1507975810,"entity":"Multicolumn","region":"EU", - |"instance":"att1,att2","name":"MutualInformation","value":0.5623351446188083}, - |{"dataset_date":1507975810,"entity":"Dataset","region":"EU", - |"instance":"*","name":"Size","value":4.0}, + |"instance":"att1","name":"MaxLength","value":1.0}, + |{"dataset_date":1507975810,"entity":"Column","region":"EU", + |"instance":"att1","name":"MinLength","value":1.0}, |{"dataset_date":1507975810,"entity":"Column","region":"EU", |"instance":"att1","name":"Uniqueness","value":0.25}, |{"dataset_date":1507975810,"entity":"Column","region":"EU", |"instance":"att1","name":"Distinctness","value":0.5}, + |{"dataset_date":1507975810,"entity":"Multicolumn","region":"EU", + |"instance":"att1,att2","name":"MutualInformation","value":0.5623351446188083}, |{"dataset_date":1507975810,"entity":"Column","region":"EU", - |"instance":"att1","name":"MinLength","value":1.0}, - |{"dataset_date":1507975810,"entity":"Column","region":"EU", - |"instance":"att1","name":"MaxLength","value":1.0}, + |"instance":"att2","name":"Uniqueness","value":0.25}, + |{"dataset_date":1507975810,"entity":"Dataset","region":"EU", + |"instance":"*","name":"Size","value":4.0}, |{"dataset_date":1507975810,"entity":"Column","region":"EU", - |"instance":"att2","name":"Uniqueness","value":0.25}]""" + |"instance":"att1","name":"Completeness","value":1.0} + |]""" .stripMargin.replaceAll("\n", "") // ordering of map entries is not guaranteed, so comparing strings is not an option diff --git 
a/src/test/scala/com/amazon/deequ/repository/MetricsRepositoryMultipleResultsLoaderTest.scala b/src/test/scala/com/amazon/deequ/repository/MetricsRepositoryMultipleResultsLoaderTest.scala index 6e61b9385..90b520c9f 100644 --- a/src/test/scala/com/amazon/deequ/repository/MetricsRepositoryMultipleResultsLoaderTest.scala +++ b/src/test/scala/com/amazon/deequ/repository/MetricsRepositoryMultipleResultsLoaderTest.scala @@ -81,8 +81,17 @@ class MetricsRepositoryMultipleResultsLoaderTest extends AnyWordSpec with Matche val analysisResultsAsJson = repository.load() .getSuccessMetricsAsJson() - val expected = - s"""[{"entity":"Dataset","instance":"*","name":"Size","value":4.0, + val expected = + s"""[{"entity":"Dataset","instance":"*","name":"Size","value":4.0, + |"region":"NA", "dataset_date":$DATE_TWO}, + |{"entity":"Column","instance":"att1","name":"Completeness","value":1.0, + |"region":"NA", "dataset_date":$DATE_TWO}, + |{"entity":"Column","instance":"item","name":"Distinctness","value":1.0, + |"region":"NA", "dataset_date":$DATE_TWO}, + |{"entity":"Multicolumn","instance":"att1,att2","name":"Uniqueness","value":0.25, + |"region":"NA", "dataset_date":$DATE_TWO}, + | + |{"entity":"Dataset","instance":"*","name":"Size","value":4.0, |"region":"EU", "dataset_date":$DATE_ONE}, |{"entity":"Column","instance":"att1","name":"Completeness","value":1.0, |"region":"EU", "dataset_date":$DATE_ONE}, @@ -90,17 +99,8 @@ class MetricsRepositoryMultipleResultsLoaderTest extends AnyWordSpec with Matche |"region":"EU", "dataset_date":$DATE_ONE}, |{"entity":"Multicolumn","instance":"att1,att2", |"name":"Uniqueness","value":0.25, - |"region":"EU", "dataset_date":$DATE_ONE}, - | - |{"entity":"Dataset","instance":"*","name":"Size","value":4.0, - |"region":"NA", "dataset_date":$DATE_TWO}, - |{"entity":"Column","instance":"att1","name":"Completeness","value":1.0, - |"region":"NA", "dataset_date":$DATE_TWO}, - |{"entity":"Column","instance":"item","name":"Distinctness","value":1.0, - |"region":"NA", "dataset_date":$DATE_TWO}, - |{"entity":"Multicolumn","instance":"att1,att2","name":"Uniqueness","value":0.25, - |"region":"NA", "dataset_date":$DATE_TWO}]""" - .stripMargin.replaceAll("\n", "") + |"region":"EU", "dataset_date":$DATE_ONE}]""" + .stripMargin.replaceAll("\n", "") assertSameJson(analysisResultsAsJson, expected) }
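Finally, the change that explains most of the `.toSeq` calls scattered through this patch: on 2.13 the default `scala.Seq` is an alias for `scala.collection.immutable.Seq`, so a mutable buffer no longer satisfies a `Seq` result type on its own. A minimal sketch:

    import scala.collection.mutable.ArrayBuffer

    object SeqAliasSketch {
      // Compiled on 2.12, where scala.Seq was collection.Seq and thus a
      // supertype of ArrayBuffer; on 2.13 the buffer has to be copied out.
      def collect(): Seq[Int] = {
        val ret = ArrayBuffer.empty[Int]
        for (i <- 1 to 3) {
          ret += i
        }
        ret.toSeq // required on 2.13: copies into an immutable Seq
      }

      def main(args: Array[String]): Unit = {
        println(collect())
      }
    }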