diff --git a/pom.xml b/pom.xml
index 39545d252..b6b98133e 100644
--- a/pom.xml
+++ b/pom.xml
@@ -13,8 +13,8 @@
1.8
UTF-8
- 2.12
- ${scala.major.version}.10
+ 2.13
+ ${scala.major.version}.11
${scala.major.version}
4.8.1
@@ -103,7 +103,7 @@
<groupId>org.scalanlp</groupId>
<artifactId>breeze_${scala.major.version}</artifactId>
- <version>0.13.2</version>
+ <version>1.2</version>
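Note on the dependency bump: Scala libraries are binary-compatible only within a Scala minor version, so artifacts carry a `_2.12`/`_2.13` suffix, here supplied by `${scala.major.version}`. For readers more familiar with sbt, a rough equivalent of this Maven change (a sketch, not part of the PR) would be:

```scala
// build.sbt sketch: `%%` appends the Scala binary suffix automatically,
// so this resolves to breeze_2.13 once scalaVersion is 2.13.x.
scalaVersion := "2.13.11"
libraryDependencies += "org.scalanlp" %% "breeze" % "1.2"
```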
diff --git a/src/main/scala/com/amazon/deequ/VerificationResult.scala b/src/main/scala/com/amazon/deequ/VerificationResult.scala
index 6390db821..add26138c 100644
--- a/src/main/scala/com/amazon/deequ/VerificationResult.scala
+++ b/src/main/scala/com/amazon/deequ/VerificationResult.scala
@@ -186,6 +186,14 @@ object VerificationResult {
}
}
- private[this] case class SimpleCheckResultOutput(checkDescription: String, checkLevel: String,
+ // removed private[this], as it breaks Spark's encoder codegen with Scala 2.13:
+ // java.lang.RuntimeException: Error while encoding: java.util.concurrent.ExecutionException:
+ // org.codehaus.commons.compiler.CompileException:
+ // File 'generated.java', Line 105, Column 25: failed to compile:
+ // org.codehaus.commons.compiler.CompileException: File 'generated.java',
+ // Line 105, Column 25: No applicable constructor/method found for zero actual parameters; candidates are:
+ // "public java.lang.String com.amazon.deequ.VerificationResult$SimpleCheckResultOutput.constraintStatus()"
+ // private[this]
+ case class SimpleCheckResultOutput(checkDescription: String, checkLevel: String,
checkStatus: String, constraint: String, constraintStatus: String, constraintMessage: String)
}
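Background on the change above: Spark derives an `Encoder` for the case class and generates Java code (via Janino) that calls its accessors; with a `private[this]` class those accessors are not visible to the generated class on Scala 2.13, producing the `CompileException` quoted in the comment. A minimal sketch of the working (public) shape, assuming a local SparkSession:

```scala
import org.apache.spark.sql.SparkSession

object EncoderVisibilitySketch {
  // Public case class: Spark's generated code can call the accessors.
  // Declaring it private[this] inside the enclosing object reproduced the
  // "No applicable constructor/method found" codegen failure on 2.13.
  case class SimpleOutput(checkDescription: String, checkStatus: String)

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("sketch").getOrCreate()
    import spark.implicits._
    // Encoder derivation and code generation happen on this call.
    Seq(SimpleOutput("completeness check", "Success")).toDS().show()
    spark.stop()
  }
}
```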
diff --git a/src/main/scala/com/amazon/deequ/analyzers/Analyzer.scala b/src/main/scala/com/amazon/deequ/analyzers/Analyzer.scala
index b3e44c4c7..cb3e9b93a 100644
--- a/src/main/scala/com/amazon/deequ/analyzers/Analyzer.scala
+++ b/src/main/scala/com/amazon/deequ/analyzers/Analyzer.scala
@@ -453,7 +453,7 @@ private[deequ] object Analyzers {
if (nullInResult) {
None
} else {
- Option(func(Unit))
+ Option(func(()))
}
}
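The small-looking change above is meaningful: `Unit` by itself names the companion object (of type `Unit.type`), not the unit value. Scala 2.12 silently adapted `func(Unit)` to `func(())` with a deprecation warning; 2.13 rejects it. A self-contained illustration:

```scala
object UnitValueSketch extends App {
  val func: Unit => String = _ => "ran"

  // `()` is the single value of type Unit: correct on both 2.12 and 2.13.
  println(func(()))

  // `func(Unit)` passed the companion object instead; 2.12 only accepted it
  // through a deprecated argument adaptation, and 2.13 fails to compile it.
}
```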
diff --git a/src/main/scala/com/amazon/deequ/analyzers/QuantileNonSample.scala b/src/main/scala/com/amazon/deequ/analyzers/QuantileNonSample.scala
index a39b2d672..fd8942c79 100644
--- a/src/main/scala/com/amazon/deequ/analyzers/QuantileNonSample.scala
+++ b/src/main/scala/com/amazon/deequ/analyzers/QuantileNonSample.scala
@@ -52,7 +52,7 @@ class QuantileNonSample[T](
this.shrinkingFactor = shrinkingFactor
compactors = ArrayBuffer.fill(data.length)(new NonSampleCompactor[T])
for (i <- data.indices) {
- compactors(i).buffer = data(i).to[ArrayBuffer]
+ compactors(i).buffer = ArrayBuffer.from(data(i)) // was data(i).to[ArrayBuffer]; .to[C] was removed in 2.13
}
curNumOfCompactors = data.length
compactorActualSize = getCompactorItemsCount
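`to[ArrayBuffer]` relied on the 2.12 `CanBuildFrom` machinery, which the 2.13 collections redesign removed. Either replacement below works on 2.13 (sketch):

```scala
import scala.collection.mutable.ArrayBuffer

object ToArrayBufferSketch extends App {
  val xs = List(1, 2, 3)

  val a: ArrayBuffer[Int] = ArrayBuffer.from(xs) // factory-method spelling
  val b: ArrayBuffer[Int] = xs.to(ArrayBuffer)   // value-argument spelling

  println(a == b) // true
}
```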
diff --git a/src/main/scala/com/amazon/deequ/analyzers/StateProvider.scala b/src/main/scala/com/amazon/deequ/analyzers/StateProvider.scala
index 65edb9424..dd696c56a 100644
--- a/src/main/scala/com/amazon/deequ/analyzers/StateProvider.scala
+++ b/src/main/scala/com/amazon/deequ/analyzers/StateProvider.scala
@@ -24,7 +24,7 @@ import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.expressions.aggregate.{ApproximatePercentile, DeequHyperLogLogPlusPlusUtils}
import org.apache.spark.sql.SaveMode
-import scala.collection.JavaConversions._
+import scala.jdk.CollectionConverters._
import scala.util.hashing.MurmurHash3
private object StateInformation {
@@ -58,7 +58,8 @@ case class InMemoryStateProvider() extends StateLoader with StatePersister {
override def toString: String = {
val buffer = new StringBuilder()
- statesByAnalyzer.foreach { case (analyzer, state) =>
+
+ statesByAnalyzer.asScala.foreach { case (analyzer, state) =>
buffer.append(analyzer.toString)
buffer.append(" => ")
buffer.append(state.toString)
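`scala.collection.JavaConversions` (implicit, wrapper-inserting conversions) is gone in 2.13; `scala.jdk.CollectionConverters` provides the explicit `.asScala`/`.asJava` decorators instead, which is why the `foreach` now needs the `.asScala` call. A minimal sketch with the same `ConcurrentHashMap` shape:

```scala
import java.util.concurrent.ConcurrentHashMap
import scala.jdk.CollectionConverters._

object ConvertersSketch extends App {
  val statesByAnalyzer = new ConcurrentHashMap[String, String]()
  statesByAnalyzer.put("Size(*)", "NumMatches(4)")

  // .asScala wraps the Java map in a mutable.Map view (no copy), after which
  // the usual Scala collection operations apply.
  statesByAnalyzer.asScala.foreach { case (analyzer, state) =>
    println(s"$analyzer => $state")
  }
}
```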
diff --git a/src/main/scala/com/amazon/deequ/analyzers/runners/AnalyzerContext.scala b/src/main/scala/com/amazon/deequ/analyzers/runners/AnalyzerContext.scala
index e66bf5f0d..a280d9f38 100644
--- a/src/main/scala/com/amazon/deequ/analyzers/runners/AnalyzerContext.scala
+++ b/src/main/scala/com/amazon/deequ/analyzers/runners/AnalyzerContext.scala
@@ -124,13 +124,19 @@ object AnalyzerContext {
}
}
- private[this] case class SimpleMetricOutput(
+ // private[this] removed: Spark's encoder codegen fails on Scala 2.13 with
+ // "No applicable constructor/method found" (see VerificationResult.scala)
+ // private[this]
+ case class SimpleMetricOutput(
entity: String,
instance: String,
name: String,
value: Double)
- private[this] object SimpleMetricOutput {
+ // private[this] removed for the same encoder codegen failure as above
+ // private[this]
+ object SimpleMetricOutput {
def apply(doubleMetric: DoubleMetric): SimpleMetricOutput = {
SimpleMetricOutput(
diff --git a/src/main/scala/com/amazon/deequ/anomalydetection/OnlineNormalStrategy.scala b/src/main/scala/com/amazon/deequ/anomalydetection/OnlineNormalStrategy.scala
index 8bf8b634c..0e884fa51 100644
--- a/src/main/scala/com/amazon/deequ/anomalydetection/OnlineNormalStrategy.scala
+++ b/src/main/scala/com/amazon/deequ/anomalydetection/OnlineNormalStrategy.scala
@@ -116,12 +116,12 @@ case class OnlineNormalStrategy(
ret += OnlineCalculationResult(currentMean, stdDev, isAnomaly = true)
}
}
- ret
+ ret.toSeq
}
/**
* Search for anomalies in a series of data points.
*
* @param dataSeries The data contained in a Vector of Doubles
* @param searchInterval The indices between which anomalies should be detected. [a, b).
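The `.toSeq` additions in this file (and in HoltWinters below) all stem from one 2.13 change: the default `Seq` is now `scala.collection.immutable.Seq`, so a `mutable.ArrayBuffer` no longer conforms to a `Seq` result type and must be copied. Sketch:

```scala
import scala.collection.mutable.ArrayBuffer

object SeqAliasSketch extends App {
  def results(): Seq[Double] = {
    val ret = ArrayBuffer(1.0, 2.0, 3.0)
    // 2.12: Seq = collection.Seq, so `ret` conformed directly.
    // 2.13: Seq = immutable.Seq, so an explicit immutable copy is needed.
    ret.toSeq
  }
  println(results())
}
```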
diff --git a/src/main/scala/com/amazon/deequ/anomalydetection/seasonal/HoltWinters.scala b/src/main/scala/com/amazon/deequ/anomalydetection/seasonal/HoltWinters.scala
index 0ee0ac25f..dc19ab275 100644
--- a/src/main/scala/com/amazon/deequ/anomalydetection/seasonal/HoltWinters.scala
+++ b/src/main/scala/com/amazon/deequ/anomalydetection/seasonal/HoltWinters.scala
@@ -132,7 +132,7 @@ class HoltWinters(
}
val forecasted = Y.drop(series.size)
- ModelResults(forecasted, level, trend, seasonality, residuals)
+ ModelResults(forecasted.toSeq, level.toSeq, trend.toSeq, seasonality.toSeq, residuals.toSeq)
}
private def modelSelectionFor(
diff --git a/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala b/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala
index 57c8c3019..8c834db79 100644
--- a/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala
+++ b/src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala
@@ -347,7 +347,7 @@ object ColumnProfiler {
Histogram(histogram.column).equals(histogram)
case _ => false
}
- analyzerContextExistingValues = AnalyzerContext(relevantEntries)
+ analyzerContextExistingValues = AnalyzerContext(relevantEntries.toMap)
}
}
}
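The added `.toMap` converts what is, on 2.13, no longer guaranteed to be a strict immutable `Map`: key-based filtering via `filterKeys` is deprecated there in favor of `.view.filterKeys(...)`, which returns a lazy `MapView` that must be materialized before constructing the `AnalyzerContext`. Sketch:

```scala
object MapViewSketch extends App {
  val metrics = Map("Size" -> 4.0, "Histogram.bins" -> 2.0, "Histogram.abs.a" -> 3.0)

  // Lazy view on 2.13, not a Map:
  val relevant = metrics.view.filterKeys(_.startsWith("Histogram"))

  // Materialize for APIs that require a strict Map:
  val strict: Map[String, Double] = relevant.toMap
  println(strict)
}
```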
diff --git a/src/main/scala/com/amazon/deequ/repository/AnalysisResultSerde.scala b/src/main/scala/com/amazon/deequ/repository/AnalysisResultSerde.scala
index 4a2ca1058..35c087084 100644
--- a/src/main/scala/com/amazon/deequ/repository/AnalysisResultSerde.scala
+++ b/src/main/scala/com/amazon/deequ/repository/AnalysisResultSerde.scala
@@ -27,14 +27,13 @@ import com.google.gson._
import com.google.gson.reflect.TypeToken
import scala.collection._
-import scala.collection.JavaConverters._
import java.util.{ArrayList => JArrayList, HashMap => JHashMap, List => JList, Map => JMap}
import JsonSerializationConstants._
import com.amazon.deequ.analyzers.Histogram.{AggregateFunction, Count => HistogramCount, Sum => HistogramSum}
import org.apache.spark.sql.Column
import org.apache.spark.sql.functions.expr
-import scala.collection.JavaConversions._
+import scala.jdk.CollectionConverters._
private[repository] object JsonSerializationConstants {
@@ -70,6 +69,7 @@ private[deequ] object SimpleResultSerde {
.asInstanceOf[JArrayList[JMap[String, String]]]
.asScala
.map(map => immutable.Map(map.asScala.toList: _*))
+ .toSeq
}
}
@@ -422,22 +422,22 @@ private[deequ] object AnalyzerDeserializer
getOptionalWhereParam(json))
case "CountDistinct" =>
- CountDistinct(getColumnsAsSeq(context, json))
+ CountDistinct(getColumnsAsSeq(context, json).toSeq)
case "Distinctness" =>
- Distinctness(getColumnsAsSeq(context, json))
+ Distinctness(getColumnsAsSeq(context, json).toSeq)
case "Entropy" =>
Entropy(json.get(COLUMN_FIELD).getAsString)
case "MutualInformation" =>
- MutualInformation(getColumnsAsSeq(context, json))
+ MutualInformation(getColumnsAsSeq(context, json).toSeq)
case "UniqueValueRatio" =>
- UniqueValueRatio(getColumnsAsSeq(context, json))
+ UniqueValueRatio(getColumnsAsSeq(context, json).toSeq)
case "Uniqueness" =>
- Uniqueness(getColumnsAsSeq(context, json))
+ Uniqueness(getColumnsAsSeq(context, json).toSeq)
case "Histogram" =>
Histogram(
@@ -598,7 +598,7 @@ private[deequ] object MetricDeserializer extends JsonDeserializer[Metric[_]] {
val instance = jsonObject.get("instance").getAsString
if (jsonObject.has("value")) {
val entries = jsonObject.get("value").getAsJsonObject
- val values = entries.entrySet().map { entry =>
+ val values = entries.entrySet().asScala.map { entry =>
entry.getKey -> entry.getValue.getAsDouble
}
.toMap
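Two interop details in this file: Gson's `entrySet()` returns a `java.util.Set`, which now needs an explicit `.asScala` before `map`, and `.asScala` on a `JArrayList` yields a `mutable.Buffer`, hence the `.toSeq` to satisfy the immutable `Seq` return type. A sketch of the `entrySet` pattern, assuming Gson 2.8.6+ for `JsonParser.parseString`:

```scala
import com.google.gson.JsonParser
import scala.jdk.CollectionConverters._

object GsonEntrySetSketch extends App {
  val obj = JsonParser.parseString("""{"a": 3.0, "b": 1.0}""").getAsJsonObject

  // entrySet() is a java.util.Set; .asScala makes it mappable from Scala.
  val values: Map[String, Double] = obj.entrySet().asScala.map { entry =>
    entry.getKey -> entry.getValue.getAsDouble
  }.toMap

  println(values) // Map(a -> 3.0, b -> 1.0)
}
```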
diff --git a/src/main/scala/com/amazon/deequ/repository/MetricsRepositoryMultipleResultsLoader.scala b/src/main/scala/com/amazon/deequ/repository/MetricsRepositoryMultipleResultsLoader.scala
index 3b9abc5b3..8351fe0b1 100644
--- a/src/main/scala/com/amazon/deequ/repository/MetricsRepositoryMultipleResultsLoader.scala
+++ b/src/main/scala/com/amazon/deequ/repository/MetricsRepositoryMultipleResultsLoader.scala
@@ -99,8 +99,8 @@ private[repository] object MetricsRepositoryMultipleResultsLoader {
def jsonUnion(jsonOne: String, jsonTwo: String): String = {
- val objectOne: Seq[Map[String, Any]] = SimpleResultSerde.deserialize(jsonOne)
- val objectTwo: Seq[Map[String, Any]] = SimpleResultSerde.deserialize(jsonTwo)
+ val objectOne: Seq[Map[String, Any]] = SimpleResultSerde.deserialize(jsonOne).toSeq
+ val objectTwo: Seq[Map[String, Any]] = SimpleResultSerde.deserialize(jsonTwo).toSeq
val columnsTotal = objectOne.headOption.getOrElse(Map.empty).keySet ++
objectTwo.headOption.getOrElse(Map.empty).keySet
diff --git a/src/main/scala/com/amazon/deequ/repository/fs/FileSystemMetricsRepository.scala b/src/main/scala/com/amazon/deequ/repository/fs/FileSystemMetricsRepository.scala
index 7b1101017..6668c90d1 100644
--- a/src/main/scala/com/amazon/deequ/repository/fs/FileSystemMetricsRepository.scala
+++ b/src/main/scala/com/amazon/deequ/repository/fs/FileSystemMetricsRepository.scala
@@ -153,10 +153,11 @@ class FileSystemMetricsRepositoryMultipleResultsLoader(
.metricMap
.filterKeys(analyzer => forAnalyzers.isEmpty || forAnalyzers.get.contains(analyzer))
- val requestedAnalyzerContext = AnalyzerContext(requestedMetrics)
+ val requestedAnalyzerContext = AnalyzerContext(requestedMetrics.toMap)
AnalysisResult(analysisResult.resultKey, requestedAnalyzerContext)
}
+ .toSeq
}
}
diff --git a/src/main/scala/com/amazon/deequ/repository/memory/InMemoryMetricsRepository.scala b/src/main/scala/com/amazon/deequ/repository/memory/InMemoryMetricsRepository.scala
index 61ad1e1ee..b5888ac08 100644
--- a/src/main/scala/com/amazon/deequ/repository/memory/InMemoryMetricsRepository.scala
+++ b/src/main/scala/com/amazon/deequ/repository/memory/InMemoryMetricsRepository.scala
@@ -21,7 +21,7 @@ import com.amazon.deequ.metrics.Metric
import com.amazon.deequ.repository._
import com.amazon.deequ.analyzers.runners.AnalyzerContext
-import scala.collection.JavaConversions._
+import scala.jdk.CollectionConverters._
import java.util.concurrent.ConcurrentHashMap
/** A simple Repository implementation backed by a concurrent hash map */
@@ -118,6 +118,7 @@ class LimitedInMemoryMetricsRepositoryMultipleResultsLoader(
/** Get the AnalysisResult */
def get(): Seq[AnalysisResult] = {
resultsRepository
+ .asScala
.filterKeys(key => after.isEmpty || after.get <= key.dataSetDate)
.filterKeys(key => before.isEmpty || key.dataSetDate <= before.get)
.filterKeys(key => tagValues.isEmpty || tagValues.get.toSet.subsetOf(key.tags.toSet))
@@ -129,7 +130,7 @@ class LimitedInMemoryMetricsRepositoryMultipleResultsLoader(
.metricMap
.filterKeys(analyzer => forAnalyzers.isEmpty || forAnalyzers.get.contains(analyzer))
- AnalysisResult(analysisResult.resultKey, AnalyzerContext(requestedMetrics))
+ AnalysisResult(analysisResult.resultKey, AnalyzerContext(requestedMetrics.toMap))
}
.toSeq
}
diff --git a/src/main/scala/com/amazon/deequ/suggestions/ConstraintSuggestionRunner.scala b/src/main/scala/com/amazon/deequ/suggestions/ConstraintSuggestionRunner.scala
index de915956d..c230872ec 100644
--- a/src/main/scala/com/amazon/deequ/suggestions/ConstraintSuggestionRunner.scala
+++ b/src/main/scala/com/amazon/deequ/suggestions/ConstraintSuggestionRunner.scala
@@ -142,7 +142,7 @@ class ConstraintSuggestionRunner {
groupedSuggestionsWithColumnNames.map { case (_, suggestion) => suggestion } }
ConstraintSuggestionResult(columnProfiles.profiles, columnProfiles.numRecords,
- columnsWithSuggestions, verificationResult)
+ columnsWithSuggestions.toMap, verificationResult)
}
private[this] def splitTrainTestSets(
diff --git a/src/test/scala/com/amazon/deequ/KLL/KLLDistanceTest.scala b/src/test/scala/com/amazon/deequ/KLL/KLLDistanceTest.scala
index 20017fa71..d60bd0b76 100644
--- a/src/test/scala/com/amazon/deequ/KLL/KLLDistanceTest.scala
+++ b/src/test/scala/com/amazon/deequ/KLL/KLLDistanceTest.scala
@@ -88,7 +88,7 @@ class KLLDistanceTest extends WordSpec with SparkContextSpec
val sample2 = scala.collection.mutable.Map(
"a" -> 22L, "b" -> 20L, "c" -> 25L, "d" -> 12L, "e" -> 13L, "f" -> 15L)
val distance = Distance.categoricalDistance(sample1, sample2, method = LInfinityMethod(alpha = Some(0.003)))
- assert(distance == 0.2726338046550349)
+ assert(distance == 0.27263380465503484)
}
"Categorial distance should compute correct linf_robust with different alpha value .1" in {
@@ -137,7 +137,7 @@ class KLLDistanceTest extends WordSpec with SparkContextSpec
"a" -> 100L, "b" -> 22L, "c" -> 25L, "d" -> 5L, "e" -> 13L, "f" -> 2L)
val distance = Distance.categoricalDistance(
sample1, sample2, correctForLowNumberOfSamples = true, method = ChisquareMethod())
- assert(distance == 8.789790456457125)
+ assert(distance == 8.789790456457123)
}
"Categorical distance should compute correct chisquare distance (low samples) with regrouping (yates)" in {
@@ -170,7 +170,7 @@ class KLLDistanceTest extends WordSpec with SparkContextSpec
"a" -> 100L, "b" -> 4L, "c" -> 3L, "d" -> 27L, "e" -> 20L, "f" -> 20L, "g" -> 20L, "h" -> 20L)
val distance = Distance.categoricalDistance(
sample, baseline, correctForLowNumberOfSamples = true, method = ChisquareMethod())
- assert(distance == 6.827423492761593)
+ assert(distance == 6.827423492761592)
}
"Categorical distance should compute correct chisquare distance (low samples) " +
diff --git a/src/test/scala/com/amazon/deequ/VerificationResultTest.scala b/src/test/scala/com/amazon/deequ/VerificationResultTest.scala
index 93aa73201..36079fe11 100644
--- a/src/test/scala/com/amazon/deequ/VerificationResultTest.scala
+++ b/src/test/scala/com/amazon/deequ/VerificationResultTest.scala
@@ -79,12 +79,13 @@ class VerificationResultTest extends WordSpec with Matchers with SparkContextSpe
val successMetricsResultsJson = VerificationResult.successMetricsAsJson(results)
val expectedJson =
- """[{"entity":"Column","instance":"item","name":"Distinctness","value":1.0},
+ """[
+ |{"entity":"Multicolumn","instance":"att1,att2","name":"Uniqueness","value":0.25},
+ {"entity":"Column","instance":"item","name":"Distinctness","value":1.0},
|{"entity": "Column", "instance":"att2","name":"Completeness","value":1.0},
- |{"entity":"Column","instance":"att1","name":"Completeness","value":1.0},
- |{"entity":"Multicolumn","instance":"att1,att2",
- |"name":"Uniqueness","value":0.25},
- |{"entity":"Dataset","instance":"*","name":"Size","value":4.0}]"""
+ |{"entity":"Dataset","instance":"*","name":"Size","value":4.0},
+ |{"entity":"Column","instance":"att1","name":"Completeness","value":1.0}
+ |]"""
.stripMargin.replaceAll("\n", "")
assertSameResultsJson(successMetricsResultsJson, expectedJson)
@@ -102,9 +103,10 @@ class VerificationResultTest extends WordSpec with Matchers with SparkContextSpe
VerificationResult.successMetricsAsJson(results, metricsForAnalyzers)
val expectedJson =
- """[{"entity":"Column","instance":"att1","name":"Completeness","value":1.0},
- |{"entity":"Multicolumn","instance":"att1,att2",
- |"name":"Uniqueness","value":0.25}]"""
+ """[
+ |{"entity":"Multicolumn","instance":"att1,att2","name":"Uniqueness","value":0.25},
+ |{"entity":"Column","instance":"att1","name":"Completeness","value":1.0}
+ |]"""
.stripMargin.replaceAll("\n", "")
assertSameResultsJson(successMetricsResultsJson, expectedJson)
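The reshuffled expected JSON here (and in the repository tests below) is a symptom of 2.13's new CHAMP-based immutable `Set`/`Map` implementations, which iterate in a different (still unspecified) order than 2.12's. `assertSameResultsJson` compares order-insensitively, so only the literals needed updating. Sketch of the underlying behavior:

```scala
object IterationOrderSketch extends App {
  val names = Set("Size", "Completeness", "Uniqueness", "Distinctness")

  // The printed order is unspecified and differs between 2.12 and 2.13.
  println(names.mkString(", "))

  // Equality is order-insensitive and stable across versions.
  println(names == Set("Distinctness", "Uniqueness", "Completeness", "Size")) // true
}
```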
diff --git a/src/test/scala/com/amazon/deequ/analyzers/AnalyzerTests.scala b/src/test/scala/com/amazon/deequ/analyzers/AnalyzerTests.scala
index 03787b886..b7b2dcb7c 100644
--- a/src/test/scala/com/amazon/deequ/analyzers/AnalyzerTests.scala
+++ b/src/test/scala/com/amazon/deequ/analyzers/AnalyzerTests.scala
@@ -17,6 +17,7 @@
package com.amazon.deequ
package analyzers
+import com.amazon.deequ
import com.amazon.deequ.analyzers.runners.NoSuchColumnException
import com.amazon.deequ.metrics.Distribution
import com.amazon.deequ.metrics.DistributionValue
@@ -349,6 +350,7 @@ class AnalyzerTests extends AnyWordSpec with Matchers with SparkContextSpec with
val dataTypes = DataTypeInstances.values.map { _.toString }
val zeros = dataTypes
+ .unsorted // 2.13: mapping a sorted set to tuples would need an Ordering; drop sortedness first
.diff { nonZeroValuesWithStringKeys.map { case (distKey, _) => distKey }.toSet }
.map(dataType => dataType -> DistributionValue(0, 0.0))
.toSeq
@@ -361,33 +363,33 @@ class AnalyzerTests extends AnyWordSpec with Matchers with SparkContextSpec with
"fail for non-atomic columns" in withSparkSession { sparkSession =>
val df = getDfWithNestedColumn(sparkSession)
- assert(DataType("source").calculate(df).value.isFailure)
+ assert(deequ.analyzers.DataType("source").calculate(df).value.isFailure)
}
"fall back to String in case no known data type matched" in withSparkSession { sparkSession =>
val df = getDfFull(sparkSession)
- DataType("att1").calculate(df).value shouldBe
+ deequ.analyzers.DataType("att1").calculate(df).value shouldBe
Success(distributionFrom(DataTypeInstances.String -> DistributionValue(4, 1.0)))
}
"detect integral type correctly" in withSparkSession { sparkSession =>
val df = getDfWithNumericValues(sparkSession)
val expectedResult = distributionFrom(DataTypeInstances.Integral -> DistributionValue(6, 1.0))
- DataType("att1").calculate(df).value shouldBe Success(expectedResult)
+ deequ.analyzers.DataType("att1").calculate(df).value shouldBe Success(expectedResult)
}
"detect integral type correctly for negative numbers" in withSparkSession { sparkSession =>
val df = getDfWithNegativeNumbers(sparkSession)
val expectedResult = distributionFrom(DataTypeInstances.Integral -> DistributionValue(4, 1.0))
- DataType("att1").calculate(df).value shouldBe Success(expectedResult)
+ deequ.analyzers.DataType("att1").calculate(df).value shouldBe Success(expectedResult)
}
"detect fractional type correctly for negative numbers" in withSparkSession { sparkSession =>
val df = getDfWithNegativeNumbers(sparkSession)
val expectedResult =
distributionFrom(DataTypeInstances.Fractional -> DistributionValue(4, 1.0))
- DataType("att2").calculate(df).value shouldBe Success(expectedResult)
+ deequ.analyzers.DataType("att2").calculate(df).value shouldBe Success(expectedResult)
}
@@ -396,14 +398,14 @@ class AnalyzerTests extends AnyWordSpec with Matchers with SparkContextSpec with
.withColumn("att1_float", col("att1").cast(FloatType))
val expectedResult =
distributionFrom(DataTypeInstances.Fractional -> DistributionValue(6, 1.0))
- DataType("att1_float").calculate(df).value shouldBe Success(expectedResult)
+ deequ.analyzers.DataType("att1_float").calculate(df).value shouldBe Success(expectedResult)
}
"detect integral type in string column" in withSparkSession { sparkSession =>
val df = getDfWithNumericValues(sparkSession)
.withColumn("att1_str", col("att1").cast(StringType))
val expectedResult = distributionFrom(DataTypeInstances.Integral -> DistributionValue(6, 1.0))
- DataType("att1_str").calculate(df).value shouldBe Success(expectedResult)
+ deequ.analyzers.DataType("att1_str").calculate(df).value shouldBe Success(expectedResult)
}
"detect fractional type in string column" in withSparkSession { sparkSession =>
@@ -412,19 +414,19 @@ class AnalyzerTests extends AnyWordSpec with Matchers with SparkContextSpec with
val expectedResult =
distributionFrom(DataTypeInstances.Fractional -> DistributionValue(6, 1.0))
- DataType("att1_str").calculate(df).value shouldBe Success(expectedResult)
+ deequ.analyzers.DataType("att1_str").calculate(df).value shouldBe Success(expectedResult)
}
"fall back to string in case the string column didn't match " +
" any known other data type" in withSparkSession { sparkSession =>
val df = getDfFull(sparkSession)
val expectedResult = distributionFrom(DataTypeInstances.String -> DistributionValue(4, 1.0))
- DataType("att1").calculate(df).value shouldBe Success(expectedResult)
+ deequ.analyzers.DataType("att1").calculate(df).value shouldBe Success(expectedResult)
}
"detect fractional for mixed fractional and integral" in withSparkSession { sparkSession =>
val df = getDfFractionalIntegralTypes(sparkSession)
- DataType("att1").calculate(df).value shouldBe Success(
+ deequ.analyzers.DataType("att1").calculate(df).value shouldBe Success(
distributionFrom(
DataTypeInstances.Fractional -> DistributionValue(1, 0.5),
DataTypeInstances.Integral -> DistributionValue(1, 0.5)
@@ -434,7 +436,7 @@ class AnalyzerTests extends AnyWordSpec with Matchers with SparkContextSpec with
"fall back to string for mixed fractional and string" in withSparkSession { sparkSession =>
val df = getDfFractionalStringTypes(sparkSession)
- DataType("att1").calculate(df).value shouldBe Success(
+ deequ.analyzers.DataType("att1").calculate(df).value shouldBe Success(
distributionFrom(
DataTypeInstances.Fractional -> DistributionValue(1, 0.5),
DataTypeInstances.String -> DistributionValue(1, 0.5)
@@ -444,7 +446,7 @@ class AnalyzerTests extends AnyWordSpec with Matchers with SparkContextSpec with
"fall back to string for mixed integral and string" in withSparkSession { sparkSession =>
val df = getDfIntegralStringTypes(sparkSession)
- DataType("att1").calculate(df).value shouldBe Success(
+ deequ.analyzers.DataType("att1").calculate(df).value shouldBe Success(
distributionFrom(
DataTypeInstances.Integral -> DistributionValue(1, 0.5),
DataTypeInstances.String -> DistributionValue(1, 0.5)
@@ -454,7 +456,7 @@ class AnalyzerTests extends AnyWordSpec with Matchers with SparkContextSpec with
"integral for numeric and null" in withSparkSession { sparkSession =>
val df = getDfWithUniqueColumns(sparkSession)
- DataType("uniqueWithNulls").calculate(df).value shouldBe Success(
+ deequ.analyzers.DataType("uniqueWithNulls").calculate(df).value shouldBe Success(
distributionFrom(
DataTypeInstances.Unknown -> DistributionValue(1, 1.0/6.0),
DataTypeInstances.Integral -> DistributionValue(5, 5.0/6.0)
@@ -471,7 +473,7 @@ class AnalyzerTests extends AnyWordSpec with Matchers with SparkContextSpec with
val expectedResult = distributionFrom(DataTypeInstances.Boolean -> DistributionValue(2, 1.0))
- DataType("att1").calculate(df).value shouldBe Success(expectedResult)
+ deequ.analyzers.DataType("att1").calculate(df).value shouldBe Success(expectedResult)
}
"fall back to string for boolean and null" in withSparkSession { sparkSession =>
@@ -483,7 +485,7 @@ class AnalyzerTests extends AnyWordSpec with Matchers with SparkContextSpec with
("4", "2.0")
).toDF("item", "att1")
- DataType("att1").calculate(df).value shouldBe Success(
+ deequ.analyzers.DataType("att1").calculate(df).value shouldBe Success(
distributionFrom(
DataTypeInstances.Fractional -> DistributionValue(1, 0.25),
DataTypeInstances.Unknown -> DistributionValue(1, 0.25),
diff --git a/src/test/scala/com/amazon/deequ/analyzers/runners/AnalysisRunnerTests.scala b/src/test/scala/com/amazon/deequ/analyzers/runners/AnalysisRunnerTests.scala
index 4ffc9eeb9..3e02c1071 100644
--- a/src/test/scala/com/amazon/deequ/analyzers/runners/AnalysisRunnerTests.scala
+++ b/src/test/scala/com/amazon/deequ/analyzers/runners/AnalysisRunnerTests.scala
@@ -184,12 +184,13 @@ class AnalysisRunnerTests extends AnyWordSpec
Uniqueness("att1", Some("att3 = 0")) :: Nil
val (separateResults, numSeparateJobs) = sparkMonitor.withMonitoringSession { stat =>
- val results = analyzers.map { _.calculate(df) }.toSet
+ val results: Seq[com.amazon.deequ.metrics.DoubleMetric] =
+ analyzers.map { _.calculate(df) }.toSeq.sortBy(_.name)
(results, stat.jobCount)
}
val (runnerResults, numCombinedJobs) = sparkMonitor.withMonitoringSession { stat =>
- val results = AnalysisRunner.onData(df).addAnalyzers(analyzers).run().allMetrics.toSet
+ val results = AnalysisRunner.onData(df).addAnalyzers(analyzers).run().allMetrics.toSeq.sortBy(_.name)
(results, stat.jobCount)
}
diff --git a/src/test/scala/com/amazon/deequ/analyzers/runners/AnalyzerContextTest.scala b/src/test/scala/com/amazon/deequ/analyzers/runners/AnalyzerContextTest.scala
index 254fac9b4..71fe1f70d 100644
--- a/src/test/scala/com/amazon/deequ/analyzers/runners/AnalyzerContextTest.scala
+++ b/src/test/scala/com/amazon/deequ/analyzers/runners/AnalyzerContextTest.scala
@@ -81,20 +81,20 @@ class AnalyzerContextTest extends AnyWordSpec
val successMetricsResultsJson = AnalyzerContext.successMetricsAsJson(results)
- val expectedJson =
- """[
- |{"entity":"Column","instance":"item","name":"Distinctness","value":1.0},
- |{"entity":"Column","instance":"att1","name":"Completeness","value":1.0},
- |{"entity":"Multicolumn","instance":"att1,att2","name":"Uniqueness","value":0.25},
- |{"entity":"Dataset","instance":"*","name":"Size (where: att2 == 'd')","value":1.0},
- |{"entity":"Dataset","instance":"*","name":"Size","value":4.0},
- |{"entity":"Column","instance":"att1","name":"Histogram.bins","value":2.0},
- |{"entity":"Column","instance":"att1","name":"Histogram.abs.a","value":3.0},
- |{"entity":"Column","instance":"att1","name":"Histogram.ratio.a","value":0.75},
- |{"entity":"Column","instance":"att1","name":"Histogram.abs.b","value":1.0},
- |{"entity":"Column","instance":"att1","name":"Histogram.ratio.b","value":0.25}
- |]"""
- .stripMargin.replaceAll("\n", "")
+ val expectedJson =
+ """[
+ |{"entity":"Multicolumn","instance":"att1,att2","name":"Uniqueness","value":0.25},
+ |{"entity":"Dataset","instance":"*","name":"Size (where: att2 == 'd')","value":1.0},
+ |{"entity":"Column","instance":"item","name":"Distinctness","value":1.0},
+ |{"entity":"Column","instance":"att1","name":"Histogram.bins","value":2.0},
+ |{"entity":"Column","instance":"att1","name":"Histogram.abs.a","value":3.0},
+ |{"entity":"Column","instance":"att1","name":"Histogram.ratio.a","value":0.75},
+ |{"entity":"Column","instance":"att1","name":"Histogram.abs.b","value":1.0},
+ |{"entity":"Column","instance":"att1","name":"Histogram.ratio.b","value":0.25},
+ |{"entity":"Dataset","instance":"*","name":"Size","value":4.0},
+ |{"entity":"Column","instance":"att1","name":"Completeness","value":1.0}
+ |]"""
+ .stripMargin.replaceAll("\n", "")
assertSameJson(successMetricsResultsJson, expectedJson)
}
@@ -110,10 +110,11 @@ class AnalyzerContextTest extends AnyWordSpec
val successMetricsResultsJson =
AnalyzerContext.successMetricsAsJson(results, metricsForAnalyzers)
- val expectedJson =
- """[{"entity":"Column","instance":"att1","name":"Completeness","value":1.0},
- |{"entity":"Multicolumn","instance":"att1,att2",
- |"name":"Uniqueness","value":0.25}]"""
+ val expectedJson =
+ """[
+ |{"entity":"Multicolumn","instance":"att1,att2","name":"Uniqueness","value":0.25},
+ |{"entity":"Column","instance":"att1","name":"Completeness","value":1.0}
+ |]"""
.stripMargin.replaceAll("\n", "")
assertSameJson(successMetricsResultsJson, expectedJson)
diff --git a/src/test/scala/com/amazon/deequ/repository/AnalysisResultSerdeTest.scala b/src/test/scala/com/amazon/deequ/repository/AnalysisResultSerdeTest.scala
index 7083a5c1d..250916aef 100644
--- a/src/test/scala/com/amazon/deequ/repository/AnalysisResultSerdeTest.scala
+++ b/src/test/scala/com/amazon/deequ/repository/AnalysisResultSerdeTest.scala
@@ -333,21 +333,22 @@ class SimpleResultSerdeTest extends WordSpec with Matchers with SparkContextSpec
|{"dataset_date":1507975810,"entity":"Column","region":"EU",
|"instance":"att2","name":"Completeness","value":1.0},
|{"dataset_date":1507975810,"entity":"Column","region":"EU",
- |"instance":"att1","name":"Completeness","value":1.0},
- |{"dataset_date":1507975810,"entity":"Multicolumn","region":"EU",
- |"instance":"att1,att2","name":"MutualInformation","value":0.5623351446188083},
- |{"dataset_date":1507975810,"entity":"Dataset","region":"EU",
- |"instance":"*","name":"Size","value":4.0},
+ |"instance":"att1","name":"MaxLength","value":1.0},
+ |{"dataset_date":1507975810,"entity":"Column","region":"EU",
+ |"instance":"att1","name":"MinLength","value":1.0},
|{"dataset_date":1507975810,"entity":"Column","region":"EU",
|"instance":"att1","name":"Uniqueness","value":0.25},
|{"dataset_date":1507975810,"entity":"Column","region":"EU",
|"instance":"att1","name":"Distinctness","value":0.5},
+ |{"dataset_date":1507975810,"entity":"Multicolumn","region":"EU",
+ |"instance":"att1,att2","name":"MutualInformation","value":0.5623351446188083},
|{"dataset_date":1507975810,"entity":"Column","region":"EU",
- |"instance":"att1","name":"MinLength","value":1.0},
- |{"dataset_date":1507975810,"entity":"Column","region":"EU",
- |"instance":"att1","name":"MaxLength","value":1.0},
+ |"instance":"att2","name":"Uniqueness","value":0.25},
+ |{"dataset_date":1507975810,"entity":"Dataset","region":"EU",
+ |"instance":"*","name":"Size","value":4.0},
|{"dataset_date":1507975810,"entity":"Column","region":"EU",
- |"instance":"att2","name":"Uniqueness","value":0.25}]"""
+ |"instance":"att1","name":"Completeness","value":1.0}
+ |]"""
.stripMargin.replaceAll("\n", "")
// ordering of map entries is not guaranteed, so comparing strings is not an option
diff --git a/src/test/scala/com/amazon/deequ/repository/MetricsRepositoryMultipleResultsLoaderTest.scala b/src/test/scala/com/amazon/deequ/repository/MetricsRepositoryMultipleResultsLoaderTest.scala
index 6e61b9385..90b520c9f 100644
--- a/src/test/scala/com/amazon/deequ/repository/MetricsRepositoryMultipleResultsLoaderTest.scala
+++ b/src/test/scala/com/amazon/deequ/repository/MetricsRepositoryMultipleResultsLoaderTest.scala
@@ -81,8 +81,17 @@ class MetricsRepositoryMultipleResultsLoaderTest extends AnyWordSpec with Matche
val analysisResultsAsJson = repository.load()
.getSuccessMetricsAsJson()
- val expected =
- s"""[{"entity":"Dataset","instance":"*","name":"Size","value":4.0,
+ val expected =
+ s"""[{"entity":"Dataset","instance":"*","name":"Size","value":4.0,
+ |"region":"NA", "dataset_date":$DATE_TWO},
+ |{"entity":"Column","instance":"att1","name":"Completeness","value":1.0,
+ |"region":"NA", "dataset_date":$DATE_TWO},
+ |{"entity":"Column","instance":"item","name":"Distinctness","value":1.0,
+ |"region":"NA", "dataset_date":$DATE_TWO},
+ |{"entity":"Multicolumn","instance":"att1,att2","name":"Uniqueness","value":0.25,
+ |"region":"NA", "dataset_date":$DATE_TWO},
+ |
+ |{"entity":"Dataset","instance":"*","name":"Size","value":4.0,
|"region":"EU", "dataset_date":$DATE_ONE},
|{"entity":"Column","instance":"att1","name":"Completeness","value":1.0,
|"region":"EU", "dataset_date":$DATE_ONE},
@@ -90,17 +99,8 @@ class MetricsRepositoryMultipleResultsLoaderTest extends AnyWordSpec with Matche
|"region":"EU", "dataset_date":$DATE_ONE},
|{"entity":"Multicolumn","instance":"att1,att2",
|"name":"Uniqueness","value":0.25,
- |"region":"EU", "dataset_date":$DATE_ONE},
- |
- |{"entity":"Dataset","instance":"*","name":"Size","value":4.0,
- |"region":"NA", "dataset_date":$DATE_TWO},
- |{"entity":"Column","instance":"att1","name":"Completeness","value":1.0,
- |"region":"NA", "dataset_date":$DATE_TWO},
- |{"entity":"Column","instance":"item","name":"Distinctness","value":1.0,
- |"region":"NA", "dataset_date":$DATE_TWO},
- |{"entity":"Multicolumn","instance":"att1,att2","name":"Uniqueness","value":0.25,
- |"region":"NA", "dataset_date":$DATE_TWO}]"""
- .stripMargin.replaceAll("\n", "")
+ |"region":"EU", "dataset_date":$DATE_ONE}]"""
+ .stripMargin.replaceAll("\n", "")
assertSameJson(analysisResultsAsJson, expected)
}