Test for sum_histograms

dvadym · dvadym · commit bdca9e1dba28 · 2024-09-03T15:35:30.000+02:00
diff --git a/analysis/contribution_bounders.py b/analysis/contribution_bounders.py
@@ -16,6 +16,7 @@
 import numpy as np
 from pipeline_dp import contribution_bounders
 from pipeline_dp import sampling_utils
+from typing import Iterable
 
 
 class AnalysisContributionBounder(contribution_bounders.ContributionBounder):
@@ -67,14 +68,27 @@ def rekey_per_privacy_id_per_partition_key_and_unnest(pid_pk_v_values):
             for partition_key, values in partition_values:
                 if sampler is not None and not sampler.keep(partition_key):
                     continue
-                if len(values) == 1:
-                    sum_values = values
-                # elif len(values[0]) == 1:  todo
-                #     # 1 value
-                #     sum_values = sum(values)
-                else:
+                # Sum the values if needed. Values can contain multiple
+                # columns; the format is the following:
+                # 1 column:
+                #   input: values = [v_0:float, ... ]
+                #   output: v_0 + ....
+                # k columns (k > 1):
+                #   input: values = [v_0=(v00, ... v0(k-1)), ...]
+                #   output: (v00+v10+..., ...)
+                if not values:
+                    # Empty public partitions
+                    sum_values = 0
+                elif len(values) == 1:
+                    # No need to sum, return 0th value
+                    sum_values = values[0]
+                elif isinstance(values[0], Iterable):
                     # multiple value columns, sum each column independently
-                    sum_values = np.array(values).sum(axis=0).tolist()
+                    sum_values = tuple(np.array(values).sum(axis=0).tolist())
+                else:
+                    # 1 column
+                    sum_values = sum(values)
+
                 yield (privacy_id, partition_key), (
                     len(values),
                     sum_values,
diff --git a/analysis/tests/contribution_bounders_test.py b/analysis/tests/contribution_bounders_test.py
@@ -40,7 +40,8 @@ def _run_contribution_bounding(self,
                                    input,
                                    max_partitions_contributed,
                                    max_contributions_per_partition,
-                                   partitions_sampling_prob: float = 1.0):
+                                   partitions_sampling_prob: float = 1.0,
+                                   aggregate_fn=count_aggregate_fn):
         params = CrossAndPerPartitionContributionParams(
             max_partitions_contributed, max_contributions_per_partition)
 
@@ -50,7 +51,7 @@ def _run_contribution_bounding(self,
             bounder.bound_contributions(input, params,
                                         pipeline_dp.LocalBackend(),
                                         _create_report_generator(),
-                                        count_aggregate_fn))
+                                        aggregate_fn))
 
     def test_contribution_bounding_empty_col(self):
         input = []
@@ -117,8 +118,27 @@ def test_contribution_bounding_cross_partition_bounding_and_sampling(self):
         # Check per- and cross-partition contribution limits are not enforced.
         self.assertEqual(set(expected_result), set(bound_result))
 
+    def test_contribution_bounding_cross_partition_bounding_and_2_column_values(
+            self):
+        input = [("pid1", 'pk1', (1, 2)), ("pid1", 'pk1', (3, 4)),
+                 ("pid1", 'pk2', (-1, 0)), ("pid2", 'pk1', (5, 5))]
+        max_partitions_contributed = 3
+        max_contributions_per_partition = 5
 
-class SamplingL0LinfContributionBounderTest(parameterized.TestCase):
+        bound_result = self._run_contribution_bounding(
+            input,
+            max_partitions_contributed,
+            max_contributions_per_partition,
+            aggregate_fn=lambda x: x)
+
+        expected_result = [(('pid1', 'pk2'), (1, (-1, 0), 2, 3)),
+                           (('pid1', 'pk1'), (2, (4, 6), 2, 3)),
+                           (('pid2', 'pk1'), (1, (5, 5), 1, 1))]
+        # Check values are summed per column and no contribution is dropped.
+        self.assertEqual(set(expected_result), set(bound_result))
+
+
+class NoOpContributionBounderTest(parameterized.TestCase):
 
     def test_contribution_bounding_doesnt_drop_contributions(self):
         # Arrange.
diff --git a/tests/dataset_histograms/sum_histogram_computation_test.py b/tests/dataset_histograms/sum_histogram_computation_test.py
@@ -163,6 +163,45 @@ def test_compute_linf_sum_contributions_histogram(self, testcase_name,
                              histogram.name)
             self.assertListEqual(expected, histogram.bins)
 
+    @parameterized.parameters(False, True)
+    def test_compute_linf_sum_contributions_histogram_2_columns(
+            self, pre_aggregated: bool):
+        # format: ((privacy_id, partition), value: tuple)
+        data = [((0, 0), (1, 10)), ((0, 1), (2, 20)), ((0, 1), (3, 30)),
+                ((1, 0), (5, 50))]
+        backend = pipeline_dp.LocalBackend()
+        expected = [
+            hist.Histogram(hist.HistogramType.LINF_SUM_CONTRIBUTIONS, [
+                hist.FrequencyBin(
+                    lower=1.0, upper=1.0004, count=1, sum=1, max=1),
+                hist.FrequencyBin(
+                    lower=4.9996, upper=5.0, count=2, sum=10, max=5)
+            ]),
+            hist.Histogram(hist.HistogramType.LINF_SUM_CONTRIBUTIONS, [
+                hist.FrequencyBin(
+                    lower=10.0, upper=10.004, count=1, sum=10, max=10),
+                hist.FrequencyBin(
+                    lower=49.996, upper=50.0, count=2, sum=100, max=50)
+            ])
+        ]
+        if pre_aggregated:
+            data = list(
+                pre_aggregation.preaggregate(
+                    data,
+                    backend,
+                    data_extractors=pipeline_dp.DataExtractors(
+                        privacy_id_extractor=lambda x: x[0][0],
+                        partition_extractor=lambda x: x[0][1],
+                        value_extractor=lambda x: x[1])))
+
+            compute_histograms = sum_histogram_computation._compute_linf_sum_contributions_histogram_on_preaggregated_data
+        else:
+            compute_histograms = sum_histogram_computation._compute_linf_sum_contributions_histogram
+        histograms = list(compute_histograms(data, backend))
+        self.assertLen(histograms, 1)
+        histograms = histograms[0]
+        self.assertListEqual(histograms, expected)
+
     @parameterized.product(
         (
             dict(testcase_name='empty histogram',
@@ -307,6 +346,45 @@ def test_compute_partition_sum_histogram(self, testcase_name, input,
                              histogram.name)
             self.assertListEqual(expected, histogram.bins)
 
+    @parameterized.parameters(False, True)
+    def test_compute_partition_sum_histogram_2_columns(self,
+                                                       pre_aggregated: bool):
+        # format: ((privacy_id, partition), value: tuple)
+        data = [((0, 0), (1, 10)), ((0, 1), (2, 20)), ((0, 1), (3, 30)),
+                ((1, 0), (5, 50))]
+        backend = pipeline_dp.LocalBackend()
+        expected = [
+            hist.Histogram(hist.HistogramType.SUM_PER_PARTITION, [
+                hist.FrequencyBin(
+                    lower=5.0, upper=5.0001, count=1, sum=5, max=5),
+                hist.FrequencyBin(
+                    lower=5.9999, upper=6.0, count=1, sum=6, max=6)
+            ]),
+            hist.Histogram(hist.HistogramType.SUM_PER_PARTITION, [
+                hist.FrequencyBin(
+                    lower=50.0, upper=50.001, count=1, sum=50, max=50),
+                hist.FrequencyBin(
+                    lower=59.999, upper=60.0, count=1, sum=60, max=60)
+            ])
+        ]
+        if pre_aggregated:
+            data = list(
+                pre_aggregation.preaggregate(
+                    data,
+                    backend,
+                    data_extractors=pipeline_dp.DataExtractors(
+                        privacy_id_extractor=lambda x: x[0][0],
+                        partition_extractor=lambda x: x[0][1],
+                        value_extractor=lambda x: x[1])))
+
+            compute_histograms = sum_histogram_computation._compute_partition_sum_histogram_on_preaggregated_data
+        else:
+            compute_histograms = sum_histogram_computation._compute_partition_sum_histogram
+        histograms = list(compute_histograms(data, backend))
+        self.assertLen(histograms, 1)
+        histograms = histograms[0]
+        self.assertListEqual(histograms, expected)
+
 
 if __name__ == '__main__':
     absltest.main()