Skip to content

Commit bdca9e1

Browse files
committed
Test for sum_histograms
1 parent b10287c commit bdca9e1

File tree

3 files changed

+122
-10
lines changed

3 files changed

+122
-10
lines changed

analysis/contribution_bounders.py

+21-7
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
import numpy as np
1717
from pipeline_dp import contribution_bounders
1818
from pipeline_dp import sampling_utils
19+
from typing import Iterable
1920

2021

2122
class AnalysisContributionBounder(contribution_bounders.ContributionBounder):
@@ -67,14 +68,27 @@ def rekey_per_privacy_id_per_partition_key_and_unnest(pid_pk_v_values):
6768
for partition_key, values in partition_values:
6869
if sampler is not None and not sampler.keep(partition_key):
6970
continue
70-
if len(values) == 1:
71-
sum_values = values
72-
# elif len(values[0]) == 1: todo
73-
# # 1 value
74-
# sum_values = sum(values)
75-
else:
71+
# Sum the values if needed.
72+
# Values can contain multiple columns; the format is as follows:
73+
# 1 column:
74+
# input: values = [v_0:float, ... ]
75+
# output: v_0 + ....
76+
# k columns (k > 1):
77+
# input: values = [v_0=(v00, ... v0(k-1)), ...]
78+
# output: (v00+v10+..., ...)
79+
if not values:
80+
# Empty public partitions
81+
sum_values = 0
82+
elif len(values) == 1:
83+
# No need to sum, return 0th value
84+
sum_values = values[0]
85+
elif isinstance(values[0], Iterable):
7686
# multiple value columns, sum each column independently
77-
sum_values = np.array(values).sum(axis=0).tolist()
87+
sum_values = tuple(np.array(values).sum(axis=0).tolist())
88+
else:
89+
# 1 column
90+
sum_values = sum(values)
91+
7892
yield (privacy_id, partition_key), (
7993
len(values),
8094
sum_values,

analysis/tests/contribution_bounders_test.py

+23-3
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,8 @@ def _run_contribution_bounding(self,
4040
input,
4141
max_partitions_contributed,
4242
max_contributions_per_partition,
43-
partitions_sampling_prob: float = 1.0):
43+
partitions_sampling_prob: float = 1.0,
44+
aggregate_fn=count_aggregate_fn):
4445
params = CrossAndPerPartitionContributionParams(
4546
max_partitions_contributed, max_contributions_per_partition)
4647

@@ -50,7 +51,7 @@ def _run_contribution_bounding(self,
5051
bounder.bound_contributions(input, params,
5152
pipeline_dp.LocalBackend(),
5253
_create_report_generator(),
53-
count_aggregate_fn))
54+
aggregate_fn))
5455

5556
def test_contribution_bounding_empty_col(self):
5657
input = []
@@ -117,8 +118,27 @@ def test_contribution_bounding_cross_partition_bounding_and_sampling(self):
117118
# Check per- and cross-partition contribution limits are not enforced.
118119
self.assertEqual(set(expected_result), set(bound_result))
119120

121+
def test_contribution_bounding_cross_partition_bounding_and_2_column_values(
122+
self):
123+
input = [("pid1", 'pk1', (1, 2)), ("pid1", 'pk1', (3, 4)),
124+
("pid1", 'pk2', (-1, 0)), ("pid2", 'pk1', (5, 5))]
125+
max_partitions_contributed = 3
126+
max_contributions_per_partition = 5
120127

121-
class SamplingL0LinfContributionBounderTest(parameterized.TestCase):
128+
bound_result = self._run_contribution_bounding(
129+
input,
130+
max_partitions_contributed,
131+
max_contributions_per_partition,
132+
aggregate_fn=lambda x: x)
133+
134+
expected_result = [(('pid1', 'pk2'), (1, (-1, 0), 2, 3)),
135+
(('pid1', 'pk1'), (2, (4, 6), 2, 3)),
136+
(('pid2', 'pk1'), (1, (5, 5), 1, 1))]
137+
# Check per- and cross-partition contribution limits are not enforced, and that each value column is summed independently.
138+
self.assertEqual(set(expected_result), set(bound_result))
139+
140+
141+
class NoOpContributionBounderTest(parameterized.TestCase):
122142

123143
def test_contribution_bounding_doesnt_drop_contributions(self):
124144
# Arrange.

tests/dataset_histograms/sum_histogram_computation_test.py

+78
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,45 @@ def test_compute_linf_sum_contributions_histogram(self, testcase_name,
163163
histogram.name)
164164
self.assertListEqual(expected, histogram.bins)
165165

166+
@parameterized.parameters(False, True)
167+
def test_compute_linf_sum_contributions_histogram_2_columns(
168+
self, pre_aggregated: bool):
169+
# format: ((privacy_id, partition), value: tuple)
170+
data = [((0, 0), (1, 10)), ((0, 1), (2, 20)), ((0, 1), (3, 30)),
171+
((1, 0), (5, 50))]
172+
backend = pipeline_dp.LocalBackend()
173+
expected = [
174+
hist.Histogram(hist.HistogramType.LINF_SUM_CONTRIBUTIONS, [
175+
hist.FrequencyBin(
176+
lower=1.0, upper=1.0004, count=1, sum=1, max=1),
177+
hist.FrequencyBin(
178+
lower=4.9996, upper=5.0, count=2, sum=10, max=5)
179+
]),
180+
hist.Histogram(hist.HistogramType.LINF_SUM_CONTRIBUTIONS, [
181+
hist.FrequencyBin(
182+
lower=10.0, upper=10.004, count=1, sum=10, max=10),
183+
hist.FrequencyBin(
184+
lower=49.996, upper=50.0, count=2, sum=100, max=50)
185+
])
186+
]
187+
if pre_aggregated:
188+
data = list(
189+
pre_aggregation.preaggregate(
190+
data,
191+
backend,
192+
data_extractors=pipeline_dp.DataExtractors(
193+
privacy_id_extractor=lambda x: x[0][0],
194+
partition_extractor=lambda x: x[0][1],
195+
value_extractor=lambda x: x[1])))
196+
197+
compute_histograms = sum_histogram_computation._compute_linf_sum_contributions_histogram_on_preaggregated_data
198+
else:
199+
compute_histograms = sum_histogram_computation._compute_linf_sum_contributions_histogram
200+
histograms = list(compute_histograms(data, backend))
201+
self.assertLen(histograms, 1)
202+
histograms = histograms[0]
203+
self.assertListEqual(histograms, expected)
204+
166205
@parameterized.product(
167206
(
168207
dict(testcase_name='empty histogram',
@@ -307,6 +346,45 @@ def test_compute_partition_sum_histogram(self, testcase_name, input,
307346
histogram.name)
308347
self.assertListEqual(expected, histogram.bins)
309348

349+
@parameterized.parameters(False, True)
350+
def test_compute_partition_sum_histogram_2_columns(self,
351+
pre_aggregated: bool):
352+
# format: ((privacy_id, partition), value: tuple)
353+
data = [((0, 0), (1, 10)), ((0, 1), (2, 20)), ((0, 1), (3, 30)),
354+
((1, 0), (5, 50))]
355+
backend = pipeline_dp.LocalBackend()
356+
expected = [
357+
hist.Histogram(hist.HistogramType.SUM_PER_PARTITION, [
358+
hist.FrequencyBin(
359+
lower=5.0, upper=5.0001, count=1, sum=5, max=5),
360+
hist.FrequencyBin(
361+
lower=5.9999, upper=6.0, count=1, sum=6, max=6)
362+
]),
363+
hist.Histogram(hist.HistogramType.SUM_PER_PARTITION, [
364+
hist.FrequencyBin(
365+
lower=50.0, upper=50.001, count=1, sum=50, max=50),
366+
hist.FrequencyBin(
367+
lower=59.999, upper=60.0, count=1, sum=60, max=60)
368+
])
369+
]
370+
if pre_aggregated:
371+
data = list(
372+
pre_aggregation.preaggregate(
373+
data,
374+
backend,
375+
data_extractors=pipeline_dp.DataExtractors(
376+
privacy_id_extractor=lambda x: x[0][0],
377+
partition_extractor=lambda x: x[0][1],
378+
value_extractor=lambda x: x[1])))
379+
380+
compute_histograms = sum_histogram_computation._compute_partition_sum_histogram_on_preaggregated_data
381+
else:
382+
compute_histograms = sum_histogram_computation._compute_partition_sum_histogram
383+
histograms = list(compute_histograms(data, backend))
384+
self.assertLen(histograms, 1)
385+
histograms = histograms[0]
386+
self.assertListEqual(histograms, expected)
387+
310388

311389
if __name__ == '__main__':
312390
absltest.main()

0 commit comments

Comments
 (0)