Skip to content

Commit 8061485

Browse files
authored
feat: support ApproxDistinct with utf8view (#15200)
* feat: support ApproxDistinct with utf8view * Address comment
1 parent e221a2c commit 8061485

File tree

2 files changed

+59
-1
lines changed

2 files changed

+59
-1
lines changed

datafusion/functions-aggregate/src/approx_distinct.rs

+38-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
//! Defines physical expressions that can be evaluated at runtime during query execution
1919
2020
use crate::hyperloglog::HyperLogLog;
21-
use arrow::array::BinaryArray;
21+
use arrow::array::{BinaryArray, StringViewArray};
2222
use arrow::array::{
2323
GenericBinaryArray, GenericStringArray, OffsetSizeTrait, PrimitiveArray,
2424
};
@@ -126,6 +126,27 @@ where
126126
}
127127
}
128128

129+
#[derive(Debug)]
130+
struct StringViewHLLAccumulator<T>
131+
where
132+
T: OffsetSizeTrait,
133+
{
134+
hll: HyperLogLog<String>,
135+
phantom_data: PhantomData<T>,
136+
}
137+
138+
impl<T> StringViewHLLAccumulator<T>
139+
where
140+
T: OffsetSizeTrait,
141+
{
142+
pub fn new() -> Self {
143+
Self {
144+
hll: HyperLogLog::new(),
145+
phantom_data: PhantomData,
146+
}
147+
}
148+
}
149+
129150
#[derive(Debug)]
130151
struct BinaryHLLAccumulator<T>
131152
where
@@ -197,6 +218,21 @@ where
197218
default_accumulator_impl!();
198219
}
199220

221+
impl<T> Accumulator for StringViewHLLAccumulator<T>
222+
where
223+
T: OffsetSizeTrait,
224+
{
225+
fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
226+
let array: &StringViewArray = downcast_value!(values[0], StringViewArray);
227+
// `flatten()` drops the `None` items yielded for null slots, so nulls are skipped
228+
self.hll
229+
.extend(array.iter().flatten().map(|s| s.to_string()));
230+
Ok(())
231+
}
232+
233+
default_accumulator_impl!();
234+
}
235+
200236
impl<T> Accumulator for StringHLLAccumulator<T>
201237
where
202238
T: OffsetSizeTrait,
@@ -311,6 +347,7 @@ impl AggregateUDFImpl for ApproxDistinct {
311347
DataType::Int64 => Box::new(NumericHLLAccumulator::<Int64Type>::new()),
312348
DataType::Utf8 => Box::new(StringHLLAccumulator::<i32>::new()),
313349
DataType::LargeUtf8 => Box::new(StringHLLAccumulator::<i64>::new()),
350+
DataType::Utf8View => Box::new(StringViewHLLAccumulator::<i32>::new()),
314351
DataType::Binary => Box::new(BinaryHLLAccumulator::<i32>::new()),
315352
DataType::LargeBinary => Box::new(BinaryHLLAccumulator::<i64>::new()),
316353
other => {

datafusion/sqllogictest/test_files/aggregate_skip_partial.slt

+21
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,27 @@ SELECT c2, approx_distinct(c1), approx_distinct(c5) FROM aggregate_test_100 GROU
298298
4 5 23
299299
5 5 14
300300

301+
# Create a copy of aggregate_test_100 with c1 cast to Utf8View
302+
statement ok
303+
CREATE TABLE aggregate_test_100_utf8view AS SELECT
304+
arrow_cast(c1, 'Utf8View') as c1,
305+
c2,
306+
c5
307+
FROM aggregate_test_100;
308+
309+
# Test approx_distinct for varchar(with Utf8View) / int
310+
query III
311+
SELECT c2, approx_distinct(c1), approx_distinct(c5) FROM aggregate_test_100_utf8view GROUP BY c2 ORDER BY c2;
312+
----
313+
1 5 22
314+
2 5 22
315+
3 5 19
316+
4 5 23
317+
5 5 14
318+
319+
statement ok
320+
DROP TABLE aggregate_test_100_utf8view;
321+
301322
# Test count with nullable fields
302323
query III
303324
SELECT c2, count(c3), count(c11) FROM aggregate_test_100_null GROUP BY c2 ORDER BY c2;

0 commit comments

Comments
 (0)