@@ -64,6 +64,7 @@ pub(crate) struct PageIndexEvaluator<'a> {
64
64
row_group_metadata : & ' a RowGroupMetaData ,
65
65
iceberg_field_id_to_parquet_column_index : & ' a HashMap < i32 , usize > ,
66
66
snapshot_schema : & ' a Schema ,
67
+ row_count_cache : HashMap < usize , Vec < usize > > ,
67
68
}
68
69
69
70
impl < ' a > PageIndexEvaluator < ' a > {
@@ -80,6 +81,7 @@ impl<'a> PageIndexEvaluator<'a> {
80
81
row_group_metadata,
81
82
iceberg_field_id_to_parquet_column_index : field_id_map,
82
83
snapshot_schema,
84
+ row_count_cache : HashMap :: new ( ) ,
83
85
}
84
86
}
85
87
@@ -126,7 +128,7 @@ impl<'a> PageIndexEvaluator<'a> {
126
128
}
127
129
128
130
fn calc_row_selection < F > (
129
- & self ,
131
+ & mut self ,
130
132
field_id : i32 ,
131
133
predicate : F ,
132
134
missing_col_behavior : MissingColBehavior ,
@@ -168,17 +170,28 @@ impl<'a> PageIndexEvaluator<'a> {
168
170
return self . select_all_rows ( ) ;
169
171
} ;
170
172
171
- let Some ( offset_index) = self . offset_index . get ( parquet_column_index) else {
172
- // if we have a column index, we should always have an offset index.
173
- return Err ( Error :: new (
174
- ErrorKind :: Unexpected ,
175
- format ! ( "Missing offset index for field id {}" , field_id) ,
176
- ) ) ;
177
- } ;
173
+ let row_counts = {
174
+ // Caches row count calculations for columns that appear multiple times in
175
+ // the predicate
176
+ match self . row_count_cache . get ( & parquet_column_index) {
177
+ Some ( count) => count. clone ( ) ,
178
+ None => {
179
+ let Some ( offset_index) = self . offset_index . get ( parquet_column_index) else {
180
+ // if we have a column index, we should always have an offset index.
181
+ return Err ( Error :: new (
182
+ ErrorKind :: Unexpected ,
183
+ format ! ( "Missing offset index for field id {}" , field_id) ,
184
+ ) ) ;
185
+ } ;
186
+
187
+ let count = self . calc_row_counts ( offset_index) ;
188
+ self . row_count_cache
189
+ . insert ( parquet_column_index, count. clone ( ) ) ;
178
190
179
- // TODO: cache row_counts to avoid recalcing if the same column
180
- // appears multiple times in the filter predicate
181
- let row_counts = self . calc_row_counts ( offset_index) ;
191
+ count
192
+ }
193
+ }
194
+ } ;
182
195
183
196
let Some ( page_filter) = Self :: apply_predicate_to_column_index (
184
197
predicate,
@@ -205,7 +218,7 @@ impl<'a> PageIndexEvaluator<'a> {
205
218
Ok ( row_selectors. into ( ) )
206
219
}
207
220
208
- /// returns a list of row counts per page
221
+ /// Returns a list of row counts per page
209
222
fn calc_row_counts ( & self , offset_index : & OffsetIndexMetaData ) -> Vec < usize > {
210
223
let mut remaining_rows = self . row_group_metadata . num_rows ( ) as usize ;
211
224
let mut row_counts = Vec :: with_capacity ( self . offset_index . len ( ) ) ;
0 commit comments