@@ -62,6 +62,8 @@ public class ActionsIterator implements CloseableIterator<ActionWrapper> {
62
62
63
63
private final StructType readSchema ;
64
64
65
+ private final StructType checkpointReadSchema ;
66
+
65
67
private final boolean schemaContainsAddOrRemoveFiles ;
66
68
67
69
/**
@@ -79,12 +81,22 @@ public ActionsIterator(
79
81
List <FileStatus > files ,
80
82
StructType readSchema ,
81
83
Optional <Predicate > checkpointPredicate ) {
84
+ this (engine , files , readSchema , readSchema , checkpointPredicate );
85
+ }
86
+
87
/**
 * Creates an iterator over the actions of the given log files, with a separate schema for
 * checkpoint reads.
 *
 * @param engine engine used to read the log files
 * @param files log files to iterate; each one is converted with
 *     {@code DeltaLogFile.forCommitOrCheckpoint} and kept in the given order
 * @param readSchema schema used for reading actions; also determines whether add/remove file
 *     actions are considered part of the read
 * @param checkpointReadSchema schema handed to the Parquet reader for checkpoint files
 * @param checkpointPredicate optional predicate used when reading checkpoint files
 */
public ActionsIterator(
    Engine engine,
    List<FileStatus> files,
    StructType readSchema,
    StructType checkpointReadSchema,
    Optional<Predicate> checkpointPredicate) {
  this.engine = engine;
  this.checkpointPredicate = checkpointPredicate;

  // Classify every file as a commit or checkpoint file, preserving the caller's ordering.
  this.filesList =
      files.stream()
          .map(DeltaLogFile::forCommitOrCheckpoint)
          .collect(Collectors.toCollection(LinkedList::new));

  this.readSchema = readSchema;
  this.checkpointReadSchema = checkpointReadSchema;

  // No file has been opened yet; the per-file iterator is created lazily.
  this.actionsIter = Optional.empty();

  // Derived from the general read schema only, not from the checkpoint-specific schema.
  this.schemaContainsAddOrRemoveFiles = LogReplay.containsAddOrRemoveFileActions(readSchema);
}
@@ -177,8 +189,10 @@ private CloseableIterator<ColumnarBatch> getActionsIterFromSinglePartOrV2Checkpo
177
189
// If the sidecars may contain the current action, read sidecars from the top-level v2
178
190
// checkpoint file(to be read later).
179
191
StructType modifiedReadSchema = readSchema ;
192
+ StructType modifiedCheckpointReadSchema = checkpointReadSchema ;
180
193
if (schemaContainsAddOrRemoveFiles ) {
181
194
modifiedReadSchema = LogReplay .withSidecarFileSchema (readSchema );
195
+ modifiedCheckpointReadSchema = LogReplay .withSidecarFileSchema (checkpointReadSchema );
182
196
}
183
197
184
198
long checkpointVersion = checkpointVersion (file .getPath ());
@@ -195,7 +209,12 @@ private CloseableIterator<ColumnarBatch> getActionsIterFromSinglePartOrV2Checkpo
195
209
checkpointPredicateIncludingSidecars = checkpointPredicate ;
196
210
}
197
211
final CloseableIterator <ColumnarBatch > topLevelIter ;
198
- StructType finalModifiedReadSchema = modifiedReadSchema ;
212
+ StructType finalCommitReadSchema = modifiedReadSchema ;
213
+ // We do not need to look at any `remove` actions stored in checkpoints. Skip the column to save
214
+ // I/O. Note that we are still going to process the row groups. Adds and removes are randomly
215
+ // scattered through checkpoint part files, so row group push down is unlikely to be useful.
216
+ StructType finalCheckpointReadSchema = modifiedCheckpointReadSchema ;
217
+
199
218
if (fileName .endsWith (".parquet" )) {
200
219
topLevelIter =
201
220
wrapEngineExceptionThrowsIO (
@@ -204,7 +223,7 @@ private CloseableIterator<ColumnarBatch> getActionsIterFromSinglePartOrV2Checkpo
204
223
.getParquetHandler ()
205
224
.readParquetFiles (
206
225
singletonCloseableIterator (file ),
207
- finalModifiedReadSchema ,
226
+ finalCheckpointReadSchema ,
208
227
checkpointPredicateIncludingSidecars ),
209
228
"Reading parquet log file `%s` with readSchema=%s and predicate=%s" ,
210
229
file ,
@@ -218,7 +237,7 @@ private CloseableIterator<ColumnarBatch> getActionsIterFromSinglePartOrV2Checkpo
218
237
.getJsonHandler ()
219
238
.readJsonFiles (
220
239
singletonCloseableIterator (file ),
221
- finalModifiedReadSchema ,
240
+ finalCommitReadSchema ,
222
241
checkpointPredicateIncludingSidecars ),
223
242
"Reading JSON log file `%s` with readSchema=%s and predicate=%s" ,
224
243
file ,
0 commit comments