44
44
* iterator of (ColumnarBatch, isFromCheckpoint) tuples, where the schema of the ColumnarBatch
45
45
* semantically represents actions (or, a subset of action fields) parsed from the Delta Log.
46
46
*
47
- * <p>Users must pass in a `readSchema` to select which actions and sub-fields they want to consume.
47
+ * <p>Users must pass in a `deltaReadSchema` to select which actions and sub-fields they want to
48
+ * consume.
49
+ *
50
+ * <p>Users can also pass in an optional `checkpointReadSchema` if it is different from
51
+ * `deltaReadSchema`.
48
52
*/
49
53
public class ActionsIterator implements CloseableIterator <ActionWrapper > {
50
54
private final Engine engine ;
@@ -60,7 +64,14 @@ public class ActionsIterator implements CloseableIterator<ActionWrapper> {
60
64
*/
61
65
private final LinkedList <DeltaLogFile > filesList ;
62
66
63
- private final StructType readSchema ;
67
+ /** Schema used for reading delta files. */
68
+ private final StructType deltaReadSchema ;
69
+
70
+ /**
71
+ * Schema to be used for reading checkpoint files. Checkpoint files can be Parquet or JSON in the
72
+ * case of v2 checkpoints.
73
+ */
74
+ private final StructType checkpointReadSchema ;
64
75
65
76
private final boolean schemaContainsAddOrRemoveFiles ;
66
77
@@ -77,16 +88,26 @@ public class ActionsIterator implements CloseableIterator<ActionWrapper> {
77
88
public ActionsIterator (
78
89
Engine engine ,
79
90
List <FileStatus > files ,
80
- StructType readSchema ,
91
+ StructType deltaReadSchema ,
92
+ Optional <Predicate > checkpointPredicate ) {
93
+ this (engine , files , deltaReadSchema , deltaReadSchema , checkpointPredicate );
94
+ }
95
+
96
+ public ActionsIterator (
97
+ Engine engine ,
98
+ List <FileStatus > files ,
99
+ StructType deltaReadSchema ,
100
+ StructType checkpointReadSchema ,
81
101
Optional <Predicate > checkpointPredicate ) {
82
102
this .engine = engine ;
83
103
this .checkpointPredicate = checkpointPredicate ;
84
104
this .filesList = new LinkedList <>();
85
105
this .filesList .addAll (
86
106
files .stream ().map (DeltaLogFile ::forCommitOrCheckpoint ).collect (Collectors .toList ()));
87
- this .readSchema = readSchema ;
107
+ this .deltaReadSchema = deltaReadSchema ;
108
+ this .checkpointReadSchema = checkpointReadSchema ;
88
109
this .actionsIter = Optional .empty ();
89
- this .schemaContainsAddOrRemoveFiles = LogReplay .containsAddOrRemoveFileActions (readSchema );
110
+ this .schemaContainsAddOrRemoveFiles = LogReplay .containsAddOrRemoveFileActions (deltaReadSchema );
90
111
}
91
112
92
113
@ Override
@@ -105,7 +126,8 @@ public boolean hasNext() {
105
126
106
127
/**
107
128
* @return a tuple of (ColumnarBatch, isFromCheckpoint), where ColumnarBatch conforms to the
108
- * instance {@link #readSchema}.
129
+ * instance {@link #deltaReadSchema} or {@link #checkpointReadSchema} (the latter when when
130
+ * isFromCheckpoint=true).
109
131
*/
110
132
@ Override
111
133
public ActionWrapper next () {
@@ -176,9 +198,9 @@ private CloseableIterator<ColumnarBatch> getActionsIterFromSinglePartOrV2Checkpo
176
198
FileStatus file , String fileName ) throws IOException {
177
199
// If the sidecars may contain the current action, read sidecars from the top-level v2
178
200
// checkpoint file(to be read later).
179
- StructType modifiedReadSchema = readSchema ;
201
+ StructType modifiedReadSchema = checkpointReadSchema ;
180
202
if (schemaContainsAddOrRemoveFiles ) {
181
- modifiedReadSchema = LogReplay .withSidecarFileSchema (readSchema );
203
+ modifiedReadSchema = LogReplay .withSidecarFileSchema (checkpointReadSchema );
182
204
}
183
205
184
206
long checkpointVersion = checkpointVersion (file .getPath ());
@@ -195,7 +217,8 @@ private CloseableIterator<ColumnarBatch> getActionsIterFromSinglePartOrV2Checkpo
195
217
checkpointPredicateIncludingSidecars = checkpointPredicate ;
196
218
}
197
219
final CloseableIterator <ColumnarBatch > topLevelIter ;
198
- StructType finalModifiedReadSchema = modifiedReadSchema ;
220
+ StructType finalReadSchema = modifiedReadSchema ;
221
+
199
222
if (fileName .endsWith (".parquet" )) {
200
223
topLevelIter =
201
224
wrapEngineExceptionThrowsIO (
@@ -204,11 +227,11 @@ private CloseableIterator<ColumnarBatch> getActionsIterFromSinglePartOrV2Checkpo
204
227
.getParquetHandler ()
205
228
.readParquetFiles (
206
229
singletonCloseableIterator (file ),
207
- finalModifiedReadSchema ,
230
+ finalReadSchema ,
208
231
checkpointPredicateIncludingSidecars ),
209
232
"Reading parquet log file `%s` with readSchema=%s and predicate=%s" ,
210
233
file ,
211
- modifiedReadSchema ,
234
+ finalReadSchema ,
212
235
checkpointPredicateIncludingSidecars );
213
236
} else if (fileName .endsWith (".json" )) {
214
237
topLevelIter =
@@ -218,11 +241,11 @@ private CloseableIterator<ColumnarBatch> getActionsIterFromSinglePartOrV2Checkpo
218
241
.getJsonHandler ()
219
242
.readJsonFiles (
220
243
singletonCloseableIterator (file ),
221
- finalModifiedReadSchema ,
244
+ finalReadSchema ,
222
245
checkpointPredicateIncludingSidecars ),
223
246
"Reading JSON log file `%s` with readSchema=%s and predicate=%s" ,
224
247
file ,
225
- modifiedReadSchema ,
248
+ finalReadSchema ,
226
249
checkpointPredicateIncludingSidecars );
227
250
} else {
228
251
throw new IOException ("Unrecognized top level v2 checkpoint file format: " + fileName );
@@ -309,10 +332,12 @@ private CloseableIterator<ActionWrapper> getNextActionsIter() {
309
332
engine
310
333
.getJsonHandler ()
311
334
.readJsonFiles (
312
- singletonCloseableIterator (nextFile ), readSchema , Optional .empty ()),
335
+ singletonCloseableIterator (nextFile ),
336
+ deltaReadSchema ,
337
+ Optional .empty ()),
313
338
"Reading JSON log file `%s` with readSchema=%s" ,
314
339
nextFile ,
315
- readSchema );
340
+ deltaReadSchema );
316
341
317
342
return combine (
318
343
dataIter ,
@@ -344,10 +369,11 @@ private CloseableIterator<ActionWrapper> getNextActionsIter() {
344
369
() ->
345
370
engine
346
371
.getParquetHandler ()
347
- .readParquetFiles (checkpointFiles , readSchema , checkpointPredicate ),
372
+ .readParquetFiles (
373
+ checkpointFiles , deltaReadSchema , checkpointPredicate ),
348
374
"Reading checkpoint sidecars [%s] with readSchema=%s and predicate=%s" ,
349
375
checkpointFiles ,
350
- readSchema ,
376
+ deltaReadSchema ,
351
377
checkpointPredicate );
352
378
353
379
long version = checkpointVersion (nextFilePath );
0 commit comments