
Commit 484eb33 (1 parent: 72b585d)

feat: introduce feature flags to select major arrow versions (#654)

This change introduces `arrow_53` and `arrow_54` feature flags on the kernel crate; one of them is _required_ when using `default-engine` or `sync-engine`. Fundamentally, we must push users of the crate to select their arrow major version through feature flags, because Cargo _will_ include multiple major versions of a crate in the dependency tree, and that can cause ABI breakage when symbols such as `RecordBatch` are passed between code built against different versions. See #640.

Signed-off-by: R. Tyler Croy <[email protected]>
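The kernel-side re-export module that makes this work is hidden in this large-commit view, but the visible diffs import through `delta_kernel::arrow` and `delta_kernel::parquet`, and kernel/Cargo.toml renames the versioned crates to `arrow_53`/`parquet_53` and `arrow_54`/`parquet_54`. A minimal sketch of what such a cfg-gated re-export could look like (the module layout here is an assumption, not the commit's actual code):

    // Hypothetical shim inside delta_kernel: expose exactly one arrow/parquet
    // major version under a stable path, so downstream code writes
    // `delta_kernel::arrow::...` and never names a major version itself.
    #[cfg(feature = "arrow_53")]
    pub use arrow_53 as arrow;
    #[cfg(feature = "arrow_53")]
    pub use parquet_53 as parquet;

    // Fall back to the 54 line only when 53 is not enabled, keeping the
    // re-export unambiguous even if both features end up turned on.
    #[cfg(all(feature = "arrow_54", not(feature = "arrow_53")))]
    pub use arrow_54 as arrow;
    #[cfg(all(feature = "arrow_54", not(feature = "arrow_53")))]
    pub use parquet_54 as parquet;

With a single re-export in play, an import like `use delta_kernel::arrow::array::RecordBatch;` resolves to one major version everywhere in the build, which is what prevents the `RecordBatch` ABI mismatch described above.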

46 files changed (+249 -296 lines). Large commits have some content hidden by default, so only a subset of the changed files is shown below.

.github/workflows/build.yml (+4 -4)

@@ -37,7 +37,7 @@ jobs:
           cargo install cargo-msrv --locked
       - name: verify-msrv
         run: |
-          cargo msrv --path kernel/ verify --all-features
+          cargo msrv --path kernel/ verify --features $(cat .github/workflows/default-kernel-features)
           cargo msrv --path derive-macros/ verify --all-features
           cargo msrv --path ffi/ verify --all-features
           cargo msrv --path ffi-proc-macros/ verify --all-features
@@ -104,7 +104,7 @@ jobs:
       - name: check kernel builds with no-default-features
         run: cargo build -p delta_kernel --no-default-features
       - name: build and lint with clippy
-        run: cargo clippy --benches --tests --all-features -- -D warnings
+        run: cargo clippy --benches --tests --features $(cat .github/workflows/default-kernel-features) -- -D warnings
       - name: lint without default features
         run: cargo clippy --no-default-features -- -D warnings
       - name: check kernel builds with default-engine
@@ -129,7 +129,7 @@ jobs:
           override: true
       - uses: Swatinem/rust-cache@v2
       - name: test
-        run: cargo test --workspace --verbose --all-features -- --skip read_table_version_hdfs
+        run: cargo test --workspace --verbose --features $(cat .github/workflows/default-kernel-features) -- --skip read_table_version_hdfs

   ffi_test:
     runs-on: ${{ matrix.os }}
@@ -229,7 +229,7 @@ jobs:
         uses: taiki-e/install-action@cargo-llvm-cov
       - uses: Swatinem/rust-cache@v2
       - name: Generate code coverage
-        run: cargo llvm-cov --all-features --workspace --codecov --output-path codecov.json -- --skip read_table_version_hdfs
+        run: cargo llvm-cov --features $(cat .github/workflows/default-kernel-features) --workspace --codecov --output-path codecov.json -- --skip read_table_version_hdfs
       - name: Upload coverage to Codecov
         uses: codecov/codecov-action@v5
         with:
.github/workflows/default-kernel-features (new file, +1)

@@ -0,0 +1 @@
+integration-test,default-engine,default-engine-rustls,cloud,arrow,sync-engine

Cargo.toml (-15)

@@ -23,21 +23,6 @@ rust-version = "1.80"
 version = "0.6.1"

 [workspace.dependencies]
-# When changing the arrow version range, also modify ffi/Cargo.toml which has
-# its own arrow version ranges witeh modified features. Failure to do so will
-# result in compilation errors as two different sets of arrow dependencies may
-# be sourced
-arrow = { version = ">=53, <55" }
-arrow-arith = { version = ">=53, <55" }
-arrow-array = { version = ">=53, <55" }
-arrow-buffer = { version = ">=53, <55" }
-arrow-cast = { version = ">=53, <55" }
-arrow-data = { version = ">=53, <55" }
-arrow-ord = { version = ">=53, <55" }
-arrow-json = { version = ">=53, <55" }
-arrow-select = { version = ">=53, <55" }
-arrow-schema = { version = ">=53, <55" }
-parquet = { version = ">=53, <55", features = ["object_store"] }
 object_store = { version = ">=0.11, <0.12" }
 hdfs-native-object-store = "0.12.0"
 hdfs-native = "0.10.0"

acceptance/Cargo.toml (+1 -6)

@@ -14,19 +14,14 @@ rust-version.workspace = true
 release = false

 [dependencies]
-arrow-array = { workspace = true }
-arrow-cast = { workspace = true }
-arrow-ord = { workspace = true }
-arrow-select = { workspace = true }
-arrow-schema = { workspace = true }
 delta_kernel = { path = "../kernel", features = [
     "default-engine",
+    "arrow_53",
     "developer-visibility",
 ] }
 futures = "0.3"
 itertools = "0.13"
 object_store = { workspace = true }
-parquet = { workspace = true }
 serde = { version = "1", features = ["derive"] }
 serde_json = "1"
 thiserror = "1"

acceptance/src/data.rs (+10 -7)

@@ -1,15 +1,18 @@
 use std::{path::Path, sync::Arc};

-use arrow_array::{Array, RecordBatch};
-use arrow_ord::sort::{lexsort_to_indices, SortColumn};
-use arrow_schema::{DataType, Schema};
-use arrow_select::{concat::concat_batches, filter::filter_record_batch, take::take};
+use delta_kernel::arrow::array::{Array, RecordBatch};
+use delta_kernel::arrow::compute::{
+    concat_batches, filter_record_batch, lexsort_to_indices, take, SortColumn,
+};
+use delta_kernel::arrow::datatypes::{DataType, Schema};

+use delta_kernel::parquet::arrow::async_reader::{
+    ParquetObjectReader, ParquetRecordBatchStreamBuilder,
+};
 use delta_kernel::{engine::arrow_data::ArrowEngineData, DeltaResult, Engine, Error, Table};
 use futures::{stream::TryStreamExt, StreamExt};
 use itertools::Itertools;
 use object_store::{local::LocalFileSystem, ObjectStore};
-use parquet::arrow::async_reader::{ParquetObjectReader, ParquetRecordBatchStreamBuilder};

 use crate::{TestCaseInfo, TestResult};

@@ -83,8 +86,8 @@ fn assert_schema_fields_match(schema: &Schema, golden: &Schema) {
 fn normalize_col(col: Arc<dyn Array>) -> Arc<dyn Array> {
     if let DataType::Timestamp(unit, Some(zone)) = col.data_type() {
         if **zone == *"+00:00" {
-            arrow_cast::cast::cast(&col, &DataType::Timestamp(*unit, Some("UTC".into())))
-                .expect("Could not cast to UTC")
+            let data_type = DataType::Timestamp(*unit, Some("UTC".into()));
+            delta_kernel::arrow::compute::cast(&col, &data_type).expect("Could not cast to UTC")
         } else {
             col
         }

feature-tests/Cargo.toml (+1 -1)

@@ -12,7 +12,7 @@ version.workspace = true
 release = false

 [dependencies]
-delta_kernel = { path = "../kernel" }
+delta_kernel = { path = "../kernel", features = ["arrow_53"] }

 [features]
 default-engine = [ "delta_kernel/default-engine" ]

ffi/Cargo.toml (+2 -13)

@@ -22,21 +22,13 @@ tracing-core = { version = "0.1", optional = true }
 tracing-subscriber = { version = "0.3", optional = true, features = [ "json" ] }
 url = "2"
 delta_kernel = { path = "../kernel", default-features = false, features = [
+    "arrow",
     "developer-visibility",
 ] }
 delta_kernel_ffi_macros = { path = "../ffi-proc-macros", version = "0.6.1" }

-# used if we use the default engine to be able to move arrow data into the c-ffi format
-arrow-schema = { version = ">=53, <55", default-features = false, features = [
-    "ffi",
-], optional = true }
-arrow-data = { version = ">=53, <55", default-features = false, features = [
-    "ffi",
-], optional = true }
-arrow-array = { version = ">=53, <55", default-features = false, optional = true }
-
 [build-dependencies]
-cbindgen = "0.27.0"
+cbindgen = "0.28"
 libc = "0.2.158"

 [dev-dependencies]
@@ -52,9 +44,6 @@ default = ["default-engine"]
 cloud = ["delta_kernel/cloud"]
 default-engine = [
     "delta_kernel/default-engine",
-    "arrow-array",
-    "arrow-data",
-    "arrow-schema",
 ]
 tracing = [ "tracing-core", "tracing-subscriber" ]
 sync-engine = ["delta_kernel/sync-engine"]

ffi/cbindgen.toml (+1 -1)

@@ -25,4 +25,4 @@ parse_deps = true
 # only crates found in this list will ever be parsed.
 #
 # default: there is no allow-list (NOTE: this is the opposite of [])
-include = ["delta_kernel", "arrow-data", "arrow-schema"]
+include = ["arrow", "arrow-data", "arrow-schema", "delta_kernel"]

ffi/src/engine_data.rs (+11 -7)

@@ -1,5 +1,9 @@
 //! EngineData related ffi code

+use delta_kernel::arrow::array::{
+    ffi::{FFI_ArrowArray, FFI_ArrowSchema},
+    ArrayData, StructArray,
+};
 use delta_kernel::{DeltaResult, EngineData};
 use std::ffi::c_void;

@@ -45,8 +49,8 @@ unsafe fn get_raw_engine_data_impl(data: &mut Handle<ExclusiveEngineData>) -> &m
 #[cfg(feature = "default-engine")]
 #[repr(C)]
 pub struct ArrowFFIData {
-    pub array: arrow_data::ffi::FFI_ArrowArray,
-    pub schema: arrow_schema::ffi::FFI_ArrowSchema,
+    pub array: FFI_ArrowArray,
+    pub schema: FFI_ArrowSchema,
 }

 // TODO: This should use a callback to avoid having to have the engine free the struct
@@ -71,16 +75,16 @@ pub unsafe extern "C" fn get_raw_arrow_data(
 // TODO: This method leaks the returned pointer memory. How will the engine free it?
 #[cfg(feature = "default-engine")]
 fn get_raw_arrow_data_impl(data: Box<dyn EngineData>) -> DeltaResult<*mut ArrowFFIData> {
-    let record_batch: arrow_array::RecordBatch = data
+    let record_batch: delta_kernel::arrow::array::RecordBatch = data
         .into_any()
         .downcast::<delta_kernel::engine::arrow_data::ArrowEngineData>()
         .map_err(|_| delta_kernel::Error::EngineDataType("ArrowEngineData".to_string()))?
         .into();
-    let sa: arrow_array::StructArray = record_batch.into();
-    let array_data: arrow_data::ArrayData = sa.into();
+    let sa: StructArray = record_batch.into();
+    let array_data: ArrayData = sa.into();
     // these call `clone`. is there a way to not copy anything and what exactly are they cloning?
-    let array = arrow_data::ffi::FFI_ArrowArray::new(&array_data);
-    let schema = arrow_schema::ffi::FFI_ArrowSchema::try_from(array_data.data_type())?;
+    let array = FFI_ArrowArray::new(&array_data);
+    let schema = FFI_ArrowSchema::try_from(array_data.data_type())?;
     let ret_data = Box::new(ArrowFFIData { array, schema });
     Ok(Box::leak(ret_data))
 }
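`get_raw_arrow_data_impl` intentionally leaks the returned box (see the TODO above), so the engine side is expected to re-own the pointer and free it. A hypothetical consumer-side sketch, assuming the `ArrowFFIData` definition above is in scope and that arrow's `from_ffi` helper is available at the re-exported path; none of this is code from the commit:

    // Hypothetical engine-side consumer: reclaim the leaked ArrowFFIData and
    // rebuild a safe ArrayData from the Arrow C Data Interface structs.
    use delta_kernel::arrow::array::{ffi::from_ffi, ArrayData};

    unsafe fn import_arrow_data(raw: *mut ArrowFFIData) -> ArrayData {
        // Re-own the Box that get_raw_arrow_data_impl leaked, so it is freed here.
        let data = Box::from_raw(raw);
        from_ffi(data.array, &data.schema).expect("valid Arrow C data interface")
    }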

integration-tests/Cargo.toml (+1 -16)

@@ -6,19 +6,4 @@ edition = "2021"
 [workspace]

 [dependencies]
-arrow = "=53.0.0"
-delta_kernel = { path = "../kernel", features = ["arrow-conversion", "arrow-expression", "default-engine", "sync-engine"] }
-
-[patch.'file:///../kernel']
-arrow = "=53.0.0"
-arrow-arith = "=53.0.0"
-arrow-array = "=53.0.0"
-arrow-buffer = "=53.0.0"
-arrow-cast = "=53.0.0"
-arrow-data = "=53.0.0"
-arrow-ord = "=53.0.0"
-arrow-json = "=53.0.0"
-arrow-select = "=53.0.0"
-arrow-schema = "=53.0.0"
-parquet = "=53.0.0"
-object_store = "=0.11.1"
+delta_kernel = { path = "../kernel", features = ["default-engine", "sync-engine"] }

integration-tests/src/main.rs (+5 -4)

@@ -1,15 +1,16 @@
-fn create_arrow_schema() -> arrow::datatypes::Schema {
-    use arrow::datatypes::{DataType, Field, Schema};
+use delta_kernel::arrow::datatypes::{DataType, Field, Schema};
+
+fn create_arrow_schema() -> Schema {
     let field_a = Field::new("a", DataType::Int64, false);
     let field_b = Field::new("b", DataType::Boolean, false);
     Schema::new(vec![field_a, field_b])
 }

 fn create_kernel_schema() -> delta_kernel::schema::Schema {
-    use delta_kernel::schema::{DataType, Schema, StructField};
+    use delta_kernel::schema::{DataType, StructField};
     let field_a = StructField::not_null("a", DataType::LONG);
     let field_b = StructField::not_null("b", DataType::BOOLEAN);
-    Schema::new(vec![field_a, field_b])
+    delta_kernel::schema::Schema::new(vec![field_a, field_b])
 }

 fn main() {

integration-tests/test-all-arrow-versions.sh (+10 -23)

@@ -2,38 +2,25 @@

 set -eu -o pipefail

-is_version_le() {
-  [ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ]
-}
-
-is_version_lt() {
-  if [ "$1" = "$2" ]
-  then
-    return 1
-  else
-    is_version_le "$1" "$2"
-  fi
-}
-
 test_arrow_version() {
   ARROW_VERSION="$1"
   echo "== Testing version $ARROW_VERSION =="
-  sed -i'' -e "s/\(arrow[^\"]*=[^\"]*\).*/\1\"=$ARROW_VERSION\"/" Cargo.toml
-  sed -i'' -e "s/\(parquet[^\"]*\).*/\1\"=$ARROW_VERSION\"/" Cargo.toml
   cargo clean
   rm -f Cargo.lock
   cargo update
   cat Cargo.toml
-  cargo run
+  cargo run --features ${ARROW_VERSION}
 }

-MIN_ARROW_VER="53.0.0"
-MAX_ARROW_VER="54.0.0"
+FEATURES=$(cat ../kernel/Cargo.toml | grep -e ^arrow_ | awk '{ print $1 }' | sort -u)

-for ARROW_VERSION in $(curl -s https://crates.io/api/v1/crates/arrow | jq -r '.versions[].num' | tr -d '\r')
+
+echo "[features]" >> Cargo.toml
+
+for ARROW_VERSION in ${FEATURES}
 do
-  if ! is_version_lt "$ARROW_VERSION" "$MIN_ARROW_VER" && is_version_lt "$ARROW_VERSION" "$MAX_ARROW_VER"
-  then
-    test_arrow_version "$ARROW_VERSION"
-  fi
+  echo "${ARROW_VERSION} = [\"delta_kernel/${ARROW_VERSION}\"]" >> Cargo.toml
+  test_arrow_version $ARROW_VERSION
 done
+
+git checkout Cargo.toml

kernel/Cargo.toml (+23 -35)

@@ -58,20 +58,22 @@ visibility = "0.1.1"

 # Used in the sync engine
 tempfile = { version = "3", optional = true }
+
+# Arrow supported versions
+## 53
 # Used in default engine
-arrow-buffer = { workspace = true, optional = true }
-arrow-array = { workspace = true, optional = true, features = ["chrono-tz"] }
-arrow-select = { workspace = true, optional = true }
-arrow-arith = { workspace = true, optional = true }
-arrow-cast = { workspace = true, optional = true }
-arrow-json = { workspace = true, optional = true }
-arrow-ord = { workspace = true, optional = true }
-arrow-schema = { workspace = true, optional = true }
+arrow_53 = { package = "arrow", version = "53", features = ["chrono-tz", "ffi", "json", "prettyprint"], optional = true }
+# Used in default and sync engine
+parquet_53 = { package = "parquet", version = "53", features = ["async", "object_store"] , optional = true }
+######
+## 54
+arrow_54 = { package = "arrow", version = "54", features = ["chrono-tz", "ffi", "json", "prettyprint"], optional = true }
+parquet_54 = { package = "parquet", version = "54", features = ["async", "object_store"] , optional = true }
+######
+
 futures = { version = "0.3", optional = true }
 object_store = { workspace = true, optional = true }
 hdfs-native-object-store = { workspace = true, optional = true }
-# Used in default and sync engine
-parquet = { workspace = true, optional = true }
 # Used for fetching direct urls (like pre-signed urls)
 reqwest = { version = "0.12.8", default-features = false, optional = true }
 strum = { version = "0.26", features = ["derive"] }
@@ -85,14 +87,16 @@ hdfs-native = { workspace = true, optional = true }
 walkdir = { workspace = true, optional = true }

 [features]
-arrow-conversion = ["arrow-schema"]
-arrow-expression = [
-    "arrow-arith",
-    "arrow-array",
-    "arrow-buffer",
-    "arrow-ord",
-    "arrow-schema",
-]
+# The default version to be expected
+arrow = ["arrow_53"]
+
+arrow_53 = ["dep:arrow_53", "dep:parquet_53"]
+
+arrow_54 = ["dep:arrow_54", "dep:parquet_54"]
+
+arrow-conversion = []
+arrow-expression = []
+
 cloud = [
     "object_store/aws",
     "object_store/azure",
@@ -107,16 +111,8 @@ default = []
 default-engine-base = [
     "arrow-conversion",
     "arrow-expression",
-    "arrow-array",
-    "arrow-buffer",
-    "arrow-cast",
-    "arrow-json",
-    "arrow-schema",
-    "arrow-select",
     "futures",
     "object_store",
-    "parquet/async",
-    "parquet/object_store",
     "tokio",
     "uuid/v4",
     "uuid/fast-rng",
@@ -134,13 +130,6 @@ default-engine-rustls = [

 developer-visibility = []
 sync-engine = [
-    "arrow-cast",
-    "arrow-conversion",
-    "arrow-expression",
-    "arrow-array",
-    "arrow-json",
-    "arrow-select",
-    "parquet",
     "tempfile",
 ]
 integration-test = [
@@ -156,8 +145,7 @@ version = "=0.5.9"
 rustc_version = "0.4.1"

 [dev-dependencies]
-arrow = { workspace = true, features = ["json", "prettyprint"] }
-delta_kernel = { path = ".", features = ["default-engine", "sync-engine"] }
+delta_kernel = { path = ".", features = ["arrow", "default-engine", "sync-engine"] }
 test_utils = { path = "../test-utils" }
 paste = "1.0"
 test-log = { version = "0.2", default-features = false, features = ["trace"] }
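One subtlety with this feature layout: Cargo features are additive, so a dependency graph can end up enabling both `arrow_53` and `arrow_54` at once, and the kernel has to resolve that to a single re-exported version. A cfg chain that prefers the lower version is one option; a stricter, equally hypothetical alternative (not taken from this commit) is to reject the combination outright:

    // Hypothetical guard: surface the conflict as a build error instead of
    // silently picking one arrow major version.
    #[cfg(all(feature = "arrow_53", feature = "arrow_54"))]
    compile_error!("enable exactly one of the `arrow_53` / `arrow_54` features");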
