Skip to content

Commit d416f69

Browse files
authored
Fix FOR bug, also fix bench to compile (#341)
@robert3005 1. Fix issue with FoR compression where nullability was getting stripped, causing `compress` benchmark to fail 2. Fix some random tokio bs that was also causing `compress` benchmark to fail due to recursive runtime creations
1 parent 0e83666 commit d416f69

File tree

39 files changed

+684
-439
lines changed

39 files changed

+684
-439
lines changed

Cargo.lock

+1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

bench-vortex/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ tokio = { workspace = true }
3737
uuid = { workspace = true }
3838
vortex-alp = { path = "../vortex-alp" }
3939
vortex-array = { path = "../vortex-array" }
40+
vortex-buffer = { path = "../vortex-buffer" }
4041
vortex-datetime-parts = { path = "../vortex-datetime-parts" }
4142
vortex-dict = { path = "../vortex-dict" }
4243
vortex-dtype = { path = "../vortex-dtype" }

bench-vortex/benches/compress_benchmark.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ use criterion::{black_box, criterion_group, criterion_main, Criterion};
77

88
fn vortex_compress_taxi(c: &mut Criterion) {
99
taxi_data_parquet();
10-
let mut group = c.benchmark_group("end to end");
10+
let mut group = c.benchmark_group("end to end - taxi");
1111
group.sample_size(10);
1212
group.bench_function("compress", |b| b.iter(|| black_box(compress_taxi_data())));
1313
group.finish()
@@ -16,7 +16,7 @@ fn vortex_compress_taxi(c: &mut Criterion) {
1616
fn vortex_compress_medicare1(c: &mut Criterion) {
1717
let dataset = BenchmarkDatasets::PBI(Medicare1);
1818
dataset.as_uncompressed();
19-
let mut group = c.benchmark_group("end to end");
19+
let mut group = c.benchmark_group("end to end - medicare");
2020
group.sample_size(10);
2121
group.bench_function("compress", |b| {
2222
b.iter(|| black_box(dataset.compress_to_vortex()))

bench-vortex/benches/random_access.rs

+1
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ fn random_access(c: &mut Criterion) {
2727
});
2828

2929
let dataset = BenchmarkDatasets::PBI(Medicare1);
30+
dataset.write_as_parquet();
3031
dataset.write_as_lance();
3132
// NB: our parquet benchmarks read from a single file, and we (currently) write each
3233
// file to an individual lance dataset for comparison parity.

bench-vortex/src/bin/compress.rs

+11-11
Original file line numberDiff line numberDiff line change
@@ -4,35 +4,35 @@ use std::path::PathBuf;
44
use bench_vortex::data_downloads::BenchmarkDataset;
55
use bench_vortex::public_bi_data::BenchmarkDatasets::PBI;
66
use bench_vortex::public_bi_data::PBIDataset;
7-
use bench_vortex::reader::{open_vortex, rewrite_parquet_as_vortex};
7+
use bench_vortex::reader::{open_vortex_async, rewrite_parquet_as_vortex};
88
use bench_vortex::taxi_data::taxi_data_parquet;
99
use bench_vortex::{setup_logger, IdempotentPath};
10-
use futures::executor::block_on;
1110
use log::{info, LevelFilter};
1211
use tokio::fs::File;
12+
use vortex_error::VortexResult;
1313

14-
pub fn main() {
14+
#[tokio::main]
15+
pub async fn main() {
1516
setup_logger(LevelFilter::Info);
1617
// compress_pbi(PBIDataset::Medicare1);
17-
compress_taxi();
18+
compress_taxi().await.unwrap();
1819
}
1920

20-
fn compress_taxi() {
21+
async fn compress_taxi() -> VortexResult<()> {
2122
let path: PathBuf = "taxi_data.vortex".to_data_path();
22-
block_on(async {
23-
let output_file = File::create(&path).await?;
24-
rewrite_parquet_as_vortex(taxi_data_parquet(), output_file).await
25-
})
26-
.unwrap();
23+
let output_file = File::create(&path).await?;
24+
rewrite_parquet_as_vortex(taxi_data_parquet(), output_file).await?;
2725

28-
let taxi_vortex = open_vortex(&path).unwrap();
26+
let taxi_vortex = open_vortex_async(&path).await?;
2927
info!("{}", taxi_vortex.tree_display());
3028

3129
let pq_size = taxi_data_parquet().metadata().unwrap().size();
3230
let vx_size = taxi_vortex.nbytes();
3331

3432
info!("Parquet size: {}, Vortex size: {}", pq_size, vx_size);
3533
info!("Compression ratio: {}", vx_size as f32 / pq_size as f32);
34+
35+
Ok(())
3636
}
3737

3838
#[allow(dead_code)]

bench-vortex/src/reader.rs

+12-1
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,17 @@ pub fn open_vortex(path: &Path) -> VortexResult<Array> {
4747
.map(|a| a.into_array())
4848
}
4949

50+
pub async fn open_vortex_async(path: &Path) -> VortexResult<Array> {
51+
let file = tokio::fs::File::open(path).await.unwrap();
52+
let mut msgs = MessageReader::try_new(TokioAdapter(file)).await.unwrap();
53+
msgs.array_stream_from_messages(&CTX)
54+
.await
55+
.unwrap()
56+
.collect_chunked()
57+
.await
58+
.map(|a| a.into_array())
59+
}
60+
5061
pub async fn rewrite_parquet_as_vortex<W: VortexWrite>(
5162
parquet_path: PathBuf,
5263
write: W,
@@ -103,7 +114,7 @@ pub fn take_vortex(path: &Path, indices: &[u64]) -> VortexResult<Array> {
103114
let array = open_vortex(path)?;
104115
let taken = take(&array, &indices.to_vec().into_array())?;
105116
// For equivalence.... we flatten to make sure we're not cheating too much.
106-
taken.flatten().map(|x| x.into_array())
117+
Ok(taken.flatten()?.into_array())
107118
}
108119

109120
pub fn take_parquet(path: &Path, indices: &[u64]) -> VortexResult<RecordBatch> {

bench-vortex/src/taxi_data.rs

+29-2
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
1+
use std::future::{ready, Future};
2+
use std::io::Write;
13
use std::path::PathBuf;
24

35
use futures::executor::block_on;
4-
use tokio::fs::File;
6+
use vortex_buffer::io_buf::IoBuf;
57
use vortex_error::VortexError;
8+
use vortex_ipc::io::VortexWrite;
69

710
use crate::data_downloads::{data_vortex_uncompressed, download_data, parquet_to_lance};
811
use crate::reader::rewrite_parquet_as_vortex;
@@ -33,10 +36,34 @@ pub fn taxi_data_vortex_uncompressed() -> PathBuf {
3336
pub fn taxi_data_vortex() -> PathBuf {
3437
idempotent("taxi.vortex", |output_fname| {
3538
block_on(async {
36-
let output_file = File::create(output_fname).await?;
39+
let output_file = std::fs::File::create(output_fname)?;
40+
let output_file = StdFile(output_file);
3741
rewrite_parquet_as_vortex(taxi_data_parquet(), output_file).await?;
3842
Ok::<PathBuf, VortexError>(output_fname.to_path_buf())
3943
})
4044
})
4145
.unwrap()
4246
}
47+
48+
//
49+
// Test code uses futures_executor with a local pool, and nothing in VortexWrite ties us to Tokio,
50+
// so this is a simple bridge to allow us to use a `std::fs::File` as a `VortexWrite`.
51+
//
52+
53+
struct StdFile(std::fs::File);
54+
55+
impl VortexWrite for StdFile {
56+
async fn write_all<B: IoBuf>(&mut self, buffer: B) -> std::io::Result<B> {
57+
self.0.write_all(buffer.as_slice())?;
58+
Ok(buffer)
59+
}
60+
61+
async fn flush(&mut self) -> std::io::Result<()> {
62+
self.0.flush()?;
63+
Ok(())
64+
}
65+
66+
fn shutdown(&mut self) -> impl Future<Output = std::io::Result<()>> {
67+
ready(Ok(()))
68+
}
69+
}

vortex-array/src/array/bool/compute/as_contiguous.rs

-27
This file was deleted.

vortex-array/src/array/bool/compute/mod.rs

-6
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
use crate::array::bool::BoolArray;
22
use crate::compute::as_arrow::AsArrowArray;
3-
use crate::compute::as_contiguous::AsContiguousFn;
43
use crate::compute::compare::CompareFn;
54
use crate::compute::fill::FillForwardFn;
65
use crate::compute::scalar_at::ScalarAtFn;
@@ -9,7 +8,6 @@ use crate::compute::take::TakeFn;
98
use crate::compute::ArrayCompute;
109

1110
mod as_arrow;
12-
mod as_contiguous;
1311
mod compare;
1412
mod fill;
1513
mod flatten;
@@ -22,10 +20,6 @@ impl ArrayCompute for BoolArray {
2220
Some(self)
2321
}
2422

25-
fn as_contiguous(&self) -> Option<&dyn AsContiguousFn> {
26-
Some(self)
27-
}
28-
2923
fn compare(&self) -> Option<&dyn CompareFn> {
3024
Some(self)
3125
}

vortex-array/src/array/chunked/compute/mod.rs

+2-21
Original file line numberDiff line numberDiff line change
@@ -2,23 +2,21 @@ use vortex_error::VortexResult;
22
use vortex_scalar::Scalar;
33

44
use crate::array::chunked::ChunkedArray;
5-
use crate::compute::as_contiguous::{as_contiguous, AsContiguousFn};
65
use crate::compute::scalar_at::{scalar_at, ScalarAtFn};
76
use crate::compute::scalar_subtract::SubtractScalarFn;
87
use crate::compute::slice::SliceFn;
98
use crate::compute::take::TakeFn;
109
use crate::compute::ArrayCompute;
11-
use crate::Array;
1210

1311
mod slice;
1412
mod take;
1513

1614
impl ArrayCompute for ChunkedArray {
17-
fn as_contiguous(&self) -> Option<&dyn AsContiguousFn> {
15+
fn scalar_at(&self) -> Option<&dyn ScalarAtFn> {
1816
Some(self)
1917
}
2018

21-
fn scalar_at(&self) -> Option<&dyn ScalarAtFn> {
19+
fn subtract_scalar(&self) -> Option<&dyn SubtractScalarFn> {
2220
Some(self)
2321
}
2422

@@ -29,23 +27,6 @@ impl ArrayCompute for ChunkedArray {
2927
fn take(&self) -> Option<&dyn TakeFn> {
3028
Some(self)
3129
}
32-
33-
fn subtract_scalar(&self) -> Option<&dyn SubtractScalarFn> {
34-
Some(self)
35-
}
36-
}
37-
38-
impl AsContiguousFn for ChunkedArray {
39-
fn as_contiguous(&self, arrays: &[Array]) -> VortexResult<Array> {
40-
// Combine all the chunks into one, then call as_contiguous again.
41-
let mut chunks = Vec::with_capacity(self.nchunks());
42-
for array in arrays {
43-
for chunk in Self::try_from(array).unwrap().chunks() {
44-
chunks.push(chunk);
45-
}
46-
}
47-
as_contiguous(&chunks)
48-
}
4930
}
5031

5132
impl ScalarAtFn for ChunkedArray {

vortex-array/src/array/chunked/compute/take.rs

+5-11
Original file line numberDiff line numberDiff line change
@@ -52,10 +52,7 @@ impl TakeFn for ChunkedArray {
5252

5353
#[cfg(test)]
5454
mod test {
55-
use itertools::Itertools;
56-
5755
use crate::array::chunked::ChunkedArray;
58-
use crate::compute::as_contiguous::as_contiguous;
5956
use crate::compute::take::take;
6057
use crate::{ArrayDType, ArrayTrait, AsArray, IntoArray};
6158

@@ -68,14 +65,11 @@ mod test {
6865
assert_eq!(arr.len(), 9);
6966
let indices = vec![0, 0, 6, 4].into_array();
7067

71-
let result = as_contiguous(
72-
&ChunkedArray::try_from(take(arr.as_array_ref(), &indices).unwrap())
73-
.unwrap()
74-
.chunks()
75-
.collect_vec(),
76-
)
77-
.unwrap()
78-
.into_primitive();
68+
let result = &ChunkedArray::try_from(take(arr.as_array_ref(), &indices).unwrap())
69+
.unwrap()
70+
.into_array()
71+
.flatten_primitive()
72+
.unwrap();
7973
assert_eq!(result.typed_data::<i32>(), &[1, 1, 1, 2]);
8074
}
8175
}

0 commit comments

Comments
 (0)