
Commit 826ba5a

Merge pull request #1 from scribd/dimensionsa

Add support for dimensional counts

Authored Feb 12, 2024 · 2 parents fa0608d + 55fd3c3

5 files changed: +180 -19

.github/workflows/release.yml (+1 -1)

@@ -83,6 +83,6 @@ jobs:
           GITHUB_TOKEN: ${{ github.token }}
         with:
           upload_url: ${{ steps.create_release.outputs.upload_url }}
-          asset_path: ./target/lambda/query-metrics/bootstrap.zip
+          asset_path: ./target/lambda/query-metrics-lambda/bootstrap.zip
           asset_name: query-metrics-bootstrap-${{ github.ref_name }}.zip
           asset_content_type: application/zip

lambdas/query-metrics/Cargo.toml (+10 -12)

@@ -1,18 +1,15 @@
 [package]
 name = "query-metrics"
-version = "0.1.0"
+version = "0.2.0"
 edition = "2021"
 
-# Starting in Rust 1.62 you can use `cargo add` to add dependencies
-# to your project.
-#
-# If you're using an older Rust version,
-# download cargo-edit(https://github.com/killercup/cargo-edit#installation)
-# to install the `add` subcommand.
-#
-# Running `cargo add DEPENDENCY_NAME` will
-# add the latest version of a dependency to the list,
-# and it will keep the alphabetic ordering for you.
+[[bin]]
+name = "query-metrics"
+path = "src/cli.rs"
+
+[[bin]]
+name = "query-metrics-lambda"
+path = "src/main.rs"
 
 [dependencies]
 anyhow = "1.0.79"
@@ -21,7 +18,8 @@ aws-sdk-cloudwatch = "1.11.0"
 aws-sdk-config = "1.11.0"
 aws_lambda_events = { version = "0.12.0" }
 base64 = "0.21.7"
-deltalake = { version = "0.16.5", features = ["datafusion", "s3"] }
+deltalake-core = { version = "0.17.0", features = ["datafusion"] }
+deltalake-aws = { version = "0.1.0" }
 
 lambda_runtime = "0.8.3"
 serde = { version = "1.0.195", features = ["derive"] }
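Editor's note (not part of the commit): the dependency swap follows the delta-rs 0.17 crate split, in which S3 support moved out of the core crate into `deltalake-aws` and must now be registered at startup. A minimal sketch of that initialization, with a placeholder bucket and path:

```rust
// Sketch only: deltalake-core 0.17 no longer bundles S3 support, so the
// AWS storage handlers must be registered before opening an s3:// table.
// The table URL below is hypothetical.
#[tokio::main]
async fn main() -> anyhow::Result<()> {
    deltalake_aws::register_handlers(None);

    let table = deltalake_core::open_table("s3://example-bucket/tables/events").await?;
    println!("Loaded table at version {}", table.version());
    Ok(())
}
```

This is why `deltalake_aws::register_handlers(None)` appears in both binaries below.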

lambdas/query-metrics/src/cli.rs (+97)

@@ -0,0 +1,97 @@
+///
+/// The CLI helps test a manifest
+///
+use std::collections::HashMap;
+use std::sync::Arc;
+
+use deltalake_core::arrow::util::pretty::print_batches;
+use deltalake_core::arrow::{array::PrimitiveArray, datatypes::Int64Type};
+use deltalake_core::datafusion::common::*;
+use deltalake_core::datafusion::execution::context::SessionContext;
+use tracing::log::*;
+
+mod config;
+
+#[tokio::main]
+async fn main() -> anyhow::Result<()> {
+    deltalake_aws::register_handlers(None);
+
+    tracing_subscriber::fmt()
+        .with_env_filter(tracing_subscriber::EnvFilter::from_default_env())
+        .with_target(true)
+        .init();
+
+    let conf = config::Configuration::from_file("prod-manifest.yml");
+
+    for (name, gauges) in conf.gauges.iter() {
+        for gauge in gauges.iter() {
+            println!("Querying the {name} table");
+            let ctx = SessionContext::new();
+            let table = deltalake_core::open_table(&gauge.url)
+                .await
+                .expect("Failed to register table");
+            println!("table opened");
+            tokio::time::sleep(std::time::Duration::from_secs(5)).await;
+            ctx.register_table("source", Arc::new(table))
+                .expect("Failed to register table with datafusion");
+
+            println!("Running query: {}", gauge.query);
+
+            let df = ctx
+                .sql(&gauge.query)
+                .await
+                .expect("Failed to execute query");
+
+            match gauge.measurement_type {
+                config::Measurement::Count => {
+                    let count = df.count().await.expect("Failed to collect batches");
+                    println!("Counted {count} rows");
+                }
+                config::Measurement::DimensionalCount => {
+                    println!("Need to run dimensional count");
+                    let batches = df.collect().await.expect("Failed to collect batches");
+                    //let batches = df.explain(false, false).unwrap().collect().await.expect("Failed to collect batches");
+                    let _ = print_batches(&batches);
+
+                    println!("I see this many batches: {}", batches.len());
+                    // Interestingly the collect produces a lot of zero row batches
+                    for batch in batches.iter().filter(|b| b.num_rows() > 0) {
+                        if let Some(_counts) = batch.column_by_name("count") {
+                            // Fetching the count column just to ensure that it exists before doing
+                            // any more computation
+                            let schema = batch.schema();
+                            let fields = schema.fields();
+
+                            for row in 0..batch.num_rows() {
+                                let mut dimensions: HashMap<String, String> = HashMap::new();
+                                let mut counted = false;
+                                let mut count = 0;
+
+                                for (idx, column) in batch.columns().iter().enumerate() {
+                                    let field = &fields[idx];
+                                    let name = field.name();
+                                    if name == "count" {
+                                        let arr: &PrimitiveArray<Int64Type> =
+                                            arrow::array::cast::as_primitive_array(&column);
+                                        count = arr.value(row);
+                                        counted = true;
+                                    } else {
+                                        let arr = arrow::array::cast::as_string_array(&column);
+                                        dimensions.insert(name.into(), arr.value(row).into());
+                                    }
+                                }
+
+                                if counted {
+                                    println!("{count}: {dimensions:?}");
+                                }
+                            }
+                        } else {
+                            error!("The result set must have a column named `count`");
+                        }
+                    }
+                }
+            }
+        }
+    }
+    Ok(())
+}
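Editor's note (not part of the commit): the extraction loop above hinges on downcasting Arrow columns to concrete array types before reading cell values. A self-contained sketch of the same pattern on an in-memory RecordBatch, with made-up column names:

```rust
use std::collections::HashMap;
use std::sync::Arc;

use arrow::array::cast::{as_primitive_array, as_string_array};
use arrow::array::{Int64Array, StringArray};
use arrow::datatypes::{DataType, Field, Int64Type, Schema};
use arrow::record_batch::RecordBatch;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // A stand-in for one non-empty batch from df.collect(): one string
    // dimension column plus the required `count` column.
    let schema = Arc::new(Schema::new(vec![
        Field::new("event_type", DataType::Utf8, false),
        Field::new("count", DataType::Int64, false),
    ]));
    let batch = RecordBatch::try_new(
        schema,
        vec![
            Arc::new(StringArray::from(vec!["click", "view"])),
            Arc::new(Int64Array::from(vec![3_i64, 5])),
        ],
    )?;

    let schema = batch.schema();
    let fields = schema.fields();
    for row in 0..batch.num_rows() {
        let mut dimensions: HashMap<String, String> = HashMap::new();
        let mut count = 0_i64;
        for (idx, column) in batch.columns().iter().enumerate() {
            let name = fields[idx].name();
            if name == "count" {
                // Downcast to the typed Int64 array before reading a cell.
                count = as_primitive_array::<Int64Type>(column).value(row);
            } else {
                // Every non-count column is treated as a string dimension.
                let values = as_string_array(column);
                dimensions.insert(name.into(), values.value(row).into());
            }
        }
        println!("{count}: {dimensions:?}");
    }
    Ok(())
}
```

Note the implicit schema contract: any column not named `count` is assumed to be a string column, so a non-string dimension would panic at the downcast.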

lambdas/query-metrics/src/config.rs (+2 -2)

@@ -13,8 +13,7 @@ pub struct Configuration {
 }
 
 impl Configuration {
-    #[cfg(test)]
-    fn from_file<S: Into<String> + AsRef<Path>>(location: S) -> Self {
+    pub fn from_file<S: Into<String> + AsRef<Path>>(location: S) -> Self {
         serde_yaml::from_reader(File::open(location).expect("Failed to open manifest"))
             .expect("Failed to deserialize")
     }
@@ -39,6 +38,7 @@ pub struct Gauge {
 #[serde(rename_all = "lowercase")]
 pub enum Measurement {
     Count,
+    DimensionalCount,
 }
 
 #[cfg(test)]
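Editor's note (not part of the commit): with `from_file` now public, both binaries load the same manifest. A sketch of how a dimensional-count gauge might deserialize, assuming the manifest keys match the struct fields used elsewhere in the diff (`gauges`, `name`, `url`, `query`, `measurement_type`) with no extra serde renames; all YAML values are hypothetical:

```rust
use std::collections::HashMap;

use serde::Deserialize;

#[derive(Debug, Deserialize)]
struct Configuration {
    gauges: HashMap<String, Vec<Gauge>>,
}

#[derive(Debug, Deserialize)]
struct Gauge {
    name: String,
    url: String,
    query: String,
    measurement_type: Measurement,
}

// `rename_all = "lowercase"` means the new variant is spelled
// `dimensionalcount` in the manifest.
#[derive(Debug, Deserialize, PartialEq)]
#[serde(rename_all = "lowercase")]
enum Measurement {
    Count,
    DimensionalCount,
}

fn main() {
    let yaml = r#"
gauges:
  events:
    - name: events_by_type
      url: s3://example-bucket/tables/events
      query: SELECT event_type, COUNT(*) AS count FROM source GROUP BY event_type
      measurement_type: dimensionalcount
"#;
    let conf: Configuration = serde_yaml::from_str(yaml).expect("Failed to deserialize");
    assert_eq!(
        conf.gauges["events"][0].measurement_type,
        Measurement::DimensionalCount
    );
}
```

The query shape matters: a GROUP BY with an aliased `count` column is what produces the string-dimension-plus-count batches the handlers expect.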

lambdas/query-metrics/src/main.rs (+70 -4)

@@ -5,19 +5,23 @@
 use aws_lambda_events::event::cloudwatch_events::CloudWatchEvent;
 use aws_sdk_cloudwatch::{
     primitives::DateTime,
-    types::{MetricDatum, StandardUnit},
+    types::{Dimension, MetricDatum, StandardUnit},
 };
-use deltalake::datafusion::common::*;
-use deltalake::datafusion::execution::context::SessionContext;
+use deltalake_core::arrow::{array::PrimitiveArray, datatypes::Int64Type};
+use deltalake_core::datafusion::common::*;
+use deltalake_core::datafusion::execution::context::SessionContext;
 use lambda_runtime::{run, service_fn, Error, LambdaEvent};
 use tracing::log::*;
 
+use std::collections::HashMap;
 use std::sync::Arc;
 use std::time::SystemTime;
 
 mod config;
 
 async fn function_handler(_event: LambdaEvent<CloudWatchEvent>) -> Result<(), Error> {
+    deltalake_aws::register_handlers(None);
+
     let aws_config = aws_config::load_defaults(aws_config::BehaviorVersion::latest()).await;
     let cloudwatch = aws_sdk_cloudwatch::Client::new(&aws_config);
 
@@ -31,7 +35,7 @@ async fn function_handler(_event: LambdaEvent<CloudWatchEvent>) -> Result<(), Er
         for gauge in gauges.iter() {
             debug!("Querying the {name} table");
             let ctx = SessionContext::new();
-            let table = deltalake::open_table(&gauge.url)
+            let table = deltalake_core::open_table(&gauge.url)
                 .await
                 .expect("Failed to register table");
             ctx.register_table("source", Arc::new(table))
@@ -64,6 +68,68 @@ async fn function_handler(_event: LambdaEvent<CloudWatchEvent>) -> Result<(), Er
                         .await?;
                     debug!("Result of CloudWatch send: {res:?}");
                 }
+                config::Measurement::DimensionalCount => {
+                    let batches = df.collect().await.expect("Failed to collect batches");
+                    debug!("I see this many batches: {}", batches.len());
+
+                    // Interestingly the collect produces a lot of zero row batches
+                    for batch in batches.iter().filter(|b| b.num_rows() > 0) {
+                        if let Some(_counts) = batch.column_by_name("count") {
+                            // Fetching the count column just to ensure that it exists before doing
+                            // any more computation
+                            let schema = batch.schema();
+                            let fields = schema.fields();
+
+                            for row in 0..batch.num_rows() {
+                                let mut dimensions: HashMap<String, String> = HashMap::new();
+                                let mut counted = false;
+                                let mut count = 0;
+
+                                for (idx, column) in batch.columns().iter().enumerate() {
+                                    let field = &fields[idx];
+                                    let name = field.name();
+                                    if name == "count" {
+                                        let arr: &PrimitiveArray<Int64Type> =
+                                            arrow::array::cast::as_primitive_array(&column);
+                                        count = arr.value(row);
+                                        counted = true;
+                                    } else {
+                                        let arr = arrow::array::cast::as_string_array(&column);
+                                        dimensions.insert(name.into(), arr.value(row).into());
+                                    }
+                                }
+
+                                if counted {
+                                    debug!("{count}: {dimensions:?}");
+                                    let mut dims: Vec<Dimension> = vec![];
+
+                                    for (key, value) in dimensions.iter() {
+                                        dims.push(
+                                            Dimension::builder().name(key).value(value).build(),
+                                        );
+                                    }
+                                    let datum = MetricDatum::builder()
+                                        .metric_name(&gauge.name)
+                                        .timestamp(DateTime::from(SystemTime::now()))
+                                        .set_dimensions(Some(dims))
+                                        .value(count as f64)
+                                        .unit(StandardUnit::Count)
+                                        .build();
+
+                                    let res = cloudwatch
+                                        .put_metric_data()
+                                        .namespace(format!("DataLake/{name}"))
+                                        .metric_data(datum)
+                                        .send()
+                                        .await?;
+                                    debug!("Result of CloudWatch send: {res:?}");
+                                }
+                            }
+                        } else {
+                            error!("The result set must have a column named `count`");
+                        }
+                    }
+                }
             }
         }
     }
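Editor's note (not part of the commit): each qualifying row becomes one CloudWatch datum whose dimensions mirror the query's GROUP BY columns. A condensed sketch of that emission path, factored into a helper with placeholder names (CloudWatch also caps the number of dimensions per metric, 30 at the time of writing, so very wide GROUP BYs need care):

```rust
use std::collections::HashMap;
use std::time::SystemTime;

use aws_sdk_cloudwatch::primitives::DateTime;
use aws_sdk_cloudwatch::types::{Dimension, MetricDatum, StandardUnit};

/// Publish one dimensional count, mirroring the loop body in the diff above.
async fn emit_count(
    cloudwatch: &aws_sdk_cloudwatch::Client,
    namespace: &str,
    metric_name: &str,
    dimensions: &HashMap<String, String>,
    count: i64,
) -> Result<(), aws_sdk_cloudwatch::Error> {
    // One CloudWatch Dimension per group-by column/value pair.
    let dims: Vec<Dimension> = dimensions
        .iter()
        .map(|(key, value)| Dimension::builder().name(key).value(value).build())
        .collect();

    let datum = MetricDatum::builder()
        .metric_name(metric_name)
        .timestamp(DateTime::from(SystemTime::now()))
        .set_dimensions(Some(dims))
        .value(count as f64)
        .unit(StandardUnit::Count)
        .build();

    cloudwatch
        .put_metric_data()
        .namespace(namespace)
        .metric_data(datum)
        .send()
        .await?;
    Ok(())
}
```

The iterator-based `dims` construction is behaviorally identical to the `vec![]` plus `push` loop in the commit; only the builder calls shown in the diff are relied on.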
