Skip to content

Commit ef00ee4

Browse files
committed
Add the Numeric metric type and add some documentation and tooling
1 parent c7b8054 commit ef00ee4

File tree

9 files changed

+169
-3
lines changed

9 files changed

+169
-3
lines changed

Makefile

+22
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
2+
.PHONY: help
3+
help: ## Show this help
4+
@egrep -h '\s##\s' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}'
5+
6+
.PHONY: all build build-release check test clean
7+
all: check build test ## Perform all the checks builds and testing
8+
9+
check: ## Ensure that the crate meets the basic formatting and structure
10+
cargo fmt --check
11+
cargo clippy
12+
13+
build: ## Build the crate with each set of features
14+
./ci/build.sh
15+
16+
build-release: check test ## Build the release versions of Lambdas
17+
./ci/build-release.sh
18+
test: ## Run the crate's tests with each set of features
19+
./ci/test.sh
20+
21+
clean: ## Clean up resources from build
22+
cargo clean

ci/build-release.sh

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
#!/bin/sh
2+
3+
if [ -f "${HOME}/.cargo/env" ]; then
4+
. "${HOME}/.cargo/env"
5+
fi;
6+
7+
exec cargo lambda build --release --output-format zip

ci/build.sh

+11
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
#!/bin/sh
2+
3+
if [ -f "${HOME}/.cargo/env" ]; then
4+
. "${HOME}/.cargo/env"
5+
fi;
6+
7+
set -xe
8+
9+
cargo fmt --check
10+
11+
exec cargo build

ci/test.sh

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
#!/bin/sh
2+
if [ -f "${HOME}/.cargo/env" ]; then
3+
. "${HOME}/.cargo/env"
4+
fi;
5+
6+
exec cargo test --verbose

lambdas/query-metrics/Cargo.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "query-metrics"
3-
version = "0.3.0"
3+
version = "0.4.0"
44
edition = "2021"
55

66
[[bin]]

lambdas/query-metrics/README.md

+50
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
2+
# query-metrics
3+
4+
This Lambda will execute DataFusion queries defined in a YAML file and submit
5+
the results to CloudWatch Metrics which can be alerted upon or forwarded into
6+
other tools.
7+
8+
9+
10+
## Types
11+
12+
### Count
13+
14+
This is the simplest type of query: it records the number of rows returned by the query, e.g.:
15+
16+
```sql
17+
SELECT id FROM source WHERE id > 1000 AND id <= 2000
18+
```
19+
20+
This would consistently produce a counted metric value of `1000`.
21+
22+
23+
### Numeric
24+
25+
Numeric is likely the most common and easiest-to-understand query type. There should be only one row in the result set, and all of its values should be numeric, e.g.:
26+
27+
```sql
28+
SELECT COUNT(*) AS total, SUM(CASE WHEN (id > 1000 AND id <= 2000) THEN 1 ELSE 0 END) AS valid_ids FROM source
29+
```
30+
31+
This will produce a result set of:
32+
33+
```
34+
+-------+-----------+
35+
| total | valid_ids |
36+
+-------+-----------+
37+
| 4096  | 1000      |
38+
+-------+-----------+
39+
```
40+
41+
Which will produce metric values of:
42+
43+
* `total` 4096
44+
* `valid_ids` 1000
45+
46+
47+
### Dimensional Count
48+
49+
The dimensional count is the most advanced query type and can be used to
50+
provide dimensional (or tagged) metrics in CloudWatch.

lambdas/query-metrics/src/cli.rs

+28
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,34 @@ async fn main() -> anyhow::Result<()> {
4747
let count = df.count().await.expect("Failed to collect batches");
4848
println!("Counted {count} rows");
4949
}
50+
config::Measurement::Numeric => {
51+
println!("Need to run dimensional count");
52+
let batches = df.collect().await.expect("Failed to collect batches");
53+
let _ = print_batches(&batches);
54+
55+
println!("I see this many batches: {}", batches.len());
56+
let mut dimensions: HashMap<String, i64> = HashMap::new();
57+
for batch in batches.iter().filter(|b| b.num_rows() > 0) {
58+
let schema = batch.schema();
59+
let fields = schema.fields();
60+
for row in 0..batch.num_rows() {
61+
for (idx, column) in batch.columns().iter().enumerate() {
62+
let field = &fields[idx];
63+
let name = field.name();
64+
65+
if !dimensions.contains_key(name) {
66+
dimensions.insert(name.to_string(), 0);
67+
}
68+
let current = dimensions.get(name).expect("Failed to retrieve");
69+
let arr: &PrimitiveArray<Int64Type> =
70+
arrow::array::cast::as_primitive_array(&column);
71+
let count = arr.value(row);
72+
dimensions.insert(name.to_string(), count + current);
73+
}
74+
}
75+
}
76+
println!("results: {dimensions:?}");
77+
}
5078
config::Measurement::DimensionalCount => {
5179
println!("Need to run dimensional count");
5280
let batches = df.collect().await.expect("Failed to collect batches");

lambdas/query-metrics/src/config.rs

+1
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ pub struct Gauge {
3838
#[serde(rename_all = "lowercase")]
3939
pub enum Measurement {
4040
Count,
41+
Numeric,
4142
DimensionalCount,
4243
}
4344

lambdas/query-metrics/src/main.rs

+43-2
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ async fn function_handler(_event: LambdaEvent<CloudWatchEvent>) -> Result<(), Er
2929
std::env::var("MANIFEST_B64").expect("The `MANIFEST_B64` variable was not defined"),
3030
)
3131
.expect("The `MANIFEST_B64` environment variable does not contain a valid manifest yml");
32-
debug!("Configuration loaded: {conf:?}");
32+
info!("Configuration loaded: {conf:?}");
3333

3434
for (name, gauges) in conf.gauges.iter() {
3535
for gauge in gauges.iter() {
@@ -41,7 +41,7 @@ async fn function_handler(_event: LambdaEvent<CloudWatchEvent>) -> Result<(), Er
4141
ctx.register_table("source", Arc::new(table))
4242
.expect("Failed to register table with datafusion");
4343

44-
debug!("Running query: {}", gauge.query);
44+
info!("Running query: {}", gauge.query);
4545

4646
let df = ctx
4747
.sql(&gauge.query)
@@ -68,6 +68,47 @@ async fn function_handler(_event: LambdaEvent<CloudWatchEvent>) -> Result<(), Er
6868
.await?;
6969
debug!("Result of CloudWatch send: {res:?}");
7070
}
71+
config::Measurement::Numeric => {
72+
let batches = df.collect().await.expect("Failed to collect batches");
73+
let mut values: HashMap<String, i64> = HashMap::new();
74+
75+
for batch in batches.iter().filter(|b| b.num_rows() > 0) {
76+
let schema = batch.schema();
77+
let fields = schema.fields();
78+
for row in 0..batch.num_rows() {
79+
for (idx, column) in batch.columns().iter().enumerate() {
80+
let field = &fields[idx];
81+
let name = field.name();
82+
83+
if !values.contains_key(name) {
84+
values.insert(name.to_string(), 0);
85+
}
86+
let current = values.get(name).expect("Failed to retrieve");
87+
let arr: &PrimitiveArray<Int64Type> =
88+
arrow::array::cast::as_primitive_array(&column);
89+
let count = arr.value(row);
90+
values.insert(name.to_string(), count + current);
91+
}
92+
}
93+
}
94+
info!("results: {values:?}");
95+
for (key, value) in values.into_iter() {
96+
let datum = MetricDatum::builder()
97+
.metric_name(&key)
98+
.timestamp(DateTime::from(SystemTime::now()))
99+
.unit(StandardUnit::Count)
100+
.value(value as f64)
101+
.build();
102+
103+
let res = cloudwatch
104+
.put_metric_data()
105+
.namespace(format!("DataLake/{name}"))
106+
.metric_data(datum)
107+
.send()
108+
.await?;
109+
info!("submitting {key} to cloudwatch: {res:?}");
110+
}
111+
}
71112
config::Measurement::DimensionalCount => {
72113
let batches = df.collect().await.expect("Failed to collect batches");
73114
debug!("I see this many batches: {}", batches.len());

0 commit comments

Comments
 (0)