From e2a1fc34fd4dad7a7ec4dc01a40656de6801c040 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Fri, 28 Feb 2025 23:55:11 +0530 Subject: [PATCH 01/39] refactor: accept array in `push_logs` --- src/handlers/http/modal/utils/ingest_utils.rs | 95 +++++++++---------- 1 file changed, 44 insertions(+), 51 deletions(-) diff --git a/src/handlers/http/modal/utils/ingest_utils.rs b/src/handlers/http/modal/utils/ingest_utils.rs index 84d5ae117..55e81f0df 100644 --- a/src/handlers/http/modal/utils/ingest_utils.rs +++ b/src/handlers/http/modal/utils/ingest_utils.rs @@ -39,45 +39,36 @@ pub async fn flatten_and_push_logs( stream_name: &str, log_source: &LogSource, ) -> Result<(), PostError> { - match log_source { + let json = match log_source { LogSource::Kinesis => { //custom flattening required for Amazon Kinesis let message: Message = serde_json::from_value(json)?; - for record in flatten_kinesis_logs(message) { - push_logs(stream_name, record, &LogSource::default()).await?; - } + flatten_kinesis_logs(message) } LogSource::OtelLogs => { //custom flattening required for otel logs let logs: LogsData = serde_json::from_value(json)?; - for record in flatten_otel_logs(&logs) { - push_logs(stream_name, record, log_source).await?; - } + flatten_otel_logs(&logs) } LogSource::OtelTraces => { //custom flattening required for otel traces let traces: TracesData = serde_json::from_value(json)?; - for record in flatten_otel_traces(&traces) { - push_logs(stream_name, record, log_source).await?; - } + flatten_otel_traces(&traces) } LogSource::OtelMetrics => { //custom flattening required for otel metrics let metrics: MetricsData = serde_json::from_value(json)?; - for record in flatten_otel_metrics(metrics) { - push_logs(stream_name, record, log_source).await?; - } + flatten_otel_metrics(metrics) } - _ => { - push_logs(stream_name, json, log_source).await?; - } - } + _ => vec![json], + }; + push_logs(stream_name, json, log_source).await?; Ok(()) } async fn push_logs( stream_name: &str, - json: Value, + jsons: Vec, log_source: &LogSource, ) -> Result<(), PostError> { let stream = PARSEABLE.get_stream(stream_name)?; @@ -89,42 +80,44 @@ async fn push_logs( let custom_partition = stream.get_custom_partition(); let schema_version = stream.get_schema_version(); let p_timestamp = Utc::now(); - - let data = if time_partition.is_some() || custom_partition.is_some() { - convert_array_to_object( - json, - time_partition.as_ref(), - time_partition_limit, - custom_partition.as_ref(), - schema_version, - log_source, - )? - } else { - vec![convert_to_array(convert_array_to_object( - json, - None, - None, - None, - schema_version, - log_source, - )?)?] - }; - - for json in data { - let origin_size = serde_json::to_vec(&json).unwrap().len() as u64; // string length need not be the same as byte length - let schema = PARSEABLE.get_stream(stream_name)?.get_schema_raw(); - json::Event { json, p_timestamp } - .into_event( - stream_name.to_owned(), - origin_size, - &schema, - static_schema_flag, - custom_partition.as_ref(), + + for json in jsons { + let data = if time_partition.is_some() || custom_partition.is_some() { + convert_array_to_object( + json, time_partition.as_ref(), + time_partition_limit, + custom_partition.as_ref(), schema_version, - StreamType::UserDefined, + log_source, )? - .process()?; + } else { + vec![convert_to_array(convert_array_to_object( + json, + None, + None, + None, + schema_version, + log_source, + )?)?] 
+ }; + + for json in data { + let origin_size = serde_json::to_vec(&json).unwrap().len() as u64; // string length need not be the same as byte length + let schema = PARSEABLE.get_stream(stream_name)?.get_schema_raw(); + json::Event { json, p_timestamp } + .into_event( + stream_name.to_owned(), + origin_size, + &schema, + static_schema_flag, + custom_partition.as_ref(), + time_partition.as_ref(), + schema_version, + StreamType::UserDefined, + )? + .process()?; + } } Ok(()) } From d04ba905f7cff9334923a3c46179c560c2706c81 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Sat, 1 Mar 2025 01:39:43 +0530 Subject: [PATCH 02/39] move tests to associated module --- src/event/format/json.rs | 448 ++++++++++++++++ src/handlers/http/ingest.rs | 499 ------------------ src/handlers/http/modal/utils/ingest_utils.rs | 2 +- src/parseable/streams.rs | 5 +- src/utils/json/mod.rs | 137 +++++ 5 files changed, 587 insertions(+), 504 deletions(-) diff --git a/src/event/format/json.rs b/src/event/format/json.rs index c28b701de..43c23f5ad 100644 --- a/src/event/format/json.rs +++ b/src/event/format/json.rs @@ -327,6 +327,8 @@ fn valid_type(data_type: &DataType, value: &Value, schema_version: SchemaVersion mod tests { use std::str::FromStr; + use arrow::datatypes::Int64Type; + use arrow_array::{ArrayRef, Float64Array, Int64Array, ListArray, StringArray}; use serde_json::json; use super::*; @@ -355,4 +357,450 @@ mod tests { assert!(parsed.is_err()); } + + trait TestExt { + fn as_int64_arr(&self) -> Option<&Int64Array>; + fn as_float64_arr(&self) -> Option<&Float64Array>; + fn as_utf8_arr(&self) -> Option<&StringArray>; + } + + impl TestExt for ArrayRef { + fn as_int64_arr(&self) -> Option<&Int64Array> { + self.as_any().downcast_ref() + } + + fn as_float64_arr(&self) -> Option<&Float64Array> { + self.as_any().downcast_ref() + } + + fn as_utf8_arr(&self) -> Option<&StringArray> { + self.as_any().downcast_ref() + } + } + + fn fields_to_map(iter: impl Iterator) -> HashMap> { + iter.map(|x| (x.name().clone(), Arc::new(x))).collect() + } + + #[test] + fn basic_object_into_rb() { + let json = json!({ + "c": 4.23, + "a": 1, + "b": "hello", + }); + + let (rb, _) = Event::new(json) + .into_recordbatch(&HashMap::default(), false, None, SchemaVersion::V0) + .unwrap(); + + assert_eq!(rb.num_rows(), 1); + assert_eq!(rb.num_columns(), 4); + assert_eq!( + rb.column_by_name("a").unwrap().as_int64_arr().unwrap(), + &Int64Array::from_iter([1]) + ); + assert_eq!( + rb.column_by_name("b").unwrap().as_utf8_arr().unwrap(), + &StringArray::from_iter_values(["hello"]) + ); + assert_eq!( + rb.column_by_name("c").unwrap().as_float64_arr().unwrap(), + &Float64Array::from_iter([4.23]) + ); + } + + #[test] + fn basic_object_with_null_into_rb() { + let json = json!({ + "a": 1, + "b": "hello", + "c": null + }); + + let (rb, _) = Event::new(json) + .into_recordbatch(&HashMap::default(), false, None, SchemaVersion::V0) + .unwrap(); + + assert_eq!(rb.num_rows(), 1); + assert_eq!(rb.num_columns(), 3); + assert_eq!( + rb.column_by_name("a").unwrap().as_int64_arr().unwrap(), + &Int64Array::from_iter([1]) + ); + assert_eq!( + rb.column_by_name("b").unwrap().as_utf8_arr().unwrap(), + &StringArray::from_iter_values(["hello"]) + ); + } + + #[test] + fn basic_object_derive_schema_into_rb() { + let json = json!({ + "a": 1, + "b": "hello", + }); + + let schema = fields_to_map( + [ + Field::new("a", DataType::Int64, true), + Field::new("b", DataType::Utf8, true), + Field::new("c", DataType::Float64, true), + ] + .into_iter(), + ); + + let (rb, _) = 
Event::new(json) + .into_recordbatch(&schema, false, None, SchemaVersion::V0) + .unwrap(); + + assert_eq!(rb.num_rows(), 1); + assert_eq!(rb.num_columns(), 3); + assert_eq!( + rb.column_by_name("a").unwrap().as_int64_arr().unwrap(), + &Int64Array::from_iter([1]) + ); + assert_eq!( + rb.column_by_name("b").unwrap().as_utf8_arr().unwrap(), + &StringArray::from_iter_values(["hello"]) + ); + } + + #[test] + fn basic_object_schema_mismatch() { + let json = json!({ + "a": 1, + "b": 1, // type mismatch + }); + + let schema = fields_to_map( + [ + Field::new("a", DataType::Int64, true), + Field::new("b", DataType::Utf8, true), + Field::new("c", DataType::Float64, true), + ] + .into_iter(), + ); + + assert!(Event::new(json) + .into_recordbatch(&schema, false, None, SchemaVersion::V0,) + .is_err()); + } + + #[test] + fn empty_object() { + let json = json!({}); + + let schema = fields_to_map( + [ + Field::new("a", DataType::Int64, true), + Field::new("b", DataType::Utf8, true), + Field::new("c", DataType::Float64, true), + ] + .into_iter(), + ); + + let (rb, _) = Event::new(json) + .into_recordbatch(&schema, false, None, SchemaVersion::V0) + .unwrap(); + + assert_eq!(rb.num_rows(), 1); + assert_eq!(rb.num_columns(), 1); + } + + #[test] + fn array_into_recordbatch_inffered_schema() { + let json = json!([ + { + "b": "hello", + }, + { + "b": "hello", + "a": 1, + "c": 1 + }, + { + "a": 1, + "b": "hello", + "c": null + }, + ]); + + let (rb, _) = Event::new(json) + .into_recordbatch(&HashMap::default(), false, None, SchemaVersion::V0) + .unwrap(); + + assert_eq!(rb.num_rows(), 3); + assert_eq!(rb.num_columns(), 4); + + let schema = rb.schema(); + let fields = &schema.fields; + + assert_eq!(&*fields[1], &Field::new("a", DataType::Int64, true)); + assert_eq!(&*fields[2], &Field::new("b", DataType::Utf8, true)); + assert_eq!(&*fields[3], &Field::new("c", DataType::Int64, true)); + + assert_eq!( + rb.column_by_name("a").unwrap().as_int64_arr().unwrap(), + &Int64Array::from(vec![None, Some(1), Some(1)]) + ); + assert_eq!( + rb.column_by_name("b").unwrap().as_utf8_arr().unwrap(), + &StringArray::from(vec![Some("hello"), Some("hello"), Some("hello"),]) + ); + assert_eq!( + rb.column_by_name("c").unwrap().as_int64_arr().unwrap(), + &Int64Array::from(vec![None, Some(1), None]) + ); + } + + #[test] + fn arr_with_null_into_rb() { + let json = json!([ + { + "c": null, + "b": "hello", + "a": null + }, + { + "a": 1, + "c": 1.22, + "b": "hello" + }, + { + "b": "hello", + "a": 1, + "c": null + }, + ]); + + let (rb, _) = Event::new(json) + .into_recordbatch(&HashMap::default(), false, None, SchemaVersion::V0) + .unwrap(); + + assert_eq!(rb.num_rows(), 3); + assert_eq!(rb.num_columns(), 4); + assert_eq!( + rb.column_by_name("a").unwrap().as_int64_arr().unwrap(), + &Int64Array::from(vec![None, Some(1), Some(1)]) + ); + assert_eq!( + rb.column_by_name("b").unwrap().as_utf8_arr().unwrap(), + &StringArray::from(vec![Some("hello"), Some("hello"), Some("hello"),]) + ); + assert_eq!( + rb.column_by_name("c").unwrap().as_float64_arr().unwrap(), + &Float64Array::from(vec![None, Some(1.22), None,]) + ); + } + + #[test] + fn arr_with_null_derive_schema_into_rb() { + let json = json!([ + { + "c": null, + "b": "hello", + "a": null + }, + { + "a": 1, + "c": 1.22, + "b": "hello" + }, + { + "b": "hello", + "a": 1, + "c": null + }, + ]); + + let schema = fields_to_map( + [ + Field::new("a", DataType::Int64, true), + Field::new("b", DataType::Utf8, true), + Field::new("c", DataType::Float64, true), + ] + .into_iter(), + ); + + let (rb, _) = 
Event::new(json) + .into_recordbatch(&schema, false, None, SchemaVersion::V0) + .unwrap(); + + assert_eq!(rb.num_rows(), 3); + assert_eq!(rb.num_columns(), 4); + assert_eq!( + rb.column_by_name("a").unwrap().as_int64_arr().unwrap(), + &Int64Array::from(vec![None, Some(1), Some(1)]) + ); + assert_eq!( + rb.column_by_name("b").unwrap().as_utf8_arr().unwrap(), + &StringArray::from(vec![Some("hello"), Some("hello"), Some("hello"),]) + ); + assert_eq!( + rb.column_by_name("c").unwrap().as_float64_arr().unwrap(), + &Float64Array::from(vec![None, Some(1.22), None,]) + ); + } + + #[test] + fn arr_schema_mismatch() { + let json = json!([ + { + "a": null, + "b": "hello", + "c": 1.24 + }, + { + "a": 1, + "b": "hello", + "c": 1 + }, + { + "a": 1, + "b": "hello", + "c": null + }, + ]); + + let schema = fields_to_map( + [ + Field::new("a", DataType::Int64, true), + Field::new("b", DataType::Utf8, true), + Field::new("c", DataType::Float64, true), + ] + .into_iter(), + ); + + assert!(Event::new(json) + .into_recordbatch(&schema, false, None, SchemaVersion::V0,) + .is_err()); + } + + #[test] + fn arr_obj_with_nested_type() { + let json = json!([ + { + "a": 1, + "b": "hello", + }, + { + "a": 1, + "b": "hello", + }, + { + "a": 1, + "b": "hello", + "c_a": [1], + }, + { + "a": 1, + "b": "hello", + "c_a": [1], + "c_b": [2], + }, + ]); + + let (rb, _) = Event::new(json) + .into_recordbatch(&HashMap::default(), false, None, SchemaVersion::V0) + .unwrap(); + assert_eq!(rb.num_rows(), 4); + assert_eq!(rb.num_columns(), 5); + assert_eq!( + rb.column_by_name("a").unwrap().as_int64_arr().unwrap(), + &Int64Array::from(vec![Some(1), Some(1), Some(1), Some(1)]) + ); + assert_eq!( + rb.column_by_name("b").unwrap().as_utf8_arr().unwrap(), + &StringArray::from(vec![ + Some("hello"), + Some("hello"), + Some("hello"), + Some("hello") + ]) + ); + + assert_eq!( + rb.column_by_name("c_a") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(), + &ListArray::from_iter_primitive::(vec![ + None, + None, + Some(vec![Some(1i64)]), + Some(vec![Some(1)]) + ]) + ); + + assert_eq!( + rb.column_by_name("c_b") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(), + &ListArray::from_iter_primitive::(vec![ + None, + None, + None, + Some(vec![Some(2i64)]) + ]) + ); + } + + #[test] + fn arr_obj_with_nested_type_v1() { + let json = json!([ + { + "a": 1, + "b": "hello", + }, + { + "a": 1, + "b": "hello", + }, + { + "a": 1, + "b": "hello", + "c_a": 1, + }, + { + "a": 1, + "b": "hello", + "c_a": 1, + "c_b": 2, + }, + ]); + + let (rb, _) = Event::new(json) + .into_recordbatch(&HashMap::default(), false, None, SchemaVersion::V1) + .unwrap(); + + assert_eq!(rb.num_rows(), 4); + assert_eq!(rb.num_columns(), 5); + assert_eq!( + rb.column_by_name("a").unwrap().as_float64_arr().unwrap(), + &Float64Array::from(vec![Some(1.0), Some(1.0), Some(1.0), Some(1.0)]) + ); + assert_eq!( + rb.column_by_name("b").unwrap().as_utf8_arr().unwrap(), + &StringArray::from(vec![ + Some("hello"), + Some("hello"), + Some("hello"), + Some("hello") + ]) + ); + + assert_eq!( + rb.column_by_name("c_a").unwrap().as_float64_arr().unwrap(), + &Float64Array::from(vec![None, None, Some(1.0), Some(1.0)]) + ); + + assert_eq!( + rb.column_by_name("c_b").unwrap().as_float64_arr().unwrap(), + &Float64Array::from(vec![None, None, None, Some(2.0)]) + ); + } } diff --git a/src/handlers/http/ingest.rs b/src/handlers/http/ingest.rs index 0523e8757..bca4e36dc 100644 --- a/src/handlers/http/ingest.rs +++ b/src/handlers/http/ingest.rs @@ -332,502 +332,3 @@ impl actix_web::ResponseError for 
PostError { .body(self.to_string()) } } - -#[cfg(test)] -mod tests { - - use arrow::datatypes::Int64Type; - use arrow_array::{ArrayRef, Float64Array, Int64Array, ListArray, StringArray}; - use arrow_schema::{DataType, Field}; - use serde_json::json; - use std::{collections::HashMap, sync::Arc}; - - use crate::{ - event::format::{json, EventFormat}, - metadata::SchemaVersion, - utils::json::{convert_array_to_object, flatten::convert_to_array}, - }; - - trait TestExt { - fn as_int64_arr(&self) -> Option<&Int64Array>; - fn as_float64_arr(&self) -> Option<&Float64Array>; - fn as_utf8_arr(&self) -> Option<&StringArray>; - } - - impl TestExt for ArrayRef { - fn as_int64_arr(&self) -> Option<&Int64Array> { - self.as_any().downcast_ref() - } - - fn as_float64_arr(&self) -> Option<&Float64Array> { - self.as_any().downcast_ref() - } - - fn as_utf8_arr(&self) -> Option<&StringArray> { - self.as_any().downcast_ref() - } - } - - fn fields_to_map(iter: impl Iterator) -> HashMap> { - iter.map(|x| (x.name().clone(), Arc::new(x))).collect() - } - - #[test] - fn basic_object_into_rb() { - let json = json!({ - "c": 4.23, - "a": 1, - "b": "hello", - }); - - let (rb, _) = json::Event::new(json) - .into_recordbatch(&HashMap::default(), false, None, SchemaVersion::V0) - .unwrap(); - - assert_eq!(rb.num_rows(), 1); - assert_eq!(rb.num_columns(), 4); - assert_eq!( - rb.column_by_name("a").unwrap().as_int64_arr().unwrap(), - &Int64Array::from_iter([1]) - ); - assert_eq!( - rb.column_by_name("b").unwrap().as_utf8_arr().unwrap(), - &StringArray::from_iter_values(["hello"]) - ); - assert_eq!( - rb.column_by_name("c").unwrap().as_float64_arr().unwrap(), - &Float64Array::from_iter([4.23]) - ); - } - - #[test] - fn basic_object_with_null_into_rb() { - let json = json!({ - "a": 1, - "b": "hello", - "c": null - }); - - let (rb, _) = json::Event::new(json) - .into_recordbatch(&HashMap::default(), false, None, SchemaVersion::V0) - .unwrap(); - - assert_eq!(rb.num_rows(), 1); - assert_eq!(rb.num_columns(), 3); - assert_eq!( - rb.column_by_name("a").unwrap().as_int64_arr().unwrap(), - &Int64Array::from_iter([1]) - ); - assert_eq!( - rb.column_by_name("b").unwrap().as_utf8_arr().unwrap(), - &StringArray::from_iter_values(["hello"]) - ); - } - - #[test] - fn basic_object_derive_schema_into_rb() { - let json = json!({ - "a": 1, - "b": "hello", - }); - - let schema = fields_to_map( - [ - Field::new("a", DataType::Int64, true), - Field::new("b", DataType::Utf8, true), - Field::new("c", DataType::Float64, true), - ] - .into_iter(), - ); - - let (rb, _) = json::Event::new(json) - .into_recordbatch(&schema, false, None, SchemaVersion::V0) - .unwrap(); - - assert_eq!(rb.num_rows(), 1); - assert_eq!(rb.num_columns(), 3); - assert_eq!( - rb.column_by_name("a").unwrap().as_int64_arr().unwrap(), - &Int64Array::from_iter([1]) - ); - assert_eq!( - rb.column_by_name("b").unwrap().as_utf8_arr().unwrap(), - &StringArray::from_iter_values(["hello"]) - ); - } - - #[test] - fn basic_object_schema_mismatch() { - let json = json!({ - "a": 1, - "b": 1, // type mismatch - }); - - let schema = fields_to_map( - [ - Field::new("a", DataType::Int64, true), - Field::new("b", DataType::Utf8, true), - Field::new("c", DataType::Float64, true), - ] - .into_iter(), - ); - - assert!(json::Event::new(json) - .into_recordbatch(&schema, false, None, SchemaVersion::V0,) - .is_err()); - } - - #[test] - fn empty_object() { - let json = json!({}); - - let schema = fields_to_map( - [ - Field::new("a", DataType::Int64, true), - Field::new("b", DataType::Utf8, true), - 
Field::new("c", DataType::Float64, true), - ] - .into_iter(), - ); - - let (rb, _) = json::Event::new(json) - .into_recordbatch(&schema, false, None, SchemaVersion::V0) - .unwrap(); - - assert_eq!(rb.num_rows(), 1); - assert_eq!(rb.num_columns(), 1); - } - - #[test] - fn non_object_arr_is_err() { - let json = json!([1]); - - assert!(convert_array_to_object( - json, - None, - None, - None, - SchemaVersion::V0, - &crate::event::format::LogSource::default() - ) - .is_err()) - } - - #[test] - fn array_into_recordbatch_inffered_schema() { - let json = json!([ - { - "b": "hello", - }, - { - "b": "hello", - "a": 1, - "c": 1 - }, - { - "a": 1, - "b": "hello", - "c": null - }, - ]); - - let (rb, _) = json::Event::new(json) - .into_recordbatch(&HashMap::default(), false, None, SchemaVersion::V0) - .unwrap(); - - assert_eq!(rb.num_rows(), 3); - assert_eq!(rb.num_columns(), 4); - - let schema = rb.schema(); - let fields = &schema.fields; - - assert_eq!(&*fields[1], &Field::new("a", DataType::Int64, true)); - assert_eq!(&*fields[2], &Field::new("b", DataType::Utf8, true)); - assert_eq!(&*fields[3], &Field::new("c", DataType::Int64, true)); - - assert_eq!( - rb.column_by_name("a").unwrap().as_int64_arr().unwrap(), - &Int64Array::from(vec![None, Some(1), Some(1)]) - ); - assert_eq!( - rb.column_by_name("b").unwrap().as_utf8_arr().unwrap(), - &StringArray::from(vec![Some("hello"), Some("hello"), Some("hello"),]) - ); - assert_eq!( - rb.column_by_name("c").unwrap().as_int64_arr().unwrap(), - &Int64Array::from(vec![None, Some(1), None]) - ); - } - - #[test] - fn arr_with_null_into_rb() { - let json = json!([ - { - "c": null, - "b": "hello", - "a": null - }, - { - "a": 1, - "c": 1.22, - "b": "hello" - }, - { - "b": "hello", - "a": 1, - "c": null - }, - ]); - - let (rb, _) = json::Event::new(json) - .into_recordbatch(&HashMap::default(), false, None, SchemaVersion::V0) - .unwrap(); - - assert_eq!(rb.num_rows(), 3); - assert_eq!(rb.num_columns(), 4); - assert_eq!( - rb.column_by_name("a").unwrap().as_int64_arr().unwrap(), - &Int64Array::from(vec![None, Some(1), Some(1)]) - ); - assert_eq!( - rb.column_by_name("b").unwrap().as_utf8_arr().unwrap(), - &StringArray::from(vec![Some("hello"), Some("hello"), Some("hello"),]) - ); - assert_eq!( - rb.column_by_name("c").unwrap().as_float64_arr().unwrap(), - &Float64Array::from(vec![None, Some(1.22), None,]) - ); - } - - #[test] - fn arr_with_null_derive_schema_into_rb() { - let json = json!([ - { - "c": null, - "b": "hello", - "a": null - }, - { - "a": 1, - "c": 1.22, - "b": "hello" - }, - { - "b": "hello", - "a": 1, - "c": null - }, - ]); - - let schema = fields_to_map( - [ - Field::new("a", DataType::Int64, true), - Field::new("b", DataType::Utf8, true), - Field::new("c", DataType::Float64, true), - ] - .into_iter(), - ); - - let (rb, _) = json::Event::new(json) - .into_recordbatch(&schema, false, None, SchemaVersion::V0) - .unwrap(); - - assert_eq!(rb.num_rows(), 3); - assert_eq!(rb.num_columns(), 4); - assert_eq!( - rb.column_by_name("a").unwrap().as_int64_arr().unwrap(), - &Int64Array::from(vec![None, Some(1), Some(1)]) - ); - assert_eq!( - rb.column_by_name("b").unwrap().as_utf8_arr().unwrap(), - &StringArray::from(vec![Some("hello"), Some("hello"), Some("hello"),]) - ); - assert_eq!( - rb.column_by_name("c").unwrap().as_float64_arr().unwrap(), - &Float64Array::from(vec![None, Some(1.22), None,]) - ); - } - - #[test] - fn arr_schema_mismatch() { - let json = json!([ - { - "a": null, - "b": "hello", - "c": 1.24 - }, - { - "a": 1, - "b": "hello", - "c": 1 - }, - { 
- "a": 1, - "b": "hello", - "c": null - }, - ]); - - let schema = fields_to_map( - [ - Field::new("a", DataType::Int64, true), - Field::new("b", DataType::Utf8, true), - Field::new("c", DataType::Float64, true), - ] - .into_iter(), - ); - - assert!(json::Event::new(json) - .into_recordbatch(&schema, false, None, SchemaVersion::V0,) - .is_err()); - } - - #[test] - fn arr_obj_with_nested_type() { - let json = json!([ - { - "a": 1, - "b": "hello", - }, - { - "a": 1, - "b": "hello", - }, - { - "a": 1, - "b": "hello", - "c": [{"a": 1}] - }, - { - "a": 1, - "b": "hello", - "c": [{"a": 1, "b": 2}] - }, - ]); - let flattened_json = convert_to_array( - convert_array_to_object( - json, - None, - None, - None, - SchemaVersion::V0, - &crate::event::format::LogSource::default(), - ) - .unwrap(), - ) - .unwrap(); - - let (rb, _) = json::Event::new(flattened_json) - .into_recordbatch(&HashMap::default(), false, None, SchemaVersion::V0) - .unwrap(); - assert_eq!(rb.num_rows(), 4); - assert_eq!(rb.num_columns(), 5); - assert_eq!( - rb.column_by_name("a").unwrap().as_int64_arr().unwrap(), - &Int64Array::from(vec![Some(1), Some(1), Some(1), Some(1)]) - ); - assert_eq!( - rb.column_by_name("b").unwrap().as_utf8_arr().unwrap(), - &StringArray::from(vec![ - Some("hello"), - Some("hello"), - Some("hello"), - Some("hello") - ]) - ); - - assert_eq!( - rb.column_by_name("c_a") - .unwrap() - .as_any() - .downcast_ref::() - .unwrap(), - &ListArray::from_iter_primitive::(vec![ - None, - None, - Some(vec![Some(1i64)]), - Some(vec![Some(1)]) - ]) - ); - - assert_eq!( - rb.column_by_name("c_b") - .unwrap() - .as_any() - .downcast_ref::() - .unwrap(), - &ListArray::from_iter_primitive::(vec![ - None, - None, - None, - Some(vec![Some(2i64)]) - ]) - ); - } - - #[test] - fn arr_obj_with_nested_type_v1() { - let json = json!([ - { - "a": 1, - "b": "hello", - }, - { - "a": 1, - "b": "hello", - }, - { - "a": 1, - "b": "hello", - "c": [{"a": 1}] - }, - { - "a": 1, - "b": "hello", - "c": [{"a": 1, "b": 2}] - }, - ]); - let flattened_json = convert_to_array( - convert_array_to_object( - json, - None, - None, - None, - SchemaVersion::V1, - &crate::event::format::LogSource::default(), - ) - .unwrap(), - ) - .unwrap(); - - let (rb, _) = json::Event::new(flattened_json) - .into_recordbatch(&HashMap::default(), false, None, SchemaVersion::V1) - .unwrap(); - - assert_eq!(rb.num_rows(), 4); - assert_eq!(rb.num_columns(), 5); - assert_eq!( - rb.column_by_name("a").unwrap().as_float64_arr().unwrap(), - &Float64Array::from(vec![Some(1.0), Some(1.0), Some(1.0), Some(1.0)]) - ); - assert_eq!( - rb.column_by_name("b").unwrap().as_utf8_arr().unwrap(), - &StringArray::from(vec![ - Some("hello"), - Some("hello"), - Some("hello"), - Some("hello") - ]) - ); - - assert_eq!( - rb.column_by_name("c_a").unwrap().as_float64_arr().unwrap(), - &Float64Array::from(vec![None, None, Some(1.0), Some(1.0)]) - ); - - assert_eq!( - rb.column_by_name("c_b").unwrap().as_float64_arr().unwrap(), - &Float64Array::from(vec![None, None, None, Some(2.0)]) - ); - } -} diff --git a/src/handlers/http/modal/utils/ingest_utils.rs b/src/handlers/http/modal/utils/ingest_utils.rs index 55e81f0df..257dc014e 100644 --- a/src/handlers/http/modal/utils/ingest_utils.rs +++ b/src/handlers/http/modal/utils/ingest_utils.rs @@ -80,7 +80,7 @@ async fn push_logs( let custom_partition = stream.get_custom_partition(); let schema_version = stream.get_schema_version(); let p_timestamp = Utc::now(); - + for json in jsons { let data = if time_partition.is_some() || custom_partition.is_some() { 
convert_array_to_object( diff --git a/src/parseable/streams.rs b/src/parseable/streams.rs index 088ca509d..009e01d2c 100644 --- a/src/parseable/streams.rs +++ b/src/parseable/streams.rs @@ -513,10 +513,7 @@ impl Stream { let file_size = match file.metadata() { Ok(meta) => meta.len(), Err(err) => { - warn!( - "File ({}) not found; Error = {err}", - file.display() - ); + warn!("File ({}) not found; Error = {err}", file.display()); continue; } }; diff --git a/src/utils/json/mod.rs b/src/utils/json/mod.rs index efa9cb2e2..0583f722a 100644 --- a/src/utils/json/mod.rs +++ b/src/utils/json/mod.rs @@ -278,4 +278,141 @@ mod tests { assert_eq!(deserialized.value, original.value); assert_eq!(deserialized.other_field, original.other_field); } + + #[test] + fn non_object_arr_is_err() { + let json = json!([1]); + + assert!(convert_array_to_object( + json, + None, + None, + None, + SchemaVersion::V0, + &crate::event::format::LogSource::default() + ) + .is_err()) + } + + #[test] + fn arr_obj_with_nested_type() { + let json = json!([ + { + "a": 1, + "b": "hello", + }, + { + "a": 1, + "b": "hello", + }, + { + "a": 1, + "b": "hello", + "c": [{"a": 1}] + }, + { + "a": 1, + "b": "hello", + "c": [{"a": 1, "b": 2}] + }, + ]); + let flattened_json = convert_to_array( + convert_array_to_object( + json, + None, + None, + None, + SchemaVersion::V0, + &crate::event::format::LogSource::default(), + ) + .unwrap(), + ) + .unwrap(); + + assert_eq!( + json!([ + { + "a": 1, + "b": "hello", + }, + { + "a": 1, + "b": "hello", + }, + { + "a": 1, + "b": "hello", + "c_a": [1], + }, + { + "a": 1, + "b": "hello", + "c_a": [1], + "c_b": [2], + }, + ]), + flattened_json + ); + } + + #[test] + fn arr_obj_with_nested_type_v1() { + let json = json!([ + { + "a": 1, + "b": "hello", + }, + { + "a": 1, + "b": "hello", + }, + { + "a": 1, + "b": "hello", + "c": [{"a": 1}] + }, + { + "a": 1, + "b": "hello", + "c": [{"a": 1, "b": 2}] + }, + ]); + let flattened_json = convert_to_array( + convert_array_to_object( + json, + None, + None, + None, + SchemaVersion::V1, + &crate::event::format::LogSource::default(), + ) + .unwrap(), + ) + .unwrap(); + + assert_eq!( + json!([ + { + "a": 1, + "b": "hello", + }, + { + "a": 1, + "b": "hello", + }, + { + "a": 1, + "b": "hello", + "c_a": 1, + }, + { + "a": 1, + "b": "hello", + "c_a": 1, + "c_b": 2, + }, + ]), + flattened_json + ); + } } From 5798b7b820b19adf84e09d2d7d53c9ca2a4e6a26 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Sat, 1 Mar 2025 03:42:27 +0530 Subject: [PATCH 03/39] refactor: remove `is_schema_matching` --- src/event/format/mod.rs | 35 ++++++++++------------------------- 1 file changed, 10 insertions(+), 25 deletions(-) diff --git a/src/event/format/mod.rs b/src/event/format/mod.rs index ce90cfc52..6bf10d059 100644 --- a/src/event/format/mod.rs +++ b/src/event/format/mod.rs @@ -137,11 +137,18 @@ pub trait EventFormat: Sized { )), ); - // prepare the record batch and new fields to be added - let mut new_schema = Arc::new(Schema::new(schema)); - if !Self::is_schema_matching(new_schema.clone(), storage_schema, static_schema_flag) { + if static_schema_flag + && schema.iter().any(|field| { + storage_schema + .get(field.name()) + .is_none_or(|storage_field| storage_field != field) + }) + { return Err(anyhow!("Schema mismatch")); } + + // prepare the record batch and new fields to be added + let mut new_schema = Arc::new(Schema::new(schema)); new_schema = update_field_type_in_schema(new_schema, None, time_partition, None, schema_version); @@ -156,28 +163,6 @@ pub trait EventFormat: Sized { 
Ok((rb, is_first)) } - fn is_schema_matching( - new_schema: Arc, - storage_schema: &HashMap>, - static_schema_flag: bool, - ) -> bool { - if !static_schema_flag { - return true; - } - for field in new_schema.fields() { - let Some(storage_field) = storage_schema.get(field.name()) else { - return false; - }; - if field.name() != storage_field.name() { - return false; - } - if field.data_type() != storage_field.data_type() { - return false; - } - } - true - } - #[allow(clippy::too_many_arguments)] fn into_event( self, From 78da6b458211206f26c8ebe16cce6eeac215373d Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Sat, 1 Mar 2025 11:33:12 +0530 Subject: [PATCH 04/39] doc: improve readability --- src/event/format/json.rs | 71 ++++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 40 deletions(-) diff --git a/src/event/format/json.rs b/src/event/format/json.rs index 43c23f5ad..ae4cdc55f 100644 --- a/src/event/format/json.rs +++ b/src/event/format/json.rs @@ -59,12 +59,10 @@ impl EventFormat for Event { // also extract the arrow schema, tags and metadata from the incoming json fn to_data( self, - schema: &HashMap>, + stored_schema: &HashMap>, time_partition: Option<&String>, schema_version: SchemaVersion, ) -> Result<(Self::Data, Vec>, bool), anyhow::Error> { - let stream_schema = schema; - // incoming event may be a single json or a json array // but Data (type defined above) is a vector of json values // hence we need to convert the incoming event to a vector of json values @@ -79,23 +77,23 @@ impl EventFormat for Event { collect_keys(value_arr.iter()).expect("fields can be collected from array of objects"); let mut is_first = false; - let schema = match derive_arrow_schema(stream_schema, fields) { - Ok(schema) => schema, - Err(_) => { + let schema = match derive_arrow_schema(stored_schema, fields) { + Some(schema) => schema, + _ => { let mut infer_schema = infer_json_schema_from_iterator(value_arr.iter().map(Ok)) .map_err(|err| { anyhow!("Could not infer schema for this event due to err {:?}", err) })?; let new_infer_schema = super::update_field_type_in_schema( Arc::new(infer_schema), - Some(stream_schema), + Some(stored_schema), time_partition, Some(&value_arr), schema_version, ); infer_schema = Schema::new(new_infer_schema.fields().clone()); Schema::try_merge(vec![ - Schema::new(stream_schema.values().cloned().collect::()), + Schema::new(stored_schema.values().cloned().collect::()), infer_schema.clone(), ]).map_err(|err| anyhow!("Could not merge schema of this event with that of the existing stream. 
{:?}", err))?; is_first = true; @@ -221,51 +219,44 @@ fn extract_and_parse_time( // Returns arrow schema with the fields that are present in the request body // This schema is an input to convert the request body to arrow record batch +// Returns None if even one of the fields in the json is new and not seen before fn derive_arrow_schema( schema: &HashMap>, - fields: Vec<&str>, -) -> Result>, ()> { + fields: HashSet<&str>, +) -> Option>> { let mut res = Vec::with_capacity(fields.len()); - let fields = fields.into_iter().map(|field_name| schema.get(field_name)); - for field in fields { - let Some(field) = field else { return Err(()) }; + for field_name in fields { + let field = schema.get(field_name)?; res.push(field.clone()) } - Ok(res) + + Some(res) } -fn collect_keys<'a>(values: impl Iterator) -> Result, ()> { - let mut keys = Vec::new(); +// Returns a list of keys that are present in the given iterable of JSON objects +// Returns None if even one of the value is not an Object +fn collect_keys<'a>(values: impl Iterator) -> Option> { + let mut keys = HashSet::new(); for value in values { - if let Some(obj) = value.as_object() { - for key in obj.keys() { - match keys.binary_search(&key.as_str()) { - Ok(_) => (), - Err(pos) => { - keys.insert(pos, key.as_str()); - } - } - } - } else { - return Err(()); + let obj = value.as_object()?; + for key in obj.keys() { + keys.insert(key.as_str()); } } - Ok(keys) + + Some(keys) } +// Returns true when the field doesn't exist in schema or has an invalid type fn fields_mismatch(schema: &[Arc], body: &Value, schema_version: SchemaVersion) -> bool { - for (name, val) in body.as_object().expect("body is of object variant") { - if val.is_null() { - continue; - } - let Some(field) = get_field(schema, name) else { - return true; - }; - if !valid_type(field.data_type(), val, schema_version) { - return true; - } - } - false + body.as_object() + .expect("body is of object variant") + .iter() + .any(|(key, value)| { + !value.is_null() + && get_field(schema, key) + .is_none_or(|field| !valid_type(field.data_type(), value, schema_version)) + }) } fn valid_type(data_type: &DataType, value: &Value, schema_version: SchemaVersion) -> bool { From 938c33dea9a796a29f395aaaf89fc4f0f17da199 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Sat, 1 Mar 2025 11:54:50 +0530 Subject: [PATCH 05/39] simplify `replace_columns` --- src/event/format/mod.rs | 3 +-- src/utils/arrow/mod.rs | 12 +++++------- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/src/event/format/mod.rs b/src/event/format/mod.rs index 6bf10d059..5c5f8dee8 100644 --- a/src/event/format/mod.rs +++ b/src/event/format/mod.rs @@ -156,8 +156,7 @@ pub trait EventFormat: Sized { rb = replace_columns( rb.schema(), &rb, - &[0], - &[Arc::new(get_timestamp_array(p_timestamp, rb.num_rows()))], + &[(0, Arc::new(get_timestamp_array(p_timestamp, rb.num_rows())))], ); Ok((rb, is_first)) diff --git a/src/utils/arrow/mod.rs b/src/utils/arrow/mod.rs index 53e6437d6..a11186ee0 100644 --- a/src/utils/arrow/mod.rs +++ b/src/utils/arrow/mod.rs @@ -61,8 +61,7 @@ use serde_json::{Map, Value}; /// /// * `schema` - The schema of the record batch. /// * `batch` - The record batch to modify. -/// * `indexes` - The indexes of the columns to replace. -/// * `arrays` - The new arrays to replace the columns with. +/// * `indexed_arrays` - A list of indexes and arrays to replace the columns indexed with. 
/// /// # Returns /// @@ -70,12 +69,11 @@ use serde_json::{Map, Value}; pub fn replace_columns( schema: Arc, batch: &RecordBatch, - indexes: &[usize], - arrays: &[Arc], + indexed_arrays: &[(usize, Arc)], ) -> RecordBatch { let mut batch_arrays = batch.columns().iter().map(Arc::clone).collect_vec(); - for (&index, arr) in indexes.iter().zip(arrays.iter()) { - batch_arrays[index] = Arc::clone(arr); + for (index, arr) in indexed_arrays { + batch_arrays[*index] = Arc::clone(arr); } RecordBatch::try_new(schema, batch_arrays).unwrap() } @@ -178,7 +176,7 @@ mod tests { let arr: Arc = Arc::new(Int32Array::from_value(0, 3)); - let new_rb = replace_columns(schema_ref.clone(), &rb, &[2], &[arr]); + let new_rb = replace_columns(schema_ref.clone(), &rb, &[(2, arr)]); assert_eq!(new_rb.schema(), schema_ref); assert_eq!(new_rb.num_columns(), 3); From 6fb53854a798a51148218551fcc6daa1dc92198a Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Sat, 1 Mar 2025 12:00:07 +0530 Subject: [PATCH 06/39] ci: fix imports --- src/event/format/json.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/event/format/json.rs b/src/event/format/json.rs index ae4cdc55f..3233e1b5c 100644 --- a/src/event/format/json.rs +++ b/src/event/format/json.rs @@ -27,7 +27,10 @@ use chrono::{DateTime, NaiveDateTime, Utc}; use datafusion::arrow::util::bit_util::round_upto_multiple_of_64; use itertools::Itertools; use serde_json::Value; -use std::{collections::HashMap, sync::Arc}; +use std::{ + collections::{HashMap, HashSet}, + sync::Arc, +}; use tracing::error; use super::EventFormat; From 23155b699b55fe0631fe22685288ed4fc052457c Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Sat, 1 Mar 2025 12:14:42 +0530 Subject: [PATCH 07/39] push flattening into `Event` --- src/connectors/kafka/processor.rs | 3 + src/event/format/json.rs | 165 +++++++++++++++--- src/event/format/mod.rs | 19 +- src/handlers/http/ingest.rs | 2 + src/handlers/http/modal/utils/ingest_utils.rs | 44 ++--- 5 files changed, 174 insertions(+), 59 deletions(-) diff --git a/src/connectors/kafka/processor.rs b/src/connectors/kafka/processor.rs index b74754003..b9fe2101d 100644 --- a/src/connectors/kafka/processor.rs +++ b/src/connectors/kafka/processor.rs @@ -57,6 +57,7 @@ impl ParseableSinkProcessor { let stream = PARSEABLE.get_stream(stream_name)?; let schema = stream.get_schema_raw(); let time_partition = stream.get_time_partition(); + let time_partition_limit = stream.get_time_partition_limit(); let custom_partition = stream.get_custom_partition(); let static_schema_flag = stream.get_static_schema_flag(); let schema_version = stream.get_schema_version(); @@ -78,7 +79,9 @@ impl ParseableSinkProcessor { static_schema_flag, custom_partition.as_ref(), time_partition.as_ref(), + time_partition_limit, schema_version, + &LogSource::Custom("Kafka".to_owned()), StreamType::UserDefined, )?; diff --git a/src/event/format/json.rs b/src/event/format/json.rs index 3233e1b5c..c6f5a11e5 100644 --- a/src/event/format/json.rs +++ b/src/event/format/json.rs @@ -29,12 +29,20 @@ use itertools::Itertools; use serde_json::Value; use std::{ collections::{HashMap, HashSet}, + num::NonZeroU32, sync::Arc, }; use tracing::error; -use super::EventFormat; -use crate::{metadata::SchemaVersion, storage::StreamType, utils::arrow::get_field}; +use super::{EventFormat, LogSource}; +use crate::{ + metadata::SchemaVersion, + storage::StreamType, + utils::{ + arrow::get_field, + json::{convert_array_to_object, flatten::convert_to_array}, + }, +}; pub struct Event { pub json: Value, 
@@ -64,26 +72,40 @@ impl EventFormat for Event { self, stored_schema: &HashMap>, time_partition: Option<&String>, + time_partition_limit: Option, + custom_partition: Option<&String>, schema_version: SchemaVersion, + log_source: &LogSource, ) -> Result<(Self::Data, Vec>, bool), anyhow::Error> { - // incoming event may be a single json or a json array - // but Data (type defined above) is a vector of json values - // hence we need to convert the incoming event to a vector of json values - let value_arr = match self.json { - Value::Array(arr) => arr, - value @ Value::Object(_) => vec![value], - _ => unreachable!("flatten would have failed beforehand"), + let flattened = if time_partition.is_some() || custom_partition.is_some() { + convert_array_to_object( + self.json, + time_partition, + time_partition_limit, + custom_partition, + schema_version, + log_source, + )? + } else { + vec![convert_to_array(convert_array_to_object( + self.json, + None, + None, + None, + schema_version, + log_source, + )?)?] }; // collect all the keys from all the json objects in the request body let fields = - collect_keys(value_arr.iter()).expect("fields can be collected from array of objects"); + collect_keys(flattened.iter()).expect("fields can be collected from array of objects"); let mut is_first = false; let schema = match derive_arrow_schema(stored_schema, fields) { Some(schema) => schema, _ => { - let mut infer_schema = infer_json_schema_from_iterator(value_arr.iter().map(Ok)) + let mut infer_schema = infer_json_schema_from_iterator(flattened.iter().map(Ok)) .map_err(|err| { anyhow!("Could not infer schema for this event due to err {:?}", err) })?; @@ -91,7 +113,7 @@ impl EventFormat for Event { Arc::new(infer_schema), Some(stored_schema), time_partition, - Some(&value_arr), + Some(&flattened), schema_version, ); infer_schema = Schema::new(new_infer_schema.fields().clone()); @@ -110,7 +132,7 @@ impl EventFormat for Event { } }; - if value_arr + if flattened .iter() .any(|value| fields_mismatch(&schema, value, schema_version)) { @@ -119,7 +141,7 @@ impl EventFormat for Event { )); } - Ok((value_arr, schema, is_first)) + Ok((flattened, schema, is_first)) } // Convert the Data type (defined above) to arrow record batch @@ -147,7 +169,9 @@ impl EventFormat for Event { static_schema_flag: bool, custom_partitions: Option<&String>, time_partition: Option<&String>, + time_partition_limit: Option, schema_version: SchemaVersion, + log_source: &LogSource, stream_type: StreamType, ) -> Result { let custom_partition_values = match custom_partitions.as_ref() { @@ -167,7 +191,10 @@ impl EventFormat for Event { storage_schema, static_schema_flag, time_partition, + time_partition_limit, + custom_partitions, schema_version, + log_source, )?; Ok(super::Event { @@ -385,7 +412,15 @@ mod tests { }); let (rb, _) = Event::new(json) - .into_recordbatch(&HashMap::default(), false, None, SchemaVersion::V0) + .into_recordbatch( + &HashMap::default(), + false, + None, + None, + None, + SchemaVersion::V0, + &LogSource::Json, + ) .unwrap(); assert_eq!(rb.num_rows(), 1); @@ -413,7 +448,15 @@ mod tests { }); let (rb, _) = Event::new(json) - .into_recordbatch(&HashMap::default(), false, None, SchemaVersion::V0) + .into_recordbatch( + &HashMap::default(), + false, + None, + None, + None, + SchemaVersion::V0, + &LogSource::Json, + ) .unwrap(); assert_eq!(rb.num_rows(), 1); @@ -445,7 +488,15 @@ mod tests { ); let (rb, _) = Event::new(json) - .into_recordbatch(&schema, false, None, SchemaVersion::V0) + .into_recordbatch( + &schema, + false, + 
None, + None, + None, + SchemaVersion::V0, + &LogSource::Json, + ) .unwrap(); assert_eq!(rb.num_rows(), 1); @@ -477,7 +528,15 @@ mod tests { ); assert!(Event::new(json) - .into_recordbatch(&schema, false, None, SchemaVersion::V0,) + .into_recordbatch( + &schema, + false, + None, + None, + None, + SchemaVersion::V0, + &LogSource::Json + ) .is_err()); } @@ -495,7 +554,15 @@ mod tests { ); let (rb, _) = Event::new(json) - .into_recordbatch(&schema, false, None, SchemaVersion::V0) + .into_recordbatch( + &schema, + false, + None, + None, + None, + SchemaVersion::V0, + &LogSource::Json, + ) .unwrap(); assert_eq!(rb.num_rows(), 1); @@ -521,7 +588,15 @@ mod tests { ]); let (rb, _) = Event::new(json) - .into_recordbatch(&HashMap::default(), false, None, SchemaVersion::V0) + .into_recordbatch( + &HashMap::default(), + false, + None, + None, + None, + SchemaVersion::V0, + &LogSource::Json, + ) .unwrap(); assert_eq!(rb.num_rows(), 3); @@ -569,7 +644,15 @@ mod tests { ]); let (rb, _) = Event::new(json) - .into_recordbatch(&HashMap::default(), false, None, SchemaVersion::V0) + .into_recordbatch( + &HashMap::default(), + false, + None, + None, + None, + SchemaVersion::V0, + &LogSource::Json, + ) .unwrap(); assert_eq!(rb.num_rows(), 3); @@ -618,7 +701,15 @@ mod tests { ); let (rb, _) = Event::new(json) - .into_recordbatch(&schema, false, None, SchemaVersion::V0) + .into_recordbatch( + &schema, + false, + None, + None, + None, + SchemaVersion::V0, + &LogSource::Json, + ) .unwrap(); assert_eq!(rb.num_rows(), 3); @@ -667,7 +758,15 @@ mod tests { ); assert!(Event::new(json) - .into_recordbatch(&schema, false, None, SchemaVersion::V0,) + .into_recordbatch( + &schema, + false, + None, + None, + None, + SchemaVersion::V0, + &LogSource::Json + ) .is_err()); } @@ -696,7 +795,15 @@ mod tests { ]); let (rb, _) = Event::new(json) - .into_recordbatch(&HashMap::default(), false, None, SchemaVersion::V0) + .into_recordbatch( + &HashMap::default(), + false, + None, + None, + None, + SchemaVersion::V0, + &LogSource::Json, + ) .unwrap(); assert_eq!(rb.num_rows(), 4); assert_eq!(rb.num_columns(), 5); @@ -768,7 +875,15 @@ mod tests { ]); let (rb, _) = Event::new(json) - .into_recordbatch(&HashMap::default(), false, None, SchemaVersion::V1) + .into_recordbatch( + &HashMap::default(), + false, + None, + None, + None, + SchemaVersion::V1, + &LogSource::Json, + ) .unwrap(); assert_eq!(rb.num_rows(), 4); diff --git a/src/event/format/mod.rs b/src/event/format/mod.rs index 5c5f8dee8..ec9ed076f 100644 --- a/src/event/format/mod.rs +++ b/src/event/format/mod.rs @@ -20,6 +20,7 @@ use std::{ collections::{HashMap, HashSet}, fmt::Display, + num::NonZeroU32, sync::Arc, }; @@ -101,7 +102,10 @@ pub trait EventFormat: Sized { self, schema: &HashMap>, time_partition: Option<&String>, + time_partition_limit: Option, + custom_partition: Option<&String>, schema_version: SchemaVersion, + log_source: &LogSource, ) -> Result<(Self::Data, EventSchema, bool), AnyError>; fn decode(data: Self::Data, schema: Arc) -> Result; @@ -114,11 +118,20 @@ pub trait EventFormat: Sized { storage_schema: &HashMap>, static_schema_flag: bool, time_partition: Option<&String>, + time_partition_limit: Option, + custom_partition: Option<&String>, schema_version: SchemaVersion, + log_source: &LogSource, ) -> Result<(RecordBatch, bool), AnyError> { let p_timestamp = self.get_p_timestamp(); - let (data, mut schema, is_first) = - self.to_data(storage_schema, time_partition, schema_version)?; + let (data, mut schema, is_first) = self.to_data( + storage_schema, + 
time_partition, + time_partition_limit, + custom_partition, + schema_version, + log_source, + )?; if get_field(&schema, DEFAULT_TIMESTAMP_KEY).is_some() { return Err(anyhow!( @@ -171,7 +184,9 @@ pub trait EventFormat: Sized { static_schema_flag: bool, custom_partitions: Option<&String>, time_partition: Option<&String>, + time_partition_limit: Option, schema_version: SchemaVersion, + log_source: &LogSource, stream_type: StreamType, ) -> Result; } diff --git a/src/handlers/http/ingest.rs b/src/handlers/http/ingest.rs index bca4e36dc..09b11eddc 100644 --- a/src/handlers/http/ingest.rs +++ b/src/handlers/http/ingest.rs @@ -91,7 +91,9 @@ pub async fn ingest_internal_stream(stream_name: String, body: Bytes) -> Result< false, None, None, + None, SchemaVersion::V0, + &LogSource::Pmeta, StreamType::Internal, )? .process()?; diff --git a/src/handlers/http/modal/utils/ingest_utils.rs b/src/handlers/http/modal/utils/ingest_utils.rs index 257dc014e..7c6d66b60 100644 --- a/src/handlers/http/modal/utils/ingest_utils.rs +++ b/src/handlers/http/modal/utils/ingest_utils.rs @@ -31,7 +31,6 @@ use crate::{ otel::{logs::flatten_otel_logs, metrics::flatten_otel_metrics, traces::flatten_otel_traces}, parseable::PARSEABLE, storage::StreamType, - utils::json::{convert_array_to_object, flatten::convert_to_array}, }; pub async fn flatten_and_push_logs( @@ -82,42 +81,23 @@ async fn push_logs( let p_timestamp = Utc::now(); for json in jsons { - let data = if time_partition.is_some() || custom_partition.is_some() { - convert_array_to_object( - json, + let origin_size = serde_json::to_vec(&json).unwrap().len() as u64; // string length need not be the same as byte length + let schema = PARSEABLE.get_stream(stream_name)?.get_schema_raw(); + json::Event { json, p_timestamp } + .into_event( + stream_name.to_owned(), + origin_size, + &schema, + static_schema_flag, + custom_partition.as_ref(), time_partition.as_ref(), time_partition_limit, - custom_partition.as_ref(), schema_version, log_source, + StreamType::UserDefined, )? - } else { - vec![convert_to_array(convert_array_to_object( - json, - None, - None, - None, - schema_version, - log_source, - )?)?] - }; - - for json in data { - let origin_size = serde_json::to_vec(&json).unwrap().len() as u64; // string length need not be the same as byte length - let schema = PARSEABLE.get_stream(stream_name)?.get_schema_raw(); - json::Event { json, p_timestamp } - .into_event( - stream_name.to_owned(), - origin_size, - &schema, - static_schema_flag, - custom_partition.as_ref(), - time_partition.as_ref(), - schema_version, - StreamType::UserDefined, - )? 
- .process()?; - } + .process()?; } + Ok(()) } From d604b65ea1a83edce03b19739e76d6bfd341ed5b Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Sat, 1 Mar 2025 12:33:23 +0530 Subject: [PATCH 08/39] refactor: move kinesis to lib level --- src/handlers/http/mod.rs | 1 - src/{handlers/http => }/kinesis.rs | 0 src/lib.rs | 1 + 3 files changed, 1 insertion(+), 1 deletion(-) rename src/{handlers/http => }/kinesis.rs (100%) diff --git a/src/handlers/http/mod.rs b/src/handlers/http/mod.rs index f1f702d4b..4bdf85adf 100644 --- a/src/handlers/http/mod.rs +++ b/src/handlers/http/mod.rs @@ -34,7 +34,6 @@ pub mod cluster; pub mod correlation; pub mod health_check; pub mod ingest; -mod kinesis; pub mod llm; pub mod logstream; pub mod middleware; diff --git a/src/handlers/http/kinesis.rs b/src/kinesis.rs similarity index 100% rename from src/handlers/http/kinesis.rs rename to src/kinesis.rs diff --git a/src/lib.rs b/src/lib.rs index 2f8eb06ad..4f940aded 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -29,6 +29,7 @@ pub mod correlation; mod event; pub mod handlers; pub mod hottier; +mod kinesis; mod livetail; mod metadata; pub mod metrics; From 303ba35ba15a9b9f45d53782b3d1732a8aac2e85 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Sat, 1 Mar 2025 12:33:45 +0530 Subject: [PATCH 09/39] refactor: perform flattening in `to_data` alone --- src/event/format/json.rs | 96 ++++++++++++++----- src/event/format/mod.rs | 1 + src/handlers/http/ingest.rs | 12 +-- .../http/modal/ingest/ingestor_ingest.rs | 4 +- src/handlers/http/modal/utils/ingest_utils.rs | 48 +--------- 5 files changed, 87 insertions(+), 74 deletions(-) diff --git a/src/event/format/json.rs b/src/event/format/json.rs index c6f5a11e5..0bdda6a38 100644 --- a/src/event/format/json.rs +++ b/src/event/format/json.rs @@ -26,6 +26,9 @@ use arrow_schema::{DataType, Field, Fields, Schema}; use chrono::{DateTime, NaiveDateTime, Utc}; use datafusion::arrow::util::bit_util::round_upto_multiple_of_64; use itertools::Itertools; +use opentelemetry_proto::tonic::{ + logs::v1::LogsData, metrics::v1::MetricsData, trace::v1::TracesData, +}; use serde_json::Value; use std::{ collections::{HashMap, HashSet}, @@ -36,7 +39,9 @@ use tracing::error; use super::{EventFormat, LogSource}; use crate::{ + kinesis::{flatten_kinesis_logs, Message}, metadata::SchemaVersion, + otel::{logs::flatten_otel_logs, metrics::flatten_otel_metrics, traces::flatten_otel_traces}, storage::StreamType, utils::{ arrow::get_field, @@ -58,6 +63,64 @@ impl Event { } } +pub fn flatten_logs( + json: Value, + time_partition: Option<&String>, + time_partition_limit: Option, + custom_partitions: Option<&String>, + schema_version: SchemaVersion, + log_source: &LogSource, +) -> Result, anyhow::Error> { + let data = match log_source { + LogSource::Kinesis => { + //custom flattening required for Amazon Kinesis + let message: Message = serde_json::from_value(json)?; + flatten_kinesis_logs(message) + } + LogSource::OtelLogs => { + //custom flattening required for otel logs + let logs: LogsData = serde_json::from_value(json)?; + flatten_otel_logs(&logs) + } + LogSource::OtelTraces => { + //custom flattening required for otel traces + let traces: TracesData = serde_json::from_value(json)?; + flatten_otel_traces(&traces) + } + LogSource::OtelMetrics => { + //custom flattening required for otel metrics + let metrics: MetricsData = serde_json::from_value(json)?; + flatten_otel_metrics(metrics) + } + _ => vec![json], + }; + + let mut logs = vec![]; + for json in data { + if time_partition.is_some() || 
custom_partitions.is_some() { + logs.append(&mut convert_array_to_object( + json, + time_partition, + time_partition_limit, + custom_partitions, + schema_version, + log_source, + )?) + } else { + logs.push(convert_to_array(convert_array_to_object( + json, + None, + None, + None, + schema_version, + log_source, + )?)?) + } + } + + Ok(logs) +} + impl EventFormat for Event { type Data = Vec; @@ -73,29 +136,18 @@ impl EventFormat for Event { stored_schema: &HashMap>, time_partition: Option<&String>, time_partition_limit: Option, - custom_partition: Option<&String>, + custom_partitions: Option<&String>, schema_version: SchemaVersion, log_source: &LogSource, ) -> Result<(Self::Data, Vec>, bool), anyhow::Error> { - let flattened = if time_partition.is_some() || custom_partition.is_some() { - convert_array_to_object( - self.json, - time_partition, - time_partition_limit, - custom_partition, - schema_version, - log_source, - )? - } else { - vec![convert_to_array(convert_array_to_object( - self.json, - None, - None, - None, - schema_version, - log_source, - )?)?] - }; + let flattened = flatten_logs( + self.json, + time_partition, + time_partition_limit, + custom_partitions, + schema_version, + log_source, + )?; // collect all the keys from all the json objects in the request body let fields = @@ -175,8 +227,8 @@ impl EventFormat for Event { stream_type: StreamType, ) -> Result { let custom_partition_values = match custom_partitions.as_ref() { - Some(custom_partition) => { - let custom_partitions = custom_partition.split(',').collect_vec(); + Some(custom_partitions) => { + let custom_partitions = custom_partitions.split(',').collect_vec(); extract_custom_partition_values(&self.json, &custom_partitions) } None => HashMap::new(), diff --git a/src/event/format/mod.rs b/src/event/format/mod.rs index ec9ed076f..bf259159a 100644 --- a/src/event/format/mod.rs +++ b/src/event/format/mod.rs @@ -113,6 +113,7 @@ pub trait EventFormat: Sized { /// Returns the UTC time at ingestion fn get_p_timestamp(&self) -> DateTime; + #[allow(clippy::too_many_arguments)] fn into_recordbatch( self, storage_schema: &HashMap>, diff --git a/src/handlers/http/ingest.rs b/src/handlers/http/ingest.rs index 09b11eddc..06d18ed0b 100644 --- a/src/handlers/http/ingest.rs +++ b/src/handlers/http/ingest.rs @@ -38,7 +38,7 @@ use crate::utils::header_parsing::ParseHeaderError; use crate::utils::json::flatten::JsonFlattenError; use super::logstream::error::{CreateStreamError, StreamError}; -use super::modal::utils::ingest_utils::flatten_and_push_logs; +use super::modal::utils::ingest_utils::push_logs; use super::users::dashboards::DashboardError; use super::users::filters::FiltersError; @@ -72,7 +72,7 @@ pub async fn ingest(req: HttpRequest, Json(json): Json) -> Result Result Result<(), PostError> { - let json = match log_source { - LogSource::Kinesis => { - //custom flattening required for Amazon Kinesis - let message: Message = serde_json::from_value(json)?; - flatten_kinesis_logs(message) - } - LogSource::OtelLogs => { - //custom flattening required for otel logs - let logs: LogsData = serde_json::from_value(json)?; - flatten_otel_logs(&logs) - } - LogSource::OtelTraces => { - //custom flattening required for otel traces - let traces: TracesData = serde_json::from_value(json)?; - flatten_otel_traces(&traces) - } - LogSource::OtelMetrics => { - //custom flattening required for otel metrics - let metrics: MetricsData = serde_json::from_value(json)?; - flatten_otel_metrics(metrics) - } - _ => vec![json], - }; - push_logs(stream_name, json, 
log_source).await?; - Ok(()) -} - -async fn push_logs( - stream_name: &str, - jsons: Vec, + json: Value, log_source: &LogSource, ) -> Result<(), PostError> { let stream = PARSEABLE.get_stream(stream_name)?; @@ -80,7 +41,6 @@ async fn push_logs( let schema_version = stream.get_schema_version(); let p_timestamp = Utc::now(); - for json in jsons { let origin_size = serde_json::to_vec(&json).unwrap().len() as u64; // string length need not be the same as byte length let schema = PARSEABLE.get_stream(stream_name)?.get_schema_raw(); json::Event { json, p_timestamp } @@ -97,7 +57,7 @@ async fn push_logs( StreamType::UserDefined, )? .process()?; - } + Ok(()) } From c2faefc8d9d0ba4b9785265e82343dada688e5f5 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Sat, 1 Mar 2025 12:49:53 +0530 Subject: [PATCH 10/39] refactor: further streamline, associate w/ `Parseable` --- src/connectors/kafka/processor.rs | 47 ++++---------- src/handlers/http/ingest.rs | 52 +++++++-------- src/handlers/http/modal/utils/ingest_utils.rs | 63 ------------------- src/handlers/http/modal/utils/mod.rs | 1 - src/parseable/streams.rs | 35 ++++++++++- 5 files changed, 73 insertions(+), 125 deletions(-) delete mode 100644 src/handlers/http/modal/utils/ingest_utils.rs diff --git a/src/connectors/kafka/processor.rs b/src/connectors/kafka/processor.rs index b9fe2101d..fa2771fc7 100644 --- a/src/connectors/kafka/processor.rs +++ b/src/connectors/kafka/processor.rs @@ -26,12 +26,7 @@ use tokio_stream::wrappers::ReceiverStream; use tracing::{debug, error}; use crate::{ - connectors::common::processor::Processor, - event::{ - format::{json, EventFormat, LogSource}, - Event as ParseableEvent, - }, - parseable::PARSEABLE, + connectors::common::processor::Processor, event::format::LogSource, parseable::PARSEABLE, storage::StreamType, }; @@ -41,10 +36,7 @@ use super::{config::BufferConfig, ConsumerRecord, StreamConsumer, TopicPartition pub struct ParseableSinkProcessor; impl ParseableSinkProcessor { - async fn build_event_from_chunk( - &self, - records: &[ConsumerRecord], - ) -> anyhow::Result { + async fn process_event_from_chunk(&self, records: &[ConsumerRecord]) -> anyhow::Result { let stream_name = records .first() .map(|r| r.topic.as_str()) @@ -54,14 +46,6 @@ impl ParseableSinkProcessor { .create_stream_if_not_exists(stream_name, StreamType::UserDefined, LogSource::Json) .await?; - let stream = PARSEABLE.get_stream(stream_name)?; - let schema = stream.get_schema_raw(); - let time_partition = stream.get_time_partition(); - let time_partition_limit = stream.get_time_partition_limit(); - let custom_partition = stream.get_custom_partition(); - let static_schema_flag = stream.get_static_schema_flag(); - let schema_version = stream.get_schema_version(); - let mut json_vec = Vec::with_capacity(records.len()); let mut total_payload_size = 0u64; @@ -72,20 +56,15 @@ impl ParseableSinkProcessor { } } - let p_event = json::Event::new(Value::Array(json_vec)).into_event( - stream_name.to_string(), - total_payload_size, - &schema, - static_schema_flag, - custom_partition.as_ref(), - time_partition.as_ref(), - time_partition_limit, - schema_version, - &LogSource::Custom("Kafka".to_owned()), - StreamType::UserDefined, - )?; - - Ok(p_event) + PARSEABLE + .get_or_create_stream(stream_name) + .push_logs( + Value::Array(json_vec), + &LogSource::Custom("Kafka".to_owned()), + ) + .await?; + + Ok(total_payload_size) } } @@ -95,9 +74,9 @@ impl Processor, ()> for ParseableSinkProcessor { let len = records.len(); debug!("Processing {len} records"); - 
self.build_event_from_chunk(&records).await?.process()?; + let size = self.process_event_from_chunk(&records).await?; - debug!("Processed {len} records"); + debug!("Processed {len} records, size = {size} Bytes"); Ok(()) } } diff --git a/src/handlers/http/ingest.rs b/src/handlers/http/ingest.rs index 06d18ed0b..740269d40 100644 --- a/src/handlers/http/ingest.rs +++ b/src/handlers/http/ingest.rs @@ -28,9 +28,8 @@ use serde_json::Value; use crate::event; use crate::event::error::EventError; -use crate::event::format::{self, EventFormat, LogSource}; +use crate::event::format::LogSource; use crate::handlers::{LOG_SOURCE_KEY, STREAM_NAME_HEADER_KEY}; -use crate::metadata::SchemaVersion; use crate::option::Mode; use crate::parseable::{StreamNotFound, PARSEABLE}; use crate::storage::{ObjectStorageError, StreamType}; @@ -38,7 +37,6 @@ use crate::utils::header_parsing::ParseHeaderError; use crate::utils::json::flatten::JsonFlattenError; use super::logstream::error::{CreateStreamError, StreamError}; -use super::modal::utils::ingest_utils::push_logs; use super::users::dashboards::DashboardError; use super::users::filters::FiltersError; @@ -72,31 +70,21 @@ pub async fn ingest(req: HttpRequest, Json(json): Json) -> Result Result<(), PostError> { - let size: usize = body.len(); let json: Value = serde_json::from_slice(&body)?; - let schema = PARSEABLE.get_stream(&stream_name)?.get_schema_raw(); - - // For internal streams, use old schema - format::json::Event::new(json) - .into_event( - stream_name, - size as u64, - &schema, - false, - None, - None, - None, - SchemaVersion::V0, - &LogSource::Pmeta, - StreamType::Internal, - )? - .process()?; + + PARSEABLE + .get_stream(&stream_name)? + .push_logs(json, &LogSource::Pmeta) + .await?; Ok(()) } @@ -125,7 +113,10 @@ pub async fn handle_otel_logs_ingestion( .create_stream_if_not_exists(&stream_name, StreamType::UserDefined, LogSource::OtelLogs) .await?; - push_logs(&stream_name, json, &log_source).await?; + PARSEABLE + .get_or_create_stream(&stream_name) + .push_logs(json, &log_source) + .await?; Ok(HttpResponse::Ok().finish()) } @@ -156,7 +147,10 @@ pub async fn handle_otel_metrics_ingestion( ) .await?; - push_logs(&stream_name, json, &log_source).await?; + PARSEABLE + .get_or_create_stream(&stream_name) + .push_logs(json, &log_source) + .await?; Ok(HttpResponse::Ok().finish()) } @@ -184,7 +178,10 @@ pub async fn handle_otel_traces_ingestion( .create_stream_if_not_exists(&stream_name, StreamType::UserDefined, LogSource::OtelTraces) .await?; - push_logs(&stream_name, json, &log_source).await?; + PARSEABLE + .get_or_create_stream(&stream_name) + .push_logs(json, &log_source) + .await?; Ok(HttpResponse::Ok().finish()) } @@ -233,7 +230,10 @@ pub async fn post_event( return Err(PostError::OtelNotSupported); } - push_logs(&stream_name, json, &log_source).await?; + PARSEABLE + .get_or_create_stream(&stream_name) + .push_logs(json, &log_source) + .await?; Ok(HttpResponse::Ok().finish()) } diff --git a/src/handlers/http/modal/utils/ingest_utils.rs b/src/handlers/http/modal/utils/ingest_utils.rs deleted file mode 100644 index aa8fca6a7..000000000 --- a/src/handlers/http/modal/utils/ingest_utils.rs +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Parseable Server (C) 2022 - 2024 Parseable, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of the - * License, or (at your option) any later version. 
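
A minimal sketch, not part of the patch series: after this refactor the HTTP handlers and the Kafka sink funnel ingestion through the same `Stream::push_logs` path shown in the hunks above. The call shape below mirrors those handlers as the signature stands at this point in the series (later patches adjust it); the stream name "demo" and the payload are hypothetical, and the `crate::` paths are the ones visible in these diffs.

use serde_json::json;

use crate::{event::format::LogSource, handlers::http::ingest::PostError, parseable::PARSEABLE};

// Create the stream on first use, then flatten and push the payload through
// the same `push_logs` the OTEL and default JSON handlers call.
async fn ingest_example() -> Result<(), PostError> {
    PARSEABLE
        .get_or_create_stream("demo")
        .push_logs(json!({"level": "info", "msg": "hello"}), &LogSource::Json)
        .await?;
    Ok(())
}
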
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - * - */ - -use chrono::Utc; -use serde_json::Value; - -use crate::{ - event::format::{json, EventFormat, LogSource}, - handlers::http::ingest::PostError, - parseable::PARSEABLE, - storage::StreamType, -}; - -pub async fn push_logs( - stream_name: &str, - json: Value, - log_source: &LogSource, -) -> Result<(), PostError> { - let stream = PARSEABLE.get_stream(stream_name)?; - let time_partition = stream.get_time_partition(); - let time_partition_limit = PARSEABLE - .get_stream(stream_name)? - .get_time_partition_limit(); - let static_schema_flag = stream.get_static_schema_flag(); - let custom_partition = stream.get_custom_partition(); - let schema_version = stream.get_schema_version(); - let p_timestamp = Utc::now(); - - let origin_size = serde_json::to_vec(&json).unwrap().len() as u64; // string length need not be the same as byte length - let schema = PARSEABLE.get_stream(stream_name)?.get_schema_raw(); - json::Event { json, p_timestamp } - .into_event( - stream_name.to_owned(), - origin_size, - &schema, - static_schema_flag, - custom_partition.as_ref(), - time_partition.as_ref(), - time_partition_limit, - schema_version, - log_source, - StreamType::UserDefined, - )? - .process()?; - - - Ok(()) -} diff --git a/src/handlers/http/modal/utils/mod.rs b/src/handlers/http/modal/utils/mod.rs index 61930d43d..1d0a3767b 100644 --- a/src/handlers/http/modal/utils/mod.rs +++ b/src/handlers/http/modal/utils/mod.rs @@ -16,6 +16,5 @@ * */ -pub mod ingest_utils; pub mod logstream_utils; pub mod rbac_utils; diff --git a/src/parseable/streams.rs b/src/parseable/streams.rs index 009e01d2c..d9cd373ad 100644 --- a/src/parseable/streams.rs +++ b/src/parseable/streams.rs @@ -42,12 +42,16 @@ use parquet::{ }; use rand::distributions::DistString; use relative_path::RelativePathBuf; +use serde_json::Value; use tokio::task::JoinSet; use tracing::{error, info, trace, warn}; use crate::{ cli::Options, - event::DEFAULT_TIMESTAMP_KEY, + event::{ + format::{json, EventFormat, LogSource}, + DEFAULT_TIMESTAMP_KEY, + }, metadata::{LogStreamMetadata, SchemaVersion}, metrics, option::Mode, @@ -109,6 +113,35 @@ impl Stream { }) } + pub async fn push_logs(&self, json: Value, log_source: &LogSource) -> anyhow::Result<()> { + let time_partition = self.get_time_partition(); + let time_partition_limit = self.get_time_partition_limit(); + let static_schema_flag = self.get_static_schema_flag(); + let custom_partition = self.get_custom_partition(); + let schema_version = self.get_schema_version(); + let schema = self.get_schema_raw(); + let stream_type = self.get_stream_type(); + + let origin_size = serde_json::to_vec(&json).unwrap().len() as u64; // string length need not be the same as byte length + + json::Event::new(json) + .into_event( + self.stream_name.to_owned(), + origin_size, + &schema, + static_schema_flag, + custom_partition.as_ref(), + time_partition.as_ref(), + time_partition_limit, + schema_version, + log_source, + stream_type, + )? + .process()?; + + Ok(()) + } + // Concatenates record batches and puts them in memory store for each event. 
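// Annotation (not in the original patch): the `key` this `push` receives is
// the partition prefix built by callers later in this series: a hex digest of
// the record batch schema (`get_schema_key`), optionally extended with a
// minute-granular `%Y%m%dT%H%M` suffix when a time partition is configured,
// plus `&column=value` pairs for any custom partitions.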
pub fn push( &self, From 354061a9a95214c0dbd2b3ec6b97bf1a03587d7f Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Sat, 1 Mar 2025 12:53:45 +0530 Subject: [PATCH 11/39] ci: deepsource suggestion --- src/event/format/json.rs | 61 ++++++++++++++++++++++------------------ 1 file changed, 33 insertions(+), 28 deletions(-) diff --git a/src/event/format/json.rs b/src/event/format/json.rs index 0bdda6a38..0c2c06155 100644 --- a/src/event/format/json.rs +++ b/src/event/format/json.rs @@ -154,34 +154,39 @@ impl EventFormat for Event { collect_keys(flattened.iter()).expect("fields can be collected from array of objects"); let mut is_first = false; - let schema = match derive_arrow_schema(stored_schema, fields) { - Some(schema) => schema, - _ => { - let mut infer_schema = infer_json_schema_from_iterator(flattened.iter().map(Ok)) - .map_err(|err| { - anyhow!("Could not infer schema for this event due to err {:?}", err) - })?; - let new_infer_schema = super::update_field_type_in_schema( - Arc::new(infer_schema), - Some(stored_schema), - time_partition, - Some(&flattened), - schema_version, - ); - infer_schema = Schema::new(new_infer_schema.fields().clone()); - Schema::try_merge(vec![ - Schema::new(stored_schema.values().cloned().collect::()), - infer_schema.clone(), - ]).map_err(|err| anyhow!("Could not merge schema of this event with that of the existing stream. {:?}", err))?; - is_first = true; - infer_schema - .fields - .iter() - .filter(|field| !field.data_type().is_null()) - .cloned() - .sorted_by(|a, b| a.name().cmp(b.name())) - .collect() - } + let schema = if let Some(schema) = derive_arrow_schema(stored_schema, fields) { + schema + } else { + let mut infer_schema = infer_json_schema_from_iterator(flattened.iter().map(Ok)) + .map_err(|err| { + anyhow!("Could not infer schema for this event due to err {:?}", err) + })?; + let new_infer_schema = super::update_field_type_in_schema( + Arc::new(infer_schema), + Some(stored_schema), + time_partition, + Some(&flattened), + schema_version, + ); + infer_schema = Schema::new(new_infer_schema.fields().clone()); + Schema::try_merge(vec![ + Schema::new(stored_schema.values().cloned().collect::()), + infer_schema.clone(), + ]) + .map_err(|err| { + anyhow!( + "Could not merge schema of this event with that of the existing stream. 
{:?}", + err + ) + })?; + is_first = true; + infer_schema + .fields + .iter() + .filter(|field| !field.data_type().is_null()) + .cloned() + .sorted_by(|a, b| a.name().cmp(b.name())) + .collect() }; if flattened From 1386d3bd5de58f89d1f9758aae98ef544e180378 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Sat, 1 Mar 2025 13:02:07 +0530 Subject: [PATCH 12/39] fix: flattening --- src/event/format/json.rs | 42 ++++++++++++++-------------------------- 1 file changed, 14 insertions(+), 28 deletions(-) diff --git a/src/event/format/json.rs b/src/event/format/json.rs index 0c2c06155..a056f2bf4 100644 --- a/src/event/format/json.rs +++ b/src/event/format/json.rs @@ -43,10 +43,7 @@ use crate::{ metadata::SchemaVersion, otel::{logs::flatten_otel_logs, metrics::flatten_otel_metrics, traces::flatten_otel_traces}, storage::StreamType, - utils::{ - arrow::get_field, - json::{convert_array_to_object, flatten::convert_to_array}, - }, + utils::{arrow::get_field, json::convert_array_to_object}, }; pub struct Event { @@ -70,7 +67,7 @@ pub fn flatten_logs( custom_partitions: Option<&String>, schema_version: SchemaVersion, log_source: &LogSource, -) -> Result, anyhow::Error> { +) -> anyhow::Result> { let data = match log_source { LogSource::Kinesis => { //custom flattening required for Amazon Kinesis @@ -97,25 +94,14 @@ pub fn flatten_logs( let mut logs = vec![]; for json in data { - if time_partition.is_some() || custom_partitions.is_some() { - logs.append(&mut convert_array_to_object( - json, - time_partition, - time_partition_limit, - custom_partitions, - schema_version, - log_source, - )?) - } else { - logs.push(convert_to_array(convert_array_to_object( - json, - None, - None, - None, - schema_version, - log_source, - )?)?) - } + logs.append(&mut convert_array_to_object( + json, + time_partition, + time_partition_limit, + custom_partitions, + schema_version, + log_source, + )?) 
} Ok(logs) @@ -139,7 +125,7 @@ impl EventFormat for Event { custom_partitions: Option<&String>, schema_version: SchemaVersion, log_source: &LogSource, - ) -> Result<(Self::Data, Vec>, bool), anyhow::Error> { + ) -> anyhow::Result<(Self::Data, Vec>, bool)> { let flattened = flatten_logs( self.json, time_partition, @@ -202,7 +188,7 @@ impl EventFormat for Event { } // Convert the Data type (defined above) to arrow record batch - fn decode(data: Self::Data, schema: Arc) -> Result { + fn decode(data: Self::Data, schema: Arc) -> anyhow::Result { let array_capacity = round_upto_multiple_of_64(data.len()); let mut reader = ReaderBuilder::new(schema) .with_batch_size(array_capacity) @@ -230,7 +216,7 @@ impl EventFormat for Event { schema_version: SchemaVersion, log_source: &LogSource, stream_type: StreamType, - ) -> Result { + ) -> anyhow::Result { let custom_partition_values = match custom_partitions.as_ref() { Some(custom_partitions) => { let custom_partitions = custom_partitions.split(',').collect_vec(); @@ -295,7 +281,7 @@ pub fn extract_custom_partition_values( fn extract_and_parse_time( json: &Value, time_partition: &str, -) -> Result { +) -> anyhow::Result { let current_time = json .get(time_partition) .ok_or_else(|| anyhow!("Missing field for time partition in json: {time_partition}"))?; From 38a52c2cffe8b70d356a502f0cc3f12f8b35a017 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Sat, 1 Mar 2025 13:12:59 +0530 Subject: [PATCH 13/39] remove unused code --- .../http/modal/ingest/ingestor_ingest.rs | 43 ------------------- 1 file changed, 43 deletions(-) delete mode 100644 src/handlers/http/modal/ingest/ingestor_ingest.rs diff --git a/src/handlers/http/modal/ingest/ingestor_ingest.rs b/src/handlers/http/modal/ingest/ingestor_ingest.rs deleted file mode 100644 index 1af6180d4..000000000 --- a/src/handlers/http/modal/ingest/ingestor_ingest.rs +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Parseable Server (C) 2022 - 2024 Parseable, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of the - * License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . 
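
A minimal sketch, not part of the patch series, of the contract the previous patch ("fix: flattening") gives `flatten_logs`: whatever the payload shape, the caller gets back one flattened JSON value per event. The paths, argument order and `None` defaults come from the diffs above; the exact flattened key naming is delegated to the crate's flattening helpers and not reproduced here.

use serde_json::json;

use crate::{
    event::format::{json::flatten_logs, LogSource},
    metadata::SchemaVersion,
};

fn flatten_example() -> anyhow::Result<()> {
    // A single object and an array of objects normalize to the same shape:
    // a Vec with one already-flattened entry per event.
    let single = flatten_logs(
        json!({"a": 1}),
        None,
        None,
        None,
        SchemaVersion::V1,
        &LogSource::Json,
    )?;
    let batch = flatten_logs(
        json!([{"a": 1}, {"a": 2}]),
        None,
        None,
        None,
        SchemaVersion::V1,
        &LogSource::Json,
    )?;
    assert_eq!(single.len(), 1);
    assert_eq!(batch.len(), 2);
    Ok(())
}
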
- * - */ - -use actix_web::{HttpRequest, HttpResponse}; -use bytes::Bytes; - -use crate::{handlers::http::{ingest::PostError, modal::utils::ingest_utils::push_logs}, metadata::PARSEABLE.streams}; - - -// Handler for POST /api/v1/logstream/{logstream} -// only ingests events into the specified logstream -// fails if the logstream does not exist -pub async fn post_event(req: HttpRequest, body: Bytes) -> Result { - let stream_name: String = req.match_info().get("logstream").unwrap().parse().unwrap(); - let internal_stream_names = PARSEABLE.streams.list_internal_streams(); - if internal_stream_names.contains(&stream_name) { - return Err(PostError::Invalid(anyhow::anyhow!( - "Stream {} is an internal stream and cannot be ingested into", - stream_name - ))); - } - if !PARSEABLE.streams.stream_exists(&stream_name) { - return Err(PostError::StreamNotFound(stream_name)); - } - - push_logs(req, body, stream_name).await?; - Ok(HttpResponse::Ok().finish()) -} \ No newline at end of file From a7b2db391b649a0747e265efb09d7630fbcaf314 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Sat, 1 Mar 2025 14:29:21 +0530 Subject: [PATCH 14/39] fix: partitioning --- src/event/format/json.rs | 326 ++++++++++++++++++++++----------- src/event/format/mod.rs | 40 ++-- src/event/mod.rs | 93 +++++----- src/handlers/http/ingest.rs | 12 +- src/handlers/http/logstream.rs | 8 +- src/utils/json/mod.rs | 69 +++---- 6 files changed, 319 insertions(+), 229 deletions(-) diff --git a/src/event/format/json.rs b/src/event/format/json.rs index a056f2bf4..08c132209 100644 --- a/src/event/format/json.rs +++ b/src/event/format/json.rs @@ -39,11 +39,15 @@ use tracing::error; use super::{EventFormat, LogSource}; use crate::{ + event::PartitionEvent, kinesis::{flatten_kinesis_logs, Message}, metadata::SchemaVersion, otel::{logs::flatten_otel_logs, metrics::flatten_otel_metrics, traces::flatten_otel_traces}, storage::StreamType, - utils::{arrow::get_field, json::convert_array_to_object}, + utils::{ + arrow::get_field, + json::{flatten_json_body, Json}, + }, }; pub struct Event { @@ -67,7 +71,7 @@ pub fn flatten_logs( custom_partitions: Option<&String>, schema_version: SchemaVersion, log_source: &LogSource, -) -> anyhow::Result> { +) -> anyhow::Result> { let data = match log_source { LogSource::Kinesis => { //custom flattening required for Amazon Kinesis @@ -94,26 +98,40 @@ pub fn flatten_logs( let mut logs = vec![]; for json in data { - logs.append(&mut convert_array_to_object( + let json = flatten_json_body( json, time_partition, time_partition_limit, custom_partitions, schema_version, + true, log_source, - )?) + )?; + + // incoming event may be a single json or a json array + // but Data (type defined above) is a vector of json values + // hence we need to convert the incoming event to a vector of json values + match json { + Value::Array(arr) => { + for log in arr { + let Value::Object(json) = log else { + return Err(anyhow!( + "Expected an object or a list of objects, received: {log:?}" + )); + }; + logs.push(json); + } + } + Value::Object(obj) => logs.push(obj), + _ => unreachable!("flatten would have failed beforehand"), + } } Ok(logs) } impl EventFormat for Event { - type Data = Vec; - - /// Returns the time at ingestion, i.e. 
the `p_timestamp` value - fn get_p_timestamp(&self) -> DateTime { - self.p_timestamp - } + type Data = Vec; // convert the incoming json to a vector of json values // also extract the arrow schema, tags and metadata from the incoming json @@ -136,17 +154,17 @@ impl EventFormat for Event { )?; // collect all the keys from all the json objects in the request body - let fields = - collect_keys(flattened.iter()).expect("fields can be collected from array of objects"); + let fields = collect_keys(flattened.iter()); let mut is_first = false; let schema = if let Some(schema) = derive_arrow_schema(stored_schema, fields) { schema } else { - let mut infer_schema = infer_json_schema_from_iterator(flattened.iter().map(Ok)) - .map_err(|err| { - anyhow!("Could not infer schema for this event due to err {:?}", err) - })?; + // TODO: + let mut infer_schema = infer_json_schema_from_iterator( + flattened.iter().map(|obj| Ok(Value::Object(obj.clone()))), + ) + .map_err(|err| anyhow!("Could not infer schema for this event due to err {:?}", err))?; let new_infer_schema = super::update_field_type_in_schema( Arc::new(infer_schema), Some(stored_schema), @@ -217,22 +235,9 @@ impl EventFormat for Event { log_source: &LogSource, stream_type: StreamType, ) -> anyhow::Result { - let custom_partition_values = match custom_partitions.as_ref() { - Some(custom_partitions) => { - let custom_partitions = custom_partitions.split(',').collect_vec(); - extract_custom_partition_values(&self.json, &custom_partitions) - } - None => HashMap::new(), - }; - - let parsed_timestamp = match time_partition { - Some(time_partition) => extract_and_parse_time(&self.json, time_partition)?, - _ => self.p_timestamp.naive_utc(), - }; - - let (rb, is_first_event) = self.into_recordbatch( + let p_timestamp = self.p_timestamp; + let (data, schema, is_first_event) = self.to_data( storage_schema, - static_schema_flag, time_partition, time_partition_limit, custom_partitions, @@ -240,15 +245,45 @@ impl EventFormat for Event { log_source, )?; + let mut partitions = vec![]; + for json in data { + let custom_partition_values = match custom_partitions.as_ref() { + Some(custom_partitions) => { + let custom_partitions = custom_partitions.split(',').collect_vec(); + extract_custom_partition_values(&json, &custom_partitions) + } + None => HashMap::new(), + }; + + let parsed_timestamp = match time_partition { + Some(time_partition) => extract_and_parse_time(&json, time_partition)?, + _ => p_timestamp.naive_utc(), + }; + + let rb = Self::into_recordbatch( + p_timestamp, + vec![json], + schema.clone(), + storage_schema, + static_schema_flag, + time_partition, + schema_version, + )?; + + partitions.push(PartitionEvent { + rb, + parsed_timestamp, + custom_partition_values, + }); + } + Ok(super::Event { - rb, stream_name, origin_format: "json", origin_size, is_first_event, - parsed_timestamp, time_partition: None, - custom_partition_values, + partitions, stream_type, }) } @@ -257,7 +292,7 @@ impl EventFormat for Event { /// Extracts custom partition values from provided JSON object /// e.g. `json: {"status": 400, "msg": "Hello, World!"}, custom_partition_list: ["status"]` returns `{"status" => 400}` pub fn extract_custom_partition_values( - json: &Value, + json: &Json, custom_partition_list: &[&str], ) -> HashMap { let mut custom_partition_values: HashMap = HashMap::new(); @@ -278,10 +313,7 @@ pub fn extract_custom_partition_values( /// Returns the parsed timestamp of deignated time partition from json object /// e.g. 
`json: {"timestamp": "2025-05-15T15:30:00Z"}` returns `2025-05-15T15:30:00` -fn extract_and_parse_time( - json: &Value, - time_partition: &str, -) -> anyhow::Result { +fn extract_and_parse_time(json: &Json, time_partition: &str) -> anyhow::Result { let current_time = json .get(time_partition) .ok_or_else(|| anyhow!("Missing field for time partition in json: {time_partition}"))?; @@ -308,28 +340,24 @@ fn derive_arrow_schema( // Returns a list of keys that are present in the given iterable of JSON objects // Returns None if even one of the value is not an Object -fn collect_keys<'a>(values: impl Iterator) -> Option> { +fn collect_keys<'a>(objects: impl Iterator) -> HashSet<&'a str> { let mut keys = HashSet::new(); - for value in values { - let obj = value.as_object()?; - for key in obj.keys() { + for object in objects { + for key in object.keys() { keys.insert(key.as_str()); } } - Some(keys) + keys } // Returns true when the field doesn't exist in schema or has an invalid type -fn fields_mismatch(schema: &[Arc], body: &Value, schema_version: SchemaVersion) -> bool { - body.as_object() - .expect("body is of object variant") - .iter() - .any(|(key, value)| { - !value.is_null() - && get_field(schema, key) - .is_none_or(|field| !valid_type(field.data_type(), value, schema_version)) - }) +fn fields_mismatch(schema: &[Arc], body: &Json, schema_version: SchemaVersion) -> bool { + body.iter().any(|(key, value)| { + !value.is_null() + && get_field(schema, key) + .is_none_or(|field| !valid_type(field.data_type(), value, schema_version)) + }) } fn valid_type(data_type: &DataType, value: &Value, schema_version: SchemaVersion) -> bool { @@ -400,7 +428,7 @@ mod tests { #[test] fn parse_time_parition_from_value() { let json = json!({"timestamp": "2025-05-15T15:30:00Z"}); - let parsed = extract_and_parse_time(&json, "timestamp"); + let parsed = extract_and_parse_time(json.as_object().unwrap(), "timestamp"); let expected = NaiveDateTime::from_str("2025-05-15T15:30:00").unwrap(); assert_eq!(parsed.unwrap(), expected); @@ -409,7 +437,7 @@ mod tests { #[test] fn time_parition_not_in_json() { let json = json!({"hello": "world!"}); - let parsed = extract_and_parse_time(&json, "timestamp"); + let parsed = extract_and_parse_time(json.as_object().unwrap(), "timestamp"); assert!(parsed.is_err()); } @@ -417,7 +445,7 @@ mod tests { #[test] fn time_parition_not_parseable_as_datetime() { let json = json!({"timestamp": "not time"}); - let parsed = extract_and_parse_time(&json, "timestamp"); + let parsed = extract_and_parse_time(json.as_object().unwrap(), "timestamp"); assert!(parsed.is_err()); } @@ -454,10 +482,10 @@ mod tests { "b": "hello", }); - let (rb, _) = Event::new(json) - .into_recordbatch( - &HashMap::default(), - false, + let store_schema = HashMap::default(); + let (data, schema, _) = Event::new(json) + .to_data( + &store_schema, None, None, None, @@ -465,6 +493,16 @@ mod tests { &LogSource::Json, ) .unwrap(); + let rb = Event::into_recordbatch( + Utc::now(), + data, + schema, + &store_schema, + false, + None, + SchemaVersion::V0, + ) + .unwrap(); assert_eq!(rb.num_rows(), 1); assert_eq!(rb.num_columns(), 4); @@ -490,10 +528,10 @@ mod tests { "c": null }); - let (rb, _) = Event::new(json) - .into_recordbatch( - &HashMap::default(), - false, + let store_schema = HashMap::default(); + let (data, schema, _) = Event::new(json) + .to_data( + &store_schema, None, None, None, @@ -501,6 +539,16 @@ mod tests { &LogSource::Json, ) .unwrap(); + let rb = Event::into_recordbatch( + Utc::now(), + data, + schema, + 
&store_schema, + false, + None, + SchemaVersion::V0, + ) + .unwrap(); assert_eq!(rb.num_rows(), 1); assert_eq!(rb.num_columns(), 3); @@ -521,7 +569,7 @@ mod tests { "b": "hello", }); - let schema = fields_to_map( + let store_schema = fields_to_map( [ Field::new("a", DataType::Int64, true), Field::new("b", DataType::Utf8, true), @@ -529,11 +577,9 @@ mod tests { ] .into_iter(), ); - - let (rb, _) = Event::new(json) - .into_recordbatch( - &schema, - false, + let (data, schema, _) = Event::new(json) + .to_data( + &store_schema, None, None, None, @@ -541,6 +587,16 @@ mod tests { &LogSource::Json, ) .unwrap(); + let rb = Event::into_recordbatch( + Utc::now(), + data, + schema, + &store_schema, + false, + None, + SchemaVersion::V0, + ) + .unwrap(); assert_eq!(rb.num_rows(), 1); assert_eq!(rb.num_columns(), 3); @@ -561,7 +617,7 @@ mod tests { "b": 1, // type mismatch }); - let schema = fields_to_map( + let store_schema = fields_to_map( [ Field::new("a", DataType::Int64, true), Field::new("b", DataType::Utf8, true), @@ -571,14 +627,13 @@ mod tests { ); assert!(Event::new(json) - .into_recordbatch( - &schema, - false, + .to_data( + &store_schema, None, None, None, SchemaVersion::V0, - &LogSource::Json + &LogSource::Json, ) .is_err()); } @@ -587,7 +642,7 @@ mod tests { fn empty_object() { let json = json!({}); - let schema = fields_to_map( + let store_schema = fields_to_map( [ Field::new("a", DataType::Int64, true), Field::new("b", DataType::Utf8, true), @@ -596,10 +651,9 @@ mod tests { .into_iter(), ); - let (rb, _) = Event::new(json) - .into_recordbatch( - &schema, - false, + let (data, schema, _) = Event::new(json) + .to_data( + &store_schema, None, None, None, @@ -607,6 +661,16 @@ mod tests { &LogSource::Json, ) .unwrap(); + let rb = Event::into_recordbatch( + Utc::now(), + data, + schema, + &store_schema, + false, + None, + SchemaVersion::V0, + ) + .unwrap(); assert_eq!(rb.num_rows(), 1); assert_eq!(rb.num_columns(), 1); @@ -630,10 +694,10 @@ mod tests { }, ]); - let (rb, _) = Event::new(json) - .into_recordbatch( - &HashMap::default(), - false, + let store_schema = HashMap::new(); + let (data, schema, _) = Event::new(json) + .to_data( + &store_schema, None, None, None, @@ -641,6 +705,16 @@ mod tests { &LogSource::Json, ) .unwrap(); + let rb = Event::into_recordbatch( + Utc::now(), + data, + schema, + &store_schema, + false, + None, + SchemaVersion::V0, + ) + .unwrap(); assert_eq!(rb.num_rows(), 3); assert_eq!(rb.num_columns(), 4); @@ -686,10 +760,10 @@ mod tests { }, ]); - let (rb, _) = Event::new(json) - .into_recordbatch( - &HashMap::default(), - false, + let store_schema = HashMap::new(); + let (data, schema, _) = Event::new(json) + .to_data( + &store_schema, None, None, None, @@ -697,6 +771,16 @@ mod tests { &LogSource::Json, ) .unwrap(); + let rb = Event::into_recordbatch( + Utc::now(), + data, + schema, + &store_schema, + false, + None, + SchemaVersion::V0, + ) + .unwrap(); assert_eq!(rb.num_rows(), 3); assert_eq!(rb.num_columns(), 4); @@ -734,7 +818,7 @@ mod tests { }, ]); - let schema = fields_to_map( + let store_schema = fields_to_map( [ Field::new("a", DataType::Int64, true), Field::new("b", DataType::Utf8, true), @@ -742,11 +826,9 @@ mod tests { ] .into_iter(), ); - - let (rb, _) = Event::new(json) - .into_recordbatch( - &schema, - false, + let (data, schema, _) = Event::new(json) + .to_data( + &store_schema, None, None, None, @@ -754,6 +836,16 @@ mod tests { &LogSource::Json, ) .unwrap(); + let rb = Event::into_recordbatch( + Utc::now(), + data, + schema, + &store_schema, + false, + 
None, + SchemaVersion::V0, + ) + .unwrap(); assert_eq!(rb.num_rows(), 3); assert_eq!(rb.num_columns(), 4); @@ -791,7 +883,7 @@ mod tests { }, ]); - let schema = fields_to_map( + let store_schema = fields_to_map( [ Field::new("a", DataType::Int64, true), Field::new("b", DataType::Utf8, true), @@ -801,14 +893,13 @@ mod tests { ); assert!(Event::new(json) - .into_recordbatch( - &schema, - false, + .to_data( + &store_schema, None, None, None, SchemaVersion::V0, - &LogSource::Json + &LogSource::Json, ) .is_err()); } @@ -837,10 +928,10 @@ mod tests { }, ]); - let (rb, _) = Event::new(json) - .into_recordbatch( - &HashMap::default(), - false, + let store_schema = HashMap::new(); + let (data, schema, _) = Event::new(json) + .to_data( + &store_schema, None, None, None, @@ -848,6 +939,17 @@ mod tests { &LogSource::Json, ) .unwrap(); + let rb = Event::into_recordbatch( + Utc::now(), + data, + schema, + &store_schema, + false, + None, + SchemaVersion::V0, + ) + .unwrap(); + assert_eq!(rb.num_rows(), 4); assert_eq!(rb.num_columns(), 5); assert_eq!( @@ -917,10 +1019,10 @@ mod tests { }, ]); - let (rb, _) = Event::new(json) - .into_recordbatch( - &HashMap::default(), - false, + let store_schema = HashMap::new(); + let (data, schema, _) = Event::new(json) + .to_data( + &store_schema, None, None, None, @@ -928,6 +1030,16 @@ mod tests { &LogSource::Json, ) .unwrap(); + let rb = Event::into_recordbatch( + Utc::now(), + data, + schema, + &store_schema, + false, + None, + SchemaVersion::V1, + ) + .unwrap(); assert_eq!(rb.num_rows(), 4); assert_eq!(rb.num_columns(), 5); diff --git a/src/event/format/mod.rs b/src/event/format/mod.rs index bf259159a..a6b977d7c 100644 --- a/src/event/format/mod.rs +++ b/src/event/format/mod.rs @@ -34,7 +34,10 @@ use serde_json::Value; use crate::{ metadata::SchemaVersion, storage::StreamType, - utils::arrow::{get_field, get_timestamp_array, replace_columns}, + utils::{ + arrow::{get_field, get_timestamp_array, replace_columns}, + json::Json, + }, }; use super::{Event, DEFAULT_TIMESTAMP_KEY}; @@ -110,30 +113,16 @@ pub trait EventFormat: Sized { fn decode(data: Self::Data, schema: Arc) -> Result; - /// Returns the UTC time at ingestion - fn get_p_timestamp(&self) -> DateTime; - #[allow(clippy::too_many_arguments)] fn into_recordbatch( - self, + p_timestamp: DateTime, + data: Self::Data, + mut schema: EventSchema, storage_schema: &HashMap>, static_schema_flag: bool, time_partition: Option<&String>, - time_partition_limit: Option, - custom_partition: Option<&String>, schema_version: SchemaVersion, - log_source: &LogSource, - ) -> Result<(RecordBatch, bool), AnyError> { - let p_timestamp = self.get_p_timestamp(); - let (data, mut schema, is_first) = self.to_data( - storage_schema, - time_partition, - time_partition_limit, - custom_partition, - schema_version, - log_source, - )?; - + ) -> Result { if get_field(&schema, DEFAULT_TIMESTAMP_KEY).is_some() { return Err(anyhow!( "field {} is a reserved field", @@ -162,7 +151,7 @@ pub trait EventFormat: Sized { } // prepare the record batch and new fields to be added - let mut new_schema = Arc::new(Schema::new(schema)); + let mut new_schema = Arc::new(Schema::new(schema.clone())); new_schema = update_field_type_in_schema(new_schema, None, time_partition, None, schema_version); @@ -173,7 +162,7 @@ pub trait EventFormat: Sized { &[(0, Arc::new(get_timestamp_array(p_timestamp, rb.num_rows())))], ); - Ok((rb, is_first)) + Ok(rb) } #[allow(clippy::too_many_arguments)] @@ -247,7 +236,7 @@ pub fn update_field_type_in_schema( inferred_schema: Arc, 
existing_schema: Option<&HashMap>>, time_partition: Option<&String>, - log_records: Option<&Vec>, + log_records: Option<&[Json]>, schema_version: SchemaVersion, ) -> Arc { let mut updated_schema = inferred_schema.clone(); @@ -292,18 +281,15 @@ pub fn update_field_type_in_schema( // a string value parseable into timestamp as timestamp type and all numbers as float64. pub fn override_data_type( inferred_schema: Arc, - log_record: Value, + log_record: Json, schema_version: SchemaVersion, ) -> Arc { - let Value::Object(map) = log_record else { - return inferred_schema; - }; let updated_schema: Vec = inferred_schema .fields() .iter() .map(|field| { let field_name = field.name().as_str(); - match (schema_version, map.get(field.name())) { + match (schema_version, log_record.get(field.name())) { // in V1 for new fields in json named "time"/"date" or such and having inferred // type string, that can be parsed as timestamp, use the timestamp type. // NOTE: support even more datetime string formats diff --git a/src/event/mod.rs b/src/event/mod.rs index 29a4a0899..e17bbedec 100644 --- a/src/event/mod.rs +++ b/src/event/mod.rs @@ -36,69 +36,78 @@ use std::collections::HashMap; pub const DEFAULT_TIMESTAMP_KEY: &str = "p_timestamp"; -#[derive(Clone)] +pub struct PartitionEvent { + pub rb: RecordBatch, + pub parsed_timestamp: NaiveDateTime, + pub custom_partition_values: HashMap, +} + pub struct Event { pub stream_name: String, - pub rb: RecordBatch, pub origin_format: &'static str, pub origin_size: u64, pub is_first_event: bool, - pub parsed_timestamp: NaiveDateTime, pub time_partition: Option, - pub custom_partition_values: HashMap, + pub partitions: Vec, pub stream_type: StreamType, } // Events holds the schema related to a each event for a single log stream impl Event { pub fn process(self) -> Result<(), EventError> { - let mut key = get_schema_key(&self.rb.schema().fields); - if self.time_partition.is_some() { - let parsed_timestamp_to_min = self.parsed_timestamp.format("%Y%m%dT%H%M").to_string(); - key.push_str(&parsed_timestamp_to_min); - } + for partition in self.partitions { + let mut key = get_schema_key(&partition.rb.schema().fields); + if self.time_partition.is_some() { + let parsed_timestamp_to_min = + partition.parsed_timestamp.format("%Y%m%dT%H%M").to_string(); + key.push_str(&parsed_timestamp_to_min); + } - if !self.custom_partition_values.is_empty() { - for (k, v) in self.custom_partition_values.iter().sorted_by_key(|v| v.0) { + for (k, v) in partition + .custom_partition_values + .iter() + .sorted_by_key(|v| v.0) + { key.push_str(&format!("&{k}={v}")); } - } - - if self.is_first_event { - commit_schema(&self.stream_name, self.rb.schema())?; - } - - PARSEABLE.get_or_create_stream(&self.stream_name).push( - &key, - &self.rb, - self.parsed_timestamp, - &self.custom_partition_values, - self.stream_type, - )?; - update_stats( - &self.stream_name, - self.origin_format, - self.origin_size, - self.rb.num_rows(), - self.parsed_timestamp.date(), - ); - - crate::livetail::LIVETAIL.process(&self.stream_name, &self.rb); + if self.is_first_event { + commit_schema(&self.stream_name, partition.rb.schema())?; + } + PARSEABLE.get_or_create_stream(&self.stream_name).push( + &key, + &partition.rb, + partition.parsed_timestamp, + &partition.custom_partition_values, + self.stream_type, + )?; + + update_stats( + &self.stream_name, + self.origin_format, + self.origin_size, + partition.rb.num_rows(), + partition.parsed_timestamp.date(), + ); + + crate::livetail::LIVETAIL.process(&self.stream_name, 
&partition.rb); + } Ok(()) } pub fn process_unchecked(&self) -> Result<(), EventError> { - let key = get_schema_key(&self.rb.schema().fields); - - PARSEABLE.get_or_create_stream(&self.stream_name).push( - &key, - &self.rb, - self.parsed_timestamp, - &self.custom_partition_values, - self.stream_type, - )?; + for partition in &self.partitions { + let key = get_schema_key(&partition.rb.schema().fields); + + PARSEABLE.get_or_create_stream(&self.stream_name).push( + &key, + &partition.rb, + partition.parsed_timestamp, + &partition.custom_partition_values, + self.stream_type, + )?; + } Ok(()) } diff --git a/src/handlers/http/ingest.rs b/src/handlers/http/ingest.rs index 740269d40..a50b0845d 100644 --- a/src/handlers/http/ingest.rs +++ b/src/handlers/http/ingest.rs @@ -26,9 +26,9 @@ use chrono::Utc; use http::StatusCode; use serde_json::Value; -use crate::event; use crate::event::error::EventError; use crate::event::format::LogSource; +use crate::event::{self, PartitionEvent}; use crate::handlers::{LOG_SOURCE_KEY, STREAM_NAME_HEADER_KEY}; use crate::option::Mode; use crate::parseable::{StreamNotFound, PARSEABLE}; @@ -243,14 +243,16 @@ pub async fn push_logs_unchecked( stream_name: &str, ) -> Result { let unchecked_event = event::Event { - rb: batches, stream_name: stream_name.to_string(), origin_format: "json", origin_size: 0, - parsed_timestamp: Utc::now().naive_utc(), time_partition: None, - is_first_event: true, // NOTE: Maybe should be false - custom_partition_values: HashMap::new(), // should be an empty map for unchecked push + is_first_event: true, // NOTE: Maybe should be false + partitions: vec![PartitionEvent { + rb: batches, + parsed_timestamp: Utc::now().naive_utc(), + custom_partition_values: HashMap::new(), // should be an empty map for unchecked push + }], stream_type: StreamType::UserDefined, }; unchecked_event.process_unchecked()?; diff --git a/src/handlers/http/logstream.rs b/src/handlers/http/logstream.rs index b9fb64edc..f486338fe 100644 --- a/src/handlers/http/logstream.rs +++ b/src/handlers/http/logstream.rs @@ -114,7 +114,13 @@ pub async fn detect_schema(Json(json): Json) -> Result; + /// calls the function `flatten_json` which results Vec or Error /// in case when Vec is returned, converts the Vec to Value of Array /// this is to ensure recursive flattening does not happen for heavily nested jsons @@ -61,32 +63,8 @@ pub fn flatten_json_body( custom_partition, validation_required, )?; - Ok(nested_value) -} -pub fn convert_array_to_object( - body: Value, - time_partition: Option<&String>, - time_partition_limit: Option, - custom_partition: Option<&String>, - schema_version: SchemaVersion, - log_source: &LogSource, -) -> Result, anyhow::Error> { - let data = flatten_json_body( - body, - time_partition, - time_partition_limit, - custom_partition, - schema_version, - true, - log_source, - )?; - let value_arr = match data { - Value::Array(arr) => arr, - value @ Value::Object(_) => vec![value], - _ => unreachable!("flatten would have failed beforehand"), - }; - Ok(value_arr) + Ok(nested_value) } struct TrueFromStr; @@ -283,12 +261,13 @@ mod tests { fn non_object_arr_is_err() { let json = json!([1]); - assert!(convert_array_to_object( + assert!(flatten_json_body( json, None, None, None, SchemaVersion::V0, + false, &crate::event::format::LogSource::default() ) .is_err()) @@ -316,16 +295,14 @@ mod tests { "c": [{"a": 1, "b": 2}] }, ]); - let flattened_json = convert_to_array( - convert_array_to_object( - json, - None, - None, - None, - SchemaVersion::V0, - 
&crate::event::format::LogSource::default(), - ) - .unwrap(), + let flattened_json = flatten_json_body( + json, + None, + None, + None, + SchemaVersion::V0, + false, + &crate::event::format::LogSource::default(), ) .unwrap(); @@ -377,16 +354,14 @@ mod tests { "c": [{"a": 1, "b": 2}] }, ]); - let flattened_json = convert_to_array( - convert_array_to_object( - json, - None, - None, - None, - SchemaVersion::V1, - &crate::event::format::LogSource::default(), - ) - .unwrap(), + let flattened_json = flatten_json_body( + json, + None, + None, + None, + SchemaVersion::V1, + false, + &crate::event::format::LogSource::default(), ) .unwrap(); From dc34a8513f6caf6e5c1621e3284aea25b3f15dfa Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Sat, 1 Mar 2025 14:56:27 +0530 Subject: [PATCH 15/39] refactor: share `Stream` state when processing --- src/event/format/json.rs | 34 +++++++++++++++++----------------- src/event/format/mod.rs | 12 ++---------- src/event/mod.rs | 17 ++++++++--------- src/handlers/http/ingest.rs | 7 +++---- src/parseable/mod.rs | 2 +- src/parseable/streams.rs | 23 ++--------------------- src/utils/arrow/flight.rs | 8 ++++---- 7 files changed, 37 insertions(+), 66 deletions(-) diff --git a/src/event/format/json.rs b/src/event/format/json.rs index 08c132209..61cb373f1 100644 --- a/src/event/format/json.rs +++ b/src/event/format/json.rs @@ -43,7 +43,7 @@ use crate::{ kinesis::{flatten_kinesis_logs, Message}, metadata::SchemaVersion, otel::{logs::flatten_otel_logs, metrics::flatten_otel_metrics, traces::flatten_otel_traces}, - storage::StreamType, + parseable::Stream, utils::{ arrow::get_field, json::{flatten_json_body, Json}, @@ -224,23 +224,24 @@ impl EventFormat for Event { /// Converts a JSON event into a Parseable Event fn into_event( self, - stream_name: String, origin_size: u64, - storage_schema: &HashMap>, - static_schema_flag: bool, - custom_partitions: Option<&String>, - time_partition: Option<&String>, - time_partition_limit: Option, - schema_version: SchemaVersion, + stream: &Stream, log_source: &LogSource, - stream_type: StreamType, ) -> anyhow::Result { + let time_partition = stream.get_time_partition(); + let time_partition_limit = stream.get_time_partition_limit(); + let static_schema_flag = stream.get_static_schema_flag(); + let custom_partitions = stream.get_custom_partition(); + let schema_version = stream.get_schema_version(); + let storage_schema = stream.get_schema_raw(); + let stream_type = stream.get_stream_type(); + let p_timestamp = self.p_timestamp; let (data, schema, is_first_event) = self.to_data( - storage_schema, - time_partition, + &storage_schema, + time_partition.as_ref(), time_partition_limit, - custom_partitions, + custom_partitions.as_ref(), schema_version, log_source, )?; @@ -255,8 +256,8 @@ impl EventFormat for Event { None => HashMap::new(), }; - let parsed_timestamp = match time_partition { - Some(time_partition) => extract_and_parse_time(&json, time_partition)?, + let parsed_timestamp = match time_partition.as_ref() { + Some(time_partition) => extract_and_parse_time(&json, time_partition.as_ref())?, _ => p_timestamp.naive_utc(), }; @@ -264,9 +265,9 @@ impl EventFormat for Event { p_timestamp, vec![json], schema.clone(), - storage_schema, + &storage_schema, static_schema_flag, - time_partition, + time_partition.as_ref(), schema_version, )?; @@ -278,7 +279,6 @@ impl EventFormat for Event { } Ok(super::Event { - stream_name, origin_format: "json", origin_size, is_first_event, diff --git a/src/event/format/mod.rs b/src/event/format/mod.rs index 
a6b977d7c..2d181fc7a 100644 --- a/src/event/format/mod.rs +++ b/src/event/format/mod.rs @@ -33,7 +33,7 @@ use serde_json::Value; use crate::{ metadata::SchemaVersion, - storage::StreamType, + parseable::Stream, utils::{ arrow::{get_field, get_timestamp_array, replace_columns}, json::Json, @@ -165,19 +165,11 @@ pub trait EventFormat: Sized { Ok(rb) } - #[allow(clippy::too_many_arguments)] fn into_event( self, - stream_name: String, origin_size: u64, - storage_schema: &HashMap>, - static_schema_flag: bool, - custom_partitions: Option<&String>, - time_partition: Option<&String>, - time_partition_limit: Option, - schema_version: SchemaVersion, + stream: &Stream, log_source: &LogSource, - stream_type: StreamType, ) -> Result; } diff --git a/src/event/mod.rs b/src/event/mod.rs index e17bbedec..a0c803340 100644 --- a/src/event/mod.rs +++ b/src/event/mod.rs @@ -27,7 +27,7 @@ use std::sync::Arc; use self::error::EventError; use crate::{ metadata::update_stats, - parseable::{StagingError, PARSEABLE}, + parseable::{StagingError, Stream, PARSEABLE}, storage::StreamType, LOCK_EXPECT, }; @@ -43,7 +43,6 @@ pub struct PartitionEvent { } pub struct Event { - pub stream_name: String, pub origin_format: &'static str, pub origin_size: u64, pub is_first_event: bool, @@ -54,7 +53,7 @@ pub struct Event { // Events holds the schema related to a each event for a single log stream impl Event { - pub fn process(self) -> Result<(), EventError> { + pub fn process(self, stream: &Stream) -> Result<(), EventError> { for partition in self.partitions { let mut key = get_schema_key(&partition.rb.schema().fields); if self.time_partition.is_some() { @@ -72,10 +71,10 @@ impl Event { } if self.is_first_event { - commit_schema(&self.stream_name, partition.rb.schema())?; + commit_schema(&stream.stream_name, partition.rb.schema())?; } - PARSEABLE.get_or_create_stream(&self.stream_name).push( + stream.push( &key, &partition.rb, partition.parsed_timestamp, @@ -84,23 +83,23 @@ impl Event { )?; update_stats( - &self.stream_name, + &stream.stream_name, self.origin_format, self.origin_size, partition.rb.num_rows(), partition.parsed_timestamp.date(), ); - crate::livetail::LIVETAIL.process(&self.stream_name, &partition.rb); + crate::livetail::LIVETAIL.process(&stream.stream_name, &partition.rb); } Ok(()) } - pub fn process_unchecked(&self) -> Result<(), EventError> { + pub fn process_unchecked(&self, stream: &Stream) -> Result<(), EventError> { for partition in &self.partitions { let key = get_schema_key(&partition.rb.schema().fields); - PARSEABLE.get_or_create_stream(&self.stream_name).push( + stream.push( &key, &partition.rb, partition.parsed_timestamp, diff --git a/src/handlers/http/ingest.rs b/src/handlers/http/ingest.rs index a50b0845d..75c8ccc82 100644 --- a/src/handlers/http/ingest.rs +++ b/src/handlers/http/ingest.rs @@ -31,7 +31,7 @@ use crate::event::format::LogSource; use crate::event::{self, PartitionEvent}; use crate::handlers::{LOG_SOURCE_KEY, STREAM_NAME_HEADER_KEY}; use crate::option::Mode; -use crate::parseable::{StreamNotFound, PARSEABLE}; +use crate::parseable::{Stream, StreamNotFound, PARSEABLE}; use crate::storage::{ObjectStorageError, StreamType}; use crate::utils::header_parsing::ParseHeaderError; use crate::utils::json::flatten::JsonFlattenError; @@ -240,10 +240,9 @@ pub async fn post_event( pub async fn push_logs_unchecked( batches: RecordBatch, - stream_name: &str, + stream: &Stream, ) -> Result { let unchecked_event = event::Event { - stream_name: stream_name.to_string(), origin_format: "json", origin_size: 0, 
time_partition: None, @@ -255,7 +254,7 @@ pub async fn push_logs_unchecked( }], stream_type: StreamType::UserDefined, }; - unchecked_event.process_unchecked()?; + unchecked_event.process_unchecked(stream)?; Ok(unchecked_event) } diff --git a/src/parseable/mod.rs b/src/parseable/mod.rs index 60ec06b55..9437b3916 100644 --- a/src/parseable/mod.rs +++ b/src/parseable/mod.rs @@ -28,7 +28,7 @@ use http::{header::CONTENT_TYPE, HeaderName, HeaderValue, StatusCode}; use once_cell::sync::Lazy; pub use staging::StagingError; use streams::StreamRef; -pub use streams::{StreamNotFound, Streams}; +pub use streams::{Stream, StreamNotFound, Streams}; use tracing::error; #[cfg(feature = "kafka")] diff --git a/src/parseable/streams.rs b/src/parseable/streams.rs index d9cd373ad..1982c147d 100644 --- a/src/parseable/streams.rs +++ b/src/parseable/streams.rs @@ -114,30 +114,11 @@ impl Stream { } pub async fn push_logs(&self, json: Value, log_source: &LogSource) -> anyhow::Result<()> { - let time_partition = self.get_time_partition(); - let time_partition_limit = self.get_time_partition_limit(); - let static_schema_flag = self.get_static_schema_flag(); - let custom_partition = self.get_custom_partition(); - let schema_version = self.get_schema_version(); - let schema = self.get_schema_raw(); - let stream_type = self.get_stream_type(); - let origin_size = serde_json::to_vec(&json).unwrap().len() as u64; // string length need not be the same as byte length json::Event::new(json) - .into_event( - self.stream_name.to_owned(), - origin_size, - &schema, - static_schema_flag, - custom_partition.as_ref(), - time_partition.as_ref(), - time_partition_limit, - schema_version, - log_source, - stream_type, - )? - .process()?; + .into_event(origin_size, self, log_source)? + .process(self)?; Ok(()) } diff --git a/src/utils/arrow/flight.rs b/src/utils/arrow/flight.rs index c8d2dacf2..57ea88b67 100644 --- a/src/utils/arrow/flight.rs +++ b/src/utils/arrow/flight.rs @@ -95,14 +95,14 @@ pub async fn append_temporary_events( Event, Status, > { - let schema = PARSEABLE + let stream = PARSEABLE .get_stream(stream_name) - .map_err(|err| Status::failed_precondition(format!("Metadata Error: {}", err)))? 
- .get_schema(); + .map_err(|err| Status::failed_precondition(format!("Metadata Error: {}", err)))?; + let schema = stream.get_schema(); let rb = concat_batches(&schema, minute_result) .map_err(|err| Status::failed_precondition(format!("ArrowError: {}", err)))?; - let event = push_logs_unchecked(rb, stream_name) + let event = push_logs_unchecked(rb, &stream) .await .map_err(|err| Status::internal(err.to_string()))?; Ok(event) From 19708df571a7f631aaafecb47cea434692939385 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Sat, 1 Mar 2025 18:54:43 +0530 Subject: [PATCH 16/39] refactor: `Parseable::commit_schema` --- src/event/mod.rs | 29 ++++------------------------- src/handlers/http/query.rs | 6 +++--- src/parseable/streams.rs | 13 +++++++++++++ 3 files changed, 20 insertions(+), 28 deletions(-) diff --git a/src/event/mod.rs b/src/event/mod.rs index a0c803340..f1de15011 100644 --- a/src/event/mod.rs +++ b/src/event/mod.rs @@ -20,17 +20,12 @@ pub mod format; use arrow_array::RecordBatch; -use arrow_schema::{Field, Fields, Schema}; +use arrow_schema::Field; use itertools::Itertools; use std::sync::Arc; use self::error::EventError; -use crate::{ - metadata::update_stats, - parseable::{StagingError, Stream, PARSEABLE}, - storage::StreamType, - LOCK_EXPECT, -}; +use crate::{metadata::update_stats, parseable::Stream, storage::StreamType}; use chrono::NaiveDateTime; use std::collections::HashMap; @@ -71,7 +66,8 @@ impl Event { } if self.is_first_event { - commit_schema(&stream.stream_name, partition.rb.schema())?; + let schema = partition.rb.schema().as_ref().clone(); + stream.commit_schema(schema)?; } stream.push( @@ -122,23 +118,6 @@ pub fn get_schema_key(fields: &[Arc]) -> String { format!("{hash:x}") } -pub fn commit_schema(stream_name: &str, schema: Arc) -> Result<(), StagingError> { - let mut stream_metadata = PARSEABLE.streams.write().expect("lock poisoned"); - - let map = &mut stream_metadata - .get_mut(stream_name) - .expect("map has entry for this stream name") - .metadata - .write() - .expect(LOCK_EXPECT) - .schema; - let current_schema = Schema::new(map.values().cloned().collect::()); - let schema = Schema::try_merge(vec![current_schema, schema.as_ref().clone()])?; - map.clear(); - map.extend(schema.fields.iter().map(|f| (f.name().clone(), f.clone()))); - Ok(()) -} - pub mod error { use crate::{parseable::StagingError, storage::ObjectStorageError}; diff --git a/src/handlers/http/query.rs b/src/handlers/http/query.rs index 3b6f4dedf..d89e3ef63 100644 --- a/src/handlers/http/query.rs +++ b/src/handlers/http/query.rs @@ -29,14 +29,12 @@ use serde::{Deserialize, Serialize}; use serde_json::{json, Value}; use std::collections::HashMap; use std::pin::Pin; -use std::sync::Arc; use std::time::Instant; use tracing::error; use crate::event::error::EventError; use crate::handlers::http::fetch_schema; -use crate::event::commit_schema; use crate::metrics::QUERY_EXECUTE_TIME; use crate::option::Mode; use crate::parseable::PARSEABLE; @@ -174,7 +172,9 @@ pub async fn update_schema_when_distributed(tables: &Vec) -> Result<(), // commit schema merges the schema internally and updates the schema in storage. 
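
The additions just below (here and in `streams.rs`) move schema merging behind `Stream::commit_schema`. A minimal sketch, not part of the patch, of the merge semantics it leans on, using arrow's `Schema::try_merge` exactly as the new method does:

use arrow_schema::{ArrowError, DataType, Field, Schema};

fn merge_example() -> Result<(), ArrowError> {
    let current = Schema::new(vec![Field::new("a", DataType::Int64, true)]);
    let incoming = Schema::new(vec![Field::new("b", DataType::Utf8, true)]);
    // Field lists are unioned by name; a field present in both schemas with
    // incompatible data types would make this return an error instead.
    let merged = Schema::try_merge([current, incoming])?;
    assert_eq!(merged.fields().len(), 2);
    Ok(())
}
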
commit_schema_to_storage(table, new_schema.clone()).await?; - commit_schema(table, Arc::new(new_schema))?; + PARSEABLE + .get_or_create_stream(table) + .commit_schema(new_schema)?; } } } diff --git a/src/parseable/streams.rs b/src/parseable/streams.rs index 1982c147d..cd3082d61 100644 --- a/src/parseable/streams.rs +++ b/src/parseable/streams.rs @@ -624,6 +624,19 @@ impl Stream { Arc::new(Schema::new(fields)) } + pub fn commit_schema(&self, schema: Schema) -> Result<(), StagingError> { + let current_schema = self.get_schema().as_ref().clone(); + let updated_schema = Schema::try_merge([current_schema, schema])? + .fields + .into_iter() + .map(|field| (field.name().to_owned(), field.clone())) + .collect(); + + self.metadata.write().expect(LOCK_EXPECT).schema = updated_schema; + + Ok(()) + } + pub fn get_schema_raw(&self) -> HashMap> { self.metadata.read().expect(LOCK_EXPECT).schema.clone() } From 7215e8e7d5998ac4e722f86d08ef06e75182da36 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Sat, 1 Mar 2025 20:33:53 +0530 Subject: [PATCH 17/39] map schema keys to recordbatches --- src/event/format/json.rs | 28 +++++++++++++++++++++------- src/event/mod.rs | 25 ++++--------------------- src/handlers/http/ingest.rs | 21 +++++++++++++-------- 3 files changed, 38 insertions(+), 36 deletions(-) diff --git a/src/event/format/json.rs b/src/event/format/json.rs index 61cb373f1..6ee9310be 100644 --- a/src/event/format/json.rs +++ b/src/event/format/json.rs @@ -39,7 +39,7 @@ use tracing::error; use super::{EventFormat, LogSource}; use crate::{ - event::PartitionEvent, + event::{get_schema_key, PartitionEvent}, kinesis::{flatten_kinesis_logs, Message}, metadata::SchemaVersion, otel::{logs::flatten_otel_logs, metrics::flatten_otel_metrics, traces::flatten_otel_traces}, @@ -246,7 +246,7 @@ impl EventFormat for Event { log_source, )?; - let mut partitions = vec![]; + let mut partitions = HashMap::new(); for json in data { let custom_partition_values = match custom_partitions.as_ref() { Some(custom_partitions) => { @@ -271,11 +271,25 @@ impl EventFormat for Event { schema_version, )?; - partitions.push(PartitionEvent { - rb, - parsed_timestamp, - custom_partition_values, - }); + let schema = rb.schema(); + let mut key = get_schema_key(&schema.fields); + if time_partition.is_some() { + let parsed_timestamp_to_min = parsed_timestamp.format("%Y%m%dT%H%M").to_string(); + key.push_str(&parsed_timestamp_to_min); + } + + for (k, v) in custom_partition_values.iter().sorted_by_key(|v| v.0) { + key.push_str(&format!("&{k}={v}")); + } + + partitions.insert( + key, + PartitionEvent { + rb, + parsed_timestamp, + custom_partition_values, + }, + ); } Ok(super::Event { diff --git a/src/event/mod.rs b/src/event/mod.rs index f1de15011..30ee4abf9 100644 --- a/src/event/mod.rs +++ b/src/event/mod.rs @@ -42,29 +42,14 @@ pub struct Event { pub origin_size: u64, pub is_first_event: bool, pub time_partition: Option, - pub partitions: Vec, + pub partitions: HashMap, pub stream_type: StreamType, } // Events holds the schema related to a each event for a single log stream impl Event { pub fn process(self, stream: &Stream) -> Result<(), EventError> { - for partition in self.partitions { - let mut key = get_schema_key(&partition.rb.schema().fields); - if self.time_partition.is_some() { - let parsed_timestamp_to_min = - partition.parsed_timestamp.format("%Y%m%dT%H%M").to_string(); - key.push_str(&parsed_timestamp_to_min); - } - - for (k, v) in partition - .custom_partition_values - .iter() - .sorted_by_key(|v| v.0) - { - 
key.push_str(&format!("&{k}={v}")); - } - + for (key, partition) in self.partitions { if self.is_first_event { let schema = partition.rb.schema().as_ref().clone(); stream.commit_schema(schema)?; @@ -92,11 +77,9 @@ impl Event { } pub fn process_unchecked(&self, stream: &Stream) -> Result<(), EventError> { - for partition in &self.partitions { - let key = get_schema_key(&partition.rb.schema().fields); - + for (key, partition) in &self.partitions { stream.push( - &key, + key, &partition.rb, partition.parsed_timestamp, &partition.custom_partition_values, diff --git a/src/handlers/http/ingest.rs b/src/handlers/http/ingest.rs index 75c8ccc82..343a0ffac 100644 --- a/src/handlers/http/ingest.rs +++ b/src/handlers/http/ingest.rs @@ -28,7 +28,7 @@ use serde_json::Value; use crate::event::error::EventError; use crate::event::format::LogSource; -use crate::event::{self, PartitionEvent}; +use crate::event::{self, get_schema_key, PartitionEvent}; use crate::handlers::{LOG_SOURCE_KEY, STREAM_NAME_HEADER_KEY}; use crate::option::Mode; use crate::parseable::{Stream, StreamNotFound, PARSEABLE}; @@ -239,21 +239,26 @@ pub async fn post_event( } pub async fn push_logs_unchecked( - batches: RecordBatch, + rb: RecordBatch, stream: &Stream, ) -> Result { - let unchecked_event = event::Event { + let mut unchecked_event = event::Event { origin_format: "json", origin_size: 0, time_partition: None, is_first_event: true, // NOTE: Maybe should be false - partitions: vec![PartitionEvent { - rb: batches, - parsed_timestamp: Utc::now().naive_utc(), - custom_partition_values: HashMap::new(), // should be an empty map for unchecked push - }], + partitions: HashMap::new(), stream_type: StreamType::UserDefined, }; + unchecked_event.partitions.insert( + get_schema_key(&rb.schema().fields), + PartitionEvent { + rb, + parsed_timestamp: Utc::now().naive_utc(), + custom_partition_values: HashMap::new(), // should be an empty map for unchecked push + }, + ); + unchecked_event.process_unchecked(stream)?; Ok(unchecked_event) From 975b1f674211a5d6a56af4498b2fb59ae601c8e3 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Sat, 1 Mar 2025 20:35:37 +0530 Subject: [PATCH 18/39] construct map directly --- src/handlers/http/ingest.rs | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/src/handlers/http/ingest.rs b/src/handlers/http/ingest.rs index 343a0ffac..e7724e73b 100644 --- a/src/handlers/http/ingest.rs +++ b/src/handlers/http/ingest.rs @@ -242,22 +242,23 @@ pub async fn push_logs_unchecked( rb: RecordBatch, stream: &Stream, ) -> Result { - let mut unchecked_event = event::Event { + let unchecked_event = event::Event { origin_format: "json", origin_size: 0, time_partition: None, is_first_event: true, // NOTE: Maybe should be false - partitions: HashMap::new(), + partitions: [( + get_schema_key(&rb.schema().fields), + PartitionEvent { + rb, + parsed_timestamp: Utc::now().naive_utc(), + custom_partition_values: HashMap::new(), // should be an empty map for unchecked push + }, + )] + .into_iter() + .collect(), stream_type: StreamType::UserDefined, }; - unchecked_event.partitions.insert( - get_schema_key(&rb.schema().fields), - PartitionEvent { - rb, - parsed_timestamp: Utc::now().naive_utc(), - custom_partition_values: HashMap::new(), // should be an empty map for unchecked push - }, - ); unchecked_event.process_unchecked(stream)?; From 4c1f6d89015efdb47738861e9de6f08b148bfe1f Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Sat, 1 Mar 2025 21:01:15 +0530 Subject: [PATCH 19/39] fix: concat to not 
lose data --- Cargo.toml | 3 ++- src/event/format/json.rs | 16 ++++++++-------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index a8d9ccdec..0ce7d02b6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,7 +9,8 @@ build = "build.rs" [dependencies] # Arrow and DataFusion ecosystem -arrow-array = { version = "53.0.0" } +arrow = "53.0.0" +arrow-array = "53.0.0" arrow-flight = { version = "53.0.0", features = ["tls"] } arrow-ipc = { version = "53.0.0", features = ["zstd"] } arrow-json = "53.0.0" diff --git a/src/event/format/json.rs b/src/event/format/json.rs index 6ee9310be..563762a6b 100644 --- a/src/event/format/json.rs +++ b/src/event/format/json.rs @@ -20,6 +20,7 @@ #![allow(deprecated)] use anyhow::anyhow; +use arrow::compute::concat_batches; use arrow_array::RecordBatch; use arrow_json::reader::{infer_json_schema_from_iterator, ReaderBuilder}; use arrow_schema::{DataType, Field, Fields, Schema}; @@ -282,14 +283,13 @@ impl EventFormat for Event { key.push_str(&format!("&{k}={v}")); } - partitions.insert( - key, - PartitionEvent { - rb, - parsed_timestamp, - custom_partition_values, - }, - ); + let entry = partitions.entry(key).or_insert(PartitionEvent { + rb: RecordBatch::new_empty(schema.clone()), + parsed_timestamp, + custom_partition_values, + }); + + entry.rb = concat_batches(&schema, [&entry.rb, &rb])?; } Ok(super::Event { From 5a2bcc193dc1840b462dda01c214f1738915f3e7 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Sun, 2 Mar 2025 03:01:14 +0530 Subject: [PATCH 20/39] refactor: extract `byte_size` during json deserialization --- src/connectors/kafka/processor.rs | 7 ++-- src/event/format/json.rs | 2 +- src/event/format/mod.rs | 2 +- src/event/mod.rs | 2 +- src/handlers/http/cluster/mod.rs | 40 +++++++++---------- src/handlers/http/cluster/utils.rs | 64 ++++++++++++++++++++++++++++-- src/handlers/http/ingest.rs | 38 +++++++----------- src/metadata.rs | 2 +- src/parseable/streams.rs | 9 +++-- 9 files changed, 110 insertions(+), 56 deletions(-) diff --git a/src/connectors/kafka/processor.rs b/src/connectors/kafka/processor.rs index fa2771fc7..5fead256c 100644 --- a/src/connectors/kafka/processor.rs +++ b/src/connectors/kafka/processor.rs @@ -36,7 +36,7 @@ use super::{config::BufferConfig, ConsumerRecord, StreamConsumer, TopicPartition pub struct ParseableSinkProcessor; impl ParseableSinkProcessor { - async fn process_event_from_chunk(&self, records: &[ConsumerRecord]) -> anyhow::Result { + async fn process_event_from_chunk(&self, records: &[ConsumerRecord]) -> anyhow::Result { let stream_name = records .first() .map(|r| r.topic.as_str()) @@ -47,10 +47,10 @@ impl ParseableSinkProcessor { .await?; let mut json_vec = Vec::with_capacity(records.len()); - let mut total_payload_size = 0u64; + let mut total_payload_size = 0; for record in records.iter().filter_map(|r| r.payload.as_ref()) { - total_payload_size += record.len() as u64; + total_payload_size += record.len(); if let Ok(value) = serde_json::from_slice::(record) { json_vec.push(value); } @@ -60,6 +60,7 @@ impl ParseableSinkProcessor { .get_or_create_stream(stream_name) .push_logs( Value::Array(json_vec), + total_payload_size, &LogSource::Custom("Kafka".to_owned()), ) .await?; diff --git a/src/event/format/json.rs b/src/event/format/json.rs index 563762a6b..0d0849ef4 100644 --- a/src/event/format/json.rs +++ b/src/event/format/json.rs @@ -225,7 +225,7 @@ impl EventFormat for Event { /// Converts a JSON event into a Parseable Event fn into_event( self, - origin_size: u64, + origin_size: usize, 
stream: &Stream, log_source: &LogSource, ) -> anyhow::Result { diff --git a/src/event/format/mod.rs b/src/event/format/mod.rs index 2d181fc7a..c44ebe7b5 100644 --- a/src/event/format/mod.rs +++ b/src/event/format/mod.rs @@ -167,7 +167,7 @@ pub trait EventFormat: Sized { fn into_event( self, - origin_size: u64, + origin_size: usize, stream: &Stream, log_source: &LogSource, ) -> Result; diff --git a/src/event/mod.rs b/src/event/mod.rs index 30ee4abf9..6de567b83 100644 --- a/src/event/mod.rs +++ b/src/event/mod.rs @@ -39,7 +39,7 @@ pub struct PartitionEvent { pub struct Event { pub origin_format: &'static str, - pub origin_size: u64, + pub origin_size: usize, pub is_first_event: bool, pub time_partition: Option, pub partitions: HashMap, diff --git a/src/handlers/http/cluster/mod.rs b/src/handlers/http/cluster/mod.rs index 60a61a1ec..3ed451cb6 100644 --- a/src/handlers/http/cluster/mod.rs +++ b/src/handlers/http/cluster/mod.rs @@ -37,7 +37,7 @@ use tracing::{error, info, warn}; use url::Url; use utils::{check_liveness, to_url_string, IngestionStats, QueriedStats, StorageStats}; -use crate::handlers::http::ingest::ingest_internal_stream; +use crate::event::format::LogSource; use crate::metrics::prom_utils::Metrics; use crate::parseable::PARSEABLE; use crate::rbac::role::model::DefaultPrivilege; @@ -774,29 +774,29 @@ pub fn init_cluster_metrics_schedular() -> Result<(), PostError> { scheduler .every(CLUSTER_METRICS_INTERVAL_SECONDS) .run(move || async { + let internal_stream = PARSEABLE.get_or_create_stream(INTERNAL_STREAM_NAME); let result: Result<(), PostError> = async { let cluster_metrics = fetch_cluster_metrics().await; - if let Ok(metrics) = cluster_metrics { - if !metrics.is_empty() { - info!("Cluster metrics fetched successfully from all ingestors"); - if let Ok(metrics_bytes) = serde_json::to_vec(&metrics) { - if matches!( - ingest_internal_stream( - INTERNAL_STREAM_NAME.to_string(), - bytes::Bytes::from(metrics_bytes), - ) - .await, - Ok(()) - ) { - info!("Cluster metrics successfully ingested into internal stream"); - } else { - error!("Failed to ingest cluster metrics into internal stream"); - } - } else { - error!("Failed to serialize cluster metrics"); - } + let Ok(metrics) = cluster_metrics else { + return Ok(()); + }; + if !metrics.is_empty() { + info!("Cluster metrics fetched successfully from all ingestors"); + let json = serde_json::to_value(&metrics).expect("should be json serializable"); + let byte_size = serde_json::to_vec(&metrics).unwrap().len(); + + if matches!( + internal_stream + .push_logs(json, byte_size, &LogSource::Pmeta) + .await, + Ok(()) + ) { + info!("Cluster metrics successfully ingested into internal stream"); + } else { + error!("Failed to ingest cluster metrics into internal stream"); } } + Ok(()) } .await; diff --git a/src/handlers/http/cluster/utils.rs b/src/handlers/http/cluster/utils.rs index b41582d70..1d141b896 100644 --- a/src/handlers/http/cluster/utils.rs +++ b/src/handlers/http/cluster/utils.rs @@ -16,11 +16,23 @@ * */ -use crate::{handlers::http::base_path_without_preceding_slash, HTTP_CLIENT}; -use actix_web::http::header; +use std::{future::Future, pin::Pin}; + +use crate::{ + handlers::http::{base_path_without_preceding_slash, MAX_EVENT_PAYLOAD_SIZE}, + HTTP_CLIENT, +}; +use actix_web::{ + dev::Payload, + error::{ErrorPayloadTooLarge, JsonPayloadError}, + http::header, + FromRequest, HttpRequest, +}; +use bytes::BytesMut; use chrono::{DateTime, Utc}; +use futures::StreamExt; use itertools::Itertools; -use serde::{Deserialize, Serialize}; +use 
serde::{de::DeserializeOwned, Deserialize, Serialize}; use tracing::error; use url::Url; @@ -248,3 +260,49 @@ pub fn to_url_string(str: String) -> String { format!("http://{}/", str) } + +pub struct JsonWithSize { + pub json: T, + pub byte_size: usize, +} + +impl FromRequest for JsonWithSize { + type Error = actix_web::error::Error; + type Future = Pin>>>; + + fn from_request(_: &HttpRequest, payload: &mut Payload) -> Self::Future { + let limit = MAX_EVENT_PAYLOAD_SIZE; + + // Take ownership of payload for async processing + let mut payload = payload.take(); + + Box::pin(async move { + // Buffer to collect all bytes + let mut body = BytesMut::new(); + let mut byte_size = 0; + + // Collect all bytes from the payload stream + while let Some(chunk) = payload.next().await { + let chunk = chunk?; + byte_size += chunk.len(); + + // Check the size limit + if byte_size > limit { + return Err(ErrorPayloadTooLarge(byte_size).into()); + } + + // Extend our buffer with the chunk + body.extend_from_slice(&chunk); + } + + // Convert the collected bytes to Bytes + let bytes = body.freeze(); + + // Deserialize the JSON payload + let json = serde_json::from_slice::(&bytes) + .map_err(|e| JsonPayloadError::Deserialize(e))?; + + Ok(JsonWithSize { json, byte_size }) + }) + } +} diff --git a/src/handlers/http/ingest.rs b/src/handlers/http/ingest.rs index e7724e73b..76f236690 100644 --- a/src/handlers/http/ingest.rs +++ b/src/handlers/http/ingest.rs @@ -18,10 +18,9 @@ use std::collections::HashMap; -use actix_web::web::{Json, Path}; +use actix_web::web::Path; use actix_web::{http::header::ContentType, HttpRequest, HttpResponse}; use arrow_array::RecordBatch; -use bytes::Bytes; use chrono::Utc; use http::StatusCode; use serde_json::Value; @@ -36,6 +35,7 @@ use crate::storage::{ObjectStorageError, StreamType}; use crate::utils::header_parsing::ParseHeaderError; use crate::utils::json::flatten::JsonFlattenError; +use super::cluster::utils::JsonWithSize; use super::logstream::error::{CreateStreamError, StreamError}; use super::users::dashboards::DashboardError; use super::users::filters::FiltersError; @@ -43,7 +43,10 @@ use super::users::filters::FiltersError; // Handler for POST /api/v1/ingest // ingests events by extracting stream name from header // creates if stream does not exist -pub async fn ingest(req: HttpRequest, Json(json): Json) -> Result { +pub async fn ingest( + req: HttpRequest, + JsonWithSize { json, byte_size }: JsonWithSize, +) -> Result { let Some(stream_name) = req.headers().get(STREAM_NAME_HEADER_KEY) else { return Err(PostError::Header(ParseHeaderError::MissingStreamName)); }; @@ -72,29 +75,18 @@ pub async fn ingest(req: HttpRequest, Json(json): Json) -> Result Result<(), PostError> { - let json: Value = serde_json::from_slice(&body)?; - - PARSEABLE - .get_stream(&stream_name)? 
- .push_logs(json, &LogSource::Pmeta) - .await?; - - Ok(()) -} - // Handler for POST /v1/logs to ingest OTEL logs // ingests events by extracting stream name from header // creates if stream does not exist pub async fn handle_otel_logs_ingestion( req: HttpRequest, - Json(json): Json, + JsonWithSize { json, byte_size }: JsonWithSize, ) -> Result { let Some(stream_name) = req.headers().get(STREAM_NAME_HEADER_KEY) else { return Err(PostError::Header(ParseHeaderError::MissingStreamName)); @@ -115,7 +107,7 @@ pub async fn handle_otel_logs_ingestion( PARSEABLE .get_or_create_stream(&stream_name) - .push_logs(json, &log_source) + .push_logs(json, byte_size, &log_source) .await?; Ok(HttpResponse::Ok().finish()) @@ -126,7 +118,7 @@ pub async fn handle_otel_logs_ingestion( // creates if stream does not exist pub async fn handle_otel_metrics_ingestion( req: HttpRequest, - Json(json): Json, + JsonWithSize { json, byte_size }: JsonWithSize, ) -> Result { let Some(stream_name) = req.headers().get(STREAM_NAME_HEADER_KEY) else { return Err(PostError::Header(ParseHeaderError::MissingStreamName)); @@ -149,7 +141,7 @@ pub async fn handle_otel_metrics_ingestion( PARSEABLE .get_or_create_stream(&stream_name) - .push_logs(json, &log_source) + .push_logs(json, byte_size, &log_source) .await?; Ok(HttpResponse::Ok().finish()) @@ -160,7 +152,7 @@ pub async fn handle_otel_metrics_ingestion( // creates if stream does not exist pub async fn handle_otel_traces_ingestion( req: HttpRequest, - Json(json): Json, + JsonWithSize { json, byte_size }: JsonWithSize, ) -> Result { let Some(stream_name) = req.headers().get(STREAM_NAME_HEADER_KEY) else { return Err(PostError::Header(ParseHeaderError::MissingStreamName)); @@ -180,7 +172,7 @@ pub async fn handle_otel_traces_ingestion( PARSEABLE .get_or_create_stream(&stream_name) - .push_logs(json, &log_source) + .push_logs(json, byte_size, &log_source) .await?; Ok(HttpResponse::Ok().finish()) @@ -192,7 +184,7 @@ pub async fn handle_otel_traces_ingestion( pub async fn post_event( req: HttpRequest, stream_name: Path, - Json(json): Json, + JsonWithSize { json, byte_size }: JsonWithSize, ) -> Result { let stream_name = stream_name.into_inner(); @@ -232,7 +224,7 @@ pub async fn post_event( PARSEABLE .get_or_create_stream(&stream_name) - .push_logs(json, &log_source) + .push_logs(json, byte_size, &log_source) .await?; Ok(HttpResponse::Ok().finish()) diff --git a/src/metadata.rs b/src/metadata.rs index f4d2e2225..bacd01083 100644 --- a/src/metadata.rs +++ b/src/metadata.rs @@ -35,7 +35,7 @@ use crate::storage::StreamType; pub fn update_stats( stream_name: &str, origin: &'static str, - size: u64, + size: usize, num_rows: usize, parsed_date: NaiveDate, ) { diff --git a/src/parseable/streams.rs b/src/parseable/streams.rs index cd3082d61..a35c8fcf4 100644 --- a/src/parseable/streams.rs +++ b/src/parseable/streams.rs @@ -113,9 +113,12 @@ impl Stream { }) } - pub async fn push_logs(&self, json: Value, log_source: &LogSource) -> anyhow::Result<()> { - let origin_size = serde_json::to_vec(&json).unwrap().len() as u64; // string length need not be the same as byte length - + pub async fn push_logs( + &self, + json: Value, + origin_size: usize, + log_source: &LogSource, + ) -> anyhow::Result<()> { json::Event::new(json) .into_event(origin_size, self, log_source)? 
.process(self)?; From f26842217aa260da6f301255fc8db1707823163c Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Sun, 2 Mar 2025 12:29:00 +0530 Subject: [PATCH 21/39] ci: clippy suggestion --- src/handlers/http/cluster/utils.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/handlers/http/cluster/utils.rs b/src/handlers/http/cluster/utils.rs index 1d141b896..431f61399 100644 --- a/src/handlers/http/cluster/utils.rs +++ b/src/handlers/http/cluster/utils.rs @@ -288,7 +288,7 @@ impl FromRequest for JsonWithSize { // Check the size limit if byte_size > limit { - return Err(ErrorPayloadTooLarge(byte_size).into()); + return Err(ErrorPayloadTooLarge(byte_size)); } // Extend our buffer with the chunk @@ -299,8 +299,8 @@ impl FromRequest for JsonWithSize { let bytes = body.freeze(); // Deserialize the JSON payload - let json = serde_json::from_slice::(&bytes) - .map_err(|e| JsonPayloadError::Deserialize(e))?; + let json = + serde_json::from_slice::(&bytes).map_err(JsonPayloadError::Deserialize)?; Ok(JsonWithSize { json, byte_size }) }) From d304b9f46685f059eb610a4f3a29e39cb50b74fa Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Sun, 2 Mar 2025 12:39:33 +0530 Subject: [PATCH 22/39] feat: `DiskWriter` handles writing to arrow part file --- src/parseable/staging/writer.rs | 42 ++++++++++++++++++++++++++++++--- src/parseable/streams.rs | 18 ++++---------- 2 files changed, 44 insertions(+), 16 deletions(-) diff --git a/src/parseable/staging/writer.rs b/src/parseable/staging/writer.rs index c43252f14..2dbfe1e49 100644 --- a/src/parseable/staging/writer.rs +++ b/src/parseable/staging/writer.rs @@ -19,7 +19,8 @@ use std::{ collections::{HashMap, HashSet}, - fs::File, + fs::{File, OpenOptions}, + path::PathBuf, sync::Arc, }; @@ -28,13 +29,48 @@ use arrow_ipc::writer::StreamWriter; use arrow_schema::Schema; use arrow_select::concat::concat_batches; use itertools::Itertools; +use tracing::error; -use crate::utils::arrow::adapt_batch; +use crate::{parseable::ARROW_FILE_EXTENSION, utils::arrow::adapt_batch}; + +use super::StagingError; #[derive(Default)] pub struct Writer { pub mem: MemWriter<16384>, - pub disk: HashMap>, + pub disk: HashMap, +} + +pub struct DiskWriter { + pub inner: StreamWriter, + pub path: PathBuf, +} + +impl DiskWriter { + pub fn new(path: PathBuf, schema: &Schema) -> Result { + let file = OpenOptions::new().create(true).append(true).open(&path)?; + + let inner = StreamWriter::try_new(file, schema)?; + + Ok(Self { inner, path }) + } + + pub fn write(&mut self, rb: &RecordBatch) -> Result<(), StagingError> { + self.inner.write(rb).map_err(StagingError::Arrow) + } + + pub fn finish(&mut self) { + if let Err(err) = self.inner.finish() { + error!("Couldn't finish arrow file {:?}, error = {err}", self.path); + return; + } + + let mut arrow_path = self.path.to_owned(); + arrow_path.set_extension(ARROW_FILE_EXTENSION); + if let Err(err) = std::fs::rename(&self.path, &arrow_path) { + error!("Couldn't rename file {:?}, error = {err}", self.path); + } + } } /// Structure to keep recordbatches in memory. 
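
The `DiskWriter` added above stages incoming record batches in a `.part` file and only renames it to the final arrow extension once `finish()` has written the IPC footer, so an interrupted write never leaves a truncated file that looks complete. A minimal sketch of that write-then-rename pattern, using only the standard library and an assumed `.arrows` extension for the finished file (names here are illustrative, not the project's API):

use std::{
    fs::{self, OpenOptions},
    io::Write,
    path::PathBuf,
};

// Append payload to a staging ".part" file, then rename it once writing is done.
fn write_then_finalize(part_path: PathBuf, payload: &[u8]) -> std::io::Result<PathBuf> {
    // Writers append to the ".part" file while it is still in progress
    let mut file = OpenOptions::new().create(true).append(true).open(&part_path)?;
    file.write_all(payload)?;
    file.flush()?;

    // Only after a clean finish does the file get its final extension,
    // so scans for completed files skip in-progress writes.
    let mut final_path = part_path.clone();
    final_path.set_extension("arrows"); // assumed final extension
    fs::rename(&part_path, &final_path)?;
    Ok(final_path)
}
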
diff --git a/src/parseable/streams.rs b/src/parseable/streams.rs index a35c8fcf4..cf3d65549 100644 --- a/src/parseable/streams.rs +++ b/src/parseable/streams.rs @@ -28,7 +28,6 @@ use std::{ }; use arrow_array::RecordBatch; -use arrow_ipc::writer::StreamWriter; use arrow_schema::{Field, Fields, Schema}; use chrono::{NaiveDateTime, Timelike}; use derive_more::{Deref, DerefMut}; @@ -63,7 +62,7 @@ use crate::{ use super::{ staging::{ reader::{MergedRecordReader, MergedReverseRecordReader}, - writer::Writer, + writer::{DiskWriter, Writer}, StagingError, }, LogStream, ARROW_FILE_EXTENSION, @@ -143,21 +142,14 @@ impl Stream { } None => { // entry is not present thus we create it - let file_path = self.path_by_current_time( + let path = self.path_by_current_time( schema_key, parsed_timestamp, custom_partition_values, ); std::fs::create_dir_all(&self.data_path)?; - let file = OpenOptions::new() - .create(true) - .append(true) - .open(&file_path)?; - - let mut writer = StreamWriter::try_new(file, &record.schema()) - .expect("File and RecordBatch both are checked"); - + let mut writer = DiskWriter::new(path, &record.schema())?; writer.write(record)?; guard.disk.insert(schema_key.to_owned(), writer); } @@ -180,7 +172,7 @@ impl Stream { hostname.push_str(id); } let filename = format!( - "{stream_hash}.date={}.hour={:02}.minute={}.{}{hostname}.data.{ARROW_FILE_EXTENSION}", + "{stream_hash}.date={}.hour={:02}.minute={}.{}{hostname}.data.part", parsed_timestamp.date(), parsed_timestamp.hour(), minute_to_slot(parsed_timestamp.minute(), OBJECT_STORE_DATA_GRANULARITY).unwrap(), @@ -391,7 +383,7 @@ impl Stream { // Flush disk for writer in disk_writers.values_mut() { - _ = writer.finish(); + writer.finish(); } } From 97f560332e2c089aa8adeb807066b80ef5379302 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Sun, 2 Mar 2025 15:11:26 +0530 Subject: [PATCH 23/39] test: fix expectation --- src/parseable/streams.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/parseable/streams.rs b/src/parseable/streams.rs index cf3d65549..b9d41c944 100644 --- a/src/parseable/streams.rs +++ b/src/parseable/streams.rs @@ -909,7 +909,7 @@ mod tests { ); let expected_path = staging.data_path.join(format!( - "{stream_hash}.date={}.hour={:02}.minute={}.{}.data.{ARROW_FILE_EXTENSION}", + "{stream_hash}.date={}.hour={:02}.minute={}.{}.data.part", parsed_timestamp.date(), parsed_timestamp.hour(), minute_to_slot(parsed_timestamp.minute(), OBJECT_STORE_DATA_GRANULARITY).unwrap(), @@ -943,7 +943,7 @@ mod tests { ); let expected_path = staging.data_path.join(format!( - "{stream_hash}.date={}.hour={:02}.minute={}.key1=value1.key2=value2.{}.data.{ARROW_FILE_EXTENSION}", + "{stream_hash}.date={}.hour={:02}.minute={}.key1=value1.key2=value2.{}.data.part", parsed_timestamp.date(), parsed_timestamp.hour(), minute_to_slot(parsed_timestamp.minute(), OBJECT_STORE_DATA_GRANULARITY).unwrap(), From a9513c454dd673b4f25d16056cd4b774746b1e1f Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Sun, 2 Mar 2025 16:20:41 +0530 Subject: [PATCH 24/39] refactor: don't add a step --- src/event/format/json.rs | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/src/event/format/json.rs b/src/event/format/json.rs index 0d0849ef4..750205f6a 100644 --- a/src/event/format/json.rs +++ b/src/event/format/json.rs @@ -262,7 +262,7 @@ impl EventFormat for Event { _ => p_timestamp.naive_utc(), }; - let rb = Self::into_recordbatch( + let batch = Self::into_recordbatch( p_timestamp, vec![json], 
schema.clone(), @@ -272,7 +272,7 @@ impl EventFormat for Event { schema_version, )?; - let schema = rb.schema(); + let schema = batch.schema(); let mut key = get_schema_key(&schema.fields); if time_partition.is_some() { let parsed_timestamp_to_min = parsed_timestamp.format("%Y%m%dT%H%M").to_string(); @@ -283,13 +283,21 @@ impl EventFormat for Event { key.push_str(&format!("&{k}={v}")); } - let entry = partitions.entry(key).or_insert(PartitionEvent { - rb: RecordBatch::new_empty(schema.clone()), - parsed_timestamp, - custom_partition_values, - }); - - entry.rb = concat_batches(&schema, [&entry.rb, &rb])?; + match partitions.get_mut(&key) { + Some(PartitionEvent { rb, .. }) => { + *rb = concat_batches(&schema, [&rb, &batch])?; + } + _ => { + partitions.insert( + key, + PartitionEvent { + rb: batch, + parsed_timestamp, + custom_partition_values, + }, + ); + } + } } Ok(super::Event { From 1c98e3b65c62dc01c2ef71483be87cf7070bf758 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Mon, 3 Mar 2025 02:26:08 +0530 Subject: [PATCH 25/39] refactor: `get_schemas_if_present` --- src/parseable/streams.rs | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/src/parseable/streams.rs b/src/parseable/streams.rs index b9d41c944..870736d2f 100644 --- a/src/parseable/streams.rs +++ b/src/parseable/streams.rs @@ -20,6 +20,7 @@ use std::{ collections::HashMap, fs::{remove_file, write, File, OpenOptions}, + io::BufReader, num::NonZeroU32, path::{Path, PathBuf}, process, @@ -273,23 +274,13 @@ impl Stream { } pub fn get_schemas_if_present(&self) -> Option> { - let Ok(dir) = self.data_path.read_dir() else { - return None; - }; - let mut schemas: Vec = Vec::new(); - for file in dir.flatten() { - if let Some(ext) = file.path().extension() { - if ext.eq("schema") { - let file = File::open(file.path()).expect("Schema File should exist"); + for path in self.schema_files() { + let file = File::open(path).expect("Schema File should exist"); - let schema = match serde_json::from_reader(file) { - Ok(schema) => schema, - Err(_) => continue, - }; - schemas.push(schema); - } + if let Ok(schema) = serde_json::from_reader(BufReader::new(file)) { + schemas.push(schema); } } From a86fc476df655704036db60226ab16aca2b6bc8e Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Mon, 3 Mar 2025 14:51:27 +0530 Subject: [PATCH 26/39] refactor: `prepare_and_validate_schema` --- src/event/format/json.rs | 127 +++++++++++---------------------------- src/event/format/mod.rs | 33 +++++----- 2 files changed, 53 insertions(+), 107 deletions(-) diff --git a/src/event/format/json.rs b/src/event/format/json.rs index 750205f6a..0f1c4a86c 100644 --- a/src/event/format/json.rs +++ b/src/event/format/json.rs @@ -138,6 +138,7 @@ impl EventFormat for Event { // also extract the arrow schema, tags and metadata from the incoming json fn to_data( self, + static_schema_flag: bool, stored_schema: &HashMap>, time_partition: Option<&String>, time_partition_limit: Option, @@ -203,6 +204,8 @@ impl EventFormat for Event { )); } + let schema = Self::prepare_and_validate_schema(schema, &stored_schema, static_schema_flag)?; + Ok((flattened, schema, is_first)) } @@ -239,6 +242,7 @@ impl EventFormat for Event { let p_timestamp = self.p_timestamp; let (data, schema, is_first_event) = self.to_data( + static_schema_flag, &storage_schema, time_partition.as_ref(), time_partition_limit, @@ -265,9 +269,7 @@ impl EventFormat for Event { let batch = Self::into_recordbatch( p_timestamp, vec![json], - schema.clone(), - &storage_schema, - 
static_schema_flag, + &schema, time_partition.as_ref(), schema_version, )?; @@ -507,6 +509,7 @@ mod tests { let store_schema = HashMap::default(); let (data, schema, _) = Event::new(json) .to_data( + false, &store_schema, None, None, @@ -515,16 +518,8 @@ mod tests { &LogSource::Json, ) .unwrap(); - let rb = Event::into_recordbatch( - Utc::now(), - data, - schema, - &store_schema, - false, - None, - SchemaVersion::V0, - ) - .unwrap(); + let rb = + Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); assert_eq!(rb.num_rows(), 1); assert_eq!(rb.num_columns(), 4); @@ -553,6 +548,7 @@ mod tests { let store_schema = HashMap::default(); let (data, schema, _) = Event::new(json) .to_data( + false, &store_schema, None, None, @@ -561,16 +557,8 @@ mod tests { &LogSource::Json, ) .unwrap(); - let rb = Event::into_recordbatch( - Utc::now(), - data, - schema, - &store_schema, - false, - None, - SchemaVersion::V0, - ) - .unwrap(); + let rb = + Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); assert_eq!(rb.num_rows(), 1); assert_eq!(rb.num_columns(), 3); @@ -601,6 +589,7 @@ mod tests { ); let (data, schema, _) = Event::new(json) .to_data( + false, &store_schema, None, None, @@ -609,16 +598,8 @@ mod tests { &LogSource::Json, ) .unwrap(); - let rb = Event::into_recordbatch( - Utc::now(), - data, - schema, - &store_schema, - false, - None, - SchemaVersion::V0, - ) - .unwrap(); + let rb = + Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); assert_eq!(rb.num_rows(), 1); assert_eq!(rb.num_columns(), 3); @@ -650,6 +631,7 @@ mod tests { assert!(Event::new(json) .to_data( + false, &store_schema, None, None, @@ -675,6 +657,7 @@ mod tests { let (data, schema, _) = Event::new(json) .to_data( + false, &store_schema, None, None, @@ -683,16 +666,8 @@ mod tests { &LogSource::Json, ) .unwrap(); - let rb = Event::into_recordbatch( - Utc::now(), - data, - schema, - &store_schema, - false, - None, - SchemaVersion::V0, - ) - .unwrap(); + let rb = + Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); assert_eq!(rb.num_rows(), 1); assert_eq!(rb.num_columns(), 1); @@ -719,6 +694,7 @@ mod tests { let store_schema = HashMap::new(); let (data, schema, _) = Event::new(json) .to_data( + false, &store_schema, None, None, @@ -727,16 +703,8 @@ mod tests { &LogSource::Json, ) .unwrap(); - let rb = Event::into_recordbatch( - Utc::now(), - data, - schema, - &store_schema, - false, - None, - SchemaVersion::V0, - ) - .unwrap(); + let rb = + Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); assert_eq!(rb.num_rows(), 3); assert_eq!(rb.num_columns(), 4); @@ -785,6 +753,7 @@ mod tests { let store_schema = HashMap::new(); let (data, schema, _) = Event::new(json) .to_data( + false, &store_schema, None, None, @@ -793,16 +762,8 @@ mod tests { &LogSource::Json, ) .unwrap(); - let rb = Event::into_recordbatch( - Utc::now(), - data, - schema, - &store_schema, - false, - None, - SchemaVersion::V0, - ) - .unwrap(); + let rb = + Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); assert_eq!(rb.num_rows(), 3); assert_eq!(rb.num_columns(), 4); @@ -850,6 +811,7 @@ mod tests { ); let (data, schema, _) = Event::new(json) .to_data( + false, &store_schema, None, None, @@ -858,16 +820,8 @@ mod tests { &LogSource::Json, ) .unwrap(); - let rb = Event::into_recordbatch( - Utc::now(), - data, - schema, - &store_schema, - false, - None, - SchemaVersion::V0, - ) - 
.unwrap(); + let rb = + Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); assert_eq!(rb.num_rows(), 3); assert_eq!(rb.num_columns(), 4); @@ -916,6 +870,7 @@ mod tests { assert!(Event::new(json) .to_data( + false, &store_schema, None, None, @@ -953,6 +908,7 @@ mod tests { let store_schema = HashMap::new(); let (data, schema, _) = Event::new(json) .to_data( + false, &store_schema, None, None, @@ -961,16 +917,8 @@ mod tests { &LogSource::Json, ) .unwrap(); - let rb = Event::into_recordbatch( - Utc::now(), - data, - schema, - &store_schema, - false, - None, - SchemaVersion::V0, - ) - .unwrap(); + let rb = + Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); assert_eq!(rb.num_rows(), 4); assert_eq!(rb.num_columns(), 5); @@ -1044,6 +992,7 @@ mod tests { let store_schema = HashMap::new(); let (data, schema, _) = Event::new(json) .to_data( + false, &store_schema, None, None, @@ -1052,16 +1001,8 @@ mod tests { &LogSource::Json, ) .unwrap(); - let rb = Event::into_recordbatch( - Utc::now(), - data, - schema, - &store_schema, - false, - None, - SchemaVersion::V1, - ) - .unwrap(); + let rb = + Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V1).unwrap(); assert_eq!(rb.num_rows(), 4); assert_eq!(rb.num_columns(), 5); diff --git a/src/event/format/mod.rs b/src/event/format/mod.rs index c44ebe7b5..de9b5aaab 100644 --- a/src/event/format/mod.rs +++ b/src/event/format/mod.rs @@ -103,32 +103,26 @@ pub trait EventFormat: Sized { fn to_data( self, - schema: &HashMap>, + static_schema_flag: bool, + stored_schema: &HashMap>, time_partition: Option<&String>, time_partition_limit: Option, - custom_partition: Option<&String>, + custom_partitions: Option<&String>, schema_version: SchemaVersion, log_source: &LogSource, ) -> Result<(Self::Data, EventSchema, bool), AnyError>; fn decode(data: Self::Data, schema: Arc) -> Result; - #[allow(clippy::too_many_arguments)] - fn into_recordbatch( - p_timestamp: DateTime, - data: Self::Data, + /// Updates inferred schema with `p_timestamp` field and ensures it adheres to expectations + fn prepare_and_validate_schema( mut schema: EventSchema, storage_schema: &HashMap>, static_schema_flag: bool, - time_partition: Option<&String>, - schema_version: SchemaVersion, - ) -> Result { + ) -> Result { if get_field(&schema, DEFAULT_TIMESTAMP_KEY).is_some() { - return Err(anyhow!( - "field {} is a reserved field", - DEFAULT_TIMESTAMP_KEY - )); - }; + return Err(anyhow!("field {DEFAULT_TIMESTAMP_KEY} is a reserved field",)); + } // add the p_timestamp field to the event schema to the 0th index schema.insert( @@ -150,6 +144,17 @@ pub trait EventFormat: Sized { return Err(anyhow!("Schema mismatch")); } + Ok(schema) + } + + #[allow(clippy::too_many_arguments)] + fn into_recordbatch( + p_timestamp: DateTime, + data: Self::Data, + schema: &EventSchema, + time_partition: Option<&String>, + schema_version: SchemaVersion, + ) -> Result { // prepare the record batch and new fields to be added let mut new_schema = Arc::new(Schema::new(schema.clone())); new_schema = From 51d166e176edd61592dc6eca527ebe8e04173453 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Mon, 3 Mar 2025 15:16:56 +0530 Subject: [PATCH 27/39] refactor: event construction and processing --- src/connectors/kafka/processor.rs | 21 ++- src/event/format/json.rs | 273 ++++++++++-------------------- src/event/format/mod.rs | 9 +- src/handlers/http/cluster/mod.rs | 11 +- src/handlers/http/ingest.rs | 47 ++--- src/parseable/streams.rs | 19 +-- 
6 files changed, 137 insertions(+), 243 deletions(-) diff --git a/src/connectors/kafka/processor.rs b/src/connectors/kafka/processor.rs index 5fead256c..0619a50a3 100644 --- a/src/connectors/kafka/processor.rs +++ b/src/connectors/kafka/processor.rs @@ -26,7 +26,9 @@ use tokio_stream::wrappers::ReceiverStream; use tracing::{debug, error}; use crate::{ - connectors::common::processor::Processor, event::format::LogSource, parseable::PARSEABLE, + connectors::common::processor::Processor, + event::format::{json, EventFormat, LogSource}, + parseable::PARSEABLE, storage::StreamType, }; @@ -56,14 +58,15 @@ impl ParseableSinkProcessor { } } - PARSEABLE - .get_or_create_stream(stream_name) - .push_logs( - Value::Array(json_vec), - total_payload_size, - &LogSource::Custom("Kafka".to_owned()), - ) - .await?; + let stream = PARSEABLE.get_or_create_stream(stream_name); + + json::Event::new( + Value::Array(json_vec), + total_payload_size, + LogSource::Custom("Kafka".to_owned()), + ) + .into_event(&stream)? + .process(&stream)?; Ok(total_payload_size) } diff --git a/src/event/format/json.rs b/src/event/format/json.rs index 0f1c4a86c..ce4408a1e 100644 --- a/src/event/format/json.rs +++ b/src/event/format/json.rs @@ -53,82 +53,85 @@ use crate::{ pub struct Event { pub json: Value, + pub origin_size: usize, pub p_timestamp: DateTime, + pub log_source: LogSource, } impl Event { - pub fn new(json: Value) -> Self { + pub fn new(json: Value, origin_size: usize, log_source: LogSource) -> Self { Self { json, + origin_size, p_timestamp: Utc::now(), + log_source, } } -} -pub fn flatten_logs( - json: Value, - time_partition: Option<&String>, - time_partition_limit: Option, - custom_partitions: Option<&String>, - schema_version: SchemaVersion, - log_source: &LogSource, -) -> anyhow::Result> { - let data = match log_source { - LogSource::Kinesis => { - //custom flattening required for Amazon Kinesis - let message: Message = serde_json::from_value(json)?; - flatten_kinesis_logs(message) - } - LogSource::OtelLogs => { - //custom flattening required for otel logs - let logs: LogsData = serde_json::from_value(json)?; - flatten_otel_logs(&logs) - } - LogSource::OtelTraces => { - //custom flattening required for otel traces - let traces: TracesData = serde_json::from_value(json)?; - flatten_otel_traces(&traces) - } - LogSource::OtelMetrics => { - //custom flattening required for otel metrics - let metrics: MetricsData = serde_json::from_value(json)?; - flatten_otel_metrics(metrics) - } - _ => vec![json], - }; + pub fn flatten_logs( + self, + time_partition: Option<&String>, + time_partition_limit: Option, + custom_partitions: Option<&String>, + schema_version: SchemaVersion, + ) -> anyhow::Result> { + let data = match self.log_source { + LogSource::Kinesis => { + //custom flattening required for Amazon Kinesis + let message: Message = serde_json::from_value(self.json)?; + flatten_kinesis_logs(message) + } + LogSource::OtelLogs => { + //custom flattening required for otel logs + let logs: LogsData = serde_json::from_value(self.json)?; + flatten_otel_logs(&logs) + } + LogSource::OtelTraces => { + //custom flattening required for otel traces + let traces: TracesData = serde_json::from_value(self.json)?; + flatten_otel_traces(&traces) + } + LogSource::OtelMetrics => { + //custom flattening required for otel metrics + let metrics: MetricsData = serde_json::from_value(self.json)?; + flatten_otel_metrics(metrics) + } + _ => vec![self.json], + }; - let mut logs = vec![]; - for json in data { - let json = flatten_json_body( - json, 
- time_partition, - time_partition_limit, - custom_partitions, - schema_version, - true, - log_source, - )?; + let mut logs = vec![]; + for json in data { + let json = flatten_json_body( + json, + time_partition, + time_partition_limit, + custom_partitions, + schema_version, + true, + &self.log_source, + )?; - // incoming event may be a single json or a json array - // but Data (type defined above) is a vector of json values - // hence we need to convert the incoming event to a vector of json values - match json { - Value::Array(arr) => { - for log in arr { - let Value::Object(json) = log else { - return Err(anyhow!( - "Expected an object or a list of objects, received: {log:?}" - )); - }; - logs.push(json); + // incoming event may be a single json or a json array + // but Data (type defined above) is a vector of json values + // hence we need to convert the incoming event to a vector of json values + match json { + Value::Array(arr) => { + for log in arr { + let Value::Object(json) = log else { + return Err(anyhow!( + "Expected an object or a list of objects, received: {log:?}" + )); + }; + logs.push(json); + } } + Value::Object(obj) => logs.push(obj), + _ => unreachable!("flatten would have failed beforehand"), } - Value::Object(obj) => logs.push(obj), - _ => unreachable!("flatten would have failed beforehand"), } - } - Ok(logs) + Ok(logs) + } } impl EventFormat for Event { @@ -144,15 +147,12 @@ impl EventFormat for Event { time_partition_limit: Option, custom_partitions: Option<&String>, schema_version: SchemaVersion, - log_source: &LogSource, ) -> anyhow::Result<(Self::Data, Vec>, bool)> { - let flattened = flatten_logs( - self.json, + let flattened = self.flatten_logs( time_partition, time_partition_limit, custom_partitions, schema_version, - log_source, )?; // collect all the keys from all the json objects in the request body @@ -204,7 +204,7 @@ impl EventFormat for Event { )); } - let schema = Self::prepare_and_validate_schema(schema, &stored_schema, static_schema_flag)?; + let schema = Self::prepare_and_validate_schema(schema, stored_schema, static_schema_flag)?; Ok((flattened, schema, is_first)) } @@ -226,12 +226,7 @@ impl EventFormat for Event { } /// Converts a JSON event into a Parseable Event - fn into_event( - self, - origin_size: usize, - stream: &Stream, - log_source: &LogSource, - ) -> anyhow::Result { + fn into_event(self, stream: &Stream) -> anyhow::Result { let time_partition = stream.get_time_partition(); let time_partition_limit = stream.get_time_partition_limit(); let static_schema_flag = stream.get_static_schema_flag(); @@ -241,6 +236,7 @@ impl EventFormat for Event { let stream_type = stream.get_stream_type(); let p_timestamp = self.p_timestamp; + let origin_size = self.origin_size; let (data, schema, is_first_event) = self.to_data( static_schema_flag, &storage_schema, @@ -248,7 +244,6 @@ impl EventFormat for Event { time_partition_limit, custom_partitions.as_ref(), schema_version, - log_source, )?; let mut partitions = HashMap::new(); @@ -287,7 +282,7 @@ impl EventFormat for Event { match partitions.get_mut(&key) { Some(PartitionEvent { rb, .. 
}) => { - *rb = concat_batches(&schema, [&rb, &batch])?; + *rb = concat_batches(&schema, [rb, &batch])?; } _ => { partitions.insert( @@ -507,16 +502,8 @@ mod tests { }); let store_schema = HashMap::default(); - let (data, schema, _) = Event::new(json) - .to_data( - false, - &store_schema, - None, - None, - None, - SchemaVersion::V0, - &LogSource::Json, - ) + let (data, schema, _) = Event::new(json, 0 /* doesn't matter */, LogSource::Json) + .to_data(false, &store_schema, None, None, None, SchemaVersion::V0) .unwrap(); let rb = Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); @@ -546,16 +533,8 @@ mod tests { }); let store_schema = HashMap::default(); - let (data, schema, _) = Event::new(json) - .to_data( - false, - &store_schema, - None, - None, - None, - SchemaVersion::V0, - &LogSource::Json, - ) + let (data, schema, _) = Event::new(json, 0 /* doesn't matter */, LogSource::Json) + .to_data(false, &store_schema, None, None, None, SchemaVersion::V0) .unwrap(); let rb = Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); @@ -587,16 +566,8 @@ mod tests { ] .into_iter(), ); - let (data, schema, _) = Event::new(json) - .to_data( - false, - &store_schema, - None, - None, - None, - SchemaVersion::V0, - &LogSource::Json, - ) + let (data, schema, _) = Event::new(json, 0 /* doesn't matter */, LogSource::Json) + .to_data(false, &store_schema, None, None, None, SchemaVersion::V0) .unwrap(); let rb = Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); @@ -629,16 +600,8 @@ mod tests { .into_iter(), ); - assert!(Event::new(json) - .to_data( - false, - &store_schema, - None, - None, - None, - SchemaVersion::V0, - &LogSource::Json, - ) + assert!(Event::new(json, 0 /* doesn't matter */, LogSource::Json) + .to_data(false, &store_schema, None, None, None, SchemaVersion::V0,) .is_err()); } @@ -655,16 +618,8 @@ mod tests { .into_iter(), ); - let (data, schema, _) = Event::new(json) - .to_data( - false, - &store_schema, - None, - None, - None, - SchemaVersion::V0, - &LogSource::Json, - ) + let (data, schema, _) = Event::new(json, 0 /* doesn't matter */, LogSource::Json) + .to_data(false, &store_schema, None, None, None, SchemaVersion::V0) .unwrap(); let rb = Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); @@ -692,16 +647,8 @@ mod tests { ]); let store_schema = HashMap::new(); - let (data, schema, _) = Event::new(json) - .to_data( - false, - &store_schema, - None, - None, - None, - SchemaVersion::V0, - &LogSource::Json, - ) + let (data, schema, _) = Event::new(json, 0 /* doesn't matter */, LogSource::Json) + .to_data(false, &store_schema, None, None, None, SchemaVersion::V0) .unwrap(); let rb = Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); @@ -751,16 +698,8 @@ mod tests { ]); let store_schema = HashMap::new(); - let (data, schema, _) = Event::new(json) - .to_data( - false, - &store_schema, - None, - None, - None, - SchemaVersion::V0, - &LogSource::Json, - ) + let (data, schema, _) = Event::new(json, 0 /* doesn't matter */, LogSource::Json) + .to_data(false, &store_schema, None, None, None, SchemaVersion::V0) .unwrap(); let rb = Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); @@ -809,16 +748,8 @@ mod tests { ] .into_iter(), ); - let (data, schema, _) = Event::new(json) - .to_data( - false, - &store_schema, - None, - None, - None, - SchemaVersion::V0, - &LogSource::Json, - ) + let (data, schema, _) 
= Event::new(json, 0 /* doesn't matter */, LogSource::Json) + .to_data(false, &store_schema, None, None, None, SchemaVersion::V0) .unwrap(); let rb = Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); @@ -868,16 +799,8 @@ mod tests { .into_iter(), ); - assert!(Event::new(json) - .to_data( - false, - &store_schema, - None, - None, - None, - SchemaVersion::V0, - &LogSource::Json, - ) + assert!(Event::new(json, 0 /* doesn't matter */, LogSource::Json) + .to_data(false, &store_schema, None, None, None, SchemaVersion::V0,) .is_err()); } @@ -906,16 +829,8 @@ mod tests { ]); let store_schema = HashMap::new(); - let (data, schema, _) = Event::new(json) - .to_data( - false, - &store_schema, - None, - None, - None, - SchemaVersion::V0, - &LogSource::Json, - ) + let (data, schema, _) = Event::new(json, 0 /* doesn't matter */, LogSource::Json) + .to_data(false, &store_schema, None, None, None, SchemaVersion::V0) .unwrap(); let rb = Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); @@ -990,16 +905,8 @@ mod tests { ]); let store_schema = HashMap::new(); - let (data, schema, _) = Event::new(json) - .to_data( - false, - &store_schema, - None, - None, - None, - SchemaVersion::V1, - &LogSource::Json, - ) + let (data, schema, _) = Event::new(json, 0 /* doesn't matter */, LogSource::Json) + .to_data(false, &store_schema, None, None, None, SchemaVersion::V1) .unwrap(); let rb = Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V1).unwrap(); diff --git a/src/event/format/mod.rs b/src/event/format/mod.rs index de9b5aaab..5459e035c 100644 --- a/src/event/format/mod.rs +++ b/src/event/format/mod.rs @@ -109,7 +109,6 @@ pub trait EventFormat: Sized { time_partition_limit: Option, custom_partitions: Option<&String>, schema_version: SchemaVersion, - log_source: &LogSource, ) -> Result<(Self::Data, EventSchema, bool), AnyError>; fn decode(data: Self::Data, schema: Arc) -> Result; @@ -147,7 +146,6 @@ pub trait EventFormat: Sized { Ok(schema) } - #[allow(clippy::too_many_arguments)] fn into_recordbatch( p_timestamp: DateTime, data: Self::Data, @@ -170,12 +168,7 @@ pub trait EventFormat: Sized { Ok(rb) } - fn into_event( - self, - origin_size: usize, - stream: &Stream, - log_source: &LogSource, - ) -> Result; + fn into_event(self, stream: &Stream) -> Result; } pub fn get_existing_field_names( diff --git a/src/handlers/http/cluster/mod.rs b/src/handlers/http/cluster/mod.rs index 3ed451cb6..eb9751d4e 100644 --- a/src/handlers/http/cluster/mod.rs +++ b/src/handlers/http/cluster/mod.rs @@ -24,6 +24,7 @@ use std::time::Duration; use actix_web::http::header::{self, HeaderMap}; use actix_web::web::Path; use actix_web::Responder; +use anyhow::anyhow; use bytes::Bytes; use chrono::Utc; use clokwerk::{AsyncScheduler, Interval}; @@ -37,7 +38,7 @@ use tracing::{error, info, warn}; use url::Url; use utils::{check_liveness, to_url_string, IngestionStats, QueriedStats, StorageStats}; -use crate::event::format::LogSource; +use crate::event::format::{json, EventFormat, LogSource}; use crate::metrics::prom_utils::Metrics; use crate::parseable::PARSEABLE; use crate::rbac::role::model::DefaultPrivilege; @@ -786,9 +787,11 @@ pub fn init_cluster_metrics_schedular() -> Result<(), PostError> { let byte_size = serde_json::to_vec(&metrics).unwrap().len(); if matches!( - internal_stream - .push_logs(json, byte_size, &LogSource::Pmeta) - .await, + json::Event::new(json, byte_size, LogSource::Pmeta) + .into_event(&internal_stream) + .and_then(|event| event + 
.process(&internal_stream) + .map_err(|e| anyhow!(e))), Ok(()) ) { info!("Cluster metrics successfully ingested into internal stream"); diff --git a/src/handlers/http/ingest.rs b/src/handlers/http/ingest.rs index 76f236690..bdf35d4c9 100644 --- a/src/handlers/http/ingest.rs +++ b/src/handlers/http/ingest.rs @@ -26,7 +26,7 @@ use http::StatusCode; use serde_json::Value; use crate::event::error::EventError; -use crate::event::format::LogSource; +use crate::event::format::{json, EventFormat, LogSource}; use crate::event::{self, get_schema_key, PartitionEvent}; use crate::handlers::{LOG_SOURCE_KEY, STREAM_NAME_HEADER_KEY}; use crate::option::Mode; @@ -73,10 +73,11 @@ pub async fn ingest( return Err(PostError::OtelNotSupported); } - PARSEABLE - .get_or_create_stream(&stream_name) - .push_logs(json, byte_size, &log_source) - .await?; + let stream = PARSEABLE.get_or_create_stream(&stream_name); + + json::Event::new(json, byte_size, log_source) + .into_event(&stream)? + .process(&stream)?; Ok(HttpResponse::Ok().finish()) } @@ -105,10 +106,11 @@ pub async fn handle_otel_logs_ingestion( .create_stream_if_not_exists(&stream_name, StreamType::UserDefined, LogSource::OtelLogs) .await?; - PARSEABLE - .get_or_create_stream(&stream_name) - .push_logs(json, byte_size, &log_source) - .await?; + let stream = PARSEABLE.get_or_create_stream(&stream_name); + + json::Event::new(json, byte_size, log_source) + .into_event(&stream)? + .process(&stream)?; Ok(HttpResponse::Ok().finish()) } @@ -139,10 +141,11 @@ pub async fn handle_otel_metrics_ingestion( ) .await?; - PARSEABLE - .get_or_create_stream(&stream_name) - .push_logs(json, byte_size, &log_source) - .await?; + let stream = PARSEABLE.get_or_create_stream(&stream_name); + + json::Event::new(json, byte_size, log_source) + .into_event(&stream)? + .process(&stream)?; Ok(HttpResponse::Ok().finish()) } @@ -170,10 +173,11 @@ pub async fn handle_otel_traces_ingestion( .create_stream_if_not_exists(&stream_name, StreamType::UserDefined, LogSource::OtelTraces) .await?; - PARSEABLE - .get_or_create_stream(&stream_name) - .push_logs(json, byte_size, &log_source) - .await?; + let stream = PARSEABLE.get_or_create_stream(&stream_name); + + json::Event::new(json, byte_size, log_source) + .into_event(&stream)? + .process(&stream)?; Ok(HttpResponse::Ok().finish()) } @@ -222,10 +226,11 @@ pub async fn post_event( return Err(PostError::OtelNotSupported); } - PARSEABLE - .get_or_create_stream(&stream_name) - .push_logs(json, byte_size, &log_source) - .await?; + let stream = PARSEABLE.get_or_create_stream(&stream_name); + + json::Event::new(json, byte_size, log_source) + .into_event(&stream)? + .process(&stream)?; Ok(HttpResponse::Ok().finish()) } diff --git a/src/parseable/streams.rs b/src/parseable/streams.rs index 870736d2f..235523cdc 100644 --- a/src/parseable/streams.rs +++ b/src/parseable/streams.rs @@ -42,16 +42,12 @@ use parquet::{ }; use rand::distributions::DistString; use relative_path::RelativePathBuf; -use serde_json::Value; use tokio::task::JoinSet; use tracing::{error, info, trace, warn}; use crate::{ cli::Options, - event::{ - format::{json, EventFormat, LogSource}, - DEFAULT_TIMESTAMP_KEY, - }, + event::DEFAULT_TIMESTAMP_KEY, metadata::{LogStreamMetadata, SchemaVersion}, metrics, option::Mode, @@ -113,19 +109,6 @@ impl Stream { }) } - pub async fn push_logs( - &self, - json: Value, - origin_size: usize, - log_source: &LogSource, - ) -> anyhow::Result<()> { - json::Event::new(json) - .into_event(origin_size, self, log_source)? 
- .process(self)?; - - Ok(()) - } - // Concatenates record batches and puts them in memory store for each event. pub fn push( &self, From 218080ebe9c68b6439f60381db1093d1f9a20413 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Mon, 3 Mar 2025 19:48:18 +0530 Subject: [PATCH 28/39] fix: concat at once --- src/event/format/json.rs | 8 +++----- src/event/mod.rs | 27 ++++++++++++++++++--------- src/handlers/http/ingest.rs | 8 +++++--- 3 files changed, 26 insertions(+), 17 deletions(-) diff --git a/src/event/format/json.rs b/src/event/format/json.rs index ce4408a1e..5ebb7b83a 100644 --- a/src/event/format/json.rs +++ b/src/event/format/json.rs @@ -20,7 +20,6 @@ #![allow(deprecated)] use anyhow::anyhow; -use arrow::compute::concat_batches; use arrow_array::RecordBatch; use arrow_json::reader::{infer_json_schema_from_iterator, ReaderBuilder}; use arrow_schema::{DataType, Field, Fields, Schema}; @@ -281,14 +280,13 @@ impl EventFormat for Event { } match partitions.get_mut(&key) { - Some(PartitionEvent { rb, .. }) => { - *rb = concat_batches(&schema, [rb, &batch])?; - } + Some(PartitionEvent { rbs, .. }) => rbs.push(batch), _ => { partitions.insert( key, PartitionEvent { - rb: batch, + rbs: vec![batch], + schema, parsed_timestamp, custom_partition_values, }, diff --git a/src/event/mod.rs b/src/event/mod.rs index 6de567b83..f0297f61c 100644 --- a/src/event/mod.rs +++ b/src/event/mod.rs @@ -19,20 +19,26 @@ pub mod format; +use arrow::compute::concat_batches; use arrow_array::RecordBatch; -use arrow_schema::Field; +use arrow_schema::{Field, Schema}; use itertools::Itertools; use std::sync::Arc; use self::error::EventError; -use crate::{metadata::update_stats, parseable::Stream, storage::StreamType}; +use crate::{ + metadata::update_stats, + parseable::{StagingError, Stream}, + storage::StreamType, +}; use chrono::NaiveDateTime; use std::collections::HashMap; pub const DEFAULT_TIMESTAMP_KEY: &str = "p_timestamp"; pub struct PartitionEvent { - pub rb: RecordBatch, + pub rbs: Vec, + pub schema: Arc, pub parsed_timestamp: NaiveDateTime, pub custom_partition_values: HashMap, } @@ -50,14 +56,15 @@ pub struct Event { impl Event { pub fn process(self, stream: &Stream) -> Result<(), EventError> { for (key, partition) in self.partitions { + let rb = + concat_batches(&partition.schema, &partition.rbs).map_err(StagingError::Arrow)?; if self.is_first_event { - let schema = partition.rb.schema().as_ref().clone(); - stream.commit_schema(schema)?; + stream.commit_schema(partition.schema.as_ref().clone())?; } stream.push( &key, - &partition.rb, + &rb, partition.parsed_timestamp, &partition.custom_partition_values, self.stream_type, @@ -67,20 +74,22 @@ impl Event { &stream.stream_name, self.origin_format, self.origin_size, - partition.rb.num_rows(), + rb.num_rows(), partition.parsed_timestamp.date(), ); - crate::livetail::LIVETAIL.process(&stream.stream_name, &partition.rb); + crate::livetail::LIVETAIL.process(&stream.stream_name, &rb); } Ok(()) } pub fn process_unchecked(&self, stream: &Stream) -> Result<(), EventError> { for (key, partition) in &self.partitions { + let rb = + concat_batches(&partition.schema, &partition.rbs).map_err(StagingError::Arrow)?; stream.push( key, - &partition.rb, + &rb, partition.parsed_timestamp, &partition.custom_partition_values, self.stream_type, diff --git a/src/handlers/http/ingest.rs b/src/handlers/http/ingest.rs index bdf35d4c9..d96236322 100644 --- a/src/handlers/http/ingest.rs +++ b/src/handlers/http/ingest.rs @@ -236,18 +236,20 @@ pub async fn post_event( } pub async fn 
push_logs_unchecked( - rb: RecordBatch, + batch: RecordBatch, stream: &Stream, ) -> Result { + let schema = batch.schema(); let unchecked_event = event::Event { origin_format: "json", origin_size: 0, time_partition: None, is_first_event: true, // NOTE: Maybe should be false partitions: [( - get_schema_key(&rb.schema().fields), + get_schema_key(&schema.fields), PartitionEvent { - rb, + rbs: vec![batch], + schema, parsed_timestamp: Utc::now().naive_utc(), custom_partition_values: HashMap::new(), // should be an empty map for unchecked push }, From aa0befa5f4c22a6353fa4b33d4a35b1f05bae0cb Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Mon, 3 Mar 2025 20:53:18 +0530 Subject: [PATCH 29/39] refactor: separate out flatten from schema inference --- src/event/format/json.rs | 109 +++++++++++++++++++++++++-------------- src/event/format/mod.rs | 14 +++-- 2 files changed, 82 insertions(+), 41 deletions(-) diff --git a/src/event/format/json.rs b/src/event/format/json.rs index 5ebb7b83a..69584e1f7 100644 --- a/src/event/format/json.rs +++ b/src/event/format/json.rs @@ -140,22 +140,28 @@ impl EventFormat for Event { // also extract the arrow schema, tags and metadata from the incoming json fn to_data( self, - static_schema_flag: bool, - stored_schema: &HashMap>, time_partition: Option<&String>, time_partition_limit: Option, custom_partitions: Option<&String>, schema_version: SchemaVersion, - ) -> anyhow::Result<(Self::Data, Vec>, bool)> { - let flattened = self.flatten_logs( + ) -> anyhow::Result { + self.flatten_logs( time_partition, time_partition_limit, custom_partitions, schema_version, - )?; + ) + } + fn infer_schema( + data: &Self::Data, + stored_schema: &HashMap>, + time_partition: Option<&String>, + static_schema_flag: bool, + schema_version: SchemaVersion, + ) -> anyhow::Result<(super::EventSchema, bool)> { // collect all the keys from all the json objects in the request body - let fields = collect_keys(flattened.iter()); + let fields = collect_keys(data.iter()); let mut is_first = false; let schema = if let Some(schema) = derive_arrow_schema(stored_schema, fields) { @@ -163,14 +169,14 @@ impl EventFormat for Event { } else { // TODO: let mut infer_schema = infer_json_schema_from_iterator( - flattened.iter().map(|obj| Ok(Value::Object(obj.clone()))), + data.iter().map(|obj| Ok(Value::Object(obj.clone()))), ) .map_err(|err| anyhow!("Could not infer schema for this event due to err {:?}", err))?; let new_infer_schema = super::update_field_type_in_schema( Arc::new(infer_schema), Some(stored_schema), time_partition, - Some(&flattened), + Some(data), schema_version, ); infer_schema = Schema::new(new_infer_schema.fields().clone()); @@ -194,7 +200,7 @@ impl EventFormat for Event { .collect() }; - if flattened + if data .iter() .any(|value| fields_mismatch(&schema, value, schema_version)) { @@ -205,7 +211,7 @@ impl EventFormat for Event { let schema = Self::prepare_and_validate_schema(schema, stored_schema, static_schema_flag)?; - Ok((flattened, schema, is_first)) + Ok((schema, is_first)) } // Convert the Data type (defined above) to arrow record batch @@ -231,19 +237,24 @@ impl EventFormat for Event { let static_schema_flag = stream.get_static_schema_flag(); let custom_partitions = stream.get_custom_partition(); let schema_version = stream.get_schema_version(); - let storage_schema = stream.get_schema_raw(); + let stored_schema = stream.get_schema_raw(); let stream_type = stream.get_stream_type(); let p_timestamp = self.p_timestamp; let origin_size = self.origin_size; - let (data, schema, 
is_first_event) = self.to_data( - static_schema_flag, - &storage_schema, + let data = self.to_data( time_partition.as_ref(), time_partition_limit, custom_partitions.as_ref(), schema_version, )?; + let (schema, is_first_event) = Self::infer_schema( + &data, + &stored_schema, + time_partition.as_ref(), + static_schema_flag, + schema_version, + )?; let mut partitions = HashMap::new(); for json in data { @@ -500,9 +511,11 @@ mod tests { }); let store_schema = HashMap::default(); - let (data, schema, _) = Event::new(json, 0 /* doesn't matter */, LogSource::Json) - .to_data(false, &store_schema, None, None, None, SchemaVersion::V0) + let data = Event::new(json, 0 /* doesn't matter */, LogSource::Json) + .to_data(None, None, None, SchemaVersion::V0) .unwrap(); + let (schema, _) = + Event::infer_schema(&data, &store_schema, None, false, SchemaVersion::V0).unwrap(); let rb = Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); @@ -531,9 +544,11 @@ mod tests { }); let store_schema = HashMap::default(); - let (data, schema, _) = Event::new(json, 0 /* doesn't matter */, LogSource::Json) - .to_data(false, &store_schema, None, None, None, SchemaVersion::V0) + let data = Event::new(json, 0 /* doesn't matter */, LogSource::Json) + .to_data(None, None, None, SchemaVersion::V0) .unwrap(); + let (schema, _) = + Event::infer_schema(&data, &store_schema, None, false, SchemaVersion::V0).unwrap(); let rb = Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); @@ -564,9 +579,11 @@ mod tests { ] .into_iter(), ); - let (data, schema, _) = Event::new(json, 0 /* doesn't matter */, LogSource::Json) - .to_data(false, &store_schema, None, None, None, SchemaVersion::V0) + let data = Event::new(json, 0 /* doesn't matter */, LogSource::Json) + .to_data(None, None, None, SchemaVersion::V0) .unwrap(); + let (schema, _) = + Event::infer_schema(&data, &store_schema, None, false, SchemaVersion::V0).unwrap(); let rb = Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); @@ -598,9 +615,11 @@ mod tests { .into_iter(), ); - assert!(Event::new(json, 0 /* doesn't matter */, LogSource::Json) - .to_data(false, &store_schema, None, None, None, SchemaVersion::V0,) - .is_err()); + let data = Event::new(json, 0 /* doesn't matter */, LogSource::Json) + .to_data(None, None, None, SchemaVersion::V0) + .unwrap(); + + assert!(Event::infer_schema(&data, &store_schema, None, false, SchemaVersion::V0).is_err()); } #[test] @@ -616,9 +635,11 @@ mod tests { .into_iter(), ); - let (data, schema, _) = Event::new(json, 0 /* doesn't matter */, LogSource::Json) - .to_data(false, &store_schema, None, None, None, SchemaVersion::V0) + let data = Event::new(json, 0 /* doesn't matter */, LogSource::Json) + .to_data(None, None, None, SchemaVersion::V0) .unwrap(); + let (schema, _) = + Event::infer_schema(&data, &store_schema, None, false, SchemaVersion::V0).unwrap(); let rb = Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); @@ -645,9 +666,11 @@ mod tests { ]); let store_schema = HashMap::new(); - let (data, schema, _) = Event::new(json, 0 /* doesn't matter */, LogSource::Json) - .to_data(false, &store_schema, None, None, None, SchemaVersion::V0) + let data = Event::new(json, 0 /* doesn't matter */, LogSource::Json) + .to_data(None, None, None, SchemaVersion::V0) .unwrap(); + let (schema, _) = + Event::infer_schema(&data, &store_schema, None, false, SchemaVersion::V0).unwrap(); let rb = Event::into_recordbatch(Utc::now(), data, 
&schema, None, SchemaVersion::V0).unwrap(); @@ -696,9 +719,11 @@ mod tests { ]); let store_schema = HashMap::new(); - let (data, schema, _) = Event::new(json, 0 /* doesn't matter */, LogSource::Json) - .to_data(false, &store_schema, None, None, None, SchemaVersion::V0) + let data = Event::new(json, 0 /* doesn't matter */, LogSource::Json) + .to_data(None, None, None, SchemaVersion::V0) .unwrap(); + let (schema, _) = + Event::infer_schema(&data, &store_schema, None, false, SchemaVersion::V0).unwrap(); let rb = Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); @@ -746,9 +771,11 @@ mod tests { ] .into_iter(), ); - let (data, schema, _) = Event::new(json, 0 /* doesn't matter */, LogSource::Json) - .to_data(false, &store_schema, None, None, None, SchemaVersion::V0) + let data = Event::new(json, 0 /* doesn't matter */, LogSource::Json) + .to_data(None, None, None, SchemaVersion::V0) .unwrap(); + let (schema, _) = + Event::infer_schema(&data, &store_schema, None, false, SchemaVersion::V0).unwrap(); let rb = Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); @@ -797,9 +824,11 @@ mod tests { .into_iter(), ); - assert!(Event::new(json, 0 /* doesn't matter */, LogSource::Json) - .to_data(false, &store_schema, None, None, None, SchemaVersion::V0,) - .is_err()); + let data = Event::new(json, 0 /* doesn't matter */, LogSource::Json) + .to_data(None, None, None, SchemaVersion::V0) + .unwrap(); + + assert!(Event::infer_schema(&data, &store_schema, None, false, SchemaVersion::V0).is_err()); } #[test] @@ -827,9 +856,11 @@ mod tests { ]); let store_schema = HashMap::new(); - let (data, schema, _) = Event::new(json, 0 /* doesn't matter */, LogSource::Json) - .to_data(false, &store_schema, None, None, None, SchemaVersion::V0) + let data = Event::new(json, 0 /* doesn't matter */, LogSource::Json) + .to_data(None, None, None, SchemaVersion::V0) .unwrap(); + let (schema, _) = + Event::infer_schema(&data, &store_schema, None, false, SchemaVersion::V0).unwrap(); let rb = Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); @@ -903,9 +934,11 @@ mod tests { ]); let store_schema = HashMap::new(); - let (data, schema, _) = Event::new(json, 0 /* doesn't matter */, LogSource::Json) - .to_data(false, &store_schema, None, None, None, SchemaVersion::V1) + let data = Event::new(json, 0 /* doesn't matter */, LogSource::Json) + .to_data(None, None, None, SchemaVersion::V1) .unwrap(); + let (schema, _) = + Event::infer_schema(&data, &store_schema, None, false, SchemaVersion::V1).unwrap(); let rb = Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V1).unwrap(); diff --git a/src/event/format/mod.rs b/src/event/format/mod.rs index 5459e035c..5590c4c3d 100644 --- a/src/event/format/mod.rs +++ b/src/event/format/mod.rs @@ -96,6 +96,8 @@ impl Display for LogSource { } } +pub type IsFirstEvent = bool; + // Global Trait for event format // This trait is implemented by all the event formats pub trait EventFormat: Sized { @@ -103,13 +105,19 @@ pub trait EventFormat: Sized { fn to_data( self, - static_schema_flag: bool, - stored_schema: &HashMap>, time_partition: Option<&String>, time_partition_limit: Option, custom_partitions: Option<&String>, schema_version: SchemaVersion, - ) -> Result<(Self::Data, EventSchema, bool), AnyError>; + ) -> anyhow::Result; + + fn infer_schema( + data: &Self::Data, + stored_schema: &HashMap>, + time_partition: Option<&String>, + static_schema_flag: bool, + schema_version: 
SchemaVersion, + ) -> anyhow::Result<(EventSchema, IsFirstEvent)>; fn decode(data: Self::Data, schema: Arc) -> Result; From da5e16c016772578c2823dc9127ff018b249cfa1 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Tue, 4 Mar 2025 00:35:00 +0530 Subject: [PATCH 30/39] style: `anyhow::Result` --- src/event/format/mod.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/event/format/mod.rs b/src/event/format/mod.rs index 5590c4c3d..326969fd3 100644 --- a/src/event/format/mod.rs +++ b/src/event/format/mod.rs @@ -24,7 +24,7 @@ use std::{ sync::Arc, }; -use anyhow::{anyhow, Error as AnyError}; +use anyhow::anyhow; use arrow_array::RecordBatch; use arrow_schema::{DataType, Field, Schema, TimeUnit}; use chrono::{DateTime, Utc}; @@ -119,14 +119,14 @@ pub trait EventFormat: Sized { schema_version: SchemaVersion, ) -> anyhow::Result<(EventSchema, IsFirstEvent)>; - fn decode(data: Self::Data, schema: Arc) -> Result; + fn decode(data: Self::Data, schema: Arc) -> anyhow::Result; /// Updates inferred schema with `p_timestamp` field and ensures it adheres to expectations fn prepare_and_validate_schema( mut schema: EventSchema, storage_schema: &HashMap>, static_schema_flag: bool, - ) -> Result { + ) -> anyhow::Result { if get_field(&schema, DEFAULT_TIMESTAMP_KEY).is_some() { return Err(anyhow!("field {DEFAULT_TIMESTAMP_KEY} is a reserved field",)); } @@ -160,7 +160,7 @@ pub trait EventFormat: Sized { schema: &EventSchema, time_partition: Option<&String>, schema_version: SchemaVersion, - ) -> Result { + ) -> anyhow::Result { // prepare the record batch and new fields to be added let mut new_schema = Arc::new(Schema::new(schema.clone())); new_schema = @@ -176,7 +176,7 @@ pub trait EventFormat: Sized { Ok(rb) } - fn into_event(self, stream: &Stream) -> Result; + fn into_event(self, stream: &Stream) -> anyhow::Result; } pub fn get_existing_field_names( From 9b0d865560864cad2c1b17dd3ffac64eaf7992c3 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Tue, 4 Mar 2025 00:36:48 +0530 Subject: [PATCH 31/39] fix: rb per object --- src/event/format/json.rs | 129 ++++++++++++++++++--------------------- src/event/format/mod.rs | 16 +++-- src/event/mod.rs | 66 ++++++++++---------- 3 files changed, 96 insertions(+), 115 deletions(-) diff --git a/src/event/format/json.rs b/src/event/format/json.rs index 69584e1f7..381a76a9d 100644 --- a/src/event/format/json.rs +++ b/src/event/format/json.rs @@ -134,7 +134,7 @@ impl Event { } impl EventFormat for Event { - type Data = Vec; + type Data = Json; // convert the incoming json to a vector of json values // also extract the arrow schema, tags and metadata from the incoming json @@ -144,7 +144,7 @@ impl EventFormat for Event { time_partition_limit: Option, custom_partitions: Option<&String>, schema_version: SchemaVersion, - ) -> anyhow::Result { + ) -> anyhow::Result> { self.flatten_logs( time_partition, time_partition_limit, @@ -161,17 +161,18 @@ impl EventFormat for Event { schema_version: SchemaVersion, ) -> anyhow::Result<(super::EventSchema, bool)> { // collect all the keys from all the json objects in the request body - let fields = collect_keys(data.iter()); + let fields = collect_keys(data); let mut is_first = false; let schema = if let Some(schema) = derive_arrow_schema(stored_schema, fields) { schema } else { // TODO: - let mut infer_schema = infer_json_schema_from_iterator( - data.iter().map(|obj| Ok(Value::Object(obj.clone()))), - ) - .map_err(|err| anyhow!("Could not infer schema for this event due to err {:?}", err))?; + let mut 
infer_schema = + infer_json_schema_from_iterator([Ok(Value::Object(data.clone()))].into_iter()) + .map_err(|err| { + anyhow!("Could not infer schema for this event due to err {:?}", err) + })?; let new_infer_schema = super::update_field_type_in_schema( Arc::new(infer_schema), Some(stored_schema), @@ -200,10 +201,7 @@ impl EventFormat for Event { .collect() }; - if data - .iter() - .any(|value| fields_mismatch(&schema, value, schema_version)) - { + if fields_mismatch(&schema, data, schema_version) { return Err(anyhow!( "Could not process this event due to mismatch in datatype" )); @@ -215,14 +213,14 @@ impl EventFormat for Event { } // Convert the Data type (defined above) to arrow record batch - fn decode(data: Self::Data, schema: Arc) -> anyhow::Result { + fn decode(data: &[Self::Data], schema: Arc) -> anyhow::Result { let array_capacity = round_upto_multiple_of_64(data.len()); let mut reader = ReaderBuilder::new(schema) .with_batch_size(array_capacity) .with_coerce_primitive(false) .build_decoder()?; - reader.serialize(&data)?; + reader.serialize(data)?; match reader.flush() { Ok(Some(recordbatch)) => Ok(recordbatch), Err(err) => Err(anyhow!("Failed to create recordbatch due to {:?}", err)), @@ -248,16 +246,18 @@ impl EventFormat for Event { custom_partitions.as_ref(), schema_version, )?; - let (schema, is_first_event) = Self::infer_schema( - &data, - &stored_schema, - time_partition.as_ref(), - static_schema_flag, - schema_version, - )?; + let mut is_first_event = false; let mut partitions = HashMap::new(); for json in data { + let (schema, is_first) = Self::infer_schema( + &json, + &stored_schema, + time_partition.as_ref(), + static_schema_flag, + schema_version, + )?; + is_first_event = is_first_event || is_first; let custom_partition_values = match custom_partitions.as_ref() { Some(custom_partitions) => { let custom_partitions = custom_partitions.split(',').collect_vec(); @@ -273,7 +273,7 @@ impl EventFormat for Event { let batch = Self::into_recordbatch( p_timestamp, - vec![json], + &[json], &schema, time_partition.as_ref(), schema_version, @@ -368,15 +368,8 @@ fn derive_arrow_schema( // Returns a list of keys that are present in the given iterable of JSON objects // Returns None if even one of the value is not an Object -fn collect_keys<'a>(objects: impl Iterator) -> HashSet<&'a str> { - let mut keys = HashSet::new(); - for object in objects { - for key in object.keys() { - keys.insert(key.as_str()); - } - } - - keys +fn collect_keys(object: &Json) -> HashSet<&str> { + object.keys().map(|k| k.as_str()).collect() } // Returns true when the field doesn't exist in schema or has an invalid type @@ -515,9 +508,9 @@ mod tests { .to_data(None, None, None, SchemaVersion::V0) .unwrap(); let (schema, _) = - Event::infer_schema(&data, &store_schema, None, false, SchemaVersion::V0).unwrap(); + Event::infer_schema(&data[0], &store_schema, None, false, SchemaVersion::V0).unwrap(); let rb = - Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); + Event::into_recordbatch(Utc::now(), &data, &schema, None, SchemaVersion::V0).unwrap(); assert_eq!(rb.num_rows(), 1); assert_eq!(rb.num_columns(), 4); @@ -548,9 +541,9 @@ mod tests { .to_data(None, None, None, SchemaVersion::V0) .unwrap(); let (schema, _) = - Event::infer_schema(&data, &store_schema, None, false, SchemaVersion::V0).unwrap(); + Event::infer_schema(&data[0], &store_schema, None, false, SchemaVersion::V0).unwrap(); let rb = - Event::into_recordbatch(Utc::now(), data, &schema, None, 
SchemaVersion::V0).unwrap(); + Event::into_recordbatch(Utc::now(), &data, &schema, None, SchemaVersion::V0).unwrap(); assert_eq!(rb.num_rows(), 1); assert_eq!(rb.num_columns(), 3); @@ -583,9 +576,9 @@ mod tests { .to_data(None, None, None, SchemaVersion::V0) .unwrap(); let (schema, _) = - Event::infer_schema(&data, &store_schema, None, false, SchemaVersion::V0).unwrap(); + Event::infer_schema(&data[0], &store_schema, None, false, SchemaVersion::V0).unwrap(); let rb = - Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); + Event::into_recordbatch(Utc::now(), &data, &schema, None, SchemaVersion::V0).unwrap(); assert_eq!(rb.num_rows(), 1); assert_eq!(rb.num_columns(), 3); @@ -619,7 +612,9 @@ mod tests { .to_data(None, None, None, SchemaVersion::V0) .unwrap(); - assert!(Event::infer_schema(&data, &store_schema, None, false, SchemaVersion::V0).is_err()); + assert!( + Event::infer_schema(&data[0], &store_schema, None, false, SchemaVersion::V0).is_err() + ); } #[test] @@ -639,9 +634,9 @@ mod tests { .to_data(None, None, None, SchemaVersion::V0) .unwrap(); let (schema, _) = - Event::infer_schema(&data, &store_schema, None, false, SchemaVersion::V0).unwrap(); + Event::infer_schema(&data[0], &store_schema, None, false, SchemaVersion::V0).unwrap(); let rb = - Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); + Event::into_recordbatch(Utc::now(), &data, &schema, None, SchemaVersion::V0).unwrap(); assert_eq!(rb.num_rows(), 1); assert_eq!(rb.num_columns(), 1); @@ -670,9 +665,9 @@ mod tests { .to_data(None, None, None, SchemaVersion::V0) .unwrap(); let (schema, _) = - Event::infer_schema(&data, &store_schema, None, false, SchemaVersion::V0).unwrap(); + Event::infer_schema(&data[1], &store_schema, None, false, SchemaVersion::V0).unwrap(); let rb = - Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); + Event::into_recordbatch(Utc::now(), &data, &schema, None, SchemaVersion::V0).unwrap(); assert_eq!(rb.num_rows(), 3); assert_eq!(rb.num_columns(), 4); @@ -723,9 +718,9 @@ mod tests { .to_data(None, None, None, SchemaVersion::V0) .unwrap(); let (schema, _) = - Event::infer_schema(&data, &store_schema, None, false, SchemaVersion::V0).unwrap(); + Event::infer_schema(&data[1], &store_schema, None, false, SchemaVersion::V0).unwrap(); let rb = - Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); + Event::into_recordbatch(Utc::now(), &data, &schema, None, SchemaVersion::V0).unwrap(); assert_eq!(rb.num_rows(), 3); assert_eq!(rb.num_columns(), 4); @@ -775,9 +770,9 @@ mod tests { .to_data(None, None, None, SchemaVersion::V0) .unwrap(); let (schema, _) = - Event::infer_schema(&data, &store_schema, None, false, SchemaVersion::V0).unwrap(); + Event::infer_schema(&data[0], &store_schema, None, false, SchemaVersion::V0).unwrap(); let rb = - Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); + Event::into_recordbatch(Utc::now(), &data, &schema, None, SchemaVersion::V0).unwrap(); assert_eq!(rb.num_rows(), 3); assert_eq!(rb.num_columns(), 4); @@ -797,23 +792,12 @@ mod tests { #[test] fn arr_schema_mismatch() { - let json = json!([ - { - "a": null, - "b": "hello", - "c": 1.24 - }, - { - "a": 1, - "b": "hello", - "c": 1 - }, - { - "a": 1, - "b": "hello", - "c": null - }, - ]); + let json = json!( + { + "a": 1, + "b": "hello", + "c": 1 + }); let store_schema = fields_to_map( [ @@ -824,11 +808,14 @@ mod tests { .into_iter(), ); - let data = Event::new(json, 0 
/* doesn't matter */, LogSource::Json) - .to_data(None, None, None, SchemaVersion::V0) - .unwrap(); - - assert!(Event::infer_schema(&data, &store_schema, None, false, SchemaVersion::V0).is_err()); + assert!(Event::infer_schema( + json.as_object().unwrap(), + &store_schema, + None, + false, + SchemaVersion::V0 + ) + .is_err()); } #[test] @@ -860,9 +847,9 @@ mod tests { .to_data(None, None, None, SchemaVersion::V0) .unwrap(); let (schema, _) = - Event::infer_schema(&data, &store_schema, None, false, SchemaVersion::V0).unwrap(); + Event::infer_schema(&data[3], &store_schema, None, false, SchemaVersion::V0).unwrap(); let rb = - Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); + Event::into_recordbatch(Utc::now(), &data, &schema, None, SchemaVersion::V0).unwrap(); assert_eq!(rb.num_rows(), 4); assert_eq!(rb.num_columns(), 5); @@ -938,9 +925,9 @@ mod tests { .to_data(None, None, None, SchemaVersion::V1) .unwrap(); let (schema, _) = - Event::infer_schema(&data, &store_schema, None, false, SchemaVersion::V1).unwrap(); + Event::infer_schema(&data[3], &store_schema, None, false, SchemaVersion::V1).unwrap(); let rb = - Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V1).unwrap(); + Event::into_recordbatch(Utc::now(), &data, &schema, None, SchemaVersion::V1).unwrap(); assert_eq!(rb.num_rows(), 4); assert_eq!(rb.num_columns(), 5); diff --git a/src/event/format/mod.rs b/src/event/format/mod.rs index 326969fd3..94b60662e 100644 --- a/src/event/format/mod.rs +++ b/src/event/format/mod.rs @@ -109,7 +109,7 @@ pub trait EventFormat: Sized { time_partition_limit: Option, custom_partitions: Option<&String>, schema_version: SchemaVersion, - ) -> anyhow::Result; + ) -> anyhow::Result>; fn infer_schema( data: &Self::Data, @@ -119,7 +119,7 @@ pub trait EventFormat: Sized { schema_version: SchemaVersion, ) -> anyhow::Result<(EventSchema, IsFirstEvent)>; - fn decode(data: Self::Data, schema: Arc) -> anyhow::Result; + fn decode(data: &[Self::Data], schema: Arc) -> anyhow::Result; /// Updates inferred schema with `p_timestamp` field and ensures it adheres to expectations fn prepare_and_validate_schema( @@ -156,7 +156,7 @@ pub trait EventFormat: Sized { fn into_recordbatch( p_timestamp: DateTime, - data: Self::Data, + data: &[Self::Data], schema: &EventSchema, time_partition: Option<&String>, schema_version: SchemaVersion, @@ -234,7 +234,7 @@ pub fn update_field_type_in_schema( inferred_schema: Arc, existing_schema: Option<&HashMap>>, time_partition: Option<&String>, - log_records: Option<&[Json]>, + log_records: Option<&Json>, schema_version: SchemaVersion, ) -> Arc { let mut updated_schema = inferred_schema.clone(); @@ -245,11 +245,9 @@ pub fn update_field_type_in_schema( updated_schema = override_existing_timestamp_fields(existing_schema, updated_schema); } - if let Some(log_records) = log_records { - for log_record in log_records { - updated_schema = - override_data_type(updated_schema.clone(), log_record.clone(), schema_version); - } + if let Some(log_record) = log_records { + updated_schema = + override_data_type(updated_schema.clone(), log_record.clone(), schema_version); } let Some(time_partition) = time_partition else { diff --git a/src/event/mod.rs b/src/event/mod.rs index f0297f61c..bdfe458e8 100644 --- a/src/event/mod.rs +++ b/src/event/mod.rs @@ -19,23 +19,19 @@ pub mod format; -use arrow::compute::concat_batches; use arrow_array::RecordBatch; use arrow_schema::{Field, Schema}; use itertools::Itertools; use std::sync::Arc; use 
self::error::EventError; -use crate::{ - metadata::update_stats, - parseable::{StagingError, Stream}, - storage::StreamType, -}; +use crate::{metadata::update_stats, parseable::Stream, storage::StreamType}; use chrono::NaiveDateTime; use std::collections::HashMap; pub const DEFAULT_TIMESTAMP_KEY: &str = "p_timestamp"; +#[derive(Debug)] pub struct PartitionEvent { pub rbs: Vec, pub schema: Arc, @@ -43,6 +39,7 @@ pub struct PartitionEvent { pub custom_partition_values: HashMap, } +#[derive(Debug)] pub struct Event { pub origin_format: &'static str, pub origin_size: usize, @@ -56,44 +53,43 @@ pub struct Event { impl Event { pub fn process(self, stream: &Stream) -> Result<(), EventError> { for (key, partition) in self.partitions { - let rb = - concat_batches(&partition.schema, &partition.rbs).map_err(StagingError::Arrow)?; if self.is_first_event { stream.commit_schema(partition.schema.as_ref().clone())?; } - - stream.push( - &key, - &rb, - partition.parsed_timestamp, - &partition.custom_partition_values, - self.stream_type, - )?; - - update_stats( - &stream.stream_name, - self.origin_format, - self.origin_size, - rb.num_rows(), - partition.parsed_timestamp.date(), - ); - - crate::livetail::LIVETAIL.process(&stream.stream_name, &rb); + for rb in partition.rbs { + stream.push( + &key, + &rb, + partition.parsed_timestamp, + &partition.custom_partition_values, + self.stream_type, + )?; + + update_stats( + &stream.stream_name, + self.origin_format, + self.origin_size, + rb.num_rows(), + partition.parsed_timestamp.date(), + ); + + crate::livetail::LIVETAIL.process(&stream.stream_name, &rb); + } } Ok(()) } pub fn process_unchecked(&self, stream: &Stream) -> Result<(), EventError> { for (key, partition) in &self.partitions { - let rb = - concat_batches(&partition.schema, &partition.rbs).map_err(StagingError::Arrow)?; - stream.push( - key, - &rb, - partition.parsed_timestamp, - &partition.custom_partition_values, - self.stream_type, - )?; + for rb in &partition.rbs { + stream.push( + key, + rb, + partition.parsed_timestamp, + &partition.custom_partition_values, + self.stream_type, + )?; + } } Ok(()) From d096ce0b82aefefa47902d5f458aeb7647ed7617 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Wed, 5 Mar 2025 00:43:32 +0530 Subject: [PATCH 32/39] perf: partition at json level --- src/event/format/json.rs | 82 ++++++++++++++++++---------- src/event/mod.rs | 52 ++++++------------ src/handlers/http/ingest.rs | 13 ++--- src/parseable/streams.rs | 105 +++++++++--------------------------- 4 files changed, 100 insertions(+), 152 deletions(-) diff --git a/src/event/format/json.rs b/src/event/format/json.rs index 381a76a9d..8c1403962 100644 --- a/src/event/format/json.rs +++ b/src/event/format/json.rs @@ -23,7 +23,7 @@ use anyhow::anyhow; use arrow_array::RecordBatch; use arrow_json::reader::{infer_json_schema_from_iterator, ReaderBuilder}; use arrow_schema::{DataType, Field, Fields, Schema}; -use chrono::{DateTime, NaiveDateTime, Utc}; +use chrono::{DateTime, NaiveDate, NaiveDateTime, Utc}; use datafusion::arrow::util::bit_util::round_upto_multiple_of_64; use itertools::Itertools; use opentelemetry_proto::tonic::{ @@ -47,9 +47,17 @@ use crate::{ utils::{ arrow::get_field, json::{flatten_json_body, Json}, + time::Minute, }, + OBJECT_STORE_DATA_GRANULARITY, }; +struct JsonPartition { + batch: Vec, + schema: Vec>, + date: NaiveDate, +} + pub struct Event { pub json: Value, pub origin_size: usize, @@ -248,7 +256,7 @@ impl EventFormat for Event { )?; let mut is_first_event = false; - let mut partitions = 
HashMap::new(); + let mut json_partitions = HashMap::new(); for json in data { let (schema, is_first) = Self::infer_schema( &json, @@ -257,6 +265,7 @@ impl EventFormat for Event { static_schema_flag, schema_version, )?; + is_first_event = is_first_event || is_first; let custom_partition_values = match custom_partitions.as_ref() { Some(custom_partitions) => { @@ -267,45 +276,60 @@ impl EventFormat for Event { }; let parsed_timestamp = match time_partition.as_ref() { - Some(time_partition) => extract_and_parse_time(&json, time_partition.as_ref())?, + Some(time_partition) => extract_and_parse_time(&json, time_partition)?, _ => p_timestamp.naive_utc(), }; - let batch = Self::into_recordbatch( - p_timestamp, - &[json], - &schema, - time_partition.as_ref(), - schema_version, - )?; - - let schema = batch.schema(); - let mut key = get_schema_key(&schema.fields); - if time_partition.is_some() { - let parsed_timestamp_to_min = parsed_timestamp.format("%Y%m%dT%H%M").to_string(); - key.push_str(&parsed_timestamp_to_min); - } - - for (k, v) in custom_partition_values.iter().sorted_by_key(|v| v.0) { - key.push_str(&format!("&{k}={v}")); - } + let prefix = format!( + "{}.{}.minute={}.{}", + get_schema_key(&schema), + parsed_timestamp.format("date=%Y-%m-%d.hour=%H"), + Minute::from(parsed_timestamp).to_slot(OBJECT_STORE_DATA_GRANULARITY), + custom_partition_values + .iter() + .sorted_by_key(|v| v.0) + .map(|(key, value)| format!("{key}={value}.")) + .join("") + ); - match partitions.get_mut(&key) { - Some(PartitionEvent { rbs, .. }) => rbs.push(batch), + match json_partitions.get_mut(&prefix) { + Some(JsonPartition { batch, .. }) => batch.push(json), _ => { - partitions.insert( - key, - PartitionEvent { - rbs: vec![batch], + let date = parsed_timestamp.date(); + let batch = vec![json]; + json_partitions.insert( + prefix, + JsonPartition { + batch, schema, - parsed_timestamp, - custom_partition_values, + date, }, ); } } } + let mut partitions = HashMap::new(); + for ( + prefix, + JsonPartition { + batch, + schema, + date, + }, + ) in json_partitions + { + let batch = Self::into_recordbatch( + p_timestamp, + &batch, + &schema, + time_partition.as_ref(), + schema_version, + )?; + + partitions.insert(prefix, PartitionEvent { rb: batch, date }); + } + Ok(super::Event { origin_format: "json", origin_size, diff --git a/src/event/mod.rs b/src/event/mod.rs index bdfe458e8..6f3918a01 100644 --- a/src/event/mod.rs +++ b/src/event/mod.rs @@ -20,23 +20,21 @@ pub mod format; use arrow_array::RecordBatch; -use arrow_schema::{Field, Schema}; +use arrow_schema::Field; use itertools::Itertools; use std::sync::Arc; use self::error::EventError; use crate::{metadata::update_stats, parseable::Stream, storage::StreamType}; -use chrono::NaiveDateTime; +use chrono::NaiveDate; use std::collections::HashMap; pub const DEFAULT_TIMESTAMP_KEY: &str = "p_timestamp"; #[derive(Debug)] pub struct PartitionEvent { - pub rbs: Vec, - pub schema: Arc, - pub parsed_timestamp: NaiveDateTime, - pub custom_partition_values: HashMap, + pub rb: RecordBatch, + pub date: NaiveDate, } #[derive(Debug)] @@ -52,44 +50,28 @@ pub struct Event { // Events holds the schema related to a each event for a single log stream impl Event { pub fn process(self, stream: &Stream) -> Result<(), EventError> { - for (key, partition) in self.partitions { + for (prefix, PartitionEvent { rb, date }) in self.partitions { if self.is_first_event { - stream.commit_schema(partition.schema.as_ref().clone())?; + stream.commit_schema(rb.schema().as_ref().clone())?; } - for rb in 
partition.rbs { - stream.push( - &key, - &rb, - partition.parsed_timestamp, - &partition.custom_partition_values, - self.stream_type, - )?; + stream.push(&prefix, &rb, self.stream_type)?; - update_stats( - &stream.stream_name, - self.origin_format, - self.origin_size, - rb.num_rows(), - partition.parsed_timestamp.date(), - ); + update_stats( + &stream.stream_name, + self.origin_format, + self.origin_size, + rb.num_rows(), + date, + ); - crate::livetail::LIVETAIL.process(&stream.stream_name, &rb); - } + crate::livetail::LIVETAIL.process(&stream.stream_name, &rb); } Ok(()) } pub fn process_unchecked(&self, stream: &Stream) -> Result<(), EventError> { - for (key, partition) in &self.partitions { - for rb in &partition.rbs { - stream.push( - key, - rb, - partition.parsed_timestamp, - &partition.custom_partition_values, - self.stream_type, - )?; - } + for (prefix, partition) in &self.partitions { + stream.push(prefix, &partition.rb, self.stream_type)?; } Ok(()) diff --git a/src/handlers/http/ingest.rs b/src/handlers/http/ingest.rs index d96236322..7a9897993 100644 --- a/src/handlers/http/ingest.rs +++ b/src/handlers/http/ingest.rs @@ -16,8 +16,6 @@ * */ -use std::collections::HashMap; - use actix_web::web::Path; use actix_web::{http::header::ContentType, HttpRequest, HttpResponse}; use arrow_array::RecordBatch; @@ -236,22 +234,19 @@ pub async fn post_event( } pub async fn push_logs_unchecked( - batch: RecordBatch, + rb: RecordBatch, stream: &Stream, ) -> Result { - let schema = batch.schema(); let unchecked_event = event::Event { origin_format: "json", origin_size: 0, time_partition: None, is_first_event: true, // NOTE: Maybe should be false partitions: [( - get_schema_key(&schema.fields), + get_schema_key(&rb.schema().fields), PartitionEvent { - rbs: vec![batch], - schema, - parsed_timestamp: Utc::now().naive_utc(), - custom_partition_values: HashMap::new(), // should be an empty map for unchecked push + rb, + date: Utc::now().date_naive(), }, )] .into_iter() diff --git a/src/parseable/streams.rs b/src/parseable/streams.rs index ab83fb609..314771714 100644 --- a/src/parseable/streams.rs +++ b/src/parseable/streams.rs @@ -30,7 +30,6 @@ use std::{ use arrow_array::RecordBatch; use arrow_schema::{Field, Fields, Schema}; -use chrono::{NaiveDateTime, Timelike}; use derive_more::{Deref, DerefMut}; use itertools::Itertools; use parquet::{ @@ -52,8 +51,7 @@ use crate::{ metrics, option::Mode, storage::{object_storage::to_bytes, retention::Retention, StreamType}, - utils::time::Minute, - LOCK_EXPECT, OBJECT_STORE_DATA_GRANULARITY, + LOCK_EXPECT, }; use super::{ @@ -112,60 +110,40 @@ impl Stream { // Concatenates record batches and puts them in memory store for each event. 
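// The `prefix` parameter below replaces the former (schema_key, parsed_timestamp,
// custom_partition_values) arguments: it is built in `src/event/format/json.rs`
// from the schema hash, the date/hour/minute slot and any custom partition values,
// and is used verbatim when naming the staged `.data.part` file.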
pub fn push( &self, - schema_key: &str, + prefix: &str, record: &RecordBatch, - parsed_timestamp: NaiveDateTime, - custom_partition_values: &HashMap, stream_type: StreamType, ) -> Result<(), StagingError> { let mut guard = self.writer.lock().unwrap(); if self.options.mode != Mode::Query || stream_type == StreamType::Internal { - match guard.disk.get_mut(schema_key) { + match guard.disk.get_mut(prefix) { Some(writer) => { writer.write(record)?; } None => { // entry is not present thus we create it - let path = self.path_by_current_time( - schema_key, - parsed_timestamp, - custom_partition_values, - ); + let path = self.path_by_current_time(prefix); std::fs::create_dir_all(&self.data_path)?; let mut writer = DiskWriter::new(path, &record.schema())?; writer.write(record)?; - guard.disk.insert(schema_key.to_owned(), writer); + guard.disk.insert(prefix.to_owned(), writer); } }; } - guard.mem.push(schema_key, record); + guard.mem.push(prefix, record); Ok(()) } - pub fn path_by_current_time( - &self, - stream_hash: &str, - parsed_timestamp: NaiveDateTime, - custom_partition_values: &HashMap, - ) -> PathBuf { + pub fn path_by_current_time(&self, prefix: &str) -> PathBuf { let mut hostname = hostname::get().unwrap().into_string().unwrap(); if let Some(id) = &self.ingestor_id { hostname.push_str(id); } - let filename = format!( - "{stream_hash}.date={}.hour={:02}.minute={}.{}{hostname}.data.part", - parsed_timestamp.date(), - parsed_timestamp.hour(), - Minute::from(parsed_timestamp).to_slot(OBJECT_STORE_DATA_GRANULARITY), - custom_partition_values - .iter() - .sorted_by_key(|v| v.0) - .map(|(key, value)| format!("{key}={value}.")) - .join("") - ); + + let filename = format!("{prefix}.{hostname}.data.part",); self.data_path.join(filename) } @@ -766,10 +744,12 @@ mod tests { use arrow_array::{Int32Array, StringArray, TimestampMillisecondArray}; use arrow_schema::{DataType, Field, TimeUnit}; - use chrono::{NaiveDate, TimeDelta, Utc}; + use chrono::{NaiveDate, NaiveDateTime, TimeDelta, Utc}; use temp_dir::TempDir; use tokio::time::sleep; + use crate::{utils::time::Minute, OBJECT_STORE_DATA_GRANULARITY}; + use super::*; #[test] @@ -865,41 +845,8 @@ mod tests { } #[test] - fn generate_correct_path_with_current_time_and_no_custom_partitioning() { - let stream_name = "test_stream"; - let stream_hash = "abc123"; - let parsed_timestamp = NaiveDate::from_ymd_opt(2023, 10, 1) - .unwrap() - .and_hms_opt(12, 30, 0) - .unwrap(); - let custom_partition_values = HashMap::new(); - - let options = Options::default(); - let staging = Stream::new( - Arc::new(options), - stream_name, - LogStreamMetadata::default(), - None, - ); - - let expected_path = staging.data_path.join(format!( - "{stream_hash}.date={}.hour={:02}.minute={}.{}.data.part", - parsed_timestamp.date(), - parsed_timestamp.hour(), - Minute::from(parsed_timestamp).to_slot(OBJECT_STORE_DATA_GRANULARITY), - hostname::get().unwrap().into_string().unwrap() - )); - - let generated_path = - staging.path_by_current_time(stream_hash, parsed_timestamp, &custom_partition_values); - - assert_eq!(generated_path, expected_path); - } - - #[test] - fn generate_correct_path_with_current_time_and_custom_partitioning() { + fn generate_correct_path() { let stream_name = "test_stream"; - let stream_hash = "abc123"; let parsed_timestamp = NaiveDate::from_ymd_opt(2023, 10, 1) .unwrap() .and_hms_opt(12, 30, 0) @@ -908,6 +855,12 @@ mod tests { custom_partition_values.insert("key1".to_string(), "value1".to_string()); custom_partition_values.insert("key2".to_string(), 
"value2".to_string()); + let prefix = format!( + "abc123.{}.minute={}.key1=value1.key2=value2", + parsed_timestamp.format("date={%Y-%m-%d}.hour={%H}"), + Minute::from(parsed_timestamp).to_slot(OBJECT_STORE_DATA_GRANULARITY), + ); + let options = Options::default(); let staging = Stream::new( Arc::new(options), @@ -917,15 +870,11 @@ mod tests { ); let expected_path = staging.data_path.join(format!( - "{stream_hash}.date={}.hour={:02}.minute={}.key1=value1.key2=value2.{}.data.part", - parsed_timestamp.date(), - parsed_timestamp.hour(), - Minute::from(parsed_timestamp).to_slot(OBJECT_STORE_DATA_GRANULARITY), + "{prefix}.{}.data.part", hostname::get().unwrap().into_string().unwrap() )); - let generated_path = - staging.path_by_current_time(stream_hash, parsed_timestamp, &custom_partition_values); + let generated_path = staging.path_by_current_time(&prefix); assert_eq!(generated_path, expected_path); } @@ -965,6 +914,10 @@ mod tests { .checked_sub_signed(TimeDelta::minutes(mins)) .unwrap() .naive_utc(); + let prefix = format!( + "abc.{}.key1=value1.key2=value2", + time.format("date=%Y-%m-%d.hour=%H.minute=%M") + ); let batch = RecordBatch::try_new( Arc::new(schema.clone()), vec![ @@ -975,13 +928,7 @@ mod tests { ) .unwrap(); staging - .push( - "abc", - &batch, - time, - &HashMap::new(), - StreamType::UserDefined, - ) + .push(&prefix, &batch, StreamType::UserDefined) .unwrap(); staging.flush(); } From e15b0d2a7d3773ac02cacc89ab76b4ffb41a0f91 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Wed, 5 Mar 2025 00:50:38 +0530 Subject: [PATCH 33/39] style: deepsource suggestion --- src/event/format/json.rs | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/src/event/format/json.rs b/src/event/format/json.rs index 8c1403962..19544d80d 100644 --- a/src/event/format/json.rs +++ b/src/event/format/json.rs @@ -292,20 +292,17 @@ impl EventFormat for Event { .join("") ); - match json_partitions.get_mut(&prefix) { - Some(JsonPartition { batch, .. }) => batch.push(json), - _ => { - let date = parsed_timestamp.date(); - let batch = vec![json]; - json_partitions.insert( - prefix, - JsonPartition { - batch, - schema, - date, - }, - ); - } + if let Some(JsonPartition { batch, .. 
}) = json_partitions.get_mut(&prefix) { + batch.push(json) + } else { + json_partitions.insert( + prefix, + JsonPartition { + batch: vec![json], + schema, + date: parsed_timestamp.date(), + }, + ); } } From ef27f97767a49f1380d9fe1f09525a9281328818 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Mon, 10 Mar 2025 12:44:14 +0530 Subject: [PATCH 34/39] chore: remove unused --- src/handlers/http/cluster/utils.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/handlers/http/cluster/utils.rs b/src/handlers/http/cluster/utils.rs index ae72cf300..9256525c6 100644 --- a/src/handlers/http/cluster/utils.rs +++ b/src/handlers/http/cluster/utils.rs @@ -31,7 +31,6 @@ use actix_web::{ use bytes::BytesMut; use chrono::{DateTime, Utc}; use futures::StreamExt; -use itertools::Itertools; use serde::{de::DeserializeOwned, Deserialize, Serialize}; use tracing::error; use url::Url; From b8606b35be9b44fce1f62f120841d51b80eef74b Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Mon, 10 Mar 2025 13:25:58 +0530 Subject: [PATCH 35/39] fix: custom partitioned file names --- src/event/format/json.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/event/format/json.rs b/src/event/format/json.rs index 19544d80d..7a959717f 100644 --- a/src/event/format/json.rs +++ b/src/event/format/json.rs @@ -288,8 +288,8 @@ impl EventFormat for Event { custom_partition_values .iter() .sorted_by_key(|v| v.0) - .map(|(key, value)| format!("{key}={value}.")) - .join("") + .map(|(key, value)| format!("{key}={value}")) + .join(".") ); if let Some(JsonPartition { batch, .. }) = json_partitions.get_mut(&prefix) { From fbd3a18e5feb6e657c9aa2c01e527b5087fb313d Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Mon, 10 Mar 2025 21:40:50 +0530 Subject: [PATCH 36/39] perf: use a buffer --- src/parseable/staging/writer.rs | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/parseable/staging/writer.rs b/src/parseable/staging/writer.rs index 2dbfe1e49..5e5985aa5 100644 --- a/src/parseable/staging/writer.rs +++ b/src/parseable/staging/writer.rs @@ -18,10 +18,7 @@ */ use std::{ - collections::{HashMap, HashSet}, - fs::{File, OpenOptions}, - path::PathBuf, - sync::Arc, + collections::{HashMap, HashSet}, fs::{File, OpenOptions}, io::BufWriter, path::PathBuf, sync::Arc }; use arrow_array::RecordBatch; @@ -42,7 +39,7 @@ pub struct Writer { } pub struct DiskWriter { - pub inner: StreamWriter, + pub inner: StreamWriter>, pub path: PathBuf, } @@ -50,7 +47,7 @@ impl DiskWriter { pub fn new(path: PathBuf, schema: &Schema) -> Result { let file = OpenOptions::new().create(true).append(true).open(&path)?; - let inner = StreamWriter::try_new(file, schema)?; + let inner = StreamWriter::try_new_buffered(file, schema)?; Ok(Self { inner, path }) } From c2f67691a788c40b8be5d99798abaab0954770db Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Mon, 10 Mar 2025 21:47:42 +0530 Subject: [PATCH 37/39] refactor: drop to flush --- src/parseable/staging/writer.rs | 12 +++++++++++- src/parseable/streams.rs | 17 +++++------------ 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/src/parseable/staging/writer.rs b/src/parseable/staging/writer.rs index 5e5985aa5..1e299068a 100644 --- a/src/parseable/staging/writer.rs +++ b/src/parseable/staging/writer.rs @@ -18,7 +18,11 @@ */ use std::{ - collections::{HashMap, HashSet}, fs::{File, OpenOptions}, io::BufWriter, path::PathBuf, sync::Arc + collections::{HashMap, HashSet}, + fs::{File, OpenOptions}, + io::BufWriter, + path::PathBuf, + sync::Arc, }; use 
arrow_array::RecordBatch; @@ -70,6 +74,12 @@ impl DiskWriter { } } +impl Drop for DiskWriter { + fn drop(&mut self) { + self.finish(); + } +} + /// Structure to keep recordbatches in memory. /// /// Any new schema is updated in the schema map. diff --git a/src/parseable/streams.rs b/src/parseable/streams.rs index 314771714..c367c8b32 100644 --- a/src/parseable/streams.rs +++ b/src/parseable/streams.rs @@ -325,18 +325,11 @@ impl Stream { } pub fn flush(&self) { - let mut disk_writers = { - let mut writer = self.writer.lock().unwrap(); - // Flush memory - writer.mem.clear(); - // Take schema -> disk writer mapping - std::mem::take(&mut writer.disk) - }; - - // Flush disk - for writer in disk_writers.values_mut() { - writer.finish(); - } + let mut writer = self.writer.lock().unwrap(); + // Flush memory + writer.mem.clear(); + // Drop DiskWirters to flush all streams in memory + drop(std::mem::take(&mut writer.disk)) } fn parquet_writer_props( From 592d3a136a9d2f5be2ba508ada42f5a2dd6f1b7b Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Mon, 10 Mar 2025 22:05:42 +0530 Subject: [PATCH 38/39] fix & test: prefix generation --- src/event/format/json.rs | 79 ++++++++++++++++++++++++++++++++++------ 1 file changed, 67 insertions(+), 12 deletions(-) diff --git a/src/event/format/json.rs b/src/event/format/json.rs index 7a959717f..a27ce180b 100644 --- a/src/event/format/json.rs +++ b/src/event/format/json.rs @@ -280,18 +280,7 @@ impl EventFormat for Event { _ => p_timestamp.naive_utc(), }; - let prefix = format!( - "{}.{}.minute={}.{}", - get_schema_key(&schema), - parsed_timestamp.format("date=%Y-%m-%d.hour=%H"), - Minute::from(parsed_timestamp).to_slot(OBJECT_STORE_DATA_GRANULARITY), - custom_partition_values - .iter() - .sorted_by_key(|v| v.0) - .map(|(key, value)| format!("{key}={value}")) - .join(".") - ); - + let prefix = generate_prefix(&schema, parsed_timestamp, &custom_partition_values); if let Some(JsonPartition { batch, .. }) = json_partitions.get_mut(&prefix) { batch.push(json) } else { @@ -338,6 +327,24 @@ impl EventFormat for Event { } } +fn generate_prefix( + schema: &[Arc], + parsed_timestamp: NaiveDateTime, + custom_partition_values: &HashMap, +) -> String { + format!( + "{}.{}.minute={}{}", + get_schema_key(schema), + parsed_timestamp.format("date=%Y-%m-%d.hour=%H"), + Minute::from(parsed_timestamp).to_slot(OBJECT_STORE_DATA_GRANULARITY), + custom_partition_values + .iter() + .sorted_by_key(|v| v.0) + .map(|(key, value)| format!(".{key}={value}")) + .join("") + ) +} + /// Extracts custom partition values from provided JSON object /// e.g. 
`json: {"status": 400, "msg": "Hello, World!"}, custom_partition_list: ["status"]` returns `{"status" => 400}` pub fn extract_custom_partition_values( @@ -463,6 +470,7 @@ mod tests { use arrow::datatypes::Int64Type; use arrow_array::{ArrayRef, Float64Array, Int64Array, ListArray, StringArray}; + use chrono::Timelike; use serde_json::json; use super::*; @@ -976,4 +984,51 @@ mod tests { &Float64Array::from(vec![None, None, None, Some(2.0)]) ); } + + #[test] + fn generate_correct_prefix_with_current_time_and_no_custom_partitioning() { + let schema = vec![]; + let parsed_timestamp = NaiveDate::from_ymd_opt(2023, 10, 1) + .unwrap() + .and_hms_opt(12, 30, 0) + .unwrap(); + let custom_partition_values = HashMap::new(); + + let expected = format!( + "{}.date={}.hour={:02}.minute={}", + get_schema_key(&schema), + parsed_timestamp.date(), + parsed_timestamp.hour(), + Minute::from(parsed_timestamp).to_slot(OBJECT_STORE_DATA_GRANULARITY), + ); + + let generated = generate_prefix(&schema, parsed_timestamp, &custom_partition_values); + + assert_eq!(generated, expected); + } + + #[test] + fn generate_correct_prefix_with_current_time_and_custom_partitioning() { + let schema = vec![]; + let parsed_timestamp = NaiveDate::from_ymd_opt(2023, 10, 1) + .unwrap() + .and_hms_opt(12, 30, 0) + .unwrap(); + let custom_partition_values = HashMap::from_iter([ + ("key1".to_string(), "value1".to_string()), + ("key2".to_string(), "value2".to_string()), + ]); + + let expected = format!( + "{}.date={}.hour={:02}.minute={}.key1=value1.key2=value2", + get_schema_key(&schema), + parsed_timestamp.date(), + parsed_timestamp.hour(), + Minute::from(parsed_timestamp).to_slot(OBJECT_STORE_DATA_GRANULARITY), + ); + + let generated = generate_prefix(&schema, parsed_timestamp, &custom_partition_values); + + assert_eq!(generated, expected); + } } From 3d8c33818aa2e0c516335f79c416e992ea135bf7 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Wed, 19 Mar 2025 00:26:38 +0530 Subject: [PATCH 39/39] spinoff #1251 --- src/event/format/mod.rs | 3 ++- src/utils/arrow/mod.rs | 12 +++++++----- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/event/format/mod.rs b/src/event/format/mod.rs index 40697c4ff..df73475b1 100644 --- a/src/event/format/mod.rs +++ b/src/event/format/mod.rs @@ -187,7 +187,8 @@ pub trait EventFormat: Sized { rb = replace_columns( rb.schema(), &rb, - &[(0, Arc::new(get_timestamp_array(p_timestamp, rb.num_rows())))], + &[0], + &[Arc::new(get_timestamp_array(p_timestamp, rb.num_rows()))], ); Ok(rb) diff --git a/src/utils/arrow/mod.rs b/src/utils/arrow/mod.rs index a11186ee0..53e6437d6 100644 --- a/src/utils/arrow/mod.rs +++ b/src/utils/arrow/mod.rs @@ -61,7 +61,8 @@ use serde_json::{Map, Value}; /// /// * `schema` - The schema of the record batch. /// * `batch` - The record batch to modify. -/// * `indexed_arrays` - A list of indexes and arrays to replace the columns indexed with. +/// * `indexes` - The indexes of the columns to replace. +/// * `arrays` - The new arrays to replace the columns with. 
 ///
 /// # Returns
 ///
 /// The modified record batch with the columns replaced.
 pub fn replace_columns(
     schema: Arc<Schema>,
     batch: &RecordBatch,
-    indexed_arrays: &[(usize, Arc<dyn Array>)],
+    indexes: &[usize],
+    arrays: &[Arc<dyn Array>],
 ) -> RecordBatch {
     let mut batch_arrays = batch.columns().iter().map(Arc::clone).collect_vec();
-    for (index, arr) in indexed_arrays {
-        batch_arrays[*index] = Arc::clone(arr);
+    for (&index, arr) in indexes.iter().zip(arrays.iter()) {
+        batch_arrays[index] = Arc::clone(arr);
     }
     RecordBatch::try_new(schema, batch_arrays).unwrap()
 }
@@ -176,7 +178,7 @@ mod tests {
 
         let arr: Arc<dyn Array> = Arc::new(Int32Array::from_value(0, 3));
 
-        let new_rb = replace_columns(schema_ref.clone(), &rb, &[(2, arr)]);
+        let new_rb = replace_columns(schema_ref.clone(), &rb, &[2], &[arr]);
 
         assert_eq!(new_rb.schema(), schema_ref);
         assert_eq!(new_rb.num_columns(), 3);
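
// A minimal sketch of how the pieces introduced in this series fit together,
// assuming the signatures shown in the diffs above (to_data, infer_schema,
// into_recordbatch, the Stream getters and stream.push). `stream`, `json`,
// `origin_size` and `log_source` are assumed to be in scope, the `use`
// statements from src/event/format/json.rs are elided, and the snippet is
// imagined inside a function returning anyhow::Result<()>. The real
// `into_event` additionally groups objects by prefix before decoding, so one
// RecordBatch is produced per partition rather than per object.
let event = json::Event::new(json, origin_size, log_source);

// 1. Flattening no longer touches the schema: one Json object per log line.
let objects = event.to_data(
    stream.get_time_partition().as_ref(),
    stream.get_time_partition_limit(),
    stream.get_custom_partition().as_ref(),
    stream.get_schema_version(),
)?;

for obj in &objects {
    // 2. The schema is inferred and validated per object against the stored schema.
    let (schema, _is_first_event) = json::Event::infer_schema(
        obj,
        &stream.get_schema_raw(),
        stream.get_time_partition().as_ref(),
        stream.get_static_schema_flag(),
        stream.get_schema_version(),
    )?;

    // 3. Objects are decoded into RecordBatches and staged under their prefix.
    let rb = json::Event::into_recordbatch(
        Utc::now(),
        std::slice::from_ref(obj),
        &schema,
        stream.get_time_partition().as_ref(),
        stream.get_schema_version(),
    )?;
    // ...then pushed via `stream.push(&prefix, &rb, StreamType::UserDefined)`
    // and accounted for in `update_stats`, as in `Event::process`.
}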