From e2a1fc34fd4dad7a7ec4dc01a40656de6801c040 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Fri, 28 Feb 2025 23:55:11 +0530 Subject: [PATCH 01/39] refactor: accept array in `push_logs` --- src/handlers/http/modal/utils/ingest_utils.rs | 95 +++++++++---------- 1 file changed, 44 insertions(+), 51 deletions(-) diff --git a/src/handlers/http/modal/utils/ingest_utils.rs b/src/handlers/http/modal/utils/ingest_utils.rs index 84d5ae117..55e81f0df 100644 --- a/src/handlers/http/modal/utils/ingest_utils.rs +++ b/src/handlers/http/modal/utils/ingest_utils.rs @@ -39,45 +39,36 @@ pub async fn flatten_and_push_logs( stream_name: &str, log_source: &LogSource, ) -> Result<(), PostError> { - match log_source { + let json = match log_source { LogSource::Kinesis => { //custom flattening required for Amazon Kinesis let message: Message = serde_json::from_value(json)?; - for record in flatten_kinesis_logs(message) { - push_logs(stream_name, record, &LogSource::default()).await?; - } + flatten_kinesis_logs(message) } LogSource::OtelLogs => { //custom flattening required for otel logs let logs: LogsData = serde_json::from_value(json)?; - for record in flatten_otel_logs(&logs) { - push_logs(stream_name, record, log_source).await?; - } + flatten_otel_logs(&logs) } LogSource::OtelTraces => { //custom flattening required for otel traces let traces: TracesData = serde_json::from_value(json)?; - for record in flatten_otel_traces(&traces) { - push_logs(stream_name, record, log_source).await?; - } + flatten_otel_traces(&traces) } LogSource::OtelMetrics => { //custom flattening required for otel metrics let metrics: MetricsData = serde_json::from_value(json)?; - for record in flatten_otel_metrics(metrics) { - push_logs(stream_name, record, log_source).await?; - } + flatten_otel_metrics(metrics) } - _ => { - push_logs(stream_name, json, log_source).await?; - } - } + _ => vec![json], + }; + push_logs(stream_name, json, log_source).await?; Ok(()) } async fn push_logs( stream_name: &str, - json: Value, + jsons: Vec, log_source: &LogSource, ) -> Result<(), PostError> { let stream = PARSEABLE.get_stream(stream_name)?; @@ -89,42 +80,44 @@ async fn push_logs( let custom_partition = stream.get_custom_partition(); let schema_version = stream.get_schema_version(); let p_timestamp = Utc::now(); - - let data = if time_partition.is_some() || custom_partition.is_some() { - convert_array_to_object( - json, - time_partition.as_ref(), - time_partition_limit, - custom_partition.as_ref(), - schema_version, - log_source, - )? - } else { - vec![convert_to_array(convert_array_to_object( - json, - None, - None, - None, - schema_version, - log_source, - )?)?] - }; - - for json in data { - let origin_size = serde_json::to_vec(&json).unwrap().len() as u64; // string length need not be the same as byte length - let schema = PARSEABLE.get_stream(stream_name)?.get_schema_raw(); - json::Event { json, p_timestamp } - .into_event( - stream_name.to_owned(), - origin_size, - &schema, - static_schema_flag, - custom_partition.as_ref(), + + for json in jsons { + let data = if time_partition.is_some() || custom_partition.is_some() { + convert_array_to_object( + json, time_partition.as_ref(), + time_partition_limit, + custom_partition.as_ref(), schema_version, - StreamType::UserDefined, + log_source, )? - .process()?; + } else { + vec![convert_to_array(convert_array_to_object( + json, + None, + None, + None, + schema_version, + log_source, + )?)?] 
+ }; + + for json in data { + let origin_size = serde_json::to_vec(&json).unwrap().len() as u64; // string length need not be the same as byte length + let schema = PARSEABLE.get_stream(stream_name)?.get_schema_raw(); + json::Event { json, p_timestamp } + .into_event( + stream_name.to_owned(), + origin_size, + &schema, + static_schema_flag, + custom_partition.as_ref(), + time_partition.as_ref(), + schema_version, + StreamType::UserDefined, + )? + .process()?; + } } Ok(()) } From d04ba905f7cff9334923a3c46179c560c2706c81 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Sat, 1 Mar 2025 01:39:43 +0530 Subject: [PATCH 02/39] move tests to associated module --- src/event/format/json.rs | 448 ++++++++++++++++ src/handlers/http/ingest.rs | 499 ------------------ src/handlers/http/modal/utils/ingest_utils.rs | 2 +- src/parseable/streams.rs | 5 +- src/utils/json/mod.rs | 137 +++++ 5 files changed, 587 insertions(+), 504 deletions(-) diff --git a/src/event/format/json.rs b/src/event/format/json.rs index c28b701de..43c23f5ad 100644 --- a/src/event/format/json.rs +++ b/src/event/format/json.rs @@ -327,6 +327,8 @@ fn valid_type(data_type: &DataType, value: &Value, schema_version: SchemaVersion mod tests { use std::str::FromStr; + use arrow::datatypes::Int64Type; + use arrow_array::{ArrayRef, Float64Array, Int64Array, ListArray, StringArray}; use serde_json::json; use super::*; @@ -355,4 +357,450 @@ mod tests { assert!(parsed.is_err()); } + + trait TestExt { + fn as_int64_arr(&self) -> Option<&Int64Array>; + fn as_float64_arr(&self) -> Option<&Float64Array>; + fn as_utf8_arr(&self) -> Option<&StringArray>; + } + + impl TestExt for ArrayRef { + fn as_int64_arr(&self) -> Option<&Int64Array> { + self.as_any().downcast_ref() + } + + fn as_float64_arr(&self) -> Option<&Float64Array> { + self.as_any().downcast_ref() + } + + fn as_utf8_arr(&self) -> Option<&StringArray> { + self.as_any().downcast_ref() + } + } + + fn fields_to_map(iter: impl Iterator) -> HashMap> { + iter.map(|x| (x.name().clone(), Arc::new(x))).collect() + } + + #[test] + fn basic_object_into_rb() { + let json = json!({ + "c": 4.23, + "a": 1, + "b": "hello", + }); + + let (rb, _) = Event::new(json) + .into_recordbatch(&HashMap::default(), false, None, SchemaVersion::V0) + .unwrap(); + + assert_eq!(rb.num_rows(), 1); + assert_eq!(rb.num_columns(), 4); + assert_eq!( + rb.column_by_name("a").unwrap().as_int64_arr().unwrap(), + &Int64Array::from_iter([1]) + ); + assert_eq!( + rb.column_by_name("b").unwrap().as_utf8_arr().unwrap(), + &StringArray::from_iter_values(["hello"]) + ); + assert_eq!( + rb.column_by_name("c").unwrap().as_float64_arr().unwrap(), + &Float64Array::from_iter([4.23]) + ); + } + + #[test] + fn basic_object_with_null_into_rb() { + let json = json!({ + "a": 1, + "b": "hello", + "c": null + }); + + let (rb, _) = Event::new(json) + .into_recordbatch(&HashMap::default(), false, None, SchemaVersion::V0) + .unwrap(); + + assert_eq!(rb.num_rows(), 1); + assert_eq!(rb.num_columns(), 3); + assert_eq!( + rb.column_by_name("a").unwrap().as_int64_arr().unwrap(), + &Int64Array::from_iter([1]) + ); + assert_eq!( + rb.column_by_name("b").unwrap().as_utf8_arr().unwrap(), + &StringArray::from_iter_values(["hello"]) + ); + } + + #[test] + fn basic_object_derive_schema_into_rb() { + let json = json!({ + "a": 1, + "b": "hello", + }); + + let schema = fields_to_map( + [ + Field::new("a", DataType::Int64, true), + Field::new("b", DataType::Utf8, true), + Field::new("c", DataType::Float64, true), + ] + .into_iter(), + ); + + let (rb, _) = 
Event::new(json) + .into_recordbatch(&schema, false, None, SchemaVersion::V0) + .unwrap(); + + assert_eq!(rb.num_rows(), 1); + assert_eq!(rb.num_columns(), 3); + assert_eq!( + rb.column_by_name("a").unwrap().as_int64_arr().unwrap(), + &Int64Array::from_iter([1]) + ); + assert_eq!( + rb.column_by_name("b").unwrap().as_utf8_arr().unwrap(), + &StringArray::from_iter_values(["hello"]) + ); + } + + #[test] + fn basic_object_schema_mismatch() { + let json = json!({ + "a": 1, + "b": 1, // type mismatch + }); + + let schema = fields_to_map( + [ + Field::new("a", DataType::Int64, true), + Field::new("b", DataType::Utf8, true), + Field::new("c", DataType::Float64, true), + ] + .into_iter(), + ); + + assert!(Event::new(json) + .into_recordbatch(&schema, false, None, SchemaVersion::V0,) + .is_err()); + } + + #[test] + fn empty_object() { + let json = json!({}); + + let schema = fields_to_map( + [ + Field::new("a", DataType::Int64, true), + Field::new("b", DataType::Utf8, true), + Field::new("c", DataType::Float64, true), + ] + .into_iter(), + ); + + let (rb, _) = Event::new(json) + .into_recordbatch(&schema, false, None, SchemaVersion::V0) + .unwrap(); + + assert_eq!(rb.num_rows(), 1); + assert_eq!(rb.num_columns(), 1); + } + + #[test] + fn array_into_recordbatch_inffered_schema() { + let json = json!([ + { + "b": "hello", + }, + { + "b": "hello", + "a": 1, + "c": 1 + }, + { + "a": 1, + "b": "hello", + "c": null + }, + ]); + + let (rb, _) = Event::new(json) + .into_recordbatch(&HashMap::default(), false, None, SchemaVersion::V0) + .unwrap(); + + assert_eq!(rb.num_rows(), 3); + assert_eq!(rb.num_columns(), 4); + + let schema = rb.schema(); + let fields = &schema.fields; + + assert_eq!(&*fields[1], &Field::new("a", DataType::Int64, true)); + assert_eq!(&*fields[2], &Field::new("b", DataType::Utf8, true)); + assert_eq!(&*fields[3], &Field::new("c", DataType::Int64, true)); + + assert_eq!( + rb.column_by_name("a").unwrap().as_int64_arr().unwrap(), + &Int64Array::from(vec![None, Some(1), Some(1)]) + ); + assert_eq!( + rb.column_by_name("b").unwrap().as_utf8_arr().unwrap(), + &StringArray::from(vec![Some("hello"), Some("hello"), Some("hello"),]) + ); + assert_eq!( + rb.column_by_name("c").unwrap().as_int64_arr().unwrap(), + &Int64Array::from(vec![None, Some(1), None]) + ); + } + + #[test] + fn arr_with_null_into_rb() { + let json = json!([ + { + "c": null, + "b": "hello", + "a": null + }, + { + "a": 1, + "c": 1.22, + "b": "hello" + }, + { + "b": "hello", + "a": 1, + "c": null + }, + ]); + + let (rb, _) = Event::new(json) + .into_recordbatch(&HashMap::default(), false, None, SchemaVersion::V0) + .unwrap(); + + assert_eq!(rb.num_rows(), 3); + assert_eq!(rb.num_columns(), 4); + assert_eq!( + rb.column_by_name("a").unwrap().as_int64_arr().unwrap(), + &Int64Array::from(vec![None, Some(1), Some(1)]) + ); + assert_eq!( + rb.column_by_name("b").unwrap().as_utf8_arr().unwrap(), + &StringArray::from(vec![Some("hello"), Some("hello"), Some("hello"),]) + ); + assert_eq!( + rb.column_by_name("c").unwrap().as_float64_arr().unwrap(), + &Float64Array::from(vec![None, Some(1.22), None,]) + ); + } + + #[test] + fn arr_with_null_derive_schema_into_rb() { + let json = json!([ + { + "c": null, + "b": "hello", + "a": null + }, + { + "a": 1, + "c": 1.22, + "b": "hello" + }, + { + "b": "hello", + "a": 1, + "c": null + }, + ]); + + let schema = fields_to_map( + [ + Field::new("a", DataType::Int64, true), + Field::new("b", DataType::Utf8, true), + Field::new("c", DataType::Float64, true), + ] + .into_iter(), + ); + + let (rb, _) = 
Event::new(json) + .into_recordbatch(&schema, false, None, SchemaVersion::V0) + .unwrap(); + + assert_eq!(rb.num_rows(), 3); + assert_eq!(rb.num_columns(), 4); + assert_eq!( + rb.column_by_name("a").unwrap().as_int64_arr().unwrap(), + &Int64Array::from(vec![None, Some(1), Some(1)]) + ); + assert_eq!( + rb.column_by_name("b").unwrap().as_utf8_arr().unwrap(), + &StringArray::from(vec![Some("hello"), Some("hello"), Some("hello"),]) + ); + assert_eq!( + rb.column_by_name("c").unwrap().as_float64_arr().unwrap(), + &Float64Array::from(vec![None, Some(1.22), None,]) + ); + } + + #[test] + fn arr_schema_mismatch() { + let json = json!([ + { + "a": null, + "b": "hello", + "c": 1.24 + }, + { + "a": 1, + "b": "hello", + "c": 1 + }, + { + "a": 1, + "b": "hello", + "c": null + }, + ]); + + let schema = fields_to_map( + [ + Field::new("a", DataType::Int64, true), + Field::new("b", DataType::Utf8, true), + Field::new("c", DataType::Float64, true), + ] + .into_iter(), + ); + + assert!(Event::new(json) + .into_recordbatch(&schema, false, None, SchemaVersion::V0,) + .is_err()); + } + + #[test] + fn arr_obj_with_nested_type() { + let json = json!([ + { + "a": 1, + "b": "hello", + }, + { + "a": 1, + "b": "hello", + }, + { + "a": 1, + "b": "hello", + "c_a": [1], + }, + { + "a": 1, + "b": "hello", + "c_a": [1], + "c_b": [2], + }, + ]); + + let (rb, _) = Event::new(json) + .into_recordbatch(&HashMap::default(), false, None, SchemaVersion::V0) + .unwrap(); + assert_eq!(rb.num_rows(), 4); + assert_eq!(rb.num_columns(), 5); + assert_eq!( + rb.column_by_name("a").unwrap().as_int64_arr().unwrap(), + &Int64Array::from(vec![Some(1), Some(1), Some(1), Some(1)]) + ); + assert_eq!( + rb.column_by_name("b").unwrap().as_utf8_arr().unwrap(), + &StringArray::from(vec![ + Some("hello"), + Some("hello"), + Some("hello"), + Some("hello") + ]) + ); + + assert_eq!( + rb.column_by_name("c_a") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(), + &ListArray::from_iter_primitive::(vec![ + None, + None, + Some(vec![Some(1i64)]), + Some(vec![Some(1)]) + ]) + ); + + assert_eq!( + rb.column_by_name("c_b") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(), + &ListArray::from_iter_primitive::(vec![ + None, + None, + None, + Some(vec![Some(2i64)]) + ]) + ); + } + + #[test] + fn arr_obj_with_nested_type_v1() { + let json = json!([ + { + "a": 1, + "b": "hello", + }, + { + "a": 1, + "b": "hello", + }, + { + "a": 1, + "b": "hello", + "c_a": 1, + }, + { + "a": 1, + "b": "hello", + "c_a": 1, + "c_b": 2, + }, + ]); + + let (rb, _) = Event::new(json) + .into_recordbatch(&HashMap::default(), false, None, SchemaVersion::V1) + .unwrap(); + + assert_eq!(rb.num_rows(), 4); + assert_eq!(rb.num_columns(), 5); + assert_eq!( + rb.column_by_name("a").unwrap().as_float64_arr().unwrap(), + &Float64Array::from(vec![Some(1.0), Some(1.0), Some(1.0), Some(1.0)]) + ); + assert_eq!( + rb.column_by_name("b").unwrap().as_utf8_arr().unwrap(), + &StringArray::from(vec![ + Some("hello"), + Some("hello"), + Some("hello"), + Some("hello") + ]) + ); + + assert_eq!( + rb.column_by_name("c_a").unwrap().as_float64_arr().unwrap(), + &Float64Array::from(vec![None, None, Some(1.0), Some(1.0)]) + ); + + assert_eq!( + rb.column_by_name("c_b").unwrap().as_float64_arr().unwrap(), + &Float64Array::from(vec![None, None, None, Some(2.0)]) + ); + } } diff --git a/src/handlers/http/ingest.rs b/src/handlers/http/ingest.rs index 0523e8757..bca4e36dc 100644 --- a/src/handlers/http/ingest.rs +++ b/src/handlers/http/ingest.rs @@ -332,502 +332,3 @@ impl actix_web::ResponseError for 
PostError { .body(self.to_string()) } } - -#[cfg(test)] -mod tests { - - use arrow::datatypes::Int64Type; - use arrow_array::{ArrayRef, Float64Array, Int64Array, ListArray, StringArray}; - use arrow_schema::{DataType, Field}; - use serde_json::json; - use std::{collections::HashMap, sync::Arc}; - - use crate::{ - event::format::{json, EventFormat}, - metadata::SchemaVersion, - utils::json::{convert_array_to_object, flatten::convert_to_array}, - }; - - trait TestExt { - fn as_int64_arr(&self) -> Option<&Int64Array>; - fn as_float64_arr(&self) -> Option<&Float64Array>; - fn as_utf8_arr(&self) -> Option<&StringArray>; - } - - impl TestExt for ArrayRef { - fn as_int64_arr(&self) -> Option<&Int64Array> { - self.as_any().downcast_ref() - } - - fn as_float64_arr(&self) -> Option<&Float64Array> { - self.as_any().downcast_ref() - } - - fn as_utf8_arr(&self) -> Option<&StringArray> { - self.as_any().downcast_ref() - } - } - - fn fields_to_map(iter: impl Iterator) -> HashMap> { - iter.map(|x| (x.name().clone(), Arc::new(x))).collect() - } - - #[test] - fn basic_object_into_rb() { - let json = json!({ - "c": 4.23, - "a": 1, - "b": "hello", - }); - - let (rb, _) = json::Event::new(json) - .into_recordbatch(&HashMap::default(), false, None, SchemaVersion::V0) - .unwrap(); - - assert_eq!(rb.num_rows(), 1); - assert_eq!(rb.num_columns(), 4); - assert_eq!( - rb.column_by_name("a").unwrap().as_int64_arr().unwrap(), - &Int64Array::from_iter([1]) - ); - assert_eq!( - rb.column_by_name("b").unwrap().as_utf8_arr().unwrap(), - &StringArray::from_iter_values(["hello"]) - ); - assert_eq!( - rb.column_by_name("c").unwrap().as_float64_arr().unwrap(), - &Float64Array::from_iter([4.23]) - ); - } - - #[test] - fn basic_object_with_null_into_rb() { - let json = json!({ - "a": 1, - "b": "hello", - "c": null - }); - - let (rb, _) = json::Event::new(json) - .into_recordbatch(&HashMap::default(), false, None, SchemaVersion::V0) - .unwrap(); - - assert_eq!(rb.num_rows(), 1); - assert_eq!(rb.num_columns(), 3); - assert_eq!( - rb.column_by_name("a").unwrap().as_int64_arr().unwrap(), - &Int64Array::from_iter([1]) - ); - assert_eq!( - rb.column_by_name("b").unwrap().as_utf8_arr().unwrap(), - &StringArray::from_iter_values(["hello"]) - ); - } - - #[test] - fn basic_object_derive_schema_into_rb() { - let json = json!({ - "a": 1, - "b": "hello", - }); - - let schema = fields_to_map( - [ - Field::new("a", DataType::Int64, true), - Field::new("b", DataType::Utf8, true), - Field::new("c", DataType::Float64, true), - ] - .into_iter(), - ); - - let (rb, _) = json::Event::new(json) - .into_recordbatch(&schema, false, None, SchemaVersion::V0) - .unwrap(); - - assert_eq!(rb.num_rows(), 1); - assert_eq!(rb.num_columns(), 3); - assert_eq!( - rb.column_by_name("a").unwrap().as_int64_arr().unwrap(), - &Int64Array::from_iter([1]) - ); - assert_eq!( - rb.column_by_name("b").unwrap().as_utf8_arr().unwrap(), - &StringArray::from_iter_values(["hello"]) - ); - } - - #[test] - fn basic_object_schema_mismatch() { - let json = json!({ - "a": 1, - "b": 1, // type mismatch - }); - - let schema = fields_to_map( - [ - Field::new("a", DataType::Int64, true), - Field::new("b", DataType::Utf8, true), - Field::new("c", DataType::Float64, true), - ] - .into_iter(), - ); - - assert!(json::Event::new(json) - .into_recordbatch(&schema, false, None, SchemaVersion::V0,) - .is_err()); - } - - #[test] - fn empty_object() { - let json = json!({}); - - let schema = fields_to_map( - [ - Field::new("a", DataType::Int64, true), - Field::new("b", DataType::Utf8, true), - 
Field::new("c", DataType::Float64, true), - ] - .into_iter(), - ); - - let (rb, _) = json::Event::new(json) - .into_recordbatch(&schema, false, None, SchemaVersion::V0) - .unwrap(); - - assert_eq!(rb.num_rows(), 1); - assert_eq!(rb.num_columns(), 1); - } - - #[test] - fn non_object_arr_is_err() { - let json = json!([1]); - - assert!(convert_array_to_object( - json, - None, - None, - None, - SchemaVersion::V0, - &crate::event::format::LogSource::default() - ) - .is_err()) - } - - #[test] - fn array_into_recordbatch_inffered_schema() { - let json = json!([ - { - "b": "hello", - }, - { - "b": "hello", - "a": 1, - "c": 1 - }, - { - "a": 1, - "b": "hello", - "c": null - }, - ]); - - let (rb, _) = json::Event::new(json) - .into_recordbatch(&HashMap::default(), false, None, SchemaVersion::V0) - .unwrap(); - - assert_eq!(rb.num_rows(), 3); - assert_eq!(rb.num_columns(), 4); - - let schema = rb.schema(); - let fields = &schema.fields; - - assert_eq!(&*fields[1], &Field::new("a", DataType::Int64, true)); - assert_eq!(&*fields[2], &Field::new("b", DataType::Utf8, true)); - assert_eq!(&*fields[3], &Field::new("c", DataType::Int64, true)); - - assert_eq!( - rb.column_by_name("a").unwrap().as_int64_arr().unwrap(), - &Int64Array::from(vec![None, Some(1), Some(1)]) - ); - assert_eq!( - rb.column_by_name("b").unwrap().as_utf8_arr().unwrap(), - &StringArray::from(vec![Some("hello"), Some("hello"), Some("hello"),]) - ); - assert_eq!( - rb.column_by_name("c").unwrap().as_int64_arr().unwrap(), - &Int64Array::from(vec![None, Some(1), None]) - ); - } - - #[test] - fn arr_with_null_into_rb() { - let json = json!([ - { - "c": null, - "b": "hello", - "a": null - }, - { - "a": 1, - "c": 1.22, - "b": "hello" - }, - { - "b": "hello", - "a": 1, - "c": null - }, - ]); - - let (rb, _) = json::Event::new(json) - .into_recordbatch(&HashMap::default(), false, None, SchemaVersion::V0) - .unwrap(); - - assert_eq!(rb.num_rows(), 3); - assert_eq!(rb.num_columns(), 4); - assert_eq!( - rb.column_by_name("a").unwrap().as_int64_arr().unwrap(), - &Int64Array::from(vec![None, Some(1), Some(1)]) - ); - assert_eq!( - rb.column_by_name("b").unwrap().as_utf8_arr().unwrap(), - &StringArray::from(vec![Some("hello"), Some("hello"), Some("hello"),]) - ); - assert_eq!( - rb.column_by_name("c").unwrap().as_float64_arr().unwrap(), - &Float64Array::from(vec![None, Some(1.22), None,]) - ); - } - - #[test] - fn arr_with_null_derive_schema_into_rb() { - let json = json!([ - { - "c": null, - "b": "hello", - "a": null - }, - { - "a": 1, - "c": 1.22, - "b": "hello" - }, - { - "b": "hello", - "a": 1, - "c": null - }, - ]); - - let schema = fields_to_map( - [ - Field::new("a", DataType::Int64, true), - Field::new("b", DataType::Utf8, true), - Field::new("c", DataType::Float64, true), - ] - .into_iter(), - ); - - let (rb, _) = json::Event::new(json) - .into_recordbatch(&schema, false, None, SchemaVersion::V0) - .unwrap(); - - assert_eq!(rb.num_rows(), 3); - assert_eq!(rb.num_columns(), 4); - assert_eq!( - rb.column_by_name("a").unwrap().as_int64_arr().unwrap(), - &Int64Array::from(vec![None, Some(1), Some(1)]) - ); - assert_eq!( - rb.column_by_name("b").unwrap().as_utf8_arr().unwrap(), - &StringArray::from(vec![Some("hello"), Some("hello"), Some("hello"),]) - ); - assert_eq!( - rb.column_by_name("c").unwrap().as_float64_arr().unwrap(), - &Float64Array::from(vec![None, Some(1.22), None,]) - ); - } - - #[test] - fn arr_schema_mismatch() { - let json = json!([ - { - "a": null, - "b": "hello", - "c": 1.24 - }, - { - "a": 1, - "b": "hello", - "c": 1 - }, - { 
- "a": 1, - "b": "hello", - "c": null - }, - ]); - - let schema = fields_to_map( - [ - Field::new("a", DataType::Int64, true), - Field::new("b", DataType::Utf8, true), - Field::new("c", DataType::Float64, true), - ] - .into_iter(), - ); - - assert!(json::Event::new(json) - .into_recordbatch(&schema, false, None, SchemaVersion::V0,) - .is_err()); - } - - #[test] - fn arr_obj_with_nested_type() { - let json = json!([ - { - "a": 1, - "b": "hello", - }, - { - "a": 1, - "b": "hello", - }, - { - "a": 1, - "b": "hello", - "c": [{"a": 1}] - }, - { - "a": 1, - "b": "hello", - "c": [{"a": 1, "b": 2}] - }, - ]); - let flattened_json = convert_to_array( - convert_array_to_object( - json, - None, - None, - None, - SchemaVersion::V0, - &crate::event::format::LogSource::default(), - ) - .unwrap(), - ) - .unwrap(); - - let (rb, _) = json::Event::new(flattened_json) - .into_recordbatch(&HashMap::default(), false, None, SchemaVersion::V0) - .unwrap(); - assert_eq!(rb.num_rows(), 4); - assert_eq!(rb.num_columns(), 5); - assert_eq!( - rb.column_by_name("a").unwrap().as_int64_arr().unwrap(), - &Int64Array::from(vec![Some(1), Some(1), Some(1), Some(1)]) - ); - assert_eq!( - rb.column_by_name("b").unwrap().as_utf8_arr().unwrap(), - &StringArray::from(vec![ - Some("hello"), - Some("hello"), - Some("hello"), - Some("hello") - ]) - ); - - assert_eq!( - rb.column_by_name("c_a") - .unwrap() - .as_any() - .downcast_ref::() - .unwrap(), - &ListArray::from_iter_primitive::(vec![ - None, - None, - Some(vec![Some(1i64)]), - Some(vec![Some(1)]) - ]) - ); - - assert_eq!( - rb.column_by_name("c_b") - .unwrap() - .as_any() - .downcast_ref::() - .unwrap(), - &ListArray::from_iter_primitive::(vec![ - None, - None, - None, - Some(vec![Some(2i64)]) - ]) - ); - } - - #[test] - fn arr_obj_with_nested_type_v1() { - let json = json!([ - { - "a": 1, - "b": "hello", - }, - { - "a": 1, - "b": "hello", - }, - { - "a": 1, - "b": "hello", - "c": [{"a": 1}] - }, - { - "a": 1, - "b": "hello", - "c": [{"a": 1, "b": 2}] - }, - ]); - let flattened_json = convert_to_array( - convert_array_to_object( - json, - None, - None, - None, - SchemaVersion::V1, - &crate::event::format::LogSource::default(), - ) - .unwrap(), - ) - .unwrap(); - - let (rb, _) = json::Event::new(flattened_json) - .into_recordbatch(&HashMap::default(), false, None, SchemaVersion::V1) - .unwrap(); - - assert_eq!(rb.num_rows(), 4); - assert_eq!(rb.num_columns(), 5); - assert_eq!( - rb.column_by_name("a").unwrap().as_float64_arr().unwrap(), - &Float64Array::from(vec![Some(1.0), Some(1.0), Some(1.0), Some(1.0)]) - ); - assert_eq!( - rb.column_by_name("b").unwrap().as_utf8_arr().unwrap(), - &StringArray::from(vec![ - Some("hello"), - Some("hello"), - Some("hello"), - Some("hello") - ]) - ); - - assert_eq!( - rb.column_by_name("c_a").unwrap().as_float64_arr().unwrap(), - &Float64Array::from(vec![None, None, Some(1.0), Some(1.0)]) - ); - - assert_eq!( - rb.column_by_name("c_b").unwrap().as_float64_arr().unwrap(), - &Float64Array::from(vec![None, None, None, Some(2.0)]) - ); - } -} diff --git a/src/handlers/http/modal/utils/ingest_utils.rs b/src/handlers/http/modal/utils/ingest_utils.rs index 55e81f0df..257dc014e 100644 --- a/src/handlers/http/modal/utils/ingest_utils.rs +++ b/src/handlers/http/modal/utils/ingest_utils.rs @@ -80,7 +80,7 @@ async fn push_logs( let custom_partition = stream.get_custom_partition(); let schema_version = stream.get_schema_version(); let p_timestamp = Utc::now(); - + for json in jsons { let data = if time_partition.is_some() || custom_partition.is_some() { 
convert_array_to_object( diff --git a/src/parseable/streams.rs b/src/parseable/streams.rs index 088ca509d..009e01d2c 100644 --- a/src/parseable/streams.rs +++ b/src/parseable/streams.rs @@ -513,10 +513,7 @@ impl Stream { let file_size = match file.metadata() { Ok(meta) => meta.len(), Err(err) => { - warn!( - "File ({}) not found; Error = {err}", - file.display() - ); + warn!("File ({}) not found; Error = {err}", file.display()); continue; } }; diff --git a/src/utils/json/mod.rs b/src/utils/json/mod.rs index efa9cb2e2..0583f722a 100644 --- a/src/utils/json/mod.rs +++ b/src/utils/json/mod.rs @@ -278,4 +278,141 @@ mod tests { assert_eq!(deserialized.value, original.value); assert_eq!(deserialized.other_field, original.other_field); } + + #[test] + fn non_object_arr_is_err() { + let json = json!([1]); + + assert!(convert_array_to_object( + json, + None, + None, + None, + SchemaVersion::V0, + &crate::event::format::LogSource::default() + ) + .is_err()) + } + + #[test] + fn arr_obj_with_nested_type() { + let json = json!([ + { + "a": 1, + "b": "hello", + }, + { + "a": 1, + "b": "hello", + }, + { + "a": 1, + "b": "hello", + "c": [{"a": 1}] + }, + { + "a": 1, + "b": "hello", + "c": [{"a": 1, "b": 2}] + }, + ]); + let flattened_json = convert_to_array( + convert_array_to_object( + json, + None, + None, + None, + SchemaVersion::V0, + &crate::event::format::LogSource::default(), + ) + .unwrap(), + ) + .unwrap(); + + assert_eq!( + json!([ + { + "a": 1, + "b": "hello", + }, + { + "a": 1, + "b": "hello", + }, + { + "a": 1, + "b": "hello", + "c_a": [1], + }, + { + "a": 1, + "b": "hello", + "c_a": [1], + "c_b": [2], + }, + ]), + flattened_json + ); + } + + #[test] + fn arr_obj_with_nested_type_v1() { + let json = json!([ + { + "a": 1, + "b": "hello", + }, + { + "a": 1, + "b": "hello", + }, + { + "a": 1, + "b": "hello", + "c": [{"a": 1}] + }, + { + "a": 1, + "b": "hello", + "c": [{"a": 1, "b": 2}] + }, + ]); + let flattened_json = convert_to_array( + convert_array_to_object( + json, + None, + None, + None, + SchemaVersion::V1, + &crate::event::format::LogSource::default(), + ) + .unwrap(), + ) + .unwrap(); + + assert_eq!( + json!([ + { + "a": 1, + "b": "hello", + }, + { + "a": 1, + "b": "hello", + }, + { + "a": 1, + "b": "hello", + "c_a": 1, + }, + { + "a": 1, + "b": "hello", + "c_a": 1, + "c_b": 2, + }, + ]), + flattened_json + ); + } } From 5798b7b820b19adf84e09d2d7d53c9ca2a4e6a26 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Sat, 1 Mar 2025 03:42:27 +0530 Subject: [PATCH 03/39] refactor: remove `is_schema_matching` --- src/event/format/mod.rs | 35 ++++++++++------------------------- 1 file changed, 10 insertions(+), 25 deletions(-) diff --git a/src/event/format/mod.rs b/src/event/format/mod.rs index ce90cfc52..6bf10d059 100644 --- a/src/event/format/mod.rs +++ b/src/event/format/mod.rs @@ -137,11 +137,18 @@ pub trait EventFormat: Sized { )), ); - // prepare the record batch and new fields to be added - let mut new_schema = Arc::new(Schema::new(schema)); - if !Self::is_schema_matching(new_schema.clone(), storage_schema, static_schema_flag) { + if static_schema_flag + && schema.iter().any(|field| { + storage_schema + .get(field.name()) + .is_none_or(|storage_field| storage_field != field) + }) + { return Err(anyhow!("Schema mismatch")); } + + // prepare the record batch and new fields to be added + let mut new_schema = Arc::new(Schema::new(schema)); new_schema = update_field_type_in_schema(new_schema, None, time_partition, None, schema_version); @@ -156,28 +163,6 @@ pub trait EventFormat: Sized { 
Ok((rb, is_first)) } - fn is_schema_matching( - new_schema: Arc, - storage_schema: &HashMap>, - static_schema_flag: bool, - ) -> bool { - if !static_schema_flag { - return true; - } - for field in new_schema.fields() { - let Some(storage_field) = storage_schema.get(field.name()) else { - return false; - }; - if field.name() != storage_field.name() { - return false; - } - if field.data_type() != storage_field.data_type() { - return false; - } - } - true - } - #[allow(clippy::too_many_arguments)] fn into_event( self, From 78da6b458211206f26c8ebe16cce6eeac215373d Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Sat, 1 Mar 2025 11:33:12 +0530 Subject: [PATCH 04/39] doc: improve readability --- src/event/format/json.rs | 71 ++++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 40 deletions(-) diff --git a/src/event/format/json.rs b/src/event/format/json.rs index 43c23f5ad..ae4cdc55f 100644 --- a/src/event/format/json.rs +++ b/src/event/format/json.rs @@ -59,12 +59,10 @@ impl EventFormat for Event { // also extract the arrow schema, tags and metadata from the incoming json fn to_data( self, - schema: &HashMap>, + stored_schema: &HashMap>, time_partition: Option<&String>, schema_version: SchemaVersion, ) -> Result<(Self::Data, Vec>, bool), anyhow::Error> { - let stream_schema = schema; - // incoming event may be a single json or a json array // but Data (type defined above) is a vector of json values // hence we need to convert the incoming event to a vector of json values @@ -79,23 +77,23 @@ impl EventFormat for Event { collect_keys(value_arr.iter()).expect("fields can be collected from array of objects"); let mut is_first = false; - let schema = match derive_arrow_schema(stream_schema, fields) { - Ok(schema) => schema, - Err(_) => { + let schema = match derive_arrow_schema(stored_schema, fields) { + Some(schema) => schema, + _ => { let mut infer_schema = infer_json_schema_from_iterator(value_arr.iter().map(Ok)) .map_err(|err| { anyhow!("Could not infer schema for this event due to err {:?}", err) })?; let new_infer_schema = super::update_field_type_in_schema( Arc::new(infer_schema), - Some(stream_schema), + Some(stored_schema), time_partition, Some(&value_arr), schema_version, ); infer_schema = Schema::new(new_infer_schema.fields().clone()); Schema::try_merge(vec![ - Schema::new(stream_schema.values().cloned().collect::()), + Schema::new(stored_schema.values().cloned().collect::()), infer_schema.clone(), ]).map_err(|err| anyhow!("Could not merge schema of this event with that of the existing stream. 
{:?}", err))?; is_first = true; @@ -221,51 +219,44 @@ fn extract_and_parse_time( // Returns arrow schema with the fields that are present in the request body // This schema is an input to convert the request body to arrow record batch +// Returns None if even one of the fields in the json is new and not seen before fn derive_arrow_schema( schema: &HashMap>, - fields: Vec<&str>, -) -> Result>, ()> { + fields: HashSet<&str>, +) -> Option>> { let mut res = Vec::with_capacity(fields.len()); - let fields = fields.into_iter().map(|field_name| schema.get(field_name)); - for field in fields { - let Some(field) = field else { return Err(()) }; + for field_name in fields { + let field = schema.get(field_name)?; res.push(field.clone()) } - Ok(res) + + Some(res) } -fn collect_keys<'a>(values: impl Iterator) -> Result, ()> { - let mut keys = Vec::new(); +// Returns a list of keys that are present in the given iterable of JSON objects +// Returns None if even one of the value is not an Object +fn collect_keys<'a>(values: impl Iterator) -> Option> { + let mut keys = HashSet::new(); for value in values { - if let Some(obj) = value.as_object() { - for key in obj.keys() { - match keys.binary_search(&key.as_str()) { - Ok(_) => (), - Err(pos) => { - keys.insert(pos, key.as_str()); - } - } - } - } else { - return Err(()); + let obj = value.as_object()?; + for key in obj.keys() { + keys.insert(key.as_str()); } } - Ok(keys) + + Some(keys) } +// Returns true when the field doesn't exist in schema or has an invalid type fn fields_mismatch(schema: &[Arc], body: &Value, schema_version: SchemaVersion) -> bool { - for (name, val) in body.as_object().expect("body is of object variant") { - if val.is_null() { - continue; - } - let Some(field) = get_field(schema, name) else { - return true; - }; - if !valid_type(field.data_type(), val, schema_version) { - return true; - } - } - false + body.as_object() + .expect("body is of object variant") + .iter() + .any(|(key, value)| { + !value.is_null() + && get_field(schema, key) + .is_none_or(|field| !valid_type(field.data_type(), value, schema_version)) + }) } fn valid_type(data_type: &DataType, value: &Value, schema_version: SchemaVersion) -> bool { From 938c33dea9a796a29f395aaaf89fc4f0f17da199 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Sat, 1 Mar 2025 11:54:50 +0530 Subject: [PATCH 05/39] simplify `replace_columns` --- src/event/format/mod.rs | 3 +-- src/utils/arrow/mod.rs | 12 +++++------- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/src/event/format/mod.rs b/src/event/format/mod.rs index 6bf10d059..5c5f8dee8 100644 --- a/src/event/format/mod.rs +++ b/src/event/format/mod.rs @@ -156,8 +156,7 @@ pub trait EventFormat: Sized { rb = replace_columns( rb.schema(), &rb, - &[0], - &[Arc::new(get_timestamp_array(p_timestamp, rb.num_rows()))], + &[(0, Arc::new(get_timestamp_array(p_timestamp, rb.num_rows())))], ); Ok((rb, is_first)) diff --git a/src/utils/arrow/mod.rs b/src/utils/arrow/mod.rs index 53e6437d6..a11186ee0 100644 --- a/src/utils/arrow/mod.rs +++ b/src/utils/arrow/mod.rs @@ -61,8 +61,7 @@ use serde_json::{Map, Value}; /// /// * `schema` - The schema of the record batch. /// * `batch` - The record batch to modify. -/// * `indexes` - The indexes of the columns to replace. -/// * `arrays` - The new arrays to replace the columns with. +/// * `indexed_arrays` - A list of indexes and arrays to replace the columns indexed with. 
/// /// # Returns /// @@ -70,12 +69,11 @@ use serde_json::{Map, Value}; pub fn replace_columns( schema: Arc, batch: &RecordBatch, - indexes: &[usize], - arrays: &[Arc], + indexed_arrays: &[(usize, Arc)], ) -> RecordBatch { let mut batch_arrays = batch.columns().iter().map(Arc::clone).collect_vec(); - for (&index, arr) in indexes.iter().zip(arrays.iter()) { - batch_arrays[index] = Arc::clone(arr); + for (index, arr) in indexed_arrays { + batch_arrays[*index] = Arc::clone(arr); } RecordBatch::try_new(schema, batch_arrays).unwrap() } @@ -178,7 +176,7 @@ mod tests { let arr: Arc = Arc::new(Int32Array::from_value(0, 3)); - let new_rb = replace_columns(schema_ref.clone(), &rb, &[2], &[arr]); + let new_rb = replace_columns(schema_ref.clone(), &rb, &[(2, arr)]); assert_eq!(new_rb.schema(), schema_ref); assert_eq!(new_rb.num_columns(), 3); From 6fb53854a798a51148218551fcc6daa1dc92198a Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Sat, 1 Mar 2025 12:00:07 +0530 Subject: [PATCH 06/39] ci: fix imports --- src/event/format/json.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/event/format/json.rs b/src/event/format/json.rs index ae4cdc55f..3233e1b5c 100644 --- a/src/event/format/json.rs +++ b/src/event/format/json.rs @@ -27,7 +27,10 @@ use chrono::{DateTime, NaiveDateTime, Utc}; use datafusion::arrow::util::bit_util::round_upto_multiple_of_64; use itertools::Itertools; use serde_json::Value; -use std::{collections::HashMap, sync::Arc}; +use std::{ + collections::{HashMap, HashSet}, + sync::Arc, +}; use tracing::error; use super::EventFormat; From 23155b699b55fe0631fe22685288ed4fc052457c Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Sat, 1 Mar 2025 12:14:42 +0530 Subject: [PATCH 07/39] push flattening into `Event` --- src/connectors/kafka/processor.rs | 3 + src/event/format/json.rs | 165 +++++++++++++++--- src/event/format/mod.rs | 19 +- src/handlers/http/ingest.rs | 2 + src/handlers/http/modal/utils/ingest_utils.rs | 44 ++--- 5 files changed, 174 insertions(+), 59 deletions(-) diff --git a/src/connectors/kafka/processor.rs b/src/connectors/kafka/processor.rs index b74754003..b9fe2101d 100644 --- a/src/connectors/kafka/processor.rs +++ b/src/connectors/kafka/processor.rs @@ -57,6 +57,7 @@ impl ParseableSinkProcessor { let stream = PARSEABLE.get_stream(stream_name)?; let schema = stream.get_schema_raw(); let time_partition = stream.get_time_partition(); + let time_partition_limit = stream.get_time_partition_limit(); let custom_partition = stream.get_custom_partition(); let static_schema_flag = stream.get_static_schema_flag(); let schema_version = stream.get_schema_version(); @@ -78,7 +79,9 @@ impl ParseableSinkProcessor { static_schema_flag, custom_partition.as_ref(), time_partition.as_ref(), + time_partition_limit, schema_version, + &LogSource::Custom("Kafka".to_owned()), StreamType::UserDefined, )?; diff --git a/src/event/format/json.rs b/src/event/format/json.rs index 3233e1b5c..c6f5a11e5 100644 --- a/src/event/format/json.rs +++ b/src/event/format/json.rs @@ -29,12 +29,20 @@ use itertools::Itertools; use serde_json::Value; use std::{ collections::{HashMap, HashSet}, + num::NonZeroU32, sync::Arc, }; use tracing::error; -use super::EventFormat; -use crate::{metadata::SchemaVersion, storage::StreamType, utils::arrow::get_field}; +use super::{EventFormat, LogSource}; +use crate::{ + metadata::SchemaVersion, + storage::StreamType, + utils::{ + arrow::get_field, + json::{convert_array_to_object, flatten::convert_to_array}, + }, +}; pub struct Event { pub json: Value, 
@@ -64,26 +72,40 @@ impl EventFormat for Event { self, stored_schema: &HashMap>, time_partition: Option<&String>, + time_partition_limit: Option, + custom_partition: Option<&String>, schema_version: SchemaVersion, + log_source: &LogSource, ) -> Result<(Self::Data, Vec>, bool), anyhow::Error> { - // incoming event may be a single json or a json array - // but Data (type defined above) is a vector of json values - // hence we need to convert the incoming event to a vector of json values - let value_arr = match self.json { - Value::Array(arr) => arr, - value @ Value::Object(_) => vec![value], - _ => unreachable!("flatten would have failed beforehand"), + let flattened = if time_partition.is_some() || custom_partition.is_some() { + convert_array_to_object( + self.json, + time_partition, + time_partition_limit, + custom_partition, + schema_version, + log_source, + )? + } else { + vec![convert_to_array(convert_array_to_object( + self.json, + None, + None, + None, + schema_version, + log_source, + )?)?] }; // collect all the keys from all the json objects in the request body let fields = - collect_keys(value_arr.iter()).expect("fields can be collected from array of objects"); + collect_keys(flattened.iter()).expect("fields can be collected from array of objects"); let mut is_first = false; let schema = match derive_arrow_schema(stored_schema, fields) { Some(schema) => schema, _ => { - let mut infer_schema = infer_json_schema_from_iterator(value_arr.iter().map(Ok)) + let mut infer_schema = infer_json_schema_from_iterator(flattened.iter().map(Ok)) .map_err(|err| { anyhow!("Could not infer schema for this event due to err {:?}", err) })?; @@ -91,7 +113,7 @@ impl EventFormat for Event { Arc::new(infer_schema), Some(stored_schema), time_partition, - Some(&value_arr), + Some(&flattened), schema_version, ); infer_schema = Schema::new(new_infer_schema.fields().clone()); @@ -110,7 +132,7 @@ impl EventFormat for Event { } }; - if value_arr + if flattened .iter() .any(|value| fields_mismatch(&schema, value, schema_version)) { @@ -119,7 +141,7 @@ impl EventFormat for Event { )); } - Ok((value_arr, schema, is_first)) + Ok((flattened, schema, is_first)) } // Convert the Data type (defined above) to arrow record batch @@ -147,7 +169,9 @@ impl EventFormat for Event { static_schema_flag: bool, custom_partitions: Option<&String>, time_partition: Option<&String>, + time_partition_limit: Option, schema_version: SchemaVersion, + log_source: &LogSource, stream_type: StreamType, ) -> Result { let custom_partition_values = match custom_partitions.as_ref() { @@ -167,7 +191,10 @@ impl EventFormat for Event { storage_schema, static_schema_flag, time_partition, + time_partition_limit, + custom_partitions, schema_version, + log_source, )?; Ok(super::Event { @@ -385,7 +412,15 @@ mod tests { }); let (rb, _) = Event::new(json) - .into_recordbatch(&HashMap::default(), false, None, SchemaVersion::V0) + .into_recordbatch( + &HashMap::default(), + false, + None, + None, + None, + SchemaVersion::V0, + &LogSource::Json, + ) .unwrap(); assert_eq!(rb.num_rows(), 1); @@ -413,7 +448,15 @@ mod tests { }); let (rb, _) = Event::new(json) - .into_recordbatch(&HashMap::default(), false, None, SchemaVersion::V0) + .into_recordbatch( + &HashMap::default(), + false, + None, + None, + None, + SchemaVersion::V0, + &LogSource::Json, + ) .unwrap(); assert_eq!(rb.num_rows(), 1); @@ -445,7 +488,15 @@ mod tests { ); let (rb, _) = Event::new(json) - .into_recordbatch(&schema, false, None, SchemaVersion::V0) + .into_recordbatch( + &schema, + false, + 
None, + None, + None, + SchemaVersion::V0, + &LogSource::Json, + ) .unwrap(); assert_eq!(rb.num_rows(), 1); @@ -477,7 +528,15 @@ mod tests { ); assert!(Event::new(json) - .into_recordbatch(&schema, false, None, SchemaVersion::V0,) + .into_recordbatch( + &schema, + false, + None, + None, + None, + SchemaVersion::V0, + &LogSource::Json + ) .is_err()); } @@ -495,7 +554,15 @@ mod tests { ); let (rb, _) = Event::new(json) - .into_recordbatch(&schema, false, None, SchemaVersion::V0) + .into_recordbatch( + &schema, + false, + None, + None, + None, + SchemaVersion::V0, + &LogSource::Json, + ) .unwrap(); assert_eq!(rb.num_rows(), 1); @@ -521,7 +588,15 @@ mod tests { ]); let (rb, _) = Event::new(json) - .into_recordbatch(&HashMap::default(), false, None, SchemaVersion::V0) + .into_recordbatch( + &HashMap::default(), + false, + None, + None, + None, + SchemaVersion::V0, + &LogSource::Json, + ) .unwrap(); assert_eq!(rb.num_rows(), 3); @@ -569,7 +644,15 @@ mod tests { ]); let (rb, _) = Event::new(json) - .into_recordbatch(&HashMap::default(), false, None, SchemaVersion::V0) + .into_recordbatch( + &HashMap::default(), + false, + None, + None, + None, + SchemaVersion::V0, + &LogSource::Json, + ) .unwrap(); assert_eq!(rb.num_rows(), 3); @@ -618,7 +701,15 @@ mod tests { ); let (rb, _) = Event::new(json) - .into_recordbatch(&schema, false, None, SchemaVersion::V0) + .into_recordbatch( + &schema, + false, + None, + None, + None, + SchemaVersion::V0, + &LogSource::Json, + ) .unwrap(); assert_eq!(rb.num_rows(), 3); @@ -667,7 +758,15 @@ mod tests { ); assert!(Event::new(json) - .into_recordbatch(&schema, false, None, SchemaVersion::V0,) + .into_recordbatch( + &schema, + false, + None, + None, + None, + SchemaVersion::V0, + &LogSource::Json + ) .is_err()); } @@ -696,7 +795,15 @@ mod tests { ]); let (rb, _) = Event::new(json) - .into_recordbatch(&HashMap::default(), false, None, SchemaVersion::V0) + .into_recordbatch( + &HashMap::default(), + false, + None, + None, + None, + SchemaVersion::V0, + &LogSource::Json, + ) .unwrap(); assert_eq!(rb.num_rows(), 4); assert_eq!(rb.num_columns(), 5); @@ -768,7 +875,15 @@ mod tests { ]); let (rb, _) = Event::new(json) - .into_recordbatch(&HashMap::default(), false, None, SchemaVersion::V1) + .into_recordbatch( + &HashMap::default(), + false, + None, + None, + None, + SchemaVersion::V1, + &LogSource::Json, + ) .unwrap(); assert_eq!(rb.num_rows(), 4); diff --git a/src/event/format/mod.rs b/src/event/format/mod.rs index 5c5f8dee8..ec9ed076f 100644 --- a/src/event/format/mod.rs +++ b/src/event/format/mod.rs @@ -20,6 +20,7 @@ use std::{ collections::{HashMap, HashSet}, fmt::Display, + num::NonZeroU32, sync::Arc, }; @@ -101,7 +102,10 @@ pub trait EventFormat: Sized { self, schema: &HashMap>, time_partition: Option<&String>, + time_partition_limit: Option, + custom_partition: Option<&String>, schema_version: SchemaVersion, + log_source: &LogSource, ) -> Result<(Self::Data, EventSchema, bool), AnyError>; fn decode(data: Self::Data, schema: Arc) -> Result; @@ -114,11 +118,20 @@ pub trait EventFormat: Sized { storage_schema: &HashMap>, static_schema_flag: bool, time_partition: Option<&String>, + time_partition_limit: Option, + custom_partition: Option<&String>, schema_version: SchemaVersion, + log_source: &LogSource, ) -> Result<(RecordBatch, bool), AnyError> { let p_timestamp = self.get_p_timestamp(); - let (data, mut schema, is_first) = - self.to_data(storage_schema, time_partition, schema_version)?; + let (data, mut schema, is_first) = self.to_data( + storage_schema, + 
time_partition, + time_partition_limit, + custom_partition, + schema_version, + log_source, + )?; if get_field(&schema, DEFAULT_TIMESTAMP_KEY).is_some() { return Err(anyhow!( @@ -171,7 +184,9 @@ pub trait EventFormat: Sized { static_schema_flag: bool, custom_partitions: Option<&String>, time_partition: Option<&String>, + time_partition_limit: Option, schema_version: SchemaVersion, + log_source: &LogSource, stream_type: StreamType, ) -> Result; } diff --git a/src/handlers/http/ingest.rs b/src/handlers/http/ingest.rs index bca4e36dc..09b11eddc 100644 --- a/src/handlers/http/ingest.rs +++ b/src/handlers/http/ingest.rs @@ -91,7 +91,9 @@ pub async fn ingest_internal_stream(stream_name: String, body: Bytes) -> Result< false, None, None, + None, SchemaVersion::V0, + &LogSource::Pmeta, StreamType::Internal, )? .process()?; diff --git a/src/handlers/http/modal/utils/ingest_utils.rs b/src/handlers/http/modal/utils/ingest_utils.rs index 257dc014e..7c6d66b60 100644 --- a/src/handlers/http/modal/utils/ingest_utils.rs +++ b/src/handlers/http/modal/utils/ingest_utils.rs @@ -31,7 +31,6 @@ use crate::{ otel::{logs::flatten_otel_logs, metrics::flatten_otel_metrics, traces::flatten_otel_traces}, parseable::PARSEABLE, storage::StreamType, - utils::json::{convert_array_to_object, flatten::convert_to_array}, }; pub async fn flatten_and_push_logs( @@ -82,42 +81,23 @@ async fn push_logs( let p_timestamp = Utc::now(); for json in jsons { - let data = if time_partition.is_some() || custom_partition.is_some() { - convert_array_to_object( - json, + let origin_size = serde_json::to_vec(&json).unwrap().len() as u64; // string length need not be the same as byte length + let schema = PARSEABLE.get_stream(stream_name)?.get_schema_raw(); + json::Event { json, p_timestamp } + .into_event( + stream_name.to_owned(), + origin_size, + &schema, + static_schema_flag, + custom_partition.as_ref(), time_partition.as_ref(), time_partition_limit, - custom_partition.as_ref(), schema_version, log_source, + StreamType::UserDefined, )? - } else { - vec![convert_to_array(convert_array_to_object( - json, - None, - None, - None, - schema_version, - log_source, - )?)?] - }; - - for json in data { - let origin_size = serde_json::to_vec(&json).unwrap().len() as u64; // string length need not be the same as byte length - let schema = PARSEABLE.get_stream(stream_name)?.get_schema_raw(); - json::Event { json, p_timestamp } - .into_event( - stream_name.to_owned(), - origin_size, - &schema, - static_schema_flag, - custom_partition.as_ref(), - time_partition.as_ref(), - schema_version, - StreamType::UserDefined, - )? 
- .process()?; - } + .process()?; } + Ok(()) } From d604b65ea1a83edce03b19739e76d6bfd341ed5b Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Sat, 1 Mar 2025 12:33:23 +0530 Subject: [PATCH 08/39] refactor: move kinesis to lib level --- src/handlers/http/mod.rs | 1 - src/{handlers/http => }/kinesis.rs | 0 src/lib.rs | 1 + 3 files changed, 1 insertion(+), 1 deletion(-) rename src/{handlers/http => }/kinesis.rs (100%) diff --git a/src/handlers/http/mod.rs b/src/handlers/http/mod.rs index f1f702d4b..4bdf85adf 100644 --- a/src/handlers/http/mod.rs +++ b/src/handlers/http/mod.rs @@ -34,7 +34,6 @@ pub mod cluster; pub mod correlation; pub mod health_check; pub mod ingest; -mod kinesis; pub mod llm; pub mod logstream; pub mod middleware; diff --git a/src/handlers/http/kinesis.rs b/src/kinesis.rs similarity index 100% rename from src/handlers/http/kinesis.rs rename to src/kinesis.rs diff --git a/src/lib.rs b/src/lib.rs index 2f8eb06ad..4f940aded 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -29,6 +29,7 @@ pub mod correlation; mod event; pub mod handlers; pub mod hottier; +mod kinesis; mod livetail; mod metadata; pub mod metrics; From 303ba35ba15a9b9f45d53782b3d1732a8aac2e85 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Sat, 1 Mar 2025 12:33:45 +0530 Subject: [PATCH 09/39] refactor: perform flattening in `to_data` alone --- src/event/format/json.rs | 96 ++++++++++++++----- src/event/format/mod.rs | 1 + src/handlers/http/ingest.rs | 12 +-- .../http/modal/ingest/ingestor_ingest.rs | 4 +- src/handlers/http/modal/utils/ingest_utils.rs | 48 +--------- 5 files changed, 87 insertions(+), 74 deletions(-) diff --git a/src/event/format/json.rs b/src/event/format/json.rs index c6f5a11e5..0bdda6a38 100644 --- a/src/event/format/json.rs +++ b/src/event/format/json.rs @@ -26,6 +26,9 @@ use arrow_schema::{DataType, Field, Fields, Schema}; use chrono::{DateTime, NaiveDateTime, Utc}; use datafusion::arrow::util::bit_util::round_upto_multiple_of_64; use itertools::Itertools; +use opentelemetry_proto::tonic::{ + logs::v1::LogsData, metrics::v1::MetricsData, trace::v1::TracesData, +}; use serde_json::Value; use std::{ collections::{HashMap, HashSet}, @@ -36,7 +39,9 @@ use tracing::error; use super::{EventFormat, LogSource}; use crate::{ + kinesis::{flatten_kinesis_logs, Message}, metadata::SchemaVersion, + otel::{logs::flatten_otel_logs, metrics::flatten_otel_metrics, traces::flatten_otel_traces}, storage::StreamType, utils::{ arrow::get_field, @@ -58,6 +63,64 @@ impl Event { } } +pub fn flatten_logs( + json: Value, + time_partition: Option<&String>, + time_partition_limit: Option, + custom_partitions: Option<&String>, + schema_version: SchemaVersion, + log_source: &LogSource, +) -> Result, anyhow::Error> { + let data = match log_source { + LogSource::Kinesis => { + //custom flattening required for Amazon Kinesis + let message: Message = serde_json::from_value(json)?; + flatten_kinesis_logs(message) + } + LogSource::OtelLogs => { + //custom flattening required for otel logs + let logs: LogsData = serde_json::from_value(json)?; + flatten_otel_logs(&logs) + } + LogSource::OtelTraces => { + //custom flattening required for otel traces + let traces: TracesData = serde_json::from_value(json)?; + flatten_otel_traces(&traces) + } + LogSource::OtelMetrics => { + //custom flattening required for otel metrics + let metrics: MetricsData = serde_json::from_value(json)?; + flatten_otel_metrics(metrics) + } + _ => vec![json], + }; + + let mut logs = vec![]; + for json in data { + if time_partition.is_some() || 
custom_partitions.is_some() { + logs.append(&mut convert_array_to_object( + json, + time_partition, + time_partition_limit, + custom_partitions, + schema_version, + log_source, + )?) + } else { + logs.push(convert_to_array(convert_array_to_object( + json, + None, + None, + None, + schema_version, + log_source, + )?)?) + } + } + + Ok(logs) +} + impl EventFormat for Event { type Data = Vec; @@ -73,29 +136,18 @@ impl EventFormat for Event { stored_schema: &HashMap>, time_partition: Option<&String>, time_partition_limit: Option, - custom_partition: Option<&String>, + custom_partitions: Option<&String>, schema_version: SchemaVersion, log_source: &LogSource, ) -> Result<(Self::Data, Vec>, bool), anyhow::Error> { - let flattened = if time_partition.is_some() || custom_partition.is_some() { - convert_array_to_object( - self.json, - time_partition, - time_partition_limit, - custom_partition, - schema_version, - log_source, - )? - } else { - vec![convert_to_array(convert_array_to_object( - self.json, - None, - None, - None, - schema_version, - log_source, - )?)?] - }; + let flattened = flatten_logs( + self.json, + time_partition, + time_partition_limit, + custom_partitions, + schema_version, + log_source, + )?; // collect all the keys from all the json objects in the request body let fields = @@ -175,8 +227,8 @@ impl EventFormat for Event { stream_type: StreamType, ) -> Result { let custom_partition_values = match custom_partitions.as_ref() { - Some(custom_partition) => { - let custom_partitions = custom_partition.split(',').collect_vec(); + Some(custom_partitions) => { + let custom_partitions = custom_partitions.split(',').collect_vec(); extract_custom_partition_values(&self.json, &custom_partitions) } None => HashMap::new(), diff --git a/src/event/format/mod.rs b/src/event/format/mod.rs index ec9ed076f..bf259159a 100644 --- a/src/event/format/mod.rs +++ b/src/event/format/mod.rs @@ -113,6 +113,7 @@ pub trait EventFormat: Sized { /// Returns the UTC time at ingestion fn get_p_timestamp(&self) -> DateTime; + #[allow(clippy::too_many_arguments)] fn into_recordbatch( self, storage_schema: &HashMap>, diff --git a/src/handlers/http/ingest.rs b/src/handlers/http/ingest.rs index 09b11eddc..06d18ed0b 100644 --- a/src/handlers/http/ingest.rs +++ b/src/handlers/http/ingest.rs @@ -38,7 +38,7 @@ use crate::utils::header_parsing::ParseHeaderError; use crate::utils::json::flatten::JsonFlattenError; use super::logstream::error::{CreateStreamError, StreamError}; -use super::modal::utils::ingest_utils::flatten_and_push_logs; +use super::modal::utils::ingest_utils::push_logs; use super::users::dashboards::DashboardError; use super::users::filters::FiltersError; @@ -72,7 +72,7 @@ pub async fn ingest(req: HttpRequest, Json(json): Json) -> Result Result Result<(), PostError> { - let json = match log_source { - LogSource::Kinesis => { - //custom flattening required for Amazon Kinesis - let message: Message = serde_json::from_value(json)?; - flatten_kinesis_logs(message) - } - LogSource::OtelLogs => { - //custom flattening required for otel logs - let logs: LogsData = serde_json::from_value(json)?; - flatten_otel_logs(&logs) - } - LogSource::OtelTraces => { - //custom flattening required for otel traces - let traces: TracesData = serde_json::from_value(json)?; - flatten_otel_traces(&traces) - } - LogSource::OtelMetrics => { - //custom flattening required for otel metrics - let metrics: MetricsData = serde_json::from_value(json)?; - flatten_otel_metrics(metrics) - } - _ => vec![json], - }; - push_logs(stream_name, json, 
log_source).await?; - Ok(()) -} - -async fn push_logs( - stream_name: &str, - jsons: Vec, + json: Value, log_source: &LogSource, ) -> Result<(), PostError> { let stream = PARSEABLE.get_stream(stream_name)?; @@ -80,7 +41,6 @@ async fn push_logs( let schema_version = stream.get_schema_version(); let p_timestamp = Utc::now(); - for json in jsons { let origin_size = serde_json::to_vec(&json).unwrap().len() as u64; // string length need not be the same as byte length let schema = PARSEABLE.get_stream(stream_name)?.get_schema_raw(); json::Event { json, p_timestamp } @@ -97,7 +57,7 @@ async fn push_logs( StreamType::UserDefined, )? .process()?; - } + Ok(()) } From c2faefc8d9d0ba4b9785265e82343dada688e5f5 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Sat, 1 Mar 2025 12:49:53 +0530 Subject: [PATCH 10/39] refactor: further streamline, associate w/ `Parseable` --- src/connectors/kafka/processor.rs | 47 ++++---------- src/handlers/http/ingest.rs | 52 +++++++-------- src/handlers/http/modal/utils/ingest_utils.rs | 63 ------------------- src/handlers/http/modal/utils/mod.rs | 1 - src/parseable/streams.rs | 35 ++++++++++- 5 files changed, 73 insertions(+), 125 deletions(-) delete mode 100644 src/handlers/http/modal/utils/ingest_utils.rs diff --git a/src/connectors/kafka/processor.rs b/src/connectors/kafka/processor.rs index b9fe2101d..fa2771fc7 100644 --- a/src/connectors/kafka/processor.rs +++ b/src/connectors/kafka/processor.rs @@ -26,12 +26,7 @@ use tokio_stream::wrappers::ReceiverStream; use tracing::{debug, error}; use crate::{ - connectors::common::processor::Processor, - event::{ - format::{json, EventFormat, LogSource}, - Event as ParseableEvent, - }, - parseable::PARSEABLE, + connectors::common::processor::Processor, event::format::LogSource, parseable::PARSEABLE, storage::StreamType, }; @@ -41,10 +36,7 @@ use super::{config::BufferConfig, ConsumerRecord, StreamConsumer, TopicPartition pub struct ParseableSinkProcessor; impl ParseableSinkProcessor { - async fn build_event_from_chunk( - &self, - records: &[ConsumerRecord], - ) -> anyhow::Result { + async fn process_event_from_chunk(&self, records: &[ConsumerRecord]) -> anyhow::Result { let stream_name = records .first() .map(|r| r.topic.as_str()) @@ -54,14 +46,6 @@ impl ParseableSinkProcessor { .create_stream_if_not_exists(stream_name, StreamType::UserDefined, LogSource::Json) .await?; - let stream = PARSEABLE.get_stream(stream_name)?; - let schema = stream.get_schema_raw(); - let time_partition = stream.get_time_partition(); - let time_partition_limit = stream.get_time_partition_limit(); - let custom_partition = stream.get_custom_partition(); - let static_schema_flag = stream.get_static_schema_flag(); - let schema_version = stream.get_schema_version(); - let mut json_vec = Vec::with_capacity(records.len()); let mut total_payload_size = 0u64; @@ -72,20 +56,15 @@ impl ParseableSinkProcessor { } } - let p_event = json::Event::new(Value::Array(json_vec)).into_event( - stream_name.to_string(), - total_payload_size, - &schema, - static_schema_flag, - custom_partition.as_ref(), - time_partition.as_ref(), - time_partition_limit, - schema_version, - &LogSource::Custom("Kafka".to_owned()), - StreamType::UserDefined, - )?; - - Ok(p_event) + PARSEABLE + .get_or_create_stream(stream_name) + .push_logs( + Value::Array(json_vec), + &LogSource::Custom("Kafka".to_owned()), + ) + .await?; + + Ok(total_payload_size) } } @@ -95,9 +74,9 @@ impl Processor, ()> for ParseableSinkProcessor { let len = records.len(); debug!("Processing {len} records"); - 
self.build_event_from_chunk(&records).await?.process()?; + let size = self.process_event_from_chunk(&records).await?; - debug!("Processed {len} records"); + debug!("Processed {len} records, size = {size} Bytes"); Ok(()) } } diff --git a/src/handlers/http/ingest.rs b/src/handlers/http/ingest.rs index 06d18ed0b..740269d40 100644 --- a/src/handlers/http/ingest.rs +++ b/src/handlers/http/ingest.rs @@ -28,9 +28,8 @@ use serde_json::Value; use crate::event; use crate::event::error::EventError; -use crate::event::format::{self, EventFormat, LogSource}; +use crate::event::format::LogSource; use crate::handlers::{LOG_SOURCE_KEY, STREAM_NAME_HEADER_KEY}; -use crate::metadata::SchemaVersion; use crate::option::Mode; use crate::parseable::{StreamNotFound, PARSEABLE}; use crate::storage::{ObjectStorageError, StreamType}; @@ -38,7 +37,6 @@ use crate::utils::header_parsing::ParseHeaderError; use crate::utils::json::flatten::JsonFlattenError; use super::logstream::error::{CreateStreamError, StreamError}; -use super::modal::utils::ingest_utils::push_logs; use super::users::dashboards::DashboardError; use super::users::filters::FiltersError; @@ -72,31 +70,21 @@ pub async fn ingest(req: HttpRequest, Json(json): Json) -> Result Result<(), PostError> { - let size: usize = body.len(); let json: Value = serde_json::from_slice(&body)?; - let schema = PARSEABLE.get_stream(&stream_name)?.get_schema_raw(); - - // For internal streams, use old schema - format::json::Event::new(json) - .into_event( - stream_name, - size as u64, - &schema, - false, - None, - None, - None, - SchemaVersion::V0, - &LogSource::Pmeta, - StreamType::Internal, - )? - .process()?; + + PARSEABLE + .get_stream(&stream_name)? + .push_logs(json, &LogSource::Pmeta) + .await?; Ok(()) } @@ -125,7 +113,10 @@ pub async fn handle_otel_logs_ingestion( .create_stream_if_not_exists(&stream_name, StreamType::UserDefined, LogSource::OtelLogs) .await?; - push_logs(&stream_name, json, &log_source).await?; + PARSEABLE + .get_or_create_stream(&stream_name) + .push_logs(json, &log_source) + .await?; Ok(HttpResponse::Ok().finish()) } @@ -156,7 +147,10 @@ pub async fn handle_otel_metrics_ingestion( ) .await?; - push_logs(&stream_name, json, &log_source).await?; + PARSEABLE + .get_or_create_stream(&stream_name) + .push_logs(json, &log_source) + .await?; Ok(HttpResponse::Ok().finish()) } @@ -184,7 +178,10 @@ pub async fn handle_otel_traces_ingestion( .create_stream_if_not_exists(&stream_name, StreamType::UserDefined, LogSource::OtelTraces) .await?; - push_logs(&stream_name, json, &log_source).await?; + PARSEABLE + .get_or_create_stream(&stream_name) + .push_logs(json, &log_source) + .await?; Ok(HttpResponse::Ok().finish()) } @@ -233,7 +230,10 @@ pub async fn post_event( return Err(PostError::OtelNotSupported); } - push_logs(&stream_name, json, &log_source).await?; + PARSEABLE + .get_or_create_stream(&stream_name) + .push_logs(json, &log_source) + .await?; Ok(HttpResponse::Ok().finish()) } diff --git a/src/handlers/http/modal/utils/ingest_utils.rs b/src/handlers/http/modal/utils/ingest_utils.rs deleted file mode 100644 index aa8fca6a7..000000000 --- a/src/handlers/http/modal/utils/ingest_utils.rs +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Parseable Server (C) 2022 - 2024 Parseable, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of the - * License, or (at your option) any later version. 
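
A minimal sketch, not part of the patch series: after this refactor the HTTP handlers and the Kafka sink funnel ingestion through the same `Stream::push_logs` path shown in the hunks above. The call shape below mirrors those handlers as the signature stands at this point in the series (later patches adjust it); the stream name "demo" and the payload are hypothetical, and the `crate::` paths are the ones visible in these diffs.

use serde_json::json;

use crate::{event::format::LogSource, handlers::http::ingest::PostError, parseable::PARSEABLE};

// Create the stream on first use, then flatten and push the payload through
// the same `push_logs` the OTEL and default JSON handlers call.
async fn ingest_example() -> Result<(), PostError> {
    PARSEABLE
        .get_or_create_stream("demo")
        .push_logs(json!({"level": "info", "msg": "hello"}), &LogSource::Json)
        .await?;
    Ok(())
}
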
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - * - */ - -use chrono::Utc; -use serde_json::Value; - -use crate::{ - event::format::{json, EventFormat, LogSource}, - handlers::http::ingest::PostError, - parseable::PARSEABLE, - storage::StreamType, -}; - -pub async fn push_logs( - stream_name: &str, - json: Value, - log_source: &LogSource, -) -> Result<(), PostError> { - let stream = PARSEABLE.get_stream(stream_name)?; - let time_partition = stream.get_time_partition(); - let time_partition_limit = PARSEABLE - .get_stream(stream_name)? - .get_time_partition_limit(); - let static_schema_flag = stream.get_static_schema_flag(); - let custom_partition = stream.get_custom_partition(); - let schema_version = stream.get_schema_version(); - let p_timestamp = Utc::now(); - - let origin_size = serde_json::to_vec(&json).unwrap().len() as u64; // string length need not be the same as byte length - let schema = PARSEABLE.get_stream(stream_name)?.get_schema_raw(); - json::Event { json, p_timestamp } - .into_event( - stream_name.to_owned(), - origin_size, - &schema, - static_schema_flag, - custom_partition.as_ref(), - time_partition.as_ref(), - time_partition_limit, - schema_version, - log_source, - StreamType::UserDefined, - )? - .process()?; - - - Ok(()) -} diff --git a/src/handlers/http/modal/utils/mod.rs b/src/handlers/http/modal/utils/mod.rs index 61930d43d..1d0a3767b 100644 --- a/src/handlers/http/modal/utils/mod.rs +++ b/src/handlers/http/modal/utils/mod.rs @@ -16,6 +16,5 @@ * */ -pub mod ingest_utils; pub mod logstream_utils; pub mod rbac_utils; diff --git a/src/parseable/streams.rs b/src/parseable/streams.rs index 009e01d2c..d9cd373ad 100644 --- a/src/parseable/streams.rs +++ b/src/parseable/streams.rs @@ -42,12 +42,16 @@ use parquet::{ }; use rand::distributions::DistString; use relative_path::RelativePathBuf; +use serde_json::Value; use tokio::task::JoinSet; use tracing::{error, info, trace, warn}; use crate::{ cli::Options, - event::DEFAULT_TIMESTAMP_KEY, + event::{ + format::{json, EventFormat, LogSource}, + DEFAULT_TIMESTAMP_KEY, + }, metadata::{LogStreamMetadata, SchemaVersion}, metrics, option::Mode, @@ -109,6 +113,35 @@ impl Stream { }) } + pub async fn push_logs(&self, json: Value, log_source: &LogSource) -> anyhow::Result<()> { + let time_partition = self.get_time_partition(); + let time_partition_limit = self.get_time_partition_limit(); + let static_schema_flag = self.get_static_schema_flag(); + let custom_partition = self.get_custom_partition(); + let schema_version = self.get_schema_version(); + let schema = self.get_schema_raw(); + let stream_type = self.get_stream_type(); + + let origin_size = serde_json::to_vec(&json).unwrap().len() as u64; // string length need not be the same as byte length + + json::Event::new(json) + .into_event( + self.stream_name.to_owned(), + origin_size, + &schema, + static_schema_flag, + custom_partition.as_ref(), + time_partition.as_ref(), + time_partition_limit, + schema_version, + log_source, + stream_type, + )? + .process()?; + + Ok(()) + } + // Concatenates record batches and puts them in memory store for each event. 
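// Annotation (not in the original patch): the `key` this `push` receives is
// the partition prefix built by callers later in this series: a hex digest of
// the record batch schema (`get_schema_key`), optionally extended with a
// minute-granular `%Y%m%dT%H%M` suffix when a time partition is configured,
// plus `&column=value` pairs for any custom partitions.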
pub fn push( &self, From 354061a9a95214c0dbd2b3ec6b97bf1a03587d7f Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Sat, 1 Mar 2025 12:53:45 +0530 Subject: [PATCH 11/39] ci: deepsource suggestion --- src/event/format/json.rs | 61 ++++++++++++++++++++++------------------ 1 file changed, 33 insertions(+), 28 deletions(-) diff --git a/src/event/format/json.rs b/src/event/format/json.rs index 0bdda6a38..0c2c06155 100644 --- a/src/event/format/json.rs +++ b/src/event/format/json.rs @@ -154,34 +154,39 @@ impl EventFormat for Event { collect_keys(flattened.iter()).expect("fields can be collected from array of objects"); let mut is_first = false; - let schema = match derive_arrow_schema(stored_schema, fields) { - Some(schema) => schema, - _ => { - let mut infer_schema = infer_json_schema_from_iterator(flattened.iter().map(Ok)) - .map_err(|err| { - anyhow!("Could not infer schema for this event due to err {:?}", err) - })?; - let new_infer_schema = super::update_field_type_in_schema( - Arc::new(infer_schema), - Some(stored_schema), - time_partition, - Some(&flattened), - schema_version, - ); - infer_schema = Schema::new(new_infer_schema.fields().clone()); - Schema::try_merge(vec![ - Schema::new(stored_schema.values().cloned().collect::()), - infer_schema.clone(), - ]).map_err(|err| anyhow!("Could not merge schema of this event with that of the existing stream. {:?}", err))?; - is_first = true; - infer_schema - .fields - .iter() - .filter(|field| !field.data_type().is_null()) - .cloned() - .sorted_by(|a, b| a.name().cmp(b.name())) - .collect() - } + let schema = if let Some(schema) = derive_arrow_schema(stored_schema, fields) { + schema + } else { + let mut infer_schema = infer_json_schema_from_iterator(flattened.iter().map(Ok)) + .map_err(|err| { + anyhow!("Could not infer schema for this event due to err {:?}", err) + })?; + let new_infer_schema = super::update_field_type_in_schema( + Arc::new(infer_schema), + Some(stored_schema), + time_partition, + Some(&flattened), + schema_version, + ); + infer_schema = Schema::new(new_infer_schema.fields().clone()); + Schema::try_merge(vec![ + Schema::new(stored_schema.values().cloned().collect::()), + infer_schema.clone(), + ]) + .map_err(|err| { + anyhow!( + "Could not merge schema of this event with that of the existing stream. 
{:?}", + err + ) + })?; + is_first = true; + infer_schema + .fields + .iter() + .filter(|field| !field.data_type().is_null()) + .cloned() + .sorted_by(|a, b| a.name().cmp(b.name())) + .collect() }; if flattened From 1386d3bd5de58f89d1f9758aae98ef544e180378 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Sat, 1 Mar 2025 13:02:07 +0530 Subject: [PATCH 12/39] fix: flattening --- src/event/format/json.rs | 42 ++++++++++++++-------------------------- 1 file changed, 14 insertions(+), 28 deletions(-) diff --git a/src/event/format/json.rs b/src/event/format/json.rs index 0c2c06155..a056f2bf4 100644 --- a/src/event/format/json.rs +++ b/src/event/format/json.rs @@ -43,10 +43,7 @@ use crate::{ metadata::SchemaVersion, otel::{logs::flatten_otel_logs, metrics::flatten_otel_metrics, traces::flatten_otel_traces}, storage::StreamType, - utils::{ - arrow::get_field, - json::{convert_array_to_object, flatten::convert_to_array}, - }, + utils::{arrow::get_field, json::convert_array_to_object}, }; pub struct Event { @@ -70,7 +67,7 @@ pub fn flatten_logs( custom_partitions: Option<&String>, schema_version: SchemaVersion, log_source: &LogSource, -) -> Result, anyhow::Error> { +) -> anyhow::Result> { let data = match log_source { LogSource::Kinesis => { //custom flattening required for Amazon Kinesis @@ -97,25 +94,14 @@ pub fn flatten_logs( let mut logs = vec![]; for json in data { - if time_partition.is_some() || custom_partitions.is_some() { - logs.append(&mut convert_array_to_object( - json, - time_partition, - time_partition_limit, - custom_partitions, - schema_version, - log_source, - )?) - } else { - logs.push(convert_to_array(convert_array_to_object( - json, - None, - None, - None, - schema_version, - log_source, - )?)?) - } + logs.append(&mut convert_array_to_object( + json, + time_partition, + time_partition_limit, + custom_partitions, + schema_version, + log_source, + )?) 
} Ok(logs) @@ -139,7 +125,7 @@ impl EventFormat for Event { custom_partitions: Option<&String>, schema_version: SchemaVersion, log_source: &LogSource, - ) -> Result<(Self::Data, Vec>, bool), anyhow::Error> { + ) -> anyhow::Result<(Self::Data, Vec>, bool)> { let flattened = flatten_logs( self.json, time_partition, @@ -202,7 +188,7 @@ impl EventFormat for Event { } // Convert the Data type (defined above) to arrow record batch - fn decode(data: Self::Data, schema: Arc) -> Result { + fn decode(data: Self::Data, schema: Arc) -> anyhow::Result { let array_capacity = round_upto_multiple_of_64(data.len()); let mut reader = ReaderBuilder::new(schema) .with_batch_size(array_capacity) @@ -230,7 +216,7 @@ impl EventFormat for Event { schema_version: SchemaVersion, log_source: &LogSource, stream_type: StreamType, - ) -> Result { + ) -> anyhow::Result { let custom_partition_values = match custom_partitions.as_ref() { Some(custom_partitions) => { let custom_partitions = custom_partitions.split(',').collect_vec(); @@ -295,7 +281,7 @@ pub fn extract_custom_partition_values( fn extract_and_parse_time( json: &Value, time_partition: &str, -) -> Result { +) -> anyhow::Result { let current_time = json .get(time_partition) .ok_or_else(|| anyhow!("Missing field for time partition in json: {time_partition}"))?; From 38a52c2cffe8b70d356a502f0cc3f12f8b35a017 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Sat, 1 Mar 2025 13:12:59 +0530 Subject: [PATCH 13/39] remove unused code --- .../http/modal/ingest/ingestor_ingest.rs | 43 ------------------- 1 file changed, 43 deletions(-) delete mode 100644 src/handlers/http/modal/ingest/ingestor_ingest.rs diff --git a/src/handlers/http/modal/ingest/ingestor_ingest.rs b/src/handlers/http/modal/ingest/ingestor_ingest.rs deleted file mode 100644 index 1af6180d4..000000000 --- a/src/handlers/http/modal/ingest/ingestor_ingest.rs +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Parseable Server (C) 2022 - 2024 Parseable, Inc. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of the - * License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . 
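
A minimal sketch, not part of the patch series, of the contract the previous patch ("fix: flattening") gives `flatten_logs`: whatever the payload shape, the caller gets back one flattened JSON value per event. The paths, argument order and `None` defaults come from the diffs above; the exact flattened key naming is delegated to the crate's flattening helpers and not reproduced here.

use serde_json::json;

use crate::{
    event::format::{json::flatten_logs, LogSource},
    metadata::SchemaVersion,
};

fn flatten_example() -> anyhow::Result<()> {
    // A single object and an array of objects normalize to the same shape:
    // a Vec with one already-flattened entry per event.
    let single = flatten_logs(
        json!({"a": 1}),
        None,
        None,
        None,
        SchemaVersion::V1,
        &LogSource::Json,
    )?;
    let batch = flatten_logs(
        json!([{"a": 1}, {"a": 2}]),
        None,
        None,
        None,
        SchemaVersion::V1,
        &LogSource::Json,
    )?;
    assert_eq!(single.len(), 1);
    assert_eq!(batch.len(), 2);
    Ok(())
}
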
- * - */ - -use actix_web::{HttpRequest, HttpResponse}; -use bytes::Bytes; - -use crate::{handlers::http::{ingest::PostError, modal::utils::ingest_utils::push_logs}, metadata::PARSEABLE.streams}; - - -// Handler for POST /api/v1/logstream/{logstream} -// only ingests events into the specified logstream -// fails if the logstream does not exist -pub async fn post_event(req: HttpRequest, body: Bytes) -> Result { - let stream_name: String = req.match_info().get("logstream").unwrap().parse().unwrap(); - let internal_stream_names = PARSEABLE.streams.list_internal_streams(); - if internal_stream_names.contains(&stream_name) { - return Err(PostError::Invalid(anyhow::anyhow!( - "Stream {} is an internal stream and cannot be ingested into", - stream_name - ))); - } - if !PARSEABLE.streams.stream_exists(&stream_name) { - return Err(PostError::StreamNotFound(stream_name)); - } - - push_logs(req, body, stream_name).await?; - Ok(HttpResponse::Ok().finish()) -} \ No newline at end of file From a7b2db391b649a0747e265efb09d7630fbcaf314 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Sat, 1 Mar 2025 14:29:21 +0530 Subject: [PATCH 14/39] fix: partitioning --- src/event/format/json.rs | 326 ++++++++++++++++++++++----------- src/event/format/mod.rs | 40 ++-- src/event/mod.rs | 93 +++++----- src/handlers/http/ingest.rs | 12 +- src/handlers/http/logstream.rs | 8 +- src/utils/json/mod.rs | 69 +++---- 6 files changed, 319 insertions(+), 229 deletions(-) diff --git a/src/event/format/json.rs b/src/event/format/json.rs index a056f2bf4..08c132209 100644 --- a/src/event/format/json.rs +++ b/src/event/format/json.rs @@ -39,11 +39,15 @@ use tracing::error; use super::{EventFormat, LogSource}; use crate::{ + event::PartitionEvent, kinesis::{flatten_kinesis_logs, Message}, metadata::SchemaVersion, otel::{logs::flatten_otel_logs, metrics::flatten_otel_metrics, traces::flatten_otel_traces}, storage::StreamType, - utils::{arrow::get_field, json::convert_array_to_object}, + utils::{ + arrow::get_field, + json::{flatten_json_body, Json}, + }, }; pub struct Event { @@ -67,7 +71,7 @@ pub fn flatten_logs( custom_partitions: Option<&String>, schema_version: SchemaVersion, log_source: &LogSource, -) -> anyhow::Result> { +) -> anyhow::Result> { let data = match log_source { LogSource::Kinesis => { //custom flattening required for Amazon Kinesis @@ -94,26 +98,40 @@ pub fn flatten_logs( let mut logs = vec![]; for json in data { - logs.append(&mut convert_array_to_object( + let json = flatten_json_body( json, time_partition, time_partition_limit, custom_partitions, schema_version, + true, log_source, - )?) + )?; + + // incoming event may be a single json or a json array + // but Data (type defined above) is a vector of json values + // hence we need to convert the incoming event to a vector of json values + match json { + Value::Array(arr) => { + for log in arr { + let Value::Object(json) = log else { + return Err(anyhow!( + "Expected an object or a list of objects, received: {log:?}" + )); + }; + logs.push(json); + } + } + Value::Object(obj) => logs.push(obj), + _ => unreachable!("flatten would have failed beforehand"), + } } Ok(logs) } impl EventFormat for Event { - type Data = Vec; - - /// Returns the time at ingestion, i.e. 
the `p_timestamp` value - fn get_p_timestamp(&self) -> DateTime { - self.p_timestamp - } + type Data = Vec; // convert the incoming json to a vector of json values // also extract the arrow schema, tags and metadata from the incoming json @@ -136,17 +154,17 @@ impl EventFormat for Event { )?; // collect all the keys from all the json objects in the request body - let fields = - collect_keys(flattened.iter()).expect("fields can be collected from array of objects"); + let fields = collect_keys(flattened.iter()); let mut is_first = false; let schema = if let Some(schema) = derive_arrow_schema(stored_schema, fields) { schema } else { - let mut infer_schema = infer_json_schema_from_iterator(flattened.iter().map(Ok)) - .map_err(|err| { - anyhow!("Could not infer schema for this event due to err {:?}", err) - })?; + // TODO: + let mut infer_schema = infer_json_schema_from_iterator( + flattened.iter().map(|obj| Ok(Value::Object(obj.clone()))), + ) + .map_err(|err| anyhow!("Could not infer schema for this event due to err {:?}", err))?; let new_infer_schema = super::update_field_type_in_schema( Arc::new(infer_schema), Some(stored_schema), @@ -217,22 +235,9 @@ impl EventFormat for Event { log_source: &LogSource, stream_type: StreamType, ) -> anyhow::Result { - let custom_partition_values = match custom_partitions.as_ref() { - Some(custom_partitions) => { - let custom_partitions = custom_partitions.split(',').collect_vec(); - extract_custom_partition_values(&self.json, &custom_partitions) - } - None => HashMap::new(), - }; - - let parsed_timestamp = match time_partition { - Some(time_partition) => extract_and_parse_time(&self.json, time_partition)?, - _ => self.p_timestamp.naive_utc(), - }; - - let (rb, is_first_event) = self.into_recordbatch( + let p_timestamp = self.p_timestamp; + let (data, schema, is_first_event) = self.to_data( storage_schema, - static_schema_flag, time_partition, time_partition_limit, custom_partitions, @@ -240,15 +245,45 @@ impl EventFormat for Event { log_source, )?; + let mut partitions = vec![]; + for json in data { + let custom_partition_values = match custom_partitions.as_ref() { + Some(custom_partitions) => { + let custom_partitions = custom_partitions.split(',').collect_vec(); + extract_custom_partition_values(&json, &custom_partitions) + } + None => HashMap::new(), + }; + + let parsed_timestamp = match time_partition { + Some(time_partition) => extract_and_parse_time(&json, time_partition)?, + _ => p_timestamp.naive_utc(), + }; + + let rb = Self::into_recordbatch( + p_timestamp, + vec![json], + schema.clone(), + storage_schema, + static_schema_flag, + time_partition, + schema_version, + )?; + + partitions.push(PartitionEvent { + rb, + parsed_timestamp, + custom_partition_values, + }); + } + Ok(super::Event { - rb, stream_name, origin_format: "json", origin_size, is_first_event, - parsed_timestamp, time_partition: None, - custom_partition_values, + partitions, stream_type, }) } @@ -257,7 +292,7 @@ impl EventFormat for Event { /// Extracts custom partition values from provided JSON object /// e.g. `json: {"status": 400, "msg": "Hello, World!"}, custom_partition_list: ["status"]` returns `{"status" => 400}` pub fn extract_custom_partition_values( - json: &Value, + json: &Json, custom_partition_list: &[&str], ) -> HashMap { let mut custom_partition_values: HashMap = HashMap::new(); @@ -278,10 +313,7 @@ pub fn extract_custom_partition_values( /// Returns the parsed timestamp of deignated time partition from json object /// e.g. 
`json: {"timestamp": "2025-05-15T15:30:00Z"}` returns `2025-05-15T15:30:00` -fn extract_and_parse_time( - json: &Value, - time_partition: &str, -) -> anyhow::Result { +fn extract_and_parse_time(json: &Json, time_partition: &str) -> anyhow::Result { let current_time = json .get(time_partition) .ok_or_else(|| anyhow!("Missing field for time partition in json: {time_partition}"))?; @@ -308,28 +340,24 @@ fn derive_arrow_schema( // Returns a list of keys that are present in the given iterable of JSON objects // Returns None if even one of the value is not an Object -fn collect_keys<'a>(values: impl Iterator) -> Option> { +fn collect_keys<'a>(objects: impl Iterator) -> HashSet<&'a str> { let mut keys = HashSet::new(); - for value in values { - let obj = value.as_object()?; - for key in obj.keys() { + for object in objects { + for key in object.keys() { keys.insert(key.as_str()); } } - Some(keys) + keys } // Returns true when the field doesn't exist in schema or has an invalid type -fn fields_mismatch(schema: &[Arc], body: &Value, schema_version: SchemaVersion) -> bool { - body.as_object() - .expect("body is of object variant") - .iter() - .any(|(key, value)| { - !value.is_null() - && get_field(schema, key) - .is_none_or(|field| !valid_type(field.data_type(), value, schema_version)) - }) +fn fields_mismatch(schema: &[Arc], body: &Json, schema_version: SchemaVersion) -> bool { + body.iter().any(|(key, value)| { + !value.is_null() + && get_field(schema, key) + .is_none_or(|field| !valid_type(field.data_type(), value, schema_version)) + }) } fn valid_type(data_type: &DataType, value: &Value, schema_version: SchemaVersion) -> bool { @@ -400,7 +428,7 @@ mod tests { #[test] fn parse_time_parition_from_value() { let json = json!({"timestamp": "2025-05-15T15:30:00Z"}); - let parsed = extract_and_parse_time(&json, "timestamp"); + let parsed = extract_and_parse_time(json.as_object().unwrap(), "timestamp"); let expected = NaiveDateTime::from_str("2025-05-15T15:30:00").unwrap(); assert_eq!(parsed.unwrap(), expected); @@ -409,7 +437,7 @@ mod tests { #[test] fn time_parition_not_in_json() { let json = json!({"hello": "world!"}); - let parsed = extract_and_parse_time(&json, "timestamp"); + let parsed = extract_and_parse_time(json.as_object().unwrap(), "timestamp"); assert!(parsed.is_err()); } @@ -417,7 +445,7 @@ mod tests { #[test] fn time_parition_not_parseable_as_datetime() { let json = json!({"timestamp": "not time"}); - let parsed = extract_and_parse_time(&json, "timestamp"); + let parsed = extract_and_parse_time(json.as_object().unwrap(), "timestamp"); assert!(parsed.is_err()); } @@ -454,10 +482,10 @@ mod tests { "b": "hello", }); - let (rb, _) = Event::new(json) - .into_recordbatch( - &HashMap::default(), - false, + let store_schema = HashMap::default(); + let (data, schema, _) = Event::new(json) + .to_data( + &store_schema, None, None, None, @@ -465,6 +493,16 @@ mod tests { &LogSource::Json, ) .unwrap(); + let rb = Event::into_recordbatch( + Utc::now(), + data, + schema, + &store_schema, + false, + None, + SchemaVersion::V0, + ) + .unwrap(); assert_eq!(rb.num_rows(), 1); assert_eq!(rb.num_columns(), 4); @@ -490,10 +528,10 @@ mod tests { "c": null }); - let (rb, _) = Event::new(json) - .into_recordbatch( - &HashMap::default(), - false, + let store_schema = HashMap::default(); + let (data, schema, _) = Event::new(json) + .to_data( + &store_schema, None, None, None, @@ -501,6 +539,16 @@ mod tests { &LogSource::Json, ) .unwrap(); + let rb = Event::into_recordbatch( + Utc::now(), + data, + schema, + 
&store_schema, + false, + None, + SchemaVersion::V0, + ) + .unwrap(); assert_eq!(rb.num_rows(), 1); assert_eq!(rb.num_columns(), 3); @@ -521,7 +569,7 @@ mod tests { "b": "hello", }); - let schema = fields_to_map( + let store_schema = fields_to_map( [ Field::new("a", DataType::Int64, true), Field::new("b", DataType::Utf8, true), @@ -529,11 +577,9 @@ mod tests { ] .into_iter(), ); - - let (rb, _) = Event::new(json) - .into_recordbatch( - &schema, - false, + let (data, schema, _) = Event::new(json) + .to_data( + &store_schema, None, None, None, @@ -541,6 +587,16 @@ mod tests { &LogSource::Json, ) .unwrap(); + let rb = Event::into_recordbatch( + Utc::now(), + data, + schema, + &store_schema, + false, + None, + SchemaVersion::V0, + ) + .unwrap(); assert_eq!(rb.num_rows(), 1); assert_eq!(rb.num_columns(), 3); @@ -561,7 +617,7 @@ mod tests { "b": 1, // type mismatch }); - let schema = fields_to_map( + let store_schema = fields_to_map( [ Field::new("a", DataType::Int64, true), Field::new("b", DataType::Utf8, true), @@ -571,14 +627,13 @@ mod tests { ); assert!(Event::new(json) - .into_recordbatch( - &schema, - false, + .to_data( + &store_schema, None, None, None, SchemaVersion::V0, - &LogSource::Json + &LogSource::Json, ) .is_err()); } @@ -587,7 +642,7 @@ mod tests { fn empty_object() { let json = json!({}); - let schema = fields_to_map( + let store_schema = fields_to_map( [ Field::new("a", DataType::Int64, true), Field::new("b", DataType::Utf8, true), @@ -596,10 +651,9 @@ mod tests { .into_iter(), ); - let (rb, _) = Event::new(json) - .into_recordbatch( - &schema, - false, + let (data, schema, _) = Event::new(json) + .to_data( + &store_schema, None, None, None, @@ -607,6 +661,16 @@ mod tests { &LogSource::Json, ) .unwrap(); + let rb = Event::into_recordbatch( + Utc::now(), + data, + schema, + &store_schema, + false, + None, + SchemaVersion::V0, + ) + .unwrap(); assert_eq!(rb.num_rows(), 1); assert_eq!(rb.num_columns(), 1); @@ -630,10 +694,10 @@ mod tests { }, ]); - let (rb, _) = Event::new(json) - .into_recordbatch( - &HashMap::default(), - false, + let store_schema = HashMap::new(); + let (data, schema, _) = Event::new(json) + .to_data( + &store_schema, None, None, None, @@ -641,6 +705,16 @@ mod tests { &LogSource::Json, ) .unwrap(); + let rb = Event::into_recordbatch( + Utc::now(), + data, + schema, + &store_schema, + false, + None, + SchemaVersion::V0, + ) + .unwrap(); assert_eq!(rb.num_rows(), 3); assert_eq!(rb.num_columns(), 4); @@ -686,10 +760,10 @@ mod tests { }, ]); - let (rb, _) = Event::new(json) - .into_recordbatch( - &HashMap::default(), - false, + let store_schema = HashMap::new(); + let (data, schema, _) = Event::new(json) + .to_data( + &store_schema, None, None, None, @@ -697,6 +771,16 @@ mod tests { &LogSource::Json, ) .unwrap(); + let rb = Event::into_recordbatch( + Utc::now(), + data, + schema, + &store_schema, + false, + None, + SchemaVersion::V0, + ) + .unwrap(); assert_eq!(rb.num_rows(), 3); assert_eq!(rb.num_columns(), 4); @@ -734,7 +818,7 @@ mod tests { }, ]); - let schema = fields_to_map( + let store_schema = fields_to_map( [ Field::new("a", DataType::Int64, true), Field::new("b", DataType::Utf8, true), @@ -742,11 +826,9 @@ mod tests { ] .into_iter(), ); - - let (rb, _) = Event::new(json) - .into_recordbatch( - &schema, - false, + let (data, schema, _) = Event::new(json) + .to_data( + &store_schema, None, None, None, @@ -754,6 +836,16 @@ mod tests { &LogSource::Json, ) .unwrap(); + let rb = Event::into_recordbatch( + Utc::now(), + data, + schema, + &store_schema, + false, + 
None, + SchemaVersion::V0, + ) + .unwrap(); assert_eq!(rb.num_rows(), 3); assert_eq!(rb.num_columns(), 4); @@ -791,7 +883,7 @@ mod tests { }, ]); - let schema = fields_to_map( + let store_schema = fields_to_map( [ Field::new("a", DataType::Int64, true), Field::new("b", DataType::Utf8, true), @@ -801,14 +893,13 @@ mod tests { ); assert!(Event::new(json) - .into_recordbatch( - &schema, - false, + .to_data( + &store_schema, None, None, None, SchemaVersion::V0, - &LogSource::Json + &LogSource::Json, ) .is_err()); } @@ -837,10 +928,10 @@ mod tests { }, ]); - let (rb, _) = Event::new(json) - .into_recordbatch( - &HashMap::default(), - false, + let store_schema = HashMap::new(); + let (data, schema, _) = Event::new(json) + .to_data( + &store_schema, None, None, None, @@ -848,6 +939,17 @@ mod tests { &LogSource::Json, ) .unwrap(); + let rb = Event::into_recordbatch( + Utc::now(), + data, + schema, + &store_schema, + false, + None, + SchemaVersion::V0, + ) + .unwrap(); + assert_eq!(rb.num_rows(), 4); assert_eq!(rb.num_columns(), 5); assert_eq!( @@ -917,10 +1019,10 @@ mod tests { }, ]); - let (rb, _) = Event::new(json) - .into_recordbatch( - &HashMap::default(), - false, + let store_schema = HashMap::new(); + let (data, schema, _) = Event::new(json) + .to_data( + &store_schema, None, None, None, @@ -928,6 +1030,16 @@ mod tests { &LogSource::Json, ) .unwrap(); + let rb = Event::into_recordbatch( + Utc::now(), + data, + schema, + &store_schema, + false, + None, + SchemaVersion::V1, + ) + .unwrap(); assert_eq!(rb.num_rows(), 4); assert_eq!(rb.num_columns(), 5); diff --git a/src/event/format/mod.rs b/src/event/format/mod.rs index bf259159a..a6b977d7c 100644 --- a/src/event/format/mod.rs +++ b/src/event/format/mod.rs @@ -34,7 +34,10 @@ use serde_json::Value; use crate::{ metadata::SchemaVersion, storage::StreamType, - utils::arrow::{get_field, get_timestamp_array, replace_columns}, + utils::{ + arrow::{get_field, get_timestamp_array, replace_columns}, + json::Json, + }, }; use super::{Event, DEFAULT_TIMESTAMP_KEY}; @@ -110,30 +113,16 @@ pub trait EventFormat: Sized { fn decode(data: Self::Data, schema: Arc) -> Result; - /// Returns the UTC time at ingestion - fn get_p_timestamp(&self) -> DateTime; - #[allow(clippy::too_many_arguments)] fn into_recordbatch( - self, + p_timestamp: DateTime, + data: Self::Data, + mut schema: EventSchema, storage_schema: &HashMap>, static_schema_flag: bool, time_partition: Option<&String>, - time_partition_limit: Option, - custom_partition: Option<&String>, schema_version: SchemaVersion, - log_source: &LogSource, - ) -> Result<(RecordBatch, bool), AnyError> { - let p_timestamp = self.get_p_timestamp(); - let (data, mut schema, is_first) = self.to_data( - storage_schema, - time_partition, - time_partition_limit, - custom_partition, - schema_version, - log_source, - )?; - + ) -> Result { if get_field(&schema, DEFAULT_TIMESTAMP_KEY).is_some() { return Err(anyhow!( "field {} is a reserved field", @@ -162,7 +151,7 @@ pub trait EventFormat: Sized { } // prepare the record batch and new fields to be added - let mut new_schema = Arc::new(Schema::new(schema)); + let mut new_schema = Arc::new(Schema::new(schema.clone())); new_schema = update_field_type_in_schema(new_schema, None, time_partition, None, schema_version); @@ -173,7 +162,7 @@ pub trait EventFormat: Sized { &[(0, Arc::new(get_timestamp_array(p_timestamp, rb.num_rows())))], ); - Ok((rb, is_first)) + Ok(rb) } #[allow(clippy::too_many_arguments)] @@ -247,7 +236,7 @@ pub fn update_field_type_in_schema( inferred_schema: Arc, 
existing_schema: Option<&HashMap>>, time_partition: Option<&String>, - log_records: Option<&Vec>, + log_records: Option<&[Json]>, schema_version: SchemaVersion, ) -> Arc { let mut updated_schema = inferred_schema.clone(); @@ -292,18 +281,15 @@ pub fn update_field_type_in_schema( // a string value parseable into timestamp as timestamp type and all numbers as float64. pub fn override_data_type( inferred_schema: Arc, - log_record: Value, + log_record: Json, schema_version: SchemaVersion, ) -> Arc { - let Value::Object(map) = log_record else { - return inferred_schema; - }; let updated_schema: Vec = inferred_schema .fields() .iter() .map(|field| { let field_name = field.name().as_str(); - match (schema_version, map.get(field.name())) { + match (schema_version, log_record.get(field.name())) { // in V1 for new fields in json named "time"/"date" or such and having inferred // type string, that can be parsed as timestamp, use the timestamp type. // NOTE: support even more datetime string formats diff --git a/src/event/mod.rs b/src/event/mod.rs index 29a4a0899..e17bbedec 100644 --- a/src/event/mod.rs +++ b/src/event/mod.rs @@ -36,69 +36,78 @@ use std::collections::HashMap; pub const DEFAULT_TIMESTAMP_KEY: &str = "p_timestamp"; -#[derive(Clone)] +pub struct PartitionEvent { + pub rb: RecordBatch, + pub parsed_timestamp: NaiveDateTime, + pub custom_partition_values: HashMap, +} + pub struct Event { pub stream_name: String, - pub rb: RecordBatch, pub origin_format: &'static str, pub origin_size: u64, pub is_first_event: bool, - pub parsed_timestamp: NaiveDateTime, pub time_partition: Option, - pub custom_partition_values: HashMap, + pub partitions: Vec, pub stream_type: StreamType, } // Events holds the schema related to a each event for a single log stream impl Event { pub fn process(self) -> Result<(), EventError> { - let mut key = get_schema_key(&self.rb.schema().fields); - if self.time_partition.is_some() { - let parsed_timestamp_to_min = self.parsed_timestamp.format("%Y%m%dT%H%M").to_string(); - key.push_str(&parsed_timestamp_to_min); - } + for partition in self.partitions { + let mut key = get_schema_key(&partition.rb.schema().fields); + if self.time_partition.is_some() { + let parsed_timestamp_to_min = + partition.parsed_timestamp.format("%Y%m%dT%H%M").to_string(); + key.push_str(&parsed_timestamp_to_min); + } - if !self.custom_partition_values.is_empty() { - for (k, v) in self.custom_partition_values.iter().sorted_by_key(|v| v.0) { + for (k, v) in partition + .custom_partition_values + .iter() + .sorted_by_key(|v| v.0) + { key.push_str(&format!("&{k}={v}")); } - } - - if self.is_first_event { - commit_schema(&self.stream_name, self.rb.schema())?; - } - - PARSEABLE.get_or_create_stream(&self.stream_name).push( - &key, - &self.rb, - self.parsed_timestamp, - &self.custom_partition_values, - self.stream_type, - )?; - update_stats( - &self.stream_name, - self.origin_format, - self.origin_size, - self.rb.num_rows(), - self.parsed_timestamp.date(), - ); - - crate::livetail::LIVETAIL.process(&self.stream_name, &self.rb); + if self.is_first_event { + commit_schema(&self.stream_name, partition.rb.schema())?; + } + PARSEABLE.get_or_create_stream(&self.stream_name).push( + &key, + &partition.rb, + partition.parsed_timestamp, + &partition.custom_partition_values, + self.stream_type, + )?; + + update_stats( + &self.stream_name, + self.origin_format, + self.origin_size, + partition.rb.num_rows(), + partition.parsed_timestamp.date(), + ); + + crate::livetail::LIVETAIL.process(&self.stream_name, 
&partition.rb); + } Ok(()) } pub fn process_unchecked(&self) -> Result<(), EventError> { - let key = get_schema_key(&self.rb.schema().fields); - - PARSEABLE.get_or_create_stream(&self.stream_name).push( - &key, - &self.rb, - self.parsed_timestamp, - &self.custom_partition_values, - self.stream_type, - )?; + for partition in &self.partitions { + let key = get_schema_key(&partition.rb.schema().fields); + + PARSEABLE.get_or_create_stream(&self.stream_name).push( + &key, + &partition.rb, + partition.parsed_timestamp, + &partition.custom_partition_values, + self.stream_type, + )?; + } Ok(()) } diff --git a/src/handlers/http/ingest.rs b/src/handlers/http/ingest.rs index 740269d40..a50b0845d 100644 --- a/src/handlers/http/ingest.rs +++ b/src/handlers/http/ingest.rs @@ -26,9 +26,9 @@ use chrono::Utc; use http::StatusCode; use serde_json::Value; -use crate::event; use crate::event::error::EventError; use crate::event::format::LogSource; +use crate::event::{self, PartitionEvent}; use crate::handlers::{LOG_SOURCE_KEY, STREAM_NAME_HEADER_KEY}; use crate::option::Mode; use crate::parseable::{StreamNotFound, PARSEABLE}; @@ -243,14 +243,16 @@ pub async fn push_logs_unchecked( stream_name: &str, ) -> Result { let unchecked_event = event::Event { - rb: batches, stream_name: stream_name.to_string(), origin_format: "json", origin_size: 0, - parsed_timestamp: Utc::now().naive_utc(), time_partition: None, - is_first_event: true, // NOTE: Maybe should be false - custom_partition_values: HashMap::new(), // should be an empty map for unchecked push + is_first_event: true, // NOTE: Maybe should be false + partitions: vec![PartitionEvent { + rb: batches, + parsed_timestamp: Utc::now().naive_utc(), + custom_partition_values: HashMap::new(), // should be an empty map for unchecked push + }], stream_type: StreamType::UserDefined, }; unchecked_event.process_unchecked()?; diff --git a/src/handlers/http/logstream.rs b/src/handlers/http/logstream.rs index b9fb64edc..f486338fe 100644 --- a/src/handlers/http/logstream.rs +++ b/src/handlers/http/logstream.rs @@ -114,7 +114,13 @@ pub async fn detect_schema(Json(json): Json) -> Result; + /// calls the function `flatten_json` which results Vec or Error /// in case when Vec is returned, converts the Vec to Value of Array /// this is to ensure recursive flattening does not happen for heavily nested jsons @@ -61,32 +63,8 @@ pub fn flatten_json_body( custom_partition, validation_required, )?; - Ok(nested_value) -} -pub fn convert_array_to_object( - body: Value, - time_partition: Option<&String>, - time_partition_limit: Option, - custom_partition: Option<&String>, - schema_version: SchemaVersion, - log_source: &LogSource, -) -> Result, anyhow::Error> { - let data = flatten_json_body( - body, - time_partition, - time_partition_limit, - custom_partition, - schema_version, - true, - log_source, - )?; - let value_arr = match data { - Value::Array(arr) => arr, - value @ Value::Object(_) => vec![value], - _ => unreachable!("flatten would have failed beforehand"), - }; - Ok(value_arr) + Ok(nested_value) } struct TrueFromStr; @@ -283,12 +261,13 @@ mod tests { fn non_object_arr_is_err() { let json = json!([1]); - assert!(convert_array_to_object( + assert!(flatten_json_body( json, None, None, None, SchemaVersion::V0, + false, &crate::event::format::LogSource::default() ) .is_err()) @@ -316,16 +295,14 @@ mod tests { "c": [{"a": 1, "b": 2}] }, ]); - let flattened_json = convert_to_array( - convert_array_to_object( - json, - None, - None, - None, - SchemaVersion::V0, - 
&crate::event::format::LogSource::default(), - ) - .unwrap(), + let flattened_json = flatten_json_body( + json, + None, + None, + None, + SchemaVersion::V0, + false, + &crate::event::format::LogSource::default(), ) .unwrap(); @@ -377,16 +354,14 @@ mod tests { "c": [{"a": 1, "b": 2}] }, ]); - let flattened_json = convert_to_array( - convert_array_to_object( - json, - None, - None, - None, - SchemaVersion::V1, - &crate::event::format::LogSource::default(), - ) - .unwrap(), + let flattened_json = flatten_json_body( + json, + None, + None, + None, + SchemaVersion::V1, + false, + &crate::event::format::LogSource::default(), ) .unwrap(); From dc34a8513f6caf6e5c1621e3284aea25b3f15dfa Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Sat, 1 Mar 2025 14:56:27 +0530 Subject: [PATCH 15/39] refactor: share `Stream` state when processing --- src/event/format/json.rs | 34 +++++++++++++++++----------------- src/event/format/mod.rs | 12 ++---------- src/event/mod.rs | 17 ++++++++--------- src/handlers/http/ingest.rs | 7 +++---- src/parseable/mod.rs | 2 +- src/parseable/streams.rs | 23 ++--------------------- src/utils/arrow/flight.rs | 8 ++++---- 7 files changed, 37 insertions(+), 66 deletions(-) diff --git a/src/event/format/json.rs b/src/event/format/json.rs index 08c132209..61cb373f1 100644 --- a/src/event/format/json.rs +++ b/src/event/format/json.rs @@ -43,7 +43,7 @@ use crate::{ kinesis::{flatten_kinesis_logs, Message}, metadata::SchemaVersion, otel::{logs::flatten_otel_logs, metrics::flatten_otel_metrics, traces::flatten_otel_traces}, - storage::StreamType, + parseable::Stream, utils::{ arrow::get_field, json::{flatten_json_body, Json}, @@ -224,23 +224,24 @@ impl EventFormat for Event { /// Converts a JSON event into a Parseable Event fn into_event( self, - stream_name: String, origin_size: u64, - storage_schema: &HashMap>, - static_schema_flag: bool, - custom_partitions: Option<&String>, - time_partition: Option<&String>, - time_partition_limit: Option, - schema_version: SchemaVersion, + stream: &Stream, log_source: &LogSource, - stream_type: StreamType, ) -> anyhow::Result { + let time_partition = stream.get_time_partition(); + let time_partition_limit = stream.get_time_partition_limit(); + let static_schema_flag = stream.get_static_schema_flag(); + let custom_partitions = stream.get_custom_partition(); + let schema_version = stream.get_schema_version(); + let storage_schema = stream.get_schema_raw(); + let stream_type = stream.get_stream_type(); + let p_timestamp = self.p_timestamp; let (data, schema, is_first_event) = self.to_data( - storage_schema, - time_partition, + &storage_schema, + time_partition.as_ref(), time_partition_limit, - custom_partitions, + custom_partitions.as_ref(), schema_version, log_source, )?; @@ -255,8 +256,8 @@ impl EventFormat for Event { None => HashMap::new(), }; - let parsed_timestamp = match time_partition { - Some(time_partition) => extract_and_parse_time(&json, time_partition)?, + let parsed_timestamp = match time_partition.as_ref() { + Some(time_partition) => extract_and_parse_time(&json, time_partition.as_ref())?, _ => p_timestamp.naive_utc(), }; @@ -264,9 +265,9 @@ impl EventFormat for Event { p_timestamp, vec![json], schema.clone(), - storage_schema, + &storage_schema, static_schema_flag, - time_partition, + time_partition.as_ref(), schema_version, )?; @@ -278,7 +279,6 @@ impl EventFormat for Event { } Ok(super::Event { - stream_name, origin_format: "json", origin_size, is_first_event, diff --git a/src/event/format/mod.rs b/src/event/format/mod.rs index 
a6b977d7c..2d181fc7a 100644 --- a/src/event/format/mod.rs +++ b/src/event/format/mod.rs @@ -33,7 +33,7 @@ use serde_json::Value; use crate::{ metadata::SchemaVersion, - storage::StreamType, + parseable::Stream, utils::{ arrow::{get_field, get_timestamp_array, replace_columns}, json::Json, @@ -165,19 +165,11 @@ pub trait EventFormat: Sized { Ok(rb) } - #[allow(clippy::too_many_arguments)] fn into_event( self, - stream_name: String, origin_size: u64, - storage_schema: &HashMap>, - static_schema_flag: bool, - custom_partitions: Option<&String>, - time_partition: Option<&String>, - time_partition_limit: Option, - schema_version: SchemaVersion, + stream: &Stream, log_source: &LogSource, - stream_type: StreamType, ) -> Result; } diff --git a/src/event/mod.rs b/src/event/mod.rs index e17bbedec..a0c803340 100644 --- a/src/event/mod.rs +++ b/src/event/mod.rs @@ -27,7 +27,7 @@ use std::sync::Arc; use self::error::EventError; use crate::{ metadata::update_stats, - parseable::{StagingError, PARSEABLE}, + parseable::{StagingError, Stream, PARSEABLE}, storage::StreamType, LOCK_EXPECT, }; @@ -43,7 +43,6 @@ pub struct PartitionEvent { } pub struct Event { - pub stream_name: String, pub origin_format: &'static str, pub origin_size: u64, pub is_first_event: bool, @@ -54,7 +53,7 @@ pub struct Event { // Events holds the schema related to a each event for a single log stream impl Event { - pub fn process(self) -> Result<(), EventError> { + pub fn process(self, stream: &Stream) -> Result<(), EventError> { for partition in self.partitions { let mut key = get_schema_key(&partition.rb.schema().fields); if self.time_partition.is_some() { @@ -72,10 +71,10 @@ impl Event { } if self.is_first_event { - commit_schema(&self.stream_name, partition.rb.schema())?; + commit_schema(&stream.stream_name, partition.rb.schema())?; } - PARSEABLE.get_or_create_stream(&self.stream_name).push( + stream.push( &key, &partition.rb, partition.parsed_timestamp, @@ -84,23 +83,23 @@ impl Event { )?; update_stats( - &self.stream_name, + &stream.stream_name, self.origin_format, self.origin_size, partition.rb.num_rows(), partition.parsed_timestamp.date(), ); - crate::livetail::LIVETAIL.process(&self.stream_name, &partition.rb); + crate::livetail::LIVETAIL.process(&stream.stream_name, &partition.rb); } Ok(()) } - pub fn process_unchecked(&self) -> Result<(), EventError> { + pub fn process_unchecked(&self, stream: &Stream) -> Result<(), EventError> { for partition in &self.partitions { let key = get_schema_key(&partition.rb.schema().fields); - PARSEABLE.get_or_create_stream(&self.stream_name).push( + stream.push( &key, &partition.rb, partition.parsed_timestamp, diff --git a/src/handlers/http/ingest.rs b/src/handlers/http/ingest.rs index a50b0845d..75c8ccc82 100644 --- a/src/handlers/http/ingest.rs +++ b/src/handlers/http/ingest.rs @@ -31,7 +31,7 @@ use crate::event::format::LogSource; use crate::event::{self, PartitionEvent}; use crate::handlers::{LOG_SOURCE_KEY, STREAM_NAME_HEADER_KEY}; use crate::option::Mode; -use crate::parseable::{StreamNotFound, PARSEABLE}; +use crate::parseable::{Stream, StreamNotFound, PARSEABLE}; use crate::storage::{ObjectStorageError, StreamType}; use crate::utils::header_parsing::ParseHeaderError; use crate::utils::json::flatten::JsonFlattenError; @@ -240,10 +240,9 @@ pub async fn post_event( pub async fn push_logs_unchecked( batches: RecordBatch, - stream_name: &str, + stream: &Stream, ) -> Result { let unchecked_event = event::Event { - stream_name: stream_name.to_string(), origin_format: "json", origin_size: 0, 
time_partition: None, @@ -255,7 +254,7 @@ pub async fn push_logs_unchecked( }], stream_type: StreamType::UserDefined, }; - unchecked_event.process_unchecked()?; + unchecked_event.process_unchecked(stream)?; Ok(unchecked_event) } diff --git a/src/parseable/mod.rs b/src/parseable/mod.rs index 60ec06b55..9437b3916 100644 --- a/src/parseable/mod.rs +++ b/src/parseable/mod.rs @@ -28,7 +28,7 @@ use http::{header::CONTENT_TYPE, HeaderName, HeaderValue, StatusCode}; use once_cell::sync::Lazy; pub use staging::StagingError; use streams::StreamRef; -pub use streams::{StreamNotFound, Streams}; +pub use streams::{Stream, StreamNotFound, Streams}; use tracing::error; #[cfg(feature = "kafka")] diff --git a/src/parseable/streams.rs b/src/parseable/streams.rs index d9cd373ad..1982c147d 100644 --- a/src/parseable/streams.rs +++ b/src/parseable/streams.rs @@ -114,30 +114,11 @@ impl Stream { } pub async fn push_logs(&self, json: Value, log_source: &LogSource) -> anyhow::Result<()> { - let time_partition = self.get_time_partition(); - let time_partition_limit = self.get_time_partition_limit(); - let static_schema_flag = self.get_static_schema_flag(); - let custom_partition = self.get_custom_partition(); - let schema_version = self.get_schema_version(); - let schema = self.get_schema_raw(); - let stream_type = self.get_stream_type(); - let origin_size = serde_json::to_vec(&json).unwrap().len() as u64; // string length need not be the same as byte length json::Event::new(json) - .into_event( - self.stream_name.to_owned(), - origin_size, - &schema, - static_schema_flag, - custom_partition.as_ref(), - time_partition.as_ref(), - time_partition_limit, - schema_version, - log_source, - stream_type, - )? - .process()?; + .into_event(origin_size, self, log_source)? + .process(self)?; Ok(()) } diff --git a/src/utils/arrow/flight.rs b/src/utils/arrow/flight.rs index c8d2dacf2..57ea88b67 100644 --- a/src/utils/arrow/flight.rs +++ b/src/utils/arrow/flight.rs @@ -95,14 +95,14 @@ pub async fn append_temporary_events( Event, Status, > { - let schema = PARSEABLE + let stream = PARSEABLE .get_stream(stream_name) - .map_err(|err| Status::failed_precondition(format!("Metadata Error: {}", err)))? 
- .get_schema(); + .map_err(|err| Status::failed_precondition(format!("Metadata Error: {}", err)))?; + let schema = stream.get_schema(); let rb = concat_batches(&schema, minute_result) .map_err(|err| Status::failed_precondition(format!("ArrowError: {}", err)))?; - let event = push_logs_unchecked(rb, stream_name) + let event = push_logs_unchecked(rb, &stream) .await .map_err(|err| Status::internal(err.to_string()))?; Ok(event) From 19708df571a7f631aaafecb47cea434692939385 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Sat, 1 Mar 2025 18:54:43 +0530 Subject: [PATCH 16/39] refactor: `Parseable::commit_schema` --- src/event/mod.rs | 29 ++++------------------------- src/handlers/http/query.rs | 6 +++--- src/parseable/streams.rs | 13 +++++++++++++ 3 files changed, 20 insertions(+), 28 deletions(-) diff --git a/src/event/mod.rs b/src/event/mod.rs index a0c803340..f1de15011 100644 --- a/src/event/mod.rs +++ b/src/event/mod.rs @@ -20,17 +20,12 @@ pub mod format; use arrow_array::RecordBatch; -use arrow_schema::{Field, Fields, Schema}; +use arrow_schema::Field; use itertools::Itertools; use std::sync::Arc; use self::error::EventError; -use crate::{ - metadata::update_stats, - parseable::{StagingError, Stream, PARSEABLE}, - storage::StreamType, - LOCK_EXPECT, -}; +use crate::{metadata::update_stats, parseable::Stream, storage::StreamType}; use chrono::NaiveDateTime; use std::collections::HashMap; @@ -71,7 +66,8 @@ impl Event { } if self.is_first_event { - commit_schema(&stream.stream_name, partition.rb.schema())?; + let schema = partition.rb.schema().as_ref().clone(); + stream.commit_schema(schema)?; } stream.push( @@ -122,23 +118,6 @@ pub fn get_schema_key(fields: &[Arc]) -> String { format!("{hash:x}") } -pub fn commit_schema(stream_name: &str, schema: Arc) -> Result<(), StagingError> { - let mut stream_metadata = PARSEABLE.streams.write().expect("lock poisoned"); - - let map = &mut stream_metadata - .get_mut(stream_name) - .expect("map has entry for this stream name") - .metadata - .write() - .expect(LOCK_EXPECT) - .schema; - let current_schema = Schema::new(map.values().cloned().collect::()); - let schema = Schema::try_merge(vec![current_schema, schema.as_ref().clone()])?; - map.clear(); - map.extend(schema.fields.iter().map(|f| (f.name().clone(), f.clone()))); - Ok(()) -} - pub mod error { use crate::{parseable::StagingError, storage::ObjectStorageError}; diff --git a/src/handlers/http/query.rs b/src/handlers/http/query.rs index 3b6f4dedf..d89e3ef63 100644 --- a/src/handlers/http/query.rs +++ b/src/handlers/http/query.rs @@ -29,14 +29,12 @@ use serde::{Deserialize, Serialize}; use serde_json::{json, Value}; use std::collections::HashMap; use std::pin::Pin; -use std::sync::Arc; use std::time::Instant; use tracing::error; use crate::event::error::EventError; use crate::handlers::http::fetch_schema; -use crate::event::commit_schema; use crate::metrics::QUERY_EXECUTE_TIME; use crate::option::Mode; use crate::parseable::PARSEABLE; @@ -174,7 +172,9 @@ pub async fn update_schema_when_distributed(tables: &Vec) -> Result<(), // commit schema merges the schema internally and updates the schema in storage. 
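
The additions just below (here and in `streams.rs`) move schema merging behind `Stream::commit_schema`. A minimal sketch, not part of the patch, of the merge semantics it leans on, using arrow's `Schema::try_merge` exactly as the new method does:

use arrow_schema::{ArrowError, DataType, Field, Schema};

fn merge_example() -> Result<(), ArrowError> {
    let current = Schema::new(vec![Field::new("a", DataType::Int64, true)]);
    let incoming = Schema::new(vec![Field::new("b", DataType::Utf8, true)]);
    // Field lists are unioned by name; a field present in both schemas with
    // incompatible data types would make this return an error instead.
    let merged = Schema::try_merge([current, incoming])?;
    assert_eq!(merged.fields().len(), 2);
    Ok(())
}
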
commit_schema_to_storage(table, new_schema.clone()).await?; - commit_schema(table, Arc::new(new_schema))?; + PARSEABLE + .get_or_create_stream(table) + .commit_schema(new_schema)?; } } } diff --git a/src/parseable/streams.rs b/src/parseable/streams.rs index 1982c147d..cd3082d61 100644 --- a/src/parseable/streams.rs +++ b/src/parseable/streams.rs @@ -624,6 +624,19 @@ impl Stream { Arc::new(Schema::new(fields)) } + pub fn commit_schema(&self, schema: Schema) -> Result<(), StagingError> { + let current_schema = self.get_schema().as_ref().clone(); + let updated_schema = Schema::try_merge([current_schema, schema])? + .fields + .into_iter() + .map(|field| (field.name().to_owned(), field.clone())) + .collect(); + + self.metadata.write().expect(LOCK_EXPECT).schema = updated_schema; + + Ok(()) + } + pub fn get_schema_raw(&self) -> HashMap> { self.metadata.read().expect(LOCK_EXPECT).schema.clone() } From 7215e8e7d5998ac4e722f86d08ef06e75182da36 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Sat, 1 Mar 2025 20:33:53 +0530 Subject: [PATCH 17/39] map schema keys to recordbatches --- src/event/format/json.rs | 28 +++++++++++++++++++++------- src/event/mod.rs | 25 ++++--------------------- src/handlers/http/ingest.rs | 21 +++++++++++++-------- 3 files changed, 38 insertions(+), 36 deletions(-) diff --git a/src/event/format/json.rs b/src/event/format/json.rs index 61cb373f1..6ee9310be 100644 --- a/src/event/format/json.rs +++ b/src/event/format/json.rs @@ -39,7 +39,7 @@ use tracing::error; use super::{EventFormat, LogSource}; use crate::{ - event::PartitionEvent, + event::{get_schema_key, PartitionEvent}, kinesis::{flatten_kinesis_logs, Message}, metadata::SchemaVersion, otel::{logs::flatten_otel_logs, metrics::flatten_otel_metrics, traces::flatten_otel_traces}, @@ -246,7 +246,7 @@ impl EventFormat for Event { log_source, )?; - let mut partitions = vec![]; + let mut partitions = HashMap::new(); for json in data { let custom_partition_values = match custom_partitions.as_ref() { Some(custom_partitions) => { @@ -271,11 +271,25 @@ impl EventFormat for Event { schema_version, )?; - partitions.push(PartitionEvent { - rb, - parsed_timestamp, - custom_partition_values, - }); + let schema = rb.schema(); + let mut key = get_schema_key(&schema.fields); + if time_partition.is_some() { + let parsed_timestamp_to_min = parsed_timestamp.format("%Y%m%dT%H%M").to_string(); + key.push_str(&parsed_timestamp_to_min); + } + + for (k, v) in custom_partition_values.iter().sorted_by_key(|v| v.0) { + key.push_str(&format!("&{k}={v}")); + } + + partitions.insert( + key, + PartitionEvent { + rb, + parsed_timestamp, + custom_partition_values, + }, + ); } Ok(super::Event { diff --git a/src/event/mod.rs b/src/event/mod.rs index f1de15011..30ee4abf9 100644 --- a/src/event/mod.rs +++ b/src/event/mod.rs @@ -42,29 +42,14 @@ pub struct Event { pub origin_size: u64, pub is_first_event: bool, pub time_partition: Option, - pub partitions: Vec, + pub partitions: HashMap, pub stream_type: StreamType, } // Events holds the schema related to a each event for a single log stream impl Event { pub fn process(self, stream: &Stream) -> Result<(), EventError> { - for partition in self.partitions { - let mut key = get_schema_key(&partition.rb.schema().fields); - if self.time_partition.is_some() { - let parsed_timestamp_to_min = - partition.parsed_timestamp.format("%Y%m%dT%H%M").to_string(); - key.push_str(&parsed_timestamp_to_min); - } - - for (k, v) in partition - .custom_partition_values - .iter() - .sorted_by_key(|v| v.0) - { - 
key.push_str(&format!("&{k}={v}")); - } - + for (key, partition) in self.partitions { if self.is_first_event { let schema = partition.rb.schema().as_ref().clone(); stream.commit_schema(schema)?; @@ -92,11 +77,9 @@ impl Event { } pub fn process_unchecked(&self, stream: &Stream) -> Result<(), EventError> { - for partition in &self.partitions { - let key = get_schema_key(&partition.rb.schema().fields); - + for (key, partition) in &self.partitions { stream.push( - &key, + key, &partition.rb, partition.parsed_timestamp, &partition.custom_partition_values, diff --git a/src/handlers/http/ingest.rs b/src/handlers/http/ingest.rs index 75c8ccc82..343a0ffac 100644 --- a/src/handlers/http/ingest.rs +++ b/src/handlers/http/ingest.rs @@ -28,7 +28,7 @@ use serde_json::Value; use crate::event::error::EventError; use crate::event::format::LogSource; -use crate::event::{self, PartitionEvent}; +use crate::event::{self, get_schema_key, PartitionEvent}; use crate::handlers::{LOG_SOURCE_KEY, STREAM_NAME_HEADER_KEY}; use crate::option::Mode; use crate::parseable::{Stream, StreamNotFound, PARSEABLE}; @@ -239,21 +239,26 @@ pub async fn post_event( } pub async fn push_logs_unchecked( - batches: RecordBatch, + rb: RecordBatch, stream: &Stream, ) -> Result { - let unchecked_event = event::Event { + let mut unchecked_event = event::Event { origin_format: "json", origin_size: 0, time_partition: None, is_first_event: true, // NOTE: Maybe should be false - partitions: vec![PartitionEvent { - rb: batches, - parsed_timestamp: Utc::now().naive_utc(), - custom_partition_values: HashMap::new(), // should be an empty map for unchecked push - }], + partitions: HashMap::new(), stream_type: StreamType::UserDefined, }; + unchecked_event.partitions.insert( + get_schema_key(&rb.schema().fields), + PartitionEvent { + rb, + parsed_timestamp: Utc::now().naive_utc(), + custom_partition_values: HashMap::new(), // should be an empty map for unchecked push + }, + ); + unchecked_event.process_unchecked(stream)?; Ok(unchecked_event) From 975b1f674211a5d6a56af4498b2fb59ae601c8e3 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Sat, 1 Mar 2025 20:35:37 +0530 Subject: [PATCH 18/39] construct map directly --- src/handlers/http/ingest.rs | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/src/handlers/http/ingest.rs b/src/handlers/http/ingest.rs index 343a0ffac..e7724e73b 100644 --- a/src/handlers/http/ingest.rs +++ b/src/handlers/http/ingest.rs @@ -242,22 +242,23 @@ pub async fn push_logs_unchecked( rb: RecordBatch, stream: &Stream, ) -> Result { - let mut unchecked_event = event::Event { + let unchecked_event = event::Event { origin_format: "json", origin_size: 0, time_partition: None, is_first_event: true, // NOTE: Maybe should be false - partitions: HashMap::new(), + partitions: [( + get_schema_key(&rb.schema().fields), + PartitionEvent { + rb, + parsed_timestamp: Utc::now().naive_utc(), + custom_partition_values: HashMap::new(), // should be an empty map for unchecked push + }, + )] + .into_iter() + .collect(), stream_type: StreamType::UserDefined, }; - unchecked_event.partitions.insert( - get_schema_key(&rb.schema().fields), - PartitionEvent { - rb, - parsed_timestamp: Utc::now().naive_utc(), - custom_partition_values: HashMap::new(), // should be an empty map for unchecked push - }, - ); unchecked_event.process_unchecked(stream)?; From 4c1f6d89015efdb47738861e9de6f08b148bfe1f Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Sat, 1 Mar 2025 21:01:15 +0530 Subject: [PATCH 19/39] fix: concat to not 
lose data --- Cargo.toml | 3 ++- src/event/format/json.rs | 16 ++++++++-------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index a8d9ccdec..0ce7d02b6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,7 +9,8 @@ build = "build.rs" [dependencies] # Arrow and DataFusion ecosystem -arrow-array = { version = "53.0.0" } +arrow = "53.0.0" +arrow-array = "53.0.0" arrow-flight = { version = "53.0.0", features = ["tls"] } arrow-ipc = { version = "53.0.0", features = ["zstd"] } arrow-json = "53.0.0" diff --git a/src/event/format/json.rs b/src/event/format/json.rs index 6ee9310be..563762a6b 100644 --- a/src/event/format/json.rs +++ b/src/event/format/json.rs @@ -20,6 +20,7 @@ #![allow(deprecated)] use anyhow::anyhow; +use arrow::compute::concat_batches; use arrow_array::RecordBatch; use arrow_json::reader::{infer_json_schema_from_iterator, ReaderBuilder}; use arrow_schema::{DataType, Field, Fields, Schema}; @@ -282,14 +283,13 @@ impl EventFormat for Event { key.push_str(&format!("&{k}={v}")); } - partitions.insert( - key, - PartitionEvent { - rb, - parsed_timestamp, - custom_partition_values, - }, - ); + let entry = partitions.entry(key).or_insert(PartitionEvent { + rb: RecordBatch::new_empty(schema.clone()), + parsed_timestamp, + custom_partition_values, + }); + + entry.rb = concat_batches(&schema, [&entry.rb, &rb])?; } Ok(super::Event { From 5a2bcc193dc1840b462dda01c214f1738915f3e7 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Sun, 2 Mar 2025 03:01:14 +0530 Subject: [PATCH 20/39] refactor: extract `byte_size` during json deserialization --- src/connectors/kafka/processor.rs | 7 ++-- src/event/format/json.rs | 2 +- src/event/format/mod.rs | 2 +- src/event/mod.rs | 2 +- src/handlers/http/cluster/mod.rs | 40 +++++++++---------- src/handlers/http/cluster/utils.rs | 64 ++++++++++++++++++++++++++++-- src/handlers/http/ingest.rs | 38 +++++++----------- src/metadata.rs | 2 +- src/parseable/streams.rs | 9 +++-- 9 files changed, 110 insertions(+), 56 deletions(-) diff --git a/src/connectors/kafka/processor.rs b/src/connectors/kafka/processor.rs index fa2771fc7..5fead256c 100644 --- a/src/connectors/kafka/processor.rs +++ b/src/connectors/kafka/processor.rs @@ -36,7 +36,7 @@ use super::{config::BufferConfig, ConsumerRecord, StreamConsumer, TopicPartition pub struct ParseableSinkProcessor; impl ParseableSinkProcessor { - async fn process_event_from_chunk(&self, records: &[ConsumerRecord]) -> anyhow::Result { + async fn process_event_from_chunk(&self, records: &[ConsumerRecord]) -> anyhow::Result { let stream_name = records .first() .map(|r| r.topic.as_str()) @@ -47,10 +47,10 @@ impl ParseableSinkProcessor { .await?; let mut json_vec = Vec::with_capacity(records.len()); - let mut total_payload_size = 0u64; + let mut total_payload_size = 0; for record in records.iter().filter_map(|r| r.payload.as_ref()) { - total_payload_size += record.len() as u64; + total_payload_size += record.len(); if let Ok(value) = serde_json::from_slice::(record) { json_vec.push(value); } @@ -60,6 +60,7 @@ impl ParseableSinkProcessor { .get_or_create_stream(stream_name) .push_logs( Value::Array(json_vec), + total_payload_size, &LogSource::Custom("Kafka".to_owned()), ) .await?; diff --git a/src/event/format/json.rs b/src/event/format/json.rs index 563762a6b..0d0849ef4 100644 --- a/src/event/format/json.rs +++ b/src/event/format/json.rs @@ -225,7 +225,7 @@ impl EventFormat for Event { /// Converts a JSON event into a Parseable Event fn into_event( self, - origin_size: u64, + origin_size: usize, 
stream: &Stream, log_source: &LogSource, ) -> anyhow::Result { diff --git a/src/event/format/mod.rs b/src/event/format/mod.rs index 2d181fc7a..c44ebe7b5 100644 --- a/src/event/format/mod.rs +++ b/src/event/format/mod.rs @@ -167,7 +167,7 @@ pub trait EventFormat: Sized { fn into_event( self, - origin_size: u64, + origin_size: usize, stream: &Stream, log_source: &LogSource, ) -> Result; diff --git a/src/event/mod.rs b/src/event/mod.rs index 30ee4abf9..6de567b83 100644 --- a/src/event/mod.rs +++ b/src/event/mod.rs @@ -39,7 +39,7 @@ pub struct PartitionEvent { pub struct Event { pub origin_format: &'static str, - pub origin_size: u64, + pub origin_size: usize, pub is_first_event: bool, pub time_partition: Option, pub partitions: HashMap, diff --git a/src/handlers/http/cluster/mod.rs b/src/handlers/http/cluster/mod.rs index 60a61a1ec..3ed451cb6 100644 --- a/src/handlers/http/cluster/mod.rs +++ b/src/handlers/http/cluster/mod.rs @@ -37,7 +37,7 @@ use tracing::{error, info, warn}; use url::Url; use utils::{check_liveness, to_url_string, IngestionStats, QueriedStats, StorageStats}; -use crate::handlers::http::ingest::ingest_internal_stream; +use crate::event::format::LogSource; use crate::metrics::prom_utils::Metrics; use crate::parseable::PARSEABLE; use crate::rbac::role::model::DefaultPrivilege; @@ -774,29 +774,29 @@ pub fn init_cluster_metrics_schedular() -> Result<(), PostError> { scheduler .every(CLUSTER_METRICS_INTERVAL_SECONDS) .run(move || async { + let internal_stream = PARSEABLE.get_or_create_stream(INTERNAL_STREAM_NAME); let result: Result<(), PostError> = async { let cluster_metrics = fetch_cluster_metrics().await; - if let Ok(metrics) = cluster_metrics { - if !metrics.is_empty() { - info!("Cluster metrics fetched successfully from all ingestors"); - if let Ok(metrics_bytes) = serde_json::to_vec(&metrics) { - if matches!( - ingest_internal_stream( - INTERNAL_STREAM_NAME.to_string(), - bytes::Bytes::from(metrics_bytes), - ) - .await, - Ok(()) - ) { - info!("Cluster metrics successfully ingested into internal stream"); - } else { - error!("Failed to ingest cluster metrics into internal stream"); - } - } else { - error!("Failed to serialize cluster metrics"); - } + let Ok(metrics) = cluster_metrics else { + return Ok(()); + }; + if !metrics.is_empty() { + info!("Cluster metrics fetched successfully from all ingestors"); + let json = serde_json::to_value(&metrics).expect("should be json serializable"); + let byte_size = serde_json::to_vec(&metrics).unwrap().len(); + + if matches!( + internal_stream + .push_logs(json, byte_size, &LogSource::Pmeta) + .await, + Ok(()) + ) { + info!("Cluster metrics successfully ingested into internal stream"); + } else { + error!("Failed to ingest cluster metrics into internal stream"); } } + Ok(()) } .await; diff --git a/src/handlers/http/cluster/utils.rs b/src/handlers/http/cluster/utils.rs index b41582d70..1d141b896 100644 --- a/src/handlers/http/cluster/utils.rs +++ b/src/handlers/http/cluster/utils.rs @@ -16,11 +16,23 @@ * */ -use crate::{handlers::http::base_path_without_preceding_slash, HTTP_CLIENT}; -use actix_web::http::header; +use std::{future::Future, pin::Pin}; + +use crate::{ + handlers::http::{base_path_without_preceding_slash, MAX_EVENT_PAYLOAD_SIZE}, + HTTP_CLIENT, +}; +use actix_web::{ + dev::Payload, + error::{ErrorPayloadTooLarge, JsonPayloadError}, + http::header, + FromRequest, HttpRequest, +}; +use bytes::BytesMut; use chrono::{DateTime, Utc}; +use futures::StreamExt; use itertools::Itertools; -use serde::{Deserialize, Serialize}; +use 
serde::{de::DeserializeOwned, Deserialize, Serialize}; use tracing::error; use url::Url; @@ -248,3 +260,49 @@ pub fn to_url_string(str: String) -> String { format!("http://{}/", str) } + +pub struct JsonWithSize { + pub json: T, + pub byte_size: usize, +} + +impl FromRequest for JsonWithSize { + type Error = actix_web::error::Error; + type Future = Pin>>>; + + fn from_request(_: &HttpRequest, payload: &mut Payload) -> Self::Future { + let limit = MAX_EVENT_PAYLOAD_SIZE; + + // Take ownership of payload for async processing + let mut payload = payload.take(); + + Box::pin(async move { + // Buffer to collect all bytes + let mut body = BytesMut::new(); + let mut byte_size = 0; + + // Collect all bytes from the payload stream + while let Some(chunk) = payload.next().await { + let chunk = chunk?; + byte_size += chunk.len(); + + // Check the size limit + if byte_size > limit { + return Err(ErrorPayloadTooLarge(byte_size).into()); + } + + // Extend our buffer with the chunk + body.extend_from_slice(&chunk); + } + + // Convert the collected bytes to Bytes + let bytes = body.freeze(); + + // Deserialize the JSON payload + let json = serde_json::from_slice::(&bytes) + .map_err(|e| JsonPayloadError::Deserialize(e))?; + + Ok(JsonWithSize { json, byte_size }) + }) + } +} diff --git a/src/handlers/http/ingest.rs b/src/handlers/http/ingest.rs index e7724e73b..76f236690 100644 --- a/src/handlers/http/ingest.rs +++ b/src/handlers/http/ingest.rs @@ -18,10 +18,9 @@ use std::collections::HashMap; -use actix_web::web::{Json, Path}; +use actix_web::web::Path; use actix_web::{http::header::ContentType, HttpRequest, HttpResponse}; use arrow_array::RecordBatch; -use bytes::Bytes; use chrono::Utc; use http::StatusCode; use serde_json::Value; @@ -36,6 +35,7 @@ use crate::storage::{ObjectStorageError, StreamType}; use crate::utils::header_parsing::ParseHeaderError; use crate::utils::json::flatten::JsonFlattenError; +use super::cluster::utils::JsonWithSize; use super::logstream::error::{CreateStreamError, StreamError}; use super::users::dashboards::DashboardError; use super::users::filters::FiltersError; @@ -43,7 +43,10 @@ use super::users::filters::FiltersError; // Handler for POST /api/v1/ingest // ingests events by extracting stream name from header // creates if stream does not exist -pub async fn ingest(req: HttpRequest, Json(json): Json) -> Result { +pub async fn ingest( + req: HttpRequest, + JsonWithSize { json, byte_size }: JsonWithSize, +) -> Result { let Some(stream_name) = req.headers().get(STREAM_NAME_HEADER_KEY) else { return Err(PostError::Header(ParseHeaderError::MissingStreamName)); }; @@ -72,29 +75,18 @@ pub async fn ingest(req: HttpRequest, Json(json): Json) -> Result Result<(), PostError> { - let json: Value = serde_json::from_slice(&body)?; - - PARSEABLE - .get_stream(&stream_name)? 
- .push_logs(json, &LogSource::Pmeta) - .await?; - - Ok(()) -} - // Handler for POST /v1/logs to ingest OTEL logs // ingests events by extracting stream name from header // creates if stream does not exist pub async fn handle_otel_logs_ingestion( req: HttpRequest, - Json(json): Json, + JsonWithSize { json, byte_size }: JsonWithSize, ) -> Result { let Some(stream_name) = req.headers().get(STREAM_NAME_HEADER_KEY) else { return Err(PostError::Header(ParseHeaderError::MissingStreamName)); @@ -115,7 +107,7 @@ pub async fn handle_otel_logs_ingestion( PARSEABLE .get_or_create_stream(&stream_name) - .push_logs(json, &log_source) + .push_logs(json, byte_size, &log_source) .await?; Ok(HttpResponse::Ok().finish()) @@ -126,7 +118,7 @@ pub async fn handle_otel_logs_ingestion( // creates if stream does not exist pub async fn handle_otel_metrics_ingestion( req: HttpRequest, - Json(json): Json, + JsonWithSize { json, byte_size }: JsonWithSize, ) -> Result { let Some(stream_name) = req.headers().get(STREAM_NAME_HEADER_KEY) else { return Err(PostError::Header(ParseHeaderError::MissingStreamName)); @@ -149,7 +141,7 @@ pub async fn handle_otel_metrics_ingestion( PARSEABLE .get_or_create_stream(&stream_name) - .push_logs(json, &log_source) + .push_logs(json, byte_size, &log_source) .await?; Ok(HttpResponse::Ok().finish()) @@ -160,7 +152,7 @@ pub async fn handle_otel_metrics_ingestion( // creates if stream does not exist pub async fn handle_otel_traces_ingestion( req: HttpRequest, - Json(json): Json, + JsonWithSize { json, byte_size }: JsonWithSize, ) -> Result { let Some(stream_name) = req.headers().get(STREAM_NAME_HEADER_KEY) else { return Err(PostError::Header(ParseHeaderError::MissingStreamName)); @@ -180,7 +172,7 @@ pub async fn handle_otel_traces_ingestion( PARSEABLE .get_or_create_stream(&stream_name) - .push_logs(json, &log_source) + .push_logs(json, byte_size, &log_source) .await?; Ok(HttpResponse::Ok().finish()) @@ -192,7 +184,7 @@ pub async fn handle_otel_traces_ingestion( pub async fn post_event( req: HttpRequest, stream_name: Path, - Json(json): Json, + JsonWithSize { json, byte_size }: JsonWithSize, ) -> Result { let stream_name = stream_name.into_inner(); @@ -232,7 +224,7 @@ pub async fn post_event( PARSEABLE .get_or_create_stream(&stream_name) - .push_logs(json, &log_source) + .push_logs(json, byte_size, &log_source) .await?; Ok(HttpResponse::Ok().finish()) diff --git a/src/metadata.rs b/src/metadata.rs index f4d2e2225..bacd01083 100644 --- a/src/metadata.rs +++ b/src/metadata.rs @@ -35,7 +35,7 @@ use crate::storage::StreamType; pub fn update_stats( stream_name: &str, origin: &'static str, - size: u64, + size: usize, num_rows: usize, parsed_date: NaiveDate, ) { diff --git a/src/parseable/streams.rs b/src/parseable/streams.rs index cd3082d61..a35c8fcf4 100644 --- a/src/parseable/streams.rs +++ b/src/parseable/streams.rs @@ -113,9 +113,12 @@ impl Stream { }) } - pub async fn push_logs(&self, json: Value, log_source: &LogSource) -> anyhow::Result<()> { - let origin_size = serde_json::to_vec(&json).unwrap().len() as u64; // string length need not be the same as byte length - + pub async fn push_logs( + &self, + json: Value, + origin_size: usize, + log_source: &LogSource, + ) -> anyhow::Result<()> { json::Event::new(json) .into_event(origin_size, self, log_source)? 
.process(self)?; From f26842217aa260da6f301255fc8db1707823163c Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Sun, 2 Mar 2025 12:29:00 +0530 Subject: [PATCH 21/39] ci: clippy suggestion --- src/handlers/http/cluster/utils.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/handlers/http/cluster/utils.rs b/src/handlers/http/cluster/utils.rs index 1d141b896..431f61399 100644 --- a/src/handlers/http/cluster/utils.rs +++ b/src/handlers/http/cluster/utils.rs @@ -288,7 +288,7 @@ impl FromRequest for JsonWithSize { // Check the size limit if byte_size > limit { - return Err(ErrorPayloadTooLarge(byte_size).into()); + return Err(ErrorPayloadTooLarge(byte_size)); } // Extend our buffer with the chunk @@ -299,8 +299,8 @@ impl FromRequest for JsonWithSize { let bytes = body.freeze(); // Deserialize the JSON payload - let json = serde_json::from_slice::(&bytes) - .map_err(|e| JsonPayloadError::Deserialize(e))?; + let json = + serde_json::from_slice::(&bytes).map_err(JsonPayloadError::Deserialize)?; Ok(JsonWithSize { json, byte_size }) }) From d304b9f46685f059eb610a4f3a29e39cb50b74fa Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Sun, 2 Mar 2025 12:39:33 +0530 Subject: [PATCH 22/39] feat: `DiskWriter` handles writing to arrow part file --- src/parseable/staging/writer.rs | 42 ++++++++++++++++++++++++++++++--- src/parseable/streams.rs | 18 ++++---------- 2 files changed, 44 insertions(+), 16 deletions(-) diff --git a/src/parseable/staging/writer.rs b/src/parseable/staging/writer.rs index c43252f14..2dbfe1e49 100644 --- a/src/parseable/staging/writer.rs +++ b/src/parseable/staging/writer.rs @@ -19,7 +19,8 @@ use std::{ collections::{HashMap, HashSet}, - fs::File, + fs::{File, OpenOptions}, + path::PathBuf, sync::Arc, }; @@ -28,13 +29,48 @@ use arrow_ipc::writer::StreamWriter; use arrow_schema::Schema; use arrow_select::concat::concat_batches; use itertools::Itertools; +use tracing::error; -use crate::utils::arrow::adapt_batch; +use crate::{parseable::ARROW_FILE_EXTENSION, utils::arrow::adapt_batch}; + +use super::StagingError; #[derive(Default)] pub struct Writer { pub mem: MemWriter<16384>, - pub disk: HashMap>, + pub disk: HashMap, +} + +pub struct DiskWriter { + pub inner: StreamWriter, + pub path: PathBuf, +} + +impl DiskWriter { + pub fn new(path: PathBuf, schema: &Schema) -> Result { + let file = OpenOptions::new().create(true).append(true).open(&path)?; + + let inner = StreamWriter::try_new(file, schema)?; + + Ok(Self { inner, path }) + } + + pub fn write(&mut self, rb: &RecordBatch) -> Result<(), StagingError> { + self.inner.write(rb).map_err(StagingError::Arrow) + } + + pub fn finish(&mut self) { + if let Err(err) = self.inner.finish() { + error!("Couldn't finish arrow file {:?}, error = {err}", self.path); + return; + } + + let mut arrow_path = self.path.to_owned(); + arrow_path.set_extension(ARROW_FILE_EXTENSION); + if let Err(err) = std::fs::rename(&self.path, &arrow_path) { + error!("Couldn't rename file {:?}, error = {err}", self.path); + } + } } /// Structure to keep recordbatches in memory. 
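
The `DiskWriter` added above stages incoming record batches in a `.part` file and only renames it to the final arrow extension once `finish()` has written the IPC footer, so an interrupted write never leaves a truncated file that looks complete. A minimal sketch of that write-then-rename pattern, using only the standard library and an assumed `.arrows` extension for the finished file (names here are illustrative, not the project's API):

use std::{
    fs::{self, OpenOptions},
    io::Write,
    path::PathBuf,
};

// Append payload to a staging ".part" file, then rename it once writing is done.
fn write_then_finalize(part_path: PathBuf, payload: &[u8]) -> std::io::Result<PathBuf> {
    // Writers append to the ".part" file while it is still in progress
    let mut file = OpenOptions::new().create(true).append(true).open(&part_path)?;
    file.write_all(payload)?;
    file.flush()?;

    // Only after a clean finish does the file get its final extension,
    // so scans for completed files skip in-progress writes.
    let mut final_path = part_path.clone();
    final_path.set_extension("arrows"); // assumed final extension
    fs::rename(&part_path, &final_path)?;
    Ok(final_path)
}
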
diff --git a/src/parseable/streams.rs b/src/parseable/streams.rs index a35c8fcf4..cf3d65549 100644 --- a/src/parseable/streams.rs +++ b/src/parseable/streams.rs @@ -28,7 +28,6 @@ use std::{ }; use arrow_array::RecordBatch; -use arrow_ipc::writer::StreamWriter; use arrow_schema::{Field, Fields, Schema}; use chrono::{NaiveDateTime, Timelike}; use derive_more::{Deref, DerefMut}; @@ -63,7 +62,7 @@ use crate::{ use super::{ staging::{ reader::{MergedRecordReader, MergedReverseRecordReader}, - writer::Writer, + writer::{DiskWriter, Writer}, StagingError, }, LogStream, ARROW_FILE_EXTENSION, @@ -143,21 +142,14 @@ impl Stream { } None => { // entry is not present thus we create it - let file_path = self.path_by_current_time( + let path = self.path_by_current_time( schema_key, parsed_timestamp, custom_partition_values, ); std::fs::create_dir_all(&self.data_path)?; - let file = OpenOptions::new() - .create(true) - .append(true) - .open(&file_path)?; - - let mut writer = StreamWriter::try_new(file, &record.schema()) - .expect("File and RecordBatch both are checked"); - + let mut writer = DiskWriter::new(path, &record.schema())?; writer.write(record)?; guard.disk.insert(schema_key.to_owned(), writer); } @@ -180,7 +172,7 @@ impl Stream { hostname.push_str(id); } let filename = format!( - "{stream_hash}.date={}.hour={:02}.minute={}.{}{hostname}.data.{ARROW_FILE_EXTENSION}", + "{stream_hash}.date={}.hour={:02}.minute={}.{}{hostname}.data.part", parsed_timestamp.date(), parsed_timestamp.hour(), minute_to_slot(parsed_timestamp.minute(), OBJECT_STORE_DATA_GRANULARITY).unwrap(), @@ -391,7 +383,7 @@ impl Stream { // Flush disk for writer in disk_writers.values_mut() { - _ = writer.finish(); + writer.finish(); } } From 97f560332e2c089aa8adeb807066b80ef5379302 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Sun, 2 Mar 2025 15:11:26 +0530 Subject: [PATCH 23/39] test: fix expectation --- src/parseable/streams.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/parseable/streams.rs b/src/parseable/streams.rs index cf3d65549..b9d41c944 100644 --- a/src/parseable/streams.rs +++ b/src/parseable/streams.rs @@ -909,7 +909,7 @@ mod tests { ); let expected_path = staging.data_path.join(format!( - "{stream_hash}.date={}.hour={:02}.minute={}.{}.data.{ARROW_FILE_EXTENSION}", + "{stream_hash}.date={}.hour={:02}.minute={}.{}.data.part", parsed_timestamp.date(), parsed_timestamp.hour(), minute_to_slot(parsed_timestamp.minute(), OBJECT_STORE_DATA_GRANULARITY).unwrap(), @@ -943,7 +943,7 @@ mod tests { ); let expected_path = staging.data_path.join(format!( - "{stream_hash}.date={}.hour={:02}.minute={}.key1=value1.key2=value2.{}.data.{ARROW_FILE_EXTENSION}", + "{stream_hash}.date={}.hour={:02}.minute={}.key1=value1.key2=value2.{}.data.part", parsed_timestamp.date(), parsed_timestamp.hour(), minute_to_slot(parsed_timestamp.minute(), OBJECT_STORE_DATA_GRANULARITY).unwrap(), From a9513c454dd673b4f25d16056cd4b774746b1e1f Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Sun, 2 Mar 2025 16:20:41 +0530 Subject: [PATCH 24/39] refactor: don't add a step --- src/event/format/json.rs | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/src/event/format/json.rs b/src/event/format/json.rs index 0d0849ef4..750205f6a 100644 --- a/src/event/format/json.rs +++ b/src/event/format/json.rs @@ -262,7 +262,7 @@ impl EventFormat for Event { _ => p_timestamp.naive_utc(), }; - let rb = Self::into_recordbatch( + let batch = Self::into_recordbatch( p_timestamp, vec![json], 
schema.clone(), @@ -272,7 +272,7 @@ impl EventFormat for Event { schema_version, )?; - let schema = rb.schema(); + let schema = batch.schema(); let mut key = get_schema_key(&schema.fields); if time_partition.is_some() { let parsed_timestamp_to_min = parsed_timestamp.format("%Y%m%dT%H%M").to_string(); @@ -283,13 +283,21 @@ impl EventFormat for Event { key.push_str(&format!("&{k}={v}")); } - let entry = partitions.entry(key).or_insert(PartitionEvent { - rb: RecordBatch::new_empty(schema.clone()), - parsed_timestamp, - custom_partition_values, - }); - - entry.rb = concat_batches(&schema, [&entry.rb, &rb])?; + match partitions.get_mut(&key) { + Some(PartitionEvent { rb, .. }) => { + *rb = concat_batches(&schema, [&rb, &batch])?; + } + _ => { + partitions.insert( + key, + PartitionEvent { + rb: batch, + parsed_timestamp, + custom_partition_values, + }, + ); + } + } } Ok(super::Event { From 1c98e3b65c62dc01c2ef71483be87cf7070bf758 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Mon, 3 Mar 2025 02:26:08 +0530 Subject: [PATCH 25/39] refactor: `get_schemas_if_present` --- src/parseable/streams.rs | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/src/parseable/streams.rs b/src/parseable/streams.rs index b9d41c944..870736d2f 100644 --- a/src/parseable/streams.rs +++ b/src/parseable/streams.rs @@ -20,6 +20,7 @@ use std::{ collections::HashMap, fs::{remove_file, write, File, OpenOptions}, + io::BufReader, num::NonZeroU32, path::{Path, PathBuf}, process, @@ -273,23 +274,13 @@ impl Stream { } pub fn get_schemas_if_present(&self) -> Option> { - let Ok(dir) = self.data_path.read_dir() else { - return None; - }; - let mut schemas: Vec = Vec::new(); - for file in dir.flatten() { - if let Some(ext) = file.path().extension() { - if ext.eq("schema") { - let file = File::open(file.path()).expect("Schema File should exist"); + for path in self.schema_files() { + let file = File::open(path).expect("Schema File should exist"); - let schema = match serde_json::from_reader(file) { - Ok(schema) => schema, - Err(_) => continue, - }; - schemas.push(schema); - } + if let Ok(schema) = serde_json::from_reader(BufReader::new(file)) { + schemas.push(schema); } } From a86fc476df655704036db60226ab16aca2b6bc8e Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Mon, 3 Mar 2025 14:51:27 +0530 Subject: [PATCH 26/39] refactor: `prepare_and_validate_schema` --- src/event/format/json.rs | 127 +++++++++++---------------------------- src/event/format/mod.rs | 33 +++++----- 2 files changed, 53 insertions(+), 107 deletions(-) diff --git a/src/event/format/json.rs b/src/event/format/json.rs index 750205f6a..0f1c4a86c 100644 --- a/src/event/format/json.rs +++ b/src/event/format/json.rs @@ -138,6 +138,7 @@ impl EventFormat for Event { // also extract the arrow schema, tags and metadata from the incoming json fn to_data( self, + static_schema_flag: bool, stored_schema: &HashMap>, time_partition: Option<&String>, time_partition_limit: Option, @@ -203,6 +204,8 @@ impl EventFormat for Event { )); } + let schema = Self::prepare_and_validate_schema(schema, &stored_schema, static_schema_flag)?; + Ok((flattened, schema, is_first)) } @@ -239,6 +242,7 @@ impl EventFormat for Event { let p_timestamp = self.p_timestamp; let (data, schema, is_first_event) = self.to_data( + static_schema_flag, &storage_schema, time_partition.as_ref(), time_partition_limit, @@ -265,9 +269,7 @@ impl EventFormat for Event { let batch = Self::into_recordbatch( p_timestamp, vec![json], - schema.clone(), - &storage_schema, - 
static_schema_flag, + &schema, time_partition.as_ref(), schema_version, )?; @@ -507,6 +509,7 @@ mod tests { let store_schema = HashMap::default(); let (data, schema, _) = Event::new(json) .to_data( + false, &store_schema, None, None, @@ -515,16 +518,8 @@ mod tests { &LogSource::Json, ) .unwrap(); - let rb = Event::into_recordbatch( - Utc::now(), - data, - schema, - &store_schema, - false, - None, - SchemaVersion::V0, - ) - .unwrap(); + let rb = + Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); assert_eq!(rb.num_rows(), 1); assert_eq!(rb.num_columns(), 4); @@ -553,6 +548,7 @@ mod tests { let store_schema = HashMap::default(); let (data, schema, _) = Event::new(json) .to_data( + false, &store_schema, None, None, @@ -561,16 +557,8 @@ mod tests { &LogSource::Json, ) .unwrap(); - let rb = Event::into_recordbatch( - Utc::now(), - data, - schema, - &store_schema, - false, - None, - SchemaVersion::V0, - ) - .unwrap(); + let rb = + Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); assert_eq!(rb.num_rows(), 1); assert_eq!(rb.num_columns(), 3); @@ -601,6 +589,7 @@ mod tests { ); let (data, schema, _) = Event::new(json) .to_data( + false, &store_schema, None, None, @@ -609,16 +598,8 @@ mod tests { &LogSource::Json, ) .unwrap(); - let rb = Event::into_recordbatch( - Utc::now(), - data, - schema, - &store_schema, - false, - None, - SchemaVersion::V0, - ) - .unwrap(); + let rb = + Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); assert_eq!(rb.num_rows(), 1); assert_eq!(rb.num_columns(), 3); @@ -650,6 +631,7 @@ mod tests { assert!(Event::new(json) .to_data( + false, &store_schema, None, None, @@ -675,6 +657,7 @@ mod tests { let (data, schema, _) = Event::new(json) .to_data( + false, &store_schema, None, None, @@ -683,16 +666,8 @@ mod tests { &LogSource::Json, ) .unwrap(); - let rb = Event::into_recordbatch( - Utc::now(), - data, - schema, - &store_schema, - false, - None, - SchemaVersion::V0, - ) - .unwrap(); + let rb = + Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); assert_eq!(rb.num_rows(), 1); assert_eq!(rb.num_columns(), 1); @@ -719,6 +694,7 @@ mod tests { let store_schema = HashMap::new(); let (data, schema, _) = Event::new(json) .to_data( + false, &store_schema, None, None, @@ -727,16 +703,8 @@ mod tests { &LogSource::Json, ) .unwrap(); - let rb = Event::into_recordbatch( - Utc::now(), - data, - schema, - &store_schema, - false, - None, - SchemaVersion::V0, - ) - .unwrap(); + let rb = + Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); assert_eq!(rb.num_rows(), 3); assert_eq!(rb.num_columns(), 4); @@ -785,6 +753,7 @@ mod tests { let store_schema = HashMap::new(); let (data, schema, _) = Event::new(json) .to_data( + false, &store_schema, None, None, @@ -793,16 +762,8 @@ mod tests { &LogSource::Json, ) .unwrap(); - let rb = Event::into_recordbatch( - Utc::now(), - data, - schema, - &store_schema, - false, - None, - SchemaVersion::V0, - ) - .unwrap(); + let rb = + Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); assert_eq!(rb.num_rows(), 3); assert_eq!(rb.num_columns(), 4); @@ -850,6 +811,7 @@ mod tests { ); let (data, schema, _) = Event::new(json) .to_data( + false, &store_schema, None, None, @@ -858,16 +820,8 @@ mod tests { &LogSource::Json, ) .unwrap(); - let rb = Event::into_recordbatch( - Utc::now(), - data, - schema, - &store_schema, - false, - None, - SchemaVersion::V0, - ) - 
.unwrap(); + let rb = + Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); assert_eq!(rb.num_rows(), 3); assert_eq!(rb.num_columns(), 4); @@ -916,6 +870,7 @@ mod tests { assert!(Event::new(json) .to_data( + false, &store_schema, None, None, @@ -953,6 +908,7 @@ mod tests { let store_schema = HashMap::new(); let (data, schema, _) = Event::new(json) .to_data( + false, &store_schema, None, None, @@ -961,16 +917,8 @@ mod tests { &LogSource::Json, ) .unwrap(); - let rb = Event::into_recordbatch( - Utc::now(), - data, - schema, - &store_schema, - false, - None, - SchemaVersion::V0, - ) - .unwrap(); + let rb = + Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); assert_eq!(rb.num_rows(), 4); assert_eq!(rb.num_columns(), 5); @@ -1044,6 +992,7 @@ mod tests { let store_schema = HashMap::new(); let (data, schema, _) = Event::new(json) .to_data( + false, &store_schema, None, None, @@ -1052,16 +1001,8 @@ mod tests { &LogSource::Json, ) .unwrap(); - let rb = Event::into_recordbatch( - Utc::now(), - data, - schema, - &store_schema, - false, - None, - SchemaVersion::V1, - ) - .unwrap(); + let rb = + Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V1).unwrap(); assert_eq!(rb.num_rows(), 4); assert_eq!(rb.num_columns(), 5); diff --git a/src/event/format/mod.rs b/src/event/format/mod.rs index c44ebe7b5..de9b5aaab 100644 --- a/src/event/format/mod.rs +++ b/src/event/format/mod.rs @@ -103,32 +103,26 @@ pub trait EventFormat: Sized { fn to_data( self, - schema: &HashMap>, + static_schema_flag: bool, + stored_schema: &HashMap>, time_partition: Option<&String>, time_partition_limit: Option, - custom_partition: Option<&String>, + custom_partitions: Option<&String>, schema_version: SchemaVersion, log_source: &LogSource, ) -> Result<(Self::Data, EventSchema, bool), AnyError>; fn decode(data: Self::Data, schema: Arc) -> Result; - #[allow(clippy::too_many_arguments)] - fn into_recordbatch( - p_timestamp: DateTime, - data: Self::Data, + /// Updates inferred schema with `p_timestamp` field and ensures it adheres to expectations + fn prepare_and_validate_schema( mut schema: EventSchema, storage_schema: &HashMap>, static_schema_flag: bool, - time_partition: Option<&String>, - schema_version: SchemaVersion, - ) -> Result { + ) -> Result { if get_field(&schema, DEFAULT_TIMESTAMP_KEY).is_some() { - return Err(anyhow!( - "field {} is a reserved field", - DEFAULT_TIMESTAMP_KEY - )); - }; + return Err(anyhow!("field {DEFAULT_TIMESTAMP_KEY} is a reserved field",)); + } // add the p_timestamp field to the event schema to the 0th index schema.insert( @@ -150,6 +144,17 @@ pub trait EventFormat: Sized { return Err(anyhow!("Schema mismatch")); } + Ok(schema) + } + + #[allow(clippy::too_many_arguments)] + fn into_recordbatch( + p_timestamp: DateTime, + data: Self::Data, + schema: &EventSchema, + time_partition: Option<&String>, + schema_version: SchemaVersion, + ) -> Result { // prepare the record batch and new fields to be added let mut new_schema = Arc::new(Schema::new(schema.clone())); new_schema = From 51d166e176edd61592dc6eca527ebe8e04173453 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Mon, 3 Mar 2025 15:16:56 +0530 Subject: [PATCH 27/39] refactor: event construction and processing --- src/connectors/kafka/processor.rs | 21 ++- src/event/format/json.rs | 273 ++++++++++-------------------- src/event/format/mod.rs | 9 +- src/handlers/http/cluster/mod.rs | 11 +- src/handlers/http/ingest.rs | 47 ++--- src/parseable/streams.rs | 19 +-- 
6 files changed, 137 insertions(+), 243 deletions(-) diff --git a/src/connectors/kafka/processor.rs b/src/connectors/kafka/processor.rs index 5fead256c..0619a50a3 100644 --- a/src/connectors/kafka/processor.rs +++ b/src/connectors/kafka/processor.rs @@ -26,7 +26,9 @@ use tokio_stream::wrappers::ReceiverStream; use tracing::{debug, error}; use crate::{ - connectors::common::processor::Processor, event::format::LogSource, parseable::PARSEABLE, + connectors::common::processor::Processor, + event::format::{json, EventFormat, LogSource}, + parseable::PARSEABLE, storage::StreamType, }; @@ -56,14 +58,15 @@ impl ParseableSinkProcessor { } } - PARSEABLE - .get_or_create_stream(stream_name) - .push_logs( - Value::Array(json_vec), - total_payload_size, - &LogSource::Custom("Kafka".to_owned()), - ) - .await?; + let stream = PARSEABLE.get_or_create_stream(stream_name); + + json::Event::new( + Value::Array(json_vec), + total_payload_size, + LogSource::Custom("Kafka".to_owned()), + ) + .into_event(&stream)? + .process(&stream)?; Ok(total_payload_size) } diff --git a/src/event/format/json.rs b/src/event/format/json.rs index 0f1c4a86c..ce4408a1e 100644 --- a/src/event/format/json.rs +++ b/src/event/format/json.rs @@ -53,82 +53,85 @@ use crate::{ pub struct Event { pub json: Value, + pub origin_size: usize, pub p_timestamp: DateTime, + pub log_source: LogSource, } impl Event { - pub fn new(json: Value) -> Self { + pub fn new(json: Value, origin_size: usize, log_source: LogSource) -> Self { Self { json, + origin_size, p_timestamp: Utc::now(), + log_source, } } -} -pub fn flatten_logs( - json: Value, - time_partition: Option<&String>, - time_partition_limit: Option, - custom_partitions: Option<&String>, - schema_version: SchemaVersion, - log_source: &LogSource, -) -> anyhow::Result> { - let data = match log_source { - LogSource::Kinesis => { - //custom flattening required for Amazon Kinesis - let message: Message = serde_json::from_value(json)?; - flatten_kinesis_logs(message) - } - LogSource::OtelLogs => { - //custom flattening required for otel logs - let logs: LogsData = serde_json::from_value(json)?; - flatten_otel_logs(&logs) - } - LogSource::OtelTraces => { - //custom flattening required for otel traces - let traces: TracesData = serde_json::from_value(json)?; - flatten_otel_traces(&traces) - } - LogSource::OtelMetrics => { - //custom flattening required for otel metrics - let metrics: MetricsData = serde_json::from_value(json)?; - flatten_otel_metrics(metrics) - } - _ => vec![json], - }; + pub fn flatten_logs( + self, + time_partition: Option<&String>, + time_partition_limit: Option, + custom_partitions: Option<&String>, + schema_version: SchemaVersion, + ) -> anyhow::Result> { + let data = match self.log_source { + LogSource::Kinesis => { + //custom flattening required for Amazon Kinesis + let message: Message = serde_json::from_value(self.json)?; + flatten_kinesis_logs(message) + } + LogSource::OtelLogs => { + //custom flattening required for otel logs + let logs: LogsData = serde_json::from_value(self.json)?; + flatten_otel_logs(&logs) + } + LogSource::OtelTraces => { + //custom flattening required for otel traces + let traces: TracesData = serde_json::from_value(self.json)?; + flatten_otel_traces(&traces) + } + LogSource::OtelMetrics => { + //custom flattening required for otel metrics + let metrics: MetricsData = serde_json::from_value(self.json)?; + flatten_otel_metrics(metrics) + } + _ => vec![self.json], + }; - let mut logs = vec![]; - for json in data { - let json = flatten_json_body( - json, 
- time_partition, - time_partition_limit, - custom_partitions, - schema_version, - true, - log_source, - )?; + let mut logs = vec![]; + for json in data { + let json = flatten_json_body( + json, + time_partition, + time_partition_limit, + custom_partitions, + schema_version, + true, + &self.log_source, + )?; - // incoming event may be a single json or a json array - // but Data (type defined above) is a vector of json values - // hence we need to convert the incoming event to a vector of json values - match json { - Value::Array(arr) => { - for log in arr { - let Value::Object(json) = log else { - return Err(anyhow!( - "Expected an object or a list of objects, received: {log:?}" - )); - }; - logs.push(json); + // incoming event may be a single json or a json array + // but Data (type defined above) is a vector of json values + // hence we need to convert the incoming event to a vector of json values + match json { + Value::Array(arr) => { + for log in arr { + let Value::Object(json) = log else { + return Err(anyhow!( + "Expected an object or a list of objects, received: {log:?}" + )); + }; + logs.push(json); + } } + Value::Object(obj) => logs.push(obj), + _ => unreachable!("flatten would have failed beforehand"), } - Value::Object(obj) => logs.push(obj), - _ => unreachable!("flatten would have failed beforehand"), } - } - Ok(logs) + Ok(logs) + } } impl EventFormat for Event { @@ -144,15 +147,12 @@ impl EventFormat for Event { time_partition_limit: Option, custom_partitions: Option<&String>, schema_version: SchemaVersion, - log_source: &LogSource, ) -> anyhow::Result<(Self::Data, Vec>, bool)> { - let flattened = flatten_logs( - self.json, + let flattened = self.flatten_logs( time_partition, time_partition_limit, custom_partitions, schema_version, - log_source, )?; // collect all the keys from all the json objects in the request body @@ -204,7 +204,7 @@ impl EventFormat for Event { )); } - let schema = Self::prepare_and_validate_schema(schema, &stored_schema, static_schema_flag)?; + let schema = Self::prepare_and_validate_schema(schema, stored_schema, static_schema_flag)?; Ok((flattened, schema, is_first)) } @@ -226,12 +226,7 @@ impl EventFormat for Event { } /// Converts a JSON event into a Parseable Event - fn into_event( - self, - origin_size: usize, - stream: &Stream, - log_source: &LogSource, - ) -> anyhow::Result { + fn into_event(self, stream: &Stream) -> anyhow::Result { let time_partition = stream.get_time_partition(); let time_partition_limit = stream.get_time_partition_limit(); let static_schema_flag = stream.get_static_schema_flag(); @@ -241,6 +236,7 @@ impl EventFormat for Event { let stream_type = stream.get_stream_type(); let p_timestamp = self.p_timestamp; + let origin_size = self.origin_size; let (data, schema, is_first_event) = self.to_data( static_schema_flag, &storage_schema, @@ -248,7 +244,6 @@ impl EventFormat for Event { time_partition_limit, custom_partitions.as_ref(), schema_version, - log_source, )?; let mut partitions = HashMap::new(); @@ -287,7 +282,7 @@ impl EventFormat for Event { match partitions.get_mut(&key) { Some(PartitionEvent { rb, .. 
}) => { - *rb = concat_batches(&schema, [&rb, &batch])?; + *rb = concat_batches(&schema, [rb, &batch])?; } _ => { partitions.insert( @@ -507,16 +502,8 @@ mod tests { }); let store_schema = HashMap::default(); - let (data, schema, _) = Event::new(json) - .to_data( - false, - &store_schema, - None, - None, - None, - SchemaVersion::V0, - &LogSource::Json, - ) + let (data, schema, _) = Event::new(json, 0 /* doesn't matter */, LogSource::Json) + .to_data(false, &store_schema, None, None, None, SchemaVersion::V0) .unwrap(); let rb = Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); @@ -546,16 +533,8 @@ mod tests { }); let store_schema = HashMap::default(); - let (data, schema, _) = Event::new(json) - .to_data( - false, - &store_schema, - None, - None, - None, - SchemaVersion::V0, - &LogSource::Json, - ) + let (data, schema, _) = Event::new(json, 0 /* doesn't matter */, LogSource::Json) + .to_data(false, &store_schema, None, None, None, SchemaVersion::V0) .unwrap(); let rb = Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); @@ -587,16 +566,8 @@ mod tests { ] .into_iter(), ); - let (data, schema, _) = Event::new(json) - .to_data( - false, - &store_schema, - None, - None, - None, - SchemaVersion::V0, - &LogSource::Json, - ) + let (data, schema, _) = Event::new(json, 0 /* doesn't matter */, LogSource::Json) + .to_data(false, &store_schema, None, None, None, SchemaVersion::V0) .unwrap(); let rb = Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); @@ -629,16 +600,8 @@ mod tests { .into_iter(), ); - assert!(Event::new(json) - .to_data( - false, - &store_schema, - None, - None, - None, - SchemaVersion::V0, - &LogSource::Json, - ) + assert!(Event::new(json, 0 /* doesn't matter */, LogSource::Json) + .to_data(false, &store_schema, None, None, None, SchemaVersion::V0,) .is_err()); } @@ -655,16 +618,8 @@ mod tests { .into_iter(), ); - let (data, schema, _) = Event::new(json) - .to_data( - false, - &store_schema, - None, - None, - None, - SchemaVersion::V0, - &LogSource::Json, - ) + let (data, schema, _) = Event::new(json, 0 /* doesn't matter */, LogSource::Json) + .to_data(false, &store_schema, None, None, None, SchemaVersion::V0) .unwrap(); let rb = Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); @@ -692,16 +647,8 @@ mod tests { ]); let store_schema = HashMap::new(); - let (data, schema, _) = Event::new(json) - .to_data( - false, - &store_schema, - None, - None, - None, - SchemaVersion::V0, - &LogSource::Json, - ) + let (data, schema, _) = Event::new(json, 0 /* doesn't matter */, LogSource::Json) + .to_data(false, &store_schema, None, None, None, SchemaVersion::V0) .unwrap(); let rb = Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); @@ -751,16 +698,8 @@ mod tests { ]); let store_schema = HashMap::new(); - let (data, schema, _) = Event::new(json) - .to_data( - false, - &store_schema, - None, - None, - None, - SchemaVersion::V0, - &LogSource::Json, - ) + let (data, schema, _) = Event::new(json, 0 /* doesn't matter */, LogSource::Json) + .to_data(false, &store_schema, None, None, None, SchemaVersion::V0) .unwrap(); let rb = Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); @@ -809,16 +748,8 @@ mod tests { ] .into_iter(), ); - let (data, schema, _) = Event::new(json) - .to_data( - false, - &store_schema, - None, - None, - None, - SchemaVersion::V0, - &LogSource::Json, - ) + let (data, schema, _) 
= Event::new(json, 0 /* doesn't matter */, LogSource::Json) + .to_data(false, &store_schema, None, None, None, SchemaVersion::V0) .unwrap(); let rb = Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); @@ -868,16 +799,8 @@ mod tests { .into_iter(), ); - assert!(Event::new(json) - .to_data( - false, - &store_schema, - None, - None, - None, - SchemaVersion::V0, - &LogSource::Json, - ) + assert!(Event::new(json, 0 /* doesn't matter */, LogSource::Json) + .to_data(false, &store_schema, None, None, None, SchemaVersion::V0,) .is_err()); } @@ -906,16 +829,8 @@ mod tests { ]); let store_schema = HashMap::new(); - let (data, schema, _) = Event::new(json) - .to_data( - false, - &store_schema, - None, - None, - None, - SchemaVersion::V0, - &LogSource::Json, - ) + let (data, schema, _) = Event::new(json, 0 /* doesn't matter */, LogSource::Json) + .to_data(false, &store_schema, None, None, None, SchemaVersion::V0) .unwrap(); let rb = Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); @@ -990,16 +905,8 @@ mod tests { ]); let store_schema = HashMap::new(); - let (data, schema, _) = Event::new(json) - .to_data( - false, - &store_schema, - None, - None, - None, - SchemaVersion::V1, - &LogSource::Json, - ) + let (data, schema, _) = Event::new(json, 0 /* doesn't matter */, LogSource::Json) + .to_data(false, &store_schema, None, None, None, SchemaVersion::V1) .unwrap(); let rb = Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V1).unwrap(); diff --git a/src/event/format/mod.rs b/src/event/format/mod.rs index de9b5aaab..5459e035c 100644 --- a/src/event/format/mod.rs +++ b/src/event/format/mod.rs @@ -109,7 +109,6 @@ pub trait EventFormat: Sized { time_partition_limit: Option, custom_partitions: Option<&String>, schema_version: SchemaVersion, - log_source: &LogSource, ) -> Result<(Self::Data, EventSchema, bool), AnyError>; fn decode(data: Self::Data, schema: Arc) -> Result; @@ -147,7 +146,6 @@ pub trait EventFormat: Sized { Ok(schema) } - #[allow(clippy::too_many_arguments)] fn into_recordbatch( p_timestamp: DateTime, data: Self::Data, @@ -170,12 +168,7 @@ pub trait EventFormat: Sized { Ok(rb) } - fn into_event( - self, - origin_size: usize, - stream: &Stream, - log_source: &LogSource, - ) -> Result; + fn into_event(self, stream: &Stream) -> Result; } pub fn get_existing_field_names( diff --git a/src/handlers/http/cluster/mod.rs b/src/handlers/http/cluster/mod.rs index 3ed451cb6..eb9751d4e 100644 --- a/src/handlers/http/cluster/mod.rs +++ b/src/handlers/http/cluster/mod.rs @@ -24,6 +24,7 @@ use std::time::Duration; use actix_web::http::header::{self, HeaderMap}; use actix_web::web::Path; use actix_web::Responder; +use anyhow::anyhow; use bytes::Bytes; use chrono::Utc; use clokwerk::{AsyncScheduler, Interval}; @@ -37,7 +38,7 @@ use tracing::{error, info, warn}; use url::Url; use utils::{check_liveness, to_url_string, IngestionStats, QueriedStats, StorageStats}; -use crate::event::format::LogSource; +use crate::event::format::{json, EventFormat, LogSource}; use crate::metrics::prom_utils::Metrics; use crate::parseable::PARSEABLE; use crate::rbac::role::model::DefaultPrivilege; @@ -786,9 +787,11 @@ pub fn init_cluster_metrics_schedular() -> Result<(), PostError> { let byte_size = serde_json::to_vec(&metrics).unwrap().len(); if matches!( - internal_stream - .push_logs(json, byte_size, &LogSource::Pmeta) - .await, + json::Event::new(json, byte_size, LogSource::Pmeta) + .into_event(&internal_stream) + .and_then(|event| event + 
.process(&internal_stream) + .map_err(|e| anyhow!(e))), Ok(()) ) { info!("Cluster metrics successfully ingested into internal stream"); diff --git a/src/handlers/http/ingest.rs b/src/handlers/http/ingest.rs index 76f236690..bdf35d4c9 100644 --- a/src/handlers/http/ingest.rs +++ b/src/handlers/http/ingest.rs @@ -26,7 +26,7 @@ use http::StatusCode; use serde_json::Value; use crate::event::error::EventError; -use crate::event::format::LogSource; +use crate::event::format::{json, EventFormat, LogSource}; use crate::event::{self, get_schema_key, PartitionEvent}; use crate::handlers::{LOG_SOURCE_KEY, STREAM_NAME_HEADER_KEY}; use crate::option::Mode; @@ -73,10 +73,11 @@ pub async fn ingest( return Err(PostError::OtelNotSupported); } - PARSEABLE - .get_or_create_stream(&stream_name) - .push_logs(json, byte_size, &log_source) - .await?; + let stream = PARSEABLE.get_or_create_stream(&stream_name); + + json::Event::new(json, byte_size, log_source) + .into_event(&stream)? + .process(&stream)?; Ok(HttpResponse::Ok().finish()) } @@ -105,10 +106,11 @@ pub async fn handle_otel_logs_ingestion( .create_stream_if_not_exists(&stream_name, StreamType::UserDefined, LogSource::OtelLogs) .await?; - PARSEABLE - .get_or_create_stream(&stream_name) - .push_logs(json, byte_size, &log_source) - .await?; + let stream = PARSEABLE.get_or_create_stream(&stream_name); + + json::Event::new(json, byte_size, log_source) + .into_event(&stream)? + .process(&stream)?; Ok(HttpResponse::Ok().finish()) } @@ -139,10 +141,11 @@ pub async fn handle_otel_metrics_ingestion( ) .await?; - PARSEABLE - .get_or_create_stream(&stream_name) - .push_logs(json, byte_size, &log_source) - .await?; + let stream = PARSEABLE.get_or_create_stream(&stream_name); + + json::Event::new(json, byte_size, log_source) + .into_event(&stream)? + .process(&stream)?; Ok(HttpResponse::Ok().finish()) } @@ -170,10 +173,11 @@ pub async fn handle_otel_traces_ingestion( .create_stream_if_not_exists(&stream_name, StreamType::UserDefined, LogSource::OtelTraces) .await?; - PARSEABLE - .get_or_create_stream(&stream_name) - .push_logs(json, byte_size, &log_source) - .await?; + let stream = PARSEABLE.get_or_create_stream(&stream_name); + + json::Event::new(json, byte_size, log_source) + .into_event(&stream)? + .process(&stream)?; Ok(HttpResponse::Ok().finish()) } @@ -222,10 +226,11 @@ pub async fn post_event( return Err(PostError::OtelNotSupported); } - PARSEABLE - .get_or_create_stream(&stream_name) - .push_logs(json, byte_size, &log_source) - .await?; + let stream = PARSEABLE.get_or_create_stream(&stream_name); + + json::Event::new(json, byte_size, log_source) + .into_event(&stream)? + .process(&stream)?; Ok(HttpResponse::Ok().finish()) } diff --git a/src/parseable/streams.rs b/src/parseable/streams.rs index 870736d2f..235523cdc 100644 --- a/src/parseable/streams.rs +++ b/src/parseable/streams.rs @@ -42,16 +42,12 @@ use parquet::{ }; use rand::distributions::DistString; use relative_path::RelativePathBuf; -use serde_json::Value; use tokio::task::JoinSet; use tracing::{error, info, trace, warn}; use crate::{ cli::Options, - event::{ - format::{json, EventFormat, LogSource}, - DEFAULT_TIMESTAMP_KEY, - }, + event::DEFAULT_TIMESTAMP_KEY, metadata::{LogStreamMetadata, SchemaVersion}, metrics, option::Mode, @@ -113,19 +109,6 @@ impl Stream { }) } - pub async fn push_logs( - &self, - json: Value, - origin_size: usize, - log_source: &LogSource, - ) -> anyhow::Result<()> { - json::Event::new(json) - .into_event(origin_size, self, log_source)? 
- .process(self)?; - - Ok(()) - } - // Concatenates record batches and puts them in memory store for each event. pub fn push( &self, From 218080ebe9c68b6439f60381db1093d1f9a20413 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Mon, 3 Mar 2025 19:48:18 +0530 Subject: [PATCH 28/39] fix: concat at once --- src/event/format/json.rs | 8 +++----- src/event/mod.rs | 27 ++++++++++++++++++--------- src/handlers/http/ingest.rs | 8 +++++--- 3 files changed, 26 insertions(+), 17 deletions(-) diff --git a/src/event/format/json.rs b/src/event/format/json.rs index ce4408a1e..5ebb7b83a 100644 --- a/src/event/format/json.rs +++ b/src/event/format/json.rs @@ -20,7 +20,6 @@ #![allow(deprecated)] use anyhow::anyhow; -use arrow::compute::concat_batches; use arrow_array::RecordBatch; use arrow_json::reader::{infer_json_schema_from_iterator, ReaderBuilder}; use arrow_schema::{DataType, Field, Fields, Schema}; @@ -281,14 +280,13 @@ impl EventFormat for Event { } match partitions.get_mut(&key) { - Some(PartitionEvent { rb, .. }) => { - *rb = concat_batches(&schema, [rb, &batch])?; - } + Some(PartitionEvent { rbs, .. }) => rbs.push(batch), _ => { partitions.insert( key, PartitionEvent { - rb: batch, + rbs: vec![batch], + schema, parsed_timestamp, custom_partition_values, }, diff --git a/src/event/mod.rs b/src/event/mod.rs index 6de567b83..f0297f61c 100644 --- a/src/event/mod.rs +++ b/src/event/mod.rs @@ -19,20 +19,26 @@ pub mod format; +use arrow::compute::concat_batches; use arrow_array::RecordBatch; -use arrow_schema::Field; +use arrow_schema::{Field, Schema}; use itertools::Itertools; use std::sync::Arc; use self::error::EventError; -use crate::{metadata::update_stats, parseable::Stream, storage::StreamType}; +use crate::{ + metadata::update_stats, + parseable::{StagingError, Stream}, + storage::StreamType, +}; use chrono::NaiveDateTime; use std::collections::HashMap; pub const DEFAULT_TIMESTAMP_KEY: &str = "p_timestamp"; pub struct PartitionEvent { - pub rb: RecordBatch, + pub rbs: Vec, + pub schema: Arc, pub parsed_timestamp: NaiveDateTime, pub custom_partition_values: HashMap, } @@ -50,14 +56,15 @@ pub struct Event { impl Event { pub fn process(self, stream: &Stream) -> Result<(), EventError> { for (key, partition) in self.partitions { + let rb = + concat_batches(&partition.schema, &partition.rbs).map_err(StagingError::Arrow)?; if self.is_first_event { - let schema = partition.rb.schema().as_ref().clone(); - stream.commit_schema(schema)?; + stream.commit_schema(partition.schema.as_ref().clone())?; } stream.push( &key, - &partition.rb, + &rb, partition.parsed_timestamp, &partition.custom_partition_values, self.stream_type, @@ -67,20 +74,22 @@ impl Event { &stream.stream_name, self.origin_format, self.origin_size, - partition.rb.num_rows(), + rb.num_rows(), partition.parsed_timestamp.date(), ); - crate::livetail::LIVETAIL.process(&stream.stream_name, &partition.rb); + crate::livetail::LIVETAIL.process(&stream.stream_name, &rb); } Ok(()) } pub fn process_unchecked(&self, stream: &Stream) -> Result<(), EventError> { for (key, partition) in &self.partitions { + let rb = + concat_batches(&partition.schema, &partition.rbs).map_err(StagingError::Arrow)?; stream.push( key, - &partition.rb, + &rb, partition.parsed_timestamp, &partition.custom_partition_values, self.stream_type, diff --git a/src/handlers/http/ingest.rs b/src/handlers/http/ingest.rs index bdf35d4c9..d96236322 100644 --- a/src/handlers/http/ingest.rs +++ b/src/handlers/http/ingest.rs @@ -236,18 +236,20 @@ pub async fn post_event( } pub async fn 
push_logs_unchecked( - rb: RecordBatch, + batch: RecordBatch, stream: &Stream, ) -> Result { + let schema = batch.schema(); let unchecked_event = event::Event { origin_format: "json", origin_size: 0, time_partition: None, is_first_event: true, // NOTE: Maybe should be false partitions: [( - get_schema_key(&rb.schema().fields), + get_schema_key(&schema.fields), PartitionEvent { - rb, + rbs: vec![batch], + schema, parsed_timestamp: Utc::now().naive_utc(), custom_partition_values: HashMap::new(), // should be an empty map for unchecked push }, From aa0befa5f4c22a6353fa4b33d4a35b1f05bae0cb Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Mon, 3 Mar 2025 20:53:18 +0530 Subject: [PATCH 29/39] refactor: separate out flatten from schema inference --- src/event/format/json.rs | 109 +++++++++++++++++++++++++-------------- src/event/format/mod.rs | 14 +++-- 2 files changed, 82 insertions(+), 41 deletions(-) diff --git a/src/event/format/json.rs b/src/event/format/json.rs index 5ebb7b83a..69584e1f7 100644 --- a/src/event/format/json.rs +++ b/src/event/format/json.rs @@ -140,22 +140,28 @@ impl EventFormat for Event { // also extract the arrow schema, tags and metadata from the incoming json fn to_data( self, - static_schema_flag: bool, - stored_schema: &HashMap>, time_partition: Option<&String>, time_partition_limit: Option, custom_partitions: Option<&String>, schema_version: SchemaVersion, - ) -> anyhow::Result<(Self::Data, Vec>, bool)> { - let flattened = self.flatten_logs( + ) -> anyhow::Result { + self.flatten_logs( time_partition, time_partition_limit, custom_partitions, schema_version, - )?; + ) + } + fn infer_schema( + data: &Self::Data, + stored_schema: &HashMap>, + time_partition: Option<&String>, + static_schema_flag: bool, + schema_version: SchemaVersion, + ) -> anyhow::Result<(super::EventSchema, bool)> { // collect all the keys from all the json objects in the request body - let fields = collect_keys(flattened.iter()); + let fields = collect_keys(data.iter()); let mut is_first = false; let schema = if let Some(schema) = derive_arrow_schema(stored_schema, fields) { @@ -163,14 +169,14 @@ impl EventFormat for Event { } else { // TODO: let mut infer_schema = infer_json_schema_from_iterator( - flattened.iter().map(|obj| Ok(Value::Object(obj.clone()))), + data.iter().map(|obj| Ok(Value::Object(obj.clone()))), ) .map_err(|err| anyhow!("Could not infer schema for this event due to err {:?}", err))?; let new_infer_schema = super::update_field_type_in_schema( Arc::new(infer_schema), Some(stored_schema), time_partition, - Some(&flattened), + Some(data), schema_version, ); infer_schema = Schema::new(new_infer_schema.fields().clone()); @@ -194,7 +200,7 @@ impl EventFormat for Event { .collect() }; - if flattened + if data .iter() .any(|value| fields_mismatch(&schema, value, schema_version)) { @@ -205,7 +211,7 @@ impl EventFormat for Event { let schema = Self::prepare_and_validate_schema(schema, stored_schema, static_schema_flag)?; - Ok((flattened, schema, is_first)) + Ok((schema, is_first)) } // Convert the Data type (defined above) to arrow record batch @@ -231,19 +237,24 @@ impl EventFormat for Event { let static_schema_flag = stream.get_static_schema_flag(); let custom_partitions = stream.get_custom_partition(); let schema_version = stream.get_schema_version(); - let storage_schema = stream.get_schema_raw(); + let stored_schema = stream.get_schema_raw(); let stream_type = stream.get_stream_type(); let p_timestamp = self.p_timestamp; let origin_size = self.origin_size; - let (data, schema, 
is_first_event) = self.to_data( - static_schema_flag, - &storage_schema, + let data = self.to_data( time_partition.as_ref(), time_partition_limit, custom_partitions.as_ref(), schema_version, )?; + let (schema, is_first_event) = Self::infer_schema( + &data, + &stored_schema, + time_partition.as_ref(), + static_schema_flag, + schema_version, + )?; let mut partitions = HashMap::new(); for json in data { @@ -500,9 +511,11 @@ mod tests { }); let store_schema = HashMap::default(); - let (data, schema, _) = Event::new(json, 0 /* doesn't matter */, LogSource::Json) - .to_data(false, &store_schema, None, None, None, SchemaVersion::V0) + let data = Event::new(json, 0 /* doesn't matter */, LogSource::Json) + .to_data(None, None, None, SchemaVersion::V0) .unwrap(); + let (schema, _) = + Event::infer_schema(&data, &store_schema, None, false, SchemaVersion::V0).unwrap(); let rb = Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); @@ -531,9 +544,11 @@ mod tests { }); let store_schema = HashMap::default(); - let (data, schema, _) = Event::new(json, 0 /* doesn't matter */, LogSource::Json) - .to_data(false, &store_schema, None, None, None, SchemaVersion::V0) + let data = Event::new(json, 0 /* doesn't matter */, LogSource::Json) + .to_data(None, None, None, SchemaVersion::V0) .unwrap(); + let (schema, _) = + Event::infer_schema(&data, &store_schema, None, false, SchemaVersion::V0).unwrap(); let rb = Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); @@ -564,9 +579,11 @@ mod tests { ] .into_iter(), ); - let (data, schema, _) = Event::new(json, 0 /* doesn't matter */, LogSource::Json) - .to_data(false, &store_schema, None, None, None, SchemaVersion::V0) + let data = Event::new(json, 0 /* doesn't matter */, LogSource::Json) + .to_data(None, None, None, SchemaVersion::V0) .unwrap(); + let (schema, _) = + Event::infer_schema(&data, &store_schema, None, false, SchemaVersion::V0).unwrap(); let rb = Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); @@ -598,9 +615,11 @@ mod tests { .into_iter(), ); - assert!(Event::new(json, 0 /* doesn't matter */, LogSource::Json) - .to_data(false, &store_schema, None, None, None, SchemaVersion::V0,) - .is_err()); + let data = Event::new(json, 0 /* doesn't matter */, LogSource::Json) + .to_data(None, None, None, SchemaVersion::V0) + .unwrap(); + + assert!(Event::infer_schema(&data, &store_schema, None, false, SchemaVersion::V0).is_err()); } #[test] @@ -616,9 +635,11 @@ mod tests { .into_iter(), ); - let (data, schema, _) = Event::new(json, 0 /* doesn't matter */, LogSource::Json) - .to_data(false, &store_schema, None, None, None, SchemaVersion::V0) + let data = Event::new(json, 0 /* doesn't matter */, LogSource::Json) + .to_data(None, None, None, SchemaVersion::V0) .unwrap(); + let (schema, _) = + Event::infer_schema(&data, &store_schema, None, false, SchemaVersion::V0).unwrap(); let rb = Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); @@ -645,9 +666,11 @@ mod tests { ]); let store_schema = HashMap::new(); - let (data, schema, _) = Event::new(json, 0 /* doesn't matter */, LogSource::Json) - .to_data(false, &store_schema, None, None, None, SchemaVersion::V0) + let data = Event::new(json, 0 /* doesn't matter */, LogSource::Json) + .to_data(None, None, None, SchemaVersion::V0) .unwrap(); + let (schema, _) = + Event::infer_schema(&data, &store_schema, None, false, SchemaVersion::V0).unwrap(); let rb = Event::into_recordbatch(Utc::now(), data, 
&schema, None, SchemaVersion::V0).unwrap(); @@ -696,9 +719,11 @@ mod tests { ]); let store_schema = HashMap::new(); - let (data, schema, _) = Event::new(json, 0 /* doesn't matter */, LogSource::Json) - .to_data(false, &store_schema, None, None, None, SchemaVersion::V0) + let data = Event::new(json, 0 /* doesn't matter */, LogSource::Json) + .to_data(None, None, None, SchemaVersion::V0) .unwrap(); + let (schema, _) = + Event::infer_schema(&data, &store_schema, None, false, SchemaVersion::V0).unwrap(); let rb = Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); @@ -746,9 +771,11 @@ mod tests { ] .into_iter(), ); - let (data, schema, _) = Event::new(json, 0 /* doesn't matter */, LogSource::Json) - .to_data(false, &store_schema, None, None, None, SchemaVersion::V0) + let data = Event::new(json, 0 /* doesn't matter */, LogSource::Json) + .to_data(None, None, None, SchemaVersion::V0) .unwrap(); + let (schema, _) = + Event::infer_schema(&data, &store_schema, None, false, SchemaVersion::V0).unwrap(); let rb = Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); @@ -797,9 +824,11 @@ mod tests { .into_iter(), ); - assert!(Event::new(json, 0 /* doesn't matter */, LogSource::Json) - .to_data(false, &store_schema, None, None, None, SchemaVersion::V0,) - .is_err()); + let data = Event::new(json, 0 /* doesn't matter */, LogSource::Json) + .to_data(None, None, None, SchemaVersion::V0) + .unwrap(); + + assert!(Event::infer_schema(&data, &store_schema, None, false, SchemaVersion::V0).is_err()); } #[test] @@ -827,9 +856,11 @@ mod tests { ]); let store_schema = HashMap::new(); - let (data, schema, _) = Event::new(json, 0 /* doesn't matter */, LogSource::Json) - .to_data(false, &store_schema, None, None, None, SchemaVersion::V0) + let data = Event::new(json, 0 /* doesn't matter */, LogSource::Json) + .to_data(None, None, None, SchemaVersion::V0) .unwrap(); + let (schema, _) = + Event::infer_schema(&data, &store_schema, None, false, SchemaVersion::V0).unwrap(); let rb = Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); @@ -903,9 +934,11 @@ mod tests { ]); let store_schema = HashMap::new(); - let (data, schema, _) = Event::new(json, 0 /* doesn't matter */, LogSource::Json) - .to_data(false, &store_schema, None, None, None, SchemaVersion::V1) + let data = Event::new(json, 0 /* doesn't matter */, LogSource::Json) + .to_data(None, None, None, SchemaVersion::V1) .unwrap(); + let (schema, _) = + Event::infer_schema(&data, &store_schema, None, false, SchemaVersion::V1).unwrap(); let rb = Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V1).unwrap(); diff --git a/src/event/format/mod.rs b/src/event/format/mod.rs index 5459e035c..5590c4c3d 100644 --- a/src/event/format/mod.rs +++ b/src/event/format/mod.rs @@ -96,6 +96,8 @@ impl Display for LogSource { } } +pub type IsFirstEvent = bool; + // Global Trait for event format // This trait is implemented by all the event formats pub trait EventFormat: Sized { @@ -103,13 +105,19 @@ pub trait EventFormat: Sized { fn to_data( self, - static_schema_flag: bool, - stored_schema: &HashMap>, time_partition: Option<&String>, time_partition_limit: Option, custom_partitions: Option<&String>, schema_version: SchemaVersion, - ) -> Result<(Self::Data, EventSchema, bool), AnyError>; + ) -> anyhow::Result; + + fn infer_schema( + data: &Self::Data, + stored_schema: &HashMap>, + time_partition: Option<&String>, + static_schema_flag: bool, + schema_version: 
SchemaVersion, + ) -> anyhow::Result<(EventSchema, IsFirstEvent)>; fn decode(data: Self::Data, schema: Arc) -> Result; From da5e16c016772578c2823dc9127ff018b249cfa1 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Tue, 4 Mar 2025 00:35:00 +0530 Subject: [PATCH 30/39] style: `anyhow::Result` --- src/event/format/mod.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/event/format/mod.rs b/src/event/format/mod.rs index 5590c4c3d..326969fd3 100644 --- a/src/event/format/mod.rs +++ b/src/event/format/mod.rs @@ -24,7 +24,7 @@ use std::{ sync::Arc, }; -use anyhow::{anyhow, Error as AnyError}; +use anyhow::anyhow; use arrow_array::RecordBatch; use arrow_schema::{DataType, Field, Schema, TimeUnit}; use chrono::{DateTime, Utc}; @@ -119,14 +119,14 @@ pub trait EventFormat: Sized { schema_version: SchemaVersion, ) -> anyhow::Result<(EventSchema, IsFirstEvent)>; - fn decode(data: Self::Data, schema: Arc) -> Result; + fn decode(data: Self::Data, schema: Arc) -> anyhow::Result; /// Updates inferred schema with `p_timestamp` field and ensures it adheres to expectations fn prepare_and_validate_schema( mut schema: EventSchema, storage_schema: &HashMap>, static_schema_flag: bool, - ) -> Result { + ) -> anyhow::Result { if get_field(&schema, DEFAULT_TIMESTAMP_KEY).is_some() { return Err(anyhow!("field {DEFAULT_TIMESTAMP_KEY} is a reserved field",)); } @@ -160,7 +160,7 @@ pub trait EventFormat: Sized { schema: &EventSchema, time_partition: Option<&String>, schema_version: SchemaVersion, - ) -> Result { + ) -> anyhow::Result { // prepare the record batch and new fields to be added let mut new_schema = Arc::new(Schema::new(schema.clone())); new_schema = @@ -176,7 +176,7 @@ pub trait EventFormat: Sized { Ok(rb) } - fn into_event(self, stream: &Stream) -> Result; + fn into_event(self, stream: &Stream) -> anyhow::Result; } pub fn get_existing_field_names( From 9b0d865560864cad2c1b17dd3ffac64eaf7992c3 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Tue, 4 Mar 2025 00:36:48 +0530 Subject: [PATCH 31/39] fix: rb per object --- src/event/format/json.rs | 129 ++++++++++++++++++--------------------- src/event/format/mod.rs | 16 +++-- src/event/mod.rs | 66 ++++++++++---------- 3 files changed, 96 insertions(+), 115 deletions(-) diff --git a/src/event/format/json.rs b/src/event/format/json.rs index 69584e1f7..381a76a9d 100644 --- a/src/event/format/json.rs +++ b/src/event/format/json.rs @@ -134,7 +134,7 @@ impl Event { } impl EventFormat for Event { - type Data = Vec; + type Data = Json; // convert the incoming json to a vector of json values // also extract the arrow schema, tags and metadata from the incoming json @@ -144,7 +144,7 @@ impl EventFormat for Event { time_partition_limit: Option, custom_partitions: Option<&String>, schema_version: SchemaVersion, - ) -> anyhow::Result { + ) -> anyhow::Result> { self.flatten_logs( time_partition, time_partition_limit, @@ -161,17 +161,18 @@ impl EventFormat for Event { schema_version: SchemaVersion, ) -> anyhow::Result<(super::EventSchema, bool)> { // collect all the keys from all the json objects in the request body - let fields = collect_keys(data.iter()); + let fields = collect_keys(data); let mut is_first = false; let schema = if let Some(schema) = derive_arrow_schema(stored_schema, fields) { schema } else { // TODO: - let mut infer_schema = infer_json_schema_from_iterator( - data.iter().map(|obj| Ok(Value::Object(obj.clone()))), - ) - .map_err(|err| anyhow!("Could not infer schema for this event due to err {:?}", err))?; + let mut 
infer_schema = + infer_json_schema_from_iterator([Ok(Value::Object(data.clone()))].into_iter()) + .map_err(|err| { + anyhow!("Could not infer schema for this event due to err {:?}", err) + })?; let new_infer_schema = super::update_field_type_in_schema( Arc::new(infer_schema), Some(stored_schema), @@ -200,10 +201,7 @@ impl EventFormat for Event { .collect() }; - if data - .iter() - .any(|value| fields_mismatch(&schema, value, schema_version)) - { + if fields_mismatch(&schema, data, schema_version) { return Err(anyhow!( "Could not process this event due to mismatch in datatype" )); @@ -215,14 +213,14 @@ impl EventFormat for Event { } // Convert the Data type (defined above) to arrow record batch - fn decode(data: Self::Data, schema: Arc) -> anyhow::Result { + fn decode(data: &[Self::Data], schema: Arc) -> anyhow::Result { let array_capacity = round_upto_multiple_of_64(data.len()); let mut reader = ReaderBuilder::new(schema) .with_batch_size(array_capacity) .with_coerce_primitive(false) .build_decoder()?; - reader.serialize(&data)?; + reader.serialize(data)?; match reader.flush() { Ok(Some(recordbatch)) => Ok(recordbatch), Err(err) => Err(anyhow!("Failed to create recordbatch due to {:?}", err)), @@ -248,16 +246,18 @@ impl EventFormat for Event { custom_partitions.as_ref(), schema_version, )?; - let (schema, is_first_event) = Self::infer_schema( - &data, - &stored_schema, - time_partition.as_ref(), - static_schema_flag, - schema_version, - )?; + let mut is_first_event = false; let mut partitions = HashMap::new(); for json in data { + let (schema, is_first) = Self::infer_schema( + &json, + &stored_schema, + time_partition.as_ref(), + static_schema_flag, + schema_version, + )?; + is_first_event = is_first_event || is_first; let custom_partition_values = match custom_partitions.as_ref() { Some(custom_partitions) => { let custom_partitions = custom_partitions.split(',').collect_vec(); @@ -273,7 +273,7 @@ impl EventFormat for Event { let batch = Self::into_recordbatch( p_timestamp, - vec![json], + &[json], &schema, time_partition.as_ref(), schema_version, @@ -368,15 +368,8 @@ fn derive_arrow_schema( // Returns a list of keys that are present in the given iterable of JSON objects // Returns None if even one of the value is not an Object -fn collect_keys<'a>(objects: impl Iterator) -> HashSet<&'a str> { - let mut keys = HashSet::new(); - for object in objects { - for key in object.keys() { - keys.insert(key.as_str()); - } - } - - keys +fn collect_keys(object: &Json) -> HashSet<&str> { + object.keys().map(|k| k.as_str()).collect() } // Returns true when the field doesn't exist in schema or has an invalid type @@ -515,9 +508,9 @@ mod tests { .to_data(None, None, None, SchemaVersion::V0) .unwrap(); let (schema, _) = - Event::infer_schema(&data, &store_schema, None, false, SchemaVersion::V0).unwrap(); + Event::infer_schema(&data[0], &store_schema, None, false, SchemaVersion::V0).unwrap(); let rb = - Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); + Event::into_recordbatch(Utc::now(), &data, &schema, None, SchemaVersion::V0).unwrap(); assert_eq!(rb.num_rows(), 1); assert_eq!(rb.num_columns(), 4); @@ -548,9 +541,9 @@ mod tests { .to_data(None, None, None, SchemaVersion::V0) .unwrap(); let (schema, _) = - Event::infer_schema(&data, &store_schema, None, false, SchemaVersion::V0).unwrap(); + Event::infer_schema(&data[0], &store_schema, None, false, SchemaVersion::V0).unwrap(); let rb = - Event::into_recordbatch(Utc::now(), data, &schema, None, 
SchemaVersion::V0).unwrap(); + Event::into_recordbatch(Utc::now(), &data, &schema, None, SchemaVersion::V0).unwrap(); assert_eq!(rb.num_rows(), 1); assert_eq!(rb.num_columns(), 3); @@ -583,9 +576,9 @@ mod tests { .to_data(None, None, None, SchemaVersion::V0) .unwrap(); let (schema, _) = - Event::infer_schema(&data, &store_schema, None, false, SchemaVersion::V0).unwrap(); + Event::infer_schema(&data[0], &store_schema, None, false, SchemaVersion::V0).unwrap(); let rb = - Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); + Event::into_recordbatch(Utc::now(), &data, &schema, None, SchemaVersion::V0).unwrap(); assert_eq!(rb.num_rows(), 1); assert_eq!(rb.num_columns(), 3); @@ -619,7 +612,9 @@ mod tests { .to_data(None, None, None, SchemaVersion::V0) .unwrap(); - assert!(Event::infer_schema(&data, &store_schema, None, false, SchemaVersion::V0).is_err()); + assert!( + Event::infer_schema(&data[0], &store_schema, None, false, SchemaVersion::V0).is_err() + ); } #[test] @@ -639,9 +634,9 @@ mod tests { .to_data(None, None, None, SchemaVersion::V0) .unwrap(); let (schema, _) = - Event::infer_schema(&data, &store_schema, None, false, SchemaVersion::V0).unwrap(); + Event::infer_schema(&data[0], &store_schema, None, false, SchemaVersion::V0).unwrap(); let rb = - Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); + Event::into_recordbatch(Utc::now(), &data, &schema, None, SchemaVersion::V0).unwrap(); assert_eq!(rb.num_rows(), 1); assert_eq!(rb.num_columns(), 1); @@ -670,9 +665,9 @@ mod tests { .to_data(None, None, None, SchemaVersion::V0) .unwrap(); let (schema, _) = - Event::infer_schema(&data, &store_schema, None, false, SchemaVersion::V0).unwrap(); + Event::infer_schema(&data[1], &store_schema, None, false, SchemaVersion::V0).unwrap(); let rb = - Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); + Event::into_recordbatch(Utc::now(), &data, &schema, None, SchemaVersion::V0).unwrap(); assert_eq!(rb.num_rows(), 3); assert_eq!(rb.num_columns(), 4); @@ -723,9 +718,9 @@ mod tests { .to_data(None, None, None, SchemaVersion::V0) .unwrap(); let (schema, _) = - Event::infer_schema(&data, &store_schema, None, false, SchemaVersion::V0).unwrap(); + Event::infer_schema(&data[1], &store_schema, None, false, SchemaVersion::V0).unwrap(); let rb = - Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); + Event::into_recordbatch(Utc::now(), &data, &schema, None, SchemaVersion::V0).unwrap(); assert_eq!(rb.num_rows(), 3); assert_eq!(rb.num_columns(), 4); @@ -775,9 +770,9 @@ mod tests { .to_data(None, None, None, SchemaVersion::V0) .unwrap(); let (schema, _) = - Event::infer_schema(&data, &store_schema, None, false, SchemaVersion::V0).unwrap(); + Event::infer_schema(&data[0], &store_schema, None, false, SchemaVersion::V0).unwrap(); let rb = - Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); + Event::into_recordbatch(Utc::now(), &data, &schema, None, SchemaVersion::V0).unwrap(); assert_eq!(rb.num_rows(), 3); assert_eq!(rb.num_columns(), 4); @@ -797,23 +792,12 @@ mod tests { #[test] fn arr_schema_mismatch() { - let json = json!([ - { - "a": null, - "b": "hello", - "c": 1.24 - }, - { - "a": 1, - "b": "hello", - "c": 1 - }, - { - "a": 1, - "b": "hello", - "c": null - }, - ]); + let json = json!( + { + "a": 1, + "b": "hello", + "c": 1 + }); let store_schema = fields_to_map( [ @@ -824,11 +808,14 @@ mod tests { .into_iter(), ); - let data = Event::new(json, 0 
/* doesn't matter */, LogSource::Json) - .to_data(None, None, None, SchemaVersion::V0) - .unwrap(); - - assert!(Event::infer_schema(&data, &store_schema, None, false, SchemaVersion::V0).is_err()); + assert!(Event::infer_schema( + json.as_object().unwrap(), + &store_schema, + None, + false, + SchemaVersion::V0 + ) + .is_err()); } #[test] @@ -860,9 +847,9 @@ mod tests { .to_data(None, None, None, SchemaVersion::V0) .unwrap(); let (schema, _) = - Event::infer_schema(&data, &store_schema, None, false, SchemaVersion::V0).unwrap(); + Event::infer_schema(&data[3], &store_schema, None, false, SchemaVersion::V0).unwrap(); let rb = - Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V0).unwrap(); + Event::into_recordbatch(Utc::now(), &data, &schema, None, SchemaVersion::V0).unwrap(); assert_eq!(rb.num_rows(), 4); assert_eq!(rb.num_columns(), 5); @@ -938,9 +925,9 @@ mod tests { .to_data(None, None, None, SchemaVersion::V1) .unwrap(); let (schema, _) = - Event::infer_schema(&data, &store_schema, None, false, SchemaVersion::V1).unwrap(); + Event::infer_schema(&data[3], &store_schema, None, false, SchemaVersion::V1).unwrap(); let rb = - Event::into_recordbatch(Utc::now(), data, &schema, None, SchemaVersion::V1).unwrap(); + Event::into_recordbatch(Utc::now(), &data, &schema, None, SchemaVersion::V1).unwrap(); assert_eq!(rb.num_rows(), 4); assert_eq!(rb.num_columns(), 5); diff --git a/src/event/format/mod.rs b/src/event/format/mod.rs index 326969fd3..94b60662e 100644 --- a/src/event/format/mod.rs +++ b/src/event/format/mod.rs @@ -109,7 +109,7 @@ pub trait EventFormat: Sized { time_partition_limit: Option, custom_partitions: Option<&String>, schema_version: SchemaVersion, - ) -> anyhow::Result; + ) -> anyhow::Result>; fn infer_schema( data: &Self::Data, @@ -119,7 +119,7 @@ pub trait EventFormat: Sized { schema_version: SchemaVersion, ) -> anyhow::Result<(EventSchema, IsFirstEvent)>; - fn decode(data: Self::Data, schema: Arc) -> anyhow::Result; + fn decode(data: &[Self::Data], schema: Arc) -> anyhow::Result; /// Updates inferred schema with `p_timestamp` field and ensures it adheres to expectations fn prepare_and_validate_schema( @@ -156,7 +156,7 @@ pub trait EventFormat: Sized { fn into_recordbatch( p_timestamp: DateTime, - data: Self::Data, + data: &[Self::Data], schema: &EventSchema, time_partition: Option<&String>, schema_version: SchemaVersion, @@ -234,7 +234,7 @@ pub fn update_field_type_in_schema( inferred_schema: Arc, existing_schema: Option<&HashMap>>, time_partition: Option<&String>, - log_records: Option<&[Json]>, + log_records: Option<&Json>, schema_version: SchemaVersion, ) -> Arc { let mut updated_schema = inferred_schema.clone(); @@ -245,11 +245,9 @@ pub fn update_field_type_in_schema( updated_schema = override_existing_timestamp_fields(existing_schema, updated_schema); } - if let Some(log_records) = log_records { - for log_record in log_records { - updated_schema = - override_data_type(updated_schema.clone(), log_record.clone(), schema_version); - } + if let Some(log_record) = log_records { + updated_schema = + override_data_type(updated_schema.clone(), log_record.clone(), schema_version); } let Some(time_partition) = time_partition else { diff --git a/src/event/mod.rs b/src/event/mod.rs index f0297f61c..bdfe458e8 100644 --- a/src/event/mod.rs +++ b/src/event/mod.rs @@ -19,23 +19,19 @@ pub mod format; -use arrow::compute::concat_batches; use arrow_array::RecordBatch; use arrow_schema::{Field, Schema}; use itertools::Itertools; use std::sync::Arc; use 
self::error::EventError; -use crate::{ - metadata::update_stats, - parseable::{StagingError, Stream}, - storage::StreamType, -}; +use crate::{metadata::update_stats, parseable::Stream, storage::StreamType}; use chrono::NaiveDateTime; use std::collections::HashMap; pub const DEFAULT_TIMESTAMP_KEY: &str = "p_timestamp"; +#[derive(Debug)] pub struct PartitionEvent { pub rbs: Vec, pub schema: Arc, @@ -43,6 +39,7 @@ pub struct PartitionEvent { pub custom_partition_values: HashMap, } +#[derive(Debug)] pub struct Event { pub origin_format: &'static str, pub origin_size: usize, @@ -56,44 +53,43 @@ pub struct Event { impl Event { pub fn process(self, stream: &Stream) -> Result<(), EventError> { for (key, partition) in self.partitions { - let rb = - concat_batches(&partition.schema, &partition.rbs).map_err(StagingError::Arrow)?; if self.is_first_event { stream.commit_schema(partition.schema.as_ref().clone())?; } - - stream.push( - &key, - &rb, - partition.parsed_timestamp, - &partition.custom_partition_values, - self.stream_type, - )?; - - update_stats( - &stream.stream_name, - self.origin_format, - self.origin_size, - rb.num_rows(), - partition.parsed_timestamp.date(), - ); - - crate::livetail::LIVETAIL.process(&stream.stream_name, &rb); + for rb in partition.rbs { + stream.push( + &key, + &rb, + partition.parsed_timestamp, + &partition.custom_partition_values, + self.stream_type, + )?; + + update_stats( + &stream.stream_name, + self.origin_format, + self.origin_size, + rb.num_rows(), + partition.parsed_timestamp.date(), + ); + + crate::livetail::LIVETAIL.process(&stream.stream_name, &rb); + } } Ok(()) } pub fn process_unchecked(&self, stream: &Stream) -> Result<(), EventError> { for (key, partition) in &self.partitions { - let rb = - concat_batches(&partition.schema, &partition.rbs).map_err(StagingError::Arrow)?; - stream.push( - key, - &rb, - partition.parsed_timestamp, - &partition.custom_partition_values, - self.stream_type, - )?; + for rb in &partition.rbs { + stream.push( + key, + rb, + partition.parsed_timestamp, + &partition.custom_partition_values, + self.stream_type, + )?; + } } Ok(()) From d096ce0b82aefefa47902d5f458aeb7647ed7617 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Wed, 5 Mar 2025 00:43:32 +0530 Subject: [PATCH 32/39] perf: partition at json level --- src/event/format/json.rs | 82 ++++++++++++++++++---------- src/event/mod.rs | 52 ++++++------------ src/handlers/http/ingest.rs | 13 ++--- src/parseable/streams.rs | 105 +++++++++--------------------------- 4 files changed, 100 insertions(+), 152 deletions(-) diff --git a/src/event/format/json.rs b/src/event/format/json.rs index 381a76a9d..8c1403962 100644 --- a/src/event/format/json.rs +++ b/src/event/format/json.rs @@ -23,7 +23,7 @@ use anyhow::anyhow; use arrow_array::RecordBatch; use arrow_json::reader::{infer_json_schema_from_iterator, ReaderBuilder}; use arrow_schema::{DataType, Field, Fields, Schema}; -use chrono::{DateTime, NaiveDateTime, Utc}; +use chrono::{DateTime, NaiveDate, NaiveDateTime, Utc}; use datafusion::arrow::util::bit_util::round_upto_multiple_of_64; use itertools::Itertools; use opentelemetry_proto::tonic::{ @@ -47,9 +47,17 @@ use crate::{ utils::{ arrow::get_field, json::{flatten_json_body, Json}, + time::Minute, }, + OBJECT_STORE_DATA_GRANULARITY, }; +struct JsonPartition { + batch: Vec, + schema: Vec>, + date: NaiveDate, +} + pub struct Event { pub json: Value, pub origin_size: usize, @@ -248,7 +256,7 @@ impl EventFormat for Event { )?; let mut is_first_event = false; - let mut partitions = 
HashMap::new(); + let mut json_partitions = HashMap::new(); for json in data { let (schema, is_first) = Self::infer_schema( &json, @@ -257,6 +265,7 @@ impl EventFormat for Event { static_schema_flag, schema_version, )?; + is_first_event = is_first_event || is_first; let custom_partition_values = match custom_partitions.as_ref() { Some(custom_partitions) => { @@ -267,45 +276,60 @@ impl EventFormat for Event { }; let parsed_timestamp = match time_partition.as_ref() { - Some(time_partition) => extract_and_parse_time(&json, time_partition.as_ref())?, + Some(time_partition) => extract_and_parse_time(&json, time_partition)?, _ => p_timestamp.naive_utc(), }; - let batch = Self::into_recordbatch( - p_timestamp, - &[json], - &schema, - time_partition.as_ref(), - schema_version, - )?; - - let schema = batch.schema(); - let mut key = get_schema_key(&schema.fields); - if time_partition.is_some() { - let parsed_timestamp_to_min = parsed_timestamp.format("%Y%m%dT%H%M").to_string(); - key.push_str(&parsed_timestamp_to_min); - } - - for (k, v) in custom_partition_values.iter().sorted_by_key(|v| v.0) { - key.push_str(&format!("&{k}={v}")); - } + let prefix = format!( + "{}.{}.minute={}.{}", + get_schema_key(&schema), + parsed_timestamp.format("date=%Y-%m-%d.hour=%H"), + Minute::from(parsed_timestamp).to_slot(OBJECT_STORE_DATA_GRANULARITY), + custom_partition_values + .iter() + .sorted_by_key(|v| v.0) + .map(|(key, value)| format!("{key}={value}.")) + .join("") + ); - match partitions.get_mut(&key) { - Some(PartitionEvent { rbs, .. }) => rbs.push(batch), + match json_partitions.get_mut(&prefix) { + Some(JsonPartition { batch, .. }) => batch.push(json), _ => { - partitions.insert( - key, - PartitionEvent { - rbs: vec![batch], + let date = parsed_timestamp.date(); + let batch = vec![json]; + json_partitions.insert( + prefix, + JsonPartition { + batch, schema, - parsed_timestamp, - custom_partition_values, + date, }, ); } } } + let mut partitions = HashMap::new(); + for ( + prefix, + JsonPartition { + batch, + schema, + date, + }, + ) in json_partitions + { + let batch = Self::into_recordbatch( + p_timestamp, + &batch, + &schema, + time_partition.as_ref(), + schema_version, + )?; + + partitions.insert(prefix, PartitionEvent { rb: batch, date }); + } + Ok(super::Event { origin_format: "json", origin_size, diff --git a/src/event/mod.rs b/src/event/mod.rs index bdfe458e8..6f3918a01 100644 --- a/src/event/mod.rs +++ b/src/event/mod.rs @@ -20,23 +20,21 @@ pub mod format; use arrow_array::RecordBatch; -use arrow_schema::{Field, Schema}; +use arrow_schema::Field; use itertools::Itertools; use std::sync::Arc; use self::error::EventError; use crate::{metadata::update_stats, parseable::Stream, storage::StreamType}; -use chrono::NaiveDateTime; +use chrono::NaiveDate; use std::collections::HashMap; pub const DEFAULT_TIMESTAMP_KEY: &str = "p_timestamp"; #[derive(Debug)] pub struct PartitionEvent { - pub rbs: Vec, - pub schema: Arc, - pub parsed_timestamp: NaiveDateTime, - pub custom_partition_values: HashMap, + pub rb: RecordBatch, + pub date: NaiveDate, } #[derive(Debug)] @@ -52,44 +50,28 @@ pub struct Event { // Events holds the schema related to a each event for a single log stream impl Event { pub fn process(self, stream: &Stream) -> Result<(), EventError> { - for (key, partition) in self.partitions { + for (prefix, PartitionEvent { rb, date }) in self.partitions { if self.is_first_event { - stream.commit_schema(partition.schema.as_ref().clone())?; + stream.commit_schema(rb.schema().as_ref().clone())?; } - for rb in 
partition.rbs { - stream.push( - &key, - &rb, - partition.parsed_timestamp, - &partition.custom_partition_values, - self.stream_type, - )?; + stream.push(&prefix, &rb, self.stream_type)?; - update_stats( - &stream.stream_name, - self.origin_format, - self.origin_size, - rb.num_rows(), - partition.parsed_timestamp.date(), - ); + update_stats( + &stream.stream_name, + self.origin_format, + self.origin_size, + rb.num_rows(), + date, + ); - crate::livetail::LIVETAIL.process(&stream.stream_name, &rb); - } + crate::livetail::LIVETAIL.process(&stream.stream_name, &rb); } Ok(()) } pub fn process_unchecked(&self, stream: &Stream) -> Result<(), EventError> { - for (key, partition) in &self.partitions { - for rb in &partition.rbs { - stream.push( - key, - rb, - partition.parsed_timestamp, - &partition.custom_partition_values, - self.stream_type, - )?; - } + for (prefix, partition) in &self.partitions { + stream.push(prefix, &partition.rb, self.stream_type)?; } Ok(()) diff --git a/src/handlers/http/ingest.rs b/src/handlers/http/ingest.rs index d96236322..7a9897993 100644 --- a/src/handlers/http/ingest.rs +++ b/src/handlers/http/ingest.rs @@ -16,8 +16,6 @@ * */ -use std::collections::HashMap; - use actix_web::web::Path; use actix_web::{http::header::ContentType, HttpRequest, HttpResponse}; use arrow_array::RecordBatch; @@ -236,22 +234,19 @@ pub async fn post_event( } pub async fn push_logs_unchecked( - batch: RecordBatch, + rb: RecordBatch, stream: &Stream, ) -> Result { - let schema = batch.schema(); let unchecked_event = event::Event { origin_format: "json", origin_size: 0, time_partition: None, is_first_event: true, // NOTE: Maybe should be false partitions: [( - get_schema_key(&schema.fields), + get_schema_key(&rb.schema().fields), PartitionEvent { - rbs: vec![batch], - schema, - parsed_timestamp: Utc::now().naive_utc(), - custom_partition_values: HashMap::new(), // should be an empty map for unchecked push + rb, + date: Utc::now().date_naive(), }, )] .into_iter() diff --git a/src/parseable/streams.rs b/src/parseable/streams.rs index ab83fb609..314771714 100644 --- a/src/parseable/streams.rs +++ b/src/parseable/streams.rs @@ -30,7 +30,6 @@ use std::{ use arrow_array::RecordBatch; use arrow_schema::{Field, Fields, Schema}; -use chrono::{NaiveDateTime, Timelike}; use derive_more::{Deref, DerefMut}; use itertools::Itertools; use parquet::{ @@ -52,8 +51,7 @@ use crate::{ metrics, option::Mode, storage::{object_storage::to_bytes, retention::Retention, StreamType}, - utils::time::Minute, - LOCK_EXPECT, OBJECT_STORE_DATA_GRANULARITY, + LOCK_EXPECT, }; use super::{ @@ -112,60 +110,40 @@ impl Stream { // Concatenates record batches and puts them in memory store for each event. 
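// The `prefix` parameter below replaces the former (schema_key, parsed_timestamp,
// custom_partition_values) arguments: it is built in `src/event/format/json.rs`
// from the schema hash, the date/hour/minute slot and any custom partition values,
// and is used verbatim when naming the staged `.data.part` file.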
pub fn push( &self, - schema_key: &str, + prefix: &str, record: &RecordBatch, - parsed_timestamp: NaiveDateTime, - custom_partition_values: &HashMap, stream_type: StreamType, ) -> Result<(), StagingError> { let mut guard = self.writer.lock().unwrap(); if self.options.mode != Mode::Query || stream_type == StreamType::Internal { - match guard.disk.get_mut(schema_key) { + match guard.disk.get_mut(prefix) { Some(writer) => { writer.write(record)?; } None => { // entry is not present thus we create it - let path = self.path_by_current_time( - schema_key, - parsed_timestamp, - custom_partition_values, - ); + let path = self.path_by_current_time(prefix); std::fs::create_dir_all(&self.data_path)?; let mut writer = DiskWriter::new(path, &record.schema())?; writer.write(record)?; - guard.disk.insert(schema_key.to_owned(), writer); + guard.disk.insert(prefix.to_owned(), writer); } }; } - guard.mem.push(schema_key, record); + guard.mem.push(prefix, record); Ok(()) } - pub fn path_by_current_time( - &self, - stream_hash: &str, - parsed_timestamp: NaiveDateTime, - custom_partition_values: &HashMap, - ) -> PathBuf { + pub fn path_by_current_time(&self, prefix: &str) -> PathBuf { let mut hostname = hostname::get().unwrap().into_string().unwrap(); if let Some(id) = &self.ingestor_id { hostname.push_str(id); } - let filename = format!( - "{stream_hash}.date={}.hour={:02}.minute={}.{}{hostname}.data.part", - parsed_timestamp.date(), - parsed_timestamp.hour(), - Minute::from(parsed_timestamp).to_slot(OBJECT_STORE_DATA_GRANULARITY), - custom_partition_values - .iter() - .sorted_by_key(|v| v.0) - .map(|(key, value)| format!("{key}={value}.")) - .join("") - ); + + let filename = format!("{prefix}.{hostname}.data.part",); self.data_path.join(filename) } @@ -766,10 +744,12 @@ mod tests { use arrow_array::{Int32Array, StringArray, TimestampMillisecondArray}; use arrow_schema::{DataType, Field, TimeUnit}; - use chrono::{NaiveDate, TimeDelta, Utc}; + use chrono::{NaiveDate, NaiveDateTime, TimeDelta, Utc}; use temp_dir::TempDir; use tokio::time::sleep; + use crate::{utils::time::Minute, OBJECT_STORE_DATA_GRANULARITY}; + use super::*; #[test] @@ -865,41 +845,8 @@ mod tests { } #[test] - fn generate_correct_path_with_current_time_and_no_custom_partitioning() { - let stream_name = "test_stream"; - let stream_hash = "abc123"; - let parsed_timestamp = NaiveDate::from_ymd_opt(2023, 10, 1) - .unwrap() - .and_hms_opt(12, 30, 0) - .unwrap(); - let custom_partition_values = HashMap::new(); - - let options = Options::default(); - let staging = Stream::new( - Arc::new(options), - stream_name, - LogStreamMetadata::default(), - None, - ); - - let expected_path = staging.data_path.join(format!( - "{stream_hash}.date={}.hour={:02}.minute={}.{}.data.part", - parsed_timestamp.date(), - parsed_timestamp.hour(), - Minute::from(parsed_timestamp).to_slot(OBJECT_STORE_DATA_GRANULARITY), - hostname::get().unwrap().into_string().unwrap() - )); - - let generated_path = - staging.path_by_current_time(stream_hash, parsed_timestamp, &custom_partition_values); - - assert_eq!(generated_path, expected_path); - } - - #[test] - fn generate_correct_path_with_current_time_and_custom_partitioning() { + fn generate_correct_path() { let stream_name = "test_stream"; - let stream_hash = "abc123"; let parsed_timestamp = NaiveDate::from_ymd_opt(2023, 10, 1) .unwrap() .and_hms_opt(12, 30, 0) @@ -908,6 +855,12 @@ mod tests { custom_partition_values.insert("key1".to_string(), "value1".to_string()); custom_partition_values.insert("key2".to_string(), 
"value2".to_string()); + let prefix = format!( + "abc123.{}.minute={}.key1=value1.key2=value2", + parsed_timestamp.format("date={%Y-%m-%d}.hour={%H}"), + Minute::from(parsed_timestamp).to_slot(OBJECT_STORE_DATA_GRANULARITY), + ); + let options = Options::default(); let staging = Stream::new( Arc::new(options), @@ -917,15 +870,11 @@ mod tests { ); let expected_path = staging.data_path.join(format!( - "{stream_hash}.date={}.hour={:02}.minute={}.key1=value1.key2=value2.{}.data.part", - parsed_timestamp.date(), - parsed_timestamp.hour(), - Minute::from(parsed_timestamp).to_slot(OBJECT_STORE_DATA_GRANULARITY), + "{prefix}.{}.data.part", hostname::get().unwrap().into_string().unwrap() )); - let generated_path = - staging.path_by_current_time(stream_hash, parsed_timestamp, &custom_partition_values); + let generated_path = staging.path_by_current_time(&prefix); assert_eq!(generated_path, expected_path); } @@ -965,6 +914,10 @@ mod tests { .checked_sub_signed(TimeDelta::minutes(mins)) .unwrap() .naive_utc(); + let prefix = format!( + "abc.{}.key1=value1.key2=value2", + time.format("date=%Y-%m-%d.hour=%H.minute=%M") + ); let batch = RecordBatch::try_new( Arc::new(schema.clone()), vec![ @@ -975,13 +928,7 @@ mod tests { ) .unwrap(); staging - .push( - "abc", - &batch, - time, - &HashMap::new(), - StreamType::UserDefined, - ) + .push(&prefix, &batch, StreamType::UserDefined) .unwrap(); staging.flush(); } From e15b0d2a7d3773ac02cacc89ab76b4ffb41a0f91 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Wed, 5 Mar 2025 00:50:38 +0530 Subject: [PATCH 33/39] style: deepsource suggestion --- src/event/format/json.rs | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/src/event/format/json.rs b/src/event/format/json.rs index 8c1403962..19544d80d 100644 --- a/src/event/format/json.rs +++ b/src/event/format/json.rs @@ -292,20 +292,17 @@ impl EventFormat for Event { .join("") ); - match json_partitions.get_mut(&prefix) { - Some(JsonPartition { batch, .. }) => batch.push(json), - _ => { - let date = parsed_timestamp.date(); - let batch = vec![json]; - json_partitions.insert( - prefix, - JsonPartition { - batch, - schema, - date, - }, - ); - } + if let Some(JsonPartition { batch, .. 
}) = json_partitions.get_mut(&prefix) { + batch.push(json) + } else { + json_partitions.insert( + prefix, + JsonPartition { + batch: vec![json], + schema, + date: parsed_timestamp.date(), + }, + ); } } From ef27f97767a49f1380d9fe1f09525a9281328818 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Mon, 10 Mar 2025 12:44:14 +0530 Subject: [PATCH 34/39] chore: remove unused --- src/handlers/http/cluster/utils.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/handlers/http/cluster/utils.rs b/src/handlers/http/cluster/utils.rs index ae72cf300..9256525c6 100644 --- a/src/handlers/http/cluster/utils.rs +++ b/src/handlers/http/cluster/utils.rs @@ -31,7 +31,6 @@ use actix_web::{ use bytes::BytesMut; use chrono::{DateTime, Utc}; use futures::StreamExt; -use itertools::Itertools; use serde::{de::DeserializeOwned, Deserialize, Serialize}; use tracing::error; use url::Url; From b8606b35be9b44fce1f62f120841d51b80eef74b Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Mon, 10 Mar 2025 13:25:58 +0530 Subject: [PATCH 35/39] fix: custom partitioned file names --- src/event/format/json.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/event/format/json.rs b/src/event/format/json.rs index 19544d80d..7a959717f 100644 --- a/src/event/format/json.rs +++ b/src/event/format/json.rs @@ -288,8 +288,8 @@ impl EventFormat for Event { custom_partition_values .iter() .sorted_by_key(|v| v.0) - .map(|(key, value)| format!("{key}={value}.")) - .join("") + .map(|(key, value)| format!("{key}={value}")) + .join(".") ); if let Some(JsonPartition { batch, .. }) = json_partitions.get_mut(&prefix) { From fbd3a18e5feb6e657c9aa2c01e527b5087fb313d Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Mon, 10 Mar 2025 21:40:50 +0530 Subject: [PATCH 36/39] perf: use a buffer --- src/parseable/staging/writer.rs | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/parseable/staging/writer.rs b/src/parseable/staging/writer.rs index 2dbfe1e49..5e5985aa5 100644 --- a/src/parseable/staging/writer.rs +++ b/src/parseable/staging/writer.rs @@ -18,10 +18,7 @@ */ use std::{ - collections::{HashMap, HashSet}, - fs::{File, OpenOptions}, - path::PathBuf, - sync::Arc, + collections::{HashMap, HashSet}, fs::{File, OpenOptions}, io::BufWriter, path::PathBuf, sync::Arc }; use arrow_array::RecordBatch; @@ -42,7 +39,7 @@ pub struct Writer { } pub struct DiskWriter { - pub inner: StreamWriter, + pub inner: StreamWriter>, pub path: PathBuf, } @@ -50,7 +47,7 @@ impl DiskWriter { pub fn new(path: PathBuf, schema: &Schema) -> Result { let file = OpenOptions::new().create(true).append(true).open(&path)?; - let inner = StreamWriter::try_new(file, schema)?; + let inner = StreamWriter::try_new_buffered(file, schema)?; Ok(Self { inner, path }) } From c2f67691a788c40b8be5d99798abaab0954770db Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Mon, 10 Mar 2025 21:47:42 +0530 Subject: [PATCH 37/39] refactor: drop to flush --- src/parseable/staging/writer.rs | 12 +++++++++++- src/parseable/streams.rs | 17 +++++------------ 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/src/parseable/staging/writer.rs b/src/parseable/staging/writer.rs index 5e5985aa5..1e299068a 100644 --- a/src/parseable/staging/writer.rs +++ b/src/parseable/staging/writer.rs @@ -18,7 +18,11 @@ */ use std::{ - collections::{HashMap, HashSet}, fs::{File, OpenOptions}, io::BufWriter, path::PathBuf, sync::Arc + collections::{HashMap, HashSet}, + fs::{File, OpenOptions}, + io::BufWriter, + path::PathBuf, + sync::Arc, }; use 
arrow_array::RecordBatch; @@ -70,6 +74,12 @@ impl DiskWriter { } } +impl Drop for DiskWriter { + fn drop(&mut self) { + self.finish(); + } +} + /// Structure to keep recordbatches in memory. /// /// Any new schema is updated in the schema map. diff --git a/src/parseable/streams.rs b/src/parseable/streams.rs index 314771714..c367c8b32 100644 --- a/src/parseable/streams.rs +++ b/src/parseable/streams.rs @@ -325,18 +325,11 @@ impl Stream { } pub fn flush(&self) { - let mut disk_writers = { - let mut writer = self.writer.lock().unwrap(); - // Flush memory - writer.mem.clear(); - // Take schema -> disk writer mapping - std::mem::take(&mut writer.disk) - }; - - // Flush disk - for writer in disk_writers.values_mut() { - writer.finish(); - } + let mut writer = self.writer.lock().unwrap(); + // Flush memory + writer.mem.clear(); + // Drop DiskWirters to flush all streams in memory + drop(std::mem::take(&mut writer.disk)) } fn parquet_writer_props( From 592d3a136a9d2f5be2ba508ada42f5a2dd6f1b7b Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Mon, 10 Mar 2025 22:05:42 +0530 Subject: [PATCH 38/39] fix & test: prefix generation --- src/event/format/json.rs | 79 ++++++++++++++++++++++++++++++++++------ 1 file changed, 67 insertions(+), 12 deletions(-) diff --git a/src/event/format/json.rs b/src/event/format/json.rs index 7a959717f..a27ce180b 100644 --- a/src/event/format/json.rs +++ b/src/event/format/json.rs @@ -280,18 +280,7 @@ impl EventFormat for Event { _ => p_timestamp.naive_utc(), }; - let prefix = format!( - "{}.{}.minute={}.{}", - get_schema_key(&schema), - parsed_timestamp.format("date=%Y-%m-%d.hour=%H"), - Minute::from(parsed_timestamp).to_slot(OBJECT_STORE_DATA_GRANULARITY), - custom_partition_values - .iter() - .sorted_by_key(|v| v.0) - .map(|(key, value)| format!("{key}={value}")) - .join(".") - ); - + let prefix = generate_prefix(&schema, parsed_timestamp, &custom_partition_values); if let Some(JsonPartition { batch, .. }) = json_partitions.get_mut(&prefix) { batch.push(json) } else { @@ -338,6 +327,24 @@ impl EventFormat for Event { } } +fn generate_prefix( + schema: &[Arc], + parsed_timestamp: NaiveDateTime, + custom_partition_values: &HashMap, +) -> String { + format!( + "{}.{}.minute={}{}", + get_schema_key(schema), + parsed_timestamp.format("date=%Y-%m-%d.hour=%H"), + Minute::from(parsed_timestamp).to_slot(OBJECT_STORE_DATA_GRANULARITY), + custom_partition_values + .iter() + .sorted_by_key(|v| v.0) + .map(|(key, value)| format!(".{key}={value}")) + .join("") + ) +} + /// Extracts custom partition values from provided JSON object /// e.g. 
`json: {"status": 400, "msg": "Hello, World!"}, custom_partition_list: ["status"]` returns `{"status" => 400}` pub fn extract_custom_partition_values( @@ -463,6 +470,7 @@ mod tests { use arrow::datatypes::Int64Type; use arrow_array::{ArrayRef, Float64Array, Int64Array, ListArray, StringArray}; + use chrono::Timelike; use serde_json::json; use super::*; @@ -976,4 +984,51 @@ mod tests { &Float64Array::from(vec![None, None, None, Some(2.0)]) ); } + + #[test] + fn generate_correct_prefix_with_current_time_and_no_custom_partitioning() { + let schema = vec![]; + let parsed_timestamp = NaiveDate::from_ymd_opt(2023, 10, 1) + .unwrap() + .and_hms_opt(12, 30, 0) + .unwrap(); + let custom_partition_values = HashMap::new(); + + let expected = format!( + "{}.date={}.hour={:02}.minute={}", + get_schema_key(&schema), + parsed_timestamp.date(), + parsed_timestamp.hour(), + Minute::from(parsed_timestamp).to_slot(OBJECT_STORE_DATA_GRANULARITY), + ); + + let generated = generate_prefix(&schema, parsed_timestamp, &custom_partition_values); + + assert_eq!(generated, expected); + } + + #[test] + fn generate_correct_prefix_with_current_time_and_custom_partitioning() { + let schema = vec![]; + let parsed_timestamp = NaiveDate::from_ymd_opt(2023, 10, 1) + .unwrap() + .and_hms_opt(12, 30, 0) + .unwrap(); + let custom_partition_values = HashMap::from_iter([ + ("key1".to_string(), "value1".to_string()), + ("key2".to_string(), "value2".to_string()), + ]); + + let expected = format!( + "{}.date={}.hour={:02}.minute={}.key1=value1.key2=value2", + get_schema_key(&schema), + parsed_timestamp.date(), + parsed_timestamp.hour(), + Minute::from(parsed_timestamp).to_slot(OBJECT_STORE_DATA_GRANULARITY), + ); + + let generated = generate_prefix(&schema, parsed_timestamp, &custom_partition_values); + + assert_eq!(generated, expected); + } } From 3d8c33818aa2e0c516335f79c416e992ea135bf7 Mon Sep 17 00:00:00 2001 From: Devdutt Shenoi Date: Wed, 19 Mar 2025 00:26:38 +0530 Subject: [PATCH 39/39] spinoff #1251 --- src/event/format/mod.rs | 3 ++- src/utils/arrow/mod.rs | 12 +++++++----- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/event/format/mod.rs b/src/event/format/mod.rs index 40697c4ff..df73475b1 100644 --- a/src/event/format/mod.rs +++ b/src/event/format/mod.rs @@ -187,7 +187,8 @@ pub trait EventFormat: Sized { rb = replace_columns( rb.schema(), &rb, - &[(0, Arc::new(get_timestamp_array(p_timestamp, rb.num_rows())))], + &[0], + &[Arc::new(get_timestamp_array(p_timestamp, rb.num_rows()))], ); Ok(rb) diff --git a/src/utils/arrow/mod.rs b/src/utils/arrow/mod.rs index a11186ee0..53e6437d6 100644 --- a/src/utils/arrow/mod.rs +++ b/src/utils/arrow/mod.rs @@ -61,7 +61,8 @@ use serde_json::{Map, Value}; /// /// * `schema` - The schema of the record batch. /// * `batch` - The record batch to modify. -/// * `indexed_arrays` - A list of indexes and arrays to replace the columns indexed with. +/// * `indexes` - The indexes of the columns to replace. +/// * `arrays` - The new arrays to replace the columns with. 
 ///
 /// # Returns
 ///
 /// The modified record batch with the columns replaced.
 pub fn replace_columns(
     schema: Arc<Schema>,
     batch: &RecordBatch,
-    indexed_arrays: &[(usize, Arc<dyn Array>)],
+    indexes: &[usize],
+    arrays: &[Arc<dyn Array>],
 ) -> RecordBatch {
     let mut batch_arrays = batch.columns().iter().map(Arc::clone).collect_vec();
-    for (index, arr) in indexed_arrays {
-        batch_arrays[*index] = Arc::clone(arr);
+    for (&index, arr) in indexes.iter().zip(arrays.iter()) {
+        batch_arrays[index] = Arc::clone(arr);
     }
     RecordBatch::try_new(schema, batch_arrays).unwrap()
 }
@@ -176,7 +178,7 @@ mod tests {
 
         let arr: Arc<dyn Array> = Arc::new(Int32Array::from_value(0, 3));
 
-        let new_rb = replace_columns(schema_ref.clone(), &rb, &[(2, arr)]);
+        let new_rb = replace_columns(schema_ref.clone(), &rb, &[2], &[arr]);
 
         assert_eq!(new_rb.schema(), schema_ref);
         assert_eq!(new_rb.num_columns(), 3);
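
// A minimal sketch of how the pieces introduced in this series fit together,
// assuming the signatures shown in the diffs above (to_data, infer_schema,
// into_recordbatch, the Stream getters and stream.push). `stream`, `json`,
// `origin_size` and `log_source` are assumed to be in scope, the `use`
// statements from src/event/format/json.rs are elided, and the snippet is
// imagined inside a function returning anyhow::Result<()>. The real
// `into_event` additionally groups objects by prefix before decoding, so one
// RecordBatch is produced per partition rather than per object.
let event = json::Event::new(json, origin_size, log_source);

// 1. Flattening no longer touches the schema: one Json object per log line.
let objects = event.to_data(
    stream.get_time_partition().as_ref(),
    stream.get_time_partition_limit(),
    stream.get_custom_partition().as_ref(),
    stream.get_schema_version(),
)?;

for obj in &objects {
    // 2. The schema is inferred and validated per object against the stored schema.
    let (schema, _is_first_event) = json::Event::infer_schema(
        obj,
        &stream.get_schema_raw(),
        stream.get_time_partition().as_ref(),
        stream.get_static_schema_flag(),
        stream.get_schema_version(),
    )?;

    // 3. Objects are decoded into RecordBatches and staged under their prefix.
    let rb = json::Event::into_recordbatch(
        Utc::now(),
        std::slice::from_ref(obj),
        &schema,
        stream.get_time_partition().as_ref(),
        stream.get_schema_version(),
    )?;
    // ...then pushed via `stream.push(&prefix, &rb, StreamType::UserDefined)`
    // and accounted for in `update_stats`, as in `Event::process`.
}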