@@ -19,7 +19,7 @@

 use std::{
     collections::HashMap,
-    fs::{remove_file, File, OpenOptions},
+    fs::{self, remove_file, File, OpenOptions},
     path::{Path, PathBuf},
     process,
     sync::{Arc, Mutex, RwLock},
@@ -42,15 +42,17 @@ use parquet::{
     schema::types::ColumnPath,
 };
 use rand::distributions::DistString;
-use tracing::{error, trace};
+use relative_path::RelativePathBuf;
+use tracing::{error, info, trace, warn};

 use crate::{
     cli::Options,
     event::DEFAULT_TIMESTAMP_KEY,
     handlers::http::modal::ingest_server::INGESTOR_META,
+    metadata::{LOCK_EXPECT, STREAM_INFO},
     metrics,
     option::{Mode, CONFIG},
-    storage::{StreamType, OBJECT_STORE_DATA_GRANULARITY},
+    storage::{object_storage::to_bytes, StreamType, OBJECT_STORE_DATA_GRANULARITY},
     utils::minute_to_slot,
 };

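Note on the new imports: `relative_path::RelativePathBuf` is what the conversion code below uses to assemble the stream's schema-file path in a platform-independent way before anchoring it to the staging directory. A minimal sketch of that API (the stream name and base directory here are invented for illustration, not Parseable's actual layout):

use relative_path::RelativePathBuf;

fn main() {
    // Build "mystream.schema" as a platform-independent relative path,
    // then join it onto a concrete base directory, mirroring how
    // prepare_parquet locates the schema file under its data_path.
    let rel = RelativePathBuf::from_iter(["mystream.schema"]);
    let abs = rel.to_path("/staging/mystream");
    assert!(abs.ends_with("mystream.schema"));
}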
@@ -278,6 +280,62 @@ impl<'a> Stream<'a> {
         parquet_path
     }

+    /// Converts arrow files in staging into parquet files; only files from past minutes are converted unless `shutdown_signal` is set
+    fn prepare_parquet(&self, shutdown_signal: bool) -> Result<(), StagingError> {
+        info!(
+            "Starting arrow_conversion job for stream- {}",
+            self.stream_name
+        );
+
+        let time_partition = STREAM_INFO.get_time_partition(&self.stream_name)?;
+        let custom_partition = STREAM_INFO.get_custom_partition(&self.stream_name)?;
+
+        // read arrow files on disk
+        // convert them to parquet
+        let schema = self
+            .convert_disk_files_to_parquet(
+                time_partition.as_ref(),
+                custom_partition.as_ref(),
+                shutdown_signal,
+            )
+            .inspect_err(|err| warn!("Error while converting arrow to parquet- {err:?}"))?;
+
+        // check if there is already a schema file in staging pertaining to this stream;
+        // if yes, merge them and save
+
+        if let Some(schema) = schema {
+            let static_schema_flag = STREAM_INFO.get_static_schema_flag(&self.stream_name)?;
+            if !static_schema_flag {
+                // schema is dynamic, read from staging and merge if present
+
+                // need to add something before .schema to make the file have an extension of type `schema`
+                let path = RelativePathBuf::from_iter([format!("{}.schema", self.stream_name)])
+                    .to_path(&self.data_path);
+
+                let staging_schemas = self.get_schemas_if_present();
+                if let Some(mut staging_schemas) = staging_schemas {
+                    warn!(
+                        "Found {} schemas in staging for stream- {}",
+                        staging_schemas.len(),
+                        self.stream_name
+                    );
+                    staging_schemas.push(schema);
+                    let merged_schema = Schema::try_merge(staging_schemas)?;
+
+                    warn!("writing merged schema to path- {path:?}");
+                    // save the merged schema on staging disk;
+                    // the path should be stream/.ingestor.id.schema
+                    fs::write(path, to_bytes(&merged_schema))?;
+                } else {
+                    info!("writing single schema to path- {path:?}");
+                    fs::write(path, to_bytes(&schema))?;
+                }
+            }
+        }
+
+        Ok(())
+    }
+
     pub fn recordbatches_cloned(&self, schema: &Arc<Schema>) -> Vec<RecordBatch> {
         self.writer.lock().unwrap().mem.recordbatch_cloned(schema)
     }
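The schema-merge step above leans on `Schema::try_merge` from the arrow schema crate (assumed here to be `arrow_schema`), which unions field lists across schemas and fails if a field reappears with a conflicting type. A self-contained sketch of that behavior (field names and types are invented for illustration):

use arrow_schema::{ArrowError, DataType, Field, Schema};

fn main() -> Result<(), ArrowError> {
    // A schema previously found staged on disk for this stream...
    let staged = Schema::new(vec![Field::new("id", DataType::Int64, true)]);
    // ...and the schema derived from the latest arrow-to-parquet conversion.
    let derived = Schema::new(vec![
        Field::new("id", DataType::Int64, true),
        Field::new("msg", DataType::Utf8, true),
    ]);

    // Fields are unioned by name; had "id" come back as Utf8 instead of
    // Int64, try_merge would return an error rather than guess.
    let merged = Schema::try_merge([staged, derived])?;
    assert_eq!(merged.fields().len(), 2);
    Ok(())
}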
@@ -506,6 +564,21 @@ impl Streams {
             staging.flush()
         }
     }
+
+    /// Convert arrow files into parquet, preparing them for upload
+    pub fn prepare_parquet(&self, shutdown_signal: bool) -> Result<(), StagingError> {
+        if !Path::new(&CONFIG.staging_dir()).exists() {
+            return Ok(());
+        }
+
+        for stream in self.read().expect(LOCK_EXPECT).values() {
+            stream
+                .prepare_parquet(shutdown_signal)
+                .inspect_err(|err| error!("Failed to run conversion task {err:?}"))?;
+        }
+
+        Ok(())
+    }
 }

 #[cfg(test)]
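For context on how this entry point gets exercised (the driver below is hypothetical, not part of this diff): a recurring task would call it with `shutdown_signal == false` so arrow files still being written in the current minute are left alone, while the shutdown path passes `true` to convert everything that remains:

// Hypothetical driver, for illustration only; `streams` is the
// Streams registry whose method is added above.
fn convert_staged_data(streams: &Streams, shutdown_signal: bool) {
    // false: skip the current minute's arrow files, they may still be written to
    // true:  convert everything so no staged data is stranded on exit
    if let Err(err) = streams.prepare_parquet(shutdown_signal) {
        tracing::error!("arrow to parquet conversion failed: {err:?}");
    }
}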