Commit a8f3b61

Restructure deltacat metrics to allow for and use decorators
1 parent 634bfcb

22 files changed: +345 -202 lines

deltacat/compute/compactor/compaction_session.py (+27 -9)

@@ -53,6 +53,7 @@
 from deltacat.compute.compactor.model.compactor_version import CompactorVersion
 from deltacat.compute.compactor.utils.sort_key import validate_sort_keys
 from deltacat.utils.resources import get_current_node_peak_memory_usage_in_bytes
+from deltacat.utils.metrics import MetricsConfigSingleton


 if importlib.util.find_spec("memray"):
@@ -139,6 +140,10 @@ def compact_partition(
     if s3_client_kwargs is None:
         s3_client_kwargs = {}

+    if metrics_config:
+        # Initialize MetricsConfigSingleton
+        MetricsConfigSingleton.instance(metrics_config)
+
     # memray official documentation link:
     # https://bloomberg.github.io/memray/getting_started.html
     with memray.Tracker(
@@ -166,7 +171,6 @@ def compact_partition(
             rebase_source_partition_locator,
             rebase_source_partition_high_watermark,
             enable_profiler,
-            metrics_config,
             list_deltas_kwargs,
             read_kwargs_provider,
             s3_table_writer_kwargs,
@@ -217,7 +221,6 @@ def _execute_compaction_round(
     rebase_source_partition_locator: Optional[PartitionLocator],
     rebase_source_partition_high_watermark: Optional[int],
     enable_profiler: Optional[bool],
-    metrics_config: Optional[MetricsConfig],
     list_deltas_kwargs: Optional[Dict[str, Any]],
     read_kwargs_provider: Optional[ReadKwargsProvider],
     s3_table_writer_kwargs: Optional[Dict[str, Any]],
@@ -234,6 +237,13 @@ def _execute_compaction_round(
         if rebase_source_partition_locator
         else source_partition_locator
     )
+
+    # We need to pass in metrics_config to each step, as they are run on separate processes
+    try:
+        metrics_config = MetricsConfigSingleton.instance().metrics_config
+    except Exception:
+        metrics_config = None
+
     base_audit_url = rcf_source_partition_locator.path(
         f"s3://{compaction_artifact_s3_bucket}/compaction-audit"
     )
@@ -438,11 +448,11 @@ def _execute_compaction_round(
         num_buckets=hash_bucket_count,
         num_groups=max_parallelism,
         enable_profiler=enable_profiler,
-        metrics_config=metrics_config,
         read_kwargs_provider=read_kwargs_provider,
         object_store=object_store,
         deltacat_storage=deltacat_storage,
         deltacat_storage_kwargs=deltacat_storage_kwargs,
+        metrics_config=metrics_config,
         **kwargs,
     )
     hb_invoke_end = time.monotonic()
@@ -453,7 +463,8 @@ def _execute_compaction_round(
     hb_end = time.monotonic()
     hb_results_retrieved_at = time.time()

-    telemetry_time_hb = compaction_audit.save_step_stats(
+    cluster_util_after_task_latency = 0
+    cluster_util_after_task_latency += compaction_audit.save_step_stats(
         CompactionSessionAuditInfo.HASH_BUCKET_STEP_NAME,
         hb_results,
         hb_results_retrieved_at,
@@ -528,8 +539,8 @@ def _execute_compaction_round(
         sort_keys=sort_keys,
         num_materialize_buckets=num_materialize_buckets,
         enable_profiler=enable_profiler,
-        metrics_config=metrics_config,
         object_store=object_store,
+        metrics_config=metrics_config,
     )

     dedupe_invoke_end = time.monotonic()
@@ -546,7 +557,7 @@ def _execute_compaction_round(
     total_dd_record_count = sum([ddr.deduped_record_count for ddr in dd_results])
     logger.info(f"Deduped {total_dd_record_count} records...")

-    telemetry_time_dd = compaction_audit.save_step_stats(
+    cluster_util_after_task_latency += compaction_audit.save_step_stats(
         CompactionSessionAuditInfo.DEDUPE_STEP_NAME,
         dd_results,
         dedupe_results_retrieved_at,
@@ -605,12 +616,12 @@ def _execute_compaction_round(
         max_records_per_output_file=records_per_compacted_file,
         compacted_file_content_type=compacted_file_content_type,
         enable_profiler=enable_profiler,
-        metrics_config=metrics_config,
         read_kwargs_provider=read_kwargs_provider,
         s3_table_writer_kwargs=s3_table_writer_kwargs,
         object_store=object_store,
         deltacat_storage=deltacat_storage,
         deltacat_storage_kwargs=deltacat_storage_kwargs,
+        metrics_config=metrics_config,
     )

     materialize_invoke_end = time.monotonic()
@@ -623,7 +634,7 @@ def _execute_compaction_round(
     materialize_end = time.monotonic()
     materialize_results_retrieved_at = time.time()

-    telemetry_time_materialize = compaction_audit.save_step_stats(
+    cluster_util_after_task_latency += compaction_audit.save_step_stats(
         CompactionSessionAuditInfo.MATERIALIZE_STEP_NAME,
         mat_results,
         materialize_results_retrieved_at,
@@ -685,8 +696,15 @@ def _execute_compaction_round(
         session_peak_memory
     )

+    metrics_telemetry_time = 0
+    try:
+        metrics_telemetry_time = MetricsConfigSingleton.instance().total_telemetry_time
+    except Exception as e:
+        logger.warn(
+            f"Skipping calculating metrics telemetry time due to exception: {e}"
+        )
     compaction_audit.save_round_completion_stats(
-        mat_results, telemetry_time_hb + telemetry_time_dd + telemetry_time_materialize
+        mat_results, cluster_util_after_task_latency + metrics_telemetry_time
     )

     s3_utils.upload(

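Both compaction entry points now seed a process-wide MetricsConfigSingleton and later read metrics_config and total_telemetry_time back from it, as the hunks above show. The class itself lives in deltacat/utils/metrics.py and is not part of this excerpt; the snippet below is only a minimal sketch of the interface these call sites appear to assume (an initialize-once instance(), plus metrics_config and total_telemetry_time accessors), not the actual implementation, and update_telemetry_time is a hypothetical helper name.

# Minimal sketch only: mirrors how MetricsConfigSingleton is called in this commit.
# The real class in deltacat/utils/metrics.py may differ in detail;
# update_telemetry_time is a hypothetical helper name.
from typing import Any, Optional


class MetricsConfigSingleton:
    _instance: Optional["MetricsConfigSingleton"] = None

    def __init__(self, metrics_config: Any):
        self._metrics_config = metrics_config
        self._total_telemetry_time = 0.0

    @classmethod
    def instance(cls, metrics_config: Any = None) -> "MetricsConfigSingleton":
        # First call must supply a config; later calls return the cached instance.
        if cls._instance is None:
            if metrics_config is None:
                raise ValueError("MetricsConfigSingleton has not been initialized")
            cls._instance = cls(metrics_config)
        return cls._instance

    @property
    def metrics_config(self) -> Any:
        return self._metrics_config

    @property
    def total_telemetry_time(self) -> float:
        # Total time spent emitting metrics on this process, reported once per round.
        return self._total_telemetry_time

    def update_telemetry_time(self, seconds: float) -> None:
        self._total_telemetry_time += seconds

Under this sketch, calling MetricsConfigSingleton.instance() before any entry point has initialized it raises, which is why _execute_compaction_round wraps the lookup in try/except and falls back to metrics_config = None.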
deltacat/compute/compactor/model/compaction_session_audit_info.py (+1 -6)

@@ -832,7 +832,6 @@ def save_step_stats(
             f"{step_name}PostObjectStoreMemoryUsedBytes"
         ] = cluster_utilization_after_task.used_object_store_memory_bytes

-        telemetry_time = 0
         if task_results:
             last_task_completed_at = max(
                 result.task_completed_at for result in task_results
@@ -846,13 +845,9 @@ def save_step_stats(
                 result.peak_memory_usage_bytes for result in task_results
             )

-            telemetry_time = sum(
-                result.telemetry_time_in_seconds for result in task_results
-            )
-
             self[f"{step_name}TaskPeakMemoryUsedBytes"] = peak_task_memory.item()

-        return cluster_util_after_task_latency + telemetry_time
+        return cluster_util_after_task_latency

     def save_round_completion_stats(
         self, mat_results: List[MaterializeResult], total_telemetry_time: float

deltacat/compute/compactor/model/dedupe_result.py (+0 -1)

@@ -7,5 +7,4 @@ class DedupeResult(NamedTuple):
     mat_bucket_idx_to_obj_id: Dict[int, Tuple]
     deduped_record_count: np.int64
     peak_memory_usage_bytes: np.double
-    telemetry_time_in_seconds: np.double
     task_completed_at: np.double

deltacat/compute/compactor/model/hash_bucket_result.py (+0 -1)

@@ -7,5 +7,4 @@ class HashBucketResult(NamedTuple):
     hash_bucket_group_to_obj_id: np.ndarray
     hb_record_count: np.int64
     peak_memory_usage_bytes: np.double
-    telemetry_time_in_seconds: np.double
     task_completed_at: np.double

deltacat/compute/compactor/model/materialize_result.py (+0 -6)

@@ -16,7 +16,6 @@ def of(
         pyarrow_write_result: PyArrowWriteResult,
         referenced_pyarrow_write_result: Optional[PyArrowWriteResult] = None,
         peak_memory_usage_bytes: Optional[np.double] = None,
-        telemetry_time_in_seconds: Optional[np.double] = None,
         task_completed_at: Optional[np.double] = None,
     ) -> MaterializeResult:
         materialize_result = MaterializeResult()
@@ -25,7 +24,6 @@ def of(
         materialize_result["paWriteResult"] = pyarrow_write_result
         materialize_result["referencedPaWriteResult"] = referenced_pyarrow_write_result
         materialize_result["peakMemoryUsageBytes"] = peak_memory_usage_bytes
-        materialize_result["telemetryTimeInSeconds"] = telemetry_time_in_seconds
         materialize_result["taskCompletedAt"] = task_completed_at
         return materialize_result

@@ -44,10 +42,6 @@ def task_index(self) -> int:
     def peak_memory_usage_bytes(self) -> Optional[np.double]:
         return self["peakMemoryUsageBytes"]

-    @property
-    def telemetry_time_in_seconds(self) -> Optional[np.double]:
-        return self["telemetryTimeInSeconds"]
-
     @property
     def pyarrow_write_result(self) -> PyArrowWriteResult:
         val: Dict[str, Any] = self.get("paWriteResult")

deltacat/compute/compactor/repartition_session.py (+4 -2)

@@ -30,7 +30,7 @@
     PartitionLocator,
     interface as unimplemented_deltacat_storage,
 )
-from deltacat.utils.metrics import MetricsConfig
+from deltacat.utils.metrics import MetricsConfig, MetricsConfigSingleton
 from deltacat.compute.compactor.utils.sort_key import validate_sort_keys

 logger = logs.configure_deltacat_logger(logging.getLogger(__name__))
@@ -59,6 +59,9 @@ def repartition(
     deltacat_storage=unimplemented_deltacat_storage,
     **kwargs,
 ) -> Optional[str]:
+    # Initialize MetricsConfigSingleton
+    if metrics_config:
+        MetricsConfigSingleton.instance(metrics_config)

     node_resource_keys = None
     if pg_config: # use resource in each placement group
@@ -130,7 +133,6 @@ def repartition(
         max_records_per_output_file=records_per_repartitioned_file,
         destination_partition=partition,
         enable_profiler=enable_profiler,
-        metrics_config=metrics_config,
         read_kwargs_provider=read_kwargs_provider,
         s3_table_writer_kwargs=s3_table_writer_kwargs,
         repartitioned_file_content_type=repartitioned_file_content_type,

deltacat/compute/compactor/steps/dedupe.py (+15 -26)

@@ -23,7 +23,11 @@
     get_current_ray_worker_id,
 )
 from deltacat.utils.performance import timed_invocation
-from deltacat.utils.metrics import emit_timer_metrics, MetricsConfig
+from deltacat.utils.metrics import (
+    emit_timer_metrics,
+    MetricsConfig,
+    MetricsConfigSingleton,
+)
 from deltacat.io.object_store import IObjectStore
 from deltacat.utils.resources import get_current_node_peak_memory_usage_in_bytes

@@ -100,6 +104,7 @@ def delta_file_locator_to_mat_bucket_index(
     return int.from_bytes(digest, "big") % materialize_bucket_count


+@emit_timer_metrics(metrics_name="dedupe")
 def _timed_dedupe(
     object_ids: List[Any],
     sort_keys: List[SortKey],
@@ -108,7 +113,8 @@ def _timed_dedupe(
     enable_profiler: bool,
     object_store: Optional[IObjectStore],
     **kwargs,
-):
+) -> DedupeResult:
+    logger.info(f"[Dedupe task {dedupe_task_index}] Starting dedupe task...")
     task_id = get_current_ray_task_id()
     worker_id = get_current_ray_worker_id()
     with memray.Tracker(
@@ -229,11 +235,11 @@ def _timed_dedupe(
         )

         peak_memory_usage_bytes = get_current_node_peak_memory_usage_in_bytes()
+        logger.info(f"[Dedupe task index {dedupe_task_index}] Finished dedupe task...")
         return DedupeResult(
             mat_bucket_to_dd_idx_obj_id,
             np.int64(total_deduped_records),
             np.double(peak_memory_usage_bytes),
-            np.double(0.0),
             np.double(time.time()),
         )

@@ -245,13 +251,15 @@ def dedupe(
     num_materialize_buckets: int,
     dedupe_task_index: int,
     enable_profiler: bool,
-    metrics_config: MetricsConfig,
     object_store: Optional[IObjectStore],
+    metrics_config: Optional[MetricsConfig] = None,
     **kwargs,
 ) -> DedupeResult:
-    logger.info(f"[Dedupe task {dedupe_task_index}] Starting dedupe task...")
-    dedupe_result, duration = timed_invocation(
-        func=_timed_dedupe,
+    # initialize singleton on new process
+    if metrics_config:
+        MetricsConfigSingleton.instance(metrics_config)
+
+    return _timed_dedupe(
         object_ids=object_ids,
         sort_keys=sort_keys,
         num_materialize_buckets=num_materialize_buckets,
@@ -260,22 +268,3 @@ def dedupe(
         object_store=object_store,
         **kwargs,
     )
-
-    emit_metrics_time = 0.0
-    if metrics_config:
-        emit_result, latency = timed_invocation(
-            func=emit_timer_metrics,
-            metrics_name="dedupe",
-            value=duration,
-            metrics_config=metrics_config,
-        )
-        emit_metrics_time = latency
-
-    logger.info(f"[Dedupe task index {dedupe_task_index}] Finished dedupe task...")
-    return DedupeResult(
-        dedupe_result[0],
-        dedupe_result[1],
-        dedupe_result[2],
-        np.double(emit_metrics_time),
-        dedupe_result[4],
-    )

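The @emit_timer_metrics(metrics_name="dedupe") decorator applied to _timed_dedupe replaces the old pattern of wrapping the step in timed_invocation and then calling emit_timer_metrics manually, which is also why the step result tuples no longer carry a telemetry_time_in_seconds field. The decorator's definition in deltacat/utils/metrics.py is not shown in this excerpt; the following is a hedged sketch of the shape such a decorator could take, where _emit_timer is a hypothetical placeholder for the emitter driven by the configured metrics target and update_telemetry_time is the hypothetical accumulator from the singleton sketch above.

# Hedged sketch of a decorator-style emit_timer_metrics; not the actual implementation.
# Assumes MetricsConfigSingleton as sketched earlier (or imported from deltacat.utils.metrics).
import functools
import time
from typing import Any


def _emit_timer(name: str, value: float, metrics_config: Any) -> None:
    # Hypothetical placeholder for the real emission driven by metrics_config
    # (e.g. publishing a timer datapoint to the configured metrics backend).
    print(f"timer metric {name}: {value:.3f}s")


def emit_timer_metrics(metrics_name: str):
    # Decorator factory: time the wrapped task and emit its duration using the
    # process-wide metrics config, if one was initialized on this worker.
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            start = time.monotonic()
            result = func(*args, **kwargs)
            duration = time.monotonic() - start
            try:
                singleton = MetricsConfigSingleton.instance()
            except Exception:
                # No metrics config on this process; run the task without emitting.
                return result
            emit_start = time.monotonic()
            _emit_timer(metrics_name, duration, singleton.metrics_config)
            # Track time spent emitting so the session can report it separately.
            singleton.update_telemetry_time(time.monotonic() - emit_start)
            return result

        return wrapper

    return decorator

With tasks decorated this way, each worker only needs the singleton initialized at the start of the task (as dedupe now does), rather than threading metrics_config and per-task telemetry timings through every result tuple.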
0 commit comments

Comments
 (0)