diff --git a/docs/integrations/system-metrics.md b/docs/integrations/system-metrics.md index 860d2637..4366e3f0 100644 --- a/docs/integrations/system-metrics.md +++ b/docs/integrations/system-metrics.md @@ -35,8 +35,8 @@ logfire.instrument_system_metrics({ }) ``` -1. `process.runtime.cpu.utilization` will lead to exporting a metric that is actually named `process.runtime.cpython.cpu.utilization` or a similar name depending on the Python implementation used. The `None` value means that there are no fields to configure for this metric. The value of this metric is `[psutil.Process().cpu_percent()](https://psutil.readthedocs.io/en/latest/#psutil.Process.cpu_percent) / 100`, i.e. the fraction of CPU time used by this process, where 1 means using 100% of a single CPU core. The value can be greater than 1 if the process uses multiple cores. -2. The `None` value means that there are no fields to configure for this metric. The value of this metric is `[psutil.cpu_percent()](https://psutil.readthedocs.io/en/latest/#psutil.cpu_percent) / 100`, i.e. the fraction of CPU time used by the whole system, where 1 means using 100% of all CPU cores. +1. `process.runtime.cpu.utilization` will lead to exporting a metric that is actually named `process.runtime.cpython.cpu.utilization` or a similar name depending on the Python implementation used. The `None` value means that there are no fields to configure for this metric. The value of this metric is [`psutil.Process().cpu_percent()`](https://psutil.readthedocs.io/en/latest/#psutil.Process.cpu_percent)`/ 100`, i.e. the fraction of CPU time used by this process, where 1 means using 100% of a single CPU core. The value can be greater than 1 if the process uses multiple cores. In the next major release, the default will instead emit `process.cpu.core_utilization`, which is the same metric but with a simpler name. +2. The `None` value means that there are no fields to configure for this metric. 
The value of this metric is [`psutil.cpu_percent()`](https://psutil.readthedocs.io/en/latest/#psutil.cpu_percent)`/ 100`, i.e. the fraction of CPU time used by the whole system, where 1 means using 100% of all CPU cores. 3. The value here is a list of 'modes' of memory. The full list can be seen in the [`psutil` documentation](https://psutil.readthedocs.io/en/latest/#psutil.virtual_memory). `available` is "the memory that can be given instantly to processes without the system going into swap. This is calculated by summing different memory metrics that vary depending on the platform. It is supposed to be used to monitor actual memory usage in a cross platform fashion." The value of the metric is a number between 0 and 1, and subtracting the value from 1 gives the fraction of memory used. 4. This is the fraction of available swap used. The value is a number between 0 and 1. @@ -69,13 +69,22 @@ logfire.instrument_system_metrics({ 'system.network.errors': ['transmit', 'receive'], 'system.network.io': ['transmit', 'receive'], 'system.thread_count': None, + 'process.context_switches': ['involuntary', 'voluntary'], + 'process.runtime.gc_count': None, + 'process.open_file_descriptor.count': None, + 'process.cpu.time': ['user', 'system'], + 'process.cpu.utilization': None, + 'process.cpu.core_utilization': None, + 'process.memory.usage': None, + 'process.memory.virtual': None, + 'process.thread.count': None, + # These are deprecated and equivalent to some of the above. + # base='full' will stop including them in the next major release. 
'process.runtime.memory': ['rss', 'vms'], 'process.runtime.cpu.time': ['user', 'system'], - 'process.runtime.gc_count': None, 'process.runtime.thread_count': None, 'process.runtime.cpu.utilization': None, 'process.runtime.context_switches': ['involuntary', 'voluntary'], - 'process.open_file_descriptor.count': None, }) ``` diff --git a/logfire/_internal/integrations/system_metrics.py b/logfire/_internal/integrations/system_metrics.py index af652c6c..fdf56829 100644 --- a/logfire/_internal/integrations/system_metrics.py +++ b/logfire/_internal/integrations/system_metrics.py @@ -43,13 +43,21 @@ 'system.network.io', 'system.network.connections', 'system.thread_count', + 'process.open_file_descriptor.count', + 'process.context_switches', + 'process.cpu.time', + 'process.cpu.utilization', + 'process.cpu.core_utilization', + 'process.memory.usage', + 'process.memory.virtual', + 'process.thread.count', + 'process.runtime.gc_count', + # ##### These are deprecated: 'process.runtime.memory', 'process.runtime.cpu.time', - 'process.runtime.gc_count', 'process.runtime.thread_count', 'process.runtime.cpu.utilization', 'process.runtime.context_switches', - 'process.open_file_descriptor.count', ] ] = Literal[ # type: ignore # but pyright doesn't like it 'system.cpu.simple_utilization', @@ -68,13 +76,21 @@ 'system.network.io', 'system.network.connections', 'system.thread_count', + 'process.open_file_descriptor.count', + 'process.context_switches', + 'process.cpu.time', + 'process.cpu.utilization', + 'process.cpu.core_utilization', + 'process.memory.usage', + 'process.memory.virtual', + 'process.thread.count', + 'process.runtime.gc_count', + # ##### These are deprecated: 'process.runtime.memory', 'process.runtime.cpu.time', - 'process.runtime.gc_count', 'process.runtime.thread_count', 'process.runtime.cpu.utilization', 'process.runtime.context_switches', - 'process.open_file_descriptor.count', ] Config = Dict[MetricName, Optional[Iterable[str]]] @@ -92,6 +108,7 @@ FULL_CONFIG: Config 
= { **cast(Config, _DEFAULT_CONFIG), 'system.cpu.simple_utilization': None, + 'process.cpu.core_utilization': None, 'system.cpu.time': CPU_FIELDS, 'system.cpu.utilization': CPU_FIELDS, # For usage, knowing the total amount of bytes available might be handy. @@ -135,11 +152,19 @@ def instrument_system_metrics(logfire_instance: Logfire, config: Config | None = if 'system.cpu.simple_utilization' in config: measure_simple_cpu_utilization(logfire_instance) + if 'process.cpu.core_utilization' in config: + measure_process_cpu_core_utilization(logfire_instance) + if 'process.runtime.cpu.utilization' in config: # Override OTEL here, see comment in measure_process_runtime_cpu_utilization.<locals>.callback. measure_process_runtime_cpu_utilization(logfire_instance) del config['process.runtime.cpu.utilization'] + if 'process.cpu.utilization' in config: + # Override OTEL here to avoid emitting 0 in the first measurement. + measure_process_cpu_utilization(logfire_instance) + del config['process.cpu.utilization'] + instrumentor = SystemMetricsInstrumentor(config=config) # type: ignore instrumentor.instrument(meter_provider=logfire_instance.config.get_meter_provider()) @@ -177,3 +202,41 @@ def callback(_options: CallbackOptions) -> Iterable[Observation]: description='Runtime CPU utilization', unit='1', ) + + +def measure_process_cpu_utilization(logfire_instance: Logfire): + process = psutil.Process() + # This first call always returns 0, do it here so that the first real measurement from an exporter + # will return a nonzero value. + # Otherwise this function mimics what OTel's SystemMetricsInstrumentor does. 
+ process.cpu_percent() + + num_cpus = psutil.cpu_count() or 1 + + def callback(_options: CallbackOptions) -> Iterable[Observation]: + yield Observation(process.cpu_percent() / 100 / num_cpus) + + logfire_instance.metric_gauge_callback( + 'process.cpu.utilization', + [callback], + description='Runtime CPU utilization', + unit='1', + ) + + +def measure_process_cpu_core_utilization(logfire_instance: Logfire): + """Same as process.cpu.utilization, but not divided by the number of available cores.""" + process = psutil.Process() + # This first call always returns 0, do it here so that the first real measurement from an exporter + # will return a nonzero value. + process.cpu_percent() + + def callback(_options: CallbackOptions) -> Iterable[Observation]: + yield Observation(process.cpu_percent() / 100) + + logfire_instance.metric_gauge_callback( + 'process.cpu.core_utilization', + [callback], + description='Runtime CPU utilization, not divided by the number of available cores.', + unit='core', + ) diff --git a/tests/otel_integrations/test_system_metrics.py b/tests/otel_integrations/test_system_metrics.py index 03682b33..4f2dc64b 100644 --- a/tests/otel_integrations/test_system_metrics.py +++ b/tests/otel_integrations/test_system_metrics.py @@ -36,12 +36,16 @@ def test_default_system_metrics_collection(metrics_reader: InMemoryMetricReader) ) -# TODO FIX THIS -@pytest.mark.xfail def test_all_system_metrics_collection(metrics_reader: InMemoryMetricReader) -> None: logfire.instrument_system_metrics(base='full') assert get_collected_metric_names(metrics_reader) == snapshot( [ + 'process.context_switches', + 'process.cpu.core_utilization', + 'process.cpu.time', + 'process.cpu.utilization', + 'process.memory.usage', + 'process.memory.virtual', 'process.open_file_descriptor.count', 'process.runtime.cpython.context_switches', 'process.runtime.cpython.cpu.utilization', @@ -49,6 +53,7 @@ def test_all_system_metrics_collection(metrics_reader: InMemoryMetricReader) -> 
'process.runtime.cpython.gc_count', 'process.runtime.cpython.memory', 'process.runtime.cpython.thread_count', + 'process.thread.count', 'system.cpu.simple_utilization', 'system.cpu.time', 'system.cpu.utilization', @@ -69,8 +74,21 @@ def test_all_system_metrics_collection(metrics_reader: InMemoryMetricReader) -> def test_custom_system_metrics_collection(metrics_reader: InMemoryMetricReader) -> None: - logfire.instrument_system_metrics({'system.memory.utilization': ['available']}, base=None) - assert get_collected_metric_names(metrics_reader) == ['system.memory.utilization'] + logfire.instrument_system_metrics( + { + 'system.memory.utilization': ['available'], + 'process.cpu.core_utilization': None, + 'process.cpu.utilization': None, + }, + base=None, + ) + assert get_collected_metric_names(metrics_reader) == snapshot( + [ + 'process.cpu.core_utilization', + 'process.cpu.utilization', + 'system.memory.utilization', + ] + ) def test_basic_base(): @@ -82,8 +100,6 @@ def test_basic_base(): }, 'Docs need to be updated if this test fails' -# TODO FIX THIS -@pytest.mark.xfail def test_full_base(): config = get_base_config('full') config.pop('system.network.connections', None) @@ -137,13 +153,22 @@ def test_full_base(): 'system.network.errors': ['transmit', 'receive'], 'system.network.io': ['transmit', 'receive'], 'system.thread_count': None, + 'process.runtime.gc_count': None, + 'process.open_file_descriptor.count': None, + 'process.memory.usage': None, + 'process.memory.virtual': None, + 'process.cpu.time': ['user', 'system'], + # There's no reason for OTel to give a value here, so the docs say `None` + 'process.cpu.utilization': ['user', 'system'], + 'process.cpu.core_utilization': None, + 'process.thread.count': None, + 'process.context_switches': ['involuntary', 'voluntary'], + # These are deprecated: 'process.runtime.memory': ['rss', 'vms'], 'process.runtime.cpu.time': ['user', 'system'], - 'process.runtime.gc_count': None, - 'process.runtime.thread_count': None, 
'process.runtime.cpu.utilization': None, + 'process.runtime.thread_count': None, 'process.runtime.context_switches': ['involuntary', 'voluntary'], - 'process.open_file_descriptor.count': None, }, 'Docs and the MetricName type need to be updated if this test fails'