Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Stop implicitly emitting deprecated process runtime metrics #932

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 9 additions & 8 deletions docs/integrations/system-metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,15 @@ By default, `instrument_system_metrics` collects only the metrics it needs to di

```py
logfire.instrument_system_metrics({
'process.runtime.cpu.utilization': None, # (1)!
'process.cpu.utilization': None, # (1)!
'system.cpu.simple_utilization': None, # (2)!
'system.memory.utilization': ['available'], # (3)!
'system.swap.utilization': ['used'], # (4)!
})
```

1. `process.runtime.cpu.utilization` will lead to exporting a metric that is actually named `process.runtime.cpython.cpu.utilization` or a similar name depending on the Python implementation used. The `None` value means that there are no fields to configure for this metric. The value of this metric is `[psutil.Process().cpu_percent()](https://psutil.readthedocs.io/en/latest/#psutil.Process.cpu_percent) / 100`, i.e. the fraction of CPU time used by this process, where 1 means using 100% of a single CPU core. The value can be greater than 1 if the process uses multiple cores.
2. The `None` value means that there are no fields to configure for this metric. The value of this metric is `[psutil.cpu_percent()](https://psutil.readthedocs.io/en/latest/#psutil.cpu_percent) / 100`, i.e. the fraction of CPU time used by the whole system, where 1 means using 100% of all CPU cores.
1. The `None` value means that there are no fields to configure for this metric. The value of this metric is [`psutil.Process().cpu_percent()`](https://psutil.readthedocs.io/en/latest/#psutil.Process.cpu_percent)`/100`, i.e. the fraction of CPU time used by this process, where 1 means using 100% of a single CPU core. The value can be greater than 1 if the process uses multiple cores.
2. The `None` value means that there are no fields to configure for this metric. The value of this metric is [`psutil.cpu_percent()`](https://psutil.readthedocs.io/en/latest/#psutil.cpu_percent)`/100`, i.e. the fraction of CPU time used by the whole system, where 1 means using 100% of all CPU cores.
3. The value here is a list of 'modes' of memory. The full list can be seen in the [`psutil` documentation](https://psutil.readthedocs.io/en/latest/#psutil.virtual_memory). `available` is "the memory that can be given instantly to processes without the system going into swap. This is calculated by summing different memory metrics that vary depending on the platform. It is supposed to be used to monitor actual memory usage in a cross platform fashion." The value of the metric is a number between 0 and 1, and subtracting the value from 1 gives the fraction of memory used.
4. This is the fraction of available swap used. The value is a number between 0 and 1.

Expand Down Expand Up @@ -69,13 +69,14 @@ logfire.instrument_system_metrics({
'system.network.errors': ['transmit', 'receive'],
'system.network.io': ['transmit', 'receive'],
'system.thread_count': None,
'process.runtime.memory': ['rss', 'vms'],
'process.runtime.cpu.time': ['user', 'system'],
'process.context_switches': ['involuntary', 'voluntary'],
'process.runtime.gc_count': None,
'process.runtime.thread_count': None,
'process.runtime.cpu.utilization': None,
'process.runtime.context_switches': ['involuntary', 'voluntary'],
'process.open_file_descriptor.count': None,
'process.cpu.time': ['user', 'system'],
'process.cpu.utilization': ['user', 'system'],
'process.memory.usage': None,
'process.memory.virtual': None,
'process.thread.count': None,
})
```

Expand Down
42 changes: 27 additions & 15 deletions logfire/_internal/integrations/system_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,14 @@
'system.network.io',
'system.network.connections',
'system.thread_count',
'process.runtime.memory',
'process.runtime.cpu.time',
'process.runtime.gc_count',
'process.runtime.thread_count',
'process.runtime.cpu.utilization',
'process.runtime.context_switches',
'process.open_file_descriptor.count',
'process.context_switches',
'process.cpu.time',
'process.cpu.utilization',
'process.memory.usage',
'process.memory.virtual',
'process.thread.count',
'process.runtime.gc_count',
]
] = Literal[ # type: ignore # but pyright doesn't like it
'system.cpu.simple_utilization',
Expand All @@ -68,13 +69,14 @@
'system.network.io',
'system.network.connections',
'system.thread_count',
'process.runtime.memory',
'process.runtime.cpu.time',
'process.runtime.gc_count',
'process.runtime.thread_count',
'process.runtime.cpu.utilization',
'process.runtime.context_switches',
'process.open_file_descriptor.count',
'process.context_switches',
'process.cpu.time',
'process.cpu.utilization',
'process.memory.usage',
'process.memory.virtual',
'process.thread.count',
'process.runtime.gc_count',
]

Config = Dict[MetricName, Optional[Iterable[str]]]
Expand Down Expand Up @@ -107,8 +109,17 @@
# upstream pr: https://github.com/open-telemetry/opentelemetry-python-contrib/pull/2008
FULL_CONFIG.pop('system.network.connections', None)

for _deprecated in [
'process.runtime.memory',
'process.runtime.cpu.time',
'process.runtime.thread_count',
'process.runtime.cpu.utilization',
'process.runtime.context_switches',
]:
FULL_CONFIG.pop(_deprecated, None) # type: ignore

BASIC_CONFIG: Config = {
'process.runtime.cpu.utilization': None,
'process.cpu.utilization': None,
'system.cpu.simple_utilization': None,
# The actually used memory ratio can be calculated as `1 - available`.
'system.memory.utilization': ['available'],
Expand All @@ -135,10 +146,11 @@ def instrument_system_metrics(logfire_instance: Logfire, config: Config | None =
if 'system.cpu.simple_utilization' in config:
measure_simple_cpu_utilization(logfire_instance)

if 'process.runtime.cpu.utilization' in config:
if 'process.runtime.cpu.utilization' in config: # type: ignore
# Override OTEL here, see comment in measure_process_runtime_cpu_utilization.<locals>.callback.
# (The name is also deprecated by OTEL, but that's not really important)
measure_process_runtime_cpu_utilization(logfire_instance)
del config['process.runtime.cpu.utilization']
del config['process.runtime.cpu.utilization'] # type: ignore

instrumentor = SystemMetricsInstrumentor(config=config) # type: ignore
instrumentor.instrument(meter_provider=logfire_instance.config.get_meter_provider())
Expand Down
36 changes: 18 additions & 18 deletions tests/otel_integrations/test_system_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,27 +28,26 @@ def test_default_system_metrics_collection(metrics_reader: InMemoryMetricReader)
logfire.instrument_system_metrics()
assert get_collected_metric_names(metrics_reader) == snapshot(
[
'process.runtime.cpython.cpu.utilization',
'process.cpu.utilization',
'system.cpu.simple_utilization',
'system.memory.utilization',
'system.swap.utilization',
]
)


# TODO FIX THIS
@pytest.mark.xfail
def test_all_system_metrics_collection(metrics_reader: InMemoryMetricReader) -> None:
logfire.instrument_system_metrics(base='full')
assert get_collected_metric_names(metrics_reader) == snapshot(
[
'process.context_switches',
'process.cpu.time',
'process.cpu.utilization',
'process.memory.usage',
'process.memory.virtual',
'process.open_file_descriptor.count',
'process.runtime.cpython.context_switches',
'process.runtime.cpython.cpu.utilization',
'process.runtime.cpython.cpu_time',
'process.runtime.cpython.gc_count',
'process.runtime.cpython.memory',
'process.runtime.cpython.thread_count',
'process.thread.count',
'system.cpu.simple_utilization',
'system.cpu.time',
'system.cpu.utilization',
Expand All @@ -69,21 +68,21 @@ def test_all_system_metrics_collection(metrics_reader: InMemoryMetricReader) ->


def test_custom_system_metrics_collection(metrics_reader: InMemoryMetricReader) -> None:
logfire.instrument_system_metrics({'system.memory.utilization': ['available']}, base=None)
assert get_collected_metric_names(metrics_reader) == ['system.memory.utilization']
# This metric is now deprecated by OTEL, but there isn't a strong reason to stop allowing it when requested,
# and I also want to test measure_process_runtime_cpu_utilization.
logfire.instrument_system_metrics({'process.runtime.cpu.utilization': None}, base=None) # type: ignore
assert get_collected_metric_names(metrics_reader) == ['process.runtime.cpython.cpu.utilization']


def test_basic_base():
assert get_base_config('basic') == {
'process.runtime.cpu.utilization': None,
'process.cpu.utilization': None,
'system.cpu.simple_utilization': None,
'system.memory.utilization': ['available'],
'system.swap.utilization': ['used'],
}, 'Docs need to be updated if this test fails'


# TODO FIX THIS
@pytest.mark.xfail
def test_full_base():
config = get_base_config('full')
config.pop('system.network.connections', None)
Expand Down Expand Up @@ -137,13 +136,14 @@ def test_full_base():
'system.network.errors': ['transmit', 'receive'],
'system.network.io': ['transmit', 'receive'],
'system.thread_count': None,
'process.runtime.memory': ['rss', 'vms'],
'process.runtime.cpu.time': ['user', 'system'],
'process.runtime.gc_count': None,
'process.runtime.thread_count': None,
'process.runtime.cpu.utilization': None,
'process.runtime.context_switches': ['involuntary', 'voluntary'],
'process.open_file_descriptor.count': None,
'process.memory.usage': None,
'process.memory.virtual': None,
'process.cpu.time': ['user', 'system'],
'process.cpu.utilization': ['user', 'system'],
'process.thread.count': None,
'process.context_switches': ['involuntary', 'voluntary'],
}, 'Docs and the MetricName type need to be updated if this test fails'


Expand Down