-
Notifications
You must be signed in to change notification settings - Fork 3.9k
/
Copy pathcpuunclaimed.py
executable file
·409 lines (367 loc) · 14.6 KB
/
cpuunclaimed.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
#!/usr/bin/env python
# @lint-avoid-python-3-compatibility-imports
#
# cpuunclaimed Sample CPU run queues and calculate unclaimed idle CPU.
# For Linux, uses BCC, eBPF.
#
# This samples the length of the run queues and determines when there are idle
# CPUs, yet queued threads waiting their turn. It reports the amount of idle
# (yet unclaimed by waiting threads) CPU as a system-wide percentage.
#
# This situation can happen for a number of reasons:
#
# - An application has been bound to some, but not all, CPUs, and has runnable
# threads that cannot migrate to other CPUs due to this configuration.
# - CPU affinity: an optimization that leaves threads on CPUs where the CPU
# caches are warm, even if this means short periods of waiting while other
# CPUs are idle. The wait period is tunable (see sysctl, kernel.sched*).
# - Scheduler bugs.
#
# An unclaimed idle of < 1% is likely to be CPU affinity, and not usually a
# cause for concern. By leaving the CPU idle, overall throughput of the system
# may be improved. This tool is best for identifying larger issues, > 2%, due
# to the coarseness of its 99 Hertz samples.
#
# This is an experimental tool that currently works by use of sampling to
# keep overheads low. Tool assumptions:
#
# - CPU samples consistently fire around the same offset. There will sometimes
# be a lag as a sample is delayed by higher-priority interrupts, but it is
# assumed the subsequent samples will catch up to the expected offsets (as
# is seen in practice). You can use -J to inspect sample offsets. Some
# systems can power down CPUs when idle, and when they wake up again they
# may begin firing at a skewed offset: this tool will detect the skew, print
# an error, and exit.
# - All CPUs are online (see ncpu).
#
# If this identifies unclaimed CPU, you can double check it by dumping raw
# samples (-j), as well as using other tracing tools to instrument scheduler
# events (although this latter approach has much higher overhead).
#
# This tool passes all sampled events to user space for post processing.
# I originally wrote this to do the calculations entirely in kernel context,
# and only pass a summary. That involves a number of challenges, and the
# overhead savings may not outweigh the caveats. You can see my WIP here:
# https://gist.github.com/brendangregg/731cf2ce54bf1f9a19d4ccd397625ad9
#
# USAGE: cpuunclaimed [-h] [-j] [-J] [-T] [interval] [count]
#
# If you see "Lost 1881 samples" warnings, try increasing wakeup_hz.
#
# REQUIRES: Linux 4.9+ (BPF_PROG_TYPE_PERF_EVENT support). Under tools/old is
# a version of this tool that may work on Linux 4.6 - 4.8.
#
# Copyright 2016 Netflix, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 20-Dec-2016 Brendan Gregg Created this.
from __future__ import print_function
from bcc import BPF, PerfType, PerfSWConfig
from time import sleep, strftime
import argparse
import multiprocessing
from os import getpid, system, open, close, dup, unlink, O_WRONLY
from tempfile import NamedTemporaryFile
# arguments
# Epilog shown verbatim under --help (RawDescriptionHelpFormatter keeps the
# line breaks as written).
examples = """examples:
./cpuunclaimed # sample and calculate unclaimed idle CPUs,
# output every 1 second (default)
./cpuunclaimed 5 10 # print 5 second summaries, 10 times
./cpuunclaimed -T 1 # 1s summaries and timestamps
./cpuunclaimed -j # raw dump of all samples (verbose), CSV
"""

parser = argparse.ArgumentParser(
    description="Sample CPU run queues and calculate unclaimed idle CPU",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)

# Boolean switches, registered in the order they appear in --help.
for _short_opt, _long_opt, _help_text in (
        ("-j", "--csv",
         "print sample summaries (verbose) as comma-separated values"),
        ("-J", "--fullcsv",
         "print sample summaries with extra fields: CPU sample offsets"),
        ("-T", "--timestamp",
         "include timestamp on output")):
    parser.add_argument(_short_opt, _long_opt, action="store_true",
                        help=_help_text)

# Optional positionals. The -1 default for interval is a sentinel meaning
# "not given on the command line"; user-supplied values arrive as strings.
parser.add_argument("interval", nargs="?", default=-1,
                    help="output interval, in seconds")
parser.add_argument("count", nargs="?", default=99999999,
                    help="number of outputs")

# Hidden switch: print the generated BPF program and exit.
parser.add_argument("--ebpf", action="store_true",
                    help=argparse.SUPPRESS)

args = parser.parse_args()

# Number of summaries still to print before exiting.
countdown = int(args.count)

# Tunables.
frequency = 99                      # sampling frequency, in Hertz
dobind = 1
wakeup_hz = 10                      # frequency to read buffers
wakeup_s = 1.0 / wakeup_hz          # seconds to sleep between buffer reads
ncpu = multiprocessing.cpu_count()  # assume all are online
debug = 0
# Linux 4.15 introduced a new field runnable_weight
# in linux_src:kernel/sched/sched.h as
# struct cfs_rq {
# struct load_weight load;
# unsigned long runnable_weight;
# unsigned int nr_running, h_nr_running;
# ......
# }
# and this tool requires to access nr_running to get
# runqueue len information.
#
# The commit which introduces cfs_rq->runnable_weight
# field also introduces the field sched_entity->runnable_weight
# where sched_entity is defined in linux_src:include/linux/sched.h.
#
# To cope with pre-4.15 and 4.15/post-4.15 releases,
# we run a simple BPF program to detect whether
# field sched_entity->runnable_weight exists. The existence of
# this field should imply the existence of cfs_rq->runnable_weight.
#
# This will need maintenance as the relationship between these
# two fields may change in the future.
#
def check_runnable_weight_field():
    """Probe whether ``sched_entity->runnable_weight`` compiles on this kernel.

    A tiny BPF program that dereferences the field is compiled. While it
    compiles, file descriptor 2 (stderr) is pointed at a temporary file so
    that compiler diagnostics are not printed on the screen; stderr is
    restored and the temporary file removed before returning, even if an
    unexpected error occurs.

    Returns:
        True if the probe program compiled, False if BPF() raised an
        Exception while compiling it.
    """
    # Function-scope import: the module-level ``from os import ...`` list
    # does not include dup2.
    from os import dup2

    # Define the bpf program for checking purpose
    bpf_check_text = """
#include <linux/sched.h>
unsigned long dummy(struct sched_entity *entity)
{
return entity->runnable_weight;
}
"""

    # Get a temporary file name
    tmp_file = NamedTemporaryFile(delete=False)
    tmp_file.close()

    try:
        # Keep a duplicate of the real stderr so it can be put back later.
        old_stderr = dup(2)
        try:
            # Point fd 2 at the temporary file. dup2() targets fd 2
            # explicitly instead of relying on open() handing back the
            # lowest free descriptor.
            fd = open(tmp_file.name, O_WRONLY)
            try:
                dup2(fd, 2)
            finally:
                close(fd)

            try:
                BPF(text=bpf_check_text)
                success_compile = True
            except Exception:
                # Compilation failed: the field is taken to be absent.
                # KeyboardInterrupt/SystemExit are deliberately not caught.
                success_compile = False
        finally:
            # Restore the original stderr on every path.
            dup2(old_stderr, 2)
            close(old_stderr)
    finally:
        # remove the temporary file
        unlink(tmp_file.name)

    return success_compile
# process arguments
if args.fullcsv:
    # -J is -j plus extra columns, so it switches CSV mode on as well.
    args.csv = True

if args.csv:
    # NOTE(review): this value is overwritten unconditionally by the
    # float(args.interval) assignment below, so it never takes effect.
    # Confirm whether CSV mode was meant to use a 0.2 s interval.
    interval = 0.2
    if args.interval != -1:
        print("ERROR: cannot use interval with either -j or -J. Exiting.")
        exit()

# -1 is the argparse sentinel for "interval not given": default to 1 second.
if args.interval == -1:
    args.interval = "1"
interval = float(args.interval)
# define BPF program
# The RUNNABLE_WEIGHT_FIELD placeholder is replaced below, before the text is
# compiled, with either a field declaration or nothing.
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <uapi/linux/bpf_perf_event.h>
#include <linux/sched.h>
struct data_t {
u64 ts;
u64 cpu;
u64 len;
};
BPF_PERF_OUTPUT(events);
// Declare enough of cfs_rq to find nr_running, since we can't #import the
// header. This will need maintenance. It is from kernel/sched/sched.h:
// The runnable_weight field is removed from Linux 5.7.0
struct cfs_rq_partial {
struct load_weight load;
#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 7, 0)
RUNNABLE_WEIGHT_FIELD
#endif
unsigned int nr_running, h_nr_running;
};
int do_perf_event(struct bpf_perf_event_data *ctx)
{
int cpu = bpf_get_smp_processor_id();
u64 now = bpf_ktime_get_ns();
/*
* Fetch the run queue length from task->se.cfs_rq->nr_running. This is an
* unstable interface and may need maintenance. Perhaps a future version
* of BPF will support task_rq(p) or something similar as a more reliable
* interface.
*/
unsigned int len = 0;
struct task_struct *task = NULL;
struct cfs_rq_partial *my_q = NULL;
task = (struct task_struct *)bpf_get_current_task();
my_q = (struct cfs_rq_partial *)task->se.cfs_rq;
len = my_q->nr_running;
struct data_t data = {.ts = now, .cpu = cpu, .len = len};
events.perf_submit(ctx, &data, sizeof(data));
return 0;
}
"""

# Decide whether cfs_rq_partial needs a runnable_weight member. The BTF lookup
# is tried first; the compile probe check_runnable_weight_field() only runs
# when that lookup does not return 1 (short-circuit "or").
_has_runnable_weight = (
    BPF.kernel_struct_has_field(b'cfs_rq', b'runnable_weight') == 1
    or check_runnable_weight_field())
_runnable_weight_decl = (
    'unsigned long runnable_weight;' if _has_runnable_weight else '')
bpf_text = bpf_text.replace('RUNNABLE_WEIGHT_FIELD', _runnable_weight_decl)
# code substitutions
# Dump the final program text when debugging or when --ebpf was given; with
# --ebpf the tool stops after the dump.
if args.ebpf or debug:
    print(bpf_text)
    if args.ebpf:
        exit()

# initialize BPF & perf_events
b = BPF(text=bpf_text)

# Run do_perf_event on a software task-clock perf event, sampled at
# `frequency` Hertz (sample_period=0 selects frequency-based sampling).
# TODO: check for HW counters first and use if more accurate
b.attach_perf_event(
    ev_type=PerfType.SOFTWARE,
    ev_config=PerfSWConfig.TASK_CLOCK,
    fn_name="do_perf_event",
    sample_period=0,
    sample_freq=frequency)
if args.csv:
    # CSV header: optional wall-clock column, the group timestamp, one run
    # queue length column per CPU and, with -J, one offset column per CPU.
    _header_cols = []
    if args.timestamp:
        _header_cols.append("TIME")
    _header_cols.append("TIMESTAMP_ns")
    _header_cols.extend("CPU" + str(c) for c in range(ncpu))
    if args.fullcsv:
        _header_cols.extend("OFFSET_ns_CPU" + str(c) for c in range(ncpu))
    print(",".join(_header_cols))
else:
    print("Sampling run queues... Output every %s seconds. "
          "Hit Ctrl-C to end." % args.interval)

# State shared between the perf buffer callback and the main loop:
samples = {}    # sample timestamp -> {'cpu': ..., 'len': ...}
group = {}      # timestamps belonging to the sample group being collected
last = 0        # timestamp that started the current sample group
# process event
def print_event(cpu, data, size):
    """Perf buffer callback: store one run queue sample in ``samples``.

    The sample is keyed by its timestamp, so a sample arriving with a
    timestamp that is already present replaces the earlier entry. The
    ``cpu`` and ``size`` arguments are not used; the CPU number is taken
    from the decoded event itself.
    """
    event = b["events"].event(data)
    samples[event.ts] = {'cpu': event.cpu, 'len': event.len}
# Loop control: `exiting` is set when the loop should stop after the next
# summary; `slept` accumulates the time slept since the last summary.
exiting = 1 if not args.interval else 0
slept = 0.0

# Choose the elapsed time from one sample group to the next that identifies a
# new sample group (a group being a set of samples from all CPUs). The
# earliest timestamp is compared in each group. This trigger is also used
# for sanity testing, if a group's samples exceed half this value.
_sample_period_ns = 1000000000 / frequency
trigger = int(0.8 * _sample_period_ns)

# read events
b["events"].open_perf_buffer(print_event, page_cnt=64)
while True:
    # allow some buffering by calling sleep(), to reduce the context switch
    # rate and lower overhead. Ctrl-C during the sleep requests a final
    # summary followed by exit.
    try:
        if not exiting:
            sleep(wakeup_s)
    except KeyboardInterrupt:
        exiting = 1
    b.perf_buffer_poll()
    slept += wakeup_s

    # keep buffering until a full output interval has elapsed
    if slept < 0.999 * interval:   # floating point workaround
        continue
    slept = 0

    # per-interval totals
    positive = 0    # number of samples where an idle CPU could have run work
    running = 0
    idle = 0

    if debug >= 2:
        print("DEBUG: begin samples loop, count %d" % len(samples))

    for ts in sorted(samples):
        delta = ts - last
        if debug >= 2:
            print("DEBUG: ts %d cpu %d len %d delta %d trig %d" % (
                ts, samples[ts]['cpu'], samples[ts]['len'], delta,
                delta > trigger))

        # look for time jumps to identify a new sample group
        if delta > trigger:
            group_ts = sorted(group)

            # find first group timestamp, and the last one for the sanity
            # test; both stay 0 when there is no previous group yet
            g_time = 0
            g_max = 0
            for ge in group_ts:
                if g_time == 0:
                    g_time = ge
                g_max = ge

            # process previous sample group
            if args.csv:
                lens = [0] * ncpu
                offs = [0] * ncpu
                for ge in group_ts:
                    sample = samples[ge]
                    lens[sample['cpu']] = sample['len']
                    if args.fullcsv:
                        offs[sample['cpu']] = ge - g_time
                if g_time > 0:      # else first sample
                    row = []
                    if args.timestamp:
                        row.append("%-8s" % strftime("%H:%M:%S"))
                    row.append("%d" % g_time)
                    row.extend(str(qlen) for qlen in lens)
                    if args.fullcsv:
                        row.extend(str(off) for off in offs)
                    print(",".join(row))
            else:
                # calculate stats: a CPU counts as running when its queue
                # length is non-zero; anything beyond the first thread on a
                # queue is counted as queued
                qlens = [samples[ge]['len'] for ge in group]
                g_running = sum(1 for qlen in qlens if qlen > 0)
                g_queued = sum(qlen - 1 for qlen in qlens if qlen > 1)
                g_idle = ncpu - g_running

                # calculate the number of threads that could have run as the
                # minimum of idle and queued
                if g_idle > 0 and g_queued > 0:
                    positive += min(g_idle, g_queued)
                running += g_running
                idle += g_idle

            # now sanity test, after -J output
            g_range = g_max - g_time
            if g_range > trigger / 2:
                # if a sample group exceeds half the interval, we can no
                # longer draw conclusions about some CPUs idle while others
                # have queued work. Error and exit. This can happen when
                # CPUs power down, then start again on different offsets.
                # TODO: Since this is a sampling tool, an error margin should
                # be anticipated, so an improvement may be to bump a counter
                # instead of exiting, and only exit if this counter shows
                # a skewed sample rate of over, say, 1%. Such an approach
                # would allow a small rate of outliers (sampling error),
                # and, we could tighten the trigger to be, say, trigger / 5.
                # In the case of a power down, if it's detectable, perhaps
                # the tool could reinitialize the timers (although exiting
                # is simple and works).
                print(("ERROR: CPU samples arrived at skewed offsets "
                       "(CPUs may have powered down when idle), "
                       "spanning %d ns (expected < %d ns). Debug with -J, "
                       "and see the man page. As output may begin to be "
                       "unreliable, exiting.") % (g_range, trigger / 2))
                exit()

            # these are done, remove
            for ge in group_ts:
                del samples[ge]

            # begin next group
            group = {}
            last = ts

        # stash this timestamp in a sample group dict
        group[ts] = 1

    if not args.csv:
        total = running + idle

        if debug:
            print("DEBUG: hit %d running %d idle %d total %d buffered %d" % (
                positive, running, idle, total, len(samples)))

        if args.timestamp:
            print("%-8s " % strftime("%H:%M:%S"), end="")

        # output; both ratios stay 0 when no groups were processed
        unclaimed = float(positive) / total if total else 0
        util = float(running) / total if total else 0
        print("%%CPU %6.2f%%, unclaimed idle %0.2f%%" % (
            100 * util, 100 * unclaimed))

    countdown -= 1
    if exiting or countdown == 0:
        exit()