Skip to content

Commit d188e07

Browse files
authored
tools: add numasched: trace task switch between NUMA (#4396)
Tasks that frequently switch between NUMA nodes often suffer from poor performance. Programs that stay on the same NUMA node perform better (usually at the memory level). If processes are migrating between different CPUs and these CPUs belong to different NUMA nodes, this tool provides an effective means of tracing those migrations in order to optimize the program or configuration. For example: Terminal 1: $ numactl --hardware available: 4 nodes (0-3) node 0 cpus: 0 - 23 node 1 cpus: 24 - 47 node 2 cpus: 48 - 71 node 3 cpus: 72 - 95 Terminal 2: $ sudo ./numasched.py Tracing task NUMA switch... TIME PID TID SRC_NID DST_NID COMM Terminal 3: $ taskset -c 1 yes >/dev/null & $ taskset -p 0x1000000 $(pidof yes) $ taskset -p 0x1000000000000 $(pidof yes) Then, Terminal 2: $ sudo ./numasched.py Tracing task NUMA switch... TIME PID TID SRC_NID DST_NID COMM 20:55:35 355842 355842 0 -> 1 b'yes' 20:55:50 355842 355842 1 -> 2 b'yes' Signed-off-by: Rong Tao <[email protected]>
1 parent ad05282 commit d188e07

File tree

3 files changed

+267
-0
lines changed

3 files changed

+267
-0
lines changed

man/man8/numasched.8

+66
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
.TH numasched 8 "2022-12-14" "USER COMMANDS"
2+
.SH NAME
3+
numasched \- Trace task switches between NUMA nodes. Uses bcc/eBPF.
4+
.SH SYNOPSIS
5+
.B numasched
6+
.SH DESCRIPTION
7+
numasched traces tasks as they switch from one NUMA node to another.
8+
9+
This program is also a basic example of using bcc with tracepoints.
10+
11+
Since this uses BPF, only the root user can use this tool.
12+
.SH REQUIREMENTS
13+
CONFIG_BPF and bcc.
14+
.SH OPTIONS
15+
.TP
16+
\-p, --pid PID
17+
Trace this PID only.
18+
.TP
19+
\-t, --tid TID
20+
Trace this TID only.
21+
.TP
22+
\-c, --comm COMM
23+
Trace this COMM only.
24+
.SH EXAMPLES
25+
.TP
26+
Trace task switches between NUMA nodes:
27+
#
28+
.B numasched
29+
.TP
30+
Trace PID 181 only:
31+
#
32+
.B numasched \-p 181
33+
.SH FIELDS
34+
.TP
35+
TIME
36+
A timestamp on the output, in "HH:MM:SS" format.
37+
.TP
38+
PID
39+
The process ID.
40+
.TP
41+
TID
42+
The thread ID.
43+
.TP
44+
SRC_NID
45+
Source NUMA ID.
46+
.TP
47+
DST_NID
48+
Target NUMA ID.
49+
.TP
50+
COMM
51+
The command name (comm) of the task.
52+
.SH SOURCE
53+
This is from bcc.
54+
.IP
55+
https://github.com/iovisor/bcc
56+
.PP
57+
Also look in the bcc distribution for a companion _examples.txt file
58+
containing example usage, output, and commentary for this tool.
59+
.SH OS
60+
Linux
61+
.SH STABILITY
62+
Unstable - in development.
63+
.SH AUTHOR
64+
65+
.SH SEE ALSO
66+
opensnoop(8)

tools/numasched.py

+146
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
#!/usr/bin/env python
2+
# @lint-avoid-python-3-compatibility-imports
3+
#
4+
# numasched Trace task NUMA switch
5+
# For Linux, uses BCC, eBPF.
6+
#
7+
# USAGE: numasched [-p PID] [-t TID] [-c COMM]
8+
#
9+
# This script tracks NUMA migrations of tasks, and in general, frequent
10+
# NUMA migrations can cause poor performance.
11+
#
12+
# Copyright 2022 CESTC, Co.
13+
# Licensed under the Apache License, Version 2.0 (the "License")
14+
#
15+
# 14-Dec-2022 Rong Tao Created this.
16+
17+
from __future__ import print_function
18+
from bcc import BPF
19+
import argparse
20+
from time import strftime
21+
from socket import inet_ntop, AF_INET, AF_INET6
22+
from struct import pack
23+
from time import sleep
24+
25+
26+
# arguments
27+
examples = """examples:
    ./numasched           # trace all processes
    ./numasched -p 185    # trace PID 185 only
"""
parser = argparse.ArgumentParser(
    description="Trace task NUMA switch",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)
# PID and TID are parsed as integers: their values are substituted into the
# BPF C source below, so free-form text must never reach that substitution.
parser.add_argument("-p", "--pid", type=int,
    help="trace this PID only")
parser.add_argument("-t", "--tid", type=int,
    help="trace this TID only")
parser.add_argument("-c", "--comm",
    help="trace this COMM only")
args = parser.parse_args()


# BPF program. FILTER_PID and FILTER_TID are placeholders that are replaced
# with C boolean expressions before compilation; when an expression is true
# the probe returns early and the switch is ignored.
bpf_text = """
#include <linux/sched.h>
#include <linux/topology.h>

/* Event record sent to user space for every detected NUMA node change. */
struct data_t {
    char comm[TASK_COMM_LEN];
    u32 pid;
    u32 tid;
    u32 old_nid;
    u32 new_nid;
};
BPF_PERF_OUTPUT(events);

/* Last NUMA node id recorded for a thread, keyed by tid in numaid_info. */
struct val_t {
    u32 nid;
};
BPF_HASH(numaid_info, u32, struct val_t);


TRACEPOINT_PROBE(sched, sched_switch)
{
    u64 pid_tgid = bpf_get_current_pid_tgid();
    u32 pid = pid_tgid >> 32;
    u32 tid = (u32)pid_tgid;
    u32 new_nid = bpf_get_numa_node_id();
    struct val_t val = {}, *valp;
    u32 old_nid;

    if (FILTER_PID)
        return 0;

    if (FILTER_TID)
        return 0;

    val.nid = new_nid;

    /* First sighting of this tid: nothing to compare with, just record. */
    valp = numaid_info.lookup(&tid);
    if (!valp)
        goto update;

    old_nid = valp->nid;

    /* Report only when the node differs from the one recorded last time. */
    if (old_nid != new_nid) {
        struct data_t data = {};

        bpf_get_current_comm(&data.comm, sizeof(data.comm));
        data.pid = pid;
        data.tid = tid;
        data.old_nid = old_nid;
        data.new_nid = new_nid;

        events.perf_submit(args, &data, sizeof(data));
    }

update:
    /* NOTE(review): entries are never deleted in this program, so the map
     * keeps one slot per tid ever seen -- confirm this is acceptable. */
    numaid_info.update(&tid, &val);
    return 0;
}
"""

# "is not None" (rather than truthiness) keeps an explicit "-p 0" / "-t 0"
# selecting that id instead of silently falling back to the default filter.
if args.pid is not None:
    bpf_text = bpf_text.replace('FILTER_PID', 'pid != %d' % args.pid)
else:
    # always skip PID=0
    bpf_text = bpf_text.replace('FILTER_PID', 'pid == 0')

if args.tid is not None:
    bpf_text = bpf_text.replace('FILTER_TID', 'tid != %d' % args.tid)
else:
    # always skip TID=0
    bpf_text = bpf_text.replace('FILTER_TID', 'tid == 0')
115+
116+
# process event
117+
def print_event(cpu, data, size):
    """Perf-buffer callback: print one NUMA switch event as a table row.

    cpu and size are supplied by the perf buffer and are not used here;
    data is the raw record, decoded through the "events" table of the
    module-level BPF object ``b``.

    When --comm was given, events whose command name differs are dropped.
    """
    event = b["events"].event(data)

    # Decode once so the filter and the output use the same text and the
    # COMM column shows "yes" rather than the bytes repr "b'yes'".
    comm = event.comm.decode('utf-8', 'replace')

    # Filter events by comm
    if args.comm and args.comm != comm:
        return

    print("%-8s %-8d %-8d %-8d -> %-8d %-8s" %
          (strftime("%H:%M:%S"),
           event.pid,
           event.tid,
           event.old_nid,
           event.new_nid,
           comm))
132+
133+
134+
# Compile and load the BPF program.
b = BPF(text=bpf_text)

print("Tracing task NUMA switch...")
# The four-space gap between SRC_NID and DST_NID matches the " -> " separator
# printed in every event row, so DST_NID and COMM line up with their values.
print("%-8s %-8s %-8s %-8s    %-8s %-8s" %
      ("TIME", "PID", "TID", "SRC_NID", "DST_NID", "COMM"))

# Deliver every perf event to print_event and poll until Ctrl-C.
b["events"].open_perf_buffer(print_event)
while 1:
    try:
        b.perf_buffer_poll()
    except KeyboardInterrupt:
        exit()
146+

tools/numasched_example.txt

+55
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
Demonstrations of numasched.py, the Linux eBPF/bcc version.
2+
3+
This example traces task switches between NUMA nodes. Some example output:
4+
5+
NUMA Information:
6+
7+
$ numactl --hardware
8+
available: 4 nodes (0-3)
9+
node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
10+
node 0 size: 97373 MB
11+
node 0 free: 1756 MB
12+
node 1 cpus: 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
13+
node 1 size: 98192 MB
14+
node 1 free: 1269 MB
15+
node 2 cpus: 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
16+
node 2 size: 98192 MB
17+
node 2 free: 4811 MB
18+
node 3 cpus: 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
19+
node 3 size: 98135 MB
20+
node 3 free: 1617 MB
21+
node distances:
22+
node 0 1 2 3
23+
0: 10 12 20 22
24+
1: 12 10 22 24
25+
2: 20 22 10 12
26+
3: 22 24 12 10
27+
28+
29+
Terminal 1, start a task running on NUMA0:
30+
31+
$ taskset -c 1 yes >/dev/null
32+
33+
34+
Terminal 2, start tracing:
35+
36+
$ sudo ./numasched.py
37+
38+
39+
Terminal 3
40+
41+
# taskset 'yes' task to NUMA1(cpu=24):
42+
$ taskset -p 0x1000000 $(pidof yes)
43+
44+
# taskset 'yes' task to NUMA2(cpu=48):
45+
$ taskset -p 0x1000000000000 $(pidof yes)
46+
47+
48+
Then, Terminal 2 shows:
49+
50+
$ sudo ./numasched.py
51+
Tracing task NUMA switch...
52+
TIME PID TID SRC_NID DST_NID COMM
53+
20:55:35 355842 355842 0 -> 1 b'yes'
54+
20:55:50 355842 355842 1 -> 2 b'yes'
55+

0 commit comments

Comments
 (0)