-
Notifications
You must be signed in to change notification settings - Fork 0
/
gpu.go
213 lines (198 loc) · 6.58 KB
/
gpu.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
package gputil
import (
"bytes"
"context"
"encoding/csv"
"fmt"
"github.com/pkg/errors"
"io"
"os/exec"
"strings"
)
// binary is the nvidia-smi executable started by run. Whatever it names must
// be executable. It defaults to "nvidia-smi".
var binary = "nvidia-smi"

// SetBinaryPath replaces the nvidia-smi executable used by this package with
// path. The value is stored as given; no validation is performed here.
func SetBinaryPath(path string) {
	binary = path
}
// Command-line arguments handed to the nvidia-smi binary.
const (
	// queryInfo selects the per-GPU columns. The column order must stay in
	// step with the field indexes used by composeGPUInfoLines.
	queryInfo = "--query-gpu=index,uuid,utilization.gpu,memory.total,memory.used,memory.free,driver_version,name,gpu_serial,power.draw,power.limit,temperature.gpu,timestamp"

	// queryProcess selects the per-compute-process columns. The column order
	// must stay in step with the field indexes used by composeProcessInfoLines.
	queryProcess = "--query-compute-apps=timestamp,gpu_name,gpu_uuid,pid,name,used_memory"

	// queryFormat is appended to both queries to request CSV output with the
	// noheader and nounits options.
	queryFormat = "--format=csv,noheader,nounits"
)
// GPU describes a single device as reported by the --query-gpu request.
// All values are kept as text; composeGPUInfoLines fills them from the CSV
// columns after trimming surrounding white space.
type GPU struct {
	// Index is the zero-based index of the GPU. It can change at each boot.
	Index string `json:"index"`

	// UUID is the globally unique, immutable alphanumeric identifier of the
	// GPU. It does not correspond to any physical label on the board.
	UUID string `json:"uuid"`

	// UtilizationGPU is the percentage of time over the past sample period
	// during which one or more kernels was executing on the GPU. Depending on
	// the product the sample period may be between 1 second and 1/6 second.
	UtilizationGPU string `json:"utilizationGPU"`

	// MemoryTotal is the total installed GPU memory, in MiB.
	MemoryTotal string `json:"memoryTotal"`

	// MemoryUsed is the total memory allocated by active contexts, in MiB.
	MemoryUsed string `json:"memoryUsed"`

	// MemoryFree is the total free memory, in MiB.
	MemoryFree string `json:"memoryFree"`

	// DriverVersion is the version of the installed NVIDIA display driver,
	// an alphanumeric string.
	DriverVersion string `json:"driverVersion"`

	// Name is the official product name of the GPU, an alphanumeric string.
	Name string `json:"name"`

	// Serial matches the serial number physically printed on each board. It
	// is a globally unique, immutable alphanumeric value.
	Serial string `json:"serial"`

	// PowerDraw is the last measured power draw for the entire board, in
	// watts. On Ampere or newer devices it is the average draw over 1 second;
	// on older devices it is the instantaneous draw. It is only available if
	// power management is supported and is accurate to within +/- 5 watts.
	PowerDraw string `json:"powerDraw"`

	// PowerLimit is the software power limit in watts, set by software such
	// as nvidia-smi. On Kepler devices it can be adjusted with the
	// [-pl | --power-limit=] switches.
	PowerLimit string `json:"powerLimit"`

	// Temperature is the core GPU temperature in degrees C.
	Temperature string `json:"temperature"`

	// Timestamp is when the query was made, in the format
	// "YYYY/MM/DD HH:MM:SS.msec".
	Timestamp string `json:"timestamp"`
}
// GPUComputeApp describes one process that has a compute context on a device,
// as reported by the --query-compute-apps request. All values are kept as
// text; composeProcessInfoLines fills them from the CSV columns after trimming
// surrounding white space.
type GPUComputeApp struct {
	// Timestamp is when the query was made, in the format
	// "YYYY/MM/DD HH:MM:SS.msec".
	Timestamp string `json:"timestamp"`

	// Name is the official product name of the GPU, an alphanumeric string.
	// For all products.
	Name string `json:"name"`

	// UUID is the globally unique, immutable alphanumeric identifier of the
	// GPU. It does not correspond to any physical label on the board.
	UUID string `json:"uuid"`

	// PID is the process ID of the compute application.
	PID string `json:"pid"`

	// ProcessName is the name of the process.
	ProcessName string `json:"processName"`

	// UsedMemory is the amount of memory used on the device by the context.
	// It is not available on Windows when running in WDDM mode, because the
	// Windows KMD manages all the memory rather than the NVIDIA driver.
	UsedMemory string `json:"usedMemory"`
}
// GetGPUs runs the GPU query and returns one GPU value per CSV row produced.
//
// When indexOrUUIDs is empty no id filter is added. Otherwise the values are
// joined with commas and passed as a single id filter. Any error from running
// the binary or from decoding its output is returned with a nil result.
func GetGPUs(ctx context.Context, indexOrUUIDs ...string) (result []GPU, err error) {
	args := []string{queryInfo, queryFormat}
	if len(indexOrUUIDs) > 0 {
		// Use the --id=<list> form, as GetProcesses does. Each element of args
		// is one argv entry, so the former "-i <list>" string reached the
		// binary as the option and a space-prefixed value glued together.
		args = append(args, fmt.Sprintf("--id=%s", strings.Join(indexOrUUIDs, ",")))
	}
	rsp, err := run(ctx, args...)
	if err != nil {
		return nil, err
	}
	return composeGPUInfoLines(rsp)
}
// GetProcesses runs the compute-apps query and returns one GPUComputeApp per
// CSV row produced. An empty result is returned when the output has no rows.
//
// When indexOrUUIDs is non-empty the values are joined with commas and passed
// as a single --id= filter.
func GetProcesses(ctx context.Context, indexOrUUIDs ...string) (result []GPUComputeApp, err error) {
	args := []string{queryProcess, queryFormat}
	if len(indexOrUUIDs) != 0 {
		idFilter := fmt.Sprintf("--id=%s", strings.Join(indexOrUUIDs, ","))
		args = append(args, idFilter)
	}

	output, runErr := run(ctx, args...)
	if runErr != nil {
		return nil, runErr
	}
	return composeProcessInfoLines(output)
}
// composeProcessInfoLines decodes the CSV output of the compute-apps query
// into GPUComputeApp values, one per row, trimming white space around every
// column.
//
// It returns an error, and a nil result, when the CSV cannot be parsed or when
// a row has fewer columns than queryProcess requests.
func composeProcessInfoLines(rsp []byte) (result []GPUComputeApp, err error) {
	// Number of columns listed in queryProcess.
	const wantFields = 6

	lines, err := parse(rsp)
	if err != nil {
		return nil, err
	}
	result = make([]GPUComputeApp, 0, len(lines))
	for i, line := range lines {
		// Guard the fixed indexes below: a short row (for example a plain
		// message line instead of data) would otherwise panic.
		if len(line) < wantFields {
			return nil, fmt.Errorf("compute app row %d: got %d fields, want %d", i, len(line), wantFields)
		}
		result = append(result, GPUComputeApp{
			Timestamp:   sanitize(line[0]),
			Name:        sanitize(line[1]),
			UUID:        sanitize(line[2]),
			PID:         sanitize(line[3]),
			ProcessName: sanitize(line[4]),
			UsedMemory:  sanitize(line[5]),
		})
	}
	return result, nil
}
// run starts the configured binary with args under ctx and returns what it
// wrote to standard output.
//
// When the command fails with an *exec.ExitError, the error is annotated with
// the captured output; any other error is returned unchanged. The captured
// bytes are returned alongside the error in both cases.
func run(ctx context.Context, args ...string) (rsp []byte, err error) {
	rsp, err = exec.CommandContext(ctx, binary, args...).Output()
	if err == nil {
		return rsp, nil
	}
	if exitErr, isExit := err.(*exec.ExitError); isExit {
		err = errors.WithMessagef(exitErr, "execute output: %s", rsp)
	}
	return rsp, err
}
// composeGPUInfoLines decodes the CSV output of the GPU query into GPU values,
// one per row, trimming white space around every column.
//
// It returns an error, and a nil result, when the CSV cannot be parsed or when
// a row has fewer columns than queryInfo requests.
func composeGPUInfoLines(rsp []byte) (result []GPU, err error) {
	// Number of columns listed in queryInfo.
	const wantFields = 13

	lines, err := parse(rsp)
	if err != nil {
		return nil, err
	}
	result = make([]GPU, 0, len(lines))
	for i, line := range lines {
		// Guard the fixed indexes below: a short row (for example a plain
		// message line instead of data) would otherwise panic.
		if len(line) < wantFields {
			return nil, fmt.Errorf("gpu row %d: got %d fields, want %d", i, len(line), wantFields)
		}
		result = append(result, GPU{
			Index:          sanitize(line[0]),
			UUID:           sanitize(line[1]),
			UtilizationGPU: sanitize(line[2]),
			MemoryTotal:    sanitize(line[3]),
			MemoryUsed:     sanitize(line[4]),
			MemoryFree:     sanitize(line[5]),
			DriverVersion:  sanitize(line[6]),
			Name:           sanitize(line[7]),
			Serial:         sanitize(line[8]),
			PowerDraw:      sanitize(line[9]),
			PowerLimit:     sanitize(line[10]),
			Temperature:    sanitize(line[11]),
			Timestamp:      sanitize(line[12]),
		})
	}
	return result, nil
}
// parse reads content as CSV and returns its records in order.
//
// Reading stops at the first error. io.EOF ends the input normally and is not
// reported; any other error is returned together with the records read before
// it.
func parse(content []byte) (lines [][]string, err error) {
	reader := csv.NewReader(bytes.NewReader(content))
	for {
		record, readErr := reader.Read()
		switch {
		case readErr == nil:
			lines = append(lines, record)
		case errors.Is(readErr, io.EOF):
			return lines, nil
		default:
			// The record that accompanied the error is deliberately not kept.
			return lines, readErr
		}
	}
}
// String renders the GPU as one comma-separated line with the fields in
// declaration order. A percent sign follows the utilization, "MiB" follows
// each memory value and "W" follows each power value.
func (g *GPU) String() string {
	const layout = "%s, %s, %s %%, %s MiB, %s MiB, %s MiB, %s, %s, %s, %s W, %s W, %s, %s"
	fields := []interface{}{
		g.Index, g.UUID, g.UtilizationGPU,
		g.MemoryTotal, g.MemoryUsed, g.MemoryFree,
		g.DriverVersion, g.Name, g.Serial,
		g.PowerDraw, g.PowerLimit,
		g.Temperature, g.Timestamp,
	}
	return fmt.Sprintf(layout, fields...)
}
// String renders the compute app as one comma-separated line with the fields
// in declaration order: timestamp, GPU name, GPU UUID, PID, process name and
// used memory followed by "MiB".
func (c *GPUComputeApp) String() string {
	// ProcessName was previously left out, so the line carried five of the
	// six fields; it now sits between PID and UsedMemory.
	return fmt.Sprintf(
		"%s, %s, %s, %s, %s, %s MiB",
		c.Timestamp,
		c.Name,
		c.UUID,
		c.PID,
		c.ProcessName,
		c.UsedMemory,
	)
}
// sanitize returns input with all leading and trailing white space removed.
func sanitize(input string) string {
	trimmed := strings.TrimSpace(input)
	return trimmed
}