#!/bin/bash
#
# This script takes over most of the nagios checks and actions.
# There can be soft errors or hard errors, with appropriate actions
#
# Note explaining the healthcheck script's per-VO disabling.
# If we want to disable job starts on a per-VO basis the script needs to return something like:
# NODE_IS_HEALTHY = True && (regexp("atl", Owner) =?= False) && (regexp("cms", Owner) =?= False)
# Variable Declarations
DATE=$(date)
PROGRAM=$(basename "$0")
SHOSTNAME=$(hostname -s)
CONDOR_MESG="NODE_IS_HEALTHY = True" # Message to Condor, default OK
CONDOR_MESG_GATEWAY="True"
CONDOR_MESG_GATEWAY_STATUS="Healthy"
CONDOR_MESG_PROXY="True"
CONDOR_MESG_PROXY_STATUS="Healthy"
NAGIOS_MESG="All_OK" # Message to nagios, default OK
LOGGER="/usr/bin/env logger" # logger program
LOG_MESG="Failed healthcheck:" # log message preamble, to be extended in script
TEST_MESG="" # What the test says.
TIME_SERVER="time.rl.ac.uk" # The RAL time server
TEST_FILE="/pool/condor/healthcheck_wn_test_flagfile" # A test file name
POOL_MOUNTPOINT=$(df --output=target /pool | tail -n 1) # Command to find mountpoint of /pool
CG_ENABLED="" # Variable used to check cgroups - it's a string
typeset -i RC=0 # Return code from other scripts / progs
typeset -i DEBUG_VAL=0 # Are we debugging
typeset -i EXIT_CODE=0 # Exit code for script and nagios
typeset -i VIRTUAL_WORKER=0 # Is the node a virtual node
typeset -i UPTIME_TEST_VAL=10 # minimum uptime for machine
typeset -i UPTIME_VAL=10 # actual uptime for machine
declare -a FILE_SYSTEMS # Array to hold list of filesystems
declare -a DEAD_REPO_USERS # Array to hold list of dead repo users
# we need a hash to map VOs to regex - Only declare it once
declare -A VOS
VOS["atlas.cern.ch"]=atl
VOS["atlas-condb.cern.ch"]=atl
VOS["cms.cern.ch"]=cms
VOS["lhcb.cern.ch"]=lhcb
VOS["alice.cern.ch"]=alice
# End Variable Declarations
# House keeping
# first check for debug on command line. It is useful to be able to run this in debug mode
case "$1" in
-d|--debug)
DEBUG_VAL=1
shift
;;
esac
function debug() {
if [ $DEBUG_VAL -gt 0 ]
then
echo "DEBUG: $@"
fi
}
# We only want debug - nothing else.
if [ $# -gt 0 ]
then
echo ""
echo "Wrong input for $PROGRAM, -d is the only option"
exit 3
fi
# End of housekeeping
# Start of functions
function fatal_exit() {
# function to deal with a fatal test failure
NAGIOS_MESG="$*"
CONDOR_MESG="NODE_IS_HEALTHY = False" # Message for condor
EXIT_CODE=2
echo $CONDOR_MESG # standard out to condor
echo "NODE_STATUS = \"$NAGIOS_MESG\""
LOG_MESG="$LOG_MESG ${NAGIOS_MESG}" # log message
if [ $DEBUG_VAL -gt 0 ] # if debug, we want to know what is going on
then # And we want to continue with the other tests
echo "EXIT_CODE ${EXIT_CODE}"
echo
echo "Normally exit here === Continuing in debug mode ==="
echo
return # We want to do all the tests in debug mode, so return for more
else # no debug, so write log message and exit
if [ -n "$LOG_MESG" ] # log what we do to the syslog.
then
$LOGGER $LOG_MESG
fi
exit ${EXIT_CODE}
fi
}
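# On a fatal failure the function above prints, for example (illustrative):
#   NODE_IS_HEALTHY = False
#   NODE_STATUS = "Problem: ..."
# on stdout for condor to pick up, logs the failure to syslog, and exits 2
# (nagios CRITICAL).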
# End of functions.
# Start work
debug "starting ${PROGRAM} on ${DATE}"
# Check if we are on a VM
# An OpenStack VM's metadata can be queried via the link-local address below
if curl --max-time 5 -s http://169.254.169.254/openstack/latest/meta_data.json -o /dev/null
then
VIRTUAL_WORKER=1 # Is the node a virtual node?
UPTIME_TEST_VAL=2
else
VIRTUAL_WORKER=0
UPTIME_TEST_VAL=10
fi
debug "VIRTUAL_WORKER is $VIRTUAL_WORKER"
debug "\$UPTIME_TEST_VAL is $UPTIME_TEST_VAL"
# End of vm Checks
# Start of checks - Fatal checks first.
# The first check to fire ends the script except in debug mode.
# Uptime check
debug "Testing uptime is > $UPTIME_TEST_VAL mins"
if [ $DEBUG_VAL -gt 0 ]
then
uptime | awk '{print "Machine up: " $3 " " $4}'
fi
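# uptime prints e.g. "... up 42 min, ..." during the first hour; after that
# the fourth field is no longer "min", so 100 stands in for "up long enough"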
UPTIME_VAL=$(uptime | awk '{if (match($4, "min")) print $3 ; else print "100"}')
if [ $UPTIME_VAL -le $UPTIME_TEST_VAL ]
then
fatal_exit "Problem: Machine up less than $UPTIME_TEST_VAL minutes."
fi
# end of uptime check
# Test for file /etc/condor/pool_password
debug "Checking /etc/condor/pool_password"
if [ ! -s /etc/condor/pool_password ]
then
fatal_exit "No or zero sized pool_password file found"
fi
# End of pool_password check
# test for time
debug "Testing system time against ${TIME_SERVER}"
# check the nagios plugin test exists
if [ ! -f /usr/lib64/nagios/plugins/check_ntp_time ]
then
fatal_exit "Problem: No check time script available"
fi
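# check_ntp_time's -c flag is the critical threshold: an offset of more
# than 1 second from the time server fails the node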
if [ $DEBUG_VAL -gt 0 ]
then
/usr/lib64/nagios/plugins/check_ntp_time -c 1 -H ${TIME_SERVER}
RC=$?
else
/usr/lib64/nagios/plugins/check_ntp_time -c 1 -H ${TIME_SERVER} >/dev/null 2>&1
RC=$?
fi
if [ $RC -gt 0 ]
then
fatal_exit "Problem: machine clock more than 1 second out"
fi
# end of time check.
# Check if /pool is a mountpoint on physical workernodes
if [[ $VIRTUAL_WORKER -ne 1 ]]
then
debug "Checking if /pool is a mountpoint"
mountpoint /pool > /dev/null 2>&1
RC=$?
if [ $RC -gt 0 ]
then
fatal_exit "Problem: /pool is not a mountpoint"
fi
fi
# end of mountpoint check
# Check if xfs_info is installed
debug "Checking if xfs_info is installed"
if [ ! -f /usr/sbin/xfs_info ]; then
fatal_exit "Problem: xfs_info is not installed"
fi
# end of xfs_info check
# Check to confirm xfs mountpoint is queryable
# If it's not, this can lock containers on the workernode and cause
# the "Zombie container" issue where pilot containers aren't deleted
debug "Checking if /pool is queryable"
timeout 30 xfs_info /pool > /dev/null 2>&1
RC=$?
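# GNU timeout exits with status 124 when the command it wraps times out,
# which distinguishes a hang from an ordinary xfs_info failure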
if [ $RC -eq 124 ]
then
fatal_exit "Problem: xfs_info timed out. Likely Docker containers aren't being deleted. Check CVMFS repos"
fi
# end check if /pool is queryable
# Check if /pool is formatted correctly on workernodes
debug "Checking if /pool is formatted correctly"
# Check what format /pool is.
POOL_FSTYPE=$(df -T /pool | tail -n 1 | awk '{print $2}')
if [[ $POOL_FSTYPE == "xfs" ]]
then
debug "/pool is xfs formatted, continuing"
# /pool may itself be a mountpoint or a directory on a larger filesystem
debug "The target mountpoint of /pool is $POOL_MOUNTPOINT"
xfs_info $POOL_MOUNTPOINT | grep ftype=1 > /dev/null 2>&1
RC=$?
if [ $RC -gt 0 ]
then
fatal_exit "Problem: $POOL_MOUNTPOINT is not on a properly formatted XFS filesystem"
fi
elif [[ $POOL_FSTYPE == "ext4" ]]
then
# Allow /pool to be on ext4 filesystem
# This is historical, however still supported
debug "$POOL_MOUNTPOINT is ext4 formatted, continuing"
else
fatal_exit "Problem: /pool is not on a properly formatted filesystem (ext4 or xfs)"
fi
# end check if /pool is formatted correctly
# Read-only filesystem check. Can't run the whole thing because we would need root.
# But we can check the condor directory.
debug "Testing for writable /pool/condor"
if [ ! -d /pool/condor ] # no condor dir
then
fatal_exit "Problem: No condor scratch area"
fi
trap "rm -f $TEST_FILE; exit" INT TERM EXIT
touch ${TEST_FILE} 1> /dev/null 2>&1
RC=$?
rm -f ${TEST_FILE} > /dev/null 2>&1
trap - INT TERM EXIT
if [ $RC -gt 0 ]
then
fatal_exit "Problem: Condor cant write to condor scratch area"
else
debug "OK condor scratch area is writable"
fi
# end of read-only filesystem check
# check swap but not on virtual machines
if [[ $VIRTUAL_WORKER -ne 1 ]]
then
# This test depends on the nagios check_swap script being available, so check first
if [ ! -f /usr/lib64/nagios/plugins/check_swap ]
then
fatal_exit "Problem: No nagios check swap script available"
fi
debug "Testing swap free"
if [ $DEBUG_VAL -gt 0 ]
then
/usr/lib64/nagios/plugins/check_swap -w 25% -c 20%
RC=$?
else
/usr/lib64/nagios/plugins/check_swap -w 25% -c 20% >/dev/null 2>&1
RC=$?
fi
if [ $RC -gt 0 ]
then
fatal_exit "Problem: Swap in use, less than 25% free"
fi
fi
# End of check swap.
# Specifically check for grid.cern.ch as it now supplies our middleware
debug "Checking CVMFS grid.cern.ch"
# This test depends on the nagios check_cvmfs script being available,
# so check first
if [ ! -f /usr/local/bin/check_cvmfs2.sh ]
then
debug "script, /usr/lib64/nagios/plugins/check_disk not available"
fatal_exit "Problem: No check cvmfs script available"
fi
if [ $DEBUG_VAL -gt 0 ]
then
/usr/local/bin/check_cvmfs2.sh grid.cern.ch
RC=$?
else
/usr/local/bin/check_cvmfs2.sh grid.cern.ch >/dev/null 2>&1
RC=$?
fi
if [ $RC -gt 0 ]
then
fatal_exit "Problem: CVMFS for grid.cern.ch failed"
fi
# End of check for grid.cern.ch
# check disk pool
# This test and the next one depends on the nagios check_disk script being available,
# so check first
if [ ! -f /usr/lib64/nagios/plugins/check_disk ]
then
debug "script, /usr/lib64/nagios/plugins/check_disk not available"
fatal_exit "Problem: No check disk script available"
fi
debug "Testing space on /pool"
if [ $DEBUG_VAL -gt 0 ]
then
/usr/lib64/nagios/plugins/check_disk -c 5000 -p /pool
RC=$?
else
/usr/lib64/nagios/plugins/check_disk -c 5000 -p /pool >/dev/null 2>&1
RC=$?
fi
if [ $RC -gt 0 ]
then
fatal_exit "Problem: less than 5000Mb free on /pool"
fi
# End of check on /pool
# Check all other filesystems have at least 5% free.
# Note that this purposely does not check cvmfs (i.e. fuse) filesystems
# And there is an exception for /mnt/context
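# The sed keeps only the mount point (the field after the Use% column) of
# each /dev-backed line in df's output; -x fuse excludes cvmfs mounts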
FILE_SYSTEMS=($(df -lh -x fuse | sed -ne '/^\/dev/s/^.*\% \([^ ]*\)$/\1/p'))
debug "Testing local filesystems for min 5% free space"
for dir in ${FILE_SYSTEMS[@]}
do
# We want to skip nubes-context (/mnt/context) because it is always 100%
if [ "$dir" == "/mnt/context" ]
then
debug "Skipping $dir because we dont want it"
continue
fi
if [ $DEBUG_VAL -gt 0 ]
then
/usr/lib64/nagios/plugins/check_disk -c 5% -p $dir
RC=$?
else
/usr/lib64/nagios/plugins/check_disk -c 5% -p $dir >/dev/null 2>&1
RC=$?
fi
if [ $RC -gt 0 ]
then
debug "less than 5% free on $dir"
TEST_MESG="${TEST_MESG} Problem: less than 5% free on $dir "
fi
done
if [ ! -z "$TEST_MESG" ]
then
fatal_exit $TEST_MESG
fi
# End of filesystems check
# Check cgroups on worker nodes that don't have Docker installed
debug "Testing cgroups"
if [ ! -f "/usr/bin/docker" ]
then
CG_ENABLED=$(condor_config_val BASE_CGROUP 2>/dev/null)
debug "condor_config_val for BASE_CGROUP is: $CG_ENABLED"
if [ ! -z "$CG_ENABLED" ]
then
RC=0
if [ $DEBUG_VAL -gt 0 ]
then
debug "cgconfig status is: "
/sbin/service cgconfig status
RC=$?
else
/sbin/service cgconfig status >/dev/null 2>&1
RC=$?
fi
if [ $RC -gt 0 ]
then
debug "cgroups configured in condor but cgconfig not running"
fatal_exit "Problem: cgconfig not running"
fi
fi
fi
# End Check cgroups
# Check if Docker daemon is ok
if [ -f "/usr/bin/docker" ]
then
# A basic check that the Docker daemon is running
sudo /usr/bin/docker ps >/dev/null 2>&1
RC=$?
if [ $RC -ne 0 ]
then
fatal_exit "Problem: Cannot obtain list of Docker containers"
fi
# Check that containerd is running
pidof containerd > /dev/null 2>&1
RC=$?
if [ $RC -ne 0 ]
then
fatal_exit "Problem: containerd is not running"
fi
# Check that containers can be created successfully
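# (--rm removes the container once it exits; -m 64m caps its memory so the
# test container cannot disturb running jobs)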
sudo /usr/bin/docker run --rm -m 64m busybox date >/dev/null 2>&1
RC=$?
if [ $RC -ne 0 ]
then
fatal_exit "Problem: Unable to run Docker containers"
fi
fi
# End Check docker
# Check if local Echo xrootd gateway is running
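# "=?=" is the ClassAd meta-equals (it matches UNDEFINED rather than
# propagating it), so on failure the substituted expression below keeps the
# node usable only for jobs with WantEchoXrootd unset or False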
if [ -f "/usr/bin/docker" ]
then
{ gwstatus=$(sudo /usr/bin/docker inspect --format '{{ .State.Health.Status }}' xrootd-gateway); } 2>/dev/null
RC=$?
if [ $RC -ne 0 ]; then
TEST_MESG="${TEST_MESG} Problem: local xrootd gateway"
CONDOR_MESG_GATEWAY="(WantEchoXrootd =?= False || WantEchoXrootd =?= UNDEFINED)"
CONDOR_MESG_GATEWAY_STATUS="None"
else
if [ "$gwstatus" == "unhealthy" ]; then
TEST_MESG="${TEST_MESG} Problem: local xrootd gateway"
CONDOR_MESG_GATEWAY="(WantEchoXrootd =?= False || WantEchoXrootd =?= UNDEFINED)"
CONDOR_MESG_GATEWAY_STATUS="Unhealthy"
fi
fi
fi
# Check if local Echo xrootd proxy is running
if [ -f "/usr/bin/docker" ]
then
{ gwstatus=$(sudo /usr/bin/docker inspect --format '{{ .State.Health.Status }}' xrootd-proxy); } 2>/dev/null
RC=$?
if [ $RC -ne 0 ]; then
TEST_MESG="${TEST_MESG} Problem: local xrootd proxy"
CONDOR_MESG_PROXY="(WantEchoXrootd =?= False || WantEchoXrootd =?= UNDEFINED)"
CONDOR_MESG_PROXY_STATUS="None"
else
if [ "$gwstatus" == "unhealthy" ]; then
TEST_MESG="${TEST_MESG} Problem: local xrootd proxy"
CONDOR_MESG_PROXY="(WantEchoXrootd =?= False || WantEchoXrootd =?= UNDEFINED)"
CONDOR_MESG_PROXY_STATUS="Unhealthy"
fi
fi
fi
## Non-fatal checks: these don't call fatal_exit but simply set NAGIOS_MESG,
## CONDOR_MESG and EXIT_CODE, leaving the script to exit normally at the end
# Check CVMFS for selected VOs
debug "CVMFS check for selected VOs"
for vo in "${!VOS[@]}"
do
debug "Checking $vo"
if [ $DEBUG_VAL -gt 0 ]
then
/usr/local/bin/check_cvmfs2.sh $vo
RC=$?
else
/usr/local/bin/check_cvmfs2.sh $vo >/dev/null 2>&1
RC=$?
fi
if [ $RC -ne 0 ]
then
TEST_MESG="${TEST_MESG} Problem: CVMFS for $vo"
DEAD_REPO_USERS+=("${VOS[$vo]}")
fi
done
# let's see if we got any DEAD_REPO_USERS
# if more than 2, it is a fatal_exit
# if only 1 or 2, set CONDOR_MESG, NAGIOS_MESG and EXIT_CODE appropriately
debug "After checking CVMFS, the number of problem users is ${#DEAD_REPO_USERS[@]}"
if [ ${#DEAD_REPO_USERS[@]} -gt 2 ]
then
NAGIOS_MESG=$TEST_MESG
fatal_exit "More that 2 CVMFS repo broken"
elif [ ${#DEAD_REPO_USERS[@]} -gt 0 ]
then
NAGIOS_MESG=$TEST_MESG
for dead in ${DEAD_REPO_USERS[@]}
do
CONDOR_MESG="$CONDOR_MESG && (regexp(\"$dead\", Owner) =?= False)"
done
EXIT_CODE=1 # Nagios warn
fi
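# Example (illustrative): if only atlas CVMFS is broken, CONDOR_MESG becomes
#   NODE_IS_HEALTHY = True && (regexp("atl", Owner) =?= False)
# so job starts are disabled for atlas owners only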
# end of cvmfs checks
## End of non-fatal checks
# Handle status of local xrootd gateway
CONDOR_MESG="${CONDOR_MESG} && ${CONDOR_MESG_GATEWAY}"
# Handle status of local xrootd proxy
CONDOR_MESG="${CONDOR_MESG} && ${CONDOR_MESG_PROXY}"
# Now just exit with the correct codes.
debug "NAGIOS_MESG $NAGIOS_MESG"
debug "EXIT_CODE $EXIT_CODE"
echo $CONDOR_MESG
echo "NODE_STATUS = \"$NAGIOS_MESG\""
echo "ECHO_XROOTD_GATEWAY_STATUS = \"$CONDOR_MESG_GATEWAY_STATUS\""
echo "ECHO_XROOTD_PROXY_STATUS = \"$CONDOR_MESG_PROXY_STATUS\""
# We don't bother logging non-fatal runs.
exit ${EXIT_CODE}