96
96
"""
97
97
_AGENT_THROTTLED_TIME_THRESHOLD = 120 # 2 minutes
98
98
99
+
99
100
class DisableCgroups(object):
    """
    String constants naming which part of the cgroup setup should be turned off.

    The value is compared against the second argument of the configurator's
    disable() method to decide which enabled-flags are cleared.
    """
    # Turn off cgroups for the agent and for the extensions.
    ALL = "all"
    # Turn off cgroups for the agent only.
    AGENT = "agent"
    # Turn off cgroups for the extensions only.
    EXTENSIONS = "extensions"
103
104
105
+
104
106
def _log_cgroup_info (format_string , * args ):
105
107
message = format_string .format (* args )
106
108
logger .info ("[CGI] " + message )
@@ -120,6 +122,7 @@ class CGroupConfigurator(object):
120
122
NOTE: with the exception of start_extension_command, none of the methods in this class
121
123
raise exceptions (cgroup operations should not block extensions)
122
124
"""
125
+
123
126
class _Impl (object ):
124
127
def __init__ (self ):
125
128
self ._initialized = False
@@ -165,7 +168,9 @@ def initialize(self):
165
168
self .__setup_azure_slice ()
166
169
167
170
cpu_controller_root , memory_controller_root = self .__get_cgroup_controllers ()
168
- self ._agent_cpu_cgroup_path , self ._agent_memory_cgroup_path = self .__get_agent_cgroups (agent_slice , cpu_controller_root , memory_controller_root )
171
+ self ._agent_cpu_cgroup_path , self ._agent_memory_cgroup_path = self .__get_agent_cgroups (agent_slice ,
172
+ cpu_controller_root ,
173
+ memory_controller_root )
169
174
170
175
if self ._agent_cpu_cgroup_path is not None :
171
176
_log_cgroup_info ("Agent CPU cgroup: {0}" , self ._agent_cpu_cgroup_path )
@@ -218,7 +223,7 @@ def __collect_agent_unit_files_telemetry():
218
223
agent_service_name = get_osutil ().get_service_name ()
219
224
try :
220
225
fragment_path = systemd .get_unit_property (agent_service_name , "FragmentPath" )
221
- if fragment_path != "/lib/ systemd/system/{0}.service" . format ( agent_service_name ):
226
+ if fragment_path != systemd . get_agent_unit_file ( ):
222
227
agent_unit_files .append (fragment_path )
223
228
except Exception as exception :
224
229
_log_cgroup_warning ("Failed to query the agent's FragmentPath: {0}" , ustr (exception ))
@@ -233,7 +238,8 @@ def __collect_agent_unit_files_telemetry():
233
238
for unit_file in agent_unit_files :
234
239
try :
235
240
with open (unit_file , "r" ) as file_object :
236
- _log_cgroup_info ("Found a custom unit file for the agent: {0}\n {1}" , unit_file , file_object .read ())
241
+ _log_cgroup_info ("Found a custom unit file for the agent: {0}\n {1}" , unit_file ,
242
+ file_object .read ())
237
243
except Exception as exception :
238
244
_log_cgroup_warning ("Can't read {0}: {1}" , unit_file , ustr (exception ))
239
245
@@ -269,7 +275,8 @@ def __get_cgroup_controllers(self):
269
275
#
270
276
cgroup2_mount_point , cgroup2_controllers = self ._cgroups_api .get_cgroup2_controllers ()
271
277
if cgroup2_mount_point is not None :
272
- _log_cgroup_info ("cgroups v2 mounted at {0}. Controllers: [{1}]" , cgroup2_mount_point , cgroup2_controllers )
278
+ _log_cgroup_info ("cgroups v2 mounted at {0}. Controllers: [{1}]" , cgroup2_mount_point ,
279
+ cgroup2_controllers )
273
280
274
281
return cpu_controller_root , memory_controller_root
275
282
@@ -322,7 +329,7 @@ def __setup_azure_slice():
322
329
323
330
if not os .path .exists (logcollector_slice ):
324
331
slice_contents = _LOGCOLLECTOR_SLICE_CONTENTS_FMT .format (cpu_quota = _LOGCOLLECTOR_CPU_QUOTA ,
325
- memory_limit = _LOGCOLLECTOR_MEMORY_LIMIT )
332
+ memory_limit = _LOGCOLLECTOR_MEMORY_LIMIT )
326
333
327
334
files_to_create .append ((logcollector_slice , slice_contents ))
328
335
@@ -408,7 +415,8 @@ def __get_agent_cgroups(self, agent_slice, cpu_controller_root, memory_controlle
408
415
agent_unit_name = systemd .get_agent_unit_name ()
409
416
410
417
expected_relative_path = os .path .join (agent_slice , agent_unit_name )
411
- cpu_cgroup_relative_path , memory_cgroup_relative_path = self ._cgroups_api .get_process_cgroup_relative_paths ("self" )
418
+ cpu_cgroup_relative_path , memory_cgroup_relative_path = self ._cgroups_api .get_process_cgroup_relative_paths (
419
+ "self" )
412
420
413
421
if cpu_cgroup_relative_path is None :
414
422
_log_cgroup_warning ("The agent's process is not within a CPU cgroup" )
@@ -417,11 +425,11 @@ def __get_agent_cgroups(self, agent_slice, cpu_controller_root, memory_controlle
417
425
_log_cgroup_info ('CPUAccounting: {0}' , systemd .get_unit_property (agent_unit_name , "CPUAccounting" ))
418
426
_log_cgroup_info ('CPUQuota: {0}' , systemd .get_unit_property (agent_unit_name , "CPUQuotaPerSecUSec" ))
419
427
else :
420
- cpu_cgroup_relative_path = None # Set the path to None to prevent monitoring
421
428
_log_cgroup_warning (
422
429
"The Agent is not in the expected CPU cgroup; will not enable monitoring. Cgroup:[{0}] Expected:[{1}]" ,
423
430
cpu_cgroup_relative_path ,
424
431
expected_relative_path )
432
+ cpu_cgroup_relative_path = None # Set the path to None to prevent monitoring
425
433
426
434
if memory_cgroup_relative_path is None :
427
435
_log_cgroup_warning ("The agent's process is not within a memory cgroup" )
@@ -430,11 +438,11 @@ def __get_agent_cgroups(self, agent_slice, cpu_controller_root, memory_controlle
430
438
memory_accounting = systemd .get_unit_property (agent_unit_name , "MemoryAccounting" )
431
439
_log_cgroup_info ('MemoryAccounting: {0}' , memory_accounting )
432
440
else :
433
- memory_cgroup_relative_path = None # Set the path to None to prevent monitoring
434
441
_log_cgroup_info (
435
442
"The Agent is not in the expected memory cgroup; will not enable monitoring. CGroup:[{0}] Expected:[{1}]" ,
436
443
memory_cgroup_relative_path ,
437
444
expected_relative_path )
445
+ memory_cgroup_relative_path = None # Set the path to None to prevent monitoring
438
446
439
447
if cpu_controller_root is not None and cpu_cgroup_relative_path is not None :
440
448
agent_cpu_cgroup_path = os .path .join (cpu_controller_root , cpu_cgroup_relative_path )
@@ -462,30 +470,30 @@ def extensions_enabled(self):
462
470
463
471
def enable(self):
    """
    Marks cgroups as enabled for both the agent and the extensions, then hands the value of
    conf.get_agent_cpu_quota() to __set_cpu_quota.

    Raises CGroupsException when self.supported() is false; in that case no state is modified.
    """
    platform_supported = self.supported()
    if not platform_supported:
        error_message = "Attempted to enable cgroups, but they are not supported on the current platform"
        raise CGroupsException(error_message)

    # Both flags are set before the quota is applied, same as the original statement order.
    self._agent_cgroups_enabled = True
    self._extensions_cgroups_enabled = True

    agent_cpu_quota = conf.get_agent_cpu_quota()
    self.__set_cpu_quota(agent_cpu_quota)
469
478
470
def disable(self, reason, disable_cgroups):
    """
    Turns off cgroup handling for the scope selected by disable_cgroups and reports why.

    :param reason: Text interpolated into the log/telemetry message
    :param disable_cgroups: One of the DisableCgroups constants (ALL, AGENT or EXTENSIONS); any other
                            value changes no state, but the message below is still logged and reported
    """
    # Todo: disable/reset extension when ext quotas introduced
    everything = disable_cgroups == DisableCgroups.ALL
    agent_only = not everything and disable_cgroups == DisableCgroups.AGENT
    extensions_only = not everything and not agent_only and disable_cgroups == DisableCgroups.EXTENSIONS

    if everything or agent_only:
        self._agent_cgroups_enabled = False
    if everything or extensions_only:
        self._extensions_cgroups_enabled = False

    if everything or agent_only:
        # The agent's quota is reset whenever the agent itself is affected
        self.__reset_agent_cpu_quota()
        if everything:
            CGroupsTelemetry.reset()
        else:
            agent_cpu_cgroup = CpuCgroup(AGENT_NAME_TELEMETRY, self._agent_cpu_cgroup_path)
            CGroupsTelemetry.stop_tracking(agent_cpu_cgroup)

    message = "[CGW] Disabling resource usage monitoring. Reason: {0}".format(reason)
    # log as INFO for now, in the future it should be logged as WARNING
    logger.info(message)
    add_event(op=WALAEventOperation.CGroupsDisabled, message=message, is_success=False, log_event=False)
487
496
488
-
489
497
@staticmethod
490
498
def __set_cpu_quota (quota ):
491
499
"""
@@ -510,6 +518,7 @@ def __reset_agent_cpu_quota():
510
518
logger .info ("Resetting agent's CPUQuota" )
511
519
if CGroupConfigurator ._Impl .__try_set_cpu_quota ('' ): # setting an empty value resets to the default (infinity)
512
520
CGroupsTelemetry .set_track_throttled_time (False )
521
+ _log_cgroup_info ('CPUQuota: {0}' , systemd .get_unit_property (systemd .get_agent_unit_name (), "CPUQuotaPerSecUSec" ))
513
522
514
523
@staticmethod
515
524
def __try_set_cpu_quota (quota ):
@@ -589,7 +598,8 @@ def _check_processes_in_agent_cgroup(self):
589
598
if process in (daemon , extension_handler ) or process in systemd_run_commands :
590
599
continue
591
600
# systemd_run_commands contains the shell that started systemd-run, so we also need to check for the parent
592
- if self ._get_parent (process ) in systemd_run_commands and self ._get_command (process ) == 'systemd-run' :
601
+ if self ._get_parent (process ) in systemd_run_commands and self ._get_command (
602
+ process ) == 'systemd-run' :
593
603
continue
594
604
# check if the process is a command started by the agent or a descendant of one of those commands
595
605
current = process
@@ -697,7 +707,8 @@ def stop_tracking_extension_cgroups(self, extension_name):
697
707
except Exception as exception :
698
708
logger .info ("Failed to stop tracking resource usage for the extension service: {0}" , ustr (exception ))
699
709
700
- def start_extension_command (self , extension_name , command , cmd_name , timeout , shell , cwd , env , stdout , stderr , error_code = ExtensionErrorCodes .PluginUnknownFailure ):
710
+ def start_extension_command (self , extension_name , command , cmd_name , timeout , shell , cwd , env , stdout , stderr ,
711
+ error_code = ExtensionErrorCodes .PluginUnknownFailure ):
701
712
"""
702
713
Starts a command (install/enable/etc) for an extension and adds the command's PID to the extension's cgroup
703
714
:param extension_name: The extension executing the command
@@ -713,9 +724,12 @@ def start_extension_command(self, extension_name, command, cmd_name, timeout, sh
713
724
"""
714
725
if self .enabled ():
715
726
try :
716
- return self ._cgroups_api .start_extension_command (extension_name , command , cmd_name , timeout , shell = shell , cwd = cwd , env = env , stdout = stdout , stderr = stderr , error_code = error_code )
727
+ return self ._cgroups_api .start_extension_command (extension_name , command , cmd_name , timeout ,
728
+ shell = shell , cwd = cwd , env = env , stdout = stdout ,
729
+ stderr = stderr , error_code = error_code )
717
730
except SystemdRunError as exception :
718
- reason = 'Failed to start {0} using systemd-run, will try invoking the extension directly. Error: {1}' .format (extension_name , ustr (exception ))
731
+ reason = 'Failed to start {0} using systemd-run, will try invoking the extension directly. Error: {1}' .format (
732
+ extension_name , ustr (exception ))
719
733
self .disable (reason , DisableCgroups .ALL )
720
734
# fall-through and re-invoke the extension
721
735
@@ -735,7 +749,7 @@ def setup_extension_slice(self, extension_name):
735
749
if self .enabled ():
736
750
unit_file_install_path = systemd .get_unit_file_install_path ()
737
751
extension_slice_path = os .path .join (unit_file_install_path ,
738
- SystemdCgroupsApi .get_extension_slice_name (extension_name ))
752
+ SystemdCgroupsApi .get_extension_slice_name (extension_name ))
739
753
try :
740
754
slice_contents = _EXTENSION_SLICE_CONTENTS .format (extension_name = extension_name )
741
755
CGroupConfigurator ._Impl .__create_unit_file (extension_slice_path , slice_contents )
@@ -755,12 +769,6 @@ def remove_extension_slice(self, extension_name):
755
769
if os .path .exists (extension_slice_path ):
756
770
self .stop_tracking_extension_cgroups (extension_name )
757
771
CGroupConfigurator ._Impl .__cleanup_unit_file (extension_slice_path )
758
- # stop the unit gracefully; the extensions slices will be removed from /sys/fs/cgroup path
759
- try :
760
- logger .info ("Executing systemctl stop {0}" .format (extension_slice_name ))
761
- shellutil .run_command (["systemctl" , "stop" , extension_slice_name ])
762
- except Exception as exception :
763
- _log_cgroup_warning ("systemctl stop failed (remove slice): {0}" , ustr (exception ))
764
772
765
773
def set_extension_services_cpu_memory_quota (self , services_list ):
766
774
"""
0 commit comments