Skip to content

Commit

Permalink
fix(aws): CleanupAlarmsAgent cycle to catch exceptions
Browse files (browse the repository at this point in the history)
  • Loading branch information
christosarvanitis committed Jan 17, 2025
1 parent 3f43ac2 commit 7b366a7
Showing 1 changed file with 29 additions and 27 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -37,7 +37,7 @@ import java.util.regex.Pattern

@Slf4j
class CleanupAlarmsAgent implements RunnableAgent, CustomScheduledAgent {
public static final long POLL_INTERVAL_MILLIS = TimeUnit.HOURS.toMillis(24)
public static final long POLL_INTERVAL_MILLIS = TimeUnit.MINUTES.toMillis(3)
public static final long DEFAULT_TIMEOUT_MILLIS = TimeUnit.MINUTES.toMillis(20)

public final Pattern ALARM_NAME_PATTERN = Pattern.compile(alarmsNamePattern)
Expand Down Expand Up @@ -86,36 +86,38 @@ class CleanupAlarmsAgent implements RunnableAgent, CustomScheduledAgent {
getAccounts().each { NetflixAmazonCredentials credentials ->
credentials.regions.each { AmazonCredentials.AWSRegion region ->
log.info("Looking for alarms to delete")

def cloudWatch = amazonClientProvider.getCloudWatch(credentials, region.name)
Set<String> attachedAlarms = getAttachedAlarms(amazonClientProvider.getAutoScaling(credentials, region.name))
def describeAlarmsRequest = new DescribeAlarmsRequest().withStateValue(StateValue.INSUFFICIENT_DATA)

while (true) {
def result = cloudWatch.describeAlarms(describeAlarmsRequest)

List<MetricAlarm> alarmsToDelete = result.metricAlarms.findAll {
it.stateUpdatedTimestamp.before(DateTime.now().minusDays(daysToLeave).toDate()) &&
!attachedAlarms.contains(it.alarmName) &&
ALARM_NAME_PATTERN.matcher(it.alarmName).matches()
}

if (alarmsToDelete) {
// terminate up to 20 alarms at a time (avoids any AWS limits on # of concurrent deletes)
alarmsToDelete.collate(20).each {
log.info("Deleting ${it.size()} alarms in ${credentials.name}/${region.name} " +
"(alarms: ${it.alarmName.join(", ")})")
cloudWatch.deleteAlarms(new DeleteAlarmsRequest().withAlarmNames(it.alarmName))
Thread.sleep(500)
try {
def cloudWatch = amazonClientProvider.getCloudWatch(credentials, region.name)
Set<String> attachedAlarms = getAttachedAlarms(amazonClientProvider.getAutoScaling(credentials, region.name))
def describeAlarmsRequest = new DescribeAlarmsRequest().withStateValue(StateValue.INSUFFICIENT_DATA)

while (true) {
def result = cloudWatch.describeAlarms(describeAlarmsRequest)

List<MetricAlarm> alarmsToDelete = result.metricAlarms.findAll {
it.stateUpdatedTimestamp.before(DateTime.now().minusDays(daysToLeave).toDate()) &&
!attachedAlarms.contains(it.alarmName) &&
ALARM_NAME_PATTERN.matcher(it.alarmName).matches()
}

}
if (alarmsToDelete) {
// terminate up to 20 alarms at a time (avoids any AWS limits on # of concurrent deletes)
alarmsToDelete.collate(20).each {
log.info("Deleting ${it.size()} alarms in ${credentials.name}/${region.name} " +
"(alarms: ${it.alarmName.join(", ")})")
cloudWatch.deleteAlarms(new DeleteAlarmsRequest().withAlarmNames(it.alarmName))
Thread.sleep(500)
}
}

if (result.nextToken) {
describeAlarmsRequest.withNextToken(result.nextToken)
} else {
break
if (result.nextToken) {
describeAlarmsRequest.withNextToken(result.nextToken)
} else {
break
}
}
} catch (Exception e) {
log.error("Error occurred while processing alarms for ${credentials.name}/${region.name}: ${e.message}", e)
}
}
}
Expand Down

0 comments on commit 7b366a7

Please sign in to comment.