17
17
18
18
package nextflow.cloud.aws.batch
19
19
20
- import static AwsContainerOptionsMapper.*
20
+ import static nextflow.cloud.aws.batch. AwsContainerOptionsMapper.*
21
21
22
22
import java.nio.file.Path
23
23
import java.nio.file.Paths
24
- import java.util.regex.Pattern
25
24
26
25
import com.amazonaws.services.batch.AWSBatch
27
26
import com.amazonaws.services.batch.model.AWSBatchException
@@ -33,6 +32,7 @@ import com.amazonaws.services.batch.model.DescribeJobDefinitionsRequest
33
32
import com.amazonaws.services.batch.model.DescribeJobDefinitionsResult
34
33
import com.amazonaws.services.batch.model.DescribeJobsRequest
35
34
import com.amazonaws.services.batch.model.DescribeJobsResult
35
+ import com.amazonaws.services.batch.model.EvaluateOnExit
36
36
import com.amazonaws.services.batch.model.Host
37
37
import com.amazonaws.services.batch.model.JobDefinition
38
38
import com.amazonaws.services.batch.model.JobDefinitionType
@@ -53,14 +53,12 @@ import groovy.transform.CompileStatic
53
53
import groovy.util.logging.Slf4j
54
54
import nextflow.cloud.types.CloudMachineInfo
55
55
import nextflow.container.ContainerNameValidator
56
- import nextflow.exception.NodeTerminationException
57
56
import nextflow.exception.ProcessSubmitException
58
57
import nextflow.exception.ProcessUnrecoverableException
59
58
import nextflow.executor.BashWrapperBuilder
60
59
import nextflow.executor.res.AcceleratorResource
61
60
import nextflow.processor.BatchContext
62
61
import nextflow.processor.BatchHandler
63
- import nextflow.processor.ErrorStrategy
64
62
import nextflow.processor.TaskBean
65
63
import nextflow.processor.TaskHandler
66
64
import nextflow.processor.TaskRun
@@ -74,8 +72,6 @@ import nextflow.util.CacheHelper
74
72
@Slf4j
75
73
class AwsBatchTaskHandler extends TaskHandler implements BatchHandler<String ,JobDetail > {
76
74
77
- private static Pattern TERMINATED = ~/ ^Host EC2 .* terminated.*/
78
-
79
75
private final Path exitFile
80
76
81
77
private final Path wrapperFile
@@ -108,8 +104,6 @@ class AwsBatchTaskHandler extends TaskHandler implements BatchHandler<String,Job
108
104
109
105
private Map<String ,String > environment
110
106
111
- private boolean batchNativeRetry
112
-
113
107
final static private Map<String ,String > jobDefinitions = [:]
114
108
115
109
/**
@@ -256,23 +250,15 @@ class AwsBatchTaskHandler extends TaskHandler implements BatchHandler<String,Job
256
250
final job = describeJob(jobId)
257
251
final done = job?. status in [' SUCCEEDED' , ' FAILED' ]
258
252
if ( done ) {
259
- if ( ! batchNativeRetry && TERMINATED . matcher(job. statusReason). find() ) {
260
- // keep track of the node termination error
261
- task. error = new NodeTerminationException (job. statusReason)
262
- // mark the task as ABORTED since the failure is caused by a node failure
263
- task. aborted = true
253
+ // finalize the task
254
+ task. exitStatus = readExitFile()
255
+ task. stdout = outputFile
256
+ if ( job?. status == ' FAILED' ) {
257
+ task. error = new ProcessUnrecoverableException (errReason(job))
258
+ task. stderr = executor. getJobOutputStream(jobId) ?: errorFile
264
259
}
265
260
else {
266
- // finalize the task
267
- task. exitStatus = readExitFile()
268
- task. stdout = outputFile
269
- if ( job?. status == ' FAILED' ) {
270
- task. error = new ProcessUnrecoverableException (errReason(job))
271
- task. stderr = executor. getJobOutputStream(jobId) ?: errorFile
272
- }
273
- else {
274
- task. stderr = errorFile
275
- }
261
+ task. stderr = errorFile
276
262
}
277
263
status = TaskStatus . COMPLETED
278
264
return true
@@ -620,6 +606,10 @@ class AwsBatchTaskHandler extends TaskHandler implements BatchHandler<String,Job
620
606
return [' bash' ,' -o' ,' pipefail' ,' -c' , cmd. toString() ]
621
607
}
622
608
609
+ protected maxSpotAttempts () {
610
+ return executor. awsOptions. maxSpotAttempts
611
+ }
612
+
623
613
/**
624
614
* Create a new Batch job request for the given NF {@link TaskRun}
625
615
*
@@ -636,19 +626,16 @@ class AwsBatchTaskHandler extends TaskHandler implements BatchHandler<String,Job
636
626
result. setJobQueue(getJobQueue(task))
637
627
result. setJobDefinition(getJobDefinition(task))
638
628
639
- // -- NF uses `maxRetries` *only* if `retry` error strategy is specified
640
- // otherwise delegates the retry to AWS Batch
641
- // -- NOTE: make sure the `errorStrategy` is a static value before invoking `getMaxRetries` and `getErrorStrategy`
642
- // when the errorStrategy is a closure (i.e. dynamically evaluated) value, the `task.config.getMaxRetries() && task.config.getErrorStrategy()`
643
- // condition should not be evaluated because otherwise the closure value is cached using the wrong task.attempt and task.exitStatus values.
644
- // -- use of `config.getRawValue('errorStrategy')` instead of `config.getErrorStrategy()` to prevent the resolution
645
- // of dynamic values, i.e. closures
646
- final strategy = task. config. getRawValue(' errorStrategy' )
647
- final canCheck = strategy == null || strategy instanceof CharSequence
648
- if ( canCheck && task. config. getMaxRetries() && task. config. getErrorStrategy() != ErrorStrategy . RETRY ) {
649
- def retry = new RetryStrategy (). withAttempts( task. config. getMaxRetries()+1 )
629
+ /*
630
+ * retry on spot reclaim
631
+ * https://aws.amazon.com/blogs/compute/introducing-retry-strategies-for-aws-batch/
632
+ */
633
+ final attempts = maxSpotAttempts()
634
+ if ( attempts> 0 ) {
635
+ final retry = new RetryStrategy ()
636
+ .withAttempts( attempts )
637
+ .withEvaluateOnExit( new EvaluateOnExit (). withOnReason(' Host EC2*' ). withAction(' RETRY' ) )
650
638
result. setRetryStrategy(retry)
651
- this . batchNativeRetry = true
652
639
}
653
640
654
641
// set task timeout
0 commit comments