optAllocationRequestId) {
- LOGGER.info("Requesting {} containers with resource={} and allocation request id = {}", numContainers, resource, optAllocationRequestId);
+ LOGGER.info("Requesting {} containers with resource = {} and allocation request id = {}", numContainers, resource, optAllocationRequestId);
IntStream.range(0, numContainers)
.forEach(i -> requestContainer(Optional.absent(), resource, optAllocationRequestId));
}
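// Illustrative call sequence, not part of this patch (the profile object and id below are
// hypothetical, and the enclosing method name is assumed from the signature fragment above):
// a caller ties a WorkerProfile to the containers it requests by registering the profile under
// an allocation request id before requesting containers with that same id, which is how
// onContainersAllocated() later resolves the profile from workerProfileByAllocationRequestId.
//   workerProfileByAllocationRequestId.putIfAbsent(7L, largeMemoryProfile);
//   requestContainers(2, Resource.newInstance(4096, 2), Optional.of(7L));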
@@ -553,16 +491,7 @@ protected ByteBuffer getSecurityTokens() throws IOException {
}
@VisibleForTesting
- protected String buildContainerCommand(Container container, String helixParticipantId, String helixInstanceTag) {
- long allocationRequestId = container.getAllocationRequestId();
- WorkerProfile workerProfile = Optional.fromNullable(this.workerProfileByAllocationRequestId.get(allocationRequestId))
- .or(() -> {
- LOGGER.warn("No Worker Profile found for {}, so falling back to default", allocationRequestId);
- return this.workerProfileByAllocationRequestId.computeIfAbsent(DEFAULT_ALLOCATION_REQUEST_ID, k -> {
- LOGGER.warn("WARNING: (LIKELY) UNEXPECTED CONCURRENCY: No Worker Profile even yet mapped to the default allocation request ID {} - creating one now", DEFAULT_ALLOCATION_REQUEST_ID);
- return new WorkerProfile(this.config);
- });
- });
+ protected String buildContainerCommand(Container container, String workerProfileName, WorkerProfile workerProfile) {
Config workerProfileConfig = workerProfile.getConfig();
double workerJvmMemoryXmxRatio = ConfigUtils.getDouble(workerProfileConfig,
@@ -574,13 +503,13 @@ protected String buildContainerCommand(Container container, String helixParticip
GobblinYarnConfigurationKeys.DEFAULT_CONTAINER_JVM_MEMORY_OVERHEAD_MBS);
Preconditions.checkArgument(workerJvmMemoryXmxRatio >= 0 && workerJvmMemoryXmxRatio <= 1,
- workerProfile.getName() + " : " + GobblinYarnConfigurationKeys.CONTAINER_JVM_MEMORY_XMX_RATIO_KEY +
+ workerProfileName + " : " + GobblinYarnConfigurationKeys.CONTAINER_JVM_MEMORY_XMX_RATIO_KEY +
" must be between 0 and 1 inclusive");
long containerMemoryMbs = container.getResource().getMemorySize();
Preconditions.checkArgument(workerJvmMemoryOverheadMbs < containerMemoryMbs * workerJvmMemoryXmxRatio,
- workerProfile.getName() + " : " + GobblinYarnConfigurationKeys.CONTAINER_JVM_MEMORY_OVERHEAD_MBS_KEY +
+ workerProfileName + " : " + GobblinYarnConfigurationKeys.CONTAINER_JVM_MEMORY_OVERHEAD_MBS_KEY +
" cannot be more than " + GobblinYarnConfigurationKeys.CONTAINER_MEMORY_MBS_KEY + " * " +
GobblinYarnConfigurationKeys.CONTAINER_JVM_MEMORY_XMX_RATIO_KEY);
@@ -602,14 +531,8 @@ protected String buildContainerCommand(Container container, String helixParticip
.append(" --").append(GobblinClusterConfigurationKeys.APPLICATION_NAME_OPTION_NAME)
.append(" ").append(this.applicationName)
.append(" --").append(GobblinClusterConfigurationKeys.APPLICATION_ID_OPTION_NAME)
- .append(" ").append(this.applicationId)
- .append(" --").append(GobblinClusterConfigurationKeys.HELIX_INSTANCE_NAME_OPTION_NAME)
- .append(" ").append(helixParticipantId);
+ .append(" ").append(this.applicationId);
- if (!Strings.isNullOrEmpty(helixInstanceTag)) {
- containerCommand.append(" --").append(GobblinClusterConfigurationKeys.HELIX_INSTANCE_TAGS_OPTION_NAME)
- .append(" ").append(helixInstanceTag);
- }
return containerCommand.append(" 1>").append(ApplicationConstants.LOG_DIR_EXPANSION_VAR).append(File.separator).append(
containerProcessName).append(".").append(ApplicationConstants.STDOUT)
.append(" 2>").append(ApplicationConstants.LOG_DIR_EXPANSION_VAR).append(File.separator).append(
@@ -637,103 +560,11 @@ private boolean shouldStickToTheSameNode(int containerExitStatus) {
}
/**
- * Handle the completion of a container. A new container will be requested to replace the one
- * that just exited. Depending on the exit status and if container host affinity is enabled,
- * the new container may or may not try to be started on the same node.
- *
- * A container completes in either of the following conditions: 1) some error happens in the
- * container and caused the container to exit, 2) the container gets killed due to some reason,
- * for example, if it runs over the allowed amount of virtual or physical memory, 3) the gets
- * preempted by the ResourceManager, or 4) the container gets stopped by the ApplicationMaster.
- * A replacement container is needed in all but the last case.
+ * Handle the completion of a container.
+ * Just removes the containerId from {@link #containerMap}
*/
protected void handleContainerCompletion(ContainerStatus containerStatus) {
- ContainerInfo completedContainerInfo = this.containerMap.remove(containerStatus.getContainerId());
- //Get the Helix instance name for the completed container. Because callbacks are processed asynchronously, we might
- //encounter situations where handleContainerCompletion() is called before onContainersAllocated(), resulting in the
- //containerId missing from the containersMap.
- // We use removedContainerID to remember these containers and remove them from containerMap later when we call requestTargetNumberOfContainers method
- if (completedContainerInfo == null) {
- removedContainerID.putIfAbsent(containerStatus.getContainerId(), "");
- }
- String completedInstanceName = UNKNOWN_HELIX_INSTANCE;
-
- String helixTag = completedContainerInfo == null ? helixInstanceTags : completedContainerInfo.getHelixTag();
- if (completedContainerInfo != null) {
- allocatedContainerCountMap.get(helixTag).decrementAndGet();
- }
-
- LOGGER.info(String.format("Container %s running Helix instance %s with tag %s has completed with exit status %d",
- containerStatus.getContainerId(), completedInstanceName, helixTag, containerStatus.getExitStatus()));
-
- if (!Strings.isNullOrEmpty(containerStatus.getDiagnostics())) {
- LOGGER.info(String.format("Received the following diagnostics information for container %s: %s",
- containerStatus.getContainerId(), containerStatus.getDiagnostics()));
- }
-
- switch(containerStatus.getExitStatus()) {
- case(ContainerExitStatus.ABORTED):
- if (handleAbortedContainer(containerStatus, completedContainerInfo, completedInstanceName)) {
- return;
- }
- break;
- case(1): // Same as linux exit status 1 Often occurs when launch_container.sh failed
- LOGGER.info("Exit status 1. CompletedContainerInfo={}", completedContainerInfo);
- break;
- default:
- break;
- }
-
- if (this.shutdownInProgress) {
- return;
- }
- if(completedContainerInfo != null) {
- this.helixInstanceRetryCount.putIfAbsent(completedInstanceName, new AtomicInteger(0));
- int retryCount = this.helixInstanceRetryCount.get(completedInstanceName).incrementAndGet();
-
- // Populate event metadata
- Optional<ImmutableMap.Builder<String, String>> eventMetadataBuilder = Optional.absent();
- if (this.eventSubmitter.isPresent()) {
- eventMetadataBuilder = Optional.of(buildContainerStatusEventMetadata(containerStatus));
- eventMetadataBuilder.get().put(GobblinYarnEventConstants.EventMetadata.HELIX_INSTANCE_ID, completedInstanceName);
- eventMetadataBuilder.get().put(GobblinYarnEventConstants.EventMetadata.CONTAINER_STATUS_RETRY_ATTEMPT, retryCount + "");
- }
-
- if (this.helixInstanceMaxRetries > 0 && retryCount > this.helixInstanceMaxRetries) {
- if (this.eventSubmitter.isPresent()) {
- this.eventSubmitter.get()
- .submit(GobblinYarnEventConstants.EventNames.HELIX_INSTANCE_COMPLETION, eventMetadataBuilder.get().build());
- }
-
- LOGGER.warn("Maximum number of retries has been achieved for Helix instance " + completedInstanceName);
- return;
- }
-
- // Add the Helix instance name of the completed container to the set of unused
- // instance names so they can be reused by a replacement container.
- LOGGER.info("Adding instance {} to the pool of unused instances", completedInstanceName);
- this.unusedHelixInstanceNames.add(completedInstanceName);
-
- /**
- * NOTE: logic for handling container failure is removed because {@link #YarnService} relies on the auto scaling manager
- * to control the number of containers by polling helix for the current number of tasks
- * Without that integration, that code requests too many containers when there are exceptions and overloads yarn
- */
- }
- }
-
- private boolean handleAbortedContainer(ContainerStatus containerStatus, ContainerInfo completedContainerInfo,
- String completedInstanceName) {
- if (this.releasedContainerCache.getIfPresent(containerStatus.getContainerId()) != null) {
- LOGGER.info("Container release requested, so not spawning a replacement for containerId {}", containerStatus.getContainerId());
- if (completedContainerInfo != null) {
- LOGGER.info("Adding instance {} to the pool of unused instances", completedInstanceName);
- this.unusedHelixInstanceNames.add(completedInstanceName);
- }
- return true;
- }
- LOGGER.info("Container {} aborted due to lost NM", containerStatus.getContainerId());
- return false;
+ this.containerMap.remove(containerStatus.getContainerId());
}
private ImmutableMap.Builder<String, String> buildContainerStatusEventMetadata(ContainerStatus containerStatus) {
@@ -782,33 +613,28 @@ public void onContainersCompleted(List<ContainerStatus> statuses) {
@Override
public void onContainersAllocated(List<Container> containers) {
for (final Container container : containers) {
+ long allocationRequestId = container.getAllocationRequestId();
+ WorkerProfile workerProfile = Optional.fromNullable(workerProfileByAllocationRequestId.get(allocationRequestId))
+ .or(() -> {
+ LOGGER.warn("No Worker Profile found for {}, so falling back to default", allocationRequestId);
+ return workerProfileByAllocationRequestId.computeIfAbsent(DEFAULT_ALLOCATION_REQUEST_ID, k -> {
+ LOGGER.warn("WARNING: (LIKELY) UNEXPECTED CONCURRENCY: No Worker Profile even yet mapped to the default allocation request ID {} - creating one now", DEFAULT_ALLOCATION_REQUEST_ID);
+ return new WorkerProfile(config);
+ });
+ });
+
String containerId = container.getId().toString();
- String containerHelixTag = helixInstanceTags;
if (eventSubmitter.isPresent()) {
eventSubmitter.get().submit(GobblinYarnEventConstants.EventNames.CONTAINER_ALLOCATION,
GobblinYarnMetricTagNames.CONTAINER_ID, containerId);
}
- LOGGER.info("Container {} has been allocated with resource {} for helix tag {}",
- container.getId(), container.getResource(), containerHelixTag);
-
- //Iterate over the (thread-safe) set of unused instances to find the first instance that is not currently live.
- //Once we find a candidate instance, it is removed from the set.
- String instanceName = null;
-
- //Ensure that updates to unusedHelixInstanceNames are visible to other threads that might concurrently
- //invoke the callback on container allocation.
- synchronized (this) {
- Iterator<String> iterator = unusedHelixInstanceNames.iterator();
- while (iterator.hasNext()) {
- instanceName = iterator.next();
- }
- }
+ LOGGER.info("Container {} has been allocated with resource {} for Worker Profile {}",
+ container.getId(), container.getResource(), WorkforceProfiles.renderName(workerProfile.getName()));
- ContainerInfo containerInfo = new ContainerInfo(container, instanceName, containerHelixTag);
+ ContainerInfo containerInfo = new ContainerInfo(container,
+ WorkforceProfiles.renderName(workerProfile.getName()), workerProfile);
containerMap.put(container.getId(), containerInfo);
- allocatedContainerCountMap.putIfAbsent(containerHelixTag, new AtomicInteger(0));
- allocatedContainerCountMap.get(containerHelixTag).incrementAndGet();
// Find matching requests and remove the request (YARN-660). We, the scheduler, are responsible
// for cleaning up requests after allocation based on the design in the described ticket.
@@ -969,26 +795,26 @@ public void onStopContainerError(ContainerId containerId, Throwable t) {
}
}
- // Class encapsulates Container instances, Helix participant IDs of the containers, Helix Tag, and
+ // Class encapsulates the Container instance, the WorkerProfile name used for logging, the WorkerProfile, and
// initial startup command
@Getter
class ContainerInfo {
private final Container container;
- private final String helixParticipantId;
- private final String helixTag;
+ private final String workerProfileName; // Storing this to avoid calling WorkforceProfiles.renderName(workerProfile.getName()) while logging
+ private final WorkerProfile workerProfile;
private final String startupCommand;
- public ContainerInfo(Container container, String helixParticipantId, String helixTag) {
+ public ContainerInfo(Container container, String workerProfileName, WorkerProfile workerProfile) {
this.container = container;
- this.helixParticipantId = helixParticipantId;
- this.helixTag = helixTag;
- this.startupCommand = YarnService.this.buildContainerCommand(container, helixParticipantId, helixTag);
+ this.workerProfileName = workerProfileName;
+ this.workerProfile = workerProfile;
+ this.startupCommand = YarnService.this.buildContainerCommand(container, workerProfileName, workerProfile);
}
@Override
public String toString() {
- return String.format("ContainerInfo{ container=%s, helixParticipantId=%s, helixTag=%s, startupCommand=%s }",
- container.getId(), helixParticipantId, helixTag, startupCommand);
+ return String.format("ContainerInfo{ container=%s, workerProfileName=%s, startupCommand=%s }",
+ container.getId(), workerProfileName, startupCommand);
}
}
}
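// Standalone illustration, not part of this patch: the lookup-with-fallback pattern that
// onContainersAllocated now uses, where an unknown allocation request id falls back to the
// profile registered under DEFAULT_ALLOCATION_REQUEST_ID, lazily creating a default entry if
// even that is missing. The class, map contents, and the id value 0L are hypothetical; the
// Guava Optional.or(Supplier) and ConcurrentHashMap.computeIfAbsent calls mirror the code above.
import java.util.concurrent.ConcurrentHashMap;
import com.google.common.base.Optional;

final class ProfileLookupSketch {
  static final long DEFAULT_ALLOCATION_REQUEST_ID = 0L;                        // value assumed
  static final ConcurrentHashMap<Long, String> profileById = new ConcurrentHashMap<>();

  static String resolve(long allocationRequestId) {
    // Prefer the profile registered for this id; otherwise fall back to (and, if needed, create)
    // the default entry, just as the diff above does with WorkerProfile objects.
    return Optional.fromNullable(profileById.get(allocationRequestId))
        .or(() -> profileById.computeIfAbsent(DEFAULT_ALLOCATION_REQUEST_ID, k -> "baseline"));
  }

  public static void main(String[] args) {
    profileById.put(7L, "large-memory");
    System.out.println(resolve(7L));    // -> large-memory
    System.out.println(resolve(42L));   // -> baseline (fallback path)
  }
}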
diff --git a/gobblin-temporal/src/test/java/org/apache/gobblin/temporal/yarn/YarnServiceTest.java b/gobblin-temporal/src/test/java/org/apache/gobblin/temporal/yarn/YarnServiceTest.java
index 3c81316b85c..8d216450a34 100644
--- a/gobblin-temporal/src/test/java/org/apache/gobblin/temporal/yarn/YarnServiceTest.java
+++ b/gobblin-temporal/src/test/java/org/apache/gobblin/temporal/yarn/YarnServiceTest.java
@@ -20,8 +20,13 @@
import java.io.IOException;
import java.net.URL;
+import org.apache.gobblin.temporal.dynamic.WorkerProfile;
+import org.apache.gobblin.temporal.dynamic.WorkforceProfiles;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterResponse;
+import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
+import org.apache.hadoop.yarn.api.records.ApplicationId;
+import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.api.records.Container;
import org.apache.hadoop.yarn.client.api.async.AMRMClientAsync;
@@ -103,12 +108,6 @@ public void testBuildContainerCommand() throws Exception {
.withValue(GobblinYarnConfigurationKeys.CONTAINER_JVM_MEMORY_XMX_RATIO_KEY, ConfigValueFactory.fromAnyRef(jvmMemoryXmxRatio))
.withValue(GobblinYarnConfigurationKeys.CONTAINER_JVM_MEMORY_OVERHEAD_MBS_KEY, ConfigValueFactory.fromAnyRef(jvmMemoryOverheadMbs));
- Resource resource = Resource.newInstance(resourceMemoryMB, 2);
-
- Container mockContainer = Mockito.mock(Container.class);
- Mockito.when(mockContainer.getResource()).thenReturn(resource);
- Mockito.when(mockContainer.getAllocationRequestId()).thenReturn(0L);
-
YarnService yarnService = new YarnService(
config,
"testApplicationName",
@@ -118,9 +117,13 @@ public void testBuildContainerCommand() throws Exception {
eventBus
);
- yarnService.startUp();
-
- String command = yarnService.buildContainerCommand(mockContainer, "testHelixParticipantId", "testHelixInstanceTag");
+ WorkerProfile workerProfile = new WorkerProfile(config);
+ ContainerId containerId = ContainerId.newContainerId(ApplicationAttemptId.newInstance(ApplicationId.newInstance(1, 0),
+ 0), 0);
+ Resource resource = Resource.newInstance(resourceMemoryMB, 2);
+ Container container = Container.newInstance(containerId, null, null, resource, null, null);
+ YarnService.ContainerInfo containerInfo = yarnService.new ContainerInfo(container, WorkforceProfiles.BASELINE_NAME_RENDERING, workerProfile);
+ String command = containerInfo.getStartupCommand();
Assert.assertTrue(command.contains("-Xmx" + expectedJvmMemory + "M"));
}
}
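// Possible follow-up assertions, illustrative only and not part of this patch: besides the
// -Xmx check, the startup command built via ContainerInfo also carries the application name
// passed to the YarnService constructor and redirects stdout/stderr under the YARN log dir
// (see buildContainerCommand), so the test could additionally verify, assuming an import of
// org.apache.hadoop.yarn.api.ApplicationConstants:
//   Assert.assertTrue(command.contains("testApplicationName"));
//   Assert.assertTrue(command.contains(ApplicationConstants.LOG_DIR_EXPANSION_VAR));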