Skip to content

Commit 76fb7b6

Browse files
committed
Refactor IDE spawn logic
- Instead of respawning when there is no join link, respawn when the PID is dead. - Give more time for the PID to become alive before trying to respawn. - More logging.
1 parent 502e33e commit 76fb7b6

File tree

3 files changed

+82
-42
lines changed

3 files changed

+82
-42
lines changed

CHANGELOG.md

+12
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,18 @@
44

55
## Unreleased
66

7+
### Changed
8+
9+
- Previously, the plugin would try to respawn the IDE if it failed to get a join
10+
link after five seconds. However, it seems sometimes we do not get a join link
11+
that quickly. Now the plugin will wait indefinitely for a join link as long as
12+
the process is still alive. If the process does not come alive within 30 seconds
13+
or it dies after coming alive, the plugin will attempt to respawn the IDE.
14+
15+
### Added
16+
17+
- Extra logging around the IDE spawn to help debugging.
18+
719
## 2.13.0 - 2024-07-16
820

921
### Added

gradle.properties

+1-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ pluginGroup=com.coder.gateway
44
# Zip file name.
55
pluginName=coder-gateway
66
# SemVer format -> https://semver.org
7-
pluginVersion=2.13.0
7+
pluginVersion=2.13.1
88
# See https://plugins.jetbrains.com/docs/intellij/build-number-ranges.html
99
# for insight into build numbers and IntelliJ Platform versions.
1010
pluginSinceBuild=233.6745

src/main/kotlin/com/coder/gateway/CoderRemoteConnectionHandle.kt

+69-41
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ import java.net.URI
4848
import java.time.Duration
4949
import java.time.LocalDateTime
5050
import java.time.format.DateTimeFormatter
51+
import java.util.concurrent.TimeUnit
5152
import java.util.concurrent.TimeoutException
5253
import kotlin.coroutines.resume
5354
import kotlin.coroutines.resumeWithException
@@ -227,16 +228,9 @@ class CoderRemoteConnectionHandle {
227228

228229
// Wait for the IDE to come up.
229230
indicator.text = "Waiting for ${workspace.ideName} backend..."
230-
var status: UnattendedHostStatus? = null
231231
val remoteProjectPath = accessor.makeRemotePath(ShellArgument.PlainText(workspace.projectPath))
232232
val logsDir = accessor.getLogsDir(workspace.ideProduct.productCode, remoteProjectPath)
233-
while (lifetime.status == LifetimeStatus.Alive) {
234-
status = ensureIDEBackend(accessor, workspace, ideDir, remoteProjectPath, logsDir, lifetime, null)
235-
if (!status?.joinLink.isNullOrBlank()) {
236-
break
237-
}
238-
delay(5000)
239-
}
233+
var status = ensureIDEBackend(accessor, workspace, ideDir, remoteProjectPath, logsDir, lifetime, null)
240234

241235
// We wait for non-null, so this only happens on cancellation.
242236
val joinLink = status?.joinLink
@@ -302,6 +296,7 @@ class CoderRemoteConnectionHandle {
302296
}
303297
// Continue once the client is present.
304298
handle.onClientPresenceChanged.advise(lifetime) {
299+
logger.info("${workspace.ideName} client to ${workspace.hostname} presence: ${handle.clientPresent}")
305300
if (handle.clientPresent && continuation.isActive) {
306301
continuation.resume(true)
307302
}
@@ -437,8 +432,8 @@ class CoderRemoteConnectionHandle {
437432
}
438433

439434
/**
440-
* Ensure the backend is started. Status and/or links may be null if the
441-
* backend has not started.
435+
* Ensure the backend is started. It will not return until a join link is
436+
* received or the lifetime expires.
442437
*/
443438
private suspend fun ensureIDEBackend(
444439
accessor: HighLevelHostAccessor,
@@ -449,41 +444,74 @@ class CoderRemoteConnectionHandle {
449444
lifetime: LifetimeDefinition,
450445
currentStatus: UnattendedHostStatus?,
451446
): UnattendedHostStatus? {
452-
val details = "${workspace.hostname}:${ideDir.toRawString()}, project=${remoteProjectPath.toRawString()}"
453-
return try {
454-
if (currentStatus?.appPid != null &&
455-
!currentStatus.joinLink.isNullOrBlank() &&
456-
accessor.isPidAlive(currentStatus.appPid.toInt())
457-
) {
458-
// If the PID is alive, assume the join link we have is still
459-
// valid. The join link seems to change even if it is the same
460-
// backend running, so if we always fetched the link the client
461-
// would relaunch over and over.
462-
return currentStatus
463-
}
447+
val details = "${workspace.hostname}:${ideDir.toRawString()}, project=${remoteProjectPath.toRawString()}"
448+
val wait = TimeUnit.SECONDS.toMillis(5)
464449

465-
// See if there is already a backend running. Weirdly, there is
466-
// always a PID, even if there is no backend running, and
467-
// backendUnresponsive is always false, but the links are null so
468-
// hopefully that is an accurate indicator that the IDE is up.
469-
val status = accessor.getHostIdeStatus(ideDir, remoteProjectPath)
470-
if (!status.joinLink.isNullOrBlank()) {
471-
logger.info("Found existing ${workspace.ideName} backend on $details")
472-
return status
450+
// Check if the current IDE is alive.
451+
if (currentStatus != null) {
452+
while (lifetime.status == LifetimeStatus.Alive) {
453+
try {
454+
val isAlive = accessor.isPidAlive(currentStatus.appPid.toInt())
455+
logger.info("${workspace.ideName} status: pid=${currentStatus.appPid}, alive=$isAlive")
456+
if (isAlive) {
457+
// Use the current status and join link.
458+
return currentStatus
459+
} else {
460+
logger.info("Relaunching ${workspace.ideName} since it is not alive...")
461+
break
462+
}
463+
} catch (ex: Exception) {
464+
logger.info("Failed to check if ${workspace.ideName} is alive on $details; waiting $wait ms to try again: pid=${currentStatus.appPid}", ex)
465+
}
466+
delay(wait)
473467
}
468+
} else {
469+
logger.info("Launching ${workspace.ideName} for the first time on ${workspace.hostname}...")
470+
}
471+
472+
// This means we broke out because the user canceled or closed the IDE.
473+
if (lifetime.status != LifetimeStatus.Alive) {
474+
return null
475+
}
474476

475-
// Otherwise, spawn a new backend. This does not seem to spawn a
476-
// second backend if one is already running, yet it does somehow
477-
// cause a second client to launch. So only run this if we are
478-
// really sure we have to launch a new backend.
479-
logger.info("Starting ${workspace.ideName} backend on $details")
480-
accessor.startHostIdeInBackgroundAndDetach(lifetime, ideDir, remoteProjectPath, logsDir)
481-
// Get the newly spawned PID and join link.
482-
return accessor.getHostIdeStatus(ideDir, remoteProjectPath)
483-
} catch (ex: Exception) {
484-
logger.info("Failed to get ${workspace.ideName} status from $details", ex)
485-
currentStatus
477+
// If the PID is not alive, spawn a new backend. This may not be
478+
// idempotent, so only call if we are really sure we need to.
479+
accessor.startHostIdeInBackgroundAndDetach(lifetime, ideDir, remoteProjectPath, logsDir)
480+
481+
// Get the newly spawned PID and join link.
482+
var attempts = 0
483+
val maxAttempts = 6
484+
while (lifetime.status == LifetimeStatus.Alive) {
485+
try {
486+
attempts++
487+
val status = accessor.getHostIdeStatus(ideDir, remoteProjectPath)
488+
if (!status.joinLink.isNullOrBlank()) {
489+
logger.info("Found join link for ${workspace.ideName}; proceeding to connect: pid=${status.appPid}")
490+
return status
491+
}
492+
// If we did not get a join link, see if the IDE is alive in
493+
// case it died and we need to respawn.
494+
val isAlive = status.appPid > 0 && accessor.isPidAlive(status.appPid.toInt())
495+
logger.info("${workspace.ideName} status: pid=${status.appPid}, alive=$isAlive, unresponsive=${status.backendUnresponsive}, attempt=$attempts")
496+
// It is not clear whether the PID can be trusted because we get
497+
// one even when there is no backend at all. For now give it
498+
// some time and if it is still dead, only then try to respawn.
499+
if (!isAlive && attempts >= maxAttempts) {
500+
logger.info("${workspace.ideName} is still not alive after $attempts checks, respawning backend and waiting $wait ms to try again")
501+
accessor.startHostIdeInBackgroundAndDetach(lifetime, ideDir, remoteProjectPath, logsDir)
502+
attempts = 0
503+
} else {
504+
logger.info("No join link found in status; waiting $wait ms to try again")
505+
}
506+
} catch (ex: Exception) {
507+
logger.info("Failed to get ${workspace.ideName} status from $details; waiting $wait ms to try again", ex)
508+
}
509+
delay(wait)
486510
}
511+
512+
// This means the lifetime is no longer alive.
513+
logger.info("Connection to ${workspace.ideName} on $details aborted by user")
514+
return null
487515
}
488516

489517
companion object {

0 commit comments

Comments
 (0)