@@ -48,6 +48,7 @@ import java.net.URI
48
48
import java.time.Duration
49
49
import java.time.LocalDateTime
50
50
import java.time.format.DateTimeFormatter
51
+ import java.util.concurrent.TimeUnit
51
52
import java.util.concurrent.TimeoutException
52
53
import kotlin.coroutines.resume
53
54
import kotlin.coroutines.resumeWithException
@@ -227,16 +228,9 @@ class CoderRemoteConnectionHandle {
227
228
228
229
// Wait for the IDE to come up.
229
230
indicator.text = " Waiting for ${workspace.ideName} backend..."
230
- var status: UnattendedHostStatus ? = null
231
231
val remoteProjectPath = accessor.makeRemotePath(ShellArgument .PlainText (workspace.projectPath))
232
232
val logsDir = accessor.getLogsDir(workspace.ideProduct.productCode, remoteProjectPath)
233
- while (lifetime.status == LifetimeStatus .Alive ) {
234
- status = ensureIDEBackend(accessor, workspace, ideDir, remoteProjectPath, logsDir, lifetime, null )
235
- if (! status?.joinLink.isNullOrBlank()) {
236
- break
237
- }
238
- delay(5000 )
239
- }
233
+ var status = ensureIDEBackend(accessor, workspace, ideDir, remoteProjectPath, logsDir, lifetime, null )
240
234
241
235
// ensureIDEBackend only returns null once the lifetime has ended, so a missing join link here means the connection was cancelled.
242
236
val joinLink = status?.joinLink
@@ -302,6 +296,7 @@ class CoderRemoteConnectionHandle {
302
296
}
303
297
// Continue once the client is present.
304
298
handle.onClientPresenceChanged.advise(lifetime) {
299
+ logger.info(" ${workspace.ideName} client to ${workspace.hostname} presence: ${handle.clientPresent} " )
305
300
if (handle.clientPresent && continuation.isActive) {
306
301
continuation.resume(true )
307
302
}
@@ -437,8 +432,8 @@ class CoderRemoteConnectionHandle {
437
432
}
438
433
439
434
/* *
440
- * Ensure the backend is started. Status and/or links may be null if the
441
- * backend has not started .
435
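+ * Ensure the backend is started. If currentStatus is given and its PID is still
436
+ * alive it is returned as-is; otherwise a backend is spawned and this does not return until a join link is received, or returns null once the lifetime is no longer alive.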
442
437
*/
443
438
private suspend fun ensureIDEBackend (
444
439
accessor : HighLevelHostAccessor ,
@@ -449,41 +444,74 @@ class CoderRemoteConnectionHandle {
449
444
lifetime : LifetimeDefinition ,
450
445
currentStatus : UnattendedHostStatus ? ,
451
446
): UnattendedHostStatus ? {
452
- val details = " ${workspace.hostname} :${ideDir.toRawString()} , project=${remoteProjectPath.toRawString()} "
453
- return try {
454
- if (currentStatus?.appPid != null &&
455
- ! currentStatus.joinLink.isNullOrBlank() &&
456
- accessor.isPidAlive(currentStatus.appPid.toInt())
457
- ) {
458
- // If the PID is alive, assume the join link we have is still
459
- // valid. The join link seems to change even if it is the same
460
- // backend running, so if we always fetched the link the client
461
- // would relaunch over and over.
462
- return currentStatus
463
- }
447
+ val details = " ${workspace.hostname} :${ideDir.toRawString()} , project=${remoteProjectPath.toRawString()} " // Target description interpolated into the log messages below.
448
+ val wait = TimeUnit .SECONDS .toMillis(5 )
464
449
465
- // See if there is already a backend running. Weirdly, there is
466
- // always a PID, even if there is no backend running, and
467
- // backendUnresponsive is always false, but the links are null so
468
- // hopefully that is an accurate indicator that the IDE is up.
469
- val status = accessor.getHostIdeStatus(ideDir, remoteProjectPath)
470
- if (! status.joinLink.isNullOrBlank()) {
471
- logger.info(" Found existing ${workspace.ideName} backend on $details " )
472
- return status
450
+ // Check if the current IDE is alive.
451
+ if (currentStatus != null ) {
452
+ while (lifetime.status == LifetimeStatus .Alive ) {
453
+ try {
454
+ val isAlive = accessor.isPidAlive(currentStatus.appPid.toInt())
455
+ logger.info(" ${workspace.ideName} status: pid=${currentStatus.appPid} , alive=$isAlive " )
456
+ if (isAlive) {
457
+ // Use the current status and join link.
458
+ return currentStatus
459
+ } else {
460
+ logger.info(" Relaunching ${workspace.ideName} since it is not alive..." )
461
+ break
462
+ }
463
+ } catch (ex: Exception ) {
464
+ logger.info(" Failed to check if ${workspace.ideName} is alive on $details ; waiting $wait ms to try again: pid=${currentStatus.appPid} " , ex)
465
+ }
466
+ delay(wait)
473
467
}
468
+ } else {
469
+ logger.info(" Launching ${workspace.ideName} for the first time on ${workspace.hostname} ..." )
470
+ }
471
+
472
+ // If the lifetime ended while we were checking (the user canceled or closed the IDE), stop here instead of spawning a backend.
473
+ if (lifetime.status != LifetimeStatus .Alive ) {
474
+ return null
475
+ }
474
476
475
- // Otherwise, spawn a new backend. This does not seem to spawn a
476
- // second backend if one is already running, yet it does somehow
477
- // cause a second client to launch. So only run this if we are
478
- // really sure we have to launch a new backend.
479
- logger.info(" Starting ${workspace.ideName} backend on $details " )
480
- accessor.startHostIdeInBackgroundAndDetach(lifetime, ideDir, remoteProjectPath, logsDir)
481
- // Get the newly spawned PID and join link.
482
- return accessor.getHostIdeStatus(ideDir, remoteProjectPath)
483
- } catch (ex: Exception ) {
484
- logger.info(" Failed to get ${workspace.ideName} status from $details " , ex)
485
- currentStatus
477
+ // If the PID is not alive, spawn a new backend. This may not be
478
+ // idempotent, so only call if we are really sure we need to.
479
+ accessor.startHostIdeInBackgroundAndDetach(lifetime, ideDir, remoteProjectPath, logsDir)
480
+
481
+ // Get the newly spawned PID and join link.
482
+ var attempts = 0
483
+ val maxAttempts = 6
484
+ while (lifetime.status == LifetimeStatus .Alive ) {
485
+ try {
486
+ attempts++
487
+ val status = accessor.getHostIdeStatus(ideDir, remoteProjectPath)
488
+ if (! status.joinLink.isNullOrBlank()) {
489
+ logger.info(" Found join link for ${workspace.ideName} ; proceeding to connect: pid=${status.appPid} " )
490
+ return status
491
+ }
492
+ // If we did not get a join link, see if the IDE is alive in
493
+ // case it died and we need to respawn.
494
+ val isAlive = status.appPid > 0 && accessor.isPidAlive(status.appPid.toInt())
495
+ logger.info(" ${workspace.ideName} status: pid=${status.appPid} , alive=$isAlive , unresponsive=${status.backendUnresponsive} , attempt=$attempts " )
496
+ // It is not clear whether the PID can be trusted because we get
497
+ // one even when there is no backend at all. For now give it
498
+ // some time and if it is still dead, only then try to respawn.
499
+ if (! isAlive && attempts >= maxAttempts) {
500
+ logger.info(" ${workspace.ideName} is still not alive after $attempts checks, respawning backend and waiting $wait ms to try again" )
501
+ accessor.startHostIdeInBackgroundAndDetach(lifetime, ideDir, remoteProjectPath, logsDir)
502
+ attempts = 0
503
+ } else {
504
+ logger.info(" No join link found in status; waiting $wait ms to try again" )
505
+ }
506
+ } catch (ex: Exception ) {
507
+ logger.info(" Failed to get ${workspace.ideName} status from $details ; waiting $wait ms to try again" , ex)
508
+ }
509
+ delay(wait)
486
510
}
511
+
512
+ // This means the lifetime is no longer alive.
513
+ logger.info(" Connection to ${workspace.ideName} on $details aborted by user" )
514
+ return null
487
515
}
488
516
489
517
companion object {
0 commit comments