@@ -48,6 +48,7 @@ import java.net.URI
48
48
import java.time.Duration
49
49
import java.time.LocalDateTime
50
50
import java.time.format.DateTimeFormatter
51
+ import java.util.concurrent.TimeUnit
51
52
import java.util.concurrent.TimeoutException
52
53
import kotlin.coroutines.resume
53
54
import kotlin.coroutines.resumeWithException
@@ -227,16 +228,9 @@ class CoderRemoteConnectionHandle {
227
228
228
229
// Wait for the IDE to come up.
229
230
indicator.text= " Waiting for${workspace.ideName} backend..."
230
- var status: UnattendedHostStatus ? = null
231
231
val remoteProjectPath= accessor.makeRemotePath(ShellArgument .PlainText (workspace.projectPath))
232
232
val logsDir= accessor.getLogsDir(workspace.ideProduct.productCode, remoteProjectPath)
233
- while (lifetime.status== LifetimeStatus .Alive ) {
234
- status= ensureIDEBackend(accessor, workspace, ideDir, remoteProjectPath, logsDir, lifetime,null )
235
- if (! status?.joinLink.isNullOrBlank()) {
236
- break
237
- }
238
- delay(5000 )
239
- }
233
+ var status= ensureIDEBackend(accessor, workspace, ideDir, remoteProjectPath, logsDir, lifetime,null )
240
234
241
235
// We wait for non-null, so this only happens on cancellation.
242
236
val joinLink= status?.joinLink
@@ -302,6 +296,7 @@ class CoderRemoteConnectionHandle {
302
296
}
303
297
// Continue once the client is present.
304
298
handle.onClientPresenceChanged.advise(lifetime) {
299
+ logger.info(" ${workspace.ideName} client to${workspace.hostname} presence:${handle.clientPresent} " )
305
300
if (handle.clientPresent&& continuation.isActive) {
306
301
continuation.resume(true )
307
302
}
@@ -437,8 +432,8 @@ class CoderRemoteConnectionHandle {
437
432
}
438
433
439
434
/**
440
- * Ensure the backend is started.Status and/or links may be null if the
441
- *backend has not started .
435
+ * Ensure the backend is started. It will not return until a join link is
436
+ * received or the lifetime expires.
442
437
*/
443
438
private suspend fun ensureIDEBackend (
444
439
accessor : HighLevelHostAccessor ,
@@ -449,41 +444,74 @@ class CoderRemoteConnectionHandle {
449
444
lifetime : LifetimeDefinition ,
450
445
currentStatus : UnattendedHostStatus ? ,
451
446
):UnattendedHostStatus ? {
452
- val details= " ${workspace.hostname} :${ideDir.toRawString()} , project=${remoteProjectPath.toRawString()} "
453
- return try {
454
- if (currentStatus?.appPid!= null &&
455
- ! currentStatus.joinLink.isNullOrBlank()&&
456
- accessor.isPidAlive(currentStatus.appPid.toInt())
457
- ) {
458
- // If the PID is alive, assume the join link we have is still
459
- // valid. The join link seems to change even if it is the same
460
- // backend running, so if we always fetched the link the client
461
- // would relaunch over and over.
462
- return currentStatus
463
- }
447
+ val details= " $${workspace.hostname} :${ideDir.toRawString()} , project=${remoteProjectPath.toRawString()} "
448
+ val wait= TimeUnit .SECONDS .toMillis(5 )
464
449
465
- // See if there is already a backend running. Weirdly, there is
466
- // always a PID, even if there is no backend running, and
467
- // backendUnresponsive is always false, but the links are null so
468
- // hopefully that is an accurate indicator that the IDE is up.
469
- val status= accessor.getHostIdeStatus(ideDir, remoteProjectPath)
470
- if (! status.joinLink.isNullOrBlank()) {
471
- logger.info(" Found existing${workspace.ideName} backend on$details " )
472
- return status
450
+ // Check if the current IDE is alive.
451
+ if (currentStatus!= null ) {
452
+ while (lifetime.status== LifetimeStatus .Alive ) {
453
+ try {
454
+ val isAlive= accessor.isPidAlive(currentStatus.appPid.toInt())
455
+ logger.info(" ${workspace.ideName} status: pid=${currentStatus.appPid} , alive=$isAlive " )
456
+ if (isAlive) {
457
+ // Use the current status and join link.
458
+ return currentStatus
459
+ }else {
460
+ logger.info(" Relaunching${workspace.ideName} since it is not alive..." )
461
+ break
462
+ }
463
+ }catch (ex: Exception ) {
464
+ logger.info(" Failed to check if${workspace.ideName} is alive on$details ; waiting$wait ms to try again: pid=${currentStatus.appPid} " , ex)
465
+ }
466
+ delay(wait)
473
467
}
468
+ }else {
469
+ logger.info(" Launching${workspace.ideName} for the first time on${workspace.hostname} ..." )
470
+ }
471
+
472
+ // This means we broke out because the user canceled or closed the IDE.
473
+ if (lifetime.status!= LifetimeStatus .Alive ) {
474
+ return null
475
+ }
474
476
475
- // Otherwise, spawn a new backend. This does not seem to spawn a
476
- // second backend if one is already running, yet it does somehow
477
- // cause a second client to launch. So only run this if we are
478
- // really sure we have to launch a new backend.
479
- logger.info(" Starting${workspace.ideName} backend on$details " )
480
- accessor.startHostIdeInBackgroundAndDetach(lifetime, ideDir, remoteProjectPath, logsDir)
481
- // Get the newly spawned PID and join link.
482
- return accessor.getHostIdeStatus(ideDir, remoteProjectPath)
483
- }catch (ex: Exception ) {
484
- logger.info(" Failed to get${workspace.ideName} status from$details " , ex)
485
- currentStatus
477
+ // If the PID is not alive, spawn a new backend. This may not be
478
+ // idempotent, so only call if we are really sure we need to.
479
+ accessor.startHostIdeInBackgroundAndDetach(lifetime, ideDir, remoteProjectPath, logsDir)
480
+
481
+ // Get the newly spawned PID and join link.
482
+ var attempts= 0
483
+ val maxAttempts= 6
484
+ while (lifetime.status== LifetimeStatus .Alive ) {
485
+ try {
486
+ attempts++
487
+ val status= accessor.getHostIdeStatus(ideDir, remoteProjectPath)
488
+ if (! status.joinLink.isNullOrBlank()) {
489
+ logger.info(" Found join link for${workspace.ideName} ; proceeding to connect: pid=${status.appPid} " )
490
+ return status
491
+ }
492
+ // If we did not get a join link, see if the IDE is alive in
493
+ // case it died and we need to respawn.
494
+ val isAlive= status.appPid> 0 && accessor.isPidAlive(status.appPid.toInt())
495
+ logger.info(" ${workspace.ideName} status: pid=${status.appPid} , alive=$isAlive , unresponsive=${status.backendUnresponsive} , attempt=$attempts " )
496
+ // It is not clear whether the PID can be trusted because we get
497
+ // one even when there is no backend at all. For now give it
498
+ // some time and if it is still dead, only then try to respawn.
499
+ if (! isAlive&& attempts>= maxAttempts) {
500
+ logger.info(" ${workspace.ideName} is still not alive after$attempts checks, respawning backend and waiting$wait ms to try again" )
501
+ accessor.startHostIdeInBackgroundAndDetach(lifetime, ideDir, remoteProjectPath, logsDir)
502
+ attempts= 0
503
+ }else {
504
+ logger.info(" No join link found in status; waiting$wait ms to try again" )
505
+ }
506
+ }catch (ex: Exception ) {
507
+ logger.info(" Failed to get${workspace.ideName} status from$details ; waiting$wait ms to try again" , ex)
508
+ }
509
+ delay(wait)
486
510
}
511
+
512
+ // This means the lifetime is no longer alive.
513
+ logger.info(" Connection to${workspace.ideName} on$details aborted by user" )
514
+ return null
487
515
}
488
516
489
517
companion object {