@@ -48,6 +48,7 @@ import java.net.URI
4848import java.time.Duration
4949import java.time.LocalDateTime
5050import java.time.format.DateTimeFormatter
51+ import java.util.concurrent.TimeUnit
5152import java.util.concurrent.TimeoutException
5253import kotlin.coroutines.resume
5354import kotlin.coroutines.resumeWithException
@@ -227,16 +228,9 @@ class CoderRemoteConnectionHandle {
227228
228229// Wait for the IDE to come up.
229230 indicator.text= " Waiting for${workspace.ideName} backend..."
230- var status: UnattendedHostStatus ? = null
231231val remoteProjectPath= accessor.makeRemotePath(ShellArgument .PlainText (workspace.projectPath))
232232val logsDir= accessor.getLogsDir(workspace.ideProduct.productCode, remoteProjectPath)
233- while (lifetime.status== LifetimeStatus .Alive ) {
234- status= ensureIDEBackend(accessor, workspace, ideDir, remoteProjectPath, logsDir, lifetime,null )
235- if (! status?.joinLink.isNullOrBlank()) {
236- break
237- }
238- delay(5000 )
239- }
233+ var status= ensureIDEBackend(accessor, workspace, ideDir, remoteProjectPath, logsDir, lifetime,null )
240234
241235// With no current status passed in, ensureIDEBackend returns null only once the lifetime has ended, so a missing join link here only happens on cancellation.
242236val joinLink= status?.joinLink
@@ -302,6 +296,7 @@ class CoderRemoteConnectionHandle {
302296 }
303297// Continue once the client is present.
304298 handle.onClientPresenceChanged.advise(lifetime) {
299+ logger.info(" ${workspace.ideName} client to${workspace.hostname} presence:${handle.clientPresent} " )
305300if (handle.clientPresent&& continuation.isActive) {
306301 continuation.resume(true )
307302 }
@@ -437,8 +432,8 @@ class CoderRemoteConnectionHandle {
437432 }
438433
439434/* *
440- * Ensure the backend is started.Status and/or links may be null if the
441- *backend has not started .
435+ * Ensure the backend is started. It will not return until a join link is
436+ * received or the lifetime expires.
442437*/
443438private suspend fun ensureIDEBackend (
444439accessor : HighLevelHostAccessor ,
@@ -449,41 +444,67 @@ class CoderRemoteConnectionHandle {
449444lifetime : LifetimeDefinition ,
450445currentStatus : UnattendedHostStatus ? ,
451446 ):UnattendedHostStatus ? {
452- val details= " ${workspace.hostname} :${ideDir.toRawString()} , project=${remoteProjectPath.toRawString()} "
453- return try {
454- if (currentStatus?.appPid!= null &&
455- ! currentStatus.joinLink.isNullOrBlank()&&
456- accessor.isPidAlive(currentStatus.appPid.toInt())
457- ) {
458- // If the PID is alive, assume the join link we have is still
459- // valid. The join link seems to change even if it is the same
460- // backend running, so if we always fetched the link the client
461- // would relaunch over and over.
447+ val details= " ${workspace.hostname} :${ideDir.toRawString()} , project=${remoteProjectPath.toRawString()} "
448+
449+ // Check if the current IDE is alive.
450+ if (currentStatus!= null ) {
451+ val isAlive= try {
452+ val isAlive= accessor.isPidAlive(currentStatus.appPid.toInt())
453+ logger.info(" ${workspace.ideName} status: pid=${currentStatus.appPid} , alive=$isAlive " )
454+ isAlive
455+ }catch (ex: Exception ) {
456+ logger.info(" Failed to check if${workspace.ideName} is alive on$details : pid=${currentStatus.appPid} " , ex)
457+ false
458+ }
459+ if (isAlive) {
460+ // Use the current status and join link.
462461return currentStatus
462+ }else {
463+ logger.info(" Relaunching${workspace.ideName} since it is not alive..." )
463464 }
465+ }else {
466+ logger.info(" Launching${workspace.ideName} for the first time on${workspace.hostname} ..." )
467+ }
464468
465- // See if there is already a backend running. Weirdly, there is
466- // always a PID, even if there is no backend running, and
467- // backendUnresponsive is always false, but the links are null so
468- // hopefully that is an accurate indicator that the IDE is up.
469- val status= accessor.getHostIdeStatus(ideDir, remoteProjectPath)
470- if (! status.joinLink.isNullOrBlank()) {
471- logger.info(" Found existing${workspace.ideName} backend on$details " )
472- return status
473- }
469+ // If the PID is not alive, spawn a new backend. This may not be
470+ // idempotent, so only call if we are really sure we need to.
471+ accessor.startHostIdeInBackgroundAndDetach(lifetime, ideDir, remoteProjectPath, logsDir)
474472
475- // Otherwise, spawn a new backend. This does not seem to spawn a
476- // second backend if one is already running, yet it does somehow
477- // cause a second client to launch. So only run this if we are
478- // really sure we have to launch a new backend.
479- logger.info(" Starting${workspace.ideName} backend on$details " )
480- accessor.startHostIdeInBackgroundAndDetach(lifetime, ideDir, remoteProjectPath, logsDir)
481- // Get the newly spawned PID and join link.
482- return accessor.getHostIdeStatus(ideDir, remoteProjectPath)
483- }catch (ex: Exception ) {
484- logger.info(" Failed to get${workspace.ideName} status from$details " , ex)
485- currentStatus
473+ // Get the newly spawned PID and join link.
474+ var attempts= 0
475+ val maxAttempts= 6
476+ val wait= TimeUnit .SECONDS .toMillis(5 )
477+ while (lifetime.status== LifetimeStatus .Alive ) {
478+ try {
479+ attempts++
480+ val status= accessor.getHostIdeStatus(ideDir, remoteProjectPath)
481+ if (! status.joinLink.isNullOrBlank()) {
482+ logger.info(" Found join link for${workspace.ideName} ; proceeding to connect: pid=${status.appPid} " )
483+ return status
484+ }
485+ // If we did not get a join link, see if the IDE is alive in
486+ // case it died and we need to respawn.
487+ val isAlive= status.appPid> 0 && accessor.isPidAlive(status.appPid.toInt())
488+ logger.info(" ${workspace.ideName} status: pid=${status.appPid} , alive=$isAlive , unresponsive=${status.backendUnresponsive} , attempt=$attempts " )
489+ // It is not clear whether the PID can be trusted because we get
490+ // one even when there is no backend at all. For now give it
491+ // some time and if it is still dead, only then try to respawn.
492+ if (! isAlive&& attempts>= maxAttempts) {
493+ logger.info(" ${workspace.ideName} is still not alive after$attempts checks, respawning backend and waiting$wait ms to try again" )
494+ accessor.startHostIdeInBackgroundAndDetach(lifetime, ideDir, remoteProjectPath, logsDir)
495+ attempts= 0
496+ }else {
497+ logger.info(" No join link found in status; waiting$wait ms to try again" )
498+ }
499+ }catch (ex: Exception ) {
500+ logger.info(" Failed to get${workspace.ideName} status from$details ; waiting$wait ms to try again" , ex)
501+ }
502+ delay(wait)
486503 }
504+
505+ // This means the lifetime is no longer alive.
506+ logger.info(" Connection to${workspace.ideName} on$details aborted by user" )
507+ return null
487508 }
488509
489510companion object {