diff --git a/app/MindWork AI Studio/Tools/Services/RustAvailabilityMonitorService.cs b/app/MindWork AI Studio/Tools/Services/RustAvailabilityMonitorService.cs index ea604a7d..40c22f0f 100644 --- a/app/MindWork AI Studio/Tools/Services/RustAvailabilityMonitorService.cs +++ b/app/MindWork AI Studio/Tools/Services/RustAvailabilityMonitorService.cs @@ -8,9 +8,11 @@ public sealed class RustAvailabilityMonitorService : BackgroundService, IMessage private readonly ILogger logger; private readonly MessageBus messageBus; + private readonly RustService rustService; private readonly IHostApplicationLifetime appLifetime; private int rustUnavailableCount; + private int availabilityCheckTriggered; // To prevent multiple shutdown triggers. We use int instead of bool for Interlocked operations. private int shutdownTriggered; @@ -18,10 +20,12 @@ public sealed class RustAvailabilityMonitorService : BackgroundService, IMessage public RustAvailabilityMonitorService( ILogger logger, MessageBus messageBus, + RustService rustService, IHostApplicationLifetime appLifetime) { this.logger = logger; this.messageBus = messageBus; + this.rustService = rustService; this.appLifetime = appLifetime; this.messageBus.RegisterComponent(this); @@ -53,6 +57,25 @@ public sealed class RustAvailabilityMonitorService : BackgroundService, IMessage // Thread-safe incrementation of the unavailable count and check against the threshold: var numEvents = Interlocked.Increment(ref this.rustUnavailableCount); + + // On the first event, trigger some Rust availability checks to confirm. + // Just fire and forget - we don't need to await this here. + if (numEvents == 1 && Interlocked.Exchange(ref this.availabilityCheckTriggered, 1) == 0) + { + // + // This is also useful to speed up the detection of Rust availability issues, + // as it triggers two immediate checks instead of waiting for the next scheduled check. + // Scheduled checks are typically every few minutes, which might be too long to wait + // in case of critical Rust service failures. + // + // On the other hand, we cannot kill the .NET server on the first failure, as it might + // be a transient issue. + // + + _ = this.VerifyRustAvailability(); + _ = this.VerifyRustAvailability(); + } + if (numEvents <= UNAVAILABLE_EVENT_THRESHOLD) { this.logger.LogWarning("Rust service unavailable (num repeats={NumRepeats}, threshold={Threshold}). Reason = '{Reason}'. Waiting for more occurrences before shutting down the server.", numEvents, UNAVAILABLE_EVENT_THRESHOLD, reason); @@ -72,4 +95,17 @@ public sealed class RustAvailabilityMonitorService : BackgroundService, IMessage { return Task.FromResult(default); } + + private async Task VerifyRustAvailability() + { + try + { + await this.rustService.ReadUserLanguage(); + } + catch (Exception e) + { + this.logger.LogWarning(e, "Rust availability check failed."); + await this.messageBus.SendMessage(null, Event.RUST_SERVICE_UNAVAILABLE, "Rust availability check failed"); + } + } }