saluki_core/health/
mod.rs

1//! Health registry for tracking component readiness and liveness.
2
3use std::future::Future;
4#[cfg(test)]
5use std::sync::atomic::AtomicUsize;
6use std::{
7    collections::HashSet,
8    sync::{
9        atomic::{AtomicBool, Ordering::Relaxed},
10        Arc, Mutex,
11    },
12    time::Duration,
13};
14
15use futures::StreamExt as _;
16use saluki_error::{generic_error, GenericError};
17use saluki_metrics::static_metrics;
18use stringtheory::MetaString;
19use tokio::{pin, time::Instant};
20use tokio::{
21    select,
22    sync::{
23        mpsc::{self, error::TrySendError},
24        Notify,
25    },
26};
27use tokio_util::time::{delay_queue::Key, DelayQueue};
28use tracing::{debug, info, trace};
29
30mod api;
31pub use self::api::HealthAPIHandler;
32
33mod worker;
34pub use self::worker::HealthRegistryWorker;
35
/// How long a component has to answer a liveness probe before the probe's timeout entry expires.
///
/// Also reported as the liveness latency when a component is marked not live or dead, and used as the staleness
/// threshold for previous probe results when the runner starts.
const DEFAULT_PROBE_TIMEOUT_DUR: Duration = Duration::from_secs(5);
/// Delay before the next liveness probe is scheduled after a probe response or a probe timeout.
const DEFAULT_PROBE_BACKOFF_DUR: Duration = Duration::from_secs(1);
38
/// A handle for updating the health of a component.
///
/// Handles are created by [`HealthRegistry::register_component`]. The component uses the handle to publish its
/// readiness and to answer liveness probes sent by the registry runner.
pub struct Health {
    // Readiness flag and telemetry shared with the registry-side `ComponentState` for this component.
    shared: Arc<SharedComponentState>,
    // Receives liveness probe requests sent by the registry runner.
    request_rx: mpsc::Receiver<LivenessRequest>,
    // Sends liveness probe responses back to the registry runner.
    response_tx: mpsc::Sender<LivenessResponse>,
    // Signalled when this component becomes ready, so that tasks waiting on registry readiness can re-check.
    readiness_notify: Arc<Notify>,
}
46
47impl Health {
48    /// Marks the component as ready.
49    pub fn mark_ready(&mut self) {
50        self.update_readiness(true);
51    }
52
53    /// Marks the component as not ready.
54    pub fn mark_not_ready(&mut self) {
55        self.update_readiness(false);
56    }
57
58    fn update_readiness(&self, ready: bool) {
59        self.shared.ready.store(ready, Relaxed);
60        self.shared.telemetry.update_readiness(ready);
61
62        // Wake any tasks waiting in `HealthRegistry::all_ready` so they can re-check whether all components are ready.
63        if ready {
64            self.readiness_notify.notify_waiters();
65        }
66    }
67
68    /// Waits for a liveness probe to be sent to the component, and then responds to it.
69    ///
70    /// This should generally be polled as part of a `select!` block to ensure it's checked alongside other
71    /// asynchronous operations.
72    pub async fn live(&mut self) {
73        // Simply wait for the health registry to send us a liveness probe, and if we receive one, we respond back to it
74        // immediately.
75        if let Some(request) = self.request_rx.recv().await {
76            let response = request.into_response();
77            let _ = self.response_tx.send(response).await;
78        }
79    }
80}
81
/// Liveness state of a component as tracked by the registry.
#[derive(Clone, Copy, Eq, PartialEq)]
enum HealthState {
    // The component answered a liveness probe.
    Live,
    // Initial state; also set when a probe times out or when the runner resets stale results.
    Unknown,
    // The component's probe request channel is closed, so it can no longer be probed.
    Dead,
}
88
// Declares the `Telemetry` type used to report per-component health metrics. This file constructs it via
// `Telemetry::new(component_id)` and uses one accessor per metric listed below (`component_ready()`,
// `component_live()`, `component_liveness_latency_seconds()`).
// NOTE(review): the exact emitted metric names are produced by `static_metrics!`, which is not visible here.
static_metrics!(
    name => Telemetry,
    prefix => health,
    labels => [component_id: Arc<str>],
    metrics => [
        gauge(component_ready),
        gauge(component_live),
        trace_histogram(component_liveness_latency_seconds),
    ]
);
99
100impl Telemetry {
101    fn from_name(name: &str) -> Self {
102        Self::new(Arc::from(name))
103    }
104
105    fn update_readiness(&self, ready: bool) {
106        self.component_ready().set(if ready { 1.0 } else { 0.0 });
107    }
108
109    fn update_liveness(&self, state: HealthState, response_latency: Duration) {
110        let live = match state {
111            HealthState::Live => 1.0,
112            HealthState::Unknown => 0.0,
113            HealthState::Dead => -1.0,
114        };
115
116        self.component_live().set(live);
117        self.component_liveness_latency_seconds()
118            .record(response_latency.as_secs_f64());
119    }
120}
121
/// State shared between a component's [`Health`] handle and the registry-side `ComponentState`.
struct SharedComponentState {
    // Readiness flag written through the `Health` handle and read by the registry.
    ready: AtomicBool,
    // Metrics handle labelled with the component's name.
    telemetry: Telemetry,
}
126
/// Registry-side bookkeeping for a single registered component.
struct ComponentState {
    // Name the component was registered under.
    name: MetaString,
    // Most recently determined liveness state.
    health: HealthState,
    // Readiness flag and telemetry shared with the component's `Health` handle.
    shared: Arc<SharedComponentState>,
    // Sends liveness probe requests to the component's `Health` handle.
    request_tx: mpsc::Sender<LivenessRequest>,
    // When the last probe response was sent by the component (initialized to the creation time).
    last_response: Instant,
    // Latency of the last probe response (initialized to zero).
    last_response_latency: Duration,
}
135
136impl ComponentState {
137    fn new(
138        name: MetaString, response_tx: mpsc::Sender<LivenessResponse>, readiness_notify: Arc<Notify>,
139    ) -> (Self, Health) {
140        let shared = Arc::new(SharedComponentState {
141            ready: AtomicBool::new(false),
142            telemetry: Telemetry::from_name(&name),
143        });
144        let (request_tx, request_rx) = mpsc::channel(1);
145
146        let state = Self {
147            name,
148            health: HealthState::Unknown,
149            shared: Arc::clone(&shared),
150            request_tx,
151            last_response: Instant::now(),
152            last_response_latency: Duration::from_secs(0),
153        };
154
155        let handle = Health {
156            shared,
157            request_rx,
158            response_tx,
159            readiness_notify,
160        };
161
162        (state, handle)
163    }
164
165    fn is_ready(&self) -> bool {
166        // We consider a component ready if it's marked as ready (duh) and it's not dead.
167        //
168        // Being "dead" is a special case as it means the component is very likely not even running at all, not just
169        // responding slowly or deadlocked. In these cases, it can't possibly be ready since it's not even running.
170        self.shared.ready.load(Relaxed) && self.health != HealthState::Dead
171    }
172
173    fn is_live(&self) -> bool {
174        self.health == HealthState::Live
175    }
176
177    fn mark_live(&mut self, response_sent: Instant, response_latency: Duration) {
178        self.health = HealthState::Live;
179        self.last_response = response_sent;
180        self.last_response_latency = response_latency;
181        self.shared.telemetry.update_liveness(self.health, response_latency);
182    }
183
184    fn mark_not_live(&mut self) {
185        self.health = HealthState::Unknown;
186
187        // We use the default timeout as the latency for when the component is not considered alive.
188        self.shared
189            .telemetry
190            .update_liveness(self.health, DEFAULT_PROBE_TIMEOUT_DUR);
191    }
192
193    fn mark_dead(&mut self) {
194        self.health = HealthState::Dead;
195
196        // We use the default timeout as the latency for when the component is not considered alive.
197        self.shared
198            .telemetry
199            .update_liveness(self.health, DEFAULT_PROBE_TIMEOUT_DUR);
200    }
201}
202
203struct LivenessRequest {
204    component_id: usize,
205    timeout_key: Key,
206    request_sent: Instant,
207}
208
209impl LivenessRequest {
210    fn new(component_id: usize, timeout_key: Key) -> Self {
211        Self {
212            component_id,
213            timeout_key,
214            request_sent: Instant::now(),
215        }
216    }
217
218    fn into_response(self) -> LivenessResponse {
219        LivenessResponse {
220            request: self,
221            response_sent: Instant::now(),
222        }
223    }
224}
225
/// A component's answer to a [`LivenessRequest`].
struct LivenessResponse {
    // The probe being answered, carried back so the runner can match it to its component and timeout entry.
    request: LivenessRequest,
    // When the component created the response.
    response_sent: Instant,
}
230
/// A liveness state change to apply to a component.
enum HealthUpdate {
    /// The component answered a probe.
    Alive {
        /// When the response was sent.
        last_response: Instant,
        /// How long the component took to respond.
        last_response_latency: Duration,
    },
    /// The component's liveness is not currently known.
    Unknown,
    /// The component can no longer be probed.
    Dead,
}

impl HealthUpdate {
    /// Returns a short static label for the update, used in log output.
    fn as_str(&self) -> &'static str {
        match *self {
            Self::Alive { .. } => "alive",
            Self::Unknown => "unknown",
            Self::Dead => "dead",
        }
    }
}
249
250struct RegistryState {
251    registered_components: HashSet<MetaString>,
252    component_state: Vec<ComponentState>,
253    responses_tx: mpsc::Sender<LivenessResponse>,
254    responses_rx: Option<mpsc::Receiver<LivenessResponse>>,
255    pending_components: Vec<usize>,
256    pending_components_notify: Arc<Notify>,
257    readiness_notify: Arc<Notify>,
258}
259
260impl RegistryState {
261    fn new() -> Self {
262        let (responses_tx, responses_rx) = mpsc::channel(16);
263
264        Self {
265            registered_components: HashSet::new(),
266            component_state: Vec::new(),
267            responses_tx,
268            responses_rx: Some(responses_rx),
269            pending_components: Vec::new(),
270            pending_components_notify: Arc::new(Notify::new()),
271            readiness_notify: Arc::new(Notify::new()),
272        }
273    }
274}
275
/// A registry of components and their health.
///
/// `HealthRegistry` is responsible for tracking the health of all registered components, by storing both their
/// readiness, which indicates whether or not they're initialized and generally ready to process data, as well as
/// probing their liveness, which indicates if they're currently responding, or able to respond, to requests.
///
/// Cloning a `HealthRegistry` is cheap: all clones share the same underlying state.
///
/// # Telemetry
///
/// The health registry emits some internal telemetry about the status of registered components. In particular, three
/// metrics are emitted:
///
/// - `health.component_ready`: whether or not a component is ready (`gauge`, `0` for not ready, `1` for ready)
/// - `health.component_live`: whether or not a component is live (`gauge`, `0` for not live/unknown, `1` for live, `-1` for dead)
/// - `health.component_liveness_latency_seconds`: the response latency of the component for liveness probes
///   (`histogram`, in seconds)
///
/// All metrics have a `component_id` tag that corresponds to the name of the component that was given when registering it.
#[derive(Clone)]
pub struct HealthRegistry {
    // Shared registry state; also handed to the API handler and the runner.
    inner: Arc<Mutex<RegistryState>>,
}
297
298impl HealthRegistry {
    /// Creates an empty registry.
    ///
    /// The registry starts with no registered components and holds the probe response receiver until a runner
    /// takes it.
    pub fn new() -> Self {
        Self {
            inner: Arc::new(Mutex::new(RegistryState::new())),
        }
    }
305
    // Test-only accessor that exposes the shared registry state so tests can inspect it directly.
    #[cfg(test)]
    fn state(&self) -> Arc<Mutex<RegistryState>> {
        Arc::clone(&self.inner)
    }
310
311    /// Registers a component with the registry.
312    ///
313    /// A handle is returned that must be used by the component to set its readiness as well as respond to liveness
314    /// probes. See [`Health::mark_ready`], [`Health::mark_not_ready`], and [`Health::live`] for more information.
315    pub fn register_component<S: Into<MetaString>>(&self, name: S) -> Option<Health> {
316        let mut inner = self.inner.lock().unwrap();
317
318        // Make sure we don't already have this component registered.
319        let name = name.into();
320        if !inner.registered_components.insert(name.clone()) {
321            return None;
322        }
323
324        // Add the component state.
325        let readiness_notify = Arc::clone(&inner.readiness_notify);
326        let (state, handle) = ComponentState::new(name.clone(), inner.responses_tx.clone(), readiness_notify);
327        let component_id = inner.component_state.len();
328        inner.component_state.push(state);
329
330        debug!(component_id, "Registered component '{}'.", name);
331
332        // Mark ourselves as having a pending component that needs to be scheduled.
333        inner.pending_components.push(component_id);
334        inner.pending_components_notify.notify_one();
335
336        Some(handle)
337    }
338
    /// Gets an API handler for reporting the health of all components.
    ///
    /// This handler exposes routes for querying the readiness and liveness of all registered components. See
    /// [`HealthAPIHandler`] for more information about routes and responses.
    ///
    /// The handler shares this registry's state, so it reflects components registered after it was created.
    pub fn api_handler(&self) -> HealthAPIHandler {
        HealthAPIHandler::from_state(Arc::clone(&self.inner))
    }
346
    /// Waits until all registered components are ready.
    ///
    /// If no components are registered, or all currently registered components are ready, the method returns immediately. Otherwise,
    /// the method will return as soon as all registered components transition to ready.
    ///
    /// Note that components can be registered while this method is waiting, which will influence how long this method
    /// takes to return. Callers should ensure that all components have been registered before calling this method.
    ///
    /// This is equivalent to calling [`all_ready_matching`][Self::all_ready_matching] with a predicate that accepts
    /// every component name.
    pub async fn all_ready(&self) {
        self.all_ready_matching(|_| true).await
    }
357
    /// Waits until all currently registered components whose name matches `predicate` are ready.
    ///
    /// This is a scoped variant of [`all_ready`][Self::all_ready] that only considers components whose name satisfies
    /// the given predicate, which is useful for waiting on a specific subsystem's components to become ready without
    /// waiting on every other component in the registry.
    ///
    /// If no registered component matches the predicate, the method returns immediately. As with
    /// [`all_ready`][Self::all_ready], components can be registered while this method is waiting, so callers should
    /// ensure all components they care about have been registered before calling this method.
    pub async fn all_ready_matching<F>(&self, predicate: F)
    where
        F: Fn(&str) -> bool,
    {
        // Grab the shared notifier once, releasing the registry lock before any waiting happens.
        let readiness_notify = {
            let inner = self.inner.lock().unwrap();
            Arc::clone(&inner.readiness_notify)
        };

        loop {
            // Register as a waiter _before_ checking to avoid missing notifications during the check.
            let notified = readiness_notify.notified();

            if self.check_ready_matching(&predicate) {
                return;
            }

            // Not everything is ready yet: sleep until some component reports ready, then re-check from scratch.
            notified.await;
        }
    }
387
388    fn check_ready_matching<F>(&self, predicate: &F) -> bool
389    where
390        F: Fn(&str) -> bool,
391    {
392        let inner = self.inner.lock().unwrap();
393        inner
394            .component_state
395            .iter()
396            .filter(|component| predicate(&component.name))
397            .all(|component| component.is_ready())
398    }
399
400    /// Returns a JSON snapshot of the current readiness and liveness state of all registered components.
401    ///
402    /// Each component appears as a key in the returned JSON object, with its `live` and `ready` boolean fields
403    /// reflecting the state at the time of the call. This is the same data exposed by the `/health/ready` and
404    /// `/health/live` HTTP endpoints, but collected in a single pass for use outside of the HTTP handler path (for
405    /// example, when building a diagnostic artifact).
406    pub fn snapshot_json(&self) -> String {
407        #[derive(serde::Serialize)]
408        struct ComponentSnapshot {
409            live: bool,
410            ready: bool,
411        }
412
413        let inner = self.inner.lock().unwrap();
414        let mut state: std::collections::HashMap<String, ComponentSnapshot> = std::collections::HashMap::new();
415        for component in &inner.component_state {
416            state.insert(
417                component.name.to_string(),
418                ComponentSnapshot {
419                    live: component.is_live(),
420                    ready: component.is_ready(),
421                },
422            );
423        }
424        serde_json::to_string_pretty(&state).unwrap_or_else(|e| format!("{{\"error\": \"{e}\"}}"))
425    }
426
    /// Creates a [`HealthRegistryWorker`] that can be added to a supervisor to run the health registry.
    ///
    /// The worker handles the lifecycle of the health registry runner, including registering the health API routes
    /// dynamically and running the liveness probing event loop.
    ///
    /// The worker is given a clone of this registry, so it operates on the same shared state.
    pub fn worker(&self) -> HealthRegistryWorker {
        HealthRegistryWorker::new(self.clone())
    }
434
435    pub(crate) fn into_runner(self) -> Result<Runner, GenericError> {
436        // Make sure the runner hasn't already been spawned.
437        let (responses_rx, pending_components_notify) = {
438            let mut inner = self.inner.lock().unwrap();
439            let responses_rx = match inner.responses_rx.take() {
440                Some(rx) => rx,
441                None => return Err(generic_error!("health registry already spawned")),
442            };
443
444            let pending_components_notify = Arc::clone(&inner.pending_components_notify);
445            (responses_rx, pending_components_notify)
446        };
447
448        Ok(Runner::new(self.inner, responses_rx, pending_components_notify))
449    }
450}
451
/// A guard that returns the response receiver back to the registry when dropped.
///
/// This allows the health registry runner to be restarted gracefully: when the runner finishes and this guard is
/// dropped, the receiver is returned to the registry state so that a subsequent attempt to create a runner (see
/// `HealthRegistry::into_runner`) can succeed.
///
/// Note that only a receiver that is stored in the guard at the time it is dropped is returned; if `responses_rx`
/// is `None` at that point, the registry state is left untouched.
struct RunnerGuard {
    // Registry state the receiver is returned to.
    registry: Arc<Mutex<RegistryState>>,
    // The response receiver, when currently held by the guard.
    responses_rx: Option<mpsc::Receiver<LivenessResponse>>,
}
461
462impl Drop for RunnerGuard {
463    fn drop(&mut self) {
464        if let Some(rx) = self.responses_rx.take() {
465            let mut inner = self.registry.lock().expect("registry state poisoned");
466            inner.responses_rx = Some(rx);
467            debug!("Returned response receiver to registry state.");
468        }
469    }
470}
471
/// Test-only counters mirroring how many entries the runner has outstanding in its two delay queues.
#[cfg(test)]
struct RunnerState {
    // Probes scheduled but not yet sent.
    pending_scheduled_probes: AtomicUsize,
    // Probe timeouts registered but not yet fired or cleared.
    pending_probe_timeouts: AtomicUsize,
}

#[cfg(test)]
impl RunnerState {
    /// Creates the counters, both starting at zero.
    fn new() -> Self {
        let zeroed = || AtomicUsize::new(0);
        Self {
            pending_scheduled_probes: zeroed(),
            pending_probe_timeouts: zeroed(),
        }
    }

    /// Current number of scheduled probes.
    fn pending_scheduled_probes(&self) -> usize {
        let counter = &self.pending_scheduled_probes;
        counter.load(Relaxed)
    }

    /// Current number of pending probe timeouts.
    fn pending_probe_timeouts(&self) -> usize {
        let counter = &self.pending_probe_timeouts;
        counter.load(Relaxed)
    }

    /// Adds one to the scheduled-probe counter.
    fn increment_pending_scheduled_probes(&self) {
        let _previous = self.pending_scheduled_probes.fetch_add(1, Relaxed);
    }

    /// Adds one to the probe-timeout counter.
    fn increment_pending_probe_timeouts(&self) {
        let _previous = self.pending_probe_timeouts.fetch_add(1, Relaxed);
    }

    /// Subtracts one from the scheduled-probe counter.
    fn decrement_pending_scheduled_probes(&self) {
        let _previous = self.pending_scheduled_probes.fetch_sub(1, Relaxed);
    }

    /// Subtracts one from the probe-timeout counter.
    fn decrement_pending_probe_timeouts(&self) {
        let _previous = self.pending_probe_timeouts.fetch_sub(1, Relaxed);
    }
}
511
/// Drives liveness probing for all components in a registry.
pub(super) struct Runner {
    // Shared registry state holding the per-component health.
    registry: Arc<Mutex<RegistryState>>,
    // Component IDs waiting for their next liveness probe to be sent.
    pending_probes: DelayQueue<usize>,
    // Component IDs with an outstanding probe; an entry expiring means the probe timed out.
    pending_timeouts: DelayQueue<usize>,
    // Returns the probe response receiver to the registry when the runner is dropped.
    guard: RunnerGuard,
    // Signalled when new components have been registered and need to be scheduled.
    pending_components_notify: Arc<Notify>,
    // Test-only counters tracking the number of entries in the two delay queues.
    #[cfg(test)]
    state: Arc<RunnerState>,
}
521
522impl Runner {
523    fn new(
524        registry: Arc<Mutex<RegistryState>>, responses_rx: mpsc::Receiver<LivenessResponse>,
525        pending_components_notify: Arc<Notify>,
526    ) -> Self {
527        #[cfg(test)]
528        let state = Arc::new(RunnerState::new());
529
530        let guard = RunnerGuard {
531            registry: Arc::clone(&registry),
532            responses_rx: Some(responses_rx),
533        };
534
535        Self {
536            registry,
537            pending_probes: DelayQueue::new(),
538            pending_timeouts: DelayQueue::new(),
539            guard,
540            pending_components_notify,
541            #[cfg(test)]
542            state,
543        }
544    }
545
    // Test-only accessor for the runner's queue counters.
    #[cfg(test)]
    fn state(&self) -> Arc<RunnerState> {
        Arc::clone(&self.state)
    }
550
551    fn drain_pending_components(&mut self) -> Vec<usize> {
552        // Drain all pending components.
553        let mut registry = self.registry.lock().unwrap();
554        registry.pending_components.drain(..).collect()
555    }
556
    /// Sends a liveness probe to the given component and registers a timeout for it.
    ///
    /// Returns `Some(HealthUpdate::Dead)` if the component's request channel is closed, in which case the caller is
    /// expected to apply that update. Returns `None` otherwise, including when the request channel is full.
    ///
    /// Panics if `component_id` is not a valid index into the registry's component list.
    fn send_component_probe_request(&mut self, component_id: usize) -> Option<HealthUpdate> {
        let mut registry = self.registry.lock().unwrap();
        let component_state = &mut registry.component_state[component_id];

        // Check if our component is already dead, in which case we don't need to send a liveness probe.
        if component_state.request_tx.is_closed() {
            debug!(component_name = %component_state.name, "Component is dead, skipping liveness probe.");
            return Some(HealthUpdate::Dead);
        }

        trace!(component_name = %component_state.name, probe_timeout = ?DEFAULT_PROBE_TIMEOUT_DUR, "Sending liveness probe to component.");

        // Our component _isn't_ dead, so try to send a liveness probe to it.
        //
        // We'll register an entry in `pending_timeouts` that automatically marks the component as not live if we don't
        // receive a response to the liveness probe within the timeout duration.
        let timeout_key = self.pending_timeouts.insert(component_id, DEFAULT_PROBE_TIMEOUT_DUR);

        #[cfg(test)]
        self.state.increment_pending_probe_timeouts();

        let request = LivenessRequest::new(component_id, timeout_key);
        if let Err(TrySendError::Closed(request)) = component_state.request_tx.try_send(request) {
            debug!(component_name = %component_state.name, "Component is dead, removing pending timeout.");

            // We failed to send the probe to the component due to the component being dead. We'll drop our pending
            // timeout as we're going to mark this component dead right now.
            //
            // When our send fails due to the channel being full, that's OK: it means it's going to be handled by an
            // existing timeout and will be probed again later. Note that in the "full" case the timeout entry
            // inserted above is left in the queue and will expire on its own.
            self.pending_timeouts.remove(&request.timeout_key);

            #[cfg(test)]
            self.state.decrement_pending_probe_timeouts();

            return Some(HealthUpdate::Dead);
        }

        None
    }
597
    /// Schedules a liveness probe for the given component to be sent after `duration` has elapsed.
    fn schedule_probe_for_component(&mut self, component_id: usize, duration: Duration) {
        // Keep the test-only counter in step with the number of entries in `pending_probes`.
        #[cfg(test)]
        self.state.increment_pending_scheduled_probes();

        self.pending_probes.insert(component_id, duration);
    }
604
605    fn schedule_all_existing_components(&mut self, responses_rx: &mut mpsc::Receiver<LivenessResponse>) {
606        // First, drain any pending components to avoid scheduling them twice.
607        // This handles the case where components were registered before the runner started.
608        let _pending = self.drain_pending_components();
609
610        // Drain any queued probe responses from the previous runner. These responses were sent by
611        // components before the runner shut down but weren't processed. Processing them now updates
612        // `last_response` timestamps, which affects the staleness check below — a response that
613        // arrived just before shutdown should count as fresh.
614        while let Ok(response) = responses_rx.try_recv() {
615            self.handle_component_probe_response(response);
616        }
617
618        // Determine which components have stale probe results. Components whose last response is
619        // within the probe timeout are considered fresh and their health state is preserved,
620        // avoiding unnecessary bursts of failed liveness/readiness probes on runner restart.
621        let (component_count, stale_component_ids) = {
622            let registry = self.registry.lock().unwrap();
623            let now = Instant::now();
624            let stale_ids: Vec<usize> = (0..registry.component_state.len())
625                .filter(|&id| {
626                    now.duration_since(registry.component_state[id].last_response) >= DEFAULT_PROBE_TIMEOUT_DUR
627                })
628                .collect();
629            (registry.component_state.len(), stale_ids)
630        };
631
632        // Only reset health to Unknown for components with stale probe results.
633        for &component_id in &stale_component_ids {
634            self.process_component_health_update(component_id, HealthUpdate::Unknown);
635        }
636
637        // Schedule immediate probes for all components regardless of staleness.
638        for component_id in 0..component_count {
639            self.schedule_probe_for_component(component_id, Duration::ZERO);
640        }
641
642        if component_count > 0 {
643            let fresh_count = component_count - stale_component_ids.len();
644            debug!(
645                component_count,
646                fresh_count,
647                stale_count = stale_component_ids.len(),
648                "Scheduled probes for all existing components."
649            );
650        }
651    }
652
653    fn handle_component_probe_response(&mut self, response: LivenessResponse) {
654        let component_id = response.request.component_id;
655        let timeout_key = response.request.timeout_key;
656        let request_sent = response.request.request_sent;
657        let response_sent = response.response_sent;
658        let response_latency = response_sent.checked_duration_since(request_sent).unwrap_or_default();
659
660        // Clear any pending timeouts for this component and schedule the next probe.
661        let timeout_was_pending = self.pending_timeouts.try_remove(&timeout_key).is_some();
662        if !timeout_was_pending {
663            let mut registry = self.registry.lock().unwrap();
664            let component_state = &mut registry.component_state[component_id];
665
666            debug!(component_name = %component_state.name, "Received probe response for component that already timed out.");
667        }
668
669        // Update the component's health to show as alive.
670        let update = HealthUpdate::Alive {
671            last_response: response_sent,
672            last_response_latency: response_latency,
673        };
674        self.process_component_health_update(component_id, update);
675
676        // Only schedule the next probe if we successfully removed the timeout, meaning it hadn't fired yet.
677        // This prevents duplicate probe scheduling when a response arrives after a timeout.
678        if timeout_was_pending {
679            #[cfg(test)]
680            self.state.decrement_pending_probe_timeouts();
681
682            self.schedule_probe_for_component(component_id, DEFAULT_PROBE_BACKOFF_DUR);
683        }
684    }
685
    /// Handles an expired probe timeout: the component is moved to the `Unknown` liveness state and a new probe is
    /// scheduled after the backoff delay.
    fn handle_component_timeout(&mut self, component_id: usize) {
        // Update the component's health to show as not alive.
        self.process_component_health_update(component_id, HealthUpdate::Unknown);

        // Schedule the next probe for this component.
        self.schedule_probe_for_component(component_id, DEFAULT_PROBE_BACKOFF_DUR);
    }
693
    /// Applies a health update to the given component's state in the registry.
    ///
    /// Takes the registry lock for the duration of the update. Panics if `component_id` is not a valid index into
    /// the registry's component list.
    fn process_component_health_update(&mut self, component_id: usize, update: HealthUpdate) {
        // Update the component's health state based on the given update.
        let mut registry = self.registry.lock().unwrap();
        let component_state = &mut registry.component_state[component_id];
        trace!(component_name = %component_state.name, status = update.as_str(), "Updating component health status.");

        // Each variant maps onto the matching `ComponentState` transition, which also updates telemetry.
        match update {
            HealthUpdate::Alive {
                last_response,
                last_response_latency,
            } => component_state.mark_live(last_response, last_response_latency),
            HealthUpdate::Unknown => component_state.mark_not_live(),
            HealthUpdate::Dead => component_state.mark_dead(),
        }
    }
709
710    async fn run<F: Future<Output = ()>>(mut self, shutdown: F) {
711        info!("Health checker running.");
712
713        // Take the response receiver out of the guard so we can use it in the select loop.
714        // It will be put back when the guard is dropped.
715        let mut responses_rx = self
716            .guard
717            .responses_rx
718            .take()
719            .expect("responses_rx should always be Some when Runner is created");
720
721        // Schedule probes for all existing components. This allows the runner to "pick up where it
722        // left off" after a restart - any components that were registered before the runner was
723        // restarted will be immediately probed.
724        self.schedule_all_existing_components(&mut responses_rx);
725
726        // Pin the shutdown future so we can poll it in the select loop.
727        pin!(shutdown);
728
729        loop {
730            select! {
731                // Shutdown signal received - exit the run loop gracefully.
732                _ = &mut shutdown => {
733                    info!("Health checker shutting down.");
734                    break;
735                },
736
737                // A component has been scheduled to have a liveness probe sent to it.
738                Some(entry) = self.pending_probes.next() => {
739                    #[cfg(test)]
740                    self.state.decrement_pending_scheduled_probes();
741
742                    let component_id = entry.into_inner();
743                    if let Some(health_update) = self.send_component_probe_request(component_id) {
744                        // If we got a health update for this component, that means we detected that it's dead, so we need
745                        // to do an out-of-band update to its health.
746                        self.process_component_health_update(component_id, health_update);
747                    }
748                },
749
750                // A component's outstanding liveness probe has expired.
751                Some(entry) = self.pending_timeouts.next() => {
752                    #[cfg(test)]
753                    self.state.decrement_pending_probe_timeouts();
754
755                    let component_id = entry.into_inner();
756                    self.handle_component_timeout(component_id);
757                },
758
759                // A probe response has been received.
760                Some(response) = responses_rx.recv() => {
761                    self.handle_component_probe_response(response);
762                },
763
764                // A component is pending finalization of their registration.
765                _ = self.pending_components_notify.notified() => {
766                    // Drain all pending components, give them a clean initial state of "unknown", and immediately schedule a probe for them.
767                    let pending_component_ids = self.drain_pending_components();
768                    for pending_component_id in pending_component_ids {
769                        self.process_component_health_update(pending_component_id, HealthUpdate::Unknown);
770                        self.schedule_probe_for_component(pending_component_id, Duration::ZERO);
771                    }
772                },
773            }
774        }
775
776        // Put the receiver back in the guard so it can be returned to the registry state when dropped.
777        self.guard.responses_rx = Some(responses_rx);
778
779        // When we exit the loop, the RunnerGuard will be dropped, returning the response receiver
780        // back to the registry state so that a subsequent spawn() can succeed.
781    }
782}
783
784#[cfg(test)]
785mod tests {
786    use std::future::Future;
787
788    use futures::FutureExt as _;
789    use tokio::sync::oneshot;
790    use tokio_test::{
791        assert_pending, assert_ready,
792        task::{spawn, Spawn},
793    };
794
795    use super::*;
796
797    const COMPONENT_ID: &str = "test_component";
798
799    #[track_caller]
800    fn initialize_registry_with_component(
801        component_id: &str,
802    ) -> (
803        Health,
804        Spawn<impl Future<Output = ()>>,
805        Arc<Mutex<RegistryState>>,
806        Arc<RunnerState>,
807    ) {
808        let registry = HealthRegistry::new();
809        let registry_state = registry.state();
810
811        // Add our component to the registry:
812        let handle = registry.register_component(component_id).unwrap();
813
814        // Extract the registry runner task and poll it until it's quiesced.
815        //
816        // This ensures that the component is registered, and that it schedules/sends an initial probe request to the component:
817        let runner = registry.into_runner().expect("should not fail to create runner");
818        let runner_state = runner.state();
819
820        // Create a shutdown future that never resolves (for tests that don't need shutdown).
821        let shutdown = std::future::pending();
822        let registry_task = spawn(runner.run(shutdown));
823
824        (handle, registry_task, registry_state, runner_state)
825    }
826
827    #[track_caller]
828    fn drive_until_quiesced<F: Future<Output = ()>>(task: &mut Spawn<F>) {
829        assert_pending!(task.poll());
830        while task.is_woken() {
831            assert_pending!(task.poll());
832        }
833    }
834
835    fn component_live(state: &Mutex<RegistryState>, component_id: &str) -> bool {
836        let state = state.lock().unwrap();
837        state
838            .component_state
839            .iter()
840            .find(|state| state.name == component_id)
841            .map(|state| state.is_live())
842            .unwrap()
843    }
844
845    #[test]
846    fn basic_registration() {
847        let registry = HealthRegistry::new();
848        assert!(registry.register_component(COMPONENT_ID).is_some());
849    }
850
851    #[test]
852    fn duplicate_component_registration_fails() {
853        let registry = HealthRegistry::new();
854
855        // Registering the same component twice should fail:
856        assert!(registry.register_component(COMPONENT_ID).is_some());
857        assert!(registry.register_component(COMPONENT_ID).is_none());
858    }
859
860    #[test]
861    fn duplicate_runner_creation_fails_while_running() {
862        let registry = HealthRegistry::new();
863        let registry2 = registry.clone();
864
865        // First runner creation should succeed. We hold on to it so the RunnerGuard doesn't
866        // return the receiver back to the registry state.
867        let _runner = registry.into_runner().expect("first runner creation should succeed");
868
869        // Second creation should fail while the first runner still holds the receiver.
870        assert!(registry2.into_runner().is_err());
871    }
872
873    #[tokio::test]
874    async fn registry_can_be_respawned_after_shutdown() {
875        let registry = HealthRegistry::new();
876        let registry2 = registry.clone();
877        let registry3 = registry.clone();
878
879        // First runner creation should succeed.
880        let (shutdown_tx, shutdown_rx) = oneshot::channel::<()>();
881        let runner = registry.into_runner().expect("first runner creation should succeed");
882
883        // Run the runner on a spawned task so we can trigger shutdown.
884        let join_handle = tokio::spawn(runner.run(shutdown_rx.map(|_| ())));
885
886        // Trigger shutdown.
887        let _ = shutdown_tx.send(());
888
889        // Wait for the runner to stop.
890        join_handle.await.expect("runner should complete without panic");
891
892        // Now we should be able to create a runner again (the RunnerGuard returned the receiver).
893        let _runner2 = registry2
894            .into_runner()
895            .expect("should be able to create runner after shutdown");
896
897        // But not a third time while the second runner holds the receiver.
898        assert!(
899            registry3.into_runner().is_err(),
900            "should not be able to create runner while one exists"
901        );
902    }
903
904    #[test]
905    fn readiness() {
906        let registry = HealthRegistry::new();
907
908        // An empty registry is always ready, so `all_ready` resolves immediately:
909        let mut all_ready_fut = spawn(registry.all_ready());
910        assert_ready!(all_ready_fut.poll());
911
912        // Components start out as not ready, so adding this component changes the registry to not ready overall:
913        let mut handle = registry.register_component(COMPONENT_ID).unwrap();
914
915        let mut all_ready_fut = spawn(registry.all_ready());
916        assert_pending!(all_ready_fut.poll());
917
918        // Now mark the component as ready. `all_ready` should resolve on the next poll:
919        handle.mark_ready();
920
921        assert!(all_ready_fut.is_woken());
922        assert_ready!(all_ready_fut.poll());
923
924        // Ensure a fresh `all_ready` call immediately observes all components being ready:
925        let mut all_ready_fut = spawn(registry.all_ready());
926        assert_ready!(all_ready_fut.poll());
927
928        // Finally, make sure that the readiness state isn't latched, as `all_ready` should always reflect the current state:
929        handle.mark_not_ready();
930
931        let mut all_ready_fut = spawn(registry.all_ready());
932        assert_pending!(all_ready_fut.poll());
933    }
934
935    #[tokio::test(start_paused = true)]
936    async fn component_responds_before_timeout() {
937        // Create our registry with a registered component:
938        let (mut handle, mut registry, registry_state, runner_state) = initialize_registry_with_component(COMPONENT_ID);
939
940        // Manually create our `live` call and ensure that it's not ready yet, as the registry task has not yet been driven,
941        // which means the component hasn't been registered yet and no probe request has been sent:
942        let mut live_future = spawn(handle.live());
943        assert_pending!(live_future.poll());
944        assert_eq!(runner_state.pending_probe_timeouts(), 0);
945        assert_eq!(runner_state.pending_scheduled_probes(), 0);
946
947        // Drive our registry task until it us quiesced to ensure the component is registered and that a probe request is sent:
948        drive_until_quiesced(&mut registry);
949        assert_eq!(runner_state.pending_probe_timeouts(), 1);
950        assert_eq!(runner_state.pending_scheduled_probes(), 0);
951
952        // Ensure our component is not live since, despite being registered, we haven't received a probe response for it yet:
953        assert!(!component_live(&registry_state, COMPONENT_ID));
954
955        // After polling the registry task, we should have sent a probe request which will have now woken up our `live` future.
956        //
957        // Poll the future which should then respond to the probe request:
958        assert!(live_future.is_woken());
959        assert_ready!(live_future.poll());
960
961        // The registry task should have been woken by the probe response.
962        //
963        // Drive the registry task until it is quiesced and ensure that the component is now live:
964        assert!(registry.is_woken());
965        drive_until_quiesced(&mut registry);
966
967        assert!(component_live(&registry_state, COMPONENT_ID));
968
969        // Since the probe response was received, we should have a pending schedule probe now since this is a "normal" probe now,
970        // and isn't the initial probe request which is scheduled immediately:
971        assert_eq!(runner_state.pending_probe_timeouts(), 0);
972        assert_eq!(runner_state.pending_scheduled_probes(), 1);
973    }
974
975    #[tokio::test(start_paused = true)]
976    async fn component_responds_after_timeout() {
977        // Create our registry with a registered component:
978        let (mut handle, mut registry, registry_state, runner_state) = initialize_registry_with_component(COMPONENT_ID);
979
980        // Manually create our `live` call and ensure that it's not ready yet, as the registry task has not yet been driven,
981        // which means the component hasn't been registered yet and no probe request has been sent:
982        let mut live_future = spawn(handle.live());
983        assert_pending!(live_future.poll());
984        assert_eq!(runner_state.pending_probe_timeouts(), 0);
985        assert_eq!(runner_state.pending_scheduled_probes(), 0);
986
987        // Drive our registry task until it us quiesced to ensure the component is registered and that a probe request is sent:
988        drive_until_quiesced(&mut registry);
989        assert_eq!(runner_state.pending_probe_timeouts(), 1);
990        assert_eq!(runner_state.pending_scheduled_probes(), 0);
991
992        // Ensure our component is not live since, despite being registered, we haven't received a probe response for it yet:
993        assert!(!component_live(&registry_state, COMPONENT_ID));
994
995        // After polling the registry task, we should have sent a probe request which will have now woken up
996        // our `live` future, but we won't yet poll it. In fact, we'll advance time _past_ the probe timeout to simulate
997        // the probe timeout expiring:
998        assert!(live_future.is_woken());
999        assert!(!registry.is_woken());
1000
1001        tokio::time::advance(DEFAULT_PROBE_TIMEOUT_DUR + Duration::from_secs(1)).await;
1002
1003        // The registry task should have been woken by the probe timeout expiring.
1004        //
1005        // Drive the registry task until it is quiesced and ensure that the component is still not live:
1006        assert!(registry.is_woken());
1007        drive_until_quiesced(&mut registry);
1008
1009        assert!(!component_live(&registry_state, COMPONENT_ID));
1010
1011        // Since the probe response was not received, we should have a pending schedule probe now since this is a "normal" probe now,
1012        // and isn't the initial probe request which is scheduled immediately:
1013        assert_eq!(runner_state.pending_probe_timeouts(), 0);
1014        assert_eq!(runner_state.pending_scheduled_probes(), 1);
1015
1016        // Now, we'll actually drive the `live` future to respond to the probe request, which should mark the component as live:
1017        assert_ready!(live_future.poll());
1018
1019        assert!(registry.is_woken());
1020        drive_until_quiesced(&mut registry);
1021
1022        assert!(component_live(&registry_state, COMPONENT_ID));
1023
1024        // However, since the first probe response timed out, and we haven't yet fired off our scheduled probe, receiving this late
1025        // response should not trigger the scheduling of another probe:
1026        assert_eq!(runner_state.pending_probe_timeouts(), 0);
1027        assert_eq!(runner_state.pending_scheduled_probes(), 1);
1028    }
1029
1030    #[track_caller]
1031    #[allow(clippy::type_complexity)]
1032    fn initialize_registry_with_component_and_shutdown(
1033        component_id: &str,
1034    ) -> (
1035        Health,
1036        Spawn<impl Future<Output = ()>>,
1037        Arc<Mutex<RegistryState>>,
1038        Arc<RunnerState>,
1039        oneshot::Sender<()>,
1040    ) {
1041        let registry = HealthRegistry::new();
1042        let registry_state = registry.state();
1043        let handle = registry.register_component(component_id).unwrap();
1044        let runner = registry.into_runner().expect("should not fail to create runner");
1045        let runner_state = runner.state();
1046
1047        let (shutdown_tx, shutdown_rx) = oneshot::channel::<()>();
1048        let registry_task = spawn(runner.run(shutdown_rx.map(|_| ())));
1049
1050        (handle, registry_task, registry_state, runner_state, shutdown_tx)
1051    }
1052
1053    #[tokio::test(start_paused = true)]
1054    async fn respawn_preserves_fresh_component_health() {
1055        // Create our registry with a registered component and drive the runner until the initial probe is sent:
1056        let (mut handle, mut registry, registry_state, _runner_state, shutdown_tx) =
1057            initialize_registry_with_component_and_shutdown(COMPONENT_ID);
1058        drive_until_quiesced(&mut registry);
1059
1060        // Respond to the probe request so the component becomes live:
1061        let mut live_future = spawn(handle.live());
1062        assert_ready!(live_future.poll());
1063        drive_until_quiesced(&mut registry);
1064        assert!(component_live(&registry_state, COMPONENT_ID));
1065
1066        // Shut down the runner gracefully, which returns the response receiver to the registry state:
1067        let _ = shutdown_tx.send(());
1068        assert_ready!(registry.poll());
1069
1070        // Respawn the runner immediately (no time advance), so the probe result is still fresh:
1071        let registry = HealthRegistry {
1072            inner: Arc::clone(&registry_state),
1073        };
1074        let runner = registry
1075            .into_runner()
1076            .expect("should be able to respawn after shutdown");
1077        let _runner_state = runner.state();
1078        let mut registry = spawn(runner.run(std::future::pending()));
1079        drive_until_quiesced(&mut registry);
1080
1081        // The component's health should be preserved as Live since its last response is fresh:
1082        assert!(component_live(&registry_state, COMPONENT_ID));
1083    }
1084
1085    #[tokio::test(start_paused = true)]
1086    async fn respawn_resets_stale_component_health() {
1087        // Create our registry with a registered component and drive the runner until the initial probe is sent:
1088        let (mut handle, mut registry, registry_state, _runner_state, shutdown_tx) =
1089            initialize_registry_with_component_and_shutdown(COMPONENT_ID);
1090        drive_until_quiesced(&mut registry);
1091
1092        // Respond to the probe request so the component becomes live:
1093        let mut live_future = spawn(handle.live());
1094        assert_ready!(live_future.poll());
1095        drive_until_quiesced(&mut registry);
1096        assert!(component_live(&registry_state, COMPONENT_ID));
1097
1098        // Shut down the runner gracefully, which returns the response receiver to the registry state:
1099        let _ = shutdown_tx.send(());
1100        assert_ready!(registry.poll());
1101
1102        // Advance time past the probe timeout so the last response becomes stale:
1103        tokio::time::advance(DEFAULT_PROBE_TIMEOUT_DUR + Duration::from_secs(1)).await;
1104
1105        // Respawn the runner. The stale component should be reset to Unknown:
1106        let registry = HealthRegistry {
1107            inner: Arc::clone(&registry_state),
1108        };
1109        let runner = registry
1110            .into_runner()
1111            .expect("should be able to respawn after shutdown");
1112        let _runner_state = runner.state();
1113        let mut registry = spawn(runner.run(std::future::pending()));
1114        drive_until_quiesced(&mut registry);
1115
1116        // The component's health should have been reset to Unknown since its last response is stale:
1117        assert!(!component_live(&registry_state, COMPONENT_ID));
1118    }
1119}
saluki_core/health/mod.rs

saluki_core/health/
mod.rs