Skip to main content

saluki_core/runtime/
restart.rs

1use std::{collections::VecDeque, time::Duration};
2
3use tokio::time::Instant;
4use tracing::debug;
5
/// Restart mode for child processes.
///
/// Selects which child processes are restarted when one of them fails.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum RestartMode {
    /// Restarts the failed child process only.
    OneForOne,

    /// Restarts all child processes, including the failed one.
    OneForAll,
}
15
/// Restart strategy for a supervisor.
///
/// Defaults to one-for-one mode ([`RestartMode::OneForOne`], which restarts only the failed process) and a restart
/// intensity of 1 over a period of 5 seconds.
///
/// # Restarts and permanent failure
///
/// A supervisor will allow up to `intensity` process restarts, across all child processes, over a given `period`. When
/// this limit is exceeded, the supervisor will stop all child processes and return an error itself, indicating that the
/// supervisor has failed overall.
///
/// Permanent failure bubbles up to the parent supervisor, until reaching the root supervisor. Once permanent failure
/// reaches the root supervisor, and the root supervisor exceeds its own restart limits, the root supervisor will fail
/// and cease execution.
#[derive(Clone, Copy)]
pub struct RestartStrategy {
    /// Which child processes are restarted when one of them fails.
    mode: RestartMode,
    /// Maximum number of restarts tolerated within `period`.
    intensity: usize,
    /// Length of the window over which restarts are counted against `intensity`.
    period: Duration,
}
36
37impl RestartStrategy {
38    /// Creates a new `RestartStrategy` with the given mode, intensity, and period.
39    pub const fn new(mode: RestartMode, intensity: usize, period: Duration) -> Self {
40        Self {
41            mode,
42            intensity,
43            period,
44        }
45    }
46
47    /// Creates a new `RestartStrategy` with the one-to-one restart mode, and the default intensity/period.
48    pub fn one_to_one() -> Self {
49        Self {
50            mode: RestartMode::OneForOne,
51            ..Default::default()
52        }
53    }
54
55    /// Creates a new `RestartStrategy` with the one-for-all restart mode, and the default intensity/period.
56    pub fn one_for_all() -> Self {
57        Self {
58            mode: RestartMode::OneForAll,
59            ..Default::default()
60        }
61    }
62
63    /// Sets the restart intensity and period for the strategy.
64    pub const fn with_intensity_and_period(mut self, intensity: usize, period: Duration) -> Self {
65        self.intensity = intensity;
66        self.period = period;
67        self
68    }
69}
70
71impl Default for RestartStrategy {
72    fn default() -> Self {
73        Self::new(RestartMode::OneForOne, 1, Duration::from_secs(5))
74    }
75}
76
/// Per-child restart policy.
///
/// [`RestartStrategy`] describes supervisor-wide behavior: which children are restarted together, and how many
/// restarts are tolerated before the supervisor gives up. [`RestartType`], by contrast, decides whether one
/// _particular_ child may be restarted at all, depending on the way it exited.
///
/// The default is [`Permanent`][Self::Permanent], under which a child process is always restarted.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub enum RestartType {
    /// Always restart the child, no matter whether its exit was normal or abnormal.
    ///
    /// Intended for long-lived processes that are expected to be running at all times.
    #[default]
    Permanent,

    /// Restart the child only after an abnormal exit.
    ///
    /// Errors, panics, and forced aborts count as abnormal. A normal exit (the child's future resolving with
    /// `Ok(())`) is considered deliberate and does not lead to a restart. Only the child's _own_ exit is judged this
    /// way: when a sibling triggers a [`RestartMode::OneForAll`] group restart, a transient child is still restarted
    /// along with the group, matching Erlang/OTP.
    Transient,

    /// Never restart the child, however it exits.
    ///
    /// Intended for short-lived, on-demand children -- for example, one task per network connection -- for which
    /// terminating is a normal part of operation. Even a [`RestartMode::OneForAll`] group restart triggered by a
    /// sibling does not bring a temporary child back: it is shut down with the group and stays down.
    Temporary,
}
107
108impl RestartType {
109    /// Returns `true` if a child with this restart policy should be restarted, given how it exited.
110    ///
111    /// `abnormal` indicates the child exited due to an error, panic, or forced abort, rather than completing normally.
112    pub(super) fn should_restart(self, abnormal: bool) -> bool {
113        match self {
114            Self::Permanent => true,
115            Self::Transient => abnormal,
116            Self::Temporary => false,
117        }
118    }
119}
120
/// Action a supervisor should take after a restart has been evaluated.
pub(super) enum RestartAction {
    /// Execute a restart with the given mode.
    Restart(RestartMode),

    /// Supervisor must shut down as the maximum number of restarts has been reached.
    Shutdown,
}
128
/// Tracks recent restarts so they can be checked against the limits of a [`RestartStrategy`].
pub(super) struct RestartState {
    /// Strategy supplying the restart mode, intensity, and period.
    strategy: RestartStrategy,
    /// Timestamps of the most recent permitted restarts, oldest first.
    restart_history: VecDeque<Instant>,
}
133
134impl RestartState {
135    /// Creates a new `RestartState` with the given strategy.
136    pub fn new(strategy: RestartStrategy) -> Self {
137        Self {
138            strategy,
139            restart_history: VecDeque::with_capacity(strategy.intensity),
140        }
141    }
142
143    /// Evaluates a restart based on the current state and determine the action the supervisor should take in response.
144    pub fn evaluate_restart(&mut self) -> RestartAction {
145        // Short circuit if our intensity is zero.
146        if self.strategy.intensity == 0 {
147            debug!("Restart strategy configured with restart intensity of zero, shutting down.");
148            return RestartAction::Shutdown;
149        }
150
151        // Since we only keep track of the last `intensity` restarts, we simply need to check if the oldest restart
152        // we're tracking is within `period` of the current time, and if the number of tracked restarts is equal to
153        // `intensity`.
154        //
155        // When both of these are true, we have exceeded the restart intensity limit and must shutdown.
156        let now = Instant::now();
157        if self.restart_history.len() == self.strategy.intensity {
158            let oldest = self.restart_history.front().expect("restart history cannot be empty");
159            if now.saturating_duration_since(*oldest) < self.strategy.period {
160                debug!(
161                    "Restart limit exceeded ({} in {:?}), shutting down.",
162                    self.strategy.intensity, self.strategy.period
163                );
164                return RestartAction::Shutdown;
165            }
166
167            // Remove the oldest restart from the history since it is outside the period.
168            self.restart_history.pop_front();
169        }
170
171        // Track this latest restart.
172        self.restart_history.push_back(now);
173
174        debug!("Restart limit not exceeded, restarting worker.");
175        RestartAction::Restart(self.strategy.mode)
176    }
177}