saluki_core/runtime/restart.rs
1use std::{collections::VecDeque, time::Duration};
2
3use tokio::time::Instant;
4use tracing::debug;
5
/// Restart mode for child processes.
///
/// Selects which child processes are restarted when one of them fails.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum RestartMode {
    /// Restarts the failed child process only.
    OneForOne,

    /// Restarts all child processes, including the failed one.
    OneForAll,
}
15
/// Restart strategy for a supervisor.
///
/// Defaults to one-for-one mode (only restart the failed process) and a restart intensity of 1 over a period of 5
/// seconds.
///
/// # Restarts and permanent failure
///
/// A supervisor will allow up to `intensity` process restarts, across all child processes, over a given `period`. When
/// this limit is exceeded, the supervisor will stop all child processes and return an error itself, indicating that the
/// supervisor has failed overall.
///
/// Permanent failure bubbles up to the parent supervisor, until reaching the root supervisor. Once permanent failure
/// reaches the root supervisor, and the root supervisor exceeds its own restart limits, the root supervisor will fail
/// and cease execution.
#[derive(Clone, Copy)]
pub struct RestartStrategy {
    /// Which child processes are restarted when one of them fails.
    mode: RestartMode,
    /// Maximum number of restarts permitted within `period`.
    ///
    /// A value of zero permits no restarts at all: evaluating a restart always results in shutdown.
    intensity: usize,
    /// Length of the time window over which restarts are counted against `intensity`.
    period: Duration,
}
36
37impl RestartStrategy {
38 /// Creates a new `RestartStrategy` with the given mode, intensity, and period.
39 pub const fn new(mode: RestartMode, intensity: usize, period: Duration) -> Self {
40 Self {
41 mode,
42 intensity,
43 period,
44 }
45 }
46
47 /// Creates a new `RestartStrategy` with the one-to-one restart mode, and the default intensity/period.
48 pub fn one_to_one() -> Self {
49 Self {
50 mode: RestartMode::OneForOne,
51 ..Default::default()
52 }
53 }
54
55 /// Creates a new `RestartStrategy` with the one-for-all restart mode, and the default intensity/period.
56 pub fn one_for_all() -> Self {
57 Self {
58 mode: RestartMode::OneForAll,
59 ..Default::default()
60 }
61 }
62
63 /// Sets the restart intensity and period for the strategy.
64 pub const fn with_intensity_and_period(mut self, intensity: usize, period: Duration) -> Self {
65 self.intensity = intensity;
66 self.period = period;
67 self
68 }
69}
70
71impl Default for RestartStrategy {
72 fn default() -> Self {
73 Self::new(RestartMode::OneForOne, 1, Duration::from_secs(5))
74 }
75}
76
/// Restart policy for an individual child process.
///
/// While [`RestartStrategy`] governs supervisor-wide behavior (which children are restarted together, and how often
/// before the supervisor gives up), [`RestartType`] governs whether an _individual_ child is eligible for restart at
/// all, based on how it exited.
///
/// Defaults to [`Permanent`][Self::Permanent], which marks a child process to always be restarted.
///
/// # Summary
///
/// How each policy responds to the child's own exit:
///
/// | Policy      | Normal exit   | Abnormal exit |
/// |-------------|---------------|---------------|
/// | `Permanent` | restarted     | restarted     |
/// | `Transient` | not restarted | restarted     |
/// | `Temporary` | not restarted | not restarted |
#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)]
pub enum RestartType {
    /// The child is always restarted, whether it exits normally or abnormally.
    ///
    /// This suits long-lived processes that are always expected to be running.
    #[default]
    Permanent,

    /// The child is restarted only if it exits abnormally.
    ///
    /// An abnormal exit is an error, panic, or forced abort. A normal exit (the child's future resolves with `Ok(())`)
    /// is treated as intentional, and the child is not restarted. This governs the child's _own_ exit; a transient
    /// child is still restarted when a sibling triggers a [`RestartMode::OneForAll`] group restart, matching
    /// Erlang/OTP.
    Transient,

    /// The child is never restarted, regardless of how it exits.
    ///
    /// This suits short-lived, on-demand children -- for example, one task per network connection -- whose termination
    /// is a normal part of operation. A temporary child is never restarted even when a sibling triggers a
    /// [`RestartMode::OneForAll`] group restart: it is shut down with the group but not brought back.
    Temporary,
}
107
108impl RestartType {
109 /// Returns `true` if a child with this restart policy should be restarted, given how it exited.
110 ///
111 /// `abnormal` indicates the child exited due to an error, panic, or forced abort, rather than completing normally.
112 pub(super) fn should_restart(self, abnormal: bool) -> bool {
113 match self {
114 Self::Permanent => true,
115 Self::Transient => abnormal,
116 Self::Temporary => false,
117 }
118 }
119}
120
/// Action a supervisor should take in response to a restart request.
///
/// Produced by `RestartState::evaluate_restart` after checking the request against the configured restart limits.
pub(super) enum RestartAction {
    /// Execute a restart with the given mode.
    Restart(RestartMode),

    /// Supervisor must shutdown as the maximum number of restarts has been reached.
    ///
    /// This is also the result when the strategy is configured with a restart intensity of zero.
    Shutdown,
}
128
/// Tracks recently granted restarts and decides whether a further restart is permitted under a [`RestartStrategy`].
pub(super) struct RestartState {
    /// Strategy supplying the restart mode, intensity, and period.
    strategy: RestartStrategy,
    /// Timestamps of granted restarts, with the oldest at the front.
    restart_history: VecDeque<Instant>,
}
133
134impl RestartState {
135 /// Creates a new `RestartState` with the given strategy.
136 pub fn new(strategy: RestartStrategy) -> Self {
137 Self {
138 strategy,
139 restart_history: VecDeque::with_capacity(strategy.intensity),
140 }
141 }
142
143 /// Evaluates a restart based on the current state and determine the action the supervisor should take in response.
144 pub fn evaluate_restart(&mut self) -> RestartAction {
145 // Short circuit if our intensity is zero.
146 if self.strategy.intensity == 0 {
147 debug!("Restart strategy configured with restart intensity of zero, shutting down.");
148 return RestartAction::Shutdown;
149 }
150
151 // Since we only keep track of the last `intensity` restarts, we simply need to check if the oldest restart
152 // we're tracking is within `period` of the current time, and if the number of tracked restarts is equal to
153 // `intensity`.
154 //
155 // When both of these are true, we have exceeded the restart intensity limit and must shutdown.
156 let now = Instant::now();
157 if self.restart_history.len() == self.strategy.intensity {
158 let oldest = self.restart_history.front().expect("restart history cannot be empty");
159 if now.saturating_duration_since(*oldest) < self.strategy.period {
160 debug!(
161 "Restart limit exceeded ({} in {:?}), shutting down.",
162 self.strategy.intensity, self.strategy.period
163 );
164 return RestartAction::Shutdown;
165 }
166
167 // Remove the oldest restart from the history since it is outside the period.
168 self.restart_history.pop_front();
169 }
170
171 // Track this latest restart.
172 self.restart_history.push_back(now);
173
174 debug!("Restart limit not exceeded, restarting worker.");
175 RestartAction::Restart(self.strategy.mode)
176 }
177}