CheckpointAndExitAfterWallclock.hpp
1 // Distributed under the MIT License.
2 // See LICENSE.txt for details.
3 
4 #pragma once
5 
6 #include <optional>
7 #include <pup.h>
8 #include <string>
9 #include <type_traits>
10 #include <utility>
11 
12 #include "Options/Auto.hpp"
13 #include "Options/Options.hpp"
14 #include "Parallel/AlgorithmMetafunctions.hpp"
16 #include "Parallel/GlobalCache.hpp"
17 #include "Parallel/Main.hpp"
18 #include "Parallel/PhaseControl/PhaseChange.hpp"
20 #include "Utilities/Functional.hpp"
22 #include "Utilities/TMPL.hpp"
23 #include "Utilities/TaggedTuple.hpp"
24 
25 /// \cond
26 namespace PhaseControl {
27 template <typename Metavariables, typename PhaseChangeRegistrars>
29 
// Registrar namespace. The visible lines define a member alias template `f`
// that maps a registrar list `PhaseChangeRegistrars` to
// ::PhaseControl::CheckpointAndExitAfterWallclock<Metavariables,
// PhaseChangeRegistrars>.
// NOTE(review): the line that opens the enclosing struct is not visible in
// this listing; presumably it is `struct CheckpointAndExitAfterWallclock {`
// -- confirm against the real header.
30 namespace Registrars {
31 template <typename Metavariables>
33  template <typename PhaseChangeRegistrars>
34  using f =
35  ::PhaseControl::CheckpointAndExitAfterWallclock<Metavariables,
36  PhaseChangeRegistrars>;
37 };
38 } // namespace Registrars
39 /// \endcond
40 
41 namespace Tags {
42 /// Storage in the phase change decision tuple so that the Main chare can record
43 /// the phase to go to when restarting the run from a checkpoint file.
44 ///
45 /// \note This tag is not intended to participate in any of the reduction
46 /// procedures, so will error if the combine method is called.
47 template <typename PhaseType>
48 struct RestartPhase {
50 
51  struct combine_method {
52  std::optional<PhaseType> operator()(
53  const std::optional<PhaseType> /*first_phase*/,
54  const std::optional<PhaseType>& /*second_phase*/) noexcept {
55  ERROR(
56  "The restart phase should only be altered by the phase change "
57  "arbitration in the Main chare, so no reduction data should be "
58  "provided.");
59  }
60  };
61 
63 };
64 
65 /// Storage in the phase change decision tuple so that the Main chare can record
66 /// the elapsed wallclock time since the start of the run.
67 ///
68 /// \note This tag is not intended to participate in any of the reduction
69 /// procedures, so will error if the combine method is called.
72 
73  struct combine_method {
74  std::optional<double> operator()(
75  const std::optional<double> /*first_time*/,
76  const std::optional<double>& /*second_time*/) noexcept {
77  ERROR(
78  "The wallclock time at which a checkpoint was requested should "
79  "only be altered by the phase change arbitration in the Main "
80  "chare, so no reduction data should be provided.");
81  }
82  };
84 };
85 
86 /// Stores whether the checkpoint and exit has been requested.
87 ///
88 /// Combinations are performed via `funcl::Or`, as the phase in question should
89 /// be chosen if any component requests the jump.
91  using type = bool;
92 
95 };
96 
97 } // namespace Tags
98 
99 /*!
100  * \brief Phase control object that runs the WriteCheckpoint and Exit phases
101  * after a specified amount of wallclock time has elapsed.
102  *
103  * This phase control is useful for running SpECTRE executables performing
104  * lengthy computations that may exceed a supercomputer's wallclock limits.
105  * Writing a single checkpoint at the end of the job's allocated time allows
106  * the computation to be continued, while minimizing the disc space taken up by
107  * checkpoint files.
108  *
109  * Note that this phase control is not a trigger on wallclock time. Rather,
110  * it checks the elapsed wallclock time when called, likely from a global sync
111  * point triggered by some other mechanism, e.g., at some slab boundary.
112  * Therefore, the WriteCheckpoint and Exit phases will run the first time
113  * this phase control is called after the specified wallclock time has been
114  * reached.
115  *
116  * \warning the global sync points _must_ be triggered often enough to ensure
117  * there will be at least one sync point (i.e., one call to this phase control)
118  * in the window between the requested checkpoint-and-exit time and the time at
119  * which the batch system will kill the executable. To make this more concrete,
120  * consider this example: when running on a 12-hour queue with a
121  * checkpoint-and-exit requested after 11.5 hours, there is a 0.5-hour window
122  * for a global sync to occur, the checkpoint files to be written to disc, and
123  * the executable to clean up. In this case, triggering a global sync every
124  * 2-10 minutes might be desirable. Matching the global sync frequency with the
125  * time window for checkpoint and exit is the responsibility of the user!
126  */
127 template <typename Metavariables,
128  typename PhaseChangeRegistrars = tmpl::list<
129  Registrars::CheckpointAndExitAfterWallclock<Metavariables>>>
131  : public PhaseChange<PhaseChangeRegistrars> {
132  // This PhaseChange only makes sense if Metavars has a WriteCheckpoint phase
133  static_assert(Parallel::Algorithm_detail::has_WriteCheckpoint_v<
134  typename Metavariables::Phase>,
135  "Requested to write checkpoints but Metavariables::Phase "
136  "doesn't have a WriteCheckpoint phase");
137 
// Option-parsing constructor (fragment: the line declaring the
// `wallclock_hours` parameter is not visible in this listing).
// Copies `wallclock_hours` into wallclock_hours_for_checkpoint_and_exit_ and
// raises a PARSE_ERROR with `context` when a value is present and negative.
139  const Options::Context& context = {})
140  : wallclock_hours_for_checkpoint_and_exit_(wallclock_hours) {
// NOTE(review): only values strictly below 0.0 are rejected, so 0.0 is
// accepted even though the message asks for a "positive" time -- confirm
// whether zero should be allowed or the wording should say "non-negative".
141  if (wallclock_hours.has_value() and wallclock_hours.value() < 0.0) {
142  PARSE_ERROR(context, "Must give a positive time in hours, but got "
143  << wallclock_hours.value());
144  }
145  }
146  explicit CheckpointAndExitAfterWallclock(CkMigrateMessage* msg) noexcept
148 
149  /// \cond
151  using PUP::able::register_constructor;
153  /// \endcond
154 
155  struct WallclockHours {
157  static constexpr Options::String help = {
158  "Time in hours after which to write the checkpoint and exit. "
159  "If 'None' is specified, no action will be taken."};
160  };
161  using options = tmpl::list<WallclockHours>;
162  static constexpr Options::String help{
163  "Once the wallclock time has exceeded the specified amount, trigger "
164  "writing a checkpoint and then exit."};
165 
166  using argument_tags = tmpl::list<>;
167  using return_tags = tmpl::list<>;
168 
169  using phase_change_tags_and_combines =
170  tmpl::list<Tags::RestartPhase<typename Metavariables::Phase>,
173 
174  template <typename LocalMetavariables>
175  using participating_components = typename LocalMetavariables::component_list;
176 
// Resets this phase change's entries in the decision tuple to their "nothing
// pending" state: no restart phase, no recorded wallclock hours, and no
// checkpoint-and-exit request.
// NOTE(review): the line declaring the type of `phase_change_decision_data`
// is not visible in this listing; the body dereferences it and indexes it
// with tuples::get, so presumably it is a non-null pointer to a TaggedTuple
// of DecisionTags -- confirm.
177  template <typename... DecisionTags>
178  void initialize_phase_data_impl(
180  phase_change_decision_data) const noexcept {
181  tuples::get<Tags::RestartPhase<typename Metavariables::Phase>>(
182  *phase_change_decision_data) = std::nullopt;
183  tuples::get<Tags::WallclockHoursAtCheckpoint>(*phase_change_decision_data) =
184  std::nullopt;
185  tuples::get<Tags::CheckpointAndExitRequested>(*phase_change_decision_data) =
186  false;
187  }
188 
// Contributes this component's data to the phase-change reduction.
// A compile-time branch on ParallelComponent::chare_type selects between two
// calls to Parallel::contribute_to_phase_change_reduction<ParallelComponent>;
// the first branch additionally passes `array_index`.
// NOTE(review): the first parameter, the type compared against chare_type,
// and the remaining call arguments are on lines not visible in this listing
// -- consult the real header before relying on this description.
189  template <typename ParallelComponent, typename ArrayIndex,
190  typename LocalMetavariables>
191  void contribute_phase_data_impl(
193  const ArrayIndex& array_index) const noexcept {
194  if constexpr (std::is_same_v<typename ParallelComponent::chare_type,
196  Parallel::contribute_to_phase_change_reduction<ParallelComponent>(
198  array_index);
199  } else {
200  Parallel::contribute_to_phase_change_reduction<ParallelComponent>(
202  }
203  }
204 
// Decides which phase to run next, based on the elapsed wallclock time and
// the entries this phase change keeps in the decision tuple.
//
// Returns an empty optional when no wallclock limit is configured or when no
// jump is warranted. Otherwise returns a (phase, ArbitrationStrategy) pair:
//  - (WriteCheckpoint, RunPhaseImmediately): a checkpoint-and-exit was
//    requested and the elapsed hours have reached the configured limit.
//    `current_phase` and the elapsed hours are recorded in the decision
//    tuple first.
//  - (Exit, RunPhaseImmediately): a restart phase is already recorded and
//    the elapsed hours are at least the recorded hours.
//  - (recorded restart phase, PermitAdditionalJumps): a restart phase is
//    recorded but the elapsed hours are below the recorded hours; both
//    records are cleared before returning.
//
// NOTE(review): the full return type, the type of
// `phase_change_decision_data`, and any parameter after `current_phase` are
// on lines not visible in this listing.
205  template <typename... DecisionTags, typename LocalMetavariables>
206  typename std::optional<
208  arbitrate_phase_change_impl(
210  phase_change_decision_data,
211  const typename LocalMetavariables::Phase current_phase,
213  const noexcept {
214  // If no checkpoint-and-exit time given, then do nothing
215  if (not wallclock_hours_for_checkpoint_and_exit_.has_value()) {
216  return std::nullopt;
217  }
218 
// sys::wall_time() reports seconds; divide by 3600 to work in hours.
219  const double elapsed_hours = sys::wall_time() / 3600.0;
220 
// Both references alias entries in the decision tuple, so the assignments and
// reset() calls below modify the tuple itself.
221  auto& restart_phase =
222  tuples::get<Tags::RestartPhase<typename Metavariables::Phase>>(
223  *phase_change_decision_data);
224  auto& wallclock_hours_at_checkpoint =
225  tuples::get<Tags::WallclockHoursAtCheckpoint>(
226  *phase_change_decision_data);
227  if (restart_phase.has_value()) {
228  ASSERT(wallclock_hours_at_checkpoint.has_value(),
229  "Consistency error: Should have recorded the Wallclock time "
230  "while recording a phase to restart from.");
231  // This `if` branch, where restart_phase has a value, is the
232  // post-checkpoint call to arbitrate_phase_change. Depending on the time
233  // elapsed so far in this run, next phase is...
234  // - Exit, if the time is large
235  // - restart_phase, if the time is small
// NOTE(review): "small" presumably corresponds to a run restarted from the
// checkpoint, whose wall time starts over -- confirm.
236  if (elapsed_hours >= wallclock_hours_at_checkpoint.value()) {
237  // Preserve restart_phase for use after restarting from the checkpoint
238  return std::make_pair(Metavariables::Phase::Exit,
239  ArbitrationStrategy::RunPhaseImmediately);
240  } else {
241  // Reset restart_phase until it is needed for the next checkpoint
// Copy the phase out first: reset() below would otherwise discard it.
242  const auto result = restart_phase;
243  restart_phase.reset();
244  wallclock_hours_at_checkpoint.reset();
245  return std::make_pair(result.value(),
246  ArbitrationStrategy::PermitAdditionalJumps);
247  }
248  }
249 
250  auto& checkpoint_and_exit_requested =
251  tuples::get<Tags::CheckpointAndExitRequested>(
252  *phase_change_decision_data);
253  if (checkpoint_and_exit_requested) {
// The request flag is cleared whether or not the time limit has been reached.
254  checkpoint_and_exit_requested = false;
255  // We checked wallclock_hours_for_checkpoint_and_exit_ has value above
256  if (elapsed_hours >= wallclock_hours_for_checkpoint_and_exit_.value()) {
257  // Record phase and actual elapsed time for determining following phase
258  restart_phase = current_phase;
259  wallclock_hours_at_checkpoint = elapsed_hours;
260  return std::make_pair(Metavariables::Phase::WriteCheckpoint,
261  ArbitrationStrategy::RunPhaseImmediately);
262  }
263  }
264  return std::nullopt;
265  }
266 
// Serialization hook: packs/unpacks the configured wallclock limit through
// the PUP::er `p`.
// NOTE(review): one body line preceding the member serialization is not
// visible in this listing; presumably it forwards to the base-class pup --
// confirm.
267  void pup(PUP::er& p) noexcept override {
269  p | wallclock_hours_for_checkpoint_and_exit_;
270  }
271 
272  private:
273  std::optional<double> wallclock_hours_for_checkpoint_and_exit_ = std::nullopt;
274 };
275 } // namespace PhaseControl
276 
277 /// \cond
// Out-of-class definition of the static member `my_PUP_ID` for every
// instantiation of the class template, initialized to 0.
// NOTE(review): the member is presumably declared by the
// WRAPPED_PUPable_decl_template macro inside the class -- confirm.
278 template <typename Metavariables, typename PhaseChangeRegistrars>
279 PUP::able::PUP_ID PhaseControl::CheckpointAndExitAfterWallclock<
280  Metavariables, PhaseChangeRegistrars>::my_PUP_ID = 0;
281 /// \endcond
PhaseControl
Contains utilities for determining control-flow among phases.
Definition: ExecutePhaseChange.hpp:17
Tags::WallclockHoursAtCheckpoint
Storage in the phase change decision tuple so that the Main chare can record the elapsed wallclock ti...
Definition: CheckpointAndExitAfterWallclock.hpp:70
CharmPupable.hpp
Main.hpp
utility
Parallel::GlobalCache
Definition: ElementReceiveInterpPoints.hpp:15
PARSE_ERROR
#define PARSE_ERROR(context, m)
Definition: Options.hpp:71
std::pair
GlobalCache.hpp
Options.hpp
Error.hpp
Tags::CheckpointAndExitRequested
Stores whether the checkpoint and exit has been requested.
Definition: CheckpointAndExitAfterWallclock.hpp:90
ParallelInfo.hpp
Tags::RestartPhase::combine_method
Definition: CheckpointAndExitAfterWallclock.hpp:51
Options::Context
Definition: Options.hpp:41
ERROR
#define ERROR(m)
prints an error message to the standard error stream and aborts the program.
Definition: Error.hpp:37
WRAPPED_PUPable_decl_template
#define WRAPPED_PUPable_decl_template(className)
Mark derived classes as serializable.
Definition: CharmPupable.hpp:22
CheckpointAndExitAfterWallclock::WallclockHours
Definition: CheckpointAndExitAfterWallclock.hpp:155
tuples::TaggedTuple
An associative container that is indexed by structs.
Definition: TaggedTuple.hpp:271
Tags::WallclockHoursAtCheckpoint::combine_method
Definition: CheckpointAndExitAfterWallclock.hpp:73
Tags::RestartPhase
Storage in the phase change decision tuple so that the Main chare can record the phase to go to when ...
Definition: CheckpointAndExitAfterWallclock.hpp:48
Options::Auto
A class indicating that a parsed value can be automatically computed instead of specified.
Definition: Auto.hpp:36
ASSERT
#define ASSERT(a, m)
Assert that an expression should be true.
Definition: Assert.hpp:49
sys::wall_time
double wall_time()
The elapsed wall time in seconds.
Definition: ParallelInfo.hpp:94
ActionTesting::cache
Parallel::GlobalCache< Metavariables > & cache(MockRuntimeSystem< Metavariables > &runner, const ArrayIndex &array_index) noexcept
Returns the GlobalCache of Component with index array_index.
Definition: MockRuntimeSystemFreeFunctions.hpp:382
Parallel::Algorithms::Array
A struct that stores the charm++ types relevant for a particular array component.
Definition: AlgorithmArrayDeclarations.hpp:31
PhaseChange
PhaseChange objects determine the storage types and logic for moving between phases based on runtime ...
Definition: PhaseChange.hpp:141
Options::String
const char *const String
The string used in option structs.
Definition: Options.hpp:32
optional
CheckpointAndExitAfterWallclock
Phase control object that runs the WriteCheckpoint and Exit phases after a specified amount of wallcl...
Definition: CheckpointAndExitAfterWallclock.hpp:130
funcl::Or
Functional for computing or of two objects.
Definition: Functional.hpp:240
type_traits
TMPL.hpp
gsl::not_null
Require a pointer to not be a nullptr
Definition: ReadSpecPiecewisePolynomial.hpp:13
string