SpECTRE Documentation Coverage Report
Current view: top level - Parallel/PhaseControl - CheckpointAndExitAfterWallclock.hpp Hit Total Coverage
Commit: a18e59fda1a195609825c55450f7d61ad20a91a4 Lines: 4 30 13.3 %
Date: 2026-06-11 22:10:41
Legend: Lines: hit not hit

          Line data    Source code
       1           0 : // Distributed under the MIT License.
       2             : // See LICENSE.txt for details.
       3             : 
       4             : #pragma once
       5             : 
       6             : #include <limits>
       7             : #include <optional>
       8             : #include <pup.h>
       9             : #include <string>
      10             : #include <type_traits>
      11             : #include <utility>
      12             : 
      13             : #include "DataStructures/TaggedTuple.hpp"
      14             : #include "Options/Auto.hpp"
      15             : #include "Options/Options.hpp"
      16             : #include "Parallel/AlgorithmMetafunctions.hpp"
      17             : #include "Parallel/ExitCode.hpp"
      18             : #include "Parallel/GlobalCache.hpp"
      19             : #include "Parallel/Phase.hpp"
      20             : #include "Parallel/PhaseControl/ContributeToPhaseChangeReduction.hpp"
      21             : #include "Parallel/PhaseControl/PhaseChange.hpp"
      22             : #include "Utilities/ErrorHandling/Assert.hpp"
      23             : #include "Utilities/Functional.hpp"
      24             : #include "Utilities/Serialization/CharmPupable.hpp"
      25             : #include "Utilities/System/ParallelInfo.hpp"
      26             : #include "Utilities/TMPL.hpp"
      27             : 
      28           1 : namespace PhaseControl {
      29             : 
      30           0 : namespace Tags {
      31             : /// Storage in the phase change decision tuple so that the Main chare can record
      32             : /// the phase to go to when restarting the run from a checkpoint file.
      33             : ///
      34             : /// \note This tag is not intended to participate in any of the reduction
      35             : /// procedures, so will error if the combine method is called.
      36           1 : struct RestartPhase {
                         /// Value held in the decision tuple: a phase when one has been recorded,
                         /// empty otherwise.
      37           0 :   using type = std::optional<Parallel::Phase>;
      38             : 
                         /// Combine functor for this tag. Its call operator is only declared in
                         /// this header and is marked `[[noreturn]]`, i.e. it never returns
                         /// normally to its caller.
      39           0 :   struct combine_method {
      40           0 :     [[noreturn]] std::optional<Parallel::Phase> operator()(
      41             :         const std::optional<Parallel::Phase> /*first_phase*/,
      42             :         const std::optional<Parallel::Phase>& /*second_phase*/);
      43             :   };
      44             : 
                         /// The same functor type is used as the main combine method.
      45           0 :   using main_combine_method = combine_method;
      46             : };
      47             : 
      48             : /// Stores whether the checkpoint and exit has been requested.
      49             : ///
      50             : /// Combinations are performed via `funcl::Or`, as the phase in question should
      51             : /// be chosen if any component requests the jump.
      52           1 : struct CheckpointAndExitRequested {
                         /// `true` once a checkpoint-and-exit request has been contributed.
      53           0 :   using type = bool;
      54             : 
                         /// Contributions are merged with `funcl::Or<>`.
      55           0 :   using combine_method = funcl::Or<>;
                         /// The main combine method is the same `funcl::Or<>` functor type.
      56           0 :   using main_combine_method = funcl::Or<>;
      57             : };
      58             : 
      59             : }  // namespace Tags
      60             : 
      61             : /*!
      62             :  * \brief Phase control object that runs the WriteCheckpoint and Exit phases
      63             :  * after a specified amount of wallclock time has elapsed.
      64             :  *
      65             :  * When the executable exits from here, it does so with
      66             :  * `Parallel::ExitCode::ContinueFromCheckpoint`.
      67             :  *
      68             :  * This phase control is useful for running SpECTRE executables performing
      69             :  * lengthy computations that may exceed a supercomputer's wallclock limits.
      70             :  * Writing a single checkpoint at the end of the job's allocated time allows
      71             :  * the computation to be continued, while minimizing the disc space taken up by
      72             :  * checkpoint files.
      73             :  *
      74             :  * Note that this phase control is not a trigger on wallclock time. Rather,
      75             :  * it checks the elapsed wallclock time when called, likely from a global sync
      76             :  * point triggered by some other mechanism, e.g., at some slab boundary.
      77             :  * Therefore, the WriteCheckpoint and Exit phases will run the first time
      78             :  * this phase control is called after the specified wallclock time has been
      79             :  * reached.
      80             :  *
      81             :  * \warning the global sync points _must_ be triggered often enough to ensure
      82             :  * there will be at least one sync point (i.e., one call to this phase control)
      83             :  * in the window between the requested checkpoint-and-exit time and the time at
      84             :  * which the batch system will kill the executable. To make this more concrete,
      85             :  * consider this example: when running on a 12-hour queue with a
      86             :  * checkpoint-and-exit requested after 11.5 hours, there is a 0.5-hour window
      87             :  * for a global sync to occur, the checkpoint files to be written to disc, and
      88             :  * the executable to clean up. In this case, triggering a global sync every
      89             :  * 2-10 minutes might be desirable. Matching the global sync frequency with the
      90             :  * time window for checkpoint and exit is the responsibility of the user!
      91             :  *
      92             :  * \parblock
      93             :  * \warning If modifying the phase-change logic on a
      94             :  * checkpoint-restart, this PhaseChange must remain in the list after
      95             :  * modification so that the end of the restart logic will run.  The
      96             :  * WallclockHours can be changed to None to disable further restarts.
      97             :  * \endparblock
      98             :  */
      99           1 : struct CheckpointAndExitAfterWallclock : public PhaseChange {
                         /// Construct from the wallclock limit in hours; per the `WallclockHours`
                         /// help text, no value means no action is taken. The definition is not
                         /// in this header. NOTE(review): `context` is presumably used when
                         /// reporting option-parsing errors -- confirm in the source file.
     100           0 :   CheckpointAndExitAfterWallclock(const std::optional<double> wallclock_hours,
     101             :                                   const Options::Context& context = {});
     102             : 
                         /// Constructor taking a `CkMigrateMessage*` (defined outside this
                         /// header). NOTE(review): presumably the Charm++ migration/unpacking
                         /// constructor -- confirm.
     103           0 :   explicit CheckpointAndExitAfterWallclock(CkMigrateMessage* msg);
     104             : 
     105             :   /// \cond
     106             :   CheckpointAndExitAfterWallclock() = default;
     107             :   using PUP::able::register_constructor;
     108             :   WRAPPED_PUPable_decl_template(CheckpointAndExitAfterWallclock);  // NOLINT
     109             :   /// \endcond
     110             : 
                         /// Option tag for the wallclock limit: a `double` number of hours, or
                         /// the `None` label of `Options::Auto`.
     111           0 :   struct WallclockHours {
     112           0 :     using type = Options::Auto<double, Options::AutoLabel::None>;
     113           0 :     static constexpr Options::String help = {
     114             :         "Time in hours after which to write the checkpoint and exit. "
     115             :         "If 'None' is specified, no action will be taken."};
     116             :   };
     117             : 
                         /// The list of option tags for this phase change.
     118           0 :   using options = tmpl::list<WallclockHours>;
                         /// Help text describing this phase change.
     119           0 :   static constexpr Options::String help{
     120             :       "Once the wallclock time has exceeded the specified amount, trigger "
     121             :       "writing a checkpoint and then exit with the 'ContinueFromCheckpoint' "
     122             :       "exit code."};
     123             : 
                         /// Empty tag list: no argument tags are requested.
     124           0 :   using argument_tags = tmpl::list<>;
                         /// Empty tag list: no return tags are requested.
     125           0 :   using return_tags = tmpl::list<>;
     126             : 
                         /// The decision-tuple tags owned by this phase change.
     127           0 :   using phase_change_tags_and_combines =
     128             :       tmpl::list<Tags::RestartPhase, Tags::CheckpointAndExitRequested>;
     129             : 
                         /// Every component in `Metavariables::component_list` participates.
     130             :   template <typename Metavariables>
     131           0 :   using participating_components = typename Metavariables::component_list;
     132             : 
                         /// Sets `Tags::RestartPhase` to `std::nullopt` and
                         /// `Tags::CheckpointAndExitRequested` to `false` in the decision tuple.
     133             :   template <typename... DecisionTags>
     134           0 :   void initialize_phase_data_impl(
     135             :       const gsl::not_null<tuples::TaggedTuple<DecisionTags...>*>
     136             :           phase_change_decision_data) const;
     137             : 
                         /// Contributes `Tags::CheckpointAndExitRequested = true` to the
                         /// phase-change reduction. `array_index` is forwarded only when the
                         /// component's `chare_type` is `Parallel::Algorithms::Array`.
     138             :   template <typename ParallelComponent, typename ArrayIndex,
     139             :             typename Metavariables>
     140           0 :   void contribute_phase_data_impl(Parallel::GlobalCache<Metavariables>& cache,
     141             :                                   const ArrayIndex& array_index) const;
     142             : 
                         /// Chooses the next phase from the decision tuple and the elapsed
                         /// wallclock time: `WriteCheckpoint` once the limit has been reached and
                         /// a request is pending, then `Exit` (or the stored phase when running
                         /// after a restart). Returns `std::nullopt` when no change is requested.
     143             :   template <typename... DecisionTags, typename Metavariables>
     144             :   typename std::optional<std::pair<Parallel::Phase, ArbitrationStrategy>>
     145           0 :   arbitrate_phase_change_impl(
     146             :       const gsl::not_null<tuples::TaggedTuple<DecisionTags...>*>
     147             :           phase_change_decision_data,
     148             :       const Parallel::Phase current_phase,
     149             :       const Parallel::GlobalCache<Metavariables>& /*cache*/) const;
     150             : 
                         /// Serialization hook overriding the base-class `pup`; defined outside
                         /// this header.
     151           0 :   void pup(PUP::er& p) override;
     152             : 
     153             :  private:
                         // Wallclock limit in hours that arbitrate_phase_change_impl compares
                         // against the elapsed time; when empty the limit is never reached.
     154           0 :   std::optional<double> wallclock_hours_for_checkpoint_and_exit_ = std::nullopt;
     155             :   // This flag is set during arbitration when the class decides to
     156             :   // halt the run.  As it is not checkpointed, this distinguishes the
     157             :   // state immediately after writing the checkpoint from that
     158             :   // immediately after reading it during the restart.
     159             :   //
     160             :   // Phase arbitration is only run from Main, so there are no
     161             :   // threading issues here.
     162             :   // NOLINTNEXTLINE(spectre-mutable)
     163           0 :   mutable bool halting_ = false;
     164             : };
     165             : 
     166             : template <typename... DecisionTags>
     167             : void CheckpointAndExitAfterWallclock::initialize_phase_data_impl(
     168             :     const gsl::not_null<tuples::TaggedTuple<DecisionTags...>*>
     169             :         phase_change_decision_data) const {
                         // Put both tags owned by this phase change into their "nothing pending"
                         // state: no stored restart phase and no checkpoint-and-exit request.
     170             :   tuples::get<Tags::RestartPhase>(*phase_change_decision_data) = std::nullopt;
     171             :   tuples::get<Tags::CheckpointAndExitRequested>(*phase_change_decision_data) =
     172             :       false;
     173             : }
     174             : 
     175             : template <typename ParallelComponent, typename ArrayIndex,
     176             :           typename Metavariables>
     177             : void CheckpointAndExitAfterWallclock::contribute_phase_data_impl(
     178             :     Parallel::GlobalCache<Metavariables>& cache,
     179             :     const ArrayIndex& array_index) const {
                         // The contribution is always `true`; the elapsed wallclock time is not
                         // inspected here but later, in arbitrate_phase_change_impl.
                         //
                         // The two branches differ only in whether `array_index` is passed on,
                         // which is done solely for components whose chare type is
                         // Parallel::Algorithms::Array.
     180             :   if constexpr (std::is_same_v<typename ParallelComponent::chare_type,
     181             :                                Parallel::Algorithms::Array>) {
     182             :     Parallel::contribute_to_phase_change_reduction<ParallelComponent>(
     183             :         tuples::TaggedTuple<Tags::CheckpointAndExitRequested>{true}, cache,
     184             :         array_index);
     185             :   } else {
     186             :     Parallel::contribute_to_phase_change_reduction<ParallelComponent>(
     187             :         tuples::TaggedTuple<Tags::CheckpointAndExitRequested>{true}, cache);
     188             :   }
     189             : }
     190             : 
     191             : template <typename... DecisionTags, typename Metavariables>
     192             : typename std::optional<std::pair<Parallel::Phase, ArbitrationStrategy>>
     193             : CheckpointAndExitAfterWallclock::arbitrate_phase_change_impl(
     194             :     const gsl::not_null<tuples::TaggedTuple<DecisionTags...>*>
     195             :         phase_change_decision_data,
     196             :     const Parallel::Phase current_phase,
     197             :     const Parallel::GlobalCache<Metavariables>& /*cache*/) const {
                         // Decision flow:
                         //  1. A stored restart phase means the checkpoint has already been
                         //     requested. If this object set `halting_`, exit with
                         //     ContinueFromCheckpoint; otherwise jump back to the stored phase
                         //     and clear it.
                         //  2. Otherwise, if a request is pending and the elapsed wallclock time
                         //     has reached the limit, store the current phase, set `halting_`,
                         //     and jump to WriteCheckpoint.
                         //  3. Otherwise request no phase change.
                         //
                         // The division by 3600 treats the value of sys::wall_time() as seconds.
                         // NOTE(review): confirm the unit against Utilities/System/ParallelInfo.hpp.
     198             :   const double elapsed_hours = sys::wall_time() / 3600.0;
     199             : 
     200             :   auto& restart_phase =
     201             :       tuples::get<Tags::RestartPhase>(*phase_change_decision_data);
     202             :   auto& exit_code =
     203             :       tuples::get<Parallel::Tags::ExitCode>(*phase_change_decision_data);
     204             :   if (restart_phase.has_value()) {
     205             :     // This `if` branch, where restart_phase has a value, is the
     206             :     // post-checkpoint call to arbitrate_phase_change.
     207             :     if (halting_) {
     208             :       // Preserve restart_phase for use after restarting from the checkpoint
     209             :       exit_code = Parallel::ExitCode::ContinueFromCheckpoint;
     210             :       return std::make_pair(Parallel::Phase::Exit,
     211             :                             ArbitrationStrategy::RunPhaseImmediately);
     212             :     } else {
     213             :       // Reset restart_phase until it is needed for the next checkpoint
                             // The value is copied out first because reset() empties the
                             // optional held in the decision tuple.
     214             :       const auto result = restart_phase;
     215             :       restart_phase.reset();
     216             :       return std::make_pair(result.value(),
     217             :                             ArbitrationStrategy::PermitAdditionalJumps);
     218             :     }
     219             :   }
     220             : 
     221             :   auto& checkpoint_and_exit_requested =
     222             :       tuples::get<Tags::CheckpointAndExitRequested>(
     223             :           *phase_change_decision_data);
     224             :   if (checkpoint_and_exit_requested) {
                           // Consume the request regardless of whether the limit has been reached.
     225             :     checkpoint_and_exit_requested = false;
                           // An unset limit is compared as +infinity, so a finite elapsed time
                           // never satisfies the condition.
     226             :     if (elapsed_hours >= wallclock_hours_for_checkpoint_and_exit_.value_or(
     227             :                              std::numeric_limits<double>::infinity())) {
     228             :       // Record the current phase so the following phase can be determined
     229             :       restart_phase = current_phase;
     230             :       ASSERT(not halting_, "Halting for checkpoint recursively");
     231             :       halting_ = true;
     232             :       return std::make_pair(Parallel::Phase::WriteCheckpoint,
     233             :                             ArbitrationStrategy::RunPhaseImmediately);
     234             :     }
     235             :   }
     236             :   return std::nullopt;
     237             : }
     238             : }  // namespace PhaseControl

Generated by: LCOV version 1.14