SpECTRE Documentation Coverage Report
Current view: top level - Parallel/PhaseControl - CheckpointAndExitAfterWallclock.hpp Hit Total Coverage
Commit: eb45036e71ee786d31156fb02c6e736b9a032426 Lines: 5 34 14.7 %
Date: 2024-04-18 22:31:00
Legend: Lines: hit not hit

          Line data    Source code
       1           0 : // Distributed under the MIT License.
       2             : // See LICENSE.txt for details.
       3             : 
       4             : #pragma once
       5             : 
       6             : #include <optional>
       7             : #include <pup.h>
       8             : #include <string>
       9             : #include <type_traits>
      10             : #include <utility>
      11             : 
      12             : #include "Options/Auto.hpp"
      13             : #include "Options/Options.hpp"
      14             : #include "Parallel/AlgorithmMetafunctions.hpp"
      15             : #include "Parallel/ExitCode.hpp"
      16             : #include "Parallel/GlobalCache.hpp"
      17             : #include "Parallel/Phase.hpp"
      18             : #include "Parallel/PhaseControl/ContributeToPhaseChangeReduction.hpp"
      19             : #include "Parallel/PhaseControl/PhaseChange.hpp"
      20             : #include "Utilities/ErrorHandling/Assert.hpp"
      21             : #include "Utilities/Functional.hpp"
      22             : #include "Utilities/Serialization/CharmPupable.hpp"
      23             : #include "Utilities/System/ParallelInfo.hpp"
      24             : #include "Utilities/TMPL.hpp"
      25             : #include "Utilities/TaggedTuple.hpp"
      26             : 
      27           1 : namespace PhaseControl {
      28             : 
      29           0 : namespace Tags {
      30             : /// Storage in the phase change decision tuple so that the Main chare can record
      31             : /// the phase to go to when restarting the run from a checkpoint file.
      32             : ///
      33             : /// \note This tag is not intended to participate in any of the reduction
      34             : /// procedures, so it will error if the combine method is called.
      35           1 : struct RestartPhase {
      36           0 :   using type = std::optional<Parallel::Phase>;
      37             : 
      38           0 :   struct combine_method {
      39           0 :     [[noreturn]] std::optional<Parallel::Phase> operator()(
      40             :         const std::optional<Parallel::Phase> /*first_phase*/,
      41             :         const std::optional<Parallel::Phase>& /*second_phase*/);
      42             :   };
      43             : 
      44           0 :   using main_combine_method = combine_method;
      45             : };
      46             : 
      47             : /// Storage in the phase change decision tuple so the Main chare can record the
      48             : /// wallclock hours elapsed since the start of the run when it checkpoints.
      49             : ///
      50             : /// \note This tag is not intended to participate in any of the reduction
      51             : /// procedures, so it will error if the combine method is called.
      52           1 : struct WallclockHoursAtCheckpoint {
      53           0 :   using type = std::optional<double>;
      54             : 
      55           0 :   struct combine_method {
      56           0 :     [[noreturn]] std::optional<double> operator()(
      57             :         const std::optional<double> /*first_time*/,
      58             :         const std::optional<double>& /*second_time*/);
      59             :   };
      60           0 :   using main_combine_method = combine_method;
      61             : };
      62             : 
      63             : /// Stores whether the checkpoint and exit has been requested.
      64             : ///
      65             : /// Combinations are performed via `funcl::Or`, as the phase in question should
      66             : /// be chosen if any component requests the jump.
      67           1 : struct CheckpointAndExitRequested {
      68           0 :   using type = bool;
      69             : 
      70           0 :   using combine_method = funcl::Or<>;
      71           0 :   using main_combine_method = funcl::Or<>;
      72             : };
      73             : 
      74             : }  // namespace Tags
      75             : 
      76             : /*!
      77             :  * \brief Phase control object that runs the WriteCheckpoint and Exit phases
      78             :  * after a specified amount of wallclock time has elapsed.
      79             :  *
      80             :  * When the executable exits from here, it does so with
      81             :  * `Parallel::ExitCode::ContinueFromCheckpoint`.
      82             :  *
      83             :  * This phase control is useful for running SpECTRE executables performing
      84             :  * lengthy computations that may exceed a supercomputer's wallclock limits.
      85             :  * Writing a single checkpoint at the end of the job's allocated time allows
      86             :  * the computation to be continued, while minimizing the disc space taken up by
      87             :  * checkpoint files.
      88             :  *
      89             :  * Note that this phase control is not a trigger on wallclock time. Rather,
      90             :  * it checks the elapsed wallclock time when called, likely from a global sync
      91             :  * point triggered by some other mechanism, e.g., at some slab boundary.
      92             :  * Therefore, the WriteCheckpoint and Exit phases will run the first time
      93             :  * this phase control is called after the specified wallclock time has been
      94             :  * reached.
      95             :  *
      96             :  * \warning the global sync points _must_ be triggered often enough to ensure
      97             :  * there will be at least one sync point (i.e., one call to this phase control)
      98             :  * in the window between the requested checkpoint-and-exit time and the time at
      99             :  * which the batch system will kill the executable. To make this more concrete,
     100             :  * consider this example: when running on a 12-hour queue with a
     101             :  * checkpoint-and-exit requested after 11.5 hours, there is a 0.5-hour window
     102             :  * for a global sync to occur, the checkpoint files to be written to disc, and
     103             :  * the executable to clean up. In this case, triggering a global sync every
     104             :  * 2-10 minutes might be desirable. Matching the global sync frequency with the
     105             :  * time window for checkpoint and exit is the responsibility of the user!
     106             :  */
     107           1 : struct CheckpointAndExitAfterWallclock : public PhaseChange {
     108           0 :   CheckpointAndExitAfterWallclock(const std::optional<double> wallclock_hours,
     109             :                                   const Options::Context& context = {});
     110             : 
     111           0 :   explicit CheckpointAndExitAfterWallclock(CkMigrateMessage* msg);
     112             : 
     113             :   /// \cond
     114             :   CheckpointAndExitAfterWallclock() = default;
     115             :   using PUP::able::register_constructor;
     116             :   WRAPPED_PUPable_decl_template(CheckpointAndExitAfterWallclock);  // NOLINT
     117             :   /// \endcond
     118             : 
     119           0 :   struct WallclockHours {
     120           0 :     using type = Options::Auto<double, Options::AutoLabel::None>;
     121           0 :     static constexpr Options::String help = {
     122             :         "Time in hours after which to write the checkpoint and exit. "
     123             :         "If 'None' is specified, no action will be taken."};
     124             :   };
     125             : 
     126           0 :   using options = tmpl::list<WallclockHours>;
     127           0 :   static constexpr Options::String help{
     128             :       "Once the wallclock time has exceeded the specified amount, trigger "
     129             :       "writing a checkpoint and then exit with the 'ContinueFromCheckpoint' "
     130             :       "exit code."};
     131             : 
     132           0 :   using argument_tags = tmpl::list<>;
     133           0 :   using return_tags = tmpl::list<>;
     134             : 
     135           0 :   using phase_change_tags_and_combines =
     136             :       tmpl::list<Tags::RestartPhase, Tags::WallclockHoursAtCheckpoint,
     137             :                  Tags::CheckpointAndExitRequested>;
     138             : 
     139             :   template <typename Metavariables>
     140           0 :   using participating_components = typename Metavariables::component_list;
     141             : 
     142             :   template <typename... DecisionTags>
     143           0 :   void initialize_phase_data_impl(
     144             :       const gsl::not_null<tuples::TaggedTuple<DecisionTags...>*>
     145             :           phase_change_decision_data) const;
     146             : 
     147             :   template <typename ParallelComponent, typename ArrayIndex,
     148             :             typename Metavariables>
     149           0 :   void contribute_phase_data_impl(Parallel::GlobalCache<Metavariables>& cache,
     150             :                                   const ArrayIndex& array_index) const;
     151             : 
     152             :   template <typename... DecisionTags, typename Metavariables>
     153             :   typename std::optional<std::pair<Parallel::Phase, ArbitrationStrategy>>
     154           0 :   arbitrate_phase_change_impl(
     155             :       const gsl::not_null<tuples::TaggedTuple<DecisionTags...>*>
     156             :           phase_change_decision_data,
     157             :       const Parallel::Phase current_phase,
     158             :       const Parallel::GlobalCache<Metavariables>& /*cache*/) const;
     159             : 
     160           0 :   void pup(PUP::er& p) override;
     161             : 
     162             :  private:
     163           0 :   std::optional<double> wallclock_hours_for_checkpoint_and_exit_ = std::nullopt;
     164             : };
     165             : 
     166             : template <typename... DecisionTags>
     167             : void CheckpointAndExitAfterWallclock::initialize_phase_data_impl(
     168             :     const gsl::not_null<tuples::TaggedTuple<DecisionTags...>*>
     169             :         phase_change_decision_data) const {
     170             :   tuples::get<Tags::RestartPhase>(*phase_change_decision_data) = std::nullopt;
     171             :   tuples::get<Tags::WallclockHoursAtCheckpoint>(*phase_change_decision_data) =
     172             :       std::nullopt;
     173             :   tuples::get<Tags::CheckpointAndExitRequested>(*phase_change_decision_data) =
     174             :       false;
     175             : }
     176             : 
     177             : template <typename ParallelComponent, typename ArrayIndex,
     178             :           typename Metavariables>
     179             : void CheckpointAndExitAfterWallclock::contribute_phase_data_impl(
     180             :     Parallel::GlobalCache<Metavariables>& cache,
     181             :     const ArrayIndex& array_index) const {
     182             :   if constexpr (std::is_same_v<typename ParallelComponent::chare_type,
     183             :                                Parallel::Algorithms::Array>) {
     184             :     Parallel::contribute_to_phase_change_reduction<ParallelComponent>(
     185             :         tuples::TaggedTuple<Tags::CheckpointAndExitRequested>{true}, cache,
     186             :         array_index);
     187             :   } else {
     188             :     Parallel::contribute_to_phase_change_reduction<ParallelComponent>(
     189             :         tuples::TaggedTuple<Tags::CheckpointAndExitRequested>{true}, cache);
     190             :   }
     191             : }
     192             : 
     193             : template <typename... DecisionTags, typename Metavariables>
     194             : typename std::optional<std::pair<Parallel::Phase, ArbitrationStrategy>>
     195             : CheckpointAndExitAfterWallclock::arbitrate_phase_change_impl(
     196             :     const gsl::not_null<tuples::TaggedTuple<DecisionTags...>*>
     197             :         phase_change_decision_data,
     198             :     const Parallel::Phase current_phase,
     199             :     const Parallel::GlobalCache<Metavariables>& /*cache*/) const {
     200             :   // If no checkpoint-and-exit time given, then do nothing
     201             :   if (not wallclock_hours_for_checkpoint_and_exit_.has_value()) {
     202             :     return std::nullopt;
     203             :   }
     204             : 
     205             :   const double elapsed_hours = sys::wall_time() / 3600.0;
     206             : 
     207             :   auto& restart_phase =
     208             :       tuples::get<Tags::RestartPhase>(*phase_change_decision_data);
     209             :   auto& wallclock_hours_at_checkpoint =
     210             :       tuples::get<Tags::WallclockHoursAtCheckpoint>(
     211             :           *phase_change_decision_data);
     212             :   auto& exit_code =
     213             :       tuples::get<Parallel::Tags::ExitCode>(*phase_change_decision_data);
     214             :   if (restart_phase.has_value()) {
     215             :     ASSERT(wallclock_hours_at_checkpoint.has_value(),
     216             :            "Consistency error: Should have recorded the Wallclock time "
     217             :            "while recording a phase to restart from.");
     218             :     // This `if` branch, where restart_phase has a value, is the
     219             :     // post-checkpoint call to arbitrate_phase_change. Depending on the time
     220             :     // elapsed so far in this run, the next phase is...
     221             :     // - Exit, if the time is at least the time recorded at the checkpoint
     222             :     // - restart_phase, if the time is less than the recorded checkpoint time
     223             :     if (elapsed_hours >= wallclock_hours_at_checkpoint.value()) {
     224             :       // Preserve restart_phase for use after restarting from the checkpoint
     225             :       exit_code = Parallel::ExitCode::ContinueFromCheckpoint;
     226             :       return std::make_pair(Parallel::Phase::Exit,
     227             :                             ArbitrationStrategy::RunPhaseImmediately);
     228             :     } else {
     229             :       // Reset restart_phase until it is needed for the next checkpoint
     230             :       const auto result = restart_phase;
     231             :       restart_phase.reset();
     232             :       wallclock_hours_at_checkpoint.reset();
     233             :       return std::make_pair(result.value(),
     234             :                             ArbitrationStrategy::PermitAdditionalJumps);
     235             :     }
     236             :   }
     237             : 
     238             :   auto& checkpoint_and_exit_requested =
     239             :       tuples::get<Tags::CheckpointAndExitRequested>(
     240             :           *phase_change_decision_data);
     241             :   if (checkpoint_and_exit_requested) {
     242             :     checkpoint_and_exit_requested = false;
     243             :     // We checked wallclock_hours_for_checkpoint_and_exit_ has value above
     244             :     if (elapsed_hours >= wallclock_hours_for_checkpoint_and_exit_.value()) {
     245             :       // Record phase and actual elapsed time for determining following phase
     246             :       restart_phase = current_phase;
     247             :       wallclock_hours_at_checkpoint = elapsed_hours;
     248             :       return std::make_pair(Parallel::Phase::WriteCheckpoint,
     249             :                             ArbitrationStrategy::RunPhaseImmediately);
     250             :     }
     251             :   }
     252             :   return std::nullopt;
     253             : }
     254             : }  // namespace PhaseControl

Generated by: LCOV version 1.14