Line data Source code
1 0 : // Distributed under the MIT License. 2 : // See LICENSE.txt for details. 3 : 4 : #pragma once 5 : 6 : #include <limits> 7 : #include <optional> 8 : #include <pup.h> 9 : #include <string> 10 : #include <type_traits> 11 : #include <utility> 12 : 13 : #include "DataStructures/TaggedTuple.hpp" 14 : #include "Options/Auto.hpp" 15 : #include "Options/Options.hpp" 16 : #include "Parallel/AlgorithmMetafunctions.hpp" 17 : #include "Parallel/ExitCode.hpp" 18 : #include "Parallel/GlobalCache.hpp" 19 : #include "Parallel/Phase.hpp" 20 : #include "Parallel/PhaseControl/ContributeToPhaseChangeReduction.hpp" 21 : #include "Parallel/PhaseControl/PhaseChange.hpp" 22 : #include "Utilities/ErrorHandling/Assert.hpp" 23 : #include "Utilities/Functional.hpp" 24 : #include "Utilities/Serialization/CharmPupable.hpp" 25 : #include "Utilities/System/ParallelInfo.hpp" 26 : #include "Utilities/TMPL.hpp" 27 : 28 1 : namespace PhaseControl { 29 : 30 0 : namespace Tags { 31 : /// Storage in the phase change decision tuple so that the Main chare can record 32 : /// the phase to go to when restarting the run from a checkpoint file. 33 : /// 34 : /// \note This tag is not intended to participate in any of the reduction 35 : /// procedures, so will error if the combine method is called. 36 1 : struct RestartPhase { 37 0 : using type = std::optional<Parallel::Phase>; 38 : 39 0 : struct combine_method { 40 0 : [[noreturn]] std::optional<Parallel::Phase> operator()( 41 : const std::optional<Parallel::Phase> /*first_phase*/, 42 : const std::optional<Parallel::Phase>& /*second_phase*/); 43 : }; 44 : 45 0 : using main_combine_method = combine_method; 46 : }; 47 : 48 : /// Stores whether the checkpoint and exit has been requested. 49 : /// 50 : /// Combinations are performed via `funcl::Or`, as the phase in question should 51 : /// be chosen if any component requests the jump. 52 1 : struct CheckpointAndExitRequested { 53 0 : using type = bool; 54 : 55 0 : using combine_method = funcl::Or<>; 56 0 : using main_combine_method = funcl::Or<>; 57 : }; 58 : 59 : } // namespace Tags 60 : 61 : /*! 62 : * \brief Phase control object that runs the WriteCheckpoint and Exit phases 63 : * after a specified amount of wallclock time has elapsed. 64 : * 65 : * When the executable exits from here, it does so with 66 : * `Parallel::ExitCode::ContinueFromCheckpoint`. 67 : * 68 : * This phase control is useful for running SpECTRE executables performing 69 : * lengthy computations that may exceed a supercomputer's wallclock limits. 70 : * Writing a single checkpoint at the end of the job's allocated time allows 71 : * the computation to be continued, while minimizing the disc space taken up by 72 : * checkpoint files. 73 : * 74 : * Note that this phase control is not a trigger on wallclock time. Rather, 75 : * it checks the elapsed wallclock time when called, likely from a global sync 76 : * point triggered by some other mechanism, e.g., at some slab boundary. 77 : * Therefore, the WriteCheckpoint and Exit phases will run the first time 78 : * this phase control is called after the specified wallclock time has been 79 : * reached. 80 : * 81 : * \warning the global sync points _must_ be triggered often enough to ensure 82 : * there will be at least one sync point (i.e., one call to this phase control) 83 : * in the window between the requested checkpoint-and-exit time and the time at 84 : * which the batch system will kill the executable. To make this more concrete, 85 : * consider this example: when running on a 12-hour queue with a 86 : * checkpoint-and-exit requested after 11.5 hours, there is a 0.5-hour window 87 : * for a global sync to occur, the checkpoint files to be written to disc, and 88 : * the executable to clean up. In this case, triggering a global sync every 89 : * 2-10 minutes might be desirable. Matching the global sync frequency with the 90 : * time window for checkpoint and exit is the responsibility of the user! 91 : * 92 : * \parblock 93 : * \warning If modifying the phase-change logic on a 94 : * checkpoint-restart, this PhaseChange must remain in the list after 95 : * modification so that the end of the restart logic will run. The 96 : * WallclockHours can be changed to None to disable further restarts. 97 : * \endparblock 98 : */ 99 1 : struct CheckpointAndExitAfterWallclock : public PhaseChange { 100 0 : CheckpointAndExitAfterWallclock(const std::optional<double> wallclock_hours, 101 : const Options::Context& context = {}); 102 : 103 0 : explicit CheckpointAndExitAfterWallclock(CkMigrateMessage* msg); 104 : 105 : /// \cond 106 : CheckpointAndExitAfterWallclock() = default; 107 : using PUP::able::register_constructor; 108 : WRAPPED_PUPable_decl_template(CheckpointAndExitAfterWallclock); // NOLINT 109 : /// \endcond 110 : 111 0 : struct WallclockHours { 112 0 : using type = Options::Auto<double, Options::AutoLabel::None>; 113 0 : static constexpr Options::String help = { 114 : "Time in hours after which to write the checkpoint and exit. " 115 : "If 'None' is specified, no action will be taken."}; 116 : }; 117 : 118 0 : using options = tmpl::list<WallclockHours>; 119 0 : static constexpr Options::String help{ 120 : "Once the wallclock time has exceeded the specified amount, trigger " 121 : "writing a checkpoint and then exit with the 'ContinueFromCheckpoint' " 122 : "exit code."}; 123 : 124 0 : using argument_tags = tmpl::list<>; 125 0 : using return_tags = tmpl::list<>; 126 : 127 0 : using phase_change_tags_and_combines = 128 : tmpl::list<Tags::RestartPhase, Tags::CheckpointAndExitRequested>; 129 : 130 : template <typename Metavariables> 131 0 : using participating_components = typename Metavariables::component_list; 132 : 133 : template <typename... DecisionTags> 134 0 : void initialize_phase_data_impl( 135 : const gsl::not_null<tuples::TaggedTuple<DecisionTags...>*> 136 : phase_change_decision_data) const; 137 : 138 : template <typename ParallelComponent, typename ArrayIndex, 139 : typename Metavariables> 140 0 : void contribute_phase_data_impl(Parallel::GlobalCache<Metavariables>& cache, 141 : const ArrayIndex& array_index) const; 142 : 143 : template <typename... DecisionTags, typename Metavariables> 144 : typename std::optional<std::pair<Parallel::Phase, ArbitrationStrategy>> 145 0 : arbitrate_phase_change_impl( 146 : const gsl::not_null<tuples::TaggedTuple<DecisionTags...>*> 147 : phase_change_decision_data, 148 : const Parallel::Phase current_phase, 149 : const Parallel::GlobalCache<Metavariables>& /*cache*/) const; 150 : 151 0 : void pup(PUP::er& p) override; 152 : 153 : private: 154 0 : std::optional<double> wallclock_hours_for_checkpoint_and_exit_ = std::nullopt; 155 : // This flag is set during arbitration when the class decides to 156 : // halt the run. As it is not checkpointed, this distinguishes the 157 : // state immediately after writing the checkpoint from that 158 : // immediately after reading it during the restart. 159 : // 160 : // Phase arbitration is only run from Main, so there are no 161 : // threading issues here. 162 : // NOLINTNEXTLINE(spectre-mutable) 163 0 : mutable bool halting_ = false; 164 : }; 165 : 166 : template <typename... DecisionTags> 167 : void CheckpointAndExitAfterWallclock::initialize_phase_data_impl( 168 : const gsl::not_null<tuples::TaggedTuple<DecisionTags...>*> 169 : phase_change_decision_data) const { 170 : tuples::get<Tags::RestartPhase>(*phase_change_decision_data) = std::nullopt; 171 : tuples::get<Tags::CheckpointAndExitRequested>(*phase_change_decision_data) = 172 : false; 173 : } 174 : 175 : template <typename ParallelComponent, typename ArrayIndex, 176 : typename Metavariables> 177 : void CheckpointAndExitAfterWallclock::contribute_phase_data_impl( 178 : Parallel::GlobalCache<Metavariables>& cache, 179 : const ArrayIndex& array_index) const { 180 : if constexpr (std::is_same_v<typename ParallelComponent::chare_type, 181 : Parallel::Algorithms::Array>) { 182 : Parallel::contribute_to_phase_change_reduction<ParallelComponent>( 183 : tuples::TaggedTuple<Tags::CheckpointAndExitRequested>{true}, cache, 184 : array_index); 185 : } else { 186 : Parallel::contribute_to_phase_change_reduction<ParallelComponent>( 187 : tuples::TaggedTuple<Tags::CheckpointAndExitRequested>{true}, cache); 188 : } 189 : } 190 : 191 : template <typename... DecisionTags, typename Metavariables> 192 : typename std::optional<std::pair<Parallel::Phase, ArbitrationStrategy>> 193 : CheckpointAndExitAfterWallclock::arbitrate_phase_change_impl( 194 : const gsl::not_null<tuples::TaggedTuple<DecisionTags...>*> 195 : phase_change_decision_data, 196 : const Parallel::Phase current_phase, 197 : const Parallel::GlobalCache<Metavariables>& /*cache*/) const { 198 : const double elapsed_hours = sys::wall_time() / 3600.0; 199 : 200 : auto& restart_phase = 201 : tuples::get<Tags::RestartPhase>(*phase_change_decision_data); 202 : auto& exit_code = 203 : tuples::get<Parallel::Tags::ExitCode>(*phase_change_decision_data); 204 : if (restart_phase.has_value()) { 205 : // This `if` branch, where restart_phase has a value, is the 206 : // post-checkpoint call to arbitrate_phase_change. 207 : if (halting_) { 208 : // Preserve restart_phase for use after restarting from the checkpoint 209 : exit_code = Parallel::ExitCode::ContinueFromCheckpoint; 210 : return std::make_pair(Parallel::Phase::Exit, 211 : ArbitrationStrategy::RunPhaseImmediately); 212 : } else { 213 : // Reset restart_phase until it is needed for the next checkpoint 214 : const auto result = restart_phase; 215 : restart_phase.reset(); 216 : return std::make_pair(result.value(), 217 : ArbitrationStrategy::PermitAdditionalJumps); 218 : } 219 : } 220 : 221 : auto& checkpoint_and_exit_requested = 222 : tuples::get<Tags::CheckpointAndExitRequested>( 223 : *phase_change_decision_data); 224 : if (checkpoint_and_exit_requested) { 225 : checkpoint_and_exit_requested = false; 226 : if (elapsed_hours >= wallclock_hours_for_checkpoint_and_exit_.value_or( 227 : std::numeric_limits<double>::infinity())) { 228 : // Record phase and actual elapsed time for determining following phase 229 : restart_phase = current_phase; 230 : ASSERT(not halting_, "Halting for checkpoint recursively"); 231 : halting_ = true; 232 : return std::make_pair(Parallel::Phase::WriteCheckpoint, 233 : ArbitrationStrategy::RunPhaseImmediately); 234 : } 235 : } 236 : return std::nullopt; 237 : } 238 : } // namespace PhaseControl