Line data Source code
1 0 : // Distributed under the MIT License. 2 : // See LICENSE.txt for details. 3 : 4 : #pragma once 5 : 6 : #include <optional> 7 : #include <pup.h> 8 : #include <string> 9 : #include <type_traits> 10 : #include <utility> 11 : 12 : #include "Options/Auto.hpp" 13 : #include "Options/Options.hpp" 14 : #include "Parallel/AlgorithmMetafunctions.hpp" 15 : #include "Parallel/ExitCode.hpp" 16 : #include "Parallel/GlobalCache.hpp" 17 : #include "Parallel/Phase.hpp" 18 : #include "Parallel/PhaseControl/ContributeToPhaseChangeReduction.hpp" 19 : #include "Parallel/PhaseControl/PhaseChange.hpp" 20 : #include "Utilities/ErrorHandling/Assert.hpp" 21 : #include "Utilities/Functional.hpp" 22 : #include "Utilities/Serialization/CharmPupable.hpp" 23 : #include "Utilities/System/ParallelInfo.hpp" 24 : #include "Utilities/TMPL.hpp" 25 : #include "Utilities/TaggedTuple.hpp" 26 : 27 1 : namespace PhaseControl { 28 : 29 0 : namespace Tags { 30 : /// Storage in the phase change decision tuple so that the Main chare can record 31 : /// the phase to go to when restarting the run from a checkpoint file. 32 : /// 33 : /// \note This tag is not intended to participate in any of the reduction 34 : /// procedures, so will error if the combine method is called. 35 1 : struct RestartPhase { 36 0 : using type = std::optional<Parallel::Phase>; 37 : 38 0 : struct combine_method { 39 0 : [[noreturn]] std::optional<Parallel::Phase> operator()( 40 : const std::optional<Parallel::Phase> /*first_phase*/, 41 : const std::optional<Parallel::Phase>& /*second_phase*/); 42 : }; 43 : 44 0 : using main_combine_method = combine_method; 45 : }; 46 : 47 : /// Storage in the phase change decision tuple so that the Main chare can record 48 : /// the elapsed wallclock time since the start of the run. 49 : /// 50 : /// \note This tag is not intended to participate in any of the reduction 51 : /// procedures, so will error if the combine method is called. 52 1 : struct WallclockHoursAtCheckpoint { 53 0 : using type = std::optional<double>; 54 : 55 0 : struct combine_method { 56 0 : [[noreturn]] std::optional<double> operator()( 57 : const std::optional<double> /*first_time*/, 58 : const std::optional<double>& /*second_time*/); 59 : }; 60 0 : using main_combine_method = combine_method; 61 : }; 62 : 63 : /// Stores whether the checkpoint and exit has been requested. 64 : /// 65 : /// Combinations are performed via `funcl::Or`, as the phase in question should 66 : /// be chosen if any component requests the jump. 67 1 : struct CheckpointAndExitRequested { 68 0 : using type = bool; 69 : 70 0 : using combine_method = funcl::Or<>; 71 0 : using main_combine_method = funcl::Or<>; 72 : }; 73 : 74 : } // namespace Tags 75 : 76 : /*! 77 : * \brief Phase control object that runs the WriteCheckpoint and Exit phases 78 : * after a specified amount of wallclock time has elapsed. 79 : * 80 : * When the executable exits from here, it does so with 81 : * `Parallel::ExitCode::ContinueFromCheckpoint`. 82 : * 83 : * This phase control is useful for running SpECTRE executables performing 84 : * lengthy computations that may exceed a supercomputer's wallclock limits. 85 : * Writing a single checkpoint at the end of the job's allocated time allows 86 : * the computation to be continued, while minimizing the disc space taken up by 87 : * checkpoint files. 88 : * 89 : * Note that this phase control is not a trigger on wallclock time. Rather, 90 : * it checks the elapsed wallclock time when called, likely from a global sync 91 : * point triggered by some other mechanism, e.g., at some slab boundary. 92 : * Therefore, the WriteCheckpoint and Exit phases will run the first time 93 : * this phase control is called after the specified wallclock time has been 94 : * reached. 95 : * 96 : * \warning the global sync points _must_ be triggered often enough to ensure 97 : * there will be at least one sync point (i.e., one call to this phase control) 98 : * in the window between the requested checkpoint-and-exit time and the time at 99 : * which the batch system will kill the executable. To make this more concrete, 100 : * consider this example: when running on a 12-hour queue with a 101 : * checkpoint-and-exit requested after 11.5 hours, there is a 0.5-hour window 102 : * for a global sync to occur, the checkpoint files to be written to disc, and 103 : * the executable to clean up. In this case, triggering a global sync every 104 : * 2-10 minutes might be desirable. Matching the global sync frequency with the 105 : * time window for checkpoint and exit is the responsibility of the user! 106 : */ 107 1 : struct CheckpointAndExitAfterWallclock : public PhaseChange { 108 0 : CheckpointAndExitAfterWallclock(const std::optional<double> wallclock_hours, 109 : const Options::Context& context = {}); 110 : 111 0 : explicit CheckpointAndExitAfterWallclock(CkMigrateMessage* msg); 112 : 113 : /// \cond 114 : CheckpointAndExitAfterWallclock() = default; 115 : using PUP::able::register_constructor; 116 : WRAPPED_PUPable_decl_template(CheckpointAndExitAfterWallclock); // NOLINT 117 : /// \endcond 118 : 119 0 : struct WallclockHours { 120 0 : using type = Options::Auto<double, Options::AutoLabel::None>; 121 0 : static constexpr Options::String help = { 122 : "Time in hours after which to write the checkpoint and exit. " 123 : "If 'None' is specified, no action will be taken."}; 124 : }; 125 : 126 0 : using options = tmpl::list<WallclockHours>; 127 0 : static constexpr Options::String help{ 128 : "Once the wallclock time has exceeded the specified amount, trigger " 129 : "writing a checkpoint and then exit with the 'ContinueFromCheckpoint' " 130 : "exit code."}; 131 : 132 0 : using argument_tags = tmpl::list<>; 133 0 : using return_tags = tmpl::list<>; 134 : 135 0 : using phase_change_tags_and_combines = 136 : tmpl::list<Tags::RestartPhase, Tags::WallclockHoursAtCheckpoint, 137 : Tags::CheckpointAndExitRequested>; 138 : 139 : template <typename Metavariables> 140 0 : using participating_components = typename Metavariables::component_list; 141 : 142 : template <typename... DecisionTags> 143 0 : void initialize_phase_data_impl( 144 : const gsl::not_null<tuples::TaggedTuple<DecisionTags...>*> 145 : phase_change_decision_data) const; 146 : 147 : template <typename ParallelComponent, typename ArrayIndex, 148 : typename Metavariables> 149 0 : void contribute_phase_data_impl(Parallel::GlobalCache<Metavariables>& cache, 150 : const ArrayIndex& array_index) const; 151 : 152 : template <typename... DecisionTags, typename Metavariables> 153 : typename std::optional<std::pair<Parallel::Phase, ArbitrationStrategy>> 154 0 : arbitrate_phase_change_impl( 155 : const gsl::not_null<tuples::TaggedTuple<DecisionTags...>*> 156 : phase_change_decision_data, 157 : const Parallel::Phase current_phase, 158 : const Parallel::GlobalCache<Metavariables>& /*cache*/) const; 159 : 160 0 : void pup(PUP::er& p) override; 161 : 162 : private: 163 0 : std::optional<double> wallclock_hours_for_checkpoint_and_exit_ = std::nullopt; 164 : }; 165 : 166 : template <typename... DecisionTags> 167 : void CheckpointAndExitAfterWallclock::initialize_phase_data_impl( 168 : const gsl::not_null<tuples::TaggedTuple<DecisionTags...>*> 169 : phase_change_decision_data) const { 170 : tuples::get<Tags::RestartPhase>(*phase_change_decision_data) = std::nullopt; 171 : tuples::get<Tags::WallclockHoursAtCheckpoint>(*phase_change_decision_data) = 172 : std::nullopt; 173 : tuples::get<Tags::CheckpointAndExitRequested>(*phase_change_decision_data) = 174 : false; 175 : } 176 : 177 : template <typename ParallelComponent, typename ArrayIndex, 178 : typename Metavariables> 179 : void CheckpointAndExitAfterWallclock::contribute_phase_data_impl( 180 : Parallel::GlobalCache<Metavariables>& cache, 181 : const ArrayIndex& array_index) const { 182 : if constexpr (std::is_same_v<typename ParallelComponent::chare_type, 183 : Parallel::Algorithms::Array>) { 184 : Parallel::contribute_to_phase_change_reduction<ParallelComponent>( 185 : tuples::TaggedTuple<Tags::CheckpointAndExitRequested>{true}, cache, 186 : array_index); 187 : } else { 188 : Parallel::contribute_to_phase_change_reduction<ParallelComponent>( 189 : tuples::TaggedTuple<Tags::CheckpointAndExitRequested>{true}, cache); 190 : } 191 : } 192 : 193 : template <typename... DecisionTags, typename Metavariables> 194 : typename std::optional<std::pair<Parallel::Phase, ArbitrationStrategy>> 195 : CheckpointAndExitAfterWallclock::arbitrate_phase_change_impl( 196 : const gsl::not_null<tuples::TaggedTuple<DecisionTags...>*> 197 : phase_change_decision_data, 198 : const Parallel::Phase current_phase, 199 : const Parallel::GlobalCache<Metavariables>& /*cache*/) const { 200 : // If no checkpoint-and-exit time given, then do nothing 201 : if (not wallclock_hours_for_checkpoint_and_exit_.has_value()) { 202 : return std::nullopt; 203 : } 204 : 205 : const double elapsed_hours = sys::wall_time() / 3600.0; 206 : 207 : auto& restart_phase = 208 : tuples::get<Tags::RestartPhase>(*phase_change_decision_data); 209 : auto& wallclock_hours_at_checkpoint = 210 : tuples::get<Tags::WallclockHoursAtCheckpoint>( 211 : *phase_change_decision_data); 212 : auto& exit_code = 213 : tuples::get<Parallel::Tags::ExitCode>(*phase_change_decision_data); 214 : if (restart_phase.has_value()) { 215 : ASSERT(wallclock_hours_at_checkpoint.has_value(), 216 : "Consistency error: Should have recorded the Wallclock time " 217 : "while recording a phase to restart from."); 218 : // This `if` branch, where restart_phase has a value, is the 219 : // post-checkpoint call to arbitrate_phase_change. Depending on the time 220 : // elapsed so far in this run, next phase is... 221 : // - Exit, if the time is large 222 : // - restart_phase, if the time is small 223 : if (elapsed_hours >= wallclock_hours_at_checkpoint.value()) { 224 : // Preserve restart_phase for use after restarting from the checkpoint 225 : exit_code = Parallel::ExitCode::ContinueFromCheckpoint; 226 : return std::make_pair(Parallel::Phase::Exit, 227 : ArbitrationStrategy::RunPhaseImmediately); 228 : } else { 229 : // Reset restart_phase until it is needed for the next checkpoint 230 : const auto result = restart_phase; 231 : restart_phase.reset(); 232 : wallclock_hours_at_checkpoint.reset(); 233 : return std::make_pair(result.value(), 234 : ArbitrationStrategy::PermitAdditionalJumps); 235 : } 236 : } 237 : 238 : auto& checkpoint_and_exit_requested = 239 : tuples::get<Tags::CheckpointAndExitRequested>( 240 : *phase_change_decision_data); 241 : if (checkpoint_and_exit_requested) { 242 : checkpoint_and_exit_requested = false; 243 : // We checked wallclock_hours_for_checkpoint_and_exit_ has value above 244 : if (elapsed_hours >= wallclock_hours_for_checkpoint_and_exit_.value()) { 245 : // Record phase and actual elapsed time for determining following phase 246 : restart_phase = current_phase; 247 : wallclock_hours_at_checkpoint = elapsed_hours; 248 : return std::make_pair(Parallel::Phase::WriteCheckpoint, 249 : ArbitrationStrategy::RunPhaseImmediately); 250 : } 251 : } 252 : return std::nullopt; 253 : } 254 : } // namespace PhaseControl