Line data Source code
1 1 : // Distributed under the MIT License.
2 : // See LICENSE.txt for details.
3 :
4 : /// \file
5 : /// Defines the Charm++ mainchare.
6 :
7 : #pragma once
8 :
9 : #include <array>
10 : #include <boost/program_options.hpp>
11 : #include <charm++.h>
12 : #include <initializer_list>
13 : #include <pup.h>
14 : #include <regex>
15 : #include <sstream>
16 : #include <string>
17 : #include <type_traits>
18 :
19 : #include "Informer/Informer.hpp"
20 : #include "Options/ParseOptions.hpp"
21 : #include "Options/Tags.hpp"
22 : #include "Parallel/AlgorithmMetafunctions.hpp"
23 : #include "Parallel/CharmRegistration.hpp"
24 : #include "Parallel/CreateFromOptions.hpp"
25 : #include "Parallel/ExitCode.hpp"
26 : #include "Parallel/GlobalCache.hpp"
27 : #include "Parallel/Local.hpp"
28 : #include "Parallel/ParallelComponentHelpers.hpp"
29 : #include "Parallel/Phase.hpp"
30 : #include "Parallel/PhaseControl/ExecutePhaseChange.hpp"
31 : #include "Parallel/PhaseControl/InitializePhaseChangeDecisionData.hpp"
32 : #include "Parallel/PhaseControl/PhaseControlTags.hpp"
33 : #include "Parallel/PhaseControlReductionHelpers.hpp"
34 : #include "Parallel/Printf/Printf.hpp"
35 : #include "Parallel/Reduction.hpp"
36 : #include "Parallel/ResourceInfo.hpp"
37 : #include "Parallel/Tags/ResourceInfo.hpp"
38 : #include "Parallel/TypeTraits.hpp"
39 : #include "Utilities/ErrorHandling/Error.hpp"
40 : #include "Utilities/FileSystem.hpp"
41 : #include "Utilities/Formaline.hpp"
42 : #include "Utilities/MakeString.hpp"
43 : #include "Utilities/Overloader.hpp"
44 : #include "Utilities/StdHelpers.hpp"
45 : #include "Utilities/System/Exit.hpp"
46 : #include "Utilities/System/ParallelInfo.hpp"
47 : #include "Utilities/TMPL.hpp"
48 : #include "Utilities/TaggedTuple.hpp"
49 : #include "Utilities/TypeTraits/CreateGetTypeAliasOrDefault.hpp"
50 : #include "Utilities/TypeTraits/CreateIsCallable.hpp"
51 :
52 : #include "Parallel/Main.decl.h"
53 :
54 : namespace Parallel {
// Invokes the CREATE_IS_CALLABLE / CREATE_IS_CALLABLE_V macros for the name
// `run_deadlock_analysis_simple_actions`.
// NOTE(review): the macros are defined in
// Utilities/TypeTraits/CreateIsCallable.hpp, not in this file; presumably they
// generate traits that detect whether a type has a callable member of that
// name -- confirm the generated names there before relying on them.
55 : namespace detail {
56 : CREATE_IS_CALLABLE(run_deadlock_analysis_simple_actions)
57 : CREATE_IS_CALLABLE_V(run_deadlock_analysis_simple_actions)
58 : } // namespace detail
59 :
60 : /// \ingroup ParallelGroup
61 : /// The main function of a Charm++ executable.
62 : /// See [the Parallelization documentation](group__ParallelGroup.html#details)
63 : /// for an overview of Metavariables, Phases, and parallel components.
64 : template <typename Metavariables>
65 1 : class Main : public CBase_Main<Metavariables> {
66 : public:
// The parallel components of the executable, as listed by the Metavariables.
67 0 : using component_list = typename Metavariables::component_list;
// Tag lists obtained for the Metavariables from get_const_global_cache_tags
// and get_mutable_global_cache_tags. Their option tags are part of
// `option_list` below.
68 0 : using const_global_cache_tags = get_const_global_cache_tags<Metavariables>;
69 0 : using mutable_global_cache_tags =
70 : get_mutable_global_cache_tags<Metavariables>;
71 :
// Tag list of the `phase_change_decision_data_` member.
72 0 : using phase_change_tags_and_combines_list =
73 : PhaseControl::get_phase_change_tags<Metavariables>;
74 : /// \cond HIDDEN_SYMBOLS
75 : /// The constructor used to register the class
76 : explicit Main(const Parallel::charmxx::
77 : MainChareRegistrationConstructor& /*used_for_reg*/) {}
// The destructor only names the static `registrar` of RegisterChare.
// NOTE(review): presumably this forces the registrar to be instantiated so
// that the chare gets registered with Charm++ -- confirm in
// Parallel/CharmRegistration.hpp.
78 : ~Main() override {
79 : (void)Parallel::charmxx::RegisterChare<
80 : Main<Metavariables>, CkIndex_Main<Metavariables>>::registrar;
81 : }
82 : Main(const Main&) = default;
83 : Main& operator=(const Main&) = default;
84 : Main(Main&&) = default;
85 : Main& operator=(Main&&) = default;
86 : /// \endcond
87 :
// Startup constructor: parses the command line and the input file, creates the
// GlobalCache and the proxies of all parallel components, and sends the
// proxies to the GlobalCache with
// allocate_remaining_components_and_execute_initialization_phase() as the
// callback.
88 0 : explicit Main(CkArgMsg* msg);
// Constructor taking a CkMigrateMessage; it only forwards `msg` to CBase_Main.
89 0 : explicit Main(CkMigrateMessage* msg);
90 :
// Serializes the member variables except `options_`. It also stores the number
// of nodes and procs, and errors on unpacking if they differ from the current
// values.
91 : // NOLINTNEXTLINE(google-runtime-references)
92 0 : void pup(PUP::er& p) override;
93 :
94 : /// Allocate singleton components and the initial elements of array
95 : /// components, then execute the initialization phase on each component
96 1 : void allocate_remaining_components_and_execute_initialization_phase();
97 :
98 : /// Determine the next phase of the simulation and execute it.
99 1 : void execute_next_phase();
100 :
101 : /// Place the Charm++ call that starts load balancing
102 : ///
103 : /// \details This call is wrapped within an entry method so that it may be
104 : /// used as the callback after a quiescence detection.
105 1 : void start_load_balance();
106 :
107 : /// Place the Charm++ call that starts writing a checkpoint
108 : /// Reset the checkpoint counter to zero if the checkpoints directory does not
109 : /// exist (this happens when the simulation continues in a new segment).
110 : ///
111 : /// \details This call is wrapped within an entry method so that it may be
112 : /// used as the callback after a quiescence detection.
113 1 : void start_write_checkpoint();
114 :
115 : /// Reduction target for data used in phase change decisions.
116 : ///
117 : /// It is required that the `Parallel::ReductionData` holds a single
118 : /// `tuples::TaggedTuple`.
119 : template <typename InvokeCombine, typename... Tags>
120 1 : void phase_change_reduction(
121 : ReductionData<ReductionDatum<tuples::TaggedTuple<Tags...>, InvokeCombine,
122 : funcl::Identity, std::index_sequence<>>>
123 : reduction_data);
124 :
125 : /// Add an exception to the list of exceptions. Upon a phase change we print
126 : /// all the received exceptions and exit.
127 : ///
128 : /// Upon receiving an exception all algorithms are terminated to guarantee
129 : /// quiescence occurs soon after the exception is reported.
130 1 : void add_exception_message(std::string exception_message);
131 :
132 : /// A reduction target used to determine if all the elements of the array,
133 : /// group, nodegroup, or singleton parallel components terminated
134 : /// successfully.
135 : ///
136 : /// This allows detecting deadlocks in iterable actions, but not simple or
137 : /// reduction actions.
138 1 : void did_all_elements_terminate(bool all_elements_terminated);
139 :
140 : /// Prints exit info and stops the executable with failure if a deadlock was
141 : /// detected.
142 1 : void post_deadlock_analysis_termination();
143 :
144 : private:
145 : // Return the dir name for the Charm++ checkpoints as well as the prefix for
146 : // checkpoint names and their padding. This is a "detail" function so that
147 : // these pieces can be defined in one place only.
148 0 : std::tuple<std::string, std::string, size_t> checkpoints_dir_prefix_pad()
149 : const;
150 :
151 : // Return the dir name for the next Charm++ checkpoint; check and error if
152 : // this name already exists and writing the checkpoint would be unsafe.
153 0 : std::string next_checkpoint_dir() const;
154 :
155 : // Check if future checkpoint dirs are available; error if any already exist.
156 0 : void check_future_checkpoint_dirs_available() const;
157 :
158 : // Starts a reduction on the component specified by
159 : // the current_termination_check_index_ member variable, then increment
160 : // current_termination_check_index_
161 0 : void check_if_component_terminated_correctly();
162 :
163 : // Lists of all parallel component types
// Components that are groups or nodegroups.
164 0 : using group_component_list =
165 : tmpl::filter<component_list, tmpl::or_<Parallel::is_group<tmpl::_1>,
166 : Parallel::is_nodegroup<tmpl::_1>>>;
// Components that are arrays, whether bound or not.
167 0 : using all_array_component_list =
168 : tmpl::filter<component_list, Parallel::is_array<tmpl::_1>>;
// Array components that are not bound arrays.
169 0 : using non_bound_array_component_list =
170 : tmpl::filter<component_list,
171 : tmpl::and_<Parallel::is_array<tmpl::_1>,
172 : tmpl::not_<Parallel::is_bound_array<tmpl::_1>>>>;
// Array components that are bound arrays.
173 0 : using bound_array_component_list =
174 : tmpl::filter<component_list,
175 : tmpl::and_<Parallel::is_array<tmpl::_1>,
176 : Parallel::is_bound_array<tmpl::_1>>>;
// Components that are singletons.
177 0 : using singleton_component_list =
178 : tmpl::filter<component_list, Parallel::is_singleton<tmpl::_1>>;
179 :
// Option tags for the `simple_tags_from_options` of a parallel component.
180 : template <typename ParallelComponent>
181 0 : using parallel_component_options = Parallel::get_option_tags<
182 : typename ParallelComponent::simple_tags_from_options, Metavariables>;
// Option tags for the `array_allocation_tags` of an array component.
183 : template <typename ArrayComponent>
184 0 : using array_component_allocation_options =
185 : Parallel::get_option_tags<typename ArrayComponent::array_allocation_tags,
186 : Metavariables>;
// Flattened, duplicate-free list of all option tags: ResourceInfo, the option
// tags of the const and mutable GlobalCache tags, the option tags of every
// component's simple_tags_from_options, and the option tags of every array
// component's array_allocation_tags.
187 0 : using option_list = tmpl::remove_duplicates<tmpl::flatten<tmpl::list<
188 : Parallel::OptionTags::ResourceInfo<Metavariables>,
189 : Parallel::get_option_tags<const_global_cache_tags, Metavariables>,
190 : Parallel::get_option_tags<mutable_global_cache_tags, Metavariables>,
191 : tmpl::transform<component_list,
192 : tmpl::bind<parallel_component_options, tmpl::_1>>,
193 : tmpl::transform<
194 : all_array_component_list,
195 : tmpl::bind<array_component_allocation_options, tmpl::_1>>>>>;
196 :
// The current phase; starts out as Initialization.
197 0 : Parallel::Phase current_phase_{Parallel::Phase::Initialization};
// Proxy to the GlobalCache created in the CkArgMsg constructor.
198 0 : CProxy_GlobalCache<Metavariables> global_cache_proxy_;
// Proxy to the AtSyncIndicator created in the CkArgMsg constructor.
199 0 : detail::CProxy_AtSyncIndicator<Metavariables> at_sync_indicator_proxy_;
200 : // This is only used during startup, and will be cleared after all
201 : // the chares are created. It is a member variable because passing
202 : // local state through charm callbacks is painful.
203 0 : tuples::tagged_tuple_from_typelist<option_list> options_{};
204 : // type to be determined by the collection of available phase changers in the
205 : // Metavariables
206 : tuples::tagged_tuple_from_typelist<phase_change_tags_and_combines_list>
207 0 : phase_change_decision_data_;
// Checkpoint counter; serialized in pup().
// NOTE(review): presumably this is the counter that start_write_checkpoint()
// resets -- its uses are outside this view, confirm there.
208 0 : size_t checkpoint_dir_counter_ = 0_st;
// Assigned from the ResourceInfo option in the CkArgMsg constructor, which
// then calls build_singleton_map() on it.
209 0 : Parallel::ResourceInfo<Metavariables> resource_info_{};
210 : // All exception errors we've received so far.
211 0 : std::vector<std::string> exception_messages_{};
212 : // Used to keep track of which parallel component we are checking has
213 : // successfully terminated.
214 0 : size_t current_termination_check_index_{0};
// NOTE(review): judging by the name, holds the names of components whose
// termination check failed; it is only serialized in the visible code.
215 0 : std::vector<std::string> components_that_did_not_terminate_{};
216 : };
217 :
218 : namespace detail {
219 :
220 : // Charm++ AtSync effectively requires an additional global sync to the
221 : // quiescence detection we do for switching phases. However, AtSync only needs
222 : // to be called for one array to trigger the sync-based load balancing, so the
223 : // AtSyncIndicator is a silly hack to have a centralized indication to start
224 : // load-balancing. It participates in the `AtSync` barrier, but is not
225 : // migratable, and should only be constructed by the `Main` chare on the same
226 : // processor as the `Main` chare. When load-balancing occurs, main invokes the
227 : // member function `IndicateAtSync()`, and when `ResumeFromSync()` is called
228 : // from charm, `AtSyncIndicator` simply passes control back to the Main chare
229 : // via `execute_next_phase()`.
230 : template <typename Metavariables>
231 : class AtSyncIndicator : public CBase_AtSyncIndicator<Metavariables> {
232 : public:
// Stores the proxy to the Main chare, enables AtSync, and disables migration
// (see the definition below).
233 : AtSyncIndicator(CProxy_Main<Metavariables> main_proxy);
234 : AtSyncIndicator(const AtSyncIndicator&) = default;
235 : AtSyncIndicator& operator=(const AtSyncIndicator&) = default;
236 : AtSyncIndicator(AtSyncIndicator&&) = default;
237 : AtSyncIndicator& operator=(AtSyncIndicator&&) = default;
// The destructor only names the static `registrar` of RegisterChare.
// NOTE(review): presumably this forces the registrar to be instantiated so
// that the chare gets registered with Charm++ -- confirm in
// Parallel/CharmRegistration.hpp.
238 : ~AtSyncIndicator() override {
239 : (void)Parallel::charmxx::RegisterChare<
240 : AtSyncIndicator<Metavariables>,
241 : CkIndex_AtSyncIndicator<Metavariables>>::registrar;
242 : }
243 :
// Calls AtSync() on this chare.
244 : void IndicateAtSync();
245 :
// Calls execute_next_phase() on the Main proxy.
246 : void ResumeFromSync() override;
247 :
// Constructor taking a CkMigrateMessage; it only forwards `msg` to the base
// class.
248 : explicit AtSyncIndicator(CkMigrateMessage* msg)
249 : : CBase_AtSyncIndicator<Metavariables>(msg) {}
250 :
// The Main proxy is the only state that is serialized.
251 : void pup(PUP::er& p) override { p | main_proxy_; }
252 :
253 : private:
// Proxy used to hand control back to the Main chare in ResumeFromSync().
254 : CProxy_Main<Metavariables> main_proxy_;
255 : };
256 :
// Stores `main_proxy`, then sets usesAtSync and marks this chare as not
// migratable.
257 : template <typename Metavariables>
258 : AtSyncIndicator<Metavariables>::AtSyncIndicator(
259 : CProxy_Main<Metavariables> main_proxy)
260 : : main_proxy_{main_proxy} {
261 : this->usesAtSync = true;
262 : this->setMigratable(false);
263 : }
264 :
// Thin wrapper around the AtSync() call of this chare.
265 : template <typename Metavariables>
266 : void AtSyncIndicator<Metavariables>::IndicateAtSync() {
267 : this->AtSync();
268 : }
269 :
// Passes control back to the Main chare by invoking execute_next_phase() on
// its proxy.
270 : template <typename Metavariables>
271 : void AtSyncIndicator<Metavariables>::ResumeFromSync() {
272 : main_proxy_.execute_next_phase();
273 : }
274 : } // namespace detail
275 :
276 : // ================================================================
277 :
// Startup constructor of the Main chare. In order, it:
//  - prints startup info and parses the command line with
//    boost::program_options,
//  - handles the --help and --dump-* flags,
//  - parses the input file (or a built-in ResourceInfo block when there are no
//    other options) and handles --check-options,
//  - stores the parsed options in options_ and resource_info_,
//  - creates the GlobalCache, builds the singleton map, and creates the
//    AtSyncIndicator and the proxies of all parallel components,
//  - sends the proxies to the GlobalCache with
//    allocate_remaining_components_and_execute_initialization_phase() as the
//    callback, initializes the phase-change decision data, and creates the
//    printer chare.
278 : template <typename Metavariables>
279 : Main<Metavariables>::Main(CkArgMsg* msg) {
280 : Informer::print_startup_info(msg);
281 :
282 : /// \todo detail::register_events_to_trace();
283 :
284 : namespace bpo = boost::program_options;
285 : try {
286 : bpo::options_description command_line_options;
287 : // disable clang-format because it combines the repeated call operator
288 : // invocations making the code more difficult to parse.
289 : // clang-format off
290 : command_line_options.add_options()
291 : ("help,h", "Describe program options")
292 : ("check-options", "Check input file options")
293 : ("dump-source-tree-as", bpo::value<std::string>(),
294 : "If specified, then a gzip archive of the source tree is dumped "
295 : "with the specified name. The archive can be expanded using "
296 : "'tar -xzf ARCHIVE.tar.gz'")
297 : ("dump-paths",
298 : "Dump the PATH, CPATH, LD_LIBRARY_PATH, LIBRARY_PATH, and "
299 : "CMAKE_PREFIX_PATH at compile time.")
300 : ("dump-environment",
301 : "Dump the result of printenv at compile time.")
302 : ("dump-build-info",
303 : "Dump the contents of SpECTRE's BuildInfo.txt")
304 : ("dump-only",
305 : "Exit after dumping requested information.")
306 : ;
307 : // clang-format on
308 :
309 : // False if there are no other options besides the explicitly added
310 : // Parallel::OptionTags::ResourceInfo<Metavariables>.
311 : constexpr bool has_options = tmpl::size<option_list>::value > 1;
312 : // Add input-file option if it makes sense
// The four lambdas cover {has options, no options} x {Metavariables has a
// static `input_file`, does not}. The trailing `0` argument makes the lambdas
// taking `int` the preferred match over those taking `auto...` whenever their
// `std::void_t<decltype(...::input_file)>` return type is well-formed.
// NOTE(review): assumes Overloader forms one overload set from the lambdas --
// confirm in Utilities/Overloader.hpp.
313 : Overloader{
314 : [&command_line_options](std::true_type /*meta*/, auto mv,
315 : int /*gcc_bug*/)
316 : -> std::void_t<decltype(
317 : tmpl::type_from<decltype(mv)>::input_file)> {
318 : // Metavariables has options and default input file name
319 : command_line_options.add_options()(
320 : "input-file",
321 : bpo::value<std::string>()->default_value(
322 : tmpl::type_from<decltype(mv)>::input_file),
323 : "Input file name");
324 : },
325 : [&command_line_options](std::true_type /*meta*/, auto /*mv*/,
326 : auto... /*unused*/) {
327 : // Metavariables has options and no default input file name
328 : command_line_options.add_options()(
329 : "input-file", bpo::value<std::string>(), "Input file name");
330 : },
331 : [](std::false_type /*meta*/, auto mv, int /*gcc_bug*/)
332 : -> std::void_t<decltype(
333 : tmpl::type_from<decltype(mv)>::input_file)> {
334 : // Metavariables has no options and default input file name
335 :
336 : // always false, but must depend on mv
337 : static_assert(std::is_same_v<decltype(mv), void>,
338 : "Metavariables supplies input file name, "
339 : "but there are no options");
340 : ERROR("This should have failed at compile time");
341 : },
342 : [](std::false_type /*meta*/, auto... /*unused*/) {
343 : // Metavariables has no options and no default input file name
344 : }}(std::bool_constant<has_options>{}, tmpl::type_<Metavariables>{}, 0);
345 :
346 : bpo::command_line_parser command_line_parser(msg->argc, msg->argv);
347 : command_line_parser.options(command_line_options);
348 :
// Use Metavariables::ignore_unrecognized_command_line_options if that member
// exists; otherwise fall back to `false`. This is the same int-vs-variadic
// dispatch as for the input-file option above.
349 : const bool ignore_unrecognized_command_line_options = Overloader{
350 : [](auto mv, int /*gcc_bug*/)
351 : -> decltype(tmpl::type_from<decltype(
352 : mv)>::ignore_unrecognized_command_line_options) {
353 : return tmpl::type_from<decltype(
354 : mv)>::ignore_unrecognized_command_line_options;
355 : },
356 : [](auto /*mv*/, auto... /*meta*/) { return false; }}(
357 : tmpl::type_<Metavariables>{}, 0);
358 : if (ignore_unrecognized_command_line_options) {
359 : // Allow unknown --options
360 : command_line_parser.allow_unregistered();
361 : } else {
362 : // Forbid positional parameters
363 : command_line_parser.positional({});
364 : }
365 :
366 : bpo::variables_map parsed_command_line_options;
367 : bpo::store(command_line_parser.run(), parsed_command_line_options);
368 : bpo::notify(parsed_command_line_options);
369 :
// The input-file parser handles every tag in option_list except
// Options::Tags::InputSource, which is removed here.
370 : Options::Parser<tmpl::remove<option_list, Options::Tags::InputSource>>
371 : options(Metavariables::help);
372 :
// --help: print the command-line help and the input-file help, then exit.
373 : if (parsed_command_line_options.count("help") != 0) {
374 : Parallel::printf("%s\n%s", command_line_options, options.help());
375 : sys::exit();
376 : }
377 :
// The --dump-* flags print or write information obtained from formaline.
// Execution continues afterwards unless --dump-only was also passed.
378 : if (parsed_command_line_options.count("dump-source-tree-as") != 0) {
379 : formaline::write_to_file(
380 : parsed_command_line_options["dump-source-tree-as"].as<std::string>());
381 : Parallel::printf("Dumping archive of source tree at link time.\n");
382 : }
383 : if (parsed_command_line_options.count("dump-paths") != 0) {
384 : Parallel::printf("Paths at link time were:\n%s\n",
385 : formaline::get_paths());
386 : }
387 : if (parsed_command_line_options.count("dump-environment") != 0) {
388 : Parallel::printf("Environment variables at link time were:\n%s\n",
389 : formaline::get_environment_variables());
390 : }
391 : if (parsed_command_line_options.count("dump-build-info") != 0) {
392 : Parallel::printf("BuildInfo.txt at link time was:\n%s\n",
393 : formaline::get_build_info());
394 : }
395 : if (parsed_command_line_options.count("dump-only") != 0) {
396 : sys::exit();
397 : }
398 :
// `input_file` stays empty when there are no options to read from a file.
399 : std::string input_file;
400 : if (has_options) {
401 : if (parsed_command_line_options.count("input-file") == 0) {
402 : ERROR_NO_TRACE("No default input file name. Pass --input-file.");
403 : }
404 : input_file = parsed_command_line_options["input-file"].as<std::string>();
405 : options.parse_file(input_file);
406 : } else {
// No input file: parse a built-in ResourceInfo block instead. The Singletons
// entry is only included when there is at least one singleton component.
407 : if constexpr (tmpl::size<singleton_component_list>::value > 0) {
408 : options.parse(
409 : "ResourceInfo:\n"
410 : " AvoidGlobalProc0: false\n"
411 : " Singletons: Auto\n");
412 : } else {
413 : options.parse(
414 : "ResourceInfo:\n"
415 : " AvoidGlobalProc0: false\n");
416 : }
417 : }
418 :
419 : if (parsed_command_line_options.count("check-options") != 0) {
420 : // Force all the options to be created.
421 : options.template apply<option_list, Metavariables>([](auto... args) {
422 : (void)std::initializer_list<char>{((void)args, '0')...};
423 : });
424 : if (has_options) {
425 : Parallel::printf("\n%s parsed successfully!\n", input_file);
426 : } else {
427 : // This is still considered successful, since it means the
428 : // program would have started.
429 : Parallel::printf("\nNo options to check!\n");
430 : }
431 :
432 : // Include a check that the checkpoint dirs are available for writing as
433 : // part of checking the option parsing. Doing these checks together helps
434 : // catch more user errors before running the executable 'for real'.
435 : //
436 : // Note we don't do this check at the beginning of the Main chare
437 : // constructor because we don't _always_ want to error if checkpoint dirs
438 : // already exist. For example, running the executable with flags like
439 : // `--help` or `--dump-source-tree-as` should succeed even if checkpoints
440 : // were previously written.
441 : check_future_checkpoint_dirs_available();
442 :
443 : sys::exit();
444 : }
445 :
// Create all options and move them into the options_ member.
446 : options_ =
447 : options.template apply<option_list, Metavariables>([](auto... args) {
448 : return tuples::tagged_tuple_from_typelist<option_list>(
449 : std::move(args)...);
450 : });
451 :
452 : resource_info_ =
453 : tuples::get<Parallel::OptionTags::ResourceInfo<Metavariables>>(
454 : options_);
455 :
456 : Parallel::printf("\nOption parsing completed.\n");
// Only boost::program_options errors are caught here; they are reported
// through ERROR.
457 : } catch (const bpo::error& e) {
458 : ERROR(e.what());
459 : }
460 :
461 : check_future_checkpoint_dirs_available();
462 :
// Create the GlobalCache from the const and mutable cache tags and hand it a
// proxy to this Main chare.
463 : global_cache_proxy_ = CProxy_GlobalCache<Metavariables>::ckNew(
464 : Parallel::create_from_options<Metavariables>(options_,
465 : const_global_cache_tags{}),
466 : Parallel::create_from_options<Metavariables>(options_,
467 : mutable_global_cache_tags{}),
468 : this->thisProxy);
469 :
470 : // Now that the GlobalCache has been built, create the singleton map which
471 : // will be used to allocate all the singletons. We need to be careful here
472 : // because the parallel components have not been set at this point, so if we
473 : // try to Parallel::get_parallel_component here, an error will occur. This
474 : // call is OK though because build_singleton_map() only uses the parallel info
475 : // functions from the GlobalCache (like cache.number_of_procs()).
476 : resource_info_.build_singleton_map(
477 : *Parallel::local_branch(global_cache_proxy_));
478 :
479 : // Now that the singleton map has been built, set the resource info in the
480 : // GlobalCache (if the tags exist). Since this info will be constant
481 : // throughout a simulation, we opt for directly editing a const tag in the
482 : // GlobalCache before we pass it to any other parallel component rather than
483 : // having a mutable tag and using a mutate call to set it.
484 : global_cache_proxy_.set_resource_info(resource_info_);
485 :
486 : // Now that the singleton map has been built, we have to replace the
487 : // ResourceInfo that was created from options with the one that has all the
488 : // correct singleton assignments so simple tags can be created from options
489 : // with a valid ResourceInfo.
490 : get<Parallel::OptionTags::ResourceInfo<Metavariables>>(options_) =
491 : resource_info_;
492 :
// Create the AtSyncIndicator and insert its single element (index 0) on the
// processor this constructor is running on.
493 : at_sync_indicator_proxy_ =
494 : detail::CProxy_AtSyncIndicator<Metavariables>::ckNew();
495 : at_sync_indicator_proxy_[0].insert(this->thisProxy, sys::my_proc());
496 : at_sync_indicator_proxy_.doneInserting();
497 :
// One entry per parallel component, tagged by tmpl::type_<proxy type>. It is
// filled in below and then sent to the GlobalCache.
498 : using parallel_component_tag_list = tmpl::transform<
499 : component_list,
500 : tmpl::bind<
501 : tmpl::type_,
502 : tmpl::bind<Parallel::proxy_from_parallel_component, tmpl::_1>>>;
503 : tuples::tagged_tuple_from_typelist<parallel_component_tag_list>
504 : the_parallel_components;
505 :
506 : // Print info on DataBox variants
507 : #ifdef SPECTRE_DEBUG
508 : Parallel::printf("\nParallel components:\n");
509 : tmpl::for_each<component_list>([](auto parallel_component_v) {
510 : using parallel_component = tmpl::type_from<decltype(parallel_component_v)>;
511 : using chare_type = typename parallel_component::chare_type;
512 : using charm_type = Parallel::charm_types_with_parameters<
513 : parallel_component, typename Parallel::get_array_index<
514 : chare_type>::template f<parallel_component>>;
515 : Parallel::printf(
516 : " %s (%s) has a DataBox with %u items.\n",
517 : pretty_type::name<parallel_component>(),
518 : pretty_type::name<chare_type>(),
519 : tmpl::size<
520 : typename charm_type::algorithm::databox_type::tags_list>::value);
521 : });
522 : Parallel::printf("\n");
523 : #endif // SPECTRE_DEBUG
524 :
525 : // Construct the group proxies with a dependency on the GlobalCache proxy
526 : CkEntryOptions global_cache_dependency;
527 : global_cache_dependency.setGroupDepID(global_cache_proxy_.ckGetGroupID());
528 :
// Groups and nodegroups are created here with their simple tags built from
// options.
529 : tmpl::for_each<group_component_list>([this, &the_parallel_components,
530 : &global_cache_dependency](
531 : auto parallel_component_v) {
532 : using parallel_component = tmpl::type_from<decltype(parallel_component_v)>;
533 : using ParallelComponentProxy =
534 : Parallel::proxy_from_parallel_component<parallel_component>;
535 : tuples::get<tmpl::type_<ParallelComponentProxy>>(the_parallel_components) =
536 : ParallelComponentProxy::ckNew(
537 : global_cache_proxy_,
538 : Parallel::create_from_options<Metavariables>(
539 : options_,
540 : typename parallel_component::simple_tags_from_options{}),
541 : &global_cache_dependency);
542 : });
543 :
544 : // Create proxies for empty array chares (whose elements will be created by
545 : // the allocate functions of the array components during
546 : // execute_initialization_phase)
547 : tmpl::for_each<non_bound_array_component_list>([&the_parallel_components](
548 : auto parallel_component) {
549 : using ParallelComponentProxy = Parallel::proxy_from_parallel_component<
550 : tmpl::type_from<decltype(parallel_component)>>;
551 : tuples::get<tmpl::type_<ParallelComponentProxy>>(the_parallel_components) =
552 : ParallelComponentProxy::ckNew();
553 : });
554 :
555 : // Create proxies for empty bound array chares
// Each bound array is bound to the proxy of its `bind_to` component, which is
// read from the_parallel_components. The non-bound array proxies were stored
// there by the loop above.
556 : tmpl::for_each<bound_array_component_list>([&the_parallel_components](
557 : auto parallel_component) {
558 : using ParallelComponentProxy = Parallel::proxy_from_parallel_component<
559 : tmpl::type_from<decltype(parallel_component)>>;
560 : CkArrayOptions opts;
561 : opts.bindTo(
562 : tuples::get<tmpl::type_<Parallel::proxy_from_parallel_component<
563 : typename tmpl::type_from<decltype(parallel_component)>::bind_to>>>(
564 : the_parallel_components));
565 : tuples::get<tmpl::type_<ParallelComponentProxy>>(the_parallel_components) =
566 : ParallelComponentProxy::ckNew(opts);
567 : });
568 :
569 : // Create proxies for singletons (which are single-element charm++ arrays)
570 : tmpl::for_each<singleton_component_list>([&the_parallel_components](
571 : auto parallel_component) {
572 : using ParallelComponentProxy = Parallel::proxy_from_parallel_component<
573 : tmpl::type_from<decltype(parallel_component)>>;
574 : tuples::get<tmpl::type_<ParallelComponentProxy>>(the_parallel_components) =
575 : ParallelComponentProxy::ckNew();
576 : });
577 :
578 : // Send the complete list of parallel_components to the GlobalCache on
579 : // each Charm++ node. After all nodes have finished, the callback is
580 : // executed.
581 : CkCallback callback(
582 : CkIndex_Main<Metavariables>::
583 : allocate_remaining_components_and_execute_initialization_phase(),
584 : this->thisProxy);
585 : global_cache_proxy_.set_parallel_components(the_parallel_components,
586 : callback);
587 :
// Set the exit code in the decision data to Complete before the phase-change
// decision data is initialized.
588 : get<Tags::ExitCode>(phase_change_decision_data_) =
589 : Parallel::ExitCode::Complete;
590 : PhaseControl::initialize_phase_change_decision_data(
591 : make_not_null(&phase_change_decision_data_),
592 : *Parallel::local_branch(global_cache_proxy_));
593 :
// `printer_chare` and `printer_chare_is_set` are not declared in this file.
// NOTE(review): presumably they are the globals used by Parallel::printf --
// confirm in Parallel/Printf/Printf.hpp.
594 : printer_chare = CProxy_PrinterChare::ckNew(1);
595 : printer_chare_is_set = true;
596 : }
597 :
// Constructor taking a CkMigrateMessage. It only forwards `msg` to the
// CBase_Main base class; the data members keep their default initializers
// until pup() is called.
598 : template <typename Metavariables>
599 : Main<Metavariables>::Main(CkMigrateMessage* msg)
600 : : CBase_Main<Metavariables>(msg) {}
601 :
// Serialize the Main chare. `options_` is deliberately skipped (see the note
// in the body). When unpacking, this also checks that the future checkpoint
// directories are available and that the node and proc counts stored in the
// stream match the current ones.
602 : template <typename Metavariables>
603 : void Main<Metavariables>::pup(PUP::er& p) { // NOLINT
604 : p | current_phase_;
605 : p | global_cache_proxy_;
606 : p | at_sync_indicator_proxy_;
607 : // Note: we do NOT serialize the options.
608 : // This is because options are only used in the initialization phase when
609 : // the executable first starts up. Thereafter, the information from the
610 : // options will be held in various code objects that will themselves be
611 : // serialized.
612 : p | phase_change_decision_data_;
613 :
614 : p | checkpoint_dir_counter_;
615 : p | resource_info_;
616 : p | exception_messages_;
617 : p | current_termination_check_index_;
618 : p | components_that_did_not_terminate_;
// When unpacking, verify that the future checkpoint directories are
// available. This runs after checkpoint_dir_counter_ has been read above.
619 : if (p.isUnpacking()) {
620 : check_future_checkpoint_dirs_available();
621 : }
622 :
623 : // For now we only support restarts on the same hardware configuration (same
624 : // number of nodes and same procs per node) used when writing the checkpoint.
625 : // We check this by adding counters to the pup stream.
626 : if (p.isUnpacking()) {
// Read the stored counts and compare them against the current run.
627 : int previous_nodes = 0;
628 : int previous_procs = 0;
629 : p | previous_nodes;
630 : p | previous_procs;
631 : if (previous_nodes != sys::number_of_nodes() or
632 : previous_procs != sys::number_of_procs()) {
633 : ERROR(
634 : "Must restart on the same hardware configuration used when writing "
635 : "the checkpoint.\n"
636 : "Checkpoint written with "
637 : << previous_nodes << " nodes, " << previous_procs
638 : << " procs.\n"
639 : "Restarted with "
640 : << sys::number_of_nodes() << " nodes, " << sys::number_of_procs()
641 : << " procs.");
642 : }
643 : } else {
// Not unpacking: add the current node and proc counts to the stream, in the
// same order in which they are read above.
644 : int current_nodes = sys::number_of_nodes();
645 : int current_procs = sys::number_of_procs();
646 : p | current_nodes;
647 : p | current_procs;
648 : }
649 : }
650 :
// Callback passed to GlobalCache::set_parallel_components by the CkArgMsg
// constructor. It inserts the singleton elements, lets every array component
// allocate its elements, clears options_, starts the current phase on every
// component, and registers execute_next_phase() as the quiescence-detection
// callback.
651 : template <typename Metavariables>
652 : void Main<Metavariables>::
653 : allocate_remaining_components_and_execute_initialization_phase() {
// Errors unless the current phase is Initialization.
654 : if (current_phase_ != Parallel::Phase::Initialization) {
655 : ERROR("Must be in the Initialization phase.");
656 : }
657 : // Since singletons are actually single-element Charm++ arrays, we have to
658 : // allocate them here along with the other Charm++ arrays.
659 : tmpl::for_each<singleton_component_list>([this](auto singleton_component_v) {
660 : using singleton_component =
661 : tmpl::type_from<decltype(singleton_component_v)>;
662 : auto& local_cache = *Parallel::local_branch(global_cache_proxy_);
663 : auto& singleton_proxy =
664 : Parallel::get_parallel_component<singleton_component>(local_cache);
665 : auto options = Parallel::create_from_options<Metavariables>(
666 : options_, typename singleton_component::simple_tags_from_options{});
667 :
// The processor for this singleton is taken from resource_info_.
668 : const size_t proc = resource_info_.template proc_for<singleton_component>();
669 : singleton_proxy[0].insert(global_cache_proxy_, std::move(options), proc);
670 : singleton_proxy.doneInserting();
671 : });
672 :
673 : // These are Spectre array components built on Charm++ array chares. Each
674 : // component is in charge of allocating and distributing its elements over the
675 : // computing system.
676 : tmpl::for_each<all_array_component_list>([this](auto parallel_component_v) {
677 : using parallel_component = tmpl::type_from<decltype(parallel_component_v)>;
678 : parallel_component::allocate_array(
679 : global_cache_proxy_,
680 : Parallel::create_from_options<Metavariables>(
681 : options_, typename parallel_component::simple_tags_from_options{}),
682 : Parallel::create_from_options<Metavariables>(
683 : options_, typename parallel_component::array_allocation_tags{}),
684 : resource_info_.procs_to_ignore());
685 : });
686 :
687 : // Free any resources from the initial option parsing.
688 : options_ = decltype(options_){};
689 :
// Start the current phase on every parallel component. The guard at the top
// ensures this is the Initialization phase.
690 : tmpl::for_each<component_list>([this](auto parallel_component_v) {
691 : using parallel_component = tmpl::type_from<decltype(parallel_component_v)>;
692 : Parallel::get_parallel_component<parallel_component>(
693 : *Parallel::local_branch(global_cache_proxy_))
694 : .start_phase(current_phase_);
695 : });
// Once quiescence is detected, continue in execute_next_phase().
696 : CkStartQD(CkCallback(CkIndex_Main<Metavariables>::execute_next_phase(),
697 : this->thisProxy));
698 : }
699 :
700 : template <typename Metavariables>
701 : void Main<Metavariables>::execute_next_phase() {
702 : if (not exception_messages_.empty()) {
703 : // Print exceptions whether we errored during execution or cleanup
704 : Parallel::printf(
705 : "\n\n###############################\n"
706 : "The following exceptions were reported during the phase: %s\n",
707 : current_phase_);
708 : for (const std::string& exception_message : exception_messages_) {
709 : Parallel::printf("%s\n\n", exception_message);
710 : }
711 : exception_messages_.clear();
712 : Parallel::printf(
713 : "To determine where an exception is thrown, run gdb and do\n"
714 : "catch throw EXCEPTION_TYPE\n"
715 : "run\n"
716 : "where EXCEPTION_TYPE is the Type of the exception above.\n"
717 : "You may have to type `continue` to skip some option parser\n"
718 : "exceptions until you get to the one you care about\n"
719 : "You may also have to type `up` or `down` to go up and down\n"
720 : "the function calls in order to find a useful line number.\n\n");
721 :
722 : // Errored during cleanup. Can't have this, so just abort
723 : if (current_phase_ == Parallel::Phase::PostFailureCleanup) {
724 : Parallel::printf(
725 : "Received termination while cleaning up a previous termination. This "
726 : "is cyclic behavior and cannot be supported. Cleanup must exit "
727 : "cleanly without errors.");
728 : sys::abort("");
729 : }
730 :
731 : // Errored during execution. Go to cleanup
732 : current_phase_ = Parallel::Phase::PostFailureCleanup;
733 : Parallel::printf("Entering phase: %s at time %s\n", current_phase_,
734 : sys::pretty_wall_time());
735 : } else {
736 : if (Parallel::Phase::Exit == current_phase_) {
737 : ERROR("Current phase is Exit, but program did not exit!");
738 : }
739 :
740 : if (current_phase_ == Parallel::Phase::PostFailureCleanup) {
741 : Parallel::printf("PostFailureCleanup phase complete. Aborting.\n");
742 : Informer::print_exit_info();
743 : sys::abort("");
744 : }
745 :
746 : const auto next_phase = PhaseControl::arbitrate_phase_change(
747 : make_not_null(&phase_change_decision_data_), current_phase_,
748 : *Parallel::local_branch(global_cache_proxy_));
749 : if (next_phase.has_value()) {
750 : // Only print info if there was an actual phase change.
751 : if (current_phase_ != next_phase.value()) {
752 : Parallel::printf("Entering phase from phase control: %s at time %s\n",
753 : next_phase.value(), sys::pretty_wall_time());
754 : current_phase_ = next_phase.value();
755 : }
756 : } else {
757 : const auto& default_order = Metavariables::default_phase_order;
758 : auto it = alg::find(default_order, current_phase_);
759 : using ::operator<<;
760 : if (it == std::end(default_order)) {
761 : ERROR("Cannot determine next phase as '"
762 : << current_phase_
763 : << "' is not in Metavariables::default_phase_order "
764 : << default_order << "\n");
765 : }
766 : if (std::next(it) == std::end(default_order)) {
767 : ERROR("Cannot determine next phase as '"
768 : << current_phase_
769 : << "' is last in Metavariables::default_phase_order "
770 : << default_order << "\n");
771 : }
772 : current_phase_ = *std::next(it);
773 :
774 : Parallel::printf("Entering phase: %s at time %s\n", current_phase_,
775 : sys::pretty_wall_time());
776 : }
777 : }
778 :
779 : if (Parallel::Phase::Exit == current_phase_) {
780 : check_if_component_terminated_correctly();
781 : return;
782 : }
783 : tmpl::for_each<component_list>([this](auto parallel_component) {
784 : tmpl::type_from<decltype(parallel_component)>::execute_next_phase(
785 : current_phase_, global_cache_proxy_);
786 : });
787 :
788 : // Here we handle phases with direct Charm++ calls. By handling these phases
789 : // after calling each component's execute_next_phase entry method, we ensure
790 : // that each component knows what phase it is in. This is useful for pup
791 : // functions that need special handling that depends on the phase.
792 : //
793 : // Note that in future versions of Charm++ it may become possible for pup
794 : // functions to have knowledge of the migration type. At that point, it
795 : // should no longer be necessary to wait until after
796 : // component::execute_next_phase to make the direct charm calls. Instead, the
797 : // load balance or checkpoint work could be initiated *before* the call to
798 : // component::execute_next_phase and *without* the need for a quiescence
799 : // detection. This may be a slight optimization.
800 : if (current_phase_ == Parallel::Phase::LoadBalancing) {
801 : CkStartQD(CkCallback(CkIndex_Main<Metavariables>::start_load_balance(),
802 : this->thisProxy));
803 : return;
804 : }
805 : if (current_phase_ == Parallel::Phase::WriteCheckpoint) {
806 : CkStartQD(CkCallback(CkIndex_Main<Metavariables>::start_write_checkpoint(),
807 : this->thisProxy));
808 : return;
809 : }
810 :
811 : // The general case simply returns to execute_next_phase
812 : CkStartQD(CkCallback(CkIndex_Main<Metavariables>::execute_next_phase(),
813 : this->thisProxy));
814 : }
815 :
816 : template <typename Metavariables>
817 : void Main<Metavariables>::start_load_balance() {
818 : at_sync_indicator_proxy_.IndicateAtSync();
819 : // No need for a callback to return to execute_next_phase: this is done by
820 : // ResumeFromSync instead.
821 : }
822 :
823 : template <typename Metavariables>
824 : void Main<Metavariables>::start_write_checkpoint() {
825 : // Reset the counter if the checkpoints directory does not exist.
826 : // This happens when the simulation continues in a new segment.
827 : const auto [checkpoints_dir, prefix, pad] = checkpoints_dir_prefix_pad();
828 : if (not file_system::check_if_dir_exists(checkpoints_dir)) {
829 : checkpoint_dir_counter_ = 0;
830 : }
831 : const std::string dir = next_checkpoint_dir();
832 : checkpoint_dir_counter_++;
833 : file_system::create_directory(dir);
834 : CkStartCheckpoint(
835 : dir.c_str(), CkCallback(CkIndex_Main<Metavariables>::execute_next_phase(),
836 : this->thisProxy));
837 : }
838 :
839 : template <typename Metavariables>
840 : template <typename InvokeCombine, typename... Tags>
841 : void Main<Metavariables>::phase_change_reduction(
842 : ReductionData<ReductionDatum<tuples::TaggedTuple<Tags...>, InvokeCombine,
843 : funcl::Identity, std::index_sequence<>>>
844 : reduction_data) {
845 : using tagged_tuple_type = std::decay_t<
846 : std::tuple_element_t<0, std::decay_t<decltype(reduction_data.data())>>>;
847 : (void)Parallel::charmxx::RegisterPhaseChangeReduction<
848 : Metavariables, InvokeCombine, Tags...>::registrar;
849 : static_assert(tt::is_a_v<tuples::TaggedTuple, tagged_tuple_type>,
850 : "The main chare expects a tagged tuple in the phase change "
851 : "reduction target.");
852 : reduction_data.finalize();
853 : PhaseControl::TaggedTupleMainCombine::apply(
854 : make_not_null(&phase_change_decision_data_),
855 : get<0>(reduction_data.data()));
856 : }
857 :
858 : template <typename Metavariables>
859 : void Main<Metavariables>::add_exception_message(std::string exception_message) {
860 : exception_messages_.push_back(std::move(exception_message));
861 : auto* global_cache = Parallel::local_branch(global_cache_proxy_);
862 : ASSERT(global_cache != nullptr, "Could not retrieve the local global cache.");
863 : // Set terminate_=true on all components to cause them to stop the current
864 : // phase.
865 : tmpl::for_each<component_list>([global_cache](auto component_tag_v) {
866 : using component_tag = tmpl::type_from<decltype(component_tag_v)>;
867 : Parallel::get_parallel_component<component_tag>(*global_cache)
868 : .set_terminate(true);
869 : });
870 : }
871 :
872 : template <typename Metavariables>
873 : void Main<Metavariables>::did_all_elements_terminate(
874 : const bool all_elements_terminated) {
875 : if (not all_elements_terminated) {
876 : tmpl::for_each<component_list>([this](auto component_tag_v) {
877 : using component_tag = tmpl::type_from<decltype(component_tag_v)>;
878 : if (tmpl::index_of<component_list, component_tag>::value ==
879 : current_termination_check_index_ - 1) {
880 : components_that_did_not_terminate_.push_back(
881 : pretty_type::name<component_tag>());
882 : }
883 : });
884 : }
885 : if (current_termination_check_index_ == tmpl::size<component_list>::value) {
886 : if (not components_that_did_not_terminate_.empty()) {
887 : using ::operator<<;
888 : // Need the MakeString to avoid GCC compilation failure that it can't
889 : // print out the vector...
890 : Parallel::printf(
891 : "\n############ ERROR ############\n"
892 : "The following components did not terminate cleanly:\n"
893 : "%s\n\n"
894 : "This means the executable stopped because of a hang/deadlock.\n"
895 : "############ ERROR ############\n\n",
896 : std::string{MakeString{} << components_that_did_not_terminate_});
897 : if constexpr (detail::is_run_deadlock_analysis_simple_actions_callable_v<
898 : Metavariables, Parallel::GlobalCache<Metavariables>&,
899 : const std::vector<std::string>&>) {
900 : Parallel::printf("Starting deadlock analysis.\n");
901 : Metavariables::run_deadlock_analysis_simple_actions(
902 : *Parallel::local_branch(global_cache_proxy_),
903 : components_that_did_not_terminate_);
904 : CkStartQD(CkCallback(
905 : CkIndex_Main<Metavariables>::post_deadlock_analysis_termination(),
906 : this->thisProxy));
907 : return;
908 : } else {
909 : Parallel::printf(
910 : "No deadlock analysis function found in metavariables. To enable "
911 : "deadlock analysis via simple actions add a function:\n"
912 : " static void run_deadlock_analysis_simple_actions(\n"
913 : " Parallel::GlobalCache<metavariables>& cache,\n"
914 : " const std::vector<std::string>& deadlocked_components);\n"
915 : "to your metavariables.\n");
916 : }
917 : }
918 : post_deadlock_analysis_termination();
919 : }
920 :
921 : check_if_component_terminated_correctly();
922 : }
923 :
924 : template <typename Metavariables>
925 : void Main<Metavariables>::check_if_component_terminated_correctly() {
926 : auto* global_cache = Parallel::local_branch(global_cache_proxy_);
927 : ASSERT(global_cache != nullptr, "Could not retrieve the local global cache.");
928 :
929 : tmpl::for_each<component_list>([global_cache, this](auto component_tag_v) {
930 : using component_tag = tmpl::type_from<decltype(component_tag_v)>;
931 : if (tmpl::index_of<component_list, component_tag>::value ==
932 : current_termination_check_index_) {
933 : Parallel::get_parallel_component<component_tag>(*global_cache)
934 : .contribute_termination_status_to_main();
935 : }
936 : });
937 : current_termination_check_index_++;
938 : }
939 :
940 : template <typename Metavariables>
941 : void Main<Metavariables>::post_deadlock_analysis_termination() {
942 : Informer::print_exit_info();
943 : if (not components_that_did_not_terminate_.empty()) {
944 : sys::abort("");
945 : } else {
946 : const Parallel::ExitCode exit_code =
947 : get<Tags::ExitCode>(phase_change_decision_data_);
948 : sys::exit(static_cast<int>(exit_code));
949 : }
950 : }
951 :
952 : template <typename Metavariables>
953 : std::tuple<std::string, std::string, size_t>
954 : Main<Metavariables>::checkpoints_dir_prefix_pad() const {
955 : const std::string checkpoints_dir = "Checkpoints";
956 : const std::string prefix = "Checkpoint_";
957 : constexpr size_t pad = 4;
958 : return std::make_tuple(checkpoints_dir, prefix, pad);
959 : }
960 :
961 : template <typename Metavariables>
962 : std::string Main<Metavariables>::next_checkpoint_dir() const {
963 : const auto [checkpoints_dir, prefix, pad] = checkpoints_dir_prefix_pad();
964 : const std::string counter = std::to_string(checkpoint_dir_counter_);
965 : const std::string padded_counter =
966 : std::string(pad - counter.size(), '0').append(counter);
967 : const std::string result = checkpoints_dir + "/" + prefix + padded_counter;
968 : if (file_system::check_if_dir_exists(result)) {
969 : ERROR("Can't write checkpoint: dir " + result + " already exists!");
970 : }
971 : return result;
972 : }
973 :
974 : template <typename Metavariables>
975 : void Main<Metavariables>::check_future_checkpoint_dirs_available() const {
976 : const auto [checkpoints_dir, prefix, pad] = checkpoints_dir_prefix_pad();
977 : if (not file_system::check_if_dir_exists(checkpoints_dir)) {
978 : return;
979 : }
980 : const auto next_checkpoint = next_checkpoint_dir();
981 :
982 : // Find existing files with names that match the checkpoint dir name pattern
983 : const auto all_files = file_system::ls(checkpoints_dir);
984 : const std::regex re(prefix + "[0-9]{" + std::to_string(pad) + "}");
985 : std::vector<std::string> checkpoint_files;
986 : std::copy_if(all_files.begin(), all_files.end(),
987 : std::back_inserter(checkpoint_files),
988 : [&re](const std::string& s) { return std::regex_match(s, re); });
989 :
990 : // Using string comparison of filenames, check that all the files we found
991 : // are from older checkpoints, but not from future checkpoints
992 : const bool found_older_checkpoints_only = std::all_of(
993 : checkpoint_files.begin(), checkpoint_files.end(),
994 : [&next_checkpoint](const std::string& s) { return s < next_checkpoint; });
995 : if (not found_older_checkpoints_only) {
996 : ERROR(
997 : "Can't start run: found checkpoints that may be overwritten!\n"
998 : "Dirs from "
999 : << next_checkpoint << " onward must not exist.\n");
1000 : }
1001 : }
1002 :
1003 : } // namespace Parallel
1004 :
1005 0 : #define CK_TEMPLATES_ONLY
1006 : #include "Parallel/Main.def.h"
1007 : #undef CK_TEMPLATES_ONLY
|