Line data Source code
1 0 : // Distributed under the MIT License. 2 : // See LICENSE.txt for details. 3 : 4 : #pragma once 5 : 6 : #include <array> 7 : #include <cstddef> 8 : #include <optional> 9 : #include <string> 10 : #include <unordered_map> 11 : #include <unordered_set> 12 : #include <utility> 13 : #include <vector> 14 : 15 : #include "Options/Options.hpp" 16 : #include "Options/ParseError.hpp" 17 : #include "Utilities/TypeTraits/CreateGetStaticMemberVariableOrDefault.hpp" 18 : 19 : /// \cond 20 : template <size_t Dim> 21 : class Block; 22 : 23 : template <size_t Dim> 24 : class ElementId; 25 : 26 : namespace Spectral { 27 : enum class Basis : uint8_t; 28 : enum class Quadrature : uint8_t; 29 : } // namespace Spectral 30 : /// \endcond 31 : 32 : namespace domain { 33 : /// The weighting scheme for assigning computational costs to `Element`s for 34 : /// distributing balanced compuational costs per processor (see 35 : /// `BlockZCurveProcDistribution`) 36 1 : enum class ElementWeight { 37 : /// A weighting scheme where each `Element` is assigned the same computational 38 : /// cost 39 : Uniform, 40 : /// A weighting scheme where each `Element`'s computational cost is equal to 41 : /// the number of grid points in that `Element` 42 : NumGridPoints, 43 : /// A weighting scheme where each `Element`'s computational cost is weighted 44 : /// by both the number of grid points and minimum spacing between grid points 45 : /// in that `Element` (see `get_num_points_and_grid_spacing_cost()` for 46 : /// details) 47 : NumGridPointsAndGridSpacing 48 : }; 49 : 50 0 : std::ostream& operator<<(std::ostream& os, ElementWeight weight); 51 : 52 : /// \brief Get the cost of each `Element` in a list of `Block`s where 53 : /// `element_weight` specifies which weight distribution scheme to use 54 : /// 55 : /// \details It is only necessary to pass in a value for `i1_basis` and 56 : /// `i1_quadrature` if the value for `element_weight` is 57 : /// `ElementWeight::NumGridPointsAndGridSpacing`. Otherwise, the argument isn't 58 : /// needed and will have no effect if it does have a value. 59 : template <size_t Dim> 60 1 : std::unordered_map<ElementId<Dim>, double> get_element_costs( 61 : const std::vector<Block<Dim>>& blocks, 62 : const std::vector<std::array<size_t, Dim>>& initial_refinement_levels, 63 : const std::vector<std::array<size_t, Dim>>& initial_extents, 64 : ElementWeight element_weight, 65 : const std::optional<Spectral::Basis>& i1_basis, 66 : const std::optional<Spectral::Quadrature>& i1_quadrature); 67 : 68 : /*! 69 : * \brief Distribution strategy for assigning elements to CPUs using a 70 : * Morton ('Z-order') space-filling curve to determine placement within each 71 : * block, where `Element`s are distributed across CPUs 72 : * 73 : * \details The element distribution attempts to assign a balanced total 74 : * computational cost to each processor that is allowed to have `Element`s. 75 : * First, each `Block`'s `Element`s are ordered by their Z-curve index (see more 76 : * below). `Element`s are traversed in this order and assigned to CPUs in order, 77 : * moving onto the next CPU once the target cost per CPU is met. The target cost 78 : * per CPU is defined as the remaining cost to distribute divided by the 79 : * remaining number of CPUs to distribute to. This is an important distinction 80 : * from simply having one constant target cost per CPU defined as the total cost 81 : * divided by the total number of CPUs with elements. Since the total cost of 82 : * `Element`s on a processor will nearly never add up to be exactly the average 83 : * cost per CPU, this means that we would either have to decide to overshoot or 84 : * undershoot the average as we iterate over the CPUs and assign `Element`s. If 85 : * we overshoot the average on each processor, the final processor could have a 86 : * much lower cost than the rest of the processors and we run the risk of 87 : * overshooting so much that one or more of the requested processors don't get 88 : * assigned any `Element`s at all. If we undershoot the average on each 89 : * processor, the final processor could have a much higher cost than the others 90 : * due to remainder cost piling up. This algorithm avoids these risks by instead 91 : * adjusting the target cost per CPU as we finish assigning cost to previous 92 : * CPUs. 93 : * 94 : * Morton curves are a simple and easily-computed space-filling curve that 95 : * (unlike Hilbert curves) permit diagonal traversal. See, for instance, 96 : * \cite Borrell2018 for a discussion of mesh partitioning using space-filling 97 : * curves. 98 : * A concrete example of the use of a Morton curve in 2d is given below. 99 : * 100 : * A sketch of a 2D block with 4x2 elements, with each element labeled according 101 : * to the order on the Morton curve: 102 : * ``` 103 : * x--> 104 : * 0 1 2 3 105 : * ---------------- 106 : * y 0 | 0 2 4 6 107 : * | | | / | / | / | 108 : * v 1 | 1 3 5 7 109 : * ``` 110 : * (forming a zig-zag path, that under some rotation/reflection has a 'Z' 111 : * shape). 112 : * 113 : * The Morton curve method is a quick way of getting acceptable spatial locality 114 : * -- usually, for approximately even distributions, it will ensure that 115 : * elements are assigned in large volume chunks, and the structure of the Morton 116 : * curve ensures that for a given processor and block, the elements will be 117 : * assigned in no more than two orthogonally connected clusters. In principle, a 118 : * Hilbert curve could potentially improve upon the gains obtained by this class 119 : * by guaranteeing that all elements within each block form a single 120 : * orthogonally connected cluster. 121 : * 122 : * The assignment of portions of blocks to processors may use partial blocks, 123 : * and/or multiple blocks to ensure an even distribution of elements to 124 : * processors. 125 : * We currently make no distinction between dividing elements between processors 126 : * within a node and dividing elements between processors across nodes. The 127 : * current technique aims to have a simple method of reducing communication 128 : * globally, though it would likely be more efficient to prioritize minimization 129 : * of inter-node communication, because communication across interconnects is 130 : * the primary cost of communication in charm++ runs. 131 : * 132 : * \warning The use of the Morton curve to generate a well-clustered element 133 : * distribution currently assumes that the refinement is uniform over each 134 : * block, with no internal structure that would be generated by, for instance 135 : * AMR. 136 : * This distribution method will need alteration to perform well for blocks with 137 : * internal structure from h-refinement. Morton curves can be defined 138 : * recursively, so a generalization of the present method is possible for blocks 139 : * with internal refinement 140 : * 141 : * \tparam Dim the number of spatial dimensions of the `Block`s 142 : */ 143 : template <size_t Dim> 144 1 : struct BlockZCurveProcDistribution { 145 0 : BlockZCurveProcDistribution() = default; 146 : 147 : /// The `number_of_procs_with_elements` argument represents how many procs 148 : /// will have elements. This is not necessarily equal to the total number of 149 : /// procs because some global procs may be ignored by the sixth argument 150 : /// `global_procs_to_ignore`. 151 1 : BlockZCurveProcDistribution( 152 : const std::unordered_map<ElementId<Dim>, double>& element_costs, 153 : size_t number_of_procs_with_elements, 154 : const std::vector<Block<Dim>>& blocks, 155 : const std::vector<std::array<size_t, Dim>>& initial_refinement_levels, 156 : const std::vector<std::array<size_t, Dim>>& initial_extents, 157 : const std::unordered_set<size_t>& global_procs_to_ignore = {}); 158 : 159 : /// Gets the suggested processor number for a particular `ElementId`, 160 : /// determined by the Morton curve weighted element assignment described in 161 : /// detail in the parent class documentation. 162 1 : size_t get_proc_for_element(const ElementId<Dim>& element_id) const; 163 : 164 : const std::vector<std::vector<std::pair<size_t, size_t>>>& 165 0 : block_element_distribution() const { 166 : return block_element_distribution_; 167 : } 168 : 169 : private: 170 : // in this nested data structure: 171 : // - The block id is the first index 172 : // - There is an arbitrary number of CPUs per block, each with an element 173 : // allowance 174 : // - Each element allowance is represented by a pair of proc number, number of 175 : // elements in the allowance 176 : std::vector<std::vector<std::pair<size_t, size_t>>> 177 0 : block_element_distribution_; 178 : }; 179 : } // namespace domain 180 : 181 : namespace element_weight_detail { 182 : CREATE_GET_STATIC_MEMBER_VARIABLE_OR_DEFAULT(local_time_stepping) 183 : } // namespace element_weight_detail 184 : 185 : template <> 186 0 : struct Options::create_from_yaml<domain::ElementWeight> { 187 : template <typename Metavariables> 188 0 : static domain::ElementWeight create(const Options::Option& options) { 189 : const auto ordering = options.parse_as<std::string>(); 190 : if (ordering == "Uniform") { 191 : return domain::ElementWeight::Uniform; 192 : } else if (ordering == "NumGridPoints") { 193 : return domain::ElementWeight::NumGridPoints; 194 : } else if (ordering == "NumGridPointsAndGridSpacing") { 195 : if constexpr (not element_weight_detail:: 196 : get_local_time_stepping_or_default_v<Metavariables, 197 : false>) { 198 : PARSE_ERROR( 199 : options.context(), 200 : "When not using local time stepping, you cannot use " 201 : "NumGridPointsAndGridSpacing for the element distribution. Please " 202 : "choose another element distribution."); 203 : } 204 : return domain::ElementWeight::NumGridPointsAndGridSpacing; 205 : } 206 : PARSE_ERROR(options.context(), 207 : "ElementWeight must be 'Uniform', 'NumGridPoints', or, " 208 : "'NumGridPointsAndGridSpacing'"); 209 : } 210 : };