Zoltan2
Loading...
Searching...
No Matches
Zoltan2_AlgMultiJagged.hpp
Go to the documentation of this file.
1// @HEADER
2//
3// ***********************************************************************
4//
5// Zoltan2: A package of combinatorial algorithms for scientific computing
6// Copyright 2012 Sandia Corporation
7//
8// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9// the U.S. Government retains certain rights in this software.
10//
11// Redistribution and use in source and binary forms, with or without
12// modification, are permitted provided that the following conditions are
13// met:
14//
15// 1. Redistributions of source code must retain the above copyright
16// notice, this list of conditions and the following disclaimer.
17//
18// 2. Redistributions in binary form must reproduce the above copyright
19// notice, this list of conditions and the following disclaimer in the
20// documentation and/or other materials provided with the distribution.
21//
22// 3. Neither the name of the Corporation nor the names of the
23// contributors may be used to endorse or promote products derived from
24// this software without specific prior written permission.
25//
26// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37//
38// Questions? Contact Karen Devine (kddevin@sandia.gov)
39// Erik Boman (egboman@sandia.gov)
40// Siva Rajamanickam (srajama@sandia.gov)
41//
42// ***********************************************************************
43//
44// @HEADER
49#ifndef _ZOLTAN2_ALGMultiJagged_HPP_
50#define _ZOLTAN2_ALGMultiJagged_HPP_
51
55#include <Zoltan2_Algorithm.hpp>
58#include <Zoltan2_Util.hpp>
59#include <Tpetra_Distributor.hpp>
60#include <Teuchos_StandardParameterEntryValidators.hpp>
61#include <Teuchos_ParameterList.hpp>
62#include <Kokkos_Sort.hpp>
63
64#include <algorithm> // std::sort
65#include <vector>
66#include <unordered_map>
67
68#ifdef ZOLTAN2_USEZOLTANCOMM
69#ifdef HAVE_ZOLTAN2_MPI
70#define ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
71#include "zoltan_comm_cpp.h"
72#include "zoltan_types.h" // for error codes
73#endif
74#endif
75
76namespace Teuchos{
77
81template <typename Ordinal, typename T>
82class Zoltan2_BoxBoundaries : public ValueTypeReductionOp<Ordinal,T>
83{
84private:
85 Ordinal size;
86 T epsilon;
87
88public:
92 epsilon(std::numeric_limits<T>::epsilon()) {}
93
98 size(s_), epsilon(std::numeric_limits<T>::epsilon()) {}
99
105 void reduce( const Ordinal count, const T inBuffer[], T inoutBuffer[]) const {
106 for(Ordinal i = 0; i < count; i++) {
107 if(Z2_ABS(inBuffer[i]) > epsilon) {
108 inoutBuffer[i] = inBuffer[i];
109 }
110 }
111 }
112};
113
114} // namespace Teuchos
115
116namespace Zoltan2{
117
124template <typename IT, typename CT, typename WT>
126{
127public:
128 // TODO: Why volatile?
129 // no idea, another intel compiler failure.
130 volatile IT index;
131 volatile CT count;
132 volatile WT *val;
133 volatile WT epsilon;
134
136 this->index = 0;
137 this->count = 0;
138 this->val = NULL;
139 this->epsilon = std::numeric_limits<WT>::epsilon() * 100;
140 }
141
142 // TODO: Document these methods?
143 uMultiSortItem(IT index_ ,CT count_, WT *vals_) {
144 this->index = index_;
145 this->count = count_;
146 this->val = vals_;
147 this->epsilon = std::numeric_limits<WT>::epsilon() * 100;
148 }
149
151 }
152
153 void set(IT index_ ,CT count_, WT *vals_) {
154 this->index = index_;
155 this->count = count_;
156 this->val = vals_;
157 }
158
159 bool operator<(const uMultiSortItem<IT,CT,WT>& other) const {
160 assert(this->count == other.count);
161 for(CT i = 0; i < this->count; ++i) {
162 // if the values are equal go to next one.
163 if(std::abs(this->val[i] - other.val[i]) < this->epsilon) {
164 continue;
165 }
166 // if next value is smaller return true;
167 if(this->val[i] < other.val[i]) {
168 return true;
169 }
170 // if next value is bigger return false;
171 else {
172 return false;
173 }
174 }
175 // if they are totally equal.
176 return this->index < other.index;
177 }
178};
179
/*! \brief Sort item pairing an identifier with the value used as sort key.
 */
template <class IT, class WT>
struct uSortItem
{
  IT id;   // identifier carried along with the key
  WT val;  // key the items are ordered by
};

/*! \brief Sorts arr[0..n-1] in place into ascending order of val.
 *
 * Non-recursive quicksort with median-of-three pivot selection; sub-arrays
 * shorter than M elements are finished with insertion sort. Internally the
 * array is addressed with 1-based indices (hence the --arr below).
 * If the explicit stack of pending sub-arrays overflows, a message is printed
 * and std::terminate() is called.
 *
 * \param n number of items in arr.
 * \param arr items to sort; reordered in place.
 */
template <class IT, class WT>
void uqsort(IT n, uSortItem<IT, WT> * arr) {
  const int NSTACK = 50;
  const int M = 7;
  IT i, ir=n, j, k, l=1;
  // The stack is addressed 1..NSTACK, so it needs NSTACK + 1 slots; with only
  // NSTACK slots a push at jstack == NSTACK would write past the end.
  IT jstack=0, istack[NSTACK + 1];
  WT aval;
  uSortItem<IT, WT> a;

  --arr;
  for(;;) {
    // Written as ir < M + l (not ir - l < M) so that an empty range does not
    // wrap around when IT is an unsigned type.
    if(ir < M + l) {
      // small sub-array: straight insertion
      for(j=l+1;j<=ir;j++) {
        a=arr[j];
        aval = a.val;
        for(i=j-1;i>=1;i--) {
          if(arr[i].val <= aval)
            break;
          arr[i+1] = arr[i];
        }
        arr[i+1]=a;
      }
      if(jstack == 0)
        break;
      // pop the next pending sub-array
      ir=istack[jstack--];
      l=istack[jstack--];
    }
    else {
      // median of first, middle and last element becomes the pivot at arr[l]
      k=(l+ir) >> 1;
      std::swap(arr[k],arr[l+1]);
      if(arr[l+1].val > arr[ir].val) {
        std::swap(arr[l+1],arr[ir]);
      }
      if(arr[l].val > arr[ir].val) {
        std::swap(arr[l],arr[ir]);
      }
      if(arr[l+1].val > arr[l].val) {
        std::swap(arr[l+1],arr[l]);
      }
      i=l+1;
      j=ir;
      a=arr[l];
      aval = a.val;
      for(;;) {
        do i++; while (arr[i].val < aval);
        do j--; while (arr[j].val > aval);
        if(j < i) break;
        std::swap(arr[i],arr[j]);
      }
      arr[l]=arr[j];
      arr[j]=a;
      jstack += 2;
      if(jstack > NSTACK) {
        std::cout << "uqsort: NSTACK too small in sort." << std::endl;
        std::terminate();
      }
      // push the larger sub-array, continue with the smaller one
      if(ir-i+1 >= j-l) {
        istack[jstack]=ir;
        istack[jstack-1]=i;
        ir=j-1;
      }
      else {
        istack[jstack]=j-1;
        istack[jstack-1]=l;
        l=i;
      }
    }
  }
}
262
263template <class IT, class WT, class SIGN>
265{
266 IT id;
267 WT val;
268 SIGN signbit; // 1 means positive, 0 means negative.
270 /*if I am negative, the other is positive*/
271 if(this->signbit < rhs.signbit) {
272 return true;
273 }
274 /*if both has the same sign*/
275 else if(this->signbit == rhs.signbit) {
276 if(this->val < rhs.val) {//if my value is smaller,
277 return this->signbit;//then if we both are positive return true.
278 //if we both are negative, return false.
279 }
280 else if(this->val > rhs.val) {//if my value is larger,
281 return !this->signbit; //then if we both are positive return false.
282 //if we both are negative, return true.
283 }
284 else { //if both are equal.
285 return false;
286 }
287 }
288 else {
289 /*if I am positive, the other is negative*/
290 return false;
291 }
292 }
293
295 return (this->val == rhs.val && this->signbit == rhs.signbit) || (*this < rhs);
296 }
297};
298
302template <class IT, class WT, class SIGN>
304 const IT NSTACK = 50;
305 IT M = 7;
306 IT i, ir=n, j, k, l=1;
307 IT jstack=0, istack[NSTACK];
309
310 --arr;
311 for(;;) {
312 if(ir < M + l) {
313 for(j=l+1;j<=ir;j++) {
314 a=arr[j];
315 for(i=j-1;i>=1;i--) {
316 if(arr[i] <= a) {
317 break;
318 }
319 arr[i+1] = arr[i];
320 }
321 arr[i+1]=a;
322 }
323 if(jstack == 0) {
324 break;
325 }
326 ir=istack[jstack--];
327 l=istack[jstack--];
328 }
329 else {
330 k=(l+ir) >> 1;
331 std::swap(arr[k],arr[l+1]);
332 if(arr[ir] < arr[l+1]) {
333 std::swap(arr[l+1],arr[ir]);
334 }
335 if(arr[ir] < arr[l] ) {
336 std::swap(arr[l],arr[ir]);
337 }
338 if(arr[l] < arr[l+1]) {
339 std::swap(arr[l+1],arr[l]);
340 }
341 i=l+1;
342 j=ir;
343 a=arr[l];
344 for(;;) {
345 do i++; while (arr[i] < a);
346 do j--; while (a < arr[j]);
347 if(j < i) break;
348 std::swap(arr[i],arr[j]);
349 }
350 arr[l]=arr[j];
351 arr[j]=a;
352 jstack += 2;
353 if(jstack > NSTACK) {
354 std::cout << "uqsort: NSTACK too small in sort." << std::endl;
355 std::terminate();
356 }
357 if(ir+l+1 >= j+i) {
358 istack[jstack]=ir;
359 istack[jstack-1]=i;
360 ir=j-1;
361 }
362 else {
363 istack[jstack]=j-1;
364 istack[jstack-1]=l;
365 l=i;
366 }
367 }
368 }
369}
370
371// This exists only so we can track how many times the MJ algorithm is
372// called and put each of those into different timer names.
373// Currently the MultiJaggedTest.cpp will actually call it twice.
374// First time with data from a Tpetra MultiVector and then a second time using
375// a BasicVectorAdapter which allows us to turn UVM off for some tests. The
376// results of the two runs are compared which helps to catch a lot of bugs. For
377// profiling I'm mostly just interested in the UVM off case and need it to be
378// in separate timers. Passing a value through would mess up the API. Possibly
379// we could check the Adapter and use that. The statics have to be outside the
380// templated class as the two called instances will be different template
381// parameters. Another complication is that MultiJagged.cpp will call through
382// the Zoltan2_AlgMJ class and we want to time things in both classes. However
383// TaskMapper will directly call AlgMJ so I made two counters for the two
384// classes to make sure it was always correct. This does not impact any
385// behavior and has the sole purpose of generating unique timer names. If you
386// run an MJ test you'll see MJ(0) and MJ(1) in the names to distinguish the
387// 1st and 2nd run. Right now only MultijaggedTest.cpp cares about this.
/*! \brief Returns how many times this function has been called before.
 *
 * The first call returns 0, the next 1, and so on; the count lives in a
 * function-local static.
 */
static int get_counter_AlgMJ() {
  static int calls_so_far = 0;
  const int this_call = calls_so_far;
  ++calls_so_far;
  return this_call;
}
394 static int counter = 0;
395 return counter++;
396 }
397};
398
401template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
402 typename mj_part_t, typename mj_node_t>
403class AlgMJ
404{
405private:
406 typedef typename mj_node_t::device_type device_t; // for views
408 typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
409
410 //if the (last dimension reduce all count) x the mpi world size
411 //estimated to be bigger than this number then migration will be forced
412 //in earlier iterations.
413 static constexpr size_t future_reduceall_cutoff = 1500000;
414
415 //if parts right before last dimension are estimated to have less than
416 //MIN_WORK_LAST_DIM many coords, migration will be forced in earlier iterations.
417 static constexpr mj_lno_t min_work_last_dim = 1000;
418
419 static constexpr mj_scalar_t least_signifiance = 0.0001;
420 static constexpr int significance_mul = 1000;
421
422 std::string mj_timer_base_string; // for convenience making timer names
423
424 RCP<const Environment> mj_env; // the environment object
425 RCP<const Comm<int> > mj_problemComm; // initial comm object
426 RCP<Comm<int> > comm; // comm object than can be altered during execution
427 double imbalance_tolerance; // input imbalance tolerance.
428 int recursion_depth; // number of steps that partitioning will be solved in.
429 int coord_dim; // coordinate dim
430 int num_weights_per_coord; // # of weights per coord
431 size_t initial_num_loc_coords; // initial num local coords.
432 global_size_t initial_num_glob_coords; // initial num global coords.
433 mj_lno_t num_local_coords; // number of local coords.
434 mj_gno_t num_global_coords; // number of global coords.
435 mj_scalar_t sEpsilon; // epsilon for mj_scalar_t
436
437 // can distribute points on the same coordinate to different parts.
438 bool distribute_points_on_cut_lines;
439
440 // how many parts we can calculate concurrently.
441 mj_part_t max_concurrent_part_calculation;
442
443 bool mj_run_as_rcb; // means recursion depth is adjusted to maximum value.
444 int mj_user_recursion_depth; // the recursion depth value provided by user.
445 bool mj_keep_part_boxes; // if the boxes need to be kept.
446
447 // whether to migrate=1, avoid migrate=2, or leave decision to MJ=0
448 int check_migrate_avoid_migration_option;
449
450 // when doing the migration, 0 will aim for perfect load balance, 1 will
451 // aim for minimized number of messages with possibly bad load imbalance
452 int migration_type;
453
454 // when MJ decides whether to migrate, the minimum imbalance for migration.
455 double minimum_migration_imbalance;
456
457 // Nonuniform first level partitioning
458 // (Currently available only for sequential_task_partitioning):
459 // Used for Dragonfly task mapping by partitioning Dragonfly RCA
460 // machine coordinates and application coordinates.
461 // An optimization that completely partitions the most important machine dimension
462 // first (i.e. the Dragonfly group coordinate, or RCA's x coordinate). The standard
463 // MJ alg follows after the nonuniform first level partitioning.
464 //
465 // Ex. (first level partitioning): If we have 120 elements,
466 // num_first_level_parts = 3, first_level_distribution = [4, 10, 6], then
467 // part sizes after first level will be [24, 60, 36]. Standard uniform MJ
468 // continues for all subsequent levels.
469
470 // If used, number of parts requested for a nonuniform
471 // first level partitioning
472 mj_part_t num_first_level_parts;
473
474 // If used, the requested distribution of parts for the
475 // nonuniform first level partitioning
476 Kokkos::View<mj_part_t*, Kokkos::HostSpace> first_level_distribution;
477
478 mj_part_t total_num_cut ; // how many cuts will be totally
479 mj_part_t total_num_part; // how many parts will be totally
480
481 mj_part_t max_num_part_along_dim ; // maximum part count along a dimension.
482 mj_part_t max_num_cut_along_dim; // maximum cut count along a dimension.
483
484 // maximum part+cut count along a dimension.
485 size_t max_num_total_part_along_dim;
486
487 mj_part_t total_dim_num_reduce_all; // estimate on #reduceAlls can be done.
488
489 // max no of parts that might occur during the partition before the last
490 // partitioning dimension.
491 mj_part_t last_dim_num_part;
492
493 // input part array specifying num part to divide along each dim.
494 Kokkos::View<mj_part_t *, Kokkos::HostSpace> part_no_array;
495
496 // two dimension coordinate array
497 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
498 Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t>
499 mj_coordinates;
500
501 // two dimension weight array
502 Kokkos::View<mj_scalar_t **, device_t> mj_weights;
503
504 // if the target parts are uniform
505 Kokkos::View<bool *, Kokkos::HostSpace> mj_uniform_parts;
506
507 // if the coordinates have uniform weights
508 Kokkos::View<bool *, Kokkos::HostSpace> mj_uniform_weights;
509
510 int mj_num_teams; // the number of teams
511
512 size_t num_global_parts; // the targeted number of parts
513
514 // vector of all boxes for all parts, constructed if mj_keep_part_boxes true
515 RCP<mj_partBoxVector_t> kept_boxes;
516
517 RCP<mj_partBox_t> global_box;
518
519 int myRank; // processor rank
520 int myActualRank; // initial rank
521
522 bool divide_to_prime_first;
523
524 // initial global ids of the coordinates.
525 Kokkos::View<const mj_gno_t*, device_t> initial_mj_gnos;
526
527 // current global ids of the coordinates, might change during migration.
528 Kokkos::View<mj_gno_t*, device_t> current_mj_gnos;
529
530 // the actual processor owner of the coordinate, to track after migrations.
531 Kokkos::View<int*, Kokkos::HostSpace> owner_of_coordinate;
532
533 // permutation of coordinates, for partitioning.
534 Kokkos::View<mj_lno_t*, device_t> coordinate_permutations;
535
536 // permutation work array.
537 Kokkos::View<mj_lno_t*, device_t> new_coordinate_permutations;
538
539 // the part ids assigned to coordinates.
540 Kokkos::View<mj_part_t*, device_t> assigned_part_ids;
541
542 // beginning and end of each part.
543 Kokkos::View<mj_lno_t *, device_t> part_xadj;
544
545 // work array for beginning and end of each part.
546 Kokkos::View<mj_lno_t *, device_t> new_part_xadj;
547
548 Kokkos::View<mj_scalar_t *, device_t> all_cut_coordinates;
549
550 // how much weight should a MPI put left side of the each cutline
551 Kokkos::View<mj_scalar_t *, device_t>
552 process_cut_line_weight_to_put_left;
553
554 // weight percentage each thread in MPI puts on the left side of each cutline
555 Kokkos::View<mj_scalar_t *, device_t>
556 thread_cut_line_weight_to_put_left;
557
558 // work array to manipulate coordinate of cutlines in different iterations.
559 // necessary because previous cut line information is used for determining
560 // the next cutline information. therefore, cannot update the cut work array
561 // until all cutlines are determined.
562 Kokkos::View<mj_scalar_t *, device_t> cut_coordinates_work_array;
563
564 // Used for swapping above cut_coordinates_work_array
565 Kokkos::View<mj_scalar_t *, device_t> temp_cut_coords;
566
567 // cumulative part weight array.
568 Kokkos::View<mj_scalar_t *, device_t> target_part_weights;
569
570 // upper bound coordinate of a cut line
571 Kokkos::View<mj_scalar_t *, device_t> cut_upper_bound_coordinates;
572
573 // lower bound coordinate of a cut line
574 Kokkos::View<mj_scalar_t *, device_t> cut_lower_bound_coordinates;
575
576 // lower bound weight of a cut line
577 Kokkos::View<mj_scalar_t *, device_t> cut_lower_bound_weights;
578
579 // upper bound weight of a cut line
580 Kokkos::View<mj_scalar_t *, device_t> cut_upper_bound_weights;
581
582 // combined array to exchange the min and max coordinate, and total
583 // weight of part.
584 Kokkos::View<mj_scalar_t *, device_t>
585 process_local_min_max_coord_total_weight;
586
587 // global combined array with the results for min, max and total weight.
588 Kokkos::View<mj_scalar_t *, device_t>
589 global_min_max_coord_total_weight;
590
591 // isDone is used to determine if a cutline is determined already. If a cut
592 // line is already determined, the next iterations will skip this cut line.
593 Kokkos::View<bool *, device_t> is_cut_line_determined;
594
595 // incomplete_cut_count count holds the number of cutlines that have not
596 // been finalized for each part when concurrentPartCount>1, using this
597 // information, if incomplete_cut_count[x]==0, then no work is done
598 // for this part.
599 Kokkos::View<mj_part_t *, device_t> device_incomplete_cut_count;
600 typename decltype(device_incomplete_cut_count)::HostMirror
601 incomplete_cut_count;
602
603 // Need a quick accessor for this on host
604 typename decltype (part_xadj)::HostMirror host_part_xadj;
605
606 // local part weights of each thread.
607 Kokkos::View<double *, device_t>
608 thread_part_weights;
609
610 // the work manipulation array for part weights.
611 Kokkos::View<double *, device_t>
612 thread_part_weight_work;
613
614 // thread_cut_left_closest_point to hold the closest coordinate
615 // to a cutline from left (for each thread).
616 Kokkos::View<mj_scalar_t *, device_t>
617 thread_cut_left_closest_point;
618
619 // thread_cut_right_closest_point to hold the closest coordinate
620 // to a cutline from right (for each thread)
621 Kokkos::View<mj_scalar_t *, device_t>
622 thread_cut_right_closest_point;
623
624 // to store how many points in each part a thread has.
625 Kokkos::View<mj_lno_t *, device_t>
626 thread_point_counts;
627
628 Kokkos::View<mj_scalar_t *, device_t> process_rectilinear_cut_weight;
629 Kokkos::View<mj_scalar_t *, device_t> global_rectilinear_cut_weight;
630
631 // for faster communication, concatenation of
632 // totalPartWeights sized 2P-1, since there are P parts and P-1 cut lines
633 // leftClosest distances sized P-1, since P-1 cut lines
634 // rightClosest distances size P-1, since P-1 cut lines.
635 Kokkos::View<mj_scalar_t *, device_t>
636 total_part_weight_left_right_closests;
637 Kokkos::View<mj_scalar_t *, device_t>
638 global_total_part_weight_left_right_closests;
639
640 Kokkos::View<mj_part_t*, device_t> device_num_partitioning_in_current_dim;
641 typename decltype(device_num_partitioning_in_current_dim)::HostMirror
642 host_num_partitioning_in_current_dim; // for quick access on host
643
644 /* \brief helper function to calculate imbalance.
645 * \param achieved balance we achieved.
646 * \param expected balance expected.
647 */
648 static
649 KOKKOS_INLINE_FUNCTION
650 double calculate_imbalance(mj_scalar_t achieved, mj_scalar_t expected) {
651 return static_cast<double>(achieved) / static_cast<double>(expected) - 1.0;
652 }
653
654 /* \brief Either the mj array (part_no_array) or num_global_parts should be
655 * provided in the input. part_no_array takes precedence if both are
656 * provided. Depending on these parameters, total cut/part number, maximum
657 * part/cut number along a dimension, estimated number of reduceAlls,
658 * and the number of parts before the last dimension is calculated.
659 * */
660 void set_part_specifications();
661
662 /* \brief Tries to determine the part number for current dimension,
663 * by trying to make the partitioning as square as possible.
664 * \param num_total_future how many more partitionings are required.
665 * \param root how many more recursion depth is left.
666 */
667 inline mj_part_t get_part_count(
668 mj_part_t num_total_future,
669 double root);
670
671 /* \brief for part communication we keep track of the box boundaries.
672 * This is performed when either asked specifically, or when geometric
673 * mapping is performed afterwards. This function initializes a single box
674 * with all global min and max coordinates.
675 * \param initial_partitioning_boxes the input and output vector for boxes.
676 */
677 void init_part_boxes(RCP<mj_partBoxVector_t> & outPartBoxes);
678
679 /* \brief Function returns how many parts that will be obtained after this
680 * dimension partitioning. It sets how many parts each current part will be
681 * partitioned into in this dimension to device_num_partitioning_in_current_dim
682 * vector, sets how many total future parts each obtained part will be
683 * partitioned into in next_future_num_parts_in_parts vector. If part boxes
684 * are kept, then initializes the output_part_boxes as its ancestor.
685 * \param future_num_part_in_parts: input, how many future parts each
686 * current part will be partitioned into.
687 * \param next_future_num_parts_in_parts: output, how many future parts
688 * each obtained part will be partitioned into.
689 * \param future_num_parts: output, max number of future parts that will be
690 * obtained from a single
691 * \param current_num_parts: input, how many parts are there currently.
692 * \param current_iteration: input, current dimension iteration number.
693 * \param input_part_boxes: input, if boxes are kept, current boxes.
694 * \param output_part_boxes: output, if boxes are kept, the initial box
695 * boundaries for obtained parts.
696 * \param atomic_part_count // DOCWORK: Documentation
697 */
698 mj_part_t update_part_num_arrays(
699 std::vector<mj_part_t> *future_num_part_in_parts,
700 std::vector<mj_part_t> *next_future_num_parts_in_parts,
701 mj_part_t &future_num_parts,
702 mj_part_t current_num_parts,
703 int current_iteration,
704 RCP<mj_partBoxVector_t> input_part_boxes,
705 RCP<mj_partBoxVector_t> output_part_boxes,
706 mj_part_t atomic_part_count);
707
719 static
720 KOKKOS_INLINE_FUNCTION
721 void mj_calculate_new_cut_position (
722 mj_scalar_t cut_upper_bound,
723 mj_scalar_t cut_lower_bound,
724 mj_scalar_t cut_upper_weight,
725 mj_scalar_t cut_lower_weight,
726 mj_scalar_t expected_weight,
727 mj_scalar_t &new_cut_position,
728 mj_scalar_t sEpsilon);
729
754 bool mj_perform_migration(
755 mj_part_t in_num_parts, //current number of parts
756 mj_part_t &out_num_parts, //output number of parts.
757 std::vector<mj_part_t> *next_future_num_parts_in_parts,
758 mj_part_t &output_part_begin_index,
759 size_t migration_reduce_all_population,
760 mj_lno_t num_coords_for_last_dim_part,
761 std::string iteration,
762 RCP<mj_partBoxVector_t> &input_part_boxes,
763 RCP<mj_partBoxVector_t> &output_part_boxes);
764
782 bool mj_check_to_migrate(
783 size_t migration_reduce_all_population,
784 mj_lno_t num_coords_for_last_dim_part,
785 mj_part_t num_procs,
786 mj_part_t num_parts,
787 mj_gno_t *num_points_in_all_processor_parts);
788
813 void mj_migration_part_proc_assignment(
814 mj_gno_t * num_points_in_all_processor_parts,
815 mj_part_t num_parts,
816 mj_part_t num_procs,
817 mj_lno_t *send_count_to_each_proc,
818 std::vector<mj_part_t> &processor_ranks_for_subcomm,
819 std::vector<mj_part_t> *next_future_num_parts_in_parts,
820 mj_part_t &out_num_part,
821 std::vector<mj_part_t> &out_part_indices,
822 mj_part_t &output_part_numbering_begin_index,
823 int *coordinate_destinations);
824
850 void mj_assign_proc_to_parts(
851 mj_gno_t * num_points_in_all_processor_parts,
852 mj_part_t num_parts,
853 mj_part_t num_procs,
854 mj_lno_t *send_count_to_each_proc,
855 std::vector<mj_part_t> &processor_ranks_for_subcomm,
856 std::vector<mj_part_t> *next_future_num_parts_in_parts,
857 mj_part_t &out_part_index,
858 mj_part_t &output_part_numbering_begin_index,
859 int *coordinate_destinations);
860
876 void assign_send_destinations(
877 mj_part_t num_parts,
878 mj_part_t *part_assignment_proc_begin_indices,
879 mj_part_t *processor_chains_in_parts,
880 mj_lno_t *send_count_to_each_proc,
881 int *coordinate_destinations);
882
897 void assign_send_destinations2(
898 mj_part_t num_parts,
899 uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment,
900 int *coordinate_destinations,
901 mj_part_t &output_part_numbering_begin_index,
902 std::vector<mj_part_t> *next_future_num_parts_in_parts);
903
926 void mj_assign_parts_to_procs(
927 mj_gno_t * num_points_in_all_processor_parts,
928 mj_part_t num_parts,
929 mj_part_t num_procs,
930 mj_lno_t *send_count_to_each_proc,
931 std::vector<mj_part_t> *next_future_num_parts_in_parts,
932 mj_part_t &out_num_part,
933 std::vector<mj_part_t> &out_part_indices,
934 mj_part_t &output_part_numbering_begin_index,
935 int *coordinate_destinations);
936
950 void mj_migrate_coords(
951 mj_part_t num_procs,
952 mj_lno_t &num_new_local_points,
953 std::string iteration,
954 int *coordinate_destinations,
955 mj_part_t num_parts);
956
962 void create_sub_communicator(
963 std::vector<mj_part_t> &processor_ranks_for_subcomm);
964
969 mj_part_t find_largest_prime_factor(mj_part_t num_parts) {
970 mj_part_t largest_factor = 1;
971 mj_part_t n = num_parts;
972 mj_part_t divisor = 2;
973 while (n > 1) {
974 while (n % divisor == 0) {
975 n = n / divisor;
976 largest_factor = divisor;
977 }
978 ++divisor;
979 if(divisor * divisor > n) {
980 if(n > 1) {
981 largest_factor = n;
982 }
983 break;
984 }
985 }
986 return largest_factor;
987 }
988
989public:
990 AlgMJ();
991
992 // DOCWORK: Make param documentation use : consistently
1018 void multi_jagged_part(
1019 const RCP<const Environment> &env,
1020 RCP<const Comm<int> > &problemComm,
1021 double imbalance_tolerance,
1022 int num_teams,
1023 size_t num_global_parts,
1024 Kokkos::View<mj_part_t*, Kokkos::HostSpace> & part_no_array,
1025 int recursion_depth,
1026 int coord_dim,
1027 mj_lno_t num_local_coords,
1028 mj_gno_t num_global_coords,
1029 Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos,
1030 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
1031 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> & mj_coordinates,
1032 int num_weights_per_coord,
1033 Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_weights,
1034 Kokkos::View<mj_scalar_t**, device_t> & mj_weights,
1035 Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_parts,
1036 Kokkos::View<mj_part_t*, device_t> & result_assigned_part_ids,
1037 Kokkos::View<mj_gno_t*, device_t> & result_mj_gnos);
1038
1052 bool distribute_points_on_cut_lines_,
1053 int max_concurrent_part_calculation_,
1054 int check_migrate_avoid_migration_option_,
1055 double minimum_migration_imbalance_,
1056 int migration_type_ = 0);
1057
1061
1064 RCP<mj_partBox_t> get_global_box() const;
1065
1068 RCP<mj_partBoxVector_t> get_kept_boxes() const;
1069
1072 RCP<mj_partBoxVector_t> compute_global_box_boundaries(
1073 RCP<mj_partBoxVector_t> &localPartBoxes) const;
1074
1114 const RCP<const Environment> &env,
1115 mj_lno_t num_total_coords,
1116 mj_lno_t num_selected_coords,
1117 size_t num_target_part,
1118 int coord_dim,
1119 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
1120 Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t> & mj_coordinates_,
1121 Kokkos::View<mj_lno_t *, device_t> &
1122 initial_selected_coords_output_permutation,
1123 mj_lno_t *output_xadj,
1124 int recursion_depth_,
1125 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & part_no_array,
1126 bool partition_along_longest_dim,
1127 int num_ranks_per_node,
1128 bool divide_to_prime_first_,
1129 mj_part_t num_first_level_parts_ = 1,
1130 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & first_level_distribution_
1131 = Kokkos::View<mj_part_t *, Kokkos::HostSpace>());
1132
1133#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
1134 public:
1135#else
1136 private:
1137#endif
1138
1139 /* \brief Allocates all required memory for the mj partitioning algorithm.
1140 */
1141 void allocate_set_work_memory();
1142
1143 /* \brief compute global bounding box: min/max coords of global domain */
1144 void compute_global_box();
1145
1146 // DOCWORK: Inconsistent use of ! for descriptive/brief commenting - decide.
1153 void mj_get_local_min_max_coord_totW(
1154 mj_part_t current_work_part,
1155 mj_part_t current_concurrent_num_parts,
1156 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords);
1157
1170 void mj_get_global_min_max_coord_totW(
1171 mj_part_t current_concurrent_num_parts,
1172 Kokkos::View<mj_scalar_t *, device_t> & local_min_max_total,
1173 Kokkos::View<mj_scalar_t *, device_t> & global_min_max_total);
1174
1205 void mj_get_initial_cut_coords_target_weights(
1206 mj_scalar_t min_coord,
1207 mj_scalar_t max_coord,
1208 mj_part_t num_cuts/*p-1*/ ,
1209 mj_scalar_t global_weight,
1210 Kokkos::View<mj_scalar_t *, device_t> & initial_cut_coords,
1211 Kokkos::View<mj_scalar_t *, device_t> & target_part_weights,
1212 std::vector <mj_part_t> *future_num_part_in_parts,
1213 std::vector <mj_part_t> *next_future_num_parts_in_parts,
1214 mj_part_t concurrent_current_part,
1215 mj_part_t obtained_part_index,
1216 mj_part_t num_target_first_level_parts = 1,
1217 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & target_first_level_dist =
1218 Kokkos::View<mj_part_t *, Kokkos::HostSpace>());
1219
1236 void set_initial_coordinate_parts(
1237 mj_scalar_t &max_coordinate,
1238 mj_scalar_t &min_coordinate,
1239 mj_lno_t coordinate_begin_index,
1240 mj_lno_t coordinate_end_index,
1241 Kokkos::View<mj_lno_t *, device_t> &
1242 mj_current_coordinate_permutations,
1243 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1244 Kokkos::View<mj_part_t *, device_t> & mj_part_ids,
1245 mj_part_t &partition_count);
1246
1263 void mj_1D_part(
1264 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1265 double imbalanceTolerance,
1266 mj_part_t current_work_part,
1267 mj_part_t current_concurrent_num_parts,
1268 Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
1269 mj_part_t total_incomplete_cut_count,
1270 Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count,
1271 Kokkos::View<size_t*, device_t> & view_total_reduction_size);
1272
1278 void mj_1D_part_get_part_weights(
1279 mj_part_t current_concurrent_num_parts,
1280 mj_part_t current_work_part,
1281 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1282 int loop_count);
1283
1291 void mj_combine_rightleft_and_weights(
1292 mj_part_t current_work_part,
1293 mj_part_t current_concurrent_num_parts);
1294
// Declaration only; neither the definition nor a call site is visible in
// this section.
// NOTE(review): the parameter list mirrors the leading arguments of
// create_consistent_chunks (part count, axis coordinates, cut coordinates,
// cut-line weights, output xadj); presumably it rewrites the part offsets
// from the computed cuts - confirm against the definition.
1307 void mj_create_new_partitions(
1308 mj_part_t num_parts,
1309 mj_part_t current_concurrent_work_part,
1310 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1311 Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
1312 Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
1313 Kokkos::View<mj_lno_t *, device_t> & out_part_xadj);
1314
// Declaration only; neither the definition nor a call site is visible in
// this section.
// All view parameters are rank-1 device views passed by non-const reference;
// num_cuts and used_imbalance_tolerance are passed by const reference.
// NOTE(review): which views are inputs and which are outputs is not visible
// here; judging by the names, new_current_cut_coordinates and
// current_cut_line_determined are presumably written - confirm against the
// definition.
1350 void mj_get_new_cut_coordinates(
1351 mj_part_t current_concurrent_num_parts,
1352 mj_part_t kk,
1353 const mj_part_t &num_cuts,
1354 const double &used_imbalance_tolerance,
1355 Kokkos::View<mj_scalar_t *, device_t> & current_global_part_weights,
1356 Kokkos::View<mj_scalar_t *, device_t> & current_local_part_weights,
1357 Kokkos::View<mj_scalar_t *, device_t> & current_part_target_weights,
1358 Kokkos::View<bool *, device_t> & current_cut_line_determined,
1359 Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
1360 Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_bounds,
1361 Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bounds,
1362 Kokkos::View<mj_scalar_t *, device_t> & current_global_left_closest_points,
1363 Kokkos::View<mj_scalar_t *, device_t> & current_global_right_closest_points,
1364 Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bound_weights,
1365 Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_weights,
1366 Kokkos::View<mj_scalar_t *, device_t> & new_current_cut_coordinates,
1367 Kokkos::View<mj_scalar_t *, device_t> &
1368 current_part_cut_line_weight_to_put_left,
1369 Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count);
1370
// Declaration only; neither the definition nor a call site is visible in
// this section.
// num_points_in_all_processor_parts is a reference to a raw mj_gno_t pointer.
// NOTE(review): whether this function allocates, fills or only reads that
// array (and who owns it) is not visible here - confirm against the
// definition.
1380 void get_processor_num_points_in_parts(
1381 mj_part_t num_procs,
1382 mj_part_t num_parts,
1383 mj_gno_t *&num_points_in_all_processor_parts);
1384
// Declaration only; neither the definition nor a call site is visible in
// this section.
// NOTE(review): the difference between output_num_parts and num_parts cannot
// be determined from the signature - confirm against the definition.
1389 void fill_permutation_array(
1390 mj_part_t output_num_parts,
1391 mj_part_t num_parts);
1392
// Declaration only; the definition is not part of this section.
// The sequential task-partitioning driver in this file calls it for every
// part that is split into more than one piece, passing: the piece count, the
// coordinates of the chosen axis, the cut coordinates and cut-line weights
// (subviews starting at the part's cut offset), the part's begin/end offsets
// from host_part_xadj, a subview of new_part_xadj starting at the part's
// output slot, the chosen axis index, the "partition along longest dimension"
// flag, and a pointer to the per-axis range items the driver sorted.
// NOTE(review): presumably writes the new part offsets to out_part_xadj as
// if the part started at 0 (the driver shifts them afterwards) - confirm.
1414 void create_consistent_chunks(
1415 mj_part_t num_parts,
1416 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
1417 Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
1418 mj_lno_t coordinate_begin,
1419 mj_lno_t coordinate_end,
1420 Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
1421 Kokkos::View<mj_lno_t *, device_t> & out_part_xadj,
1422 int coordInd,
1423 bool longest_dim_part,
1424 uSignedSortItem<int, mj_scalar_t, char> *p_coord_dimension_range_sorted);
1425
// Declaration only; neither the definition nor a call site is visible in
// this section.
// output_part_boxes is passed as a non-const reference to an RCP.
// NOTE(review): presumably assigns final part numbers starting at
// output_part_begin_index; how is_data_ever_migrated changes that is not
// visible here - confirm against the definition.
1434 void set_final_parts(
1435 mj_part_t current_num_parts,
1436 mj_part_t output_part_begin_index,
1437 RCP<mj_partBoxVector_t> &output_part_boxes,
1438 bool is_data_ever_migrated);
1439};
1440
// Constructor with an empty body: every listed member is set through the
// member-initializer list below.
// NOTE(review): the declarator line (embedded listing number 1445, between
// the template header and the initializer list) is missing from this
// listing; presumably it is the AlgMJ default-constructor header ending in
// ':' - restore it from the upstream file before compiling.
1443template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
1444 typename mj_part_t, typename mj_node_t>
// Handles/communicators are value-initialized; tolerances and sizes start at 0.
1446 mj_env(), mj_problemComm(), comm(), imbalance_tolerance(0),
1447 recursion_depth(0), coord_dim(0),
1448 num_weights_per_coord(0), initial_num_loc_coords(0),
1449 initial_num_glob_coords(0),
1450 num_local_coords(0), num_global_coords(0),
// sEpsilon is 100 times the machine epsilon of the scalar type.
1451 sEpsilon(std::numeric_limits<mj_scalar_t>::epsilon() * 100),
1452 distribute_points_on_cut_lines(true),
1453 max_concurrent_part_calculation(1),
1454 mj_run_as_rcb(false), mj_user_recursion_depth(0),
1455 mj_keep_part_boxes(false),
1456 check_migrate_avoid_migration_option(0), migration_type(0),
1457 minimum_migration_imbalance(0.30),
// A single first-level part is the default (num_first_level_parts == 1).
1458 num_first_level_parts(1),
1459 total_num_cut(0), total_num_part(0), max_num_part_along_dim(0),
1460 max_num_cut_along_dim(0),
1461 max_num_total_part_along_dim(0),
1462 total_dim_num_reduce_all(0),
1463 last_dim_num_part(0),
1464 mj_num_teams(0),
1465 num_global_parts(1),
1466 kept_boxes(), global_box(),
1467 myRank(0), myActualRank(0),
1468 divide_to_prime_first(false)
1469{
1470}
1471
// Serial task-partitioning driver.
//
// Copies the arguments into member state, installs a serial communicator,
// calls set_part_specifications() and allocate_set_work_memory(), and then,
// for each of recursion_depth levels, splits every current part along one
// coordinate axis. On return the final coordinate permutation and the CSR
// part offsets are written to the two output arguments.
//
// NOTE(review): the declarator lines carrying the function name (embedded
// listing numbers 1517-1518) are missing from this listing; presumably this
// is AlgMJ<...>::sequential_task_partitioning - restore from upstream.
//
// Parameters, as used by the body:
//   env                    stored in mj_env; used for the timers.
//   num_total_coords       stored as both num_local_coords and
//                          num_global_coords.
//   num_selected_coords    written to part_xadj(0), i.e. the end offset of
//                          the single initial part.
//   num_target_part        stored as num_global_parts.
//   coord_dim_             stored as coord_dim.
//   mj_coordinates_        assigned to mj_coordinates. The loop negates
//                          entries of a subview of it in place, so (Kokkos
//                          view assignment being shallow) the caller's data
//                          is modified.
//   initial_adjList_output_adjlist
//                          in/out: deep-copied into coordinate_permutations
//                          on entry and overwritten with the final
//                          coordinate_permutations on exit.
//   output_xadj            out: entries [0, num_global_parts] are written,
//                          so it needs num_global_parts + 1 elements.
//   recursion_depth_       stored as recursion_depth (number of levels).
//   part_no_array_         stored as part_no_array.
//   partition_along_longest_dim
//                          if true, the axis is re-chosen per part from the
//                          per-axis (max - min) ranges after uqSignsort;
//                          otherwise the axis is rd % coord_dim.
//   num_ranks_per_node     forwarded as the last argument of
//                          update_part_num_arrays.
//   divide_to_prime_first_ stored as divide_to_prime_first.
//   num_first_level_parts_ / first_level_distribution_
//                          stored; forwarded to
//                          mj_get_initial_cut_coords_target_weights only at
//                          the first level (rd == 0).
1515template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
1516 typename mj_part_t, typename mj_node_t>
1519 const RCP<const Environment> &env,
1520 mj_lno_t num_total_coords,
1521 mj_lno_t num_selected_coords,
1522 size_t num_target_part,
1523 int coord_dim_,
1524 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
1525 Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t> &
1526 mj_coordinates_,
1527 Kokkos::View<mj_lno_t *, device_t> & initial_adjList_output_adjlist,
1528 mj_lno_t *output_xadj,
1529 int recursion_depth_,
1530 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & part_no_array_,
1531 bool partition_along_longest_dim,
1532 int num_ranks_per_node,
1533 bool divide_to_prime_first_,
1534 mj_part_t num_first_level_parts_,
1535 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & first_level_distribution_)
1536{
1537 this->mj_env = env;
// Use the default serial communicator for this run; both rank fields are set
// to 1 below.
1538 const RCP<Comm<int> > commN;
1539 this->mj_problemComm = Teuchos::DefaultComm<int>::getDefaultSerialComm(commN);
1540 this->comm = Teuchos::rcp_const_cast<Comm<int> >(this->mj_problemComm);
1541 this->myActualRank = this->myRank = 1;
1542
1543 this->divide_to_prime_first = divide_to_prime_first_;
1544 //weights are uniform for task mapping
1545
1546 //parts are uniform for task mapping
1547 //as input indices.
1548 this->imbalance_tolerance = 0;
1549 this->num_global_parts = num_target_part;
1550 this->part_no_array = part_no_array_;
1551 this->recursion_depth = recursion_depth_;
1552
1553 // If nonuniform first level partitioning, the requested num of parts and the
1554 // requested distribution of elements for each part
1555 this->num_first_level_parts = num_first_level_parts_;
1556
1557 this->first_level_distribution = first_level_distribution_;
1558
1559 this->coord_dim = coord_dim_;
1560 this->num_local_coords = num_total_coords;
1561
1562 this->num_global_coords = num_total_coords;
1563 this->mj_coordinates = mj_coordinates_;
1564
1565
1566 this->initial_mj_gnos =
1567 Kokkos::View<mj_gno_t*, device_t>("gids", this->num_local_coords);
1568
// No per-coordinate weights: a single "uniform" flag and a 1x1 placeholder
// weights view are installed instead.
1569 this->num_weights_per_coord = 0;
1570
1571 this->mj_uniform_weights = Kokkos::View<bool*, Kokkos::HostSpace>(
1572 "uniform weights", 1);
1573 this->mj_uniform_weights(0) = true;
1574
1575 this->mj_weights = Kokkos::View<mj_scalar_t**, device_t>
1576 ("weights", 1, 1);
1577
1578 this->mj_uniform_parts =
1579 Kokkos::View<bool*, Kokkos::HostSpace>("uniform parts", 1);
1580 this->mj_uniform_parts(0) = true;
1581
1582 this->set_part_specifications();
1583
1584 this->allocate_set_work_memory();
1585
1586 // Do single init
// part_xadj is a device view, so its first entry is set from a one-iteration
// kernel rather than from host code.
1587 auto local_part_xadj = this->part_xadj;
1588 Kokkos::parallel_for(
1589 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
1590 KOKKOS_LAMBDA (int dummy) {
1591 local_part_xadj(0) = static_cast<mj_lno_t>(num_selected_coords);
1592 });
1593
1594 Kokkos::deep_copy(coordinate_permutations, initial_adjList_output_adjlist);
1595
1596 mj_part_t current_num_parts = 1;
1597
1598 Kokkos::View<mj_scalar_t *, device_t> current_cut_coordinates =
1599 this->all_cut_coordinates;
1600
1601 mj_part_t future_num_parts = this->total_num_part;
1602
// NOTE(review): both vectors are owned through raw pointers and deleted only
// at the very end of the function; they leak if anything in between throws.
1603 std::vector<mj_part_t> *future_num_part_in_parts =
1604 new std::vector<mj_part_t>();
1605 std::vector<mj_part_t> *next_future_num_parts_in_parts =
1606 new std::vector<mj_part_t>();
1607 next_future_num_parts_in_parts->push_back(this->num_global_parts);
// Default-constructed box handles, passed as the input/output part-box
// arguments of update_part_num_arrays.
1608 RCP<mj_partBoxVector_t> t1;
1609 RCP<mj_partBoxVector_t> t2;
1610
// Per-axis scratch used only when partition_along_longest_dim is set.
1611 std::vector <uSignedSortItem<int, mj_scalar_t, char>>
1612 coord_dimension_range_sorted(this->coord_dim);
1613 uSignedSortItem<int, mj_scalar_t, char> *p_coord_dimension_range_sorted =
1614 &(coord_dimension_range_sorted[0]);
1615 std::vector <mj_scalar_t> coord_dim_mins(this->coord_dim);
1616 std::vector <mj_scalar_t> coord_dim_maxs(this->coord_dim);
1617
1618 // Need a device counter - how best to allocate?
1619 // Putting this allocation in the loops is very costly so moved out here.
1620 Kokkos::View<mj_part_t*, device_t>
1621 view_rectilinear_cut_count("view_rectilinear_cut_count", 1);
1622 Kokkos::View<size_t*, device_t>
1623 view_total_reduction_size("view_total_reduction_size", 1);
1624
// One iteration per recursion level.
1625 for(int rd = 0; rd < this->recursion_depth; ++rd) {
1626 // next_future_num_parts_in_parts will be as the size of outnumParts,
1627 // and this will hold how many more parts that each output part
1628 // should be divided. this array will also be used to determine the weight
1629 // ratios of the parts.
1630 // swap the arrays to use iteratively..
1631 std::vector<mj_part_t> *tmpPartVect = future_num_part_in_parts;
1632 future_num_part_in_parts = next_future_num_parts_in_parts;
1633 next_future_num_parts_in_parts = tmpPartVect;
1634
1635 // clear next_future_num_parts_in_parts array as
1636 // getPartitionArrays expects it to be empty.
1637 next_future_num_parts_in_parts->clear();
1638
1639 // returns the total number of output parts for this dimension partitioning.
1640 mj_part_t output_part_count_in_dimension =
1641 this->update_part_num_arrays(
1642 future_num_part_in_parts,
1643 next_future_num_parts_in_parts,
1644 future_num_parts,
1645 current_num_parts,
1646 rd,
1647 t1,
1648 t2, num_ranks_per_node);
1649
1650 // if the number of obtained parts equal to current number of parts,
1651 // skip this dimension. For example, this happens when 1 is given in
1652 // the input part array is given. P=4,5,1,2
1653 if(output_part_count_in_dimension == current_num_parts) {
// Undo the swap done at the top of the iteration before skipping this level.
1654 tmpPartVect = future_num_part_in_parts;
1655 future_num_part_in_parts = next_future_num_parts_in_parts;
1656 next_future_num_parts_in_parts = tmpPartVect;
1657 continue;
1658 }
1659
1660 //convert i to string to be used for debugging purposes.
// NOTE(review): istring is not read anywhere else in this function.
1661 std::string istring = std::to_string(rd);
1662
1663 // alloc Memory to point the indices
1664 // of the parts in the permutation array.
1665 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
1666 "new part xadj", output_part_count_in_dimension);
1667
1668 // the index where in the outtotalCounts will be written.
1669
1670 mj_part_t output_part_index = 0;
1671
1672 // whatever is written to outTotalCounts will be added with previousEnd
1673 // so that the points will be shifted.
// NOTE(review): this offset is held in mj_part_t although new_part_xadj
// stores mj_lno_t; it may narrow if mj_lno_t is the wider type - confirm.
1674 mj_part_t output_coordinate_end_index = 0;
1675
1676 mj_part_t current_work_part = 0;
1677 mj_part_t current_concurrent_num_parts = 1;
1678
1679 mj_part_t obtained_part_index = 0;
1680
1681 // get the coordinate axis along which the partitioning will be done.
1682 int coordInd = rd % this->coord_dim;
1683
1684 Kokkos::View<mj_scalar_t *, device_t> mj_current_dim_coords =
1685 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
1686
// Host mirrors of the min/max/total-weight views; refreshed with deep_copy
// before each host-side read below.
1687 auto host_process_local_min_max_coord_total_weight =
1688 Kokkos::create_mirror_view(process_local_min_max_coord_total_weight);
1689 auto host_global_min_max_coord_total_weight =
1690 Kokkos::create_mirror_view(global_min_max_coord_total_weight);
1691
1692 // run for all available parts.
1693 for(; current_work_part < current_num_parts;
1694 current_work_part += current_concurrent_num_parts) {
1695
1696 mj_part_t actual_work_part_count = 0;
1697
1698 // initialization for 1D partitioning.
1699 // get the min and max coordinates of each part
1700 // together with the part weights of each part.
1701 for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
1702 mj_part_t current_work_part_in_concurrent_parts =
1703 current_work_part + kk;
1704
1705 // if this part wont be partitioned any further
1706 // dont do any work for this part.
1707 mj_part_t partition_count = host_num_partitioning_in_current_dim(
1708 current_work_part_in_concurrent_parts);
1709 if(partition_count == 1) {
1710 continue;
1711 }
1712 ++actual_work_part_count;
1713 if(partition_along_longest_dim) {
// Measure the part's extent along every axis, then pick the axis whose item
// ends up last after uqSignsort (presumably the largest range - confirm the
// sort order against uqSignsort).
1714 auto local_process_local_min_max_coord_total_weight =
1715 this->process_local_min_max_coord_total_weight;
1716 for(int coord_traverse_ind = 0;
1717 coord_traverse_ind < this->coord_dim; ++coord_traverse_ind) {
1718
1719 Kokkos::View<mj_scalar_t *, device_t> coords =
1720 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coord_traverse_ind);
1721
1722 this->mj_get_local_min_max_coord_totW(
1723 current_work_part,
1724 current_concurrent_num_parts,
1725 coords);
1726
1727 coord_dimension_range_sorted[coord_traverse_ind].id =
1728 coord_traverse_ind;
1729 coord_dimension_range_sorted[coord_traverse_ind].signbit = 1;
1730
1731 Kokkos::deep_copy(host_process_local_min_max_coord_total_weight,
1732 process_local_min_max_coord_total_weight);
1733
// Entry kk is read as the minimum and entry kk + current_concurrent_num_parts
// as the maximum of this part.
1734 coord_dim_mins[coord_traverse_ind] =
1735 host_process_local_min_max_coord_total_weight(kk);
1736 coord_dim_maxs[coord_traverse_ind] =
1737 host_process_local_min_max_coord_total_weight(
1738 kk + current_concurrent_num_parts);
1739 coord_dimension_range_sorted[coord_traverse_ind].val =
1740 host_process_local_min_max_coord_total_weight(
1741 kk + current_concurrent_num_parts) -
1742 host_process_local_min_max_coord_total_weight(kk);
1743 }
1744
1745 uqSignsort(this->coord_dim, p_coord_dimension_range_sorted);
1746 coordInd = p_coord_dimension_range_sorted[this->coord_dim - 1].id;
1747 auto set_min = coord_dim_mins[coordInd];
1748 auto set_max = coord_dim_maxs[coordInd];
// The device view currently holds the values of the last axis measured, so
// the chosen axis' min/max are written back from a one-iteration kernel.
1749 Kokkos::parallel_for(
1750 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
1751 (0, 1), KOKKOS_LAMBDA (int dummy) {
1752 local_process_local_min_max_coord_total_weight(kk) = set_min;
1753 local_process_local_min_max_coord_total_weight(
1754 kk + current_concurrent_num_parts) = set_max;
1755 });
1756
1757 mj_current_dim_coords =
1758 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
1759 }
1760 else {
1761 Kokkos::View<mj_scalar_t *, device_t> coords =
1762 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
1763 this->mj_get_local_min_max_coord_totW(
1764 current_work_part,
1765 current_concurrent_num_parts,
1766 coords);
1767 }
1768 }
1769
1770 // 1D partitioning
1771 if(actual_work_part_count > 0) {
1772 // obtain global Min max of the part.
1773 this->mj_get_global_min_max_coord_totW(
1774 current_concurrent_num_parts,
1775 this->process_local_min_max_coord_total_weight,
1776 this->global_min_max_coord_total_weight);
1777
1778 // update host copy
1779 Kokkos::deep_copy(host_global_min_max_coord_total_weight,
1780 global_min_max_coord_total_weight);
1781
1782 // represents the total number of cutlines
1783 // whose coordinate should be determined.
1784 mj_part_t total_incomplete_cut_count = 0;
1785
1786 //Compute weight ratios for parts & cuts:
1787 //e.g., 0.25 0.25 0.5 0.5 0.75 0.75 1.0
1788 // part0 cut0 part1 cut1 part2 cut2 part3
1789 mj_part_t concurrent_part_cut_shift = 0;
1790 mj_part_t concurrent_part_part_shift = 0;
1791 for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
// The host mirror is read in three consecutive sections of length
// current_concurrent_num_parts: minimum, maximum, total weight.
1792 mj_scalar_t min_coordinate =
1793 host_global_min_max_coord_total_weight(kk);
1794 mj_scalar_t max_coordinate = host_global_min_max_coord_total_weight(
1795 kk + current_concurrent_num_parts);
1796 mj_scalar_t global_total_weight = host_global_min_max_coord_total_weight(
1797 kk + 2*current_concurrent_num_parts);
1798
1799 mj_part_t concurrent_current_part_index = current_work_part + kk;
1800
1801 mj_part_t partition_count = host_num_partitioning_in_current_dim(
1802 concurrent_current_part_index);
1803
// Both subviews run from this part's offset to the end of the parent view.
1804 Kokkos::View<mj_scalar_t *, device_t> usedCutCoordinate =
1805 Kokkos::subview(current_cut_coordinates,
1806 std::pair<mj_lno_t, mj_lno_t>(
1807 concurrent_part_cut_shift,
1808 current_cut_coordinates.size()));
1809 Kokkos::View<mj_scalar_t *, device_t>
1810 current_target_part_weights =
1811 Kokkos::subview(target_part_weights,
1812 std::pair<mj_lno_t, mj_lno_t>(
1813 concurrent_part_part_shift,
1814 target_part_weights.size()));
1815
1816 // shift the usedCutCoordinate array as noCuts.
1817 concurrent_part_cut_shift += partition_count - 1;
1818 // shift the partRatio array as noParts.
1819 concurrent_part_part_shift += partition_count;
1820 // calculate only if part is not empty,
1821 // and part will be further partitioned.
1822 if(partition_count > 1 && min_coordinate <= max_coordinate) {
1823 // increase allDone by the number of cuts of the current
1824 // part's cut line number.
1825 total_incomplete_cut_count += partition_count - 1;
1826
1827 this->incomplete_cut_count(kk) = partition_count - 1;
1828
1829 // When num_first_level_parts != 1 we have
1830 // nonuniform partitioning on the first level, providing
1831 // requested number of parts (num_first_level_parts) and
1832 // requested distribution in parts (first_level_distribution)
1833
1834 // Get the target part weights given a desired distribution
1835 this->mj_get_initial_cut_coords_target_weights(
1836 min_coordinate,
1837 max_coordinate,
1838 partition_count - 1,
1839 global_total_weight,
1840 usedCutCoordinate,
1841 current_target_part_weights,
1842 future_num_part_in_parts,
1843 next_future_num_parts_in_parts,
1844 concurrent_current_part_index,
1845 obtained_part_index,
1846 rd == 0 ? this->num_first_level_parts : 1,
1847 this->first_level_distribution);
1848
// host_part_xadj(i) is the end offset of part i; part 0 starts at 0.
1849 mj_lno_t coordinate_end_index =
1850 host_part_xadj(concurrent_current_part_index);
1851 mj_lno_t coordinate_begin_index =
1852 (concurrent_current_part_index==0) ? 0 :
1853 host_part_xadj[concurrent_current_part_index - 1];
1854
1855 // get the initial estimated part assignments of the coordinates.
1856 this->set_initial_coordinate_parts(
1857 max_coordinate,
1858 min_coordinate,
1859 coordinate_begin_index, coordinate_end_index,
1860 this->coordinate_permutations,
1861 mj_current_dim_coords,
1862 this->assigned_part_ids,
1863 partition_count);
1864 }
1865 else {
1866 // e.g., if have fewer coordinates than parts, don't need to do
1867 // next dim.
1868 this->incomplete_cut_count(kk) = 0;
1869 }
1870 obtained_part_index += partition_count;
1871 }
1872
1873 // used imbalance, it is always 0, as it is difficult
1874 // to estimate a range.
1875 double used_imbalance = 0;
1876
1877 // Determine cut lines for k parts here.
1878 this->mj_env->timerStart(MACRO_TIMERS,
1879 mj_timer_base_string + "mj_1D_part()");
1880
1881 this->mj_1D_part(
1882 mj_current_dim_coords,
1883 used_imbalance,
1884 current_work_part,
1885 current_concurrent_num_parts,
1886 current_cut_coordinates,
1887 total_incomplete_cut_count,
1888 view_rectilinear_cut_count,
1889 view_total_reduction_size);
1890
1891 this->mj_env->timerStop(MACRO_TIMERS,
1892 mj_timer_base_string + "mj_1D_part()");
1893 }
1894 else {
1895 obtained_part_index += current_concurrent_num_parts;
1896 }
1897 // create part chunks
1898 {
1899 mj_part_t output_array_shift = 0;
1900 mj_part_t cut_shift = 0;
// NOTE(review): tlr_shift is only incremented below and never read in this
// function.
1901 size_t tlr_shift = 0;
1902 size_t partweight_array_shift = 0;
1903
1904 for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
1905 mj_part_t current_concurrent_work_part = current_work_part + kk;
1906
1907 mj_part_t num_parts = host_num_partitioning_in_current_dim(
1908 current_concurrent_work_part);
1909
1910 // if the part is empty, skip the part.
// "Empty" is detected as stored minimum > stored maximum.
1911 int coordinateA_bigger_than_coordinateB =
1912 host_global_min_max_coord_total_weight(kk) >
1913 host_global_min_max_coord_total_weight(
1914 kk + current_concurrent_num_parts);
1915
1916 if((num_parts != 1) && coordinateA_bigger_than_coordinateB) {
1917 // we still need to write the begin and end point of the empty part.
1918 // simply set it zero, the array indices will be shifted later
1919 auto local_new_part_xadj = this->new_part_xadj;
1920 Kokkos::parallel_for(
1921 Kokkos::RangePolicy<typename mj_node_t::execution_space,
1922 mj_part_t> (0, num_parts), KOKKOS_LAMBDA(mj_part_t jj) {
1923 local_new_part_xadj(
1924 output_part_index + output_array_shift + jj) = 0;
1925 });
1926
1927 cut_shift += num_parts - 1;
1928 tlr_shift += (4 *(num_parts - 1) + 1);
1929 output_array_shift += num_parts;
1930 partweight_array_shift += (2 * (num_parts - 1) + 1);
1931 continue;
1932 }
1933 mj_lno_t coordinate_end =
1934 host_part_xadj(current_concurrent_work_part);
1935 mj_lno_t coordinate_begin =
1936 current_concurrent_work_part==0 ? 0 :
1937 host_part_xadj(current_concurrent_work_part-1);
1938
1939 Kokkos::View<mj_scalar_t *, device_t>
1940 current_concurrent_cut_coordinate =
1941 Kokkos::subview(current_cut_coordinates,
1942 std::pair<mj_lno_t, mj_lno_t>(
1943 cut_shift,
1944 current_cut_coordinates.size()));
1945 Kokkos::View<mj_scalar_t *, device_t>
1946 used_local_cut_line_weight_to_left =
1947 Kokkos::subview(process_cut_line_weight_to_put_left,
1948 std::pair<mj_lno_t, mj_lno_t>(
1949 cut_shift,
1950 process_cut_line_weight_to_put_left.size()));
1951
1952 this->thread_part_weight_work =
1953 Kokkos::subview(
1954 this->thread_part_weights,
1955 std::pair<mj_lno_t, mj_lno_t>(
1956 partweight_array_shift,
1957 this->thread_part_weights.size()));
1958
1959 if(num_parts > 1) {
1960 // Rewrite the indices based on the computed cuts.
1961 Kokkos::View<mj_lno_t *, device_t> subview_new_part_xadj =
1962 Kokkos::subview(this->new_part_xadj,
1963 std::pair<mj_lno_t, mj_lno_t>(
1964 output_part_index + output_array_shift,
1965 this->new_part_xadj.size()));
1966
1967 this->create_consistent_chunks(
1968 num_parts,
1969 mj_current_dim_coords,
1970 current_concurrent_cut_coordinate,
1971 coordinate_begin,
1972 coordinate_end,
1973 used_local_cut_line_weight_to_left,
1974 subview_new_part_xadj,
1975 coordInd,
1976 partition_along_longest_dim,
1977 p_coord_dimension_range_sorted);
1978 }
1979 else {
1980 // if this part is partitioned into 1 then just copy
1981 // the old values.
1982 mj_lno_t part_size = coordinate_end - coordinate_begin;
1983
1984 auto local_new_part_xadj = this->new_part_xadj;
1985 Kokkos::parallel_for(
1986 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
1987 (0, 1), KOKKOS_LAMBDA (int dummy) {
1988 local_new_part_xadj(output_part_index + output_array_shift)
1989 = part_size;
1990 });
1991
// Copy this part's slice of the permutation unchanged into the new
// permutation array.
1992 auto subview_new_coordinate_permutations =
1993 Kokkos::subview(this->new_coordinate_permutations,
1994 std::pair<mj_lno_t, mj_lno_t>(
1995 coordinate_begin,
1996 coordinate_begin + part_size));
1997 auto subview_coordinate_permutations =
1998 Kokkos::subview(this->coordinate_permutations,
1999 std::pair<mj_lno_t, mj_lno_t>(
2000 coordinate_begin,
2001 coordinate_begin + part_size));
2002 Kokkos::deep_copy(subview_new_coordinate_permutations,
2003 subview_coordinate_permutations);
2004 }
2005
2006 cut_shift += num_parts - 1;
2007 tlr_shift += (4 *(num_parts - 1) + 1);
2008 output_array_shift += num_parts;
2009 partweight_array_shift += (2 * (num_parts - 1) + 1);
2010 }
2011
2012 // shift cut coordinates so that all cut coordinates are stored.
2013 // current_cut_coordinates += cutShift;
2014
2015 // getChunks from coordinates partitioned the parts and
2016 // wrote the indices as if there were a single part.
2017 // now we need to shift the beginning indices.
2018 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
2019 mj_part_t num_parts =
2020 host_num_partitioning_in_current_dim(current_work_part + kk);
2021 auto local_new_part_xadj = this->new_part_xadj;
2022 auto local_mj_current_dim_coords = mj_current_dim_coords;
2023 auto local_new_coordinate_permutations =
2024 new_coordinate_permutations;
2025 Kokkos::parallel_for(
2026 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t> (
2027 0, num_parts), KOKKOS_LAMBDA (mj_part_t ii) {
2028 //shift it by previousCount
2029 local_new_part_xadj(output_part_index+ii) +=
2030 output_coordinate_end_index;
2031
// For odd ii, negate the current-axis coordinate of every point in the range
// [new_part_xadj(output_part_index), new_part_xadj(output_part_index+ii)).
// NOTE(review): iteration ii reads the entry at output_part_index, which
// iteration 0 updates in the same kernel - confirm the ordering is intended.
2032 if(ii % 2 == 1) {
2033 mj_lno_t coordinate_end =
2034 local_new_part_xadj(output_part_index+ii);
2035 mj_lno_t coordinate_begin =
2036 local_new_part_xadj(output_part_index);
2037
2038 for(mj_lno_t task_traverse = coordinate_begin;
2039 task_traverse < coordinate_end; ++task_traverse) {
2040 mj_lno_t l = local_new_coordinate_permutations(task_traverse);
2041 //MARKER: FLIPPED ZORDER BELOW
2042 local_mj_current_dim_coords(l) = -local_mj_current_dim_coords(l);
2043 }
2044 }
2045 });
2046
2047 // increase the previous count by current end.
// A one-iteration reduction is used to read a single device entry back to the
// host. NOTE(review): the mj_lno_t entry is received into an mj_part_t.
2048 mj_part_t get_single;
2049 Kokkos::parallel_reduce("Read new_part_xadj",
2050 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0, 1),
2051 KOKKOS_LAMBDA(int dummy, mj_part_t & set_single) {
2052 set_single = local_new_part_xadj(output_part_index + num_parts - 1);
2053 }, get_single);;
2054
2055 output_coordinate_end_index = get_single;
2056 // increase the current out.
2057 output_part_index += num_parts;
2058 }
2059 }
2060 }
2061
2062 // end of this partitioning dimension
2063 // set the current num parts for next dim partitioning
2064 current_num_parts = output_part_count_in_dimension;
2065
2066 //swap the coordinate permutations for the next dimension.
2067 Kokkos::View<mj_lno_t *, device_t> tmp = this->coordinate_permutations;
2068 this->coordinate_permutations = this->new_coordinate_permutations;
2069 this->new_coordinate_permutations = tmp;
2070
// The new offsets become the current ones; the host mirror is rebuilt because
// the view size changes from level to level.
2071 this->part_xadj = this->new_part_xadj;
2072 this->host_part_xadj = Kokkos::create_mirror_view(part_xadj);
2073 Kokkos::deep_copy(host_part_xadj, part_xadj); // keep in sync
2074 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>("empty", 0);
2075 }
2076
2077 Kokkos::deep_copy(initial_adjList_output_adjlist, coordinate_permutations);
2078
2079 // Return output_xadj in CSR format
// host_part_xadj holds end offsets, so it is shifted by one slot behind a
// leading 0.
2080 output_xadj[0] = 0;
2081 for(size_t i = 0; i < this->num_global_parts ; ++i) {
2082 output_xadj[i+1] = host_part_xadj(i);
2083 }
2084
2085 delete future_num_part_in_parts;
2086 delete next_future_num_parts_in_parts;
2087}
2088
// Const accessor: returns the global_box member as an RCP to mj_partBox_t.
// NOTE(review): the class-qualifier line (embedded listing number 2096,
// between the return type and the function name) is missing from this
// listing; presumably it is the "AlgMJ<...>::" qualifier - restore it from
// the upstream file before compiling.
2092template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2093 typename mj_part_t, typename mj_node_t>
2094RCP<typename AlgMJ
2095 <mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t,mj_node_t>::mj_partBox_t>
2097 get_global_box() const
2098{
2099 return this->global_box;
2100}
2101
2104template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2105 typename mj_part_t, typename mj_node_t>
2106void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
2107 mj_node_t>::set_to_keep_part_boxes()
2108{
2109 this->mj_keep_part_boxes = true;
2110}
2111
2112/* \brief Either the mj array (part_no_array) or num_global_parts should be
2113 * provided in the input. part_no_array takes
2114 * precedence if both are provided.
2115 * Depending on these parameters, total cut/part number,
2116 * maximum part/cut number along a dimension, estimated number of reduceAlls,
2117 * and the number of parts before the last dimension is calculated.
2118 * */
// Takes no arguments and returns nothing visible here; it reads
// part_no_array, num_global_parts, recursion_depth, num_first_level_parts,
// first_level_distribution and divide_to_prime_first, and writes
// total_num_cut, total_num_part, max_num_part_along_dim,
// max_num_cut_along_dim, max_num_total_part_along_dim,
// total_dim_num_reduce_all, last_dim_num_part, and possibly num_global_parts
// and max_concurrent_part_calculation.
// NOTE(review): the declarator lines (embedded listing numbers 2121-2122)
// are missing from this listing; presumably this is
// "void AlgMJ<...>::set_part_specifications()" - restore from upstream.
2119template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2120 typename mj_part_t, typename mj_node_t>
2123{
2124 this->total_num_cut = 0; //how many cuts will be totally
2125 this->total_num_part = 1; //how many parts will be totally
2126 this->max_num_part_along_dim = 0; // maximum part count along a dimension.
2127 this->total_dim_num_reduce_all = 0; // estimate on #reduceAlls can be done.
2128 this->last_dim_num_part = 1; //max no of parts that might occur
2129 //during the partition before the
2130 //last partitioning dimension.
2131 this->max_num_cut_along_dim = 0;
2132 this->max_num_total_part_along_dim = 0;
2133
// Case 1: an explicit per-level part-count array was supplied.
2134 if(this->part_no_array.size()) {
2135 auto local_recursion_depth = this->recursion_depth;
2136
// NOTE(review): total_num_part is still 1 at this point (set above), so this
// evaluates to recursion_depth - confirm that is the intended estimate.
2137 this->total_dim_num_reduce_all =
2138 this->total_num_part * this->recursion_depth;
2139
// Total part count is the product of the per-level counts.
2140 this->total_num_part = 1;
2141 for(int i = 0; i < local_recursion_depth; ++i) {
2142 this->total_num_part *= this->part_no_array(i);
2143 }
2144
// Largest per-level count.
2145 mj_part_t track_max = 0;
2146 for(int i = 0; i < local_recursion_depth; ++i) {
2147 if(part_no_array(i) > track_max) {
2148 track_max = this->part_no_array(i);
2149 };
2150 }
2151
// Part count before the last level = total / last level's count.
// NOTE(review): indexes part_no_array(-1) if recursion_depth is 0 - confirm
// callers never pass that together with a non-empty part_no_array.
2152 this->last_dim_num_part = this->total_num_part /
2153 this->part_no_array(local_recursion_depth-1);
2154
2155 this->max_num_part_along_dim = track_max;
2156 this->num_global_parts = this->total_num_part;
2157 } else {
// Case 2: only num_global_parts is known; derive a per-level count for each
// recursion level.
2158 mj_part_t future_num_parts = this->num_global_parts;
2159
2160 // If using nonuniform first level partitioning.
2161 // initial value max_num_part_along_dim == num_first_level_parts
2162 if (this->first_level_distribution.size() != 0 &&
2163 this->num_first_level_parts > 1) {
2164 this->max_num_part_along_dim = this->num_first_level_parts;
2165 }
2166
2167 // we need to calculate the part numbers now, to determine
2168 // the maximum along the dimensions.
2169 for(int rd = 0; rd < this->recursion_depth; ++rd) {
2170 mj_part_t maxNoPartAlongI = 0;
2171 mj_part_t nfutureNumParts = 0;
2172
2173 // Nonuniform first level partitioning sets part specifications for
2174 // rd == 0 only, given requested num of parts and distribution in parts
2175 // for the first level.
2176 if (rd == 0 &&
2177 this->first_level_distribution.size() != 0 &&
2178 this->num_first_level_parts > 1) {
2179
2180 maxNoPartAlongI = this->num_first_level_parts;
2181 this->max_num_part_along_dim = this->num_first_level_parts;
2182
2183 mj_part_t sum_first_level_dist = 0;
2184 mj_part_t max_part = 0;
2185
2186 // Cumulative sum of distribution of parts and size of largest part
2187 for (int i = 0; i < this->num_first_level_parts; ++i) {
2188 sum_first_level_dist += this->first_level_distribution(i);
2189 if (this->first_level_distribution(i) > max_part)
2190 max_part = this->first_level_distribution(i);
2191 }
2192
2193 // Total parts in largest nonuniform superpart from
2194 // first level partitioning
// Integer arithmetic: the product is formed first, then divided (truncating).
2195 nfutureNumParts =
2196 this->num_global_parts * max_part / sum_first_level_dist;
2197 }
2198 // Standard uniform partitioning this level
2199 else {
// The exponent 1/(levels remaining) is computed in float (1.0f literal)
// before being converted to the double parameter of get_part_count.
2200 maxNoPartAlongI = this->get_part_count(future_num_parts,
2201 1.0f / (this->recursion_depth - rd));
2202 if (maxNoPartAlongI > this->max_num_part_along_dim)
2203 this->max_num_part_along_dim = maxNoPartAlongI;
// Ceiling division: parts left for the deeper levels.
2204 nfutureNumParts = future_num_parts / maxNoPartAlongI;
2205 if (future_num_parts % maxNoPartAlongI) {
2206 ++nfutureNumParts;
2207 }
2208 }
2209 future_num_parts = nfutureNumParts;
2210 }
2211 this->total_num_part = this->num_global_parts;
2212
2213 if(this->divide_to_prime_first) {
2214 this->total_dim_num_reduce_all = this->num_global_parts * 2;
2215 this->last_dim_num_part = this->num_global_parts;
2216 }
2217 else {
2218 //this is the lower bound.
2219 //estimate reduceAll Count here.
2220 //we find the upperbound instead.
// Sums 1 + m + m^2 + ... over recursion_depth terms, with
// m = max_num_part_along_dim; p ends as m^recursion_depth.
2221 size_t p = 1;
2222 for(int i = 0; i < this->recursion_depth; ++i) {
2223 this->total_dim_num_reduce_all += p;
2224 p *= this->max_num_part_along_dim;
2225 }
2226
// p / m is the part-count bound before the last level, capped at
// num_global_parts.
// NOTE(review): divides by zero if max_num_part_along_dim is still 0 here
// (e.g. recursion_depth == 0 without a first-level distribution) - confirm
// that cannot happen.
2227 if(p / this->max_num_part_along_dim > this->num_global_parts) {
2228 this->last_dim_num_part = this->num_global_parts;
2229 }
2230 else {
2231 this->last_dim_num_part = p / this->max_num_part_along_dim;
2232 }
2233 }
2234 }
2235
2236 this->total_num_cut = this->total_num_part - 1;
2237 this->max_num_cut_along_dim = this->max_num_part_along_dim - 1;
2238 this->max_num_total_part_along_dim = this->max_num_part_along_dim +
2239 size_t(this->max_num_cut_along_dim);
2240 // maxPartNo is P, maxCutNo = P-1, matTotalPartcount = 2P-1
2241
2242 // refine the concurrent part count, if it is given bigger than the maximum
2243 // possible part count.
// Only rank 0 of mj_problemComm prints the warning; every rank clamps.
2244 if(this->max_concurrent_part_calculation > this->last_dim_num_part) {
2245 if(this->mj_problemComm->getRank() == 0) {
2246 std::cerr << "Warning: Concurrent part count (" <<
2247 this->max_concurrent_part_calculation <<
2248 ") has been set bigger than maximum amount that can be used." <<
2249 " Setting to:" << this->last_dim_num_part << "." << std::endl;
2250 }
2251 this->max_concurrent_part_calculation = this->last_dim_num_part;
2252 }
2253}
2254
2255/* \brief Tries to determine the part number for current dimension,
2256 * by trying to make the partitioning as square as possible.
2257 * \param num_total_future how many more partitionings are required.
2258 * \param root how many more recursion depth is left.
2259 */
2260template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2261 typename mj_part_t, typename mj_node_t>
2262inline mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2263 get_part_count(mj_part_t num_total_future, double root)
2264{
2265 double fp = pow(num_total_future, root);
2266 mj_part_t ip = mj_part_t(fp);
2267 if(fp - ip < std::numeric_limits<float>::epsilon() * 100) {
2268 return ip;
2269 }
2270 else {
2271 return ip + 1;
2272 }
2273}
2274
2275/* \brief Function returns how many parts that will be obtained after this
2276 * dimension partitioning. It sets how many parts each current part will be
2277 * partitioned into in this dimension to device_num_partitioning_in_current_dim
2278 * view, sets how many total future parts each obtained part will be
2279 * partitioned into in next_future_num_parts_in_parts vector. If part boxes are
2280 * kept, then initializes the output_part_boxes as copies of their ancestor box.
2281 * \param future_num_part_in_parts: input, how many future parts each current
2282 * part will be partitioned into.
2283 * \param next_future_num_parts_in_parts: output, how many future parts each
2284 * obtained part will be partitioned into.
2285 * \param future_num_parts: output, max number of future parts that will be
2286 * obtained from a single part.
2287 * \param current_num_parts: input, how many parts are there currently.
2288 * \param current_iteration: input, current dimension iteration number.
2289 * \param input_part_boxes: input, if boxes are kept, current boxes.
2290 * \param output_part_boxes: output, if boxes are kept, the initial box
2291 * boundaries for obtained parts.
2292 * \param atomic_part_count input, only used when part_no_array is empty and
 * the first-level-distribution branch is not taken. If a part's future part
 * count equals atomic_part_count or is not divisible by it, the local copy of
 * atomic_part_count is reset to 1 for the rest of the call. Otherwise the
 * future part count is divided by atomic_part_count before it is split among
 * the children, and every child count is multiplied by it again before it is
 * stored in next_future_num_parts_in_parts.
 * \return the number of parts after this dimension (current_num_parts when
 * part_no_array requests a single part for this iteration).
2293 */
2294template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2295 typename mj_part_t, typename mj_node_t>
2296mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2297 update_part_num_arrays(
2298 std::vector<mj_part_t> *future_num_part_in_parts,
2299 std::vector<mj_part_t> *next_future_num_parts_in_parts,
2300 mj_part_t &future_num_parts,
2301 mj_part_t current_num_parts,
2302 int current_iteration,
2303 RCP<mj_partBoxVector_t> input_part_boxes,
2304 RCP<mj_partBoxVector_t> output_part_boxes,
2305 mj_part_t atomic_part_count)
2306{
 // Host-side staging vector: one entry per current part (a single entry in
 // the first-level-distribution branch). It is copied into
 // host_/device_num_partitioning_in_current_dim at the end of the function.
2307 std::vector<mj_part_t> num_partitioning_in_current_dim;
2308
2309 // how many parts that will be obtained after this dimension.
2310 mj_part_t output_num_parts = 0;
2311 if(this->part_no_array.size()) {
2312 // when the partNo array is provided as input,
2313 // each current partition will be partition to the same number of parts.
2314 // we dont need to use the future_num_part_in_parts vector in this case.
2315 mj_part_t current_part_no_array =
2316 this->part_no_array(current_iteration);
2317
 // A requested part count below 1 is treated as a fatal input error.
2318 if(current_part_no_array < 1) {
2319 std::cout << "Current recursive iteration: " << current_iteration <<
2320 " part_no_array[" << current_iteration << "] is given as:" <<
2321 current_part_no_array << std::endl;
2322 std::terminate();
2323 }
 // NOTE: this early return happens before next_future_num_parts_in_parts,
 // output_part_boxes and the num_partitioning views are touched, so they
 // keep whatever contents they had on entry.
2324 if(current_part_no_array == 1) {
2325 return current_num_parts;
2326 }
2327
2328 // If using part_no_array, ensure compatibility with num_first_level_parts.
2329 if (this->first_level_distribution.size() != 0 &&
2330 current_iteration == 0 &&
2331 current_part_no_array != this->num_first_level_parts) {
2332 std::cout << "Current recursive iteration: " << current_iteration
2333 << " part_no_array[" << current_iteration << "] is given as: " <<
2334 current_part_no_array << " and contradicts num_first_level_parts: " <<
2335 this->num_first_level_parts << std::endl;
2336 std::terminate();
2337 }
2338
 // Every current part is split into the same number of parts.
2339 for(mj_part_t ii = 0; ii < current_num_parts; ++ii) {
2340 num_partitioning_in_current_dim.push_back(current_part_no_array);
2341 }
2342
2343/*
2344 std::cout << "\n\nme: " << this->myRank << " current_iteration: " <<
2345 current_iteration << " current_num_parts: " <<
2346 current_num_parts << "\n\n";
2347
2348 std::cout << "\n\nnum_partitioning_in_current_dim[0]: " <<
2349 num_partitioning_in_current_dim[0] << "\n\n";
2350
2351 std::cout << "\n\nfuture_num_parts: " << future_num_parts
2352 << " num_partitioning_in_current_dim[0]: " <<
2353 num_partitioning_in_current_dim[0] << " " <<
2354 future_num_parts / num_partitioning_in_current_dim[0] << "\n\n";
2355*/
2356
 // Integer division: the incoming future_num_parts is reduced by the
 // split factor of this dimension.
2357 future_num_parts /= num_partitioning_in_current_dim[0];
2358 output_num_parts = current_num_parts *
2359 num_partitioning_in_current_dim[0];
2360 if(this->mj_keep_part_boxes) {
2361 for(mj_part_t k = 0; k < current_num_parts; ++k) {
2362 //initialized the output boxes as its ancestor.
2363 for(mj_part_t j = 0; j <
2364 num_partitioning_in_current_dim[0]; ++j) {
2365 output_part_boxes->push_back((*input_part_boxes)[k]);
2366 }
2367 }
2368 }
2369
2370 // set the how many more parts each part will be divided.
2371 // this is obvious when partNo array is provided as input.
2372 // however, fill this so weights will be calculated according to this array.
2373 for(mj_part_t ii = 0; ii < output_num_parts; ++ii) {
2374 next_future_num_parts_in_parts->push_back(future_num_parts);
2375 }
2376 }
2377 else {
2378 // if partNo array is not provided as input, future_num_part_in_parts
2379 // holds how many parts each part should be divided. Initially it holds a
2380 // single number equal to the total number of global parts.
2381
2382 // calculate the future_num_parts from beginning,
2383 // since each part might be divided into different number of parts.
2384 future_num_parts = 1;
2385
2386 // cout << "i:" << i << std::endl;
2387 for(mj_part_t ii = 0; ii < current_num_parts; ++ii) {
2388 // get how many parts a part should be divided.
2389 mj_part_t future_num_parts_of_part_ii = (*future_num_part_in_parts)[ii];
2390
2391 // get the ideal number of parts that is close to the
2392 // (recursion_depth - i) root of the future_num_parts_of_part_ii.
2393 mj_part_t num_partitions_in_current_dim =
2394 this->get_part_count(future_num_parts_of_part_ii,
2395 1.0 / (this->recursion_depth - current_iteration)
2396 );
 // Sanity check against the precomputed per-dimension maximum; a
 // violation is treated as an internal error and terminates.
2397 if(num_partitions_in_current_dim > this->max_num_part_along_dim) {
2398 std::cerr << "ERROR: maxPartNo calculation is wrong."
2399 " num_partitions_in_current_dim: "
2400 << num_partitions_in_current_dim << " this->max_num_part_along_dim: "
2401 << this->max_num_part_along_dim <<
2402 " this->recursion_depth: " << this->recursion_depth <<
2403 " current_iteration:" << current_iteration <<
2404 " future_num_parts_of_part_ii: " << future_num_parts_of_part_ii <<
2405 " might need to fix max part no calculation for "
2406 "largest_prime_first partitioning." <<
2407 std::endl;
2408 std::terminate();
2409 }
2410 // add this number to vector_num_partitioning_in_current_dim vector.
2411 // num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2412 // mj_part_t largest_prime_factor = num_partitions_in_current_dim;
2413
2414 // Update part num arrays when on current_iteration == 0 and
2415 // using nonuniform first level partitioning
2416 // with requested num parts (num_first_level_parts) and
2417 // a requested distribution in parts (first_level_distribution).
2418 if (current_iteration == 0 &&
2419 this->first_level_distribution.size() != 0 &&
2420 this->num_first_level_parts > 1) {
2421 // Only 1 current part to begin and partitions into
2422 // num_first_level_parts many parts
2423 num_partitioning_in_current_dim.push_back(this->num_first_level_parts);
2424
2425 // The output number of parts from first level partitioning
2426 output_num_parts = this->num_first_level_parts;
2427
2428 // Remaining parts left to partition for all future levels
 // NOTE(review): this value is overwritten below by the max_part based
 // computation, so the division has no lasting effect.
2429 future_num_parts /= this->num_first_level_parts;
2430
2431 mj_part_t max_part = 0;
2432 mj_part_t sum_first_level_dist = 0;
2433
2434 // Cumulative sum of distribution of first level parts
2435 // and size of largest first level part
2436 for (int i = 0; i < this->num_first_level_parts; ++i) {
2437 sum_first_level_dist += this->first_level_distribution(i);
2438
2439 if (this->first_level_distribution(i) > max_part)
2440 max_part = this->first_level_distribution(i);
2441 }
2442
2443 // Maximum # of remaining parts left to partition for all future levels
2444 future_num_parts = this->num_global_parts * max_part / sum_first_level_dist;
2445
2446 // Number of parts remaining left to partition for each future_part
2447 // The sum must exactly equal global_num_parts
 // NOTE(review): integer division is used here, and output_part_boxes is
 // not filled in this branch even when mj_keep_part_boxes is set -
 // confirm both are intended.
2448 for (int i = 0; i < this->num_first_level_parts; ++i) {
2449 next_future_num_parts_in_parts->push_back(this->first_level_distribution(i) *
2450 this->num_global_parts / sum_first_level_dist);
2451 }
2452 }
2453 else if (this->divide_to_prime_first) {
2454 // Add this number to num_partitioning_in_current_dim vector.
2455 num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2456
2457 mj_part_t largest_prime_factor = num_partitions_in_current_dim;
2458
2459 //increase the output number of parts.
2460 output_num_parts += num_partitions_in_current_dim;
2461
 // atomic_part_count is a by-value parameter: once reset to 1 here it
 // stays 1 for all remaining parts processed by this call.
2462 if (future_num_parts_of_part_ii == atomic_part_count ||
2463 future_num_parts_of_part_ii % atomic_part_count != 0) {
2464 atomic_part_count = 1;
2465 }
2466
2467 largest_prime_factor =
2468 this->find_largest_prime_factor(future_num_parts_of_part_ii / atomic_part_count);
2469
2470 // We divide to num_partitions_in_current_dim. But we adjust the weights
2471 // based on largest prime/ if num_partitions_in_current_dim = 2,
2472 // largest prime = 5 --> we divide to 2 parts with weights 3x and 2x.
2473 // if the largest prime is less than part count, we use the part count
2474 // so that we divide uniformly.
2475 if (largest_prime_factor < num_partitions_in_current_dim) {
2476 largest_prime_factor = num_partitions_in_current_dim;
2477 }
2478 //ideal number of future partitions for each part.
2479 mj_part_t ideal_num_future_parts_in_part =
2480 (future_num_parts_of_part_ii / atomic_part_count) / largest_prime_factor;
2481 //if num_partitions_in_current_dim = 2, largest prime = 5 then ideal weight is 2x
2482 mj_part_t ideal_prime_scale = largest_prime_factor / num_partitions_in_current_dim;
2483
2484/*
2485 std::cout << "\ncurrent num part: " << ii
2486 << " largest_prime_factor: " << largest_prime_factor
2487 << " To Partition: " << future_num_parts_of_part_ii << "\n\n";
2488*/
2489
2490 for (mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii) {
2491 //if num_partitions_in_current_dim = 2, largest prime = 5 then ideal weight is 2x
2492 mj_part_t my_ideal_primescale = ideal_prime_scale;
2493 //left over weighs. Left side is adjusted to be 3x, right side stays as 2x
2494 if (iii < (largest_prime_factor) % num_partitions_in_current_dim) {
2495 ++my_ideal_primescale;
2496 }
2497 //scale with 'x';
2498 mj_part_t num_future_parts_for_part_iii =
2499 ideal_num_future_parts_in_part * my_ideal_primescale;
2500
2501 //if there is a remainder in the part increase the part weight.
2502 if (iii < (future_num_parts_of_part_ii / atomic_part_count) % largest_prime_factor) {
2503 //if not uniform, add 1 for the extra parts.
2504 ++num_future_parts_for_part_iii;
2505 }
2506
 // The stored child count is scaled back up by atomic_part_count.
2507 next_future_num_parts_in_parts->push_back(num_future_parts_for_part_iii * atomic_part_count);
2508
2509 //if part boxes are stored, initialize the box of the parts as the ancestor.
2510 if (this->mj_keep_part_boxes) {
2511 output_part_boxes->push_back((*input_part_boxes)[ii]);
2512 }
2513
2514 //set num future_num_parts to maximum in this part.
 // NOTE(review): the maximum is tracked on the count before it is
 // multiplied by atomic_part_count - confirm this is intended.
2515 if (num_future_parts_for_part_iii > future_num_parts)
2516 future_num_parts = num_future_parts_for_part_iii;
2517
2518 }
2519 }
2520 else {
2521 // Add this number to num_partitioning_in_current_dim vector.
2522 num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);
2523
2524 //increase the output number of parts.
2525 output_num_parts += num_partitions_in_current_dim;
2526
 // Same atomic_part_count handling as in the prime-first branch: the
 // by-value parameter stays 1 for the rest of the call once reset.
2527 if((future_num_parts_of_part_ii == atomic_part_count) ||
2528 (future_num_parts_of_part_ii % atomic_part_count != 0)) {
2529 atomic_part_count = 1;
2530 }
2531 //ideal number of future partitions for each part.
2532 mj_part_t ideal_num_future_parts_in_part =
2533 (future_num_parts_of_part_ii / atomic_part_count) /
2534 num_partitions_in_current_dim;
2535 for(mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii) {
2536 mj_part_t num_future_parts_for_part_iii =
2537 ideal_num_future_parts_in_part;
2538
2539 //if there is a remainder in the part increase the part weight.
2540 if(iii < (future_num_parts_of_part_ii / atomic_part_count) %
2541 num_partitions_in_current_dim) {
2542 // if not uniform, add 1 for the extra parts.
2543 ++num_future_parts_for_part_iii;
2544 }
2545
2546 next_future_num_parts_in_parts->push_back(
2547 num_future_parts_for_part_iii * atomic_part_count);
2548
2549 // if part boxes are stored, initialize the box of the parts as
2550 // the ancestor.
2551 if(this->mj_keep_part_boxes) {
2552 output_part_boxes->push_back((*input_part_boxes)[ii]);
2553 }
2554 //set num future_num_parts to maximum in this part.
2555 if(num_future_parts_for_part_iii > future_num_parts)
2556 future_num_parts = num_future_parts_for_part_iii;
2557 }
2558 }
2559 }
2560 }
2561 // move temp std::vector to host view
 // A fresh device view (labelled "test") and its host mirror are allocated
 // on every call that reaches this point.
2562 device_num_partitioning_in_current_dim = Kokkos::View<
2563 mj_part_t*, device_t>("test", num_partitioning_in_current_dim.size());
2564 host_num_partitioning_in_current_dim =
2565 Kokkos::create_mirror_view(device_num_partitioning_in_current_dim);
2566 for(size_t n = 0; n < num_partitioning_in_current_dim.size(); ++n) {
2567 host_num_partitioning_in_current_dim(n) =
2568 num_partitioning_in_current_dim[n];
2569 }
2570 // setup device equivalent - this data is used on host and device and it's
2571 // more efficient to just setup array on both sides now rather than copy
2572 // values as needed later.
2573 Kokkos::deep_copy(device_num_partitioning_in_current_dim,
2574 host_num_partitioning_in_current_dim);
2575 return output_num_parts;
2576}
2577
2578/* \brief Allocates and initializes the work memory that will be used by MJ.
2579 * */
2580template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2581 typename mj_part_t, typename mj_node_t>
2582void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2583 allocate_set_work_memory()
2584{
2585 // Throughout the partitioning execution,
2586 // instead of the moving the coordinates, hold a permutation array for parts.
2587 // coordinate_permutations holds the current permutation.
2588 this->coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>(
2589 Kokkos::ViewAllocateWithoutInitializing("coordinate_permutations"),
2590 this->num_local_coords);
2591 auto local_coordinate_permutations = coordinate_permutations;
2592 Kokkos::parallel_for(
2593 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t> (
2594 0, this->num_local_coords), KOKKOS_LAMBDA (mj_lno_t i) {
2595 local_coordinate_permutations(i) = i;
2596 });
2597
2598 // new_coordinate_permutations holds the current permutation.
2599 this->new_coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>(
2600 Kokkos::ViewAllocateWithoutInitializing("num_local_coords"),
2601 this->num_local_coords);
2602
2603 this->assigned_part_ids = Kokkos::View<mj_part_t*, device_t>(
2604 Kokkos::ViewAllocateWithoutInitializing("assigned parts"), 0);
2605 if(this->num_local_coords > 0) {
2606 this->assigned_part_ids = Kokkos::View<mj_part_t*, device_t>(
2607 Kokkos::ViewAllocateWithoutInitializing("assigned part ids"),
2608 this->num_local_coords);
2609 }
2610
2611 // single partition starts at index-0, and ends at numLocalCoords
2612 // inTotalCounts array holds the end points in coordinate_permutations array
2613 // for each partition. Initially sized 1, and single element is set to
2614 // numLocalCoords.
2615 this->part_xadj = Kokkos::View<mj_lno_t*, device_t>(
2616 Kokkos::ViewAllocateWithoutInitializing("part xadj"), 1);
2617 this->host_part_xadj = Kokkos::create_mirror_view(part_xadj);
2618 host_part_xadj(0) = num_local_coords;
2619 Kokkos::deep_copy(this->part_xadj, host_part_xadj);
2620
2621 // the ends points of the output, this is allocated later.
2622 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
2623 Kokkos::ViewAllocateWithoutInitializing("empty"), 0);
2624
2625 // only store this much if cuts are needed to be stored.
2626 this->all_cut_coordinates = Kokkos::View<mj_scalar_t*, device_t>(
2627 Kokkos::ViewAllocateWithoutInitializing("all cut coordinates"),
2628 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2629
2630 // how much weight percentage should a MPI put left side of the each cutline
2631 this->process_cut_line_weight_to_put_left = Kokkos::View<mj_scalar_t*,
2632 device_t>(Kokkos::ViewAllocateWithoutInitializing("empty"), 0);
2633
2634 // how much weight percentage should each thread in MPI put left side of
2635 // each outline
2636 this->thread_cut_line_weight_to_put_left =
2637 Kokkos::View<mj_scalar_t*, device_t>(
2638 Kokkos::ViewAllocateWithoutInitializing("empty"), 0);
2639
2640 if(this->distribute_points_on_cut_lines) {
2641 this->process_cut_line_weight_to_put_left =
2642 Kokkos::View<mj_scalar_t *, device_t>(
2643 Kokkos::ViewAllocateWithoutInitializing(
2644 "process_cut_line_weight_to_put_left"),
2645 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2646 this->thread_cut_line_weight_to_put_left =
2647 Kokkos::View<mj_scalar_t *, device_t>(
2648 Kokkos::ViewAllocateWithoutInitializing(
2649 "thread_cut_line_weight_to_put_left"),
2650 this->max_num_cut_along_dim);
2651 this->process_rectilinear_cut_weight =
2652 Kokkos::View<mj_scalar_t *, device_t>(
2653 Kokkos::ViewAllocateWithoutInitializing("process_rectilinear_cut_weight"),
2654 this->max_num_cut_along_dim);
2655 this->global_rectilinear_cut_weight =
2656 Kokkos::View<mj_scalar_t *, device_t>(
2657 Kokkos::ViewAllocateWithoutInitializing("global_rectilinear_cut_weight"),
2658 this->max_num_cut_along_dim);
2659 }
2660
2661 // work array to manipulate coordinate of cutlines in different iterations.
2662 // necessary because previous cut line information is used for determining
2663 // the next cutline information. therefore, cannot update the cut work array
2664 // until all cutlines are determined.
2665 this->cut_coordinates_work_array =
2666 Kokkos::View<mj_scalar_t *, device_t>(
2667 Kokkos::ViewAllocateWithoutInitializing("cut_coordinates_work_array"),
2668 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2669
2670 // cumulative part weight array.
2671 this->target_part_weights = Kokkos::View<mj_scalar_t*, device_t>(
2672 Kokkos::ViewAllocateWithoutInitializing("target_part_weights"),
2673 this->max_num_part_along_dim * this->max_concurrent_part_calculation);
2674
2675 // upper bound coordinate of a cut line
2676 this->cut_upper_bound_coordinates =
2677 Kokkos::View<mj_scalar_t*, device_t>(
2678 Kokkos::ViewAllocateWithoutInitializing("cut_upper_bound_coordinates"),
2679 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2680
2681 // lower bound coordinate of a cut line
2682 this->cut_lower_bound_coordinates =
2683 Kokkos::View<mj_scalar_t*, device_t>(
2684 Kokkos::ViewAllocateWithoutInitializing("cut_lower_bound_coordinates"),
2685 this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2686
2687 // lower bound weight of a cut line
2688 this->cut_lower_bound_weights =
2689 Kokkos::View<mj_scalar_t*, device_t>(
2690 Kokkos::ViewAllocateWithoutInitializing("cut_lower_bound_weights"),
2691 this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2692
2693 //upper bound weight of a cut line
2694 this->cut_upper_bound_weights =
2695 Kokkos::View<mj_scalar_t*, device_t>(
2696 Kokkos::ViewAllocateWithoutInitializing("cut_upper_bound_weights"),
2697 this->max_num_cut_along_dim* this->max_concurrent_part_calculation);
2698
2699 // combined array to exchange the min and max coordinate,
2700 // and total weight of part.
2701 this->process_local_min_max_coord_total_weight =
2702 Kokkos::View<mj_scalar_t*, device_t>(
2703 Kokkos::ViewAllocateWithoutInitializing(
2704 "process_local_min_max_coord_total_weight"),
2705 3 * this->max_concurrent_part_calculation);
2706
2707 // global combined array with the results for min, max and total weight.
2708 this->global_min_max_coord_total_weight =
2709 Kokkos::View<mj_scalar_t*, device_t>(
2710 Kokkos::ViewAllocateWithoutInitializing("global_min_max_coord_total_weight"),
2711 3 * this->max_concurrent_part_calculation);
2712
2713 // is_cut_line_determined is used to determine if a cutline is
2714 // determined already. If a cut line is already determined, the next
2715 // iterations will skip this cut line.
2716 this->is_cut_line_determined = Kokkos::View<bool *, device_t>(
2717 Kokkos::ViewAllocateWithoutInitializing("is_cut_line_determined"),
2718 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2719
2720 // incomplete_cut_count count holds the number of cutlines that have not
2721 // been finalized for each part when concurrentPartCount>1, using this
2722 // information, if incomplete_cut_count[x]==0, then no work is done for
2723 // this part.
2724 this->device_incomplete_cut_count = Kokkos::View<mj_part_t *, device_t>(
2725 Kokkos::ViewAllocateWithoutInitializing("device_incomplete_cut_count"),
2726 this->max_concurrent_part_calculation);
2727 this->incomplete_cut_count =
2728 Kokkos::create_mirror_view(device_incomplete_cut_count);
2729
2730 // local part weights of each thread.
2731 this->thread_part_weights = Kokkos::View<double *, device_t>(
2732 Kokkos::ViewAllocateWithoutInitializing("thread_part_weights"),
2733 this->max_num_total_part_along_dim * this->max_concurrent_part_calculation);
2734
2735 this->thread_cut_left_closest_point = Kokkos::View<mj_scalar_t *, device_t>(
2736 Kokkos::ViewAllocateWithoutInitializing("thread_cut_left_closest_point"),
2737 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2738
2739 // thread_cut_right_closest_point to hold the closest coordinate to a
2740 // cutline from right (for each thread)
2741 this->thread_cut_right_closest_point = Kokkos::View<mj_scalar_t *, device_t>(
2742 Kokkos::ViewAllocateWithoutInitializing("thread_cut_right_closest_point"),
2743 this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
2744
2745 // to store how many points in each part a thread has.
2746 this->thread_point_counts = Kokkos::View<mj_lno_t *, device_t>(
2747 Kokkos::ViewAllocateWithoutInitializing("thread_point_counts"),
2748 this->max_num_part_along_dim);
2749
2750 // for faster communication, concatanation of
2751 // totalPartWeights sized 2P-1, since there are P parts and P-1 cut lines
2752 // leftClosest distances sized P-1, since P-1 cut lines
2753 // rightClosest distances size P-1, since P-1 cut lines.
2754 this->total_part_weight_left_right_closests =
2755 Kokkos::View<mj_scalar_t*, device_t>(
2756 Kokkos::ViewAllocateWithoutInitializing(
2757 "total_part_weight_left_right_closests"),
2758 (this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) *
2759 this->max_concurrent_part_calculation);
2760
2761 this->global_total_part_weight_left_right_closests =
2762 Kokkos::View<mj_scalar_t*, device_t>(
2763 Kokkos::ViewAllocateWithoutInitializing(
2764 "global_total_part_weight_left_right_closests"),
2765 (this->max_num_total_part_along_dim +
2766 this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation);
2767
2768 this->current_mj_gnos = Kokkos::View<mj_gno_t*, device_t>(
2769 Kokkos::ViewAllocateWithoutInitializing("gids"), num_local_coords);
2770
2771 this->owner_of_coordinate = Kokkos::View<int *, Kokkos::HostSpace>(
2772 Kokkos::ViewAllocateWithoutInitializing("owner_of_coordinate"),
2773 num_local_coords);
2774
2775 // changes owners back to host - so we don't run them on device
2776 // this improves migration code but means we have to serial init here.
2777 // Note we might allow this to be OpenMP when available even for CUDA.
2778 Kokkos::deep_copy(owner_of_coordinate, myActualRank);
2779
2780 auto local_current_mj_gnos = current_mj_gnos;
2781 auto local_initial_mj_gnos = initial_mj_gnos;
2782 Kokkos::parallel_for(
2783 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2784 (0, num_local_coords), KOKKOS_LAMBDA (mj_lno_t j) {
2785 local_current_mj_gnos(j) = local_initial_mj_gnos(j);
2786 });
2787}
2788
2789/* \brief compute the global bounding box
2790 */
2791template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2792 typename mj_part_t, typename mj_node_t>
2793void AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t,
2794 mj_node_t>::compute_global_box()
2795{
2796 //local min coords
2797 mj_scalar_t *mins = new mj_scalar_t[this->coord_dim];
2798 //global min coords
2799 mj_scalar_t *gmins = new mj_scalar_t[this->coord_dim];
2800 //local max coords
2801 mj_scalar_t *maxs = new mj_scalar_t[this->coord_dim];
2802 //global max coords
2803 mj_scalar_t *gmaxs = new mj_scalar_t[this->coord_dim];
2804
2805 auto local_mj_coordinates = this->mj_coordinates;
2806
2807 // If we are only doing 2 parts then we don't need these values
2808 // for y and z. Init them all to 0 first
2809 for(int i = 0; i < this->coord_dim; ++i) {
2810 mins[i] = 0;
2811 maxs[i] = 0;
2812 }
2813
2814 for(int i = 0; i < std::min(this->recursion_depth, this->coord_dim); ++i) {
2815 Kokkos::parallel_reduce("MinReduce",
2816 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2817 (0, this->num_local_coords),
2818 KOKKOS_LAMBDA(mj_lno_t j, mj_scalar_t & running_min) {
2819 if(local_mj_coordinates(j,i) < running_min) {
2820 running_min = local_mj_coordinates(j,i);
2821 }
2822 }, Kokkos::Min<mj_scalar_t>(mins[i]));
2823 Kokkos::parallel_reduce("MaxReduce",
2824 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2825 (0, this->num_local_coords),
2826 KOKKOS_LAMBDA(mj_lno_t j, mj_scalar_t & running_max) {
2827 if(local_mj_coordinates(j,i) > running_max) {
2828 running_max = local_mj_coordinates(j,i);
2829 }
2830 }, Kokkos::Max<mj_scalar_t>(maxs[i]));
2831 }
2832
2833 reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MIN,
2834 this->coord_dim, mins, gmins
2835 );
2836
2837 reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MAX,
2838 this->coord_dim, maxs, gmaxs
2839 );
2840
2841 //create single box with all areas.
2842 global_box = rcp(new mj_partBox_t(0,this->coord_dim,gmins,gmaxs));
2843 //coordinateModelPartBox <mj_scalar_t, mj_part_t> tmpBox (0, coordDim);
2844 delete [] mins;
2845 delete [] gmins;
2846 delete [] maxs;
2847 delete [] gmaxs;
2848}
2849
2850/* \brief for part communication we keep track of the box boundaries.
2851 * This is performed when either asked specifically, or when geometric mapping
2852 * is performed afterwards.
2853 * This function initializes a single box with all global min, max coordinates.
2854 * \param initial_partitioning_boxes the input and output vector for boxes.
2855 */
2856template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2857 typename mj_part_t, typename mj_node_t>
2858void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
2859 mj_node_t>::init_part_boxes(
2860 RCP<mj_partBoxVector_t> & initial_partitioning_boxes)
2861{
2862 mj_partBox_t tmp_box(*global_box);
2863 initial_partitioning_boxes->push_back(tmp_box);
2864}
2865
2870template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2871 typename mj_part_t,
2872 typename mj_node_t>
2873void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
2874 mj_get_local_min_max_coord_totW(
2875 mj_part_t current_work_part,
2876 mj_part_t current_concurrent_num_parts,
2877 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords)
2878{
2879 auto local_coordinate_permutations = this->coordinate_permutations;
2880 auto local_process_local_min_max_coord_total_weight =
2881 this->process_local_min_max_coord_total_weight;
2882 auto local_mj_weights = this->mj_weights;
2883
2884 bool bUniformWeights = mj_uniform_weights(0);
2885
2886 for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
2887
2888 mj_part_t concurrent_current_part = current_work_part + kk;
2889 mj_lno_t coordinate_begin_index = concurrent_current_part == 0 ? 0 :
2890 host_part_xadj(concurrent_current_part - 1);
2891 mj_lno_t coordinate_end_index =
2892 host_part_xadj(concurrent_current_part);
2893
2894 mj_scalar_t my_min_coord = 0;
2895 mj_scalar_t my_max_coord = 0;
2896 mj_scalar_t my_total_weight;
2897 //if the part is empty.
2898 //set the min and max coordinates as reverse.
2899 if(coordinate_begin_index >= coordinate_end_index)
2900 {
2901 my_min_coord = std::numeric_limits<mj_scalar_t>::max();
2902 my_max_coord = -std::numeric_limits<mj_scalar_t>::max();
2903 my_total_weight = 0;
2904 }
2905 else {
2906 // get min
2907 Kokkos::parallel_reduce("get min",
2908 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2909 (coordinate_begin_index, coordinate_end_index),
2910 KOKKOS_LAMBDA (mj_lno_t j, mj_scalar_t & running_min) {
2911 int i = local_coordinate_permutations(j);
2912 if(mj_current_dim_coords(i) < running_min)
2913 running_min = mj_current_dim_coords(i);
2914 }, Kokkos::Min<mj_scalar_t>(my_min_coord));
2915 // get max
2916 Kokkos::parallel_reduce("get max",
2917 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2918 (coordinate_begin_index, coordinate_end_index),
2919 KOKKOS_LAMBDA (mj_lno_t j, mj_scalar_t & running_max) {
2920 int i = local_coordinate_permutations(j);
2921 if(mj_current_dim_coords(i) > running_max)
2922 running_max = mj_current_dim_coords(i);
2923 }, Kokkos::Max<mj_scalar_t>(my_max_coord));
2924 if(bUniformWeights) {
2925 my_total_weight = coordinate_end_index - coordinate_begin_index;
2926 }
2927 else {
2928 my_total_weight = 0;
2929 Kokkos::parallel_reduce("get weight",
2930 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
2931 (coordinate_begin_index, coordinate_end_index),
2932 KOKKOS_LAMBDA (mj_lno_t j, mj_scalar_t & lsum) {
2933 int i = local_coordinate_permutations(j);
2934 lsum += local_mj_weights(i,0);
2935 }, my_total_weight);
2936 }
2937 }
2938
2939 // single write
2940 Kokkos::parallel_for(
2941 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
2942 (0, 1), KOKKOS_LAMBDA (int dummy) {
2943 local_process_local_min_max_coord_total_weight(kk) =
2944 my_min_coord;
2945 local_process_local_min_max_coord_total_weight(
2946 kk + current_concurrent_num_parts) = my_max_coord;
2947 local_process_local_min_max_coord_total_weight(
2948 kk + 2*current_concurrent_num_parts) = my_total_weight;
2949 });
2950 }
2951}
2952
/* \brief Combines the per-process min / max / total-weight values of the
 * concurrently processed parts into global values.
 *
 * Both views hold 3 * current_concurrent_num_parts entries. When the
 * communicator has more than one process, the local values are copied to a
 * host mirror, reduced with reduceAll using reductionOp, and the result is
 * copied into global_min_max_total. With a single process the local values
 * are copied to the global view element by element.
 *
 * \param current_concurrent_num_parts number of parts processed concurrently.
 * \param local_min_max_total input, the values of this process.
 * \param global_min_max_total output, the combined values.
 */
2965template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
2966 typename mj_part_t, typename mj_node_t>
2967void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
2968 mj_node_t>::mj_get_global_min_max_coord_totW(
2969 mj_part_t current_concurrent_num_parts,
2970 Kokkos::View<mj_scalar_t *, device_t> & local_min_max_total,
2971 Kokkos::View<mj_scalar_t *, device_t> & global_min_max_total) {
2972 // reduce min for first current_concurrent_num_parts elements, reduce
2973 // max for next concurrentPartCount elements, reduce sum for the last
2974 // concurrentPartCount elements.
2975 if(this->comm->getSize() > 1) {
2976 // We're using explicit host here as Spectrum MPI would fail
2977 // with the prior HostMirror UVMSpace to UVMSpace setup.
2978 auto host_local_min_max_total =
2979 Kokkos::create_mirror_view(Kokkos::HostSpace(), local_min_max_total);
2980 auto host_global_min_max_total =
2981 Kokkos::create_mirror_view(Kokkos::HostSpace(), global_min_max_total);
2982 Kokkos::deep_copy(host_local_min_max_total, local_min_max_total);
 // NOTE(review): the type of reductionOp is not visible in this listing (the
 // embedded line numbering jumps from 2982 to 2984, so the line carrying the
 // declaration appears to be missing) - confirm against the upstream header.
 // It is constructed with three counts, presumably the number of min, max
 // and sum entries.
2984 reductionOp(current_concurrent_num_parts,
2985 current_concurrent_num_parts, current_concurrent_num_parts);
2986 try {
2987 reduceAll<int, mj_scalar_t>(
2988 *(this->comm),
2989 reductionOp,
2990 3 * current_concurrent_num_parts,
2991 host_local_min_max_total.data(),
2992 host_global_min_max_total.data());
2993 }
 // NOTE(review): the macro directly follows the try block, so it presumably
 // supplies the catch handlers - confirm against its definition.
2994 Z2_THROW_OUTSIDE_ERROR(*(this->mj_env))
2995 Kokkos::deep_copy(global_min_max_total, host_global_min_max_total);
2996 }
2997 else {
 // Single process: nothing to reduce, copy local to global in a kernel.
2998 mj_part_t s = 3 * current_concurrent_num_parts;
2999 Kokkos::parallel_for(
3000 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
3001 (0, s), KOKKOS_LAMBDA (mj_part_t i) {
3002 global_min_max_total(i) = local_min_max_total(i);
3003 });
3004 }
3005}
3006
3039template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3040 typename mj_part_t, typename mj_node_t>
3041void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
3042 mj_get_initial_cut_coords_target_weights(
3043 mj_scalar_t min_coord,
3044 mj_scalar_t max_coord,
3045 mj_part_t num_cuts/*p-1*/ ,
3046 mj_scalar_t global_weight,
3047 /*p - 1 sized, coordinate of each cut line*/
3048 Kokkos::View<mj_scalar_t *, device_t> & initial_cut_coords,
3049 /*cumulative weights, at left side of each cut line. p-1 sized*/
3050 Kokkos::View<mj_scalar_t *, device_t> & current_target_part_weights ,
3051 std::vector <mj_part_t> *future_num_part_in_parts, //the vecto
3052 std::vector <mj_part_t> *next_future_num_parts_in_parts,
3053 mj_part_t concurrent_current_part,
3054 mj_part_t obtained_part_index,
3055 mj_part_t num_target_first_level_parts,
3056 const Kokkos::View<mj_part_t *, Kokkos::HostSpace> & target_first_level_dist)
3057{
3058 mj_scalar_t coord_range = max_coord - min_coord;
3059
3060 // We decided we could keep some std::vectors around for now. Eventually
3061 // it would be nice to have everything just as views with some being device
3062 // and some host. This particular case needs a bit of work to get setup
3063 // in a cleaner way so not going to mess with it at the moment.
3064
3065 bool bUniformPartsCheck =
3066 num_target_first_level_parts <= 1 && this->mj_uniform_parts(0);
3067
3068 if(!bUniformPartsCheck) {
3069 bool bValidNonUniformTargetWeights =
3070 (num_target_first_level_parts > 1 && target_first_level_dist.size() != 0);
3071 if(!bValidNonUniformTargetWeights) {
3072 std::cerr << "MJ does not support non uniform part weights beyond the first partition" << std::endl;
3073 std::terminate();
3074 }
3075 }
3076
3077 Kokkos::View<mj_scalar_t*, device_t> device_cumulative(
3078 "device_cumulative", num_cuts);
3079 auto host_cumulative = Kokkos::create_mirror_view(device_cumulative);
3080
3081 mj_scalar_t cumulative = 0;
3082
3083 if(bUniformPartsCheck) {
3084 // How many total future parts the part will be partitioned into.
3085 mj_scalar_t total_future_part_count_in_part =
3086 static_cast<mj_scalar_t>((*future_num_part_in_parts)[concurrent_current_part]);
3087
3088 // How much each part should weigh in ideal case.
3089 mj_scalar_t unit_part_weight =
3090 global_weight / total_future_part_count_in_part;
3091
3092 for(mj_part_t i = 0; i < num_cuts; ++i) {
3093 cumulative += unit_part_weight * static_cast<mj_scalar_t>((*next_future_num_parts_in_parts)[i + obtained_part_index]);
3094 host_cumulative(i) = cumulative;
3095 }
3096 }
3097 else {
3098 // Sum of entries in the first level partition distribution vector
3099 mj_scalar_t sum_target_first_level_dist = 0.0;
3100 for (int i = 0; i < num_target_first_level_parts; ++i) {
3101 sum_target_first_level_dist += target_first_level_dist(i);
3102 }
3103
3104 for(mj_part_t i = 0; i < num_cuts; ++i) {
3105 cumulative += global_weight * target_first_level_dist(i) /
3106 sum_target_first_level_dist;
3107 host_cumulative(i) = cumulative;
3108 }
3109 }
3110
3111 Kokkos::deep_copy(device_cumulative, host_cumulative);
3112
3113 Kokkos::parallel_for("Write num in parts",
3114 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
3115 (0, num_cuts), KOKKOS_LAMBDA(mj_part_t cut) {
3116 // set target part weight.
3117 current_target_part_weights(cut) = device_cumulative(cut);
3118 initial_cut_coords(cut) = min_coord +
3119 (coord_range * device_cumulative(cut)) / global_weight;
3120 // set this multiple times but here for device handling
3121 current_target_part_weights(num_cuts) = global_weight;
3122 });
3123
3124 // round the target part weights.
3125 // Note need to discuss regarding DragonFly commits and determine if we
3126 // would not simply check mj_uniform_weights here.
3127 if (!bUniformPartsCheck || this->mj_uniform_weights[0]) {
3128 Kokkos::parallel_for(
3129 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
3130 (0, num_cuts + 1),
3131 KOKKOS_LAMBDA (mj_part_t i) {
3132 current_target_part_weights(i) =
3133 long(current_target_part_weights(i) + 0.5);
3134 });
3135 }
3136}
3137
3154template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3155 typename mj_part_t, typename mj_node_t>
3156void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
3157 set_initial_coordinate_parts(
3158 mj_scalar_t &max_coordinate,
3159 mj_scalar_t &min_coordinate,
3160 mj_lno_t coordinate_begin_index,
3161 mj_lno_t coordinate_end_index,
3162 Kokkos::View<mj_lno_t *, device_t> & mj_current_coordinate_permutations,
3163 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
3164 Kokkos::View<mj_part_t *, device_t> & mj_part_ids,
3165 mj_part_t &partition_count)
3166{
3167 mj_scalar_t coordinate_range = max_coordinate - min_coordinate;
3168
3169 // if there is single point, or if all points are along a line.
3170 // set initial part to 0 for all.
3171 if(std::abs(coordinate_range) < this->sEpsilon ) {
3172 Kokkos::parallel_for(
3173 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
3174 (coordinate_begin_index, coordinate_end_index),
3175 KOKKOS_LAMBDA (mj_lno_t ii) {
3176 mj_part_ids(mj_current_coordinate_permutations[ii]) = 0;
3177 });
3178 }
3179 else {
3180 // otherwise estimate an initial part for each coordinate.
3181 // assuming uniform distribution of points.
3182 mj_scalar_t slice = coordinate_range / partition_count;
3183 Kokkos::parallel_for(
3184 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
3185 (coordinate_begin_index, coordinate_end_index),
3186 KOKKOS_LAMBDA (mj_lno_t ii) {
3187 mj_lno_t iii = mj_current_coordinate_permutations[ii];
3188 mj_part_t pp =
3189 mj_part_t((mj_current_dim_coords[iii] - min_coordinate) / slice);
3190 if(pp >= partition_count) {
3191 pp = partition_count - 1; // don't want last coord in an invalid part
3192 }
3193 mj_part_ids[iii] = 2 * pp;
3194 });
3195 }
3196}
3197
/*! \brief Iteratively refines the cut coordinates of the concurrently
 * processed parts along one dimension until no cut is left incomplete.
 *
 * Each iteration computes part weights for the current cut positions,
 * combines them across processes when there is more than one, and calls
 * mj_get_new_cut_coordinates once per part that still has open cuts.
 *
 * \param mj_current_dim_coords coordinates handed to
 * mj_1D_part_get_part_weights.
 * \param used_imbalance_tolerance tolerance forwarded to
 * mj_get_new_cut_coordinates.
 * \param current_work_part index of the first concurrently processed part.
 * \param current_concurrent_num_parts number of parts processed together.
 * \param current_cut_coordinates cut coordinates; aliased by temp_cut_coords
 * at entry.
 * \param total_incomplete_cut_count number of cuts still to be determined;
 * the loop runs until it reaches zero.
 * \param view_rectilinear_cut_count entry 0 is reset to 0 and the view is
 * forwarded to mj_get_new_cut_coordinates.
 * \param view_total_reduction_size entry 0 is set to the sum of
 * (4 * num_cuts + 1) over the concurrent parts and used as the element
 * count of the reduceAll call.
 */
3212template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
3213 typename mj_part_t, typename mj_node_t>
3214void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,mj_node_t>::mj_1D_part(
3215 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
3216 double used_imbalance_tolerance,
3217 mj_part_t current_work_part,
3218 mj_part_t current_concurrent_num_parts,
3219 Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
3220 mj_part_t total_incomplete_cut_count,
3221 Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count,
3222 Kokkos::View<size_t*, device_t> & view_total_reduction_size)
3223{
3224 this->temp_cut_coords = current_cut_coordinates;
3225
  // NOTE(review): the line naming the pointee type of reductionOp is not
  // present in this listing. The pointer stays NULL in the single process
  // case and is released with delete at the end of the function.
3227 *reductionOp = NULL;
3228
3229 bool bSingleProcess = (this->comm->getSize() == 1);
3230
  // Host copy of the per-part partition counts; its address is handed to
  // the reduction operator below.
3231 std::vector<mj_part_t> temp(host_num_partitioning_in_current_dim.size());
3232 if(!bSingleProcess) {
3233 for(size_t n = 0; n < host_num_partitioning_in_current_dim.size(); ++n) {
3234 temp[n] = host_num_partitioning_in_current_dim(n);
3235 }
  // NOTE(review): the line that assigns reductionOp (start of this
  // expression) is not present in this listing.
3237 <mj_part_t, mj_scalar_t>(
3238 &temp,
3239 current_work_part,
3240 current_concurrent_num_parts);
3241 }
3242
  // Local handles to the member views. Presumably this lets the
  // KOKKOS_LAMBDA bodies below capture the views by value instead of
  // capturing this -- confirm.
3243 auto local_cut_lower_bound_coordinates =
3244 cut_lower_bound_coordinates;
3245 auto local_cut_upper_bound_coordinates =
3246 cut_upper_bound_coordinates;
3247 auto local_cut_upper_bound_weights = cut_upper_bound_weights;
3248 auto local_cut_lower_bound_weights = cut_lower_bound_weights;
3249 bool local_distribute_points_on_cut_lines = distribute_points_on_cut_lines;
3250 auto local_process_cut_line_weight_to_put_left =
3251 process_cut_line_weight_to_put_left;
3252 auto local_temp_cut_coords = temp_cut_coords;
3253 auto local_global_total_part_weight_left_right_closests =
3254 global_total_part_weight_left_right_closests;
3255 auto local_cut_coordinates_work_array =
3256 cut_coordinates_work_array;
3257 auto local_part_xadj = part_xadj;
3258 auto local_global_min_max_coord_total_weight =
3259 global_min_max_coord_total_weight;
3260 auto local_target_part_weights =
3261 target_part_weights;
3262 auto local_global_rectilinear_cut_weight =
3263 global_rectilinear_cut_weight;
3264 auto local_process_rectilinear_cut_weight =
3265 process_rectilinear_cut_weight;
3266
3267 auto local_is_cut_line_determined = this->is_cut_line_determined;
3268 auto local_device_num_partitioning_in_current_dim =
3269 device_num_partitioning_in_current_dim;
3270
  // Single-iteration kernel: resets the counters and gives every cut its
  // initial search bounds.
3271 Kokkos::parallel_for(
3272 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
3273 KOKKOS_LAMBDA (int dummy) {
3274
3275 // these need to be initialized
3276 view_rectilinear_cut_count(0) = 0;
3277 view_total_reduction_size(0) = 0;
3278
3279 // initialize the lower and upper bounds of the cuts.
3280 mj_part_t next = 0;
3281 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i) {
3282 mj_part_t num_part_in_dim =
3283 local_device_num_partitioning_in_current_dim(current_work_part + i);
3284 mj_part_t num_cut_in_dim = num_part_in_dim - 1;
  // 4 * cuts + 1 matches the per-part advance of tlr_shift further below
  // (num_parts + num_cuts weights plus two closest-point entries per cut).
3285 view_total_reduction_size(0) += (4 * num_cut_in_dim + 1);
3286
3287 for(mj_part_t ii = 0; ii < num_cut_in_dim; ++ii) {
3288 local_is_cut_line_determined(next) = false;
3289 // min coordinate
3290 local_cut_lower_bound_coordinates(next) =
3291 local_global_min_max_coord_total_weight(i);
3292 // max coordinate
3293 local_cut_upper_bound_coordinates(next) =
3294 local_global_min_max_coord_total_weight(
3295 i + current_concurrent_num_parts);
3296 // total weight
3297 local_cut_upper_bound_weights(next) =
3298 local_global_min_max_coord_total_weight(
3299 i + 2 * current_concurrent_num_parts);
3300 local_cut_lower_bound_weights(next) = 0;
3301 if(local_distribute_points_on_cut_lines) {
3302 local_process_cut_line_weight_to_put_left(next) = 0;
3303 }
3304 ++next;
3305 }
3306 }
3307 });
3308
3309 // loop_count allows the kernel to behave differently on the first loop
3310 // and subsequent loops. First loop we do a binary search and subsequent
3311 // loops we simply step towards our target.
3312 int loop_count = 0;
3313 while (total_incomplete_cut_count != 0) {
3314 this->mj_1D_part_get_part_weights(
3315 current_concurrent_num_parts,
3316 current_work_part,
3317 mj_current_dim_coords,
3318 loop_count);
3319 ++loop_count;
3320
3321 this->mj_combine_rightleft_and_weights(
3322 current_work_part,
3323 current_concurrent_num_parts);
3324
3325 // now sum up the results of mpi processors.
3326 if(!bSingleProcess) {
3327 // We're using explicit host here as Spectrum MPI would fail
3328 // with the prior HostMirror UVMSpace to UVMSpace setup.
3329 auto host_total_part_weight_left_right_closests =
3330 Kokkos::create_mirror_view(Kokkos::HostSpace(),
3331 total_part_weight_left_right_closests);
3332 auto host_global_total_part_weight_left_right_closests =
3333 Kokkos::create_mirror_view(Kokkos::HostSpace(),
3334 global_total_part_weight_left_right_closests);
3335
3336 Kokkos::deep_copy(host_total_part_weight_left_right_closests,
3337 total_part_weight_left_right_closests);
3338
  // Fetch the single size_t held in the device view via a one-iteration
  // reduction.
3339 size_t host_view_total_reduction_size;
3340 Kokkos::parallel_reduce("Read single",
3341 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
3342 KOKKOS_LAMBDA(int dummy, size_t & set_single) {
3343 set_single = view_total_reduction_size(0);
3344 }, host_view_total_reduction_size);
3345
3346 reduceAll<int, mj_scalar_t>( *(this->comm), *reductionOp,
3347 host_view_total_reduction_size,
3348 host_total_part_weight_left_right_closests.data(),
3349 host_global_total_part_weight_left_right_closests.data());
3350 Kokkos::deep_copy(global_total_part_weight_left_right_closests,
3351 host_global_total_part_weight_left_right_closests);
3352 }
3353 else {
  // Single process: view assignment only re-points the handle, so the
  // "global" totals read below are the local totals themselves.
3354 local_global_total_part_weight_left_right_closests =
3355 this->total_part_weight_left_right_closests;
3356 }
3357
3358 // how much cut will be shifted for the next part in the concurrent
3359 // part calculation.
3360 mj_part_t cut_shift = 0;
3361
3362 // how much the concatenated array will be shifted for the next part
3363 // in concurrent part calculation.
3364 size_t tlr_shift = 0;
3365
  // Number of open cuts of each part before this iteration's update; used
  // after the kk loop to find how many cuts were completed.
3366 Kokkos::View<mj_part_t*, Kokkos::HostSpace>
3367 save_initial_incomplete_cut_count("save_initial_incomplete_cut_count",
3368 current_concurrent_num_parts);
3369
3370 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
3371
3372 mj_part_t num_parts =
3373 host_num_partitioning_in_current_dim(current_work_part + kk);
3374
3375 mj_part_t num_cuts = num_parts - 1;
3376 size_t num_total_part = num_parts + size_t (num_cuts);
3377
3378 //if the cuts of this part have already been completed.
3379 //nothing to do for this part.
3380 //just update the shift amount and proceed.
3381 mj_part_t kk_incomplete_cut_count = this->incomplete_cut_count(kk);
3382
3383 if(kk_incomplete_cut_count == 0) {
3384 cut_shift += num_cuts;
3385 tlr_shift += (num_total_part + 2 * num_cuts);
3386 continue;
3387 }
3388
  // The subviews below all start at this part's offset and extend to the
  // end of the underlying view; the callee is given num_cuts separately.
3389 Kokkos::View<mj_scalar_t *, device_t> current_local_part_weights =
3390 Kokkos::subview(this->total_part_weight_left_right_closests,
3391 std::pair<mj_lno_t, mj_lno_t>(
3392 tlr_shift,
3393 this->total_part_weight_left_right_closests.size()));
3394
3395 Kokkos::View<mj_scalar_t *, device_t> current_global_tlr =
3396 Kokkos::subview(
3397 local_global_total_part_weight_left_right_closests,
3398 std::pair<mj_lno_t, mj_lno_t>(
3399 tlr_shift,
3400 local_global_total_part_weight_left_right_closests.size()));
  // Within a part's segment: num_total_part weight entries first, then the
  // left closest points, then (num_cuts later) the right closest points.
3401 Kokkos::View<mj_scalar_t *, device_t>
3402 current_global_left_closest_points =
3403 Kokkos::subview(current_global_tlr,
3404 std::pair<mj_lno_t, mj_lno_t>(
3405 num_total_part,
3406 current_global_tlr.size()));
3407 Kokkos::View<mj_scalar_t *, device_t>
3408 current_global_right_closest_points =
3409 Kokkos::subview(current_global_tlr,
3410 std::pair<mj_lno_t, mj_lno_t>(
3411 num_total_part + num_cuts,
3412 current_global_tlr.size()));
3413 Kokkos::View<mj_scalar_t *, device_t> current_global_part_weights =
3414 current_global_tlr;
3415
3416 Kokkos::View<bool *, device_t> current_cut_line_determined =
3417 Kokkos::subview(this->is_cut_line_determined,
3418 std::pair<mj_lno_t, mj_lno_t>(
3419 cut_shift,
3420 this->is_cut_line_determined.size()));
  // Offset cut_shift + kk: presumably each earlier part occupies
  // num_cuts + 1 target weight entries -- confirm against the writer.
3421 Kokkos::View<mj_scalar_t *, device_t> current_part_target_weights =
3422 Kokkos::subview(local_target_part_weights,
3423 std::pair<mj_lno_t, mj_lno_t>(
3424 cut_shift + kk,
3425 local_target_part_weights.size()));
3426 Kokkos::View<mj_scalar_t *, device_t>
3427 current_part_cut_line_weight_to_put_left =
3428 Kokkos::subview(local_process_cut_line_weight_to_put_left,
3429 std::pair<mj_lno_t, mj_lno_t>(
3430 cut_shift,
3431 local_process_cut_line_weight_to_put_left.size()));
3432
3433 save_initial_incomplete_cut_count(kk) =
3434 kk_incomplete_cut_count;
3435
3436 Kokkos::View<mj_scalar_t *, device_t>
3437 current_cut_lower_bound_weights =
3438 Kokkos::subview(local_cut_lower_bound_weights,
3439 std::pair<mj_lno_t, mj_lno_t>(
3440 cut_shift,
3441 local_cut_lower_bound_weights.size()));
3442 Kokkos::View<mj_scalar_t *, device_t> current_cut_upper_weights =
3443 Kokkos::subview(local_cut_upper_bound_weights,
3444 std::pair<mj_lno_t, mj_lno_t>(
3445 cut_shift,
3446 local_cut_upper_bound_weights.size()));
3447 Kokkos::View<mj_scalar_t *, device_t> current_cut_upper_bounds =
3448 Kokkos::subview(local_cut_upper_bound_coordinates,
3449 std::pair<mj_lno_t, mj_lno_t>(
3450 cut_shift,
3451 local_cut_upper_bound_coordinates.size()));
3452 Kokkos::View<mj_scalar_t *, device_t> current_cut_lower_bounds =
3453 Kokkos::subview(local_cut_lower_bound_coordinates,
3454 std::pair<mj_lno_t, mj_lno_t>(
3455 cut_shift,
3456 local_cut_lower_bound_coordinates.size()));
3457
3458 // Now compute the new cut coordinates.
3459 Kokkos::View<mj_scalar_t*, device_t> sub_temp_cut_coords =
3460 Kokkos::subview(this->temp_cut_coords,
3461 std::pair<mj_lno_t, mj_lno_t>(
3462 cut_shift, this->temp_cut_coords.size()));
3463 Kokkos::View<mj_scalar_t*, device_t> sub_cut_coordinates_work_array =
3464 Kokkos::subview(this->cut_coordinates_work_array,
3465 std::pair<mj_lno_t, mj_lno_t>(
3466 cut_shift, this->cut_coordinates_work_array.size()));
3467
3468 this->mj_get_new_cut_coordinates(
3469 current_concurrent_num_parts,
3470 kk,
3471 num_cuts,
3472 used_imbalance_tolerance,
3473 current_global_part_weights,
3474 current_local_part_weights,
3475 current_part_target_weights,
3476 current_cut_line_determined,
3477 sub_temp_cut_coords,
3478 current_cut_upper_bounds,
3479 current_cut_lower_bounds,
3480 current_global_left_closest_points,
3481 current_global_right_closest_points,
3482 current_cut_lower_bound_weights,
3483 current_cut_upper_weights,
3484 sub_cut_coordinates_work_array,
3485 current_part_cut_line_weight_to_put_left,
3486 view_rectilinear_cut_count);
3487
3488 cut_shift += num_cuts;
3489 tlr_shift += (num_total_part + 2 * num_cuts);
3490 } // end of kk loop
3491
  // Subtract the cuts completed in this iteration from the loop counter.
3492 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
3493 mj_part_t iteration_complete_cut_count =
3494 save_initial_incomplete_cut_count(kk) - this->incomplete_cut_count(kk);
3495 total_incomplete_cut_count -= iteration_complete_cut_count;
3496 }
3497
  // Exchange the contents of temp_cut_coords and the work array element by
  // element.
3498 Kokkos::parallel_for(
3499 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
3500 (0, local_temp_cut_coords.size()), KOKKOS_LAMBDA(int n) {
3501 auto t = local_temp_cut_coords(n);
3502 local_temp_cut_coords(n) = local_cut_coordinates_work_array(n);
3503 local_cut_coordinates_work_array(n) = t;
3504 });
3505 } // end of the while loop
3506
3507 // Needed only if keep_cuts; otherwise can simply swap array pointers
3508 // cutCoordinates and cutCoordinatesWork.
3509 // (at first iteration, cutCoordinates == cutCoordinates_tmp).
3510 // computed cuts must be in cutCoordinates.
  // NOTE(review): local_temp_cut_coords was taken from temp_cut_coords,
  // which was set to current_cut_coordinates at entry, and neither handle
  // is reassigned in the code visible here. This condition therefore looks
  // like it cannot be true -- confirm.
3511 if(current_cut_coordinates != local_temp_cut_coords) {
3512 Kokkos::parallel_for(
3513 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
3514 (0, 1), KOKKOS_LAMBDA(int dummy) {
3515 mj_part_t next = 0;
3516 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i) {
3517 mj_part_t num_parts = -1;
3518 num_parts = local_device_num_partitioning_in_current_dim(
3519 current_work_part + i);
3520 mj_part_t num_cuts = num_parts - 1;
3521 for(mj_part_t ii = 0; ii < num_cuts; ++ii) {
3522 current_cut_coordinates(next + ii) = local_temp_cut_coords(next + ii);
3523 }
3524 next += num_cuts;
3525 }
3526 for(int n = 0; n <
3527 static_cast<int>(local_cut_coordinates_work_array.size()); ++n) {
3528 local_cut_coordinates_work_array(n) = local_temp_cut_coords(n);
3529 }
3530 });
3531 }
3532
  // Safe in the single process case too: reductionOp is still NULL there.
3533 delete reductionOp;
3534}
3535
// Wrapper around a raw scalar_t pointer. Elsewhere in this file objects of
// this type are handed to Kokkos::parallel_reduce as the per-thread
// accumulator, with ptr addressing an array kept in team scratch memory.
// NOTE(review): the struct header line, the default constructor line and
// the operator= signature line are not present in this listing.
3536template<class scalar_t>
  // The array this object refers to; not owned by the wrapper.
3538 scalar_t * ptr;
3539
3540 // With new kokkos setup parallel_reduce will call empty constructor and
3541 // we update the ptr in the init method.
3542 KOKKOS_INLINE_FUNCTION
3544
  // Wraps an existing array.
3545 KOKKOS_INLINE_FUNCTION
3546 Zoltan2_MJArrayType(scalar_t * pSetPtr) : ptr(pSetPtr) {};
3547
  // Copies only the pointer, so both objects refer to the same array.
3549 ptr = zmj.ptr;
3550 return *this;
3551 }
3552};
3553
3554#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
3555
// Reducer over an array made of two segments: value_count_weights entries
// that are summed, followed by value_count_rightleft entries organised as
// pairs, where the first entry of a pair keeps the maximum and the second
// keeps the minimum.
// NOTE(review): the struct header, the typedef lines, several member
// declarations and two function/loop headers are not present in this
// listing.
3556template<class policy_t, class scalar_t, class part_t>
3558
  // Magnitude used by init() to seed the pair entries (-max_scalar for the
  // maximum slot, +max_scalar for the minimum slot).
3561 scalar_t max_scalar;
3565
  // Stores the address of val; init() later copies val's pointer into the
  // values it initialises.
3566 KOKKOS_INLINE_FUNCTION ArrayCombinationReducer(
3567 scalar_t mj_max_scalar,
3568 value_type &val,
3569 int mj_value_count_rightleft,
3570 int mj_value_count_weights) :
3571 max_scalar(mj_max_scalar),
3572 value(&val),
3573 value_count_rightleft(mj_value_count_rightleft),
3574 value_count_weights(mj_value_count_weights)
3575 {}
3576
  // Returns the value object supplied to the constructor.
3577 KOKKOS_INLINE_FUNCTION
3579 return *value;
3580 }
3581
  // Merges src into dst: weights are added, pair entries keep max / min.
  // The first and the last pair of the second segment are not touched.
3582 KOKKOS_INLINE_FUNCTION
3583 void join(value_type& dst, const value_type& src) const {
3584 for(int n = 0; n < value_count_weights; ++n) {
3585 dst.ptr[n] += src.ptr[n];
3586 }
3587
3588 for(int n = value_count_weights + 2;
3589 n < value_count_weights + value_count_rightleft - 2; n += 2) {
3590 if(src.ptr[n] > dst.ptr[n]) {
3591 dst.ptr[n] = src.ptr[n];
3592 }
3593 if(src.ptr[n+1] < dst.ptr[n+1]) {
3594 dst.ptr[n+1] = src.ptr[n+1];
3595 }
3596 }
3597 }
3598
  // Same merge as above for volatile-qualified arguments.
3599 KOKKOS_INLINE_FUNCTION
3600 void join (volatile value_type& dst, const volatile value_type& src) const {
3601 for(int n = 0; n < value_count_weights; ++n) {
3602 dst.ptr[n] += src.ptr[n];
3603 }
3604
3605 for(int n = value_count_weights + 2;
3606 n < value_count_weights + value_count_rightleft - 2; n += 2) {
3607 if(src.ptr[n] > dst.ptr[n]) {
3608 dst.ptr[n] = src.ptr[n];
3609 }
3610 if(src.ptr[n+1] < dst.ptr[n+1]) {
3611 dst.ptr[n+1] = src.ptr[n+1];
3612 }
3613 }
3614 }
3615
  // Points dst at the array of the constructor-supplied value, zeroes the
  // weight entries and seeds the pairs with -max_scalar / +max_scalar.
3616 KOKKOS_INLINE_FUNCTION void init (value_type& dst) const {
3617 dst.ptr = value->ptr; // must update ptr
3618
3619 for(int n = 0; n < value_count_weights; ++n) {
3620 dst.ptr[n] = 0;
3621 }
3622
  // NOTE(review): the loop condition and increment line is not present in
  // this listing.
3623 for(int n = value_count_weights;
3625 dst.ptr[n] = -max_scalar;
3626 dst.ptr[n+1] = max_scalar;
3627 }
3628 }
3629};
3630#endif // !KOKKOS_ENABLE_CUDA && !KOKKOS_ENABLE_HIP
3631
3632template<class policy_t, class scalar_t, class part_t, class index_t,
3633 class device_t, class array_t>
3635 typedef typename policy_t::member_type member_type;
3636 typedef Kokkos::View<scalar_t*> scalar_view_t;
3637
3638#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
3639 typedef array_t value_type[];
3640#endif
3641
3643 array_t max_scalar;
3644
3652 Kokkos::View<index_t*, device_t> permutations;
3653 Kokkos::View<scalar_t *, device_t> coordinates;
3654 Kokkos::View<scalar_t**, device_t> weights;
3655 Kokkos::View<part_t*, device_t> parts;
3656 Kokkos::View<scalar_t *, device_t> cut_coordinates;
3657 Kokkos::View<index_t *, device_t> part_xadj;
3659 scalar_t sEpsilon;
3660
3661#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3662 Kokkos::View<double *, device_t> current_part_weights;
3663 Kokkos::View<scalar_t *, device_t> current_left_closest;
3664 Kokkos::View<scalar_t *, device_t> current_right_closest;
3665#endif // KOKKOS_ENABLE_CUDA || defined(KOKKOS_ENABLE_HIP)
3666
3668 int mj_loop_count,
3669 array_t mj_max_scalar,
3670 part_t mj_concurrent_current_part,
3671 part_t mj_num_cuts,
3672 part_t mj_current_work_part,
3673 part_t mj_current_concurrent_num_parts,
3674 part_t mj_left_right_array_size,
3675 part_t mj_weight_array_size,
3676 Kokkos::View<index_t*, device_t> & mj_permutations,
3677 Kokkos::View<scalar_t *, device_t> & mj_coordinates,
3678 Kokkos::View<scalar_t**, device_t> & mj_weights,
3679 Kokkos::View<part_t*, device_t> & mj_parts,
3680 Kokkos::View<scalar_t *, device_t> & mj_cut_coordinates,
3681 Kokkos::View<index_t *, device_t> & mj_part_xadj,
3682 bool mj_uniform_weights0,
3683 scalar_t mj_sEpsilon
3684#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3685 ,Kokkos::View<double *, device_t> & mj_current_part_weights,
3686 Kokkos::View<scalar_t *, device_t> & mj_current_left_closest,
3687 Kokkos::View<scalar_t *, device_t> & mj_current_right_closest
3688#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3689 ) :
3690 loop_count(mj_loop_count),
3691 max_scalar(mj_max_scalar),
3692 concurrent_current_part(mj_concurrent_current_part),
3693 num_cuts(mj_num_cuts),
3694 current_work_part(mj_current_work_part),
3695 current_concurrent_num_parts(mj_current_concurrent_num_parts),
3696 value_count_rightleft(mj_left_right_array_size),
3697 value_count_weights(mj_weight_array_size),
3698 value_count(mj_weight_array_size+mj_left_right_array_size),
3699 permutations(mj_permutations),
3700 coordinates(mj_coordinates),
3701 weights(mj_weights),
3702 parts(mj_parts),
3703 cut_coordinates(mj_cut_coordinates),
3704 part_xadj(mj_part_xadj),
3705 uniform_weights0(mj_uniform_weights0),
3706 sEpsilon(mj_sEpsilon)
3707#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3708 ,current_part_weights(mj_current_part_weights),
3709 current_left_closest(mj_current_left_closest),
3710 current_right_closest(mj_current_right_closest)
3711#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3712 {
3713 }
3714
3715 size_t team_shmem_size (int team_size) const {
3716#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3717 int result = sizeof(array_t) *
3719#else
3720 int result = sizeof(array_t) *
3722#endif
3723
3724 // pad this to a multiple of 8 or it will run corrupt
3725 int remainder = result % 8;
3726 if(remainder != 0) {
3727 result += 8 - remainder;
3728 }
3729 return result;
3730 }
3731
3732 KOKKOS_INLINE_FUNCTION
3733#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3734 void operator() (const member_type & teamMember) const {
3735#else
3736 void operator() (const member_type & teamMember, value_type teamSum) const {
3737#endif
3738
3739 index_t all_begin = (concurrent_current_part == 0) ? 0 :
3741 index_t all_end = part_xadj(concurrent_current_part);
3742
3743 index_t num_working_points = all_end - all_begin;
3744 int num_teams = teamMember.league_size();
3745
3746 index_t stride = num_working_points / num_teams;
3747 if((num_working_points % num_teams) > 0) {
3748 stride += 1; // make sure we have coverage for the final points
3749 }
3750
3751 // the last team may have less work than the other teams
3752 // the last team can be empty (begin > end) if num_teams > stride
3753 // which is true for many teams and small numbers of coords (tests)
3754 index_t begin = all_begin + stride * teamMember.league_rank();
3755 index_t end = begin + stride;
3756 if(end > all_end) {
3757 end = all_end;
3758 }
3759
3760#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3761 size_t sh_mem_size = sizeof(array_t) * (value_count_weights +
3763
3764 array_t * shared_ptr = (array_t *) teamMember.team_shmem().get_shmem(
3765 sh_mem_size);
3766
3767 // init the shared array to 0
3768 Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
3769 for(int n = 0; n < value_count_weights; ++n) {
3770 shared_ptr[n] = 0;
3771 }
3772 for(int n = value_count_weights;
3774 shared_ptr[n] = -max_scalar;
3775 shared_ptr[n+1] = max_scalar;
3776 }
3777 });
3778 teamMember.team_barrier();
3779
3780 Kokkos::parallel_for(
3781 Kokkos::TeamThreadRange(teamMember, begin, end),
3782 [=] (index_t ii) {
3783#else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3784 // create the team shared data - each thread gets one of the arrays
3785 size_t sh_mem_size = sizeof(array_t) * (value_count_weights +
3786 value_count_rightleft) * teamMember.team_size();
3787
3788 array_t * shared_ptr = (array_t *) teamMember.team_shmem().get_shmem(
3789 sh_mem_size);
3790
3791 // select the array for this thread
3792 Zoltan2_MJArrayType<array_t> array(&shared_ptr[teamMember.team_rank() *
3794
3795 // create reducer which handles the Zoltan2_MJArrayType class
3797 max_scalar, array,
3800
3801 Kokkos::parallel_reduce(
3802 Kokkos::TeamThreadRange(teamMember, begin, end),
3803 [=] (size_t ii, Zoltan2_MJArrayType<array_t>& threadSum) {
3804#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3805
3806 int i = permutations(ii);
3807 scalar_t coord = coordinates(i);
3808 array_t w = uniform_weights0 ? 1 : (array_t) weights(i,0);
3809
3810 // now check each part and it's right cut
3811 index_t part = parts(i)/2;
3812
3813 int upper = num_cuts;
3814 int lower = 0;
3815
3816 // binary search - find matching part
3817 while(true) {
3818 scalar_t a = (part == 0) ? -max_scalar : cut_coordinates(part-1);
3819 scalar_t b = (part == num_cuts) ? max_scalar : cut_coordinates(part);
3820
3821 if(coord >= a + sEpsilon && coord <= b - sEpsilon) {
3822#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3823 Kokkos::atomic_add(&shared_ptr[part*2], w);
3824#else
3825 threadSum.ptr[part*2] += w;
3826#endif
3827
3828 parts(i) = part*2;
3829
3830 // now handle the left/right closest part
3831#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3832 array_t new_value = (array_t) coord;
3833 array_t prev_value = shared_ptr[value_count_weights + part * 2 + 1];
3834 while(new_value < prev_value) {
3835 prev_value = Kokkos::atomic_compare_exchange(
3836 &shared_ptr[value_count_weights + part * 2 + 1],
3837 prev_value, new_value);
3838 }
3839 prev_value = shared_ptr[value_count_weights + part * 2 + 2];
3840 while(new_value > prev_value) {
3841 prev_value = Kokkos::atomic_compare_exchange(
3842 &shared_ptr[value_count_weights + part * 2 + 2],
3843 prev_value, new_value);
3844 }
3845#else
3846 // note cut to left needs to set right closest and cut to right needs
3847 // to set left closest. It's index +1 and +2 instead of -1 and +0
3848 // because right/left segment is padded with an extra pair at
3849 // begining and end to avoid branching with if checks.
3850 if(coord < threadSum.ptr[value_count_weights + part * 2 + 1]) {
3851 threadSum.ptr[value_count_weights + part * 2 + 1] = coord;
3852 }
3853 if(coord > threadSum.ptr[value_count_weights + part * 2 + 2]) {
3854 threadSum.ptr[value_count_weights + part * 2 + 2] = coord;
3855 }
3856#endif
3857
3858 break;
3859 }
3860 else if(part != num_cuts) {
3861 if(coord < b + sEpsilon && coord > b - sEpsilon) {
3862 // Note if on cut we set right/left closest to the cut itself
3863 // but we add +2 because we buffered the area with an extra slot
3864 // to reduce cuda branching. So it's +2, +3 instead of +0, +1.
3865#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3866 Kokkos::atomic_add(&shared_ptr[part*2+1], w);
3867 shared_ptr[value_count_weights + part * 2 + 2] = b;
3868 shared_ptr[value_count_weights + part * 2 + 3] = b;
3869#else
3870 threadSum.ptr[part*2+1] += w;
3871 threadSum.ptr[value_count_weights + part * 2 + 2] = b;
3872 threadSum.ptr[value_count_weights + part * 2 + 3] = b;
3873#endif
3874
3875 parts(i) = part*2+1;
3876
3877 // Need to scan up for any other cuts of same coordinate
3878 // This is costly but it's only relevant for the fix4785 test
3879 // which loads a lot of coordinates on the same point, so without
3880 // this our cuts would all just sit at 0.
3881 part_t base_b = part;
3882 scalar_t base_coord = cut_coordinates(base_b);
3883 part += 1;
3884 while(part < num_cuts) {
3885 b = cut_coordinates(part);
3886 scalar_t delta = b - base_coord;
3887 if(delta < 0) delta = -delta;
3888 if(delta < sEpsilon) {
3889 // Note if on cut we set right/left closest to the cut itself
3890 // but we add +2 because we buffered the area with an extra slot
3891 // to reduce cuda branching. So it's +2, +3 instead of +0, +1.
3892#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3893 Kokkos::atomic_add(&shared_ptr[part*2+1], w);
3894 shared_ptr[value_count_weights + part * 2 + 2] = b;
3895 shared_ptr[value_count_weights + part * 2 + 3] = b;
3896#else
3897 threadSum.ptr[part*2+1] += w;
3898 threadSum.ptr[value_count_weights + part * 2 + 2] = b;
3899 threadSum.ptr[value_count_weights + part * 2 + 3] = b;
3900#endif
3901 }
3902 else { break; }
3903 ++part;
3904 }
3905 part = base_b - 1;
3906 while(part >= 0) {
3907 b = cut_coordinates(part);
3908 scalar_t delta = b - base_coord;
3909 if(delta < 0) delta = -delta;
3910 if(delta < sEpsilon) {
3911 // Note if on cut we set right/left closest to the cut itself
3912 // but we add +2 because we buffered the area with an extra slot
3913 // to reduce cuda branching. So it's +2, +3 instead of +0, +1.
3914#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3915 Kokkos::atomic_add(&shared_ptr[part*2+1], w);
3916 shared_ptr[value_count_weights + part * 2 + 2] = b;
3917 shared_ptr[value_count_weights + part * 2 + 3] = b;
3918#else
3919 threadSum.ptr[part*2+1] += w;
3920 threadSum.ptr[value_count_weights + part * 2 + 2] = b;
3921 threadSum.ptr[value_count_weights + part * 2 + 3] = b;
3922#endif
3923 }
3924 else { break; }
3925 --part;
3926 }
3927
3928 break;
3929 }
3930 }
3931
3932 if(loop_count != 0) {
3933 // subsequent loops can just step towards target
3934 if(coord < b) {
3935 part -= 1;
3936 }
3937 else {
3938 part += 1;
3939 }
3940 }
3941 else {
3942 // initial loop binary search
3943 if(coord < b) {
3944 if(part == lower + 1) {
3945 part = lower;
3946 }
3947 else {
3948 upper = part - 1;
3949 part -= (part - lower)/2;
3950 }
3951 }
3952 else if(part == upper - 1) {
3953 part = upper;
3954 }
3955 else {
3956 lower = part + 1;
3957 part += (upper - part)/2;
3958 }
3959 }
3960 }
3961#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3962 });
3963#else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3964 }, arraySumReducer);
3965#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3966
3967 teamMember.team_barrier();
3968
3969 // collect all the team's results
3970 Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
3971 for(int n = 0; n < value_count_weights; ++n) {
3972#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3973 Kokkos::atomic_add(&current_part_weights(n),
3974 static_cast<double>(shared_ptr[n]));
3975#else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3976 teamSum[n] += array.ptr[n];
3977#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
3978 }
3979
3980#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3981 int insert_left = 0;
3982 int insert_right = 0;
3983#endif
3984
3985 for(int n = 2 + value_count_weights;
3986 n < value_count_weights + value_count_rightleft - 2; n += 2) {
3987#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
3988 scalar_t new_value = shared_ptr[n+1];
3989 scalar_t prev_value = current_right_closest(insert_right);
3990 while(new_value < prev_value) {
3991 prev_value = Kokkos::atomic_compare_exchange(
3992 &current_right_closest(insert_right), prev_value, new_value);
3993 }
3994
3995 new_value = shared_ptr[n];
3996 prev_value = current_left_closest(insert_left);
3997 while(new_value > prev_value) {
3998 prev_value = Kokkos::atomic_compare_exchange(
3999 &current_left_closest(insert_left), prev_value, new_value);
4000 }
4001
4002 ++insert_left;
4003 ++insert_right;
4004#else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4005 if(array.ptr[n] > teamSum[n]) {
4006 teamSum[n] = array.ptr[n];
4007 }
4008 if(array.ptr[n+1] < teamSum[n+1]) {
4009 teamSum[n+1] = array.ptr[n+1];
4010 }
4011#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4012 }
4013 });
4014
4015 teamMember.team_barrier();
4016 }
4017
4018#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4019 KOKKOS_INLINE_FUNCTION
4020 void join(value_type dst, const value_type src) const {
4021 for(int n = 0; n < value_count_weights; ++n) {
4022 dst[n] += src[n];
4023 }
4024
4025 for(int n = value_count_weights + 2;
4026 n < value_count_weights + value_count_rightleft - 2; n += 2) {
4027 if(src[n] > dst[n]) {
4028 dst[n] = src[n];
4029 }
4030 if(src[n+1] < dst[n+1]) {
4031 dst[n+1] = src[n+1];
4032 }
4033 }
4034 }
4035
4036 KOKKOS_INLINE_FUNCTION void init (value_type dst) const {
4037 for(int n = 0; n < value_count_weights; ++n) {
4038 dst[n] = 0;
4039 }
4040
4041 for(int n = value_count_weights;
4043 dst[n] = -max_scalar;
4044 dst[n+1] = max_scalar;
4045 }
4046 }
4047#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4048};
4049
// Computes the part/cut weights and the closest point on each side of every
// cut for the parts currently being worked on, then turns the weights into
// prefix sums.
//
// For each concurrent part kk that still has incomplete cuts, a
// ReduceWeightsFunctor is launched over a team policy:
//  - with CUDA/HIP enabled it runs as a parallel_for and writes straight into
//    subviews of thread_part_weights / thread_cut_left_closest_point /
//    thread_cut_right_closest_point (which are reset first);
//  - otherwise it runs as a parallel_reduce into a host array that is then
//    copied into those same subviews.
// A final kernel converts the per-slot weights of every such part into a
// running sum, giving cuts that share a coordinate identical values.
//
// current_work_part      index of the first part of the concurrent group
// mj_current_dim_coords  coordinates along the dimension being partitioned
// loop_count             iteration counter forwarded to the functor
4057template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4058 typename mj_part_t, typename mj_node_t>
4059void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t,mj_part_t, mj_node_t>::
4060 mj_1D_part_get_part_weights(
// NOTE(review): listing line 4061 is missing here. The body reads
// current_concurrent_num_parts without declaring it, so presumably that
// parameter was declared on the dropped line - confirm against upstream.
4062 mj_part_t current_work_part,
4063 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
4064 int loop_count)
4065{
4066 auto local_is_cut_line_determined = is_cut_line_determined;
4067 auto local_thread_part_weights = thread_part_weights;
4068 auto local_thread_cut_left_closest_point = thread_cut_left_closest_point;
4069 auto local_thread_cut_right_closest_point = thread_cut_right_closest_point;
4070
4071 // Create some locals so we don't use this inside the kernels
4072 // which causes problems
4073 auto local_sEpsilon = this->sEpsilon;
4074 auto local_assigned_part_ids = this->assigned_part_ids;
4075 auto local_coordinate_permutations = this->coordinate_permutations;
4076 auto local_mj_weights = this->mj_weights;
4077 auto local_part_xadj = this->part_xadj;
4078 auto local_global_min_max_coord_total_weight =
4079 this->global_min_max_coord_total_weight;
4080
4081 typedef Kokkos::TeamPolicy<typename mj_node_t::execution_space> policy_t;
4082
4083 auto local_device_num_partitioning_in_current_dim =
4084 device_num_partitioning_in_current_dim;
4085
// refresh the device copy of the incomplete-cut counters read by the final
// kernel of this function
4086 Kokkos::deep_copy(device_incomplete_cut_count, this->incomplete_cut_count);
4087 auto local_device_incomplete_cut_count = device_incomplete_cut_count;
4088
// running offset into thread_part_weights (parts + cuts of earlier parts)
4089 mj_part_t total_part_shift = 0;
4090
// running offset into temp_cut_coords (cuts of earlier parts)
4091 mj_part_t concurrent_cut_shifts = 0;
4092 for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
// view of the cut coordinates starting at this part's first cut
4093 Kokkos::View<mj_scalar_t *, device_t> local_temp_cut_coords =
4094 Kokkos::subview(temp_cut_coords, std::pair<mj_lno_t, mj_lno_t>(
4095 concurrent_cut_shifts, temp_cut_coords.size()));
4096
4097 mj_part_t num_parts =
4098 host_num_partitioning_in_current_dim(current_work_part + kk);
4099 mj_part_t num_cuts = num_parts - 1;
4100 mj_part_t total_part_count = num_parts + num_cuts;
4101 mj_part_t weight_array_length = num_cuts + num_parts;
4102
4103 // for right/left closest + buffer cut on either side
4104 mj_part_t right_left_array_length = (num_cuts + 2) * 2;
4105
// nothing left to compute for this part; only advance the offsets
4106 if(this->incomplete_cut_count(kk) == 0) {
4107 total_part_shift += total_part_count;
4108 concurrent_cut_shifts += num_cuts;
4109 continue;
4110 }
4111
4112 // if not set use 60 - was initial testing amount but somewhat arbitrary
4113 auto policy_ReduceWeightsFunctor = policy_t(
4114 mj_num_teams ? mj_num_teams : 60, Kokkos::AUTO);
4115
4116#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4117 int total_array_length =
4118 weight_array_length + right_left_array_length;
4119#endif
4120
4121 // Using float here caused some numerical errors for coord on cut calculations.
4122 // Probably that can be fixed with proper epsilon adjustment but since cuda
4123 // doesn't reduce right now the shared memory pressure is no longer relevant.
4124 // Just use scalar_t to match the original algorithm.
4125 typedef mj_scalar_t array_t;
4126
// host-side target of the parallel_reduce: weights followed by the
// right/left closest pairs
4127#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4128 Kokkos::View<array_t*, Kokkos::HostSpace> reduce_array("reduce_array", total_array_length);
4129#endif // !KOKKOS_ENABLE_CUDA && !KOKKOS_ENABLE_HIP
4130
// number of cuts owned by the parts that precede kk in this group
4131 int offset_cuts = 0;
4132 for(int kk2 = 0; kk2 < kk; ++kk2) {
4133 offset_cuts +=
4134 host_num_partitioning_in_current_dim(current_work_part + kk2) - 1;
4135 }
// output windows for this part
4136 Kokkos::View<double *, device_t> my_current_part_weights =
4137 Kokkos::subview(local_thread_part_weights,
4138 std::pair<mj_lno_t, mj_lno_t>(total_part_shift,
4139 total_part_shift + total_part_count));
4140 Kokkos::View<mj_scalar_t *, device_t> my_current_left_closest =
4141 Kokkos::subview(local_thread_cut_left_closest_point,
4142 std::pair<mj_lno_t, mj_lno_t>(
4143 offset_cuts,
4144 local_thread_cut_left_closest_point.size()));
4145 Kokkos::View<mj_scalar_t *, device_t> my_current_right_closest =
4146 Kokkos::subview(local_thread_cut_right_closest_point,
4147 std::pair<mj_lno_t, mj_lno_t>(
4148 offset_cuts,
4149 local_thread_cut_right_closest_point.size()));
4150
4151 array_t max_scalar = std::numeric_limits<array_t>::max();
4152
4153#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4154 // initialize values
// single-iteration kernel: zero the weights and reset the closest points to
// -max / +max before the functor accumulates into them
4155 Kokkos::parallel_for(
4156 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4157 KOKKOS_LAMBDA (int dummy) {
4158 for(int n = 0; n < weight_array_length; ++n) {
4159 my_current_part_weights(n) = 0;
4160 }
4161 for(int n = 0; n < num_cuts; ++n) {
4162 my_current_left_closest(n) = -max_scalar;
4163 my_current_right_closest(n) = max_scalar;
4164 }
4165 });
4166#endif
4167
4168 mj_part_t concurrent_current_part =
4169 current_work_part + kk;
4170
// NOTE(review): listing lines 4176, 4178 and 4179 are missing from the
// argument list below; concurrent_current_part is declared just above and is
// not used anywhere else that is visible, so it is presumably one of the
// dropped arguments - confirm against the ReduceWeightsFunctor constructor.
4171 ReduceWeightsFunctor<policy_t, mj_scalar_t, mj_part_t, mj_lno_t,
4172 typename mj_node_t::device_type, array_t>
4173 teamFunctor(
4174 loop_count,
4175 max_scalar,
4177 num_cuts,
4180 right_left_array_length,
4181 weight_array_length,
4182 coordinate_permutations,
4183 mj_current_dim_coords,
4184 mj_weights,
4185 assigned_part_ids,
4186 local_temp_cut_coords,
4187 part_xadj,
4188 mj_uniform_weights(0), // host and currently only relevant to slot 0
4189 sEpsilon
4190#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4191 ,my_current_part_weights,
4192 my_current_left_closest,
4193 my_current_right_closest
4194#endif
4195 );
4196
4197#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4198 Kokkos::parallel_for(policy_ReduceWeightsFunctor, teamFunctor);
4199#else
4200 Kokkos::parallel_reduce(policy_ReduceWeightsFunctor,
4201 teamFunctor, reduce_array);
4202 Kokkos::fence();
4203#endif
4204
// host path: move the reduced values into the output views through mirrors
4205#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4206 auto hostArray = Kokkos::create_mirror_view(my_current_part_weights);
4207
4208 for(int i = 0; i < static_cast<int>(total_part_count); ++i) {
4209 hostArray(i) = reduce_array[i];
4210 }
4211
4212 Kokkos::deep_copy(my_current_part_weights, hostArray);
4213
4214 auto hostLeftArray = Kokkos::create_mirror_view(my_current_left_closest);
4215 auto hostRightArray = Kokkos::create_mirror_view(my_current_right_closest);
// (cut+1) skips the leading buffer pair of the right/left region
4216 for(mj_part_t cut = 0; cut < num_cuts; ++cut) {
4217 hostLeftArray(cut) = reduce_array[weight_array_length + (cut+1)*2+0];
4218 hostRightArray(cut) = reduce_array[weight_array_length + (cut+1)*2+1];
4219 }
4220 Kokkos::deep_copy(my_current_left_closest, hostLeftArray);
4221 Kokkos::deep_copy(my_current_right_closest, hostRightArray);
4222#endif
4223
4224 total_part_shift += total_part_count;
4225 concurrent_cut_shifts += num_cuts;
4226 }
4227
4228 auto local_temp_cut_coords = temp_cut_coords;
4229
// one iteration per concurrent part: prefix-sum the weights of every part
// that still has incomplete cuts
4230 Kokkos::parallel_for(
4231 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
4232 (0, current_concurrent_num_parts), KOKKOS_LAMBDA(mj_part_t kk) {
4233 mj_part_t num_parts = local_device_num_partitioning_in_current_dim(
4234 current_work_part + kk);
4235 mj_part_t num_cuts = num_parts - 1;
4236 mj_part_t total_part_count = num_parts + num_cuts;
4237
4238 if(local_device_incomplete_cut_count(kk) > 0) {
4239 // get the prefix sum
4240 // This is an inefficiency but not sure if it matters much
// offset: start of this part in thread_part_weights;
// offset_cuts: start of this part's cuts in temp_cut_coords
4241 size_t offset = 0;
4242 size_t offset_cuts = 0;
4243 for(mj_part_t kk2 = 0; kk2 < kk; ++kk2) {
4244 auto num_parts_kk2 = local_device_num_partitioning_in_current_dim(
4245 current_work_part + kk2);
4246 offset += num_parts_kk2 * 2 - 1;
4247 offset_cuts += num_parts_kk2 - 1;
4248 }
4249
4250 for(mj_part_t i = 1; i < total_part_count; ++i) {
4251 // check for cuts sharing the same position; all cuts sharing a position
4252 // have the same weight == total weight for all cuts sharing the
4253 // position. Don't want to accumulate that total weight more than once.
4254 if(i % 2 == 0 && i > 1 && i < total_part_count - 1 &&
4255 std::abs(local_temp_cut_coords(offset_cuts + i / 2) -
4256 local_temp_cut_coords(offset_cuts + i /2 - 1))
4257 < local_sEpsilon) {
4258 // i % 2 = 0 when part i represents the cut coordinate.
4259 // if it is a cut, and if next cut also has the same coordinate, then
4260 // dont addup.
// copy the value two slots back instead of accumulating again
4261 local_thread_part_weights(offset + i)
4262 = local_thread_part_weights(offset + i-2);
4263 continue;
4264 }
4265
4266 // otherwise do the prefix sum.
4267 local_thread_part_weights(offset + i) +=
4268 local_thread_part_weights(offset + i-1);
4269 }
4270 }
4271 });
4272}
4273
4281template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4282 typename mj_part_t, typename mj_node_t>
4283void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
4284 mj_combine_rightleft_and_weights(
4285 mj_part_t current_work_part,
4287{
4288 auto local_thread_part_weights = this->thread_part_weights;
4289 auto local_is_cut_line_determined = this->is_cut_line_determined;
4290 auto local_thread_cut_left_closest_point =
4291 this->thread_cut_left_closest_point;
4292 auto local_thread_cut_right_closest_point =
4293 this->thread_cut_right_closest_point;
4294 auto local_total_part_weight_left_right_closests =
4295 this->total_part_weight_left_right_closests;
4296 auto local_device_num_partitioning_in_current_dim =
4297 device_num_partitioning_in_current_dim;
4298 Kokkos::parallel_for(
4299 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0,1),
4300 KOKKOS_LAMBDA (int dummy) {
4301
4302 size_t tlr_array_shift = 0;
4303 mj_part_t cut_shift = 0;
4304 size_t total_part_array_shift = 0;
4305
4306 // iterate for all concurrent parts to find the left and right closest
4307 // points in the process.
4308 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i) {
4309
4310 mj_part_t num_parts_in_part =
4311 local_device_num_partitioning_in_current_dim(current_work_part + i);
4312 mj_part_t num_cuts_in_part = num_parts_in_part - 1;
4313 size_t num_total_part_in_part =
4314 num_parts_in_part + size_t (num_cuts_in_part);
4315
4316 // iterate for cuts in a single part.
4317 for(int ii = 0; ii < num_cuts_in_part; ++ii) {
4318 mj_part_t next = tlr_array_shift + ii;
4319 mj_part_t cut_index = cut_shift + ii;
4320
4321 if(!local_is_cut_line_determined(cut_index)) {
4322 mj_scalar_t left_closest_in_process =
4323 local_thread_cut_left_closest_point(cut_index);
4324 mj_scalar_t right_closest_in_process =
4325 local_thread_cut_right_closest_point(cut_index);
4326
4327 // store the left and right closes points.
4328 local_total_part_weight_left_right_closests(
4329 num_total_part_in_part + next) = left_closest_in_process;
4330
4331 local_total_part_weight_left_right_closests(
4332 num_total_part_in_part + num_cuts_in_part + next) =
4333 right_closest_in_process;
4334 }
4335 }
4336
4337 for(size_t j = 0; j < num_total_part_in_part; ++j) {
4338 mj_part_t cut_ind = j / 2 + cut_shift;
4339
4340 // need to check j != num_total_part_in_part - 1
4341 // which is same as j/2 != num_cuts_in_part.
4342 // we cannot check it using cut_ind, because of the concurrent part
4343 // concantanetion.
4344 if(j == num_total_part_in_part - 1 ||
4345 !local_is_cut_line_determined(cut_ind)) {
4346 double pwj = local_thread_part_weights(total_part_array_shift + j);
4347 local_total_part_weight_left_right_closests(tlr_array_shift + j) = pwj;
4348 }
4349 }
4350
4351 // set the shift position in the arrays
4352 cut_shift += num_cuts_in_part;
4353 tlr_array_shift += num_total_part_in_part + 2 * num_cuts_in_part;
4354 total_part_array_shift += num_total_part_in_part;
4355 }
4356 });
4357}
4358
4371template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4372 typename mj_part_t, typename mj_node_t>
4373KOKKOS_INLINE_FUNCTION
4374void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
4375 mj_node_t>::mj_calculate_new_cut_position(mj_scalar_t cut_upper_bound,
4376 mj_scalar_t cut_lower_bound,
4377 mj_scalar_t cut_upper_weight,
4378 mj_scalar_t cut_lower_weight,
4379 mj_scalar_t expected_weight,
4380 mj_scalar_t &new_cut_position,
4381 mj_scalar_t sEpsilon) {
4382
4383 if(std::abs(cut_upper_bound - cut_lower_bound) < sEpsilon) {
4384 new_cut_position = cut_upper_bound; //or lower bound does not matter.
4385 }
4386
4387 if(std::abs(cut_upper_weight - cut_lower_weight) < sEpsilon) {
4388 new_cut_position = cut_lower_bound;
4389 }
4390
4391 mj_scalar_t coordinate_range = (cut_upper_bound - cut_lower_bound);
4392 mj_scalar_t weight_range = (cut_upper_weight - cut_lower_weight);
4393 mj_scalar_t my_weight_diff = (expected_weight - cut_lower_weight);
4394
4395 mj_scalar_t required_shift = (my_weight_diff / weight_range);
4396 int scale_constant = 20;
4397 int shiftint= int (required_shift * scale_constant);
4398 if(shiftint == 0) shiftint = 1;
4399 required_shift = mj_scalar_t (shiftint) / scale_constant;
4400 new_cut_position = coordinate_range * required_shift + cut_lower_bound;
4401}
4402
4403#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4404
// Reducer object passed to the thread-level parallel_reduce of
// ReduceArrayFunctor in host builds (this block sits inside
// #if !CUDA && !HIP). It wraps a value_type that exposes a raw `ptr` array
// and reduces value_count entries of it by summation.
// NOTE(review): listing lines 4406, 4408-4411 and 4421 are missing below
// (the struct header, its typedef/member declarations and the signature of
// the accessor that returns *value) - consult the upstream header for them.
4405template<class policy_t, class scalar_t>
4407
4412
// Keeps a pointer to the caller's value and the number of entries to reduce.
4413 KOKKOS_INLINE_FUNCTION ArrayReducer(
4414 value_type &val,
4415 int mj_value_count) :
4416 value(&val),
4417 value_count(mj_value_count)
4418 {}
4419
// Accessor returning the wrapped value by reference (signature line missing).
4420 KOKKOS_INLINE_FUNCTION
4422 return *value;
4423 }
4424
// Element-wise sum of src into dst over value_count entries.
4425 KOKKOS_INLINE_FUNCTION
4426 void join(value_type& dst, const value_type& src) const {
4427 for(int n = 0; n < value_count; ++n) {
4428 dst.ptr[n] += src.ptr[n];
4429 }
4430 }
4431
// Points dst at the wrapped value's storage, then zeroes value_count entries.
4432 KOKKOS_INLINE_FUNCTION void init (value_type& dst) const {
4433 dst.ptr = value->ptr; // must update ptr
4434 for(int n = 0; n < value_count; ++n) {
4435 dst.ptr[n] = 0;
4436 }
4437 }
4438};
4439
4440#endif
4441
// Team functor that counts how many points of one part fall into each new
// part, and records the points that sit on a cut for later processing.
//
// parts(i) holds a "place": an even place 2*p means the point lies inside
// new part p, an odd place means it lies on a cut. Points inside a part are
// counted and their entry in `parts` is rewritten to p; points on a cut have
// their loop index appended to track_on_cuts.
//
// With CUDA/HIP enabled the functor is run as a parallel_for and adds its
// counts atomically to local_point_counts; otherwise it is run as a
// parallel_reduce with an array value_type of value_count entries.
// NOTE(review): listing lines 4444, 4452-4453 and 4464 are missing below
// (the struct header, the declarations of concurrent_current_part and
// value_count, and the constructor name) - consult the upstream header.
4442template<class policy_t, class scalar_t, class part_t, class index_t,
4443 class device_t, class array_t>
4445 typedef typename policy_t::member_type member_type;
4446 typedef Kokkos::View<scalar_t*> scalar_view_t;
4447
4448#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4449 typedef array_t value_type[];
4450#endif
4451
4454 Kokkos::View<index_t*, device_t> permutations;
4455 Kokkos::View<scalar_t *, device_t> coordinates;
4456 Kokkos::View<part_t*, device_t> parts;
4457 Kokkos::View<index_t *, device_t> part_xadj;
// list of on-cut loop indices; its last element is used as the insert counter
4458 Kokkos::View<index_t *, device_t> track_on_cuts;
4459
4460#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4461 Kokkos::View<int *, device_t> local_point_counts;
4462#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4463
// Constructor: stores the part to process, the number of counters and the
// views to read from / write to.
4465 part_t mj_concurrent_current_part,
4466 part_t mj_weight_array_size,
4467 Kokkos::View<index_t*, device_t> & mj_permutations,
4468 Kokkos::View<scalar_t *, device_t> & mj_coordinates,
4469 Kokkos::View<part_t*, device_t> & mj_parts,
4470 Kokkos::View<index_t *, device_t> & mj_part_xadj,
4471 Kokkos::View<index_t *, device_t> & mj_track_on_cuts
4472#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4473 ,Kokkos::View<int *, device_t> & mj_local_point_counts
4474#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4475 ) :
4476 concurrent_current_part(mj_concurrent_current_part),
4477 value_count(mj_weight_array_size),
4478 permutations(mj_permutations),
4479 coordinates(mj_coordinates),
4480 parts(mj_parts),
4481 part_xadj(mj_part_xadj),
4482 track_on_cuts(mj_track_on_cuts)
4483#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4484 ,local_point_counts(mj_local_point_counts)
4485#endif
4486 {
4487 }
4488
// Bytes of team scratch memory requested: one counter array per team with
// CUDA/HIP, one per team thread otherwise, rounded up to a multiple of 8.
4489 size_t team_shmem_size (int team_size) const {
4490#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4491 int result = sizeof(array_t) * (value_count);
4492#else
4493 int result = sizeof(array_t) * (value_count) * team_size;
4494#endif
4495
4496 // pad this to a multiple of 8 or it will run corrupt
4497 int remainder = result % 8;
4498 if(remainder != 0) {
4499 result += 8 - remainder;
4500 }
4501 return result;
4502 }
4503
// Team entry point. In host builds teamSum is the team's reduction target.
4504 KOKKOS_INLINE_FUNCTION
4505#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4506 void operator() (const member_type & teamMember) const {
4507#else
4508 void operator() (const member_type & teamMember, value_type teamSum) const {
4509#endif
// [all_begin, all_end) is the index range of the part being processed
4510 index_t all_begin = (concurrent_current_part == 0) ? 0 :
4511 part_xadj(concurrent_current_part - 1);
4512 index_t all_end = part_xadj(concurrent_current_part);
4513
4514 index_t num_working_points = all_end - all_begin;
4515 int num_teams = teamMember.league_size();
4516
// split the range evenly across teams, rounding the chunk size up
4517 index_t stride = num_working_points / num_teams;
4518 if((num_working_points % num_teams) > 0) {
4519 stride += 1; // make sure we have coverage for the final points
4520 }
4521
4522 index_t begin = all_begin + stride * teamMember.league_rank();
4523 index_t end = begin + stride;
4524 if(end > all_end) {
4525 end = all_end; // the last team may have less work than the other teams
4526 }
4527
// index of the last element of track_on_cuts, used as the insert counter
4528 int track_on_cuts_insert_index = track_on_cuts.size() - 1;
4529
4530 // create the team shared data - each thread gets one of the arrays
4531#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4532 size_t sh_mem_size = sizeof(array_t) * (value_count);
4533#else
4534 size_t sh_mem_size =
4535 sizeof(array_t) * (value_count) * teamMember.team_size();
4536#endif
4537
4538 array_t * shared_ptr = (array_t *) teamMember.team_shmem().get_shmem(
4539 sh_mem_size);
4540
4541#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4542 // init the shared array to 0
4543 Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
4544 for(int n = 0; n < value_count; ++n) {
4545 shared_ptr[n] = 0;
4546 }
4547 });
4548 teamMember.team_barrier();
4549
4550 Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, begin, end),
4551 [=] (index_t ii) {
4552#else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4553 // select the array for this thread
4554 Zoltan2_MJArrayType<array_t> array(&shared_ptr[teamMember.team_rank() *
4555 (value_count)]);
4556
4557 // create reducer which handles the Zoltan2_MJArrayType class
4558 ArrayReducer<policy_t, array_t> arrayReducer(array, value_count);
4559
4560 Kokkos::parallel_reduce(
4561 Kokkos::TeamThreadRange(teamMember, begin, end),
4562 [=] (size_t ii, Zoltan2_MJArrayType<array_t>& threadSum) {
4563#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4564
4565 index_t coordinate_index = permutations(ii);
4566 part_t place = parts(coordinate_index);
4567 part_t part = place / 2;
// even place: the point is inside a part - count it and store the part id
4568 if(place % 2 == 0) {
4569#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4570 Kokkos::atomic_add(&shared_ptr[part], 1);
4571#else
4572 threadSum.ptr[part] += 1;
4573#endif
4574
4575 parts(coordinate_index) = part;
4576 }
4577 else {
4578 // fill a tracking array so we can process these slower points
4579 // in next cycle
// atomically reserve the next free slot, then store the loop index there
4580 index_t set_index = Kokkos::atomic_fetch_add(
4581 &track_on_cuts(track_on_cuts_insert_index), 1);
4582 track_on_cuts(set_index) = ii;
4583 }
4584#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4585 });
4586#else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4587 }, arrayReducer);
4588#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4589
4590 teamMember.team_barrier();
4591
4592 // collect all the team's results
4593 Kokkos::single(Kokkos::PerTeam(teamMember), [=] () {
4594 for(int n = 0; n < value_count; ++n) {
4595#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4596 Kokkos::atomic_add(&local_point_counts(n), shared_ptr[n]);
4597#else // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4598 teamSum[n] += array.ptr[n];
4599#endif // KOKKOS_ENABLE_CUDA || KOKKOS_ENABLE_HIP
4600 }
4601 });
4602
4603 teamMember.team_barrier();
4604 }
4605
4606#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4607
// Reduction combine step: element-wise sum of src into dst.
4608 KOKKOS_INLINE_FUNCTION
4609 void join(value_type dst, const value_type src) const {
4610 for(int n = 0; n < value_count; ++n) {
4611 dst[n] += src[n];
4612 }
4613 }
4614
// Reduction identity: all counters start at zero.
4615 KOKKOS_INLINE_FUNCTION void init (value_type dst) const {
4616 for(int n = 0; n < value_count; ++n) {
4617 dst[n] = 0;
4618 }
4619 }
4620#endif
4621};
4622
4638template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
4639 typename mj_part_t, typename mj_node_t>
4640void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
4641mj_create_new_partitions(
4642 mj_part_t num_parts,
4643 mj_part_t current_concurrent_work_part,
4644 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
4645 Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
4646 Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
4647 Kokkos::View<mj_lno_t *, device_t> & out_part_xadj)
4648{
4649 // Get locals for cuda
4650 auto local_thread_part_weight_work = this->thread_part_weight_work;
4651 auto local_point_counts = this->thread_point_counts;
4652 auto local_distribute_points_on_cut_lines =
4653 this->distribute_points_on_cut_lines;
4654 auto local_thread_cut_line_weight_to_put_left =
4655 this->thread_cut_line_weight_to_put_left;
4656 auto local_sEpsilon = this->sEpsilon;
4657 auto local_coordinate_permutations = this->coordinate_permutations;
4658 auto local_mj_weights = this->mj_weights;
4659 auto local_assigned_part_ids = this->assigned_part_ids;
4660 auto local_new_coordinate_permutations = this->new_coordinate_permutations;
4661
4662 mj_part_t num_cuts = num_parts - 1;
4663
4664 Kokkos::parallel_for(
4665 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4666 KOKKOS_LAMBDA(int dummy) {
4667
4668 if(local_distribute_points_on_cut_lines) {
4669 for(int i = 0; i < num_cuts; ++i) {
4670 mj_scalar_t left_weight = used_local_cut_line_weight_to_left(i);
4671 if(left_weight > local_sEpsilon) {
4672 // the weight of thread ii on cut.
4673 mj_scalar_t thread_ii_weight_on_cut =
4674 local_thread_part_weight_work(i * 2 + 1) -
4675 local_thread_part_weight_work(i * 2);
4676
4677 if(thread_ii_weight_on_cut < left_weight) {
4678 // if left weight is bigger than threads weight on cut.
4679 local_thread_cut_line_weight_to_put_left(i) =
4680 thread_ii_weight_on_cut;
4681 }
4682 else {
4683 // if thread's weight is bigger than space, then put only a portion.
4684 local_thread_cut_line_weight_to_put_left(i) = left_weight;
4685 }
4686 left_weight -= thread_ii_weight_on_cut;
4687 }
4688 else {
4689 local_thread_cut_line_weight_to_put_left(i) = 0;
4690 }
4691 }
4692
4693 // this is a special case. If cutlines share the same coordinate,
4694 // their weights are equal. We need to adjust the ratio for that.
4695 for(mj_part_t i = num_cuts - 1; i > 0 ; --i) {
4696 if(std::abs(current_concurrent_cut_coordinate(i) -
4697 current_concurrent_cut_coordinate(i -1)) < local_sEpsilon) {
4698 local_thread_cut_line_weight_to_put_left(i) -=
4699 local_thread_cut_line_weight_to_put_left(i - 1);
4700 }
4701 local_thread_cut_line_weight_to_put_left(i) =
4702 static_cast<long long>((local_thread_cut_line_weight_to_put_left(i) +
4703 least_signifiance) * significance_mul) /
4704 static_cast<mj_scalar_t>(significance_mul);
4705 }
4706 }
4707
4708 for(mj_part_t i = 0; i < num_parts; ++i) {
4709 local_point_counts(i) = 0;
4710 }
4711 });
4712
4713 mj_lno_t coordinate_begin_index =
4714 current_concurrent_work_part == 0 ? 0 :
4715 host_part_xadj(current_concurrent_work_part - 1);
4716 mj_lno_t coordinate_end_index =
4717 host_part_xadj(current_concurrent_work_part);
4718
4719 mj_lno_t total_on_cut;
4720 Kokkos::parallel_reduce("Get total_on_cut",
4721 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (
4722 coordinate_begin_index, coordinate_end_index),
4723 KOKKOS_LAMBDA(int ii, mj_lno_t & val) {
4724 mj_lno_t coordinate_index = local_coordinate_permutations(ii);
4725 mj_part_t coordinate_assigned_place =
4726 local_assigned_part_ids(coordinate_index);
4727 if(coordinate_assigned_place % 2 == 1) {
4728 val += 1;
4729 }
4730 }, total_on_cut);
4731
4732 Kokkos::View<mj_lno_t *, device_t> track_on_cuts;
4733 if(total_on_cut > 0) {
4734 track_on_cuts = Kokkos::View<mj_lno_t *, device_t>(
4735 "track_on_cuts", // would do WithoutInitialization but need last init to 0
4736 total_on_cut + 1); // extra index to use for tracking
4737 }
4738
4739 // here we need to parallel reduce an array to count coords in each part
4740 // atomically adding, especially for low part count would kill us
4741 // in the original setup we kept arrays allocated for each thread but for
4742 // the cuda version we'd like to avoid allocating N arrays for the number
4743 // of teams/threads which would be complicated based on running openmp or
4744 // cuda.
4745 typedef Kokkos::TeamPolicy<typename mj_node_t::execution_space> policy_t;
4746
4747 // if not set use 60 - somewhat arbitrary based on initial performance tests
4748 int use_num_teams = mj_num_teams ? mj_num_teams : 60;
4749
4750 auto policy_ReduceFunctor = policy_t(use_num_teams, Kokkos::AUTO);
4751 typedef int array_t;
4752
4753 // just need parts - on the cuts will be handled in a separate serial
4754 // call after this.
4755#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4756 Kokkos::View<array_t*, Kokkos::HostSpace> reduce_array("reduce_array", num_parts);
4757#endif
4758
4759 ReduceArrayFunctor<policy_t, mj_scalar_t, mj_part_t, mj_lno_t,
4760 typename mj_node_t::device_type, array_t>teamFunctor(
4761 current_concurrent_work_part,
4762 num_parts,
4763 coordinate_permutations,
4764 mj_current_dim_coords,
4765 assigned_part_ids,
4766 part_xadj,
4767 track_on_cuts
4768#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4769 ,local_point_counts
4770#endif
4771 );
4772
4773#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4774 Kokkos::parallel_for(policy_ReduceFunctor, teamFunctor);
4775#else
4776 Kokkos::parallel_reduce(policy_ReduceFunctor, teamFunctor, reduce_array);
4777 Kokkos::fence();
4778#endif
4779
4780#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
4781 for(mj_part_t part = 0; part < num_parts; ++part) {
4782 local_point_counts(part) = reduce_array[part];
4783 }
4784#endif
4785
4786 // the last member is utility used for atomically inserting the values.
4787 // Sorting here avoids potential indeterminancy in the partitioning results
4788 if(track_on_cuts.size() > 0) { // size 0 unused, or size is minimum of 2
4789 auto track_on_cuts_sort = Kokkos::subview(track_on_cuts,
4790 std::pair<mj_lno_t, mj_lno_t>(0, track_on_cuts.size() - 1)); // do not sort last element
4791 Kokkos::sort(track_on_cuts_sort);
4792 }
4793
4794 bool uniform_weights0 = this->mj_uniform_weights(0);
4795 Kokkos::parallel_for(
4796 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4797 KOKKOS_LAMBDA (int dummy) {
4798
4799 for(int j = 0; j < total_on_cut; ++j) {
4800 int ii = track_on_cuts(j);
4801 mj_lno_t coordinate_index = local_coordinate_permutations(ii);
4802 mj_scalar_t coordinate_weight = uniform_weights0 ? 1 :
4803 local_mj_weights(coordinate_index,0);
4804 mj_part_t coordinate_assigned_place =
4805 local_assigned_part_ids(coordinate_index);
4806 mj_part_t coordinate_assigned_part = coordinate_assigned_place / 2;
4807 // if it is on the cut.
4808 if(local_distribute_points_on_cut_lines &&
4809 local_thread_cut_line_weight_to_put_left(
4810 coordinate_assigned_part) > local_sEpsilon) {
4811 // if the rectilinear partitioning is allowed,
4812 // and the thread has still space to put on the left of the cut
4813 // then thread puts the vertex to left.
4814 local_thread_cut_line_weight_to_put_left(
4815 coordinate_assigned_part) -= coordinate_weight;
4816 // if putting the vertex to left increased the weight more
4817 // than expected, and if the next cut is on the same coordinate,
4818 // then we need to adjust how much weight next cut puts to its left as
4819 // well, in order to take care of the imbalance.
4820 if(local_thread_cut_line_weight_to_put_left(
4821 coordinate_assigned_part) < 0 && coordinate_assigned_part <
4822 num_cuts - 1 &&
4823 std::abs(current_concurrent_cut_coordinate(
4824 coordinate_assigned_part+1) -
4825 current_concurrent_cut_coordinate(
4826 coordinate_assigned_part)) < local_sEpsilon)
4827 {
4828 local_thread_cut_line_weight_to_put_left(
4829 coordinate_assigned_part + 1) +=
4830 local_thread_cut_line_weight_to_put_left(
4831 coordinate_assigned_part);
4832 }
4833 ++local_point_counts(coordinate_assigned_part);
4834 local_assigned_part_ids(coordinate_index) =
4835 coordinate_assigned_part;
4836 }
4837 else {
4838 // if there is no more space on the left, put the coordinate to the
4839 // right of the cut.
4840 ++coordinate_assigned_part;
4841 // this while loop is necessary when a line is partitioned into more
4842 // than 2 parts.
4843 while(local_distribute_points_on_cut_lines &&
4844 coordinate_assigned_part < num_cuts)
4845 {
4846 // traverse all the cut lines having the same partitiong
4847 if(std::abs(current_concurrent_cut_coordinate(
4848 coordinate_assigned_part) -
4849 current_concurrent_cut_coordinate(
4850 coordinate_assigned_part - 1)) < local_sEpsilon)
4851 {
4852 // if line has enough space on left, put it there.
4853 if(local_thread_cut_line_weight_to_put_left(
4854 coordinate_assigned_part) > local_sEpsilon &&
4855 local_thread_cut_line_weight_to_put_left(
4856 coordinate_assigned_part) >=
4857 std::abs(local_thread_cut_line_weight_to_put_left(
4858 coordinate_assigned_part) - coordinate_weight))
4859 {
4860 local_thread_cut_line_weight_to_put_left(
4861 coordinate_assigned_part) -= coordinate_weight;
4862 // Again if it put too much on left of the cut,
4863 // update how much the next cut sharing the same coordinate will
4864 // put to its left.
4865 if(local_thread_cut_line_weight_to_put_left(
4866 coordinate_assigned_part) < 0 &&
4867 coordinate_assigned_part < num_cuts - 1 &&
4868 std::abs(current_concurrent_cut_coordinate(
4869 coordinate_assigned_part+1) -
4870 current_concurrent_cut_coordinate(
4871 coordinate_assigned_part)) < local_sEpsilon)
4872 {
4873 local_thread_cut_line_weight_to_put_left(
4874 coordinate_assigned_part + 1) +=
4875 local_thread_cut_line_weight_to_put_left(
4876 coordinate_assigned_part);
4877 }
4878 break;
4879 }
4880 }
4881 else {
4882 break;
4883 }
4884 ++coordinate_assigned_part;
4885 }
4886 local_point_counts(coordinate_assigned_part) += 1;
4887 local_assigned_part_ids(coordinate_index) = coordinate_assigned_part;
4888 }
4889 }
4890
4891 for(int j = 0; j < num_parts; ++j) {
4892 out_part_xadj(j) = local_point_counts(j);
4893 local_point_counts(j) = 0;
4894
4895 if(j != 0) {
4896 out_part_xadj(j) += out_part_xadj(j - 1);
4897 local_point_counts(j) += out_part_xadj(j - 1);
4898 }
4899 }
4900 });
4901
4902 // here we will determine insert indices for N teams
4903 // then all the teams can fill
4904
4905#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
4906
4907 // This is the fastest so far - just straight atomic writes for CUDA
4908 // However this is not a deterministic result since it is atomic.
4909 // The final result will be deterministic.
4910 Kokkos::parallel_for(
4911 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t> (
4912 coordinate_begin_index, coordinate_end_index),
4913 KOKKOS_LAMBDA (mj_lno_t ii) {
4914 mj_lno_t i = local_coordinate_permutations(ii);
4915 mj_part_t p = local_assigned_part_ids(i);
4916 mj_lno_t idx = Kokkos::atomic_fetch_add(&local_point_counts(p), 1);
4917 local_new_coordinate_permutations(coordinate_begin_index + idx) = i;
4918 });
4919
4920#else
4921
4922#ifdef KOKKOS_ENABLE_OPENMP
4923 // will return and fix this - revert back to 1 for clear auto testing
4924 const int num_threads = 1; // Kokkos::OpenMP::impl_max_hardware_threads();
4925#else
4926 const int num_threads = 1;
4927#endif
4928
4929 const int num_teams = 1; // cuda is handled above using a different format
4930
4931 // allow init - we want all 0's first
4932 Kokkos::View<mj_lno_t*, device_t>
4933 point_counter("insert indices", num_teams * num_threads * num_parts);
4934
4935 // count how many coords per thread
4936 // then we will fill each independently
4937 Kokkos::TeamPolicy<typename mj_node_t::execution_space>
4938 block_policy(num_teams, num_threads);
4939 typedef typename Kokkos::TeamPolicy<typename mj_node_t::execution_space>::
4940 member_type member_type;
4941 mj_lno_t range = coordinate_end_index - coordinate_begin_index;
4942 mj_lno_t block_size = range / num_teams + 1;
4943 Kokkos::parallel_for(block_policy, KOKKOS_LAMBDA(member_type team_member) {
4944 int team = team_member.league_rank();
4945 int team_offset = team * num_threads * num_parts;
4946 mj_lno_t begin = coordinate_begin_index + team * block_size;
4947 mj_lno_t end = begin + block_size;
4948 if(end > coordinate_end_index) {
4949 end = coordinate_end_index;
4950 }
4951
4952 Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, begin, end),
4953 [=] (mj_lno_t ii) {
4954 int thread = team_member.team_rank();
4955 mj_lno_t i = local_coordinate_permutations(ii);
4956 mj_part_t p = local_assigned_part_ids(i);
4957 int index = team_offset + thread * num_parts + p;
4958 ++point_counter(index);
4959 });
4960 });
4961
4962 // now prefix sum
4963 // we currently have the counts in the slots
4964 // we want the first counter for each part to be 0
4965 // then the rest should be the sum of all the priors
4966 Kokkos::parallel_for(
4967 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
4968 KOKKOS_LAMBDA (int dummy) {
4969 int num_sets = point_counter.size() / num_parts;
4970 for(int set = num_sets - 1; set >= 1; set -=1) {
4971 int base = set * num_parts;
4972 for(int part = 0; part < num_parts; ++part) {
4973 point_counter(base + part) = point_counter(base + part - num_parts);
4974 }
4975 }
4976
4977 for(int part = 0; part < num_parts; ++part) {
4978 point_counter(part) = 0;
4979 }
4980
4981 for(int set = 1; set < num_sets; ++set) {
4982 int base = set * num_parts;
4983 for(int part = 0; part < num_parts; ++part) {
4984 point_counter(base + part) += point_counter(base + part - num_parts);
4985 }
4986 }
4987 });
4988
4989 // now permute
4990 Kokkos::parallel_for(block_policy, KOKKOS_LAMBDA(member_type team_member) {
4991 int team = team_member.league_rank();
4992 int team_offset = team * num_threads * num_parts;
4993 mj_lno_t begin = coordinate_begin_index + team * block_size;
4994 mj_lno_t end = begin + block_size;
4995 if(end > coordinate_end_index) {
4996 end = coordinate_end_index;
4997 }
4998 Kokkos::parallel_for(Kokkos::TeamThreadRange(team_member, begin, end),
4999 [=] (mj_lno_t ii) {
5000 int thread = team_member.team_rank();
5001 mj_lno_t i = local_coordinate_permutations(ii);
5002 mj_part_t p = local_assigned_part_ids(i);
5003 int index = team_offset + thread * num_parts + p;
5004 int set_counter = (point_counter(index)++) + local_point_counts(p);
5005 local_new_coordinate_permutations(coordinate_begin_index + set_counter) = i;
5006 });
5007 });
5008#endif
5009}
5010
/*! \brief Evaluates the current cut positions of one concurrent part and
 * computes the next position of every cut that is not yet determined.
 *
 * Steps, in order:
 *  - copies incomplete_cut_count into device_incomplete_cut_count;
 *  - runs a single-team kernel over the cuts which
 *    (1) resets left/right closest points that lie outside the
 *        [min, max] coordinate range to the cut coordinate itself, and
 *    (2) for every undetermined cut either marks it as determined (and
 *        decrements the incomplete cut count of part kk) or tightens its
 *        lower/upper bounds and obtains a new position from
 *        mj_calculate_new_cut_position;
 *  - if at least one cut was counted in view_rectilinear_cut_count, runs a
 *    Teuchos::scan over the per-process weights on the cuts and derives how
 *    much of this process' weight on each such cut goes to the left;
 *  - copies device_incomplete_cut_count back into incomplete_cut_count.
 *
 * \param current_concurrent_num_parts stride between the min-coordinate,
 * max-coordinate and total-weight segments of
 * global_min_max_coord_total_weight.
 * \param kk index of the concurrent part; selects the entries of
 * global_min_max_coord_total_weight and of the incomplete cut count.
 * \param num_cuts number of cuts processed.
 * \param used_imbalance_tolerance a cut is accepted when the absolute
 * imbalance on both of its sides exceeds this value by less than sEpsilon.
 * \param current_global_part_weights entry 2*i is read as the weight to the
 * left of cut i; entry 2*i+1 is compared against the target weight when
 * points on a cut may be distributed.
 * \param current_local_part_weights entry (2*i+1) minus entry (2*i) is used
 * as the weight this process has on cut i.
 * \param current_part_target_weights expected weight to the left of cut i.
 * \param current_cut_line_determined in/out flag per cut.
 * \param current_cut_coordinates current cut positions (read only here).
 * \param current_cut_upper_bounds in/out upper bound coordinate per cut.
 * \param current_cut_lower_bounds in/out lower bound coordinate per cut.
 * \param current_global_left_closest_points in/out; may be reset to the cut
 * coordinate.
 * \param current_global_right_closest_points in/out; may be reset to the cut
 * coordinate.
 * \param current_cut_lower_bound_weights in/out weight at the lower bound.
 * \param current_cut_upper_weights in/out weight at the upper bound.
 * \param new_current_cut_coordinates output; next position of every cut.
 * \param current_part_cut_line_weight_to_put_left output; written only for
 * cuts whose points are distributed between left and right.
 * \param view_rectilinear_cut_count single-entry counter; incremented for
 * every cut that needs rectilinear distribution and reset to 0 after those
 * cuts have been handled.
 */
5054template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5055 typename mj_part_t, typename mj_node_t>
5056void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
5057 mj_node_t>::mj_get_new_cut_coordinates(
5058 mj_part_t current_concurrent_num_parts,
5059 mj_part_t kk,
5060 const mj_part_t &num_cuts,
5061 const double &used_imbalance_tolerance,
5062 Kokkos::View<mj_scalar_t *, device_t> & current_global_part_weights,
5063 Kokkos::View<mj_scalar_t *, device_t> & current_local_part_weights,
5064 Kokkos::View<mj_scalar_t *, device_t> & current_part_target_weights,
5065 Kokkos::View<bool *, device_t> & current_cut_line_determined,
5066 Kokkos::View<mj_scalar_t *, device_t> & current_cut_coordinates,
5067 Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_bounds,
5068 Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bounds,
5069 Kokkos::View<mj_scalar_t *, device_t> & current_global_left_closest_points,
5070 Kokkos::View<mj_scalar_t *, device_t> & current_global_right_closest_points,
5071 Kokkos::View<mj_scalar_t *, device_t> & current_cut_lower_bound_weights,
5072 Kokkos::View<mj_scalar_t *, device_t> & current_cut_upper_weights,
5073 Kokkos::View<mj_scalar_t *, device_t> & new_current_cut_coordinates,
5074 Kokkos::View<mj_scalar_t *, device_t> &
5075 current_part_cut_line_weight_to_put_left,
5076 Kokkos::View<mj_part_t *, device_t> & view_rectilinear_cut_count)
5077{
// bring the current incomplete cut counts to the device so that the kernel
// below can decrement them atomically.
5078 Kokkos::deep_copy(device_incomplete_cut_count, this->incomplete_cut_count);
5079
// local copies of the members, so that the lambdas below capture these
// values instead of the this pointer.
5080 auto local_device_incomplete_cut_count = device_incomplete_cut_count;
5081 auto local_sEpsilon = sEpsilon;
5082 auto local_distribute_points_on_cut_lines = distribute_points_on_cut_lines;
5083 auto local_global_rectilinear_cut_weight = global_rectilinear_cut_weight;
5084 auto local_process_rectilinear_cut_weight = process_rectilinear_cut_weight;
5085 auto local_global_min_max_coord_total_weight =
5086 global_min_max_coord_total_weight;
5087
// second copy of sEpsilon (same value as local_sEpsilon); it is the one
// handed to mj_calculate_new_cut_position.
5088 const auto _sEpsilon = this->sEpsilon;
5089 // Note for a 22 part system I tried removing the outer loop
5090 // and doing each sub loop as a simple parallel_for over num_cuts.
5091 // But that was about twice as slow (10ms) as the current form (5ms)
5092 // so I think the overhead of launching the new global parallel kernels
5093 // is costly. This form is just running one team so effectively using
5094 // a single warp to process the cuts. I expect with a lot of parts this
5095 // might need changing.
5096 Kokkos::TeamPolicy<typename mj_node_t::execution_space>
5097 policy_one_team(1, Kokkos::AUTO());
5098 typedef typename Kokkos::TeamPolicy<typename mj_node_t::execution_space>::
5099 member_type member_type;
5100 Kokkos::parallel_for(policy_one_team, KOKKOS_LAMBDA(member_type team_member) {
5101
// min coordinate, max coordinate and total weight of part kk are stored in
// three consecutive segments of length current_concurrent_num_parts.
5102 mj_scalar_t min_coordinate =
5103 local_global_min_max_coord_total_weight(kk);
5104 mj_scalar_t max_coordinate =
5105 local_global_min_max_coord_total_weight(
5106 kk + current_concurrent_num_parts);
5107 mj_scalar_t global_total_weight =
5108 local_global_min_max_coord_total_weight(
5109 kk + current_concurrent_num_parts * 2);
5110
5111 Kokkos::parallel_for(Kokkos::TeamThreadRange (team_member, num_cuts),
5112 [=] (mj_part_t i) {
5113 // if left and right closest points are not set yet,
5114 // set it to the cut itself.
5115 if(min_coordinate -
5116 current_global_left_closest_points(i) > local_sEpsilon) {
5117 current_global_left_closest_points(i) =
5118 current_cut_coordinates(i);
5119 }
5120 if(current_global_right_closest_points(i) -
5121 max_coordinate > local_sEpsilon) {
5122 current_global_right_closest_points(i) =
5123 current_cut_coordinates(i);
5124 }
5125 });
5126 team_member.team_barrier(); // for end of Kokkos::TeamThreadRange
5127
// second pass: each cut i reads the closest points of other cuts (finalized
// by the pass above) but writes only its own bounds, weights and position.
5128 Kokkos::parallel_for(Kokkos::TeamThreadRange (team_member, num_cuts),
5129 [=] (mj_part_t i) {
5130 using algMJ_t = AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t,
5131 mj_node_t>;
5132 // seen weight in the part
5133 mj_scalar_t seen_weight_in_part = 0;
5134 // expected weight for part.
5135 mj_scalar_t expected_weight_in_part = 0;
5136 // imbalance for the left and right side of the cut.
5137 double imbalance_on_left = 0, imbalance_on_right = 0;
5138 if(local_distribute_points_on_cut_lines) {
5139 // init the weight on the cut.
5140 local_global_rectilinear_cut_weight(i) = 0;
5141 local_process_rectilinear_cut_weight(i) = 0;
5142 }
// bContinue == true means cut i needs no further processing in this call.
5143 bool bContinue = false;
5144 // if already determined at previous iterations,
5145 // then just write the coordinate to new array, and proceed.
5146 if(current_cut_line_determined(i)) {
5147 new_current_cut_coordinates(i) =
5148 current_cut_coordinates(i);
5149 bContinue = true;
5150 }
5151 if(!bContinue) {
5152 //current weight of the part at the left of the cut line.
5153 seen_weight_in_part = current_global_part_weights(i * 2);
5154
5155 //expected ratio
5156 expected_weight_in_part = current_part_target_weights(i);
5157
5158 //leftImbalance = imbalanceOf(seenW, globalTotalWeight, expected);
5159 imbalance_on_left = algMJ_t::calculate_imbalance(seen_weight_in_part,
5160 expected_weight_in_part);
5161 // rightImbalance = imbalanceOf(globalTotalWeight - seenW,
5162 // globalTotalWeight, 1 - expected);
5163 imbalance_on_right = algMJ_t::calculate_imbalance(global_total_weight -
5164 seen_weight_in_part, global_total_weight - expected_weight_in_part);
5165 bool is_left_imbalance_valid = std::abs(imbalance_on_left) -
5166 used_imbalance_tolerance < local_sEpsilon ;
5167 bool is_right_imbalance_valid = std::abs(imbalance_on_right) -
5168 used_imbalance_tolerance < local_sEpsilon;
5169 //if the cut line reaches to desired imbalance.
5170 if(is_left_imbalance_valid && is_right_imbalance_valid) {
5171 current_cut_line_determined(i) = true;
5172 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5173 new_current_cut_coordinates(i) = current_cut_coordinates(i);
5174 }
5175 else if(imbalance_on_left < 0) {
5176 //if left imbalance < 0 then we need to move the cut to right.
5177 if(local_distribute_points_on_cut_lines) {
5178 // if it is okay to distribute the coordinate on
5179 // the same coordinate to left and right.
5180 // then check if we can reach to the target weight by including the
5181 // coordinates in the part.
5182 if(current_global_part_weights(i * 2 + 1) ==
5183 expected_weight_in_part) {
5184 // if it is we are done.
5185 current_cut_line_determined(i) = true;
5186 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5187
5188 //then assign everything on the cut to the left of the cut.
5189 new_current_cut_coordinates(i) =
5190 current_cut_coordinates(i);
5191 //for this cut all the weight on cut will be put to left.
5192 current_part_cut_line_weight_to_put_left(i) =
5193 current_local_part_weights(i * 2 + 1) -
5194 current_local_part_weights(i * 2);
5195 bContinue = true;
5196 }
5197 else if(current_global_part_weights(i * 2 + 1) >
5198 expected_weight_in_part) {
5199 // if the weight is larger than the expected weight,
5200 // then we need to distribute some points to left, some to right.
5201 current_cut_line_determined(i) = true;
5202 Kokkos::atomic_add(&view_rectilinear_cut_count(0), 1);
5203
5204 // increase the num cuts to be determined with rectilinear
5205 // partitioning.
5206 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5207 new_current_cut_coordinates(i) =
5208 current_cut_coordinates(i);
// weight this process has on the cut; input of the scan after the kernel.
5209 local_process_rectilinear_cut_weight[i] =
5210 current_local_part_weights(i * 2 + 1) -
5211 current_local_part_weights(i * 2);
5212 bContinue = true;
5213 }
5214 }
5215
5216 if(!bContinue) {
5217
5218 // we need to move further right, so set lower bound to current line,
5219 // and shift it to the closest point from right.
5220 current_cut_lower_bounds(i) =
5221 current_global_right_closest_points(i);
5222
5223 //set the lower bound weight to the weight we have seen.
5224 current_cut_lower_bound_weights(i) = seen_weight_in_part;
5225
5226 // compare the upper bound with what has been found in the
5227 // last iteration.
5228 // we try to make more strict bounds for the cut here.
5229 for(mj_part_t ii = i + 1; ii < num_cuts ; ++ii) {
5230 mj_scalar_t p_weight = current_global_part_weights(ii * 2);
5231 mj_scalar_t line_weight =
5232 current_global_part_weights(ii * 2 + 1);
5233 if(p_weight >= expected_weight_in_part) {
5234 // if a cut on the right has the expected weight, then we found
5235 // our cut position. Set up and low coordinates to this
5236 // new cut coordinate, but we need one more iteration to
5237 // finalize the cut position, as we need to update the part ids.
5238 if(p_weight == expected_weight_in_part) {
5239 current_cut_upper_bounds(i) =
5240 current_cut_coordinates(ii);
5241 current_cut_upper_weights(i) = p_weight;
5242 current_cut_lower_bounds(i) =
5243 current_cut_coordinates(ii);
5244 current_cut_lower_bound_weights(i) = p_weight;
5245 } else if(p_weight < current_cut_upper_weights(i)) {
5246 // if a part weight is larger than my expected weight,
5247 // but lower than my upper bound weight, update upper bound.
5248 current_cut_upper_bounds(i) =
5249 current_global_left_closest_points(ii);
5250 current_cut_upper_weights(i) = p_weight;
5251 }
5252 break;
5253 }
5254 // if comes here then pw < ew
5255 // then compare the weight against line weight.
5256 if(line_weight >= expected_weight_in_part) {
5257 // if the line is larger than the expected weight, then we need
5258 // to reach to the balance by distributing coordinates on
5259 // this line.
5260 current_cut_upper_bounds(i) =
5261 current_cut_coordinates(ii);
5262 current_cut_upper_weights(i) = line_weight;
5263 current_cut_lower_bounds(i) =
5264 current_cut_coordinates(ii);
5265 current_cut_lower_bound_weights(i) = p_weight;
5266 break;
5267 }
5268 // if a stricter lower bound is found,
5269 // update the lower bound.
5270 if(p_weight <= expected_weight_in_part && p_weight >=
5271 current_cut_lower_bound_weights(i)) {
5272 current_cut_lower_bounds(i) =
5273 current_global_right_closest_points(ii);
5274 current_cut_lower_bound_weights(i) = p_weight;
5275 }
5276 }
5277
5278 mj_scalar_t new_cut_position = 0;
5279 algMJ_t::mj_calculate_new_cut_position(
5280 current_cut_upper_bounds(i),
5281 current_cut_lower_bounds(i),
5282 current_cut_upper_weights(i),
5283 current_cut_lower_bound_weights(i),
5284 expected_weight_in_part, new_cut_position,
5285 _sEpsilon);
5286
5287 // if cut line does not move significantly.
5288 // then finalize the search.
5289 if(std::abs(current_cut_coordinates(i) -
5290 new_cut_position) < local_sEpsilon) {
5291 current_cut_line_determined(i) = true;
5292 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5293
5294 //set the cut coordinate and proceed.
5295 new_current_cut_coordinates(i) =
5296 current_cut_coordinates(i);
5297 } else {
5298 new_current_cut_coordinates(i) = new_cut_position;
5299 }
5300 } // end of inner if(!bContinue)
5301 } else {
5302 // need to move the cut line to left.
5303 // set upper bound to current line.
5304 current_cut_upper_bounds(i) =
5305 current_global_left_closest_points(i);
5306 current_cut_upper_weights(i) =
5307 seen_weight_in_part;
5308 // compare the current cut line weights with
5309 // previous upper and lower bounds.
5310 for(int ii = i - 1; ii >= 0; --ii) {
5311 mj_scalar_t p_weight =
5312 current_global_part_weights(ii * 2);
5313 mj_scalar_t line_weight =
5314 current_global_part_weights(ii * 2 + 1);
5315 if(p_weight <= expected_weight_in_part) {
5316 if(p_weight == expected_weight_in_part) {
5317 // if the weight of the part is my expected weight
5318 // then we find the solution.
5319 current_cut_upper_bounds(i) =
5320 current_cut_coordinates(ii);
5321 current_cut_upper_weights(i) = p_weight;
5322 current_cut_lower_bounds(i) =
5323 current_cut_coordinates(ii);
5324 current_cut_lower_bound_weights(i) = p_weight;
5325 }
5326 else if(p_weight > current_cut_lower_bound_weights(i)) {
5327 // if found weight is bigger than the lower bound
5328 // then update the lower bound.
5329 current_cut_lower_bounds(i) =
5330 current_global_right_closest_points(ii);
5331 current_cut_lower_bound_weights(i) = p_weight;
5332
5333 // at the same time, if weight of line is bigger than the
5334 // expected weight, then update the upper bound as well.
5335 // in this case the balance will be obtained by distributing
5336 // weights on this cut position.
5337 if(line_weight > expected_weight_in_part) {
5338 current_cut_upper_bounds(i) =
5339 current_global_right_closest_points(ii);
5340 current_cut_upper_weights(i) = line_weight;
5341 }
5342 }
5343 break;
5344 }
5345 // if the weight of the cut on the left is still bigger than
5346 // my weight, and also if the weight is smaller than the current
5347 // upper weight, or if the weight is equal to current upper
5348 // weight, but on the left of the upper weight, then update
5349 // upper bound.
5350 if(p_weight >= expected_weight_in_part &&
5351 (p_weight < current_cut_upper_weights(i) ||
5352 (p_weight == current_cut_upper_weights(i) &&
5353 current_cut_upper_bounds(i) >
5354 current_global_left_closest_points(ii)))) {
5355 current_cut_upper_bounds(i) =
5356 current_global_left_closest_points(ii);
5357 current_cut_upper_weights(i) = p_weight;
5358 }
5359 }
5360 mj_scalar_t new_cut_position = 0;
5361 algMJ_t::mj_calculate_new_cut_position(
5362 current_cut_upper_bounds(i),
5363 current_cut_lower_bounds(i),
5364 current_cut_upper_weights(i),
5365 current_cut_lower_bound_weights(i),
5366 expected_weight_in_part,
5367 new_cut_position,
5368 _sEpsilon);
5369
5370 // if cut line does not move significantly.
5371 if(std::abs(current_cut_coordinates(i) -
5372 new_cut_position) < local_sEpsilon) {
5373 current_cut_line_determined(i) = true;
5374 Kokkos::atomic_add(&local_device_incomplete_cut_count(kk), -1);
5375 //set the cut coordinate and proceed.
5376 new_current_cut_coordinates(i) =
5377 current_cut_coordinates(i);
5378 } else {
5379 new_current_cut_coordinates(i) =
5380 new_cut_position;
5381 }
5382 }
5383 }; // end of outer if(!bContinue)
5384 });
5385
5386 team_member.team_barrier(); // for end of Kokkos::TeamThreadRange
5387 });
5388
5389 // read view_rectilinear_cut_count(0) back to the host through a
// one-iteration reduction.
// NOTE(review): the reducer argument is declared int& while the result
// variable is mj_part_t; these agree only when mj_part_t is int - confirm.
5390 mj_part_t rectilinear_cut_count;
5391 Kokkos::parallel_reduce("Read bDoingWork",
5392 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>(0, 1),
5393 KOKKOS_LAMBDA(int dummy, int & set_single) {
5394 set_single = view_rectilinear_cut_count(0);
5395 }, rectilinear_cut_count);
5396
5397 if(rectilinear_cut_count > 0) {
// the scan is called with host pointers, so both weight arrays are copied
// to host mirrors before it and copied back to the device after it.
5398 auto host_local_process_rectilinear_cut_weight =
5399 Kokkos::create_mirror_view(Kokkos::HostSpace(),
5400 local_process_rectilinear_cut_weight);
5401 auto host_local_global_rectilinear_cut_weight =
5402 Kokkos::create_mirror_view(Kokkos::HostSpace(),
5403 local_global_rectilinear_cut_weight);
5404 Kokkos::deep_copy(host_local_process_rectilinear_cut_weight,
5405 local_process_rectilinear_cut_weight);
5406 Kokkos::deep_copy(host_local_global_rectilinear_cut_weight,
5407 local_global_rectilinear_cut_weight);
5408 Teuchos::scan<int,mj_scalar_t>(
5409 *comm, Teuchos::REDUCE_SUM,
5410 num_cuts,
5411 host_local_process_rectilinear_cut_weight.data(),
5412 host_local_global_rectilinear_cut_weight.data());
5413 Kokkos::deep_copy(local_process_rectilinear_cut_weight,
5414 host_local_process_rectilinear_cut_weight);
5415 Kokkos::deep_copy(local_global_rectilinear_cut_weight,
5416 host_local_global_rectilinear_cut_weight);
5417
// single-iteration kernel: walks the cuts serially on the device.
5418 Kokkos::parallel_for("finish up mj_get_new_cut_coordinates",
5419 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
5420 KOKKOS_LAMBDA(int dummy) {
5421 for(mj_part_t i = 0; i < num_cuts; ++i) {
5422 // if cut line weight to be distributed.
5423 if(local_global_rectilinear_cut_weight(i) > 0) {
5424 // expected weight to go to left of the cut.
5425 mj_scalar_t expected_part_weight = current_part_target_weights(i);
5426 // the weight that should be put to left of the cut.
5427 mj_scalar_t necessary_weight_on_line_for_left =
5428 expected_part_weight - current_global_part_weights(i * 2);
5429
5430 // the weight of the cut in the process
5431 mj_scalar_t my_weight_on_line =
5432 local_process_rectilinear_cut_weight(i);
5433
5434 // the sum of the cut weights upto this process,
5435 // including the weight of this process.
5436 mj_scalar_t weight_on_line_upto_process_inclusive =
5437 local_global_rectilinear_cut_weight(i);
5438 // the space on the left side of the cut after all processes
5439 // before this process (including this process)
5440 // puts their weights on cut to left.
5441 mj_scalar_t space_to_put_left =
5442 necessary_weight_on_line_for_left -
5443 weight_on_line_upto_process_inclusive;
5444 // add my weight to this space to find out how much space
5445 // is left to me.
5446 mj_scalar_t space_left_to_me =
5447 space_to_put_left + my_weight_on_line;
5448
5449 /*
5450 cout << "expected_part_weight:" << expected_part_weight
5451 << " necessary_weight_on_line_for_left:"
5452 << necessary_weight_on_line_for_left
5453 << " my_weight_on_line" << my_weight_on_line
5454 << " weight_on_line_upto_process_inclusive:"
5455 << weight_on_line_upto_process_inclusive
5456 << " space_to_put_left:" << space_to_put_left
5457 << " space_left_to_me" << space_left_to_me << endl;
5458 */
5459
5460 if(space_left_to_me < 0) {
5461 // space_left_to_me is negative and i dont need to put
5462 // anything to left.
5463 current_part_cut_line_weight_to_put_left(i) = 0;
5464 }
5465 else if(space_left_to_me >= my_weight_on_line) {
5466 // space left to me is bigger than the weight of the
5467 // processor on cut.
5468 // so put everything to left.
5469 current_part_cut_line_weight_to_put_left(i) =
5470 my_weight_on_line;
5471 // cout << "setting current_part_cut_line_weight_to_put_left
5472 // to my_weight_on_line:" << my_weight_on_line << endl;
5473 }
5474 else {
5475 // put only the weight as much as the space.
5476 current_part_cut_line_weight_to_put_left(i) =
5477 space_left_to_me;
5478 // cout << "setting current_part_cut_line_weight_to_put_left
5479 // to space_left_to_me:" << space_left_to_me << endl;
5480 }
5481 }
5482 }
// reset the counter now that the rectilinear cuts have been handled.
5483 view_rectilinear_cut_count(0) = 0;
5484 });
5485 }
5486
// publish the decremented incomplete cut counts back to the member view.
5487 Kokkos::deep_copy(this->incomplete_cut_count, device_incomplete_cut_count);
5488}
5489
5499template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5500 typename mj_part_t, typename mj_node_t>
5501void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5502 get_processor_num_points_in_parts(
5503 mj_part_t num_procs,
5504 mj_part_t num_parts,
5505 mj_gno_t *&num_points_in_all_processor_parts)
5506{
5507 // initially allocation_size is num_parts
5508 size_t allocation_size = num_parts * (num_procs + 1);
5509
5510 // this will be output
5511 // holds how many each processor has in each part.
5512 // last portion is the sum of all processor points in each part.
5513
5514 // allocate memory for the local num coordinates in each part.
5515 mj_gno_t *num_local_points_in_each_part_to_reduce_sum =
5516 new mj_gno_t[allocation_size];
5517
5518 // this is the portion of the memory which will be used
5519 // at the summation to obtain total number of processors' points in each part.
5520 mj_gno_t *my_local_points_to_reduce_sum =
5521 num_local_points_in_each_part_to_reduce_sum + num_procs * num_parts;
5522
5523 // this is the portion of the memory where each stores its local number.
5524 // this information is needed by other processors.
5525 mj_gno_t *my_local_point_counts_in_each_part =
5526 num_local_points_in_each_part_to_reduce_sum + this->myRank * num_parts;
5527
5528 // initialize the array with 0's.
5529 memset(num_local_points_in_each_part_to_reduce_sum, 0,
5530 sizeof(mj_gno_t)*allocation_size);
5531
5532 auto local_new_part_xadj = this->new_part_xadj;
5533 Kokkos::View<mj_gno_t *, typename mj_node_t::device_type> points_per_part(
5534 Kokkos::ViewAllocateWithoutInitializing("points per part"), num_parts);
5535 Kokkos::parallel_for("get vals on device",
5536 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_gno_t>
5537 (0, num_parts), KOKKOS_LAMBDA(mj_gno_t i) {
5538 points_per_part(i) =
5539 local_new_part_xadj(i) - ((i == 0) ? 0 : local_new_part_xadj(i-1));
5540 });
5541 auto host_points_per_part = Kokkos::create_mirror_view(points_per_part);
5542 Kokkos::deep_copy(host_points_per_part, points_per_part);
5543 for(int i = 0; i < num_parts; ++i) {
5544 my_local_points_to_reduce_sum[i] = host_points_per_part(i);
5545 }
5546
5547 // copy the local num parts to the last portion of array, so that this portion
5548 // will represent the global num points in each part after the reduction.
5549 memcpy (my_local_point_counts_in_each_part, my_local_points_to_reduce_sum,
5550 sizeof(mj_gno_t) * (num_parts) );
5551
5552 // reduceAll operation.
5553 // the portion that belongs to a processor with index p
5554 // will start from myRank * num_parts.
5555 // the global number of points will be held at the index
5556 try{
5557 reduceAll<int, mj_gno_t>(
5558 *(this->comm),
5559 Teuchos::REDUCE_SUM,
5560 allocation_size,
5561 num_local_points_in_each_part_to_reduce_sum,
5562 num_points_in_all_processor_parts);
5563 }
5564 Z2_THROW_OUTSIDE_ERROR(*(this->mj_env))
5565
5566 delete [] num_local_points_in_each_part_to_reduce_sum;
5567}
5568
5584template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5585 typename mj_part_t, typename mj_node_t>
5586bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5587 mj_check_to_migrate(
5588 size_t migration_reduce_all_population,
5589 mj_lno_t num_coords_for_last_dim_part,
5590 mj_part_t num_procs,
5591 mj_part_t num_parts,
5592 mj_gno_t *num_points_in_all_processor_parts)
5593{
5594 // if reduce all count and population in the last dim is too high
5595 if(migration_reduce_all_population > future_reduceall_cutoff) {
5596 return true;
5597 }
5598
5599 // if the work in a part per processor in the last dim is too low.
5600 if(num_coords_for_last_dim_part < min_work_last_dim) {
5601 return true;
5602 }
5603
5604 // if migration is to be checked and the imbalance is too high
5605 if(this->check_migrate_avoid_migration_option == 0) {
5606 double global_imbalance = 0;
5607 // global shift to reach the sum of coordiante count in each part.
5608 size_t global_shift = num_procs * num_parts;
5609
5610 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5611 for(mj_part_t i = 0; i < num_parts; ++i) {
5612 double ideal_num = num_points_in_all_processor_parts[global_shift + i]
5613 / double(num_procs);
5614
5615 global_imbalance += std::abs(ideal_num -
5616 num_points_in_all_processor_parts[ii * num_parts + i]) / (ideal_num);
5617 }
5618 }
5619 global_imbalance /= num_parts;
5620 global_imbalance /= num_procs;
5621
5622 if(global_imbalance <= this->minimum_migration_imbalance) {
5623 return false;
5624 }
5625 else {
5626 return true;
5627 }
5628 }
5629 else {
5630 // if migration is forced
5631 return true;
5632 }
5633}
5634
5648template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5649 typename mj_part_t, typename mj_node_t>
5650void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5651 assign_send_destinations(
5652 mj_part_t num_parts,
5653 mj_part_t *part_assignment_proc_begin_indices,
5654 mj_part_t *processor_chains_in_parts,
5655 mj_lno_t *send_count_to_each_proc,
5656 int *coordinate_destinations) {
5657
5658 auto host_new_part_xadj = Kokkos::create_mirror_view(this->new_part_xadj);
5659 deep_copy(host_new_part_xadj, this->new_part_xadj);
5660
5661 auto host_new_coordinate_permutations =
5662 Kokkos::create_mirror_view(this->new_coordinate_permutations);
5663 deep_copy(host_new_coordinate_permutations, this->new_coordinate_permutations);
5664
5665 for(mj_part_t p = 0; p < num_parts; ++p) {
5666 mj_lno_t part_begin = 0;
5667 if(p > 0) part_begin = host_new_part_xadj(p - 1);
5668 mj_lno_t part_end = host_new_part_xadj(p);
5669 // get the first part that current processor will send its part-p.
5670 mj_part_t proc_to_sent = part_assignment_proc_begin_indices[p];
5671 // initialize how many point I sent to this processor.
5672 mj_lno_t num_total_send = 0;
5673 for(mj_lno_t j=part_begin; j < part_end; j++) {
5674 mj_lno_t local_ind = host_new_coordinate_permutations(j);
5675 while (num_total_send >= send_count_to_each_proc[proc_to_sent]) {
5676 // then get the next processor to send the points in part p.
5677 num_total_send = 0;
5678 // assign new processor to part_assign_begin[p]
5679 part_assignment_proc_begin_indices[p] =
5680 processor_chains_in_parts[proc_to_sent];
5681 // remove the previous processor
5682 processor_chains_in_parts[proc_to_sent] = -1;
5683 // choose the next processor as the next one to send.
5684 proc_to_sent = part_assignment_proc_begin_indices[p];
5685 }
5686 // write the gno index to corresponding position in sendBuf.
5687 coordinate_destinations[local_ind] = proc_to_sent;
5688 ++num_total_send;
5689 }
5690 }
5691}
5692
5713template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
5714 typename mj_part_t, typename mj_node_t>
5715void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
5716 mj_assign_proc_to_parts(
5717 mj_gno_t * num_points_in_all_processor_parts,
5718 mj_part_t num_parts,
5719 mj_part_t num_procs,
5720 mj_lno_t *send_count_to_each_proc,
5721 std::vector<mj_part_t> &processor_ranks_for_subcomm,
5722 std::vector<mj_part_t> *next_future_num_parts_in_parts,
5723 mj_part_t &out_part_index,
5724 mj_part_t &output_part_numbering_begin_index,
5725 int * coordinate_destinations) {
5726 mj_gno_t *global_num_points_in_parts =
5727 num_points_in_all_processor_parts + num_procs * num_parts;
5728 mj_part_t *num_procs_assigned_to_each_part = new mj_part_t[num_parts];
5729
5730 // boolean variable if the process finds its part to be assigned.
5731 bool did_i_find_my_group = false;
5732
5733 mj_part_t num_free_procs = num_procs;
5734 mj_part_t minimum_num_procs_required_for_rest_of_parts = num_parts - 1;
5735
5736 double max_imbalance_difference = 0;
5737 mj_part_t max_differing_part = 0;
5738
5739 // find how many processor each part requires.
5740 for(mj_part_t i = 0; i < num_parts; i++) {
5741
5742 // scalar portion of the required processors
5743 double scalar_required_proc = num_procs *
5744 (double (global_num_points_in_parts[i]) /
5745 double (this->num_global_coords));
5746
5747 // round it to closest integer; make sure have at least one proc.
5748 mj_part_t required_proc =
5749 static_cast<mj_part_t> (0.5 + scalar_required_proc);
5750 if(required_proc == 0) required_proc = 1;
5751
5752 // if assigning the required num procs, creates problems for the rest
5753 // of the parts, then only assign {num_free_procs -
5754 // (minimum_num_procs_required_for_rest_of_parts)} procs to this part.
5755 if(num_free_procs -
5756 required_proc < minimum_num_procs_required_for_rest_of_parts) {
5757 required_proc = num_free_procs -
5758 (minimum_num_procs_required_for_rest_of_parts);
5759 }
5760
5761 // reduce the free processor count
5762 num_free_procs -= required_proc;
5763
5764 // reduce the free minimum processor count required for the rest of the
5765 // part by 1.
5766 --minimum_num_procs_required_for_rest_of_parts;
5767
5768 // part (i) is assigned to (required_proc) processors.
5769 num_procs_assigned_to_each_part[i] = required_proc;
5770
5771 // because of the roundings some processors might be left as unassigned.
5772 // we want to assign those processors to the part with most imbalance.
5773 // find the part with the maximum imbalance here.
5774 double imbalance_wrt_ideal =
5775 (scalar_required_proc - required_proc) / required_proc;
5776 if(imbalance_wrt_ideal > max_imbalance_difference) {
5777 max_imbalance_difference = imbalance_wrt_ideal;
5778 max_differing_part = i;
5779 }
5780 }
5781
5782 // assign extra processors to the part with maximum imbalance
5783 // than the ideal.
5784 if(num_free_procs > 0) {
5785 num_procs_assigned_to_each_part[max_differing_part] += num_free_procs;
5786 }
5787
5788 // now find what are the best processors with least migration for each part.
5789
5790 // part_assignment_proc_begin_indices ([i]) is the array that holds the
5791 // beginning index of a processor that processor sends its data for part - i
5792 mj_part_t *part_assignment_proc_begin_indices = new mj_part_t[num_parts];
5793
5794 // the next processor send is found in processor_chains_in_parts,
5795 // in linked list manner.
5796 mj_part_t *processor_chains_in_parts = new mj_part_t [num_procs];
5797 mj_part_t *processor_part_assignments = new mj_part_t[num_procs];
5798
5799 // initialize the assignment of each processor.
5800 // this has a linked list implementation.
5801 // the beginning of processors assigned
5802 // to each part is hold at part_assignment_proc_begin_indices[part].
5803 // then the next processor assigned to that part is located at
5804 // proc_part_assignments[part_assign_begins[part]], this is a chain
5805 // until the value of -1 is reached.
5806 for(int i = 0; i < num_procs; ++i ) {
5807 processor_part_assignments[i] = -1;
5808 processor_chains_in_parts[i] = -1;
5809 }
5810 for(int i = 0; i < num_parts; ++i ) {
5811 part_assignment_proc_begin_indices[i] = -1;
5812 }
5813
5814 // std::cout << "Before migration: mig type:" <<
5815 // this->migration_type << std::endl;
5816 // Allocate memory for sorting data structure.
5817 uSignedSortItem<mj_part_t, mj_gno_t, char> *
5818 sort_item_num_part_points_in_procs =
5819 new uSignedSortItem<mj_part_t, mj_gno_t, char>[num_procs];
5820
5821 for(mj_part_t i = 0; i < num_parts; ++i) {
5822 // the algorithm tries to minimize the cost of migration, by assigning the
5823 // processors with highest number of coordinates on that part.
5824 // here we might want to implement a maximum weighted bipartite matching
5825 // algorithm.
5826 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5827 sort_item_num_part_points_in_procs[ii].id = ii;
5828 // if processor is not assigned yet.
5829 // add its num points to the sort data structure.
5830 if(processor_part_assignments[ii] == -1) {
5831 sort_item_num_part_points_in_procs[ii].val =
5832 num_points_in_all_processor_parts[ii * num_parts + i];
5833 // indicate that the processor has positive weight.
5834 sort_item_num_part_points_in_procs[ii].signbit = 1;
5835 }
5836 else {
5837 // if processor is already assigned, insert -nLocal - 1 so that it
5838 // won't be selected again.
5839 // would be same if we simply set it to -1, but more information with
5840 // no extra cost (which is used later) is provided.
5841 // sort_item_num_part_points_in_procs[ii].val =
5842 // -num_points_in_all_processor_parts[ii * num_parts + i] - 1;
5843
5844 // UPDATE: Since above gets warning when unsigned is used to
5845 // represent, we added extra bit to as sign bit to the sort item.
5846 // It is 1 for positives, 0 for negatives.
5847 sort_item_num_part_points_in_procs[ii].val =
5848 num_points_in_all_processor_parts[ii * num_parts + i];
5849 sort_item_num_part_points_in_procs[ii].signbit = 0;
5850 }
5851 }
5852
5853 // sort the processors in the part.
5854 uqSignsort<mj_part_t, mj_gno_t,char>
5855 (num_procs, sort_item_num_part_points_in_procs);
5856
5857 /*
5858 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5859 std::cout << "ii:" << ii << " " <<
5860 sort_item_num_part_points_in_procs[ii].id <<
5861 " " << sort_item_num_part_points_in_procs[ii].val <<
5862 " " << int(sort_item_num_part_points_in_procs[ii].signbit) <<
5863 std::endl;
5864 }
5865 */
5866
5867 mj_part_t required_proc_count = num_procs_assigned_to_each_part[i];
5868 mj_gno_t total_num_points_in_part = global_num_points_in_parts[i];
5869 mj_gno_t ideal_num_points_in_a_proc = Teuchos::as<mj_gno_t>(
5870 ceil(total_num_points_in_part / double (required_proc_count)));
5871
5872 // starts sending to least heaviest part.
5873 mj_part_t next_proc_to_send_index = num_procs - required_proc_count;
5874 mj_part_t next_proc_to_send_id =
5875 sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
5876 mj_lno_t space_left_in_sent_proc = ideal_num_points_in_a_proc -
5877 sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
5878
5879 // find the processors that will be assigned to this part, which are the
5880 // heaviest non assigned processors.
5881 for(mj_part_t ii = num_procs - 1;
5882 ii >= num_procs - required_proc_count; --ii) {
5883 mj_part_t proc_id = sort_item_num_part_points_in_procs[ii].id;
5884 // assign processor to part - i.
5885 processor_part_assignments[proc_id] = i;
5886 }
5887
5888 bool did_change_sign = false;
5889 // if processor has a minus count, reverse it.
5890 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5891 // TODO: THE LINE BELOW PRODUCES A WARNING IF gno_t IS UNSIGNED
5892 // TODO: SEE BUG 6194
5893 if(sort_item_num_part_points_in_procs[ii].signbit == 0) {
5894 did_change_sign = true;
5895 sort_item_num_part_points_in_procs[ii].signbit = 1;
5896 }
5897 else {
5898 break;
5899 }
5900 }
5901
5902 if(did_change_sign) {
5903 // resort the processors in the part for the rest of the processors that
5904 // is not assigned.
5905 uqSignsort<mj_part_t, mj_gno_t>(num_procs - required_proc_count,
5906 sort_item_num_part_points_in_procs);
5907 }
5908
5909 /*
5910 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
5911 std::cout << "after resort ii:" << ii << " " <<
5912 sort_item_num_part_points_in_procs[ii].id <<
5913 " " << sort_item_num_part_points_in_procs[ii].val <<
5914 " " << int(sort_item_num_part_points_in_procs[ii].signbit ) <<
5915 std::endl;
5916 }
5917 */
5918
5919 // check if this processors is one of the procs assigned to this part.
5920 // if it is, then get the group.
5921 if(!did_i_find_my_group) {
5922 for(mj_part_t ii = num_procs - 1; ii >=
5923 num_procs - required_proc_count; --ii) {
5924
5925 mj_part_t proc_id_to_assign = sort_item_num_part_points_in_procs[ii].id;
5926
5927 // add the proc to the group.
5928 processor_ranks_for_subcomm.push_back(proc_id_to_assign);
5929
5930 if(proc_id_to_assign == this->myRank) {
5931 // if the assigned process is me, then I find my group.
5932 did_i_find_my_group = true;
5933
5934 // set the beginning of part i to my rank.
5935 part_assignment_proc_begin_indices[i] = this->myRank;
5936 processor_chains_in_parts[this->myRank] = -1;
5937
5938 // set send count to myself to the number of points that I have
5939 // in part i.
5940 send_count_to_each_proc[this->myRank] =
5941 sort_item_num_part_points_in_procs[ii].val;
5942
5943 // calculate the shift required for the
5944 // output_part_numbering_begin_index
5945 for(mj_part_t in = 0; in < i; ++in) {
5946 output_part_numbering_begin_index +=
5947 (*next_future_num_parts_in_parts)[in];
5948 }
5949 out_part_index = i;
5950 }
5951 }
5952
5953 // if these was not my group,
5954 // clear the subcomminicator processor array.
5955 if(!did_i_find_my_group) {
5956 processor_ranks_for_subcomm.clear();
5957 }
5958 }
5959
5960 // send points of the nonassigned coordinates to the assigned coordinates.
5961 // starts from the heaviest nonassigned processor.
5962 // TODO we might want to play with this part, that allows more
5963 // computational imbalance but having better communication balance.
5964 for(mj_part_t ii = num_procs - required_proc_count - 1; ii >= 0; --ii) {
5965 mj_part_t nonassigned_proc_id =
5966 sort_item_num_part_points_in_procs[ii].id;
5967 mj_lno_t num_points_to_sent =
5968 sort_item_num_part_points_in_procs[ii].val;
5969
5970 // we set number of points to -to_sent - 1 for the assigned processors.
5971 // we reverse it here. This should not happen, as we have already
5972 // reversed them above.
5973#ifdef MJ_DEBUG
5974 if(num_points_to_sent < 0) {
5975 cout << "Migration - processor assignments - for part:" << i
5976 << "from proc:" << nonassigned_proc_id << " num_points_to_sent:"
5977 << num_points_to_sent << std::endl;
5978 std::terminate();
5979 }
5980#endif
5981
5982 switch (migration_type) {
5983 case 0:
5984 {
5985 // now sends the points to the assigned processors.
5986 while (num_points_to_sent > 0) {
5987 // if the processor has enough space.
5988 if(num_points_to_sent <= space_left_in_sent_proc) {
5989 // reduce the space left in the processor.
5990 space_left_in_sent_proc -= num_points_to_sent;
5991 // if my rank is the one that is sending the coordinates.
5992 if(this->myRank == nonassigned_proc_id) {
5993 // set my sent count to the sent processor.
5994 send_count_to_each_proc[next_proc_to_send_id] =
5995 num_points_to_sent;
5996 // save the processor in the list (processor_chains_in_parts
5997 // and part_assignment_proc_begin_indices)
5998 // that the processor will send its point in part-i.
5999 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
6000 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
6001 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
6002 }
6003 num_points_to_sent = 0;
6004 }
6005 else {
6006 // there might be no space left in the processor.
6007 if(space_left_in_sent_proc > 0) {
6008 num_points_to_sent -= space_left_in_sent_proc;
6009
6010 //send as the space left in the processor.
6011 if(this->myRank == nonassigned_proc_id) {
6012 // send as much as the space in this case.
6013 send_count_to_each_proc[next_proc_to_send_id] =
6014 space_left_in_sent_proc;
6015 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
6016 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
6017 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
6018 }
6019 }
6020 // change the sent part
6021 ++next_proc_to_send_index;
6022
6023#ifdef MJ_DEBUG
6024 if(next_part_to_send_index < nprocs - required_proc_count ) {
6025 cout << "Migration - processor assignments - for part:"
6026 << i
6027 << " next_part_to_send :" << next_part_to_send_index
6028 << " nprocs:" << nprocs
6029 << " required_proc_count:" << required_proc_count
6030 << " Error: next_part_to_send_index <" <<
6031 << " nprocs - required_proc_count" << std::endl;
6032 std::terminate();
6033 }
6034#endif
6035 // send the new id.
6036 next_proc_to_send_id =
6037 sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
6038 // set the new space in the processor.
6039 space_left_in_sent_proc = ideal_num_points_in_a_proc -
6040 sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
6041 }
6042 }
6043 }
6044 break;
6045 default:
6046 {
6047 // to minimize messages, we want each processor to send its
6048 // coordinates to only a single point.
6049 // we do not respect imbalances here, we send all points to the
6050 // next processor.
6051 if(this->myRank == nonassigned_proc_id) {
6052 // set my sent count to the sent processor.
6053 send_count_to_each_proc[next_proc_to_send_id] = num_points_to_sent;
6054 // save the processor in the list (processor_chains_in_parts and
6055 // part_assignment_proc_begin_indices)
6056 // that the processor will send its point in part-i.
6057 mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
6058 part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
6059 processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
6060 }
6061 num_points_to_sent = 0;
6062 ++next_proc_to_send_index;
6063
6064 // if we made it to the heaviest processor we round robin and
6065 // go to beginning
6066 if(next_proc_to_send_index == num_procs) {
6067 next_proc_to_send_index = num_procs - required_proc_count;
6068 }
6069 // send the new id.
6070 next_proc_to_send_id =
6071 sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
6072 // set the new space in the processor.
6073 space_left_in_sent_proc = ideal_num_points_in_a_proc -
6074 sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
6075 }
6076 }
6077 }
6078 }
6079
6080 /*
6081 for(int i = 0; i < num_procs;++i) {
6082 std::cout << "me:" << this->myRank << " to part:" << i << " sends:" <<
6083 send_count_to_each_proc[i] << std::endl;
6084 }
6085 */
6086
6087 this->assign_send_destinations(
6088 num_parts,
6089 part_assignment_proc_begin_indices,
6090 processor_chains_in_parts,
6091 send_count_to_each_proc,
6092 coordinate_destinations);
6093 delete [] part_assignment_proc_begin_indices;
6094 delete [] processor_chains_in_parts;
6095 delete [] processor_part_assignments;
6096 delete [] sort_item_num_part_points_in_procs;
6097 delete [] num_procs_assigned_to_each_part;
6098}
6099
6115template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6116 typename mj_part_t, typename mj_node_t>
6117void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6118 assign_send_destinations2(
6119 mj_part_t num_parts,
6120 uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment,
6121 int *coordinate_destinations,
6122 mj_part_t &output_part_numbering_begin_index,
6123 std::vector<mj_part_t> *next_future_num_parts_in_parts)
6124{
6125 mj_part_t part_shift_amount = output_part_numbering_begin_index;
6126 mj_part_t previous_processor = -1;
6127
6128 auto local_new_part_xadj = Kokkos::create_mirror_view(this->new_part_xadj);
6129 Kokkos::deep_copy(local_new_part_xadj, this->new_part_xadj);
6130
6131 auto local_new_coordinate_permutations =
6132 Kokkos::create_mirror_view(this->new_coordinate_permutations);
6133 Kokkos::deep_copy(local_new_coordinate_permutations,
6134 this->new_coordinate_permutations);
6135
6136 for(mj_part_t i = 0; i < num_parts; ++i) {
6137 mj_part_t p = sort_item_part_to_proc_assignment[i].id;
6138
6139 // assigned processors are sorted.
6140 mj_lno_t part_begin_index = 0;
6141
6142 if(p > 0) {
6143 part_begin_index = local_new_part_xadj(p - 1);
6144 }
6145
6146 mj_lno_t part_end_index = local_new_part_xadj(p);
6147
6148 mj_part_t assigned_proc = sort_item_part_to_proc_assignment[i].val;
6149 if(this->myRank == assigned_proc && previous_processor != assigned_proc) {
6150 output_part_numbering_begin_index = part_shift_amount;
6151 }
6152 previous_processor = assigned_proc;
6153 part_shift_amount += (*next_future_num_parts_in_parts)[p];
6154
6155 for(mj_lno_t j= part_begin_index; j < part_end_index; j++) {
6156 mj_lno_t localInd = local_new_coordinate_permutations(j);
6157 coordinate_destinations[localInd] = assigned_proc;
6158 }
6159 }
6160}
6161
6183template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6184 typename mj_part_t, typename mj_node_t>
6185void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6186 mj_assign_parts_to_procs(
6187 mj_gno_t * num_points_in_all_processor_parts,
6188 mj_part_t num_parts,
6189 mj_part_t num_procs,
6190 mj_lno_t *send_count_to_each_proc,
6191 std::vector<mj_part_t> *next_future_num_parts_in_parts,
6192 mj_part_t &out_num_part,
6193 std::vector<mj_part_t> &out_part_indices,
6194 mj_part_t &output_part_numbering_begin_index,
6195 int *coordinate_destinations) {
6196
6197 out_num_part = 0;
6198 mj_gno_t *global_num_points_in_parts =
6199 num_points_in_all_processor_parts + num_procs * num_parts;
6200 out_part_indices.clear();
6201
6202 // to sort the parts that is assigned to the processors.
6203 // id is the part number, sort value is the assigned processor id.
6204 uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment =
6205 new uSortItem<mj_part_t, mj_part_t>[num_parts];
6206 uSortItem<mj_part_t, mj_gno_t> * sort_item_num_points_of_proc_in_part_i =
6207 new uSortItem<mj_part_t, mj_gno_t>[num_procs];
6208
6209 // calculate the optimal number of coordinates that should be assigned
6210 // to each processor.
6211 mj_lno_t work_each =
6212 mj_lno_t (this->num_global_coords / (double (num_procs)) + 0.5f);
6213
6214 // to hold the left space as the number of coordinates to the optimal
6215 // number in each proc.
6216 mj_lno_t *space_in_each_processor = new mj_lno_t[num_procs];
6217
6218 // initialize left space in each.
6219 for(mj_part_t i = 0; i < num_procs; ++i) {
6220 space_in_each_processor[i] = work_each;
6221 }
6222
6223 // we keep track of how many parts each processor is assigned to.
6224 // because in some weird inputs, it might be possible that some
6225 // processors is not assigned to any part. Using these variables,
6226 // we force each processor to have at least one part.
6227 mj_part_t *num_parts_proc_assigned = new mj_part_t[num_procs];
6228 memset(num_parts_proc_assigned, 0, sizeof(mj_part_t) * num_procs);
6229 int empty_proc_count = num_procs;
6230
6231 // to sort the parts with decreasing order of their coordiantes.
6232 // id are the part numbers, sort value is the number of points in each.
6233 uSortItem<mj_part_t, mj_gno_t> * sort_item_point_counts_in_parts =
6234 new uSortItem<mj_part_t, mj_gno_t>[num_parts];
6235
6236 // initially we will sort the parts according to the number of coordinates
6237 // they have, so that we will start assigning with the part that has the most
6238 // number of coordinates.
6239 for(mj_part_t i = 0; i < num_parts; ++i) {
6240 sort_item_point_counts_in_parts[i].id = i;
6241 sort_item_point_counts_in_parts[i].val = global_num_points_in_parts[i];
6242 }
6243
6244 // sort parts with increasing order of loads.
6245 uqsort<mj_part_t, mj_gno_t>(num_parts, sort_item_point_counts_in_parts);
6246
6247 // assigning parts to the processors
6248 // traverse the part with decreasing order of load.
6249 // first assign the heaviest part.
6250 for(mj_part_t j = 0; j < num_parts; ++j) {
6251 // sorted with increasing order, traverse inverse.
6252 mj_part_t i = sort_item_point_counts_in_parts[num_parts - 1 - j].id;
6253
6254 // load of the part
6255 mj_gno_t load = global_num_points_in_parts[i];
6256
6257 // assigned processors
6258 mj_part_t assigned_proc = -1;
6259
6260 // sort processors with increasing number of points in this part.
6261 for(mj_part_t ii = 0; ii < num_procs; ++ii) {
6262 sort_item_num_points_of_proc_in_part_i[ii].id = ii;
6263
6264 // if there are still enough parts to fill empty processors, than proceed
6265 // normally, but if empty processor count is equal to the number of part,
6266 // then we force to part assignments only to empty processors.
6267 if(empty_proc_count < num_parts - j ||
6268 num_parts_proc_assigned[ii] == 0) {
6269 // how many points processor ii has in part i?
6270 sort_item_num_points_of_proc_in_part_i[ii].val =
6271 num_points_in_all_processor_parts[ii * num_parts + i];
6272 }
6273 else {
6274 sort_item_num_points_of_proc_in_part_i[ii].val = -1;
6275 }
6276 }
6277
6278 uqsort<mj_part_t, mj_gno_t>(num_procs,
6279 sort_item_num_points_of_proc_in_part_i);
6280
6281 // traverse all processors with decreasing load.
6282 for(mj_part_t iii = num_procs - 1; iii >= 0; --iii) {
6283 mj_part_t ii = sort_item_num_points_of_proc_in_part_i[iii].id;
6284 if(assigned_proc == -1 ||
6285 (space_in_each_processor[ii] > space_in_each_processor[assigned_proc])) {
6286 assigned_proc = ii;
6287 }
6288 else if(space_in_each_processor[ii] == space_in_each_processor[assigned_proc]) {
6289 if(ii < assigned_proc) {
6290 // ties go to lower proc
6291 // not necessary for a valid result but allows testing to compare
6292 // MPI results and have parts numbers assigned to the same boxes.
6293 // We don't break here because we may have more ties still to check.
6294 // The indeterminate state before this is due to Cuda using
6295 // atomics to refill the permutation array. So non-cuda runs don't
6296 // actualy need this since they will always have the same pattern.
6297 assigned_proc = ii;
6298 }
6299 }
6300 else {
6301 break; // now we can break - we have our part and no more ties.
6302 }
6303 }
6304
6305 if(num_parts_proc_assigned[assigned_proc]++ == 0) {
6306 --empty_proc_count;
6307 }
6308
6309 space_in_each_processor[assigned_proc] -= load;
6310 //to sort later, part-i is assigned to the proccessor - assignment.
6311 sort_item_part_to_proc_assignment[j].id = i; //part i
6312
6313 // assigned to processor - assignment.
6314 sort_item_part_to_proc_assignment[j].val = assigned_proc;
6315
6316 // if assigned processor is me, increase the number.
6317 if(assigned_proc == this->myRank) {
6318 out_num_part++;//assigned_part_count;
6319 out_part_indices.push_back(i);
6320 }
6321
6322 // increase the send to that processor by the number of points in that
6323 // part, as everyone send their coordiantes in this part to the
6324 // processor assigned to this part.
6325 send_count_to_each_proc[assigned_proc] +=
6326 num_points_in_all_processor_parts[this->myRank * num_parts + i];
6327 }
6328
6329 delete [] num_parts_proc_assigned;
6330 delete [] sort_item_num_points_of_proc_in_part_i;
6331 delete [] sort_item_point_counts_in_parts;
6332 delete [] space_in_each_processor;
6333
6334 // sort assignments with respect to the assigned processors.
6335 uqsort<mj_part_t, mj_part_t>(num_parts, sort_item_part_to_proc_assignment);
6336
6337 // fill sendBuf.
6338 this->assign_send_destinations2(
6339 num_parts,
6340 sort_item_part_to_proc_assignment,
6341 coordinate_destinations,
6342 output_part_numbering_begin_index,
6343 next_future_num_parts_in_parts);
6344
6345 delete [] sort_item_part_to_proc_assignment;
6346}
6347
6348
6372template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6373 typename mj_part_t, typename mj_node_t>
6374void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6375 mj_migration_part_proc_assignment(
6376 mj_gno_t * num_points_in_all_processor_parts,
6377 mj_part_t num_parts,
6378 mj_part_t num_procs,
6379 mj_lno_t *send_count_to_each_proc,
6380 std::vector<mj_part_t> &processor_ranks_for_subcomm,
6381 std::vector<mj_part_t> *next_future_num_parts_in_parts,
6382 mj_part_t &out_num_part,
6383 std::vector<mj_part_t> &out_part_indices,
6384 mj_part_t &output_part_numbering_begin_index,
6385 int *coordinate_destinations)
6386{
6387 processor_ranks_for_subcomm.clear();
6388 // if(this->num_local_coords > 0)
6389 if(num_procs > num_parts) {
6390 // if there are more processors than the number of current part
6391 // then processors share the existing parts.
6392 // at the end each processor will have a single part,
6393 // but a part will be shared by a group of processors.
6394 mj_part_t out_part_index = 0;
6395
6396 this->mj_assign_proc_to_parts(
6397 num_points_in_all_processor_parts,
6398 num_parts,
6399 num_procs,
6400 send_count_to_each_proc,
6401 processor_ranks_for_subcomm,
6402 next_future_num_parts_in_parts,
6403 out_part_index,
6404 output_part_numbering_begin_index,
6405 coordinate_destinations
6406 );
6407
6408 out_num_part = 1;
6409 out_part_indices.clear();
6410 out_part_indices.push_back(out_part_index);
6411 }
6412 else {
6413
6414 // there are more parts than the processors.
6415 // therefore a processor will be assigned multiple parts,
6416 // the subcommunicators will only have a single processor.
6417 processor_ranks_for_subcomm.push_back(this->myRank);
6418
6419 // since there are more parts then procs,
6420 // assign multiple parts to processors.
6421
6422 this->mj_assign_parts_to_procs(
6423 num_points_in_all_processor_parts,
6424 num_parts,
6425 num_procs,
6426 send_count_to_each_proc,
6427 next_future_num_parts_in_parts,
6428 out_num_part,
6429 out_part_indices,
6430 output_part_numbering_begin_index,
6431 coordinate_destinations);
6432 }
6433}
6434
6448template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6449 typename mj_part_t, typename mj_node_t>
6450void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6451 mj_migrate_coords(
6452 mj_part_t num_procs,
6453 mj_lno_t &num_new_local_points,
6454 std::string iteration,
6455 int *coordinate_destinations,
6456 mj_part_t num_parts)
6457{
6458
6459#ifdef ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
6460 if(sizeof(mj_lno_t) <= sizeof(int)) {
6461 // Cannot use Zoltan_Comm with local ordinals larger than ints.
6462 // In Zoltan_Comm_Create, the cast int(this->num_local_coords)
6463 // may overflow.
6464 ZOLTAN_COMM_OBJ *plan = NULL;
6465 MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->comm));
6466 int num_incoming_gnos = 0;
6467 int message_tag = 7859;
6468
6469 this->mj_env->timerStart(MACRO_TIMERS,
6470 mj_timer_base_string + "Migration Z1PlanCreating-" + iteration);
6471 int ierr = Zoltan_Comm_Create(
6472 &plan,
6473 int(this->num_local_coords),
6474 coordinate_destinations,
6475 mpi_comm,
6476 message_tag,
6477 &num_incoming_gnos);
6478
6479 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6480 this->mj_env->timerStop(MACRO_TIMERS,
6481 mj_timer_base_string + "Migration Z1PlanCreating-" + iteration);
6482
6483 this->mj_env->timerStart(MACRO_TIMERS,
6484 mj_timer_base_string + "Migration Z1Migration-" + iteration);
6485
6486 // MPI Buffers should be on Kokkos::HostSpace not Kokkos::CudaUVMSpace
6487 // Note, with UVM space, create_mirror_view does NOT create a non-UVM
6488 // view; need the explicit Host creation and deep_copy.
6489
6490 // migrate gnos.
6491 {
6492 auto host_current_mj_gnos = Kokkos::create_mirror_view(
6493 Kokkos::HostSpace(), this->current_mj_gnos);
6494 Kokkos::deep_copy(host_current_mj_gnos, this->current_mj_gnos);
6495 Kokkos::View<mj_gno_t*, device_t> dst_gnos(
6496 Kokkos::ViewAllocateWithoutInitializing("dst_gnos"), num_incoming_gnos);
6497 auto host_dst_gnos = Kokkos::create_mirror_view(
6498 Kokkos::HostSpace(), dst_gnos);
6499 message_tag++;
6500 ierr = Zoltan_Comm_Do(
6501 plan,
6502 message_tag,
6503 (char *) host_current_mj_gnos.data(),
6504 sizeof(mj_gno_t),
6505 (char *) host_dst_gnos.data());
6506 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6507 Kokkos::deep_copy(dst_gnos, host_dst_gnos);
6508 this->current_mj_gnos = dst_gnos;
6509 }
6510
6511 //migrate coordinates
6512 {
6513 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
6514 auto host_src_coordinates = Kokkos::create_mirror_view(
6515 Kokkos::HostSpace(), this->mj_coordinates);
6516 Kokkos::deep_copy(host_src_coordinates, this->mj_coordinates);
6517 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
6518 dst_coordinates(Kokkos::ViewAllocateWithoutInitializing("mj_coordinates"),
6519 num_incoming_gnos, this->coord_dim);
6520 auto host_dst_coordinates = Kokkos::create_mirror_view(
6521 Kokkos::HostSpace(), dst_coordinates);
6522 for(int i = 0; i < this->coord_dim; ++i) {
6523 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sub_host_src_coordinates
6524 = Kokkos::subview(host_src_coordinates, Kokkos::ALL, i);
6525 Kokkos::View<mj_scalar_t *, Kokkos::HostSpace> sub_host_dst_coordinates
6526 = Kokkos::subview(host_dst_coordinates, Kokkos::ALL, i);
6527 // Note Layout Left means we can do these in contiguous blocks
6528 message_tag++;
6529 ierr = Zoltan_Comm_Do(
6530 plan,
6531 message_tag,
6532 (char *) sub_host_src_coordinates.data(),
6533 sizeof(mj_scalar_t),
6534 (char *) sub_host_dst_coordinates.data());
6535 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6536 }
6537 deep_copy(dst_coordinates, host_dst_coordinates);
6538 this->mj_coordinates = dst_coordinates;
6539 }
6540
6541 // migrate weights.
6542 {
6543 auto host_src_weights = Kokkos::create_mirror_view(
6544 Kokkos::HostSpace(), this->mj_weights);
6545 Kokkos::deep_copy(host_src_weights, this->mj_weights);
6546 Kokkos::View<mj_scalar_t**, device_t> dst_weights(
6547 Kokkos::ViewAllocateWithoutInitializing("mj_weights"),
6548 num_incoming_gnos, this->num_weights_per_coord);
6549 auto host_dst_weights = Kokkos::create_mirror_view(dst_weights);
6550 for(int i = 0; i < this->num_weights_per_coord; ++i) {
6551 auto sub_host_src_weights
6552 = Kokkos::subview(host_src_weights, Kokkos::ALL, i);
6553 auto sub_host_dst_weights
6554 = Kokkos::subview(host_dst_weights, Kokkos::ALL, i);
6555 ArrayRCP<mj_scalar_t> sent_weight(this->num_local_coords);
6556 // Copy because of layout
6557 for(mj_lno_t n = 0; n < this->num_local_coords; ++n) {
6558 sent_weight[n] = sub_host_src_weights(n);
6559 }
6560 ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos);
6561 message_tag++;
6562 ierr = Zoltan_Comm_Do(
6563 plan,
6564 message_tag,
6565 (char *) sent_weight.getRawPtr(),
6566 sizeof(mj_scalar_t),
6567 (char *) received_weight.getRawPtr());
6568 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6569 // Again we copy by index due to layout
6570 for(mj_lno_t n = 0; n < num_incoming_gnos; ++n) {
6571 sub_host_dst_weights(n) = received_weight[n];
6572 }
6573 }
6574 deep_copy(dst_weights, host_dst_weights);
6575 this->mj_weights = dst_weights;
6576 }
6577
6578 // migrate owners.
6579 {
6580 // Note that owners we kept on Serial
6581 Kokkos::View<int *, Kokkos::HostSpace> dst_owners_of_coordinate(
6582 Kokkos::ViewAllocateWithoutInitializing("owner_of_coordinate"),
6583 num_incoming_gnos);
6584 message_tag++;
6585 ierr = Zoltan_Comm_Do(
6586 plan,
6587 message_tag,
6588 (char *) owner_of_coordinate.data(),
6589 sizeof(int),
6590 (char *) dst_owners_of_coordinate.data());
6591 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6592 this->owner_of_coordinate = dst_owners_of_coordinate;
6593 }
6594
6595 // if num procs is less than num parts,
6596 // we need the part assigment arrays as well, since
6597 // there will be multiple parts in processor.
6598 {
6599 auto host_src_assigned_part_ids = Kokkos::create_mirror_view(
6600 Kokkos::HostSpace(), this->assigned_part_ids);
6601 Kokkos::deep_copy(host_src_assigned_part_ids, this->assigned_part_ids);
6602 Kokkos::View<int *, device_t> dst_assigned_part_ids(
6603 Kokkos::ViewAllocateWithoutInitializing("assigned_part_ids"),
6604 num_incoming_gnos);
6605 auto host_dst_assigned_part_ids = Kokkos::create_mirror_view(
6606 Kokkos::HostSpace(), dst_assigned_part_ids);
6607 mj_part_t *new_parts = new mj_part_t[num_incoming_gnos];
6608 if(num_procs < num_parts) {
6609 message_tag++;
6610 ierr = Zoltan_Comm_Do(
6611 plan,
6612 message_tag,
6613 (char *) host_src_assigned_part_ids.data(),
6614 sizeof(mj_part_t),
6615 (char *) host_dst_assigned_part_ids.data());
6616 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6617 Kokkos::deep_copy(dst_assigned_part_ids, host_dst_assigned_part_ids);
6618 }
6619 // In original code this would just assign to an uninitialized array
6620 // if num_procs < num_parts. We're doing the same here.
6621 this->assigned_part_ids = dst_assigned_part_ids;
6622 }
6623
6624 ierr = Zoltan_Comm_Destroy(&plan);
6625 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
6626 num_new_local_points = num_incoming_gnos;
6627 this->mj_env->timerStop(MACRO_TIMERS,
6628 mj_timer_base_string + "Migration Z1Migration-" + iteration);
6629 }
6630 else
6631#endif // ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
6632 {
6633 this->mj_env->timerStart(MACRO_TIMERS, mj_timer_base_string +
6634 "Migration DistributorPlanCreating-" + iteration);
6635
6636 Tpetra::Distributor distributor(this->comm);
6637 ArrayView<const mj_part_t> destinations( coordinate_destinations,
6638 this->num_local_coords);
6639 mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
6640 this->mj_env->timerStop(MACRO_TIMERS, mj_timer_base_string +
6641 "Migration DistributorPlanCreating-" + iteration);
6642 this->mj_env->timerStart(MACRO_TIMERS, mj_timer_base_string +
6643 "Migration DistributorMigration-" + iteration);
6644
6645 // note MPI buffers should all be on Kokkos::HostSpace and not
6646 // Kokkos::CudaUVMSpace.
6647 // Note, with UVM space, create_mirror_view does NOT create a non-UVM
6648 // view; need the explicit Host creation and deep_copy.
6649 // migrate gnos.
6650 {
6651 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> received_gnos(
6652 Kokkos::ViewAllocateWithoutInitializing("received_gnos"),
6653 num_incoming_gnos);
6654
6655 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> sent_gnos(
6656 Kokkos::ViewAllocateWithoutInitializing("sent_gnos"),
6657 this->current_mj_gnos.extent(0));
6658 Kokkos::deep_copy(sent_gnos, this->current_mj_gnos);
6659
6660 distributor.doPostsAndWaits(sent_gnos, 1, received_gnos);
6661
6662 this->current_mj_gnos = Kokkos::View<mj_gno_t*, device_t>(
6663 Kokkos::ViewAllocateWithoutInitializing("gids"), num_incoming_gnos);
6664
6665 Kokkos::deep_copy(this->current_mj_gnos, received_gnos);
6666 }
6667
6668 // migrate coordinates
6669 // coordinates in MJ are LayoutLeft since Tpetra Multivector is LayoutLeft
6670 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
6671 dst_coordinates("mj_coordinates", num_incoming_gnos, this->coord_dim);
6672
6673 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, Kokkos::HostSpace>
6674 host_src_coordinates(
6675 Kokkos::ViewAllocateWithoutInitializing("host_coords"),
6676 this->mj_coordinates.extent(0), this->mj_coordinates.extent(1));
6677 Kokkos::deep_copy(host_src_coordinates, this->mj_coordinates);
6678
6679 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> received_coord(
6680 Kokkos::ViewAllocateWithoutInitializing("received_coord"),
6681 num_incoming_gnos);
6682
6683 for(int i = 0; i < this->coord_dim; ++i) {
6684
6685 // Note Layout Left means we can do these in contiguous blocks
6686
6687 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sent_coord
6688 = Kokkos::subview(host_src_coordinates, Kokkos::ALL, i);
6689
6690 distributor.doPostsAndWaits(sent_coord, 1, received_coord);
6691
6692 Kokkos::deep_copy(Kokkos::subview(dst_coordinates, Kokkos::ALL, i),
6693 received_coord);
6694
6695 // Kokkos::deep_copy will fence, I think, so it should be safe
6696 // to reuse received_coord in the next loop iteration
6697 }
6698 this->mj_coordinates = dst_coordinates;
6699
6700 // migrate weights.
6701 Kokkos::View<mj_scalar_t**, device_t> dst_weights(
6702 "mj_weights", num_incoming_gnos, this->num_weights_per_coord);
6703 auto host_dst_weights = Kokkos::create_mirror_view(Kokkos::HostSpace(),
6704 dst_weights);
6705
6706 auto host_src_weights = Kokkos::create_mirror_view_and_copy(
6707 Kokkos::HostSpace(), this->mj_weights);
6708
6709 // contiguous buffers to gather potentially strided data
6710 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sent_weight(
6711 Kokkos::ViewAllocateWithoutInitializing("send_weight_buffer"),
6712 this->num_local_coords);
6713
6714 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> received_weight(
6715 Kokkos::ViewAllocateWithoutInitializing("received_weight_buffer"),
6716 num_incoming_gnos);
6717
6718 for(int i = 0; i < this->num_weights_per_coord; ++i) {
6719
6720 auto sub_host_src_weights
6721 = Kokkos::subview(host_src_weights, Kokkos::ALL, i);
6722
6723 auto sub_host_dst_weights
6724 = Kokkos::subview(host_dst_weights, Kokkos::ALL, i);
6725
6726
6727 // Layout Right means the weights are not contiguous
6728 // However we don't have any systems setup with more than 1 weight so
6729 // really I have not tested any of this code with num weights > 1.
6730 // I think this is the right thing to do.
6731 for(mj_lno_t n = 0; n < this->num_local_coords; ++n) {
6732 sent_weight[n] = sub_host_src_weights(n);
6733 }
6734
6735 distributor.doPostsAndWaits(sent_weight, 1, received_weight);
6736
6737 // Again we copy by index due to layout
6738 for(mj_lno_t n = 0; n < num_incoming_gnos; ++n) {
6739 sub_host_dst_weights(n) = received_weight[n];
6740 }
6741 }
6742 Kokkos::deep_copy(dst_weights, host_dst_weights);
6743 this->mj_weights = dst_weights;
6744
6745 // migrate owners
6746 {
6747 // Note owners we kept on Serial
6748 Kokkos::View<int *, Kokkos::HostSpace> received_owners(
6749 Kokkos::ViewAllocateWithoutInitializing("owner_of_coordinate"),
6750 num_incoming_gnos);
6751
6752 distributor.doPostsAndWaits(owner_of_coordinate, 1, received_owners);
6753
6754 this->owner_of_coordinate = received_owners;
6755 }
6756
6757 // if num procs is less than num parts,
6758 // we need the part assignment arrays as well, since
6759 // there will be multiple parts in processor.
6760 if(num_procs < num_parts) {
6761 Kokkos::View<mj_part_t*, Kokkos::HostSpace> sent_partids(
6762 Kokkos::ViewAllocateWithoutInitializing("host_parts"),
6763 this->assigned_part_ids.extent(0));
6764 Kokkos::deep_copy(sent_partids, assigned_part_ids);
6765
6766 Kokkos::View<mj_part_t*, Kokkos::HostSpace> received_partids(
6767 Kokkos::ViewAllocateWithoutInitializing("received_partids"),
6768 num_incoming_gnos);
6769
6770 distributor.doPostsAndWaits(sent_partids, 1, received_partids);
6771
6772 this->assigned_part_ids = Kokkos::View<mj_part_t *, device_t>
6773 ("assigned_part_ids", num_incoming_gnos);
6774 Kokkos::deep_copy(this->assigned_part_ids, received_partids);
6775 }
6776 else {
6777 this->assigned_part_ids = Kokkos::View<mj_part_t *, device_t>
6778 ("assigned_part_ids", num_incoming_gnos);
6779 }
6780 this->mj_env->timerStop(MACRO_TIMERS, "" + mj_timer_base_string +
6781 "Migration DistributorMigration-" + iteration);
6782
6783 num_new_local_points = num_incoming_gnos;
6784 }
6785}
6786
6792template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6793 typename mj_part_t, typename mj_node_t>
6794void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6795 create_sub_communicator(std::vector<mj_part_t> &processor_ranks_for_subcomm)
6796{
6797 mj_part_t group_size = processor_ranks_for_subcomm.size();
6798 mj_part_t *ids = new mj_part_t[group_size];
6799 for(mj_part_t i = 0; i < group_size; ++i) {
6800 ids[i] = processor_ranks_for_subcomm[i];
6801 }
6802 ArrayView<const mj_part_t> idView(ids, group_size);
6803 this->comm = this->comm->createSubcommunicator(idView);
6804 delete [] ids;
6805}
6806
6812template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6813 typename mj_part_t, typename mj_node_t>
6814void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6815 fill_permutation_array(
6816 mj_part_t output_num_parts,
6817 mj_part_t num_parts)
6818{
6819 // if there is single output part, then simply fill the permutation array.
6820 if(output_num_parts == 1) {
6821 auto local_new_coordinate_permutations = this->new_coordinate_permutations;
6822 Kokkos::parallel_for(
6823 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_lno_t>
6824 (0, this->num_local_coords),
6825 KOKKOS_LAMBDA(mj_lno_t i) {
6826 local_new_coordinate_permutations(i) = i;
6827 });
6828 auto local_new_part_xadj = this->new_part_xadj;
6829 auto local_num_local_coords = this->num_local_coords;
6830 Kokkos::parallel_for(
6831 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0,1),
6832 KOKKOS_LAMBDA(int dummy) {
6833 local_new_part_xadj(0) = local_num_local_coords;
6834 });
6835 }
6836 else {
6837 auto local_num_local_coords = this->num_local_coords;
6838 auto local_assigned_part_ids = this->assigned_part_ids;
6839 auto local_new_part_xadj = this->new_part_xadj;
6840 auto local_new_coordinate_permutations = this->new_coordinate_permutations;
6841
6842 // part shift holds the which part number an old part number corresponds to.
6843 Kokkos::View<mj_part_t*, device_t> part_shifts("part_shifts", num_parts);
6844
6845 // otherwise we need to count how many points are there in each part.
6846 // we allocate here as num_parts, because the sent partids are up to
6847 // num_parts, although there are outout_num_parts different part.
6848 Kokkos::View<mj_lno_t*, device_t> num_points_in_parts(
6849 "num_points_in_parts", num_parts);
6850
6851 Kokkos::parallel_for(
6852 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0,1),
6853 KOKKOS_LAMBDA(int dummy) {
6854
6855 for(mj_lno_t i = 0; i < local_num_local_coords; ++i) {
6856 mj_part_t ii = local_assigned_part_ids(i);
6857 ++num_points_in_parts(ii);
6858 }
6859
6860 // write the end points of the parts.
6861 mj_part_t p = 0;
6862 mj_lno_t prev_index = 0;
6863 for(mj_part_t i = 0; i < num_parts; ++i) {
6864 if(num_points_in_parts(i) > 0) {
6865 local_new_part_xadj(p) = prev_index + num_points_in_parts(i);
6866 prev_index += num_points_in_parts(i);
6867 part_shifts(i) = p++;
6868 }
6869 }
6870
6871 // for the rest of the parts write the end index as end point.
6872 mj_part_t assigned_num_parts = p - 1;
6873 for(;p < num_parts; ++p) {
6874 local_new_part_xadj(p) =
6875 local_new_part_xadj(assigned_num_parts);
6876 }
6877 for(mj_part_t i = 0; i < output_num_parts; ++i) {
6878 num_points_in_parts(i) = local_new_part_xadj(i);
6879 }
6880
6881 // write the permutation array here.
6882 // get the part of the coordinate i, shift it to obtain the new part number.
6883 // assign it to the end of the new part numbers pointer.
6884 for(mj_lno_t i = local_num_local_coords - 1; i >= 0; --i) {
6885 mj_part_t part =
6886 part_shifts[mj_part_t(local_assigned_part_ids(i))];
6887 local_new_coordinate_permutations(--num_points_in_parts[part]) = i;
6888 }
6889 });
6890 }
6891}
6892
6917template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
6918 typename mj_part_t, typename mj_node_t>
6919bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
6920 mj_perform_migration(
6921 mj_part_t input_num_parts,
6922 mj_part_t &output_num_parts,
6923 std::vector<mj_part_t> *next_future_num_parts_in_parts,
6924 mj_part_t &output_part_begin_index,
6925 size_t migration_reduce_all_population,
6926 mj_lno_t num_coords_for_last_dim_part,
6927 std::string iteration,
6928 RCP<mj_partBoxVector_t> &input_part_boxes,
6929 RCP<mj_partBoxVector_t> &output_part_boxes)
6930{
6931 mj_part_t num_procs = this->comm->getSize();
6932 this->myRank = this->comm->getRank();
6933
6934 // this array holds how many points each processor has in each part.
6935 // to access how many points processor i has on part j,
6936 // num_points_in_all_processor_parts[i * num_parts + j]
6937 mj_gno_t *num_points_in_all_processor_parts =
6938 new mj_gno_t[input_num_parts * (num_procs + 1)];
6939
6940 // get the number of coordinates in each part in each processor.
6941 this->get_processor_num_points_in_parts(
6942 num_procs,
6943 input_num_parts,
6944 num_points_in_all_processor_parts);
6945
6946 // check if migration will be performed or not.
6947 if(!this->mj_check_to_migrate(
6948 migration_reduce_all_population,
6949 num_coords_for_last_dim_part,
6950 num_procs,
6951 input_num_parts,
6952 num_points_in_all_processor_parts)) {
6953 delete [] num_points_in_all_processor_parts;
6954 return false;
6955 }
6956
6957 mj_lno_t *send_count_to_each_proc = NULL;
6958 int *coordinate_destinations = new int[this->num_local_coords];
6959 send_count_to_each_proc = new mj_lno_t[num_procs];
6960
6961 for(int i = 0; i < num_procs; ++i) {
6962 send_count_to_each_proc[i] = 0;
6963 }
6964
6965 std::vector<mj_part_t> processor_ranks_for_subcomm;
6966 std::vector<mj_part_t> out_part_indices;
6967
6968 // determine which processors are assigned to which parts
6969 this->mj_migration_part_proc_assignment(
6970 num_points_in_all_processor_parts,
6971 input_num_parts,
6972 num_procs,
6973 send_count_to_each_proc,
6974 processor_ranks_for_subcomm,
6975 next_future_num_parts_in_parts,
6976 output_num_parts,
6977 out_part_indices,
6978 output_part_begin_index,
6979 coordinate_destinations);
6980
6981 delete [] send_count_to_each_proc;
6982 std::vector <mj_part_t> tmpv;
6983
6984 std::sort (out_part_indices.begin(), out_part_indices.end());
6985 mj_part_t outP = out_part_indices.size();
6986 mj_gno_t new_global_num_points = 0;
6987 mj_gno_t *global_num_points_in_parts =
6988 num_points_in_all_processor_parts + num_procs * input_num_parts;
6989
6990 if(this->mj_keep_part_boxes) {
6991 input_part_boxes->clear();
6992 }
6993
6994 // now we calculate the new values for next_future_num_parts_in_parts.
6995 // same for the part boxes.
6996 for(mj_part_t i = 0; i < outP; ++i) {
6997 mj_part_t ind = out_part_indices[i];
6998 new_global_num_points += global_num_points_in_parts[ind];
6999 tmpv.push_back((*next_future_num_parts_in_parts)[ind]);
7000 if(this->mj_keep_part_boxes) {
7001 input_part_boxes->push_back((*output_part_boxes)[ind]);
7002 }
7003 }
7004
7005 // swap the input and output part boxes.
7006 if(this->mj_keep_part_boxes) {
7007 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
7008 input_part_boxes = output_part_boxes;
7009 output_part_boxes = tmpPartBoxes;
7010 }
7011 next_future_num_parts_in_parts->clear();
7012 for(mj_part_t i = 0; i < outP; ++i) {
7013 mj_part_t p = tmpv[i];
7014 next_future_num_parts_in_parts->push_back(p);
7015 }
7016
7017 delete [] num_points_in_all_processor_parts;
7018
7019 mj_lno_t num_new_local_points = 0;
7020 //perform the actual migration operation here.
7021 this->mj_migrate_coords(
7022 num_procs,
7023 num_new_local_points,
7024 iteration,
7025 coordinate_destinations,
7026 input_num_parts);
7027
7028 delete [] coordinate_destinations;
7029 if(this->num_local_coords != num_new_local_points) {
7030 this->new_coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>
7031 (Kokkos::ViewAllocateWithoutInitializing("new_coordinate_permutations"),
7032 num_new_local_points);
7033 this->coordinate_permutations = Kokkos::View<mj_lno_t*, device_t>
7034 (Kokkos::ViewAllocateWithoutInitializing("coordinate_permutations"),
7035 num_new_local_points);
7036 }
7037 this->num_local_coords = num_new_local_points;
7038 this->num_global_coords = new_global_num_points;
7039
7040 // create subcommunicator.
7041 this->create_sub_communicator(processor_ranks_for_subcomm);
7042
7043 processor_ranks_for_subcomm.clear();
7044
7045 // fill the new permutation arrays.
7046 this->fill_permutation_array(output_num_parts, input_num_parts);
7047
7048 return true;
7049}
7050
7069template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
7070 typename mj_part_t, typename mj_node_t>
7071void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
7072 create_consistent_chunks(
7073 mj_part_t num_parts,
7074 Kokkos::View<mj_scalar_t *, device_t> & mj_current_dim_coords,
7075 Kokkos::View<mj_scalar_t *, device_t> & current_concurrent_cut_coordinate,
7076 mj_lno_t coordinate_begin,
7077 mj_lno_t coordinate_end,
7078 Kokkos::View<mj_scalar_t *, device_t> & used_local_cut_line_weight_to_left,
7079 Kokkos::View<mj_lno_t *, device_t> & out_part_xadj,
7080 int coordInd,
7081 bool longest_dim_part,
7082 uSignedSortItem<int, mj_scalar_t, char> * p_coord_dimension_range_sorted)
7083{
7084 // Note that this method is only used by task mapper
7085 // All code in this file has been verified to run with UVM off by running
7086 // mj tests and task mapper tests with UVM off. However for this particular
7087 // method I did not do much for UVM off. I heavily use device to host copies
7088 // and more or less preserve the original logic. Due to the handling of
7089 // arrays it will be a bit of work to convert this to as better form.
7090 // Since it's only relevant to task mapper and I wasn't sure how much priority
7091 // to give it, I put that on hold until further discussion.
7092 mj_part_t no_cuts = num_parts - 1;
7093
7094 // now if the rectilinear partitioning is allowed we decide how
7095 // much weight each thread should put to left and right.
7096 if(this->distribute_points_on_cut_lines) {
7097 auto local_thread_cut_line_weight_to_put_left =
7098 this->thread_cut_line_weight_to_put_left;
7099 auto local_thread_part_weight_work =
7100 this->thread_part_weight_work;
7101 auto local_sEpsilon = this->sEpsilon;
7102
7103 Kokkos::parallel_for(
7104 Kokkos::RangePolicy<typename mj_node_t::execution_space,
7105 mj_part_t> (0, no_cuts), KOKKOS_LAMBDA (mj_part_t i) {
7106 // the left to be put on the left of the cut.
7107 mj_scalar_t left_weight = used_local_cut_line_weight_to_left(i);
7108 if(left_weight > local_sEpsilon) {
7109 // the weight of thread ii on cut.
7110 mj_scalar_t thread_ii_weight_on_cut =
7111 local_thread_part_weight_work(i * 2 + 1) -
7112 local_thread_part_weight_work(i * 2);
7113 if(thread_ii_weight_on_cut < left_weight) {
7114 local_thread_cut_line_weight_to_put_left(i) =
7115 thread_ii_weight_on_cut;
7116 }
7117 else {
7118 local_thread_cut_line_weight_to_put_left(i) = left_weight;
7119 }
7120 }
7121 else {
7122 local_thread_cut_line_weight_to_put_left(i) = 0;
7123 }
7124 });
7125
7126 if(no_cuts > 0) {
7127 auto local_least_signifiance = least_signifiance;
7128 auto local_significance_mul = significance_mul;
7129 Kokkos::parallel_for(
7130 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
7131 (0, 1), KOKKOS_LAMBDA (int dummy) {
7132 // this is a special case. If cutlines share the same coordinate,
7133 // their weights are equal.
7134 // we need to adjust the ratio for that.
7135 for(mj_part_t i = no_cuts - 1; i > 0 ; --i) {
7136 mj_scalar_t cut1 = current_concurrent_cut_coordinate(i-1);
7137 mj_scalar_t cut2 = current_concurrent_cut_coordinate(i);
7138 mj_scalar_t delta = cut2 - cut1;
7139 mj_scalar_t abs_delta = (delta > 0) ? delta : -delta;
7140 if(abs_delta < local_sEpsilon) {
7141 local_thread_cut_line_weight_to_put_left(i) -=
7142 local_thread_cut_line_weight_to_put_left(i - 1);
7143 }
7144 local_thread_cut_line_weight_to_put_left(i) =
7145 static_cast<long long>((local_thread_cut_line_weight_to_put_left(i) +
7146 local_least_signifiance) * local_significance_mul) /
7147 static_cast<mj_scalar_t>(local_significance_mul);
7148 }
7149 });
7150 }
7151 }
7152
7153 auto local_thread_point_counts = this->thread_point_counts;
7154 Kokkos::parallel_for(
7155 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
7156 (0, num_parts), KOKKOS_LAMBDA (mj_part_t i) {
7157 local_thread_point_counts(i) = 0;
7158 });
7159
7160 // for this specific case we dont want to distribute the points along the
7161 // cut position randomly, as we need a specific ordering of them. Instead,
7162 // we put the coordinates into a sort item, where we sort those
7163 // using the coordinates of points on other dimensions and the index.
7164
7165 // some of the cuts might share the same position.
7166 // in this case, if cut i and cut j share the same position
7167 // cut_map[i] = cut_map[j] = sort item index.
7168 mj_part_t *cut_map = new mj_part_t[no_cuts];
7169
7170 typedef uMultiSortItem<mj_lno_t, int, mj_scalar_t> multiSItem;
7171 typedef std::vector< multiSItem > multiSVector;
7172 typedef std::vector<multiSVector> multiS2Vector;
7173
7174 // to keep track of the memory allocated.
7175 std::vector<mj_scalar_t *>allocated_memory;
7176
7177 // vector for which the coordinates will be sorted.
7178 multiS2Vector sort_vector_points_on_cut;
7179
7180 // the number of cuts that have different coordinates.
7181 mj_part_t different_cut_count = 1;
7182 cut_map[0] = 0;
7183
7184 // now we insert 1 sort vector for all cuts on the different
7185 // positins.if multiple cuts are on the same position,
7186 // they share sort vectors.
7187 multiSVector tmpMultiSVector;
7188 sort_vector_points_on_cut.push_back(tmpMultiSVector);
7189
7190 auto local_current_concurrent_cut_coordinate =
7191 current_concurrent_cut_coordinate;
7192 auto host_current_concurrent_cut_coordinate =
7193 Kokkos::create_mirror_view(local_current_concurrent_cut_coordinate);
7194 Kokkos::deep_copy(host_current_concurrent_cut_coordinate,
7195 local_current_concurrent_cut_coordinate);
7196
7197 for(mj_part_t i = 1; i < no_cuts ; ++i) {
7198 // if cuts share the same cut coordinates
7199 // set the cutmap accordingly.
7200 if(std::abs(host_current_concurrent_cut_coordinate(i) -
7201 host_current_concurrent_cut_coordinate(i-1)) < this->sEpsilon) {
7202 cut_map[i] = cut_map[i-1];
7203 }
7204 else {
7205 cut_map[i] = different_cut_count++;
7206 multiSVector tmp2MultiSVector;
7207 sort_vector_points_on_cut.push_back(tmp2MultiSVector);
7208 }
7209 }
7210 Kokkos::deep_copy(current_concurrent_cut_coordinate,
7211 host_current_concurrent_cut_coordinate);
7212
7213 // now the actual part assigment.
7214 auto host_coordinate_permutations =
7215 Kokkos::create_mirror_view(coordinate_permutations);
7216 Kokkos::deep_copy(host_coordinate_permutations, coordinate_permutations);
7217
7218 auto host_assigned_part_ids = Kokkos::create_mirror_view(assigned_part_ids);
7219 Kokkos::deep_copy(host_assigned_part_ids, assigned_part_ids);
7220
7221 auto host_mj_coordinates = Kokkos::create_mirror_view(mj_coordinates);
7222 Kokkos::deep_copy(host_mj_coordinates, mj_coordinates);
7223
7224 auto host_thread_point_counts = Kokkos::create_mirror_view(thread_point_counts);
7225 Kokkos::deep_copy(host_thread_point_counts, thread_point_counts);
7226
7227 auto local_coord_dim = this->coord_dim;
7228
7229 for(mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii) {
7230 mj_lno_t i = host_coordinate_permutations(ii);
7231 mj_part_t pp = host_assigned_part_ids(i);
7232 mj_part_t p = pp / 2;
7233 // if the coordinate is on a cut.
7234 if(pp % 2 == 1 ) {
7235 mj_scalar_t *vals = new mj_scalar_t[local_coord_dim -1];
7236 allocated_memory.push_back(vals);
7237
7238 // we insert the coordinates to the sort item here.
7239 int val_ind = 0;
7240
7241 if(longest_dim_part) {
7242 // std::cout << std::endl << std::endl;
7243 for(int dim = local_coord_dim - 2; dim >= 0; --dim) {
7244 // uSignedSortItem<int, mj_scalar_t, char>
7245 // *p_coord_dimension_range_sorted
7246 int next_largest_coord_dim = p_coord_dimension_range_sorted[dim].id;
7247 // std::cout << "next_largest_coord_dim: " <<
7248 // next_largest_coord_dim << " ";
7249 // Note refactor in progress
7250 vals[val_ind++] =
7251 host_mj_coordinates(i,next_largest_coord_dim);
7252 }
7253 }
7254 else {
7255 for(int dim = coordInd + 1; dim < local_coord_dim; ++dim) {
7256 vals[val_ind++] = host_mj_coordinates(i,dim);
7257 }
7258 for(int dim = 0; dim < coordInd; ++dim) {
7259 vals[val_ind++] = host_mj_coordinates(i,dim);
7260 }
7261 }
7262
7263 multiSItem tempSortItem(i, local_coord_dim -1, vals);
7264 //insert the point to the sort vector pointed by the cut_map[p].
7265 mj_part_t cmap = cut_map[p];
7266 sort_vector_points_on_cut[cmap].push_back(tempSortItem);
7267 }
7268 else {
7269 //if it is not on the cut, simple sorting.
7270 ++host_thread_point_counts(p);
7271 host_assigned_part_ids(i) = p;
7272 }
7273 }
7274
7275 // sort all the sort vectors.
7276 for(mj_part_t i = 0; i < different_cut_count; ++i) {
7277 std::sort (sort_vector_points_on_cut[i].begin(),
7278 sort_vector_points_on_cut[i].end());
7279 }
7280
7281 mj_part_t previous_cut_map = cut_map[0];
7282
7283 auto host_thread_cut_line_weight_to_put_left =
7284 Kokkos::create_mirror_view(thread_cut_line_weight_to_put_left);
7285 Kokkos::deep_copy(host_thread_cut_line_weight_to_put_left,
7286 thread_cut_line_weight_to_put_left);
7287
7288 auto host_mj_weights = Kokkos::create_mirror_view(mj_weights);
7289 Kokkos::deep_copy(host_mj_weights, mj_weights);
7290
7291 // this is how much previous part owns the weight of the current part.
7292 // when target part weight is 1.6, and the part on the left is given 2,
7293 // the left has an extra 0.4, while the right has missing 0.4 from the
7294 // previous cut.
7295 // This parameter is used to balance this issues.
7296 // in the above example weight_stolen_from_previous_part will be 0.4.
7297 // if the left part target is 2.2 but it is given 2,
7298 // then weight_stolen_from_previous_part will be -0.2.
7299 mj_scalar_t weight_stolen_from_previous_part = 0;
7300 for(mj_part_t p = 0; p < no_cuts; ++p) {
7301 mj_part_t mapped_cut = cut_map[p];
7302
7303 // if previous cut map is done, and it does not have the same index,
7304 // then assign all points left on that cut to its right.
7305 if(previous_cut_map != mapped_cut) {
7306 mj_lno_t sort_vector_end = (mj_lno_t)
7307 sort_vector_points_on_cut[previous_cut_map].size() - 1;
7308 for(; sort_vector_end >= 0; --sort_vector_end) {
7309 multiSItem t =
7310 sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
7311 mj_lno_t i = t.index;
7312 ++host_thread_point_counts(p);
7313 host_assigned_part_ids(i) = p;
7314 }
7315 sort_vector_points_on_cut[previous_cut_map].clear();
7316 }
7317
7318 // TODO: MD: I dont remember why I have it reverse order here.
7319 mj_lno_t sort_vector_end = (mj_lno_t)
7320 sort_vector_points_on_cut[mapped_cut].size() - 1;
7321 // mj_lno_t sort_vector_begin= 0;
7322 // mj_lno_t sort_vector_size =
7323 // (mj_lno_t)sort_vector_points_on_cut[mapped_cut].size();
7324
7325 // TODO commented for reverse order
7326 for(; sort_vector_end >= 0; --sort_vector_end) {
7327 // for(; sort_vector_begin < sort_vector_size; ++sort_vector_begin) {
7328 // TODO COMMENTED FOR REVERSE ORDER
7329 multiSItem t = sort_vector_points_on_cut[mapped_cut][sort_vector_end];
7330 //multiSItem t = sort_vector_points_on_cut[mapped_cut][sort_vector_begin];
7331 mj_lno_t i = t.index;
7332 mj_scalar_t w = this->mj_uniform_weights(0) ? 1 :
7333 this->mj_weights(i,0);
7334 // part p has enough space for point i, then put it to point i.
7335 if(host_thread_cut_line_weight_to_put_left(p) +
7336 weight_stolen_from_previous_part> this->sEpsilon &&
7337 host_thread_cut_line_weight_to_put_left(p) +
7338 weight_stolen_from_previous_part -
7339 std::abs(host_thread_cut_line_weight_to_put_left(p) +
7340 weight_stolen_from_previous_part - w)> this->sEpsilon)
7341 {
7342 host_thread_cut_line_weight_to_put_left(p) -= w;
7343
7344 sort_vector_points_on_cut[mapped_cut].pop_back();
7345
7346 ++host_thread_point_counts(p);
7347 host_assigned_part_ids(i) = p;
7348 // if putting this weight to left overweights the left cut, then
7349 // increase the space for the next cut using
7350 // weight_stolen_from_previous_part.
7351 if(p < no_cuts - 1 &&
7352 host_thread_cut_line_weight_to_put_left(p) < this->sEpsilon) {
7353 if(mapped_cut == cut_map[p + 1] ) {
7354 // if the cut before the cut indexed at p was also at the same
7355 // position special case, as we handle the weight differently here.
7356 if(previous_cut_map != mapped_cut) {
7357 weight_stolen_from_previous_part =
7358 host_thread_cut_line_weight_to_put_left(p);
7359 }
7360 else {
7361 // if the cut before the cut indexed at p was also at the same
7362 // position we assign extra weights cumulatively in this case.
7363 weight_stolen_from_previous_part +=
7364 host_thread_cut_line_weight_to_put_left(p);
7365 }
7366 }
7367 else{
7368 weight_stolen_from_previous_part =
7369 -host_thread_cut_line_weight_to_put_left(p);
7370 }
7371 // end assignment for part p
7372 break;
7373 }
7374 } else {
7375 // if part p does not have enough space for this point
7376 // and if there is another cut sharing the same positon,
7377 // again increase the space for the next
7378 if(p < no_cuts - 1 && mapped_cut == cut_map[p + 1]) {
7379 if(previous_cut_map != mapped_cut) {
7380 weight_stolen_from_previous_part =
7381 host_thread_cut_line_weight_to_put_left(p);
7382 }
7383 else {
7384 weight_stolen_from_previous_part +=
7385 host_thread_cut_line_weight_to_put_left(p);
7386 }
7387 }
7388 else{
7389 weight_stolen_from_previous_part =
7390 -host_thread_cut_line_weight_to_put_left(p);
7391 }
7392 // end assignment for part p
7393 break;
7394 }
7395 }
7396 previous_cut_map = mapped_cut;
7397 }
7398
7399 // TODO commented for reverse order
7400 // put everything left on the last cut to the last part.
7401 mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[
7402 previous_cut_map].size() - 1;
7403
7404 // mj_lno_t sort_vector_begin= 0;
7405 // mj_lno_t sort_vector_size = (mj_lno_t)
7406 // sort_vector_points_on_cut[previous_cut_map].size();
7407 // TODO commented for reverse order
7408 for(; sort_vector_end >= 0; --sort_vector_end) {
7409 // TODO commented for reverse order
7410 multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
7411 // multiSItem t =
7412 // sort_vector_points_on_cut[previous_cut_map][sort_vector_begin];
7413 mj_lno_t i = t.index;
7414 ++host_thread_point_counts(no_cuts);
7415 host_assigned_part_ids(i) = no_cuts;
7416 }
7417
7418 sort_vector_points_on_cut[previous_cut_map].clear();
7419 delete [] cut_map;
7420
7421 //free the memory allocated for vertex sort items .
7422 mj_lno_t vSize = (mj_lno_t) allocated_memory.size();
7423 for(mj_lno_t i = 0; i < vSize; ++i) {
7424 delete [] allocated_memory[i];
7425 }
7426
7427 auto local_out_part_xadj = out_part_xadj;
7428 auto host_out_part_xadj = Kokkos::create_mirror_view(local_out_part_xadj);
7429 Kokkos::deep_copy(host_out_part_xadj, out_part_xadj);
7430
7431 // creation of part_xadj as in usual case.
7432 for(mj_part_t j = 0; j < num_parts; ++j) {
7433 host_out_part_xadj(j) = host_thread_point_counts(j);
7434 host_thread_point_counts(j) = 0;
7435 }
7436
7437 // perform prefix sum for num_points in parts.
7438 for(mj_part_t j = 1; j < num_parts; ++j) {
7439 host_out_part_xadj(j) += host_out_part_xadj(j - 1);
7440 }
7441
7442 // shift the num points in threads thread to obtain the
7443 // beginning index of each thread's private space.
7444 for(mj_part_t j = 1; j < num_parts; ++j) {
7445 host_thread_point_counts(j) += host_out_part_xadj(j - 1);
7446 }
7447
7448 auto host_new_coordinate_permutations =
7449 Kokkos::create_mirror_view(new_coordinate_permutations);
7450 Kokkos::deep_copy(host_new_coordinate_permutations,
7451 new_coordinate_permutations);
7452
7453 // now thread gets the coordinate and writes the index of coordinate to
7454 // the permutation array using the part index we calculated.
7455 for(mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii) {
7456 mj_lno_t i = host_coordinate_permutations(ii);
7457 mj_part_t p = host_assigned_part_ids(i);
7458 host_new_coordinate_permutations(coordinate_begin +
7459 host_thread_point_counts(p)++) = i;
7460 }
7461
7462 Kokkos::deep_copy(thread_point_counts, host_thread_point_counts);
7463 Kokkos::deep_copy(new_coordinate_permutations,
7464 host_new_coordinate_permutations);
7465 Kokkos::deep_copy(local_out_part_xadj, host_out_part_xadj);
7466}
7467
7477template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
7478 typename mj_part_t, typename mj_node_t>
7479void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t, mj_node_t>::
7480 set_final_parts(
7481 mj_part_t current_num_parts,
7482 mj_part_t output_part_begin_index,
7483 RCP<mj_partBoxVector_t> &output_part_boxes,
7484 bool is_data_ever_migrated)
7485{
7486 this->mj_env->timerStart(MACRO_TIMERS,
7487 mj_timer_base_string + "Part_Assignment");
7488
7489 auto local_part_xadj = part_xadj;
7490 auto local_mj_keep_part_boxes = mj_keep_part_boxes;
7491 auto local_coordinate_permutations = coordinate_permutations;
7492 auto local_assigned_part_ids = assigned_part_ids;
7493
7494 if(local_mj_keep_part_boxes) {
7495 for(int i = 0; i < current_num_parts; ++i) {
7496 (*output_part_boxes)[i].setpId(i + output_part_begin_index);
7497 }
7498 }
7499
7500 Kokkos::TeamPolicy<typename mj_node_t::execution_space> policy(
7501 current_num_parts, Kokkos::AUTO());
7502 typedef typename Kokkos::TeamPolicy<typename mj_node_t::execution_space>::
7503 member_type member_type;
7504 Kokkos::parallel_for(policy, KOKKOS_LAMBDA(member_type team_member) {
7505 int i = team_member.league_rank();
7506 Kokkos::parallel_for(Kokkos::TeamThreadRange (team_member, (i != 0) ?
7507 local_part_xadj(i-1) : 0, local_part_xadj(i)),
7508 [=] (mj_lno_t ii) {
7509 mj_lno_t k = local_coordinate_permutations(ii);
7510 local_assigned_part_ids(k) = i + output_part_begin_index;
7511 });
7512 });
7513
7514 if(is_data_ever_migrated) {
7515#ifdef ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
7516 if(sizeof(mj_lno_t) <= sizeof(int)) {
7517
7518 // Cannot use Zoltan_Comm with local ordinals larger than ints.
7519 // In Zoltan_Comm_Create, the cast int(this->num_local_coords)
7520 // may overflow.
7521
7522 // if data is migrated, then send part numbers to the original owners.
7523 ZOLTAN_COMM_OBJ *plan = NULL;
7524 MPI_Comm mpi_comm = Teuchos::getRawMpiComm(*(this->mj_problemComm));
7525
7526 int incoming = 0;
7527 int message_tag = 7856;
7528
7529 this->mj_env->timerStart(MACRO_TIMERS,
7530 mj_timer_base_string + "Final Z1PlanCreating");
7531
7532 // setup incoming count
7533 int ierr = Zoltan_Comm_Create( &plan, int(this->num_local_coords),
7534 this->owner_of_coordinate.data(), mpi_comm, message_tag, &incoming);
7535
7536 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
7537 this->mj_env->timerStop(MACRO_TIMERS,
7538 mj_timer_base_string + "Final Z1PlanCreating" );
7539
7540 this->mj_env->timerStart(MACRO_TIMERS,
7541 mj_timer_base_string + "Final Z1PlanComm");
7542
7543 // MPI Buffers should be on Kokkos::HostSpace not Kokkos::CudaUVMSpace
7544 // Note, with UVM space, create_mirror_view does NOT create a non-UVM
7545 // view; need the explicit Host creation and deep_copy.
7546
7547 // migrate gnos to actual owners.
7548 auto host_current_mj_gnos = Kokkos::create_mirror_view(
7549 Kokkos::HostSpace(), this->current_mj_gnos);
7550 deep_copy(host_current_mj_gnos, this->current_mj_gnos);
7551 Kokkos::View<mj_gno_t*, device_t> dst_gnos(
7552 Kokkos::ViewAllocateWithoutInitializing("dst_gnos"), incoming);
7553 auto host_dst_gnos = Kokkos::create_mirror_view(
7554 Kokkos::HostSpace(), dst_gnos);
7555 message_tag++;
7556 ierr = Zoltan_Comm_Do( plan, message_tag,
7557 (char *) host_current_mj_gnos.data(),
7558 sizeof(mj_gno_t), (char *) host_dst_gnos.data());
7559 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
7560 Kokkos::deep_copy(dst_gnos, host_dst_gnos);
7561 this->current_mj_gnos = dst_gnos;
7562
7563 // migrate part ids to actual owners.
7564 auto host_src_part_ids = Kokkos::create_mirror_view(
7565 Kokkos::HostSpace(), this->assigned_part_ids);
7566 deep_copy(host_src_part_ids, this->assigned_part_ids);
7567 Kokkos::View<mj_part_t*, device_t> dst_part_ids(
7568 Kokkos::ViewAllocateWithoutInitializing("dst_part_ids"), incoming);
7569 auto host_dst_part_ids = Kokkos::create_mirror_view(
7570 Kokkos::HostSpace(), dst_part_ids);
7571 message_tag++;
7572 ierr = Zoltan_Comm_Do( plan, message_tag,
7573 (char *) host_src_part_ids.data(),
7574 sizeof(mj_part_t), (char *) host_dst_part_ids.data());
7575 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
7576 Kokkos::deep_copy(dst_part_ids, host_dst_part_ids);
7577 this->assigned_part_ids = dst_part_ids;
7578
7579 ierr = Zoltan_Comm_Destroy(&plan);
7580 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
7581
7582 this->num_local_coords = incoming;
7583
7584 this->mj_env->timerStop(MACRO_TIMERS,
7585 mj_timer_base_string + "Final Z1PlanComm");
7586 }
7587 else
7588#endif // ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION
7589 {
7590 // setup incoming count
7591 this->mj_env->timerStart(MACRO_TIMERS,
7592 mj_timer_base_string + "Final DistributorPlanCreating");
7593 Tpetra::Distributor distributor(this->mj_problemComm);
7594 ArrayView<const mj_part_t> owners_of_coords(
7595 this->owner_of_coordinate.data(), this->num_local_coords);
7596 mj_lno_t incoming = distributor.createFromSends(owners_of_coords);
7597 this->mj_env->timerStop(MACRO_TIMERS,
7598 mj_timer_base_string + "Final DistributorPlanCreating" );
7599
7600 this->mj_env->timerStart(MACRO_TIMERS,
7601 mj_timer_base_string + "Final DistributorPlanComm");
7602
7603 // migrate gnos to actual owners.
7604 // MPI buffers should be Kokkos::HostSpace, not Kokkos::CudaUVMSpace
7605 // Note, with UVM space, create_mirror_view does NOT create a non-UVM
7606 // view; need the explicit Host creation and deep_copy.
7607 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> sent_gnos(
7608 Kokkos::ViewAllocateWithoutInitializing("sent_gnos"),
7609 this->current_mj_gnos.extent(0));
7610 Kokkos::deep_copy(sent_gnos, this->current_mj_gnos);
7611
7612 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> received_gnos(
7613 Kokkos::ViewAllocateWithoutInitializing("received_gnos"),
7614 incoming);
7615
7616 distributor.doPostsAndWaits(sent_gnos, 1, received_gnos);
7617
7618 this->current_mj_gnos = Kokkos::View<mj_gno_t*, device_t>(
7619 Kokkos::ViewAllocateWithoutInitializing("current_mj_gnos"), incoming);
7620
7621 Kokkos::deep_copy(this->current_mj_gnos, received_gnos);
7622
7623 // migrate part ids to actual owners.
7624 Kokkos::View<mj_part_t *, Kokkos::HostSpace> sent_partids(
7625 Kokkos::ViewAllocateWithoutInitializing("sent_partids"),
7626 this->assigned_part_ids.extent(0));
7627 Kokkos::deep_copy(sent_partids, this->assigned_part_ids);
7628
7629 Kokkos::View<mj_part_t *, Kokkos::HostSpace> received_partids(
7630 Kokkos::ViewAllocateWithoutInitializing("received_partids"),
7631 incoming);
7632
7633 distributor.doPostsAndWaits(sent_partids, 1, received_partids);
7634
7635 this->assigned_part_ids =
7636 Kokkos::View<mj_part_t*, device_t>(
7637 Kokkos::ViewAllocateWithoutInitializing("assigned_part_ids"),
7638 incoming);
7639
7640 Kokkos::deep_copy(this->assigned_part_ids, received_partids);
7641 this->num_local_coords = incoming;
7642
7643 this->mj_env->timerStop(MACRO_TIMERS,
7644 mj_timer_base_string + "Final DistributorPlanComm");
7645 }
7646 }
7647
7648 this->mj_env->timerStop(MACRO_TIMERS,
7649 mj_timer_base_string + "Part_Assignment");
7650
7651 this->mj_env->timerStart(MACRO_TIMERS,
7652 mj_timer_base_string + "Solution_Part_Assignment");
7653
7654 // ArrayRCP<mj_part_t> partId;
7655 // partId = arcp(this->assigned_part_ids, 0, this->num_local_coords, true);
7656
7657 if(this->mj_keep_part_boxes) {
7658 this->kept_boxes = compute_global_box_boundaries(output_part_boxes);
7659 }
7660
7661 this->mj_env->timerStop(MACRO_TIMERS,
7662 mj_timer_base_string + "Solution_Part_Assignment");
7663}
7664
7677template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
7678 typename mj_part_t, typename mj_node_t>
7681 bool distribute_points_on_cut_lines_,
7682 int max_concurrent_part_calculation_,
7683 int check_migrate_avoid_migration_option_,
7684 double minimum_migration_imbalance_,
7685 int migration_type_)
7686{
7687 this->distribute_points_on_cut_lines = distribute_points_on_cut_lines_;
7688 this->max_concurrent_part_calculation = max_concurrent_part_calculation_;
7689 this->check_migrate_avoid_migration_option =
7690 check_migrate_avoid_migration_option_;
7691 this->minimum_migration_imbalance = minimum_migration_imbalance_;
7692 this->migration_type = migration_type_;
7693}
7694
7722template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
7723 typename mj_part_t, typename mj_node_t>
7726 const RCP<const Environment> &env,
7727 RCP<const Comm<int> > &problemComm,
7728 double imbalance_tolerance_,
7729 int num_teams_,
7730 size_t num_global_parts_,
7731 Kokkos::View<mj_part_t*, Kokkos::HostSpace> & part_no_array_,
7732 int recursion_depth_,
7733 int coord_dim_,
7734 mj_lno_t num_local_coords_,
7735 mj_gno_t num_global_coords_,
7736 Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos_,
7737 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
7738 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> & mj_coordinates_,
7739 int num_weights_per_coord_,
7740 Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_weights_,
7741 Kokkos::View<mj_scalar_t**, device_t> & mj_weights_,
7742 Kokkos::View<bool*, Kokkos::HostSpace> & mj_uniform_parts_,
7743 Kokkos::View<mj_part_t *, device_t> & result_assigned_part_ids_,
7744 Kokkos::View<mj_gno_t*, device_t> & result_mj_gnos_)
7745{
7746
7747 // see comment above for Zoltan2_AlgMJ_TrackCallsCounter
7748 int execute_counter = Zoltan2_AlgMJ_TrackCallsCounter::get_counter_AlgMJ();
7749 this->mj_timer_base_string = "MJ(" + std::to_string(execute_counter) + ") - ";
7750
7751 this->mj_env = env;
7752 this->mj_problemComm = problemComm;
7753 this->myActualRank = this->myRank = this->mj_problemComm->getRank();
7754 this->mj_env->timerStart(MACRO_TIMERS,
7755 mj_timer_base_string + "Total");
7756 this->mj_env->debug(3, "In MultiJagged Jagged");
7757 this->imbalance_tolerance = imbalance_tolerance_;
7758 this->mj_num_teams = num_teams_;
7759 this->num_global_parts = num_global_parts_;
7760 this->part_no_array = part_no_array_;
7761 this->recursion_depth = recursion_depth_;
7762 this->coord_dim = coord_dim_;
7763 this->num_local_coords = num_local_coords_;
7764 this->num_global_coords = num_global_coords_;
7765 this->mj_coordinates = mj_coordinates_;
7766 this->initial_mj_gnos = initial_mj_gnos_;
7767 this->num_weights_per_coord = num_weights_per_coord_;
7768 this->mj_uniform_weights = mj_uniform_weights_;
7769 this->mj_weights = mj_weights_;
7770 this->mj_uniform_parts = mj_uniform_parts_;
7771
7772 // this->set_input_data();
7773
7774 this->set_part_specifications();
7775
7776 this->mj_env->timerStart(MACRO_TIMERS,
7777 mj_timer_base_string + "Allocate Views");
7778 this->allocate_set_work_memory();
7779 this->mj_env->timerStop(MACRO_TIMERS,
7780 mj_timer_base_string + "Allocate Views");
7781
7782 // We duplicate the comm as we create subcommunicators during migration.
7783 // We keep the problemComm as it is, while comm changes after each migration.
7784 this->comm = this->mj_problemComm->duplicate();
7785
7786#ifdef print_debug
7787 if(comm->getRank() == 0) {
7788 std::cout << "size of gno:" << sizeof(mj_gno_t) << std::endl;
7789 std::cout << "size of lno:" << sizeof(mj_lno_t) << std::endl;
7790 std::cout << "size of mj_scalar_t:" << sizeof(mj_scalar_t) << std::endl;
7791 }
7792#endif
7793
7794 // initially there is a single partition
7795 mj_part_t current_num_parts = 1;
7796 Kokkos::View<mj_scalar_t *, device_t> current_cut_coordinates =
7797 this->all_cut_coordinates;
7798 this->mj_env->timerStart(MACRO_TIMERS,
7799 mj_timer_base_string + "Problem_Partitioning");
7800 mj_part_t output_part_begin_index = 0;
7801 mj_part_t future_num_parts = this->total_num_part;
7802 bool is_data_ever_migrated = false;
7803
7804 std::vector<mj_part_t> *future_num_part_in_parts =
7805 new std::vector<mj_part_t> ();
7806 std::vector<mj_part_t> *next_future_num_parts_in_parts =
7807 new std::vector<mj_part_t> ();
7808
7809 next_future_num_parts_in_parts->push_back(this->num_global_parts);
7810
7811 RCP<mj_partBoxVector_t> input_part_boxes;
7812 RCP<mj_partBoxVector_t> output_part_boxes;
7813
7814 if(this->mj_keep_part_boxes) {
7815 input_part_boxes = RCP<mj_partBoxVector_t>(new mj_partBoxVector_t(), true);
7816 output_part_boxes = RCP<mj_partBoxVector_t>(new mj_partBoxVector_t(), true);
7817 compute_global_box();
7818 this->init_part_boxes(output_part_boxes);
7819 }
7820
7821 auto local_part_xadj = this->part_xadj;
7822
7823 // Need a device counter - how best to allocate?
7824 // Putting this allocation in the loops is very costly so moved out here.
7825 Kokkos::View<mj_part_t*, device_t>
7826 view_rectilinear_cut_count("view_rectilinear_cut_count", 1);
7827 Kokkos::View<size_t*, device_t>
7828 view_total_reduction_size("view_total_reduction_size", 1);
7829
7830 for(int i = 0; i < this->recursion_depth; ++i) {
7831
7832 // convert i to string to be used for debugging purposes.
7833 std::string istring = std::to_string(i);
7834
7835 // next_future_num_parts_in_parts will be as the size of outnumParts,
7836 // and this will hold how many more parts that each output part
7837 // should be divided. this array will also be used to determine the weight
7838 // ratios of the parts. swap the arrays to use iteratively.
7839 std::vector<mj_part_t> *tmpPartVect= future_num_part_in_parts;
7840 future_num_part_in_parts = next_future_num_parts_in_parts;
7841 next_future_num_parts_in_parts = tmpPartVect;
7842
7843 // clear next_future_num_parts_in_parts array as
7844 // getPartitionArrays expects it to be empty.
7845 next_future_num_parts_in_parts->clear();
7846 if(this->mj_keep_part_boxes) {
7847 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
7848 input_part_boxes = output_part_boxes;
7849 output_part_boxes = tmpPartBoxes;
7850 output_part_boxes->clear();
7851 }
7852
7853 // returns the total no. of output parts for this dimension partitioning.
7854 mj_part_t output_part_count_in_dimension =
7855 this->update_part_num_arrays(
7856 future_num_part_in_parts,
7857 next_future_num_parts_in_parts,
7858 future_num_parts,
7859 current_num_parts,
7860 i,
7861 input_part_boxes,
7862 output_part_boxes, 1);
7863
7864 // if the number of obtained parts equal to current number of parts,
7865 // skip this dimension. For example, this happens when 1 is given in the
7866 // input part array is given. P=4,5,1,2
7867 if(output_part_count_in_dimension == current_num_parts) {
7868 //still need to swap the input output arrays.
7869 tmpPartVect= future_num_part_in_parts;
7870 future_num_part_in_parts = next_future_num_parts_in_parts;
7871 next_future_num_parts_in_parts = tmpPartVect;
7872
7873 if(this->mj_keep_part_boxes) {
7874 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
7875 input_part_boxes = output_part_boxes;
7876 output_part_boxes = tmpPartBoxes;
7877 }
7878 continue;
7879 }
7880
7881 // get the coordinate axis along which the partitioning will be done.
7882 int coordInd = i % this->coord_dim;
7883
7884 Kokkos::View<mj_scalar_t *, device_t> mj_current_dim_coords =
7885 Kokkos::subview(this->mj_coordinates, Kokkos::ALL, coordInd);
7886
7887 this->mj_env->timerStart(MACRO_TIMERS,
7888 mj_timer_base_string + "Problem_Partitioning_" + istring);
7889
7890 // alloc Memory to point the indices
7891 // of the parts in the permutation array.
7892 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>(
7893 "new part xadj", output_part_count_in_dimension);
7894
7895 // the index where in the new_part_xadj will be written.
7896 mj_part_t output_part_index = 0;
7897
7898 // whatever is written to output_part_index will be added with
7899 // output_coordinate_end_index so that the points will be shifted.
7900 mj_part_t output_coordinate_end_index = 0;
7901
7902 mj_part_t current_work_part = 0;
7903 mj_part_t current_concurrent_num_parts =
7904 std::min(current_num_parts - current_work_part,
7905 this->max_concurrent_part_calculation);
7906
7907 mj_part_t obtained_part_index = 0;
7908
7909 auto host_process_local_min_max_coord_total_weight =
7910 Kokkos::create_mirror_view(process_local_min_max_coord_total_weight);
7911 auto host_global_min_max_coord_total_weight =
7912 Kokkos::create_mirror_view(global_min_max_coord_total_weight);
7913
7914 // run for all available parts.
7915 for(; current_work_part < current_num_parts;
7916 current_work_part += current_concurrent_num_parts) {
7917
7918 current_concurrent_num_parts =
7919 std::min(current_num_parts - current_work_part,
7920 this->max_concurrent_part_calculation);
7921
7922 int bDoingWork_int; // Can't reduce on bool so use int
7923 auto local_device_num_partitioning_in_current_dim =
7924 device_num_partitioning_in_current_dim;
7925 Kokkos::parallel_reduce("Read bDoingWork",
7926 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
7927 KOKKOS_LAMBDA(int dummy, int & set_single) {
7928 set_single = 0;
7929 for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
7930 if(local_device_num_partitioning_in_current_dim(
7931 current_work_part + kk) != 1) {
7932 set_single = 1;
7933 break;
7934 }
7935 }
7936 }, bDoingWork_int);
7937 bool bDoingWork = (bDoingWork_int != 0) ? true : false;
7938
7939 this->mj_get_local_min_max_coord_totW(
7940 current_work_part,
7941 current_concurrent_num_parts,
7942 mj_current_dim_coords);
7943
7944 // 1D partitioning
7945 if(bDoingWork) {
7946 // obtain global Min max of the part.
7947 this->mj_get_global_min_max_coord_totW(
7948 current_concurrent_num_parts,
7949 this->process_local_min_max_coord_total_weight,
7950 this->global_min_max_coord_total_weight);
7951
7952 // represents the total number of cutlines
7953 // whose coordinate should be determined.
7954 mj_part_t total_incomplete_cut_count = 0;
7955
7956 // Compute weight ratios for parts & cuts:
7957 // e.g., 0.25 0.25 0.5 0.5 0.75 0.75 1
7958 // part0 cut0 part1 cut1 part2 cut2 part3
7959 mj_part_t concurrent_part_cut_shift = 0;
7960 mj_part_t concurrent_part_part_shift = 0;
7961
7962 for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
7963
7964 Kokkos::deep_copy(host_global_min_max_coord_total_weight,
7965 global_min_max_coord_total_weight);
7966
7967 mj_scalar_t min_coordinate =
7968 host_global_min_max_coord_total_weight(kk);
7969 mj_scalar_t max_coordinate =
7970 host_global_min_max_coord_total_weight(
7971 kk + current_concurrent_num_parts);
7972
7973 mj_scalar_t global_total_weight =
7974 host_global_min_max_coord_total_weight(
7975 kk + 2 * current_concurrent_num_parts);
7976
7977 mj_part_t concurrent_current_part_index = current_work_part + kk;
7978
7979 mj_part_t partition_count = host_num_partitioning_in_current_dim(
7980 concurrent_current_part_index);
7981
7982 Kokkos::View<mj_scalar_t *, device_t> usedCutCoordinate =
7983 Kokkos::subview(current_cut_coordinates,
7984 std::pair<mj_lno_t, mj_lno_t>(
7985 concurrent_part_cut_shift, current_cut_coordinates.size()));
7986 Kokkos::View<mj_scalar_t *, device_t>
7987 current_target_part_weights =
7988 Kokkos::subview(target_part_weights,
7989 std::pair<mj_lno_t, mj_lno_t>(
7990 concurrent_part_part_shift, target_part_weights.size()));
7991
7992 // shift the usedCutCoordinate array as noCuts.
7993 concurrent_part_cut_shift += partition_count - 1;
7994 // shift the partRatio array as noParts.
7995 concurrent_part_part_shift += partition_count;
7996
7997 // calculate only if part is not empty,
7998 // and part will be further partitioned.
7999 if(partition_count > 1 && min_coordinate <= max_coordinate) {
8000
8001 // increase num_cuts_do_be_determined by the number of cuts of the
8002 // current part's cut line number.
8003 total_incomplete_cut_count += partition_count - 1;
8004
8005 this->incomplete_cut_count(kk) = partition_count - 1;
8006
8007 // get the target weights of the parts
8008 this->mj_get_initial_cut_coords_target_weights(
8009 min_coordinate,
8010 max_coordinate,
8011 partition_count - 1,
8012 global_total_weight,
8013 usedCutCoordinate,
8014 current_target_part_weights,
8015 future_num_part_in_parts,
8016 next_future_num_parts_in_parts,
8017 concurrent_current_part_index,
8018 obtained_part_index);
8019
8020 mj_lno_t coordinate_end_index =
8021 host_part_xadj(concurrent_current_part_index);
8022 mj_lno_t coordinate_begin_index =
8023 concurrent_current_part_index==0 ? 0 :
8024 host_part_xadj(concurrent_current_part_index - 1);
8025
8026 this->set_initial_coordinate_parts(
8027 max_coordinate,
8028 min_coordinate,
8029 coordinate_begin_index, coordinate_end_index,
8030 this->coordinate_permutations,
8031 mj_current_dim_coords,
8032 this->assigned_part_ids,
8033 partition_count);
8034 }
8035 else {
8036 // e.g., if have fewer coordinates than parts, don't need to do
8037 // next dim.
8038 this->incomplete_cut_count(kk) = 0;
8039 }
8040
8041 obtained_part_index += partition_count;
8042 }
8043
8044 // used imbalance, it is always 0, as it is difficult to
8045 // estimate a range.
8046 double used_imbalance = 0;
8047 // Determine cut lines for all concurrent parts parts here.
8048 this->mj_env->timerStart(MACRO_TIMERS,
8049 mj_timer_base_string + "Problem_Partitioning Get Part Weights");
8050
8051 this->mj_1D_part(
8052 mj_current_dim_coords,
8053 used_imbalance,
8054 current_work_part,
8055 current_concurrent_num_parts,
8056 current_cut_coordinates,
8057 total_incomplete_cut_count,
8058 view_rectilinear_cut_count,
8059 view_total_reduction_size);
8060
8061 this->mj_env->timerStop(MACRO_TIMERS,
8062 mj_timer_base_string + "Problem_Partitioning Get Part Weights");
8063 }
8064
8065 // create new part chunks
8066 {
8067 mj_part_t output_array_shift = 0;
8068 mj_part_t cut_shift = 0;
8069 size_t tlr_shift = 0;
8070 size_t partweight_array_shift = 0;
8071 for(int kk = 0; kk < current_concurrent_num_parts; ++kk) {
8072
8073 mj_part_t current_concurrent_work_part = current_work_part + kk;
8074
8075 mj_part_t num_parts = host_num_partitioning_in_current_dim(
8076 current_concurrent_work_part);
8077
8078 // if the part is empty, skip the part.
8079 int coordinateA_bigger_than_coordinateB =
8080 host_global_min_max_coord_total_weight(kk) >
8081 host_global_min_max_coord_total_weight(
8082 kk + current_concurrent_num_parts);
8083
8084 if((num_parts != 1) && coordinateA_bigger_than_coordinateB) {
8085 // we still need to write the begin and end point of the empty part.
8086 // simply set it zero, the array indices will be shifted later
8087 auto local_new_part_xadj = this->new_part_xadj;
8088 Kokkos::parallel_for(
8089 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
8090 (0, num_parts), KOKKOS_LAMBDA (mj_part_t jj) {
8091 local_new_part_xadj(
8092 output_part_index + output_array_shift + jj) = 0;
8093 });
8094
8095 cut_shift += num_parts - 1;
8096 tlr_shift += (4 *(num_parts - 1) + 1);
8097 output_array_shift += num_parts;
8098 partweight_array_shift += (2 * (num_parts - 1) + 1);
8099 continue;
8100 }
8101
8102 Kokkos::View<mj_scalar_t *, device_t>
8103 current_concurrent_cut_coordinate =
8104 Kokkos::subview(current_cut_coordinates,
8105 std::pair<mj_lno_t, mj_lno_t>(
8106 cut_shift,
8107 current_cut_coordinates.size()));
8108 Kokkos::View<mj_scalar_t *, device_t>
8109 used_local_cut_line_weight_to_left =
8110 Kokkos::subview(process_cut_line_weight_to_put_left,
8111 std::pair<mj_lno_t, mj_lno_t>(
8112 cut_shift,
8113 process_cut_line_weight_to_put_left.size()));
8114
8115 this->thread_part_weight_work =
8116 Kokkos::subview(
8117 this->thread_part_weights,
8118 std::pair<mj_lno_t, mj_lno_t>(
8119 partweight_array_shift,
8120 this->thread_part_weights.extent(0)));
8121
8122 if(num_parts > 1) {
8123 if(this->mj_keep_part_boxes) {
8124 // if part boxes are to be stored update the boundaries.
8125 for(mj_part_t j = 0; j < num_parts - 1; ++j) {
8126 mj_scalar_t temp_get_val;
8127 Kokkos::parallel_reduce("Read single",
8128 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
8129 KOKKOS_LAMBDA(int dummy, mj_scalar_t & set_single) {
8130 set_single = current_concurrent_cut_coordinate(j);
8131 }, temp_get_val);
8132 (*output_part_boxes)
8133 [output_array_shift + output_part_index + j].
8134 updateMinMax(temp_get_val, 1 /*update max*/, coordInd);
8135 (*output_part_boxes)
8136 [output_array_shift + output_part_index + j + 1].
8137 updateMinMax(temp_get_val, 0 /*update max*/, coordInd);
8138 }
8139 }
8140
8141 // Rewrite the indices based on the computed cuts.
8142 Kokkos::View<mj_lno_t*, device_t> sub_new_part_xadj =
8143 Kokkos::subview(this->new_part_xadj,
8144 std::pair<mj_lno_t, mj_lno_t>(
8145 output_part_index + output_array_shift,
8146 this->new_part_xadj.size()));
8147
8148 this->mj_create_new_partitions(
8149 num_parts,
8150 current_concurrent_work_part,
8151 mj_current_dim_coords,
8152 current_concurrent_cut_coordinate,
8153 used_local_cut_line_weight_to_left,
8154 sub_new_part_xadj);
8155 }
8156 else {
8157
8158 mj_lno_t coordinate_end = host_part_xadj(
8159 current_concurrent_work_part);
8160 mj_lno_t coordinate_begin =
8161 current_concurrent_work_part==0 ? 0 : host_part_xadj(
8162 current_concurrent_work_part - 1);
8163
8164 // if this part is partitioned into 1 then just copy
8165 // the old values.
8166 mj_lno_t part_size = coordinate_end - coordinate_begin;
8167
8168 // Awkward here to set one value - need some broader
8169 // refactoring to improve this one.
8170 auto local_new_part_xadj = this->new_part_xadj;
8171 Kokkos::parallel_for(
8172 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
8173 (0, 1), KOKKOS_LAMBDA (int dummy) {
8174 local_new_part_xadj(
8175 output_part_index + output_array_shift) = part_size;
8176 });
8177
8178 auto subview_new_coordinate_permutations =
8179 Kokkos::subview(this->new_coordinate_permutations,
8180 std::pair<mj_lno_t, mj_lno_t>(
8181 coordinate_begin,
8182 coordinate_begin + part_size));
8183 auto subview_coordinate_permutations =
8184 Kokkos::subview(this->coordinate_permutations,
8185 std::pair<mj_lno_t, mj_lno_t>(
8186 coordinate_begin,
8187 coordinate_begin + part_size));
8188 Kokkos::deep_copy(subview_new_coordinate_permutations,
8189 subview_coordinate_permutations);
8190 }
8191 cut_shift += num_parts - 1;
8192 output_array_shift += num_parts;
8193 partweight_array_shift += (2 * (num_parts - 1) + 1);
8194 }
8195
8196 // shift cut coordinates so that all cut coordinates are stored.
8197 // no shift now because we dont keep the cuts.
8198 // current_cut_coordinates += cut_shift;
8199 // mj_create_new_partitions from coordinates partitioned the parts
8200 // and write the indices as if there were a single part.
8201 // now we need to shift the beginning indices.
8202 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk) {
8203 mj_part_t num_parts =
8204 host_num_partitioning_in_current_dim(current_work_part + kk);
8205
8206 // These two kernels are a bit awkward but need broader redesign to
8207 // avoid this situation.
8208 auto local_new_part_xadj = this->new_part_xadj;
8209 Kokkos::parallel_for(
8210 Kokkos::RangePolicy<typename mj_node_t::execution_space, mj_part_t>
8211 (0, num_parts), KOKKOS_LAMBDA (mj_part_t ii) {
8212 local_new_part_xadj(output_part_index+ii) +=
8213 output_coordinate_end_index;
8214 });
8215
8216 // increase the previous count by current end.
8217 mj_part_t temp_get;
8218 Kokkos::parallel_reduce("Read single",
8219 Kokkos::RangePolicy<typename mj_node_t::execution_space, int> (0, 1),
8220 KOKKOS_LAMBDA(int dummy, mj_part_t & set_single) {
8221 set_single =
8222 local_new_part_xadj(output_part_index + num_parts - 1);
8223 }, temp_get);
8224 output_coordinate_end_index = temp_get;
8225 //increase the current out.
8226 output_part_index += num_parts;
8227 }
8228 }
8229 }
8230
8231 // end of this partitioning dimension
8232 int current_world_size = this->comm->getSize();
8233 long migration_reduce_all_population =
8234 this->total_dim_num_reduce_all * current_world_size;
8235 bool is_migrated_in_current_dimension = false;
8236
8237 // we migrate if there are more partitionings to be done after this step
8238 // and if the migration is not forced to be avoided.
8239 // and the operation is not sequential.
8240 if(future_num_parts > 1 &&
8241 this->check_migrate_avoid_migration_option >= 0 &&
8242 current_world_size > 1) {
8243 this->mj_env->timerStart(MACRO_TIMERS,
8244 mj_timer_base_string + "Problem_Migration-" + istring);
8245 mj_part_t num_parts = output_part_count_in_dimension;
8246
8247 if(this->mj_perform_migration(
8248 num_parts,
8249 current_num_parts, //output
8250 next_future_num_parts_in_parts, //output
8251 output_part_begin_index,
8252 migration_reduce_all_population,
8253 this->num_global_coords / (future_num_parts * current_num_parts),
8254 istring,
8255 input_part_boxes, output_part_boxes) )
8256 {
8257 is_migrated_in_current_dimension = true;
8258 is_data_ever_migrated = true;
8259 this->mj_env->timerStop(MACRO_TIMERS,
8260 mj_timer_base_string + "Problem_Migration-" + istring);
8261 // since data is migrated, we reduce the number of reduceAll
8262 // operations for the last part.
8263 this->total_dim_num_reduce_all /= num_parts;
8264 }
8265 else {
8266 is_migrated_in_current_dimension = false;
8267 this->mj_env->timerStop(MACRO_TIMERS,
8268 mj_timer_base_string + "Problem_Migration-" + istring);
8269 }
8270 }
8271
8272 // swap the coordinate permutations for the next dimension.
8273 Kokkos::View<mj_lno_t*, device_t> tmp =
8274 this->coordinate_permutations;
8275 this->coordinate_permutations =
8276 this->new_coordinate_permutations;
8277
8278 this->new_coordinate_permutations = tmp;
8279 if(!is_migrated_in_current_dimension) {
8280 this->total_dim_num_reduce_all -= current_num_parts;
8281 current_num_parts = output_part_count_in_dimension;
8282 }
8283
8284 {
8285 this->part_xadj = this->new_part_xadj;
8286 local_part_xadj = this->new_part_xadj;
8287 this->host_part_xadj = Kokkos::create_mirror_view(part_xadj);
8288 Kokkos::deep_copy(host_part_xadj, part_xadj); // keep in sync
8289
8290 this->new_part_xadj = Kokkos::View<mj_lno_t*, device_t>("empty", 0);
8291 this->mj_env->timerStop(MACRO_TIMERS,
8292 mj_timer_base_string + "Problem_Partitioning_" + istring);
8293 }
8294 }
8295
8296 // Partitioning is done
8297 delete future_num_part_in_parts;
8298 delete next_future_num_parts_in_parts;
8299 this->mj_env->timerStop(MACRO_TIMERS,
8300 mj_timer_base_string + "Problem_Partitioning");
8302
8303 //get the final parts of each initial coordinate
8304 //the results will be written to
8305 //this->assigned_part_ids for gnos given in this->current_mj_gnos
8306 this->set_final_parts(
8307 current_num_parts,
8308 output_part_begin_index,
8309 output_part_boxes,
8310 is_data_ever_migrated);
8311
8312 result_assigned_part_ids_ = this->assigned_part_ids;
8313 result_mj_gnos_ = this->current_mj_gnos;
8314 this->mj_env->timerStop(MACRO_TIMERS,
8315 mj_timer_base_string + "Total");
8316 this->mj_env->debug(3, "Out of MultiJagged");
8317}
8318
8319template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
8320 typename mj_part_t, typename mj_node_t>
8321RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t, mj_node_t>::
8322 mj_partBoxVector_t>
8324 get_kept_boxes() const
8325{
8326 if(this->mj_keep_part_boxes) {
8327 return this->kept_boxes;
8328 }
8329 else {
8330 throw std::logic_error("Error: part boxes are not stored.");
8331 }
8332}
8333
8334template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
8335 typename mj_part_t, typename mj_node_t>
8336RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t, mj_node_t>::
8337 mj_partBoxVector_t>
8339 compute_global_box_boundaries(RCP<mj_partBoxVector_t> &localPartBoxes) const
8340{
8341 typedef typename Zoltan2::coordinateModelPartBox::coord_t coord_t;
8342 mj_part_t ntasks = this->num_global_parts;
8343 int dim = (*localPartBoxes)[0].getDim();
8344 coord_t *localPartBoundaries = new coord_t[ntasks * 2 *dim];
8345
8346 memset(localPartBoundaries, 0, sizeof(coord_t) * ntasks * 2 *dim);
8347
8348 coord_t *globalPartBoundaries = new coord_t[ntasks * 2 *dim];
8349 memset(globalPartBoundaries, 0, sizeof(coord_t) * ntasks * 2 *dim);
8350
8351 coord_t *localPartMins = localPartBoundaries;
8352 coord_t *localPartMaxs = localPartBoundaries + ntasks * dim;
8353
8354 coord_t *globalPartMins = globalPartBoundaries;
8355 coord_t *globalPartMaxs = globalPartBoundaries + ntasks * dim;
8356
8357 mj_part_t boxCount = localPartBoxes->size();
8358 for(mj_part_t i = 0; i < boxCount; ++i) {
8359 mj_part_t pId = (*localPartBoxes)[i].getpId();
8360
8361 // cout << "me:" << comm->getRank() << " has:" << pId << endl;
8362
8363 coord_t *lmins = (*localPartBoxes)[i].getlmins();
8364 coord_t *lmaxs = (*localPartBoxes)[i].getlmaxs();
8365
8366 for(int j = 0; j < dim; ++j) {
8367 localPartMins[dim * pId + j] = lmins[j];
8368 localPartMaxs[dim * pId + j] = lmaxs[j];
8369
8370 /*
8371 std::cout << "me:" << comm->getRank() <<
8372 " dim * pId + j:"<< dim * pId + j <<
8373 " localMin:" << localPartMins[dim * pId + j] <<
8374 " localMax:" << localPartMaxs[dim * pId + j] << std::endl;
8375 */
8376 }
8377 }
8378
8379 Teuchos::Zoltan2_BoxBoundaries<int, coord_t> reductionOp(ntasks * 2 *dim);
8380
8381 reduceAll<int, coord_t>(*mj_problemComm, reductionOp,
8382 ntasks * 2 *dim, localPartBoundaries, globalPartBoundaries);
8383
8384 RCP<mj_partBoxVector_t> pB(new mj_partBoxVector_t(),true);
8385 for(mj_part_t i = 0; i < ntasks; ++i) {
8387 globalPartMins + dim * i,
8388 globalPartMaxs + dim * i);
8389
8390 /*
8391 for(int j = 0; j < dim; ++j) {
8392 std::cout << "me:" << comm->getRank() <<
8393 " dim * pId + j:"<< dim * i + j <<
8394 " globalMin:" << globalPartMins[dim * i + j] <<
8395 " globalMax:" << globalPartMaxs[dim * i + j] << std::endl;
8396 }
8397 */
8398
8399 pB->push_back(tpb);
8400 }
8401 delete []localPartBoundaries;
8402 delete []globalPartBoundaries;
8403 //RCP <mj_partBoxVector_t> tmpRCPBox(pB, true);
8404 return pB;
8405}
8406
8409template <typename Adapter>
8410class Zoltan2_AlgMJ : public Algorithm<Adapter>
8411{
8412
8413private:
8414
8415#ifndef DOXYGEN_SHOULD_SKIP_THIS
8416 // For coordinates and weights, MJ needs floats or doubles
8417 // But Adapter can provide other scalars, e.g., ints.
8418 // So have separate scalar_t for MJ and adapter.
8419 typedef typename Adapter::scalar_t adapter_scalar_t;
8420
8421 // Provide a default type for mj_scalar_t;
8422 typedef float default_mj_scalar_t;
8423
8424 // If Adapter provided float or double scalar_t, use it (prevents copies).
8425 // Otherwise, use the default type of mj_scalar_t;
8426 typedef typename
8427 std::conditional<
8428 (std::is_same<adapter_scalar_t, float>::value ||
8429 std::is_same<adapter_scalar_t, double>::value),
8430 adapter_scalar_t, default_mj_scalar_t>::type mj_scalar_t;
8431
8432 typedef typename Adapter::gno_t mj_gno_t;
8433 typedef typename Adapter::lno_t mj_lno_t;
8434 typedef typename Adapter::part_t mj_part_t;
8435 typedef typename Adapter::node_t mj_node_t;
8436 typedef coordinateModelPartBox mj_partBox_t;
8437 typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
8438 typedef typename mj_node_t::device_type device_t;
8439#endif
8440
8442
8443 RCP<const Environment> mj_env; // the environment object
8444 RCP<const Comm<int> > mj_problemComm; // initial comm object
8445 RCP<const typename Adapter::base_adapter_t> mj_adapter; // coordinate adapter
8446
8447 // PARAMETERS
8448 double imbalance_tolerance; // input imbalance tolerance.
8449
8450 int num_teams; // how many teams to run main loop with
8451
8452 size_t num_global_parts; // the targeted number of parts
8453
8454 // input part array specifying num part to divide along each dim.
8455 Kokkos::View<mj_part_t*, Kokkos::HostSpace> part_no_array;
8456
8457 // the number of steps that partitioning will be solved in.
8458 int recursion_depth;
8459
8460 int coord_dim; // coordinate dimension.
8461 mj_lno_t num_local_coords; //number of local coords.
8462 mj_gno_t num_global_coords; //number of global coords.
8463
8464 // initial global ids of the coordinates.
8465 Kokkos::View<const mj_gno_t*, device_t> initial_mj_gnos;
8466
8467 // two dimension coordinate array.
8468 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8469 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
8470 mj_coordinates;
8471
8472 int num_weights_per_coord; // number of weights per coordinate
8473
8474 // if the target parts are uniform.
8475 Kokkos::View<bool*, Kokkos::HostSpace> mj_uniform_weights;
8476
8477 // two dimensional weight array.
8478 Kokkos::View<mj_scalar_t**, device_t> mj_weights;
8479
8480 // if the target parts are uniform
8481 Kokkos::View<bool*, Kokkos::HostSpace> mj_uniform_parts;
8482
8483 // Nonuniform first level partitioning
8484 // Currently used for Dragonfly task mapping by partitioning Dragonfly RCA
8485 // machine coordinates and application coordinates.
8486 // An optimization that completely partitions the most important machine
8487 // dimension first (i.e. the Dragonfly group coordinate, or RCA's x
8488 // coordinate). The standard MJ alg follows after the nonuniform first level
8489 // partitioning.
8490 // If used, number of parts for the first level partitioning
8491 mj_part_t num_first_level_parts;
8492
8493 // If used, the distribution of parts for the nonuniform
8494 // first level partitioning
8495 Kokkos::View<mj_part_t*, Kokkos::HostSpace> first_level_distribution;
8496
8497 // if partitioning can distribute points on same coordiante to
8498 // different parts.
8499 bool distribute_points_on_cut_lines;
8500
8501 // how many parts we can calculate concurrently.
8502 mj_part_t max_concurrent_part_calculation;
8503
8504 // whether to migrate=1, avoid migrate=2, or leave decision to MJ=0
8505 int check_migrate_avoid_migration_option;
8506
8507 // when doing the migration, 0 will aim for perfect load-imbalance,
8508 int migration_type;
8509
8510 // 1 for minimized messages
8511
8512 // when MJ decides whether to migrate, the minimum imbalance for migration.
8513 double minimum_migration_imbalance;
8514 bool mj_keep_part_boxes; //if the boxes need to be kept.
8515
8516 // if this is set, then recursion depth is adjusted to its maximum value.
8517 bool mj_run_as_rcb;
8518 int mj_premigration_option;
8519 int min_coord_per_rank_for_premigration;
8520
8521 // communication graph xadj
8522 ArrayRCP<mj_part_t> comXAdj_;
8523
8524 // communication graph adj.
8525 ArrayRCP<mj_part_t> comAdj_;
8526
8527 void copy(
8528 const RCP<PartitioningSolution<Adapter> >&solution);
8529
8530 void set_input_parameters(const Teuchos::ParameterList &p);
8531
8532 RCP<mj_partBoxVector_t> getGlobalBoxBoundaries() const;
8533
8534 bool mj_premigrate_to_subset(
8535 int used_num_ranks,
8536 int migration_selection_option,
8537 RCP<const Environment> mj_env_,
8538 RCP<const Comm<int> > mj_problemComm_,
8539 int coord_dim_,
8540 mj_lno_t num_local_coords_,
8541 mj_gno_t num_global_coords_, size_t num_global_parts_,
8542 Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos_,
8543 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8544 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> &
8545 mj_coordinates_,
8546 int num_weights_per_coord_,
8547 Kokkos::View<mj_scalar_t**, device_t> & mj_weights_,
8548 //results
8549 RCP<const Comm<int> > &result_problemComm_,
8550 mj_lno_t & result_num_local_coords_,
8551 Kokkos::View<mj_gno_t*, device_t> & result_initial_mj_gnos_,
8552 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8553 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> &
8554 result_mj_coordinates_,
8555 Kokkos::View<mj_scalar_t**, device_t> & result_mj_weights_,
8556 int * &result_actual_owner_rank_);
8557
8558public:
8559
8560 Zoltan2_AlgMJ(const RCP<const Environment> &env,
8561 RCP<const Comm<int> > &problemComm,
8562 const RCP<const typename Adapter::base_adapter_t> &adapter) :
8563 mj_partitioner(),
8564 mj_env(env),
8565 mj_problemComm(problemComm),
8566 mj_adapter(adapter),
8567 imbalance_tolerance(0),
8568 num_teams(0),
8569 num_global_parts(1),
8570 recursion_depth(0),
8571 coord_dim(0),
8572 num_local_coords(0),
8573 num_global_coords(0),
8574 num_weights_per_coord(0),
8575 num_first_level_parts(1),
8576 distribute_points_on_cut_lines(true),
8577 max_concurrent_part_calculation(1),
8578 check_migrate_avoid_migration_option(0),
8579 migration_type(0),
8580 minimum_migration_imbalance(0.30),
8581 mj_keep_part_boxes(false),
8582 mj_run_as_rcb(false),
8583 mj_premigration_option(0),
8584 min_coord_per_rank_for_premigration(32000),
8585 comXAdj_(),
8586 comAdj_()
8587 {
8588 }
8589
8591 {
8592 }
8593
8596 static void getValidParameters(ParameterList & pl)
8597 {
8598 const bool bUnsorted = true; // this clarifies the flag is for unsrorted
8599 RCP<Zoltan2::IntegerRangeListValidator<int>> mj_parts_Validator =
8600 Teuchos::rcp( new Zoltan2::IntegerRangeListValidator<int>(bUnsorted) );
8601 pl.set("mj_parts", "0", "list of parts for multiJagged partitioning "
8602 "algorithm. As many as the dimension count.", mj_parts_Validator);
8603
8604 pl.set("mj_concurrent_part_count", 1, "The number of parts whose cut "
8605 "coordinates will be calculated concurently.",
8606 Environment::getAnyIntValidator());
8607
8608 pl.set("mj_minimum_migration_imbalance", 1.1,
8609 "mj_minimum_migration_imbalance, the minimum imbalance of the "
8610 "processors to avoid migration",
8611 Environment::getAnyDoubleValidator());
8612
8613 RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_option_validator =
8614 Teuchos::rcp( new Teuchos::EnhancedNumberValidator<int>(0, 2) );
8615 pl.set("mj_migration_option", 1, "Migration option, 0 for decision "
8616 "depending on the imbalance, 1 for forcing migration, 2 for "
8617 "avoiding migration", mj_migration_option_validator);
8618
8619 RCP<Teuchos::EnhancedNumberValidator<int>> mj_migration_type_validator =
8620 Teuchos::rcp( new Teuchos::EnhancedNumberValidator<int>(0, 1) );
8621 pl.set("mj_migration_type", 0,
8622 "Migration type, 0 for migration to minimize the imbalance "
8623 "1 for migration to minimize messages exchanged the migration.",
8624 mj_migration_option_validator);
8625
8626 // bool parameter
8627 pl.set("mj_keep_part_boxes", false, "Keep the part boundaries of the "
8628 "geometric partitioning.", Environment::getBoolValidator());
8629
8630 // bool parameter
8631 pl.set("mj_enable_rcb", false, "Use MJ as RCB.",
8632 Environment::getBoolValidator());
8633
8634 pl.set("mj_recursion_depth", -1, "Recursion depth for MJ: Must be "
8635 "greater than 0.", Environment::getAnyIntValidator());
8636
8637 RCP<Teuchos::EnhancedNumberValidator<int>>
8638 mj_num_teams_validator =
8639 Teuchos::rcp( new Teuchos::EnhancedNumberValidator<int>(
8640 0, Teuchos::EnhancedNumberTraits<int>::max()) );
8641 pl.set("mj_num_teams", 0,
8642 "How many teams for the main kernel loop"
8643 , mj_num_teams_validator);
8644
8645 RCP<Teuchos::EnhancedNumberValidator<int>>
8646 mj_premigration_option_validator =
8647 Teuchos::rcp( new Teuchos::EnhancedNumberValidator<int>(0, 1024) );
8648
8649 pl.set("mj_premigration_option", 0,
8650 "Whether to do premigration or not. 0 for no migration "
8651 "x > 0 for migration to consecutive processors, "
8652 "the subset will be 0,x,2x,3x,...subset ranks."
8653 , mj_premigration_option_validator);
8654
8655 pl.set("mj_premigration_coordinate_count", 32000, "How many coordinate to "
8656 "assign each rank in multijagged after premigration"
8657 , Environment::getAnyIntValidator());
8658 }
8659
8665 void partition(const RCP<PartitioningSolution<Adapter> > &solution);
8666
8667 mj_partBoxVector_t &getPartBoxesView() const
8668 {
8669 RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
8670 return *pBoxes;
8671 }
8672
8673 mj_part_t pointAssign(int dim, adapter_scalar_t *point) const;
8674
8675 void boxAssign(int dim, adapter_scalar_t *lower, adapter_scalar_t *upper,
8676 size_t &nPartsFound, mj_part_t **partsFound) const;
8677
8680 void getCommunicationGraph(
8681 const PartitioningSolution<Adapter> *solution,
8682 ArrayRCP<mj_part_t> &comXAdj,
8683 ArrayRCP<mj_part_t> &comAdj);
8684
8685 void set_up_partitioning_data( // public for CUDA
8686 const RCP<PartitioningSolution<Adapter> >&solution);
8687
8688 private:
8689 std::string timer_base_string; // used for making timers
8690
// Helper pair used once the views have been loaded from the coordinate
// adapter. When the destination and source views hold the same value type
// the destination simply takes over the source (plain assignment). When
// the value types differ, that assignment would not compile, so the second
// overload is selected instead and does nothing; the caller then allocates
// a new view and converts the elements itself.
// See the test Zoltan2_mj_int_coordinates.
template<class to_view_t, class from_view_t> // value types agree
typename std::enable_if<
  std::is_same<typename to_view_t::value_type,
               typename from_view_t::value_type>::value>::type
assign_if_same(to_view_t & dst, const from_view_t & src) {
  dst = src;
}
template<class to_view_t, class from_view_t> // value types differ
typename std::enable_if<
  !std::is_same<typename to_view_t::value_type,
                typename from_view_t::value_type>::value>::type
assign_if_same(to_view_t & /* dst */, const from_view_t & /* src */) {
  // intentionally empty - the caller copies element by element
}
8709};
8710
8711template <typename Adapter>
8712bool Zoltan2_AlgMJ<Adapter>::mj_premigrate_to_subset(
8713 int used_num_ranks,
8714 int migration_selection_option,
8715 RCP<const Environment> mj_env_,
8716 RCP<const Comm<int> > mj_problemComm_,
8717 int coord_dim_,
8718 mj_lno_t num_local_coords_,
8719 mj_gno_t num_global_coords_, size_t num_global_parts_,
8720 Kokkos::View<const mj_gno_t*, device_t> & initial_mj_gnos_,
8721 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8722 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> & mj_coordinates_,
8723 int num_weights_per_coord_,
8724 Kokkos::View<mj_scalar_t**, device_t> & mj_weights_,
8725 //results
8726 RCP<const Comm<int> > & result_problemComm_,
8727 mj_lno_t &result_num_local_coords_,
8728 Kokkos::View<mj_gno_t*, device_t> & result_initial_mj_gnos_,
8729 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8730 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> &
8731 result_mj_coordinates_,
8732 Kokkos::View<mj_scalar_t**, device_t> & result_mj_weights_,
8733 int * &result_actual_owner_rank_)
8734{
8735 mj_env_->timerStart(MACRO_TIMERS,
8736 timer_base_string + "PreMigration DistributorPlanCreating");
8737
8738 int myRank = mj_problemComm_->getRank();
8739 int worldSize = mj_problemComm_->getSize();
8740
8741 mj_part_t groupsize = worldSize / used_num_ranks;
8742
8743 std::vector<mj_part_t> group_begins(used_num_ranks + 1, 0);
8744
8745 mj_part_t i_am_sending_to = 0;
8746 bool am_i_a_receiver = false;
8747
8748 for(int i = 0; i < used_num_ranks; ++i) {
8749 group_begins[i+ 1] = group_begins[i] + groupsize;
8750 if(worldSize % used_num_ranks > i) group_begins[i+ 1] += 1;
8751 if(i == used_num_ranks) group_begins[i+ 1] = worldSize;
8752 if(myRank >= group_begins[i] && myRank < group_begins[i + 1]) {
8753 i_am_sending_to = group_begins[i];
8754 }
8755 if(myRank == group_begins[i]) {
8756 am_i_a_receiver = true;
8757 }
8758 }
8759
8760 ArrayView<const mj_part_t> idView(&(group_begins[0]), used_num_ranks );
8761 result_problemComm_ = mj_problemComm_->createSubcommunicator(idView);
8762
8763 Tpetra::Distributor distributor(mj_problemComm_);
8764
8765 std::vector<mj_part_t>
8766 coordinate_destinations(num_local_coords_, i_am_sending_to);
8767
8768 ArrayView<const mj_part_t>
8769 destinations(&(coordinate_destinations[0]), num_local_coords_);
8770 mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
8771 result_num_local_coords_ = num_incoming_gnos;
8772 mj_env_->timerStop(MACRO_TIMERS,
8773 timer_base_string + "PreMigration DistributorPlanCreating");
8774
8775 mj_env_->timerStart(MACRO_TIMERS,
8776 timer_base_string + "PreMigration DistributorMigration");
8777
8778
8779 // migrate gnos.
8780 // MPI buffers should be on Kokkos::HostSpace not Kokkos::CudaUVMSpace
8781 // Note, with UVM space, create_mirror_view does NOT create a non-UVM
8782 // view; need the explicit Host creation and deep_copy.
8783 {
8784 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> sent_gnos(
8785 Kokkos::ViewAllocateWithoutInitializing("sent_gnos"),
8786 initial_mj_gnos_.size()); // initial_mj_gnos_ is const mj_gno_t *
8787 Kokkos::deep_copy(sent_gnos, initial_mj_gnos_);
8788
8789 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> received_gnos (
8790 Kokkos::ViewAllocateWithoutInitializing("received_gnos"),
8791 num_incoming_gnos);
8792
8793 distributor.doPostsAndWaits(sent_gnos, 1, received_gnos);
8794
8795 result_initial_mj_gnos_ = Kokkos::View<mj_gno_t*, device_t>(
8796 Kokkos::ViewAllocateWithoutInitializing("result_initial_mj_gnos_"),
8797 num_incoming_gnos);
8798 Kokkos::deep_copy(result_initial_mj_gnos_, received_gnos);
8799 }
8800
8801 // migrate coordinates
8802 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8803
8804 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, Kokkos::HostSpace>
8805 host_src_coordinates(
8806 Kokkos::ViewAllocateWithoutInitializing("mj_coordinates"),
8807 this->mj_coordinates.extent(0), this->mj_coordinates.extent(1));
8808
8809 Kokkos::deep_copy(host_src_coordinates, this->mj_coordinates);
8810
8811 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t> dst_coordinates(
8812 Kokkos::ViewAllocateWithoutInitializing("mj_coordinates"),
8813 num_incoming_gnos, this->coord_dim);
8814
8815 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> received_coord(
8816 Kokkos::ViewAllocateWithoutInitializing("received_coord"),
8817 num_incoming_gnos);
8818
8819 for(int i = 0; i < this->coord_dim; ++i) {
8820
8821 auto sent_coord = Kokkos::subview(host_src_coordinates, Kokkos::ALL, i);
8822
8823 distributor.doPostsAndWaits(sent_coord, 1, received_coord);
8824
8825 Kokkos::deep_copy(Kokkos::subview(dst_coordinates, Kokkos::ALL, i),
8826 received_coord);
8827 Kokkos::fence();
8828 }
8829 result_mj_coordinates_ = dst_coordinates;
8830
8831 // migrate weights.
8832
8833 Kokkos::View<mj_scalar_t**, device_t> dst_weights(
8834 Kokkos::ViewAllocateWithoutInitializing("mj_weights"),
8835 num_incoming_gnos, this->num_weights_per_coord);
8836 auto host_dst_weights = Kokkos::create_mirror_view(dst_weights);
8837
8838 auto host_src_weights = Kokkos::create_mirror_view_and_copy(
8839 Kokkos::HostSpace(), this->mj_weights);
8840
8841 // contiguous buffers to gather potentially strided data
8842 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> sent_weight(
8843 Kokkos::ViewAllocateWithoutInitializing("send_weight_buffer"),
8844 this->num_local_coords);
8845
8846 Kokkos::View<mj_scalar_t*, Kokkos::HostSpace> received_weight(
8847 Kokkos::ViewAllocateWithoutInitializing("received_weight_buffer"),
8848 num_incoming_gnos);
8849
8850 for(int i = 0; i < this->num_weights_per_coord; ++i) {
8851
8852 auto sub_host_src_weights
8853 = Kokkos::subview(host_src_weights, Kokkos::ALL, i);
8854 auto sub_host_dst_weights
8855 = Kokkos::subview(host_dst_weights, Kokkos::ALL, i);
8856
8857 // Layout Right means these weights are not contiguous
8858 // However we don't have any systems setup with more than 1 weight so
8859 // really I have not tested any of this code with num weights > 1.
8860 // I think this is the right thing to do. Note that there are other
8861 // places in the code which don't handle the possibility of more weights.
8862 // So evaluating all that and adding tests would be another project.
8863 for(mj_lno_t n = 0; n < this->num_local_coords; ++n) {
8864 sent_weight[n] = sub_host_src_weights(n);
8865 }
8866
8867 distributor.doPostsAndWaits(sent_weight, 1, received_weight);
8868
8869 // Again we copy by index due to layout
8870 for(mj_lno_t n = 0; n < num_incoming_gnos; ++n) {
8871 sub_host_dst_weights(n) = received_weight[n];
8872 }
8873 }
8874 Kokkos::deep_copy(dst_weights, host_dst_weights);
8875 result_mj_weights_ = dst_weights;
8876
8877 // migrate the owners of the coordinates
8878 {
8879 Kokkos::View<int*, Kokkos::HostSpace> sent_owners(
8880 Kokkos::ViewAllocateWithoutInitializing("sent_owners"),
8881 num_local_coords_);
8882 Kokkos::deep_copy(sent_owners, myRank);
8883
8884 Kokkos::View<int*, Kokkos::HostSpace> received_owners(
8885 Kokkos::ViewAllocateWithoutInitializing("received_owners"),
8886 num_incoming_gnos);
8887
8888 distributor.doPostsAndWaits(sent_owners, 1, received_owners);
8889
8890 result_actual_owner_rank_ = new int[num_incoming_gnos];
8891 memcpy(
8892 result_actual_owner_rank_,
8893 received_owners.data(),
8894 num_incoming_gnos * sizeof(int));
8895 }
8896
8897 mj_env_->timerStop(MACRO_TIMERS,
8898 timer_base_string + "PreMigration DistributorMigration");
8899 return am_i_a_receiver;
8900}
8901
8909template <typename Adapter>
8911 const RCP<PartitioningSolution<Adapter> > &solution)
8912{
8913 // purpose of this code is to validate node and UVM status for the tests
8914 // std::cout << "Memory Space: " << mj_node_t::memory_space::name() << " "
8915 // << "Execution Space: " << mj_node_t::execution_space::name()
8916 // << std::endl;
8917
8918 int execute_counter =
8919 Zoltan2_AlgMJ_TrackCallsCounter::get_counter_Zoltan2_AlgMJ();
8920 timer_base_string = "partition(" + std::to_string(execute_counter) + ") - ";
8921
8922 this->mj_env->timerStart(MACRO_TIMERS, timer_base_string + "all");
8923 {
8924 this->mj_env->timerStart(MACRO_TIMERS, timer_base_string + "setup");
8925
8926 this->set_up_partitioning_data(solution);
8927
8928 this->set_input_parameters(this->mj_env->getParameters());
8929 if(this->mj_keep_part_boxes) {
8930 this->mj_partitioner.set_to_keep_part_boxes();
8931 }
8932
8933 this->mj_partitioner.set_partitioning_parameters(
8934 this->distribute_points_on_cut_lines,
8935 this->max_concurrent_part_calculation,
8936 this->check_migrate_avoid_migration_option,
8937 this->minimum_migration_imbalance, this->migration_type);
8938
8939 RCP<const Comm<int> > result_problemComm = this->mj_problemComm;
8940 mj_lno_t result_num_local_coords = this->num_local_coords;
8941 Kokkos::View<mj_gno_t*, device_t> result_initial_mj_gnos;
8942 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
8943 Kokkos::View<mj_scalar_t**, Kokkos::LayoutLeft, device_t>
8944 result_mj_coordinates = this->mj_coordinates;
8945 Kokkos::View<mj_scalar_t**, device_t> result_mj_weights =
8946 this->mj_weights;
8947 int *result_actual_owner_rank = NULL;
8948
8949 Kokkos::View<const mj_gno_t*, device_t> result_initial_mj_gnos_ =
8950 this->initial_mj_gnos;
8951
8952 // TODO: MD 08/2017: Further discussion is required.
8953 // MueLu calls MJ when it has very few coordinates per processors,
8954 // such as 10. For example, it begins with 1K processor with 1K coordinate
8955 // in each. Then with coarsening this reduces to 10 coordinate per procesor.
8956 // It calls MJ to repartition these to 10 coordinates.
8957 // MJ runs with 1K processor, 10 coordinate in each, and partitions to
8958 // 10 parts. As expected strong scaling is problem here, because
8959 // computation is almost 0, and communication cost of MJ linearly increases.
8960 // Premigration option gathers the coordinates to 10 parts before MJ starts
8961 // therefore MJ will run with a smalller subset of the problem.
8962 // Below, I am migrating the coordinates if mj_premigration_option is set,
8963 // and the result parts are less than the current part count, and the
8964 // average number of local coordinates is less than some threshold.
8965 // For example, premigration may not help if 1000 processors are
8966 // partitioning data to 10, but each of them already have 1M coordinate.
8967 // In that case, we premigration would not help.
8968 int current_world_size = this->mj_problemComm->getSize();
8969 mj_lno_t threshold_num_local_coords =
8970 this->min_coord_per_rank_for_premigration;
8971 bool is_pre_migrated = false;
8972 bool am_i_in_subset = true;
8973
8974 // Note that we need to add testing for migration and should also cover the
8975 // zoltan case when ZOLTAN2_MJ_ENABLE_ZOLTAN_MIGRATION is defined.
8976 // Currently did a minimal test of this code by running mjTest with
8977 // PM=1, TB=0 then run again with C=3 instead of C=4 (numProcs is 4).
8978 if(mj_premigration_option > 0 &&
8979 size_t (current_world_size) > this->num_global_parts &&
8980 this->num_global_coords < mj_gno_t (
8981 current_world_size * threshold_num_local_coords))
8982 {
8983 if(this->mj_keep_part_boxes) {
8984 throw std::logic_error("Multijagged: mj_keep_part_boxes and "
8985 "mj_premigration_option are not supported together yet.");
8986 }
8987
8988 is_pre_migrated =true;
8989 int migration_selection_option = mj_premigration_option;
8990 if(migration_selection_option * this->num_global_parts >
8991 (size_t) (current_world_size)) {
8992 migration_selection_option =
8993 current_world_size / this->num_global_parts;
8994 }
8995
8996 int used_num_ranks = int (this->num_global_coords /
8997 float (threshold_num_local_coords) + 0.5);
8998
8999 if(used_num_ranks == 0) {
9000 used_num_ranks = 1;
9001 }
9002
9003 am_i_in_subset = this->mj_premigrate_to_subset(
9004 used_num_ranks,
9005 migration_selection_option,
9006 this->mj_env,
9007 this->mj_problemComm,
9008 this->coord_dim,
9009 this->num_local_coords,
9010 this->num_global_coords,
9011 this->num_global_parts,
9012 this->initial_mj_gnos,
9013 this->mj_coordinates,
9014 this->num_weights_per_coord,
9015 this->mj_weights,
9016 //results
9017 result_problemComm,
9018 result_num_local_coords,
9019 result_initial_mj_gnos,
9020 result_mj_coordinates,
9021 result_mj_weights,
9022 result_actual_owner_rank);
9023
9024 result_initial_mj_gnos_ = result_initial_mj_gnos;
9025 }
9026
9027 Kokkos::View<mj_part_t *, device_t> result_assigned_part_ids;
9028 Kokkos::View<mj_gno_t*, device_t> result_mj_gnos;
9029
9030 this->mj_env->timerStop(MACRO_TIMERS, timer_base_string + "setup");
9031
9032 if(am_i_in_subset) {
9033 this->mj_partitioner.multi_jagged_part(
9034 this->mj_env,
9035 result_problemComm, //this->mj_problemComm,
9036 this->imbalance_tolerance,
9037 this->num_teams,
9038 this->num_global_parts,
9039 this->part_no_array,
9040 this->recursion_depth,
9041 this->coord_dim,
9042 result_num_local_coords, //this->num_local_coords,
9043 this->num_global_coords,
9044 result_initial_mj_gnos_,
9045 result_mj_coordinates,
9046 this->num_weights_per_coord,
9047 this->mj_uniform_weights,
9048 result_mj_weights,
9049 this->mj_uniform_parts,
9050 result_assigned_part_ids,
9051 result_mj_gnos
9052 );
9053 }
9054
9055 this->mj_env->timerStart(MACRO_TIMERS, timer_base_string + "cleanup");
9056
9057 // Reorder results so that they match the order of the input
9058 std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid;
9059 localGidToLid.reserve(result_num_local_coords);
9060 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> host_result_initial_mj_gnos(
9061 Kokkos::ViewAllocateWithoutInitializing("host_result_initial_mj_gnos"),
9062 result_initial_mj_gnos_.size());
9063 Kokkos::deep_copy(host_result_initial_mj_gnos, result_initial_mj_gnos_);
9064 for(mj_lno_t i = 0; i < result_num_local_coords; i++) {
9065 localGidToLid[host_result_initial_mj_gnos(i)] = i;
9066 }
9067
9068 ArrayRCP<mj_part_t> partId = arcp(new mj_part_t[result_num_local_coords],
9069 0, result_num_local_coords, true);
9070 auto host_result_assigned_part_ids =
9071 Kokkos::create_mirror_view(result_assigned_part_ids);
9072 Kokkos::deep_copy(host_result_assigned_part_ids, result_assigned_part_ids);
9073 auto host_result_mj_gnos = Kokkos::create_mirror_view(result_mj_gnos);
9074 Kokkos::deep_copy(host_result_mj_gnos, result_mj_gnos);
9075 for(mj_lno_t i = 0; i < result_num_local_coords; i++) {
9076 mj_lno_t origLID = localGidToLid[host_result_mj_gnos(i)];
9077 partId[origLID] = host_result_assigned_part_ids(i);
9078 }
9079
9080 //now the results are reordered. but if premigration occured,
9081 //then we need to send these ids to actual owners again.
9082 if(is_pre_migrated) {
9083 this->mj_env->timerStart(MACRO_TIMERS, timer_base_string +
9084 "PostMigration DistributorPlanCreating");
9085 Tpetra::Distributor distributor(this->mj_problemComm);
9086
9087 ArrayView<const mj_part_t> actual_owner_destinations(
9088 result_actual_owner_rank , result_num_local_coords);
9089
9090 mj_lno_t num_incoming_gnos = distributor.createFromSends(
9091 actual_owner_destinations);
9092
9093 if(num_incoming_gnos != this->num_local_coords) {
9094 throw std::logic_error("Zoltan2 - Multijagged Post Migration - "
9095 "num incoming is not equal to num local coords");
9096 }
9097
9098 mj_env->timerStop(MACRO_TIMERS, timer_base_string +
9099 "PostMigration DistributorPlanCreating");
9100 mj_env->timerStart(MACRO_TIMERS, timer_base_string +
9101 "PostMigration DistributorMigration");
9102
9103 Kokkos::View<mj_gno_t*, Kokkos::HostSpace> received_gnos(
9104 Kokkos::ViewAllocateWithoutInitializing("received_gnos"),
9105 num_incoming_gnos);
9106 Kokkos::View<mj_part_t*, Kokkos::HostSpace> received_partids(
9107 Kokkos::ViewAllocateWithoutInitializing("received_partids"),
9108 num_incoming_gnos);
9109
9110 distributor.doPostsAndWaits(host_result_initial_mj_gnos, 1,
9111 received_gnos);
9112 {
9113 Kokkos::View<mj_part_t*, Kokkos::HostSpace> sent_partnos;
9114 if (partId.size() > 0) {
9115 sent_partnos = Kokkos::View<mj_part_t*, Kokkos::HostSpace>(
9116 partId.getRawPtr(), partId.size()); //unmanaged
9117 }
9118 distributor.doPostsAndWaits(sent_partnos, 1, received_partids);
9119 }
9120
9121 partId = arcp(new mj_part_t[this->num_local_coords],
9122 0, this->num_local_coords, true);
9123
9124 {
9125 std::unordered_map<mj_gno_t, mj_lno_t> localGidToLid2;
9126 localGidToLid2.reserve(this->num_local_coords);
9127 auto host_initial_mj_gnos =
9128 Kokkos::create_mirror_view(this->initial_mj_gnos);
9129 Kokkos::deep_copy(host_initial_mj_gnos,
9130 this->initial_mj_gnos);
9131 for(mj_lno_t i = 0; i < this->num_local_coords; i++) {
9132 localGidToLid2[host_initial_mj_gnos(i)] = i;
9133 }
9134
9135 for(mj_lno_t i = 0; i < this->num_local_coords; i++) {
9136 mj_lno_t origLID = localGidToLid2[received_gnos[i]];
9137 partId[origLID] = received_partids[i];
9138 }
9139 }
9140
9141 {
9142 delete [] result_actual_owner_rank;
9143 }
9144 mj_env->timerStop(MACRO_TIMERS,
9145 timer_base_string + "PostMigration DistributorMigration");
9146 }
9147 solution->setParts(partId);
9148 this->mj_env->timerStop(MACRO_TIMERS, timer_base_string + "cleanup");
9149 }
9150
9151 this->mj_env->timerStop(MACRO_TIMERS, timer_base_string + "all");
9152
9153 // reset the view (release the reference to device data)
9154 this->mj_coordinates = Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t>();
9155}
9156
9157/* \brief Sets the partitioning data for multijagged algorithm.
9158 * */
9159template <typename Adapter>
9161 const RCP<PartitioningSolution<Adapter> > &solution
9162)
9163{
9164 modelFlag_t flags;
9165 CoordinateModel<Adapter> mj_coords(mj_adapter, mj_env, mj_problemComm, flags);
9166
9167 this->coord_dim = mj_coords.getCoordinateDim();
9168 this->num_weights_per_coord = mj_coords.getNumWeightsPerCoordinate();
9169 this->num_local_coords = mj_coords.getLocalNumCoordinates();
9170 this->num_global_coords = mj_coords.getGlobalNumCoordinates();
9171
9172 int criteria_dim = (this->num_weights_per_coord ?
9173 this->num_weights_per_coord : 1);
9174 // From the Solution we get part information.
9175 // If the part sizes for a given criteria are not uniform,
9176 // then they are values that sum to 1.0.
9177 this->num_global_parts = solution->getTargetGlobalNumberOfParts();
9178 // allocate only two dimensional pointer.
9179 // raw pointer addresess will be obtained from multivector.
9180 this->mj_uniform_parts = Kokkos::View<bool *, Kokkos::HostSpace>(
9181 "uniform parts", criteria_dim);
9182 this->mj_uniform_weights = Kokkos::View<bool *, Kokkos::HostSpace>(
9183 "uniform weights", criteria_dim);
9184
9185 Kokkos::View<const mj_gno_t *, device_t> gnos;
9186 Kokkos::View<adapter_scalar_t **, Kokkos::LayoutLeft, device_t> xyz_adapter;
9187 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
9188 Kokkos::View<adapter_scalar_t **, device_t> wgts_adapter;
9189 mj_coords.getCoordinatesKokkos(gnos, xyz_adapter, wgts_adapter);
9190 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
9191 Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t> xyz;
9192 Kokkos::View<mj_scalar_t **, device_t> wgts;
9193
9194 // Now we must get the data from the adapter.
9195 // If the types match we point to the view but if not, we must copy.
9196 if(std::is_same<mj_scalar_t, adapter_scalar_t>()) {
9197 // we can just point the views but we must specialize because this code
9198 // only compiles in this case - for is_same false assign does nothing.
9199 assign_if_same(xyz, xyz_adapter);
9200 assign_if_same(wgts, wgts_adapter);
9201 }
9202 else {
9203 // we only allocate a new view if we are going to copy
9204 // coordinates in MJ are LayoutLeft since Tpetra Multivector gives LayoutLeft
9205 xyz = Kokkos::View<mj_scalar_t **, Kokkos::LayoutLeft, device_t>
9206 (Kokkos::ViewAllocateWithoutInitializing(
9207 "xyz"), xyz_adapter.extent(0), xyz_adapter.extent(1));
9208 wgts = Kokkos::View<mj_scalar_t **, device_t>(
9209 Kokkos::ViewAllocateWithoutInitializing("wgts"),
9210 wgts_adapter.extent(0), wgts_adapter.extent(1));
9211
9212 typedef typename Kokkos::View<mj_scalar_t **, device_t>::size_type view_size_t;
9213 Kokkos::parallel_for(
9214 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
9215 (0, xyz_adapter.extent(0)), KOKKOS_LAMBDA (int i) {
9216 for(view_size_t n = 0; n < xyz_adapter.extent(1); ++n) {
9217 xyz(i, n) = static_cast<mj_scalar_t>(xyz_adapter(i, n));
9218 }
9219 });
9220 Kokkos::parallel_for(
9221 Kokkos::RangePolicy<typename mj_node_t::execution_space, int>
9222 (0, wgts.extent(0)), KOKKOS_LAMBDA (int i) {
9223 for(view_size_t n = 0; n < wgts.extent(1); ++n) {
9224 wgts(i, n) = static_cast<mj_scalar_t>(wgts_adapter(i, n));
9225 }
9226 });
9227 }
9228
9229 // obtain global ids.
9230 this->initial_mj_gnos = gnos;
9231 // extract coordinates from multivector.
9232 this->mj_coordinates = xyz;
9233 // if no weights are provided set uniform weight.
9234
9235 if(this->num_weights_per_coord == 0) {
9236 this->mj_uniform_weights(0) = true;
9237 Kokkos::resize(this->mj_weights, 0, 0);
9238 }
9239 else{
9240 this->mj_weights = wgts;
9241 for(int wdim = 0; wdim < this->num_weights_per_coord; ++wdim) {
9242 this->mj_uniform_weights(wdim) = false;
9243 }
9244 }
9245
9246 for(int wdim = 0; wdim < criteria_dim; ++wdim) {
9247 if(solution->criteriaHasUniformPartSizes(wdim)) {
9248 this->mj_uniform_parts(wdim) = true;
9249 }
9250 else {
9251 printf("Error: MJ does not support non uniform target part weights\n");
9252 std::terminate();
9253 }
9254 }
9255}
9256
9257 /* \brief Sets the partitioning parameters for multijagged algorithm.
 * Every entry is optional. imbalance_tolerance and num_teams are only
 * overwritten when their entry is present (a non-positive tolerance is then
 * replaced by 10e-4); the remaining members are first reset to the defaults
 * assigned below and then overwritten by any entry that is present.
9258 * \param pl: is the parameter list provided to zoltan2 call
9259 * */
9260 template <typename Adapter>
9262 const Teuchos::ParameterList &pl)
9263 {
// The supplied "imbalance_tolerance" is stored with 1.0 subtracted from it.
9264 const Teuchos::ParameterEntry *pe = pl.getEntryPtr("imbalance_tolerance");
9265 if(pe) {
9266 double tol;
9267 tol = pe->getValue(&tol);
9268 this->imbalance_tolerance = tol - 1.0;
9269 }
9270
9271 // TODO: May be a more relaxed tolerance is needed. RCB uses 10%
// A non-positive stored tolerance falls back to 10e-4 (numerically 0.001).
9272 if(this->imbalance_tolerance <= 0) {
9273 this->imbalance_tolerance= 10e-4;
9274 }
9275
9276 // if an input partitioning array is provided.
// Start from an empty array; it is refilled below when "mj_parts" is present.
9277 Kokkos::resize(this->part_no_array, 0);
9278
9279 // the length of the input partitioning array.
9280 this->recursion_depth = 0;
9281
9282 if(pl.getPtr<int>("mj_num_teams")) {
9283 this->num_teams = pl.get<int>("mj_num_teams");
9284 }
9285
9286 if(pl.getPtr<Array <mj_part_t> >("mj_parts")) {
9287 auto mj_parts = pl.get<Array <mj_part_t> >("mj_parts");
9288 int mj_parts_size = static_cast<int>(mj_parts.size());
9289
9290 // build the view we'll have data on and copy values from host
9291 this->part_no_array = Kokkos::View<mj_part_t*, Kokkos::HostSpace>(
9292 "part_no_array", mj_parts_size);
9293 for(int i = 0; i < mj_parts_size; ++i) {
9294 this->part_no_array(i) = mj_parts.getRawPtr()[i];
9295 }
9296
// NOTE(review): the depth is one less than the number of entries copied
// above; the reason is not visible here - confirm the expected layout of
// "mj_parts".
9297 this->recursion_depth = mj_parts_size - 1;
9298 this->mj_env->debug(2, "mj_parts provided by user");
9299 }
9300
9301 // get mj specific parameters.
// Defaults; each is overwritten below when the matching entry is present.
9302 this->distribute_points_on_cut_lines = true;
9303 this->max_concurrent_part_calculation = 1;
9304
9305 this->mj_run_as_rcb = false;
9306 this->mj_premigration_option = 0;
9307 this->min_coord_per_rank_for_premigration = 32000;
9308
9309 int mj_user_recursion_depth = -1;
9310 this->mj_keep_part_boxes = false;
9311 this->check_migrate_avoid_migration_option = 0;
9312 this->migration_type = 0;
9313 this->minimum_migration_imbalance = 0.35;
9314
// Like the tolerance above, the supplied value is stored with 1.0 subtracted.
9315 pe = pl.getEntryPtr("mj_minimum_migration_imbalance");
9316 if(pe) {
9317 double imb;
9318 imb = pe->getValue(&imb);
9319 this->minimum_migration_imbalance = imb - 1.0;
9320 }
9321
9322 pe = pl.getEntryPtr("mj_migration_option");
9323 if(pe) {
9324 this->check_migrate_avoid_migration_option =
9325 pe->getValue(&this->check_migrate_avoid_migration_option);
9326 } else {
9327 this->check_migrate_avoid_migration_option = 0;
9328 }
// Any option value above 1 is stored as -1.
9329 if(this->check_migrate_avoid_migration_option > 1) {
9330 this->check_migrate_avoid_migration_option = -1;
9331 }
9332
9334 pe = pl.getEntryPtr("mj_migration_type");
9335 if(pe) {
9336 this->migration_type = pe->getValue(&this->migration_type);
9337 } else {
9338 this->migration_type = 0;
9339 }
9340
9341 //std::cout << " this->migration_type:" << this->migration_type << std::endl;
9343
9344 pe = pl.getEntryPtr("mj_concurrent_part_count");
9345 if(pe) {
9346 this->max_concurrent_part_calculation =
9347 pe->getValue(&this->max_concurrent_part_calculation);
9348 } else {
9349 this->max_concurrent_part_calculation = 1; // Set to 1 if not provided.
9350 }
9351
9352 pe = pl.getEntryPtr("mj_keep_part_boxes");
9353 if(pe) {
9354 this->mj_keep_part_boxes = pe->getValue(&this->mj_keep_part_boxes);
9355 } else {
9356 this->mj_keep_part_boxes = false; // default: part boxes are not kept
9357 }
9358
9359 // For now, need keep_part_boxes to do pointAssign and boxAssign.
9360 // pe = pl.getEntryPtr("keep_cuts");
9361 // if(pe) {
9362 // int tmp = pe->getValue(&tmp);
9363 // if(tmp) this->mj_keep_part_boxes = true;
9364 // }
9365
9366 //need to keep part boxes if mapping type is geometric.
// A "mapping_type" of 0 forces the part boxes to be kept.
9367 if(this->mj_keep_part_boxes == false) {
9368 pe = pl.getEntryPtr("mapping_type");
9369 if(pe) {
9370 int mapping_type = -1;
9371 mapping_type = pe->getValue(&mapping_type);
9372 if(mapping_type == 0) {
9373 mj_keep_part_boxes = true;
9374 }
9375 }
9376 }
9377
9378 // "mj_enable_rcb" selects whether MJ runs as RCB (mj_run_as_rcb).
9379 pe = pl.getEntryPtr("mj_enable_rcb");
9380 if(pe) {
9381 this->mj_run_as_rcb = pe->getValue(&this->mj_run_as_rcb);
9382 } else {
9383 this->mj_run_as_rcb = false; // default: do not run as RCB
9384 }
9385
9386 pe = pl.getEntryPtr("mj_premigration_option");
9387 if(pe) {
9388 mj_premigration_option = pe->getValue(&mj_premigration_option);
9389 } else {
9390 mj_premigration_option = 0;
9391 }
9392
9393 pe = pl.getEntryPtr("mj_premigration_coordinate_count");
9394 if(pe) {
// NOTE(review): the argument is &mj_premigration_option although the target
// is min_coord_per_rank_for_premigration. Every other read here passes the
// address of the variable being assigned - looks like a copy/paste slip;
// confirm both members have the same type before changing it.
9395 min_coord_per_rank_for_premigration = pe->getValue(&mj_premigration_option);
9396 } else {
9397 min_coord_per_rank_for_premigration = 32000;
9398 }
9399
9400 pe = pl.getEntryPtr("mj_recursion_depth");
9401 if(pe) {
9402 mj_user_recursion_depth = pe->getValue(&mj_user_recursion_depth);
9403 } else {
9404 mj_user_recursion_depth = -1; // Set to invalid value
9405 }
9406
// "rectilinear" == true turns off distributing points that lie on cut lines.
9407 bool val = false;
9408 pe = pl.getEntryPtr("rectilinear");
9409 if(pe) {
9410 val = pe->getValue(&val);
9411 }
9412 if(val) {
9413 this->distribute_points_on_cut_lines = false;
9414 } else {
9415 this->distribute_points_on_cut_lines = true;
9416 }
9417
// When running as RCB the user depth is replaced by ceil(log2(num_global_parts)).
9418 if(this->mj_run_as_rcb) {
9419 mj_user_recursion_depth =
9420 (int)(ceil(log ((this->num_global_parts)) / log (2.0)));
9421 }
// Only when "mj_parts" did not already give a depth of at least 1: use the
// user/RCB depth if it is positive, otherwise one level per coordinate dim.
9422 if(this->recursion_depth < 1) {
9423 if(mj_user_recursion_depth > 0) {
9424 this->recursion_depth = mj_user_recursion_depth;
9425 }
9426 else {
9427 this->recursion_depth = this->coord_dim;
9428 }
9429 }
9430 }
9431
/* \brief Reports the parts whose kept part boxes overlap a query box.
 * \param dim: dimension count handed to boxesOverlap with lower and upper
 * \param lower: presumably the low corner of the query box, one value per dim
 * \param upper: presumably the high corner of the query box, one value per dim
 * \param nPartsFound: output; number of part boxes that overlap the query
 *        box. Stays 0 when the query box does not overlap the global box.
 * \param partsFound: output; *partsFound is set to NULL, or to an array of
 *        nPartsFound part ids allocated here with new[]. This function does
 *        not free it.
 * Throws std::logic_error when part boxes were not kept or none exist.
 * */
9433 template <typename Adapter>
9435 int dim,
9436 adapter_scalar_t *lower,
9437 adapter_scalar_t *upper,
9438 size_t &nPartsFound,
9439 typename Adapter::part_t **partsFound) const
9440 {
9441 // TODO: Implement with cuts rather than boxes to reduce algorithmic
9442 // TODO: complexity. Or at least do a search through the boxes, using
9443 // TODO: p x q x r x ... if possible.
9444
// Outputs start out as "nothing found".
9445 nPartsFound = 0;
9446 *partsFound = NULL;
9447
9448 if(this->mj_keep_part_boxes) {
9449
9450 // Get vector of part boxes
9451 RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
9452
9453 size_t nBoxes = (*partBoxes).size();
9454 if(nBoxes == 0) {
9455 throw std::logic_error("no part boxes exist");
9456 }
9457
9458 // Determine whether the box overlaps the globalBox at all
9459 RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
9460
9461 if(globalBox->boxesOverlap(dim, lower, upper)) {
9462
9463 std::vector<typename Adapter::part_t> partlist;
9464
9465 // box overlaps the global box; find specific overlapping boxes
9466 for(size_t i = 0; i < nBoxes; i++) {
// NOTE(review): no handler for this try is visible in the listing (the
// numbering skips 9489) - confirm against the original file.
9467 try {
9468 if((*partBoxes)[i].boxesOverlap(dim, lower, upper)) {
9469 nPartsFound++;
9470 partlist.push_back((*partBoxes)[i].getpId());
9471 /*
9472 std::cout << "Given box (";
9473 for(int j = 0; j < dim; j++)
9474 std::cout << lower[j] << " ";
9475 std::cout << ") x (";
9476 for(int j = 0; j < dim; j++)
9477 std::cout << upper[j] << " ";
9478 std::cout << ") overlaps PartBox "
9479 << (*partBoxes)[i].getpId() << " (";
9480 for(int j = 0; j < dim; j++)
9481 std::cout << (*partBoxes)[i].getlmins()[j] << " ";
9482 std::cout << ") x (";
9483 for(int j = 0; j < dim; j++)
9484 std::cout << (*partBoxes)[i].getlmaxs()[j] << " ";
9485 std::cout << ")" << std::endl;
9486 */
9487 }
9488 }
9490 }
// Copy the collected ids into a new[] array handed back through partsFound.
9491 if(nPartsFound) {
9492 *partsFound = new mj_part_t[nPartsFound];
9493 for(size_t i = 0; i < nPartsFound; i++)
9494 (*partsFound)[i] = partlist[i];
9495 }
9496 }
9497 else {
9498 // Box does not overlap the domain at all. Find the closest part
9499 // Not sure how to perform this operation for MJ without having the
9500 // cuts. With the RCB cuts, the concept of a part extending to
9501 // infinity was natural. With the boxes, it is much more difficult.
9502 // TODO: For now, return information indicating NO OVERLAP.
9503 }
9504 }
9505 else {
// NOTE(review): the message names keep_cuts, but the flag tested above is
// mj_keep_part_boxes.
9506 throw std::logic_error("need to use keep_cuts parameter for boxAssign");
9507 }
9508 }
9509
/* \brief Returns the id of the part that a point is assigned to.
 * A point inside the global box gets the id of the first part box that
 * contains it. A point outside the global box gets the id of the part box
 * whose centroid has the smallest squared distance to the point.
 * \param dim: number of entries read from point
 * \param point: coordinates of the query point
 * Throws std::logic_error when part boxes were not kept, when none exist, or
 * when a point inside the global box lies in no part box.
 * */
9511 template <typename Adapter>
9513 int dim,
9514 adapter_scalar_t *point) const
9515 {
9516 // TODO: Implement with cuts rather than boxes to reduce algorithmic
9517 // TODO: complexity. Or at least do a search through the boxes, using
9518 // TODO: p x q x r x ... if possible.
9519
9520 if(this->mj_keep_part_boxes) {
9521 typename Adapter::part_t foundPart = -1;
9522
9523 // Get vector of part boxes
9524 RCP<mj_partBoxVector_t> partBoxes = this->getGlobalBoxBoundaries();
9525
9526 size_t nBoxes = (*partBoxes).size();
9527 if(nBoxes == 0) {
9528 throw std::logic_error("no part boxes exist");
9529 }
9530
9531 // Determine whether the point is within the global domain
9532 RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
9533
9534 if(globalBox->pointInBox(dim, point)) {
9535
9536 // point is in the global domain; determine in which part it is.
// i is declared outside the loop so the "not found" test below can read it.
9537 size_t i;
9538 for(i = 0; i < nBoxes; i++) {
// NOTE(review): no handler for this try is visible in the listing (the
// numbering skips 9550) - confirm against the original file.
9539 try {
9540 if((*partBoxes)[i].pointInBox(dim, point)) {
9541 foundPart = (*partBoxes)[i].getpId();
9542 // std::cout << "Point (";
9543 // for(int j = 0; j < dim; j++) std::cout << point[j] << " ";
9544 // std::cout << ") found in box " << i << " part " << foundPart
9545 // << std::endl;
9546 // (*partBoxes)[i].print();
9547 break;
9548 }
9549 }
9551 }
9552
// The loop ran to the end without a break: no part box contains the point.
9553 if(i == nBoxes) {
9554 // This error should never occur
9555 std::ostringstream oss;
9556 oss << "Point (";
9557 for(int j = 0; j < dim; j++) oss << point[j] << " ";
9558 oss << ") not found in domain";
9559 throw std::logic_error(oss.str());
9560 }
9561 }
9562
9563 else {
9564 // Point is outside the global domain.
9565 // Determine to which part it is closest.
9566 // TODO: with cuts, would not need this special case
9567
9568 typedef typename Zoltan2::coordinateModelPartBox::coord_t coord_t;
9569 size_t closestBox = 0;
9570 coord_t minDistance = std::numeric_limits<coord_t>::max();
// Scratch buffer reused for every box; released with delete [] below.
9571 coord_t *centroid = new coord_t[dim];
9572 for(size_t i = 0; i < nBoxes; i++) {
9573 (*partBoxes)[i].computeCentroid(centroid);
// sum accumulates the squared distance; no square root is taken because
// only the ordering of the distances matters here.
9574 coord_t sum = 0.;
9575 coord_t diff;
9576 for(int j = 0; j < dim; j++) {
9577 diff = centroid[j] - point[j];
9578 sum += diff * diff;
9579 }
9580 if(sum < minDistance) {
9581 minDistance = sum;
9582 closestBox = i;
9583 }
9584 }
9585 foundPart = (*partBoxes)[closestBox].getpId();
9586 delete [] centroid;
9587 }
9588
9589 return foundPart;
9590 }
9591 else {
// NOTE(review): the message names keep_cuts, but the flag tested above is
// mj_keep_part_boxes.
9592 throw std::logic_error("need to use keep_cuts parameter for pointAssign");
9593 }
9594 }
9595
/* \brief Hands back the adjacency arrays that GridHash builds from the part
 * boxes.
 * The arrays are built on the first call (while both cached raw pointers are
 * NULL) and stored in comXAdj_ / comAdj_; later calls return the cached
 * arrays.
 * \param solution: not read anywhere in the visible body
 * \param comXAdj: output; receives comXAdj_
 * \param comAdj: output; receives comAdj_
 * */
9596 template <typename Adapter>
9598 const PartitioningSolution<Adapter> *solution,
9599 ArrayRCP<typename Zoltan2_AlgMJ<Adapter>::mj_part_t> &comXAdj,
9600 ArrayRCP<typename Zoltan2_AlgMJ<Adapter>::mj_part_t> &comAdj)
9601 {
9602 if(comXAdj_.getRawPtr() == NULL && comAdj_.getRawPtr() == NULL) {
9603 RCP<mj_partBoxVector_t> pBoxes = this->getGlobalBoxBoundaries();
9604 mj_part_t ntasks = (*pBoxes).size();
// NOTE(review): element 0 is read without checking that the vector is
// non-empty - confirm that callers guarantee at least one box.
9605 int dim = (*pBoxes)[0].getDim();
9606 GridHash grid(pBoxes, ntasks, dim);
9607 grid.getAdjArrays(comXAdj_, comAdj_);
9608 }
9609 comAdj = comAdj_;
9610 comXAdj = comXAdj_;
9611 }
9612
/* \brief Returns the part boxes kept by the partitioner, i.e. the result of
 * mj_partitioner.get_kept_boxes().
 * */
9613 template <typename Adapter>
9614 RCP<typename Zoltan2_AlgMJ<Adapter>::mj_partBoxVector_t>
9616 {
9617 return this->mj_partitioner.get_kept_boxes();
9618 }
9619} // namespace Zoltan2
9620
9621#endif
Defines the CoordinateModel classes.
#define Z2_FORWARD_EXCEPTIONS
Forward an exception back through call stack.
#define Z2_ASSERT_VALUE(actual, expected)
Throw an error when actual value is not equal to expected value.
#define Z2_THROW_OUTSIDE_ERROR(env)
Throw an error returned from outside the Zoltan2 library.
Define IntegerRangeList validator.
Contains Teuchos reduction operators for the Multi-jagged algorithm.
Defines Parameter related enumerators, declares functions.
A gathering of useful namespace methods.
Zoltan2_BoxBoundaries is a reduction operation used to all-reduce all of the box boundaries.
void reduce(const Ordinal count, const T inBuffer[], T inoutBuffer[]) const
Implement Teuchos::ValueTypeReductionOp interface.
Zoltan2_BoxBoundaries()
Default Constructor.
Zoltan2_BoxBoundaries(Ordinal s_)
Constructor.
Multi Jagged coordinate partitioning algorithm.
void set_partitioning_parameters(bool distribute_points_on_cut_lines_, int max_concurrent_part_calculation_, int check_migrate_avoid_migration_option_, double minimum_migration_imbalance_, int migration_type_=0)
Multi Jagged coordinate partitioning algorithm.
RCP< mj_partBoxVector_t > compute_global_box_boundaries(RCP< mj_partBoxVector_t > &localPartBoxes) const
DOCWORK: Documentation.
void sequential_task_partitioning(const RCP< const Environment > &env, mj_lno_t num_total_coords, mj_lno_t num_selected_coords, size_t num_target_part, int coord_dim, Kokkos::View< mj_scalar_t **, Kokkos::LayoutLeft, device_t > &mj_coordinates_, Kokkos::View< mj_lno_t *, device_t > &initial_selected_coords_output_permutation, mj_lno_t *output_xadj, int recursion_depth_, const Kokkos::View< mj_part_t *, Kokkos::HostSpace > &part_no_array, bool partition_along_longest_dim, int num_ranks_per_node, bool divide_to_prime_first_, mj_part_t num_first_level_parts_=1, const Kokkos::View< mj_part_t *, Kokkos::HostSpace > &first_level_distribution_=Kokkos::View< mj_part_t *, Kokkos::HostSpace >())
Special function for partitioning for task mapping. Runs sequential, and performs deterministic parti...
void multi_jagged_part(const RCP< const Environment > &env, RCP< const Comm< int > > &problemComm, double imbalance_tolerance, int num_teams, size_t num_global_parts, Kokkos::View< mj_part_t *, Kokkos::HostSpace > &part_no_array, int recursion_depth, int coord_dim, mj_lno_t num_local_coords, mj_gno_t num_global_coords, Kokkos::View< const mj_gno_t *, device_t > &initial_mj_gnos, Kokkos::View< mj_scalar_t **, Kokkos::LayoutLeft, device_t > &mj_coordinates, int num_weights_per_coord, Kokkos::View< bool *, Kokkos::HostSpace > &mj_uniform_weights, Kokkos::View< mj_scalar_t **, device_t > &mj_weights, Kokkos::View< bool *, Kokkos::HostSpace > &mj_uniform_parts, Kokkos::View< mj_part_t *, device_t > &result_assigned_part_ids, Kokkos::View< mj_gno_t *, device_t > &result_mj_gnos)
Multi Jagged coordinate partitioning algorithm.
RCP< mj_partBoxVector_t > get_kept_boxes() const
DOCWORK: Documentation.
AlgMJ()
Multi Jagged coordinate partitioning algorithm default constructor.
RCP< mj_partBox_t > get_global_box() const
DOCWORK: Documentation.
void set_to_keep_part_boxes()
Function call, if the part boxes are intended to be kept.
Algorithm defines the base class for all algorithms.
This class provides geometric coordinates with optional weights to the Zoltan2 algorithm.
global_size_t getGlobalNumCoordinates() const
Returns the global number coordinates.
size_t getCoordinatesKokkos(Kokkos::View< const gno_t *, typename node_t::device_type > &Ids, Kokkos::View< scalar_t **, Kokkos::LayoutLeft, typename node_t::device_type > &xyz, Kokkos::View< scalar_t **, typename node_t::device_type > &wgts) const
Returns the coordinate ids, values and optional weights.
int getCoordinateDim() const
Returns the dimension of the coordinates.
size_t getLocalNumCoordinates() const
Returns the number of coordinates on this process.
int getNumWeightsPerCoordinate() const
Returns the number (0 or greater) of weights per coordinate.
GridHash Class, Hashing Class for part boxes.
void getAdjArrays(ArrayRCP< part_t > &comXAdj_, ArrayRCP< part_t > &comAdj_)
GridHash Class, returns the adj arrays.
A ParameterList validator for integer range lists.
A PartitioningSolution is a solution to a partitioning problem.
Multi Jagged coordinate partitioning algorithm.
Zoltan2_AlgMJ(const RCP< const Environment > &env, RCP< const Comm< int > > &problemComm, const RCP< const typename Adapter::base_adapter_t > &adapter)
static void getValidParameters(ParameterList &pl)
Set up validators specific to this algorithm.
mj_partBoxVector_t & getPartBoxesView() const
for partitioning methods, return bounding boxes of the
coordinateModelPartBox Class, represents the boundaries of the box which is a result of a geometric p...
Class for sorting items with multiple values. First sorting with respect to val[0],...
void set(IT index_, CT count_, WT *vals_)
bool operator<(const uMultiSortItem< IT, CT, WT > &other) const
uMultiSortItem(IT index_, CT count_, WT *vals_)
Created by mbenlioglu on Aug 31, 2020.
Tpetra::global_size_t global_size_t
std::bitset< NUM_MODEL_FLAGS > modelFlag_t
@ MACRO_TIMERS
Time an algorithm (or other entity) as a whole.
void uqsort(IT n, uSortItem< IT, WT > *arr)
Quick sort function. Sorts the arr of uSortItems, with respect to increasing vals....
void uqSignsort(IT n, uSignedSortItem< IT, WT, SIGN > *arr)
Quick sort function. Sorts the arr of uSignedSortItems, with respect to increasing vals.
#define epsilon
Definition: nd.cpp:82
static RCP< tMVector_t > coordinates
SparseMatrixAdapter_t::part_t part_t
KOKKOS_INLINE_FUNCTION value_type & reference() const
KOKKOS_INLINE_FUNCTION void join(value_type &dst, const value_type &src) const
Zoltan2_MJArrayType< scalar_t > value_type
KOKKOS_INLINE_FUNCTION void join(volatile value_type &dst, const volatile value_type &src) const
KOKKOS_INLINE_FUNCTION ArrayCombinationReducer(scalar_t mj_max_scalar, value_type &val, int mj_value_count_rightleft, int mj_value_count_weights)
KOKKOS_INLINE_FUNCTION void init(value_type &dst) const
Zoltan2_MJArrayType< scalar_t > value_type
KOKKOS_INLINE_FUNCTION void init(value_type &dst) const
KOKKOS_INLINE_FUNCTION ArrayReducer(value_type &val, int mj_value_count)
KOKKOS_INLINE_FUNCTION void join(value_type &dst, const value_type &src) const
KOKKOS_INLINE_FUNCTION value_type & reference() const
KOKKOS_INLINE_FUNCTION void init(value_type dst) const
Kokkos::View< part_t *, device_t > parts
Kokkos::View< scalar_t * > scalar_view_t
Kokkos::View< index_t *, device_t > part_xadj
ReduceArrayFunctor(part_t mj_concurrent_current_part, part_t mj_weight_array_size, Kokkos::View< index_t *, device_t > &mj_permutations, Kokkos::View< scalar_t *, device_t > &mj_coordinates, Kokkos::View< part_t *, device_t > &mj_parts, Kokkos::View< index_t *, device_t > &mj_part_xadj, Kokkos::View< index_t *, device_t > &mj_track_on_cuts)
Kokkos::View< index_t *, device_t > track_on_cuts
Kokkos::View< scalar_t *, device_t > coordinates
size_t team_shmem_size(int team_size) const
Kokkos::View< index_t *, device_t > permutations
KOKKOS_INLINE_FUNCTION void join(value_type dst, const value_type src) const
Kokkos::View< scalar_t *, device_t > cut_coordinates
KOKKOS_INLINE_FUNCTION void init(value_type dst) const
Kokkos::View< scalar_t **, device_t > weights
ReduceWeightsFunctor(int mj_loop_count, array_t mj_max_scalar, part_t mj_concurrent_current_part, part_t mj_num_cuts, part_t mj_current_work_part, part_t mj_current_concurrent_num_parts, part_t mj_left_right_array_size, part_t mj_weight_array_size, Kokkos::View< index_t *, device_t > &mj_permutations, Kokkos::View< scalar_t *, device_t > &mj_coordinates, Kokkos::View< scalar_t **, device_t > &mj_weights, Kokkos::View< part_t *, device_t > &mj_parts, Kokkos::View< scalar_t *, device_t > &mj_cut_coordinates, Kokkos::View< index_t *, device_t > &mj_part_xadj, bool mj_uniform_weights0, scalar_t mj_sEpsilon)
KOKKOS_INLINE_FUNCTION void join(value_type dst, const value_type src) const
Kokkos::View< scalar_t *, device_t > coordinates
Kokkos::View< part_t *, device_t > parts
size_t team_shmem_size(int team_size) const
Kokkos::View< index_t *, device_t > part_xadj
Kokkos::View< index_t *, device_t > permutations
KOKKOS_INLINE_FUNCTION void operator()(const member_type &teamMember, value_type teamSum) const
Kokkos::View< scalar_t * > scalar_view_t
Zoltan2_MJArrayType< scalar_t > & operator=(const volatile Zoltan2_MJArrayType< scalar_t > &zmj)
KOKKOS_INLINE_FUNCTION Zoltan2_MJArrayType()
KOKKOS_INLINE_FUNCTION Zoltan2_MJArrayType(scalar_t *pSetPtr)
bool operator<=(const uSignedSortItem< IT, WT, SIGN > &rhs)
bool operator<(const uSignedSortItem< IT, WT, SIGN > &rhs) const
Sort items for quick sort function.