40#ifndef TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
41#define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
43#include "TpetraCore_config.h"
44#include "Teuchos_Array.hpp"
45#include "Teuchos_ArrayView.hpp"
46#include "Teuchos_OrdinalTraits.hpp"
54#include "Kokkos_Core.hpp"
86namespace UnpackAndCombineCrsMatrixImpl {
97template<
class ST,
class LO,
class GO>
102 const char imports[],
105 const size_t num_ent,
106 const size_t bytes_per_value)
112 bool unpack_pids = pids_out.size() > 0;
114 const size_t num_ent_beg = offset;
117 const size_t gids_beg = num_ent_beg + num_ent_len;
118 const size_t gids_len =
121 const size_t pids_beg = gids_beg + gids_len;
122 const size_t pids_len = unpack_pids ?
126 const size_t vals_beg = gids_beg + gids_len + pids_len;
127 const size_t vals_len = num_ent * bytes_per_value;
129 const char*
const num_ent_in = imports + num_ent_beg;
130 const char*
const gids_in = imports + gids_beg;
131 const char*
const pids_in = unpack_pids ? imports + pids_beg :
nullptr;
132 const char*
const vals_in = imports + vals_beg;
134 size_t num_bytes_out = 0;
137 if (
static_cast<size_t> (num_ent_out) != num_ent) {
142 Kokkos::pair<int, size_t> p;
147 num_bytes_out += p.second;
154 num_bytes_out += p.second;
161 num_bytes_out += p.second;
164 const size_t expected_num_bytes = num_ent_len + gids_len + pids_len + vals_len;
165 if (num_bytes_out != expected_num_bytes) {
181template<
class LocalMatrix,
class LocalMap,
class BufferDeviceType>
183 typedef LocalMatrix local_matrix_type;
186 typedef typename local_matrix_type::value_type ST;
190 typedef typename DT::execution_space XS;
192 typedef Kokkos::View<const size_t*, BufferDeviceType>
193 num_packets_per_lid_type;
194 typedef Kokkos::View<const size_t*, DT> offsets_type;
195 typedef Kokkos::View<const char*, BufferDeviceType> input_buffer_type;
196 typedef Kokkos::View<const LO*, BufferDeviceType> import_lids_type;
198 typedef Kokkos::View<int, DT> error_type;
199 using member_type =
typename Kokkos::TeamPolicy<XS>::member_type;
201 static_assert (std::is_same<LO, typename local_matrix_type::ordinal_type>::value,
202 "LocalMap::local_ordinal_type and "
203 "LocalMatrix::ordinal_type must be the same.");
205 local_matrix_type local_matrix;
207 input_buffer_type imports;
208 num_packets_per_lid_type num_packets_per_lid;
209 import_lids_type import_lids;
210 Kokkos::View<const LO*[2], DT> batch_info;
211 offsets_type offsets;
214 size_t bytes_per_value;
216 error_type error_code;
219 const local_matrix_type& local_matrix_in,
221 const input_buffer_type& imports_in,
222 const num_packets_per_lid_type& num_packets_per_lid_in,
223 const import_lids_type& import_lids_in,
224 const Kokkos::View<
const LO*[2], DT>& batch_info_in,
225 const offsets_type& offsets_in,
227 const size_t batch_size_in,
228 const size_t bytes_per_value_in,
229 const bool atomic_in) :
230 local_matrix (local_matrix_in),
231 local_col_map (local_col_map_in),
232 imports (imports_in),
233 num_packets_per_lid (num_packets_per_lid_in),
234 import_lids (import_lids_in),
235 batch_info (batch_info_in),
236 offsets (offsets_in),
237 combine_mode (combine_mode_in),
238 batch_size (batch_size_in),
239 bytes_per_value (bytes_per_value_in),
244 KOKKOS_INLINE_FUNCTION
245 void operator()(member_type team_member)
const
248 using Kokkos::subview;
249 using Kokkos::MemoryUnmanaged;
251 const LO batch = team_member.league_rank();
252 const LO lid_no = batch_info(batch, 0);
253 const LO batch_no = batch_info(batch, 1);
255 const size_t num_bytes = num_packets_per_lid(lid_no);
262 const LO import_lid = import_lids(lid_no);
263 const size_t buf_size = imports.size();
264 const size_t offset = offsets(lid_no);
268 const char*
const in_buf = imports.data() + offset;
270 const size_t num_entries_in_row =
static_cast<size_t>(num_ent_LO);
273 size_t expected_num_bytes = 0;
280 if (expected_num_bytes > num_bytes)
283#ifndef KOKKOS_ENABLE_SYCL
285 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
286 "At row %d, the expected number of bytes (%d) != number of unpacked bytes (%d)\n",
287 (
int) lid_no, (
int) expected_num_bytes, (
int) num_bytes
290 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 21);
294 if (offset > buf_size || offset + num_bytes > buf_size)
297#ifndef KOKKOS_ENABLE_SYCL
299 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
300 "At row %d, the offset (%d) > buffer size (%d)\n",
301 (
int) lid_no, (
int) offset, (
int) buf_size
304 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 22);
309 size_t num_entries_in_batch = 0;
310 if (num_entries_in_row <= batch_size)
311 num_entries_in_batch = num_entries_in_row;
312 else if (num_entries_in_row >= (batch_no + 1) * batch_size)
313 num_entries_in_batch = batch_size;
315 num_entries_in_batch = num_entries_in_row - batch_no * batch_size;
318 const size_t num_ent_start = offset;
319 const size_t num_ent_end = num_ent_start + bytes_per_lid;
322 const size_t gids_start = num_ent_end;
323 const size_t gids_end = gids_start + num_entries_in_row * bytes_per_gid;
325 const size_t vals_start = gids_end;
327 const size_t shift = batch_no * batch_size;
328 const char*
const num_ent_in = imports.data() + num_ent_start;
329 const char*
const gids_in = imports.data() + gids_start + shift * bytes_per_gid;
330 const char*
const vals_in = imports.data() + vals_start + shift * bytes_per_value;
334 if (
static_cast<size_t>(num_ent_out) != num_entries_in_row)
337#ifndef KOKKOS_ENABLE_SYCL
339 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
340 "At row %d, number of entries (%d) != number of entries unpacked (%d)\n",
341 (
int) lid_no, (
int) num_entries_in_row, (
int) num_ent_out
344 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 23);
347 constexpr bool matrix_has_sorted_rows =
true;
350 Kokkos::parallel_for(
351 Kokkos::TeamThreadRange(team_member, num_entries_in_batch),
357 distance = j * bytes_per_gid;
367 distance = j * bytes_per_value;
370 if (combine_mode ==
ADD) {
374 const bool use_atomic_updates = atomic;
375 (void)local_matrix.sumIntoValues(
380 matrix_has_sorted_rows,
383 }
else if (combine_mode ==
REPLACE) {
387 const bool use_atomic_updates =
false;
388 (void)local_matrix.replaceValues(
393 matrix_has_sorted_rows,
399#ifndef KOKKOS_ENABLE_SYCL
401 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
402 "At row %d, an unknown error occurred during unpack\n", (
int) lid_no
405 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 31);
410 team_member.team_barrier();
416 auto error_code_h = Kokkos::create_mirror_view_and_copy(
417 Kokkos::HostSpace(), error_code
419 return error_code_h();
424struct MaxNumEntTag {};
425struct TotNumEntTag {};
435template<
class LO,
class DT,
class BDT>
438 typedef Kokkos::View<const size_t*, BDT> num_packets_per_lid_type;
439 typedef Kokkos::View<const size_t*, DT> offsets_type;
440 typedef Kokkos::View<const char*, BDT> input_buffer_type;
443 typedef size_t value_type;
446 num_packets_per_lid_type num_packets_per_lid;
447 offsets_type offsets;
448 input_buffer_type imports;
452 const offsets_type& offsets_in,
453 const input_buffer_type& imports_in) :
454 num_packets_per_lid (num_packets_per_lid_in),
455 offsets (offsets_in),
459 KOKKOS_INLINE_FUNCTION
void
460 operator() (
const MaxNumEntTag,
const LO i, value_type& update)
const {
462 const size_t num_bytes = num_packets_per_lid(i);
465 const char*
const in_buf = imports.data () + offsets(i);
467 const size_t num_ent =
static_cast<size_t> (num_ent_LO);
469 update = (update < num_ent) ? num_ent : update;
473 KOKKOS_INLINE_FUNCTION
void
474 join (
const MaxNumEntTag,
476 const value_type& src)
const
478 if (dst < src) dst = src;
481 KOKKOS_INLINE_FUNCTION
void
482 operator() (
const TotNumEntTag,
const LO i, value_type& tot_num_ent)
const {
484 const size_t num_bytes = num_packets_per_lid(i);
487 const char*
const in_buf = imports.data () + offsets(i);
489 tot_num_ent +=
static_cast<size_t> (num_ent_LO);
501template<
class LO,
class DT,
class BDT>
504 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
505 const Kokkos::View<const size_t*, DT>& offsets,
506 const Kokkos::View<const char*, BDT>& imports)
508 typedef typename DT::execution_space XS;
509 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>,
510 MaxNumEntTag> range_policy;
514 const LO numRowsToUnpack =
515 static_cast<LO
> (num_packets_per_lid.extent (0));
516 size_t max_num_ent = 0;
517 Kokkos::parallel_reduce (
"Max num entries in CRS",
518 range_policy (0, numRowsToUnpack),
519 functor, max_num_ent);
530template<
class LO,
class DT,
class BDT>
533 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
534 const Kokkos::View<const size_t*, DT>& offsets,
535 const Kokkos::View<const char*, BDT>& imports)
537 typedef typename DT::execution_space XS;
538 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>, TotNumEntTag> range_policy;
539 size_t tot_num_ent = 0;
542 const LO numRowsToUnpack =
543 static_cast<LO
> (num_packets_per_lid.extent (0));
544 Kokkos::parallel_reduce (
"Total num entries in CRS to unpack",
545 range_policy (0, numRowsToUnpack),
546 functor, tot_num_ent);
551KOKKOS_INLINE_FUNCTION
553unpackRowCount(
const char imports[],
555 const size_t num_bytes)
561 const size_t p_num_bytes = PT::packValueCount(num_ent_LO);
562 if (p_num_bytes > num_bytes) {
563 return OrdinalTraits<size_t>::invalid();
565 const char*
const in_buf = imports + offset;
566 (void) PT::unpackValue(num_ent_LO, in_buf);
568 return static_cast<size_t>(num_ent_LO);
575template<
class View1,
class View2>
579 const View1& batches_per_lid,
583 using LO =
typename View2::value_type;
585 for (
size_t i=0; i<batches_per_lid.extent(0); i++)
587 for (
size_t batch_no=0; batch_no<batches_per_lid(i); batch_no++)
589 batch_info(batch, 0) =
static_cast<LO
>(i);
590 batch_info(batch, 1) = batch_no;
594 return batch == batch_info.extent(0);
604template<
class LocalMatrix,
class LocalMap,
class BufferDeviceType>
607 const LocalMatrix& local_matrix,
609 const Kokkos::View<const char*, BufferDeviceType>& imports,
610 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
614 using ST =
typename LocalMatrix::value_type;
617 using XS =
typename DT::execution_space;
618 const char prefix[] =
619 "Tpetra::Details::UnpackAndCombineCrsMatrixImpl::"
620 "unpackAndCombineIntoCrsMatrix: ";
622 const size_t num_import_lids =
static_cast<size_t>(import_lids.extent(0));
623 if (num_import_lids == 0) {
630 TEUCHOS_TEST_FOR_EXCEPTION(combine_mode ==
ABSMAX,
631 std::invalid_argument,
632 prefix <<
"ABSMAX combine mode is not yet implemented for a matrix that has a "
633 "static graph (i.e., was constructed with the CrsMatrix constructor "
634 "that takes a const CrsGraph pointer).");
636 TEUCHOS_TEST_FOR_EXCEPTION(combine_mode ==
INSERT,
637 std::invalid_argument,
638 prefix <<
"INSERT combine mode is not allowed if the matrix has a static graph "
639 "(i.e., was constructed with the CrsMatrix constructor that takes a "
640 "const CrsGraph pointer).");
643 TEUCHOS_TEST_FOR_EXCEPTION(!(combine_mode ==
ADD || combine_mode ==
REPLACE),
644 std::invalid_argument,
645 prefix <<
"Invalid combine mode; should never get "
646 "here! Please report this bug to the Tpetra developers.");
649 bool bad_num_import_lids =
650 num_import_lids !=
static_cast<size_t>(num_packets_per_lid.extent(0));
651 TEUCHOS_TEST_FOR_EXCEPTION(bad_num_import_lids,
652 std::invalid_argument,
653 prefix <<
"importLIDs.size() (" << num_import_lids <<
") != "
654 "numPacketsPerLID.size() (" << num_packets_per_lid.extent(0) <<
").");
658 Kokkos::View<size_t*, DT> offsets(
"offsets", num_import_lids+1);
662 size_t max_num_ent = compute_maximum_num_entries<LO,DT>(num_packets_per_lid, offsets, imports);
664 const size_t batch_size = std::min(default_batch_size, max_num_ent);
667 size_t num_batches = 0;
668 Kokkos::View<LO*[2], DT> batch_info(
"", num_batches);
669 Kokkos::View<size_t*, DT> batches_per_lid(
"", num_import_lids);
671 Kokkos::parallel_reduce(
672 Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t>>(0, num_import_lids),
673 KOKKOS_LAMBDA(
const size_t i,
size_t& batches)
675 const size_t num_entries_in_row = unpackRowCount<LO>(
676 imports.data(), offsets(i), num_packets_per_lid(i)
679 (num_entries_in_row <= batch_size) ?
681 num_entries_in_row / batch_size + (num_entries_in_row % batch_size != 0);
682 batches += batches_per_lid(i);
686 Kokkos::resize(batch_info, num_batches);
688 Kokkos::HostSpace host_space;
689 auto batches_per_lid_h = Kokkos::create_mirror_view(host_space, batches_per_lid);
691 Kokkos::deep_copy(XS(), batches_per_lid_h, batches_per_lid);
693 auto batch_info_h = Kokkos::create_mirror_view(host_space, batch_info);
697 Kokkos::deep_copy(XS(), batch_info, batch_info_h);
704 const bool atomic = XS().concurrency() != 1;
720 using policy = Kokkos::TeamPolicy<XS, Kokkos::IndexType<LO>>;
722#if defined(KOKKOS_ENABLE_CUDA)
723 constexpr bool is_cuda = std::is_same<XS, Kokkos::Cuda>::value;
725 constexpr bool is_cuda =
false;
727 if (!is_cuda || team_size == Teuchos::OrdinalTraits<size_t>::invalid())
729 Kokkos::parallel_for(policy(
static_cast<LO
>(num_batches), Kokkos::AUTO), f);
733 Kokkos::parallel_for(policy(
static_cast<LO
>(num_batches),
static_cast<int>(team_size)), f);
736 auto error_code = f.error();
737 TEUCHOS_TEST_FOR_EXCEPTION(
740 prefix <<
"UnpackCrsMatrixAndCombineFunctor reported error code " << error_code
744template<
class LocalMatrix,
class BufferDeviceType>
747 const LocalMatrix& local_matrix,
749 const Kokkos::View<const char*, BufferDeviceType>& imports,
750 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
751 const size_t num_same_ids)
753 using Kokkos::parallel_reduce;
754 typedef typename LocalMatrix::ordinal_type LO;
755 typedef typename LocalMatrix::device_type device_type;
756 typedef typename device_type::execution_space XS;
757 typedef typename Kokkos::View<LO*, device_type>::size_type size_type;
758 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO> > range_policy;
759 typedef BufferDeviceType BDT;
765 num_items =
static_cast<LO
>(num_same_ids);
768 parallel_reduce(range_policy(0, num_items),
769 KOKKOS_LAMBDA(
const LO lid,
size_t& update) {
770 update +=
static_cast<size_t>(local_matrix.graph.row_map[lid+1]
771 -local_matrix.graph.row_map[lid]);
777 num_items =
static_cast<LO
>(permute_from_lids.extent(0));
780 parallel_reduce(range_policy(0, num_items),
781 KOKKOS_LAMBDA(
const LO i,
size_t& update) {
782 const LO lid = permute_from_lids(i);
783 update +=
static_cast<size_t> (local_matrix.graph.row_map[lid+1]
784 - local_matrix.graph.row_map[lid]);
791 const size_type np = num_packets_per_lid.extent(0);
792 Kokkos::View<size_t*, device_type> offsets(
"offsets", np+1);
795 compute_total_num_entries<LO, device_type, BDT> (num_packets_per_lid,
803template<
class LO,
class DT,
class BDT>
805setupRowPointersForRemotes(
808 const Kokkos::View<const char*, BDT>& imports,
809 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
812 using Kokkos::parallel_reduce;
813 typedef typename DT::execution_space XS;
815 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
817 const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
818 const size_type N = num_packets_per_lid.extent(0);
821 parallel_reduce (
"Setup row pointers for remotes",
823 KOKKOS_LAMBDA (
const size_t i,
int& k_error) {
824 typedef typename std::remove_reference<
decltype( tgt_rowptr(0) ) >::type atomic_incr_type;
825 const size_t num_bytes = num_packets_per_lid(i);
826 const size_t offset = offsets(i);
827 const size_t num_ent = unpackRowCount<LO> (imports.data(), offset, num_bytes);
828 if (num_ent == InvalidNum) {
831 Kokkos::atomic_fetch_add (&tgt_rowptr (import_lids(i)), atomic_incr_type(num_ent));
839makeCrsRowPtrFromLengths(
841 const Kokkos::View<size_t*,DT>& new_start_row)
843 using Kokkos::parallel_scan;
844 typedef typename DT::execution_space XS;
845 typedef typename Kokkos::View<size_t*,DT>::size_type size_type;
846 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
847 const size_type N = new_start_row.extent(0);
848 parallel_scan(range_policy(0, N),
849 KOKKOS_LAMBDA(
const size_t& i,
size_t& update,
const bool&
final) {
850 auto cur_val = tgt_rowptr(i);
852 tgt_rowptr(i) = update;
853 new_start_row(i) = tgt_rowptr(i);
860template<
class LocalMatrix,
class LocalMap>
864 const typename PackTraits<int>::output_array_type& tgt_pids,
866 const Kokkos::View<size_t*, typename LocalMap::device_type>& new_start_row,
868 const typename PackTraits<int>::input_array_type& src_pids,
869 const LocalMatrix& local_matrix,
870 const LocalMap& local_col_map,
871 const size_t num_same_ids,
874 using Kokkos::parallel_for;
877 typedef typename DT::execution_space XS;
878 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
880 parallel_for(range_policy(0, num_same_ids),
881 KOKKOS_LAMBDA(
const size_t i) {
882 typedef typename std::remove_reference<
decltype( new_start_row(0) ) >::type atomic_incr_type;
884 const LO src_lid =
static_cast<LO
>(i);
885 size_t src_row = local_matrix.graph.row_map(src_lid);
887 const LO tgt_lid =
static_cast<LO
>(i);
888 const size_t tgt_row = tgt_rowptr(tgt_lid);
890 const size_t nsr = local_matrix.graph.row_map(src_lid+1)
891 - local_matrix.graph.row_map(src_lid);
892 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
894 for (
size_t j=local_matrix.graph.row_map(src_lid);
895 j<local_matrix.graph.row_map(src_lid+1); ++j) {
896 LO src_col = local_matrix.graph.entries(j);
897 tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
898 tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
899 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
905template<
class LocalMatrix,
class LocalMap>
907copyDataFromPermuteIDs(
909 const typename PackTraits<int>::output_array_type& tgt_pids,
911 const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
913 const typename PackTraits<int>::input_array_type& src_pids,
916 const LocalMatrix& local_matrix,
917 const LocalMap& local_col_map,
920 using Kokkos::parallel_for;
923 typedef typename DT::execution_space XS;
924 typedef typename PackTraits<LO>::input_array_type::size_type size_type;
925 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
927 const size_type num_permute_to_lids = permute_to_lids.extent(0);
929 parallel_for(range_policy(0, num_permute_to_lids),
930 KOKKOS_LAMBDA(
const size_t i) {
931 typedef typename std::remove_reference<
decltype( new_start_row(0) ) >::type atomic_incr_type;
933 const LO src_lid = permute_from_lids(i);
934 const size_t src_row = local_matrix.graph.row_map(src_lid);
936 const LO tgt_lid = permute_to_lids(i);
937 const size_t tgt_row = tgt_rowptr(tgt_lid);
939 size_t nsr = local_matrix.graph.row_map(src_lid+1)
940 - local_matrix.graph.row_map(src_lid);
941 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
943 for (
size_t j=local_matrix.graph.row_map(src_lid);
944 j<local_matrix.graph.row_map(src_lid+1); ++j) {
945 LO src_col = local_matrix.graph.entries(j);
946 tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
947 tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
948 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
954template<
typename LocalMatrix,
typename LocalMap,
typename BufferDeviceType>
956unpackAndCombineIntoCrsArrays2(
958 const typename PackTraits<int>::output_array_type& tgt_pids,
960 const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
963 const Kokkos::View<const char*, BufferDeviceType>& imports,
964 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
968 const size_t bytes_per_value)
971 using Kokkos::subview;
972 using Kokkos::MemoryUnmanaged;
973 using Kokkos::parallel_reduce;
974 using Kokkos::atomic_fetch_add;
979 typedef typename LocalMatrix::value_type ST;
980 typedef typename DT::execution_space XS;
981 typedef typename Kokkos::View<LO*, DT>::size_type size_type;
982 typedef typename Kokkos::pair<size_type, size_type> slice;
983 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
985 typedef View<int*,DT, MemoryUnmanaged> pids_out_type;
986 typedef View<GO*, DT, MemoryUnmanaged> gids_out_type;
987 typedef View<ST*, DT, MemoryUnmanaged> vals_out_type;
989 const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
992 const size_type num_import_lids = import_lids.size();
995 parallel_reduce (
"Unpack and combine into CRS",
996 range_policy (0, num_import_lids),
997 KOKKOS_LAMBDA (
const size_t i,
int& k_error) {
998 typedef typename std::remove_reference<
decltype( new_start_row(0) ) >::type atomic_incr_type;
999 const size_t num_bytes = num_packets_per_lid(i);
1000 const size_t offset = offsets(i);
1001 if (num_bytes == 0) {
1005 size_t num_ent = unpackRowCount<LO>(imports.data(), offset, num_bytes);
1006 if (num_ent == InvalidNum) {
1010 const LO lcl_row = import_lids(i);
1011 const size_t start_row = atomic_fetch_add(&new_start_row(lcl_row), atomic_incr_type(num_ent));
1012 const size_t end_row = start_row + num_ent;
1014 gids_out_type gids_out = subview(tgt_colind, slice(start_row, end_row));
1015 vals_out_type vals_out = subview(tgt_vals, slice(start_row, end_row));
1016 pids_out_type pids_out = subview(tgt_pids, slice(start_row, end_row));
1018 k_error += unpackRow<ST,LO,GO>(gids_out, pids_out, vals_out,
1019 imports.data(), offset, num_bytes,
1020 num_ent, bytes_per_value);
1023 for (
size_t j = 0; j < static_cast<size_t>(num_ent); ++j) {
1024 const int pid = pids_out(j);
1025 pids_out(j) = (pid != my_pid) ? pid : -1;
1032template<
typename LocalMatrix,
typename LocalMap,
typename BufferDeviceType>
1035 const LocalMatrix & local_matrix,
1036 const LocalMap & local_col_map,
1038 const Kokkos::View<const char*, BufferDeviceType>& imports,
1039 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
1045 const typename PackTraits<int>::input_array_type& src_pids,
1046 const typename PackTraits<int>::output_array_type& tgt_pids,
1047 const size_t num_same_ids,
1048 const size_t tgt_num_rows,
1049 const size_t tgt_num_nonzeros,
1050 const int my_tgt_pid,
1051 const size_t bytes_per_value)
1054 using Kokkos::subview;
1055 using Kokkos::parallel_for;
1056 using Kokkos::MemoryUnmanaged;
1060 typedef typename DT::execution_space XS;
1061 typedef typename Kokkos::View<LO*, DT>::size_type size_type;
1062 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
1063 typedef BufferDeviceType BDT;
1065 const char prefix[] =
"unpackAndCombineIntoCrsArrays: ";
1067 const size_t N = tgt_num_rows;
1071 const int my_pid = my_tgt_pid;
1074 parallel_for(range_policy(0, N+1),
1075 KOKKOS_LAMBDA(
const size_t i) {
1081 parallel_for(range_policy(0, num_same_ids),
1082 KOKKOS_LAMBDA(
const size_t i) {
1083 const LO tgt_lid =
static_cast<LO
>(i);
1084 const LO src_lid =
static_cast<LO
>(i);
1085 tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
1086 - local_matrix.graph.row_map(src_lid);
1091 const size_type num_permute_to_lids = permute_to_lids.extent(0);
1092 parallel_for(range_policy(0, num_permute_to_lids),
1093 KOKKOS_LAMBDA(
const size_t i) {
1094 const LO tgt_lid = permute_to_lids(i);
1095 const LO src_lid = permute_from_lids(i);
1096 tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
1097 - local_matrix.graph.row_map(src_lid);
1102 const size_type num_import_lids = import_lids.extent(0);
1103 View<size_t*, DT> offsets(
"offsets", num_import_lids+1);
1106#ifdef HAVE_TPETRA_DEBUG
1108 auto nth_offset_h = getEntryOnHost(offsets, num_import_lids);
1109 const bool condition =
1110 nth_offset_h !=
static_cast<size_t>(imports.extent (0));
1111 TEUCHOS_TEST_FOR_EXCEPTION
1112 (condition, std::logic_error, prefix
1113 <<
"The final offset in bytes " << nth_offset_h
1114 <<
" != imports.size() = " << imports.extent(0)
1115 <<
". Please report this bug to the Tpetra developers.");
1121 setupRowPointersForRemotes<LO,DT,BDT>(tgt_rowptr,
1122 import_lids, imports, num_packets_per_lid, offsets);
1123 TEUCHOS_TEST_FOR_EXCEPTION(k_error != 0, std::logic_error, prefix
1124 <<
" Error transferring data to target row pointers. "
1125 "Please report this bug to the Tpetra developers.");
1129 View<size_t*, DT> new_start_row (
"new_start_row", N+1);
1132 makeCrsRowPtrFromLengths(tgt_rowptr, new_start_row);
1135 copyDataFromSameIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1136 tgt_rowptr, src_pids, local_matrix, local_col_map, num_same_ids, my_pid);
1138 copyDataFromPermuteIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1139 tgt_rowptr, src_pids, permute_to_lids, permute_from_lids,
1140 local_matrix, local_col_map, my_pid);
1142 if (imports.extent(0) <= 0) {
1146 int unpack_err = unpackAndCombineIntoCrsArrays2(tgt_colind, tgt_pids,
1147 tgt_vals, new_start_row, offsets, import_lids, imports, num_packets_per_lid,
1148 local_matrix, local_col_map, my_pid, bytes_per_value);
1149 TEUCHOS_TEST_FOR_EXCEPTION(
1150 unpack_err != 0, std::logic_error, prefix <<
"unpack loop failed. This "
1151 "should never happen. Please report this bug to the Tpetra developers.");
1197template<
typename ST,
typename LO,
typename GO,
typename Node>
1201 const Teuchos::ArrayView<const char>& imports,
1202 const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1203 const Teuchos::ArrayView<const LO>& importLIDs,
1208 typedef typename Node::device_type device_type;
1210 static_assert (std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
1211 "Node::device_type and LocalMatrix::device_type must be the same.");
1214 device_type outputDevice;
1219 auto num_packets_per_lid_d =
1221 numPacketsPerLID.size(),
true,
"num_packets_per_lid");
1223 auto import_lids_d =
1225 importLIDs.size(),
true,
"import_lids");
1229 imports.size(),
true,
"imports");
1232 auto local_col_map = sourceMatrix.
getColMap()->getLocalMap();
1243 UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix(
1244 local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1245 import_lids_d, combineMode);
1249template<
typename ST,
typename LO,
typename GO,
typename NT>
1251unpackCrsMatrixAndCombineNew(
1253 Kokkos::DualView<
char*,
1255 Kokkos::DualView<
size_t*,
1257 const Kokkos::DualView<
const LO*,
1265 using device_type =
typename crs_matrix_type::device_type;
1266 using local_matrix_device_type =
typename crs_matrix_type::local_matrix_device_type;
1267 using buffer_device_type =
typename dist_object_type::buffer_device_type;
1270 (std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
1271 "crs_matrix_type::device_type and local_matrix_device_type::device_type "
1272 "must be the same.");
1274 if (numPacketsPerLID.need_sync_device()) {
1275 numPacketsPerLID.sync_device ();
1277 auto num_packets_per_lid_d = numPacketsPerLID.view_device ();
1279 TEUCHOS_ASSERT( ! importLIDs.need_sync_device () );
1280 auto import_lids_d = importLIDs.view_device ();
1282 if (imports.need_sync_device()) {
1283 imports.sync_device ();
1285 auto imports_d = imports.view_device ();
1288 auto local_col_map = sourceMatrix.
getColMap ()->getLocalMap ();
1289 typedef decltype (local_col_map) local_map_type;
1291 UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix<
1292 local_matrix_device_type,
1295 > (local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1296 import_lids_d, combineMode);
1354template<
typename Scalar,
typename LocalOrdinal,
typename GlobalOrdinal,
typename Node>
1358 const Teuchos::ArrayView<const LocalOrdinal> &importLIDs,
1359 const Teuchos::ArrayView<const char> &imports,
1360 const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1364 const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1365 const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs)
1367 using Kokkos::MemoryUnmanaged;
1369 typedef typename Node::device_type DT;
1371 const char prefix[] =
"unpackAndCombineWithOwningPIDsCount: ";
1373 TEUCHOS_TEST_FOR_EXCEPTION
1374 (permuteToLIDs.size () != permuteFromLIDs.size (), std::invalid_argument,
1375 prefix <<
"permuteToLIDs.size() = " << permuteToLIDs.size () <<
" != "
1376 "permuteFromLIDs.size() = " << permuteFromLIDs.size() <<
".");
1380 TEUCHOS_TEST_FOR_EXCEPTION
1381 (! locallyIndexed, std::invalid_argument, prefix <<
"The input "
1382 "CrsMatrix 'sourceMatrix' must be locally indexed.");
1383 TEUCHOS_TEST_FOR_EXCEPTION
1384 (importLIDs.size () != numPacketsPerLID.size (), std::invalid_argument,
1385 prefix <<
"importLIDs.size() = " << importLIDs.size () <<
" != "
1386 "numPacketsPerLID.size() = " << numPacketsPerLID.size () <<
".");
1389 auto permute_from_lids_d =
1391 permuteFromLIDs.getRawPtr (),
1392 permuteFromLIDs.size (),
true,
1393 "permute_from_lids");
1396 imports.getRawPtr (),
1397 imports.size (),
true,
1399 auto num_packets_per_lid_d =
1401 numPacketsPerLID.getRawPtr (),
1402 numPacketsPerLID.size (),
true,
1403 "num_packets_per_lid");
1405 return UnpackAndCombineCrsMatrixImpl::unpackAndCombineWithOwningPIDsCount(
1406 local_matrix, permute_from_lids_d, imports_d,
1407 num_packets_per_lid_d, numSameIDs);
1424template<
typename Scalar,
typename LocalOrdinal,
typename GlobalOrdinal,
typename Node>
1428 const Teuchos::ArrayView<const LocalOrdinal>& importLIDs,
1429 const Teuchos::ArrayView<const char>& imports,
1430 const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1433 const size_t numSameIDs,
1434 const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1435 const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs,
1436 size_t TargetNumRows,
1437 size_t TargetNumNonzeros,
1438 const int MyTargetPID,
1439 const Teuchos::ArrayView<size_t>& CRS_rowptr,
1440 const Teuchos::ArrayView<GlobalOrdinal>& CRS_colind,
1442 const Teuchos::ArrayView<const int>& SourcePids,
1443 Teuchos::Array<int>& TargetPids)
1445 using execution_space =
typename Node::execution_space;
1449 using Kokkos::deep_copy;
1451 using Teuchos::ArrayView;
1452 using Teuchos::outArg;
1453 using Teuchos::REDUCE_MAX;
1454 using Teuchos::reduceAll;
1456 typedef LocalOrdinal LO;
1458 typedef typename Node::device_type DT;
1461 typedef typename matrix_type::impl_scalar_type ST;
1462 typedef typename ArrayView<const LO>::size_type size_type;
1464 const char prefix[] =
"Tpetra::Details::unpackAndCombineIntoCrsArrays: ";
1466 TEUCHOS_TEST_FOR_EXCEPTION(
1467 TargetNumRows + 1 !=
static_cast<size_t> (CRS_rowptr.size ()),
1468 std::invalid_argument, prefix <<
"CRS_rowptr.size() = " <<
1469 CRS_rowptr.size () <<
"!= TargetNumRows+1 = " << TargetNumRows+1 <<
".");
1471 TEUCHOS_TEST_FOR_EXCEPTION(
1472 permuteToLIDs.size () != permuteFromLIDs.size (), std::invalid_argument,
1473 prefix <<
"permuteToLIDs.size() = " << permuteToLIDs.size ()
1474 <<
"!= permuteFromLIDs.size() = " << permuteFromLIDs.size () <<
".");
1475 const size_type numImportLIDs = importLIDs.size ();
1477 TEUCHOS_TEST_FOR_EXCEPTION(
1478 numImportLIDs != numPacketsPerLID.size (), std::invalid_argument,
1479 prefix <<
"importLIDs.size() = " << numImportLIDs <<
" != "
1480 "numPacketsPerLID.size() = " << numPacketsPerLID.size() <<
".");
1483 if (
static_cast<size_t> (TargetPids.size ()) != TargetNumNonzeros) {
1484 TargetPids.resize (TargetNumNonzeros);
1486 TargetPids.assign (TargetNumNonzeros, -1);
1490 auto local_col_map = sourceMatrix.
getColMap()->getLocalMap();
1494 auto import_lids_d =
1496 importLIDs.size(),
true,
"import_lids");
1500 imports.size(),
true,
"imports");
1502 auto num_packets_per_lid_d =
1504 numPacketsPerLID.size(),
true,
"num_packets_per_lid");
1506 auto permute_from_lids_d =
1508 permuteFromLIDs.size(),
true,
"permute_from_lids");
1510 auto permute_to_lids_d =
1512 permuteToLIDs.size(),
true,
"permute_to_lids");
1516 CRS_rowptr.size(),
true,
"crs_rowptr");
1520 CRS_colind.size(),
true,
"crs_colidx");
1522#ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1523 static_assert (! std::is_same<
1524 typename std::remove_const<
1525 typename std::decay<
1529 std::complex<double> >::value,
1530 "CRS_vals::value_type is std::complex<double>; this should never happen"
1531 ", since std::complex does not work in Kokkos::View objects.");
1536 CRS_vals.size(),
true,
"crs_vals");
1538#ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1539 static_assert (! std::is_same<
1540 typename decltype (crs_vals_d)::non_const_value_type,
1541 std::complex<double> >::value,
1542 "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1543 "never happen, since std::complex does not work in Kokkos::View objects.");
1548 SourcePids.size(),
true,
"src_pids");
1552 TargetPids.size(),
true,
"tgt_pids");
1554 size_t bytes_per_value = 0;
1568 size_t bytes_per_value_l = 0;
1569 if (local_matrix.values.extent(0) > 0) {
1570 const ST& val = local_matrix.values(0);
1573 const ST& val = crs_vals_d(0);
1576 Teuchos::reduceAll<int, size_t>(*(sourceMatrix.
getComm()),
1577 Teuchos::REDUCE_MAX,
1579 outArg(bytes_per_value));
1582#ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1583 static_assert (! std::is_same<
1584 typename decltype (crs_vals_d)::non_const_value_type,
1585 std::complex<double> >::value,
1586 "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1587 "never happen, since std::complex does not work in Kokkos::View objects.");
1590 UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsArrays(
1591 local_matrix, local_col_map, import_lids_d, imports_d,
1592 num_packets_per_lid_d, permute_to_lids_d, permute_from_lids_d,
1593 crs_rowptr_d, crs_colind_d, crs_vals_d, src_pids_d, tgt_pids_d,
1594 numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID,
1598 typename decltype(crs_rowptr_d)::HostMirror crs_rowptr_h(
1599 CRS_rowptr.getRawPtr(), CRS_rowptr.size());
1601 deep_copy(execution_space(), crs_rowptr_h, crs_rowptr_d);
1603 typename decltype(crs_colind_d)::HostMirror crs_colind_h(
1604 CRS_colind.getRawPtr(), CRS_colind.size());
1606 deep_copy(execution_space(), crs_colind_h, crs_colind_d);
1608 typename decltype(crs_vals_d)::HostMirror crs_vals_h(
1609 CRS_vals.getRawPtr(), CRS_vals.size());
1611 deep_copy(execution_space(), crs_vals_h, crs_vals_d);
1613 typename decltype(tgt_pids_d)::HostMirror tgt_pids_h(
1614 TargetPids.getRawPtr(), TargetPids.size());
1616 deep_copy(execution_space(), tgt_pids_h, tgt_pids_d);
1622#define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT( ST, LO, GO, NT ) \
1624 Details::unpackCrsMatrixAndCombine<ST, LO, GO, NT> ( \
1625 const CrsMatrix<ST, LO, GO, NT>&, \
1626 const Teuchos::ArrayView<const char>&, \
1627 const Teuchos::ArrayView<const size_t>&, \
1628 const Teuchos::ArrayView<const LO>&, \
1632 Details::unpackCrsMatrixAndCombineNew<ST, LO, GO, NT> ( \
1633 const CrsMatrix<ST, LO, GO, NT>&, \
1634 Kokkos::DualView<char*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1635 Kokkos::DualView<size_t*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1636 const Kokkos::DualView<const LO*, typename DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1638 const CombineMode); \
1640 Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT> ( \
1641 const CrsMatrix<ST, LO, GO, NT> &, \
1642 const Teuchos::ArrayView<const LO>&, \
1643 const Teuchos::ArrayView<const char>&, \
1644 const Teuchos::ArrayView<const size_t>&, \
1646 const CombineMode, \
1648 const Teuchos::ArrayView<const LO>&, \
1649 const Teuchos::ArrayView<const LO>&, \
1653 const Teuchos::ArrayView<size_t>&, \
1654 const Teuchos::ArrayView<GO>&, \
1655 const Teuchos::ArrayView<CrsMatrix<ST, LO, GO, NT>::impl_scalar_type>&, \
1656 const Teuchos::ArrayView<const int>&, \
1657 Teuchos::Array<int>&); \
1659 Details::unpackAndCombineWithOwningPIDsCount<ST, LO, GO, NT> ( \
1660 const CrsMatrix<ST, LO, GO, NT> &, \
1661 const Teuchos::ArrayView<const LO> &, \
1662 const Teuchos::ArrayView<const char> &, \
1663 const Teuchos::ArrayView<const size_t>&, \
1667 const Teuchos::ArrayView<const LO>&, \
1668 const Teuchos::ArrayView<const LO>&);
Declaration of the Tpetra::CrsMatrix class.
Import KokkosSparse::OrdinalTraits, a traits class for "invalid" (flag) values of integer types,...
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary,...
Declaration and definition of Tpetra::Details::getEntryOnHost.
size_t compute_total_num_entries(const Kokkos::View< const size_t *, BDT > &num_packets_per_lid, const Kokkos::View< const size_t *, DT > &offsets, const Kokkos::View< const char *, BDT > &imports)
Total number of entries in any row of the packed matrix.
void unpackAndCombineIntoCrsMatrix(const LocalMatrix &local_matrix, const LocalMap &local_map, const Kokkos::View< const char *, BufferDeviceType > &imports, const Kokkos::View< const size_t *, BufferDeviceType > &num_packets_per_lid, const typename PackTraits< typename LocalMap::local_ordinal_type >::input_array_type import_lids, const Tpetra::CombineMode combine_mode)
Perform the unpack operation for the matrix.
size_t compute_maximum_num_entries(const Kokkos::View< const size_t *, BDT > &num_packets_per_lid, const Kokkos::View< const size_t *, DT > &offsets, const Kokkos::View< const char *, BDT > &imports)
Maximum number of entries in any row of the packed matrix.
bool compute_batch_info(const View1 &batches_per_lid, View2 &batch_info)
Compute the index and batch number associated with each batch.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
local_matrix_device_type getLocalMatrixDevice() const
The local sparse matrix.
Teuchos::RCP< const map_type > getColMap() const override
The Map that describes the column distribution in this matrix.
bool isLocallyIndexed() const override
Whether the matrix is locally indexed on the calling process.
typename row_matrix_type::impl_scalar_type impl_scalar_type
The type used internally in place of Scalar.
static size_t hierarchicalUnpackBatchSize()
Size of batch for hierarchical unpacking.
static size_t hierarchicalUnpackTeamSize()
Size of team for hierarchical unpacking.
"Local" part of Map suitable for Kokkos kernels.
KOKKOS_INLINE_FUNCTION LocalOrdinal getLocalElement(const GlobalOrdinal globalIndex) const
Get the local index corresponding to the given global index. (device only)
LocalOrdinal local_ordinal_type
The type of local indices.
GlobalOrdinal global_ordinal_type
The type of global indices.
DeviceType device_type
The device type.
Kokkos::parallel_reduce functor to determine the number of entries (to unpack) in a KokkosSparse::Crs...
Base class for distributed Tpetra objects that support data redistribution.
Kokkos::Device< typename device_type::execution_space, buffer_memory_space > buffer_device_type
Kokkos::Device specialization for communication buffers.
Implementation details of Tpetra.
void unpackAndCombineIntoCrsArrays(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GO > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks.
void unpackCrsMatrixAndCombine(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, const Teuchos::ArrayView< const char > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &importLIDs, size_t constantNumPackets, CombineMode combineMode)
Unpack the imported column indices and values, and combine into matrix.
OffsetsViewType::non_const_value_type computeOffsetsFromCounts(const ExecutionSpace &execSpace, const OffsetsViewType &ptr, const CountsViewType &counts)
Compute offsets from counts.
Namespace Tpetra contains the class and methods constituting the Tpetra library.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
CombineMode
Rule for combining data in an Import or Export.
@ REPLACE
Replace existing values with new values.
@ ABSMAX
Replace old value with maximum of magnitudes of old and new values.
@ INSERT
Insert new values that don't currently exist.
Traits class for packing / unpacking data of type T.
static KOKKOS_INLINE_FUNCTION Kokkos::pair< int, size_t > unpackArray(value_type outBuf[], const char inBuf[], const size_t numEnt)
Unpack numEnt value_type entries from the given input buffer of bytes, to the given output buffer of ...
static KOKKOS_INLINE_FUNCTION size_t unpackValue(T &outVal, const char inBuf[])
Unpack the given value from the given output buffer.
Kokkos::View< value_type *, Kokkos::AnonymousSpace > output_array_type
The type of an output array of value_type.
static KOKKOS_INLINE_FUNCTION size_t packValueCount(const T &)
Number of bytes required to pack or unpack the given value of type value_type.
Kokkos::View< const value_type *, Kokkos::AnonymousSpace > input_array_type
The type of an input array of value_type.
Unpacks and combines a single row of the CrsMatrix.
int error() const
Host function for getting the error.