Tpetra parallel linear algebra
Tpetra_Details_residual.hpp
// @HEADER
// ***********************************************************************
//
// Tpetra: Templated Linear Algebra Services Package
// Copyright (2008) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
//
// ************************************************************************
// @HEADER

#ifndef TPETRA_DETAILS_RESIDUAL_HPP
#define TPETRA_DETAILS_RESIDUAL_HPP

#include "TpetraCore_config.h"
#include "Tpetra_CrsMatrix.hpp"
#include "Tpetra_LocalCrsMatrixOperator.hpp"
#include "Tpetra_MultiVector.hpp"
#include "Teuchos_RCP.hpp"
#include "Teuchos_ScalarTraits.hpp"
#include "Tpetra_Details_Behavior.hpp"
#include "Tpetra_Details_Profiling.hpp"
#include "KokkosSparse_spmv_impl.hpp"

/// \file Tpetra_Details_residual.hpp
/// \brief Implementation of the fused residual computation R = B - A*X.

namespace Tpetra {
  namespace Details {

/// \brief Functor for computing the residual
template<class AMatrix, class MV, class ConstMV, class Offsets, bool is_MV, bool restrictedMode, bool skipOffRank>
struct LocalResidualFunctor {
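  // Summary of the compile-time switches (an editorial note inferred from
  // the code below, not part of the original source):
  //  - is_MV:          true  -> loop over all columns of the MultiVector;
  //                    false -> single-column (Vector) fast path.
  //  - restrictedMode: true  -> each row's entries are split at
  //                    offsets(lclRow); entries before the offset read the
  //                    domain-map view X_domainmap_lcl, entries at or after
  //                    it read the column-map view X_colmap_lcl.
  //  - skipOffRank:    true  -> skip the off-rank (post-offset) entries
  //                    entirely; they are applied later by OffRankUpdateFunctor.
  // In every case each row computes
  //   R(lclRow,v) = B(lclRow,v) - sum_j A(lclRow,j) * X(j,v).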
  using execution_space = typename AMatrix::execution_space;
  using LO = typename AMatrix::non_const_ordinal_type;
  using value_type = typename AMatrix::non_const_value_type;
  using team_policy = typename Kokkos::TeamPolicy<execution_space>;
  using team_member = typename team_policy::member_type;
  using ATV = Kokkos::ArithTraits<value_type>;

  AMatrix A_lcl;
  ConstMV X_colmap_lcl;
  ConstMV B_lcl;
  MV R_lcl;
  int rows_per_team;
  Offsets offsets;
  ConstMV X_domainmap_lcl;


  LocalResidualFunctor (const AMatrix& A_lcl_,
                        const ConstMV& X_colmap_lcl_,
                        const ConstMV& B_lcl_,
                        const MV& R_lcl_,
                        const int rows_per_team_,
                        const Offsets& offsets_,
                        const ConstMV& X_domainmap_lcl_) :
    A_lcl(A_lcl_),
    X_colmap_lcl(X_colmap_lcl_),
    B_lcl(B_lcl_),
    R_lcl(R_lcl_),
    rows_per_team(rows_per_team_),
    offsets(offsets_),
    X_domainmap_lcl(X_domainmap_lcl_)
  { }

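  // Editorial note on the launch shape (inferred from localResidual below):
  // each team (league member) is assigned a contiguous block of
  // rows_per_team rows; the TeamThreadRange loop below walks those rows,
  // and a nested ThreadVectorRange reduction accumulates each row's dot
  // product A(row,:) * X(:,v) into A_x.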
  KOKKOS_INLINE_FUNCTION
  void operator() (const team_member& dev) const
  {
    Kokkos::parallel_for(Kokkos::TeamThreadRange (dev, 0, rows_per_team),[&] (const LO& loop) {
        const LO lclRow = static_cast<LO> (dev.league_rank ()) * rows_per_team + loop;

        if (lclRow >= A_lcl.numRows ()) {
          return;
        }

        if (!is_MV) { // MultiVectors only have a single column

          value_type A_x = ATV::zero ();

          if (!restrictedMode) {
            const auto A_row = A_lcl.rowConst(lclRow);
            const LO row_length = static_cast<LO> (A_row.length);

            Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, row_length), [&] (const LO iEntry, value_type& lsum) {
                const auto A_val = A_row.value(iEntry);
                lsum += A_val * X_colmap_lcl(A_row.colidx(iEntry),0);
              }, A_x);
          }
          else {
            const LO offRankOffset = offsets(lclRow);
            const size_t start = A_lcl.graph.row_map(lclRow);
            const size_t end = A_lcl.graph.row_map(lclRow+1);

            Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, start, end), [&] (const LO iEntry, value_type& lsum) {
                const auto A_val = A_lcl.values(iEntry);
                const auto lclCol = A_lcl.graph.entries(iEntry);
                if (iEntry < offRankOffset)
                  lsum += A_val * X_domainmap_lcl(lclCol,0);
                else if (!skipOffRank)
                  lsum += A_val * X_colmap_lcl(lclCol,0);
              }, A_x);
          }

          Kokkos::single(Kokkos::PerThread(dev),[&] () {
              R_lcl(lclRow,0) = B_lcl(lclRow,0) - A_x;
            });
        }
        else { // MultiVectors have more than one column

          const LO numVectors = static_cast<LO>(X_colmap_lcl.extent(1));

          for (LO v=0; v<numVectors; v++) {

            value_type A_x = ATV::zero ();

            if (!restrictedMode) {

              const auto A_row = A_lcl.rowConst(lclRow);
              const LO row_length = static_cast<LO> (A_row.length);

              Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, row_length), [&] (const LO iEntry, value_type& lsum) {
                  const auto A_val = A_row.value(iEntry);
                  lsum += A_val * X_colmap_lcl(A_row.colidx(iEntry),v);
                }, A_x);
            }
            else {
              const LO offRankOffset = offsets(lclRow);
              const size_t start = A_lcl.graph.row_map(lclRow);
              const size_t end = A_lcl.graph.row_map(lclRow+1);

              Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, start, end), [&] (const LO iEntry, value_type& lsum) {
                  const auto A_val = A_lcl.values(iEntry);
                  const auto lclCol = A_lcl.graph.entries(iEntry);
                  if (iEntry < offRankOffset)
                    lsum += A_val * X_domainmap_lcl(lclCol,v);
                  else if (!skipOffRank)
                    lsum += A_val * X_colmap_lcl(lclCol,v);
                }, A_x);
            }

            Kokkos::single(Kokkos::PerThread(dev),[&] () {
                R_lcl(lclRow,v) = B_lcl(lclRow,v) - A_x;
              });

          } // end for numVectors
        }
      }); // end parallel_for TeamThreadRange
  }
};


/// \brief Functor for computing R -= A_offRank*X_colmap
template<class AMatrix, class MV, class ConstMV, class Offsets, bool is_MV>
struct OffRankUpdateFunctor {
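  // Editorial note (inferred from the overlap path below): after the on-rank
  // part of the residual has been computed with skipOffRank = true and the
  // import of X has completed, this functor subtracts the remaining off-rank
  // contributions. For each row it applies only the entries at positions
  // >= offsets(lclRow):  R(lclRow,v) -= sum A_val * X_colmap(lclCol,v).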
  using execution_space = typename AMatrix::execution_space;
  using LO = typename AMatrix::non_const_ordinal_type;
  using value_type = typename AMatrix::non_const_value_type;
  using team_policy = typename Kokkos::TeamPolicy<execution_space>;
  using team_member = typename team_policy::member_type;
  using ATV = Kokkos::ArithTraits<value_type>;

  AMatrix A_lcl;
  ConstMV X_colmap_lcl;
  MV R_lcl;
  int rows_per_team;
  Offsets offsets;


  OffRankUpdateFunctor (const AMatrix& A_lcl_,
                        const ConstMV& X_colmap_lcl_,
                        const MV& R_lcl_,
                        const int rows_per_team_,
                        const Offsets& offsets_) :
    A_lcl(A_lcl_),
    X_colmap_lcl(X_colmap_lcl_),
    R_lcl(R_lcl_),
    rows_per_team(rows_per_team_),
    offsets(offsets_)
  { }

  KOKKOS_INLINE_FUNCTION
  void operator() (const team_member& dev) const
  {
    Kokkos::parallel_for(Kokkos::TeamThreadRange (dev, 0, rows_per_team),[&] (const LO& loop) {
        const LO lclRow = static_cast<LO> (dev.league_rank ()) * rows_per_team + loop;

        if (lclRow >= A_lcl.numRows ()) {
          return;
        }

        const LO offRankOffset = offsets(lclRow);
        const size_t end = A_lcl.graph.row_map(lclRow+1);

        if (!is_MV) { // MultiVectors only have a single column

          value_type A_x = ATV::zero ();

          Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, offRankOffset, end), [&] (const LO iEntry, value_type& lsum) {
              const auto A_val = A_lcl.values(iEntry);
              const auto lclCol = A_lcl.graph.entries(iEntry);
              lsum += A_val * X_colmap_lcl(lclCol,0);
            }, A_x);

          Kokkos::single(Kokkos::PerThread(dev),[&] () {
              R_lcl(lclRow,0) -= A_x;
            });
        }
        else { // MultiVectors have more than one column

          const LO numVectors = static_cast<LO>(X_colmap_lcl.extent(1));

          for (LO v=0; v<numVectors; v++) {

            value_type A_x = ATV::zero ();

            Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, offRankOffset, end), [&] (const LO iEntry, value_type& lsum) {
                const auto A_val = A_lcl.values(iEntry);
                const auto lclCol = A_lcl.graph.entries(iEntry);
                lsum += A_val * X_colmap_lcl(lclCol,v);
              }, A_x);

            Kokkos::single(Kokkos::PerThread(dev),[&] () {
                R_lcl(lclRow,v) -= A_x;
              });

          } // end for numVectors
        }
      });
  }
};
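
/// Editorial summary (inferred from the implementation below): computes the
/// local part of R = B - A*X on the calling process. X_colmap is X realigned
/// to A's column Map; if X_domainmap is non-null the kernel runs in
/// restricted mode, reading on-rank entries straight from the domain-map view
/// and using `offsets` to locate where each row's off-rank entries begin.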
template<class SC, class LO, class GO, class NO>
void localResidual(const CrsMatrix<SC,LO,GO,NO> & A,
                   const MultiVector<SC,LO,GO,NO> & X_colmap,
                   const MultiVector<SC,LO,GO,NO> & B,
                   MultiVector<SC,LO,GO,NO> & R,
                   const Kokkos::View<const size_t*, typename NO::device_type>& offsets,
                   const MultiVector<SC,LO,GO,NO> * X_domainmap=nullptr) {
  using Tpetra::Details::ProfilingRegion;
  using Teuchos::NO_TRANS;
  ProfilingRegion regionLocalApply ("Tpetra::CrsMatrix::localResidual");

  using local_matrix_device_type = typename CrsMatrix<SC,LO,GO,NO>::local_matrix_device_type;
  using local_view_device_type = typename MultiVector<SC,LO,GO,NO>::dual_view_type::t_dev::non_const_type;
  using const_local_view_device_type = typename MultiVector<SC,LO,GO,NO>::dual_view_type::t_dev::const_type;
  using offset_type = Kokkos::View<const size_t*, typename NO::device_type>;

  local_matrix_device_type A_lcl = A.getLocalMatrixDevice ();
  const_local_view_device_type X_colmap_lcl = X_colmap.getLocalViewDevice(Access::ReadOnly);
  const_local_view_device_type B_lcl = B.getLocalViewDevice(Access::ReadOnly);
  local_view_device_type R_lcl = R.getLocalViewDevice(Access::OverwriteAll);
  const_local_view_device_type X_domainmap_lcl;

  const bool debug = ::Tpetra::Details::Behavior::debug ();
  if (debug) {
    TEUCHOS_TEST_FOR_EXCEPTION
      (X_colmap.getNumVectors () != R.getNumVectors (), std::runtime_error,
       "X.getNumVectors() = " << X_colmap.getNumVectors () << " != "
       "R.getNumVectors() = " << R.getNumVectors () << ".");
    TEUCHOS_TEST_FOR_EXCEPTION
      (X_colmap.getLocalLength () !=
       A.getColMap ()->getLocalNumElements (), std::runtime_error,
       "X has the wrong number of local rows. "
       "X.getLocalLength() = " << X_colmap.getLocalLength () << " != "
       "A.getColMap()->getLocalNumElements() = " <<
       A.getColMap ()->getLocalNumElements () << ".");
    TEUCHOS_TEST_FOR_EXCEPTION
      (R.getLocalLength () !=
       A.getRowMap ()->getLocalNumElements (), std::runtime_error,
       "R has the wrong number of local rows. "
       "R.getLocalLength() = " << R.getLocalLength () << " != "
       "A.getRowMap()->getLocalNumElements() = " <<
       A.getRowMap ()->getLocalNumElements () << ".");
    TEUCHOS_TEST_FOR_EXCEPTION
      (B.getLocalLength () !=
       A.getRowMap ()->getLocalNumElements (), std::runtime_error,
       "B has the wrong number of local rows. "
       "B.getLocalLength() = " << B.getLocalLength () << " != "
       "A.getRowMap()->getLocalNumElements() = " <<
       A.getRowMap ()->getLocalNumElements () << ".");

    TEUCHOS_TEST_FOR_EXCEPTION
      (! A.isFillComplete (), std::runtime_error, "The matrix A is not "
       "fill complete. You must call fillComplete() (possibly with "
       "domain and range Map arguments) without an intervening "
       "resumeFill() call before you may call this method.");
    TEUCHOS_TEST_FOR_EXCEPTION
      (! X_colmap.isConstantStride () || ! R.isConstantStride () || ! B.isConstantStride (),
       std::runtime_error, "X, R, and B must be constant stride.");
    // If the two pointers are NULL, then they don't alias one
    // another, even though they are equal.
    TEUCHOS_TEST_FOR_EXCEPTION
      ((X_colmap_lcl.data () == R_lcl.data () && X_colmap_lcl.data () != nullptr) ||
       (X_colmap_lcl.data () == B_lcl.data () && X_colmap_lcl.data () != nullptr),
       std::runtime_error, "X, B, and R may not alias one another.");
  }

  SC one = Teuchos::ScalarTraits<SC>::one(), negone = -one, zero = Teuchos::ScalarTraits<SC>::zero();
#ifdef TPETRA_DETAILS_USE_REFERENCE_RESIDUAL
  // This is currently a "reference implementation" waiting until Kokkos Kernels provides
  // a residual kernel.
  A.localApply(X_colmap,R,Teuchos::NO_TRANS, one, zero);
  R.update(one,B,negone);
#else

  if (A_lcl.numRows() == 0) {
    return;
  }

  int64_t numLocalRows = A_lcl.numRows ();
  int64_t myNnz = A_lcl.nnz();

  // Check for imbalanced rows. If the logic for SPMV to use merge path is
  // triggered, use it and follow the reference residual code.
  LO maxRowImbalance = 0;
  if (numLocalRows != 0)
    maxRowImbalance = A.getLocalMaxNumRowEntries() - (myNnz / numLocalRows);
  if (size_t(maxRowImbalance) >= Tpetra::Details::Behavior::rowImbalanceThreshold())
  {
    // note: lclOp will be wrapped in shared_ptr
    auto lclOp = A.getLocalMultiplyOperator();
    // Call local SPMV, requesting merge path, through A's LocalCrsMatrixOperator
    lclOp->applyImbalancedRows (X_colmap_lcl, R_lcl, Teuchos::NO_TRANS, one, zero);
    R.update(one,B,negone);
    return;
  }
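  // Editorial example of the imbalance heuristic above (numbers are
  // illustrative, not from the original source): with numLocalRows = 10000
  // and myNnz = 100000, the mean is 10 entries per row; if one row has 1000
  // entries, maxRowImbalance = 1000 - 10 = 990. If that exceeds the
  // configured rowImbalanceThreshold, the merge-path SPMV handles the long
  // row without serializing an entire team on it.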

  int team_size = -1;
  int vector_length = -1;
  int64_t rows_per_thread = -1;

  using execution_space = typename CrsMatrix<SC,LO,GO,NO>::execution_space;
  using policy_type = typename Kokkos::TeamPolicy<execution_space>;

  int64_t rows_per_team = KokkosSparse::Impl::spmv_launch_parameters<execution_space>(numLocalRows, myNnz, rows_per_thread, team_size, vector_length);
  int64_t worksets = (B_lcl.extent (0) + rows_per_team - 1) / rows_per_team;

  policy_type policy (1, 1);
  if (team_size < 0) {
    policy = policy_type (worksets, Kokkos::AUTO, vector_length);
  }
  else {
    policy = policy_type (worksets, team_size, vector_length);
  }
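  // Editorial note: the worksets expression above is integer ceiling
  // division, worksets = ceil(numRows / rows_per_team). For example
  // (illustrative numbers), 1000 local rows with rows_per_team = 64 gives
  // (1000 + 63) / 64 = 16 teams; the last team covers only the remaining
  // 40 rows, and the lclRow bound check inside the functor handles that tail.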

  bool is_vector = (X_colmap_lcl.extent(1) == 1);

  if (is_vector) {

    if (X_domainmap == nullptr) {

      using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,false,false,false>;
      functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
      Kokkos::parallel_for("residual-vector",policy,func);

    }
    else {

      X_domainmap_lcl = X_domainmap->getLocalViewDevice(Access::ReadOnly);
      using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,false,true,false>;
      functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
      Kokkos::parallel_for("residual-vector",policy,func);

    }
  }
  else {

    if (X_domainmap == nullptr) {

      using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,true,false,false>;
      functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
      Kokkos::parallel_for("residual-multivector",policy,func);

    }
    else {

      X_domainmap_lcl = X_domainmap->getLocalViewDevice(Access::ReadOnly);
      using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,true,true,false>;
      functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
      Kokkos::parallel_for("residual-multivector",policy,func);

    }
  }
#endif
}
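

/// Editorial summary (inferred from the body below): forms the residual while
/// overlapping halo communication with computation. The on-rank part of
/// R = B - A*X runs with skipOffRank = true while the Import of X is still in
/// flight; once X_colmap.endImport() completes, an OffRankUpdateFunctor pass
/// subtracts the remaining off-rank contributions (R -= A_offRank * X).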
template<class SC, class LO, class GO, class NO>
void localResidualWithCommCompOverlap(const CrsMatrix<SC,LO,GO,NO> & A,
                                      MultiVector<SC,LO,GO,NO> & X_colmap,
                                      const MultiVector<SC,LO,GO,NO> & B,
                                      MultiVector<SC,LO,GO,NO> & R,
                                      const Kokkos::View<const size_t*, typename NO::device_type>& offsets,
                                      const MultiVector<SC,LO,GO,NO> & X_domainmap) {
  using Tpetra::Details::ProfilingRegion;
  using Teuchos::NO_TRANS;
  using Teuchos::RCP;
  using import_type = typename CrsMatrix<SC,LO,GO,NO>::import_type;

  ProfilingRegion regionLocalApply ("Tpetra::CrsMatrix::localResidualWithCommCompOverlap");

  using local_matrix_device_type = typename CrsMatrix<SC,LO,GO,NO>::local_matrix_device_type;
  using local_view_device_type = typename MultiVector<SC,LO,GO,NO>::dual_view_type::t_dev::non_const_type;
  using const_local_view_device_type = typename MultiVector<SC,LO,GO,NO>::dual_view_type::t_dev::const_type;
  using offset_type = Kokkos::View<const size_t*, typename NO::device_type>;

  local_matrix_device_type A_lcl = A.getLocalMatrixDevice ();
  const_local_view_device_type X_colmap_lcl = X_colmap.getLocalViewDevice(Access::ReadOnly);
  const_local_view_device_type B_lcl = B.getLocalViewDevice(Access::ReadOnly);
  local_view_device_type R_lcl = R.getLocalViewDevice(Access::OverwriteAll);
  const_local_view_device_type X_domainmap_lcl = X_domainmap.getLocalViewDevice(Access::ReadOnly);

  const bool debug = ::Tpetra::Details::Behavior::debug ();
  if (debug) {
    TEUCHOS_TEST_FOR_EXCEPTION
      (X_colmap.getNumVectors () != R.getNumVectors (), std::runtime_error,
       "X.getNumVectors() = " << X_colmap.getNumVectors () << " != "
       "R.getNumVectors() = " << R.getNumVectors () << ".");
    TEUCHOS_TEST_FOR_EXCEPTION
      (X_colmap.getLocalLength () !=
       A.getColMap ()->getLocalNumElements (), std::runtime_error,
       "X has the wrong number of local rows. "
       "X.getLocalLength() = " << X_colmap.getLocalLength () << " != "
       "A.getColMap()->getLocalNumElements() = " <<
       A.getColMap ()->getLocalNumElements () << ".");
    TEUCHOS_TEST_FOR_EXCEPTION
      (R.getLocalLength () !=
       A.getRowMap ()->getLocalNumElements (), std::runtime_error,
       "R has the wrong number of local rows. "
       "R.getLocalLength() = " << R.getLocalLength () << " != "
       "A.getRowMap()->getLocalNumElements() = " <<
       A.getRowMap ()->getLocalNumElements () << ".");
    TEUCHOS_TEST_FOR_EXCEPTION
      (B.getLocalLength () !=
       A.getRowMap ()->getLocalNumElements (), std::runtime_error,
       "B has the wrong number of local rows. "
       "B.getLocalLength() = " << B.getLocalLength () << " != "
       "A.getRowMap()->getLocalNumElements() = " <<
       A.getRowMap ()->getLocalNumElements () << ".");

    TEUCHOS_TEST_FOR_EXCEPTION
      (! A.isFillComplete (), std::runtime_error, "The matrix A is not "
       "fill complete. You must call fillComplete() (possibly with "
       "domain and range Map arguments) without an intervening "
       "resumeFill() call before you may call this method.");
    TEUCHOS_TEST_FOR_EXCEPTION
      (! X_colmap.isConstantStride () || ! R.isConstantStride () || ! B.isConstantStride (),
       std::runtime_error, "X, R, and B must be constant stride.");
    // If the two pointers are NULL, then they don't alias one
    // another, even though they are equal.
    TEUCHOS_TEST_FOR_EXCEPTION
      ((X_colmap_lcl.data () == R_lcl.data () && X_colmap_lcl.data () != nullptr) ||
       (X_colmap_lcl.data () == B_lcl.data () && X_colmap_lcl.data () != nullptr),
       std::runtime_error, "X, B, and R may not alias one another.");
  }

  if (A_lcl.numRows() == 0) {
    return;
  }

  int64_t numLocalRows = A_lcl.numRows ();
  int64_t myNnz = A_lcl.nnz();

  // //Check for imbalanced rows. If the logic for SPMV to use merge path is triggered,
  // //use it and follow the reference residual code
  // LO maxRowImbalance = 0;
  // if(numLocalRows != 0)
  //   maxRowImbalance = A.getLocalMaxNumRowEntries() - (myNnz / numLocalRows);
  // if(size_t(maxRowImbalance) >= Tpetra::Details::Behavior::rowImbalanceThreshold())
  // {
  //   //note: lclOp will be wrapped in shared_ptr
  //   auto lclOp = A.getLocalMultiplyOperator();
  //   //Call local SPMV, requesting merge path, through A's LocalCrsMatrixOperator
  //   lclOp->applyImbalancedRows (X_colmap_lcl, R_lcl, Teuchos::NO_TRANS, one, zero);
  //   R.update(one,B,negone);
  //   return;
  // }

  int team_size = -1;
  int vector_length = -1;
  int64_t rows_per_thread = -1;

  using execution_space = typename CrsMatrix<SC,LO,GO,NO>::execution_space;
  using policy_type = typename Kokkos::TeamPolicy<execution_space>;

  int64_t rows_per_team = KokkosSparse::Impl::spmv_launch_parameters<execution_space>(numLocalRows, myNnz, rows_per_thread, team_size, vector_length);
  int64_t worksets = (B_lcl.extent (0) + rows_per_team - 1) / rows_per_team;

  policy_type policy (1, 1);
  if (team_size < 0) {
    policy = policy_type (worksets, Kokkos::AUTO, vector_length);
  }
  else {
    policy = policy_type (worksets, team_size, vector_length);
  }

  bool is_vector = (X_colmap_lcl.extent(1) == 1);

  if (is_vector) {

    using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,false,true,true>;
    functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
    Kokkos::parallel_for("residual-vector",policy,func);

    RCP<const import_type> importer = A.getGraph ()->getImporter ();
    X_colmap.endImport (X_domainmap, *importer, INSERT, true);

    Kokkos::fence();

    using functor_type2 = OffRankUpdateFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,false>;
    functor_type2 func2 (A_lcl, X_colmap_lcl, R_lcl, rows_per_team, offsets);
    Kokkos::parallel_for("residual-vector-offrank",policy,func2);

  }
  else {

    using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,true,true,true>;
    functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
    Kokkos::parallel_for("residual-multivector",policy,func);

    RCP<const import_type> importer = A.getGraph ()->getImporter ();
    X_colmap.endImport (X_domainmap, *importer, INSERT, true);

    Kokkos::fence();

    using functor_type2 = OffRankUpdateFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,true>;
    functor_type2 func2 (A_lcl, X_colmap_lcl, R_lcl, rows_per_team, offsets);
    Kokkos::parallel_for("residual-multivector-offrank",policy,func2);

  }
}
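/// \brief Computes R = B - A * X
///
/// Usage sketch (an editorial addition, not from the original source;
/// assumes `A` is a fill-complete Tpetra::CrsMatrix and that the default
/// template parameters are enabled in this build):
/// \code
/// #include "Tpetra_Details_residual.hpp"
///
/// Teuchos::RCP<Tpetra::CrsMatrix<>> A = ...;  // assembled, fillComplete() called
/// Tpetra::MultiVector<> X (A->getDomainMap (), 1);
/// Tpetra::MultiVector<> B (A->getRangeMap (), 1);
/// Tpetra::MultiVector<> R (A->getRangeMap (), 1);
/// // ... fill X and B ...
/// Tpetra::Details::residual (*A, X, B, R);    // R = B - A*X
/// \endcode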
template<class SC, class LO, class GO, class NO>
void residual(const Operator<SC,LO,GO,NO> & Aop,
              const MultiVector<SC,LO,GO,NO> & X_in,
              const MultiVector<SC,LO,GO,NO> & B_in,
              MultiVector<SC,LO,GO,NO> & R_in) {
  using Teuchos::RCP;
  using Teuchos::rcp;
  using Teuchos::rcp_const_cast;
  using Teuchos::rcpFromRef;

  const bool debug = ::Tpetra::Details::Behavior::debug ();
  const bool skipCopyAndPermuteIfPossible = ::Tpetra::Details::Behavior::skipCopyAndPermuteIfPossible ();
  const bool overlapCommunicationAndComputation = ::Tpetra::Details::Behavior::overlapCommunicationAndComputation ();
  if (overlapCommunicationAndComputation)
    TEUCHOS_ASSERT(skipCopyAndPermuteIfPossible);

  // Whether we are using restrictedMode in the import from domain
  // to column map. Restricted mode skips the copy and permutation of
  // the local part of X. We use restrictedMode only when the domain
  // and column Maps are locally fitted, i.e., when the initial local
  // indices of the two Maps match.
  bool restrictedMode = false;
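  // Editorial example of "locally fitted" (illustrative GIDs): if a process
  // owns domain-map entries [7, 8, 9] and its column map lists [7, 8, 9, 42]
  // (owned entries first, then the halo), the Maps are locally fitted; the
  // owned part of X can then be read in place, and only entry 42 needs to be
  // communicated.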

  const CrsMatrix<SC,LO,GO,NO> * Apt = dynamic_cast<const CrsMatrix<SC,LO,GO,NO>*>(&Aop);
  if (!Apt) {
    // If we're not a CrsMatrix, we can't do fusion, so just do apply + update.
    SC one = Teuchos::ScalarTraits<SC>::one(), negone = -one, zero = Teuchos::ScalarTraits<SC>::zero();
    Aop.apply(X_in,R_in,Teuchos::NO_TRANS, one, zero);
    R_in.update(one,B_in,negone);
    return;
  }
  const CrsMatrix<SC,LO,GO,NO> & A = *Apt;

  using import_type = typename CrsMatrix<SC,LO,GO,NO>::import_type;
  using export_type = typename CrsMatrix<SC,LO,GO,NO>::export_type;
  using MV = MultiVector<SC,LO,GO,NO>;
  using graph_type = Tpetra::CrsGraph<LO,GO,NO>;
  using offset_type = typename graph_type::offset_device_view_type;

  // We treat the case of a replicated MV output specially.
  const bool R_is_replicated =
    (! R_in.isDistributed () && A.getComm ()->getSize () != 1);

  // It's possible that R is a view of X or B.
  // We don't try to detect the more subtle cases (e.g., one is a
  // subview of the other, but their initial pointers differ). We
  // only need to do this if this matrix's Import is trivial;
  // otherwise, we don't actually apply the operator from X into R.

  RCP<const import_type> importer = A.getGraph ()->getImporter ();
  RCP<const export_type> exporter = A.getGraph ()->getExporter ();

  // Temporary MV for the Import operation. After the block of code
  // below, this will be an (Imported, if necessary) column Map MV
  // ready to give to localApply(...).
  RCP<MV> X_colMap;
  if (importer.is_null ()) {
    if (! X_in.isConstantStride ()) {
      // Not all sparse mat-vec kernels can handle an input MV with
      // nonconstant stride correctly, so we have to copy it in that
      // case into a constant stride MV. To make a constant stride
      // copy of X_in, we force creation of the column (== domain)
      // Map MV (if it hasn't already been created, else fetch the
      // cached copy). This avoids creating a new MV each time.
      X_colMap = A.getColumnMapMultiVector (X_in, true);
      Tpetra::deep_copy (*X_colMap, X_in);
    }
    else {
      // The domain and column Maps are the same, so do the local
      // multiply using the domain Map input MV X_in.
      X_colMap = rcp_const_cast<MV> (rcpFromRef (X_in) );
    }
  }
  else { // need to Import source (multi)vector
    ProfilingRegion regionImport ("Tpetra::CrsMatrix::residual: Import");
    // We're doing an Import anyway, which will copy the relevant
    // elements of the domain Map MV X_in into a separate column Map
    // MV. Thus, we don't have to worry whether X_in is constant
    // stride.
    X_colMap = A.getColumnMapMultiVector (X_in);

    // Do we want to use restrictedMode?
    restrictedMode = skipCopyAndPermuteIfPossible && importer->isLocallyFitted();

    if (debug && restrictedMode) {
      TEUCHOS_TEST_FOR_EXCEPTION
        (!importer->getTargetMap()->isLocallyFitted(*importer->getSourceMap()), std::runtime_error,
         "Source map and target map are not locally fitted, but Tpetra::residual thinks they are.");
    }

    // Import from the domain Map MV to the column Map MV.
    X_colMap->beginImport (X_in, *importer, INSERT, restrictedMode);
  }

  offset_type offsets;
  if (restrictedMode)
    A.getCrsGraph()->getLocalOffRankOffsets(offsets);

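  // Editorial note (inferred from how the functors use `offsets`): offsets(i)
  // is the position within row i's entry list where the off-rank (halo)
  // columns begin; entries before that position have columns owned by this
  // process, so they can be read from the domain-map view of X.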
  // Get a vector for the R_rowMap output residual, handling the
  // non-constant stride and exporter cases. Since R gets clobbered,
  // we don't need to worry about the data in it.
  RCP<MV> R_rowMap;
  if (exporter.is_null()) {
    if (! R_in.isConstantStride ()) {
      R_rowMap = A.getRowMapMultiVector(R_in);
    }
    else {
      R_rowMap = rcpFromRef (R_in);
    }
  }
  else {
    R_rowMap = A.getRowMapMultiVector (R_in);
  }

  // Get a vector for B in the row Map, handling the non-constant
  // stride and exporter cases.
  RCP<const MV> B_rowMap;
  if (exporter.is_null()) {
    if (! B_in.isConstantStride ()) {
      // Do an allocation here. If we need to optimize this later, we can have
      // the matrix cache this.
      RCP<MV> B_rowMapNonConst = rcp(new MV(A.getRowMap(),B_in.getNumVectors()));
      Tpetra::deep_copy (*B_rowMapNonConst, B_in);
      B_rowMap = rcp_const_cast<const MV> (B_rowMapNonConst);
    }
    else {
      B_rowMap = rcpFromRef (B_in);
    }
  }
  else {
    // Do an allocation here. If we need to optimize this later, we can have
    // the matrix cache this.
    ProfilingRegion regionExport ("Tpetra::CrsMatrix::residual: B Import");
    RCP<MV> B_rowMapNonConst = rcp(new MV(A.getRowMap(),B_in.getNumVectors()));
    B_rowMapNonConst->doImport(B_in, *exporter, ADD);
    B_rowMap = rcp_const_cast<const MV> (B_rowMapNonConst);
  }

  // If we have a nontrivial Export object, we must perform an
  // Export. In that case, the local multiply result will go into
  // the row Map multivector. We don't have to make a
  // constant-stride version of R_in in this case, because we had to
  // make a constant stride R_rowMap MV and do an Export anyway.
  if (! exporter.is_null ()) {
    if ( ! importer.is_null ())
      X_colMap->endImport (X_in, *importer, INSERT, restrictedMode);
    if (restrictedMode && !importer.is_null ())
      localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, offsets, &X_in);
    else
      localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, offsets);

    {
      ProfilingRegion regionExport ("Tpetra::CrsMatrix::residual: R Export");

      // Do the Export operation.
      R_in.doExport (*R_rowMap, *exporter, ADD);
    }
  }
  else { // Don't do an Export: row Map and range Map are the same.

    if (restrictedMode) {
      if (overlapCommunicationAndComputation) {
        localResidualWithCommCompOverlap (A, *X_colMap, *B_rowMap, *R_rowMap, offsets, X_in);
      } else {
        X_colMap->endImport (X_in, *importer, INSERT, restrictedMode);
        localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, offsets, &X_in);
      }
    }
    else {
      if ( ! importer.is_null ())
        X_colMap->endImport (X_in, *importer, INSERT, restrictedMode);
      localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, offsets);
    }

    // If R_in does not have constant stride, then we can't let the
    // kernel write directly to R_in. Instead, we have to use the
    // cached row (== range) Map MV as temporary storage.
    if (! R_in.isConstantStride () ) {
      // We need to be sure to do a copy out in this case.
      Tpetra::deep_copy (R_in, *R_rowMap);
    }
  }

  // If the range Map is a locally replicated Map, sum up
  // contributions from each process.
  if (R_is_replicated) {
    ProfilingRegion regionReduce ("Tpetra::CrsMatrix::residual: Reduce Y");
    R_in.reduce ();
  }
}

  } // namespace Details
} // namespace Tpetra

#endif // TPETRA_DETAILS_RESIDUAL_HPP