Tpetra parallel linear algebra Version of the Day
Loading...
Searching...
No Matches
TpetraExt_MatrixMatrix_Cuda.hpp
1// @HEADER
2// ***********************************************************************
3//
4// Tpetra: Templated Linear Algebra Services Package
5// Copyright (2008) Sandia Corporation
6//
7// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8// the U.S. Government retains certain rights in this software.
9//
10// Redistribution and use in source and binary forms, with or without
11// modification, are permitted provided that the following conditions are
12// met:
13//
14// 1. Redistributions of source code must retain the above copyright
15// notice, this list of conditions and the following disclaimer.
16//
17// 2. Redistributions in binary form must reproduce the above copyright
18// notice, this list of conditions and the following disclaimer in the
19// documentation and/or other materials provided with the distribution.
20//
21// 3. Neither the name of the Corporation nor the names of the
22// contributors may be used to endorse or promote products derived from
23// this software without specific prior written permission.
24//
25// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36//
37// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38//
39// ************************************************************************
40// @HEADER
41
42#ifndef TPETRA_MATRIXMATRIX_CUDA_DEF_HPP
43#define TPETRA_MATRIXMATRIX_CUDA_DEF_HPP
44
45#ifdef HAVE_TPETRA_INST_CUDA
46namespace Tpetra {
47namespace MMdetails {
48
49/*********************************************************************************************************/
50// MMM KernelWrappers for Partial Specialization to CUDA
51template<class Scalar,
52 class LocalOrdinal,
53 class GlobalOrdinal,
54 class LocalOrdinalViewType>
55struct KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode,LocalOrdinalViewType> {
56 static inline void mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
57 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
58 const LocalOrdinalViewType & Acol2Brow,
59 const LocalOrdinalViewType & Acol2Irow,
60 const LocalOrdinalViewType & Bcol2Ccol,
61 const LocalOrdinalViewType & Icol2Ccol,
62 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
63 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
64 const std::string& label = std::string(),
65 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
66
67
68
69 static inline void mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
70 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
71 const LocalOrdinalViewType & Acol2Brow,
72 const LocalOrdinalViewType & Acol2Irow,
73 const LocalOrdinalViewType & Bcol2Ccol,
74 const LocalOrdinalViewType & Icol2Ccol,
75 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
76 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
77 const std::string& label = std::string(),
78 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
79
80};
81
82// Jacobi KernelWrappers for Partial Specialization to Cuda
83template<class Scalar,
84 class LocalOrdinal,
85 class GlobalOrdinal, class LocalOrdinalViewType>
86struct KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode,LocalOrdinalViewType> {
87 static inline void jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
88 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> & Dinv,
89 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
90 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
91 const LocalOrdinalViewType & Acol2Brow,
92 const LocalOrdinalViewType & Acol2Irow,
93 const LocalOrdinalViewType & Bcol2Ccol,
94 const LocalOrdinalViewType & Icol2Ccol,
95 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
96 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
97 const std::string& label = std::string(),
98 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
99
100 static inline void jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
101 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> & Dinv,
102 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
103 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
104 const LocalOrdinalViewType & Acol2Brow,
105 const LocalOrdinalViewType & Acol2Irow,
106 const LocalOrdinalViewType & Bcol2Ccol,
107 const LocalOrdinalViewType & Icol2Ccol,
108 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
109 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
110 const std::string& label = std::string(),
111 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
112
113 static inline void jacobi_A_B_newmatrix_KokkosKernels(Scalar omega,
114 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> & Dinv,
115 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
116 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
117 const LocalOrdinalViewType & Acol2Brow,
118 const LocalOrdinalViewType & Acol2Irow,
119 const LocalOrdinalViewType & Bcol2Ccol,
120 const LocalOrdinalViewType & Icol2Ccol,
121 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
122 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
123 const std::string& label = std::string(),
124 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
125};
126
127
128/*********************************************************************************************************/
129// AB NewMatrix Kernel wrappers (KokkosKernels/CUDA Version)
130template<class Scalar,
131 class LocalOrdinal,
132 class GlobalOrdinal,
133 class LocalOrdinalViewType>
134void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode,LocalOrdinalViewType>::mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
135 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
136 const LocalOrdinalViewType & Acol2Brow,
137 const LocalOrdinalViewType & Acol2Irow,
138 const LocalOrdinalViewType & Bcol2Ccol,
139 const LocalOrdinalViewType & Icol2Ccol,
140 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
141 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
142 const std::string& label,
143 const Teuchos::RCP<Teuchos::ParameterList>& params) {
144
145
146#ifdef HAVE_TPETRA_MMM_TIMINGS
147 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
148 using Teuchos::TimeMonitor;
149 Teuchos::RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix CudaWrapper")))));
150#endif
151 // Node-specific code
152 typedef Kokkos::Compat::KokkosCudaWrapperNode Node;
153 std::string nodename("Cuda");
154
155 // Lots and lots of typedefs
156 using Teuchos::RCP;
158 typedef typename KCRS::device_type device_t;
159 typedef typename KCRS::StaticCrsGraphType graph_t;
160 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
161 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
162 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
163 typedef typename KCRS::values_type::non_const_type scalar_view_t;
164 //typedef typename graph_t::row_map_type::const_type lno_view_t_const;
165
166 // Options
167 int team_work_size = 16; // Defaults to 16 as per Deveci 12/7/16 - csiefer
168 std::string myalg("SPGEMM_KK_MEMORY");
169 if(!params.is_null()) {
170 if(params->isParameter("cuda: algorithm"))
171 myalg = params->get("cuda: algorithm",myalg);
172 if(params->isParameter("cuda: team work size"))
173 team_work_size = params->get("cuda: team work size",team_work_size);
174 }
175
176 // KokkosKernelsHandle
177 typedef KokkosKernels::Experimental::KokkosKernelsHandle<
178 typename lno_view_t::const_value_type,typename lno_nnz_view_t::const_value_type, typename scalar_view_t::const_value_type,
179 typename device_t::execution_space, typename device_t::memory_space,typename device_t::memory_space > KernelHandle;
180
181 // Grab the Kokkos::SparseCrsMatrices
182 const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
183 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixDevice();
184
185 c_lno_view_t Arowptr = Amat.graph.row_map,
186 Browptr = Bmat.graph.row_map;
187 const lno_nnz_view_t Acolind = Amat.graph.entries,
188 Bcolind = Bmat.graph.entries;
189 const scalar_view_t Avals = Amat.values,
190 Bvals = Bmat.values;
191
192 c_lno_view_t Irowptr;
193 lno_nnz_view_t Icolind;
194 scalar_view_t Ivals;
195 if(!Bview.importMatrix.is_null()) {
196 auto lclB = Bview.importMatrix->getLocalMatrixDevice();
197 Irowptr = lclB.graph.row_map;
198 Icolind = lclB.graph.entries;
199 Ivals = lclB.values;
200 }
201
202
203 // Get the algorithm mode
204 std::string alg = nodename+std::string(" algorithm");
205 // printf("DEBUG: Using kernel: %s\n",myalg.c_str());
206 if(!params.is_null() && params->isParameter(alg)) myalg = params->get(alg,myalg);
207 KokkosSparse::SPGEMMAlgorithm alg_enum = KokkosSparse::StringToSPGEMMAlgorithm(myalg);
208
209 // Merge the B and Bimport matrices
210 const KCRS Bmerged = Tpetra::MMdetails::merge_matrices(Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C.getColMap()->getLocalNumElements());
211
212#ifdef HAVE_TPETRA_MMM_TIMINGS
213 MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix CudaCore"))));
214#endif
215
216 // Do the multiply on whatever we've got
217 typename KernelHandle::nnz_lno_t AnumRows = Amat.numRows();
218 typename KernelHandle::nnz_lno_t BnumRows = Bmerged.numRows();
219 typename KernelHandle::nnz_lno_t BnumCols = Bmerged.numCols();
220
221 lno_view_t row_mapC (Kokkos::ViewAllocateWithoutInitializing("non_const_lnow_row"), AnumRows + 1);
222 lno_nnz_view_t entriesC;
223 scalar_view_t valuesC;
224 KernelHandle kh;
225 kh.create_spgemm_handle(alg_enum);
226 kh.set_team_work_size(team_work_size);
227
228 KokkosSparse::Experimental::spgemm_symbolic(&kh,AnumRows,BnumRows,BnumCols,Amat.graph.row_map,Amat.graph.entries,false,Bmerged.graph.row_map,Bmerged.graph.entries,false,row_mapC);
229
230 size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
231 if (c_nnz_size){
232 entriesC = lno_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size);
233 valuesC = scalar_view_t (Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size);
234 }
235 KokkosSparse::Experimental::spgemm_numeric(&kh,AnumRows,BnumRows,BnumCols,Amat.graph.row_map,Amat.graph.entries,Amat.values,false,Bmerged.graph.row_map,Bmerged.graph.entries,Bmerged.values,false,row_mapC,entriesC,valuesC);
236 kh.destroy_spgemm_handle();
237
238#ifdef HAVE_TPETRA_MMM_TIMINGS
239 MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix CudaSort"))));
240#endif
241
242 // Sort & set values
243 if (params.is_null() || params->get("sort entries",true))
244 Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
245 C.setAllValues(row_mapC,entriesC,valuesC);
246
247#ifdef HAVE_TPETRA_MMM_TIMINGS
248 MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix CudaESFC"))));
249#endif
250
251 // Final Fillcomplete
252 RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
253 labelList->set("Timer Label",label);
254 if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
255 RCP<const Export<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > dummyExport;
256 C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
257}
258
259
260/*********************************************************************************************************/
261template<class Scalar,
262 class LocalOrdinal,
263 class GlobalOrdinal,
264 class LocalOrdinalViewType>
265void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode,LocalOrdinalViewType>::mult_A_B_reuse_kernel_wrapper(
266 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
267 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
268 const LocalOrdinalViewType & targetMapToOrigRow_dev,
269 const LocalOrdinalViewType & targetMapToImportRow_dev,
270 const LocalOrdinalViewType & Bcol2Ccol_dev,
271 const LocalOrdinalViewType & Icol2Ccol_dev,
272 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
273 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
274 const std::string& label,
275 const Teuchos::RCP<Teuchos::ParameterList>& params) {
276
277 // FIXME: Right now, this is a cut-and-paste of the serial kernel
278 typedef Kokkos::Compat::KokkosCudaWrapperNode Node;
279
280#ifdef HAVE_TPETRA_MMM_TIMINGS
281 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
282 using Teuchos::TimeMonitor;
283 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse SerialCore"))));
284 Teuchos::RCP<Teuchos::TimeMonitor> MM2;
285#endif
286 using Teuchos::RCP;
287 using Teuchos::rcp;
288
289
290 // Lots and lots of typedefs
291 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
292 typedef typename KCRS::StaticCrsGraphType graph_t;
293 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
294 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
295 typedef typename KCRS::values_type::non_const_type scalar_view_t;
296
297 typedef Scalar SC;
298 typedef LocalOrdinal LO;
299 typedef GlobalOrdinal GO;
300 typedef Node NO;
301 typedef Map<LO,GO,NO> map_type;
302 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
303 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
304 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
305
306 // Since this is being run on Cuda, we need to fence because the below code will use UVM
307 // typename graph_t::execution_space().fence();
308
309 // KDDKDD UVM Without UVM, need to copy targetMap arrays to host.
310 // KDDKDD UVM Ideally, this function would run on device and use
311 // KDDKDD UVM KokkosKernels instead of this host implementation.
312 auto targetMapToOrigRow =
313 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
314 targetMapToOrigRow_dev);
315 auto targetMapToImportRow =
316 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
317 targetMapToImportRow_dev);
318 auto Bcol2Ccol =
319 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
320 Bcol2Ccol_dev);
321 auto Icol2Ccol =
322 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
323 Icol2Ccol_dev);
324
325 // Sizes
326 RCP<const map_type> Ccolmap = C.getColMap();
327 size_t m = Aview.origMatrix->getLocalNumRows();
328 size_t n = Ccolmap->getLocalNumElements();
329
330 // Grab the Kokkos::SparseCrsMatrices & inner stuff
331 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
332 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
333 const KCRS & Cmat = C.getLocalMatrixHost();
334
335 c_lno_view_t Arowptr = Amat.graph.row_map,
336 Browptr = Bmat.graph.row_map,
337 Crowptr = Cmat.graph.row_map;
338 const lno_nnz_view_t Acolind = Amat.graph.entries,
339 Bcolind = Bmat.graph.entries,
340 Ccolind = Cmat.graph.entries;
341 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
342 scalar_view_t Cvals = Cmat.values;
343
344 c_lno_view_t Irowptr;
345 lno_nnz_view_t Icolind;
346 scalar_view_t Ivals;
347 if(!Bview.importMatrix.is_null()) {
348 auto lclB = Bview.importMatrix->getLocalMatrixHost();
349 Irowptr = lclB.graph.row_map;
350 Icolind = lclB.graph.entries;
351 Ivals = lclB.values;
352 }
353
354#ifdef HAVE_TPETRA_MMM_TIMINGS
355 MM2 = Teuchos::null; MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore - Compare"))));
356#endif
357
358 // Classic csr assembly (low memory edition)
359 // mfh 27 Sep 2016: The c_status array is an implementation detail
360 // of the local sparse matrix-matrix multiply routine.
361
362 // The status array will contain the index into colind where this entry was last deposited.
363 // c_status[i] < CSR_ip - not in the row yet
364 // c_status[i] >= CSR_ip - this is the entry where you can find the data
365 // We start with this filled with INVALID's indicating that there are no entries yet.
366 // Sadly, this complicates the code due to the fact that size_t's are unsigned.
367 std::vector<size_t> c_status(n, ST_INVALID);
368
369 // For each row of A/C
370 size_t CSR_ip = 0, OLD_ip = 0;
371 for (size_t i = 0; i < m; i++) {
372 // First fill the c_status array w/ locations where we're allowed to
373 // generate nonzeros for this row
374 OLD_ip = Crowptr[i];
375 CSR_ip = Crowptr[i+1];
376 for (size_t k = OLD_ip; k < CSR_ip; k++) {
377 c_status[Ccolind[k]] = k;
378
379 // Reset values in the row of C
380 Cvals[k] = SC_ZERO;
381 }
382
383 for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
384 LO Aik = Acolind[k];
385 const SC Aval = Avals[k];
386 if (Aval == SC_ZERO)
387 continue;
388
389 if (targetMapToOrigRow[Aik] != LO_INVALID) {
390 // Local matrix
391 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow[Aik]);
392
393 for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
394 LO Bkj = Bcolind[j];
395 LO Cij = Bcol2Ccol[Bkj];
396
397 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
398 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
399 "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
400
401 Cvals[c_status[Cij]] += Aval * Bvals[j];
402 }
403
404 } else {
405 // Remote matrix
406 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow[Aik]);
407 for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
408 LO Ikj = Icolind[j];
409 LO Cij = Icol2Ccol[Ikj];
410
411 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
412 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
413 "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
414
415 Cvals[c_status[Cij]] += Aval * Ivals[j];
416 }
417 }
418 }
419 }
420
421 C.fillComplete(C.getDomainMap(), C.getRangeMap());
422}
423
424/*********************************************************************************************************/
425template<class Scalar,
426 class LocalOrdinal,
427 class GlobalOrdinal,
428 class LocalOrdinalViewType>
429void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode,LocalOrdinalViewType>::jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
430 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> & Dinv,
431 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
432 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
433 const LocalOrdinalViewType & Acol2Brow,
434 const LocalOrdinalViewType & Acol2Irow,
435 const LocalOrdinalViewType & Bcol2Ccol,
436 const LocalOrdinalViewType & Icol2Ccol,
437 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
438 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
439 const std::string& label,
440 const Teuchos::RCP<Teuchos::ParameterList>& params) {
441
442#ifdef HAVE_TPETRA_MMM_TIMINGS
443 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
444 using Teuchos::TimeMonitor;
445 Teuchos::RCP<TimeMonitor> MM;
446#endif
447
448 // Node-specific code
449 using Teuchos::RCP;
450
451 // Options
452 //int team_work_size = 16; // Defaults to 16 as per Deveci 12/7/16 - csiefer // unreferenced
453 std::string myalg("KK");
454 if(!params.is_null()) {
455 if(params->isParameter("cuda: jacobi algorithm"))
456 myalg = params->get("cuda: jacobi algorithm",myalg);
457 }
458
459 if(myalg == "MSAK") {
460 ::Tpetra::MatrixMatrix::ExtraKernels::jacobi_A_B_newmatrix_MultiplyScaleAddKernel(omega,Dinv,Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
461 }
462 else if(myalg == "KK") {
463 jacobi_A_B_newmatrix_KokkosKernels(omega,Dinv,Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
464 }
465 else {
466 throw std::runtime_error("Tpetra::MatrixMatrix::Jacobi newmatrix unknown kernel");
467 }
468
469#ifdef HAVE_TPETRA_MMM_TIMINGS
470 MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix CudaESFC"))));
471#endif
472
473 // Final Fillcomplete
474 RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
475 labelList->set("Timer Label",label);
476 if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
477
478 // NOTE: MSAK already fillCompletes, so we have to check here
479 if(!C.isFillComplete()) {
480 RCP<const Export<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > dummyExport;
481 C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
482 }
483
484}
485
486
487
488/*********************************************************************************************************/
489template<class Scalar,
490 class LocalOrdinal,
491 class GlobalOrdinal,
492 class LocalOrdinalViewType>
493void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode,LocalOrdinalViewType>::jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
494 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> & Dinv,
495 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
496 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
497 const LocalOrdinalViewType & targetMapToOrigRow_dev,
498 const LocalOrdinalViewType & targetMapToImportRow_dev,
499 const LocalOrdinalViewType & Bcol2Ccol_dev,
500 const LocalOrdinalViewType & Icol2Ccol_dev,
501 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
502 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
503 const std::string& label,
504 const Teuchos::RCP<Teuchos::ParameterList>& params) {
505
506 // FIXME: Right now, this is a cut-and-paste of the serial kernel
507 typedef Kokkos::Compat::KokkosCudaWrapperNode Node;
508
509#ifdef HAVE_TPETRA_MMM_TIMINGS
510 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
511 using Teuchos::TimeMonitor;
512 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse CudaCore"))));
513 Teuchos::RCP<Teuchos::TimeMonitor> MM2;
514#endif
515 using Teuchos::RCP;
516 using Teuchos::rcp;
517
518 // Lots and lots of typedefs
519 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
520 typedef typename KCRS::StaticCrsGraphType graph_t;
521 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
522 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
523 typedef typename KCRS::values_type::non_const_type scalar_view_t;
524 typedef typename scalar_view_t::memory_space scalar_memory_space;
525
526 typedef Scalar SC;
527 typedef LocalOrdinal LO;
528 typedef GlobalOrdinal GO;
529 typedef Node NO;
530 typedef Map<LO,GO,NO> map_type;
531 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
532 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
533 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
534
535 // Since this is being run on Cuda, we need to fence because the below host code will use UVM
536 // KDDKDD typename graph_t::execution_space().fence();
537
538 // KDDKDD UVM Without UVM, need to copy targetMap arrays to host.
539 // KDDKDD UVM Ideally, this function would run on device and use
540 // KDDKDD UVM KokkosKernels instead of this host implementation.
541 auto targetMapToOrigRow =
542 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
543 targetMapToOrigRow_dev);
544 auto targetMapToImportRow =
545 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
546 targetMapToImportRow_dev);
547 auto Bcol2Ccol =
548 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
549 Bcol2Ccol_dev);
550 auto Icol2Ccol =
551 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
552 Icol2Ccol_dev);
553
554
555 // Sizes
556 RCP<const map_type> Ccolmap = C.getColMap();
557 size_t m = Aview.origMatrix->getLocalNumRows();
558 size_t n = Ccolmap->getLocalNumElements();
559
560 // Grab the Kokkos::SparseCrsMatrices & inner stuff
561 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
562 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
563 const KCRS & Cmat = C.getLocalMatrixHost();
564
565 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
566 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
567 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
568 scalar_view_t Cvals = Cmat.values;
569
570 c_lno_view_t Irowptr;
571 lno_nnz_view_t Icolind;
572 scalar_view_t Ivals;
573 if(!Bview.importMatrix.is_null()) {
574 auto lclB = Bview.importMatrix->getLocalMatrixHost();
575 Irowptr = lclB.graph.row_map;
576 Icolind = lclB.graph.entries;
577 Ivals = lclB.values;
578 }
579
580 // Jacobi-specific inner stuff
581 auto Dvals =
582 Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
583
584#ifdef HAVE_TPETRA_MMM_TIMINGS
585 MM2 = Teuchos::null; MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse CudaCore - Compare"))));
586#endif
587
588 // The status array will contain the index into colind where this entry was last deposited.
589 // c_status[i] < CSR_ip - not in the row yet
590 // c_status[i] >= CSR_ip - this is the entry where you can find the data
591 // We start with this filled with INVALID's indicating that there are no entries yet.
592 // Sadly, this complicates the code due to the fact that size_t's are unsigned.
593 std::vector<size_t> c_status(n, ST_INVALID);
594
595 // For each row of A/C
596 size_t CSR_ip = 0, OLD_ip = 0;
597 for (size_t i = 0; i < m; i++) {
598
599 // First fill the c_status array w/ locations where we're allowed to
600 // generate nonzeros for this row
601 OLD_ip = Crowptr[i];
602 CSR_ip = Crowptr[i+1];
603 for (size_t k = OLD_ip; k < CSR_ip; k++) {
604 c_status[Ccolind[k]] = k;
605
606 // Reset values in the row of C
607 Cvals[k] = SC_ZERO;
608 }
609
610 SC minusOmegaDval = -omega*Dvals(i,0);
611
612 // Entries of B
613 for (size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
614 Scalar Bval = Bvals[j];
615 if (Bval == SC_ZERO)
616 continue;
617 LO Bij = Bcolind[j];
618 LO Cij = Bcol2Ccol[Bij];
619
620 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
621 std::runtime_error, "Trying to insert a new entry into a static graph");
622
623 Cvals[c_status[Cij]] = Bvals[j];
624 }
625
626 // Entries of -omega * Dinv * A * B
627 for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
628 LO Aik = Acolind[k];
629 const SC Aval = Avals[k];
630 if (Aval == SC_ZERO)
631 continue;
632
633 if (targetMapToOrigRow[Aik] != LO_INVALID) {
634 // Local matrix
635 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow[Aik]);
636
637 for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
638 LO Bkj = Bcolind[j];
639 LO Cij = Bcol2Ccol[Bkj];
640
641 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
642 std::runtime_error, "Trying to insert a new entry into a static graph");
643
644 Cvals[c_status[Cij]] += minusOmegaDval * Aval * Bvals[j];
645 }
646
647 } else {
648 // Remote matrix
649 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow[Aik]);
650 for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
651 LO Ikj = Icolind[j];
652 LO Cij = Icol2Ccol[Ikj];
653
654 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
655 std::runtime_error, "Trying to insert a new entry into a static graph");
656
657 Cvals[c_status[Cij]] += minusOmegaDval * Aval * Ivals[j];
658 }
659 }
660 }
661 }
662
663#ifdef HAVE_TPETRA_MMM_TIMINGS
664 MM2= Teuchos::null;
665 MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse ESFC"))));
666#endif
667
668 C.fillComplete(C.getDomainMap(), C.getRangeMap());
669
670}
671
672/*********************************************************************************************************/
673template<class Scalar,
674 class LocalOrdinal,
675 class GlobalOrdinal,
676 class LocalOrdinalViewType>
677void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode,LocalOrdinalViewType>::jacobi_A_B_newmatrix_KokkosKernels(Scalar omega,
678 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> & Dinv,
679 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
680 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
681 const LocalOrdinalViewType & Acol2Brow,
682 const LocalOrdinalViewType & Acol2Irow,
683 const LocalOrdinalViewType & Bcol2Ccol,
684 const LocalOrdinalViewType & Icol2Ccol,
685 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
686 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
687 const std::string& label,
688 const Teuchos::RCP<Teuchos::ParameterList>& params) {
689
690#ifdef HAVE_TPETRA_MMM_TIMINGS
691 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
692 using Teuchos::TimeMonitor;
693 Teuchos::RCP<TimeMonitor> MM;
694#endif
695
696 // Check if the diagonal entries exist in debug mode
697 const bool debug = Tpetra::Details::Behavior::debug();
698 if(debug) {
699
700 auto rowMap = Aview.origMatrix->getRowMap();
701 Tpetra::Vector<Scalar> diags(rowMap);
702 Aview.origMatrix->getLocalDiagCopy(diags);
703 size_t diagLength = rowMap->getLocalNumElements();
704 Teuchos::Array<Scalar> diagonal(diagLength);
705 diags.get1dCopy(diagonal());
706
707 for(size_t i = 0; i < diagLength; ++i) {
708 TEUCHOS_TEST_FOR_EXCEPTION(diagonal[i] == Teuchos::ScalarTraits<Scalar>::zero(),
709 std::runtime_error,
710 "Matrix A has a zero/missing diagonal: " << diagonal[i] << std::endl <<
711 "KokkosKernels Jacobi-fused SpGEMM requires nonzero diagonal entries in A" << std::endl);
712 }
713 }
714
715 // Usings
716 using device_t = typename Kokkos::Compat::KokkosCudaWrapperNode::device_type;
718 using graph_t = typename matrix_t::StaticCrsGraphType;
719 using lno_view_t = typename graph_t::row_map_type::non_const_type;
720 using c_lno_view_t = typename graph_t::row_map_type::const_type;
721 using lno_nnz_view_t = typename graph_t::entries_type::non_const_type;
722 using scalar_view_t = typename matrix_t::values_type::non_const_type;
723
724 // KokkosKernels handle
725 using handle_t = typename KokkosKernels::Experimental::KokkosKernelsHandle<
726 typename lno_view_t::const_value_type,typename lno_nnz_view_t::const_value_type, typename scalar_view_t::const_value_type,
727 typename device_t::execution_space, typename device_t::memory_space,typename device_t::memory_space >;
728
729 // Get the rowPtr, colInd and vals of importMatrix
730 c_lno_view_t Irowptr;
731 lno_nnz_view_t Icolind;
732 scalar_view_t Ivals;
733 if(!Bview.importMatrix.is_null()) {
734 auto lclB = Bview.importMatrix->getLocalMatrixDevice();
735 Irowptr = lclB.graph.row_map;
736 Icolind = lclB.graph.entries;
737 Ivals = lclB.values;
738 }
739
740 // Merge the B and Bimport matrices
741 const matrix_t Bmerged = Tpetra::MMdetails::merge_matrices(Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C.getColMap()->getLocalNumElements());
742
743 // Get the properties and arrays of input matrices
744 const matrix_t & Amat = Aview.origMatrix->getLocalMatrixDevice();
745 const matrix_t & Bmat = Bview.origMatrix->getLocalMatrixDevice();
746
747 typename handle_t::nnz_lno_t AnumRows = Amat.numRows();
748 typename handle_t::nnz_lno_t BnumRows = Bmerged.numRows();
749 typename handle_t::nnz_lno_t BnumCols = Bmerged.numCols();
750
751 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmerged.graph.row_map;
752 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmerged.graph.entries;
753 const scalar_view_t Avals = Amat.values, Bvals = Bmerged.values;
754
755 // Arrays of the output matrix
756 lno_view_t row_mapC (Kokkos::ViewAllocateWithoutInitializing("non_const_lnow_row"), AnumRows + 1);
757 lno_nnz_view_t entriesC;
758 scalar_view_t valuesC;
759
760 // Options
761 int team_work_size = 16;
762 std::string myalg("SPGEMM_KK_MEMORY");
763 if(!params.is_null()) {
764 if(params->isParameter("cuda: algorithm"))
765 myalg = params->get("cuda: algorithm",myalg);
766 if(params->isParameter("cuda: team work size"))
767 team_work_size = params->get("cuda: team work size",team_work_size);
768 }
769
770 // Get the algorithm mode
771 std::string nodename("Cuda");
772 std::string alg = nodename + std::string(" algorithm");
773 if(!params.is_null() && params->isParameter(alg)) myalg = params->get(alg,myalg);
774 KokkosSparse::SPGEMMAlgorithm alg_enum = KokkosSparse::StringToSPGEMMAlgorithm(myalg);
775
776
777 // KokkosKernels call
778 handle_t kh;
779 kh.create_spgemm_handle(alg_enum);
780 kh.set_team_work_size(team_work_size);
781
782 KokkosSparse::Experimental::spgemm_symbolic(&kh, AnumRows, BnumRows, BnumCols,
783 Arowptr, Acolind, false,
784 Browptr, Bcolind, false,
785 row_mapC);
786
787 size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
788 if (c_nnz_size){
789 entriesC = lno_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size);
790 valuesC = scalar_view_t (Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size);
791 }
792
793 KokkosSparse::Experimental::spgemm_jacobi(&kh, AnumRows, BnumRows, BnumCols,
794 Arowptr, Acolind, Avals, false,
795 Browptr, Bcolind, Bvals, false,
796 row_mapC, entriesC, valuesC,
797 omega, Dinv.getLocalViewDevice(Access::ReadOnly));
798 kh.destroy_spgemm_handle();
799
800#ifdef HAVE_TPETRA_MMM_TIMINGS
801 MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix CudaSort"))));
802#endif
803
804 // Sort & set values
805 if (params.is_null() || params->get("sort entries",true))
806 Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
807 C.setAllValues(row_mapC,entriesC,valuesC);
808
809#ifdef HAVE_TPETRA_MMM_TIMINGS
810 MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix CudaESFC"))));
811#endif
812
813 // Final Fillcomplete
814 Teuchos::RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
815 labelList->set("Timer Label",label);
816 if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
817 Teuchos::RCP<const Export<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > dummyExport;
818 C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
819}
820
821 }//MMdetails
822}//Tpetra
823
824#endif//CUDA
825
826#endif
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
static bool debug()
Whether Tpetra is in debug mode.
A distributed dense vector.
Namespace Tpetra contains the class and methods constituting the Tpetra library.