Tpetra parallel linear algebra Version of the Day
Loading...
Searching...
No Matches
TpetraExt_MatrixMatrix_HIP.hpp
1// @HEADER
2// ***********************************************************************
3//
4// Tpetra: Templated Linear Algebra Services Package
5// Copyright (2008) Sandia Corporation
6//
7// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8// the U.S. Government retains certain rights in this software.
9//
10// Redistribution and use in source and binary forms, with or without
11// modification, are permitted provided that the following conditions are
12// met:
13//
14// 1. Redistributions of source code must retain the above copyright
15// notice, this list of conditions and the following disclaimer.
16//
17// 2. Redistributions in binary form must reproduce the above copyright
18// notice, this list of conditions and the following disclaimer in the
19// documentation and/or other materials provided with the distribution.
20//
21// 3. Neither the name of the Corporation nor the names of the
22// contributors may be used to endorse or promote products derived from
23// this software without specific prior written permission.
24//
25// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36//
37// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38//
39// ************************************************************************
40// @HEADER
41
42#ifndef TPETRA_MATRIXMATRIX_HIP_DEF_HPP
43#define TPETRA_MATRIXMATRIX_HIP_DEF_HPP
44
45#ifdef HAVE_TPETRA_INST_HIP
46namespace Tpetra {
47namespace MMdetails {
48
49/*********************************************************************************************************/
50// MMM KernelWrappers for Partial Specialization to HIP
51template<class Scalar,
52 class LocalOrdinal,
53 class GlobalOrdinal,
54 class LocalOrdinalViewType>
55struct KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode,LocalOrdinalViewType> {
56 static inline void mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& Aview,
57 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& Bview,
58 const LocalOrdinalViewType & Acol2Brow,
59 const LocalOrdinalViewType & Acol2Irow,
60 const LocalOrdinalViewType & Bcol2Ccol,
61 const LocalOrdinalViewType & Icol2Ccol,
62 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& C,
63 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode> > Cimport,
64 const std::string& label = std::string(),
65 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
66
67
68
69 static inline void mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& Aview,
70 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& Bview,
71 const LocalOrdinalViewType & Acol2Brow,
72 const LocalOrdinalViewType & Acol2Irow,
73 const LocalOrdinalViewType & Bcol2Ccol,
74 const LocalOrdinalViewType & Icol2Ccol,
75 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& C,
76 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode> > Cimport,
77 const std::string& label = std::string(),
78 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
79
80};
81
82// Jacobi KernelWrappers for Partial Specialization to HIP
83template<class Scalar,
84 class LocalOrdinal,
85 class GlobalOrdinal, class LocalOrdinalViewType>
86struct KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode,LocalOrdinalViewType> {
87 static inline void jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
88 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode> & Dinv,
89 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& Aview,
90 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& Bview,
91 const LocalOrdinalViewType & Acol2Brow,
92 const LocalOrdinalViewType & Acol2Irow,
93 const LocalOrdinalViewType & Bcol2Ccol,
94 const LocalOrdinalViewType & Icol2Ccol,
95 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& C,
96 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode> > Cimport,
97 const std::string& label = std::string(),
98 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
99
100 static inline void jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
101 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode> & Dinv,
102 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& Aview,
103 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& Bview,
104 const LocalOrdinalViewType & Acol2Brow,
105 const LocalOrdinalViewType & Acol2Irow,
106 const LocalOrdinalViewType & Bcol2Ccol,
107 const LocalOrdinalViewType & Icol2Ccol,
108 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& C,
109 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode> > Cimport,
110 const std::string& label = std::string(),
111 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
112
113 static inline void jacobi_A_B_newmatrix_KokkosKernels(Scalar omega,
114 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode> & Dinv,
115 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& Aview,
116 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& Bview,
117 const LocalOrdinalViewType & Acol2Brow,
118 const LocalOrdinalViewType & Acol2Irow,
119 const LocalOrdinalViewType & Bcol2Ccol,
120 const LocalOrdinalViewType & Icol2Ccol,
121 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& C,
122 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode> > Cimport,
123 const std::string& label = std::string(),
124 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
125};
126
127
128/*********************************************************************************************************/
129// AB NewMatrix Kernel wrappers (KokkosKernels/HIP Version)
130template<class Scalar,
131 class LocalOrdinal,
132 class GlobalOrdinal,
133 class LocalOrdinalViewType>
134void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode,LocalOrdinalViewType>::mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& Aview,
135 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& Bview,
136 const LocalOrdinalViewType & Acol2Brow,
137 const LocalOrdinalViewType & Acol2Irow,
138 const LocalOrdinalViewType & Bcol2Ccol,
139 const LocalOrdinalViewType & Icol2Ccol,
140 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& C,
141 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode> > Cimport,
142 const std::string& label,
143 const Teuchos::RCP<Teuchos::ParameterList>& params) {
144
145
146#ifdef HAVE_TPETRA_MMM_TIMINGS
147 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
148 using Teuchos::TimeMonitor;
149 Teuchos::RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix HIPWrapper")))));
150#endif
151 // Node-specific code
152 typedef Kokkos::Compat::KokkosHIPWrapperNode Node;
153 std::string nodename("HIP");
154
155 // Lots and lots of typedefs
156 using Teuchos::RCP;
158 typedef typename KCRS::device_type device_t;
159 typedef typename KCRS::StaticCrsGraphType graph_t;
160 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
161 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
162 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
163 typedef typename KCRS::values_type::non_const_type scalar_view_t;
164 //typedef typename graph_t::row_map_type::const_type lno_view_t_const;
165
166 // Options
167 int team_work_size = 16; // Defaults to 16 as per Deveci 12/7/16 - csiefer
168 std::string myalg("SPGEMM_KK_MEMORY");
169 if(!params.is_null()) {
170 if(params->isParameter("hip: algorithm"))
171 myalg = params->get("hip: algorithm",myalg);
172 if(params->isParameter("hip: team work size"))
173 team_work_size = params->get("hip: team work size",team_work_size);
174 }
175
176 // KokkosKernelsHandle
177 typedef KokkosKernels::Experimental::KokkosKernelsHandle<
178 typename lno_view_t::const_value_type,typename lno_nnz_view_t::const_value_type, typename scalar_view_t::const_value_type,
179 typename device_t::execution_space, typename device_t::memory_space,typename device_t::memory_space > KernelHandle;
180
181 // Grab the Kokkos::SparseCrsMatrices
182 const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
183 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixDevice();
184
185 c_lno_view_t Arowptr = Amat.graph.row_map,
186 Browptr = Bmat.graph.row_map;
187 const lno_nnz_view_t Acolind = Amat.graph.entries,
188 Bcolind = Bmat.graph.entries;
189 const scalar_view_t Avals = Amat.values,
190 Bvals = Bmat.values;
191
192 c_lno_view_t Irowptr;
193 lno_nnz_view_t Icolind;
194 scalar_view_t Ivals;
195 if(!Bview.importMatrix.is_null()) {
196 auto lclB = Bview.importMatrix->getLocalMatrixDevice();
197 Irowptr = lclB.graph.row_map;
198 Icolind = lclB.graph.entries;
199 Ivals = lclB.values;
200 }
201
202
203 // Get the algorithm mode
204 std::string alg = nodename+std::string(" algorithm");
205 // printf("DEBUG: Using kernel: %s\n",myalg.c_str());
206 if(!params.is_null() && params->isParameter(alg)) myalg = params->get(alg,myalg);
207 KokkosSparse::SPGEMMAlgorithm alg_enum = KokkosSparse::StringToSPGEMMAlgorithm(myalg);
208
209 // Merge the B and Bimport matrices
210 const KCRS Bmerged = Tpetra::MMdetails::merge_matrices(Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C.getColMap()->getLocalNumElements());
211
212#ifdef HAVE_TPETRA_MMM_TIMINGS
213 MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix HIPCore"))));
214#endif
215
216 // Do the multiply on whatever we've got
217 typename KernelHandle::nnz_lno_t AnumRows = Amat.numRows();
218 typename KernelHandle::nnz_lno_t BnumRows = Bmerged.numRows();
219 typename KernelHandle::nnz_lno_t BnumCols = Bmerged.numCols();
220
221 lno_view_t row_mapC (Kokkos::ViewAllocateWithoutInitializing("non_const_lnow_row"), AnumRows + 1);
222 lno_nnz_view_t entriesC;
223 scalar_view_t valuesC;
224 KernelHandle kh;
225 kh.create_spgemm_handle(alg_enum);
226 kh.set_team_work_size(team_work_size);
227
228 KokkosSparse::Experimental::spgemm_symbolic(&kh,AnumRows,BnumRows,BnumCols,Amat.graph.row_map,Amat.graph.entries,false,Bmerged.graph.row_map,Bmerged.graph.entries,false,row_mapC);
229
230 size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
231 if (c_nnz_size){
232 entriesC = lno_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size);
233 valuesC = scalar_view_t (Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size);
234 }
235 KokkosSparse::Experimental::spgemm_numeric(&kh,AnumRows,BnumRows,BnumCols,Amat.graph.row_map,Amat.graph.entries,Amat.values,false,Bmerged.graph.row_map,Bmerged.graph.entries,Bmerged.values,false,row_mapC,entriesC,valuesC);
236 kh.destroy_spgemm_handle();
237
238#ifdef HAVE_TPETRA_MMM_TIMINGS
239 MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix HIPSort"))));
240#endif
241
242 // Sort & set values
243 if (params.is_null() || params->get("sort entries",true))
244 Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
245 C.setAllValues(row_mapC,entriesC,valuesC);
246
247#ifdef HAVE_TPETRA_MMM_TIMINGS
248 MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix HIPESFC"))));
249#endif
250
251 // Final Fillcomplete
252 RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
253 labelList->set("Timer Label",label);
254 if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
255 RCP<const Export<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode> > dummyExport;
256 C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
257}
258
259
260/*********************************************************************************************************/
261template<class Scalar,
262 class LocalOrdinal,
263 class GlobalOrdinal,
264 class LocalOrdinalViewType>
265void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode,LocalOrdinalViewType>::mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& Aview,
266 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& Bview,
267 const LocalOrdinalViewType & targetMapToOrigRow_dev,
268 const LocalOrdinalViewType & targetMapToImportRow_dev,
269 const LocalOrdinalViewType & Bcol2Ccol_dev,
270 const LocalOrdinalViewType & Icol2Ccol_dev,
271 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& C,
272 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode> > Cimport,
273 const std::string& label,
274 const Teuchos::RCP<Teuchos::ParameterList>& params) {
275
276 // FIXME: Right now, this is a cut-and-paste of the serial kernel
277 typedef Kokkos::Compat::KokkosHIPWrapperNode Node;
278
279#ifdef HAVE_TPETRA_MMM_TIMINGS
280 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
281 using Teuchos::TimeMonitor;
282 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse SerialCore"))));
283 Teuchos::RCP<Teuchos::TimeMonitor> MM2;
284#endif
285 using Teuchos::RCP;
286 using Teuchos::rcp;
287
288
289 // Lots and lots of typedefs
290 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
291 typedef typename KCRS::StaticCrsGraphType graph_t;
292 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
293 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
294 typedef typename KCRS::values_type::non_const_type scalar_view_t;
295
296 typedef Scalar SC;
297 typedef LocalOrdinal LO;
298 typedef GlobalOrdinal GO;
299 typedef Node NO;
300 typedef Map<LO,GO,NO> map_type;
301 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
302 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
303 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
304
305 // KDDKDD UVM Without UVM, need to copy targetMap arrays to host.
306 // KDDKDD UVM Ideally, this function would run on device and use
307 // KDDKDD UVM KokkosKernels instead of this host implementation.
308 auto targetMapToOrigRow =
309 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
310 targetMapToOrigRow_dev);
311 auto targetMapToImportRow =
312 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
313 targetMapToImportRow_dev);
314 auto Bcol2Ccol =
315 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
316 Bcol2Ccol_dev);
317 auto Icol2Ccol =
318 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
319 Icol2Ccol_dev);
320
321 // Sizes
322 RCP<const map_type> Ccolmap = C.getColMap();
323 size_t m = Aview.origMatrix->getLocalNumRows();
324 size_t n = Ccolmap->getLocalNumElements();
325
326 // Grab the Kokkos::SparseCrsMatrices & inner stuff
327 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
328 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
329 const KCRS & Cmat = C.getLocalMatrixHost();
330
331 c_lno_view_t Arowptr = Amat.graph.row_map,
332 Browptr = Bmat.graph.row_map,
333 Crowptr = Cmat.graph.row_map;
334 const lno_nnz_view_t Acolind = Amat.graph.entries,
335 Bcolind = Bmat.graph.entries,
336 Ccolind = Cmat.graph.entries;
337 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
338 scalar_view_t Cvals = Cmat.values;
339
340 c_lno_view_t Irowptr;
341 lno_nnz_view_t Icolind;
342 scalar_view_t Ivals;
343 if(!Bview.importMatrix.is_null()) {
344 auto lclB = Bview.importMatrix->getLocalMatrixHost();
345 Irowptr = lclB.graph.row_map;
346 Icolind = lclB.graph.entries;
347 Ivals = lclB.values;
348 }
349
350#ifdef HAVE_TPETRA_MMM_TIMINGS
351 MM2 = Teuchos::null; MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore - Compare"))));
352#endif
353
354 // Classic csr assembly (low memory edition)
355 // mfh 27 Sep 2016: The c_status array is an implementation detail
356 // of the local sparse matrix-matrix multiply routine.
357
358 // The status array will contain the index into colind where this entry was last deposited.
359 // c_status[i] < CSR_ip - not in the row yet
360 // c_status[i] >= CSR_ip - this is the entry where you can find the data
361 // We start with this filled with INVALID's indicating that there are no entries yet.
362 // Sadly, this complicates the code due to the fact that size_t's are unsigned.
363 std::vector<size_t> c_status(n, ST_INVALID);
364
365 // For each row of A/C
366 size_t CSR_ip = 0, OLD_ip = 0;
367 for (size_t i = 0; i < m; i++) {
368 // First fill the c_status array w/ locations where we're allowed to
369 // generate nonzeros for this row
370 OLD_ip = Crowptr[i];
371 CSR_ip = Crowptr[i+1];
372 for (size_t k = OLD_ip; k < CSR_ip; k++) {
373 c_status[Ccolind[k]] = k;
374
375 // Reset values in the row of C
376 Cvals[k] = SC_ZERO;
377 }
378
379 for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
380 LO Aik = Acolind[k];
381 const SC Aval = Avals[k];
382 if (Aval == SC_ZERO)
383 continue;
384
385 if (targetMapToOrigRow[Aik] != LO_INVALID) {
386 // Local matrix
387 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow[Aik]);
388
389 for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
390 LO Bkj = Bcolind[j];
391 LO Cij = Bcol2Ccol[Bkj];
392
393 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
394 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
395 "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
396
397 Cvals[c_status[Cij]] += Aval * Bvals[j];
398 }
399
400 } else {
401 // Remote matrix
402 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow[Aik]);
403 for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
404 LO Ikj = Icolind[j];
405 LO Cij = Icol2Ccol[Ikj];
406
407 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
408 std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
409 "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
410
411 Cvals[c_status[Cij]] += Aval * Ivals[j];
412 }
413 }
414 }
415 }
416
417 C.fillComplete(C.getDomainMap(), C.getRangeMap());
418}
419
420/*********************************************************************************************************/
421template<class Scalar,
422 class LocalOrdinal,
423 class GlobalOrdinal,
424 class LocalOrdinalViewType>
425void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode,LocalOrdinalViewType>::jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
426 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode> & Dinv,
427 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& Aview,
428 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& Bview,
429 const LocalOrdinalViewType & Acol2Brow,
430 const LocalOrdinalViewType & Acol2Irow,
431 const LocalOrdinalViewType & Bcol2Ccol,
432 const LocalOrdinalViewType & Icol2Ccol,
433 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& C,
434 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode> > Cimport,
435 const std::string& label,
436 const Teuchos::RCP<Teuchos::ParameterList>& params) {
437
438#ifdef HAVE_TPETRA_MMM_TIMINGS
439 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
440 using Teuchos::TimeMonitor;
441 Teuchos::RCP<TimeMonitor> MM;
442#endif
443
444 // Node-specific code
445 using Teuchos::RCP;
446
447 // Options
448 //int team_work_size = 16; // Defaults to 16 as per Deveci 12/7/16 - csiefer // unreferenced
449 std::string myalg("KK");
450 if(!params.is_null()) {
451 if(params->isParameter("hip: jacobi algorithm"))
452 myalg = params->get("hip: jacobi algorithm",myalg);
453 }
454
455 if(myalg == "MSAK") {
456 ::Tpetra::MatrixMatrix::ExtraKernels::jacobi_A_B_newmatrix_MultiplyScaleAddKernel(omega,Dinv,Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
457 }
458 else if(myalg == "KK") {
459 jacobi_A_B_newmatrix_KokkosKernels(omega,Dinv,Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
460 }
461 else {
462 throw std::runtime_error("Tpetra::MatrixMatrix::Jacobi newmatrix unknown kernel");
463 }
464
465#ifdef HAVE_TPETRA_MMM_TIMINGS
466 MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix HIPESFC"))));
467#endif
468
469 // Final Fillcomplete
470 RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
471 labelList->set("Timer Label",label);
472 if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
473
474 // NOTE: MSAK already fillCompletes, so we have to check here
475 if(!C.isFillComplete()) {
476 RCP<const Export<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode> > dummyExport;
477 C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
478 }
479
480}
481
482
483
484/*********************************************************************************************************/
485template<class Scalar,
486 class LocalOrdinal,
487 class GlobalOrdinal,
488 class LocalOrdinalViewType>
489void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode,LocalOrdinalViewType>::jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
490 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode> & Dinv,
491 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& Aview,
492 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& Bview,
493 const LocalOrdinalViewType & targetMapToOrigRow_dev,
494 const LocalOrdinalViewType & targetMapToImportRow_dev,
495 const LocalOrdinalViewType & Bcol2Ccol_dev,
496 const LocalOrdinalViewType & Icol2Ccol_dev,
497 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& C,
498 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode> > Cimport,
499 const std::string& label,
500 const Teuchos::RCP<Teuchos::ParameterList>& params) {
501
502 // FIXME: Right now, this is a cut-and-paste of the serial kernel
503 typedef Kokkos::Compat::KokkosHIPWrapperNode Node;
504
505#ifdef HAVE_TPETRA_MMM_TIMINGS
506 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
507 using Teuchos::TimeMonitor;
508 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse HIPCore"))));
509 Teuchos::RCP<Teuchos::TimeMonitor> MM2;
510#endif
511 using Teuchos::RCP;
512 using Teuchos::rcp;
513
514 // Lots and lots of typedefs
515 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
516 typedef typename KCRS::StaticCrsGraphType graph_t;
517 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
518 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
519 typedef typename KCRS::values_type::non_const_type scalar_view_t;
520 typedef typename scalar_view_t::memory_space scalar_memory_space;
521
522 typedef Scalar SC;
523 typedef LocalOrdinal LO;
524 typedef GlobalOrdinal GO;
525 typedef Node NO;
526 typedef Map<LO,GO,NO> map_type;
527 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
528 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
529 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
530
531 // KDDKDD UVM Without UVM, need to copy targetMap arrays to host.
532 // KDDKDD UVM Ideally, this function would run on device and use
533 // KDDKDD UVM KokkosKernels instead of this host implementation.
534 auto targetMapToOrigRow =
535 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
536 targetMapToOrigRow_dev);
537 auto targetMapToImportRow =
538 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
539 targetMapToImportRow_dev);
540 auto Bcol2Ccol =
541 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
542 Bcol2Ccol_dev);
543 auto Icol2Ccol =
544 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
545 Icol2Ccol_dev);
546
547
548 // Sizes
549 RCP<const map_type> Ccolmap = C.getColMap();
550 size_t m = Aview.origMatrix->getLocalNumRows();
551 size_t n = Ccolmap->getLocalNumElements();
552
553 // Grab the Kokkos::SparseCrsMatrices & inner stuff
554 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
555 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
556 const KCRS & Cmat = C.getLocalMatrixHost();
557
558 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
559 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
560 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
561 scalar_view_t Cvals = Cmat.values;
562
563 c_lno_view_t Irowptr;
564 lno_nnz_view_t Icolind;
565 scalar_view_t Ivals;
566 if(!Bview.importMatrix.is_null()) {
567 auto lclB = Bview.importMatrix->getLocalMatrixHost();
568 Irowptr = lclB.graph.row_map;
569 Icolind = lclB.graph.entries;
570 Ivals = lclB.values;
571 }
572
573 // Jacobi-specific inner stuff
574 auto Dvals =
575 Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
576
577#ifdef HAVE_TPETRA_MMM_TIMINGS
578 MM2 = Teuchos::null; MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse HIPCore - Compare"))));
579#endif
580
581 // The status array will contain the index into colind where this entry was last deposited.
582 // c_status[i] < CSR_ip - not in the row yet
583 // c_status[i] >= CSR_ip - this is the entry where you can find the data
584 // We start with this filled with INVALID's indicating that there are no entries yet.
585 // Sadly, this complicates the code due to the fact that size_t's are unsigned.
586 std::vector<size_t> c_status(n, ST_INVALID);
587
588 // For each row of A/C
589 size_t CSR_ip = 0, OLD_ip = 0;
590 for (size_t i = 0; i < m; i++) {
591
592 // First fill the c_status array w/ locations where we're allowed to
593 // generate nonzeros for this row
594 OLD_ip = Crowptr[i];
595 CSR_ip = Crowptr[i+1];
596 for (size_t k = OLD_ip; k < CSR_ip; k++) {
597 c_status[Ccolind[k]] = k;
598
599 // Reset values in the row of C
600 Cvals[k] = SC_ZERO;
601 }
602
603 SC minusOmegaDval = -omega*Dvals(i,0);
604
605 // Entries of B
606 for (size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
607 Scalar Bval = Bvals[j];
608 if (Bval == SC_ZERO)
609 continue;
610 LO Bij = Bcolind[j];
611 LO Cij = Bcol2Ccol[Bij];
612
613 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
614 std::runtime_error, "Trying to insert a new entry into a static graph");
615
616 Cvals[c_status[Cij]] = Bvals[j];
617 }
618
619 // Entries of -omega * Dinv * A * B
620 for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
621 LO Aik = Acolind[k];
622 const SC Aval = Avals[k];
623 if (Aval == SC_ZERO)
624 continue;
625
626 if (targetMapToOrigRow[Aik] != LO_INVALID) {
627 // Local matrix
628 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow[Aik]);
629
630 for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
631 LO Bkj = Bcolind[j];
632 LO Cij = Bcol2Ccol[Bkj];
633
634 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
635 std::runtime_error, "Trying to insert a new entry into a static graph");
636
637 Cvals[c_status[Cij]] += minusOmegaDval * Aval * Bvals[j];
638 }
639
640 } else {
641 // Remote matrix
642 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow[Aik]);
643 for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
644 LO Ikj = Icolind[j];
645 LO Cij = Icol2Ccol[Ikj];
646
647 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
648 std::runtime_error, "Trying to insert a new entry into a static graph");
649
650 Cvals[c_status[Cij]] += minusOmegaDval * Aval * Ivals[j];
651 }
652 }
653 }
654 }
655
656#ifdef HAVE_TPETRA_MMM_TIMINGS
657 MM2= Teuchos::null;
658 MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse ESFC"))));
659#endif
660
661 C.fillComplete(C.getDomainMap(), C.getRangeMap());
662
663}
664
665/*********************************************************************************************************/
666template<class Scalar,
667 class LocalOrdinal,
668 class GlobalOrdinal,
669 class LocalOrdinalViewType>
670void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode,LocalOrdinalViewType>::jacobi_A_B_newmatrix_KokkosKernels(Scalar omega,
671 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode> & Dinv,
672 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& Aview,
673 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& Bview,
674 const LocalOrdinalViewType & Acol2Brow,
675 const LocalOrdinalViewType & Acol2Irow,
676 const LocalOrdinalViewType & Bcol2Ccol,
677 const LocalOrdinalViewType & Icol2Ccol,
678 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosHIPWrapperNode>& C,
679 Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode> > Cimport,
680 const std::string& label,
681 const Teuchos::RCP<Teuchos::ParameterList>& params) {
682
683#ifdef HAVE_TPETRA_MMM_TIMINGS
684 std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
685 using Teuchos::TimeMonitor;
686 Teuchos::RCP<TimeMonitor> MM;
687#endif
688
689 // Check if the diagonal entries exist in debug mode
690 const bool debug = Tpetra::Details::Behavior::debug();
691 if(debug) {
692
693 auto rowMap = Aview.origMatrix->getRowMap();
694 Tpetra::Vector<Scalar> diags(rowMap);
695 Aview.origMatrix->getLocalDiagCopy(diags);
696 size_t diagLength = rowMap->getLocalNumElements();
697 Teuchos::Array<Scalar> diagonal(diagLength);
698 diags.get1dCopy(diagonal());
699
700 for(size_t i = 0; i < diagLength; ++i) {
701 TEUCHOS_TEST_FOR_EXCEPTION(diagonal[i] == Teuchos::ScalarTraits<Scalar>::zero(),
702 std::runtime_error,
703 "Matrix A has a zero/missing diagonal: " << diagonal[i] << std::endl <<
704 "KokkosKernels Jacobi-fused SpGEMM requires nonzero diagonal entries in A" << std::endl);
705 }
706 }
707
708 // Usings
709 using device_t = typename Kokkos::Compat::KokkosHIPWrapperNode::device_type;
711 using graph_t = typename matrix_t::StaticCrsGraphType;
712 using lno_view_t = typename graph_t::row_map_type::non_const_type;
713 using c_lno_view_t = typename graph_t::row_map_type::const_type;
714 using lno_nnz_view_t = typename graph_t::entries_type::non_const_type;
715 using scalar_view_t = typename matrix_t::values_type::non_const_type;
716
717 // KokkosKernels handle
718 using handle_t = typename KokkosKernels::Experimental::KokkosKernelsHandle<
719 typename lno_view_t::const_value_type,typename lno_nnz_view_t::const_value_type, typename scalar_view_t::const_value_type,
720 typename device_t::execution_space, typename device_t::memory_space,typename device_t::memory_space >;
721
722 // Get the rowPtr, colInd and vals of importMatrix
723 c_lno_view_t Irowptr;
724 lno_nnz_view_t Icolind;
725 scalar_view_t Ivals;
726 if(!Bview.importMatrix.is_null()) {
727 auto lclB = Bview.importMatrix->getLocalMatrixDevice();
728 Irowptr = lclB.graph.row_map;
729 Icolind = lclB.graph.entries;
730 Ivals = lclB.values;
731 }
732
733 // Merge the B and Bimport matrices
734 const matrix_t Bmerged = Tpetra::MMdetails::merge_matrices(Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C.getColMap()->getLocalNumElements());
735
736 // Get the properties and arrays of input matrices
737 const matrix_t & Amat = Aview.origMatrix->getLocalMatrixDevice();
738 const matrix_t & Bmat = Bview.origMatrix->getLocalMatrixDevice();
739
740 typename handle_t::nnz_lno_t AnumRows = Amat.numRows();
741 typename handle_t::nnz_lno_t BnumRows = Bmerged.numRows();
742 typename handle_t::nnz_lno_t BnumCols = Bmerged.numCols();
743
744 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmerged.graph.row_map;
745 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmerged.graph.entries;
746 const scalar_view_t Avals = Amat.values, Bvals = Bmerged.values;
747
748 // Arrays of the output matrix
749 lno_view_t row_mapC (Kokkos::ViewAllocateWithoutInitializing("non_const_lnow_row"), AnumRows + 1);
750 lno_nnz_view_t entriesC;
751 scalar_view_t valuesC;
752
753 // Options
754 int team_work_size = 16;
755 std::string myalg("SPGEMM_KK_MEMORY");
756 if(!params.is_null()) {
757 if(params->isParameter("hip: algorithm"))
758 myalg = params->get("hip: algorithm",myalg);
759 if(params->isParameter("hip: team work size"))
760 team_work_size = params->get("hip: team work size",team_work_size);
761 }
762
763 // Get the algorithm mode
764 std::string nodename("HIP");
765 std::string alg = nodename + std::string(" algorithm");
766 if(!params.is_null() && params->isParameter(alg)) myalg = params->get(alg,myalg);
767 KokkosSparse::SPGEMMAlgorithm alg_enum = KokkosSparse::StringToSPGEMMAlgorithm(myalg);
768
769
770 // KokkosKernels call
771 handle_t kh;
772 kh.create_spgemm_handle(alg_enum);
773 kh.set_team_work_size(team_work_size);
774
775 KokkosSparse::Experimental::spgemm_symbolic(&kh, AnumRows, BnumRows, BnumCols,
776 Arowptr, Acolind, false,
777 Browptr, Bcolind, false,
778 row_mapC);
779
780 size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
781 if (c_nnz_size){
782 entriesC = lno_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size);
783 valuesC = scalar_view_t (Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size);
784 }
785
786 KokkosSparse::Experimental::spgemm_jacobi(&kh, AnumRows, BnumRows, BnumCols,
787 Arowptr, Acolind, Avals, false,
788 Browptr, Bcolind, Bvals, false,
789 row_mapC, entriesC, valuesC,
790 omega, Dinv.getLocalViewDevice(Access::ReadOnly));
791 kh.destroy_spgemm_handle();
792
793#ifdef HAVE_TPETRA_MMM_TIMINGS
794 MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix HIPSort"))));
795#endif
796
797 // Sort & set values
798 if (params.is_null() || params->get("sort entries",true))
799 Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
800 C.setAllValues(row_mapC,entriesC,valuesC);
801
802#ifdef HAVE_TPETRA_MMM_TIMINGS
803 MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix HIPESFC"))));
804#endif
805
806 // Final Fillcomplete
807 Teuchos::RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
808 labelList->set("Timer Label",label);
809 if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
810 Teuchos::RCP<const Export<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosHIPWrapperNode> > dummyExport;
811 C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
812}
813
814 }//MMdetails
815}//Tpetra
816
817#endif//HIP
818
819#endif
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
static bool debug()
Whether Tpetra is in debug mode.
A distributed dense vector.
Namespace Tpetra contains the class and methods constituting the Tpetra library.