Stokhos Package Browser (Single Doxygen Collection) Version of the Day
Kokkos_CrsMatrix_UQ_PCE_Cuda.hpp
1// @HEADER
2// ***********************************************************************
3//
4// Stokhos Package
5// Copyright (2009) Sandia Corporation
6//
7// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
8// license for use of this work by or on behalf of the U.S. Government.
9//
10// Redistribution and use in source and binary forms, with or without
11// modification, are permitted provided that the following conditions are
12// met:
13//
14// 1. Redistributions of source code must retain the above copyright
15// notice, this list of conditions and the following disclaimer.
16//
17// 2. Redistributions in binary form must reproduce the above copyright
18// notice, this list of conditions and the following disclaimer in the
19// documentation and/or other materials provided with the distribution.
20//
21// 3. Neither the name of the Corporation nor the names of the
22// contributors may be used to endorse or promote products derived from
23// this software without specific prior written permission.
24//
25// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36//
37// Questions? Contact Eric T. Phipps (etphipp@sandia.gov).
38//
39// ***********************************************************************
40// @HEADER
41
42#ifndef KOKKOS_CRSMATRIX_UQ_PCE_CUDA_HPP
43#define KOKKOS_CRSMATRIX_UQ_PCE_CUDA_HPP
44
45#if defined( __CUDACC__)
46
47#include "Sacado_UQ_PCE.hpp"
50#include "KokkosSparse_CrsMatrix.hpp"
51
52//MD 08/2017 Note: The include below is commented out because Kokkos_MV.hpp
53//has been removed from KokkosKernels entirely, and the included file does
54//not appear to be used anywhere in this file.
55//#include "Kokkos_MV.hpp" // for some utilities
56
57#include "Stokhos_Multiply.hpp"
59
60#include "Kokkos_Core.hpp"
61
63//#include "Stokhos_Cuda_WarpShuffle.hpp"
64
65#include "Teuchos_TestForException.hpp"
66
67//#include "cuda_profiler_api.h"
68
69#ifdef __CUDA_ARCH__
70# if (__CUDA_ARCH__ >= 300)
71# define HAVE_CUDA_SHUFFLE 1
72# else
73# define HAVE_CUDA_SHUFFLE 0
74# endif
75#else
76# define HAVE_CUDA_SHUFFLE 0
77#endif
78
79namespace Stokhos {
80
81//----------------------------------------------------------------------------
82// Specialization of Kokkos CrsMatrix math functions
83//----------------------------------------------------------------------------
84
85// Kernel implementing y = A * x where
86// A == KokkosSparse::CrsMatrix< Sacado::UQ::PCE<...>,...>,
87// x, y == Kokkos::View< Sacado::UQ::PCE<...>*,...>,
88// x and y are rank 1
89template <typename MatrixStorage,
90 typename MatrixOrdinal,
91 typename MatrixMemory,
92 typename MatrixSize,
93 typename InputStorage,
94 typename ... InputP,
95 typename OutputStorage,
96 typename ... OutputP>
97class Multiply< KokkosSparse::CrsMatrix< const Sacado::UQ::PCE<MatrixStorage>,
98 MatrixOrdinal,
99 Kokkos::Cuda,
100 MatrixMemory,
101 MatrixSize>,
102 Kokkos::View< const Sacado::UQ::PCE<InputStorage>*,
103 InputP... >,
104 Kokkos::View< Sacado::UQ::PCE<OutputStorage>*,
105 OutputP... >
106 >
107{
108public:
109 typedef Sacado::UQ::PCE<MatrixStorage> MatrixValue;
110 typedef Sacado::UQ::PCE<InputStorage> InputVectorValue;
111 typedef Sacado::UQ::PCE<OutputStorage> OutputVectorValue;
112
113 typedef Kokkos::Cuda MatrixDevice;
114 typedef MatrixDevice execution_space;
115 typedef execution_space::size_type size_type;
116
117 typedef KokkosSparse::CrsMatrix< const MatrixValue,
118 MatrixOrdinal,
119 MatrixDevice,
120 MatrixMemory,
121 MatrixSize> matrix_type;
122 typedef typename matrix_type::values_type matrix_values_type;
123 typedef typename Kokkos::CijkType<matrix_values_type>::type tensor_type;
124 typedef Kokkos::View< const InputVectorValue*,
125 InputP... > input_vector_type;
126 typedef Kokkos::View< OutputVectorValue*,
127 OutputP... > output_vector_type;
128
129private:
130
131 typedef typename matrix_type::StaticCrsGraphType matrix_graph_type;
132 typedef typename matrix_values_type::array_type matrix_array_type;
133 typedef typename input_vector_type::array_type input_array_type;
134 typedef typename output_vector_type::array_type output_array_type;
135
136 typedef typename MatrixValue::value_type matrix_scalar;
137 typedef typename InputVectorValue::value_type input_scalar;
138 typedef typename OutputVectorValue::value_type output_scalar;
139 typedef typename tensor_type::value_type tensor_scalar;
140
141 const matrix_array_type m_A_values ;
142 const matrix_graph_type m_A_graph ;
143 const input_array_type m_x ;
144 const output_array_type m_y ;
145 const tensor_type m_tensor ;
146 const input_scalar m_a ;
147 const output_scalar m_b ;
148 const size_type BlockSize;
149
150 Multiply( const matrix_type & A ,
151 const input_vector_type & x ,
152 const output_vector_type & y ,
153 const input_scalar & a ,
154 const output_scalar & b ,
155 const size_type block_size )
156 : m_A_values( A.values )
157 , m_A_graph( A.graph )
158 , m_x( x )
159 , m_y( y )
160 , m_tensor( Kokkos::cijk(A.values) )
161 , m_a( a )
162 , m_b( b )
163 , BlockSize(block_size)
164 {}
165
166public:
167
168 __device__ void operator()(void) const
169 {
170 // Number of bases in the stochastic system:
171 const size_type dim = m_tensor.dimension();
172
173 // Get shared memory for loading x, A, and y
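    // Layout (in scalars): BlockSize*dim entries each for the staged x and A
    // blocks, dim entries for the accumulated y, and, when warp shuffles are
    // unavailable, one reduction slot per thread in the block.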
174 volatile input_scalar * const sh_x =
175 kokkos_impl_cuda_shared_memory<input_scalar>();
176 volatile matrix_scalar * const sh_A = sh_x + BlockSize*dim;
177 volatile output_scalar * const sh_y = sh_A + BlockSize*dim;
178#if !HAVE_CUDA_SHUFFLE
179 volatile output_scalar * const sh_t = sh_y + dim;
180#endif
181
182 const size_type nid = blockDim.x * blockDim.y;
183 const size_type tid = threadIdx.x + blockDim.x * threadIdx.y;
184
185 // Zero y
186 for ( size_type i = tid; i < dim; i += nid ) {
187 sh_y[i] = 0.0;
188 }
189
190 // Loop over columns in the discrete (finite element) system.
191 // blockIdx.x == row in the deterministic (finite element) system
192 const size_type iBlockEntryBeg = m_A_graph.row_map[ blockIdx.x ];
193 const size_type iBlockEntryEnd = m_A_graph.row_map[ blockIdx.x + 1 ];
194 for (size_type iBlockEntry=iBlockEntryBeg; iBlockEntry<iBlockEntryEnd;
195 iBlockEntry += BlockSize) {
196 const size_type block_size =
197 (iBlockEntryEnd-iBlockEntry < BlockSize) ?
198 iBlockEntryEnd-iBlockEntry : BlockSize;
199
200 // Wait for X and A to be used in the previous iteration
201 // before reading new values.
202 __syncthreads();
203
204 // Coalesced read blocks of X and A into shared memory
205 for ( size_type col = 0; col < block_size; ++col ) {
206
207 const size_type iBlockColumn = m_A_graph.entries( iBlockEntry + col );
208 const input_scalar * const x = & m_x( 0, iBlockColumn );
209 const matrix_scalar * const A = & m_A_values( iBlockEntry + col, 0 );
210
211 // Coalesced read by the whole block from global memory:
212 for ( size_type i = tid; i < dim; i += nid ) {
213 sh_x[col + i * BlockSize] = x[i]; // m_x( i, iBlockColumn );
214 sh_A[col + i * BlockSize] = A[i]; // m_A_values( iBlockEntry + col , i );
215 }
216
217 }
218
219 __syncthreads(); // wait for X and A to be read before using them
220
221 // This cuda block is responsible for computing all values of 'y'
222 for ( size_type i = threadIdx.y; i < dim; i += blockDim.y ) {
223 output_scalar y = 0;
224
225 // Product tensor entries which this warp will iterate:
226 const size_type lBeg = m_tensor.entry_begin( i );
227 const size_type lEnd = m_tensor.entry_end( i );
228
229 // Loop through sparse tensor contributions with coalesced reads.
230 for ( size_type l = lBeg+threadIdx.x; l < lEnd; l += blockDim.x ) {
231
232 // Read 'blockDim.x' entries from the tensor (coalesced)
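          // Each coordinate word packs the two basis indices (j,k) into its
          // low and high 16 bits; both are pre-scaled by BlockSize to index
          // the interleaved sh_A/sh_x blocks below.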
233 const size_type kj = m_tensor.coord( l );
234 const tensor_scalar v = m_tensor.value( l );
235 const size_type j = ( kj & 0x0ffff ) * BlockSize ;
236 const size_type k = ( kj >> 16 ) * BlockSize ;
237
238 for ( size_type col = 0; col < block_size; ++col ) {
239 y += v * ( sh_A[col+j] * sh_x[col+k] +
240 sh_A[col+k] * sh_x[col+j] );
241 }
242
243 }
244
245 // Reduction of 'y' within 'blockDim.x'
246#if HAVE_CUDA_SHUFFLE
247 if (blockDim.x >= 2) y += Kokkos::shfl_down(y, 1, blockDim.x);
248 if (blockDim.x >= 4) y += Kokkos::shfl_down(y, 2, blockDim.x);
249 if (blockDim.x >= 8) y += Kokkos::shfl_down(y, 4, blockDim.x);
250 if (blockDim.x >= 16) y += Kokkos::shfl_down(y, 8, blockDim.x);
251 if (blockDim.x >= 32) y += Kokkos::shfl_down(y, 16, blockDim.x);
252 if ( threadIdx.x == 0 ) sh_y[i] += y;
253#else
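      // Shared-memory fallback: blockDim.x never exceeds the warp size here,
      // so the threads reducing a given row are in one warp and the volatile
      // stores are visible without an explicit __syncthreads().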
254 sh_t[ tid ] = y;
255 if (threadIdx.x+16 < blockDim.x) sh_t[tid] += sh_t[tid+16];
256 if (threadIdx.x+ 8 < blockDim.x) sh_t[tid] += sh_t[tid+ 8];
257 if (threadIdx.x+ 4 < blockDim.x) sh_t[tid] += sh_t[tid+ 4];
258 if (threadIdx.x+ 2 < blockDim.x) sh_t[tid] += sh_t[tid+ 2];
259 if (threadIdx.x+ 1 < blockDim.x) sh_t[tid] += sh_t[tid+ 1];
260 if (threadIdx.x == 0) sh_y[i] += sh_t[tid];
261#endif
262
263 }
264
265 }
266
267 // Wait for all contributions of y to be completed
268 __syncthreads();
269
270 // Store result back in global memory
271 if ( m_b == output_scalar(0) )
272 for ( size_type i = tid; i < dim; i += nid )
273 m_y( i, blockIdx.x ) = m_a * sh_y[ i ];
274 else
275 for ( size_type i = tid; i < dim; i += nid )
276 m_y( i, blockIdx.x ) = m_a * sh_y[ i ] + m_b * m_y( i, blockIdx.x );
277 }
278
279 struct TensorReadEntry {
280 size_type block_size, shmem, num_blocks, num_warp;
281 double reads;
282 };
283
284 static void apply( const matrix_type & A ,
285 const input_vector_type & x ,
286 const output_vector_type & y ,
287 const input_scalar & a = input_scalar(1) ,
288 const output_scalar & b = output_scalar(0) )
289 {
290 const tensor_type tensor = Kokkos::cijk(A.values);
291 const size_type row_count = A.graph.row_map.extent(0) - 1;
292 const size_type tensor_dimension = tensor.dimension();
293 const size_type tensor_align = tensor_dimension;
294 const size_type avg_tensor_entries_per_row = tensor.avg_entries_per_row();
295
296 // Should compute this from FEM graph
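    // (27 presumably corresponds to a 27-point stencil on a structured 3-D
    // hexahedral mesh; other FEM graphs will have a different average.)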
297 const size_type fem_nnz_per_row = 27;
298
299 // Get device properties we need for whatever device is currently selected
300 DeviceProp device_prop;
301 const size_type shcap = device_prop.shared_memory_capacity;
302 const size_type sh_granularity = device_prop.shared_memory_granularity;
303 const size_type max_shmem_per_block = device_prop.max_shmem_per_block;
304 const size_type max_blocks_per_sm = device_prop.max_blocks_per_sm;
305 const size_type warp_size = device_prop.warp_size;
306 const size_type warp_granularity = device_prop.warp_granularity;
307 const size_type max_warps_per_block =
308 std::min(device_prop.max_threads_per_block / warp_size,
309 device_prop.max_warps_per_sm);
310 const size_type min_warps_per_block = 1;
311 const size_type max_regs_per_sm = device_prop.max_regs_per_sm;
312 const size_type max_regs_per_block = device_prop.max_regs_per_block;
313 const size_type reg_bank_size = device_prop.reg_bank_size;
314
315 // Compute number of warps we can fit on each SM based on register limits
316 // Use Cuda introspection to determine number of registers per thread
317 //const size_type regs_per_thread = 46;
318 const size_type regs_per_thread =
319 device_prop.get_kernel_registers(
320 Kokkos::Impl::cuda_parallel_launch_local_memory<Multiply>);
321 const size_type regs_per_warp =
322 (warp_size*regs_per_thread + reg_bank_size-1) & ~(reg_bank_size-1);
323 const size_type warps_per_sm =
324 (max_regs_per_sm/regs_per_warp) & ~(warp_granularity-1);
325 const size_type warps_per_block =
326 (max_regs_per_block/regs_per_warp) & ~(warp_granularity-1);
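    // These bit masks assume power-of-two granularities: adding (g-1) and
    // masking with ~(g-1) rounds the per-warp register count up to the
    // register-bank unit, while masking alone rounds the warp counts down
    // to a multiple of the warp allocation granularity.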
327
328 // Compute number of threads per stochastic row based on number of
329 // nonzero entries per row.
330 // For double, 16 threads/row is still coalesced, but not for float.
331 // We should reorder the tensor values for a given vector width to
332 // maintain coalesced reads. This would help smaller problems too by
333 // allowing fewer threads/row.
334 const size_type threads_per_row =
335 avg_tensor_entries_per_row >= 88 ? 32 : 16;
336 const size_type rows_per_warp = warp_size / threads_per_row;
337
338 const size_type in_vec_scalar_size = sizeof(input_scalar);
339 const size_type out_vec_scalar_size = sizeof(output_scalar);
340 const size_type mat_scalar_size = sizeof(matrix_scalar);
341
342#define USE_FIXED_BLOCKSIZE 0
343
344#if USE_FIXED_BLOCKSIZE
345
346 const size_type num_blocks = 3;
347 size_type nw = warps_per_sm / num_blocks;
348 while (nw > 1 && num_blocks*nw % warp_granularity) --nw;
349 const size_type num_warp = nw;
350 const size_type sh_per_block = shcap / num_blocks;
351 const size_type sr =
352 device_prop.has_shuffle ? 0 : in_vec_scalar_size*warp_size*num_warp;
353 size_type bs = ((sh_per_block - sr) / tensor_align - out_vec_scalar_size) /
354 (in_vec_scalar_size+mat_scalar_size);
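    // Keep the block size odd: the kernel indexes shared memory with stride
    // BlockSize, and an odd stride helps avoid shared-memory bank conflicts.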
355 if (bs % 2 == 0) --bs;
356 const size_type block_size_max = 31;
357 const size_type block_size = std::min(bs, block_size_max);
358 //const size_type block_size = 7;
359 const size_type shmem =
360 ( ((in_vec_scalar_size+mat_scalar_size)*block_size+out_vec_scalar_size)*tensor_align + sr + sh_granularity-1 ) & ~(sh_granularity-1);
361
362#else
363
364 // We want to maximize the number of blocks per SM (to maximize throughput)
365 // as well as the block_size (to minimize tensor reads), subject to
366// shared memory and register constraints. Here we try to do this by computing
367 // the number of tensor reads per block per thread for each choice of
368 // block_size, and then choose the configuration with the smallest value.
369 // This isn't perfect, but seems to generally work OK. It could be
370 // improved with a better model of:
371 // * Number of blocks versus warps per block (to minimize synchronization)
372 // * Thread efficiency for small numbers of rows per thread
373 const size_type block_size_min = 3;
374 const size_type half_nnz_per_row = fem_nnz_per_row / 2 + 1;
375 const size_type block_size_max =
376 half_nnz_per_row % 2 ? half_nnz_per_row + 1 : half_nnz_per_row;
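    // For example, with fem_nnz_per_row = 27 the candidates are
    // bs = 3, 5, 7, 9, 11, 13, giving ceil(27/bs) = 9, 6, 4, 3, 3, 3 tensor
    // reads per row, respectively.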
377 Teuchos::Array<TensorReadEntry> reads_per_thread;
378 for (size_type bs = block_size_min; bs<=block_size_max; bs+=2) {
379 // We don't know the number of warps yet, so we just have to bound
380 // sr by the maximum number possible (which is all warps in 1 block)
381 const size_type sr =
382 device_prop.has_shuffle ? 0 : in_vec_scalar_size*warp_size*warps_per_block;
383 size_type shmem =
384 ((in_vec_scalar_size+mat_scalar_size)*bs+out_vec_scalar_size)*tensor_align+sr;
385 shmem = (shmem + sh_granularity-1) & ~(sh_granularity-1);
386 if (shmem <= max_shmem_per_block) {
387 size_type num_blocks = std::min(shcap / shmem, max_blocks_per_sm);
388 size_type tensor_reads = (fem_nnz_per_row+bs-1) / bs;
389 size_type num_warp =
390 std::min(std::max(std::min(warps_per_sm/num_blocks, warps_per_block),
391 min_warps_per_block),
392 max_warps_per_block);
393 while (num_warp > 1 && num_blocks*num_warp % warp_granularity)
394 --num_warp;
395 TensorReadEntry entry;
396 entry.block_size = bs;
397 entry.shmem = shmem;
398 entry.num_blocks = num_blocks;
399 entry.num_warp = num_warp;
400
401 // Prefer at least 3 blocks
402 size_type factor = std::min(num_blocks,3u);
403 entry.reads = (static_cast<double>(tensor_reads) /
404 static_cast<double>(factor*num_blocks*num_warp));
405 reads_per_thread.push_back(entry);
406 }
407 }
408 TEUCHOS_TEST_FOR_EXCEPTION(
409 reads_per_thread.size() == 0, std::logic_error,
410 "Stochastic problem dimension is too large to fit in shared memory");
411 size_type idx = 0;
412 double min_reads = reads_per_thread[0].reads;
413 for (int i=1; i<reads_per_thread.size(); ++i) {
414 if (reads_per_thread[i].reads < min_reads) {
415 idx = i;
416 min_reads = reads_per_thread[i].reads;
417 }
418 }
419
420 const size_type block_size = reads_per_thread[idx].block_size;
421 const size_type shmem = reads_per_thread[idx].shmem;
422 const size_type num_blocks = reads_per_thread[idx].num_blocks;
423 const size_type num_warp = reads_per_thread[idx].num_warp;
424
425#endif
426
427 // Setup thread blocks and grid
428 const dim3 dBlock( threads_per_row , rows_per_warp*num_warp , 1 );
429 const dim3 dGrid( row_count, 1, 1 );
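    // One CUDA block per finite-element row: blockIdx.x selects the row,
    // threadIdx.y strides over the stochastic (PCE) rows, and threadIdx.x
    // strides over the tensor entries of each row.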
430
431#if 0
432 std::cout << "block_size = " << block_size
433 << " tensor reads = " << (fem_nnz_per_row+block_size-1)/block_size
434 << " regs_per_thread = " << regs_per_thread
435 << " num blocks = " << num_blocks
436 << " num warps = " << num_warp
437 << " num rows = " << tensor_dimension
438 << " rows/warp = " << tensor_dimension / (num_warp*rows_per_warp)
439 << " avg entries/row = " << avg_tensor_entries_per_row
440 << std::endl;
441#endif
442
443 // Finally launch our kernel
444 //cudaProfilerStart();
445 Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid, dBlock, shmem >>>
446 ( Multiply( A, x, y, a, b, block_size ) );
447 //cudaProfilerStop();
448 }
449};
450
451// Kernel implementing y = A * x where
452// A == KokkosSparse::CrsMatrix< Sacado::UQ::PCE<...>,...>,
453// x, y == Kokkos::View< Sacado::UQ::PCE<...>**,...>,
454// x and y are rank 2
455//
456// Note: Unlike the rank-1 version, this version has not been
457// optimized, and doesn't even include the block-column implementation
458template <typename MatrixStorage,
459 typename MatrixOrdinal,
460 typename MatrixMemory,
461 typename MatrixSize,
462 typename InputStorage,
463 typename ... InputP,
464 typename OutputStorage,
465 typename ... OutputP>
466class Multiply< KokkosSparse::CrsMatrix< const Sacado::UQ::PCE<MatrixStorage>,
467 MatrixOrdinal,
468 Kokkos::Cuda,
469 MatrixMemory,
470 MatrixSize>,
471 Kokkos::View< const Sacado::UQ::PCE<InputStorage>**,
472 InputP... >,
473 Kokkos::View< Sacado::UQ::PCE<OutputStorage>**,
474 OutputP... >
475 >
476{
477public:
478 typedef Sacado::UQ::PCE<MatrixStorage> MatrixValue;
479 typedef Sacado::UQ::PCE<InputStorage> InputVectorValue;
480 typedef Sacado::UQ::PCE<OutputStorage> OutputVectorValue;
481
482 typedef Kokkos::Cuda MatrixDevice;
483 typedef MatrixDevice execution_space;
484 typedef execution_space::size_type size_type;
485
486 typedef KokkosSparse::CrsMatrix< const MatrixValue,
487 MatrixOrdinal,
488 MatrixDevice,
489 MatrixMemory,
490 MatrixSize> matrix_type;
491 typedef Kokkos::View< const InputVectorValue**,
492 InputP... > input_vector_type;
493 typedef Kokkos::View< OutputVectorValue**,
494 OutputP... > output_vector_type;
495 typedef typename InputVectorValue::value_type input_scalar;
496 typedef typename OutputVectorValue::value_type output_scalar;
497
498public:
499
500 static void apply( const matrix_type & A ,
501 const input_vector_type & x ,
502 const output_vector_type & y ,
503 const input_scalar & a = input_scalar(1) ,
504 const output_scalar & b = output_scalar(0) )
505 {
506 typedef Kokkos::View< const InputVectorValue*, InputP... > input_vector_type_1D;
507 typedef Kokkos::View< OutputVectorValue*, OutputP... > output_vector_type_1D;
508 typedef Multiply< matrix_type, input_vector_type_1D,
509 output_vector_type_1D > multiply_type_1D;
510
511 const size_type num_col = y.extent(1);
512 for (size_type col=0; col<num_col; ++col)
513
514 multiply_type_1D::apply(
515 A,
516 Kokkos::subview( x, Kokkos::ALL(), col),
517 Kokkos::subview(y, Kokkos::ALL(), col),
518 a, b );
519 }
520};
521
522template <typename Kernel>
523__global__ void
524#if __CUDA_ARCH__ >= 300
525__launch_bounds__(1024,2)
526#endif
527MeanFullOccupancyKernelLaunch(Kernel kernel) {
528 kernel();
529}
530
531// Kernel implementing y = A * x where PCE size of A is 1
532// A == KokkosSparse::CrsMatrix< Sacado::UQ::PCE<...>,...>, with A.values.sacado_size() == 1
533// x, y == Kokkos::View< Sacado::UQ::PCE<...>*,...>,
534// x and y are rank 1
535template <typename MatrixStorage,
536 typename MatrixOrdinal,
537 typename MatrixMemory,
538 typename MatrixSize,
539 typename InputStorage,
540 typename ... InputP,
541 typename OutputStorage,
542 typename ... OutputP>
543class MeanMultiply< KokkosSparse::CrsMatrix< const Sacado::UQ::PCE<MatrixStorage>,
544 MatrixOrdinal,
545 Kokkos::Cuda,
546 MatrixMemory,
547 MatrixSize >,
548 Kokkos::View< const Sacado::UQ::PCE<InputStorage>*,
549 InputP... >,
550 Kokkos::View< Sacado::UQ::PCE<OutputStorage>*,
551 OutputP... >
552 >
553{
554public:
555 typedef Sacado::UQ::PCE<MatrixStorage> MatrixValue;
556 typedef Sacado::UQ::PCE<InputStorage> InputVectorValue;
557 typedef Sacado::UQ::PCE<OutputStorage> OutputVectorValue;
558
559 typedef Kokkos::Cuda MatrixDevice;
560 typedef MatrixDevice execution_space;
561 typedef KokkosSparse::CrsMatrix< const MatrixValue,
562 MatrixOrdinal,
563 MatrixDevice,
564 MatrixMemory,
565 MatrixSize> matrix_type;
566 typedef typename matrix_type::values_type matrix_values_type;
567 typedef typename MatrixValue::ordinal_type size_type;
568 typedef Kokkos::View< const InputVectorValue*,
569 InputP... > input_vector_type;
570 typedef Kokkos::View< OutputVectorValue*,
571 OutputP... > output_vector_type;
572
573 typedef typename matrix_type::StaticCrsGraphType matrix_graph_type;
574 typedef typename MatrixValue::value_type matrix_scalar;
575 typedef typename InputVectorValue::value_type input_scalar;
576 typedef typename OutputVectorValue::value_type output_scalar;
577
578 template <int BlockSize>
579 struct Kernel {
580 typedef MatrixDevice execution_space;
581 typedef typename Kokkos::FlatArrayType<matrix_values_type>::type matrix_array_type;
582 typedef typename input_vector_type::array_type input_array_type;
583 typedef typename output_vector_type::array_type output_array_type;
584
585 const matrix_array_type m_A_values ;
586 const matrix_graph_type m_A_graph ;
587 const output_array_type v_y ;
588 const input_array_type v_x ;
589 const input_scalar m_a ;
590 const output_scalar m_b ;
591 const size_type m_row_count;
592 const size_type dim ;
593
594 Kernel( const matrix_type & A ,
595 const input_vector_type & x ,
596 const output_vector_type & y ,
597 const input_scalar & a ,
598 const output_scalar & b )
599 : m_A_values( A.values )
600 , m_A_graph( A.graph )
601 , v_y( y )
602 , v_x( x )
603 , m_a( a )
604 , m_b( b )
605 , m_row_count( A.graph.row_map.extent(0)-1 )
606 , dim( dimension_scalar(x) )
607 {}
608
609 __device__ void operator()(void) const
610 {
611 // Store matrix values and column indices in shared memory
612 // to reduce global memory reads
613 volatile matrix_scalar * const sh_A =
614 kokkos_impl_cuda_shared_memory<matrix_scalar>();
615 volatile size_type * const sh_col =
616 reinterpret_cast<volatile size_type*>(sh_A + BlockSize*blockDim.y);
617
618 // Check for valid row
619 const size_type iBlockRow = blockDim.y*blockIdx.x + threadIdx.y;
620 if (iBlockRow < m_row_count) {
621
622 const size_type iEntryBegin = m_A_graph.row_map[ iBlockRow ];
623 const size_type iEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
624
625 // Initialize result
626 if (m_b == output_scalar(0))
627 for ( size_type pce = threadIdx.x; pce < dim ; pce+=blockDim.x )
628 v_y(pce, iBlockRow) = 0.0;
629 else
630 for ( size_type pce = threadIdx.x; pce < dim ; pce+=blockDim.x )
631 v_y(pce, iBlockRow) = m_b*v_y(pce, iBlockRow);
632
633 // Loop over columns in chunks of size BlockSize
634 for (size_type col_block=iEntryBegin; col_block<iEntryEnd;
635 col_block+=BlockSize) {
636 const size_type num_col = col_block+BlockSize <= iEntryEnd ?
637 BlockSize : iEntryEnd-col_block;
638
639 // Read BlockSize column indices and matrix values at a time to
640 // maintain coalesced accesses
641 for (size_type col=threadIdx.x; col<num_col; col+=blockDim.x) {
642 sh_col[col*blockDim.y+threadIdx.y] =
643 m_A_graph.entries(col_block+col);
644 sh_A[col*blockDim.y+threadIdx.y] =
645 m_A_values(col_block+col);
646 }
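          // A sync is only needed when the threads sharing a matrix row span
          // more than one warp; with blockDim.x <= warp size the producers
          // and consumers of sh_col/sh_A for a row are warp-synchronous.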
647 if (blockDim.x > Kokkos::Impl::CudaTraits::WarpSize)
648 __syncthreads();
649
650 // Do the portion of the mat-vec for each PCE coefficient and for
651 // the columns within this block
652 for ( size_type pce = threadIdx.x; pce < dim ; pce+=blockDim.x ) {
653 output_scalar s = 0.0;
654 for ( size_type col = 0; col < num_col; ++col ) {
655 const size_type iCol = sh_col[col*blockDim.y+threadIdx.y];
656 const matrix_scalar aA = m_a*sh_A[col*blockDim.y+threadIdx.y];
657 s += aA*v_x(pce, iCol);
658 }
659 v_y(pce, iBlockRow) += s;
660 }
661 }
662 }
663 }
664 };
665
666 static void apply( const matrix_type & A ,
667 const input_vector_type & x ,
668 const output_vector_type & y ,
669 const input_scalar & a = input_scalar(1) ,
670 const output_scalar & b = output_scalar(0) )
671 {
672 const size_t row_count = A.graph.row_map.extent(0) - 1;
673 const size_type dim = dimension_scalar(x);
674
675 // Compute number of threads for PCE coefficients and number of
676 // matrix rows processed by each CUDA block. A total of 256 threads
677 // gives best occupancy
678 size_type threads_per_row;
679 size_type rows_per_block;
680 if (dim >= 32) {
681 threads_per_row = 32;
682 rows_per_block = 8;
683 }
684 else {
685 threads_per_row = 16;
686 rows_per_block = 16;
687 }
688 const size_type num_blocks =
689 (row_count + rows_per_block -1 ) / rows_per_block;
690
691 // Setup thread blocks and grid
692 const dim3 dBlock( threads_per_row , rows_per_block , 1 );
693 const dim3 dGrid( num_blocks, 1, 1 );
694
695 // Setup shared memory for storing matrix values and column indices
696 // Number of columns we process at a time -- making this bigger
697 // requires more shared memory and reduces occupancy
698 const int BlockSize = 32;
699 if (sizeof(matrix_scalar) > 4)
700 cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
701 else
702 cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte);
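    // The bank-size setting matches the width of matrix_scalar; the shared
    // allocation below holds a BlockSize-wide slab of matrix values and of
    // column indices for each of the dBlock.y rows in the block.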
703 const size_t shared =
704 (BlockSize*dBlock.y) * (sizeof(size_type) + sizeof(matrix_scalar));
705
706 // Launch kernel
707 MeanFullOccupancyKernelLaunch<<<dGrid, dBlock, shared >>>
708 ( Kernel<BlockSize>( A, x, y, a, b ) );
709 }
710};
711
712/*
713 * Disable this specialization as it is actually slower than the default
713 * implementation of launching the single-column kernel, one column at a time.
714 * It looks like it uses a few more registers, which reduces occupancy.
716
717// Kernel implementing y = A * x where PCE size of A is 1
718// A == Kokkos::CrsMatrix< Sacado::UQ::PCE<...>,...>, with A.values.sacado_size() == 1
719// x, y == Kokkos::View< Sacado::UQ::PCE<...>**,...>,
720// x and y are rank 2
721template <typename MatrixStorage,
722 typename MatrixOrdinal,
723 typename MatrixMemory,
724 typename MatrixSize,
725 typename InputStorage,
726 typename ... InputP,
727 typename OutputStorage,
728 typename ... OutputP>
729class MeanMultiply< Kokkos::CrsMatrix< const Sacado::UQ::PCE<MatrixStorage>,
730 MatrixOrdinal,
731 Kokkos::Cuda,
732 MatrixMemory,
733 MatrixSize >,
734 Kokkos::View< Sacado::UQ::PCE<InputStorage>**,
735 InputP... >,
736 Kokkos::View< Sacado::UQ::PCE<OutputStorage>**,
737 OutputP... >
738 >
739{
740public:
741 typedef Sacado::UQ::PCE<MatrixStorage> MatrixValue;
742 typedef Sacado::UQ::PCE<InputStorage> InputVectorValue;
743 typedef Sacado::UQ::PCE<OutputStorage> OutputVectorValue;
744
745 typedef Kokkos::Cuda MatrixDevice;
746 typedef MatrixDevice execution_space;
747 typedef Kokkos::CrsMatrix< const MatrixValue,
748 MatrixOrdinal,
749 MatrixDevice,
750 MatrixMemory,
751 MatrixSize> matrix_type;
752 typedef typename matrix_type::values_type matrix_values_type;
753 typedef typename MatrixValue::ordinal_type size_type;
754 typedef Kokkos::View< InputVectorValue**,
755 InputP... > input_vector_type;
756 typedef Kokkos::View< OutputVectorValue**,
757 OutputP... > output_vector_type;
758
759 typedef typename matrix_type::StaticCrsGraphType matrix_graph_type;
760 typedef typename MatrixValue::value_type matrix_scalar;
761 typedef typename InputVectorValue::value_type input_scalar;
762 typedef typename OutputVectorValue::value_type output_scalar;
763
764 template <int BlockSize>
765 struct Kernel {
766 typedef MatrixDevice execution_space;
767 typedef typename Kokkos::FlatArrayType<matrix_values_type>::type matrix_array_type;
768 typedef typename input_vector_type::array_type input_array_type;
769 typedef typename output_vector_type::array_type output_array_type;
770
771 const matrix_array_type m_A_values ;
772 const matrix_graph_type m_A_graph ;
773 const output_array_type v_y ;
774 const input_array_type v_x ;
775 const input_scalar m_a ;
776 const output_scalar m_b ;
777 const size_type m_row_count;
778 const size_type m_num_col;
779 const size_type dim ;
780
781 Kernel( const matrix_type & A ,
782 const input_vector_type & x ,
783 const output_vector_type & y ,
784 const input_scalar & a ,
785 const output_scalar & b )
786 : m_A_values( A.values )
787 , m_A_graph( A.graph )
788 , v_y( y )
789 , v_x( x )
790 , m_a( a )
791 , m_b( b )
792 , m_row_count( A.graph.row_map.extent(0)-1 )
793 , m_num_col( x.extent(1) )
794 , dim( dimension_scalar(x) )
795 {}
796
797 __device__ void operator()(void) const
798 {
799 // Store matrix values and column indices in shared memory
800 // to reduce global memory reads
801 volatile matrix_scalar * const sh_A =
802 kokkos_impl_cuda_shared_memory<matrix_scalar>();
803 volatile size_type * const sh_col =
804 reinterpret_cast<volatile size_type*>(sh_A + BlockSize*blockDim.y);
805
806 // Check for valid row
807 const size_type iBlockRow = blockDim.y*blockIdx.x + threadIdx.y;
808 if (iBlockRow < m_row_count) {
809
810 const size_type iEntryBegin = m_A_graph.row_map[ iBlockRow ];
811 const size_type iEntryEnd = m_A_graph.row_map[ iBlockRow + 1 ];
812
813 // Initialize result
814 if (m_b == output_scalar(0))
815 for ( size_type xcol = 0; xcol < m_num_col; xcol++)
816 for ( size_type pce = threadIdx.x; pce < dim ; pce+=blockDim.x )
817 v_y(pce, iBlockRow, xcol) = 0.0;
818 else
819 for ( size_type xcol = 0; xcol < m_num_col; xcol++)
820 for ( size_type pce = threadIdx.x; pce < dim ; pce+=blockDim.x )
821 v_y(pce, iBlockRow, xcol) = m_b*v_y(pce, iBlockRow, xcol);
822
823 // Loop over columns in chunks of size BlockSize
824 for (size_type col_block=iEntryBegin; col_block<iEntryEnd;
825 col_block+=BlockSize) {
826 const size_type num_col = col_block+BlockSize <= iEntryEnd ?
827 BlockSize : iEntryEnd-col_block;
828
829 // Read BlockSize column indices and matrix values at a time to
830 // maintain coalesced accesses
831 for (size_type col=threadIdx.x; col<num_col; col+=blockDim.x) {
832 sh_col[col*blockDim.y+threadIdx.y] =
833 m_A_graph.entries(col_block+col);
834 sh_A[col*blockDim.y+threadIdx.y] =
835 m_A_values(col_block+col);
836 }
837 if (blockDim.x > Kokkos::Impl::CudaTraits::WarpSize)
838 __syncthreads();
839
840 // Do the portion of the mat-vec for each PCE coefficient and for
841 // the columns within this block
842 for ( size_type xcol = 0; xcol < m_num_col; xcol++) {
843 for ( size_type pce = threadIdx.x; pce < dim ; pce+=blockDim.x ) {
844 output_scalar s = 0.0;
845 for ( size_type col = 0; col < num_col; ++col ) {
846 const size_type iCol = sh_col[col*blockDim.y+threadIdx.y];
847 const matrix_scalar aA = m_a*sh_A[col*blockDim.y+threadIdx.y];
848 s += aA*v_x(pce, iCol, xcol);
849 }
850 v_y(pce, iBlockRow, xcol) += s;
851 }
852 }
853 }
854 }
855 }
856 };
857
858 static void apply( const matrix_type & A ,
859 const input_vector_type & x ,
860 const output_vector_type & y ,
861 const input_scalar & a = input_scalar(1) ,
862 const output_scalar & b = output_scalar(0) )
863 {
864 const size_t row_count = A.graph.row_map.extent(0) - 1;
865 const size_type dim = dimension_scalar(x);
866
867 // Compute number of threads for PCE coefficients and number of
868 // matrix rows processed by each CUDA block. A total of 256 threads
869 // gives best occupancy
870 size_type threads_per_row;
871 size_type rows_per_block;
872 if (dim >= 32) {
873 threads_per_row = 32;
874 rows_per_block = 6;
875 }
876 else {
877 threads_per_row = 16;
878 rows_per_block = 12;
879 }
880 const size_type num_blocks =
881 (row_count + rows_per_block -1 ) / rows_per_block;
882
883 // Setup thread blocks and grid
884 const dim3 dBlock( threads_per_row , rows_per_block , 1 );
885 const dim3 dGrid( num_blocks, 1, 1 );
886
887 // Setup shared memory for storing matrix values and column indices
888 // Number of columns we process at a time -- making this bigger
889 // requires more shared memory and reduces occupancy
890 const int BlockSize = 32;
891 if (sizeof(matrix_scalar) > 4)
892 cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
893 else
894 cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte);
895 const size_t shared =
896 (BlockSize*dBlock.y) * (sizeof(size_type) + sizeof(matrix_scalar));
897
898 // Launch kernel
899 // MeanFullOccupancyKernelLaunch<<<dGrid, dBlock, shared >>>
900 // ( Kernel<BlockSize>( A, x, y, a, b ) );
901
902 Kokkos::Impl::cuda_parallel_launch_local_memory<<<dGrid, dBlock, shared >>>
903 ( Kernel<BlockSize>( A, x, y, a, b ) );
904 }
905};
906*/
907
908} // namespace Stokhos
909
910#endif /* #if defined( __CUDACC__) */
911
912#endif /* #ifndef KOKKOS_CRSMATRIX_UQ_PCE_CUDA_HPP */