Stokhos_Cuda_TiledCrsProductTensor.hpp
// @HEADER
// ***********************************************************************
//
// Stokhos Package
// Copyright (2009) Sandia Corporation
//
// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
// license for use of this work by or on behalf of the U.S. Government.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Eric T. Phipps (etphipp@sandia.gov).
//
// ***********************************************************************
// @HEADER

#ifndef STOKHOS_CUDA_TILED_CRS_PRODUCT_TENSOR_HPP
#define STOKHOS_CUDA_TILED_CRS_PRODUCT_TENSOR_HPP

#include <iostream>

#include "Kokkos_Core.hpp"

#include "Stokhos_Multiply.hpp"
#include "Stokhos_BlockCrsMatrix.hpp"
#include "Stokhos_TiledCrsProductTensor.hpp"

#include "cuda_profiler_api.h"

namespace Stokhos {

#if 1

//----------------------------------------------------------------------------

template< typename TensorScalar ,
          typename MatrixScalar ,
          typename VectorScalar >
class Multiply<
  BlockCrsMatrix< TiledCrsProductTensor< TensorScalar, Kokkos::Cuda >,
                  MatrixScalar, Kokkos::Cuda >,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda>,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda> >
{
public:

  typedef Kokkos::Cuda execution_space ;
  typedef execution_space::size_type size_type ;

  typedef TiledCrsProductTensor< TensorScalar, execution_space > tensor_type ;
  typedef BlockCrsMatrix< tensor_type, MatrixScalar, execution_space > matrix_type ;
  typedef Kokkos::View< VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda > vector_type ;

  class ProductTensorLoop {
  public:

    const matrix_type m_A;
    const vector_type m_x;
    const vector_type m_y;
    const size_type m_block_size;

    ProductTensorLoop( const matrix_type & A,
                       const vector_type & x,
                       const vector_type & y,
                       const size_type block_size )
      : m_A( A ), m_x( x ), m_y( y ), m_block_size(block_size) {}

    __device__
    void operator()(void) const
    {
      const size_type WarpSize = Kokkos::Impl::CudaTraits::WarpSize;

      // Number of bases in the stochastic system:
      const size_type dim = m_A.block.dimension();

      // Number of Cijk tiles
      const size_type n_tile = m_A.block.num_tiles();
      const size_type tile_size = m_A.block.tile_size();
      const size_type tile_dim = n_tile == 1 ? dim : tile_size;
      //const size_type tile_dim = tile_size;

      VectorScalar * const sh_x_k =
        kokkos_impl_cuda_shared_memory<VectorScalar>();
      VectorScalar * const sh_x_j =
        n_tile == 1 ? sh_x_k : sh_x_k + m_block_size*tile_dim;
      VectorScalar * const sh_A_k =
        sh_x_j + m_block_size*tile_dim;
      VectorScalar * const sh_A_j =
        n_tile == 1 ? sh_A_k : sh_A_k + m_block_size*tile_dim;
      VectorScalar * const sh_y = sh_A_j + m_block_size*tile_dim;
      volatile VectorScalar * const sh_t = sh_y + tile_dim;

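      // Shared-memory layout set up above: sh_x_k, sh_x_j, sh_A_k and sh_A_j
      // each hold m_block_size column-interleaved slabs of length tile_dim
      // (the j and k slabs alias when n_tile == 1), sh_y holds the tile's
      // partial result (tile_dim entries), and sh_t provides one reduction
      // slot per thread.  This mirrors the shmem size computed in apply().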
      const size_type nid = blockDim.x * blockDim.y;
      const size_type tid = threadIdx.x + blockDim.x * threadIdx.y;

      // blockIdx.x == row in the deterministic (finite element) system
      const size_type iBlockEntryBeg = m_A.graph.row_map[ blockIdx.x ];
      const size_type iBlockEntryEnd = m_A.graph.row_map[ blockIdx.x + 1 ];
      size_type numBlock = (iBlockEntryEnd-iBlockEntryBeg) / m_block_size;
      const size_type remBlock = (iBlockEntryEnd-iBlockEntryBeg) % m_block_size;
      if (remBlock > 0) ++numBlock;

      // Zero y
      for (size_type i=tid; i<dim; i+=nid) {
        m_y(i,blockIdx.x) = 0.0;
      }

      // Loop over Cijk tiles
      for (size_type tile = 0; tile<n_tile; ++tile) {

        const size_type i_offset = m_A.block.offset(tile, 0);
        const size_type j_offset = m_A.block.offset(tile, 1);
        const size_type k_offset = m_A.block.offset(tile, 2);
        const size_type i_range = m_A.block.range(tile, 0);
        const size_type j_range = m_A.block.range(tile, 1);
        const size_type k_range = m_A.block.range(tile, 2);
        const size_type n_row = m_A.block.num_rows(tile);

        // Zero y
        for (size_type i=tid; i<i_range; i+=nid) {
          sh_y[i] = 0.0;
        }

        // Loop over finite element column blocks.
        size_type iBlockEntry = iBlockEntryBeg;
        for (size_type block=0; block<numBlock;
             ++block, iBlockEntry+=m_block_size) {

          const size_type block_size =
            (block == numBlock-1 && remBlock > 0) ? remBlock : m_block_size;

          // Wait for X and A to be used in the previous iteration
          // before reading new values.
          __syncthreads();

          // Coalesced read blocks of X and A into shared memory
          for (size_type col=0; col<block_size; ++col) {

            const size_type iBlockColumn =
              m_A.graph.entries( iBlockEntry + col );
            const VectorScalar * const x_k = & m_x( k_offset , iBlockColumn );
            const VectorScalar * const x_j = & m_x( j_offset , iBlockColumn );
            const MatrixScalar * const A_k = & m_A.values( k_offset , iBlockEntry + col );
            const MatrixScalar * const A_j = & m_A.values( j_offset , iBlockEntry + col );

            for (size_type j=tid; j<j_range; j+=nid) {
              sh_x_j[col+j*m_block_size] = x_j[j];
              sh_A_j[col+j*m_block_size] = A_j[j];
            }
            if (n_tile > 1) {
              for (size_type k=tid; k<k_range; k+=nid) {
                sh_x_k[col+k*m_block_size] = x_k[k];
                sh_A_k[col+k*m_block_size] = A_k[k];
              }
            }

          }

          __syncthreads(); // wait for X and A to be read

          // Loop over stochastic rows in this tile
          for (size_type i=threadIdx.y; i<i_range; i+=blockDim.y) {
            VectorScalar s = 0;

            // Product tensor entries which this warp will iterate:
            const size_type lBeg = m_A.block.entry_begin(tile, i);
            const size_type lEnd = m_A.block.entry_end(tile, i);

            // Loop through sparse tensor contributions with coalesced reads.
            for (size_type l=lBeg+threadIdx.x; l<lEnd; l+=blockDim.x) {

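              // Each tensor entry packs its (j,k) coordinate pair into a
              // single word: the low 16 bits hold j and the high 16 bits hold
              // k.  Both are scaled by m_block_size below because the x and A
              // columns are interleaved in shared memory.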
              const size_type kj = m_A.block.coord( l );
              const TensorScalar v = m_A.block.value( l );
              const size_type j = ( kj & 0x0ffff ) * m_block_size ;
              const size_type k = ( kj >> 16 ) * m_block_size ;

              for ( size_type col = 0; col < block_size; ++col ) {
                s += v * ( sh_A_j[col+j] * sh_x_k[col+k] + sh_A_k[col+k] * sh_x_j[col+j] );
              }

            }

            // Reduction of 'y' within 'CudaTraits::WarpSize'
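            // This in-warp tree sum assumes blockDim.x == WarpSize (as set in
            // apply()) and warp-synchronous execution, with sh_t declared
            // volatile so each step observes its neighbor's update; no
            // __syncthreads() is needed within the warp.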
            sh_t[tid] = s;
            if ( threadIdx.x + 16 < WarpSize ) sh_t[tid] += sh_t[tid+16];
            if ( threadIdx.x + 8 < WarpSize ) sh_t[tid] += sh_t[tid+ 8];
            if ( threadIdx.x + 4 < WarpSize ) sh_t[tid] += sh_t[tid+ 4];
            if ( threadIdx.x + 2 < WarpSize ) sh_t[tid] += sh_t[tid+ 2];
            if ( threadIdx.x + 1 < WarpSize ) sh_t[tid] += sh_t[tid+ 1];
            if ( threadIdx.x == 0 ) sh_y[i] += sh_t[tid];

          }

        }

        // Wait for all threads to complete the tile
        __syncthreads();

        // Store partial sum for this tile back in global memory
        for (size_type i=tid; i<i_range; i+=nid) {
          m_y( i+i_offset , blockIdx.x ) += sh_y[i];
        }
      }

    }
  };

  //------------------------------------

  static void apply( const matrix_type & A ,
                     const vector_type & x ,
                     const vector_type & y )
  {
    const size_type row_count = A.graph.row_map.extent(0) - 1;
    const size_type tensor_dimension = A.block.dimension();
    const size_type tile_size = A.block.tile_size();
    const size_type num_tiles = A.block.num_tiles();
    //const size_type tile_dim = std::min(tensor_dimension, tile_size);
    const size_type tile_dim = tile_size;

#ifdef STOKHOS_DEBUG
    const size_type nWarp = 12; // Use fewer warps in debug mode to prevent
                                // launch failures
#else
    const size_type nWarp = 16;
#endif
    const dim3 dBlock( Kokkos::Impl::CudaTraits::WarpSize , nWarp , 1 );
    const dim3 dGrid( row_count , 1 , 1 );

    const size_type shmem_factor = num_tiles == 1 ? 2 : 4;
    const size_type tensor_align = num_tiles == 1 ? tensor_dimension : tile_dim;
    const size_type shcap =
      Kokkos::Cuda().impl_internal_space_instance()->m_maxShmemPerBlock / 2;
    size_type bs = ((shcap / sizeof(VectorScalar) - dBlock.x*dBlock.y) / tensor_align - 1) / shmem_factor;
    if (bs % 2 == 0)
      --bs;
    const size_type block_size_max = 31;
    const size_type block_size = std::min(bs, block_size_max);
    // const int block_size = 9;
    const size_type shmem =
      sizeof(VectorScalar) * ((shmem_factor*block_size+1) * tensor_align + dBlock.x*dBlock.y);

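    // Shared-memory budget: shmem_factor slabs of block_size interleaved
    // columns for x and A (2 slabs when there is a single tile, since the j
    // and k slabs alias, 4 otherwise), one tensor_align-sized slab for the
    // per-tile y accumulator, and one scalar per thread for the warp
    // reductions.  block_size is chosen so this fits within half of the
    // reported per-block shared-memory capacity and is forced to be odd,
    // presumably to avoid shared-memory bank conflicts in the interleaved
    // arrays.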
    /*
    //const size_type shcap = Kokkos::Impl::CudaTraits::SharedMemoryCapacity;
    const size_type block_size = 9;
    size_type shmem;
    if (num_tiles > 1)
      shmem = sizeof(VectorScalar) * ((4*block_size+1)*tile_dim + dBlock.x*dBlock.y);
    else
      shmem = sizeof(VectorScalar) * ((2*block_size+1)*tensor_dimension + dBlock.x*dBlock.y);
    */

#if 0

    std::cout << "Multiply< BlockCrsMatrix< CrsProductTensor ... > >::apply"
              << std::endl
              << " grid(" << dGrid.x << "," << dGrid.y << ")" << std::endl
              << " block(" << dBlock.x << "," << dBlock.y << ")" << std::endl
              << " block_size(" << block_size << ")" << std::endl
              << " shmem(" << shmem/1024 << " kB)" << std::endl
              << " row_count(" << row_count << ")" << std::endl
              << " tensor_dimension(" << tensor_dimension << ")" << std::endl
              << " tile_size(" << tile_size << ")" << std::endl
              << " num_tiles(" << num_tiles << ")" << std::endl
              ;
#endif
    //cudaProfilerStart();
    Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid , dBlock , shmem >>>
      ( ProductTensorLoop( A , x , y, block_size ) );
    //cudaProfilerStop();
  }
};

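// A minimal usage sketch, assuming a matrix A and vectors x, y of the types
// below have already been assembled (the float element types are placeholders;
// x and y are laid out tensor-dimension by FEM-row-count):
//
//   typedef Stokhos::TiledCrsProductTensor< float, Kokkos::Cuda >        tensor_type;
//   typedef Stokhos::BlockCrsMatrix< tensor_type, float, Kokkos::Cuda >  matrix_type;
//   typedef Kokkos::View< float**, Kokkos::LayoutLeft, Kokkos::Cuda >    vector_type;
//
//   Stokhos::Multiply< matrix_type, vector_type, vector_type >::apply( A, x, y );
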
#elif 0

//----------------------------------------------------------------------------

template< typename TensorScalar ,
          typename MatrixScalar ,
          typename VectorScalar >
class Multiply<
  BlockCrsMatrix< TiledCrsProductTensor< TensorScalar, Kokkos::Cuda >,
                  MatrixScalar, Kokkos::Cuda >,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda>,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda> >
{
public:

  typedef Kokkos::Cuda execution_space ;
  typedef execution_space::size_type size_type ;

  typedef TiledCrsProductTensor< TensorScalar, execution_space > tensor_type ;
  typedef BlockCrsMatrix< tensor_type, MatrixScalar, execution_space > matrix_type ;
  typedef Kokkos::View< VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda > vector_type ;

  class ProductTensorLoop {
  public:

    const matrix_type m_A;
    const vector_type m_x;
    const vector_type m_y;
    const size_type m_tile;

    ProductTensorLoop( const matrix_type & A,
                       const vector_type & x,
                       const vector_type & y,
                       const size_type tile )
      : m_A( A ), m_x( x ), m_y( y ), m_tile(tile) {}

    __device__
    void operator()(void) const
    {
      const size_type WarpSize = Kokkos::Impl::CudaTraits::WarpSize;

      // Number of bases in the stochastic system:
      const size_type dim = m_A.block.dimension();
      const size_type tile_size = m_A.block.tile_size();

      VectorScalar * const sh_x_k =
        kokkos_impl_cuda_shared_memory<VectorScalar>();
      VectorScalar * const sh_x_j = sh_x_k + tile_size;
      VectorScalar * const sh_A_k = sh_x_j + tile_size;
      VectorScalar * const sh_A_j = sh_A_k + tile_size;
      VectorScalar * const sh_y = sh_A_j + tile_size;
      volatile VectorScalar * const sh_t = sh_y + tile_size;

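      // This variant processes a single tile per kernel launch, so the five
      // slabs above are each tile_size entries and sh_t again provides one
      // reduction slot per thread; see the 5*tile_size term in the shmem
      // computation in apply() below.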
      const size_type nid = blockDim.x * blockDim.y;
      const size_type tid = threadIdx.x + blockDim.x * threadIdx.y;

      // Divide warps into 4 groups for reading x_j, x_k, A_j, and A_k
      // const size_type warps_per_group = blockDim.y / 4;
      // const size_type group_lane = threadIdx.y % warps_per_group;
      // const size_type gid = threadIdx.x + blockDim.x * group_lane;
      // const size_type ngid = warps_per_group * blockDim.x;
      // const size_type group = threadIdx.y / warps_per_group;

      const size_type i_offset = m_A.block.offset(m_tile, 0);
      const size_type j_offset = m_A.block.offset(m_tile, 1);
      const size_type k_offset = m_A.block.offset(m_tile, 2);
      const size_type i_range = m_A.block.range(m_tile, 0);
      const size_type j_range = m_A.block.range(m_tile, 1);
      const size_type k_range = m_A.block.range(m_tile, 2);
      const size_type n_row = m_A.block.num_rows(m_tile);

      // blockIdx.x == row in the deterministic (finite element) system
      const size_type iBlockEntryBeg = m_A.graph.row_map[ blockIdx.x ];
      const size_type iBlockEntryEnd = m_A.graph.row_map[ blockIdx.x + 1 ];

      // Zero y
      for (size_type i=tid; i<i_range; i+=nid) {
        sh_y[i] = 0.0;
      }

      const size_type * __restrict cijk_j = &m_A.block.coord(0,0);
      const size_type * __restrict cijk_k = &m_A.block.coord(0,1);
      const TensorScalar * __restrict cijk_v = &m_A.block.value(0);

      // Loop over finite element column blocks.
      for (size_type iBlockEntry = iBlockEntryBeg; iBlockEntry<iBlockEntryEnd;
           ++iBlockEntry) {

        __syncthreads(); // wait for threads from previous iteration to finish

        // Coalesced read blocks of X and A into shared memory
        const size_type iBlockColumn = m_A.graph.entries( iBlockEntry );
        const VectorScalar * const x_k = & m_x( k_offset , iBlockColumn );
        const VectorScalar * const x_j = & m_x( j_offset , iBlockColumn );
        const MatrixScalar * const A_k = & m_A.values( k_offset , iBlockEntry );
        const MatrixScalar * const A_j = & m_A.values( j_offset , iBlockEntry );

        for (size_type j=tid; j<j_range; j+=nid) {
          sh_x_j[j] = x_j[j];
          sh_A_j[j] = A_j[j];
        }
        for (size_type k=tid; k<k_range; k+=nid) {
          sh_x_k[k] = x_k[k];
          sh_A_k[k] = A_k[k];
        }
        // if (group == 0)
        //   for (size_type j=gid; j<j_range; j+=ngid) sh_x_j[j] = x_j[j];
        // if (group == 1)
        //   for (size_type j=gid; j<j_range; j+=ngid) sh_A_j[j] = A_j[j];
        // if (group == 2)
        //   for (size_type k=gid; k<k_range; k+=ngid) sh_x_k[k] = x_k[k];
        // if (group == 3)
        //   for (size_type k=gid; k<k_range; k+=ngid) sh_A_k[k] = A_k[k];

        __syncthreads(); // wait for X and A to be read

        // Loop over stochastic rows in this tile
        for (size_type i=threadIdx.y; i<i_range; i+=blockDim.y) {
          VectorScalar s = 0;

          // Product tensor entries which this warp will iterate:
          const size_type lBeg = m_A.block.entry_begin(m_tile, i);
          const size_type lEnd = m_A.block.entry_end(m_tile, i);

          // Loop through sparse tensor contributions with coalesced reads.
          for (size_type l=lBeg+threadIdx.x; l<lEnd; l+=blockDim.x) {

            // Read entries from the tensor
            // const int j = m_A.block.coord(l,0);
            // const int k = m_A.block.coord(l,1);
            // const MatrixScalar v = m_A.block.value(l);
            const size_type j = cijk_j[l];
            const size_type k = cijk_k[l];
            const TensorScalar v = cijk_v[l];

            s += v * ( sh_A_j[j] * sh_x_k[k] + sh_A_k[k] * sh_x_j[j] );

          }

          // Reduction of 'y' within 'CudaTraits::WarpSize'
          sh_t[tid] = s;
          if ( threadIdx.x + 16 < WarpSize ) sh_t[tid] += sh_t[tid+16];
          if ( threadIdx.x + 8 < WarpSize ) sh_t[tid] += sh_t[tid+ 8];
          if ( threadIdx.x + 4 < WarpSize ) sh_t[tid] += sh_t[tid+ 4];
          if ( threadIdx.x + 2 < WarpSize ) sh_t[tid] += sh_t[tid+ 2];
          if ( threadIdx.x + 1 < WarpSize ) sh_t[tid] += sh_t[tid+ 1];
          if ( threadIdx.x == 0 ) sh_y[i] += sh_t[tid];

        }

      }

      // Wait for all threads to complete the tile
      __syncthreads();

      // Store partial sum for this tile back in global memory
      for (size_type i=tid; i<i_range; i+=nid) {
        m_y( i+i_offset , blockIdx.x ) += sh_y[i];
      }

    }
  };

  class Zero {
  public:
    typedef typename vector_type::value_type value_type;
    typedef typename vector_type::execution_space execution_space;
    typedef typename execution_space::size_type size_type;

    Zero( const vector_type & x ) : m_x( x ), m_d(x.extent(0)) {}

    KOKKOS_INLINE_FUNCTION
    void operator()( const size_type j ) const {
      for (size_type i=0; i<m_d; ++i)
        m_x(i,j) = 0.0;
    }

  private:
    const vector_type m_x;
    const size_type m_d;

  };

  //------------------------------------

  static void apply( const matrix_type & A ,
                     const vector_type & x ,
                     const vector_type & y )
  {
    const size_type row_count = A.graph.row_map.extent(0) - 1;
    const size_type tensor_dimension = A.block.dimension();
    const size_type tile_size = A.block.tile_size();
    const size_type num_tiles = A.block.num_tiles();

    const size_type nWarp = 16;
    const dim3 dBlock( Kokkos::Impl::CudaTraits::WarpSize , nWarp , 1 );
    const dim3 dGrid( row_count , 1 , 1 );

    const size_type shmem =
      sizeof(VectorScalar) * (5*tile_size + dBlock.x*dBlock.y);

#if 1

    std::cout << "Multiply< BlockCrsMatrix< CrsProductTensor ... > >::apply"
              << std::endl
              << " grid(" << dGrid.x << "," << dGrid.y << ")" << std::endl
              << " block(" << dBlock.x << "," << dBlock.y << ")" << std::endl
              << " shmem(" << shmem/1024 << " kB)" << std::endl
              << " row_count(" << row_count << ")" << std::endl
              << " tensor_dimension(" << tensor_dimension << ")" << std::endl
              << " tile_size(" << tile_size << ")" << std::endl
              << " num_tiles(" << num_tiles << ")" << std::endl
              ;
#endif
    //cudaProfilerStart();
    // Zero y
    Kokkos::parallel_for( row_count , Zero(y) );

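    // Each per-tile kernel below accumulates into y rather than overwriting
    // it, so y is zeroed once here before the tiles are launched in sequence.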
    // Loop over Cijk tiles
    for (size_type tile = 0; tile<num_tiles; ++tile) {
      Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid , dBlock , shmem >>>
        ( ProductTensorLoop( A , x , y, tile ) );
    }
    //cudaProfilerStop();
  }
};

#else

//----------------------------------------------------------------------------

// #define MAX_TENSOR_OFFSETS 2000
// __constant__ Kokkos::Cuda::size_type tensor_row_begin[MAX_TENSOR_OFFSETS];

template< typename TensorScalar ,
          typename MatrixScalar ,
          typename VectorScalar >
class Multiply<
  BlockCrsMatrix< TiledCrsProductTensor< TensorScalar, Kokkos::Cuda >,
                  MatrixScalar, Kokkos::Cuda >,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda>,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda> >
{
public:

  typedef Kokkos::Cuda execution_space ;
  typedef execution_space::size_type size_type ;

  typedef TiledCrsProductTensor< TensorScalar, execution_space > tensor_type ;
  typedef BlockCrsMatrix< tensor_type, MatrixScalar, execution_space > matrix_type ;
  typedef Kokkos::View< VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda > vector_type ;

  class ProductTensorLoop {
  public:

    const matrix_type m_A;
    const vector_type m_x;
    const vector_type m_y;

    ProductTensorLoop( const matrix_type & A,
                       const vector_type & x,
                       const vector_type & y )
      : m_A( A ), m_x( x ), m_y( y ) {}

    __device__
    void operator()(void) const
    {
      const size_type WarpSize = Kokkos::Impl::CudaTraits::WarpSize;

      // Number of bases in the stochastic system:
      const size_type dim = m_A.block.dimension();
      const size_type tile_size = m_A.block.tile_size();
      const size_type max_num_rows = m_A.block.max_num_rows();

      const size_type nid = blockDim.x * blockDim.y * blockDim.z;
      const size_type tid =
        threadIdx.x + blockDim.x * ( threadIdx.y + blockDim.y * threadIdx.z );
      const size_type thread_lane = threadIdx.x + blockDim.x * threadIdx.y;

      // blockDim.x == FEM column block-size
      VectorScalar * const sh_x_k =
        kokkos_impl_cuda_shared_memory<VectorScalar>();
      //VectorScalar * const sh_x_j = sh_x_k + blockDim.x*tile_size;
      VectorScalar * const sh_x_j = sh_x_k;
      VectorScalar * const sh_A_k = sh_x_j + blockDim.x*tile_size;
      //VectorScalar * const sh_A_j = sh_A_k + blockDim.x*tile_size;
      VectorScalar * const sh_A_j = sh_A_k;
      VectorScalar * const sh_y = sh_A_j + blockDim.x*tile_size;
      volatile VectorScalar * const sh_t = sh_y + tile_size;
      //volatile VectorScalar * const sh_v = sh_t + nid;
      size_type * const sh_j = (size_type*) (sh_t + nid);
      size_type * const sh_k = (size_type*) (sh_j + nid);
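      // sh_j and sh_k stage the tensor (j,k) coordinates that each warp reads
      // cooperatively below (the corresponding values are staged through
      // sh_t); the extra sizeof(size_type) * 2 * nid term in apply()'s shmem
      // computation accounts for these two integer arrays.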

      // blockIdx.x == row in the deterministic (finite element) system
      // These are not coalesced, but there is nothing we can do about it
      // since we only do one row per block
      const size_type iBlockEntryBeg = m_A.graph.row_map[ blockIdx.x ];
      const size_type iBlockEntryEnd = m_A.graph.row_map[ blockIdx.x + 1 ];
      size_type numBlock = (iBlockEntryEnd-iBlockEntryBeg) / blockDim.x;
      const size_type remBlock = (iBlockEntryEnd-iBlockEntryBeg) % blockDim.x;
      if (remBlock > 0) ++numBlock;

      // Zero y
      for (size_type i=tid; i<dim; i+=nid) {
        m_y(i,blockIdx.x) = 0.0;
      }

      // Number of Cijk tiles
      const size_type n_tile = m_A.block.num_tiles();

      // Loop over Cijk tiles
      for (size_type tile = 0; tile<n_tile; ++tile) {

        // These are not coalesced
        const size_type i_offset = m_A.block.offset(tile, 0);
        const size_type j_offset = m_A.block.offset(tile, 1);
        const size_type k_offset = m_A.block.offset(tile, 2);
        const size_type i_range = m_A.block.range(tile, 0);
        const size_type j_range = m_A.block.range(tile, 1);
        const size_type k_range = m_A.block.range(tile, 2);
        const size_type n_row = m_A.block.num_rows(tile);

        // Zero y
        for (size_type i=tid; i<i_range; i+=nid) {
          sh_y[i] = 0.0;
        }

        // Loop over finite element column blocks.
        // threadIdx.x == FEM column within current block
        size_type iBlockEntry = iBlockEntryBeg;
        for (size_type block=0; block<numBlock;
             ++block, iBlockEntry+=blockDim.x) {

          const size_type block_size =
            (block == numBlock-1 && remBlock > 0) ? remBlock : blockDim.x;

          // Coalesced read blocks of X and A into shared memory
          for (size_type col=0; col<block_size; ++col) {

            // This is not a coalesced read
            const size_type iBlockColumn =
              m_A.graph.entries( iBlockEntry + col );

            const VectorScalar * const x_k = & m_x( k_offset , iBlockColumn );
            const VectorScalar * const x_j = & m_x( j_offset , iBlockColumn );
            const MatrixScalar * const A_k =
              & m_A.values( k_offset , iBlockEntry + col );
            const MatrixScalar * const A_j =
              & m_A.values( j_offset , iBlockEntry + col );
            for (size_type j=tid; j<j_range; j+=nid) {
              sh_x_j[col+j*blockDim.x] = x_j[j];
              sh_A_j[col+j*blockDim.x] = A_j[j];
            }
            // for (size_type k=tid; k<k_range; k+=nid) {
            //   sh_x_k[col+k*blockDim.x] = x_k[k];
            //   sh_A_k[col+k*blockDim.x] = A_k[k];
            // }

          }

          __syncthreads(); // wait for X and A to be read

          // Loop over stochastic rows in this tile
          for (size_type i=threadIdx.z; i<i_range; i+=blockDim.z) {
            VectorScalar s = 0;

            // Product tensor entries which this warp will iterate:
            // These are not coalesced
            const size_type lBeg = m_A.block.entry_begin(tile, i);
            const size_type lEnd = m_A.block.entry_end(tile, i);
            // const size_type lBeg = tensor_row_begin[tile*(max_num_rows+1)+i];
            // const size_type lEnd = tensor_row_begin[tile*(max_num_rows+1)+i+1];

            // Loop through sparse tensor contributions with coalesced reads.
            for (size_type l=lBeg; l<lEnd; l+=WarpSize) {
            //for (size_type l=lBeg+threadIdx.y; l<lEnd; l+=blockDim.y) {

              // Read entries from the tensor
              if (l+thread_lane < lEnd) {
                sh_j[tid] = m_A.block.coord(l+thread_lane,0);
                sh_k[tid] = m_A.block.coord(l+thread_lane,1);
                sh_t[tid] = m_A.block.value(l+thread_lane);
              }

              const int mm = (l+WarpSize<lEnd) ? WarpSize : lEnd-l;
              for (size_type m=threadIdx.y; m<mm; m+=blockDim.y) {

                if (threadIdx.x < block_size) {
                  // const int j = m_A.block.coord(l+m,0);
                  // const int k = m_A.block.coord(l+m,1);
                  // const MatrixScalar v = m_A.block.value(l+m);
                  const size_type j = sh_j[m+WarpSize*threadIdx.z];
                  const size_type k = sh_k[m+WarpSize*threadIdx.z];
                  const TensorScalar v = sh_t[m+WarpSize*threadIdx.z];
                  const size_type jj = threadIdx.x + j*blockDim.x;
                  const size_type kk = threadIdx.x + k*blockDim.x;
                  s += v * ( sh_A_j[jj] * sh_x_k[kk] + sh_A_k[kk] * sh_x_j[jj] );
                }

              }

            }

            // Reduction of 'y' within 'CudaTraits::WarpSize'
            sh_t[tid] = s;
            if ( thread_lane + 16 < WarpSize ) sh_t[tid] += sh_t[tid+16];
            if ( thread_lane + 8 < WarpSize ) sh_t[tid] += sh_t[tid+ 8];
            if ( thread_lane + 4 < WarpSize ) sh_t[tid] += sh_t[tid+ 4];
            if ( thread_lane + 2 < WarpSize ) sh_t[tid] += sh_t[tid+ 2];
            if ( thread_lane + 1 < WarpSize ) sh_t[tid] += sh_t[tid+ 1];
            if ( thread_lane == 0 ) sh_y[i] += sh_t[tid];

          }

        }

        // Wait for all threads to complete the tile
        __syncthreads();

        // Store partial sum for this tile back in global memory
        for (size_type i=tid; i<i_range; i+=nid) {
          m_y( i+i_offset , blockIdx.x ) += sh_y[i];
        }
      }

    }
  };

  //------------------------------------

  static void apply( const matrix_type & A ,
                     const vector_type & x ,
                     const vector_type & y )
  {
    const size_type row_count = A.graph.row_map.extent(0) - 1;
    const size_type tensor_dimension = A.block.dimension();
    const size_type tile_size = A.block.tile_size();
    const size_type num_tiles = A.block.num_tiles();
    const size_type max_num_rows = A.block.max_num_rows();

    const size_type warp_size = Kokkos::Impl::CudaTraits::WarpSize;
    const size_type block_size = 8;
    const size_type nWarp = 16;
    const dim3 dBlock( block_size, warp_size / block_size, nWarp );
    const dim3 dGrid( row_count , 1 , 1 );

    const size_type shmem =
      sizeof(VectorScalar) * ( (2*block_size+1)*tile_size +
                               dBlock.x*dBlock.y*dBlock.z ) +
      sizeof(size_type) * 2 * dBlock.x*dBlock.y*dBlock.z;

    const size_type cmem = sizeof(size_type)*(max_num_rows+1)*num_tiles;

#if 1

    std::cout << "Multiply< BlockCrsMatrix< CrsProductTensor ... > >::apply"
              << std::endl
              << " grid(" << dGrid.x << "," << dGrid.y << ")" << std::endl
              << " block(" << dBlock.x << "," << dBlock.y << "," << dBlock.z
              << ")" << std::endl
              << " block_size(" << block_size << ")" << std::endl
              << " shmem(" << shmem/1024 << " kB)" << std::endl
              << " row_count(" << row_count << ")" << std::endl
              << " tensor_dimension(" << tensor_dimension << ")" << std::endl
              << " tile_size(" << tile_size << ")" << std::endl
              << " num_tiles(" << num_tiles << ")" << std::endl
              << " cmem(" << cmem/1024 << " kB)" << std::endl
              ;
#endif

    // Copy tensor row offsets to constant device memory
    // cudaMemcpyToSymbol(tensor_row_begin, A.block.row_map_ptr(), cmem, 0,
    //                    cudaMemcpyDeviceToDevice);

    //cudaProfilerStart();
    Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid , dBlock , shmem >>>
      ( ProductTensorLoop( A , x , y ) );
    //cudaProfilerStop();
  }
};

#endif

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

} // namespace Stokhos

#endif /* #ifndef STOKHOS_CUDA_TILED_CRS_PRODUCT_TENSOR_HPP */