Stokhos Package Browser (Single Doxygen Collection) Version of the Day
Stokhos_Cuda_LinearSparse3Tensor.hpp
// @HEADER
// ***********************************************************************
//
//                           Stokhos Package
//                 Copyright (2009) Sandia Corporation
//
// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
// license for use of this work by or on behalf of the U.S. Government.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Eric T. Phipps (etphipp@sandia.gov).
//
// ***********************************************************************
// @HEADER
#ifndef STOKHOS_CUDA_LINEAR_SPARSE_3_TENSOR_HPP
#define STOKHOS_CUDA_LINEAR_SPARSE_3_TENSOR_HPP

#include <iostream>

#include "Kokkos_Core.hpp"

#include "Stokhos_Multiply.hpp"
#include "Stokhos_BlockCrsMatrix.hpp"
#include "Stokhos_LinearSparse3Tensor.hpp"

#include "cuda_profiler_api.h"

namespace Stokhos {

//----------------------------------------------------------------------------

/*
 * This version uses 4 warps per block, one warp per spatial row, and loops
 * through stochastic rows 32 at a time.
 */
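/*
 * Sketch of the block product the kernels below compute (read off the
 * kernel bodies; c0, c1, c2 are the tensor values from m_A.block.value(),
 * a is a matrix block and x, y are stochastic vector blocks):
 *
 *   symmetric:   y(0) = c0*a(0)*x(0) + c1 * sum_{i>0} a(i)*x(i)
 *                y(i) = c1*( a(0)*x(i) + a(i)*x(0) ),                 i > 0
 *
 *   asymmetric:  y(0) = c0*a(0)*x(0) + c1 * sum_{i>0} a(i)*x(i)
 *                y(i) = c1*( a(0)*x(i) + a(i)*x(0) ) + c2*a(i)*x(i),  i > 0
 *
 * accumulated over the nonzero column blocks of each spatial row.
 */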
template< typename TensorScalar,
          typename MatrixScalar,
          typename VectorScalar,
          int BlockSize >
class Multiply<
  BlockCrsMatrix< LinearSparse3Tensor< TensorScalar, Kokkos::Cuda, BlockSize >,
                  MatrixScalar, Kokkos::Cuda >,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda>,
  Kokkos::View<VectorScalar**, Kokkos::LayoutLeft, Kokkos::Cuda> >
{
public:

  typedef Kokkos::Cuda execution_space;
  typedef execution_space::size_type size_type;

  typedef LinearSparse3Tensor< TensorScalar, execution_space, BlockSize > tensor_type;
  typedef BlockCrsMatrix< tensor_type, MatrixScalar, execution_space > matrix_type;
  typedef Kokkos::View< VectorScalar**,
                        Kokkos::LayoutLeft,
                        Kokkos::Cuda > vector_type;

  template <int MAX_COL>
  class ApplyKernelSymmetric {
  public:

    const matrix_type m_A;
    const vector_type m_x;
    const vector_type m_y;

    ApplyKernelSymmetric( const matrix_type & A,
                          const vector_type & x,
                          const vector_type & y )
      : m_A( A ), m_x( x ), m_y( y ) {}

    __device__
    void operator()(void) const
    {
      // Number of bases in the stochastic system:
      const size_type dim = m_A.block.dimension();

      volatile VectorScalar * const sh =
        kokkos_impl_cuda_shared_memory<VectorScalar>();
      volatile VectorScalar * const sh_y0 =
        sh + blockDim.x*threadIdx.y;
      volatile VectorScalar * const sh_a0 =
        sh + blockDim.x*blockDim.y + MAX_COL*threadIdx.y;
      volatile VectorScalar * const sh_x0 =
        sh + blockDim.x*blockDim.y + MAX_COL*blockDim.y + MAX_COL*threadIdx.y;
      volatile size_type * const sh_col =
        (size_type*)(sh + blockDim.x*blockDim.y + 2*MAX_COL*blockDim.y) + MAX_COL*threadIdx.y;
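      // Shared memory layout (one slice per warp, i.e. per spatial row):
      //   sh_y0  : blockDim.x scalars used for the warp reduction of y0
      //   sh_a0  : MAX_COL scalars caching a(0,col) for this row's columns
      //   sh_x0  : MAX_COL scalars caching x(0,col) for this row's columns
      //   sh_col : MAX_COL column indices for this row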

      // blockIdx.x == row in the deterministic (finite element) system
      const size_type row = blockIdx.x*blockDim.y + threadIdx.y;
      if (row < m_A.graph.row_map.extent(0)-1) {
        const size_type colBeg = m_A.graph.row_map[ row ];
        const size_type colEnd = m_A.graph.row_map[ row + 1 ];

        // Read tensor entries
        const TensorScalar c0 = m_A.block.value(0);
        const TensorScalar c1 = m_A.block.value(1);

        // Zero y
        VectorScalar y0 = 0.0;

        // Read column offsets for this row
        for ( size_type lcol = threadIdx.x; lcol < colEnd-colBeg;
              lcol += blockDim.x )
          sh_col[lcol] = m_A.graph.entries( lcol+colBeg );

        // Loop over stochastic rows
        for ( size_type stoch_row = threadIdx.x; stoch_row < dim;
              stoch_row += blockDim.x) {

          VectorScalar yi = 0.0;

          // Loop over columns in the discrete (finite element) system.
          // We should probably only loop over a portion at a time to
          // make better use of L2 cache.
          //
          // We could also apply some warps to columns in parallel.
          // This requires inter-warp reduction of y0 and yi.
          for ( size_type col_offset = colBeg; col_offset < colEnd;
                col_offset += 1 ) {

            // Get column index
            const size_type lcol = col_offset-colBeg;
            const size_type col = sh_col[lcol];

            // Read a[i] and x[i] (coalesced)
            const MatrixScalar ai = m_A.values( stoch_row, col_offset );
            const VectorScalar xi = m_x( stoch_row, col );

            // Store a[0] and x[0] to shared memory for this column
            if (stoch_row == 0) {
              sh_a0[lcol] = ai;
              sh_x0[lcol] = xi;
            }

            // Retrieve a[0] and x[0] from shared memory for this column
            const MatrixScalar a0 = sh_a0[lcol];
            const VectorScalar x0 = sh_x0[lcol];

            // Subtract off contribution of first iteration of loop
            if (stoch_row == 0) y0 += (c0-3.0*c1)*a0*x0;

            yi += c1*(a0*xi + ai*x0);
            y0 += c1*ai*xi;
          }

          // Write y back to global memory
          m_y( stoch_row, row ) = yi;

          // Sync warps so rows don't get too out-of-sync to make use of
          // locality in spatial matrix and L2 cache
          __syncthreads();
        }

        // Reduction of 'y0' within warp
        sh_y0[ threadIdx.x ] = y0;
        if ( threadIdx.x + 16 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+16];
        if ( threadIdx.x + 8 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+ 8];
        if ( threadIdx.x + 4 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+ 4];
        if ( threadIdx.x + 2 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+ 2];
        if ( threadIdx.x + 1 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+ 1];
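        // Note: this tree reduction assumes warp-synchronous execution over
        // the 32 threads of a warp together with the volatile shared-memory
        // accesses above; no explicit intra-warp synchronization is used.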

        // Store y0 back in global memory
        if ( threadIdx.x == 0 ) m_y( 0, row ) += sh_y0[0];

      }
    }
  };

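  // Same structure as ApplyKernelSymmetric above; the asymmetric tensor
  // carries a third value c2 that adds a c2*a(i)*x(i) diagonal term to
  // each stochastic row i > 0.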
  template <int MAX_COL>
  class ApplyKernelAsymmetric {
  public:

    const matrix_type m_A;
    const vector_type m_x;
    const vector_type m_y;

    ApplyKernelAsymmetric( const matrix_type & A,
                           const vector_type & x,
                           const vector_type & y )
      : m_A( A ), m_x( x ), m_y( y ) {}

    __device__
    void operator()(void) const
    {
      // Number of bases in the stochastic system:
      const size_type dim = m_A.block.dimension();

      volatile VectorScalar * const sh =
        kokkos_impl_cuda_shared_memory<VectorScalar>();
      volatile VectorScalar * const sh_y0 =
        sh + blockDim.x*threadIdx.y;
      volatile VectorScalar * const sh_a0 =
        sh + blockDim.x*blockDim.y + MAX_COL*threadIdx.y;
      volatile VectorScalar * const sh_x0 =
        sh + blockDim.x*blockDim.y + MAX_COL*blockDim.y + MAX_COL*threadIdx.y;
      volatile size_type * const sh_col =
        (size_type*)(sh + blockDim.x*blockDim.y + 2*MAX_COL*blockDim.y) + MAX_COL*threadIdx.y;

      // blockIdx.x == row in the deterministic (finite element) system
      const size_type row = blockIdx.x*blockDim.y + threadIdx.y;
      if (row < m_A.graph.row_map.extent(0)-1) {
        const size_type colBeg = m_A.graph.row_map[ row ];
        const size_type colEnd = m_A.graph.row_map[ row + 1 ];

        // Read tensor entries
        const TensorScalar c0 = m_A.block.value(0);
        const TensorScalar c1 = m_A.block.value(1);
        const TensorScalar c2 = m_A.block.value(2);

        // Zero y
        VectorScalar y0 = 0.0;

        // Read column offsets for this row
        for ( size_type lcol = threadIdx.x; lcol < colEnd-colBeg;
              lcol += blockDim.x )
          sh_col[lcol] = m_A.graph.entries( lcol+colBeg );

        // Loop over stochastic rows
        for ( size_type stoch_row = threadIdx.x; stoch_row < dim;
              stoch_row += blockDim.x) {

          VectorScalar yi = 0.0;

          // Loop over columns in the discrete (finite element) system.
          // We should probably only loop over a portion at a time to
          // make better use of L2 cache.
          //
          // We could also apply some warps to columns in parallel.
          // This requires inter-warp reduction of y0 and yi.
          for ( size_type col_offset = colBeg; col_offset < colEnd;
                col_offset += 1 ) {

            // Get column index
            const size_type lcol = col_offset-colBeg;
            const size_type col = sh_col[lcol];

            // Read a[i] and x[i] (coalesced)
            const MatrixScalar ai = m_A.values( stoch_row, col_offset );
            const VectorScalar xi = m_x( stoch_row, col );

            // Store a[0] and x[0] to shared memory for this column
            if (stoch_row == 0) {
              sh_a0[lcol] = ai;
              sh_x0[lcol] = xi;
            }

            // Retrieve a[0] and x[0] from shared memory for this column
            const MatrixScalar a0 = sh_a0[lcol];
            const VectorScalar x0 = sh_x0[lcol];

            // Subtract off contribution of first iteration of loop
            if (stoch_row == 0) y0 += (c0-3.0*c1-c2)*a0*x0;

            yi += c1*(a0*xi + ai*x0) + c2*ai*xi;
            y0 += c1*ai*xi;
          }

          // Write y back to global memory
          m_y( stoch_row, row ) = yi;

          // Sync warps so rows don't get too out-of-sync to make use of
          // locality in spatial matrix and L2 cache
          __syncthreads();
        }

        // Reduction of 'y0' within warp
        sh_y0[ threadIdx.x ] = y0;
        if ( threadIdx.x + 16 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+16];
        if ( threadIdx.x + 8 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+ 8];
        if ( threadIdx.x + 4 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+ 4];
        if ( threadIdx.x + 2 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+ 2];
        if ( threadIdx.x + 1 < blockDim.x )
          sh_y0[threadIdx.x] += sh_y0[threadIdx.x+ 1];

        // Store y0 back in global memory
        if ( threadIdx.x == 0 ) m_y( 0, row ) += sh_y0[0];

      }
    }
  };


  //------------------------------------

  static void apply( const matrix_type & A,
                     const vector_type & x,
                     const vector_type & y )
  {
    const size_type num_spatial_rows = A.graph.row_map.extent(0) - 1;
    const size_type num_stoch_rows = A.block.dimension();

    const size_type warp_size = Kokkos::Impl::CudaTraits::WarpSize;
    const size_type num_rows_per_block = 4;
    size_type num_blocks = num_spatial_rows / num_rows_per_block;
    if (num_blocks * num_rows_per_block < num_spatial_rows)
      ++num_blocks;
    const dim3 dBlock( warp_size, num_rows_per_block );
    const dim3 dGrid( num_blocks , 1, 1 );

    const int MAX_COL = 32; // Maximum number of nonzero columns per row
    const size_type shmem =
      sizeof(VectorScalar) * (dBlock.x*dBlock.y + 2*MAX_COL*dBlock.y) +
      sizeof(size_type) * (MAX_COL*dBlock.y);
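    // Shared-memory budget: per-warp y0 reduction scratch plus the a0/x0
    // and column-index caches used by the kernels above.  MAX_COL bounds
    // the number of nonzero column blocks a spatial row may have; a row
    // with more than MAX_COL entries would overflow these caches.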

#if 0

    std::cout << "Multiply< BlockCrsMatrix< LinearSparse3Tensor ... > >::apply"
              << std::endl
              << " grid(" << dGrid.x << "," << dGrid.y << ")" << std::endl
              << " block(" << dBlock.x << "," << dBlock.y << "," << dBlock.z << ")" << std::endl
              << " shmem(" << shmem/1024 << " kB)" << std::endl
              << " num_spatial_rows(" << num_spatial_rows << ")" << std::endl
              << " num_stoch_rows(" << num_stoch_rows << ")" << std::endl;
#endif

    //cudaProfilerStart();
    if (A.block.symmetric())
      Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid, dBlock, shmem >>>
        ( ApplyKernelSymmetric<MAX_COL>( A, x, y ) );
    else
      Kokkos::Impl::cuda_parallel_launch_local_memory<<< dGrid, dBlock, shmem >>>
        ( ApplyKernelAsymmetric<MAX_COL>( A, x, y ) );
    //cudaProfilerStop();
  }
};
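
// Example usage (a minimal sketch; 'A', 'x', and 'y' are assumed to have
// been constructed and filled elsewhere, and double / BlockSize = 2 are
// illustrative choices):
//
//   typedef Stokhos::LinearSparse3Tensor<double, Kokkos::Cuda, 2>      tensor_type;
//   typedef Stokhos::BlockCrsMatrix<tensor_type, double, Kokkos::Cuda> matrix_type;
//   typedef Kokkos::View<double**, Kokkos::LayoutLeft, Kokkos::Cuda>   vector_type;
//
//   Stokhos::Multiply<matrix_type, vector_type, vector_type>::apply( A, x, y );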

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

} // namespace Stokhos

#endif /* #ifndef STOKHOS_CUDA_LINEAR_SPARSE_3_TENSOR_HPP */