Stokhos Package Browser (Single Doxygen Collection) Version of the Day
Loading...
Searching...
No Matches
Stokhos_SimpleTiledCrsProductTensor.hpp
Go to the documentation of this file.
1// @HEADER
2// ***********************************************************************
3//
4// Stokhos Package
5// Copyright (2009) Sandia Corporation
6//
7// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
8// license for use of this work by or on behalf of the U.S. Government.
9//
10// Redistribution and use in source and binary forms, with or without
11// modification, are permitted provided that the following conditions are
12// met:
13//
14// 1. Redistributions of source code must retain the above copyright
15// notice, this list of conditions and the following disclaimer.
16//
17// 2. Redistributions in binary form must reproduce the above copyright
18// notice, this list of conditions and the following disclaimer in the
19// documentation and/or other materials provided with the distribution.
20//
21// 3. Neither the name of the Corporation nor the names of the
22// contributors may be used to endorse or promote products derived from
23// this software without specific prior written permission.
24//
25// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36//
37// Questions? Contact Eric T. Phipps (etphipp@sandia.gov).
38//
39// ***********************************************************************
40// @HEADER
41
42#ifndef STOKHOS_SIMPLE_TILED_CRS_PRODUCT_TENSOR_HPP
43#define STOKHOS_SIMPLE_TILED_CRS_PRODUCT_TENSOR_HPP
44
45#include "Kokkos_Core.hpp"
46
47#include "Stokhos_Multiply.hpp"
51#include "Teuchos_ParameterList.hpp"
52#include "Stokhos_TinyVec.hpp"
53
54
55//----------------------------------------------------------------------------
56//----------------------------------------------------------------------------
57
58namespace Stokhos {
59
60template< typename ValueType, class ExecutionSpace >
62public:
63
64 typedef ExecutionSpace execution_space;
65 typedef int size_type;
66 typedef ValueType value_type;
67
68// Vectorsize used in multiply algorithm
69#if defined(__AVX__)
70 static const size_type host_vectorsize = 32/sizeof(value_type);
71 static const bool use_intrinsics = true;
72#elif defined(__MIC__)
73 static const size_type host_vectorsize = 16;
74 static const bool use_intrinsics = true;
75#else
76 static const size_type host_vectorsize = 2;
77 static const bool use_intrinsics = false;
78#endif
79 static const size_type cuda_vectorsize = 32;
80 static const bool is_cuda =
81#if defined( KOKKOS_ENABLE_CUDA )
82 std::is_same<ExecutionSpace,Kokkos::Cuda>::value;
83#else
84 false ;
85#endif
87
88 // Alignment in terms of number of entries of CRS rows
90
91private:
92
93 typedef Kokkos::View< value_type[], execution_space > vec_type;
94 typedef Kokkos::View< value_type[], execution_space > value_array_type;
95 typedef Kokkos::View< size_type[], execution_space > coord_array_type;
96 typedef Kokkos::View< size_type[][2], Kokkos::LayoutLeft, execution_space > coord2_array_type;
97 typedef Kokkos::View< size_type*, execution_space > i_begin_type;
98 typedef Kokkos::View< size_type*, execution_space > i_size_type;
99 typedef Kokkos::View< size_type*, execution_space > num_j_type;
100 typedef Kokkos::View< size_type**, execution_space > j_begin_type;
101 typedef Kokkos::View< size_type**, execution_space > j_size_type;
102 typedef Kokkos::View< size_type**, execution_space > num_k_type;
103 typedef Kokkos::View< size_type***, execution_space > k_begin_type;
104 typedef Kokkos::View< size_type***, execution_space > k_size_type;
105
106 typedef Kokkos::View< size_type****, Kokkos::LayoutRight, execution_space > row_map_type;
107 typedef Kokkos::View< size_type****, Kokkos::LayoutRight, execution_space > num_entry_type;
108
128
129 struct Coord {
132 };
133
134 template <typename coord_t>
135 struct Tile {
137 Teuchos::Array<coord_t> parts;
138 };
139
143
144public:
145
146 inline
148
149 inline
151 m_value(),
152 m_coord(),
153 m_coord2(),
154 m_i_begin(),
155 m_i_size(),
156 m_num_j(),
157 m_j_begin(),
158 m_j_size(),
159 m_num_k(),
160 m_k_begin(),
161 m_k_size(),
162 m_row_map(),
163 m_num_entry(),
164 m_dimension(0),
167 m_num_i(0),
168 m_nnz(0),
169 m_flops(0) {}
170
171 inline
173 m_value(rhs.m_value),
174 m_coord(rhs.m_coord),
175 m_coord2(rhs.m_coord2),
176 m_i_begin(rhs.m_i_begin),
177 m_i_size(rhs.m_i_size),
178 m_num_j(rhs.m_num_j),
179 m_j_begin(rhs.m_j_begin),
180 m_j_size(rhs.m_j_size),
181 m_num_k(rhs.m_num_k),
182 m_k_begin(rhs.m_k_begin),
183 m_k_size(rhs.m_k_size),
184 m_row_map(rhs.m_row_map),
189 m_num_i(rhs.m_num_i),
190 m_nnz(rhs.m_nnz),
191 m_flops(rhs.m_flops) {}
192
193 inline
195 const SimpleTiledCrsProductTensor & rhs)
196 {
197 m_value = rhs.m_value;
198 m_coord = rhs.m_coord;
199 m_coord2 = rhs.m_coord2;
200 m_i_begin = rhs.m_i_begin;
201 m_i_size = rhs.m_i_size;
202 m_num_j = rhs.m_num_j;
203 m_j_begin = rhs.m_j_begin;
204 m_j_size = rhs.m_j_size;
205 m_num_k = rhs.m_num_k;
206 m_k_begin = rhs.m_k_begin;
207 m_k_size = rhs.m_k_size;
208 m_row_map = rhs.m_row_map;
213 m_num_i = rhs.m_num_i;
214 m_nnz = rhs.m_nnz;
215 m_flops = rhs.m_flops;
216 return *this;
217 }
218
220 KOKKOS_INLINE_FUNCTION
221 size_type dimension() const { return m_dimension; }
222
224 KOKKOS_INLINE_FUNCTION
225 size_type entry_count() const { return m_coord.extent(0); }
226
228 KOKKOS_INLINE_FUNCTION
229 size_type num_i_tiles() const { return m_num_i; }
230
232 KOKKOS_INLINE_FUNCTION
233 size_type i_begin(const size_type i) const { return m_i_begin(i); }
234
236 KOKKOS_INLINE_FUNCTION
237 size_type i_size(const size_type i) const { return m_i_size(i); }
238
240 KOKKOS_INLINE_FUNCTION
241 size_type num_j_tiles(const size_type i) const { return m_num_j(i); }
242
244 KOKKOS_INLINE_FUNCTION
245 size_type j_begin(const size_type i, const size_type j) const {
246 return m_j_begin(i,j);
247 }
248
250 KOKKOS_INLINE_FUNCTION
251 size_type j_size(const size_type i, const size_type j) const {
252 return m_j_size(i,j);
253 }
254
256 KOKKOS_INLINE_FUNCTION
257 size_type num_k_tiles(const size_type i, const size_type j) const {
258 return m_num_k(i,j); }
259
261 KOKKOS_INLINE_FUNCTION
263 const size_type k) const {
264 return m_k_begin(i,j,k);
265 }
266
268 KOKKOS_INLINE_FUNCTION
270 const size_type k) const {
271 return m_k_size(i,j,k);
272 }
273
275 KOKKOS_INLINE_FUNCTION
277 const size_type k, const size_type r) const {
278 return m_num_entry(i,j,k,r);
279 }
280
282 KOKKOS_INLINE_FUNCTION
284 const size_type k, const size_type r) const {
285 return m_row_map(i,j,k,r);
286 }
287
289 KOKKOS_INLINE_FUNCTION
291 const size_type k, const size_type r) const {
292 return m_row_map(i,j,k,r) + m_num_entry(i,j,k,r);
293 }
294
296 KOKKOS_INLINE_FUNCTION
297 const size_type& coord(const size_type entry, const size_type c) const {
298 return m_coord2(entry, c);
299 }
300
302 KOKKOS_INLINE_FUNCTION
303 const size_type& coord(const size_type entry) const {
304 return m_coord(entry);
305 }
306
308 KOKKOS_INLINE_FUNCTION
309 const value_type & value(const size_type entry) const {
310 return m_value(entry);
311 }
312
314 KOKKOS_INLINE_FUNCTION
316 { return m_nnz; }
317
319 KOKKOS_INLINE_FUNCTION
321 { return m_flops; }
322
324 KOKKOS_INLINE_FUNCTION
326
328 KOKKOS_INLINE_FUNCTION
330
331 template <typename OrdinalType>
335 const Teuchos::ParameterList& params)
336 {
337 using Teuchos::rcp;
338 using Teuchos::RCP;
339 using Teuchos::ParameterList;
340 using Teuchos::Array;
341
343 typedef typename Cijk_type::i_iterator i_iterator;
344 typedef typename Cijk_type::ik_iterator ik_iterator;
345 typedef typename Cijk_type::ikj_iterator ikj_iterator;
346
347 const size_type i_tile_size = params.get<OrdinalType>("Tile Size");
348
349 // Build 2-way symmetric Cijk tensor
350 Cijk_type Cijk_sym;
351 i_iterator i_begin = Cijk.i_begin();
352 i_iterator i_end = Cijk.i_end();
353 for (i_iterator i_it=i_begin; i_it!=i_end; ++i_it) {
354 OrdinalType i = index(i_it);
355 ik_iterator k_begin = Cijk.k_begin(i_it);
356 ik_iterator k_end = Cijk.k_end(i_it);
357 for (ik_iterator k_it = k_begin; k_it != k_end; ++k_it) {
358 OrdinalType k = index(k_it);
359 ikj_iterator j_begin = Cijk.j_begin(k_it);
360 ikj_iterator j_end = Cijk.j_end(k_it);
361 for (ikj_iterator j_it = j_begin; j_it != j_end; ++j_it) {
362 OrdinalType j = index(j_it);
363 if (k <= j) {
364 ValueType c = Stokhos::value(j_it);
365 Cijk_sym.add_term(i, j, k, c);
366 }
367 }
368 }
369 }
370 Cijk_sym.fillComplete();
371
372 // First partition based on i
373 size_type j_tile_size = i_tile_size / 2;
374 size_type basis_size = basis.size();
375 size_type num_i_parts = (basis_size + i_tile_size-1) / i_tile_size;
376 //size_type its = basis_size / num_i_parts;
377 size_type its = i_tile_size;
378 Array<ITile> i_tiles(num_i_parts);
379 for (size_type i=0; i<num_i_parts; ++i) {
380 i_tiles[i].lower = i*its;
381 i_tiles[i].upper = std::min(basis_size, (i+1)*its);
382 i_tiles[i].parts.resize(1);
383 i_tiles[i].parts[0].lower = basis_size;
384 i_tiles[i].parts[0].upper = 0;
385 }
386
387 // Next partition j
389 for (i_iterator i_it=Cijk_sym.i_begin(); i_it!=Cijk_sym.i_end(); ++i_it) {
390 OrdinalType i = index(i_it);
391
392 // Find which part i belongs to
393 size_type idx = 0;
394 while (idx < num_i_parts && i >= i_tiles[idx].lower) ++idx;
395 --idx;
396 TEUCHOS_ASSERT(idx >= 0 && idx < num_i_parts);
397
398 ik_iterator k_begin = Cijk_sym.k_begin(i_it);
399 ik_iterator k_end = Cijk_sym.k_end(i_it);
400 for (ik_iterator k_it = k_begin; k_it != k_end; ++k_it) {
401 OrdinalType j = index(k_it); // using symmetry to interchange j and k
402
403 if (j < i_tiles[idx].parts[0].lower)
404 i_tiles[idx].parts[0].lower = j;
405 if (j > i_tiles[idx].parts[0].upper)
406 i_tiles[idx].parts[0].upper = j;
407 }
408 }
409 for (size_type idx=0; idx<num_i_parts; ++idx) {
410 size_type lower = i_tiles[idx].parts[0].lower;
411 size_type upper = i_tiles[idx].parts[0].upper;
412 size_type range = upper - lower + 1;
413 size_type num_j_parts = (range + j_tile_size-1) / j_tile_size;
414 //size_type jts = range / num_j_parts;
415 size_type jts = j_tile_size;
416 max_jk_tile_size = std::max(max_jk_tile_size, jts);
417 Array<JTile> j_tiles(num_j_parts);
418 for (size_type j=0; j<num_j_parts; ++j) {
419 j_tiles[j].lower = lower + j*jts;
420 j_tiles[j].upper = std::min(upper+1, lower + (j+1)*jts);
421 j_tiles[j].parts.resize(1);
422 j_tiles[j].parts[0].lower = basis_size;
423 j_tiles[j].parts[0].upper = 0;
424 }
425 i_tiles[idx].parts.swap(j_tiles);
426 }
427
428 // Now partition k
429 for (i_iterator i_it=Cijk_sym.i_begin(); i_it!=Cijk_sym.i_end(); ++i_it) {
430 OrdinalType i = index(i_it);
431
432 // Find which part i belongs to
433 size_type idx = 0;
434 while (idx < num_i_parts && i >= i_tiles[idx].lower) ++idx;
435 --idx;
436 TEUCHOS_ASSERT(idx >= 0 && idx < num_i_parts);
437
438 ik_iterator k_begin = Cijk_sym.k_begin(i_it);
439 ik_iterator k_end = Cijk_sym.k_end(i_it);
440 for (ik_iterator k_it = k_begin; k_it != k_end; ++k_it) {
441 OrdinalType j = index(k_it); // using symmetry to interchange j and k
442
443 // Find which part j belongs to
444 size_type num_j_parts = i_tiles[idx].parts.size();
445 size_type jdx = 0;
446 while (jdx < num_j_parts && j >= i_tiles[idx].parts[jdx].lower) ++jdx;
447 --jdx;
448 TEUCHOS_ASSERT(jdx >= 0 && jdx < num_j_parts);
449
450 ikj_iterator j_begin = Cijk_sym.j_begin(k_it);
451 ikj_iterator j_end = Cijk_sym.j_end(k_it);
452 for (ikj_iterator j_it = j_begin; j_it != j_end; ++j_it) {
453 OrdinalType k = index(j_it); // using symmetry to interchange j and k
454 ValueType cijk = Stokhos::value(j_it);
455 if (k >= j) {
456 Coord coord;
457 coord.i = i; coord.j = j; coord.k = k; coord.cijk = cijk;
458 i_tiles[idx].parts[jdx].parts[0].parts.push_back(coord);
459 if (k < i_tiles[idx].parts[jdx].parts[0].lower)
460 i_tiles[idx].parts[jdx].parts[0].lower = k;
461 if (k > i_tiles[idx].parts[jdx].parts[0].upper)
462 i_tiles[idx].parts[jdx].parts[0].upper = k;
463 }
464 }
465 }
466 }
467
468 // Now need to divide up k-parts based on lower/upper bounds
469 size_type num_coord = 0;
470 for (size_type idx=0; idx<num_i_parts; ++idx) {
471 size_type num_j_parts = i_tiles[idx].parts.size();
472 for (size_type jdx=0; jdx<num_j_parts; ++jdx) {
473 size_type lower = i_tiles[idx].parts[jdx].parts[0].lower;
474 size_type upper = i_tiles[idx].parts[jdx].parts[0].upper;
475 size_type range = upper - lower + 1;
476 size_type num_k_parts = (range + j_tile_size-1) / j_tile_size;
477 //size_type kts = range / num_k_parts;
478 size_type kts = j_tile_size;
479 max_jk_tile_size = std::max(max_jk_tile_size, kts);
480 Array<KTile> k_tiles(num_k_parts);
481 for (size_type k=0; k<num_k_parts; ++k) {
482 k_tiles[k].lower = lower + k*kts;
483 k_tiles[k].upper = std::min(upper+1, lower + (k+1)*kts);
484 }
485 size_type num_k = i_tiles[idx].parts[jdx].parts[0].parts.size();
486 for (size_type l=0; l<num_k; ++l) {
487 size_type i = i_tiles[idx].parts[jdx].parts[0].parts[l].i;
488 size_type j = i_tiles[idx].parts[jdx].parts[0].parts[l].j;
489 size_type k = i_tiles[idx].parts[jdx].parts[0].parts[l].k;
490 value_type cijk = i_tiles[idx].parts[jdx].parts[0].parts[l].cijk;
491
492 // Find which part k belongs to
493 size_type kdx = 0;
494 while (kdx < num_k_parts && k >= k_tiles[kdx].lower) ++kdx;
495 --kdx;
496 TEUCHOS_ASSERT(kdx >= 0 && kdx < num_k_parts);
497
498 Coord coord;
499 coord.i = i; coord.j = j; coord.k = k; coord.cijk = cijk;
500 k_tiles[kdx].parts.push_back(coord);
501 ++num_coord;
502 if (j != k) ++num_coord;
503 }
504
505 // Eliminate parts with zero size
506 Array<KTile> k_tiles2;
507 for (size_type k=0; k<num_k_parts; ++k) {
508 if (k_tiles[k].parts.size() > 0)
509 k_tiles2.push_back(k_tiles[k]);
510 }
511 i_tiles[idx].parts[jdx].parts.swap(k_tiles2);
512 }
513 }
514 TEUCHOS_ASSERT(num_coord == Cijk.num_entries());
515
516 // Compute number of non-zeros for each row in each part
517 size_type total_num_rows = 0, max_num_rows = 0, entry_count = 0;
518 size_type max_num_j_parts = 0, max_num_k_parts = 0;
519 Array< Array< Array< Array<size_type> > > > coord_work(num_i_parts);
520 for (size_type idx=0; idx<num_i_parts; ++idx) {
521 size_type num_j_parts = i_tiles[idx].parts.size();
522 max_num_j_parts = std::max(max_num_j_parts, num_j_parts);
523 coord_work[idx].resize(num_j_parts);
524 for (size_type jdx=0; jdx<num_j_parts; ++jdx) {
525 size_type num_k_parts = i_tiles[idx].parts[jdx].parts.size();
526 max_num_k_parts = std::max(max_num_k_parts, num_k_parts);
527 coord_work[idx][jdx].resize(num_k_parts);
528 for (size_type kdx=0; kdx<num_k_parts; ++kdx) {
529 size_type num_rows = i_tiles[idx].upper - i_tiles[idx].lower + 1;
530 total_num_rows += num_rows;
531 max_num_rows = std::max(max_num_rows, num_rows);
532 coord_work[idx][jdx][kdx].resize(num_rows, 0);
533
534 size_type nc = i_tiles[idx].parts[jdx].parts[kdx].parts.size();
535 for (size_type c=0; c<nc; ++c) {
536 size_type i = i_tiles[idx].parts[jdx].parts[kdx].parts[c].i;
537 size_type i_begin = i_tiles[idx].lower;
538 ++(coord_work[idx][jdx][kdx][i-i_begin]);
539 ++entry_count;
540 }
541 }
542 }
543 }
544
545 // Pad each row to have size divisible by alignment size
546 for (size_type idx=0; idx<num_i_parts; ++idx) {
547 size_type num_j_parts = i_tiles[idx].parts.size();
548 for (size_type jdx=0; jdx<num_j_parts; ++jdx) {
549 size_type num_k_parts = i_tiles[idx].parts[jdx].parts.size();
550 for (size_type kdx=0; kdx<num_k_parts; ++kdx) {
551 size_type sz = coord_work[idx][jdx][kdx].size();
552 for (size_type i = 0; i < sz; ++i) {
553 const size_t rem = coord_work[idx][jdx][kdx][i] % tensor_align;
554 if (rem > 0) {
555 const size_t pad = tensor_align - rem;
556 coord_work[idx][jdx][kdx][i] += pad;
557 entry_count += pad;
558 }
559 }
560 }
561 }
562 }
563
564 // Allocate tensor data
566 tensor.m_value = value_array_type("value", entry_count);
567 tensor.m_coord = coord_array_type("coord", entry_count);
568 tensor.m_coord2 = coord2_array_type("coord2", entry_count);
569 tensor.m_i_begin = i_begin_type("i_begin", num_i_parts);
570 tensor.m_i_size = i_size_type("i_size", num_i_parts);
571 tensor.m_num_j = num_j_type("num_j", num_i_parts);
572 tensor.m_j_begin = j_begin_type("j_begin", num_i_parts, max_num_j_parts);
573 tensor.m_j_size = j_size_type("j_size", num_i_parts, max_num_j_parts);
574 tensor.m_num_k = num_k_type("num_k", num_i_parts, max_num_j_parts);
575 tensor.m_k_begin = k_begin_type("k_begin", num_i_parts, max_num_j_parts,
576 max_num_k_parts);
577 tensor.m_k_size = k_size_type("k_size", num_i_parts, max_num_j_parts,
578 max_num_k_parts);
579 tensor.m_row_map = row_map_type("row_map", num_i_parts,
580 max_num_j_parts, max_num_k_parts,
581 max_num_rows+1);
582 tensor.m_num_entry = num_entry_type("num_entry", num_i_parts,
583 max_num_j_parts, max_num_k_parts,
584 max_num_rows);
585 tensor.m_dimension = basis.size();
586 tensor.m_max_i_tile_size = i_tile_size;
588 tensor.m_num_i = num_i_parts;
589
590 // Create mirror, is a view if is host memory
591 typename value_array_type::HostMirror host_value =
592 Kokkos::create_mirror_view(tensor.m_value);
593 typename coord_array_type::HostMirror host_coord =
594 Kokkos::create_mirror_view(tensor.m_coord);
595 typename coord2_array_type::HostMirror host_coord2 =
596 Kokkos::create_mirror_view(tensor.m_coord2);
597 typename i_begin_type::HostMirror host_i_begin =
598 Kokkos::create_mirror_view(tensor.m_i_begin);
599 typename i_size_type::HostMirror host_i_size =
600 Kokkos::create_mirror_view(tensor.m_i_size);
601 typename num_j_type::HostMirror host_num_j =
602 Kokkos::create_mirror_view(tensor.m_num_j);
603 typename j_begin_type::HostMirror host_j_begin =
604 Kokkos::create_mirror_view(tensor.m_j_begin);
605 typename j_size_type::HostMirror host_j_size =
606 Kokkos::create_mirror_view(tensor.m_j_size);
607 typename num_k_type::HostMirror host_num_k =
608 Kokkos::create_mirror_view(tensor.m_num_k);
609 typename k_begin_type::HostMirror host_k_begin =
610 Kokkos::create_mirror_view(tensor.m_k_begin);
611 typename k_size_type::HostMirror host_k_size =
612 Kokkos::create_mirror_view(tensor.m_k_size);
613 typename row_map_type::HostMirror host_row_map =
614 Kokkos::create_mirror_view(tensor.m_row_map);
615 typename num_entry_type::HostMirror host_num_entry =
616 Kokkos::create_mirror_view(tensor.m_num_entry);
617
618 // Compute row map
619 size_type sum = 0;
620 for (size_type idx=0; idx<num_i_parts; ++idx) {
621 size_type num_j_parts = i_tiles[idx].parts.size();
622 for (size_type jdx=0; jdx<num_j_parts; ++jdx) {
623 size_type num_k_parts = i_tiles[idx].parts[jdx].parts.size();
624 for (size_type kdx=0; kdx<num_k_parts; ++kdx) {
625 size_type nc = coord_work[idx][jdx][kdx].size();
626 host_row_map(idx,jdx,kdx,0) = sum;
627 for (size_type t=0; t<nc; ++t) {
628 sum += coord_work[idx][jdx][kdx][t];
629 host_row_map(idx,jdx,kdx,t+1) = sum;
630 host_num_entry(idx,jdx,kdx,t) = 0;
631 }
632 }
633 }
634 }
635
636 // Copy per part row offsets back into coord_work
637 for (size_type idx=0; idx<num_i_parts; ++idx) {
638 size_type num_j_parts = i_tiles[idx].parts.size();
639 for (size_type jdx=0; jdx<num_j_parts; ++jdx) {
640 size_type num_k_parts = i_tiles[idx].parts[jdx].parts.size();
641 for (size_type kdx=0; kdx<num_k_parts; ++kdx) {
642 size_type nc = coord_work[idx][jdx][kdx].size();
643 for (size_type t=0; t<nc; ++t) {
644 coord_work[idx][jdx][kdx][t] = host_row_map(idx,jdx,kdx,t);
645 }
646 }
647 }
648 }
649
650 // Fill in coordinate and value arrays
651 for (size_type idx=0; idx<num_i_parts; ++idx) {
652 host_i_begin(idx) = i_tiles[idx].lower;
653 host_i_size(idx) = i_tiles[idx].upper - i_tiles[idx].lower;
654 TEUCHOS_ASSERT(host_i_size(idx) <= i_tile_size);
655 size_type num_j_parts = i_tiles[idx].parts.size();
656 host_num_j(idx) = num_j_parts;
657 for (size_type jdx=0; jdx<num_j_parts; ++jdx) {
658 host_j_begin(idx,jdx) = i_tiles[idx].parts[jdx].lower;
659 host_j_size(idx,jdx) = i_tiles[idx].parts[jdx].upper -
660 i_tiles[idx].parts[jdx].lower;
661 TEUCHOS_ASSERT(host_j_size(idx,jdx) <= max_jk_tile_size);
662 size_type num_k_parts = i_tiles[idx].parts[jdx].parts.size();
663 host_num_k(idx,jdx) = num_k_parts;
664 for (size_type kdx=0; kdx<num_k_parts; ++kdx) {
665 host_k_begin(idx,jdx,kdx) = i_tiles[idx].parts[jdx].parts[kdx].lower;
666 host_k_size(idx,jdx,kdx) = i_tiles[idx].parts[jdx].parts[kdx].upper -
667 i_tiles[idx].parts[jdx].parts[kdx].lower;
668 TEUCHOS_ASSERT(host_k_size(idx,jdx,kdx) <= max_jk_tile_size);
669
670 size_type nc = i_tiles[idx].parts[jdx].parts[kdx].parts.size();
671 for (size_type t=0; t<nc; ++t) {
672 Coord s = i_tiles[idx].parts[jdx].parts[kdx].parts[t];
673 const size_type i = s.i;
674 const size_type j = s.j;
675 const size_type k = s.k;
676 const value_type c = s.cijk;
677
678 const size_type row = i - host_i_begin(idx);
679 const size_type n = coord_work[idx][jdx][kdx][row];
680 ++coord_work[idx][jdx][kdx][row];
681
682 host_value(n) = (j != k) ? c : 0.5*c;
683 host_coord2(n,0) = j - host_j_begin(idx,jdx);
684 host_coord2(n,1) = k - host_k_begin(idx,jdx,kdx);
685 host_coord(n) = (host_coord2(n,1) << 16) | host_coord2(n,0);
686
687 ++host_num_entry(idx,jdx,kdx,row);
688 ++tensor.m_nnz;
689 }
690 }
691 }
692 }
693
694 // Copy data to device if necessary
695 Kokkos::deep_copy(tensor.m_value, host_value);
696 Kokkos::deep_copy(tensor.m_coord, host_coord);
697 Kokkos::deep_copy(tensor.m_coord2, host_coord2);
698 Kokkos::deep_copy(tensor.m_i_begin, host_i_begin);
699 Kokkos::deep_copy(tensor.m_i_size, host_i_size);
700 Kokkos::deep_copy(tensor.m_num_j, host_num_j);
701 Kokkos::deep_copy(tensor.m_j_begin, host_j_begin);
702 Kokkos::deep_copy(tensor.m_j_size, host_j_size);
703 Kokkos::deep_copy(tensor.m_num_k, host_num_k);
704 Kokkos::deep_copy(tensor.m_k_begin, host_k_begin);
705 Kokkos::deep_copy(tensor.m_k_size, host_k_size);
706 Kokkos::deep_copy(tensor.m_row_map, host_row_map);
707 Kokkos::deep_copy(tensor.m_num_entry, host_num_entry);
708
709 tensor.m_flops = 0;
710 for (size_type idx=0; idx<num_i_parts; ++idx) {
711 size_type num_j_parts = i_tiles[idx].parts.size();
712 for (size_type jdx=0; jdx<num_j_parts; ++jdx) {
713 size_type num_k_parts = i_tiles[idx].parts[jdx].parts.size();
714 for (size_type kdx=0; kdx<num_k_parts; ++kdx) {
715 for (size_type i = 0; i < host_i_size(idx); ++i) {
716 tensor.m_flops += 5*host_num_entry(idx,jdx,kdx,i) + 1;
717 }
718 }
719 }
720 }
721
722 return tensor;
723 }
724};
725
726template< class Device, typename OrdinalType, typename ValueType >
727SimpleTiledCrsProductTensor<ValueType, Device>
731 const Teuchos::ParameterList& params)
732{
734 basis, Cijk, params);
735}
736
737template < typename ValueType, typename Device >
738class BlockMultiply< SimpleTiledCrsProductTensor< ValueType , Device > >
739{
740public:
741
742 typedef typename Device::size_type size_type ;
744
745 template< typename MatrixValue , typename VectorValue >
746 KOKKOS_INLINE_FUNCTION
747 static void apply( const tensor_type & tensor ,
748 const MatrixValue * const a ,
749 const VectorValue * const x ,
750 VectorValue * const y )
751 {
752 const size_type block_size = 2;
754
755 const size_type n_i_tile = tensor.num_i_tiles();
756 for (size_type i_tile = 0; i_tile<n_i_tile; ++i_tile) {
757 const size_type i_begin = tensor.i_begin(i_tile);
758 const size_type i_size = tensor.i_size(i_tile);
759
760 const size_type n_j_tile = tensor.num_j_tiles(i_tile);
761 for (size_type j_tile = 0; j_tile<n_j_tile; ++j_tile) {
762 const size_type j_begin = tensor.j_begin(i_tile, j_tile);
763 //const size_type j_size = tensor.j_size(i_tile, j_tile);
764
765 const size_type n_k_tile = tensor.num_k_tiles(i_tile, j_tile);
766 for (size_type k_tile = 0; k_tile<n_k_tile; ++k_tile) {
767 const size_type k_begin = tensor.k_begin(i_tile, j_tile, k_tile);
768 //const size_type k_size = tensor.k_size(i_tile, j_tile, k_tile);
769
770 for (size_type i=0; i<i_size; ++i) {
771
772 const size_type nEntry =
773 tensor.num_entry(i_tile,j_tile,k_tile,i);
774 const size_type iEntryBeg =
775 tensor.entry_begin(i_tile,j_tile,k_tile,i);
776 const size_type iEntryEnd = iEntryBeg + nEntry;
777 size_type iEntry = iEntryBeg;
778
779 VectorValue ytmp = 0 ;
780
781 // Do entries with a blocked loop of size block_size
782 if (block_size > 1) {
783 const size_type nBlock = nEntry / block_size;
784 const size_type nEntryB = nBlock * block_size;
785 const size_type iEnd = iEntryBeg + nEntryB;
786
787 TV vy;
788 vy.zero();
789 int j[block_size], k[block_size];
790
791 for ( ; iEntry < iEnd ; iEntry += block_size ) {
792
793 for (size_type ii=0; ii<block_size; ++ii) {
794 j[ii] = tensor.coord(iEntry+ii,0) + j_begin;
795 k[ii] = tensor.coord(iEntry+ii,1) + k_begin;
796 }
797 TV aj(a, j), ak(a, k), xj(x, j), xk(x, k),
798 c(&(tensor.value(iEntry)));
799
800 // vy += c * ( aj * xk + ak * xj)
801 aj.times_equal(xk);
802 ak.times_equal(xj);
803 aj.plus_equal(ak);
804 c.times_equal(aj);
805 vy.plus_equal(c);
806
807 }
808
809 ytmp += vy.sum();
810 }
811
812 // Do remaining entries with a scalar loop
813 for ( ; iEntry<iEntryEnd; ++iEntry) {
814 const size_type j = tensor.coord(iEntry,0) + j_begin;
815 const size_type k = tensor.coord(iEntry,1) + k_begin;
816
817 ytmp += tensor.value(iEntry) * ( a[j] * x[k] + a[k] * x[j] );
818 }
819
820 y[i+i_begin] += ytmp;
821
822 }
823 }
824 }
825 }
826 }
827
828 KOKKOS_INLINE_FUNCTION
829 static size_type matrix_size( const tensor_type & tensor )
830 { return tensor.dimension(); }
831
832 KOKKOS_INLINE_FUNCTION
833 static size_type vector_size( const tensor_type & tensor )
834 { return tensor.dimension(); }
835};
836
837} /* namespace Stokhos */
838
839//----------------------------------------------------------------------------
840//----------------------------------------------------------------------------
841
842#endif /* #ifndef STOKHOS_SIMPLE_TILED_CRS_PRODUCT_TENSOR_HPP */
static KOKKOS_INLINE_FUNCTION size_type vector_size(const tensor_type &tensor)
static KOKKOS_INLINE_FUNCTION size_type matrix_size(const tensor_type &tensor)
static KOKKOS_INLINE_FUNCTION void apply(const tensor_type &tensor, const MatrixValue *const a, const VectorValue *const x, VectorValue *const y)
virtual ordinal_type size() const =0
Return total size of basis.
Abstract base class for multivariate orthogonal polynomials generated from tensor products of univari...
KOKKOS_INLINE_FUNCTION size_type num_non_zeros() const
Number of non-zero's.
KOKKOS_INLINE_FUNCTION size_type num_k_tiles(const size_type i, const size_type j) const
Number k-tiles.
Kokkos::View< size_type ***, execution_space > k_size_type
KOKKOS_INLINE_FUNCTION size_type max_jk_tile_size() const
Max size of any j/k tile.
KOKKOS_INLINE_FUNCTION size_type j_begin(const size_type i, const size_type j) const
Begin entries with for tile 'i,j'.
Kokkos::View< size_type ****, Kokkos::LayoutRight, execution_space > num_entry_type
KOKKOS_INLINE_FUNCTION size_type k_begin(const size_type i, const size_type j, const size_type k) const
Begin entries with for tile 'i,j,k'.
KOKKOS_INLINE_FUNCTION size_type num_j_tiles(const size_type i) const
Number j-tiles.
KOKKOS_INLINE_FUNCTION size_type entry_end(const size_type i, const size_type j, const size_type k, const size_type r) const
End entries for tile (i,j,k) and row r.
KOKKOS_INLINE_FUNCTION size_type i_begin(const size_type i) const
Begin entries with for tile 'i'.
KOKKOS_INLINE_FUNCTION size_type num_flops() const
Number flop's per multiply-add.
SimpleTiledCrsProductTensor & operator=(const SimpleTiledCrsProductTensor &rhs)
Kokkos::View< size_type **, execution_space > j_size_type
KOKKOS_INLINE_FUNCTION size_type max_i_tile_size() const
Max size of any i tile.
Kokkos::View< size_type ****, Kokkos::LayoutRight, execution_space > row_map_type
KOKKOS_INLINE_FUNCTION size_type dimension() const
Dimension of the tensor.
KOKKOS_INLINE_FUNCTION const size_type & coord(const size_type entry) const
Coordinates of an entry.
Kokkos::View< value_type[], execution_space > value_array_type
KOKKOS_INLINE_FUNCTION size_type k_size(const size_type i, const size_type j, const size_type k) const
Number of entries with for tile 'i,j'.
Kokkos::View< size_type *, execution_space > i_begin_type
KOKKOS_INLINE_FUNCTION size_type entry_begin(const size_type i, const size_type j, const size_type k, const size_type r) const
Begin entries for tile (i,j,k) and row r.
static SimpleTiledCrsProductTensor create(const Stokhos::ProductBasis< OrdinalType, ValueType > &basis, const Stokhos::Sparse3Tensor< OrdinalType, ValueType > &Cijk, const Teuchos::ParameterList &params)
KOKKOS_INLINE_FUNCTION size_type entry_count() const
Number of sparse entries.
KOKKOS_INLINE_FUNCTION size_type num_entry(const size_type i, const size_type j, const size_type k, const size_type r) const
Number of entries for tile (i,j,k) and row r.
Kokkos::View< size_type **, execution_space > num_k_type
Kokkos::View< size_type ***, execution_space > k_begin_type
KOKKOS_INLINE_FUNCTION size_type num_i_tiles() const
Number i-tiles.
Kokkos::View< size_type[], execution_space > coord_array_type
Kokkos::View< size_type *, execution_space > num_j_type
Kokkos::View< size_type[][2], Kokkos::LayoutLeft, execution_space > coord2_array_type
KOKKOS_INLINE_FUNCTION const size_type & coord(const size_type entry, const size_type c) const
Coordinates of an entry.
KOKKOS_INLINE_FUNCTION size_type i_size(const size_type i) const
Number of entries with for tile 'i'.
Kokkos::View< value_type[], execution_space > vec_type
Kokkos::View< size_type **, execution_space > j_begin_type
Kokkos::View< size_type *, execution_space > i_size_type
SimpleTiledCrsProductTensor(const SimpleTiledCrsProductTensor &rhs)
KOKKOS_INLINE_FUNCTION const value_type & value(const size_type entry) const
Value of an entry.
KOKKOS_INLINE_FUNCTION size_type j_size(const size_type i, const size_type j) const
Number of entries with for tile 'i,j'.
Data structure storing a sparse 3-tensor C(i,j,k) in a a compressed format.
kj_iterator j_end(const k_iterator &k) const
Iterator pointing to last j entry for given k.
k_iterator k_begin() const
Iterator pointing to first k entry.
kji_iterator i_begin(const kj_iterator &j) const
Iterator pointing to first i entry for given j and k.
kj_iterator j_begin(const k_iterator &k) const
Iterator pointing to first j entry for given k.
kji_iterator i_end(const kj_iterator &j) const
Iterator pointing to last i entry for given j and k.
ordinal_type num_entries() const
Return number of non-zero entries.
k_iterator k_end() const
Iterator pointing to last k entry.
Top-level namespace for Stokhos classes and functions.
SimpleTiledCrsProductTensor< ValueType, Device > create_simple_tiled_product_tensor(const Stokhos::ProductBasis< OrdinalType, ValueType > &basis, const Stokhos::Sparse3Tensor< OrdinalType, ValueType > &Cijk, const Teuchos::ParameterList &params)