Stokhos Package Browser (Single Doxygen Collection) Version of the Day
Loading...
Searching...
No Matches
MPAssembly/TestAssembly.hpp
Go to the documentation of this file.
1// @HEADER
2// ***********************************************************************
3//
4// Stokhos Package
5// Copyright (2009) Sandia Corporation
6//
7// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
8// license for use of this work by or on behalf of the U.S. Government.
9//
10// Redistribution and use in source and binary forms, with or without
11// modification, are permitted provided that the following conditions are
12// met:
13//
14// 1. Redistributions of source code must retain the above copyright
15// notice, this list of conditions and the following disclaimer.
16//
17// 2. Redistributions in binary form must reproduce the above copyright
18// notice, this list of conditions and the following disclaimer in the
19// documentation and/or other materials provided with the distribution.
20//
21// 3. Neither the name of the Corporation nor the names of the
22// contributors may be used to endorse or promote products derived from
23// this software without specific prior written permission.
24//
25// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36//
37// Questions? Contact Eric T. Phipps (etphipp@sandia.gov).
38//
39// ***********************************************************************
40// @HEADER
41#include <iostream>
42
43// MP::Vector
46
47// Compile-time loops
48#include "Sacado_mpl_range_c.hpp"
49#include "Sacado_mpl_for_each.hpp"
50#include "Sacado_mpl_integral_c.hpp"
51
52// Kokkos libraries' headers:
53#include <Kokkos_UnorderedMap.hpp>
54#include <Kokkos_StaticCrsGraph.hpp>
55#include <Kokkos_Timer.hpp>
56
57// Utilities
58#include <Teuchos_CommHelpers.hpp>
59#include "Teuchos_TestingHelpers.hpp"
60#include "Teuchos_VerboseObject.hpp"
61
62// FENL
63#include <BoxElemFixture.hpp>
64#include <VectorImport.hpp>
65#include <fenl_functors.hpp>
66
67inline
68double maximum( const Teuchos::Comm<int>& comm , double local )
69{
70 double global = 0 ;
71 Teuchos::reduceAll( comm , Teuchos::REDUCE_MAX , 1 , & local , & global );
72 return global ;
73}
74
75struct Perf {
76 size_t global_elem_count ;
77 size_t global_node_count ;
78 double import_time ;
79 double fill_time ;
80
83 import_time(0) ,
84 fill_time(0) {}
85
86 void increment(const Perf& p) {
91 }
92
93 void min(const Perf& p) {
96 import_time = std::min( import_time , p.import_time );
97 fill_time = std::min( fill_time , p.fill_time );
98 }
99
100 void scale(double s) {
101 import_time *= s;
102 fill_time *= s;
103 }
104
105 void reduceMax(const Teuchos::Comm<int>& comm) {
106 import_time = maximum( comm , import_time );
107 fill_time = maximum( comm , fill_time );
108 }
109};
110
111template <typename Scalar, typename Device>
113 const Teuchos::RCP<const Teuchos::Comm<int> >& comm ,
114 const int use_print ,
115 const int use_trials ,
116 const int use_nodes[] ,
118 Kokkos::View< Scalar* , Kokkos::LayoutLeft, Device >& nodal_residual)
119{
120 using Teuchos::RCP;
121 using Teuchos::rcp;
122 using Teuchos::rcpFromRef;
123 using Teuchos::arrayView;
124 using Teuchos::ParameterList;
125
127
129
130 typedef typename LocalMatrixType::StaticCrsGraphType
131 LocalGraphType ;
132
134 NodeNodeGraphType ;
135
136 //typedef Kokkos::Example::FENL::ElementComputationConstantCoefficient CoeffFunctionType;
139 ElementComputationType ;
140
142 DirichletComputationType ;
143
144 typedef typename ElementComputationType::vector_type VectorType ;
145
147 typename FixtureType::comm_list_type ,
148 typename FixtureType::send_nodeid_type ,
149 VectorType > ImportType ;
150
151 //------------------------------------
152
153 const int print_flag = use_print && std::is_same< Kokkos::HostSpace , typename Device::memory_space >::value ;
154
155 const int comm_rank = comm->getRank();
156 const int comm_size = comm->getSize();
157
158 // Decompose by node to avoid parallel communication in assembly
159
160 const double bubble_x = 1.0 ;
161 const double bubble_y = 1.0 ;
162 const double bubble_z = 1.0 ;
163
164 const FixtureType fixture( Kokkos::Example::BoxElemPart::DecomposeNode ,
165 comm_size , comm_rank ,
166 use_nodes[0] , use_nodes[1] , use_nodes[2] ,
167 bubble_x , bubble_y , bubble_z );
168
169 if ( maximum(*comm, ( fixture.ok() ? 0 : 1 ) ) ) {
170 throw std::runtime_error(std::string("Problem fixture setup failed"));
171 }
172
173 //------------------------------------
174
175 const ImportType comm_nodal_import(
176 comm ,
177 fixture.recv_node() ,
178 fixture.send_node() ,
179 fixture.send_nodeid() ,
180 fixture.node_count_owned() ,
181 fixture.node_count() - fixture.node_count_owned() );
182
183 //------------------------------------
184
185 const double bc_lower_value = 1 ;
186 const double bc_upper_value = 2 ;
187 //CoeffFunctionType diffusion_coefficient( 1.0 );
188 CoeffFunctionType diffusion_coefficient( 1.0, 0.1, 1.0, 5 );
189 Kokkos::deep_copy( diffusion_coefficient.getRandomVariables(), 1.0 );
190
191 //------------------------------------
192
193 if ( print_flag ) {
194 std::cout << "ElemNode {" << std::endl ;
195 for ( unsigned ielem = 0 ; ielem < fixture.elem_count() ; ++ielem ) {
196 std::cout << " elem[" << ielem << "]{" ;
197 for ( unsigned inode = 0 ; inode < FixtureType::ElemNode ; ++inode ) {
198 std::cout << " " << fixture.elem_node(ielem,inode);
199 }
200 std::cout << " }" << std::endl ;
201 }
202 std::cout << "}" << std::endl ;
203 }
204
205 //------------------------------------
206
207 Kokkos::Timer wall_clock ;
208
209 Perf perf_stats = Perf() ;
210
211 for ( int itrial = 0 ; itrial < use_trials ; ++itrial ) {
212
213 Perf perf = Perf() ;
214
215 perf.global_elem_count = fixture.elem_count_global();
216 perf.global_node_count = fixture.node_count_global();
217
218 //----------------------------------
219 // Create the local sparse matrix graph and element-to-graph map
220 // from the element->to->node identifier array.
221 // The graph only has rows for the owned nodes.
222
223 typename NodeNodeGraphType::Times graph_times;
224 const NodeNodeGraphType
225 mesh_to_graph( fixture.elem_node() , fixture.node_count_owned(),
226 graph_times );
227
228 // Create the local sparse matrix from the graph:
229 LocalMatrixType jacobian( mesh_to_graph.graph );
230
231 //----------------------------------
232
233 // Allocate solution vector for each node in the mesh and residual vector for each owned node
234 VectorType nodal_solution( "nodal_solution" , fixture.node_count() );
235 nodal_residual = VectorType( "nodal_residual" , fixture.node_count_owned() );
236
237 // Get DeviceConfig structs used by some functors
238 Kokkos::Example::FENL::DeviceConfig dev_config_elem, dev_config_bc;
240 dev_config_bc );
241
242 // Create element computation functor
243 const ElementComputationType elemcomp( fixture , diffusion_coefficient ,
244 nodal_solution ,
245 mesh_to_graph.elem_graph ,
246 jacobian , nodal_residual ,
247 dev_config_elem );
248
249 // Create boundary condition functor
250 const DirichletComputationType dirichlet(
251 fixture , nodal_solution , jacobian , nodal_residual ,
252 2 /* apply at 'z' ends */ ,
253 bc_lower_value ,
254 bc_upper_value ,
255 dev_config_bc );
256
257 Kokkos::deep_copy( nodal_solution , Scalar(1) );
258
259 //--------------------------------
260
261 Device().fence();
262 wall_clock.reset();
263
264 comm_nodal_import( nodal_solution );
265
266 Device().fence();
267 perf.import_time = wall_clock.seconds();
268
269 //--------------------------------
270 // Element contributions to residual and jacobian
271
272 Device().fence();
273 wall_clock.reset();
274
275 Kokkos::deep_copy( nodal_residual , Scalar(0) );
276 Kokkos::deep_copy( jacobian.values , Scalar(0) );
277
278 elemcomp.apply();
279
280 //--------------------------------
281 // Apply boundary conditions
282
283 dirichlet.apply();
284
285 Device().fence();
286 perf.fill_time = wall_clock.seconds();
287
288 //--------------------------------
289
290 perf.reduceMax(*comm);
291 if (itrial == 0)
292 perf_stats = perf;
293 else
294 perf_stats.min(perf);
295
296 }
297
298 return perf_stats ;
299}
300
301template <typename ScalarViewType, typename EnsembleViewType>
302bool check_residuals(const ScalarViewType& scalar_residual,
303 const EnsembleViewType& ensemble_residual)
304{
305 const double tol = 1e-14;
306 bool success = true;
307 Teuchos::RCP<Teuchos::FancyOStream> out =
308 Teuchos::VerboseObjectBase::getDefaultOStream();
309 std::stringstream buf;
310 Teuchos::FancyOStream fbuf(Teuchos::rcp(&buf,false));
311
312 typename ScalarViewType::HostMirror host_scalar_residual =
313 Kokkos::create_mirror_view(scalar_residual);
314 typename EnsembleViewType::HostMirror host_ensemble_residual =
315 Kokkos::create_mirror_view(ensemble_residual);
316 Kokkos::deep_copy( host_scalar_residual, scalar_residual );
317 Kokkos::deep_copy( host_ensemble_residual, ensemble_residual );
318
319 TEUCHOS_TEST_EQUALITY( host_scalar_residual.extent(0),
320 host_ensemble_residual.extent(0), fbuf, success );
321
322 const size_t num_node = host_scalar_residual.extent(0);
323 const size_t num_ensemble = Kokkos::dimension_scalar(host_ensemble_residual);
324 for (size_t i=0; i<num_node; ++i) {
325 for (size_t j=0; j<num_ensemble; ++j) {
326 TEUCHOS_TEST_FLOATING_EQUALITY(
327 host_scalar_residual(i), host_ensemble_residual(i).fastAccessCoeff(j),
328 tol, fbuf, success );
329 }
330 }
331
332 if (!success)
333 *out << buf.str();
334
335 return success;
336}
337
338template <class Storage>
339struct PerformanceDriverOp {
340 typedef typename Storage::value_type Scalar;
341 typedef typename Storage::ordinal_type Ordinal;
342 typedef typename Storage::execution_space Device;
343 Teuchos::RCP<const Teuchos::Comm<int> > comm ;
344 const int use_print ;
345 const int use_trials ;
346 const int *use_nodes ;
347 const bool check ;
349
350 PerformanceDriverOp(const Teuchos::RCP<const Teuchos::Comm<int> >& comm_ ,
351 const int use_print_ ,
352 const int use_trials_ ,
353 const int use_nodes_[] ,
354 const bool check_ ,
356 comm(comm_),
357 use_print(use_print_),
358 use_trials(use_trials_),
359 use_nodes(use_nodes_),
360 check(check_),
361 dev_config(dev_config_) {}
362
363 template <typename ArgT>
364 void operator() (ArgT arg) const {
365 const int ensemble = ArgT::value;
366 typedef typename Storage::template apply_N<ensemble> NewStorageApply;
367 typedef typename NewStorageApply::type storage_type;
368 typedef Sacado::MP::Vector<storage_type> mp_vector_type;
369
370 typedef Kokkos::View< Scalar* , Kokkos::LayoutLeft, Device > scalar_vector_type ;
371 typedef Kokkos::View< mp_vector_type* , Kokkos::LayoutLeft, Device > ensemble_vector_type ;
372
373 scalar_vector_type scalar_residual;
374 Kokkos::Example::FENL::DeviceConfig scalar_dev_config(0, 1, 1);
375 Perf perf_scalar =
376 fenl_assembly<Scalar,Device>(
378 scalar_dev_config, scalar_residual );
379
380 ensemble_vector_type ensemble_residual;
382#if defined( KOKKOS_ENABLE_CUDA )
383 const bool is_cuda = std::is_same<Device,Kokkos::Cuda>::value;
384#else
385 const bool is_cuda = false ;
386#endif
387 if (is_cuda) {
388 const size_t block_size = dev_config.block_dim.x * dev_config.block_dim.y;
389 ensemble_dev_config.block_dim.x = ensemble;
390 ensemble_dev_config.block_dim.y = block_size/ensemble;
391 }
392 Perf perf_ensemble =
393 fenl_assembly<mp_vector_type,Device>(
395 ensemble_dev_config, ensemble_residual);
396
397 if (check)
398 check_residuals( scalar_residual, ensemble_residual );
399
400 perf_scalar.scale( 1000.0 / ( perf_scalar.global_node_count ) );
401 perf_ensemble.scale( 1000.0 / ( ensemble * perf_scalar.global_node_count ) );
402
403 if (comm->getRank() == 0) {
404 std::cout.precision(3);
405 std::cout << use_nodes[0] << " , "
406 << perf_scalar.global_node_count << " , "
407 << std::setw(2) << ensemble << " , "
408 << std::scientific
409 << perf_scalar.import_time << " , "
410 << perf_ensemble.import_time << " , "
411 << std::fixed << std::setw(6)
412 << perf_scalar.import_time / perf_ensemble.import_time << " , "
413 << std::scientific
414 << perf_scalar.fill_time << " , "
415 << perf_ensemble.fill_time << " , "
416 << std::fixed << std::setw(6)
417 << perf_scalar.fill_time / perf_ensemble.fill_time << " , "
418 << std::endl;
419 }
420 }
421};
422
423template <class Storage, int entry_min, int entry_max, int entry_step>
424void performance_test_driver( const Teuchos::RCP<const Teuchos::Comm<int> >& comm ,
425 const int use_print ,
426 const int use_trials ,
427 const int use_nodes[] ,
428 const bool check ,
430{
431 if (comm->getRank() == 0) {
432 std::cout.precision(8);
433 std::cout << std::endl
434 << "\"Grid Size\" , "
435 << "\"FEM Size\" , "
436 << "\"Ensemble Size\" , "
437 << "\"Scalar Import Time (ms)\" , "
438 << "\"Ensemble Import Time (ms)\" , "
439 << "\"Ensemble Import Speedup\" , "
440 << "\"Scalar Fill Time (ms)\" , "
441 << "\"Ensemble Fill Time (ms)\" , "
442 << "\"Ensemble Fill Speedup\" , "
443 << std::endl;
444 }
445
446 // Loop over [entry_min, entry_max] vector entries per thread
447 typedef Sacado::mpl::range_c< int, entry_min, entry_max+1, entry_step > Range;
448 PerformanceDriverOp<Storage> op(comm, use_print, use_trials,
449 use_nodes, check, dev_config);
450 Sacado::mpl::for_each_no_kokkos<Range> f(op);
451}
bool check_residuals(const ScalarViewType &scalar_residual, const EnsembleViewType &ensemble_residual)
double maximum(const Teuchos::Comm< int > &comm, double local)
bool check_residuals(const ScalarViewType &scalar_residual, const EnsembleViewType &ensemble_residual)
Perf fenl_assembly(const Teuchos::RCP< const Teuchos::Comm< int > > &comm, const int use_print, const int use_trials, const int use_nodes[], Kokkos::Example::FENL::DeviceConfig dev_config, Kokkos::View< Scalar *, Kokkos::LayoutLeft, Device > &nodal_residual)
void performance_test_driver(const Teuchos::RCP< const Teuchos::Comm< int > > &comm, const int use_print, const int use_trials, const int use_nodes[], const bool check, Kokkos::Example::FENL::DeviceConfig dev_config)
expr1 expr1 expr1 expr2 expr1 expr1 c expr2 expr1 c fastAccessCoeff(j) - expr2.val(j)
Stokhos::StandardStorage< int, double > storage_type
Generate a distributed unstructured finite element mesh from a partitioned NX*NY*NZ box of elements.
int check(Epetra_CrsGraph &A, int NumMyRows1, int NumGlobalRows1, int NumMyNonzeros1, int NumGlobalNonzeros1, int *MyGlobalElements, bool verbose)
ScalarType f(const Teuchos::Array< ScalarType > &x, double a, double b)
KOKKOS_INLINE_FUNCTION constexpr std::enable_if< is_view_uq_pce< View< T, P... > >::value, unsigned >::type dimension_scalar(const View< T, P... > &view)
static void eval(Kokkos::Example::FENL::DeviceConfig &dev_config_elem, Kokkos::Example::FENL::DeviceConfig &dev_config_bc)
void min(const Perf &p)
void scale(double s)
void reduceMax(const Teuchos::Comm< int > &comm)
void increment(const Perf &p)
Kokkos::Example::FENL::DeviceConfig dev_config
Storage::execution_space Device
PerformanceDriverOp(const Teuchos::RCP< const Teuchos::Comm< int > > &comm_, const int use_print_, const int use_trials_, const int use_nodes_[], const bool check_, Kokkos::Example::FENL::DeviceConfig dev_config_)
Storage::ordinal_type Ordinal
Teuchos::RCP< const Teuchos::Comm< int > > comm