Stokhos Package Browser (Single Doxygen Collection) Version of the Day
Loading...
Searching...
No Matches
FadMPAssembly/TestAssembly.cpp
Go to the documentation of this file.
1// @HEADER
2// ***********************************************************************
3//
4// Stokhos Package
5// Copyright (2009) Sandia Corporation
6//
7// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
8// license for use of this work by or on behalf of the U.S. Government.
9//
10// Redistribution and use in source and binary forms, with or without
11// modification, are permitted provided that the following conditions are
12// met:
13//
14// 1. Redistributions of source code must retain the above copyright
15// notice, this list of conditions and the following disclaimer.
16//
17// 2. Redistributions in binary form must reproduce the above copyright
18// notice, this list of conditions and the following disclaimer in the
19// documentation and/or other materials provided with the distribution.
20//
21// 3. Neither the name of the Corporation nor the names of the
22// contributors may be used to endorse or promote products derived from
23// this software without specific prior written permission.
24//
25// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36//
37// Questions? Contact Eric T. Phipps (etphipp@sandia.gov).
38//
39// ***********************************************************************
40// @HEADER
41
42#include <iostream>
43
44// Tests
45#include "TestAssembly.hpp"
46
47// Devices
48#include "Kokkos_Core.hpp"
49
50// Utilities
51#include "Teuchos_DefaultComm.hpp"
52#include "Teuchos_CommandLineProcessor.hpp"
53#include "Teuchos_StandardCatchMacros.hpp"
54#ifdef KOKKOS_ENABLE_CUDA
55#include "cuda_runtime_api.h"
56#endif
57
// Host-backend driver for the assembly performance test.
//
// Fixes the compile-time sweep bounds entry_min / entry_max / entry_step
// (8..48 step 8 when __MIC__ is defined, 4..32 step 4 otherwise) and forwards
// every run-time argument unchanged to performance_test_driver, which is
// declared outside this listing (presumably in TestAssembly.hpp -- confirm).
//
// Template parameters:
//   Storage - storage type forwarded to performance_test_driver.
//   Method  - assembly method selector forwarded to performance_test_driver.
// Parameters (all forwarded as-is):
//   comm, use_print, use_trials, use_nodes, check, dev_config.
//
// Listing lines 59 and 65 were absent from this extract; they are restored
// here from the documented signature of mainHost and from the use of Method
// in the call below.
58template <typename Storage,
59 Kokkos::Example::FENL::AssemblyMethod Method>
60void mainHost(const Teuchos::RCP<const Teuchos::Comm<int> >& comm ,
61 const int use_print ,
62 const int use_trials ,
63 const int use_nodes[] ,
64 const bool check ,
65 Kokkos::Example::FENL::DeviceConfig dev_config) {
66#ifdef __MIC__
67 const int entry_min = 8;
68 const int entry_max = 48;
69 const int entry_step = 8;
70#else
71 const int entry_min = 4;
72 const int entry_max = 32;
73 const int entry_step = 4;
74 // const int entry_min = 1;
75 // const int entry_max = 1;
76 // const int entry_step = 1;
77#endif
78
   // The entry_* constants are used as non-type template arguments, so they
   // must remain compile-time constants.
79 performance_test_driver<Storage,entry_min,entry_max,entry_step,Method>(
80 comm, use_print, use_trials, use_nodes, check, dev_config);
81}
82
// CUDA-backend driver for the assembly performance test.
//
// Same shape as mainHost, but with the sweep bounds fixed at 16..64 step 16.
// Every run-time argument is forwarded unchanged to performance_test_driver,
// which is declared outside this listing (presumably in TestAssembly.hpp --
// confirm).
//
// Template parameters:
//   Storage - storage type forwarded to performance_test_driver.
//   Method  - assembly method selector forwarded to performance_test_driver.
// Parameters (all forwarded as-is):
//   comm, use_print, use_trials, use_nodes, check, dev_config.
//
// Listing lines 84 and 90 were absent from this extract; they are restored
// here from the documented signature of mainCuda and from the use of Method
// in the call below.
83template <typename Storage,
84 Kokkos::Example::FENL::AssemblyMethod Method>
85void mainCuda(const Teuchos::RCP<const Teuchos::Comm<int> >& comm ,
86 const int use_print ,
87 const int use_trials ,
88 const int use_nodes[] ,
89 const bool check ,
90 Kokkos::Example::FENL::DeviceConfig dev_config) {
   // Used as non-type template arguments below, so these must remain
   // compile-time constants.
91 const int entry_min = 16;
92 const int entry_max = 64;
93 const int entry_step = 16;
94 performance_test_driver<Storage,entry_min,entry_max,entry_step,Method>(
95 comm, use_print, use_trials, use_nodes, check, dev_config);
96}
97
// Program entry point for the assembly performance test.
//
// Sets up MPI through Teuchos, registers and parses the command-line options,
// and then, for every Kokkos backend that is both compiled in
// (KOKKOS_ENABLE_SERIAL / _THREADS / _OPENMP / _CUDA) and enabled by its
// option, initializes Kokkos, prints a banner on rank 0, runs mainHost or
// mainCuda, and finalizes Kokkos.
//
// Returns 0 when `success` is still true after the try/catch, -1 otherwise.
//
// NOTE(review): this listing is missing several original lines (173-174,
// 181, 204, 216, 230, 242, 256, 290). `Method`, `Storage` and the first line
// of three `dev_config` constructions are used below but not visible here.
98int main(int argc, char *argv[])
99{
100 bool success = true;
101 bool verbose = false;
102 try {
103
104 Teuchos::oblackholestream blackHole;
105 Teuchos::GlobalMPISession mpiSession (&argc, &argv, &blackHole);
106
107 Teuchos::RCP<const Teuchos::Comm<int> > comm =
108 Teuchos::DefaultComm<int>::getComm();
109
    // Hardware topology as reported by Kokkos::hwloc; only used to compute
    // the defaults of the "cores" and "hyperthreads" options below.
110 const size_t num_sockets = Kokkos::hwloc::get_available_numa_count();
111 const size_t num_cores_per_socket =
112 Kokkos::hwloc::get_available_cores_per_numa();
113 const size_t num_threads_per_core =
114 Kokkos::hwloc::get_available_threads_per_core();
115
116 // Setup command line options
117 Teuchos::CommandLineProcessor CLP;
118 CLP.setDocString(
119 "This test performance of MP::Vector FEM assembly.\n");
120 int nGrid = 32;
121 CLP.setOption("n", &nGrid, "Number of mesh points in the each direction");
122 int nIter = 10;
123 CLP.setOption("ni", &nIter, "Number of assembly iterations");
124 bool print = false;
125 CLP.setOption("print", "no-print", &print, "Print debugging output");
126 bool check = false;
127 int num_cores = num_cores_per_socket * num_sockets;
128 CLP.setOption("cores", &num_cores,
129 "Number of CPU cores to use (defaults to all)");
130 int num_hyper_threads = num_threads_per_core;
131 CLP.setOption("hyperthreads", &num_hyper_threads,
132 "Number of hyper threads per core to use (defaults to all)");
133 int threads_per_vector = 1;
134 CLP.setOption("threads_per_vector", &threads_per_vector,
135 "Number of threads to use within each vector");
136 CLP.setOption("check", "no-check", &check, "Check correctness");
    // One on/off switch per compiled-in backend; each defaults to enabled.
137#ifdef KOKKOS_ENABLE_SERIAL
138 bool serial = true;
139 CLP.setOption("serial", "no-serial", &serial, "Enable Serial device");
140#endif
141#ifdef KOKKOS_ENABLE_THREADS
142 bool threads = true;
143 CLP.setOption("threads", "no-threads", &threads, "Enable Threads device");
144#endif
145#ifdef KOKKOS_ENABLE_OPENMP
146 bool openmp = true;
147 CLP.setOption("openmp", "no-openmp", &openmp, "Enable OpenMP device");
148#endif
149#ifdef KOKKOS_ENABLE_CUDA
150 bool cuda = true;
151 CLP.setOption("cuda", "no-cuda", &cuda, "Enable Cuda device");
152 int cuda_threads_per_vector = 16;
153 CLP.setOption("cuda_threads_per_vector", &cuda_threads_per_vector,
154 "Number of Cuda threads to use within each vector");
155 int cuda_block_size = 256;
156 CLP.setOption("cuda_block_size", &cuda_block_size,
157 "Cuda block size");
158 int num_cuda_blocks = 0;
159 CLP.setOption("num_cuda_blocks", &num_cuda_blocks,
160 "Number of Cuda blocks (0 implies the default choice)");
161 int device_id = -1;
162 CLP.setOption("device", &device_id, "CUDA device ID. Set to default of -1 to use the default device as determined by the local node MPI rank and --ngpus");
163 int ngpus = 1;
164 CLP.setOption("ngpus", &ngpus, "Number of GPUs per node for multi-GPU runs via MPI");
165#endif
    // NOTE(review): the return value of parse() is not inspected, so the
    // backends below run regardless of the parse outcome -- confirm intended.
166 CLP.parse( argc, argv );
167
    // The same grid size is used in all three directions.
168 int use_nodes[3];
169 use_nodes[0] = nGrid; use_nodes[1] = nGrid; use_nodes[2] = nGrid;
170
171 typedef int Ordinal;
172 typedef double Scalar;
    // NOTE(review): listing lines 173-174 are missing from this extract;
    // presumably they define the `Method` constant used in the calls below
    // -- confirm against the original file.
175 // const Kokkos::Example::FENL::AssemblyMethod Method =
176 // Kokkos::Example::FENL::Analytic;
177
    // ---- Serial backend --------------------------------------------------
178#ifdef KOKKOS_ENABLE_SERIAL
179 if (serial) {
180 typedef Kokkos::Serial Device;
    // NOTE(review): listing line 181 is missing; presumably the `Storage`
    // typedef used by mainHost<Storage,Method> below -- confirm.
182
183 Kokkos::InitializationSettings init_args;
184 init_args.set_num_threads(num_cores*num_hyper_threads);
185 Kokkos::initialize( init_args );
186
187 if (comm->getRank() == 0)
188 std::cout << std::endl
189 << "Serial performance with " << comm->getSize()
190 << " MPI ranks" << std::endl;
191
192 Kokkos::Example::FENL::DeviceConfig dev_config(1, 1, 1);
193
194 mainHost<Storage,Method>(comm, print, nIter, use_nodes, check,
195 dev_config);
196
    // NOTE(review): each enabled backend block calls Kokkos::initialize and
    // Kokkos::finalize again; confirm the Kokkos version in use permits
    // re-initialization after finalize when several backends are enabled.
197 Kokkos::finalize();
198 }
199#endif
200
    // ---- Threads backend -------------------------------------------------
201#ifdef KOKKOS_ENABLE_THREADS
202 if (threads) {
203 typedef Kokkos::Threads Device;
    // NOTE(review): listing line 204 is missing; presumably the `Storage`
    // typedef -- confirm.
205
206 Kokkos::InitializationSettings init_args;
207 init_args.set_num_threads(num_cores*num_hyper_threads);
208 Kokkos::initialize( init_args );
209
210 if (comm->getRank() == 0)
211 std::cout << std::endl
212 << "Threads performance with " << comm->getSize()
213 << " MPI ranks and " << num_cores*num_hyper_threads
214 << " threads per rank:" << std::endl;
215
    // NOTE(review): listing line 216 is missing; the two lines below are the
    // trailing arguments of what is presumably the dev_config construction.
    // The division has no guard for threads_per_vector == 0, unlike the
    // CUDA branch further down.
217 threads_per_vector,
218 num_hyper_threads / threads_per_vector);
219
220 mainHost<Storage,Method>(comm, print, nIter, use_nodes, check,
221 dev_config);
222
223 Kokkos::finalize();
224 }
225#endif
226
    // ---- OpenMP backend --------------------------------------------------
227#ifdef KOKKOS_ENABLE_OPENMP
228 if (openmp) {
229 typedef Kokkos::OpenMP Device;
    // NOTE(review): listing line 230 is missing; presumably the `Storage`
    // typedef -- confirm.
231
232 Kokkos::InitializationSettings init_args;
233 init_args.set_num_threads(num_cores*num_hyper_threads);
234 Kokkos::initialize( init_args );
235
236 if (comm->getRank() == 0)
237 std::cout << std::endl
238 << "OpenMP performance with " << comm->getSize()
239 << " MPI ranks and " << num_cores*num_hyper_threads
240 << " threads per rank:" << std::endl;
241
    // NOTE(review): listing line 242 is missing (see the Threads branch);
    // the same unguarded division by threads_per_vector applies here.
243 threads_per_vector,
244 num_hyper_threads / threads_per_vector);
245
246 mainHost<Storage,Method>(comm, print, nIter, use_nodes, check,
247 dev_config);
248
249 Kokkos::finalize();
250 }
251#endif
252
    // ---- CUDA backend ----------------------------------------------------
253#ifdef KOKKOS_ENABLE_CUDA
254 if (cuda) {
255 typedef Kokkos::Cuda Device;
    // NOTE(review): listing line 256 is missing; presumably the `Storage`
    // typedef -- confirm.
257
    // With the default device_id of -1, pick the device from the node-local
    // rank: the first of SLURM_LOCALID, MV2_COMM_WORLD_LOCAL_RANK and
    // OMPI_COMM_WORLD_LOCAL_RANK that is set (0 if none is), modulo ngpus.
    // NOTE(review): the third lookup calls unqualified getenv while the
    // others use std::getenv, and no <cstdlib> include is visible here.
258 if (device_id == -1) {
259 int local_rank = 0;
260 char *str;
261 if ((str = std::getenv("SLURM_LOCALID")))
262 local_rank = std::atoi(str);
263 else if ((str = std::getenv("MV2_COMM_WORLD_LOCAL_RANK")))
264 local_rank = std::atoi(str);
265 else if ((str = getenv("OMPI_COMM_WORLD_LOCAL_RANK")))
266 local_rank = std::atoi(str);
267 device_id = local_rank % ngpus;
268
269 // Check device is valid
    // NOTE(review): the status returned by cudaGetDeviceCount is ignored and
    // num_device is not initialized beforehand; only the upper bound of
    // device_id is checked, and only on this auto-selection path.
270 int num_device; cudaGetDeviceCount(&num_device);
271 TEUCHOS_TEST_FOR_EXCEPTION(
272 device_id >= num_device, std::logic_error,
273 "Invalid device ID " << device_id << ". You probably are trying" <<
274 " to run with too many GPUs per node");
275 }
276
277 Kokkos::InitializationSettings init_args;
278 init_args.set_device_id(device_id);
279 Kokkos::initialize( init_args );
280
    // Device properties are queried only to print the device name in the
    // banner; the status returned by cudaGetDeviceProperties is not checked.
281 cudaDeviceProp deviceProp;
282 cudaGetDeviceProperties(&deviceProp, device_id);
283 if (comm->getRank() == 0)
284 std::cout << std::endl
285 << "CUDA performance performance with " << comm->getSize()
286 << " MPI ranks and device " << device_id << " ("
287 << deviceProp.name << "):"
288 << std::endl;
289
    // NOTE(review): listing line 290 is missing; the three lines below are
    // the arguments of what is presumably the dev_config construction. The
    // last argument is 0 when cuda_threads_per_vector is 0, which avoids a
    // division by zero.
291 num_cuda_blocks,
292 cuda_threads_per_vector,
293 cuda_threads_per_vector == 0 ? 0 : cuda_block_size / cuda_threads_per_vector);
294
295 mainCuda<Storage,Method>(comm, print, nIter, use_nodes, check,
296 dev_config);
297
298 Kokkos::finalize();
299 }
300#endif
301
302 }
    // `success` is handed to the Teuchos catch macro and decides the exit
    // code below; presumably the macro sets it to false when an exception is
    // caught -- confirm against Teuchos_StandardCatchMacros.hpp.
303 TEUCHOS_STANDARD_CATCH_STATEMENTS(verbose, std::cerr, success);
304
305 if (success)
306 return 0;
307 return -1;
308}
int main(int argc, char *argv[])
void mainHost(const Teuchos::RCP< const Teuchos::Comm< int > > &comm, const int use_print, const int use_trials, const int use_nodes[], const bool check, Kokkos::Example::FENL::DeviceConfig dev_config)
void mainCuda(const Teuchos::RCP< const Teuchos::Comm< int > > &comm, const int use_print, const int use_trials, const int use_nodes[], const bool check, Kokkos::Example::FENL::DeviceConfig dev_config)
Statically allocated storage class.
int check(Epetra_CrsGraph &A, int NumMyRows1, int NumGlobalRows1, int NumMyNonzeros1, int NumGlobalNonzeros1, int *MyGlobalElements, bool verbose)
Stokhos::StandardStorage< int, double > Storage