Peano
Loading...
Searching...
No Matches
KernelBenchmarksFVRiemann-main.cpp
Go to the documentation of this file.
1// This file is part of the ExaHyPE2 project. For conditions of distribution and
2// use, please see the copyright notice at www.peano-framework.org
3#include "KernelBenchmarksFVRiemann-main.h"
4
5#include "config.h"
6#include "Constants.h"
7
8#include "repositories/DataRepository.h"
9#include "repositories/SolverRepository.h"
10#include "repositories/StepRepository.h"
11
12#include "tasks/FVRiemannEnclaveTask.h"
13
14#include "peano4/peano.h"
15
23#include "tarch/logging/Log.h"
25#include "tarch/timing/Watch.h"
26
27#include <cstring>
28
29#include <fenv.h>
30#pragma float_control(precise, on)
31#pragma STDC FENV_ACCESS ON
32
33using namespace benchmarks::exahype2::kernelbenchmarks;
34
36
37constexpr double TimeStamp = 0.5;
38constexpr double TimeStepSize = 1e-6;
39constexpr double CellSize = 0.1;
40constexpr double CellOffset = 4.0;
41constexpr int HaloSize = 1;
42
43static_assert(Accuracy >= std::numeric_limits<double>::epsilon() || Accuracy == 0.0);
44
46 = (FVRiemann::NumberOfFiniteVolumesPerAxisPerPatch + 2 * HaloSize)
47 * (FVRiemann::NumberOfFiniteVolumesPerAxisPerPatch + 2 * HaloSize)
48#if Dimensions == 3
49 * (FVRiemann::NumberOfFiniteVolumesPerAxisPerPatch + 2 * HaloSize)
50#endif
51 * (FVRiemann::NumberOfUnknowns + FVRiemann::NumberOfAuxiliaryVariables);
52
54 = (FVRiemann::NumberOfFiniteVolumesPerAxisPerPatch + 0)
55 * (FVRiemann::NumberOfFiniteVolumesPerAxisPerPatch + 0)
56#if Dimensions == 3
57 * (FVRiemann::NumberOfFiniteVolumesPerAxisPerPatch + 0)
58#endif
59 * (FVRiemann::NumberOfUnknowns + FVRiemann::NumberOfAuxiliaryVariables);
60
62 = FVRiemann::NumberOfFiniteVolumesPerAxisPerPatch
63 * FVRiemann::NumberOfFiniteVolumesPerAxisPerPatch
64#if Dimensions == 3
65 * FVRiemann::NumberOfFiniteVolumesPerAxisPerPatch
66#endif
67 ;
68
69// Check the outcomes of each kernel
70double** validQ = nullptr;
71double* validMaxEigenvalue = nullptr;
72bool outcomeIsInvalid = false;
73
82void initInputData(double* Q) {
83 for (int i = 0; i < NumberOfInputEntriesPerPatch; i++) {
84 Q[i] = std::sin(1.0 * i / (NumberOfInputEntriesPerPatch) * tarch::la::PI);
85 }
86}
87
97void allocateAndStoreOutcome(const double* const* Q,
98 const double* const maxEigenvalue,
99 const int numberOfPatches
100) {
101 if constexpr (Accuracy <= 0.0) return;
102 if (validQ == nullptr and validMaxEigenvalue == nullptr) {
103 validQ = new double*[numberOfPatches];
104 for (int patchIndex = 0; patchIndex < numberOfPatches; patchIndex++) {
105 validQ[patchIndex] = new double[NumberOfOutputEntriesPerPatch];
106 std::memcpy(validQ[patchIndex], Q[patchIndex], sizeof(double) * NumberOfOutputEntriesPerPatch);
107 }
108 validMaxEigenvalue = new double[numberOfPatches];
109 std::memcpy(validMaxEigenvalue, maxEigenvalue, sizeof(double) * numberOfPatches);
110 logInfo("storeOutcome(...)", "bookmarked reference solution");
111 }
112}
113
114void freeOutcome(const int numberOfPatches) {
115 if constexpr (Accuracy <= 0.0) return;
116 for (int patchIndex = 0; patchIndex < numberOfPatches; patchIndex++) {
117 delete[] validQ[patchIndex];
118 }
119 delete[] validQ;
120 delete[] validMaxEigenvalue;
121 validQ = nullptr;
122 validMaxEigenvalue = nullptr;
123}
124
134 const double* const* Q,
135 const double* const maxEigenvalue,
136 const int numberOfPatches
137) {
138 if constexpr (Accuracy <= 0.0) return;
139 int errors = 0;
140 double maxDifference = 0.0;
141
142 std::cerr.precision(16);
143 for (int patchIndex = 0; patchIndex < numberOfPatches; patchIndex++) {
144 for (int i = 0; i < NumberOfOutputEntriesPerPatch; i++) {
145 if (not tarch::la::equals(Q[patchIndex][i], validQ[patchIndex][i], Accuracy)) {
146 if (!errors) { // Only print once
147 logError("validateOutcome(...)",
148 std::fixed
149 << "patch " << patchIndex << ": "
150 << "Q[" << i << "]!=validQ[" << i << "] ("
151 << Q[patchIndex][i]
152 << "!="
153 << validQ[patchIndex][i]
154 << ")"
155 );
156 }
157 errors++;
158 maxDifference = std::max(maxDifference, std::abs(Q[patchIndex][i] - validQ[patchIndex][i]));
159 }
160 }
161
162 if (not tarch::la::equals(maxEigenvalue[patchIndex], validMaxEigenvalue[patchIndex], Accuracy)) {
163 if (!errors) {
164 logError("validateOutcome(...)",
165 std::fixed
166 << "maxEigenvalue[" << patchIndex << "]!=validMaxEigenvalue[" << patchIndex << "] ("
167 << maxEigenvalue[patchIndex] << "!=" << validMaxEigenvalue[patchIndex]
168 << ")";
169 );
170 }
171 errors++;
172 maxDifference = std::max(maxDifference, std::abs(maxEigenvalue[patchIndex] - validMaxEigenvalue[patchIndex]));
173 }
174 }
175
176 if (errors > 0) {
177 outcomeIsInvalid = true;
178 logError("validateOutcome(...)",
179 "max difference of outcome from all patches is "
180 << maxDifference
181 << " (admissible accuracy="
182 << Accuracy << ")"
183 << " for " << errors << " entries"
184 );
185 }
186}
187
195 const std::string& kernelIdentificator,
196 const tarch::timing::Measurement& kernelMeasurement,
197 int numberOfPatches
198) {
199 std::stringstream ss;
200 ss << "\n";
201 ss << kernelIdentificator << ":\n\t";
202 ss << kernelMeasurement.getValue() << " |\n\t";
203 ss << (kernelMeasurement.getValue() / numberOfPatches / NumberOfFiniteVolumesPerPatch) << " |\n\t";
204 ss << kernelMeasurement.toString();
205 logInfo("reportRuntime(...)", ss.str());
206}
207
218template <void (*Kernel)(exahype2::CellData<double, double>&, peano4::utils::LoopPlacement),
219 peano4::utils::LoopPlacement loopPlacement>
222 Kernel(patchData, loopPlacement);
223}
224
225template <void (*Kernel)(int, exahype2::CellData<double, double>&)>
228 Kernel(device, patchData);
229}
230
236void runBenchmarks(int numberOfPatches) {
237 exahype2::CellData<double, double> patchData(numberOfPatches);
238 for (int patchIndex = 0; patchIndex < numberOfPatches; patchIndex++) {
239 patchData.QIn[patchIndex] = tarch::allocateMemory<double>(NumberOfInputEntriesPerPatch, tarch::MemoryLocation::ManagedSharedAcceleratorDeviceMemory);
240 patchData.t[patchIndex] = TimeStamp;
241 patchData.dt[patchIndex] = TimeStepSize;
242 patchData.QOut[patchIndex] = tarch::allocateMemory<double>(NumberOfOutputEntriesPerPatch, tarch::MemoryLocation::ManagedSharedAcceleratorDeviceMemory);
245 patchData.maxEigenvalue[patchIndex] = 0.0;
246 initInputData(patchData.QIn[patchIndex]);
247 std::memset(patchData.QOut[patchIndex], 0.0, NumberOfOutputEntriesPerPatch * sizeof(double));
248 }
249
250 auto assessKernel = [&](
251 std::function<void(int device, exahype2::CellData<double, double>& patchData)> kernelWrapper,
252 const std::string& markerName,
253 const int device
254 ) -> void {
255 tarch::timing::Measurement measurement;
256
257 int sample = 0;
258 while (sample <= NumberOfSamples) {
259 // Reset output data
260 for (int patchIndex = 0; patchIndex < numberOfPatches; patchIndex++) {
261 patchData.maxEigenvalue[patchIndex] = 0.0;
262 std::memset(patchData.QOut[patchIndex], 0.0, NumberOfOutputEntriesPerPatch * sizeof(double));
263 }
264
265 parallelFor(launchingThread, NumberOfLaunchingThreads) {
266 tarch::timing::Watch watch("::runBenchmarks", "assessKernel(...)", false);
267 kernelWrapper(device, patchData);
268 watch.stop();
269 measurement.setValue(watch.getCalendarTime());
271
272 sample++;
273 }
274
275 reportRuntime(markerName, measurement, numberOfPatches);
276 allocateAndStoreOutcome(patchData.QOut, patchData.maxEigenvalue, numberOfPatches);
277 validateOutcome(patchData.QOut, patchData.maxEigenvalue, numberOfPatches);
278 };
279
280 if constexpr (AssessHostKernels) {
283 FVRiemann,
284 FVRiemann::NumberOfFiniteVolumesPerAxisPerPatch,
285 HaloSize,
286 FVRiemann::NumberOfUnknowns,
287 FVRiemann::NumberOfAuxiliaryVariables,
288 EvaluateFlux,
289 EvaluateNonconservativeProduct,
290 EvaluateEigenvalues,
291 EvaluateSource,
292 EvaluateRiemann,
293 EvaluateMaximumEigenvalueAfterTimeStep,
296 "host, stateless, patch-wise, AoS, serial",
298 );
299
302 FVRiemann,
303 FVRiemann::NumberOfFiniteVolumesPerAxisPerPatch,
304 HaloSize,
305 FVRiemann::NumberOfUnknowns,
306 FVRiemann::NumberOfAuxiliaryVariables,
307 EvaluateFlux,
308 EvaluateNonconservativeProduct,
309 EvaluateEigenvalues,
310 EvaluateSource,
311 EvaluateRiemann,
312 EvaluateMaximumEigenvalueAfterTimeStep,
315 "host, stateless, patch-wise, AoS, spread-out",
317 );
318 } // AssessHostKernels
319
320 for (int patchIndex = 0; patchIndex < numberOfPatches; patchIndex++) {
321 tarch::freeMemory(patchData.QIn[patchIndex], tarch::MemoryLocation::Heap);
322 tarch::freeMemory(patchData.QOut[patchIndex], tarch::MemoryLocation::Heap);
323 }
324}
325
326int main(int argc, char** argv) {
328 // Do this early, so people can use logInfo properly.
329 repositories::initLogFilters();
333 repositories::initSharedMemoryAndGPUEnvironment();
334
335 if constexpr (EnableFPE) {
336 feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
337 }
338
339 logInfo(
340 "main()",
341 "number of compute threads: "
342 << tarch::multicore::Core::getInstance().getNumberOfThreads()
343 );
344 logInfo(
345 "main()",
346 "number of threads launching compute kernels: "
347 << NumberOfLaunchingThreads
348 );
349 logInfo(
350 "main()",
351 "number of unknowns: "
352 << FVRiemann::NumberOfUnknowns
353 );
354 logInfo(
355 "main()",
356 "number of auxiliary variables: "
357 << FVRiemann::NumberOfAuxiliaryVariables
358 );
359 logInfo(
360 "main()",
361 "number of finite volumes per axis per patch: "
362 << FVRiemann::NumberOfFiniteVolumesPerAxisPerPatch
363 );
364 logInfo(
365 "main()",
366 "number of samples per measurement: "
367 << NumberOfSamples
368 );
369 logInfo(
370 "main()",
371 "evaluate max. eigenvalue (reduction step): "
372 << std::boolalpha << EvaluateMaximumEigenvalueAfterTimeStep
373 );
374 logInfo(
375 "main()",
376 "floating-point exception handler enabled: "
377 << std::boolalpha << EnableFPE
378 );
379 logInfo(
380 "main()",
381 "performing accuracy checks with precision: "
382 << Accuracy
383 );
384#if defined(GPUOffloadingSYCL)
385 logInfo(
386 "main()",
387 "set SYCL_DEVICE_FILTER=gpu or ONEAPI_DEVICE_SELECTOR=cuda:0 when using SYCL on the device"
388 );
389 logInfo(
390 "main()",
391 "set SYCL_PI_TRACE=2 in case of runtime errors"
392 );
393#endif
394
395#if defined(SharedOMP)
396 #pragma omp parallel
397 {
398 #pragma omp master
399 {
400#endif
401 for (int i = 0; i < NumberOfPatchesToStudy.size(); i++) {
402 logInfo("main()", "number of patches: " << NumberOfPatchesToStudy[i]);
403 runBenchmarks(NumberOfPatchesToStudy[i]);
404 freeOutcome(NumberOfPatchesToStudy[i]);
405 }
406#if defined(SharedOMP)
407 }
408 }
409#endif
410
414
415 if (outcomeIsInvalid) {
416 return EXIT_FAILURE; // Make sure the CI pipeline reports an error
417 }
418
419 return EXIT_SUCCESS;
420}
#define assertionEquals(lhs, rhs)
#define assertion(expr)
void assessKernel(std::function< void(int) > kernelCallInLoop, const std::string &name, int numberOfParticles)
void allocateAndStoreOutcome(const double *const *Q, const double *const maxEigenvalue, const int numberOfPatches)
Allocates and stores outcome of one compute kernel.
constexpr int NumberOfOutputEntriesPerPatch
void freeOutcome(const int numberOfPatches)
constexpr int NumberOfInputEntriesPerPatch
void wrapHostKernel(int device, exahype2::CellData< double, double > &patchData)
We want to use all kernels exactly the same way.
void validateOutcome(const double *const *Q, const double *const maxEigenvalue, const int numberOfPatches)
Validate data against pre-stored simulation outcome.
void reportRuntime(const std::string &kernelIdentificator, const tarch::timing::Measurement &kernelMeasurement, int numberOfPatches)
Reports the runtime and throughput of the benchmarks.
tarch::logging::Log _log("::")
void initInputData(double *Q)
Set input data.
constexpr double CellOffset
constexpr double TimeStamp
constexpr int HaloSize
double * validMaxEigenvalue
void runBenchmarks(int numberOfPatches)
Run the benchmark for one particular number of patches.
void wrapDeviceKernel(int device, exahype2::CellData< double, double > &patchData)
constexpr int NumberOfFiniteVolumesPerPatch
constexpr double TimeStepSize
constexpr double CellSize
#define logError(methodName, logMacroMessageStream)
Wrapper macro around tarch::logging::Log to improve logging.
Definition Log.h:464
#define logInfo(methodName, logMacroMessageStream)
Wrapper macro around tarch::logging::Log to improve logging.
Definition Log.h:411
#define parallelFor(counter, max)
Definition Loop.h:446
int main()
Definition main.cpp:321
static constexpr int HostDevice
Accelerator devices (GPUs) are enumerated starting from 0.
Definition Device.h:48
Log Device.
Definition Log.h:516
static Core & getInstance()
Definition Core.cpp:56
std::string toString() const
void setValue(const double &value)
Set the value.
A simple class that has to be included to measure the clock ticks required for an operation.
Definition Watch.h:45
double getCalendarTime()
This method returns the elapsed calendar time between the start and stop command of the timer,...
Definition Watch.cpp:74
void stop()
Stop timer.
Definition Watch.cpp:55
#define endParallelFor
Definition Loop.h:63
KeywordToAvoidDuplicateSymbolsForInlinedFunctions void timeStepWithRiemannPatchwiseHeapStateless(CellData< double, double > &patchData, peano4::utils::LoopPlacement loopPlacement=peano4::utils::LoopPlacement::Serial) InlineMethod
void fillLookupTables()
Fill Lookup Tables.
Definition peano.cpp:87
int initParallelEnvironment(int *argc, char ***argv)
Init Parallel Environment.
Definition peano.cpp:101
void shutdownParallelEnvironment()
Shutdown all the parallel environment, i.e.
Definition peano.cpp:127
CF abs(const CF &cf)
constexpr double PI
Definition Scalar.h:12
bool equals(const Matrix< Rows, Cols, Scalar > &lhs, const Matrix< Rows, Cols, Scalar > &rhs, const Scalar &tolerance=NUMERICAL_ZERO_DIFFERENCE)
Compares two matrices for equality up to a numerical tolerance.
void shutdownSmartMPI()
Definition multicore.cpp:49
void initSmartMPI()
Switch on SmartMPI.
Definition multicore.cpp:33
void freeMemory(void *data, MemoryLocation location, int device=accelerator::Device::HostDevice)
Free memory.
void shutdownNonCriticalAssertionEnvironment()
peano4::shutdownParallelEnvironment().
void initNonCriticalAssertionEnvironment()
Register the assertion tag from the global communicator.
@ Heap
Create data on the heap of the local device.
@ ManagedSharedAcceleratorDeviceMemory
To be used on host only.
Representation of a number of cells which contains all information that's required to process the sto...
Definition CellData.h:77
outType ** QOut
Out values.
Definition CellData.h:116
inType ** QIn
QIn may not be const, as some kernels delete it straightaway once the input data has been handled.
Definition CellData.h:82
double * maxEigenvalue
Out values.
Definition CellData.h:121
tarch::la::Vector< Dimensions, double > * cellCentre
Definition CellData.h:83
tarch::la::Vector< Dimensions, double > * cellSize
Definition CellData.h:84
Simple vector class.
Definition Vector.h:150