ryujin 2.1.1 revision 6dc06e5864abd5d99e5d7ab641dbe621936411d9
hyperbolic_module.template.h
1//
2// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
3// Copyright (C) 2020 - 2024 by the ryujin authors
4//
5
6#pragma once
7
8#include "hyperbolic_module.h"
9#include "introspection.h"
10#include "mpi_ensemble.h"
11#include "openmp.h"
12#include "scope.h"
13#include "simd.h"
14
16
17#include <atomic>
18
19namespace ryujin
20{
21 namespace ShallowWater
22 {
23 struct Description;
24 }
25
26 using namespace dealii;
27
28 template <typename Description, int dim, typename Number>
29 HyperbolicModule<Description, dim, Number>::HyperbolicModule(
30 const MPIEnsemble &mpi_ensemble,
31 std::map<std::string, dealii::Timer> &computing_timer,
32 const OfflineData<dim, Number> &offline_data,
33 const HyperbolicSystem &hyperbolic_system,
34 const InitialValues<Description, dim, Number> &initial_values,
35 const std::string &subsection /*= "HyperbolicModule"*/)
36 : ParameterAcceptor(subsection)
37 , id_violation_strategy_(IDViolationStrategy::warn)
38 , indicator_parameters_(subsection + "/indicator")
39 , limiter_parameters_(subsection + "/limiter")
40 , riemann_solver_parameters_(subsection + "/riemann solver")
41 , mpi_ensemble_(mpi_ensemble)
42 , computing_timer_(computing_timer)
43 , offline_data_(&offline_data)
44 , hyperbolic_system_(&hyperbolic_system)
45 , initial_values_(&initial_values)
46 , cfl_(0.2)
47 , n_restarts_(0)
48 , n_corrections_(0)
49 , n_warnings_(0)
50 {
51 }
52
53
54 template <typename Description, int dim, typename Number>
55 void HyperbolicModule<Description, dim, Number>::prepare()
56 {
57#ifdef DEBUG_OUTPUT
58 std::cout << "HyperbolicModule<Description, dim, Number>::prepare()"
59 << std::endl;
60#endif
61
62 AssertThrow(limiter_parameters_.iterations() <= 2,
63 dealii::ExcMessage(
64 "The number of limiter iterations must be in the range [0,2]"));
65
66 /* Initialize vectors: */
67
68 const auto &scalar_partitioner = offline_data_->scalar_partitioner();
69 alpha_.reinit(scalar_partitioner);
70 bounds_.reinit_with_scalar_partitioner(scalar_partitioner);
71
72 r_.reinit(offline_data_->hyperbolic_vector_partitioner());
73 using View =
74 typename Description::template HyperbolicSystemView<dim, Number>;
75
76 /* Initialize matrices: */
77
78 const auto &sparsity_simd = offline_data_->sparsity_pattern_simd();
79 dij_matrix_.reinit(sparsity_simd);
80 lij_matrix_.reinit(sparsity_simd);
81 lij_matrix_next_.reinit(sparsity_simd);
82 pij_matrix_.reinit(sparsity_simd);
83
84 /* Set up initial precomputed vector: */
85
86 initial_precomputed_ =
87 initial_values_->interpolate_initial_precomputed_vector();
88 }
89
90
91 /*
92 * -------------------------------------------------------------------------
93 * Step 1: Apply boundary conditions and precompute values
94 * -------------------------------------------------------------------------
95 */
96
97
98 template <typename Description, int dim, typename Number>
99 void HyperbolicModule<Description, dim, Number>::prepare_state_vector(
100 StateVector &state_vector, Number t) const
101 {
102#ifdef DEBUG_OUTPUT
103 std::cout << "HyperbolicModule<Description, dim, "
104 "Number>::prepare_state_vector()"
105 << std::endl;
106#endif
107
108 auto &U = std::get<0>(state_vector);
109 auto &precomputed = std::get<1>(state_vector);
110
111 const unsigned int n_export_indices = offline_data_->n_export_indices();
112 const unsigned int n_internal = offline_data_->n_locally_internal();
113 const unsigned int n_owned = offline_data_->n_locally_owned();
114 const auto &sparsity_simd = offline_data_->sparsity_pattern_simd();
115 const auto &boundary_map = offline_data_->boundary_map();
116 unsigned int channel = 10;
117 using VA = VectorizedArray<Number>;
118
119 Scope scope(computing_timer_,
120 "time step [H] 1 - update boundary values, precompute values");
121
122 LIKWID_MARKER_START("time_step_1a");
123
124 /* FIXME: not thread parallel... */
125 for (const auto &entry : boundary_map) {
126 const auto &[i, normal, normal_mass, boundary_mass, id, position] = entry;
127
128 /*
129 * Relay the task of applying appropriate boundary conditions to the
130 * Problem Description.
131 */
132
133 if (id == Boundary::do_nothing)
134 continue;
135
136 auto U_i = U.get_tensor(i);
137
138 /* Use a lambda to avoid computing unnecessary state values */
139 auto get_dirichlet_data = [position = position, t = t, this]() {
140 return initial_values_->initial_state(position, t);
141 };
142
143 const auto view = hyperbolic_system_->template view<dim, Number>();
144 U_i = view.apply_boundary_conditions(id, U_i, normal, get_dirichlet_data);
145 U.write_tensor(U_i, i);
146 }
147
148 LIKWID_MARKER_STOP("time_step_1a");
149
150 U.update_ghost_values();
151
152 /*
153 * Precompute values
154 */
155
156 if constexpr (n_precomputation_cycles != 0) {
157 for (unsigned int cycle = 0; cycle < n_precomputation_cycles; ++cycle) {
158
159 SynchronizationDispatch synchronization_dispatch([&]() {
160 precomputed.update_ghost_values_start(channel++);
161 precomputed.update_ghost_values_finish();
162 });
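/*
 * Note: the synchronization dispatch constructed above is used to
 * overlap the MPI ghost-value exchange with the remaining computation:
 * its payload (update_ghost_values_start()/finish()) is triggered from
 * within the loops below via synchronization_dispatch.check() once the
 * locally relevant export indices have been processed.
 */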
163
165 LIKWID_MARKER_START(("time_step_1b"));
166
167 auto loop = [&](auto sentinel, unsigned int left, unsigned int right) {
168 using T = decltype(sentinel);
169
170 /* Stored thread locally: */
171 bool thread_ready = false;
172
173 const auto view = hyperbolic_system_->template view<dim, T>();
174 view.precomputation_loop(
175 cycle,
176 [&](const unsigned int i) {
177 synchronization_dispatch.check(
178 thread_ready, i >= n_export_indices && i < n_internal);
179 },
180 sparsity_simd,
181 state_vector,
182 left,
183 right);
184 };
185
186 /* Parallel non-vectorized loop: */
187 loop(Number(), n_internal, n_owned);
188 /* Parallel vectorized SIMD loop: */
189 loop(VA(), 0, n_internal);
190
191 LIKWID_MARKER_STOP("time_step_1b");
193 }
194 }
195 }
196
197
198 /*
199 * -------------------------------------------------------------------------
200 * Step 2 - 7: Perform an explicit Euler step
201 * -------------------------------------------------------------------------
202 */
203
204
205 namespace
206 {
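/*
 * Internal helper used in step(): returns true if all column indices js
 * of the current row block point strictly below the diagonal, i.e.,
 * j < i in the sequential case and js[k] < i + k for every SIMD lane k
 * in the vectorized case. Such entries are skipped in Step 2, where only
 * the upper triangular portion of d_ij is computed.
 */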
211 template <typename T>
212 bool all_below_diagonal(unsigned int i, const unsigned int *js)
213 {
214 if constexpr (std::is_same_v<T, typename get_value_type<T>::type>) {
215 /* Non-vectorized sequential access. */
216 const auto j = *js;
217 return j < i;
218
219 } else {
220 /* Vectorized fast access. Index must be divisible by simd_length. */
221
222 constexpr auto simd_length = T::size();
223
224 bool all_below_diagonal = true;
225 for (unsigned int k = 0; k < simd_length; ++k)
226 if (js[k] >= i + k) {
227 all_below_diagonal = false;
228 break;
229 }
230 return all_below_diagonal;
231 }
232 }
233 } // namespace
234
235
236 template <typename Description, int dim, typename Number>
237 template <int stages>
238 Number HyperbolicModule<Description, dim, Number>::step(
239 const StateVector &old_state_vector,
240 std::array<std::reference_wrapper<const StateVector>, stages>
241 stage_state_vectors,
242 const std::array<Number, stages> stage_weights,
243 StateVector &new_state_vector,
244 Number tau /*= 0.*/,
245 std::atomic<Number> tau_max /*std::numeric_limits<Number>::max()*/) const
246 {
247#ifdef DEBUG_OUTPUT
248 std::cout << "HyperbolicModule<Description, dim, Number>::step()"
249 << std::endl;
250#endif
251
252 auto &old_U = std::get<0>(old_state_vector);
253 auto &old_precomputed = std::get<1>(old_state_vector);
254 auto &new_U = std::get<0>(new_state_vector);
255
257
258 /*
259 * Workaround: A constexpr boolean storing the fact whether we
260 * instantiate the HyperbolicModule for the shallow water equations.
261 *
262 * Rationale: Currently, the shallow water equations are the only
263 * hyperbolic system for which we have to (a) form equilibrated states
264 * for the low-order update, and (b) apply an affine shift for
265 * computing limiter bounds. It's not so easy to come up with a
266 * meaningful abstraction layer for this (in particular because we only
267 * have one PDE). Thus, for the time being we simply special case a
268 * small amount of code in this routine.
269 *
270 * FIXME: Refactor into a proper abstraction layer / interface.
271 */
272 constexpr bool shallow_water =
273 std::is_same_v<Description, ShallowWater::Description>;
274
275 using VA = VectorizedArray<Number>;
276
277 /* Index ranges for the iteration over the sparsity pattern: */
278
279 constexpr auto simd_length = VA::size();
280 const unsigned int n_export_indices = offline_data_->n_export_indices();
281 const unsigned int n_internal = offline_data_->n_locally_internal();
282 const unsigned int n_owned = offline_data_->n_locally_owned();
283
284 /* References to precomputed matrices and the stencil: */
285
286 const auto &sparsity_simd = offline_data_->sparsity_pattern_simd();
287
288 const auto &mass_matrix = offline_data_->mass_matrix();
289 const auto &mass_matrix_inverse = offline_data_->mass_matrix_inverse();
290 const auto &lumped_mass_matrix = offline_data_->lumped_mass_matrix();
291 const auto &lumped_mass_matrix_inverse =
292 offline_data_->lumped_mass_matrix_inverse();
293
294 const auto &cij_matrix = offline_data_->cij_matrix();
295 const auto &incidence_matrix = offline_data_->incidence_matrix();
296
297 const auto &coupling_boundary_pairs =
298 offline_data_->coupling_boundary_pairs();
299
300 const Number measure_of_omega_inverse =
301 Number(1.) / offline_data_->measure_of_omega();
302
303 /* A monotonically increasing "channel" variable for mpi_tags: */
304 unsigned int channel = 10;
305
306 /* Lambda for creating the computing timer string: */
307 int step_no = 1;
308 const auto scoped_name = [&step_no](const auto &name,
309 const bool advance = true) {
310 advance || step_no--;
311 return "time step [H] " + std::to_string(++step_no) + " - " + name;
312 };
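/*
 * Note: scoped_name("foo") yields the label "time step [H] <n> - foo"
 * and advances the step counter, keeping the timer sections aligned with
 * the "Step n" headers in the comments below; with advance == false the
 * current step number is reused.
 */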
313
314 /* A boolean signalling that a restart is necessary: */
315 std::atomic<bool> restart_needed = false;
316
317 /*
318 * -------------------------------------------------------------------------
319 * Step 2: Compute off-diagonal d_ij, and alpha_i
320 *
321 * The computation of the d_ij is quite costly. So we do a trick to
322 * save a bit of computational resources. Instead of computing all d_ij
323 * entries for a row of a given local index i, we only compute d_ij for
324 * which j > i,
325 *
326 * llllrr
327 * l .xxxxx
328 * l ..xxxx
329 * l ...xxx
330 * l ....xx
331 * r ......
332 * r ......
333 *
334 * and symmetrize the result in Step 3 below.
335 *
336 * MM: We could save a bit more computational resources by only
337 * computing entries for which *IN A GLOBAL* enumeration j > i. But
338 * the index translation, subsequent symmetrization, and exchange
339 * sound a bit too expensive...
340 * -------------------------------------------------------------------------
341 */
342
343 {
344 Scope scope(computing_timer_, scoped_name("compute d_ij, and alpha_i"));
345
346 SynchronizationDispatch synchronization_dispatch([&]() {
347 alpha_.update_ghost_values_start(channel++);
348 alpha_.update_ghost_values_finish();
349 });
350
352 LIKWID_MARKER_START(("time_step_" + std::to_string(step_no)).c_str());
353
354 auto loop = [&](auto sentinel, unsigned int left, unsigned int right) {
355 using T = decltype(sentinel);
356 unsigned int stride_size = get_stride_size<T>;
357
358 /* Stored thread locally: */
359
360 using RiemannSolver =
361 typename Description::template RiemannSolver<dim, T>;
362 RiemannSolver riemann_solver(
363 *hyperbolic_system_, riemann_solver_parameters_, old_precomputed);
364
365 using Indicator = typename Description::template Indicator<dim, T>;
366 Indicator indicator(
367 *hyperbolic_system_, indicator_parameters_, old_precomputed);
368
369 bool thread_ready = false;
370
372 for (unsigned int i = left; i < right; i += stride_size) {
373
374 /* Skip constrained degrees of freedom: */
375 const unsigned int row_length = sparsity_simd.row_length(i);
376 if (row_length == 1)
377 continue;
378
379 synchronization_dispatch.check(
380 thread_ready, i >= n_export_indices && i < n_internal);
381
382 const auto U_i = old_U.template get_tensor<T>(i);
383
384 indicator.reset(i, U_i);
385
386 const unsigned int *js = sparsity_simd.columns(i);
387 for (unsigned int col_idx = 0; col_idx < row_length;
388 ++col_idx, js += stride_size) {
389
390 const auto U_j = old_U.template get_tensor<T>(js);
391
392 const auto c_ij = cij_matrix.template get_tensor<T>(i, col_idx);
393
394 indicator.accumulate(js, U_j, c_ij);
395
396 /* Skip diagonal. */
397 if (col_idx == 0)
398 continue;
399
400 /* Only iterate over the upper triangular portion of d_ij */
401 if (all_below_diagonal<T>(i, js))
402 continue;
403
404 const auto norm = c_ij.norm();
405 const auto n_ij = c_ij / norm;
406 const auto lambda_max =
407 riemann_solver.compute(U_i, U_j, i, js, n_ij);
408 const auto d_ij = norm * lambda_max;
409
410 dij_matrix_.write_entry(d_ij, i, col_idx, true);
411 }
412
413 const auto mass = get_entry<T>(lumped_mass_matrix, i);
414 const auto hd_i = mass * measure_of_omega_inverse;
415 write_entry<T>(alpha_, indicator.alpha(hd_i), i);
416 }
417 };
418
419 /* Parallel non-vectorized loop: */
420 loop(Number(), n_internal, n_owned);
421 /* Parallel vectorized SIMD loop: */
422 loop(VA(), 0, n_internal);
423
424 LIKWID_MARKER_STOP(("time_step_" + std::to_string(step_no)).c_str());
426 }
427
428 /*
429 * -------------------------------------------------------------------------
430 * Step 3: Compute diagonal of d_ij, and maximal time-step size.
431 * -------------------------------------------------------------------------
432 */
433
434 {
435 Scope scope(computing_timer_,
436 scoped_name("compute bdry d_ij, diag d_ii, and tau_max"));
437
438 /* Parallel region */
440 LIKWID_MARKER_START(("time_step_" + std::to_string(step_no)).c_str());
441
442 /*
443 * Complete d_ij at boundary:
444 *
445 * Here, for continuous finite elements the assumption c_ij = -c_ji
446 * no longer holds true. This implies that d_ij != d_ji. We thus need
447 * to compute the lower-triangular part of d_ij, where i and j are
448 * boundary degrees of freedom as well.
449 */
450
451 using RiemannSolver =
452 typename Description::template RiemannSolver<dim, Number>;
453 RiemannSolver riemann_solver(
454 *hyperbolic_system_, riemann_solver_parameters_, old_precomputed);
455
456 Number local_tau_max = std::numeric_limits<Number>::max();
457
458 /*
459 * Note: we need this dance of iterating over an integer and then
460 * accessing the element to make Apple's OpenMP implementation
461 * happy.
462 */
464 for (std::size_t k = 0; k < coupling_boundary_pairs.size(); ++k) {
465 const auto &[i, col_idx, j] = coupling_boundary_pairs[k];
466
467 /*
468 * Only work on index pairs "i < j" that point to the upper
469 * triangular portion of the d_ij matrix. For all of these index
470 * pairs we compute the corresponding d_ji entry and fix up the
471 * d_ij entry (from step 2) by taking the maximum. Note that we
472 * actually do not store anything in the d_ji entry itself because
473 * we symmetrize the matrix later on anyway.
474 */
475 if (j < i)
476 continue;
477
478 const auto U_i = old_U.get_tensor(i);
479 const auto U_j = old_U.get_tensor(j);
480
481 const auto c_ji = cij_matrix.get_transposed_tensor(i, col_idx);
482 Assert(c_ji.norm() > 1.e-12, ExcInternalError());
483 const auto norm_ji = c_ji.norm();
484 const auto n_ji = c_ji / norm_ji;
485
486 const auto d_ij = dij_matrix_.get_entry(i, col_idx);
487
488 const auto lambda_max = riemann_solver.compute(U_j, U_i, j, &i, n_ji);
489 const auto d_ji = norm_ji * lambda_max;
490
491 dij_matrix_.write_entry(std::max(d_ij, d_ji), i, col_idx);
492 }
493
494 /* Symmetrize d_ij: */
495
497 for (unsigned int i = 0; i < n_owned; ++i) {
498
499 /* Skip constrained degrees of freedom: */
500 const unsigned int row_length = sparsity_simd.row_length(i);
501 if (row_length == 1)
502 continue;
503
504 Number d_sum = Number(0.);
505
506 /* skip diagonal: */
507 const unsigned int *js = sparsity_simd.columns(i);
508 for (unsigned int col_idx = 1; col_idx < row_length; ++col_idx) {
509 const auto j =
510 *(i < n_internal ? js + col_idx * simd_length : js + col_idx);
511
512 // fill lower triangular part of dij_matrix that was skipped in Step 2
513 if (j < i) {
514 const auto d_ji = dij_matrix_.get_transposed_entry(i, col_idx);
515
516#ifdef DEBUG_SYMMETRY_CHECK
517 /* Verify that d_ji == std::max(d_ij, d_ji): */
518
519 const auto U_i = old_U.get_tensor(i);
520 const auto U_j = old_U.get_tensor(j);
521
522 const auto c_ij = cij_matrix.get_tensor(i, col_idx);
523 Assert(c_ij.norm() > 1.e-12, ExcInternalError());
524 const auto norm_ij = c_ij.norm();
525 const auto n_ij = c_ij / norm_ij;
526
527 const auto lambda_max =
528 riemann_solver.compute(U_i, U_j, i, &j, n_ij);
529 const auto d_ij = norm_ij * lambda_max;
530
531 Assert(d_ij <= d_ji + 1.0e-12,
532 dealii::ExcMessage("d_ij not symmetrized correctly on "
533 "boundary degrees of freedom."));
534#endif
535
536 dij_matrix_.write_entry(d_ji, i, col_idx);
537 }
538
539 d_sum -= dij_matrix_.get_entry(i, col_idx);
540 }
541
542 /*
543 * Make sure that we do not accidentally divide by zero. (Yes, this
544 * can happen for some (admittedly, rather esoteric) scalar
545 * conservation equations...).
546 */
547 d_sum =
548 std::min(d_sum, Number(-1.e6) * std::numeric_limits<Number>::min());
549
550 /* write diagonal element */
551 dij_matrix_.write_entry(d_sum, i, 0);
552
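/*
 * Local time-step restriction: with the (negative) diagonal entry d_ii
 * stored in d_sum, the admissible step size for row i is
 * tau_i = cfl * m_i / (-2 * d_ii); tau_max is the minimum over all
 * locally owned rows (and is reduced over MPI ranks further below).
 */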
553 const Number mass = lumped_mass_matrix.local_element(i);
554 const Number tau = cfl_ * mass / (Number(-2.) * d_sum);
555 local_tau_max = std::min(local_tau_max, tau);
556 }
557
558 /* Synchronize tau max over all threads: */
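/*
 * This is an atomic "fetch min": compare_exchange_weak reloads
 * current_tau_max on failure, so the loop exits as soon as either the
 * exchange succeeds or another thread has already stored a smaller value.
 */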
559 Number current_tau_max = tau_max.load();
560 while (current_tau_max > local_tau_max &&
561 !tau_max.compare_exchange_weak(current_tau_max, local_tau_max))
562 ;
563
564 LIKWID_MARKER_STOP(("time_step_" + std::to_string(step_no)).c_str());
566 }
567
568 {
569 Scope scope(computing_timer_,
570 "time step [H] _ - synchronization barriers");
571
572 /*
573 * MPI Barrier: Synchronize the maximal time-step size. This has to
574 * happen either over the global or the local subrange communicator:
575 */
576 tau_max.store(Utilities::MPI::min(
577 tau_max.load(), mpi_ensemble_.synchronization_communicator()));
578
579 AssertThrow(
580 !std::isnan(tau_max) && !std::isinf(tau_max) && tau_max > 0.,
581 ExcMessage(
582 "I'm sorry, Dave. I'm afraid I can't do that.\nWe crashed."));
583
584 tau = (tau == Number(0.) ? tau_max.load() : tau);
585
586#ifdef DEBUG_OUTPUT
587 std::cout << " computed tau_max = " << tau_max << std::endl;
588 std::cout << " perform time-step with tau = " << tau << std::endl;
589#endif
590 }
591
592#ifdef DEBUG
593 /* Exchange d_ij so that we can check for symmetry: */
594 dij_matrix_.update_ghost_rows();
595#endif
596
597 /*
598 * -------------------------------------------------------------------------
599 * Step 4: Low-order update, also compute limiter bounds, R_i
600 * -------------------------------------------------------------------------
601 */
602
603 {
604 Scope scope(computing_timer_,
605 scoped_name("l.-o. update, compute bounds, r_i, and p_ij"));
606
607 SynchronizationDispatch synchronization_dispatch([&]() {
608 r_.update_ghost_values_start(channel++);
609 r_.update_ghost_values_finish();
610 if (offline_data_->discretization().have_discontinuous_ansatz()) {
611 /*
612 * In case we extend bounds over the stencil, we have to ensure
613 * that ghost ranges are properly communicated over all MPI
614 * ranks.
615 */
616 bounds_.update_ghost_values_start(channel++);
617 bounds_.update_ghost_values_finish();
618 }
619 });
620
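/*
 * Weight assigned to the current step's own flux contribution: since
 * std::accumulate starts from -1, this evaluates to
 * 1 - (stage_weights[0] + ... + stage_weights[stages - 1]).
 */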
621 const Number weight =
622 -std::accumulate(stage_weights.begin(), stage_weights.end(), -1.);
623
624 /* Parallel region */
626 LIKWID_MARKER_START(("time_step_" + std::to_string(step_no)).c_str());
627
628 auto loop = [&](auto sentinel,
629 auto have_discontinuous_ansatz,
630 unsigned int left,
631 unsigned int right) {
632 using T = decltype(sentinel);
633 using View =
634 typename Description::template HyperbolicSystemView<dim, T>;
635 using Limiter = typename Description::template Limiter<dim, T>;
636 using flux_contribution_type = typename View::flux_contribution_type;
637 using state_type = typename View::state_type;
638
639 unsigned int stride_size = get_stride_size<T>;
640
641 const auto view = hyperbolic_system_->template view<dim, T>();
642
643 /* Stored thread locally: */
644 Limiter limiter(
645 *hyperbolic_system_, limiter_parameters_, old_precomputed);
646 bool thread_ready = false;
647
649 for (unsigned int i = left; i < right; i += stride_size) {
650
651 /* Skip constrained degrees of freedom: */
652 const unsigned int row_length = sparsity_simd.row_length(i);
653 if (row_length == 1)
654 continue;
655
656 synchronization_dispatch.check(
657 thread_ready, i >= n_export_indices && i < n_internal);
658
659 const auto U_i = old_U.template get_tensor<T>(i);
660 auto U_i_new = U_i;
661
662 const auto alpha_i = get_entry<T>(alpha_, i);
663 const auto m_i = get_entry<T>(lumped_mass_matrix, i);
664 const auto m_i_inv = get_entry<T>(lumped_mass_matrix_inverse, i);
665
666 const auto flux_i = view.flux_contribution(
667 old_precomputed, initial_precomputed_, i, U_i);
668
669 std::array<flux_contribution_type, stages> flux_iHs;
670 [[maybe_unused]] state_type S_iH;
671
672 for (int s = 0; s < stages; ++s) {
673 const auto &[U_s, prec_s, V_s] = stage_state_vectors[s].get();
674
675 const auto U_iHs = U_s.template get_tensor<T>(i);
676 flux_iHs[s] =
677 view.flux_contribution(prec_s, initial_precomputed_, i, U_iHs);
678
679 if constexpr (View::have_source_terms) {
680 S_iH +=
681 stage_weights[s] * view.nodal_source(prec_s, i, U_iHs, tau);
682 }
683 }
684
685 [[maybe_unused]] state_type S_i;
686 state_type F_iH;
687
688 if constexpr (View::have_source_terms) {
689 S_i = view.nodal_source(old_precomputed, i, U_i, tau);
690 S_iH += weight * S_i;
691 U_i_new += tau * /* m_i_inv * m_i */ S_i;
692 F_iH += m_i * S_iH;
693 }
694
695 limiter.reset(i, U_i, flux_i);
696
697 [[maybe_unused]] state_type affine_shift;
698
699 /*
700 * Workaround: For shallow water we need to accumulate an
701 * additional contribution to the affine shift over the stencil
702 * before we can compute limiter bounds.
703 */
704
705 const unsigned int *js = sparsity_simd.columns(i);
706 if constexpr (shallow_water) {
707 for (unsigned int col_idx = 0; col_idx < row_length;
708 ++col_idx, js += stride_size) {
709
710 const auto U_j = old_U.template get_tensor<T>(js);
711 const auto flux_j = view.flux_contribution(
712 old_precomputed, initial_precomputed_, js, U_j);
713
714 const auto d_ij = dij_matrix_.template get_entry<T>(i, col_idx);
715 const auto c_ij = cij_matrix.template get_tensor<T>(i, col_idx);
716
717 const auto B_ij = view.affine_shift(flux_i, flux_j, c_ij, d_ij);
718 affine_shift += B_ij;
719 }
720
721 affine_shift *= tau * m_i_inv;
722 }
723
724 if constexpr (View::have_source_terms) {
725 affine_shift += tau * /* m_i_inv * m_i */ S_i;
726 }
727
728 js = sparsity_simd.columns(i);
729 for (unsigned int col_idx = 0; col_idx < row_length;
730 ++col_idx, js += stride_size) {
731
732 const auto U_j = old_U.template get_tensor<T>(js);
733
734 const auto alpha_j = get_entry<T>(alpha_, js);
735
736 const auto d_ij = dij_matrix_.template get_entry<T>(i, col_idx);
737 auto factor = (alpha_i + alpha_j) * Number(.5);
738
739 if constexpr (have_discontinuous_ansatz) {
740 const auto incidence_ij =
741 incidence_matrix.template get_entry<T>(i, col_idx);
742 factor = std::max(factor, incidence_ij);
743 }
744
745 const auto d_ijH = d_ij * factor;
746
747#ifdef DEBUG_SYMMETRY_CHECK
748 /*
749 * Verify that all local chunks of the d_ij matrix have been
750 * computed consistently over all MPI ranks. For that we import
751 * all ghost rows from neighboring MPI ranks and simply check
752 * that the (local) values of d_ij and d_ji match.
753 */
754 const auto d_ji =
755 dij_matrix_.template get_transposed_entry<T>(i, col_idx);
756 Assert(std::max(std::abs(d_ij - d_ji), T(1.0e-12)) == T(1.0e-12),
757 dealii::ExcMessage(
758 "d_ij not symmetrized correctly over MPI ranks"));
759#endif
760
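/*
 * Guard against division by a (near) zero d_ij: if |d_ij| < eps^2 the
 * scaled direction c_ij / d_ij is replaced by zero.
 */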
761 const auto c_ij = cij_matrix.template get_tensor<T>(i, col_idx);
762 constexpr auto eps = std::numeric_limits<Number>::epsilon();
763 const auto scale = dealii::compare_and_apply_mask<
764 dealii::SIMDComparison::less_than>(
765 std::abs(d_ij), T(eps * eps), T(0.), T(1.) / d_ij);
766 const auto scaled_c_ij = c_ij * scale;
767
768 const auto flux_j = view.flux_contribution(
769 old_precomputed, initial_precomputed_, js, U_j);
770
771 const auto m_ij = mass_matrix.template get_entry<T>(i, col_idx);
772
773 /*
774 * Compute low-order flux and limiter bounds:
775 */
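/*
 * Accumulated over the stencil, the low-order update below reads
 * U_i_new += tau / m_i * (flux_ij + d_ij * (U_j - U_i)), with the
 * equilibrated states U_star_ji / U_star_ij replacing U_j / U_i in the
 * shallow water case; F_iH and P_ij collect the corresponding
 * high-order correction terms.
 */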
776
777 const auto flux_ij = view.flux_divergence(flux_i, flux_j, c_ij);
778 U_i_new += tau * m_i_inv * flux_ij;
779 auto P_ij = -flux_ij;
780
781 if constexpr (shallow_water) {
782 /*
783 * Workaround: Shallow water (and related systems) are special:
784 */
785
786 const auto &[U_star_ij, U_star_ji] =
787 view.equilibrated_states(flux_i, flux_j);
788
789 U_i_new += tau * m_i_inv * d_ij * (U_star_ji - U_star_ij);
790 F_iH += d_ijH * (U_star_ji - U_star_ij);
791 P_ij += (d_ijH - d_ij) * (U_star_ji - U_star_ij);
792
793 limiter.accumulate(
794 U_j, U_star_ij, U_star_ji, scaled_c_ij, affine_shift);
795
796 } else {
797
798 U_i_new += tau * m_i_inv * d_ij * (U_j - U_i);
799 F_iH += d_ijH * (U_j - U_i);
800 P_ij += (d_ijH - d_ij) * (U_j - U_i);
801
802 limiter.accumulate(js, U_j, flux_j, scaled_c_ij, affine_shift);
803 }
804
805 if constexpr (View::have_source_terms) {
806 F_iH -= m_ij * S_iH;
807 P_ij -= m_ij * /*sic!*/ S_i;
808 }
809
810 /*
811 * Compute high-order fluxes and source terms:
812 */
813
814 if constexpr (View::have_high_order_flux) {
815 const auto high_order_flux_ij =
816 view.high_order_flux_divergence(flux_i, flux_j, c_ij);
817 F_iH += weight * high_order_flux_ij;
818 P_ij += weight * high_order_flux_ij;
819 } else {
820 F_iH += weight * flux_ij;
821 P_ij += weight * flux_ij;
822 }
823
824 if constexpr (View::have_source_terms) {
825 const auto S_j = view.nodal_source(old_precomputed, js, U_j, tau);
826 F_iH += weight * m_ij * S_j;
827 P_ij += weight * m_ij * S_j;
828 }
829
830 for (int s = 0; s < stages; ++s) {
831 const auto &[U_s, prec_s, V_s] = stage_state_vectors[s].get();
832
833 const auto U_jHs = U_s.template get_tensor<T>(js);
834 const auto flux_jHs = view.flux_contribution(
835 prec_s, initial_precomputed_, js, U_jHs);
836
837 if constexpr (View::have_high_order_flux) {
838 const auto high_order_flux_ij = view.high_order_flux_divergence(
839 flux_iHs[s], flux_jHs, c_ij);
840 F_iH += stage_weights[s] * high_order_flux_ij;
841 P_ij += stage_weights[s] * high_order_flux_ij;
842 } else {
843 const auto flux_ij =
844 view.flux_divergence(flux_iHs[s], flux_jHs, c_ij);
845 F_iH += stage_weights[s] * flux_ij;
846 P_ij += stage_weights[s] * flux_ij;
847 }
848
849 if constexpr (View::have_source_terms) {
850 const auto S_js = view.nodal_source(prec_s, js, U_jHs, tau);
851 F_iH += stage_weights[s] * m_ij * S_js;
852 P_ij += stage_weights[s] * m_ij * S_js;
853 }
854 }
855
856 pij_matrix_.write_entry(P_ij, i, col_idx, true);
857 }
858
859#ifdef DEBUG_EXPENSIVE_BOUNDS_CHECK
860 if (!view.is_admissible(U_i_new)) {
861 restart_needed = true;
862 }
863#endif
864
865 new_U.template write_tensor<T>(U_i_new, i);
866 r_.template write_tensor<T>(F_iH, i);
867
868 const auto hd_i = m_i * measure_of_omega_inverse;
869 const auto relaxed_bounds = limiter.bounds(hd_i);
870 bounds_.template write_tensor<T>(relaxed_bounds, i);
871 }
872 };
873
874 /*
875 * Chain through a compile time integral constant std::true_type for
876 * a discontinuous ansatz and std::false_type otherwise. We use the
877 * (constexpr) integral constant later on to avoid branching when
878 * computing d_ijH.
879 */
880 if (offline_data_->discretization().have_discontinuous_ansatz()) {
881 /* Parallel non-vectorized loop and vectorized SIMD loop: */
882 loop(Number(), std::true_type{}, n_internal, n_owned);
883 loop(VA(), std::true_type{}, 0, n_internal);
884 } else {
885 /* Parallel non-vectorized loop and vectorized SIMD loop: */
886 loop(Number(), std::false_type{}, n_internal, n_owned);
887 loop(VA(), std::false_type{}, 0, n_internal);
888 }
889
890 LIKWID_MARKER_STOP(("time_step_" + std::to_string(step_no)).c_str());
892 }
893
894 /*
895 * -------------------------------------------------------------------------
896 * Step 5: Compute second part of P_ij, and l_ij (first round):
897 * -------------------------------------------------------------------------
898 */
899
900 if (limiter_parameters_.iterations() != 0) {
901 Scope scope(computing_timer_, scoped_name("compute p_ij, and l_ij"));
902
903 SynchronizationDispatch synchronization_dispatch([&]() {
904 lij_matrix_.update_ghost_rows_start(channel++);
905 lij_matrix_.update_ghost_rows_finish();
906 });
907
909 LIKWID_MARKER_START(("time_step_" + std::to_string(step_no)).c_str());
910
911 auto loop = [&](auto sentinel,
912 auto have_discontinuous_ansatz,
913 unsigned int left,
914 unsigned int right) {
915 using T = decltype(sentinel);
916 using View =
917 typename Description::template HyperbolicSystemView<dim, T>;
918 using Limiter = typename Description::template Limiter<dim, T>;
919
920 unsigned int stride_size = get_stride_size<T>;
921
922 /* Stored thread locally: */
923 Limiter limiter(
924 *hyperbolic_system_, limiter_parameters_, old_precomputed);
925 bool thread_ready = false;
926
928 for (unsigned int i = left; i < right; i += stride_size) {
929
930 /* Skip constrained degrees of freedom: */
931 const unsigned int row_length = sparsity_simd.row_length(i);
932 if (row_length == 1)
933 continue;
934
935 synchronization_dispatch.check(
936 thread_ready, i >= n_export_indices && i < n_internal);
937
938 auto bounds =
939 bounds_.template get_tensor<T, std::array<T, n_bounds>>(i);
940
941 /*
942 * In case of a discontinuous finite element ansatz we need to
943 * extend bounds over the stencil. We do this by looping over the
944 * stencil once and taking the minimum/maximum:
945 */
946 if constexpr (have_discontinuous_ansatz) {
947 /* Skip diagonal. */
948 const unsigned int *js = sparsity_simd.columns(i) + stride_size;
949 for (unsigned int col_idx = 1; col_idx < row_length;
950 ++col_idx, js += stride_size) {
951 bounds = limiter.combine_bounds(
952 bounds,
953 bounds_.template get_tensor<T, std::array<T, n_bounds>>(js));
954 }
955 bounds_.template write_tensor<T>(bounds, i);
956 }
957
958 [[maybe_unused]] T m_i;
959 if constexpr (have_discontinuous_ansatz)
960 m_i = get_entry<T>(lumped_mass_matrix, i);
961 const auto m_i_inv = get_entry<T>(lumped_mass_matrix_inverse, i);
962
963 const auto U_i_new = new_U.template get_tensor<T>(i);
964
965 const auto F_iH = r_.template get_tensor<T>(i);
966
967 const auto lambda_inv = Number(row_length - 1);
968 const auto factor = tau * m_i_inv * lambda_inv;
969
970 /* Skip diagonal. */
971 const unsigned int *js = sparsity_simd.columns(i) + stride_size;
972 for (unsigned int col_idx = 1; col_idx < row_length;
973 ++col_idx, js += stride_size) {
974
975 auto P_ij = pij_matrix_.template get_tensor<T>(i, col_idx);
976 const auto F_jH = r_.template get_tensor<T>(js);
977
978 /*
979 * Mass matrix correction:
980 */
981
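/*
 * The coefficients b_ij below correct for the difference between the
 * consistent and the lumped mass matrix: with a discontinuous ansatz the
 * full mass-matrix inverse is available and b_ij = m_i * (M^-1)_ij -
 * delta_ij; otherwise a truncated Neumann series is used,
 * b_ij = delta_ij - m_ij / m_j.
 */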
982 const auto kronecker_ij = col_idx == 0 ? T(1.) : T(0.);
983
984 if constexpr (have_discontinuous_ansatz) {
985 /* Use full consistent mass matrix inverse: */
986
987 const auto m_j = get_entry<T>(lumped_mass_matrix, js);
988 const auto m_ij_inv =
989 mass_matrix_inverse.template get_entry<T>(i, col_idx);
990 const auto b_ij = m_i * m_ij_inv - kronecker_ij;
991 const auto b_ji = m_j * m_ij_inv - kronecker_ij;
992
993 P_ij += b_ij * F_jH - b_ji * F_iH;
994
995 } else {
996 /* Use Neumann series expansion: */
997
998 const auto m_j_inv = get_entry<T>(lumped_mass_matrix_inverse, js);
999 const auto m_ij = mass_matrix.template get_entry<T>(i, col_idx);
1000 const auto b_ij = kronecker_ij - m_ij * m_j_inv;
1001 const auto b_ji = kronecker_ij - m_ij * m_i_inv;
1002
1003 P_ij += b_ij * F_jH - b_ji * F_iH;
1004 }
1005
1006 P_ij *= factor;
1007 pij_matrix_.write_entry(P_ij, i, col_idx);
1008
1009 /*
1010 * Compute limiter coefficients:
1011 */
1012
1013 const auto &[l_ij, success] = limiter.limit(bounds, U_i_new, P_ij);
1014 lij_matrix_.template write_entry<T>(l_ij, i, col_idx, true);
1015
1016 /*
1017 * If success is set to false then the low-order update
1018 * resulted in a state outside of the limiter bounds. This can
1019 * happen if we compute with an aggressive CFL number. We
1020 * signal this condition by setting the restart_needed boolean
1021 * to true and defer further action to the chosen
1022 * IDViolationStrategy and the policy set in the
1023 * TimeIntegrator.
1024 */
1025 if (!success)
1026 restart_needed = true;
1027 }
1028 }
1029 };
1030
1031 /*
1032 * Chain through a compile time integral constant std::true_type for
1033 * a discontinuous ansatz and std::false_type otherwise. We use the
1034 * (constexpr) integral constant later on to avoid branching when
1035 * extending the limiter bounds and applying the mass-matrix correction.
1036 */
1037 if (offline_data_->discretization().have_discontinuous_ansatz()) {
1038 /* Parallel non-vectorized loop and vectorized SIMD loop: */
1039 loop(Number(), std::true_type{}, n_internal, n_owned);
1040 loop(VA(), std::true_type{}, 0, n_internal);
1041 } else {
1042 /* Parallel non-vectorized loop and vectorized SIMD loop: */
1043 loop(Number(), std::false_type{}, n_internal, n_owned);
1044 loop(VA(), std::false_type{}, 0, n_internal);
1045 }
1046
1047 LIKWID_MARKER_STOP(("time_step_" + std::to_string(step_no)).c_str());
1049 }
1050
1051 /*
1052 * -------------------------------------------------------------------------
1053 * Step 6, 7: Perform high-order update:
1054 *
1055 * Symmetrize l_ij
1056 * High-order update: += l_ij * lambda * P_ij
1057 * Compute next l_ij
1058 * -------------------------------------------------------------------------
1059 */
1060
1061 const auto n_iterations = limiter_parameters_.iterations();
1062 for (unsigned int pass = 0; pass < n_iterations; ++pass) {
1063 bool last_round = (pass + 1 == n_iterations);
1064
1065 std::string additional_step = (last_round ? "" : ", next l_ij");
1066 Scope scope(
1067 computing_timer_,
1068 scoped_name("symmetrize l_ij, h.-o. update" + additional_step));
1069
1070 if ((n_iterations == 2) && last_round) {
1071 std::swap(lij_matrix_, lij_matrix_next_);
1072 }
1073
1074 SynchronizationDispatch synchronization_dispatch([&]() {
1075 if (!last_round) {
1076 lij_matrix_next_.update_ghost_rows_start(channel++);
1077 lij_matrix_next_.update_ghost_rows_finish();
1078 }
1079 });
1080
1082 LIKWID_MARKER_START(("time_step_" + std::to_string(step_no)).c_str());
1083
1084 auto loop = [&](auto sentinel, unsigned int left, unsigned int right) {
1085 using T = decltype(sentinel);
1086 using View =
1087 typename Description::template HyperbolicSystemView<dim, T>;
1088 using Limiter = typename Description::template Limiter<dim, T>;
1089
1090 unsigned int stride_size = get_stride_size<T>;
1091
1092 /* Stored thread locally: */
1093 AlignedVector<T> lij_row;
1094 Limiter limiter(
1095 *hyperbolic_system_, limiter_parameters_, old_precomputed);
1096 bool thread_ready = false;
1097
1099 for (unsigned int i = left; i < right; i += stride_size) {
1100
1101 /* Skip constrained degrees of freedom: */
1102 const unsigned int row_length = sparsity_simd.row_length(i);
1103 if (row_length == 1)
1104 continue;
1105
1106 synchronization_dispatch.check(
1107 thread_ready, i >= n_export_indices && i < n_internal);
1108
1109 auto U_i_new = new_U.template get_tensor<T>(i);
1110
1111 const Number lambda = Number(1.) / Number(row_length - 1);
1112 lij_row.resize_fast(row_length);
1113
1114 /* Skip diagonal. */
1115 for (unsigned int col_idx = 1; col_idx < row_length; ++col_idx) {
1116
1117 const auto l_ij = std::min(
1118 lij_matrix_.template get_entry<T>(i, col_idx),
1119 lij_matrix_.template get_transposed_entry<T>(i, col_idx));
1120
1121 const auto p_ij = pij_matrix_.template get_tensor<T>(i, col_idx);
1122
1123 U_i_new += l_ij * lambda * p_ij;
1124
1125 if (!last_round)
1126 lij_row[col_idx] = l_ij;
1127 }
1128
1129#ifdef DEBUG_EXPENSIVE_BOUNDS_CHECK
1130 const auto view = hyperbolic_system_->template view<dim, T>();
1131 if (!view.is_admissible(U_i_new)) {
1132 restart_needed = true;
1133 }
1134#endif
1135
1136 new_U.template write_tensor<T>(U_i_new, i);
1137
1138 /* Skip computing l_ij and updating p_ij in the last round */
1139 if (last_round)
1140 continue;
1141
1142 const auto bounds =
1143 bounds_.template get_tensor<T, std::array<T, n_bounds>>(i);
1144 /* Skip diagonal. */
1145 for (unsigned int col_idx = 1; col_idx < row_length; ++col_idx) {
1146
1147 const auto old_l_ij = lij_row[col_idx];
1148
1149 const auto new_p_ij =
1150 (T(1.) - old_l_ij) *
1151 pij_matrix_.template get_tensor<T>(i, col_idx);
1152
1153 const auto &[new_l_ij, success] =
1154 limiter.limit(bounds, U_i_new, new_p_ij);
1155
1156 /*
1157 * This is the second pass of the limiter. Under rare
1158 * circumstances the previous high-order update might be
1159 * slightly out of bounds due to roundoff errors. This happens
1160 * for example in flat regions or at stagnation points on a (slip)
1161 * boundary. The limiter should ensure that we do
1162 * not further manipulate the state in this case. We thus only
1163 * signal a restart condition if the `DEBUG_EXPENSIVE_BOUNDS_CHECK`
1164 * macro is defined.
1165 */
1166#ifdef DEBUG_EXPENSIVE_BOUNDS_CHECK
1167 if (!success)
1168 restart_needed = true;
1169#endif
1170
1171 /*
1172 * Shortcut: We omit updating the p_ij and q_ij matrices and
1173 * simply write (1 - l_ij^(1)) * l_ij^(2) into the l_ij matrix.
1174 *
1175 * This approach only works for at most two limiting steps.
1176 */
1177 const auto entry = (T(1.) - old_l_ij) * new_l_ij;
1178 lij_matrix_next_.write_entry(entry, i, col_idx, true);
1179 }
1180 }
1181 };
1182
1183 /* Parallel non-vectorized loop: */
1184 loop(Number(), n_internal, n_owned);
1185 /* Parallel vectorized SIMD loop: */
1186 loop(VA(), 0, n_internal);
1187
1188 LIKWID_MARKER_STOP(("time_step_" + std::to_string(step_no)).c_str());
1190 } /* limiter_iter_ */
1191
1193
1194 /*
1195 * Pass through the parabolic state vector
1196 */
1197 const auto &old_V = std::get<2>(old_state_vector);
1198 auto &new_V = std::get<2>(new_state_vector);
1199 new_V = old_V;
1200
1201 /*
1202 * Do we have to restart?
1203 */
1204
1205 {
1206 Scope scope(computing_timer_,
1207 "time step [H] _ - synchronization barriers");
1208
1209 /*
1210 * Synchronize whether we have to restart the time step. Even though
1211 * the restart condition itself only affects the local ensemble we
1212 * nevertheless need to synchronize the boolean in case we perform
1213 * synchronized global time steps. (Otherwise different ensembles
1214 * might end up with a different time step.)
1215 */
1216 restart_needed.store(Utilities::MPI::logical_or(
1217 restart_needed.load(), mpi_ensemble_.synchronization_communicator()));
1218 }
1219
1220 if (restart_needed) {
1221 switch (id_violation_strategy_) {
1222 case IDViolationStrategy::warn:
1223 n_warnings_++;
1224 break;
1225 case IDViolationStrategy::raise_exception:
1226 n_restarts_++;
1227 throw Restart();
1228 }
1229 }
1230
1231 /* In debug mode poison precomputed values: */
1232 Vectors::debug_poison_precomputed_values<Description>(new_state_vector,
1233 *offline_data_);
1234
1235 /* Return the time step size tau: */
1236 return tau;
1237 }
1238
1239} /* namespace ryujin */
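
For orientation, the following sketch (not part of the file above) shows how the two public entry points, prepare_state_vector() and step(), are typically driven from a caller such as the TimeIntegrator mentioned in the comments. It assumes the surrounding objects have been set up elsewhere; the helper name advance_one_step is hypothetical, and the zero-stage invocation relies on the default arguments tau = 0 and tau_max = std::numeric_limits<Number>::max() of step():

#include "hyperbolic_module.h"

/* Hypothetical helper (sketch only): perform one forward Euler step and
 * return the time-step size that was actually taken. */
template <typename Description, int dim, typename Number>
Number advance_one_step(
    ryujin::HyperbolicModule<Description, dim, Number> &hyperbolic_module,
    typename ryujin::HyperbolicModule<Description, dim, Number>::StateVector
        &old_state_vector,
    typename ryujin::HyperbolicModule<Description, dim, Number>::StateVector
        &new_state_vector,
    Number t)
{
  /* Apply boundary conditions and precompute values at time t: */
  hyperbolic_module.prepare_state_vector(old_state_vector, t);

  /*
   * Perform a single forward Euler step without additional stages. With
   * tau == 0 the module chooses the largest admissible time-step size
   * and returns it. A Restart exception may be thrown depending on the
   * configured IDViolationStrategy; the caller (e.g., the TimeIntegrator)
   * is expected to handle it.
   */
  return hyperbolic_module.template step<0>(
      old_state_vector, {}, {}, new_state_vector);
}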