ryujin 2.1.1 revision 0348cbb53a3e4b1da2a4c037e81f88f2d21ce219
hyperbolic_module.template.h
1//
2// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
3// Copyright (C) 2020 - 2024 by the ryujin authors
4//
5
6#pragma once
7
8#include "hyperbolic_module.h"
9#include "introspection.h"
10#include "mpi_ensemble.h"
11#include "openmp.h"
12#include "scope.h"
13#include "simd.h"
14
16
17#include <atomic>
18
19namespace ryujin
20{
21 namespace ShallowWater
22 {
23 struct Description;
24 }
25
26 using namespace dealii;
27
28 template <typename Description, int dim, typename Number>
29 HyperbolicModule<Description, dim, Number>::HyperbolicModule(
30 const MPIEnsemble &mpi_ensemble,
31 std::map<std::string, dealii::Timer> &computing_timer,
32 const OfflineData<dim, Number> &offline_data,
33 const HyperbolicSystem &hyperbolic_system,
34 const InitialValues<Description, dim, Number> &initial_values,
35 const std::string &subsection /*= "HyperbolicModule"*/)
36 : ParameterAcceptor(subsection)
37 , id_violation_strategy_(IDViolationStrategy::warn)
38 , indicator_parameters_(subsection + "/indicator")
39 , limiter_parameters_(subsection + "/limiter")
40 , riemann_solver_parameters_(subsection + "/riemann solver")
41 , mpi_ensemble_(mpi_ensemble)
42 , computing_timer_(computing_timer)
43 , offline_data_(&offline_data)
44 , hyperbolic_system_(&hyperbolic_system)
45 , initial_values_(&initial_values)
46 , cfl_(0.2)
47 , n_restarts_(0)
48 , n_warnings_(0)
49 {
50 }
51
52
53 template <typename Description, int dim, typename Number>
54 void HyperbolicModule<Description, dim, Number>::prepare()
55 {
56#ifdef DEBUG_OUTPUT
57 std::cout << "HyperbolicModule<Description, dim, Number>::prepare()"
58 << std::endl;
59#endif
60
61 AssertThrow(limiter_parameters_.iterations() <= 2,
62 dealii::ExcMessage(
63 "The number of limiter iterations must be between [0,2]"));
64
65 /* Initialize vectors: */
66
67 const auto &scalar_partitioner = offline_data_->scalar_partitioner();
68 alpha_.reinit(scalar_partitioner);
69 bounds_.reinit_with_scalar_partitioner(scalar_partitioner);
70
71 r_.reinit(offline_data_->hyperbolic_vector_partitioner());
72 using View =
73 typename Description::template HyperbolicSystemView<dim, Number>;
74
75 /* Initialize matrices: */
76
77 const auto &sparsity_simd = offline_data_->sparsity_pattern_simd();
78 dij_matrix_.reinit(sparsity_simd);
79 lij_matrix_.reinit(sparsity_simd);
80 lij_matrix_next_.reinit(sparsity_simd);
81 pij_matrix_.reinit(sparsity_simd);
82
83 /* Set up initial precomputed vector: */
84
85 initial_precomputed_ =
86 initial_values_->interpolate_initial_precomputed_vector();
87 }
88
89
90 /*
91 * -------------------------------------------------------------------------
92 * Step 1: Apply boundary conditions and precompute values
93 * -------------------------------------------------------------------------
94 */
95
96
97 template <typename Description, int dim, typename Number>
98 void HyperbolicModule<Description, dim, Number>::prepare_state_vector(
99 StateVector &state_vector, Number t) const
100 {
101#ifdef DEBUG_OUTPUT
102 std::cout << "HyperbolicModule<Description, dim, "
103 "Number>::prepare_state_vector()"
104 << std::endl;
105#endif
106
107 auto &U = std::get<0>(state_vector);
108 auto &precomputed = std::get<1>(state_vector);
109
110 const unsigned int n_export_indices = offline_data_->n_export_indices();
111 const unsigned int n_internal = offline_data_->n_locally_internal();
112 const unsigned int n_owned = offline_data_->n_locally_owned();
113 const auto &sparsity_simd = offline_data_->sparsity_pattern_simd();
114 const auto &boundary_map = offline_data_->boundary_map();
115 unsigned int channel = 10;
116 using VA = VectorizedArray<Number>;
117
118 Scope scope(computing_timer_,
119 "time step [H] 1 - update boundary values, precompute values");
120
121 LIKWID_MARKER_START("time_step_1a");
122
123 /* FIXME: not thread parallel... */
124 for (const auto &entry : boundary_map) {
125 const auto &[i, normal, normal_mass, boundary_mass, id, position] = entry;
126
127 /*
128 * Relay the task of applying appropriate boundary conditions to the
129 * Problem Description.
130 */
131
132 if (id == Boundary::do_nothing)
133 continue;
134
135 auto U_i = U.get_tensor(i);
136
137 /* Use a lambda to avoid computing unnecessary state values */
138 auto get_dirichlet_data = [position = position, t = t, this]() {
139 return initial_values_->initial_state(position, t);
140 };
141
142 const auto view = hyperbolic_system_->template view<dim, Number>();
143 U_i = view.apply_boundary_conditions(id, U_i, normal, get_dirichlet_data);
144 U.write_tensor(U_i, i);
145 }
146
147 LIKWID_MARKER_STOP("time_step_1a");
148
149 U.update_ghost_values();
150
151 /*
152 * Precompute values
153 */
154
155 if constexpr (n_precomputation_cycles != 0) {
156 for (unsigned int cycle = 0; cycle < n_precomputation_cycles; ++cycle) {
158 SynchronizationDispatch synchronization_dispatch([&]() {
159 precomputed.update_ghost_values_start(channel++);
160 precomputed.update_ghost_values_finish();
161 });
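/*
 * Note: The synchronization dispatch defers the ghost-value exchange of
 * the precomputed vector registered above until the check() call in the
 * loop below signals that a thread has moved past all export indices.
 * This overlaps the MPI communication with the remaining local work.
 */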
162
163 RYUJIN_PARALLEL_REGION_BEGIN
164 LIKWID_MARKER_START(("time_step_1b"));
165
166 auto loop = [&](auto sentinel, unsigned int left, unsigned int right) {
167 using T = decltype(sentinel);
168
169 /* Stored thread locally: */
170 bool thread_ready = false;
171
172 const auto view = hyperbolic_system_->template view<dim, T>();
173 view.precomputation_loop(
174 cycle,
175 [&](const unsigned int i) {
176 synchronization_dispatch.check(
177 thread_ready, i >= n_export_indices && i < n_internal);
178 },
179 sparsity_simd,
180 state_vector,
181 left,
182 right);
183 };
184
185 /* Parallel non-vectorized loop: */
186 loop(Number(), n_internal, n_owned);
187 /* Parallel vectorized SIMD loop: */
188 loop(VA(), 0, n_internal);
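/*
 * Note on the two loop() invocations above: the index range
 * [0, n_internal) is processed in SIMD batches of
 * VectorizedArray<Number>::size() entries, while the remaining locally
 * owned range [n_internal, n_owned) is traversed with scalar Number
 * arithmetic.
 */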
189
190 LIKWID_MARKER_STOP("time_step_1b");
191 RYUJIN_PARALLEL_REGION_END
192 }
193 }
194 }
195
196
197 /*
198 * -------------------------------------------------------------------------
199 * Step 2 - 7: Perform an explicit Euler step
200 * -------------------------------------------------------------------------
201 */
202
203
204 namespace
205 {
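/*
 * Helper: returns whether all column indices js of row i point strictly
 * below the diagonal. In the vectorized case lane k of the SIMD batch
 * corresponds to row i + k, so the check reads js[k] < i + k for every
 * lane.
 */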
210 template <typename T>
211 bool all_below_diagonal(unsigned int i, const unsigned int *js)
212 {
213 if constexpr (std::is_same_v<T, typename get_value_type<T>::type>) {
214 /* Non-vectorized sequential access. */
215 const auto j = *js;
216 return j < i;
217
218 } else {
219 /* Vectorized fast access. The index must be divisible by simd_length. */
220
221 constexpr auto simd_length = T::size();
223 bool all_below_diagonal = true;
224 for (unsigned int k = 0; k < simd_length; ++k)
225 if (js[k] >= i + k) {
226 all_below_diagonal = false;
227 break;
228 }
229 return all_below_diagonal;
230 }
231 }
232 } // namespace
233
234
235 template <typename Description, int dim, typename Number>
236 template <int stages>
237 Number HyperbolicModule<Description, dim, Number>::step(
238 const StateVector &old_state_vector,
239 std::array<std::reference_wrapper<const StateVector>, stages>
240 stage_state_vectors,
241 const std::array<Number, stages> stage_weights,
242 StateVector &new_state_vector,
243 Number tau /*= 0.*/,
244 std::atomic<Number> tau_max /*std::numeric_limits<Number>::max()*/) const
245 {
246#ifdef DEBUG_OUTPUT
247 std::cout << "HyperbolicModule<Description, dim, Number>::step()"
248 << std::endl;
249#endif
250
251 auto &old_U = std::get<0>(old_state_vector);
252 auto &old_precomputed = std::get<1>(old_state_vector);
253 auto &new_U = std::get<0>(new_state_vector);
254
255 CALLGRIND_START_INSTRUMENTATION;
256
257 /*
258 * Workaround: A constexpr boolean storing the fact whether we
259 * instantiate the HyperbolicModule for the shallow water equations.
260 *
261 * Rationale: Currently, the shallow water equations are the only
262 * hyperbolic system for which we have to (a) form equilibrated states
263 * for the low-order update, and (b) apply an affine shift for
264 * computing limiter bounds. It's not so easy to come up with a
265 * meaningful abstraction layer for this (in particular because we only
266 * have one PDE). Thus, for the time being we simply special case a
267 * small amount of code in this routine.
268 *
269 * FIXME: Refactor into a proper abstraction layer / interface.
270 */
271 constexpr bool shallow_water =
272 std::is_same_v<Description, ShallowWater::Description>;
273
274 using VA = VectorizedArray<Number>;
275
276 /* Index ranges for the iteration over the sparsity pattern : */
277
278 constexpr auto simd_length = VA::size();
279 const unsigned int n_export_indices = offline_data_->n_export_indices();
280 const unsigned int n_internal = offline_data_->n_locally_internal();
281 const unsigned int n_owned = offline_data_->n_locally_owned();
282
283 /* References to precomputed matrices and the stencil: */
284
285 const auto &sparsity_simd = offline_data_->sparsity_pattern_simd();
286
287 const auto &mass_matrix = offline_data_->mass_matrix();
288 const auto &mass_matrix_inverse = offline_data_->mass_matrix_inverse();
289 const auto &lumped_mass_matrix = offline_data_->lumped_mass_matrix();
290 const auto &lumped_mass_matrix_inverse =
291 offline_data_->lumped_mass_matrix_inverse();
292
293 const auto &cij_matrix = offline_data_->cij_matrix();
294 const auto &incidence_matrix = offline_data_->incidence_matrix();
295
296 const auto &coupling_boundary_pairs =
297 offline_data_->coupling_boundary_pairs();
298
299 const Number measure_of_omega_inverse =
300 Number(1.) / offline_data_->measure_of_omega();
301
302 /* A monotonically increasing "channel" variable for mpi_tags: */
303 unsigned int channel = 10;
304
305 /* Lambda for creating the computing timer string: */
306 int step_no = 1;
307 const auto scoped_name = [&step_no](const auto &name,
308 const bool advance = true) {
309 advance || step_no--;
310 return "time step [H] " + std::to_string(++step_no) + " - " + name;
311 };
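/*
 * Example: with step_no initialized to 1 the first call,
 * scoped_name("compute d_ij, and alpha_i"), returns
 * "time step [H] 2 - compute d_ij, and alpha_i" and advances the
 * counter; calling with advance == false reuses the current step number
 * instead.
 */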
312
313 /* A boolean signalling that a restart is necessary: */
314 std::atomic<bool> restart_needed = false;
315
316 /*
317 * -------------------------------------------------------------------------
318 * Step 2: Compute off-diagonal d_ij, and alpha_i
319 *
320 * The computation of the d_ij is quite costly. So we do a trick to
321 * save a bit of computational resources. Instead of computing all d_ij
322 * entries for a row of a given local index i, we only compute d_ij for
323 * which j > i,
324 *
325 * llllrr
326 * l .xxxxx
327 * l ..xxxx
328 * l ...xxx
329 * l ....xx
330 * r ......
331 * r ......
332 *
333 * and symmetrize in Step 3.
334 *
335 * MM: We could save a bit more computational resources by only
336 * computing entries for which *IN A GLOBAL* enumeration j > i. But
337 * the index translation, subsequent symmetrization, and exchange
338 * sound a bit too expensive...
339 * -------------------------------------------------------------------------
340 */
341
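/*
 * In the loop below each off-diagonal entry is assembled as
 *
 *   d_ij = ||c_ij|| * lambda_max(U_i, U_j, n_ij),  n_ij = c_ij / ||c_ij||,
 *
 * where lambda_max is the maximal wavespeed estimate returned by the
 * approximate Riemann solver, and alpha_i is the indicator value
 * evaluated with hd_i = m_i / |Omega|.
 */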
342 {
343 Scope scope(computing_timer_, scoped_name("compute d_ij, and alpha_i"));
344
345 SynchronizationDispatch synchronization_dispatch([&]() {
346 alpha_.update_ghost_values_start(channel++);
347 alpha_.update_ghost_values_finish();
348 });
349
350 RYUJIN_PARALLEL_REGION_BEGIN
351 LIKWID_MARKER_START(("time_step_" + std::to_string(step_no)).c_str());
352
353 auto loop = [&](auto sentinel, unsigned int left, unsigned int right) {
354 using T = decltype(sentinel);
355 unsigned int stride_size = get_stride_size<T>;
356
357 /* Stored thread locally: */
358
359 using RiemannSolver =
360 typename Description::template RiemannSolver<dim, T>;
361 RiemannSolver riemann_solver(
362 *hyperbolic_system_, riemann_solver_parameters_, old_precomputed);
363
364 using Indicator = typename Description::template Indicator<dim, T>;
365 Indicator indicator(
366 *hyperbolic_system_, indicator_parameters_, old_precomputed);
367
368 bool thread_ready = false;
369
370 RYUJIN_OMP_FOR
371 for (unsigned int i = left; i < right; i += stride_size) {
372
373 /* Skip constrained degrees of freedom: */
374 const unsigned int row_length = sparsity_simd.row_length(i);
375 if (row_length == 1)
376 continue;
377
378 synchronization_dispatch.check(
379 thread_ready, i >= n_export_indices && i < n_internal);
380
381 const auto U_i = old_U.template get_tensor<T>(i);
382
383 indicator.reset(i, U_i);
384
385 const unsigned int *js = sparsity_simd.columns(i);
386 for (unsigned int col_idx = 0; col_idx < row_length;
387 ++col_idx, js += stride_size) {
388
389 const auto U_j = old_U.template get_tensor<T>(js);
390
391 const auto c_ij = cij_matrix.template get_tensor<T>(i, col_idx);
392
393 indicator.accumulate(js, U_j, c_ij);
394
395 /* Skip diagonal. */
396 if (col_idx == 0)
397 continue;
398
399 /* Only iterate over the upper triangular portion of d_ij */
400 if (all_below_diagonal<T>(i, js))
401 continue;
402
403 const auto norm = c_ij.norm();
404 const auto n_ij = c_ij / norm;
405 const auto lambda_max =
406 riemann_solver.compute(U_i, U_j, i, js, n_ij);
407 const auto d_ij = norm * lambda_max;
408
409 dij_matrix_.write_entry(d_ij, i, col_idx, true);
410 }
411
412 const auto mass = get_entry<T>(lumped_mass_matrix, i);
413 const auto hd_i = mass * measure_of_omega_inverse;
414 write_entry<T>(alpha_, indicator.alpha(hd_i), i);
415 }
416 };
417
418 /* Parallel non-vectorized loop: */
419 loop(Number(), n_internal, n_owned);
420 /* Parallel vectorized SIMD loop: */
421 loop(VA(), 0, n_internal);
422
423 LIKWID_MARKER_STOP(("time_step_" + std::to_string(step_no)).c_str());
424 RYUJIN_PARALLEL_REGION_END
425 }
426
427 /*
428 * -------------------------------------------------------------------------
429 * Step 3: Compute diagonal of d_ij, and maximal time-step size.
430 * -------------------------------------------------------------------------
431 */
432
433 {
434 Scope scope(computing_timer_,
435 scoped_name("compute bdry d_ij, diag d_ii, and tau_max"));
436
437 /* Parallel region */
438 RYUJIN_PARALLEL_REGION_BEGIN
439 LIKWID_MARKER_START(("time_step_" + std::to_string(step_no)).c_str());
440
441 /*
442 * Complete d_ij at boundary:
443 *
444 * Here, for continuous finite elements the assumption c_ij = -c_ji
445 * no longer holds true. This implies that d_ij != d_ji. We thus need
446 * to compute the lower-triangular part of d_ij, where i and j are
447 * boundary degrees of freedom as well.
448 */
449
450 using RiemannSolver =
451 typename Description::template RiemannSolver<dim, Number>;
452 RiemannSolver riemann_solver(
453 *hyperbolic_system_, riemann_solver_parameters_, old_precomputed);
454
455 Number local_tau_max = std::numeric_limits<Number>::max();
456
457 /*
458 * Note: we need this dance of iterating over an integer and then
459 * accessing the element to make Apple's OpenMP implementation
460 * happy.
461 */
462 RYUJIN_OMP_FOR
463 for (std::size_t k = 0; k < coupling_boundary_pairs.size(); ++k) {
464 const auto &[i, col_idx, j] = coupling_boundary_pairs[k];
465
466 /*
467 * Only work on index pairs "i < j" that point to the upper
468 * triangular portion of the d_ij matrix. For all of these index
469 * pairs we compute the corresponding d_ji entry and fix up the
470 * d_ij entry (from step 2) by taking the maximum. Note that we
471 * actually do not store anything in the d_ji entry itself because
472 * we symmetrize the matrix later on anyway.
473 */
474 if (j < i)
475 continue;
476
477 const auto U_i = old_U.get_tensor(i);
478 const auto U_j = old_U.get_tensor(j);
479
480 const auto c_ji = cij_matrix.get_transposed_tensor(i, col_idx);
481 Assert(c_ji.norm() > 1.e-12, ExcInternalError());
482 const auto norm_ji = c_ji.norm();
483 const auto n_ji = c_ji / norm_ji;
484
485 const auto d_ij = dij_matrix_.get_entry(i, col_idx);
486
487 const auto lambda_max = riemann_solver.compute(U_j, U_i, j, &i, n_ji);
488 const auto d_ji = norm_ji * lambda_max;
489
490 dij_matrix_.write_entry(std::max(d_ij, d_ji), i, col_idx);
491 }
492
493 /* Symmetrize d_ij: */
494
495 RYUJIN_OMP_FOR
496 for (unsigned int i = 0; i < n_owned; ++i) {
497
498 /* Skip constrained degrees of freedom: */
499 const unsigned int row_length = sparsity_simd.row_length(i);
500 if (row_length == 1)
501 continue;
502
503 Number d_sum = Number(0.);
504
505 /* skip diagonal: */
506 const unsigned int *js = sparsity_simd.columns(i);
507 for (unsigned int col_idx = 1; col_idx < row_length; ++col_idx) {
508 const auto j =
509 *(i < n_internal ? js + col_idx * simd_length : js + col_idx);
510
511 // fill lower triangular part of dij_matrix missing from step 2
512 if (j < i) {
513 const auto d_ji = dij_matrix_.get_transposed_entry(i, col_idx);
514
515#ifdef DEBUG
516 /* Verify that d_ji == std::max(d_ij, d_ji): */
517
518 const auto U_i = old_U.get_tensor(i);
519 const auto U_j = old_U.get_tensor(j);
520
521 const auto c_ij = cij_matrix.get_tensor(i, col_idx);
522 Assert(c_ij.norm() > 1.e-12, ExcInternalError());
523 const auto norm_ij = c_ij.norm();
524 const auto n_ij = c_ij / norm_ij;
525
526 const auto lambda_max =
527 riemann_solver.compute(U_i, U_j, i, &j, n_ij);
528 const auto d_ij = norm_ij * lambda_max;
529
530 Assert(d_ij <= d_ji + 1.0e-12,
531 dealii::ExcMessage("d_ij not symmetrized correctly on "
532 "boundary degrees of freedom."));
533#endif
534
535 dij_matrix_.write_entry(d_ji, i, col_idx);
536 }
537
538 d_sum -= dij_matrix_.get_entry(i, col_idx);
539 }
540
541 /*
542 * Make sure that we do not accidentally divide by zero. (Yes, this
543 * can happen for some (admittedly, rather esoteric) scalar
544 * conservation equations...).
545 */
546 d_sum =
547 std::min(d_sum, Number(-1.e6) * std::numeric_limits<Number>::min());
548
549 /* write diagonal element */
550 dij_matrix_.write_entry(d_sum, i, 0);
551
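/*
 * Local time-step bound: with the diagonal entry d_ii = d_sum the nodal
 * CFL condition below reads tau_i = cfl_ * m_i / (-2 * d_ii); the
 * minimum over all rows is the thread-local candidate for tau_max.
 */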
552 const Number mass = lumped_mass_matrix.local_element(i);
553 const Number tau = cfl_ * mass / (Number(-2.) * d_sum);
554 local_tau_max = std::min(local_tau_max, tau);
555 }
556
557 /* Synchronize tau max over all threads: */
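/*
 * The compare-exchange loop below is a lock-free min-reduction: a
 * thread lowers the shared tau_max only if its local bound is smaller;
 * on failure compare_exchange_weak reloads current_tau_max, and the
 * loop exits once tau_max <= local_tau_max holds.
 */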
558 Number current_tau_max = tau_max.load();
559 while (current_tau_max > local_tau_max &&
560 !tau_max.compare_exchange_weak(current_tau_max, local_tau_max))
561 ;
562
563 LIKWID_MARKER_STOP(("time_step_" + std::to_string(step_no)).c_str());
564 RYUJIN_PARALLEL_REGION_END
565 }
566
567 {
568 Scope scope(computing_timer_,
569 "time step [H] _ - synchronization barriers");
570
571 /*
572 * MPI Barrier: Synchronize the maximal time-step size. This has to
573 * happen either over the global or the local subrange communicator:
574 */
575 tau_max.store(Utilities::MPI::min(
576 tau_max.load(), mpi_ensemble_.synchronization_communicator()));
577
578 AssertThrow(
579 !std::isnan(tau_max) && !std::isinf(tau_max) && tau_max > 0.,
580 ExcMessage(
581 "I'm sorry, Dave. I'm afraid I can't do that.\nWe crashed."));
582
583 tau = (tau == Number(0.) ? tau_max.load() : tau);
584
585#ifdef DEBUG_OUTPUT
586 std::cout << " computed tau_max = " << tau_max << std::endl;
587 std::cout << " perform time-step with tau = " << tau << std::endl;
588#endif
589 }
590
591#ifdef DEBUG
592 /* Exchange d_ij so that we can check for symmetry: */
593 dij_matrix_.update_ghost_rows();
594#endif
595
596 /*
597 * -------------------------------------------------------------------------
598 * Step 4: Low-order update, also compute limiter bounds, R_i
599 * -------------------------------------------------------------------------
600 */
601
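/*
 * Schematically, the loop below performs the provisional low-order
 * update
 *
 *   U_i_new = U_i + tau / m_i * sum_j (flux_ij + d_ij * (U_j - U_i)),
 *
 * with flux_ij = flux_divergence(flux_i, flux_j, c_ij), accumulates the
 * high-order residual F_iH (stored in r_) used for the subsequent flux
 * correction, and collects the limiter bounds for each node. (For
 * shallow water the d_ij term acts on equilibrated states instead, and
 * source terms are added when present.)
 */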
602 {
603 Scope scope(computing_timer_,
604 scoped_name("l.-o. update, compute bounds, r_i, and p_ij"));
605
606 SynchronizationDispatch synchronization_dispatch([&]() {
607 r_.update_ghost_values_start(channel++);
608 r_.update_ghost_values_finish();
609 if (offline_data_->discretization().have_discontinuous_ansatz()) {
610 /*
611 * In case we extend bounds over the stencil, we have to ensure
612 * that ghost ranges are properly communicated over all MPI
613 * ranks.
614 */
615 bounds_.update_ghost_values_start(channel++);
616 bounds_.update_ghost_values_finish();
617 }
618 });
619
620 const Number weight =
621 -std::accumulate(stage_weights.begin(), stage_weights.end(), -1.);
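/*
 * weight == 1 - (stage_weights[0] + ... + stage_weights[stages - 1]),
 * i.e., the remaining weight with which the current state contributes
 * to the high-order fluxes and sources below.
 */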
622
623 /* Parallel region */
624 RYUJIN_PARALLEL_REGION_BEGIN
625 LIKWID_MARKER_START(("time_step_" + std::to_string(step_no)).c_str());
626
627 auto loop = [&](auto sentinel,
628 auto have_discontinuous_ansatz,
629 unsigned int left,
630 unsigned int right) {
631 using T = decltype(sentinel);
632 using View =
633 typename Description::template HyperbolicSystemView<dim, T>;
634 using Limiter = typename Description::template Limiter<dim, T>;
635 using flux_contribution_type = typename View::flux_contribution_type;
636 using state_type = typename View::state_type;
637
638 unsigned int stride_size = get_stride_size<T>;
639
640 const auto view = hyperbolic_system_->template view<dim, T>();
641
642 /* Stored thread locally: */
643 Limiter limiter(
644 *hyperbolic_system_, limiter_parameters_, old_precomputed);
645 bool thread_ready = false;
646
647 RYUJIN_OMP_FOR
648 for (unsigned int i = left; i < right; i += stride_size) {
649
650 /* Skip constrained degrees of freedom: */
651 const unsigned int row_length = sparsity_simd.row_length(i);
652 if (row_length == 1)
653 continue;
654
655 synchronization_dispatch.check(
656 thread_ready, i >= n_export_indices && i < n_internal);
657
658 const auto U_i = old_U.template get_tensor<T>(i);
659 auto U_i_new = U_i;
660
661 const auto alpha_i = get_entry<T>(alpha_, i);
662 const auto m_i = get_entry<T>(lumped_mass_matrix, i);
663 const auto m_i_inv = get_entry<T>(lumped_mass_matrix_inverse, i);
664
665 const auto flux_i = view.flux_contribution(
666 old_precomputed, initial_precomputed_, i, U_i);
667
668 std::array<flux_contribution_type, stages> flux_iHs;
669 [[maybe_unused]] state_type S_iH;
670
671 for (int s = 0; s < stages; ++s) {
672 const auto &[U_s, prec_s, V_s] = stage_state_vectors[s].get();
673
674 const auto U_iHs = U_s.template get_tensor<T>(i);
675 flux_iHs[s] =
676 view.flux_contribution(prec_s, initial_precomputed_, i, U_iHs);
677
678 if constexpr (View::have_source_terms) {
679 S_iH +=
680 stage_weights[s] * view.nodal_source(prec_s, i, U_iHs, tau);
681 }
682 }
683
684 [[maybe_unused]] state_type S_i;
685 state_type F_iH;
686
687 if constexpr (View::have_source_terms) {
688 S_i = view.nodal_source(old_precomputed, i, U_i, tau);
689 S_iH += weight * S_i;
690 U_i_new += tau * /* m_i_inv * m_i */ S_i;
691 F_iH += m_i * S_iH;
692 }
693
694 limiter.reset(i, U_i, flux_i);
695
696 [[maybe_unused]] state_type affine_shift;
697
698 /*
699 * Workaround: For shallow water we need to accumulate an
700 * additional contribution to the affine shift over the stencil
701 * before we can compute limiter bounds.
702 */
703
704 const unsigned int *js = sparsity_simd.columns(i);
705 if constexpr (shallow_water) {
706 for (unsigned int col_idx = 0; col_idx < row_length;
707 ++col_idx, js += stride_size) {
708
709 const auto U_j = old_U.template get_tensor<T>(js);
710 const auto flux_j = view.flux_contribution(
711 old_precomputed, initial_precomputed_, js, U_j);
712
713 const auto d_ij = dij_matrix_.template get_entry<T>(i, col_idx);
714 const auto c_ij = cij_matrix.template get_tensor<T>(i, col_idx);
715
716 const auto B_ij = view.affine_shift(flux_i, flux_j, c_ij, d_ij);
717 affine_shift += B_ij;
718 }
719
720 affine_shift *= tau * m_i_inv;
721 }
722
723 if constexpr (View::have_source_terms) {
724 affine_shift += tau * /* m_i_inv * m_i */ S_i;
725 }
726
727 js = sparsity_simd.columns(i);
728 for (unsigned int col_idx = 0; col_idx < row_length;
729 ++col_idx, js += stride_size) {
730
731 const auto U_j = old_U.template get_tensor<T>(js);
732
733 const auto alpha_j = get_entry<T>(alpha_, js);
734
735 const auto d_ij = dij_matrix_.template get_entry<T>(i, col_idx);
736 auto factor = (alpha_i + alpha_j) * Number(.5);
737
738 if constexpr (have_discontinuous_ansatz) {
739 const auto incidence_ij =
740 incidence_matrix.template get_entry<T>(i, col_idx);
741 factor = std::max(factor, incidence_ij);
742 }
743
744 const auto d_ijH = d_ij * factor;
745
746#ifdef DEBUG
747 /*
748 * Verify that all local chunks of the d_ij matrix have been
749 * computed consistently over all MPI ranks. For that we import
750 * all ghost rows from neighboring MPI ranks and simply check
751 * that the (local) values of d_ij and d_ji match.
752 */
753 const auto d_ji =
754 dij_matrix_.template get_transposed_entry<T>(i, col_idx);
755 Assert(std::max(std::abs(d_ij - d_ji), T(1.0e-12)) == T(1.0e-12),
756 dealii::ExcMessage(
757 "d_ij not symmetrized correctly over MPI ranks"));
758#endif
759
760 const auto c_ij = cij_matrix.template get_tensor<T>(i, col_idx);
761 constexpr auto eps = std::numeric_limits<Number>::epsilon();
762 const auto scale = dealii::compare_and_apply_mask<
763 dealii::SIMDComparison::less_than>(
764 std::abs(d_ij), T(eps * eps), T(0.), T(1.) / d_ij);
765 const auto scaled_c_ij = c_ij * scale;
766
767 const auto flux_j = view.flux_contribution(
768 old_precomputed, initial_precomputed_, js, U_j);
769
770 const auto m_ij = mass_matrix.template get_entry<T>(i, col_idx);
771
772 /*
773 * Compute low-order flux and limiter bounds:
774 */
775
776 const auto flux_ij = view.flux_divergence(flux_i, flux_j, c_ij);
777 U_i_new += tau * m_i_inv * flux_ij;
778 auto P_ij = -flux_ij;
779
780 if constexpr (shallow_water) {
781 /*
782 * Workaround: Shallow water (and related) are special:
783 */
784
785 const auto &[U_star_ij, U_star_ji] =
786 view.equilibrated_states(flux_i, flux_j);
787
788 U_i_new += tau * m_i_inv * d_ij * (U_star_ji - U_star_ij);
789 F_iH += d_ijH * (U_star_ji - U_star_ij);
790 P_ij += (d_ijH - d_ij) * (U_star_ji - U_star_ij);
791
792 limiter.accumulate(
793 U_j, U_star_ij, U_star_ji, scaled_c_ij, affine_shift);
794
795 } else {
796
797 U_i_new += tau * m_i_inv * d_ij * (U_j - U_i);
798 F_iH += d_ijH * (U_j - U_i);
799 P_ij += (d_ijH - d_ij) * (U_j - U_i);
800
801 limiter.accumulate(js, U_j, flux_j, scaled_c_ij, affine_shift);
802 }
803
804 if constexpr (View::have_source_terms) {
805 F_iH -= m_ij * S_iH;
806 P_ij -= m_ij * /*sic!*/ S_i;
807 }
808
809 /*
810 * Compute high-order fluxes and source terms:
811 */
812
813 if constexpr (View::have_high_order_flux) {
814 const auto high_order_flux_ij =
815 view.high_order_flux_divergence(flux_i, flux_j, c_ij);
816 F_iH += weight * high_order_flux_ij;
817 P_ij += weight * high_order_flux_ij;
818 } else {
819 F_iH += weight * flux_ij;
820 P_ij += weight * flux_ij;
821 }
822
823 if constexpr (View::have_source_terms) {
824 const auto S_j = view.nodal_source(old_precomputed, js, U_j, tau);
825 F_iH += weight * m_ij * S_j;
826 P_ij += weight * m_ij * S_j;
827 }
828
829 for (int s = 0; s < stages; ++s) {
830 const auto &[U_s, prec_s, V_s] = stage_state_vectors[s].get();
831
832 const auto U_jHs = U_s.template get_tensor<T>(js);
833 const auto flux_jHs = view.flux_contribution(
834 prec_s, initial_precomputed_, js, U_jHs);
835
836 if constexpr (View::have_high_order_flux) {
837 const auto high_order_flux_ij = view.high_order_flux_divergence(
838 flux_iHs[s], flux_jHs, c_ij);
839 F_iH += stage_weights[s] * high_order_flux_ij;
840 P_ij += stage_weights[s] * high_order_flux_ij;
841 } else {
842 const auto flux_ij =
843 view.flux_divergence(flux_iHs[s], flux_jHs, c_ij);
844 F_iH += stage_weights[s] * flux_ij;
845 P_ij += stage_weights[s] * flux_ij;
846 }
847
848 if constexpr (View::have_source_terms) {
849 const auto S_js = view.nodal_source(prec_s, js, U_jHs, tau);
850 F_iH += stage_weights[s] * m_ij * S_js;
851 P_ij += stage_weights[s] * m_ij * S_js;
852 }
853 }
854
855 pij_matrix_.write_entry(P_ij, i, col_idx, true);
856 }
857
858#ifdef EXPENSIVE_BOUNDS_CHECK
859 if (!view.is_admissible(U_i_new)) {
860 restart_needed = true;
861 }
862#endif
863
864 new_U.template write_tensor<T>(U_i_new, i);
865 r_.template write_tensor<T>(F_iH, i);
866
867 const auto hd_i = m_i * measure_of_omega_inverse;
868 const auto relaxed_bounds = limiter.bounds(hd_i);
869 bounds_.template write_tensor<T>(relaxed_bounds, i);
870 }
871 };
872
873 /*
874 * Chain through a compile time integral constant std::true_type for
875 * a discontinuous ansatz and std::false_type otherwise. We use the
876 * (constexpr) integral constant later on to avoid branching when
877 * computing d_ijH.
878 */
879 if (offline_data_->discretization().have_discontinuous_ansatz()) {
880 /* Parallel non-vectorized loop and vectorized SIMD loop: */
881 loop(Number(), std::true_type{}, n_internal, n_owned);
882 loop(VA(), std::true_type{}, 0, n_internal);
883 } else {
884 /* Parallel non-vectorized loop and vectorized SIMD loop: */
885 loop(Number(), std::false_type{}, n_internal, n_owned);
886 loop(VA(), std::false_type{}, 0, n_internal);
887 }
888
889 LIKWID_MARKER_STOP(("time_step_" + std::to_string(step_no)).c_str());
890 RYUJIN_PARALLEL_REGION_END
891 }
892
893 /*
894 * -------------------------------------------------------------------------
895 * Step 5: Compute second part of P_ij, and l_ij (first round):
896 * -------------------------------------------------------------------------
897 */
898
899 if (limiter_parameters_.iterations() != 0) {
900 Scope scope(computing_timer_, scoped_name("compute p_ij, and l_ij"));
901
902 SynchronizationDispatch synchronization_dispatch([&]() {
903 lij_matrix_.update_ghost_rows_start(channel++);
904 lij_matrix_.update_ghost_rows_finish();
905 });
906
907 RYUJIN_PARALLEL_REGION_BEGIN
908 LIKWID_MARKER_START(("time_step_" + std::to_string(step_no)).c_str());
909
910 auto loop = [&](auto sentinel,
911 auto have_discontinuous_ansatz,
912 unsigned int left,
913 unsigned int right) {
914 using T = decltype(sentinel);
915 using View =
916 typename Description::template HyperbolicSystemView<dim, T>;
917 using Limiter = typename Description::template Limiter<dim, T>;
918
919 unsigned int stride_size = get_stride_size<T>;
920
921 /* Stored thread locally: */
922 Limiter limiter(
923 *hyperbolic_system_, limiter_parameters_, old_precomputed);
924 bool thread_ready = false;
925
926 RYUJIN_OMP_FOR
927 for (unsigned int i = left; i < right; i += stride_size) {
928
929 /* Skip constrained degrees of freedom: */
930 const unsigned int row_length = sparsity_simd.row_length(i);
931 if (row_length == 1)
932 continue;
933
934 synchronization_dispatch.check(
935 thread_ready, i >= n_export_indices && i < n_internal);
936
937 auto bounds =
938 bounds_.template get_tensor<T, std::array<T, n_bounds>>(i);
939
940 /*
941 * In case of a discontinuous finite element ansatz we need to
942 * extend bounds over the stencil. We do this by looping over the
943 * stencil once and taking the minimum/maximum:
944 */
945 if constexpr (have_discontinuous_ansatz) {
946 /* Skip diagonal. */
947 const unsigned int *js = sparsity_simd.columns(i) + stride_size;
948 for (unsigned int col_idx = 1; col_idx < row_length;
949 ++col_idx, js += stride_size) {
950 bounds = limiter.combine_bounds(
951 bounds,
952 bounds_.template get_tensor<T, std::array<T, n_bounds>>(js));
953 }
954 bounds_.template write_tensor<T>(bounds, i);
955 }
956
957 [[maybe_unused]] T m_i;
958 if constexpr (have_discontinuous_ansatz)
959 m_i = get_entry<T>(lumped_mass_matrix, i);
960 const auto m_i_inv = get_entry<T>(lumped_mass_matrix_inverse, i);
961
962 const auto U_i_new = new_U.template get_tensor<T>(i);
963
964 const auto F_iH = r_.template get_tensor<T>(i);
965
966 const auto lambda_inv = Number(row_length - 1);
967 const auto factor = tau * m_i_inv * lambda_inv;
968
969 /* Skip diagonal. */
970 const unsigned int *js = sparsity_simd.columns(i) + stride_size;
971 for (unsigned int col_idx = 1; col_idx < row_length;
972 ++col_idx, js += stride_size) {
973
974 auto P_ij = pij_matrix_.template get_tensor<T>(i, col_idx);
975 const auto F_jH = r_.template get_tensor<T>(js);
976
977 /*
978 * Mass matrix correction:
979 */
980
981 const auto kronecker_ij = col_idx == 0 ? T(1.) : T(0.);
982
983 if constexpr (have_discontinuous_ansatz) {
984 /* Use full consistent mass matrix inverse: */
985
986 const auto m_j = get_entry<T>(lumped_mass_matrix, js);
987 const auto m_ij_inv =
988 mass_matrix_inverse.template get_entry<T>(i, col_idx);
989 const auto b_ij = m_i * m_ij_inv - kronecker_ij;
990 const auto b_ji = m_j * m_ij_inv - kronecker_ij;
991
992 P_ij += b_ij * F_jH - b_ji * F_iH;
993
994 } else {
995 /* Use Neumann series expansion: */
996
997 const auto m_j_inv = get_entry<T>(lumped_mass_matrix_inverse, js);
998 const auto m_ij = mass_matrix.template get_entry<T>(i, col_idx);
999 const auto b_ij = kronecker_ij - m_ij * m_j_inv;
1000 const auto b_ji = kronecker_ij - m_ij * m_i_inv;
1001
1002 P_ij += b_ij * F_jH - b_ji * F_iH;
1003 }
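/*
 * Both branches above add the mass-matrix correction
 *   P_ij += b_ij * F_jH - b_ji * F_iH,
 * with b_ij = delta_ij - m_ij / m_j for the Neumann-series variant and
 * b_ij = m_i * (M^{-1})_ij - delta_ij when the full consistent
 * mass-matrix inverse is available.
 */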
1004
1005 P_ij *= factor;
1006 pij_matrix_.write_entry(P_ij, i, col_idx);
1007
1008 /*
1009 * Compute limiter coefficients:
1010 */
1011
1012 const auto &[l_ij, success] = limiter.limit(bounds, U_i_new, P_ij);
1013 lij_matrix_.template write_entry<T>(l_ij, i, col_idx, true);
1014
1015 /*
1016 * If the success is set to false then the low-order update
1017 * resulted in a state outside of the limiter bounds. This can
1018 * happen if we compute with an aggressive CFL number. We
1019 * signal this condition by setting the restart_needed boolean
1020 * to true and defer further action to the chosen
1021 * IDViolationStrategy and the policy set in the
1022 * TimeIntegrator.
1023 */
1024 if (!success)
1025 restart_needed = true;
1026 }
1027 }
1028 };
1029
1030 /*
1031 * Chain through a compile time integral constant std::true_type for
1032 * a discontinuous ansatz and std::false_type otherwise. We use the
1033 * (constexpr) integral constant later on to avoid branching when
1034 * extending the limiter bounds and choosing the mass-matrix correction.
1035 */
1036 if (offline_data_->discretization().have_discontinuous_ansatz()) {
1037 /* Parallel non-vectorized loop and vectorized SIMD loop: */
1038 loop(Number(), std::true_type{}, n_internal, n_owned);
1039 loop(VA(), std::true_type{}, 0, n_internal);
1040 } else {
1041 /* Parallel non-vectorized loop and vectorized SIMD loop: */
1042 loop(Number(), std::false_type{}, n_internal, n_owned);
1043 loop(VA(), std::false_type{}, 0, n_internal);
1044 }
1045
1046 LIKWID_MARKER_STOP(("time_step_" + std::to_string(step_no)).c_str());
1047 RYUJIN_PARALLEL_REGION_END
1048 }
1049
1050 /*
1051 * -------------------------------------------------------------------------
1052 * Step 6, 7: Perform high-order update:
1053 *
1054 * Symmetrize l_ij
1055 * High-order update: += l_ij * lambda * P_ij
1056 * Compute next l_ij
1057 * -------------------------------------------------------------------------
1058 */
1059
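/*
 * In each pass the high-order update below forms the convex combination
 *
 *   U_i_new += sum_j l_ij * lambda * P_ij,  lambda = 1 / (row_length - 1),
 *
 * where the limiter value is symmetrized as l_ij = min(l_ij, l_ji) so
 * that the correction stays conservative.
 */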
1060 const auto n_iterations = limiter_parameters_.iterations();
1061 for (unsigned int pass = 0; pass < n_iterations; ++pass) {
1062 bool last_round = (pass + 1 == n_iterations);
1063
1064 std::string additional_step = (last_round ? "" : ", next l_ij");
1065 Scope scope(
1066 computing_timer_,
1067 scoped_name("symmetrize l_ij, h.-o. update" + additional_step));
1068
1069 if ((n_iterations == 2) && last_round) {
1070 std::swap(lij_matrix_, lij_matrix_next_);
1071 }
1072
1073 SynchronizationDispatch synchronization_dispatch([&]() {
1074 if (!last_round) {
1075 lij_matrix_next_.update_ghost_rows_start(channel++);
1076 lij_matrix_next_.update_ghost_rows_finish();
1077 }
1078 });
1079
1080 RYUJIN_PARALLEL_REGION_BEGIN
1081 LIKWID_MARKER_START(("time_step_" + std::to_string(step_no)).c_str());
1082
1083 auto loop = [&](auto sentinel, unsigned int left, unsigned int right) {
1084 using T = decltype(sentinel);
1085 using View =
1086 typename Description::template HyperbolicSystemView<dim, T>;
1087 using Limiter = typename Description::template Limiter<dim, T>;
1088
1089 unsigned int stride_size = get_stride_size<T>;
1090
1091 /* Stored thread locally: */
1092 AlignedVector<T> lij_row;
1093 Limiter limiter(
1094 *hyperbolic_system_, limiter_parameters_, old_precomputed);
1095 bool thread_ready = false;
1096
1097 RYUJIN_OMP_FOR
1098 for (unsigned int i = left; i < right; i += stride_size) {
1099
1100 /* Skip constrained degrees of freedom: */
1101 const unsigned int row_length = sparsity_simd.row_length(i);
1102 if (row_length == 1)
1103 continue;
1104
1105 synchronization_dispatch.check(
1106 thread_ready, i >= n_export_indices && i < n_internal);
1107
1108 auto U_i_new = new_U.template get_tensor<T>(i);
1109
1110 const Number lambda = Number(1.) / Number(row_length - 1);
1111 lij_row.resize_fast(row_length);
1112
1113 /* Skip diagonal. */
1114 for (unsigned int col_idx = 1; col_idx < row_length; ++col_idx) {
1115
1116 const auto l_ij = std::min(
1117 lij_matrix_.template get_entry<T>(i, col_idx),
1118 lij_matrix_.template get_transposed_entry<T>(i, col_idx));
1119
1120 const auto p_ij = pij_matrix_.template get_tensor<T>(i, col_idx);
1121
1122 U_i_new += l_ij * lambda * p_ij;
1123
1124 if (!last_round)
1125 lij_row[col_idx] = l_ij;
1126 }
1127
1128#ifdef EXPENSIVE_BOUNDS_CHECK
1129 const auto view = hyperbolic_system_->template view<dim, T>();
1130 if (!view.is_admissible(U_i_new)) {
1131 restart_needed = true;
1132 }
1133#endif
1134
1135 new_U.template write_tensor<T>(U_i_new, i);
1136
1137 /* Skip computing l_ij and updating p_ij in the last round */
1138 if (last_round)
1139 continue;
1140
1141 const auto bounds =
1142 bounds_.template get_tensor<T, std::array<T, n_bounds>>(i);
1143 /* Skip diagonal. */
1144 for (unsigned int col_idx = 1; col_idx < row_length; ++col_idx) {
1145
1146 const auto old_l_ij = lij_row[col_idx];
1147
1148 const auto new_p_ij =
1149 (T(1.) - old_l_ij) *
1150 pij_matrix_.template get_tensor<T>(i, col_idx);
1151
1152 const auto &[new_l_ij, success] =
1153 limiter.limit(bounds, U_i_new, new_p_ij);
1154
1155 /*
1156 * This is the second pass of the limiter. Under rare
1157 * circumstances the previous high-order update might be
1158 * slightly out of bounds due to roundoff errors. This happens
1159 * for example in flat regions or in stagnation points at a
1160 * (slip boundary) point. The limiter should ensure that we do
1161 * not further manipulate the state in this case. We thus only
1162 * signal a restart condition if the `EXPENSIVE_BOUNDS_CHECK` debug
1163 * macro is defined.
1164 */
1165#ifdef EXPENSIVE_BOUNDS_CHECK
1166 if (!success)
1167 restart_needed = true;
1168#endif
1169
1170 /*
1171 * Shortcut: We omit updating the p_ij and q_ij matrices and
1172 * simply write (1 - l_ij^(1)) * l_ij^(2) into the l_ij matrix.
1173 *
1174 * This approach only works for at most two limiting steps.
1175 */
1176 const auto entry = (T(1.) - old_l_ij) * new_l_ij;
1177 lij_matrix_next_.write_entry(entry, i, col_idx, true);
1178 }
1179 }
1180 };
1181
1182 /* Parallel non-vectorized loop: */
1183 loop(Number(), n_internal, n_owned);
1184 /* Parallel vectorized SIMD loop: */
1185 loop(VA(), 0, n_internal);
1186
1187 LIKWID_MARKER_STOP(("time_step_" + std::to_string(step_no)).c_str());
1188 RYUJIN_PARALLEL_REGION_END
1189 } /* limiter_iter_ */
1190
1191 CALLGRIND_STOP_INSTRUMENTATION;
1192
1193 /*
1194 * Do we have to restart?
1195 */
1196
1197 {
1198 Scope scope(computing_timer_,
1199 "time step [H] _ - synchronization barriers");
1200
1201 /*
1202 * Synchronize whether we have to restart the time step. Even though
1203 * the restart condition itself only affects the local ensemble we
1204 * nevertheless need to synchronize the boolean in case we perform
1205 * synchronized global time steps. (Otherwise different ensembles
1206 * might end up with a different time step.)
1207 */
1208 restart_needed.store(Utilities::MPI::logical_or(
1209 restart_needed.load(), mpi_ensemble_.synchronization_communicator()));
1210 }
1211
1212 if (restart_needed) {
1213 switch (id_violation_strategy_) {
1214 case IDViolationStrategy::warn:
1215 n_warnings_++;
1216 break;
1217 case IDViolationStrategy::raise_exception:
1218 n_restarts_++;
1219 throw Restart();
1220 }
1221 }
1222
1223 /* In debug mode poison precomputed values: */
1224 Vectors::debug_poison_precomputed_values<Description>(new_state_vector,
1225 *offline_data_);
1226
1227 /* Return the time step size tau: */
1228 return tau;
1229 }
1230
1231} /* namespace ryujin */