ryujin 2.1.1 revision 0eab90fbc6e1ac9f2e0a2e6d16f9f023c13a02f7
hyperbolic_module.template.h
1//
2// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
3// Copyright (C) 2020 - 2024 by the ryujin authors
4//
5
6#pragma once
7
8#include "hyperbolic_module.h"
9#include "introspection.h"
10#include "mpi_ensemble.h"
11#include "openmp.h"
12#include "scope.h"
13#include "simd.h"
14
16
17#include <atomic>
18
19namespace ryujin
20{
21 namespace ShallowWater
22 {
23 struct Description;
24 }
25
26 using namespace dealii;
27
28 template <typename Description, int dim, typename Number>
29 HyperbolicModule<Description, dim, Number>::HyperbolicModule(
30 const MPIEnsemble &mpi_ensemble,
31 std::map<std::string, dealii::Timer> &computing_timer,
32 const OfflineData<dim, Number> &offline_data,
33 const HyperbolicSystem &hyperbolic_system,
34 const InitialValues<Description, dim, Number> &initial_values,
35 const std::string &subsection /*= "HyperbolicModule"*/)
36 : ParameterAcceptor(subsection)
37 , id_violation_strategy_(IDViolationStrategy::warn)
38 , indicator_parameters_(subsection + "/indicator")
39 , limiter_parameters_(subsection + "/limiter")
40 , riemann_solver_parameters_(subsection + "/riemann solver")
41 , mpi_ensemble_(mpi_ensemble)
42 , computing_timer_(computing_timer)
43 , offline_data_(&offline_data)
44 , hyperbolic_system_(&hyperbolic_system)
45 , initial_values_(&initial_values)
46 , cfl_(0.2)
47 , acceptable_tau_max_ratio_(1.e6)
48 , n_restarts_(0)
49 , n_corrections_(0)
50 , n_warnings_(0)
51 {
52 }
53
54
55 template <typename Description, int dim, typename Number>
56 void HyperbolicModule<Description, dim, Number>::prepare()
57 {
58#ifdef DEBUG_OUTPUT
59 std::cout << "HyperbolicModule<Description, dim, Number>::prepare()"
60 << std::endl;
61#endif
62
63 AssertThrow(limiter_parameters_.iterations() <= 2,
64 dealii::ExcMessage(
65 "The number of limiter iterations must be between [0,2]"));
66
67 /* Initialize vectors: */
68
69 const auto &scalar_partitioner = offline_data_->scalar_partitioner();
70 alpha_.reinit(scalar_partitioner);
71 bounds_.reinit_with_scalar_partitioner(scalar_partitioner);
72
73 r_.reinit(offline_data_->hyperbolic_vector_partitioner());
74 using View =
75 typename Description::template HyperbolicSystemView<dim, Number>;
76
77 /* Initialize matrices: */
78
79 const auto &sparsity_simd = offline_data_->sparsity_pattern_simd();
80 dij_matrix_.reinit(sparsity_simd);
81 lij_matrix_.reinit(sparsity_simd);
82 lij_matrix_next_.reinit(sparsity_simd);
83 pij_matrix_.reinit(sparsity_simd);
84
85 /* Set up initial precomputed vector: */
86
87 initial_precomputed_ =
88 initial_values_->interpolate_initial_precomputed_vector();
89 }
90
91
92 /*
93 * -------------------------------------------------------------------------
94 * Step 1: Apply boundary conditions and precompute values
95 * -------------------------------------------------------------------------
96 */
97
98
99 template <typename Description, int dim, typename Number>
100 void HyperbolicModule<Description, dim, Number>::prepare_state_vector(
101 StateVector &state_vector, Number t) const
102 {
103#ifdef DEBUG_OUTPUT
104 std::cout << "HyperbolicModule<Description, dim, "
105 "Number>::prepare_state_vector()"
106 << std::endl;
107#endif
108
109 auto &U = std::get<0>(state_vector);
110 auto &precomputed = std::get<1>(state_vector);
111
112 const unsigned int n_export_indices = offline_data_->n_export_indices();
113 const unsigned int n_internal = offline_data_->n_locally_internal();
114 const unsigned int n_owned = offline_data_->n_locally_owned();
115 const auto &sparsity_simd = offline_data_->sparsity_pattern_simd();
116 const auto &boundary_map = offline_data_->boundary_map();
117 unsigned int channel = 10;
118 using VA = VectorizedArray<Number>;
119
120 Scope scope(computing_timer_,
121 "time step [H] 1 - update boundary values, precompute values");
122
123 LIKWID_MARKER_START("time_step_1a");
124
125 /* FIXME: not thread parallel... */
126 for (const auto &entry : boundary_map) {
127 const auto &[i, normal, normal_mass, boundary_mass, id, position] = entry;
128
129 /*
130 * Relay the task of applying appropriate boundary conditions to the
131 * Problem Description.
132 */
133
134 if (id == Boundary::do_nothing)
135 continue;
136
137 auto U_i = U.get_tensor(i);
138
139 /* Use a lambda to avoid computing unnecessary state values */
140 auto get_dirichlet_data = [position = position, t = t, this]() {
141 return initial_values_->initial_state(position, t);
142 };
143
144 const auto view = hyperbolic_system_->template view<dim, Number>();
145 U_i = view.apply_boundary_conditions(id, U_i, normal, get_dirichlet_data);
146 U.write_tensor(U_i, i);
147 }
148
149 LIKWID_MARKER_STOP("time_step_1a");
151 U.update_ghost_values();
152
153 /*
154 * Precompute values
155 */
156
157 if constexpr (n_precomputation_cycles != 0) {
158 for (unsigned int cycle = 0; cycle < n_precomputation_cycles; ++cycle) {
159
160 SynchronizationDispatch synchronization_dispatch([&]() {
161 precomputed.update_ghost_values_start(channel++);
162 precomputed.update_ghost_values_finish();
163 });
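/*
 * Roughly speaking, the dispatch object lets us overlap communication
 * with computation: the worker loop below calls
 * synchronization_dispatch.check() with a condition that becomes true
 * once all locally relevant export indices have been processed, at
 * which point the non-blocking ghost-value exchange registered above
 * is started while the remaining (purely local) rows are still being
 * worked on.
 */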
164
165 RYUJIN_PARALLEL_REGION_BEGIN
166 LIKWID_MARKER_START(("time_step_1b"));
167
168 auto loop = [&](auto sentinel, unsigned int left, unsigned int right) {
169 using T = decltype(sentinel);
170
171 /* Stored thread locally: */
172 bool thread_ready = false;
173
174 const auto view = hyperbolic_system_->template view<dim, T>();
175 view.precomputation_loop(
176 cycle,
177 [&](const unsigned int i) {
178 synchronization_dispatch.check(
179 thread_ready, i >= n_export_indices && i < n_internal);
180 },
181 sparsity_simd,
182 state_vector,
183 left,
184 right);
185 };
187 /* Parallel non-vectorized loop: */
188 loop(Number(), n_internal, n_owned);
189 /* Parallel vectorized SIMD loop: */
190 loop(VA(), 0, n_internal);
191
192 LIKWID_MARKER_STOP("time_step_1b");
193 RYUJIN_PARALLEL_REGION_END
194 }
195 }
196 }
197
198
199 /*
200 * -------------------------------------------------------------------------
201 * Step 2 - 7: Perform an explicit Euler step
202 * -------------------------------------------------------------------------
203 */
204
205
206 namespace
207 {
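/*
 * Helper: returns true if all column indices js of the current
 * (possibly SIMD-blocked) row i point strictly below the diagonal.
 * This is used below to restrict the d_ij computation to the upper
 * triangular portion of the stencil.
 */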
212 template <typename T>
213 bool all_below_diagonal(unsigned int i, const unsigned int *js)
214 {
215 if constexpr (std::is_same_v<T, typename get_value_type<T>::type>) {
216 /* Non-vectorized sequential access. */
217 const auto j = *js;
218 return j < i;
219
220 } else {
221 /* Vectorized fast access. The index must be divisible by simd_length. */
222
223 constexpr auto simd_length = T::size();
224
225 bool all_below_diagonal = true;
226 for (unsigned int k = 0; k < simd_length; ++k)
227 if (js[k] >= i + k) {
228 all_below_diagonal = false;
229 break;
230 }
231 return all_below_diagonal;
232 }
233 }
234 } // namespace
235
236
237 template <typename Description, int dim, typename Number>
238 template <int stages>
239 Number HyperbolicModule<Description, dim, Number>::step(
240 const StateVector &old_state_vector,
241 std::array<std::reference_wrapper<const StateVector>, stages>
242 stage_state_vectors,
243 const std::array<Number, stages> stage_weights,
244 StateVector &new_state_vector,
245 Number tau /*= 0.*/,
246 std::atomic<Number> tau_max /*std::numeric_limits<Number>::max()*/) const
247 {
248#ifdef DEBUG_OUTPUT
249 std::cout << "HyperbolicModule<Description, dim, Number>::step()"
250 << std::endl;
251#endif
252
253 auto &old_U = std::get<0>(old_state_vector);
254 auto &old_precomputed = std::get<1>(old_state_vector);
255 auto &new_U = std::get<0>(new_state_vector);
256
258
259 /*
260 * Workaround: A constexpr boolean storing the fact whether we
261 * instantiate the HyperbolicModule for the shallow water equations.
262 *
263 * Rationale: Currently, the shallow water equations are the only
264 * hyperbolic system for which we have to (a) form equilibrated states
265 * for the low-order update, and (b) apply an affine shift for
266 * computing limiter bounds. It's not so easy to come up with a
267 * meaningful abstraction layer for this (in particular because we only
268 * have one PDE). Thus, for the time being we simply special case a
269 * small amount of code in this routine.
270 *
271 * FIXME: Refactor into a proper abstraction layer / interface.
272 */
273 constexpr bool shallow_water =
274 std::is_same_v<Description, ShallowWater::Description>;
275
276 using VA = VectorizedArray<Number>;
277
278 /* Index ranges for the iteration over the sparsity pattern : */
279
280 constexpr auto simd_length = VA::size();
281 const unsigned int n_export_indices = offline_data_->n_export_indices();
282 const unsigned int n_internal = offline_data_->n_locally_internal();
283 const unsigned int n_owned = offline_data_->n_locally_owned();
284
285 /* References to precomputed matrices and the stencil: */
286
287 const auto &sparsity_simd = offline_data_->sparsity_pattern_simd();
288
289 const auto &mass_matrix = offline_data_->mass_matrix();
290 const auto &mass_matrix_inverse = offline_data_->mass_matrix_inverse();
291 const auto &lumped_mass_matrix = offline_data_->lumped_mass_matrix();
292 const auto &lumped_mass_matrix_inverse =
293 offline_data_->lumped_mass_matrix_inverse();
294
295 const auto &cij_matrix = offline_data_->cij_matrix();
296 const auto &incidence_matrix = offline_data_->incidence_matrix();
297
298 const auto &coupling_boundary_pairs =
299 offline_data_->coupling_boundary_pairs();
300
301 const Number measure_of_omega_inverse =
302 Number(1.) / offline_data_->measure_of_omega();
303
304 /* A monotonically increasing "channel" variable for mpi_tags: */
305 unsigned int channel = 10;
306
307 /* Lambda for creating the computing timer string: */
308 int step_no = 1;
309 const auto scoped_name = [&step_no](const auto &name,
310 const bool advance = true) {
311 advance || step_no--;
312 return "time step [H] " + std::to_string(++step_no) + " - " + name;
313 };
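/*
 * For example, the first call scoped_name("compute d_ij, and alpha_i")
 * advances step_no from 1 to 2 and returns the string
 * "time step [H] 2 - compute d_ij, and alpha_i"; calling the lambda
 * with advance == false reuses the current step number instead.
 */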
314
315 /* A boolean signalling that a restart is necessary: */
316 std::atomic<bool> restart_needed = false;
317
318 /*
319 * -------------------------------------------------------------------------
320 * Step 2: Compute off-diagonal d_ij, and alpha_i
321 *
322 * The computation of the d_ij is quite costly. So we do a trick to
323 * save a bit of computational resources. Instead of computing all d_ij
324 * entries for a row of a given local index i, we only compute d_ij for
325 * which j > i,
326 *
327 * llllrr
328 * l .xxxxx
329 * l ..xxxx
330 * l ...xxx
331 * l ....xx
332 * r ......
333 * r ......
334 *
335 * and symmetrize the result in Step 3.
336 *
337 * MM: We could save a bit more computational resources by only
338 * computing entries for which *IN A GLOBAL* enumeration j > i. But
339 * the index translation, subsequent symmetrization, and exchange
340 * sound a bit too expensive...
341 * -------------------------------------------------------------------------
342 */
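/*
 * In other words: in the interior we have c_ij = -c_ji, hence
 * |c_ij| = |c_ji| and n_ij = -n_ji, and the maximal wave speed
 * estimate is symmetric under exchanging (U_i, n_ij) and (U_j, n_ji).
 * Therefore d_ij = d_ji, and it suffices to compute the upper
 * triangular part here; the transposed entries are filled in during
 * the symmetrization in Step 3.
 */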
343
344 {
345 Scope scope(computing_timer_, scoped_name("compute d_ij, and alpha_i"));
346
347 SynchronizationDispatch synchronization_dispatch([&]() {
348 alpha_.update_ghost_values_start(channel++);
349 alpha_.update_ghost_values_finish();
350 });
351
352 RYUJIN_PARALLEL_REGION_BEGIN
353 LIKWID_MARKER_START(("time_step_" + std::to_string(step_no)).c_str());
354
355 auto loop = [&](auto sentinel, unsigned int left, unsigned int right) {
356 using T = decltype(sentinel);
357 unsigned int stride_size = get_stride_size<T>;
358
359 /* Stored thread locally: */
360
361 using RiemannSolver =
362 typename Description::template RiemannSolver<dim, T>;
363 RiemannSolver riemann_solver(
364 *hyperbolic_system_, riemann_solver_parameters_, old_precomputed);
365
366 using Indicator = typename Description::template Indicator<dim, T>;
367 Indicator indicator(
368 *hyperbolic_system_, indicator_parameters_, old_precomputed);
369
370 bool thread_ready = false;
371
372 RYUJIN_OMP_FOR
373 for (unsigned int i = left; i < right; i += stride_size) {
374
375 /* Skip constrained degrees of freedom: */
376 const unsigned int row_length = sparsity_simd.row_length(i);
377 if (row_length == 1)
378 continue;
379
380 synchronization_dispatch.check(
381 thread_ready, i >= n_export_indices && i < n_internal);
382
383 const auto U_i = old_U.template get_tensor<T>(i);
384
385 indicator.reset(i, U_i);
386
387 const unsigned int *js = sparsity_simd.columns(i);
388 for (unsigned int col_idx = 0; col_idx < row_length;
389 ++col_idx, js += stride_size) {
390
391 const auto U_j = old_U.template get_tensor<T>(js);
392
393 const auto c_ij = cij_matrix.template get_tensor<T>(i, col_idx);
394
395 indicator.accumulate(js, U_j, c_ij);
396
397 /* Skip diagonal. */
398 if (col_idx == 0)
399 continue;
400
401 /* Only iterate over the upper triangular portion of d_ij */
402 if (all_below_diagonal<T>(i, js))
403 continue;
404
405 const auto norm = c_ij.norm();
406 const auto n_ij = c_ij / norm;
407 const auto lambda_max =
408 riemann_solver.compute(U_i, U_j, i, js, n_ij);
409 const auto d_ij = norm * lambda_max;
410
411 dij_matrix_.write_entry(d_ij, i, col_idx, true);
412 }
413
414 const auto mass = get_entry<T>(lumped_mass_matrix, i);
415 const auto hd_i = mass * measure_of_omega_inverse;
416 write_entry<T>(alpha_, indicator.alpha(hd_i), i);
417 }
418 };
419
420 /* Parallel non-vectorized loop: */
421 loop(Number(), n_internal, n_owned);
422 /* Parallel vectorized SIMD loop: */
423 loop(VA(), 0, n_internal);
424
425 LIKWID_MARKER_STOP(("time_step_" + std::to_string(step_no)).c_str());
426 RYUJIN_PARALLEL_REGION_END
427 }
428
429 /*
430 * -------------------------------------------------------------------------
431 * Step 3: Compute diagonal of d_ij, and maximal time-step size.
432 * -------------------------------------------------------------------------
433 */
434
435 {
436 Scope scope(computing_timer_,
437 scoped_name("compute bdry d_ij, diag d_ii, and tau_max"));
438
439 /* Parallel region */
440 RYUJIN_PARALLEL_REGION_BEGIN
441 LIKWID_MARKER_START(("time_step_" + std::to_string(step_no)).c_str());
442
443 /*
444 * Complete d_ij at boundary:
445 *
446 * At the boundary the skew symmetry c_ij = -c_ji that holds for
447 * continuous finite elements in the interior is lost; this implies
448 * d_ij != d_ji. We thus also compute the lower-triangular entries
449 * for index pairs where both i and j are boundary degrees of freedom.
450 */
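/*
 * Concretely, for every stored coupling boundary pair (i, j) with
 * i < j we recompute d_ji = |c_ji| * lambda_max(U_j, U_i, n_ji) and
 * replace the stored upper-triangular entry by max(d_ij, d_ji), so
 * that the subsequent symmetrization produces a symmetric d_ij on
 * boundary degrees of freedom as well.
 */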
451
452 using RiemannSolver =
453 typename Description::template RiemannSolver<dim, Number>;
454 RiemannSolver riemann_solver(
455 *hyperbolic_system_, riemann_solver_parameters_, old_precomputed);
456
457 Number local_tau_max = std::numeric_limits<Number>::max();
458
459 /*
460 * Note: we need this dance of iterating over an integer and then
461 * accessing the element to make Apple's OpenMP implementation
462 * happy.
463 */
464 RYUJIN_OMP_FOR
465 for (std::size_t k = 0; k < coupling_boundary_pairs.size(); ++k) {
466 const auto &[i, col_idx, j] = coupling_boundary_pairs[k];
467
468 /*
469 * Only work on index pairs "i < j" that point to the upper
470 * triangular portion of the d_ij matrix. For all of these index
471 * pairs we compute the corresponding d_ji entry and fix up the
472 * d_ij entry (from step 2) by taking the maximum. Note that we
473 * actually do not store anything in the d_ji entry itself because
474 * we symmetrize the matrix later on anyway.
475 */
476 if (j < i)
477 continue;
478
479 const auto U_i = old_U.get_tensor(i);
480 const auto U_j = old_U.get_tensor(j);
481
482 const auto c_ji = cij_matrix.get_transposed_tensor(i, col_idx);
483 Assert(c_ji.norm() > 1.e-12, ExcInternalError());
484 const auto norm_ji = c_ji.norm();
485 const auto n_ji = c_ji / norm_ji;
486
487 const auto d_ij = dij_matrix_.get_entry(i, col_idx);
488
489 const auto lambda_max = riemann_solver.compute(U_j, U_i, j, &i, n_ji);
490 const auto d_ji = norm_ji * lambda_max;
491
492 dij_matrix_.write_entry(std::max(d_ij, d_ji), i, col_idx);
493 }
494
495 /* Symmetrize d_ij: */
496
497 RYUJIN_OMP_FOR
498 for (unsigned int i = 0; i < n_owned; ++i) {
499
500 /* Skip constrained degrees of freedom: */
501 const unsigned int row_length = sparsity_simd.row_length(i);
502 if (row_length == 1)
503 continue;
504
505 Number d_sum = Number(0.);
506
507 /* skip diagonal: */
508 const unsigned int *js = sparsity_simd.columns(i);
509 for (unsigned int col_idx = 1; col_idx < row_length; ++col_idx) {
510 const auto j =
511 *(i < n_internal ? js + col_idx * simd_length : js + col_idx);
512
513 // fill in the lower-triangular part of dij_matrix that was skipped in Step 2
514 if (j < i) {
515 const auto d_ji = dij_matrix_.get_transposed_entry(i, col_idx);
516
517#ifdef DEBUG_SYMMETRY_CHECK
518 /* Verify that d_ji == std::max(d_ij, d_ji): */
519
520 const auto U_i = old_U.get_tensor(i);
521 const auto U_j = old_U.get_tensor(j);
522
523 const auto c_ij = cij_matrix.get_tensor(i, col_idx);
524 Assert(c_ij.norm() > 1.e-12, ExcInternalError());
525 const auto norm_ij = c_ij.norm();
526 const auto n_ij = c_ij / norm_ij;
527
528 const auto lambda_max =
529 riemann_solver.compute(U_i, U_j, i, &j, n_ij);
530 const auto d_ij = norm_ij * lambda_max;
531
532 Assert(d_ij <= d_ji + 1.0e-12,
533 dealii::ExcMessage("d_ij not symmetrized correctly on "
534 "boundary degrees of freedom."));
535#endif
536
537 dij_matrix_.write_entry(d_ji, i, col_idx);
538 }
539
540 d_sum -= dij_matrix_.get_entry(i, col_idx);
541 }
542
543 /*
544 * Make sure that we do not accidentally divide by zero. (Yes, this
545 * can happen for some (admittedly, rather esoteric) scalar
546 * conservation equations...).
547 */
548 d_sum =
549 std::min(d_sum, Number(-1.e6) * std::numeric_limits<Number>::min());
550
551 /* write diagonal element */
552 dij_matrix_.write_entry(d_sum, i, 0);
553
554 const Number mass = lumped_mass_matrix.local_element(i);
555 const Number local_tau = cfl_ * mass / (Number(-2.) * d_sum);
556 local_tau_max = std::min(local_tau_max, local_tau);
557 }
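/*
 * In the loop above the diagonal entry is set to
 *   d_ii = -\sum_{j != i} d_ij,
 * and the local time-step bound is the usual invariant-domain CFL
 * condition tau_i = cfl * m_i / (2 |d_ii|); local_tau_max is the
 * minimum of these bounds over all locally owned rows.
 */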
558
559 /* Synchronize tau max over all threads: */
560 Number current_tau_max = tau_max.load();
561 while (current_tau_max > local_tau_max &&
562 !tau_max.compare_exchange_weak(current_tau_max, local_tau_max))
563 ;
564
565 LIKWID_MARKER_STOP(("time_step_" + std::to_string(step_no)).c_str());
566 RYUJIN_PARALLEL_REGION_END
567 }
568
569 {
570 Scope scope(computing_timer_,
571 "time step [H] _ - synchronization barriers");
572
573 /*
574 * MPI Barrier: Synchronize the maximal time-step size. This has to
575 * happen either over the global, or the local subrange communicator:
576 */
577 tau_max.store(Utilities::MPI::min(
578 tau_max.load(), mpi_ensemble_.synchronization_communicator()));
579
580 AssertThrow(
581 !std::isnan(tau_max) && !std::isinf(tau_max) && tau_max > 0.,
582 ExcMessage(
583 "I'm sorry, Dave. I'm afraid I can't do that.\nWe crashed."));
584
585 tau = (tau == Number(0.) ? tau_max.load() : tau);
586
587#ifdef DEBUG_OUTPUT
588 std::cout << " computed tau_max = " << tau_max
589 << " (CFL = " << cfl_ << ")" << std::endl;
590 std::cout << " step with tau = " << tau << std::endl;
591#endif
592
593 /* We need to signal a restart if the enforced tau is too wacky: */
594 restart_needed = (tau > acceptable_tau_max_ratio_ * tau_max.load());
595 }
596
597#ifdef DEBUG
598 /* Exchange d_ij so that we can check for symmetry: */
599 dij_matrix_.update_ghost_rows();
600#endif
601
602 /*
603 * -------------------------------------------------------------------------
604 * Step 4: Low-order update, also compute limiter bounds, R_i
605 * -------------------------------------------------------------------------
606 */
607
608 {
609 Scope scope(computing_timer_,
610 scoped_name("l.-o. update, compute bounds, r_i, and p_ij"));
611
612 SynchronizationDispatch synchronization_dispatch([&]() {
613 r_.update_ghost_values_start(channel++);
614 r_.update_ghost_values_finish();
615 if (offline_data_->discretization().have_discontinuous_ansatz()) {
616 /*
617 * In case we extend bounds over the stencil, we have to ensure
618 * that ghost ranges are properly communicated over all MPI
619 * ranks.
620 */
621 bounds_.update_ghost_values_start(channel++);
622 bounds_.update_ghost_values_finish();
623 }
624 });
625
626 const Number weight =
627 -std::accumulate(stage_weights.begin(), stage_weights.end(), -1.);
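/* That is, weight = 1 - sum_s stage_weights[s], so that the weight of
 * the current stage and the supplied stage weights add up to one. */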
628
629 /* Parallel region */
630 RYUJIN_PARALLEL_REGION_BEGIN
631 LIKWID_MARKER_START(("time_step_" + std::to_string(step_no)).c_str());
632
633 auto loop = [&](auto sentinel,
634 auto have_discontinuous_ansatz,
635 unsigned int left,
636 unsigned int right) {
637 using T = decltype(sentinel);
638 using View =
639 typename Description::template HyperbolicSystemView<dim, T>;
640 using Limiter = typename Description::template Limiter<dim, T>;
641 using flux_contribution_type = typename View::flux_contribution_type;
642 using state_type = typename View::state_type;
643
644 unsigned int stride_size = get_stride_size<T>;
645
646 const auto view = hyperbolic_system_->template view<dim, T>();
647
648 /* Stored thread locally: */
649 Limiter limiter(
650 *hyperbolic_system_, limiter_parameters_, old_precomputed);
651 bool thread_ready = false;
652
653 RYUJIN_OMP_FOR
654 for (unsigned int i = left; i < right; i += stride_size) {
655
656 /* Skip constrained degrees of freedom: */
657 const unsigned int row_length = sparsity_simd.row_length(i);
658 if (row_length == 1)
659 continue;
660
661 synchronization_dispatch.check(
662 thread_ready, i >= n_export_indices && i < n_internal);
663
664 const auto U_i = old_U.template get_tensor<T>(i);
665 auto U_i_new = U_i;
666
667 const auto alpha_i = get_entry<T>(alpha_, i);
668 const auto m_i = get_entry<T>(lumped_mass_matrix, i);
669 const auto m_i_inv = get_entry<T>(lumped_mass_matrix_inverse, i);
670
671 const auto flux_i = view.flux_contribution(
672 old_precomputed, initial_precomputed_, i, U_i);
673
674 std::array<flux_contribution_type, stages> flux_iHs;
675 [[maybe_unused]] state_type S_iH;
676
677 for (int s = 0; s < stages; ++s) {
678 const auto &[U_s, prec_s, V_s] = stage_state_vectors[s].get();
679
680 const auto U_iHs = U_s.template get_tensor<T>(i);
681 flux_iHs[s] =
682 view.flux_contribution(prec_s, initial_precomputed_, i, U_iHs);
683
684 if constexpr (View::have_source_terms) {
685 S_iH +=
686 stage_weights[s] * view.nodal_source(prec_s, i, U_iHs, tau);
687 }
688 }
689
690 [[maybe_unused]] state_type S_i;
691 state_type F_iH;
692
693 if constexpr (View::have_source_terms) {
694 S_i = view.nodal_source(old_precomputed, i, U_i, tau);
695 S_iH += weight * S_i;
696 U_i_new += tau * /* m_i_inv * m_i */ S_i;
697 F_iH += m_i * S_iH;
698 }
699
700 limiter.reset(i, U_i, flux_i);
701
702 [[maybe_unused]] state_type affine_shift;
703
704 /*
705 * Workaround: For shallow water we need to accumulate an
706 * additional contribution to the affine shift over the stencil
707 * before we can compute limiter bounds.
708 */
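/*
 * Roughly: the per-edge contributions B_ij are summed over the
 * stencil and scaled by tau * m_i^{-1}; the resulting shift is later
 * handed to limiter.accumulate() so that the limiter bounds are
 * formed relative to the shifted state.
 */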
709
710 const unsigned int *js = sparsity_simd.columns(i);
711 if constexpr (shallow_water) {
712 for (unsigned int col_idx = 0; col_idx < row_length;
713 ++col_idx, js += stride_size) {
714
715 const auto U_j = old_U.template get_tensor<T>(js);
716 const auto flux_j = view.flux_contribution(
717 old_precomputed, initial_precomputed_, js, U_j);
718
719 const auto d_ij = dij_matrix_.template get_entry<T>(i, col_idx);
720 const auto c_ij = cij_matrix.template get_tensor<T>(i, col_idx);
721
722 const auto B_ij = view.affine_shift(flux_i, flux_j, c_ij, d_ij);
723 affine_shift += B_ij;
724 }
725
726 affine_shift *= tau * m_i_inv;
727 }
728
729 if constexpr (View::have_source_terms) {
730 affine_shift += tau * /* m_i_inv * m_i */ S_i;
731 }
732
733 js = sparsity_simd.columns(i);
734 for (unsigned int col_idx = 0; col_idx < row_length;
735 ++col_idx, js += stride_size) {
736
737 const auto U_j = old_U.template get_tensor<T>(js);
738
739 const auto alpha_j = get_entry<T>(alpha_, js);
740
741 const auto d_ij = dij_matrix_.template get_entry<T>(i, col_idx);
742 auto factor = (alpha_i + alpha_j) * Number(.5);
743
744 if constexpr (have_discontinuous_ansatz) {
745 const auto incidence_ij =
746 incidence_matrix.template get_entry<T>(i, col_idx);
747 factor = std::max(factor, incidence_ij);
748 }
749
750 const auto d_ijH = d_ij * factor;
751
752#ifdef DEBUG_SYMMETRY_CHECK
753 /*
754 * Verify that all local chunks of the d_ij matrix have been
755 * computed consistently over all MPI ranks. For that we import
756 * all ghost rows from neighboring MPI ranks and simply check
757 * that the (local) values of d_ij and d_ji match.
758 */
759 const auto d_ji =
760 dij_matrix_.template get_transposed_entry<T>(i, col_idx);
761 Assert(std::max(std::abs(d_ij - d_ji), T(1.0e-12)) == T(1.0e-12),
762 dealii::ExcMessage(
763 "d_ij not symmetrized correctly over MPI ranks"));
764#endif
765
766 const auto c_ij = cij_matrix.template get_tensor<T>(i, col_idx);
767 constexpr auto eps = std::numeric_limits<Number>::epsilon();
768 const auto scale = dealii::compare_and_apply_mask<
769 dealii::SIMDComparison::less_than>(
770 std::abs(d_ij), T(eps * eps), T(0.), T(1.) / d_ij);
771 const auto scaled_c_ij = c_ij * scale;
772
773 const auto flux_j = view.flux_contribution(
774 old_precomputed, initial_precomputed_, js, U_j);
775
776 const auto m_ij = mass_matrix.template get_entry<T>(i, col_idx);
777
778 /*
779 * Compute low-order flux and limiter bounds:
780 */
781
782 const auto flux_ij = view.flux_divergence(flux_i, flux_j, c_ij);
783 U_i_new += tau * m_i_inv * flux_ij;
784 auto P_ij = -flux_ij;
785
786 if constexpr (shallow_water) {
787 /*
788 * Workaround: Shallow water (and related) are special:
789 */
790
791 const auto &[U_star_ij, U_star_ji] =
792 view.equilibrated_states(flux_i, flux_j);
793
794 U_i_new += tau * m_i_inv * d_ij * (U_star_ji - U_star_ij);
795 F_iH += d_ijH * (U_star_ji - U_star_ij);
796 P_ij += (d_ijH - d_ij) * (U_star_ji - U_star_ij);
797
798 limiter.accumulate(
799 U_j, U_star_ij, U_star_ji, scaled_c_ij, affine_shift);
800
801 } else {
802
803 U_i_new += tau * m_i_inv * d_ij * (U_j - U_i);
804 F_iH += d_ijH * (U_j - U_i);
805 P_ij += (d_ijH - d_ij) * (U_j - U_i);
806
807 limiter.accumulate(js, U_j, flux_j, scaled_c_ij, affine_shift);
808 }
809
810 if constexpr (View::have_source_terms) {
811 F_iH -= m_ij * S_iH;
812 P_ij -= m_ij * /*sic!*/ S_i;
813 }
814
815 /*
816 * Compute high-order fluxes and source terms:
817 */
818
819 if constexpr (View::have_high_order_flux) {
820 const auto high_order_flux_ij =
821 view.high_order_flux_divergence(flux_i, flux_j, c_ij);
822 F_iH += weight * high_order_flux_ij;
823 P_ij += weight * high_order_flux_ij;
824 } else {
825 F_iH += weight * flux_ij;
826 P_ij += weight * flux_ij;
827 }
828
829 if constexpr (View::have_source_terms) {
830 const auto S_j = view.nodal_source(old_precomputed, js, U_j, tau);
831 F_iH += weight * m_ij * S_j;
832 P_ij += weight * m_ij * S_j;
833 }
834
835 for (int s = 0; s < stages; ++s) {
836 const auto &[U_s, prec_s, V_s] = stage_state_vectors[s].get();
837
838 const auto U_jHs = U_s.template get_tensor<T>(js);
839 const auto flux_jHs = view.flux_contribution(
840 prec_s, initial_precomputed_, js, U_jHs);
841
842 if constexpr (View::have_high_order_flux) {
843 const auto high_order_flux_ij = view.high_order_flux_divergence(
844 flux_iHs[s], flux_jHs, c_ij);
845 F_iH += stage_weights[s] * high_order_flux_ij;
846 P_ij += stage_weights[s] * high_order_flux_ij;
847 } else {
848 const auto flux_ij =
849 view.flux_divergence(flux_iHs[s], flux_jHs, c_ij);
850 F_iH += stage_weights[s] * flux_ij;
851 P_ij += stage_weights[s] * flux_ij;
852 }
853
854 if constexpr (View::have_source_terms) {
855 const auto S_js = view.nodal_source(prec_s, js, U_jHs, tau);
856 F_iH += stage_weights[s] * m_ij * S_js;
857 P_ij += stage_weights[s] * m_ij * S_js;
858 }
859 }
860
861 pij_matrix_.write_entry(P_ij, i, col_idx, true);
862 }
863
864#ifdef DEBUG_EXPENSIVE_BOUNDS_CHECK
865 if (!view.is_admissible(U_i_new)) {
866 restart_needed = true;
867 }
868#endif
869
870 new_U.template write_tensor<T>(U_i_new, i);
871 r_.template write_tensor<T>(F_iH, i);
872
873 const auto hd_i = m_i * measure_of_omega_inverse;
874 const auto relaxed_bounds = limiter.bounds(hd_i);
875 bounds_.template write_tensor<T>(relaxed_bounds, i);
876 }
877 };
878
879 /*
880 * Chain through a compile time integral constant std::true_type for
881 * a discontinuous ansatz and std::false_type otherwise. We use the
882 * (constexpr) integral constant later on to avoid branching when
883 * computing d_ijH.
884 */
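/*
 * I.e., inside the loop body "if constexpr (have_discontinuous_ansatz)"
 * discards the untaken branch at compile time because the parameter is
 * an integral constant rather than a runtime bool.
 */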
885 if (offline_data_->discretization().have_discontinuous_ansatz()) {
886 /* Parallel non-vectorized loop and vectorized SIMD loop: */
887 loop(Number(), std::true_type{}, n_internal, n_owned);
888 loop(VA(), std::true_type{}, 0, n_internal);
889 } else {
890 /* Parallel non-vectorized loop and vectorized SIMD loop: */
891 loop(Number(), std::false_type{}, n_internal, n_owned);
892 loop(VA(), std::false_type{}, 0, n_internal);
893 }
894
895 LIKWID_MARKER_STOP(("time_step_" + std::to_string(step_no)).c_str());
896 RYUJIN_PARALLEL_REGION_END
897 }
898
899 /*
900 * -------------------------------------------------------------------------
901 * Step 5: Compute second part of P_ij, and l_ij (first round):
902 * -------------------------------------------------------------------------
903 */
904
905 if (limiter_parameters_.iterations() != 0) {
906 Scope scope(computing_timer_, scoped_name("compute p_ij, and l_ij"));
907
908 SynchronizationDispatch synchronization_dispatch([&]() {
909 lij_matrix_.update_ghost_rows_start(channel++);
910 lij_matrix_.update_ghost_rows_finish();
911 });
912
913 RYUJIN_PARALLEL_REGION_BEGIN
914 LIKWID_MARKER_START(("time_step_" + std::to_string(step_no)).c_str());
915
916 auto loop = [&](auto sentinel,
917 auto have_discontinuous_ansatz,
918 unsigned int left,
919 unsigned int right) {
920 using T = decltype(sentinel);
921 using View =
922 typename Description::template HyperbolicSystemView<dim, T>;
923 using Limiter = typename Description::template Limiter<dim, T>;
924
925 unsigned int stride_size = get_stride_size<T>;
926
927 /* Stored thread locally: */
928 Limiter limiter(
929 *hyperbolic_system_, limiter_parameters_, old_precomputed);
930 bool thread_ready = false;
931
932 RYUJIN_OMP_FOR
933 for (unsigned int i = left; i < right; i += stride_size) {
934
935 /* Skip constrained degrees of freedom: */
936 const unsigned int row_length = sparsity_simd.row_length(i);
937 if (row_length == 1)
938 continue;
939
940 synchronization_dispatch.check(
941 thread_ready, i >= n_export_indices && i < n_internal);
942
943 auto bounds =
944 bounds_.template get_tensor<T, std::array<T, n_bounds>>(i);
945
946 /*
947 * In case of a discontinuous finite element ansatz we need to
948 * extend bounds over the stencil. We do this by looping over the
949 * stencil once and taking the minimum/maximum:
950 */
951 if constexpr (have_discontinuous_ansatz) {
952 /* Skip diagonal. */
953 const unsigned int *js = sparsity_simd.columns(i) + stride_size;
954 for (unsigned int col_idx = 1; col_idx < row_length;
955 ++col_idx, js += stride_size) {
956 bounds = limiter.combine_bounds(
957 bounds,
958 bounds_.template get_tensor<T, std::array<T, n_bounds>>(js));
959 }
960 bounds_.template write_tensor<T>(bounds, i);
961 }
962
963 [[maybe_unused]] T m_i;
964 if constexpr (have_discontinuous_ansatz)
965 m_i = get_entry<T>(lumped_mass_matrix, i);
966 const auto m_i_inv = get_entry<T>(lumped_mass_matrix_inverse, i);
967
968 const auto U_i_new = new_U.template get_tensor<T>(i);
969
970 const auto F_iH = r_.template get_tensor<T>(i);
971
972 const auto lambda_inv = Number(row_length - 1);
973 const auto factor = tau * m_i_inv * lambda_inv;
974
975 /* Skip diagonal. */
976 const unsigned int *js = sparsity_simd.columns(i) + stride_size;
977 for (unsigned int col_idx = 1; col_idx < row_length;
978 ++col_idx, js += stride_size) {
979
980 auto P_ij = pij_matrix_.template get_tensor<T>(i, col_idx);
981 const auto F_jH = r_.template get_tensor<T>(js);
982
983 /*
984 * Mass matrix correction:
985 */
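/*
 * Sketch of the correction terms below: the high-order right-hand
 * sides F_iH have to be applied with the consistent rather than the
 * lumped mass matrix. For the continuous ansatz we use a truncated
 * Neumann series with coefficients b_ij = delta_ij - m_ij / m_j; for
 * the discontinuous ansatz the full inverse with
 * b_ij = m_i * (M^{-1})_ij - delta_ij. In both cases the edge-wise
 * correction b_ij * F_jH - b_ji * F_iH is added to P_ij.
 */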
986
987 const auto kronecker_ij = col_idx == 0 ? T(1.) : T(0.);
988
989 if constexpr (have_discontinuous_ansatz) {
990 /* Use full consistent mass matrix inverse: */
991
992 const auto m_j = get_entry<T>(lumped_mass_matrix, js);
993 const auto m_ij_inv =
994 mass_matrix_inverse.template get_entry<T>(i, col_idx);
995 const auto b_ij = m_i * m_ij_inv - kronecker_ij;
996 const auto b_ji = m_j * m_ij_inv - kronecker_ij;
997
998 P_ij += b_ij * F_jH - b_ji * F_iH;
999
1000 } else {
1001 /* Use Neumann series expansion: */
1002
1003 const auto m_j_inv = get_entry<T>(lumped_mass_matrix_inverse, js);
1004 const auto m_ij = mass_matrix.template get_entry<T>(i, col_idx);
1005 const auto b_ij = kronecker_ij - m_ij * m_j_inv;
1006 const auto b_ji = kronecker_ij - m_ij * m_i_inv;
1007
1008 P_ij += b_ij * F_jH - b_ji * F_iH;
1009 }
1010
1011 P_ij *= factor;
1012 pij_matrix_.write_entry(P_ij, i, col_idx);
1013
1014 /*
1015 * Compute limiter coefficients:
1016 */
1017
1018 const auto &[l_ij, success] = limiter.limit(bounds, U_i_new, P_ij);
1019 lij_matrix_.template write_entry<T>(l_ij, i, col_idx, true);
1020
1021 /*
1022 * If the success is set to false then the low-order update
1023 * resulted in a state outside of the limiter bounds. This can
1024 * happen if we compute with an aggressive CFL number. We
1025 * signal this condition by setting the restart_needed boolean
1026 * to true and defer further action to the chosen
1027 * IDViolationStrategy and the policy set in the
1028 * TimeIntegrator.
1029 */
1030 if (!success)
1031 restart_needed = true;
1032 }
1033 }
1034 };
1035
1036 /*
1037 * Chain through a compile time integral constant std::true_type for
1038 * a discontinuous ansatz and std::false_type otherwise. We use the
1039 * (constexpr) integral constant later on to avoid branching when
1040 * computing d_ijH.
1041 */
1042 if (offline_data_->discretization().have_discontinuous_ansatz()) {
1043 /* Parallel non-vectorized loop and vectorized SIMD loop: */
1044 loop(Number(), std::true_type{}, n_internal, n_owned);
1045 loop(VA(), std::true_type{}, 0, n_internal);
1046 } else {
1047 /* Parallel non-vectorized loop and vectorized SIMD loop: */
1048 loop(Number(), std::false_type{}, n_internal, n_owned);
1049 loop(VA(), std::false_type{}, 0, n_internal);
1050 }
1051
1052 LIKWID_MARKER_STOP(("time_step_" + std::to_string(step_no)).c_str());
1053 RYUJIN_PARALLEL_REGION_END
1054 }
1055
1056 /*
1057 * -------------------------------------------------------------------------
1058 * Step 6, 7: Perform high-order update:
1059 *
1060 * Symmetrize l_ij
1061 * High-order update: += l_ij * lambda * P_ij
1062 * Compute next l_ij
1063 * -------------------------------------------------------------------------
1064 */
1065
1066 const auto n_iterations = limiter_parameters_.iterations();
1067 for (unsigned int pass = 0; pass < n_iterations; ++pass) {
1068 bool last_round = (pass + 1 == n_iterations);
1069
1070 std::string additional_step = (last_round ? "" : ", next l_ij");
1071 Scope scope(
1072 computing_timer_,
1073 scoped_name("symmetrize l_ij, h.-o. update" + additional_step));
1074
1075 if ((n_iterations == 2) && last_round) {
1076 std::swap(lij_matrix_, lij_matrix_next_);
1077 }
1078
1079 SynchronizationDispatch synchronization_dispatch([&]() {
1080 if (!last_round) {
1081 lij_matrix_next_.update_ghost_rows_start(channel++);
1082 lij_matrix_next_.update_ghost_rows_finish();
1083 }
1084 });
1085
1086 RYUJIN_PARALLEL_REGION_BEGIN
1087 LIKWID_MARKER_START(("time_step_" + std::to_string(step_no)).c_str());
1088
1089 auto loop = [&](auto sentinel, unsigned int left, unsigned int right) {
1090 using T = decltype(sentinel);
1091 using View =
1092 typename Description::template HyperbolicSystemView<dim, T>;
1093 using Limiter = typename Description::template Limiter<dim, T>;
1094
1095 unsigned int stride_size = get_stride_size<T>;
1096
1097 /* Stored thread locally: */
1098 AlignedVector<T> lij_row;
1099 Limiter limiter(
1100 *hyperbolic_system_, limiter_parameters_, old_precomputed);
1101 bool thread_ready = false;
1102
1103 RYUJIN_OMP_FOR
1104 for (unsigned int i = left; i < right; i += stride_size) {
1105
1106 /* Skip constrained degrees of freedom: */
1107 const unsigned int row_length = sparsity_simd.row_length(i);
1108 if (row_length == 1)
1109 continue;
1110
1111 synchronization_dispatch.check(
1112 thread_ready, i >= n_export_indices && i < n_internal);
1113
1114 auto U_i_new = new_U.template get_tensor<T>(i);
1115
1116 const Number lambda = Number(1.) / Number(row_length - 1);
1117 lij_row.resize_fast(row_length);
1118
1119 /* Skip diagonal. */
1120 for (unsigned int col_idx = 1; col_idx < row_length; ++col_idx) {
1121
1122 const auto l_ij = std::min(
1123 lij_matrix_.template get_entry<T>(i, col_idx),
1124 lij_matrix_.template get_transposed_entry<T>(i, col_idx));
1125
1126 const auto p_ij = pij_matrix_.template get_tensor<T>(i, col_idx);
1127
1128 U_i_new += l_ij * lambda * p_ij;
1129
1130 if (!last_round)
1131 lij_row[col_idx] = l_ij;
1132 }
1133
1134#ifdef DEBUG_EXPENSIVE_BOUNDS_CHECK
1135 const auto view = hyperbolic_system_->template view<dim, T>();
1136 if (!view.is_admissible(U_i_new)) {
1137 restart_needed = true;
1138 }
1139#endif
1140
1141 new_U.template write_tensor<T>(U_i_new, i);
1142
1143 /* Skip computing l_ij and updating p_ij in the last round */
1144 if (last_round)
1145 continue;
1146
1147 const auto bounds =
1148 bounds_.template get_tensor<T, std::array<T, n_bounds>>(i);
1149 /* Skip diagonal. */
1150 for (unsigned int col_idx = 1; col_idx < row_length; ++col_idx) {
1151
1152 const auto old_l_ij = lij_row[col_idx];
1153
1154 const auto new_p_ij =
1155 (T(1.) - old_l_ij) *
1156 pij_matrix_.template get_tensor<T>(i, col_idx);
1157
1158 const auto &[new_l_ij, success] =
1159 limiter.limit(bounds, U_i_new, new_p_ij);
1160
1161 /*
1162 * This is the second pass of the limiter. Under rare
1163 * circumstances the previous high-order update might be
1164 * slightly out of bounds due to roundoff errors. This happens
1165 * for example in flat regions or in stagnation points at a
1166 * (slip boundary) point. The limiter should ensure that we do
1167 * not further manipulate the state in this case. We thus only
1168 * signal a restart condition if the `DEBUG_EXPENSIVE_BOUNDS_CHECK`
1169 * macro is defined.
1170 */
1171#ifdef DEBUG_EXPENSIVE_BOUNDS_CHECK
1172 if (!success)
1173 restart_needed = true;
1174#endif
1175
1176 /*
1177 * Shortcut: We omit updating the p_ij and q_ij matrices and
1178 * simply write (1 - l_ij^(1)) * l_ij^(2) into the l_ij matrix.
1179 *
1180 * This approach only works for at most two limiting steps.
1181 */
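/*
 * That is, the second pass would apply
 *   U_i += l_ij^(2) * lambda * (1 - l_ij^(1)) * P_ij,
 * so storing (1 - l_ij^(1)) * l_ij^(2) in the l_ij matrix reproduces
 * exactly this update with the unmodified P_ij matrix.
 */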
1182 const auto entry = (T(1.) - old_l_ij) * new_l_ij;
1183 lij_matrix_next_.write_entry(entry, i, col_idx, true);
1184 }
1185 }
1186 };
1187
1188 /* Parallel non-vectorized loop: */
1189 loop(Number(), n_internal, n_owned);
1190 /* Parallel vectorized SIMD loop: */
1191 loop(VA(), 0, n_internal);
1192
1193 LIKWID_MARKER_STOP(("time_step_" + std::to_string(step_no)).c_str());
1194 RYUJIN_PARALLEL_REGION_END
1195 } /* limiter_iter_ */
1196
1198
1199 /*
1200 * Pass through the parabolic state vector
1201 */
1202 const auto &old_V = std::get<2>(old_state_vector);
1203 auto &new_V = std::get<2>(new_state_vector);
1204 new_V = old_V;
1205
1206 /*
1207 * Do we have to restart?
1208 */
1209
1210 {
1211 Scope scope(computing_timer_,
1212 "time step [H] _ - synchronization barriers");
1213
1214 /*
1215 * Synchronize whether we have to restart the time step. Even though
1216 * the restart condition itself only affects the local ensemble we
1217 * nevertheless need to synchronize the boolean in case we perform
1218 * synchronized global time steps. (Otherwise different ensembles
1219 * might end up with a different time step.)
1220 */
1221 restart_needed.store(Utilities::MPI::logical_or(
1222 restart_needed.load(), mpi_ensemble_.synchronization_communicator()));
1223 }
1224
1225 if (restart_needed) {
1226 switch (id_violation_strategy_) {
1227 case IDViolationStrategy::warn:
1228 n_warnings_++;
1229#ifdef DEBUG_OUTPUT
1230 std::cout << " raised warning, CFL/IDP violation encountered "
1231 << std::endl;
1232#endif
1233 break;
1234 case IDViolationStrategy::raise_exception:
1235 n_restarts_++;
1236 /* Suggest a restart with tau_max: */
1237#ifdef DEBUG_OUTPUT
1238 std::cout << " signalling restart (suggested_tau_max = "
1239 << tau_max << ")" << std::endl;
1240#endif
1241 throw Restart{tau_max};
1242 }
1243 }
1244
1245 /* In debug mode poison precomputed values: */
1246 Vectors::debug_poison_precomputed_values<Description>(new_state_vector,
1247 *offline_data_);
1248
1249 /* Return the time step size tau: */
1250 return tau;
1251 }
1252
1253} /* namespace ryujin */