ryujin 2.1.1 revision dbf0e3ba7acdb60b6d558e4257815df4a8f8daf9
sparse_matrix_simd.h
//
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Copyright (C) 2020 - 2024 by the ryujin authors
//

#pragma once

#include <compile_time_options.h>

#include <deal.II/base/aligned_vector.h>
#include <deal.II/base/partitioner.h>
#include <deal.II/lac/dynamic_sparsity_pattern.h>

namespace ryujin
{
  namespace
  {
    /*
     * A small helper to select the entry type of the matrix: a rank-1
     * tensor for n_components > 1, and a plain scalar in the case of a
     * single component.
     */
    template <typename Number, int n_components>
    struct EntryType {
      using type = dealii::Tensor<1, n_components, Number>;
    };

    template <typename Number>
    struct EntryType<Number, 1> {
      using type = Number;
    };
  } // namespace
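
  /*
   * For illustration: the helper resolves as follows.
   *
   *   static_assert(std::is_same_v<EntryType<double, 3>::type,
   *                                dealii::Tensor<1, 3, double>>);
   *   static_assert(std::is_same_v<EntryType<double, 1>::type, double>);
   */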


  /* Forward declaration: */
  template <typename Number,
            int n_components = 1,
            int simd_length = dealii::VectorizedArray<Number>::size()>
  class SparseMatrixSIMD;
  /*
   * A specialized sparsity pattern that provides SIMD-optimized access:
   * rows in the "internal" index range [0, n_internal_dofs) are stored
   * in groups of simd_length interleaved rows; all remaining rows are
   * stored in ordinary CSR format.
   */
  template <int simd_length>
  class SparsityPatternSIMD
  {
  public:
    SparsityPatternSIMD();

    SparsityPatternSIMD(
        const unsigned int n_internal_dofs,
        const dealii::DynamicSparsityPattern &sparsity,
        const std::shared_ptr<const dealii::Utilities::MPI::Partitioner>
            &partitioner);

    void reinit(const unsigned int n_internal_dofs,
                const dealii::DynamicSparsityPattern &sparsity,
                const std::shared_ptr<const dealii::Utilities::MPI::Partitioner>
                    &partitioner);

    unsigned int stride_of_row(const unsigned int row) const;

    const unsigned int *columns(const unsigned int row) const;

    unsigned int row_length(const unsigned int row) const;

    unsigned int n_rows() const;

    std::size_t n_nonzero_elements() const;

  protected:
    unsigned int n_internal_dofs;
    unsigned int n_locally_owned_dofs;
    std::shared_ptr<const dealii::Utilities::MPI::Partitioner> partitioner;

    dealii::AlignedVector<std::size_t> row_starts;
    dealii::AlignedVector<unsigned int> column_indices;
    dealii::AlignedVector<unsigned int> indices_transposed;

    /* Entries (row, position within row) that have to be sent to other
     * MPI ranks: */
    std::vector<std::pair<unsigned int, unsigned int>> entries_to_be_sent;

    /* Pairs (MPI rank, end offset into the send buffer) describing
     * outgoing messages: */
    std::vector<std::pair<unsigned int, unsigned int>> send_targets;

    /* Pairs (MPI rank, end offset into the receive region) describing
     * incoming messages: */
    std::vector<std::pair<unsigned int, unsigned int>> receive_targets;

    MPI_Comm mpi_communicator;

    template <typename, int, int>
    friend class SparseMatrixSIMD;
  };
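
  /*
   * A note on the storage layout, as a sketch inferred from the
   * accessors defined below: rows in [0, n_internal_dofs) are grouped
   * into blocks of simd_length rows whose entries are interleaved, so
   * that one SIMD load touches the same entry of simd_length
   * consecutive rows. For a block starting at row r with simd_length =
   * 4, the column indices are laid out as
   *
   *   [ j(r,0) j(r+1,0) j(r+2,0) j(r+3,0) | j(r,1) j(r+1,1) ... ]
   *
   * whereas rows in [n_internal_dofs, n_rows()) are stored contiguously
   * with stride 1.
   */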


  /*
   * A sparse matrix with a fixed SparsityPatternSIMD sparsity pattern.
   * Entries are scalars for n_components == 1 and rank-1 tensors for
   * n_components > 1.
   */
  template <typename Number, int n_components, int simd_length>
  class SparseMatrixSIMD
  {
  public:
    SparseMatrixSIMD();

    SparseMatrixSIMD(const SparsityPatternSIMD<simd_length> &sparsity);

    void reinit(const SparsityPatternSIMD<simd_length> &sparsity);

    template <typename SparseMatrix>
    void read_in(const std::array<SparseMatrix, n_components> &sparse_matrix,
                 bool locally_indexed = true);

    template <typename SparseMatrix>
    void read_in(const SparseMatrix &sparse_matrix,
                 bool locally_indexed = true);

    using VectorizedArray = dealii::VectorizedArray<Number, simd_length>;

    template <typename Number2>
    using EntryType = typename ryujin::EntryType<Number2, n_components>::type;

    /* Get scalar or tensor-valued entry: */

    template <typename Number2 = Number>
    EntryType<Number2>
    get_entry(const unsigned int row,
              const unsigned int position_within_column) const;

    template <typename Number2 = Number>
    dealii::Tensor<1, n_components, Number2>
    get_tensor(const unsigned int row,
               const unsigned int position_within_column) const;

    /* Get transposed scalar or tensor-valued entry: */

    template <typename Number2 = Number>
    EntryType<Number2>
    get_transposed_entry(const unsigned int row,
                         const unsigned int position_within_column) const;

    template <typename Number2 = Number>
    dealii::Tensor<1, n_components, Number2>
    get_transposed_tensor(const unsigned int row,
                          const unsigned int position_within_column) const;

    /* Write scalar or tensor entry: */

    template <typename Number2 = Number>
    void write_entry(const Number2 entry,
                     const unsigned int row,
                     const unsigned int position_within_column,
                     const bool do_streaming_store = false);

    template <typename Number2 = Number>
    void write_entry(const dealii::Tensor<1, n_components, Number2> &entry,
                     const unsigned int row,
                     const unsigned int position_within_column,
                     const bool do_streaming_store = false);

    /* Synchronize over MPI ranks: */

    void update_ghost_rows_start(const unsigned int communication_channel = 0);

    void update_ghost_rows_finish();

    void update_ghost_rows();

  protected:
    const SparsityPatternSIMD<simd_length> *sparsity;

    dealii::AlignedVector<Number> data;
    dealii::AlignedVector<Number> exchange_buffer;
    std::vector<MPI_Request> requests;
  };
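
  /*
   * A minimal usage sketch (illustration only; `pattern`,
   * `dealii_matrix`, `row`, and `k` are hypothetical objects and
   * indices set up elsewhere):
   *
   *   SparseMatrixSIMD<double> matrix;
   *   matrix.reinit(pattern);         // fix the sparsity pattern
   *   matrix.read_in(dealii_matrix);  // copy entries from a deal.II matrix
   *
   *   const double a_ij = matrix.get_entry(row, k);
   *   matrix.write_entry(2. * a_ij, row, k);
   *   matrix.update_ghost_rows();     // synchronize over MPI ranks
   */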


  /*
   * Inline function definitions:
   */

  template <int simd_length>
  DEAL_II_ALWAYS_INLINE inline unsigned int
  SparsityPatternSIMD<simd_length>::stride_of_row(const unsigned int row) const
  {
    AssertIndexRange(row, row_starts.size());

    if (row < n_internal_dofs)
      return simd_length;
    else
      return 1;
  }


  template <int simd_length>
  DEAL_II_ALWAYS_INLINE inline const unsigned int *
  SparsityPatternSIMD<simd_length>::columns(const unsigned int row) const
  {
    AssertIndexRange(row, row_starts.size() - 1);

    if (row < n_internal_dofs)
      return column_indices.data() + row_starts[row / simd_length] +
             row % simd_length;
    else
      return column_indices.data() + row_starts[row];
  }


  template <int simd_length>
  DEAL_II_ALWAYS_INLINE inline unsigned int
  SparsityPatternSIMD<simd_length>::row_length(const unsigned int row) const
  {
    AssertIndexRange(row, row_starts.size() - 1);

    if (row < n_internal_dofs) {
      const unsigned int simd_row = row / simd_length;
      return (row_starts[simd_row + 1] - row_starts[simd_row]) / simd_length;
    } else {
      return row_starts[row + 1] - row_starts[row];
    }
  }
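
  /*
   * Iteration sketch (illustration only; `sparsity` and `row` are
   * hypothetical): consecutive entries of one row sit stride_of_row(row)
   * apart in memory, so a generic traversal of the column indices of a
   * row reads
   *
   *   const unsigned int *js = sparsity.columns(row);
   *   const unsigned int stride = sparsity.stride_of_row(row);
   *   for (unsigned int k = 0; k < sparsity.row_length(row); ++k) {
   *     const unsigned int j = js[k * stride];
   *     // ... use column index j ...
   *   }
   */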


  template <int simd_length>
  DEAL_II_ALWAYS_INLINE inline unsigned int
  SparsityPatternSIMD<simd_length>::n_rows() const
  {
    return row_starts.size() - 1;
  }


  template <int simd_length>
  DEAL_II_ALWAYS_INLINE inline std::size_t
  SparsityPatternSIMD<simd_length>::n_nonzero_elements() const
  {
    Assert(row_starts.size() > 0, dealii::ExcNotInitialized());

    return row_starts.back();
  }


  template <typename Number, int n_components, int simd_length>
  template <typename Number2>
  DEAL_II_ALWAYS_INLINE inline auto
  SparseMatrixSIMD<Number, n_components, simd_length>::get_entry(
      const unsigned int row, const unsigned int position_within_column) const
      -> EntryType<Number2>
  {
    const auto result = get_tensor<Number2>(row, position_within_column);
    if constexpr (n_components == 1)
      return result[0];
    else
      return result;
  }


  template <typename Number, int n_components, int simd_length>
  template <typename Number2>
  DEAL_II_ALWAYS_INLINE inline dealii::Tensor<1, n_components, Number2>
  SparseMatrixSIMD<Number, n_components, simd_length>::get_tensor(
      const unsigned int row, const unsigned int position_within_column) const
  {
    Assert(sparsity != nullptr, dealii::ExcNotInitialized());
    AssertIndexRange(row, sparsity->row_starts.size() - 1);
    AssertIndexRange(position_within_column, sparsity->row_length(row));

    dealii::Tensor<1, n_components, Number2> result;

    if constexpr (std::is_same<Number, Number2>::value) {
      /*
       * Non-vectorized slow access. Supports all row indices in
       * [0, n_owned)
       */
      if (row < sparsity->n_internal_dofs) {
        // go through vectorized part
        const unsigned int simd_row = row / simd_length;
        const unsigned int simd_offset = row % simd_length;
        for (unsigned int d = 0; d < n_components; ++d)
          result[d] = data[(sparsity->row_starts[simd_row] +
                            position_within_column * simd_length) *
                               n_components +
                           d * simd_length + simd_offset];
      } else {
        // go through standard part
        for (unsigned int d = 0; d < n_components; ++d)
          result[d] =
              data[(sparsity->row_starts[row] + position_within_column) *
                       n_components +
                   d];
      }

    } else if constexpr (std::is_same<VectorizedArray, Number2>::value) {
      /*
       * Vectorized fast access. Indices must be in the range
       * [0, n_internal), and the row index must be divisible by
       * simd_length.
       */

      Assert(row < sparsity->n_internal_dofs,
             dealii::ExcMessage(
                 "Vectorized access only possible in vectorized part"));
      Assert(row % simd_length == 0,
             dealii::ExcMessage(
                 "Access only supported for rows at the SIMD granularity"));

      const Number *load_pos =
          data.data() + (sparsity->row_starts[row / simd_length] +
                         position_within_column * simd_length) *
                            n_components;

      for (unsigned int d = 0; d < n_components; ++d)
        result[d].load(load_pos + d * simd_length);

    } else {
      /* not implemented */
      __builtin_trap();
    }

    return result;
  }
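
  /*
   * Access sketch (illustration only; `matrix`, `sparsity`, and
   * `n_internal` are hypothetical): over the internal range the matrix
   * can be traversed simd_length rows at a time, provided the row index
   * is aligned to the SIMD granularity:
   *
   *   using VA = dealii::VectorizedArray<Number, simd_length>;
   *   for (unsigned int row = 0; row < n_internal; row += simd_length)
   *     for (unsigned int k = 0; k < sparsity.row_length(row); ++k) {
   *       const auto entry = matrix.get_tensor<VA>(row, k);
   *       // entry[d] holds component d for rows [row, row + simd_length)
   *     }
   */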


  template <typename Number, int n_components, int simd_length>
  template <typename Number2>
  DEAL_II_ALWAYS_INLINE inline auto
  SparseMatrixSIMD<Number, n_components, simd_length>::get_transposed_entry(
      const unsigned int row, const unsigned int position_within_column) const
      -> EntryType<Number2>
  {
    const auto result =
        get_transposed_tensor<Number2>(row, position_within_column);
    if constexpr (n_components == 1)
      return result[0];
    else
      return result;
  }


  template <typename Number, int n_components, int simd_length>
  template <typename Number2>
  DEAL_II_ALWAYS_INLINE inline dealii::Tensor<1, n_components, Number2>
  SparseMatrixSIMD<Number, n_components, simd_length>::get_transposed_tensor(
      const unsigned int row, const unsigned int position_within_column) const
  {
    Assert(sparsity != nullptr, dealii::ExcNotInitialized());
    AssertIndexRange(row, sparsity->row_starts.size() - 1);
    AssertIndexRange(position_within_column, sparsity->row_length(row));

    dealii::Tensor<1, n_components, Number2> result;

    if constexpr (std::is_same<Number, Number2>::value) {
      /*
       * Non-vectorized slow access. Supports all row indices in
       * [0, n_owned)
       */

      if (row < sparsity->n_internal_dofs) {
        // go through vectorized part
        const unsigned int simd_row = row / simd_length;
        const unsigned int simd_offset = row % simd_length;
        const std::size_t index =
            sparsity->indices_transposed[sparsity->row_starts[simd_row] +
                                         simd_offset +
                                         position_within_column * simd_length];
        if (n_components > 1) {
          const unsigned int col =
              sparsity->column_indices[sparsity->row_starts[simd_row] +
                                       simd_offset +
                                       position_within_column * simd_length];
          if (col < sparsity->n_internal_dofs)
            for (unsigned int d = 0; d < n_components; ++d)
              result[d] =
                  data[index / simd_length * simd_length * n_components +
                       simd_length * d + index % simd_length];
          else
            for (unsigned int d = 0; d < n_components; ++d)
              result[d] = data[index * n_components + d];
        } else
          result[0] = data[index];
      } else {
        // go through standard part
        const std::size_t index =
            sparsity->indices_transposed[sparsity->row_starts[row] +
                                         position_within_column];
        if (n_components > 1) {
          const unsigned int col =
              sparsity->column_indices[sparsity->row_starts[row] +
                                       position_within_column];
          if (col < sparsity->n_internal_dofs)
            for (unsigned int d = 0; d < n_components; ++d)
              result[d] =
                  data[index / simd_length * simd_length * n_components +
                       simd_length * d + index % simd_length];
          else
            for (unsigned int d = 0; d < n_components; ++d)
              result[d] = data[index * n_components + d];
        } else
          result[0] = data[index];
      }

    } else if constexpr (std::is_same<VectorizedArray, Number2>::value &&
                         (n_components == 1)) {
      /*
       * Vectorized fast access. Indices must be in the range
       * [0, n_internal), and the row index must be divisible by
       * simd_length.
       */

      Assert(row < sparsity->n_internal_dofs,
             dealii::ExcMessage(
                 "Vectorized access only possible in vectorized part"));
      Assert(row % simd_length == 0,
             dealii::ExcMessage(
                 "Access only supported for rows at the SIMD granularity"));

      const unsigned int offset = sparsity->row_starts[row / simd_length] +
                                  position_within_column * simd_length;
      result[0].gather(data.data(),
                       sparsity->indices_transposed.data() + offset);

    } else {
      /* not implemented */
      __builtin_trap();
    }

    return result;
  }
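
  /*
   * In other words, as a paraphrase: if j denotes the k-th column index
   * of the given row,
   *
   *   // j = sparsity.columns(row)[k * sparsity.stride_of_row(row)]
   *   // get_transposed_tensor(row, k) yields the entry stored at (j, row)
   *
   * that is, the corresponding entry of the transposed matrix, located
   * via the precomputed indices_transposed lookup rather than a search
   * over row j.
   */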


  template <typename Number, int n_components, int simd_length>
  template <typename Number2>
  DEAL_II_ALWAYS_INLINE inline void
  SparseMatrixSIMD<Number, n_components, simd_length>::write_entry(
      const Number2 entry,
      const unsigned int row,
      const unsigned int position_within_column,
      const bool do_streaming_store)
  {
    static_assert(
        n_components == 1,
        "Attempted to write a scalar value into a tensor-valued matrix entry");

    Assert(sparsity != nullptr, dealii::ExcNotInitialized());
    AssertIndexRange(row, sparsity->row_starts.size() - 1);
    AssertIndexRange(position_within_column, sparsity->row_length(row));

    dealii::Tensor<1, n_components, Number2> result;
    result[0] = entry;

    write_entry<Number2>(
        result, row, position_within_column, do_streaming_store);
  }


  template <typename Number, int n_components, int simd_length>
  template <typename Number2>
  DEAL_II_ALWAYS_INLINE inline void
  SparseMatrixSIMD<Number, n_components, simd_length>::write_entry(
      const dealii::Tensor<1, n_components, Number2> &entry,
      const unsigned int row,
      const unsigned int position_within_column,
      const bool do_streaming_store)
  {
    Assert(sparsity != nullptr, dealii::ExcNotInitialized());
    AssertIndexRange(row, sparsity->row_starts.size() - 1);
    AssertIndexRange(position_within_column, sparsity->row_length(row));

    if constexpr (std::is_same<Number, Number2>::value) {
      /*
       * Non-vectorized slow access. Supports all row indices in
       * [0, n_owned)
       */

      if (row < sparsity->n_internal_dofs) {
        // go through vectorized part
        const unsigned int simd_row = row / simd_length;
        const unsigned int simd_offset = row % simd_length;
        for (unsigned int d = 0; d < n_components; ++d)
          data[(sparsity->row_starts[simd_row] +
                position_within_column * simd_length) *
                   n_components +
               d * simd_length + simd_offset] = entry[d];
      } else {
        // go through standard part
        for (unsigned int d = 0; d < n_components; ++d)
          data[(sparsity->row_starts[row] + position_within_column) *
                   n_components +
               d] = entry[d];
      }

    } else if constexpr (std::is_same<VectorizedArray, Number2>::value) {
      /*
       * Vectorized fast access. Indices must be in the range
       * [0, n_internal), and the row index must be divisible by
       * simd_length.
       */

      Assert(row < sparsity->n_internal_dofs,
             dealii::ExcMessage(
                 "Vectorized access only possible in vectorized part"));
      Assert(row % simd_length == 0,
             dealii::ExcMessage(
                 "Access only supported for rows at the SIMD granularity"));

      Number *store_pos =
          data.data() + (sparsity->row_starts[row / simd_length] +
                         position_within_column * simd_length) *
                            n_components;
      if (do_streaming_store)
        for (unsigned int d = 0; d < n_components; ++d)
          entry[d].streaming_store(store_pos + d * simd_length);
      else
        for (unsigned int d = 0; d < n_components; ++d)
          entry[d].store(store_pos + d * simd_length);

    } else {
      /* not implemented */
      __builtin_trap();
    }
  }
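
  /*
   * Write sketch (illustration only; `matrix`, `row`, and `k` are
   * hypothetical, and `value` stands for a quantity computed for
   * simd_length rows at once): streaming stores bypass the cache when
   * the written entries will not be read back soon:
   *
   *   using VA = dealii::VectorizedArray<Number, simd_length>;
   *   const VA value = ...;
   *   matrix.write_entry(value, row, k, true);  // streaming store
   */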


  template <typename Number, int n_components, int simd_length>
  inline void
  SparseMatrixSIMD<Number, n_components, simd_length>::update_ghost_rows_start(
      const unsigned int communication_channel)
  {
#ifdef DEAL_II_WITH_MPI
    AssertIndexRange(communication_channel, 200);

    const unsigned int mpi_tag =
        dealii::Utilities::MPI::internal::Tags::partitioner_export_start +
        communication_channel;
    Assert(mpi_tag <=
               dealii::Utilities::MPI::internal::Tags::partitioner_export_end,
           dealii::ExcInternalError());

    const std::size_t n_indices = sparsity->entries_to_be_sent.size();
    exchange_buffer.resize_fast(n_components * n_indices);

    requests.resize(sparsity->receive_targets.size() +
                    sparsity->send_targets.size());

    /*
     * Set up MPI receive requests. We will always receive data for
     * indices in the range [n_locally_owned, n_locally_relevant), so
     * the data is stored in non-vectorized CSR format.
     */

    {
      const auto &targets = sparsity->receive_targets;
      for (unsigned int p = 0; p < targets.size(); ++p) {
        const int ierr = MPI_Irecv(
            data.data() +
                n_components *
                    (sparsity->row_starts[sparsity->n_locally_owned_dofs] +
                     (p == 0 ? 0 : targets[p - 1].second)),
            (targets[p].second - (p == 0 ? 0 : targets[p - 1].second)) *
                n_components * sizeof(Number),
            MPI_BYTE,
            targets[p].first,
            mpi_tag,
            sparsity->mpi_communicator,
            &requests[p]);
        AssertThrowMPI(ierr);
      }
    }

    /*
     * Copy all entries that we plan to send over to the exchange
     * buffer. Here, we have to be careful with indices falling into the
     * "locally internal" range, which are stored in an
     * array-of-struct-of-array layout.
     */

    for (std::size_t c = 0; c < n_indices; ++c) {

      const auto &[row, position_within_column] =
          sparsity->entries_to_be_sent[c];

      Assert(row < sparsity->n_locally_owned_dofs, dealii::ExcInternalError());

      if (row < sparsity->n_internal_dofs) {
        // go through vectorized part
        const unsigned int simd_row = row / simd_length;
        const unsigned int simd_offset = row % simd_length;
        for (unsigned int d = 0; d < n_components; ++d)
          exchange_buffer[n_components * c + d] =
              data[(sparsity->row_starts[simd_row] +
                    position_within_column * simd_length) *
                       n_components +
                   d * simd_length + simd_offset];
      } else {
        // go through standard part
        for (unsigned int d = 0; d < n_components; ++d)
          exchange_buffer[n_components * c + d] =
              data[(sparsity->row_starts[row] + position_within_column) *
                       n_components +
                   d];
      }
    }

    /*
     * Set up MPI send requests. We have copied everything we intend to
     * send into the exchange_buffer, in a layout compatible with the
     * CSR storage format of the receiving MPI rank.
     */

    {
      const auto &targets = sparsity->send_targets;
      for (unsigned int p = 0; p < targets.size(); ++p) {
        const int ierr = MPI_Isend(
            exchange_buffer.data() +
                n_components * (p == 0 ? 0 : targets[p - 1].second),
            (targets[p].second - (p == 0 ? 0 : targets[p - 1].second)) *
                n_components * sizeof(Number),
            MPI_BYTE,
            targets[p].first,
            mpi_tag,
            sparsity->mpi_communicator,
            &requests[p + sparsity->receive_targets.size()]);
        AssertThrowMPI(ierr);
      }
    }
#endif
  }
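
  /*
   * Communication sketch (illustration only): the start/finish split
   * allows overlapping the MPI exchange with local work:
   *
   *   matrix.update_ghost_rows_start();
   *   // ... work that only touches locally owned rows ...
   *   matrix.update_ghost_rows_finish();
   */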


  template <typename Number, int n_components, int simd_length>
  inline void
  SparseMatrixSIMD<Number, n_components, simd_length>::
      update_ghost_rows_finish()
  {
#ifdef DEAL_II_WITH_MPI
    const int ierr =
        MPI_Waitall(requests.size(), requests.data(), MPI_STATUSES_IGNORE);
    AssertThrowMPI(ierr);
#endif
  }


  template <typename Number, int n_components, int simd_length>
  inline void
  SparseMatrixSIMD<Number, n_components, simd_length>::update_ghost_rows()
  {
    update_ghost_rows_start();
    update_ghost_rows_finish();
  }

} // namespace ryujin