ryujin 2.1.1 revision 0348cbb53a3e4b1da2a4c037e81f88f2d21ce219
sparse_matrix_simd.h
//
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Copyright (C) 2020 - 2024 by the ryujin authors
//

#pragma once

#include <deal.II/base/aligned_vector.h>
#include <deal.II/base/partitioner.h>
#include <deal.II/lac/dynamic_sparsity_pattern.h>

#include "openmp.h"
#include "simd.h"

namespace ryujin
{
  namespace
  {
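    /*
     * Helper trait that selects the value type of a matrix entry: a
     * rank-1 tensor with n_components components in general, collapsed
     * to a plain scalar Number in the case n_components == 1.
     */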
    template <typename Number, int n_components>
    struct EntryType {
      using type = dealii::Tensor<1, n_components, Number>;
    };

    template <typename Number>
    struct EntryType<Number, 1> {
      using type = Number;
    };
  } // namespace


  template <typename Number,
            int n_components = 1,
            int simd_length = dealii::VectorizedArray<Number>::size()>
  class SparseMatrixSIMD;
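
  /*
   * A specialized sparsity pattern for SIMD access: the first
   * n_internal_dofs rows are stored interleaved in groups of simd_length
   * rows, so that the entries at a fixed position within the row of
   * simd_length consecutive rows are contiguous in memory; all remaining
   * locally relevant rows are stored in ordinary CSR format. This is the
   * layout that the index arithmetic in the accessors below assumes.
   */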
  template <int simd_length>
  class SparsityPatternSIMD
  {
  public:
    SparsityPatternSIMD();

    SparsityPatternSIMD(
        const unsigned int n_internal_dofs,
        const dealii::DynamicSparsityPattern &sparsity,
        const std::shared_ptr<const dealii::Utilities::MPI::Partitioner>
            &partitioner);

    void reinit(const unsigned int n_internal_dofs,
                const dealii::DynamicSparsityPattern &sparsity,
                const std::shared_ptr<const dealii::Utilities::MPI::Partitioner>
                    &partitioner);

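    /*
     * Return the storage stride of the given row: entries of rows in the
     * interleaved region [0, n_internal_dofs) are simd_length apart,
     * entries of all other rows are consecutive.
     */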
    unsigned int stride_of_row(const unsigned int row) const;

    const unsigned int *columns(const unsigned int row) const;

    unsigned int row_length(const unsigned int row) const;

    unsigned int n_rows() const;

    std::size_t n_nonzero_elements() const;

  protected:
    unsigned int n_internal_dofs;
    unsigned int n_locally_owned_dofs;
    std::shared_ptr<const dealii::Utilities::MPI::Partitioner> partitioner;

    dealii::AlignedVector<std::size_t> row_starts;
    dealii::AlignedVector<unsigned int> column_indices;
    dealii::AlignedVector<unsigned int> indices_transposed;
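
    /*
     * Bookkeeping for the MPI exchange of ghost rows: (row, position)
     * pairs of locally owned entries that have to be sent to neighboring
     * ranks, and per-rank (mpi rank, end offset) pairs delimiting the
     * corresponding regions of the send and receive buffers.
     */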
    std::vector<std::pair<unsigned int, unsigned int>> entries_to_be_sent;

    std::vector<std::pair<unsigned int, unsigned int>> send_targets;

    std::vector<std::pair<unsigned int, unsigned int>> receive_targets;

    MPI_Comm mpi_communicator;

    template <typename, int, int>
    friend class SparseMatrixSIMD;
  };
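  /*
   * A sparse matrix with n_components components per nonzero entry,
   * stored in the AoSoA layout prescribed by SparsityPatternSIMD: within
   * the interleaved region the matrix can be read and written simd_length
   * rows at a time through the VectorizedArray overloads below.
   */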
  template <typename Number, int n_components, int simd_length>
  class SparseMatrixSIMD
  {
  public:
    SparseMatrixSIMD();

    SparseMatrixSIMD(const SparsityPatternSIMD<simd_length> &sparsity);

    void reinit(const SparsityPatternSIMD<simd_length> &sparsity);

    template <typename SparseMatrix>
    void read_in(const std::array<SparseMatrix, n_components> &sparse_matrix,
                 bool locally_indexed = true);

    template <typename SparseMatrix>
    void read_in(const SparseMatrix &sparse_matrix,
                 bool locally_indexed = true);

    using VectorizedArray = dealii::VectorizedArray<Number, simd_length>;

    template <typename Number2>
    using EntryType = typename ryujin::EntryType<Number2, n_components>::type;

    /* Get scalar or tensor-valued entry: */

    template <typename Number2 = Number>
    EntryType<Number2>
    get_entry(const unsigned int row,
              const unsigned int position_within_column) const;

    template <typename Number2 = Number>
    dealii::Tensor<1, n_components, Number2>
    get_tensor(const unsigned int row,
               const unsigned int position_within_column) const;

    /* Get transposed scalar or tensor-valued entry: */

    template <typename Number2 = Number>
    EntryType<Number2>
    get_transposed_entry(const unsigned int row,
                         const unsigned int position_within_column) const;

    template <typename Number2 = Number>
    dealii::Tensor<1, n_components, Number2>
    get_transposed_tensor(const unsigned int row,
                          const unsigned int position_within_column) const;

    /* Write scalar or tensor entry: */

    template <typename Number2 = Number>
    void write_entry(const Number2 entry,
                     const unsigned int row,
                     const unsigned int position_within_column,
                     const bool do_streaming_store = false);

    template <typename Number2 = Number>
    void write_entry(const dealii::Tensor<1, n_components, Number2> &entry,
                     const unsigned int row,
                     const unsigned int position_within_column,
                     const bool do_streaming_store = false);

    /* Synchronize over MPI ranks: */

    void update_ghost_rows_start(const unsigned int communication_channel = 0);

    void update_ghost_rows_finish();

    void update_ghost_rows();

  protected:
    const SparsityPatternSIMD<simd_length> *sparsity;

    dealii::AlignedVector<Number> data;
    dealii::AlignedVector<Number> exchange_buffer;
    std::vector<MPI_Request> requests;
  };
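
  /*
   * A minimal usage sketch (an illustration, not part of the interface;
   * `dsp`, `partitioner`, `n_internal`, and `mass_matrix` are assumed to
   * be set up by the caller):
   *
   *   constexpr int simd_length = dealii::VectorizedArray<double>::size();
   *
   *   SparsityPatternSIMD<simd_length> pattern(n_internal, dsp, partitioner);
   *   SparseMatrixSIMD<double> matrix(pattern);
   *   matrix.read_in(mass_matrix);
   *   matrix.update_ghost_rows();
   *
   *   using VA = dealii::VectorizedArray<double>;
   *   for (unsigned int row = 0; row < n_internal; row += simd_length)
   *     for (unsigned int j = 0; j < pattern.row_length(row); ++j) {
   *       const VA m_ij = matrix.get_entry<VA>(row, j);
   *       // ... process simd_length rows at once ...
   *     }
   */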

  /*
   * Inline function definitions:
   */


  template <int simd_length>
  DEAL_II_ALWAYS_INLINE inline unsigned int
  SparsityPatternSIMD<simd_length>::stride_of_row(const unsigned int row) const
  {
    AssertIndexRange(row, row_starts.size());

    if (row < n_internal_dofs)
      return simd_length;
    else
      return 1;
  }


  template <int simd_length>
  DEAL_II_ALWAYS_INLINE inline const unsigned int *
  SparsityPatternSIMD<simd_length>::columns(const unsigned int row) const
  {
    AssertIndexRange(row, row_starts.size() - 1);

    if (row < n_internal_dofs)
      return column_indices.data() + row_starts[row / simd_length] +
             row % simd_length;
    else
      return column_indices.data() + row_starts[row];
  }


  template <int simd_length>
  DEAL_II_ALWAYS_INLINE inline unsigned int
  SparsityPatternSIMD<simd_length>::row_length(const unsigned int row) const
  {
    AssertIndexRange(row, row_starts.size() - 1);

    if (row < n_internal_dofs) {
      const unsigned int simd_row = row / simd_length;
      return (row_starts[simd_row + 1] - row_starts[simd_row]) / simd_length;
    } else {
      return row_starts[row + 1] - row_starts[row];
    }
  }


  template <int simd_length>
  DEAL_II_ALWAYS_INLINE inline unsigned int
  SparsityPatternSIMD<simd_length>::n_rows() const
  {
    return row_starts.size() - 1;
  }


  template <int simd_length>
  DEAL_II_ALWAYS_INLINE inline std::size_t
  SparsityPatternSIMD<simd_length>::n_nonzero_elements() const
  {
    Assert(row_starts.size() > 0, dealii::ExcNotInitialized());

    return row_starts.back();
  }


  template <typename Number, int n_components, int simd_length>
  template <typename Number2>
  DEAL_II_ALWAYS_INLINE inline auto
  SparseMatrixSIMD<Number, n_components, simd_length>::get_entry(
      const unsigned int row, const unsigned int position_within_column) const
      -> EntryType<Number2>
  {
    const auto result = get_tensor<Number2>(row, position_within_column);
    if constexpr (n_components == 1)
      return result[0];
    else
      return result;
  }


  template <typename Number, int n_components, int simd_length>
  template <typename Number2>
  DEAL_II_ALWAYS_INLINE inline dealii::Tensor<1, n_components, Number2>
  SparseMatrixSIMD<Number, n_components, simd_length>::get_tensor(
      const unsigned int row, const unsigned int position_within_column) const
  {
    Assert(sparsity != nullptr, dealii::ExcNotInitialized());
    AssertIndexRange(row, sparsity->row_starts.size() - 1);
    AssertIndexRange(position_within_column, sparsity->row_length(row));

    dealii::Tensor<1, n_components, Number2> result;

    if constexpr (std::is_same<Number, Number2>::value) {
      /*
       * Non-vectorized slow access. Supports all row indices in
       * [0, n_owned)
       */
      if (row < sparsity->n_internal_dofs) {
        // go through vectorized part
        const unsigned int simd_row = row / simd_length;
        const unsigned int simd_offset = row % simd_length;
        for (unsigned int d = 0; d < n_components; ++d)
          result[d] = data[(sparsity->row_starts[simd_row] +
                            position_within_column * simd_length) *
                               n_components +
                           d * simd_length + simd_offset];
      } else {
        // go through standard part
        for (unsigned int d = 0; d < n_components; ++d)
          result[d] =
              data[(sparsity->row_starts[row] + position_within_column) *
                       n_components +
                   d];
      }

    } else if constexpr (std::is_same<VectorizedArray, Number2>::value) {
      /*
       * Vectorized fast access. Indices must be in the range
       * [0, n_internal), the row index must be divisible by simd_length.
       */

      Assert(row < sparsity->n_internal_dofs,
             dealii::ExcMessage(
                 "Vectorized access only possible in vectorized part"));
      Assert(row % simd_length == 0,
             dealii::ExcMessage(
                 "Access only supported for rows at the SIMD granularity"));

      const Number *load_pos =
          data.data() + (sparsity->row_starts[row / simd_length] +
                         position_within_column * simd_length) *
                            n_components;

      for (unsigned int d = 0; d < n_components; ++d)
        result[d].load(load_pos + d * simd_length);

    } else {
      /* not implemented */
      __builtin_trap();
    }

    return result;
  }


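  /*
   * Worked example of the interleaved layout (simd_length = 4, one
   * component): the entries at position j within the rows 4k, ..., 4k + 3
   * are stored contiguously as
   *
   *   data[row_starts[k] + 4 * j + 0], ..., data[row_starts[k] + 4 * j + 3],
   *
   * which is precisely what the vectorized branch above reads with a
   * single aligned load.
   */
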
  template <typename Number, int n_components, int simd_length>
  template <typename Number2>
  DEAL_II_ALWAYS_INLINE inline auto
  SparseMatrixSIMD<Number, n_components, simd_length>::get_transposed_entry(
      const unsigned int row, const unsigned int position_within_column) const
      -> EntryType<Number2>
  {
    const auto result =
        get_transposed_tensor<Number2>(row, position_within_column);
    if constexpr (n_components == 1)
      return result[0];
    else
      return result;
  }


  template <typename Number, int n_components, int simd_length>
  template <typename Number2>
  DEAL_II_ALWAYS_INLINE inline dealii::Tensor<1, n_components, Number2>
  SparseMatrixSIMD<Number, n_components, simd_length>::get_transposed_tensor(
      const unsigned int row, const unsigned int position_within_column) const
  {
    Assert(sparsity != nullptr, dealii::ExcNotInitialized());
    AssertIndexRange(row, sparsity->row_starts.size() - 1);
    AssertIndexRange(position_within_column, sparsity->row_length(row));

    dealii::Tensor<1, n_components, Number2> result;

    if constexpr (std::is_same<Number, Number2>::value) {
      /*
       * Non-vectorized slow access. Supports all row indices in
       * [0, n_owned)
       */

      if (row < sparsity->n_internal_dofs) {
        // go through vectorized part
        const unsigned int simd_row = row / simd_length;
        const unsigned int simd_offset = row % simd_length;
        const std::size_t index =
            sparsity->indices_transposed[sparsity->row_starts[simd_row] +
                                         simd_offset +
                                         position_within_column * simd_length];
        if (n_components > 1) {
          const unsigned int col =
              sparsity->column_indices[sparsity->row_starts[simd_row] +
                                       simd_offset +
                                       position_within_column * simd_length];
          if (col < sparsity->n_internal_dofs)
            for (unsigned int d = 0; d < n_components; ++d)
              result[d] =
                  data[index / simd_length * simd_length * n_components +
                       simd_length * d + index % simd_length];
          else
            for (unsigned int d = 0; d < n_components; ++d)
              result[d] = data[index * n_components + d];
        } else
          result[0] = data[index];
      } else {
        // go through standard part
        const std::size_t index =
            sparsity->indices_transposed[sparsity->row_starts[row] +
                                         position_within_column];
        if (n_components > 1) {
          const unsigned int col =
              sparsity->column_indices[sparsity->row_starts[row] +
                                       position_within_column];
          if (col < sparsity->n_internal_dofs)
            for (unsigned int d = 0; d < n_components; ++d)
              result[d] =
                  data[index / simd_length * simd_length * n_components +
                       simd_length * d + index % simd_length];
          else
            for (unsigned int d = 0; d < n_components; ++d)
              result[d] = data[index * n_components + d];
        } else
          result[0] = data[index];
      }

    } else if constexpr (std::is_same<VectorizedArray, Number2>::value &&
                         (n_components == 1)) {
      /*
       * Vectorized fast access. Indices must be in the range
       * [0, n_internal), the row index must be divisible by simd_length.
       */

      Assert(row < sparsity->n_internal_dofs,
             dealii::ExcMessage(
                 "Vectorized access only possible in vectorized part"));
      Assert(row % simd_length == 0,
             dealii::ExcMessage(
                 "Access only supported for rows at the SIMD granularity"));

      const unsigned int offset = sparsity->row_starts[row / simd_length] +
                                  position_within_column * simd_length;
      result[0].gather(data.data(),
                       sparsity->indices_transposed.data() + offset);

    } else {
      /* not implemented */
      __builtin_trap();
    }

    return result;
  }


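  /*
   * The transposed accessors return the entry a_ji that mirrors a given
   * a_ij: indices_transposed maps the storage index of a_ij to that of
   * a_ji, so in the vectorized branch above the lookup reduces to a
   * single gather over simd_length precomputed indices.
   */
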
  template <typename Number, int n_components, int simd_length>
  template <typename Number2>
  DEAL_II_ALWAYS_INLINE inline void
  SparseMatrixSIMD<Number, n_components, simd_length>::write_entry(
      const Number2 entry,
      const unsigned int row,
      const unsigned int position_within_column,
      const bool do_streaming_store)
  {
    static_assert(
        n_components == 1,
        "Attempted to write a scalar value into a tensor-valued matrix entry");

    Assert(sparsity != nullptr, dealii::ExcNotInitialized());
    AssertIndexRange(row, sparsity->row_starts.size() - 1);
    AssertIndexRange(position_within_column, sparsity->row_length(row));

    dealii::Tensor<1, n_components, Number2> result;
    result[0] = entry;

    write_entry<Number2>(
        result, row, position_within_column, do_streaming_store);
  }


  template <typename Number, int n_components, int simd_length>
  template <typename Number2>
  DEAL_II_ALWAYS_INLINE inline void
  SparseMatrixSIMD<Number, n_components, simd_length>::write_entry(
      const dealii::Tensor<1, n_components, Number2> &entry,
      const unsigned int row,
      const unsigned int position_within_column,
      const bool do_streaming_store)
  {
    Assert(sparsity != nullptr, dealii::ExcNotInitialized());
    AssertIndexRange(row, sparsity->row_starts.size() - 1);
    AssertIndexRange(position_within_column, sparsity->row_length(row));

    if constexpr (std::is_same<Number, Number2>::value) {
      /*
       * Non-vectorized slow access. Supports all row indices in
       * [0, n_owned)
       */

      if (row < sparsity->n_internal_dofs) {
        // go through vectorized part
        const unsigned int simd_row = row / simd_length;
        const unsigned int simd_offset = row % simd_length;
        for (unsigned int d = 0; d < n_components; ++d)
          data[(sparsity->row_starts[simd_row] +
                position_within_column * simd_length) *
                   n_components +
               d * simd_length + simd_offset] = entry[d];
      } else {
        // go through standard part
        for (unsigned int d = 0; d < n_components; ++d)
          data[(sparsity->row_starts[row] + position_within_column) *
                   n_components +
               d] = entry[d];
      }

    } else if constexpr (std::is_same<VectorizedArray, Number2>::value) {
      /*
       * Vectorized fast access. Indices must be in the range
       * [0, n_internal), the row index must be divisible by simd_length.
       */

      Assert(row < sparsity->n_internal_dofs,
             dealii::ExcMessage(
                 "Vectorized access only possible in vectorized part"));
      Assert(row % simd_length == 0,
             dealii::ExcMessage(
                 "Access only supported for rows at the SIMD granularity"));

      Number *store_pos =
          data.data() + (sparsity->row_starts[row / simd_length] +
                         position_within_column * simd_length) *
                            n_components;
      if (do_streaming_store)
        for (unsigned int d = 0; d < n_components; ++d)
          entry[d].streaming_store(store_pos + d * simd_length);
      else
        for (unsigned int d = 0; d < n_components; ++d)
          entry[d].store(store_pos + d * simd_length);

    } else {
      /* not implemented */
      __builtin_trap();
    }
  }


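  /*
   * Note on do_streaming_store: VectorizedArray::streaming_store() issues
   * a non-temporal store that bypasses the caches, which can pay off when
   * a matrix is assembled once and not read back until much later.
   */
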
  template <typename Number, int n_components, int simd_length>
  inline void
  SparseMatrixSIMD<Number, n_components, simd_length>::update_ghost_rows_start(
      const unsigned int communication_channel)
  {
#ifdef DEAL_II_WITH_MPI
    AssertIndexRange(communication_channel, 200);

    const unsigned int mpi_tag =
        dealii::Utilities::MPI::internal::Tags::partitioner_export_start +
        communication_channel;
    Assert(mpi_tag <=
               dealii::Utilities::MPI::internal::Tags::partitioner_export_end,
           dealii::ExcInternalError());

    const std::size_t n_indices = sparsity->entries_to_be_sent.size();
    exchange_buffer.resize_fast(n_components * n_indices);

    requests.resize(sparsity->receive_targets.size() +
                    sparsity->send_targets.size());

    /*
     * Set up MPI receive requests. We will always receive data for indices
     * in the range [n_locally_owned_, n_locally_relevant_), thus the data
     * is stored in non-vectorized CSR format.
     */

    {
      const auto &targets = sparsity->receive_targets;
      for (unsigned int p = 0; p < targets.size(); ++p) {
        const int ierr = MPI_Irecv(
            data.data() +
                n_components *
                    (sparsity->row_starts[sparsity->n_locally_owned_dofs] +
                     (p == 0 ? 0 : targets[p - 1].second)),
            (targets[p].second - (p == 0 ? 0 : targets[p - 1].second)) *
                n_components * sizeof(Number),
            MPI_BYTE,
            targets[p].first,
            mpi_tag,
            sparsity->mpi_communicator,
            &requests[p]);
        AssertThrowMPI(ierr);
      }
    }

    /*
     * Copy all entries that we plan to send over to the exchange buffer.
     * Here, we have to be careful with indices falling into the "locally
     * internal" range that are stored in an array-of-struct-of-array type.
     */

    for (std::size_t c = 0; c < n_indices; ++c) {

      const auto &[row, position_within_column] =
          sparsity->entries_to_be_sent[c];

      Assert(row < sparsity->n_locally_owned_dofs, dealii::ExcInternalError());

      if (row < sparsity->n_internal_dofs) {
        // go through vectorized part
        const unsigned int simd_row = row / simd_length;
        const unsigned int simd_offset = row % simd_length;
        for (unsigned int d = 0; d < n_components; ++d)
          exchange_buffer[n_components * c + d] =
              data[(sparsity->row_starts[simd_row] +
                    position_within_column * simd_length) *
                       n_components +
                   d * simd_length + simd_offset];
      } else {
        // go through standard part
        for (unsigned int d = 0; d < n_components; ++d)
          exchange_buffer[n_components * c + d] =
              data[(sparsity->row_starts[row] + position_within_column) *
                       n_components +
                   d];
      }
    }

    /*
     * Set up MPI send requests. We have copied everything we intend to
     * send to the exchange_buffer, compatible with the CSR storage format
     * of the receiving MPI rank.
     */

    {
      const auto &targets = sparsity->send_targets;
      for (unsigned int p = 0; p < targets.size(); ++p) {
        const int ierr = MPI_Isend(
            exchange_buffer.data() +
                n_components * (p == 0 ? 0 : targets[p - 1].second),
            (targets[p].second - (p == 0 ? 0 : targets[p - 1].second)) *
                n_components * sizeof(Number),
            MPI_BYTE,
            targets[p].first,
            mpi_tag,
            sparsity->mpi_communicator,
            &requests[p + sparsity->receive_targets.size()]);
        AssertThrowMPI(ierr);
      }
    }
#endif
  }


  template <typename Number, int n_components, int simd_length>
  inline void
  SparseMatrixSIMD<Number, n_components, simd_length>::update_ghost_rows_finish()
  {
#ifdef DEAL_II_WITH_MPI
    const int ierr =
        MPI_Waitall(requests.size(), requests.data(), MPI_STATUSES_IGNORE);
    AssertThrowMPI(ierr);
#endif
  }


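  /*
   * Blocking convenience variant: start the exchange and immediately wait
   * for it to finish. The split start()/finish() interface above allows
   * callers to overlap the communication with independent computation.
   */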
  template <typename Number, int n_components, int simd_length>
  inline void
  SparseMatrixSIMD<Number, n_components, simd_length>::update_ghost_rows()
  {
    update_ghost_rows_start();
    update_ghost_rows_finish();
  }

} // namespace ryujin