ryujin 2.1.1 revision d0a94ad2ccc0c4c2e8c2485c52b06b90e2fc9853
sparse_matrix_simd.h
//
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Copyright (C) 2020 - 2024 by the ryujin authors
//

#pragma once

#include <deal.II/base/aligned_vector.h>
#include <deal.II/base/partitioner.h>
#include <deal.II/lac/dynamic_sparsity_pattern.h>

#include "openmp.h"
#include "simd.h"

namespace ryujin
{
  namespace
  {
    template <typename Number, int n_components>
    struct EntryType {
      using type = dealii::Tensor<1, n_components, Number>;
    };

    template <typename Number>
    struct EntryType<Number, 1> {
      using type = Number;
    };
  } // namespace


  template <typename Number,
            int n_components = 1,
            int simd_length = dealii::VectorizedArray<Number>::size()>
  class SparseMatrixSIMD;

  template <int simd_length>
  class SparsityPatternSIMD
  {
  public:
    SparsityPatternSIMD();

    SparsityPatternSIMD(
        const unsigned int n_internal_dofs,
        const dealii::DynamicSparsityPattern &sparsity,
        const std::shared_ptr<const dealii::Utilities::MPI::Partitioner>
            &partitioner);


    void reinit(const unsigned int n_internal_dofs,
                const dealii::DynamicSparsityPattern &sparsity,
                const std::shared_ptr<const dealii::Utilities::MPI::Partitioner>
                    &partitioner);

    unsigned int stride_of_row(const unsigned int row) const;

    const unsigned int *columns(const unsigned int row) const;

    unsigned int row_length(const unsigned int row) const;

    unsigned int n_rows() const;

    std::size_t n_nonzero_elements() const;

  private:
    unsigned int n_internal_dofs;
    unsigned int n_locally_owned_dofs;
    std::shared_ptr<const dealii::Utilities::MPI::Partitioner> partitioner;

    dealii::AlignedVector<std::size_t> row_starts;
    dealii::AlignedVector<unsigned int> column_indices;
    dealii::AlignedVector<unsigned int> indices_transposed;

    /* Entries to send to other MPI ranks, as (row, position) pairs: */
    std::vector<std::pair<unsigned int, unsigned int>> entries_to_be_sent;

    /* (MPI rank, cumulative entry count) pairs for outgoing data: */
    std::vector<std::pair<unsigned int, unsigned int>> send_targets;

    /* (MPI rank, cumulative entry count) pairs for incoming data: */
    std::vector<std::pair<unsigned int, unsigned int>> receive_targets;

    MPI_Comm mpi_communicator;

    template <typename, int, int>
    friend class SparseMatrixSIMD;
  };
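
  /*
   * A note on the storage layout, inferred from the accessors defined
   * further below (the wording is not part of the original documentation):
   * the first n_internal_dofs rows are grouped into blocks of simd_length
   * rows and stored interleaved, so that entry j of rows r, r + 1, ...,
   * r + simd_length - 1 occupies simd_length consecutive memory locations.
   * All remaining (locally owned and ghost) rows are stored in conventional
   * CSR order. For example, with simd_length == 4 the column indices of an
   * internal row r are found at
   *
   *   column_indices[row_starts[r / 4] + (r % 4) + j * 4],
   *       j = 0, ..., row_length(r) - 1,
   *
   * which is why columns() returns a pointer that has to be traversed with
   * stride_of_row(r) == simd_length for internal rows and stride 1
   * otherwise.
   */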


  template <typename Number, int n_components, int simd_length>
  class SparseMatrixSIMD
  {
  public:
    SparseMatrixSIMD();

    SparseMatrixSIMD(const SparsityPatternSIMD<simd_length> &sparsity);

    void reinit(const SparsityPatternSIMD<simd_length> &sparsity);

    template <typename SparseMatrix>
    void read_in(const std::array<SparseMatrix, n_components> &sparse_matrix,
                 bool locally_indexed = true);

    template <typename SparseMatrix>
    void read_in(const SparseMatrix &sparse_matrix,
                 bool locally_indexed = true);

    using VectorizedArray = dealii::VectorizedArray<Number, simd_length>;

    template <typename Number2>
    using EntryType = typename ryujin::EntryType<Number2, n_components>::type;

    /* Get scalar or tensor-valued entry: */

    template <typename Number2 = Number>
    EntryType<Number2>
    get_entry(const unsigned int row,
              const unsigned int position_within_column) const;

    template <typename Number2 = Number>
    dealii::Tensor<1, n_components, Number2>
    get_tensor(const unsigned int row,
               const unsigned int position_within_column) const;

    /* Get transposed scalar or tensor-valued entry: */

    template <typename Number2 = Number>
    EntryType<Number2>
    get_transposed_entry(const unsigned int row,
                         const unsigned int position_within_column) const;

    template <typename Number2 = Number>
    dealii::Tensor<1, n_components, Number2>
    get_transposed_tensor(const unsigned int row,
                          const unsigned int position_within_column) const;

    /* Write scalar or tensor-valued entry: */

    template <typename Number2 = Number>
    void write_entry(const Number2 entry,
                     const unsigned int row,
                     const unsigned int position_within_column,
                     const bool do_streaming_store = false);

    template <typename Number2 = Number>
    void write_entry(const dealii::Tensor<1, n_components, Number2> &entry,
                     const unsigned int row,
                     const unsigned int position_within_column,
                     const bool do_streaming_store = false);

    /* Synchronize over MPI ranks: */

    void update_ghost_rows_start(const unsigned int communication_channel = 0);

    void update_ghost_rows_finish();

    void update_ghost_rows();

  private:
    const SparsityPatternSIMD<simd_length> *sparsity;
    dealii::AlignedVector<Number> data;
    dealii::AlignedVector<Number> exchange_buffer;
    std::vector<MPI_Request> requests;
  };
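
  /*
   * A minimal usage sketch (illustrative only, not part of the original
   * header). It assumes a dealii::DynamicSparsityPattern `dsp`, a
   * std::shared_ptr<const dealii::Utilities::MPI::Partitioner> `partitioner`,
   * a deal.II sparse matrix `mass_matrix`, a SIMD-padded count of locally
   * internal degrees of freedom `n_internal_dofs`, and placeholder indices
   * `row`, `simd_row`, and `j`:
   *
   *   constexpr int simd_length = dealii::VectorizedArray<double>::size();
   *
   *   ryujin::SparsityPatternSIMD<simd_length> sparsity_simd(
   *       n_internal_dofs, dsp, partitioner);
   *
   *   ryujin::SparseMatrixSIMD<double> mass_matrix_simd(sparsity_simd);
   *   mass_matrix_simd.read_in(mass_matrix, true);
   *
   *   // Scalar access works for every locally owned row:
   *   const double m_ij = mass_matrix_simd.get_entry(row, j);
   *
   *   // SIMD access is restricted to rows < n_internal_dofs whose index is
   *   // a multiple of simd_length:
   *   using VA = dealii::VectorizedArray<double, simd_length>;
   *   const VA m_i = mass_matrix_simd.get_entry<VA>(simd_row, j);
   */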

  /*
   * Inline function definitions:
   */


  template <int simd_length>
  DEAL_II_ALWAYS_INLINE inline unsigned int
  SparsityPatternSIMD<simd_length>::stride_of_row(const unsigned int row) const
  {
    AssertIndexRange(row, row_starts.size());

    if (row < n_internal_dofs)
      return simd_length;
    else
      return 1;
  }


  template <int simd_length>
  DEAL_II_ALWAYS_INLINE inline const unsigned int *
  SparsityPatternSIMD<simd_length>::columns(const unsigned int row) const
  {
    AssertIndexRange(row, row_starts.size() - 1);

    if (row < n_internal_dofs)
      return column_indices.data() + row_starts[row / simd_length] +
             row % simd_length;
    else
      return column_indices.data() + row_starts[row];
  }


  template <int simd_length>
  DEAL_II_ALWAYS_INLINE inline unsigned int
  SparsityPatternSIMD<simd_length>::row_length(const unsigned int row) const
  {
    AssertIndexRange(row, row_starts.size() - 1);

    if (row < n_internal_dofs) {
      const unsigned int simd_row = row / simd_length;
      return (row_starts[simd_row + 1] - row_starts[simd_row]) / simd_length;
    } else {
      return row_starts[row + 1] - row_starts[row];
    }
  }


  template <int simd_length>
  DEAL_II_ALWAYS_INLINE inline unsigned int
  SparsityPatternSIMD<simd_length>::n_rows() const
  {
    return row_starts.size() - 1;
  }


  template <int simd_length>
  DEAL_II_ALWAYS_INLINE inline std::size_t
  SparsityPatternSIMD<simd_length>::n_nonzero_elements() const
  {
    Assert(row_starts.size() > 0, dealii::ExcNotInitialized());

    return row_starts.back();
  }


  template <typename Number, int n_components, int simd_length>
  template <typename Number2>
  DEAL_II_ALWAYS_INLINE inline auto
  SparseMatrixSIMD<Number, n_components, simd_length>::get_entry(
      const unsigned int row, const unsigned int position_within_column) const
      -> EntryType<Number2>
  {
    const auto result = get_tensor<Number2>(row, position_within_column);
    if constexpr (n_components == 1)
      return result[0];
    else
      return result;
  }


  template <typename Number, int n_components, int simd_length>
  template <typename Number2>
  DEAL_II_ALWAYS_INLINE inline dealii::Tensor<1, n_components, Number2>
  SparseMatrixSIMD<Number, n_components, simd_length>::get_tensor(
      const unsigned int row, const unsigned int position_within_column) const
  {
    Assert(sparsity != nullptr, dealii::ExcNotInitialized());
    AssertIndexRange(row, sparsity->row_starts.size() - 1);
    AssertIndexRange(position_within_column, sparsity->row_length(row));

    dealii::Tensor<1, n_components, Number2> result;

    if constexpr (std::is_same<Number, Number2>::value) {
      /*
       * Non-vectorized, slow access. Supports all row indices in
       * [0, n_owned):
       */
      if (row < sparsity->n_internal_dofs) {
        // go through vectorized part
        const unsigned int simd_row = row / simd_length;
        const unsigned int simd_offset = row % simd_length;
        for (unsigned int d = 0; d < n_components; ++d)
          result[d] = data[(sparsity->row_starts[simd_row] +
                            position_within_column * simd_length) *
                               n_components +
                           d * simd_length + simd_offset];
      } else {
        // go through standard part
        for (unsigned int d = 0; d < n_components; ++d)
          result[d] =
              data[(sparsity->row_starts[row] + position_within_column) *
                       n_components +
                   d];
      }

    } else if constexpr (std::is_same<VectorizedArray, Number2>::value) {
      /*
       * Vectorized, fast access. The row index must lie in the range
       * [0, n_internal) and must be divisible by simd_length:
       */

      Assert(row < sparsity->n_internal_dofs,
             dealii::ExcMessage(
                 "Vectorized access only possible in vectorized part"));
      Assert(row % simd_length == 0,
             dealii::ExcMessage(
                 "Access only supported for rows at the SIMD granularity"));

      const Number *load_pos =
          data.data() + (sparsity->row_starts[row / simd_length] +
                         position_within_column * simd_length) *
                            n_components;

      for (unsigned int d = 0; d < n_components; ++d)
        result[d].load(load_pos + d * simd_length);

    } else {
      /* not implemented */
      __builtin_trap();
    }

    return result;
  }
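
  /*
   * Illustration (not from the original sources): a typical SIMD sweep over
   * the locally internal rows processes simd_length rows at once and reads
   * tensor-valued entries through the vectorized overload above. The names
   * `cij_matrix`, `sparsity_simd`, and `n_internal` are placeholders for a
   * SparseMatrixSIMD<double, dim> instance, its sparsity pattern, and the
   * number of internal degrees of freedom:
   *
   *   using VA = dealii::VectorizedArray<double, simd_length>;
   *   for (unsigned int row = 0; row < n_internal; row += simd_length) {
   *     const unsigned int row_length = sparsity_simd.row_length(row);
   *     for (unsigned int j = 0; j < row_length; ++j) {
   *       const dealii::Tensor<1, dim, VA> c_ij =
   *           cij_matrix.get_tensor<VA>(row, j);
   *       // ... use c_ij for simd_length rows at once ...
   *     }
   *   }
   */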


  template <typename Number, int n_components, int simd_length>
  template <typename Number2>
  DEAL_II_ALWAYS_INLINE inline auto
  SparseMatrixSIMD<Number, n_components, simd_length>::get_transposed_entry(
      const unsigned int row, const unsigned int position_within_column) const
      -> EntryType<Number2>
  {
    const auto result =
        get_transposed_tensor<Number2>(row, position_within_column);
    if constexpr (n_components == 1)
      return result[0];
    else
      return result;
  }


  template <typename Number, int n_components, int simd_length>
  template <typename Number2>
  DEAL_II_ALWAYS_INLINE inline dealii::Tensor<1, n_components, Number2>
  SparseMatrixSIMD<Number, n_components, simd_length>::get_transposed_tensor(
      const unsigned int row, const unsigned int position_within_column) const
  {
    Assert(sparsity != nullptr, dealii::ExcNotInitialized());
    AssertIndexRange(row, sparsity->row_starts.size() - 1);
    AssertIndexRange(position_within_column, sparsity->row_length(row));

    dealii::Tensor<1, n_components, Number2> result;

    if constexpr (std::is_same<Number, Number2>::value) {
      /*
       * Non-vectorized, slow access. Supports all row indices in
       * [0, n_owned):
       */

      if (row < sparsity->n_internal_dofs) {
        // go through vectorized part
        const unsigned int simd_row = row / simd_length;
        const unsigned int simd_offset = row % simd_length;
        const std::size_t index =
            sparsity->indices_transposed[sparsity->row_starts[simd_row] +
                                         simd_offset +
                                         position_within_column * simd_length];
        if (n_components > 1) {
          const unsigned int col =
              sparsity->column_indices[sparsity->row_starts[simd_row] +
                                       simd_offset +
                                       position_within_column * simd_length];
          if (col < sparsity->n_internal_dofs)
            for (unsigned int d = 0; d < n_components; ++d)
              result[d] =
                  data[index / simd_length * simd_length * n_components +
                       simd_length * d + index % simd_length];
          else
            for (unsigned int d = 0; d < n_components; ++d)
              result[d] = data[index * n_components + d];
        } else
          result[0] = data[index];
      } else {
        // go through standard part
        const std::size_t index =
            sparsity->indices_transposed[sparsity->row_starts[row] +
                                         position_within_column];
        if (n_components > 1) {
          const unsigned int col =
              sparsity->column_indices[sparsity->row_starts[row] +
                                       position_within_column];
          if (col < sparsity->n_internal_dofs)
            for (unsigned int d = 0; d < n_components; ++d)
              result[d] =
                  data[index / simd_length * simd_length * n_components +
                       simd_length * d + index % simd_length];
          else
            for (unsigned int d = 0; d < n_components; ++d)
              result[d] = data[index * n_components + d];
        } else
          result[0] = data[index];
      }

    } else if constexpr (std::is_same<VectorizedArray, Number2>::value &&
                         (n_components == 1)) {
      /*
       * Vectorized, fast access. The row index must lie in the range
       * [0, n_internal) and must be divisible by simd_length:
       */

      Assert(row < sparsity->n_internal_dofs,
             dealii::ExcMessage(
                 "Vectorized access only possible in vectorized part"));
      Assert(row % simd_length == 0,
             dealii::ExcMessage(
                 "Access only supported for rows at the SIMD granularity"));

      const unsigned int offset = sparsity->row_starts[row / simd_length] +
                                  position_within_column * simd_length;
      result[0].gather(data.data(),
                       sparsity->indices_transposed.data() + offset);

    } else {
      /* not implemented */
      __builtin_trap();
    }

    return result;
  }


  template <typename Number, int n_components, int simd_length>
  template <typename Number2>
  DEAL_II_ALWAYS_INLINE inline void
  SparseMatrixSIMD<Number, n_components, simd_length>::write_entry(
      const Number2 entry,
      const unsigned int row,
      const unsigned int position_within_column,
      const bool do_streaming_store)
  {
    static_assert(
        n_components == 1,
        "Attempted to write a scalar value into a tensor-valued matrix entry");

    Assert(sparsity != nullptr, dealii::ExcNotInitialized());
    AssertIndexRange(row, sparsity->row_starts.size() - 1);
    AssertIndexRange(position_within_column, sparsity->row_length(row));

    dealii::Tensor<1, n_components, Number2> result;
    result[0] = entry;

    write_entry<Number2>(
        result, row, position_within_column, do_streaming_store);
  }


  template <typename Number, int n_components, int simd_length>
  template <typename Number2>
  DEAL_II_ALWAYS_INLINE inline void
  SparseMatrixSIMD<Number, n_components, simd_length>::write_entry(
      const dealii::Tensor<1, n_components, Number2> &entry,
      const unsigned int row,
      const unsigned int position_within_column,
      const bool do_streaming_store)
  {
    Assert(sparsity != nullptr, dealii::ExcNotInitialized());
    AssertIndexRange(row, sparsity->row_starts.size() - 1);
    AssertIndexRange(position_within_column, sparsity->row_length(row));

    if constexpr (std::is_same<Number, Number2>::value) {
      /*
       * Non-vectorized, slow access. Supports all row indices in
       * [0, n_owned):
       */

      if (row < sparsity->n_internal_dofs) {
        // go through vectorized part
        const unsigned int simd_row = row / simd_length;
        const unsigned int simd_offset = row % simd_length;
        for (unsigned int d = 0; d < n_components; ++d)
          data[(sparsity->row_starts[simd_row] +
                position_within_column * simd_length) *
                   n_components +
               d * simd_length + simd_offset] = entry[d];
      } else {
        // go through standard part
        for (unsigned int d = 0; d < n_components; ++d)
          data[(sparsity->row_starts[row] + position_within_column) *
                   n_components +
               d] = entry[d];
      }

    } else if constexpr (std::is_same<VectorizedArray, Number2>::value) {
      /*
       * Vectorized, fast access. The row index must lie in the range
       * [0, n_internal) and must be divisible by simd_length:
       */

      Assert(row < sparsity->n_internal_dofs,
             dealii::ExcMessage(
                 "Vectorized access only possible in vectorized part"));
      Assert(row % simd_length == 0,
             dealii::ExcMessage(
                 "Access only supported for rows at the SIMD granularity"));

      Number *store_pos =
          data.data() + (sparsity->row_starts[row / simd_length] +
                         position_within_column * simd_length) *
                            n_components;
      if (do_streaming_store)
        for (unsigned int d = 0; d < n_components; ++d)
          entry[d].streaming_store(store_pos + d * simd_length);
      else
        for (unsigned int d = 0; d < n_components; ++d)
          entry[d].store(store_pos + d * simd_length);

    } else {
      /* not implemented */
      __builtin_trap();
    }
  }
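
  /*
   * Illustration (not part of the original header): the vectorized write
   * path mirrors the vectorized read path, so a read-modify-write sweep
   * over the internal range of a matrix with n_components == 1 can look
   * like the following sketch. `d_ij_matrix`, `sparsity_simd`, and
   * `n_internal` are placeholder names:
   *
   *   using VA = dealii::VectorizedArray<double, simd_length>;
   *   for (unsigned int row = 0; row < n_internal; row += simd_length)
   *     for (unsigned int j = 0; j < sparsity_simd.row_length(row); ++j) {
   *       VA d_ij = d_ij_matrix.get_entry<VA>(row, j);
   *       d_ij = 2. * d_ij;
   *       d_ij_matrix.write_entry(d_ij, row, j, true);
   *     }
   *
   * The final `true` requests a streaming (non-temporal) store, which can
   * pay off when the written data is not read again before it would be
   * evicted from cache anyway.
   */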


  template <typename Number, int n_components, int simd_length>
  inline void
  SparseMatrixSIMD<Number, n_components, simd_length>::update_ghost_rows_start(
      const unsigned int communication_channel)
  {
#ifdef DEAL_II_WITH_MPI
    AssertIndexRange(communication_channel, 200);

    const unsigned int mpi_tag =
        dealii::Utilities::MPI::internal::Tags::partitioner_export_start +
        communication_channel;
    Assert(mpi_tag <=
               dealii::Utilities::MPI::internal::Tags::partitioner_export_end,
           dealii::ExcInternalError());

    const std::size_t n_indices = sparsity->entries_to_be_sent.size();
    exchange_buffer.resize_fast(n_components * n_indices);

    requests.resize(sparsity->receive_targets.size() +
                    sparsity->send_targets.size());

    /*
     * Set up MPI receive requests. We will always receive data for indices
     * in the range [n_locally_owned_, n_locally_relevant_), thus the data
     * is stored in non-vectorized CSR format.
     */

    {
      const auto &targets = sparsity->receive_targets;
      for (unsigned int p = 0; p < targets.size(); ++p) {
        const int ierr = MPI_Irecv(
            data.data() +
                n_components *
                    (sparsity->row_starts[sparsity->n_locally_owned_dofs] +
                     (p == 0 ? 0 : targets[p - 1].second)),
            (targets[p].second - (p == 0 ? 0 : targets[p - 1].second)) *
                n_components * sizeof(Number),
            MPI_BYTE,
            targets[p].first,
            mpi_tag,
            sparsity->mpi_communicator,
            &requests[p]);
        AssertThrowMPI(ierr);
      }
    }

    /*
     * Copy all entries that we plan to send over to the exchange buffer.
     * Here, we have to be careful with indices falling into the "locally
     * internal" range that are stored in an array-of-struct-of-array type.
     */

    for (std::size_t c = 0; c < n_indices; ++c) {

      const auto &[row, position_within_column] =
          sparsity->entries_to_be_sent[c];

      Assert(row < sparsity->n_locally_owned_dofs, dealii::ExcInternalError());

      if (row < sparsity->n_internal_dofs) {
        // go through vectorized part
        const unsigned int simd_row = row / simd_length;
        const unsigned int simd_offset = row % simd_length;
        for (unsigned int d = 0; d < n_components; ++d)
          exchange_buffer[n_components * c + d] =
              data[(sparsity->row_starts[simd_row] +
                    position_within_column * simd_length) *
                       n_components +
                   d * simd_length + simd_offset];
      } else {
        // go through standard part
        for (unsigned int d = 0; d < n_components; ++d)
          exchange_buffer[n_components * c + d] =
              data[(sparsity->row_starts[row] + position_within_column) *
                       n_components +
                   d];
      }
    }

    /*
     * Set up MPI send requests. We have copied everything we intend to
     * send into the exchange_buffer, compatible with the CSR storage
     * format of the receiving MPI rank.
     */

    {
      const auto &targets = sparsity->send_targets;
      for (unsigned int p = 0; p < targets.size(); ++p) {
        const int ierr = MPI_Isend(
            exchange_buffer.data() +
                n_components * (p == 0 ? 0 : targets[p - 1].second),
            (targets[p].second - (p == 0 ? 0 : targets[p - 1].second)) *
                n_components * sizeof(Number),
            MPI_BYTE,
            targets[p].first,
            mpi_tag,
            sparsity->mpi_communicator,
            &requests[p + sparsity->receive_targets.size()]);
        AssertThrowMPI(ierr);
      }
    }
#endif
  }


  template <typename Number, int n_components, int simd_length>
  inline void
  SparseMatrixSIMD<Number, n_components, simd_length>::update_ghost_rows_finish()
  {
#ifdef DEAL_II_WITH_MPI
    const int ierr =
        MPI_Waitall(requests.size(), requests.data(), MPI_STATUSES_IGNORE);
    AssertThrowMPI(ierr);
#endif
  }


  template <typename Number, int n_components, int simd_length>
  inline void
  SparseMatrixSIMD<Number, n_components, simd_length>::update_ghost_rows()
  {
    update_ghost_rows_start();
    update_ghost_rows_finish();
  }
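
  /*
   * Usage note (illustrative, not from the original sources): the
   * start/finish split allows overlapping the MPI exchange of ghost rows
   * with local work. A caller updating several matrices can give each
   * exchange its own communication channel so the MPI message tags do not
   * collide:
   *
   *   matrix_a.update_ghost_rows_start(0);
   *   matrix_b.update_ghost_rows_start(1);
   *   // ... local computations that do not touch ghost rows ...
   *   matrix_a.update_ghost_rows_finish();
   *   matrix_b.update_ghost_rows_finish();
   */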

} // namespace ryujin