ryujin 2.1.1 revision 003d8fa0b21a2fd867c2893d046eb34a0f9e932c
sparse_matrix_simd.template.h
//
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Copyright (C) 2020 - 2024 by the ryujin authors
//

#pragma once

#include "openmp.h"
#include "simd.h"
#include "sparse_matrix_simd.h"

#include <deal.II/base/vectorization.h>
#include <deal.II/lac/sparse_matrix.h>

namespace ryujin
{

  template <int simd_length>
  SparsityPatternSIMD<simd_length>::SparsityPatternSIMD()
      : n_internal_dofs(0)
      , row_starts(1)
      , mpi_communicator(MPI_COMM_SELF)
  {
  }


  template <int simd_length>
  SparsityPatternSIMD<simd_length>::SparsityPatternSIMD(
      const unsigned int n_internal_dofs,
      const dealii::DynamicSparsityPattern &sparsity,
      const std::shared_ptr<const dealii::Utilities::MPI::Partitioner>
          &partitioner)
      : n_internal_dofs(0)
      , mpi_communicator(MPI_COMM_SELF)
  {
    reinit(n_internal_dofs, sparsity, partitioner);
  }


  template <int simd_length>
  void SparsityPatternSIMD<simd_length>::reinit(
      const unsigned int n_internal_dofs,
      const dealii::DynamicSparsityPattern &dsp,
      const std::shared_ptr<const dealii::Utilities::MPI::Partitioner>
          &partitioner)
  {
    this->mpi_communicator = partitioner->get_mpi_communicator();

    this->n_internal_dofs = n_internal_dofs;
    this->n_locally_owned_dofs = partitioner->locally_owned_size();
    this->partitioner = partitioner;

    const auto n_locally_relevant_dofs =
        partitioner->locally_owned_size() + partitioner->n_ghost_indices();

    /*
     * First, create a static sparsity pattern (in local indexing), where
     * the only off-processor rows are the ones for which locally owned rows
     * request the transpose entries. This will be the one we finally
     * compute on.
     */
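
    /*
     * As a small illustration of the construction below: if a locally
     * owned row i contains an entry (i, j) whose column j lies in the
     * ghost range (j >= n_locally_owned_dofs), then the transposed entry
     * (j, i) is added as well, so that a ghosted row j ends up holding
     * exactly those entries for which we own the transpose.
     */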

    dealii::DynamicSparsityPattern dsp_minimal(n_locally_relevant_dofs,
                                               n_locally_relevant_dofs);
    for (unsigned int i = 0; i < n_locally_owned_dofs; ++i) {
      const auto global_row = partitioner->local_to_global(i);
      for (auto it = dsp.begin(global_row); it != dsp.end(global_row); ++it) {
        const auto global_column = it->column();
        const auto j = partitioner->global_to_local(global_column);
        dsp_minimal.add(i, j);
        if (j >= n_locally_owned_dofs) {
          Assert(j < n_locally_relevant_dofs, dealii::ExcInternalError());
          dsp_minimal.add(j, i);
        }
      }
    }

    dealii::SparsityPattern sparsity;
    sparsity.copy_from(dsp_minimal);

    Assert(n_internal_dofs <= sparsity.n_rows(), dealii::ExcInternalError());
    Assert(n_internal_dofs % simd_length == 0, dealii::ExcInternalError());
    Assert(n_internal_dofs <= n_locally_owned_dofs, dealii::ExcInternalError());
    Assert(n_locally_owned_dofs <= sparsity.n_rows(),
           dealii::ExcInternalError());

    row_starts.resize_fast(sparsity.n_rows() + 1);
    column_indices.resize_fast(sparsity.n_nonzero_elements());
    indices_transposed.resize_fast(sparsity.n_nonzero_elements());
    AssertThrow(sparsity.n_nonzero_elements() <
                    std::numeric_limits<unsigned int>::max(),
                dealii::ExcMessage("Transposed indices only support up to 4 "
                                   "billion matrix entries per MPI rank. Try to"
                                   " split into smaller problems with MPI"));

    /* Vectorized part: */

    row_starts[0] = 0;

    unsigned int *col_ptr = column_indices.data();
    unsigned int *transposed_ptr = indices_transposed.data();

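    /*
     * For the internal rows the matrix entries are stored interleaved in
     * blocks of simd_length rows: the j-th entry of row (block_start + k)
     * is located at (start of block) + j * simd_length + k. Assuming (as
     * the lock-step iteration below does) that all simd_length rows of a
     * block have the same length, the expression
     *
     *   position - simd_offset * my_row_length - pos_within_row
     *       + simd_offset + pos_within_row * simd_length
     *
     * rewrites the scalar position sparsity(column, i + k) into exactly
     * this interleaved index, which is what indices_transposed records.
     */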
    for (unsigned int i = 0; i < n_internal_dofs; i += simd_length) {
      auto jts = generate_iterators<simd_length>(
          [&](auto k) { return sparsity.begin(i + k); });

      for (; jts[0] != sparsity.end(i); increment_iterators(jts))
        for (unsigned int k = 0; k < simd_length; ++k) {
          const unsigned int column = jts[k]->column();
          *col_ptr++ = column;
          const std::size_t position = sparsity(column, i + k);
          if (column < n_internal_dofs) {
            const unsigned int my_row_length = sparsity.row_length(column);
            const std::size_t position_diag = sparsity(column, column);
            const std::size_t pos_within_row = position - position_diag;
            const unsigned int simd_offset = column % simd_length;
            *transposed_ptr++ = position - simd_offset * my_row_length -
                                pos_within_row + simd_offset +
                                pos_within_row * simd_length;
          } else
            *transposed_ptr++ = position;
        }

      row_starts[i / simd_length + 1] = col_ptr - column_indices.data();
    }

    /* Rest: */

    row_starts[n_internal_dofs] = row_starts[n_internal_dofs / simd_length];
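
    /*
     * For rows in the internal range, row_starts is indexed by SIMD block
     * (i / simd_length); for all remaining rows it is indexed by the row
     * number itself. The assignment above carries the end of the last
     * SIMD block over to index n_internal_dofs so that the scalar loop
     * below can simply keep writing row_starts[i + 1].
     */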

    for (unsigned int i = n_internal_dofs; i < sparsity.n_rows(); ++i) {
      for (auto j = sparsity.begin(i); j != sparsity.end(i); ++j) {
        const unsigned int column = j->column();
        *col_ptr++ = column;
        const std::size_t position = sparsity(column, i);
        if (column < n_internal_dofs) {
          const unsigned int my_row_length = sparsity.row_length(column);
          const std::size_t position_diag = sparsity(column, column);
          const std::size_t pos_within_row = position - position_diag;
          const unsigned int simd_offset = column % simd_length;
          *transposed_ptr++ = position - simd_offset * my_row_length -
                              pos_within_row + simd_offset +
                              pos_within_row * simd_length;
        } else
          *transposed_ptr++ = position;
      }
      row_starts[i + 1] = col_ptr - column_indices.data();
    }

    Assert(col_ptr == column_indices.end(), dealii::ExcInternalError());

    /* Compute the data exchange pattern: */

    if (sparsity.n_rows() > n_locally_owned_dofs) {

      /*
       * Set up receive targets.
       *
       * We receive our (reduced) ghost rows from MPI ranks in the ghost
       * range of the (scalar) partitioner. We receive the entire (reduced)
       * ghost row from that MPI rank; we can thus simply query the
       * sparsity pattern for how many data points we receive from each
       * MPI rank.
       */
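
      /*
       * ghost_targets() returns pairs (MPI rank, number of ghost indices
       * owned by that rank), and the ghost rows of our reduced sparsity
       * pattern appear grouped by owning rank in exactly this order.
       * Accumulating the row lengths over each rank's contiguous group of
       * ghost rows therefore yields the number of matrix entries we
       * receive from that rank.
       */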
      const auto &ghost_targets = partitioner->ghost_targets();

      receive_targets.resize(ghost_targets.size());

      for (unsigned int p = 0; p < receive_targets.size(); ++p) {
        receive_targets[p].first = ghost_targets[p].first;
      }

      const auto gt_begin = ghost_targets.begin();
      auto gt_ptr = ghost_targets.begin();
      std::size_t index = 0; /* index into ghost range of sparsity pattern */
      unsigned int row_count = 0;

      for (unsigned int i = n_locally_owned_dofs; i < sparsity.n_rows(); ++i) {
        index += sparsity.row_length(i);
        ++row_count;
        if (row_count == gt_ptr->second) {
          receive_targets[gt_ptr - gt_begin].second = index;
          row_count = 0; /* reset the row count and move on to the next rank */
          ++gt_ptr;
        }
      }

      Assert(gt_ptr == partitioner->ghost_targets().end(),
             dealii::ExcInternalError());

      /*
       * Collect indices to be sent.
       *
       * These consist of the diagonal, as well as those entries of a row
       * whose columns fall into the given ghost range. Note that this
       * assumes that the sparsity pattern only contains those entries in
       * ghosted rows which have a corresponding transpose entry in the
       * owned rows, which is the case for our minimized sparsity pattern.
       */
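
      /*
       * For each row to be sent we record the diagonal entry first (a
       * quadratic dealii::SparsityPattern stores the diagonal at position
       * 0 of its row, hence the pair (row, 0) below), followed by every
       * further entry whose column falls into the ghost range of the
       * matched rank, identified by its position within the row.
       */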

      std::vector<unsigned int> ghost_ranges(ghost_targets.size() + 1);
      ghost_ranges[0] = n_locally_owned_dofs;
      for (unsigned int p = 0; p < receive_targets.size(); ++p) {
        ghost_ranges[p + 1] = ghost_ranges[p] + ghost_targets[p].second;
      }

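      /*
       * import_indices() returns half-open ranges of local indices within
       * the locally owned range that other ranks hold as ghosts, arranged
       * consistently with import_targets(). Flattening these ranges gives,
       * rank by rank, the rows whose reduced data we have to send.
       */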
      std::vector<unsigned int> import_indices_part;
      for (auto i : partitioner->import_indices())
        for (unsigned int j = i.first; j < i.second; ++j)
          import_indices_part.push_back(j);

      AssertDimension(import_indices_part.size(),
                      partitioner->n_import_indices());

      const auto &import_targets = partitioner->import_targets();
      entries_to_be_sent.clear();
      send_targets.resize(import_targets.size());
      auto idx = import_indices_part.begin();

      /*
       * Index p iterates over import_targets() and index p_match iterates
       * over ghost_ranges, such that
       * import_targets()[p].first == receive_targets[p_match].first
       */
      unsigned int p_match = 0;
      for (unsigned int p = 0; p < import_targets.size(); ++p) {

        /*
         * Match up the rank index between receive and import targets. If
         * we do not find a match, which can happen for locally refined
         * meshes, then we set p_match equal to receive_targets.size().
         *
         * When trying to match the next processor index we consequently
         * have to reset p_match to 0 again. This assumes that processor
         * indices are sorted in the receive_targets and ghost_targets
         * vectors.
         */
        p_match = (p_match == receive_targets.size() ? 0 : p_match);
        while (p_match < receive_targets.size() &&
               receive_targets[p_match].first != import_targets[p].first)
          p_match++;

        for (unsigned int c = 0; c < import_targets[p].second; ++c, ++idx) {
          /*
           * Continue if we do not have a match. Note that we need to enter
           * and continue this loop till the end in order to advance idx
           * correctly.
           */
          if (p_match == receive_targets.size())
            continue;

          const unsigned int row = *idx;

          entries_to_be_sent.emplace_back(row, 0);
          for (auto jt = ++sparsity.begin(row); jt != sparsity.end(row); ++jt) {
            if (jt->column() >= ghost_ranges[p_match] &&
                jt->column() < ghost_ranges[p_match + 1]) {
              const unsigned int position_within_column =
                  jt - sparsity.begin(row);
              entries_to_be_sent.emplace_back(row, position_within_column);
            }
          }
        }

        send_targets[p].first = partitioner->import_targets()[p].first;
        send_targets[p].second = entries_to_be_sent.size();
      }
    }
  }


  template <typename Number, int n_components, int simd_length>
  SparseMatrixSIMD<Number, n_components, simd_length>::SparseMatrixSIMD()
      : sparsity(nullptr)
  {
  }


  template <typename Number, int n_components, int simd_length>
  SparseMatrixSIMD<Number, n_components, simd_length>::SparseMatrixSIMD(
      const SparsityPatternSIMD<simd_length> &sparsity)
      : sparsity(&sparsity)
  {
    data.resize(sparsity.n_nonzero_elements() * n_components);
  }


  template <typename Number, int n_components, int simd_length>
  void SparseMatrixSIMD<Number, n_components, simd_length>::reinit(
      const SparsityPatternSIMD<simd_length> &sparsity)
  {
    this->sparsity = &sparsity;
    data.resize(sparsity.n_nonzero_elements() * n_components);
  }


  template <typename Number, int n_components, int simd_length>
  template <typename SparseMatrix>
  void SparseMatrixSIMD<Number, n_components, simd_length>::read_in(
      const std::array<SparseMatrix, n_components> &sparse_matrix,
      bool locally_indexed /*= true*/)
  {
    RYUJIN_PARALLEL_REGION_BEGIN

    /*
     * We use the indirect (and slow) access via operator()(i, j) into the
     * sparse matrix we are copying from. This allows for significantly
     * increased flexibility with respect to the sparsity pattern used in
     * the sparse_matrix object.
     */

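    /*
     * The copy proceeds in two passes: the internal (vectorized) rows are
     * processed simd_length rows at a time, gathering a SIMD-wide tensor
     * before storing it, while the remaining locally owned rows are
     * copied entry by entry in the second loop.
     */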
    RYUJIN_OMP_FOR
    for (unsigned int i = 0; i < sparsity->n_internal_dofs; i += simd_length) {

      const unsigned int row_length = sparsity->row_length(i);

      const unsigned int *js = sparsity->columns(i);
      for (unsigned int col_idx = 0; col_idx < row_length;
           ++col_idx, js += simd_length) {

        dealii::Tensor<1, n_components, VectorizedArray> temp;
        for (unsigned int k = 0; k < simd_length; ++k)
          for (unsigned int d = 0; d < n_components; ++d)
            if (locally_indexed)
              temp[d][k] = sparse_matrix[d](i + k, js[k]);
            else
              temp[d][k] = sparse_matrix[d].el(
                  sparsity->partitioner->local_to_global(i + k),
                  sparsity->partitioner->local_to_global(js[k]));

        write_entry(temp, i, col_idx, true);
      }
    }

    RYUJIN_OMP_FOR
    for (unsigned int i = sparsity->n_internal_dofs;
         i < sparsity->n_locally_owned_dofs;
         ++i) {
      const unsigned int row_length = sparsity->row_length(i);
      const unsigned int *js = sparsity->columns(i);
      for (unsigned int col_idx = 0; col_idx < row_length; ++col_idx, ++js) {

        dealii::Tensor<1, n_components, Number> temp;
        for (unsigned int d = 0; d < n_components; ++d)
          if (locally_indexed)
            temp[d] = sparse_matrix[d](i, js[0]);
          else
            temp[d] = sparse_matrix[d].el(
                sparsity->partitioner->local_to_global(i),
                sparsity->partitioner->local_to_global(js[0]));
        write_entry(temp, i, col_idx);
      }
    }
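
    /*
     * Note that only locally owned rows are read in here; ghosted rows of
     * the SIMD matrix are communicated separately using the exchange
     * pattern set up in SparsityPatternSIMD::reinit().
     */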

    RYUJIN_PARALLEL_REGION_END
  }


  template <typename Number, int n_components, int simd_length>
  template <typename SparseMatrix>
  void SparseMatrixSIMD<Number, n_components, simd_length>::read_in(
      const SparseMatrix &sparse_matrix, bool locally_indexed /*= true*/)
  {
    RYUJIN_PARALLEL_REGION_BEGIN

    /*
     * We use the indirect (and slow) access via operator()(i, j) into the
     * sparse matrix we are copying from. This allows for significantly
     * increased flexibility with respect to the sparsity pattern used in
     * the sparse_matrix object.
     */

    RYUJIN_OMP_FOR
    for (unsigned int i = 0; i < sparsity->n_internal_dofs; i += simd_length) {

      const unsigned int row_length = sparsity->row_length(i);

      const unsigned int *js = sparsity->columns(i);
      for (unsigned int col_idx = 0; col_idx < row_length;
           ++col_idx, js += simd_length) {

        VectorizedArray temp = {};
        for (unsigned int k = 0; k < simd_length; ++k)
          if (locally_indexed)
            temp[k] = sparse_matrix(i + k, js[k]);
          else
            temp[k] =
                sparse_matrix.el(sparsity->partitioner->local_to_global(i + k),
                                 sparsity->partitioner->local_to_global(js[k]));

        write_entry(temp, i, col_idx, true);
      }
    }

    RYUJIN_OMP_FOR
    for (unsigned int i = sparsity->n_internal_dofs;
         i < sparsity->n_locally_owned_dofs;
         ++i) {

      const unsigned int row_length = sparsity->row_length(i);
      const unsigned int *js = sparsity->columns(i);
      for (unsigned int col_idx = 0; col_idx < row_length; ++col_idx, ++js) {

        const Number temp =
            locally_indexed
                ? sparse_matrix(i, js[0])
                : sparse_matrix.el(
                      sparsity->partitioner->local_to_global(i),
                      sparsity->partitioner->local_to_global(js[0]));
        write_entry(temp, i, col_idx);
      }
    }

    RYUJIN_PARALLEL_REGION_END
  }

} // namespace ryujin
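
// A minimal usage sketch (hypothetical names: "dsp", "scalar_partitioner",
// "n_internal_dofs", "mass_matrix"); the classes above would typically be
// driven along these lines:
//
//   constexpr int simd_length = dealii::VectorizedArray<double>::size();
//
//   ryujin::SparsityPatternSIMD<simd_length> sparsity_simd;
//   sparsity_simd.reinit(n_internal_dofs, dsp, scalar_partitioner);
//
//   ryujin::SparseMatrixSIMD<double, 1, simd_length> mass_matrix_simd;
//   mass_matrix_simd.reinit(sparsity_simd);
//   mass_matrix_simd.read_in(mass_matrix, /*locally_indexed =*/ true);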
Definition: simd.h:357