ryujin 2.1.1 revision 6dc06e5864abd5d99e5d7ab641dbe621936411d9
sparse_matrix_simd.template.h
//
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Copyright (C) 2020 - 2024 by the ryujin authors
//

#pragma once

#include "sparse_matrix_simd.h"

#include <deal.II/base/vectorization.h>
#include <deal.II/lac/sparse_matrix.h>

namespace ryujin
{

  template <int simd_length>
  SparsityPatternSIMD<simd_length>::SparsityPatternSIMD()
      : n_internal_dofs(0)
      , row_starts(1)
      , mpi_communicator(MPI_COMM_SELF)
  {
  }


  template <int simd_length>
  SparsityPatternSIMD<simd_length>::SparsityPatternSIMD(
      const unsigned int n_internal_dofs,
      const dealii::DynamicSparsityPattern &sparsity,
      const std::shared_ptr<const dealii::Utilities::MPI::Partitioner>
          &partitioner)
      : n_internal_dofs(0)
      , mpi_communicator(MPI_COMM_SELF)
  {
    reinit(n_internal_dofs, sparsity, partitioner);
  }


  template <int simd_length>
  void SparsityPatternSIMD<simd_length>::reinit(
      const unsigned int n_internal_dofs,
      const dealii::DynamicSparsityPattern &dsp,
      const std::shared_ptr<const dealii::Utilities::MPI::Partitioner>
          &partitioner)
  {
    this->mpi_communicator = partitioner->get_mpi_communicator();

    this->n_internal_dofs = n_internal_dofs;
    this->n_locally_owned_dofs = partitioner->locally_owned_size();
    this->partitioner = partitioner;

    const auto n_locally_relevant_dofs =
        partitioner->locally_owned_size() + partitioner->n_ghost_indices();

    /*
     * First, create a static sparsity pattern (in local indexing), where
     * the only off-processor rows are the ones for which locally owned rows
     * request the transpose entries. This will be the one we finally
     * compute on.
     */

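    /*
     * Concretely, whenever a locally owned row i contains an entry (i, j)
     * with a ghost index j, the loop below also adds the transposed entry
     * (j, i), so ghost rows end up holding exactly the transposes of
     * locally owned entries and nothing else.
     */
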
    dealii::DynamicSparsityPattern dsp_minimal(n_locally_relevant_dofs,
                                               n_locally_relevant_dofs);
    for (unsigned int i = 0; i < n_locally_owned_dofs; ++i) {
      const auto global_row = partitioner->local_to_global(i);
      for (auto it = dsp.begin(global_row); it != dsp.end(global_row); ++it) {
        const auto global_column = it->column();
        const auto j = partitioner->global_to_local(global_column);
        dsp_minimal.add(i, j);
        if (j >= n_locally_owned_dofs) {
          Assert(j < n_locally_relevant_dofs, dealii::ExcInternalError());
          dsp_minimal.add(j, i);
        }
      }
    }

    dealii::SparsityPattern sparsity;
    sparsity.copy_from(dsp_minimal);

    Assert(n_internal_dofs <= sparsity.n_rows(), dealii::ExcInternalError());
    Assert(n_internal_dofs % simd_length == 0, dealii::ExcInternalError());
    Assert(n_internal_dofs <= n_locally_owned_dofs, dealii::ExcInternalError());
    Assert(n_locally_owned_dofs <= sparsity.n_rows(),
           dealii::ExcInternalError());

    row_starts.resize_fast(sparsity.n_rows() + 1);
    column_indices.resize_fast(sparsity.n_nonzero_elements());
    indices_transposed.resize_fast(sparsity.n_nonzero_elements());
    AssertThrow(sparsity.n_nonzero_elements() <
                    std::numeric_limits<unsigned int>::max(),
                dealii::ExcMessage("Transposed indices only support up to 4 "
                                   "billion matrix entries per MPI rank. Try to"
                                   " split into smaller problems with MPI"));

    /* Vectorized part: */

    row_starts[0] = 0;

    unsigned int *col_ptr = column_indices.data();
    unsigned int *transposed_ptr = indices_transposed.data();

    for (unsigned int i = 0; i < n_internal_dofs; i += simd_length) {
      auto jts = generate_iterators<simd_length>(
          [&](auto k) { return sparsity.begin(i + k); });

      for (; jts[0] != sparsity.end(i); increment_iterators(jts))
        for (unsigned int k = 0; k < simd_length; ++k) {
          const unsigned int column = jts[k]->column();
          *col_ptr++ = column;
          const std::size_t position = sparsity(column, i + k);
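          /*
           * Translate the (scalar) position of the transposed entry
           * (column, i + k) into the SIMD-interleaved storage: internal
           * rows are grouped into blocks of simd_length rows of equal row
           * length with their entries stored lane-interleaved. The block
           * containing row `column` starts simd_offset rows (each of
           * length my_row_length) before that row's first (diagonal)
           * entry, and within the block the entry sits at
           * pos_within_row * simd_length + simd_offset.
           */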
          if (column < n_internal_dofs) {
            const unsigned int my_row_length = sparsity.row_length(column);
            const std::size_t position_diag = sparsity(column, column);
            const std::size_t pos_within_row = position - position_diag;
            const unsigned int simd_offset = column % simd_length;
            *transposed_ptr++ = position - simd_offset * my_row_length -
                                pos_within_row + simd_offset +
                                pos_within_row * simd_length;
          } else
            *transposed_ptr++ = position;
        }

      row_starts[i / simd_length + 1] = col_ptr - column_indices.data();
    }

    /* Rest: */

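    /*
     * For the vectorized part above, row_starts is indexed per block of
     * simd_length rows (i / simd_length); from here on it is indexed per
     * row. The following assignment bridges the two indexing schemes so
     * that row_starts[n_internal_dofs] points at the first entry past the
     * vectorized part.
     */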
    row_starts[n_internal_dofs] = row_starts[n_internal_dofs / simd_length];

    for (unsigned int i = n_internal_dofs; i < sparsity.n_rows(); ++i) {
      for (auto j = sparsity.begin(i); j != sparsity.end(i); ++j) {
        const unsigned int column = j->column();
        *col_ptr++ = column;
        const std::size_t position = sparsity(column, i);
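        /*
         * Same conversion of the transposed position to the
         * SIMD-interleaved index as in the vectorized loop above:
         */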
        if (column < n_internal_dofs) {
          const unsigned int my_row_length = sparsity.row_length(column);
          const std::size_t position_diag = sparsity(column, column);
          const std::size_t pos_within_row = position - position_diag;
          const unsigned int simd_offset = column % simd_length;
          *transposed_ptr++ = position - simd_offset * my_row_length -
                              pos_within_row + simd_offset +
                              pos_within_row * simd_length;
        } else
          *transposed_ptr++ = position;
      }
      row_starts[i + 1] = col_ptr - column_indices.data();
    }

    Assert(col_ptr == column_indices.end(), dealii::ExcInternalError());

    /* Compute the data exchange pattern: */

    if (sparsity.n_rows() > n_locally_owned_dofs) {

      /*
       * Set up receive targets.
       *
       * We receive our (reduced) ghost rows from MPI ranks in the ghost
       * range of the (scalar) partitioner. We receive the entire (reduced)
       * ghost row from that MPI rank; we can thus simply query the
       * sparsity pattern for how many data points we receive from each
       * MPI rank.
       */

      const auto &ghost_targets = partitioner->ghost_targets();

      receive_targets.resize(ghost_targets.size());

      for (unsigned int p = 0; p < receive_targets.size(); ++p) {
        receive_targets[p].first = ghost_targets[p].first;
      }

      const auto gt_begin = ghost_targets.begin();
      auto gt_ptr = ghost_targets.begin();
      std::size_t index = 0; /* index into ghost range of sparsity pattern */
      unsigned int row_count = 0;

      for (unsigned int i = n_locally_owned_dofs; i < sparsity.n_rows(); ++i) {
        index += sparsity.row_length(i);
        ++row_count;
        if (row_count == gt_ptr->second) {
          receive_targets[gt_ptr - gt_begin].second = index;
          row_count = 0; /* reset row count and move on to the next rank */
          ++gt_ptr;
        }
      }
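
      /*
       * Note that `index` is never reset, so receive_targets[p].second
       * stores the cumulative number of entries received from ranks
       * 0, ..., p, i.e., a prefix sum over ghost ranks rather than a
       * per-rank count. The send_targets offsets computed below follow
       * the same convention.
       */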

      Assert(gt_ptr == partitioner->ghost_targets().end(),
             dealii::ExcInternalError());

      /*
       * Collect indices to be sent.
       *
       * These consist of the diagonal, as well as the entries of a row
       * whose columns fall within the given ghost range. Note that this
       * assumes that the sparsity pattern only contains those entries in
       * ghosted rows which have a corresponding transpose entry in the
       * owned rows; this is the case for our minimized sparsity pattern.
       */

      std::vector<unsigned int> ghost_ranges(ghost_targets.size() + 1);
      ghost_ranges[0] = n_locally_owned_dofs;
      for (unsigned int p = 0; p < receive_targets.size(); ++p) {
        ghost_ranges[p + 1] = ghost_ranges[p] + ghost_targets[p].second;
      }

      std::vector<unsigned int> import_indices_part;
      for (auto i : partitioner->import_indices())
        for (unsigned int j = i.first; j < i.second; ++j)
          import_indices_part.push_back(j);

      AssertDimension(import_indices_part.size(),
                      partitioner->n_import_indices());

      const auto &import_targets = partitioner->import_targets();
      entries_to_be_sent.clear();
      send_targets.resize(import_targets.size());
      auto idx = import_indices_part.begin();

      /*
       * Index p iterates over import_targets() and index p_match iterates
       * over ghost_ranges, such that
       * import_targets()[p].first == receive_targets[p_match].first
       */
      unsigned int p_match = 0;
      for (unsigned int p = 0; p < import_targets.size(); ++p) {

        /*
         * Match up the rank index between receive and import targets. If
         * we do not find a match, which can happen for locally refined
         * meshes, then we set p_match equal to receive_targets.size().
         *
         * When trying to match the next processor index we consequently
         * have to reset p_match to 0 again. This assumes that processor
         * indices are sorted in the receive_targets and ghost_targets
         * vectors.
         */
        p_match = (p_match == receive_targets.size() ? 0 : p_match);
        while (p_match < receive_targets.size() &&
               receive_targets[p_match].first != import_targets[p].first)
          p_match++;

        for (unsigned int c = 0; c < import_targets[p].second; ++c, ++idx) {
          /*
           * Continue if we do not have a match. Note that we need to enter
           * and continue this loop till the end in order to advance idx
           * correctly.
           */
          if (p_match == receive_targets.size())
            continue;

          const unsigned int row = *idx;

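          /*
           * Record which entries of the locally owned row `row` need to be
           * packed into the send buffer for the receiving rank: each pair
           * stores the row and a position within that row. Position 0 (the
           * diagonal) is always sent; the loop below adds every further
           * entry whose column lies in the ghost range associated with the
           * matched rank p_match.
           */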
          entries_to_be_sent.emplace_back(row, 0);
          for (auto jt = ++sparsity.begin(row); jt != sparsity.end(row); ++jt) {
            if (jt->column() >= ghost_ranges[p_match] &&
                jt->column() < ghost_ranges[p_match + 1]) {
              const unsigned int position_within_column =
                  jt - sparsity.begin(row);
              entries_to_be_sent.emplace_back(row, position_within_column);
            }
          }
        }

        send_targets[p].first = partitioner->import_targets()[p].first;
        send_targets[p].second = entries_to_be_sent.size();
      }
    }
  }


  template <typename Number, int n_components, int simd_length>
  SparseMatrixSIMD<Number, n_components, simd_length>::SparseMatrixSIMD()
      : sparsity(nullptr)
  {
  }


  template <typename Number, int n_components, int simd_length>
  SparseMatrixSIMD<Number, n_components, simd_length>::SparseMatrixSIMD(
      const SparsityPatternSIMD<simd_length> &sparsity)
      : sparsity(&sparsity)
  {
    data.resize(sparsity.n_nonzero_elements() * n_components);
  }


  template <typename Number, int n_components, int simd_length>
  void SparseMatrixSIMD<Number, n_components, simd_length>::reinit(
      const SparsityPatternSIMD<simd_length> &sparsity)
  {
    this->sparsity = &sparsity;
    data.resize(sparsity.n_nonzero_elements() * n_components);
  }


  template <typename Number, int n_components, int simd_length>
  template <typename SparseMatrix>
  void SparseMatrixSIMD<Number, n_components, simd_length>::read_in(
      const std::array<SparseMatrix, n_components> &sparse_matrix,
      bool locally_indexed /*= true*/)
  {
    RYUJIN_PARALLEL_REGION_BEGIN

    /*
     * We use the indirect (and slow) access via operator()(i, j) into the
     * sparse matrix we are copying from. This allows for significantly
     * increased flexibility with respect to the sparsity pattern used in
     * the sparse_matrix object.
     */

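    /*
     * Vectorized part: gather, lane by lane, the matrix entries of the
     * simd_length consecutive rows [i, i + simd_length) and write them
     * back as one SIMD-wide entry per position within the row.
     */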
    RYUJIN_OMP_FOR
    for (unsigned int i = 0; i < sparsity->n_internal_dofs; i += simd_length) {

      const unsigned int row_length = sparsity->row_length(i);

      const unsigned int *js = sparsity->columns(i);
      for (unsigned int col_idx = 0; col_idx < row_length;
           ++col_idx, js += simd_length) {

        dealii::Tensor<1, n_components, VectorizedArray> temp;
        for (unsigned int k = 0; k < simd_length; ++k)
          for (unsigned int d = 0; d < n_components; ++d)
            if (locally_indexed)
              temp[d][k] = sparse_matrix[d](i + k, js[k]);
            else
              temp[d][k] = sparse_matrix[d].el(
                  sparsity->partitioner->local_to_global(i + k),
                  sparsity->partitioner->local_to_global(js[k]));

        write_entry(temp, i, col_idx, true);
      }
    }

    RYUJIN_OMP_FOR
    for (unsigned int i = sparsity->n_internal_dofs;
         i < sparsity->n_locally_owned_dofs;
         ++i) {
      const unsigned int row_length = sparsity->row_length(i);
      const unsigned int *js = sparsity->columns(i);
      for (unsigned int col_idx = 0; col_idx < row_length; ++col_idx, ++js) {

        dealii::Tensor<1, n_components, Number> temp;
        for (unsigned int d = 0; d < n_components; ++d)
          if (locally_indexed)
            temp[d] = sparse_matrix[d](i, js[0]);
          else
            temp[d] = sparse_matrix[d].el(
                sparsity->partitioner->local_to_global(i),
                sparsity->partitioner->local_to_global(js[0]));
        write_entry(temp, i, col_idx);
      }
    }

    RYUJIN_PARALLEL_REGION_END
  }


  template <typename Number, int n_components, int simd_length>
  template <typename SparseMatrix>
  void SparseMatrixSIMD<Number, n_components, simd_length>::read_in(
      const SparseMatrix &sparse_matrix, bool locally_indexed /*= true*/)
  {
    RYUJIN_PARALLEL_REGION_BEGIN

    /*
     * We use the indirect (and slow) access via operator()(i, j) into the
     * sparse matrix we are copying from. This allows for significantly
     * increased flexibility with respect to the sparsity pattern used in
     * the sparse_matrix object.
     */

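    /*
     * As above: the vectorized loop gathers simd_length rows lane by lane,
     * the second loop handles the remaining locally owned rows one by one.
     */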
    RYUJIN_OMP_FOR
    for (unsigned int i = 0; i < sparsity->n_internal_dofs; i += simd_length) {

      const unsigned int row_length = sparsity->row_length(i);

      const unsigned int *js = sparsity->columns(i);
      for (unsigned int col_idx = 0; col_idx < row_length;
           ++col_idx, js += simd_length) {

        VectorizedArray temp = {};
        for (unsigned int k = 0; k < simd_length; ++k)
          if (locally_indexed)
            temp[k] = sparse_matrix(i + k, js[k]);
          else
            temp[k] =
                sparse_matrix.el(sparsity->partitioner->local_to_global(i + k),
                                 sparsity->partitioner->local_to_global(js[k]));

        write_entry(temp, i, col_idx, true);
      }
    }

    RYUJIN_OMP_FOR
    for (unsigned int i = sparsity->n_internal_dofs;
         i < sparsity->n_locally_owned_dofs;
         ++i) {

      const unsigned int row_length = sparsity->row_length(i);
      const unsigned int *js = sparsity->columns(i);
      for (unsigned int col_idx = 0; col_idx < row_length; ++col_idx, ++js) {

        const Number temp =
            locally_indexed
                ? sparse_matrix(i, js[0])
                : sparse_matrix.el(
                      sparsity->partitioner->local_to_global(i),
                      sparsity->partitioner->local_to_global(js[0]));
        write_entry(temp, i, col_idx);
      }
    }

    RYUJIN_PARALLEL_REGION_END
  }

} // namespace ryujin