MueLu  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
MueLu_TentativePFactory_kokkos_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 //
3 // ***********************************************************************
4 //
5 // MueLu: A package for multigrid based preconditioning
6 // Copyright 2012 Sandia Corporation
7 //
8 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9 // the U.S. Government retains certain rights in this software.
10 //
11 // Redistribution and use in source and binary forms, with or without
12 // modification, are permitted provided that the following conditions are
13 // met:
14 //
15 // 1. Redistributions of source code must retain the above copyright
16 // notice, this list of conditions and the following disclaimer.
17 //
18 // 2. Redistributions in binary form must reproduce the above copyright
19 // notice, this list of conditions and the following disclaimer in the
20 // documentation and/or other materials provided with the distribution.
21 //
22 // 3. Neither the name of the Corporation nor the names of the
23 // contributors may be used to endorse or promote products derived from
24 // this software without specific prior written permission.
25 //
26 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 //
38 // Questions? Contact
39 // Jonathan Hu (jhu@sandia.gov)
40 // Andrey Prokopenko (aprokop@sandia.gov)
41 // Ray Tuminaro (rstumin@sandia.gov)
42 //
43 // ***********************************************************************
44 //
45 // @HEADER
46 #ifndef MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
47 #define MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
48 
49 #include "Kokkos_UnorderedMap.hpp"
51 
53 
54 #include "MueLu_Aggregates.hpp"
55 #include "MueLu_AmalgamationInfo.hpp"
56 
57 #include "MueLu_MasterList.hpp"
58 #include "MueLu_PerfUtils.hpp"
59 #include "MueLu_Monitor.hpp"
60 
61 #include "Xpetra_IO.hpp"
62 
63 namespace MueLu {
64 
65 namespace { // anonymous
66 
67 template <class LocalOrdinal, class View>
68 class ReduceMaxFunctor {
69  public:
70  ReduceMaxFunctor(View view)
71  : view_(view) {}
72 
73  KOKKOS_INLINE_FUNCTION
74  void operator()(const LocalOrdinal& i, LocalOrdinal& vmax) const {
75  if (vmax < view_(i))
76  vmax = view_(i);
77  }
78 
79  KOKKOS_INLINE_FUNCTION
80  void join(LocalOrdinal& dst, const LocalOrdinal& src) const {
81  if (dst < src) {
82  dst = src;
83  }
84  }
85 
86  KOKKOS_INLINE_FUNCTION
87  void init(LocalOrdinal& dst) const {
88  dst = 0;
89  }
90 
91  private:
93 };
94 
95 // local QR decomposition
96 template <class LOType, class GOType, class SCType, class DeviceType, class NspType, class aggRowsType, class maxAggDofSizeType, class agg2RowMapLOType, class statusType, class rowsType, class rowsAuxType, class colsAuxType, class valsAuxType>
97 class LocalQRDecompFunctor {
98  private:
99  typedef LOType LO;
100  typedef GOType GO;
101  typedef SCType SC;
102 
103  typedef typename DeviceType::execution_space execution_space;
104  typedef typename Kokkos::ArithTraits<SC>::val_type impl_SC;
105  typedef Kokkos::ArithTraits<impl_SC> impl_ATS;
106  typedef typename impl_ATS::magnitudeType Magnitude;
107 
108  typedef Kokkos::View<impl_SC**, typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> shared_matrix;
109  typedef Kokkos::View<impl_SC*, typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> shared_vector;
110 
111  private:
112  NspType fineNS;
113  NspType coarseNS;
114  aggRowsType aggRows;
115  maxAggDofSizeType maxAggDofSize; //< maximum number of dofs in aggregate (max size of aggregate * numDofsPerNode)
116  agg2RowMapLOType agg2RowMapLO;
117  statusType statusAtomic;
118  rowsType rows;
119  rowsAuxType rowsAux;
120  colsAuxType colsAux;
121  valsAuxType valsAux;
122  bool doQRStep;
123 
124  public:
125  LocalQRDecompFunctor(NspType fineNS_, NspType coarseNS_, aggRowsType aggRows_, maxAggDofSizeType maxAggDofSize_, agg2RowMapLOType agg2RowMapLO_, statusType statusAtomic_, rowsType rows_, rowsAuxType rowsAux_, colsAuxType colsAux_, valsAuxType valsAux_, bool doQRStep_)
126  : fineNS(fineNS_)
127  , coarseNS(coarseNS_)
128  , aggRows(aggRows_)
129  , maxAggDofSize(maxAggDofSize_)
130  , agg2RowMapLO(agg2RowMapLO_)
131  , statusAtomic(statusAtomic_)
132  , rows(rows_)
133  , rowsAux(rowsAux_)
134  , colsAux(colsAux_)
135  , valsAux(valsAux_)
136  , doQRStep(doQRStep_) {}
137 
138  KOKKOS_INLINE_FUNCTION
139  void operator()(const typename Kokkos::TeamPolicy<execution_space>::member_type& thread, size_t& nnz) const {
140  auto agg = thread.league_rank();
141 
142  // size of aggregate: number of DOFs in aggregate
143  auto aggSize = aggRows(agg + 1) - aggRows(agg);
144 
145  const impl_SC one = impl_ATS::one();
146  const impl_SC two = one + one;
147  const impl_SC zero = impl_ATS::zero();
148  const auto zeroM = impl_ATS::magnitude(zero);
149 
150  int m = aggSize;
151  int n = fineNS.extent(1);
152 
153  // calculate row offset for coarse nullspace
154  Xpetra::global_size_t offset = agg * n;
155 
156  if (doQRStep) {
157  // Extract the piece of the nullspace corresponding to the aggregate
158  shared_matrix r(thread.team_shmem(), m, n); // A (initially), R (at the end)
159  for (int j = 0; j < n; j++)
160  for (int k = 0; k < m; k++)
161  r(k, j) = fineNS(agg2RowMapLO(aggRows(agg) + k), j);
162 #if 0
163  printf("A\n");
164  for (int i = 0; i < m; i++) {
165  for (int j = 0; j < n; j++)
166  printf(" %5.3lf ", r(i,j));
167  printf("\n");
168  }
169 #endif
170 
171  // Calculate QR decomposition (standard)
172  shared_matrix q(thread.team_shmem(), m, m); // Q
173  if (m >= n) {
174  bool isSingular = false;
175 
176  // Initialize Q^T
177  auto qt = q;
178  for (int i = 0; i < m; i++) {
179  for (int j = 0; j < m; j++)
180  qt(i, j) = zero;
181  qt(i, i) = one;
182  }
183 
184  for (int k = 0; k < n; k++) { // we ignore "n" instead of "n-1" to normalize
185  // FIXME_KOKKOS: use team
186  Magnitude s = zeroM, norm, norm_x;
187  for (int i = k + 1; i < m; i++)
188  s += pow(impl_ATS::magnitude(r(i, k)), 2);
189  norm = sqrt(pow(impl_ATS::magnitude(r(k, k)), 2) + s);
190 
191  if (norm == zero) {
192  isSingular = true;
193  break;
194  }
195 
196  r(k, k) -= norm * one;
197 
198  norm_x = sqrt(pow(impl_ATS::magnitude(r(k, k)), 2) + s);
199  if (norm_x == zeroM) {
200  // We have a single diagonal element in the column.
201  // No reflections required. Just need to restor r(k,k).
202  r(k, k) = norm * one;
203  continue;
204  }
205 
206  // FIXME_KOKKOS: use team
207  for (int i = k; i < m; i++)
208  r(i, k) /= norm_x;
209 
210  // Update R(k:m,k+1:n)
211  for (int j = k + 1; j < n; j++) {
212  // FIXME_KOKKOS: use team in the loops
213  impl_SC si = zero;
214  for (int i = k; i < m; i++)
215  si += r(i, k) * r(i, j);
216  for (int i = k; i < m; i++)
217  r(i, j) -= two * si * r(i, k);
218  }
219 
220  // Update Q^T (k:m,k:m)
221  for (int j = k; j < m; j++) {
222  // FIXME_KOKKOS: use team in the loops
223  impl_SC si = zero;
224  for (int i = k; i < m; i++)
225  si += r(i, k) * qt(i, j);
226  for (int i = k; i < m; i++)
227  qt(i, j) -= two * si * r(i, k);
228  }
229 
230  // Fix R(k:m,k)
231  r(k, k) = norm * one;
232  for (int i = k + 1; i < m; i++)
233  r(i, k) = zero;
234  }
235 
236 #if 0
237  // Q = (Q^T)^T
238  for (int i = 0; i < m; i++)
239  for (int j = 0; j < i; j++) {
240  impl_SC tmp = qt(i,j);
241  qt(i,j) = qt(j,i);
242  qt(j,i) = tmp;
243  }
244 #endif
245 
246  // Build coarse nullspace using the upper triangular part of R
247  for (int j = 0; j < n; j++)
248  for (int k = 0; k <= j; k++)
249  coarseNS(offset + k, j) = r(k, j);
250 
251  if (isSingular) {
252  statusAtomic(1) = true;
253  return;
254  }
255 
256  } else {
257  // Special handling for m < n (i.e. single node aggregates in structural mechanics)
258 
259  // The local QR decomposition is not possible in the "overconstrained"
260  // case (i.e. number of columns in qr > number of rowsAux), which
261  // corresponds to #DOFs in Aggregate < n. For usual problems this
262  // is only possible for single node aggregates in structural mechanics.
263  // (Similar problems may arise in discontinuous Galerkin problems...)
264  // We bypass the QR decomposition and use an identity block in the
265  // tentative prolongator for the single node aggregate and transfer the
266  // corresponding fine level null space information 1-to-1 to the coarse
267  // level null space part.
268 
269  // NOTE: The resulting tentative prolongation operator has
270  // (m*DofsPerNode-n) zero columns leading to a singular
271  // coarse level operator A. To deal with that one has the following
272  // options:
273  // - Use the "RepairMainDiagonal" flag in the RAPFactory (default:
274  // false) to add some identity block to the diagonal of the zero rowsAux
275  // in the coarse level operator A, such that standard level smoothers
276  // can be used again.
277  // - Use special (projection-based) level smoothers, which can deal
278  // with singular matrices (very application specific)
279  // - Adapt the code below to avoid zero columns. However, we do not
280  // support a variable number of DOFs per node in MueLu/Xpetra which
281  // makes the implementation really hard.
282  //
283  // FIXME: do we need to check for singularity here somehow? Zero
284  // columns would be easy but linear dependency would require proper QR.
285 
286  // R = extended (by adding identity rowsAux) qr
287  for (int j = 0; j < n; j++)
288  for (int k = 0; k < n; k++)
289  if (k < m)
290  coarseNS(offset + k, j) = r(k, j);
291  else
292  coarseNS(offset + k, j) = (k == j ? one : zero);
293 
294  // Q = I (rectangular)
295  for (int i = 0; i < m; i++)
296  for (int j = 0; j < n; j++)
297  q(i, j) = (j == i ? one : zero);
298  }
299 
300  // Process each row in the local Q factor and fill helper arrays to assemble P
301  for (int j = 0; j < m; j++) {
302  LO localRow = agg2RowMapLO(aggRows(agg) + j);
303  size_t rowStart = rowsAux(localRow);
304  size_t lnnz = 0;
305  for (int k = 0; k < n; k++) {
306  // skip zeros
307  if (q(j, k) != zero) {
308  colsAux(rowStart + lnnz) = offset + k;
309  valsAux(rowStart + lnnz) = q(j, k);
310  lnnz++;
311  }
312  }
313  rows(localRow + 1) = lnnz;
314  nnz += lnnz;
315  }
316 
317 #if 0
318  printf("R\n");
319  for (int i = 0; i < m; i++) {
320  for (int j = 0; j < n; j++)
321  printf(" %5.3lf ", coarseNS(i,j));
322  printf("\n");
323  }
324 
325  printf("Q\n");
326  for (int i = 0; i < aggSize; i++) {
327  for (int j = 0; j < aggSize; j++)
328  printf(" %5.3lf ", q(i,j));
329  printf("\n");
330  }
331 #endif
332  } else {
334  // "no-QR" option //
336  // Local Q factor is just the fine nullspace support over the current aggregate.
337  // Local R factor is the identity.
338  // TODO I have not implemented any special handling for aggregates that are too
339  // TODO small to locally support the nullspace, as is done in the standard QR
340  // TODO case above.
341 
342  for (int j = 0; j < m; j++) {
343  LO localRow = agg2RowMapLO(aggRows(agg) + j);
344  size_t rowStart = rowsAux(localRow);
345  size_t lnnz = 0;
346  for (int k = 0; k < n; k++) {
347  const impl_SC qr_jk = fineNS(localRow, k);
348  // skip zeros
349  if (qr_jk != zero) {
350  colsAux(rowStart + lnnz) = offset + k;
351  valsAux(rowStart + lnnz) = qr_jk;
352  lnnz++;
353  }
354  }
355  rows(localRow + 1) = lnnz;
356  nnz += lnnz;
357  }
358 
359  for (int j = 0; j < n; j++)
360  coarseNS(offset + j, j) = one;
361  }
362  }
363 
364  // amount of shared memory
365  size_t team_shmem_size(int /* team_size */) const {
366  if (doQRStep) {
367  int m = maxAggDofSize;
368  int n = fineNS.extent(1);
369  return shared_matrix::shmem_size(m, n) + // r
370  shared_matrix::shmem_size(m, m); // q
371  } else
372  return 0;
373  }
374 };
375 
376 } // namespace
377 
378 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
380  RCP<ParameterList> validParamList = rcp(new ParameterList());
381 
382 #define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name))
383  SET_VALID_ENTRY("tentative: calculate qr");
384  SET_VALID_ENTRY("tentative: build coarse coordinates");
385 #undef SET_VALID_ENTRY
386 
387  validParamList->set<RCP<const FactoryBase>>("A", Teuchos::null, "Generating factory of the matrix A");
388  validParamList->set<RCP<const FactoryBase>>("Aggregates", Teuchos::null, "Generating factory of the aggregates");
389  validParamList->set<RCP<const FactoryBase>>("Nullspace", Teuchos::null, "Generating factory of the nullspace");
390  validParamList->set<RCP<const FactoryBase>>("Scaled Nullspace", Teuchos::null, "Generating factory of the scaled nullspace");
391  validParamList->set<RCP<const FactoryBase>>("UnAmalgamationInfo", Teuchos::null, "Generating factory of UnAmalgamationInfo");
392  validParamList->set<RCP<const FactoryBase>>("CoarseMap", Teuchos::null, "Generating factory of the coarse map");
393  validParamList->set<RCP<const FactoryBase>>("Coordinates", Teuchos::null, "Generating factory of the coordinates");
394  validParamList->set<RCP<const FactoryBase>>("Node Comm", Teuchos::null, "Generating factory of the node level communicator");
395 
396  // Make sure we don't recursively validate options for the matrixmatrix kernels
397  ParameterList norecurse;
398  norecurse.disableRecursiveValidation();
399  validParamList->set<ParameterList>("matrixmatrix: kernel params", norecurse, "MatrixMatrix kernel parameters");
400 
401  return validParamList;
402 }
403 
404 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
406  const ParameterList& pL = GetParameterList();
407  // NOTE: This guy can only either be 'Nullspace' or 'Scaled Nullspace' or else the validator above will cause issues
408  std::string nspName = "Nullspace";
409  if (pL.isParameter("Nullspace name")) nspName = pL.get<std::string>("Nullspace name");
410 
411  Input(fineLevel, "A");
412  Input(fineLevel, "Aggregates");
413  Input(fineLevel, nspName);
414  Input(fineLevel, "UnAmalgamationInfo");
415  Input(fineLevel, "CoarseMap");
416  if (fineLevel.GetLevelID() == 0 &&
417  fineLevel.IsAvailable("Coordinates", NoFactory::get()) && // we have coordinates (provided by user app)
418  pL.get<bool>("tentative: build coarse coordinates")) { // and we want coordinates on other levels
419  bTransferCoordinates_ = true; // then set the transfer coordinates flag to true
420  Input(fineLevel, "Coordinates");
421  } else if (bTransferCoordinates_) {
422  Input(fineLevel, "Coordinates");
423  }
424 }
425 
426 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
428  return BuildP(fineLevel, coarseLevel);
429 }
430 
431 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
433  FactoryMonitor m(*this, "Build", coarseLevel);
434 
435  typedef typename Teuchos::ScalarTraits<Scalar>::coordinateType coordinate_type;
436  typedef Xpetra::MultiVectorFactory<coordinate_type, LO, GO, NO> RealValuedMultiVectorFactory;
437  const ParameterList& pL = GetParameterList();
438  std::string nspName = "Nullspace";
439  if (pL.isParameter("Nullspace name")) nspName = pL.get<std::string>("Nullspace name");
440 
441  auto A = Get<RCP<Matrix>>(fineLevel, "A");
442  auto aggregates = Get<RCP<Aggregates>>(fineLevel, "Aggregates");
443  auto amalgInfo = Get<RCP<AmalgamationInfo>>(fineLevel, "UnAmalgamationInfo");
444  auto fineNullspace = Get<RCP<MultiVector>>(fineLevel, nspName);
445  auto coarseMap = Get<RCP<const Map>>(fineLevel, "CoarseMap");
446  RCP<RealValuedMultiVector> fineCoords;
447  if (bTransferCoordinates_) {
448  fineCoords = Get<RCP<RealValuedMultiVector>>(fineLevel, "Coordinates");
449  }
450 
451  RCP<Matrix> Ptentative;
452  // No coarse DoFs so we need to bail by setting Ptentattive to null and returning
453  // This level will ultimately be removed in MueLu_Hierarchy_defs.h via a resize()
454  if (aggregates->GetNumGlobalAggregatesComputeIfNeeded() == 0) {
455  Ptentative = Teuchos::null;
456  Set(coarseLevel, "P", Ptentative);
457  return;
458  }
459  RCP<MultiVector> coarseNullspace;
460  RCP<RealValuedMultiVector> coarseCoords;
461 
462  if (bTransferCoordinates_) {
463  RCP<const Map> coarseCoordMap;
464  using array_type = typename Map::global_indices_array_device_type;
465 
466  LO blkSize = 1;
467  if (rcp_dynamic_cast<const StridedMap>(coarseMap) != Teuchos::null)
468  blkSize = rcp_dynamic_cast<const StridedMap>(coarseMap)->getFixedBlockSize();
469 
470  if (blkSize == 1) {
471  // Scalar system
472  // No amalgamation required, we can use the coarseMap
473  coarseCoordMap = coarseMap;
474  } else {
475  // Vector system
476  // NOTE: There could be further optimizations here where we detect contiguous maps and then
477  // create a contiguous amalgamated maps, which bypasses the expense of the getMyGlobalIndicesDevice()
478  // call (which is free for non-contiguous maps, but costs something if the map is contiguous).
479  using range_policy = Kokkos::RangePolicy<typename Node::execution_space>;
480  array_type elementAList = coarseMap->getMyGlobalIndicesDevice();
481  GO indexBase = coarseMap->getIndexBase();
482  auto numElements = elementAList.size() / blkSize;
483  typename array_type::non_const_type elementList_nc("elementList", numElements);
484 
485  // Amalgamate the map
486  Kokkos::parallel_for(
487  "Amalgamate Element List", range_policy(0, numElements), KOKKOS_LAMBDA(LO i) {
488  elementList_nc[i] = (elementAList[i * blkSize] - indexBase) / blkSize + indexBase;
489  });
490  array_type elementList = elementList_nc;
491  coarseCoordMap = MapFactory::Build(coarseMap->lib(), Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(),
492  elementList, indexBase, coarseMap->getComm());
493  }
494 
495  coarseCoords = RealValuedMultiVectorFactory::Build(coarseCoordMap, fineCoords->getNumVectors());
496 
497  // Create overlapped fine coordinates to reduce global communication
498  auto uniqueMap = fineCoords->getMap();
499  RCP<RealValuedMultiVector> ghostedCoords = fineCoords;
500  if (aggregates->AggregatesCrossProcessors()) {
501  auto nonUniqueMap = aggregates->GetMap();
502  auto importer = ImportFactory::Build(uniqueMap, nonUniqueMap);
503 
504  ghostedCoords = RealValuedMultiVectorFactory::Build(nonUniqueMap, fineCoords->getNumVectors());
505  ghostedCoords->doImport(*fineCoords, *importer, Xpetra::INSERT);
506  }
507 
508  // The good new is that his graph has already been constructed for the
509  // TentativePFactory and was cached in Aggregates. So this is a no-op.
510  auto aggGraph = aggregates->GetGraph();
511  auto numAggs = aggGraph.numRows();
512 
513  auto fineCoordsView = fineCoords->getDeviceLocalView(Xpetra::Access::ReadOnly);
514  auto coarseCoordsView = coarseCoords->getDeviceLocalView(Xpetra::Access::OverwriteAll);
515 
516  // Fill in coarse coordinates
517  {
518  SubFactoryMonitor m2(*this, "AverageCoords", coarseLevel);
519 
520  const auto dim = fineCoords->getNumVectors();
521 
522  typename AppendTrait<decltype(fineCoordsView), Kokkos::RandomAccess>::type fineCoordsRandomView = fineCoordsView;
523  for (size_t j = 0; j < dim; j++) {
524  Kokkos::parallel_for(
525  "MueLu::TentativeP::BuildCoords", Kokkos::RangePolicy<local_ordinal_type, execution_space>(0, numAggs),
526  KOKKOS_LAMBDA(const LO i) {
527  // A row in this graph represents all node ids in the aggregate
528  // Therefore, averaging is very easy
529 
530  auto aggregate = aggGraph.rowConst(i);
531 
532  coordinate_type sum = 0.0; // do not use Scalar here (Stokhos)
533  for (size_t colID = 0; colID < static_cast<size_t>(aggregate.length); colID++)
534  sum += fineCoordsRandomView(aggregate(colID), j);
535 
536  coarseCoordsView(i, j) = sum / aggregate.length;
537  });
538  }
539  }
540  }
541 
542  if (!aggregates->AggregatesCrossProcessors()) {
544  BuildPuncoupledBlockCrs(coarseLevel, A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace,
545  coarseLevel.GetLevelID());
546  } else {
547  BuildPuncoupled(coarseLevel, A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace, coarseLevel.GetLevelID());
548  }
549  } else
550  BuildPcoupled(A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace);
551 
552  // If available, use striding information of fine level matrix A for range
553  // map and coarseMap as domain map; otherwise use plain range map of
554  // Ptent = plain range map of A for range map and coarseMap as domain map.
555  // NOTE:
556  // The latter is not really safe, since there is no striding information
557  // for the range map. This is not really a problem, since striding
558  // information is always available on the intermedium levels and the
559  // coarsest levels.
560  if (A->IsView("stridedMaps") == true)
561  Ptentative->CreateView("stridedMaps", A->getRowMap("stridedMaps"), coarseMap);
562  else
563  Ptentative->CreateView("stridedMaps", Ptentative->getRangeMap(), coarseMap);
564 
565  if (bTransferCoordinates_) {
566  Set(coarseLevel, "Coordinates", coarseCoords);
567  }
568 
569  // FIXME: We should remove the NodeComm on levels past the threshold
570  if (fineLevel.IsAvailable("Node Comm")) {
571  RCP<const Teuchos::Comm<int>> nodeComm = Get<RCP<const Teuchos::Comm<int>>>(fineLevel, "Node Comm");
572  Set<RCP<const Teuchos::Comm<int>>>(coarseLevel, "Node Comm", nodeComm);
573  }
574 
575  Set(coarseLevel, "Nullspace", coarseNullspace);
576  Set(coarseLevel, "P", Ptentative);
577 
578  if (IsPrint(Statistics2)) {
579  RCP<ParameterList> params = rcp(new ParameterList());
580  params->set("printLoadBalancingInfo", true);
581  GetOStream(Statistics2) << PerfUtils::PrintMatrixInfo(*Ptentative, "Ptent", params);
582  }
583 }
584 
585 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
587  BuildPuncoupled(Level& coarseLevel, RCP<Matrix> A, RCP<Aggregates> aggregates,
588  RCP<AmalgamationInfo> amalgInfo, RCP<MultiVector> fineNullspace,
589  RCP<const Map> coarseMap, RCP<Matrix>& Ptentative,
590  RCP<MultiVector>& coarseNullspace, const int levelID) const {
591  auto rowMap = A->getRowMap();
592  auto colMap = A->getColMap();
593 
594  const size_t numRows = rowMap->getLocalNumElements();
595  const size_t NSDim = fineNullspace->getNumVectors();
596 
597  typedef Kokkos::ArithTraits<SC> ATS;
598  using impl_SC = typename ATS::val_type;
599  using impl_ATS = Kokkos::ArithTraits<impl_SC>;
600  const impl_SC zero = impl_ATS::zero(), one = impl_ATS::one();
601 
602  const LO INVALID = Teuchos::OrdinalTraits<LO>::invalid();
603 
604  typename Aggregates::local_graph_type aggGraph;
605  {
606  SubFactoryMonitor m2(*this, "Get Aggregates graph", coarseLevel);
607  aggGraph = aggregates->GetGraph();
608  }
609  auto aggRows = aggGraph.row_map;
610  auto aggCols = aggGraph.entries;
611 
612  // Aggregates map is based on the amalgamated column map
613  // We can skip global-to-local conversion if LIDs in row map are
614  // same as LIDs in column map
615  bool goodMap;
616  {
617  SubFactoryMonitor m2(*this, "Check good map", coarseLevel);
618  goodMap = isGoodMap(*rowMap, *colMap);
619  }
620  // FIXME_KOKKOS: need to proofread later code for bad maps
622  "MueLu: TentativePFactory_kokkos: for now works only with good maps "
623  "(i.e. \"matching\" row and column maps)");
624 
625  // STEP 1: do unamalgamation
626  // The non-kokkos version uses member functions from the AmalgamationInfo
627  // container class to unamalgamate the data. In contrast, the kokkos
628  // version of TentativePFactory does the unamalgamation here and only uses
629  // the data of the AmalgamationInfo container class
630 
631  // Extract information for unamalgamation
632  LO fullBlockSize, blockID, stridingOffset, stridedBlockSize;
633  GO indexBase;
634  amalgInfo->GetStridingInformation(fullBlockSize, blockID, stridingOffset, stridedBlockSize, indexBase);
635  GO globalOffset = amalgInfo->GlobalOffset();
636 
637  // Extract aggregation info (already in Kokkos host views)
638  auto procWinner = aggregates->GetProcWinner()->getDeviceLocalView(Xpetra::Access::ReadOnly);
639  auto vertex2AggId = aggregates->GetVertex2AggId()->getDeviceLocalView(Xpetra::Access::ReadOnly);
640  const size_t numAggregates = aggregates->GetNumAggregates();
641 
642  int myPID = aggregates->GetMap()->getComm()->getRank();
643 
644  // Create Kokkos::View (on the device) to store the aggreate dof sizes
645  // Later used to get aggregate dof offsets
646  // NOTE: This zeros itself on construction
647  typedef typename Aggregates::aggregates_sizes_type::non_const_type AggSizeType;
648  AggSizeType aggDofSizes;
649 
650  if (stridedBlockSize == 1) {
651  SubFactoryMonitor m2(*this, "Calc AggSizes", coarseLevel);
652 
653  // FIXME_KOKKOS: use ViewAllocateWithoutInitializing + set a single value
654  aggDofSizes = AggSizeType("agg_dof_sizes", numAggregates + 1);
655 
656  auto sizesConst = aggregates->ComputeAggregateSizes();
657  Kokkos::deep_copy(Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(1), numAggregates + 1)), sizesConst);
658 
659  } else {
660  SubFactoryMonitor m2(*this, "Calc AggSizes", coarseLevel);
661 
662  // FIXME_KOKKOS: use ViewAllocateWithoutInitializing + set a single value
663  aggDofSizes = AggSizeType("agg_dof_sizes", numAggregates + 1);
664 
665  auto nodeMap = aggregates->GetMap()->getLocalMap();
666  auto dofMap = colMap->getLocalMap();
667 
668  Kokkos::parallel_for(
669  "MueLu:TentativePF:Build:compute_agg_sizes", range_type(0, numAggregates),
670  KOKKOS_LAMBDA(const LO agg) {
671  auto aggRowView = aggGraph.rowConst(agg);
672 
673  size_t size = 0;
674  for (LO colID = 0; colID < aggRowView.length; colID++) {
675  GO nodeGID = nodeMap.getGlobalElement(aggRowView(colID));
676 
677  for (LO k = 0; k < stridedBlockSize; k++) {
678  GO dofGID = (nodeGID - indexBase) * fullBlockSize + k + indexBase + globalOffset + stridingOffset;
679 
680  if (dofMap.getLocalElement(dofGID) != INVALID)
681  size++;
682  }
683  }
684  aggDofSizes(agg + 1) = size;
685  });
686  }
687 
688  // Find maximum dof size for aggregates
689  // Later used to reserve enough scratch space for local QR decompositions
690  LO maxAggSize = 0;
691  ReduceMaxFunctor<LO, decltype(aggDofSizes)> reduceMax(aggDofSizes);
692  Kokkos::parallel_reduce("MueLu:TentativePF:Build:max_agg_size", range_type(0, aggDofSizes.extent(0)), reduceMax, maxAggSize);
693 
694  // parallel_scan (exclusive)
695  // The aggDofSizes View then contains the aggregate dof offsets
696  Kokkos::parallel_scan(
697  "MueLu:TentativePF:Build:aggregate_sizes:stage1_scan", range_type(0, numAggregates + 1),
698  KOKKOS_LAMBDA(const LO i, LO& update, const bool& final_pass) {
699  update += aggDofSizes(i);
700  if (final_pass)
701  aggDofSizes(i) = update;
702  });
703 
704  // Create Kokkos::View on the device to store mapping
705  // between (local) aggregate id and row map ids (LIDs)
706  Kokkos::View<LO*, DeviceType> agg2RowMapLO(Kokkos::ViewAllocateWithoutInitializing("agg2row_map_LO"), numRows);
707  {
708  SubFactoryMonitor m2(*this, "Create Agg2RowMap", coarseLevel);
709 
710  AggSizeType aggOffsets(Kokkos::ViewAllocateWithoutInitializing("aggOffsets"), numAggregates);
711  Kokkos::deep_copy(aggOffsets, Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(0), numAggregates)));
712 
713  Kokkos::parallel_for(
714  "MueLu:TentativePF:Build:createAgg2RowMap", range_type(0, vertex2AggId.extent(0)),
715  KOKKOS_LAMBDA(const LO lnode) {
716  if (procWinner(lnode, 0) == myPID) {
717  // No need for atomics, it's one-to-one
718  auto aggID = vertex2AggId(lnode, 0);
719 
720  auto offset = Kokkos::atomic_fetch_add(&aggOffsets(aggID), stridedBlockSize);
721  // FIXME: I think this may be wrong
722  // We unconditionally add the whole block here. When we calculated
723  // aggDofSizes, we did the isLocalElement check. Something's fishy.
724  for (LO k = 0; k < stridedBlockSize; k++)
725  agg2RowMapLO(offset + k) = lnode * stridedBlockSize + k;
726  }
727  });
728  }
729 
730  // STEP 2: prepare local QR decomposition
731  // Reserve memory for tentative prolongation operator
732  coarseNullspace = MultiVectorFactory::Build(coarseMap, NSDim);
733 
734  // Pull out the nullspace vectors so that we can have random access (on the device)
735  auto fineNS = fineNullspace->getDeviceLocalView(Xpetra::Access::ReadWrite);
736  auto coarseNS = coarseNullspace->getDeviceLocalView(Xpetra::Access::OverwriteAll);
737 
738  size_t nnz = 0; // actual number of nnz
739 
740  typedef typename Xpetra::Matrix<SC, LO, GO, NO>::local_matrix_type local_matrix_type;
741  typedef typename local_matrix_type::row_map_type::non_const_type rows_type;
742  typedef typename local_matrix_type::index_type::non_const_type cols_type;
743  typedef typename local_matrix_type::values_type::non_const_type vals_type;
744 
745  // Device View for status (error messages...)
746  typedef Kokkos::View<int[10], DeviceType> status_type;
747  status_type status("status");
748 
751 
752  const ParameterList& pL = GetParameterList();
753  const bool& doQRStep = pL.get<bool>("tentative: calculate qr");
754  if (!doQRStep) {
755  GetOStream(Runtime1) << "TentativePFactory : bypassing local QR phase" << std::endl;
756  if (NSDim > 1)
757  GetOStream(Warnings0) << "TentativePFactor : for nontrivial nullspace, this may degrade performance" << std::endl;
758  }
759 
760  size_t nnzEstimate = numRows * NSDim;
761  rows_type rowsAux(Kokkos::ViewAllocateWithoutInitializing("Ptent_aux_rows"), numRows + 1);
762  cols_type colsAux(Kokkos::ViewAllocateWithoutInitializing("Ptent_aux_cols"), nnzEstimate);
763  vals_type valsAux("Ptent_aux_vals", nnzEstimate);
764  rows_type rows("Ptent_rows", numRows + 1);
765  {
766  // Stage 0: fill in views.
767  SubFactoryMonitor m2(*this, "Stage 0 (InitViews)", coarseLevel);
768 
769  // The main thing to notice is initialization of vals with INVALID. These
770  // values will later be used to compress the arrays
771  Kokkos::parallel_for(
772  "MueLu:TentativePF:BuildPuncoupled:for1", range_type(0, numRows + 1),
773  KOKKOS_LAMBDA(const LO row) {
774  rowsAux(row) = row * NSDim;
775  });
776  Kokkos::parallel_for(
777  "MueLu:TentativePF:BuildUncoupled:for2", range_type(0, nnzEstimate),
778  KOKKOS_LAMBDA(const LO j) {
779  colsAux(j) = INVALID;
780  });
781  }
782 
783  if (NSDim == 1) {
784  // 1D is special, as it is the easiest. We don't even need to the QR,
785  // just normalize an array. Plus, no worries abot small aggregates. In
786  // addition, we do not worry about compression. It is unlikely that
787  // nullspace will have zeros. If it does, a prolongator row would be
788  // zero and we'll get singularity anyway.
789  SubFactoryMonitor m2(*this, "Stage 1 (LocalQR)", coarseLevel);
790 
791  // Set up team policy with numAggregates teams and one thread per team.
792  // Each team handles a slice of the data associated with one aggregate
793  // and performs a local QR decomposition (in this case real QR is
794  // unnecessary).
795  const Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1);
796 
797  if (doQRStep) {
798  Kokkos::parallel_for(
799  "MueLu:TentativePF:BuildUncoupled:main_loop", policy,
800  KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type& thread) {
801  auto agg = thread.league_rank();
802 
803  // size of the aggregate (number of DOFs in aggregate)
804  LO aggSize = aggRows(agg + 1) - aggRows(agg);
805 
806  // Extract the piece of the nullspace corresponding to the aggregate, and
807  // put it in the flat array, "localQR" (in column major format) for the
808  // QR routine. Trivial in 1D.
809  auto norm = impl_ATS::magnitude(zero);
810 
811  // Calculate QR by hand
812  // FIXME: shouldn't there be stridedblock here?
813  // FIXME_KOKKOS: shouldn't there be stridedblock here?
814  for (decltype(aggSize) k = 0; k < aggSize; k++) {
815  auto dnorm = impl_ATS::magnitude(fineNSRandom(agg2RowMapLO(aggRows(agg) + k), 0));
816  norm += dnorm * dnorm;
817  }
818  norm = sqrt(norm);
819 
820  if (norm == zero) {
821  // zero column; terminate the execution
822  statusAtomic(1) = true;
823  return;
824  }
825 
826  // R = norm
827  coarseNS(agg, 0) = norm;
828 
829  // Q = localQR(:,0)/norm
830  for (decltype(aggSize) k = 0; k < aggSize; k++) {
831  LO localRow = agg2RowMapLO(aggRows(agg) + k);
832  impl_SC localVal = fineNSRandom(agg2RowMapLO(aggRows(agg) + k), 0) / norm;
833 
834  rows(localRow + 1) = 1;
835  colsAux(localRow) = agg;
836  valsAux(localRow) = localVal;
837  }
838  });
839 
840  typename status_type::HostMirror statusHost = Kokkos::create_mirror_view(status);
841  Kokkos::deep_copy(statusHost, status);
842  for (decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
843  if (statusHost(i)) {
844  std::ostringstream oss;
845  oss << "MueLu::TentativePFactory::MakeTentative: ";
846  switch (i) {
847  case 0: oss << "!goodMap is not implemented"; break;
848  case 1: oss << "fine level NS part has a zero column"; break;
849  }
850  throw Exceptions::RuntimeError(oss.str());
851  }
852 
853  } else {
854  Kokkos::parallel_for(
855  "MueLu:TentativePF:BuildUncoupled:main_loop_noqr", policy,
856  KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type& thread) {
857  auto agg = thread.league_rank();
858 
859  // size of the aggregate (number of DOFs in aggregate)
860  LO aggSize = aggRows(agg + 1) - aggRows(agg);
861 
862  // R = norm
863  coarseNS(agg, 0) = one;
864 
865  // Q = localQR(:,0)/norm
866  for (decltype(aggSize) k = 0; k < aggSize; k++) {
867  LO localRow = agg2RowMapLO(aggRows(agg) + k);
868  impl_SC localVal = fineNSRandom(agg2RowMapLO(aggRows(agg) + k), 0);
869 
870  rows(localRow + 1) = 1;
871  colsAux(localRow) = agg;
872  valsAux(localRow) = localVal;
873  }
874  });
875  }
876 
877  Kokkos::parallel_reduce(
878  "MueLu:TentativeP:CountNNZ", range_type(0, numRows + 1),
879  KOKKOS_LAMBDA(const LO i, size_t& nnz_count) {
880  nnz_count += rows(i);
881  },
882  nnz);
883 
884  } else { // NSdim > 1
885  // FIXME_KOKKOS: This code branch is completely unoptimized.
886  // Work to do:
887  // - Optimize QR decomposition
888  // - Remove INVALID usage similarly to CoalesceDropFactory_kokkos by
889  // packing new values in the beginning of each row
890  // We do use auxilary view in this case, so keep a second rows view for
891  // counting nonzeros in rows
892 
893  {
894  SubFactoryMonitor m2 = SubFactoryMonitor(*this, doQRStep ? "Stage 1 (LocalQR)" : "Stage 1 (Fill coarse nullspace and tentative P)", coarseLevel);
895  // Set up team policy with numAggregates teams and one thread per team.
896  // Each team handles a slice of the data associated with one aggregate
897  // and performs a local QR decomposition
898  const Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1); // numAggregates teams a 1 thread
899  LocalQRDecompFunctor<LocalOrdinal, GlobalOrdinal, Scalar, DeviceType, decltype(fineNSRandom),
900  decltype(aggDofSizes /*aggregate sizes in dofs*/), decltype(maxAggSize), decltype(agg2RowMapLO),
901  decltype(statusAtomic), decltype(rows), decltype(rowsAux), decltype(colsAux),
902  decltype(valsAux)>
903  localQRFunctor(fineNSRandom, coarseNS, aggDofSizes, maxAggSize, agg2RowMapLO, statusAtomic,
904  rows, rowsAux, colsAux, valsAux, doQRStep);
905  Kokkos::parallel_reduce("MueLu:TentativePF:BuildUncoupled:main_qr_loop", policy, localQRFunctor, nnz);
906  }
907 
908  typename status_type::HostMirror statusHost = Kokkos::create_mirror_view(status);
909  Kokkos::deep_copy(statusHost, status);
910  for (decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
911  if (statusHost(i)) {
912  std::ostringstream oss;
913  oss << "MueLu::TentativePFactory::MakeTentative: ";
914  switch (i) {
915  case 0: oss << "!goodMap is not implemented"; break;
916  case 1: oss << "fine level NS part has a zero column"; break;
917  }
918  throw Exceptions::RuntimeError(oss.str());
919  }
920  }
921 
922  // Compress the cols and vals by ignoring INVALID column entries that correspond
923  // to 0 in QR.
924 
925  // The real cols and vals are constructed using calculated (not estimated) nnz
926  cols_type cols;
927  vals_type vals;
928 
929  if (nnz != nnzEstimate) {
930  {
931  // Stage 2: compress the arrays
932  SubFactoryMonitor m2(*this, "Stage 2 (CompressRows)", coarseLevel);
933 
934  Kokkos::parallel_scan(
935  "MueLu:TentativePF:Build:compress_rows", range_type(0, numRows + 1),
936  KOKKOS_LAMBDA(const LO i, LO& upd, const bool& final) {
937  upd += rows(i);
938  if (final)
939  rows(i) = upd;
940  });
941  }
942 
943  {
944  SubFactoryMonitor m2(*this, "Stage 2 (CompressCols)", coarseLevel);
945 
946  cols = cols_type("Ptent_cols", nnz);
947  vals = vals_type("Ptent_vals", nnz);
948 
949  // FIXME_KOKKOS: this can be spedup by moving correct cols and vals values
950  // to the beginning of rows. See CoalesceDropFactory_kokkos for
951  // example.
952  Kokkos::parallel_for(
953  "MueLu:TentativePF:Build:compress_cols_vals", range_type(0, numRows),
954  KOKKOS_LAMBDA(const LO i) {
955  LO rowStart = rows(i);
956 
957  size_t lnnz = 0;
958  for (auto j = rowsAux(i); j < rowsAux(i + 1); j++)
959  if (colsAux(j) != INVALID) {
960  cols(rowStart + lnnz) = colsAux(j);
961  vals(rowStart + lnnz) = valsAux(j);
962  lnnz++;
963  }
964  });
965  }
966 
967  } else {
968  rows = rowsAux;
969  cols = colsAux;
970  vals = valsAux;
971  }
972 
973  GetOStream(Runtime1) << "TentativePFactory : aggregates do not cross process boundaries" << std::endl;
974 
975  {
976  // Stage 3: construct Xpetra::Matrix
977  SubFactoryMonitor m2(*this, "Stage 3 (LocalMatrix+FillComplete)", coarseLevel);
978 
979  local_matrix_type lclMatrix = local_matrix_type("A", numRows, coarseMap->getLocalNumElements(), nnz, vals, rows, cols);
980 
981  // Managing labels & constants for ESFC
982  RCP<ParameterList> FCparams;
983  if (pL.isSublist("matrixmatrix: kernel params"))
984  FCparams = rcp(new ParameterList(pL.sublist("matrixmatrix: kernel params")));
985  else
986  FCparams = rcp(new ParameterList);
987 
988  // By default, we don't need global constants for TentativeP
989  FCparams->set("compute global constants", FCparams->get("compute global constants", false));
990  FCparams->set("Timer Label", std::string("MueLu::TentativeP-") + toString(levelID));
991 
992  auto PtentCrs = CrsMatrixFactory::Build(lclMatrix, rowMap, coarseMap, coarseMap, A->getDomainMap());
993  Ptentative = rcp(new CrsMatrixWrap(PtentCrs));
994  }
995 }
996 
997 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1000  RCP<AmalgamationInfo> amalgInfo, RCP<MultiVector> fineNullspace,
1001  RCP<const Map> coarsePointMap, RCP<Matrix>& Ptentative,
1002  RCP<MultiVector>& coarseNullspace, const int levelID) const {
1003  /* This routine generates a BlockCrs P for a BlockCrs A. There are a few assumptions here, which meet the use cases we care about, but could
1004  be generalized later, if we ever need to do so:
1005  1) Null space dimension === block size of matrix: So no elasticity right now
1006  2) QR is not supported: Under assumption #1, this shouldn't cause problems.
1007  3) Maps are "good": Aka the first chunk of the ColMap is the RowMap.
1008 
1009  These assumptions keep our code way simpler and still support the use cases we actually care about.
1010  */
1011 
1012  RCP<const Map> rowMap = A->getRowMap();
1013  RCP<const Map> rangeMap = A->getRangeMap();
1014  RCP<const Map> colMap = A->getColMap();
1015  // const size_t numFinePointRows = rangeMap->getLocalNumElements();
1016  const size_t numFineBlockRows = rowMap->getLocalNumElements();
1017 
1018  // typedef Teuchos::ScalarTraits<SC> STS;
1019  // typedef typename STS::magnitudeType Magnitude;
1020  const LO INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1021 
1022  typedef Kokkos::ArithTraits<SC> ATS;
1023  using impl_SC = typename ATS::val_type;
1024  using impl_ATS = Kokkos::ArithTraits<impl_SC>;
1025  const impl_SC one = impl_ATS::one();
1026 
1027  // const GO numAggs = aggregates->GetNumAggregates();
1028  const size_t NSDim = fineNullspace->getNumVectors();
1029  auto aggSizes = aggregates->ComputeAggregateSizes();
1030 
1031  typename Aggregates::local_graph_type aggGraph;
1032  {
1033  SubFactoryMonitor m2(*this, "Get Aggregates graph", coarseLevel);
1034  aggGraph = aggregates->GetGraph();
1035  }
1036  auto aggRows = aggGraph.row_map;
1037  auto aggCols = aggGraph.entries;
1038 
1039  // Need to generate the coarse block map
1040  // NOTE: We assume NSDim == block size here
1041  // NOTE: We also assume that coarseMap has contiguous GIDs
1042  // const size_t numCoarsePointRows = coarsePointMap->getLocalNumElements();
1043  const size_t numCoarseBlockRows = coarsePointMap->getLocalNumElements() / NSDim;
1044  RCP<const Map> coarseBlockMap = MapFactory::Build(coarsePointMap->lib(),
1046  numCoarseBlockRows,
1047  coarsePointMap->getIndexBase(),
1048  coarsePointMap->getComm());
1049  // Sanity checking
1050  const ParameterList& pL = GetParameterList();
1051  // const bool &doQRStep = pL.get<bool>("tentative: calculate qr");
1052 
1053  // The aggregates use the amalgamated column map, which in this case is what we want
1054 
1055  // Aggregates map is based on the amalgamated column map
1056  // We can skip global-to-local conversion if LIDs in row map are
1057  // same as LIDs in column map
1058  bool goodMap = MueLu::Utilities<SC, LO, GO, NO>::MapsAreNested(*rowMap, *colMap);
1060  "MueLu: TentativePFactory_kokkos: for now works only with good maps "
1061  "(i.e. \"matching\" row and column maps)");
1062 
1063  // STEP 1: do unamalgamation
1064  // The non-kokkos version uses member functions from the AmalgamationInfo
1065  // container class to unamalgamate the data. In contrast, the kokkos
1066  // version of TentativePFactory does the unamalgamation here and only uses
1067  // the data of the AmalgamationInfo container class
1068 
1069  // Extract information for unamalgamation
1070  LO fullBlockSize, blockID, stridingOffset, stridedBlockSize;
1071  GO indexBase;
1072  amalgInfo->GetStridingInformation(fullBlockSize, blockID, stridingOffset, stridedBlockSize, indexBase);
1073  // GO globalOffset = amalgInfo->GlobalOffset();
1074 
1075  // Extract aggregation info (already in Kokkos host views)
1076  auto procWinner = aggregates->GetProcWinner()->getDeviceLocalView(Xpetra::Access::ReadOnly);
1077  auto vertex2AggId = aggregates->GetVertex2AggId()->getDeviceLocalView(Xpetra::Access::ReadOnly);
1078  const size_t numAggregates = aggregates->GetNumAggregates();
1079 
1080  int myPID = aggregates->GetMap()->getComm()->getRank();
1081 
1082  // Create Kokkos::View (on the device) to store the aggreate dof sizes
1083  // Later used to get aggregate dof offsets
1084  // NOTE: This zeros itself on construction
1085  typedef typename Aggregates::aggregates_sizes_type::non_const_type AggSizeType;
1086  AggSizeType aggDofSizes; // This turns into "starts" after the parallel_scan
1087 
1088  {
1089  SubFactoryMonitor m2(*this, "Calc AggSizes", coarseLevel);
1090 
1091  // FIXME_KOKKOS: use ViewAllocateWithoutInitializing + set a single value
1092  aggDofSizes = AggSizeType("agg_dof_sizes", numAggregates + 1);
1093 
1094  Kokkos::deep_copy(Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(1), numAggregates + 1)), aggSizes);
1095  }
1096 
1097  // Find maximum dof size for aggregates
1098  // Later used to reserve enough scratch space for local QR decompositions
1099  LO maxAggSize = 0;
1100  ReduceMaxFunctor<LO, decltype(aggDofSizes)> reduceMax(aggDofSizes);
1101  Kokkos::parallel_reduce("MueLu:TentativePF:Build:max_agg_size", range_type(0, aggDofSizes.extent(0)), reduceMax, maxAggSize);
1102 
1103  // parallel_scan (exclusive)
1104  // The aggDofSizes View then contains the aggregate dof offsets
1105  Kokkos::parallel_scan(
1106  "MueLu:TentativePF:Build:aggregate_sizes:stage1_scan", range_type(0, numAggregates + 1),
1107  KOKKOS_LAMBDA(const LO i, LO& update, const bool& final_pass) {
1108  update += aggDofSizes(i);
1109  if (final_pass)
1110  aggDofSizes(i) = update;
1111  });
1112 
1113  // Create Kokkos::View on the device to store mapping
1114  // between (local) aggregate id and row map ids (LIDs)
1115  Kokkos::View<LO*, DeviceType> aggToRowMapLO(Kokkos::ViewAllocateWithoutInitializing("aggtorow_map_LO"), numFineBlockRows);
1116  {
1117  SubFactoryMonitor m2(*this, "Create AggToRowMap", coarseLevel);
1118 
1119  AggSizeType aggOffsets(Kokkos::ViewAllocateWithoutInitializing("aggOffsets"), numAggregates);
1120  Kokkos::deep_copy(aggOffsets, Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(0), numAggregates)));
1121 
1122  Kokkos::parallel_for(
1123  "MueLu:TentativePF:Build:createAgg2RowMap", range_type(0, vertex2AggId.extent(0)),
1124  KOKKOS_LAMBDA(const LO lnode) {
1125  if (procWinner(lnode, 0) == myPID) {
1126  // No need for atomics, it's one-to-one
1127  auto aggID = vertex2AggId(lnode, 0);
1128 
1129  auto offset = Kokkos::atomic_fetch_add(&aggOffsets(aggID), stridedBlockSize);
1130  // FIXME: I think this may be wrong
1131  // We unconditionally add the whole block here. When we calculated
1132  // aggDofSizes, we did the isLocalElement check. Something's fishy.
1133  for (LO k = 0; k < stridedBlockSize; k++)
1134  aggToRowMapLO(offset + k) = lnode * stridedBlockSize + k;
1135  }
1136  });
1137  }
1138 
1139  // STEP 2: prepare local QR decomposition
1140  // Reserve memory for tentative prolongation operator
1141  coarseNullspace = MultiVectorFactory::Build(coarsePointMap, NSDim);
1142 
1143  // Pull out the nullspace vectors so that we can have random access (on the device)
1144  auto fineNS = fineNullspace->getDeviceLocalView(Xpetra::Access::ReadWrite);
1145  auto coarseNS = coarseNullspace->getDeviceLocalView(Xpetra::Access::OverwriteAll);
1146 
1147  typedef typename Xpetra::Matrix<SC, LO, GO, NO>::local_matrix_type local_matrix_type;
1148  typedef typename local_matrix_type::row_map_type::non_const_type rows_type;
1149  typedef typename local_matrix_type::index_type::non_const_type cols_type;
1150  // typedef typename local_matrix_type::values_type::non_const_type vals_type;
1151 
1152  // Device View for status (error messages...)
1153  typedef Kokkos::View<int[10], DeviceType> status_type;
1154  status_type status("status");
1155 
1158 
1159  // We're going to bypass QR in the BlockCrs version of the code regardless of what the user asks for
1160  GetOStream(Runtime1) << "TentativePFactory : bypassing local QR phase" << std::endl;
1161 
1162  // BlockCrs requires that we build the (block) graph first, so let's do that...
1163 
1164  // NOTE: Because we're assuming that the NSDim == BlockSize, we only have one
1165  // block non-zero per row in the matrix;
1166  rows_type ia(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_rowptr"), numFineBlockRows + 1);
1167  cols_type ja(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_colind"), numFineBlockRows);
1168 
1169  Kokkos::parallel_for(
1170  "MueLu:TentativePF:BlockCrs:graph_init", range_type(0, numFineBlockRows),
1171  KOKKOS_LAMBDA(const LO j) {
1172  ia[j] = j;
1173  ja[j] = INVALID;
1174 
1175  if (j == (LO)numFineBlockRows - 1)
1176  ia[numFineBlockRows] = numFineBlockRows;
1177  });
1178 
1179  // Fill Graph
1180  const Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1);
1181  Kokkos::parallel_for(
1182  "MueLu:TentativePF:BlockCrs:fillGraph", policy,
1183  KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type& thread) {
1184  auto agg = thread.league_rank();
1185  Xpetra::global_size_t offset = agg;
1186 
1187  // size of the aggregate (number of DOFs in aggregate)
1188  LO aggSize = aggRows(agg + 1) - aggRows(agg);
1189 
1190  for (LO j = 0; j < aggSize; j++) {
1191  // FIXME: Allow for bad maps
1192  const LO localRow = aggToRowMapLO[aggDofSizes[agg] + j];
1193  const size_t rowStart = ia[localRow];
1194  ja[rowStart] = offset;
1195  }
1196  });
1197 
1198  // Compress storage (remove all INVALID, which happen when we skip zeros)
1199  // We do that in-place
1200  {
1201  // Stage 2: compress the arrays
1202  SubFactoryMonitor m2(*this, "Stage 2 (CompressData)", coarseLevel);
1203  // Fill i_temp with the correct row starts
1204  rows_type i_temp(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_rowptr"), numFineBlockRows + 1);
1205  LO nnz = 0;
1206  Kokkos::parallel_scan(
1207  "MueLu:TentativePF:BlockCrs:compress_rows", range_type(0, numFineBlockRows),
1208  KOKKOS_LAMBDA(const LO i, LO& upd, const bool& final) {
1209  if (final)
1210  i_temp[i] = upd;
1211  for (auto j = ia[i]; j < ia[i + 1]; j++)
1212  if (ja[j] != INVALID)
1213  upd++;
1214  if (final && i == (LO)numFineBlockRows - 1)
1215  i_temp[numFineBlockRows] = upd;
1216  },
1217  nnz);
1218 
1219  cols_type j_temp(Kokkos::ViewAllocateWithoutInitializing("BlockGraph_colind"), nnz);
1220 
1221  Kokkos::parallel_for(
1222  "MueLu:TentativePF:BlockCrs:compress_cols", range_type(0, numFineBlockRows),
1223  KOKKOS_LAMBDA(const LO i) {
1224  size_t rowStart = i_temp[i];
1225  size_t lnnz = 0;
1226  for (auto j = ia[i]; j < ia[i + 1]; j++)
1227  if (ja[j] != INVALID) {
1228  j_temp[rowStart + lnnz] = ja[j];
1229  lnnz++;
1230  }
1231  });
1232 
1233  ia = i_temp;
1234  ja = j_temp;
1235  }
1236 
1237  RCP<CrsGraph> BlockGraph = CrsGraphFactory::Build(rowMap, coarseBlockMap, ia, ja);
1238 
1239  // Managing labels & constants for ESFC
1240  {
1241  RCP<ParameterList> FCparams;
1242  if (pL.isSublist("matrixmatrix: kernel params"))
1243  FCparams = rcp(new ParameterList(pL.sublist("matrixmatrix: kernel params")));
1244  else
1245  FCparams = rcp(new ParameterList);
1246  // By default, we don't need global constants for TentativeP
1247  FCparams->set("compute global constants", FCparams->get("compute global constants", false));
1248  std::string levelIDs = toString(levelID);
1249  FCparams->set("Timer Label", std::string("MueLu::TentativeP-") + levelIDs);
1250  RCP<const Export> dummy_e;
1251  RCP<const Import> dummy_i;
1252  BlockGraph->expertStaticFillComplete(coarseBlockMap, rowMap, dummy_i, dummy_e, FCparams);
1253  }
1254 
1255  // We can't leave the ia/ja pointers floating around, because of host/device view counting, so
1256  // we clear them here
1257  ia = rows_type();
1258  ja = cols_type();
1259 
1260  // Now let's make a BlockCrs Matrix
1261  // NOTE: Assumes block size== NSDim
1262  RCP<Xpetra::CrsMatrix<SC, LO, GO, NO>> P_xpetra = Xpetra::CrsMatrixFactory<SC, LO, GO, NO>::BuildBlock(BlockGraph, coarsePointMap, rangeMap, NSDim);
1264  if (P_tpetra.is_null()) throw std::runtime_error("BuildPUncoupled: Matrix factory did not return a Tpetra::BlockCrsMatrix");
1265  RCP<CrsMatrixWrap> P_wrap = rcp(new CrsMatrixWrap(P_xpetra));
1266 
1267  auto values = P_tpetra->getTpetra_BlockCrsMatrix()->getValuesDeviceNonConst();
1268  const LO stride = NSDim * NSDim;
1269 
1270  Kokkos::parallel_for(
1271  "MueLu:TentativePF:BlockCrs:main_loop_noqr", policy,
1272  KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<execution_space>::member_type& thread) {
1273  auto agg = thread.league_rank();
1274 
1275  // size of the aggregate (number of DOFs in aggregate)
1276  LO aggSize = aggRows(agg + 1) - aggRows(agg);
1277  Xpetra::global_size_t offset = agg * NSDim;
1278 
1279  // Q = localQR(:,0)/norm
1280  for (LO j = 0; j < aggSize; j++) {
1281  LO localBlockRow = aggToRowMapLO(aggRows(agg) + j);
1282  LO rowStart = localBlockRow * stride;
1283  for (LO r = 0; r < (LO)NSDim; r++) {
1284  LO localPointRow = localBlockRow * NSDim + r;
1285  for (LO c = 0; c < (LO)NSDim; c++) {
1286  values[rowStart + r * NSDim + c] = fineNSRandom(localPointRow, c);
1287  }
1288  }
1289  }
1290 
1291  // R = norm
1292  for (LO j = 0; j < (LO)NSDim; j++)
1293  coarseNS(offset + j, j) = one;
1294  });
1295 
1296  Ptentative = P_wrap;
1297 }
1298 
1299 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1301  BuildPcoupled(RCP<Matrix> /* A */, RCP<Aggregates> /* aggregates */,
1302  RCP<AmalgamationInfo> /* amalgInfo */, RCP<MultiVector> /* fineNullspace */,
1303  RCP<const Map> /* coarseMap */, RCP<Matrix>& /* Ptentative */,
1304  RCP<MultiVector>& /* coarseNullspace */) const {
1305  throw Exceptions::RuntimeError("MueLu: Construction of coupled tentative P is not implemented");
1306 }
1307 
1308 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1310  isGoodMap(const Map& rowMap, const Map& colMap) const {
1311  auto rowLocalMap = rowMap.getLocalMap();
1312  auto colLocalMap = colMap.getLocalMap();
1313 
1314  const size_t numRows = rowLocalMap.getLocalNumElements();
1315  const size_t numCols = colLocalMap.getLocalNumElements();
1316 
1317  if (numCols < numRows)
1318  return false;
1319 
1320  size_t numDiff = 0;
1321  Kokkos::parallel_reduce(
1322  "MueLu:TentativePF:isGoodMap", range_type(0, numRows),
1323  KOKKOS_LAMBDA(const LO i, size_t& diff) {
1324  diff += (rowLocalMap.getGlobalElement(i) != colLocalMap.getGlobalElement(i));
1325  },
1326  numDiff);
1327 
1328  return (numDiff == 0);
1329 }
1330 
1331 } // namespace MueLu
1332 
1333 #define MUELU_TENTATIVEPFACTORY_KOKKOS_SHORT
1334 #endif // MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
Important warning messages (one line)
void BuildPcoupled(RCP< Matrix > A, RCP< Aggregates > aggregates, RCP< AmalgamationInfo > amalgInfo, RCP< MultiVector > fineNullspace, RCP< const Map > coarseMap, RCP< Matrix > &Ptentative, RCP< MultiVector > &coarseNullspace) const
MueLu::DefaultLocalOrdinal LocalOrdinal
void BuildPuncoupledBlockCrs(Level &coarseLevel, RCP< Matrix > A, RCP< Aggregates > aggregates, RCP< AmalgamationInfo > amalgInfo, RCP< MultiVector > fineNullspace, RCP< const Map > coarseMap, RCP< Matrix > &Ptentative, RCP< MultiVector > &coarseNullspace, const int levelID) const
static bool MapsAreNested(const Xpetra::Map< LocalOrdinal, GlobalOrdinal, Node > &rowMap, const Xpetra::Map< LocalOrdinal, GlobalOrdinal, Node > &colMap)
std::string toString(const T &what)
Little helper function to convert non-string types to strings.
const RCP< LOVector > & GetProcWinner() const
Returns constant vector that maps local node IDs to owning processor IDs.
KOKKOS_INLINE_FUNCTION LO GetNumAggregates() const
void BuildP(Level &fineLevel, Level &coarseLevel) const
Abstract Build method.
GlobalOrdinal GO
T & get(const std::string &name, T def_value)
ParameterList & set(std::string const &name, T const &value, std::string const &docString="", RCP< const ParameterEntryValidator > const &validator=null)
Timer to be used in factories. Similar to Monitor but with additional timers.
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)
const RCP< const Map > GetMap() const
returns (overlapping) map of aggregate/node distribution
LocalOrdinal LO
static const NoFactory * get()
Print even more statistics.
#define SET_VALID_ENTRY(name)
bool isParameter(const std::string &name) const
TEUCHOS_DEPRECATED RCP< T > rcp(T *p, Dealloc_T dealloc, bool owns_mem)
void BuildPuncoupled(Level &coarseLevel, RCP< Matrix > A, RCP< Aggregates > aggregates, RCP< AmalgamationInfo > amalgInfo, RCP< MultiVector > fineNullspace, RCP< const Map > coarseMap, RCP< Matrix > &Ptentative, RCP< MultiVector > &coarseNullspace, const int levelID) const
MueLu::DefaultScalar Scalar
MueLu::DefaultGlobalOrdinal GlobalOrdinal
Kokkos::RangePolicy< local_ordinal_type, execution_space > range_type
Class that holds all level-specific information.
Definition: MueLu_Level.hpp:99
bool isSublist(const std::string &name) const
void GetStridingInformation(LO &fullBlockSize, LO &blockID, LO &stridingOffset, LO &stridedBlockSize, GO &indexBase)
returns striding information
Timer to be used in factories. Similar to SubMonitor but adds a timer level by level.
typename LWGraph_kokkos::local_graph_type local_graph_type
void Build(Level &fineLevel, Level &coarseLevel) const
Build an object with this factory.
void DeclareInput(Level &fineLevel, Level &coarseLevel) const
Input.
const RCP< LOMultiVector > & GetVertex2AggId() const
Returns constant vector that maps local node IDs to local aggregates IDs.
static std::string PrintMatrixInfo(const Matrix &A, const std::string &msgTag, RCP< const Teuchos::ParameterList > params=Teuchos::null)
size_t global_size_t
GO GlobalOffset()
returns offset of global dof ids
RCP< const ParameterList > GetValidParameterList() const
Return a const parameter list of valid parameters that setParameterList() will accept.
Scalar SC
ParameterList & sublist(const std::string &name, bool mustAlreadyExist=false, const std::string &docString="")
int GetLevelID() const
Return level number.
Definition: MueLu_Level.cpp:76
Exception throws to report errors in the internal logical of the program.
Description of what is happening (more verbose)
bool isGoodMap(const Map &rowMap, const Map &colMap) const
maxAggDofSizeType maxAggDofSize
local_graph_type GetGraph() const
agg2RowMapLOType agg2RowMapLO
aggregates_sizes_type::const_type ComputeAggregateSizes(bool forceRecompute=false) const
Compute sizes of aggregates.
bool IsAvailable(const std::string &ename, const FactoryBase *factory=NoFactory::get()) const
Test whether a need&#39;s value has been saved.
bool is_null() const