46 #ifndef MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
47 #define MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
49 #include "Kokkos_UnorderedMap.hpp"
54 #include "MueLu_Aggregates.hpp"
55 #include "MueLu_AmalgamationInfo.hpp"
58 #include "MueLu_PerfUtils.hpp"
67 template <
class LocalOrdinal,
class View>
68 class ReduceMaxFunctor {
70 ReduceMaxFunctor(View view)
73 KOKKOS_INLINE_FUNCTION
79 KOKKOS_INLINE_FUNCTION
86 KOKKOS_INLINE_FUNCTION
96 template <
class LOType,
class GOType,
class SCType,
class DeviceType,
class NspType,
class aggRowsType,
class maxAggDofSizeType,
class agg2RowMapLOType,
class statusType,
class rowsType,
class rowsAuxType,
class colsAuxType,
class valsAuxType>
97 class LocalQRDecompFunctor {
103 typedef typename DeviceType::execution_space execution_space;
104 typedef typename Kokkos::ArithTraits<SC>::val_type impl_SC;
105 typedef Kokkos::ArithTraits<impl_SC> impl_ATS;
106 typedef typename impl_ATS::magnitudeType Magnitude;
108 typedef Kokkos::View<impl_SC**, typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> shared_matrix;
109 typedef Kokkos::View<impl_SC*, typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> shared_vector;
125 LocalQRDecompFunctor(NspType fineNS_, NspType coarseNS_, aggRowsType aggRows_, maxAggDofSizeType maxAggDofSize_, agg2RowMapLOType agg2RowMapLO_, statusType statusAtomic_, rowsType rows_, rowsAuxType rowsAux_, colsAuxType colsAux_, valsAuxType valsAux_,
bool doQRStep_)
138 KOKKOS_INLINE_FUNCTION
139 void operator()(
const typename Kokkos::TeamPolicy<execution_space>::member_type& thread,
size_t& nnz)
const {
140 auto agg = thread.league_rank();
145 const impl_SC one = impl_ATS::one();
146 const impl_SC two = one + one;
147 const impl_SC zero = impl_ATS::zero();
148 const auto zeroM = impl_ATS::magnitude(zero);
158 shared_matrix r(thread.team_shmem(), m, n);
159 for (
int j = 0; j < n; j++)
160 for (
int k = 0; k < m; k++)
164 for (
int i = 0; i < m; i++) {
165 for (
int j = 0; j < n; j++)
166 printf(
" %5.3lf ", r(i,j));
172 shared_matrix q(thread.team_shmem(), m, m);
174 bool isSingular =
false;
178 for (
int i = 0; i < m; i++) {
179 for (
int j = 0; j < m; j++)
184 for (
int k = 0; k < n; k++) {
186 Magnitude s = zeroM, norm, norm_x;
187 for (
int i = k + 1; i < m; i++)
188 s += pow(impl_ATS::magnitude(r(i, k)), 2);
189 norm = sqrt(pow(impl_ATS::magnitude(r(k, k)), 2) + s);
196 r(k, k) -= norm * one;
198 norm_x = sqrt(pow(impl_ATS::magnitude(r(k, k)), 2) + s);
199 if (norm_x == zeroM) {
202 r(k, k) = norm * one;
207 for (
int i = k; i < m; i++)
211 for (
int j = k + 1; j < n; j++) {
214 for (
int i = k; i < m; i++)
215 si += r(i, k) * r(i, j);
216 for (
int i = k; i < m; i++)
217 r(i, j) -= two * si * r(i, k);
221 for (
int j = k; j < m; j++) {
224 for (
int i = k; i < m; i++)
225 si += r(i, k) * qt(i, j);
226 for (
int i = k; i < m; i++)
227 qt(i, j) -= two * si * r(i, k);
231 r(k, k) = norm * one;
232 for (
int i = k + 1; i < m; i++)
238 for (
int i = 0; i < m; i++)
239 for (
int j = 0; j < i; j++) {
240 impl_SC tmp = qt(i,j);
247 for (
int j = 0; j < n; j++)
248 for (
int k = 0; k <= j; k++)
287 for (
int j = 0; j < n; j++)
288 for (
int k = 0; k < n; k++)
292 coarseNS(offset + k, j) = (k == j ? one : zero);
295 for (
int i = 0; i < m; i++)
296 for (
int j = 0; j < n; j++)
297 q(i, j) = (j == i ? one : zero);
301 for (
int j = 0; j < m; j++) {
303 size_t rowStart =
rowsAux(localRow);
305 for (
int k = 0; k < n; k++) {
307 if (q(j, k) != zero) {
308 colsAux(rowStart + lnnz) = offset + k;
309 valsAux(rowStart + lnnz) = q(j, k);
313 rows(localRow + 1) = lnnz;
319 for (
int i = 0; i < m; i++) {
320 for (
int j = 0; j < n; j++)
326 for (
int i = 0; i < aggSize; i++) {
327 for (
int j = 0; j < aggSize; j++)
328 printf(
" %5.3lf ", q(i,j));
342 for (
int j = 0; j < m; j++) {
344 size_t rowStart =
rowsAux(localRow);
346 for (
int k = 0; k < n; k++) {
347 const impl_SC qr_jk =
fineNS(localRow, k);
350 colsAux(rowStart + lnnz) = offset + k;
351 valsAux(rowStart + lnnz) = qr_jk;
355 rows(localRow + 1) = lnnz;
359 for (
int j = 0; j < n; j++)
365 size_t team_shmem_size(
int )
const {
369 return shared_matrix::shmem_size(m, n) +
370 shared_matrix::shmem_size(m, m);
378 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
382 #define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name))
385 #undef SET_VALID_ENTRY
390 validParamList->
set<
RCP<const FactoryBase>>(
"Scaled Nullspace", Teuchos::null,
"Generating factory of the scaled nullspace");
391 validParamList->
set<
RCP<const FactoryBase>>(
"UnAmalgamationInfo", Teuchos::null,
"Generating factory of UnAmalgamationInfo");
394 validParamList->
set<
RCP<const FactoryBase>>(
"Node Comm", Teuchos::null,
"Generating factory of the node level communicator");
398 norecurse.disableRecursiveValidation();
399 validParamList->
set<
ParameterList>(
"matrixmatrix: kernel params", norecurse,
"MatrixMatrix kernel parameters");
401 return validParamList;
404 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
408 std::string nspName =
"Nullspace";
409 if (pL.
isParameter(
"Nullspace name")) nspName = pL.
get<std::string>(
"Nullspace name");
411 Input(fineLevel,
"A");
412 Input(fineLevel,
"Aggregates");
413 Input(fineLevel, nspName);
414 Input(fineLevel,
"UnAmalgamationInfo");
415 Input(fineLevel,
"CoarseMap");
418 pL.
get<
bool>(
"tentative: build coarse coordinates")) {
419 bTransferCoordinates_ =
true;
420 Input(fineLevel,
"Coordinates");
421 }
else if (bTransferCoordinates_) {
422 Input(fineLevel,
"Coordinates");
426 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
428 return BuildP(fineLevel, coarseLevel);
431 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
438 std::string nspName =
"Nullspace";
439 if (pL.
isParameter(
"Nullspace name")) nspName = pL.
get<std::string>(
"Nullspace name");
441 auto A = Get<RCP<Matrix>>(fineLevel,
"A");
442 auto aggregates = Get<RCP<Aggregates>>(fineLevel,
"Aggregates");
443 auto amalgInfo = Get<RCP<AmalgamationInfo>>(fineLevel,
"UnAmalgamationInfo");
444 auto fineNullspace = Get<RCP<MultiVector>>(fineLevel, nspName);
445 auto coarseMap = Get<RCP<const Map>>(fineLevel,
"CoarseMap");
447 if (bTransferCoordinates_) {
448 fineCoords = Get<RCP<RealValuedMultiVector>>(fineLevel,
"Coordinates");
454 if (aggregates->GetNumGlobalAggregatesComputeIfNeeded() == 0) {
455 Ptentative = Teuchos::null;
456 Set(coarseLevel,
"P", Ptentative);
462 if (bTransferCoordinates_) {
464 using array_type =
typename Map::global_indices_array_device_type;
467 if (rcp_dynamic_cast<const StridedMap>(coarseMap) != Teuchos::null)
468 blkSize = rcp_dynamic_cast<
const StridedMap>(coarseMap)->getFixedBlockSize();
473 coarseCoordMap = coarseMap;
479 using range_policy = Kokkos::RangePolicy<typename Node::execution_space>;
480 array_type elementAList = coarseMap->getMyGlobalIndicesDevice();
481 GO indexBase = coarseMap->getIndexBase();
482 auto numElements = elementAList.size() / blkSize;
483 typename array_type::non_const_type elementList_nc(
"elementList", numElements);
486 Kokkos::parallel_for(
487 "Amalgamate Element List", range_policy(0, numElements), KOKKOS_LAMBDA(
LO i) {
488 elementList_nc[i] = (elementAList[i * blkSize] - indexBase) / blkSize + indexBase;
490 array_type elementList = elementList_nc;
492 elementList, indexBase, coarseMap->getComm());
495 coarseCoords = RealValuedMultiVectorFactory::Build(coarseCoordMap, fineCoords->getNumVectors());
498 auto uniqueMap = fineCoords->getMap();
500 if (aggregates->AggregatesCrossProcessors()) {
501 auto nonUniqueMap = aggregates->GetMap();
502 auto importer = ImportFactory::Build(uniqueMap, nonUniqueMap);
504 ghostedCoords = RealValuedMultiVectorFactory::Build(nonUniqueMap, fineCoords->getNumVectors());
510 auto aggGraph = aggregates->GetGraph();
511 auto numAggs = aggGraph.numRows();
513 auto fineCoordsView = fineCoords->getDeviceLocalView(Xpetra::Access::ReadOnly);
514 auto coarseCoordsView = coarseCoords->getDeviceLocalView(Xpetra::Access::OverwriteAll);
520 const auto dim = fineCoords->getNumVectors();
523 for (
size_t j = 0; j < dim; j++) {
524 Kokkos::parallel_for(
525 "MueLu::TentativeP::BuildCoords", Kokkos::RangePolicy<local_ordinal_type, execution_space>(0, numAggs),
526 KOKKOS_LAMBDA(
const LO i) {
530 auto aggregate = aggGraph.rowConst(i);
532 coordinate_type sum = 0.0;
533 for (
size_t colID = 0; colID < static_cast<size_t>(aggregate.length); colID++)
534 sum += fineCoordsRandomView(aggregate(colID), j);
536 coarseCoordsView(i, j) = sum / aggregate.length;
542 if (!aggregates->AggregatesCrossProcessors()) {
544 BuildPuncoupledBlockCrs(coarseLevel, A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace,
547 BuildPuncoupled(coarseLevel, A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace, coarseLevel.
GetLevelID());
550 BuildPcoupled(A, aggregates, amalgInfo, fineNullspace, coarseMap, Ptentative, coarseNullspace);
560 if (A->IsView(
"stridedMaps") ==
true)
561 Ptentative->CreateView(
"stridedMaps", A->getRowMap(
"stridedMaps"), coarseMap);
563 Ptentative->CreateView(
"stridedMaps", Ptentative->getRangeMap(), coarseMap);
565 if (bTransferCoordinates_) {
566 Set(coarseLevel,
"Coordinates", coarseCoords);
570 if (fineLevel.IsAvailable(
"Node Comm")) {
572 Set<RCP<const Teuchos::Comm<int>>>(coarseLevel,
"Node Comm", nodeComm);
575 Set(coarseLevel,
"Nullspace", coarseNullspace);
576 Set(coarseLevel,
"P", Ptentative);
580 params->
set(
"printLoadBalancingInfo",
true);
585 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
591 auto rowMap = A->getRowMap();
592 auto colMap = A->getColMap();
594 const size_t numRows = rowMap->getLocalNumElements();
595 const size_t NSDim = fineNullspace->getNumVectors();
597 typedef Kokkos::ArithTraits<SC> ATS;
598 using impl_SC =
typename ATS::val_type;
599 using impl_ATS = Kokkos::ArithTraits<impl_SC>;
600 const impl_SC zero = impl_ATS::zero(), one = impl_ATS::one();
609 auto aggRows = aggGraph.row_map;
610 auto aggCols = aggGraph.entries;
618 goodMap = isGoodMap(*rowMap, *colMap);
622 "MueLu: TentativePFactory_kokkos: for now works only with good maps "
623 "(i.e. \"matching\" row and column maps)");
632 LO fullBlockSize, blockID, stridingOffset, stridedBlockSize;
638 auto procWinner = aggregates->
GetProcWinner()->getDeviceLocalView(Xpetra::Access::ReadOnly);
639 auto vertex2AggId = aggregates->
GetVertex2AggId()->getDeviceLocalView(Xpetra::Access::ReadOnly);
642 int myPID = aggregates->
GetMap()->getComm()->getRank();
647 typedef typename Aggregates::aggregates_sizes_type::non_const_type AggSizeType;
648 AggSizeType aggDofSizes;
650 if (stridedBlockSize == 1) {
654 aggDofSizes = AggSizeType(
"agg_dof_sizes", numAggregates + 1);
657 Kokkos::deep_copy(Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(1), numAggregates + 1)), sizesConst);
663 aggDofSizes = AggSizeType(
"agg_dof_sizes", numAggregates + 1);
665 auto nodeMap = aggregates->
GetMap()->getLocalMap();
666 auto dofMap = colMap->getLocalMap();
668 Kokkos::parallel_for(
669 "MueLu:TentativePF:Build:compute_agg_sizes",
range_type(0, numAggregates),
670 KOKKOS_LAMBDA(
const LO agg) {
671 auto aggRowView = aggGraph.rowConst(agg);
674 for (
LO colID = 0; colID < aggRowView.length; colID++) {
675 GO nodeGID = nodeMap.getGlobalElement(aggRowView(colID));
677 for (
LO k = 0; k < stridedBlockSize; k++) {
678 GO dofGID = (nodeGID - indexBase) * fullBlockSize + k + indexBase + globalOffset + stridingOffset;
680 if (dofMap.getLocalElement(dofGID) != INVALID)
684 aggDofSizes(agg + 1) = size;
691 ReduceMaxFunctor<LO, decltype(aggDofSizes)> reduceMax(aggDofSizes);
692 Kokkos::parallel_reduce(
"MueLu:TentativePF:Build:max_agg_size",
range_type(0, aggDofSizes.extent(0)), reduceMax, maxAggSize);
696 Kokkos::parallel_scan(
697 "MueLu:TentativePF:Build:aggregate_sizes:stage1_scan",
range_type(0, numAggregates + 1),
698 KOKKOS_LAMBDA(
const LO i,
LO& update,
const bool& final_pass) {
699 update += aggDofSizes(i);
701 aggDofSizes(i) = update;
706 Kokkos::View<LO*, DeviceType>
agg2RowMapLO(Kokkos::ViewAllocateWithoutInitializing(
"agg2row_map_LO"), numRows);
710 AggSizeType aggOffsets(Kokkos::ViewAllocateWithoutInitializing(
"aggOffsets"), numAggregates);
711 Kokkos::deep_copy(aggOffsets, Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(0), numAggregates)));
713 Kokkos::parallel_for(
714 "MueLu:TentativePF:Build:createAgg2RowMap",
range_type(0, vertex2AggId.extent(0)),
715 KOKKOS_LAMBDA(
const LO lnode) {
716 if (procWinner(lnode, 0) == myPID) {
718 auto aggID = vertex2AggId(lnode, 0);
720 auto offset = Kokkos::atomic_fetch_add(&aggOffsets(aggID), stridedBlockSize);
724 for (
LO k = 0; k < stridedBlockSize; k++)
725 agg2RowMapLO(offset + k) = lnode * stridedBlockSize + k;
732 coarseNullspace = MultiVectorFactory::Build(coarseMap, NSDim);
735 auto fineNS = fineNullspace->getDeviceLocalView(Xpetra::Access::ReadWrite);
736 auto coarseNS = coarseNullspace->getDeviceLocalView(Xpetra::Access::OverwriteAll);
741 typedef typename local_matrix_type::row_map_type::non_const_type rows_type;
742 typedef typename local_matrix_type::index_type::non_const_type cols_type;
743 typedef typename local_matrix_type::values_type::non_const_type vals_type;
746 typedef Kokkos::View<int[10], DeviceType> status_type;
747 status_type status(
"status");
753 const bool&
doQRStep = pL.
get<
bool>(
"tentative: calculate qr");
755 GetOStream(
Runtime1) <<
"TentativePFactory : bypassing local QR phase" << std::endl;
757 GetOStream(
Warnings0) <<
"TentativePFactor : for nontrivial nullspace, this may degrade performance" << std::endl;
760 size_t nnzEstimate = numRows * NSDim;
761 rows_type
rowsAux(Kokkos::ViewAllocateWithoutInitializing(
"Ptent_aux_rows"), numRows + 1);
762 cols_type
colsAux(Kokkos::ViewAllocateWithoutInitializing(
"Ptent_aux_cols"), nnzEstimate);
763 vals_type
valsAux(
"Ptent_aux_vals", nnzEstimate);
764 rows_type
rows(
"Ptent_rows", numRows + 1);
771 Kokkos::parallel_for(
772 "MueLu:TentativePF:BuildPuncoupled:for1",
range_type(0, numRows + 1),
773 KOKKOS_LAMBDA(
const LO row) {
776 Kokkos::parallel_for(
777 "MueLu:TentativePF:BuildUncoupled:for2",
range_type(0, nnzEstimate),
778 KOKKOS_LAMBDA(
const LO j) {
795 const Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1);
798 Kokkos::parallel_for(
799 "MueLu:TentativePF:BuildUncoupled:main_loop", policy,
800 KOKKOS_LAMBDA(
const typename Kokkos::TeamPolicy<execution_space>::member_type& thread) {
801 auto agg = thread.league_rank();
809 auto norm = impl_ATS::magnitude(zero);
814 for (decltype(aggSize) k = 0; k < aggSize; k++) {
816 norm += dnorm * dnorm;
830 for (decltype(aggSize) k = 0; k < aggSize; k++) {
834 rows(localRow + 1) = 1;
840 typename status_type::HostMirror statusHost = Kokkos::create_mirror_view(status);
841 Kokkos::deep_copy(statusHost, status);
842 for (decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
844 std::ostringstream oss;
845 oss <<
"MueLu::TentativePFactory::MakeTentative: ";
847 case 0: oss <<
"!goodMap is not implemented";
break;
848 case 1: oss <<
"fine level NS part has a zero column";
break;
854 Kokkos::parallel_for(
855 "MueLu:TentativePF:BuildUncoupled:main_loop_noqr", policy,
856 KOKKOS_LAMBDA(
const typename Kokkos::TeamPolicy<execution_space>::member_type& thread) {
857 auto agg = thread.league_rank();
866 for (decltype(aggSize) k = 0; k < aggSize; k++) {
870 rows(localRow + 1) = 1;
877 Kokkos::parallel_reduce(
878 "MueLu:TentativeP:CountNNZ",
range_type(0, numRows + 1),
879 KOKKOS_LAMBDA(
const LO i,
size_t& nnz_count) {
880 nnz_count +=
rows(i);
898 const Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1);
900 decltype(aggDofSizes ), decltype(maxAggSize), decltype(agg2RowMapLO),
901 decltype(statusAtomic), decltype(rows), decltype(rowsAux), decltype(colsAux),
903 localQRFunctor(fineNSRandom,
coarseNS, aggDofSizes, maxAggSize, agg2RowMapLO, statusAtomic,
904 rows, rowsAux, colsAux, valsAux, doQRStep);
905 Kokkos::parallel_reduce(
"MueLu:TentativePF:BuildUncoupled:main_qr_loop", policy, localQRFunctor, nnz);
908 typename status_type::HostMirror statusHost = Kokkos::create_mirror_view(status);
909 Kokkos::deep_copy(statusHost, status);
910 for (decltype(statusHost.size()) i = 0; i < statusHost.size(); i++)
912 std::ostringstream oss;
913 oss <<
"MueLu::TentativePFactory::MakeTentative: ";
915 case 0: oss <<
"!goodMap is not implemented";
break;
916 case 1: oss <<
"fine level NS part has a zero column";
break;
929 if (nnz != nnzEstimate) {
934 Kokkos::parallel_scan(
935 "MueLu:TentativePF:Build:compress_rows",
range_type(0, numRows + 1),
936 KOKKOS_LAMBDA(
const LO i,
LO& upd,
const bool&
final) {
946 cols = cols_type(
"Ptent_cols", nnz);
947 vals = vals_type(
"Ptent_vals", nnz);
952 Kokkos::parallel_for(
953 "MueLu:TentativePF:Build:compress_cols_vals",
range_type(0, numRows),
954 KOKKOS_LAMBDA(
const LO i) {
960 cols(rowStart + lnnz) =
colsAux(j);
961 vals(rowStart + lnnz) =
valsAux(j);
973 GetOStream(
Runtime1) <<
"TentativePFactory : aggregates do not cross process boundaries" << std::endl;
979 local_matrix_type lclMatrix = local_matrix_type(
"A", numRows, coarseMap->getLocalNumElements(), nnz, vals,
rows, cols);
983 if (pL.
isSublist(
"matrixmatrix: kernel params"))
989 FCparams->set(
"compute global constants", FCparams->get(
"compute global constants",
false));
990 FCparams->set(
"Timer Label", std::string(
"MueLu::TentativeP-") +
toString(levelID));
992 auto PtentCrs = CrsMatrixFactory::Build(lclMatrix, rowMap, coarseMap, coarseMap, A->getDomainMap());
993 Ptentative =
rcp(
new CrsMatrixWrap(PtentCrs));
997 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
1016 const size_t numFineBlockRows = rowMap->getLocalNumElements();
1022 typedef Kokkos::ArithTraits<SC> ATS;
1023 using impl_SC =
typename ATS::val_type;
1024 using impl_ATS = Kokkos::ArithTraits<impl_SC>;
1025 const impl_SC one = impl_ATS::one();
1028 const size_t NSDim = fineNullspace->getNumVectors();
1036 auto aggRows = aggGraph.row_map;
1037 auto aggCols = aggGraph.entries;
1043 const size_t numCoarseBlockRows = coarsePointMap->getLocalNumElements() / NSDim;
1044 RCP<const Map> coarseBlockMap = MapFactory::Build(coarsePointMap->lib(),
1047 coarsePointMap->getIndexBase(),
1048 coarsePointMap->getComm());
1060 "MueLu: TentativePFactory_kokkos: for now works only with good maps "
1061 "(i.e. \"matching\" row and column maps)");
1070 LO fullBlockSize, blockID, stridingOffset, stridedBlockSize;
1076 auto procWinner = aggregates->
GetProcWinner()->getDeviceLocalView(Xpetra::Access::ReadOnly);
1077 auto vertex2AggId = aggregates->
GetVertex2AggId()->getDeviceLocalView(Xpetra::Access::ReadOnly);
1080 int myPID = aggregates->
GetMap()->getComm()->getRank();
1085 typedef typename Aggregates::aggregates_sizes_type::non_const_type AggSizeType;
1086 AggSizeType aggDofSizes;
1092 aggDofSizes = AggSizeType(
"agg_dof_sizes", numAggregates + 1);
1094 Kokkos::deep_copy(Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(1), numAggregates + 1)), aggSizes);
1100 ReduceMaxFunctor<LO, decltype(aggDofSizes)> reduceMax(aggDofSizes);
1101 Kokkos::parallel_reduce(
"MueLu:TentativePF:Build:max_agg_size",
range_type(0, aggDofSizes.extent(0)), reduceMax, maxAggSize);
1105 Kokkos::parallel_scan(
1106 "MueLu:TentativePF:Build:aggregate_sizes:stage1_scan",
range_type(0, numAggregates + 1),
1107 KOKKOS_LAMBDA(
const LO i,
LO& update,
const bool& final_pass) {
1108 update += aggDofSizes(i);
1110 aggDofSizes(i) = update;
1115 Kokkos::View<LO*, DeviceType> aggToRowMapLO(Kokkos::ViewAllocateWithoutInitializing(
"aggtorow_map_LO"), numFineBlockRows);
1119 AggSizeType aggOffsets(Kokkos::ViewAllocateWithoutInitializing(
"aggOffsets"), numAggregates);
1120 Kokkos::deep_copy(aggOffsets, Kokkos::subview(aggDofSizes, Kokkos::make_pair(static_cast<size_t>(0), numAggregates)));
1122 Kokkos::parallel_for(
1123 "MueLu:TentativePF:Build:createAgg2RowMap",
range_type(0, vertex2AggId.extent(0)),
1124 KOKKOS_LAMBDA(
const LO lnode) {
1125 if (procWinner(lnode, 0) == myPID) {
1127 auto aggID = vertex2AggId(lnode, 0);
1129 auto offset = Kokkos::atomic_fetch_add(&aggOffsets(aggID), stridedBlockSize);
1133 for (
LO k = 0; k < stridedBlockSize; k++)
1134 aggToRowMapLO(offset + k) = lnode * stridedBlockSize + k;
1141 coarseNullspace = MultiVectorFactory::Build(coarsePointMap, NSDim);
1144 auto fineNS = fineNullspace->getDeviceLocalView(Xpetra::Access::ReadWrite);
1145 auto coarseNS = coarseNullspace->getDeviceLocalView(Xpetra::Access::OverwriteAll);
1148 typedef typename local_matrix_type::row_map_type::non_const_type rows_type;
1149 typedef typename local_matrix_type::index_type::non_const_type cols_type;
1153 typedef Kokkos::View<int[10], DeviceType> status_type;
1154 status_type status(
"status");
1160 GetOStream(
Runtime1) <<
"TentativePFactory : bypassing local QR phase" << std::endl;
1166 rows_type ia(Kokkos::ViewAllocateWithoutInitializing(
"BlockGraph_rowptr"), numFineBlockRows + 1);
1167 cols_type ja(Kokkos::ViewAllocateWithoutInitializing(
"BlockGraph_colind"), numFineBlockRows);
1169 Kokkos::parallel_for(
1170 "MueLu:TentativePF:BlockCrs:graph_init",
range_type(0, numFineBlockRows),
1171 KOKKOS_LAMBDA(
const LO j) {
1175 if (j == (
LO)numFineBlockRows - 1)
1176 ia[numFineBlockRows] = numFineBlockRows;
1180 const Kokkos::TeamPolicy<execution_space> policy(numAggregates, 1);
1181 Kokkos::parallel_for(
1182 "MueLu:TentativePF:BlockCrs:fillGraph", policy,
1183 KOKKOS_LAMBDA(
const typename Kokkos::TeamPolicy<execution_space>::member_type& thread) {
1184 auto agg = thread.league_rank();
1190 for (
LO j = 0; j < aggSize; j++) {
1192 const LO localRow = aggToRowMapLO[aggDofSizes[agg] + j];
1193 const size_t rowStart = ia[localRow];
1194 ja[rowStart] = offset;
1204 rows_type i_temp(Kokkos::ViewAllocateWithoutInitializing(
"BlockGraph_rowptr"), numFineBlockRows + 1);
1206 Kokkos::parallel_scan(
1207 "MueLu:TentativePF:BlockCrs:compress_rows",
range_type(0, numFineBlockRows),
1208 KOKKOS_LAMBDA(
const LO i,
LO& upd,
const bool&
final) {
1211 for (
auto j = ia[i]; j < ia[i + 1]; j++)
1212 if (ja[j] != INVALID)
1214 if (
final && i == (
LO)numFineBlockRows - 1)
1215 i_temp[numFineBlockRows] = upd;
1219 cols_type j_temp(Kokkos::ViewAllocateWithoutInitializing(
"BlockGraph_colind"), nnz);
1221 Kokkos::parallel_for(
1222 "MueLu:TentativePF:BlockCrs:compress_cols",
range_type(0, numFineBlockRows),
1223 KOKKOS_LAMBDA(
const LO i) {
1224 size_t rowStart = i_temp[i];
1226 for (
auto j = ia[i]; j < ia[i + 1]; j++)
1227 if (ja[j] != INVALID) {
1228 j_temp[rowStart + lnnz] = ja[j];
1237 RCP<CrsGraph> BlockGraph = CrsGraphFactory::Build(rowMap, coarseBlockMap, ia, ja);
1242 if (pL.isSublist(
"matrixmatrix: kernel params"))
1243 FCparams =
rcp(
new ParameterList(pL.sublist(
"matrixmatrix: kernel params")));
1247 FCparams->
set(
"compute global constants", FCparams->
get(
"compute global constants",
false));
1248 std::string levelIDs =
toString(levelID);
1249 FCparams->
set(
"Timer Label", std::string(
"MueLu::TentativeP-") + levelIDs);
1252 BlockGraph->expertStaticFillComplete(coarseBlockMap, rowMap, dummy_i, dummy_e, FCparams);
1264 if (P_tpetra.
is_null())
throw std::runtime_error(
"BuildPUncoupled: Matrix factory did not return a Tpetra::BlockCrsMatrix");
1267 auto values = P_tpetra->getTpetra_BlockCrsMatrix()->getValuesDeviceNonConst();
1268 const LO stride = NSDim * NSDim;
1270 Kokkos::parallel_for(
1271 "MueLu:TentativePF:BlockCrs:main_loop_noqr", policy,
1272 KOKKOS_LAMBDA(
const typename Kokkos::TeamPolicy<execution_space>::member_type& thread) {
1273 auto agg = thread.league_rank();
1280 for (
LO j = 0; j < aggSize; j++) {
1281 LO localBlockRow = aggToRowMapLO(
aggRows(agg) + j);
1282 LO rowStart = localBlockRow * stride;
1283 for (
LO r = 0; r < (
LO)NSDim; r++) {
1284 LO localPointRow = localBlockRow * NSDim + r;
1285 for (
LO c = 0; c < (
LO)NSDim; c++) {
1286 values[rowStart + r * NSDim + c] = fineNSRandom(localPointRow, c);
1292 for (
LO j = 0; j < (
LO)NSDim; j++)
1296 Ptentative = P_wrap;
1299 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
1308 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
1311 auto rowLocalMap = rowMap.getLocalMap();
1312 auto colLocalMap = colMap.getLocalMap();
1314 const size_t numRows = rowLocalMap.getLocalNumElements();
1315 const size_t numCols = colLocalMap.getLocalNumElements();
1317 if (numCols < numRows)
1321 Kokkos::parallel_reduce(
1322 "MueLu:TentativePF:isGoodMap",
range_type(0, numRows),
1323 KOKKOS_LAMBDA(
const LO i,
size_t& diff) {
1324 diff += (rowLocalMap.getGlobalElement(i) != colLocalMap.getGlobalElement(i));
1328 return (numDiff == 0);
1333 #define MUELU_TENTATIVEPFACTORY_KOKKOS_SHORT
1334 #endif // MUELU_TENTATIVEPFACTORY_KOKKOS_DEF_HPP
Important warning messages (one line)
void BuildPcoupled(RCP< Matrix > A, RCP< Aggregates > aggregates, RCP< AmalgamationInfo > amalgInfo, RCP< MultiVector > fineNullspace, RCP< const Map > coarseMap, RCP< Matrix > &Ptentative, RCP< MultiVector > &coarseNullspace) const
MueLu::DefaultLocalOrdinal LocalOrdinal
void BuildPuncoupledBlockCrs(Level &coarseLevel, RCP< Matrix > A, RCP< Aggregates > aggregates, RCP< AmalgamationInfo > amalgInfo, RCP< MultiVector > fineNullspace, RCP< const Map > coarseMap, RCP< Matrix > &Ptentative, RCP< MultiVector > &coarseNullspace, const int levelID) const
static bool MapsAreNested(const Xpetra::Map< LocalOrdinal, GlobalOrdinal, Node > &rowMap, const Xpetra::Map< LocalOrdinal, GlobalOrdinal, Node > &colMap)
std::string toString(const T &what)
Little helper function to convert non-string types to strings.
const RCP< LOVector > & GetProcWinner() const
Returns constant vector that maps local node IDs to owning processor IDs.
KOKKOS_INLINE_FUNCTION LO GetNumAggregates() const
void BuildP(Level &fineLevel, Level &coarseLevel) const
Abstract Build method.
T & get(const std::string &name, T def_value)
ParameterList & set(std::string const &name, T const &value, std::string const &docString="", RCP< const ParameterEntryValidator > const &validator=null)
Timer to be used in factories. Similar to Monitor but with additional timers.
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)
const RCP< const Map > GetMap() const
returns (overlapping) map of aggregate/node distribution
static const NoFactory * get()
Print even more statistics.
#define SET_VALID_ENTRY(name)
bool isParameter(const std::string &name) const
TEUCHOS_DEPRECATED RCP< T > rcp(T *p, Dealloc_T dealloc, bool owns_mem)
void BuildPuncoupled(Level &coarseLevel, RCP< Matrix > A, RCP< Aggregates > aggregates, RCP< AmalgamationInfo > amalgInfo, RCP< MultiVector > fineNullspace, RCP< const Map > coarseMap, RCP< Matrix > &Ptentative, RCP< MultiVector > &coarseNullspace, const int levelID) const
MueLu::DefaultScalar Scalar
MueLu::DefaultGlobalOrdinal GlobalOrdinal
Kokkos::RangePolicy< local_ordinal_type, execution_space > range_type
Class that holds all level-specific information.
bool isSublist(const std::string &name) const
void GetStridingInformation(LO &fullBlockSize, LO &blockID, LO &stridingOffset, LO &stridedBlockSize, GO &indexBase)
returns striding information
Timer to be used in factories. Similar to SubMonitor but adds a timer level by level.
typename LWGraph_kokkos::local_graph_type local_graph_type
void Build(Level &fineLevel, Level &coarseLevel) const
Build an object with this factory.
void DeclareInput(Level &fineLevel, Level &coarseLevel) const
Input.
const RCP< LOMultiVector > & GetVertex2AggId() const
Returns constant vector that maps local node IDs to local aggregates IDs.
static std::string PrintMatrixInfo(const Matrix &A, const std::string &msgTag, RCP< const Teuchos::ParameterList > params=Teuchos::null)
GO GlobalOffset()
returns offset of global dof ids
RCP< const ParameterList > GetValidParameterList() const
Return a const parameter list of valid parameters that setParameterList() will accept.
ParameterList & sublist(const std::string &name, bool mustAlreadyExist=false, const std::string &docString="")
int GetLevelID() const
Return level number.
Exception throws to report errors in the internal logical of the program.
Description of what is happening (more verbose)
bool isGoodMap(const Map &rowMap, const Map &colMap) const
maxAggDofSizeType maxAggDofSize
local_graph_type GetGraph() const
Node::device_type DeviceType
agg2RowMapLOType agg2RowMapLO
aggregates_sizes_type::const_type ComputeAggregateSizes(bool forceRecompute=false) const
Compute sizes of aggregates.
bool IsAvailable(const std::string &ename, const FactoryBase *factory=NoFactory::get()) const
Test whether a need's value has been saved.