42 #ifndef TPETRA_MATRIXMATRIX_CUDA_DEF_HPP
43 #define TPETRA_MATRIXMATRIX_CUDA_DEF_HPP
45 #ifdef HAVE_TPETRA_INST_CUDA
51 template<
class Scalar,
54 class LocalOrdinalViewType>
55 struct KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode,LocalOrdinalViewType> {
56 static inline void mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
57 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
58 const LocalOrdinalViewType & Acol2Brow,
59 const LocalOrdinalViewType & Acol2Irow,
60 const LocalOrdinalViewType & Bcol2Ccol,
61 const LocalOrdinalViewType & Icol2Ccol,
62 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
63 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
64 const std::string& label = std::string(),
65 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
69 static inline void mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
70 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
71 const LocalOrdinalViewType & Acol2Brow,
72 const LocalOrdinalViewType & Acol2Irow,
73 const LocalOrdinalViewType & Bcol2Ccol,
74 const LocalOrdinalViewType & Icol2Ccol,
75 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
76 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
77 const std::string& label = std::string(),
78 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
83 template<
class Scalar,
85 class GlobalOrdinal,
class LocalOrdinalViewType>
86 struct KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode,LocalOrdinalViewType> {
87 static inline void jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
88 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> & Dinv,
89 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
90 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
91 const LocalOrdinalViewType & Acol2Brow,
92 const LocalOrdinalViewType & Acol2Irow,
93 const LocalOrdinalViewType & Bcol2Ccol,
94 const LocalOrdinalViewType & Icol2Ccol,
95 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
96 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
97 const std::string& label = std::string(),
98 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
100 static inline void jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
101 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> & Dinv,
102 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
103 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
104 const LocalOrdinalViewType & Acol2Brow,
105 const LocalOrdinalViewType & Acol2Irow,
106 const LocalOrdinalViewType & Bcol2Ccol,
107 const LocalOrdinalViewType & Icol2Ccol,
108 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
109 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
110 const std::string& label = std::string(),
111 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
113 static inline void jacobi_A_B_newmatrix_KokkosKernels(Scalar omega,
114 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> & Dinv,
115 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
116 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
117 const LocalOrdinalViewType & Acol2Brow,
118 const LocalOrdinalViewType & Acol2Irow,
119 const LocalOrdinalViewType & Bcol2Ccol,
120 const LocalOrdinalViewType & Icol2Ccol,
121 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
122 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
123 const std::string& label = std::string(),
124 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
130 template<
class Scalar,
133 class LocalOrdinalViewType>
134 void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode,LocalOrdinalViewType>::mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
135 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
136 const LocalOrdinalViewType & Acol2Brow,
137 const LocalOrdinalViewType & Acol2Irow,
138 const LocalOrdinalViewType & Bcol2Ccol,
139 const LocalOrdinalViewType & Icol2Ccol,
140 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
141 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
142 const std::string& label,
143 const Teuchos::RCP<Teuchos::ParameterList>& params) {
146 #ifdef HAVE_TPETRA_MMM_TIMINGS
147 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
148 using Teuchos::TimeMonitor;
149 Teuchos::RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix CudaWrapper")))));
152 typedef Tpetra::KokkosCompat::KokkosCudaWrapperNode Node;
153 std::string nodename(
"Cuda");
158 typedef typename KCRS::device_type device_t;
159 typedef typename KCRS::StaticCrsGraphType graph_t;
160 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
161 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
162 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
163 typedef typename KCRS::values_type::non_const_type scalar_view_t;
167 int team_work_size = 16;
168 std::string myalg(
"SPGEMM_KK_MEMORY");
169 if(!params.is_null()) {
170 if(params->isParameter(
"cuda: algorithm"))
171 myalg = params->get(
"cuda: algorithm",myalg);
172 if(params->isParameter(
"cuda: team work size"))
173 team_work_size = params->get(
"cuda: team work size",team_work_size);
177 typedef KokkosKernels::Experimental::KokkosKernelsHandle<
178 typename lno_view_t::const_value_type,
typename lno_nnz_view_t::const_value_type,
typename scalar_view_t::const_value_type,
179 typename device_t::execution_space,
typename device_t::memory_space,
typename device_t::memory_space > KernelHandle;
182 const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
183 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixDevice();
185 c_lno_view_t Arowptr = Amat.graph.row_map,
186 Browptr = Bmat.graph.row_map;
187 const lno_nnz_view_t Acolind = Amat.graph.entries,
188 Bcolind = Bmat.graph.entries;
189 const scalar_view_t Avals = Amat.values,
192 c_lno_view_t Irowptr;
193 lno_nnz_view_t Icolind;
195 if(!Bview.importMatrix.is_null()) {
196 auto lclB = Bview.importMatrix->getLocalMatrixDevice();
197 Irowptr = lclB.graph.row_map;
198 Icolind = lclB.graph.entries;
204 std::string alg = nodename+std::string(
" algorithm");
206 if(!params.is_null() && params->isParameter(alg)) myalg = params->get(alg,myalg);
207 KokkosSparse::SPGEMMAlgorithm alg_enum = KokkosSparse::StringToSPGEMMAlgorithm(myalg);
210 const KCRS Bmerged = Tpetra::MMdetails::merge_matrices(Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C.getColMap()->getLocalNumElements());
212 #ifdef HAVE_TPETRA_MMM_TIMINGS
213 MM = Teuchos::null; MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix CudaCore"))));
217 typename KernelHandle::nnz_lno_t AnumRows = Amat.numRows();
218 typename KernelHandle::nnz_lno_t BnumRows = Bmerged.numRows();
219 typename KernelHandle::nnz_lno_t BnumCols = Bmerged.numCols();
221 lno_view_t row_mapC (Kokkos::ViewAllocateWithoutInitializing(
"non_const_lnow_row"), AnumRows + 1);
222 lno_nnz_view_t entriesC;
223 scalar_view_t valuesC;
225 kh.create_spgemm_handle(alg_enum);
226 kh.set_team_work_size(team_work_size);
228 KokkosSparse::Experimental::spgemm_symbolic(&kh,AnumRows,BnumRows,BnumCols,Amat.graph.row_map,Amat.graph.entries,
false,Bmerged.graph.row_map,Bmerged.graph.entries,
false,row_mapC);
230 size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
232 entriesC = lno_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing(
"entriesC"), c_nnz_size);
233 valuesC = scalar_view_t (Kokkos::ViewAllocateWithoutInitializing(
"valuesC"), c_nnz_size);
235 KokkosSparse::Experimental::spgemm_numeric(&kh,AnumRows,BnumRows,BnumCols,Amat.graph.row_map,Amat.graph.entries,Amat.values,
false,Bmerged.graph.row_map,Bmerged.graph.entries,Bmerged.values,
false,row_mapC,entriesC,valuesC);
236 kh.destroy_spgemm_handle();
238 #ifdef HAVE_TPETRA_MMM_TIMINGS
239 MM = Teuchos::null; MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix CudaSort"))));
243 if (params.is_null() || params->get(
"sort entries",
true))
244 Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
245 C.setAllValues(row_mapC,entriesC,valuesC);
247 #ifdef HAVE_TPETRA_MMM_TIMINGS
248 MM = Teuchos::null; MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix CudaESFC"))));
252 RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
253 labelList->set(
"Timer Label",label);
254 if(!params.is_null()) labelList->set(
"compute global constants",params->get(
"compute global constants",
true));
255 RCP<const Export<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > dummyExport;
256 C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
261 template<
class Scalar,
264 class LocalOrdinalViewType>
265 void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode,LocalOrdinalViewType>::mult_A_B_reuse_kernel_wrapper(
266 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
267 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
268 const LocalOrdinalViewType & targetMapToOrigRow_dev,
269 const LocalOrdinalViewType & targetMapToImportRow_dev,
270 const LocalOrdinalViewType & Bcol2Ccol_dev,
271 const LocalOrdinalViewType & Icol2Ccol_dev,
272 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
273 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
274 const std::string& label,
275 const Teuchos::RCP<Teuchos::ParameterList>& params) {
278 typedef Tpetra::KokkosCompat::KokkosCudaWrapperNode Node;
280 #ifdef HAVE_TPETRA_MMM_TIMINGS
281 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
282 using Teuchos::TimeMonitor;
283 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Reuse SerialCore"))));
284 Teuchos::RCP<Teuchos::TimeMonitor> MM2;
291 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
292 typedef typename KCRS::StaticCrsGraphType graph_t;
293 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
294 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
295 typedef typename KCRS::values_type::non_const_type scalar_view_t;
298 typedef LocalOrdinal LO;
299 typedef GlobalOrdinal GO;
301 typedef Map<LO,GO,NO> map_type;
302 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
303 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
304 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
312 auto targetMapToOrigRow =
313 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
314 targetMapToOrigRow_dev);
315 auto targetMapToImportRow =
316 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
317 targetMapToImportRow_dev);
319 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
322 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
326 RCP<const map_type> Ccolmap = C.getColMap();
327 size_t m = Aview.origMatrix->getLocalNumRows();
328 size_t n = Ccolmap->getLocalNumElements();
331 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
332 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
333 const KCRS & Cmat = C.getLocalMatrixHost();
335 c_lno_view_t Arowptr = Amat.graph.row_map,
336 Browptr = Bmat.graph.row_map,
337 Crowptr = Cmat.graph.row_map;
338 const lno_nnz_view_t Acolind = Amat.graph.entries,
339 Bcolind = Bmat.graph.entries,
340 Ccolind = Cmat.graph.entries;
341 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
342 scalar_view_t Cvals = Cmat.values;
344 c_lno_view_t Irowptr;
345 lno_nnz_view_t Icolind;
347 if(!Bview.importMatrix.is_null()) {
348 auto lclB = Bview.importMatrix->getLocalMatrixHost();
349 Irowptr = lclB.graph.row_map;
350 Icolind = lclB.graph.entries;
354 #ifdef HAVE_TPETRA_MMM_TIMINGS
355 MM2 = Teuchos::null; MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix SerialCore - Compare"))));
367 std::vector<size_t> c_status(n, ST_INVALID);
370 size_t CSR_ip = 0, OLD_ip = 0;
371 for (
size_t i = 0; i < m; i++) {
375 CSR_ip = Crowptr[i+1];
376 for (
size_t k = OLD_ip; k < CSR_ip; k++) {
377 c_status[Ccolind[k]] = k;
383 for (
size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
385 const SC Aval = Avals[k];
389 if (targetMapToOrigRow[Aik] != LO_INVALID) {
391 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow[Aik]);
393 for (
size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
395 LO Cij = Bcol2Ccol[Bkj];
397 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
398 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
399 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
401 Cvals[c_status[Cij]] += Aval * Bvals[j];
406 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow[Aik]);
407 for (
size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
409 LO Cij = Icol2Ccol[Ikj];
411 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
412 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
413 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
415 Cvals[c_status[Cij]] += Aval * Ivals[j];
421 C.fillComplete(C.getDomainMap(), C.getRangeMap());
425 template<
class Scalar,
428 class LocalOrdinalViewType>
429 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode,LocalOrdinalViewType>::jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
430 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> & Dinv,
431 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
432 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
433 const LocalOrdinalViewType & Acol2Brow,
434 const LocalOrdinalViewType & Acol2Irow,
435 const LocalOrdinalViewType & Bcol2Ccol,
436 const LocalOrdinalViewType & Icol2Ccol,
437 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
438 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
439 const std::string& label,
440 const Teuchos::RCP<Teuchos::ParameterList>& params) {
442 #ifdef HAVE_TPETRA_MMM_TIMINGS
443 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
444 using Teuchos::TimeMonitor;
445 Teuchos::RCP<TimeMonitor> MM;
453 std::string myalg(
"KK");
454 if(!params.is_null()) {
455 if(params->isParameter(
"cuda: jacobi algorithm"))
456 myalg = params->get(
"cuda: jacobi algorithm",myalg);
459 if(myalg ==
"MSAK") {
460 ::Tpetra::MatrixMatrix::ExtraKernels::jacobi_A_B_newmatrix_MultiplyScaleAddKernel(omega,Dinv,Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
462 else if(myalg ==
"KK") {
463 jacobi_A_B_newmatrix_KokkosKernels(omega,Dinv,Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
466 throw std::runtime_error(
"Tpetra::MatrixMatrix::Jacobi newmatrix unknown kernel");
469 #ifdef HAVE_TPETRA_MMM_TIMINGS
470 MM = Teuchos::null; MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix CudaESFC"))));
474 RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
475 labelList->set(
"Timer Label",label);
476 if(!params.is_null()) labelList->set(
"compute global constants",params->get(
"compute global constants",
true));
479 if(!C.isFillComplete()) {
480 RCP<const Export<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > dummyExport;
481 C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
489 template<
class Scalar,
492 class LocalOrdinalViewType>
493 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode,LocalOrdinalViewType>::jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
494 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> & Dinv,
495 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
496 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
497 const LocalOrdinalViewType & targetMapToOrigRow_dev,
498 const LocalOrdinalViewType & targetMapToImportRow_dev,
499 const LocalOrdinalViewType & Bcol2Ccol_dev,
500 const LocalOrdinalViewType & Icol2Ccol_dev,
501 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
502 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
503 const std::string& label,
504 const Teuchos::RCP<Teuchos::ParameterList>& params) {
507 typedef Tpetra::KokkosCompat::KokkosCudaWrapperNode Node;
509 #ifdef HAVE_TPETRA_MMM_TIMINGS
510 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
511 using Teuchos::TimeMonitor;
512 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse CudaCore"))));
513 Teuchos::RCP<Teuchos::TimeMonitor> MM2;
519 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
520 typedef typename KCRS::StaticCrsGraphType graph_t;
521 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
522 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
523 typedef typename KCRS::values_type::non_const_type scalar_view_t;
524 typedef typename scalar_view_t::memory_space scalar_memory_space;
527 typedef LocalOrdinal LO;
528 typedef GlobalOrdinal GO;
530 typedef Map<LO,GO,NO> map_type;
531 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
532 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
533 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
541 auto targetMapToOrigRow =
542 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
543 targetMapToOrigRow_dev);
544 auto targetMapToImportRow =
545 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
546 targetMapToImportRow_dev);
548 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
551 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
556 RCP<const map_type> Ccolmap = C.getColMap();
557 size_t m = Aview.origMatrix->getLocalNumRows();
558 size_t n = Ccolmap->getLocalNumElements();
561 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
562 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
563 const KCRS & Cmat = C.getLocalMatrixHost();
565 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
566 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
567 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
568 scalar_view_t Cvals = Cmat.values;
570 c_lno_view_t Irowptr;
571 lno_nnz_view_t Icolind;
573 if(!Bview.importMatrix.is_null()) {
574 auto lclB = Bview.importMatrix->getLocalMatrixHost();
575 Irowptr = lclB.graph.row_map;
576 Icolind = lclB.graph.entries;
582 Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
584 #ifdef HAVE_TPETRA_MMM_TIMINGS
585 MM2 = Teuchos::null; MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse CudaCore - Compare"))));
593 std::vector<size_t> c_status(n, ST_INVALID);
596 size_t CSR_ip = 0, OLD_ip = 0;
597 for (
size_t i = 0; i < m; i++) {
602 CSR_ip = Crowptr[i+1];
603 for (
size_t k = OLD_ip; k < CSR_ip; k++) {
604 c_status[Ccolind[k]] = k;
610 SC minusOmegaDval = -omega*Dvals(i,0);
613 for (
size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
614 Scalar Bval = Bvals[j];
618 LO Cij = Bcol2Ccol[Bij];
620 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
621 std::runtime_error,
"Trying to insert a new entry into a static graph");
623 Cvals[c_status[Cij]] = Bvals[j];
627 for (
size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
629 const SC Aval = Avals[k];
633 if (targetMapToOrigRow[Aik] != LO_INVALID) {
635 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow[Aik]);
637 for (
size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
639 LO Cij = Bcol2Ccol[Bkj];
641 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
642 std::runtime_error,
"Trying to insert a new entry into a static graph");
644 Cvals[c_status[Cij]] += minusOmegaDval * Aval * Bvals[j];
649 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow[Aik]);
650 for (
size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
652 LO Cij = Icol2Ccol[Ikj];
654 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
655 std::runtime_error,
"Trying to insert a new entry into a static graph");
657 Cvals[c_status[Cij]] += minusOmegaDval * Aval * Ivals[j];
663 #ifdef HAVE_TPETRA_MMM_TIMINGS
665 MM = Teuchos::null; MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse ESFC"))));
668 C.fillComplete(C.getDomainMap(), C.getRangeMap());
673 template<
class Scalar,
676 class LocalOrdinalViewType>
677 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode,LocalOrdinalViewType>::jacobi_A_B_newmatrix_KokkosKernels(Scalar omega,
678 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> & Dinv,
679 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
680 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
681 const LocalOrdinalViewType & Acol2Brow,
682 const LocalOrdinalViewType & Acol2Irow,
683 const LocalOrdinalViewType & Bcol2Ccol,
684 const LocalOrdinalViewType & Icol2Ccol,
685 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
686 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
687 const std::string& label,
688 const Teuchos::RCP<Teuchos::ParameterList>& params) {
690 #ifdef HAVE_TPETRA_MMM_TIMINGS
691 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
692 using Teuchos::TimeMonitor;
693 Teuchos::RCP<TimeMonitor> MM;
700 auto rowMap = Aview.origMatrix->getRowMap();
702 Aview.origMatrix->getLocalDiagCopy(diags);
703 size_t diagLength = rowMap->getLocalNumElements();
704 Teuchos::Array<Scalar> diagonal(diagLength);
705 diags.get1dCopy(diagonal());
707 for(
size_t i = 0; i < diagLength; ++i) {
708 TEUCHOS_TEST_FOR_EXCEPTION(diagonal[i] == Teuchos::ScalarTraits<Scalar>::zero(),
710 "Matrix A has a zero/missing diagonal: " << diagonal[i] << std::endl <<
711 "KokkosKernels Jacobi-fused SpGEMM requires nonzero diagonal entries in A" << std::endl);
716 using device_t =
typename Tpetra::KokkosCompat::KokkosCudaWrapperNode::device_type;
718 using graph_t =
typename matrix_t::StaticCrsGraphType;
719 using lno_view_t =
typename graph_t::row_map_type::non_const_type;
720 using c_lno_view_t =
typename graph_t::row_map_type::const_type;
721 using lno_nnz_view_t =
typename graph_t::entries_type::non_const_type;
722 using scalar_view_t =
typename matrix_t::values_type::non_const_type;
725 using handle_t =
typename KokkosKernels::Experimental::KokkosKernelsHandle<
726 typename lno_view_t::const_value_type,
typename lno_nnz_view_t::const_value_type,
typename scalar_view_t::const_value_type,
727 typename device_t::execution_space,
typename device_t::memory_space,
typename device_t::memory_space >;
730 c_lno_view_t Irowptr;
731 lno_nnz_view_t Icolind;
733 if(!Bview.importMatrix.is_null()) {
734 auto lclB = Bview.importMatrix->getLocalMatrixDevice();
735 Irowptr = lclB.graph.row_map;
736 Icolind = lclB.graph.entries;
741 const matrix_t Bmerged = Tpetra::MMdetails::merge_matrices(Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C.getColMap()->getLocalNumElements());
744 const matrix_t & Amat = Aview.origMatrix->getLocalMatrixDevice();
745 const matrix_t & Bmat = Bview.origMatrix->getLocalMatrixDevice();
747 typename handle_t::nnz_lno_t AnumRows = Amat.numRows();
748 typename handle_t::nnz_lno_t BnumRows = Bmerged.numRows();
749 typename handle_t::nnz_lno_t BnumCols = Bmerged.numCols();
751 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmerged.graph.row_map;
752 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmerged.graph.entries;
753 const scalar_view_t Avals = Amat.values, Bvals = Bmerged.values;
756 lno_view_t row_mapC (Kokkos::ViewAllocateWithoutInitializing(
"non_const_lnow_row"), AnumRows + 1);
757 lno_nnz_view_t entriesC;
758 scalar_view_t valuesC;
761 int team_work_size = 16;
762 std::string myalg(
"SPGEMM_KK_MEMORY");
763 if(!params.is_null()) {
764 if(params->isParameter(
"cuda: algorithm"))
765 myalg = params->get(
"cuda: algorithm",myalg);
766 if(params->isParameter(
"cuda: team work size"))
767 team_work_size = params->get(
"cuda: team work size",team_work_size);
771 std::string nodename(
"Cuda");
772 std::string alg = nodename + std::string(
" algorithm");
773 if(!params.is_null() && params->isParameter(alg)) myalg = params->get(alg,myalg);
774 KokkosSparse::SPGEMMAlgorithm alg_enum = KokkosSparse::StringToSPGEMMAlgorithm(myalg);
779 kh.create_spgemm_handle(alg_enum);
780 kh.set_team_work_size(team_work_size);
782 KokkosSparse::Experimental::spgemm_symbolic(&kh, AnumRows, BnumRows, BnumCols,
783 Arowptr, Acolind,
false,
784 Browptr, Bcolind,
false,
787 size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
789 entriesC = lno_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing(
"entriesC"), c_nnz_size);
790 valuesC = scalar_view_t (Kokkos::ViewAllocateWithoutInitializing(
"valuesC"), c_nnz_size);
793 KokkosSparse::Experimental::spgemm_jacobi(&kh, AnumRows, BnumRows, BnumCols,
794 Arowptr, Acolind, Avals,
false,
795 Browptr, Bcolind, Bvals,
false,
796 row_mapC, entriesC, valuesC,
797 omega, Dinv.getLocalViewDevice(Access::ReadOnly));
798 kh.destroy_spgemm_handle();
800 #ifdef HAVE_TPETRA_MMM_TIMINGS
801 MM = Teuchos::null; MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix CudaSort"))));
805 if (params.is_null() || params->get(
"sort entries",
true))
806 Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
807 C.setAllValues(row_mapC,entriesC,valuesC);
809 #ifdef HAVE_TPETRA_MMM_TIMINGS
810 MM = Teuchos::null; MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix CudaESFC"))));
814 Teuchos::RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
815 labelList->set(
"Timer Label",label);
816 if(!params.is_null()) labelList->set(
"compute global constants",params->get(
"compute global constants",
true));
817 Teuchos::RCP<const Export<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > dummyExport;
818 C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
static bool debug()
Whether Tpetra is in debug mode.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
A distributed dense vector.