Tpetra_Details_residual.hpp
// @HEADER
// ***********************************************************************
//
//          Tpetra: Templated Linear Algebra Services Package
//                 Copyright (2008) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
//
// ************************************************************************
// @HEADER

#ifndef TPETRA_DETAILS_RESIDUAL_HPP
#define TPETRA_DETAILS_RESIDUAL_HPP

#include "TpetraCore_config.h"
#include "Tpetra_CrsMatrix.hpp"
#include "Tpetra_LocalCrsMatrixOperator.hpp"
#include "Tpetra_MultiVector.hpp"
#include "Teuchos_RCP.hpp"
#include "Teuchos_ScalarTraits.hpp"
#include "Tpetra_Details_Behavior.hpp"
#include "Tpetra_Details_Profiling.hpp"
#include "KokkosSparse_spmv_impl.hpp"

namespace Tpetra {
namespace Details {

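/// \brief Functor for computing the residual R = B - A*X.
///
/// The boolean template parameters pick a compile-time specialization:
/// is_MV means X, B, and R have more than one column; restrictedMode
/// means the local (on-rank) part of X is read from X_domainmap_lcl and
/// only off-rank entries from X_colmap_lcl, with offsets(lclRow) marking
/// the first off-rank entry in each row; skipOffRank means the off-rank
/// entries are skipped here, so that OffRankUpdateFunctor can apply them
/// after the Import completes.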
template<class AMatrix, class MV, class ConstMV, class Offsets, bool is_MV, bool restrictedMode, bool skipOffRank>
struct LocalResidualFunctor {
  using execution_space = typename AMatrix::execution_space;
  using LO = typename AMatrix::non_const_ordinal_type;
  using value_type = typename AMatrix::non_const_value_type;
  using team_policy = typename Kokkos::TeamPolicy<execution_space>;
  using team_member = typename team_policy::member_type;
  using ATV = Kokkos::ArithTraits<value_type>;

  AMatrix A_lcl;
  ConstMV X_colmap_lcl;
  ConstMV B_lcl;
  MV R_lcl;
  int rows_per_team;
  Offsets offsets;
  ConstMV X_domainmap_lcl;

  LocalResidualFunctor (const AMatrix& A_lcl_,
                        const ConstMV& X_colmap_lcl_,
                        const ConstMV& B_lcl_,
                        const MV& R_lcl_,
                        const int rows_per_team_,
                        const Offsets& offsets_,
                        const ConstMV& X_domainmap_lcl_) :
    A_lcl(A_lcl_),
    X_colmap_lcl(X_colmap_lcl_),
    B_lcl(B_lcl_),
    R_lcl(R_lcl_),
    rows_per_team(rows_per_team_),
    offsets(offsets_),
    X_domainmap_lcl(X_domainmap_lcl_)
  { }

  KOKKOS_INLINE_FUNCTION
  void operator() (const team_member& dev) const
  {
    Kokkos::parallel_for(Kokkos::TeamThreadRange (dev, 0, rows_per_team),[&] (const LO& loop) {
      const LO lclRow = static_cast<LO> (dev.league_rank ()) * rows_per_team + loop;

      if (lclRow >= A_lcl.numRows ()) {
        return;
      }

      if (!is_MV) { // MultiVectors only have a single column

        value_type A_x = ATV::zero ();

        if (!restrictedMode) {
          const auto A_row = A_lcl.rowConst(lclRow);
          const LO row_length = static_cast<LO> (A_row.length);

          Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, row_length), [&] (const LO iEntry, value_type& lsum) {
            const auto A_val = A_row.value(iEntry);
            lsum += A_val * X_colmap_lcl(A_row.colidx(iEntry),0);
          }, A_x);
        }
        else {
          const LO offRankOffset = offsets(lclRow);
          const size_t start = A_lcl.graph.row_map(lclRow);
          const size_t end = A_lcl.graph.row_map(lclRow+1);

          Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, start, end), [&] (const LO iEntry, value_type& lsum) {
            const auto A_val = A_lcl.values(iEntry);
            const auto lclCol = A_lcl.graph.entries(iEntry);
            if (iEntry < offRankOffset)
              lsum += A_val * X_domainmap_lcl(lclCol,0);
            else if (!skipOffRank)
              lsum += A_val * X_colmap_lcl(lclCol,0);
          }, A_x);
        }

        Kokkos::single(Kokkos::PerThread(dev),[&] () {
          R_lcl(lclRow,0) = B_lcl(lclRow,0) - A_x;
        });
      }
      else { // MultiVectors have more than one column

        const LO numVectors = static_cast<LO>(X_colmap_lcl.extent(1));

        for(LO v=0; v<numVectors; v++) {

          value_type A_x = ATV::zero ();

          if (!restrictedMode) {
            const auto A_row = A_lcl.rowConst(lclRow);
            const LO row_length = static_cast<LO> (A_row.length);

            Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, row_length), [&] (const LO iEntry, value_type& lsum) {
              const auto A_val = A_row.value(iEntry);
              lsum += A_val * X_colmap_lcl(A_row.colidx(iEntry),v);
            }, A_x);
          }
          else {
            const LO offRankOffset = offsets(lclRow);
            const size_t start = A_lcl.graph.row_map(lclRow);
            const size_t end = A_lcl.graph.row_map(lclRow+1);

            Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, start, end), [&] (const LO iEntry, value_type& lsum) {
              const auto A_val = A_lcl.values(iEntry);
              const auto lclCol = A_lcl.graph.entries(iEntry);
              if (iEntry < offRankOffset)
                lsum += A_val * X_domainmap_lcl(lclCol,v);
              else if (!skipOffRank)
                lsum += A_val * X_colmap_lcl(lclCol,v);
            }, A_x);
          }

          Kokkos::single(Kokkos::PerThread(dev),[&] () {
            R_lcl(lclRow,v) = B_lcl(lclRow,v) - A_x;
          });

        }//end for numVectors
      }
    });//end parallel_for TeamThreadRange
  }
};

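/// \brief Functor for computing R -= A_offRank*X_colmap.
///
/// Companion to LocalResidualFunctor with skipOffRank = true: once the
/// Import of X has finished, this folds the off-rank columns (entries at
/// or past offsets(lclRow) in each row) into the residual.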
template<class AMatrix, class MV, class ConstMV, class Offsets, bool is_MV>
struct OffRankUpdateFunctor {
  using execution_space = typename AMatrix::execution_space;
  using LO = typename AMatrix::non_const_ordinal_type;
  using value_type = typename AMatrix::non_const_value_type;
  using team_policy = typename Kokkos::TeamPolicy<execution_space>;
  using team_member = typename team_policy::member_type;
  using ATV = Kokkos::ArithTraits<value_type>;

  AMatrix A_lcl;
  ConstMV X_colmap_lcl;
  MV R_lcl;
  int rows_per_team;
  Offsets offsets;

  OffRankUpdateFunctor (const AMatrix& A_lcl_,
                        const ConstMV& X_colmap_lcl_,
                        const MV& R_lcl_,
                        const int rows_per_team_,
                        const Offsets& offsets_) :
    A_lcl(A_lcl_),
    X_colmap_lcl(X_colmap_lcl_),
    R_lcl(R_lcl_),
    rows_per_team(rows_per_team_),
    offsets(offsets_)
  { }

  KOKKOS_INLINE_FUNCTION
  void operator() (const team_member& dev) const
  {
    Kokkos::parallel_for(Kokkos::TeamThreadRange (dev, 0, rows_per_team),[&] (const LO& loop) {
      const LO lclRow = static_cast<LO> (dev.league_rank ()) * rows_per_team + loop;

      if (lclRow >= A_lcl.numRows ()) {
        return;
      }

      const LO offRankOffset = offsets(lclRow);
      const size_t end = A_lcl.graph.row_map(lclRow+1);

      if (!is_MV) { // MultiVectors only have a single column

        value_type A_x = ATV::zero ();

        Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, offRankOffset, end), [&] (const LO iEntry, value_type& lsum) {
          const auto A_val = A_lcl.values(iEntry);
          const auto lclCol = A_lcl.graph.entries(iEntry);
          lsum += A_val * X_colmap_lcl(lclCol,0);
        }, A_x);

        Kokkos::single(Kokkos::PerThread(dev),[&] () {
          R_lcl(lclRow,0) -= A_x;
        });
      }
      else { // MultiVectors have more than one column

        const LO numVectors = static_cast<LO>(X_colmap_lcl.extent(1));

        for(LO v=0; v<numVectors; v++) {

          value_type A_x = ATV::zero ();

          Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, offRankOffset, end), [&] (const LO iEntry, value_type& lsum) {
            const auto A_val = A_lcl.values(iEntry);
            const auto lclCol = A_lcl.graph.entries(iEntry);
            lsum += A_val * X_colmap_lcl(lclCol,v);
          }, A_x);

          Kokkos::single(Kokkos::PerThread(dev),[&] () {
            R_lcl(lclRow,v) -= A_x;
          });

        }//end for numVectors
      }
    });
  }
};

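/// \brief Compute the local part of the residual, R = B - A*X, on the
///   calling process.
///
/// X_colmap must already hold the Imported column Map data. If
/// X_domainmap is nonnull, restricted mode is used: on-rank entries of X
/// are read from X_domainmap and off-rank entries from X_colmap, with
/// offsets giving the first off-rank entry of each row.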
template<class SC, class LO, class GO, class NO>
void localResidual(const CrsMatrix<SC,LO,GO,NO> & A,
                   const MultiVector<SC,LO,GO,NO> & X_colmap,
                   const MultiVector<SC,LO,GO,NO> & B,
                   MultiVector<SC,LO,GO,NO> & R,
                   const Kokkos::View<const size_t*, typename NO::device_type>& offsets,
                   const MultiVector<SC,LO,GO,NO> * X_domainmap=nullptr) {
  using Tpetra::Details::ProfilingRegion;
  using Teuchos::NO_TRANS;
  ProfilingRegion regionLocalApply ("Tpetra::CrsMatrix::localResidual");

  using local_matrix_device_type = typename CrsMatrix<SC,LO,GO,NO>::local_matrix_device_type;
  using local_view_device_type = typename MultiVector<SC,LO,GO,NO>::dual_view_type::t_dev::non_const_type;
  using const_local_view_device_type = typename MultiVector<SC,LO,GO,NO>::dual_view_type::t_dev::const_type;
  using offset_type = Kokkos::View<const size_t*, typename NO::device_type>;

  local_matrix_device_type A_lcl = A.getLocalMatrixDevice ();
  const_local_view_device_type X_colmap_lcl = X_colmap.getLocalViewDevice(Access::ReadOnly);
  const_local_view_device_type B_lcl = B.getLocalViewDevice(Access::ReadOnly);
  local_view_device_type R_lcl = R.getLocalViewDevice(Access::OverwriteAll);
  const_local_view_device_type X_domainmap_lcl;

  const bool debug = ::Tpetra::Details::Behavior::debug ();
  if (debug) {
    TEUCHOS_TEST_FOR_EXCEPTION
      (X_colmap.getNumVectors () != R.getNumVectors (), std::runtime_error,
       "X.getNumVectors() = " << X_colmap.getNumVectors () << " != "
       "R.getNumVectors() = " << R.getNumVectors () << ".");
    TEUCHOS_TEST_FOR_EXCEPTION
      (X_colmap.getLocalLength () !=
       A.getColMap ()->getLocalNumElements (), std::runtime_error,
       "X has the wrong number of local rows. "
       "X.getLocalLength() = " << X_colmap.getLocalLength () << " != "
       "A.getColMap()->getLocalNumElements() = " <<
       A.getColMap ()->getLocalNumElements () << ".");
    TEUCHOS_TEST_FOR_EXCEPTION
      (R.getLocalLength () !=
       A.getRowMap ()->getLocalNumElements (), std::runtime_error,
       "R has the wrong number of local rows. "
       "R.getLocalLength() = " << R.getLocalLength () << " != "
       "A.getRowMap()->getLocalNumElements() = " <<
       A.getRowMap ()->getLocalNumElements () << ".");
    TEUCHOS_TEST_FOR_EXCEPTION
      (B.getLocalLength () !=
       A.getRowMap ()->getLocalNumElements (), std::runtime_error,
       "B has the wrong number of local rows. "
       "B.getLocalLength() = " << B.getLocalLength () << " != "
       "A.getRowMap()->getLocalNumElements() = " <<
       A.getRowMap ()->getLocalNumElements () << ".");

    TEUCHOS_TEST_FOR_EXCEPTION
      (! A.isFillComplete (), std::runtime_error, "The matrix A is not "
       "fill complete. You must call fillComplete() (possibly with "
       "domain and range Map arguments) without an intervening "
       "resumeFill() call before you may call this method.");
    TEUCHOS_TEST_FOR_EXCEPTION
      (! X_colmap.isConstantStride () || ! R.isConstantStride () || ! B.isConstantStride (),
       std::runtime_error, "X, R, and B must be constant stride.");
    // If the two pointers are NULL, then they don't alias one
    // another, even though they are equal.
    TEUCHOS_TEST_FOR_EXCEPTION
      ((X_colmap_lcl.data () == R_lcl.data () && X_colmap_lcl.data () != nullptr) ||
       (X_colmap_lcl.data () == B_lcl.data () && X_colmap_lcl.data () != nullptr),
       std::runtime_error, "X, B, and R may not alias one another.");
  }

  const bool fusedResidual = ::Tpetra::Details::Behavior::fusedResidual ();
  if (!fusedResidual) {
    SC one = Teuchos::ScalarTraits<SC>::one();
    SC negone = -one;
    SC zero = Teuchos::ScalarTraits<SC>::zero();
    // This is currently a "reference implementation" waiting until Kokkos Kernels provides
    // a residual kernel.
    A.localApply(X_colmap,R,Teuchos::NO_TRANS, one, zero);
    R.update(one,B,negone);
    return;
  }

  if (A_lcl.numRows() == 0) {
    return;
  }

  int64_t numLocalRows = A_lcl.numRows ();
  int64_t myNnz = A_lcl.nnz();

  int team_size = -1;
  int vector_length = -1;
  int64_t rows_per_thread = -1;

  using execution_space = typename CrsMatrix<SC,LO,GO,NO>::execution_space;
  using policy_type = typename Kokkos::TeamPolicy<execution_space>;

  int64_t rows_per_team = KokkosSparse::Impl::spmv_launch_parameters<execution_space>(numLocalRows, myNnz, rows_per_thread, team_size, vector_length);
  int64_t worksets = (B_lcl.extent (0) + rows_per_team - 1) / rows_per_team;

  policy_type policy (1, 1);
  if (team_size < 0) {
    policy = policy_type (worksets, Kokkos::AUTO, vector_length);
  }
  else {
    policy = policy_type (worksets, team_size, vector_length);
  }

  bool is_vector = (X_colmap_lcl.extent(1) == 1);

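  // Dispatch to the LocalResidualFunctor specialization
  // <is_MV, restrictedMode, skipOffRank>; restricted mode is selected by
  // passing a nonnull X_domainmap.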
  if(is_vector) {

    if (X_domainmap == nullptr) {

      using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,false,false,false>;
      functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
      Kokkos::parallel_for("residual-vector",policy,func);

    }
    else {

      X_domainmap_lcl = X_domainmap->getLocalViewDevice(Access::ReadOnly);
      using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,false,true,false>;
      functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
      Kokkos::parallel_for("residual-vector",policy,func);

    }
  }
  else {

    if (X_domainmap == nullptr) {

      using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,true,false,false>;
      functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
      Kokkos::parallel_for("residual-multivector",policy,func);

    }
    else {

      X_domainmap_lcl = X_domainmap->getLocalViewDevice(Access::ReadOnly);
      using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,true,true,false>;
      functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
      Kokkos::parallel_for("residual-multivector",policy,func);

    }
  }
}

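/// \brief Compute R = B - A*X while overlapping communication with
///   computation.
///
/// The caller must already have started the Import of X_domainmap into
/// X_colmap via beginImport. This routine applies the on-rank part of A
/// first, finishes the Import with endImport, and then applies the
/// off-rank part via OffRankUpdateFunctor.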
template<class SC, class LO, class GO, class NO>
void localResidualWithCommCompOverlap(const CrsMatrix<SC,LO,GO,NO> & A,
                                      MultiVector<SC,LO,GO,NO> & X_colmap,
                                      const MultiVector<SC,LO,GO,NO> & B,
                                      MultiVector<SC,LO,GO,NO> & R,
                                      const Kokkos::View<const size_t*, typename NO::device_type>& offsets,
                                      const MultiVector<SC,LO,GO,NO> & X_domainmap) {
  using Tpetra::Details::ProfilingRegion;
  using Teuchos::NO_TRANS;
  using Teuchos::RCP;
  using import_type = typename CrsMatrix<SC,LO,GO,NO>::import_type;

  ProfilingRegion regionLocalApply ("Tpetra::CrsMatrix::localResidualWithCommCompOverlap");

  using local_matrix_device_type = typename CrsMatrix<SC,LO,GO,NO>::local_matrix_device_type;
  using local_view_device_type = typename MultiVector<SC,LO,GO,NO>::dual_view_type::t_dev::non_const_type;
  using const_local_view_device_type = typename MultiVector<SC,LO,GO,NO>::dual_view_type::t_dev::const_type;
  using offset_type = Kokkos::View<const size_t*, typename NO::device_type>;

  local_matrix_device_type A_lcl = A.getLocalMatrixDevice ();
  const_local_view_device_type X_colmap_lcl = X_colmap.getLocalViewDevice(Access::ReadOnly);
  const_local_view_device_type B_lcl = B.getLocalViewDevice(Access::ReadOnly);
  local_view_device_type R_lcl = R.getLocalViewDevice(Access::OverwriteAll);
  const_local_view_device_type X_domainmap_lcl = X_domainmap.getLocalViewDevice(Access::ReadOnly);

  const bool debug = ::Tpetra::Details::Behavior::debug ();
  if (debug) {
    TEUCHOS_TEST_FOR_EXCEPTION
      (X_colmap.getNumVectors () != R.getNumVectors (), std::runtime_error,
       "X.getNumVectors() = " << X_colmap.getNumVectors () << " != "
       "R.getNumVectors() = " << R.getNumVectors () << ".");
    TEUCHOS_TEST_FOR_EXCEPTION
      (X_colmap.getLocalLength () !=
       A.getColMap ()->getLocalNumElements (), std::runtime_error,
       "X has the wrong number of local rows. "
       "X.getLocalLength() = " << X_colmap.getLocalLength () << " != "
       "A.getColMap()->getLocalNumElements() = " <<
       A.getColMap ()->getLocalNumElements () << ".");
    TEUCHOS_TEST_FOR_EXCEPTION
      (R.getLocalLength () !=
       A.getRowMap ()->getLocalNumElements (), std::runtime_error,
       "R has the wrong number of local rows. "
       "R.getLocalLength() = " << R.getLocalLength () << " != "
       "A.getRowMap()->getLocalNumElements() = " <<
       A.getRowMap ()->getLocalNumElements () << ".");
    TEUCHOS_TEST_FOR_EXCEPTION
      (B.getLocalLength () !=
       A.getRowMap ()->getLocalNumElements (), std::runtime_error,
       "B has the wrong number of local rows. "
       "B.getLocalLength() = " << B.getLocalLength () << " != "
       "A.getRowMap()->getLocalNumElements() = " <<
       A.getRowMap ()->getLocalNumElements () << ".");

    TEUCHOS_TEST_FOR_EXCEPTION
      (! A.isFillComplete (), std::runtime_error, "The matrix A is not "
       "fill complete. You must call fillComplete() (possibly with "
       "domain and range Map arguments) without an intervening "
       "resumeFill() call before you may call this method.");
    TEUCHOS_TEST_FOR_EXCEPTION
      (! X_colmap.isConstantStride () || ! R.isConstantStride () || ! B.isConstantStride (),
       std::runtime_error, "X, R, and B must be constant stride.");
    // If the two pointers are NULL, then they don't alias one
    // another, even though they are equal.
    TEUCHOS_TEST_FOR_EXCEPTION
      ((X_colmap_lcl.data () == R_lcl.data () && X_colmap_lcl.data () != nullptr) ||
       (X_colmap_lcl.data () == B_lcl.data () && X_colmap_lcl.data () != nullptr),
       std::runtime_error, "X, B, and R may not alias one another.");
  }

  if (A_lcl.numRows() == 0) {
    return;
  }

  int64_t numLocalRows = A_lcl.numRows ();
  int64_t myNnz = A_lcl.nnz();

  int team_size = -1;
  int vector_length = -1;
  int64_t rows_per_thread = -1;

  using execution_space = typename CrsMatrix<SC,LO,GO,NO>::execution_space;
  using policy_type = typename Kokkos::TeamPolicy<execution_space>;

  int64_t rows_per_team = KokkosSparse::Impl::spmv_launch_parameters<execution_space>(numLocalRows, myNnz, rows_per_thread, team_size, vector_length);
  int64_t worksets = (B_lcl.extent (0) + rows_per_team - 1) / rows_per_team;

  policy_type policy (1, 1);
  if (team_size < 0) {
    policy = policy_type (worksets, Kokkos::AUTO, vector_length);
  }
  else {
    policy = policy_type (worksets, team_size, vector_length);
  }

  bool is_vector = (X_colmap_lcl.extent(1) == 1);

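  // Apply the on-rank part of A (skipOffRank = true) while the Import is
  // still in flight, then finish the Import, fence so the communicated
  // entries of X_colmap are visible, and fold in the off-rank columns.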
  if(is_vector) {

    using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,false,true,true>;
    functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
    Kokkos::parallel_for("residual-vector",policy,func);

    RCP<const import_type> importer = A.getGraph ()->getImporter ();
    X_colmap.endImport (X_domainmap, *importer, INSERT, true);

    Kokkos::fence("Tpetra::localResidualWithCommCompOverlap-1");

    using functor_type2 = OffRankUpdateFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,false>;
    functor_type2 func2 (A_lcl, X_colmap_lcl, R_lcl, rows_per_team, offsets);
    Kokkos::parallel_for("residual-vector-offrank",policy,func2);

  }
  else {

    using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,true,true,true>;
    functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
    Kokkos::parallel_for("residual-multivector",policy,func);

    RCP<const import_type> importer = A.getGraph ()->getImporter ();
    X_colmap.endImport (X_domainmap, *importer, INSERT, true);

    Kokkos::fence("Tpetra::localResidualWithCommCompOverlap-2");

    using functor_type2 = OffRankUpdateFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,true>;
    functor_type2 func2 (A_lcl, X_colmap_lcl, R_lcl, rows_per_team, offsets);
    Kokkos::parallel_for("residual-multivector-offrank",policy,func2);

  }
}

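/// \brief Computes R = B - A * X.
///
/// Handles the Import of X from the domain Map to the column Map (in
/// restricted mode when the Maps are locally fitted), the Export of the
/// result when row Map and range Map differ, and non-constant-stride
/// arguments. The local kernel is localResidual, or
/// localResidualWithCommCompOverlap when communication/computation
/// overlap is enabled.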
template<class SC, class LO, class GO, class NO>
void residual(const Operator<SC,LO,GO,NO> & Aop,
              const MultiVector<SC,LO,GO,NO> & X_in,
              const MultiVector<SC,LO,GO,NO> & B_in,
              MultiVector<SC,LO,GO,NO> & R_in) {
  using Tpetra::Details::ProfilingRegion;
  using Teuchos::RCP;
  using Teuchos::rcp;
  using Teuchos::rcp_const_cast;
  using Teuchos::rcpFromRef;

  const bool debug = ::Tpetra::Details::Behavior::debug ();
  const bool skipCopyAndPermuteIfPossible = ::Tpetra::Details::Behavior::skipCopyAndPermuteIfPossible ();
  const bool overlapCommunicationAndComputation = ::Tpetra::Details::Behavior::overlapCommunicationAndComputation ();
  if (overlapCommunicationAndComputation)
    TEUCHOS_ASSERT(skipCopyAndPermuteIfPossible);

  // Whether we are using restrictedMode in the import from domain to
  // column map. Restricted mode skips the copy and permutation of the
  // local part of X. We are using restrictedMode only when domain and
  // column map are locally fitted, i.e. when the local indices of
  // domain and column map match.
  bool restrictedMode = false;

  const CrsMatrix<SC,LO,GO,NO> * Apt = dynamic_cast<const CrsMatrix<SC,LO,GO,NO>*>(&Aop);
  if(!Apt) {
    // If we're not a CrsMatrix, we can't do fusion, so just do apply+update
    SC one = Teuchos::ScalarTraits<SC>::one(), negone = -one, zero = Teuchos::ScalarTraits<SC>::zero();
    Aop.apply(X_in,R_in,Teuchos::NO_TRANS, one, zero);
    R_in.update(one,B_in,negone);
    return;
  }
  const CrsMatrix<SC,LO,GO,NO> & A = *Apt;

  using import_type = typename CrsMatrix<SC,LO,GO,NO>::import_type;
  using export_type = typename CrsMatrix<SC,LO,GO,NO>::export_type;
  using MV = MultiVector<SC,LO,GO,NO>;
  using graph_type = Tpetra::CrsGraph<LO,GO,NO>;
  using offset_type = typename graph_type::offset_device_view_type;

  // We treat the case of a replicated MV output specially.
  const bool R_is_replicated =
    (! R_in.isDistributed () && A.getComm ()->getSize () != 1);

  // It's possible that R is a view of X or B.
  // We don't try to detect the more subtle cases (e.g., one is a
  // subview of the other, but their initial pointers differ). We
  // only need to do this if this matrix's Import is trivial;
  // otherwise, we don't actually apply the operator from X into R.

  RCP<const import_type> importer = A.getGraph ()->getImporter ();
  RCP<const export_type> exporter = A.getGraph ()->getExporter ();

  // Temporary MV for Import operation. After the block of code
  // below, this will be an (Imported if necessary) column Map MV
  // ready to give to localApply(...).
  RCP<MV> X_colMap;
  if (importer.is_null ()) {
    if (! X_in.isConstantStride ()) {
      // Not all sparse mat-vec kernels can handle an input MV with
      // nonconstant stride correctly, so we have to copy it in that
      // case into a constant stride MV. To make a constant stride
      // copy of X_in, we force creation of the column (== domain)
      // Map MV (if it hasn't already been created, else fetch the
      // cached copy). This avoids creating a new MV each time.
      X_colMap = A.getColumnMapMultiVector (X_in, true);
      Tpetra::deep_copy (*X_colMap, X_in);
    }
    else {
      // The domain and column Maps are the same, so do the local
      // multiply using the domain Map input MV X_in.
      X_colMap = rcp_const_cast<MV> (rcpFromRef (X_in) );
    }
  }
  else { // need to Import source (multi)vector
    ProfilingRegion regionImport ("Tpetra::CrsMatrix::residual: Import");
    // We're doing an Import anyway, which will copy the relevant
    // elements of the domain Map MV X_in into a separate column Map
    // MV. Thus, we don't have to worry whether X_in is constant
    // stride.
    X_colMap = A.getColumnMapMultiVector (X_in);

    // Do we want to use restrictedMode?
    restrictedMode = skipCopyAndPermuteIfPossible && importer->isLocallyFitted();

    if (debug && restrictedMode) {
      TEUCHOS_TEST_FOR_EXCEPTION
        (!importer->getTargetMap()->isLocallyFitted(*importer->getSourceMap()), std::runtime_error,
         "Source map and target map are not locally fitted, but Tpetra::residual thinks they are.");
    }

    // Import from the domain Map MV to the column Map MV.
    X_colMap->beginImport (X_in, *importer, INSERT, restrictedMode);
  }

  offset_type offsets;
  if (restrictedMode)
    A.getCrsGraph()->getLocalOffRankOffsets(offsets);

  // Get a vector for the R_rowMap output residual, handling the
  // non-constant stride and exporter cases. Since R gets clobbered,
  // we don't need to worry about the data in it.
  RCP<MV> R_rowMap;
  if(exporter.is_null()) {
    if (! R_in.isConstantStride ()) {
      R_rowMap = A.getRowMapMultiVector(R_in);
    }
    else {
      R_rowMap = rcpFromRef (R_in);
    }
  }
  else {
    R_rowMap = A.getRowMapMultiVector (R_in);
  }

  // Get a row Map version of the input vector B, handling the
  // non-constant stride and exporter cases.
  RCP<const MV> B_rowMap;
  if(exporter.is_null()) {
    if (! B_in.isConstantStride ()) {
      // Do an allocation here. If we need to optimize this later, we can have the matrix
      // cache this.
      RCP<MV> B_rowMapNonConst = rcp(new MV(A.getRowMap(),B_in.getNumVectors()));
      Tpetra::deep_copy (*B_rowMapNonConst, B_in);
      B_rowMap = rcp_const_cast<const MV> (B_rowMapNonConst);
    }
    else {
      B_rowMap = rcpFromRef (B_in);
    }
  }
  else {
    // Do an allocation here. If we need to optimize this later, we can have the matrix
    // cache this.
    ProfilingRegion regionExport ("Tpetra::CrsMatrix::residual: B Import");
    RCP<MV> B_rowMapNonConst = rcp(new MV(A.getRowMap(),B_in.getNumVectors()));
    B_rowMapNonConst->doImport(B_in, *exporter, ADD);
    B_rowMap = rcp_const_cast<const MV> (B_rowMapNonConst);
  }

  // If we have a nontrivial Export object, we must perform an
  // Export. In that case, the local multiply result will go into
  // the row Map multivector. We don't have to make a
  // constant-stride version of R_in in this case, because we had to
  // make a constant stride R_rowMap MV and do an Export anyway.
  if (! exporter.is_null ()) {
    if ( ! importer.is_null ())
      X_colMap->endImport (X_in, *importer, INSERT, restrictedMode);
    if (restrictedMode && !importer.is_null ())
      localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, offsets, &X_in);
    else
      localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, offsets);

    {
      ProfilingRegion regionExport ("Tpetra::CrsMatrix::residual: R Export");

      // Do the Export operation.
      R_in.doExport (*R_rowMap, *exporter, ADD);
    }
  }
  else { // Don't do an Export: row Map and range Map are the same.

    if (restrictedMode)
      if (overlapCommunicationAndComputation) {
        localResidualWithCommCompOverlap (A, *X_colMap, *B_rowMap, *R_rowMap, offsets, X_in);
      } else {
        X_colMap->endImport (X_in, *importer, INSERT, restrictedMode);
        localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, offsets, &X_in);
      }
    else {
      if ( ! importer.is_null ())
        X_colMap->endImport (X_in, *importer, INSERT, restrictedMode);
      localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, offsets);
    }

    //
    // If R_in does not have constant stride,
    // then we can't let the kernel write directly to R_in.
    // Instead, we have to use the cached row (== range)
    // Map MV as temporary storage.
    //
    if (! R_in.isConstantStride () ) {
      // We need to be sure to do a copy out in this case.
      Tpetra::deep_copy (R_in, *R_rowMap);
    }
  }

  // If the range Map is a locally replicated Map, sum up
  // contributions from each process.
  if (R_is_replicated) {
    ProfilingRegion regionReduce ("Tpetra::CrsMatrix::residual: Reduce Y");
    R_in.reduce ();
  }
}
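
// A minimal usage sketch (illustrative only, not part of this header's
// API surface): given a fill-complete CrsMatrix A and MultiVectors X, B,
// and R compatible with A's domain and range Maps,
//
//   Tpetra::Details::residual (A, X, B, R);   // R = B - A*X
//
// which is equivalent, up to kernel fusion, to the unfused fallback used
// above when Behavior::fusedResidual() is off:
//
//   A.apply (X, R);            // R = A*X
//   R.update (one, B, -one);   // R = B - R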

} // namespace Details
} // namespace Tpetra

#endif // TPETRA_DETAILS_RESIDUAL_HPP