Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Tpetra_BlockCrsMatrix_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // ************************************************************************
38 // @HEADER
39 // clang-format off
40 
41 #ifndef TPETRA_BLOCKCRSMATRIX_DEF_HPP
42 #define TPETRA_BLOCKCRSMATRIX_DEF_HPP
43 
46 
50 #include "Tpetra_BlockMultiVector.hpp"
51 #include "Tpetra_BlockView.hpp"
52 
53 #include "Teuchos_TimeMonitor.hpp"
54 #ifdef HAVE_TPETRA_DEBUG
55 # include <set>
56 #endif // HAVE_TPETRA_DEBUG
57 
58 #include "KokkosSparse_spmv.hpp"
59 
60 namespace Tpetra {
61 
62 namespace Impl {
63 
64  template<typename T>
65  struct BlockCrsRowStruct {
66  T totalNumEntries, totalNumBytes, maxRowLength;
67  KOKKOS_DEFAULTED_FUNCTION BlockCrsRowStruct() = default;
68  KOKKOS_DEFAULTED_FUNCTION BlockCrsRowStruct(const BlockCrsRowStruct &b) = default;
69  KOKKOS_INLINE_FUNCTION BlockCrsRowStruct(const T& numEnt, const T& numBytes, const T& rowLength)
70  : totalNumEntries(numEnt), totalNumBytes(numBytes), maxRowLength(rowLength) {}
71  };
72 
73  template<typename T>
74  static
75  KOKKOS_INLINE_FUNCTION
76  void operator+=(BlockCrsRowStruct<T> &a, const BlockCrsRowStruct<T> &b) {
77  a.totalNumEntries += b.totalNumEntries;
78  a.totalNumBytes += b.totalNumBytes;
79  a.maxRowLength = a.maxRowLength > b.maxRowLength ? a.maxRowLength : b.maxRowLength;
80  }
81 
82  template<typename T, typename ExecSpace>
83  struct BlockCrsReducer {
84  typedef BlockCrsReducer reducer;
85  typedef T value_type;
86  typedef Kokkos::View<value_type,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
87  value_type *value;
88 
89  KOKKOS_INLINE_FUNCTION
90  BlockCrsReducer(value_type &val) : value(&val) {}
91 
92  KOKKOS_INLINE_FUNCTION void join(value_type &dst, value_type &src) const { dst += src; }
93  KOKKOS_INLINE_FUNCTION void join(value_type &dst, const value_type &src) const { dst += src; }
94  KOKKOS_INLINE_FUNCTION void init(value_type &val) const { val = value_type(); }
95  KOKKOS_INLINE_FUNCTION value_type& reference() { return *value; }
96  KOKKOS_INLINE_FUNCTION result_view_type view() const { return result_view_type(value); }
97  };
98 
99 } // namespace Impl
100 
101 namespace { // (anonymous)
102 
103 // Implementation of BlockCrsMatrix::getLocalDiagCopy (non-deprecated
104 // version that takes two Kokkos::View arguments).
105 template<class Scalar, class LO, class GO, class Node>
106 class GetLocalDiagCopy {
107 public:
108  typedef typename Node::device_type device_type;
109  typedef size_t diag_offset_type;
110  typedef Kokkos::View<const size_t*, device_type,
111  Kokkos::MemoryUnmanaged> diag_offsets_type;
112  typedef typename ::Tpetra::CrsGraph<LO, GO, Node> global_graph_type;
113  typedef typename global_graph_type::local_graph_device_type local_graph_device_type;
114  typedef typename local_graph_device_type::row_map_type row_offsets_type;
115  typedef typename ::Tpetra::BlockMultiVector<Scalar, LO, GO, Node>::impl_scalar_type IST;
116  typedef Kokkos::View<IST***, device_type, Kokkos::MemoryUnmanaged> diag_type;
117  typedef Kokkos::View<const IST*, device_type, Kokkos::MemoryUnmanaged> values_type;
118 
119  // Constructor
120  GetLocalDiagCopy (const diag_type& diag,
121  const values_type& val,
122  const diag_offsets_type& diagOffsets,
123  const row_offsets_type& ptr,
124  const LO blockSize) :
125  diag_ (diag),
126  diagOffsets_ (diagOffsets),
127  ptr_ (ptr),
128  blockSize_ (blockSize),
129  offsetPerBlock_ (blockSize_*blockSize_),
130  val_(val)
131  {}
132 
133  KOKKOS_INLINE_FUNCTION void
134  operator() (const LO& lclRowInd) const
135  {
136  using Kokkos::ALL;
137 
138  // Get row offset
139  const size_t absOffset = ptr_[lclRowInd];
140 
141  // Get offset relative to start of row
142  const size_t relOffset = diagOffsets_[lclRowInd];
143 
144  // Get the total offset
145  const size_t pointOffset = (absOffset+relOffset)*offsetPerBlock_;
146 
147  // Get a view of the block. BCRS currently uses LayoutRight
148  // regardless of the device.
149  typedef Kokkos::View<const IST**, Impl::BlockCrsMatrixLittleBlockArrayLayout,
150  device_type, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
151  const_little_block_type;
152  const_little_block_type D_in (val_.data () + pointOffset,
153  blockSize_, blockSize_);
154  auto D_out = Kokkos::subview (diag_, lclRowInd, ALL (), ALL ());
155  COPY (D_in, D_out);
156  }
157 
158  private:
159  diag_type diag_;
160  diag_offsets_type diagOffsets_;
161  row_offsets_type ptr_;
162  LO blockSize_;
163  LO offsetPerBlock_;
164  values_type val_;
165  };
166 } // namespace (anonymous)
167 
168  template<class Scalar, class LO, class GO, class Node>
169  std::ostream&
170  BlockCrsMatrix<Scalar, LO, GO, Node>::
171  markLocalErrorAndGetStream ()
172  {
173  * (this->localError_) = true;
174  if ((*errs_).is_null ()) {
175  *errs_ = Teuchos::rcp (new std::ostringstream ());
176  }
177  return **errs_;
178  }
179 
180  template<class Scalar, class LO, class GO, class Node>
183  dist_object_type (Teuchos::rcp (new map_type ())), // nonnull, so DistObject doesn't throw
184  graph_ (Teuchos::rcp (new map_type ()), 0), // FIXME (mfh 16 May 2014) no empty ctor yet
185  blockSize_ (static_cast<LO> (0)),
186  X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
187  Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
188  pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
189  offsetPerBlock_ (0),
190  localError_ (new bool (false)),
191  errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
192  {
193  }
194 
195  template<class Scalar, class LO, class GO, class Node>
198  const LO blockSize) :
199  dist_object_type (graph.getMap ()),
200  graph_ (graph),
201  rowMeshMap_ (* (graph.getRowMap ())),
202  blockSize_ (blockSize),
203  X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
204  Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
205  pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
206  offsetPerBlock_ (blockSize * blockSize),
207  localError_ (new bool (false)),
208  errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
209  {
210 
212  TEUCHOS_TEST_FOR_EXCEPTION(
213  ! graph_.isSorted (), std::invalid_argument, "Tpetra::"
214  "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
215  "rows (isSorted() is false). This class assumes sorted rows.");
216 
217  graphRCP_ = Teuchos::rcpFromRef(graph_);
218 
219  // Trick to test whether LO is nonpositive, without a compiler
220  // warning in case LO is unsigned (which is generally a bad idea
221  // anyway). I don't promise that the trick works, but it
222  // generally does with gcc at least, in my experience.
223  const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
224  TEUCHOS_TEST_FOR_EXCEPTION(
225  blockSizeIsNonpositive, std::invalid_argument, "Tpetra::"
226  "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
227  " <= 0. The block size must be positive.");
228 
229  domainPointMap_ = BMV::makePointMap (* (graph.getDomainMap ()), blockSize);
230  rangePointMap_ = BMV::makePointMap (* (graph.getRangeMap ()), blockSize);
231 
232  {
233  // These are rcp
234  const auto domainPointMap = getDomainMap();
235  const auto colPointMap = Teuchos::rcp
236  (new typename BMV::map_type (BMV::makePointMap (*graph_.getColMap(), blockSize_)));
237  *pointImporter_ = Teuchos::rcp
238  (new typename crs_graph_type::import_type (domainPointMap, colPointMap));
239  }
240  {
241  auto local_graph_h = graph.getLocalGraphHost ();
242  auto ptr_h = local_graph_h.row_map;
243  ptrHost_ = decltype(ptrHost_)(Kokkos::ViewAllocateWithoutInitializing("graph row offset"), ptr_h.extent(0));
244  Kokkos::deep_copy(ptrHost_, ptr_h);
245 
246  auto ind_h = local_graph_h.entries;
247  indHost_ = decltype(indHost_)(Kokkos::ViewAllocateWithoutInitializing("graph column indices"), ind_h.extent(0));
248  // DEEP_COPY REVIEW - HOST-TO-HOST
249  Kokkos::deep_copy (indHost_, ind_h);
250 
251  const auto numValEnt = ind_h.extent(0) * offsetPerBlock ();
252  val_ = decltype (val_) (impl_scalar_type_dualview("val", numValEnt));
253  }
254  }
255 
256  template<class Scalar, class LO, class GO, class Node>
259  const typename local_matrix_device_type::values_type& values,
260  const LO blockSize) :
261  dist_object_type (graph.getMap ()),
262  graph_ (graph),
263  rowMeshMap_ (* (graph.getRowMap ())),
264  blockSize_ (blockSize),
265  X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
266  Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
267  pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
268  offsetPerBlock_ (blockSize * blockSize),
269  localError_ (new bool (false)),
270  errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
271  {
273  TEUCHOS_TEST_FOR_EXCEPTION(
274  ! graph_.isSorted (), std::invalid_argument, "Tpetra::"
275  "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
276  "rows (isSorted() is false). This class assumes sorted rows.");
277 
278  graphRCP_ = Teuchos::rcpFromRef(graph_);
279 
280  // Trick to test whether LO is nonpositive, without a compiler
281  // warning in case LO is unsigned (which is generally a bad idea
282  // anyway). I don't promise that the trick works, but it
283  // generally does with gcc at least, in my experience.
284  const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
285  TEUCHOS_TEST_FOR_EXCEPTION(
286  blockSizeIsNonpositive, std::invalid_argument, "Tpetra::"
287  "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
288  " <= 0. The block size must be positive.");
289 
290  domainPointMap_ = BMV::makePointMap (* (graph.getDomainMap ()), blockSize);
291  rangePointMap_ = BMV::makePointMap (* (graph.getRangeMap ()), blockSize);
292 
293  {
294  // These are rcp
295  const auto domainPointMap = getDomainMap();
296  const auto colPointMap = Teuchos::rcp
297  (new typename BMV::map_type (BMV::makePointMap (*graph_.getColMap(), blockSize_)));
298  *pointImporter_ = Teuchos::rcp
299  (new typename crs_graph_type::import_type (domainPointMap, colPointMap));
300  }
301  {
302  auto local_graph_h = graph.getLocalGraphHost ();
303  auto ptr_h = local_graph_h.row_map;
304  ptrHost_ = decltype(ptrHost_)(Kokkos::ViewAllocateWithoutInitializing("graph row offset"), ptr_h.extent(0));
305  Kokkos::deep_copy(ptrHost_, ptr_h);
306 
307  auto ind_h = local_graph_h.entries;
308  indHost_ = decltype(indHost_)(Kokkos::ViewAllocateWithoutInitializing("graph column indices"), ind_h.extent(0));
309  Kokkos::deep_copy (indHost_, ind_h);
310 
311  const auto numValEnt = ind_h.extent(0) * offsetPerBlock ();
312  TEUCHOS_ASSERT_EQUALITY(numValEnt, values.size());
313  val_ = decltype (val_) (values);
314  }
315  }
316 
317  template<class Scalar, class LO, class GO, class Node>
320  const map_type& domainPointMap,
321  const map_type& rangePointMap,
322  const LO blockSize) :
323  dist_object_type (graph.getMap ()),
324  graph_ (graph),
325  rowMeshMap_ (* (graph.getRowMap ())),
326  domainPointMap_ (domainPointMap),
327  rangePointMap_ (rangePointMap),
328  blockSize_ (blockSize),
329  X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
330  Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
331  pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
332  offsetPerBlock_ (blockSize * blockSize),
333  localError_ (new bool (false)),
334  errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
335  {
336  TEUCHOS_TEST_FOR_EXCEPTION(
337  ! graph_.isSorted (), std::invalid_argument, "Tpetra::"
338  "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
339  "rows (isSorted() is false). This class assumes sorted rows.");
340 
341  graphRCP_ = Teuchos::rcpFromRef(graph_);
342 
343  // Trick to test whether LO is nonpositive, without a compiler
344  // warning in case LO is unsigned (which is generally a bad idea
345  // anyway). I don't promise that the trick works, but it
346  // generally does with gcc at least, in my experience.
347  const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
348  TEUCHOS_TEST_FOR_EXCEPTION(
349  blockSizeIsNonpositive, std::invalid_argument, "Tpetra::"
350  "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
351  " <= 0. The block size must be positive.");
352  {
353  // These are rcp
354  const auto rcpDomainPointMap = getDomainMap();
355  const auto colPointMap = Teuchos::rcp
356  (new typename BMV::map_type (BMV::makePointMap (*graph_.getColMap(), blockSize_)));
357  *pointImporter_ = Teuchos::rcp
358  (new typename crs_graph_type::import_type (rcpDomainPointMap, colPointMap));
359  }
360  {
361  auto local_graph_h = graph.getLocalGraphHost ();
362  auto ptr_h = local_graph_h.row_map;
363  ptrHost_ = decltype(ptrHost_)(Kokkos::ViewAllocateWithoutInitializing("graph row offset"), ptr_h.extent(0));
364  // DEEP_COPY REVIEW - HOST-TO-HOST
365  Kokkos::deep_copy(ptrHost_, ptr_h);
366 
367  auto ind_h = local_graph_h.entries;
368  indHost_ = decltype(indHost_)(Kokkos::ViewAllocateWithoutInitializing("graph column indices"), ind_h.extent(0));
369  // DEEP_COPY REVIEW - HOST-TO-HOST
370  Kokkos::deep_copy (indHost_, ind_h);
371 
372  const auto numValEnt = ind_h.extent(0) * offsetPerBlock ();
373  val_ = decltype (val_) (impl_scalar_type_dualview("val", numValEnt));
374 
375  }
376  }
377 
378  template<class Scalar, class LO, class GO, class Node>
379  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
382  { // Copy constructor of map_type does a shallow copy.
383  // We're only returning by RCP for backwards compatibility.
384  return Teuchos::rcp (new map_type (domainPointMap_));
385  }
386 
387  template<class Scalar, class LO, class GO, class Node>
388  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
390  getRangeMap () const
391  { // Copy constructor of map_type does a shallow copy.
392  // We're only returning by RCP for backwards compatibility.
393  return Teuchos::rcp (new map_type (rangePointMap_));
394  }
395 
396  template<class Scalar, class LO, class GO, class Node>
397  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
399  getRowMap () const
400  {
401  return graph_.getRowMap();
402  }
403 
404  template<class Scalar, class LO, class GO, class Node>
405  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
407  getColMap () const
408  {
409  return graph_.getColMap();
410  }
411 
412  template<class Scalar, class LO, class GO, class Node>
416  {
417  return graph_.getGlobalNumRows();
418  }
419 
420  template<class Scalar, class LO, class GO, class Node>
421  size_t
424  {
425  return graph_.getLocalNumRows();
426  }
427 
428  template<class Scalar, class LO, class GO, class Node>
429  size_t
432  {
433  return graph_.getLocalMaxNumRowEntries();
434  }
435 
436  template<class Scalar, class LO, class GO, class Node>
437  void
439  apply (const mv_type& X,
440  mv_type& Y,
441  Teuchos::ETransp mode,
442  Scalar alpha,
443  Scalar beta) const
444  {
445  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
446  TEUCHOS_TEST_FOR_EXCEPTION(
447  mode != Teuchos::NO_TRANS && mode != Teuchos::TRANS && mode != Teuchos::CONJ_TRANS,
448  std::invalid_argument, "Tpetra::BlockCrsMatrix::apply: "
449  "Invalid 'mode' argument. Valid values are Teuchos::NO_TRANS, "
450  "Teuchos::TRANS, and Teuchos::CONJ_TRANS.");
451 
452  TEUCHOS_TEST_FOR_EXCEPTION(
453  !X.isConstantStride() || !Y.isConstantStride(),
454  std::invalid_argument, "Tpetra::BlockCrsMatrix::apply: "
455  "X and Y must both be constant stride");
456 
457  BMV X_view;
458  BMV Y_view;
459  const LO blockSize = getBlockSize ();
460  try {
461  X_view = BMV (X, * (graph_.getDomainMap ()), blockSize);
462  Y_view = BMV (Y, * (graph_.getRangeMap ()), blockSize);
463  }
464  catch (std::invalid_argument& e) {
465  TEUCHOS_TEST_FOR_EXCEPTION(
466  true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
467  "apply: Either the input MultiVector X or the output MultiVector Y "
468  "cannot be viewed as a BlockMultiVector, given this BlockCrsMatrix's "
469  "graph. BlockMultiVector's constructor threw the following exception: "
470  << e.what ());
471  }
472 
473  try {
474  // mfh 16 May 2014: Casting 'this' to nonconst is icky, but it's
475  // either that or mark fields of this class as 'mutable'. The
476  // problem is that applyBlock wants to do lazy initialization of
477  // temporary block multivectors.
478  const_cast<this_BCRS_type&> (*this).applyBlock (X_view, Y_view, mode, alpha, beta);
479  } catch (std::invalid_argument& e) {
480  TEUCHOS_TEST_FOR_EXCEPTION(
481  true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
482  "apply: The implementation method applyBlock complained about having "
483  "an invalid argument. It reported the following: " << e.what ());
484  } catch (std::logic_error& e) {
485  TEUCHOS_TEST_FOR_EXCEPTION(
486  true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
487  "apply: The implementation method applyBlock complained about a "
488  "possible bug in its implementation. It reported the following: "
489  << e.what ());
490  } catch (std::exception& e) {
491  TEUCHOS_TEST_FOR_EXCEPTION(
492  true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
493  "apply: The implementation method applyBlock threw an exception which "
494  "is neither std::invalid_argument nor std::logic_error, but is a "
495  "subclass of std::exception. It reported the following: "
496  << e.what ());
497  } catch (...) {
498  TEUCHOS_TEST_FOR_EXCEPTION(
499  true, std::logic_error, "Tpetra::BlockCrsMatrix::"
500  "apply: The implementation method applyBlock threw an exception which "
501  "is not an instance of a subclass of std::exception. This probably "
502  "indicates a bug. Please report this to the Tpetra developers.");
503  }
504  }
505 
506  template<class Scalar, class LO, class GO, class Node>
507  void
511  Teuchos::ETransp mode,
512  const Scalar alpha,
513  const Scalar beta)
514  {
515  TEUCHOS_TEST_FOR_EXCEPTION(
516  X.getBlockSize () != Y.getBlockSize (), std::invalid_argument,
517  "Tpetra::BlockCrsMatrix::applyBlock: "
518  "X and Y have different block sizes. X.getBlockSize() = "
519  << X.getBlockSize () << " != Y.getBlockSize() = "
520  << Y.getBlockSize () << ".");
521 
522  if (mode == Teuchos::NO_TRANS) {
523  applyBlockNoTrans (X, Y, alpha, beta);
524  } else if (mode == Teuchos::TRANS || mode == Teuchos::CONJ_TRANS) {
525  applyBlockTrans (X, Y, mode, alpha, beta);
526  } else {
527  TEUCHOS_TEST_FOR_EXCEPTION(
528  true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
529  "applyBlock: Invalid 'mode' argument. Valid values are "
530  "Teuchos::NO_TRANS, Teuchos::TRANS, and Teuchos::CONJ_TRANS.");
531  }
532  }
533 
534  template<class Scalar, class LO, class GO, class Node>
535  void
538  const Import<LO, GO, Node>& importer) const
539  {
540  using Teuchos::RCP;
541  using Teuchos::rcp;
542  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
543 
544  // Right now, we make many assumptions...
545  TEUCHOS_TEST_FOR_EXCEPTION(!destMatrix.is_null(), std::invalid_argument,
546  "destMatrix is required to be null.");
547 
548  // BlockCrsMatrix requires a complete graph at construction.
549  // So first step is to import and fill complete the destGraph.
550  RCP<crs_graph_type> srcGraph = rcp (new crs_graph_type(this->getCrsGraph()));
551  RCP<crs_graph_type> destGraph = importAndFillCompleteCrsGraph<crs_graph_type>(srcGraph, importer,
552  srcGraph->getDomainMap(),
553  srcGraph->getRangeMap());
554 
555 
556  // Final step, create and import the destMatrix.
557  destMatrix = rcp (new this_BCRS_type (*destGraph, getBlockSize()));
558  destMatrix->doImport(*this, importer, Tpetra::INSERT);
559  }
560 
561 
562  template<class Scalar, class LO, class GO, class Node>
563  void
566  const Export<LO, GO, Node>& exporter) const
567  {
568  using Teuchos::RCP;
569  using Teuchos::rcp;
570  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
571 
572  // Right now, we make many assumptions...
573  TEUCHOS_TEST_FOR_EXCEPTION(!destMatrix.is_null(), std::invalid_argument,
574  "destMatrix is required to be null.");
575 
576  // BlockCrsMatrix requires a complete graph at construction.
577  // So first step is to import and fill complete the destGraph.
578  RCP<crs_graph_type> srcGraph = rcp (new crs_graph_type(this->getCrsGraph()));
579  RCP<crs_graph_type> destGraph = exportAndFillCompleteCrsGraph<crs_graph_type>(srcGraph, exporter,
580  srcGraph->getDomainMap(),
581  srcGraph->getRangeMap());
582 
583 
584  // Final step, create and import the destMatrix.
585  destMatrix = rcp (new this_BCRS_type (*destGraph, getBlockSize()));
586  destMatrix->doExport(*this, exporter, Tpetra::INSERT);
587  }
588 
589 
590  template<class Scalar, class LO, class GO, class Node>
591  void
593  setAllToScalar (const Scalar& alpha)
594  {
595  auto val_d = val_.getDeviceView(Access::OverwriteAll);
596  Kokkos::deep_copy(val_d, alpha);
597  }
598 
599  template<class Scalar, class LO, class GO, class Node>
600  LO
602  replaceLocalValues (const LO localRowInd,
603  const LO colInds[],
604  const Scalar vals[],
605  const LO numColInds) const
606  {
607  Kokkos::View<ptrdiff_t*,Kokkos::HostSpace>
608  offsets_host_view(Kokkos::ViewAllocateWithoutInitializing("offsets"), numColInds);
609  ptrdiff_t * offsets = offsets_host_view.data();
610  const LO numOffsets = this->getLocalRowOffsets(localRowInd, offsets, colInds, numColInds);
611  const LO validCount = this->replaceLocalValuesByOffsets(localRowInd, offsets, vals, numOffsets);
612  return validCount;
613  }
614 
615  template <class Scalar, class LO, class GO, class Node>
616  void
618  getLocalDiagOffsets (const Kokkos::View<size_t*, device_type,
619  Kokkos::MemoryUnmanaged>& offsets) const
620  {
621  graph_.getLocalDiagOffsets (offsets);
622  }
623 
624  template <class Scalar, class LO, class GO, class Node>
625  void
627  getLocalDiagCopy (const Kokkos::View<impl_scalar_type***, device_type,
628  Kokkos::MemoryUnmanaged>& diag,
629  const Kokkos::View<const size_t*, device_type,
630  Kokkos::MemoryUnmanaged>& offsets) const
631  {
632  using Kokkos::parallel_for;
633  const char prefix[] = "Tpetra::BlockCrsMatrix::getLocalDiagCopy (2-arg): ";
634 
635  const LO lclNumMeshRows = static_cast<LO> (rowMeshMap_.getLocalNumElements ());
636  const LO blockSize = this->getBlockSize ();
637  TEUCHOS_TEST_FOR_EXCEPTION
638  (static_cast<LO> (diag.extent (0)) < lclNumMeshRows ||
639  static_cast<LO> (diag.extent (1)) < blockSize ||
640  static_cast<LO> (diag.extent (2)) < blockSize,
641  std::invalid_argument, prefix <<
642  "The input Kokkos::View is not big enough to hold all the data.");
643  TEUCHOS_TEST_FOR_EXCEPTION
644  (static_cast<LO> (offsets.size ()) < lclNumMeshRows, std::invalid_argument,
645  prefix << "offsets.size() = " << offsets.size () << " < local number of "
646  "diagonal blocks " << lclNumMeshRows << ".");
647 
648  typedef Kokkos::RangePolicy<execution_space, LO> policy_type;
649  typedef GetLocalDiagCopy<Scalar, LO, GO, Node> functor_type;
650 
651  // FIXME (mfh 26 May 2016) Not really OK to const_cast here, since
652  // we reserve the right to do lazy allocation of device data. (We
653  // don't plan to do lazy allocation for host data; the host
654  // version of the data always exists.)
655  auto val_d = val_.getDeviceView(Access::ReadOnly);
656  parallel_for (policy_type (0, lclNumMeshRows),
657  functor_type (diag, val_d, offsets,
658  graph_.getLocalGraphDevice ().row_map, blockSize_));
659  }
660 
661  template<class Scalar, class LO, class GO, class Node>
662  LO
664  absMaxLocalValues (const LO localRowInd,
665  const LO colInds[],
666  const Scalar vals[],
667  const LO numColInds) const
668  {
669  Kokkos::View<ptrdiff_t*,Kokkos::HostSpace>
670  offsets_host_view(Kokkos::ViewAllocateWithoutInitializing("offsets"), numColInds);
671  ptrdiff_t * offsets = offsets_host_view.data();
672  const LO numOffsets = this->getLocalRowOffsets(localRowInd, offsets, colInds, numColInds);
673  const LO validCount = this->absMaxLocalValuesByOffsets(localRowInd, offsets, vals, numOffsets);
674  return validCount;
675  }
676 
677 
678  template<class Scalar, class LO, class GO, class Node>
679  LO
681  sumIntoLocalValues (const LO localRowInd,
682  const LO colInds[],
683  const Scalar vals[],
684  const LO numColInds) const
685  {
686  Kokkos::View<ptrdiff_t*,Kokkos::HostSpace>
687  offsets_host_view(Kokkos::ViewAllocateWithoutInitializing("offsets"), numColInds);
688  ptrdiff_t * offsets = offsets_host_view.data();
689  const LO numOffsets = this->getLocalRowOffsets(localRowInd, offsets, colInds, numColInds);
690  const LO validCount = this->sumIntoLocalValuesByOffsets(localRowInd, offsets, vals, numOffsets);
691  return validCount;
692  }
693  template<class Scalar, class LO, class GO, class Node>
694  void
696  getLocalRowCopy (LO LocalRow,
697  nonconst_local_inds_host_view_type &Indices,
698  nonconst_values_host_view_type &Values,
699  size_t& NumEntries) const
700  {
701  auto vals = getValuesHost(LocalRow);
702  const LO numInds = ptrHost_(LocalRow+1) - ptrHost_(LocalRow);
703  if (numInds > (LO)Indices.extent(0) || numInds*blockSize_*blockSize_ > (LO)Values.extent(0)) {
704  TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error,
705  "Tpetra::BlockCrsMatrix::getLocalRowCopy : Column and/or values array is not large enough to hold "
706  << numInds << " row entries");
707  }
708  const LO * colInds = indHost_.data() + ptrHost_(LocalRow);
709  for (LO i=0; i<numInds; ++i) {
710  Indices[i] = colInds[i];
711  }
712  for (LO i=0; i<numInds*blockSize_*blockSize_; ++i) {
713  Values[i] = vals[i];
714  }
715  NumEntries = numInds;
716  }
717 
718  template<class Scalar, class LO, class GO, class Node>
719  LO
721  getLocalRowOffsets (const LO localRowInd,
722  ptrdiff_t offsets[],
723  const LO colInds[],
724  const LO numColInds) const
725  {
726  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
727  // We got no offsets, because the input local row index is
728  // invalid on the calling process. That may not be an error, if
729  // numColInds is zero anyway; it doesn't matter. This is the
730  // advantage of returning the number of valid indices.
731  return static_cast<LO> (0);
732  }
733 
734  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
735  LO hint = 0; // Guess for the relative offset into the current row
736  LO validCount = 0; // number of valid column indices in colInds
737 
738  for (LO k = 0; k < numColInds; ++k) {
739  const LO relBlockOffset =
740  this->findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
741  if (relBlockOffset != LINV) {
742  offsets[k] = static_cast<ptrdiff_t> (relBlockOffset);
743  hint = relBlockOffset + 1;
744  ++validCount;
745  }
746  }
747  return validCount;
748  }
749 
750 
751  template<class Scalar, class LO, class GO, class Node>
752  LO
754  replaceLocalValuesByOffsets (const LO localRowInd,
755  const ptrdiff_t offsets[],
756  const Scalar vals[],
757  const LO numOffsets) const
758  {
759  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
760  // We modified no values, because the input local row index is
761  // invalid on the calling process. That may not be an error, if
762  // numColInds is zero anyway; it doesn't matter. This is the
763  // advantage of returning the number of valid indices.
764  return static_cast<LO> (0);
765  }
766  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
767  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
768  auto val_out = const_cast<this_BCRS_type&>(*this).getValuesHostNonConst(localRowInd);
769  impl_scalar_type* vOut = val_out.data();
770 
771  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
772  const ptrdiff_t STINV = Teuchos::OrdinalTraits<ptrdiff_t>::invalid ();
773  size_t pointOffset = 0; // Current offset into input values
774  LO validCount = 0; // number of valid offsets
775 
776  for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
777  const size_t blockOffset = offsets[k]*perBlockSize;
778  if (offsets[k] != STINV) {
779  little_block_type A_old = getNonConstLocalBlockFromInput (vOut, blockOffset);
780  const_little_block_type A_new = getConstLocalBlockFromInput (vIn, pointOffset);
781  COPY (A_new, A_old);
782  ++validCount;
783  }
784  }
785  return validCount;
786  }
787 
788 
789  template<class Scalar, class LO, class GO, class Node>
790  LO
792  absMaxLocalValuesByOffsets (const LO localRowInd,
793  const ptrdiff_t offsets[],
794  const Scalar vals[],
795  const LO numOffsets) const
796  {
797  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
798  // We modified no values, because the input local row index is
799  // invalid on the calling process. That may not be an error, if
800  // numColInds is zero anyway; it doesn't matter. This is the
801  // advantage of returning the number of valid indices.
802  return static_cast<LO> (0);
803  }
804  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
805  auto val_out = getValuesHost(localRowInd);
806  impl_scalar_type* vOut = const_cast<impl_scalar_type*>(val_out.data());
807 
808  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
809  const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
810  size_t pointOffset = 0; // Current offset into input values
811  LO validCount = 0; // number of valid offsets
812 
813  for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
814  const size_t blockOffset = offsets[k]*perBlockSize;
815  if (blockOffset != STINV) {
816  little_block_type A_old = getNonConstLocalBlockFromInput (vOut, blockOffset);
817  const_little_block_type A_new = getConstLocalBlockFromInput (vIn, pointOffset);
818  ::Tpetra::Impl::absMax (A_old, A_new);
819  ++validCount;
820  }
821  }
822  return validCount;
823  }
824 
825 
826  template<class Scalar, class LO, class GO, class Node>
827  LO
829  sumIntoLocalValuesByOffsets (const LO localRowInd,
830  const ptrdiff_t offsets[],
831  const Scalar vals[],
832  const LO numOffsets) const
833  {
834  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
835  // We modified no values, because the input local row index is
836  // invalid on the calling process. That may not be an error, if
837  // numColInds is zero anyway; it doesn't matter. This is the
838  // advantage of returning the number of valid indices.
839  return static_cast<LO> (0);
840  }
841  const impl_scalar_type ONE = static_cast<impl_scalar_type> (1.0);
842  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
843  typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_BCRS_type;
844  auto val_out = const_cast<this_BCRS_type&>(*this).getValuesHostNonConst(localRowInd);
845  impl_scalar_type* vOut = val_out.data();
846 
847  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
848  const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
849  size_t pointOffset = 0; // Current offset into input values
850  LO validCount = 0; // number of valid offsets
851 
852  for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
853  const size_t blockOffset = offsets[k]*perBlockSize;
854  if (blockOffset != STINV) {
855  little_block_type A_old = getNonConstLocalBlockFromInput (vOut, blockOffset);
856  const_little_block_type A_new = getConstLocalBlockFromInput (vIn, pointOffset);
857  AXPY (ONE, A_new, A_old);
858  ++validCount;
859  }
860  }
861  return validCount;
862  }
863 
864  template<class Scalar, class LO, class GO, class Node>
866  impl_scalar_type_dualview::t_host::const_type
868  getValuesHost () const
869  {
870  return val_.getHostView(Access::ReadOnly);
871  }
872 
873  template<class Scalar, class LO, class GO, class Node>
874  typename BlockCrsMatrix<Scalar, LO, GO, Node>::
875  impl_scalar_type_dualview::t_dev::const_type
876  BlockCrsMatrix<Scalar, LO, GO, Node>::
877  getValuesDevice () const
878  {
879  return val_.getDeviceView(Access::ReadOnly);
880  }
881 
882  template<class Scalar, class LO, class GO, class Node>
883  typename BlockCrsMatrix<Scalar, LO, GO, Node>::
884  impl_scalar_type_dualview::t_host
887  {
888  return val_.getHostView(Access::ReadWrite);
889  }
890 
891  template<class Scalar, class LO, class GO, class Node>
893  impl_scalar_type_dualview::t_dev
895  getValuesDeviceNonConst () const
896  {
897  return val_.getDeviceView(Access::ReadWrite);
898  }
899 
900  template<class Scalar, class LO, class GO, class Node>
901  typename BlockCrsMatrix<Scalar, LO, GO, Node>::
902  impl_scalar_type_dualview::t_host::const_type
903  BlockCrsMatrix<Scalar, LO, GO, Node>::
904  getValuesHost (const LO& lclRow) const
905  {
906  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
907  auto val = val_.getHostView(Access::ReadOnly);
908  auto r_val = Kokkos::subview(val, Kokkos::pair<LO,LO>(ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize));
909  return r_val;
910  }
911 
912  template<class Scalar, class LO, class GO, class Node>
914  impl_scalar_type_dualview::t_dev::const_type
916  getValuesDevice (const LO& lclRow) const
917  {
918  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
919  auto val = val_.getDeviceView(Access::ReadOnly);
920  auto r_val = Kokkos::subview(val, Kokkos::pair<LO,LO>(ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize));
921  return r_val;
922  }
923 
924  template<class Scalar, class LO, class GO, class Node>
927  getValuesHostNonConst (const LO& lclRow)
928  {
929  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
930  auto val = val_.getHostView(Access::ReadWrite);
931  auto r_val = Kokkos::subview(val, Kokkos::pair<LO,LO>(ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize));
932  return r_val;
933  }
934 
935  template<class Scalar, class LO, class GO, class Node>
938  getValuesDeviceNonConst (const LO& lclRow)
939  {
940  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
941  auto val = val_.getDeviceView(Access::ReadWrite);
942  auto r_val = Kokkos::subview(val, Kokkos::pair<LO,LO>(ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize));
943  return r_val;
944  }
945 
946  template<class Scalar, class LO, class GO, class Node>
947  size_t
949  getNumEntriesInLocalRow (const LO localRowInd) const
950  {
951  const size_t numEntInGraph = graph_.getNumEntriesInLocalRow (localRowInd);
952  if (numEntInGraph == Teuchos::OrdinalTraits<size_t>::invalid ()) {
953  return static_cast<LO> (0); // the calling process doesn't have that row
954  } else {
955  return static_cast<LO> (numEntInGraph);
956  }
957  }
958 
959  template<class Scalar, class LO, class GO, class Node>
960  typename BlockCrsMatrix<Scalar, LO, GO, Node>::local_matrix_device_type
963  {
964  auto numCols = this->graph_.getColMap()->getLocalNumElements();
965  auto val = val_.getDeviceView(Access::ReadWrite);
966  const LO blockSize = this->getBlockSize ();
967  const auto graph = this->graph_.getLocalGraphDevice ();
968 
969  return local_matrix_device_type("Tpetra::BlockCrsMatrix::lclMatrixDevice",
970  numCols,
971  val,
972  graph,
973  blockSize);
974  }
975 
976  template<class Scalar, class LO, class GO, class Node>
977  void
981  const Teuchos::ETransp mode,
982  const Scalar alpha,
983  const Scalar beta)
984  {
985  (void) X;
986  (void) Y;
987  (void) mode;
988  (void) alpha;
989  (void) beta;
990 
991  TEUCHOS_TEST_FOR_EXCEPTION(
992  true, std::logic_error, "Tpetra::BlockCrsMatrix::apply: "
993  "transpose and conjugate transpose modes are not yet implemented.");
994  }
995 
996  template<class Scalar, class LO, class GO, class Node>
997  void
998  BlockCrsMatrix<Scalar, LO, GO, Node>::
999  applyBlockNoTrans (const BlockMultiVector<Scalar, LO, GO, Node>& X,
1000  BlockMultiVector<Scalar, LO, GO, Node>& Y,
1001  const Scalar alpha,
1002  const Scalar beta)
1003  {
1004  using Teuchos::RCP;
1005  using Teuchos::rcp;
1006  typedef ::Tpetra::Import<LO, GO, Node> import_type;
1007  typedef ::Tpetra::Export<LO, GO, Node> export_type;
1008  const Scalar zero = STS::zero ();
1009  const Scalar one = STS::one ();
1010  RCP<const import_type> import = graph_.getImporter ();
1011  // "export" is a reserved C++ keyword, so we can't use it.
1012  RCP<const export_type> theExport = graph_.getExporter ();
1013  const char prefix[] = "Tpetra::BlockCrsMatrix::applyBlockNoTrans: ";
1014 
1015  if (alpha == zero) {
1016  if (beta == zero) {
1017  Y.putScalar (zero); // replace Inf or NaN (BLAS rules)
1018  }
1019  else if (beta != one) {
1020  Y.scale (beta);
1021  }
1022  }
1023  else { // alpha != 0
1024  const BMV* X_colMap = NULL;
1025  if (import.is_null ()) {
1026  try {
1027  X_colMap = &X;
1028  }
1029  catch (std::exception& e) {
1030  TEUCHOS_TEST_FOR_EXCEPTION
1031  (true, std::logic_error, prefix << "Tpetra::MultiVector::"
1032  "operator= threw an exception: " << std::endl << e.what ());
1033  }
1034  }
1035  else {
1036  // X_colMap_ is a pointer to a pointer to BMV. Ditto for
1037  // Y_rowMap_ below. This lets us do lazy initialization
1038  // correctly with view semantics of BlockCrsMatrix. All views
1039  // of this BlockCrsMatrix have the same outer pointer. That
1040  // way, we can set the inner pointer in one view, and all
1041  // other views will see it.
1042  if ((*X_colMap_).is_null () ||
1043  (**X_colMap_).getNumVectors () != X.getNumVectors () ||
1044  (**X_colMap_).getBlockSize () != X.getBlockSize ()) {
1045  *X_colMap_ = rcp (new BMV (* (graph_.getColMap ()), getBlockSize (),
1046  static_cast<LO> (X.getNumVectors ())));
1047  }
1048  (*X_colMap_)->getMultiVectorView().doImport (X.getMultiVectorView (),
1049  **pointImporter_,
1051  try {
1052  X_colMap = &(**X_colMap_);
1053  }
1054  catch (std::exception& e) {
1055  TEUCHOS_TEST_FOR_EXCEPTION
1056  (true, std::logic_error, prefix << "Tpetra::MultiVector::"
1057  "operator= threw an exception: " << std::endl << e.what ());
1058  }
1059  }
1060 
1061  BMV* Y_rowMap = NULL;
1062  if (theExport.is_null ()) {
1063  Y_rowMap = &Y;
1064  }
1065  else if ((*Y_rowMap_).is_null () ||
1066  (**Y_rowMap_).getNumVectors () != Y.getNumVectors () ||
1067  (**Y_rowMap_).getBlockSize () != Y.getBlockSize ()) {
1068  *Y_rowMap_ = rcp (new BMV (* (graph_.getRowMap ()), getBlockSize (),
1069  static_cast<LO> (X.getNumVectors ())));
1070  try {
1071  Y_rowMap = &(**Y_rowMap_);
1072  }
1073  catch (std::exception& e) {
1074  TEUCHOS_TEST_FOR_EXCEPTION(
1075  true, std::logic_error, prefix << "Tpetra::MultiVector::"
1076  "operator= threw an exception: " << std::endl << e.what ());
1077  }
1078  }
1079 
1080  try {
1081  localApplyBlockNoTrans (*X_colMap, *Y_rowMap, alpha, beta);
1082  }
1083  catch (std::exception& e) {
1084  TEUCHOS_TEST_FOR_EXCEPTION
1085  (true, std::runtime_error, prefix << "localApplyBlockNoTrans threw "
1086  "an exception: " << e.what ());
1087  }
1088  catch (...) {
1089  TEUCHOS_TEST_FOR_EXCEPTION
1090  (true, std::runtime_error, prefix << "localApplyBlockNoTrans threw "
1091  "an exception not a subclass of std::exception.");
1092  }
1093 
1094  if (! theExport.is_null ()) {
1095  Y.doExport (*Y_rowMap, *theExport, ::Tpetra::REPLACE);
1096  }
1097  }
1098  }
1099 
1100 // clang-format on
1101 template <class Scalar, class LO, class GO, class Node>
1102 void BlockCrsMatrix<Scalar, LO, GO, Node>::localApplyBlockNoTrans(
1103  const BlockMultiVector<Scalar, LO, GO, Node> &X,
1104  BlockMultiVector<Scalar, LO, GO, Node> &Y, const Scalar alpha,
1105  const Scalar beta) {
1106  ::Tpetra::Details::ProfilingRegion profile_region(
1107  "Tpetra::BlockCrsMatrix::localApplyBlockNoTrans");
1108  const impl_scalar_type alpha_impl = alpha;
1109  const auto graph = this->graph_.getLocalGraphDevice();
1110 
1111  mv_type X_mv = X.getMultiVectorView();
1112  mv_type Y_mv = Y.getMultiVectorView();
1113  auto X_lcl = X_mv.getLocalViewDevice(Access::ReadOnly);
1114  auto Y_lcl = Y_mv.getLocalViewDevice(Access::ReadWrite);
1115 
1116 #if KOKKOSKERNELS_VERSION >= 40299
1117  auto A_lcl = getLocalMatrixDevice();
1118  if(!applyHelper.get()) {
1119  // The apply helper does not exist, so create it
1120  applyHelper = std::make_shared<ApplyHelper>(A_lcl.nnz(), A_lcl.graph.row_map);
1121  }
1122  if(applyHelper->shouldUseIntRowptrs())
1123  {
1124  auto A_lcl_int_rowptrs = applyHelper->getIntRowptrMatrix(A_lcl);
1125  KokkosSparse::spmv(
1126  &applyHelper->handle_int, KokkosSparse::NoTranspose,
1127  alpha_impl, A_lcl_int_rowptrs, X_lcl, beta, Y_lcl);
1128  }
1129  else
1130  {
1131  KokkosSparse::spmv(
1132  &applyHelper->handle, KokkosSparse::NoTranspose,
1133  alpha_impl, A_lcl, X_lcl, beta, Y_lcl);
1134  }
1135 #else
1136  auto A_lcl = getLocalMatrixDevice();
1137  KokkosSparse::spmv(KokkosSparse::NoTranspose, alpha_impl, A_lcl, X_lcl, beta,
1138  Y_lcl);
1139 #endif
1140 }
1141 // clang-format off
1142 
1143  template<class Scalar, class LO, class GO, class Node>
1144  LO
1145  BlockCrsMatrix<Scalar, LO, GO, Node>::
1146  findRelOffsetOfColumnIndex (const LO localRowIndex,
1147  const LO colIndexToFind,
1148  const LO hint) const
1149  {
1150  const size_t absStartOffset = ptrHost_[localRowIndex];
1151  const size_t absEndOffset = ptrHost_[localRowIndex+1];
1152  const LO numEntriesInRow = static_cast<LO> (absEndOffset - absStartOffset);
1153  // Amortize pointer arithmetic over the search loop.
1154  const LO* const curInd = indHost_.data () + absStartOffset;
1155 
1156  // If the hint was correct, then the hint is the offset to return.
1157  if (hint < numEntriesInRow && curInd[hint] == colIndexToFind) {
1158  // Always return the offset relative to the current row.
1159  return hint;
1160  }
1161 
1162  // The hint was wrong, so we must search for the given column
1163  // index in the column indices for the given row.
1164  LO relOffset = Teuchos::OrdinalTraits<LO>::invalid ();
1165 
1166  // We require that the graph have sorted rows. However, binary
1167  // search only pays if the current row is longer than a certain
1168  // amount. We set this to 32, but you might want to tune this.
1169  const LO maxNumEntriesForLinearSearch = 32;
1170  if (numEntriesInRow > maxNumEntriesForLinearSearch) {
1171  // Use binary search. It would probably be better for us to
1172  // roll this loop by hand. If we wrote it right, a smart
1173  // compiler could perhaps use conditional loads and avoid
1174  // branches (according to Jed Brown on May 2014).
1175  const LO* beg = curInd;
1176  const LO* end = curInd + numEntriesInRow;
1177  std::pair<const LO*, const LO*> p =
1178  std::equal_range (beg, end, colIndexToFind);
1179  if (p.first != p.second) {
1180  // offset is relative to the current row
1181  relOffset = static_cast<LO> (p.first - beg);
1182  }
1183  }
1184  else { // use linear search
1185  for (LO k = 0; k < numEntriesInRow; ++k) {
1186  if (colIndexToFind == curInd[k]) {
1187  relOffset = k; // offset is relative to the current row
1188  break;
1189  }
1190  }
1191  }
1192 
1193  return relOffset;
1194  }
1195 
1196  template<class Scalar, class LO, class GO, class Node>
1197  LO
1198  BlockCrsMatrix<Scalar, LO, GO, Node>::
1199  offsetPerBlock () const
1200  {
1201  return offsetPerBlock_;
1202  }
1203 
1204  template<class Scalar, class LO, class GO, class Node>
1205  typename BlockCrsMatrix<Scalar, LO, GO, Node>::const_little_block_type
1206  BlockCrsMatrix<Scalar, LO, GO, Node>::
1207  getConstLocalBlockFromInput (const impl_scalar_type* val,
1208  const size_t pointOffset) const
1209  {
1210  // Row major blocks
1211  const LO rowStride = blockSize_;
1212  return const_little_block_type (val + pointOffset, blockSize_, rowStride);
1213  }
1214 
1215  template<class Scalar, class LO, class GO, class Node>
1216  typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_type
1217  BlockCrsMatrix<Scalar, LO, GO, Node>::
1218  getNonConstLocalBlockFromInput (impl_scalar_type* val,
1219  const size_t pointOffset) const
1220  {
1221  // Row major blocks
1222  const LO rowStride = blockSize_;
1223  return little_block_type (val + pointOffset, blockSize_, rowStride);
1224  }
1225 
1226  template<class Scalar, class LO, class GO, class Node>
1227  typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_host_type
1228  BlockCrsMatrix<Scalar, LO, GO, Node>::
1229  getNonConstLocalBlockFromInputHost (impl_scalar_type* val,
1230  const size_t pointOffset) const
1231  {
1232  // Row major blocks
1233  const LO rowStride = blockSize_;
1234  const size_t bs2 = blockSize_ * blockSize_;
1235  return little_block_host_type (val + bs2 * pointOffset, blockSize_, rowStride);
1236  }
1237 
1238  template<class Scalar, class LO, class GO, class Node>
1239  typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_type
1240  BlockCrsMatrix<Scalar, LO, GO, Node>::
1241  getLocalBlockDeviceNonConst (const LO localRowInd, const LO localColInd) const
1242  {
1243  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
1244 
1245  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1246  const LO relBlockOffset = this->findRelOffsetOfColumnIndex (localRowInd, localColInd);
1247  if (relBlockOffset != Teuchos::OrdinalTraits<LO>::invalid ()) {
1248  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1249  auto vals = const_cast<this_BCRS_type&>(*this).getValuesDeviceNonConst();
1250  auto r_val = getNonConstLocalBlockFromInput (vals.data(), absBlockOffset);
1251  return r_val;
1252  }
1253  else {
1254  return little_block_type ();
1255  }
1256  }
1257 
1258  template<class Scalar, class LO, class GO, class Node>
1259  typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_host_type
1260  BlockCrsMatrix<Scalar, LO, GO, Node>::
1261  getLocalBlockHostNonConst (const LO localRowInd, const LO localColInd) const
1262  {
1263  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
1264 
1265  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1266  const LO relBlockOffset = this->findRelOffsetOfColumnIndex (localRowInd, localColInd);
1267  if (relBlockOffset != Teuchos::OrdinalTraits<LO>::invalid ()) {
1268  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1269  auto vals = const_cast<this_BCRS_type&>(*this).getValuesHostNonConst();
1270  auto r_val = getNonConstLocalBlockFromInputHost (vals.data(), absBlockOffset);
1271  return r_val;
1272  }
1273  else {
1274  return little_block_host_type ();
1275  }
1276  }
1277 
1278 
1279  template<class Scalar, class LO, class GO, class Node>
1280  bool
1281  BlockCrsMatrix<Scalar, LO, GO, Node>::
1282  checkSizes (const ::Tpetra::SrcDistObject& source)
1283  {
1284  using std::endl;
1285  typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_BCRS_type;
1286  const this_BCRS_type* src = dynamic_cast<const this_BCRS_type* > (&source);
1287 
1288  if (src == NULL) {
1289  std::ostream& err = markLocalErrorAndGetStream ();
1290  err << "checkSizes: The source object of the Import or Export "
1291  "must be a BlockCrsMatrix with the same template parameters as the "
1292  "target object." << endl;
1293  }
1294  else {
1295  // Use a string of ifs, not if-elseifs, because we want to know
1296  // all the errors.
1297  if (src->getBlockSize () != this->getBlockSize ()) {
1298  std::ostream& err = markLocalErrorAndGetStream ();
1299  err << "checkSizes: The source and target objects of the Import or "
1300  << "Export must have the same block sizes. The source's block "
1301  << "size = " << src->getBlockSize () << " != the target's block "
1302  << "size = " << this->getBlockSize () << "." << endl;
1303  }
1304  if (! src->graph_.isFillComplete ()) {
1305  std::ostream& err = markLocalErrorAndGetStream ();
1306  err << "checkSizes: The source object of the Import or Export is "
1307  "not fill complete. Both source and target objects must be fill "
1308  "complete." << endl;
1309  }
1310  if (! this->graph_.isFillComplete ()) {
1311  std::ostream& err = markLocalErrorAndGetStream ();
1312  err << "checkSizes: The target object of the Import or Export is "
1313  "not fill complete. Both source and target objects must be fill "
1314  "complete." << endl;
1315  }
1316  if (src->graph_.getColMap ().is_null ()) {
1317  std::ostream& err = markLocalErrorAndGetStream ();
1318  err << "checkSizes: The source object of the Import or Export does "
1319  "not have a column Map. Both source and target objects must have "
1320  "column Maps." << endl;
1321  }
1322  if (this->graph_.getColMap ().is_null ()) {
1323  std::ostream& err = markLocalErrorAndGetStream ();
1324  err << "checkSizes: The target object of the Import or Export does "
1325  "not have a column Map. Both source and target objects must have "
1326  "column Maps." << endl;
1327  }
1328  }
1329 
1330  return ! (* (this->localError_));
1331  }
1332 
1333  template<class Scalar, class LO, class GO, class Node>
1334  void
1337  (const ::Tpetra::SrcDistObject& source,
1338  const size_t numSameIDs,
1339  const Kokkos::DualView<const local_ordinal_type*,
1340  buffer_device_type>& permuteToLIDs,
1341  const Kokkos::DualView<const local_ordinal_type*,
1342  buffer_device_type>& permuteFromLIDs,
1343  const CombineMode /*CM*/)
1344  {
1345  using ::Tpetra::Details::Behavior;
1347  using ::Tpetra::Details::ProfilingRegion;
1348  using std::endl;
1349  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
1350 
1351  ProfilingRegion profile_region("Tpetra::BlockCrsMatrix::copyAndPermute");
1352  const bool debug = Behavior::debug();
1353  const bool verbose = Behavior::verbose();
1354 
1355  // Define this function prefix
1356  std::string prefix;
1357  {
1358  std::ostringstream os;
1359  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1360  os << "Proc " << myRank << ": BlockCrsMatrix::copyAndPermute : " << endl;
1361  prefix = os.str();
1362  }
1363 
1364  // check if this already includes a local error
1365  if (* (this->localError_)) {
1366  std::ostream& err = this->markLocalErrorAndGetStream ();
1367  err << prefix
1368  << "The target object of the Import or Export is already in an error state."
1369  << endl;
1370  return;
1371  }
1372 
1373  //
1374  // Verbose input dual view status
1375  //
1376  if (verbose) {
1377  std::ostringstream os;
1378  os << prefix << endl
1379  << prefix << " " << dualViewStatusToString (permuteToLIDs, "permuteToLIDs") << endl
1380  << prefix << " " << dualViewStatusToString (permuteFromLIDs, "permuteFromLIDs") << endl;
1381  std::cerr << os.str ();
1382  }
1383 
1387  if (permuteToLIDs.extent (0) != permuteFromLIDs.extent (0)) {
1388  std::ostream& err = this->markLocalErrorAndGetStream ();
1389  err << prefix
1390  << "permuteToLIDs.extent(0) = " << permuteToLIDs.extent (0)
1391  << " != permuteFromLIDs.extent(0) = " << permuteFromLIDs.extent(0)
1392  << "." << endl;
1393  return;
1394  }
1395  if (permuteToLIDs.need_sync_host () || permuteFromLIDs.need_sync_host ()) {
1396  std::ostream& err = this->markLocalErrorAndGetStream ();
1397  err << prefix
1398  << "Both permuteToLIDs and permuteFromLIDs must be sync'd to host."
1399  << endl;
1400  return;
1401  }
1402 
1403  const this_BCRS_type* src = dynamic_cast<const this_BCRS_type* > (&source);
1404  if (src == NULL) {
1405  std::ostream& err = this->markLocalErrorAndGetStream ();
1406  err << prefix
1407  << "The source (input) object of the Import or "
1408  "Export is either not a BlockCrsMatrix, or does not have the right "
1409  "template parameters. checkSizes() should have caught this. "
1410  "Please report this bug to the Tpetra developers." << endl;
1411  return;
1412  }
1413 
1414  bool lclErr = false;
1415 #ifdef HAVE_TPETRA_DEBUG
1416  std::set<LO> invalidSrcCopyRows;
1417  std::set<LO> invalidDstCopyRows;
1418  std::set<LO> invalidDstCopyCols;
1419  std::set<LO> invalidDstPermuteCols;
1420  std::set<LO> invalidPermuteFromRows;
1421 #endif // HAVE_TPETRA_DEBUG
1422 
1423  // Copy the initial sequence of rows that are the same.
1424  //
1425  // The two graphs might have different column Maps, so we need to
1426  // do this using global column indices. This is purely local, so
1427  // we only need to check for local sameness of the two column
1428  // Maps.
1429 
1430 #ifdef HAVE_TPETRA_DEBUG
1431  const map_type& srcRowMap = * (src->graph_.getRowMap ());
1432 #endif // HAVE_TPETRA_DEBUG
1433  const map_type& dstRowMap = * (this->graph_.getRowMap ());
1434  const map_type& srcColMap = * (src->graph_.getColMap ());
1435  const map_type& dstColMap = * (this->graph_.getColMap ());
1436  const bool canUseLocalColumnIndices = srcColMap.locallySameAs (dstColMap);
1437 
1438  const size_t numPermute = static_cast<size_t> (permuteFromLIDs.extent(0));
1439  if (verbose) {
1440  std::ostringstream os;
1441  os << prefix
1442  << "canUseLocalColumnIndices: "
1443  << (canUseLocalColumnIndices ? "true" : "false")
1444  << ", numPermute: " << numPermute
1445  << endl;
1446  std::cerr << os.str ();
1447  }
1448 
1449  const auto permuteToLIDsHost = permuteToLIDs.view_host();
1450  const auto permuteFromLIDsHost = permuteFromLIDs.view_host();
1451 
1452  if (canUseLocalColumnIndices) {
1453  // Copy local rows that are the "same" in both source and target.
1454  for (LO localRow = 0; localRow < static_cast<LO> (numSameIDs); ++localRow) {
1455 #ifdef HAVE_TPETRA_DEBUG
1456  if (! srcRowMap.isNodeLocalElement (localRow)) {
1457  lclErr = true;
1458  invalidSrcCopyRows.insert (localRow);
1459  continue; // skip invalid rows
1460  }
1461 #endif // HAVE_TPETRA_DEBUG
1462 
1463  local_inds_host_view_type lclSrcCols;
1464  values_host_view_type vals;
1465  LO numEntries;
1466  // If this call fails, that means the mesh row local index is
1467  // invalid. That means the Import or Export is invalid somehow.
1468  src->getLocalRowView (localRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
1469  if (numEntries > 0) {
1470  LO err = this->replaceLocalValues (localRow, lclSrcCols.data(), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
1471  if (err != numEntries) {
1472  lclErr = true;
1473  if (! dstRowMap.isNodeLocalElement (localRow)) {
1474 #ifdef HAVE_TPETRA_DEBUG
1475  invalidDstCopyRows.insert (localRow);
1476 #endif // HAVE_TPETRA_DEBUG
1477  }
1478  else {
1479  // Once there's an error, there's no sense in saving
1480  // time, so we check whether the column indices were
1481  // invalid. However, only remember which ones were
1482  // invalid in a debug build, because that might take a
1483  // lot of space.
1484  for (LO k = 0; k < numEntries; ++k) {
1485  if (! dstColMap.isNodeLocalElement (lclSrcCols[k])) {
1486  lclErr = true;
1487 #ifdef HAVE_TPETRA_DEBUG
1488  (void) invalidDstCopyCols.insert (lclSrcCols[k]);
1489 #endif // HAVE_TPETRA_DEBUG
1490  }
1491  }
1492  }
1493  }
1494  }
1495  } // for each "same" local row
1496 
1497  // Copy the "permute" local rows.
1498  for (size_t k = 0; k < numPermute; ++k) {
1499  const LO srcLclRow = static_cast<LO> (permuteFromLIDsHost(k));
1500  const LO dstLclRow = static_cast<LO> (permuteToLIDsHost(k));
1501 
1502  local_inds_host_view_type lclSrcCols;
1503  values_host_view_type vals;
1504  LO numEntries;
1505  src->getLocalRowView (srcLclRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
1506  if (numEntries > 0) {
1507  LO err = this->replaceLocalValues (dstLclRow, lclSrcCols.data(), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
1508  if (err != numEntries) {
1509  lclErr = true;
1510 #ifdef HAVE_TPETRA_DEBUG
1511  for (LO c = 0; c < numEntries; ++c) {
1512  if (! dstColMap.isNodeLocalElement (lclSrcCols[c])) {
1513  invalidDstPermuteCols.insert (lclSrcCols[c]);
1514  }
1515  }
1516 #endif // HAVE_TPETRA_DEBUG
1517  }
1518  }
1519  }
1520  }
1521  else { // must convert column indices to global
1522  // Reserve space to store the destination matrix's local column indices.
1523  const size_t maxNumEnt = src->graph_.getLocalMaxNumRowEntries ();
1524  Teuchos::Array<LO> lclDstCols (maxNumEnt);
1525 
1526  // Copy local rows that are the "same" in both source and target.
1527  for (LO localRow = 0; localRow < static_cast<LO> (numSameIDs); ++localRow) {
1528  local_inds_host_view_type lclSrcCols;
1529  values_host_view_type vals;
1530  LO numEntries;
1531 
1532  // If this call fails, that means the mesh row local index is
1533  // invalid. That means the Import or Export is invalid somehow.
1534  try {
1535  src->getLocalRowView (localRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
1536  } catch (std::exception& e) {
1537  if (debug) {
1538  std::ostringstream os;
1539  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1540  os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
1541  << localRow << ", src->getLocalRowView() threw an exception: "
1542  << e.what ();
1543  std::cerr << os.str ();
1544  }
1545  throw e;
1546  }
1547 
1548  if (numEntries > 0) {
1549  if (static_cast<size_t> (numEntries) > static_cast<size_t> (lclDstCols.size ())) {
1550  lclErr = true;
1551  if (debug) {
1552  std::ostringstream os;
1553  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1554  os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
1555  << localRow << ", numEntries = " << numEntries << " > maxNumEnt = "
1556  << maxNumEnt << endl;
1557  std::cerr << os.str ();
1558  }
1559  }
1560  else {
1561  // Convert the source matrix's local column indices to the
1562  // destination matrix's local column indices.
1563  Teuchos::ArrayView<LO> lclDstColsView = lclDstCols.view (0, numEntries);
1564  for (LO j = 0; j < numEntries; ++j) {
1565  lclDstColsView[j] = dstColMap.getLocalElement (srcColMap.getGlobalElement (lclSrcCols[j]));
1566  if (lclDstColsView[j] == Teuchos::OrdinalTraits<LO>::invalid ()) {
1567  lclErr = true;
1568 #ifdef HAVE_TPETRA_DEBUG
1569  invalidDstCopyCols.insert (lclDstColsView[j]);
1570 #endif // HAVE_TPETRA_DEBUG
1571  }
1572  }
1573  LO err(0);
1574  try {
1575  err = this->replaceLocalValues (localRow, lclDstColsView.getRawPtr (), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
1576  } catch (std::exception& e) {
1577  if (debug) {
1578  std::ostringstream os;
1579  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1580  os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
1581  << localRow << ", this->replaceLocalValues() threw an exception: "
1582  << e.what ();
1583  std::cerr << os.str ();
1584  }
1585  throw e;
1586  }
1587  if (err != numEntries) {
1588  lclErr = true;
1589  if (debug) {
1590  std::ostringstream os;
1591  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1592  os << "Proc " << myRank << ": copyAndPermute: At \"same\" "
1593  "localRow " << localRow << ", this->replaceLocalValues "
1594  "returned " << err << " instead of numEntries = "
1595  << numEntries << endl;
1596  std::cerr << os.str ();
1597  }
1598  }
1599  }
1600  }
1601  }
1602 
1603  // Copy the "permute" local rows.
1604  for (size_t k = 0; k < numPermute; ++k) {
1605  const LO srcLclRow = static_cast<LO> (permuteFromLIDsHost(k));
1606  const LO dstLclRow = static_cast<LO> (permuteToLIDsHost(k));
1607 
1608  local_inds_host_view_type lclSrcCols;
1609  values_host_view_type vals;
1610  LO numEntries;
1611 
1612  try {
1613  src->getLocalRowView (srcLclRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
1614  } catch (std::exception& e) {
1615  if (debug) {
1616  std::ostringstream os;
1617  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1618  os << "Proc " << myRank << ": copyAndPermute: At \"permute\" "
1619  "srcLclRow " << srcLclRow << " and dstLclRow " << dstLclRow
1620  << ", src->getLocalRowView() threw an exception: " << e.what ();
1621  std::cerr << os.str ();
1622  }
1623  throw e;
1624  }
1625 
1626  if (numEntries > 0) {
1627  if (static_cast<size_t> (numEntries) > static_cast<size_t> (lclDstCols.size ())) {
1628  lclErr = true;
1629  }
1630  else {
1631  // Convert the source matrix's local column indices to the
1632  // destination matrix's local column indices.
1633  Teuchos::ArrayView<LO> lclDstColsView = lclDstCols.view (0, numEntries);
1634  for (LO j = 0; j < numEntries; ++j) {
1635  lclDstColsView[j] = dstColMap.getLocalElement (srcColMap.getGlobalElement (lclSrcCols[j]));
1636  if (lclDstColsView[j] == Teuchos::OrdinalTraits<LO>::invalid ()) {
1637  lclErr = true;
1638 #ifdef HAVE_TPETRA_DEBUG
1639  invalidDstPermuteCols.insert (lclDstColsView[j]);
1640 #endif // HAVE_TPETRA_DEBUG
1641  }
1642  }
1643  LO err = this->replaceLocalValues (dstLclRow, lclDstColsView.getRawPtr (), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
1644  if (err != numEntries) {
1645  lclErr = true;
1646  }
1647  }
1648  }
1649  }
1650  }
1651 
1652  if (lclErr) {
1653  std::ostream& err = this->markLocalErrorAndGetStream ();
1654 #ifdef HAVE_TPETRA_DEBUG
1655  err << "copyAndPermute: The graph structure of the source of the "
1656  "Import or Export must be a subset of the graph structure of the "
1657  "target. ";
1658  err << "invalidSrcCopyRows = [";
1659  for (typename std::set<LO>::const_iterator it = invalidSrcCopyRows.begin ();
1660  it != invalidSrcCopyRows.end (); ++it) {
1661  err << *it;
1662  typename std::set<LO>::const_iterator itp1 = it;
1663  itp1++;
1664  if (itp1 != invalidSrcCopyRows.end ()) {
1665  err << ",";
1666  }
1667  }
1668  err << "], invalidDstCopyRows = [";
1669  for (typename std::set<LO>::const_iterator it = invalidDstCopyRows.begin ();
1670  it != invalidDstCopyRows.end (); ++it) {
1671  err << *it;
1672  typename std::set<LO>::const_iterator itp1 = it;
1673  itp1++;
1674  if (itp1 != invalidDstCopyRows.end ()) {
1675  err << ",";
1676  }
1677  }
1678  err << "], invalidDstCopyCols = [";
1679  for (typename std::set<LO>::const_iterator it = invalidDstCopyCols.begin ();
1680  it != invalidDstCopyCols.end (); ++it) {
1681  err << *it;
1682  typename std::set<LO>::const_iterator itp1 = it;
1683  itp1++;
1684  if (itp1 != invalidDstCopyCols.end ()) {
1685  err << ",";
1686  }
1687  }
1688  err << "], invalidDstPermuteCols = [";
1689  for (typename std::set<LO>::const_iterator it = invalidDstPermuteCols.begin ();
1690  it != invalidDstPermuteCols.end (); ++it) {
1691  err << *it;
1692  typename std::set<LO>::const_iterator itp1 = it;
1693  itp1++;
1694  if (itp1 != invalidDstPermuteCols.end ()) {
1695  err << ",";
1696  }
1697  }
1698  err << "], invalidPermuteFromRows = [";
1699  for (typename std::set<LO>::const_iterator it = invalidPermuteFromRows.begin ();
1700  it != invalidPermuteFromRows.end (); ++it) {
1701  err << *it;
1702  typename std::set<LO>::const_iterator itp1 = it;
1703  itp1++;
1704  if (itp1 != invalidPermuteFromRows.end ()) {
1705  err << ",";
1706  }
1707  }
1708  err << "]" << endl;
1709 #else
1710  err << "copyAndPermute: The graph structure of the source of the "
1711  "Import or Export must be a subset of the graph structure of the "
1712  "target." << endl;
1713 #endif // HAVE_TPETRA_DEBUG
1714  }
1715 
1716  if (debug) {
1717  std::ostringstream os;
1718  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1719  const bool lclSuccess = ! (* (this->localError_));
1720  os << "*** Proc " << myRank << ": copyAndPermute "
1721  << (lclSuccess ? "succeeded" : "FAILED");
1722  if (lclSuccess) {
1723  os << endl;
1724  } else {
1725  os << ": error messages: " << this->errorMessages (); // comes w/ endl
1726  }
1727  std::cerr << os.str ();
1728  }
1729  }
1730 
1731  namespace { // (anonymous)
1732 
1751  template<class LO, class GO>
1752  size_t
1753  packRowCount (const size_t numEnt,
1754  const size_t numBytesPerValue,
1755  const size_t blkSize)
1756  {
1757  using ::Tpetra::Details::PackTraits;
1758 
1759  if (numEnt == 0) {
1760  // Empty rows always take zero bytes, to ensure sparsity.
1761  return 0;
1762  }
1763  else {
1764  // We store the number of entries as a local index (LO).
1765  LO numEntLO = 0; // packValueCount wants this.
1766  GO gid {};
1767  const size_t numEntLen = PackTraits<LO>::packValueCount (numEntLO);
1768  const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
1769  const size_t valsLen = numEnt * numBytesPerValue * blkSize * blkSize;
1770  return numEntLen + gidsLen + valsLen;
1771  }
1772  }
1773 
1784  template<class ST, class LO, class GO>
1785  size_t
1786  unpackRowCount (const typename ::Tpetra::Details::PackTraits<LO>::input_buffer_type& imports,
1787  const size_t offset,
1788  const size_t numBytes,
1789  const size_t /* numBytesPerValue */)
1790  {
1791  using Kokkos::subview;
1792  using ::Tpetra::Details::PackTraits;
1793 
1794  if (numBytes == 0) {
1795  // Empty rows always take zero bytes, to ensure sparsity.
1796  return static_cast<size_t> (0);
1797  }
1798  else {
1799  LO numEntLO = 0;
1800  const size_t theNumBytes = PackTraits<LO>::packValueCount (numEntLO);
1801  TEUCHOS_TEST_FOR_EXCEPTION
1802  (theNumBytes > numBytes, std::logic_error, "unpackRowCount: "
1803  "theNumBytes = " << theNumBytes << " < numBytes = " << numBytes
1804  << ".");
1805  const char* const inBuf = imports.data () + offset;
1806  const size_t actualNumBytes = PackTraits<LO>::unpackValue (numEntLO, inBuf);
1807  TEUCHOS_TEST_FOR_EXCEPTION
1808  (actualNumBytes > numBytes, std::logic_error, "unpackRowCount: "
1809  "actualNumBytes = " << actualNumBytes << " < numBytes = " << numBytes
1810  << ".");
1811  return static_cast<size_t> (numEntLO);
1812  }
1813  }
1814 
1818  template<class ST, class LO, class GO>
1819  size_t
1820  packRowForBlockCrs (const typename ::Tpetra::Details::PackTraits<LO>::output_buffer_type exports,
1821  const size_t offset,
1822  const size_t numEnt,
1823  const typename ::Tpetra::Details::PackTraits<GO>::input_array_type& gidsIn,
1824  const typename ::Tpetra::Details::PackTraits<ST>::input_array_type& valsIn,
1825  const size_t numBytesPerValue,
1826  const size_t blockSize)
1827  {
1828  using Kokkos::subview;
1829  using ::Tpetra::Details::PackTraits;
1830 
1831  if (numEnt == 0) {
1832  // Empty rows always take zero bytes, to ensure sparsity.
1833  return 0;
1834  }
1835  const size_t numScalarEnt = numEnt * blockSize * blockSize;
1836 
1837  const GO gid = 0; // packValueCount wants this
1838  const LO numEntLO = static_cast<size_t> (numEnt);
1839 
1840  const size_t numEntBeg = offset;
1841  const size_t numEntLen = PackTraits<LO>::packValueCount (numEntLO);
1842  const size_t gidsBeg = numEntBeg + numEntLen;
1843  const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
1844  const size_t valsBeg = gidsBeg + gidsLen;
1845  const size_t valsLen = numScalarEnt * numBytesPerValue;
1846 
1847  char* const numEntOut = exports.data () + numEntBeg;
1848  char* const gidsOut = exports.data () + gidsBeg;
1849  char* const valsOut = exports.data () + valsBeg;
1850 
1851  size_t numBytesOut = 0;
1852  int errorCode = 0;
1853  numBytesOut += PackTraits<LO>::packValue (numEntOut, numEntLO);
1854 
1855  {
1856  Kokkos::pair<int, size_t> p;
1857  p = PackTraits<GO>::packArray (gidsOut, gidsIn.data (), numEnt);
1858  errorCode += p.first;
1859  numBytesOut += p.second;
1860 
1861  p = PackTraits<ST>::packArray (valsOut, valsIn.data (), numScalarEnt);
1862  errorCode += p.first;
1863  numBytesOut += p.second;
1864  }
1865 
1866  const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
1867  TEUCHOS_TEST_FOR_EXCEPTION
1868  (numBytesOut != expectedNumBytes, std::logic_error,
1869  "packRowForBlockCrs: numBytesOut = " << numBytesOut
1870  << " != expectedNumBytes = " << expectedNumBytes << ".");
1871 
1872  TEUCHOS_TEST_FOR_EXCEPTION
1873  (errorCode != 0, std::runtime_error, "packRowForBlockCrs: "
1874  "PackTraits::packArray returned a nonzero error code");
1875 
1876  return numBytesOut;
1877  }
1878 
1879  // Return the number of bytes actually read / used.
1880  template<class ST, class LO, class GO>
1881  size_t
1882  unpackRowForBlockCrs (const typename ::Tpetra::Details::PackTraits<GO>::output_array_type& gidsOut,
1883  const typename ::Tpetra::Details::PackTraits<ST>::output_array_type& valsOut,
1884  const typename ::Tpetra::Details::PackTraits<int>::input_buffer_type& imports,
1885  const size_t offset,
1886  const size_t numBytes,
1887  const size_t numEnt,
1888  const size_t numBytesPerValue,
1889  const size_t blockSize)
1890  {
1891  using ::Tpetra::Details::PackTraits;
1892 
1893  if (numBytes == 0) {
1894  // Rows with zero bytes always have zero entries.
1895  return 0;
1896  }
1897  const size_t numScalarEnt = numEnt * blockSize * blockSize;
1898  TEUCHOS_TEST_FOR_EXCEPTION
1899  (static_cast<size_t> (imports.extent (0)) <= offset,
1900  std::logic_error, "unpackRowForBlockCrs: imports.extent(0) = "
1901  << imports.extent (0) << " <= offset = " << offset << ".");
1902  TEUCHOS_TEST_FOR_EXCEPTION
1903  (static_cast<size_t> (imports.extent (0)) < offset + numBytes,
1904  std::logic_error, "unpackRowForBlockCrs: imports.extent(0) = "
1905  << imports.extent (0) << " < offset + numBytes = "
1906  << (offset + numBytes) << ".");
1907 
1908  const GO gid = 0; // packValueCount wants this
1909  const LO lid = 0; // packValueCount wants this
1910 
1911  const size_t numEntBeg = offset;
1912  const size_t numEntLen = PackTraits<LO>::packValueCount (lid);
1913  const size_t gidsBeg = numEntBeg + numEntLen;
1914  const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
1915  const size_t valsBeg = gidsBeg + gidsLen;
1916  const size_t valsLen = numScalarEnt * numBytesPerValue;
1917 
1918  const char* const numEntIn = imports.data () + numEntBeg;
1919  const char* const gidsIn = imports.data () + gidsBeg;
1920  const char* const valsIn = imports.data () + valsBeg;
1921 
1922  size_t numBytesOut = 0;
1923  int errorCode = 0;
1924  LO numEntOut;
1925  numBytesOut += PackTraits<LO>::unpackValue (numEntOut, numEntIn);
1926  TEUCHOS_TEST_FOR_EXCEPTION
1927  (static_cast<size_t> (numEntOut) != numEnt, std::logic_error,
1928  "unpackRowForBlockCrs: Expected number of entries " << numEnt
1929  << " != actual number of entries " << numEntOut << ".");
1930 
1931  {
1932  Kokkos::pair<int, size_t> p;
1933  p = PackTraits<GO>::unpackArray (gidsOut.data (), gidsIn, numEnt);
1934  errorCode += p.first;
1935  numBytesOut += p.second;
1936 
1937  p = PackTraits<ST>::unpackArray (valsOut.data (), valsIn, numScalarEnt);
1938  errorCode += p.first;
1939  numBytesOut += p.second;
1940  }
1941 
1942  TEUCHOS_TEST_FOR_EXCEPTION
1943  (numBytesOut != numBytes, std::logic_error,
1944  "unpackRowForBlockCrs: numBytesOut = " << numBytesOut
1945  << " != numBytes = " << numBytes << ".");
1946 
1947  const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
1948  TEUCHOS_TEST_FOR_EXCEPTION
1949  (numBytesOut != expectedNumBytes, std::logic_error,
1950  "unpackRowForBlockCrs: numBytesOut = " << numBytesOut
1951  << " != expectedNumBytes = " << expectedNumBytes << ".");
1952 
1953  TEUCHOS_TEST_FOR_EXCEPTION
1954  (errorCode != 0, std::runtime_error, "unpackRowForBlockCrs: "
1955  "PackTraits::unpackArray returned a nonzero error code");
1956 
1957  return numBytesOut;
1958  }
1959  } // namespace (anonymous)
1960 
1961  template<class Scalar, class LO, class GO, class Node>
1962  void
1965  (const ::Tpetra::SrcDistObject& source,
1966  const Kokkos::DualView<const local_ordinal_type*,
1967  buffer_device_type>& exportLIDs,
1968  Kokkos::DualView<packet_type*,
1969  buffer_device_type>& exports, // output
1970  Kokkos::DualView<size_t*,
1971  buffer_device_type> numPacketsPerLID, // output
1972  size_t& constantNumPackets)
1973  {
1974  using ::Tpetra::Details::Behavior;
1976  using ::Tpetra::Details::ProfilingRegion;
1977  using ::Tpetra::Details::PackTraits;
1978 
1979  typedef typename Kokkos::View<int*, device_type>::HostMirror::execution_space host_exec;
1980 
1981  typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_BCRS_type;
1982 
1983  ProfilingRegion profile_region("Tpetra::BlockCrsMatrix::packAndPrepare");
1984 
1985  const bool debug = Behavior::debug();
1986  const bool verbose = Behavior::verbose();
1987 
1988  // Define this function prefix
1989  std::string prefix;
1990  {
1991  std::ostringstream os;
1992  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1993  os << "Proc " << myRank << ": BlockCrsMatrix::packAndPrepare: " << std::endl;
1994  prefix = os.str();
1995  }
1996 
1997  // check if this already includes a local error
1998  if (* (this->localError_)) {
1999  std::ostream& err = this->markLocalErrorAndGetStream ();
2000  err << prefix
2001  << "The target object of the Import or Export is already in an error state."
2002  << std::endl;
2003  return;
2004  }
2005 
2006  //
2007  // Verbose input dual view status
2008  //
2009  if (verbose) {
2010  std::ostringstream os;
2011  os << prefix << std::endl
2012  << prefix << " " << dualViewStatusToString (exportLIDs, "exportLIDs") << std::endl
2013  << prefix << " " << dualViewStatusToString (exports, "exports") << std::endl
2014  << prefix << " " << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID") << std::endl;
2015  std::cerr << os.str ();
2016  }
2017 
2021  if (exportLIDs.extent (0) != numPacketsPerLID.extent (0)) {
2022  std::ostream& err = this->markLocalErrorAndGetStream ();
2023  err << prefix
2024  << "exportLIDs.extent(0) = " << exportLIDs.extent (0)
2025  << " != numPacketsPerLID.extent(0) = " << numPacketsPerLID.extent(0)
2026  << "." << std::endl;
2027  return;
2028  }
2029  if (exportLIDs.need_sync_host ()) {
2030  std::ostream& err = this->markLocalErrorAndGetStream ();
2031  err << prefix << "exportLIDs be sync'd to host." << std::endl;
2032  return;
2033  }
2034 
2035  const this_BCRS_type* src = dynamic_cast<const this_BCRS_type* > (&source);
2036  if (src == NULL) {
2037  std::ostream& err = this->markLocalErrorAndGetStream ();
2038  err << prefix
2039  << "The source (input) object of the Import or "
2040  "Export is either not a BlockCrsMatrix, or does not have the right "
2041  "template parameters. checkSizes() should have caught this. "
2042  "Please report this bug to the Tpetra developers." << std::endl;
2043  return;
2044  }
2045 
2046  // Graphs and matrices are allowed to have a variable number of
2047  // entries per row. We could test whether all rows have the same
2048  // number of entries, but DistObject can only use this
2049  // optimization if all rows on _all_ processes have the same
2050  // number of entries. Rather than do the all-reduce necessary to
2051  // test for this unlikely case, we tell DistObject (by setting
2052  // constantNumPackets to zero) to assume that different rows may
2053  // have different numbers of entries.
2054  constantNumPackets = 0;
2055 
2056  // const values
2057  const crs_graph_type& srcGraph = src->graph_;
2058  const size_t blockSize = static_cast<size_t> (src->getBlockSize ());
2059  const size_t numExportLIDs = exportLIDs.extent (0);
2060  size_t numBytesPerValue(0);
2061  {
2062  auto val_host = val_.getHostView(Access::ReadOnly);
2063  numBytesPerValue =
2064  PackTraits<impl_scalar_type>
2065  ::packValueCount(val_host.extent(0) ? val_host(0) : impl_scalar_type());
2066  }
2067 
2068  // Compute the number of bytes ("packets") per row to pack. While
2069  // we're at it, compute the total # of block entries to send, and
2070  // the max # of block entries in any of the rows we're sending.
2071 
2072  Impl::BlockCrsRowStruct<size_t> rowReducerStruct;
2073 
2074  // Graph information is on host; let's do this on host parallel reduce
2075  auto exportLIDsHost = exportLIDs.view_host();
2076  auto numPacketsPerLIDHost = numPacketsPerLID.view_host(); // we will modify this.
2077  numPacketsPerLID.modify_host ();
2078  {
2079  using reducer_type = Impl::BlockCrsReducer<Impl::BlockCrsRowStruct<size_t>,host_exec>;
2080  const auto policy = Kokkos::RangePolicy<host_exec>(size_t(0), numExportLIDs);
2081  Kokkos::parallel_reduce
2082  (policy,
2083  [=](const size_t &i, typename reducer_type::value_type &update) {
2084  const LO lclRow = exportLIDsHost(i);
2085  size_t numEnt = srcGraph.getNumEntriesInLocalRow (lclRow);
2086  numEnt = (numEnt == Teuchos::OrdinalTraits<size_t>::invalid () ? 0 : numEnt);
2087 
2088  const size_t numBytes =
2089  packRowCount<LO, GO> (numEnt, numBytesPerValue, blockSize);
2090  numPacketsPerLIDHost(i) = numBytes;
2091  update += typename reducer_type::value_type(numEnt, numBytes, numEnt);
2092  }, rowReducerStruct);
2093  }
2094 
2095  // Compute the number of bytes ("packets") per row to pack. While
2096  // we're at it, compute the total # of block entries to send, and
2097  // the max # of block entries in any of the rows we're sending.
2098  const size_t totalNumBytes = rowReducerStruct.totalNumBytes;
2099  const size_t totalNumEntries = rowReducerStruct.totalNumEntries;
2100  const size_t maxRowLength = rowReducerStruct.maxRowLength;
2101 
2102  if (verbose) {
2103  std::ostringstream os;
2104  os << prefix
2105  << "totalNumBytes = " << totalNumBytes << ", totalNumEntries = " << totalNumEntries
2106  << std::endl;
2107  std::cerr << os.str ();
2108  }
2109 
2110  // We use a "struct of arrays" approach to packing each row's
2111  // entries. All the column indices (as global indices) go first,
2112  // then all their owning process ranks, and then the values.
2113  if(exports.extent(0) != totalNumBytes)
2114  {
2115  const std::string oldLabel = exports.d_view.label ();
2116  const std::string newLabel = (oldLabel == "") ? "exports" : oldLabel;
2117  exports = Kokkos::DualView<packet_type*, buffer_device_type>(newLabel, totalNumBytes);
2118  }
2119  if (totalNumEntries > 0) {
2120  // Current position (in bytes) in the 'exports' output array.
2121  Kokkos::View<size_t*, host_exec> offset("offset", numExportLIDs+1);
2122  {
2123  const auto policy = Kokkos::RangePolicy<host_exec>(size_t(0), numExportLIDs+1);
2124  Kokkos::parallel_scan
2125  (policy,
2126  [=](const size_t &i, size_t &update, const bool &final) {
2127  if (final) offset(i) = update;
2128  update += (i == numExportLIDs ? 0 : numPacketsPerLIDHost(i));
2129  });
2130  }
2131  if (offset(numExportLIDs) != totalNumBytes) {
2132  std::ostream& err = this->markLocalErrorAndGetStream ();
2133  err << prefix
2134  << "At end of method, the final offset (in bytes) "
2135  << offset(numExportLIDs) << " does not equal the total number of bytes packed "
2136  << totalNumBytes << ". "
2137  << "Please report this bug to the Tpetra developers." << std::endl;
2138  return;
2139  }
2140 
2141  // For each block row of the matrix owned by the calling
2142  // process, pack that block row's column indices and values into
2143  // the exports array.
2144 
2145  // Source matrix's column Map. We verified in checkSizes() that
2146  // the column Map exists (is not null).
2147  const map_type& srcColMap = * (srcGraph.getColMap ());
2148 
2149  // Pack the data for each row to send, into the 'exports' buffer.
2150  // exports will be modified on host.
2151  exports.modify_host();
2152  {
2153  typedef Kokkos::TeamPolicy<host_exec> policy_type;
2154  const auto policy =
2155  policy_type(numExportLIDs, 1, 1)
2156  .set_scratch_size(0, Kokkos::PerTeam(sizeof(GO)*maxRowLength));
2157  // The following parallel_for needs const access to the local values of src.
2158  // (the local graph is also accessed on host, but this does not use WDVs).
2159  getValuesHost();
2161  Kokkos::parallel_for
2162  (policy,
2163  [=](const typename policy_type::member_type &member) {
2164  const size_t i = member.league_rank();
2165  Kokkos::View<GO*, typename host_exec::scratch_memory_space>
2166  gblColInds(member.team_scratch(0), maxRowLength);
2167 
2168  const LO lclRowInd = exportLIDsHost(i);
2169  local_inds_host_view_type lclColInds;
2170  values_host_view_type vals;
2171 
2172  // It's OK to ignore the return value, since if the calling
2173  // process doesn't own that local row, then the number of
2174  // entries in that row on the calling process is zero.
2175  src->getLocalRowView (lclRowInd, lclColInds, vals);
2176  const size_t numEnt = lclColInds.extent(0);
2177 
2178  // Convert column indices from local to global.
2179  for (size_t j = 0; j < numEnt; ++j)
2180  gblColInds(j) = srcColMap.getGlobalElement (lclColInds(j));
2181 
2182  // Kyungjoo: additional wrapping scratch view is necessary
2183  // the following function interface need the same execution space
2184  // host scratch space somehow is not considered same as the host_exec
2185  // Copy the row's data into the current spot in the exports array.
2186  const size_t numBytes =
2187  packRowForBlockCrs<impl_scalar_type, LO, GO>
2188  (exports.view_host(),
2189  offset(i),
2190  numEnt,
2191  Kokkos::View<const GO*, host_exec>(gblColInds.data(), maxRowLength),
2192  vals,
2193  numBytesPerValue,
2194  blockSize);
2195 
2196  // numBytes should be same as the difference between offsets
2197  if (debug) {
2198  const size_t offsetDiff = offset(i+1) - offset(i);
2199  if (numBytes != offsetDiff) {
2200  std::ostringstream os;
2201  os << prefix
2202  << "numBytes computed from packRowForBlockCrs is different from "
2203  << "precomputed offset values, LID = " << i << std::endl;
2204  std::cerr << os.str ();
2205  }
2206  }
2207  }); // for each LID (of a row) to send
2209  }
2210  } // if totalNumEntries > 0
2211 
2212  if (debug) {
2213  std::ostringstream os;
2214  const bool lclSuccess = ! (* (this->localError_));
2215  os << prefix
2216  << (lclSuccess ? "succeeded" : "FAILED")
2217  << std::endl;
2218  std::cerr << os.str ();
2219  }
2220  }
2221 
2222  template<class Scalar, class LO, class GO, class Node>
2223  void
2226  (const Kokkos::DualView<const local_ordinal_type*,
2227  buffer_device_type>& importLIDs,
2228  Kokkos::DualView<packet_type*,
2229  buffer_device_type> imports,
2230  Kokkos::DualView<size_t*,
2231  buffer_device_type> numPacketsPerLID,
2232  const size_t /* constantNumPackets */,
2233  const CombineMode combineMode)
2234  {
2235  using ::Tpetra::Details::Behavior;
2237  using ::Tpetra::Details::ProfilingRegion;
2238  using ::Tpetra::Details::PackTraits;
2239  using std::endl;
2240  using host_exec =
2241  typename Kokkos::View<int*, device_type>::HostMirror::execution_space;
2242 
2243  ProfilingRegion profile_region("Tpetra::BlockCrsMatrix::unpackAndCombine");
2244  const bool verbose = Behavior::verbose ();
2245 
2246  // Define this function prefix
2247  std::string prefix;
2248  {
2249  std::ostringstream os;
2250  auto map = this->graph_.getRowMap ();
2251  auto comm = map.is_null () ? Teuchos::null : map->getComm ();
2252  const int myRank = comm.is_null () ? -1 : comm->getRank ();
2253  os << "Proc " << myRank << ": Tpetra::BlockCrsMatrix::unpackAndCombine: ";
2254  prefix = os.str ();
2255  if (verbose) {
2256  os << "Start" << endl;
2257  std::cerr << os.str ();
2258  }
2259  }
2260 
2261  // check if this already includes a local error
2262  if (* (this->localError_)) {
2263  std::ostream& err = this->markLocalErrorAndGetStream ();
2264  std::ostringstream os;
2265  os << prefix << "{Im/Ex}port target is already in error." << endl;
2266  if (verbose) {
2267  std::cerr << os.str ();
2268  }
2269  err << os.str ();
2270  return;
2271  }
2272 
2276  if (importLIDs.extent (0) != numPacketsPerLID.extent (0)) {
2277  std::ostream& err = this->markLocalErrorAndGetStream ();
2278  std::ostringstream os;
2279  os << prefix << "importLIDs.extent(0) = " << importLIDs.extent (0)
2280  << " != numPacketsPerLID.extent(0) = " << numPacketsPerLID.extent(0)
2281  << "." << endl;
2282  if (verbose) {
2283  std::cerr << os.str ();
2284  }
2285  err << os.str ();
2286  return;
2287  }
2288 
2289  if (combineMode != ADD && combineMode != INSERT &&
2290  combineMode != REPLACE && combineMode != ABSMAX &&
2291  combineMode != ZERO) {
2292  std::ostream& err = this->markLocalErrorAndGetStream ();
2293  std::ostringstream os;
2294  os << prefix
2295  << "Invalid CombineMode value " << combineMode << ". Valid "
2296  << "values include ADD, INSERT, REPLACE, ABSMAX, and ZERO."
2297  << std::endl;
2298  if (verbose) {
2299  std::cerr << os.str ();
2300  }
2301  err << os.str ();
2302  return;
2303  }
2304 
2305  if (this->graph_.getColMap ().is_null ()) {
2306  std::ostream& err = this->markLocalErrorAndGetStream ();
2307  std::ostringstream os;
2308  os << prefix << "Target matrix's column Map is null." << endl;
2309  if (verbose) {
2310  std::cerr << os.str ();
2311  }
2312  err << os.str ();
2313  return;
2314  }
2315 
2316  // Target matrix's column Map. Use to convert the global column
2317  // indices in the receive buffer to local indices. We verified in
2318  // checkSizes() that the column Map exists (is not null).
2319  const map_type& tgtColMap = * (this->graph_.getColMap ());
2320 
2321  // Const values
2322  const size_t blockSize = this->getBlockSize ();
2323  const size_t numImportLIDs = importLIDs.extent(0);
2324  // FIXME (mfh 06 Feb 2019) For scalar types with run-time sizes, a
2325  // default-constructed instance could have a different size than
2326  // other instances. (We assume all nominally constructed
2327  // instances have the same size; that's not the issue here.) This
2328  // could be bad if the calling process has no entries, but other
2329  // processes have entries that they want to send to us.
2330  size_t numBytesPerValue(0);
2331  {
2332  auto val_host = val_.getHostView(Access::ReadOnly);
2333  numBytesPerValue =
2334  PackTraits<impl_scalar_type>::packValueCount
2335  (val_host.extent (0) ? val_host(0) : impl_scalar_type ());
2336  }
2337  const size_t maxRowNumEnt = graph_.getLocalMaxNumRowEntries ();
2338  const size_t maxRowNumScalarEnt = maxRowNumEnt * blockSize * blockSize;
2339 
2340  if (verbose) {
2341  std::ostringstream os;
2342  os << prefix << "combineMode: "
2343  << ::Tpetra::combineModeToString (combineMode)
2344  << ", blockSize: " << blockSize
2345  << ", numImportLIDs: " << numImportLIDs
2346  << ", numBytesPerValue: " << numBytesPerValue
2347  << ", maxRowNumEnt: " << maxRowNumEnt
2348  << ", maxRowNumScalarEnt: " << maxRowNumScalarEnt << endl;
2349  std::cerr << os.str ();
2350  }
2351 
2352  if (combineMode == ZERO || numImportLIDs == 0) {
2353  if (verbose) {
2354  std::ostringstream os;
2355  os << prefix << "Nothing to unpack. Done!" << std::endl;
2356  std::cerr << os.str ();
2357  }
2358  return;
2359  }
2360 
2361  // NOTE (mfh 07 Feb 2019) If we ever implement unpack on device,
2362  // we can remove this sync.
2363  if (imports.need_sync_host ()) {
2364  imports.sync_host ();
2365  }
2366 
2367  // NOTE (mfh 07 Feb 2019) DistObject::doTransferNew has already
2368  // sync'd numPacketsPerLID to host, since it needs to do that in
2369  // order to post MPI_Irecv messages with the correct lengths. A
2370  // hypothetical device-specific boundary exchange implementation
2371  // could possibly receive data without sync'ing lengths to host,
2372  // but we don't need to design for that nonexistent thing yet.
2373 
2374  if (imports.need_sync_host () || numPacketsPerLID.need_sync_host () ||
2375  importLIDs.need_sync_host ()) {
2376  std::ostream& err = this->markLocalErrorAndGetStream ();
2377  std::ostringstream os;
2378  os << prefix << "All input DualViews must be sync'd to host by now. "
2379  << "imports_nc.need_sync_host()="
2380  << (imports.need_sync_host () ? "true" : "false")
2381  << ", numPacketsPerLID.need_sync_host()="
2382  << (numPacketsPerLID.need_sync_host () ? "true" : "false")
2383  << ", importLIDs.need_sync_host()="
2384  << (importLIDs.need_sync_host () ? "true" : "false")
2385  << "." << endl;
2386  if (verbose) {
2387  std::cerr << os.str ();
2388  }
2389  err << os.str ();
2390  return;
2391  }
2392 
2393  const auto importLIDsHost = importLIDs.view_host ();
2394  const auto numPacketsPerLIDHost = numPacketsPerLID.view_host ();
2395 
2396  // FIXME (mfh 06 Feb 2019) We could fuse the scan with the unpack
2397  // loop, by only unpacking on final==true (when we know the
2398  // current offset's value).
2399 
2400  Kokkos::View<size_t*, host_exec> offset ("offset", numImportLIDs+1);
2401  {
2402  const auto policy = Kokkos::RangePolicy<host_exec>(size_t(0), numImportLIDs+1);
2403  Kokkos::parallel_scan
2404  ("Tpetra::BlockCrsMatrix::unpackAndCombine: offsets", policy,
2405  [=] (const size_t &i, size_t &update, const bool &final) {
2406  if (final) offset(i) = update;
2407  update += (i == numImportLIDs ? 0 : numPacketsPerLIDHost(i));
2408  });
2409  }
2410 
2411  // this variable does not matter with a race condition (just error flag)
2412  //
2413  // NOTE (mfh 06 Feb 2019) CUDA doesn't necessarily like reductions
2414  // or atomics on bool, so we use int instead. (I know we're not
2415  // launching a CUDA loop here, but we could in the future, and I'd
2416  // like to avoid potential trouble.)
2417  Kokkos::View<int, host_exec, Kokkos::MemoryTraits<Kokkos::Atomic> >
2418  errorDuringUnpack ("errorDuringUnpack");
2419  errorDuringUnpack () = 0;
2420  {
2421  using policy_type = Kokkos::TeamPolicy<host_exec>;
2422  size_t scratch_per_row = sizeof(GO) * maxRowNumEnt + sizeof (LO) * maxRowNumEnt + numBytesPerValue * maxRowNumScalarEnt
2423  + 2 * sizeof(GO); // Yeah, this is a fudge factor
2424 
2425  const auto policy = policy_type (numImportLIDs, 1, 1)
2426  .set_scratch_size (0, Kokkos::PerTeam (scratch_per_row));
2427  using host_scratch_space = typename host_exec::scratch_memory_space;
2428 
2429  using pair_type = Kokkos::pair<size_t, size_t>;
2430 
2431  //The following parallel_for modifies values on host while unpacking.
2432  getValuesHostNonConst();
2434  Kokkos::parallel_for
2435  ("Tpetra::BlockCrsMatrix::unpackAndCombine: unpack", policy,
2436  [=] (const typename policy_type::member_type& member) {
2437  const size_t i = member.league_rank();
2438  Kokkos::View<GO*, host_scratch_space> gblColInds
2439  (member.team_scratch (0), maxRowNumEnt);
2440  Kokkos::View<LO*, host_scratch_space> lclColInds
2441  (member.team_scratch (0), maxRowNumEnt);
2442  Kokkos::View<impl_scalar_type*, host_scratch_space> vals
2443  (member.team_scratch (0), maxRowNumScalarEnt);
2444 
2445 
2446  const size_t offval = offset(i);
2447  const LO lclRow = importLIDsHost(i);
2448  const size_t numBytes = numPacketsPerLIDHost(i);
2449  const size_t numEnt =
2450  unpackRowCount<impl_scalar_type, LO, GO>
2451  (imports.view_host (), offval, numBytes, numBytesPerValue);
2452 
2453  if (numBytes > 0) {
2454  if (numEnt > maxRowNumEnt) {
2455  errorDuringUnpack() = 1;
2456  if (verbose) {
2457  std::ostringstream os;
2458  os << prefix
2459  << "At i = " << i << ", numEnt = " << numEnt
2460  << " > maxRowNumEnt = " << maxRowNumEnt
2461  << std::endl;
2462  std::cerr << os.str ();
2463  }
2464  return;
2465  }
2466  }
2467  const size_t numScalarEnt = numEnt*blockSize*blockSize;
2468  auto gidsOut = Kokkos::subview(gblColInds, pair_type(0, numEnt));
2469  auto lidsOut = Kokkos::subview(lclColInds, pair_type(0, numEnt));
2470  auto valsOut = Kokkos::subview(vals, pair_type(0, numScalarEnt));
2471 
2472  // Kyungjoo: additional wrapping scratch view is necessary
2473  // the following function interface need the same execution space
2474  // host scratch space somehow is not considered same as the host_exec
2475  size_t numBytesOut = 0;
2476  try {
2477  numBytesOut =
2478  unpackRowForBlockCrs<impl_scalar_type, LO, GO>
2479  (Kokkos::View<GO*,host_exec>(gidsOut.data(), numEnt),
2480  Kokkos::View<impl_scalar_type*,host_exec>(valsOut.data(), numScalarEnt),
2481  imports.view_host(),
2482  offval, numBytes, numEnt,
2483  numBytesPerValue, blockSize);
2484  }
2485  catch (std::exception& e) {
2486  errorDuringUnpack() = 1;
2487  if (verbose) {
2488  std::ostringstream os;
2489  os << prefix << "At i = " << i << ", unpackRowForBlockCrs threw: "
2490  << e.what () << endl;
2491  std::cerr << os.str ();
2492  }
2493  return;
2494  }
2495 
2496  if (numBytes != numBytesOut) {
2497  errorDuringUnpack() = 1;
2498  if (verbose) {
2499  std::ostringstream os;
2500  os << prefix
2501  << "At i = " << i << ", numBytes = " << numBytes
2502  << " != numBytesOut = " << numBytesOut << "."
2503  << std::endl;
2504  std::cerr << os.str();
2505  }
2506  return;
2507  }
2508 
2509  // Convert incoming global indices to local indices.
2510  for (size_t k = 0; k < numEnt; ++k) {
2511  lidsOut(k) = tgtColMap.getLocalElement (gidsOut(k));
2512  if (lidsOut(k) == Teuchos::OrdinalTraits<LO>::invalid ()) {
2513  if (verbose) {
2514  std::ostringstream os;
2515  os << prefix
2516  << "At i = " << i << ", GID " << gidsOut(k)
2517  << " is not owned by the calling process."
2518  << std::endl;
2519  std::cerr << os.str();
2520  }
2521  return;
2522  }
2523  }
2524 
2525  // Combine the incoming data with the matrix's current data.
2526  LO numCombd = 0;
2527  const LO* const lidsRaw = const_cast<const LO*> (lidsOut.data ());
2528  const Scalar* const valsRaw = reinterpret_cast<const Scalar*>
2529  (const_cast<const impl_scalar_type*> (valsOut.data ()));
2530  if (combineMode == ADD) {
2531  numCombd = this->sumIntoLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
2532  } else if (combineMode == INSERT || combineMode == REPLACE) {
2533  numCombd = this->replaceLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
2534  } else if (combineMode == ABSMAX) {
2535  numCombd = this->absMaxLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
2536  }
2537 
2538  if (static_cast<LO> (numEnt) != numCombd) {
2539  errorDuringUnpack() = 1;
2540  if (verbose) {
2541  std::ostringstream os;
2542  os << prefix << "At i = " << i << ", numEnt = " << numEnt
2543  << " != numCombd = " << numCombd << "."
2544  << endl;
2545  std::cerr << os.str ();
2546  }
2547  return;
2548  }
2549  }); // for each import LID i
2551  }
2552 
2553  if (errorDuringUnpack () != 0) {
2554  std::ostream& err = this->markLocalErrorAndGetStream ();
2555  err << prefix << "Unpacking failed.";
2556  if (! verbose) {
2557  err << " Please run again with the environment variable setting "
2558  "TPETRA_VERBOSE=1 to get more verbose diagnostic output.";
2559  }
2560  err << endl;
2561  }
2562 
2563  if (verbose) {
2564  std::ostringstream os;
2565  const bool lclSuccess = ! (* (this->localError_));
2566  os << prefix
2567  << (lclSuccess ? "succeeded" : "FAILED")
2568  << std::endl;
2569  std::cerr << os.str ();
2570  }
2571  }
2572 
2573  template<class Scalar, class LO, class GO, class Node>
2574  std::string
2576  {
2577  using Teuchos::TypeNameTraits;
2578  std::ostringstream os;
2579  os << "\"Tpetra::BlockCrsMatrix\": { "
2580  << "Template parameters: { "
2581  << "Scalar: " << TypeNameTraits<Scalar>::name ()
2582  << "LO: " << TypeNameTraits<LO>::name ()
2583  << "GO: " << TypeNameTraits<GO>::name ()
2584  << "Node: " << TypeNameTraits<Node>::name ()
2585  << " }"
2586  << ", Label: \"" << this->getObjectLabel () << "\""
2587  << ", Global dimensions: ["
2588  << graph_.getDomainMap ()->getGlobalNumElements () << ", "
2589  << graph_.getRangeMap ()->getGlobalNumElements () << "]"
2590  << ", Block size: " << getBlockSize ()
2591  << " }";
2592  return os.str ();
2593  }
2594 
2595 
2596  template<class Scalar, class LO, class GO, class Node>
2597  void
2599  describe (Teuchos::FancyOStream& out,
2600  const Teuchos::EVerbosityLevel verbLevel) const
2601  {
2602  using Teuchos::ArrayRCP;
2603  using Teuchos::CommRequest;
2604  using Teuchos::FancyOStream;
2605  using Teuchos::getFancyOStream;
2606  using Teuchos::ireceive;
2607  using Teuchos::isend;
2608  using Teuchos::outArg;
2609  using Teuchos::TypeNameTraits;
2610  using Teuchos::VERB_DEFAULT;
2611  using Teuchos::VERB_NONE;
2612  using Teuchos::VERB_LOW;
2613  using Teuchos::VERB_MEDIUM;
2614  // using Teuchos::VERB_HIGH;
2615  using Teuchos::VERB_EXTREME;
2616  using Teuchos::RCP;
2617  using Teuchos::wait;
2618  using std::endl;
2619 
2620  // Set default verbosity if applicable.
2621  const Teuchos::EVerbosityLevel vl =
2622  (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
2623 
2624  if (vl == VERB_NONE) {
2625  return; // print nothing
2626  }
2627 
2628  // describe() always starts with a tab before it prints anything.
2629  Teuchos::OSTab tab0 (out);
2630 
2631  out << "\"Tpetra::BlockCrsMatrix\":" << endl;
2632  Teuchos::OSTab tab1 (out);
2633 
2634  out << "Template parameters:" << endl;
2635  {
2636  Teuchos::OSTab tab2 (out);
2637  out << "Scalar: " << TypeNameTraits<Scalar>::name () << endl
2638  << "LO: " << TypeNameTraits<LO>::name () << endl
2639  << "GO: " << TypeNameTraits<GO>::name () << endl
2640  << "Node: " << TypeNameTraits<Node>::name () << endl;
2641  }
2642  out << "Label: \"" << this->getObjectLabel () << "\"" << endl
2643  << "Global dimensions: ["
2644  << graph_.getDomainMap ()->getGlobalNumElements () << ", "
2645  << graph_.getRangeMap ()->getGlobalNumElements () << "]" << endl;
2646 
2647  const LO blockSize = getBlockSize ();
2648  out << "Block size: " << blockSize << endl;
2649 
2650  // constituent objects
2651  if (vl >= VERB_MEDIUM) {
2652  const Teuchos::Comm<int>& comm = * (graph_.getMap ()->getComm ());
2653  const int myRank = comm.getRank ();
2654  if (myRank == 0) {
2655  out << "Row Map:" << endl;
2656  }
2657  getRowMap()->describe (out, vl);
2658 
2659  if (! getColMap ().is_null ()) {
2660  if (getColMap() == getRowMap()) {
2661  if (myRank == 0) {
2662  out << "Column Map: same as row Map" << endl;
2663  }
2664  }
2665  else {
2666  if (myRank == 0) {
2667  out << "Column Map:" << endl;
2668  }
2669  getColMap ()->describe (out, vl);
2670  }
2671  }
2672  if (! getDomainMap ().is_null ()) {
2673  if (getDomainMap () == getRowMap ()) {
2674  if (myRank == 0) {
2675  out << "Domain Map: same as row Map" << endl;
2676  }
2677  }
2678  else if (getDomainMap () == getColMap ()) {
2679  if (myRank == 0) {
2680  out << "Domain Map: same as column Map" << endl;
2681  }
2682  }
2683  else {
2684  if (myRank == 0) {
2685  out << "Domain Map:" << endl;
2686  }
2687  getDomainMap ()->describe (out, vl);
2688  }
2689  }
2690  if (! getRangeMap ().is_null ()) {
2691  if (getRangeMap () == getDomainMap ()) {
2692  if (myRank == 0) {
2693  out << "Range Map: same as domain Map" << endl;
2694  }
2695  }
2696  else if (getRangeMap () == getRowMap ()) {
2697  if (myRank == 0) {
2698  out << "Range Map: same as row Map" << endl;
2699  }
2700  }
2701  else {
2702  if (myRank == 0) {
2703  out << "Range Map: " << endl;
2704  }
2705  getRangeMap ()->describe (out, vl);
2706  }
2707  }
2708  }
2709 
2710  if (vl >= VERB_EXTREME) {
2711  const Teuchos::Comm<int>& comm = * (graph_.getMap ()->getComm ());
2712  const int myRank = comm.getRank ();
2713  const int numProcs = comm.getSize ();
2714 
2715  // Print the calling process' data to the given output stream.
2716  RCP<std::ostringstream> lclOutStrPtr (new std::ostringstream ());
2717  RCP<FancyOStream> osPtr = getFancyOStream (lclOutStrPtr);
2718  FancyOStream& os = *osPtr;
2719  os << "Process " << myRank << ":" << endl;
2720  Teuchos::OSTab tab2 (os);
2721 
2722  const map_type& meshRowMap = * (graph_.getRowMap ());
2723  const map_type& meshColMap = * (graph_.getColMap ());
2724  for (LO meshLclRow = meshRowMap.getMinLocalIndex ();
2725  meshLclRow <= meshRowMap.getMaxLocalIndex ();
2726  ++meshLclRow) {
2727  const GO meshGblRow = meshRowMap.getGlobalElement (meshLclRow);
2728  os << "Row " << meshGblRow << ": {";
2729 
2730  local_inds_host_view_type lclColInds;
2731  values_host_view_type vals;
2732  LO numInds = 0;
2733  this->getLocalRowView (meshLclRow, lclColInds, vals); numInds = lclColInds.extent(0);
2734 
2735  for (LO k = 0; k < numInds; ++k) {
2736  const GO gblCol = meshColMap.getGlobalElement (lclColInds[k]);
2737 
2738  os << "Col " << gblCol << ": [";
2739  for (LO i = 0; i < blockSize; ++i) {
2740  for (LO j = 0; j < blockSize; ++j) {
2741  os << vals[blockSize*blockSize*k + i*blockSize + j];
2742  if (j + 1 < blockSize) {
2743  os << ", ";
2744  }
2745  }
2746  if (i + 1 < blockSize) {
2747  os << "; ";
2748  }
2749  }
2750  os << "]";
2751  if (k + 1 < numInds) {
2752  os << ", ";
2753  }
2754  }
2755  os << "}" << endl;
2756  }
2757 
2758  // Print data on Process 0. This will automatically respect the
2759  // current indentation level.
2760  if (myRank == 0) {
2761  out << lclOutStrPtr->str ();
2762  lclOutStrPtr = Teuchos::null; // clear it to save space
2763  }
2764 
2765  const int sizeTag = 1337;
2766  const int dataTag = 1338;
2767 
2768  ArrayRCP<char> recvDataBuf; // only used on Process 0
2769 
2770  // Send string sizes and data from each process in turn to
2771  // Process 0, and print on that process.
2772  for (int p = 1; p < numProcs; ++p) {
2773  if (myRank == 0) {
2774  // Receive the incoming string's length.
2775  ArrayRCP<size_t> recvSize (1);
2776  recvSize[0] = 0;
2777  RCP<CommRequest<int> > recvSizeReq =
2778  ireceive<int, size_t> (recvSize, p, sizeTag, comm);
2779  wait<int> (comm, outArg (recvSizeReq));
2780  const size_t numCharsToRecv = recvSize[0];
2781 
2782  // Allocate space for the string to receive. Reuse receive
2783  // buffer space if possible. We can do this because in the
2784  // current implementation, we only have one receive in
2785  // flight at a time. Leave space for the '\0' at the end,
2786  // in case the sender doesn't send it.
2787  if (static_cast<size_t>(recvDataBuf.size()) < numCharsToRecv + 1) {
2788  recvDataBuf.resize (numCharsToRecv + 1);
2789  }
2790  ArrayRCP<char> recvData = recvDataBuf.persistingView (0, numCharsToRecv);
2791  // Post the receive of the actual string data.
2792  RCP<CommRequest<int> > recvDataReq =
2793  ireceive<int, char> (recvData, p, dataTag, comm);
2794  wait<int> (comm, outArg (recvDataReq));
2795 
2796  // Print the received data. This will respect the current
2797  // indentation level. Make sure that the string is
2798  // null-terminated.
2799  recvDataBuf[numCharsToRecv] = '\0';
2800  out << recvDataBuf.getRawPtr ();
2801  }
2802  else if (myRank == p) { // if I am not Process 0, and my rank is p
2803  // This deep-copies the string at most twice, depending on
2804  // whether std::string reference counts internally (it
2805  // generally does, so this won't deep-copy at all).
2806  const std::string stringToSend = lclOutStrPtr->str ();
2807  lclOutStrPtr = Teuchos::null; // clear original to save space
2808 
2809  // Send the string's length to Process 0.
2810  const size_t numCharsToSend = stringToSend.size ();
2811  ArrayRCP<size_t> sendSize (1);
2812  sendSize[0] = numCharsToSend;
2813  RCP<CommRequest<int> > sendSizeReq =
2814  isend<int, size_t> (sendSize, 0, sizeTag, comm);
2815  wait<int> (comm, outArg (sendSizeReq));
2816 
2817  // Send the actual string to Process 0. We know that the
2818  // string has length > 0, so it's save to take the address
2819  // of the first entry. Make a nonowning ArrayRCP to hold
2820  // the string. Process 0 will add a null termination
2821  // character at the end of the string, after it receives the
2822  // message.
2823  ArrayRCP<const char> sendData (&stringToSend[0], 0, numCharsToSend, false);
2824  RCP<CommRequest<int> > sendDataReq =
2825  isend<int, char> (sendData, 0, dataTag, comm);
2826  wait<int> (comm, outArg (sendDataReq));
2827  }
2828  } // for each process rank p other than 0
2829  } // extreme verbosity level (print the whole matrix)
2830  }
2831 
2832  template<class Scalar, class LO, class GO, class Node>
2833  Teuchos::RCP<const Teuchos::Comm<int> >
2835  getComm() const
2836  {
2837  return graph_.getComm();
2838  }
2839 
2840 
2841  template<class Scalar, class LO, class GO, class Node>
2845  {
2846  return graph_.getGlobalNumCols();
2847  }
2848 
2849 
2850  template<class Scalar, class LO, class GO, class Node>
2851  size_t
2854  {
2855  return graph_.getLocalNumCols();
2856  }
2857 
2858  template<class Scalar, class LO, class GO, class Node>
2859  GO
2862  {
2863  return graph_.getIndexBase();
2864  }
2865 
2866  template<class Scalar, class LO, class GO, class Node>
2870  {
2871  return graph_.getGlobalNumEntries();
2872  }
2873 
2874  template<class Scalar, class LO, class GO, class Node>
2875  size_t
2878  {
2879  return graph_.getLocalNumEntries();
2880  }
2881 
2882  template<class Scalar, class LO, class GO, class Node>
2883  size_t
2885  getNumEntriesInGlobalRow (GO globalRow) const
2886  {
2887  return graph_.getNumEntriesInGlobalRow(globalRow);
2888  }
2889 
2890 
2891  template<class Scalar, class LO, class GO, class Node>
2892  size_t
2895  {
2896  return graph_.getGlobalMaxNumRowEntries();
2897  }
2898 
2899  template<class Scalar, class LO, class GO, class Node>
2900  bool
2902  hasColMap() const
2903  {
2904  return graph_.hasColMap();
2905  }
2906 
2907 
2908  template<class Scalar, class LO, class GO, class Node>
2909  bool
2912  {
2913  return graph_.isLocallyIndexed();
2914  }
2915 
2916  template<class Scalar, class LO, class GO, class Node>
2917  bool
2920  {
2921  return graph_.isGloballyIndexed();
2922  }
2923 
2924  template<class Scalar, class LO, class GO, class Node>
2925  bool
2928  {
2929  return graph_.isFillComplete ();
2930  }
2931 
2932  template<class Scalar, class LO, class GO, class Node>
2933  bool
2936  {
2937  return false;
2938  }
2939 
2940  template<class Scalar, class LO, class GO, class Node>
2941  void
2943  getGlobalRowCopy (GO /*GlobalRow*/,
2944  nonconst_global_inds_host_view_type &/*Indices*/,
2945  nonconst_values_host_view_type &/*Values*/,
2946  size_t& /*NumEntries*/) const
2947  {
2948  TEUCHOS_TEST_FOR_EXCEPTION(
2949  true, std::logic_error, "Tpetra::BlockCrsMatrix::getGlobalRowCopy: "
2950  "This class doesn't support global matrix indexing.");
2951 
2952  }
2953 
2954  template<class Scalar, class LO, class GO, class Node>
2955  void
2957  getGlobalRowView (GO /* GlobalRow */,
2958  global_inds_host_view_type &/* indices */,
2959  values_host_view_type &/* values */) const
2960  {
2961  TEUCHOS_TEST_FOR_EXCEPTION(
2962  true, std::logic_error, "Tpetra::BlockCrsMatrix::getGlobalRowView: "
2963  "This class doesn't support global matrix indexing.");
2964 
2965  }
2966 
2967  template<class Scalar, class LO, class GO, class Node>
2968  void
2970  getLocalRowView (LO localRowInd,
2971  local_inds_host_view_type &colInds,
2972  values_host_view_type &vals) const
2973  {
2974  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
2975  colInds = local_inds_host_view_type();
2976  vals = values_host_view_type();
2977  }
2978  else {
2979  const size_t absBlockOffsetStart = ptrHost_[localRowInd];
2980  const size_t numInds = ptrHost_[localRowInd + 1] - absBlockOffsetStart;
2981  colInds = local_inds_host_view_type(indHost_.data()+absBlockOffsetStart, numInds);
2982 
2983  vals = getValuesHost (localRowInd);
2984  }
2985  }
2986 
2987  template<class Scalar, class LO, class GO, class Node>
2988  void
2991  local_inds_host_view_type &colInds,
2992  nonconst_values_host_view_type &vals) const
2993  {
2994  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
2995  colInds = nonconst_local_inds_host_view_type();
2996  vals = nonconst_values_host_view_type();
2997  }
2998  else {
2999  const size_t absBlockOffsetStart = ptrHost_[localRowInd];
3000  const size_t numInds = ptrHost_[localRowInd + 1] - absBlockOffsetStart;
3001  colInds = local_inds_host_view_type(indHost_.data()+absBlockOffsetStart, numInds);
3002 
3003  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
3004  vals = const_cast<this_BCRS_type&>(*this).getValuesHostNonConst(localRowInd);
3005  }
3006  }
3007 
3008  template<class Scalar, class LO, class GO, class Node>
3009  void
3012  {
3013  const size_t lclNumMeshRows = graph_.getLocalNumRows ();
3014 
3015  Kokkos::View<size_t*, device_type> diagOffsets ("diagOffsets", lclNumMeshRows);
3016  graph_.getLocalDiagOffsets (diagOffsets);
3017 
3018  // The code below works on host, so use a host View.
3019  auto diagOffsetsHost = Kokkos::create_mirror_view (diagOffsets);
3020  // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
3021  Kokkos::deep_copy (execution_space(), diagOffsetsHost, diagOffsets);
3022 
3023  auto vals_host_out = val_.getHostView(Access::ReadOnly);
3024  const impl_scalar_type* vals_host_out_raw = vals_host_out.data();
3025 
3026  // TODO amk: This is a temporary measure to make the code run with Ifpack2
3027  size_t rowOffset = 0;
3028  size_t offset = 0;
3029  LO bs = getBlockSize();
3030  for(size_t r=0; r<getLocalNumRows(); r++)
3031  {
3032  // move pointer to start of diagonal block
3033  offset = rowOffset + diagOffsetsHost(r)*bs*bs;
3034  for(int b=0; b<bs; b++)
3035  {
3036  diag.replaceLocalValue(r*bs+b, vals_host_out_raw[offset+b*(bs+1)]);
3037  }
3038  // move pointer to start of next block row
3039  rowOffset += getNumEntriesInLocalRow(r)*bs*bs;
3040  }
3041 
3042  //diag.template sync<memory_space> (); // sync vec of diag entries back to dev
3043  }
3044 
3045  template<class Scalar, class LO, class GO, class Node>
3046  void
3048  leftScale (const ::Tpetra::Vector<Scalar, LO, GO, Node>& /* x */)
3049  {
3050  TEUCHOS_TEST_FOR_EXCEPTION(
3051  true, std::logic_error, "Tpetra::BlockCrsMatrix::leftScale: "
3052  "not implemented.");
3053 
3054  }
3055 
3056  template<class Scalar, class LO, class GO, class Node>
3057  void
3059  rightScale (const ::Tpetra::Vector<Scalar, LO, GO, Node>& /* x */)
3060  {
3061  TEUCHOS_TEST_FOR_EXCEPTION(
3062  true, std::logic_error, "Tpetra::BlockCrsMatrix::rightScale: "
3063  "not implemented.");
3064 
3065  }
3066 
3067  template<class Scalar, class LO, class GO, class Node>
3068  Teuchos::RCP<const ::Tpetra::RowGraph<LO, GO, Node> >
3070  getGraph() const
3071  {
3072  return graphRCP_;
3073  }
3074 
3075  template<class Scalar, class LO, class GO, class Node>
3076  typename ::Tpetra::RowMatrix<Scalar, LO, GO, Node>::mag_type
3079  {
3080  TEUCHOS_TEST_FOR_EXCEPTION(
3081  true, std::logic_error, "Tpetra::BlockCrsMatrix::getFrobeniusNorm: "
3082  "not implemented.");
3083  }
3084 
3085 } // namespace Tpetra
3086 
3087 //
3088 // Explicit instantiation macro
3089 //
3090 // Must be expanded from within the Tpetra namespace!
3091 //
3092 #define TPETRA_BLOCKCRSMATRIX_INSTANT(S,LO,GO,NODE) \
3093  template class BlockCrsMatrix< S, LO, GO, NODE >;
3094 
3095 #endif // TPETRA_BLOCKCRSMATRIX_DEF_HPP
virtual void getLocalRowCopy(LO LocalRow, nonconst_local_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const override
Not implemented.
size_t getNumEntriesInLocalRow(const LO localRowInd) const override
Return the number of entries in the given row on the calling process.
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
virtual size_t getLocalNumEntries() const override
The local number of stored (structurally nonzero) entries.
virtual global_size_t getGlobalNumCols() const override
The global number of columns of this matrix.
typename BMV::impl_scalar_type impl_scalar_type
The implementation type of entries in the matrix.
void getLocalDiagCopy(const Kokkos::View< impl_scalar_type ***, device_type, Kokkos::MemoryUnmanaged > &diag, const Kokkos::View< const size_t *, device_type, Kokkos::MemoryUnmanaged > &offsets) const
Variant of getLocalDiagCopy() that uses precomputed offsets and puts diagonal blocks in a 3-D Kokkos:...
void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel) const override
Print a description of this object to the given output stream.
Kokkos::View< impl_scalar_type **, Impl::BlockCrsMatrixLittleBlockArrayLayout, device_type, Kokkos::MemoryTraits< Kokkos::Unmanaged > > little_block_type
The type used to access nonconst matrix blocks.
virtual typename::Tpetra::RowMatrix< Scalar, LO, GO, Node >::mag_type getFrobeniusNorm() const override
The Frobenius norm of the matrix.
LO replaceLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Replace values at the given (mesh, i.e., block) column indices, in the given (mesh, i.e., block) row.
virtual void leftScale(const ::Tpetra::Vector< Scalar, LO, GO, Node > &x) override
Scale the RowMatrix on the left with the given Vector x.
Teuchos::RCP< const map_type > getColMap() const override
Returns the Map that describes the column distribution in this graph.
LO replaceLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like replaceLocalValues, but avoids computing row offsets.
std::string description() const override
One-line description of this object.
Sparse matrix whose entries are small dense square blocks, all of the same dimensions.
typename DistObject< Scalar, LO, GO, Node >::buffer_device_type buffer_device_type
Kokkos::Device specialization for communication buffers.
void exportAndFillComplete(Teuchos::RCP< BlockCrsMatrix< Scalar, LO, GO, Node > > &destMatrix, const Export< LO, GO, Node > &exporter) const
Import from this to the given destination matrix, and make the result fill complete.
virtual bool hasColMap() const override
Whether this matrix has a well-defined column Map.
virtual bool isLocallyIndexed() const override
Whether matrix indices are locally indexed.
LO local_ordinal_type
The type of local indices.
static KOKKOS_INLINE_FUNCTION size_t unpackValue(LO &outVal, const char inBuf[])
Unpack the given value from the given output buffer.
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
void getLocalRowViewNonConst(LO LocalRow, local_inds_host_view_type &indices, nonconst_values_host_view_type &values) const
size_t getNumEntriesInLocalRow(local_ordinal_type localRow) const override
Get the number of entries in the given row (local index).
bool isConstantStride() const
Whether this multivector has constant stride between columns.
void setAllToScalar(const Scalar &alpha)
Set all matrix entries equal to alpha.
One or more distributed dense vectors.
Kokkos::StaticCrsGraph< local_ordinal_type, Kokkos::LayoutLeft, device_type, void, size_t > local_graph_device_type
The type of the part of the sparse graph on each MPI process.
Teuchos::RCP< const map_type > getDomainMap() const override
Returns the Map associated with the domain of this graph.
Linear algebra kernels for small dense matrices and vectors.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
Scalar scalar_type
The type of entries in the matrix (that is, of each entry in each block).
KOKKOS_INLINE_FUNCTION void AXPY(const CoefficientType &alpha, const ViewType1 &x, const ViewType2 &y)
y := y + alpha * x (dense vector or matrix update)
size_t getLocalNumRows() const override
get the local number of block rows
virtual Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which this matrix is distributed.
Kokkos::LayoutRight BlockCrsMatrixLittleBlockArrayLayout
give an option to use layoutleft
virtual void rightScale(const ::Tpetra::Vector< Scalar, LO, GO, Node > &x) override
Scale the RowMatrix on the right with the given Vector x.
void getLocalRowView(LO LocalRow, local_inds_host_view_type &indices, values_host_view_type &values) const override
Get a view of the (mesh, i.e., block) row, using local (mesh, i.e., block) indices.
static map_type makePointMap(const map_type &meshMap, const LO blockSize)
Create and return the point Map corresponding to the given mesh Map and block size.
local_ordinal_type getMinLocalIndex() const
The minimum local index.
MultiVector for multiple degrees of freedom per mesh point.
virtual void copyAndPermute(const SrcDistObject &sourceObj, const size_t numSameIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteToLIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteFromLIDs, const CombineMode CM) override
KOKKOS_INLINE_FUNCTION void absMax(const ViewType2 &Y, const ViewType1 &X)
Implementation of Tpetra&#39;s ABSMAX CombineMode for the small dense blocks in BlockCrsMatrix, and the small dense vectors in BlockMultiVector and BlockVector.
Teuchos::RCP< const map_type > getColMap() const override
get the (mesh) map for the columns of this block matrix.
virtual size_t getGlobalMaxNumRowEntries() const override
The maximum number of entries in any row over all processes in the matrix&#39;s communicator.
size_t global_size_t
Global size_t object.
virtual void getGlobalRowView(GO GlobalRow, global_inds_host_view_type &indices, values_host_view_type &values) const override
Get a constant, nonpersisting, globally indexed view of the given row of the matrix.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
BlockCrsMatrix()
Default constructor: Makes an empty block matrix.
LO absMaxLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Variant of getLocalDiagCopy() that uses precomputed offsets and puts diagonal blocks in a 3-D Kokkos:...
LO getBlockSize() const
Get the number of degrees of freedom per mesh point.
Teuchos::RCP< const map_type > getDomainMap() const override
Get the (point) domain Map of this matrix.
Teuchos::RCP< const map_type > getRowMap() const override
get the (mesh) map for the rows of this block matrix.
static KOKKOS_INLINE_FUNCTION size_t packValue(char outBuf[], const LO &inVal)
Pack the given value of type value_type into the given output buffer of bytes (char).
Insert new values that don&#39;t currently exist.
bool isSorted() const
Whether graph indices in all rows are known to be sorted.
LO sumIntoLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like sumIntoLocalValues, but avoids computing row offsets.
global_size_t getGlobalNumRows() const override
get the global number of block rows
global_ordinal_type getGlobalElement(local_ordinal_type localIndex) const
The global index corresponding to the given local index.
void replaceLocalValue(const LocalOrdinal myRow, const Scalar &value)
Replace current value at the specified location with specified values.
local_ordinal_type getMaxLocalIndex() const
The maximum local index on the calling process.
bool isNodeLocalElement(local_ordinal_type localIndex) const
Whether the given local index is valid for this Map on the calling process.
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
virtual Teuchos::RCP< const ::Tpetra::RowGraph< LO, GO, Node > > getGraph() const override
Get the (mesh) graph.
virtual GO getIndexBase() const override
The index base for global indices in this matrix.
void applyBlock(const BlockMultiVector< Scalar, LO, GO, Node > &X, BlockMultiVector< Scalar, LO, GO, Node > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, const Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), const Scalar beta=Teuchos::ScalarTraits< Scalar >::zero())
Version of apply() that takes BlockMultiVector input and output.
CombineMode
Rule for combining data in an Import or Export.
Sum new values.
virtual bool isGloballyIndexed() const override
Whether matrix indices are globally indexed.
LO getLocalRowOffsets(const LO localRowInd, ptrdiff_t offsets[], const LO colInds[], const LO numColInds) const
Get relative offsets corresponding to the given rows, given by local row index.
size_t getLocalMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, on this process.
virtual global_size_t getGlobalNumEntries() const override
The global number of stored (structurally nonzero) entries.
Replace old value with maximum of magnitudes of old and new values.
void apply(const mv_type &X, mv_type &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), Scalar beta=Teuchos::ScalarTraits< Scalar >::zero()) const override
For this matrix A, compute Y := beta * Y + alpha * Op(A) * X.
Replace existing values with new values.
device_type::execution_space execution_space
The Kokkos execution space that this class uses.
Teuchos::RCP< const map_type > getRangeMap() const override
Returns the Map associated with the domain of this graph.
void enableWDVTracking()
Enable WrappedDualView reference-count tracking and syncing. Call this after exiting a host-parallel ...
Replace old values with zero.
Teuchos::RCP< const map_type > getRangeMap() const override
Get the (point) range Map of this matrix.
KOKKOS_INLINE_FUNCTION void COPY(const ViewType1 &x, const ViewType2 &y)
Deep copy x into y, where x and y are either rank 1 (vectors) or rank 2 (matrices) with the same dime...
std::string dualViewStatusToString(const DualViewType &dv, const char name[])
Return the status of the given Kokkos::DualView, as a human-readable string.
void getLocalDiagOffsets(const Kokkos::View< size_t *, device_type, Kokkos::MemoryUnmanaged > &offsets) const
Get offsets of the diagonal entries in the matrix.
static KOKKOS_INLINE_FUNCTION size_t packValueCount(const LO &)
Number of bytes required to pack or unpack the given value of type value_type.
char packet_type
Implementation detail; tells.
local_matrix_device_type getLocalMatrixDevice() const
void disableWDVTracking()
Disable WrappedDualView reference-count tracking and syncing. Call this before entering a host-parall...
local_ordinal_type getLocalElement(global_ordinal_type globalIndex) const
The local index corresponding to the given global index.
A distributed dense vector.
virtual bool supportsRowViews() const override
Whether this object implements getLocalRowView() and getGlobalRowView().
virtual void packAndPrepare(const SrcDistObject &sourceObj, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &exportLIDs, Kokkos::DualView< packet_type *, buffer_device_type > &exports, Kokkos::DualView< size_t *, buffer_device_type > numPacketsPerLID, size_t &constantNumPackets) override
virtual void unpackAndCombine(const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &importLIDs, Kokkos::DualView< packet_type *, buffer_device_type > imports, Kokkos::DualView< size_t *, buffer_device_type > numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode) override
virtual void getGlobalRowCopy(GO GlobalRow, nonconst_global_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const override
Get a copy of the given global row&#39;s entries.
bool locallySameAs(const Map< local_ordinal_type, global_ordinal_type, node_type > &map) const
Is this Map locally the same as the input Map?
virtual bool isFillComplete() const override
Whether fillComplete() has been called.
impl_scalar_type_dualview::t_host getValuesHostNonConst() const
Get the host or device View of the matrix&#39;s values (val_).
Base class for distributed Tpetra objects that support data redistribution.
LO sumIntoLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Sum into values at the given (mesh, i.e., block) column indices, in the given (mesh, i.e., block) row.
virtual size_t getLocalNumCols() const override
The number of columns needed to apply the forward operator on this node.
void importAndFillComplete(Teuchos::RCP< BlockCrsMatrix< Scalar, LO, GO, Node > > &destMatrix, const Import< LO, GO, Node > &importer) const
Import from this to the given destination matrix, and make the result fill complete.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra&#39;s behavior.
virtual size_t getNumEntriesInGlobalRow(GO globalRow) const override
The current number of entries on the calling process in the specified global row. ...