Ifpack2_BlockTriDiContainer_impl.hpp
/*@HEADER
// ***********************************************************************
//
// Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package
// Copyright (2009) Sandia Corporation
//
// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
// license for use of this work by or on behalf of the U.S. Government.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
//
// ***********************************************************************
//@HEADER
*/

#ifndef IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
#define IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP

#include <Tpetra_Distributor.hpp>
#include <Tpetra_BlockMultiVector.hpp>

#include <Kokkos_ArithTraits.hpp>
#include <KokkosBatched_Util.hpp>
#include <KokkosBatched_Vector.hpp>
#include <KokkosBatched_Copy_Decl.hpp>
#include <KokkosBatched_Copy_Impl.hpp>
#include <KokkosBatched_AddRadial_Decl.hpp>
#include <KokkosBatched_AddRadial_Impl.hpp>
#include <KokkosBatched_SetIdentity_Decl.hpp>
#include <KokkosBatched_SetIdentity_Impl.hpp>
#include <KokkosBatched_Gemm_Decl.hpp>
#include <KokkosBatched_Gemm_Serial_Impl.hpp>
#include <KokkosBatched_Gemm_Team_Impl.hpp>
#include <KokkosBatched_Gemv_Decl.hpp>
#include <KokkosBatched_Gemv_Serial_Impl.hpp>
#include <KokkosBatched_Gemv_Team_Impl.hpp>
#include <KokkosBatched_Trsm_Decl.hpp>
#include <KokkosBatched_Trsm_Serial_Impl.hpp>
#include <KokkosBatched_Trsm_Team_Impl.hpp>
#include <KokkosBatched_Trsv_Decl.hpp>
#include <KokkosBatched_Trsv_Serial_Impl.hpp>
#include <KokkosBatched_Trsv_Team_Impl.hpp>
#include <KokkosBatched_LU_Decl.hpp>
#include <KokkosBatched_LU_Serial_Impl.hpp>
#include <KokkosBatched_LU_Team_Impl.hpp>

#include <KokkosBlas1_nrm1.hpp>
#include <KokkosBlas1_nrm2.hpp>

#include <memory>
#include <sstream> // std::stringstream, used by get_msg_prefix below

// TODO: expose this through a CMake variable (or enable the flag only when it is necessary)
#define IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE
//#undef IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE
#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
#include "cuda_profiler_api.h"
#endif

// Not 100% sure that MPI-3 features behave well together with CUDA.
#if MPI_VERSION >= 3
#define IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3
#endif

namespace Ifpack2 {

  namespace BlockTriDiContainerDetails {

    using do_not_initialize_tag = Kokkos::ViewAllocateWithoutInitializing;

    template <typename MemoryTraitsType, Kokkos::MemoryTraitsFlags flag>
    using MemoryTraits = Kokkos::MemoryTraits<MemoryTraitsType::Unmanaged |
                                              MemoryTraitsType::RandomAccess |
                                              flag>;

    template <typename ViewType>
    using Unmanaged = Kokkos::View<typename ViewType::data_type,
                                   typename ViewType::array_layout,
                                   typename ViewType::device_type,
                                   MemoryTraits<typename ViewType::memory_traits,Kokkos::Unmanaged> >;
    template <typename ViewType>
    using Atomic = Kokkos::View<typename ViewType::data_type,
                                typename ViewType::array_layout,
                                typename ViewType::device_type,
                                MemoryTraits<typename ViewType::memory_traits,Kokkos::Atomic> >;
    template <typename ViewType>
    using Const = Kokkos::View<typename ViewType::const_data_type,
                               typename ViewType::array_layout,
                               typename ViewType::device_type,
                               typename ViewType::memory_traits>;
    template <typename ViewType>
    using ConstUnmanaged = Const<Unmanaged<ViewType> >;

    template <typename ViewType>
    using AtomicUnmanaged = Atomic<Unmanaged<ViewType> >;
    template <typename ViewType>
    using Scratch = Kokkos::View<typename ViewType::data_type,
                                 typename ViewType::array_layout,
                                 typename ViewType::execution_space::scratch_memory_space,
                                 MemoryTraits<typename ViewType::memory_traits, Kokkos::Unmanaged> >;

    template<typename T> struct is_cuda { enum : bool { value = false }; };
#if defined(KOKKOS_ENABLE_CUDA)
    template<> struct is_cuda<Kokkos::Cuda> { enum : bool { value = true }; };
#endif

    template<typename CommPtrType>
    std::string get_msg_prefix (const CommPtrType &comm) {
      const auto rank = comm->getRank();
      const auto nranks = comm->getSize();
      std::stringstream ss;
      ss << "Rank " << rank << " of " << nranks << ": ";
      return ss.str();
    }

    template<typename T, int N>
    struct ArrayValueType {
      T v[N];
      KOKKOS_INLINE_FUNCTION
      ArrayValueType() {
        for (int i=0;i<N;++i)
          this->v[i] = 0;
      }
      KOKKOS_INLINE_FUNCTION
      ArrayValueType(const ArrayValueType &b) {
        for (int i=0;i<N;++i)
          this->v[i] = b.v[i];
      }
    };
    template<typename T, int N>
    static
    KOKKOS_INLINE_FUNCTION
    void
    operator+=(volatile ArrayValueType<T,N> &a,
               volatile const ArrayValueType<T,N> &b) {
      for (int i=0;i<N;++i)
        a.v[i] += b.v[i];
    }
    template<typename T, int N>
    static
    KOKKOS_INLINE_FUNCTION
    void
    operator+=(ArrayValueType<T,N> &a,
               const ArrayValueType<T,N> &b) {
      for (int i=0;i<N;++i)
        a.v[i] += b.v[i];
    }

    template<typename T, int N, typename ExecSpace>
    struct SumReducer {
      typedef SumReducer reducer;
      typedef ArrayValueType<T,N> value_type;
      typedef Kokkos::View<value_type,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
      value_type *value;

      KOKKOS_INLINE_FUNCTION
      SumReducer(value_type &val) : value(&val) {}

      KOKKOS_INLINE_FUNCTION
      void join(value_type &dst, value_type &src) const {
        for (int i=0;i<N;++i)
          dst.v[i] += src.v[i];
      }
      KOKKOS_INLINE_FUNCTION
      void join(volatile value_type &dst, const volatile value_type &src) const {
        for (int i=0;i<N;++i)
          dst.v[i] += src.v[i];
      }
      KOKKOS_INLINE_FUNCTION
      void init(value_type &val) const {
        for (int i=0;i<N;++i)
          val.v[i] = Kokkos::reduction_identity<T>::sum();
      }
      KOKKOS_INLINE_FUNCTION
      value_type& reference() {
        return *value;
      }
      KOKKOS_INLINE_FUNCTION
      result_view_type view() const {
        return result_view_type(value);
      }
    };
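
    // Usage sketch: SumReducer accumulates N independent sums in a single
    // parallel_reduce pass. E.g., with three counters over nrows rows:
    //
    //   ArrayValueType<size_type,3> sums; // sums.v[0..2] start at 0
    //   Kokkos::parallel_reduce
    //     (Kokkos::RangePolicy<host_execution_space>(0, nrows),
    //      KOKKOS_LAMBDA(const local_ordinal_type &i, ArrayValueType<size_type,3> &update) {
    //        ++update.v[0]; // classify row i and bump the matching counter
    //      }, SumReducer<size_type,3,host_execution_space>(sums));
    //
    // This is exactly the pattern performSymbolicPhase uses below to count
    // the D and R nonzeros in one sweep.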

#if defined(HAVE_IFPACK2_BLOCKTRIDICONTAINER_TIMERS)
#define IFPACK2_BLOCKTRIDICONTAINER_TIMER(label) TEUCHOS_FUNC_TIME_MONITOR(label);
#else
#define IFPACK2_BLOCKTRIDICONTAINER_TIMER(label)
#endif

#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN(label) \
  cudaProfilerStart(); \
  IFPACK2_BLOCKTRIDICONTAINER_TIMER(label)
#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END \
  { cudaProfilerStop(); }
#else
#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN(label) \
  IFPACK2_BLOCKTRIDICONTAINER_TIMER(label)
#define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END
#endif

    template <typename MatrixType>
    struct ImplType {
      typedef size_t size_type;
      typedef typename MatrixType::scalar_type scalar_type;
      typedef typename MatrixType::local_ordinal_type local_ordinal_type;
      typedef typename MatrixType::global_ordinal_type global_ordinal_type;
      typedef typename MatrixType::node_type node_type;

      typedef typename Kokkos::Details::ArithTraits<scalar_type>::val_type impl_scalar_type;
      typedef typename Kokkos::ArithTraits<impl_scalar_type>::mag_type magnitude_type;

      typedef Kokkos::DefaultHostExecutionSpace host_execution_space;

      typedef typename node_type::device_type device_type;
      typedef typename device_type::execution_space execution_space;
      typedef typename device_type::memory_space memory_space;
      typedef Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_multivector_type;
      typedef Tpetra::Map<local_ordinal_type,global_ordinal_type,node_type> tpetra_map_type;
      typedef Tpetra::Import<local_ordinal_type,global_ordinal_type,node_type> tpetra_import_type;
      typedef Tpetra::RowMatrix<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_row_matrix_type;
      typedef Tpetra::BlockCrsMatrix<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_block_crs_matrix_type;
      typedef typename tpetra_block_crs_matrix_type::little_block_type tpetra_block_access_view_type;
      typedef Tpetra::BlockMultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_block_multivector_type;
      typedef typename tpetra_block_crs_matrix_type::crs_graph_type::local_graph_type local_crs_graph_type;

      template<typename T, int l> using Vector = KokkosBatched::Experimental::Vector<T,l>;
      template<typename T> using SIMD = KokkosBatched::Experimental::SIMD<T>;
      template<typename T, typename M> using DefaultVectorLength = KokkosBatched::Experimental::DefaultVectorLength<T,M>;
      template<typename T, typename M> using DefaultInternalVectorLength = KokkosBatched::Experimental::DefaultInternalVectorLength<T,M>;

      static constexpr int vector_length = DefaultVectorLength<impl_scalar_type,memory_space>::value;
      static constexpr int internal_vector_length = DefaultInternalVectorLength<impl_scalar_type,memory_space>::value;
      typedef Vector<SIMD<impl_scalar_type>,vector_length> vector_type;
      typedef Vector<SIMD<impl_scalar_type>,internal_vector_length> internal_vector_type;

      typedef Kokkos::View<size_type*,device_type> size_type_1d_view;
      typedef Kokkos::View<local_ordinal_type*,device_type> local_ordinal_type_1d_view;
      // tpetra block crs values
      typedef Kokkos::View<impl_scalar_type*,device_type> impl_scalar_type_1d_view;
      // tpetra multivector values (layout left): may need to specify the typename more explicitly
      typedef Kokkos::View<impl_scalar_type**,Kokkos::LayoutLeft,device_type> impl_scalar_type_2d_view;
      // packed data always uses layout right
      typedef Kokkos::View<vector_type*,device_type> vector_type_1d_view;
      typedef Kokkos::View<vector_type***,Kokkos::LayoutRight,device_type> vector_type_3d_view;
      typedef Kokkos::View<internal_vector_type***,Kokkos::LayoutRight,device_type> internal_vector_type_3d_view;
      typedef Kokkos::View<internal_vector_type****,Kokkos::LayoutRight,device_type> internal_vector_type_4d_view;
      typedef Kokkos::View<impl_scalar_type***,Kokkos::LayoutRight,device_type> impl_scalar_type_3d_view;
      typedef Kokkos::View<impl_scalar_type****,Kokkos::LayoutRight,device_type> impl_scalar_type_4d_view;
    };
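
    // Note on the SIMD layout: vector_type is a KokkosBatched SIMD pack of
    // vector_length scalars, so a vector_type_3d_view of extents
    // (num_blocks, blocksize, blocksize) physically stores vector_length
    // block-tridiagonal systems interleaved element by element. E.g., if
    // vector_length were 8, values(i,j,k) would hold the (j,k) entries of
    // block i for 8 different parts at once, with the pack lane selecting
    // the part. internal_vector_type is the shorter pack width used inside
    // team kernels (vector_length/internal_vector_length slices per pack).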

    template<typename MatrixType>
    Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_import_type>
    createBlockCrsTpetraImporter(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A) {
      IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::CreateBlockCrsTpetraImporter");
      using impl_type = ImplType<MatrixType>;
      using tpetra_map_type = typename impl_type::tpetra_map_type;
      using tpetra_mv_type = typename impl_type::tpetra_block_multivector_type;
      using tpetra_import_type = typename impl_type::tpetra_import_type;

      const auto g = A->getCrsGraph(); // tpetra crs graph object
      const auto blocksize = A->getBlockSize();
      const auto src = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getDomainMap(), blocksize)));
      const auto tgt = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getColMap() , blocksize)));

      return Teuchos::rcp(new tpetra_import_type(src, tgt));
    }

    // Partial replacement for forward-mode MultiVector::doImport.
    // Permits overlapped communication and computation, but also supports a
    // fully synchronized exchange. I'm finding that overlapped comm/comp can
    // give quite poor performance on some platforms, so we can't just use it
    // unconditionally.
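    //
    // Typical usage sketch (this is what syncExchange below does):
    //
    //   importer.asyncSendRecv(mv);  // post MPI_Irecv/MPI_Isend, pack send buffers
    //   /* ... compute on owned rows while messages are in flight ... */
    //   importer.syncRecv();         // wait and unpack into remote_multivector
    //
    // The synchronous mode is just the two calls back to back with no work in
    // between.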

    template<typename MatrixType>
    struct AsyncableImport {
    public:
      using impl_type = ImplType<MatrixType>;

    private:
#if !defined(HAVE_IFPACK2_MPI)
      typedef int MPI_Request;
      typedef int MPI_Comm;
#endif
      using scalar_type = typename impl_type::scalar_type;

      template <typename T>
      static int isend(const MPI_Comm comm, const T* buf, int count, int dest, int tag, MPI_Request* ireq) {
#ifdef HAVE_IFPACK2_MPI
        MPI_Request ureq;
        const auto dt = Teuchos::Details::MpiTypeTraits<scalar_type>::getType();
        int ret = MPI_Isend(const_cast<T*>(buf), count, dt, dest, tag, comm, ireq == NULL ? &ureq : ireq);
        if (ireq == NULL) MPI_Request_free(&ureq);
        return ret;
#else
        return 0;
#endif
      }

      template <typename T>
      static int irecv(const MPI_Comm comm, T* buf, int count, int src, int tag, MPI_Request* ireq) {
#ifdef HAVE_IFPACK2_MPI
        MPI_Request ureq;
        const auto dt = Teuchos::Details::MpiTypeTraits<scalar_type>::getType();
        int ret = MPI_Irecv(buf, count, dt, src, tag, comm, ireq == NULL ? &ureq : ireq);
        if (ireq == NULL) MPI_Request_free(&ureq);
        return ret;
#else
        return 0;
#endif
      }

      static int waitany(int count, MPI_Request* reqs, int* index) {
#ifdef HAVE_IFPACK2_MPI
        return MPI_Waitany(count, reqs, index, MPI_STATUS_IGNORE);
#else
        return 0;
#endif
      }

      static int waitall(int count, MPI_Request* reqs) {
#ifdef HAVE_IFPACK2_MPI
        return MPI_Waitall(count, reqs, MPI_STATUSES_IGNORE);
#else
        return 0;
#endif
      }

    public:
      using tpetra_map_type = typename impl_type::tpetra_map_type;
      using tpetra_import_type = typename impl_type::tpetra_import_type;

      using local_ordinal_type = typename impl_type::local_ordinal_type;
      using global_ordinal_type = typename impl_type::global_ordinal_type;
      using size_type = typename impl_type::size_type;
      using impl_scalar_type = typename impl_type::impl_scalar_type;

      using host_execution_space = typename impl_type::host_execution_space;
      using int_1d_view_host = Kokkos::View<int*,host_execution_space>;
      using local_ordinal_type_1d_view_host = typename impl_type::local_ordinal_type_1d_view::HostMirror;

      using execution_space = typename impl_type::execution_space;
      using memory_space = typename impl_type::memory_space;
      using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
      using size_type_1d_view = typename impl_type::size_type_1d_view;
      using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
      using impl_scalar_type_2d_view = typename impl_type::impl_scalar_type_2d_view;

#ifdef HAVE_IFPACK2_MPI
      MPI_Comm comm;
#endif
      impl_scalar_type_2d_view remote_multivector;
      local_ordinal_type blocksize;

      template<typename T>
      struct SendRecvPair {
        T send, recv;
      };

      // (s)end and (r)eceive data:
      SendRecvPair<int_1d_view_host> pids;           // mpi ranks
      SendRecvPair<std::vector<MPI_Request> > reqs;  // MPI_Request is a pointer; cannot use a Kokkos view
      SendRecvPair<size_type_1d_view> offset;        // offsets into the local id list and data buffer
      SendRecvPair<local_ordinal_type_1d_view> lids; // local id list
      SendRecvPair<impl_scalar_type_1d_view> buffer; // data buffer

      local_ordinal_type_1d_view dm2cm;              // permutation

      // for cuda
    public:
      void setOffsetValues(const Teuchos::ArrayView<const size_t> &lens,
                           const size_type_1d_view &offs) {
        // wrap lens in a host view and deep copy to device
        Kokkos::View<size_t*,host_execution_space> lens_host(const_cast<size_t*>(lens.getRawPtr()), lens.size());
        const auto lens_device = Kokkos::create_mirror_view(memory_space(), lens_host);
        Kokkos::deep_copy(lens_device, lens_host);

        // exclusive scan
        const Kokkos::RangePolicy<execution_space> policy(0,offs.extent(0));
        const local_ordinal_type lens_size = lens_device.extent(0);
        Kokkos::parallel_scan
          ("AsyncableImport::RangePolicy::setOffsetValues",
           policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
            if (final)
              offs(i) = update;
            update += (i < lens_size ? lens_device[i] : 0);
          });
      }
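
      // Example: for message lengths lens = {2, 3, 4} and offs of extent 4,
      // the scan above produces offs = {0, 2, 5, 9}; offs(i) is where message
      // i starts in the packed buffer, and the last entry is the total length.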

    private:
      void createMpiRequests(const tpetra_import_type &import) {
        Tpetra::Distributor &distributor = import.getDistributor();

        // copy pids from the distributor
        const auto pids_from = distributor.getProcsFrom();
        pids.recv = int_1d_view_host(do_not_initialize_tag("pids recv"), pids_from.size());
        memcpy(pids.recv.data(), pids_from.getRawPtr(), sizeof(int)*pids.recv.extent(0));

        const auto pids_to = distributor.getProcsTo();
        pids.send = int_1d_view_host(do_not_initialize_tag("pids send"), pids_to.size());
        memcpy(pids.send.data(), pids_to.getRawPtr(), sizeof(int)*pids.send.extent(0));

        // mpi requests
        reqs.recv.resize(pids.recv.extent(0)); memset(reqs.recv.data(), 0, reqs.recv.size()*sizeof(MPI_Request));
        reqs.send.resize(pids.send.extent(0)); memset(reqs.send.data(), 0, reqs.send.size()*sizeof(MPI_Request));

        // construct offsets
        const auto lengths_to = distributor.getLengthsTo();
        offset.send = size_type_1d_view(do_not_initialize_tag("offset send"), lengths_to.size() + 1);

        const auto lengths_from = distributor.getLengthsFrom();
        offset.recv = size_type_1d_view(do_not_initialize_tag("offset recv"), lengths_from.size() + 1);

        setOffsetValues(lengths_to, offset.send);
        setOffsetValues(lengths_from, offset.recv);
      }

      void createSendRecvIDs(const tpetra_import_type &import) {
        // For each remote PID, the list of LIDs to receive.
        const auto remote_lids = import.getRemoteLIDs();
        const local_ordinal_type_1d_view_host
          remote_lids_view_host(const_cast<local_ordinal_type*>(remote_lids.getRawPtr()), remote_lids.size());
        lids.recv = local_ordinal_type_1d_view(do_not_initialize_tag("lids recv"), remote_lids.size());
        Kokkos::deep_copy(lids.recv, remote_lids_view_host);

        // For each export PID, the list of LIDs to send.
        auto epids = import.getExportPIDs();
        auto elids = import.getExportLIDs();
        TEUCHOS_ASSERT(epids.size() == elids.size());
        lids.send = local_ordinal_type_1d_view(do_not_initialize_tag("lids send"), elids.size());
        auto lids_send_host = Kokkos::create_mirror_view(lids.send);

        // naive search (not sure if pids or epids are sorted)
        for (local_ordinal_type cnt=0,i=0,iend=pids.send.extent(0);i<iend;++i) {
          const auto pid_send_value = pids.send[i];
          for (local_ordinal_type j=0,jend=epids.size();j<jend;++j)
            if (epids[j] == pid_send_value) lids_send_host[cnt++] = elids[j];
#if !defined(__CUDA_ARCH__)
          TEUCHOS_ASSERT(static_cast<size_t>(cnt) == offset.send[i+1]);
#endif
        }
        Kokkos::deep_copy(lids.send, lids_send_host);
      }

    public:
      // for cuda, all tag types are public
      struct ToBuffer {};
      struct ToMultiVector {};
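
      // PackTag selects the copy direction in copy() below: ToBuffer gathers
      // rows of the multivector into the contiguous send buffer before
      // MPI_Isend; ToMultiVector scatters a received buffer back into
      // remote_multivector after the corresponding MPI_Waitany completes.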

      // for cuda, the kernel launch should be public too.
      template<typename PackTag>
      static
      void copy(const local_ordinal_type_1d_view &lids_,
                const impl_scalar_type_1d_view &buffer_,
                const local_ordinal_type &ibeg_,
                const local_ordinal_type &iend_,
                const impl_scalar_type_2d_view &multivector_,
                const local_ordinal_type blocksize_) {
        IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::Copy");
        const local_ordinal_type num_vectors = multivector_.extent(1);
        const local_ordinal_type mv_blocksize = blocksize_*num_vectors;
        const local_ordinal_type idiff = iend_ - ibeg_;
        const auto abase = buffer_.data() + mv_blocksize*ibeg_;
        if (is_cuda<execution_space>::value) {
#if defined(KOKKOS_ENABLE_CUDA)
          using team_policy_type = Kokkos::TeamPolicy<execution_space>;
          local_ordinal_type vector_size(0);
          if (blocksize_ <= 4) vector_size = 4;
          else if (blocksize_ <= 8) vector_size = 8;
          else if (blocksize_ <= 16) vector_size = 16;
          else vector_size = 32;
          const team_policy_type policy(idiff, 1, vector_size);
          Kokkos::parallel_for
            ("AsyncableImport::TeamPolicy::copy",
             policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
              const local_ordinal_type i = member.league_rank();
              Kokkos::parallel_for
                (Kokkos::TeamThreadRange(member,num_vectors),[&](const local_ordinal_type &j) {
                  auto aptr = abase + blocksize_*(i + idiff*j);
                  auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
                  if (std::is_same<PackTag,ToBuffer>::value)
                    Kokkos::parallel_for
                      (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
                        aptr[k] = bptr[k];
                      });
                  else
                    Kokkos::parallel_for
                      (Kokkos::ThreadVectorRange(member,blocksize_),[&](const local_ordinal_type &k) {
                        bptr[k] = aptr[k];
                      });
                });
            });
#endif
        } else {
#if defined(__CUDA_ARCH__)
          TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: CUDA should not see this code");
#else
          if (num_vectors == 1) {
            const Kokkos::RangePolicy<execution_space> policy(ibeg_, iend_);
            Kokkos::parallel_for
              ("AsyncableImport::RangePolicy::copy",
               policy, KOKKOS_LAMBDA(const local_ordinal_type &i) {
                auto aptr = buffer_.data() + blocksize_*i;
                auto bptr = multivector_.data() + blocksize_*lids_(i);
                if (std::is_same<PackTag,ToBuffer>::value)
                  memcpy(aptr, bptr, sizeof(impl_scalar_type)*blocksize_);
                else
                  memcpy(bptr, aptr, sizeof(impl_scalar_type)*blocksize_);
              });
          } else {
            Kokkos::MDRangePolicy
              <execution_space, Kokkos::Rank<2>, Kokkos::IndexType<local_ordinal_type> >
              policy( { 0, 0 }, { idiff, num_vectors } );
            Kokkos::parallel_for
              ("AsyncableImport::MDRangePolicy::copy",
               policy, KOKKOS_LAMBDA(const local_ordinal_type &i,
                                     const local_ordinal_type &j) {
                auto aptr = abase + blocksize_*(i + idiff*j);
                auto bptr = &multivector_(blocksize_*lids_(i + ibeg_), j);
                if (std::is_same<PackTag,ToBuffer>::value)
                  for (local_ordinal_type k=0;k<blocksize_;++k) aptr[k] = bptr[k];
                else
                  for (local_ordinal_type k=0;k<blocksize_;++k) bptr[k] = aptr[k];
              });
          }
#endif
        }
        Kokkos::fence();
      }

      void createDataBuffer(const local_ordinal_type &num_vectors) {
        const size_type extent_0 = lids.recv.extent(0)*blocksize;
        const size_type extent_1 = num_vectors;
        if (remote_multivector.extent(0) == extent_0 &&
            remote_multivector.extent(1) == extent_1) {
          // skip
        } else {
          remote_multivector =
            impl_scalar_type_2d_view(do_not_initialize_tag("remote multivector"), extent_0, extent_1);

          const auto send_buffer_size = offset.send[offset.send.extent(0)-1]*blocksize*num_vectors;
          buffer.send = impl_scalar_type_1d_view(do_not_initialize_tag("buffer send"), send_buffer_size);

          const auto recv_buffer_size = offset.recv[offset.recv.extent(0)-1]*blocksize*num_vectors;
          buffer.recv = impl_scalar_type_1d_view(do_not_initialize_tag("buffer recv"), recv_buffer_size);
        }
      }

      AsyncableImport (const Teuchos::RCP<const tpetra_map_type>& src_map,
                       const Teuchos::RCP<const tpetra_map_type>& tgt_map,
                       const local_ordinal_type blocksize_,
                       const local_ordinal_type_1d_view dm2cm_) {
        blocksize = blocksize_;
        dm2cm = dm2cm_;

#ifdef HAVE_IFPACK2_MPI
        const auto mpi_comm = Teuchos::rcp_dynamic_cast<const Teuchos::MpiComm<int> >(tgt_map->getComm());
        TEUCHOS_ASSERT(!mpi_comm.is_null());
        comm = *mpi_comm->getRawMpiComm();
#endif
        const tpetra_import_type import(src_map, tgt_map);

        createMpiRequests(import);
        createSendRecvIDs(import);
      }

      void asyncSendRecv(const impl_scalar_type_2d_view &mv) {
        IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv");

#ifdef HAVE_IFPACK2_MPI
        // constants; data buffers are reallocated beforehand if necessary
        const local_ordinal_type num_vectors = mv.extent(1);
        const local_ordinal_type mv_blocksize = blocksize*num_vectors;

        // receive async
        for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
          irecv(comm,
                buffer.recv.data() + offset.recv[i]*mv_blocksize,
                (offset.recv[i+1] - offset.recv[i])*mv_blocksize,
                pids.recv[i],
                42,
                &reqs.recv[i]);
        }

        // send async
        for (local_ordinal_type i=0,iend=pids.send.extent(0);i<iend;++i) {
          copy<ToBuffer>(lids.send, buffer.send, offset.send(i), offset.send(i+1),
                         mv, blocksize);
          isend(comm,
                buffer.send.data() + offset.send[i]*mv_blocksize,
                (offset.send[i+1] - offset.send[i])*mv_blocksize,
                pids.send[i],
                42,
                &reqs.send[i]);
        }

        // I find that issuing an Iprobe seems to nudge some MPIs into action,
        // which helps with overlapped comm/comp performance.
        for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
          int flag;
          MPI_Status stat;
          MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
        }
#endif
      }

      void cancel () {
#ifdef HAVE_IFPACK2_MPI
        waitall(reqs.recv.size(), reqs.recv.data());
        waitall(reqs.send.size(), reqs.send.data());
#endif
      }

      void syncRecv() {
        IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::SyncRecv");
#ifdef HAVE_IFPACK2_MPI
        // unpack each receive as it completes
        for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
          local_ordinal_type idx = i;
          waitany(pids.recv.extent(0), reqs.recv.data(), &idx);
          copy<ToMultiVector>(lids.recv, buffer.recv, offset.recv(idx), offset.recv(idx+1),
                              remote_multivector, blocksize);
        }
        // wait on the sends to match all Isends with a cleanup operation.
        waitall(reqs.send.size(), reqs.send.data());
#endif
      }

      void syncExchange(const impl_scalar_type_2d_view &mv) {
        IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::SyncExchange");
        asyncSendRecv(mv);
        syncRecv();
      }

      impl_scalar_type_2d_view getRemoteMultiVectorLocalView() const { return remote_multivector; }
    };

    template<typename MatrixType>
    Teuchos::RCP<AsyncableImport<MatrixType> >
    createBlockCrsAsyncImporter(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A) {
      using impl_type = ImplType<MatrixType>;
      using tpetra_map_type = typename impl_type::tpetra_map_type;
      using local_ordinal_type = typename impl_type::local_ordinal_type;
      using global_ordinal_type = typename impl_type::global_ordinal_type;
      using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;

      const auto g = A->getCrsGraph(); // tpetra crs graph object
      const auto blocksize = A->getBlockSize();
      const auto domain_map = g.getDomainMap();
      const auto column_map = g.getColMap();

      std::vector<global_ordinal_type> gids;
      bool separate_remotes = true, found_first = false, need_owned_permutation = false;
      for (size_t i=0;i<column_map->getNodeNumElements();++i) {
        const global_ordinal_type gid = column_map->getGlobalElement(i);
        if (!domain_map->isNodeGlobalElement(gid)) {
          found_first = true;
          gids.push_back(gid);
        } else if (found_first) {
          separate_remotes = false;
          break;
        }
        if (!need_owned_permutation &&
            domain_map->getLocalElement(gid) != static_cast<local_ordinal_type>(i)) {
          // The owned parts of the domain and column maps are different
          // orderings. We *could* do a super efficient impl of this case in the
          // num_sweeps > 1 case by adding complexity to PermuteAndRepack. But,
          // really, if a caller cares about speed, they wouldn't make different
          // local permutations like this. So we punt on the best impl and go for
          // a pretty good one: the permutation is done in place in
          // compute_b_minus_Rx for the pure-owned part of the MVP. The only cost
          // is the presumably worse memory access pattern of the input vector.
          need_owned_permutation = true;
        }
      }

      if (separate_remotes) {
        const auto invalid = Teuchos::OrdinalTraits<global_ordinal_type>::invalid();
        const auto parsimonious_col_map
          = Teuchos::rcp(new tpetra_map_type(invalid, gids.data(), gids.size(), 0, domain_map->getComm()));
        if (parsimonious_col_map->getGlobalNumElements() > 0) {
          // make the importer only if needed.
          local_ordinal_type_1d_view dm2cm;
          if (need_owned_permutation) {
            dm2cm = local_ordinal_type_1d_view("dm2cm", domain_map->getNodeNumElements());
            const auto dm2cm_host = Kokkos::create_mirror_view(dm2cm);
            for (size_t i=0;i<domain_map->getNodeNumElements();++i)
              dm2cm_host(i) = domain_map->getLocalElement(column_map->getGlobalElement(i));
            Kokkos::deep_copy(dm2cm, dm2cm_host);
          }
          return Teuchos::rcp(new AsyncableImport<MatrixType>(domain_map, parsimonious_col_map, blocksize, dm2cm));
        }
      }
      return Teuchos::null;
    }

    template<typename MatrixType>
    struct PartInterface {
      using local_ordinal_type = typename ImplType<MatrixType>::local_ordinal_type;
      using local_ordinal_type_1d_view = typename ImplType<MatrixType>::local_ordinal_type_1d_view;

      PartInterface() = default;
      PartInterface(const PartInterface &b) = default;

      // Some terms:
      // The matrix A is split as A = D + R, where D is the matrix of tridiag
      // blocks and R is the remainder.
      // A part is roughly a synonym for a tridiag. The distinction is that a part
      // is the set of rows belonging to one tridiag and, equivalently, the off-diag
      // rows in R associated with that tridiag. In contrast, the term tridiag is
      // used to refer specifically to tridiag data, such as the pointer into the
      // tridiag data array.
      // Local (lcl) rows are the LIDs. lclrow lists the LIDs belonging to each
      // tridiag, and partptr points to the beginning of each tridiag. This is the
      // LID space.
      // Row index (idx) is the ordinal in the tridiag ordering. lclrow is indexed
      // by this ordinal. This is the 'index' space.
      // A flat index is the mathematical index into an array. A pack index
      // accounts for SIMD packing.
      // A worked example follows this struct.

      // Local row LIDs. Permutation from the caller's index space to the tridiag
      // index space.
      local_ordinal_type_1d_view lclrow;
      // partptr_ is the pointer array into lclrow_.
      local_ordinal_type_1d_view partptr; // np+1
      // packptr_(i), for i the pack index, indexes partptr_. partptr_(packptr_(i))
      // is the start of the i'th pack.
      local_ordinal_type_1d_view packptr; // npack+1
      // part2rowidx0_(i) is the flat row index of the start of the i'th part. It's
      // an alias of partptr_ in the case of no overlap.
      local_ordinal_type_1d_view part2rowidx0; // np+1
      // part2packrowidx0_(i) is the packed row index. If vector_length is 1, then
      // it's the same as part2rowidx0_; if it's > 1, then the value is combined
      // with i % vector_length to get the location in the packed data.
      local_ordinal_type_1d_view part2packrowidx0; // np+1
      local_ordinal_type part2packrowidx0_back; // So we don't need to grab the array from the GPU.
      // rowidx2part_ maps the row index to the part index.
      local_ordinal_type_1d_view rowidx2part; // nr
      // True if lcl{row|col} is at most a constant away from row{idx|col}. In
      // practice, this knowledge is not particularly useful, as packing for batched
      // processing is done at the same time as the permutation from LID to index
      // space. But it's easy to detect, so it's recorded in case an optimization
      // can be made based on it.
      bool row_contiguous;

      local_ordinal_type max_partsz;
    };
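
    // Worked example (illustrative): partitions = { {0,1}, {2,3,4} } with
    // vector_length = 2. createPartInterface below reorders parts largest
    // first (p = {1,0}), so
    //   partptr          = {0, 3, 5}        // part 0 has rows 0..2, part 1 rows 3..4 (index space)
    //   lclrow           = {2, 3, 4, 0, 1}  // index space -> LID
    //   rowidx2part      = {0, 0, 0, 1, 1}
    //   part2packrowidx0 = {0, 0, 3}        // both parts live in one SIMD pack of 3 packed rows
    //   packptr          = {0, 2}           // the single pack covers parts [0, 2)
    //   max_partsz       = 3
    // The shorter part occupies lane 1 of the pack; its "extra" third row is
    // the padding that setTridiagsToIdentity later fills with identity blocks.
    // row_contiguous is false here since lclrow(0) != 0.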

    template<typename MatrixType>
    PartInterface<MatrixType>
    createPartInterface(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
                        const Teuchos::Array<Teuchos::Array<typename ImplType<MatrixType>::local_ordinal_type> > &partitions) {
      using impl_type = ImplType<MatrixType>;
      using local_ordinal_type = typename impl_type::local_ordinal_type;
      using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;

      constexpr int vector_length = impl_type::vector_length;

      const auto comm = A->getRowMap()->getComm();

      PartInterface<MatrixType> interf;

      const bool jacobi = partitions.size() == 0;
      const local_ordinal_type A_n_lclrows = A->getNodeNumRows();
      const local_ordinal_type nparts = jacobi ? A_n_lclrows : partitions.size();

#if defined(BLOCKTRIDICONTAINER_DEBUG)
      local_ordinal_type nrows = 0;
      if (jacobi)
        nrows = nparts;
      else
        for (local_ordinal_type i=0;i<nparts;++i) nrows += partitions[i].size();

      TEUCHOS_TEST_FOR_EXCEPT_MSG
        (nrows != A_n_lclrows, get_msg_prefix(comm) << "The #rows implied by the local partition is not "
         << "the same as getNodeNumRows: " << nrows << " vs " << A_n_lclrows);
#endif

      // permutation vector
      std::vector<local_ordinal_type> p;
      if (jacobi) {
        interf.max_partsz = 1;
      } else {
        // reorder parts to maximize simd packing efficiency
        p.resize(nparts);

        typedef std::pair<local_ordinal_type,local_ordinal_type> size_idx_pair_type;
        std::vector<size_idx_pair_type> partsz(nparts);
        for (local_ordinal_type i=0;i<nparts;++i)
          partsz[i] = size_idx_pair_type(partitions[i].size(), i);
        std::sort(partsz.begin(), partsz.end(),
                  [] (const size_idx_pair_type& x, const size_idx_pair_type& y) {
                    return x.first > y.first;
                  });
        for (local_ordinal_type i=0;i<nparts;++i)
          p[i] = partsz[i].second;

        interf.max_partsz = partsz[0].first;
      }

      // allocate parts
      interf.partptr = local_ordinal_type_1d_view(do_not_initialize_tag("partptr"), nparts + 1);
      interf.lclrow = local_ordinal_type_1d_view(do_not_initialize_tag("lclrow"), A_n_lclrows);
      interf.part2rowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2rowidx0"), nparts + 1);
      interf.part2packrowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2packrowidx0"), nparts + 1);
      interf.rowidx2part = local_ordinal_type_1d_view(do_not_initialize_tag("rowidx2part"), A_n_lclrows);

      // mirror to host and compute on the host execution space
      const auto partptr = Kokkos::create_mirror_view(interf.partptr);
      const auto lclrow = Kokkos::create_mirror_view(interf.lclrow);
      const auto part2rowidx0 = Kokkos::create_mirror_view(interf.part2rowidx0);
      const auto part2packrowidx0 = Kokkos::create_mirror_view(interf.part2packrowidx0);
      const auto rowidx2part = Kokkos::create_mirror_view(interf.rowidx2part);

      // Determine parts.
      interf.row_contiguous = true;
      partptr(0) = 0;
      part2rowidx0(0) = 0;
      part2packrowidx0(0) = 0;
      local_ordinal_type pack_nrows = 0;
      for (local_ordinal_type ip=0;ip<nparts;++ip) {
        const auto* part = jacobi ? NULL : &partitions[p[ip]];
        const local_ordinal_type ipnrows = jacobi ? 1 : part->size();
        TEUCHOS_ASSERT(ip == 0 || (jacobi || ipnrows <= static_cast<local_ordinal_type>(partitions[p[ip-1]].size())));
        TEUCHOS_TEST_FOR_EXCEPT_MSG(ipnrows == 0,
                                    get_msg_prefix(comm)
                                    << "partition " << p[ip]
                                    << " is empty, which is not allowed.");
        // assume no overlap.
        part2rowidx0(ip+1) = part2rowidx0(ip) + ipnrows;
        // Since parts are ordered in nonincreasing size, the size of the first
        // part in a pack is the size for all parts in the pack.
        if (ip % vector_length == 0) pack_nrows = ipnrows;
        part2packrowidx0(ip+1) = part2packrowidx0(ip) + ((ip+1) % vector_length == 0 || ip+1 == nparts ? pack_nrows : 0);
        const local_ordinal_type os = partptr(ip);
        for (local_ordinal_type i=0;i<ipnrows;++i) {
          const auto lcl_row = jacobi ? ip : (*part)[i];
          TEUCHOS_TEST_FOR_EXCEPT_MSG(lcl_row < 0 || lcl_row >= A_n_lclrows,
                                      get_msg_prefix(comm)
                                      << "partitions[" << p[ip] << "]["
                                      << i << "] = " << lcl_row
                                      << " but input matrix implies limits of [0, " << A_n_lclrows-1
                                      << "].");
          lclrow(os+i) = lcl_row;
          rowidx2part(os+i) = ip;
          if (interf.row_contiguous && os+i > 0 && lclrow((os+i)-1) + 1 != lcl_row)
            interf.row_contiguous = false;
        }
        partptr(ip+1) = os + ipnrows;
      }
#if defined(BLOCKTRIDICONTAINER_DEBUG)
      TEUCHOS_ASSERT(partptr(nparts) == nrows);
#endif
      if (lclrow(0) != 0) interf.row_contiguous = false;

      Kokkos::deep_copy(interf.partptr, partptr);
      Kokkos::deep_copy(interf.lclrow, lclrow);

      // assume no overlap. Thus:
      interf.part2rowidx0 = interf.partptr;
      Kokkos::deep_copy(interf.part2packrowidx0, part2packrowidx0);

      interf.part2packrowidx0_back = part2packrowidx0(part2packrowidx0.extent(0) - 1);
      Kokkos::deep_copy(interf.rowidx2part, rowidx2part);

      { // Fill packptr.
        local_ordinal_type npacks = 0;
        for (local_ordinal_type ip=1;ip<=nparts;++ip)
          if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
            ++npacks;
        interf.packptr = local_ordinal_type_1d_view("packptr", npacks + 1);
        const auto packptr = Kokkos::create_mirror_view(interf.packptr);
        packptr(0) = 0;
        for (local_ordinal_type ip=1,k=1;ip<=nparts;++ip)
          if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
            packptr(k++) = ip;
        Kokkos::deep_copy(interf.packptr, packptr);
      }

      return interf;
    }

    template <typename MatrixType>
    struct BlockTridiags {
      using impl_type = ImplType<MatrixType>;
      using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
      using size_type_1d_view = typename impl_type::size_type_1d_view;
      using vector_type_3d_view = typename impl_type::vector_type_3d_view;

      // flat_td_ptr(i) is the index into flat-array values of the start of the
      // i'th tridiag. pack_td_ptr is the same, but for packs. If vector_length ==
      // 1, pack_td_ptr is the same as flat_td_ptr; if vector_length > 1, then i %
      // vector_length is the position in the pack.
      size_type_1d_view flat_td_ptr, pack_td_ptr;
      // List of local column indices into A from which to grab
      // data. flat_td_ptr(i) points to the start of the i'th tridiag's data.
      local_ordinal_type_1d_view A_colindsub;
      // Tridiag block values. pack_td_ptr(i) points to the start of the i'th
      // tridiag's pack, and i % vector_length gives the position in the pack.
      vector_type_3d_view values;

      bool is_diagonal_only;

      BlockTridiags() = default;
      BlockTridiags(const BlockTridiags &b) = default;

      // Index into a row-major block of a tridiag.
      template <typename idx_type>
      static KOKKOS_FORCEINLINE_FUNCTION
      idx_type IndexToRow (const idx_type& ind) { return (ind + 1) / 3; }
      // Given a row of a row-major tridiag, return the index of the first block
      // in that row.
      template <typename idx_type>
      static KOKKOS_FORCEINLINE_FUNCTION
      idx_type RowToIndex (const idx_type& row) { return row > 0 ? 3*row - 1 : 0; }
      // Number of blocks in a tridiag having a given number of rows.
      template <typename idx_type>
      static KOKKOS_FORCEINLINE_FUNCTION
      idx_type NumBlocks (const idx_type& nrows) { return nrows > 0 ? 3*nrows - 2 : 0; }
    };
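
    // Index helpers illustrated: a 4-row tridiag is stored row-major as
    //   index: 0  1  | 2  3  4  | 5  6  7  | 8  9
    //   block: D0 U0 | L0 D1 U1 | L1 D2 U2 | L2 D3
    // so NumBlocks(4) = 10, RowToIndex(2) = 5 (the first block of row 2, L1),
    // and IndexToRow(5) = 2. Row 0 has two blocks; every other row has three.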

    template<typename MatrixType>
    BlockTridiags<MatrixType>
    createBlockTridiags(const PartInterface<MatrixType> &interf) {
      using impl_type = ImplType<MatrixType>;
      using execution_space = typename impl_type::execution_space;
      using local_ordinal_type = typename impl_type::local_ordinal_type;
      using size_type = typename impl_type::size_type;
      using size_type_1d_view = typename impl_type::size_type_1d_view;

      constexpr int vector_length = impl_type::vector_length;

      BlockTridiags<MatrixType> btdm;

      const local_ordinal_type ntridiags = interf.partptr.extent(0) - 1;

      { // construct the flat index pointers into the tridiag values array.
        btdm.flat_td_ptr = size_type_1d_view(do_not_initialize_tag("btdm.flat_td_ptr"), ntridiags + 1);
        const Kokkos::RangePolicy<execution_space> policy(0,ntridiags + 1);
        Kokkos::parallel_scan
          ("createBlockTridiags::RangePolicy::flat_td_ptr",
           policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
            if (final)
              btdm.flat_td_ptr(i) = update;
            if (i < ntridiags) {
              const local_ordinal_type nrows = interf.partptr(i+1) - interf.partptr(i);
              update += btdm.NumBlocks(nrows);
            }
          });

        const auto nblocks = Kokkos::create_mirror_view_and_copy
          (Kokkos::HostSpace(), Kokkos::subview(btdm.flat_td_ptr, ntridiags));
        btdm.is_diagonal_only = (static_cast<local_ordinal_type>(nblocks()) == ntridiags);
      }

      // And the packed index pointers.
      if (vector_length == 1) {
        btdm.pack_td_ptr = btdm.flat_td_ptr;
      } else {
        const local_ordinal_type npacks = interf.packptr.extent(0) - 1;
        btdm.pack_td_ptr = size_type_1d_view(do_not_initialize_tag("btdm.pack_td_ptr"), ntridiags + 1);
        const Kokkos::RangePolicy<execution_space> policy(0,npacks);
        Kokkos::parallel_scan
          ("createBlockTridiags::RangePolicy::pack_td_ptr",
           policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
            const local_ordinal_type parti = interf.packptr(i);
            const local_ordinal_type parti_next = interf.packptr(i+1);
            if (final) {
              const size_type nblks = update;
              for (local_ordinal_type pti=parti;pti<parti_next;++pti)
                btdm.pack_td_ptr(pti) = nblks;
              const local_ordinal_type nrows = interf.partptr(parti+1) - interf.partptr(parti);
              // last one
              if (i == npacks-1)
                btdm.pack_td_ptr(ntridiags) = nblks + btdm.NumBlocks(nrows);
            }
            {
              const local_ordinal_type nrows = interf.partptr(parti+1) - interf.partptr(parti);
              update += btdm.NumBlocks(nrows);
            }
          });
      }

      // values and A_colindsub are created in the symbolic phase

      return btdm;
    }
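
    // Continuing the PartInterface example (parts of 3 and 2 rows, one pack):
    //   flat_td_ptr = {0, 7, 11}  // NumBlocks(3) = 7, NumBlocks(2) = 4 flat blocks
    //   pack_td_ptr = {0, 0, 7}   // both tridiags start at packed block 0;
    //                             // the pack holds NumBlocks(3) = 7 block-triples
    // Each packed block is a vector_type (a SIMD pack), so the two tridiags'
    // entries occupy different lanes of the same storage.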

    // Set the tridiags to be I to the full pack block size. That way, if a
    // tridiag within a pack is shorter than the longest one, the extra blocks are
    // processed in a safe way. Similarly, in the solve phase, if the extra blocks
    // in the packed multivector are 0, and the tridiag LU reflects the extra I
    // blocks, then the solve proceeds as though the extra blocks aren't
    // present. Since this extra work is part of the SIMD calls, it's not actually
    // extra work. Instead, it means we don't have to put checks or masks in, or
    // quiet NaNs. This functor has to be called just once, in the symbolic phase,
    // since the numeric phase fills in only the used entries, leaving these I
    // blocks intact.
    template<typename MatrixType>
    void
    setTridiagsToIdentity
    (const BlockTridiags<MatrixType>& btdm,
     const typename ImplType<MatrixType>::local_ordinal_type_1d_view& packptr)
    {
      using impl_type = ImplType<MatrixType>;
      using execution_space = typename impl_type::execution_space;
      using local_ordinal_type = typename impl_type::local_ordinal_type;
      using size_type_1d_view = typename impl_type::size_type_1d_view;

      const ConstUnmanaged<size_type_1d_view> pack_td_ptr(btdm.pack_td_ptr);
      const local_ordinal_type blocksize = btdm.values.extent(1);

      {
        const int vector_length = impl_type::vector_length;
        const int internal_vector_length = impl_type::internal_vector_length;

        using internal_vector_type = typename impl_type::internal_vector_type;
        using internal_vector_type_4d_view =
          typename impl_type::internal_vector_type_4d_view;
        using team_policy_type = Kokkos::TeamPolicy<execution_space>;
        const internal_vector_type_4d_view values
          (reinterpret_cast<internal_vector_type*>(btdm.values.data()),
           btdm.values.extent(0),
           btdm.values.extent(1),
           btdm.values.extent(2),
           vector_length/internal_vector_length);
        const local_ordinal_type vector_loop_size = values.extent(3);
#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDA_ARCH__)
        local_ordinal_type total_team_size(0);
        if (blocksize <= 5) total_team_size = 32;
        else if (blocksize <= 9) total_team_size = 64;
        else if (blocksize <= 12) total_team_size = 96;
        else if (blocksize <= 16) total_team_size = 128;
        else if (blocksize <= 20) total_team_size = 160;
        else total_team_size = 160;
        const local_ordinal_type team_size = total_team_size/vector_loop_size;
        const team_policy_type policy(packptr.extent(0)-1, team_size, vector_loop_size);
#else // Host architecture: team size is always one
        const team_policy_type policy(packptr.extent(0)-1, 1, 1);
#endif
        Kokkos::parallel_for
          ("setTridiagsToIdentity::TeamPolicy",
           policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
            const local_ordinal_type k = member.league_rank();
            const local_ordinal_type ibeg = pack_td_ptr(packptr(k));
            const local_ordinal_type iend = pack_td_ptr(packptr(k+1));
            const local_ordinal_type diff = iend - ibeg;
            const local_ordinal_type icount = diff/3 + (diff%3 > 0);
            Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
              Kokkos::parallel_for(Kokkos::TeamThreadRange(member,icount),[&](const local_ordinal_type &ii) {
                const local_ordinal_type i = ibeg + ii*3;
                for (local_ordinal_type j=0;j<blocksize;++j)
                  values(i,j,j,v) = 1;
              });
            });
          });
      }
    }

    template <typename MatrixType>
    struct AmD {
      using impl_type = ImplType<MatrixType>;
      using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
      using size_type_1d_view = typename impl_type::size_type_1d_view;
      using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;

      // rowptr points to the start of each row of A_colindsub.
      size_type_1d_view rowptr, rowptr_remote;
      // Indices into A's rows giving the blocks to extract. rowptr(i) points to
      // the i'th row. Thus, g.entries(A_colindsub(rowptr(row) : rowptr(row+1))),
      // where g is A's graph, are the columns AmD uses. If seq_method_, then
      // A_colindsub contains all the LIDs and A_colindsub_remote is empty. If !
      // seq_method_, then A_colindsub contains owned LIDs and A_colindsub_remote
      // contains the remote ones.
      local_ordinal_type_1d_view A_colindsub, A_colindsub_remote;

      // Currently always true.
      bool is_tpetra_block_crs;

      // If is_tpetra_block_crs, then this is a pointer to A_'s value data.
      impl_scalar_type_1d_view tpetra_values;

      AmD() = default;
      AmD(const AmD &b) = default;
    };

    template<typename MatrixType>
    void
    performSymbolicPhase(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
                         const PartInterface<MatrixType> &interf,
                         BlockTridiags<MatrixType> &btdm,
                         AmD<MatrixType> &amd,
                         const bool overlap_communication_and_computation) {
      IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::SymbolicPhase");

      using impl_type = ImplType<MatrixType>;
      using memory_space = typename impl_type::memory_space;
      using host_execution_space = typename impl_type::host_execution_space;

      using local_ordinal_type = typename impl_type::local_ordinal_type;
      using global_ordinal_type = typename impl_type::global_ordinal_type;
      using size_type = typename impl_type::size_type;
      using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
      using size_type_1d_view = typename impl_type::size_type_1d_view;
      using vector_type_3d_view = typename impl_type::vector_type_3d_view;
      using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;

      constexpr int vector_length = impl_type::vector_length;

      const auto comm = A->getRowMap()->getComm();
      const auto& g = A->getCrsGraph();
      const auto blocksize = A->getBlockSize();

      // mirroring to host
      const auto partptr = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.partptr);
      const auto lclrow = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.lclrow);
      const auto rowidx2part = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.rowidx2part);
      const auto part2rowidx0 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), interf.part2rowidx0);
      const auto packptr = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace(), interf.packptr);

      const local_ordinal_type nrows = partptr(partptr.extent(0) - 1);

      // find the column-to-row map on host
      Kokkos::View<local_ordinal_type*,host_execution_space> col2row("col2row", A->getNodeNumCols());
      Kokkos::deep_copy(col2row, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
      {
        const auto rowmap = g.getRowMap();
        const auto colmap = g.getColMap();
        const auto dommap = g.getDomainMap();
        TEUCHOS_ASSERT( !(rowmap.is_null() || colmap.is_null() || dommap.is_null()));

#if !defined(__CUDA_ARCH__)
        const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
        Kokkos::parallel_for
          ("performSymbolicPhase::RangePolicy::col2row",
           policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
            const global_ordinal_type gid = rowmap->getGlobalElement(lr);
            if (dommap->isNodeGlobalElement(gid)) {
              const local_ordinal_type lc = colmap->getLocalElement(gid);
# if defined(BLOCKTRIDICONTAINER_DEBUG)
              TEUCHOS_TEST_FOR_EXCEPT_MSG(lc == Teuchos::OrdinalTraits<local_ordinal_type>::invalid(),
                                          get_msg_prefix(comm) << "GID " << gid
                                          << " gives an invalid local column.");
# endif
              col2row(lc) = lr;
            }
          });
#endif
      }

      // construct the D and R graphs in A = D + R.
      {
        const auto& local_graph = g.getLocalGraph();
        const auto& local_graph_rowptr = local_graph.row_map;
        TEUCHOS_ASSERT(local_graph_rowptr.size() == static_cast<size_t>(nrows + 1));
        const auto& local_graph_colidx = local_graph.entries;

        // assume no overlap.

        Kokkos::View<local_ordinal_type*,host_execution_space> lclrow2idx("lclrow2idx", nrows);
        {
          const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
          Kokkos::parallel_for
            ("performSymbolicPhase::RangePolicy::lclrow2idx",
             policy, KOKKOS_LAMBDA(const local_ordinal_type &i) {
              lclrow2idx[lclrow(i)] = i;
            });
        }

        // count (block) nnzs in D and R.
        typedef SumReducer<size_type,3,host_execution_space> sum_reducer_type;
        typename sum_reducer_type::value_type sum_reducer_value;
        {
          const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
          Kokkos::parallel_reduce
            // profiling interface does not work
            (//"performSymbolicPhase::RangePolicy::count_nnz",
             policy, KOKKOS_LAMBDA(const local_ordinal_type &lr, typename sum_reducer_type::value_type &update) {
              // LID -> index.
              const local_ordinal_type ri0 = lclrow2idx[lr];
              const local_ordinal_type pi0 = rowidx2part(ri0);
              for (size_type j=local_graph_rowptr(lr);j<local_graph_rowptr(lr+1);++j) {
                const local_ordinal_type lc = local_graph_colidx(j);
                const local_ordinal_type lc2r = col2row[lc];
                bool incr_R = false;
                do { // breakable
                  if (lc2r == Teuchos::OrdinalTraits<local_ordinal_type>::invalid()) {
                    incr_R = true;
                    break;
                  }
                  const local_ordinal_type ri = lclrow2idx[lc2r];
                  const local_ordinal_type pi = rowidx2part(ri);
                  if (pi != pi0) {
                    incr_R = true;
                    break;
                  }
                  // Test for being in the tridiag. This is done in index space. In
                  // LID space, tridiag LIDs in a row are not necessarily related by
                  // {-1, 0, 1}.
                  if (ri0 + 1 >= ri && ri0 <= ri + 1)
                    ++update.v[0]; // D_nnz
                  else
                    incr_R = true;
                } while (0);
                if (incr_R) {
                  if (lc < nrows) ++update.v[1]; // R_nnz_owned
                  else ++update.v[2]; // R_nnz_remote
                }
              }
            }, sum_reducer_type(sum_reducer_value));
        }
        size_type D_nnz = sum_reducer_value.v[0];
        size_type R_nnz_owned = sum_reducer_value.v[1];
        size_type R_nnz_remote = sum_reducer_value.v[2];

        if (!overlap_communication_and_computation) {
          R_nnz_owned += R_nnz_remote;
          R_nnz_remote = 0;
        }
1323 
1324  // construct the D graph.
1325  {
1326  const auto flat_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.flat_td_ptr);
1327 
1328  btdm.A_colindsub = local_ordinal_type_1d_view("btdm.A_colindsub", D_nnz);
1329  const auto D_A_colindsub = Kokkos::create_mirror_view(btdm.A_colindsub);
1330 
1331 #if defined(BLOCKTRIDICONTAINER_DEBUG)
1332  Kokkos::deep_copy(D_A_colindsub, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
1333 #endif
1334 
1335  const local_ordinal_type nparts = partptr.extent(0) - 1;
1336  {
1337  const Kokkos::RangePolicy<host_execution_space> policy(0, nparts);
1338  Kokkos::parallel_for
1339  ("performSymbolicPhase::RangePolicy<host_execution_space>::D_graph",
1340  policy, KOKKOS_LAMBDA(const local_ordinal_type &pi0) {
1341  const local_ordinal_type part_ri0 = part2rowidx0(pi0);
1342  local_ordinal_type offset = 0;
1343  for (local_ordinal_type ri0=partptr(pi0);ri0<partptr(pi0+1);++ri0) {
1344  const local_ordinal_type td_row_os = btdm.RowToIndex(ri0 - part_ri0) + offset;
1345  offset = 1;
1346  const local_ordinal_type lr0 = lclrow(ri0);
1347  const size_type j0 = local_graph_rowptr(lr0);
1348  for (size_type j=j0;j<local_graph_rowptr(lr0+1);++j) {
1349  const local_ordinal_type lc = local_graph_colidx(j);
1350  const local_ordinal_type lc2r = col2row[lc];
1352  const local_ordinal_type ri = lclrow2idx[lc2r];
1353  const local_ordinal_type pi = rowidx2part(ri);
1354  if (pi != pi0) continue;
1355  if (ri + 1 < ri0 || ri > ri0 + 1) continue;
1356  const local_ordinal_type row_entry = j - j0;
1357  D_A_colindsub(flat_td_ptr(pi0) + ((td_row_os + ri) - ri0)) = row_entry;
1358  }
1359  }
1360  });
1361  }
1362 #if defined(BLOCKTRIDICONTAINER_DEBUG)
1363  for (size_t i=0;i<D_A_colindsub.extent(0);++i)
1365 #endif
1366  Kokkos::deep_copy(btdm.A_colindsub, D_A_colindsub);
1367 
1368  // Allocate values.
1369  {
1370  const auto pack_td_ptr_last = Kokkos::subview(btdm.pack_td_ptr, nparts);
1371  const auto num_packed_blocks = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_last);
1372  btdm.values = vector_type_3d_view("btdm.values", num_packed_blocks(), blocksize, blocksize);
1373  if (vector_length > 1) setTridiagsToIdentity(btdm, interf.packptr);
1374  }
1375  }
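// Sizing sketch (assuming the 3n-2 tridiag layout implied by RowToIndex):
// a part with n block rows stores n diagonal, n-1 sub-, and n-1
// super-diagonal blocks, i.e.
//
//   inline int tridiag_block_count(const int n) { return 3*n - 2; }
//
// flat blocks per part; pack_td_ptr(nparts), read just above, is the
// packed total over all parts and sizes btdm.values accordingly.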
1376 
1377  // Construct the R graph.
1378  {
1379  amd.rowptr = size_type_1d_view("amd.rowptr", nrows + 1);
1380  amd.A_colindsub = local_ordinal_type_1d_view(Kokkos::ViewAllocateWithoutInitializing("amd.A_colindsub"), R_nnz_owned);
1381 
1382  const auto R_rowptr = Kokkos::create_mirror_view(amd.rowptr);
1383  const auto R_A_colindsub = Kokkos::create_mirror_view(amd.A_colindsub);
1384 
1385  amd.rowptr_remote = size_type_1d_view("amd.rowptr_remote", overlap_communication_and_computation ? nrows + 1 : 0);
1386  amd.A_colindsub_remote = local_ordinal_type_1d_view(Kokkos::ViewAllocateWithoutInitializing("amd.A_colindsub_remote"), R_nnz_remote);
1387 
1388  const auto R_rowptr_remote = Kokkos::create_mirror_view(amd.rowptr_remote);
1389  const auto R_A_colindsub_remote = Kokkos::create_mirror_view(amd.A_colindsub_remote);
1390 
1391  {
1392  const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
1393  Kokkos::parallel_for
1394  ("performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_count",
1395  policy, KOKKOS_LAMBDA(const local_ordinal_type &lr) {
1396  const local_ordinal_type ri0 = lclrow2idx[lr];
1397  const local_ordinal_type pi0 = rowidx2part(ri0);
1398  const size_type j0 = local_graph_rowptr(lr);
1399  for (size_type j=j0;j<local_graph_rowptr(lr+1);++j) {
1400  const local_ordinal_type lc = local_graph_colidx(j);
1401  const local_ordinal_type lc2r = col2row[lc];
1402  if (lc2r != Teuchos::OrdinalTraits<local_ordinal_type>::invalid()) {
1403  const local_ordinal_type ri = lclrow2idx[lc2r];
1404  const local_ordinal_type pi = rowidx2part(ri);
1405  if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1) {
1406  continue;
1407  }
1408  }
1409  // exclusive scan will be performed later
1410  if (!overlap_communication_and_computation || lc < nrows) {
1411  ++R_rowptr(lr);
1412  } else {
1413  ++R_rowptr_remote(lr);
1414  }
1415  }
1416  });
1417  }
1418 
1419  // exclusive scan
1420  typedef ArrayValueType<size_type,2> update_type;
1421  {
1422  Kokkos::RangePolicy<host_execution_space> policy(0,nrows+1);
1423  Kokkos::parallel_scan
1424  ("performSymbolicPhase::RangePolicy<host_execution_space>::R_graph_fill",
1425  policy, KOKKOS_LAMBDA(const local_ordinal_type &lr,
1426  update_type &update,
1427  const bool &final) {
1428  update_type val;
1429  val.v[0] = R_rowptr(lr);
1430  if (overlap_communication_and_computation)
1431  val.v[1] = R_rowptr_remote(lr);
1432 
1433  if (final) {
1434  R_rowptr(lr) = update.v[0];
1435  if (overlap_communication_and_computation)
1436  R_rowptr_remote(lr) = update.v[1];
1437 
1438  if (lr < nrows) {
1439  const local_ordinal_type ri0 = lclrow2idx[lr];
1440  const local_ordinal_type pi0 = rowidx2part(ri0);
1441 
1442  size_type cnt_rowptr = R_rowptr(lr);
1443  size_type cnt_rowptr_remote = overlap_communication_and_computation ? R_rowptr_remote(lr) : 0; // unused when overlap is disabled
1444 
1445  const size_type j0 = local_graph_rowptr(lr);
1446  for (size_type j=j0;j<local_graph_rowptr(lr+1);++j) {
1447  const local_ordinal_type lc = local_graph_colidx(j);
1448  const local_ordinal_type lc2r = col2row[lc];
1449  if (lc2r != Teuchos::OrdinalTraits<local_ordinal_type>::invalid()) {
1450  const local_ordinal_type ri = lclrow2idx[lc2r];
1451  const local_ordinal_type pi = rowidx2part(ri);
1452  if (pi == pi0 && ri + 1 >= ri0 && ri <= ri0 + 1)
1453  continue;
1454  }
1455  const local_ordinal_type row_entry = j - j0;
1456  if (!overlap_communication_and_computation || lc < nrows)
1457  R_A_colindsub(cnt_rowptr++) = row_entry;
1458  else
1459  R_A_colindsub_remote(cnt_rowptr_remote++) = row_entry;
1460  }
1461  }
1462  }
1463  update += val;
1464  });
1465  }
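// Illustrative example (hypothetical counts): the scan rewrites per-row
// counts as exclusive offsets in place. With nrows = 3 and the sentinel
// entry at lr = nrows,
//   before: R_rowptr = [2, 1, 3, 0]
//   after : R_rowptr = [0, 2, 3, 6]
// so R_rowptr(lr) is the first slot of row lr, R_rowptr(nrows) is the
// total checked by the TEUCHOS_ASSERT below, and the "final" pass fills
// R_A_colindsub immediately using the freshly written offsets.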
1466  TEUCHOS_ASSERT(R_rowptr(nrows) == R_nnz_owned);
1467  Kokkos::deep_copy(amd.rowptr, R_rowptr);
1468  Kokkos::deep_copy(amd.A_colindsub, R_A_colindsub);
1469  if (overlap_communication_and_computation) {
1470  TEUCHOS_ASSERT(R_rowptr_remote(nrows) == R_nnz_remote);
1471  Kokkos::deep_copy(amd.rowptr_remote, R_rowptr_remote);
1472  Kokkos::deep_copy(amd.A_colindsub_remote, R_A_colindsub_remote);
1473  }
1474 
1475  // Allocate or view values.
1476  amd.tpetra_values = (const_cast<block_crs_matrix_type*>(A.get())->
1477  template getValues<memory_space>());
1478  }
1479  }
1480  }
1481 
1485  template<typename ArgActiveExecutionMemorySpace>
1486  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo;
1487 
1488  template<>
1489  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::HostSpace> {
1490  typedef KokkosBatched::Experimental::Mode::Serial mode_type;
1491 #if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
1492  typedef KokkosBatched::Experimental::Algo::Level3::CompactMKL algo_type;
1493 #else
1494  typedef KokkosBatched::Experimental::Algo::Level3::Blocked algo_type;
1495 #endif
1496  static int recommended_team_size(const int /* blksize */,
1497  const int /* vector_length */,
1498  const int /* internal_vector_length */) {
1499  return 1;
1500  }
1501 
1502  };
1503 
1504 #if defined(KOKKOS_ENABLE_CUDA)
1505  static inline int ExtractAndFactorizeRecommendedCudaTeamSize(const int blksize,
1506  const int vector_length,
1507  const int internal_vector_length) {
1508  const int vector_size = vector_length/internal_vector_length;
1509  int total_team_size(0);
1510  if (blksize <= 5) total_team_size = 32;
1511  else if (blksize <= 9) total_team_size = 64;
1512  else if (blksize <= 12) total_team_size = 96;
1513  else if (blksize <= 16) total_team_size = 128;
1514  else if (blksize <= 20) total_team_size = 160;
1515  else total_team_size = 160;
1516  return 2*total_team_size/vector_size;
1517  }
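// Worked example (hypothetical sizes): for blksize = 9, vector_length = 8,
// internal_vector_length = 4, vector_size = 8/4 = 2, the table gives
// total_team_size = 64, and the recommended team size is 2*64/2 = 64.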
1518  template<>
1519  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
1520  typedef KokkosBatched::Experimental::Mode::Team mode_type;
1521  typedef KokkosBatched::Experimental::Algo::Level3::Unblocked algo_type;
1522  static int recommended_team_size(const int blksize,
1523  const int vector_length,
1524  const int internal_vector_length) {
1525  return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
1526  }
1527  };
1528  template<>
1529  struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
1530  typedef KokkosBatched::Experimental::Mode::Team mode_type;
1531  typedef KokkosBatched::Experimental::Algo::Level3::Unblocked algo_type;
1532  static int recommended_team_size(const int blksize,
1533  const int vector_length,
1534  const int internal_vector_length) {
1535  return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
1536  }
1537  };
1538 #endif
1539 
1540  template<typename MatrixType>
1541  struct ExtractAndFactorizeTridiags {
1542  public:
1543  using impl_type = ImplType<MatrixType>;
1544  // a functor cannot have both device_type and execution_space; specialization error in kokkos
1545  using execution_space = typename impl_type::execution_space;
1546  using memory_space = typename impl_type::memory_space;
1548  using local_ordinal_type = typename impl_type::local_ordinal_type;
1549  using size_type = typename impl_type::size_type;
1550  using impl_scalar_type = typename impl_type::impl_scalar_type;
1551  using magnitude_type = typename impl_type::magnitude_type;
1553  using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
1555  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1556  using size_type_1d_view = typename impl_type::size_type_1d_view;
1557  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
1559  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
1560  using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view;
1561  using impl_scalar_type_4d_view = typename impl_type::impl_scalar_type_4d_view;
1562  using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
1563  using impl_scalar_scratch_type_3d_view = Scratch<typename impl_type::impl_scalar_type_3d_view>;
1564 
1565  using internal_vector_type = typename impl_type::internal_vector_type;
1566  static constexpr int vector_length = impl_type::vector_length;
1567  static constexpr int internal_vector_length = impl_type::internal_vector_length;
1568 
1570  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
1571  using member_type = typename team_policy_type::member_type;
1572 
1573  private:
1574  // part interface
1575  const ConstUnmanaged<local_ordinal_type_1d_view> partptr, lclrow, packptr;
1576  const local_ordinal_type max_partsz;
1577  // block crs matrix (it could be Kokkos::UVMSpace::size_type, which is int)
1578  using a_rowptr_value_type = typename Kokkos::ViewTraits<local_ordinal_type*,typename impl_type::device_type>::size_type;
1579  const ConstUnmanaged<Kokkos::View<a_rowptr_value_type*,typename impl_type::device_type> > A_rowptr;
1580  const ConstUnmanaged<impl_scalar_type_1d_view> A_values;
1581  // block tridiags
1582  const ConstUnmanaged<size_type_1d_view> pack_td_ptr, flat_td_ptr;
1583  const ConstUnmanaged<local_ordinal_type_1d_view> A_colindsub;
1584  const Unmanaged<internal_vector_type_4d_view> internal_vector_values;
1585  const Unmanaged<impl_scalar_type_4d_view> scalar_values;
1586  // shared information
1587  const local_ordinal_type blocksize, blocksize_square;
1588  // diagonal safety
1589  const magnitude_type tiny;
1590  const local_ordinal_type vector_loop_size;
1591  const local_ordinal_type vector_length_value;
1592 
1593  public:
1594  ExtractAndFactorizeTridiags(const BlockTridiags<MatrixType> &btdm_,
1595  const PartInterface<MatrixType> &interf_,
1596  const Teuchos::RCP<const block_crs_matrix_type> &A_,
1597  const magnitude_type& tiny_) :
1598  // interface
1599  partptr(interf_.partptr),
1600  lclrow(interf_.lclrow),
1601  packptr(interf_.packptr),
1602  max_partsz(interf_.max_partsz),
1603  // block crs matrix
1604  A_rowptr(A_->getCrsGraph().getLocalGraph().row_map),
1605  A_values(const_cast<block_crs_matrix_type*>(A_.get())->template getValues<memory_space>()),
1606  // block tridiags
1607  pack_td_ptr(btdm_.pack_td_ptr),
1608  flat_td_ptr(btdm_.flat_td_ptr),
1609  A_colindsub(btdm_.A_colindsub),
1610  internal_vector_values((internal_vector_type*)btdm_.values.data(),
1611  btdm_.values.extent(0),
1612  btdm_.values.extent(1),
1613  btdm_.values.extent(2),
1614  vector_length/internal_vector_length),
1615  scalar_values((impl_scalar_type*)btdm_.values.data(),
1616  btdm_.values.extent(0),
1617  btdm_.values.extent(1),
1618  btdm_.values.extent(2),
1619  vector_length),
1620  blocksize(btdm_.values.extent(1)),
1621  blocksize_square(blocksize*blocksize),
1622  // diagonal weight to avoid zero pivots
1623  tiny(tiny_),
1624  vector_loop_size(vector_length/internal_vector_length),
1625  vector_length_value(vector_length) {}
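// Note: internal_vector_values and scalar_values alias the same
// btdm_.values allocation, reinterpreting each SIMD element either as
// vector_length/internal_vector_length internal vectors or as
// vector_length scalars. A minimal sketch of the aliasing, assuming the
// SIMD type is laid out as a contiguous T[vector_length] (extents m, b
// hypothetical):
//
//   struct vec8 { double v[8]; };  // stand-in for a vector_length = 8 SIMD type
//   Kokkos::View<vec8***> packed("packed", m, b, b);
//   Kokkos::View<double****, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
//     flat(reinterpret_cast<double*>(packed.data()), m, b, b, 8);
//
// flat(i,j,k,l) then addresses lane l of packed(i,j,k).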
1626 
1627  private:
1628 
1629  KOKKOS_INLINE_FUNCTION
1630  void
1631  extract(local_ordinal_type partidx,
1632  local_ordinal_type npacks) const {
1633  const size_type kps = pack_td_ptr(partidx);
1634  local_ordinal_type kfs[vector_length] = {};
1635  local_ordinal_type ri0[vector_length] = {};
1636  local_ordinal_type nrows[vector_length] = {};
1637 
1638  for (local_ordinal_type vi=0;vi<npacks;++vi,++partidx) {
1639  kfs[vi] = flat_td_ptr(partidx);
1640  ri0[vi] = partptr(partidx);
1641  nrows[vi] = partptr(partidx+1) - ri0[vi];
1642  }
1643  for (local_ordinal_type tr=0,j=0;tr<nrows[0];++tr) {
1644  for (local_ordinal_type e=0;e<3;++e) {
1645  const impl_scalar_type* block[vector_length] = {};
1646  for (local_ordinal_type vi=0;vi<npacks;++vi) {
1647  const size_type Aj = A_rowptr(lclrow(ri0[vi] + tr)) + A_colindsub(kfs[vi] + j);
1648  block[vi] = &A_values(Aj*blocksize_square);
1649  }
1650  const size_type pi = kps + j;
1651  ++j;
1652  for (local_ordinal_type ii=0;ii<blocksize;++ii) {
1653  for (local_ordinal_type jj=0;jj<blocksize;++jj) {
1654  const auto idx = ii*blocksize + jj;
1655  auto& v = internal_vector_values(pi, ii, jj, 0);
1656  for (local_ordinal_type vi=0;vi<npacks;++vi)
1657  v[vi] = block[vi][idx];
1658  }
1659  }
1660 
1661  if (nrows[0] == 1) break;
1662  if (e == 1 && (tr == 0 || tr+1 == nrows[0])) break;
1663  for (local_ordinal_type vi=1;vi<npacks;++vi) {
1664  if ((e == 0 && nrows[vi] == 1) || (e == 1 && tr+1 == nrows[vi])) {
1665  npacks = vi;
1666  break;
1667  }
1668  }
1669  }
1670  }
1671  }
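// Note: parts within a pack are ordered by descending row count, so lane 0
// belongs to the longest part and nrows[0] bounds the row loop; once a
// shorter part is exhausted (the e/nrows[vi] checks above) npacks shrinks
// and the trailing lanes simply stop being written.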
1672 
1673  KOKKOS_INLINE_FUNCTION
1674  void
1675  extract(const member_type &member,
1676  const local_ordinal_type &partidxbeg,
1677  const local_ordinal_type &npacks,
1678  const local_ordinal_type &vbeg) const {
1679  local_ordinal_type kfs_vals[internal_vector_length] = {};
1680  local_ordinal_type ri0_vals[internal_vector_length] = {};
1681  local_ordinal_type nrows_vals[internal_vector_length] = {};
1682 
1683  const size_type kps = pack_td_ptr(partidxbeg);
1684  for (local_ordinal_type v=vbeg,vi=0;v<npacks && vi<internal_vector_length;++v,++vi) {
1685  kfs_vals[vi] = flat_td_ptr(partidxbeg+vi);
1686  ri0_vals[vi] = partptr(partidxbeg+vi);
1687  nrows_vals[vi] = partptr(partidxbeg+vi+1) - ri0_vals[vi];
1688  }
1689 
1690  local_ordinal_type j_vals[internal_vector_length] = {};
1691  for (local_ordinal_type tr=0;tr<nrows_vals[0];++tr) {
1692  for (local_ordinal_type v=vbeg,vi=0;v<npacks && vi<internal_vector_length;++v,++vi) {
1693  const local_ordinal_type nrows = nrows_vals[vi];
1694  if (tr < nrows) {
1695  auto &j = j_vals[vi];
1696  const local_ordinal_type kfs = kfs_vals[vi];
1697  const local_ordinal_type ri0 = ri0_vals[vi];
1698  const local_ordinal_type lbeg = (tr == 0 ? 1 : 0);
1699  const local_ordinal_type lend = (tr == nrows - 1 ? 2 : 3);
1700  for (local_ordinal_type l=lbeg;l<lend;++l,++j) {
1701  const size_type Aj = A_rowptr(lclrow(ri0 + tr)) + A_colindsub(kfs + j);
1702  const impl_scalar_type* block = &A_values(Aj*blocksize_square);
1703  const size_type pi = kps + j;
1704  Kokkos::parallel_for
1705  (Kokkos::TeamThreadRange(member,blocksize),
1706  [&](const local_ordinal_type &ii) {
1707  for (local_ordinal_type jj=0;jj<blocksize;++jj)
1708  scalar_values(pi, ii, jj, v) = block[ii*blocksize + jj];
1709  });
1710  }
1711  }
1712  }
1713  }
1714  }
1715 
1716  template<typename AAViewType,
1717  typename WWViewType>
1718  KOKKOS_INLINE_FUNCTION
1719  void
1720  factorize(const member_type &member,
1721  const local_ordinal_type &i0,
1722  const local_ordinal_type &nrows,
1723  const local_ordinal_type &v,
1724  const AAViewType &AA,
1725  const WWViewType &WW) const {
1726  namespace KB = KokkosBatched::Experimental;
1727 
1728  typedef ExtractAndFactorizeTridiagsDefaultModeAndAlgo
1729  <Kokkos::Impl::ActiveExecutionMemorySpace> default_mode_and_algo_type;
1730  typedef default_mode_and_algo_type::mode_type default_mode_type;
1731  typedef default_mode_and_algo_type::algo_type default_algo_type;
1732 
1733  // constant
1734  const auto one = Kokkos::ArithTraits<magnitude_type>::one();
1735 
1736  // subview pattern
1737  auto A = Kokkos::subview(AA, i0, Kokkos::ALL(), Kokkos::ALL(), v);
1738  KB::LU<member_type,
1739  default_mode_type,KB::Algo::LU::Unblocked>
1740  ::invoke(member, A , tiny);
1741  if (nrows > 1) {
1742  auto B = A;
1743  auto C = A;
1744  local_ordinal_type i = i0;
1745  for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
1746  B.assign_data( &AA(i+1,0,0,v) );
1747  KB::Trsm<member_type,
1748  KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
1749  default_mode_type,default_algo_type>
1750  ::invoke(member, one, A, B);
1751  C.assign_data( &AA(i+2,0,0,v) );
1752  KB::Trsm<member_type,
1753  KB::Side::Right,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
1754  default_mode_type,default_algo_type>
1755  ::invoke(member, one, A, C);
1756  A.assign_data( &AA(i+3,0,0,v) );
1757  KB::Gemm<member_type,
1758  KB::Trans::NoTranspose,KB::Trans::NoTranspose,
1759  default_mode_type,default_algo_type>
1760  ::invoke(member, -one, C, B, one, A);
1761  KB::LU<member_type,
1762  default_mode_type,KB::Algo::LU::Unblocked>
1763  ::invoke(member, A, tiny);
1764  }
1765  } else {
1766  // for block jacobi invert a matrix here
1767  auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
1768  KB::Copy<member_type,KB::Trans::NoTranspose,default_mode_type>
1769  ::invoke(member, A, W);
1770  KB::SetIdentity<member_type,default_mode_type>
1771  ::invoke(member, A);
1772  member.team_barrier();
1773  KB::Trsm<member_type,
1774  KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
1775  default_mode_type,default_algo_type>
1776  ::invoke(member, one, W, A);
1777  KB::Trsm<member_type,
1778  KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
1779  default_mode_type,default_algo_type>
1780  ::invoke(member, one, W, A);
1781  }
1782  }
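// Sketch of the recurrence (blocks of row r stored consecutively as
// A_r, B_r, C_r): the loop above is the block analogue of the Thomas
// algorithm,
//   A_r = L_r U_r                  (in-place LU, regularized by tiny)
//   B_r <- L_r^{-1} B_r            (Trsm: Left/Lower/Unit)
//   C_r <- C_r U_r^{-1}            (Trsm: Right/Upper/NonUnit)
//   A_{r+1} <- A_{r+1} - C_r B_r   (Gemm, then LU of the updated block)
// while single-row parts take the else branch and are inverted outright,
// so applying block Jacobi later costs one Gemv/Gemm per row.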
1783 
1784  public:
1785 
1786  struct ExtractAndFactorizeTag {};
1787 
1788  KOKKOS_INLINE_FUNCTION
1789  void
1790  operator() (const ExtractAndFactorizeTag &, const member_type &member) const {
1791  // btdm parts are packed and sorted from the largest to the smallest
1792  const local_ordinal_type packidx = member.league_rank();
1793 
1794  const local_ordinal_type partidx = packptr(packidx);
1795  const local_ordinal_type npacks = packptr(packidx+1) - partidx;
1796  const local_ordinal_type i0 = pack_td_ptr(partidx);
1797  const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
1798 
1799  internal_vector_scratch_type_3d_view
1800  WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
1801  if (vector_loop_size == 1) {
1802  extract(partidx, npacks);
1803  factorize(member, i0, nrows, 0, internal_vector_values, WW);
1804  } else {
1805  Kokkos::parallel_for
1806  (Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const local_ordinal_type &v) {
1807  const local_ordinal_type vbeg = v*internal_vector_length;
1808  if (vbeg < npacks)
1809  extract(member, partidx+vbeg, npacks, vbeg);
1810  factorize(member, i0, nrows, v, internal_vector_values, WW);
1811  });
1812  }
1813  }
1814 
1815  void run() {
1816  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN("BlockTriDi::ExtractAndFactorize");
1817  const local_ordinal_type team_size =
1818  ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
1819  recommended_team_size(blocksize, vector_length, internal_vector_length);
1820  const local_ordinal_type per_team_scratch = internal_vector_scratch_type_3d_view::
1821  shmem_size(blocksize, blocksize, vector_loop_size);
1822 
1823  Kokkos::TeamPolicy<execution_space,ExtractAndFactorizeTag>
1824  policy(packptr.extent(0)-1, team_size, vector_loop_size);
1825 #if defined(KOKKOS_ENABLE_DEPRECATED_CODE)
1826  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeTag>",
1827  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this);
1828 #else
1829  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch));
1830  Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeTag>",
1831  policy, *this);
1832 #endif
1833  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
1834  }
1835 
1836  };
1837 
1841  template<typename MatrixType>
1842  void
1843  performNumericPhase(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
1844  const PartInterface<MatrixType> &interf,
1845  /* */ BlockTridiags<MatrixType> &btdm,
1846  const typename ImplType<MatrixType>::magnitude_type tiny) {
1847  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::NumericPhase");
1848  ExtractAndFactorizeTridiags<MatrixType> function(btdm, interf, A, tiny);
1849  function.run();
1850  }
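// Usage sketch (names and the tiny value hypothetical): the numeric phase
// is re-run whenever the matrix values change, reusing the interface and
// tridiag structure computed once by the symbolic phase:
//
//   performNumericPhase<matrix_type>(A, interf, btdm, /*tiny=*/1e-12);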
1851 
1855  template<typename MatrixType>
1856  struct MultiVectorConverter {
1857  public:
1858  using impl_type = ImplType<MatrixType>;
1859  using execution_space = typename impl_type::execution_space;
1860 
1861  using local_ordinal_type = typename impl_type::local_ordinal_type;
1862  using impl_scalar_type = typename impl_type::impl_scalar_type;
1863  using magnitude_type = typename impl_type::magnitude_type;
1864  using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;
1865  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
1866  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
1867  using impl_scalar_type_2d_view = typename impl_type::impl_scalar_type_2d_view;
1868  static constexpr int vector_length = impl_type::vector_length;
1869 
1870  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;
1871 
1872  private:
1873  // part interface
1874  const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
1875  const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
1876  const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
1877  const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
1878  const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
1879  const local_ordinal_type blocksize;
1880  const local_ordinal_type num_vectors;
1881 
1882  // packed multivector output (or input)
1883  Unmanaged<vector_type_3d_view> packed_multivector;
1884  Unmanaged<impl_scalar_type_2d_view> scalar_multivector;
1885 
1886  template<typename TagType>
1887  KOKKOS_INLINE_FUNCTION
1888  void copy_multivectors(const local_ordinal_type &j,
1889  const local_ordinal_type &vi,
1890  const local_ordinal_type &pri,
1891  const local_ordinal_type &ri0) const {
1892  for (local_ordinal_type col=0;col<num_vectors;++col)
1893  for (local_ordinal_type i=0;i<blocksize;++i)
1894  packed_multivector(pri, i, col)[vi] = scalar_multivector(blocksize*lclrow(ri0+j)+i,col);
1895  }
1896 
1897  public:
1898 
1899  MultiVectorConverter(const PartInterface<MatrixType> &interf,
1900  const vector_type_3d_view &pmv)
1901  : partptr(interf.partptr),
1902  packptr(interf.packptr),
1903  part2packrowidx0(interf.part2packrowidx0),
1904  part2rowidx0(interf.part2rowidx0),
1905  lclrow(interf.lclrow),
1906  blocksize(pmv.extent(1)),
1907  num_vectors(pmv.extent(2)),
1908  packed_multivector(pmv) {}
1909 
1910  // TODO:: modify this routine similar to the team level functions
1911  inline
1912  void
1913  operator() (const local_ordinal_type &packidx) const {
1914  local_ordinal_type partidx = packptr(packidx);
1915  local_ordinal_type npacks = packptr(packidx+1) - partidx;
1916  const local_ordinal_type pri0 = part2packrowidx0(partidx);
1917 
1918  local_ordinal_type ri0[vector_length] = {};
1919  local_ordinal_type nrows[vector_length] = {};
1920  for (local_ordinal_type v=0;v<npacks;++v,++partidx) {
1921  ri0[v] = part2rowidx0(partidx);
1922  nrows[v] = part2rowidx0(partidx+1) - ri0[v];
1923  }
1924  for (local_ordinal_type j=0;j<nrows[0];++j) {
1925  local_ordinal_type cnt = 1;
1926  for (;cnt<npacks && j!= nrows[cnt];++cnt);
1927  npacks = cnt;
1928  const local_ordinal_type pri = pri0 + j;
1929  for (local_ordinal_type col=0;col<num_vectors;++col)
1930  for (local_ordinal_type i=0;i<blocksize;++i)
1931  for (local_ordinal_type v=0;v<npacks;++v)
1932  packed_multivector(pri, i, col)[v] = scalar_multivector(blocksize*lclrow(ri0[v]+j)+i,col);
1933  }
1934  }
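// Illustrative example (hypothetical parts): packing interleaves parts of
// a pack into SIMD lanes. For P0 = {r0,r1,r2} and P1 = {s0,s1} in one pack,
//   packed_multivector(pri0 + j, i, col)[0] = scalar_multivector(blocksize*r_j + i, col)
//   packed_multivector(pri0 + j, i, col)[1] = scalar_multivector(blocksize*s_j + i, col)
// with lane 1 dropping out at j = 2 through the npacks = cnt truncation
// above, so one batched solve advances all lanes in lockstep.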
1935 
1936  KOKKOS_INLINE_FUNCTION
1937  void
1938  operator() (const member_type &member) const {
1939  const local_ordinal_type packidx = member.league_rank();
1940  const local_ordinal_type partidx_begin = packptr(packidx);
1941  const local_ordinal_type npacks = packptr(packidx+1) - partidx_begin;
1942  const local_ordinal_type pri0 = part2packrowidx0(partidx_begin);
1943  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, npacks), [&](const local_ordinal_type &v) {
1944  const local_ordinal_type partidx = partidx_begin + v;
1945  const local_ordinal_type ri0 = part2rowidx0(partidx);
1946  const local_ordinal_type nrows = part2rowidx0(partidx+1) - ri0;
1947 
1948  if (nrows == 1) {
1949  const local_ordinal_type pri = pri0;
1950  for (local_ordinal_type col=0;col<num_vectors;++col) {
1951  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize), [&](const local_ordinal_type &i) {
1952  packed_multivector(pri, i, col)[v] = scalar_multivector(blocksize*lclrow(ri0)+i,col);
1953  });
1954  }
1955  } else {
1956  Kokkos::parallel_for(Kokkos::TeamThreadRange(member, nrows), [&](const local_ordinal_type &j) {
1957  const local_ordinal_type pri = pri0 + j;
1958  for (local_ordinal_type col=0;col<num_vectors;++col)
1959  for (local_ordinal_type i=0;i<blocksize;++i)
1960  packed_multivector(pri, i, col)[v] = scalar_multivector(blocksize*lclrow(ri0+j)+i,col);
1961  });
1962  }
1963  });
1964  }
1965 
1966  template<typename TpetraLocalViewType>
1967  void run(const TpetraLocalViewType &scalar_multivector_) {
1968  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN("BlockTriDi::MultiVectorConverter");
1969 
1970  scalar_multivector = scalar_multivector_;
1971  if (is_cuda<execution_space>::value) {
1972 #if defined(KOKKOS_ENABLE_CUDA)
1973  const local_ordinal_type vl = vector_length;
1974  const Kokkos::TeamPolicy<execution_space> policy(packptr.extent(0) - 1, Kokkos::AUTO(), vl);
1975  Kokkos::parallel_for
1976  ("MultiVectorConverter::TeamPolicy", policy, *this);
1977 #endif
1978  } else {
1979 #if defined(__CUDA_ARCH__)
1980  TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: CUDA should not see this code");
1981 #else
1982  const Kokkos::RangePolicy<execution_space> policy(0, packptr.extent(0) - 1);
1983  Kokkos::parallel_for
1984  ("MultiVectorConverter::RangePolicy", policy, *this);
1985 #endif
1986  }
1987  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
1988  }
1989  };
1990 
1994  template<typename ArgActiveExecutionMemorySpace>
1995  struct SolveTridiagsDefaultModeAndAlgo;
1996 
1997  template<>
1998  struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HostSpace> {
1999  typedef KokkosBatched::Experimental::Mode::Serial mode_type;
2000  typedef KokkosBatched::Experimental::Algo::Level2::Unblocked single_vector_algo_type;
2001 #if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__)
2002  typedef KokkosBatched::Experimental::Algo::Level3::CompactMKL multi_vector_algo_type;
2003 #else
2004  typedef KokkosBatched::Experimental::Algo::Level3::Blocked multi_vector_algo_type;
2005 #endif
2006  static int recommended_team_size(const int /* blksize */,
2007  const int /* vector_length */,
2008  const int /* internal_vector_length */) {
2009  return 1;
2010  }
2011  };
2012 
2013 #if defined(KOKKOS_ENABLE_CUDA)
2014  static inline int SolveTridiagsRecommendedCudaTeamSize(const int blksize,
2015  const int vector_length,
2016  const int internal_vector_length) {
2017  const int vector_size = vector_length/internal_vector_length;
2018  int total_team_size(0);
2019  if (blksize <= 5) total_team_size = 32;
2020  else if (blksize <= 9) total_team_size = 64;
2021  else if (blksize <= 12) total_team_size = 96;
2022  else if (blksize <= 16) total_team_size = 128;
2023  else if (blksize <= 20) total_team_size = 160;
2024  else total_team_size = 160;
2025  return total_team_size/vector_size;
2026  }
2027 
2028  template<>
2029  struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
2030  typedef KokkosBatched::Experimental::Mode::Team mode_type;
2031  typedef KokkosBatched::Experimental::Algo::Level2::Unblocked single_vector_algo_type;
2032  typedef KokkosBatched::Experimental::Algo::Level3::Unblocked multi_vector_algo_type;
2033  static int recommended_team_size(const int blksize,
2034  const int vector_length,
2035  const int internal_vector_length) {
2036  return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
2037  }
2038  };
2039  template<>
2040  struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaUVMSpace> {
2041  typedef KokkosBatched::Experimental::Mode::Team mode_type;
2042  typedef KokkosBatched::Experimental::Algo::Level2::Unblocked single_vector_algo_type;
2043  typedef KokkosBatched::Experimental::Algo::Level3::Unblocked multi_vector_algo_type;
2044  static int recommended_team_size(const int blksize,
2045  const int vector_length,
2046  const int internal_vector_length) {
2047  return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
2048  }
2049  };
2050 #endif
2051 
2052  template<typename MatrixType>
2053  struct SolveTridiags {
2054  public:
2055  using impl_type = ImplType<MatrixType>;
2056  using execution_space = typename impl_type::execution_space;
2057 
2058  using local_ordinal_type = typename impl_type::local_ordinal_type;
2059  using size_type = typename impl_type::size_type;
2060  using impl_scalar_type = typename impl_type::impl_scalar_type;
2061  using magnitude_type = typename impl_type::magnitude_type;
2063  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
2064  using size_type_1d_view = typename impl_type::size_type_1d_view;
2066  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
2067  using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view;
2068  using impl_scalar_type_4d_view = typename impl_type::impl_scalar_type_4d_view;
2069 
2070  using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
2071 
2072  using internal_vector_type = typename impl_type::internal_vector_type;
2073  static constexpr int vector_length = impl_type::vector_length;
2074  static constexpr int internal_vector_length = impl_type::internal_vector_length;
2075 
2077  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
2078  using impl_scalar_type_2d_view = typename impl_type::impl_scalar_type_2d_view;
2079 
2081  using team_policy_type = Kokkos::TeamPolicy<execution_space>;
2082  using member_type = typename team_policy_type::member_type;
2083 
2084  private:
2085  // part interface
2086  const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
2087  const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
2088  const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
2089  const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
2090 
2091  // block tridiags
2092  const ConstUnmanaged<size_type_1d_view> pack_td_ptr;
2093 
2094  // block tridiags values
2095  const ConstUnmanaged<internal_vector_type_4d_view> D_internal_vector_values;
2096  const Unmanaged<internal_vector_type_4d_view> X_internal_vector_values;
2097 
2098  const local_ordinal_type vector_loop_size;
2099 
2100  // copy to multivectors : damping factor and Y_scalar_multivector
2101  Unmanaged<impl_scalar_type_2d_view> Y_scalar_multivector;
2102 #if defined(__CUDA_ARCH__)
2103  AtomicUnmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
2104 #else
2105  /* */ Unmanaged<impl_scalar_type_1d_view> Z_scalar_vector;
2106 #endif
2107  const impl_scalar_type df;
2108  const bool compute_diff;
2109 
2110  public:
2111  SolveTridiags(const PartInterface<MatrixType> &interf,
2112  const BlockTridiags<MatrixType> &btdm,
2113  const vector_type_3d_view &pmv,
2114  const impl_scalar_type damping_factor,
2115  const bool is_norm_manager_active)
2116  :
2117  // interface
2118  partptr(interf.partptr),
2119  packptr(interf.packptr),
2120  part2packrowidx0(interf.part2packrowidx0),
2121  lclrow(interf.lclrow),
2122  // block tridiags and multivector
2123  pack_td_ptr(btdm.pack_td_ptr),
2124  D_internal_vector_values((internal_vector_type*)btdm.values.data(),
2125  btdm.values.extent(0),
2126  btdm.values.extent(1),
2127  btdm.values.extent(2),
2128  vector_length/internal_vector_length),
2129  X_internal_vector_values((internal_vector_type*)pmv.data(),
2130  pmv.extent(0),
2131  pmv.extent(1),
2132  pmv.extent(2),
2133  vector_length/internal_vector_length),
2134  vector_loop_size(vector_length/internal_vector_length),
2135  Y_scalar_multivector(),
2136  Z_scalar_vector(),
2137  df(damping_factor),
2138  compute_diff(is_norm_manager_active)
2139  {}
2140 
2141  public:
2142 
2144  KOKKOS_INLINE_FUNCTION
2145  void
2146  copyToFlatMultiVector(const member_type &member,
2147  const local_ordinal_type partidxbeg, // partidx for v = 0
2148  const local_ordinal_type npacks,
2149  const local_ordinal_type pri0,
2150  const local_ordinal_type v, // index with a loop of vector_loop_size
2151  const local_ordinal_type blocksize,
2152  const local_ordinal_type num_vectors) const {
2153  const local_ordinal_type vbeg = v*internal_vector_length;
2154  if (vbeg < npacks) {
2155  local_ordinal_type ri0_vals[internal_vector_length] = {};
2156  local_ordinal_type nrows_vals[internal_vector_length] = {};
2157  for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
2158  const local_ordinal_type partidx = partidxbeg+vv;
2159  ri0_vals[vi] = partptr(partidx);
2160  nrows_vals[vi] = partptr(partidx+1) - ri0_vals[vi];
2161  }
2162 
2163  impl_scalar_type z_partial_sum(0);
2164  if (nrows_vals[0] == 1) {
2165  const local_ordinal_type j=0, pri=pri0;
2166  {
2167  for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
2168  const local_ordinal_type ri0 = ri0_vals[vi];
2169  const local_ordinal_type nrows = nrows_vals[vi];
2170  if (j < nrows) {
2171  Kokkos::parallel_for
2172  (Kokkos::TeamThreadRange(member, blocksize),
2173  [&](const local_ordinal_type &i) {
2174  const local_ordinal_type row = blocksize*lclrow(ri0+j)+i;
2175  for (local_ordinal_type col=0;col<num_vectors;++col) {
2176  impl_scalar_type &y = Y_scalar_multivector(row,col);
2177  const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
2178  y += df*yd;
2179 
2180  { // if (compute_diff)
2181  const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
2182  z_partial_sum += yd_abs*yd_abs;
2183  }
2184  }
2185  });
2186  }
2187  }
2188  }
2189  } else {
2190  Kokkos::parallel_for
2191  (Kokkos::TeamThreadRange(member, nrows_vals[0]),
2192  [&](const local_ordinal_type &j) {
2193  const local_ordinal_type pri = pri0 + j;
2194  for (local_ordinal_type vv=vbeg,vi=0;vv<npacks && vi<internal_vector_length;++vv,++vi) {
2195  const local_ordinal_type ri0 = ri0_vals[vi];
2196  const local_ordinal_type nrows = nrows_vals[vi];
2197  if (j < nrows) {
2198  for (local_ordinal_type col=0;col<num_vectors;++col) {
2199  for (local_ordinal_type i=0;i<blocksize;++i) {
2200  const local_ordinal_type row = blocksize*lclrow(ri0+j)+i;
2201  impl_scalar_type &y = Y_scalar_multivector(row,col);
2202  const impl_scalar_type yd = X_internal_vector_values(pri, i, col, v)[vi] - y;
2203  y += df*yd;
2204 
2205  { // if (compute_diff)
2206  const auto yd_abs = Kokkos::ArithTraits<impl_scalar_type>::abs(yd);
2207  z_partial_sum += yd_abs*yd_abs;
2208  }
2209  }
2210  }
2211  }
2212  }
2213  });
2214  }
2215  //if (compute_diff)
2216  Z_scalar_vector(member.league_rank()) += z_partial_sum;
2217  }
2218  }
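// Note: both branches above apply the damped correction
//   y <- y + df * (x_new - y)
// where x_new is the packed tridiag-solve result, and accumulate the
// team's partial sum of |x_new - y|^2 into Z_scalar_vector; when the norm
// manager is active this is reduced into the difference norm that decides
// whether another sweep is needed.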
2219 
2223  template<typename WWViewType>
2224  KOKKOS_INLINE_FUNCTION
2225  void
2226  solveSingleVector(const member_type &member,
2227  const local_ordinal_type &blocksize,
2228  const local_ordinal_type &i0,
2229  const local_ordinal_type &r0,
2230  const local_ordinal_type &nrows,
2231  const local_ordinal_type &v,
2232  const WWViewType &WW) const {
2233  namespace KB = KokkosBatched::Experimental;
2234  typedef SolveTridiagsDefaultModeAndAlgo
2235  <Kokkos::Impl::ActiveExecutionMemorySpace> default_mode_and_algo_type;
2236  typedef default_mode_and_algo_type::mode_type default_mode_type;
2237  typedef default_mode_and_algo_type::single_vector_algo_type default_algo_type;
2238 
2239  // base pointers
2240  auto A = D_internal_vector_values.data();
2241  auto X = X_internal_vector_values.data();
2242 
2243  // constant
2244  const auto one = Kokkos::ArithTraits<magnitude_type>::one();
2245  const auto zero = Kokkos::ArithTraits<magnitude_type>::zero();
2246  //const local_ordinal_type num_vectors = X_scalar_values.extent(2);
2247 
2248  // const local_ordinal_type blocksize = D_scalar_values.extent(1);
2249  const local_ordinal_type astep = D_internal_vector_values.stride_0();
2250  const local_ordinal_type as0 = D_internal_vector_values.stride_1(); //blocksize*vector_length;
2251  const local_ordinal_type as1 = D_internal_vector_values.stride_2(); //vector_length;
2252  const local_ordinal_type xstep = X_internal_vector_values.stride_0();
2253  const local_ordinal_type xs0 = X_internal_vector_values.stride_1(); //vector_length;
2254 
2255  // for multiple rhs
2256  //const local_ordinal_type xs0 = num_vectors*vector_length; //X_scalar_values.stride_1();
2257  //const local_ordinal_type xs1 = vector_length; //X_scalar_values.stride_2();
2258 
2259  // move to starting point
2260  A += i0*astep + v;
2261  X += r0*xstep + v;
2262 
2263  //for (local_ordinal_type col=0;col<num_vectors;++col)
2264  if (nrows > 1) {
2265  // solve Lx = x
2266  KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
2267  (default_mode_type,default_algo_type,
2268  member,
2269  KB::Diag::Unit,
2270  blocksize,blocksize,
2271  one,
2272  A, as0, as1,
2273  X, xs0);
2274 
2275  for (local_ordinal_type tr=1;tr<nrows;++tr) {
2276  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
2277  (default_mode_type,default_algo_type,
2278  member,
2279  blocksize, blocksize,
2280  -one,
2281  A+2*astep, as0, as1,
2282  X, xs0,
2283  one,
2284  X+1*xstep, xs0);
2285 
2286  KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
2287  (default_mode_type,default_algo_type,
2288  member,
2289  KB::Diag::Unit,
2290  blocksize,blocksize,
2291  one,
2292  A+3*astep, as0, as1,
2293  X+1*xstep, xs0);
2294 
2295  A += 3*astep;
2296  X += 1*xstep;
2297  }
2298 
2299  // solve Ux = x
2300  KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
2301  (default_mode_type,default_algo_type,
2302  member,
2303  KB::Diag::NonUnit,
2304  blocksize, blocksize,
2305  one,
2306  A, as0, as1,
2307  X, xs0);
2308 
2309  for (local_ordinal_type tr=nrows;tr>1;--tr) {
2310  A -= 3*astep;
2311  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
2312  (default_mode_type,default_algo_type,
2313  member,
2314  blocksize, blocksize,
2315  -one,
2316  A+1*astep, as0, as1,
2317  X, xs0,
2318  one,
2319  X-1*xstep, xs0);
2320 
2321  KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
2322  (default_mode_type,default_algo_type,
2323  member,
2324  KB::Diag::NonUnit,
2325  blocksize, blocksize,
2326  one,
2327  A, as0, as1,
2328  X-1*xstep,xs0);
2329 
2330  X -= 1*xstep;
2331  }
2332  // for multiple rhs
2333  //X += xs1;
2334  } else {
2335  const local_ordinal_type ws0 = WW.stride_0();
2336  auto W = WW.data() + v;
2337  KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE
2338  (default_mode_type,
2339  member, blocksize, X, xs0, W, ws0);
2340  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
2341  (default_mode_type,default_algo_type,
2342  member,
2343  blocksize, blocksize,
2344  one,
2345  A, as0, as1,
2346  W, xs0,
2347  zero,
2348  X, xs0);
2349  }
2350  }
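// Sketch of the sweeps (scalar-block notation, factors from
// ExtractAndFactorizeTridiags): the strided code above performs
//   x_0     <- L_0^{-1} x_0
//   x_r     <- L_r^{-1} (x_r - C_{r-1} x_{r-1})   forward,  r = 1..n-1
//   x_{n-1} <- U_{n-1}^{-1} x_{n-1}
//   x_r     <- U_r^{-1} (x_r - B_r x_{r+1})       backward, r = n-2..0
// on raw pointers with explicit strides rather than subviews, avoiding
// per-iteration view bookkeeping in the innermost kernel.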
2351 
2352  template<typename WWViewType>
2353  KOKKOS_INLINE_FUNCTION
2354  void
2355  solveMultiVector(const member_type &member,
2356  const local_ordinal_type &/* blocksize */,
2357  const local_ordinal_type &i0,
2358  const local_ordinal_type &r0,
2359  const local_ordinal_type &nrows,
2360  const local_ordinal_type &v,
2361  const WWViewType &WW) const {
2362  namespace KB = KokkosBatched::Experimental;
2363  typedef SolveTridiagsDefaultModeAndAlgo
2364  <Kokkos::Impl::ActiveExecutionMemorySpace> default_mode_and_algo_type;
2365  typedef default_mode_and_algo_type::mode_type default_mode_type;
2366  typedef default_mode_and_algo_type::multi_vector_algo_type default_algo_type;
2367 
2368  // constant
2369  const auto one = Kokkos::ArithTraits<magnitude_type>::one();
2370  const auto zero = Kokkos::ArithTraits<magnitude_type>::zero();
2371 
2372  // subview pattern
2373  auto A = Kokkos::subview(D_internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), v);
2374  auto X1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), Kokkos::ALL(), v);
2375  auto X2 = X1;
2376 
2377  local_ordinal_type i = i0, r = r0;
2378 
2379 
2380  if (nrows > 1) {
2381  // solve Lx = x
2382  KB::Trsm<member_type,
2383  KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
2384  default_mode_type,default_algo_type>
2385  ::invoke(member, one, A, X1);
2386  for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
2387  A.assign_data( &D_internal_vector_values(i+2,0,0,v) );
2388  X2.assign_data( &X_internal_vector_values(++r,0,0,v) );
2389  KB::Gemm<member_type,
2390  KB::Trans::NoTranspose,KB::Trans::NoTranspose,
2391  default_mode_type,default_algo_type>
2392  ::invoke(member, -one, A, X1, one, X2);
2393  A.assign_data( &D_internal_vector_values(i+3,0,0,v) );
2394  KB::Trsm<member_type,
2395  KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
2396  default_mode_type,default_algo_type>
2397  ::invoke(member, one, A, X2);
2398  X1.assign_data( X2.data() );
2399  }
2400 
2401  // solve Ux = x
2402  KB::Trsm<member_type,
2403  KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
2404  default_mode_type,default_algo_type>
2405  ::invoke(member, one, A, X1);
2406  for (local_ordinal_type tr=nrows;tr>1;--tr) {
2407  i -= 3;
2408  A.assign_data( &D_internal_vector_values(i+1,0,0,v) );
2409  X2.assign_data( &X_internal_vector_values(--r,0,0,v) );
2410  KB::Gemm<member_type,
2411  KB::Trans::NoTranspose,KB::Trans::NoTranspose,
2412  default_mode_type,default_algo_type>
2413  ::invoke(member, -one, A, X1, one, X2);
2414  A.assign_data( &D_internal_vector_values(i,0,0,v) );
2415  KB::Trsm<member_type,
2416  KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
2417  default_mode_type,default_algo_type>
2418  ::invoke(member, one, A, X2);
2419  X1.assign_data( X2.data() );
2420  }
2421  } else {
2422  // matrix is already inverted
2423  auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
2424  KB::Copy<member_type,KB::Trans::NoTranspose,default_mode_type>
2425  ::invoke(member, X1, W);
2426  KB::Gemm<member_type,
2427  KB::Trans::NoTranspose,KB::Trans::NoTranspose,
2428  default_mode_type,default_algo_type>
2429  ::invoke(member, one, A, W, zero, X1);
2430  }
2431  }
2432 
2433  template<int B> struct SingleVectorTag {};
2434  template<int B> struct MultiVectorTag {};
2435 
2436  template<int B>
2437  KOKKOS_INLINE_FUNCTION
2438  void
2439  operator() (const SingleVectorTag<B> &, const member_type &member) const {
2440  const local_ordinal_type packidx = member.league_rank();
2441  const local_ordinal_type partidx = packptr(packidx);
2442  const local_ordinal_type npacks = packptr(packidx+1) - partidx;
2443  const local_ordinal_type pri0 = part2packrowidx0(partidx);
2444  const local_ordinal_type i0 = pack_td_ptr(partidx);
2445  const local_ordinal_type r0 = part2packrowidx0(partidx);
2446  const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
2447  const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
2448  const local_ordinal_type num_vectors = 1;
2449  internal_vector_scratch_type_3d_view
2450  WW(member.team_scratch(0), blocksize, 1, vector_loop_size);
2451  Kokkos::single(Kokkos::PerTeam(member), [&]() {
2452  Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
2453  });
2454  Kokkos::parallel_for
2455  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
2456  solveSingleVector(member, blocksize, i0, r0, nrows, v, WW);
2457  copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
2458  });
2459  }
2460 
2461  template<int B>
2462  KOKKOS_INLINE_FUNCTION
2463  void
2464  operator() (const MultiVectorTag<B> &, const member_type &member) const {
2465  const local_ordinal_type packidx = member.league_rank();
2466  const local_ordinal_type partidx = packptr(packidx);
2467  const local_ordinal_type npacks = packptr(packidx+1) - partidx;
2468  const local_ordinal_type pri0 = part2packrowidx0(partidx);
2469  const local_ordinal_type i0 = pack_td_ptr(partidx);
2470  const local_ordinal_type r0 = part2packrowidx0(partidx);
2471  const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
2472  const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
2473  const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
2474 
2475  internal_vector_scratch_type_3d_view
2476  WW(member.team_scratch(0), blocksize, num_vectors, vector_loop_size);
2477  Kokkos::single(Kokkos::PerTeam(member), [&]() {
2478  Z_scalar_vector(member.league_rank()) = impl_scalar_type(0);
2479  });
2480  Kokkos::parallel_for
2481  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
2482  solveMultiVector(member, blocksize, i0, r0, nrows, v, WW);
2483  copyToFlatMultiVector(member, partidx, npacks, pri0, v, blocksize, num_vectors);
2484  });
2485  }
2486 
2487  void run(const impl_scalar_type_2d_view &Y,
2488  const impl_scalar_type_1d_view &Z) {
2489  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN("BlockTriDi::SolveTridiags");
2490 
2492  this->Y_scalar_multivector = Y;
2493  this->Z_scalar_vector = Z;
2494 
2495  const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
2496  const local_ordinal_type blocksize = D_internal_vector_values.extent(1);
2497 
2498  const local_ordinal_type team_size =
2499  SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
2500  recommended_team_size(blocksize, vector_length, internal_vector_length);
2501  const int per_team_scratch = internal_vector_scratch_type_3d_view
2502  ::shmem_size(blocksize, num_vectors, vector_loop_size);
2503 
2504 #if defined(KOKKOS_ENABLE_DEPRECATED_CODE)
2505 #define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B) \
2506  if (num_vectors == 1) { \
2507  const Kokkos::TeamPolicy<execution_space,SingleVectorTag<B> > \
2508  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
2509  Kokkos::parallel_for \
2510  ("SolveTridiags::TeamPolicy::run<SingleVector>", \
2511  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); \
2512  } else { \
2513  const Kokkos::TeamPolicy<execution_space,MultiVectorTag<B> > \
2514  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
2515  Kokkos::parallel_for \
2516  ("SolveTridiags::TeamPolicy::run<MultiVector>", \
2517  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); \
2518  } break
2519 #else
2520 #define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B) \
2521  if (num_vectors == 1) { \
2522  Kokkos::TeamPolicy<execution_space,SingleVectorTag<B> > \
2523  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
2524  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
2525  Kokkos::parallel_for \
2526  ("SolveTridiags::TeamPolicy::run<SingleVector>", \
2527  policy, *this); \
2528  } else { \
2529  Kokkos::TeamPolicy<execution_space,MultiVectorTag<B> > \
2530  policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
2531  policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
2532  Kokkos::parallel_for \
2533  ("SolveTridiags::TeamPolicy::run<MultiVector>", \
2534  policy, *this); \
2535  } break
2536 #endif
2537  switch (blocksize) {
2538  case 3: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 3);
2539  case 5: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 5);
2540  case 7: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 7);
2541  case 9: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 9);
2542  case 10: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(10);
2543  case 11: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(11);
2544  case 16: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(16);
2545  case 17: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(17);
2546  case 18: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(18);
2547  default : BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 0);
2548  }
2549 #undef BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS
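// Note: the switch instantiates the tag operators with a compile-time
// blocksize B for common sizes so the batched kernels can unroll their
// inner loops; B = 0 is the generic fallback that reads the blocksize
// from D_internal_vector_values.extent(1) at runtime.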
2550 
2551  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
2552  }
2553  };
2554 
2558  static inline int ComputeResidualVectorRecommendedCudaVectorSize(const int blksize,
2559  const int team_size) {
2560  int total_team_size(0);
2561  if (blksize <= 5) total_team_size = 32;
2562  else if (blksize <= 9) total_team_size = 64;
2563  else if (blksize <= 12) total_team_size = 96;
2564  else if (blksize <= 16) total_team_size = 128;
2565  else if (blksize <= 20) total_team_size = 160;
2566  else total_team_size = 160;
2567  return total_team_size/team_size;
2568  }
2569 
2570  template<typename MatrixType>
2571  struct ComputeResidualVector {
2572  public:
2573  using impl_type = ImplType<MatrixType>;
2574  using execution_space = typename impl_type::execution_space;
2575  using memory_space = typename impl_type::memory_space;
2576 
2577  using local_ordinal_type = typename impl_type::local_ordinal_type;
2578  using size_type = typename impl_type::size_type;
2579  using impl_scalar_type = typename impl_type::impl_scalar_type;
2580  using magnitude_type = typename impl_type::magnitude_type;
2582  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
2583  using size_type_1d_view = typename impl_type::size_type_1d_view;
2584  using tpetra_block_access_view_type = typename impl_type::tpetra_block_access_view_type; // block crs (layout right)
2585  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
2586  using impl_scalar_type_2d_view = typename impl_type::impl_scalar_type_2d_view; // block multivector (layout left)
2587  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
2588  using impl_scalar_type_4d_view = typename impl_type::impl_scalar_type_4d_view;
2589  static constexpr int vector_length = impl_type::vector_length;
2590 
2592  using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;
2593 
2594  // enum for max blocksize and vector length
2595  enum : int { max_blocksize = 32 };
2596 
2597  private:
2598  ConstUnmanaged<impl_scalar_type_2d_view> b;
2599  ConstUnmanaged<impl_scalar_type_2d_view> x; // x_owned
2600  ConstUnmanaged<impl_scalar_type_2d_view> x_remote;
2601  /* */Unmanaged<impl_scalar_type_2d_view> y;
2602  /* */Unmanaged<vector_type_3d_view> y_packed;
2603  /* */Unmanaged<impl_scalar_type_4d_view> y_packed_scalar;
2604 
2605  // AmD information
2606  const ConstUnmanaged<size_type_1d_view> rowptr, rowptr_remote;
2607  const ConstUnmanaged<local_ordinal_type_1d_view> colindsub, colindsub_remote;
2608  const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;
2609 
2610  // block crs graph information
2611  // for cuda (kokkos crs graph uses a different size_type from size_t)
2612  using a_rowptr_value_type = typename Kokkos::ViewTraits<local_ordinal_type*,typename impl_type::device_type>::size_type;
2613  const ConstUnmanaged<Kokkos::View<a_rowptr_value_type*,typename impl_type::device_type> > A_rowptr;
2614  const ConstUnmanaged<local_ordinal_type_1d_view> A_colind;
2615 
2616  // blocksize
2617  const local_ordinal_type blocksize_requested;
2618 
2619  // part interface
2620  const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
2621  const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
2622  const ConstUnmanaged<local_ordinal_type_1d_view> rowidx2part;
2623  const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
2624  const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
2625  const ConstUnmanaged<local_ordinal_type_1d_view> dm2cm;
2626  const bool is_dm2cm_active;
2627 
2628  public:
2629  template<typename LocalCrsGraphType>
2630  ComputeResidualVector(const AmD<MatrixType> &amd,
2631  const LocalCrsGraphType &graph,
2632  const local_ordinal_type &blocksize_requested_,
2633  const PartInterface<MatrixType> &interf,
2634  const local_ordinal_type_1d_view &dm2cm_)
2635  : rowptr(amd.rowptr), rowptr_remote(amd.rowptr_remote),
2636  colindsub(amd.A_colindsub), colindsub_remote(amd.A_colindsub_remote),
2637  tpetra_values(amd.tpetra_values),
2638  A_rowptr(graph.row_map),
2639  A_colind(graph.entries),
2640  blocksize_requested(blocksize_requested_),
2641  part2packrowidx0(interf.part2packrowidx0),
2642  part2rowidx0(interf.part2rowidx0),
2643  rowidx2part(interf.rowidx2part),
2644  partptr(interf.partptr),
2645  lclrow(interf.lclrow),
2646  dm2cm(dm2cm_),
2647  is_dm2cm_active(dm2cm_.span() > 0)
2648  {}
2649 
2650 
2651  inline
2652  void
2653  SerialGemv(const local_ordinal_type &blocksize,
2654  const impl_scalar_type * const KOKKOS_RESTRICT AA,
2655  const impl_scalar_type * const KOKKOS_RESTRICT xx,
2656  /* */ impl_scalar_type * KOKKOS_RESTRICT yy) const {
2657  for (local_ordinal_type k0=0;k0<blocksize;++k0) {
2658  impl_scalar_type val = 0;
2659  const local_ordinal_type offset = k0*blocksize;
2660 #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
2661 # pragma ivdep
2662 #endif
2663 #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
2664 # pragma unroll
2665 #endif
2666  for (local_ordinal_type k1=0;k1<blocksize;++k1)
2667  val += AA[offset+k1]*xx[k1];
2668  yy[k0] -= val;
2669  }
2670  }
2671 
2672  template<typename bbViewType, typename yyViewType>
2673  KOKKOS_INLINE_FUNCTION
2674  void
2675  VectorCopy(const member_type &member,
2676  const local_ordinal_type &blocksize,
2677  const bbViewType &bb,
2678  const yyViewType &yy) const {
2679  Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize), [&](const local_ordinal_type &k0) {
2680  yy(k0) = bb(k0);
2681  });
2682  }
2683 
2684  template<typename AAViewType, typename xxViewType, typename yyViewType>
2685  KOKKOS_INLINE_FUNCTION
2686  void
2687  TeamVectorGemv(const member_type &member,
2688  const local_ordinal_type &blocksize,
2689  const AAViewType &AA,
2690  const xxViewType &xx,
2691  const yyViewType &yy) const {
2692  Kokkos::parallel_for
2693  (Kokkos::TeamThreadRange(member, blocksize),
2694  [&](const local_ordinal_type &k0) {
2695  impl_scalar_type val = 0;
2696  Kokkos::parallel_for
2697  (Kokkos::ThreadVectorRange(member, blocksize),
2698  [&](const local_ordinal_type &k1) {
2699  val += AA(k0,k1)*xx(k1);
2700  });
2701  Kokkos::atomic_fetch_add(&yy(k0), -val);
2702  });
2703  }
2704 
2705  template<typename AAViewType, typename xxViewType, typename yyViewType>
2706  KOKKOS_INLINE_FUNCTION
2707  void
2708  VectorGemv(const member_type &member,
2709  const local_ordinal_type &blocksize,
2710  const AAViewType &AA,
2711  const xxViewType &xx,
2712  const yyViewType &yy) const {
2713  Kokkos::parallel_for
2714  (Kokkos::ThreadVectorRange(member, blocksize),
2715  [&](const local_ordinal_type &k0) {
2716  impl_scalar_type val(0);
2717  for (local_ordinal_type k1=0;k1<blocksize;++k1) {
2718  val += AA(k0,k1)*xx(k1);
2719  }
2720  Kokkos::atomic_fetch_add(&yy(k0), -val);
2721  });
2722  }
2723 
2724  // template<typename AAViewType, typename xxViewType, typename yyViewType>
2725  // KOKKOS_INLINE_FUNCTION
2726  // void
2727  // VectorGemv(const member_type &member,
2728  // const local_ordinal_type &blocksize,
2729  // const AAViewType &AA,
2730  // const xxViewType &xx,
2731  // const yyViewType &yy) const {
2732  // for (local_ordinal_type k0=0;k0<blocksize;++k0) {
2733  // impl_scalar_type val = 0;
2734  // Kokkos::parallel_for
2735  // (Kokkos::ThreadVectorRange(member, blocksize),
2736  // [&](const local_ordinal_type &k1) {
2737  // val += AA(k0,k1)*xx(k1);
2738  // });
2739  // Kokkos::atomic_fetch_add(&yy(k0), -val);
2740  // }
2741  // }
2742 
2743  struct SeqTag {};
2744 
2745  inline
2746  void
2747  operator() (const SeqTag &, const local_ordinal_type& i) const {
2748  const local_ordinal_type blocksize = blocksize_requested;
2749  const local_ordinal_type blocksize_square = blocksize*blocksize;
2750 
2751  // constants
2752  const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
2753  const local_ordinal_type num_vectors = y.extent(1);
2754  const local_ordinal_type row = i*blocksize;
2755  for (local_ordinal_type col=0;col<num_vectors;++col) {
2756  // y := b
2757  impl_scalar_type *yy = &y(row, col);
2758  const impl_scalar_type * const bb = &b(row, col);
2759  memcpy(yy, bb, sizeof(impl_scalar_type)*blocksize);
2760 
2761  // y -= Rx
2762  const size_type A_k0 = A_rowptr[i];
2763  for (size_type k=rowptr[i];k<rowptr[i+1];++k) {
2764  const size_type j = A_k0 + colindsub[k];
2765  const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
2766  const impl_scalar_type * const xx = &x(A_colind[j]*blocksize, col);
2767  SerialGemv(blocksize,AA,xx,yy);
2768  }
2769  }
2770  }
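// Note: with the splitting A = D + R (D the block tridiagonal part, R the
// remainder addressed through rowptr/colindsub), this kernel forms
//   y <- b - R x
// one block row at a time; the tridiag solve then applies D^{-1}, so a
// full sweep realizes x_new = D^{-1} (b - R x).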
2771 
2772  KOKKOS_INLINE_FUNCTION
2773  void
2774  operator() (const SeqTag &, const member_type &member) const {
2775  namespace KB = KokkosBatched::Experimental;
2776 
2777  // constants
2778  const local_ordinal_type blocksize = blocksize_requested;
2779  const local_ordinal_type blocksize_square = blocksize*blocksize;
2780 
2781  const local_ordinal_type lr = member.league_rank();
2782  const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
2783  const local_ordinal_type num_vectors = y.extent(1);
2784 
2785  // subview pattern
2786  auto bb = Kokkos::subview(b, block_range, 0);
2787  auto xx = bb;
2788  auto yy = Kokkos::subview(y, block_range, 0);
2789  auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
2790 
2791  const local_ordinal_type row = lr*blocksize;
2792  for (local_ordinal_type col=0;col<num_vectors;++col) {
2793  // y := b
2794  yy.assign_data(&y(row, col));
2795  bb.assign_data(&b(row, col));
2796  if (member.team_rank() == 0)
2797  VectorCopy(member, blocksize, bb, yy);
2798  member.team_barrier();
2799 
2800  // y -= Rx
2801  const size_type A_k0 = A_rowptr[lr];
2802  Kokkos::parallel_for
2803  (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]),
2804  [&](const local_ordinal_type &k) {
2805  const size_type j = A_k0 + colindsub[k];
2806  A_block.assign_data( &tpetra_values(j*blocksize_square) );
2807  xx.assign_data( &x(A_colind[j]*blocksize, col) );
2808  VectorGemv(member, blocksize, A_block, xx, yy);
2809  });
2810  }
2811  }
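  // Team variant of SeqTag: one league member per block row, TeamThreadRange
  // over that row's blocks of R, and vector lanes inside VectorCopy and
  // VectorGemv; yy is updated atomically since several team threads can
  // target the same output block concurrently.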
2812 
2813  template<int B>
2814  struct AsyncTag {};
2815 
2816  template<int B>
2817  inline
2818  void
2819  operator() (const AsyncTag<B> &, const local_ordinal_type &rowidx) const {
2820  const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
2821  const local_ordinal_type blocksize_square = blocksize*blocksize;
2822 
2823  // constants
2824  const local_ordinal_type partidx = rowidx2part(rowidx);
2825  const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
2826  const local_ordinal_type v = partidx % vector_length;
2827 
2828  const local_ordinal_type num_vectors = y_packed.extent(2);
2829  const local_ordinal_type num_local_rows = lclrow.extent(0);
2830 
2831  // temporary flat buffer for one block of y
2832  impl_scalar_type yy[B == 0 ? max_blocksize : B] = {};
2833 
2834  const local_ordinal_type lr = lclrow(rowidx);
2835  const local_ordinal_type row = lr*blocksize;
2836  for (local_ordinal_type col=0;col<num_vectors;++col) {
2837  // y := b
2838  memcpy(yy, &b(row, col), sizeof(impl_scalar_type)*blocksize);
2839 
2840  // y -= Rx
2841  const size_type A_k0 = A_rowptr[lr];
2842  for (size_type k=rowptr[lr];k<rowptr[lr+1];++k) {
2843  const size_type j = A_k0 + colindsub[k];
2844  const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
2845  const local_ordinal_type A_colind_at_j = A_colind[j];
2846  if (A_colind_at_j < num_local_rows) {
2847  const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
2848  const impl_scalar_type * const xx = &x(loc*blocksize, col);
2849  SerialGemv(blocksize, AA,xx,yy);
2850  } else {
2851  const auto loc = A_colind_at_j - num_local_rows;
2852  const impl_scalar_type * const xx_remote = &x_remote(loc*blocksize, col);
2853  SerialGemv(blocksize, AA,xx_remote,yy);
2854  }
2855  }
2856  // move yy to y_packed
2857  for (local_ordinal_type k=0;k<blocksize;++k)
2858  y_packed(pri, k, col)[v] = yy[k];
2859  }
2860  }
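  // AsyncTag distinguishes owned from remote columns: block column indices
  // below num_local_rows address x (through the dm2cm permutation when it is
  // active), the rest address the separately communicated x_remote buffer.
  // The result is scattered into the SIMD-packed layout y_packed(pri, k, col)[v].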
2861 
2862  template<int B>
2863  KOKKOS_INLINE_FUNCTION
2864  void
2865  operator() (const AsyncTag<B> &, const member_type &member) const {
2866  const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
2867  const local_ordinal_type blocksize_square = blocksize*blocksize;
2868 
2869  // constants
2870  const local_ordinal_type rowidx = member.league_rank();
2871  const local_ordinal_type partidx = rowidx2part(rowidx);
2872  const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
2873  const local_ordinal_type v = partidx % vector_length;
2874 
2875  const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
2876  const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
2877  const local_ordinal_type num_local_rows = lclrow.extent(0);
2878 
2879  // subview pattern
2880  auto bb = Kokkos::subview(b, block_range, 0);
2881  auto xx = Kokkos::subview(x, block_range, 0);
2882  auto xx_remote = Kokkos::subview(x_remote, block_range, 0);
2883  auto yy = Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0);
2884  auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
2885 
2886  const local_ordinal_type lr = lclrow(rowidx);
2887  const local_ordinal_type row = lr*blocksize;
2888  for (local_ordinal_type col=0;col<num_vectors;++col) {
2889  // y := b
2890  bb.assign_data(&b(row, col));
2891  yy.assign_data(&y_packed_scalar(pri, 0, col, v));
2892  if (member.team_rank() == 0)
2893  VectorCopy(member, blocksize, bb, yy);
2894  member.team_barrier();
2895 
2896  // y -= Rx
2897  const size_type A_k0 = A_rowptr[lr];
2898  Kokkos::parallel_for
2899  (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]),
2900  [&](const local_ordinal_type &k) {
2901  const size_type j = A_k0 + colindsub[k];
2902  A_block.assign_data( &tpetra_values(j*blocksize_square) );
2903 
2904  const local_ordinal_type A_colind_at_j = A_colind[j];
2905  if (A_colind_at_j < num_local_rows) {
2906  const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
2907  xx.assign_data( &x(loc*blocksize, col) );
2908  VectorGemv(member, blocksize, A_block, xx, yy);
2909  } else {
2910  const auto loc = A_colind_at_j - num_local_rows;
2911  xx_remote.assign_data( &x_remote(loc*blocksize, col) );
2912  VectorGemv(member, blocksize, A_block, xx_remote, yy);
2913  }
2914  });
2915  }
2916  }
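  // Team variant of AsyncTag: the block row comes from the league rank, the
  // owned/remote split is identical, and results are written through
  // y_packed_scalar, the scalar alias of the packed multivector.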
2917 
2918  template <int P, int B> struct OverlapTag {};
2919 
2920  template<int P, int B>
2921  inline
2922  void
2923  operator() (const OverlapTag<P,B> &, const local_ordinal_type& rowidx) const {
2924  const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
2925  const local_ordinal_type blocksize_square = blocksize*blocksize;
2926 
2927  // constants
2928  const local_ordinal_type partidx = rowidx2part(rowidx);
2929  const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
2930  const local_ordinal_type v = partidx % vector_length;
2931 
2932  const local_ordinal_type num_vectors = y_packed.extent(2);
2933  const local_ordinal_type num_local_rows = lclrow.extent(0);
2934 
2935  // temporary flat buffer for one block of y
2936  impl_scalar_type yy[max_blocksize] = {};
2937 
2938  auto colindsub_used = (P == 0 ? colindsub : colindsub_remote);
2939  auto rowptr_used = (P == 0 ? rowptr : rowptr_remote);
2940 
2941  const local_ordinal_type lr = lclrow(rowidx);
2942  const local_ordinal_type row = lr*blocksize;
2943  for (local_ordinal_type col=0;col<num_vectors;++col) {
2944  if (P == 0) {
2945  // y := b
2946  memcpy(yy, &b(row, col), sizeof(impl_scalar_type)*blocksize);
2947  } else {
2948  // y (temporary) := 0
2949  memset(yy, 0, sizeof(impl_scalar_type)*blocksize);
2950  }
2951 
2952  // y -= Rx
2953  const size_type A_k0 = A_rowptr[lr];
2954  for (size_type k=rowptr_used[lr];k<rowptr_used[lr+1];++k) {
2955  const size_type j = A_k0 + colindsub_used[k];
2956  const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
2957  const local_ordinal_type A_colind_at_j = A_colind[j];
2958  if (P == 0) {
2959  const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
2960  const impl_scalar_type * const xx = &x(loc*blocksize, col);
2961  SerialGemv(blocksize,AA,xx,yy);
2962  } else {
2963  const auto loc = A_colind_at_j - num_local_rows;
2964  const impl_scalar_type * const xx_remote = &x_remote(loc*blocksize, col);
2965  SerialGemv(blocksize,AA,xx_remote,yy);
2966  }
2967  }
2968  // move yy to y_packed
2969  if (P == 0) {
2970  for (local_ordinal_type k=0;k<blocksize;++k)
2971  y_packed(pri, k, col)[v] = yy[k];
2972  } else {
2973  for (local_ordinal_type k=0;k<blocksize;++k)
2974  y_packed(pri, k, col)[v] += yy[k];
2975  }
2976  }
2977  }
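  // The two OverlapTag passes let communication overlap computation:
  // P == 0 computes y := b - R_owned x using only locally owned columns,
  // while P == 1 runs after the halo exchange and accumulates
  // y -= R_remote x_remote (note the += into y_packed above).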
2978 
2979  template<int P, int B>
2980  KOKKOS_INLINE_FUNCTION
2981  void
2982  operator() (const OverlapTag<P,B> &, const member_type &member) const {
2983  const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
2984  const local_ordinal_type blocksize_square = blocksize*blocksize;
2985 
2986  // constants
2987  const local_ordinal_type rowidx = member.league_rank();
2988  const local_ordinal_type partidx = rowidx2part(rowidx);
2989  const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
2990  const local_ordinal_type v = partidx % vector_length;
2991 
2992  const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
2993  const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
2994  const local_ordinal_type num_local_rows = lclrow.extent(0);
2995 
2996  // subview pattern
2997  auto bb = Kokkos::subview(b, block_range, 0);
2998  auto xx = bb; //Kokkos::subview(x, block_range, 0);
2999  auto xx_remote = bb; //Kokkos::subview(x_remote, block_range, 0);
3000  auto yy = Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0);
3001  auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
3002  auto colindsub_used = (P == 0 ? colindsub : colindsub_remote);
3003  auto rowptr_used = (P == 0 ? rowptr : rowptr_remote);
3004 
3005  const local_ordinal_type lr = lclrow(rowidx);
3006  const local_ordinal_type row = lr*blocksize;
3007  for (local_ordinal_type col=0;col<num_vectors;++col) {
3008  yy.assign_data(&y_packed_scalar(pri, 0, col, v));
3009  if (P == 0) {
3010  // y := b
3011  bb.assign_data(&b(row, col));
3012  if (member.team_rank() == 0)
3013  VectorCopy(member, blocksize, bb, yy);
3014  member.team_barrier();
3015  }
3016 
3017  // y -= Rx
3018  const size_type A_k0 = A_rowptr[lr];
3019  Kokkos::parallel_for
3020  (Kokkos::TeamThreadRange(member, rowptr_used[lr], rowptr_used[lr+1]),
3021  [&](const local_ordinal_type &k) {
3022  const size_type j = A_k0 + colindsub_used[k];
3023  A_block.assign_data( &tpetra_values(j*blocksize_square) );
3024 
3025  const local_ordinal_type A_colind_at_j = A_colind[j];
3026  if (P == 0) {
3027  const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
3028  xx.assign_data( &x(loc*blocksize, col) );
3029  VectorGemv(member, blocksize, A_block, xx, yy);
3030  } else {
3031  const auto loc = A_colind_at_j - num_local_rows;
3032  xx_remote.assign_data( &x_remote(loc*blocksize, col) );
3033  VectorGemv(member, blocksize, A_block, xx_remote, yy);
3034  }
3035  });
3036  }
3037  }
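  // Team version of the two-pass scheme; only the P == 0 pass copies b into
  // yy, so the P == 1 (remote) pass must be launched after it completes.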
3038 
3039  // y = b - Rx; seq method
3040  template<typename MultiVectorLocalViewTypeY,
3041  typename MultiVectorLocalViewTypeB,
3042  typename MultiVectorLocalViewTypeX>
3043  void run(const MultiVectorLocalViewTypeY &y_,
3044  const MultiVectorLocalViewTypeB &b_,
3045  const MultiVectorLocalViewTypeX &x_) {
3046  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN("BlockTriDi::ComputeResidual::<SeqTag>");
3047 
3048  y = y_; b = b_; x = x_;
3049  if (is_cuda<execution_space>::value) {
3050 #if defined(KOKKOS_ENABLE_CUDA)
3051  const local_ordinal_type blocksize = blocksize_requested;
3052  const local_ordinal_type team_size = 8;
3053  const local_ordinal_type vector_size = ComputeResidualVectorRecommendedCudaVectorSize(blocksize, team_size);
3054  const Kokkos::TeamPolicy<execution_space,SeqTag> policy(rowptr.extent(0) - 1, team_size, vector_size);
3055  Kokkos::parallel_for
3056  ("ComputeResidual::TeamPolicy::run<SeqTag>", policy, *this);
3057 #endif
3058  } else {
3059 #if defined(__CUDA_ARCH__)
3060  TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: CUDA should not see this code");
3061 #else
3062  const Kokkos::RangePolicy<execution_space,SeqTag> policy(0, rowptr.extent(0) - 1);
3063  Kokkos::parallel_for
3064  ("ComputeResidual::RangePolicy::run<SeqTag>", policy, *this);
3065 #endif
3066  }
3067  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3068  }
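  // All run() overloads follow this dispatch shape: on CUDA, a TeamPolicy
  // with one league member per block row (team_size x vector_size suggested
  // by ComputeResidualVectorRecommendedCudaVectorSize); on host backends, a
  // flat RangePolicy over block rows. The tag type selects the operator()
  // overload invoked by Kokkos.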
3069 
3070  // y = b - R (x , x_remote)
3071  template<typename MultiVectorLocalViewTypeB,
3072  typename MultiVectorLocalViewTypeX,
3073  typename MultiVectorLocalViewTypeX_Remote>
3074  void run(const vector_type_3d_view &y_packed_,
3075  const MultiVectorLocalViewTypeB &b_,
3076  const MultiVectorLocalViewTypeX &x_,
3077  const MultiVectorLocalViewTypeX_Remote &x_remote_) {
3078  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN("BlockTriDi::ComputeResidual::<AsyncTag>");
3079 
3080  b = b_; x = x_; x_remote = x_remote_;
3081  if (is_cuda<execution_space>::value) {
3082 #if defined(KOKKOS_ENABLE_CUDA)
3083  y_packed_scalar = impl_scalar_type_4d_view((impl_scalar_type*)y_packed_.data(),
3084  y_packed_.extent(0),
3085  y_packed_.extent(1),
3086  y_packed_.extent(2),
3087  vector_length);
3088 #endif
3089  } else {
3090  y_packed = y_packed_;
3091  }
3092 
3093  if (is_cuda<execution_space>::value) {
3094 #if defined(KOKKOS_ENABLE_CUDA)
3095  const local_ordinal_type blocksize = blocksize_requested;
3096  const local_ordinal_type team_size = 8;
3097  const local_ordinal_type vector_size = ComputeResidualVectorRecommendedCudaVectorSize(blocksize, team_size);
3098  // local_ordinal_type vl_power_of_two = 1;
3099  // for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2);
3100  // vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1);
3101  // const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two;
3102 #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) { \
3103  const Kokkos::TeamPolicy<execution_space,AsyncTag<B> > \
3104  policy(rowidx2part.extent(0), team_size, vector_size); \
3105  Kokkos::parallel_for \
3106  ("ComputeResidual::TeamPolicy::run<AsyncTag>", \
3107  policy, *this); } break
3108  switch (blocksize_requested) {
3109  case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
3110  case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
3111  case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
3112  case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
3113  case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
3114  case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
3115  case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
3116  case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
3117  case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
3118  default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
3119  }
3120 #undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
3121 #endif
3122  } else {
3123 #if defined(__CUDA_ARCH__)
3124  TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: CUDA should not see this code");
3125 #else
3126 #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) { \
3127  const Kokkos::RangePolicy<execution_space,AsyncTag<B> > policy(0, rowidx2part.extent(0)); \
3128  Kokkos::parallel_for \
3129  ("ComputeResidual::RangePolicy::run<AsyncTag>", \
3130  policy, *this); } break
3131  switch (blocksize_requested) {
3132  case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
3133  case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
3134  case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
3135  case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
3136  case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
3137  case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
3138  case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
3139  case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
3140  case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
3141  default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
3142  }
3143 #undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
3144 #endif
3145  }
3146  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3147  }
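  // The switch promotes common block sizes to compile-time constants so the
  // inner loops can unroll; AsyncTag<0> is the runtime-blocksize fallback.
  // As a rough sketch, case 5 on the host path expands to
  //   { const Kokkos::RangePolicy<execution_space,AsyncTag<5> >
  //       policy(0, rowidx2part.extent(0));
  //     Kokkos::parallel_for("ComputeResidual::RangePolicy::run<AsyncTag>",
  //                          policy, *this); } break;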
3148 
3149  // y = b - R (y , y_remote)
3150  template<typename MultiVectorLocalViewTypeB,
3151  typename MultiVectorLocalViewTypeX,
3152  typename MultiVectorLocalViewTypeX_Remote>
3153  void run(const vector_type_3d_view &y_packed_,
3154  const MultiVectorLocalViewTypeB &b_,
3155  const MultiVectorLocalViewTypeX &x_,
3156  const MultiVectorLocalViewTypeX_Remote &x_remote_,
3157  const bool compute_owned) {
3158  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN("BlockTriDi::ComputeResidual::<OverlapTag>");
3159 
3160  b = b_; x = x_; x_remote = x_remote_;
3161  if (is_cuda<execution_space>::value) {
3162 #if defined(KOKKOS_ENABLE_CUDA)
3163  y_packed_scalar = impl_scalar_type_4d_view((impl_scalar_type*)y_packed_.data(),
3164  y_packed_.extent(0),
3165  y_packed_.extent(1),
3166  y_packed_.extent(2),
3167  vector_length);
3168 #endif
3169  } else {
3170  y_packed = y_packed_;
3171  }
3172 
3173  if (is_cuda<execution_space>::value) {
3174 #if defined(KOKKOS_ENABLE_CUDA)
3175  const local_ordinal_type blocksize = blocksize_requested;
3176  const local_ordinal_type team_size = 8;
3177  const local_ordinal_type vector_size = ComputeResidualVectorRecommendedCudaVectorSize(blocksize, team_size);
3178  // local_ordinal_type vl_power_of_two = 1;
3179  // for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2);
3180  // vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1);
3181  // const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two;
3182 #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) \
3183  if (compute_owned) { \
3184  const Kokkos::TeamPolicy<execution_space,OverlapTag<0,B> > \
3185  policy(rowidx2part.extent(0), team_size, vector_size); \
3186  Kokkos::parallel_for \
3187  ("ComputeResidual::TeamPolicy::run<OverlapTag<0> >", policy, *this); \
3188  } else { \
3189  const Kokkos::TeamPolicy<execution_space,OverlapTag<1,B> > \
3190  policy(rowidx2part.extent(0), team_size, vector_size); \
3191  Kokkos::parallel_for \
3192  ("ComputeResidual::TeamPolicy::run<OverlapTag<1> >", policy, *this); \
3193  } break
3194  switch (blocksize_requested) {
3195  case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
3196  case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
3197  case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
3198  case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
3199  case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
3200  case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
3201  case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
3202  case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
3203  case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
3204  default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
3205  }
3206 #undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
3207 #endif
3208  } else {
3209 #if defined(__CUDA_ARCH__)
3210  TEUCHOS_TEST_FOR_EXCEPT_MSG(true, "Error: CUDA should not see this code");
3211 #else
3212 #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) \
3213  if (compute_owned) { \
3214  const Kokkos::RangePolicy<execution_space,OverlapTag<0,B> > \
3215  policy(0, rowidx2part.extent(0)); \
3216  Kokkos::parallel_for \
3217  ("ComputeResidual::RangePolicy::run<OverlapTag<0> >", policy, *this); \
3218  } else { \
3219  const Kokkos::RangePolicy<execution_space,OverlapTag<1,B> > \
3220  policy(0, rowidx2part.extent(0)); \
3221  Kokkos::parallel_for \
3222  ("ComputeResidual::RangePolicy::run<OverlapTag<1> >", policy, *this); \
3223  } break
3224 
3225  switch (blocksize_requested) {
3226  case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
3227  case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
3228  case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
3229  case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
3230  case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
3231  case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
3232  case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
3233  case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
3234  case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
3235  default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
3236  }
3237 #undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
3238 #endif
3239  }
3240  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3241  }
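  // Same compile-time blocksize dispatch as the AsyncTag overload above;
  // compute_owned selects OverlapTag<0,B> (owned pass, launched while the
  // halo exchange is in flight) or OverlapTag<1,B> (remote pass afterwards).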
3242  };
3243 
3244  template<typename MatrixType>
3245  void reduceVector(const ConstUnmanaged<typename ImplType<MatrixType>::impl_scalar_type_1d_view> zz,
3246  const typename ImplType<MatrixType>::local_ordinal_type ncols,
3247  /* */ typename ImplType<MatrixType>::magnitude_type *vals) {
3248  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN("BlockTriDi::ReduceVector");
3249  using impl_type = ImplType<MatrixType>;
3250  using local_ordinal_type = typename impl_type::local_ordinal_type;
3251  using impl_scalar_type = typename impl_type::impl_scalar_type;
3252  using magnitude_type = typename impl_type::magnitude_type;
3253 #if 0
3254  const auto norm2 = KokkosBlas::nrm1(zz);
3255 #else
3256  impl_scalar_type norm2(0);
3257  Kokkos::parallel_reduce
3258  ("ReduceMultiVector::Device",
3259  Kokkos::RangePolicy<typename impl_type::execution_space>(0,zz.extent(0)),
3260  KOKKOS_LAMBDA(const local_ordinal_type &i, impl_scalar_type &update) {
3261  update += zz(i);
3262  }, norm2);
3263 #endif
3264  const auto norm = Kokkos::ArithTraits<impl_scalar_type>::abs(norm2)/magnitude_type(ncols);
3265  for (local_ordinal_type j=0;j<ncols;++j) vals[j] = norm;
3266 
3267  IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
3268  }
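  // reduceVector sums the entries of zz on device and writes the averaged
  // magnitude |sum(zz)| / ncols into every vals[j]; the disabled #if 0 branch
  // sketches a KokkosBlas-based alternative.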
3269 
3273  template<typename MatrixType>
3274  struct NormManager {
3275  public:
3276  using impl_type = ImplType<MatrixType>;
3277  using host_execution_space = typename impl_type::host_execution_space;
3278  using magnitude_type = typename impl_type::magnitude_type;
3279 
3280  private:
3281  bool collective_;
3282  int sweep_step_, num_vectors_;
3283 #ifdef HAVE_IFPACK2_MPI
3284  MPI_Request mpi_request_;
3285  MPI_Comm comm_;
3286 #endif
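  // work_[0] holds the rank-local (then globally reduced) squared norm for
  // the current check, work_[1] is the send-side copy used by the allreduce,
  // and work_[2] keeps the first sweep's value as the reference for the
  // relative tolerance test.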
3287  magnitude_type work_[3];
3288 
3289  public:
3290  NormManager() = default;
3291  NormManager(const NormManager &b) = default;
3292  NormManager(const Teuchos::RCP<const Teuchos::Comm<int> >& comm) {
3293  sweep_step_ = 1;
3294  collective_ = comm->getSize() > 1;
3295  if (collective_) {
3296 #ifdef HAVE_IFPACK2_MPI
3297  const auto mpi_comm = Teuchos::rcp_dynamic_cast<const Teuchos::MpiComm<int> >(comm);
3298  TEUCHOS_ASSERT( ! mpi_comm.is_null());
3299  comm_ = *mpi_comm->getRawMpiComm();
3300 #endif
3301  }
3302  }
3303 
3304  // Check the norm every sweep_step sweeps.
3305  void setCheckFrequency(const int sweep_step) {
3306  TEUCHOS_TEST_FOR_EXCEPT_MSG(sweep_step < 1, "sweep step must be >= 1");
3307  sweep_step_ = sweep_step;
3308  }
3309 
3310  // Get the buffer into which to store rank-local squared norms.
3311  magnitude_type* getBuffer() { return &work_[0]; }
3312 
3313  // Call MPI_Iallreduce to find the global squared norms.
3314  void ireduce(const int sweep, const bool force = false) {
3315  if ( ! force && sweep % sweep_step_) return;
3316 
3317  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::NormManager::Ireduce");
3318  auto send_data = &work_[0] + 1;
3319  auto recv_data = &work_[0];
3320  if (collective_) {
3321  work_[1] = work_[0];
3322 #ifdef HAVE_IFPACK2_MPI
3323 # if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3)
3324  MPI_Iallreduce(send_data, recv_data, 1,
3325  Teuchos::Details::MpiTypeTraits<magnitude_type>::getType(),
3326  MPI_SUM, comm_, &mpi_request_);
3327 # else
3328  MPI_Allreduce (send_data, recv_data, 1,
3329  Teuchos::Details::MpiTypeTraits<magnitude_type>::getType(),
3330  MPI_SUM, comm_);
3331 # endif
3332 #endif
3333  }
3334  }
3335 
3336  // Check if the norm-based termination criterion is met. tol2 is the
3337  // tolerance squared. Sweep is the sweep index. If not every iteration is
3338  // being checked, this function immediately returns false. If a check must
3339  // be done at this iteration, it waits for the reduction triggered by
3340  // ireduce to complete, then checks the global norm against the tolerance.
3341  bool checkDone (const int sweep, const magnitude_type tol2, const bool force = false) {
3342  // early return
3343  if (sweep <= 0) return false;
3344 
3345  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::NormManager::CheckDone");
3346 
3347  TEUCHOS_ASSERT(sweep >= 1);
3348  if ( ! force && (sweep - 1) % sweep_step_) return false;
3349  if (collective_) {
3350 #ifdef HAVE_IFPACK2_MPI
3351 # if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3)
3352  MPI_Wait(&mpi_request_, MPI_STATUS_IGNORE);
3353 # else
3354  // Do nothing.
3355 # endif
3356 #endif
3357  }
3358  bool r_val = false;
3359  if (sweep == 1) {
3360  work_[2] = work_[0];
3361  } else {
3362  r_val = (work_[0] < tol2*work_[2]);
3363  }
3364  return r_val;
3365  }
3366 
3367  // After termination has occurred, finalize the norms for use in
3368  // getNorms0/getNormsFinal.
3369  void finalize () {
3370  work_[0] = std::sqrt(work_[0]); // after converged
3371  work_[2] = std::sqrt(work_[2]); // first norm
3372  }
3373 
3374  // Report norms to the caller.
3375  const magnitude_type* getNorms0 () const { return &work_[2]; }
3376  const magnitude_type* getNormsFinal () const { return &work_[0]; }
3377  };
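  // Typical protocol, as driven by applyInverseJacobi below: store the
  // rank-local squared norm via getBuffer(), call ireduce(sweep) to launch
  // the global sum, test checkDone(sweep + 1, tol*tol) on the next pass, and
  // call finalize() once before reporting getNorms0/getNormsFinal.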
3378 
3382  template<typename MatrixType>
3383  int
3384  applyInverseJacobi(// importer
3385  const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
3386  const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
3387  const Teuchos::RCP<AsyncableImport<MatrixType> > &async_importer,
3388  const bool overlap_communication_and_computation,
3389  // tpetra interface
3390  const typename ImplType<MatrixType>::tpetra_multivector_type &X, // tpetra interface
3391  /* */ typename ImplType<MatrixType>::tpetra_multivector_type &Y, // tpetra interface
3392  /* */ typename ImplType<MatrixType>::tpetra_multivector_type &Z, // temporary tpetra interface (seq_method)
3393  /* */ typename ImplType<MatrixType>::impl_scalar_type_1d_view &W, // temporary tpetra interface (diff)
3394  // local object interface
3395  const PartInterface<MatrixType> &interf, // mesh interface
3396  const BlockTridiags<MatrixType> &btdm, // packed block tridiagonal matrices
3397  const AmD<MatrixType> &amd, // R = A - D
3398  /* */ typename ImplType<MatrixType>::vector_type_1d_view &work, // workspace for packed multivector of right hand side
3399  /* */ NormManager<MatrixType> &norm_manager,
3400  // preconditioner parameters
3401  const typename ImplType<MatrixType>::impl_scalar_type &damping_factor,
3402  /* */ bool is_y_zero,
3403  const int max_num_sweeps,
3404  const typename ImplType<MatrixType>::magnitude_type tol,
3405  const int check_tol_every) {
3406  IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ApplyInverseJacobi");
3407 
3408  using impl_type = ImplType<MatrixType>;
3409  using memory_space = typename impl_type::memory_space;
3410 
3411  using local_ordinal_type = typename impl_type::local_ordinal_type;
3412  using size_type = typename impl_type::size_type;
3413  using impl_scalar_type = typename impl_type::impl_scalar_type;
3414  using magnitude_type = typename impl_type::magnitude_type;
3415  using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
3416  using vector_type_1d_view = typename impl_type::vector_type_1d_view;
3417  using vector_type_3d_view = typename impl_type::vector_type_3d_view;
3418  using tpetra_multivector_type = typename impl_type::tpetra_multivector_type;
3419 
3420  using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
3421 
3422  // at most one of the tpetra importer and the async importer may be active
3423  TEUCHOS_TEST_FOR_EXCEPT_MSG(!tpetra_importer.is_null() && !async_importer.is_null(),
3424  "The Tpetra importer and the async importer cannot both be active.");
3425  // max number of sweeps should be a positive number
3426  TEUCHOS_TEST_FOR_EXCEPT_MSG(max_num_sweeps <= 0,
3427  "Maximum number of sweeps must be >= 1.");
3428 
3429  // const parameters
3430  const bool is_seq_method_requested = !tpetra_importer.is_null();
3431  const bool is_async_importer_active = !async_importer.is_null();
3432  const bool is_norm_manager_active = tol > Kokkos::ArithTraits<magnitude_type>::zero();
3433  const magnitude_type tolerance = tol*tol;
3434  const local_ordinal_type blocksize = btdm.values.extent(1);
3435  const local_ordinal_type num_vectors = Y.getNumVectors();
3436  const local_ordinal_type num_blockrows = interf.part2packrowidx0_back;
3437 
3438  const impl_scalar_type zero(0.0);
3439 
3440  TEUCHOS_TEST_FOR_EXCEPT_MSG(is_norm_manager_active && is_seq_method_requested,
3441  "The seq method for applyInverseJacobi, " <<
3442  "which in any case is for developer use only, " <<
3443  "does not support norm-based termination.");
3444 
3445  // resize the workspace if a larger span is required
3446  const size_type work_span_required = num_blockrows*num_vectors*blocksize;
3447  if (work.span() < work_span_required)
3448  work = vector_type_1d_view("vector workspace 1d view", work_span_required);
3449 
3450  // construct W
3451  const local_ordinal_type W_size = interf.packptr.extent(0)-1;
3452  if (local_ordinal_type(W.extent(0)) < W_size)
3453  W = impl_scalar_type_1d_view("W", W_size);
3454 
3455  typename AsyncableImport<MatrixType>::impl_scalar_type_2d_view remote_multivector;
3456  {
3457  if (is_seq_method_requested) {
3458  if (Z.getNumVectors() != Y.getNumVectors())
3459  Z = tpetra_multivector_type(tpetra_importer->getTargetMap(), num_vectors, false);
3460  } else {
3461  if (is_async_importer_active) {
3462  // create comm data buffer and keep it here
3463  async_importer->createDataBuffer(num_vectors);
3464  remote_multivector = async_importer->getRemoteMultiVectorLocalView();
3465  }
3466  }
3467  }
3468 
3469  // wrap the workspace with 3d view
3470  vector_type_3d_view pmv(work.data(), num_blockrows, blocksize, num_vectors);
3471  const auto XX = X.template getLocalView<memory_space>();
3472  const auto YY = Y.template getLocalView<memory_space>();
3473  const auto ZZ = Z.template getLocalView<memory_space>();
3474 
3475  MultiVectorConverter<MatrixType> multivector_converter(interf, pmv);
3476  SolveTridiags<MatrixType> solve_tridiags(interf, btdm, pmv,
3477  damping_factor, is_norm_manager_active);
3478 
3479  const local_ordinal_type_1d_view dummy_local_ordinal_type_1d_view;
3480  ComputeResidualVector<MatrixType>
3481  compute_residual_vector(amd, A->getCrsGraph().getLocalGraph(), blocksize, interf,
3482  is_async_importer_active ? async_importer->dm2cm : dummy_local_ordinal_type_1d_view);
3483 
3484  // set how often the norm-based termination criterion is checked
3485  if (is_norm_manager_active)
3486  norm_manager.setCheckFrequency(check_tol_every);
3487 
3488  // iterate
3489  int sweep = 0;
3490  for (;sweep<max_num_sweeps;++sweep) {
3491  {
3492  if (is_y_zero) {
3493  // pmv := x(lclrow)
3494  multivector_converter.run(XX);
3495  Kokkos::deep_copy(YY, zero);
3496  } else {
3497  if (is_seq_method_requested) {
3498  // y := x - R y
3499  Z.doImport(Y, *tpetra_importer, Tpetra::REPLACE);
3500  compute_residual_vector.run(YY, XX, ZZ);
3501 
3502  // pmv := y(lclrow).
3503  multivector_converter.run(YY);
3504  } else {
3505  // fused y := x - R y and pmv := y(lclrow);
3506  // real use case does not use overlap comp and comm
3507  if (overlap_communication_and_computation || !is_async_importer_active) {
3508  if (is_async_importer_active) async_importer->asyncSendRecv(YY);
3509  compute_residual_vector.run(pmv, XX, YY, remote_multivector, true);
3510  if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) {
3511  if (is_async_importer_active) async_importer->cancel();
3512  break;
3513  }
3514  if (is_async_importer_active) {
3515  async_importer->syncRecv();
3516  compute_residual_vector.run(pmv, XX, YY, remote_multivector, false);
3517  }
3518  } else {
3519  if (is_async_importer_active)
3520  async_importer->syncExchange(YY);
3521  if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) break;
3522  compute_residual_vector.run(pmv, XX, YY, remote_multivector);
3523  }
3524  }
3525  }
3526  }
3527 
3528  // pmv := inv(D) pmv.
3529  {
3530  solve_tridiags.run(YY, W);
3531  }
3532  {
3533  if (is_norm_manager_active) {
3534  // y(lclrow) := (1 - a) y(lclrow) + a pmv (a = damping factor) was applied above; reduce the diff norms in W
3535  reduceVector<MatrixType>(W, num_vectors, norm_manager.getBuffer());
3536  if (sweep + 1 == max_num_sweeps) {
3537  norm_manager.ireduce(sweep, true);
3538  norm_manager.checkDone(sweep + 1, tolerance, true);
3539  } else {
3540  norm_manager.ireduce(sweep);
3541  }
3542  }
3543  }
3544 
3545  is_y_zero = false;
3546  }
3547 
3548  // sqrt the norms for the caller's use.
3549  if (is_norm_manager_active) norm_manager.finalize();
3550 
3551  return sweep;
3552  }
3553 
3554 
3555  template<typename MatrixType>
3556  struct ImplObject {
3557  using impl_type = ImplType<MatrixType>;
3558  using part_interface_type = PartInterface<MatrixType>;
3559  using block_tridiags_type = BlockTridiags<MatrixType>;
3560  using amd_type = AmD<MatrixType>;
3561  using norm_manager_type = NormManager<MatrixType>;
3562  using async_import_type = AsyncableImport<MatrixType>;
3563 
3564  // distributed objects
3565  Teuchos::RCP<const typename impl_type::tpetra_block_crs_matrix_type> A;
3566  Teuchos::RCP<const typename impl_type::tpetra_import_type> tpetra_importer;
3567  Teuchos::RCP<async_import_type> async_importer;
3568  bool overlap_communication_and_computation;
3569 
3570  // copy of Y (mutable to penetrate const)
3571  mutable typename impl_type::tpetra_multivector_type Z;
3572  mutable typename impl_type::impl_scalar_type_1d_view W;
3573 
3574  // local objects
3575  part_interface_type part_interface;
3576  block_tridiags_type block_tridiags; // D
3577  amd_type a_minus_d; // R = A - D
3578  mutable typename impl_type::vector_type_1d_view work; // right hand side workspace
3579  mutable norm_manager_type norm_manager;
3580  };
3581 
3582  } // namespace BlockTriDiContainerDetails
3583 
3584 } // namespace Ifpack2
3585 
3586 #endif