Sacado Package Browser (Single Doxygen Collection)  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
mat_vec_hierarchical_dfad.cpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Sacado Package
5 // Copyright (2006) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // This library is free software; you can redistribute it and/or modify
11 // it under the terms of the GNU Lesser General Public License as
12 // published by the Free Software Foundation; either version 2.1 of the
13 // License, or (at your option) any later version.
14 //
15 // This library is distributed in the hope that it will be useful, but
16 // WITHOUT ANY WARRANTY; without even the implied warranty of
17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 // Lesser General Public License for more details.
19 //
20 // You should have received a copy of the GNU Lesser General Public
21 // License along with this library; if not, write to the Free Software
22 // Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
23 // USA
24 // Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps
25 // (etphipp@sandia.gov).
26 //
27 // ***********************************************************************
28 // @HEADER
29 
// Feature macros defined ahead of the "Sacado.hpp" include that follows, so
// they are already set when that header is processed.
// NOTE(review): presumably these select Sacado's hierarchical-parallelism DFad
// view support and its Kokkos memory-pool allocation path -- their exact
// effect is inside Sacado and not visible here; confirm against Sacado docs.
30 #define SACADO_VIEW_CUDA_HIERARCHICAL_DFAD 1
31 #define SACADO_KOKKOS_USE_MEMORY_POOL 1
32 
33 #include "Sacado.hpp"
34 
36 
37 #include "Kokkos_Timer.hpp"
38 
// Matrix-vector product c = A*b launched with a Kokkos::TeamPolicy, where each
// team member accumulates one row into a local scalar_type temporary.
//
// A is read as A(i,j) with extents (m, n), b is read as b(j), and c is written
// as c(i). The scalar type and execution space are both taken from ViewTypeC.
39 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
40 void run_mat_vec_hierarchical_dfad(const ViewTypeA& A, const ViewTypeB& b,
41  const ViewTypeC& c) {
42  typedef typename ViewTypeC::value_type scalar_type;
43  typedef typename ViewTypeC::execution_space execution_space;
44 
// Launch configuration: vector length 32 (CUDA) or 64 (HIP) with 128 /
// vector_size members per team when the corresponding flag is true, otherwise
// 1 and 1.
// NOTE(review): is_cuda / is_hip are not declared on any line present in this
// listing (listing lines 46 and 50 are absent) -- see the original file.
45 #if defined (KOKKOS_ENABLE_CUDA)
47  const unsigned vector_size = is_cuda ? 32 : 1;
48  const unsigned team_size = is_cuda ? 128 / vector_size : 1;
49 #elif defined (KOKKOS_ENABLE_HIP)
51  const unsigned vector_size = is_hip ? 64 : 1;
52  const unsigned team_size = is_hip ? 128 / vector_size : 1;
53 #else
54  const unsigned vector_size = 1;
55  const unsigned team_size = 1;
56 #endif
57 
58  const int m = A.extent(0);
59  const int n = A.extent(1);
// Number of teams: m / team_size rounded up, so every row gets a team member.
60  const int range = (m+team_size-1)/team_size;
61 
62  typedef Kokkos::TeamPolicy<execution_space> Policy;
63  Kokkos::parallel_for(
64  Policy( range,team_size,vector_size ),
65  KOKKOS_LAMBDA (const typename Policy::member_type& team) {
// Row handled by this team member.
66  const int i = team.league_rank()*team.team_size() + team.team_rank();
// Because the team count is rounded up, members of the last team can land
// past the final row; they do nothing.
67  if (i >= m)
68  return;
69 
// Dot product of row i of A with b, accumulated in a local temporary and then
// stored to c(i).
70  scalar_type t = 0.0;
71  for (int j=0; j<n; ++j)
72  t += A(i,j)*b(j);
73  c(i) = t;
74  }
75  );
76 }
77 
// Matrix-vector product c = A*b launched with a Kokkos::TeamPolicy, where each
// team member accumulates its row into an entry of a per-team scratch view
// instead of a local temporary.
//
// NOTE(review): the line carrying the return type and function name (listing
// line 79) is absent from this listing; the symbol index at the end of the
// page lists `void run_mat_vec_hierarchical_dfad_scratch(...)` with this
// parameter list.
78 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
80  const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c) {
81  typedef typename ViewTypeC::value_type scalar_type;
82  typedef typename ViewTypeC::execution_space execution_space;
83  typedef Kokkos::TeamPolicy<execution_space> Policy;
84  typedef typename Policy::member_type team_member;
// Unmanaged rank-1 LayoutLeft view of scalar_type placed in the execution
// space's scratch memory space.
85  typedef Kokkos::View<scalar_type*,Kokkos::LayoutLeft, typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> TmpScratchSpace;
86 
// Launch configuration: vector length 32 (CUDA) or 64 (HIP) with 128 /
// VectorSize members per team when the corresponding flag is true, otherwise
// 1 and 1.
// NOTE(review): is_cuda / is_hip are not declared on any line present in this
// listing (listing lines 88 and 92 are absent) -- see the original file.
87 #if defined (KOKKOS_ENABLE_CUDA)
89  const unsigned VectorSize = is_cuda ? 32 : 1;
90  const unsigned TeamSize = is_cuda ? 128 / VectorSize : 1;
91 #elif defined (KOKKOS_ENABLE_HIP)
93  const unsigned VectorSize = is_hip ? 64 : 1;
94  const unsigned TeamSize = is_hip ? 128 / VectorSize : 1;
95 #else
96  const unsigned VectorSize = 1;
97  const unsigned TeamSize = 1;
98 #endif
99 
100  const int m = A.extent(0);
101  const int n = A.extent(1);
// Scalar (Fad) dimension of A; it is passed as the extra size argument of the
// scratch view below.
102  const int p = dimension_scalar(A);
// Number of teams: m / TeamSize rounded up.
103  const int N = (m+TeamSize-1)/TeamSize;
104 
105  Policy policy(N, TeamSize, VectorSize);
// Scratch bytes requested per team: one TmpScratchSpace with TeamSize entries
// and scalar dimension p, at scratch level 0.
106  const size_t bytes = TmpScratchSpace::shmem_size(TeamSize,p);
107  Kokkos::parallel_for(
108  policy.set_scratch_size(0, Kokkos::PerTeam(bytes)),
109  KOKKOS_LAMBDA (const team_member& team) {
110  const int team_rank = team.team_rank();
111  const int team_size = team.team_size();
// Wrap the team's level-0 scratch in a view with one entry per team member.
112  TmpScratchSpace t(team.team_scratch(0), team_size, p);
// Row handled by this team member; members past the last row (the team count
// is rounded up) skip the computation.
113  const int i = team.league_rank()*team_size + team_rank;
114  if (i < m) {
// Dot product of row i of A with b, accumulated in this member's scratch
// entry and then stored to c(i).
115  t(team_rank) = 0.0;
116  for (int j=0; j<n; ++j)
117  t(team_rank) += A(i,j)*b(j);
118  c(i) = t(team_rank);
119  }
120  }
121  );
122 }
123 
// Checks the derivative components of c against expected values and prints a
// "Comparison failed!" line to std::cout for every mismatch.
//
// A is used only for its extents and scalar dimension, b is not read, and c
// is copied to a host mirror before being inspected. The function reports
// mismatches but does not return a status or stop on failure.
124 template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
125 void
126 check_deriv_hierarchical_dfad(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
127 {
// Absolute tolerance for the comparison below.
128  const double tol = 1.0e-14;
129  typedef typename ViewTypeC::value_type value_type;
// Bring the result to the host so it can be read in a plain loop.
130  typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(c);
131  Kokkos::deep_copy(h_c, c);
132  const size_t m = A.extent(0);
133  const size_t n = A.extent(1);
134  const size_t p = Kokkos::dimension_scalar(A);
135  for (size_t i=0; i<m; ++i) {
136  for (size_t j=0; j<p; ++j) {
// Expected entry: 2*n for every index except the last (j == p-1), where it
// is n.
// NOTE(review): these values match inputs whose value and derivative entries
// are all 1.0; presumably the last index corresponds to the value slot rather
// than a derivative -- confirm against the Sacado view layout.
137  value_type t = (j == p-1 ? n : 2*n);
138  if (std::abs(h_c(i).fastAccessDx(j)- t) > tol) {
139  std::cout << "Comparison failed! " << i << "," << j << " : "
140  << h_c(i).fastAccessDx(j) << " , " << t << std::endl;
141  }
142  }
143  }
144 }
145 
// Timing driver: allocates Fad views A (m x n), b (n) and c (m), fills A and b
// with 1.0, creates a Sacado global memory pool, times nloop iterations and
// returns the result in a Perf record.
//
//   m, n   - matrix extents
//   p      - number of derivative components (views are allocated with p+1)
//   nloop  - number of timed iterations; perf.time is the average over them
//   check  - guards the branch near the end of the function (its body is not
//            present in this listing)
//
// NOTE(review): several lines of the original are absent from this listing
// (155-157, 176, 179, 199, 204, 213, 216), including the ConLayoutA/B/C
// typedefs, the is_cuda / is_hip declarations, the kernel invocations and the
// body of the check branch -- consult the original file for them.
146 template <typename FadType, typename ... ViewArgs>
147 Perf
148 do_time_fad_hierarchical_dfad(const size_t m, const size_t n, const size_t p,
149  const size_t nloop, const bool check)
150 {
151  typedef Kokkos::View<FadType**, ViewArgs...> ViewTypeA;
152  typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeB;
153  typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeC;
154  typedef typename ViewTypeA::execution_space execution_space;
// View types actually allocated below; they use the ConLayout* layouts whose
// declarations are among the absent lines.
158  typedef Kokkos::View<FadType**, ConLayoutA, execution_space> ConViewTypeA;
159  typedef Kokkos::View<FadType*, ConLayoutB, execution_space> ConViewTypeB;
160  typedef Kokkos::View<FadType*, ConLayoutC, execution_space> ConViewTypeC;
161 
// The trailing p+1 argument is the scalar dimension of each Fad view.
162  ConViewTypeA A("A",m,n,p+1);
163  ConViewTypeB b("B",n,p+1);
164  ConViewTypeC c("c",m,p+1);
165 
166  // FadType a(p, 1.0);
167  // for (size_t k=0; k<p; ++k)
168  // a.fastAccessDx(k) = 1.0;
// Fill A and b with 1.0 through their array_type views.
169  Kokkos::deep_copy(typename ConViewTypeA::array_type(A), 1.0);
170  Kokkos::deep_copy(typename ConViewTypeB::array_type(b), 1.0);
171 
172  Kokkos::Timer wall_clock;
173  Perf perf;
174 
// warp_dim: 32 (CUDA) or 64 (HIP) when the corresponding flag is true,
// otherwise 1.
175 #if defined (KOKKOS_ENABLE_CUDA)
177  const size_t warp_dim = is_cuda ? 32 : 1;
178 #elif defined (KOKKOS_ENABLE_HIP)
180  const size_t warp_dim = is_hip ? 64 : 1;
181 #else
182  const size_t warp_dim = 1;
183 #endif
184 
// Memory pool sizing: one block holds p doubles; the pool is sized for
// 1.2 * (concurrency / warp_dim) such blocks, and a superblock holds
// max(nkernels / 100, 1) blocks.
185  const size_t concurrency = execution_space().concurrency();
186  const size_t block_size = p*sizeof(double);
187  const size_t nkernels = concurrency / warp_dim;
188  const size_t mem_pool_size =
189  static_cast<size_t>(1.2*nkernels*block_size);
190  const size_t superblock_size = std::max<size_t>(nkernels / 100, 1) * block_size;
191  execution_space space;
// block_size is passed as both the minimum and the maximum block size.
192  Sacado::createGlobalMemoryPool(space, mem_pool_size,
193  block_size,
194  block_size,
195  superblock_size
196  );
197 
198  // Execute the kernel once to warm up
200  execution_space().fence();
201 
// Timed region: reset the timer, run nloop iterations (the loop body is on an
// absent line), then fence before reading the timer.
202  wall_clock.reset();
203  for (size_t l=0; l<nloop; l++) {
205  }
206  execution_space().fence();
207 
// Average seconds per iteration, the flop count used by this benchmark, and
// flops per second divided by 1e9.
208  perf.time = wall_clock.seconds() / nloop;
209  perf.flops = m*n*(2+4*p);
210  perf.throughput = perf.flops / perf.time / 1.0e9;
211 
212  if (check) {
214  }
215 
217 
218  return perf;
219 }
220 
// Timing driver: allocates Fad views A (m x n), b (n) and c (m), fills A and b
// with 1.0, times nloop iterations and returns the result in a Perf record.
// Unlike the other timing driver in this listing, no global memory pool is
// created in the visible lines.
//
//   m, n   - matrix extents
//   p      - number of derivative components (views are allocated with p+1)
//   nloop  - number of timed iterations; perf.time is the average over them
//   check  - guards the branch near the end of the function (its body is not
//            present in this listing)
//
// NOTE(review): the line carrying the function name (listing line 223) is
// absent; the symbol index at the end of the page lists
// `Perf do_time_fad_hierarchical_dfad_scratch(...)` with this parameter list.
// Listing lines 231-233, 252, 257 and 266 are absent as well (ConLayoutA/B/C
// typedefs, kernel invocations, body of the check branch).
221 template <typename FadType, typename ... ViewArgs>
222 Perf
224  const size_t m, const size_t n, const size_t p, const size_t nloop,
225  const bool check)
226 {
227  typedef Kokkos::View<FadType**, ViewArgs...> ViewTypeA;
228  typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeB;
229  typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeC;
230  typedef typename ViewTypeA::execution_space execution_space;
// View types actually allocated below; they use the ConLayout* layouts whose
// declarations are among the absent lines.
234  typedef Kokkos::View<FadType**, ConLayoutA, execution_space> ConViewTypeA;
235  typedef Kokkos::View<FadType*, ConLayoutB, execution_space> ConViewTypeB;
236  typedef Kokkos::View<FadType*, ConLayoutC, execution_space> ConViewTypeC;
237 
// The trailing p+1 argument is the scalar dimension of each Fad view.
238  ConViewTypeA A("A",m,n,p+1);
239  ConViewTypeB b("B",n,p+1);
240  ConViewTypeC c("c",m,p+1);
241 
242  // FadType a(p, 1.0);
243  // for (size_t k=0; k<p; ++k)
244  // a.fastAccessDx(k) = 1.0;
// Fill A and b with 1.0 through their array_type views.
245  Kokkos::deep_copy(typename ConViewTypeA::array_type(A), 1.0);
246  Kokkos::deep_copy(typename ConViewTypeB::array_type(b), 1.0);
247 
248  Kokkos::Timer wall_clock;
249  Perf perf;
250 
251  // Execute the kernel once to warm up
253  execution_space().fence();
254 
// Timed region: reset the timer, run nloop iterations (the loop body is on an
// absent line), then fence before reading the timer.
255  wall_clock.reset();
256  for (size_t l=0; l<nloop; l++) {
258  }
259  execution_space().fence();
260 
// Average seconds per iteration, the flop count used by this benchmark, and
// flops per second divided by 1e9.
261  perf.time = wall_clock.seconds() / nloop;
262  perf.flops = m*n*(2+4*p);
263  perf.throughput = perf.flops / perf.time / 1.0e9;
264 
265  if (check) {
267  }
268 
269  return perf;
270 }
271 
273 
// Explicitly instantiates both timing drivers (do_time_fad_hierarchical_dfad
// and do_time_fad_hierarchical_dfad_scratch) for Fad type FAD on device DEV,
// once for each of three view-argument packs: <Kokkos::LayoutLeft, DEV>,
// <Kokkos::LayoutRight, DEV> and <DEV>.
274 #define INST_FUNC_FAD_DEV(FAD,DEV) \
275  template Perf do_time_fad_hierarchical_dfad< FAD, Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
276  template Perf do_time_fad_hierarchical_dfad< FAD, Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
277  template Perf do_time_fad_hierarchical_dfad< FAD, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
278  template Perf do_time_fad_hierarchical_dfad_scratch< FAD, Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
279  template Perf do_time_fad_hierarchical_dfad_scratch< FAD, Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
280  template Perf do_time_fad_hierarchical_dfad_scratch< FAD, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check );
281 
// Instantiates the timing drivers on device DEV for the single Fad type
// DFad_type.
// NOTE(review): DFad_type is not declared on any line present in this listing
// (listing line 272 is absent); the symbol index at the end of the page shows
// it as Sacado::Fad::DFad< double >.
282 #define INST_FUNC_DEV(DEV) \
283  INST_FUNC_FAD_DEV( DFad_type, DEV )
284 
// Emit the explicit instantiations once for each Kokkos backend whose
// KOKKOS_ENABLE_* macro is defined: Serial, OpenMP, Threads, Cuda and HIP.
285 #ifdef KOKKOS_ENABLE_SERIAL
286 INST_FUNC_DEV(Kokkos::Serial)
287 #endif
288 
289 #ifdef KOKKOS_ENABLE_OPENMP
290 INST_FUNC_DEV(Kokkos::OpenMP)
291 #endif
292 
293 #ifdef KOKKOS_ENABLE_THREADS
294 INST_FUNC_DEV(Kokkos::Threads)
295 #endif
296 
297 #ifdef KOKKOS_ENABLE_CUDA
298 INST_FUNC_DEV(Kokkos::Cuda)
299 #endif
300 
301 #ifdef KOKKOS_ENABLE_HIP
302 INST_FUNC_DEV(Kokkos::HIP)
303 #endif
Sacado::Fad::DFad< double > DFad_type
Definition: mat_vec.cpp:516
const char * p
abs(expr.val())
double flops
std::enable_if< !Kokkos::is_view_fad< View2 >::value, bool >::type check(const View1 &v_gold, const View2 &v, const double tol)
void createGlobalMemoryPool(const ExecSpace &space, const size_t min_total_alloc_size, const uint32_t min_block_alloc_size, const uint32_t max_block_alloc_size, const uint32_t min_superblock_size)
double time
void check_deriv_hierarchical_dfad(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
Sacado::Fad::DFad< double > FadType
Perf do_time_fad_hierarchical_dfad_scratch(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
expr expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c *expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 c
#define A
Definition: Sacado_rad.hpp:572
Perf do_time_fad_hierarchical_dfad(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
const int N
#define INST_FUNC_DEV(DEV)
int value
double throughput
expr expr expr fastAccessDx(i)) FAD_UNARYOP_MACRO(exp
const double tol
void run_mat_vec_hierarchical_dfad_scratch(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
void destroyGlobalMemoryPool(const ExecSpace &space)
int n
void run_mat_vec_hierarchical_dfad(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)