ARTEMIS
CurrentDeposition.H
Go to the documentation of this file.
1 /* Copyright 2019 Axel Huebl, David Grote, Maxence Thevenet
2  * Remi Lehe, Weiqun Zhang, Michael Rowan
3  *
4  * This file is part of WarpX.
5  *
6  * License: BSD-3-Clause-LBNL
7  */
8 #ifndef CURRENTDEPOSITION_H_
9 #define CURRENTDEPOSITION_H_
10 
14 #include "Particles/ShapeFactors.H"
16 #include "Utils/WarpXConst.H"
17 #ifdef WARPX_DIM_RZ
18 # include "Utils/WarpX_Complex.H"
19 #endif
20 
21 #include <AMReX.H>
22 #include <AMReX_Arena.H>
23 #include <AMReX_Array4.H>
24 #include <AMReX_REAL.H>
25 
26 using namespace amrex::literals;
27 
52 template <int depos_order>
53 void doDepositionShapeN (const GetParticlePosition& GetPosition,
54  const amrex::ParticleReal * const wp,
55  const amrex::ParticleReal * const uxp,
56  const amrex::ParticleReal * const uyp,
57  const amrex::ParticleReal * const uzp,
58  const int * const ion_lev,
59  amrex::FArrayBox& jx_fab,
60  amrex::FArrayBox& jy_fab,
61  amrex::FArrayBox& jz_fab,
62  const long np_to_depose,
63  const amrex::Real relative_time,
64  const std::array<amrex::Real,3>& dx,
65  const std::array<amrex::Real,3>& xyzmin,
66  const amrex::Dim3 lo,
67  const amrex::Real q,
68  const int n_rz_azimuthal_modes,
69  amrex::Real* cost,
70  const long load_balance_costs_update_algo)
71 {
72 #if !defined(WARPX_DIM_RZ)
73  amrex::ignore_unused(n_rz_azimuthal_modes);
74 #endif
75 
76 #if !defined(AMREX_USE_GPU)
77  amrex::ignore_unused(cost, load_balance_costs_update_algo);
78 #endif
79 
80  // Whether ion_lev is a null pointer (do_ionization=0) or a real pointer
81  // (do_ionization=1)
82  const bool do_ionization = ion_lev;
83  const amrex::Real dzi = 1.0_rt/dx[2];
84 #if defined(WARPX_DIM_1D_Z)
85  const amrex::Real invvol = dzi;
86 #endif
87 #if defined(WARPX_DIM_XZ) || defined(WARPX_DIM_RZ)
88  const amrex::Real dxi = 1.0_rt/dx[0];
89  const amrex::Real invvol = dxi*dzi;
90 #elif defined(WARPX_DIM_3D)
91  const amrex::Real dxi = 1.0_rt/dx[0];
92  const amrex::Real dyi = 1.0_rt/dx[1];
93  const amrex::Real invvol = dxi*dyi*dzi;
94 #endif
95 
96 #if (AMREX_SPACEDIM >= 2)
97  const amrex::Real xmin = xyzmin[0];
98 #endif
99 #if defined(WARPX_DIM_3D)
100  const amrex::Real ymin = xyzmin[1];
101 #endif
102  const amrex::Real zmin = xyzmin[2];
103 
104  const amrex::Real clightsq = 1.0_rt/PhysConst::c/PhysConst::c;
105 
106  amrex::Array4<amrex::Real> const& jx_arr = jx_fab.array();
107  amrex::Array4<amrex::Real> const& jy_arr = jy_fab.array();
108  amrex::Array4<amrex::Real> const& jz_arr = jz_fab.array();
109  amrex::IntVect const jx_type = jx_fab.box().type();
110  amrex::IntVect const jy_type = jy_fab.box().type();
111  amrex::IntVect const jz_type = jz_fab.box().type();
112 
113  constexpr int zdir = WARPX_ZINDEX;
114  constexpr int NODE = amrex::IndexType::NODE;
115  constexpr int CELL = amrex::IndexType::CELL;
116 
117  // Loop over particles and deposit into jx_fab, jy_fab and jz_fab
118 #if defined(WARPX_USE_GPUCLOCK)
119  amrex::Real* cost_real = nullptr;
120  if( load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock) {
121  cost_real = (amrex::Real *) amrex::The_Managed_Arena()->alloc(sizeof(amrex::Real));
122  *cost_real = 0._rt;
123  }
124 #endif
126  np_to_depose,
127  [=] AMREX_GPU_DEVICE (long ip) {
128 #if defined(WARPX_USE_GPUCLOCK)
129  const auto KernelTimer = ablastr::parallelization::KernelTimer(
130  cost && (load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock),
131  cost_real);
132 #endif
133 
134  // --- Get particle quantities
135  const amrex::Real gaminv = 1.0_rt/std::sqrt(1.0_rt + uxp[ip]*uxp[ip]*clightsq
136  + uyp[ip]*uyp[ip]*clightsq
137  + uzp[ip]*uzp[ip]*clightsq);
138  amrex::Real wq = q*wp[ip];
139  if (do_ionization){
140  wq *= ion_lev[ip];
141  }
142 
143  amrex::ParticleReal xp, yp, zp;
144  GetPosition(ip, xp, yp, zp);
145 
146  const amrex::Real vx = uxp[ip]*gaminv;
147  const amrex::Real vy = uyp[ip]*gaminv;
148  const amrex::Real vz = uzp[ip]*gaminv;
149  // wqx, wqy wqz are particle current in each direction
150 #if defined(WARPX_DIM_RZ)
151  // In RZ, wqx is actually wqr, and wqy is wqtheta
152  // Convert to cylinderical at the mid point
153  const amrex::Real xpmid = xp + relative_time*vx;
154  const amrex::Real ypmid = yp + relative_time*vy;
155  const amrex::Real rpmid = std::sqrt(xpmid*xpmid + ypmid*ypmid);
156  amrex::Real costheta;
157  amrex::Real sintheta;
158  if (rpmid > 0._rt) {
159  costheta = xpmid/rpmid;
160  sintheta = ypmid/rpmid;
161  } else {
162  costheta = 1._rt;
163  sintheta = 0._rt;
164  }
165  const Complex xy0 = Complex{costheta, sintheta};
166  const amrex::Real wqx = wq*invvol*(+vx*costheta + vy*sintheta);
167  const amrex::Real wqy = wq*invvol*(-vx*sintheta + vy*costheta);
168 #else
169  const amrex::Real wqx = wq*invvol*vx;
170  const amrex::Real wqy = wq*invvol*vy;
171 #endif
172  const amrex::Real wqz = wq*invvol*vz;
173 
174  // --- Compute shape factors
175  Compute_shape_factor< depos_order > const compute_shape_factor;
176 #if (AMREX_SPACEDIM >= 2)
177  // x direction
178  // Get particle position after 1/2 push back in position
179 #if defined(WARPX_DIM_RZ)
180  // Keep these double to avoid bug in single precision
181  const double xmid = (rpmid - xmin)*dxi;
182 #else
183  const double xmid = ((xp - xmin) + relative_time*vx)*dxi;
184 #endif
185  // j_j[xyz] leftmost grid point in x that the particle touches for the centering of each current
186  // sx_j[xyz] shape factor along x for the centering of each current
187  // There are only two possible centerings, node or cell centered, so at most only two shape factor
188  // arrays will be needed.
189  // Keep these double to avoid bug in single precision
190  double sx_node[depos_order + 1] = {0.};
191  double sx_cell[depos_order + 1] = {0.};
192  int j_node = 0;
193  int j_cell = 0;
194  if (jx_type[0] == NODE || jy_type[0] == NODE || jz_type[0] == NODE) {
195  j_node = compute_shape_factor(sx_node, xmid);
196  }
197  if (jx_type[0] == CELL || jy_type[0] == CELL || jz_type[0] == CELL) {
198  j_cell = compute_shape_factor(sx_cell, xmid - 0.5);
199  }
200 
201  amrex::Real sx_jx[depos_order + 1] = {0._rt};
202  amrex::Real sx_jy[depos_order + 1] = {0._rt};
203  amrex::Real sx_jz[depos_order + 1] = {0._rt};
204  for (int ix=0; ix<=depos_order; ix++)
205  {
206  sx_jx[ix] = ((jx_type[0] == NODE) ? amrex::Real(sx_node[ix]) : amrex::Real(sx_cell[ix]));
207  sx_jy[ix] = ((jy_type[0] == NODE) ? amrex::Real(sx_node[ix]) : amrex::Real(sx_cell[ix]));
208  sx_jz[ix] = ((jz_type[0] == NODE) ? amrex::Real(sx_node[ix]) : amrex::Real(sx_cell[ix]));
209  }
210 
211  int const j_jx = ((jx_type[0] == NODE) ? j_node : j_cell);
212  int const j_jy = ((jy_type[0] == NODE) ? j_node : j_cell);
213  int const j_jz = ((jz_type[0] == NODE) ? j_node : j_cell);
214 #endif //AMREX_SPACEDIM >= 2
215 
216 #if defined(WARPX_DIM_3D)
217  // y direction
218  // Keep these double to avoid bug in single precision
219  const double ymid = ((yp - ymin) + relative_time*vy)*dyi;
220  double sy_node[depos_order + 1] = {0.};
221  double sy_cell[depos_order + 1] = {0.};
222  int k_node = 0;
223  int k_cell = 0;
224  if (jx_type[1] == NODE || jy_type[1] == NODE || jz_type[1] == NODE) {
225  k_node = compute_shape_factor(sy_node, ymid);
226  }
227  if (jx_type[1] == CELL || jy_type[1] == CELL || jz_type[1] == CELL) {
228  k_cell = compute_shape_factor(sy_cell, ymid - 0.5);
229  }
230  amrex::Real sy_jx[depos_order + 1] = {0._rt};
231  amrex::Real sy_jy[depos_order + 1] = {0._rt};
232  amrex::Real sy_jz[depos_order + 1] = {0._rt};
233  for (int iy=0; iy<=depos_order; iy++)
234  {
235  sy_jx[iy] = ((jx_type[1] == NODE) ? amrex::Real(sy_node[iy]) : amrex::Real(sy_cell[iy]));
236  sy_jy[iy] = ((jy_type[1] == NODE) ? amrex::Real(sy_node[iy]) : amrex::Real(sy_cell[iy]));
237  sy_jz[iy] = ((jz_type[1] == NODE) ? amrex::Real(sy_node[iy]) : amrex::Real(sy_cell[iy]));
238  }
239  int const k_jx = ((jx_type[1] == NODE) ? k_node : k_cell);
240  int const k_jy = ((jy_type[1] == NODE) ? k_node : k_cell);
241  int const k_jz = ((jz_type[1] == NODE) ? k_node : k_cell);
242 #endif
243 
244  // z direction
245  // Keep these double to avoid bug in single precision
246  const double zmid = ((zp - zmin) + relative_time*vz)*dzi;
247  double sz_node[depos_order + 1] = {0.};
248  double sz_cell[depos_order + 1] = {0.};
249  int l_node = 0;
250  int l_cell = 0;
251  if (jx_type[zdir] == NODE || jy_type[zdir] == NODE || jz_type[zdir] == NODE) {
252  l_node = compute_shape_factor(sz_node, zmid);
253  }
254  if (jx_type[zdir] == CELL || jy_type[zdir] == CELL || jz_type[zdir] == CELL) {
255  l_cell = compute_shape_factor(sz_cell, zmid - 0.5);
256  }
257  amrex::Real sz_jx[depos_order + 1] = {0._rt};
258  amrex::Real sz_jy[depos_order + 1] = {0._rt};
259  amrex::Real sz_jz[depos_order + 1] = {0._rt};
260  for (int iz=0; iz<=depos_order; iz++)
261  {
262  sz_jx[iz] = ((jx_type[zdir] == NODE) ? amrex::Real(sz_node[iz]) : amrex::Real(sz_cell[iz]));
263  sz_jy[iz] = ((jy_type[zdir] == NODE) ? amrex::Real(sz_node[iz]) : amrex::Real(sz_cell[iz]));
264  sz_jz[iz] = ((jz_type[zdir] == NODE) ? amrex::Real(sz_node[iz]) : amrex::Real(sz_cell[iz]));
265  }
266  int const l_jx = ((jx_type[zdir] == NODE) ? l_node : l_cell);
267  int const l_jy = ((jy_type[zdir] == NODE) ? l_node : l_cell);
268  int const l_jz = ((jz_type[zdir] == NODE) ? l_node : l_cell);
269 
270  // Deposit current into jx_arr, jy_arr and jz_arr
271 #if defined(WARPX_DIM_1D_Z)
272  for (int iz=0; iz<=depos_order; iz++){
274  &jx_arr(lo.x+l_jx+iz, 0, 0, 0),
275  sz_jx[iz]*wqx);
277  &jy_arr(lo.x+l_jy+iz, 0, 0, 0),
278  sz_jy[iz]*wqy);
280  &jz_arr(lo.x+l_jz+iz, 0, 0, 0),
281  sz_jz[iz]*wqz);
282  }
283 #endif
284 #if defined(WARPX_DIM_XZ) || defined(WARPX_DIM_RZ)
285  for (int iz=0; iz<=depos_order; iz++){
286  for (int ix=0; ix<=depos_order; ix++){
288  &jx_arr(lo.x+j_jx+ix, lo.y+l_jx+iz, 0, 0),
289  sx_jx[ix]*sz_jx[iz]*wqx);
291  &jy_arr(lo.x+j_jy+ix, lo.y+l_jy+iz, 0, 0),
292  sx_jy[ix]*sz_jy[iz]*wqy);
294  &jz_arr(lo.x+j_jz+ix, lo.y+l_jz+iz, 0, 0),
295  sx_jz[ix]*sz_jz[iz]*wqz);
296 #if defined(WARPX_DIM_RZ)
297  Complex xy = xy0; // Note that xy is equal to e^{i m theta}
298  for (int imode=1 ; imode < n_rz_azimuthal_modes ; imode++) {
299  // The factor 2 on the weighting comes from the normalization of the modes
300  amrex::Gpu::Atomic::AddNoRet( &jx_arr(lo.x+j_jx+ix, lo.y+l_jx+iz, 0, 2*imode-1), 2._rt*sx_jx[ix]*sz_jx[iz]*wqx*xy.real());
301  amrex::Gpu::Atomic::AddNoRet( &jx_arr(lo.x+j_jx+ix, lo.y+l_jx+iz, 0, 2*imode ), 2._rt*sx_jx[ix]*sz_jx[iz]*wqx*xy.imag());
302  amrex::Gpu::Atomic::AddNoRet( &jy_arr(lo.x+j_jy+ix, lo.y+l_jy+iz, 0, 2*imode-1), 2._rt*sx_jy[ix]*sz_jy[iz]*wqy*xy.real());
303  amrex::Gpu::Atomic::AddNoRet( &jy_arr(lo.x+j_jy+ix, lo.y+l_jy+iz, 0, 2*imode ), 2._rt*sx_jy[ix]*sz_jy[iz]*wqy*xy.imag());
304  amrex::Gpu::Atomic::AddNoRet( &jz_arr(lo.x+j_jz+ix, lo.y+l_jz+iz, 0, 2*imode-1), 2._rt*sx_jz[ix]*sz_jz[iz]*wqz*xy.real());
305  amrex::Gpu::Atomic::AddNoRet( &jz_arr(lo.x+j_jz+ix, lo.y+l_jz+iz, 0, 2*imode ), 2._rt*sx_jz[ix]*sz_jz[iz]*wqz*xy.imag());
306  xy = xy*xy0;
307  }
308 #endif
309  }
310  }
311 #elif defined(WARPX_DIM_3D)
312  for (int iz=0; iz<=depos_order; iz++){
313  for (int iy=0; iy<=depos_order; iy++){
314  for (int ix=0; ix<=depos_order; ix++){
316  &jx_arr(lo.x+j_jx+ix, lo.y+k_jx+iy, lo.z+l_jx+iz),
317  sx_jx[ix]*sy_jx[iy]*sz_jx[iz]*wqx);
319  &jy_arr(lo.x+j_jy+ix, lo.y+k_jy+iy, lo.z+l_jy+iz),
320  sx_jy[ix]*sy_jy[iy]*sz_jy[iz]*wqy);
322  &jz_arr(lo.x+j_jz+ix, lo.y+k_jz+iy, lo.z+l_jz+iz),
323  sx_jz[ix]*sy_jz[iy]*sz_jz[iz]*wqz);
324  }
325  }
326  }
327 #endif
328  }
329  );
330 #if defined(WARPX_USE_GPUCLOCK)
331  if(cost && load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock) {
333  *cost += *cost_real;
334  amrex::The_Managed_Arena()->free(cost_real);
335  }
336 #endif
337 }
338 
364 template <int depos_order>
366  const amrex::ParticleReal * const wp,
367  const amrex::ParticleReal * const uxp,
368  const amrex::ParticleReal * const uyp,
369  const amrex::ParticleReal * const uzp,
370  const int * const ion_lev,
371  amrex::FArrayBox& jx_fab,
372  amrex::FArrayBox& jy_fab,
373  amrex::FArrayBox& jz_fab,
374  const long np_to_depose,
375  const amrex::Real relative_time,
376  const std::array<amrex::Real,3>& dx,
377  const std::array<amrex::Real,3>& xyzmin,
378  const amrex::Dim3 lo,
379  const amrex::Real q,
380  const int n_rz_azimuthal_modes,
381  amrex::Real* cost,
382  const long load_balance_costs_update_algo,
384  const amrex::Box& box,
385  const amrex::Geometry& geom,
386  const amrex::IntVect& a_tbox_max_size)
387 {
388 #if defined(AMREX_USE_HIP) || defined(AMREX_USE_CUDA)
389  using namespace amrex;
390 
391  auto permutation = a_bins.permutationPtr();
392 
393  amrex::Array4<amrex::Real> const& jx_arr = jx_fab.array();
394  amrex::Array4<amrex::Real> const& jy_arr = jy_fab.array();
395  amrex::Array4<amrex::Real> const& jz_arr = jz_fab.array();
396  amrex::IntVect const jx_type = jx_fab.box().type();
397  amrex::IntVect const jy_type = jy_fab.box().type();
398  amrex::IntVect const jz_type = jz_fab.box().type();
399 
400  constexpr int zdir = WARPX_ZINDEX;
401  constexpr int NODE = amrex::IndexType::NODE;
402  constexpr int CELL = amrex::IndexType::CELL;
403 
404  // Loop over particles and deposit into jx_fab, jy_fab and jz_fab
405 #if defined(WARPX_USE_GPUCLOCK)
406  amrex::Real* cost_real = nullptr;
407  if (cost && load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock) {
408  cost_real = (amrex::Real *) amrex::The_Managed_Arena()->alloc(sizeof(amrex::Real));
409  *cost_real = 0._rt;
410  }
411 #endif
412  const auto dxiarr = geom.InvCellSizeArray();
413  const auto plo = geom.ProbLoArray();
414  const auto domain = geom.Domain();
415 
416  amrex::Box sample_tbox(IntVect(AMREX_D_DECL(0,0,0)), a_tbox_max_size - 1);
417  sample_tbox.grow(depos_order);
418 
419  amrex::Box sample_tbox_x = convert(sample_tbox, jx_type);
420  amrex::Box sample_tbox_y = convert(sample_tbox, jy_type);
421  amrex::Box sample_tbox_z = convert(sample_tbox, jz_type);
422 
423  const auto npts = amrex::max(sample_tbox_x.numPts(), sample_tbox_y.numPts(), sample_tbox_z.numPts());
424 
425  const int nblocks = a_bins.numBins();
426  const int threads_per_block = WarpX::shared_mem_current_tpb;
427  const auto offsets_ptr = a_bins.offsetsPtr();
428 
429  const std::size_t shared_mem_bytes = npts*sizeof(amrex::Real);
430  const amrex::IntVect bin_size = WarpX::shared_tilesize;
431  const std::size_t max_shared_mem_bytes = amrex::Gpu::Device::sharedMemPerBlock();
432  WARPX_ALWAYS_ASSERT_WITH_MESSAGE(shared_mem_bytes <= max_shared_mem_bytes,
433  "Tile size too big for GPU shared memory current deposition");
434 
435  amrex::ignore_unused(np_to_depose);
436  // Launch one thread-block per bin
438  nblocks, threads_per_block, shared_mem_bytes, amrex::Gpu::gpuStream(),
439  [=] AMREX_GPU_DEVICE () noexcept {
440 #if defined(WARPX_USE_GPUCLOCK)
441  const auto KernelTimer = ablastr::parallelization::KernelTimer(
442  cost && (load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock),
443  cost_real);
444 #endif
445  const int bin_id = blockIdx.x;
446  const unsigned int bin_start = offsets_ptr[bin_id];
447  const unsigned int bin_stop = offsets_ptr[bin_id+1];
448 
449  if (bin_start == bin_stop) { return; /*this bin has no particles*/ }
450 
451  // These boxes define the index space for the shared memory buffers
452  amrex::Box buffer_box;
453  {
454  ParticleReal xp, yp, zp;
455  GetPosition(permutation[bin_start], xp, yp, zp);
456 #if defined(WARPX_DIM_3D)
457  IntVect iv = IntVect(int( amrex::Math::floor((xp-plo[0]) * dxiarr[0]) ),
458  int( amrex::Math::floor((yp-plo[1]) * dxiarr[1]) ),
459  int( amrex::Math::floor((zp-plo[2]) * dxiarr[2]) ));
460 #elif defined(WARPX_DIM_XZ) || defined(WARPX_DIM_RZ)
461  IntVect iv = IntVect(int( amrex::Math::floor((xp-plo[0]) * dxiarr[0]) ),
462  int( amrex::Math::floor((zp-plo[1]) * dxiarr[1]) ));
463 #elif defined(WARPX_DIM_1D_Z)
464  IntVect iv = IntVect(int( amrex::Math::floor((zp-plo[0]) * dxiarr[0]) ));
465 #endif
466  iv += domain.smallEnd();
467  getTileIndex(iv, box, true, bin_size, buffer_box);
468  }
469 
470  buffer_box.grow(depos_order);
471  Box tbox_x = convert(buffer_box, jx_type);
472  Box tbox_y = convert(buffer_box, jy_type);
473  Box tbox_z = convert(buffer_box, jz_type);
474 
476  amrex::Real* const shared = gsm.dataPtr();
477 
478  amrex::Array4<amrex::Real> const jx_buff(shared,
479  amrex::begin(tbox_x), amrex::end(tbox_x), 1);
480  amrex::Array4<amrex::Real> const jy_buff(shared,
481  amrex::begin(tbox_y), amrex::end(tbox_y), 1);
482  amrex::Array4<amrex::Real> const jz_buff(shared,
483  amrex::begin(tbox_z), amrex::end(tbox_z), 1);
484 
485  // Zero-initialize the temporary array in shared memory
486  volatile amrex::Real* vs = shared;
487  for (int i = threadIdx.x; i < npts; i += blockDim.x){
488  vs[i] = 0.0;
489  }
490  __syncthreads();
491  for (unsigned int ip_orig = bin_start+threadIdx.x; ip_orig<bin_stop; ip_orig += blockDim.x)
492  {
493  unsigned int ip = permutation[ip_orig];
494  depositComponent<depos_order>(GetPosition, wp, uxp, uyp, uzp, ion_lev, jx_buff, jx_type,
495  relative_time, dx, xyzmin, lo, q, n_rz_azimuthal_modes,
496  ip, zdir, NODE, CELL, 0);
497  }
498 
499  __syncthreads();
500  addLocalToGlobal(tbox_x, jx_arr, jx_buff);
501  for (int i = threadIdx.x; i < npts; i += blockDim.x){
502  vs[i] = 0.0;
503  }
504 
505  __syncthreads();
506  for (unsigned int ip_orig = bin_start+threadIdx.x; ip_orig<bin_stop; ip_orig += blockDim.x)
507  {
508  unsigned int ip = permutation[ip_orig];
509  depositComponent<depos_order>(GetPosition, wp, uxp, uyp, uzp, ion_lev, jy_buff, jy_type,
510  relative_time, dx, xyzmin, lo, q, n_rz_azimuthal_modes,
511  ip, zdir, NODE, CELL, 1);
512  }
513 
514  __syncthreads();
515  addLocalToGlobal(tbox_y, jy_arr, jy_buff);
516  for (int i = threadIdx.x; i < npts; i += blockDim.x){
517  vs[i] = 0.0;
518  }
519 
520  __syncthreads();
521  for (unsigned int ip_orig = bin_start+threadIdx.x; ip_orig<bin_stop; ip_orig += blockDim.x)
522  {
523  unsigned int ip = permutation[ip_orig];
524  depositComponent<depos_order>(GetPosition, wp, uxp, uyp, uzp, ion_lev, jz_buff, jz_type,
525  relative_time, dx, xyzmin, lo, q, n_rz_azimuthal_modes,
526  ip, zdir, NODE, CELL, 2);
527  }
528 
529  __syncthreads();
530  addLocalToGlobal(tbox_z, jz_arr, jz_buff);
531  });
532 #if defined(WARPX_USE_GPUCLOCK)
533  if (cost && load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock) {
535  *cost += *cost_real;
536  amrex::The_Managed_Arena()->free(cost_real);
537  }
538 #endif
539 #else // not using hip/cuda
540  // Note, you should never reach this part of the code. This funcion cannot be called unless
541  // using HIP/CUDA, and those things are checked prior
542  //don't use any args
543  ignore_unused( GetPosition, wp, uxp, uyp, uzp, ion_lev, jx_fab, jy_fab, jz_fab, np_to_depose, relative_time, dx, xyzmin, lo, q, n_rz_azimuthal_modes, cost, load_balance_costs_update_algo, a_bins, box, geom, a_tbox_max_size);
544  amrex::Abort("Shared memory only implemented for HIP/CUDA");
545 #endif
546 }
547 
574 template <int depos_order>
576  const amrex::ParticleReal * const wp,
577  const amrex::ParticleReal * const uxp,
578  const amrex::ParticleReal * const uyp,
579  const amrex::ParticleReal * const uzp,
580  const int * const ion_lev,
581  const amrex::Array4<amrex::Real>& Jx_arr,
582  const amrex::Array4<amrex::Real>& Jy_arr,
583  const amrex::Array4<amrex::Real>& Jz_arr,
584  const long np_to_depose,
585  const amrex::Real dt,
586  const amrex::Real relative_time,
587  const std::array<amrex::Real,3>& dx,
588  const std::array<amrex::Real, 3> xyzmin,
589  const amrex::Dim3 lo,
590  const amrex::Real q,
591  const int n_rz_azimuthal_modes,
592  amrex::Real * const cost,
593  const long load_balance_costs_update_algo)
594 {
595  using namespace amrex;
596 #if !defined(WARPX_DIM_RZ)
597  ignore_unused(n_rz_azimuthal_modes);
598 #endif
599 
600 #if !defined(AMREX_USE_GPU)
601  amrex::ignore_unused(cost, load_balance_costs_update_algo);
602 #endif
603 
604  // Whether ion_lev is a null pointer (do_ionization=0) or a real pointer
605  // (do_ionization=1)
606  bool const do_ionization = ion_lev;
607 #if !defined(WARPX_DIM_1D_Z)
608  Real const dxi = 1.0_rt / dx[0];
609 #endif
610 #if !defined(WARPX_DIM_1D_Z)
611  Real const xmin = xyzmin[0];
612 #endif
613 #if defined(WARPX_DIM_3D)
614  Real const dyi = 1.0_rt / dx[1];
615  Real const ymin = xyzmin[1];
616 #endif
617  Real const dzi = 1.0_rt / dx[2];
618  Real const zmin = xyzmin[2];
619 
620 #if defined(WARPX_DIM_3D)
621  Real const invdtdx = 1.0_rt / (dt*dx[1]*dx[2]);
622  Real const invdtdy = 1.0_rt / (dt*dx[0]*dx[2]);
623  Real const invdtdz = 1.0_rt / (dt*dx[0]*dx[1]);
624 #elif defined(WARPX_DIM_XZ) || defined(WARPX_DIM_RZ)
625  Real const invdtdx = 1.0_rt / (dt*dx[2]);
626  Real const invdtdz = 1.0_rt / (dt*dx[0]);
627  Real const invvol = 1.0_rt / (dx[0]*dx[2]);
628 #elif defined(WARPX_DIM_1D_Z)
629  Real const invdtdz = 1.0_rt / (dt*dx[0]);
630  Real const invvol = 1.0_rt / (dx[2]);
631 #endif
632 
633 #if defined(WARPX_DIM_RZ)
634  Complex const I = Complex{0._rt, 1._rt};
635 #endif
636 
637  Real const clightsq = 1.0_rt / ( PhysConst::c * PhysConst::c );
638 #if !defined(WARPX_DIM_1D_Z)
639  Real constexpr one_third = 1.0_rt / 3.0_rt;
640  Real constexpr one_sixth = 1.0_rt / 6.0_rt;
641 #endif
642 
643  // Loop over particles and deposit into Jx_arr, Jy_arr and Jz_arr
644 #if defined(WARPX_USE_GPUCLOCK)
645  amrex::Real* cost_real = nullptr;
646  if( load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock) {
647  cost_real = (amrex::Real *) amrex::The_Managed_Arena()->alloc(sizeof(amrex::Real));
648  *cost_real = 0._rt;
649  }
650 #endif
652  np_to_depose,
653  [=] AMREX_GPU_DEVICE (long const ip) {
654 #if defined(WARPX_USE_GPUCLOCK)
655  const auto KernelTimer = ablastr::parallelization::KernelTimer(
656  cost && (load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock),
657  cost_real);
658 #endif
659 
660  // --- Get particle quantities
661  Real const gaminv = 1.0_rt/std::sqrt(1.0_rt + uxp[ip]*uxp[ip]*clightsq
662  + uyp[ip]*uyp[ip]*clightsq
663  + uzp[ip]*uzp[ip]*clightsq);
664 
665  // wqx, wqy wqz are particle current in each direction
666  Real wq = q*wp[ip];
667  if (do_ionization){
668  wq *= ion_lev[ip];
669  }
670 
671  ParticleReal xp, yp, zp;
672  GetPosition(ip, xp, yp, zp);
673 
674 #if !defined(WARPX_DIM_1D_Z)
675  Real const wqx = wq*invdtdx;
676 #endif
677 #if defined(WARPX_DIM_3D)
678  Real const wqy = wq*invdtdy;
679 #endif
680  Real const wqz = wq*invdtdz;
681 
682  // computes current and old position in grid units
683 #if defined(WARPX_DIM_RZ)
684  Real const xp_new = xp + (relative_time + 0.5_rt*dt)*uxp[ip]*gaminv;
685  Real const yp_new = yp + (relative_time + 0.5_rt*dt)*uyp[ip]*gaminv;
686  Real const xp_mid = xp_new - 0.5_rt*dt*uxp[ip]*gaminv;
687  Real const yp_mid = yp_new - 0.5_rt*dt*uyp[ip]*gaminv;
688  Real const xp_old = xp_new - dt*uxp[ip]*gaminv;
689  Real const yp_old = yp_new - dt*uyp[ip]*gaminv;
690  Real const rp_new = std::sqrt(xp_new*xp_new + yp_new*yp_new);
691  Real const rp_mid = std::sqrt(xp_mid*xp_mid + yp_mid*yp_mid);
692  Real const rp_old = std::sqrt(xp_old*xp_old + yp_old*yp_old);
693  Real costheta_new, sintheta_new;
694  if (rp_new > 0._rt) {
695  costheta_new = xp_new/rp_new;
696  sintheta_new = yp_new/rp_new;
697  } else {
698  costheta_new = 1._rt;
699  sintheta_new = 0._rt;
700  }
701  amrex::Real costheta_mid, sintheta_mid;
702  if (rp_mid > 0._rt) {
703  costheta_mid = xp_mid/rp_mid;
704  sintheta_mid = yp_mid/rp_mid;
705  } else {
706  costheta_mid = 1._rt;
707  sintheta_mid = 0._rt;
708  }
709  amrex::Real costheta_old, sintheta_old;
710  if (rp_old > 0._rt) {
711  costheta_old = xp_old/rp_old;
712  sintheta_old = yp_old/rp_old;
713  } else {
714  costheta_old = 1._rt;
715  sintheta_old = 0._rt;
716  }
717  const Complex xy_new0 = Complex{costheta_new, sintheta_new};
718  const Complex xy_mid0 = Complex{costheta_mid, sintheta_mid};
719  const Complex xy_old0 = Complex{costheta_old, sintheta_old};
720  // Keep these double to avoid bug in single precision
721  double const x_new = (rp_new - xmin)*dxi;
722  double const x_old = (rp_old - xmin)*dxi;
723 #else
724 #if !defined(WARPX_DIM_1D_Z)
725  // Keep these double to avoid bug in single precision
726  double const x_new = (xp - xmin + (relative_time + 0.5_rt*dt)*uxp[ip]*gaminv)*dxi;
727  double const x_old = x_new - dt*dxi*uxp[ip]*gaminv;
728 #endif
729 #endif
730 #if defined(WARPX_DIM_3D)
731  // Keep these double to avoid bug in single precision
732  double const y_new = (yp - ymin + (relative_time + 0.5_rt*dt)*uyp[ip]*gaminv)*dyi;
733  double const y_old = y_new - dt*dyi*uyp[ip]*gaminv;
734 #endif
735  // Keep these double to avoid bug in single precision
736  double const z_new = (zp - zmin + (relative_time + 0.5_rt*dt)*uzp[ip]*gaminv)*dzi;
737  double const z_old = z_new - dt*dzi*uzp[ip]*gaminv;
738 
739 #if defined(WARPX_DIM_RZ)
740  Real const vy = (-uxp[ip]*sintheta_mid + uyp[ip]*costheta_mid)*gaminv;
741 #elif defined(WARPX_DIM_XZ)
742  Real const vy = uyp[ip]*gaminv;
743 #elif defined(WARPX_DIM_1D_Z)
744  Real const vx = uxp[ip]*gaminv;
745  Real const vy = uyp[ip]*gaminv;
746 #endif
747 
748  // Shape factor arrays
749  // Note that there are extra values above and below
750  // to possibly hold the factor for the old particle
751  // which can be at a different grid location.
752  // Keep these double to avoid bug in single precision
753 #if !defined(WARPX_DIM_1D_Z)
754  double sx_new[depos_order + 3] = {0.};
755  double sx_old[depos_order + 3] = {0.};
756 #endif
757 #if defined(WARPX_DIM_3D)
758  // Keep these double to avoid bug in single precision
759  double sy_new[depos_order + 3] = {0.};
760  double sy_old[depos_order + 3] = {0.};
761 #endif
762  // Keep these double to avoid bug in single precision
763  double sz_new[depos_order + 3] = {0.};
764  double sz_old[depos_order + 3] = {0.};
765 
766  // --- Compute shape factors
767  // Compute shape factors for position as they are now and at old positions
768  // [ijk]_new: leftmost grid point that the particle touches
769  Compute_shape_factor< depos_order > compute_shape_factor;
770  Compute_shifted_shape_factor< depos_order > compute_shifted_shape_factor;
771 
772 #if !defined(WARPX_DIM_1D_Z)
773  const int i_new = compute_shape_factor(sx_new+1, x_new);
774  const int i_old = compute_shifted_shape_factor(sx_old, x_old, i_new);
775 #endif
776 #if defined(WARPX_DIM_3D)
777  const int j_new = compute_shape_factor(sy_new+1, y_new);
778  const int j_old = compute_shifted_shape_factor(sy_old, y_old, j_new);
779 #endif
780  const int k_new = compute_shape_factor(sz_new+1, z_new);
781  const int k_old = compute_shifted_shape_factor(sz_old, z_old, k_new);
782 
783  // computes min/max positions of current contributions
784 #if !defined(WARPX_DIM_1D_Z)
785  int dil = 1, diu = 1;
786  if (i_old < i_new) dil = 0;
787  if (i_old > i_new) diu = 0;
788 #endif
789 #if defined(WARPX_DIM_3D)
790  int djl = 1, dju = 1;
791  if (j_old < j_new) djl = 0;
792  if (j_old > j_new) dju = 0;
793 #endif
794  int dkl = 1, dku = 1;
795  if (k_old < k_new) dkl = 0;
796  if (k_old > k_new) dku = 0;
797 
798 #if defined(WARPX_DIM_3D)
799 
800  for (int k=dkl; k<=depos_order+2-dku; k++) {
801  for (int j=djl; j<=depos_order+2-dju; j++) {
802  amrex::Real sdxi = 0._rt;
803  for (int i=dil; i<=depos_order+1-diu; i++) {
804  sdxi += wqx*(sx_old[i] - sx_new[i])*(
805  one_third*(sy_new[j]*sz_new[k] + sy_old[j]*sz_old[k])
806  +one_sixth*(sy_new[j]*sz_old[k] + sy_old[j]*sz_new[k]));
807  amrex::Gpu::Atomic::AddNoRet( &Jx_arr(lo.x+i_new-1+i, lo.y+j_new-1+j, lo.z+k_new-1+k), sdxi);
808  }
809  }
810  }
811  for (int k=dkl; k<=depos_order+2-dku; k++) {
812  for (int i=dil; i<=depos_order+2-diu; i++) {
813  amrex::Real sdyj = 0._rt;
814  for (int j=djl; j<=depos_order+1-dju; j++) {
815  sdyj += wqy*(sy_old[j] - sy_new[j])*(
816  one_third*(sx_new[i]*sz_new[k] + sx_old[i]*sz_old[k])
817  +one_sixth*(sx_new[i]*sz_old[k] + sx_old[i]*sz_new[k]));
818  amrex::Gpu::Atomic::AddNoRet( &Jy_arr(lo.x+i_new-1+i, lo.y+j_new-1+j, lo.z+k_new-1+k), sdyj);
819  }
820  }
821  }
822  for (int j=djl; j<=depos_order+2-dju; j++) {
823  for (int i=dil; i<=depos_order+2-diu; i++) {
824  amrex::Real sdzk = 0._rt;
825  for (int k=dkl; k<=depos_order+1-dku; k++) {
826  sdzk += wqz*(sz_old[k] - sz_new[k])*(
827  one_third*(sx_new[i]*sy_new[j] + sx_old[i]*sy_old[j])
828  +one_sixth*(sx_new[i]*sy_old[j] + sx_old[i]*sy_new[j]));
829  amrex::Gpu::Atomic::AddNoRet( &Jz_arr(lo.x+i_new-1+i, lo.y+j_new-1+j, lo.z+k_new-1+k), sdzk);
830  }
831  }
832  }
833 
834 #elif defined(WARPX_DIM_XZ) || defined(WARPX_DIM_RZ)
835 
836  for (int k=dkl; k<=depos_order+2-dku; k++) {
837  amrex::Real sdxi = 0._rt;
838  for (int i=dil; i<=depos_order+1-diu; i++) {
839  sdxi += wqx*(sx_old[i] - sx_new[i])*0.5_rt*(sz_new[k] + sz_old[k]);
840  amrex::Gpu::Atomic::AddNoRet( &Jx_arr(lo.x+i_new-1+i, lo.y+k_new-1+k, 0, 0), sdxi);
841 #if defined(WARPX_DIM_RZ)
842  Complex xy_mid = xy_mid0; // Throughout the following loop, xy_mid takes the value e^{i m theta}
843  for (int imode=1 ; imode < n_rz_azimuthal_modes ; imode++) {
844  // The factor 2 comes from the normalization of the modes
845  const Complex djr_cmplx = 2._rt *sdxi*xy_mid;
846  amrex::Gpu::Atomic::AddNoRet( &Jx_arr(lo.x+i_new-1+i, lo.y+k_new-1+k, 0, 2*imode-1), djr_cmplx.real());
847  amrex::Gpu::Atomic::AddNoRet( &Jx_arr(lo.x+i_new-1+i, lo.y+k_new-1+k, 0, 2*imode), djr_cmplx.imag());
848  xy_mid = xy_mid*xy_mid0;
849  }
850 #endif
851  }
852  }
853  for (int k=dkl; k<=depos_order+2-dku; k++) {
854  for (int i=dil; i<=depos_order+2-diu; i++) {
855  Real const sdyj = wq*vy*invvol*(
856  one_third*(sx_new[i]*sz_new[k] + sx_old[i]*sz_old[k])
857  +one_sixth*(sx_new[i]*sz_old[k] + sx_old[i]*sz_new[k]));
858  amrex::Gpu::Atomic::AddNoRet( &Jy_arr(lo.x+i_new-1+i, lo.y+k_new-1+k, 0, 0), sdyj);
859 #if defined(WARPX_DIM_RZ)
860  Complex xy_new = xy_new0;
861  Complex xy_mid = xy_mid0;
862  Complex xy_old = xy_old0;
863  // Throughout the following loop, xy_ takes the value e^{i m theta_}
864  for (int imode=1 ; imode < n_rz_azimuthal_modes ; imode++) {
865  // The factor 2 comes from the normalization of the modes
866  // The minus sign comes from the different convention with respect to Davidson et al.
867  const Complex djt_cmplx = -2._rt * I*(i_new-1 + i + xmin*dxi)*wq*invdtdx/(amrex::Real)imode
868  *(Complex(sx_new[i]*sz_new[k], 0._rt)*(xy_new - xy_mid)
869  + Complex(sx_old[i]*sz_old[k], 0._rt)*(xy_mid - xy_old));
870  amrex::Gpu::Atomic::AddNoRet( &Jy_arr(lo.x+i_new-1+i, lo.y+k_new-1+k, 0, 2*imode-1), djt_cmplx.real());
871  amrex::Gpu::Atomic::AddNoRet( &Jy_arr(lo.x+i_new-1+i, lo.y+k_new-1+k, 0, 2*imode), djt_cmplx.imag());
872  xy_new = xy_new*xy_new0;
873  xy_mid = xy_mid*xy_mid0;
874  xy_old = xy_old*xy_old0;
875  }
876 #endif
877  }
878  }
879  for (int i=dil; i<=depos_order+2-diu; i++) {
880  Real sdzk = 0._rt;
881  for (int k=dkl; k<=depos_order+1-dku; k++) {
882  sdzk += wqz*(sz_old[k] - sz_new[k])*0.5_rt*(sx_new[i] + sx_old[i]);
883  amrex::Gpu::Atomic::AddNoRet( &Jz_arr(lo.x+i_new-1+i, lo.y+k_new-1+k, 0, 0), sdzk);
884 #if defined(WARPX_DIM_RZ)
885  Complex xy_mid = xy_mid0; // Throughout the following loop, xy_mid takes the value e^{i m theta}
886  for (int imode=1 ; imode < n_rz_azimuthal_modes ; imode++) {
887  // The factor 2 comes from the normalization of the modes
888  const Complex djz_cmplx = 2._rt * sdzk * xy_mid;
889  amrex::Gpu::Atomic::AddNoRet( &Jz_arr(lo.x+i_new-1+i, lo.y+k_new-1+k, 0, 2*imode-1), djz_cmplx.real());
890  amrex::Gpu::Atomic::AddNoRet( &Jz_arr(lo.x+i_new-1+i, lo.y+k_new-1+k, 0, 2*imode), djz_cmplx.imag());
891  xy_mid = xy_mid*xy_mid0;
892  }
893 #endif
894  }
895  }
896 #elif defined(WARPX_DIM_1D_Z)
897 
898  for (int k=dkl; k<=depos_order+2-dku; k++) {
899  amrex::Real const sdxi = wq*vx*invvol*0.5_rt*(sz_old[k] + sz_new[k]);
900  amrex::Gpu::Atomic::AddNoRet( &Jx_arr(lo.x+k_new-1+k, 0, 0, 0), sdxi);
901  }
902  for (int k=dkl; k<=depos_order+2-dku; k++) {
903  amrex::Real const sdyj = wq*vy*invvol*0.5_rt*(sz_old[k] + sz_new[k]);
904  amrex::Gpu::Atomic::AddNoRet( &Jy_arr(lo.x+k_new-1+k, 0, 0, 0), sdyj);
905  }
906  amrex::Real sdzk = 0._rt;
907  for (int k=dkl; k<=depos_order+1-dku; k++) {
908  sdzk += wqz*(sz_old[k] - sz_new[k]);
909  amrex::Gpu::Atomic::AddNoRet( &Jz_arr(lo.x+k_new-1+k, 0, 0, 0), sdzk);
910  }
911 #endif
912  }
913  );
914 #if defined(WARPX_USE_GPUCLOCK)
915  if(cost && load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock) {
917  *cost += *cost_real;
918  amrex::The_Managed_Arena()->free(cost_real);
919  }
920 #endif
921 }
922 
952 template <int depos_order>
954  const amrex::ParticleReal* const wp,
955  const amrex::ParticleReal* const uxp,
956  const amrex::ParticleReal* const uyp,
957  const amrex::ParticleReal* const uzp,
958  const int* const ion_lev,
959  amrex::FArrayBox& Dx_fab,
960  amrex::FArrayBox& Dy_fab,
961  amrex::FArrayBox& Dz_fab,
962  const long np_to_depose,
963  const amrex::Real dt,
964  const amrex::Real relative_time,
965  const std::array<amrex::Real,3>& dx,
966  const std::array<amrex::Real,3>& xyzmin,
967  const amrex::Dim3 lo,
968  const amrex::Real q,
969  const int n_rz_azimuthal_modes,
970  amrex::Real* cost,
971  const long load_balance_costs_update_algo)
972 {
973 #if defined(WARPX_DIM_RZ)
974  amrex::ignore_unused(GetPosition,
975  wp, uxp, uyp, uzp, ion_lev, Dx_fab, Dy_fab, Dz_fab,
976  np_to_depose, dt, relative_time, dx, xyzmin, lo, q, n_rz_azimuthal_modes);
977  amrex::Abort("Vay deposition not implemented in RZ geometry");
978 #endif
979 
980 #if defined(WARPX_DIM_1D_Z)
981  amrex::ignore_unused(GetPosition,
982  wp, uxp, uyp, uzp, ion_lev, Dx_fab, Dy_fab, Dz_fab,
983  np_to_depose, dt, relative_time, dx, xyzmin, lo, q, n_rz_azimuthal_modes);
984  amrex::Abort("Vay deposition not implemented in cartesian 1D geometry");
985 #endif
986 
987 #if !defined(AMREX_USE_GPU)
988  amrex::ignore_unused(cost, load_balance_costs_update_algo);
989 #endif
990 
991 #if !(defined WARPX_DIM_RZ || defined WARPX_DIM_1D_Z)
992  amrex::ignore_unused(n_rz_azimuthal_modes);
993 
994  // If ion_lev is a null pointer, then do_ionization=0, else do_ionization=1
995  const bool do_ionization = ion_lev;
996 
997  // Inverse cell volume in each direction
998  const amrex::Real dxi = 1._rt / dx[0];
999  const amrex::Real dzi = 1._rt / dx[2];
1000 #if defined(WARPX_DIM_3D)
1001  const amrex::Real dyi = 1._rt / dx[1];
1002 #endif
1003 
1004  // Inverse of time step
1005  const amrex::Real invdt = 1._rt / dt;
1006 
1007  // Total inverse cell volume
1008 #if defined(WARPX_DIM_XZ)
1009  const amrex::Real invvol = dxi * dzi;
1010 #elif defined(WARPX_DIM_3D)
1011  const amrex::Real invvol = dxi * dyi * dzi;
1012 #endif
1013 
1014  // Lower bound of physical domain in each direction
1015  const amrex::Real xmin = xyzmin[0];
1016  const amrex::Real zmin = xyzmin[2];
1017 #if defined(WARPX_DIM_3D)
1018  const amrex::Real ymin = xyzmin[1];
1019 #endif
1020 
1021  // Allocate temporary arrays
1022 #if defined(WARPX_DIM_3D)
1023  AMREX_ALWAYS_ASSERT(Dx_fab.box() == Dy_fab.box() && Dx_fab.box() == Dz_fab.box());
1024  amrex::FArrayBox temp_fab{Dx_fab.box(), 4};
1025 #elif defined(WARPX_DIM_XZ)
1026  AMREX_ALWAYS_ASSERT(Dx_fab.box() == Dz_fab.box());
1027  amrex::FArrayBox temp_fab{Dx_fab.box(), 2};
1028 #endif
1029  temp_fab.setVal<amrex::RunOn::Device>(0._rt);
1030  amrex::Array4<amrex::Real> const& temp_arr = temp_fab.array();
1031 
1032  // Inverse of light speed squared
1033  const amrex::Real invcsq = 1._rt / (PhysConst::c * PhysConst::c);
1034 
1035  // Arrays where D will be stored
1036  amrex::Array4<amrex::Real> const& Dx_arr = Dx_fab.array();
1037  amrex::Array4<amrex::Real> const& Dy_arr = Dy_fab.array();
1038  amrex::Array4<amrex::Real> const& Dz_arr = Dz_fab.array();
1039 
1040  // Loop over particles and deposit (Dx,Dy,Dz) into Dx_fab, Dy_fab and Dz_fab
1041 #if defined(WARPX_USE_GPUCLOCK)
1042  amrex::Real* cost_real = nullptr;
1043  if( load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock) {
1044  cost_real = (amrex::Real *) amrex::The_Managed_Arena()->alloc(sizeof(amrex::Real));
1045  *cost_real = 0._rt;
1046  }
1047 #endif
1048  amrex::ParallelFor(np_to_depose, [=] AMREX_GPU_DEVICE (long ip)
1049  {
1050 #if defined(WARPX_USE_GPUCLOCK)
1051  const auto KernelTimer = ablastr::parallelization::KernelTimer(
1052  cost && (load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock),
1053  cost_real);
1054 #endif
1055 
1056  // Inverse of Lorentz factor gamma
1057  const amrex::Real invgam = 1._rt / std::sqrt(1._rt + uxp[ip] * uxp[ip] * invcsq
1058  + uyp[ip] * uyp[ip] * invcsq
1059  + uzp[ip] * uzp[ip] * invcsq);
1060  // Product of particle charges and weights
1061  amrex::Real wq = q * wp[ip];
1062  if (do_ionization) wq *= ion_lev[ip];
1063 
1064  // Current particle positions (in physical units)
1065  amrex::ParticleReal xp, yp, zp;
1066  GetPosition(ip, xp, yp, zp);
1067 
1068  // Particle velocities
1069  const amrex::Real vx = uxp[ip] * invgam;
1070  const amrex::Real vy = uyp[ip] * invgam;
1071  const amrex::Real vz = uzp[ip] * invgam;
1072 
1073  // Modify the particle position to match the time of the deposition
1074  xp += relative_time * vx;
1075  yp += relative_time * vy;
1076  zp += relative_time * vz;
1077 
1078  // Particle current densities
1079 #if defined(WARPX_DIM_XZ)
1080  const amrex::Real wqy = wq * vy * invvol;
1081 #endif
1082 
1083  // Current and old particle positions in grid units
1084  // Keep these double to avoid bug in single precision.
1085  double const x_new = (xp - xmin + 0.5_rt*dt*vx) * dxi;
1086  double const x_old = (xp - xmin - 0.5_rt*dt*vx) * dxi;
1087 #if defined(WARPX_DIM_3D)
1088  // Keep these double to avoid bug in single precision.
1089  double const y_new = (yp - ymin + 0.5_rt*dt*vy) * dyi;
1090  double const y_old = (yp - ymin - 0.5_rt*dt*vy) * dyi;
1091 #endif
1092  // Keep these double to avoid bug in single precision.
1093  double const z_new = (zp - zmin + 0.5_rt*dt*vz) * dzi;
1094  double const z_old = (zp - zmin - 0.5_rt*dt*vz) * dzi;
1095 
1096  // Shape factor arrays for current and old positions (nodal)
1097  // Keep these double to avoid bug in single precision.
1098  double sx_new[depos_order+1] = {0.};
1099  double sx_old[depos_order+1] = {0.};
1100 #if defined(WARPX_DIM_3D)
1101  // Keep these double to avoid bug in single precision.
1102  double sy_new[depos_order+1] = {0.};
1103  double sy_old[depos_order+1] = {0.};
1104 #endif
1105  // Keep these double to avoid bug in single precision.
1106  double sz_new[depos_order+1] = {0.};
1107  double sz_old[depos_order+1] = {0.};
1108 
1109  // Compute shape factors for current positions
1110 
1111  // i_new leftmost grid point in x that the particle touches
1112  // sx_new shape factor along x for the centering of each current
1113  Compute_shape_factor< depos_order > const compute_shape_factor;
1114  const int i_new = compute_shape_factor(sx_new, x_new);
1115 #if defined(WARPX_DIM_3D)
1116  // j_new leftmost grid point in y that the particle touches
1117  // sy_new shape factor along y for the centering of each current
1118  const int j_new = compute_shape_factor(sy_new, y_new);
1119 #endif
1120  // k_new leftmost grid point in z that the particle touches
1121  // sz_new shape factor along z for the centering of each current
1122  const int k_new = compute_shape_factor(sz_new, z_new);
1123 
1124  // Compute shape factors for old positions
1125 
1126  // i_old leftmost grid point in x that the particle touches
1127  // sx_old shape factor along x for the centering of each current
1128  const int i_old = compute_shape_factor(sx_old, x_old);
1129 #if defined(WARPX_DIM_3D)
1130  // j_old leftmost grid point in y that the particle touches
1131  // sy_old shape factor along y for the centering of each current
1132  const int j_old = compute_shape_factor(sy_old, y_old);
1133 #endif
1134  // k_old leftmost grid point in z that the particle touches
1135  // sz_old shape factor along z for the centering of each current
1136  const int k_old = compute_shape_factor(sz_old, z_old);
1137 
1138  // Deposit current into Dx_arr, Dy_arr and Dz_arr
1139 #if defined(WARPX_DIM_XZ)
1140 
1141  for (int k=0; k<=depos_order; k++) {
1142  for (int i=0; i<=depos_order; i++) {
1143 
1144  // Re-casting sx_new and sz_new from double to amrex::Real so that
1145  // Atomic::Add has consistent types in its argument
1146  auto const sxn_szn = static_cast<amrex::Real>(sx_new[i] * sz_new[k]);
1147  auto const sxo_szn = static_cast<amrex::Real>(sx_old[i] * sz_new[k]);
1148  auto const sxn_szo = static_cast<amrex::Real>(sx_new[i] * sz_old[k]);
1149  auto const sxo_szo = static_cast<amrex::Real>(sx_old[i] * sz_old[k]);
1150 
1151  if (i_new == i_old && k_new == k_old) {
1152  // temp arrays for Dx and Dz
1153  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_new + i, lo.y + k_new + k, 0, 0),
1154  wq * invvol * invdt * (sxn_szn - sxo_szo));
1155 
1156  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_new + i, lo.y + k_new + k, 0, 1),
1157  wq * invvol * invdt * (sxn_szo - sxo_szn));
1158 
1159  // Dy
1160  amrex::Gpu::Atomic::AddNoRet(&Dy_arr(lo.x + i_new + i, lo.y + k_new + k, 0, 0),
1161  wqy * 0.25_rt * (sxn_szn + sxn_szo + sxo_szn + sxo_szo));
1162  } else {
1163  // temp arrays for Dx and Dz
1164  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_new + i, lo.y + k_new + k, 0, 0),
1165  wq * invvol * invdt * sxn_szn);
1166 
1167  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_old + i, lo.y + k_old + k, 0, 0),
1168  - wq * invvol * invdt * sxo_szo);
1169 
1170  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_new + i, lo.y + k_old + k, 0, 1),
1171  wq * invvol * invdt * sxn_szo);
1172 
1173  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_old + i, lo.y + k_new + k, 0, 1),
1174  - wq * invvol * invdt * sxo_szn);
1175 
1176  // Dy
1177  amrex::Gpu::Atomic::AddNoRet(&Dy_arr(lo.x + i_new + i, lo.y + k_new + k, 0, 0),
1178  wqy * 0.25_rt * sxn_szn);
1179 
1180  amrex::Gpu::Atomic::AddNoRet(&Dy_arr(lo.x + i_new + i, lo.y + k_old + k, 0, 0),
1181  wqy * 0.25_rt * sxn_szo);
1182 
1183  amrex::Gpu::Atomic::AddNoRet(&Dy_arr(lo.x + i_old + i, lo.y + k_new + k, 0, 0),
1184  wqy * 0.25_rt * sxo_szn);
1185 
1186  amrex::Gpu::Atomic::AddNoRet(&Dy_arr(lo.x + i_old + i, lo.y + k_old + k, 0, 0),
1187  wqy * 0.25_rt * sxo_szo);
1188  }
1189 
1190  }
1191  }
1192 
1193 #elif defined(WARPX_DIM_3D)
1194 
1195  for (int k=0; k<=depos_order; k++) {
1196  for (int j=0; j<=depos_order; j++) {
1197 
1198  auto const syn_szn = static_cast<amrex::Real>(sy_new[j] * sz_new[k]);
1199  auto const syo_szn = static_cast<amrex::Real>(sy_old[j] * sz_new[k]);
1200  auto const syn_szo = static_cast<amrex::Real>(sy_new[j] * sz_old[k]);
1201  auto const syo_szo = static_cast<amrex::Real>(sy_old[j] * sz_old[k]);
1202 
1203  for (int i=0; i<=depos_order; i++) {
1204 
1205  auto const sxn_syn_szn = static_cast<amrex::Real>(sx_new[i]) * syn_szn;
1206  auto const sxo_syn_szn = static_cast<amrex::Real>(sx_old[i]) * syn_szn;
1207  auto const sxn_syo_szn = static_cast<amrex::Real>(sx_new[i]) * syo_szn;
1208  auto const sxo_syo_szn = static_cast<amrex::Real>(sx_old[i]) * syo_szn;
1209  auto const sxn_syn_szo = static_cast<amrex::Real>(sx_new[i]) * syn_szo;
1210  auto const sxo_syn_szo = static_cast<amrex::Real>(sx_old[i]) * syn_szo;
1211  auto const sxn_syo_szo = static_cast<amrex::Real>(sx_new[i]) * syo_szo;
1212  auto const sxo_syo_szo = static_cast<amrex::Real>(sx_old[i]) * syo_szo;
1213 
1214  if (i_new == i_old && j_new == j_old && k_new == k_old) {
1215  // temp arrays for Dx, Dy and Dz
1216  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_new + i, lo.y + j_new + j, lo.z + k_new + k, 0),
1217  wq * invvol * invdt * (sxn_syn_szn - sxo_syo_szo));
1218 
1219  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_new + i, lo.y + j_new + j, lo.z + k_new + k, 1),
1220  wq * invvol * invdt * (sxn_syn_szo - sxo_syo_szn));
1221 
1222  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_new + i, lo.y + j_new + j, lo.z + k_new + k, 2),
1223  wq * invvol * invdt * (sxn_syo_szn - sxo_syn_szo));
1224 
1225  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_new + i, lo.y + j_new + j, lo.z + k_new + k, 3),
1226  wq * invvol * invdt * (sxo_syn_szn - sxn_syo_szo));
1227  } else {
1228  // temp arrays for Dx, Dy and Dz
1229  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_new + i, lo.y + j_new + j, lo.z + k_new + k, 0),
1230  wq * invvol * invdt * sxn_syn_szn);
1231 
1232  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_old + i, lo.y + j_old + j, lo.z + k_old + k, 0),
1233  - wq * invvol * invdt * sxo_syo_szo);
1234 
1235  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_new + i, lo.y + j_new + j, lo.z + k_old + k, 1),
1236  wq * invvol * invdt * sxn_syn_szo);
1237 
1238  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_old + i, lo.y + j_old + j, lo.z + k_new + k, 1),
1239  - wq * invvol * invdt * sxo_syo_szn);
1240 
1241  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_new + i, lo.y + j_old + j, lo.z + k_new + k, 2),
1242  wq * invvol * invdt * sxn_syo_szn);
1243 
1244  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_old + i, lo.y + j_new + j, lo.z + k_old + k, 2),
1245  - wq * invvol * invdt * sxo_syn_szo);
1246 
1247  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_old + i, lo.y + j_new + j, lo.z + k_new + k, 3),
1248  wq * invvol * invdt * sxo_syn_szn);
1249 
1250  amrex::Gpu::Atomic::AddNoRet(&temp_arr(lo.x + i_new + i, lo.y + j_old + j, lo.z + k_old + k, 3),
1251  - wq * invvol * invdt * sxn_syo_szo);
1252  }
1253  }
1254  }
1255  }
1256 #endif
1257  } );
1258 
1259 #if defined(WARPX_DIM_3D)
1260  amrex::ParallelFor(Dx_fab.box(), [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
1261  {
1262  const amrex::Real t_a = temp_arr(i,j,k,0);
1263  const amrex::Real t_b = temp_arr(i,j,k,1);
1264  const amrex::Real t_c = temp_arr(i,j,k,2);
1265  const amrex::Real t_d = temp_arr(i,j,k,3);
1266  Dx_arr(i,j,k) += (1._rt/6._rt)*(2_rt*t_a + t_b + t_c - 2._rt*t_d);
1267  Dy_arr(i,j,k) += (1._rt/6._rt)*(2_rt*t_a + t_b - 2._rt*t_c + t_d);
1268  Dz_arr(i,j,k) += (1._rt/6._rt)*(2_rt*t_a - 2._rt*t_b + t_c + t_d);
1269  });
1270 #elif defined(WARPX_DIM_XZ)
1271  amrex::ParallelFor(Dx_fab.box(), [=] AMREX_GPU_DEVICE (int i, int j, int) noexcept
1272  {
1273  const amrex::Real t_a = temp_arr(i,j,0,0);
1274  const amrex::Real t_b = temp_arr(i,j,0,1);
1275  Dx_arr(i,j,0) += (0.5_rt)*(t_a + t_b);
1276  Dz_arr(i,j,0) += (0.5_rt)*(t_a - t_b);
1277  });
1278 #endif
1279  // Synchronize so that temp_fab can be safely deallocated in its destructor
1281 
1282 
1283 # if defined(WARPX_USE_GPUCLOCK)
1284  if( load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::GpuClock) {
1286  *cost += *cost_real;
1287  amrex::The_Managed_Arena()->free(cost_real);
1288  }
1289 # endif
1290 #endif // #if !(defined WARPX_DIM_RZ || defined WARPX_DIM_1D_Z)
1291 }
1292 #endif // CURRENTDEPOSITION_H_
#define AMREX_ALWAYS_ASSERT(EX)
#define AMREX_GPU_DEVICE
#define AMREX_D_DECL(a, b, c)
void doDepositionShapeN(const GetParticlePosition &GetPosition, const amrex::ParticleReal *const wp, const amrex::ParticleReal *const uxp, const amrex::ParticleReal *const uyp, const amrex::ParticleReal *const uzp, const int *const ion_lev, amrex::FArrayBox &jx_fab, amrex::FArrayBox &jy_fab, amrex::FArrayBox &jz_fab, const long np_to_depose, const amrex::Real relative_time, const std::array< amrex::Real, 3 > &dx, const std::array< amrex::Real, 3 > &xyzmin, const amrex::Dim3 lo, const amrex::Real q, const int n_rz_azimuthal_modes, amrex::Real *cost, const long load_balance_costs_update_algo)
Current Deposition for thread thread_num.
Definition: CurrentDeposition.H:53
void doEsirkepovDepositionShapeN(const GetParticlePosition &GetPosition, const amrex::ParticleReal *const wp, const amrex::ParticleReal *const uxp, const amrex::ParticleReal *const uyp, const amrex::ParticleReal *const uzp, const int *const ion_lev, const amrex::Array4< amrex::Real > &Jx_arr, const amrex::Array4< amrex::Real > &Jy_arr, const amrex::Array4< amrex::Real > &Jz_arr, const long np_to_depose, const amrex::Real dt, const amrex::Real relative_time, const std::array< amrex::Real, 3 > &dx, const std::array< amrex::Real, 3 > xyzmin, const amrex::Dim3 lo, const amrex::Real q, const int n_rz_azimuthal_modes, amrex::Real *const cost, const long load_balance_costs_update_algo)
Esirkepov Current Deposition for thread thread_num.
Definition: CurrentDeposition.H:575
void doVayDepositionShapeN(const GetParticlePosition &GetPosition, const amrex::ParticleReal *const wp, const amrex::ParticleReal *const uxp, const amrex::ParticleReal *const uyp, const amrex::ParticleReal *const uzp, const int *const ion_lev, amrex::FArrayBox &Dx_fab, amrex::FArrayBox &Dy_fab, amrex::FArrayBox &Dz_fab, const long np_to_depose, const amrex::Real dt, const amrex::Real relative_time, const std::array< amrex::Real, 3 > &dx, const std::array< amrex::Real, 3 > &xyzmin, const amrex::Dim3 lo, const amrex::Real q, const int n_rz_azimuthal_modes, amrex::Real *cost, const long load_balance_costs_update_algo)
Vay current deposition (Vay et al, 2013) for thread thread_num: deposit D in real space and store the...
Definition: CurrentDeposition.H:953
void doDepositionSharedShapeN(const GetParticlePosition &GetPosition, const amrex::ParticleReal *const wp, const amrex::ParticleReal *const uxp, const amrex::ParticleReal *const uyp, const amrex::ParticleReal *const uzp, const int *const ion_lev, amrex::FArrayBox &jx_fab, amrex::FArrayBox &jy_fab, amrex::FArrayBox &jz_fab, const long np_to_depose, const amrex::Real relative_time, const std::array< amrex::Real, 3 > &dx, const std::array< amrex::Real, 3 > &xyzmin, const amrex::Dim3 lo, const amrex::Real q, const int n_rz_azimuthal_modes, amrex::Real *cost, const long load_balance_costs_update_algo, const amrex::DenseBins< WarpXParticleContainer::ParticleType > &a_bins, const amrex::Box &box, const amrex::Geometry &geom, const amrex::IntVect &a_tbox_max_size)
Current Deposition for thread thread_num using shared memory.
Definition: CurrentDeposition.H:365
#define WARPX_ALWAYS_ASSERT_WITH_MESSAGE(EX, MSG)
Definition: TextMsg.H:13
amrex::GpuComplex< amrex::Real > Complex
Definition: WarpX_Complex.H:22
NODE
static amrex::IntVect shared_tilesize
tileSize to use for shared current deposition operations
Definition: WarpX.H:370
static int shared_mem_current_tpb
number of threads to use per block in shared deposition
Definition: WarpX.H:367
Defines a timer object to be used on GPU; measures summed thread cycles.
Definition: KernelTimer.H:27
virtual void free(void *pt)=0
virtual void * alloc(std::size_t sz)=0
const Box & box() const noexcept
AMREX_FORCE_INLINE Array4< T const > array() const noexcept
AMREX_GPU_HOST_DEVICE BoxND & grow(int i) noexcept
AMREX_GPU_HOST_DEVICE IntVectND< dim > type() const noexcept
AMREX_GPU_HOST_DEVICE Long numPts() const noexcept
GpuArray< Real, AMREX_SPACEDIM > InvCellSizeArray() const noexcept
index_type * offsetsPtr() noexcept
index_type * permutationPtr() noexcept
Long numBins() const noexcept
const Box & Domain() const noexcept
GpuArray< Real, AMREX_SPACEDIM > ProbLoArray() const noexcept
static std::size_t sharedMemPerBlock() noexcept
static constexpr auto c
vacuum speed of light [m/s]
Definition: constant.H:44
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void AddNoRet(T *sum, T value) noexcept
void streamSynchronize() noexcept
gpuStream_t gpuStream() noexcept
std::enable_if_t< std::is_integral_v< T > > ParallelFor(TypeList< CTOs... > ctos, std::array< int, sizeof...(CTOs)> const &runtime_options, T N, F &&f)
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE BoxND< dim > convert(const BoxND< dim > &b, const IntVectND< dim > &typ) noexcept
void launch(T const &n, L &&f) noexcept
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Dim3 end(BoxND< dim > const &box) noexcept
Arena * The_Managed_Arena()
IntVectND< AMREX_SPACEDIM > IntVect
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void ignore_unused(const Ts &...)
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Dim3 begin(BoxND< dim > const &box) noexcept
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE constexpr const T & max(const T &a, const T &b) noexcept
void Abort(const std::string &msg)
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE int getTileIndex(const IntVect &iv, const Box &box, const bool a_do_tiling, const IntVect &a_tile_size, Box &tbx)
i
Definition: check_interp_points_and_weights.py:174
int dx
Definition: stencil.py:436
int dt
Definition: stencil.py:440
Definition: ShapeFactors.H:27
Definition: ShapeFactors.H:82
Functor that can be used to extract the positions of the macroparticles inside a ParallelFor kernel.
Definition: GetAndSetPosition.H:53
@ GpuClock
Definition: WarpXAlgorithmSelection.H:138
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE constexpr T real() const noexcept
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE constexpr T imag() const noexcept
AMREX_GPU_DEVICE T * dataPtr() noexcept