w3d_utilities.F



[addsortedefield] [convertindextoproc] [domaindecomposefields] [domaindecomposeparticles] [flagparticlesbysortedssn] [getabsgrad3d] [getichild] [getichildpositiveonly] [initializedecomp] [setupdecompositionw3d] [setupgrid2dtype] [setupgrid3dtype] [smooth121nonzero] [smooth3d_121] [smooth3d_121_stride] [smooth3d_121_stride_mask] [sortparticlesbyindex1] [sortparticlesbyindex2] [sortparticlesbyindexgetisort]

#include top.h
 @(#) File w3d_utilities.F, version $Revision: 1.42 $, $Date: 2011/12/08 22:10:59 $
 # Copyright (c) 1990-1998, The Regents of the University of California.
 # All rights reserved.  See LEGAL.LLNL for full text and disclaimer.
   This is main file of package W3D of code WARP
   3d electrostatic PIC code, Cartesian geometry, for beam problems
   Alex Friedman, LLNL, (510)422-0827
   David P. Grote, LLNL, (510)423-7194

      subroutine sortparticlesbyindex1(n,indx,x,y,z,uz,nw,wfact,nblocks,
     &                                 xout,yout,zout,uzout,wfactout,pcounts)
      use Subtimersw3d
      integer(ISZ):: n,nw,nblocks
      integer(ISZ):: indx(n)
      real(kind=8):: x(n),y(n),z(n),uz(n),wfact(nw)
      real(kind=8):: xout(n),yout(n),zout(n),uzout(n),wfactout(nw)
      integer(ISZ):: pcounts(0:nblocks-1)

  Given particle data and an index array, the routine sorts the particles
  based on the index. It returns the sorted particles and a list of the
  number of particles for each index.

      integer(ISZ):: ip,ii,ib
      integer(ISZ):: tcounts(0:nblocks-1)
      real(kind=8):: substarttime,wtime
      if (lw3dtimesubs) substarttime = wtime()

      --- First, count how many in each block there are.
      --- Note that the blocks numbers start at zero.
      pcounts = 0
      do ip=1,n
        ib = indx(ip)
        pcounts(ib) = pcounts(ib) + 1
      enddo

      --- Generate partial sum of pcounts
      tcounts(0) = 1
      do ib=0,nblocks-2
        tcounts(ib+1) = tcounts(ib) + pcounts(ib)
      enddo

      --- Rearrange the particles
      do ip=1,n
        ib = indx(ip)
        ii = tcounts(ib)
        xout(ii) = x(ip)
        yout(ii) = y(ip)
        zout(ii) = z(ip)
        uzout(ii) = uz(ip)
        if (nw == n) wfactout(ii) = wfact(ip)
        tcounts(ib) = tcounts(ib) + 1
      enddo

!$OMP MASTER
      if (lw3dtimesubs) timesortpart_byindex = timesortpart_byindex +
     &                                             wtime() - substarttime
!$OMP END MASTER

      return
      end

      subroutine sortparticlesbyindex2(n,indx,x,y,z,ux,uy,uz,gaminv,nw,wght,
     &                                 nblocks,
     &                                 xout,yout,zout,uxout,uyout,uzout,
     &                                 gaminvout,wghtout,pcounts)
      use Subtimersw3d
      integer(ISZ):: n,nw,nblocks
      integer(ISZ):: indx(n)
      real(kind=8):: x(n),y(n),z(n),ux(n),uy(n),uz(n),gaminv(n)
      real(kind=8):: wght(nw)
      real(kind=8):: xout(n),yout(n),zout(n),uxout(n),uyout(n),uzout(n)
      real(kind=8):: gaminvout(n)
      real(kind=8):: wghtout(nw)
      integer(ISZ):: pcounts(0:nblocks-1)

  Given particle data and an index array, the routine sorts the particles
  based on the index. It returns the sorted particles and a list of the
  number of particles for each index.

      integer(ISZ):: ip,ii,ib
      integer(ISZ):: tcounts(0:nblocks-1)
      real(kind=8):: substarttime,wtime
      if (lw3dtimesubs) substarttime = wtime()

      --- First, count how many in each block there are.
      --- Note that the blocks numbers start at zero.
      pcounts = 0
      do ip=1,n
        ib = indx(ip)
        pcounts(ib) = pcounts(ib) + 1
      enddo

      --- Generate partial sum of pcounts
      tcounts(0) = 1
      do ib=0,nblocks-2
        tcounts(ib+1) = tcounts(ib) + pcounts(ib)
      enddo

      --- Rearrange the particles
      do ip=1,n
        ib = indx(ip)
        ii = tcounts(ib)
        xout(ii) = x(ip)
        yout(ii) = y(ip)
        zout(ii) = z(ip)
        uxout(ii) = ux(ip)
        uyout(ii) = uy(ip)
        uzout(ii) = uz(ip)
        gaminvout(ii) = gaminv(ip)
        if (nw == n) wghtout(ii) = wght(ip)
        tcounts(ib) = tcounts(ib) + 1
      enddo

!$OMP MASTER
      if (lw3dtimesubs) timesortpart_byindex = timesortpart_byindex +
     &                                             wtime() - substarttime
!$OMP END MASTER

      return
      end

      subroutine sortparticlesbyindexgetisort(n,indx,x,y,z,nblocks,
     &                                        xout,yout,zout,isort,pcounts)
      use Subtimersw3d
      integer(ISZ):: n,nblocks
      integer(ISZ):: indx(n)
      real(kind=8):: x(n),y(n),z(n)
      real(kind=8):: xout(n),yout(n),zout(n)
      integer(ISZ):: isort(n)
      integer(ISZ):: pcounts(0:nblocks-1)

  Given particle data and an index array, the routine sorts the particles
  based on the index. It returns the sorted particles and a list of the
  number of particles for each index.

      integer(ISZ):: ip,ii,ib
      integer(ISZ):: tcounts(0:nblocks-1)
      real(kind=8):: substarttime,wtime
      if (lw3dtimesubs) substarttime = wtime()

      --- First, count how many in each block there are.
      --- Note that the blocks numbers start at zero.
      pcounts = 0
      do ip=1,n
        ib = indx(ip)
        pcounts(ib) = pcounts(ib) + 1
      enddo

      --- Generate partial sum of pcounts
      tcounts(0) = 1
      do ib=0,nblocks-2
        tcounts(ib+1) = tcounts(ib) + pcounts(ib)
      enddo

      --- Rearrange the particles
      do ip=1,n
        ib = indx(ip)
        ii = tcounts(ib)
        xout(ii) = x(ip)
        yout(ii) = y(ip)
        zout(ii) = z(ip)
        isort(ii) = ip - 1
        tcounts(ib) = tcounts(ib) + 1
      enddo

!$OMP MASTER
      if (lw3dtimesubs) timesortpart_byindexgetisort =
     &            timesortpart_byindexgetisort + wtime() - substarttime
!$OMP END MASTER

      return
      end

      subroutine getichild(gridnumb,np,x,y,z,ichild,nx,ny,nz,grid,
     &                     xmin,xmax,ymin,ymax,zmin,zmax,zgrid,
     &                     l2symtry,l4symtry)
      use Subtimersw3d
      integer(ISZ):: gridnumb,nx,ny,nz,np
      real(kind=8):: x(np), y(np), z(np)
      integer(ISZ):: ichild(np)
      integer(ISZ):: grid(0:nx,0:ny,0:nz)
      real(kind=8):: xmin,xmax,ymin,ymax,zmin,zmax,zgrid
      logical(ISZ):: l2symtry,l4symtry

  Gathers data from a 3-D mesh onto particle positions.
  Same as getgridngp3d, but only gathers the data if it is positive.

      integer(ISZ):: ip,ix,iy,iz
      real(kind=8):: gx,gy,gz
      real(kind=8):: dxi,dyi,dzi,wx,wy,wz
      real(kind=8):: xsymmetryplane,ysymmetryplane
      real(kind=8):: substarttime,wtime
      if (lw3dtimesubs) substarttime = wtime()

      --- These now default to zero, but could be input quantities.
      xsymmetryplane = 0.
      ysymmetryplane = 0.

      --- set grid cell inverse sizes
      dxi = nx/(xmax - xmin)
      if (ymin == ymax) then
        dyi = 1.
      else
        dyi = ny/(ymax - ymin)
      endif
      dzi = nz/(zmax - zmin)

      ix = 0
      iy = 0
      iz = 0

      --- loop over particles
      do ip=1,np
        if (ichild(ip) .ne. gridnumb) cycle

        --- find location on grid, taking into account any symmetries.
        if (l4symtry) then
          gx = abs(x(ip) - xsymmetryplane) + xsymmetryplane
          gy = abs(y(ip) - ysymmetryplane) + ysymmetryplane
        else if (l2symtry) then
          gx = x(ip)
          gy = abs(y(ip) - ysymmetryplane) + ysymmetryplane
        else
          gx = x(ip)
          gy = y(ip)
        endif

        gz = z(ip) - zgrid

        --- if not within grid, skip it
        if (((gx < xmin .or. gx > xmax) .and. nx > 0) .or.
     &      ((gy < ymin .or. gy > ymax) .and. ny > 0) .or.
     &      ((gz < zmin .or. gz > zmax) .and. nz > 0)) cycle

        if (nx > 0) ix = int((gx - xmin)*dxi)
        if (ny > 0) iy = int((gy - ymin)*dyi)
        if (nz > 0) iz = int((gz - zmin)*dzi)

        ichild(ip) = abs(grid(ix,iy,iz))

      enddo

!$OMP MASTER
      if (lw3dtimesubs) timegetichild = timegetichild + wtime() - substarttime
!$OMP END MASTER

      return
      end

      subroutine getichildpositiveonly(gridnumb,np,x,y,z,ichild,nx,ny,nz,grid,
     &                                 xmin,xmax,ymin,ymax,zmin,zmax,zgrid,
     &                                 l2symtry,l4symtry)
      use Subtimersw3d
      integer(ISZ):: gridnumb,nx,ny,nz,np
      real(kind=8):: x(np), y(np), z(np)
      integer(ISZ):: ichild(np)
      integer(ISZ):: grid(0:nx,0:ny,0:nz)
      real(kind=8):: xmin,xmax,ymin,ymax,zmin,zmax,zgrid
      logical(ISZ):: l2symtry,l4symtry

  Gathers data from a 3-D mesh onto particle positions.
  Same as getgridngp3d, but only gathers the data if it is positive.

      integer(ISZ):: ip,ix,iy,iz
      real(kind=8):: gx,gy,gz
      real(kind=8):: dxi,dyi,dzi,wx,wy,wz
      real(kind=8):: xsymmetryplane,ysymmetryplane
      real(kind=8):: substarttime,wtime
      if (lw3dtimesubs) substarttime = wtime()

      --- These now default to zero, but could be input quantities.
      xsymmetryplane = 0.
      ysymmetryplane = 0.

      --- set grid cell inverse sizes
      dxi = nx/(xmax - xmin)
      if (ymin == ymax) then
        dyi = 1.
      else
        dyi = ny/(ymax - ymin)
      endif
      dzi = nz/(zmax - zmin)

      ix = 0
      iy = 0
      iz = 0

      --- loop over particles
      do ip=1,np
        if (ichild(ip) .ne. gridnumb) cycle

        --- find location on grid, taking into account any symmetries.
        if (l4symtry) then
          gx = abs(x(ip) - xsymmetryplane) + xsymmetryplane
          gy = abs(y(ip) - ysymmetryplane) + ysymmetryplane
        else if (l2symtry) then
          gx = x(ip)
          gy = abs(y(ip) - ysymmetryplane) + ysymmetryplane
        else
          gx = x(ip)
          gy = y(ip)
        endif

        gz = z(ip) - zgrid

        --- if not within grid, skip it
        if (((gx < xmin .or. gx > xmax) .and. nx > 0) .or.
     &      ((gy < ymin .or. gy > ymax) .and. ny > 0) .or.
     &      ((gz < zmin .or. gz > zmax) .and. nz > 0)) cycle

        if (nx > 0) ix = int((gx - xmin)*dxi)
        if (ny > 0) iy = int((gy - ymin)*dyi)
        if (nz > 0) iz = int((gz - zmin)*dzi)

        if (grid(ix,iy,iz) >= 0) ichild(ip) = grid(ix,iy,iz)

      enddo

!$OMP MASTER
      if (lw3dtimesubs) timegetichildpositiveonly = timegetichildpositiveonly +
     &                                              wtime() - substarttime
!$OMP END MASTER

      return
      end

      subroutine addsortedefield(n,isort,tex,tey,tez,ex,ey,ez)
      use Subtimersw3d
      integer(ISZ):: n
      integer(ISZ):: isort(0:n-1)
      real(kind=8),dimension(0:n-1):: tex,tey,tez,ex,ey,ez

      integer(ISZ):: i
      real(kind=8):: substarttime,wtime
      if (lw3dtimesubs) substarttime = wtime()

      do i=0,n-1
        ex(isort(i)) = ex(isort(i)) + tex(i)
        ey(isort(i)) = ey(isort(i)) + tey(i)
        ez(isort(i)) = ez(isort(i)) + tez(i)
      enddo

!$OMP MASTER
      if (lw3dtimesubs) timeaddsortedefield = timeaddsortedefield +
     &                                        wtime() - substarttime
!$OMP END MASTER

      return
      end

      subroutine getabsgrad3d(nx,ny,nz,f,gr,dx,dy,dz)
      use Subtimersw3d
      integer(ISZ):: nx,ny,nz
      real(kind=8):: f(0:nx,0:ny,0:nz)
      real(kind=8):: gr(0:nx,0:ny,0:nz)
      real(kind=8):: dx,dy,dz

      integer(ISZ):: ix,iy,iz
      real(kind=8):: dxi,dyi,dzi
      integer(ISZ):: ixm1,ixp1,iym1,iyp1,izm1,izp1
      real(kind=8):: substarttime,wtime
      if (lw3dtimesubs) substarttime = wtime()

      dxi = 1./dx**2
      dyi = 1./dy**2
      dzi = 1./dz**2

      do iz=0,nz
        do iy=0,ny
          do ix=0,nx
            ixm1 = ix - 1
            ixp1 = ix + 1
            if (ix == 0) ixm1 = 1
            if (ix == nx) ixp1 = nx-1
            iym1 = iy - 1
            iyp1 = iy + 1
            if (iy == 0) iym1 = 1
            if (iy == ny) iyp1 = ny-1
            izm1 = iz - 1
            izp1 = iz + 1
            if (iz == 0) izm1 = 1
            if (iz == nz) izp1 = nz-1

             gr(ix,iy,iz) = (abs(f(ixp1,iy  ,iz  )-f(ix  ,iy  ,iz  )) +
      &                      abs(f(ix  ,iy  ,iz  )-f(ixm1,iy  ,iz  )))*dxi +
      &                     (abs(f(ix  ,iyp1,iz  )-f(ix  ,iy  ,iz  )) +
      &                      abs(f(ix  ,iy  ,iz  )-f(ix  ,iym1,iz  )))*dyi +
      &                     (abs(f(ix  ,iy  ,izp1)-f(ix  ,iy  ,iz  )) +
      &                      abs(f(ix  ,iy  ,iz  )-f(ix  ,iy  ,izm1)))*dzi
            gr(ix,iy,iz) = sqrt( max( abs(f(ixp1,iy  ,iz  )-f(ix  ,iy  ,iz  )),
     &                                abs(f(ix  ,iy  ,iz  )-f(ixm1,iy  ,iz  )))**2*dxi +
     &                           max( abs(f(ix  ,iyp1,iz  )-f(ix  ,iy  ,iz  )),
     &                                abs(f(ix  ,iy  ,iz  )-f(ix  ,iym1,iz  )))**2*dyi +
     &                           max( abs(f(ix  ,iy  ,izp1)-f(ix  ,iy  ,iz  )),
     &                                abs(f(ix  ,iy  ,iz  )-f(ix  ,iy  ,izm1)))**2*dzi)
          enddo
        enddo
      enddo

!$OMP MASTER
      if (lw3dtimesubs) timegetabsgrad = timegetabsgrad + wtime() - substarttime
!$OMP END MASTER

      return
      end

      subroutine smooth121nonzero(uin,uout,nx,ny,nz)
      integer(ISZ):: nx,ny,nz
      real(kind=8):: uin(0:nx,0:ny,0:nz)
      real(kind=8):: uout(0:nx,0:ny,0:nz)

      integer(ISZ):: ix,iy,iz,jx,jy,jz
      real(kind=8):: ww(-1:1,-1:1,-1:1)
      real(kind=8):: wwsum,w1(-1:1)

      w1 = (/0.25,0.5,0.25/)
      do jz=-1,1
        do jy=-1,1
          do jx=-1,1
            ww(jx,jy,jz) = w1(jx)*w1(jy)*w1(jz)
          enddo
        enddo
      enddo

      do iz=1,nz-1
        do iy=1,ny-1
          do ix=1,nx-1

            uout(ix,iy,iz) = 0.
            if (uin(ix,iy,iz) == 0.) cycle

            wwsum = 0.
            do jz=-1,1
              do jy=-1,1
                do jx=-1,1

                  if (uin(ix+jx,iy+jy,iz+jz) .ne. 0.) then
                    uout(ix,iy,iz) = uout(ix,iy,iz) +
     &                               ww(jx,jy,jz)*uin(ix+jx,iy+jy,iz+jz)
                    wwsum = wwsum + ww(jx,jy,jz)
                  endif

                enddo
              enddo
            enddo

            if (wwsum .ne. 0.) then
              uout(ix,iy,iz) = uout(ix,iy,iz)/wwsum
            endif

          enddo
        enddo
      enddo

      return
      end
      logical(ISZ) function arecoordinatesconsist(nx,dx,xmin,xmax,dxi)
      integer(ISZ):: nx
      real(kind=8):: dx,xmin,xmax,dxi

  For the Gridndtype (or any grid coordinate specifications), makes sure
  that the input is consistent. At most one can be unspecified. If all are
  set, the values must be consistent. If one is not specified, it is
  calculated. The grid cell size inverses are always calculated

      integer(ISZ):: defaultcount

      defaultcount = 0
      if (nx == -1) defaultcount = defaultcount + 1
      if (dx == LARGEPOS) defaultcount = defaultcount + 1
      if (xmin == +LARGEPOS) defaultcount = defaultcount + 1
      if (xmax == -LARGEPOS) defaultcount = defaultcount + 1

      if (defaultcount > 1) then
        arecoordinatesconsist = .false.
        return
      else if (defaultcount == 0) then
        arecoordinatesconsist = (nx == nint((xmax - xmin)/dx))
        return
      else
        if (nx == -1) nx = nint((xmax - xmin)/dx)
        if (dx == LARGEPOS) dx = (xmax - xmin)/nx
        if (xmin == +LARGEPOS) xmin = xmax - nx*dx
        if (xmax == -LARGEPOS) xmax = xmin + nx*dx
        dxi = 1./dx
        arecoordinatesconsist = .true.
        return
      endif

      return
      end

[setupiondensitygrid3d]
      subroutine setupgrid3dtype(grid,check)
      use Grid3dtypemodule
      type(Grid3dtype):: grid
      logical(ISZ):: check

  This checks if the input coordinates are consistent, and if so, allocates
  array.

      logical(ISZ):: arecoordinatesconsist

      check = arecoordinatesconsist(grid%nx,grid%dx,grid%xmin,grid%xmax,
     &                              grid%dxi)
      if (.not. check) return
      check = arecoordinatesconsist(grid%ny,grid%dy,grid%ymin,grid%ymax,
     &                              grid%dyi)
      if (.not. check) return
      check = arecoordinatesconsist(grid%nz,grid%dz,grid%zmin,grid%zmax,
     &                              grid%dzi)
      if (.not. check) return

      call Grid3dtypechange(grid)

      return
      end

[multigridberzf] [set_polarization]
      subroutine setupgrid2dtype(grid,check)
      use Grid2dtypemodule
      type(Grid2dtype):: grid
      logical(ISZ):: check

  This checks if the input coordinates are consistent, and if so, allocates
  array.

      logical(ISZ):: arecoordinatesconsist

      check = arecoordinatesconsist(grid%nx,grid%dx,grid%xmin,grid%xmax,
     &                              grid%dxi)
      if (.not. check) return
      check = arecoordinatesconsist(grid%ny,grid%dy,grid%ymin,grid%ymax,
     &                              grid%dyi)
      if (.not. check) return

      call Grid2dtypechange(grid)

      return
      end
  Some routines used by the parallel code that need to be accessible
  from python

[setupdecompositionw3d]
      subroutine domaindecomposefields(nz,nprocs,lfsautodecomp,
     &                                 izdecomp,nzdecomp,overlap)
      use InGen, Only: fstype
      integer(ISZ):: nz,nprocs
      logical(ISZ):: lfsautodecomp
      integer(ISZ):: izdecomp(0:nprocs-1),nzdecomp(0:nprocs-1)
      integer(ISZ):: overlap

      real(kind=8):: zperproc,avezpp,ztot
      integer(ISZ):: bestnz
      integer(ISZ):: i

 ---------------------------------------------------------------------------
      --- An overlap of one plane is needed for the FFT field solvers.
      --- An overlap of two planes is needed for SOR field solver since
      --- plane 0 of each processor must overlap with a plane which is
      --- calculated in the neighboring processor.
      --- As an example, for nz=8 divided among 2 PE's...
      ---   for overlap=1        0 1 2 3 4 5 6 7 8
      ---                       |_________|
      ---                               |_________|
      ---   for overlap=2        0 1 2 3 4 5 6 7 8
      ---                       |___________|
      ---                               |_________|
      if (overlap == 0) then
        if (fstype == 3 .or. fstype == 7 .or. fstype == 8 .or. fstype == 13) then
          overlap = 2
        else
          overlap = 1
        endif
      endif

 ---------------------------------------------------------------------------
      --- The special case of nz == 0.
      if (nz == 0) then
        izdecomp = 0
        nzdecomp = 0
        return
      endif

      --- Make sure that the number of processors is less than the number of
      --- grid cells.
      if (nprocs > nz) then
        call kaboom("domaindecomposefields: The number of grid cells must be
     & greater than the number of processors along each dimension.")
        return
      endif

 ---------------------------------------------------------------------------
      --- Do the domain decomposition for the field solver.
      --- The domain decomposition by default is done so that each processor
      --- has nearly the same number of z planes. If lautodecomp is false and
      --- fstype == 3, then the decomposition is supplied by the user (input
      --- through nzdecomp).
      if (lfsautodecomp .or.
     &    (fstype /= 3 .and. fstype /= 7 .and. fstype /= 12 .and.
     &     fstype /= 13)) then

        --- Calculate average number of z planes per processor, including the
        --- extra space for overlap.  For FFT, the minimum number of planes
        --- allowable is 1 and for PSOR, the minimum number of planes allowable
        --- is 2.
        zperproc = (nz + (nprocs - 1.)*(overlap - 1))/nprocs
        if (zperproc < overlap) zperproc = overlap

        --- bestnz is zperproc rounded down.
        bestnz = int(zperproc)
     
        --- First processor is easy
        nzdecomp(0) = nint(zperproc)
        izdecomp(0) = 0
        ztot = nzdecomp(0)

        --- loop over processors until used all processors or have assigned
        --- all of grid.
        i = 0
        do while (i < nprocs-1 .and.
     &            izdecomp(i)+nzdecomp(i)-overlap+1 < nz)
          i = i + 1

          --- This processor starts at the end of the region covered
          --- by the previous processor, overlapping it by the value of overlap.
          izdecomp(i) = izdecomp(i-1) + nzdecomp(i-1) + 1 - overlap

          --- The number of z planes given to this processor is first assumed to
          --- be bestnz.  If this gives an average number of z planes per
          --- processor that is less then zperproc, it is increased by 1.
          nzdecomp(i) = bestnz
          ztot = ztot + nzdecomp(i)
          avezpp = ztot/(i+1)
          if (avezpp < zperproc) then
            nzdecomp(i) = nzdecomp(i) + 1
            ztot = ztot + 1
          endif

          --- Check if this region extends past the end of the grid.  If so,
          --- recalculate nzdecomp(i).
          if (izdecomp(i) + nzdecomp(i) > nz) then
            nzdecomp(i) = nz - izdecomp(i)
            --- If nzdecomp(i) is less than 3 then skip this
            --- processor and give remaining zones to previous processor.
            if (nzdecomp(i) < 3) then
              i = i - 1
              nzdecomp(i) = nz - izdecomp(i)
            endif
          endif

        enddo

        --- Save the number of processors that have part of the grid assigned
        --- to them.
        nprocs = i+1

      else

        --- nzdecomp is assumed to be input by the user and is assumed to
        --- not include the overlap.

        --- First check to make sure that all values are > 0.
        do i=0,nprocs-1
          if (nzdecomp(i) <= 0) then
            print*,"domaindecomposefields: BAD VALUE ",nz,i,nzdecomp(i)
            call kaboom("domaindecomposefields: ERROR: nz for the field solver
     & for each processor must be greater than zero")
            return
          endif
        enddo

        --- Make sure that the sum of nzdecomp is consistent with nz
        if (sum(nzdecomp) .ne. nz) then
          print*,"domaindecomposefields: BAD VALUE ",nz," != (",nzdecomp,")"
          call kaboom("domaindecomposefields: ERROR: sum of the domain
     &sizes must be equal to the total number of grid cells")
          return
        endif

        --- Fill in the array izdecomp, based on the inputted nzdecomp and
        --- add the overlap to nzdecomp.
        izdecomp(0) = 0
        do i=1,nprocs-1
          izdecomp(i) = izdecomp(i-1) + nzdecomp(i-1)
        enddo
        --- Note that the last processor has no overlap
        do i=0,nprocs-2
          nzdecomp(i) = nzdecomp(i) + overlap - 1
        enddo
        --- Get the new value of nz.
        nz = izdecomp(nprocs-1) + nzdecomp(nprocs-1)

      endif

      return
      end

[setupdecompositionw3d]
      subroutine domaindecomposeparticles(nz,nprocs,nzpextra,zmmin,dz,
     &                                    userdecomp,lautodecomp,
     &                                    izdecomp,nzdecomp,
     &                                    zpslmin,zpslmax)
      use InGen3d,Only: solvergeom,AMRgeom
      integer(ISZ):: nz,nprocs,nzpextra
      real(kind=8):: zmmin,dz
      real(kind=8):: userdecomp(0:nprocs-1)
      logical(ISZ):: lautodecomp
      integer(ISZ):: izdecomp(0:nprocs-1),nzdecomp(0:nprocs-1)
      real(kind=8):: zpslmin(0:nprocs-1),zpslmax(0:nprocs-1)

      integer(ISZ):: i
      real(kind=8):: zlast

      --- nz == 0 is a special case
      if (nz == 0) then
        izdecomp = 0
        nzdecomp = 0
        return
      endif

      --- Make sure that the number of processors is less than the number of
      --- grid cells.
      if (nprocs > nz) then
        call kaboom("domaindecomposeparticles: The number of grid cells must
     & be greater than the number of processors along each dimension.")
        return
      endif

      if (.not. lautodecomp) then

        --- Set the decomposition based on the user set userdecomp. This
        --- is the fractional ranges of the particles for each processor.
        --- It is assumed that userdecomp has been properly normalized so
        --- that if the sum of the ranges covers the whole grid, then the
        --- sum is one.

        --- All values of userdecomp must be > 0.
        do i=0,nprocs-1
          if (userdecomp(i) <= 0.) then
            call kaboom("domaindecomposeparticles: ERROR: The length of all
     & particle domains must be positive. Fix userdecomp appropriately.")
            return
          endif
        enddo

        --- Set minimum z of each processor.
        zlast = zmmin
        do i=0,nprocs-1
          zpslmin(i) = zlast
          zpslmax(i) = zlast + userdecomp(i)*nz*dz
          zlast = zpslmax(i)
        enddo

        --- When using solvergeom==AMRgeom, the particle decomposition must
        --- be aligned with the grid.
        if (solvergeom==AMRgeom) then
         do i=0,nprocs-1
           zpslmin(i) = nint((zpslmin(i) - zmmin)/dz)*dz
           zpslmax(i) = nint((zpslmax(i) - zmmin)/dz)*dz
         enddo
        endif

      endif

      --- Now set iz and nz based off of zpslmin and zpslmax.
      --- This is done so that zmesh[iz] <= zpmin, and zmesh[iz+nz] >= zpmax.
      do i=0,nprocs-1

        izdecomp(i) = int((zpslmin(i) - zmmin)/dz)

        --- Check for the case which has a round-off problem, which
        --- happens when zpmin lies on a grid point. This ensures that if
        --- zpmin does lie on a grid point, then iz is set to that point.
        if (zpslmin(i) == zmmin + (izdecomp(i) + 1)*dz) then
          izdecomp(i) = izdecomp(i) + 1
        endif

        nzdecomp(i) = int((zpslmax(i) - zmmin)/dz) - izdecomp(i)

        --- Check to ensure that zmesh[iz+nz] >= zpmax. This is needed since
        --- the int rounds down.
        if (zpslmax(i) > zmmin + (izdecomp(i) + nzdecomp(i))*dz) then
          nzdecomp(i) = nzdecomp(i) + 1
        endif

        --- Now add in the user specifed extra cells.
        izdecomp(i) = izdecomp(i) - nzpextra
        nzdecomp(i) = nzdecomp(i) + 2*nzpextra

        --- Make sure that the processors don't have grid cells
        --- sticking out the end beyond the edge of the mesh.
        if (izdecomp(i) < 0) then
          nzdecomp(i) = nzdecomp(i) + izdecomp(i)
          izdecomp(i) = 0
        endif
        if (izdecomp(i) + nzdecomp(i) > nz) then
          nzdecomp(i) = nz - izdecomp(i)
        endif

      enddo

      return
      end

[setupdecompositionw3d]
      subroutine initializedecomp(decomp)
      use Decompositionmodule
      type(Decomposition):: decomp

#ifdef MPIPARALLEL
      call initializedecomp_work(decomp)
#else
      decomp%mpi_comm_x = 0
      decomp%mpi_comm_y = 0
      decomp%mpi_comm_z = 0
#endif

      return
      end

      integer(ISZ) function convertindextoproc(ix,iy,iz,nx,ny,nz)
      integer(ISZ):: ix,iy,iz,nx,ny,nz

      integer(ISZ):: ixt,iyt,izt

      ixt = ix
      iyt = iy
      izt = iz

      if (ixt < 0   ) ixt = nx - 1
      if (ixt > nx-1) ixt = 0
      if (iyt < 0   ) iyt = ny - 1
      if (iyt > ny-1) iyt = 0
      if (izt < 0   ) izt = nz - 1
      if (izt > nz-1) izt = 0

      convertindextoproc = ixt + iyt*nx + izt*nx*ny

      return
      end

[w3dgen] [wxygen]
      subroutine setupdecompositionw3d()
      use Subtimersw3d
      use Parallel
      use InGen
      use InGen3d
      use InDiag
      use InPart
      use InMesh3d
      use Picglb
      use Picglb3d
      use Fields3d
      use Fields3dParticles
      use Io
      use Z_arrays
      use LatticeInternal
      use Z_Moments
      use InjectVars
      use InjectVars3d

  This is the routine which divies up the work among the parallel processors.

      integer(ISZ):: convertindextoproc

      --- nslaves is obsolete but still set it since it is used in places.
      nslaves = nprocs

      --- This does the decomposition in z by default, but includes
      --- any user specified transverse decomposition.
      nzprocs = nprocs/(nxprocs*nyprocs)
      if (nxprocs*nyprocs*nzprocs .ne. nprocs) then
        call kaboom("setupdecompositionw3d: nxprocs*nyprocs*nzprocs must be equal to nprocs")
        return
      endif
      call gchange("Parallel",0)

      if (my_index > 0) then
        verbosity = 0
        lprntpara = .false.
      endif

      --- Divide up the processors using fortran ordering, with
      --- x being the most rapidly varying index.
      izproc = int(my_index/(nxprocs*nyprocs))
      iyproc = int((my_index - izproc*(nxprocs*nyprocs))/nxprocs)
      ixproc = my_index - izproc*(nxprocs*nyprocs) - iyproc*nxprocs
      iprocgrid(0) = ixproc
      iprocgrid(1) = iyproc
      iprocgrid(2) = izproc
      nprocgrid(0) = nxprocs
      nprocgrid(1) = nyprocs
      nprocgrid(2) = nzprocs

      --- Find the six neighboring processors
       procneighbors(0,0) = convertindextoproc(ixproc-1,iyproc  ,izproc  ,
     &                                         nxprocs,nyprocs,nzprocs)
       procneighbors(1,0) = convertindextoproc(ixproc+1,iyproc  ,izproc  ,
     &                                         nxprocs,nyprocs,nzprocs)
       procneighbors(0,1) = convertindextoproc(ixproc  ,iyproc-1,izproc  ,
     &                                         nxprocs,nyprocs,nzprocs)
       procneighbors(1,1) = convertindextoproc(ixproc  ,iyproc+1,izproc  ,
     &                                         nxprocs,nyprocs,nzprocs)
       procneighbors(0,2) = convertindextoproc(ixproc  ,iyproc  ,izproc-1,
     &                                         nxprocs,nyprocs,nzprocs)
       procneighbors(1,2) = convertindextoproc(ixproc  ,iyproc  ,izproc+1,
     &                                         nxprocs,nyprocs,nzprocs)

 ---------------------------------------------------------------------------
      --- Setup the decomposition data
      fsdecomp%my_index = my_index
      fsdecomp%nxglobal = nx
      fsdecomp%nyglobal = ny
      fsdecomp%nzglobal = nz
      fsdecomp%ixproc = ixproc
      fsdecomp%iyproc = iyproc
      fsdecomp%izproc = izproc
      fsdecomp%nxprocs = nxprocs
      fsdecomp%nyprocs = nyprocs
      fsdecomp%nzprocs = nzprocs
      call Decompositionchange(fsdecomp)
      fsdecomp%iprocgrid = iprocgrid
      fsdecomp%nprocgrid = nprocgrid

      ppdecomp%my_index = my_index
      ppdecomp%nxglobal = nx
      ppdecomp%nyglobal = ny
      ppdecomp%nzglobal = nz
      ppdecomp%ixproc = ixproc
      ppdecomp%iyproc = iyproc
      ppdecomp%izproc = izproc
      ppdecomp%nxprocs = nxprocs
      ppdecomp%nyprocs = nyprocs
      ppdecomp%nzprocs = nzprocs
      call Decompositionchange(ppdecomp)
      ppdecomp%iprocgrid = iprocgrid
      ppdecomp%nprocgrid = nprocgrid

      --- Create communicator groups for processors along each axis.
      call initializedecomp(fsdecomp)
      ppdecomp%mpi_comm_x = fsdecomp%mpi_comm_x
      ppdecomp%mpi_comm_y = fsdecomp%mpi_comm_y
      ppdecomp%mpi_comm_z = fsdecomp%mpi_comm_z

      --- Calculate how the work is to be arranged among the processors.

      call domaindecomposefields(nx,nxprocs,lfsautodecomp.and.lfsautodecompx,
     &                           fsdecomp%ix,fsdecomp%nx,grid_overlap)
      call domaindecomposefields(ny,nyprocs,lfsautodecomp.and.lfsautodecompy,
     &                           fsdecomp%iy,fsdecomp%ny,grid_overlap)
      call domaindecomposefields(nz,nzprocs,lfsautodecomp.and.lfsautodecompz,
     &                           fsdecomp%iz,fsdecomp%nz,grid_overlap)

      --- Set the grid cell size, which is needed below. It must be set here
      --- after the field solve domain decomposition since nzlocal may be
      --- determined from the user supplied decomposition.
      if (nx > 0) then
        dx = (xmmax - xmmin)/nx
      endif
      if (ny > 0) then
        dy = (ymmax - ymmin)/ny
      endif
      if (nz > 0) then
        dz = (zmmax - zmmin)/nz
      endif

      fsdecomp%xmin = xmmin + fsdecomp%ix*dx
      fsdecomp%ymin = ymmin + fsdecomp%iy*dy
      fsdecomp%zmin = zmmin + fsdecomp%iz*dz
      fsdecomp%xmax = xmmin + (fsdecomp%ix + fsdecomp%nx)*dx
      fsdecomp%ymax = ymmin + (fsdecomp%iy + fsdecomp%ny)*dy
      fsdecomp%zmax = zmmin + (fsdecomp%iz + fsdecomp%nz)*dz

      --- Now setup the decomposition of the particles. Note that the
      --- xmin, xmax etc in ppdecomp will be the boundaries for the particles
      --- and are not necessarily tied to any grid.
      --- Set default values for the ppdecomp.xmin etc. from the values in
      --- fsdecomp. The grid_overlap is removed so that the ppdecomp
      --- domains do not overlap. Note that if lautodecomp is false, these
      --- values will be ignored.
      ppdecomp%xmin = xmmin + (fsdecomp%ix + int((grid_overlap-1)/2))*dx
      ppdecomp%ymin = ymmin + (fsdecomp%iy + int((grid_overlap-1)/2))*dy
      ppdecomp%zmin = zmmin + (fsdecomp%iz + int((grid_overlap-1)/2))*dz
      ppdecomp%xmax = xmmin + (fsdecomp%ix+fsdecomp%nx - int(grid_overlap/2))*dx
      ppdecomp%ymax = ymmin + (fsdecomp%iy+fsdecomp%ny - int(grid_overlap/2))*dy
      ppdecomp%zmax = zmmin + (fsdecomp%iz+fsdecomp%nz - int(grid_overlap/2))*dz
      --- The domain of the first and last processor extends to the edge
      --- of the grid.
      ppdecomp%xmin(0) = fsdecomp%xmin(0)
      ppdecomp%ymin(0) = fsdecomp%ymin(0)
      ppdecomp%zmin(0) = fsdecomp%zmin(0)
      ppdecomp%xmax(nxprocs-1) = fsdecomp%xmax(nxprocs-1)
      ppdecomp%ymax(nyprocs-1) = fsdecomp%ymax(nyprocs-1)
      ppdecomp%zmax(nzprocs-1) = fsdecomp%zmax(nzprocs-1)

      call domaindecomposeparticles(nx,nxprocs,0,xmmin,dx,
     &                              userdecompx,lautodecomp,
     &                              ppdecomp%ix,ppdecomp%nx,
     &                              ppdecomp%xmin,ppdecomp%xmax)
      call domaindecomposeparticles(ny,nyprocs,0,ymmin,dy,
     &                              userdecompy,lautodecomp,
     &                              ppdecomp%iy,ppdecomp%ny,
     &                              ppdecomp%ymin,ppdecomp%ymax)
      call domaindecomposeparticles(nz,nzprocs,0,zmmin,dz,
     &                              userdecompz,lautodecomp,
     &                              ppdecomp%iz,ppdecomp%nz,
     &                              ppdecomp%zmin,ppdecomp%zmax)

 ---------------------------------------------------------------------------
      --- Reset local values
      if (injctspc > 0) then
        injctspc = max(1,int(injctspc/nprocs*1.1))
      endif

      nxlocal = fsdecomp%nx(ixproc)
      nylocal = fsdecomp%ny(iyproc)
      nzlocal = fsdecomp%nz(izproc)
      xmminlocal = fsdecomp%xmin(ixproc)
      ymminlocal = fsdecomp%ymin(iyproc)
      zmminlocal = fsdecomp%zmin(izproc)
      xmmaxlocal = fsdecomp%xmax(ixproc)
      ymmaxlocal = fsdecomp%ymax(iyproc)
      zmmaxlocal = fsdecomp%zmax(izproc)
      nxp = ppdecomp%nx(ixproc)
      nyp = ppdecomp%ny(iyproc)
      nzp = ppdecomp%nz(izproc)

      --- The xmminp etc are tied to the field solver grid, with xmminp being
      --- the location of the grid cell just below the particle boundary and
      --- xmmaxp being just above.
      xmminp = xmmin + ppdecomp%ix(ixproc)*dx
      ymminp = ymmin + ppdecomp%iy(iyproc)*dy
      zmminp = zmmin + ppdecomp%iz(izproc)*dz
      xmmaxp = xmmin + (ppdecomp%ix(ixproc) + ppdecomp%nx(ixproc))*dx
      ymmaxp = ymmin + (ppdecomp%iy(iyproc) + ppdecomp%ny(iyproc))*dy
      zmmaxp = zmmin + (ppdecomp%iz(izproc) + ppdecomp%nz(izproc))*dz

      --- The xpminlocal etc are set to the particle boundaries and are not
      --- tied to any grid.
      xpminlocal = ppdecomp%xmin(ixproc)
      ypminlocal = ppdecomp%ymin(iyproc)
      zpminlocal = ppdecomp%zmin(izproc)
      xpmaxlocal = ppdecomp%xmax(ixproc)
      ypmaxlocal = ppdecomp%ymax(iyproc)
      zpmaxlocal = ppdecomp%zmax(izproc)

      --- Copy the data over to the old arrays, until the rest of the code
      --- has caught up (and then they can be deleted).
      izfsslave = fsdecomp%iz
      nzfsslave = fsdecomp%nz
      izpslave = ppdecomp%iz
      nzpslave = ppdecomp%nz
      zpslmin =  ppdecomp%zmin
      zpslmax =  ppdecomp%zmax

      return
      end

      subroutine smooth3d_121(q,nx,ny,nz,npass,alpha)
      --- performs a smoothing sequence of n passes of a three points stencil 
      --- with coefficients [(1-alpha)/2, alpha, (1-alpha)/2] along x, y and/or z.      
       implicit none

       integer(ISZ) :: nx,ny,nz,i,j,k,l,npass(3)

       real(kind=8), dimension(0:nx,0:ny,0:nz) :: q
       real(kind=8) :: temp0, temp1, alpha(3), a, b

!     x smoothing
        a = alpha(1)
        b = (1.-a)/2.
        do i=1,npass(1)
          do  l=0,nz
            do  k=0,ny
              temp0 = q(0,k,l)
              j=0; q(j,k,l) = q(j,k,l)*(1.-b)+b*q(j+1,k,l)
              do  j=1,nx-1
                temp1 = q(j,k,l)
                q(j,k,l) = a*q(j,k,l)+b*(temp0+q(j+1,k,l))
                temp0 = temp1
              end do
              j=nx;q(j,k,l) = q(j,k,l)*(1.-b)+b*temp0
            end do
          end do
        end do

!      y smoothing
        a = alpha(2)
        b = (1.-a)/2.
        do i=1,npass(2)
          do  l=0,nz
            do  j=0,nx
              temp0 = q(j,0,l)
              k=0; q(j,k,l) = q(j,k,l)*(1.-b)+b*q(j,k+1,l)
              do  k=1,ny-1
                temp1 = q(j,k,l)
                q(j,k,l) = a*q(j,k,l)+b*(temp0+q(j,k+1,l))
                temp0 = temp1
              end do
              k=ny;q(j,k,l) = q(j,k,l)*(1.-b)+b*temp0
            end do
          end do
        end do

!     z smoothing
        a = alpha(3)
        b = (1.-a)/2.
        do i=1,npass(3)
          do  k=0,ny
            do  j=0,nx
              temp0 = q(j,k,0)
              l=0; q(j,k,l) = q(j,k,l)*(1.-b)+b*q(j,k,l+1)
              do  l=1,nz-1
                temp1 = q(j,k,l)
                q(j,k,l) = a*q(j,k,l)+b*(temp0+q(j,k,l+1))
                temp0 = temp1
              end do
              l=nz;q(j,k,l) = q(j,k,l)*(1.-b)+b*temp0
            end do
          end do
        end do

        return
      end subroutine smooth3d_121

      subroutine smooth3d_121_stride(q,nx,ny,nz,npass,alpha,stride)
      --- performs a smoothing sequence of n passes of a three points stencil 
      --- with coefficients [(1-alpha)/2, alpha, (1-alpha)/2] along x, y and/or z,
      --- with a stride.
       implicit none

       integer(ISZ) :: nx,ny,nz,i,j,k,l,npass(3),stride(3)

       real(kind=8), dimension(0:nx,0:ny,0:nz) :: q
       real(kind=8) :: tempx(0:nx), tempy(0:ny), tempz(0:nz), alpha(3), a, b

!           x smoothing
        a = alpha(1)
        b = (1.-a)/2.
        do i=1,npass(1)
          do  l=0,nz
            do  k=0,ny
              tempx = q(:,k,l)
              do j=0,stride(1)-1
                q(j,k,l) = (1.-b)*q(j,k,l)+b*q(j+stride(1),k,l)
              end do
              do j=nx-stride(1)+1,nx
                q(j,k,l) = (1.-b)*q(j,k,l)+b*q(j-stride(1),k,l)
              end do
              do j=stride(1),nx-stride(1)
                q(j,k,l) = a*q(j,k,l)+b*(tempx(j-stride(1))+tempx(j+stride(1)))
              end do
            end do
          end do
        end do

!           y smoothing
        a = alpha(2)
        b = (1.-a)/2.
        do i=1,npass(2)
          do  l=0,nz
            do  j=0,nx
              tempy = q(j,:,l)
              do k=0,stride(2)-1
                q(j,k,l) = (1.-b)*q(j,k,l)+b*q(j,k+stride(2),l)
              end do
              do k=ny-stride(2)+1,ny
                q(j,k,l) = (1.-b)*q(j,k,l)+b*q(j,k-stride(2),l)
              end do
              do  k=stride(2),ny-stride(2)
                q(j,k,l) = a*q(j,k,l)+b*(tempy(k-stride(2))+tempy(k+stride(2)))
              end do
            end do
          end do
        end do

!           z smoothing
        a = alpha(3)
        b = (1.-a)/2.
        do i=1,npass(3)
          do  k=0,ny
            do  j=0,nx
              tempz = q(j,k,:)
              do l=0,stride(3)-1
                q(j,k,l) = (1.-b)*q(j,k,l)+b*q(j,k,l+stride(3))
              end do
              do l=nz-stride(3)+1,nz
                q(j,k,l) = (1.-b)*q(j,k,l)+b*q(j,k,l-stride(3))
              end do
              do  l=stride(3),nz-stride(3)
                q(j,k,l) = a*q(j,k,l)+b*(tempz(l-stride(3))+tempz(l+stride(3)))
              end do
            end do
          end do
        end do

        return
      end subroutine smooth3d_121_stride

      subroutine smooth3d_121_stride_mask(q,mask,nx,ny,nz,npass,alpha,stride)
      --- performs a smoothing sequence of n passes of a three points stencil 
      --- with coefficients [(1-alpha)/2, alpha, (1-alpha)/2] along x, y and/or z,
      --- with a stride.
       implicit none

       integer(ISZ) :: nx,ny,nz,i,j,k,l,npass(3),stride(3)

       real(kind=8), dimension(0:nx,0:ny,0:nz) :: q,mask
       real(kind=8) :: tempx(0:nx), tempy(0:ny), tempz(0:nz), alpha(3), a, b

       if (npass(1) > 0 .and. stride(1) > 0) then
         mask(0:stride(1)-1,:,:)=0.
         mask(nx-stride(1)+1:,:,:)=0.
       endif
       if (npass(2) > 0 .and. stride(2) > 0) then
         if (nyɬ) then
           mask(:,0:stride(2)-1,:)=0.
           mask(:,ny-stride(2)+1:,:)=0.
         end if
       endif
       if (npass(3) > 0 .and. stride(3) > 0) then
         mask(:,:,0:stride(3)-1)=0.
         mask(:,:,nz-stride(3)+1:)=0.
       endif

!           x smoothing
        do i=1,npass(1)
          do  l=0,nz
            do  k=0,ny
              tempx = q(:,k,l)
              q(stride(1):nx-stride(1),k,l) = 0.
!              q(:,k,l) = 0.
              do  j=stride(1),nx-stride(1)
                ! deposit up
                b = (1.-alpha(1))/2.
                b = b*min(mask(j,k,l),mask(j+stride(1),k,l))
                a = 1.-2*b
                q(j,k,l) = q(j,k,l) + a/2*tempx(j)
                q(j+stride(1),k,l) = q(j+stride(1),k,l) + b*tempx(j)
                ! deposit down
                b = (1.-alpha(1))/2.
                b = b*min(mask(j,k,l),mask(j-stride(1),k,l))
                a = 1.-2*b
                q(j,k,l) = q(j,k,l) + a/2*tempx(j)
                q(j-stride(1),k,l) = q(j-stride(1),k,l) + b*tempx(j)
              end do
            end do
          end do
        end do

!           y smoothing
        do i=1,npass(2)
          do  l=0,nz
            do  j=0,nx
              tempy = q(j,:,l)
              q(j,stride(2):ny-stride(2),l) = 0.
!              q(j,:,l) = 0.
              do  k=stride(2),ny-stride(2)
                ! deposit up
                b = (1.-alpha(2))/2.
                b = b*min(mask(j,k,l),mask(j,k+stride(2),l))
                a = 1.-2*b
                q(j,k,l) = q(j,k,l) + a/2*tempy(k)
                q(j,k+stride(2),l) = q(j,k+stride(2),l) + b*tempy(k)
                ! deposit down
                b = (1.-alpha(2))/2.
                b = b*min(mask(j,k,l),mask(j,k-stride(2),l))
                a = 1.-2*b
                q(j,k,l) = q(j,k,l) + a/2*tempy(k)
                q(j,k-stride(2),l) = q(j,k-stride(2),l) + b*tempy(k)
              end do
            end do
          end do
        end do

!           z smoothing
        do i=1,npass(3)
          do  k=0,ny
            do  j=0,nx
              tempz = q(j,k,:)
              q(j,k,stride(3):nz-stride(3)) = 0.
!              q(j,k,:) = 0.
              do  l=stride(3),nz-stride(3)
                ! deposit up
                b = (1.-alpha(3))/2.
                b = b*min(mask(j,k,l),mask(j,k,l+stride(3)))
                a = 1.-2*b
                q(j,k,l) = q(j,k,l) + a/2*tempz(l)
                q(j,k,l+stride(3)) = q(j,k,l+stride(3)) + b*tempz(l)
                ! deposit down
                b = (1.-alpha(3))/2.
                b = b*min(mask(j,k,l),mask(j,k,l-stride(3)))
                a = 1.-2*b
                q(j,k,l) = q(j,k,l) + a/2*tempz(l)
                q(j,k,l-stride(3)) = q(j,k,l-stride(3)) + b*tempz(l)
              end do
            end do
          end do
        end do

        return
      end subroutine smooth3d_121_stride_mask

      subroutine flagparticlesbysortedssn(flag,np,ssn,pflag,
     &                                    nflagged,ssnflagged)
      real(kind=8):: flag
      integer(ISZ):: np,nflagged
      real(kind=8):: ssn(np),pflag(np)
      real(kind=8):: ssnflagged(nflagged)

  Flag all of the particles listed in the ssnflagged array.
  - flag: the value of the flag to be given to the listed particles
  - np: number of particles
  - ssn: a sorted list of ssns of the particles that are to be checked
  - pflag: flag id array that is to be flagged, in the same order as ssn
  - nflagged: number of ssns to be flagged
  - ssnflagged: a sorted list of social security numbers that are to
                be flagged

      integer(ISZ):: ip,ipflag

  --- Loop over the particles, in the order of sorted ssns
      ipflag = 1
      do ip=1,np

        --- Find the next flagged ssn that is equal to or greater than
        --- the ssn of the current particle
        do while (nint(ssnflagged(ipflag)) < nint(ssn(ip)))
          ipflag = ipflag + 1
          --- Exit the inner loop if there are no more flagged ssns
          if (ipflag > nflagged) exit
        enddo
        --- Exit the outer loop if there are no more flagged ssns
        if (ipflag > nflagged) exit

        --- Set the flag if a flagged particle is found
        if (nint(ssn(ip)) == nint(ssnflagged(ipflag))) then
          pflag(ip) = flag
        endif

      enddo

      return
      end subroutine flagparticlesbysortedssn