#include "copyright.i"

!*******************************************************************************
!
! Module: alltasks_setup_mod
!
! Description:  Setup of data structures for uniprocessor code as well as
!               mpi master and slave processors.
!              
!*******************************************************************************

module alltasks_setup_mod

use file_io_dat_mod

  implicit none

#ifdef MPI 
  private       do_initial_pme_atom_division
  private       check_atm_division
#endif

contains

!*******************************************************************************
!
! Subroutine:  alltasks_setup
!
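! Description: Per-task setup executed by every process.  Under mpi it
!              broadcasts the input data from the master, divides the atoms
!              among the tasks and allocates the communication buffers; in
!              all builds it sets up the bonded and nonbonded data structures,
!              seeds the random number generator and initializes velocities.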
!              
!*******************************************************************************

subroutine alltasks_setup(num_ints, num_reals)

! Purpose:
!   Performs the setup that every task (uniprocessor, mpi master or mpi
!   non-master) must run before dynamics/minimization starts:
!     - mpi builds: broadcast input data from the master, build the task
!       maps, divide the atoms among tasks, set up bonded-term ownership,
!       build the extra used atoms list and allocate send/recv buffers.
!     - non-mpi builds: set the uniprocessor counts and set up the bonded
!       and nonbonded exclusion data structures.
!     - all builds: seed the random number generator, optionally assign
!       initial velocities and apply the belly mask to the velocities.
!
! Arguments:
!   num_ints, num_reals - running allocation counters.  They are incremented
!                         here by the size of each array allocated (and
!                         decremented when arrays are released); they must
!                         NOT be zeroed by this routine.

  use angles_mod
  use bonds_mod
  use cit_mod
  use constraints_mod
  use dihedrals_mod
  use dynamics_mod
  use dynamics_dat_mod
  use extra_pnts_nb14_mod
#ifdef DIRFRC_EFS
  use ene_frc_splines_mod
#endif /* DIRFRC_EFS */
  use pme_direct_mod
  use pme_force_mod
  use pme_recip_mod
  use img_mod
  use inpcrd_dat_mod
  use mdin_ctrl_dat_mod
  use mdin_ewald_dat_mod
  use nb_exclusions_mod
  use nb_pairlist_mod
  use parallel_dat_mod
  use parallel_mod
  use pbc_mod
  use pmemd_lib_mod
  use prmtop_dat_mod
  use random_mod
#ifdef AMOEBA
  use mdin_amoeba_dat_mod
  use amoeba_recip_mod
  use amoeba_interface_mod
  use amoeba_adjust_mod, only : init_adjust_weight_idxs
#endif /* AMOEBA */
#ifdef DBG
  use file_io_mod
#endif /* DBG */

  implicit none

! Formal arguments:

! num_ints and num_reals are used to return allocation counts. Don't zero.

  integer, intent(in out)       :: num_ints, num_reals

! Local variables:

  integer               :: alloc_failed

#ifdef MPI
  integer               :: i
  integer               :: node
  integer               :: taskmap_idx
  integer, allocatable  :: extra_used_atms(:)
  integer,allocatable   :: use_atm_map(:)
#else
  integer               :: use_atm_map(natom)
#endif

#ifdef DBG
  character(max_fn_len) :: dbglog_name
#ifndef MPI
  integer               :: i
#endif

  ! Open one debug log per task (or a single one for the uniprocessor build).

#ifdef MPI
  dbglog = dbglog_base + mytaskid
  write(dbglog_name, '(a,i3.3)') 'dbglog.task', mytaskid
#else
  dbglog = dbglog_base
  write(dbglog_name, '(a,i3.3)') 'dbglog.uni'
#endif
  call amopen(dbglog, dbglog_name, 'N', 'F', 'W')
#endif /* DBG */

#ifdef MPI

! The size required for both use_atm_map and extra_used_atms (both temporary)
! is not known in slave nodes until the broadcasts are done below; thus we
! dynamically allocate and deallocate these arrays.

! Send and receive common blocks from the master node.  In the non-masters,
! memory is allocated as needed.

  call bcast_mdin_ctrl_dat

#ifdef AMOEBA
  if (iamoeba .eq. 0) then
    call bcast_amber_prmtop_dat
  else
    call bcast_mdin_amoeba_dat
    call bcast_amoeba_prmtop_dat
  end if
#else
  call bcast_amber_prmtop_dat
#endif /* AMOEBA */

  if (using_pme_potential) call bcast_mdin_ewald_dat

  call bcast_inpcrd_dat(natom)

  ! NOTE(review): the two calls below pass ntp and imin in opposite orders
  ! (ntp, imin with amoeba; imin, ntp without).  Both are integers, so a
  ! mismatch would compile silently - confirm each call against the
  ! dummy-argument order of bcast_dynamics_dat.

#ifdef AMOEBA
  call bcast_dynamics_dat(natom, nspm, ntp, imin, iamoeba)
#else
  call bcast_dynamics_dat(natom, nspm, imin, ntp)
#endif /* AMOEBA */
  call bcast_constraints_dat(natom, ibelly, ntr)

#ifdef AMOEBA
  if (iamoeba .eq. 0) then
    call bcast_extra_pnts_nb14_dat
  end if
#else
  call bcast_extra_pnts_nb14_dat
#endif /* AMOEBA */

  if (ntb .ne. 0) call bcast_pbc

  if (.not. master .and. using_pme_potential) then
    call set_cit_tbl_dims(pbc_box, vdw_cutoff + skinnb, cut_factor)
  end if

  if (using_pme_potential) then

    call bcast_img_dat(natom)
    call bcast_nb_pairlist_dat(natom, vdw_cutoff + skinnb)
    call bcast_nb_exclusions_dat(natom, next)
    call bcast_pme_force_dat(ntypes)
    call bcast_pme_direct_dat
#ifdef DIRFRC_EFS
    call bcast_ene_frc_splines_dat
#endif /* DIRFRC_EFS */

#ifdef AMOEBA
    if (iamoeba .ne. 0) call bcast_amoeba_dat
#endif /* AMOEBA */

    ! Set up data structures used in n to n mpi exchanges:

    ! The gbl_taskmap is an ordering of tasks other than this task ordered from
    ! mytaskid + 1 to mytaskid - 1, with wraparound at numtasks.

    allocate(gbl_taskmap(numtasks - 1), &
             gbl_inv_taskmap(numtasks - 1), &
             stat = alloc_failed)

    if (alloc_failed .ne. 0) call setup_alloc_error

    num_ints = num_ints + size(gbl_taskmap) + size(gbl_inv_taskmap)

    ! The following taskmaps set up the order for what is essentially a
    ! "synchronous shuffle exchange" (Tam and Wang) which we discovered
    ! independently.  We use it for our asynchronous comm.

    taskmap_idx = 0
    node = mytaskid + 1

    ! Walk the ring of task ids starting just after this task and stop when
    ! we come back around to ourselves.

    do
      if (node .ge. numtasks) node = 0
      if (node .eq. mytaskid) exit
      taskmap_idx = taskmap_idx + 1
      gbl_taskmap(taskmap_idx) = node
      node = node + 1
    end do

    ! The order of gbl_inv_taskmap is inverted.  If you receive and send
    ! sequentially, it works best to recv via gbl_taskmap while you
    ! simultaneously send via gbl_inv_taskmap (or vice versa).

    do taskmap_idx = 1, numtasks - 1
      gbl_inv_taskmap(taskmap_idx) = gbl_taskmap(numtasks - taskmap_idx)
    end do

    ! We must know about fft slab allocations before we do atom division:

    call pme_recip_setup(num_ints, num_reals)

    ! Divide atoms up among the processors.  The atom division is redone
    ! periodically under cit, and is either residue or molecule-based, with
    ! locality.  In other words, under cit a contiguous block of atoms owned by
    ! each process is a thing of the past.

    call do_initial_pme_atom_division(natom, num_ints, num_reals)

    ! We can now check that atom division is along residue boundaries.  We do
    ! this for constant pressure runs, where division is molecule-based. There
    ! appears to be a bug in leap whereby users can somehow create residues
    ! split between two molecules.  Ideally, we would find this problem for
    ! single and multiprocessor code, constant pressure and constant volume.
    ! Pragmatically, we just check in pmemd in the case where it will wreak
    ! havoc due to improper atom division among processors.

    if (ntb .eq. 2) call check_atm_division

  else if (using_gb_potential) then

    ! For the first GB implementation, we attempt to evenly divide the
    ! atom workload on residue boundaries.  We will keep track of atom
    ! ownership, but will keep all coordinates updated in all processes,
    ! assuming that large cutoff size and small atom count make it somewhere
    ! between unnecessary and perhaps even detrimental to do otherwise
    ! (in other words, attaining processor spatial locality is not practical
    ! with a small irregularly shaped population of atoms, given the large
    ! nonbonded cutoffs typical of GB).

    call do_initial_gb_atom_division(num_ints, num_reals)

  end if

  ! Bond-angle-dihedral ownership needs to be established.  We use the
  ! use_atm_map for both pme and GB here; in reality the atom usage info
  ! will not be kept for GB though.

  allocate(use_atm_map(natom), stat = alloc_failed)
  if (alloc_failed .ne. 0) call setup_alloc_error
  use_atm_map(:) = 0

#ifdef AMOEBA
  ! BUGBUG - Amoeba may need equivalent code for its valence interactions
  !          at some point...
  if (iamoeba .eq. 0) then
#endif /* AMOEBA */
    call bonds_setup(num_ints, num_reals, use_atm_map)
    call angles_setup(num_ints, num_reals, use_atm_map)
    call dihedrals_setup(num_ints, num_reals, use_atm_map)
    call nb14_setup(num_ints, num_reals, use_atm_map)
#ifdef AMOEBA
  end if
#endif /* AMOEBA */

  if (using_pme_potential) then

    ! Excluded atoms setup only necessary for pme-based potentials:

    ! The mask lists made here are used in nonbonded pairlist processing
    ! to keep nonbonded calcs from being done on excluded atoms.  The lists are
    ! static (atom-based).

    call make_atm_excl_mask_list(natom, atm_numex, next, gbl_natex)

    ! Extra used atoms setup is now possible.  This is only done for pme:

#ifdef AMOEBA
    if (iamoeba .ne. 0) then

      call init_adjust_weight_idxs()

      ! For now we disable the extra used atom mechanism in amoeba, as
      ! something like it will not be necessary until we have further
      ! developed the code that parallelizes Amoeba valence calcs...

      extra_used_atm_cnt = 0

    else
#endif /* AMOEBA */

      call make_nb_adjust_pairlst(my_atm_cnt, gbl_my_atm_lst, &
                                  gbl_atm_owner_map, use_atm_map, &
                                  gbl_nb_adjust_pairlst, &
                                  atm_nb_maskdata, atm_nb_mask)

      allocate(extra_used_atms(natom), stat = alloc_failed) ! temporary
      if (alloc_failed .ne. 0) call setup_alloc_error
      extra_used_atms(:) = 0

      extra_used_atm_cnt = 0

      ! Collect every atom that is marked in use_atm_map but is owned by
      ! some other task.

      do i = 1, natom
        if (use_atm_map(i) .ne. 0) then
          if (gbl_atm_owner_map(i) .ne. mytaskid) then
            extra_used_atm_cnt = extra_used_atm_cnt + 1
            extra_used_atms(extra_used_atm_cnt) = i
          end if
        end if
      end do

      ! The permanent list is only allocated when it would be non-empty.

      if (extra_used_atm_cnt .gt. 0) then

        allocate(gbl_extra_used_atms(extra_used_atm_cnt), stat = alloc_failed)
        if (alloc_failed .ne. 0) call setup_alloc_error
        num_ints = num_ints + size(gbl_extra_used_atms)

        gbl_extra_used_atms(1:extra_used_atm_cnt) = &
          extra_used_atms(1:extra_used_atm_cnt)

!       write(0,*)'DBG: Task ', mytaskid, ' has ', extra_used_atm_cnt, &
!                 'extra used atoms'
      end if

      deallocate(extra_used_atms) ! Release temporary buffer.

#ifdef AMOEBA
    end if
#endif /* AMOEBA */

  else if (using_gb_potential) then

    extra_used_atm_cnt = 0      ! not used...

  end if

  ! BUGBUG - These buffers are not yet used for Amoeba...

  ! Send/recv list allocs can now be done because my_atm_cnt is known. Only
  ! used for pme...

  if (using_pme_potential) then

    allocate(gbl_send_atm_lst(natom), &
             gbl_send_atm_cnts(0 : numtasks - 1), &
             gbl_recv_atm_lsts(my_atm_cnt, numtasks - 1), &
             gbl_recv_atm_cnts(0 : numtasks - 1), &
             stat = alloc_failed)

    if (alloc_failed .ne. 0) call setup_alloc_error

    num_ints = num_ints + size(gbl_send_atm_lst) + &
                          size(gbl_send_atm_cnts) + &
                          size(gbl_recv_atm_lsts) + &
                          size(gbl_recv_atm_cnts)

    gbl_send_atm_lst(:) = 0
    gbl_send_atm_cnts(:) = 0
    gbl_recv_atm_lsts(:,:) = 0
    gbl_recv_atm_cnts(:) = 0

  end if

  ! Allocate buffers for mpi i/o that will remain allocated throughout the
  ! run.  This is done because some mpi implementations (myrinet in particular)
  ! mmap the buffers, and using stack buffers could have negative performance
  ! impacts in result (ifc also mmap/munmaps the dynamic stack space).
  ! These buffers are not used when other static allocations are available, say
  ! when broadcasting initialization data from the master.  All i/o using
  ! these buffers must be completed before routine exit (ie., waits MUST be
  ! done on nonblocking i/o).
  ! We must be sure that the buffer size variables have been updated before
  ! this call...

  ! BUGBUG - beware of Amoeba situations that may require larger mpi buffers;
  !          these calcs here are driven by assumptions about the atom
  !          data sending/receiving mechanisms for amber pme which we will
  !          eventually be using for Amoeba, but not yet.  However, Amoeba
  !          could have other large mpi buffer requirements we are not yet
  !          aware of.

  if (using_pme_potential) then
#ifdef SLOW_NONBLOCKING_MPI
    call set_minimum_mpi_bufs_size(3 * natom, num_reals)
#else
    call set_minimum_mpi_bufs_size(max(3*natom, 3*my_atm_cnt*(numtasks-1)), &
                                   num_reals)
#endif
  else if (using_gb_potential) then
    call set_minimum_mpi_bufs_size(3 * natom, num_reals)
  end if

  if (allocated(use_atm_map)) deallocate(use_atm_map)

#else /* begin non-MPI code */

  ! The atom usage map is an automatic array in the uniprocessor build.  Zero
  ! it before it is handed to the setup routines, exactly as the mpi path
  ! does, so that no routine can ever read undefined values from it.

  use_atm_map(:) = 0

  ! We set up reciprocal force data structures here in parallel with
  ! where it has to be done for mpi code:

  if (using_pme_potential) call pme_recip_setup(num_ints, num_reals)

  my_mol_cnt = nspm   ! uniprocessor value.
  my_atm_cnt = natom  ! uniprocessor value.

  if (numextra .gt. 0) then
    my_ep_frame_cnt = gbl_frame_cnt
  else
    my_ep_frame_cnt = 0
  end if

#ifdef AMOEBA
  ! BUGBUG - Amoeba may need equivalent code for its valence interactions
  !          at some point...
  if (iamoeba .eq. 0) then
#endif /* AMOEBA */
    call bonds_setup(num_ints, num_reals, use_atm_map)
    call angles_setup(num_ints, num_reals, use_atm_map)
    call dihedrals_setup(num_ints, num_reals, use_atm_map)
    call nb14_setup(num_ints, num_reals, use_atm_map)

    ! gbl_bond is still needed for shake setup and resetup, but gbl_angle and
    ! gbl_dihed can be deallocated .

    num_ints = num_ints - size(gbl_angle) * angle_rec_ints
    num_ints = num_ints - size(gbl_dihed) * dihed_rec_ints

    deallocate(gbl_angle, gbl_dihed)

#ifdef AMOEBA
  end if
#endif /* AMOEBA */

  if (using_pme_potential) then

    ! Excluded atoms setup only necessary for pme-based potentials:

    ! The mask lists made here are used in nonbonded pairlist processing
    ! to keep nonbonded calcs from being done on excluded atoms.  The lists are
    ! static (atom-based).

    call make_atm_excl_mask_list(natom, atm_numex, next, gbl_natex)

#ifdef AMOEBA
    if (iamoeba .ne. 0) call init_adjust_weight_idxs()
#endif /* AMOEBA */

    call make_nb_adjust_pairlst(natom, use_atm_map, &
                                gbl_nb_adjust_pairlst, &
                                atm_nb_maskdata, atm_nb_mask)
  end if

#endif /* not MPI */

! Initialize random number generator at same point in all processors. Then, if
! random initial velocities are needed, generate them in all processors. 
! In general, we must be careful to generate the same sequence of random
! numbers in all processors.

  call amrset(ig)

  if (ntx .eq. 1 .or. ntx .eq. 2) then
    call all_atom_setvel(natom, atm_vel, atm_mass_inv, tempi)
  end if

  if (ibelly .gt. 0) then
    call all_atom_belly(natom, atm_igroup, atm_vel)
  end if

! The debug macro is unconditionally undefined on the next line, so the dump
! that follows is never compiled, even in debug builds.  Remove the undef
! directive to re-enable the dump.

#undef DBG
#ifdef DBG
! Dump extra points per-process data.

  write(dbglog, *) 'gbl_nb14_cnt=', gbl_nb14_cnt
  write(dbglog, *) 'gbl_frame_cnt=', gbl_frame_cnt
  
  write(dbglog, *) 'frameid, ep1 ep2 ep_cnt type parent atm1-3'

  do i = 1, gbl_frame_cnt
    write(dbglog,'(3i7,2i3,4i7)') i, &
                                  ep_frames(i)%extra_pnt(:), &
                                  ep_frames(i)%ep_cnt, &
                                  ep_frames(i)%type, &
                                  ep_frames(i)%parent_atm, &
                                  ep_frames(i)%frame_atm1, &
                                  ep_frames(i)%frame_atm2, &
                                  ep_frames(i)%frame_atm3
  end do

  write(dbglog, *) 'frameid, ep_lcl_crd:'

  do i = 1, gbl_frame_cnt
    write(dbglog,'(i8, i4, 3f12.5)') i, 1, ep_lcl_crd(:, 1, i)
    write(dbglog,'(i8, i4, 3f12.5)') i, 2, ep_lcl_crd(:, 2, i)
  end do
#endif /* DBG */

  return

end subroutine alltasks_setup

#ifdef MPI
!*******************************************************************************
!
! Internal Subroutine:  do_initial_pme_atom_division
!
! Description:  Determine atom distribution for parallel processing.
!
!*******************************************************************************

subroutine do_initial_pme_atom_division(atm_cnt, num_ints, num_reals)

! Purpose:
!   Performs the first division of images and atoms among the mpi tasks for
!   pme runs.  It validates that there are enough atoms, residues and (for
!   constant pressure) molecules per task, allocates the global division
!   tables, divides the images and then divides the atoms.
!
! Arguments:
!   atm_cnt             - atom count; sizes the local work arrays and is
!                         passed on to the division routines.
!   num_ints, num_reals - running allocation counters; num_ints is incremented
!                         by the size of every integer array allocated here.
!                         Don't zero.

  use cit_mod
  use extra_pnts_nb14_mod
  use pme_fft_mod
  use gbl_constants_mod
  use inpcrd_dat_mod
  use loadbal_mod
  use mdin_ctrl_dat_mod
  use mdin_ewald_dat_mod
  use parallel_dat_mod
  use pbc_mod
  use pmemd_lib_mod
  use prmtop_dat_mod

  implicit none

! Formal arguments:

! num_ints and num_reals are used to return allocation counts. Don't zero.

  integer, intent(in)           :: atm_cnt
  integer, intent(in out)       :: num_ints, num_reals

! Local variables:

  integer               :: alloc_failed
  logical               :: even_division ! .true. when no recip biasing is done.
  double precision      :: fraction(3, atm_cnt) ! in range 0.0 - +0.999...
  integer               :: crd_idx_lst_tbl(0 : cit_tbl_x_dim - 1, &
                                           0 : cit_tbl_y_dim - 1, &
                                           0 : cit_tbl_z_dim - 1)
  type(atm_lst_rec)     :: atm_lst(atm_cnt)

  
! The following checks for minimum atoms, residues, and molecules per
! processor are intended to avoid hitting conditions we may not
! have considered.  The use of more processors than atoms, residues or
! molecules would be a bit ridiculous anyway...

! Check for sufficient atoms for the number of processors:

  if (natom .lt. 10 * numtasks) then
    write(mdout, '(a,a)') error_hdr, 'Must have 10x more atoms than processors!'
    call mexit(6, 1)
  end if

! Check for sufficient residues for the number of processors:

  if (nres .lt. 4 * numtasks) then
    write(mdout, '(a,a)') error_hdr, 'Must have 4x more residues than processors!'
    call mexit(6, 1)
  end if

! For CP MD, check for sufficient molecules for the number of processors:

  if (ntp .ne. 0 .and. nspm .lt. 4 * numtasks) then
    write(mdout, '(a,a)') error_hdr, &
                          'Must have 4x more molecules than processors!'
    call mexit(6, 1)
  end if

! Allocate storage for various structures associated with keeping track of the
! division of the atom and image workload.  The gbl_img_div_tbl has one extra
! element at the end for passing a boolean back to slave nodes indicating that
! atom workload redistribution is needed.

  allocate(gbl_img_div_tbl(0:numtasks + 1), &
           gbl_vec_offsets(0:numtasks), &
           gbl_atm_offsets(0:numtasks), &
           gbl_vec_rcvcnts(0:numtasks), &
           gbl_atm_owner_map(natom), &
           gbl_my_atm_lst(natom), &
           stat = alloc_failed)

  if (alloc_failed .ne. 0) call setup_alloc_error

  num_ints = num_ints + size(gbl_img_div_tbl) + &
                        size(gbl_vec_offsets) + &
                        size(gbl_atm_offsets) + &
                        size(gbl_vec_rcvcnts) + &
                        size(gbl_atm_owner_map) + &
                        size(gbl_my_atm_lst)

  ! The extra point frame list is only needed when there are extra points
  ! and at least one frame.

  if (numextra .gt. 0 .and. gbl_frame_cnt .gt. 0) then
    allocate(gbl_my_ep_frame_lst(gbl_frame_cnt), &
             stat = alloc_failed)

    if (alloc_failed .ne. 0) call setup_alloc_error

    num_ints = num_ints + size(gbl_my_ep_frame_lst)
  end if

  my_ep_frame_cnt = 0   ! until proven otherwise...

  ! Set up image division for force calcs.  Images ARE assigned in contiguous
  ! blocks, without wraparound.  For respa runs and minimizations, asymmetric
  ! fft slab load balancing (basically, trying to use as few tasks doing
  ! fft's/recip force) will not be done for various technical reasons.
  !
  ! The decision is made once here and reused for both the image and the atom
  ! division below, so the two can not get out of step.

#ifdef AMOEBA
  ! BUGBUG - For Amoeba pme, we also do even image division at this point in
  !          time, since no loadbalancing is yet in effect.
  even_division = (nrespa .ne. 1 .or. imin .ne. 0 .or. iamoeba .ne. 0)
#else
  even_division = (nrespa .ne. 1 .or. imin .ne. 0)
#endif /* AMOEBA */

  if (even_division) then
    call divide_images_evenly(gbl_img_div_tbl, .true.) ! Equal image division
  else                                                 ! res boundaries ignored.
    call divide_images_recip_biased(fft_workload_estimate, gbl_img_div_tbl, &
                                    .false.)
  end if

  ! Master needs space to receive data for load balancing.  The 5 values
  ! for each task are the "direct force time" due to image nonbonded calcs,
  ! the "reciprocal force time" due to pme nonbonded reciprocal force calcs,
  ! the "bad force time" due to bond-angle-dihedral force calcs, the cit setup
  ! time, and the send_atm_cnts total for each node, which can be used to
  ! determine when to do redistribution of the atom workload.

  if (master) then
    allocate(gbl_loadbal_node_dat(5, 0:numtasks - 1), stat = alloc_failed)
    if (alloc_failed .ne. 0) call setup_alloc_error
    num_ints = num_ints + size(gbl_loadbal_node_dat)
    gbl_loadbal_node_dat(:,:) = 0
  end if

  ! All nodes divide the work in runmd. It is a residue-based division
  ! for constant volume simulations and a molecule-based division for
  ! constant pressure simulations.  The division is map- and list-based, in
  ! order to improve locality.  The division will be redone periodically to
  ! maintain reasonable locality as atoms move.
  
  ! Make the atom to image map:

  call get_fract_crds(atm_cnt, atm_crd, fraction)

  call setup_crd_idx_tbl(atm_cnt, fraction, crd_idx_lst_tbl, atm_lst)

  ! The final flag is .false. for the recip-biased case and .true. otherwise,
  ! matching the image division chosen above.

  call divide_atoms(atm_cnt, fraction, crd_idx_lst_tbl, atm_lst, even_division)

  return

end subroutine do_initial_pme_atom_division

!*******************************************************************************
!
! Internal Subroutine:  do_initial_gb_atom_division
!
! Description:  Determine atom distribution for parallel processing.
!
!*******************************************************************************

subroutine do_initial_gb_atom_division(num_ints, num_reals)

! Purpose:
!   Performs the residue-based division of atoms among the mpi tasks for GB
!   runs, then fills in the per-task atom counts, atom offsets, vector
!   (3 x atom) receive counts and offsets, and this task's atom list.
!
! Arguments:
!   num_ints, num_reals - running allocation counters; num_ints is incremented
!                         by the size of every array allocated here.
!                         Don't zero.

  use gbl_constants_mod
  use inpcrd_dat_mod
  use mdin_ctrl_dat_mod
  use mdin_ewald_dat_mod
  use parallel_dat_mod
  use pmemd_lib_mod
  use prmtop_dat_mod

  implicit none

! Formal arguments:

  integer, intent(in out)       :: num_ints, num_reals

! Local variables:

  integer               :: alloc_failed
  integer               :: lst_idx
  integer               :: tsk
  integer               :: tsk_atm_cnt
  integer               :: my_first_atm_offset

! Sanity checks on the atoms and residues available per processor.  More
! processors than atoms makes no sense; for GB we may get close to one
! processor per residue, so the residue limit is only 1.01 per processor.

! Enough atoms for the number of processors?

  if (natom .lt. 10 * numtasks) then
    write(mdout, '(a,a)') error_hdr, 'Must have 10x more atoms than processors!'
    call mexit(6, 1)
  end if

! Enough residues for the number of processors?

  if (dble(nres) .lt. dble(numtasks) * 1.01d0) then
    write(mdout, '(a,a)') error_hdr, &
                          'Must have 1.01x more residues than processors!'
    call mexit(6, 1)
  end if

! Storage for the tables that describe the atom division.

  allocate(gbl_vec_offsets(0:numtasks), &
           gbl_atm_offsets(0:numtasks), &
           gbl_vec_rcvcnts(0:numtasks), &
           gbl_atm_owner_map(natom), &
           gbl_my_atm_lst(natom), &
           stat = alloc_failed)

  if (alloc_failed .ne. 0) call setup_alloc_error

  num_ints = num_ints + size(gbl_vec_offsets) + &
                        size(gbl_atm_offsets) + &
                        size(gbl_vec_rcvcnts) + &
                        size(gbl_atm_owner_map) + &
                        size(gbl_my_atm_lst)

  ! Residue-based division; on return gbl_vec_rcvcnts holds the number of
  ! atoms owned by each task.

  call gb_atom_division

  my_atm_cnt = gbl_vec_rcvcnts(mytaskid)

  ! One pass over the tasks builds the atom offsets, converts the atom counts
  ! into vector counts (3 values per atom) and builds the vector offsets.

  gbl_atm_offsets(0) = 0
  gbl_vec_offsets(0) = 0

  do tsk = 0, numtasks - 1
    tsk_atm_cnt = gbl_vec_rcvcnts(tsk)
    gbl_atm_offsets(tsk + 1) = gbl_atm_offsets(tsk) + tsk_atm_cnt
    gbl_vec_rcvcnts(tsk) = 3 * tsk_atm_cnt
    gbl_vec_offsets(tsk + 1) = gbl_vec_offsets(tsk) + gbl_vec_rcvcnts(tsk)
  end do

  gbl_vec_rcvcnts(numtasks) = 0

  ! Atoms are owned in contiguous blocks under GB, so the atom list is not
  ! strictly required; it is filled in because the rest of pmemd does not
  ! assume contiguous ownership.

  my_first_atm_offset = gbl_atm_offsets(mytaskid)

  do lst_idx = 1, my_atm_cnt
    gbl_my_atm_lst(lst_idx) = my_first_atm_offset + lst_idx
  end do

  return

contains

!*******************************************************************************
!
! Subroutine:  gb_atom_division
!
! Description:  Assigns whole residues to tasks as evenly as possible and
!               records, for each task, the number of atoms it owns
!               (gbl_vec_rcvcnts) and the owner of each atom
!               (gbl_atm_owner_map).
!
!*******************************************************************************

subroutine gb_atom_division

  implicit none

  integer               :: tsk_id
  integer               :: atm_lo, atm_hi
  integer               :: next_res_id
  integer               :: res_done
  integer               :: res_cum
  integer               :: res_per_task(0:numtasks - 1)
  double precision      :: res_share
  double precision      :: running_target

  ! Every task but the last gets the integer part of the running target minus
  ! what has been handed out already; the last task gets whatever remains.

  res_share = dble(nres) / dble(numtasks)

  running_target = 0.d0
  res_done = 0

  do tsk_id = 0, numtasks - 2
    running_target = running_target + res_share
    res_cum = int(running_target)
    res_per_task(tsk_id) = res_cum - res_done
    res_done = res_cum
  end do

  res_per_task(numtasks - 1) = nres - res_done

  ! Reset the tables that will describe the division:

  gbl_atm_owner_map(:) = -1
  gbl_vec_rcvcnts(:) = 0

  ! Translate residue counts into atom ranges, task by task.  The residue
  ! pointer array gives the first atom of each residue, so the last atom of a
  ! task is one less than the first atom of the next task's first residue.

  next_res_id = 1

  do tsk_id = 0, numtasks - 1
    atm_lo = gbl_res_atms(next_res_id)
    next_res_id = next_res_id + res_per_task(tsk_id)
    atm_hi = gbl_res_atms(next_res_id) - 1
    gbl_vec_rcvcnts(tsk_id) = atm_hi - atm_lo + 1
    gbl_atm_owner_map(atm_lo:atm_hi) = tsk_id
  end do
  
  return

end subroutine gb_atom_division

end subroutine do_initial_gb_atom_division

!*******************************************************************************
!
! Subroutine:  check_atm_division
!
! Description:  Confirm that no residues are split between two molecules.
!               This is intended for use under constant pressure, mpi.
!
!*******************************************************************************

subroutine check_atm_division

! Purpose:
!   Walks every residue and verifies that all of its atoms have the same
!   owner in gbl_atm_owner_map as the first atom of the residue.  On the
!   first mismatch the master reports the offending residue to mdout and
!   calls mexit(6, 1); every other task calls mexit(6, 0).

  use gbl_constants_mod
  use parallel_dat_mod
  use prmtop_dat_mod

  implicit none

! Formal arguments: none.

! Local variables:

  integer       :: ires
  integer       :: iatm
  integer       :: atm_lo
  integer       :: atm_hi
  integer       :: owner_task
  
  residues: do ires = 1, nres

    ! Atom range of this residue, and the task owning its first atom.

    atm_lo = gbl_res_atms(ires)
    atm_hi = gbl_res_atms(ires + 1) - 1

    owner_task = gbl_atm_owner_map(atm_lo)

    atoms: do iatm = atm_lo + 1, atm_hi

      ! Same owner as the first atom: nothing to report.

      if (gbl_atm_owner_map(iatm) .eq. owner_task) cycle atoms

      if (master) then
        write(mdout, '(a,a)') error_hdr, &
          'Bad residue/molecule data in prmtop!'
        write(mdout, '(a,a,i6,a,i7,a,i7,a)') extra_line_hdr, &
          'Residue ', ires, '(atoms ', atm_lo, '-', &
          atm_hi, ') is in multiple molecules.'
        call mexit(6, 1)
      else
        call mexit(6, 0)
      end if

    end do atoms

  end do residues

  return

end subroutine check_atm_division

#endif /* MPI */

end module alltasks_setup_mod
