! This program shows how you might implement an MPI_Alltoallv in
! the routine Myall2allv.  Myalltoallv is fairly efficent in that
! it does the sends/recs in orders so that there is less network
! contention.  See the class notes.
!
! Each processor send/rec a different and random amount of data
! to/from other processors.  We use MPI_Alltoall to tell how much
! data is going to be sent.
! 
module mpi
      include 'mpif.h'
end module
module global
	integer numnodes,myid,mpi_err
	integer, parameter :: mpi_root=0
end module
subroutine init
    use mpi
    use global
    implicit none
! do the mpi init stuff
    call MPI_INIT( mpi_err )
    call MPI_COMM_SIZE( MPI_COMM_WORLD, numnodes, mpi_err )
    call MPI_Comm_rank(MPI_COMM_WORLD, myid, mpi_err)
end subroutine init

program test1
! poe a.out -procs 3 -rmpool 1
	use mpi
	use global
	implicit none
	integer, allocatable :: sray(:),rray(:)
	integer, allocatable :: sdisp(:),scounts(:),rdisp(:),rcounts(:)
	integer ssize,rsize,i,k,j
	real z
	call init	
! counts and displacement arrays
	allocate(scounts(0:numnodes-1))
	allocate(rcounts(0:numnodes-1))
	allocate(sdisp(0:numnodes-1))
	allocate(rdisp(0:numnodes-1))
! set counts to send to other processors
    call seed_random
	do i=0,numnodes-1
		call random_number(z)
		scounts(i)=nint(10.0*z)+1
	enddo
	write(*,*)"myid= ",myid," scounts= ",scounts
	call MPI_alltoall(	scounts,1,MPI_INTEGER, &
						rcounts,1,MPI_INTEGER, &
	                 	MPI_COMM_WORLD,mpi_err)
	write(*,*)"myid= ",myid," rcounts= ",rcounts
! calculate displacements and the size of the arrays
	sdisp(0)=0
	do i=1,numnodes-1
		sdisp(i)=scounts(i-1)+sdisp(i-1)
	enddo
	rdisp(0)=0
	do i=1,numnodes-1
		rdisp(i)=rcounts(i-1)+rdisp(i-1)
	enddo
	ssize=sum(scounts)
	rsize=sum(rcounts)
! allocate send and rec arrays
	allocate(sray(0:ssize-1))
	allocate(rray(0:rsize-1))
	sray=myid
! send/rec different amounts of data to/from each processor 
	call myall2allv(	sray,scounts,sdisp,MPI_INTEGER, &
						rray,rcounts,rdisp,MPI_INTEGER, &
	                 	MPI_COMM_WORLD,mpi_err)
	                
	write(*,*)"myid= ",myid,"    rray= ",rray
	call mpi_finalize(mpi_err)
end program

!typical output
! myid=  0  scounts=  1 5 2
! myid=  0  rcounts=  1 1 1
! myid=  0     rray=  0 1 2
  
! myid=  1  scounts=  1 7 3
! myid=  1  rcounts=  5 7 5
! myid=  1     rray=  0 0 0 0 0 1 1 1 1 1 1 1 2 2 2 2 2
  
! myid=  2  scounts=  1 5 10
! myid=  2  rcounts=  2 3 10
! myid=  2     rray=  0 0 1 1 1 2 2 2 2 2 2 2 2 2 2

              
subroutine seed_random
	use global
	implicit none
	integer the_size,j
	integer, allocatable :: seed(:)
	real z
	call random_seed(size=the_size) ! how big is the intrisic seed? 
    allocate(seed(the_size))        ! allocate space for seed 
    do j=1,the_size                 ! create the seed 
    	seed(j)=abs(myid*10)+(j*myid*myid)+100  ! abs is generic 
    enddo 
    call random_seed(put=seed)      ! assign the seed 
    deallocate(seed)
end subroutine
subroutine myall2allv(sendbuf,sendcounts,sdispls,MPI_S_TYPE,&
                      recvbuf,recvcounts,rdispls,MPI_R_TYPE,&
                      comm,ierr)
  use mpi

  integer sendbuf(*),recvbuf(*)
  integer sendcounts(0:*),sdispls(0:*),recvcounts(0:*),rdispls(0:*)
  integer comm,ierr

  integer myid,numnodes,status(MPI_STATUS_SIZE)
  integer i,n2,xchng,my_tag

  call MPI_COMM_RANK( comm, myid, ierr )
  call MPI_COMM_SIZE( comm, numnodes, ierr )
  my_tag=4925

! find the power of two >= numnodes
  n2=nint(log(float(numnodes))/log(2.0))
  if(2**n2 < numnodes)n2=n2+1
  n2=2**n2
  n2=n2-1
  do i=1,n2
! do xor to find the processor xchng
      xchng=ieor(i,myid)
      if(xchng .le. (numnodes-1))then
          if(myid < xchng)then
              call MPI_SEND(sendbuf(sdispls(xchng)+1),sendcounts(xchng),&
                  MPI_S_TYPE,xchng,my_tag,comm,ierr)
              call MPI_RECV(recvbuf(rdispls(xchng)+1),recvcounts(xchng),&
                  MPI_R_TYPE,xchng,my_tag,comm,status,ierr)
          else
              call MPI_RECV(recvbuf(rdispls(xchng)+1),recvcounts(xchng),&
                  MPI_R_TYPE,xchng,my_tag,comm,status,ierr)
              call MPI_SEND(sendbuf(sdispls(xchng)+1),sendcounts(xchng),&
                  MPI_S_TYPE,xchng,my_tag,comm,ierr)
          endif
      endif
  enddo
! exchange data with myself
  do i=1,sendcounts(myid)
        recvbuf(rdispls(myid)+i)=sendbuf(sdispls(myid)+i)
  enddo
  return
end subroutine myall2allv

