! $Id$
!
! MODULE_DOC: This module contains Yin-Yang related types and functions
! MODULE_DOC: which are incompatible with FORTRAN 95.
!
!***************************************************************
!
module Yinyang_mpi
!
  use Yinyang
  use Mpicomm
  use Cparam
!
  implicit none

  include 'yinyang_mpi.h'

  private
!
! Variables for z-averages.
!
  integer, dimension(2,3) :: thrange_gap=0
  integer, dimension(3) :: yinprocs=-1
  type(ind_coeffs), dimension(3) :: indweights_zaver_gap
!
  integer, dimension(2) :: thrange_cap=0, capzprocs=-1
  type(ind_coeffs) :: indweights_zaver_cap
!
  integer, dimension(:,:,:), allocatable :: thranges_gap, thranges_cap
  integer :: offset_cap=0, caproot=-1, nycap=0
!
  contains

!*******************************************************************
    subroutine zsum_yy(fnamexy,iname,m,n,a)

      real, dimension(:,:,:), intent(INOUT) :: fnamexy
      integer,                intent(IN)    :: iname,m,n
      real, dimension(:),     intent(IN)    :: a
!
      integer :: i,j,ith

        if (thrange_gap(1,1)>0) then
!
!  collect contributions to the extended phi lines in the gap.
!
          do i=1,3
!
!  Such lines may come from at most 3 different Yin procs.
!
            if (yinprocs(i)==-1) exit

            do j=1,size(indweights_zaver_gap(i)%inds,3)
              ith=indweights_zaver_gap(i)%inds(m,n,j)
              if (ith==0) exit
              fnamexy(iname,:,ith)=fnamexy(iname,:,ith)+a*indweights_zaver_gap(i)%coeffs(m,n,j)
            enddo
          enddo
        endif
!
!  Collect contributions to the phi lines in the cap(s).
!
        if (thrange_cap(1)>0) then
          do j=1,size(indweights_zaver_cap%inds,3)
            ith=indweights_zaver_cap%inds(m,n,j)
            if (ith==0) exit
            fnamexy(iname,:,ith)=fnamexy(iname,:,ith)+a*indweights_zaver_cap%coeffs(m,n,j)
          enddo
        endif

    endsubroutine zsum_yy
!*******************************************************************
    subroutine reduce_zsum(arrm,temp)
!
!  Calculate z-sums (still depending on x and y).
!
!  25-apr-16/MR: outsourced from zaverages_xy
!
      use General, only: find_proc
      use Mpicomm, only: mpireduce_sum, mpisend_real, mpirecv_real, mpiwait, mpibarrier
      use Cdata

      real, dimension(:,:,:), intent(IN) :: arrm
      real, dimension(:,:,:), intent(OUT):: temp

      real, dimension(:,:,:), allocatable :: buffer
!
      integer :: iprocy, iprocz, iproc_yin, iproc_yang, i, offset, len, source, irequest
      integer, dimension(2) :: rng
      integer, dimension(3) :: sz,requests
!
!  Communicate over all processors along z beams.
!  The result is only present on the z-root processors (of Yin grid).
!  On Yang grid, procs (ipx,0,0) and (ipx,nprocy-1,nprocz-1) collect data 
!  for Yin-phi coordinate lines in polar caps.
!
        sz=(/size(arrm,1),size(arrm,2),size(arrm,3)/)
!
!  Summation of local fnamexy arrays (of Yin grid) into fsumxy of z root
!  processors of Yin grid.
!
        if (.not.lyang) call mpireduce_sum(arrm,temp,sz,idir=3)

        if (lyinyang) then
          if (lyang) then
!
!  If processor in Yang grid contributes to any phi coordinate line in gap send
!  data:    
! 
            if (thrange_gap(1,1)>0) then

              offset=0; irequest=0
              do i=1,3              ! These lines can come from at most 3 z root procs Yin grid.
                iproc_yin=yinprocs(i)
                if (iproc_yin==-1) exit
                rng=thrange_gap(:,i)-(thrange_gap(1,i)-1)+offset
                len=rng(2)-rng(1)+1
!
!if (iproc_yin==0 .or. iproc_yin==1) print'(a,3i3,3e12.5)', 'GAP: SEND:
!iproc_world, iproc_yin, len=', iproc_world, iproc_yin, &
!len,
!sum(fnamexy(:,:,rng(1):rng(2))),maxval(fnamexy(:,:,rng(1):rng(2))),minval(fnamexy(:,:,rng(1):rng(2)))
!  Local sum sent to relevant Yin zroot proc (non-blocking).
!
                irequest=irequest+1
!print*, 'iproc_world, iproc_yin,fnamexy=', iproc_world, iproc_yin,
!maxval(fnamexy(:,rng(1):rng(2),:)), minval(fnamexy(:,rng(1):rng(2),:))
                call mpisend_real(arrm(:,:,rng(1):rng(2)),(/sz(1),sz(2),len/), &
                                  iproc_yin,iproc_world,comm=MPI_COMM_WORLD,nonblock=requests(irequest))
                offset=offset+len   !  Update, as contributions for the <=3 procs are stacked in fnamexy.
              enddo
              do i=1,irequest
                call mpiwait(requests(irequest))
              enddo
            endif

            if (lcaproot) then                     ! If proc is cap collector,
              do iprocz=capzprocs(1),capzprocs(2)  ! iterate over all Yang procs
                do iprocy=0,nprocy-1               ! in cap.
                  rng=thranges_cap(:,iprocy,iprocz)
!print*, 'CAP: RECV: iproc, source, rng=', iproc, find_proc(ipx,iprocy,iprocz), rng
                  if (rng(1)==0) cycle
!
!  If processor in Yang grid contributes to any phi coordinate line in cap:
! 
                  source=find_proc(ipx,iprocy,iprocz)
!if (iproc_world==149) print*, 'source, rng:', source, rng
!rng, rng(2)-rng(1)+1
                  if (source==iproc) then          ! no self-communication
                    temp(:,:,rng(1):rng(2))=temp(:,:,rng(1):rng(2))+arrm(:,:,1+offset_cap:rng(2)-rng(1)+1+offset_cap)
                  else                             ! accumulate contribs from all cap procs.            
                    len=rng(2)-rng(1)+1
                    allocate(buffer(sz(1),sz(2),len))
!
!  Receive data from relevant Yang proc (blocking).
!
!print*, 'CAP: RECV: iproc, source, rng=', iproc, source, len
!rng, rng(2)-rng(1)+1
                    call mpirecv_real(buffer,(/sz(1),sz(2),len/),source,source)
                    temp(:,:,rng(1):rng(2)) = temp(:,:,rng(1):rng(2))+buffer
                    deallocate(buffer)
                  endif
                enddo
              enddo
            elseif (thrange_cap(1)>0) then
!
!  If proc is not cap collector, send to it if necessary (blocking).
!
              rng=thrange_cap-(thrange_cap(1)-1)+offset_cap
!print*, 'CAP:SEND,iproc, caproot=', iproc, caproot, rng(2)-rng(1)+1
              call mpisend_real(arrm(:,:,rng(1):rng(2)),(/sz(1),sz(2),rng(2)-rng(1)+1/), & 
                                caproot,iproc)
            endif

          elseif (lfirst_proc_z) then
!
!  Root processors of z beams in Yin grid accumulate contributions from
!  all Yang grid processors in gap.
!
            do iprocz=nprocz/3-1,2*nprocz/3
              do iprocy=0,nprocy-1
!
!  Range of theta of phi coordinate lines to which processor (ipx,iprocy,iprocz)
!  contributes.
! 
                rng=thranges_gap(:,iprocy,iprocz)
                if (rng(1)>0) then                                        ! if range non-empty
                  len=rng(2)-rng(1)+1
                  allocate(buffer(sz(1),sz(2),len))
                  iproc_yang=find_proc(ipx,iprocy,iprocz)+ncpus           ! world rank of source proc
                  call mpirecv_real(buffer,(/sz(1),sz(2),len/),iproc_yang,iproc_yang,comm=MPI_COMM_WORLD)
!if (iproc_world==0 .or. iproc_world==1) print'(a,3i3,3e12.5)', 'GAP: RECV:
!iproc_world, iproc_yang, len=', iproc_world, &
!iproc_yang, len, sum(buffer),maxval(buffer), minval(buffer)
!print*, 'iproc_world, iproc_yang,buffer=', iproc_world, iproc_yang,
!maxval(buffer), minval(buffer)
                  temp(:,:,rng(1):rng(2)) = temp(:,:,rng(1):rng(2))+buffer
                  deallocate(buffer)
                endif

              enddo
            enddo
          endif

          call mpibarrier
        endif
!
    endsubroutine reduce_zsum
!***********************************************************************
    subroutine initialize_zaver_yy(nlines,nycap_)
!
!  Initializations for calculation of z-sums on Yin-Yang grid.
!  Only valid for equidistant grid in y and z.
!  Returns total number of Yin-phi coordinate lines which cross the executing
!  proc.
!  Can be lines in the "gap" of the Yin grid phi<= Pi/4, phi>=7Pi/4
!  or lines which lie completely within Yang grid, then in the "polar caps"
!  theta<=Pi/4, theta>=3Pi/4. Note that executing proc can have a share of both
!  types.
!
!   11-mar-16/MR: coded
!
      use General, only: indgen, yy_transform_strip_other, find_proc, itoa
      use Mpicomm, only: mpireduce_sum_int, mpisend_int, mpirecv_int, mpiwait, mpibarrier
      use Messages, only: warning
      use Cdata

      integer, intent(INOUT):: nlines
      integer, intent(OUT)  :: nycap_

      type(ind_coeffs) :: intcoeffs
      real, dimension(:,:,:), allocatable :: thphprime
      real, dimension(:), allocatable :: yloc, zloc
      integer, dimension(:), allocatable :: requests
      integer :: nok, noks, noks_all, nyl, nzl, iprocy, iprocz, ifound, request, &
                 iproc_yin, iproc_yang, newlines, offset, nlines_tot, irequest
      integer, dimension(2) :: rng
      logical :: lwith_ghosts
      logical, save :: lcalled=.false.

      if (lcalled) then
        nycap_=nycap
        return                    ! routine has already been called
      else
        lcalled=.true.
      endif

      nzgrid_eff=4./3.*nzgrid     ! total number of points on full Yin-phi coordinate line.
                                  ! could be reduced in polar caps.
      if (lyang) then

        lwith_ghosts = nlines==my
        noks=0

        if (ipz>=nprocz/3-1 .and. ipz<=2*nprocz/3) then
!
!  These procs may see phi coordinate lines which continue from Yin grid into
!  the gap.
!
          nzl=nzgrid/3            ! number of points in gap.
          allocate(thphprime(2,nlines,nzl),yloc(nlines),zloc(nzl))

          yloc=xyz0(2)+(indgen(nlines)-1)*dy      ! valid only for equidistant grids!
          if (lwith_ghosts) yloc=yloc-nghost*dy   !    ~
          zloc=xyz1(3)+indgen(nzl)*dz

          nlines_tot=nlines*nprocy*nzl

          nlines=0
          ifound=0; offset=0
!
!  Loop over y-beam of processors with iprocz=0 in Yin grid.
!
          do iprocy=0,nprocy-1
!  
            iproc_yin=find_proc(ipx,iprocy,0)
!
!  yloc x zloc: strip of Yin-phi coordinate lines from Yin-proc iproc_yin.
!
            call yy_transform_strip_other(yloc,zloc,thphprime)
            nok=prep_interp(thphprime,intcoeffs,iyinyang_intpol_type,th_range=thrange_gap(:,ifound+1))
!
!  nok: number of points of the strip claimed by executing proc; 
!  intcoeffs: interpolation data for these points; thrange_gap: y-range of
!  claimed points,
!  indices refer to local numbering of proc iproc_yin
!            
            if (nok>0) then

              ifound=ifound+1
              newlines=thrange_gap(2,ifound)-thrange_gap(1,ifound)+1
              nlines=nlines+newlines         ! accumulate detected lines.
              yinprocs(ifound)=iproc_yin     ! store Yin-proc from which detected lines emanate.
              noks=noks+nok                  ! accumulate claimed points.
!
!  Derive from interpolation data to which phi-coordinate lines a point (m,n)
!  contributes with which weight.
!
              call coeffs_to_weights(intcoeffs,indweights_zaver_gap(ifound))
              where (indweights_zaver_gap(ifound)%inds>0) &
                indweights_zaver_gap(ifound)%inds=indweights_zaver_gap(ifound)%inds-(thrange_gap(1,ifound)-1)+offset
!
! Entries in indweights_zaver_gap%inds now refer to number of line in local
! fnamexy.    
!
!print*, 'GAP: iproc_world, ifound, thrange_gap=', iproc_world, ifound,
!thrange_gap(:,ifound)                     
!if (ipz==nprocz/3.and.ipy==0)
!print*,'iproc_world,ifound,indweights_zaver_gap=', indweights_zaver_gap(ifound)

              rng=thrange_gap(:,ifound)
              offset=offset+newlines         ! offset of line number for proc iproc_yin in local fnamexy.

            else
              rng=(/0,0/)
            endif
!print*, 'SEND: iproc_yin, iproc_world=', iproc_yin, iproc_world
!  Tell z-root proc of Yin grid, which of its phi coordinate lines are detected
!  within executing proc (maybe none).
!
            call mpisend_int(rng,2,iproc_yin,iproc_world,MPI_COMM_WORLD)

            if (ifound>3) stop               ! lines from at most 3 Yin procs expected.
            yloc=yloc+Lxyz_loc(2)+dy/nprocy  ! shift y coordinates to next z-root proc of Yin grid.

          enddo

          offset_cap=nlines                  ! total number of detected gap lines = offset for cap lines.
          deallocate(thphprime,yloc,zloc)
!print*, 'GAP: iproc_world,yinprocs=', iproc_world,yinprocs
        endif

        call mpireduce_sum_int(noks, noks_all) ! sum up number of claimed gap points over Yang grid.
        if (iproc==0.and.noks_all<nlines_tot) &
          call warning('initialize_zaver_yy',  &
                       'only'//itoa(noks_all)//' points in Yin grid gap claimed by Yang procs (goal:' &
                       //itoa(nlines_tot)//')')

        if (ipz<=nprocz/3 .or. ipz>=2*nprocz/3-1) then
!
!  These procs may see Yin-phi coordinate lines which lie completely inside Yang
!  grid that is, in the polar caps.
!
          nycap=nygrid/2.-1                   ! number of Yin-phi coordinate lines in polar cap
                                              ! could be reduced for bigger pole
                                              ! distance of closest line.
          if (lwith_ghosts) nycap=nycap+nghost

          allocate(thphprime(2,nycap,nzgrid_eff),yloc(nycap),zloc(nzgrid_eff))

          zloc=indgen(nzgrid_eff)*dz
!if (iproc==0) print*, 'zloc=', zloc(1), zloc(nzgrid_eff)
          if (ipz<=nprocz/3) then
!
! Southern cap
!
            yloc=xyz1(2)+indgen(nycap)*dy
            if (lwith_ghosts) yloc=yloc-dy*nghost

            caproot=find_proc(ipx,0,0)                 ! "cap collector" has lowest rank in layer ipx.
            capzprocs=(/0,nprocz/3/)                   ! range of z ranks in cap.      
!if (iproc==caproot)print*, 'South: yloc=', yloc

          elseif (ipz>=2*nprocz/3-1) then
!
! Northern cap
!
            yloc=xyz0(2)-(nycap+1-indgen(nycap))*dy
            if (lwith_ghosts) yloc=yloc+dy*nghost

            caproot=find_proc(ipx,nprocy-1,nprocz-1)   ! "cap collector" has highest rank in layer ipx.
            capzprocs=(/2*nprocz/3-1,nprocz-1/)        ! range of z ranks in cap.     
!if (iproc==caproot) print*, 'North: yloc=', yloc
          endif
!if (iproc==caproot) print*, 'iproc, yloc=', iproc, yloc(1), yloc(nycap)
!
!  yloc x zloc: strip of all Yin-phi coordinate lines in cap.
!
          call yy_transform_strip_other(yloc,zloc,thphprime)
          nok=prep_interp(thphprime,intcoeffs,iyinyang_intpol_type,th_range=thrange_cap)
!
!  nok: number of points of the strip claimed by executing proc; 
!  intcoeffs: interpolation data for these points; thrange_cap: y-range of
!  claimed points,
!  indices refer to bunch of all Yin-phi coordinate lines in cap.
!            
          if (nok>0) then

            nlines=nlines+thrange_cap(2)-thrange_cap(1)+1     ! accumulate detected lines.            
!
!  Derive from interpolation data to which phi-coordinate lines a point (m,n)
!  contributes with which weight.
!
            call coeffs_to_weights(intcoeffs,indweights_zaver_cap)
!print*, 'CAP: iproc_world, thrange_cap=', iproc_world, thrange_cap, &
!minval(indweights_zaver_cap%inds), maxval(indweights_zaver_cap%inds)
            where (indweights_zaver_cap%inds>0) &
              indweights_zaver_cap%inds=indweights_zaver_cap%inds-(thrange_cap(1)-1)+offset_cap
!
! Entries in indweights_zaver_cap%inds now refer to number of line in local
! fnamexy.    
!
          endif

          if (iproc==caproot) then      ! if proc is cap collector
            lcaproot=.true.
            allocate(thranges_cap(2,0:nprocy-1,capzprocs(1):capzprocs(2)))
!if (iproc==caproot) print*,'iproc_world, nycap=', iproc_world, nycap
            allocate(requests(nprocy*(capzprocs(2)-capzprocs(1)+1)-1))
!
!  Cap collector receives from each Yang procs in cap, which of the cap lines
!  are detected
!  within proc (maybe none) and stores this in thranges_cap..
!
            irequest=1
            do iprocz=capzprocs(1),capzprocs(2)
              do iprocy=0,nprocy-1
                ipp=find_proc(ipx,iprocy,iprocz)
                if (iproc==ipp) then            ! no self-communication
                  thranges_cap(:,iprocy,iprocz)=thrange_cap
                else
                  call mpirecv_int(thranges_cap(:,iprocy,iprocz),2,ipp,ipp, &
                                   nonblock=requests(irequest))
                  irequest=irequest+1
                endif
              enddo
            enddo

            do irequest=1,size(requests)
              call mpiwait(requests(irequest))
            enddo
          else
!
!  Tell cap collector, which cap lines are detected
!  within executing proc (maybe none).
!
            call mpisend_int(thrange_cap,2,caproot,iproc)
          endif
!print*, 'iproc_world, offset_cap,nlines=', iproc_world, offset_cap,nlines
        endif

        call mpireduce_sum_int(nok, noks_all)  ! sum up number of claimed cap points (both caps) over Yang.
        if (iproc==0.and.noks_all<2*nycap*nzgrid_eff) &
          call warning('initialize_zaver_yy',  &
                       'only'//itoa(noks_all)//' points in polar caps claimed by Yang procs (goal:' &          
                       //itoa(2*nycap*nzgrid_eff)//')')

        nycap_=nycap

      elseif (lfirst_proc_z) then
!
!  Yin z-beam root proc receives from each Yang procs in gap, which of the gap
!  lines are detected
!  within proc (maybe none) and stores this in thranges_gap..
!
        allocate(thranges_gap(2,0:nprocy-1,nprocz/3-1:2*nprocz/3))
        allocate(requests(nprocy*(nprocz/3+2)))
        irequest=1
        do iprocz=nprocz/3-1,2*nprocz/3
          do iprocy=0,nprocy-1
            iproc_yang=find_proc(ipx,iprocy,iprocz)+ncpus
!print*, 'RECV: iproc_yang, iproc_world=', iproc_yang, iproc_world
            call mpirecv_int(thranges_gap(:,iprocy,iprocz),2,iproc_yang,iproc_yang, &
                             MPI_COMM_WORLD,nonblock=requests(irequest))
            irequest=irequest+1
          enddo
        enddo

        do irequest=1,size(requests)
          call mpiwait(requests(irequest))
        enddo

      endif

      call mpibarrier

    endsubroutine initialize_zaver_yy
!***********************************************************************
end module