#include "cppdefs.h"
#define SUM_BY_PAIRS
#define  ETALON_CHECK
 
      subroutine diag (tile)
      implicit none
      integer tile
#include "param.h"
#include "private_scratch.h"
#include "compute_tile_bounds.h"
      call diag_tile (istr,iend,jstr,jend,  A2d(1,1), A2d(1,2),
     &                                                 A2d(1,3)
#ifdef SOLVE3D
     &       , A2d(1,4), A2d(1,5), A2d(1,6), A2d(1,7), A2d(1,8)
#endif
     &                                                        )
      return
      end
 
      subroutine diag_tile (istr,iend,jstr,jend, dVol, ke,pe
#ifdef SOLVE3D
     &                             , ke2b, ke3bc, kesrf, ub,vb
#endif
     &                                                        )
      implicit none
      integer istr,iend,jstr,jend, i,j,k, nsubs, ierr, ie
#include "param.h"
      real dVol(PRIVATE_2D_SCRATCH_ARRAY),  dA,   v2,
     &       ke(PRIVATE_2D_SCRATCH_ARRAY),  cff,  v2bc,
     &       pe(PRIVATE_2D_SCRATCH_ARRAY),  my_v2d_max
#ifdef SOLVE3D
      real ke2b(PRIVATE_2D_SCRATCH_ARRAY),
     &    ke3bc(PRIVATE_2D_SCRATCH_ARRAY),  my_v3d_max,
     &    kesrf(PRIVATE_2D_SCRATCH_ARRAY),  my_v3bc_max,
     &       ub(PRIVATE_2D_SCRATCH_ARRAY),
     &       vb(PRIVATE_2D_SCRATCH_ARRAY)
#endif
      real*QUAD my_avzeta, my_ke, my_pe, my_ke2b, my_ke3bc, my_kesrf
      logical lsumX, lsumY
      integer max_check_line, inc,jnc
      parameter (max_check_line=128)
      character check_line*(max_check_line)
#ifdef MPI
# include "mpif.h"
      integer size, step, status(MPI_STATUS_SIZE)
      real*QUAD buff(16)
      common /xyz/ buff
#endif
#include "grid.h"
#ifdef SOLVE3D
# include "ocean3d.h"
#else
# include "ocean2d.h"
#endif
#include "scalars.h"
 
#ifdef ETALON_CHECK
      integer ncheck, nparam
      parameter (ncheck=16, nparam=8)
      integer icheck, check_point(ncheck)
      character*(max_check_line) etalon_line(ncheck)
      real    A0(nparam), A1(nparam)
      integer P0(nparam), P1(nparam)
 
      do icheck=1,ncheck          ! reset checklines to all-blanc
        check_point(icheck)=-1    ! status, before filling them with
        etalon_line(icheck)=' '   ! meaningful values. This is needed
      enddo                       ! because some of them may be left
                                  ! uninitialized.
# if defined SOLITON
#  include "etalon_data.SOLITON"
# elif defined SEAMOUNT
#  include "etalon_data.SEAMOUNT"
# elif defined UPWELLING
#  include "etalon_data.UPWELLING"
# elif defined DAMEE_B
#  include "etalon_data.DAMEE_B"
# elif defined USWEST
#  include "etalon_data.USWEST"
# endif
#endif
!
! Compute and report various diagnostics: volume-averaged kinetic and
! potential energy, horizontally averaged free-surface perturbation;
! maximum velocity, etc. Since this operation involves computation of
! global sums, it is done in three stages: at first, summation within
! the tile [subdomain of indices (istr:iend,jstr:jend)] independently
! by individual threads. In the case of three dimensions also perform
! verical summation at this stage; then summation accross the threads
! of the same MPI process (if any), and, finally, MPI reduction to
! compute global integrals.
!
      if (mod(iic-1,ninfo) .eq. 0) then
        my_v2d_max=0.
#ifdef SOLVE3D
        my_v3d_max=0.
        my_v3bc_max=0.
        do j=jstr,jend+1
          do i=istr,iend+1
            ub(i,j)=(Hz(i,j,N)+Hz(i-1,j,N))*u(i,j,N,nstp)
            vb(i,j)=(Hz(i,j,N)+Hz(i,j-1,N))*v(i,j,N,nstp)
          enddo
          do k=N-1,2,-1
            do i=istr,iend+1
              ub(i,j)=ub(i,j)+(Hz(i,j,k)+Hz(i-1,j,k))*u(i,j,k,nstp)
              vb(i,j)=vb(i,j)+(Hz(i,j,k)+Hz(i,j-1,k))*v(i,j,k,nstp)
            enddo
          enddo
          do i=istr,iend+1
            ub(i,j)=(ub(i,j)+(Hz(i,j,1)+Hz(i-1,j,1))*u(i,j,1,nstp))
     &           /(z_w(i,j,N)+z_w(i-1,j,N)-z_w(i,j,0)-z_w(i-1,j,0))
            vb(i,j)=(vb(i,j)+(Hz(i,j,1)+Hz(i,j-1,1))*v(i,j,1,nstp))
     &           /(z_w(i,j,N)+z_w(i,j-1,N)-z_w(i,j,0)-z_w(i,j-1,0))
          enddo
        enddo
 
        cff=g/rho0
        do j=jstr,jend
          do i=istr,iend
            v2=0.5*(ub(i,j)**2+ub(i+1,j)**2 +vb(i,j)**2+vb(i,j+1)**2)
            my_v2d_max=max(my_v2d_max, v2)
 
            ke(i,j)=0.
            pe(i,j)=0.5*g*z_w(i,j,N)*z_w(i,j,N)
 
            ke2b(i,j)=0.5*(z_w(i,j,N)-z_w(i,j,0))*v2
            ke3bc(i,j)=0.
            kesrf(i,j)=0.5*( u(i,j,N,nstp)**2 + u(i+1,j,N,nstp)**2
     &                      +v(i,j,N,nstp)**2 + v(i,j+1,N,nstp)**2)
          enddo
          do k=N,1,-1
            do i=istr,iend
              v2=0.5*( u(i,j,k,nstp)**2 + u(i+1,j,k,nstp)**2
     &                +v(i,j,k,nstp)**2 + v(i,j+1,k,nstp)**2)
 
              v2bc=0.5*( (u(i,j,k,nstp)-ub(i,j))**2
     &                  +(u(i+1,j,k,nstp)-ub(i+1,j))**2
     &                  +(v(i,j,k,nstp)-vb(i,j))**2
     &                  +(v(i,j+1,k,nstp)-vb(i,j+1))**2)
 
              my_v3d_max  = max(my_v3d_max,  v2)
              my_v3bc_max = max(my_v3bc_max, v2bc)
 
              ke(i,j)=ke(i,j) + 0.5*v2*Hz(i,j,k)
 
              pe(i,j)=pe(i,j) + cff*Hz(i,j,k)
# ifdef SPLIT_EOS
     &      *(rho1(i,j,k)+qp1(i,j,k)*(z_w(i,j,N)-z_r(i,j,k)))
# else
     &                                            *rho(i,j,k)
# endif
     &                              *(z_r(i,j,k)-z_w(i,j,0))
 
              ke3bc(i,j)=ke3bc(i,j) + 0.5*v2bc*Hz(i,j,k)
            enddo
          enddo
          do i=istr,iend
# ifdef MASKING
            dA=rmask(i,j)/(pm(i,j)*pn(i,j))
# else
            dA=1./(pm(i,j)*pn(i,j))
# endif
            dVol(i,j) = dA * z_w(i,j,N)
            ke(i,j)   = dA * ke(i,j)
            pe(i,j)   = dA * pe(i,j)
            ke2b(i,j) = dA * ke2b(i,j)
            ke3bc(i,j)= dA * ke3bc(i,j)
            kesrf(i,j)= dA * kesrf(i,j)
          enddo
        enddo
#else
        cff=0.5*g
        do j=jstr,jend
          do i=istr,iend
            v2=0.5*( ubar(i,j,kstp)**2+ubar(i+1,j,kstp)**2
     &              +vbar(i,j,kstp)**2+vbar(i,j+1,kstp)**2)
            my_v2d_max=max(my_v2d_max, v2)
# ifdef MASKING
            dA=rmask(i,j)/(pm(i,j)*pn(i,j))
# else
            dA=1./(pm(i,j)*pn(i,j))
# endif
            dVol(i,j) = dA*zeta(i,j,kstp)
            pe(i,j)   = dVol(i,j)*cff*zeta(i,j,kstp)
            ke(i,j)   = dA * 0.5*(zeta(i,j,kstp)+h(i,j))*v2
          enddo
        enddo
#endif
                                        ! Horizontal summation within
                                        ! the subdomain.
#define SUM_BY_PAIRS
#ifdef SUM_BY_PAIRS
        lsumX=.true.                    ! Compute partial sums within
        lsumY=.true.                    ! the subdomain tile using
        inc=1                           ! split-directional reduction
        jnc=1                           ! by pairs algorithm to avoid
                                        ! accumulation of roundoff
        do while (lsumX .or. lsumY)     ! errors.
          if (istr.le.iend-inc) then
            do j=jstr,jend
              do i=istr,iend-inc,2*inc
                 dVol(i,j) =  dVol(i,j) +  dVol(i+inc,j)
                   ke(i,j) =    ke(i,j) +    ke(i+inc,j)
                   pe(i,j) =    pe(i,j) +    pe(i+inc,j)
# ifdef SOLVE3D
                 ke2b(i,j) =  ke2b(i,j) +  ke2b(i+inc,j)
                ke3bc(i,j) = ke3bc(i,j) + ke3bc(i+inc,j)
                kesrf(i,j) = kesrf(i,j) + kesrf(i+inc,j)
# endif
              enddo
            enddo
            inc=2*inc
          else
            lsumX=.false.
          endif
          if (jstr.le.jend-jnc) then
            do j=jstr,jend-jnc,2*jnc
              do i=istr,iend
                 dVol(i,j) =  dVol(i,j) +  dVol(i,j+jnc)
                   ke(i,j) =    ke(i,j) +    ke(i,j+jnc)
                   pe(i,j) =    pe(i,j) +    pe(i,j+jnc)
# ifdef SOLVE3D
                 ke2b(i,j) =  ke2b(i,j) +  ke2b(i,j+jnc)
                ke3bc(i,j) = ke3bc(i,j) + ke3bc(i,j+jnc)
                kesrf(i,j) = kesrf(i,j) + kesrf(i,j+jnc)
# endif
              enddo
            enddo
            jnc=2*jnc                   ! Code segment below: two-
          else                          ! stage straight summation
            lsumY=.false.               ! algorithm with moderate
          endif                         ! effort to counter roundoff
        enddo                           ! errors.
#else
        do j=jstr+1,jend
          do i=istr,iend
            dVol(i,jstr)  = dVol(i,jstr)  + dVol(i,j)
            ke(i,jstr)    = ke(i,jstr)    + ke(i,j)
            pe(i,jstr)    = pe(i,jstr)    + pe(i,j)
# ifdef SOLVE3D
            ke2b(i,jstr)  = ke2b(i,jstr)  + ke2b(i,j)
            ke3bc(i,jstr) = ke3bc(i,jstr) + ke3bc(i,j)
            kesrf(i,jstr) = kesrf(i,jstr) + kesrf(i,j)
# endif
          enddo
        enddo
        do i=istr+1,iend
          dVol(istr,jstr)  = dVol(istr,jstr)  + dVol(i,jstr)
          ke(istr,jstr)    = ke(istr,jstr)    + ke(i,jstr)
          pe(istr,jstr)    = pe(istr,jstr)    + pe(i,jstr)
# ifdef SOLVE3D
          ke2b(istr,jstr)  = ke2b(istr,jstr)  + ke2b(i,jstr)
          ke3bc(istr,jstr) = ke3bc(istr,jstr) + ke3bc(i,jstr)
          kesrf(istr,jstr) = kesrf(istr,jstr) + kesrf(i,jstr)
# endif
        enddo
#endif
 
        my_avzeta=dVol(istr,jstr)
        my_ke=ke(istr,jstr)
        my_pe=pe(istr,jstr)
#ifdef SOLVE3D
        my_ke2b=ke2b(istr,jstr)
        my_ke3bc=ke3bc(istr,jstr)
        my_kesrf=kesrf(istr,jstr)
#endif
        if (SINGLE_TILE_MODE) then
          nsubs=1
        else
          nsubs=NSUB_X*NSUB_E
        endif
!
! Perform global summation: whoever gets first to the critical region
! resets global sums before global summation starts; after the global
! summation is completed, thread, which is the last one to enter the
! critical region, finalizes the computation of diagnostics and
! prints them out.
!
C$OMP CRITICAL (diag_cr_rgn)
        if (tile_count.eq.0) then
          avzeta=my_avzeta              ! Initialize global sums
          avke=my_ke                    ! for multithreaded shared
          avpe=my_pe                    ! memory summation.
          v2d_max=my_v2d_max
#ifdef SOLVE3D
          avke2b=my_ke2b
          avke3bc=my_ke3bc
          avkesrf=my_kesrf
          v3d_max=my_v3d_max
          v3bc_max=my_v3bc_max
#endif
        else                            ! Perform global summation
          avzeta=avzeta+my_avzeta       ! among the threads within
          avke=avke+my_ke               ! each MPI process.
          avpe=avpe+my_pe
          v2d_max=max(v2d_max, my_v2d_max)
#ifdef SOLVE3D
          avke2b=avke2b+my_ke2b
          avke3bc=avke3bc+my_ke3bc
          avkesrf=avkesrf+my_kesrf
          v3d_max=max(v3d_max, my_v3d_max)
          v3bc_max=max(v3bc_max,my_v3bc_max)
#endif
        endif
 
        tile_count=tile_count+1         ! This counter identifies
                                        ! the last thread, whoever
        if (tile_count.eq.nsubs) then   ! it is, not always master.
          tile_count=0
#ifdef MPI
          if (NNODES.gt.1) then         ! Perform global summation
            size=NNODES                 ! among MPI processes
   1         step=(size+1)/2
              if (mynode.ge.step .and. mynode.lt.size) then
                buff(1)=avzeta
                buff(2)=avke            ! This is MPI_Reduce
                buff(3)=avpe            ! operation. Note that
                buff(4)=avke2b          ! because buff() may be
                buff(5)=avke3bc         ! real*16, the size of
                buff(6)=avkesrf
                buff(7)=v2d_max         ! message in MPI_send
                buff(8)=v3d_max         ! is doubled.
                buff(9)=v3bc_max
 
!>         write(*,*) 'sending ', mynode, '-->', mynode-step
 
                call MPI_Send (buff,  18, MPI_DOUBLE_PRECISION,
     &               mynode-step, 17, MPI_COMM_WORLD,      ierr)
              elseif (mynode .lt. size-step) then
                call MPI_Recv (buff,  18, MPI_DOUBLE_PRECISION,
     &            mynode+step, 17, MPI_COMM_WORLD, status, ierr)
 
!<         write(*,*) 'received ',  mynode, '<--', mynode+step
 
                avzeta=avzeta+buff(1)
                avke=avke+buff(2)
                avpe=avpe+buff(3)
                avke2b=avke2b+buff(4)
                avke3bc=avke3bc+buff(5)
                avkesrf=avkesrf+buff(6)
                v2=buff(7)
                v2d_max=max(v2d_max, v2)
                v2=buff(8)
                v3d_max=max(v3d_max, v2)
                v2bc=buff(9)
                v3bc_max=max(v3bc_max, v2bc)
              endif
             size=step
            if (size.gt.1) goto 1
          endif
          if (mynode.eq.0) then
#endif
            avke=avke/(volume+avzeta)       ! Compute and print out
            avpe=avpe/(volume+avzeta)       ! global diagnostics
            avkp=avke+avpe                  ! (last thread of master
            avke2b=avke2b/(volume+avzeta)   ! MPI process only).
            avke3bc=avke3bc/(volume+avzeta)
            avkesrf=avkesrf/area
            v2d_max=sqrt(v2d_max)
            v3d_max=sqrt(v3d_max)
            v3bc_max=sqrt(v3bc_max)
 
#ifdef SEAMOUNT
            if (first_time.eq.0) then
              first_time=1
              write(stdout,2) 'STEP', 'time[DAYS]', 'KINETIC_ENRG',
     &                        'BAROTR_ENRG', 'MAX_U3D', 'MAX_UBAR',
     &                        'MAX_UBCL', 'POTEN_ENRG'
C$   &                                                     , 'trd'
  2           format(1x,A,3x,A,1x,A,4x,A,4x,A,7x,A,6x,A,6x,A,4x,A)
            endif
            write(check_line,3)  avke, avke2b, v3d_max, v2d_max,
     &                                             v3bc_max, avpe
C$   &                                                   , proc(2)
  3         format(1PE16.10, 1PE16.9, 3(1PE15.8), 1PE16.8,I3)
#else
            if (first_time.eq.0) then
              first_time=1
              write(stdout,'(1x,A,3x,A,1x,A,4x,A,4x,A,4x,A,1x,A)')
     &                 'STEP',       'time[DAYS]', 'KINETIC_ENRG',
     &                 'SURFACE_KE', 'POTEN_ENRG', 'FREE_SURFACE'
C$   &                                                      ,'trd'
            endif
            write(check_line,'(1PE16.10,3(1PE15.8),I3)')
     &                   avke, avkesrf, avpe, avzeta/area !! volume
C$   &                                                   , proc(2)
#endif
            ie=max_check_line
            do while (check_line(ie:ie).eq.' ' .and. ie.gt.0)
              ie=ie-1
            enddo                                 ! Suppress FORTRAN
            i=0                                   ! floating point Es
            do while (i.lt.ie)                    ! to shorten the
              i=i+1                               ! diagnostic line.
              if (check_line(i:i).eq.'E' .or.
     &            check_line(i:i).eq.'e') then
                check_line(i:ie-1)=check_line(i+1:ie)
                check_line(ie:ie)=' '
                ie=ie-1
              elseif (ichar(check_line(i:i)).lt.48 .or.
     &                ichar(check_line(i:i)).gt.57) then
                if (check_line(i:i).ne.' ' .and.
     &              check_line(i:i).ne.'+'  .and. ! Set may_day_flag
     &              check_line(i:i).ne.'-'  .and. ! to terminate the
     &              check_line(i:i).ne.'.') then  ! run in the case
                  may_day_flag=1                  ! of floating point
                endif                             ! exception
              endif
            enddo
            write(stdout,'(I6,F12.5,1x,A)') iic-1,tdays,
     &                                      check_line(1:ie)
 
#ifdef  ETALON_CHECK
# define WRITE_ETALON
!
! The following segment performs comparison of model output from
! current run (stored and formatted as "check_line") with pre-stored
! data ("etalon_line"). This version is designed to handle situation
! where the number of significant digits for each parameter to be
! compared is unknown (determined dynamically) and it is only assumed
! that format contains mantissa and power, which are recognized as
! A1,P1 and A0,P0 for current and etalon data respectively. This is
! necessary to rescale the mantissa of the difference (if occurs) in
! such a way that its power is the same as for the current output.
! This code organization also has the property that no adjustment is
! necessary, if output format (see above) is modified (i.e., number
! of controlled parameters or number of significant digits in each
! parameter is changed).
!
C$          ie=ie-3             ! <--  disregard thread number
            do icheck=1,ncheck
              if (iic-1 .eq. check_point(icheck)) then
# ifdef WRITE_ETALON
                if (icheck.eq.1) open(unit=97,file='etalon_data.NEW')
                write(97,'(8x,A,I2,A,I5/8x,A,I2,A/5x,A,3x,3A)')
     &             'check_point(', icheck, ')=', check_point(icheck),
     &             'etalon_line(', icheck, ')=',
     &                              '&', '''', check_line(1:ie), ''''
                if (icheck.lt.ncheck) then
c**               write(*,*) 'flush 97'
                  call flush(97)
                else
                  close(97)
                endif
# endif
!
! Extract arrays of mantissa and exponent A1,P1 and A0,P0 from check
! and etalon lines respectively. Perform the comparison, choosing the
! number of controlled parameters to be the smaller of one from check
! and etalon line. Rescale "A0" of etalon line to match scale of
! mantissa of check line.
!
                call read_check_line (check_line,ie, A1,P1,j, nparam)
                call read_check_line (etalon_line(icheck),
     &                                           ie, A0,P0,k, nparam)
                k=min(j,k)
                ierr=0
                do i=1,k
                  if (P0(i).eq.P1(i) .and. A0(i).eq.A1(i)) then
                    A1(i)=0.
                  else
                    A1(i)=A1(i)-A0(i)*10.**(P0(i)-P1(i))
                    ierr=ierr+1
                  endif
                enddo                  !--> discard P0,P1
!
! Print out comparison results. If difference is discovered, print
! out only its mantissa, retaining the same number of digits after
! decimal point as in the original format. To do so, analyse the
! format of check line and save the location of decimal point as
! P0(j) and last digit of mantissa P1(j) for each controlled
! parameter indexed j=1,..,k.
!
                if (ierr.gt.0) then
                  do j=1,k
                    P0(j)=0
                    P1(j)=0
                  enddo
                  j=0
                  do i=1,ie
                    if (check_line(i:i).eq.'.' .and.
     &                             j.lt.nparam) then
                      j=j+1
                      P0(j)=i
                    elseif (check_line(i:i).eq.' '
     &                            .and. j.gt.0) then
                      if (P0(j).gt.0 .and.
     &                    P1(j).eq.0) P1(j)=i-1
                    endif
                  enddo
                  do j=1,k
                    write(check_line(P0(j)-1:P0(j)+14),
     &                               '(F16.10)') A1(j)
                    do i=P1(j)+5,P0(j)+14
                      check_line(i:i)=' '
                    enddo
                  enddo
                  j=0
                  do i=2,P1(k)+5
                    if (check_line(i-1:i).eq.'0.') then
                      check_line(i:i)=':'
                      j=1
                    elseif (j.eq.1.and.check_line(i:i).eq.'0') then
                      check_line(i:i)='.'
                    else
                      j=0
                    endif
                  enddo
                  write(stdout,'(1x,A,3x,A/1x,A)') 'difference:',
     &                               check_line(P0(1)-1:P1(k)+5),
     &                        'DIFFERENCE DISCOVERED'
                else
                  write(stdout,*) 'PASSED_ETALON_CHECK'
                endif
              endif       !<-- iic-1 .eq. check_point(icheck)
            enddo       !<-- icheck
#endif /* ETALON_CHECK */
#ifdef MPI
          endif    ! <-- mynode.eq.0
#endif
 
          if (ninfo.eq.1 .and. iic.gt.  8) ninfo=2
          if (ninfo.eq.2 .and. iic.gt. 16) ninfo=4
          if (ninfo.eq.4 .and. iic.gt. 64) ninfo=8
          if (ninfo.eq.8 .and. iic.gt.256) ninfo=16

        endif
C$OMP END CRITICAL (diag_cr_rgn)
      endif              ! <-- mod(iic-1,ninfo).eq.0
      return
      end

#ifdef ETALON_CHECK
      subroutine read_check_line (string,lstr, A,P,np, nparam)
!
! For a given character string of length lstr, which is assumed to
! contain a sequence of floating point numbers written in FY.X, EY.X
! or 1PEY.X format (unknown in advance), determine how many numbers
! "np" are written and extract mantissa A and exponent P separately
! for each of number. If exponent is absent (FY.X format), the
! corresponding P is set to zero.
!
      implicit none
      character string*(*)
      integer  lstr, nparam, P(nparam), np,  i,k, ierr
      real A(nparam)

      np=0
      i=0
      do while (i.lt.lstr)
        i=i+1
        if (string(i:i).eq.'.') then
          k=i-1
          do while (k.gt.1 .and. ichar(string(k:k)).ge.48
     &                     .and. ichar(string(k:k)).le.57)
            k=k-1
          enddo
          if (string(k:k).eq.' ') k=k+1
          i=i+1
          do while (ichar(string(i:i)).ge.48
     &        .and. ichar(string(i:i)).le.57)
            i=i+1
          enddo
          np=np+1
          A(np)=0.
          P(np)=0
          read(string(k:i-1),*,iostat=ierr) A(np)
          if (string(i:i).eq.'E' .or.
     &        string(i:i).eq.'e') then
            string(i:i)=' '
            i=i+1
          endif
          if (string(i:i).eq.'+' .or. string(i:i).eq.'-') then
            k=i
            i=i+1
            do while (ichar(string(i:i)).ge.48
     &          .and. ichar(string(i:i)).le.57)
              i=i+1
            enddo
            if (string(i:i).ne.'.') then
              read(string(k:i-1),*,iostat=ierr) P(np)
              string(k:i-1)=' '
            endif
          endif
c**       write(*,*) np, A(np),P(np)
        endif
      enddo
      return
      end
#endif
