#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "mpi.h"
/* am(p,q): float number q of local row p of the strip `a`; a row holds
   2*cols floats. Uses the variables `a` and `cols` of the enclosing scope.
   Arguments and the whole expansion are parenthesized so that expressions
   such as am(i+1,j) index the intended element. */
#define am(p,q) (*((a)+2*(cols)*(p)+(q)))
/* bm(p,q): float number q of row p of `buff`, with a row stride of
   2*(cols-rows) floats. Uses `buff`, `cols` and `rows` of the enclosing
   scope. */
#define bm(p,q) (*((buff)+2*((cols)-(rows))*(p)+(q)))
int main(int argc, char **argv )
{
/*
   2-D binary radix FFT using message passing. Data are
   in strips. Given "size" processors, there are "size"
   strips containing rows = MAX_COLS/size vectors in each,
   every vector holding MAX_COLS complex numbers stored as
   interleaved (re,im) floats. Global row g lives on process
   g/rows as local row g%rows.

   The master generates the whole matrix with ggl(), keeps a
   reference copy in acopy, and ships one strip to every other
   process. All processes then run a forward and an inverse
   2-D FFT and Checkres() reports the round-trip error.

   The number of processes must be a power of two (Xpose pairs
   processes with rank^step) and must divide MAX_COLS.

   From Sections 5.8 and 5.9 of Arbenz and Petersen,
   "Intro. to Parallel Computing," Oxford Univ. Press, 2004.

                                W. Petersen, 22 Oct. 2003
*/

   int MAX_COLS=1024;
   int master,rank,size,rows,cols,ij,ip,offset,count;
   static float seed;
   float *a,*w,*buff,*acopy,*dst;
   float sign;
   float ggl(float*);
   void cffti(int cols, float *w);
   void Checkres(float *a, float *acopy, int rows, int cols);
   void FFT2D(float *a,float *w,float sign,int rows,int cols);
   MPI_Status stat;

   MPI_Init(&argc,&argv);
   MPI_Comm_size(MPI_COMM_WORLD, &size);
   MPI_Comm_rank(MPI_COMM_WORLD, &rank);
   master = 0;

   cols   = MAX_COLS;
/* every rank sees the same size, so all of them leave together */
   if(size < 1 || (size & (size-1)) != 0 || cols % size != 0){
      if(rank==master){
         fprintf(stderr,
            "number of processes (%d) must be a power of two dividing %d\n",
            size,cols);
      }
      MPI_Finalize();
      return 1;
   }
   rows   = cols/size;
   count  = 2*rows*cols;   /* floats in one strip */

   a      = (float *) malloc((size_t) count*sizeof(float));
   w      = (float *) malloc((size_t) 2*cols*sizeof(float));
/* the reference copy and the staging buffer are only touched by the
   master; Checkres reads acopy on rank 0 only */
   acopy  = NULL;
   buff   = NULL;
   if(rank==master){
      acopy = (float *) malloc((size_t) 2*cols*cols*sizeof(float));
      buff  = (float *) malloc((size_t) count*sizeof(float));
   }
   if(a==NULL || w==NULL ||
      (rank==master && (acopy==NULL || buff==NULL))){
      fprintf(stderr,"rank %d: out of memory\n",rank);
      MPI_Abort(MPI_COMM_WORLD,1);
   }
/* initialize sine/cosine tables */
   cffti(cols,w);
   if(rank==master){
/* generate the strips in rank order and send each one to its owner;
   strip 0 is written straight into the master's own array */
      seed = 331.0;      /* random seed */
      for(ip=0;ip<size;ip++){
         dst    = (ip==master) ? a : buff;
         offset = ip*count;
         for(ij=0;ij<count;ij++){
            dst[ij]          = ggl(&seed);
            acopy[offset+ij] = dst[ij];
         }
         if(ip!=master){
            MPI_Send(buff,count,MPI_FLOAT,ip,ip,MPI_COMM_WORLD);
         }
      }
   } else { /* slave parts: the strip arrives already in row order */
      MPI_Recv(a,count,MPI_FLOAT,master,MPI_ANY_TAG,
               MPI_COMM_WORLD,&stat);
   }
/* forward transform followed by the inverse one */
   sign = 1.0;
   FFT2D(a,w,sign,rows,cols);
   sign = -1.0;
   FFT2D(a,w,sign,rows,cols);
   Checkres(a,acopy,rows,cols);
   free(a); free(w); free(buff); free(acopy);
   MPI_Finalize();
   return 0;
}
/* Apply cfft2 to each of the `rows` local vectors of `a`; a vector
   holds `cols` complex numbers, i.e. 2*cols floats. */
static void fft_local_rows(float *a,float *w,float sign,int rows,int cols)
{
   int i;
   void cfft2();   /* old-style definition further down: no prototype */

   for(i=0;i<rows;i++){
      cfft2(cols,a + 2*i*cols,w,sign);
   }
}

/*
   One 2-D transform of the distributed array: FFT of every local row,
   Xpose, FFT of every local row again, and a second Xpose.

   a     local strip, rows x cols complex numbers as (re,im) floats
   w     sine/cosine table produced by cffti(cols,w)
   sign  passed on unchanged to cfft2 (main uses 1.0 and -1.0)
   rows  number of local rows
   cols  row length in complex numbers
*/
void FFT2D(float *a,float *w,float sign,int rows,int cols)
{
   void Xpose();

   fft_local_rows(a,w,sign,rows,cols);
   Xpose(a,cols);
   fft_local_rows(a,w,sign,rows,cols);
   Xpose(a,cols);
}

/* sqrt(scale * sum_k (ref[k] - scale*res[k])^2) over `count` floats,
   accumulated in single precision. */
static float strip_error(const float *ref,const float *res,int count,
                         float scale)
{
   float err;
   int k;

   err = 0.0;
   for(k=0;k<count;k++){
      err += (ref[k]-scale*res[k])*
             (ref[k]-scale*res[k]);
   }
   return sqrt(scale*err);
}

/*
   Compare the transformed data with the reference copy and print one
   error figure per process.

   Ranks > 0 send their strip `a` to rank 0 and return. Rank 0 receives
   the strips in arrival order, compares each with the matching part of
   `acopy` (offset 2*rows*cols*source) after scaling the result by
   1/cols^2, prints the error, and finally checks its own strip.

   a      local strip, 2*rows*cols floats
   acopy  reference data for all strips; read on rank 0 only
   rows   number of local rows
   cols   row length in complex numbers
*/
void Checkres(float *a,float *acopy,int rows,int cols)
{
   int is,count,rank,size,sor;
   float err,fnm2,*buff;
   MPI_Status stat;

   MPI_Comm_rank(MPI_COMM_WORLD, &rank);
   MPI_Comm_size(MPI_COMM_WORLD, &size);

   count = 2*rows*cols;

   if(rank>0){
      MPI_Send(a,count,MPI_FLOAT,0,0,MPI_COMM_WORLD);
      return;
   }

/* master */
   fnm2 = 1.0/((float) (cols*cols));
   buff = NULL;
   if(size>1){
      /* receive buffer is only needed when there are other ranks */
      buff = (float *) malloc((size_t) count*sizeof(float));
      if(buff==NULL){
         fprintf(stderr,"Checkres: out of memory\n");
         MPI_Abort(MPI_COMM_WORLD,1);
      }
   }
/* rank > 0 part of check */
   for(is=1;is<size;is++){
      MPI_Recv(buff,count,MPI_FLOAT,
               MPI_ANY_SOURCE,MPI_ANY_TAG,MPI_COMM_WORLD,
               &stat);
      sor = stat.MPI_SOURCE;
      err = strip_error(acopy + count*sor,buff,count,fnm2);
      printf(" Error from cpu %d: N=%d, err=%e\n",
             sor,cols,err);
   }
/* rank = 0 part of check */
   err = strip_error(acopy,a,count,fnm2);
   printf(" error from cpu %d: N=%d, err=%e\n",rank,cols,err);
   free(buff);
}
/*
   Block transpose of the strip-distributed n x n complex array.

   Each process holds nn = n/size rows of n complex numbers. The strip is
   treated as `size` blocks of nn x nn complex numbers; block number
   `rank` is transposed in place, every other block is exchanged with the
   process `rank ^ step` and stored transposed.

   Pairing by `rank ^ step` only yields valid, matching partners when the
   number of processes is a power of two, so anything else is rejected.

   The exchange buffer is kept between calls and grows on demand; it is
   released by the system at program exit.
*/
void Xpose(float *a, int n) {
  static float *buf_io = NULL;   /* exchange buffer, reused across calls */
  static int buf_len = 0;        /* floats currently available in buf_io */
  float t0,t1;
  int i, ij, is, j, step, n2, nn, blk_len;
  int size, rank, other;
  MPI_Status stat;
 
  MPI_Comm_size(MPI_COMM_WORLD, &size);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  if((size & (size-1)) != 0){
     fprintf(stderr,"Xpose: number of processes (%d) is not a power of two\n",
             size);
     MPI_Abort(MPI_COMM_WORLD,1);
  }
 
  /* number of local rows of 2D array */
  nn = n/size;
  n2 = 2*nn;            /* floats per block row */
  blk_len = nn*n2;      /* floats per block */

  if(blk_len > buf_len){
     free(buf_io);      /* drop the smaller buffer instead of leaking it */
     buf_io = (float *) malloc((size_t) blk_len*sizeof(float));
     if(buf_io == NULL){
        fprintf(stderr,"Xpose: out of memory\n");
        MPI_Abort(MPI_COMM_WORLD,1);
     }
     buf_len = blk_len;
  }

  /* local transpose of first block (in-place): swap element (i,j) with
     element (j,i) of the block starting at float offset rank*n2 */
  for(j = 0; j < nn; j ++){
     for(i = 0; i < j; i++) {
       t0 = a[rank*n2+i*2*n+j*2];
       t1 = a[rank*n2+i*2*n+j*2+1];
       a[rank*n2+i*2*n+j*2]   = a[rank*n2+j*2*n+2*i];
       a[rank*n2+i*2*n+j*2+1] = a[rank*n2+j*2*n+2*i+1];
       a[rank*n2+j*2*n+2*i]   = t0;
       a[rank*n2+j*2*n+2*i+1] = t1;
     }
  }

  /* size-1 communication steps */
  for (step = 1; step < size; step++) {
    other = rank ^ step;

    /* fill send buffer with block number `other`, row by row */
    ij = 0;
    for(i=0;i<nn;i++){
       is = other*n2 + i*2*n;  
       for(j=0;j<n2;j++){
          buf_io[ij++] = a[is + j];
       }
    } 
    /* exchange data: send tag is our rank, receive tag the partner's */
    MPI_Sendrecv_replace(buf_io,2*nn*nn,MPI_FLOAT,
                   other,rank,other,other,
                   MPI_COMM_WORLD,&stat);

    /* write back recv buffer in transposed order */
    for(i = 0; i < nn; i ++){
       for(j = 0; j < nn; j ++){
	  a[other*n2+j*2*n+i*2]   = buf_io[i*n2+j*2];
	  a[other*n2+j*2*n+i*2+1] = buf_io[i*n2+j*2+1];
       }
    }
  }
}
void cffti(n,w)
int n;
float w[][2];
{
   int i;
   float pi,arg,t;

   pi = 3.141592653589793;
   t  = 2.0*pi/((float) n);
   for(i=0;i<n/2;i++){
      arg     = t*((float) i);
      w[i][0] = cos(arg);
      w[i][1] = sin(arg);
   }
}
/*
   Driver for the stages of the binary radix complex FFT of one vector,
   performed in place on x.

   n     vector length in complex numbers (expected to be a power of two)
   x     data, n (re,im) pairs
   w     sine/cosine table from cffti(n,w)
   sign  handed to the step routines; > 0 uses w as is, otherwise the
         sine component is negated

   m = floor(log2(n)) stages are run with mj = 1,2,4,... Stages
   j < (m+1)/2 use step0; the remaining ones use step1 while
   j < m/2+BK and step2 otherwise. With BK = 0 step1 is never selected.
*/
void cfft2(n,x,w,sign)
int n;
float sign, x[][2], w[][2];
{
   int n2,m,j,mj,p2,p3,p4,BK,len;
   void step0(), step1(), step2();

/* exact integer log2; a floating-point log quotient can round to the
   wrong stage count */
   m = 0;
   for(len=n;len>1;len>>=1){
      m++;
   }
/* 
   Machine-specific settings of BK recorded by the original author:
   Y-MP:                 BK = max(m/3-2,0)
   J-90:                 BK = max((m+1)/6,0)
   IBM:                  BK = max((m-3)/2,0)
   DEC Alpha:            BK = max(m/2-2,0)
   HP9000:               BK = max(m/2-3,0)
   SGI Indigo:           BK = max((m-1)/4,0)
   SGI TFP (R8000):      BK = max((m-1)/2,0)
   i860/XP:              BK = max((20*(m-5)-(m-5)*(m-5))/25,0)
   Intel P-6:            BK = max((7+5*(m-4))/10+1,0)
   Sparc-10:             BK = max(0,2*m/3-4)
   plain vanilla: */
   BK = 0; 

   mj = 1;
   n2 = n/2;

   for(j=0;j<m;j++){
      p2 = n2/mj;
      if(j < (m+1)/2){
         step0(n,mj,&x[0][0],&x[p2][0],w,sign);
      }
      else{
         p3 = mj;
         p4 = p2+mj;
         if(j < (m/2+BK)){
            step1(n,mj,&x[0][0],&x[p2][0],&x[p3][0],&x[p4][0],w,sign);
         } 
         else{
            step2(n,mj,&x[0][0],&x[p2][0],&x[p3][0],&x[p4][0],w,sign);
         }
      }
      mj = 2*mj;
   }
}
/*
   One butterfly pass over the pairs (a[p],b[p]).

   For every group g = 0..mj-1 and every k = 0..n/(2*mj)-1, with
   p = g*(n/mj)+k and twiddle (wr,wi) = w[k*mj] (wi negated unless
   sign > 0):
      a[p] <- a[p] + b[p]
      b[p] <- (wr,wi) * (a[p] - b[p])      (complex product, old a[p])
*/
void step0(n,mj,a,b,w,sign)
int n,mj;
float a[][2],b[][2],w[][2];
float sign;
{
   float wr,wi,dr,di,pr,pu;
   int g,k,p,half,stride,base,forward;

   half    = n/(2*mj);
   stride  = n/mj;
   forward = (sign > 0.);
   for(g=0;g<mj;g++){
      base = g*stride;
#pragma ivdep
      for(k=0;k<half;k++){
          p  = base+k;
          wr = w[k*mj][0];
          wi = forward ? w[k*mj][1] : -w[k*mj][1];
          /* difference first: a[p] is overwritten just below */
          dr = a[p][0]-b[p][0];
          di = a[p][1]-b[p][1];
          pr = wr*dr - wi*di;
          pu = wi*dr + wr*di;
          a[p][0] = a[p][0]+b[p][0];
          a[p][1] = a[p][1]+b[p][1];
          b[p][0] = pr;
          b[p][1] = pu;
      }
   }
}
/*
   Butterfly pass working on four arrays at once.

   For j = 0, n/mj, ... < mj, for i = j, j+2*mj, ... < n and
   k = 0..n/(2*mj)-1, with p = i+k and twiddle (wr,wi) = w[k*mj]
   (wi negated unless sign > 0):
      a[p] <- a[p] + b[p]
      b[p] <- c[p] + d[p]
      c[p] <- (wr,wi) * (old a[p] - old b[p])
      d[p] <- (wr,wi) * (c[p] - d[p])
   The statements are kept in the order a, b, then the c-d product, then
   c and d, exactly as the arrays are read and written here.
*/
void step1(n,mj,a,b,c,d,w,sign)
int n,mj;
float a[][2],b[][2],c[][2],d[][2],w[][2];
float sign;
{
   float wr,wi,dr,di,er,ei,abr,abu,cdr,cdu;
   int span,half,i,j,k,p,forward;

   span    = 2*mj;
   half    = n/span;
   forward = (sign > 0.);

   for(j=0;j<mj;j+=n/mj){
      for(i=j;i<n;i+=span){
#pragma ivdep
         for(k=0;k<half;k++){
             p  = i+k;
             wr = w[k*mj][0];
             wi = forward ? w[k*mj][1] : -w[k*mj][1];
             dr  = a[p][0]-b[p][0];
             di  = a[p][1]-b[p][1];
             abr = wr*dr - wi*di;
             abu = wi*dr + wr*di;
             a[p][0] = a[p][0]+b[p][0];
             a[p][1] = a[p][1]+b[p][1];
             b[p][0] = c[p][0]+d[p][0];
             b[p][1] = c[p][1]+d[p][1];
             /* c-d is read only after b has been stored */
             er  = c[p][0]-d[p][0];
             ei  = c[p][1]-d[p][1];
             cdr = wr*er - wi*ei;
             cdu = wi*er + wr*ei;
             c[p][0] = abr;
             c[p][1] = abu;
             d[p][0] = cdr;
             d[p][1] = cdu;
         }
      }
   }
}
/*
   Butterfly pass on four arrays with the twiddle loop outermost.

   For k = 0..n/(2*mj)-1 the twiddle (wr,wi) = w[k*mj] is fetched once
   (wi negated unless sign > 0); then for i = 0..n/(2*mj)-1 and
   j = k, k+n/mj, ... < mj, with p = i*2*mj+j:
      a[p] <- a[p] + b[p]
      b[p] <- c[p] + d[p]
      c[p] <- (wr,wi) * (old a[p] - old b[p])
      d[p] <- (wr,wi) * (c[p] - d[p])
*/
void step2(n,mj,a,b,c,d,w,sign)
int n,mj;
float a[][2],b[][2],c[][2],d[][2],w[][2];
float sign;
{
   float wr,wi,dr,di,er,ei,abr,abu,cdr,cdu;
   int span,half,i,j,k,p,base;

   span = 2*mj;
   half = n/span;

   for(k=0;k<half;k++){
      wr = w[k*mj][0];
      wi = (sign > 0.) ? w[k*mj][1] : -w[k*mj][1];
      for(i=0;i<half;i++){
         base = i*span;
#pragma ivdep
         for(j=k;j<mj;j+=n/mj){
             p   = base+j;
             dr  = a[p][0]-b[p][0];
             di  = a[p][1]-b[p][1];
             abr = wr*dr - wi*di;
             abu = wi*dr + wr*di;
             a[p][0] = a[p][0]+b[p][0];
             a[p][1] = a[p][1]+b[p][1];
             b[p][0] = c[p][0]+d[p][0];
             b[p][1] = c[p][1]+d[p][1];
             /* c-d is read only after b has been stored */
             er  = c[p][0]-d[p][0];
             ei  = c[p][1]-d[p][1];
             cdr = wr*er - wi*ei;
             cdu = wi*er + wr*ei;
             c[p][0] = abr;
             c[p][1] = abu;
             d[p][0] = cdr;
             d[p][1] = cdu;
         }
      }
   }
}
#include <math.h>
float ggl(float *ds)
{

/* generate u(0,1) distributed random numbers. 
   Seed ds must be saved between calls. ggl is 
   essentially the same as the IMSL routine RNUM. 

   W. Petersen and M. Troyer, 24 Oct. 2002, ETHZ: 
   a modification of a fortran version from 
   I. Vattulainen, Tampere Univ. of Technology, 
   Finland, 1992 */

   double t,d2=0.2147483647e10;
   t   = (float) *ds;
   t   = fmod(0.16807e5*t,d2);
   *ds = (float) t;
   return((float) ((t-1.0e0)/(d2-1.0e0)));
}